From 727872d3124d12d386169a3ff05c23caf9051ede Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 21 Jun 2022 07:25:29 -0700 Subject: [PATCH 01/99] Initial work towards unification of gauge fields. Replaced Gauge_p() method with a better replacement named data() --- include/gauge_field.h | 130 +++++++--------- include/gauge_field_order.h | 46 +++--- lib/coarse_op.cuh | 8 +- lib/coarse_op_preconditioned.cu | 6 +- lib/cpu_gauge_field.cpp | 147 +++++------------- lib/cuda_gauge_field.cpp | 103 ++++-------- lib/gauge_field.cpp | 94 ++++++++++- lib/gauge_stout.cu | 4 +- lib/interface_quda.cpp | 8 +- lib/staggered_kd_build_xinv.cu | 4 +- lib/staggered_oprod.cu | 4 +- lib/unitarize_links_quda.cu | 31 ++-- tests/gauge_force_test.cpp | 22 +-- tests/heatbath_test.cpp | 6 +- tests/hisq_paths_force_test.cpp | 13 +- tests/hisq_unitarize_force_test.cpp | 8 +- .../domain_wall_dslash_reference.cpp | 68 ++++---- .../domain_wall_dslash_reference.h | 52 +++---- .../host_reference/gauge_force_reference.cpp | 8 +- tests/host_reference/gauge_force_reference.h | 4 +- tests/host_reference/hisq_force_reference.cpp | 21 ++- .../staggered_dslash_reference.cpp | 12 +- .../staggered_dslash_reference.h | 8 +- tests/multigrid_evolve_test.cpp | 6 +- tests/staggered_dslash_test_utils.h | 3 +- tests/unitarize_link_test.cpp | 4 +- tests/utils/host_utils.cpp | 2 +- tests/utils/host_utils.h | 2 +- tests/utils/staggered_host_utils.cpp | 2 +- 29 files changed, 399 insertions(+), 427 deletions(-) diff --git a/include/gauge_field.h b/include/gauge_field.h index 706e82ccf1..7e484bd3a0 100644 --- a/include/gauge_field.h +++ b/include/gauge_field.h @@ -195,6 +195,9 @@ namespace quda { class GaugeField : public LatticeField { protected: + void *gauge; /** The gauge field allocation */ + void *gauge_h; /** Mapped-memory pointer when allocating on the host */ + void **gauge_qdp; /** Array of pointers to each subset (QDP order) */ size_t bytes; // bytes allocated per full field size_t phase_offset; // 
offset in bytes to gauge phases - useful to keep track of texture alignment size_t phase_bytes; // bytes needed to store the phases @@ -203,6 +206,7 @@ namespace quda { int nColor; int nFace; QudaFieldGeometry geometry; // whether the field is a scale, vector or tensor + int site_dim; // the dimensionality of each site (number of matrices per lattice site) QudaReconstructType reconstruct; int nInternal; // number of degrees of freedom per link matrix @@ -350,24 +354,46 @@ namespace quda { size_t TotalBytes() const { return bytes; } - virtual void* Gauge_p() { errorQuda("Not implemented"); return (void*)0;} - virtual void* Even_p() { errorQuda("Not implemented"); return (void*)0;} - virtual void* Odd_p() { errorQuda("Not implemented"); return (void*)0;} + /** + @brief Helper function that returns true if the gauge order is an array of pointers + @param[in] order The gauge order requested + @return If the order is an array of pointers + */ + constexpr bool is_pointer_array(QudaGaugeFieldOrder order) const + { + switch (order) { + case QUDA_QDP_GAUGE_ORDER: + case QUDA_QDPJIT_GAUGE_ORDER: + return true; + default: + return false; + } + } - virtual const void* Gauge_p() const { errorQuda("Not implemented"); return (void*)0;} - virtual const void* Even_p() const { errorQuda("Not implemented"); return (void*)0;} - virtual const void* Odd_p() const { errorQuda("Not implemented"); return (void*)0;} + /** + @brief Return base pointer to the gauge field allocation. + @tparam T Optional type to cast the pointer to. 
+ @return Base pointer to the gauge field allocation + */ + template auto data() const + { + static_assert(std::is_pointer_v, "data() requires a pointer cast type"); + + using U = typename std::remove_pointer::type; + if constexpr (std::is_pointer_v) { + if (!is_pointer_array(order)) errorQuda("Dim-array ordered field requested but order is %d", order); + return reinterpret_cast(gauge_qdp); + } else { + if (is_pointer_array(order) && !std::is_same_v) errorQuda("Non dim-array ordered field requested but order is %d", order); + return reinterpret_cast(gauge); + } + } virtual int full_dim(int d) const { return x[d]; } - const void** Ghost() const { + auto Ghost() const { if ( isNative() ) errorQuda("No ghost zone pointer for quda-native gauge fields"); - return (const void**)ghost; - } - - void** Ghost() { - if ( isNative() ) errorQuda("No ghost zone pointer for quda-native gauge fields"); - return ghost; + return (void * const *)ghost; } /** @@ -383,9 +409,9 @@ namespace quda { size_t SiteSize() const { return site_size; } /** - Set all field elements to zero (virtual) + Set all field elements to zero */ - virtual void zero() = 0; + void zero(); /** * Generic gauge field copy @@ -439,15 +465,28 @@ namespace quda { */ static GaugeField* Create(const GaugeFieldParam ¶m); + /** + @brief If managed memory and prefetch is enabled, prefetch + the gauge field and buffers to the CPU or the GPU + @param[in] mem_space Memory space we are prefetching to + @param[in] stream Which stream to run the prefetch in (default 0) + */ + void prefetch(QudaFieldLocation mem_space, qudaStream_t stream = device::get_default_stream()) const; + + /** + @brief Backs up the GaugeField + */ + void backup() const; + + /** + @brief Restores the GaugeField + */ + void restore() const; }; class cudaGaugeField : public GaugeField { private: - void *gauge; - void *gauge_h; // mapped-memory pointer when allocating on the host - void *even; - void *odd; /** @brief Initialize the padded region to 0 @@ 
-571,15 +610,6 @@ namespace quda { */ void saveCPUField(cpuGaugeField &cpu, TimeProfile &profile) const; - // (ab)use with care - void* Gauge_p() { return gauge; } - void* Even_p() { return even; } - void* Odd_p() { return odd; } - - const void* Gauge_p() const { return gauge; } - const void* Even_p() const { return even; } - const void *Odd_p() const { return odd; } - /** @brief Copy all contents of the field to a host buffer. @param[in] the host buffer to copy to. @@ -593,29 +623,6 @@ namespace quda { virtual void copy_from_buffer(void *buffer); void setGauge(void* _gauge); //only allowed when create== QUDA_REFERENCE_FIELD_CREATE - - /** - Set all field elements to zero - */ - void zero(); - - /** - @brief Backs up the cudaGaugeField to CPU memory - */ - void backup() const; - - /** - @brief Restores the cudaGaugeField to CUDA memory - */ - void restore() const; - - /** - @brief If managed memory and prefetch is enabled, prefetch - the gauge field and buffers to the CPU or the GPU - @param[in] mem_space Memory space we are prefetching to - @param[in] stream Which stream to run the prefetch in (default 0) - */ - void prefetch(QudaFieldLocation mem_space, qudaStream_t stream = device::get_default_stream()) const; }; class cpuGaugeField : public GaugeField { @@ -624,9 +631,6 @@ namespace quda { friend void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu); friend void cudaGaugeField::saveCPUField(cpuGaugeField &cpu) const; - private: - void **gauge; // the actual gauge field - public: /** @brief Constructor for cpuGaugeField from a GaugeFieldParam @@ -680,9 +684,6 @@ namespace quda { */ void copy(const GaugeField &src); - void* Gauge_p() { return gauge; } - const void* Gauge_p() const { return gauge; } - /** @brief Copy all contents of the field to a host buffer. @param[in] the host buffer to copy to. 
@@ -696,21 +697,6 @@ namespace quda { virtual void copy_from_buffer(void *buffer); void setGauge(void** _gauge); //only allowed when create== QUDA_REFERENCE_FIELD_CREATE - - /** - Set all field elements to zero - */ - void zero(); - - /** - @brief Backs up the cpuGaugeField - */ - void backup() const; - - /** - @brief Restores the cpuGaugeField - */ - void restore() const; }; /** diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index 5f0186533f..2b0f4e2faa 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -355,10 +355,9 @@ namespace quda { scale(static_cast(1.0)), scale_inv(static_cast(1.0)) { - for (int d=0; d**>(gauge_)[d] : - static_cast**>(const_cast(U.Gauge_p()))[d]; - resetScale(U.Scale()); + for (int d = 0; d < U.Geometry(); d++) + u[d] = gauge_ ? static_cast **>(gauge_)[d] : U.data *const *>()[d]; + resetScale(U.Scale()); } void resetScale(Float max) @@ -466,8 +465,7 @@ namespace quda { static constexpr bool fixed = fixed_point(); Accessor(const GaugeField &U, void *gauge_ = nullptr, void ** = nullptr) : - u(gauge_ ? static_cast *>(gauge_) : - static_cast *>(const_cast(U.Gauge_p()))), + u(gauge_ ? static_cast *>(gauge_) : U.data *>()), volumeCB(U.VolumeCB()), geometry(U.Geometry()), scale(static_cast(1.0)), @@ -601,8 +599,7 @@ namespace quda { static constexpr bool fixed = fixed_point(); Accessor(const GaugeField &U, void *gauge_ = nullptr, void ** = nullptr) : - u(gauge_ ? static_cast *>(gauge_) : - static_cast *>(const_cast(U.Gauge_p()))), + u(gauge_ ? static_cast *>(gauge_) : U.data *>()), offset_cb((U.Bytes() >> 1) / sizeof(complex)), volumeCB(U.VolumeCB()), stride(U.Stride()), @@ -1512,7 +1509,7 @@ namespace quda { FloatNOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : reconstruct(u), - gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), + gauge(gauge_ ? 
gauge_ : u.data()), offset(u.Bytes() / (2 * sizeof(Float) * N)), ghostExchange(u.GhostExchange()), volumeCB(u.VolumeCB()), @@ -1829,7 +1826,9 @@ namespace quda { const int volumeCB; QDPOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) : LegacyOrder(u, ghost_), volumeCB(u.VolumeCB()) - { for (int i=0; i<4; i++) gauge[i] = gauge_ ? ((Float**)gauge_)[i] : ((Float**)u.Gauge_p())[i]; } + { + for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : u.data()[i]; + } __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const { @@ -1873,7 +1872,9 @@ namespace quda { const int volumeCB; QDPJITOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) : LegacyOrder(u, ghost_), volumeCB(u.VolumeCB()) - { for (int i=0; i<4; i++) gauge[i] = gauge_ ? ((Float**)gauge_)[i] : ((Float**)u.Gauge_p())[i]; } + { + for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : u.data()[i]; + } __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const { @@ -1920,9 +1921,14 @@ namespace quda { Float *gauge; const int volumeCB; const int geometry; - MILCOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) : - LegacyOrder(u, ghost_), gauge(gauge_ ? gauge_ : (Float*)u.Gauge_p()), - volumeCB(u.VolumeCB()), geometry(u.Geometry()) { ; } + MILCOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : + LegacyOrder(u, ghost_), + gauge(gauge_ ? gauge_ : u.data()), + volumeCB(u.VolumeCB()), + geometry(u.Geometry()) + { + ; + } __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const { @@ -1980,7 +1986,7 @@ namespace quda { const size_t size; MILCSiteOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : LegacyOrder(u, ghost_), - gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), + gauge(gauge_ ? 
gauge_ : u.data()), volumeCB(u.VolumeCB()), geometry(u.Geometry()), offset(u.SiteOffset()), @@ -2040,7 +2046,7 @@ namespace quda { const int geometry; CPSOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : LegacyOrder(u, ghost_), - gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), + gauge(gauge_ ? gauge_ : u.data()), volumeCB(u.VolumeCB()), anisotropy(u.Anisotropy()), anisotropy_inv(1.0 / anisotropy), @@ -2106,9 +2112,7 @@ namespace quda { int exVolumeCB; // extended checkerboard volume static constexpr int Nc = 3; BQCDOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : - LegacyOrder(u, ghost_), - gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), - volumeCB(u.VolumeCB()) + LegacyOrder(u, ghost_), gauge(gauge_ ? gauge_ : u.data()), volumeCB(u.VolumeCB()) { if constexpr (length != 18) errorQuda("Gauge length %d not supported", length); // compute volumeCB + halo region @@ -2172,7 +2176,7 @@ namespace quda { const real scale_inv; TIFROrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : LegacyOrder(u, ghost_), - gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), + gauge(gauge_ ? gauge_ : u.data()), volumeCB(u.VolumeCB()), scale(u.Scale()), scale_inv(1.0 / scale) @@ -2239,7 +2243,7 @@ namespace quda { const int exDim[4]; TIFRPaddedOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : LegacyOrder(u, ghost_), - gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), + gauge(gauge_ ? 
gauge_ : u.data()), volumeCB(u.VolumeCB()), exVolumeCB(1), scale(u.Scale()), diff --git a/lib/coarse_op.cuh b/lib/coarse_op.cuh index fbefd474a0..67433a2291 100644 --- a/lib/coarse_op.cuh +++ b/lib/coarse_op.cuh @@ -877,8 +877,8 @@ namespace quda { X_atomic.backup(); break; case COMPUTE_CONVERT: - if (Y_atomic.Gauge_p() == Y.Gauge_p()) Y.backup(); - if (X_atomic.Gauge_p() == X.Gauge_p()) X.backup(); + if (Y_atomic.data() == Y.data()) Y.backup(); + if (X_atomic.data() == X.data()) X.backup(); break; case COMPUTE_RESCALE: Y.backup(); @@ -911,8 +911,8 @@ namespace quda { X_atomic.restore(); break; case COMPUTE_CONVERT: - if (Y_atomic.Gauge_p() == Y.Gauge_p()) Y.restore(); - if (X_atomic.Gauge_p() == X.Gauge_p()) X.restore(); + if (Y_atomic.data() == Y.data()) Y.restore(); + if (X_atomic.data() == X.data()) X.restore(); break; case COMPUTE_RESCALE: Y.restore(); diff --git a/lib/coarse_op_preconditioned.cu b/lib/coarse_op_preconditioned.cu index 9cf4da755e..7a75c1d895 100644 --- a/lib/coarse_op_preconditioned.cu +++ b/lib/coarse_op_preconditioned.cu @@ -174,8 +174,7 @@ namespace quda GaugeField *X_aos = create_gauge_copy(X, true); Xinv_aos = create_gauge_copy(Xinv, false); - blas::flops += invert((void *)Xinv_aos->Gauge_p(), (void *)X_aos->Gauge_p(), n, X_aos->Volume(), - X_aos->Precision(), X.Location()); + blas::flops += invert(Xinv_aos->data(), X_aos->data(), n, X_aos->Volume(), X_aos->Precision(), X.Location()); if (&Xinv != Xinv_aos) { if (Xinv.Precision() < QUDA_SINGLE_PRECISION) Xinv.Scale(Xinv_aos->abs_max()); @@ -188,7 +187,8 @@ namespace quda } else if (X.Location() == QUDA_CPU_FIELD_LOCATION && X.Order() == QUDA_QDP_GAUGE_ORDER) { const cpuGaugeField *X_h = static_cast(&X); cpuGaugeField *Xinv_h = static_cast(&Xinv); - blas::flops += invert(*(void**)Xinv_h->Gauge_p(), *(void**)X_h->Gauge_p(), n, X_h->Volume(), X.Precision(), X.Location()); + blas::flops += invert(Xinv_h->data()[0], X_h->data()[0], n, X_h->Volume(), + X.Precision(), X.Location()); } else { 
errorQuda("Unsupported location=%d and order=%d", X.Location(), X.Order()); } diff --git a/lib/cpu_gauge_field.cpp b/lib/cpu_gauge_field.cpp index f4b27109a8..f3063d5d32 100644 --- a/lib/cpu_gauge_field.cpp +++ b/lib/cpu_gauge_field.cpp @@ -26,39 +26,30 @@ namespace quda { errorQuda("10-reconstruction only supported with momentum links"); } - int siteDim=0; - if (geometry == QUDA_SCALAR_GEOMETRY) siteDim = 1; - else if (geometry == QUDA_VECTOR_GEOMETRY) siteDim = nDim; - else if (geometry == QUDA_TENSOR_GEOMETRY) siteDim = nDim * (nDim-1) / 2; - else if (geometry == QUDA_COARSE_GEOMETRY) siteDim = 2*nDim; - else if (geometry == QUDA_KDINVERSE_GEOMETRY) - siteDim = 1 << nDim; - else errorQuda("Unknown geometry type %d", geometry); - // compute the correct bytes size for these padded field orders if (order == QUDA_TIFR_PADDED_GAUGE_ORDER) { - bytes = siteDim * (x[0]*x[1]*(x[2]+4)*x[3]) * nInternal * precision; + bytes = site_dim * (x[0] * x[1] * (x[2] + 4) * x[3]) * nInternal * precision; } else if (order == QUDA_BQCD_GAUGE_ORDER) { - bytes = siteDim * (x[0]+4)*(x[1]+2)*(x[2]+2)*(x[3]+2) * nInternal * precision; + bytes = site_dim * (x[0] + 4) * (x[1] + 2) * (x[2] + 2) * (x[3] + 2) * nInternal * precision; } else if (order == QUDA_MILC_SITE_GAUGE_ORDER) { bytes = volume * site_size; } if (order == QUDA_QDP_GAUGE_ORDER) { - gauge = (void**) safe_malloc(siteDim * sizeof(void*)); - - for (int d=0; d(gauge); + for (int d = 0; d < site_dim; d++) { + if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) { + gauge_qdp[d] = nbytes ? 
safe_malloc(nbytes) : nullptr; + if (create == QUDA_ZERO_FIELD_CREATE && nbytes) memset(gauge_qdp[d], 0, nbytes); } else if (create == QUDA_REFERENCE_FIELD_CREATE) { - gauge[d] = ((void **)param.gauge)[d]; + gauge_qdp[d] = ((void **)param.gauge)[d]; } else { errorQuda("Unsupported creation type %d", create); } } - + } else if (order == QUDA_CPS_WILSON_GAUGE_ORDER || order == QUDA_MILC_GAUGE_ORDER || order == QUDA_BQCD_GAUGE_ORDER || order == QUDA_TIFR_GAUGE_ORDER || order == QUDA_TIFR_PADDED_GAUGE_ORDER || order == QUDA_MILC_SITE_GAUGE_ORDER) { @@ -71,7 +62,7 @@ namespace quda { gauge = bytes ? (void **)safe_malloc(bytes) : nullptr; if (create == QUDA_ZERO_FIELD_CREATE && bytes) memset(gauge, 0, bytes); } else if (create == QUDA_REFERENCE_FIELD_CREATE) { - gauge = (void**) param.gauge; + gauge = param.gauge; } else { errorQuda("Unsupported creation type %d", create); } @@ -104,24 +95,13 @@ namespace quda { cpuGaugeField::~cpuGaugeField() { - int siteDim = 0; - if (geometry == QUDA_SCALAR_GEOMETRY) siteDim = 1; - else if (geometry == QUDA_VECTOR_GEOMETRY) siteDim = nDim; - else if (geometry == QUDA_TENSOR_GEOMETRY) siteDim = nDim * (nDim-1) / 2; - else if (geometry == QUDA_COARSE_GEOMETRY) siteDim = 2*nDim; - else if (geometry == QUDA_KDINVERSE_GEOMETRY) - siteDim = 1 << nDim; - else errorQuda("Unknown geometry type %d", geometry); - if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) { if (order == QUDA_QDP_GAUGE_ORDER) { - for (int d=0; d(src).Gauge_p(), src.Bytes(), qudaMemcpyDeviceToHost); + qudaMemcpy(buffer, src.data(), src.Bytes(), qudaMemcpyDeviceToHost); - copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, gauge, buffer); - pool_pinned_free(buffer); + copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, nullptr, buffer); + pool_pinned_free(buffer); } else { // else on the GPU @@ -297,9 +276,11 @@ namespace quda { void **ghost_buffer = (nFace > 0) ? 
create_ghost_buffer(ghost_bytes, order, geometry) : nullptr; if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED) { - copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, 0, ghost_buffer, 0); - if (geometry == QUDA_COARSE_GEOMETRY) copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, 0, ghost_buffer, 0, 3); // forwards links if bi-directional - } else { + copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr, ghost_buffer, nullptr); + if (geometry == QUDA_COARSE_GEOMETRY) + copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr, ghost_buffer, nullptr, + 3); // forwards links if bi-directional + } else { copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, 0); } @@ -321,8 +302,7 @@ namespace quda { } else if (typeid(src) == typeid(cpuGaugeField)) { // copy field and ghost zone directly - copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, gauge, - const_cast(static_cast(src).Gauge_p())); + copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION); } else { errorQuda("Invalid gauge field type"); } @@ -343,88 +323,35 @@ namespace quda { gauge = gauge_; } - void cpuGaugeField::backup() const { - if (backed_up) errorQuda("Gauge field already backed up"); - - if (order == QUDA_QDP_GAUGE_ORDER) { - char **buffer = new char*[geometry]; - for (int d=0; d(buffer); - } else { - backup_h = new char[bytes]; - memcpy(backup_h, gauge, bytes); - } - - backed_up = true; - } - - void cpuGaugeField::restore() const - { - if (!backed_up) errorQuda("Cannot restore since not backed up"); - - if (order == QUDA_QDP_GAUGE_ORDER) { - char **buffer = reinterpret_cast(backup_h); - for (int d=0; d(Gauge_p()); - int dbytes = Bytes() / 4; - static_assert(sizeof(char) == 1, "Assuming sizeof(char) == 1"); + if (is_pointer_array(order)) { char *dst_buffer = reinterpret_cast(buffer); - for (int d = 0; d < 4; d++) { std::memcpy(&dst_buffer[d * dbytes], p[d], dbytes); } + for (int d = 0; d < geometry; d++) { + std::memcpy(&dst_buffer[d * 
bytes / geometry], data()[d], bytes / geometry); + } } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) { - const void *p = Gauge_p(); - int bytes = Bytes(); - std::memcpy(buffer, p, bytes); + std::memcpy(buffer, data(), Bytes()); } else { - errorQuda("Unsupported order = %d\n", Order()); + errorQuda("Unsupported order = %d", Order()); } } void cpuGaugeField::copy_from_buffer(void *buffer) { - - if (Order() == QUDA_QDP_GAUGE_ORDER || Order() == QUDA_QDPJIT_GAUGE_ORDER) { - void **p = static_cast(Gauge_p()); - size_t dbytes = Bytes() / 4; - static_assert(sizeof(char) == 1, "Assuming sizeof(char) == 1"); + if (is_pointer_array(order)) { const char *dst_buffer = reinterpret_cast(buffer); - for (int d = 0; d < 4; d++) { std::memcpy(p[d], &dst_buffer[d * dbytes], dbytes); } + for (int d = 0; d < geometry; d++) { + std::memcpy(data()[d], &dst_buffer[d * bytes / geometry], Bytes() / geometry); + } } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) { - void *p = Gauge_p(); - size_t bytes = Bytes(); - std::memcpy(p, buffer, bytes); + std::memcpy(data(), buffer, Bytes()); } else { - errorQuda("Unsupported order = %d\n", Order()); + errorQuda("Unsupported order = %d", Order()); } } diff --git a/lib/cuda_gauge_field.cpp b/lib/cuda_gauge_field.cpp index a7e6ff8952..23b4331cc0 100644 --- a/lib/cuda_gauge_field.cpp +++ b/lib/cuda_gauge_field.cpp @@ -7,8 +7,7 @@ namespace quda { - cudaGaugeField::cudaGaugeField(const GaugeFieldParam ¶m) : - GaugeField(param), gauge(0), even(0), odd(0) + cudaGaugeField::cudaGaugeField(const GaugeFieldParam ¶m) : GaugeField(param) { if ((order == QUDA_QDP_GAUGE_ORDER || 
order == QUDA_QDPJIT_GAUGE_ORDER) && create != QUDA_REFERENCE_FIELD_CREATE) { @@ -68,8 +67,6 @@ namespace quda { if (create == QUDA_REFERENCE_FIELD_CREATE) exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL); } - even = gauge; - odd = static_cast(gauge) + bytes/2; if (create != QUDA_ZERO_FIELD_CREATE && isNative() && ghostExchange == QUDA_GHOST_EXCHANGE_PAD) zeroPad(); } @@ -79,8 +76,10 @@ namespace quda { size_t pitch = stride*order*precision; if (pad_bytes) { - qudaMemset2D(static_cast(even) + volumeCB * order * precision, pitch, 0, pad_bytes, Npad); - qudaMemset2D(static_cast(odd) + volumeCB * order * precision, pitch, 0, pad_bytes, Npad); + qudaMemset2D(static_cast(gauge) + 0 * (bytes / 2) + volumeCB * order * precision, pitch, 0, pad_bytes, + Npad); + qudaMemset2D(static_cast(gauge) + 1 * (bytes / 2) + volumeCB * order * precision, pitch, 0, pad_bytes, + Npad); } } @@ -511,12 +510,12 @@ namespace quda { if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) { // copy field and ghost zone into this field - copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, static_cast(src).gauge); + copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION); if (geometry == QUDA_COARSE_GEOMETRY) - copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, static_cast(src).gauge, 0, 0, 3); + copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, nullptr, nullptr, nullptr, 3); } else { - copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, static_cast(src).gauge); + copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, nullptr); if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported"); } @@ -526,17 +525,15 @@ namespace quda { if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) { // copy field and ghost zone into buffer - copyGenericGauge(*this, 
src, QUDA_CPU_FIELD_LOCATION, buffer, static_cast(src).gauge); + copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, nullptr); if (geometry == QUDA_COARSE_GEOMETRY) - copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, static_cast(src).gauge, - 0, 0, 3); + copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, nullptr, 0, 0, 3); } else { - copyExtendedGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, static_cast(src).gauge); + copyExtendedGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, nullptr); if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported"); } - // this copies over both even and odd qudaMemcpy(gauge, buffer, bytes, qudaMemcpyDefault); pool_pinned_free(buffer); } else { // else on the GPU @@ -545,7 +542,7 @@ namespace quda { src.Order() == QUDA_BQCD_GAUGE_ORDER || src.Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) { // special case where we use zero-copy memory to read/write directly from application's array - void *src_d = get_mapped_device_pointer(src.Gauge_p()); + void *src_d = get_mapped_device_pointer(src.data()); if (src.GhostExchange() == QUDA_GHOST_EXCHANGE_NO) { copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, src_d); @@ -562,10 +559,10 @@ namespace quda { if (src.Order() == QUDA_QDP_GAUGE_ORDER) { for (int d=0; d()[d], src.Bytes() / geometry, qudaMemcpyDefault); } } else { - qudaMemcpy(buffer, src.Gauge_p(), src.Bytes(), qudaMemcpyDefault); + qudaMemcpy(buffer, src.data(), src.Bytes(), qudaMemcpyDefault); } if (src.Order() > 4 && GhostExchange() == QUDA_GHOST_EXCHANGE_PAD @@ -574,11 +571,11 @@ namespace quda { qudaMemcpy(ghost_buffer[d], src.Ghost()[d], ghost_bytes[d], qudaMemcpyDefault); if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) { - copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, buffer, 0, ghost_buffer); + copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, buffer, 
nullptr, ghost_buffer); if (geometry == QUDA_COARSE_GEOMETRY) - copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, buffer, 0, ghost_buffer, 3); + copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, buffer, nullptr, ghost_buffer, 3); } else { - copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, gauge, buffer); + copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, buffer); if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported"); } free_gauge_buffer(buffer, src.Order(), src.Geometry()); @@ -612,7 +609,7 @@ namespace quda { void cudaGaugeField::saveCPUField(cpuGaugeField &cpu) const { - static_cast(cpu).checkField(*this); + cpu.checkField(*this); if (reorder_location() == QUDA_CUDA_FIELD_LOCATION) { @@ -620,9 +617,9 @@ namespace quda { cpu.Order() == QUDA_BQCD_GAUGE_ORDER || cpu.Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) { // special case where we use zero-copy memory to read/write directly from application's array - void *cpu_d = get_mapped_device_pointer(cpu.Gauge_p()); + void *cpu_d = get_mapped_device_pointer(cpu.data()); if (cpu.GhostExchange() == QUDA_GHOST_EXCHANGE_NO) { - copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, cpu_d, gauge); + copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, cpu_d, nullptr); } else { errorQuda("Ghost copy not supported here"); } @@ -636,17 +633,18 @@ namespace quda { void **ghost_buffer = (nFace > 0) ? 
create_ghost_buffer(ghost_bytes, cpu.Order(), geometry) : nullptr; if (cpu.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) { - copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, gauge, ghost_buffer, 0); - if (geometry == QUDA_COARSE_GEOMETRY) copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, gauge, ghost_buffer, 0, 3); - } else { - copyExtendedGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, gauge); - } + copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr, ghost_buffer, nullptr); + if (geometry == QUDA_COARSE_GEOMETRY) + copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr, ghost_buffer, nullptr, 3); + } else { + copyExtendedGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr); + } - if (cpu.Order() == QUDA_QDP_GAUGE_ORDER) { + if (cpu.Order() == QUDA_QDP_GAUGE_ORDER) { for (int d = 0; d < geometry; d++) - qudaMemcpy(((void **)cpu.gauge)[d], ((void **)buffer)[d], cpu.Bytes() / geometry, qudaMemcpyDefault); + qudaMemcpy((cpu.data())[d], ((void **)buffer)[d], cpu.Bytes() / geometry, qudaMemcpyDefault); } else { - qudaMemcpy(cpu.gauge, buffer, cpu.Bytes(), qudaMemcpyDefault); + qudaMemcpy(cpu.data(), buffer, cpu.Bytes(), qudaMemcpyDefault); } if (cpu.Order() > 4 && GhostExchange() == QUDA_GHOST_EXCHANGE_PAD @@ -663,9 +661,9 @@ namespace quda { qudaMemcpy(buffer, gauge, bytes, qudaMemcpyDefault); if (cpu.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) { - copyGenericGauge(cpu, *this, QUDA_CPU_FIELD_LOCATION, cpu.gauge, buffer); + copyGenericGauge(cpu, *this, QUDA_CPU_FIELD_LOCATION, nullptr, buffer); } else { - copyExtendedGauge(cpu, *this, QUDA_CPU_FIELD_LOCATION, cpu.gauge, buffer); + copyExtendedGauge(cpu, *this, QUDA_CPU_FIELD_LOCATION, nullptr, buffer); } pool_pinned_free(buffer); @@ -685,46 +683,11 @@ namespace quda { profile.TPSTOP(QUDA_PROFILE_D2H); } - void cudaGaugeField::backup() const { - if (backed_up) errorQuda("Gauge field already backed up"); - backup_h = new char[bytes]; - 
qudaMemcpy(backup_h, gauge, bytes, qudaMemcpyDefault); - backed_up = true; - } - - void cudaGaugeField::restore() const - { - if (!backed_up) errorQuda("Cannot restore since not backed up"); - qudaMemcpy(gauge, backup_h, bytes, qudaMemcpyDefault); - delete []backup_h; - backed_up = false; - } - - void cudaGaugeField::prefetch(QudaFieldLocation mem_space, qudaStream_t stream) const - { - if (is_prefetch_enabled() && mem_type == QUDA_MEMORY_DEVICE) { - if (gauge) qudaMemPrefetchAsync(gauge, bytes, mem_space, stream); - if (!isNative()) { - for (int i = 0; i < nDim; i++) { - size_t nbytes = nFace * surface[i] * nInternal * precision; - if (ghost[i] && nbytes) qudaMemPrefetchAsync(ghost[i], nbytes, mem_space, stream); - if (ghost[i + 4] && nbytes && geometry == QUDA_COARSE_GEOMETRY) - qudaMemPrefetchAsync(ghost[i + 4], nbytes, mem_space, stream); - } - } - } - } - - void cudaGaugeField::zero() { qudaMemset(gauge, 0, bytes); } - void cudaGaugeField::copy_to_buffer(void *buffer) const { - qudaMemcpy(buffer, Gauge_p(), Bytes(), qudaMemcpyDeviceToHost); + qudaMemcpy(buffer, data(), Bytes(), qudaMemcpyDeviceToHost); } - void cudaGaugeField::copy_from_buffer(void *buffer) - { - qudaMemcpy(Gauge_p(), buffer, Bytes(), qudaMemcpyHostToDevice); - } + void cudaGaugeField::copy_from_buffer(void *buffer) { qudaMemcpy(data(), buffer, Bytes(), qudaMemcpyHostToDevice); } } // namespace quda diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp index 1181ecb733..ea17cb4610 100644 --- a/lib/gauge_field.cpp +++ b/lib/gauge_field.cpp @@ -29,12 +29,16 @@ namespace quda { GaugeField::GaugeField(const GaugeFieldParam ¶m) : LatticeField(param), + gauge(nullptr), + gauge_h(nullptr), + gauge_qdp {}, bytes(0), phase_offset(0), phase_bytes(0), nColor(param.nColor), nFace(param.nFace), geometry(param.geometry), + site_dim(1), reconstruct(param.reconstruct), nInternal(reconstruct != QUDA_RECONSTRUCT_NO ? 
reconstruct : nColor * nColor * 2), order(param.order), @@ -103,6 +107,19 @@ namespace quda { } total_bytes = bytes; + if (geometry == QUDA_SCALAR_GEOMETRY) + site_dim = 1; + else if (geometry == QUDA_VECTOR_GEOMETRY) + site_dim = nDim; + else if (geometry == QUDA_TENSOR_GEOMETRY) + site_dim = nDim * (nDim - 1) / 2; + else if (geometry == QUDA_COARSE_GEOMETRY) + site_dim = 2 * nDim; + else if (geometry == QUDA_KDINVERSE_GEOMETRY) + site_dim = 1 << nDim; + else + errorQuda("Unknown geometry type %d", geometry); + setTuningString(); } @@ -296,6 +313,19 @@ namespace quda { return output; // for multiple << operators. } + void GaugeField::zero() + { + if (location == QUDA_CUDA_FIELD_LOCATION) { + qudaMemset(gauge, 0, bytes); + } else { + if (order != QUDA_QDP_GAUGE_ORDER) { + memset(gauge, 0, bytes); + } else { + for (int g = 0; g < geometry; g++) memset(gauge_qdp[g], 0, volume * nInternal * precision); + } + } + } + ColorSpinorParam colorSpinorParam(const GaugeField &a) { if (a.FieldOrder() == QUDA_QDP_GAUGE_ORDER || a.FieldOrder() == QUDA_QDPJIT_GAUGE_ORDER) errorQuda("Not implemented for this order %d", a.FieldOrder()); @@ -318,7 +348,7 @@ namespace quda { spinor_param.setPrecision(a.Precision(), a.Precision(), true); spinor_param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; spinor_param.create = QUDA_REFERENCE_FIELD_CREATE; - spinor_param.v = (void*)a.Gauge_p(); + spinor_param.v = a.data(); spinor_param.location = a.Location(); return spinor_param; } @@ -417,4 +447,66 @@ namespace quda { return padded_cpu; } + void GaugeField::prefetch(QudaFieldLocation mem_space, qudaStream_t stream) const + { + if (location == QUDA_CUDA_FIELD_LOCATION && is_prefetch_enabled() && mem_type == QUDA_MEMORY_DEVICE) { + if (gauge) qudaMemPrefetchAsync(gauge, bytes, mem_space, stream); + if (!isNative()) { + for (int i = 0; i < nDim; i++) { + size_t nbytes = nFace * surface[i] * nInternal * precision; + if (ghost[i] && nbytes) qudaMemPrefetchAsync(ghost[i], nbytes, mem_space, stream); + if 
(ghost[i + 4] && nbytes && geometry == QUDA_COARSE_GEOMETRY) + qudaMemPrefetchAsync(ghost[i + 4], nbytes, mem_space, stream); + } + } + } + } + + void GaugeField::backup() const + { + if (backed_up) errorQuda("Gauge field already backed up"); + + if (location == QUDA_CUDA_FIELD_LOCATION) { + backup_h = new char[bytes]; + qudaMemcpy(backup_h, gauge, bytes, qudaMemcpyDefault); + } else { + if (order == QUDA_QDP_GAUGE_ORDER) { + char **buffer = new char *[geometry]; + for (int d = 0; d < geometry; d++) { + buffer[d] = new char[bytes / geometry]; + memcpy(buffer[d], gauge_qdp[d], bytes / geometry); + } + backup_h = reinterpret_cast(buffer); + } else { + backup_h = new char[bytes]; + memcpy(backup_h, gauge, bytes); + } + } + + backed_up = true; + } + + void GaugeField::restore() const + { + if (!backed_up) errorQuda("Cannot restore since not backed up"); + + if (location == QUDA_CUDA_FIELD_LOCATION) { + qudaMemcpy(gauge, backup_h, bytes, qudaMemcpyDefault); + delete[] backup_h; + } else { + if (order == QUDA_QDP_GAUGE_ORDER) { + char **buffer = reinterpret_cast(backup_h); + for (int d = 0; d < geometry; d++) { + memcpy(gauge_qdp[d], buffer[d], bytes / geometry); + delete[] buffer[d]; + } + delete[] buffer; + } else { + memcpy(gauge, backup_h, bytes); + delete[] backup_h; + } + } + backed_up = false; + } + } // namespace quda diff --git a/lib/gauge_stout.cu b/lib/gauge_stout.cu index af644ec238..f74f3fd50a 100644 --- a/lib/gauge_stout.cu +++ b/lib/gauge_stout.cu @@ -41,8 +41,8 @@ namespace quda { } } - void preTune() { if (out.Gauge_p() == in.Gauge_p()) out.backup(); } - void postTune() { if (out.Gauge_p() == in.Gauge_p()) out.restore(); } + void preTune() { if (out.data() == in.data()) out.backup(); } + void postTune() { if (out.data() == in.data()) out.restore(); } long long flops() const // just counts matrix multiplication { diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 5ff1191dac..9a8878499b 100644 --- a/lib/interface_quda.cpp +++ 
b/lib/interface_quda.cpp @@ -3324,11 +3324,11 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col // the split topology. if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loading gauge field...\n"); } if (!is_staggered) { - loadGaugeQuda(collected_gauge->Gauge_p(), gauge_param); + loadGaugeQuda(collected_gauge->data(), gauge_param); } else { // freeGaugeQuda(); - loadFatLongGaugeQuda(param, gauge_param, collected_milc_fatlink_field->Gauge_p(), - collected_milc_longlink_field->Gauge_p()); + loadFatLongGaugeQuda(param, gauge_param, collected_milc_fatlink_field->data(), + collected_milc_longlink_field->data()); } if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loaded gauge field...\n"); } @@ -4619,7 +4619,7 @@ void computeHISQForceQuda(void* const milc_momentum, if (*num_failures_h>0) errorQuda("Error in the unitarization component of the hisq fermion force: %d failures\n", *num_failures_h); - qudaMemset((void **)(cudaOutForce->Gauge_p()), 0, cudaOutForce->Bytes()); + cudaOutForce->zero(); // read in u-link cudaGauge->loadCPUField(cpuULink, profileHISQForce); diff --git a/lib/staggered_kd_build_xinv.cu b/lib/staggered_kd_build_xinv.cu index b109bc2388..34ddb23137 100644 --- a/lib/staggered_kd_build_xinv.cu +++ b/lib/staggered_kd_build_xinv.cu @@ -245,11 +245,11 @@ namespace quda { X_.copy(X); - blas::flops += invert((void*)xInvMilcOrder->Gauge_p(), (void*)X_.Gauge_p(), n, X_.Volume(), X_.Precision(), X.Location()); + blas::flops += invert(xInvMilcOrder->data(), X_.data(), n, X_.Volume(), X_.Precision(), X.Location()); } else if (location == QUDA_CPU_FIELD_LOCATION) { - blas::flops += invert((void*)xInvMilcOrder->Gauge_p(), (void*)X.Gauge_p(), n, X.Volume(), X.Precision(), X.Location()); + blas::flops += invert(xInvMilcOrder->data(), X.data(), n, X.Volume(), X.Precision(), X.Location()); } if (getVerbosity() >= QUDA_VERBOSE) printfQuda("xInvMilcOrder = %e\n", xInvMilcOrder->norm2(0)); diff --git 
a/lib/staggered_oprod.cu b/lib/staggered_oprod.cu index a1af1e9903..af1f520333 100644 --- a/lib/staggered_oprod.cu +++ b/lib/staggered_oprod.cu @@ -86,8 +86,8 @@ namespace quda { } } // apply - void preTune() { U.backup(); if (U.Gauge_p() != L.Gauge_p()) L.backup(); } - void postTune() { U.restore(); if (U.Gauge_p() != L.Gauge_p()) L.restore(); } + void preTune() { U.backup(); if (U.data() != L.data()) L.backup(); } + void postTune() { U.restore(); if (U.data() != L.data()) L.restore(); } long long flops() const { return 0; } // FIXME long long bytes() const { return 0; } // FIXME diff --git a/lib/unitarize_links_quda.cu b/lib/unitarize_links_quda.cu index fb08b7feb1..058dd91592 100644 --- a/lib/unitarize_links_quda.cu +++ b/lib/unitarize_links_quda.cu @@ -61,14 +61,14 @@ namespace quda { for (unsigned int i = 0; i < infield.Volume(); ++i) { for (int dir=0; dir<4; ++dir){ if (infield.Precision() == QUDA_SINGLE_PRECISION) { - copyArrayToLink(inlink, ((float*)(infield.Gauge_p()) + (i*4 + dir)*18)); // order of arguments? - if (unitarizeLinkNewton(outlink, inlink, max_iter_newton) == false ) num_failures++; - copyLinkToArray(((float*)(outfield.Gauge_p()) + (i*4 + dir)*18), outlink); - } else if (infield.Precision() == QUDA_DOUBLE_PRECISION) { - copyArrayToLink(inlink, ((double*)(infield.Gauge_p()) + (i*4 + dir)*18)); // order of arguments? - if (unitarizeLinkNewton(outlink, inlink, max_iter_newton) == false ) num_failures++; - copyLinkToArray(((double*)(outfield.Gauge_p()) + (i*4 + dir)*18), outlink); - } // precision? + copyArrayToLink(inlink, infield.data() + (i * 4 + dir) * 18); // order of arguments? + if (unitarizeLinkNewton(outlink, inlink, max_iter_newton) == false ) num_failures++; + copyLinkToArray(outfield.data() + (i * 4 + dir) * 18, outlink); + } else if (infield.Precision() == QUDA_DOUBLE_PRECISION) { + copyArrayToLink(inlink, infield.data() + (i * 4 + dir) * 18); // order of arguments? 
+ if (unitarizeLinkNewton(outlink, inlink, max_iter_newton) == false ) num_failures++; + copyLinkToArray(outfield.data() + (i * 4 + dir) * 18, outlink); + } // precision? } // dir } // loop over volume } @@ -82,10 +82,10 @@ namespace quda { for (unsigned int i = 0; i < field.Volume(); ++i) { for (int dir=0; dir<4; ++dir) { if (field.Precision() == QUDA_SINGLE_PRECISION) { - copyArrayToLink(link, ((float*)(field.Gauge_p()) + (i*4 + dir)*18)); // order of arguments? - } else if (field.Precision() == QUDA_DOUBLE_PRECISION) { - copyArrayToLink(link, ((double*)(field.Gauge_p()) + (i*4 + dir)*18)); // order of arguments? - } else { + copyArrayToLink(link, field.data() + (i * 4 + dir) * 18); // order of arguments? + } else if (field.Precision() == QUDA_DOUBLE_PRECISION) { + copyArrayToLink(link, field.data() + (i * 4 + dir) * 18); // order of arguments? + } else { errorQuda("Unsupported precision\n"); } if (link.isUnitary(max_error) == false) { @@ -126,9 +126,12 @@ namespace quda { UnitarizeArg(out, in, fails, max_iter, unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error)); } - void preTune() { if (in.Gauge_p() == out.Gauge_p()) out.backup(); } + void preTune() + { + if (in.data() == out.data()) out.backup(); + } void postTune() { - if (in.Gauge_p() == out.Gauge_p()) out.restore(); + if (in.data() == out.data()) out.restore(); qudaMemset(fails, 0, sizeof(int)); // reset fails counter } diff --git a/tests/gauge_force_test.cpp b/tests/gauge_force_test.cpp index 88df55beef..60dfa94cd0 100644 --- a/tests/gauge_force_test.cpp +++ b/tests/gauge_force_test.cpp @@ -116,7 +116,7 @@ void gauge_force_test(bool compute_force = true) auto U_qdp = new quda::cpuGaugeField(param); // fills the gauge field with random numbers - createSiteLinkCPU((void **)U_qdp->Gauge_p(), gauge_param.cpu_prec, 0); + createSiteLinkCPU(U_qdp->data(), gauge_param.cpu_prec, 0); param.order = QUDA_MILC_GAUGE_ORDER; auto U_milc = new quda::cpuGaugeField(param); @@ -134,7 
+134,7 @@ void gauge_force_test(bool compute_force = true) // initialize some data in cpuMom if (compute_force) { - createMomCPU(Mom_ref_milc->Gauge_p(), gauge_param.cpu_prec); + createMomCPU(Mom_ref_milc->data(), gauge_param.cpu_prec); if (gauge_order == QUDA_MILC_GAUGE_ORDER) Mom_milc->copy(*Mom_ref_milc); if (gauge_order == QUDA_QDP_GAUGE_ORDER) Mom_qdp->copy(*Mom_ref_milc); } @@ -142,11 +142,11 @@ void gauge_force_test(bool compute_force = true) void *sitelink = nullptr; if (gauge_order == QUDA_MILC_GAUGE_ORDER) { - sitelink = U_milc->Gauge_p(); - mom = Mom_milc->Gauge_p(); + sitelink = U_milc->data(); + mom = Mom_milc->data(); } else if (gauge_order == QUDA_QDP_GAUGE_ORDER) { - sitelink = U_qdp->Gauge_p(); - mom = Mom_qdp->Gauge_p(); + sitelink = U_qdp->data(); + mom = Mom_qdp->data(); } else { errorQuda("Unsupported gauge order %d", gauge_order); } @@ -180,14 +180,14 @@ void gauge_force_test(bool compute_force = true) // The number comes from CPU implementation in MILC, gauge_force_imp.c int flops = 153004; - void *refmom = Mom_ref_milc->Gauge_p(); + void *refmom = Mom_ref_milc->data(); int *check_out = compute_force ? 
&force_check : &path_check; if (verify_results) { - gauge_force_reference(refmom, eb3, (void **)U_qdp->Gauge_p(), gauge_param.cpu_prec, input_path_buf, length, + gauge_force_reference(refmom, eb3, U_qdp->data(), gauge_param.cpu_prec, input_path_buf, length, loop_coeff, num_paths, compute_force); - *check_out = compare_floats(Mom_milc->Gauge_p(), refmom, 4 * V * mom_site_size, getTolerance(cuda_prec), - gauge_param.cpu_prec); - if (compute_force) strong_check_mom(Mom_milc->Gauge_p(), refmom, 4 * V, gauge_param.cpu_prec); + *check_out + = compare_floats(Mom_milc->data(), refmom, 4 * V * mom_site_size, getTolerance(cuda_prec), gauge_param.cpu_prec); + if (compute_force) strong_check_mom(Mom_milc->data(), refmom, 4 * V, gauge_param.cpu_prec); } if (compute_force) { diff --git a/tests/heatbath_test.cpp b/tests/heatbath_test.cpp index 673712201f..d840fda4c7 100644 --- a/tests/heatbath_test.cpp +++ b/tests/heatbath_test.cpp @@ -159,7 +159,7 @@ int main(int argc, char **argv) gauge_param.gauge_order = gauge->Order(); gauge_param.location = QUDA_CUDA_FIELD_LOCATION; - loadGaugeQuda(gauge->Gauge_p(), &gauge_param); + loadGaugeQuda(gauge->data(), &gauge_param); } QudaGaugeObservableParam param = newQudaGaugeObservableParam(); @@ -189,7 +189,7 @@ int main(int argc, char **argv) gauge_param.gauge_order = gauge->Order(); gauge_param.location = QUDA_CUDA_FIELD_LOCATION; - loadGaugeQuda(gauge->Gauge_p(), &gauge_param); + loadGaugeQuda(gauge->data(), &gauge_param); gaugeObservablesQuda(¶m); printfQuda("step=0 plaquette = %e topological charge = %e\n", param.plaquette[0], param.qcharge); @@ -205,7 +205,7 @@ int main(int argc, char **argv) // copy into regular field copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION); - loadGaugeQuda(gauge->Gauge_p(), &gauge_param); + loadGaugeQuda(gauge->data(), &gauge_param); gaugeObservablesQuda(¶m); printfQuda("step=%d plaquette = %e topological charge = %e\n", step, param.plaquette[0], param.qcharge); diff --git 
a/tests/hisq_paths_force_test.cpp b/tests/hisq_paths_force_test.cpp index 6c32ca4853..e19d874e31 100644 --- a/tests/hisq_paths_force_test.cpp +++ b/tests/hisq_paths_force_test.cpp @@ -169,7 +169,7 @@ static void hisq_force_init() cpuGauge_ex = new cpuGaugeField(gParam_ex); if (gauge_order == QUDA_QDP_GAUGE_ORDER) { - createSiteLinkCPU((void **)cpuGauge->Gauge_p(), qudaGaugeParam.cpu_prec, 1); + createSiteLinkCPU(cpuGauge->data(), qudaGaugeParam.cpu_prec, 1); } else { errorQuda("Unsupported gauge order %d", gauge_order); } @@ -221,8 +221,6 @@ static void hisq_force_init() cpuMom = new cpuGaugeField(gParam); refMom = new cpuGaugeField(gParam); - // createMomCPU(cpuMom->Gauge_p(), mom_prec); - hw = safe_malloc(4 * cpuGauge->Volume() * hw_site_size * qudaGaugeParam.cpu_prec); createHwCPU(hw, hw_prec); @@ -232,9 +230,9 @@ static void hisq_force_init() gParam.order = gauge_order; gParam.pad = 0; cpuOprod = new cpuGaugeField(gParam); - computeLinkOrderedOuterProduct(hw, cpuOprod->Gauge_p(), hw_prec, 1, gauge_order); + computeLinkOrderedOuterProduct(hw, cpuOprod->data(), hw_prec, 1, gauge_order); cpuLongLinkOprod = new cpuGaugeField(gParam); - computeLinkOrderedOuterProduct(hw, cpuLongLinkOprod->Gauge_p(), hw_prec, 3, gauge_order); + computeLinkOrderedOuterProduct(hw, cpuLongLinkOprod->data(), hw_prec, 3, gauge_order); gParam_ex.location = QUDA_CPU_FIELD_LOCATION; gParam_ex.link_type = QUDA_GENERAL_LINKS; @@ -366,10 +364,9 @@ static int hisq_force_test(void) int accuracy_level = 3; if (verify_results) { - int res = compare_floats(cpuMom->Gauge_p(), refMom->Gauge_p(), 4 * cpuMom->Volume() * mom_site_size, 1e-5, + int res = compare_floats(cpuMom->data(), refMom->data(), 4 * cpuMom->Volume() * mom_site_size, 1e-5, qudaGaugeParam.cpu_prec); - accuracy_level - = strong_check_mom(cpuMom->Gauge_p(), refMom->Gauge_p(), 4 * cpuMom->Volume(), qudaGaugeParam.cpu_prec); + accuracy_level = strong_check_mom(cpuMom->data(), refMom->data(), 4 * cpuMom->Volume(), qudaGaugeParam.cpu_prec); 
printfQuda("Test %s\n", (1 == res) ? "PASSED" : "FAILED"); } double total_io; diff --git a/tests/hisq_unitarize_force_test.cpp b/tests/hisq_unitarize_force_test.cpp index 1ab7b6a71b..f6d68a9553 100644 --- a/tests/hisq_unitarize_force_test.cpp +++ b/tests/hisq_unitarize_force_test.cpp @@ -26,7 +26,7 @@ quda::cpuGaugeField *cpuReference = NULL; static QudaGaugeParam gaugeParam; // Create a field of links that are not su3_matrices -void createNoisyLinkCPU(void **field, QudaPrecision prec, int seed) +void createNoisyLinkCPU(void *const *field, QudaPrecision prec, int seed) { createSiteLinkCPU(field, prec, 0); @@ -77,8 +77,8 @@ static void hisq_force_init() seed += quda::comm_rank(); #endif - createNoisyLinkCPU((void **)cpuFatLink->Gauge_p(), gaugeParam.cpu_prec, seed); - createNoisyLinkCPU((void **)cpuOprod->Gauge_p(), gaugeParam.cpu_prec, seed + 1); + createNoisyLinkCPU(cpuFatLink->data(), gaugeParam.cpu_prec, seed); + createNoisyLinkCPU(cpuOprod->data(), gaugeParam.cpu_prec, seed + 1); gParam.location = QUDA_CUDA_FIELD_LOCATION; gParam.setPrecision(gaugeParam.cuda_prec, true); @@ -142,7 +142,7 @@ TEST(hisq_force_unitarize, verify) double accuracy = prec == QUDA_DOUBLE_PRECISION ? 
1e-10 : 1e-5; for (int dir = 0; dir < 4; ++dir) { - res[dir] = compare_floats(((char **)cpuReference->Gauge_p())[dir], ((char **)cpuResult->Gauge_p())[dir], + res[dir] = compare_floats(cpuReference->data()[dir], cpuResult->data()[dir], cpuReference->Volume() * gauge_site_size, accuracy, gaugeParam.cpu_prec); quda::comm_allreduce_int(res[dir]); diff --git a/tests/host_reference/domain_wall_dslash_reference.cpp b/tests/host_reference/domain_wall_dslash_reference.cpp index c42b0e4a71..46d2620ce0 100644 --- a/tests/host_reference/domain_wall_dslash_reference.cpp +++ b/tests/host_reference/domain_wall_dslash_reference.cpp @@ -746,8 +746,8 @@ void mdw_eofa_m5inv(void *res, void *spinorField, int oddBit, int daggerBit, dou // this actually applies the preconditioned dslash, e.g., D_ee^{-1} D_eo or D_oo^{-1} D_oe #ifndef MULTI_GPU -void dw_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &, - double mferm) +void dw_dslash(void *out, void *const *gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, + QudaGaugeParam &, double mferm) { if (precision == QUDA_DOUBLE_PRECISION) { dslashReference_4d_sgpu((double *)out, (double **)gauge, (double *)in, oddBit, daggerBit); @@ -758,10 +758,10 @@ void dw_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, Qud } } #else -void dw_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, +void dw_dslash(void *out, void *const *gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm) { - GaugeFieldParam gauge_field_param(gauge_param, gauge); + GaugeFieldParam gauge_field_param(gauge_param, (void **)gauge); gauge_field_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; cpuGaugeField cpu(gauge_field_param); void **ghostGauge = (void **)cpu.Ghost(); @@ -815,7 +815,7 @@ void dw_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, Qud #endif #ifndef 
MULTI_GPU -void dslash_4_4d(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, +void dslash_4_4d(void *out, void *const *gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &, double) { if (precision == QUDA_DOUBLE_PRECISION) { @@ -825,10 +825,10 @@ void dslash_4_4d(void *out, void **gauge, void *in, int oddBit, int daggerBit, Q } } #else -void dslash_4_4d(void *out, void **gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, +void dslash_4_4d(void *out, void *const *gauge, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param, double) { - GaugeFieldParam gauge_field_param(gauge_param, gauge); + GaugeFieldParam gauge_field_param(gauge_param, (void **)gauge); gauge_field_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; cpuGaugeField cpu(gauge_field_param); void **ghostGauge = (void **)cpu.Ghost(); @@ -879,8 +879,8 @@ void dslash_4_4d(void *out, void **gauge, void *in, int oddBit, int daggerBit, Q } #endif -void dw_dslash_5_4d(void *out, void **, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &, - double mferm, bool zero_initialize) +void dw_dslash_5_4d(void *out, void *const *, void *in, int oddBit, int daggerBit, QudaPrecision precision, + QudaGaugeParam &, double mferm, bool zero_initialize) { if (precision == QUDA_DOUBLE_PRECISION) { if (zero_initialize) @@ -895,8 +895,8 @@ void dw_dslash_5_4d(void *out, void **, void *in, int oddBit, int daggerBit, Qud } } -void dslash_5_inv(void *out, void **, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &, - double mferm, double *kappa) +void dslash_5_inv(void *out, void *const *, void *in, int oddBit, int daggerBit, QudaPrecision precision, + QudaGaugeParam &, double mferm, double *kappa) { if (precision == QUDA_DOUBLE_PRECISION) { dslashReference_5th_inv((double *)out, (double *)in, oddBit, daggerBit, mferm, kappa); @@ -905,7 +905,7 @@ void 
dslash_5_inv(void *out, void **, void *in, int oddBit, int daggerBit, QudaP } } -void mdw_dslash_5_inv(void *out, void **, void *in, int oddBit, int daggerBit, QudaPrecision precision, +void mdw_dslash_5_inv(void *out, void *const *, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &, double mferm, double _Complex *kappa) { if (precision == QUDA_DOUBLE_PRECISION) { @@ -915,8 +915,8 @@ void mdw_dslash_5_inv(void *out, void **, void *in, int oddBit, int daggerBit, Q } } -void mdw_dslash_5(void *out, void **, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &, - double mferm, double _Complex *kappa, bool zero_initialize) +void mdw_dslash_5(void *out, void *const *, void *in, int oddBit, int daggerBit, QudaPrecision precision, + QudaGaugeParam &, double mferm, double _Complex *kappa, bool zero_initialize) { if (precision == QUDA_DOUBLE_PRECISION) { if (zero_initialize) @@ -935,7 +935,7 @@ void mdw_dslash_5(void *out, void **, void *in, int oddBit, int daggerBit, QudaP } } -void mdw_dslash_4_pre(void *out, void **, void *in, int oddBit, int daggerBit, QudaPrecision precision, +void mdw_dslash_4_pre(void *out, void *const *, void *in, int oddBit, int daggerBit, QudaPrecision precision, QudaGaugeParam &, double mferm, double _Complex *b5, double _Complex *c5, bool zero_initialize) { if (precision == QUDA_DOUBLE_PRECISION) { @@ -960,7 +960,7 @@ void mdw_dslash_4_pre(void *out, void **, void *in, int oddBit, int daggerBit, Q } } -void dw_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, +void dw_mat(void *out, void *const *gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm) { @@ -976,7 +976,7 @@ void dw_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, Qud xpay(in, -kappa, out, V5 * spinor_site_size, precision); } -void dw_4d_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, 
QudaPrecision precision, +void dw_4d_mat(void *out, void *const *gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm) { @@ -995,7 +995,7 @@ void dw_4d_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, xpay(in, -kappa, out, V5 * spinor_site_size, precision); } -void mdw_mat(void *out, void **gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c, int dagger, +void mdw_mat(void *out, void *const *gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double _Complex *b5, double _Complex *c5) { void *tmp = safe_malloc(V5h * spinor_site_size * precision); @@ -1042,9 +1042,9 @@ void mdw_mat(void *out, void **gauge, void *in, double _Complex *kappa_b, double host_free(tmp); } -void mdw_eofa_mat(void *out, void **gauge, void *in, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param, - double mferm, double m5, double b, double c, double mq1, double mq2, double mq3, int eofa_pm, - double eofa_shift) +void mdw_eofa_mat(void *out, void *const *gauge, void *in, int dagger, QudaPrecision precision, + QudaGaugeParam &gauge_param, double mferm, double m5, double b, double c, double mq1, double mq2, + double mq3, int eofa_pm, double eofa_shift) { void *tmp = safe_malloc(V5h * spinor_site_size * precision); @@ -1096,7 +1096,7 @@ void mdw_eofa_mat(void *out, void **gauge, void *in, int dagger, QudaPrecision p host_free(tmp); } // -void dw_matdagmat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, +void dw_matdagmat(void *out, void *const *gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm) { void *tmp = safe_malloc(V5 * spinor_site_size * precision); @@ -1108,7 +1108,7 @@ void dw_matdagmat(void *out, void **gauge, void *in, double kappa, int dagger_bi host_free(tmp); } -void 
dw_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger_bit, +void dw_matpc(void *out, void *const *gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm) { void *tmp = safe_malloc(V5h * spinor_site_size * precision); @@ -1128,7 +1128,7 @@ void dw_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType mat host_free(tmp); } -void dw_4d_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger_bit, +void dw_4d_matpc(void *out, void *const *gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm) { double kappa2 = -kappa * kappa; @@ -1168,7 +1168,7 @@ void dw_4d_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType host_free(kappa5); } -void mdw_matpc(void *out, void **gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c, +void mdw_matpc(void *out, void *const *gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double _Complex *b5, double _Complex *c5) { @@ -1240,9 +1240,9 @@ void mdw_matpc(void *out, void **gauge, void *in, double _Complex *kappa_b, doub host_free(kappa_mdwf); } -void mdw_eofa_matpc(void *out, void **gauge, void *in, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, - QudaGaugeParam &gauge_param, double mferm, double m5, double b, double c, double mq1, double mq2, - double mq3, int eofa_pm, double eofa_shift) +void mdw_eofa_matpc(void *out, void *const *gauge, void *in, QudaMatPCType matpc_type, int dagger, + QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double m5, double b, double c, + double mq1, double mq2, double mq3, int eofa_pm, double eofa_shift) { void *tmp = safe_malloc(V5h * spinor_site_size * 
precision); @@ -1311,14 +1311,14 @@ void mdw_eofa_matpc(void *out, void **gauge, void *in, QudaMatPCType matpc_type, host_free(tmp); } -void mdw_mdagm_local(void *out, void **gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c, +void mdw_mdagm_local(void *out, void *const *gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c, QudaMatPCType matpc_type, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double _Complex *b5, double _Complex *c5) { lat_dim_t R; for (int d = 0; d < 4; d++) { R[d] = comm_dim_partitioned(d) ? 2 : 0; } - cpuGaugeField *padded_gauge = createExtendedGauge(gauge, gauge_param, R); + cpuGaugeField *padded_gauge = createExtendedGauge((void **)gauge, gauge_param, R); int padded_V = 1; int W[4]; @@ -1357,7 +1357,7 @@ void mdw_mdagm_local(void *out, void **gauge, void *in, double _Complex *kappa_b QudaGaugeParam padded_gauge_param(gauge_param); for (int d = 0; d < 4; d++) { padded_gauge_param.X[d] += 2 * R[d]; } - void **padded_gauge_p = (void **)(padded_gauge->Gauge_p()); + auto padded_gauge_p = padded_gauge->data(); // Extend these global variables then restore them int V5_old = V5; @@ -1458,7 +1458,7 @@ void MatPCDag(sFloat *outEven, gFloat **gauge, sFloat *inEven, sFloat kappa, } */ -void matpc(void *, void **, void *, double, QudaMatPCType, int, QudaPrecision, QudaPrecision, double) +void matpc(void *, void *const *, void *, double, QudaMatPCType, int, QudaPrecision, QudaPrecision, double) { /* if (!dagger_bit) { @@ -1513,7 +1513,7 @@ void MatPCDagMatPC(sFloat *out, gFloat **gauge, sFloat *in, sFloat kappa, } */ // Wrapper to templates that handles different precisions. 
-void matdagmat(void *, void **, void *, double, QudaPrecision, QudaPrecision, double) +void matdagmat(void *, void *const *, void *, double, QudaPrecision, QudaPrecision, double) { /* if (sPrecision == QUDA_DOUBLE_PRECISION) { @@ -1533,7 +1533,7 @@ void matdagmat(void *, void **, void *, double, QudaPrecision, QudaPrecision, do } // Wrapper to templates that handles different precisions. -void matpcdagmatpc(void *, void **, void *, double, QudaPrecision, QudaPrecision, double, QudaMatPCType) +void matpcdagmatpc(void *, void *const *, void *, double, QudaPrecision, QudaPrecision, double, QudaMatPCType) { /* if (sPrecision == QUDA_DOUBLE_PRECISION) { diff --git a/tests/host_reference/domain_wall_dslash_reference.h b/tests/host_reference/domain_wall_dslash_reference.h index 3751fe88f4..4e6ff1edfb 100644 --- a/tests/host_reference/domain_wall_dslash_reference.h +++ b/tests/host_reference/domain_wall_dslash_reference.h @@ -8,51 +8,51 @@ extern "C" { #endif -void dw_dslash(void *res, void **gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision, +void dw_dslash(void *res, void *const *gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision, QudaGaugeParam ¶m, double mferm); -void dslash_4_4d(void *res, void **gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision, +void dslash_4_4d(void *res, void *const *gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision, QudaGaugeParam ¶m, double mferm); -void dw_dslash_5_4d(void *res, void **gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision, - QudaGaugeParam ¶m, double mferm, bool zero_initialize); +void dw_dslash_5_4d(void *res, void *const *gaugeFull, void *spinorField, int oddBit, int dagger, + QudaPrecision precision, QudaGaugeParam ¶m, double mferm, bool zero_initialize); -void dslash_5_inv(void *res, void **gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision, +void 
dslash_5_inv(void *res, void *const *gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision, QudaGaugeParam ¶m, double mferm, double *kappa); -void mdw_dslash_5_inv(void *res, void **gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision, - QudaGaugeParam ¶m, double mferm, double _Complex *kappa); +void mdw_dslash_5_inv(void *res, void *const *gaugeFull, void *spinorField, int oddBit, int dagger, + QudaPrecision precision, QudaGaugeParam ¶m, double mferm, double _Complex *kappa); -void mdw_dslash_5(void *res, void **gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision, +void mdw_dslash_5(void *res, void *const *gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision, QudaGaugeParam ¶m, double mferm, double _Complex *kappa, bool zero_initialize); -void mdw_dslash_4_pre(void *res, void **gaugeFull, void *spinorField, int oddBit, int dagger, QudaPrecision precision, - QudaGaugeParam ¶m, double mferm, double _Complex *b5, double _Complex *c5, - bool zero_initialize); +void mdw_dslash_4_pre(void *res, void *const *gaugeFull, void *spinorField, int oddBit, int dagger, + QudaPrecision precision, QudaGaugeParam ¶m, double mferm, double _Complex *b5, + double _Complex *c5, bool zero_initialize); -void dw_mat(void *out, void **gauge, void *in, double kappa, int dagger, QudaPrecision precision, QudaGaugeParam ¶m, - double mferm); +void dw_mat(void *out, void *const *gauge, void *in, double kappa, int dagger, QudaPrecision precision, + QudaGaugeParam ¶m, double mferm); -void dw_4d_mat(void *out, void **gauge, void *in, double kappa, int dagger, QudaPrecision precision, +void dw_4d_mat(void *out, void *const *gauge, void *in, double kappa, int dagger, QudaPrecision precision, QudaGaugeParam ¶m, double mferm); -void mdw_mat(void *out, void **gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c, int dagger, +void mdw_mat(void *out, void *const *gauge, void *in, double 
_Complex *kappa_b, double _Complex *kappa_c, int dagger, QudaPrecision precision, QudaGaugeParam ¶m, double mferm, double _Complex *b5, double _Complex *c5); -void dw_matdagmat(void *out, void **gauge, void *in, double kappa, int dagger, QudaPrecision precision, +void dw_matdagmat(void *out, void *const *gauge, void *in, double kappa, int dagger, QudaPrecision precision, QudaGaugeParam ¶m, double mferm); -void dw_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger, +void dw_matpc(void *out, void *const *gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm); -void dw_4d_matpc(void *out, void **gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger, +void dw_4d_matpc(void *out, void *const *gauge, void *in, double kappa, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm); -void mdw_matpc(void *out, void **gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c, +void mdw_matpc(void *out, void *const *gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double _Complex *b5, double _Complex *c5); -void mdw_mdagm_local(void *out, void **gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c, +void mdw_mdagm_local(void *out, void *const *gauge, void *in, double _Complex *kappa_b, double _Complex *kappa_c, QudaMatPCType matpc_type, QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double _Complex *b5, double _Complex *c5); void mdw_eofa_m5(void *res, void *spinorField, int oddBit, int daggerBit, double mferm, double m5, double b, double c, @@ -61,13 +61,13 @@ void mdw_eofa_m5(void *res, void *spinorField, int oddBit, int daggerBit, double void mdw_eofa_m5inv(void *res, void *spinorField, int oddBit, int daggerBit, double 
mferm, double m5, double b, double c, double mq1, double mq2, double mq3, int eofa_pm, double eofa_shift, QudaPrecision precision); -void mdw_eofa_mat(void *out, void **gauge, void *in, int dagger, QudaPrecision precision, QudaGaugeParam &gauge_param, - double mferm, double m5, double b, double c, double mq1, double mq2, double mq3, int eofa_pm, - double eofa_shift); +void mdw_eofa_mat(void *out, void *const *gauge, void *in, int dagger, QudaPrecision precision, + QudaGaugeParam &gauge_param, double mferm, double m5, double b, double c, double mq1, double mq2, + double mq3, int eofa_pm, double eofa_shift); -void mdw_eofa_matpc(void *out, void **gauge, void *in, QudaMatPCType matpc_type, int dagger, QudaPrecision precision, - QudaGaugeParam &gauge_param, double mferm, double m5, double b, double c, double mq1, double mq2, - double mq3, int eofa_pm, double eofa_shift); +void mdw_eofa_matpc(void *out, void *const *gauge, void *in, QudaMatPCType matpc_type, int dagger, + QudaPrecision precision, QudaGaugeParam &gauge_param, double mferm, double m5, double b, double c, + double mq1, double mq2, double mq3, int eofa_pm, double eofa_shift); #ifdef __cplusplus } diff --git a/tests/host_reference/gauge_force_reference.cpp b/tests/host_reference/gauge_force_reference.cpp index ef1ce87b77..ffe8cc4494 100644 --- a/tests/host_reference/gauge_force_reference.cpp +++ b/tests/host_reference/gauge_force_reference.cpp @@ -369,7 +369,7 @@ static void update_gauge(su3_matrix *gauge, int dir, su3_matrix **sitelink, su3_ /* This function only computes one direction @dir * */ -void gauge_force_reference_dir(void *refMom, int dir, double eb3, void **sitelink, void **sitelink_ex, +void gauge_force_reference_dir(void *refMom, int dir, double eb3, void *const *sitelink, void *const *sitelink_ex, QudaPrecision prec, int **path_dir, int *length, void *loop_coeff, int num_paths, const lattice_t &lat, bool compute_force) { @@ -405,8 +405,8 @@ void gauge_force_reference_dir(void *refMom, int dir, 
double eb3, void **sitelin host_free(staple); } -void gauge_force_reference(void *refMom, double eb3, void **sitelink, QudaPrecision prec, int ***path_dir, int *length, - void *loop_coeff, int num_paths, bool compute_force) +void gauge_force_reference(void *refMom, double eb3, void *const *const sitelink, QudaPrecision prec, int ***path_dir, + int *length, void *loop_coeff, int num_paths, bool compute_force) { // created extended field quda::lat_dim_t R; @@ -420,7 +420,7 @@ void gauge_force_reference(void *refMom, double eb3, void **sitelink, QudaPrecis lattice_t lat(*qdp_ex); for (int dir = 0; dir < 4; dir++) { - gauge_force_reference_dir(refMom, dir, eb3, sitelink, (void **)qdp_ex->Gauge_p(), prec, path_dir[dir], length, + gauge_force_reference_dir(refMom, dir, eb3, sitelink, qdp_ex->data(), prec, path_dir[dir], length, loop_coeff, num_paths, lat, compute_force); } diff --git a/tests/host_reference/gauge_force_reference.h b/tests/host_reference/gauge_force_reference.h index 4bf04f3f52..44106e5427 100644 --- a/tests/host_reference/gauge_force_reference.h +++ b/tests/host_reference/gauge_force_reference.h @@ -1,4 +1,4 @@ #pragma once -void gauge_force_reference(void *refMom, double eb3, void **sitelink, QudaPrecision prec, int ***path_dir, int *length, - void *loop_coeff, int num_paths, bool compute_force); +void gauge_force_reference(void *refMom, double eb3, void *const *sitelink, QudaPrecision prec, int ***path_dir, + int *length, void *loop_coeff, int num_paths, bool compute_force); diff --git a/tests/host_reference/hisq_force_reference.cpp b/tests/host_reference/hisq_force_reference.cpp index d4cb82e0c8..f3f080bed8 100644 --- a/tests/host_reference/hisq_force_reference.cpp +++ b/tests/host_reference/hisq_force_reference.cpp @@ -1266,12 +1266,12 @@ void hisqStaplesForceCPU(const double *path_coeff, const QudaGaugeParam ¶m, act_path_coeff.lepage = path_coeff[5]; if (param.cpu_prec == QUDA_DOUBLE_PRECISION) { - doHisqStaplesForceCPU(param.X, act_path_coeff, 
(double *)oprod.Gauge_p(), (double *)link.Gauge_p(), - (double **)tempmat, (double *)newOprod->Gauge_p()); + doHisqStaplesForceCPU(param.X, act_path_coeff, oprod.data(), link.data(), + (double **)tempmat, newOprod->data()); } else if (param.cpu_prec == QUDA_SINGLE_PRECISION) { - doHisqStaplesForceCPU(param.X, act_path_coeff, (float *)oprod.Gauge_p(), (float *)link.Gauge_p(), - (float **)tempmat, (float *)newOprod->Gauge_p()); + doHisqStaplesForceCPU(param.X, act_path_coeff, oprod.data(), link.data(), + (float **)tempmat, newOprod->data()); } else { errorQuda("Unsupported precision"); } @@ -1350,11 +1350,11 @@ void hisqLongLinkForceCPU(double coeff, const QudaGaugeParam ¶m, quda::cpuGa { for (int sig = 0; sig < 4; ++sig) { if (param.cpu_prec == QUDA_SINGLE_PRECISION) { - computeLongLinkField(param.X, (float *)oprod.Gauge_p(), (float *)link.Gauge_p(), sig, coeff, - (float *)newOprod->Gauge_p()); + computeLongLinkField(param.X, (float *)oprod.data(), link.data(), sig, coeff, + newOprod->data()); } else if (param.cpu_prec == QUDA_DOUBLE_PRECISION) { - computeLongLinkField(param.X, (double *)oprod.Gauge_p(), (double *)link.Gauge_p(), sig, coeff, - (double *)newOprod->Gauge_p()); + computeLongLinkField(param.X, oprod.data(), link.data(), sig, coeff, + newOprod->data()); } else { errorQuda("Unrecognised precision\n"); } @@ -1405,10 +1405,9 @@ void hisqCompleteForceCPU(const QudaGaugeParam ¶m, quda::cpuGaugeField &opro { for (int sig = 0; sig < 4; ++sig) { if (param.cpu_prec == QUDA_SINGLE_PRECISION) { - completeForceField(param.X, (float *)oprod.Gauge_p(), (float *)link.Gauge_p(), sig, (float *)mom->Gauge_p()); + completeForceField(param.X, oprod.data(), link.data(), sig, mom->data()); } else if (param.cpu_prec == QUDA_DOUBLE_PRECISION) { - completeForceField(param.X, (double *)oprod.Gauge_p(), (double *)link.Gauge_p(), sig, - (double *)mom->Gauge_p()); + completeForceField(param.X, oprod.data(), link.data(), sig, mom->data()); } else { errorQuda("Unrecognised 
precision\n"); } diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp index 95c104be99..40860dc308 100644 --- a/tests/host_reference/staggered_dslash_reference.cpp +++ b/tests/host_reference/staggered_dslash_reference.cpp @@ -126,8 +126,8 @@ void staggeredDslashReference(sFloat *res, gFloat **fatlink, gFloat **longlink, } // right-hand-side } -void staggeredDslash(ColorSpinorField &out, void **fatlink, void **longlink, void **ghost_fatlink, - void **ghost_longlink, const ColorSpinorField &in, int oddBit, int daggerBit, +void staggeredDslash(ColorSpinorField &out, void *const *fatlink, void *const *longlink, void *const *ghost_fatlink, + void *const *ghost_longlink, const ColorSpinorField &in, int oddBit, int daggerBit, QudaPrecision sPrecision, QudaPrecision gPrecision, QudaDslashType dslash_type) { const int nSrc = in.X(4); @@ -144,8 +144,8 @@ void staggeredDslash(ColorSpinorField &out, void **fatlink, void **longlink, voi in.exchangeGhost(otherparity, nFace, daggerBit); - void **fwd_nbr_spinor = in.fwdGhostFaceBuffer; - void **back_nbr_spinor = in.backGhostFaceBuffer; + auto fwd_nbr_spinor = in.fwdGhostFaceBuffer; + auto back_nbr_spinor = in.backGhostFaceBuffer; if (sPrecision == QUDA_DOUBLE_PRECISION) { if (gPrecision == QUDA_DOUBLE_PRECISION) { @@ -170,8 +170,8 @@ void staggeredDslash(ColorSpinorField &out, void **fatlink, void **longlink, voi } } -void staggeredMatDagMat(ColorSpinorField &out, void **fatlink, void **longlink, void **ghost_fatlink, - void **ghost_longlink, const ColorSpinorField &in, double mass, int dagger_bit, +void staggeredMatDagMat(ColorSpinorField &out, void *const *fatlink, void *const *longlink, void *const *ghost_fatlink, + void *const *ghost_longlink, const ColorSpinorField &in, double mass, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision, ColorSpinorField &tmp, QudaParity parity, QudaDslashType dslash_type) { diff --git 
a/tests/host_reference/staggered_dslash_reference.h b/tests/host_reference/staggered_dslash_reference.h index 54d40fdc0d..2d47138dc0 100644 --- a/tests/host_reference/staggered_dslash_reference.h +++ b/tests/host_reference/staggered_dslash_reference.h @@ -16,11 +16,11 @@ void staggeredDslashReference(sFloat *res, gFloat **fatlink, gFloat **longlink, gFloat **ghostLonglink, sFloat *spinorField, sFloat **fwd_nbr_spinor, sFloat **back_nbr_spinor, int oddBit, int daggerBit, int nSrc, QudaDslashType dslash_type); -void staggeredDslash(ColorSpinorField &out, void **fatlink, void **longlink, void **ghost_fatlink, - void **ghost_longlink, const ColorSpinorField &in, int oddBit, int daggerBit, +void staggeredDslash(ColorSpinorField &out, void *const *fatlink, void *const *longlink, void *const *ghost_fatlink, + void *const *ghost_longlink, const ColorSpinorField &in, int oddBit, int daggerBit, QudaPrecision sPrecision, QudaPrecision gPrecision, QudaDslashType dslash_type); -void staggeredMatDagMat(ColorSpinorField &out, void **fatlink, void **longlink, void **ghost_fatlink, - void **ghost_longlink, const ColorSpinorField &in, double mass, int dagger_bit, +void staggeredMatDagMat(ColorSpinorField &out, void *const *fatlink, void *const *longlink, void *const *ghost_fatlink, + void *const *ghost_longlink, const ColorSpinorField &in, double mass, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision, ColorSpinorField &tmp, QudaParity parity, QudaDslashType dslash_type); diff --git a/tests/multigrid_evolve_test.cpp b/tests/multigrid_evolve_test.cpp index d8c72f19fc..2fd02228a0 100644 --- a/tests/multigrid_evolve_test.cpp +++ b/tests/multigrid_evolve_test.cpp @@ -270,7 +270,7 @@ int main(int argc, char **argv) // load the gauge field from gauge gauge_param.gauge_order = gauge->Order(); gauge_param.location = QUDA_CUDA_FIELD_LOCATION; - loadGaugeQuda(gauge->Gauge_p(), &gauge_param); + loadGaugeQuda(gauge->data(), &gauge_param); gaugeObservablesQuda(&obs_param); // 
Demonstrate MG evolution on an evolving gauge field @@ -318,7 +318,7 @@ int main(int argc, char **argv) // Copy into regular field copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION); - loadGaugeQuda(gauge->Gauge_p(), &gauge_param); + loadGaugeQuda(gauge->data(), &gauge_param); if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { constructHostCloverField(clover, clover_inv, inv_param); @@ -384,7 +384,7 @@ int main(int argc, char **argv) // copy into regular field copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION); - loadGaugeQuda(gauge->Gauge_p(), &gauge_param); + loadGaugeQuda(gauge->data(), &gauge_param); // Recompute Gauge Observables gaugeObservablesQuda(&obs_param); diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h index 831351a36d..c6379c3342 100644 --- a/tests/staggered_dslash_test_utils.h +++ b/tests/staggered_dslash_test_utils.h @@ -71,7 +71,8 @@ struct StaggeredDslashTestWrapper { // In the HISQ case, we include building fat/long links in this unit test void *qdp_fatlink_cpu[4] = {nullptr, nullptr, nullptr, nullptr}; void *qdp_longlink_cpu[4] = {nullptr, nullptr, nullptr, nullptr}; - void **ghost_fatlink_cpu, **ghost_longlink_cpu; + void *const *ghost_fatlink_cpu; + void *const *ghost_longlink_cpu; QudaParity parity = QUDA_EVEN_PARITY; diff --git a/tests/unitarize_link_test.cpp b/tests/unitarize_link_test.cpp index a0322e397b..1c4849ba4a 100644 --- a/tests/unitarize_link_test.cpp +++ b/tests/unitarize_link_test.cpp @@ -42,8 +42,8 @@ TEST(unitarization, verify) unitarizeLinksCPU(*cpuULink, *cpuFatLink); cudaULink->saveCPUField(*cudaResult); - int res = compare_floats(cudaResult->Gauge_p(), cpuULink->Gauge_p(), 4 * cudaResult->Volume() * gauge_site_size, - unittol, cpu_prec); + int res = compare_floats(cudaResult->data(), cpuULink->data(), 4 * cudaResult->Volume() * gauge_site_size, unittol, + cpu_prec); #ifdef MULTI_GPU quda::comm_allreduce_int(res); diff 
--git a/tests/utils/host_utils.cpp b/tests/utils/host_utils.cpp index ce188dbdcc..4df1882297 100644 --- a/tests/utils/host_utils.cpp +++ b/tests/utils/host_utils.cpp @@ -1272,7 +1272,7 @@ void check_gauge(void **oldG, void **newG, double epsilon, QudaPrecision precisi checkGauge((float **)oldG, (float **)newG, epsilon); } -void createSiteLinkCPU(void **link, QudaPrecision precision, int phase) +void createSiteLinkCPU(void *const *link, QudaPrecision precision, int phase) { if (precision == QUDA_DOUBLE_PRECISION) { constructUnitaryGaugeField((double **)link); diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h index 569cae8643..4d9b284e0a 100644 --- a/tests/utils/host_utils.h +++ b/tests/utils/host_utils.h @@ -154,7 +154,7 @@ int fullLatticeIndex(int i, int oddBit); int fullLatticeIndex(int dim[], int index, int oddBit); int getOddBit(int X); -void createSiteLinkCPU(void **link, QudaPrecision precision, int phase); +void createSiteLinkCPU(void *const *const link, QudaPrecision precision, int phase); void su3_construct(void *mat, QudaReconstructType reconstruct, QudaPrecision precision); void su3_reconstruct(void *mat, int dir, int ga_idx, QudaReconstructType reconstruct, QudaPrecision precision, QudaGaugeParam *param); diff --git a/tests/utils/staggered_host_utils.cpp b/tests/utils/staggered_host_utils.cpp index 118b849d17..3750fe05bc 100644 --- a/tests/utils/staggered_host_utils.cpp +++ b/tests/utils/staggered_host_utils.cpp @@ -490,7 +490,7 @@ void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, vo unitarizeLinksCPU(*cpuWLink, *cpuVLink); // Copy back into "w_reflink" - reorderMILCtoQDP(w_reflink, cpuWLink->Gauge_p(), V, gauge_site_size, prec, prec); + reorderMILCtoQDP(w_reflink, cpuWLink->data(), V, gauge_site_size, prec, prec); // Clean up cpuGaugeFields, we don't need them anymore. 
delete cpuVLink; From 53b7517b3e7ae34650339170e23f43f0fe3d8e50 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 21 Jul 2022 14:44:15 -0700 Subject: [PATCH 02/99] Improve error reporting when vol_string exceeds max size --- lib/lattice_field.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/lattice_field.cpp b/lib/lattice_field.cpp index a2a9340dd6..c657a89e3b 100644 --- a/lib/lattice_field.cpp +++ b/lib/lattice_field.cpp @@ -557,7 +557,8 @@ namespace quda { vol_ss << x[0]; for (int d = 1; d < nDim; d++) vol_ss << "x" << x[d]; vol_string = vol_ss.str(); - if (vol_string.size() >= TuneKey::volume_n) errorQuda("Vol string too large %lu", vol_string.size()); + if (vol_string.size() >= TuneKey::volume_n) + errorQuda("Vol string %s (size = %lu) larger than maximum %d", vol_string.c_str(), vol_string.size(), TuneKey::volume_n); } void LatticeField::checkField(const LatticeField &a) const { From c3fb2eb4959cf4cd6bde07bacb2ba144a463ed61 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Mon, 8 May 2023 11:31:43 -0700 Subject: [PATCH 03/99] Significant rework of memory allocation to facilitate gauge field unification. Introduced new memory allocation wrapper quda_ptr, which is deployed for gauge field allocations. 
Still a WIP --- include/enum_quda.h | 5 +- include/enum_quda_fortran.h | 5 - include/gauge_field.h | 80 +++--- include/gauge_field_order.h | 26 +- include/lattice_field.h | 6 +- include/malloc_quda.h | 78 ++++++ include/quda_api.h | 21 +- lib/coarse_op_preconditioned.cu | 3 +- lib/color_spinor_field.cpp | 8 +- lib/cpu_gauge_field.cpp | 163 +++--------- lib/cuda_gauge_field.cpp | 136 +--------- lib/gauge_field.cpp | 249 ++++++++++++------ lib/interface_quda.cpp | 2 +- lib/lattice_field.cpp | 10 +- lib/targets/cuda/malloc.cpp | 143 ++++++++++ lib/targets/cuda/quda_api.cpp | 21 ++ tests/covdev_test.cpp | 7 +- tests/gauge_force_test.cpp | 13 +- tests/hisq_paths_force_test.cpp | 2 +- tests/hisq_unitarize_force_test.cpp | 12 +- tests/host_reference/covdev_reference.cpp | 41 +-- tests/host_reference/covdev_reference.h | 12 +- .../domain_wall_dslash_reference.cpp | 6 +- tests/host_reference/dslash_reference.cpp | 10 +- tests/host_reference/dslash_reference.h | 6 +- .../host_reference/gauge_force_reference.cpp | 7 +- tests/host_reference/gauge_force_reference.h | 4 +- .../wilson_dslash_reference.cpp | 3 +- tests/multigrid_evolve_test.cpp | 39 ++- tests/staggered_dslash_test_utils.h | 8 +- tests/staggered_invert_test.cpp | 6 +- tests/utils/host_utils.cpp | 6 + tests/utils/host_utils.h | 1 + tests/utils/misc.cpp | 4 +- 34 files changed, 658 insertions(+), 485 deletions(-) diff --git a/include/enum_quda.h b/include/enum_quda.h index 62f580e50d..665cffbf91 100644 --- a/include/enum_quda.h +++ b/include/enum_quda.h @@ -10,8 +10,11 @@ typedef enum qudaError_t { QUDA_SUCCESS = 0, QUDA_ERROR = 1, QUDA_ERROR_UNINITIA typedef enum QudaMemoryType_s { QUDA_MEMORY_DEVICE, - QUDA_MEMORY_PINNED, + QUDA_MEMORY_DEVICE_PINNED, + QUDA_MEMORY_HOST, + QUDA_MEMORY_HOST_PINNED, QUDA_MEMORY_MAPPED, + QUDA_MEMORY_MANAGED, QUDA_MEMORY_INVALID = QUDA_INVALID_ENUM } QudaMemoryType; diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h index 5e17a9df8f..21da3c138b 100644 --- 
a/include/enum_quda_fortran.h +++ b/include/enum_quda_fortran.h @@ -17,11 +17,6 @@ #define QUDA_ERROR 1 #define QUDA_ERROR_UNINITIALIZED 2 -#define QUDA_MEMORY_DEVICE 0 -#define QUDA_MEMORY_PINNED 1 -#define QUDA_MEMORY_MAPPED 2 -#define QUDA_MEMORY_INVALID QUDA_INVALID_ENUM - #define QUDA_SU3_LINKS 0 #define QUDA_GENERAL_LINKS 1 #define QUDA_THREE_LINKS 2 diff --git a/include/gauge_field.h b/include/gauge_field.h index 7e484bd3a0..155f68b958 100644 --- a/include/gauge_field.h +++ b/include/gauge_field.h @@ -195,9 +195,8 @@ namespace quda { class GaugeField : public LatticeField { protected: - void *gauge; /** The gauge field allocation */ - void *gauge_h; /** Mapped-memory pointer when allocating on the host */ - void **gauge_qdp; /** Array of pointers to each subset (QDP order) */ + quda_ptr gauge; /** The gauge field allocation */ + array gauge_array; /** Array of pointers to each subset (e.g., QDP or QDPJITorder) */ size_t bytes; // bytes allocated per full field size_t phase_offset; // offset in bytes to gauge phases - useful to keep track of texture alignment size_t phase_bytes; // bytes needed to store the phases @@ -221,7 +220,7 @@ namespace quda { QudaFieldCreate create; // used to determine the type of field created - mutable void *ghost[2 * QUDA_MAX_DIM]; // stores the ghost zone of the gauge field (non-native fields only) + mutable array ghost; // stores the ghost zone of the gauge field (non-native fields only) mutable int ghostFace[QUDA_MAX_DIM]; // the size of each face @@ -273,6 +272,11 @@ namespace quda { */ void setTuningString(); + /** + @brief Initialize the padded region to 0 + */ + void zeroPad(); + public: GaugeField(const GaugeFieldParam ¶m); virtual ~GaugeField(); @@ -372,28 +376,55 @@ namespace quda { /** @brief Return base pointer to the gauge field allocation. - @tparam T Optional type to cast the pointer to. + @tparam T Optional type to cast the pointer to (default is void*). 
@return Base pointer to the gauge field allocation */ - template auto data() const + template + std::enable_if_t && !std::is_pointer_v::type>, T> data() const { - static_assert(std::is_pointer_v, "data() requires a pointer cast type"); - - using U = typename std::remove_pointer::type; - if constexpr (std::is_pointer_v) { - if (!is_pointer_array(order)) errorQuda("Dim-array ordered field requested but order is %d", order); - return reinterpret_cast(gauge_qdp); - } else { - if (is_pointer_array(order) && !std::is_same_v) errorQuda("Non dim-array ordered field requested but order is %d", order); - return reinterpret_cast(gauge); - } + if (is_pointer_array(order)) + errorQuda("Non dim-array ordered field requested but order is %d", order); + return reinterpret_cast(gauge.data()); + } + + /** + @brief Return base pointer to the gauge field allocation + specified by the array index. This is for geometry-array + ordered fields, e.g., QDP or QDPJIT. + + @tparam T Optional type to cast the pointer to (default is void*) + @param[in] d Dimension index when the allocation is an array type + @return Base pointer to the gauge field allocation + */ + template auto data(unsigned int d) const + { + static_assert(std::is_pointer_v && !std::is_pointer_v::type>, "data() requires a pointer cast type"); + if (d >= (unsigned)geometry) errorQuda("Invalid array index %d for geometry %d field", d, geometry); + if (!is_pointer_array(order)) errorQuda("Dim-array ordered field requested but order is %d", order); + return reinterpret_cast(gauge_array[d].data()); + } + + /** + @brief Return array of pointers to the per dimension gauge field allocation(s). + @tparam T Optional type to cast the pointer to (default is + void*). this is for geometry-array ordered fields, e.g., QDP + or QDPJIT. 
+ @return Array of pointers to the gauge field allocations + */ + template + std::enable_if_t && !std::is_pointer_v::type>, array> data_array() const + { + if (!is_pointer_array(order)) errorQuda("Dim-array ordered field requested but order is %d", order); + array u = {}; + for (auto d = 0; d < geometry; d++) u[d] = static_cast(gauge_array[d]); + return u; } virtual int full_dim(int d) const { return x[d]; } - auto Ghost() const { + auto& Ghost() const { if ( isNative() ) errorQuda("No ghost zone pointer for quda-native gauge fields"); - return (void * const *)ghost; + return ghost; } /** @@ -486,16 +517,8 @@ namespace quda { class cudaGaugeField : public GaugeField { - private: - - /** - @brief Initialize the padded region to 0 - */ - void zeroPad(); - public: cudaGaugeField(const GaugeFieldParam &); - virtual ~cudaGaugeField(); /** @brief Exchange the ghost and store store in the padded region @@ -621,8 +644,6 @@ namespace quda { @param[in] the host buffer to copy from. */ virtual void copy_from_buffer(void *buffer); - - void setGauge(void* _gauge); //only allowed when create== QUDA_REFERENCE_FIELD_CREATE }; class cpuGaugeField : public GaugeField { @@ -640,7 +661,6 @@ namespace quda { extended. */ cpuGaugeField(const GaugeFieldParam ¶m); - virtual ~cpuGaugeField(); /** @brief Exchange the ghost and store store in the padded region @@ -695,8 +715,6 @@ namespace quda { @param[in] the host buffer to copy from. */ virtual void copy_from_buffer(void *buffer); - - void setGauge(void** _gauge); //only allowed when create== QUDA_REFERENCE_FIELD_CREATE }; /** diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index 0fd3e944d5..3b9db8648b 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -356,7 +356,7 @@ namespace quda { scale_inv(static_cast(1.0)) { for (int d = 0; d < U.Geometry(); d++) - u[d] = gauge_ ? static_cast **>(gauge_)[d] : U.data *const *>()[d]; + u[d] = gauge_ ? 
static_cast **>(gauge_)[d] : U.data *>(d); resetScale(U.Scale()); } @@ -427,12 +427,12 @@ namespace quda { { for (int d=0; d<4; d++) { ghost[d] = ghost_ ? static_cast*>(ghost_[d]) : - static_cast*>(const_cast(U.Ghost()[d])); + static_cast*>(const_cast(U.Ghost()[d].data())); ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr : ghost_ ? static_cast*>(ghost_[d+4]) : - static_cast*>(const_cast(U.Ghost()[d+4])); + static_cast*>(const_cast(U.Ghost()[d+4].data())); ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); } @@ -548,12 +548,12 @@ namespace quda { { for (int d=0; d<4; d++) { ghost[d] = ghost_ ? static_cast*>(ghost_[d]) : - static_cast*>(const_cast(U.Ghost()[d])); + static_cast*>(const_cast(U.Ghost()[d].data())); ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr : ghost_ ? static_cast*>(ghost_[d+4]) : - static_cast*>(const_cast(U.Ghost()[d+4])); + static_cast*>(const_cast(U.Ghost()[d+4].data())); ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); } @@ -1753,8 +1753,8 @@ namespace quda { using store_t = Float; using real = typename mapper::type; using complex = complex; - Float *ghost[QUDA_MAX_DIM]; - int faceVolumeCB[QUDA_MAX_DIM]; + Float *ghost[QUDA_MAX_DIM] = {}; + int faceVolumeCB[QUDA_MAX_DIM] = {}; const int volumeCB; const int stride; const int geometry; @@ -1769,9 +1769,11 @@ namespace quda { if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone"); - for (int i = 0; i < 4; i++) { - ghost[i] = (ghost_) ? ghost_[i] : (Float *)(u.Ghost()[i]); - faceVolumeCB[i] = u.SurfaceCB(i) * u.Nface(); // face volume equals surface * depth + if (u.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD) { + for (int i = 0; i < 4; i++) { + ghost[i] = (ghost_) ? 
ghost_[i] : (Float *)(u.Ghost()[i].data()); + faceVolumeCB[i] = u.SurfaceCB(i) * u.Nface(); // face volume equals surface * depth + } } } @@ -1831,7 +1833,7 @@ namespace quda { QDPOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) : LegacyOrder(u, ghost_), volumeCB(u.VolumeCB()) { - for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : u.data()[i]; + for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : u.data(i); } __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const @@ -1877,7 +1879,7 @@ namespace quda { QDPJITOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) : LegacyOrder(u, ghost_), volumeCB(u.VolumeCB()) { - for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : u.data()[i]; + for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : u.data(i); } __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const diff --git a/include/lattice_field.h b/include/lattice_field.h index 1079da4553..005e09871b 100644 --- a/include/lattice_field.h +++ b/include/lattice_field.h @@ -72,7 +72,7 @@ namespace quda { QudaSiteSubset siteSubset = QUDA_INVALID_SITE_SUBSET; - QudaMemoryType mem_type = QUDA_MEMORY_DEVICE; + QudaMemoryType mem_type = QUDA_MEMORY_INVALID; /** The type of ghost exchange to be done with this field */ QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_PAD; @@ -104,7 +104,7 @@ namespace quda { nDim(nDim), pad(pad), siteSubset(QUDA_FULL_SITE_SUBSET), - mem_type(QUDA_MEMORY_DEVICE), + mem_type(location == QUDA_CUDA_FIELD_LOCATION ? 
QUDA_MEMORY_DEVICE : QUDA_MEMORY_HOST), ghostExchange(ghostExchange), scale(1.0) { @@ -128,7 +128,7 @@ namespace quda { nDim(4), pad(0), siteSubset(QUDA_FULL_SITE_SUBSET), - mem_type(QUDA_MEMORY_DEVICE), + mem_type(QUDA_MEMORY_HOST), ghostExchange(QUDA_GHOST_EXCHANGE_NO), scale(param.scale) { diff --git a/include/malloc_quda.h b/include/malloc_quda.h index 8df59bbf56..d1a7de9161 100644 --- a/include/malloc_quda.h +++ b/include/malloc_quda.h @@ -114,6 +114,9 @@ namespace quda { #define register_pinned(ptr, bytes) quda::register_pinned_(__func__, quda::file_name(__FILE__), __LINE__, ptr, bytes) #define unregister_pinned(size) quda::unregister_pinned_(__func__, quda::file_name(__FILE__), __LINE__, ptr) +#define quda_malloc(size) quda::quda_malloc_(__func__, quda::file_name(__FILE__), __LINE__, size) +#define quda_free(ptr) quda::quda_free_(__func__, quda::file_name(__FILE__), __LINE__, ptr) + namespace quda { namespace pool { @@ -169,3 +172,78 @@ namespace quda { #define pool_device_free(ptr) quda::pool::device_free_(__func__, __FILE__, __LINE__, ptr) #define pool_pinned_malloc(size) quda::pool::pinned_malloc_(__func__, __FILE__, __LINE__, size) #define pool_pinned_free(ptr) quda::pool::pinned_free_(__func__, __FILE__, __LINE__, ptr) + +namespace quda { + + /** + Object that stores a memory allocation with different views for + host or device. 
Depending on the nature of the underlying memory + type, both views may not be defined + + type defined views + QUDA_MEMORY_DEVICE device only + QUDA_MEMORY_DEVICE_PINNED device only + QUDA_MEMORY_HOST host only + QUDA_MEMORY_HOST_PINNED both + QUDA_MEMORY_MAPPED both (pinned to host) + QUDA_MEMORY_MANAGED both + */ + class quda_ptr { + QudaMemoryType type = QUDA_MEMORY_INVALID; + size_t size = 0; + bool pool = false; + void *device = nullptr; + void *host = nullptr; + + public: + quda_ptr() = default; + + quda_ptr &operator=(quda_ptr &&); + + /** + @brief Constructor for quda_ptr + @param[in] type The memory type of the allocation + @param[in] size The size of the allocation + @param[in] pool Whether the allocation should be in the memory pool (default is true) + */ + quda_ptr(QudaMemoryType type, size_t size, bool pool = true); + + /** + @brief Constructor for quda_ptr where we are wrapping a non-owned pointer + @param[in] ptr Raw base pointer + @param[in] type The memory type of the allocation + */ + quda_ptr(void *ptr, QudaMemoryType type); + + /** + @brief Destructor for the quda_ptr + */ + virtual ~quda_ptr(); + + /** + @return Returns true if allocation is visible to the device + */ + bool is_device() const; + + /** + @return Returns true if allocation is visible to the host + */ + bool is_host() const; + + /** + Return view of the pointer. For mapped memory we return the device view. 
+ */ + void *data() const; + + /** + Return the device view of the pointer + */ + void *data_device() const; + + /** + Return the host view of the pointer + */ + void *data_host() const; + }; + +} diff --git a/include/quda_api.h b/include/quda_api.h index 45c226ba19..ea475c43f6 100644 --- a/include/quda_api.h +++ b/include/quda_api.h @@ -3,6 +3,7 @@ #include #include #include +#include /** @file quda_api.h @@ -63,6 +64,14 @@ namespace quda void qudaMemcpyP2PAsync_(void *dst, const void *src, size_t count, const qudaStream_t &stream, const char *func, const char *file, const char *line); + /** + @brief Heterogenous memset function + @param[out] ptr Heterogeneous pointer + @param[in] value Value to set for each byte of specified memory + @param[in] count Size in bytes to set + */ + void qudaMemset_(quda_ptr &ptr, int value, size_t count, const char *func, const char *file, const char *line); + /** @brief Wrapper around cudaMemset or driver API equivalent @param[out] ptr Starting address pointer @@ -72,15 +81,15 @@ namespace quda void qudaMemset_(void *ptr, int value, size_t count, const char *func, const char *file, const char *line); /** - @brief Wrapper around cudaMemset2D or driver API equivalent - @param[out] ptr Starting address pointer + @brief Heterogenous memset2d function + @param[out] ptr Heterogeneous pointer + @param[in] offset Offset shift in bytes from the base pointer @param[in] Pitch in bytes @param[in] value Value to set for each byte of specified memory @param[in] width Width in bytes @param[in] height Height in bytes */ - void qudaMemset2D_(void *ptr, size_t pitch, int value, size_t width, size_t height, const char *func, - const char *file, const char *line); + void qudaMemset2D_(quda_ptr &ptr, size_t offset, size_t pitch, int value, size_t width, size_t height, const char *func, const char *file, const char *line); /** @brief Wrapper around cudaMemsetAsync or driver API equivalent @@ -224,8 +233,8 @@ namespace quda #define qudaMemset(ptr, value, 
count) \ ::quda::qudaMemset_(ptr, value, count, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__)) -#define qudaMemset2D(ptr, pitch, value, width, height) \ - ::quda::qudaMemset2D_(ptr, pitch, value, width, height, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__)) +#define qudaMemset2D(ptr, offset, pitch, value, width, height) \ + ::quda::qudaMemset2D_(ptr, offset, pitch, value, width, height, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__)) #define qudaMemsetAsync(ptr, value, count, stream) \ ::quda::qudaMemsetAsync_(ptr, value, count, stream, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__)) diff --git a/lib/coarse_op_preconditioned.cu b/lib/coarse_op_preconditioned.cu index 1a2dbda501..ab8ee88f7f 100644 --- a/lib/coarse_op_preconditioned.cu +++ b/lib/coarse_op_preconditioned.cu @@ -187,8 +187,7 @@ namespace quda } else if (X.Location() == QUDA_CPU_FIELD_LOCATION && X.Order() == QUDA_QDP_GAUGE_ORDER) { const cpuGaugeField *X_h = static_cast(&X); cpuGaugeField *Xinv_h = static_cast(&Xinv); - blas::flops += invert(Xinv_h->data()[0], X_h->data()[0], n, X_h->Volume(), - X.Precision(), X.Location()); + blas::flops += invert(Xinv_h->data(0), X_h->data(0), n, X_h->Volume(), X.Precision(), X.Location()); } else { errorQuda("Unsupported location=%d and order=%d", X.Location(), X.Order()); } diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp index 4bf7457584..56b6631832 100644 --- a/lib/color_spinor_field.cpp +++ b/lib/color_spinor_field.cpp @@ -908,9 +908,7 @@ namespace quda coarseParam.setPrecision(new_precision); // set where we allocate the field - coarseParam.mem_type = (new_mem_type != QUDA_MEMORY_INVALID) ? - new_mem_type : - (new_location == QUDA_CUDA_FIELD_LOCATION ? 
QUDA_MEMORY_DEVICE : QUDA_MEMORY_PINNED); + coarseParam.mem_type = new_mem_type; return new ColorSpinorField(coarseParam); } @@ -941,9 +939,7 @@ namespace quda } // set where we allocate the field - fineParam.mem_type = (new_mem_type != QUDA_MEMORY_INVALID) ? - new_mem_type : - (new_location == QUDA_CUDA_FIELD_LOCATION ? QUDA_MEMORY_DEVICE : QUDA_MEMORY_PINNED); + fineParam.mem_type = new_mem_type; return new ColorSpinorField(fineParam); } diff --git a/lib/cpu_gauge_field.cpp b/lib/cpu_gauge_field.cpp index f3063d5d32..604bf04c13 100644 --- a/lib/cpu_gauge_field.cpp +++ b/lib/cpu_gauge_field.cpp @@ -10,112 +10,16 @@ namespace quda { cpuGaugeField::cpuGaugeField(const GaugeFieldParam ¶m) : GaugeField(param) { - if (precision == QUDA_HALF_PRECISION) { - errorQuda("CPU fields do not support half precision"); - } - if (precision == QUDA_QUARTER_PRECISION) { - errorQuda("CPU fields do not support quarter precision"); - } - if (pad != 0) { - errorQuda("CPU fields do not support non-zero padding"); - } - if (reconstruct != QUDA_RECONSTRUCT_NO && reconstruct != QUDA_RECONSTRUCT_10) { - errorQuda("Reconstruction type %d not supported", reconstruct); - } - if (reconstruct == QUDA_RECONSTRUCT_10 && link_type != QUDA_ASQTAD_MOM_LINKS) { - errorQuda("10-reconstruction only supported with momentum links"); - } - - // compute the correct bytes size for these padded field orders - if (order == QUDA_TIFR_PADDED_GAUGE_ORDER) { - bytes = site_dim * (x[0] * x[1] * (x[2] + 4) * x[3]) * nInternal * precision; - } else if (order == QUDA_BQCD_GAUGE_ORDER) { - bytes = site_dim * (x[0] + 4) * (x[1] + 2) * (x[2] + 2) * (x[3] + 2) * nInternal * precision; - } else if (order == QUDA_MILC_SITE_GAUGE_ORDER) { - bytes = volume * site_size; - } - - if (order == QUDA_QDP_GAUGE_ORDER) { - gauge = safe_malloc(site_dim * sizeof(void *)); - size_t nbytes = volume * nInternal * precision; - gauge_qdp = reinterpret_cast(gauge); - for (int d = 0; d < site_dim; d++) { - if (create == QUDA_NULL_FIELD_CREATE 
|| create == QUDA_ZERO_FIELD_CREATE) { - gauge_qdp[d] = nbytes ? safe_malloc(nbytes) : nullptr; - if (create == QUDA_ZERO_FIELD_CREATE && nbytes) memset(gauge_qdp[d], 0, nbytes); - } else if (create == QUDA_REFERENCE_FIELD_CREATE) { - gauge_qdp[d] = ((void **)param.gauge)[d]; - } else { - errorQuda("Unsupported creation type %d", create); - } + // exchange the boundaries if a non-trivial field + if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD) + if (create == QUDA_REFERENCE_FIELD_CREATE && (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY)) { + exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL); } - } else if (order == QUDA_CPS_WILSON_GAUGE_ORDER || order == QUDA_MILC_GAUGE_ORDER || - order == QUDA_BQCD_GAUGE_ORDER || order == QUDA_TIFR_GAUGE_ORDER || - order == QUDA_TIFR_PADDED_GAUGE_ORDER || order == QUDA_MILC_SITE_GAUGE_ORDER) { - - if (order == QUDA_MILC_SITE_GAUGE_ORDER && create != QUDA_REFERENCE_FIELD_CREATE) { - errorQuda("MILC site gauge order only supported for reference fields"); - } - - if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) { - gauge = bytes ? 
(void **)safe_malloc(bytes) : nullptr; - if (create == QUDA_ZERO_FIELD_CREATE && bytes) memset(gauge, 0, bytes); - } else if (create == QUDA_REFERENCE_FIELD_CREATE) { - gauge = param.gauge; - } else { - errorQuda("Unsupported creation type %d", create); - } - - } else { - errorQuda("Unsupported gauge order type %d", order); - } - - // no need to exchange data if this is a momentum field - if (link_type != QUDA_ASQTAD_MOM_LINKS) { - // Ghost zone is always 2-dimensional - for (int i=0; iabs_max(); } - - cpuGaugeField::~cpuGaugeField() - { - if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) { - if (order == QUDA_QDP_GAUGE_ORDER) { - for (int d = 0; d < site_dim; d++) { - if (gauge_qdp[d]) host_free(gauge_qdp[d]); - } - } - if (gauge) host_free(gauge); - } else { // QUDA_REFERENCE_FIELD_CREATE - if (order == QUDA_QDP_GAUGE_ORDER){ - if (gauge) host_free(gauge); - } - } - - if (link_type != QUDA_ASQTAD_MOM_LINKS) { - for (int i=0; i(recv[d])+bytes[d], send[d], bytes[d]); memcpy(recv[d], static_cast(send[d])+bytes[d], bytes[d]); - } + } // inject back into the gauge field extractExtendedGaugeGhost(*this, d, R, recv, false); @@ -286,15 +196,15 @@ namespace quda { if (order == QUDA_QDP_GAUGE_ORDER) { for (int d=0; d 4 && ghostExchange == QUDA_GHOST_EXCHANGE_PAD && src.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace) for (int d=0; d 0) free_ghost_buffer(ghost_buffer, order, geometry); @@ -314,21 +224,12 @@ namespace quda { } } - void cpuGaugeField::setGauge(void **gauge_) - { - if(create != QUDA_REFERENCE_FIELD_CREATE) { - errorQuda("Setting gauge pointer is only allowed when create=" - "QUDA_REFERENCE_FIELD_CREATE type\n"); - } - gauge = gauge_; - } - void cpuGaugeField::copy_to_buffer(void *buffer) const { if (is_pointer_array(order)) { char *dst_buffer = reinterpret_cast(buffer); - for (int d = 0; d < geometry; d++) { - std::memcpy(&dst_buffer[d * bytes / geometry], data()[d], bytes / geometry); + for (int d = 0; d < site_dim; d++) { + 
std::memcpy(&dst_buffer[d * bytes / site_dim], gauge_array[d].data(), bytes / site_dim); } } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER @@ -343,8 +244,8 @@ namespace quda { { if (is_pointer_array(order)) { const char *dst_buffer = reinterpret_cast(buffer); - for (int d = 0; d < geometry; d++) { - std::memcpy(data()[d], &dst_buffer[d * bytes / geometry], Bytes() / geometry); + for (int d = 0; d < site_dim; d++) { + std::memcpy(gauge_array[d].data(), &dst_buffer[d * bytes / site_dim], bytes / site_dim); } } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER diff --git a/lib/cuda_gauge_field.cpp b/lib/cuda_gauge_field.cpp index bb0299d027..ae21213770 100644 --- a/lib/cuda_gauge_field.cpp +++ b/lib/cuda_gauge_field.cpp @@ -9,112 +9,11 @@ namespace quda { cudaGaugeField::cudaGaugeField(const GaugeFieldParam ¶m) : GaugeField(param) { - if ((order == QUDA_QDP_GAUGE_ORDER || order == QUDA_QDPJIT_GAUGE_ORDER) && - create != QUDA_REFERENCE_FIELD_CREATE) { - errorQuda("QDP ordering only supported for reference fields"); - } - - if (order == QUDA_QDP_GAUGE_ORDER || - order == QUDA_TIFR_GAUGE_ORDER || order == QUDA_TIFR_PADDED_GAUGE_ORDER || - order == QUDA_BQCD_GAUGE_ORDER || order == QUDA_CPS_WILSON_GAUGE_ORDER) - errorQuda("Field ordering %d presently disabled for this type", order); - -#ifdef MULTI_GPU - if (link_type != QUDA_ASQTAD_MOM_LINKS && - ghostExchange == QUDA_GHOST_EXCHANGE_PAD && - isNative()) { - bool pad_check = true; - for (int i=0; i(gauge) + 0 * (bytes / 2) + volumeCB * order * precision, pitch, 0, pad_bytes, - Npad); - qudaMemset2D(static_cast(gauge) + 1 * (bytes / 2) + volumeCB * order * precision, pitch, 0, pad_bytes, - Npad); - } - } - - cudaGaugeField::~cudaGaugeField() - { - if (create != QUDA_REFERENCE_FIELD_CREATE) { - 
switch(mem_type) { - case QUDA_MEMORY_DEVICE: - if (gauge) pool_device_free(gauge); - break; - case QUDA_MEMORY_MAPPED: - if (gauge_h) host_free(gauge_h); - break; - default: - errorQuda("Unsupported memory type %d", mem_type); - } - } - - if ( !isNative() ) { - for (int i=0; i()[d], src.Bytes() / geometry, qudaMemcpyDefault); + qudaMemcpy(((void **)buffer)[d], src.data(d), src.Bytes() / geometry, qudaMemcpyDefault); } } else { qudaMemcpy(buffer, src.data(), src.Bytes(), qudaMemcpyDefault); @@ -578,7 +468,7 @@ namespace quda { if (src.Order() > 4 && GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && src.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace) for (int d = 0; d < geometry; d++) - qudaMemcpy(ghost_buffer[d], src.Ghost()[d], ghost_bytes[d], qudaMemcpyDefault); + qudaMemcpy(ghost_buffer[d], src.Ghost()[d].data(), ghost_bytes[d], qudaMemcpyDefault); if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) { copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, buffer, nullptr, ghost_buffer); @@ -652,7 +542,7 @@ namespace quda { if (cpu.Order() == QUDA_QDP_GAUGE_ORDER) { for (int d = 0; d < geometry; d++) - qudaMemcpy((cpu.data())[d], ((void **)buffer)[d], cpu.Bytes() / geometry, qudaMemcpyDefault); + qudaMemcpy(cpu.data(d), ((void **)buffer)[d], cpu.Bytes() / geometry, qudaMemcpyDefault); } else { qudaMemcpy(cpu.data(), buffer, cpu.Bytes(), qudaMemcpyDefault); } @@ -660,7 +550,7 @@ namespace quda { if (cpu.Order() > 4 && GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && cpu.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace) for (int d = 0; d < geometry; d++) - qudaMemcpy(cpu.Ghost()[d], ghost_buffer[d], ghost_bytes[d], qudaMemcpyDefault); + qudaMemcpy(cpu.Ghost()[d].data(), ghost_buffer[d], ghost_bytes[d], qudaMemcpyDefault); free_gauge_buffer(buffer, cpu.Order(), cpu.Geometry()); if (nFace > 0) free_ghost_buffer(ghost_buffer, cpu.Order(), geometry); @@ -668,7 +558,7 @@ namespace quda { } else if 
(reorder_location() == QUDA_CPU_FIELD_LOCATION) { // do copy then host-side reorder void *buffer = pool_pinned_malloc(bytes); - qudaMemcpy(buffer, gauge, bytes, qudaMemcpyDefault); + qudaMemcpy(buffer, gauge.data(), bytes, qudaMemcpyDefault); if (cpu.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) { copyGenericGauge(cpu, *this, QUDA_CPU_FIELD_LOCATION, nullptr, buffer); diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp index ea17cb4610..caeddfa298 100644 --- a/lib/gauge_field.cpp +++ b/lib/gauge_field.cpp @@ -29,9 +29,8 @@ namespace quda { GaugeField::GaugeField(const GaugeFieldParam ¶m) : LatticeField(param), - gauge(nullptr), - gauge_h(nullptr), - gauge_qdp {}, + gauge(), + gauge_array {}, bytes(0), phase_offset(0), phase_bytes(0), @@ -88,43 +87,152 @@ namespace quda { errorQuda("Cannot request a 12/8 reconstruct type without SU(3) link type"); } - if (reconstruct == QUDA_RECONSTRUCT_9 || reconstruct == QUDA_RECONSTRUCT_13) { - // Need to adjust the phase alignment as well. - int half_phase_bytes - = (length / (2 * reconstruct)) * precision; // number of bytes needed to store phases for a single parity - int half_gauge_bytes = (length / 2) * precision - - half_phase_bytes; // number of bytes needed to store the gauge field for a single parity excluding the phases - // Adjust the alignments for the gauge and phase separately - half_phase_bytes = ((half_phase_bytes + (512-1))/512)*512; - half_gauge_bytes = ((half_gauge_bytes + (512-1))/512)*512; - - phase_offset = half_gauge_bytes; - phase_bytes = half_phase_bytes*2; - bytes = (half_gauge_bytes + half_phase_bytes)*2; + if (reconstruct == QUDA_RECONSTRUCT_10 && link_type != QUDA_ASQTAD_MOM_LINKS) { + errorQuda("10-reconstruction only supported with momentum links"); + } + + if (create != QUDA_NULL_FIELD_CREATE && create != QUDA_ZERO_FIELD_CREATE && create != QUDA_REFERENCE_FIELD_CREATE) { + errorQuda("ERROR: create type(%d) not supported yet\n", create); + } + + switch (geometry) { + case QUDA_SCALAR_GEOMETRY: 
site_dim = 1; break; + case QUDA_VECTOR_GEOMETRY: site_dim = nDim; break; + case QUDA_TENSOR_GEOMETRY: site_dim = nDim * (nDim - 1) / 2; break; + case QUDA_COARSE_GEOMETRY: site_dim = 2 * nDim; break; + case QUDA_KDINVERSE_GEOMETRY: site_dim = 1 << nDim; break; + default: errorQuda("Unknown geometry type %d", geometry); + } + + if (isNative()) { + if (reconstruct == QUDA_RECONSTRUCT_9 || reconstruct == QUDA_RECONSTRUCT_13) { + // Need to adjust the phase alignment as well. + int half_phase_bytes + = (length / (2 * reconstruct)) * precision; // bytes needed to store phases for a single parity + int half_gauge_bytes = (length / 2) * precision + - half_phase_bytes; // bytes needed to store the gauge field for a single parity excluding the phases + // Adjust the alignments for the gauge and phase separately + half_phase_bytes = ALIGNMENT_ADJUST(half_phase_bytes); + half_gauge_bytes = ALIGNMENT_ADJUST(half_gauge_bytes); + phase_offset = half_gauge_bytes; + phase_bytes = half_phase_bytes * 2; + bytes = (half_gauge_bytes + half_phase_bytes) * 2; + } else { + bytes = length * precision; + bytes = 2 * ALIGNMENT_ADJUST(bytes / 2); + } } else { - bytes = length * precision; - if (isNative()) bytes = 2*ALIGNMENT_ADJUST(bytes/2); + // compute the correct bytes size for these padded field orders + if (order == QUDA_TIFR_PADDED_GAUGE_ORDER) { + bytes = site_dim * (x[0] * x[1] * (x[2] + 4) * x[3]) * nInternal * precision; + } else if (order == QUDA_BQCD_GAUGE_ORDER) { + bytes = site_dim * (x[0] + 4) * (x[1] + 2) * (x[2] + 2) * (x[3] + 2) * nInternal * precision; + } else if (order == QUDA_MILC_SITE_GAUGE_ORDER) { + bytes = volume * site_size; + } else { + bytes = length * precision; + } } + total_bytes = bytes; - if (geometry == QUDA_SCALAR_GEOMETRY) - site_dim = 1; - else if (geometry == QUDA_VECTOR_GEOMETRY) - site_dim = nDim; - else if (geometry == QUDA_TENSOR_GEOMETRY) - site_dim = nDim * (nDim - 1) / 2; - else if (geometry == QUDA_COARSE_GEOMETRY) - site_dim = 2 * nDim; - 
else if (geometry == QUDA_KDINVERSE_GEOMETRY) - site_dim = 1 << nDim; - else - errorQuda("Unknown geometry type %d", geometry); + if (isNative() && ghostExchange == QUDA_GHOST_EXCHANGE_PAD) { + bool pad_check = true; + for (int i = 0; i < nDim; i++) { + // when we have coarse links we need to double the pad since we're storing forwards and backwards links + int minimum_pad = comm_dim_partitioned(i) ? nFace*surfaceCB[i] * (geometry == QUDA_COARSE_GEOMETRY ? 2 : 1) : 0; + if (pad < minimum_pad) pad_check = false; + if (!pad_check) errorQuda("GaugeField being constructed with insufficient padding in dim %d (%d < %d)", i, pad, minimum_pad); + } + } + + if (isNative()) { + if (create != QUDA_REFERENCE_FIELD_CREATE) { + gauge = std::move(quda_ptr(mem_type, bytes)); + if (create == QUDA_ZERO_FIELD_CREATE) qudaMemset(gauge, 0, bytes); + } else { + gauge = std::move(quda_ptr(param.gauge, mem_type)); + } + } else if (is_pointer_array(order)) { + + size_t nbytes = volume * nInternal * precision; + for (int d = 0; d < site_dim; d++) { + if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) { + gauge_array[d] = std::move(quda_ptr(mem_type, nbytes)); + if (create == QUDA_ZERO_FIELD_CREATE) qudaMemset(gauge_array[d], 0, nbytes); + } else if (create == QUDA_REFERENCE_FIELD_CREATE) { + gauge_array[d] = std::move(quda_ptr(static_cast(param.gauge)[d], mem_type)); + } else { + errorQuda("Unsupported creation type %d", create); + } + } + + } else if (order == QUDA_CPS_WILSON_GAUGE_ORDER || order == QUDA_MILC_GAUGE_ORDER || + order == QUDA_BQCD_GAUGE_ORDER || order == QUDA_TIFR_GAUGE_ORDER || + order == QUDA_TIFR_PADDED_GAUGE_ORDER || order == QUDA_MILC_SITE_GAUGE_ORDER) { + // does not support device + + if (order == QUDA_MILC_SITE_GAUGE_ORDER && create != QUDA_REFERENCE_FIELD_CREATE) { + errorQuda("MILC site gauge order only supported for reference fields"); + } + + if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) { + gauge = 
std::move(quda_ptr(mem_type, bytes)); + if (create == QUDA_ZERO_FIELD_CREATE) qudaMemset(gauge, 0, bytes); + } else if (create == QUDA_REFERENCE_FIELD_CREATE) { + gauge = std::move(quda_ptr(param.gauge, mem_type)); + } else { + errorQuda("Unsupported creation type %d", create); + } + + } else { + errorQuda("Unsupported gauge order type %d", order); + } + + if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD) { + if (!isNative()) { + for (int i=0; i() + parity * (bytes / 2) + volumeCB * order * precision, pitch, 0, pad_bytes, Npad); + } + } else { + for (int parity = 0; parity < 2; parity++) + for (int p = 0; p < Npad; p++) + memset(data() + parity * (bytes / 2) + (volumeCB + p * stride) * order * precision, 0, pad_bytes); + } + } +#endif } void GaugeField::setTuningString() { @@ -194,7 +302,8 @@ namespace quda { staggeredPhaseApplied = false; } - void GaugeField::exchange(void **ghost_link, void **link_sendbuf, QudaDirection dir) const { + void GaugeField::exchange(void **ghost_link, void **link_sendbuf, QudaDirection dir) const + { MsgHandle *mh_send[4]; MsgHandle *mh_recv[4]; size_t bytes[4]; @@ -219,16 +328,8 @@ namespace quda { if (no_comms_fill) memcpy(ghost_link[i], link_sendbuf[i], bytes[i]); } } - } else { // FIXME for CUDA field copy back to the CPU - for (int i=0; i(buffer); - } else { - backup_h = new char[bytes]; - memcpy(backup_h, gauge, bytes); + if (order == QUDA_QDP_GAUGE_ORDER) { + char **buffer = new char *[geometry]; + for (int d = 0; d < geometry; d++) { + buffer[d] = new char[bytes / geometry]; + qudaMemcpy(buffer[d], gauge_array[d].data(), bytes / geometry, qudaMemcpyDefault); } + backup_h = reinterpret_cast(buffer); + } else { + backup_h = new char[bytes]; + qudaMemcpy(backup_h, gauge.data(), bytes, qudaMemcpyDefault); } backed_up = true; @@ -490,21 +582,16 @@ namespace quda { { if (!backed_up) errorQuda("Cannot restore since not backed up"); - if (location == QUDA_CUDA_FIELD_LOCATION) { - qudaMemcpy(gauge, backup_h, bytes, qudaMemcpyDefault); - 
delete[] backup_h; - } else { - if (order == QUDA_QDP_GAUGE_ORDER) { - char **buffer = reinterpret_cast(backup_h); - for (int d = 0; d < geometry; d++) { - memcpy(gauge_qdp[d], buffer[d], bytes / geometry); - delete[] buffer[d]; - } - delete[] buffer; - } else { - memcpy(gauge, backup_h, bytes); - delete[] backup_h; + if (order == QUDA_QDP_GAUGE_ORDER) { + char **buffer = reinterpret_cast(backup_h); + for (int d = 0; d < geometry; d++) { + qudaMemcpy(gauge_array[d].data(), buffer[d], bytes / geometry, qudaMemcpyDefault); + delete[] buffer[d]; } + delete[] buffer; + } else { + qudaMemcpy(gauge.data(), backup_h, bytes, qudaMemcpyDefault); + delete[] backup_h; } backed_up = false; } diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 65464e3f6e..6a8cc64e54 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -2662,7 +2662,7 @@ deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile) //but if not sufficient device memory, then the user may choose mapped type of memory ritzParam.mem_type = eig_param.mem_type_ritz; } else { //host location - ritzParam.mem_type = QUDA_MEMORY_PINNED; + ritzParam.mem_type = QUDA_MEMORY_HOST_PINNED; } int ritzVolume = 1; diff --git a/lib/lattice_field.cpp b/lib/lattice_field.cpp index c657a89e3b..af108d17ff 100644 --- a/lib/lattice_field.cpp +++ b/lib/lattice_field.cpp @@ -183,8 +183,16 @@ namespace quda { // for 5-dimensional fields, we only communicate in the space-time dimensions nDimComms = nDim == 5 ? 4 : nDim; + // if the memory location isn't set, use field location to set it mem_type = param.mem_type; - + if (mem_type == QUDA_MEMORY_INVALID) { + mem_type = location == QUDA_CUDA_FIELD_LOCATION ? 
QUDA_MEMORY_DEVICE : QUDA_MEMORY_HOST; + logQuda(QUDA_DEBUG_VERBOSE, "setting default memory type mem_type %d\n", mem_type); + } else if (mem_type == QUDA_MEMORY_DEVICE && location == QUDA_CPU_FIELD_LOCATION) { + mem_type = QUDA_MEMORY_HOST; + } else if (mem_type == QUDA_MEMORY_HOST && location == QUDA_CUDA_FIELD_LOCATION) { + mem_type = QUDA_MEMORY_DEVICE; + } setTuningString(); } diff --git a/lib/targets/cuda/malloc.cpp b/lib/targets/cuda/malloc.cpp index 82ec226a6d..1a486b98e8 100644 --- a/lib/targets/cuda/malloc.cpp +++ b/lib/targets/cuda/malloc.cpp @@ -788,4 +788,147 @@ namespace quda } // namespace pool + + quda_ptr::quda_ptr(QudaMemoryType type, size_t size, bool pool) : + type(type), + size(size), + pool(pool) + { + if (pool && (type != QUDA_MEMORY_DEVICE && type != QUDA_MEMORY_HOST_PINNED && type != QUDA_MEMORY_HOST)) + errorQuda("Memory pool not available for memory type %d", type); + + if (size > 0) { + switch (type) { + case QUDA_MEMORY_DEVICE: + device = pool ? pool_device_malloc(size) : device_malloc(size); + break; + case QUDA_MEMORY_DEVICE_PINNED: + device = device_pinned_malloc(size); + break; + case QUDA_MEMORY_HOST: + host = safe_malloc(size); + break; + case QUDA_MEMORY_HOST_PINNED: + host = pool ? 
pool_pinned_malloc(size) : pinned_malloc(size); + break; + case QUDA_MEMORY_MAPPED: + host = mapped_malloc(size); + device = get_mapped_device_pointer(host); + break; + case QUDA_MEMORY_MANAGED: + host = managed_malloc(size); + device = host; + break; + default: errorQuda("Unknown memory type %d", type); + } + } + } + + quda_ptr::quda_ptr(void *ptr, QudaMemoryType type) : + type(type) + { + switch (type) { + case QUDA_MEMORY_DEVICE: + case QUDA_MEMORY_DEVICE_PINNED: + device = ptr; + host = nullptr; + break; + case QUDA_MEMORY_HOST: + case QUDA_MEMORY_HOST_PINNED: + device = nullptr; + host = ptr; + break; + case QUDA_MEMORY_MANAGED: + device = ptr; + host = ptr; + break; + default: errorQuda("Unsupported memory type %d", type); + } + } + + quda_ptr& quda_ptr::operator=(quda_ptr &&other) + { + if (&other != this) { + type = std::exchange(other.type, QUDA_MEMORY_INVALID); + size = std::exchange(other.size, 0); + pool = std::exchange(other.pool, false); + device = std::exchange(other.device, nullptr); + host = std::exchange(other.host, nullptr); + } + return *this; + } + + quda_ptr::~quda_ptr() + { + if (size > 0) { + switch (type) { + case QUDA_MEMORY_DEVICE: pool ? pool_device_free(device) : device_free(device); break; + case QUDA_MEMORY_DEVICE_PINNED: device_pinned_free(device); break; + case QUDA_MEMORY_HOST: host_free(host); break; + case QUDA_MEMORY_HOST_PINNED: pool ? 
pool_pinned_free(host) : host_free(host); break; + case QUDA_MEMORY_MAPPED: host_free(host); break; + default: errorQuda("Unknown memory type %d", type); + } + } + + device = nullptr; + host = nullptr; + } + + bool quda_ptr::is_device() const + { + switch (type) { + case QUDA_MEMORY_DEVICE: + case QUDA_MEMORY_DEVICE_PINNED: + case QUDA_MEMORY_MAPPED: + case QUDA_MEMORY_MANAGED: + return true; + default: return false; + } + } + + bool quda_ptr::is_host() const + { + switch (type) { + case QUDA_MEMORY_HOST: + case QUDA_MEMORY_HOST_PINNED: + case QUDA_MEMORY_MANAGED: + return true; + default: return false; + } + } + + void *quda_ptr::data() const + { + void *ptr = nullptr; + + switch (type) { + case QUDA_MEMORY_DEVICE: + case QUDA_MEMORY_DEVICE_PINNED: + case QUDA_MEMORY_MAPPED: + case QUDA_MEMORY_MANAGED: + ptr = device; + break; + case QUDA_MEMORY_HOST: + case QUDA_MEMORY_HOST_PINNED: + ptr = host; + break; + default: errorQuda("Unknown memory type %d", type); + } + + return ptr; + } + + void *quda_ptr::data_device() const + { + if (!device) errorQuda("Device view not defined"); + return device; + } + + void *quda_ptr::data_host() const + { + if (!host) errorQuda("Host view not defined"); + return host; + } + } // namespace quda diff --git a/lib/targets/cuda/quda_api.cpp b/lib/targets/cuda/quda_api.cpp index 1af28417d8..856aa44e2d 100644 --- a/lib/targets/cuda/quda_api.cpp +++ b/lib/targets/cuda/quda_api.cpp @@ -376,6 +376,16 @@ namespace quda QudaMem set(ptr, value, count, device::get_default_stream(), false, func, file, line); } + void qudaMemset_(quda_ptr &ptr, int value, size_t count, const char *func, const char *file, const char *line) + { + if (count == 0) return; + if (ptr.is_device()) { + QudaMem set(ptr.data(), value, count, device::get_default_stream(), false, func, file, line); + } else { + memset(ptr.data(), value, count); + } + } + void qudaMemsetAsync_(void *ptr, int value, size_t count, const qudaStream_t &stream, const char *func, const char *file, 
const char *line) { @@ -390,6 +400,17 @@ namespace quda set_runtime_error(error, __func__, func, file, line); } + void qudaMemset2D_(quda_ptr &ptr, size_t offset, size_t pitch, int value, size_t width, size_t height, const char *func, + const char *file, const char *line) + { + if (ptr.is_device()) { + cudaError_t error = cudaMemset2D(static_cast(ptr.data()) + offset, pitch, value, width, height); + set_runtime_error(error, __func__, func, file, line); + } else { + for (auto i = 0u; i < height; i++) memset(static_cast(ptr.data()) + offset + i * pitch, value, width); + } + } + void qudaMemset2DAsync_(void *ptr, size_t pitch, int value, size_t width, size_t height, const qudaStream_t &stream, const char *func, const char *file, const char *line) { diff --git a/tests/covdev_test.cpp b/tests/covdev_test.cpp index 50473151a1..a59f26bf28 100644 --- a/tests/covdev_test.cpp +++ b/tests/covdev_test.cpp @@ -34,8 +34,6 @@ std::unique_ptr tmp; void *links[4]; -void **ghostLink; - QudaParity parity = QUDA_EVEN_PARITY; GaugeCovDev *dirac; @@ -97,7 +95,6 @@ void init(int argc, char **argv) GaugeFieldParam cpuParam(gauge_param, links); cpuParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; cpuLink = new cpuGaugeField(cpuParam); - ghostLink = cpuLink->Ghost(); printfQuda("Links sending..."); loadGaugeQuda(links, &gauge_param); @@ -166,9 +163,9 @@ void covdevRef(int mu) // compare to dslash reference implementation printfQuda("Calculating reference implementation..."); #ifdef MULTI_GPU - mat_mg4dir(*spinorRef, links, ghostLink, *spinor, dagger, mu, inv_param.cpu_prec, gauge_param.cpu_prec); + mat_mg4dir(*spinorRef, *cpuLink, *spinor, dagger, mu, inv_param.cpu_prec, gauge_param.cpu_prec); #else - mat(spinorRef->V(), links, spinor->V(), dagger, mu, inv_param.cpu_prec, gauge_param.cpu_prec); + mat(spinorRef->V(), *cpuLink, spinor->V(), dagger, mu, inv_param.cpu_prec, gauge_param.cpu_prec); #endif printfQuda("done.\n"); } diff --git a/tests/gauge_force_test.cpp b/tests/gauge_force_test.cpp 
index 60dfa94cd0..64ba5c1048 100644 --- a/tests/gauge_force_test.cpp +++ b/tests/gauge_force_test.cpp @@ -111,12 +111,13 @@ void gauge_force_test(bool compute_force = true) } quda::GaugeFieldParam param(gauge_param); + param.location = QUDA_CPU_FIELD_LOCATION; param.create = QUDA_NULL_FIELD_CREATE; param.order = QUDA_QDP_GAUGE_ORDER; auto U_qdp = new quda::cpuGaugeField(param); // fills the gauge field with random numbers - createSiteLinkCPU(U_qdp->data(), gauge_param.cpu_prec, 0); + createSiteLinkCPU(*U_qdp, gauge_param.cpu_prec, 0); param.order = QUDA_MILC_GAUGE_ORDER; auto U_milc = new quda::cpuGaugeField(param); @@ -140,13 +141,17 @@ void gauge_force_test(bool compute_force = true) } void *mom = nullptr; void *sitelink = nullptr; + void *sitelink_array[QUDA_MAX_DIM]; + void *mom_array[QUDA_MAX_DIM]; if (gauge_order == QUDA_MILC_GAUGE_ORDER) { sitelink = U_milc->data(); mom = Mom_milc->data(); } else if (gauge_order == QUDA_QDP_GAUGE_ORDER) { - sitelink = U_qdp->data(); - mom = Mom_qdp->data(); + for (int d = 0; d < 4; d++) sitelink_array[d] = U_qdp->data(d); + sitelink = reinterpret_cast(sitelink_array); + for (int d = 0; d < 4; d++) mom_array[d] = Mom_qdp->data(d); + mom = reinterpret_cast(mom_array); } else { errorQuda("Unsupported gauge order %d", gauge_order); } @@ -183,7 +188,7 @@ void gauge_force_test(bool compute_force = true) void *refmom = Mom_ref_milc->data(); int *check_out = compute_force ? 
&force_check : &path_check; if (verify_results) { - gauge_force_reference(refmom, eb3, U_qdp->data(), gauge_param.cpu_prec, input_path_buf, length, + gauge_force_reference(refmom, eb3, *U_qdp, gauge_param.cpu_prec, input_path_buf, length, loop_coeff, num_paths, compute_force); *check_out = compare_floats(Mom_milc->data(), refmom, 4 * V * mom_site_size, getTolerance(cuda_prec), gauge_param.cpu_prec); diff --git a/tests/hisq_paths_force_test.cpp b/tests/hisq_paths_force_test.cpp index e19d874e31..58e5ba97b1 100644 --- a/tests/hisq_paths_force_test.cpp +++ b/tests/hisq_paths_force_test.cpp @@ -169,7 +169,7 @@ static void hisq_force_init() cpuGauge_ex = new cpuGaugeField(gParam_ex); if (gauge_order == QUDA_QDP_GAUGE_ORDER) { - createSiteLinkCPU(cpuGauge->data(), qudaGaugeParam.cpu_prec, 1); + createSiteLinkCPU(*cpuGauge, qudaGaugeParam.cpu_prec, 1); } else { errorQuda("Unsupported gauge order %d", gauge_order); } diff --git a/tests/hisq_unitarize_force_test.cpp b/tests/hisq_unitarize_force_test.cpp index f6d68a9553..7a9d19255c 100644 --- a/tests/hisq_unitarize_force_test.cpp +++ b/tests/hisq_unitarize_force_test.cpp @@ -26,7 +26,7 @@ quda::cpuGaugeField *cpuReference = NULL; static QudaGaugeParam gaugeParam; // Create a field of links that are not su3_matrices -void createNoisyLinkCPU(void *const *field, QudaPrecision prec, int seed) +void createNoisyLinkCPU(quda::GaugeField &field, QudaPrecision prec, int seed) { createSiteLinkCPU(field, prec, 0); @@ -34,10 +34,10 @@ void createNoisyLinkCPU(void *const *field, QudaPrecision prec, int seed) for (int dir = 0; dir < 4; ++dir) { for (int i = 0; i < V * 18; ++i) { if (prec == QUDA_DOUBLE_PRECISION) { - double *ptr = ((double **)field)[dir] + i; + double *ptr = field.data(dir) + i; *ptr += (rand() - RAND_MAX / 2.0) / (20.0 * RAND_MAX); } else if (prec == QUDA_SINGLE_PRECISION) { - float *ptr = ((float **)field)[dir] + i; + float *ptr = field.data(dir) + i; *ptr += (rand() - RAND_MAX / 2.0) / (20.0 * RAND_MAX); } } @@ -77,8 
+77,8 @@ static void hisq_force_init() seed += quda::comm_rank(); #endif - createNoisyLinkCPU(cpuFatLink->data(), gaugeParam.cpu_prec, seed); - createNoisyLinkCPU(cpuOprod->data(), gaugeParam.cpu_prec, seed + 1); + createNoisyLinkCPU(*cpuFatLink, gaugeParam.cpu_prec, seed); + createNoisyLinkCPU(*cpuOprod, gaugeParam.cpu_prec, seed + 1); gParam.location = QUDA_CUDA_FIELD_LOCATION; gParam.setPrecision(gaugeParam.cuda_prec, true); @@ -142,7 +142,7 @@ TEST(hisq_force_unitarize, verify) double accuracy = prec == QUDA_DOUBLE_PRECISION ? 1e-10 : 1e-5; for (int dir = 0; dir < 4; ++dir) { - res[dir] = compare_floats(cpuReference->data()[dir], cpuResult->data()[dir], + res[dir] = compare_floats(cpuReference->data(dir), cpuResult->data(dir), cpuReference->Volume() * gauge_site_size, accuracy, gaugeParam.cpu_prec); quda::comm_allreduce_int(res[dir]); diff --git a/tests/host_reference/covdev_reference.cpp b/tests/host_reference/covdev_reference.cpp index 081b19142c..a8c178af00 100644 --- a/tests/host_reference/covdev_reference.cpp +++ b/tests/host_reference/covdev_reference.cpp @@ -193,7 +193,7 @@ void covdevReference_mg4dir(sFloat *res, gFloat **link, gFloat **ghostLink, cons } // 4-d volume } -void covdev_dslash_mg4dir(ColorSpinorField &out, void **link, void **ghostLink, const ColorSpinorField &in, int oddBit, +void covdev_dslash_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int oddBit, int daggerBit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision) { QudaParity otherparity = QUDA_INVALID_PARITY; @@ -208,32 +208,38 @@ void covdev_dslash_mg4dir(ColorSpinorField &out, void **link, void **ghostLink, in.exchangeGhost(otherparity, nFace, daggerBit); + void *data[4] = {link.data(0), link.data(1), link.data(2), link.data(3)}; + void *ghostLink[4] = {link.Ghost()[0].data(), link.Ghost()[1].data(), link.Ghost()[2].data(), link.Ghost()[3].data()}; + if (sPrecision == QUDA_DOUBLE_PRECISION) { if (gPrecision == QUDA_DOUBLE_PRECISION) { - 
covdevReference_mg4dir((double *)out.V(), (double **)link, (double **)ghostLink, in, oddBit, daggerBit, mu); + covdevReference_mg4dir((double *)out.V(), reinterpret_cast(data), (double **)ghostLink, in, oddBit, daggerBit, mu); } else { - covdevReference_mg4dir((double *)out.V(), (float **)link, (float **)ghostLink, in, oddBit, daggerBit, mu); + covdevReference_mg4dir((double *)out.V(), reinterpret_cast(data), (float **)ghostLink, in, oddBit, daggerBit, mu); } } else { if (gPrecision == QUDA_DOUBLE_PRECISION) { - covdevReference_mg4dir((float *)out.V(), (double **)link, (double **)ghostLink, in, oddBit, daggerBit, mu); + covdevReference_mg4dir((float *)out.V(), reinterpret_cast(data), (double **)ghostLink, in, oddBit, daggerBit, mu); } else { - covdevReference_mg4dir((float *)out.V(), (float **)link, (float **)ghostLink, in, oddBit, daggerBit, mu); + covdevReference_mg4dir((float *)out.V(), reinterpret_cast(data), (float **)ghostLink, in, oddBit, daggerBit, mu); } } } template -void Mat_mg4dir(ColorSpinorField &out, gFloat **link, gFloat **ghostLink, const ColorSpinorField &in, int daggerBit, - int mu) +void Mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int daggerBit, int mu) { + void *data[4] = {link.data(0), link.data(1), link.data(2), link.data(3)}; + void *ghostLink[4] = {link.Ghost()[0].data(), link.Ghost()[1].data(), link.Ghost()[2].data(), link.Ghost()[3].data()}; + const int nFace = 1; { auto &inEven = in.Even(); auto &outOdd = out.Odd(); inEven.exchangeGhost(QUDA_EVEN_PARITY, nFace, daggerBit); - covdevReference_mg4dir(reinterpret_cast(outOdd.V()), link, ghostLink, in.Even(), 1, daggerBit, mu); + covdevReference_mg4dir(reinterpret_cast(outOdd.V()), reinterpret_cast(data), + reinterpret_cast(ghostLink), in.Even(), 1, daggerBit, mu); } { @@ -241,29 +247,30 @@ void Mat_mg4dir(ColorSpinorField &out, gFloat **link, gFloat **ghostLink, const auto &outEven = out.Even(); inOdd.exchangeGhost(QUDA_ODD_PARITY, nFace, daggerBit); 
- covdevReference_mg4dir(reinterpret_cast(outEven.V()), link, ghostLink, in.Odd(), 0, daggerBit, mu); + covdevReference_mg4dir(reinterpret_cast(outEven.V()), reinterpret_cast(data), + reinterpret_cast(ghostLink), in.Odd(), 0, daggerBit, mu); } } -void mat_mg4dir(ColorSpinorField &out, void **link, void **ghostLink, const ColorSpinorField &in, int dagger_bit, +void mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision) { if (sPrecision == QUDA_DOUBLE_PRECISION) { if (gPrecision == QUDA_DOUBLE_PRECISION) { - Mat_mg4dir(out, (double **)link, (double **)ghostLink, in, dagger_bit, mu); + Mat_mg4dir(out, link, in, dagger_bit, mu); } else { - Mat_mg4dir(out, (float **)link, (float **)ghostLink, in, dagger_bit, mu); + Mat_mg4dir(out, link, in, dagger_bit, mu); } } else { if (gPrecision == QUDA_DOUBLE_PRECISION) { - Mat_mg4dir(out, (double **)link, (double **)ghostLink, in, dagger_bit, mu); + Mat_mg4dir(out, link, in, dagger_bit, mu); } else { - Mat_mg4dir(out, (float **)link, (float **)ghostLink, in, dagger_bit, mu); + Mat_mg4dir(out, link, in, dagger_bit, mu); } } } -void matdagmat_mg4dir(ColorSpinorField &out, void **link, void **ghostLink, const ColorSpinorField &in, int dagger_bit, +void matdagmat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision, ColorSpinorField &tmp, QudaParity parity) { @@ -279,9 +286,9 @@ void matdagmat_mg4dir(ColorSpinorField &out, void **link, void **ghostLink, cons errorQuda("full parity not supported"); } - covdev_dslash_mg4dir(tmp, link, ghostLink, in, otherparity, dagger_bit, mu, sPrecision, gPrecision); + covdev_dslash_mg4dir(tmp, link, in, otherparity, dagger_bit, mu, sPrecision, gPrecision); - covdev_dslash_mg4dir(out, link, ghostLink, tmp, parity, dagger_bit, mu, sPrecision, gPrecision); + covdev_dslash_mg4dir(out, link, tmp, 
parity, dagger_bit, mu, sPrecision, gPrecision); } #endif diff --git a/tests/host_reference/covdev_reference.h b/tests/host_reference/covdev_reference.h index 19b1809cf0..c2045773ed 100644 --- a/tests/host_reference/covdev_reference.h +++ b/tests/host_reference/covdev_reference.h @@ -6,18 +6,18 @@ using namespace quda; void setDims(int *); -void covdev_dslash(void *res, void **link, void *spinorField, int oddBit, int daggerBit, int mu, +void covdev_dslash(void *res, const GaugeField &link, void *spinorField, int oddBit, int daggerBit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision); -void covdev_dslash_mg4dir(ColorSpinorField &out, void **link, void **ghostLink, const ColorSpinorField &in, int oddBit, +void covdev_dslash_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int oddBit, int daggerBit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision); -void mat(void *out, void **link, void *in, int daggerBit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision); +void mat(void *out, const GaugeField &link, void *in, int daggerBit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision); -void matdagmat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, +void matdagmat(void *out, const GaugeField &link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision, void *tmp, QudaParity parity); -void mat_mg4dir(ColorSpinorField &out, void **link, void **ghostLink, const ColorSpinorField &in, int daggerBit, int mu, +void mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int daggerBit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision); -void matdagmat_mg4dir(ColorSpinorField &out, void **link, void **ghostLink, const ColorSpinorField &in, int dagger_bit, +void matdagmat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int dagger_bit, int mu, QudaPrecision sPrecision, 
QudaPrecision gPrecision, ColorSpinorField &tmp, QudaParity parity); diff --git a/tests/host_reference/domain_wall_dslash_reference.cpp b/tests/host_reference/domain_wall_dslash_reference.cpp index 46d2620ce0..5fba06fe30 100644 --- a/tests/host_reference/domain_wall_dslash_reference.cpp +++ b/tests/host_reference/domain_wall_dslash_reference.cpp @@ -764,7 +764,7 @@ void dw_dslash(void *out, void *const *gauge, void *in, int oddBit, int daggerBi GaugeFieldParam gauge_field_param(gauge_param, (void **)gauge); gauge_field_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; cpuGaugeField cpu(gauge_field_param); - void **ghostGauge = (void **)cpu.Ghost(); + void *ghostGauge[4] = {cpu.Ghost()[0].data(), cpu.Ghost()[1].data(), cpu.Ghost()[2].data(), cpu.Ghost()[3].data()}; // Get spinor ghost fields // First wrap the input spinor into a ColorSpinorField @@ -831,7 +831,7 @@ void dslash_4_4d(void *out, void *const *gauge, void *in, int oddBit, int dagger GaugeFieldParam gauge_field_param(gauge_param, (void **)gauge); gauge_field_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; cpuGaugeField cpu(gauge_field_param); - void **ghostGauge = (void **)cpu.Ghost(); + void *ghostGauge[4] = {cpu.Ghost()[0].data(), cpu.Ghost()[1].data(), cpu.Ghost()[2].data(), cpu.Ghost()[3].data()}; // Get spinor ghost fields // First wrap the input spinor into a ColorSpinorField @@ -1357,7 +1357,7 @@ void mdw_mdagm_local(void *out, void *const *gauge, void *in, double _Complex *k QudaGaugeParam padded_gauge_param(gauge_param); for (int d = 0; d < 4; d++) { padded_gauge_param.X[d] += 2 * R[d]; } - auto padded_gauge_p = padded_gauge->data(); + void *padded_gauge_p[] = {padded_gauge->data(0), padded_gauge->data(1), padded_gauge->data(2), padded_gauge->data(3)}; // Extend these global variables then restore them int V5_old = V5; diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp index eeb1a56bd4..907a857824 100644 --- a/tests/host_reference/dslash_reference.cpp 
+++ b/tests/host_reference/dslash_reference.cpp @@ -743,10 +743,14 @@ double verifyWilsonTypeSingularVector(void *spinor_left, void *spinor_right, dou } double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in, - quda::ColorSpinorField &out, double mass, void *qdp_fatlink[], void *qdp_longlink[], - void **ghost_fatlink, void **ghost_longlink, QudaGaugeParam &gauge_param, - QudaInvertParam &inv_param, int shift) + quda::ColorSpinorField &out, double mass, quda::GaugeField &fatlink, quda::GaugeField &longlink, + QudaGaugeParam &gauge_param, QudaInvertParam &inv_param, int shift) { + void *qdp_fatlink[] = {fatlink.data(0), fatlink.data(1), fatlink.data(2), fatlink.data(3)}; + void *qdp_longlink[] = {longlink.data(0), longlink.data(1), longlink.data(2), longlink.data(3)}; + void *ghost_fatlink[] = {fatlink.Ghost()[0].data(), fatlink.Ghost()[1].data(), fatlink.Ghost()[2].data(), fatlink.Ghost()[3].data()}; + void *ghost_longlink[] = {longlink.Ghost()[0].data(), longlink.Ghost()[1].data(), longlink.Ghost()[2].data(), longlink.Ghost()[3].data()}; + switch (test_type) { case 0: // full parity solution, full parity system case 1: // full parity solution, solving EVEN EVEN prec system diff --git a/tests/host_reference/dslash_reference.h b/tests/host_reference/dslash_reference.h index f124e99f24..48188d9a1e 100644 --- a/tests/host_reference/dslash_reference.h +++ b/tests/host_reference/dslash_reference.h @@ -2,6 +2,7 @@ #include #include +#include template static inline void sum(Float *dst, Float *a, Float *b, int cnt) { @@ -107,9 +108,8 @@ double verifyWilsonTypeInversion(void *spinorOut, void **spinorOutMulti, void *s void *clover_inv); double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorField &ref, quda::ColorSpinorField &in, - quda::ColorSpinorField &out, double mass, void *qdp_fatlink[], void *qdp_longlink[], - void **ghost_fatlink, void **ghost_longlink, QudaGaugeParam &gauge_param, - 
QudaInvertParam &inv_param, int shift); + quda::ColorSpinorField &out, double mass, quda::GaugeField &fatlink, quda::GaugeField &longlink, + QudaGaugeParam &gauge_param, QudaInvertParam &inv_param, int shift); // i represents a "half index" into an even or odd "half lattice". // when oddBit={0,1} the half lattice is {even,odd}. diff --git a/tests/host_reference/gauge_force_reference.cpp b/tests/host_reference/gauge_force_reference.cpp index ffe8cc4494..4d12185981 100644 --- a/tests/host_reference/gauge_force_reference.cpp +++ b/tests/host_reference/gauge_force_reference.cpp @@ -405,9 +405,11 @@ void gauge_force_reference_dir(void *refMom, int dir, double eb3, void *const *s host_free(staple); } -void gauge_force_reference(void *refMom, double eb3, void *const *const sitelink, QudaPrecision prec, int ***path_dir, +void gauge_force_reference(void *refMom, double eb3, quda::GaugeField &u, QudaPrecision prec, int ***path_dir, int *length, void *loop_coeff, int num_paths, bool compute_force) { + void *sitelink[] = {u.data(0), u.data(1), u.data(2), u.data(3)}; + // created extended field quda::lat_dim_t R; for (int d = 0; d < 4; d++) R[d] = 2 * quda::comm_dim_partitioned(d); @@ -419,8 +421,9 @@ void gauge_force_reference(void *refMom, double eb3, void *const *const sitelink auto qdp_ex = quda::createExtendedGauge((void **)sitelink, param, R); lattice_t lat(*qdp_ex); + void *sitelink_ex[] = {qdp_ex->data(0), qdp_ex->data(1), qdp_ex->data(2), qdp_ex->data(3)}; for (int dir = 0; dir < 4; dir++) { - gauge_force_reference_dir(refMom, dir, eb3, sitelink, qdp_ex->data(), prec, path_dir[dir], length, + gauge_force_reference_dir(refMom, dir, eb3, sitelink, sitelink_ex, prec, path_dir[dir], length, loop_coeff, num_paths, lat, compute_force); } diff --git a/tests/host_reference/gauge_force_reference.h b/tests/host_reference/gauge_force_reference.h index 44106e5427..adaeaacdda 100644 --- a/tests/host_reference/gauge_force_reference.h +++ 
b/tests/host_reference/gauge_force_reference.h @@ -1,4 +1,6 @@ #pragma once -void gauge_force_reference(void *refMom, double eb3, void *const *sitelink, QudaPrecision prec, int ***path_dir, +#include + +void gauge_force_reference(void *refMom, double eb3, quda::GaugeField &u, QudaPrecision prec, int ***path_dir, int *length, void *loop_coeff, int num_paths, bool compute_force); diff --git a/tests/host_reference/wilson_dslash_reference.cpp b/tests/host_reference/wilson_dslash_reference.cpp index 3a766e570c..fbe5aa241d 100644 --- a/tests/host_reference/wilson_dslash_reference.cpp +++ b/tests/host_reference/wilson_dslash_reference.cpp @@ -191,8 +191,9 @@ void wil_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, Qu GaugeFieldParam gauge_field_param(gauge_param, gauge); gauge_field_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; + gauge_field_param.location = QUDA_CPU_FIELD_LOCATION; cpuGaugeField cpu(gauge_field_param); - void **ghostGauge = (void **)cpu.Ghost(); + void *ghostGauge[4] = {cpu.Ghost()[0].data(), cpu.Ghost()[1].data(), cpu.Ghost()[2].data(), cpu.Ghost()[3].data()}; // Get spinor ghost fields // First wrap the input spinor into a ColorSpinorField diff --git a/tests/multigrid_evolve_test.cpp b/tests/multigrid_evolve_test.cpp index 2fd02228a0..2436ddabf7 100644 --- a/tests/multigrid_evolve_test.cpp +++ b/tests/multigrid_evolve_test.cpp @@ -35,13 +35,13 @@ void setReunitarizationConsts() setUnitarizeLinksConstants(unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error); } -void CallUnitarizeLinks(quda::cudaGaugeField *cudaInGauge) +void CallUnitarizeLinks(quda::GaugeField &gauge) { using namespace quda; int *num_failures_dev = (int *)device_malloc(sizeof(int)); int num_failures; qudaMemset(num_failures_dev, 0, sizeof(int)); - unitarizeLinks(*cudaInGauge, num_failures_dev); + unitarizeLinks(gauge, num_failures_dev); qudaMemcpy(&num_failures, num_failures_dev, sizeof(int), qudaMemcpyDeviceToHost); if 
(num_failures > 0) errorQuda("Error in the unitarization\n"); @@ -219,12 +219,13 @@ int main(int argc, char **argv) { using namespace quda; GaugeFieldParam gParam(gauge_param); + gParam.location = QUDA_CUDA_FIELD_LOCATION; gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO; gParam.create = QUDA_NULL_FIELD_CREATE; gParam.link_type = gauge_param.type; gParam.reconstruct = gauge_param.reconstruct; gParam.setPrecision(gParam.Precision(), true); - cudaGaugeField *gauge = new cudaGaugeField(gParam); + cudaGaugeField gauge(gParam); int pad = 0; lat_dim_t y; @@ -239,15 +240,15 @@ int main(int argc, char **argv) gParamEx.siteSubset = QUDA_FULL_SITE_SUBSET; gParamEx.t_boundary = gParam.t_boundary; gParamEx.nFace = 1; - for (int dir = 0; dir < 4; ++dir) gParamEx.r[dir] = R[dir]; - cudaGaugeField *gaugeEx = new cudaGaugeField(gParamEx); + gParamEx.r = R; + cudaGaugeField gaugeEx(gParamEx); QudaGaugeObservableParam obs_param = newQudaGaugeObservableParam(); obs_param.compute_plaquette = QUDA_BOOLEAN_TRUE; obs_param.compute_qcharge = QUDA_BOOLEAN_TRUE; // CURAND random generator initialization - RNG *randstates = new RNG(*gauge, 1234); + RNG randstates(gauge, 1234); int nsteps = 10; int nhbsteps = 1; int novrsteps = 1; @@ -255,22 +256,22 @@ int main(int argc, char **argv) double beta_value = 6.2; if (link_recon != QUDA_RECONSTRUCT_8 && coldstart) - InitGaugeField(*gaugeEx); + InitGaugeField(gaugeEx); else - InitGaugeField(*gaugeEx, *randstates); + InitGaugeField(gaugeEx, randstates); // Reunitarization setup setReunitarizationConsts(); // Do a series of Heatbath updates - Monte(*gaugeEx, *randstates, beta_value, 100 * nhbsteps, 100 * novrsteps); + Monte(gaugeEx, randstates, beta_value, 100 * nhbsteps, 100 * novrsteps); // Copy into regular field - copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION); + copyExtendedGauge(gauge, gaugeEx, QUDA_CUDA_FIELD_LOCATION); // load the gauge field from gauge - gauge_param.gauge_order = gauge->Order(); + gauge_param.gauge_order = 
gauge.Order(); gauge_param.location = QUDA_CUDA_FIELD_LOCATION; - loadGaugeQuda(gauge->data(), &gauge_param); + loadGaugeQuda(gauge.data(), &gauge_param); gaugeObservablesQuda(&obs_param); // Demonstrate MG evolution on an evolving gauge field @@ -311,14 +312,14 @@ int main(int argc, char **argv) for (int step = 1; step < nsteps; ++step) { freeGaugeQuda(); - Monte(*gaugeEx, *randstates, beta_value, nhbsteps, novrsteps); + Monte(gaugeEx, randstates, beta_value, nhbsteps, novrsteps); // Reunitarize gauge links CallUnitarizeLinks(gaugeEx); // Copy into regular field - copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION); - loadGaugeQuda(gauge->data(), &gauge_param); + copyExtendedGauge(gauge, gaugeEx, QUDA_CUDA_FIELD_LOCATION); + loadGaugeQuda(gauge.data(), &gauge_param); if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { constructHostCloverField(clover, clover_inv, inv_param); @@ -382,9 +383,9 @@ int main(int argc, char **argv) CallUnitarizeLinks(gaugeEx); // copy into regular field - copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION); + copyExtendedGauge(gauge, gaugeEx, QUDA_CUDA_FIELD_LOCATION); - loadGaugeQuda(gauge->data(), &gauge_param); + loadGaugeQuda(gauge.data(), &gauge_param); // Recompute Gauge Observables gaugeObservablesQuda(&obs_param); @@ -447,12 +448,8 @@ int main(int argc, char **argv) // free the multigrid solver if (inv_multigrid) destroyMultigridQuda(mg_preconditioner); - delete gauge; - delete gaugeEx; // Release all temporary memory used for data exchange between GPUs in multi-GPU mode PGaugeExchangeFree(); - - delete randstates; } // stop the timer diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h index c6379c3342..3756eac6b8 100644 --- a/tests/staggered_dslash_test_utils.h +++ b/tests/staggered_dslash_test_utils.h @@ -71,8 +71,8 @@ struct StaggeredDslashTestWrapper { // In the HISQ case, we include building fat/long links in this unit test void 
*qdp_fatlink_cpu[4] = {nullptr, nullptr, nullptr, nullptr}; void *qdp_longlink_cpu[4] = {nullptr, nullptr, nullptr, nullptr}; - void *const *ghost_fatlink_cpu; - void *const *ghost_longlink_cpu; + void **ghost_fatlink_cpu; + void **ghost_longlink_cpu; QudaParity parity = QUDA_EVEN_PARITY; @@ -225,14 +225,14 @@ struct StaggeredDslashTestWrapper { GaugeFieldParam cpuFatParam(gauge_param, milc_fatlink_cpu); cpuFatParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; cpuFat = new cpuGaugeField(cpuFatParam); - ghost_fatlink_cpu = cpuFat->Ghost(); + for (int i = 0; i < 4; i++) ghost_fatlink_cpu[i] = cpuFat->Ghost()[i].data(); if (dslash_type == QUDA_ASQTAD_DSLASH) { gauge_param.type = QUDA_ASQTAD_LONG_LINKS; GaugeFieldParam cpuLongParam(gauge_param, milc_longlink_cpu); cpuLongParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; cpuLong = new cpuGaugeField(cpuLongParam); - ghost_longlink_cpu = cpuLong ? cpuLong->Ghost() : nullptr; + for (int i = 0; i < 4; i++) ghost_longlink_cpu[i] = cpuLong ? cpuLong->Ghost()[i].data() : nullptr; } #endif diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index 12b67cec13..87c574d974 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -364,8 +364,7 @@ int main(int argc, char **argv) for (int k = 0; k < Nsrc; k++) { if (verify_results) - verifyStaggeredInversion(*tmp, *ref, *in[k], *out[k], mass, qdp_fatlink, qdp_longlink, (void **)cpuFat->Ghost(), - (void **)cpuLong->Ghost(), gauge_param, inv_param, 0); + verifyStaggeredInversion(*tmp, *ref, *in[k], *out[k], mass, *cpuFat, *cpuLong, gauge_param, inv_param, 0); } break; @@ -405,8 +404,7 @@ int main(int argc, char **argv) for (int i = 0; i < multishift; i++) { printfQuda("%dth solution: mass=%f, ", i, masses[i]); - verifyStaggeredInversion(*tmp, *ref, *in[k], *qudaOutArray[i], masses[i], qdp_fatlink, qdp_longlink, - (void **)cpuFat->Ghost(), (void **)cpuLong->Ghost(), gauge_param, inv_param, i); + verifyStaggeredInversion(*tmp, *ref, *in[k], 
*qudaOutArray[i], masses[i], *cpuFat, *cpuLong, gauge_param, inv_param, i); } } diff --git a/tests/utils/host_utils.cpp b/tests/utils/host_utils.cpp index 4df1882297..f6d9dd4074 100644 --- a/tests/utils/host_utils.cpp +++ b/tests/utils/host_utils.cpp @@ -1376,6 +1376,12 @@ void createSiteLinkCPU(void *const *link, QudaPrecision precision, int phase) return; } +void createSiteLinkCPU(quda::GaugeField &u, QudaPrecision precision, int phase) +{ + void *link[] = {u.data(0), u.data(1), u.data(2), u.data(3)}; + createSiteLinkCPU(link, precision, phase); +} + template int compareLink(Float **linkA, Float **linkB, int len) { const int fail_check = 16; diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h index c6da599fcc..a804c46f80 100644 --- a/tests/utils/host_utils.h +++ b/tests/utils/host_utils.h @@ -155,6 +155,7 @@ int fullLatticeIndex(int dim[], int index, int oddBit); int getOddBit(int X); void createSiteLinkCPU(void *const *const link, QudaPrecision precision, int phase); +void createSiteLinkCPU(quda::GaugeField &u, QudaPrecision precision, int phase); void su3_construct(void *mat, QudaReconstructType reconstruct, QudaPrecision precision); void su3_reconstruct(void *mat, int dir, int ga_idx, QudaReconstructType reconstruct, QudaPrecision precision, QudaGaugeParam *param); diff --git a/tests/utils/misc.cpp b/tests/utils/misc.cpp index 61f5c9ef2e..6de4e900d0 100644 --- a/tests/utils/misc.cpp +++ b/tests/utils/misc.cpp @@ -339,7 +339,9 @@ const char *get_memory_type_str(QudaMemoryType type) switch (type) { case QUDA_MEMORY_DEVICE: s = "device"; break; - case QUDA_MEMORY_PINNED: s = "pinned"; break; + case QUDA_MEMORY_DEVICE_PINNED: s = "device_pinned"; break; + case QUDA_MEMORY_HOST: s = "host"; break; + case QUDA_MEMORY_HOST_PINNED: s = "host_pinned"; break; case QUDA_MEMORY_MAPPED: s = "mapped"; break; default: fprintf(stderr, "Error: invalid memory type\n"); exit(1); } From 8e0207ef56c977cc2f3ec95684bfabdd9e1961be Mon Sep 17 00:00:00 2001 From: 
maddyscientist Date: Thu, 11 May 2023 09:36:46 -0700 Subject: [PATCH 04/99] Move gauge field exchange functions to GaugeField from cpu/cuda children --- include/gauge_field.h | 212 ++++++----------- lib/cpu_gauge_field.cpp | 160 ------------- lib/cuda_gauge_field.cpp | 346 --------------------------- lib/gauge_field.cpp | 488 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 559 insertions(+), 647 deletions(-) diff --git a/include/gauge_field.h b/include/gauge_field.h index 0d70441f3f..cfc38855f4 100644 --- a/include/gauge_field.h +++ b/include/gauge_field.h @@ -273,8 +273,64 @@ namespace quda { GaugeField(const GaugeFieldParam &param); virtual ~GaugeField(); - virtual void exchangeGhost(QudaLinkDirection = QUDA_LINK_BACKWARDS) = 0; - virtual void injectGhost(QudaLinkDirection = QUDA_LINK_BACKWARDS) = 0; + /** + @brief Create the communication handlers and buffers + @param[in] R The thickness of the extended region in each dimension + @param[in] no_comms_fill Do local exchange to fill out the extended + region in non-partitioned dimensions + @param[in] bidir Whether to allocate communication buffers to + allow for simultaneous bi-directional exchange. If false, then + the forwards and backwards buffers will alias (saving memory). 
+ */ + void createComms(const lat_dim_t &R, bool no_comms_fill, bool bidir = true); + + /** + @brief Allocate the ghost buffers + @param[in] R The thickness of the extended region in each dimension + @param[in] no_comms_fill Do local exchange to fill out the extended + region in non-partitioned dimensions + @param[in] bidir Is this a bi-directional exchange - if not + then we alias the forwards and backwards offsets + */ + void allocateGhostBuffer(const lat_dim_t &R, bool no_comms_fill, bool bidir = true) const; + + /** + @brief Start the receive communicators + @param[in] dim The communication dimension + @param[in] dir The communication direction (0=backwards, 1=forwards) + */ + void recvStart(int dim, int dir); + + /** + @brief Start the sending communicators + @param[in] dim The communication dimension + @param[in] dir The communication direction (0=backwards, 1=forwards) + @param[in] stream_p Pointer to CUDA stream to post the + communication in (if 0, then use null stream) + */ + void sendStart(int dim, int dir, const qudaStream_t &stream_p); + + /** + @brief Wait for communication to complete + @param[in] dim The communication dimension + @param[in] dir The communication direction (0=backwards, 1=forwards) + */ + void commsComplete(int dim, int dir); + + /** + @brief Exchange the ghost and store in the padded region + @param[in] link_direction Which links are we exchanging: this + flag only applies to bi-directional coarse-link fields + */ + void exchangeGhost(QudaLinkDirection link_direction = QUDA_LINK_BACKWARDS); + + /** + @brief The opposite of exchangeGhost: take the ghost zone on x, + send to node x-1, and inject back into the field + @param[in] link_direction Which links are we injecting: this + flag only applies to bi-directional coarse-link fields + */ + void injectGhost(QudaLinkDirection link_direction = QUDA_LINK_BACKWARDS); size_t Length() const { return length; } int Ncolor() const { return nColor; } @@ -323,7 +379,7 @@ namespace quda { 
@param no_comms_fill Do local exchange to fill out the extended region in non-partitioned dimensions */ - virtual void exchangeExtendedGhost(const lat_dim_t &R, bool no_comms_fill = false) = 0; + void exchangeExtendedGhost(const lat_dim_t &R, bool no_comms_fill = false); /** @brief This routine will populate the border / halo region @@ -334,7 +390,7 @@ namespace quda { @param no_comms_fill Do local exchange to fill out the extended region in non-partitioned dimensions */ - virtual void exchangeExtendedGhost(const lat_dim_t &R, TimeProfile &profile, bool no_comms_fill = false) = 0; + void exchangeExtendedGhost(const lat_dim_t &R, TimeProfile &profile, bool no_comms_fill = false); void checkField(const LatticeField &) const; @@ -505,91 +561,25 @@ namespace quda { @brief Restores the GaugeField */ void restore() const; - }; - - class cudaGaugeField : public GaugeField { - - public: - cudaGaugeField(const GaugeFieldParam &); - - /** - @brief Exchange the ghost and store store in the padded region - @param[in] link_direction Which links are we exchanging: this - flag only applies to bi-directional coarse-link fields - */ - void exchangeGhost(QudaLinkDirection link_direction = QUDA_LINK_BACKWARDS); - - /** - @brief The opposite of exchangeGhost: take the ghost zone on x, - send to node x-1, and inject back into the field - @param[in] link_direction Which links are we injecting: this - flag only applies to bi-directional coarse-link fields - */ - void injectGhost(QudaLinkDirection link_direction = QUDA_LINK_BACKWARDS); - - /** - @brief Create the communication handlers and buffers - @param[in] R The thickness of the extended region in each dimension - @param[in] no_comms_fill Do local exchange to fill out the extended - region in non-partitioned dimensions - @param[in] bidir Whether to allocate communication buffers to - allow for simultaneous bi-directional exchange. If false, then - the forwards and backwards buffers will alias (saving memory). 
- */ - void createComms(const lat_dim_t &R, bool no_comms_fill, bool bidir = true); - - /** - @brief Allocate the ghost buffers - @param[in] R The thickness of the extended region in each dimension - @param[in] no_comms_fill Do local exchange to fill out the extended - @param[in] bidir Is this a bi-directional exchange - if not - then we alias the fowards and backwards offsetss - region in non-partitioned dimensions - */ - void allocateGhostBuffer(const lat_dim_t &R, bool no_comms_fill, bool bidir = true) const; /** - @brief Start the receive communicators - @param[in] dim The communication dimension - @param[in] dir The communication direction (0=backwards, 1=forwards) - */ - void recvStart(int dim, int dir); - - /** - @brief Start the sending communicators - @param[in] dim The communication dimension - @param[in] dir The communication direction (0=backwards, 1=forwards) - @param[in] stream_p Pointer to CUDA stream to post the - communication in (if 0, then use null stream) + @brief Copy all contents of the field to a host buffer. + @param[in] the host buffer to copy to. */ - void sendStart(int dim, int dir, const qudaStream_t &stream_p); + void copy_to_buffer(void *buffer) const; /** - @brief Wait for communication to complete - @param[in] dim The communication dimension - @param[in] dir The communication direction (0=backwards, 1=forwards) + @brief Copy all contents of the field from a host buffer to this field. + @param[in] the host buffer to copy from. */ - void commsComplete(int dim, int dir); + void copy_from_buffer(void *buffer); + }; - /** - @brief This does routine will populate the border / halo region of a - gauge field that has been created using copyExtendedGauge. 
- @param R The thickness of the extended region in each dimension - @param no_comms_fill Do local exchange to fill out the extended - region in non-partitioned dimensions - */ - void exchangeExtendedGhost(const lat_dim_t &R, bool no_comms_fill = false); + class cudaGaugeField : public GaugeField + { - /** - @brief This does routine will populate the border / halo region - of a gauge field that has been created using copyExtendedGauge. - Overloaded variant that will start and stop a comms profile. - @param R The thickness of the extended region in each dimension - @param profile TimeProfile intance which will record the time taken - @param no_comms_fill Do local exchange to fill out the extended - region in non-partitioned dimensions - */ - void exchangeExtendedGhost(const lat_dim_t &R, TimeProfile &profile, bool no_comms_fill = false); + public: + cudaGaugeField(const GaugeFieldParam &); /** * Generic gauge field copy @@ -624,18 +614,6 @@ namespace quda { @param[in] profile Time profile to record the transfer */ void saveCPUField(cpuGaugeField &cpu, TimeProfile &profile) const; - - /** - @brief Copy all contents of the field to a host buffer. - @param[in] the host buffer to copy to. - */ - virtual void copy_to_buffer(void *buffer) const; - - /** - @brief Copy all contents of the field from a host buffer to this field. - @param[in] the host buffer to copy from. 
- */ - virtual void copy_from_buffer(void *buffer); }; class cpuGaugeField : public GaugeField { @@ -654,59 +632,11 @@ namespace quda { */ cpuGaugeField(const GaugeFieldParam &param); - /** - @brief Exchange the ghost and store store in the padded region - @param[in] link_direction Which links are we extracting: this - flag only applies to bi-directional coarse-link fields - */ - void exchangeGhost(QudaLinkDirection link_direction = QUDA_LINK_BACKWARDS); - - /** - @brief The opposite of exchangeGhost: take the ghost zone on x, - send to node x-1, and inject back into the field - @param[in] link_direction Which links are we injecting: this - flag only applies to bi-directional coarse-link fields - */ - void injectGhost(QudaLinkDirection link_direction = QUDA_LINK_BACKWARDS); - - /** - @brief This does routine will populate the border / halo region of a - gauge field that has been created using copyExtendedGauge. - - @param R The thickness of the extended region in each dimension - @param no_comms_fill Do local exchange to fill out the extended - region in non-partitioned dimenions - */ - void exchangeExtendedGhost(const lat_dim_t &R, bool no_comms_fill = false); - - /** - @brief This does routine will populate the border / halo region - of a gauge field that has been created using copyExtendedGauge. - Overloaded variant that will start and stop a comms profile. - @param R The thickness of the extended region in each dimension - @param profile TimeProfile intance which will record the time taken - @param no_comms_fill Do local exchange to fill out the extended - region in non-partitioned dimensions - */ - void exchangeExtendedGhost(const lat_dim_t &R, TimeProfile &profile, bool no_comms_fill = false); - /** * Generic gauge field copy * @param[in] src Source from which we are copying */ void copy(const GaugeField &src); - - /** - @brief Copy all contents of the field to a host buffer. - @param[in] the host buffer to copy to. 
- */ - virtual void copy_to_buffer(void *buffer) const; - - /** - @brief Copy all contents of the field from a host buffer to this field. - @param[in] the host buffer to copy from. - */ - virtual void copy_from_buffer(void *buffer); }; /** diff --git a/lib/cpu_gauge_field.cpp b/lib/cpu_gauge_field.cpp index 604bf04c13..a6eb000b58 100644 --- a/lib/cpu_gauge_field.cpp +++ b/lib/cpu_gauge_field.cpp @@ -20,134 +20,6 @@ namespace quda { if (param.compute_fat_link_max) fat_link_max = this->abs_max(); } - // This does the exchange of the gauge field ghost zone and places it - // into the ghost array. - void cpuGaugeField::exchangeGhost(QudaLinkDirection link_direction) { - if (geometry != QUDA_VECTOR_GEOMETRY && geometry != QUDA_COARSE_GEOMETRY) - errorQuda("Cannot exchange for %d geometry gauge field", geometry); - - if ( (link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == QUDA_LINK_FORWARDS) && geometry != QUDA_COARSE_GEOMETRY) - errorQuda("Cannot request exchange of forward links on non-coarse geometry"); - - void *send[2 * QUDA_MAX_DIM]; - for (int d=0; d(recv[d])+bytes[d], send[d], bytes[d]); - memcpy(recv[d], static_cast(send[d])+bytes[d], bytes[d]); - } - - // inject back into the gauge field - extractExtendedGaugeGhost(*this, d, R, recv, false); - } - - for (int d=0; d(buffer); - for (int d = 0; d < site_dim; d++) { - std::memcpy(&dst_buffer[d * bytes / site_dim], gauge_array[d].data(), bytes / site_dim); - } - } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER - || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER - || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) { - std::memcpy(buffer, data(), Bytes()); - } else { - errorQuda("Unsupported order = %d", Order()); - } - } - - void cpuGaugeField::copy_from_buffer(void *buffer) - { - if (is_pointer_array(order)) { - const char *dst_buffer = reinterpret_cast(buffer); - for (int d = 0; d < site_dim; d++) { - 
std::memcpy(gauge_array[d].data(), &dst_buffer[d * bytes / site_dim], bytes / site_dim); - } - } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER - || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER - || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) { - std::memcpy(data(), buffer, Bytes()); - } else { - errorQuda("Unsupported order = %d", Order()); - } - } - } // namespace quda diff --git a/lib/cuda_gauge_field.cpp b/lib/cuda_gauge_field.cpp index ae21213770..3e5e60acdc 100644 --- a/lib/cuda_gauge_field.cpp +++ b/lib/cuda_gauge_field.cpp @@ -16,345 +16,6 @@ namespace quda { } } - // This does the exchange of the forwards boundary gauge field ghost zone and places - // it into the ghost array of the next node - void cudaGaugeField::exchangeGhost(QudaLinkDirection link_direction) { - - if (ghostExchange != QUDA_GHOST_EXCHANGE_PAD) errorQuda("Cannot call exchangeGhost with ghostExchange=%d", ghostExchange); - if (geometry != QUDA_VECTOR_GEOMETRY && geometry != QUDA_COARSE_GEOMETRY) errorQuda("Invalid geometry=%d", geometry); - if ( (link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == QUDA_LINK_FORWARDS) && geometry != QUDA_COARSE_GEOMETRY) - errorQuda("Cannot request exchange of forward links on non-coarse geometry"); - if (nFace == 0) errorQuda("nFace = 0"); - - const int dir = 1; // sending forwards only - const lat_dim_t R = {nFace, nFace, nFace, nFace}; - const bool no_comms_fill = true; // dslash kernels presently require this - const bool bidir = false; // communication is only ever done in one direction at once - createComms(R, true, bidir); // always need to allocate space for non-partitioned dimension for copyGenericGauge - - // loop over backwards and forwards links - const QudaLinkDirection directions[] = {QUDA_LINK_BACKWARDS, QUDA_LINK_FORWARDS}; - for (int link_dir = 0; link_dir<2; link_dir++) { - if (!(link_direction == QUDA_LINK_BIDIRECTIONAL || 
link_direction == directions[link_dir])) continue; - - void *send_d[2*QUDA_MAX_DIM] = { }; - void *recv_d[2*QUDA_MAX_DIM] = { }; - - size_t offset = 0; - for (int d=0; d(ghost_recv_buffer_d[bufferIndex]) + offset; - if (bidir) offset += ghost_face_bytes_aligned[d]; - send_d[d] = static_cast(ghost_send_buffer_d[bufferIndex]) + offset; - offset += ghost_face_bytes_aligned[d]; - } - - extractGaugeGhost(*this, send_d, true, link_dir*nDim); // get the links into contiguous buffers - qudaDeviceSynchronize(); // synchronize before issuing mem copies in different streams - could replace with event post and wait - - // issue receive preposts and host-to-device copies if needed - for (int dim=0; dim(ghost_send_buffer_d[bufferIndex]) + offset; - if (bidir) offset += ghost_face_bytes_aligned[d]; - // receive from forwards is the second half of each ghost_recv_buffer - recv_d[d] = static_cast(ghost_recv_buffer_d[bufferIndex]) + offset; - offset += ghost_face_bytes_aligned[d]; - } - - if (isNative()) { // copy from padded region in gauge field into send buffer - copyGenericGauge(*this, *this, QUDA_CUDA_FIELD_LOCATION, 0, 0, send_d, 0, 1 + 2*link_dir); - } else { // copy from receive buffer into ghost array - for (int dim = 0; dim < nDim; dim++) - qudaMemcpy(send_d[dim], ghost[dim + link_dir * nDim].data(), ghost_face_bytes[dim], qudaMemcpyDeviceToDevice); - } - qudaDeviceSynchronize(); // need to synchronize before issueing copies in different streams - could replace with event post and wait - - // issue receive preposts and host-to-device copies if needed - for (int dim=0; dim(ghost_remote_send_buffer_d[bufferIndex][dim][dir]) + ghost_offset[dim][(dir + 1) % 2]; - - qudaMemcpyP2PAsync(ghost_dst, my_face_dim_dir_d[bufferIndex][dim][dir], ghost_face_bytes[dim], stream); - - // record the event - qudaEventRecord(ipcCopyEvent[bufferIndex][dim][dir], stream); - // send to the neighboring processor - comm_start(mh_send_p2p[bufferIndex][dim][dir]); - } - } - - void 
cudaGaugeField::commsComplete(int dim, int dir) - { - if (!comm_dim_partitioned(dim)) return; - - if (comm_peer2peer_enabled(1 - dir, dim)) { - comm_wait(mh_recv_p2p[bufferIndex][dim][1 - dir]); - qudaEventSynchronize(ipcRemoteCopyEvent[bufferIndex][dim][1 - dir]); - } else if (comm_gdr_enabled()) { - comm_wait(mh_recv_rdma[bufferIndex][dim][1 - dir]); - } else { - comm_wait(mh_recv[bufferIndex][dim][1 - dir]); - } - - if (comm_peer2peer_enabled(dir, dim)) { - comm_wait(mh_send_p2p[bufferIndex][dim][dir]); - qudaEventSynchronize(ipcCopyEvent[bufferIndex][dim][dir]); - } else if (comm_gdr_enabled()) { - comm_wait(mh_send_rdma[bufferIndex][dim][dir]); - } else { - comm_wait(mh_send[bufferIndex][dim][dir]); - } - } - - void cudaGaugeField::exchangeExtendedGhost(const lat_dim_t &R, bool no_comms_fill) - { - const int b = bufferIndex; - void *send_d[QUDA_MAX_DIM], *recv_d[QUDA_MAX_DIM]; - - createComms(R, no_comms_fill); - - size_t offset = 0; - for (int dim=0; dim(ghost_send_buffer_d[b]) + offset; - recv_d[dim] = static_cast(ghost_recv_buffer_d[b]) + offset; - - // silence cuda-memcheck initcheck errors that arise since we - // have an oversized ghost buffer when doing the extended exchange - qudaMemsetAsync(send_d[dim], 0, 2 * ghost_face_bytes_aligned[dim], device::get_default_stream()); - offset += 2 * ghost_face_bytes_aligned[dim]; // factor of two from fwd/back - } - - for (int dim=0; dim(ghost_remote_send_buffer_d[bufferIndex][dim][dir]) + ghost_offset[dim][(dir + 1) % 2]; + + qudaMemcpyP2PAsync(ghost_dst, my_face_dim_dir_d[bufferIndex][dim][dir], ghost_face_bytes[dim], stream); + + // record the event + qudaEventRecord(ipcCopyEvent[bufferIndex][dim][dir], stream); + // send to the neighboring processor + comm_start(mh_send_p2p[bufferIndex][dim][dir]); + } + } + + void GaugeField::commsComplete(int dim, int dir) + { + if (!comm_dim_partitioned(dim)) return; + + if (comm_peer2peer_enabled(1 - dir, dim)) { + comm_wait(mh_recv_p2p[bufferIndex][dim][1 - dir]); + 
qudaEventSynchronize(ipcRemoteCopyEvent[bufferIndex][dim][1 - dir]); + } else if (comm_gdr_enabled()) { + comm_wait(mh_recv_rdma[bufferIndex][dim][1 - dir]); + } else { + comm_wait(mh_recv[bufferIndex][dim][1 - dir]); + } + + if (comm_peer2peer_enabled(dir, dim)) { + comm_wait(mh_send_p2p[bufferIndex][dim][dir]); + qudaEventSynchronize(ipcCopyEvent[bufferIndex][dim][dir]); + } else if (comm_gdr_enabled()) { + comm_wait(mh_send_rdma[bufferIndex][dim][dir]); + } else { + comm_wait(mh_send[bufferIndex][dim][dir]); + } + } + + // This does the exchange of the forwards boundary gauge field ghost zone and places + // it into the ghost array of the next node + void GaugeField::exchangeGhost(QudaLinkDirection link_direction) + { + if (ghostExchange != QUDA_GHOST_EXCHANGE_PAD) + errorQuda("Cannot call exchangeGhost with ghostExchange=%d", ghostExchange); + if (geometry != QUDA_VECTOR_GEOMETRY && geometry != QUDA_COARSE_GEOMETRY) + errorQuda("Invalid geometry=%d", geometry); + if ((link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == QUDA_LINK_FORWARDS) + && geometry != QUDA_COARSE_GEOMETRY) + errorQuda("Cannot request exchange of forward links on non-coarse geometry"); + if (nFace == 0) errorQuda("nFace = 0"); + + if (location == QUDA_CUDA_FIELD_LOCATION) { + const int dir = 1; // sending forwards only + const lat_dim_t R = {nFace, nFace, nFace, nFace}; + const bool no_comms_fill = true; // dslash kernels presently require this + const bool bidir = false; // communication is only ever done in one direction at once + createComms(R, true, bidir); // always need to allocate space for non-partitioned dimension for copyGenericGauge + + // loop over backwards and forwards links + const QudaLinkDirection directions[] = {QUDA_LINK_BACKWARDS, QUDA_LINK_FORWARDS}; + for (int link_dir = 0; link_dir < 2; link_dir++) { + if (!(link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == directions[link_dir])) continue; + + void *send_d[2 * QUDA_MAX_DIM] = {}; + void 
*recv_d[2 * QUDA_MAX_DIM] = {}; + + size_t offset = 0; + for (int d = 0; d < nDim; d++) { + recv_d[d] = static_cast(ghost_recv_buffer_d[bufferIndex]) + offset; + if (bidir) offset += ghost_face_bytes_aligned[d]; + send_d[d] = static_cast(ghost_send_buffer_d[bufferIndex]) + offset; + offset += ghost_face_bytes_aligned[d]; + } + + extractGaugeGhost(*this, send_d, true, link_dir * nDim); // get the links into contiguous buffers + qudaDeviceSynchronize(); // synchronize before issuing mem copies in different streams - could replace with event post and wait + + // issue receive preposts and host-to-device copies if needed + for (int dim = 0; dim < nDim; dim++) { + if (!comm_dim_partitioned(dim)) continue; + recvStart(dim, dir); // prepost the receive + if (!comm_peer2peer_enabled(dir, dim) && !comm_gdr_enabled()) { + qudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir], + ghost_face_bytes[dim], qudaMemcpyDeviceToHost, device::get_stream(2 * dim + dir)); + } + } + + // if gdr enabled then synchronize + if (comm_gdr_enabled()) qudaDeviceSynchronize(); + + // if the sending direction is not peer-to-peer then we need to synchronize before we start sending + for (int dim = 0; dim < nDim; dim++) { + if (!comm_dim_partitioned(dim)) continue; + if (!comm_peer2peer_enabled(dir, dim) && !comm_gdr_enabled()) + qudaStreamSynchronize(device::get_stream(2 * dim + dir)); + sendStart(dim, dir, device::get_stream(2 * dim + dir)); // start sending + } + + // complete communication and issue host-to-device copies if needed + for (int dim = 0; dim < nDim; dim++) { + if (!comm_dim_partitioned(dim)) continue; + commsComplete(dim, dir); + if (!comm_peer2peer_enabled(1 - dir, dim) && !comm_gdr_enabled()) { + qudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][1 - dir], + from_face_dim_dir_h[bufferIndex][dim][1 - dir], ghost_face_bytes[dim], + qudaMemcpyHostToDevice, device::get_stream(2 * dim + dir)); + } + } + + qudaDeviceSynchronize(); // 
synchronize before issuing kernels / copies in default stream - could replace with event post and wait + + // fill in the halos for non-partitioned dimensions + for (int dim = 0; dim < nDim; dim++) { + if (!comm_dim_partitioned(dim) && no_comms_fill) { + qudaMemcpy(recv_d[dim], send_d[dim], ghost_face_bytes[dim], qudaMemcpyDeviceToDevice); + } + } + + if (isNative()) { + copyGenericGauge(*this, *this, QUDA_CUDA_FIELD_LOCATION, 0, 0, 0, recv_d, 1 + 2 * link_dir); // 1, 3 + } else { + // copy from receive buffer into ghost array + for (int dim = 0; dim < nDim; dim++) + qudaMemcpy(ghost[dim + link_dir * nDim].data(), recv_d[dim], ghost_face_bytes[dim], qudaMemcpyDeviceToDevice); + } + + bufferIndex = 1 - bufferIndex; + } // link_dir + + qudaDeviceSynchronize(); + } else { // cpu field + void *send[2 * QUDA_MAX_DIM]; + for (int d = 0; d < nDim; d++) { + send[d] = safe_malloc(nFace * surface[d] * nInternal * precision); + if (geometry == QUDA_COARSE_GEOMETRY) send[d + 4] = safe_malloc(nFace * surface[d] * nInternal * precision); + } + + void *ghost_[2 * QUDA_MAX_DIM]; + for (auto i = 0; i < geometry; i++) ghost_[i] = ghost[i].data(); + + // get the links into contiguous buffers + if (link_direction == QUDA_LINK_BACKWARDS || link_direction == QUDA_LINK_BIDIRECTIONAL) { + extractGaugeGhost(*this, send, true); + + // communicate between nodes + exchange(ghost_, send, QUDA_FORWARDS); + } + + // repeat if requested and links are bi-directional + if (link_direction == QUDA_LINK_FORWARDS || link_direction == QUDA_LINK_BIDIRECTIONAL) { + extractGaugeGhost(*this, send, true, nDim); + exchange(ghost_ + nDim, send + nDim, QUDA_FORWARDS); + } + + for (int d = 0; d < geometry; d++) host_free(send[d]); + } + } + + // This does the opposite of exchangeGhost and sends back the ghost + // zone to the node from which it came and injects it back into the + // field + void GaugeField::injectGhost(QudaLinkDirection link_direction) + { + if (ghostExchange != QUDA_GHOST_EXCHANGE_PAD) + 
errorQuda("Cannot call exchangeGhost with ghostExchange=%d", ghostExchange); + if (geometry != QUDA_VECTOR_GEOMETRY && geometry != QUDA_COARSE_GEOMETRY) + errorQuda("Invalid geometry=%d", geometry); + if (link_direction != QUDA_LINK_BACKWARDS) errorQuda("Invalid link_direction = %d", link_direction); + if (nFace == 0) errorQuda("nFace = 0"); + + if (location == QUDA_CUDA_FIELD_LOCATION) { + const int dir = 0; // sending backwards only + const lat_dim_t R = {nFace, nFace, nFace, nFace}; + const bool no_comms_fill = false; // injection never does no_comms_fill + const bool bidir = false; // communication is only ever done in one direction at once + createComms(R, true, bidir); // always need to allocate space for non-partitioned dimension for copyGenericGauge + + // loop over backwards and forwards links (forwards links never sent but leave here just in case) + const QudaLinkDirection directions[] = {QUDA_LINK_BACKWARDS, QUDA_LINK_FORWARDS}; + for (int link_dir = 0; link_dir < 2; link_dir++) { + if (!(link_direction == QUDA_LINK_BIDIRECTIONAL || link_direction == directions[link_dir])) continue; + + void *send_d[2 * QUDA_MAX_DIM] = {}; + void *recv_d[2 * QUDA_MAX_DIM] = {}; + + size_t offset = 0; + for (int d = 0; d < nDim; d++) { + // send backwards is first half of each ghost_send_buffer + send_d[d] = static_cast(ghost_send_buffer_d[bufferIndex]) + offset; + if (bidir) offset += ghost_face_bytes_aligned[d]; + // receive from forwards is the second half of each ghost_recv_buffer + recv_d[d] = static_cast(ghost_recv_buffer_d[bufferIndex]) + offset; + offset += ghost_face_bytes_aligned[d]; + } + + if (isNative()) { // copy from padded region in gauge field into send buffer + copyGenericGauge(*this, *this, QUDA_CUDA_FIELD_LOCATION, 0, 0, send_d, 0, 1 + 2 * link_dir); + } else { // copy from receive buffer into ghost array + for (int dim = 0; dim < nDim; dim++) + qudaMemcpy(send_d[dim], ghost[dim + link_dir * nDim].data(), ghost_face_bytes[dim], 
qudaMemcpyDeviceToDevice); + } + qudaDeviceSynchronize(); // need to synchronize before issueing copies in different streams - could replace with event post and wait + + // issue receive preposts and host-to-device copies if needed + for (int dim = 0; dim < nDim; dim++) { + if (!comm_dim_partitioned(dim)) continue; + recvStart(dim, dir); // prepost the receive + if (!comm_peer2peer_enabled(dir, dim) && !comm_gdr_enabled()) { + qudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir], + ghost_face_bytes[dim], qudaMemcpyDeviceToHost, device::get_stream(2 * dim + dir)); + } + } + + // if gdr enabled then synchronize + if (comm_gdr_enabled()) qudaDeviceSynchronize(); + + // if the sending direction is not peer-to-peer then we need to synchronize before we start sending + for (int dim = 0; dim < nDim; dim++) { + if (!comm_dim_partitioned(dim)) continue; + if (!comm_peer2peer_enabled(dir, dim) && !comm_gdr_enabled()) + qudaStreamSynchronize(device::get_stream(2 * dim + dir)); + sendStart(dim, dir, device::get_stream(2 * dim + dir)); // start sending + } + + // complete communication and issue host-to-device copies if needed + for (int dim = 0; dim < nDim; dim++) { + if (!comm_dim_partitioned(dim)) continue; + commsComplete(dim, dir); + if (!comm_peer2peer_enabled(1 - dir, dim) && !comm_gdr_enabled()) { + qudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][1 - dir], + from_face_dim_dir_h[bufferIndex][dim][1 - dir], ghost_face_bytes[dim], + qudaMemcpyHostToDevice, device::get_stream(2 * dim + dir)); + } + } + + qudaDeviceSynchronize(); // synchronize before issuing kernel / copies in default stream - could replace with event post and wait + + // fill in the halos for non-partitioned dimensions + for (int dim = 0; dim < nDim; dim++) { + if (!comm_dim_partitioned(dim) && no_comms_fill) { + qudaMemcpy(recv_d[dim], send_d[dim], ghost_face_bytes[dim], qudaMemcpyDeviceToDevice); + } + } + + // get the links into contiguous buffers + 
extractGaugeGhost(*this, recv_d, false, link_dir * nDim); + + bufferIndex = 1 - bufferIndex; + } // link_dir + + qudaDeviceSynchronize(); + } else { + void *recv[QUDA_MAX_DIM]; + for (int d = 0; d < nDim; d++) recv[d] = safe_malloc(nFace * surface[d] * nInternal * precision); + + void *ghost_[] = {ghost[0].data(), ghost[1].data(), ghost[2].data(), ghost[3].data(), + ghost[4].data(), ghost[5].data(), ghost[6].data(), ghost[7].data()}; + + // communicate between nodes + exchange(recv, ghost_, QUDA_BACKWARDS); + + // get the links into contiguous buffers + extractGaugeGhost(*this, recv, false); + + for (int d = 0; d < QUDA_MAX_DIM; d++) host_free(recv[d]); + } + } + + void GaugeField::exchangeExtendedGhost(const lat_dim_t &R, bool no_comms_fill) + { + if (location == QUDA_CUDA_FIELD_LOCATION) { + const int b = bufferIndex; + void *send_d[QUDA_MAX_DIM], *recv_d[QUDA_MAX_DIM]; + + createComms(R, no_comms_fill); + + size_t offset = 0; + for (int dim = 0; dim < nDim; dim++) { + if (!(comm_dim_partitioned(dim) || (no_comms_fill && R[dim]))) continue; + send_d[dim] = static_cast(ghost_send_buffer_d[b]) + offset; + recv_d[dim] = static_cast(ghost_recv_buffer_d[b]) + offset; + + // silence cuda-memcheck initcheck errors that arise since we + // have an oversized ghost buffer when doing the extended exchange + qudaMemsetAsync(send_d[dim], 0, 2 * ghost_face_bytes_aligned[dim], device::get_default_stream()); + offset += 2 * ghost_face_bytes_aligned[dim]; // factor of two from fwd/back + } + + for (int dim = 0; dim < nDim; dim++) { + if (!(comm_dim_partitioned(dim) || (no_comms_fill && R[dim]))) continue; + + // extract into a contiguous buffer + extractExtendedGaugeGhost(*this, dim, R, send_d, true); + + if (comm_dim_partitioned(dim)) { + qudaDeviceSynchronize(); // synchronize before issuing mem copies in different streams - could replace with event post and wait + + for (int dir = 0; dir < 2; dir++) recvStart(dim, dir); + + for (int dir = 0; dir < 2; dir++) { + // issue 
host-to-device copies if needed + if (!comm_peer2peer_enabled(dir, dim) && !comm_gdr_enabled()) { + qudaMemcpyAsync(my_face_dim_dir_h[bufferIndex][dim][dir], my_face_dim_dir_d[bufferIndex][dim][dir], + ghost_face_bytes[dim], qudaMemcpyDeviceToHost, device::get_stream(dir)); + } + } + + // if either direction is not peer-to-peer then we need to synchronize + if (!comm_peer2peer_enabled(0, dim) || !comm_peer2peer_enabled(1, dim)) qudaDeviceSynchronize(); + + for (int dir = 0; dir < 2; dir++) sendStart(dim, dir, device::get_stream(dir)); + for (int dir = 0; dir < 2; dir++) commsComplete(dim, dir); + + for (int dir = 0; dir < 2; dir++) { + // issue host-to-device copies if needed + if (!comm_peer2peer_enabled(dir, dim) && !comm_gdr_enabled()) { + qudaMemcpyAsync(from_face_dim_dir_d[bufferIndex][dim][dir], from_face_dim_dir_h[bufferIndex][dim][dir], + ghost_face_bytes[dim], qudaMemcpyHostToDevice, device::get_stream(dir)); + } + } + + } else { // if just doing a local exchange to fill halo then need to swap faces + qudaMemcpy(from_face_dim_dir_d[b][dim][1], my_face_dim_dir_d[b][dim][0], ghost_face_bytes[dim], + qudaMemcpyDeviceToDevice); + qudaMemcpy(from_face_dim_dir_d[b][dim][0], my_face_dim_dir_d[b][dim][1], ghost_face_bytes[dim], + qudaMemcpyDeviceToDevice); + } + + // inject back into the gauge field + // need to synchronize the copy streams before rejoining the compute stream - could replace with event post and wait + qudaDeviceSynchronize(); + extractExtendedGaugeGhost(*this, dim, R, recv_d, false); + } + + bufferIndex = 1 - bufferIndex; + qudaDeviceSynchronize(); + } else { + void *send[QUDA_MAX_DIM]; + void *recv[QUDA_MAX_DIM]; + size_t bytes[QUDA_MAX_DIM]; + // store both parities and directions in each + for (int d = 0; d < nDim; d++) { + if (!(comm_dim_partitioned(d) || (no_comms_fill && R[d]))) continue; + bytes[d] = surface[d] * R[d] * geometry * nInternal * precision; + send[d] = safe_malloc(2 * bytes[d]); + recv[d] = safe_malloc(2 * bytes[d]); + } + + 
for (int d = 0; d < nDim; d++) { + if (!(comm_dim_partitioned(d) || (no_comms_fill && R[d]))) continue; + // extract into a contiguous buffer + extractExtendedGaugeGhost(*this, d, R, send, true); + + if (comm_dim_partitioned(d)) { + // do the exchange + MsgHandle *mh_recv_back; + MsgHandle *mh_recv_fwd; + MsgHandle *mh_send_fwd; + MsgHandle *mh_send_back; + + mh_recv_back = comm_declare_receive_relative(recv[d], d, -1, bytes[d]); + mh_recv_fwd = comm_declare_receive_relative(((char *)recv[d]) + bytes[d], d, +1, bytes[d]); + mh_send_back = comm_declare_send_relative(send[d], d, -1, bytes[d]); + mh_send_fwd = comm_declare_send_relative(((char *)send[d]) + bytes[d], d, +1, bytes[d]); + + comm_start(mh_recv_back); + comm_start(mh_recv_fwd); + comm_start(mh_send_fwd); + comm_start(mh_send_back); + + comm_wait(mh_send_fwd); + comm_wait(mh_send_back); + comm_wait(mh_recv_back); + comm_wait(mh_recv_fwd); + + comm_free(mh_send_fwd); + comm_free(mh_send_back); + comm_free(mh_recv_back); + comm_free(mh_recv_fwd); + } else { + memcpy(static_cast(recv[d]) + bytes[d], send[d], bytes[d]); + memcpy(recv[d], static_cast(send[d]) + bytes[d], bytes[d]); + } + + // inject back into the gauge field + extractExtendedGaugeGhost(*this, d, R, recv, false); + } + + for (int d = 0; d < nDim; d++) { + if (!(comm_dim_partitioned(d) || (no_comms_fill && R[d]))) continue; + host_free(send[d]); + host_free(recv[d]); + } + } + } + + void GaugeField::exchangeExtendedGhost(const lat_dim_t &R, TimeProfile &profile, bool no_comms_fill) + { + profile.TPSTART(QUDA_PROFILE_COMMS); + exchangeExtendedGhost(R, no_comms_fill); + profile.TPSTOP(QUDA_PROFILE_COMMS); + } + void GaugeField::exchange(void **ghost_link, void **link_sendbuf, QudaDirection dir) const { MsgHandle *mh_send[4]; @@ -596,4 +1044,44 @@ namespace quda { backed_up = false; } + void GaugeField::copy_to_buffer(void *buffer) const + { + if (location == QUDA_CUDA_FIELD_LOCATION) { + qudaMemcpy(buffer, data(), Bytes(), qudaMemcpyDeviceToHost); + 
} else { + if (is_pointer_array(order)) { + char *dst_buffer = reinterpret_cast(buffer); + for (int d = 0; d < site_dim; d++) { + std::memcpy(&dst_buffer[d * bytes / site_dim], gauge_array[d].data(), bytes / site_dim); + } + } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER + || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER + || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) { + std::memcpy(buffer, data(), Bytes()); + } else { + errorQuda("Unsupported order = %d", Order()); + } + } + } + + void GaugeField::copy_from_buffer(void *buffer) + { + if (location == QUDA_CUDA_FIELD_LOCATION) { + qudaMemcpy(data(), buffer, Bytes(), qudaMemcpyHostToDevice); + } else { + if (is_pointer_array(order)) { + const char *dst_buffer = reinterpret_cast(buffer); + for (int d = 0; d < site_dim; d++) { + std::memcpy(gauge_array[d].data(), &dst_buffer[d * bytes / site_dim], bytes / site_dim); + } + } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER + || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER + || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) { + std::memcpy(data(), buffer, Bytes()); + } else { + errorQuda("Unsupported order = %d", Order()); + } + } + } + } // namespace quda From af7fb2c76f2e985df8bbdffc383e7bfb36b840a6 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 11 May 2023 13:28:46 -0700 Subject: [PATCH 05/99] Further steps towards gauge field unification (copy/load/save routines now unified) --- include/gauge_field.h | 15 +-- include/gauge_field_order.h | 84 +++++++------- lib/cpu_gauge_field.cpp | 82 -------------- lib/cuda_gauge_field.cpp | 165 ++------------------------- lib/gauge_field.cpp | 218 +++++++++++++++++++++++++++++++++++- 5 files changed, 269 insertions(+), 295 deletions(-) diff --git a/include/gauge_field.h b/include/gauge_field.h index cfc38855f4..4dd2352484 100644 
--- a/include/gauge_field.h +++ b/include/gauge_field.h @@ -496,7 +496,7 @@ namespace quda { * Generic gauge field copy * @param[in] src Source from which we are copying */ - virtual void copy(const GaugeField &src) = 0; + void copy(const GaugeField &src); /** @brief Compute the L1 norm of the field @@ -581,12 +581,6 @@ namespace quda { public: cudaGaugeField(const GaugeFieldParam &); - /** - * Generic gauge field copy - * @param[in] src Source from which we are copying - */ - void copy(const GaugeField &src); - /** @brief Download into this field from a CPU field @param[in] cpu The CPU field source @@ -618,7 +612,6 @@ namespace quda { class cpuGaugeField : public GaugeField { - friend void cudaGaugeField::copy(const GaugeField &cpu); friend void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu); friend void cudaGaugeField::saveCPUField(cpuGaugeField &cpu) const; @@ -631,12 +624,6 @@ namespace quda { extended. */ cpuGaugeField(const GaugeFieldParam ¶m); - - /** - * Generic gauge field copy - * @param[in] src Source from which we are copying - */ - void copy(const GaugeField &src); }; /** diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index c9afebde5d..451c8312c6 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -422,25 +422,26 @@ namespace quda { template struct GhostAccessor { using wrapper = fieldorder_wrapper; - complex *ghost[8]; - unsigned int ghostOffset[8]; - Float scale; - Float scale_inv; + complex *ghost[8] = {}; + unsigned int ghostOffset[8] = {}; + Float scale = static_cast(1.0); + Float scale_inv = static_cast(1.0); static constexpr bool fixed = fixed_point(); - GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr) : - scale(static_cast(1.0)), scale_inv(static_cast(1.0)) + GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr) { - for (int d=0; d<4; d++) { - ghost[d] = ghost_ ? 
static_cast*>(ghost_[d]) : - static_cast*>(const_cast(U.Ghost()[d].data())); - ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); + if (U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD) { + for (int d=0; d<4; d++) { + ghost[d] = ghost_ ? static_cast*>(ghost_[d]) : + static_cast*>(const_cast(U.Ghost()[d].data())); + ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); - ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr : - ghost_ ? static_cast*>(ghost_[d+4]) : - static_cast*>(const_cast(U.Ghost()[d+4].data())); - ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); - } + ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr : + ghost_ ? static_cast*>(ghost_[d+4]) : + static_cast*>(const_cast(U.Ghost()[d+4].data())); + ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); + } + } resetScale(U.Scale()); } @@ -543,25 +544,26 @@ namespace quda { template struct GhostAccessor { using wrapper = fieldorder_wrapper; - complex *ghost[8]; - unsigned int ghostOffset[8]; - Float scale; - Float scale_inv; + complex *ghost[8] = {}; + unsigned int ghostOffset[8] = {}; + Float scale = static_cast(1.0); + Float scale_inv = static_cast(1.0); static constexpr bool fixed = fixed_point(); - GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr) : - scale(static_cast(1.0)), scale_inv(static_cast(1.0)) + GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr) { - for (int d=0; d<4; d++) { - ghost[d] = ghost_ ? static_cast*>(ghost_[d]) : - static_cast*>(const_cast(U.Ghost()[d].data())); - ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); + if (U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD) { + for (int d=0; d<4; d++) { + ghost[d] = ghost_ ? static_cast*>(ghost_[d]) : + static_cast*>(const_cast(U.Ghost()[d].data())); + ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); - ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr : - ghost_ ? 
static_cast*>(ghost_[d+4]) : - static_cast*>(const_cast(U.Ghost()[d+4].data())); - ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); - } + ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr : + ghost_ ? static_cast*>(ghost_[d+4]) : + static_cast*>(const_cast(U.Ghost()[d+4].data())); + ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); + } + } resetScale(U.Scale()); } @@ -674,26 +676,26 @@ namespace quda { template struct GhostAccessor { using wrapper = fieldorder_wrapper; - complex *ghost[8]; + complex *ghost[8] = {}; const int volumeCB; - unsigned int ghostVolumeCB[8]; - Float scale; - Float scale_inv; + unsigned int ghostVolumeCB[8] = {}; + Float scale = static_cast(1.0); + Float scale_inv = static_cast(1.0); static constexpr bool fixed = fixed_point(); Accessor accessor; GhostAccessor(const GaugeField &U, void *gauge_, void **ghost_ = 0) : volumeCB(U.VolumeCB()), - scale(static_cast(1.0)), - scale_inv(static_cast(1.0)), accessor(U, gauge_, ghost_) { if constexpr (!native_ghost) assert(ghost_ != nullptr); - for (int d = 0; d < 4; d++) { - ghost[d] = !native_ghost ? static_cast*>(ghost_[d]) : nullptr; - ghostVolumeCB[d] = U.Nface()*U.SurfaceCB(d); - ghost[d+4] = !native_ghost && U.Geometry() == QUDA_COARSE_GEOMETRY? static_cast*>(ghost_[d+4]) : nullptr; - ghostVolumeCB[d+4] = U.Nface()*U.SurfaceCB(d); + if (U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD) { + for (int d = 0; d < 4; d++) { + ghost[d] = !native_ghost ? static_cast*>(ghost_[d]) : nullptr; + ghostVolumeCB[d] = U.Nface()*U.SurfaceCB(d); + ghost[d+4] = !native_ghost && U.Geometry() == QUDA_COARSE_GEOMETRY? 
static_cast*>(ghost_[d+4]) : nullptr; + ghostVolumeCB[d+4] = U.Nface()*U.SurfaceCB(d); + } } resetScale(U.Scale()); } diff --git a/lib/cpu_gauge_field.cpp b/lib/cpu_gauge_field.cpp index a6eb000b58..0c340504bd 100644 --- a/lib/cpu_gauge_field.cpp +++ b/lib/cpu_gauge_field.cpp @@ -10,90 +10,8 @@ namespace quda { cpuGaugeField::cpuGaugeField(const GaugeFieldParam ¶m) : GaugeField(param) { - // exchange the boundaries if a non-trivial field - if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD) - if (create == QUDA_REFERENCE_FIELD_CREATE && (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY)) { - exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL); - } - // compute the fat link max now in case it is needed later (i.e., for half precision) if (param.compute_fat_link_max) fat_link_max = this->abs_max(); } - // defined in cudaGaugeField - void *create_gauge_buffer(size_t bytes, QudaGaugeFieldOrder order, QudaFieldGeometry geometry); - void **create_ghost_buffer(size_t bytes[], QudaGaugeFieldOrder order, QudaFieldGeometry geometry); - void free_gauge_buffer(void *buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry); - void free_ghost_buffer(void **buffer, QudaGaugeFieldOrder order, QudaFieldGeometry geometry); - - void cpuGaugeField::copy(const GaugeField &src) { - if (this == &src) return; - - checkField(src); - - if (link_type == QUDA_ASQTAD_FAT_LINKS) { - fat_link_max = src.LinkMax(); - if (fat_link_max == 0.0 && precision < QUDA_SINGLE_PRECISION) fat_link_max = src.abs_max(); - } else { - fat_link_max = 1.0; - } - - if (typeid(src) == typeid(cudaGaugeField)) { - - if (reorder_location() == QUDA_CPU_FIELD_LOCATION) { - - if (!src.isNative()) errorQuda("Only native order is supported"); - void *buffer = pool_pinned_malloc(src.Bytes()); - qudaMemcpy(buffer, src.data(), src.Bytes(), qudaMemcpyDeviceToHost); - - copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, nullptr, buffer); - 
pool_pinned_free(buffer); - - } else { // else on the GPU - - void *buffer = create_gauge_buffer(bytes, order, geometry); - size_t ghost_bytes[8]; - int dstNinternal = reconstruct != QUDA_RECONSTRUCT_NO ? reconstruct : 2*nColor*nColor; - for (int d=0; d 0) ? create_ghost_buffer(ghost_bytes, order, geometry) : nullptr; - - if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED) { - copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr, ghost_buffer, nullptr); - if (geometry == QUDA_COARSE_GEOMETRY) - copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr, ghost_buffer, nullptr, - 3); // forwards links if bi-directional - } else { - copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, 0); - } - - if (order == QUDA_QDP_GAUGE_ORDER) { - for (int d=0; d 4 && ghostExchange == QUDA_GHOST_EXCHANGE_PAD && src.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace) - for (int d=0; d 0) free_ghost_buffer(ghost_buffer, order, geometry); - } - - } else if (typeid(src) == typeid(cpuGaugeField)) { - // copy field and ghost zone directly - copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION); - } else { - errorQuda("Invalid gauge field type"); - } - - // if we have copied from a source without a pad then we need to exchange - if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD && - src.GhostExchange() != QUDA_GHOST_EXCHANGE_PAD) { - exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? 
QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL); - } - } - } // namespace quda diff --git a/lib/cuda_gauge_field.cpp b/lib/cuda_gauge_field.cpp index 3e5e60acdc..39d3a28d02 100644 --- a/lib/cuda_gauge_field.cpp +++ b/lib/cuda_gauge_field.cpp @@ -7,168 +7,19 @@ namespace quda { - cudaGaugeField::cudaGaugeField(const GaugeFieldParam ¶m) : GaugeField(param) - { - // exchange the boundaries if a non-trivial field - if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD) - if (create == QUDA_REFERENCE_FIELD_CREATE && (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY)) { - exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL); - } - } - - void *create_gauge_buffer(size_t bytes, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) { - if (order == QUDA_QDP_GAUGE_ORDER) { - void **buffer = new void*[geometry]; - for (int d=0; d 4) { - void **buffer = new void*[geometry]; - for (int d=0; d 4) { - for (int d=0; d 0) ? create_ghost_buffer(ghost_bytes, src.Order(), geometry) : nullptr; - - if (src.Order() == QUDA_QDP_GAUGE_ORDER) { - for (int d=0; d 4 && GhostExchange() == QUDA_GHOST_EXCHANGE_PAD - && src.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace) - for (int d = 0; d < geometry; d++) - qudaMemcpy(ghost_buffer[d], src.Ghost()[d].data(), ghost_bytes[d], qudaMemcpyDefault); - - if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) { - copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, buffer, nullptr, ghost_buffer); - if (geometry == QUDA_COARSE_GEOMETRY) - copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, buffer, nullptr, ghost_buffer, 3); - } else { - copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, buffer); - if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported"); - } - free_gauge_buffer(buffer, src.Order(), src.Geometry()); - if (nFace > 0) free_ghost_buffer(ghost_buffer, 
src.Order(), geometry); - } - } // reorder_location - } else { - errorQuda("Invalid gauge field type"); - } - - // if we have copied from a source without a pad then we need to exchange - if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD && src.GhostExchange() != QUDA_GHOST_EXCHANGE_PAD) - exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL); - - staggeredPhaseApplied = src.StaggeredPhaseApplied(); - staggeredPhaseType = src.StaggeredPhase(); - - qudaDeviceSynchronize(); // include sync here for accurate host-device profiling - } - - void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu) { - copy(cpu); - qudaDeviceSynchronize(); - } - - void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu, TimeProfile &profile) { + void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu, TimeProfile &profile) + { profile.TPSTART(QUDA_PROFILE_H2D); - loadCPUField(cpu); + copy(cpu); profile.TPSTOP(QUDA_PROFILE_H2D); } - void cudaGaugeField::saveCPUField(cpuGaugeField &cpu) const + void cudaGaugeField::saveCPUField(cpuGaugeField &cpu) const { cpu.copy(*this); } +#if 0 { cpu.checkField(*this); @@ -237,7 +88,7 @@ namespace quda { qudaDeviceSynchronize(); } - +#endif void cudaGaugeField::saveCPUField(cpuGaugeField &cpu, TimeProfile &profile) const { profile.TPSTART(QUDA_PROFILE_D2H); saveCPUField(cpu); diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp index 6e23168e87..2f59c11760 100644 --- a/lib/gauge_field.cpp +++ b/lib/gauge_field.cpp @@ -205,6 +205,12 @@ namespace quda { } setTuningString(); + + // exchange the boundaries if a non-trivial field + if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD) + if (create == QUDA_REFERENCE_FIELD_CREATE && (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY)) { + exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? 
QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL); + } } GaugeField::~GaugeField() { } @@ -840,7 +846,217 @@ namespace quda { } } - std::ostream& operator<<(std::ostream& output, const GaugeFieldParam& param) { + void *create_gauge_buffer(size_t bytes, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) { + if (order == QUDA_QDP_GAUGE_ORDER) { + void **buffer = new void*[geometry]; + for (int d=0; d 4) { + void **buffer = new void*[geometry]; + for (int d=0; d 4) { + for (int d=0; d 0) ? create_ghost_buffer(ghost_bytes, order, geometry) : nullptr; + + if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED) { + copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr, ghost_buffer, nullptr); + if (geometry == QUDA_COARSE_GEOMETRY) + copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr, ghost_buffer, nullptr, + 3); // forwards links if bi-directional + } else { + copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, 0); + } + + if (order == QUDA_QDP_GAUGE_ORDER) { + for (int d=0; d 4 && ghostExchange == QUDA_GHOST_EXCHANGE_PAD && src.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace) + for (int d=0; d 0) free_ghost_buffer(ghost_buffer, order, geometry); + } // order + } + + } + + } else if (src.Location() == QUDA_CPU_FIELD_LOCATION) { + + if (location == QUDA_CPU_FIELD_LOCATION) { + // copy field and ghost zone directly + copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION); + } else { + if (reorder_location() == QUDA_CPU_FIELD_LOCATION) { // do reorder on the CPU + void *buffer = pool_pinned_malloc(bytes); + + if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) { + // copy field and ghost zone into buffer + copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, nullptr); + + if (geometry == QUDA_COARSE_GEOMETRY) + copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, nullptr, 0, 0, 3); + } else { + copyExtendedGauge(*this, src, QUDA_CPU_FIELD_LOCATION, 
buffer, nullptr); + if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported"); + } + + qudaMemcpy(gauge.data(), buffer, bytes, qudaMemcpyDefault); + pool_pinned_free(buffer); + } else { // else on the GPU + + if (src.Order() == QUDA_MILC_SITE_GAUGE_ORDER || + src.Order() == QUDA_BQCD_GAUGE_ORDER || + src.Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) { + // special case where we use zero-copy memory to read/write directly from application's array + void *src_d = get_mapped_device_pointer(src.data()); + + if (src.GhostExchange() == QUDA_GHOST_EXCHANGE_NO) { + copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, data(), src_d); + } else { + errorQuda("Ghost copy not supported here"); + } + + } else { + void *buffer = create_gauge_buffer(src.Bytes(), src.Order(), src.Geometry()); + size_t ghost_bytes[8]; + int srcNinternal = src.Reconstruct() != QUDA_RECONSTRUCT_NO ? src.Reconstruct() : 2*nColor*nColor; + for (int d=0; d 0) ? create_ghost_buffer(ghost_bytes, src.Order(), geometry) : nullptr; + + if (src.Order() == QUDA_QDP_GAUGE_ORDER) { + for (int d=0; d 4 && GhostExchange() == QUDA_GHOST_EXCHANGE_PAD + && src.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace) + for (int d = 0; d < geometry; d++) + qudaMemcpy(ghost_buffer[d], src.Ghost()[d].data(), ghost_bytes[d], qudaMemcpyDefault); + + if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED && src.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) { + copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, buffer, nullptr, ghost_buffer); + if (geometry == QUDA_COARSE_GEOMETRY) + copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, buffer, nullptr, ghost_buffer, 3); + } else { + copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, nullptr, buffer); + if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("Extended gauge copy for coarse geometry not supported"); + } + free_gauge_buffer(buffer, src.Order(), src.Geometry()); + if (nFace > 0) 
free_ghost_buffer(ghost_buffer, src.Order(), geometry); + } + } // reorder_location + } // this location + } else { + errorQuda("Invalid gauge field type"); + } + + // if we have copied from a source without a pad then we need to exchange + if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD && src.GhostExchange() != QUDA_GHOST_EXCHANGE_PAD) + exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL); + + staggeredPhaseApplied = src.StaggeredPhaseApplied(); + staggeredPhaseType = src.StaggeredPhase(); + + qudaDeviceSynchronize(); // include sync here for accurate host-device profiling + } + + std::ostream& operator<<(std::ostream& output, const GaugeFieldParam& param) + { output << static_cast(param); output << "nColor = " << param.nColor << std::endl; output << "nFace = " << param.nFace << std::endl; From 0671db1111295922c29f11be7d6dad436c4ce34e Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 11 May 2023 14:15:37 -0700 Subject: [PATCH 06/99] Removed legacy load/save CPUField routines, replaced with GaugeField::copy --- include/gauge_field.h | 30 -------- lib/cpu_gauge_field.cpp | 7 +- lib/cuda_gauge_field.cpp | 86 --------------------- lib/gauge_field.cpp | 3 + lib/interface_quda.cpp | 113 +++++++++++++++++----------- lib/staggered_kd_build_xinv.cu | 2 +- tests/hisq_paths_force_test.cpp | 12 +-- tests/hisq_unitarize_force_test.cpp | 6 +- tests/pack_test.cpp | 8 +- tests/unitarize_link_test.cpp | 4 +- 10 files changed, 90 insertions(+), 181 deletions(-) diff --git a/include/gauge_field.h b/include/gauge_field.h index 4dd2352484..7648ba7f9b 100644 --- a/include/gauge_field.h +++ b/include/gauge_field.h @@ -581,40 +581,10 @@ namespace quda { public: cudaGaugeField(const GaugeFieldParam &); - /** - @brief Download into this field from a CPU field - @param[in] cpu The CPU field source - */ - void loadCPUField(const cpuGaugeField &cpu); - - /** - @brief Download into this field from a CPU field. 
Overloaded - variant that includes profiling - @param[in] cpu The CPU field source - @param[in] profile Time profile to record the transfer - */ - void loadCPUField(const cpuGaugeField &cpu, TimeProfile &profile); - - /** - @brief Upload from this field into a CPU field - @param[out] cpu The CPU field source - */ - void saveCPUField(cpuGaugeField &cpu) const; - - /** - @brief Upload from this field into a CPU field. Overloaded - variant that includes profiling. - @param[out] cpu The CPU field source - @param[in] profile Time profile to record the transfer - */ - void saveCPUField(cpuGaugeField &cpu, TimeProfile &profile) const; }; class cpuGaugeField : public GaugeField { - friend void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu); - friend void cudaGaugeField::saveCPUField(cpuGaugeField &cpu) const; - public: /** @brief Constructor for cpuGaugeField from a GaugeFieldParam diff --git a/lib/cpu_gauge_field.cpp b/lib/cpu_gauge_field.cpp index 0c340504bd..8927fdb2d3 100644 --- a/lib/cpu_gauge_field.cpp +++ b/lib/cpu_gauge_field.cpp @@ -7,11 +7,6 @@ namespace quda { - cpuGaugeField::cpuGaugeField(const GaugeFieldParam ¶m) : - GaugeField(param) - { - // compute the fat link max now in case it is needed later (i.e., for half precision) - if (param.compute_fat_link_max) fat_link_max = this->abs_max(); - } + cpuGaugeField::cpuGaugeField(const GaugeFieldParam ¶m) : GaugeField(param) {} } // namespace quda diff --git a/lib/cuda_gauge_field.cpp b/lib/cuda_gauge_field.cpp index 39d3a28d02..e4d56bdfce 100644 --- a/lib/cuda_gauge_field.cpp +++ b/lib/cuda_gauge_field.cpp @@ -9,90 +9,4 @@ namespace quda { cudaGaugeField::cudaGaugeField(const GaugeFieldParam ¶m) : GaugeField(param) {} - void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu) { copy(cpu); } - - void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu, TimeProfile &profile) - { - profile.TPSTART(QUDA_PROFILE_H2D); - copy(cpu); - profile.TPSTOP(QUDA_PROFILE_H2D); - } - - void 
cudaGaugeField::saveCPUField(cpuGaugeField &cpu) const { cpu.copy(*this); } -#if 0 - { - cpu.checkField(*this); - - if (reorder_location() == QUDA_CUDA_FIELD_LOCATION) { - - if (cpu.Order() == QUDA_MILC_SITE_GAUGE_ORDER || - cpu.Order() == QUDA_BQCD_GAUGE_ORDER || - cpu.Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) { - // special case where we use zero-copy memory to read/write directly from application's array - void *cpu_d = get_mapped_device_pointer(cpu.data()); - if (cpu.GhostExchange() == QUDA_GHOST_EXCHANGE_NO) { - copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, cpu_d, nullptr); - } else { - errorQuda("Ghost copy not supported here"); - } - } else { - void *buffer = create_gauge_buffer(cpu.Bytes(), cpu.Order(), cpu.Geometry()); - - // Allocate space for ghost zone if required - size_t ghost_bytes[8]; - int cpuNinternal = cpu.Reconstruct() != QUDA_RECONSTRUCT_NO ? cpu.Reconstruct() : 2*nColor*nColor; - for (int d=0; d 0) ? create_ghost_buffer(ghost_bytes, cpu.Order(), geometry) : nullptr; - - if (cpu.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) { - copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr, ghost_buffer, nullptr); - if (geometry == QUDA_COARSE_GEOMETRY) - copyGenericGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr, ghost_buffer, nullptr, 3); - } else { - copyExtendedGauge(cpu, *this, QUDA_CUDA_FIELD_LOCATION, buffer, nullptr); - } - - if (cpu.Order() == QUDA_QDP_GAUGE_ORDER) { - for (int d = 0; d < geometry; d++) - qudaMemcpy(cpu.data(d), ((void **)buffer)[d], cpu.Bytes() / geometry, qudaMemcpyDefault); - } else { - qudaMemcpy(cpu.data(), buffer, cpu.Bytes(), qudaMemcpyDefault); - } - - if (cpu.Order() > 4 && GhostExchange() == QUDA_GHOST_EXCHANGE_PAD - && cpu.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace) - for (int d = 0; d < geometry; d++) - qudaMemcpy(cpu.Ghost()[d].data(), ghost_buffer[d], ghost_bytes[d], qudaMemcpyDefault); - - free_gauge_buffer(buffer, cpu.Order(), cpu.Geometry()); - if (nFace > 0) 
free_ghost_buffer(ghost_buffer, cpu.Order(), geometry); - } - } else if (reorder_location() == QUDA_CPU_FIELD_LOCATION) { // do copy then host-side reorder - - void *buffer = pool_pinned_malloc(bytes); - qudaMemcpy(buffer, gauge.data(), bytes, qudaMemcpyDefault); - - if (cpu.GhostExchange() != QUDA_GHOST_EXCHANGE_EXTENDED) { - copyGenericGauge(cpu, *this, QUDA_CPU_FIELD_LOCATION, nullptr, buffer); - } else { - copyExtendedGauge(cpu, *this, QUDA_CPU_FIELD_LOCATION, nullptr, buffer); - } - pool_pinned_free(buffer); - - } else { - errorQuda("Invalid pack location %d", reorder_location()); - } - - cpu.staggeredPhaseApplied = staggeredPhaseApplied; - cpu.staggeredPhaseType = staggeredPhaseType; - - qudaDeviceSynchronize(); - } -#endif - void cudaGaugeField::saveCPUField(cpuGaugeField &cpu, TimeProfile &profile) const { - profile.TPSTART(QUDA_PROFILE_D2H); - saveCPUField(cpu); - profile.TPSTOP(QUDA_PROFILE_D2H); - } - } // namespace quda diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp index 2f59c11760..40cb1bf9b6 100644 --- a/lib/gauge_field.cpp +++ b/lib/gauge_field.cpp @@ -211,6 +211,9 @@ namespace quda { if (create == QUDA_REFERENCE_FIELD_CREATE && (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY)) { exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? 
QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL); } + + // compute the fat link max now in case it is needed later (i.e., for half precision) + if (param.compute_fat_link_max) fat_link_max = this->abs_max(); } GaugeField::~GaugeField() { } diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index dcb9873caf..27930a3b9a 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -799,7 +799,7 @@ void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param) } profileGauge.TPSTART(QUDA_PROFILE_D2H); - cudaGauge->saveCPUField(cpuGauge); + cpuGauge.copy(*cudaGauge); profileGauge.TPSTOP(QUDA_PROFILE_D2H); if (param->type == QUDA_SMEARED_LINKS) { delete cudaGauge; } @@ -3852,7 +3852,9 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, cudaGaugeField *cudaInLink = new cudaGaugeField(gParam); profileFatLink.TPSTOP(QUDA_PROFILE_INIT); - cudaInLink->loadCPUField(cpuInLink, profileFatLink); + profileFatLink.TPSTART(QUDA_PROFILE_H2D); + cudaInLink->copy(cpuInLink); + profileFatLink.TPSTOP(QUDA_PROFILE_H2D); cudaGaugeField *cudaInLinkEx = createExtendedGauge(*cudaInLink, R, profileFatLink); profileFatLink.TPSTART(QUDA_PROFILE_FREE); @@ -3874,7 +3876,9 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, longKSLink(cudaLongLink, *cudaInLinkEx, path_coeff); profileFatLink.TPSTOP(QUDA_PROFILE_COMPUTE); - cudaLongLink->saveCPUField(cpuLongLink, profileFatLink); + profileFatLink.TPSTART(QUDA_PROFILE_D2H); + cpuLongLink.copy(*cudaLongLink); + profileFatLink.TPSTOP(QUDA_PROFILE_D2H); profileFatLink.TPSTART(QUDA_PROFILE_FREE); delete cudaLongLink; @@ -3889,7 +3893,11 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, fatKSLink(cudaFatLink, *cudaInLinkEx, path_coeff); profileFatLink.TPSTOP(QUDA_PROFILE_COMPUTE); - if (fatlink) cudaFatLink->saveCPUField(cpuFatLink, profileFatLink); + if (fatlink) { + profileFatLink.TPSTART(QUDA_PROFILE_D2H); + cpuFatLink.copy(*cudaFatLink); + 
profileFatLink.TPSTOP(QUDA_PROFILE_D2H); + } profileFatLink.TPSTART(QUDA_PROFILE_FREE); delete cudaInLinkEx; @@ -3914,7 +3922,9 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, errorQuda("Error in unitarization component of the hisq fattening: %d failures", *num_failures_h); profileFatLink.TPSTOP(QUDA_PROFILE_COMPUTE); - cudaUnitarizedLink->saveCPUField(cpuUnitarizedLink, profileFatLink); + profileFatLink.TPSTART(QUDA_PROFILE_D2H); + cpuUnitarizedLink.copy(*cudaUnitarizedLink); + profileFatLink.TPSTOP(QUDA_PROFILE_D2H); profileFatLink.TPSTART(QUDA_PROFILE_FREE); delete cudaUnitarizedLink; @@ -3954,7 +3964,9 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param) cudaGaugeField *cudaInLink = new cudaGaugeField(gParam); profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT); - cudaInLink->loadCPUField(cpuInLink, profileGaussianSmear); + profileGaussianSmear.TPSTART(QUDA_PROFILE_H2D); + cudaInLink->copy(cpuInLink); + profileGaussianSmear.TPSTOP(QUDA_PROFILE_H2D); // cudaInLinkEx = createExtendedGauge(*cudaInLink, R, profileGaussianSmear); // @@ -3990,8 +4002,10 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param) gaugeSmeared->exchangeGhost(); profileGaussianSmear.TPSTOP(QUDA_PROFILE_COMPUTE); - // - gaugeSmeared->saveCPUField(cpuTwoLink, profileGaussianSmear); + + profileGaussianSmear.TPSTART(QUDA_PROFILE_D2H); + cpuTwoLink.copy(*gaugeSmeared); + profileGaussianSmear.TPSTOP(QUDA_PROFILE_D2H); profileGaussianSmear.TPSTART(QUDA_PROFILE_FREE); @@ -4031,7 +4045,7 @@ int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT); profileGaugeForce.TPSTART(QUDA_PROFILE_H2D); - cudaSiteLink->loadCPUField(*cpuSiteLink); + cudaSiteLink->copy(*cpuSiteLink); profileGaugeForce.TPSTOP(QUDA_PROFILE_H2D); profileGaugeForce.TPSTART(QUDA_PROFILE_INIT); @@ -4065,7 +4079,7 @@ int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, 
int profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT); if (!qudaGaugeParam->overwrite_mom) { profileGaugeForce.TPSTART(QUDA_PROFILE_H2D); - cudaMom->loadCPUField(*cpuMom); + cudaMom->copy(*cpuMom); profileGaugeForce.TPSTOP(QUDA_PROFILE_H2D); } } @@ -4103,7 +4117,7 @@ int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int if (qudaGaugeParam->return_result_mom) { profileGaugeForce.TPSTART(QUDA_PROFILE_D2H); - cudaMom->saveCPUField(*cpuMom); + cpuMom->copy(*cudaMom); profileGaugeForce.TPSTOP(QUDA_PROFILE_D2H); } @@ -4166,7 +4180,7 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int * profileGaugePath.TPSTOP(QUDA_PROFILE_INIT); profileGaugePath.TPSTART(QUDA_PROFILE_H2D); - cudaSiteLink->loadCPUField(*cpuSiteLink); + cudaSiteLink->copy(*cpuSiteLink); profileGaugePath.TPSTOP(QUDA_PROFILE_H2D); profileGaugePath.TPSTART(QUDA_PROFILE_INIT); @@ -4185,7 +4199,7 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int * profileGaugePath.TPSTOP(QUDA_PROFILE_INIT); if (!qudaGaugeParam->overwrite_gauge) { profileGaugePath.TPSTART(QUDA_PROFILE_H2D); - cudaOut->loadCPUField(*cpuOut); + cudaOut->copy(*cpuOut); profileGaugePath.TPSTOP(QUDA_PROFILE_H2D); } @@ -4211,7 +4225,7 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int * profileGaugePath.TPSTOP(QUDA_PROFILE_COMPUTE); profileGaugePath.TPSTART(QUDA_PROFILE_D2H); - cudaOut->saveCPUField(*cpuOut); + cpuOut->copy(*cudaOut); profileGaugePath.TPSTOP(QUDA_PROFILE_D2H); profileGaugePath.TPSTART(QUDA_PROFILE_FREE); @@ -4274,12 +4288,12 @@ void momResidentQuda(void *mom, QudaGaugeParam *param) if (param->make_resident_mom) { // we are downloading the momentum from the host profileGaugeForce.TPSTART(QUDA_PROFILE_H2D); - momResident->loadCPUField(cpuMom); + momResident->copy(cpuMom); profileGaugeForce.TPSTOP(QUDA_PROFILE_H2D); } else if (param->return_result_mom) { // we are uploading the momentum to the host 
profileGaugeForce.TPSTART(QUDA_PROFILE_D2H); - momResident->saveCPUField(cpuMom); + cpuMom.copy(*momResident); profileGaugeForce.TPSTOP(QUDA_PROFILE_D2H); profileGaugeForce.TPSTART(QUDA_PROFILE_FREE); @@ -4348,7 +4362,7 @@ void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param) auto* cudaGauge = new cudaGaugeField(gParam); if (gauge) { - cudaGauge->loadCPUField(*cpuGauge); + cudaGauge->copy(*cpuGauge); delete cpuGauge; } @@ -4363,7 +4377,7 @@ void saveGaugeFieldQuda(void *gauge, void *inGauge, QudaGaugeParam *param) gParam.geometry = cudaGauge->Geometry(); cpuGaugeField cpuGauge(gParam); - cudaGauge->saveCPUField(cpuGauge); + cpuGauge.copy(*cudaGauge); } void destroyGaugeFieldQuda(void *gauge) @@ -4424,7 +4438,7 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi cudaMom = momResident; } else { // download the initial momentum (FIXME make an option just to return?) - cudaMom->loadCPUField(cpuMom); + cudaMom->copy(cpuMom); } // resident gauge field is required @@ -4508,7 +4522,7 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi if (gauge_param->return_result_mom) { // copy the momentum field back to the host - cudaMom->saveCPUField(cpuMom); + cpuMom.copy(*cudaMom); } if (gauge_param->make_resident_mom) { @@ -4762,7 +4776,10 @@ void computeHISQForceQuda(void* const milc_momentum, cudaGaugeField *cudaWLink = new cudaGaugeField(wParam); profileHISQForce.TPSTOP(QUDA_PROFILE_INIT); - cudaWLink->loadCPUField(cpuWLink, profileHISQForce); + profileHISQForce.TPSTART(QUDA_PROFILE_H2D); + cudaWLink->copy(cpuWLink); + profileHISQForce.TPSTOP(QUDA_PROFILE_H2D); + cudaWLink->exchangeExtendedGhost(cudaWLink->R(), profileHISQForce); cudaInForce->exchangeExtendedGhost(R, profileHISQForce); @@ -4807,7 +4824,9 @@ void computeHISQForceQuda(void* const milc_momentum, cudaGaugeField *cudaVLink = new cudaGaugeField(vParam); profileHISQForce.TPSTOP(QUDA_PROFILE_INIT); - cudaVLink->loadCPUField(cpuVLink, 
profileHISQForce); + profileHISQForce.TPSTART(QUDA_PROFILE_H2D); + cudaVLink->copy(cpuVLink); + profileHISQForce.TPSTOP(QUDA_PROFILE_H2D); cudaVLink->exchangeExtendedGhost(cudaVLink->R(), profileHISQForce); profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE); @@ -4840,7 +4859,9 @@ void computeHISQForceQuda(void* const milc_momentum, cudaGaugeField *cudaULink = new cudaGaugeField(uParam); profileHISQForce.TPSTOP(QUDA_PROFILE_INIT); - cudaULink->loadCPUField(cpuULink, profileHISQForce); + profileHISQForce.TPSTART(QUDA_PROFILE_H2D); + cudaULink->copy(cpuULink); + profileHISQForce.TPSTOP(QUDA_PROFILE_H2D); cudaULink->exchangeExtendedGhost(cudaULink->R(), profileHISQForce); // Compute Fat7-staple term @@ -4870,7 +4891,11 @@ void computeHISQForceQuda(void* const milc_momentum, if (gParam->return_result_mom) { // Close the paths, make anti-hermitian, and store in compressed format - if (gParam->return_result_mom) cudaMom->saveCPUField(*cpuMom, profileHISQForce); + if (gParam->return_result_mom) { + profileHISQForce.TPSTART(QUDA_PROFILE_H2D); + cpuMom->copy(*cudaMom); + profileHISQForce.TPSTOP(QUDA_PROFILE_H2D); + } } profileHISQForce.TPSTART(QUDA_PROFILE_FREE); @@ -5049,7 +5074,7 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double // copy the outer product field back to the host profileCloverForce.TPSTART(QUDA_PROFILE_D2H); - cudaMom.saveCPUField(cpuMom); + cpuMom.copy(cudaMom); profileCloverForce.TPSTOP(QUDA_PROFILE_D2H); profileCloverForce.TPSTART(QUDA_PROFILE_FREE); @@ -5117,7 +5142,7 @@ void updateGaugeFieldQuda(void* gauge, profileGaugeUpdate.TPSTART(QUDA_PROFILE_H2D); if (!param->use_resident_gauge) { // load fields onto the device - cudaInGauge->loadCPUField(*cpuGauge); + cudaInGauge->copy(*cpuGauge); } else { // or use resident fields already present if (!gaugePrecise) errorQuda("No resident gauge field allocated"); cudaInGauge = gaugePrecise; @@ -5125,7 +5150,7 @@ void updateGaugeFieldQuda(void* gauge, } if (!param->use_resident_mom) { - 
cudaMom->loadCPUField(*cpuMom); + cudaMom->copy(*cpuMom); } else { if (!momResident) errorQuda("No resident mom field allocated"); cudaMom = momResident; @@ -5143,7 +5168,7 @@ void updateGaugeFieldQuda(void* gauge, if (param->return_result_gauge) { // copy the gauge field back to the host profileGaugeUpdate.TPSTART(QUDA_PROFILE_D2H); - cudaOutGauge->saveCPUField(*cpuGauge); + cpuGauge->copy(*cudaOutGauge); profileGaugeUpdate.TPSTOP(QUDA_PROFILE_D2H); } @@ -5198,7 +5223,7 @@ void updateGaugeFieldQuda(void* gauge, gaugePrecise = nullptr; } else { profileProject.TPSTART(QUDA_PROFILE_H2D); - cudaGauge->loadCPUField(*cpuGauge); + cudaGauge->copy(*cpuGauge); profileProject.TPSTOP(QUDA_PROFILE_H2D); } @@ -5215,9 +5240,11 @@ void updateGaugeFieldQuda(void* gauge, if(*num_failures_h>0) errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h); - profileProject.TPSTART(QUDA_PROFILE_D2H); - if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge); - profileProject.TPSTOP(QUDA_PROFILE_D2H); + if (param->return_result_gauge) { + profileProject.TPSTART(QUDA_PROFILE_D2H); + cpuGauge->copy(*cudaGauge); + profileProject.TPSTOP(QUDA_PROFILE_D2H); + } if (param->make_resident_gauge) { if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) freeUniqueGaugeQuda(QUDA_WILSON_LINKS); @@ -5258,7 +5285,7 @@ void updateGaugeFieldQuda(void* gauge, cudaGauge = gaugePrecise; } else { profilePhase.TPSTART(QUDA_PROFILE_H2D); - cudaGauge->loadCPUField(*cpuGauge); + cudaGauge->copy(*cpuGauge); profilePhase.TPSTOP(QUDA_PROFILE_H2D); } @@ -5271,9 +5298,11 @@ void updateGaugeFieldQuda(void* gauge, profilePhase.TPSTOP(QUDA_PROFILE_COMPUTE); - profilePhase.TPSTART(QUDA_PROFILE_D2H); - if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge); - profilePhase.TPSTOP(QUDA_PROFILE_D2H); + if (param->return_result_gauge) { + profilePhase.TPSTART(QUDA_PROFILE_D2H); + cpuGauge->copy(*cudaGauge); + profilePhase.TPSTOP(QUDA_PROFILE_D2H); + } if 
(param->make_resident_gauge) { if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) freeUniqueGaugeQuda(QUDA_WILSON_LINKS); @@ -5319,7 +5348,7 @@ double momActionQuda(void* momentum, QudaGaugeParam* param) profileMomAction.TPSTART(QUDA_PROFILE_H2D); if (!param->use_resident_mom) { - cudaMom->loadCPUField(*cpuMom); + cudaMom->copy(*cpuMom); } else { if (!momResident) errorQuda("No resident mom field allocated"); cudaMom = momResident; @@ -5803,7 +5832,7 @@ int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const u GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_INIT); GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_H2D); - cudaInGauge->loadCPUField(*cpuGauge); + cudaInGauge->copy(*cpuGauge); GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_H2D); @@ -5829,7 +5858,7 @@ int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const u // copy the gauge field back to the host GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_D2H); - cudaInGauge->saveCPUField(*cpuGauge); + cpuGauge->copy(*cudaInGauge); GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_D2H); GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_TOTAL); @@ -5881,9 +5910,7 @@ int computeGaugeFixingFFTQuda(void* gauge, const unsigned int gauge_dir, const GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_INIT); GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_H2D); - - cudaInGauge->loadCPUField(*cpuGauge); - + cudaInGauge->copy(*cpuGauge); GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_H2D); // perform the update @@ -5895,7 +5922,7 @@ int computeGaugeFixingFFTQuda(void* gauge, const unsigned int gauge_dir, const // copy the gauge field back to the host GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_D2H); - cudaInGauge->saveCPUField(*cpuGauge); + cpuGauge->copy(*cudaInGauge); GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_D2H); GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_TOTAL); diff --git a/lib/staggered_kd_build_xinv.cu b/lib/staggered_kd_build_xinv.cu index 34ddb23137..2ed47976f4 100644 --- a/lib/staggered_kd_build_xinv.cu +++ b/lib/staggered_kd_build_xinv.cu @@ -193,7 +193,7 @@ namespace quda { tmp_U = 
std::make_unique(gf_param); //Copy the cuda gauge field to the cpu - gauge.saveCPUField(reinterpret_cast(*tmp_U)); + tmp_U.get()->copy(gauge); } else if (location == QUDA_CUDA_FIELD_LOCATION) { diff --git a/tests/hisq_paths_force_test.cpp b/tests/hisq_paths_force_test.cpp index 116780a790..58b8299223 100644 --- a/tests/hisq_paths_force_test.cpp +++ b/tests/hisq_paths_force_test.cpp @@ -376,15 +376,15 @@ static void hisq_force_startup() * Copy to and exchange gauge and outer product fields on the device * ********************************************************************/ cpuGauge_ex->exchangeExtendedGhost(R, true); - cudaGauge_ex->loadCPUField(*cpuGauge); + cudaGauge_ex->copy(*cpuGauge); cudaGauge_ex->exchangeExtendedGhost(cudaGauge_ex->R()); cpuOprod_ex->exchangeExtendedGhost(R, true); - cudaOprod_ex->loadCPUField(*cpuOprod); + cudaOprod_ex->copy(*cpuOprod); cudaOprod_ex->exchangeExtendedGhost(cudaOprod_ex->R()); cpuLongLinkOprod_ex->exchangeExtendedGhost(R, true); - cudaLongLinkOprod_ex->loadCPUField(*cpuLongLinkOprod); + cudaLongLinkOprod_ex->copy(*cpuLongLinkOprod); cudaLongLinkOprod_ex->exchangeExtendedGhost(cudaLongLinkOprod_ex->R()); /********************** @@ -460,7 +460,7 @@ static int hisq_force_test(bool lepage) copyExtendedGauge(*cpuForce, *cpuForce_ex, QUDA_CPU_FIELD_LOCATION); copyExtendedGauge(*cudaForce, *cudaForce_ex, QUDA_CUDA_FIELD_LOCATION); - cudaForce->saveCPUField(*hostVerifyForce); + hostVerifyForce->copy(*cudaForce); int res = 1; for (int dir = 0; dir < 4; dir++) { @@ -497,7 +497,7 @@ static int hisq_force_test(bool lepage) copyExtendedGauge(*cpuForce, *cpuForce_ex, QUDA_CPU_FIELD_LOCATION); copyExtendedGauge(*cudaForce, *cudaForce_ex, QUDA_CUDA_FIELD_LOCATION); - cudaForce->saveCPUField(*hostVerifyForce); + hostVerifyForce->copy(*cudaForce); int res = 1; for (int dir = 0; dir < 4; dir++) { @@ -526,7 +526,7 @@ static int hisq_force_test(bool lepage) host_timer.stop(); host_time_sec += host_timer.last(); - cudaMom->saveCPUField(*cpuMom); + 
cpuMom->copy(*cudaMom); } int accuracy_level = 3; diff --git a/tests/hisq_unitarize_force_test.cpp b/tests/hisq_unitarize_force_test.cpp index 7a9d19255c..d27b09bfd8 100644 --- a/tests/hisq_unitarize_force_test.cpp +++ b/tests/hisq_unitarize_force_test.cpp @@ -89,8 +89,8 @@ static void hisq_force_init() gParam.order = QUDA_QDP_GAUGE_ORDER; - cudaFatLink->loadCPUField(*cpuFatLink); - cudaOprod->loadCPUField(*cpuOprod); + cudaFatLink->copy(*cpuFatLink); + cudaOprod->copy(*cpuOprod); } static void hisq_force_end() @@ -135,7 +135,7 @@ TEST(hisq_force_unitarize, verify) quda::fermion_force::unitarizeForceCPU(*cpuResult, *cpuOprod, *cpuFatLink); } - cudaResult->saveCPUField(*cpuReference); + cpuReference->copy(*cudaResult); printfQuda("Comparing CPU and GPU results\n"); int res[4]; diff --git a/tests/pack_test.cpp b/tests/pack_test.cpp index 3c5974ddc8..694c993895 100644 --- a/tests/pack_test.cpp +++ b/tests/pack_test.cpp @@ -116,12 +116,12 @@ void packTest() cudaGaugeField cudaCpsGauge(cpsParam); host_timer.start(); - cudaCpsGauge.loadCPUField(cpsCpuGauge); + cudaCpsGauge.copy(cpsCpuGauge); host_timer.stop(); printfQuda("CPS Gauge send time = %e seconds\n", host_timer.last()); host_timer.start(); - cudaCpsGauge.saveCPUField(cpsCpuGauge); + cpuCpuGauge.copy(cudaCpsGauge); host_timer.stop(); printfQuda("CPS Gauge restore time = %e seconds\n", host_timer.last()); } @@ -140,12 +140,12 @@ void packTest() cudaGaugeField cudaQdpGauge(qdpParam); host_timer.start(); - cudaQdpGauge.loadCPUField(qdpCpuGauge); + cudaQdpGauge.copy(qdpCpuGauge); host_timer.stop(); printfQuda("QDP Gauge send time = %e seconds\n", host_timer.last()); host_timer.start(); - cudaQdpGauge.saveCPUField(qdpCpuGauge); + qdpCpuGauge.copy(cudaQdpGauge); host_timer.stop(); printfQuda("QDP Gauge restore time = %e seconds\n", host_timer.last()); } diff --git a/tests/unitarize_link_test.cpp b/tests/unitarize_link_test.cpp index 1c4849ba4a..2d9dc14210 100644 --- a/tests/unitarize_link_test.cpp +++ 
b/tests/unitarize_link_test.cpp @@ -40,7 +40,7 @@ const double unittol = (prec == QUDA_DOUBLE_PRECISION) ? 1e-10 : 1e-6; TEST(unitarization, verify) { unitarizeLinksCPU(*cpuULink, *cpuFatLink); - cudaULink->saveCPUField(*cudaResult); + cudaResult->copy(*cudaULink); int res = compare_floats(cudaResult->data(), cpuULink->data(), 4 * cudaResult->Volume() * gauge_site_size, unittol, cpu_prec); @@ -151,7 +151,7 @@ static int unitarize_link_test(int &test_rc) computeKSLinkQuda(fatlink, NULL, NULL, inlink, act_path_coeff, &qudaGaugeParam); - cudaFatLink->loadCPUField(*cpuFatLink); + cudaFatLink->copy(*cpuFatLink); } quda::setUnitarizeLinksConstants(unitarize_eps, max_allowed_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, From f5e8eaced32d193fd29f58e4283e98afa7d33fd6 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 12 May 2023 18:07:17 -0700 Subject: [PATCH 07/99] Removal of cpuGaugeField and cudaGaugeField, we have now only GaugeField --- include/dirac_quda.h | 66 ++-- include/gauge_field.h | 27 +- include/gauge_field_order.h | 64 ++-- include/lattice_field.h | 3 - include/multigrid.h | 14 +- include/quda.h | 4 +- include/quda_milc_interface.h | 2 +- include/staggered_kd_build_xinv.h | 4 +- lib/CMakeLists.txt | 2 +- lib/coarse_op.in.cu | 6 +- lib/coarse_op_preconditioned.in.cu | 8 +- lib/coarsecoarse_op_mma.in.cu | 4 +- lib/cpu_gauge_field.cpp | 12 - lib/cuda_gauge_field.cpp | 12 - lib/dirac_coarse.cpp | 34 +- lib/dirac_improved_staggered_kd.cpp | 4 +- lib/dirac_staggered_kd.cpp | 2 +- lib/gauge_field.cpp | 48 +-- lib/gauge_observable.cpp | 2 +- lib/gauge_polyakov_loop.cu | 4 +- lib/interface_quda.cpp | 310 +++++++++--------- lib/lattice_field.cpp | 2 +- lib/milc_interface.cpp | 2 +- lib/multigrid.cpp | 14 +- lib/staggered_coarse_op.in.cpp | 12 +- lib/staggered_coarse_op.in.cu | 22 +- lib/staggered_kd_build_xinv.cu | 24 +- tests/covdev_test.cpp | 4 +- tests/gauge_alg_test.cpp | 8 +- tests/gauge_path_test.cpp | 14 +- tests/heatbath_test.cpp | 4 +- 
tests/hisq_paths_force_test.cpp | 72 ++-- tests/hisq_unitarize_force_test.cpp | 28 +- .../domain_wall_dslash_reference.cpp | 6 +- tests/host_reference/dslash_test_helpers.cpp | 6 +- .../host_reference/gauge_force_reference.cpp | 3 + tests/host_reference/hisq_force_reference.cpp | 10 +- tests/host_reference/hisq_force_reference.h | 10 +- .../wilson_dslash_reference.cpp | 2 +- tests/multigrid_benchmark_test.cpp | 10 +- tests/multigrid_evolve_test.cpp | 9 +- tests/pack_test.cpp | 10 +- tests/staggered_dslash_test_utils.h | 8 +- tests/unitarize_link_test.cpp | 14 +- 44 files changed, 422 insertions(+), 504 deletions(-) delete mode 100644 lib/cpu_gauge_field.cpp delete mode 100644 lib/cuda_gauge_field.cpp diff --git a/include/dirac_quda.h b/include/dirac_quda.h index dd47a6f8ba..e10437651f 100644 --- a/include/dirac_quda.h +++ b/include/dirac_quda.h @@ -51,9 +51,9 @@ namespace quda { QudaMatPCType matpcType; QudaDagType dagger; - cudaGaugeField *gauge; - cudaGaugeField *fatGauge; // used by staggered only - cudaGaugeField *longGauge; // used by staggered only + GaugeField *gauge; + GaugeField *fatGauge; // used by staggered only + GaugeField *longGauge; // used by staggered only int laplace3D; CloverField *clover; GaugeField *xInvKD; // used for the Kahler-Dirac operator only @@ -164,7 +164,7 @@ namespace quda { friend class DiracG5M; protected: - cudaGaugeField *gauge; + GaugeField *gauge; double kappa; double mass; int laplace3D; @@ -446,7 +446,7 @@ namespace quda { @return Error for non-staggered operators */ - virtual cudaGaugeField *getStaggeredShortLinkField() const + virtual GaugeField *getStaggeredShortLinkField() const { errorQuda("Invalid dirac type %d", getDiracType()); return nullptr; @@ -457,7 +457,7 @@ namespace quda { @return Error for non-improved staggered operators */ - virtual cudaGaugeField *getStaggeredLongLinkField() const + virtual GaugeField *getStaggeredLongLinkField() const { errorQuda("Invalid dirac type %d", getDiracType()); return nullptr; 
@@ -472,7 +472,7 @@ namespace quda { * @param long_gauge_in Updated long links * @param clover_in Updated clover field */ - virtual void updateFields(cudaGaugeField *gauge_in, cudaGaugeField *, cudaGaugeField *, CloverField *) + virtual void updateFields(GaugeField *gauge_in, GaugeField *, GaugeField *, CloverField *) { gauge = gauge_in; } @@ -619,7 +619,7 @@ namespace quda { * @param long_gauge_in Updated long links * @param clover_in Updated clover field */ - virtual void updateFields(cudaGaugeField *gauge_in, cudaGaugeField *, cudaGaugeField *, CloverField *clover_in) + virtual void updateFields(GaugeField *gauge_in, GaugeField *, GaugeField *, CloverField *clover_in) { DiracWilson::updateFields(gauge_in, nullptr, nullptr, nullptr); clover = clover_in; @@ -975,7 +975,7 @@ namespace quda { class DiracMobiusPC : public DiracMobius { protected: - mutable cudaGaugeField *extended_gauge; + mutable GaugeField *extended_gauge; private: public: @@ -1223,7 +1223,7 @@ namespace quda { * @param long_gauge_in Updated long links * @param clover_in Updated clover field */ - virtual void updateFields(cudaGaugeField *gauge_in, cudaGaugeField *, cudaGaugeField *, CloverField *clover_in) + virtual void updateFields(GaugeField *gauge_in, GaugeField *, GaugeField *, CloverField *clover_in) { DiracWilson::updateFields(gauge_in, nullptr, nullptr, nullptr); clover = clover_in; @@ -1361,7 +1361,7 @@ namespace quda { @return Gauge field */ - virtual cudaGaugeField *getStaggeredShortLinkField() const { return gauge; } + virtual GaugeField *getStaggeredShortLinkField() const { return gauge; } /** * @brief Create the coarse staggered operator. 
@@ -1496,7 +1496,7 @@ namespace quda { * @param long_gauge_in Updated long links * @param clover_in Updated clover field */ - virtual void updateFields(cudaGaugeField *gauge_in, cudaGaugeField *fat_gauge_in, cudaGaugeField *long_gauge_in, + virtual void updateFields(GaugeField *gauge_in, GaugeField *fat_gauge_in, GaugeField *long_gauge_in, CloverField *clover_in); /** @@ -1537,8 +1537,8 @@ namespace quda { class DiracImprovedStaggered : public Dirac { protected: - cudaGaugeField *fatGauge; - cudaGaugeField *longGauge; + GaugeField *fatGauge; + GaugeField *longGauge; public: DiracImprovedStaggered(const DiracParam ¶m); @@ -1565,14 +1565,14 @@ namespace quda { @return fat link field */ - virtual cudaGaugeField *getStaggeredShortLinkField() const { return fatGauge; } + virtual GaugeField *getStaggeredShortLinkField() const { return fatGauge; } /** @brief return the long link field for staggered operators for MG setup @return long link field */ - virtual cudaGaugeField *getStaggeredLongLinkField() const { return longGauge; } + virtual GaugeField *getStaggeredLongLinkField() const { return longGauge; } /** * @brief Update the internal gauge, fat gauge, long gauge, clover field pointer as appropriate. 
@@ -1583,7 +1583,7 @@ namespace quda { * @param long_gauge_in Updated long links * @param clover_in Updated clover field */ - virtual void updateFields(cudaGaugeField *, cudaGaugeField *fat_gauge_in, cudaGaugeField *long_gauge_in, CloverField *) + virtual void updateFields(GaugeField *, GaugeField *fat_gauge_in, GaugeField *long_gauge_in, CloverField *) { Dirac::updateFields(fat_gauge_in, nullptr, nullptr, nullptr); fatGauge = fat_gauge_in; @@ -1732,7 +1732,7 @@ namespace quda { * @param long_gauge_in Updated long links * @param clover_in Updated clover field */ - virtual void updateFields(cudaGaugeField *gauge_in, cudaGaugeField *fat_gauge_in, cudaGaugeField *long_gauge_in, + virtual void updateFields(GaugeField *gauge_in, GaugeField *fat_gauge_in, GaugeField *long_gauge_in, CloverField *clover_in); /** @@ -1785,15 +1785,15 @@ namespace quda { const bool allow_truncation; /** Whether or not we let coarsening drop improvements, for ex dropping long links for small aggregate sizes */ const bool use_mma; /** Whether to use tensor cores or not */ - mutable cpuGaugeField *Y_h; /** CPU copy of the coarse link field */ - mutable cpuGaugeField *X_h; /** CPU copy of the coarse clover term */ - mutable cpuGaugeField *Xinv_h; /** CPU copy of the inverse coarse clover term */ - mutable cpuGaugeField *Yhat_h; /** CPU copy of the preconditioned coarse link field */ + mutable GaugeField *Y_h; /** CPU copy of the coarse link field */ + mutable GaugeField *X_h; /** CPU copy of the coarse clover term */ + mutable GaugeField *Xinv_h; /** CPU copy of the inverse coarse clover term */ + mutable GaugeField *Yhat_h; /** CPU copy of the preconditioned coarse link field */ - mutable cudaGaugeField *Y_d; /** GPU copy of the coarse link field */ - mutable cudaGaugeField *X_d; /** GPU copy of the coarse clover term */ - mutable cudaGaugeField *Xinv_d; /** GPU copy of inverse coarse clover term */ - mutable cudaGaugeField *Yhat_d; /** GPU copy of the preconditioned coarse link field */ + 
mutable GaugeField *Y_d; /** GPU copy of the coarse link field */ + mutable GaugeField *X_d; /** GPU copy of the coarse clover term */ + mutable GaugeField *Xinv_d; /** GPU copy of inverse coarse clover term */ + mutable GaugeField *Yhat_d; /** GPU copy of the preconditioned coarse link field */ /** @brief Initialize the coarse gauge fields. Location is @@ -1852,9 +1852,9 @@ namespace quda { @param[in] Xinv_d GPU coarse inverse clover field @param[in] Yhat_d GPU coarse preconditioned link field */ - DiracCoarse(const DiracParam ¶m, cpuGaugeField *Y_h, cpuGaugeField *X_h, cpuGaugeField *Xinv_h, - cpuGaugeField *Yhat_h, cudaGaugeField *Y_d = nullptr, cudaGaugeField *X_d = nullptr, - cudaGaugeField *Xinv_d = nullptr, cudaGaugeField *Yhat_d = nullptr); + DiracCoarse(const DiracParam ¶m, GaugeField *Y_h, GaugeField *X_h, GaugeField *Xinv_h, + GaugeField *Yhat_h, GaugeField *Y_d = nullptr, GaugeField *X_d = nullptr, + GaugeField *Xinv_d = nullptr, GaugeField *Yhat_d = nullptr); /** @param[in] dirac Another operator instance to clone from (shallow copy) @@ -1944,7 +1944,7 @@ namespace quda { virtual QudaDiracType getDiracType() const { return QUDA_COARSE_DIRAC; } - virtual void updateFields(cudaGaugeField *gauge_in, cudaGaugeField *, cudaGaugeField *, CloverField *) + virtual void updateFields(GaugeField *gauge_in, GaugeField *, GaugeField *, CloverField *) { Dirac::updateFields(gauge_in, nullptr, nullptr, nullptr); warningQuda("Coarse gauge links cannot be trivially updated for DiracCoarse(PC). 
Perform an MG update instead."); @@ -2008,9 +2008,9 @@ namespace quda { @param[in] Xinv_d GPU coarse inverse clover field @param[in] Yhat_d GPU coarse preconditioned link field */ - DiracCoarsePC(const DiracParam ¶m, cpuGaugeField *Y_h, cpuGaugeField *X_h, cpuGaugeField *Xinv_h, - cpuGaugeField *Yhat_h, cudaGaugeField *Y_d = nullptr, cudaGaugeField *X_d = nullptr, - cudaGaugeField *Xinv_d = nullptr, cudaGaugeField *Yhat_d = nullptr); + DiracCoarsePC(const DiracParam ¶m, GaugeField *Y_h, GaugeField *X_h, GaugeField *Xinv_h, + GaugeField *Yhat_h, GaugeField *Y_d = nullptr, GaugeField *X_d = nullptr, + GaugeField *Xinv_d = nullptr, GaugeField *Yhat_d = nullptr); /** @param[in] dirac Another operator instance to clone from (shallow copy) diff --git a/include/gauge_field.h b/include/gauge_field.h index 7648ba7f9b..23fb8939e3 100644 --- a/include/gauge_field.h +++ b/include/gauge_field.h @@ -575,27 +575,6 @@ namespace quda { void copy_from_buffer(void *buffer); }; - class cudaGaugeField : public GaugeField - { - - public: - cudaGaugeField(const GaugeFieldParam &); - - }; - - class cpuGaugeField : public GaugeField { - - public: - /** - @brief Constructor for cpuGaugeField from a GaugeFieldParam - @param[in,out] param Parameter struct - note that in the case - that we are wrapping host-side extended fields, this param is - modified for subsequent creation of fields that are not - extended. - */ - cpuGaugeField(const GaugeFieldParam ¶m); - }; - /** @brief This is a debugging function, where we cast a gauge field into a spinor field so we can compute its L1 norm. 
@@ -666,8 +645,8 @@ namespace quda { @param recon The reconsturction type @return the pointer to the extended gauge field */ - cudaGaugeField *createExtendedGauge(cudaGaugeField &in, const lat_dim_t &R, TimeProfile &profile, - bool redundant_comms = false, QudaReconstructType recon = QUDA_RECONSTRUCT_INVALID); + GaugeField *createExtendedGauge(GaugeField &in, const lat_dim_t &R, TimeProfile &profile, + bool redundant_comms = false, QudaReconstructType recon = QUDA_RECONSTRUCT_INVALID); /** This function is used for creating an exteneded (cpu) gauge field from the input, @@ -676,7 +655,7 @@ namespace quda { @param R By how many do we want to extend the gauge field in each direction @return the pointer to the extended gauge field */ - cpuGaugeField *createExtendedGauge(void **gauge, QudaGaugeParam &gauge_param, const lat_dim_t &R); + GaugeField *createExtendedGauge(void **gauge, QudaGaugeParam &gauge_param, const lat_dim_t &R); /** This function is used for extracting the gauge ghost zone from a diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index 451c8312c6..82ae78b29d 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -430,17 +430,17 @@ namespace quda { GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr) { - if (U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD) { - for (int d=0; d<4; d++) { - ghost[d] = ghost_ ? static_cast*>(ghost_[d]) : - static_cast*>(const_cast(U.Ghost()[d].data())); - ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); - - ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr : - ghost_ ? static_cast*>(ghost_[d+4]) : - static_cast*>(const_cast(U.Ghost()[d+4].data())); - ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); - } + for (int d=0; d<4; d++) { + ghost[d] = ghost_ ? static_cast*>(ghost_[d]) : + U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ? 
+ static_cast*>(const_cast(U.Ghost()[d].data())) : nullptr; + ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); + + ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr : + ghost_ ? static_cast*>(ghost_[d+4]) : + U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ? + static_cast*>(const_cast(U.Ghost()[d+4].data())) : nullptr; + ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); } resetScale(U.Scale()); @@ -552,17 +552,16 @@ namespace quda { GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr) { - if (U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD) { - for (int d=0; d<4; d++) { - ghost[d] = ghost_ ? static_cast*>(ghost_[d]) : - static_cast*>(const_cast(U.Ghost()[d].data())); - ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); + for (int d=0; d<4; d++) { + ghost[d] = ghost_ ? static_cast*>(ghost_[d]) : + U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ? static_cast*>(const_cast(U.Ghost()[d].data())) : nullptr; + ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); - ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr : - ghost_ ? static_cast*>(ghost_[d+4]) : - static_cast*>(const_cast(U.Ghost()[d+4].data())); - ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); - } + ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr : + ghost_ ? static_cast*>(ghost_[d+4]) : + U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ? + static_cast*>(const_cast(U.Ghost()[d+4].data())) : nullptr; + ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); } resetScale(U.Scale()); @@ -689,13 +688,11 @@ namespace quda { accessor(U, gauge_, ghost_) { if constexpr (!native_ghost) assert(ghost_ != nullptr); - if (U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD) { - for (int d = 0; d < 4; d++) { - ghost[d] = !native_ghost ? static_cast*>(ghost_[d]) : nullptr; - ghostVolumeCB[d] = U.Nface()*U.SurfaceCB(d); - ghost[d+4] = !native_ghost && U.Geometry() == QUDA_COARSE_GEOMETRY? 
static_cast*>(ghost_[d+4]) : nullptr; - ghostVolumeCB[d+4] = U.Nface()*U.SurfaceCB(d); - } + for (int d = 0; d < 4; d++) { + ghost[d] = !native_ghost ? static_cast*>(ghost_[d]) : nullptr; + ghostVolumeCB[d] = U.Nface()*U.SurfaceCB(d); + ghost[d+4] = !native_ghost && U.Geometry() == QUDA_COARSE_GEOMETRY? static_cast*>(ghost_[d+4]) : nullptr; + ghostVolumeCB[d+4] = U.Nface()*U.SurfaceCB(d); } resetScale(U.Scale()); } @@ -1752,7 +1749,7 @@ namespace quda { /** @brief The LegacyOrder defines the ghost zone storage and ordering for - all cpuGaugeFields, which use the same ghost zone storage. + all non-native fields, which use the same ghost zone storage. */ template struct LegacyOrder { static constexpr int length = length_; @@ -1776,11 +1773,10 @@ namespace quda { if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone"); - if (u.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD) { - for (int i = 0; i < 4; i++) { - ghost[i] = (ghost_) ? ghost_[i] : (Float *)(u.Ghost()[i].data()); - faceVolumeCB[i] = u.SurfaceCB(i) * u.Nface(); // face volume equals surface * depth - } + for (int i = 0; i < 4; i++) { + ghost[i] = (ghost_) ? ghost_[i] : + u.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ? 
(Float *)(u.Ghost()[i].data()) : nullptr; + faceVolumeCB[i] = u.SurfaceCB(i) * u.Nface(); // face volume equals surface * depth } } diff --git a/include/lattice_field.h b/include/lattice_field.h index 887fc248e7..38653350cc 100644 --- a/include/lattice_field.h +++ b/include/lattice_field.h @@ -34,9 +34,6 @@ namespace quda { class cudaEigVecSet; class GaugeField; - class cpuGaugeField; - class cudaGaugeField; - class CloverField; enum class QudaOffsetCopyMode { COLLECT, DISPERSE }; diff --git a/include/multigrid.h b/include/multigrid.h index 32273032e8..5204f32b25 100644 --- a/include/multigrid.h +++ b/include/multigrid.h @@ -382,9 +382,9 @@ namespace quda { @brief This method only resets the KD operators with the updated fine links and rebuilds the KD inverse */ - void resetStaggeredKD(cudaGaugeField *gauge_in, cudaGaugeField *fat_gauge_in, cudaGaugeField *long_gauge_in, - cudaGaugeField *gauge_sloppy_in, cudaGaugeField *fat_gauge_sloppy_in, - cudaGaugeField *long_gauge_sloppy_in, double mass); + void resetStaggeredKD(GaugeField *gauge_in, GaugeField *fat_gauge_in, GaugeField *long_gauge_in, + GaugeField *gauge_sloppy_in, GaugeField *fat_gauge_sloppy_in, + GaugeField *long_gauge_sloppy_in, double mass); /** @brief Dump the null-space vectors to disk. Will recurse dumping all levels. @@ -595,13 +595,13 @@ namespace quda { operator we are constructing the coarse grid operator from. For staggered, should always be QUDA_MATPC_INVALID. 
*/ - void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const cudaGaugeField &gauge, - const cudaGaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation, + void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &gauge, + const GaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation, QudaDiracType dirac, QudaMatPCType matpc); template - void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const cudaGaugeField &gauge, - const cudaGaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation, + void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &gauge, + const GaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation, QudaDiracType dirac, QudaMatPCType matpc); /** diff --git a/include/quda.h b/include/quda.h index 31ed24bd01..b697ef7400 100644 --- a/include/quda.h +++ b/include/quda.h @@ -62,7 +62,7 @@ extern "C" { QudaGaugeFixed gauge_fix; /**< Whether the input gauge field is in the axial gauge or not */ - int ga_pad; /**< The pad size that the cudaGaugeField will use (default=0) */ + int ga_pad; /**< The pad size that native GaugeFields will use (default=0) */ int site_ga_pad; /**< Used by link fattening and the gauge and fermion forces */ @@ -1488,7 +1488,7 @@ extern "C" { void saveGaugeFieldQuda(void* outGauge, void* inGauge, QudaGaugeParam* param); /** - * Reinterpret gauge as a pointer to cudaGaugeField and call destructor. + * Reinterpret gauge as a pointer to a GaugeField and call destructor. * * @param gauge Gauge field to be freed */ diff --git a/include/quda_milc_interface.h b/include/quda_milc_interface.h index 23275eedb2..88904d481d 100644 --- a/include/quda_milc_interface.h +++ b/include/quda_milc_interface.h @@ -1014,7 +1014,7 @@ extern "C" { void* inGauge); /** - * Reinterpret gauge as a pointer to cudaGaugeField and call destructor. 
+ * Reinterpret gauge as a pointer to a GaugeField and call destructor. * * @param gauge Gauge field to be freed */ diff --git a/include/staggered_kd_build_xinv.h b/include/staggered_kd_build_xinv.h index fdf57eccf8..2bd1b4f600 100644 --- a/include/staggered_kd_build_xinv.h +++ b/include/staggered_kd_build_xinv.h @@ -14,7 +14,7 @@ namespace quda @param mass [in] Mass of the original staggered operator w/out factor of 2 convention @param dagger_approximation[in] Whether or not to use the dagger approximation, using the dagger of X instead of Xinv */ - void BuildStaggeredKahlerDiracInverse(GaugeField &Xinv, const cudaGaugeField &gauge, const double mass, + void BuildStaggeredKahlerDiracInverse(GaugeField &Xinv, const GaugeField &gauge, const double mass, const bool dagger_approximation); /** @@ -34,7 +34,7 @@ namespace quda @param dagger_approximation[in] Whether or not to use the dagger approximation, using the dagger of X instead of Xinv @return constructed Xinv */ - std::shared_ptr AllocateAndBuildStaggeredKahlerDiracInverse(const cudaGaugeField &gauge, const double mass, + std::shared_ptr AllocateAndBuildStaggeredKahlerDiracInverse(const GaugeField &gauge, const double mass, const bool dagger_approximation); } // namespace quda diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index dd354ca735..37a83e001c 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -36,7 +36,7 @@ set (QUDA_OBJS field_cache.cpp gauge_covdev.cpp dirac.cpp clover_field.cpp lattice_field.cpp gauge_field.cpp - cpu_gauge_field.cpp cuda_gauge_field.cpp extract_gauge_ghost.cu + extract_gauge_ghost.cu gauge_norm.cu gauge_update_quda.cu max_clover.cu dirac_clover.cpp dirac_wilson.cpp dirac_staggered.cpp dirac_staggered_kd.cpp dirac_clover_hasenbusch_twist.cpp diff --git a/lib/coarse_op.in.cu b/lib/coarse_op.in.cu index 320c14bf12..0684e0e97a 100644 --- a/lib/coarse_op.in.cu +++ b/lib/coarse_op.in.cu @@ -173,17 +173,17 @@ namespace quda { gf_param.nFace = 1; gf_param.ghostExchange = 
QUDA_GHOST_EXCHANGE_PAD; - U = new cpuGaugeField(gf_param); + U = new GaugeField(gf_param); //Copy the cuda gauge field to the cpu - static_cast(gauge).saveCPUField(*static_cast(U)); + U->copy(gauge); } else if (location == QUDA_CUDA_FIELD_LOCATION && gauge.Reconstruct() != QUDA_RECONSTRUCT_NO) { //Create a copy of the gauge field with no reconstruction, required for fine-grained access GaugeFieldParam gf_param(gauge); gf_param.reconstruct = QUDA_RECONSTRUCT_NO; gf_param.order = QUDA_FLOAT2_GAUGE_ORDER; gf_param.setPrecision(gf_param.Precision()); - U = new cudaGaugeField(gf_param); + U = new GaugeField(gf_param); U->copy(gauge); } diff --git a/lib/coarse_op_preconditioned.in.cu b/lib/coarse_op_preconditioned.in.cu index b80e018a8e..ae41a3cde7 100644 --- a/lib/coarse_op_preconditioned.in.cu +++ b/lib/coarse_op_preconditioned.in.cu @@ -160,7 +160,7 @@ namespace quda GaugeFieldParam param(X); param.order = gOrder_milc; param.setPrecision(X.Precision() < QUDA_SINGLE_PRECISION ? QUDA_SINGLE_PRECISION : X.Precision()); - output = cudaGaugeField::Create(param); + output = new GaugeField(param); if (copy_content) output->copy(X); } return output; @@ -180,9 +180,7 @@ namespace quda if (!use_mma) { delete Xinv_aos; } } else if (X.Location() == QUDA_CPU_FIELD_LOCATION && X.Order() == QUDA_QDP_GAUGE_ORDER) { - const cpuGaugeField *X_h = static_cast(&X); - cpuGaugeField *Xinv_h = static_cast(&Xinv); - blas::flops += invert(Xinv_h->data(0), X_h->data(0), n, X_h->Volume(), X.Precision(), X.Location()); + blas::flops += invert(Xinv.data(0), X.data(0), n, X.Volume(), X.Precision(), X.Location()); } else { errorQuda("Unsupported location=%d and order=%d", X.Location(), X.Order()); } @@ -206,7 +204,7 @@ namespace quda param.order = order; // if we did the exchange on AoS order, then this zero initialize wouldn't be needed if (!copy_content) param.create = QUDA_ZERO_FIELD_CREATE; - output = cudaGaugeField::Create(param); + output = new GaugeField(param); if (copy_content) 
output->copy(X); } return output; diff --git a/lib/coarsecoarse_op_mma.in.cu b/lib/coarsecoarse_op_mma.in.cu index ee18191dbb..8ccd052a1c 100644 --- a/lib/coarsecoarse_op_mma.in.cu +++ b/lib/coarsecoarse_op_mma.in.cu @@ -40,10 +40,10 @@ namespace quda { } else { GaugeFieldParam param(X); param.order = order; - output = cudaGaugeField::Create(param); + output = new GaugeField(param); if (copy_content) output->copy(X); } - return static_cast(output); + return static_cast(output); }; auto Y_order = create_gauge_copy(Y, gOrder, false); diff --git a/lib/cpu_gauge_field.cpp b/lib/cpu_gauge_field.cpp deleted file mode 100644 index 8927fdb2d3..0000000000 --- a/lib/cpu_gauge_field.cpp +++ /dev/null @@ -1,12 +0,0 @@ -#include -#include -#include -#include -#include -#include - -namespace quda { - - cpuGaugeField::cpuGaugeField(const GaugeFieldParam ¶m) : GaugeField(param) {} - -} // namespace quda diff --git a/lib/cuda_gauge_field.cpp b/lib/cuda_gauge_field.cpp deleted file mode 100644 index e4d56bdfce..0000000000 --- a/lib/cuda_gauge_field.cpp +++ /dev/null @@ -1,12 +0,0 @@ -#include -#include -#include -#include -#include -#include - -namespace quda { - - cudaGaugeField::cudaGaugeField(const GaugeFieldParam ¶m) : GaugeField(param) {} - -} // namespace quda diff --git a/lib/dirac_coarse.cpp b/lib/dirac_coarse.cpp index 697f86bf9e..053f71b8f4 100644 --- a/lib/dirac_coarse.cpp +++ b/lib/dirac_coarse.cpp @@ -33,10 +33,10 @@ namespace quda { initializeCoarse(); } - DiracCoarse::DiracCoarse(const DiracParam ¶m, cpuGaugeField *Y_h, cpuGaugeField *X_h, cpuGaugeField *Xinv_h, - cpuGaugeField *Yhat_h, // cpu link fields - cudaGaugeField *Y_d, cudaGaugeField *X_d, cudaGaugeField *Xinv_d, - cudaGaugeField *Yhat_d) // gpu link field + DiracCoarse::DiracCoarse(const DiracParam ¶m, GaugeField *Y_h, GaugeField *X_h, GaugeField *Xinv_h, + GaugeField *Yhat_h, // cpu link fields + GaugeField *Y_d, GaugeField *X_d, GaugeField *Xinv_d, + GaugeField *Yhat_d) // gpu link field : Dirac(param), 
mass(param.mass), @@ -138,16 +138,16 @@ namespace quda { int pad = std::max( { (x[0]*x[1]*x[2])/2, (x[1]*x[2]*x[3])/2, (x[0]*x[2]*x[3])/2, (x[0]*x[1]*x[3])/2 } ); gParam.pad = gpu ? gParam.nFace * pad * 2 : 0; // factor of 2 since we have to store bi-directional ghost zone - if (gpu) Y_d = new cudaGaugeField(gParam); - else Y_h = new cpuGaugeField(gParam); + if (gpu) Y_d = new GaugeField(gParam); + else Y_h = new GaugeField(gParam); gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO; gParam.nFace = 0; gParam.geometry = QUDA_SCALAR_GEOMETRY; gParam.pad = 0; - if (gpu) X_d = new cudaGaugeField(gParam); - else X_h = new cpuGaugeField(gParam); + if (gpu) X_d = new GaugeField(gParam); + else X_h = new GaugeField(gParam); } void DiracCoarse::createYhat(bool gpu) const @@ -180,8 +180,8 @@ namespace quda { int pad = std::max( { (x[0]*x[1]*x[2])/2, (x[1]*x[2]*x[3])/2, (x[0]*x[2]*x[3])/2, (x[0]*x[1]*x[3])/2 } ); gParam.pad = gpu ? gParam.nFace * pad * 2 : 0; // factor of 2 since we have to store bi-directional ghost zone - if (gpu) Yhat_d = new cudaGaugeField(gParam); - else Yhat_h = new cpuGaugeField(gParam); + if (gpu) Yhat_d = new GaugeField(gParam); + else Yhat_h = new GaugeField(gParam); gParam.setPrecision(gpu ? 
X_d->Precision() : X_h->Precision()); gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO; @@ -189,8 +189,8 @@ namespace quda { gParam.geometry = QUDA_SCALAR_GEOMETRY; gParam.pad = 0; - if (gpu) Xinv_d = new cudaGaugeField(gParam); - else Xinv_h = new cpuGaugeField(gParam); + if (gpu) Xinv_d = new GaugeField(gParam); + else Xinv_h = new GaugeField(gParam); } void DiracCoarse::initializeCoarse() @@ -224,8 +224,8 @@ namespace quda { Y_param.order = gOrder; X_param.order = gOrder; - GaugeField *Y_order = cudaGaugeField::Create(Y_param); - GaugeField *X_order = cudaGaugeField::Create(X_param); + GaugeField *Y_order = GaugeField::Create(Y_param); + GaugeField *X_order = GaugeField::Create(X_param); dirac->createCoarseOp(*Y_order, *X_order, *transfer, kappa, mass, Mu(), MuFactor(), AllowTruncation()); @@ -438,9 +438,9 @@ namespace quda { /* do nothing */ } - DiracCoarsePC::DiracCoarsePC(const DiracParam ¶m, cpuGaugeField *Y_h, cpuGaugeField *X_h, cpuGaugeField *Xinv_h, - cpuGaugeField *Yhat_h, cudaGaugeField *Y_d, cudaGaugeField *X_d, cudaGaugeField *Xinv_d, - cudaGaugeField *Yhat_d) : + DiracCoarsePC::DiracCoarsePC(const DiracParam ¶m, GaugeField *Y_h, GaugeField *X_h, GaugeField *Xinv_h, + GaugeField *Yhat_h, GaugeField *Y_d, GaugeField *X_d, GaugeField *Xinv_d, + GaugeField *Yhat_d) : DiracCoarse(param, Y_h, X_h, Xinv_h, Yhat_h, Y_d, X_d, Xinv_d, Yhat_d) { } diff --git a/lib/dirac_improved_staggered_kd.cpp b/lib/dirac_improved_staggered_kd.cpp index fdba112b7f..39e6080cd6 100644 --- a/lib/dirac_improved_staggered_kd.cpp +++ b/lib/dirac_improved_staggered_kd.cpp @@ -154,8 +154,8 @@ namespace quda // Should we support "preparing" and "reconstructing"? 
} - void DiracImprovedStaggeredKD::updateFields(cudaGaugeField *, cudaGaugeField *fat_gauge_in, - cudaGaugeField *long_gauge_in, CloverField *) + void DiracImprovedStaggeredKD::updateFields(GaugeField *, GaugeField *fat_gauge_in, + GaugeField *long_gauge_in, CloverField *) { Dirac::updateFields(fat_gauge_in, nullptr, nullptr, nullptr); fatGauge = fat_gauge_in; diff --git a/lib/dirac_staggered_kd.cpp b/lib/dirac_staggered_kd.cpp index 9271c8afc3..db339402da 100644 --- a/lib/dirac_staggered_kd.cpp +++ b/lib/dirac_staggered_kd.cpp @@ -150,7 +150,7 @@ namespace quda // Should we support "preparing" and "reconstructing"? } - void DiracStaggeredKD::updateFields(cudaGaugeField *gauge_in, cudaGaugeField *, cudaGaugeField *, CloverField *) + void DiracStaggeredKD::updateFields(GaugeField *gauge_in, GaugeField *, GaugeField *, CloverField *) { Dirac::updateFields(gauge_in, nullptr, nullptr, nullptr); } diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp index 40cb1bf9b6..61ea7ab505 100644 --- a/lib/gauge_field.cpp +++ b/lib/gauge_field.cpp @@ -256,7 +256,7 @@ namespace quda { void GaugeField::createGhostZone(const lat_dim_t &R, bool no_comms_fill, bool bidir) const { - if (typeid(*this) == typeid(cpuGaugeField)) return; + if (location == QUDA_CPU_FIELD_LOCATION) return; // if this is not a bidirectional exchange then we are doing a // scalar exchange, e.g., only the link matrix in the direcion we @@ -288,26 +288,14 @@ namespace quda { if (phase != QUDA_STAGGERED_PHASE_INVALID) staggeredPhaseType = phase; applyGaugePhase(*this); - if (ghostExchange==QUDA_GHOST_EXCHANGE_PAD) { - if (typeid(*this)==typeid(cudaGaugeField)) { - static_cast(*this).exchangeGhost(); - } else { - static_cast(*this).exchangeGhost(); - } - } + if (ghostExchange==QUDA_GHOST_EXCHANGE_PAD) exchangeGhost(); staggeredPhaseApplied = true; } void GaugeField::removeStaggeredPhase() { if (!staggeredPhaseApplied) errorQuda("No staggered phases to remove"); applyGaugePhase(*this); - if 
(ghostExchange==QUDA_GHOST_EXCHANGE_PAD) { - if (typeid(*this)==typeid(cudaGaugeField)) { - static_cast(*this).exchangeGhost(); - } else { - static_cast(*this).exchangeGhost(); - } - } + if (ghostExchange==QUDA_GHOST_EXCHANGE_PAD) exchangeGhost(); staggeredPhaseApplied = false; } @@ -1144,27 +1132,15 @@ namespace quda { return Checksum(*this, mini); } - GaugeField* GaugeField::Create(const GaugeFieldParam ¶m) { - - GaugeField *field = nullptr; - if (param.location == QUDA_CPU_FIELD_LOCATION) { - field = new cpuGaugeField(param); - } else if (param.location== QUDA_CUDA_FIELD_LOCATION) { - field = new cudaGaugeField(param); - } else { - errorQuda("Invalid field location %d", param.location); - } - - return field; - } + GaugeField* GaugeField::Create(const GaugeFieldParam ¶m) { return new GaugeField(param); } // helper for creating extended gauge fields - cudaGaugeField *createExtendedGauge(cudaGaugeField &in, const lat_dim_t &R, TimeProfile &profile, - bool redundant_comms, QudaReconstructType recon) + GaugeField *createExtendedGauge(GaugeField &in, const lat_dim_t &R, TimeProfile &profile, + bool redundant_comms, QudaReconstructType recon) { profile.TPSTART(QUDA_PROFILE_INIT); GaugeFieldParam gParamEx(in); - gParamEx.location = QUDA_CUDA_FIELD_LOCATION; + //gParamEx.location = QUDA_CUDA_FIELD_LOCATION; gParamEx.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED; gParamEx.pad = 0; gParamEx.nFace = 1; @@ -1177,10 +1153,10 @@ namespace quda { if (recon != QUDA_RECONSTRUCT_INVALID) gParamEx.reconstruct = recon; gParamEx.setPrecision(gParamEx.Precision(), true); - auto *out = new cudaGaugeField(gParamEx); + auto *out = new GaugeField(gParamEx); // copy input field into the extended device gauge field - copyExtendedGauge(*out, in, QUDA_CUDA_FIELD_LOCATION); + copyExtendedGauge(*out, in, QUDA_CUDA_FIELD_LOCATION); // wrong location if both fields cpu profile.TPSTOP(QUDA_PROFILE_INIT); @@ -1191,10 +1167,10 @@ namespace quda { } // helper for creating extended (cpu) gauge fields - 
cpuGaugeField *createExtendedGauge(void **gauge, QudaGaugeParam &gauge_param, const lat_dim_t &R) + GaugeField *createExtendedGauge(void **gauge, QudaGaugeParam &gauge_param, const lat_dim_t &R) { GaugeFieldParam gauge_field_param(gauge_param, gauge); - cpuGaugeField cpu(gauge_field_param); + GaugeField cpu(gauge_field_param); gauge_field_param.location = QUDA_CPU_FIELD_LOCATION; gauge_field_param.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED; @@ -1203,7 +1179,7 @@ namespace quda { gauge_field_param.x[d] += 2 * R[d]; gauge_field_param.r[d] = R[d]; } - cpuGaugeField *padded_cpu = new cpuGaugeField(gauge_field_param); + GaugeField *padded_cpu = new GaugeField(gauge_field_param); copyExtendedGauge(*padded_cpu, cpu, QUDA_CPU_FIELD_LOCATION); padded_cpu->exchangeExtendedGhost(R, true); // Do comm to fill halo = true diff --git a/lib/gauge_observable.cpp b/lib/gauge_observable.cpp index 42d07e19cc..b825a2ad81 100644 --- a/lib/gauge_observable.cpp +++ b/lib/gauge_observable.cpp @@ -66,7 +66,7 @@ namespace quda tensorParam.siteSubset = QUDA_FULL_SITE_SUBSET; tensorParam.order = QUDA_FLOAT2_GAUGE_ORDER; tensorParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO; - cudaGaugeField gaugeFmunu(tensorParam); + GaugeField gaugeFmunu(tensorParam); profile.TPSTOP(QUDA_PROFILE_INIT); profile.TPSTART(QUDA_PROFILE_COMPUTE); diff --git a/lib/gauge_polyakov_loop.cu b/lib/gauge_polyakov_loop.cu index a61027dc81..99ae5ea149 100644 --- a/lib/gauge_polyakov_loop.cu +++ b/lib/gauge_polyakov_loop.cu @@ -164,14 +164,14 @@ namespace quda { // as a function of the number of ranks in the `t` dimension gParam.setPrecision(QUDA_DOUBLE_PRECISION); - std::unique_ptr product_field = std::make_unique(gParam); + std::unique_ptr product_field = std::make_unique(gParam); GaugeField& product_field_ref = reinterpret_cast(*product_field.get()); // Create the field we reduce into x[3] = comm_dim(3); gParam.x = x; gParam.create = QUDA_NULL_FIELD_CREATE; - condensed_field = std::make_unique(gParam); + condensed_field = 
std::make_unique(gParam); GaugeField& condensed_field_ref = reinterpret_cast(*condensed_field.get()); profile.TPSTOP(QUDA_PROFILE_INIT); diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 27930a3b9a..5ed54e37f2 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -72,28 +72,28 @@ static bool redundant_comms = false; #include -cudaGaugeField *gaugePrecise = nullptr; -cudaGaugeField *gaugeSloppy = nullptr; -cudaGaugeField *gaugePrecondition = nullptr; -cudaGaugeField *gaugeRefinement = nullptr; -cudaGaugeField *gaugeEigensolver = nullptr; -cudaGaugeField *gaugeExtended = nullptr; - -cudaGaugeField *gaugeFatPrecise = nullptr; -cudaGaugeField *gaugeFatSloppy = nullptr; -cudaGaugeField *gaugeFatPrecondition = nullptr; -cudaGaugeField *gaugeFatRefinement = nullptr; -cudaGaugeField *gaugeFatEigensolver = nullptr; -cudaGaugeField *gaugeFatExtended = nullptr; - -cudaGaugeField *gaugeLongPrecise = nullptr; -cudaGaugeField *gaugeLongSloppy = nullptr; -cudaGaugeField *gaugeLongPrecondition = nullptr; -cudaGaugeField *gaugeLongRefinement = nullptr; -cudaGaugeField *gaugeLongEigensolver = nullptr; -cudaGaugeField *gaugeLongExtended = nullptr; - -cudaGaugeField *gaugeSmeared = nullptr; +GaugeField *gaugePrecise = nullptr; +GaugeField *gaugeSloppy = nullptr; +GaugeField *gaugePrecondition = nullptr; +GaugeField *gaugeRefinement = nullptr; +GaugeField *gaugeEigensolver = nullptr; +GaugeField *gaugeExtended = nullptr; + +GaugeField *gaugeFatPrecise = nullptr; +GaugeField *gaugeFatSloppy = nullptr; +GaugeField *gaugeFatPrecondition = nullptr; +GaugeField *gaugeFatRefinement = nullptr; +GaugeField *gaugeFatEigensolver = nullptr; +GaugeField *gaugeFatExtended = nullptr; + +GaugeField *gaugeLongPrecise = nullptr; +GaugeField *gaugeLongSloppy = nullptr; +GaugeField *gaugeLongPrecondition = nullptr; +GaugeField *gaugeLongRefinement = nullptr; +GaugeField *gaugeLongEigensolver = nullptr; +GaugeField *gaugeLongExtended = nullptr; + +GaugeField *gaugeSmeared = 
nullptr; CloverField *cloverPrecise = nullptr; CloverField *cloverSloppy = nullptr; @@ -101,8 +101,8 @@ CloverField *cloverPrecondition = nullptr; CloverField *cloverRefinement = nullptr; CloverField *cloverEigensolver = nullptr; -cudaGaugeField *momResident = nullptr; -cudaGaugeField *extendedGaugeResident = nullptr; +GaugeField *momResident = nullptr; +GaugeField *extendedGaugeResident = nullptr; std::vector solutionResident; @@ -536,8 +536,8 @@ static bool invalidate_clover = true; * @param refinement[in/out] Reference the to pointer of a given "refinement" field. * @param eigensolver[in/out] Reference then to pointer of a given "eigensolver" field. */ -void freeUniqueSloppyGaugeUtility(cudaGaugeField *&precise, cudaGaugeField *&sloppy, cudaGaugeField *&precondition, - cudaGaugeField *&refinement, cudaGaugeField *&eigensolver); +void freeUniqueSloppyGaugeUtility(GaugeField *&precise, GaugeField *&sloppy, GaugeField *&precondition, + GaugeField *&refinement, GaugeField *&eigensolver); /** * Abstraction utility that cleans up the full set of sloppy fields, as well as @@ -552,8 +552,8 @@ void freeUniqueSloppyGaugeUtility(cudaGaugeField *&precise, cudaGaugeField *&slo * @param extended[in/out] Reference to the pointer of a given "extended" field. * @param preserve_precise[in] Whether (true) or not (false) to preserve the precise field. 
*/ -void freeUniqueGaugeUtility(cudaGaugeField *&precise, cudaGaugeField *&sloppy, cudaGaugeField *&precondition, - cudaGaugeField *&refinement, cudaGaugeField *&eigensolver, cudaGaugeField *&extended, +void freeUniqueGaugeUtility(GaugeField *&precise, GaugeField *&sloppy, GaugeField *&precondition, + GaugeField *&refinement, GaugeField *&eigensolver, GaugeField *&extended, bool preserve_precise); void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) @@ -571,8 +571,8 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) if (gauge_param.order <= 4) gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_NO; GaugeField *in = (param->location == QUDA_CPU_FIELD_LOCATION) ? - static_cast(new cpuGaugeField(gauge_param)) : - static_cast(new cudaGaugeField(gauge_param)); + static_cast(new GaugeField(gauge_param)) : + static_cast(new GaugeField(gauge_param)); if (in->Order() == QUDA_BQCD_GAUGE_ORDER) { static size_t checksum = SIZE_MAX; @@ -610,7 +610,7 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) } // if not preserving then copy the gauge field passed in - cudaGaugeField *precise = nullptr; + GaugeField *precise = nullptr; // switch the parameters for creating the mirror precise cuda gauge field gauge_param.create = QUDA_NULL_FIELD_CREATE; @@ -620,7 +620,7 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) gauge_param.pad = param->ga_pad; gauge_param.location = QUDA_CUDA_FIELD_LOCATION; - precise = new cudaGaugeField(gauge_param); + precise = new GaugeField(gauge_param); if (param->use_resident_gauge) { if(gaugePrecise == nullptr) errorQuda("No resident gauge field"); @@ -655,44 +655,44 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) // switch the parameters for creating the mirror sloppy cuda gauge field gauge_param.reconstruct = param->reconstruct_sloppy; gauge_param.setPrecision(param->cuda_prec_sloppy, true); - cudaGaugeField *sloppy = nullptr; + GaugeField *sloppy = nullptr; if (param->cuda_prec == param->cuda_prec_sloppy && 
param->reconstruct == param->reconstruct_sloppy) { sloppy = precise; } else { - sloppy = new cudaGaugeField(gauge_param); + sloppy = new GaugeField(gauge_param); sloppy->copy(*precise); } // switch the parameters for creating the mirror preconditioner cuda gauge field gauge_param.reconstruct = param->reconstruct_precondition; gauge_param.setPrecision(param->cuda_prec_precondition, true); - cudaGaugeField *precondition = nullptr; + GaugeField *precondition = nullptr; if (param->cuda_prec == param->cuda_prec_precondition && param->reconstruct == param->reconstruct_precondition) { precondition = precise; } else if (param->cuda_prec_sloppy == param->cuda_prec_precondition && param->reconstruct_sloppy == param->reconstruct_precondition) { precondition = sloppy; } else { - precondition = new cudaGaugeField(gauge_param); + precondition = new GaugeField(gauge_param); precondition->copy(*precise); } // switch the parameters for creating the refinement cuda gauge field gauge_param.reconstruct = param->reconstruct_refinement_sloppy; gauge_param.setPrecision(param->cuda_prec_refinement_sloppy, true); - cudaGaugeField *refinement = nullptr; + GaugeField *refinement = nullptr; if (param->cuda_prec_sloppy == param->cuda_prec_refinement_sloppy && param->reconstruct_sloppy == param->reconstruct_refinement_sloppy) { refinement = sloppy; } else { - refinement = new cudaGaugeField(gauge_param); + refinement = new GaugeField(gauge_param); refinement->copy(*sloppy); } // switch the parameters for creating the eigensolver cuda gauge field gauge_param.reconstruct = param->reconstruct_eigensolver; gauge_param.setPrecision(param->cuda_prec_eigensolver, true); - cudaGaugeField *eigensolver = nullptr; + GaugeField *eigensolver = nullptr; if (param->cuda_prec == param->cuda_prec_eigensolver && param->reconstruct == param->reconstruct_eigensolver) { eigensolver = precise; } else if (param->cuda_prec_precondition == param->cuda_prec_eigensolver @@ -702,14 +702,14 @@ void loadGaugeQuda(void 
*h_gauge, QudaGaugeParam *param) && param->reconstruct_sloppy == param->reconstruct_eigensolver) { eigensolver = sloppy; } else { - eigensolver = new cudaGaugeField(gauge_param); + eigensolver = new GaugeField(gauge_param); eigensolver->copy(*precise); } profileGauge.TPSTOP(QUDA_PROFILE_COMPUTE); // create an extended preconditioning field - cudaGaugeField* extended = nullptr; + GaugeField* extended = nullptr; if (param->overlap){ lat_dim_t R; // domain-overlap widths in different directions for (int i=0; i<4; ++i) R[i] = param->overlap*commDimPartitioned(i); @@ -780,8 +780,8 @@ void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param) // Set the specific cpu parameters and create the cpu gauge field GaugeFieldParam gauge_param(*param, h_gauge); - cpuGaugeField cpuGauge(gauge_param); - cudaGaugeField *cudaGauge = nullptr; + GaugeField cpuGauge(gauge_param); + GaugeField *cudaGauge = nullptr; switch (param->type) { case QUDA_WILSON_LINKS: cudaGauge = gaugePrecise; break; case QUDA_ASQTAD_FAT_LINKS: cudaGauge = gaugeFatPrecise; break; @@ -792,7 +792,7 @@ void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param) gauge_param.setPrecision(param->cuda_prec, true); gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; gauge_param.pad = param->ga_pad; - cudaGauge = new cudaGaugeField(gauge_param); + cudaGauge = new GaugeField(gauge_param); copyExtendedGauge(*cudaGauge, *gaugeSmeared, QUDA_CUDA_FIELD_LOCATION); break; default: errorQuda("Invalid gauge type"); @@ -1047,8 +1047,8 @@ void freeGaugeQuda(void) } // These utility functions are declared w/doxygen above -void freeUniqueSloppyGaugeUtility(cudaGaugeField *&precise, cudaGaugeField *&sloppy, cudaGaugeField *&precondition, - cudaGaugeField *&refinement, cudaGaugeField *&eigensolver) +void freeUniqueSloppyGaugeUtility(GaugeField *&precise, GaugeField *&sloppy, GaugeField *&precondition, + GaugeField *&refinement, GaugeField *&eigensolver) { // In theory, we're checking for aliasing and freeing fields in the opposite order 
// from which they were allocated... but in any case, we're doing an all-to-all @@ -1073,8 +1073,8 @@ void freeUniqueSloppyGaugeUtility(cudaGaugeField *&precise, cudaGaugeField *&slo sloppy = nullptr; } -void freeUniqueGaugeUtility(cudaGaugeField *&precise, cudaGaugeField *&sloppy, cudaGaugeField *&precondition, - cudaGaugeField *&refinement, cudaGaugeField *&eigensolver, cudaGaugeField *&extended, +void freeUniqueGaugeUtility(GaugeField *&precise, GaugeField *&sloppy, GaugeField *&precondition, + GaugeField *&refinement, GaugeField *&eigensolver, GaugeField *&extended, bool preserve_precise) { freeUniqueSloppyGaugeUtility(precise, sloppy, precondition, refinement, eigensolver); @@ -1135,7 +1135,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r if (gauge_param.Precision() == gaugePrecise->Precision() && gauge_param.reconstruct == gaugePrecise->Reconstruct()) { gaugeSloppy = gaugePrecise; } else { - gaugeSloppy = new cudaGaugeField(gauge_param); + gaugeSloppy = new GaugeField(gauge_param); gaugeSloppy->copy(*gaugePrecise); } @@ -1151,7 +1151,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r && gauge_param.reconstruct == gaugeSloppy->Reconstruct()) { gaugePrecondition = gaugeSloppy; } else { - gaugePrecondition = new cudaGaugeField(gauge_param); + gaugePrecondition = new GaugeField(gauge_param); gaugePrecondition->copy(*gaugePrecise); } @@ -1164,7 +1164,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r if (gauge_param.Precision() == gaugeSloppy->Precision() && gauge_param.reconstruct == gaugeSloppy->Reconstruct()) { gaugeRefinement = gaugeSloppy; } else { - gaugeRefinement = new cudaGaugeField(gauge_param); + gaugeRefinement = new GaugeField(gauge_param); gaugeRefinement->copy(*gaugeSloppy); } @@ -1183,7 +1183,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r && gauge_param.reconstruct == gaugePrecondition->Reconstruct()) { 
gaugeEigensolver = gaugePrecondition; } else { - gaugeEigensolver = new cudaGaugeField(gauge_param); + gaugeEigensolver = new GaugeField(gauge_param); gaugeEigensolver->copy(*gaugePrecise); } } @@ -1201,7 +1201,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r && gauge_param.reconstruct == gaugeFatPrecise->Reconstruct()) { gaugeFatSloppy = gaugeFatPrecise; } else { - gaugeFatSloppy = new cudaGaugeField(gauge_param); + gaugeFatSloppy = new GaugeField(gauge_param); gaugeFatSloppy->copy(*gaugeFatPrecise); } @@ -1217,7 +1217,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r && gauge_param.reconstruct == gaugeFatSloppy->Reconstruct()) { gaugeFatPrecondition = gaugeFatSloppy; } else { - gaugeFatPrecondition = new cudaGaugeField(gauge_param); + gaugeFatPrecondition = new GaugeField(gauge_param); gaugeFatPrecondition->copy(*gaugeFatPrecise); } @@ -1230,7 +1230,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r && gauge_param.reconstruct == gaugeFatSloppy->Reconstruct()) { gaugeFatRefinement = gaugeFatSloppy; } else { - gaugeFatRefinement = new cudaGaugeField(gauge_param); + gaugeFatRefinement = new GaugeField(gauge_param); gaugeFatRefinement->copy(*gaugeFatSloppy); } @@ -1249,7 +1249,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r && gauge_param.reconstruct == gaugeFatPrecondition->Reconstruct()) { gaugeFatEigensolver = gaugeFatPrecondition; } else { - gaugeFatEigensolver = new cudaGaugeField(gauge_param); + gaugeFatEigensolver = new GaugeField(gauge_param); gaugeFatEigensolver->copy(*gaugeFatPrecise); } } @@ -1268,7 +1268,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r && gauge_param.reconstruct == gaugeLongPrecise->Reconstruct()) { gaugeLongSloppy = gaugeLongPrecise; } else { - gaugeLongSloppy = new cudaGaugeField(gauge_param); + gaugeLongSloppy = new GaugeField(gauge_param); 
gaugeLongSloppy->copy(*gaugeLongPrecise); } @@ -1285,7 +1285,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r && gauge_param.reconstruct == gaugeLongSloppy->Reconstruct()) { gaugeLongPrecondition = gaugeLongSloppy; } else { - gaugeLongPrecondition = new cudaGaugeField(gauge_param); + gaugeLongPrecondition = new GaugeField(gauge_param); gaugeLongPrecondition->copy(*gaugeLongPrecise); } @@ -1299,7 +1299,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r && gauge_param.reconstruct == gaugeLongSloppy->Reconstruct()) { gaugeLongRefinement = gaugeLongSloppy; } else { - gaugeLongRefinement = new cudaGaugeField(gauge_param); + gaugeLongRefinement = new GaugeField(gauge_param); gaugeLongRefinement->copy(*gaugeLongSloppy); } @@ -1319,7 +1319,7 @@ void loadSloppyGaugeQuda(const QudaPrecision *prec, const QudaReconstructType *r && gauge_param.reconstruct == gaugeLongPrecondition->Reconstruct()) { gaugeLongEigensolver = gaugeLongPrecondition; } else { - gaugeLongEigensolver = new cudaGaugeField(gauge_param); + gaugeLongEigensolver = new GaugeField(gauge_param); gaugeLongEigensolver->copy(*gaugeLongPrecise); } } @@ -2068,9 +2068,9 @@ void checkClover(QudaInvertParam *param) { if (cloverEigensolver == nullptr) errorQuda("Eigensolver clover field doesn't exist"); } -quda::cudaGaugeField *checkGauge(QudaInvertParam *param) +quda::GaugeField *checkGauge(QudaInvertParam *param) { - quda::cudaGaugeField *cudaGauge = nullptr; + quda::GaugeField *cudaGauge = nullptr; if (param->dslash_type != QUDA_ASQTAD_DSLASH) { if (gaugePrecise == nullptr) errorQuda("Precise gauge field doesn't exist"); @@ -2241,7 +2241,7 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam checkEigParam(eig_param); // Check that the gauge field is valid - cudaGaugeField *cudaGauge = checkGauge(inv_param); + GaugeField *cudaGauge = checkGauge(inv_param); // Set all timing statistics to zero inv_param->secs = 0; @@ 
-2386,7 +2386,7 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &pr blas_lapack::set_native(param->native_blas_lapack); checkMultigridParam(&mg_param); - cudaGaugeField *cudaGauge = checkGauge(param); + GaugeField *cudaGauge = checkGauge(param); // check MG params (needs to go somewhere else) if (mg_param.n_level > QUDA_MAX_MG_LEVEL) @@ -2624,7 +2624,7 @@ deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile) profile.TPSTART(QUDA_PROFILE_INIT); - cudaGaugeField *cudaGauge = checkGauge(param); + GaugeField *cudaGauge = checkGauge(param); eig_param.secs = 0; eig_param.gflops = 0; @@ -2710,7 +2710,7 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) checkInvertParam(param, hp_x, hp_b); // check the gauge fields have been created - cudaGaugeField *cudaGauge = checkGauge(param); + GaugeField *cudaGauge = checkGauge(param); // It was probably a bad design decision to encode whether the system is even/odd preconditioned (PC) in // solve_type and solution_type, rather than in separate members of QudaInvertParam. 
We're stuck with it @@ -3292,15 +3292,15 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col if (!is_staggered) { gf_param->create = QUDA_NULL_FIELD_CREATE; - collected_gauge = new quda::cpuGaugeField(*gf_param); + collected_gauge = new quda::GaugeField(*gf_param); std::vector v_g(1); v_g[0] = in; quda::split_field(*collected_gauge, v_g, split_key); } else { milc_fatlink_param->create = QUDA_NULL_FIELD_CREATE; milc_longlink_param->create = QUDA_NULL_FIELD_CREATE; - collected_milc_fatlink_field = new quda::cpuGaugeField(*milc_fatlink_param); - collected_milc_longlink_field = new quda::cpuGaugeField(*milc_longlink_param); + collected_milc_fatlink_field = new quda::GaugeField(*milc_fatlink_param); + collected_milc_longlink_field = new quda::GaugeField(*milc_longlink_param); std::vector v_g(1); v_g[0] = milc_fatlink_field; quda::split_field(*collected_milc_fatlink_field, v_g, split_key); @@ -3835,27 +3835,27 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, GaugeFieldParam gParam(*param, fatlink, QUDA_GENERAL_LINKS); gParam.location = QUDA_CPU_FIELD_LOCATION; - cpuGaugeField cpuFatLink(gParam); // create the host fatlink + GaugeField cpuFatLink(gParam); // create the host fatlink gParam.gauge = longlink; - cpuGaugeField cpuLongLink(gParam); // create the host longlink + GaugeField cpuLongLink(gParam); // create the host longlink gParam.gauge = ulink; - cpuGaugeField cpuUnitarizedLink(gParam); + GaugeField cpuUnitarizedLink(gParam); gParam.link_type = param->type; gParam.gauge = inlink; - cpuGaugeField cpuInLink(gParam); // create the host sitelink + GaugeField cpuInLink(gParam); // create the host sitelink // create the device fields gParam.location = QUDA_CUDA_FIELD_LOCATION; gParam.reconstruct = param->reconstruct; gParam.setPrecision(param->cuda_prec, true); gParam.create = QUDA_NULL_FIELD_CREATE; - cudaGaugeField *cudaInLink = new cudaGaugeField(gParam); + GaugeField *cudaInLink = new 
GaugeField(gParam); profileFatLink.TPSTOP(QUDA_PROFILE_INIT); profileFatLink.TPSTART(QUDA_PROFILE_H2D); cudaInLink->copy(cpuInLink); profileFatLink.TPSTOP(QUDA_PROFILE_H2D); - cudaGaugeField *cudaInLinkEx = createExtendedGauge(*cudaInLink, R, profileFatLink); + GaugeField *cudaInLinkEx = createExtendedGauge(*cudaInLink, R, profileFatLink); profileFatLink.TPSTART(QUDA_PROFILE_FREE); delete cudaInLink; @@ -3869,7 +3869,7 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, if (longlink) { profileFatLink.TPSTART(QUDA_PROFILE_INIT); - cudaGaugeField *cudaLongLink = new cudaGaugeField(gParam); + GaugeField *cudaLongLink = new GaugeField(gParam); profileFatLink.TPSTOP(QUDA_PROFILE_INIT); profileFatLink.TPSTART(QUDA_PROFILE_COMPUTE); @@ -3886,7 +3886,7 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, } profileFatLink.TPSTART(QUDA_PROFILE_INIT); - cudaGaugeField *cudaFatLink = new cudaGaugeField(gParam); + GaugeField *cudaFatLink = new GaugeField(gParam); profileFatLink.TPSTOP(QUDA_PROFILE_INIT); profileFatLink.TPSTART(QUDA_PROFILE_COMPUTE); @@ -3913,7 +3913,7 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, quda::setUnitarizeLinksConstants(unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error); - cudaGaugeField *cudaUnitarizedLink = new cudaGaugeField(gParam); + GaugeField *cudaUnitarizedLink = new GaugeField(gParam); profileFatLink.TPSTART(QUDA_PROFILE_COMPUTE); *num_failures_h = 0; @@ -3947,21 +3947,21 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param) GaugeFieldParam gParam(*param, inlink, QUDA_GENERAL_LINKS); gParam.gauge = twolink; - cpuGaugeField cpuTwoLink(gParam); // create the host twolink + GaugeField cpuTwoLink(gParam); // create the host twolink profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT); - cudaGaugeField *cudaInLinkEx = nullptr; + GaugeField *cudaInLinkEx = nullptr; if(inlink) { 
gParam.link_type = param->type; gParam.gauge = inlink; - cpuGaugeField cpuInLink(gParam); // create the host sitelink + GaugeField cpuInLink(gParam); // create the host sitelink // create the device fields gParam.reconstruct = param->reconstruct; gParam.setPrecision(param->cuda_prec, true); gParam.create = QUDA_NULL_FIELD_CREATE; - cudaGaugeField *cudaInLink = new cudaGaugeField(gParam); + GaugeField *cudaInLink = new GaugeField(gParam); profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT); profileGaussianSmear.TPSTART(QUDA_PROFILE_H2D); @@ -3991,7 +3991,7 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param) profileGaussianSmear.TPSTART(QUDA_PROFILE_INIT); freeUniqueGaugeQuda(QUDA_SMEARED_LINKS); - gaugeSmeared = new cudaGaugeField(gsParam); + gaugeSmeared = new GaugeField(gsParam); profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT); @@ -4028,9 +4028,9 @@ int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int gParam.location = QUDA_CPU_FIELD_LOCATION; gParam.site_offset = qudaGaugeParam->gauge_offset; gParam.site_size = qudaGaugeParam->site_size; - cpuGaugeField *cpuSiteLink = (!qudaGaugeParam->use_resident_gauge) ? new cpuGaugeField(gParam) : nullptr; + GaugeField *cpuSiteLink = (!qudaGaugeParam->use_resident_gauge) ? 
new GaugeField(gParam) : nullptr; - cudaGaugeField* cudaSiteLink = nullptr; + GaugeField* cudaSiteLink = nullptr; if (qudaGaugeParam->use_resident_gauge) { if (!gaugePrecise) errorQuda("No resident gauge field to use"); @@ -4041,7 +4041,7 @@ int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int gParam.setPrecision(qudaGaugeParam->cuda_prec, true); gParam.location = QUDA_CUDA_FIELD_LOCATION; - cudaSiteLink = new cudaGaugeField(gParam); + cudaSiteLink = new GaugeField(gParam); profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT); profileGaugeForce.TPSTART(QUDA_PROFILE_H2D); @@ -4060,9 +4060,9 @@ int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int gParamMom.site_offset = qudaGaugeParam->mom_offset; gParamMom.site_size = qudaGaugeParam->site_size; - cpuGaugeField* cpuMom = (!qudaGaugeParam->use_resident_mom) ? new cpuGaugeField(gParamMom) : nullptr; + GaugeField* cpuMom = (!qudaGaugeParam->use_resident_mom) ? new GaugeField(gParamMom) : nullptr; - cudaGaugeField* cudaMom = nullptr; + GaugeField* cudaMom = nullptr; if (qudaGaugeParam->use_resident_mom) { if (!momResident) errorQuda("No resident momentum field to use"); cudaMom = momResident; @@ -4075,7 +4075,7 @@ int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS; gParamMom.setPrecision(qudaGaugeParam->cuda_prec, true); gParamMom.create = QUDA_ZERO_FIELD_CREATE; - cudaMom = new cudaGaugeField(gParamMom); + cudaMom = new GaugeField(gParamMom); profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT); if (!qudaGaugeParam->overwrite_mom) { profileGaugeForce.TPSTART(QUDA_PROFILE_H2D); @@ -4084,7 +4084,7 @@ int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int } } - cudaGaugeField *cudaGauge = createExtendedGauge(*cudaSiteLink, R, profileGaugeForce); + GaugeField *cudaGauge = createExtendedGauge(*cudaSiteLink, R, profileGaugeForce); // apply / remove phase as appropriate if 
(cudaGauge->StaggeredPhaseApplied()) cudaGauge->removeStaggeredPhase(); @@ -4163,9 +4163,9 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int * gParam.location = QUDA_CPU_FIELD_LOCATION; gParam.site_offset = qudaGaugeParam->gauge_offset; gParam.site_size = qudaGaugeParam->site_size; - cpuGaugeField *cpuSiteLink = (!qudaGaugeParam->use_resident_gauge) ? new cpuGaugeField(gParam) : nullptr; + GaugeField *cpuSiteLink = (!qudaGaugeParam->use_resident_gauge) ? new GaugeField(gParam) : nullptr; - cudaGaugeField *cudaSiteLink = nullptr; + GaugeField *cudaSiteLink = nullptr; if (qudaGaugeParam->use_resident_gauge) { if (!gaugePrecise) errorQuda("No resident gauge field to use"); @@ -4176,7 +4176,7 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int * gParam.reconstruct = qudaGaugeParam->reconstruct; gParam.setPrecision(qudaGaugeParam->cuda_prec, true); - cudaSiteLink = new cudaGaugeField(gParam); + cudaSiteLink = new GaugeField(gParam); profileGaugePath.TPSTOP(QUDA_PROFILE_INIT); profileGaugePath.TPSTART(QUDA_PROFILE_H2D); @@ -4190,12 +4190,12 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int * gParamOut.location = QUDA_CPU_FIELD_LOCATION; gParamOut.site_offset = qudaGaugeParam->gauge_offset; gParamOut.site_size = qudaGaugeParam->site_size; - cpuGaugeField *cpuOut = new cpuGaugeField(gParamOut); + GaugeField *cpuOut = new GaugeField(gParamOut); gParamOut.location = QUDA_CUDA_FIELD_LOCATION; gParamOut.create = qudaGaugeParam->overwrite_gauge ? 
QUDA_ZERO_FIELD_CREATE : QUDA_NULL_FIELD_CREATE; gParamOut.reconstruct = QUDA_RECONSTRUCT_NO; gParamOut.setPrecision(qudaGaugeParam->cuda_prec, true); - cudaGaugeField *cudaOut = new cudaGaugeField(gParamOut); + GaugeField *cudaOut = new GaugeField(gParamOut); profileGaugePath.TPSTOP(QUDA_PROFILE_INIT); if (!qudaGaugeParam->overwrite_gauge) { profileGaugePath.TPSTART(QUDA_PROFILE_H2D); @@ -4203,7 +4203,7 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int * profileGaugePath.TPSTOP(QUDA_PROFILE_H2D); } - cudaGaugeField *cudaGauge = createExtendedGauge(*cudaSiteLink, R, profileGaugePath); + GaugeField *cudaGauge = createExtendedGauge(*cudaSiteLink, R, profileGaugePath); // apply / remove phase as appropriate if (cudaGauge->StaggeredPhaseApplied()) cudaGauge->removeStaggeredPhase(); @@ -4265,7 +4265,7 @@ void momResidentQuda(void *mom, QudaGaugeParam *param) gParamMom.site_offset = param->mom_offset; gParamMom.site_size = param->site_size; - cpuGaugeField cpuMom(gParamMom); + GaugeField cpuMom(gParamMom); if (param->make_resident_mom && !param->return_result_mom) { if (momResident) delete momResident; @@ -4275,7 +4275,7 @@ void momResidentQuda(void *mom, QudaGaugeParam *param) gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS; gParamMom.setPrecision(param->cuda_prec, true); gParamMom.create = QUDA_ZERO_FIELD_CREATE; - momResident = new cudaGaugeField(gParamMom); + momResident = new GaugeField(gParamMom); } else if (param->return_result_mom && !param->make_resident_mom) { if (!momResident) errorQuda("No resident momentum to return"); } else { @@ -4314,7 +4314,7 @@ void createCloverQuda(QudaInvertParam* invertParam) // for clover we optimize to only send depth 1 halos in y/z/t (FIXME - make work for x, make robust in general) lat_dim_t R; for (int d=0; d<4; d++) R[d] = (d==0 ? 2 : 1) * (redundant_comms || commDimPartitioned(d)); - cudaGaugeField *gauge = extendedGaugeResident ? 
extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profileClover, false, recon); + GaugeField *gauge = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profileClover, false, recon); profileClover.TPSTART(QUDA_PROFILE_INIT); @@ -4333,7 +4333,7 @@ void createCloverQuda(QudaInvertParam* invertParam) tensorParam.siteSubset = QUDA_FULL_SITE_SUBSET; tensorParam.order = QUDA_FLOAT2_GAUGE_ORDER; tensorParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO; - cudaGaugeField Fmunu(tensorParam); + GaugeField Fmunu(tensorParam); profileClover.TPSTOP(QUDA_PROFILE_INIT); profileClover.TPSTART(QUDA_PROFILE_COMPUTE); computeFmunu(Fmunu, *ex); @@ -4354,12 +4354,12 @@ void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param) if (geometry != QUDA_SCALAR_GEOMETRY && geometry != QUDA_VECTOR_GEOMETRY) errorQuda("Only scalar and vector geometries are supported\n"); - cpuGaugeField *cpuGauge = nullptr; - if (gauge) cpuGauge = new cpuGaugeField(gParam); + GaugeField *cpuGauge = nullptr; + if (gauge) cpuGauge = new GaugeField(gParam); gParam.order = QUDA_FLOAT2_GAUGE_ORDER; gParam.create = QUDA_ZERO_FIELD_CREATE; - auto* cudaGauge = new cudaGaugeField(gParam); + auto* cudaGauge = new GaugeField(gParam); if (gauge) { cudaGauge->copy(*cpuGauge); @@ -4371,18 +4371,18 @@ void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param) void saveGaugeFieldQuda(void *gauge, void *inGauge, QudaGaugeParam *param) { - auto* cudaGauge = reinterpret_cast(inGauge); + auto* cudaGauge = reinterpret_cast(inGauge); GaugeFieldParam gParam(*param, gauge, QUDA_GENERAL_LINKS); gParam.geometry = cudaGauge->Geometry(); - cpuGaugeField cpuGauge(gParam); + GaugeField cpuGauge(gParam); cpuGauge.copy(*cudaGauge); } void destroyGaugeFieldQuda(void *gauge) { - auto* g = reinterpret_cast(gauge); + auto* g = reinterpret_cast(gauge); delete g; } @@ -4398,7 +4398,7 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi 
gParam.location = QUDA_CPU_FIELD_LOCATION; gParam.reconstruct = gauge_param->reconstruct; gParam.t_boundary = QUDA_PERIODIC_T; - cpuGaugeField cpuMom(gParam); + GaugeField cpuMom(gParam); // create the device momentum field gParam.location = QUDA_CUDA_FIELD_LOCATION; @@ -4406,13 +4406,13 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi gParam.create = QUDA_ZERO_FIELD_CREATE; // FIXME gParam.order = QUDA_FLOAT2_GAUGE_ORDER; gParam.reconstruct = QUDA_RECONSTRUCT_10; - cudaGaugeField *cudaMom = !gauge_param->use_resident_mom ? new cudaGaugeField(gParam) : nullptr; + GaugeField *cudaMom = !gauge_param->use_resident_mom ? new GaugeField(gParam) : nullptr; // create temporary field for quark-field outer product gParam.reconstruct = QUDA_RECONSTRUCT_NO; gParam.link_type = QUDA_GENERAL_LINKS; gParam.create = QUDA_ZERO_FIELD_CREATE; - cudaGaugeField cudaForce(gParam); + GaugeField cudaForce(gParam); GaugeField *cudaForce_[2] = {&cudaForce}; ColorSpinorParam qParam; @@ -4606,9 +4606,9 @@ void computeHISQForceQuda(void* const milc_momentum, oParam.setPrecision(gParam->cpu_prec, true); oParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO; - cudaGaugeField *stapleOprod = new cudaGaugeField(oParam); - cudaGaugeField *oneLinkOprod = new cudaGaugeField(oParam); - cudaGaugeField *naikOprod = new cudaGaugeField(oParam); + GaugeField *stapleOprod = new GaugeField(oParam); + GaugeField *oneLinkOprod = new GaugeField(oParam); + GaugeField *naikOprod = new GaugeField(oParam); double act_path_coeff[6] = {0, 1, level2_coeff[2], level2_coeff[3], level2_coeff[4], level2_coeff[5]}; // You have to look at the MILC routine to understand the following @@ -4712,11 +4712,11 @@ void computeHISQForceQuda(void* const milc_momentum, oParam.r[dir] = R[dir]; } - cudaGaugeField *cudaInForce = new cudaGaugeField(oParam); + GaugeField *cudaInForce = new GaugeField(oParam); copyExtendedGauge(*cudaInForce, *stapleOprod, QUDA_CUDA_FIELD_LOCATION); delete stapleOprod; - 
cudaGaugeField *cudaOutForce = new cudaGaugeField(oParam); + GaugeField *cudaOutForce = new GaugeField(oParam); copyExtendedGauge(*cudaOutForce, *oneLinkOprod, QUDA_CUDA_FIELD_LOCATION); delete oneLinkOprod; @@ -4729,7 +4729,7 @@ void computeHISQForceQuda(void* const milc_momentum, param.reconstruct = QUDA_RECONSTRUCT_10; param.ghostExchange = QUDA_GHOST_EXCHANGE_NO; param.gauge = milc_momentum; - cpuGaugeField *cpuMom = (!gParam->use_resident_mom) ? new cpuGaugeField(param) : nullptr; + GaugeField *cpuMom = (!gParam->use_resident_mom) ? new GaugeField(param) : nullptr; param.location = QUDA_CUDA_FIELD_LOCATION; param.create = QUDA_ZERO_FIELD_CREATE; @@ -4750,15 +4750,15 @@ void computeHISQForceQuda(void* const milc_momentum, wParam.link_type = QUDA_GENERAL_LINKS; wParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO; wParam.gauge = (void *)w_link; - cpuGaugeField cpuWLink(wParam); + GaugeField cpuWLink(wParam); GaugeFieldParam vParam(wParam); vParam.gauge = (void *)v_link; - cpuGaugeField cpuVLink(vParam); + GaugeField cpuVLink(vParam); GaugeFieldParam uParam(vParam); uParam.gauge = (void *)u_link; - cpuGaugeField cpuULink(uParam); + GaugeField cpuULink(uParam); // Load the W field, which contains U(3) matrices, to the device gParam_field.ga_pad = 3 * pad_size; @@ -4773,7 +4773,7 @@ void computeHISQForceQuda(void* const milc_momentum, wParam.create = QUDA_NULL_FIELD_CREATE; wParam.setPrecision(gParam->cpu_prec, true); - cudaGaugeField *cudaWLink = new cudaGaugeField(wParam); + GaugeField *cudaWLink = new GaugeField(wParam); profileHISQForce.TPSTOP(QUDA_PROFILE_INIT); profileHISQForce.TPSTART(QUDA_PROFILE_H2D); @@ -4821,7 +4821,7 @@ void computeHISQForceQuda(void* const milc_momentum, vParam.setPrecision(gParam->cpu_prec, true); vParam.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED; vParam.pad = 3 * pad_size; - cudaGaugeField *cudaVLink = new cudaGaugeField(vParam); + GaugeField *cudaVLink = new GaugeField(vParam); profileHISQForce.TPSTOP(QUDA_PROFILE_INIT); 
profileHISQForce.TPSTART(QUDA_PROFILE_H2D); @@ -4856,7 +4856,7 @@ void computeHISQForceQuda(void* const milc_momentum, uParam.setPrecision(gParam->cpu_prec, true); uParam.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED; uParam.pad = 3 * pad_size; - cudaGaugeField *cudaULink = new cudaGaugeField(uParam); + GaugeField *cudaULink = new GaugeField(uParam); profileHISQForce.TPSTOP(QUDA_PROFILE_INIT); profileHISQForce.TPSTART(QUDA_PROFILE_H2D); @@ -4874,7 +4874,7 @@ void computeHISQForceQuda(void* const milc_momentum, delete cudaInForce; profileHISQForce.TPSTOP(QUDA_PROFILE_FREE); profileHISQForce.TPSTART(QUDA_PROFILE_INIT); - cudaGaugeField* cudaMom = new cudaGaugeField(momParam); + GaugeField* cudaMom = new GaugeField(momParam); profileHISQForce.TPSTOP(QUDA_PROFILE_INIT); profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE); @@ -4930,20 +4930,20 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double fParam.location = QUDA_CPU_FIELD_LOCATION; fParam.reconstruct = QUDA_RECONSTRUCT_10; fParam.order = gauge_param->gauge_order; - cpuGaugeField cpuMom(fParam); + GaugeField cpuMom(fParam); // create the device momentum field fParam.location = QUDA_CUDA_FIELD_LOCATION; fParam.create = QUDA_ZERO_FIELD_CREATE; fParam.order = QUDA_FLOAT2_GAUGE_ORDER; - cudaGaugeField cudaMom(fParam); + GaugeField cudaMom(fParam); // create the device force field fParam.link_type = QUDA_GENERAL_LINKS; fParam.create = QUDA_ZERO_FIELD_CREATE; fParam.order = QUDA_FLOAT2_GAUGE_ORDER; fParam.reconstruct = QUDA_RECONSTRUCT_NO; - cudaGaugeField cudaForce(fParam); + GaugeField cudaForce(fParam); ColorSpinorParam qParam; qParam.location = QUDA_CUDA_FIELD_LOCATION; @@ -4988,11 +4988,11 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double solutionResident.size(), nvector); } - cudaGaugeField &gaugeEx = *extendedGaugeResident; + GaugeField &gaugeEx = *extendedGaugeResident; // create oprod and trace fields fParam.geometry = QUDA_TENSOR_GEOMETRY; - 
cudaGaugeField oprod(fParam); + GaugeField oprod(fParam); profileCloverForce.TPSTOP(QUDA_PROFILE_INIT); profileCloverForce.TPSTART(QUDA_PROFILE_COMPUTE); @@ -5040,11 +5040,11 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double computeCloverForce(cudaForce, *gaugePrecise, quarkX, quarkP, force_coeff); // In double precision the clover derivative is faster with no reconstruct - cudaGaugeField *u = &gaugeEx; + GaugeField *u = &gaugeEx; if (gaugeEx.Reconstruct() == QUDA_RECONSTRUCT_12 && gaugeEx.Precision() == QUDA_DOUBLE_PRECISION) { GaugeFieldParam param(gaugeEx); param.reconstruct = QUDA_RECONSTRUCT_NO; - u = new cudaGaugeField(param); + u = new GaugeField(param); u -> copy(gaugeEx); } @@ -5060,7 +5060,7 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double computeCloverSigmaOprod(oprod, quarkX, quarkP, ferm_epsilon); - cudaGaugeField *oprodEx = createExtendedGauge(oprod, R, profileCloverForce); + GaugeField *oprodEx = createExtendedGauge(oprod, R, profileCloverForce); profileCloverForce.TPSTART(QUDA_PROFILE_COMPUTE); @@ -5112,7 +5112,7 @@ void updateGaugeFieldQuda(void* gauge, gParam.site_offset = param->gauge_offset; gParam.site_size = param->site_size; bool need_cpu = !param->use_resident_gauge || param->return_result_gauge; - cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr; + GaugeField *cpuGauge = need_cpu ? new GaugeField(gParam) : nullptr; GaugeFieldParam gParamMom(*param, momentum); gParamMom.reconstruct = (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER) ? @@ -5120,7 +5120,7 @@ void updateGaugeFieldQuda(void* gauge, gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS; gParamMom.site_offset = param->mom_offset; gParamMom.site_size = param->site_size; - cpuGaugeField *cpuMom = !param->use_resident_mom ? new cpuGaugeField(gParamMom) : nullptr; + GaugeField *cpuMom = !param->use_resident_mom ? 
new GaugeField(gParamMom) : nullptr; // create the device fields gParam.location = QUDA_CUDA_FIELD_LOCATION; @@ -5130,12 +5130,12 @@ void updateGaugeFieldQuda(void* gauge, gParam.reconstruct = QUDA_RECONSTRUCT_10; gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO; gParam.pad = 0; - cudaGaugeField *cudaMom = !param->use_resident_mom ? new cudaGaugeField(gParam) : nullptr; + GaugeField *cudaMom = !param->use_resident_mom ? new GaugeField(gParam) : nullptr; gParam.link_type = QUDA_SU3_LINKS; gParam.reconstruct = param->reconstruct; - cudaGaugeField *cudaInGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : nullptr; - auto *cudaOutGauge = new cudaGaugeField(gParam); + GaugeField *cudaInGauge = !param->use_resident_gauge ? new GaugeField(gParam) : nullptr; + auto *cudaOutGauge = new GaugeField(gParam); profileGaugeUpdate.TPSTOP(QUDA_PROFILE_INIT); @@ -5207,14 +5207,14 @@ void updateGaugeFieldQuda(void* gauge, gParam.site_offset = param->gauge_offset; gParam.site_size = param->site_size; bool need_cpu = !param->use_resident_gauge || param->return_result_gauge; - cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr; + GaugeField *cpuGauge = need_cpu ? new GaugeField(gParam) : nullptr; // create the device fields gParam.location = QUDA_CUDA_FIELD_LOCATION; gParam.create = QUDA_NULL_FIELD_CREATE; gParam.order = QUDA_FLOAT2_GAUGE_ORDER; gParam.reconstruct = param->reconstruct; - cudaGaugeField *cudaGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : nullptr; + GaugeField *cudaGauge = !param->use_resident_gauge ? new GaugeField(gParam) : nullptr; profileProject.TPSTOP(QUDA_PROFILE_INIT); if (param->use_resident_gauge) { @@ -5270,14 +5270,14 @@ void updateGaugeFieldQuda(void* gauge, GaugeFieldParam gParam(*param, gauge_h, QUDA_GENERAL_LINKS); bool need_cpu = !param->use_resident_gauge || param->return_result_gauge; gParam.location = QUDA_CPU_FIELD_LOCATION; - cpuGaugeField *cpuGauge = need_cpu ? 
new cpuGaugeField(gParam) : nullptr; + GaugeField *cpuGauge = need_cpu ? new GaugeField(gParam) : nullptr; // create the device fields gParam.location = QUDA_CUDA_FIELD_LOCATION; gParam.create = QUDA_NULL_FIELD_CREATE; gParam.order = QUDA_FLOAT2_GAUGE_ORDER; gParam.reconstruct = param->reconstruct; - cudaGaugeField *cudaGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : nullptr; + GaugeField *cudaGauge = !param->use_resident_gauge ? new GaugeField(gParam) : nullptr; profilePhase.TPSTOP(QUDA_PROFILE_INIT); if (param->use_resident_gauge) { @@ -5334,7 +5334,7 @@ double momActionQuda(void* momentum, QudaGaugeParam* param) gParam.site_offset = param->mom_offset; gParam.site_size = param->site_size; - cpuGaugeField *cpuMom = !param->use_resident_mom ? new cpuGaugeField(gParam) : nullptr; + GaugeField *cpuMom = !param->use_resident_mom ? new GaugeField(gParam) : nullptr; // create the device fields gParam.location = QUDA_CUDA_FIELD_LOCATION; @@ -5342,7 +5342,7 @@ double momActionQuda(void* momentum, QudaGaugeParam* param) gParam.reconstruct = QUDA_RECONSTRUCT_10; gParam.setPrecision(param->cuda_prec, true); - cudaGaugeField *cudaMom = !param->use_resident_mom ? new cudaGaugeField(gParam) : nullptr; + GaugeField *cudaMom = !param->use_resident_mom ? 
new GaugeField(gParam) : nullptr; profileMomAction.TPSTOP(QUDA_PROFILE_INIT); @@ -5384,7 +5384,7 @@ void gaussGaugeQuda(unsigned long long seed, double sigma) if (!gaugePrecise) errorQuda("Cannot generate Gauss GaugeField as there is no resident gauge field"); - cudaGaugeField *data = gaugePrecise; + GaugeField *data = gaugePrecise; profileGauss.TPSTART(QUDA_PROFILE_COMPUTE); quda::gaugeGauss(*data, seed, sigma); @@ -5404,7 +5404,7 @@ void gaussMomQuda(unsigned long long seed, double sigma) if (!momResident) errorQuda("Cannot generate Gauss GaugeField as there is no resident momentum field"); - cudaGaugeField *data = momResident; + GaugeField *data = momResident; profileGauss.TPSTART(QUDA_PROFILE_COMPUTE); quda::gaugeGauss(*data, seed, sigma); @@ -5422,7 +5422,7 @@ void plaqQuda(double plaq[3]) if (!gaugePrecise) errorQuda("Cannot compute plaquette as there is no resident gauge field"); - cudaGaugeField *data = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profilePlaq); + GaugeField *data = extendedGaugeResident ? 
extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profilePlaq); extendedGaugeResident = data; profilePlaq.TPSTART(QUDA_PROFILE_COMPUTE); @@ -5493,13 +5493,13 @@ void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, pushVerbosity(inv_param->verbosity); if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param); - cudaGaugeField *precise = nullptr; + GaugeField *precise = nullptr; if (gaugeSmeared != nullptr) { if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Wuppertal smearing done with gaugeSmeared\n"); GaugeFieldParam gParam(*gaugePrecise); gParam.create = QUDA_NULL_FIELD_CREATE; - precise = new cudaGaugeField(gParam); + precise = new GaugeField(gParam); copyExtendedGauge(*precise, *gaugeSmeared, QUDA_CUDA_FIELD_LOCATION); precise->exchangeGhost(); } else { @@ -5586,9 +5586,9 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par gParam.nFace = 3; // FIXME: need a QudaLinkType with nFace=2. gParam.pad = gParam.pad*gParam.nFace; // - gaugeSmeared = new cudaGaugeField(gParam); + gaugeSmeared = new GaugeField(gParam); - cudaGaugeField *two_link_ext = createExtendedGauge(*gaugePrecise, R, profileGauge);//aux field + GaugeField *two_link_ext = createExtendedGauge(*gaugePrecise, R, profileGauge);//aux field computeTwoLink(*gaugeSmeared, *two_link_ext); @@ -5714,7 +5714,7 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable GaugeFieldParam gParam(*gaugeSmeared); gParam.location = QUDA_CUDA_FIELD_LOCATION; - auto *cudaGaugeTemp = new cudaGaugeField(gParam); + auto *cudaGaugeTemp = new GaugeField(gParam); int measurement_n = 0; // The nth measurement to take gaugeObservablesQuda(&obs_param[measurement_n]); @@ -5820,14 +5820,14 @@ int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const u gParam.location = QUDA_CPU_FIELD_LOCATION; gParam.site_offset = param->gauge_offset; gParam.site_size = param->site_size; - auto *cpuGauge = new 
cpuGaugeField(gParam); + auto *cpuGauge = new GaugeField(gParam); gParam.create = QUDA_NULL_FIELD_CREATE; gParam.location = QUDA_CUDA_FIELD_LOCATION; gParam.link_type = param->type; gParam.reconstruct = param->reconstruct; gParam.setPrecision(gParam.Precision(), true); - auto *cudaInGauge = new cudaGaugeField(gParam); + auto *cudaInGauge = new GaugeField(gParam); GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_INIT); GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_H2D); @@ -5836,7 +5836,7 @@ int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const u GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_H2D); - cudaGaugeField *cudaInGaugeEx = nullptr; + GaugeField *cudaInGaugeEx = nullptr; if (comm_size() == 1) { // perform the update @@ -5898,14 +5898,14 @@ int computeGaugeFixingFFTQuda(void* gauge, const unsigned int gauge_dir, const gParam.location = QUDA_CPU_FIELD_LOCATION; gParam.site_offset = param->gauge_offset; gParam.site_size = param->site_size; - auto *cpuGauge = new cpuGaugeField(gParam); + auto *cpuGauge = new GaugeField(gParam); gParam.create = QUDA_NULL_FIELD_CREATE; gParam.location = QUDA_CUDA_FIELD_LOCATION; gParam.link_type = param->type; gParam.reconstruct = param->reconstruct; gParam.setPrecision(gParam.Precision(), true); - auto *cudaInGauge = new cudaGaugeField(gParam); + auto *cudaInGauge = new GaugeField(gParam); GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_INIT); @@ -6000,7 +6000,7 @@ void gaugeObservablesQuda(QudaGaugeObservableParam *param) if (!gaugePrecise) errorQuda("Cannot compute Polyakov loop as there is no resident gauge field"); - cudaGaugeField *gauge = nullptr; + GaugeField *gauge = nullptr; if (!gaugeSmeared) { if (!extendedGaugeResident) extendedGaugeResident = createExtendedGauge(*gaugePrecise, R, profileGaugeObs); gauge = extendedGaugeResident; diff --git a/lib/lattice_field.cpp b/lib/lattice_field.cpp index 8becce7c7b..8b4b123776 100644 --- a/lib/lattice_field.cpp +++ b/lib/lattice_field.cpp @@ -613,7 +613,7 @@ namespace quda { const ColorSpinorField 
&csField = static_cast(*this); if (csField.FieldOrder() == 2 || csField.FieldOrder() == 4) return static_cast(csField.FieldOrder()); - } else if (typeid(*this) == typeid(const cudaGaugeField)) { + } else if (typeid(*this) == typeid(const GaugeField)) { const GaugeField &gField = static_cast(*this); if (gField.Order() == 2 || gField.Order() == 4) return static_cast(gField.Order()); diff --git a/lib/milc_interface.cpp b/lib/milc_interface.cpp index 781bdb4461..8f33083574 100644 --- a/lib/milc_interface.cpp +++ b/lib/milc_interface.cpp @@ -2573,7 +2573,7 @@ void* qudaCreateGaugeField(void* gauge, int geometry, int precision) void qudaSaveGaugeField(void* gauge, void* inGauge) { qudamilc_called(__func__); - cudaGaugeField* cudaGauge = reinterpret_cast(inGauge); + auto cudaGauge = reinterpret_cast(inGauge); QudaGaugeParam qudaGaugeParam = newMILCGaugeParam(localDim, cudaGauge->Precision(), QUDA_GENERAL_LINKS); saveGaugeFieldQuda(gauge, inGauge, &qudaGaugeParam); qudamilc_called(__func__); diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp index b358c83c55..929849fdec 100644 --- a/lib/multigrid.cpp +++ b/lib/multigrid.cpp @@ -245,9 +245,9 @@ namespace quda popLevel(); } - void MG::resetStaggeredKD(cudaGaugeField *gauge_in, cudaGaugeField *fat_gauge_in, cudaGaugeField *long_gauge_in, - cudaGaugeField *gauge_sloppy_in, cudaGaugeField *fat_gauge_sloppy_in, - cudaGaugeField *long_gauge_sloppy_in, double mass) + void MG::resetStaggeredKD(GaugeField *gauge_in, GaugeField *fat_gauge_in, GaugeField *long_gauge_in, + GaugeField *gauge_sloppy_in, GaugeField *fat_gauge_sloppy_in, + GaugeField *long_gauge_sloppy_in, double mass) { if (param.level != 0) errorQuda("The staggered KD operator can only be updated from level 0"); @@ -509,8 +509,8 @@ namespace quda bool is_coarse_naive_staggered = is_naive_staggered || (is_improved_staggered && param.mg_global.transfer_type[param.level] == QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG); - cudaGaugeField *fine_gauge = 
diracSmoother->getStaggeredShortLinkField(); - cudaGaugeField *sloppy_gauge = mixed_precision_setup ? diracSmootherSloppy->getStaggeredShortLinkField() : fine_gauge; + auto fine_gauge = diracSmoother->getStaggeredShortLinkField(); + auto sloppy_gauge = mixed_precision_setup ? diracSmootherSloppy->getStaggeredShortLinkField() : fine_gauge; xInvKD = AllocateAndBuildStaggeredKahlerDiracInverse( *fine_gauge, diracSmoother->Mass(), param.mg_global.staggered_kd_dagger_approximation == QUDA_BOOLEAN_TRUE); @@ -523,7 +523,7 @@ namespace quda // true is to force FLOAT2 xinv_param.setPrecision(param.mg_global.invert_param->cuda_prec_precondition, true); - xInvKD_sloppy = std::shared_ptr(reinterpret_cast(new cudaGaugeField(xinv_param))); + xInvKD_sloppy = std::shared_ptr(reinterpret_cast(new GaugeField(xinv_param))); xInvKD_sloppy->copy(*xInvKD); ColorSpinorParam sloppy_tmp_param(*tmp_coarse); @@ -544,7 +544,7 @@ namespace quda diracParamKD.mu_factor = 1.0; // doesn't matter diracParamKD.dagger = QUDA_DAG_NO; diracParamKD.matpcType = QUDA_MATPC_EVEN_EVEN; // We can use this to track left vs right block jacobi in the future - diracParamKD.gauge = const_cast(fine_gauge); + diracParamKD.gauge = fine_gauge; diracParamKD.xInvKD = xInvKD.get(); // FIXME: pulling a raw unmanaged pointer out of a unique_ptr... 
diracParamKD.dirac = const_cast(diracSmoother); // used to determine if the outer solve is preconditioned or not diff --git a/lib/staggered_coarse_op.in.cpp b/lib/staggered_coarse_op.in.cpp index 9560f79b58..bab3e1ffba 100644 --- a/lib/staggered_coarse_op.in.cpp +++ b/lib/staggered_coarse_op.in.cpp @@ -7,8 +7,8 @@ namespace quda }; template - void StaggeredCoarseOp2(GaugeField &Y, GaugeField &X, const Transfer &T, const cudaGaugeField &gauge, - const cudaGaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation, + void StaggeredCoarseOp2(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &gauge, + const GaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation, QudaDiracType dirac, QudaMatPCType matpc, IntList) { if (Y.Ncolor() / 2 == coarseColor) { @@ -24,8 +24,8 @@ namespace quda } template - void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const cudaGaugeField &gauge, - const cudaGaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation, + void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &gauge, + const GaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation, QudaDiracType dirac, QudaMatPCType matpc, IntList) { if (gauge.Ncolor() == fineColor) { @@ -43,8 +43,8 @@ namespace quda } } - void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const cudaGaugeField &gauge, - const cudaGaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation, + void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &gauge, + const GaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation, QudaDiracType dirac, QudaMatPCType matpc) { if constexpr (is_enabled_spin(1) && is_enabled_multigrid()) { diff --git a/lib/staggered_coarse_op.in.cu b/lib/staggered_coarse_op.in.cu index 103b242655..3a03467d9a 100644 
--- a/lib/staggered_coarse_op.in.cu +++ b/lib/staggered_coarse_op.in.cu @@ -306,8 +306,8 @@ namespace quda { constexpr int coarseColor = @QUDA_MULTIGRID_NVEC@; template <> - void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const cudaGaugeField &gauge, - const cudaGaugeField &longGauge, const GaugeField &XinvKD, double mass, + void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &gauge, + const GaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation, QudaDiracType dirac, QudaMatPCType matpc) { QudaPrecision precision = checkPrecision(T.Vectors(X.Location()), X, Y); @@ -351,11 +351,11 @@ namespace quda { gf_param.nFace = 1; gf_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; - tmp_U = std::make_unique(gf_param); + tmp_U = std::make_unique(gf_param); need_tmp_U = true; //Copy the cuda gauge field to the cpu - gauge.saveCPUField(reinterpret_cast(*tmp_U)); + tmp_U.get()->copy(gauge); // Create either a real or a dummy L field GaugeFieldParam lgf_param(longGauge.X(), precision, QUDA_RECONSTRUCT_NO, pad, longGauge.Geometry()); @@ -373,12 +373,12 @@ namespace quda { lgf_param.nFace = 3; lgf_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; - tmp_L = std::make_unique(lgf_param); + tmp_L = std::make_unique(lgf_param); need_tmp_L = true; //Copy the cuda gauge field to the cpu if (dirac == QUDA_ASQTAD_DIRAC || dirac == QUDA_ASQTADPC_DIRAC || dirac == QUDA_ASQTADKD_DIRAC) - longGauge.saveCPUField(reinterpret_cast(*tmp_L)); + tmp_L.get()->copy(longGauge); // Create either a real or a dummy Xinv field GaugeFieldParam xgf_param(XinvKD.X(), precision, QUDA_RECONSTRUCT_NO, pad, XinvKD.Geometry()); @@ -400,7 +400,7 @@ namespace quda { xgf_param.nFace = 0; xgf_param.ghostExchange = QUDA_GHOST_EXCHANGE_NO; - tmp_Xinv = std::make_unique(xgf_param); + tmp_Xinv = std::make_unique(xgf_param); need_tmp_Xinv = true; //Copy the cuda gauge field to the cpu @@ -419,7 +419,7 @@ namespace quda { lgf_param.order = 
QUDA_FLOAT2_GAUGE_ORDER; lgf_param.setPrecision(lgf_param.Precision()); lgf_param.create = QUDA_NULL_FIELD_CREATE; - tmp_L = std::make_unique(lgf_param); + tmp_L = std::make_unique(lgf_param); need_tmp_L = true; } else if ((dirac == QUDA_ASQTAD_DIRAC || dirac == QUDA_ASQTADPC_DIRAC || dirac == QUDA_ASQTADKD_DIRAC) && longGauge.Reconstruct() != QUDA_RECONSTRUCT_NO) { // create a copy of the gauge field with no reconstruction @@ -427,7 +427,7 @@ namespace quda { lgf_param.reconstruct = QUDA_RECONSTRUCT_NO; lgf_param.order = QUDA_FLOAT2_GAUGE_ORDER; lgf_param.setPrecision(lgf_param.Precision()); - tmp_L = std::make_unique(lgf_param); + tmp_L = std::make_unique(lgf_param); tmp_L->copy(longGauge); tmp_L->exchangeGhost(); @@ -443,7 +443,7 @@ namespace quda { xgf_param.order = QUDA_FLOAT2_GAUGE_ORDER; xgf_param.setPrecision(xgf_param.Precision()); xgf_param.create = QUDA_NULL_FIELD_CREATE; - tmp_Xinv = std::make_unique(xgf_param); + tmp_Xinv = std::make_unique(xgf_param); need_tmp_Xinv = true; } // no need to worry about XinvKD's reconstruct @@ -454,7 +454,7 @@ namespace quda { gf_param.reconstruct = QUDA_RECONSTRUCT_NO; gf_param.order = QUDA_FLOAT2_GAUGE_ORDER; gf_param.setPrecision(gf_param.Precision()); - tmp_U = std::make_unique(gf_param); + tmp_U = std::make_unique(gf_param); need_tmp_U = true; tmp_U->copy(gauge); diff --git a/lib/staggered_kd_build_xinv.cu b/lib/staggered_kd_build_xinv.cu index 2ed47976f4..b1195d9f4e 100644 --- a/lib/staggered_kd_build_xinv.cu +++ b/lib/staggered_kd_build_xinv.cu @@ -113,7 +113,7 @@ namespace quda { @param mass[in] Mass of staggered fermion @param dagger_approximation[in] Whether or not to use the dagger approximation, using the dagger of X instead of Xinv */ - void BuildStaggeredKahlerDiracInverse(GaugeField &Xinv, const cudaGaugeField &gauge, const double mass, const bool dagger_approximation) + void BuildStaggeredKahlerDiracInverse(GaugeField &Xinv, const GaugeField &gauge, const double mass, const bool dagger_approximation) { 
using namespace blas_lapack; auto invert = use_native() ? native::BatchInvertMatrix : generic::BatchInvertMatrix; @@ -154,13 +154,7 @@ namespace quda { gParam.geometry = QUDA_SCALAR_GEOMETRY; gParam.pad = 0; - if (location == QUDA_CUDA_FIELD_LOCATION) - xInvMilcOrder = std::make_unique(gParam); - else if (location == QUDA_CPU_FIELD_LOCATION) - xInvMilcOrder = std::make_unique(gParam); - else - errorQuda("Invalid field location %d", location); - + xInvMilcOrder = std::make_unique(gParam); } // Step 2: build a host or device gauge field as appropriate, but @@ -190,7 +184,7 @@ namespace quda { gf_param.nFace = 1; gf_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; - tmp_U = std::make_unique(gf_param); + tmp_U = std::make_unique(gf_param); //Copy the cuda gauge field to the cpu tmp_U.get()->copy(gauge); @@ -202,7 +196,7 @@ namespace quda { gf_param.reconstruct = QUDA_RECONSTRUCT_NO; gf_param.order = QUDA_FLOAT2_GAUGE_ORDER; // guaranteed for no recon gf_param.setPrecision( QUDA_SINGLE_PRECISION ); - tmp_U = std::make_unique(gf_param); + tmp_U = std::make_unique(gf_param); tmp_U->copy(gauge); } @@ -216,10 +210,8 @@ namespace quda { if (location == QUDA_CUDA_FIELD_LOCATION) { x_param.order = QUDA_FLOAT2_GAUGE_ORDER; x_param.setPrecision(x_param.Precision()); - tmp_X = std::make_unique(x_param); - } else { - tmp_X = std::make_unique(x_param); } + tmp_X = std::make_unique(x_param); GaugeField& X = *tmp_X; // Step 4: Calculate X from U @@ -241,7 +233,7 @@ namespace quda { GaugeFieldParam param(*xInvMilcOrder); param.order = QUDA_MILC_GAUGE_ORDER; // MILC order == QDP order for Xinv param.setPrecision(QUDA_SINGLE_PRECISION); - cudaGaugeField X_(param); + GaugeField X_(param); X_.copy(X); @@ -268,7 +260,7 @@ namespace quda { // Allocates and calculates the inverse KD block, returning Xinv - std::shared_ptr AllocateAndBuildStaggeredKahlerDiracInverse(const cudaGaugeField &gauge, const double mass, const bool dagger_approximation) + std::shared_ptr 
AllocateAndBuildStaggeredKahlerDiracInverse(const GaugeField &gauge, const double mass, const bool dagger_approximation) { GaugeFieldParam gParam(gauge); gParam.reconstruct = QUDA_RECONSTRUCT_NO; @@ -282,7 +274,7 @@ namespace quda { // latter true is to force FLOAT2 gParam.setPrecision(gauge.Precision(), true); - std::shared_ptr Xinv(reinterpret_cast(new cudaGaugeField(gParam))); + std::shared_ptr Xinv(reinterpret_cast(new GaugeField(gParam))); BuildStaggeredKahlerDiracInverse(*Xinv, gauge, mass, dagger_approximation); diff --git a/tests/covdev_test.cpp b/tests/covdev_test.cpp index ebe5e784b8..0a5d5d38c7 100644 --- a/tests/covdev_test.cpp +++ b/tests/covdev_test.cpp @@ -25,7 +25,7 @@ using namespace quda; QudaGaugeParam gauge_param; QudaInvertParam inv_param; -cpuGaugeField *cpuLink = nullptr; +GaugeField *cpuLink = nullptr; std::unique_ptr spinor, spinorOut, spinorRef; std::unique_ptr cudaSpinor, cudaSpinorOut; @@ -94,7 +94,7 @@ void init(int argc, char **argv) // cpuLink is only used for ghost allocation GaugeFieldParam cpuParam(gauge_param, links); cpuParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; - cpuLink = new cpuGaugeField(cpuParam); + cpuLink = new GaugeField(cpuParam); printfQuda("Links sending..."); loadGaugeQuda(links, &gauge_param); diff --git a/tests/gauge_alg_test.cpp b/tests/gauge_alg_test.cpp index adf22a0f30..00ba1f3689 100644 --- a/tests/gauge_alg_test.cpp +++ b/tests/gauge_alg_test.cpp @@ -108,7 +108,7 @@ class GaugeAlgTest : public ::testing::Test gParam.x[d] += 2 * gParam.r[d]; } - U = new cudaGaugeField(gParam); + U = new GaugeField(gParam); RNG randstates(*U, 1234); @@ -160,12 +160,12 @@ class GaugeAlgTest : public ::testing::Test for (int d = 0; d < 4; d++) if (comm_dim_partitioned(d)) R[d] = 2; static TimeProfile GaugeFix("GaugeFix"); - cudaGaugeField *tmp = new cudaGaugeField(gauge_field_param); + GaugeField *tmp = new GaugeField(gauge_field_param); tmp->copy(*host); U = createExtendedGauge(*tmp, R, GaugeFix); delete tmp; } else { - U = 
new cudaGaugeField(gauge_field_param); + U = new GaugeField(gauge_field_param); U->copy(*host); } @@ -266,7 +266,7 @@ class GaugeAlgTest : public ::testing::Test gParam.reconstruct = param.reconstruct; gParam.setPrecision(gParam.Precision(), true); - cudaGaugeField *gauge = new cudaGaugeField(gParam); + GaugeField *gauge = new GaugeField(gParam); // copy into regular field copyExtendedGauge(*gauge, *U, QUDA_CUDA_FIELD_LOCATION); diff --git a/tests/gauge_path_test.cpp b/tests/gauge_path_test.cpp index 56c2146b4b..7d37c9faad 100644 --- a/tests/gauge_path_test.cpp +++ b/tests/gauge_path_test.cpp @@ -128,13 +128,13 @@ void gauge_force_test(bool compute_force = true) param.create = QUDA_NULL_FIELD_CREATE; param.order = QUDA_QDP_GAUGE_ORDER; param.location = QUDA_CPU_FIELD_LOCATION; - quda::cpuGaugeField U_qdp(param); + quda::GaugeField U_qdp(param); // fills the gauge field with random numbers createSiteLinkCPU(U_qdp, gauge_param.cpu_prec, 0); param.order = QUDA_MILC_GAUGE_ORDER; - quda::cpuGaugeField U_milc(param); + quda::GaugeField U_milc(param); if (gauge_order == QUDA_MILC_GAUGE_ORDER) U_milc.copy(U_qdp); if (compute_force) { param.reconstruct = QUDA_RECONSTRUCT_10; @@ -143,11 +143,11 @@ void gauge_force_test(bool compute_force = true) param.reconstruct = QUDA_RECONSTRUCT_NO; } param.create = QUDA_ZERO_FIELD_CREATE; - quda::cpuGaugeField Mom_milc(param); - quda::cpuGaugeField Mom_ref_milc(param); + quda::GaugeField Mom_milc(param); + quda::GaugeField Mom_ref_milc(param); param.order = QUDA_QDP_GAUGE_ORDER; - quda::cpuGaugeField Mom_qdp(param); + quda::GaugeField Mom_qdp(param); // initialize some data in cpuMom if (compute_force) { @@ -260,13 +260,13 @@ void gauge_loop_test() param.create = QUDA_NULL_FIELD_CREATE; param.order = QUDA_QDP_GAUGE_ORDER; param.location = QUDA_CPU_FIELD_LOCATION; - quda::cpuGaugeField U_qdp(param); + quda::GaugeField U_qdp(param); // fills the gauge field with random numbers createSiteLinkCPU(U_qdp, gauge_param.cpu_prec, 0); param.order 
= QUDA_MILC_GAUGE_ORDER; - quda::cpuGaugeField U_milc(param); + quda::GaugeField U_milc(param); if (gauge_order == QUDA_MILC_GAUGE_ORDER) U_milc.copy(U_qdp); void *sitelink = nullptr; diff --git a/tests/heatbath_test.cpp b/tests/heatbath_test.cpp index d840fda4c7..4ad648958b 100644 --- a/tests/heatbath_test.cpp +++ b/tests/heatbath_test.cpp @@ -110,7 +110,7 @@ int main(int argc, char **argv) gParam.link_type = gauge_param.type; gParam.reconstruct = gauge_param.reconstruct; gParam.setPrecision(gParam.Precision(), true); - cudaGaugeField *gauge = new cudaGaugeField(gParam); + GaugeField *gauge = new GaugeField(gParam); int pad = 0; lat_dim_t y; @@ -126,7 +126,7 @@ int main(int argc, char **argv) gParamEx.t_boundary = gParam.t_boundary; gParamEx.nFace = 1; for (int dir = 0; dir < 4; ++dir) gParamEx.r[dir] = R[dir]; - cudaGaugeField *gaugeEx = new cudaGaugeField(gParamEx); + GaugeField *gaugeEx = new GaugeField(gParamEx); // CURAND random generator initialization RNG *randstates = new RNG(*gauge, 1234); diff --git a/tests/hisq_paths_force_test.cpp b/tests/hisq_paths_force_test.cpp index 58b8299223..07f7b4e17b 100644 --- a/tests/hisq_paths_force_test.cpp +++ b/tests/hisq_paths_force_test.cpp @@ -15,34 +15,34 @@ using namespace quda; -cpuGaugeField *cpuGauge = NULL; -cudaGaugeField *cudaForce = NULL; -cpuGaugeField *cpuForce = NULL; -cpuGaugeField *hostVerifyForce = NULL; +GaugeField *cpuGauge = NULL; +GaugeField *cudaForce = NULL; +GaugeField *cpuForce = NULL; +GaugeField *hostVerifyForce = NULL; -cudaGaugeField *cudaMom = NULL; -cpuGaugeField *cpuMom = NULL; -cpuGaugeField *refMom = NULL; +GaugeField *cudaMom = NULL; +GaugeField *cpuMom = NULL; +GaugeField *refMom = NULL; QudaGaugeFieldOrder gauge_order = QUDA_QDP_GAUGE_ORDER; -cpuGaugeField *cpuOprod = NULL; -cudaGaugeField *cudaOprod = NULL; -cpuGaugeField *cpuLongLinkOprod = NULL; -cudaGaugeField *cudaLongLinkOprod = NULL; +GaugeField *cpuOprod = NULL; +GaugeField *cudaOprod = NULL; +GaugeField *cpuLongLinkOprod = 
NULL; +GaugeField *cudaLongLinkOprod = NULL; int ODD_BIT = 1; QudaPrecision force_prec = QUDA_DOUBLE_PRECISION; -cudaGaugeField *cudaGauge_ex = NULL; -cpuGaugeField *cpuGauge_ex = NULL; -cudaGaugeField *cudaForce_ex = NULL; -cpuGaugeField *cpuForce_ex = NULL; -cpuGaugeField *cpuOprod_ex = NULL; -cudaGaugeField *cudaOprod_ex = NULL; -cpuGaugeField *cpuLongLinkOprod_ex = NULL; -cudaGaugeField *cudaLongLinkOprod_ex = NULL; +GaugeField *cudaGauge_ex = NULL; +GaugeField *cpuGauge_ex = NULL; +GaugeField *cudaForce_ex = NULL; +GaugeField *cpuForce_ex = NULL; +GaugeField *cpuOprod_ex = NULL; +GaugeField *cudaOprod_ex = NULL; +GaugeField *cpuLongLinkOprod_ex = NULL; +GaugeField *cudaLongLinkOprod_ex = NULL; static void setPrecision(QudaPrecision precision) { @@ -227,7 +227,7 @@ static void hisq_force_startup() gParam_ex.r[d] = (comm_dim_partitioned(d)) ? 2 : 0; gParam_ex.x[d] = X[d] + 2 * gParam_ex.r[d]; } // set halo region for GPU - cudaGauge_ex = new cudaGaugeField(gParam_ex); + cudaGauge_ex = new GaugeField(gParam_ex); // Create the host gauge field memcpy(&qudaGaugeParam_ex, &qudaGaugeParam, sizeof(QudaGaugeParam)); @@ -238,7 +238,7 @@ static void hisq_force_startup() gParam.create = QUDA_NULL_FIELD_CREATE; gParam.link_type = QUDA_GENERAL_LINKS; gParam.order = gauge_order; - cpuGauge = new cpuGaugeField(gParam); + cpuGauge = new GaugeField(gParam); gParam_ex = GaugeFieldParam(qudaGaugeParam_ex); gParam.location = QUDA_CPU_FIELD_LOCATION; @@ -250,7 +250,7 @@ static void hisq_force_startup() gParam_ex.r[d] = R[d]; gParam_ex.x[d] = gParam.x[d] + 2 * gParam_ex.r[d]; } // set halo region for CPU - cpuGauge_ex = new cpuGaugeField(gParam_ex); + cpuGauge_ex = new GaugeField(gParam_ex); auto generated_link_type = (link_recon == QUDA_RECONSTRUCT_NO ? 
SITELINK_PHASE_NO : @@ -279,8 +279,8 @@ static void hisq_force_startup() gParam.create = QUDA_NULL_FIELD_CREATE; gParam.link_type = QUDA_GENERAL_LINKS; gParam.order = gauge_order; - cpuForce = new cpuGaugeField(gParam); - hostVerifyForce = new cpuGaugeField(gParam); + cpuForce = new GaugeField(gParam); + hostVerifyForce = new GaugeField(gParam); gParam_ex.location = QUDA_CPU_FIELD_LOCATION; gParam_ex.reconstruct = QUDA_RECONSTRUCT_NO; @@ -292,7 +292,7 @@ static void hisq_force_startup() gParam_ex.r[d] = R[d]; gParam_ex.x[d] = gParam.x[d] + 2 * gParam_ex.r[d]; } - cpuForce_ex = new cpuGaugeField(gParam_ex); + cpuForce_ex = new GaugeField(gParam_ex); // create the momentum matrix gParam.location = QUDA_CPU_FIELD_LOCATION; @@ -302,8 +302,8 @@ static void hisq_force_startup() gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO; gParam.order = QUDA_MILC_GAUGE_ORDER; gParam.create = QUDA_NULL_FIELD_CREATE; - cpuMom = new cpuGaugeField(gParam); - refMom = new cpuGaugeField(gParam); + cpuMom = new GaugeField(gParam); + refMom = new GaugeField(gParam); /********************************** * Create the outer product fields * @@ -316,8 +316,8 @@ static void hisq_force_startup() gParam.link_type = QUDA_GENERAL_LINKS; gParam.reconstruct = QUDA_RECONSTRUCT_NO; gParam.order = gauge_order; - cpuOprod = new cpuGaugeField(gParam); - cpuLongLinkOprod = new cpuGaugeField(gParam); + cpuOprod = new GaugeField(gParam); + cpuLongLinkOprod = new GaugeField(gParam); // Create extended outer product fields gParam_ex.location = QUDA_CPU_FIELD_LOCATION; @@ -328,8 +328,8 @@ static void hisq_force_startup() gParam_ex.r[d] = R[d]; gParam_ex.x[d] = gParam.x[d] + 2 * gParam_ex.r[d]; } // set halo region for CPU - cpuOprod_ex = new cpuGaugeField(gParam_ex); - cpuLongLinkOprod_ex = new cpuGaugeField(gParam_ex); + cpuOprod_ex = new GaugeField(gParam_ex); + cpuLongLinkOprod_ex = new GaugeField(gParam_ex); // initialize the CPU outer product fields and exchange once createStagForOprodCPU(stag_for_oprod, 
force_prec, qudaGaugeParam.X, *rng); @@ -352,9 +352,9 @@ static void hisq_force_startup() gParam_ex.r[d] = (comm_dim_partitioned(d)) ? 2 : 0; gParam_ex.x[d] = gParam.x[d] + 2 * gParam_ex.r[d]; } // set halo region - cudaForce_ex = new cudaGaugeField(gParam_ex); - cudaOprod_ex = new cudaGaugeField(gParam_ex); - cudaLongLinkOprod_ex = new cudaGaugeField(gParam_ex); + cudaForce_ex = new GaugeField(gParam_ex); + cudaOprod_ex = new GaugeField(gParam_ex); + cudaLongLinkOprod_ex = new GaugeField(gParam_ex); // create a device force for verify gParam.location = QUDA_CUDA_FIELD_LOCATION; @@ -362,7 +362,7 @@ static void hisq_force_startup() gParam.reconstruct = QUDA_RECONSTRUCT_NO; gParam.link_type = QUDA_GENERAL_LINKS; gParam.setPrecision(prec, true); - cudaForce = new cudaGaugeField(gParam); + cudaForce = new GaugeField(gParam); // create the device momentum field gParam.location = QUDA_CUDA_FIELD_LOCATION; @@ -370,7 +370,7 @@ static void hisq_force_startup() gParam.reconstruct = QUDA_RECONSTRUCT_10; gParam.link_type = QUDA_ASQTAD_MOM_LINKS; gParam.setPrecision(prec, true); - cudaMom = new cudaGaugeField(gParam); + cudaMom = new GaugeField(gParam); /******************************************************************** * Copy to and exchange gauge and outer product fields on the device * diff --git a/tests/hisq_unitarize_force_test.cpp b/tests/hisq_unitarize_force_test.cpp index d27b09bfd8..41b977b8e1 100644 --- a/tests/hisq_unitarize_force_test.cpp +++ b/tests/hisq_unitarize_force_test.cpp @@ -12,16 +12,16 @@ #include #include -quda::cudaGaugeField *cudaFatLink = NULL; -quda::cpuGaugeField *cpuFatLink = NULL; +quda::GaugeField *cudaFatLink = NULL; +quda::GaugeField *cpuFatLink = NULL; -quda::cudaGaugeField *cudaOprod = NULL; -quda::cpuGaugeField *cpuOprod = NULL; +quda::GaugeField *cudaOprod = NULL; +quda::GaugeField *cpuOprod = NULL; -quda::cudaGaugeField *cudaResult = NULL; -quda::cpuGaugeField *cpuResult = NULL; +quda::GaugeField *cudaResult = NULL; +quda::GaugeField 
*cpuResult = NULL; -quda::cpuGaugeField *cpuReference = NULL; +quda::GaugeField *cpuReference = NULL; static QudaGaugeParam gaugeParam; @@ -66,10 +66,10 @@ static void hisq_force_init() gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO; gParam.anisotropy = 1; - cpuFatLink = new quda::cpuGaugeField(gParam); - cpuOprod = new quda::cpuGaugeField(gParam); - cpuResult = new quda::cpuGaugeField(gParam); - cpuReference = new quda::cpuGaugeField(gParam); + cpuFatLink = new quda::GaugeField(gParam); + cpuOprod = new quda::GaugeField(gParam); + cpuResult = new quda::GaugeField(gParam); + cpuReference = new quda::GaugeField(gParam); // create "gauge fields" int seed = 0; @@ -83,9 +83,9 @@ static void hisq_force_init() gParam.location = QUDA_CUDA_FIELD_LOCATION; gParam.setPrecision(gaugeParam.cuda_prec, true); - cudaFatLink = new quda::cudaGaugeField(gParam); - cudaOprod = new quda::cudaGaugeField(gParam); - cudaResult = new quda::cudaGaugeField(gParam); + cudaFatLink = new quda::GaugeField(gParam); + cudaOprod = new quda::GaugeField(gParam); + cudaResult = new quda::GaugeField(gParam); gParam.order = QUDA_QDP_GAUGE_ORDER; diff --git a/tests/host_reference/domain_wall_dslash_reference.cpp b/tests/host_reference/domain_wall_dslash_reference.cpp index 5fba06fe30..29edd18a44 100644 --- a/tests/host_reference/domain_wall_dslash_reference.cpp +++ b/tests/host_reference/domain_wall_dslash_reference.cpp @@ -763,7 +763,7 @@ void dw_dslash(void *out, void *const *gauge, void *in, int oddBit, int daggerBi { GaugeFieldParam gauge_field_param(gauge_param, (void **)gauge); gauge_field_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; - cpuGaugeField cpu(gauge_field_param); + GaugeField cpu(gauge_field_param); void *ghostGauge[4] = {cpu.Ghost()[0].data(), cpu.Ghost()[1].data(), cpu.Ghost()[2].data(), cpu.Ghost()[3].data()}; // Get spinor ghost fields @@ -830,7 +830,7 @@ void dslash_4_4d(void *out, void *const *gauge, void *in, int oddBit, int dagger { GaugeFieldParam gauge_field_param(gauge_param, 
(void **)gauge); gauge_field_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; - cpuGaugeField cpu(gauge_field_param); + GaugeField cpu(gauge_field_param); void *ghostGauge[4] = {cpu.Ghost()[0].data(), cpu.Ghost()[1].data(), cpu.Ghost()[2].data(), cpu.Ghost()[3].data()}; // Get spinor ghost fields @@ -1318,7 +1318,7 @@ void mdw_mdagm_local(void *out, void *const *gauge, void *in, double _Complex *k lat_dim_t R; for (int d = 0; d < 4; d++) { R[d] = comm_dim_partitioned(d) ? 2 : 0; } - cpuGaugeField *padded_gauge = createExtendedGauge((void **)gauge, gauge_param, R); + GaugeField *padded_gauge = createExtendedGauge((void **)gauge, gauge_param, R); int padded_V = 1; int W[4]; diff --git a/tests/host_reference/dslash_test_helpers.cpp b/tests/host_reference/dslash_test_helpers.cpp index b46b69ff75..be2a7cac18 100644 --- a/tests/host_reference/dslash_test_helpers.cpp +++ b/tests/host_reference/dslash_test_helpers.cpp @@ -7,9 +7,9 @@ using namespace quda; // need a better solution here but as long as they gauge field live in interface probably ok -extern cudaGaugeField *gaugePrecise; -extern cudaGaugeField *gaugeFatPrecise; -extern cudaGaugeField *gaugeLongPrecise; +extern GaugeField *gaugePrecise; +extern GaugeField *gaugeFatPrecise; +extern GaugeField *gaugeLongPrecise; void dslashQuda_4dpc(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, dslash_test_type test_type) { diff --git a/tests/host_reference/gauge_force_reference.cpp b/tests/host_reference/gauge_force_reference.cpp index a895730b33..83c5251e27 100644 --- a/tests/host_reference/gauge_force_reference.cpp +++ b/tests/host_reference/gauge_force_reference.cpp @@ -9,6 +9,7 @@ #include "host_utils.h" #include "misc.h" #include "gauge_force_reference.h" +#include "timer.h" extern int Z[4]; extern int V; @@ -491,6 +492,8 @@ void gauge_force_reference(void *refMom, double eb3, quda::GaugeField &u, int ** param.t_boundary = QUDA_PERIODIC_T; auto qdp_ex = quda::createExtendedGauge((void **)sitelink, 
param, R); + //quda::TimeProfile dummy("blah"); + //auto qdp_ex = quda::createExtendedGauge(u, R, dummy); lattice_t lat(*qdp_ex); void *sitelink_ex[] = {qdp_ex->data(0), qdp_ex->data(1), qdp_ex->data(2), qdp_ex->data(3)}; diff --git a/tests/host_reference/hisq_force_reference.cpp b/tests/host_reference/hisq_force_reference.cpp index 9ed6c0913d..58c6762e70 100644 --- a/tests/host_reference/hisq_force_reference.cpp +++ b/tests/host_reference/hisq_force_reference.cpp @@ -1197,8 +1197,8 @@ void doHisqStaplesForceCPU(const int dim[4], PathCoefficients staple_coe #undef Qmu #undef Qnumu -void hisqStaplesForceCPU(const double *path_coeff, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, - quda::cpuGaugeField *newOprod) +void hisqStaplesForceCPU(const double *path_coeff, quda::GaugeField &oprod, quda::GaugeField &link, + quda::GaugeField *newOprod) { int X_[4]; for (int d = 0; d < 4; d++) X_[d] = oprod.X()[d] - 2 * oprod.R()[d]; @@ -1301,8 +1301,8 @@ void computeLongLinkField(const int dim[4], const Real *const oprod, const Real } } -void hisqLongLinkForceCPU(double coeff, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, - quda::cpuGaugeField *newOprod) +void hisqLongLinkForceCPU(double coeff, quda::GaugeField &oprod, quda::GaugeField &link, + quda::GaugeField *newOprod) { int X_[4]; for (int d = 0; d < 4; d++) X_[d] = oprod.X()[d] - 2 * oprod.R()[d]; @@ -1360,7 +1360,7 @@ void completeForceField(const int dim[4], const Real *const oprod, const Real *c for (int site = 0; site < half_volume; ++site) { completeForceSite(site, dim, oprod, link, sig, ls, mom); } } -void hisqCompleteForceCPU(quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, quda::cpuGaugeField *mom) +void hisqCompleteForceCPU(quda::GaugeField &oprod, quda::GaugeField &link, quda::GaugeField *mom) { int X_[4]; for (int d = 0; d < 4; d++) X_[d] = oprod.X()[d] - 2 * oprod.R()[d]; diff --git a/tests/host_reference/hisq_force_reference.h b/tests/host_reference/hisq_force_reference.h index 
6e5e2923e4..fb8b773f84 100644 --- a/tests/host_reference/hisq_force_reference.h +++ b/tests/host_reference/hisq_force_reference.h @@ -21,8 +21,8 @@ void computeLinkOrderedOuterProduct(void *src, void *dest, QudaPrecision precisi @param[in] link Gauge field links @param[out] newOprod Force accumulated with fat link contributions */ -void hisqStaplesForceCPU(const double *path_coeff, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, - quda::cpuGaugeField *newOprod); +void hisqStaplesForceCPU(const double *path_coeff, quda::GaugeField &oprod, quda::GaugeField &link, + quda::GaugeField *newOprod); /** @brief Compute the force contribution from the long link, CPU version @@ -31,8 +31,8 @@ void hisqStaplesForceCPU(const double *path_coeff, quda::cpuGaugeField &oprod, q @param[in] link Gauge field links @param[out] newOprod Force accumulated with fat link contributions */ -void hisqLongLinkForceCPU(double coeff, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, - quda::cpuGaugeField *newOprod); +void hisqLongLinkForceCPU(double coeff, quda::GaugeField &oprod, quda::GaugeField &link, + quda::GaugeField *newOprod); /** @brief Accumulate the force contributions into the momentum field, CPU version @@ -40,6 +40,6 @@ void hisqLongLinkForceCPU(double coeff, quda::cpuGaugeField &oprod, quda::cpuGau @param[in] link Gauge field links @param[out] mom Accumulated momentum */ -void hisqCompleteForceCPU(quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, quda::cpuGaugeField *mom); +void hisqCompleteForceCPU(quda::GaugeField &oprod, quda::GaugeField &link, quda::GaugeField *mom); #endif diff --git a/tests/host_reference/wilson_dslash_reference.cpp b/tests/host_reference/wilson_dslash_reference.cpp index fbe5aa241d..471f79c38d 100644 --- a/tests/host_reference/wilson_dslash_reference.cpp +++ b/tests/host_reference/wilson_dslash_reference.cpp @@ -192,7 +192,7 @@ void wil_dslash(void *out, void **gauge, void *in, int oddBit, int daggerBit, Qu GaugeFieldParam 
gauge_field_param(gauge_param, gauge); gauge_field_param.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; gauge_field_param.location = QUDA_CPU_FIELD_LOCATION; - cpuGaugeField cpu(gauge_field_param); + GaugeField cpu(gauge_field_param); void *ghostGauge[4] = {cpu.Ghost()[0].data(), cpu.Ghost()[1].data(), cpu.Ghost()[2].data(), cpu.Ghost()[3].data()}; // Get spinor ghost fields diff --git a/tests/multigrid_benchmark_test.cpp b/tests/multigrid_benchmark_test.cpp index f954abe366..517d48de51 100644 --- a/tests/multigrid_benchmark_test.cpp +++ b/tests/multigrid_benchmark_test.cpp @@ -23,7 +23,7 @@ using namespace quda; std::vector xD, yD; -cudaGaugeField *Y_d, *X_d, *Xinv_d, *Yhat_d; +GaugeField *Y_d, *X_d, *Xinv_d, *Yhat_d; int Ncolor; @@ -97,14 +97,14 @@ void initFields(QudaPrecision prec) gParam.location = QUDA_CUDA_FIELD_LOCATION; gParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; - Y_d = new cudaGaugeField(gParam); - Yhat_d = new cudaGaugeField(gParam); + Y_d = new GaugeField(gParam); + Yhat_d = new GaugeField(gParam); gParam.geometry = QUDA_SCALAR_GEOMETRY; gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO; gParam.nFace = 0; - X_d = new cudaGaugeField(gParam); - Xinv_d = new cudaGaugeField(gParam); + X_d = new GaugeField(gParam); + Xinv_d = new GaugeField(gParam); // insert random noise into the gauge fields { diff --git a/tests/multigrid_evolve_test.cpp b/tests/multigrid_evolve_test.cpp index 2436ddabf7..9545942a51 100644 --- a/tests/multigrid_evolve_test.cpp +++ b/tests/multigrid_evolve_test.cpp @@ -225,11 +225,11 @@ int main(int argc, char **argv) gParam.link_type = gauge_param.type; gParam.reconstruct = gauge_param.reconstruct; gParam.setPrecision(gParam.Precision(), true); - cudaGaugeField gauge(gParam); + GaugeField gauge(gParam); int pad = 0; - lat_dim_t y; - lat_dim_t R; + lat_dim_t y = {}; + lat_dim_t R = {}; for (int dir = 0; dir < 4; ++dir) if (comm_dim_partitioned(dir)) R[dir] = 2; for (int dir = 0; dir < 4; ++dir) y[dir] = gauge_param.X[dir] + 2 * R[dir]; @@ -241,7 
+241,8 @@ int main(int argc, char **argv) gParamEx.t_boundary = gParam.t_boundary; gParamEx.nFace = 1; gParamEx.r = R; - cudaGaugeField gaugeEx(gParamEx); + + GaugeField gaugeEx(gParamEx); QudaGaugeObservableParam obs_param = newQudaGaugeObservableParam(); obs_param.compute_plaquette = QUDA_BOOLEAN_TRUE; diff --git a/tests/pack_test.cpp b/tests/pack_test.cpp index 694c993895..fe68c2645c 100644 --- a/tests/pack_test.cpp +++ b/tests/pack_test.cpp @@ -108,12 +108,12 @@ void packTest() param.gauge_order = QUDA_CPS_WILSON_GAUGE_ORDER; GaugeFieldParam cpsParam(param, cpsCpuGauge_p); - cpuGaugeField cpsCpuGauge(cpsParam); + GaugeField cpsCpuGauge(cpsParam); cpsParam.create = QUDA_NULL_FIELD_CREATE; cpsParam.reconstruct = param.reconstruct; cpsParam.setPrecision(param.cuda_prec, true); cpsParam.pad = param.ga_pad; - cudaGaugeField cudaCpsGauge(cpsParam); + GaugeField cudaCpsGauge(cpsParam); host_timer.start(); cudaCpsGauge.copy(cpsCpuGauge); @@ -121,7 +121,7 @@ void packTest() printfQuda("CPS Gauge send time = %e seconds\n", host_timer.last()); host_timer.start(); - cpuCpuGauge.copy(cudaCpsGauge); + cpsCpuGauge.copy(cudaCpsGauge); host_timer.stop(); printfQuda("CPS Gauge restore time = %e seconds\n", host_timer.last()); } @@ -132,12 +132,12 @@ void packTest() param.gauge_order = QUDA_QDP_GAUGE_ORDER; GaugeFieldParam qdpParam(param, qdpCpuGauge_p); - cpuGaugeField qdpCpuGauge(qdpParam); + GaugeField qdpCpuGauge(qdpParam); qdpParam.create = QUDA_NULL_FIELD_CREATE; qdpParam.reconstruct = param.reconstruct; qdpParam.setPrecision(param.cuda_prec, true); qdpParam.pad = param.ga_pad; - cudaGaugeField cudaQdpGauge(qdpParam); + GaugeField cudaQdpGauge(qdpParam); host_timer.start(); cudaQdpGauge.copy(qdpCpuGauge); diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h index 3ff8fecb5a..5ee2616ad8 100644 --- a/tests/staggered_dslash_test_utils.h +++ b/tests/staggered_dslash_test_utils.h @@ -52,8 +52,8 @@ struct StaggeredDslashTestWrapper { void 
*milc_fatlink_gpu; void *milc_longlink_gpu; - cpuGaugeField *cpuFat = nullptr; - cpuGaugeField *cpuLong = nullptr; + GaugeField *cpuFat = nullptr; + GaugeField *cpuLong = nullptr; ColorSpinorField spinor; ColorSpinorField spinorOut; @@ -204,14 +204,14 @@ struct StaggeredDslashTestWrapper { gauge_param.reconstruct = QUDA_RECONSTRUCT_NO; GaugeFieldParam cpuFatParam(gauge_param, milc_fatlink_cpu); cpuFatParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; - cpuFat = new cpuGaugeField(cpuFatParam); + cpuFat = new GaugeField(cpuFatParam); for (int i = 0; i < 4; i++) ghost_fatlink_cpu[i] = cpuFat->Ghost()[i].data(); if (dslash_type == QUDA_ASQTAD_DSLASH) { gauge_param.type = QUDA_ASQTAD_LONG_LINKS; GaugeFieldParam cpuLongParam(gauge_param, milc_longlink_cpu); cpuLongParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; - cpuLong = new cpuGaugeField(cpuLongParam); + cpuLong = new GaugeField(cpuLongParam); for (int i = 0; i < 4; i++) ghost_longlink_cpu[i] = cpuLong ? cpuLong->Ghost()[i].data() : nullptr; } #endif diff --git a/tests/unitarize_link_test.cpp b/tests/unitarize_link_test.cpp index 2d9dc14210..4cd8553fdd 100644 --- a/tests/unitarize_link_test.cpp +++ b/tests/unitarize_link_test.cpp @@ -32,8 +32,8 @@ static double max_allowed_error = 1e-11; static QudaGaugeFieldOrder gauge_order = QUDA_MILC_GAUGE_ORDER; -quda::cpuGaugeField *cpuFatLink, *cpuULink, *cudaResult; -quda::cudaGaugeField *cudaFatLink, *cudaULink; +quda::GaugeField *cpuFatLink, *cpuULink, *cudaResult; +quda::GaugeField *cudaFatLink, *cudaULink; const double unittol = (prec == QUDA_DOUBLE_PRECISION) ? 
1e-10 : 1e-6; @@ -124,21 +124,21 @@ static int unitarize_link_test(int &test_rc) gParam.create = QUDA_REFERENCE_FIELD_CREATE; gParam.gauge = fatlink; gParam.location = QUDA_CPU_FIELD_LOCATION; - cpuFatLink = new quda::cpuGaugeField(gParam); + cpuFatLink = new quda::GaugeField(gParam); gParam.create = QUDA_ZERO_FIELD_CREATE; - cpuULink = new quda::cpuGaugeField(gParam); + cpuULink = new quda::GaugeField(gParam); gParam.create = QUDA_ZERO_FIELD_CREATE; - cudaResult = new quda::cpuGaugeField(gParam); + cudaResult = new quda::GaugeField(gParam); gParam.pad = 0; gParam.create = QUDA_NULL_FIELD_CREATE; gParam.reconstruct = QUDA_RECONSTRUCT_NO; gParam.setPrecision(prec, true); gParam.location = QUDA_CUDA_FIELD_LOCATION; - cudaFatLink = new quda::cudaGaugeField(gParam); - cudaULink = new quda::cudaGaugeField(gParam); + cudaFatLink = new quda::GaugeField(gParam); + cudaULink = new quda::GaugeField(gParam); { // create fat links double act_path_coeff[6]; From 19aa064690b96a339959d3c6e525dc1f25022957 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 16 May 2023 15:50:54 -0700 Subject: [PATCH 08/99] Add null, move and copy constructors, as well as copy and move assignment operators for GaugeField --- include/color_spinor_field.h | 18 +-- include/gauge_field.h | 294 ++++++++++++++++++----------------- include/malloc_quda.h | 2 +- lib/gauge_field.cpp | 275 +++++++++++++++++++------------- 4 files changed, 325 insertions(+), 264 deletions(-) diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h index 76fa31b943..1bfd1be413 100644 --- a/include/color_spinor_field.h +++ b/include/color_spinor_field.h @@ -121,18 +121,13 @@ namespace quda } }; - class ColorSpinorParam : public LatticeFieldParam - { - - public: + struct ColorSpinorParam : public LatticeFieldParam { int nColor = 0; // Number of colors of the field int nSpin = 0; // =1 for staggered, =2 for coarse Dslash, =4 for 4d spinor int nVec = 1; // number of packed vectors (for multigrid transfer 
operator) QudaTwistFlavorType twistFlavor = QUDA_TWIST_INVALID; // used by twisted mass - QudaSiteOrder siteOrder = QUDA_INVALID_SITE_ORDER; // defined for full fields - QudaFieldOrder fieldOrder = QUDA_INVALID_FIELD_ORDER; // Float, Float2, Float4 etc. QudaGammaBasis gammaBasis = QUDA_INVALID_GAMMA_BASIS; QudaFieldCreate create = QUDA_INVALID_FIELD_CREATE; @@ -179,7 +174,6 @@ namespace quda ColorSpinorParam() = default; // used to create cpu params - ColorSpinorParam(void *V, QudaInvertParam &inv_param, const lat_dim_t &X, const bool pc_solution, QudaFieldLocation location = QUDA_CPU_FIELD_LOCATION) : LatticeFieldParam(4, X, 0, location, inv_param.cpu_prec), @@ -188,20 +182,12 @@ namespace quda || inv_param.dslash_type == QUDA_LAPLACE_DSLASH) ? 1 : 4), - nVec(1), twistFlavor(inv_param.twist_flavor), - siteOrder(QUDA_INVALID_SITE_ORDER), - fieldOrder(QUDA_INVALID_FIELD_ORDER), gammaBasis(inv_param.gamma_basis), create(QUDA_REFERENCE_FIELD_CREATE), pc_type(inv_param.dslash_type == QUDA_DOMAIN_WALL_DSLASH ? 
QUDA_5D_PC : QUDA_4D_PC), - v(V), - is_composite(false), - composite_dim(0), - is_component(false), - component_id(0) + v(V) { - if (nDim > QUDA_MAX_DIM) errorQuda("Number of dimensions too great"); for (int d = 0; d < nDim; d++) x[d] = X[d]; diff --git a/include/gauge_field.h b/include/gauge_field.h index 23fb8939e3..71e1628370 100644 --- a/include/gauge_field.h +++ b/include/gauge_field.h @@ -37,99 +37,60 @@ namespace quda { } // namespace gauge struct GaugeFieldParam : public LatticeFieldParam { + int nColor = 3; + int nFace = 0; - int nColor; - int nFace; + QudaReconstructType reconstruct = QUDA_RECONSTRUCT_NO; + QudaGaugeFieldOrder order = QUDA_INVALID_GAUGE_ORDER; + QudaGaugeFixed fixed = QUDA_GAUGE_FIXED_NO; + QudaLinkType link_type = QUDA_WILSON_LINKS; + QudaTboundary t_boundary = QUDA_INVALID_T_BOUNDARY; - QudaReconstructType reconstruct; - QudaGaugeFieldOrder order; - QudaGaugeFixed fixed; - QudaLinkType link_type; - QudaTboundary t_boundary; + double anisotropy = 1.0; + double tadpole = 1.0; + GaugeField *field = nullptr; // pointer to a pre-allocated field + void *gauge = nullptr; // used when we use a reference to an external field - double anisotropy; - double tadpole; - void *gauge; // used when we use a reference to an external field + QudaFieldCreate create = QUDA_REFERENCE_FIELD_CREATE; // used to determine the type of field created - QudaFieldCreate create; // used to determine the type of field created - - QudaFieldGeometry geometry; // whether the field is a scale, vector or tensor + QudaFieldGeometry geometry = QUDA_VECTOR_GEOMETRY; // whether the field is a scale, vector or tensor // whether we need to compute the fat link maxima // FIXME temporary flag until we have a kernel that can do this, then we just do this in copy() // always set to false, requires external override - bool compute_fat_link_max; + bool compute_fat_link_max = false; /** The staggered phase convention to use */ - QudaStaggeredPhase staggeredPhaseType; + 
QudaStaggeredPhase staggeredPhaseType = QUDA_STAGGERED_PHASE_NO; /** Whether the staggered phase factor has been applied */ - bool staggeredPhaseApplied; + bool staggeredPhaseApplied = false; /** Imaginary chemical potential */ - double i_mu; + double i_mu = 0.0; /** Offset into MILC site struct to the desired matrix field (only if gauge_order=MILC_SITE_GAUGE_ORDER) */ - size_t site_offset; + size_t site_offset = 0; /** Size of MILC site struct (only if gauge_order=MILC_SITE_GAUGE_ORDER) */ - size_t site_size; + size_t site_size = 0; // Default constructor - GaugeFieldParam(void *const h_gauge = NULL) : - LatticeFieldParam(), - nColor(3), - nFace(0), - reconstruct(QUDA_RECONSTRUCT_NO), - order(QUDA_INVALID_GAUGE_ORDER), - fixed(QUDA_GAUGE_FIXED_NO), - link_type(QUDA_WILSON_LINKS), - t_boundary(QUDA_INVALID_T_BOUNDARY), - anisotropy(1.0), - tadpole(1.0), - gauge(h_gauge), - create(QUDA_REFERENCE_FIELD_CREATE), - geometry(QUDA_VECTOR_GEOMETRY), - compute_fat_link_max(false), - staggeredPhaseType(QUDA_STAGGERED_PHASE_NO), - staggeredPhaseApplied(false), - i_mu(0.0), - site_offset(0), - site_size(0) - { - } + GaugeFieldParam(void *const h_gauge = nullptr) : gauge(h_gauge) { } GaugeFieldParam(const GaugeField &u); GaugeFieldParam(const lat_dim_t &x, QudaPrecision precision, QudaReconstructType reconstruct, int pad, QudaFieldGeometry geometry, QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_PAD) : LatticeFieldParam(4, x, pad, QUDA_INVALID_FIELD_LOCATION, precision, ghostExchange), - nColor(3), - nFace(0), reconstruct(reconstruct), - order(QUDA_INVALID_GAUGE_ORDER), - fixed(QUDA_GAUGE_FIXED_NO), - link_type(QUDA_WILSON_LINKS), - t_boundary(QUDA_INVALID_T_BOUNDARY), - anisotropy(1.0), - tadpole(1.0), - gauge(0), create(QUDA_NULL_FIELD_CREATE), - geometry(geometry), - compute_fat_link_max(false), - staggeredPhaseType(QUDA_STAGGERED_PHASE_NO), - staggeredPhaseApplied(false), - i_mu(0.0), - site_offset(0), - site_size(0) + geometry(geometry) { } GaugeFieldParam(const 
QudaGaugeParam &param, void *h_gauge = nullptr, QudaLinkType link_type_ = QUDA_INVALID_LINKS) : LatticeFieldParam(param), - nColor(3), - nFace(0), - reconstruct(QUDA_RECONSTRUCT_NO), order(param.gauge_order), fixed(param.gauge_fix), link_type(link_type_ != QUDA_INVALID_LINKS ? link_type_ : param.type), @@ -137,9 +98,6 @@ namespace quda { anisotropy(param.anisotropy), tadpole(param.tadpole_coeff), gauge(h_gauge), - create(QUDA_REFERENCE_FIELD_CREATE), - geometry(QUDA_VECTOR_GEOMETRY), - compute_fat_link_max(false), staggeredPhaseType(param.staggered_phase_type), staggeredPhaseApplied(param.staggered_phase_applied), i_mu(param.i_mu), @@ -186,83 +144,103 @@ namespace quda { class GaugeField : public LatticeField { + private: + /** + @brief Create the field as specified by the param + @param[in] Parameter struct + */ + void create(const GaugeFieldParam &param); + + /** + @brief Move the contents of a field to this + @param[in,out] other Field we are moving from + */ + void move(GaugeField &&other); + + /** + @brief Fills the param with this field's meta data (used for + creating a cloned field) + @param[in] param The parameter we are filling + */ + void fill(GaugeFieldParam &) const; + protected: - quda_ptr gauge; /** The gauge field allocation */ - array gauge_array; /** Array of pointers to each subset (e.g., QDP or QDPJITorder) */ - size_t bytes; // bytes allocated per full field - size_t phase_offset; // offset in bytes to gauge phases - useful to keep track of texture alignment - size_t phase_bytes; // bytes needed to store the phases - size_t length; - size_t real_length; - int nColor; - int nFace; - QudaFieldGeometry geometry; // whether the field is a scale, vector or tensor - int site_dim; // the dimensionality of each site (number of matrices per lattice site) - - QudaReconstructType reconstruct; - int nInternal; // number of degrees of freedom per link matrix - QudaGaugeFieldOrder order; - QudaGaugeFixed fixed; - QudaLinkType link_type; - QudaTboundary t_boundary; -
- double anisotropy; - double tadpole; - double fat_link_max; - - QudaFieldCreate create; // used to determine the type of field created - - mutable array ghost; // stores the ghost zone of the gauge field (non-native fields only) - - mutable int ghostFace[QUDA_MAX_DIM]; // the size of each face - - /** - The staggered phase convention to use - */ - QudaStaggeredPhase staggeredPhaseType; - - /** - Whether the staggered phase factor has been applied - */ - bool staggeredPhaseApplied; - - /** - @brief Exchange the buffers across all dimensions in a given direction - @param[out] recv Receive buffer - @param[in] send Send buffer - @param[in] dir Direction in which we are sending (forwards OR backwards only) - */ - void exchange(void **recv, void **send, QudaDirection dir) const; - - /** - Imaginary chemical potential - */ - double i_mu; - - /** - Offset into MILC site struct to the desired matrix field (only if gauge_order=MILC_SITE_GAUGE_ORDER) - */ - size_t site_offset; - - /** - Size of MILC site struct (only if gauge_order=MILC_SITE_GAUGE_ORDER) - */ - size_t site_size; - - /** - Compute the required extended ghost zone sizes and offsets - @param[in] R Radius of the ghost zone - @param[in] no_comms_fill If true we create a full halo - regardless of partitioning - @param[in] bidir Is this a bi-directional exchange - if not - then we alias the fowards and backwards offsetss - */ - void createGhostZone(const lat_dim_t &R, bool no_comms_fill, bool bidir = true) const; - - /** - @brief Set the vol_string and aux_string for use in tuning - */ - void setTuningString(); + bool init = false; + quda_ptr gauge = {}; /** The gauge field allocation */ + array gauge_array = {}; /** Array of pointers to each subset (e.g., QDP or QDPJITorder) */ + size_t bytes = 0; // bytes allocated per full field + size_t phase_offset = 0; // offset in bytes to gauge phases - useful to keep track of texture alignment + size_t phase_bytes = 0; // bytes needed to store the phases + size_t length = 
0; + size_t real_length = 0; + int nColor = 0; + int nFace = 0; + QudaFieldGeometry geometry = QUDA_INVALID_GEOMETRY; // whether the field is a scale, vector or tensor + int site_dim = 0; // the dimensionality of each site (number of matrices per lattice site) + + QudaReconstructType reconstruct = QUDA_RECONSTRUCT_INVALID; + int nInternal = 0; // number of degrees of freedom per link matrix + QudaGaugeFieldOrder order = QUDA_INVALID_GAUGE_ORDER; + QudaGaugeFixed fixed = QUDA_GAUGE_FIXED_INVALID; + QudaLinkType link_type = QUDA_INVALID_LINKS; + QudaTboundary t_boundary = QUDA_INVALID_T_BOUNDARY; + + double anisotropy = 0.0; + double tadpole = 0.0; + double fat_link_max = 0.0; + + mutable array ghost + = {}; // stores the ghost zone of the gauge field (non-native fields only) + + mutable array ghostFace = {}; // the size of each face + + /** + The staggered phase convention to use + */ + QudaStaggeredPhase staggeredPhaseType = QUDA_STAGGERED_PHASE_INVALID; + + /** + Whether the staggered phase factor has been applied + */ + bool staggeredPhaseApplied = false; + + /** + Imaginary chemical potential + */ + double i_mu = 0.0; + + /** + Offset into MILC site struct to the desired matrix field (only if gauge_order=MILC_SITE_GAUGE_ORDER) + */ + size_t site_offset = 0; + + /** + Size of MILC site struct (only if gauge_order=MILC_SITE_GAUGE_ORDER) + */ + size_t site_size = 0; + + /** + @brief Exchange the buffers across all dimensions in a given direction + @param[out] recv Receive buffer + @param[in] send Send buffer + @param[in] dir Direction in which we are sending (forwards OR backwards only) + */ + void exchange(void **recv, void **send, QudaDirection dir) const; + + /** + Compute the required extended ghost zone sizes and offsets + @param[in] R Radius of the ghost zone + @param[in] no_comms_fill If true we create a full halo + regardless of partitioning + @param[in] bidir Is this a bi-directional exchange - if not + then we alias the fowards and backwards offsetss + */ 
+ void createGhostZone(const lat_dim_t &R, bool no_comms_fill, bool bidir = true) const; + + /** + @brief Set the vol_string and aux_string for use in tuning + */ + void setTuningString(); /** @brief Initialize the padded region to 0 @@ -270,8 +248,42 @@ namespace quda { void zeroPad(); public: + /** + @brief Default constructor + */ + GaugeField() = default; + + /** + @brief Copy constructor for creating a GaugeField from another GaugeField + @param field Instance of GaugeField from which we are cloning + */ + GaugeField(const GaugeField &field) noexcept; + + /** + @brief Move constructor for creating a GaugeField from another GaugeField + @param field Instance of GaugeField from which we are moving + */ + GaugeField(GaugeField &&field) noexcept; + + /** + @brief Constructor for creating a GaugeField from a GaugeFieldParam + @param param Contains the metadata for creating the field + */ GaugeField(const GaugeFieldParam &param); - virtual ~GaugeField(); + + /** + @brief Copy assignment operator + @param[in] field Instance from which we are copying + @return Reference to this field + */ + GaugeField &operator=(const GaugeField &field); + + /** + @brief Move assignment operator + @param[in] field Instance from which we are moving + @return Reference to this field + */ + GaugeField &operator=(GaugeField &&field); /** @brief Create the communication handlers and buffers @@ -573,6 +585,8 @@ namespace quda { @param[in] the host buffer to copy from.
*/ void copy_from_buffer(void *buffer); + + friend class GaugeFieldParam; }; /** diff --git a/include/malloc_quda.h b/include/malloc_quda.h index d1a7de9161..8cbc2fbb47 100644 --- a/include/malloc_quda.h +++ b/include/malloc_quda.h @@ -197,7 +197,7 @@ namespace quda { public: quda_ptr() = default; - + quda_ptr(quda_ptr &&) = default; quda_ptr &operator=(quda_ptr &&); /** diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp index 61ea7ab505..8bc61c2035 100644 --- a/lib/gauge_field.cpp +++ b/lib/gauge_field.cpp @@ -5,67 +5,100 @@ namespace quda { - GaugeFieldParam::GaugeFieldParam(const GaugeField &u) : - LatticeFieldParam(u), - nColor(u.Ncolor()), - nFace(u.Nface()), - reconstruct(u.Reconstruct()), - order(u.Order()), - fixed(u.GaugeFixed()), - link_type(u.LinkType()), - t_boundary(u.TBoundary()), - anisotropy(u.Anisotropy()), - tadpole(u.Tadpole()), - gauge(NULL), - create(QUDA_NULL_FIELD_CREATE), - geometry(u.Geometry()), - compute_fat_link_max(false), - staggeredPhaseType(u.StaggeredPhase()), - staggeredPhaseApplied(u.StaggeredPhaseApplied()), - i_mu(u.iMu()), - site_offset(u.SiteOffset()), - site_size(u.SiteSize()) - { } - - GaugeField::GaugeField(const GaugeFieldParam &param) : - LatticeField(param), - gauge(), - gauge_array {}, - bytes(0), - phase_offset(0), - phase_bytes(0), - nColor(param.nColor), - nFace(param.nFace), - geometry(param.geometry), - site_dim(1), - reconstruct(param.reconstruct), - nInternal(reconstruct != QUDA_RECONSTRUCT_NO ? reconstruct : nColor * nColor * 2), - order(param.order), - fixed(param.fixed), - link_type(param.link_type), - t_boundary(param.t_boundary), - anisotropy(param.anisotropy), - tadpole(param.tadpole), - fat_link_max(link_type == QUDA_ASQTAD_FAT_LINKS ?
0.0 : 1.0), - create(param.create), - staggeredPhaseType(param.staggeredPhaseType), - staggeredPhaseApplied(param.staggeredPhaseApplied), - i_mu(param.i_mu), - site_offset(param.site_offset), - site_size(param.site_size) + GaugeFieldParam::GaugeFieldParam(const GaugeField &u) : LatticeFieldParam(u) { u.fill(*this); } + + GaugeField::GaugeField(const GaugeFieldParam &param) : LatticeField(param) + { + create(param); + + switch (param.create) { + case QUDA_NULL_FIELD_CREATE: + case QUDA_REFERENCE_FIELD_CREATE: break; // do nothing + case QUDA_ZERO_FIELD_CREATE: zero(); break; + case QUDA_COPY_FIELD_CREATE: copy(*param.field); break; + default: errorQuda("ERROR: create type(%d) not supported yet", param.create); + } + } + + GaugeField::GaugeField(const GaugeField &u) noexcept : LatticeField(u) + { + GaugeFieldParam param; + u.fill(param); + param.create = QUDA_COPY_FIELD_CREATE; + create(param); + copy(u); + } + + GaugeField::GaugeField(GaugeField &&u) noexcept : LatticeField(std::move(u)) { move(std::move(u)); } + + GaugeField &GaugeField::operator=(const GaugeField &src) + { + if (&src != this) { + if (!init) { // keep current attributes unless unset + LatticeField::operator=(src); + GaugeFieldParam param; + src.fill(param); + param.create = QUDA_COPY_FIELD_CREATE; + create(param); + } + + copy(src); + } + return *this; + } + + GaugeField &GaugeField::operator=(GaugeField &&src) + { + if (&src != this) { + // if field not already initialized then move the field + if (!init) { + LatticeField::operator=(std::move(src)); + move(std::move(src)); + } else { + // we error if the field is not compatible with this + errorQuda("Moving to already created field"); + } + } + return *this; + } + + void GaugeField::create(const GaugeFieldParam &param) { - if (siteSubset != QUDA_FULL_SITE_SUBSET) errorQuda("Unexpected siteSubset %d", siteSubset); - if (order == QUDA_NATIVE_GAUGE_ORDER) errorQuda("Invalid gauge order %d", order); - if (ghost_precision != precision) ghost_precision =
precision; // gauge fields require matching precision - - if (link_type != QUDA_COARSE_LINKS && nColor != 3) - errorQuda("nColor must be 3, not %d for this link type", nColor); - if (nDim != 4) - errorQuda("Number of dimensions must be 4 not %d", nDim); - if (link_type != QUDA_WILSON_LINKS && anisotropy != 1.0) + if (param.siteSubset != QUDA_FULL_SITE_SUBSET) errorQuda("Unexpected siteSubset %d", param.siteSubset); + if (param.order == QUDA_NATIVE_GAUGE_ORDER) errorQuda("Invalid gauge order %d", param.order); + if (param.GhostPrecision() != param.Precision()) + errorQuda("Ghost precision %d doesn't match field precision %d", param.GhostPrecision(), param.Precision()); + if (param.link_type != QUDA_COARSE_LINKS && param.nColor != 3) + errorQuda("nColor must be 3, not %d for this link type", param.nColor); + if (param.nDim != 4) errorQuda("Number of dimensions must be 4 not %d", param.nDim); + if (param.link_type != QUDA_WILSON_LINKS && param.anisotropy != 1.0) errorQuda("Anisotropy only supported for Wilson links"); - if (link_type != QUDA_WILSON_LINKS && fixed == QUDA_GAUGE_FIXED_YES) + if (param.link_type != QUDA_WILSON_LINKS && param.fixed == QUDA_GAUGE_FIXED_YES) errorQuda("Temporal gauge fixing only supported for Wilson links"); + if ((param.reconstruct == QUDA_RECONSTRUCT_12 || param.reconstruct == QUDA_RECONSTRUCT_8) + && param.link_type != QUDA_SU3_LINKS) + errorQuda("Cannot request a 12/8 reconstruct type without SU(3) link type"); + if (param.reconstruct == QUDA_RECONSTRUCT_10 && param.link_type != QUDA_ASQTAD_MOM_LINKS) + errorQuda("10-reconstruction only supported with momentum links"); + + nColor = param.nColor; + nFace = param.nFace; + geometry = param.geometry; + reconstruct = param.reconstruct; + nInternal = reconstruct != QUDA_RECONSTRUCT_NO ? 
reconstruct : nColor * nColor * 2; + order = param.order; + fixed = param.fixed; + link_type = param.link_type; + t_boundary = param.t_boundary; + anisotropy = param.anisotropy; + tadpole = param.tadpole; + fat_link_max = link_type == QUDA_ASQTAD_FAT_LINKS ? 0.0 : 1.0; + staggeredPhaseType = param.staggeredPhaseType; + staggeredPhaseApplied = param.staggeredPhaseApplied; + i_mu = param.i_mu; + site_offset = param.site_offset; + site_size = param.site_size; + if (geometry == QUDA_SCALAR_GEOMETRY) { real_length = volume*nInternal; length = 2*stride*nInternal; // two comes from being full lattice @@ -83,18 +116,6 @@ namespace quda { length = 2 * (1 << nDim) * nDim * stride * nInternal; // two comes from being full lattice } - if ((reconstruct == QUDA_RECONSTRUCT_12 || reconstruct == QUDA_RECONSTRUCT_8) && link_type != QUDA_SU3_LINKS) { - errorQuda("Cannot request a 12/8 reconstruct type without SU(3) link type"); - } - - if (reconstruct == QUDA_RECONSTRUCT_10 && link_type != QUDA_ASQTAD_MOM_LINKS) { - errorQuda("10-reconstruction only supported with momentum links"); - } - - if (create != QUDA_NULL_FIELD_CREATE && create != QUDA_ZERO_FIELD_CREATE && create != QUDA_REFERENCE_FIELD_CREATE) { - errorQuda("ERROR: create type(%d) not supported yet\n", create); - } - switch (geometry) { case QUDA_SCALAR_GEOMETRY: site_dim = 1; break; case QUDA_VECTOR_GEOMETRY: site_dim = nDim; break; @@ -147,9 +168,8 @@ namespace quda { } if (isNative()) { - if (create != QUDA_REFERENCE_FIELD_CREATE) { + if (param.create != QUDA_REFERENCE_FIELD_CREATE) { gauge = std::move(quda_ptr(mem_type, bytes)); - if (create == QUDA_ZERO_FIELD_CREATE) qudaMemset(gauge, 0, bytes); } else { gauge = std::move(quda_ptr(param.gauge, mem_type)); } @@ -157,13 +177,12 @@ namespace quda { size_t nbytes = volume * nInternal * precision; for (int d = 0; d < site_dim; d++) { - if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) { + if (param.create != QUDA_REFERENCE_FIELD_CREATE) { 
gauge_array[d] = std::move(quda_ptr(mem_type, nbytes)); - if (create == QUDA_ZERO_FIELD_CREATE) qudaMemset(gauge_array[d], 0, nbytes); - } else if (create == QUDA_REFERENCE_FIELD_CREATE) { + } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) { gauge_array[d] = std::move(quda_ptr(static_cast(param.gauge)[d], mem_type)); } else { - errorQuda("Unsupported creation type %d", create); + errorQuda("Unsupported creation type %d", param.create); } } @@ -172,17 +191,16 @@ namespace quda { order == QUDA_TIFR_PADDED_GAUGE_ORDER || order == QUDA_MILC_SITE_GAUGE_ORDER) { // does not support device - if (order == QUDA_MILC_SITE_GAUGE_ORDER && create != QUDA_REFERENCE_FIELD_CREATE) { - errorQuda("MILC site gauge order only supported for reference fields"); + if (order == QUDA_MILC_SITE_GAUGE_ORDER && param.create != QUDA_REFERENCE_FIELD_CREATE) { + errorQuda("MILC site gauge order only supported for reference fields"); } - if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) { + if (param.create != QUDA_REFERENCE_FIELD_CREATE) { gauge = std::move(quda_ptr(mem_type, bytes)); - if (create == QUDA_ZERO_FIELD_CREATE) qudaMemset(gauge, 0, bytes); - } else if (create == QUDA_REFERENCE_FIELD_CREATE) { + } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) { gauge = std::move(quda_ptr(param.gauge, mem_type)); } else { - errorQuda("Unsupported creation type %d", create); + errorQuda("Unsupported creation type %d", param.create); } } else { @@ -200,15 +218,17 @@ namespace quda { if (geometry == QUDA_COARSE_GEOMETRY) qudaMemset(ghost[i + 4], 0, nbytes); } } else { - if (create != QUDA_ZERO_FIELD_CREATE) zeroPad(); + if (param.create != QUDA_ZERO_FIELD_CREATE) zeroPad(); } } + init = true; setTuningString(); // exchange the boundaries if a non-trivial field if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD) - if (create == QUDA_REFERENCE_FIELD_CREATE && (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY)) { + if (param.create == 
QUDA_REFERENCE_FIELD_CREATE + && (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY)) { exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL); } @@ -216,7 +236,70 @@ namespace quda { if (param.compute_fat_link_max) fat_link_max = this->abs_max(); } - GaugeField::~GaugeField() { } + void GaugeField::move(GaugeField &&src) + { + gauge = std::exchange(src.gauge, {}); + gauge_array = std::exchange(src.gauge_array, {}); + bytes = std::exchange(src.bytes, 0); + phase_offset = std::exchange(src.phase_offset, 0); + phase_bytes = std::exchange(src.phase_bytes, 0); + length = std::exchange(src.length, 0); + real_length = std::exchange(src.real_length, 0); + nColor = std::exchange(src.nColor, 0); + nFace = std::exchange(src.nFace, 0); + geometry = std::exchange(src.geometry, QUDA_INVALID_GEOMETRY); + site_dim = std::exchange(src.site_dim, 0); + reconstruct = std::exchange(src.reconstruct, QUDA_RECONSTRUCT_INVALID); + nInternal = std::exchange(src.nInternal, 0); + order = std::exchange(src.order, QUDA_INVALID_GAUGE_ORDER); + fixed = std::exchange(src.fixed, QUDA_GAUGE_FIXED_INVALID); + link_type = std::exchange(src.link_type, QUDA_INVALID_LINKS); + t_boundary = std::exchange(src.t_boundary, QUDA_INVALID_T_BOUNDARY); + anisotropy = std::exchange(src.anisotropy, 0.0); + tadpole = std::exchange(src.tadpole, 0.0); + fat_link_max = std::exchange(src.fat_link_max, 0.0); + ghost = std::exchange(src.ghost, {}); + ghostFace = std::exchange(src.ghostFace, {}); + staggeredPhaseType = std::exchange(src.staggeredPhaseType, QUDA_STAGGERED_PHASE_INVALID); + staggeredPhaseApplied = std::exchange(src.staggeredPhaseApplied, false); + i_mu = std::exchange(src.i_mu, 0.0); + site_offset = std::exchange(src.site_offset, 0); + site_size = std::exchange(src.site_size, 0); + } + + void GaugeField::fill(GaugeFieldParam ¶m) const + { + LatticeField::fill(param); + param.gauge = nullptr; + param.nColor = nColor; + param.nFace = nFace; + 
param.reconstruct = reconstruct; + param.order = order; + param.fixed = fixed; + param.link_type = link_type; + param.t_boundary = t_boundary; + param.anisotropy = anisotropy; + param.tadpole = tadpole; + param.create = QUDA_NULL_FIELD_CREATE; + param.geometry = geometry; + param.compute_fat_link_max = false; + param.staggeredPhaseType = staggeredPhaseType; + param.staggeredPhaseApplied = staggeredPhaseApplied; + param.i_mu = i_mu; + param.site_offset = site_offset; + param.site_size = site_size; + } + + void GaugeField::setTuningString() + { + LatticeField::setTuningString(); + std::stringstream aux_ss; + aux_ss << "vol=" << volume << "stride=" << stride << "precision=" << precision << "geometry=" << geometry + << "Nc=" << nColor; + if (ghostExchange == QUDA_GHOST_EXCHANGE_EXTENDED) aux_ss << "r=" << r[0] << r[1] << r[2] << r[3]; + aux_string = aux_ss.str(); + if (aux_string.size() >= TuneKey::aux_n / 2) errorQuda("Aux string too large %lu", aux_string.size()); + } void GaugeField::zeroPad() { @@ -230,28 +313,6 @@ namespace quda { qudaMemset2D(gauge, parity * (bytes / 2) + volumeCB * order * precision, pitch, 0, pad_bytes, Npad); } } -#if 0 - if (location == QUDA_CUDA_FIELD_LOCATION) { - for (int parity = 0; parity < 2; parity++) { - qudaMemset2D(data() + parity * (bytes / 2) + volumeCB * order * precision, pitch, 0, pad_bytes, Npad); - } - } else { - for (int parity = 0; parity < 2; parity++) - for (int p = 0; p < Npad; p++) - memset(data() + parity * (bytes / 2) + (volumeCB + p * stride) * order * precision, 0, pad_bytes); - } - } -#endif - } - - void GaugeField::setTuningString() { - LatticeField::setTuningString(); - std::stringstream aux_ss; - aux_ss << "vol=" << volume << "stride=" << stride << "precision=" << precision << "geometry=" << geometry - << "Nc=" << nColor; - if (ghostExchange == QUDA_GHOST_EXCHANGE_EXTENDED) aux_ss << "r=" << r[0] << r[1] << r[2] << r[3]; - aux_string = aux_ss.str(); - if (aux_string.size() >= TuneKey::aux_n / 2) errorQuda("Aux 
string too large %lu", aux_string.size()); } void GaugeField::createGhostZone(const lat_dim_t &R, bool no_comms_fill, bool bidir) const From bc3dba0bb993915ca679e24df1db317ac6e626d7 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 16 May 2023 15:53:48 -0700 Subject: [PATCH 09/99] Fix some issues with staggered quark smearing --- lib/interface_quda.cpp | 20 ++++++-------------- tests/staggered_gsmear_test_utils.h | 4 ++-- tests/utils/host_utils.h | 2 +- tests/utils/staggered_host_utils.cpp | 11 +++++++---- 4 files changed, 16 insertions(+), 21 deletions(-) diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 5ed54e37f2..36a252d809 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -3945,14 +3945,14 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param) checkGaugeParam(param); - GaugeFieldParam gParam(*param, inlink, QUDA_GENERAL_LINKS); - gParam.gauge = twolink; + GaugeFieldParam gParam(*param, inlink, QUDA_ASQTAD_LONG_LINKS); + gParam.gauge = twolink; GaugeField cpuTwoLink(gParam); // create the host twolink profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT); GaugeField *cudaInLinkEx = nullptr; - if(inlink) { + if (inlink) { gParam.link_type = param->type; gParam.gauge = inlink; GaugeField cpuInLink(gParam); // create the host sitelink @@ -3961,19 +3961,13 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param) gParam.reconstruct = param->reconstruct; gParam.setPrecision(param->cuda_prec, true); gParam.create = QUDA_NULL_FIELD_CREATE; - GaugeField *cudaInLink = new GaugeField(gParam); + GaugeField cudaInLink(gParam); profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT); profileGaussianSmear.TPSTART(QUDA_PROFILE_H2D); - cudaInLink->copy(cpuInLink); + cudaInLink.copy(cpuInLink); profileGaussianSmear.TPSTOP(QUDA_PROFILE_H2D); - // - cudaInLinkEx = createExtendedGauge(*cudaInLink, R, profileGaussianSmear); - // - profileGaussianSmear.TPSTART(QUDA_PROFILE_FREE); - delete cudaInLink; - 
profileGaussianSmear.TPSTOP(QUDA_PROFILE_FREE); - + cudaInLinkEx = createExtendedGauge(cudaInLink, R, profileGaussianSmear); } else { cudaInLinkEx = createExtendedGauge(*gaugePrecise, R, profileGaussianSmear); } @@ -3992,7 +3986,6 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param) freeUniqueGaugeQuda(QUDA_SMEARED_LINKS); gaugeSmeared = new GaugeField(gsParam); - profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT); @@ -4006,7 +3999,6 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param) profileGaussianSmear.TPSTART(QUDA_PROFILE_D2H); cpuTwoLink.copy(*gaugeSmeared); profileGaussianSmear.TPSTOP(QUDA_PROFILE_D2H); - profileGaussianSmear.TPSTART(QUDA_PROFILE_FREE); freeUniqueGaugeQuda(QUDA_SMEARED_LINKS); diff --git a/tests/staggered_gsmear_test_utils.h b/tests/staggered_gsmear_test_utils.h index 42bd43e293..7266844798 100644 --- a/tests/staggered_gsmear_test_utils.h +++ b/tests/staggered_gsmear_test_utils.h @@ -128,9 +128,9 @@ struct StaggeredGSmearTestWrapper { // quda::blas::ax(ftmp, tmp); quda::blas::axpy(a, tmp, tmp2); - staggeredTwoLinkGaussianSmear(spinorRef.Even(), qdp_twolnk, (void **)cpuTwoLink->Ghost(), tmp.Even(), + staggeredTwoLinkGaussianSmear(spinorRef.Even(), qdp_twolnk, *cpuTwoLink, tmp.Even(), &gauge_param, &inv_param, 0, smear_coeff, smear_t0, gauge_param.cpu_prec); - staggeredTwoLinkGaussianSmear(spinorRef.Odd(), qdp_twolnk, (void **)cpuTwoLink->Ghost(), tmp.Odd(), + staggeredTwoLinkGaussianSmear(spinorRef.Odd(), qdp_twolnk, *cpuTwoLink, tmp.Odd(), &gauge_param, &inv_param, 1, smear_coeff, smear_t0, gauge_param.cpu_prec); // blas::xpay(*tmp2, -1.0, *spinorRef); diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h index 51cec06d27..88aed1f020 100644 --- a/tests/utils/host_utils.h +++ b/tests/utils/host_utils.h @@ -59,7 +59,7 @@ void computeLongLinkCPU(void **longlink, void **sitelink, QudaPrecision prec, vo void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, void 
**longlink_eps, void **sitelink, void *qudaGaugeParamPtr, double **act_path_coeffs, double eps_naik); void computeTwoLinkCPU(void **twolink, void **sitelink, QudaGaugeParam *gauge_param); -void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &out, void *qdp_twolnk[], void** ghost_twolnk, quda::ColorSpinorField &in, QudaGaugeParam *qudaGaugeParam, QudaInvertParam *inv_param, const int oddBit, const double width, const int t0, QudaPrecision prec); +void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &out, void *qdp_twolnk[], const quda::GaugeField &twolnk, quda::ColorSpinorField &in, QudaGaugeParam *qudaGaugeParam, QudaInvertParam *inv_param, const int oddBit, const double width, const int t0, QudaPrecision prec); template void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param, QudaDslashType dslash_type); void applyGaugeFieldScaling_long(void **gauge, int Vh, QudaGaugeParam *param, QudaDslashType dslash_type, diff --git a/tests/utils/staggered_host_utils.cpp b/tests/utils/staggered_host_utils.cpp index ba71e91926..365781c7d0 100644 --- a/tests/utils/staggered_host_utils.cpp +++ b/tests/utils/staggered_host_utils.cpp @@ -437,11 +437,14 @@ void staggeredTwoLinkGaussianSmear(sFloat *res, gFloat **twolink, gFloat **ghost return; } -void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &out, void *qdp_twolnk[], void **ghost_twolnk, +void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &out, void *qdp_twolnk[], const quda::GaugeField &twolnk, quda::ColorSpinorField &in, QudaGaugeParam * /*qudaGaugeParam*/, QudaInvertParam * /*inv_param*/, const int oddBit, const double /*width*/, const int t0, QudaPrecision prec) { + void *ghost[4]; + for (int i = 0; i < 4; i++) ghost[i] = twolnk.Ghost()[i].data(); + QudaParity otherparity = QUDA_INVALID_PARITY; if (oddBit == QUDA_EVEN_PARITY) { otherparity = QUDA_ODD_PARITY; @@ -459,19 +462,19 @@ void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &out, void *qdp_twolnk if (prec == 
QUDA_DOUBLE_PRECISION) { { - staggeredTwoLinkGaussianSmear((double *)out.V(), (double **)qdp_twolnk, (double **)ghost_twolnk, (double *)in.V(), + staggeredTwoLinkGaussianSmear((double *)out.V(), (double **)qdp_twolnk, (double **)ghost, (double *)in.V(), (double **)fwd_nbr_spinor, (double **)back_nbr_spinor, t0, oddBit); } } else { { - staggeredTwoLinkGaussianSmear((float *)out.V(), (float **)qdp_twolnk, (float **)ghost_twolnk, (float *)in.V(), + staggeredTwoLinkGaussianSmear((float *)out.V(), (float **)qdp_twolnk, (float **)ghost, (float *)in.V(), (float **)fwd_nbr_spinor, (float **)back_nbr_spinor, t0, oddBit); } } return; } #else -void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &, void **, void** , quda::ColorSpinorField&, QudaGaugeParam* , QudaInvertParam* , const int , const double , const int , QudaPrecision ) +void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &, void **, const quda::GaugeField &, quda::ColorSpinorField&, QudaGaugeParam* , QudaInvertParam* , const int , const double , const int , QudaPrecision ) {} #endif From 13eb7e1d901604e87d4c3a3d6249e97ce0d3885f Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 18 May 2023 14:45:48 -0700 Subject: [PATCH 10/99] Fix HISQ force since unification, and re-enable hisq force ctests which were accidentally not being run --- tests/CMakeLists.txt | 2 - tests/hisq_paths_force_test.cpp | 12 ++--- tests/host_reference/hisq_force_reference.cpp | 50 +++++++++++-------- tests/host_reference/hisq_force_reference.h | 4 +- tests/utils/host_utils.cpp | 41 ++++++++++++++- tests/utils/host_utils.h | 1 + 6 files changed, 76 insertions(+), 34 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 9ca912f8b3..14c1508a82 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1225,9 +1225,7 @@ foreach(prec IN LISTS TEST_PRECS) COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} --dim 2 4 6 8 --prec ${prec} --gtest_output=xml:unitarize_link_test_${prec}.xml) - endif() - 
if(QUDA_FORCE_HISQ) add_test(NAME hisq_paths_force_${prec} COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} --dim 2 4 6 8 --prec ${prec} diff --git a/tests/hisq_paths_force_test.cpp b/tests/hisq_paths_force_test.cpp index 07f7b4e17b..9df6d2ec4c 100644 --- a/tests/hisq_paths_force_test.cpp +++ b/tests/hisq_paths_force_test.cpp @@ -333,8 +333,8 @@ static void hisq_force_startup() // initialize the CPU outer product fields and exchange once createStagForOprodCPU(stag_for_oprod, force_prec, qudaGaugeParam.X, *rng); - computeLinkOrderedOuterProduct(stag_for_oprod, cpuOprod->data(), force_prec, 1); - computeLinkOrderedOuterProduct(stag_for_oprod, cpuLongLinkOprod->data(), force_prec, 3); + computeLinkOrderedOuterProduct(stag_for_oprod, *cpuOprod, force_prec, 1); + computeLinkOrderedOuterProduct(stag_for_oprod, *cpuLongLinkOprod, force_prec, 3); copyExtendedGauge(*cpuOprod_ex, *cpuOprod, QUDA_CPU_FIELD_LOCATION); copyExtendedGauge(*cpuLongLinkOprod_ex, *cpuLongLinkOprod, QUDA_CPU_FIELD_LOCATION); @@ -469,9 +469,7 @@ static int hisq_force_test(bool lepage) getTolerance(force_prec), force_prec); } - strong_check_link(reinterpret_cast(hostVerifyForce->data()), - "GPU results: ", reinterpret_cast(cpuForce->data()), "CPU reference results:", V, - force_prec); + strong_check_link(*hostVerifyForce, "GPU result:", *cpuForce, "CPU reference results:"); logQuda(QUDA_SUMMARIZE, "Lepage %s staples force test %s\n\n", lepage ? "enabled" : "disabled", (1 == res) ? "PASSED" : "FAILED"); } @@ -506,9 +504,7 @@ static int hisq_force_test(bool lepage) getTolerance(force_prec), force_prec); } - strong_check_link(reinterpret_cast(hostVerifyForce->data()), - "GPU results: ", reinterpret_cast(cpuForce->data()), "CPU reference results:", V, - force_prec); + strong_check_link(*hostVerifyForce, "GPU results: ", *cpuForce, "CPU reference results:"); logQuda(QUDA_SUMMARIZE, "Long link force test %s\n\n", (1 == res) ? 
"PASSED" : "FAILED"); } } diff --git a/tests/host_reference/hisq_force_reference.cpp b/tests/host_reference/hisq_force_reference.cpp index 58c6762e70..d0cfc197a2 100644 --- a/tests/host_reference/hisq_force_reference.cpp +++ b/tests/host_reference/hisq_force_reference.cpp @@ -84,9 +84,9 @@ typedef struct { double space; } danti_hermitmat; -template su3_matrix *get_su3_matrix(su3_matrix *p, int idx, int dir) +template su3_matrix *get_su3_matrix(quda::GaugeField &p, int idx, int dir) { - su3_matrix *data = ((su3_matrix **)p)[dir]; + auto data = static_cast(p.data(dir)); return data + idx; } @@ -96,8 +96,8 @@ template void su3_projector(su3_vecto for (int j = 0; j < 3; j++) CMUL_J(a->c[i], b->c[j], c->e[i][j]); } -template -void computeLinkOrderedOuterProduct(su3_vector *src, su3_matrix *dest, size_t nhops) +template +void computeLinkOrderedOuterProduct(su3_vector *src, quda::GaugeField &dest, size_t nhops) { int dx[4]; for (int i = 0; i < V; ++i) { @@ -106,18 +106,18 @@ void computeLinkOrderedOuterProduct(su3_vector *src, su3_matrix *dest, size_t nh dx[dir] = nhops; int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]); su3_vector *hw = src + nbr_idx; - su3_matrix *p = get_su3_matrix(dest, i, dir); + su3_matrix *p = get_su3_matrix(dest, i, dir); su3_projector(hw, &src[i], p); } // dir } // i } -void computeLinkOrderedOuterProduct(void *src, void *dst, QudaPrecision precision, size_t nhops) +void computeLinkOrderedOuterProduct(void *src, quda::GaugeField &dst, QudaPrecision precision, size_t nhops) { if (precision == QUDA_SINGLE_PRECISION) { - computeLinkOrderedOuterProduct((fsu3_vector *)src, (fsu3_matrix *)dst, nhops); + computeLinkOrderedOuterProduct((fsu3_vector *)src, dst, nhops); } else { - computeLinkOrderedOuterProduct((dsu3_vector *)src, (dsu3_matrix *)dst, nhops); + computeLinkOrderedOuterProduct((dsu3_vector *)src, dst, nhops); } } @@ -1222,12 +1222,15 @@ void hisqStaplesForceCPU(const double *path_coeff, quda::GaugeField &oprod, quda 
act_path_coeff.seven = path_coeff[4]; act_path_coeff.lepage = path_coeff[5]; + void *oprod_array[] = {oprod.data(0), oprod.data(1), oprod.data(2), oprod.data(3)}; + void *link_array[] = {link.data(0), link.data(1), link.data(2), link.data(3)}; + void *noprod_array[] = {newOprod->data(0), newOprod->data(1), newOprod->data(2), newOprod->data(3)}; if (precision == QUDA_DOUBLE_PRECISION) { - doHisqStaplesForceCPU(X_, act_path_coeff, oprod.data(), link.data(), - (double **)tempmat, newOprod->data()); + doHisqStaplesForceCPU(X_, act_path_coeff, reinterpret_cast(oprod_array), reinterpret_cast(link_array), + (double **)tempmat, reinterpret_cast(noprod_array)); } else if (precision == QUDA_SINGLE_PRECISION) { - doHisqStaplesForceCPU(X_, act_path_coeff, oprod.data(), (float *)link.data(), - (float **)tempmat, newOprod->data()); + doHisqStaplesForceCPU(X_, act_path_coeff, reinterpret_cast(oprod_array), reinterpret_cast(link_array), + (float **)tempmat, reinterpret_cast(noprod_array)); } else { errorQuda("Unsupported precision"); } @@ -1308,15 +1311,18 @@ void hisqLongLinkForceCPU(double coeff, quda::GaugeField &oprod, quda::GaugeFiel for (int d = 0; d < 4; d++) X_[d] = oprod.X()[d] - 2 * oprod.R()[d]; QudaPrecision precision = oprod.Precision(); + void *oprod_array[] = {oprod.data(0), oprod.data(1), oprod.data(2), oprod.data(3)}; + void *link_array[] = {link.data(0), link.data(1), link.data(2), link.data(3)}; + void *noprod_array[] = {newOprod->data(0), newOprod->data(1), newOprod->data(2), newOprod->data(3)}; for (int sig = 0; sig < 4; ++sig) { if (precision == QUDA_SINGLE_PRECISION) { - computeLongLinkField(X_, oprod.data(), link.data(), sig, coeff, - newOprod->data()); + computeLongLinkField(X_, reinterpret_cast(oprod_array), reinterpret_cast(link_array), + sig, coeff, reinterpret_cast(noprod_array)); } else if (precision == QUDA_DOUBLE_PRECISION) { - computeLongLinkField(X_, oprod.data(), link.data(), sig, coeff, - newOprod->data()); + computeLongLinkField(X_, 
reinterpret_cast(oprod_array), reinterpret_cast(link_array), + sig, coeff, reinterpret_cast(noprod_array)); } else { - errorQuda("Unrecognised precision\n"); + errorQuda("Unrecognised precision"); } } // sig } @@ -1366,13 +1372,17 @@ void hisqCompleteForceCPU(quda::GaugeField &oprod, quda::GaugeField &link, quda: for (int d = 0; d < 4; d++) X_[d] = oprod.X()[d] - 2 * oprod.R()[d]; QudaPrecision precision = oprod.Precision(); + void *oprod_array[] = {oprod.data(0), oprod.data(1), oprod.data(2), oprod.data(3)}; + void *link_array[] = {link.data(0), link.data(1), link.data(2), link.data(3)}; for (int sig = 0; sig < 4; ++sig) { if (precision == QUDA_SINGLE_PRECISION) { - completeForceField(X_, oprod.data(), link.data(), sig, mom->data()); + completeForceField(X_, reinterpret_cast(oprod_array), reinterpret_cast(link_array), + sig, mom->data()); } else if (precision == QUDA_DOUBLE_PRECISION) { - completeForceField(X_, oprod.data(), link.data(), sig, mom->data()); + completeForceField(X_, reinterpret_cast(oprod_array), reinterpret_cast(link_array), + sig, mom->data()); } else { - errorQuda("Unrecognised precision\n"); + errorQuda("Unrecognised precision"); } } // loop over sig } diff --git a/tests/host_reference/hisq_force_reference.h b/tests/host_reference/hisq_force_reference.h index fb8b773f84..da6a8b770e 100644 --- a/tests/host_reference/hisq_force_reference.h +++ b/tests/host_reference/hisq_force_reference.h @@ -8,11 +8,11 @@ /** @brief Compute a staggered spinor outer product for some offset, CPU version @param[in] src Pointer to an appropriately sized host staggered spinor field - @param[out] dest Pointer to an appropriately sized output outer product field + @param[out] dest Reference to a gauge field for the outer product @param[in] precision Precision of data (single or double) @param[in] separation Offset for outer product (1 for fat links, 3 for long links) */ -void computeLinkOrderedOuterProduct(void *src, void *dest, QudaPrecision precision, size_t 
separation); +void computeLinkOrderedOuterProduct(void *src, quda::GaugeField &dest, QudaPrecision precision, size_t separation); /** @brief Compute the force contribution from the fat links, CPU version diff --git a/tests/utils/host_utils.cpp b/tests/utils/host_utils.cpp index 70421fb118..09ec26f415 100644 --- a/tests/utils/host_utils.cpp +++ b/tests/utils/host_utils.cpp @@ -1493,6 +1493,21 @@ static int compare_link(void **linkA, void **linkB, int len, QudaPrecision preci return ret; } +static int compare_link(const GaugeField &linkA, const GaugeField &linkB) +{ + int ret; + + void *a[] = {linkA.data(0), linkA.data(1), linkA.data(2), linkA.data(3)}; + void *b[] = {linkB.data(0), linkB.data(1), linkB.data(2), linkB.data(3)}; + if (checkPrecision(linkA, linkB) == QUDA_DOUBLE_PRECISION) { + ret = compareLink((double **)a, (double **)b, linkA.Volume()); + } else { + ret = compareLink((float **)a, (float **)b, linkA.Volume()); + } + + return ret; +} + // X indexes the lattice site static void printLinkElement(void *link, int X, QudaPrecision precision) { @@ -1524,8 +1539,30 @@ int strong_check_link(void **linkA, const char *msgA, void **linkB, const char * printfQuda("\n"); } - int ret = compare_link(linkA, linkB, len, prec); - return ret; + return compare_link(linkA, linkB, len, prec); +} + +int strong_check_link(const GaugeField &linkA, const std::string &msgA, const GaugeField &linkB, const std::string &msgB) +{ + if (verbosity >= QUDA_VERBOSE) { + printfQuda("%s\n", msgA.c_str()); + printLinkElement(linkA.data(0), 0, prec); + printfQuda("\n"); + printLinkElement(linkA.data(0), 1, prec); + printfQuda("...\n"); + printLinkElement(linkA.data(3), linkA.Volume() - 1, prec); + printfQuda("\n"); + + printfQuda("\n%s\n", msgB.c_str()); + printLinkElement(linkB.data(0), 0, prec); + printfQuda("\n"); + printLinkElement(linkB.data(0), 1, prec); + printfQuda("...\n"); + printLinkElement(linkB.data(3), linkB.Volume() - 1, prec); + printfQuda("\n"); + } + + return 
compare_link(linkA, linkB); } void createMomCPU(void *mom, QudaPrecision precision) diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h index 88aed1f020..d6eb26304f 100644 --- a/tests/utils/host_utils.h +++ b/tests/utils/host_utils.h @@ -183,6 +183,7 @@ double compare_floats_v2(void *a, void *b, int len, double epsilon, QudaPrecisio void check_gauge(void **, void **, double epsilon, QudaPrecision precision); int strong_check_link(void **linkA, const char *msgA, void **linkB, const char *msgB, int len, QudaPrecision prec); +int strong_check_link(const quda::GaugeField &linkA, const std::string &msgA, const quda::GaugeField &linkB, const std::string &msgB); int strong_check_mom(void *momA, void *momB, int len, QudaPrecision prec); /** From e538fa028164647f2079c76f932adf9ed30f9d43 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 18 May 2023 14:56:05 -0700 Subject: [PATCH 11/99] Commenced use of new GaugeField features (default constructor, move and copy assignment) to clean up interface_quda.cpp. Added new profile stack to allow for autoprofiling while also dramatically reducing LOC in the interface. 
Work in progress --- include/gauge_field.h | 15 +- include/timer.h | 6 + lib/gauge_field.cpp | 29 ++- lib/gauge_random.cu | 8 + lib/gauge_update_quda.cu | 3 + lib/interface_quda.cpp | 440 +++++++++--------------------------- lib/momentum.cu | 7 + lib/staggered_oprod.cu | 3 + lib/targets/cuda/malloc.cpp | 9 + lib/timer.cpp | 22 ++ 10 files changed, 200 insertions(+), 342 deletions(-) diff --git a/include/gauge_field.h b/include/gauge_field.h index 71e1628370..52a4a40b06 100644 --- a/include/gauge_field.h +++ b/include/gauge_field.h @@ -40,11 +40,11 @@ namespace quda { int nColor = 3; int nFace = 0; - QudaReconstructType reconstruct = QUDA_RECONSTRUCT_NO; QudaGaugeFieldOrder order = QUDA_INVALID_GAUGE_ORDER; QudaGaugeFixed fixed = QUDA_GAUGE_FIXED_NO; QudaLinkType link_type = QUDA_WILSON_LINKS; QudaTboundary t_boundary = QUDA_INVALID_T_BOUNDARY; + QudaReconstructType reconstruct = QUDA_RECONSTRUCT_NO; double anisotropy = 1.0; double tadpole = 1.0; @@ -95,6 +95,9 @@ namespace quda { fixed(param.gauge_fix), link_type(link_type_ != QUDA_INVALID_LINKS ? link_type_ : param.type), t_boundary(param.t_boundary), + // if we have momentum field and not using TIFR field, then we always have recon-10 + reconstruct(link_type == QUDA_ASQTAD_MOM_LINKS && order != QUDA_TIFR_GAUGE_ORDER && order != QUDA_TIFR_PADDED_GAUGE_ORDER ? + QUDA_RECONSTRUCT_10 : QUDA_RECONSTRUCT_NO), anisotropy(param.anisotropy), tadpole(param.tadpole_coeff), gauge(h_gauge), @@ -556,6 +559,16 @@ namespace quda { */ static GaugeField* Create(const GaugeFieldParam ¶m); + /** + @brief Create a field that aliases this field's storage. The + alias field can use a different precision than this field, + though it cannot be greater. This functionality is useful for + the case where we have multiple temporaries in different + precisions, but do not need them simultaneously. Use this functionality with caution. 
+ @param[in] param Parameters for the alias field + */ + GaugeField create_alias(const GaugeFieldParam ¶m = GaugeFieldParam()); + /** @brief If managed memory and prefetch is enabled, prefetch the gauge field and buffers to the CPU or the GPU diff --git a/include/timer.h b/include/timer.h index 4c1557b7ce..20b9df45ff 100644 --- a/include/timer.h +++ b/include/timer.h @@ -296,6 +296,12 @@ namespace quda { static TimeProfile dummy("dummy"); + void pushProfile(TimeProfile &profile); + + void popProfile(); + + TimeProfile& getProfile(); + } // namespace quda #undef PUSH_RANGE diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp index 8bc61c2035..e8c4994670 100644 --- a/lib/gauge_field.cpp +++ b/lib/gauge_field.cpp @@ -227,8 +227,7 @@ namespace quda { // exchange the boundaries if a non-trivial field if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD) - if (param.create == QUDA_REFERENCE_FIELD_CREATE - && (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY)) { + if (param.create == QUDA_REFERENCE_FIELD_CREATE && (geometry == QUDA_VECTOR_GEOMETRY || geometry == QUDA_COARSE_GEOMETRY)) { exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? 
QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL); } @@ -939,6 +938,13 @@ namespace quda { void GaugeField::copy(const GaugeField &src) { + auto &profile = getProfile(); + if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) { + profile.TPSTART(QUDA_PROFILE_D2H); + } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) { + profile.TPSTART(QUDA_PROFILE_H2D); + } + if (this == &src) return; checkField(src); @@ -1104,7 +1110,12 @@ namespace quda { staggeredPhaseApplied = src.StaggeredPhaseApplied(); staggeredPhaseType = src.StaggeredPhase(); - qudaDeviceSynchronize(); // include sync here for accurate host-device profiling + if (src.Location() != location) qudaDeviceSynchronize(); // include sync here for accurate host-device profiling + if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) { + profile.TPSTOP(QUDA_PROFILE_D2H); + } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) { + profile.TPSTOP(QUDA_PROFILE_H2D); + } } std::ostream& operator<<(std::ostream& output, const GaugeFieldParam& param) @@ -1195,11 +1206,19 @@ namespace quda { GaugeField* GaugeField::Create(const GaugeFieldParam ¶m) { return new GaugeField(param); } + GaugeField GaugeField::create_alias(const GaugeFieldParam ¶m_) + { + if (param_.init && param_.Precision() > precision) + errorQuda("Cannot create an alias to source with lower precision than the alias"); + GaugeFieldParam param = param_.init ? 
param_ : GaugeFieldParam(*this); + param.create = QUDA_REFERENCE_FIELD_CREATE; + return GaugeField(param); + } + // helper for creating extended gauge fields GaugeField *createExtendedGauge(GaugeField &in, const lat_dim_t &R, TimeProfile &profile, bool redundant_comms, QudaReconstructType recon) { - profile.TPSTART(QUDA_PROFILE_INIT); GaugeFieldParam gParamEx(in); //gParamEx.location = QUDA_CUDA_FIELD_LOCATION; gParamEx.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED; @@ -1219,8 +1238,6 @@ namespace quda { // copy input field into the extended device gauge field copyExtendedGauge(*out, in, QUDA_CUDA_FIELD_LOCATION); // wrong location if both fields cpu - profile.TPSTOP(QUDA_PROFILE_INIT); - // now fill up the halos out->exchangeExtendedGhost(R, profile, redundant_comms); diff --git a/lib/gauge_random.cu b/lib/gauge_random.cu index 0e056d305b..f3bfe8e22c 100644 --- a/lib/gauge_random.cu +++ b/lib/gauge_random.cu @@ -4,6 +4,7 @@ #include #include #include +#include "timer.h" namespace quda { @@ -55,19 +56,26 @@ namespace quda { if (U.LinkType() != QUDA_SU3_LINKS && U.LinkType() != QUDA_MOMENTUM_LINKS) errorQuda("Unexpected link type %d", U.LinkType()); + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); instantiate(U, rng, sigma); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); // ensure multi-gpu consistency if required + getProfile().TPSTART(QUDA_PROFILE_COMMS); if (U.GhostExchange() == QUDA_GHOST_EXCHANGE_EXTENDED) { U.exchangeExtendedGhost(U.R()); } else if (U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD) { U.exchangeGhost(); } + getProfile().TPSTOP(QUDA_PROFILE_COMMS); } void gaugeGauss(GaugeField &U, unsigned long long seed, double sigma) { + getProfile().TPSTART(QUDA_PROFILE_COMMS); RNG randstates(U, seed); + getProfile().TPSTOP(QUDA_PROFILE_COMMS); + gaugeGauss(U, randstates, sigma); } diff --git a/lib/gauge_update_quda.cu b/lib/gauge_update_quda.cu index 0fdcb17387..78c4b47f4a 100644 --- a/lib/gauge_update_quda.cu +++ b/lib/gauge_update_quda.cu @@ -2,6 +2,7 @@ #include 
#include #include +#include "timer.h" namespace quda { @@ -61,11 +62,13 @@ namespace quda { void updateGaugeField(GaugeField &out, double dt, const GaugeField& in, const GaugeField& mom, bool conj_mom, bool exact) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); checkPrecision(out, in, mom); checkLocation(out, in, mom); checkReconstruct(out, in); if (mom.Reconstruct() != QUDA_RECONSTRUCT_10) errorQuda("Reconstruction type %d not supported", mom.Reconstruct()); instantiate(out, in, mom, dt, conj_mom, exact); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } } // namespace quda diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 36a252d809..a8351fd35e 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -101,7 +101,7 @@ CloverField *cloverPrecondition = nullptr; CloverField *cloverRefinement = nullptr; CloverField *cloverEigensolver = nullptr; -GaugeField *momResident = nullptr; +GaugeField momResident; GaugeField *extendedGaugeResident = nullptr; std::vector solutionResident; @@ -1379,8 +1379,6 @@ void endQuda(void) solutionResident.clear(); - if(momResident) delete momResident; - LatticeField::freeGhostBuffer(); ColorSpinorField::freeGhostBuffer(); FieldTmp::destroy(); @@ -4011,72 +4009,38 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param) int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int* path_length, double* loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam* qudaGaugeParam) { - profileGaugeForce.TPSTART(QUDA_PROFILE_TOTAL); - profileGaugeForce.TPSTART(QUDA_PROFILE_INIT); - + pushProfile(profileGaugeForce); checkGaugeParam(qudaGaugeParam); GaugeFieldParam gParam(*qudaGaugeParam, siteLink); gParam.location = QUDA_CPU_FIELD_LOCATION; - gParam.site_offset = qudaGaugeParam->gauge_offset; - gParam.site_size = qudaGaugeParam->site_size; - GaugeField *cpuSiteLink = (!qudaGaugeParam->use_resident_gauge) ? 
new GaugeField(gParam) : nullptr; - - GaugeField* cudaSiteLink = nullptr; - - if (qudaGaugeParam->use_resident_gauge) { - if (!gaugePrecise) errorQuda("No resident gauge field to use"); - cudaSiteLink = gaugePrecise; - } else { - gParam.create = QUDA_NULL_FIELD_CREATE; - gParam.reconstruct = qudaGaugeParam->reconstruct; - gParam.setPrecision(qudaGaugeParam->cuda_prec, true); - gParam.location = QUDA_CUDA_FIELD_LOCATION; - - cudaSiteLink = new GaugeField(gParam); - profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT); - - profileGaugeForce.TPSTART(QUDA_PROFILE_H2D); - cudaSiteLink->copy(*cpuSiteLink); - profileGaugeForce.TPSTOP(QUDA_PROFILE_H2D); + GaugeField cpuSiteLink = !qudaGaugeParam->use_resident_gauge ? GaugeField(gParam) : GaugeField(); - profileGaugeForce.TPSTART(QUDA_PROFILE_INIT); - } + if (qudaGaugeParam->use_resident_gauge && !gaugePrecise) errorQuda("No resident gauge field to use"); + gParam.create = QUDA_COPY_FIELD_CREATE; + gParam.field = &cpuSiteLink; + gParam.reconstruct = qudaGaugeParam->reconstruct; + gParam.setPrecision(qudaGaugeParam->cuda_prec, true); + gParam.location = QUDA_CUDA_FIELD_LOCATION; + GaugeField cudaSiteLink = qudaGaugeParam->use_resident_gauge ? gaugePrecise->create_alias() : GaugeField(gParam); GaugeFieldParam gParamMom(*qudaGaugeParam, mom, QUDA_ASQTAD_MOM_LINKS); gParamMom.location = QUDA_CPU_FIELD_LOCATION; - if (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER) - gParamMom.reconstruct = QUDA_RECONSTRUCT_NO; - else - gParamMom.reconstruct = QUDA_RECONSTRUCT_10; - gParamMom.site_offset = qudaGaugeParam->mom_offset; - gParamMom.site_size = qudaGaugeParam->site_size; - GaugeField* cpuMom = (!qudaGaugeParam->use_resident_mom) ? new GaugeField(gParamMom) : nullptr; + GaugeField cpuMom = !qudaGaugeParam->use_resident_mom ? 
GaugeField(gParamMom) : GaugeField(); - GaugeField* cudaMom = nullptr; - if (qudaGaugeParam->use_resident_mom) { - if (!momResident) errorQuda("No resident momentum field to use"); - cudaMom = momResident; - if (qudaGaugeParam->overwrite_mom) cudaMom->zero(); - profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT); - } else { - gParamMom.location = QUDA_CUDA_FIELD_LOCATION; - gParamMom.create = qudaGaugeParam->overwrite_mom ? QUDA_ZERO_FIELD_CREATE : QUDA_NULL_FIELD_CREATE; - gParamMom.reconstruct = QUDA_RECONSTRUCT_10; - gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS; - gParamMom.setPrecision(qudaGaugeParam->cuda_prec, true); - gParamMom.create = QUDA_ZERO_FIELD_CREATE; - cudaMom = new GaugeField(gParamMom); - profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT); - if (!qudaGaugeParam->overwrite_mom) { - profileGaugeForce.TPSTART(QUDA_PROFILE_H2D); - cudaMom->copy(*cpuMom); - profileGaugeForce.TPSTOP(QUDA_PROFILE_H2D); - } - } + if (qudaGaugeParam->use_resident_mom && !momResident.Volume()) errorQuda("No resident momentum field to use"); + gParamMom.location = QUDA_CUDA_FIELD_LOCATION; + gParamMom.create = qudaGaugeParam->overwrite_mom ? QUDA_ZERO_FIELD_CREATE : QUDA_COPY_FIELD_CREATE; + gParamMom.field = &cpuMom; + gParamMom.reconstruct = QUDA_RECONSTRUCT_10; + gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS; + gParamMom.setPrecision(qudaGaugeParam->cuda_prec, true); - GaugeField *cudaGauge = createExtendedGauge(*cudaSiteLink, R, profileGaugeForce); + GaugeField cudaMom = qudaGaugeParam->use_resident_mom ? 
momResident.create_alias() : GaugeField(gParamMom); + if (qudaGaugeParam->use_resident_mom && qudaGaugeParam->overwrite_mom) cudaMom.zero(); + + GaugeField *cudaGauge = createExtendedGauge(cudaSiteLink, R, profileGaugeForce); // apply / remove phase as appropriate if (cudaGauge->StaggeredPhaseApplied()) cudaGauge->removeStaggeredPhase(); @@ -4095,41 +4059,26 @@ int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int // actually do the computation profileGaugeForce.TPSTART(QUDA_PROFILE_COMPUTE); if (!forceMonitor()) { - gaugeForce(*cudaMom, *cudaGauge, eb3, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length); + gaugeForce(cudaMom, *cudaGauge, eb3, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length); } else { // if we are monitoring the force, separate the force computation from the momentum update - GaugeFieldParam gParam(*cudaMom); + GaugeFieldParam gParam(cudaMom); gParam.create = QUDA_ZERO_FIELD_CREATE; - GaugeField *force = GaugeField::Create(gParam); - gaugeForce(*force, *cudaGauge, 1.0, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length); - updateMomentum(*cudaMom, eb3, *force, "gauge"); - delete force; + GaugeField force(gParam); + gaugeForce(force, *cudaGauge, 1.0, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length); + updateMomentum(cudaMom, eb3, force, "gauge"); } profileGaugeForce.TPSTOP(QUDA_PROFILE_COMPUTE); - if (qudaGaugeParam->return_result_mom) { - profileGaugeForce.TPSTART(QUDA_PROFILE_D2H); - cpuMom->copy(*cudaMom); - profileGaugeForce.TPSTOP(QUDA_PROFILE_D2H); - } + if (qudaGaugeParam->return_result_mom) cpuMom.copy(cudaMom); - profileGaugeForce.TPSTART(QUDA_PROFILE_FREE); if (qudaGaugeParam->make_resident_gauge) { - if (gaugePrecise && gaugePrecise != cudaSiteLink) freeUniqueGaugeQuda(QUDA_WILSON_LINKS); - gaugePrecise = cudaSiteLink; - } else { - delete cudaSiteLink; - } - - if (qudaGaugeParam->make_resident_mom) { - if (momResident && momResident != cudaMom) delete 
momResident; - momResident = cudaMom; - } else { - delete cudaMom; + if (gaugePrecise && !qudaGaugeParam->use_resident_gauge) freeUniqueGaugeQuda(QUDA_WILSON_LINKS); + std::exchange(*gaugePrecise, cudaSiteLink); } - if (cpuSiteLink) delete cpuSiteLink; - if (cpuMom) delete cpuMom; + if (qudaGaugeParam->make_resident_mom && !qudaGaugeParam->use_resident_gauge) std::exchange(momResident, cudaMom); + else momResident = GaugeField(); if (qudaGaugeParam->make_resident_gauge) { if (extendedGaugeResident) delete extendedGaugeResident; @@ -4137,24 +4086,19 @@ int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int } else { delete cudaGauge; } - profileGaugeForce.TPSTOP(QUDA_PROFILE_FREE); - profileGaugeForce.TPSTOP(QUDA_PROFILE_TOTAL); + popProfile(); return 0; } int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int *path_length, double *loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam *qudaGaugeParam) { - profileGaugePath.TPSTART(QUDA_PROFILE_TOTAL); - profileGaugePath.TPSTART(QUDA_PROFILE_INIT); - + pushProfile(profileGaugePath); checkGaugeParam(qudaGaugeParam); GaugeFieldParam gParam(*qudaGaugeParam, siteLink); gParam.location = QUDA_CPU_FIELD_LOCATION; - gParam.site_offset = qudaGaugeParam->gauge_offset; - gParam.site_size = qudaGaugeParam->site_size; GaugeField *cpuSiteLink = (!qudaGaugeParam->use_resident_gauge) ? 
new GaugeField(gParam) : nullptr; GaugeField *cudaSiteLink = nullptr; @@ -4169,30 +4113,19 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int * gParam.setPrecision(qudaGaugeParam->cuda_prec, true); cudaSiteLink = new GaugeField(gParam); - profileGaugePath.TPSTOP(QUDA_PROFILE_INIT); - - profileGaugePath.TPSTART(QUDA_PROFILE_H2D); cudaSiteLink->copy(*cpuSiteLink); - profileGaugePath.TPSTOP(QUDA_PROFILE_H2D); - - profileGaugePath.TPSTART(QUDA_PROFILE_INIT); } GaugeFieldParam gParamOut(*qudaGaugeParam, out); gParamOut.location = QUDA_CPU_FIELD_LOCATION; - gParamOut.site_offset = qudaGaugeParam->gauge_offset; - gParamOut.site_size = qudaGaugeParam->site_size; GaugeField *cpuOut = new GaugeField(gParamOut); gParamOut.location = QUDA_CUDA_FIELD_LOCATION; gParamOut.create = qudaGaugeParam->overwrite_gauge ? QUDA_ZERO_FIELD_CREATE : QUDA_NULL_FIELD_CREATE; gParamOut.reconstruct = QUDA_RECONSTRUCT_NO; gParamOut.setPrecision(qudaGaugeParam->cuda_prec, true); GaugeField *cudaOut = new GaugeField(gParamOut); - profileGaugePath.TPSTOP(QUDA_PROFILE_INIT); if (!qudaGaugeParam->overwrite_gauge) { - profileGaugePath.TPSTART(QUDA_PROFILE_H2D); cudaOut->copy(*cpuOut); - profileGaugePath.TPSTOP(QUDA_PROFILE_H2D); } GaugeField *cudaGauge = createExtendedGauge(*cudaSiteLink, R, profileGaugePath); @@ -4216,11 +4149,8 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int * gaugePath(*cudaOut, *cudaGauge, eb3, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length); profileGaugePath.TPSTOP(QUDA_PROFILE_COMPUTE); - profileGaugePath.TPSTART(QUDA_PROFILE_D2H); cpuOut->copy(*cudaOut); - profileGaugePath.TPSTOP(QUDA_PROFILE_D2H); - profileGaugePath.TPSTART(QUDA_PROFILE_FREE); if (qudaGaugeParam->make_resident_gauge) { if (gaugePrecise && gaugePrecise != cudaSiteLink) freeUniqueGaugeQuda(QUDA_WILSON_LINKS); gaugePrecise = cudaSiteLink; @@ -4235,66 +4165,46 @@ int computeGaugePathQuda(void *out, void *siteLink, int 
***input_path_buf, int * if (cpuSiteLink) delete cpuSiteLink; if (cpuOut) delete cpuOut; - profileGaugePath.TPSTOP(QUDA_PROFILE_FREE); - profileGaugePath.TPSTOP(QUDA_PROFILE_TOTAL); + popProfile(); return 0; } void momResidentQuda(void *mom, QudaGaugeParam *param) { - profileGaugeForce.TPSTART(QUDA_PROFILE_TOTAL); - profileGaugeForce.TPSTART(QUDA_PROFILE_INIT); - + pushProfile(profileGaugeForce); checkGaugeParam(param); GaugeFieldParam gParamMom(*param, mom, QUDA_ASQTAD_MOM_LINKS); gParamMom.location = QUDA_CPU_FIELD_LOCATION; - if (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER) - gParamMom.reconstruct = QUDA_RECONSTRUCT_NO; - else - gParamMom.reconstruct = QUDA_RECONSTRUCT_10; - gParamMom.site_offset = param->mom_offset; - gParamMom.site_size = param->site_size; GaugeField cpuMom(gParamMom); if (param->make_resident_mom && !param->return_result_mom) { - if (momResident) delete momResident; gParamMom.location = QUDA_CUDA_FIELD_LOCATION; gParamMom.create = QUDA_NULL_FIELD_CREATE; gParamMom.reconstruct = QUDA_RECONSTRUCT_10; gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS; gParamMom.setPrecision(param->cuda_prec, true); gParamMom.create = QUDA_ZERO_FIELD_CREATE; - momResident = new GaugeField(gParamMom); + momResident = GaugeField(gParamMom); } else if (param->return_result_mom && !param->make_resident_mom) { - if (!momResident) errorQuda("No resident momentum to return"); + if (!momResident.Volume()) errorQuda("No resident momentum to return"); } else { errorQuda("Unexpected combination make_resident_mom = %d return_result_mom = %d", param->make_resident_mom, param->return_result_mom); } - profileGaugeForce.TPSTOP(QUDA_PROFILE_INIT); - if (param->make_resident_mom) { // we are downloading the momentum from the host - profileGaugeForce.TPSTART(QUDA_PROFILE_H2D); - momResident->copy(cpuMom); - profileGaugeForce.TPSTOP(QUDA_PROFILE_H2D); + momResident.copy(cpuMom); } else if (param->return_result_mom) { // we are uploading the 
momentum to the host - profileGaugeForce.TPSTART(QUDA_PROFILE_D2H); - cpuMom.copy(*momResident); - profileGaugeForce.TPSTOP(QUDA_PROFILE_D2H); - - profileGaugeForce.TPSTART(QUDA_PROFILE_FREE); - delete momResident; - momResident = nullptr; - profileGaugeForce.TPSTOP(QUDA_PROFILE_FREE); + cpuMom.copy(momResident); + momResident = GaugeField(); } - profileGaugeForce.TPSTOP(QUDA_PROFILE_TOTAL); + popProfile(); } void createCloverQuda(QudaInvertParam* invertParam) @@ -4381,8 +4291,7 @@ void destroyGaugeFieldQuda(void *gauge) void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, void **, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param) { - profileStaggeredForce.TPSTART(QUDA_PROFILE_TOTAL); - profileStaggeredForce.TPSTART(QUDA_PROFILE_INIT); + pushProfile(profileStaggeredForce); GaugeFieldParam gParam(*gauge_param, h_mom, QUDA_ASQTAD_MOM_LINKS); @@ -4393,12 +4302,14 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi GaugeField cpuMom(gParam); // create the device momentum field + if (gauge_param->use_resident_mom && !momResident.Volume()) errorQuda("Cannot use resident momentum field since none appears resident"); gParam.location = QUDA_CUDA_FIELD_LOCATION; gParam.link_type = QUDA_ASQTAD_MOM_LINKS; - gParam.create = QUDA_ZERO_FIELD_CREATE; // FIXME + gParam.create = QUDA_COPY_FIELD_CREATE; + gParam.field = &cpuMom; gParam.order = QUDA_FLOAT2_GAUGE_ORDER; gParam.reconstruct = QUDA_RECONSTRUCT_10; - GaugeField *cudaMom = !gauge_param->use_resident_mom ? new GaugeField(gParam) : nullptr; + GaugeField cudaMom = gauge_param->use_resident_mom ? 
momResident.create_alias() : GaugeField(gParam); // create temporary field for quark-field outer product gParam.reconstruct = QUDA_RECONSTRUCT_NO; @@ -4407,6 +4318,7 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi GaugeField cudaForce(gParam); GaugeField *cudaForce_[2] = {&cudaForce}; + profileStaggeredForce.TPSTART(QUDA_PROFILE_INIT); ColorSpinorParam qParam; qParam.location = QUDA_CUDA_FIELD_LOCATION; qParam.nColor = 3; @@ -4421,25 +4333,11 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi qParam.x[4] = 1; qParam.create = QUDA_NULL_FIELD_CREATE; qParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; - profileStaggeredForce.TPSTOP(QUDA_PROFILE_INIT); - profileStaggeredForce.TPSTART(QUDA_PROFILE_H2D); - - if (gauge_param->use_resident_mom) { - if (!momResident) errorQuda("Cannot use resident momentum field since none appears resident"); - cudaMom = momResident; - } else { - // download the initial momentum (FIXME make an option just to return?) 
- cudaMom->copy(cpuMom); - } // resident gauge field is required - if (!gauge_param->use_resident_gauge || !gaugePrecise) - errorQuda("Resident gauge field is required"); - - if (!gaugePrecise->StaggeredPhaseApplied()) { - errorQuda("Gauge field requires the staggered phase factors to be applied"); - } + if (!gauge_param->use_resident_gauge || !gaugePrecise) errorQuda("Resident gauge field is required"); + if (!gaugePrecise->StaggeredPhaseApplied()) errorQuda("Gauge field requires the staggered phase factors to be applied"); // check if staggered phase is the desired one if (gauge_param->staggered_phase_type != gaugePrecise->StaggeredPhase()) { @@ -4447,12 +4345,11 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi gauge_param->staggered_phase_type, gaugePrecise->StaggeredPhase()); } - profileStaggeredForce.TPSTOP(QUDA_PROFILE_H2D); profileStaggeredForce.TPSTART(QUDA_PROFILE_INIT); const int nvector = inv_param->num_offset; std::vector X(nvector); - for ( int i=0; iuse_resident_solution) { if (solutionResident.size() < (unsigned int)nvector) @@ -4484,15 +4381,13 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi } profileStaggeredForce.TPSTOP(QUDA_PROFILE_PREAMBLE); - profileStaggeredForce.TPSTART(QUDA_PROFILE_FREE); + profileStaggeredForce.TPSTART(QUDA_PROFILE_FREE); #if 0 if (inv_param->use_resident_solution) solutionResident.clear(); #endif delete dirac; - profileStaggeredForce.TPSTOP(QUDA_PROFILE_FREE); - profileStaggeredForce.TPSTART(QUDA_PROFILE_COMPUTE); // compute quark-field outer product for (int i=0; ireturn_result_mom) cpuMom.copy(cudaMom); - if (gauge_param->return_result_mom) { - // copy the momentum field back to the host - cpuMom.copy(*cudaMom); - } + if (gauge_param->make_resident_mom && !gauge_param->use_resident_mom) std::exchange(momResident, cudaMom); + else momResident = GaugeField(); - if (gauge_param->make_resident_mom) { - // make the momentum field resident - momResident = 
cudaMom; - } else { - delete cudaMom; - } - - profileStaggeredForce.TPSTOP(QUDA_PROFILE_D2H); profileStaggeredForce.TPSTART(QUDA_PROFILE_FREE); - for (int i=0; igauge_order != QUDA_MILC_GAUGE_ORDER) errorQuda("Unsupported input field order %d", gParam->gauge_order); checkGaugeParam(gParam); @@ -4768,9 +4653,7 @@ void computeHISQForceQuda(void* const milc_momentum, GaugeField *cudaWLink = new GaugeField(wParam); profileHISQForce.TPSTOP(QUDA_PROFILE_INIT); - profileHISQForce.TPSTART(QUDA_PROFILE_H2D); cudaWLink->copy(cpuWLink); - profileHISQForce.TPSTOP(QUDA_PROFILE_H2D); cudaWLink->exchangeExtendedGhost(cudaWLink->R(), profileHISQForce); @@ -4816,9 +4699,7 @@ void computeHISQForceQuda(void* const milc_momentum, GaugeField *cudaVLink = new GaugeField(vParam); profileHISQForce.TPSTOP(QUDA_PROFILE_INIT); - profileHISQForce.TPSTART(QUDA_PROFILE_H2D); cudaVLink->copy(cpuVLink); - profileHISQForce.TPSTOP(QUDA_PROFILE_H2D); cudaVLink->exchangeExtendedGhost(cudaVLink->R(), profileHISQForce); profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE); @@ -4851,9 +4732,7 @@ void computeHISQForceQuda(void* const milc_momentum, GaugeField *cudaULink = new GaugeField(uParam); profileHISQForce.TPSTOP(QUDA_PROFILE_INIT); - profileHISQForce.TPSTART(QUDA_PROFILE_H2D); cudaULink->copy(cpuULink); - profileHISQForce.TPSTOP(QUDA_PROFILE_H2D); cudaULink->exchangeExtendedGhost(cudaULink->R(), profileHISQForce); // Compute Fat7-staple term @@ -4873,8 +4752,8 @@ void computeHISQForceQuda(void* const milc_momentum, hisqCompleteForce(*cudaOutForce, *cudaULink); if (gParam->use_resident_mom) { - if (!momResident) errorQuda("No resident momentum field to use"); - updateMomentum(*momResident, dt, *cudaOutForce, "hisq"); + if (momResident.Length()) errorQuda("No resident momentum field to use"); + updateMomentum(momResident, dt, *cudaOutForce, "hisq"); } else { updateMomentum(*cudaMom, dt, *cudaOutForce, "hisq"); } @@ -4883,27 +4762,16 @@ void computeHISQForceQuda(void* const milc_momentum, if 
(gParam->return_result_mom) { // Close the paths, make anti-hermitian, and store in compressed format - if (gParam->return_result_mom) { - profileHISQForce.TPSTART(QUDA_PROFILE_H2D); - cpuMom->copy(*cudaMom); - profileHISQForce.TPSTOP(QUDA_PROFILE_H2D); - } + if (gParam->return_result_mom) cpuMom->copy(*cudaMom); } - profileHISQForce.TPSTART(QUDA_PROFILE_FREE); - if (cpuMom) delete cpuMom; - - if (!gParam->make_resident_mom) { - delete momResident; - momResident = nullptr; - } + if (!gParam->make_resident_mom) momResident = GaugeField(); if (cudaMom) delete cudaMom; delete cudaOutForce; delete cudaULink; - profileHISQForce.TPSTOP(QUDA_PROFILE_FREE); - profileHISQForce.TPSTOP(QUDA_PROFILE_TOTAL); + popProfile(); } void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double *coeff, double kappa2, double ck, @@ -5085,106 +4953,55 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double profileCloverForce.TPSTOP(QUDA_PROFILE_TOTAL); } -void updateGaugeFieldQuda(void* gauge, - void* momentum, - double dt, - int conj_mom, - int exact, - QudaGaugeParam* param) +void updateGaugeFieldQuda(void* gauge, void* momentum, double dt, int conj_mom, int exact, QudaGaugeParam* param) { - profileGaugeUpdate.TPSTART(QUDA_PROFILE_TOTAL); - + pushProfile(profileGaugeUpdate); checkGaugeParam(param); - profileGaugeUpdate.TPSTART(QUDA_PROFILE_INIT); - // create the host fields GaugeFieldParam gParam(*param, gauge, QUDA_SU3_LINKS); gParam.location = QUDA_CPU_FIELD_LOCATION; - gParam.site_offset = param->gauge_offset; - gParam.site_size = param->site_size; bool need_cpu = !param->use_resident_gauge || param->return_result_gauge; - GaugeField *cpuGauge = need_cpu ? new GaugeField(gParam) : nullptr; + GaugeField cpuGauge = need_cpu ? GaugeField(gParam) : GaugeField(); - GaugeFieldParam gParamMom(*param, momentum); - gParamMom.reconstruct = (gParamMom.order == QUDA_TIFR_GAUGE_ORDER || gParamMom.order == QUDA_TIFR_PADDED_GAUGE_ORDER) ? 
- QUDA_RECONSTRUCT_NO : QUDA_RECONSTRUCT_10; - gParamMom.link_type = QUDA_ASQTAD_MOM_LINKS; - gParamMom.site_offset = param->mom_offset; - gParamMom.site_size = param->site_size; - GaugeField *cpuMom = !param->use_resident_mom ? new GaugeField(gParamMom) : nullptr; + GaugeFieldParam gParamMom(*param, momentum, QUDA_ASQTAD_MOM_LINKS); + GaugeField cpuMom = !param->use_resident_mom ? GaugeField(gParamMom) : GaugeField(); // create the device fields + if (param->use_resident_mom && !momResident.Volume()) errorQuda("No resident mom field allocated"); gParam.location = QUDA_CUDA_FIELD_LOCATION; - gParam.create = QUDA_NULL_FIELD_CREATE; + gParam.create = QUDA_COPY_FIELD_CREATE; + gParam.field = &cpuMom; gParam.order = QUDA_FLOAT2_GAUGE_ORDER; gParam.link_type = QUDA_ASQTAD_MOM_LINKS; gParam.reconstruct = QUDA_RECONSTRUCT_10; gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO; gParam.pad = 0; - GaugeField *cudaMom = !param->use_resident_mom ? new GaugeField(gParam) : nullptr; + GaugeField cudaMom = param->use_resident_mom ? momResident.create_alias() : GaugeField(gParam); + if (param->use_resident_gauge && !gaugePrecise) errorQuda("No resident gauge field allocated"); gParam.link_type = QUDA_SU3_LINKS; gParam.reconstruct = param->reconstruct; - GaugeField *cudaInGauge = !param->use_resident_gauge ? 
new GaugeField(gParam) : nullptr; - auto *cudaOutGauge = new GaugeField(gParam); - - profileGaugeUpdate.TPSTOP(QUDA_PROFILE_INIT); - - profileGaugeUpdate.TPSTART(QUDA_PROFILE_H2D); - - if (!param->use_resident_gauge) { // load fields onto the device - cudaInGauge->copy(*cpuGauge); - } else { // or use resident fields already present - if (!gaugePrecise) errorQuda("No resident gauge field allocated"); - cudaInGauge = gaugePrecise; - gaugePrecise = nullptr; - } - - if (!param->use_resident_mom) { - cudaMom->copy(*cpuMom); - } else { - if (!momResident) errorQuda("No resident mom field allocated"); - cudaMom = momResident; - momResident = nullptr; - } - - profileGaugeUpdate.TPSTOP(QUDA_PROFILE_H2D); + gParam.field = &cpuGauge; + GaugeField u_in = param->use_resident_gauge ? gaugePrecise->create_alias() : GaugeField(gParam); + gParam.create = QUDA_NULL_FIELD_CREATE; + GaugeField u_out(gParam); // perform the update - profileGaugeUpdate.TPSTART(QUDA_PROFILE_COMPUTE); - updateGaugeField(*cudaOutGauge, dt, *cudaInGauge, *cudaMom, - (bool)conj_mom, (bool)exact); - profileGaugeUpdate.TPSTOP(QUDA_PROFILE_COMPUTE); + updateGaugeField(u_out, dt, u_in, cudaMom, (bool)conj_mom, (bool)exact); - if (param->return_result_gauge) { - // copy the gauge field back to the host - profileGaugeUpdate.TPSTART(QUDA_PROFILE_D2H); - cpuGauge->copy(*cudaOutGauge); - profileGaugeUpdate.TPSTOP(QUDA_PROFILE_D2H); - } + // copy the gauge field back to the host + if (param->return_result_gauge) cpuGauge.copy(u_out); - profileGaugeUpdate.TPSTART(QUDA_PROFILE_FREE); if (param->make_resident_gauge) { - if (gaugePrecise != nullptr) freeUniqueGaugeQuda(QUDA_WILSON_LINKS); - gaugePrecise = cudaOutGauge; - } else { - delete cudaOutGauge; - } - - if (param->make_resident_mom) { - if (momResident != nullptr && momResident != cudaMom) delete momResident; - momResident = cudaMom; - } else { - delete cudaMom; + if (gaugePrecise && !param->use_resident_gauge) freeUniqueGaugeQuda(QUDA_WILSON_LINKS); + 
std::exchange(*gaugePrecise, u_out); } - delete cudaInGauge; - if (cpuMom) delete cpuMom; - if (cpuGauge) delete cpuGauge; + if (param->make_resident_mom && !param->use_resident_mom) std::exchange(momResident, cudaMom); + else momResident = GaugeField(); - profileGaugeUpdate.TPSTOP(QUDA_PROFILE_FREE); - profileGaugeUpdate.TPSTOP(QUDA_PROFILE_TOTAL); + popProfile(); } void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param) { @@ -5196,8 +5013,6 @@ void updateGaugeFieldQuda(void* gauge, // create the gauge field GaugeFieldParam gParam(*param, gauge_h, QUDA_GENERAL_LINKS); gParam.location = QUDA_CPU_FIELD_LOCATION; - gParam.site_offset = param->gauge_offset; - gParam.site_size = param->site_size; bool need_cpu = !param->use_resident_gauge || param->return_result_gauge; GaugeField *cpuGauge = need_cpu ? new GaugeField(gParam) : nullptr; @@ -5313,96 +5128,55 @@ void updateGaugeFieldQuda(void* gauge, // evaluate the momentum action double momActionQuda(void* momentum, QudaGaugeParam* param) { - profileMomAction.TPSTART(QUDA_PROFILE_TOTAL); - - profileMomAction.TPSTART(QUDA_PROFILE_INIT); + pushProfile(profileMomAction); checkGaugeParam(param); // create the momentum fields GaugeFieldParam gParam(*param, momentum, QUDA_ASQTAD_MOM_LINKS); gParam.location = QUDA_CPU_FIELD_LOCATION; - gParam.reconstruct = (gParam.order == QUDA_TIFR_GAUGE_ORDER || gParam.order == QUDA_TIFR_PADDED_GAUGE_ORDER) ? - QUDA_RECONSTRUCT_NO : QUDA_RECONSTRUCT_10; - gParam.site_offset = param->mom_offset; - gParam.site_size = param->site_size; - - GaugeField *cpuMom = !param->use_resident_mom ? new GaugeField(gParam) : nullptr; + GaugeField cpuMom = !param->use_resident_mom ? 
GaugeField(gParam) : GaugeField(); // create the device fields gParam.location = QUDA_CUDA_FIELD_LOCATION; - gParam.create = QUDA_NULL_FIELD_CREATE; + gParam.field = &cpuMom; + gParam.create = QUDA_COPY_FIELD_CREATE; gParam.reconstruct = QUDA_RECONSTRUCT_10; gParam.setPrecision(param->cuda_prec, true); - GaugeField *cudaMom = !param->use_resident_mom ? new GaugeField(gParam) : nullptr; - - profileMomAction.TPSTOP(QUDA_PROFILE_INIT); - - profileMomAction.TPSTART(QUDA_PROFILE_H2D); - if (!param->use_resident_mom) { - cudaMom->copy(*cpuMom); - } else { - if (!momResident) errorQuda("No resident mom field allocated"); - cudaMom = momResident; - } - profileMomAction.TPSTOP(QUDA_PROFILE_H2D); + if (param->use_resident_mom && !momResident.Volume()) errorQuda("No resident mom field allocated"); + GaugeField cudaMom = param->use_resident_mom ? momResident.create_alias() : GaugeField(gParam); // perform the update - profileMomAction.TPSTART(QUDA_PROFILE_COMPUTE); - double action = computeMomAction(*cudaMom); - profileMomAction.TPSTOP(QUDA_PROFILE_COMPUTE); - - profileMomAction.TPSTART(QUDA_PROFILE_FREE); - if (param->make_resident_mom) { - if (momResident != nullptr && momResident != cudaMom) delete momResident; - momResident = cudaMom; - } else { - delete cudaMom; - momResident = nullptr; - } - if (cpuMom) { - delete cpuMom; - } + double action = computeMomAction(cudaMom); - profileMomAction.TPSTOP(QUDA_PROFILE_FREE); - profileMomAction.TPSTOP(QUDA_PROFILE_TOTAL); + if (param->make_resident_mom && !param->use_resident_gauge) std::exchange(momResident, cudaMom); + else momResident = GaugeField(); + popProfile(); return action; } void gaussGaugeQuda(unsigned long long seed, double sigma) { - profileGauss.TPSTART(QUDA_PROFILE_TOTAL); + pushProfile(profileGauss); if (!gaugePrecise) errorQuda("Cannot generate Gauss GaugeField as there is no resident gauge field"); - - GaugeField *data = gaugePrecise; - - profileGauss.TPSTART(QUDA_PROFILE_COMPUTE); - quda::gaugeGauss(*data, seed, 
sigma); - profileGauss.TPSTOP(QUDA_PROFILE_COMPUTE); + quda::gaugeGauss(*gaugePrecise, seed, sigma); if (extendedGaugeResident) { extendedGaugeResident->copy(*gaugePrecise); extendedGaugeResident->exchangeExtendedGhost(R, profileGauss, redundant_comms); } - profileGauss.TPSTOP(QUDA_PROFILE_TOTAL); + popProfile(); } void gaussMomQuda(unsigned long long seed, double sigma) { - profileGauss.TPSTART(QUDA_PROFILE_TOTAL); - - if (!momResident) errorQuda("Cannot generate Gauss GaugeField as there is no resident momentum field"); - - GaugeField *data = momResident; - - profileGauss.TPSTART(QUDA_PROFILE_COMPUTE); - quda::gaugeGauss(*data, seed, sigma); - profileGauss.TPSTOP(QUDA_PROFILE_COMPUTE); - - profileGauss.TPSTOP(QUDA_PROFILE_TOTAL); + pushProfile(profileGauss); + if (!momResident.Volume()) errorQuda("Cannot generate Gauss GaugeField as there is no resident momentum field"); + quda::gaugeGauss(momResident, seed, sigma); + popProfile(); } /* @@ -5810,8 +5584,6 @@ int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const u GaugeFieldParam gParam(*param, gauge); gParam.location = QUDA_CPU_FIELD_LOCATION; - gParam.site_offset = param->gauge_offset; - gParam.site_size = param->site_size; auto *cpuGauge = new GaugeField(gParam); gParam.create = QUDA_NULL_FIELD_CREATE; @@ -5888,8 +5660,6 @@ int computeGaugeFixingFFTQuda(void* gauge, const unsigned int gauge_dir, const GaugeFieldParam gParam(*param, gauge); gParam.location = QUDA_CPU_FIELD_LOCATION; - gParam.site_offset = param->gauge_offset; - gParam.site_size = param->site_size; auto *cpuGauge = new GaugeField(gParam); gParam.create = QUDA_NULL_FIELD_CREATE; diff --git a/lib/momentum.cu b/lib/momentum.cu index 78a981e509..7a574687ca 100644 --- a/lib/momentum.cu +++ b/lib/momentum.cu @@ -9,6 +9,7 @@ #include #include #include +#include "timer.h" namespace quda { @@ -92,9 +93,11 @@ namespace quda { }; double computeMomAction(const GaugeField& mom) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); if 
(!mom.isNative()) errorQuda("Unsupported output ordering: %d\n", mom.Order()); double action = 0.0; instantiate(mom, action); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); return action; } @@ -132,11 +135,13 @@ namespace quda { void updateMomentum(GaugeField &mom, double coeff, GaugeField &force, const char *fname) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); if (mom.Reconstruct() != QUDA_RECONSTRUCT_10) errorQuda("Momentum field with reconstruct %d not supported", mom.Reconstruct()); checkPrecision(mom, force); instantiate(force, mom, coeff, fname); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } template @@ -173,9 +178,11 @@ namespace quda { void applyU(GaugeField &force, GaugeField &U) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); if (!force.isNative()) errorQuda("Unsupported output ordering: %d\n", force.Order()); checkPrecision(force, U); instantiate(U, force); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } } // namespace quda diff --git a/lib/staggered_oprod.cu b/lib/staggered_oprod.cu index 8f4943061b..1f9cbcccf7 100644 --- a/lib/staggered_oprod.cu +++ b/lib/staggered_oprod.cu @@ -2,6 +2,7 @@ #include #include #include +#include "timer.h" namespace quda { @@ -106,6 +107,7 @@ namespace quda { #ifdef GPU_STAGGERED_DIRAC void computeStaggeredOprod(GaugeField *out[], ColorSpinorField& in, const double coeff[], int nFace) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); if (nFace == 1) { computeStaggeredOprod(*out[0], *out[0], in.Even(), in.Odd(), 0, coeff, nFace); double coeff_[2] = {-coeff[0],0.0}; // need to multiply by -1 on odd sites @@ -116,6 +118,7 @@ namespace quda { } else { errorQuda("Invalid nFace=%d", nFace); } + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } #else // GPU_STAGGERED_DIRAC not defined void computeStaggeredOprod(GaugeField *[], ColorSpinorField &, const double [], int) diff --git a/lib/targets/cuda/malloc.cpp b/lib/targets/cuda/malloc.cpp index d4f4de254d..1f78d936bc 100644 --- a/lib/targets/cuda/malloc.cpp +++ 
b/lib/targets/cuda/malloc.cpp @@ -7,6 +7,7 @@ #include #include #include +#include "timer.h" #ifdef USE_QDPJIT #include "qdp_cache.h" @@ -795,6 +796,7 @@ namespace quda size(size), pool(pool) { + getProfile().TPSTART(QUDA_PROFILE_INIT); if (pool && (type != QUDA_MEMORY_DEVICE && type != QUDA_MEMORY_HOST_PINNED && type != QUDA_MEMORY_HOST)) errorQuda("Memory pool not available for memory type %d", type); @@ -823,11 +825,13 @@ namespace quda default: errorQuda("Unknown memory type %d", type); } } + getProfile().TPSTOP(QUDA_PROFILE_INIT); } quda_ptr::quda_ptr(void *ptr, QudaMemoryType type) : type(type) { + getProfile().TPSTART(QUDA_PROFILE_INIT); switch (type) { case QUDA_MEMORY_DEVICE: case QUDA_MEMORY_DEVICE_PINNED: @@ -845,6 +849,7 @@ namespace quda break; default: errorQuda("Unsupported memory type %d", type); } + getProfile().TPSTOP(QUDA_PROFILE_INIT); } quda_ptr& quda_ptr::operator=(quda_ptr &&other) @@ -861,6 +866,8 @@ namespace quda quda_ptr::~quda_ptr() { + getProfile().TPSTART(QUDA_PROFILE_FREE); + if (size > 0) { switch (type) { case QUDA_MEMORY_DEVICE: pool ? 
pool_device_free(device) : device_free(device); break; @@ -874,6 +881,8 @@ namespace quda device = nullptr; host = nullptr; + + getProfile().TPSTOP(QUDA_PROFILE_FREE); } bool quda_ptr::is_device() const diff --git a/lib/timer.cpp b/lib/timer.cpp index e8e427fd74..c4e924ee6e 100644 --- a/lib/timer.cpp +++ b/lib/timer.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -113,4 +114,25 @@ namespace quda { } } + static std::stack tpstack; + + void pushProfile(TimeProfile &profile) + { + profile.TPSTART(QUDA_PROFILE_TOTAL); + tpstack.push(&profile); + } + + void popProfile() + { + if (tpstack.empty()) errorQuda("popProfile() called with empty stack"); + auto &profile = *(tpstack.top()); + tpstack.pop(); + profile.TPSTOP(QUDA_PROFILE_TOTAL); + } + + TimeProfile& getProfile() + { + if (tpstack.empty()) return dummy; + return *(tpstack.top()); + } } From 3db98e143a2fde6ff8e4720414e2f72230149952 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 19 May 2023 10:24:56 -0700 Subject: [PATCH 12/99] Continued to add auto profiling support and GaugeField cleanup to various QUDA interfaces. Add ref counting support to the profiling, to allow for multiple starts without throwing an error: if a timer has already been started we simply increment the ref counter and return. 
Profiling now performs a device sync if the type is H2D, D2H or COMPUTE: this negates the need to use explicit synchronization and ensures accurate profiling --- include/gauge_tools.h | 3 +- include/quda.h | 7 +- include/quda_internal.h | 1 + include/timer.h | 17 +- lib/clover_field.cpp | 2 - lib/color_spinor_field.cpp | 1 - lib/contract.cu | 2 + lib/gauge_ape.cu | 2 + lib/gauge_field.cpp | 2 +- lib/gauge_fix_fft.cu | 2 + lib/gauge_fix_ovr.cu | 2 + lib/gauge_force.cu | 4 + lib/gauge_loop_trace.cu | 2 + lib/gauge_observable.cpp | 19 +- lib/gauge_plaq.cu | 2 + lib/gauge_qcharge.cu | 4 + lib/gauge_stout.cu | 4 + lib/gauge_wilson_flow.cu | 2 + lib/hisq_paths_force_quda.cu | 37 ++- lib/interface_quda.cpp | 506 ++++++++++++----------------------- lib/milc_interface.cpp | 19 +- lib/staggered_oprod.cu | 1 - lib/unitarize_force_quda.cu | 2 + lib/unitarize_links_quda.cu | 4 + 24 files changed, 249 insertions(+), 398 deletions(-) diff --git a/include/gauge_tools.h b/include/gauge_tools.h index 503c20bc9f..9b7d68db37 100644 --- a/include/gauge_tools.h +++ b/include/gauge_tools.h @@ -9,9 +9,8 @@ namespace quda * @param[in] Gauge field upon which we are measuring. * @param[in,out] param Parameter struct that defines which * observables we are making and the resulting observables. - * @param[in] profile TimeProfile instance used for profiling. */ - void gaugeObservables(GaugeField &u, QudaGaugeObservableParam ¶m, TimeProfile &profile); + void gaugeObservables(GaugeField &u, QudaGaugeObservableParam ¶m); /** * @brief Project the input gauge field onto the SU(3) group. 
This diff --git a/include/quda.h b/include/quda.h index b697ef7400..cb22e50033 100644 --- a/include/quda.h +++ b/include/quda.h @@ -1673,12 +1673,11 @@ extern "C" { * @param[in] reunit_interval, reunitarize gauge field when iteration count is a multiple of this * @param[in] stopWtheta, 0 for MILC criterion and 1 to use the theta value * @param[in] param The parameters of the external fields and the computation settings - * @param[out] timeinfo */ int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, const unsigned int verbose_interval, const double relax_boost, const double tolerance, const unsigned int reunit_interval, const unsigned int stopWtheta, - QudaGaugeParam *param, double *timeinfo); + QudaGaugeParam *param); /** * @brief Gauge fixing with Steepest descent method with FFTs with support for single GPU only. @@ -1692,12 +1691,10 @@ extern "C" { * iteration reachs the maximum number of steps defined by Nsteps * @param[in] stopWtheta, 0 for MILC criterion and 1 to use the theta value * @param[in] param The parameters of the external fields and the computation settings - * @param[out] timeinfo */ int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, const unsigned int verbose_interval, const double alpha, const unsigned int autotune, - const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param, - double *timeinfo); + const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param); /** * @brief Strided Batched GEMM diff --git a/include/quda_internal.h b/include/quda_internal.h index 756d5822e0..dd8a6c8177 100644 --- a/include/quda_internal.h +++ b/include/quda_internal.h @@ -49,6 +49,7 @@ #include #include #include +#include "timer.h" namespace quda { diff --git a/include/timer.h b/include/timer.h index 20b9df45ff..2475fee154 100644 --- a/include/timer.h +++ b/include/timer.h @@ -65,15 +65,16 @@ namespace quda { } } + int ref_count = 0; + 
/** @brief Start the timer */ - void start(const char *func = nullptr, const char *file = nullptr, int line = 0) + void start(const char * = nullptr, const char * = nullptr, int = 0) { - if (running) { - printfQuda("ERROR: Cannot start an already running timer (%s:%d in %s())", file ? file : "", line, - func ? func : ""); - errorQuda("Aborting"); + if (running) { // if the timer has already started, we increment the ref counter and return + ref_count++; + return; } if (!device) { gettimeofday(&host_start, NULL); @@ -110,6 +111,10 @@ namespace quda { */ void stop(const char *func = nullptr, const char *file = nullptr, int line = 0) { + if (ref_count > 0) { + ref_count--; + return; + } peek(func, file, line); time += last_interval; count++; @@ -271,6 +276,8 @@ namespace quda { } void Stop_(const char *func, const char *file, int line, QudaProfileType idx) { + if (idx == QUDA_PROFILE_COMPUTE || idx == QUDA_PROFILE_H2D || idx == QUDA_PROFILE_D2H) + qudaDeviceSynchronize(); // ensure accurate profiling profile[idx].stop(func, file, line); POP_RANGE diff --git a/lib/clover_field.cpp b/lib/clover_field.cpp index bb952ba324..46394c332b 100644 --- a/lib/clover_field.cpp +++ b/lib/clover_field.cpp @@ -257,8 +257,6 @@ namespace quda { pool_device_free(packClover); } } - - qudaDeviceSynchronize(); } void CloverField::copy(const CloverField &src) diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp index 73417b4462..96df00ba55 100644 --- a/lib/color_spinor_field.cpp +++ b/lib/color_spinor_field.cpp @@ -482,7 +482,6 @@ namespace quda pool_device_free(buffer); } } - qudaDeviceSynchronize(); // include sync here for accurate host-device profiling } else if (Location() == QUDA_CPU_FIELD_LOCATION && src.Location() == QUDA_CUDA_FIELD_LOCATION) { // D2H diff --git a/lib/contract.cu b/lib/contract.cu index 491652ae9c..74206419c6 100644 --- a/lib/contract.cu +++ b/lib/contract.cu @@ -58,12 +58,14 @@ public: #ifdef GPU_CONTRACT void contractQuda(const ColorSpinorField &x, 
const ColorSpinorField &y, void *result, const QudaContractType cType) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); checkPrecision(x, y); if (x.GammaBasis() != QUDA_DEGRAND_ROSSI_GAMMA_BASIS || y.GammaBasis() != QUDA_DEGRAND_ROSSI_GAMMA_BASIS) errorQuda("Unexpected gamma basis x=%d y=%d", x.GammaBasis(), y.GammaBasis()); if (x.Nspin() != 4 || y.Nspin() != 4) errorQuda("Unexpected number of spins x=%d y=%d", x.Nspin(), y.Nspin()); instantiate(x, y, result, cType); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } #else void contractQuda(const ColorSpinorField &, const ColorSpinorField &, void *, const QudaContractType) diff --git a/lib/gauge_ape.cu b/lib/gauge_ape.cu index 5ace8e5a29..248b7d1d6c 100644 --- a/lib/gauge_ape.cu +++ b/lib/gauge_ape.cu @@ -57,7 +57,9 @@ namespace quda { copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION); in.exchangeExtendedGhost(in.R(), false); + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); instantiate(out, in, alpha); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); out.exchangeExtendedGhost(out.R(), false); } diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp index e8c4994670..0003663e25 100644 --- a/lib/gauge_field.cpp +++ b/lib/gauge_field.cpp @@ -995,6 +995,7 @@ namespace quda { } else { errorQuda("Ghost copy not supported here"); } + qudaDeviceSynchronize(); // synchronize to ensure visibility on the host } else { void *buffer = create_gauge_buffer(bytes, order, geometry); size_t ghost_bytes[8]; @@ -1110,7 +1111,6 @@ namespace quda { staggeredPhaseApplied = src.StaggeredPhaseApplied(); staggeredPhaseType = src.StaggeredPhase(); - if (src.Location() != location) qudaDeviceSynchronize(); // include sync here for accurate host-device profiling if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) { profile.TPSTOP(QUDA_PROFILE_D2H); } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) { diff --git a/lib/gauge_fix_fft.cu b/lib/gauge_fix_fft.cu index 
b85f5b4457..1de3980332 100644 --- a/lib/gauge_fix_fft.cu +++ b/lib/gauge_fix_fft.cu @@ -389,8 +389,10 @@ namespace quda { void gaugeFixingFFT(GaugeField& data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double alpha, const int autotune, const double tolerance, const int stopWtheta) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); if (comm_partitioned()) errorQuda("Gauge Fixing with FFTs in multi-GPU support NOT implemented yet!"); instantiate(data, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } } diff --git a/lib/gauge_fix_ovr.cu b/lib/gauge_fix_ovr.cu index 814b65427d..064ed5b158 100644 --- a/lib/gauge_fix_ovr.cu +++ b/lib/gauge_fix_ovr.cu @@ -502,7 +502,9 @@ namespace quda { void gaugeFixingOVR(GaugeField& data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double relax_boost, const double tolerance, const int reunit_interval, const int stopWtheta) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); instantiate(data, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, stopWtheta); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } } //namespace quda diff --git a/lib/gauge_force.cu b/lib/gauge_force.cu index 2558dadcac..5e43fa64e6 100644 --- a/lib/gauge_force.cu +++ b/lib/gauge_force.cu @@ -48,6 +48,7 @@ namespace quda { void gaugeForce(GaugeField& mom, const GaugeField& u, double epsilon, std::vector& input_path, std::vector& length, std::vector& path_coeff, int num_paths, int path_max_length) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); checkPrecision(mom, u); checkLocation(mom, u); if (mom.Reconstruct() != QUDA_RECONSTRUCT_10) errorQuda("Reconstruction type %d not supported", mom.Reconstruct()); @@ -57,11 +58,13 @@ namespace quda { // gauge field must be passed as first argument so we peel off its reconstruct type instantiate(u, mom, epsilon, p); p.free(); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } void 
gaugePath(GaugeField& out, const GaugeField& u, double coeff, std::vector& input_path, std::vector& length, std::vector& path_coeff, int num_paths, int path_max_length) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); checkPrecision(out, u); checkLocation(out, u); if (out.Reconstruct() != QUDA_RECONSTRUCT_NO) errorQuda("Reconstruction type %d not supported", out.Reconstruct()); @@ -71,6 +74,7 @@ namespace quda { // gauge field must be passed as first argument so we peel off its reconstruct type instantiate(u, out, coeff, p); p.free(); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } } // namespace quda diff --git a/lib/gauge_loop_trace.cu b/lib/gauge_loop_trace.cu index faaaa97d99..0b1af50ba4 100644 --- a/lib/gauge_loop_trace.cu +++ b/lib/gauge_loop_trace.cu @@ -55,6 +55,7 @@ namespace quda { void gaugeLoopTrace(const GaugeField& u, std::vector& loop_traces, double factor, std::vector& input_path, std::vector& length, std::vector& path_coeff, int num_paths, int path_max_length) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); paths<1> p(input_path, length, path_coeff, num_paths, path_max_length); std::vector> tr_array(loop_traces.size()); @@ -65,6 +66,7 @@ namespace quda { for (auto i = 0u; i < tr_array.size(); i++) { loop_traces[i] = Complex(tr_array[i][0], tr_array[i][1]); } p.free(); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } } // namespace quda diff --git a/lib/gauge_observable.cpp b/lib/gauge_observable.cpp index b825a2ad81..041dc6164d 100644 --- a/lib/gauge_observable.cpp +++ b/lib/gauge_observable.cpp @@ -5,9 +5,9 @@ namespace quda { - void gaugeObservables(GaugeField &u, QudaGaugeObservableParam ¶m, TimeProfile &profile) + void gaugeObservables(GaugeField &u, QudaGaugeObservableParam ¶m) { - profile.TPSTART(QUDA_PROFILE_COMPUTE); + auto &profile = getProfile(); if (param.su_project) { int *num_failures_h = static_cast(pool_pinned_malloc(sizeof(int))); int *num_failures_d = static_cast(get_mapped_device_pointer(num_failures_h)); @@ -24,7 +24,6 @@ namespace 
quda param.plaquette[1] = plaq.y; param.plaquette[2] = plaq.z; } - profile.TPSTOP(QUDA_PROFILE_COMPUTE); if (param.compute_polyakov_loop) { gaugePolyakovLoop(param.ploop, u, 3, profile); } @@ -45,10 +44,8 @@ namespace quda std::vector loop_traces(param.num_paths); // actually do the computation - profile.TPSTART(QUDA_PROFILE_COMPUTE); gaugeLoopTrace(u, loop_traces, param.factor, input_path_v, path_length_v, loop_coeff_v, param.num_paths, param.max_length); - profile.TPSTOP(QUDA_PROFILE_COMPUTE); for (int i = 0; i < param.num_paths; i++) { memcpy(param.traces + i, &loop_traces[i], sizeof(Complex)); } } @@ -57,7 +54,6 @@ namespace quda if (!param.compute_qcharge && !param.compute_qcharge_density) return; // create the Fmunu field - profile.TPSTART(QUDA_PROFILE_INIT); // u is an extended field we need to shrink for the Fmunu field lat_dim_t x; for (int i = 0; i < 4; i++) x[i] = u.X()[i] - 2 * u.R()[i]; @@ -67,15 +63,10 @@ namespace quda tensorParam.order = QUDA_FLOAT2_GAUGE_ORDER; tensorParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO; GaugeField gaugeFmunu(tensorParam); - profile.TPSTOP(QUDA_PROFILE_INIT); - profile.TPSTART(QUDA_PROFILE_COMPUTE); computeFmunu(gaugeFmunu, u); - profile.TPSTOP(QUDA_PROFILE_COMPUTE); - profile.TPSTOP(QUDA_PROFILE_TOTAL); if (param.compute_qcharge || param.compute_qcharge_density) { - profile.TPSTART(QUDA_PROFILE_TOTAL); profile.TPSTART(QUDA_PROFILE_INIT); if (param.compute_qcharge_density && !param.qcharge_density) errorQuda("Charge density requested, but destination field not defined"); @@ -83,23 +74,17 @@ namespace quda void *d_qDensity = param.compute_qcharge_density ? 
pool_device_malloc(size) : nullptr; profile.TPSTOP(QUDA_PROFILE_INIT); - profile.TPSTART(QUDA_PROFILE_COMPUTE); - if (param.compute_qcharge_density) computeQChargeDensity(param.energy, param.qcharge, d_qDensity, gaugeFmunu); else computeQCharge(param.energy, param.qcharge, gaugeFmunu); - profile.TPSTOP(QUDA_PROFILE_COMPUTE); - if (param.compute_qcharge_density) { profile.TPSTART(QUDA_PROFILE_D2H); qudaMemcpy(param.qcharge_density, d_qDensity, size, qudaMemcpyDeviceToHost); profile.TPSTOP(QUDA_PROFILE_D2H); - profile.TPSTART(QUDA_PROFILE_FREE); pool_device_free(d_qDensity); - profile.TPSTOP(QUDA_PROFILE_FREE); } } } diff --git a/lib/gauge_plaq.cu b/lib/gauge_plaq.cu index 7ad5c0399e..ee48d2e3d2 100644 --- a/lib/gauge_plaq.cu +++ b/lib/gauge_plaq.cu @@ -37,9 +37,11 @@ namespace quda { double3 plaquette(const GaugeField &U) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); array plq{0.0, 0.0}; instantiate(U, plq); double3 plaq = make_double3(0.5*(plq[0] + plq[1]), plq[0], plq[1]); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); return plaq; } diff --git a/lib/gauge_qcharge.cu b/lib/gauge_qcharge.cu index d847b219ae..3b4e584b02 100644 --- a/lib/gauge_qcharge.cu +++ b/lib/gauge_qcharge.cu @@ -62,12 +62,16 @@ namespace quda void computeQCharge(double energy[3], double &qcharge, const GaugeField &Fmunu) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); instantiate(Fmunu, energy, qcharge, nullptr, false); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } void computeQChargeDensity(double energy[3], double &qcharge, void *qdensity, const GaugeField &Fmunu) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); instantiate(Fmunu, energy, qcharge, qdensity, true); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } } // namespace quda diff --git a/lib/gauge_stout.cu b/lib/gauge_stout.cu index d8d40af42f..f537ca60ea 100644 --- a/lib/gauge_stout.cu +++ b/lib/gauge_stout.cu @@ -72,7 +72,9 @@ namespace quda { copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION); in.exchangeExtendedGhost(in.R(), 
false); + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); instantiate(out, in, false, rho); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); out.exchangeExtendedGhost(out.R(), false); } @@ -84,7 +86,9 @@ namespace quda { copyExtendedGauge(in, out, QUDA_CUDA_FIELD_LOCATION); in.exchangeExtendedGhost(in.R(), false); + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); instantiate(out, in, true, rho, epsilon); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); out.exchangeExtendedGhost(out.R(), false); } diff --git a/lib/gauge_wilson_flow.cu b/lib/gauge_wilson_flow.cu index a3ce38ba81..d92fb0a68c 100644 --- a/lib/gauge_wilson_flow.cu +++ b/lib/gauge_wilson_flow.cu @@ -38,6 +38,7 @@ namespace quda { wflow_type(wflow_type), step_type(step_type) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); strcat(aux, comm_dim_partitioned_string()); switch (wflow_type) { case QUDA_GAUGE_SMEAR_WILSON_FLOW: strcat(aux,",computeWFlowStepWilson"); break; @@ -52,6 +53,7 @@ namespace quda { } apply(device::get_default_stream()); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } template using Arg = diff --git a/lib/hisq_paths_force_quda.cu b/lib/hisq_paths_force_quda.cu index 320000dc75..e6e30f90bc 100644 --- a/lib/hisq_paths_force_quda.cu +++ b/lib/hisq_paths_force_quda.cu @@ -547,6 +547,7 @@ namespace quda { #ifdef GPU_STAGGERED_DIRAC void hisqStaplesForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, const double path_coeff_array[6]) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); checkNative(link, oprod, newOprod); checkLocation(newOprod, oprod, link); checkPrecision(oprod, link, newOprod); @@ -557,32 +558,24 @@ namespace quda { gauge_param.geometry = QUDA_SCALAR_GEOMETRY; gauge_param.setPrecision(gauge_param.Precision(), true); - auto P3 = GaugeField::Create(gauge_param); - - auto Pmu = GaugeField::Create(gauge_param); - auto P5 = GaugeField::Create(gauge_param); - auto Pnumu = GaugeField::Create(gauge_param); - auto Qnumu = GaugeField::Create(gauge_param); + auto P3 = 
GaugeField(gauge_param); + auto Pmu = GaugeField(gauge_param); + auto P5 = GaugeField(gauge_param); + auto Pnumu = GaugeField(gauge_param); + auto Qnumu = GaugeField(gauge_param); // need double buffers for these fields to fuse "side link" terms with // subsequent "middle link" terms in a different direction - auto Pmu_next = GaugeField::Create(gauge_param); - auto Pnumu_next = GaugeField::Create(gauge_param); - auto Qnumu_next = GaugeField::Create(gauge_param); + auto Pmu_next = GaugeField(gauge_param); + auto Pnumu_next = GaugeField(gauge_param); + auto Qnumu_next = GaugeField(gauge_param); - instantiateGaugeStaggered(link, *P3, GaugeField_ref(*Pmu), - GaugeField_ref(*P5), GaugeField_ref(*Pnumu), GaugeField_ref(*Qnumu), - GaugeField_ref(*Pmu_next), GaugeField_ref(*Pnumu_next), GaugeField_ref(*Qnumu_next), + instantiateGaugeStaggered(link, P3, GaugeField_ref(Pmu), + GaugeField_ref(P5), GaugeField_ref(Pnumu), GaugeField_ref(Qnumu), + GaugeField_ref(Pmu_next), GaugeField_ref(Pnumu_next), GaugeField_ref(Qnumu_next), newOprod, oprod, path_coeff_array); - delete Pmu; - delete P3; - delete P5; - delete Pnumu; - delete Qnumu; - delete Pmu_next; - delete Pnumu_next; - delete Qnumu_next; + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } #else void hisqStaplesForce(GaugeField &, const GaugeField &, const GaugeField &, const double[6]) @@ -651,10 +644,12 @@ namespace quda { #ifdef GPU_STAGGERED_DIRAC void hisqLongLinkForce(GaugeField &newOprod, const GaugeField &oldOprod, const GaugeField &link, double coeff) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); checkNative(link, oldOprod, newOprod); checkLocation(newOprod, oldOprod, link); checkPrecision(newOprod, link, oldOprod); instantiateGaugeStaggered(link, newOprod, oldOprod, coeff); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } #else void hisqLongLinkForce(GaugeField &, const GaugeField &, const GaugeField &, double) @@ -725,10 +720,12 @@ namespace quda { #ifdef GPU_STAGGERED_DIRAC void hisqCompleteForce(GaugeField &force, 
const GaugeField &link) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); checkNative(link, force); checkLocation(force, link); checkPrecision(link, force); instantiateGaugeStaggered(link, force); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } #else void hisqCompleteForce(GaugeField &, const GaugeField &) diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index a8351fd35e..23d06d3564 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -4057,7 +4057,6 @@ int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int for (int d = 0; d < 4; d++) { input_path_v[d] = input_path_buf[d]; } // actually do the computation - profileGaugeForce.TPSTART(QUDA_PROFILE_COMPUTE); if (!forceMonitor()) { gaugeForce(cudaMom, *cudaGauge, eb3, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length); } else { @@ -4068,16 +4067,17 @@ int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int gaugeForce(force, *cudaGauge, 1.0, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length); updateMomentum(cudaMom, eb3, force, "gauge"); } - profileGaugeForce.TPSTOP(QUDA_PROFILE_COMPUTE); if (qudaGaugeParam->return_result_mom) cpuMom.copy(cudaMom); if (qudaGaugeParam->make_resident_gauge) { if (gaugePrecise && !qudaGaugeParam->use_resident_gauge) freeUniqueGaugeQuda(QUDA_WILSON_LINKS); + gaugePrecise = new GaugeField(); std::exchange(*gaugePrecise, cudaSiteLink); } - if (qudaGaugeParam->make_resident_mom && !qudaGaugeParam->use_resident_gauge) std::exchange(momResident, cudaMom); + if (qudaGaugeParam->make_resident_mom && !qudaGaugeParam->use_resident_mom) + std::exchange(momResident, cudaMom); else momResident = GaugeField(); if (qudaGaugeParam->make_resident_gauge) { @@ -4145,9 +4145,7 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int * for (int d = 0; d < 4; d++) { input_path_v[d] = input_path_buf[d]; } // actually do the computation - profileGaugePath.TPSTART(QUDA_PROFILE_COMPUTE); 
gaugePath(*cudaOut, *cudaGauge, eb3, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length); - profileGaugePath.TPSTOP(QUDA_PROFILE_COMPUTE); cpuOut->copy(*cudaOut); @@ -4402,7 +4400,6 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi // mom += delta * [U * force]TA applyU(cudaForce, *gaugePrecise); updateMomentum(cudaMom, dt * delta, cudaForce, "staggered"); - qudaDeviceSynchronize(); // copy the momentum field back to the host if (gauge_param->return_result_mom) cpuMom.copy(cudaMom); @@ -4431,15 +4428,12 @@ void computeHISQForceQuda(void* const milc_momentum, QudaGaugeParam* gParam) { pushProfile(profileHISQForce); + checkGaugeParam(gParam); using namespace quda; using namespace quda::fermion_force; if (gParam->gauge_order != QUDA_MILC_GAUGE_ORDER) errorQuda("Unsupported input field order %d", gParam->gauge_order); - checkGaugeParam(gParam); - - profileHISQForce.TPSTART(QUDA_PROFILE_INIT); - { // default settings for the unitarization const double unitarize_eps = 1e-14; @@ -4483,16 +4477,14 @@ void computeHISQForceQuda(void* const milc_momentum, oParam.setPrecision(gParam->cpu_prec, true); oParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO; - GaugeField *stapleOprod = new GaugeField(oParam); - GaugeField *oneLinkOprod = new GaugeField(oParam); - GaugeField *naikOprod = new GaugeField(oParam); + GaugeField stapleOprod(oParam); + GaugeField oneLinkOprod(oParam); + GaugeField naikOprod(oParam); double act_path_coeff[6] = {0, 1, level2_coeff[2], level2_coeff[3], level2_coeff[4], level2_coeff[5]}; // You have to look at the MILC routine to understand the following // Basically, I have already absorbed the one-link coefficient - profileHISQForce.TPSTOP(QUDA_PROFILE_INIT); - { // do outer-product computation ColorSpinorParam qParam; qParam.nColor = 3; @@ -4517,10 +4509,10 @@ void computeHISQForceQuda(void* const milc_momentum, qParam.v = fermion[0]; { // regular terms - GaugeField *oprod[2] = {stapleOprod, naikOprod}; + GaugeField 
*oprod[2] = {&stapleOprod, &naikOprod}; // loop over different quark fields - for(int i=0; icopy(*stapleOprod); - ax(level2_coeff[0], *oneLinkOprod); - GaugeField *oprod[2] = {oneLinkOprod, naikOprod}; + oneLinkOprod.copy(stapleOprod); + ax(level2_coeff[0], oneLinkOprod); + GaugeField *oprod[2] = {&oneLinkOprod, &naikOprod}; // loop over different quark fields - for(int i=0; iuse_resident_mom) ? new GaugeField(param) : nullptr; + GaugeField cpuMom = (!gParam->use_resident_mom) ? GaugeField(param) : GaugeField(); param.location = QUDA_CUDA_FIELD_LOCATION; param.create = QUDA_ZERO_FIELD_CREATE; @@ -4650,41 +4634,32 @@ void computeHISQForceQuda(void* const milc_momentum, wParam.create = QUDA_NULL_FIELD_CREATE; wParam.setPrecision(gParam->cpu_prec, true); - GaugeField *cudaWLink = new GaugeField(wParam); - profileHISQForce.TPSTOP(QUDA_PROFILE_INIT); + GaugeField cudaWLink(wParam); - cudaWLink->copy(cpuWLink); + cudaWLink.copy(cpuWLink); - cudaWLink->exchangeExtendedGhost(cudaWLink->R(), profileHISQForce); + cudaWLink.exchangeExtendedGhost(cudaWLink.R(), profileHISQForce); - cudaInForce->exchangeExtendedGhost(R, profileHISQForce); - cudaWLink->exchangeExtendedGhost(cudaWLink->R(), profileHISQForce); - cudaOutForce->exchangeExtendedGhost(R, profileHISQForce); + cudaInForce.exchangeExtendedGhost(R, profileHISQForce); + cudaWLink.exchangeExtendedGhost(cudaWLink.R(), profileHISQForce); + cudaOutForce.exchangeExtendedGhost(R, profileHISQForce); // Compute level two term - profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE); - hisqStaplesForce(*cudaOutForce, *cudaInForce, *cudaWLink, act_path_coeff); - qudaDeviceSynchronize(); - profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE); + hisqStaplesForce(cudaOutForce, cudaInForce, cudaWLink, act_path_coeff); // Load naik outer product - copyExtendedGauge(*cudaInForce, *naikOprod, QUDA_CUDA_FIELD_LOCATION); - cudaInForce->exchangeExtendedGhost(cudaWLink->R(), profileHISQForce); - delete naikOprod; + copyExtendedGauge(cudaInForce, naikOprod, 
QUDA_CUDA_FIELD_LOCATION); + cudaInForce.exchangeExtendedGhost(cudaWLink.R(), profileHISQForce); + naikOprod = GaugeField(); // Compute Naik three-link term contribution - profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE); - hisqLongLinkForce(*cudaOutForce, *cudaInForce, *cudaWLink, act_path_coeff[1]); - qudaDeviceSynchronize(); - profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE); + hisqLongLinkForce(cudaOutForce, cudaInForce, cudaWLink, act_path_coeff[1]); - cudaOutForce->exchangeExtendedGhost(R, profileHISQForce); + cudaOutForce.exchangeExtendedGhost(R, profileHISQForce); // Load the V field, which contains general matrices, to the device - profileHISQForce.TPSTART(QUDA_PROFILE_FREE); - delete cudaWLink; - profileHISQForce.TPSTOP(QUDA_PROFILE_FREE); - profileHISQForce.TPSTART(QUDA_PROFILE_INIT); + cudaWLink = GaugeField(); + for (int dir = 0; dir < 4; ++dir) { vParam.x[dir] += 2 * R[dir]; vParam.r[dir] = R[dir]; @@ -4696,28 +4671,20 @@ void computeHISQForceQuda(void* const milc_momentum, vParam.setPrecision(gParam->cpu_prec, true); vParam.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED; vParam.pad = 3 * pad_size; - GaugeField *cudaVLink = new GaugeField(vParam); - profileHISQForce.TPSTOP(QUDA_PROFILE_INIT); + GaugeField cudaVLink(vParam); - cudaVLink->copy(cpuVLink); - cudaVLink->exchangeExtendedGhost(cudaVLink->R(), profileHISQForce); + cudaVLink.copy(cpuVLink); + cudaVLink.exchangeExtendedGhost(cudaVLink.R(), profileHISQForce); - profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE); *num_failures_h = 0; - unitarizeForce(*cudaInForce, *cudaOutForce, *cudaVLink, num_failures_d); + unitarizeForce(cudaInForce, cudaOutForce, cudaVLink, num_failures_d); if (*num_failures_h>0) errorQuda("Error in the unitarization component of the hisq fermion force: %d failures\n", *num_failures_h); - cudaOutForce->zero(); - qudaDeviceSynchronize(); - profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE); - // Load the U field, which contains U(3) matrices, to the device // TODO: in theory these should just 
be SU(3) matrices with MILC phases? - profileHISQForce.TPSTART(QUDA_PROFILE_FREE); - delete cudaVLink; - profileHISQForce.TPSTOP(QUDA_PROFILE_FREE); - profileHISQForce.TPSTART(QUDA_PROFILE_INIT); + cudaVLink = GaugeField(); + for (int dir = 0; dir < 4; ++dir) { uParam.x[dir] += 2 * R[dir]; uParam.r[dir] = R[dir]; @@ -4729,47 +4696,31 @@ void computeHISQForceQuda(void* const milc_momentum, uParam.setPrecision(gParam->cpu_prec, true); uParam.ghostExchange = QUDA_GHOST_EXCHANGE_EXTENDED; uParam.pad = 3 * pad_size; - GaugeField *cudaULink = new GaugeField(uParam); - profileHISQForce.TPSTOP(QUDA_PROFILE_INIT); + GaugeField cudaULink(uParam); - cudaULink->copy(cpuULink); - cudaULink->exchangeExtendedGhost(cudaULink->R(), profileHISQForce); + cudaULink.copy(cpuULink); + cudaULink.exchangeExtendedGhost(cudaULink.R(), profileHISQForce); // Compute Fat7-staple term - profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE); - hisqStaplesForce(*cudaOutForce, *cudaInForce, *cudaULink, fat7_coeff); - qudaDeviceSynchronize(); - profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE); - - profileHISQForce.TPSTART(QUDA_PROFILE_FREE); - delete cudaInForce; - profileHISQForce.TPSTOP(QUDA_PROFILE_FREE); - profileHISQForce.TPSTART(QUDA_PROFILE_INIT); - GaugeField* cudaMom = new GaugeField(momParam); - profileHISQForce.TPSTOP(QUDA_PROFILE_INIT); - - profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE); - hisqCompleteForce(*cudaOutForce, *cudaULink); - - if (gParam->use_resident_mom) { - if (momResident.Length()) errorQuda("No resident momentum field to use"); - updateMomentum(momResident, dt, *cudaOutForce, "hisq"); - } else { - updateMomentum(*cudaMom, dt, *cudaOutForce, "hisq"); - } - qudaDeviceSynchronize(); - profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE); + cudaOutForce.zero(); + hisqStaplesForce(cudaOutForce, cudaInForce, cudaULink, fat7_coeff); - if (gParam->return_result_mom) { - // Close the paths, make anti-hermitian, and store in compressed format - if (gParam->return_result_mom) 
cpuMom->copy(*cudaMom); - } + cudaInForce = GaugeField(); - if (cpuMom) delete cpuMom; + hisqCompleteForce(cudaOutForce, cudaULink); + + if (gParam->use_resident_mom && !momResident.Length()) errorQuda("No resident momentum field to use"); + GaugeField mom = gParam->use_resident_mom ? momResident.create_alias() : GaugeField(momParam); + updateMomentum(mom, dt, cudaOutForce, "hisq"); + + // Close the paths, make anti-hermitian, and store in compressed format + if (gParam->return_result_mom) cpuMom.copy(mom); if (!gParam->make_resident_mom) momResident = GaugeField(); - if (cudaMom) delete cudaMom; - delete cudaOutForce; - delete cudaULink; + + if (gParam->make_resident_mom && !gParam->use_resident_mom) + std::exchange(momResident, mom); + else + momResident = GaugeField(); popProfile(); } @@ -4995,6 +4946,7 @@ void updateGaugeFieldQuda(void* gauge, void* momentum, double dt, int conj_mom, if (param->make_resident_gauge) { if (gaugePrecise && !param->use_resident_gauge) freeUniqueGaugeQuda(QUDA_WILSON_LINKS); + gaugePrecise = new GaugeField(); std::exchange(*gaugePrecise, u_out); } @@ -5004,126 +4956,87 @@ void updateGaugeFieldQuda(void* gauge, void* momentum, double dt, int conj_mom, popProfile(); } - void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param) { - profileProject.TPSTART(QUDA_PROFILE_TOTAL); - - profileProject.TPSTART(QUDA_PROFILE_INIT); - checkGaugeParam(param); - - // create the gauge field - GaugeFieldParam gParam(*param, gauge_h, QUDA_GENERAL_LINKS); - gParam.location = QUDA_CPU_FIELD_LOCATION; - bool need_cpu = !param->use_resident_gauge || param->return_result_gauge; - GaugeField *cpuGauge = need_cpu ? new GaugeField(gParam) : nullptr; - - // create the device fields - gParam.location = QUDA_CUDA_FIELD_LOCATION; - gParam.create = QUDA_NULL_FIELD_CREATE; - gParam.order = QUDA_FLOAT2_GAUGE_ORDER; - gParam.reconstruct = param->reconstruct; - GaugeField *cudaGauge = !param->use_resident_gauge ? 
new GaugeField(gParam) : nullptr; - profileProject.TPSTOP(QUDA_PROFILE_INIT); - - if (param->use_resident_gauge) { - if (!gaugePrecise) errorQuda("No resident gauge field to use"); - cudaGauge = gaugePrecise; - gaugePrecise = nullptr; - } else { - profileProject.TPSTART(QUDA_PROFILE_H2D); - cudaGauge->copy(*cpuGauge); - profileProject.TPSTOP(QUDA_PROFILE_H2D); - } +void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param) +{ + pushProfile(profileProject); + checkGaugeParam(param); - profileProject.TPSTART(QUDA_PROFILE_COMPUTE); - *num_failures_h = 0; + // create the gauge field + GaugeFieldParam gParam(*param, gauge_h, QUDA_GENERAL_LINKS); + gParam.location = QUDA_CPU_FIELD_LOCATION; + bool need_cpu = !param->use_resident_gauge || param->return_result_gauge; + GaugeField cpuGauge = need_cpu ? GaugeField(gParam) : GaugeField(); - // project onto SU(3) - if (cudaGauge->StaggeredPhaseApplied()) cudaGauge->removeStaggeredPhase(); - projectSU3(*cudaGauge, tol, num_failures_d); - if (!cudaGauge->StaggeredPhaseApplied() && param->staggered_phase_applied) cudaGauge->applyStaggeredPhase(); + // create the device fields + if (param->use_resident_gauge && !gaugePrecise) errorQuda("No resident gauge field to use"); + gParam.location = QUDA_CUDA_FIELD_LOCATION; + gParam.create = QUDA_COPY_FIELD_CREATE; + gParam.field = &cpuGauge; + gParam.order = QUDA_FLOAT2_GAUGE_ORDER; + gParam.reconstruct = param->reconstruct; + GaugeField cudaGauge = param->use_resident_gauge ? 
gaugePrecise->create_alias() : GaugeField(gParam); - profileProject.TPSTOP(QUDA_PROFILE_COMPUTE); + *num_failures_h = 0; - if(*num_failures_h>0) - errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h); + // project onto SU(3) + if (cudaGauge.StaggeredPhaseApplied()) cudaGauge.removeStaggeredPhase(); + projectSU3(cudaGauge, tol, num_failures_d); + if (!cudaGauge.StaggeredPhaseApplied() && param->staggered_phase_applied) cudaGauge.applyStaggeredPhase(); - if (param->return_result_gauge) { - profileProject.TPSTART(QUDA_PROFILE_D2H); - cpuGauge->copy(*cudaGauge); - profileProject.TPSTOP(QUDA_PROFILE_D2H); - } + if (*num_failures_h > 0) errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h); - if (param->make_resident_gauge) { - if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) freeUniqueGaugeQuda(QUDA_WILSON_LINKS); - gaugePrecise = cudaGauge; - } else { - delete cudaGauge; - } + if (param->return_result_gauge) cpuGauge.copy(cudaGauge); - profileProject.TPSTART(QUDA_PROFILE_FREE); - if (cpuGauge) delete cpuGauge; - profileProject.TPSTOP(QUDA_PROFILE_FREE); - - profileProject.TPSTOP(QUDA_PROFILE_TOTAL); - } - - void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param) { - profilePhase.TPSTART(QUDA_PROFILE_TOTAL); - - profilePhase.TPSTART(QUDA_PROFILE_INIT); - checkGaugeParam(param); - - // create the gauge field - GaugeFieldParam gParam(*param, gauge_h, QUDA_GENERAL_LINKS); - bool need_cpu = !param->use_resident_gauge || param->return_result_gauge; - gParam.location = QUDA_CPU_FIELD_LOCATION; - GaugeField *cpuGauge = need_cpu ? new GaugeField(gParam) : nullptr; - - // create the device fields - gParam.location = QUDA_CUDA_FIELD_LOCATION; - gParam.create = QUDA_NULL_FIELD_CREATE; - gParam.order = QUDA_FLOAT2_GAUGE_ORDER; - gParam.reconstruct = param->reconstruct; - GaugeField *cudaGauge = !param->use_resident_gauge ? 
new GaugeField(gParam) : nullptr; - profilePhase.TPSTOP(QUDA_PROFILE_INIT); - - if (param->use_resident_gauge) { - if (!gaugePrecise) errorQuda("No resident gauge field to use"); - cudaGauge = gaugePrecise; - } else { - profilePhase.TPSTART(QUDA_PROFILE_H2D); - cudaGauge->copy(*cpuGauge); - profilePhase.TPSTOP(QUDA_PROFILE_H2D); - } + if (param->make_resident_gauge) { + if (gaugePrecise != nullptr && !param->use_resident_gauge) freeUniqueGaugeQuda(QUDA_WILSON_LINKS); + gaugePrecise = new GaugeField(); + std::exchange(*gaugePrecise, cudaGauge); + } - profilePhase.TPSTART(QUDA_PROFILE_COMPUTE); - *num_failures_h = 0; + popProfile(); +} - // apply / remove phase as appropriate - if (!cudaGauge->StaggeredPhaseApplied()) cudaGauge->applyStaggeredPhase(); - else cudaGauge->removeStaggeredPhase(); +void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param) +{ + pushProfile(profilePhase); + checkGaugeParam(param); - profilePhase.TPSTOP(QUDA_PROFILE_COMPUTE); + // create the gauge field + GaugeFieldParam gParam(*param, gauge_h, QUDA_GENERAL_LINKS); + bool need_cpu = !param->use_resident_gauge || param->return_result_gauge; + gParam.location = QUDA_CPU_FIELD_LOCATION; + GaugeField cpuGauge = need_cpu ? GaugeField(gParam) : GaugeField(); - if (param->return_result_gauge) { - profilePhase.TPSTART(QUDA_PROFILE_D2H); - cpuGauge->copy(*cudaGauge); - profilePhase.TPSTOP(QUDA_PROFILE_D2H); - } + // create the device fields + if (param->use_resident_gauge && !gaugePrecise) errorQuda("No resident gauge field to use"); + gParam.location = QUDA_CUDA_FIELD_LOCATION; + gParam.create = QUDA_COPY_FIELD_CREATE; + gParam.field = &cpuGauge; + gParam.order = QUDA_FLOAT2_GAUGE_ORDER; + gParam.reconstruct = param->reconstruct; + GaugeField cudaGauge = param->use_resident_gauge ? 
gaugePrecise->create_alias() : GaugeField(gParam); - if (param->make_resident_gauge) { - if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) freeUniqueGaugeQuda(QUDA_WILSON_LINKS); - gaugePrecise = cudaGauge; - } else { - delete cudaGauge; - } + profilePhase.TPSTART(QUDA_PROFILE_COMPUTE); + *num_failures_h = 0; + + // apply / remove phase as appropriate + if (!cudaGauge.StaggeredPhaseApplied()) + cudaGauge.applyStaggeredPhase(); + else + cudaGauge.removeStaggeredPhase(); - profilePhase.TPSTART(QUDA_PROFILE_FREE); - if (cpuGauge) delete cpuGauge; - profilePhase.TPSTOP(QUDA_PROFILE_FREE); + profilePhase.TPSTOP(QUDA_PROFILE_COMPUTE); - profilePhase.TPSTOP(QUDA_PROFILE_TOTAL); - } + if (param->return_result_gauge) cpuGauge.copy(cudaGauge); + + if (param->make_resident_gauge) { + if (gaugePrecise != nullptr && !param->use_resident_gauge) freeUniqueGaugeQuda(QUDA_WILSON_LINKS); + gaugePrecise = new GaugeField(); + std::exchange(*gaugePrecise, cudaGauge); + } + + popProfile(); +} // evaluate the momentum action double momActionQuda(void* momentum, QudaGaugeParam* param) @@ -5149,7 +5062,8 @@ double momActionQuda(void* momentum, QudaGaugeParam* param) // perform the update double action = computeMomAction(cudaMom); - if (param->make_resident_mom && !param->use_resident_gauge) std::exchange(momResident, cudaMom); + if (param->make_resident_mom && !param->use_resident_mom) + std::exchange(momResident, cudaMom); else momResident = GaugeField(); popProfile(); @@ -5389,7 +5303,6 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par ColorSpinorField out(cudaParam); ColorSpinorField temp1(cudaParam); - // Create the smearing operator //------------------------------------------------------ Dirac *d = nullptr; @@ -5470,8 +5383,8 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam *obs_param) { + pushProfile(profileGaugeSmear); 
pushOutputPrefix("performGaugeSmearQuda: "); - profileGaugeSmear.TPSTART(QUDA_PROFILE_TOTAL); checkGaugeSmearParam(smear_param); if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded"); @@ -5480,7 +5393,7 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable GaugeFieldParam gParam(*gaugeSmeared); gParam.location = QUDA_CUDA_FIELD_LOCATION; - auto *cudaGaugeTemp = new GaugeField(gParam); + GaugeField tmp(gParam); int measurement_n = 0; // The nth measurement to take gaugeObservablesQuda(&obs_param[measurement_n]); @@ -5489,18 +5402,15 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable } for (unsigned int i = 0; i < smear_param->n_steps; i++) { - profileGaugeSmear.TPSTART(QUDA_PROFILE_COMPUTE); - switch (smear_param->smear_type) { - case QUDA_GAUGE_SMEAR_APE: APEStep(*gaugeSmeared, *cudaGaugeTemp, smear_param->alpha); break; - case QUDA_GAUGE_SMEAR_STOUT: STOUTStep(*gaugeSmeared, *cudaGaugeTemp, smear_param->rho); break; + case QUDA_GAUGE_SMEAR_APE: APEStep(*gaugeSmeared, tmp, smear_param->alpha); break; + case QUDA_GAUGE_SMEAR_STOUT: STOUTStep(*gaugeSmeared, tmp, smear_param->rho); break; case QUDA_GAUGE_SMEAR_OVRIMP_STOUT: - OvrImpSTOUTStep(*gaugeSmeared, *cudaGaugeTemp, smear_param->rho, smear_param->epsilon); + OvrImpSTOUTStep(*gaugeSmeared, tmp, smear_param->rho, smear_param->epsilon); break; default: errorQuda("Unkown gauge smear type %d", smear_param->smear_type); } - profileGaugeSmear.TPSTOP(QUDA_PROFILE_COMPUTE); if ((i + 1) % smear_param->meas_interval == 0) { measurement_n++; gaugeObservablesQuda(&obs_param[measurement_n]); @@ -5510,15 +5420,14 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable } } - delete cudaGaugeTemp; - profileGaugeSmear.TPSTOP(QUDA_PROFILE_TOTAL); popOutputPrefix(); + popProfile(); } void performWFlowQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam *obs_param) { + pushProfile(profileWFlow); 
pushOutputPrefix("performWFlowQuda: "); - profileWFlow.TPSTART(QUDA_PROFILE_TOTAL); checkGaugeSmearParam(smear_param); if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded"); @@ -5526,18 +5435,18 @@ void performWFlowQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam gaugeSmeared = createExtendedGauge(*gaugePrecise, R, profileWFlow); GaugeFieldParam gParamEx(*gaugeSmeared); - auto *gaugeAux = GaugeField::Create(gParamEx); + GaugeField gaugeAux(gParamEx); GaugeFieldParam gParam(*gaugePrecise); gParam.reconstruct = QUDA_RECONSTRUCT_NO; // temporary field is not on manifold so cannot use reconstruct - auto *gaugeTemp = GaugeField::Create(gParam); + GaugeField gaugeTemp(gParam); - GaugeField *in = gaugeSmeared; - GaugeField *out = gaugeAux; + GaugeField &in = *gaugeSmeared; + GaugeField &out = gaugeAux; int measurement_n = 0; // The nth measurement to take - gaugeObservables(*in, obs_param[measurement_n], profileWFlow); + gaugeObservables(in, obs_param[measurement_n]); if (getVerbosity() >= QUDA_SUMMARIZE) { printfQuda("flow t, plaquette, E_tot, E_spatial, E_temporal, Q charge\n"); @@ -5548,14 +5457,12 @@ void performWFlowQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam for (unsigned int i = 0; i < smear_param->n_steps; i++) { // Perform W1, W2, and Vt Wilson Flow steps as defined in // https://arxiv.org/abs/1006.4518v3 - profileWFlow.TPSTART(QUDA_PROFILE_COMPUTE); if (i > 0) std::swap(in, out); // output from prior step becomes input for next step - WFlowStep(*out, *gaugeTemp, *in, smear_param->epsilon, smear_param->smear_type); - profileWFlow.TPSTOP(QUDA_PROFILE_COMPUTE); + WFlowStep(out, gaugeTemp, in, smear_param->epsilon, smear_param->smear_type); if ((i + 1) % smear_param->meas_interval == 0) { measurement_n++; // increment measurements. 
- gaugeObservables(*out, obs_param[measurement_n], profileWFlow); + gaugeObservables(out, obs_param[measurement_n]); if (getVerbosity() >= QUDA_SUMMARIZE) { printfQuda("%le %.16e %+.16e %+.16e %+.16e %+.16e\n", smear_param->epsilon * (i + 1), obs_param[measurement_n].plaquette[0], obs_param[measurement_n].energy[0], @@ -5565,153 +5472,98 @@ void performWFlowQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam } } - delete gaugeTemp; - delete gaugeAux; - profileWFlow.TPSTOP(QUDA_PROFILE_TOTAL); popOutputPrefix(); + popProfile(); } int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, const unsigned int verbose_interval, const double relax_boost, const double tolerance, - const unsigned int reunit_interval, const unsigned int stopWtheta, QudaGaugeParam *param, - double *timeinfo) + const unsigned int reunit_interval, const unsigned int stopWtheta, QudaGaugeParam *param) { - GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_TOTAL); - + pushProfile(GaugeFixOVRQuda); checkGaugeParam(param); - GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_INIT); - GaugeFieldParam gParam(*param, gauge); gParam.location = QUDA_CPU_FIELD_LOCATION; - auto *cpuGauge = new GaugeField(gParam); + GaugeField cpuGauge(gParam); gParam.create = QUDA_NULL_FIELD_CREATE; gParam.location = QUDA_CUDA_FIELD_LOCATION; gParam.link_type = param->type; gParam.reconstruct = param->reconstruct; gParam.setPrecision(gParam.Precision(), true); - auto *cudaInGauge = new GaugeField(gParam); - - GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_INIT); - GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_H2D); - - cudaInGauge->copy(*cpuGauge); + GaugeField cudaInGauge(gParam); - GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_H2D); + cudaInGauge.copy(cpuGauge); - GaugeField *cudaInGaugeEx = nullptr; - - if (comm_size() == 1) { - // perform the update - GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_COMPUTE); - gaugeFixingOVR(*cudaInGauge, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, - stopWtheta); - 
GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_COMPUTE); - } else { - cudaInGaugeEx = createExtendedGauge(*cudaInGauge, R, GaugeFixOVRQuda); + GaugeField *cudaInGaugeEx = createExtendedGauge(cudaInGauge, R, GaugeFixOVRQuda); - // perform the update - GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_COMPUTE); - gaugeFixingOVR(*cudaInGaugeEx, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, - stopWtheta); - GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_COMPUTE); + // perform the update + gaugeFixingOVR(*cudaInGaugeEx, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, + stopWtheta); - copyExtendedGauge(*cudaInGauge, *cudaInGaugeEx, QUDA_CUDA_FIELD_LOCATION); - } + copyExtendedGauge(cudaInGauge, *cudaInGaugeEx, QUDA_CUDA_FIELD_LOCATION); // copy the gauge field back to the host - GaugeFixOVRQuda.TPSTART(QUDA_PROFILE_D2H); - cpuGauge->copy(*cudaInGauge); - GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_D2H); - - GaugeFixOVRQuda.TPSTOP(QUDA_PROFILE_TOTAL); + cpuGauge.copy(cudaInGauge); if (param->make_resident_gauge) { - if (gaugePrecise != nullptr) freeUniqueGaugeQuda(QUDA_WILSON_LINKS); - gaugePrecise = cudaInGauge; + freeUniqueGaugeQuda(QUDA_WILSON_LINKS); + gaugePrecise = new GaugeField(); + std::exchange(*gaugePrecise, cudaInGauge); if (extendedGaugeResident) delete extendedGaugeResident; extendedGaugeResident = cudaInGaugeEx; } else { - delete cudaInGauge; - if (cudaInGaugeEx) delete cudaInGaugeEx; - } - - delete cpuGauge; - - if(timeinfo){ - timeinfo[0] = GaugeFixOVRQuda.Last(QUDA_PROFILE_H2D); - timeinfo[1] = GaugeFixOVRQuda.Last(QUDA_PROFILE_COMPUTE); - timeinfo[2] = GaugeFixOVRQuda.Last(QUDA_PROFILE_D2H); + delete cudaInGaugeEx; } + popProfile(); return 0; } -int computeGaugeFixingFFTQuda(void* gauge, const unsigned int gauge_dir, const unsigned int Nsteps, \ - const unsigned int verbose_interval, const double alpha, const unsigned int autotune, const double tolerance, \ - const unsigned int stopWtheta, QudaGaugeParam* param , double* timeinfo) 
+int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, + const unsigned int verbose_interval, const double alpha, const unsigned int autotune, + const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param) { - GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_TOTAL); - + pushProfile(GaugeFixFFTQuda); checkGaugeParam(param); - GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_INIT); - GaugeFieldParam gParam(*param, gauge); gParam.location = QUDA_CPU_FIELD_LOCATION; - auto *cpuGauge = new GaugeField(gParam); + GaugeField cpuGauge(gParam); gParam.create = QUDA_NULL_FIELD_CREATE; gParam.location = QUDA_CUDA_FIELD_LOCATION; gParam.link_type = param->type; gParam.reconstruct = param->reconstruct; gParam.setPrecision(gParam.Precision(), true); - auto *cudaInGauge = new GaugeField(gParam); + GaugeField cudaInGauge(gParam); - GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_INIT); - - GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_H2D); - cudaInGauge->copy(*cpuGauge); - GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_H2D); + cudaInGauge.copy(cpuGauge); // perform the update - GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_COMPUTE); - - gaugeFixingFFT(*cudaInGauge, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta); - - GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_COMPUTE); + gaugeFixingFFT(cudaInGauge, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta); // copy the gauge field back to the host - GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_D2H); - cpuGauge->copy(*cudaInGauge); - GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_D2H); - - GaugeFixFFTQuda.TPSTOP(QUDA_PROFILE_TOTAL); + cpuGauge.copy(cudaInGauge); if (param->make_resident_gauge) { - if (gaugePrecise != nullptr) freeUniqueGaugeQuda(QUDA_WILSON_LINKS); - gaugePrecise = cudaInGauge; - } else { - delete cudaInGauge; - } - - if (timeinfo) { - timeinfo[0] = GaugeFixFFTQuda.Last(QUDA_PROFILE_H2D); - timeinfo[1] = GaugeFixFFTQuda.Last(QUDA_PROFILE_COMPUTE); - timeinfo[2] = 
GaugeFixFFTQuda.Last(QUDA_PROFILE_D2H); + freeUniqueGaugeQuda(QUDA_WILSON_LINKS); + gaugePrecise = new GaugeField(); + std::exchange(*gaugePrecise, cudaInGauge); } + popProfile(); return 0; } void contractQuda(const void *hp_x, const void *hp_y, void *h_result, const QudaContractType cType, QudaInvertParam *param, const int *X) { + pushProfile(profileContract); // DMH: Easiest way to construct ColorSpinorField? Do we require the user // to declare and fill and invert_param, or can it just be hacked?. - profileContract.TPSTART(QUDA_PROFILE_TOTAL); profileContract.TPSTART(QUDA_PROFILE_INIT); // wrap CPU host side pointers @@ -5743,21 +5595,19 @@ void contractQuda(const void *hp_x, const void *hp_y, void *h_result, const Quda y[0] = h_y; profileContract.TPSTOP(QUDA_PROFILE_H2D); - profileContract.TPSTART(QUDA_PROFILE_COMPUTE); contractQuda(x[0], y[0], d_result, cType); - profileContract.TPSTOP(QUDA_PROFILE_COMPUTE); profileContract.TPSTART(QUDA_PROFILE_D2H); qudaMemcpy(h_result, d_result, data_bytes, qudaMemcpyDeviceToHost); profileContract.TPSTOP(QUDA_PROFILE_D2H); pool_device_free(d_result); - profileContract.TPSTOP(QUDA_PROFILE_TOTAL); + popProfile(); } void gaugeObservablesQuda(QudaGaugeObservableParam *param) { - profileGaugeObs.TPSTART(QUDA_PROFILE_TOTAL); + pushProfile(profileGaugeObs); checkGaugeObservableParam(param); if (!gaugePrecise) errorQuda("Cannot compute Polyakov loop as there is no resident gauge field"); @@ -5778,6 +5628,6 @@ void gaugeObservablesQuda(QudaGaugeObservableParam *param) errorQuda("Removing staggered phases was requested, however staggered phases aren't already applied"); } - gaugeObservables(*gauge, *param, profileGaugeObs); - profileGaugeObs.TPSTOP(QUDA_PROFILE_TOTAL); + gaugeObservables(*gauge, *param); + popProfile(); } diff --git a/lib/milc_interface.cpp b/lib/milc_interface.cpp index 8f33083574..1c9e25cb54 100644 --- a/lib/milc_interface.cpp +++ b/lib/milc_interface.cpp @@ -3007,14 +3007,8 @@ void qudaGaugeFixingOVR(int precision, 
unsigned int gauge_dir, int Nsteps, int v qudaGaugeParam.site_size = arg->size; qudaGaugeParam.gauge_order = arg->site ? QUDA_MILC_SITE_GAUGE_ORDER : QUDA_MILC_GAUGE_ORDER; - double timeinfo[3]; computeGaugeFixingOVRQuda(gauge, gauge_dir, Nsteps, verbose_interval, relax_boost, tolerance, reunit_interval, - stopWtheta, &qudaGaugeParam, timeinfo); - - printfQuda("Time H2D: %lf\n", timeinfo[0]); - printfQuda("Time to Compute: %lf\n", timeinfo[1]); - printfQuda("Time D2H: %lf\n", timeinfo[2]); - printfQuda("Time all: %lf\n", timeinfo[0]+timeinfo[1]+timeinfo[2]); + stopWtheta, &qudaGaugeParam); qudamilc_called(__func__, verbosity); } @@ -3036,13 +3030,6 @@ void qudaGaugeFixingFFT( int precision, qudaGaugeParam.reconstruct = QUDA_RECONSTRUCT_NO; //qudaGaugeParam.reconstruct = QUDA_RECONSTRUCT_12; - - double timeinfo[3]; - computeGaugeFixingFFTQuda(milc_sitelink, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta, \ - &qudaGaugeParam, timeinfo); - - printfQuda("Time H2D: %lf\n", timeinfo[0]); - printfQuda("Time to Compute: %lf\n", timeinfo[1]); - printfQuda("Time D2H: %lf\n", timeinfo[2]); - printfQuda("Time all: %lf\n", timeinfo[0]+timeinfo[1]+timeinfo[2]); + computeGaugeFixingFFTQuda(milc_sitelink, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta, + &qudaGaugeParam); } diff --git a/lib/staggered_oprod.cu b/lib/staggered_oprod.cu index 1f9cbcccf7..bd085c899b 100644 --- a/lib/staggered_oprod.cu +++ b/lib/staggered_oprod.cu @@ -2,7 +2,6 @@ #include #include #include -#include "timer.h" namespace quda { diff --git a/lib/unitarize_force_quda.cu b/lib/unitarize_force_quda.cu index 29b315d2ef..84b94a0d54 100644 --- a/lib/unitarize_force_quda.cu +++ b/lib/unitarize_force_quda.cu @@ -56,6 +56,7 @@ namespace quda { void unitarizeForce(GaugeField &newForce, const GaugeField &oldForce, const GaugeField &u, int* fails) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); checkReconstruct(u, oldForce, newForce); checkPrecision(u, 
oldForce, newForce); @@ -63,6 +64,7 @@ namespace quda { errorQuda("Only native order supported"); instantiate(newForce, oldForce, u, fails); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } #else void unitarizeForce(GaugeField &, const GaugeField &, const GaugeField &, int*) diff --git a/lib/unitarize_links_quda.cu b/lib/unitarize_links_quda.cu index 2bdd24880a..83ea615c48 100644 --- a/lib/unitarize_links_quda.cu +++ b/lib/unitarize_links_quda.cu @@ -141,8 +141,10 @@ namespace quda { void unitarizeLinks(GaugeField& out, const GaugeField &in, int* fails) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); checkPrecision(out, in); instantiate(out, in, fails); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } void unitarizeLinks(GaugeField &links, int* fails) { unitarizeLinks(links, links, fails); } @@ -182,11 +184,13 @@ namespace quda { void projectSU3(GaugeField &u, double tol, int *fails) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); // check the the field doesn't have staggered phases applied if (u.StaggeredPhaseApplied()) errorQuda("Cannot project gauge field with staggered phases applied"); instantiate(u, tol, fails); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } } // namespace quda From a75fad1244dbc01930f23d5d34af7109bc824b90 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Mon, 22 May 2023 14:35:08 -0700 Subject: [PATCH 13/99] More interface code related cleanup --- include/llfat_quda.h | 4 +- lib/interface_quda.cpp | 141 ++++++++------------------------- lib/llfat_quda.cu | 33 ++++---- lib/staggered_two_link_quda.cu | 2 + 4 files changed, 55 insertions(+), 125 deletions(-) diff --git a/include/llfat_quda.h b/include/llfat_quda.h index 696c67d3f8..0bf9f5b249 100644 --- a/include/llfat_quda.h +++ b/include/llfat_quda.h @@ -11,7 +11,7 @@ namespace quda { @param u[in] The input gauge field @param coeff[in] Array of path coefficients */ - void fatKSLink(GaugeField *fat, const GaugeField &u, const double *coeff); + void fatKSLink(GaugeField &fat, const GaugeField &u, 
const double *coeff); /** @brief Compute the long links for an improved staggered (Kogut-Susskind) fermions. @@ -19,6 +19,6 @@ namespace quda { @param u[in] The input gauge field @param coeff[in] Array of path coefficients */ - void longKSLink(GaugeField *lng, const GaugeField &u, const double *coeff); + void longKSLink(GaugeField &lng, const GaugeField &u, const double *coeff); } // namespace quda diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 23d06d3564..e39bf48b5c 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -3826,9 +3826,7 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, QudaGaugeParam *param) { - profileFatLink.TPSTART(QUDA_PROFILE_TOTAL); - profileFatLink.TPSTART(QUDA_PROFILE_INIT); - + pushProfile(profileFatLink); checkGaugeParam(param); GaugeFieldParam gParam(*param, fatlink, QUDA_GENERAL_LINKS); @@ -3848,16 +3846,11 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, gParam.setPrecision(param->cuda_prec, true); gParam.create = QUDA_NULL_FIELD_CREATE; GaugeField *cudaInLink = new GaugeField(gParam); - profileFatLink.TPSTOP(QUDA_PROFILE_INIT); - profileFatLink.TPSTART(QUDA_PROFILE_H2D); cudaInLink->copy(cpuInLink); - profileFatLink.TPSTOP(QUDA_PROFILE_H2D); GaugeField *cudaInLinkEx = createExtendedGauge(*cudaInLink, R, profileFatLink); - profileFatLink.TPSTART(QUDA_PROFILE_FREE); delete cudaInLink; - profileFatLink.TPSTOP(QUDA_PROFILE_FREE); gParam.create = QUDA_ZERO_FIELD_CREATE; gParam.link_type = QUDA_GENERAL_LINKS; @@ -3866,40 +3859,14 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO; if (longlink) { - profileFatLink.TPSTART(QUDA_PROFILE_INIT); - GaugeField *cudaLongLink = new GaugeField(gParam); - profileFatLink.TPSTOP(QUDA_PROFILE_INIT); - - 
profileFatLink.TPSTART(QUDA_PROFILE_COMPUTE); - longKSLink(cudaLongLink, *cudaInLinkEx, path_coeff); - profileFatLink.TPSTOP(QUDA_PROFILE_COMPUTE); - - profileFatLink.TPSTART(QUDA_PROFILE_D2H); - cpuLongLink.copy(*cudaLongLink); - profileFatLink.TPSTOP(QUDA_PROFILE_D2H); - - profileFatLink.TPSTART(QUDA_PROFILE_FREE); - delete cudaLongLink; - profileFatLink.TPSTOP(QUDA_PROFILE_FREE); + GaugeField longLink(gParam); + longKSLink(longLink, *cudaInLinkEx, path_coeff); + cpuLongLink.copy(longLink); } - profileFatLink.TPSTART(QUDA_PROFILE_INIT); - GaugeField *cudaFatLink = new GaugeField(gParam); - profileFatLink.TPSTOP(QUDA_PROFILE_INIT); - - profileFatLink.TPSTART(QUDA_PROFILE_COMPUTE); - fatKSLink(cudaFatLink, *cudaInLinkEx, path_coeff); - profileFatLink.TPSTOP(QUDA_PROFILE_COMPUTE); - - if (fatlink) { - profileFatLink.TPSTART(QUDA_PROFILE_D2H); - cpuFatLink.copy(*cudaFatLink); - profileFatLink.TPSTOP(QUDA_PROFILE_D2H); - } - - profileFatLink.TPSTART(QUDA_PROFILE_FREE); - delete cudaInLinkEx; - profileFatLink.TPSTOP(QUDA_PROFILE_FREE); + GaugeField fatLink(gParam); + fatKSLink(fatLink, *cudaInLinkEx, path_coeff); + if (fatlink) cpuFatLink.copy(fatLink); if (ulink) { const double unitarize_eps = 1e-14; @@ -3911,42 +3878,28 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, quda::setUnitarizeLinksConstants(unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error); - GaugeField *cudaUnitarizedLink = new GaugeField(gParam); + GaugeField unitarizedLink(gParam); - profileFatLink.TPSTART(QUDA_PROFILE_COMPUTE); *num_failures_h = 0; - quda::unitarizeLinks(*cudaUnitarizedLink, *cudaFatLink, num_failures_d); // unitarize on the gpu + quda::unitarizeLinks(unitarizedLink, fatLink, num_failures_d); // unitarize on the gpu if (*num_failures_h > 0) errorQuda("Error in unitarization component of the hisq fattening: %d failures", *num_failures_h); - profileFatLink.TPSTOP(QUDA_PROFILE_COMPUTE); - - 
profileFatLink.TPSTART(QUDA_PROFILE_D2H); - cpuUnitarizedLink.copy(*cudaUnitarizedLink); - profileFatLink.TPSTOP(QUDA_PROFILE_D2H); - profileFatLink.TPSTART(QUDA_PROFILE_FREE); - delete cudaUnitarizedLink; - profileFatLink.TPSTOP(QUDA_PROFILE_FREE); + cpuUnitarizedLink.copy(unitarizedLink); } - profileFatLink.TPSTART(QUDA_PROFILE_FREE); - delete cudaFatLink; - profileFatLink.TPSTOP(QUDA_PROFILE_FREE); - - profileFatLink.TPSTOP(QUDA_PROFILE_TOTAL); + delete cudaInLinkEx; + popProfile(); } void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param) { - profileGaussianSmear.TPSTART(QUDA_PROFILE_TOTAL); - profileGaussianSmear.TPSTART(QUDA_PROFILE_INIT); - + pushProfile(profileGaussianSmear); checkGaugeParam(param); GaugeFieldParam gParam(*param, inlink, QUDA_ASQTAD_LONG_LINKS); gParam.gauge = twolink; GaugeField cpuTwoLink(gParam); // create the host twolink - profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT); GaugeField *cudaInLinkEx = nullptr; @@ -3962,9 +3915,7 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param) GaugeField cudaInLink(gParam); profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT); - profileGaussianSmear.TPSTART(QUDA_PROFILE_H2D); cudaInLink.copy(cpuInLink); - profileGaussianSmear.TPSTOP(QUDA_PROFILE_H2D); cudaInLinkEx = createExtendedGauge(cudaInLink, R, profileGaussianSmear); } else { cudaInLinkEx = createExtendedGauge(*gaugePrecise, R, profileGaussianSmear); @@ -3980,30 +3931,18 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param) gsParam.nFace = 3; gsParam.pad = gsParam.pad*gsParam.nFace; - profileGaussianSmear.TPSTART(QUDA_PROFILE_INIT); - freeUniqueGaugeQuda(QUDA_SMEARED_LINKS); gaugeSmeared = new GaugeField(gsParam); - profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT); - - profileGaussianSmear.TPSTART(QUDA_PROFILE_COMPUTE); - computeTwoLink(*gaugeSmeared, *cudaInLinkEx); gaugeSmeared->exchangeGhost(); - profileGaussianSmear.TPSTOP(QUDA_PROFILE_COMPUTE); - - 
profileGaussianSmear.TPSTART(QUDA_PROFILE_D2H); cpuTwoLink.copy(*gaugeSmeared); - profileGaussianSmear.TPSTOP(QUDA_PROFILE_D2H); - profileGaussianSmear.TPSTART(QUDA_PROFILE_FREE); freeUniqueGaugeQuda(QUDA_SMEARED_LINKS); delete cudaInLinkEx; - profileGaussianSmear.TPSTOP(QUDA_PROFILE_FREE); - profileGaussianSmear.TPSTOP(QUDA_PROFILE_TOTAL); + popProfile(); } int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int* path_length, @@ -4017,11 +3956,11 @@ int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int GaugeField cpuSiteLink = !qudaGaugeParam->use_resident_gauge ? GaugeField(gParam) : GaugeField(); if (qudaGaugeParam->use_resident_gauge && !gaugePrecise) errorQuda("No resident gauge field to use"); + gParam.location = QUDA_CUDA_FIELD_LOCATION; gParam.create = QUDA_COPY_FIELD_CREATE; gParam.field = &cpuSiteLink; gParam.reconstruct = qudaGaugeParam->reconstruct; gParam.setPrecision(qudaGaugeParam->cuda_prec, true); - gParam.location = QUDA_CUDA_FIELD_LOCATION; GaugeField cudaSiteLink = qudaGaugeParam->use_resident_gauge ? gaugePrecise->create_alias() : GaugeField(gParam); GaugeFieldParam gParamMom(*qudaGaugeParam, mom, QUDA_ASQTAD_MOM_LINKS); @@ -4099,36 +4038,27 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int * GaugeFieldParam gParam(*qudaGaugeParam, siteLink); gParam.location = QUDA_CPU_FIELD_LOCATION; - GaugeField *cpuSiteLink = (!qudaGaugeParam->use_resident_gauge) ? new GaugeField(gParam) : nullptr; - - GaugeField *cudaSiteLink = nullptr; - - if (qudaGaugeParam->use_resident_gauge) { - if (!gaugePrecise) errorQuda("No resident gauge field to use"); - cudaSiteLink = gaugePrecise; - } else { - gParam.location = QUDA_CUDA_FIELD_LOCATION; - gParam.create = QUDA_NULL_FIELD_CREATE; - gParam.reconstruct = qudaGaugeParam->reconstruct; - gParam.setPrecision(qudaGaugeParam->cuda_prec, true); + GaugeField cpuSiteLink = !qudaGaugeParam->use_resident_gauge ? 
GaugeField(gParam) : GaugeField(); - cudaSiteLink = new GaugeField(gParam); - cudaSiteLink->copy(*cpuSiteLink); - } + if (qudaGaugeParam->use_resident_gauge && !gaugePrecise) errorQuda("No resident gauge field to use"); + gParam.location = QUDA_CUDA_FIELD_LOCATION; + gParam.create = QUDA_COPY_FIELD_CREATE; + gParam.field = &cpuSiteLink; + gParam.reconstruct = qudaGaugeParam->reconstruct; + gParam.setPrecision(qudaGaugeParam->cuda_prec, true); + GaugeField cudaSiteLink = qudaGaugeParam->use_resident_gauge ? gaugePrecise->create_alias() : GaugeField(gParam); GaugeFieldParam gParamOut(*qudaGaugeParam, out); gParamOut.location = QUDA_CPU_FIELD_LOCATION; - GaugeField *cpuOut = new GaugeField(gParamOut); + GaugeField cpuOut = GaugeField(gParamOut); gParamOut.location = QUDA_CUDA_FIELD_LOCATION; - gParamOut.create = qudaGaugeParam->overwrite_gauge ? QUDA_ZERO_FIELD_CREATE : QUDA_NULL_FIELD_CREATE; + gParamOut.create = qudaGaugeParam->overwrite_gauge ? QUDA_ZERO_FIELD_CREATE : QUDA_COPY_FIELD_CREATE; + gParamOut.field = &cpuOut; gParamOut.reconstruct = QUDA_RECONSTRUCT_NO; gParamOut.setPrecision(qudaGaugeParam->cuda_prec, true); - GaugeField *cudaOut = new GaugeField(gParamOut); - if (!qudaGaugeParam->overwrite_gauge) { - cudaOut->copy(*cpuOut); - } + GaugeField cudaOut(gParamOut); - GaugeField *cudaGauge = createExtendedGauge(*cudaSiteLink, R, profileGaugePath); + GaugeField *cudaGauge = createExtendedGauge(cudaSiteLink, R, profileGaugePath); // apply / remove phase as appropriate if (cudaGauge->StaggeredPhaseApplied()) cudaGauge->removeStaggeredPhase(); @@ -4145,25 +4075,20 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int * for (int d = 0; d < 4; d++) { input_path_v[d] = input_path_buf[d]; } // actually do the computation - gaugePath(*cudaOut, *cudaGauge, eb3, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length); + gaugePath(cudaOut, *cudaGauge, eb3, input_path_v, path_length_v, loop_coeff_v, num_paths, max_length); - 
cpuOut->copy(*cudaOut); + cpuOut.copy(cudaOut); if (qudaGaugeParam->make_resident_gauge) { - if (gaugePrecise && gaugePrecise != cudaSiteLink) freeUniqueGaugeQuda(QUDA_WILSON_LINKS); - gaugePrecise = cudaSiteLink; + if (gaugePrecise && !qudaGaugeParam->use_resident_gauge) freeUniqueGaugeQuda(QUDA_WILSON_LINKS); + gaugePrecise = new GaugeField(); + std::exchange(*gaugePrecise, cudaSiteLink); if (extendedGaugeResident) delete extendedGaugeResident; extendedGaugeResident = cudaGauge; } else { - delete cudaSiteLink; delete cudaGauge; } - delete cudaOut; - - if (cpuSiteLink) delete cpuSiteLink; - if (cpuOut) delete cpuOut; - popProfile(); return 0; } diff --git a/lib/llfat_quda.cu b/lib/llfat_quda.cu index f39233aeea..8ac2e25d36 100644 --- a/lib/llfat_quda.cu +++ b/lib/llfat_quda.cu @@ -166,46 +166,50 @@ namespace quda { } #ifdef GPU_STAGGERED_DIRAC - void longKSLink(GaugeField *lng, const GaugeField &u, const double *coeff) + void longKSLink(GaugeField &lng, const GaugeField &u, const double *coeff) { - computeLongLink(*lng, u, coeff[1]); + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); + computeLongLink(lng, u, coeff[1]); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } - void fatKSLink(GaugeField *fat, const GaugeField& u, const double *coeff) + void fatKSLink(GaugeField &fat, const GaugeField& u, const double *coeff) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); + GaugeFieldParam gParam(u); gParam.reconstruct = QUDA_RECONSTRUCT_NO; gParam.setPrecision(gParam.Precision()); gParam.create = QUDA_NULL_FIELD_CREATE; - auto staple = GaugeField::Create(gParam); - auto staple1 = GaugeField::Create(gParam); + GaugeField staple(gParam); + GaugeField staple1(gParam); - if ( ((fat->X()[0] % 2 != 0) || (fat->X()[1] % 2 != 0) || (fat->X()[2] % 2 != 0) || (fat->X()[3] % 2 != 0)) + if ( ((fat.X()[0] % 2 != 0) || (fat.X()[1] % 2 != 0) || (fat.X()[2] % 2 != 0) || (fat.X()[3] % 2 != 0)) && (u.Reconstruct() != QUDA_RECONSTRUCT_NO)){ errorQuda("Reconstruct %d and odd dimensionsize is not 
supported by link fattening code (yet)\n", u.Reconstruct()); } - computeOneLink(*fat, u, coeff[0]-6.0*coeff[5]); + computeOneLink(fat, u, coeff[0]-6.0*coeff[5]); // Check the coefficients. If all of the following are zero, return. if (fabs(coeff[2]) >= MIN_COEFF || fabs(coeff[3]) >= MIN_COEFF || fabs(coeff[4]) >= MIN_COEFF || fabs(coeff[5]) >= MIN_COEFF) { for (int nu = 0; nu < 4; nu++) { - computeStaple(*fat, *staple, u, u, nu, -1, -1, coeff[2], 1); + computeStaple(fat, staple, u, u, nu, -1, -1, coeff[2], 1); - if (coeff[5] != 0.0) computeStaple(*fat, *staple, *staple, u, nu, -1, -1, coeff[5], 0); + if (coeff[5] != 0.0) computeStaple(fat, staple, staple, u, nu, -1, -1, coeff[5], 0); for (int rho = 0; rho < 4; rho++) { if (rho != nu) { - computeStaple(*fat, *staple1, *staple, u, rho, nu, -1, coeff[3], 1); + computeStaple(fat, staple1, staple, u, rho, nu, -1, coeff[3], 1); if (fabs(coeff[4]) > MIN_COEFF) { for (int sig = 0; sig < 4; sig++) { if (sig != nu && sig != rho) { - computeStaple(*fat, *staple, *staple1, u, sig, nu, rho, coeff[4], 0); + computeStaple(fat, staple, staple1, u, sig, nu, rho, coeff[4], 0); } } //sig } // MIN_COEFF @@ -214,16 +218,15 @@ namespace quda { } //nu } - delete staple; - delete staple1; + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } #else - void longKSLink(GaugeField *, const GaugeField&, const double *) + void longKSLink(GaugeField &, const GaugeField&, const double *) { errorQuda("Long-link computation not enabled"); } - void fatKSLink(GaugeField *, const GaugeField&, const double *) + void fatKSLink(GaugeField &, const GaugeField&, const double *) { errorQuda("Fat-link computation not enabled"); } diff --git a/lib/staggered_two_link_quda.cu b/lib/staggered_two_link_quda.cu index 8dce83c997..3afb950d82 100644 --- a/lib/staggered_two_link_quda.cu +++ b/lib/staggered_two_link_quda.cu @@ -53,10 +53,12 @@ namespace quda #if defined(GPU_STAGGERED_DIRAC) && defined(GPU_TWOLINK_GSMEAR) void computeTwoLink(GaugeField &newTwoLink, const 
GaugeField &link) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); checkNative(newTwoLink, link); checkLocation(newTwoLink, link); checkPrecision(newTwoLink, link); instantiate(link, newTwoLink);//FIXME : enable link-12/8 reconstruction + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } #else void computeTwoLink(GaugeField &, const GaugeField &) From 52a1d1ca9ac79e47bb216a0cb32506f00fd7fc6d Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Mon, 22 May 2023 15:39:19 -0700 Subject: [PATCH 14/99] ColorSpinorField and CloverField now autoprofile any H2D and D2H transfers. Further interface cleanup --- lib/clover_deriv_quda.cu | 2 + lib/clover_field.cpp | 12 +++ lib/clover_invert.cu | 2 + lib/clover_outer_product.cu | 2 + lib/clover_quda.cu | 2 + lib/clover_sigma_outer_product.cu | 2 + lib/color_spinor_field.cpp | 13 +++ lib/gauge_field.cpp | 13 ++- lib/gauge_field_strength_tensor.cu | 2 + lib/gauge_phase.cu | 2 + lib/interface_quda.cpp | 123 +++++++---------------------- 11 files changed, 72 insertions(+), 103 deletions(-) diff --git a/lib/clover_deriv_quda.cu b/lib/clover_deriv_quda.cu index 34f121de93..34ef0b993b 100644 --- a/lib/clover_deriv_quda.cu +++ b/lib/clover_deriv_quda.cu @@ -66,6 +66,7 @@ namespace quda { #if defined(GPU_CLOVER_DIRAC) && (QUDA_PRECISION & 8) void cloverDerivative(GaugeField &force, GaugeField &gauge, GaugeField &oprod, double coeff, QudaParity parity) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); assert(oprod.Geometry() == QUDA_TENSOR_GEOMETRY); assert(force.Geometry() == QUDA_VECTOR_GEOMETRY); @@ -79,6 +80,7 @@ namespace quda { } else { errorQuda("Precision %d not supported", force.Precision()); } + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } #else void cloverDerivative(GaugeField &, GaugeField &, GaugeField &, double, QudaParity) diff --git a/lib/clover_field.cpp b/lib/clover_field.cpp index 46394c332b..cd8cc04ba2 100644 --- a/lib/clover_field.cpp +++ b/lib/clover_field.cpp @@ -184,6 +184,12 @@ namespace quda { void 
CloverField::copy(const CloverField &src, bool is_inverse) { + if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) { + getProfile().TPSTART(QUDA_PROFILE_D2H); + } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) { + getProfile().TPSTART(QUDA_PROFILE_H2D); + } + // special case where we wish to make a copy of the inverse field when dynamic_inverse is enabled static bool dynamic_inverse_copy = false; if (is_inverse && clover::dynamic_inverse() && V(true) && !src.V(true) && !dynamic_inverse_copy) { @@ -257,6 +263,12 @@ namespace quda { pool_device_free(packClover); } } + + if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) { + getProfile().TPSTOP(QUDA_PROFILE_D2H); + } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) { + getProfile().TPSTOP(QUDA_PROFILE_H2D); + } } void CloverField::copy(const CloverField &src) diff --git a/lib/clover_invert.cu b/lib/clover_invert.cu index ac7f15fbfe..903ce4e76c 100644 --- a/lib/clover_invert.cu +++ b/lib/clover_invert.cu @@ -49,9 +49,11 @@ namespace quda { #ifdef GPU_CLOVER_DIRAC void cloverInvert(CloverField &clover, bool computeTraceLog) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); if (clover.Reconstruct()) errorQuda("Cannot store the inverse with a reconstruct field"); if (clover.Precision() < QUDA_SINGLE_PRECISION) errorQuda("Cannot use fixed-point precision here"); instantiate(clover, computeTraceLog); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } #else void cloverInvert(CloverField &, bool) diff --git a/lib/clover_outer_product.cu b/lib/clover_outer_product.cu index 93096d15e7..d579476714 100644 --- a/lib/clover_outer_product.cu +++ b/lib/clover_outer_product.cu @@ -136,6 +136,7 @@ namespace quda { void computeCloverForce(GaugeField &force, const GaugeField &U, std::vector &x, std::vector &p, std::vector &coeff) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); 
checkNative(*x[0], *p[0], force, U); checkPrecision(*x[0], *p[0], force, U); @@ -159,6 +160,7 @@ namespace quda { instantiate(U, force, inA, inB, inC, inD, parity, coeff[i]); } } + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } #else // GPU_CLOVER_DIRAC not defined void computeCloverForce(GaugeField &, const GaugeField &, std::vector &, diff --git a/lib/clover_quda.cu b/lib/clover_quda.cu index 853fdbe156..c000310f6b 100644 --- a/lib/clover_quda.cu +++ b/lib/clover_quda.cu @@ -37,9 +37,11 @@ namespace quda { #ifdef GPU_CLOVER_DIRAC void computeClover(CloverField &clover, const GaugeField& f, double coeff) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); if (clover.Precision() < QUDA_SINGLE_PRECISION) errorQuda("Cannot use fixed-point precision here"); clover.Diagonal(0.5); // 0.5 comes from scaling used on native fields instantiate(clover, f, coeff); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } #else void computeClover(CloverField &, const GaugeField &, double) diff --git a/lib/clover_sigma_outer_product.cu b/lib/clover_sigma_outer_product.cu index 1c34a7ff33..370ada813f 100644 --- a/lib/clover_sigma_outer_product.cu +++ b/lib/clover_sigma_outer_product.cu @@ -61,6 +61,7 @@ namespace quda { void computeCloverSigmaOprod(GaugeField& oprod, std::vector &x, std::vector &p, std::vector > &coeff) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); if (x.size() > MAX_NVECTOR) { // divide and conquer std::vector x0(x.begin(), x.begin()+x.size()/2); @@ -83,6 +84,7 @@ namespace quda { } instantiate(oprod, x, p, coeff); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } #else // GPU_CLOVER_DIRAC not defined void computeCloverSigmaOprod(GaugeField &, std::vector &, diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp index 96df00ba55..9649ce9a7f 100644 --- a/lib/color_spinor_field.cpp +++ b/lib/color_spinor_field.cpp @@ -442,6 +442,13 @@ namespace quda void ColorSpinorField::copy(const ColorSpinorField &src) { test_compatible_weak(*this, src); + + if (src.Location() == 
QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) { + getProfile().TPSTART(QUDA_PROFILE_D2H); + } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) { + getProfile().TPSTART(QUDA_PROFILE_H2D); + } + if (Location() == src.Location()) { // H2H and D2D copyGenericColorSpinor(*this, src, Location()); @@ -525,6 +532,12 @@ namespace quda qudaDeviceSynchronize(); // need to sync before data can be used on CPU } + + if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) { + getProfile().TPSTOP(QUDA_PROFILE_D2H); + } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) { + getProfile().TPSTOP(QUDA_PROFILE_H2D); + } } // Fills the param with the contents of this field diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp index 0003663e25..cb4319857e 100644 --- a/lib/gauge_field.cpp +++ b/lib/gauge_field.cpp @@ -938,15 +938,14 @@ namespace quda { void GaugeField::copy(const GaugeField &src) { - auto &profile = getProfile(); + if (this == &src) return; + if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) { - profile.TPSTART(QUDA_PROFILE_D2H); + getProfile().TPSTART(QUDA_PROFILE_D2H); } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) { - profile.TPSTART(QUDA_PROFILE_H2D); + getProfile().TPSTART(QUDA_PROFILE_H2D); } - if (this == &src) return; - checkField(src); if (link_type == QUDA_ASQTAD_FAT_LINKS) { @@ -1112,9 +1111,9 @@ namespace quda { staggeredPhaseType = src.StaggeredPhase(); if (src.Location() == QUDA_CUDA_FIELD_LOCATION && location == QUDA_CPU_FIELD_LOCATION) { - profile.TPSTOP(QUDA_PROFILE_D2H); + getProfile().TPSTOP(QUDA_PROFILE_D2H); } else if (src.Location() == QUDA_CPU_FIELD_LOCATION && location == QUDA_CUDA_FIELD_LOCATION) { - profile.TPSTOP(QUDA_PROFILE_H2D); + getProfile().TPSTOP(QUDA_PROFILE_H2D); } } diff --git a/lib/gauge_field_strength_tensor.cu 
b/lib/gauge_field_strength_tensor.cu index d0ec026881..dc6b763b54 100644 --- a/lib/gauge_field_strength_tensor.cu +++ b/lib/gauge_field_strength_tensor.cu @@ -34,8 +34,10 @@ namespace quda void computeFmunu(GaugeField &f, const GaugeField &u) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); checkPrecision(f, u); instantiate2(u, f); // u must be first here for correct template instantiation + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } } // namespace quda diff --git a/lib/gauge_phase.cu b/lib/gauge_phase.cu index ff959ef0b3..929c5eadb5 100644 --- a/lib/gauge_phase.cu +++ b/lib/gauge_phase.cu @@ -45,9 +45,11 @@ namespace quda { void applyGaugePhase(GaugeField &u) { + getProfile().TPSTART(QUDA_PROFILE_COMPUTE); instantiate(u); // ensure that ghosts are updated if needed if (u.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD) u.exchangeGhost(); + getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); } } // namespace quda diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index e39bf48b5c..aa25b06621 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -558,13 +558,12 @@ void freeUniqueGaugeUtility(GaugeField *&precise, GaugeField *&sloppy, GaugeFiel void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) { - profileGauge.TPSTART(QUDA_PROFILE_TOTAL); + pushProfile(profileGauge); + checkGaugeParam(param); if (!initialized) errorQuda("QUDA not initialized"); if (getVerbosity() == QUDA_DEBUG_VERBOSE) printQudaGaugeParam(param); - checkGaugeParam(param); - profileGauge.TPSTART(QUDA_PROFILE_INIT); // Set the specific input parameters and create the cpu gauge field GaugeFieldParam gauge_param(*param, h_gauge); @@ -631,9 +630,7 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) profileGauge.TPSTOP(QUDA_PROFILE_INIT); } else { profileGauge.TPSTOP(QUDA_PROFILE_INIT); - profileGauge.TPSTART(QUDA_PROFILE_H2D); precise->copy(*in); - profileGauge.TPSTOP(QUDA_PROFILE_H2D); } // for gaugeSmeared we are interested only in the precise version @@ -645,7 +642,7 @@ void 
loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) delete in; profileGauge.TPSTOP(QUDA_PROFILE_FREE); - profileGauge.TPSTOP(QUDA_PROFILE_TOTAL); + popProfile(); return; } @@ -766,12 +763,12 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) extendedGaugeResident = createExtendedGauge(*gaugePrecise, R, profileGauge, false, recon); } - profileGauge.TPSTOP(QUDA_PROFILE_TOTAL); + popProfile(); } void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param) { - profileGauge.TPSTART(QUDA_PROFILE_TOTAL); + pushProfile(profileGauge); if (param->location != QUDA_CPU_FIELD_LOCATION) errorQuda("Non-cpu output location not yet supported"); @@ -798,13 +795,11 @@ void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param) default: errorQuda("Invalid gauge type"); } - profileGauge.TPSTART(QUDA_PROFILE_D2H); cpuGauge.copy(*cudaGauge); - profileGauge.TPSTOP(QUDA_PROFILE_D2H); if (param->type == QUDA_SMEARED_LINKS) { delete cudaGauge; } - profileGauge.TPSTOP(QUDA_PROFILE_TOTAL); + popProfile(); } void loadSloppyCloverQuda(const QudaPrecision prec[]); @@ -812,8 +807,8 @@ void freeSloppyCloverQuda(); void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) { + pushProfile(profileClover); pushVerbosity(inv_param->verbosity); - profileClover.TPSTART(QUDA_PROFILE_TOTAL); profileClover.TPSTART(QUDA_PROFILE_INIT); checkCloverParam(inv_param); @@ -890,11 +885,9 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) profileClover.TPSTOP(QUDA_PROFILE_INIT); if (!device_calc) { - profileClover.TPSTART(QUDA_PROFILE_H2D); cloverPrecise->copy(*in, false); if ((h_clovinv && !inv_param->compute_clover_inverse) && !clover::dynamic_inverse()) cloverPrecise->copy(*in, true); - profileClover.TPSTOP(QUDA_PROFILE_H2D); } else { profileClover.TPSTOP(QUDA_PROFILE_TOTAL); createCloverQuda(inv_param); @@ -902,13 +895,11 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) } if ((!h_clovinv || 
inv_param->compute_clover_inverse) && !clover::dynamic_inverse()) { - profileClover.TPSTART(QUDA_PROFILE_COMPUTE); cloverInvert(*cloverPrecise, inv_param->compute_clover_trlog); if (inv_param->compute_clover_trlog) { inv_param->trlogA[0] = cloverPrecise->TrLog()[0]; inv_param->trlogA[1] = cloverPrecise->TrLog()[1]; } - profileClover.TPSTOP(QUDA_PROFILE_COMPUTE); } } else { if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Gauge field unchanged - using cached clover field\n"); @@ -918,16 +909,12 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) if (inv_param->return_clover || inv_param->return_clover_inverse) { if (inv_param->return_clover) { if (!h_clover) errorQuda("Requested clover field return but no clover host pointer set"); - profileClover.TPSTART(QUDA_PROFILE_D2H); in->copy(*cloverPrecise, false); - profileClover.TPSTOP(QUDA_PROFILE_D2H); } if (inv_param->return_clover_inverse) { if (!h_clovinv) errorQuda("Requested clover field inverse return but no clover host pointer set"); - profileClover.TPSTART(QUDA_PROFILE_D2H); in->copy(*cloverPrecise, true); - profileClover.TPSTOP(QUDA_PROFILE_D2H); } } @@ -950,8 +937,8 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) inv_param->clover_cuda_prec_refinement_sloppy, inv_param->clover_cuda_prec_eigensolver}; loadSloppyCloverQuda(prec); - profileClover.TPSTOP(QUDA_PROFILE_TOTAL); popVerbosity(); + popProfile(); } void freeSloppyCloverQuda(); @@ -1819,7 +1806,7 @@ namespace quda { void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity) { - profileDslash.TPSTART(QUDA_PROFILE_TOTAL); + pushProfile(profileDslash); profileDslash.TPSTART(QUDA_PROFILE_INIT); const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? 
*gaugePrecise : *gaugeFatPrecise; @@ -1850,9 +1837,7 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity profileDslash.TPSTOP(QUDA_PROFILE_INIT); - profileDslash.TPSTART(QUDA_PROFILE_H2D); in = in_h; - profileDslash.TPSTOP(QUDA_PROFILE_H2D); profileDslash.TPSTART(QUDA_PROFILE_COMPUTE); @@ -1886,19 +1871,16 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity } profileDslash.TPSTOP(QUDA_PROFILE_COMPUTE); - profileDslash.TPSTART(QUDA_PROFILE_D2H); out_h = out; - profileDslash.TPSTOP(QUDA_PROFILE_D2H); if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out)); profileDslash.TPSTART(QUDA_PROFILE_FREE); delete dirac; // clean up - profileDslash.TPSTOP(QUDA_PROFILE_FREE); popVerbosity(); - profileDslash.TPSTOP(QUDA_PROFILE_TOTAL); + popProfile(); } void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) @@ -2201,8 +2183,7 @@ void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam *eig_param) { if (!initialized) errorQuda("QUDA not initialized"); - - profileEigensolve.TPSTART(QUDA_PROFILE_TOTAL); + pushProfile(profileEigensolve); profileEigensolve.TPSTART(QUDA_PROFILE_INIT); // Transfer the inv param structure contained in eig_param. @@ -2357,9 +2338,7 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam // host side gamma basis. 
for (int i = 0; i < eig_param->n_conv; i++) { memcpy(host_evals + i, &evals[i], sizeof(Complex)); } if (!(eig_param->arpack_check)) { - profileEigensolve.TPSTART(QUDA_PROFILE_D2H); for (int i = 0; i < n_eig; i++) host_evecs_[i] = kSpace[i]; - profileEigensolve.TPSTOP(QUDA_PROFILE_D2H); } profileEigensolve.TPSTART(QUDA_PROFILE_FREE); @@ -2373,7 +2352,7 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam // cache is written out even if a long benchmarking job gets interrupted saveTuneCache(); - profileEigensolve.TPSTOP(QUDA_PROFILE_TOTAL); + popProfile(); } multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &profile) @@ -2696,10 +2675,9 @@ void destroyDeflationQuda(void *df) { void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) { + pushProfile(profileInvert); profilerStart(__func__); - profileInvert.TPSTART(QUDA_PROFILE_TOTAL); - if (!initialized) errorQuda("QUDA not initialized"); pushVerbosity(param->verbosity); @@ -2743,8 +2721,6 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) Dirac &diracPre = *dPre; Dirac &diracEig = *dEig; - profileInvert.TPSTART(QUDA_PROFILE_H2D); - ColorSpinorField *in = nullptr; ColorSpinorField *out = nullptr; @@ -2805,7 +2781,6 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) diracPre.prefetch(QUDA_CUDA_FIELD_LOCATION); } - profileInvert.TPSTOP(QUDA_PROFILE_H2D); profileInvert.TPSTART(QUDA_PROFILE_PREAMBLE); double nb = blas::norm2(b); @@ -3028,9 +3003,7 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) profileInvert.TPSTOP(QUDA_PROFILE_EPILOGUE); if (!param->make_resident_solution) { - profileInvert.TPSTART(QUDA_PROFILE_D2H); h_x = x; - profileInvert.TPSTOP(QUDA_PROFILE_D2H); } profileInvert.TPSTART(QUDA_PROFILE_EPILOGUE); @@ -3064,9 +3037,8 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) // cache is written out even if a long benchmarking job gets interrupted saveTuneCache(); - 
profileInvert.TPSTOP(QUDA_PROFILE_TOTAL); - profilerStop(__func__); + popProfile(); } void loadFatLongGaugeQuda(QudaInvertParam *inv_param, QudaGaugeParam *gauge_param, void *milc_fatlinks, @@ -3473,9 +3445,9 @@ void dslashMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param */ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) { + pushProfile(profileMulti); profilerStart(__func__); - profileMulti.TPSTART(QUDA_PROFILE_TOTAL); profileMulti.TPSTART(QUDA_PROFILE_INIT); if (!initialized) errorQuda("QUDA not initialized"); @@ -3582,7 +3554,6 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) } profileMulti.TPSTOP(QUDA_PROFILE_INIT); - profileMulti.TPSTART(QUDA_PROFILE_H2D); // Now I need a colorSpinorParam for the device ColorSpinorParam cudaParam(cpuParam, *param, QUDA_CUDA_FIELD_LOCATION); // This setting will download a host vector @@ -3590,8 +3561,6 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) cudaParam.field = &h_b; ColorSpinorField b(cudaParam); // Creates b and downloads h_b to it - profileMulti.TPSTOP(QUDA_PROFILE_H2D); - profileMulti.TPSTART(QUDA_PROFILE_INIT); // Create the solution fields filled with zero cudaParam.create = QUDA_ZERO_FIELD_CREATE; @@ -3781,8 +3750,6 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) // restore shifts for (int i = 0; i < param->num_offset; i++) param->offset[i] = unscaled_shifts[i]; - profileMulti.TPSTART(QUDA_PROFILE_D2H); - if (param->compute_action) { Complex action(0); for (int i = 0; i < param->num_offset; i++) action += param->residue[i] * blas::cDotProduct(b, x[i]); @@ -3799,7 +3766,6 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) if (!param->make_resident_solution) *h_x[i] = x[i]; } - profileMulti.TPSTOP(QUDA_PROFILE_D2H); profileMulti.TPSTART(QUDA_PROFILE_EPILOGUE); @@ -3819,9 +3785,8 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) 
// cache is written out even if a long benchmarking job gets interrupted saveTuneCache(); - profileMulti.TPSTOP(QUDA_PROFILE_TOTAL); - profilerStop(__func__); + popProfile(); } void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, QudaGaugeParam *param) @@ -4132,7 +4097,7 @@ void momResidentQuda(void *mom, QudaGaugeParam *param) void createCloverQuda(QudaInvertParam* invertParam) { - profileClover.TPSTART(QUDA_PROFILE_TOTAL); + pushProfile(profileClover); if (!cloverPrecise) errorQuda("Clover field not allocated"); QudaReconstructType recon = (gaugePrecise->Reconstruct() == QUDA_RECONSTRUCT_8) ? QUDA_RECONSTRUCT_12 : gaugePrecise->Reconstruct(); @@ -4141,8 +4106,6 @@ void createCloverQuda(QudaInvertParam* invertParam) for (int d=0; d<4; d++) R[d] = (d==0 ? 2 : 1) * (redundant_comms || commDimPartitioned(d)); GaugeField *gauge = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profileClover, false, recon); - profileClover.TPSTART(QUDA_PROFILE_INIT); - GaugeField *ex = gauge; if (gauge->Precision() < cloverPrecise->Precision()) { GaugeFieldParam param(*gauge); @@ -4159,17 +4122,14 @@ void createCloverQuda(QudaInvertParam* invertParam) tensorParam.order = QUDA_FLOAT2_GAUGE_ORDER; tensorParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO; GaugeField Fmunu(tensorParam); - profileClover.TPSTOP(QUDA_PROFILE_INIT); - profileClover.TPSTART(QUDA_PROFILE_COMPUTE); computeFmunu(Fmunu, *ex); computeClover(*cloverPrecise, Fmunu, invertParam->clover_coeff); - profileClover.TPSTOP(QUDA_PROFILE_COMPUTE); - profileClover.TPSTOP(QUDA_PROFILE_TOTAL); if (ex != gauge) delete ex; // FIXME always preserve the extended gauge extendedGaugeResident = gauge; + popProfile(); } void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param) @@ -4445,10 +4405,7 @@ void computeHISQForceQuda(void* const milc_momentum, ColorSpinorField cpuQuark(qParam); // create host quark field 
profileHISQForce.TPSTOP(QUDA_PROFILE_INIT); - profileHISQForce.TPSTART(QUDA_PROFILE_H2D); cudaQuark = cpuQuark; - profileHISQForce.TPSTOP(QUDA_PROFILE_H2D); - computeStaggeredOprod(oprod, cudaQuark, coeff[i], 3); } } @@ -4467,10 +4424,7 @@ void computeHISQForceQuda(void* const milc_momentum, ColorSpinorField cpuQuark(qParam); // create host quark field profileHISQForce.TPSTOP(QUDA_PROFILE_INIT); - profileHISQForce.TPSTART(QUDA_PROFILE_H2D); cudaQuark = cpuQuark; - profileHISQForce.TPSTOP(QUDA_PROFILE_H2D); - computeStaggeredOprod(oprod, cudaQuark, coeff[i + num_terms], 3); } } @@ -4655,7 +4609,7 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double QudaInvertParam *inv_param) { using namespace quda; - profileCloverForce.TPSTART(QUDA_PROFILE_TOTAL); + pushProfile(profileCloverForce); profileCloverForce.TPSTART(QUDA_PROFILE_INIT); checkGaugeParam(gauge_param); @@ -4731,7 +4685,6 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double GaugeField oprod(fParam); profileCloverForce.TPSTOP(QUDA_PROFILE_INIT); - profileCloverForce.TPSTART(QUDA_PROFILE_COMPUTE); std::vector force_coeff(nvector); // loop over different quark fields @@ -4745,17 +4698,13 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double qParam.x[0] /= 2; // Wrap the even-parity MILC quark field - profileCloverForce.TPSTOP(QUDA_PROFILE_COMPUTE); profileCloverForce.TPSTART(QUDA_PROFILE_INIT); qParam.v = h_x[i]; ColorSpinorField cpuQuarkX(qParam); // create host quark field profileCloverForce.TPSTOP(QUDA_PROFILE_INIT); - profileCloverForce.TPSTART(QUDA_PROFILE_H2D); x.Even() = cpuQuarkX; - profileCloverForce.TPSTOP(QUDA_PROFILE_H2D); - profileCloverForce.TPSTART(QUDA_PROFILE_COMPUTE); gamma5(x.Even(), x.Even()); } else { x.Even() = solutionResident[i]; @@ -4798,20 +4747,15 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double GaugeField *oprodEx = createExtendedGauge(oprod, R, profileCloverForce); 
- profileCloverForce.TPSTART(QUDA_PROFILE_COMPUTE); - cloverDerivative(cudaForce, *u, *oprodEx, 1.0, QUDA_ODD_PARITY); cloverDerivative(cudaForce, *u, *oprodEx, 1.0, QUDA_EVEN_PARITY); if (u != &gaugeEx) delete u; updateMomentum(cudaMom, -1.0, cudaForce, "clover"); - profileCloverForce.TPSTOP(QUDA_PROFILE_COMPUTE); // copy the outer product field back to the host - profileCloverForce.TPSTART(QUDA_PROFILE_D2H); cpuMom.copy(cudaMom); - profileCloverForce.TPSTOP(QUDA_PROFILE_D2H); profileCloverForce.TPSTART(QUDA_PROFILE_FREE); @@ -4824,9 +4768,8 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double if (inv_param->use_resident_solution) solutionResident.clear(); #endif delete dirac; - profileCloverForce.TPSTOP(QUDA_PROFILE_FREE); - profileCloverForce.TPSTOP(QUDA_PROFILE_TOTAL); + popProfile(); } void updateGaugeFieldQuda(void* gauge, void* momentum, double dt, int conj_mom, int exact, QudaGaugeParam* param) @@ -4941,7 +4884,6 @@ void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param) gParam.reconstruct = param->reconstruct; GaugeField cudaGauge = param->use_resident_gauge ? gaugePrecise->create_alias() : GaugeField(gParam); - profilePhase.TPSTART(QUDA_PROFILE_COMPUTE); *num_failures_h = 0; // apply / remove phase as appropriate @@ -4950,8 +4892,6 @@ void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param) else cudaGauge.removeStaggeredPhase(); - profilePhase.TPSTOP(QUDA_PROFILE_COMPUTE); - if (param->return_result_gauge) cpuGauge.copy(cudaGauge); if (param->make_resident_gauge) { @@ -5023,21 +4963,19 @@ void gaussMomQuda(unsigned long long seed, double sigma) */ void plaqQuda(double plaq[3]) { - profilePlaq.TPSTART(QUDA_PROFILE_TOTAL); + pushProfile(profilePlaq); if (!gaugePrecise) errorQuda("Cannot compute plaquette as there is no resident gauge field"); GaugeField *data = extendedGaugeResident ? 
extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profilePlaq); extendedGaugeResident = data; - profilePlaq.TPSTART(QUDA_PROFILE_COMPUTE); double3 plaq3 = quda::plaquette(*data); plaq[0] = plaq3.x; plaq[1] = plaq3.y; plaq[2] = plaq3.z; - profilePlaq.TPSTOP(QUDA_PROFILE_COMPUTE); - profilePlaq.TPSTOP(QUDA_PROFILE_TOTAL); + popProfile(); } /* @@ -5165,12 +5103,11 @@ void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_param) { - if(smear_param->n_steps == 0) return; + if (smear_param->n_steps == 0) return; + pushProfile(profileGaussianSmear); + profileGaussianSmear.TPSTART(QUDA_PROFILE_INIT); QudaInvertParam *inv_param = smear_param->inv_param; - - profileGaussianSmear.TPSTART(QUDA_PROFILE_TOTAL); - profileGaussianSmear.TPSTART(QUDA_PROFILE_INIT); if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded"); @@ -5258,9 +5195,7 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT); // Copy host data to device - profileGaussianSmear.TPSTART(QUDA_PROFILE_H2D); in = in_h; - profileGaussianSmear.TPSTOP(QUDA_PROFILE_H2D); const double ftmp = -(smear_param->width*smear_param->width)/(4.0*smear_param->n_steps*4.0); /* Extra 4 to compensate for stride 2 */ // Scale up the source to prevent underflow @@ -5286,23 +5221,21 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par profileGaussianSmear.TPSTOP(QUDA_PROFILE_COMPUTE); // Copy device data to host. 
- profileGaussianSmear.TPSTART(QUDA_PROFILE_D2H); in_h = out; - profileGaussianSmear.TPSTOP(QUDA_PROFILE_D2H); profileGaussianSmear.TPSTART(QUDA_PROFILE_FREE); if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Finished 2link Gaussian smearing.\n"); delete d; + profileGaussianSmear.TPSTOP(QUDA_PROFILE_FREE); smear_param->gflops = dirac.Flops(); if (smear_param->delete_2link != 0) { freeUniqueGaugeQuda(QUDA_SMEARED_LINKS); } - profileGaussianSmear.TPSTOP(QUDA_PROFILE_FREE); - profileGaussianSmear.TPSTOP(QUDA_PROFILE_TOTAL); saveTuneCache(); + popProfile(); } @@ -5515,10 +5448,8 @@ void contractQuda(const void *hp_x, const void *hp_y, void *h_result, const Quda void *d_result = pool_device_malloc(data_bytes); profileContract.TPSTOP(QUDA_PROFILE_INIT); - profileContract.TPSTART(QUDA_PROFILE_H2D); x[0] = h_x; y[0] = h_y; - profileContract.TPSTOP(QUDA_PROFILE_H2D); contractQuda(x[0], y[0], d_result, cType); From 442a4601d0816e2cc53b1832a41a42327e9112d6 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 25 May 2023 17:18:05 -0700 Subject: [PATCH 15/99] Add qudaMemsetAsync and qudaMemcpy overloads for quda_ptr --- include/quda_api.h | 20 ++++++++++++++++++++ lib/targets/cuda/quda_api.cpp | 18 ++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/include/quda_api.h b/include/quda_api.h index ea475c43f6..d2abba24d1 100644 --- a/include/quda_api.h +++ b/include/quda_api.h @@ -43,6 +43,16 @@ namespace quda void qudaMemcpy_(void *dst, const void *src, size_t count, qudaMemcpyKind kind, const char *func, const char *file, const char *line); + /** + @brief Wrapper around cudaMemcpy or driver API equivalent + @param[out] dst Destination pointer + @param[in] src Source pointer + @param[in] count Size of transfer + @param[in] kind Type of memory copy + */ + void qudaMemcpy_(const quda_ptr &dst, const quda_ptr &src, size_t count, qudaMemcpyKind kind, const char *func, const char *file, + const char *line); + /** @brief Wrapper around cudaMemcpyAsync or driver API 
equivalent @param[out] dst Destination pointer @@ -101,6 +111,16 @@ namespace quda void qudaMemsetAsync_(void *ptr, int value, size_t count, const qudaStream_t &stream, const char *func, const char *file, const char *line); + /** + @brief Wrapper around cudaMemsetAsync or driver API equivalent + @param[out] ptr Starting address pointer + @param[in] value Value to set for each byte of specified memory + @param[in] count Size in bytes to set + @param[in] stream Stream to issue memset + */ + void qudaMemsetAsync_(quda_ptr &ptr, int value, size_t count, const qudaStream_t &stream, const char *func, + const char *file, const char *line); + /** @brief Wrapper around cudaMemsetAsync or driver API equivalent @param[out] ptr Starting address pointer diff --git a/lib/targets/cuda/quda_api.cpp b/lib/targets/cuda/quda_api.cpp index 3e4ced01bc..8c57fe9079 100644 --- a/lib/targets/cuda/quda_api.cpp +++ b/lib/targets/cuda/quda_api.cpp @@ -325,6 +325,13 @@ namespace quda QudaMem copy(dst, src, count, qudaMemcpyKindToAPI(kind), device::get_default_stream(), false, func, file, line); } + void qudaMemcpy_(const quda_ptr &dst, const quda_ptr &src, size_t count, qudaMemcpyKind kind, const char *func, const char *file, + const char *line) + { + if (count == 0) return; + QudaMem copy(dst.data(), src.data(), count, qudaMemcpyKindToAPI(kind), device::get_default_stream(), false, func, file, line); + } + void qudaMemcpyAsync_(void *dst, const void *src, size_t count, qudaMemcpyKind kind, const qudaStream_t &stream, const char *func, const char *file, const char *line) { @@ -389,6 +396,17 @@ namespace quda QudaMem copy(ptr, value, count, stream, true, func, file, line); } + void qudaMemsetAsync_(quda_ptr &ptr, int value, size_t count, const qudaStream_t &stream, + const char *func, const char *file, const char *line) + { + if (count == 0) return; + if (ptr.is_device()) { + QudaMem set(ptr.data(), value, count, stream, true, func, file, line); + } else { + memset(ptr.data(), value, count); + 
} + } + void qudaMemset2D_(void *ptr, size_t pitch, int value, size_t width, size_t height, const char *func, const char *file, const char *line) { From 44586cde7cfef274376ff8808a71d1a56138d2e2 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 25 May 2023 23:11:10 -0700 Subject: [PATCH 16/99] Use quda_ptr for both color_spinor_field.cpp and clover_field.cpp allocations. Some cleanup --- include/clover_field.h | 18 +++--- include/color_spinor_field.h | 29 ++------- include/lattice_field.h | 4 +- lib/clover_field.cpp | 119 +++++++++++------------------------ lib/color_spinor_field.cpp | 117 ++++++++++------------------------ lib/dirac_clover.cpp | 2 +- lib/gauge_field.cpp | 44 ++++++------- lib/lattice_field.cpp | 24 +------ 8 files changed, 108 insertions(+), 249 deletions(-) diff --git a/include/clover_field.h b/include/clover_field.h index 402ee2936f..579e7eeb1e 100644 --- a/include/clover_field.h +++ b/include/clover_field.h @@ -178,9 +178,10 @@ namespace quda { int nColor = 0; int nSpin = 0; - void *clover = nullptr; - void *cloverInv = nullptr; + quda_ptr clover = {}; + quda_ptr cloverInv = {}; + bool inverse = false; double diagonal = 0.0; array max = {}; @@ -213,12 +214,15 @@ namespace quda { public: CloverField(const CloverFieldParam ¶m); - virtual ~CloverField(); static CloverField *Create(const CloverFieldParam ¶m); - void* V(bool inverse=false) { return inverse ? cloverInv : clover; } - const void* V(bool inverse=false) const { return inverse ? cloverInv : clover; } + void *V(bool inverse = false) const { return inverse ? 
cloverInv.data() : clover.data(); } + + /** + @return whether the inverse is explicitly been allocated + */ + bool Inverse() const { return inverse; } /** @return diagonal scaling factor applied to the identity @@ -406,10 +410,6 @@ namespace quda { */ void copy_from_buffer(void *buffer); - friend class DiracClover; - friend class DiracCloverPC; - friend class DiracTwistedClover; - friend class DiracTwistedCloverPC; }; /** diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h index 1bfd1be413..9b88534e58 100644 --- a/include/color_spinor_field.h +++ b/include/color_spinor_field.h @@ -329,8 +329,7 @@ namespace quda size_t length = 0; // length including pads, but not norm zone - void *v = nullptr; // the field elements - void *v_h = nullptr; // the field elements + quda_ptr v = {}; // the field elements size_t norm_offset = 0; /** offset to the norm (if applicable) */ // multi-GPU parameters @@ -463,37 +462,19 @@ namespace quda /** @brief Return pointer to the field allocation */ - void *V() + void *V() const { if (ghost_only) errorQuda("Not defined for ghost-only field"); - return v; - } - - /** - @brief Return pointer to the field allocation - */ - const void *V() const - { - if (ghost_only) errorQuda("Not defined for ghost-only field"); - return v; - } - - /** - @brief Return pointer to the norm base pointer in the field allocation - */ - void *Norm() - { - if (ghost_only) errorQuda("Not defined for ghost-only field"); - return static_cast(v) + norm_offset; + return v.data(); } /** @brief Return pointer to the norm base pointer in the field allocation */ - const void *Norm() const + void *Norm() const { if (ghost_only) errorQuda("Not defined for ghost-only field"); - return static_cast(v) + norm_offset; + return static_cast(v.data()) + norm_offset; } size_t NormOffset() const { return norm_offset; } diff --git a/include/lattice_field.h b/include/lattice_field.h index 38653350cc..e7c43b7d69 100644 --- a/include/lattice_field.h +++ 
b/include/lattice_field.h @@ -460,9 +460,7 @@ namespace quda { } } - mutable char *backup_h = nullptr; - mutable char *backup_norm_h = nullptr; - mutable bool backed_up = false; + mutable std::vector backup_h = {}; public: /** diff --git a/lib/clover_field.cpp b/lib/clover_field.cpp index cd8cc04ba2..78076c0c9a 100644 --- a/lib/clover_field.cpp +++ b/lib/clover_field.cpp @@ -15,9 +15,7 @@ namespace quda { CloverFieldParam::CloverFieldParam(const CloverField &a) : LatticeFieldParam(a), reconstruct(clover::reconstruct()), - inverse(a.V(true)), - clover(nullptr), - cloverInv(nullptr), + inverse(a.Inverse()), csw(a.Csw()), coeff(a.Coeff()), twist_flavor(a.TwistFlavor()), @@ -36,21 +34,16 @@ namespace quda { CloverField::CloverField(const CloverFieldParam ¶m) : LatticeField(param), reconstruct(param.reconstruct), - bytes(0), nColor(3), nSpin(4), - clover(nullptr), - cloverInv(nullptr), - diagonal(0.0), - max {0, 0}, + inverse(param.inverse), csw(param.csw), coeff(param.coeff), twist_flavor(param.twist_flavor), mu2(param.mu2), rho(param.rho), order(param.order), - create(param.create), - trlog {0, 0} + create(param.create) { if (siteSubset != QUDA_FULL_SITE_SUBSET) errorQuda("Unexpected siteSubset %d", siteSubset); if (nDim != 4) errorQuda("Number of dimensions must be 4, not %d", nDim); @@ -79,53 +72,26 @@ namespace quda { if (bytes) { if (create != QUDA_REFERENCE_FIELD_CREATE) { - if (location == QUDA_CUDA_FIELD_LOCATION) { - clover = pool_device_malloc(bytes); - } else { - clover = safe_malloc(bytes); - } - + clover = std::move(quda_ptr(mem_type, bytes)); } else { - clover = param.clover; + clover = std::move(quda_ptr(param.clover, mem_type)); } total_bytes += bytes; - if (param.inverse) { + if (inverse) { if (create != QUDA_REFERENCE_FIELD_CREATE) { - if (location == QUDA_CUDA_FIELD_LOCATION) { - cloverInv = pool_device_malloc(bytes); - } else { - cloverInv = safe_malloc(bytes); - } + cloverInv = std::move(quda_ptr(mem_type, bytes)); } else { - cloverInv = 
param.cloverInv; + cloverInv = std::move(quda_ptr(param.cloverInv, mem_type)); } total_bytes += bytes; } if (create == QUDA_ZERO_FIELD_CREATE) { - if (location == QUDA_CUDA_FIELD_LOCATION) { - qudaMemset(clover, '\0', bytes); - if (param.inverse) qudaMemset(cloverInv, '\0', bytes); - } else { - memset(clover, '\0', bytes); - if (param.inverse) memset(cloverInv, '\0', bytes); - } - } - } - } - - CloverField::~CloverField() - { - if (create != QUDA_REFERENCE_FIELD_CREATE) { - if (location == QUDA_CUDA_FIELD_LOCATION) { - if (clover) pool_device_free(clover); - if (cloverInv) pool_device_free(cloverInv); - } else { - if (clover) host_free(clover); - if (cloverInv) host_free(cloverInv); + qudaMemset(clover, '\0', bytes); + if (inverse) qudaMemset(cloverInv, '\0', bytes); } } } @@ -141,38 +107,31 @@ namespace quda { void CloverField::backup(bool which) const { - if (Location() == QUDA_CUDA_FIELD_LOCATION) { - qudaMemcpy(backup_h + which * bytes, V(which), bytes, qudaMemcpyDeviceToHost); - } else { - memcpy(backup_h + which * bytes, V(which), bytes); - } + qudaMemcpy(backup_h[which], which ? cloverInv : clover, bytes, qudaMemcpyDefault); } void CloverField::backup() const { - if (backup_h) errorQuda("Already allocated host backup"); - backup_h = static_cast(safe_malloc(2 * bytes)); + if (backup_h.size()) errorQuda("Already allocated host backup"); + backup_h.resize(2); + for (auto &b : backup_h) b = std::move(quda_ptr(QUDA_MEMORY_HOST, bytes)); - if (V(false)) backup(false); - if (V(true)) backup(true); + backup(false); + if (inverse) backup(true); } void CloverField::restore(bool which) const { - if (Location() == QUDA_CUDA_FIELD_LOCATION) { - qudaMemcpy((void *)V(which), backup_h + which * bytes, bytes, qudaMemcpyHostToDevice); - } else { - memcpy((void *)V(which), backup_h + which * bytes, bytes); - } + qudaMemcpy(which ? 
cloverInv : clover, backup_h[which], bytes, qudaMemcpyDefault); } void CloverField::restore() const { - if (V(false)) restore(false); - if (V(true)) restore(true); + if (!backup_h.size()) errorQuda("Cannot restore since not backed up"); + restore(false); + if (inverse) restore(true); - host_free(backup_h); - backup_h = nullptr; + backup_h.resize(0); } CloverField *CloverField::Create(const CloverFieldParam ¶m) { return new CloverField(param); } @@ -192,7 +151,7 @@ namespace quda { // special case where we wish to make a copy of the inverse field when dynamic_inverse is enabled static bool dynamic_inverse_copy = false; - if (is_inverse && clover::dynamic_inverse() && V(true) && !src.V(true) && !dynamic_inverse_copy) { + if (is_inverse && clover::dynamic_inverse() && inverse && !src.inverse && !dynamic_inverse_copy) { dynamic_inverse_copy = true; // create a copy of the clover field that we will invert in place and use as the source CloverFieldParam param(src); @@ -207,8 +166,8 @@ namespace quda { } checkField(src); - if (!V(is_inverse)) errorQuda("Destination field's is_inverse=%d component does not exist", is_inverse); - if (!src.V(is_inverse) && !dynamic_inverse_copy) + if (is_inverse && !inverse) errorQuda("Destination field's is_inverse=%d component does not exist", is_inverse); + if (is_inverse && !src.Inverse() && !dynamic_inverse_copy) errorQuda("Source field's is_inverse=%d component does not exist", is_inverse); auto src_v = dynamic_inverse_copy ? 
src.V(false) : src.V(is_inverse); @@ -280,26 +239,22 @@ namespace quda { void CloverField::copy_to_buffer(void *buffer) const { size_t buffer_offset = 0; - if (V(false)) { // direct - qudaMemcpy(buffer, clover, bytes, qudaMemcpyDefault); - buffer_offset += bytes; - } + qudaMemcpy(buffer, clover.data(), bytes, qudaMemcpyDefault); + buffer_offset += bytes; - if (V(true)) { // inverse - qudaMemcpy(static_cast(buffer) + buffer_offset, cloverInv, bytes, qudaMemcpyDefault); + if (inverse) { // inverse + qudaMemcpy(static_cast(buffer) + buffer_offset, cloverInv.data(), bytes, qudaMemcpyDefault); } } void CloverField::copy_from_buffer(void *buffer) { size_t buffer_offset = 0; - if (V(false)) { // direct - qudaMemcpy(clover, static_cast(buffer), bytes, qudaMemcpyDefault); - buffer_offset += bytes; - } + qudaMemcpy(clover.data(), static_cast(buffer), bytes, qudaMemcpyDefault); + buffer_offset += bytes; - if (V(true)) { // inverse - qudaMemcpy(cloverInv, static_cast(buffer) + buffer_offset, bytes, qudaMemcpyDefault); + if (inverse) { // inverse + qudaMemcpy(cloverInv.data(), static_cast(buffer) + buffer_offset, bytes, qudaMemcpyDefault); } } @@ -313,12 +268,12 @@ namespace quda { QudaParity parity) const { if (location == QUDA_CUDA_FIELD_LOCATION && is_prefetch_enabled()) { - auto clover_parity = clover; - auto cloverInv_parity = cloverInv; auto bytes_parity = parity == QUDA_INVALID_PARITY ? bytes : bytes / 2; + auto clover_parity = clover.data(); + auto cloverInv_parity = inverse ? cloverInv.data() : nullptr; if (parity == QUDA_ODD_PARITY) { - clover_parity = clover ? static_cast(clover_parity) + bytes_parity : nullptr; - cloverInv_parity = cloverInv ? static_cast(cloverInv_parity) + bytes_parity : nullptr; + clover_parity = static_cast(clover_parity) + bytes_parity; + cloverInv_parity = inverse ? 
static_cast(cloverInv_parity) + bytes_parity : nullptr; } switch (type) { @@ -376,7 +331,7 @@ namespace quda { spinor_param.fieldOrder = colorspinor::getNative(a.Precision(), a.Nspin()); spinor_param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; spinor_param.create = QUDA_REFERENCE_FIELD_CREATE; - spinor_param.v = (void*)a.V(inverse); + spinor_param.v = a.V(inverse); spinor_param.location = a.Location(); return spinor_param; } diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp index 9649ce9a7f..26a373d29e 100644 --- a/lib/color_spinor_field.cpp +++ b/lib/color_spinor_field.cpp @@ -23,15 +23,6 @@ namespace quda composite_descr(param.is_composite, param.composite_dim, param.is_component, param.component_id), components(0) { - // this must come before create - if (param.create == QUDA_REFERENCE_FIELD_CREATE) { - v = param.v; - norm_offset = param.norm_offset; - reference = true; - } else if (param.create == QUDA_GHOST_FIELD_CREATE) { - ghost_only = true; - } - create(param); switch (param.create) { @@ -157,21 +148,13 @@ namespace quda errorQuda("Subset not implemented"); if (param.create != QUDA_REFERENCE_FIELD_CREATE && param.create != QUDA_GHOST_FIELD_CREATE) { - if (location == QUDA_CPU_FIELD_LOCATION) { - v = safe_malloc(bytes); - } else if (location == QUDA_CUDA_FIELD_LOCATION) { - switch (mem_type) { - case QUDA_MEMORY_DEVICE: v = pool_device_malloc(bytes); break; - case QUDA_MEMORY_MAPPED: - v_h = mapped_malloc(bytes); - v = get_mapped_device_pointer(v_h); - break; - default: errorQuda("Unsupported memory type %d", mem_type); - } - } else { - errorQuda("Unexpected field location %d", location); - } + v = std::move(quda_ptr(mem_type, bytes)); alloc = true; + } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) { + v = std::move(quda_ptr(param.v, mem_type)); + reference = true; + } else if (param.create == QUDA_GHOST_FIELD_CREATE) { + ghost_only = true; } if (composite_descr.is_composite && param.create != QUDA_REFERENCE_FIELD_CREATE @@ -186,7 +169,7 
@@ namespace quda components.reserve(composite_descr.dim); for (int cid = 0; cid < composite_descr.dim; cid++) { param.component_id = cid; - param.v = static_cast(static_cast(v) + cid * bytes / composite_descr.dim); + param.v = static_cast(static_cast(v.data()) + cid * bytes / composite_descr.dim); components.push_back(new ColorSpinorField(param)); } } @@ -203,7 +186,7 @@ namespace quda param.is_component = composite_descr.is_component; param.component_id = composite_descr.id; even = new ColorSpinorField(param); - param.v = static_cast(v) + bytes / 2; + param.v = static_cast(v.data()) + bytes / 2; odd = new ColorSpinorField(param); } @@ -231,10 +214,10 @@ namespace quda size_t subset_bytes_raw = bytes_raw / siteSubset; for (int subset = 0; subset < siteSubset; subset++) { if (location == QUDA_CUDA_FIELD_LOCATION) - qudaMemsetAsync(static_cast(v) + subset_bytes_raw + subset_bytes * subset, 0, + qudaMemsetAsync(static_cast(v.data()) + subset_bytes_raw + subset_bytes * subset, 0, subset_bytes - subset_bytes_raw, device::get_default_stream()); else - memset(static_cast(v) + subset_bytes_raw + subset_bytes * subset, 0, subset_bytes - subset_bytes_raw); + memset(static_cast(v.data()) + subset_bytes_raw + subset_bytes * subset, 0, subset_bytes - subset_bytes_raw); } } } @@ -252,8 +235,7 @@ namespace quda pc_type = std::exchange(src.pc_type, QUDA_PC_INVALID); suggested_parity = std::exchange(src.suggested_parity, QUDA_INVALID_PARITY); length = std::exchange(src.length, 0); - v = std::exchange(src.v, nullptr); - v_h = std::exchange(src.v_h, nullptr); + v = std::exchange(src.v, {}); norm_offset = std::exchange(src.norm_offset, 0); ghost = std::exchange(src.ghost, {}); ghostFace = std::exchange(src.ghostFace, {}); @@ -274,18 +256,7 @@ namespace quda void ColorSpinorField::destroy() { if (alloc) { - if (location == QUDA_CPU_FIELD_LOCATION) { - host_free(v); - } else { // device field - switch (mem_type) { - case QUDA_MEMORY_DEVICE: pool_device_free(v); break; - case 
QUDA_MEMORY_MAPPED: host_free(v_h); break; - default: errorQuda("Unsupported memory type %d", mem_type); - } - } alloc = false; - v = nullptr; - v_h = nullptr; if (composite_descr.is_composite) { CompositeColorSpinorField::iterator vec; @@ -432,11 +403,7 @@ namespace quda void ColorSpinorField::zero() { - if (Location() == QUDA_CUDA_FIELD_LOCATION) { - qudaMemsetAsync(v, 0, bytes, device::get_default_stream()); - } else { - memset(v, '\0', bytes); - } + qudaMemsetAsync(v, 0, bytes, device::get_default_stream()); } void ColorSpinorField::copy(const ColorSpinorField &src) @@ -459,7 +426,7 @@ namespace quda void *buffer = pool_pinned_malloc(bytes); memset(buffer, 0, bytes); // FIXME (temporary?) bug fix for padding copyGenericColorSpinor(*this, src, QUDA_CPU_FIELD_LOCATION, buffer, 0); - qudaMemcpy(v, buffer, bytes, qudaMemcpyDefault); + qudaMemcpy(v.data(), buffer, bytes, qudaMemcpyDefault); pool_pinned_free(buffer); } else { // reorder on device @@ -467,7 +434,7 @@ namespace quda if (src.FieldOrder() == QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER) { // special case where we use mapped memory to read/write directly from application's array void *src_d = get_mapped_device_pointer(src.V()); - copyGenericColorSpinor(*this, src, QUDA_CUDA_FIELD_LOCATION, v, src_d); + copyGenericColorSpinor(*this, src, QUDA_CUDA_FIELD_LOCATION, v.data(), src_d); } else { void *Src = nullptr, *buffer = nullptr; if (!zeroCopy) { @@ -494,7 +461,7 @@ namespace quda if (reorder_location() == QUDA_CPU_FIELD_LOCATION) { // reorder on the host void *buffer = pool_pinned_malloc(bytes); - qudaMemcpy(buffer, v, bytes, qudaMemcpyDefault); + qudaMemcpy(buffer, v.data(), bytes, qudaMemcpyDefault); copyGenericColorSpinor(*this, src, QUDA_CPU_FIELD_LOCATION, 0, buffer); pool_pinned_free(buffer); @@ -502,7 +469,7 @@ namespace quda if (FieldOrder() == QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER) { // special case where we use zero-copy memory to read/write directly from application's array - void *dest_d = 
get_mapped_device_pointer(v); + void *dest_d = get_mapped_device_pointer(v.data()); copyGenericColorSpinor(*this, src, QUDA_CUDA_FIELD_LOCATION, dest_d, src.V()); } else { void *dst = nullptr, *buffer = nullptr; @@ -517,10 +484,10 @@ namespace quda copyGenericColorSpinor(*this, src, QUDA_CUDA_FIELD_LOCATION, dst, 0); if (!zeroCopy) { - qudaMemcpy(v, dst, Bytes(), qudaMemcpyDefault); + qudaMemcpy(v.data(), dst, Bytes(), qudaMemcpyDefault); } else { qudaDeviceSynchronize(); - memcpy(v, buffer, bytes); + memcpy(v.data(), buffer, bytes); } if (zeroCopy) @@ -545,7 +512,7 @@ namespace quda { LatticeField::fill(param); param.field = const_cast(this); - param.v = v; + param.v = v.data(); param.nColor = nColor; param.nSpin = nSpin; param.nVec = nVec; @@ -1516,49 +1483,29 @@ namespace quda void ColorSpinorField::backup() const { - if (backed_up) errorQuda("ColorSpinorField already backed up"); - - backup_h = new char[bytes]; - if (Location() == QUDA_CUDA_FIELD_LOCATION) { - qudaMemcpy(backup_h, v, bytes, qudaMemcpyDefault); - } else { - memcpy(backup_h, v, bytes); - } - - backed_up = true; + if (backup_h.size()) errorQuda("ColorSpinorField already backed up"); + backup_h.resize(1); + backup_h[0] = std::move(quda_ptr(QUDA_MEMORY_HOST, bytes)); + qudaMemcpy(backup_h[0], v, bytes, qudaMemcpyDefault); } void ColorSpinorField::restore() const { - if (!backed_up) errorQuda("Cannot restore since not backed up"); - - if (Location() == QUDA_CUDA_FIELD_LOCATION) { - qudaMemcpy(v, backup_h, bytes, qudaMemcpyDefault); - delete[] backup_h; - } else { - memcpy(v, backup_h, bytes); - delete[] backup_h; - } - - backed_up = false; + if (!backup_h.size()) errorQuda("Cannot restore since not backed up"); + qudaMemcpy(v, backup_h[0], bytes, qudaMemcpyDefault); + backup_h.resize(0); } void ColorSpinorField::copy_to_buffer(void *buffer) const { - if (Location() == QUDA_CUDA_FIELD_LOCATION) { - qudaMemcpy(buffer, v, bytes, qudaMemcpyDeviceToHost); - } else { - std::memcpy(buffer, v, bytes); - } + 
quda_ptr buf(buffer, QUDA_MEMORY_HOST); + qudaMemcpy(buf, v, bytes, qudaMemcpyDefault); } void ColorSpinorField::copy_from_buffer(void *buffer) { - if (Location() == QUDA_CUDA_FIELD_LOCATION) { - qudaMemcpy(v, buffer, bytes, qudaMemcpyHostToDevice); - } else { - std::memcpy(v, buffer, bytes); - } + quda_ptr buf(buffer, QUDA_MEMORY_HOST); + qudaMemcpy(v, buf, bytes, qudaMemcpyDefault); } void ColorSpinorField::prefetch(QudaFieldLocation mem_space, qudaStream_t stream) const @@ -1566,7 +1513,7 @@ namespace quda if (Location() == QUDA_CUDA_FIELD_LOCATION) { // conditionals based on destructor if (is_prefetch_enabled() && alloc && mem_type == QUDA_MEMORY_DEVICE) - qudaMemPrefetchAsync(v, bytes, mem_space, stream); + qudaMemPrefetchAsync(v.data(), bytes, mem_space, stream); } } @@ -1607,7 +1554,7 @@ namespace quda std::ostream &operator<<(std::ostream &out, const ColorSpinorField &a) { out << "location = " << a.Location() << std::endl; - out << "v = " << a.v << std::endl; + out << "v = " << a.v.data() << std::endl; out << "alloc = " << a.alloc << std::endl; out << "reference = " << a.reference << std::endl; out << "init = " << a.init << std::endl; diff --git a/lib/dirac_clover.cpp b/lib/dirac_clover.cpp index cf57b39352..6bb8e56df5 100644 --- a/lib/dirac_clover.cpp +++ b/lib/dirac_clover.cpp @@ -105,7 +105,7 @@ namespace quda { DiracClover(param) { // For the preconditioned operator, we need to check that the inverse of the clover term is present - if (!clover->cloverInv && !clover::dynamic_inverse()) errorQuda("Clover inverse required for DiracCloverPC"); + if (!clover->Inverse() && !clover::dynamic_inverse()) errorQuda("Clover inverse required for DiracCloverPC"); } DiracCloverPC::DiracCloverPC(const DiracCloverPC &dirac) : DiracClover(dirac) { } diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp index cb4319857e..e129de86bf 100644 --- a/lib/gauge_field.cpp +++ b/lib/gauge_field.cpp @@ -897,7 +897,8 @@ namespace quda { } } - void *create_gauge_buffer(size_t bytes, 
QudaGaugeFieldOrder order, QudaFieldGeometry geometry) { + void *create_gauge_buffer(size_t bytes, QudaGaugeFieldOrder order, QudaFieldGeometry geometry) + { if (order == QUDA_QDP_GAUGE_ORDER) { void **buffer = new void*[geometry]; for (int d=0; d 4) { void **buffer = new void*[geometry]; for (int d=0; d 4) { for (int d=0; d(buffer); } else { - backup_h = new char[bytes]; - qudaMemcpy(backup_h, gauge.data(), bytes, qudaMemcpyDefault); + backup_h.resize(1); + backup_h[0] = std::move(quda_ptr(QUDA_MEMORY_HOST, bytes)); + qudaMemcpy(backup_h[0], gauge, bytes, qudaMemcpyDefault); } - - backed_up = true; } void GaugeField::restore() const { - if (!backed_up) errorQuda("Cannot restore since not backed up"); + if (!backup_h.size()) errorQuda("Cannot restore since not backed up"); if (order == QUDA_QDP_GAUGE_ORDER) { - char **buffer = reinterpret_cast(backup_h); for (int d = 0; d < geometry; d++) { - qudaMemcpy(gauge_array[d].data(), buffer[d], bytes / geometry, qudaMemcpyDefault); - delete[] buffer[d]; + qudaMemcpy(gauge_array[d], backup_h[d], bytes / geometry, qudaMemcpyDefault); } - delete[] buffer; } else { - qudaMemcpy(gauge.data(), backup_h, bytes, qudaMemcpyDefault); - delete[] backup_h; + qudaMemcpy(gauge, backup_h[0], bytes, qudaMemcpyDefault); } - backed_up = false; + + backup_h.resize(0); } void GaugeField::copy_to_buffer(void *buffer) const diff --git a/lib/lattice_field.cpp b/lib/lattice_field.cpp index 8b4b123776..b75b1dcff8 100644 --- a/lib/lattice_field.cpp +++ b/lib/lattice_field.cpp @@ -29,17 +29,13 @@ namespace quda { volume(1), localVolume(1), pad(param.pad), - total_bytes(0), nDim(param.nDim), location(param.location), precision(param.Precision()), ghost_precision(param.GhostPrecision()), - ghost_precision_reset(false), scale(param.scale), siteSubset(param.siteSubset), ghostExchange(param.ghostExchange), - ghost_bytes(0), - ghost_bytes_old(0), ghost_face_bytes {}, ghost_face_bytes_aligned {}, ghost_offset(), @@ -59,11 +55,7 @@ namespace quda { mh_send 
{}, mh_recv_rdma {}, mh_send_rdma {}, - initComms(false), - mem_type(param.mem_type), - backup_h(nullptr), - backup_norm_h(nullptr), - backed_up(false) + mem_type(param.mem_type) { create(param); } @@ -75,18 +67,14 @@ namespace quda { localVolumeCB(field.localVolumeCB), stride(field.stride), pad(field.pad), - total_bytes(0), nDim(field.nDim), location(field.location), precision(field.precision), ghost_precision(field.ghost_precision), - ghost_precision_reset(false), scale(field.scale), siteSubset(field.siteSubset), ghostExchange(field.ghostExchange), nDimComms(field.nDimComms), - ghost_bytes(0), - ghost_bytes_old(0), ghost_face_bytes {}, ghost_face_bytes_aligned {}, ghost_offset(), @@ -106,11 +94,7 @@ namespace quda { mh_send {}, mh_recv_rdma {}, mh_send_rdma {}, - initComms(false), - mem_type(field.mem_type), - backup_h(nullptr), - backup_norm_h(nullptr), - backed_up(false) + mem_type(field.mem_type) { LatticeFieldParam param; field.fill(param); @@ -247,9 +231,7 @@ namespace quda { vol_string = std::exchange(src.vol_string, {}); aux_string = std::exchange(src.aux_string, {}); mem_type = std::exchange(src.mem_type, QUDA_MEMORY_INVALID); - backup_h = std::exchange(src.backup_h, nullptr); - backup_norm_h = std::exchange(src.backup_norm_h, nullptr); - backed_up = std::exchange(src.backed_up, false); + backup_h = std::exchange(src.backup_h, {}); } void LatticeField::fill(LatticeFieldParam ¶m) const From ece19db8847cde90a40bb44c3ef9ae2189386ee7 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 26 May 2023 13:15:12 -0700 Subject: [PATCH 17/99] Fix clang warnings --- include/color_spinor_field.h | 2 +- include/gauge_field.h | 2 +- lib/dslash_coarse.hpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h index 9b88534e58..8186425d1c 100644 --- a/include/color_spinor_field.h +++ b/include/color_spinor_field.h @@ -905,7 +905,7 @@ namespace quda static void test_compatible_weak(const 
ColorSpinorField &a, const ColorSpinorField &b); friend std::ostream &operator<<(std::ostream &out, const ColorSpinorField &); - friend class ColorSpinorParam; + friend struct ColorSpinorParam; }; /** diff --git a/include/gauge_field.h b/include/gauge_field.h index 52a4a40b06..bf75bc6bfa 100644 --- a/include/gauge_field.h +++ b/include/gauge_field.h @@ -599,7 +599,7 @@ namespace quda { */ void copy_from_buffer(void *buffer); - friend class GaugeFieldParam; + friend struct GaugeFieldParam; }; /** diff --git a/lib/dslash_coarse.hpp b/lib/dslash_coarse.hpp index 486217ddd5..a98290d129 100644 --- a/lib/dslash_coarse.hpp +++ b/lib/dslash_coarse.hpp @@ -740,7 +740,7 @@ namespace quda { strcat(aux, dslash.inA[0].AuxString().c_str()); strcat(aux, ",gauge_prec="); - char prec_str[8]; + char prec_str[16]; i32toa(prec_str, dslash.Y.Precision()); strcat(aux, prec_str); strcat(aux, ",halo_prec="); From f5685855cc26d2a9faab7d3407b44df44bac41f4 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 30 May 2023 12:16:26 -0700 Subject: [PATCH 18/99] Clean up and fix some bugs that creeped in --- include/quda_api.h | 25 ++++++----------------- lib/clover_field.cpp | 18 ++++++++--------- lib/coarse_op.in.cu | 2 +- lib/color_spinor_field.cpp | 13 ++++-------- lib/gauge_field.cpp | 37 ++++++++++++----------------------- lib/targets/cuda/quda_api.cpp | 20 +++---------------- 6 files changed, 35 insertions(+), 80 deletions(-) diff --git a/include/quda_api.h b/include/quda_api.h index d2abba24d1..b3b9f35b69 100644 --- a/include/quda_api.h +++ b/include/quda_api.h @@ -90,17 +90,6 @@ namespace quda */ void qudaMemset_(void *ptr, int value, size_t count, const char *func, const char *file, const char *line); - /** - @brief Heterogenous memset2d function - @param[out] ptr Heterogeneous pointer - @param[in] offset Offset shift in bytes from the base pointer - @param[in] Pitch in bytes - @param[in] value Value to set for each byte of specified memory - @param[in] width Width in bytes - 
@param[in] height Height in bytes - */ - void qudaMemset2D_(quda_ptr &ptr, size_t offset, size_t pitch, int value, size_t width, size_t height, const char *func, const char *file, const char *line); - /** @brief Wrapper around cudaMemsetAsync or driver API equivalent @param[out] ptr Starting address pointer @@ -122,16 +111,17 @@ namespace quda const char *file, const char *line); /** - @brief Wrapper around cudaMemsetAsync or driver API equivalent + @brief Asynchronous heterogenous memset2d function @param[out] ptr Starting address pointer + @param[in] Initial offset from pointer @param[in] Pitch in bytes @param[in] value Value to set for each byte of specified memory @param[in] width Width in bytes @param[in] height Height in bytes @param[in] stream Stream to issue memset */ - void qudaMemset2DAsync_(void *ptr, size_t pitch, int value, size_t width, size_t height, const qudaStream_t &stream, - const char *func, const char *file, const char *line); + void qudaMemset2DAsync_(quda_ptr &ptr, size_t offset, size_t pitch, int value, size_t width, size_t height, + const qudaStream_t &stream, const char *func, const char *file, const char *line); /** @brief Wrapper around cudaMemPrefetchAsync or driver API equivalent @@ -253,14 +243,11 @@ namespace quda #define qudaMemset(ptr, value, count) \ ::quda::qudaMemset_(ptr, value, count, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__)) -#define qudaMemset2D(ptr, offset, pitch, value, width, height) \ - ::quda::qudaMemset2D_(ptr, offset, pitch, value, width, height, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__)) - #define qudaMemsetAsync(ptr, value, count, stream) \ ::quda::qudaMemsetAsync_(ptr, value, count, stream, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__)) -#define qudaMemset2DAsync(ptr, pitch, value, width, height, stream) \ - ::quda::qudaMemset2DAsync_(ptr, pitch, value, width, height, stream, __func__, quda::file_name(__FILE__), \ +#define qudaMemset2DAsync(ptr, offset, pitch, 
value, width, height, stream) \ + ::quda::qudaMemset2DAsync_(ptr, offset, pitch, value, width, height, stream, __func__, quda::file_name(__FILE__), \ __STRINGIFY__(__LINE__)) #define qudaMemPrefetchAsync(ptr, count, mem_space, stream) \ diff --git a/lib/clover_field.cpp b/lib/clover_field.cpp index 78076c0c9a..2727069224 100644 --- a/lib/clover_field.cpp +++ b/lib/clover_field.cpp @@ -337,19 +337,17 @@ namespace quda { } // Return the L2 norm squared of the clover field - double norm2(const CloverField &a, bool inverse) { - ColorSpinorField *b = ColorSpinorField::Create(colorSpinorParam(a, inverse)); - double nrm2 = blas::norm2(*b); - delete b; - return nrm2; + double norm2(const CloverField &a, bool inverse) + { + ColorSpinorField b(colorSpinorParam(a, inverse)); + return blas::norm2(b); } // Return the L1 norm of the clover field - double norm1(const CloverField &a, bool inverse) { - ColorSpinorField *b = ColorSpinorField::Create(colorSpinorParam(a, inverse)); - double nrm1 = blas::norm1(*b); - delete b; - return nrm1; + double norm1(const CloverField &a, bool inverse) + { + ColorSpinorField b(colorSpinorParam(a, inverse)); + return blas::norm1(b); } } // namespace quda diff --git a/lib/coarse_op.in.cu b/lib/coarse_op.in.cu index 0684e0e97a..358c3ba0b9 100644 --- a/lib/coarse_op.in.cu +++ b/lib/coarse_op.in.cu @@ -97,7 +97,7 @@ namespace quda { gCoarseAtomic yAccessorAtomic(const_cast(Yatomic)); gCoarseAtomic xAccessorAtomic(const_cast(Xatomic)); cFine cAccessor(const_cast(c), false); - cFine cInvAccessor(const_cast(c), true); + cFine cInvAccessor(const_cast(c), c.Inverse()); calculateY (yAccessor, xAccessor, yAccessorAtomic, xAccessorAtomic, uvAccessor, diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp index 26a373d29e..a40f191712 100644 --- a/lib/color_spinor_field.cpp +++ b/lib/color_spinor_field.cpp @@ -186,7 +186,7 @@ namespace quda param.is_component = composite_descr.is_component; param.component_id = composite_descr.id; even = new 
ColorSpinorField(param); - param.v = static_cast(v.data()) + bytes / 2; + param.v = !ghost_only ? static_cast(v.data()) + bytes / 2 : nullptr; odd = new ColorSpinorField(param); } @@ -208,17 +208,12 @@ namespace quda void ColorSpinorField::zeroPad() { + if (!isNative()) return; // zero the region added for alignment reasons if (bytes != bytes_raw) { size_t subset_bytes = bytes / siteSubset; size_t subset_bytes_raw = bytes_raw / siteSubset; - for (int subset = 0; subset < siteSubset; subset++) { - if (location == QUDA_CUDA_FIELD_LOCATION) - qudaMemsetAsync(static_cast(v.data()) + subset_bytes_raw + subset_bytes * subset, 0, - subset_bytes - subset_bytes_raw, device::get_default_stream()); - else - memset(static_cast(v.data()) + subset_bytes_raw + subset_bytes * subset, 0, subset_bytes - subset_bytes_raw); - } + qudaMemset2DAsync(v, subset_bytes_raw, subset_bytes, 0, subset_bytes - subset_bytes_raw, siteSubset, device::get_default_stream()); } } @@ -512,7 +507,7 @@ namespace quda { LatticeField::fill(param); param.field = const_cast(this); - param.v = v.data(); + param.v = !ghost_only ? 
v.data() : nullptr; param.nColor = nColor; param.nSpin = nSpin; param.nVec = nVec; diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp index e129de86bf..51d5b59a47 100644 --- a/lib/gauge_field.cpp +++ b/lib/gauge_field.cpp @@ -309,7 +309,7 @@ namespace quda { size_t pitch = stride * order * precision; if (pad_bytes) { for (int parity = 0; parity < 2; parity++) { - qudaMemset2D(gauge, parity * (bytes / 2) + volumeCB * order * precision, pitch, 0, pad_bytes, Npad); + qudaMemset2DAsync(gauge, parity * (bytes / 2) + volumeCB * order * precision, pitch, 0, pad_bytes, Npad, device::get_default_stream()); } } } @@ -863,15 +863,6 @@ namespace quda { comm_wait(mh_recv[i]); } - if (Location() == QUDA_CUDA_FIELD_LOCATION) { - for (int i=0; i(ptr.data()) + offset, pitch, value, width, height); + cudaError_t error = cudaMemset2DAsync(static_cast(ptr.data()) + offset, pitch, value, width, height, get_stream(stream)); set_runtime_error(error, __func__, func, file, line); } else { for (auto i = 0u; i < height; i++) memset(static_cast(ptr.data()) + offset + i * pitch, value, width); } } - void qudaMemset2DAsync_(void *ptr, size_t pitch, int value, size_t width, size_t height, const qudaStream_t &stream, - const char *func, const char *file, const char *line) - { - cudaError_t error = cudaMemset2DAsync(ptr, pitch, value, width, height, get_stream(stream)); - set_runtime_error(error, __func__, func, file, line); - } - void qudaMemPrefetchAsync_(void *ptr, size_t count, QudaFieldLocation mem_space, const qudaStream_t &stream, const char *func, const char *file, const char *line) { From b15f94d9dabb258eb5792671af1164fc7611498a Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 30 May 2023 22:44:39 -0700 Subject: [PATCH 19/99] Update MRE solver to use getProfile --- include/invert_quda.h | 4 +--- lib/interface_quda.cpp | 6 +++--- lib/inv_mre.cpp | 20 ++++++++++---------- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/include/invert_quda.h 
b/include/invert_quda.h index b2699ad3b7..8043fb7f0d 100644 --- a/include/invert_quda.h +++ b/include/invert_quda.h @@ -1542,8 +1542,6 @@ namespace quda { bool apply_mat; //! Whether to compute q = Ap or assume it is provided bool hermitian; //! Whether A is hermitian or not - TimeProfile &profile; - /** @brief Solve the equation A p_k psi_k = q_k psi_k = b by minimizing the residual and using Eigen's SVD algorithm for numerical stability @@ -1562,7 +1560,7 @@ namespace quda { @param apply_mat Whether to apply the operator in place or assume q already contains this @profile Timing profile to use */ - MinResExt(const DiracMatrix &mat, bool orthogonal, bool apply_mat, bool hermitian, TimeProfile &profile = dummy); + MinResExt(const DiracMatrix &mat, bool orthogonal, bool apply_mat, bool hermitian); /** @param x The optimum for the solution vector. diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index aa25b06621..3c32443205 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -2898,7 +2898,7 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) bool orthogonal = true; bool apply_mat = false; bool hermitian = false; - MinResExt mre(m, orthogonal, apply_mat, hermitian, profileInvert); + MinResExt mre(m, orthogonal, apply_mat, hermitian); mre(*out, *in, basis, Ap); profileInvert.TPSTOP(QUDA_PROFILE_CHRONO); @@ -2933,7 +2933,7 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) bool orthogonal = true; bool apply_mat = false; bool hermitian = true; - MinResExt mre(m, orthogonal, apply_mat, hermitian, profileInvert); + MinResExt mre(m, orthogonal, apply_mat, hermitian); mre(*out, *in, basis, Ap); profileInvert.TPSTOP(QUDA_PROFILE_CHRONO); @@ -3712,7 +3712,7 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) bool orthogonal = false; bool apply_mat = true; bool hermitian = true; - MinResExt mre(*m, orthogonal, apply_mat, hermitian, profileMulti); + MinResExt mre(*m, orthogonal, apply_mat, 
hermitian); mre(x[i], b, z, q); } diff --git a/lib/inv_mre.cpp b/lib/inv_mre.cpp index 91a79bab55..10733a6aaa 100644 --- a/lib/inv_mre.cpp +++ b/lib/inv_mre.cpp @@ -5,8 +5,8 @@ namespace quda { - MinResExt::MinResExt(const DiracMatrix &mat, bool orthogonal, bool apply_mat, bool hermitian, TimeProfile &profile) : - mat(mat), orthogonal(orthogonal), apply_mat(apply_mat), hermitian(hermitian), profile(profile) + MinResExt::MinResExt(const DiracMatrix &mat, bool orthogonal, bool apply_mat, bool hermitian) : + mat(mat), orthogonal(orthogonal), apply_mat(apply_mat), hermitian(hermitian) { } @@ -44,14 +44,14 @@ namespace quda for (int j = 0; j < N; j++) { A(i, j) = A_[i * (N + 1) + j]; } } - profile.TPSTOP(QUDA_PROFILE_CHRONO); - profile.TPSTART(QUDA_PROFILE_EIGEN); + getProfile().TPSTOP(QUDA_PROFILE_CHRONO); + getProfile().TPSTART(QUDA_PROFILE_EIGEN); LDLT cholesky(A); psi = cholesky.solve(phi); - profile.TPSTOP(QUDA_PROFILE_EIGEN); - profile.TPSTART(QUDA_PROFILE_CHRONO); + getProfile().TPSTOP(QUDA_PROFILE_EIGEN); + getProfile().TPSTART(QUDA_PROFILE_CHRONO); for (int i = 0; i < N; i++) psi_[i] = psi(i); } @@ -70,8 +70,8 @@ namespace quda void MinResExt::operator()(ColorSpinorField &x, const ColorSpinorField &b, std::vector &p, std::vector &q) { - bool running = profile.isRunning(QUDA_PROFILE_CHRONO); - if (!running) profile.TPSTART(QUDA_PROFILE_CHRONO); + bool running = getProfile().isRunning(QUDA_PROFILE_CHRONO); + if (!running) getProfile().TPSTART(QUDA_PROFILE_CHRONO); const int N = p.size(); logQuda(QUDA_VERBOSE, "Constructing minimum residual extrapolation with basis size %d\n", N); @@ -81,7 +81,7 @@ namespace quda blas::zero(x); else blas::copy(x, p[0]); - if (!running) profile.TPSTOP(QUDA_PROFILE_CHRONO); + if (!running) getProfile().TPSTOP(QUDA_PROFILE_CHRONO); return; } @@ -133,7 +133,7 @@ namespace quda printfQuda("MinResExt: N = %d, |res| / |src| = %e\n", N, sqrt(blas::norm2(r) / blas::norm2(b))); } - if (!running) profile.TPSTOP(QUDA_PROFILE_CHRONO); + if 
(!running) getProfile().TPSTOP(QUDA_PROFILE_CHRONO); } } // namespace quda From 2f4c41d6da30aa273b66fedbe8bea54515f82a68 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 30 May 2023 22:46:48 -0700 Subject: [PATCH 20/99] Include some missing headers that broke jitify --- include/multi_blas_helper.cuh | 1 + include/reference_wrapper_helper.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/include/multi_blas_helper.cuh b/include/multi_blas_helper.cuh index 6a470fe576..78aaa1ac4b 100644 --- a/include/multi_blas_helper.cuh +++ b/include/multi_blas_helper.cuh @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include diff --git a/include/reference_wrapper_helper.h b/include/reference_wrapper_helper.h index 2b85c497fd..3f73709ca6 100644 --- a/include/reference_wrapper_helper.h +++ b/include/reference_wrapper_helper.h @@ -1,6 +1,8 @@ #pragma once +#include #include +#include #include #include #include From 0178ab5f92c98a15ec25d660ffb057dee2c409ae Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 30 May 2023 22:49:39 -0700 Subject: [PATCH 21/99] Move contents of TimeProfile to timer.cpp to avoid breaking jitify. --- include/timer.h | 94 ++++---------------------------------------- include/tune_quda.h | 59 ++++------------------------ lib/timer.cpp | 96 ++++++++++++++++++++++++++++++++++++++++++--- lib/tune.cpp | 54 +++++++++++++++++++++++++ 4 files changed, 161 insertions(+), 142 deletions(-) diff --git a/include/timer.h b/include/timer.h index 2475fee154..0d529867cb 100644 --- a/include/timer.h +++ b/include/timer.h @@ -2,10 +2,6 @@ #include -#ifdef INTERFACE_NVTX -#include "nvtx3/nvToolsExt.h" -#endif - #include #include #include @@ -191,70 +187,25 @@ namespace quda { QUDA_PROFILE_COUNT /**< The total number of timers we have. Must be last enum type. 
*/ }; -#ifdef INTERFACE_NVTX - -#define PUSH_RANGE(name,cid) { \ - int color_id = cid; \ - color_id = color_id%nvtx_num_colors;\ - nvtxEventAttributes_t eventAttrib = {}; \ - eventAttrib.version = NVTX_VERSION; \ - eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ - eventAttrib.colorType = NVTX_COLOR_ARGB; \ - eventAttrib.color = nvtx_colors[color_id]; \ - eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ - eventAttrib.message.ascii = name; \ - eventAttrib.category = cid;\ - nvtxRangePushEx(&eventAttrib); \ -} -#define POP_RANGE nvtxRangePop(); -#else -#define PUSH_RANGE(name,cid) -#define POP_RANGE -#endif - class TimeProfile { std::string fname; /**< Which function are we profiling */ #ifdef INTERFACE_NVTX static const uint32_t nvtx_colors[];// = { 0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff }; static const int nvtx_num_colors;// = sizeof(nvtx_colors)/sizeof(uint32_t); #endif - host_timer_t profile[QUDA_PROFILE_COUNT]; + array profile; static std::string pname[]; bool switchOff; bool use_global; - // global timer - static host_timer_t global_profile[QUDA_PROFILE_COUNT]; - static bool global_switchOff[QUDA_PROFILE_COUNT]; - static int global_total_level[QUDA_PROFILE_COUNT]; // zero initialize - - static void StopGlobal(const char *func, const char *file, int line, QudaProfileType idx) { - - global_total_level[idx]--; - if (global_total_level[idx] == 0) global_profile[idx].stop(func, file, line); - - // switch off total timer if we need to - if (global_switchOff[idx]) { - global_total_level[idx]--; - if (global_total_level[idx] == 0) global_profile[idx].stop(func, file, line); - global_switchOff[idx] = false; - } - } - - static void StartGlobal(const char *func, const char *file, int line, QudaProfileType idx) { - // if total timer isn't running, then start it running - if (!global_profile[idx].running) { - global_profile[idx].start(func, file, line); - global_total_level[idx]++; - global_switchOff[idx] = true; - } - - if 
(global_total_level[idx] == 0) global_profile[idx].start(func, file, line); - global_total_level[idx]++; - } + static void StopGlobal(const char *func, const char *file, int line, QudaProfileType idx); + static void StartGlobal(const char *func, const char *file, int line, QudaProfileType idx); public: + TimeProfile() = default; + TimeProfile(const TimeProfile &) = default; + TimeProfile(std::string fname) : fname(fname), switchOff(false), use_global(true) { ; } TimeProfile(std::string fname, bool use_global) : fname(fname), switchOff(false), use_global(use_global) { ; } @@ -262,32 +213,8 @@ namespace quda { /**< Print out the profile information */ void Print(); - void Start_(const char *func, const char *file, int line, QudaProfileType idx) - { - // if total timer isn't running, then start it running - if (!profile[QUDA_PROFILE_TOTAL].running && idx != QUDA_PROFILE_TOTAL) { - profile[QUDA_PROFILE_TOTAL].start(func, file, line); - switchOff = true; - } - - profile[idx].start(func, file, line); - PUSH_RANGE(fname.c_str(),idx) - if (use_global) StartGlobal(func,file,line,idx); - } - - void Stop_(const char *func, const char *file, int line, QudaProfileType idx) { - if (idx == QUDA_PROFILE_COMPUTE || idx == QUDA_PROFILE_H2D || idx == QUDA_PROFILE_D2H) - qudaDeviceSynchronize(); // ensure accurate profiling - profile[idx].stop(func, file, line); - POP_RANGE - - // switch off total timer if we need to - if (switchOff && idx != QUDA_PROFILE_TOTAL) { - profile[QUDA_PROFILE_TOTAL].stop(func, file, line); - switchOff = false; - } - if (use_global) StopGlobal(func,file,line,idx); - } + void Start_(const char *func, const char *file, int line, QudaProfileType idx); + void Stop_(const char *func, const char *file, int line, QudaProfileType idx); void Reset_(const char *func, const char *file, int line) { for (int idx = 0; idx < QUDA_PROFILE_COUNT; idx++) profile[idx].reset(func, file, line); @@ -301,8 +228,6 @@ namespace quda { }; - static TimeProfile dummy("dummy"); - void 
pushProfile(TimeProfile &profile); void popProfile(); @@ -311,9 +236,6 @@ namespace quda { } // namespace quda -#undef PUSH_RANGE -#undef POP_RANGE - #define TPSTART(idx) Start_(__func__, __FILE__, __LINE__, idx) #define TPSTOP(idx) Stop_(__func__, __FILE__, __LINE__, idx) #define TPRESET() Reset_(__func__, __FILE__, __LINE__) diff --git a/include/tune_quda.h b/include/tune_quda.h index 1511f6f881..2aacde55f7 100644 --- a/include/tune_quda.h +++ b/include/tune_quda.h @@ -17,7 +17,7 @@ namespace quda { - class TuneParam { + struct TuneParam { public: dim3 block; @@ -35,16 +35,10 @@ namespace quda { TuneParam(TuneParam &&) = default; TuneParam &operator=(const TuneParam &) = default; TuneParam &operator=(TuneParam &&) = default; - - friend std::ostream& operator<<(std::ostream& output, const TuneParam& param) { - output << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), "; - output << "grid=(" << param.grid.x << "," << param.grid.y << "," << param.grid.z << "), "; - output << "shared_bytes=" << param.shared_bytes; - output << ", aux=(" << param.aux.x << "," << param.aux.y << "," << param.aux.z << "," << param.aux.w << ")"; - return output; - } }; + std::ostream &operator<<(std::ostream &, const TuneParam &); + /** * @brief Returns a reference to the tunecache map * @return tunecache reference @@ -68,20 +62,7 @@ namespace quda { virtual bool tuneGridDim() const { return true; } virtual bool tuneAuxDim() const { return false; } - virtual bool tuneSharedBytes() const - { - static bool tune_shared = true; - static bool init = false; - - if (!init) { - char *enable_shared_env = getenv("QUDA_ENABLE_TUNING_SHARED"); - if (enable_shared_env) { - if (strcmp(enable_shared_env, "0") == 0) { tune_shared = false; } - } - init = true; - } - return tune_shared; - } + virtual bool tuneSharedBytes() const; virtual bool advanceGridDim(TuneParam ¶m) const { @@ -239,16 +220,7 @@ namespace quda { @brief Whether the present instance has already been tuned 
or not @return True if tuned, false if not */ - bool tuned() - { - // not tuning is equivalent to already tuned - if (!getTuning()) return true; - - TuneKey key = tuneKey(); - if (use_managed_memory()) strcat(key.aux, ",managed"); - // if key is present in cache then already tuned - return getTuneCache().find(key) != getTuneCache().end(); - } + bool tuned() const; public: Tunable() : launch_error(QUDA_SUCCESS) { aux[0] = '\0'; } @@ -287,24 +259,9 @@ namespace quda { */ virtual float min_tune_time() const { return 1e-3; } - virtual std::string paramString(const TuneParam ¶m) const - { - std::stringstream ps; - ps << param; - return ps.str(); - } - - virtual std::string perfString(float time) const - { - float gflops = flops() / (1e9 * time); - float gbytes = bytes() / (1e9 * time); - std::stringstream ss; - ss << std::setiosflags(std::ios::fixed) << std::setprecision(2) << gflops << " Gflop/s, "; - ss << gbytes << " GB/s"; - return ss.str(); - } - - virtual std::string miscString(const TuneParam &) const { return std::string(); } + virtual std::string paramString(const TuneParam ¶m) const; + virtual std::string perfString(float time) const; + virtual std::string miscString(const TuneParam &) const; virtual void initTuneParam(TuneParam ¶m) const { diff --git a/lib/timer.cpp b/lib/timer.cpp index c4e924ee6e..2214ebd0ec 100644 --- a/lib/timer.cpp +++ b/lib/timer.cpp @@ -2,10 +2,15 @@ #include #include +#ifdef INTERFACE_NVTX +#include "nvtx3/nvToolsExt.h" +#endif + namespace quda { /**< Print out the profile information */ - void TimeProfile::Print() { + void TimeProfile::Print() + { if (profile[QUDA_PROFILE_TOTAL].time > 0.0) { printfQuda("\n %20s Total time = %9.3f secs\n", fname.c_str(), profile[QUDA_PROFILE_TOTAL].time); } @@ -31,7 +36,6 @@ namespace quda { warningQuda("Accounted time %9.3f secs in %s is greater than total time %9.3f secs", accounted, (const char *)&fname[0], profile[QUDA_PROFILE_TOTAL].time); } - } std::string TimeProfile::pname[] = {"download", @@ 
-79,9 +83,89 @@ namespace quda { const int TimeProfile::nvtx_num_colors = sizeof(nvtx_colors)/sizeof(uint32_t); #endif - Timer<> TimeProfile::global_profile[QUDA_PROFILE_COUNT]; - bool TimeProfile::global_switchOff[QUDA_PROFILE_COUNT] = {}; - int TimeProfile::global_total_level[QUDA_PROFILE_COUNT] = {}; + // global timer + host_timer_t global_profile[QUDA_PROFILE_COUNT] = {}; + static bool global_switchOff[QUDA_PROFILE_COUNT] = {}; + static int global_total_level[QUDA_PROFILE_COUNT] = {}; + + void TimeProfile::StopGlobal(const char *func, const char *file, int line, QudaProfileType idx) + { + global_total_level[idx]--; + if (global_total_level[idx] == 0) global_profile[idx].stop(func, file, line); + + // switch off total timer if we need to + if (global_switchOff[idx]) { + global_total_level[idx]--; + if (global_total_level[idx] == 0) global_profile[idx].stop(func, file, line); + global_switchOff[idx] = false; + } + } + + void TimeProfile::StartGlobal(const char *func, const char *file, int line, QudaProfileType idx) + { + // if total timer isn't running, then start it running + if (!global_profile[idx].running) { + global_profile[idx].start(func, file, line); + global_total_level[idx]++; + global_switchOff[idx] = true; + } + + if (global_total_level[idx] == 0) global_profile[idx].start(func, file, line); + global_total_level[idx]++; + } + +#ifdef INTERFACE_NVTX + +#define PUSH_RANGE(name, cid) \ + { \ + int color_id = cid; \ + color_id = color_id % nvtx_num_colors; \ + nvtxEventAttributes_t eventAttrib = {}; \ + eventAttrib.version = NVTX_VERSION; \ + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ + eventAttrib.colorType = NVTX_COLOR_ARGB; \ + eventAttrib.color = nvtx_colors[color_id]; \ + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ + eventAttrib.message.ascii = name; \ + eventAttrib.category = cid; \ + nvtxRangePushEx(&eventAttrib); \ + } +#define POP_RANGE nvtxRangePop(); +#else +#define PUSH_RANGE(name, cid) +#define POP_RANGE +#endif + + void 
TimeProfile::Start_(const char *func, const char *file, int line, QudaProfileType idx) + { + // if total timer isn't running, then start it running + if (!profile[QUDA_PROFILE_TOTAL].running && idx != QUDA_PROFILE_TOTAL) { + profile[QUDA_PROFILE_TOTAL].start(func, file, line); + switchOff = true; + } + + profile[idx].start(func, file, line); + PUSH_RANGE(fname.c_str(), idx) + if (use_global) StartGlobal(func, file, line, idx); + } + + void TimeProfile::Stop_(const char *func, const char *file, int line, QudaProfileType idx) + { + if (idx == QUDA_PROFILE_COMPUTE || idx == QUDA_PROFILE_H2D || idx == QUDA_PROFILE_D2H) + qudaDeviceSynchronize(); // ensure accurate profiling + profile[idx].stop(func, file, line); + POP_RANGE + + // switch off total timer if we need to + if (switchOff && idx != QUDA_PROFILE_TOTAL) { + profile[QUDA_PROFILE_TOTAL].stop(func, file, line); + switchOff = false; + } + if (use_global) StopGlobal(func, file, line, idx); + } + +#undef PUSH_RANGE +#undef POP_RANGE void TimeProfile::PrintGlobal() { if (global_profile[QUDA_PROFILE_TOTAL].time > 0.0) { @@ -114,6 +198,8 @@ namespace quda { } } + TimeProfile dummy("dummy"); + static std::stack tpstack; void pushProfile(TimeProfile &profile) diff --git a/lib/tune.cpp b/lib/tune.cpp index 57134ec3d4..1d6971db3c 100644 --- a/lib/tune.cpp +++ b/lib/tune.cpp @@ -656,9 +656,63 @@ namespace quda aux = make_int4(1, 1, 1, 1); } + std::ostream &operator<<(std::ostream &output, const TuneParam ¶m) + { + output << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), "; + output << "grid=(" << param.grid.x << "," << param.grid.y << "," << param.grid.z << "), "; + output << "shared_bytes=" << param.shared_bytes; + output << ", aux=(" << param.aux.x << "," << param.aux.y << "," << param.aux.z << "," << param.aux.w << ")"; + return output; + } + + bool Tunable::tuneSharedBytes() const + { + static bool tune_shared = true; + static bool init = false; + + if (!init) { + char 
*enable_shared_env = getenv("QUDA_ENABLE_TUNING_SHARED"); + if (enable_shared_env) { + if (strcmp(enable_shared_env, "0") == 0) { tune_shared = false; } + } + init = true; + } + return tune_shared; + } + int Tunable::blockStep() const { return device::warp_size(); } int Tunable::blockMin() const { return device::warp_size(); } + bool Tunable::tuned() const + { + // not tuning is equivalent to already tuned + if (!getTuning()) return true; + + TuneKey key = tuneKey(); + if (use_managed_memory()) strcat(key.aux, ",managed"); + // if key is present in cache then already tuned + return getTuneCache().find(key) != getTuneCache().end(); + } + + std::string Tunable::paramString(const TuneParam ¶m) const + { + std::stringstream ps; + ps << param; + return ps.str(); + } + + std::string Tunable::perfString(float time) const + { + float gflops = flops() / (1e9 * time); + float gbytes = bytes() / (1e9 * time); + std::stringstream ss; + ss << std::setiosflags(std::ios::fixed) << std::setprecision(2) << gflops << " Gflop/s, "; + ss << gbytes << " GB/s"; + return ss.str(); + } + + std::string Tunable::miscString(const TuneParam &) const { return std::string(); } + int32_t Tunable::getTuneRank() const { static bool init = false; From bf14e687ec317693b322b4cea491235d7b8fe8e2 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 30 May 2023 22:51:17 -0700 Subject: [PATCH 22/99] Fixed for covdev_test --- tests/covdev_test.cpp | 4 +-- tests/host_reference/covdev_reference.cpp | 40 +++++++++++------------ tests/host_reference/covdev_reference.h | 5 ++- 3 files changed, 23 insertions(+), 26 deletions(-) diff --git a/tests/covdev_test.cpp b/tests/covdev_test.cpp index 0a5d5d38c7..d296a553af 100644 --- a/tests/covdev_test.cpp +++ b/tests/covdev_test.cpp @@ -161,9 +161,9 @@ void covdevRef(int mu) // compare to dslash reference implementation printfQuda("Calculating reference implementation..."); #ifdef MULTI_GPU - mat_mg4dir(*spinorRef, *cpuLink, *spinor, dagger, mu, inv_param.cpu_prec, 
gauge_param.cpu_prec); + mat_mg4dir(*spinorRef, *cpuLink, *spinor, dagger, mu); #else - mat(spinorRef->V(), *cpuLink, spinor->V(), dagger, mu, inv_param.cpu_prec, gauge_param.cpu_prec); + mat(*spinorRef, *cpuLink, *spinor, dagger, mu); #endif printfQuda("done.\n"); } diff --git a/tests/host_reference/covdev_reference.cpp b/tests/host_reference/covdev_reference.cpp index a8c178af00..97dae09402 100644 --- a/tests/host_reference/covdev_reference.cpp +++ b/tests/host_reference/covdev_reference.cpp @@ -82,32 +82,31 @@ void covdev_dslash(void *res, void **link, void *spinorField, int oddBit, int da } } -template void Mat(sFloat *out, gFloat **link, sFloat *in, int daggerBit, int mu) +template +void Mat(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int daggerBit, int mu) { - sFloat *inEven = in; - sFloat *inOdd = in + Vh * spinor_site_size; - sFloat *outEven = out; - sFloat *outOdd = out + Vh * spinor_site_size; - // full dslash operator - covdevReference(outOdd, link, inEven, 1, daggerBit, mu); - covdevReference(outEven, link, inOdd, 0, daggerBit, mu); + void *data[4] = {link.data(0), link.data(1), link.data(2), link.data(3)}; + covdevReference(reinterpret_cast(out.Odd().V()), reinterpret_cast(data), + reinterpret_cast(in.Even().V()), 1, daggerBit, mu); + covdevReference(reinterpret_cast(out.Even().V()), reinterpret_cast(data), + reinterpret_cast(in.Odd().V()), 0, daggerBit, mu); } -void mat(void *out, void **link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision) +void mat(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int dagger_bit, int mu) { - if (sPrecision == QUDA_DOUBLE_PRECISION) { - if (gPrecision == QUDA_DOUBLE_PRECISION) { - Mat((double *)out, (double **)link, (double *)in, dagger_bit, mu); + if (checkPrecision(in, out) == QUDA_DOUBLE_PRECISION) { + if (link.Precision() == QUDA_DOUBLE_PRECISION) { + Mat(out, link, in, dagger_bit, mu); } else { - Mat((double *)out, 
(float **)link, (double *)in, dagger_bit, mu); + Mat(out, link, in, dagger_bit, mu); } } else { - if (gPrecision == QUDA_DOUBLE_PRECISION) { - Mat((float *)out, (double **)link, (float *)in, dagger_bit, mu); + if (link.Precision() == QUDA_DOUBLE_PRECISION) { + Mat(out, link, in, dagger_bit, mu); } else { - Mat((float *)out, (float **)link, (float *)in, dagger_bit, mu); + Mat(out, link, in, dagger_bit, mu); } } } @@ -252,17 +251,16 @@ void Mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinor } } -void mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int dagger_bit, - int mu, QudaPrecision sPrecision, QudaPrecision gPrecision) +void mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int dagger_bit, int mu) { - if (sPrecision == QUDA_DOUBLE_PRECISION) { - if (gPrecision == QUDA_DOUBLE_PRECISION) { + if (checkPrecision(in, out) == QUDA_DOUBLE_PRECISION) { + if (link.Precision() == QUDA_DOUBLE_PRECISION) { Mat_mg4dir(out, link, in, dagger_bit, mu); } else { Mat_mg4dir(out, link, in, dagger_bit, mu); } } else { - if (gPrecision == QUDA_DOUBLE_PRECISION) { + if (link.Precision() == QUDA_DOUBLE_PRECISION) { Mat_mg4dir(out, link, in, dagger_bit, mu); } else { Mat_mg4dir(out, link, in, dagger_bit, mu); diff --git a/tests/host_reference/covdev_reference.h b/tests/host_reference/covdev_reference.h index c2045773ed..679736109a 100644 --- a/tests/host_reference/covdev_reference.h +++ b/tests/host_reference/covdev_reference.h @@ -11,13 +11,12 @@ void covdev_dslash(void *res, const GaugeField &link, void *spinorField, int odd void covdev_dslash_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int oddBit, int daggerBit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision); -void mat(void *out, const GaugeField &link, void *in, int daggerBit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision); +void mat(ColorSpinorField &out, const 
GaugeField &link, const ColorSpinorField &in, int daggerBit, int mu); void matdagmat(void *out, const GaugeField &link, void *in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision, void *tmp, QudaParity parity); -void mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int daggerBit, int mu, - QudaPrecision sPrecision, QudaPrecision gPrecision); +void mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int daggerBit, int mu); void matdagmat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int dagger_bit, int mu, QudaPrecision sPrecision, QudaPrecision gPrecision, ColorSpinorField &tmp, QudaParity parity); From 7bf774c1dc311b0b60bb8215755dbe4c967225a5 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 30 May 2023 22:59:05 -0700 Subject: [PATCH 23/99] Update jitify to latest with some custom additions yet to be back ported --- include/targets/cuda/externals/jitify.hpp | 258 ++++++++++++++++++---- 1 file changed, 212 insertions(+), 46 deletions(-) diff --git a/include/targets/cuda/externals/jitify.hpp b/include/targets/cuda/externals/jitify.hpp index 46a51a97cd..110be5d22e 100644 --- a/include/targets/cuda/externals/jitify.hpp +++ b/include/targets/cuda/externals/jitify.hpp @@ -365,7 +365,7 @@ inline std::string path_base(std::string p) { // "foo/bar" -> "foo" // "foo/bar/" -> "foo/bar" #if defined _WIN32 || defined _WIN64 - char sep = '\\'; + const char* sep = "\\/"; #else char sep = '/'; #endif @@ -496,10 +496,13 @@ inline std::string comment_out_code_line(int line_num, std::string source) { inline void print_with_line_numbers(std::string const& source) { int linenum = 1; std::stringstream source_ss(source); + std::stringstream output_ss; + output_ss.imbue(std::locale::classic()); for (std::string line; std::getline(source_ss, line); ++linenum) { - std::cout << std::setfill(' ') << std::setw(3) << linenum << " " << line + output_ss << 
std::setfill(' ') << std::setw(3) << linenum << " " << line << std::endl; } + std::cout << output_ss.str(); } inline void print_compile_log(std::string program_name, @@ -554,7 +557,7 @@ inline bool load_source( std::string filename, std::map& sources, std::string current_dir = "", std::vector include_paths = std::vector(), - file_callback_type file_callback = 0, + file_callback_type file_callback = 0, std::string* program_name = nullptr, std::map* fullpaths = nullptr, bool search_current_dir = true) { std::istream* source_stream = 0; @@ -568,6 +571,9 @@ inline bool load_source( string_stream << source; source_stream = &string_stream; } + if (program_name) { + *program_name = filename; + } if (sources.count(filename)) { // Already got this one return true; @@ -672,6 +678,8 @@ inline bool load_source( // TODO: Handle block comments (currently they cause a compilation error). size_t comment_start = line_after_pragma.find("//"); std::string pragma_args = line_after_pragma.substr(0, comment_start); + // handle quote character used in #pragma expression + pragma_args = replace_token(pragma_args, "\"", "\\\""); std::string comment = comment_start != std::string::npos ? line_after_pragma.substr(comment_start) : ""; @@ -682,7 +690,7 @@ inline bool load_source( source += line + "\n"; } // HACK TESTING (WAR for cub) - // source = "#define cudaDeviceSynchronize() cudaSuccess\n" + source; + source = "#define cudaDeviceSynchronize() cudaSuccess\n" + source; ////source = "cudaError_t cudaDeviceSynchronize() { return cudaSuccess; }\n" + /// source; @@ -690,6 +698,7 @@ inline bool load_source( // of the same header from different paths. 
if (pragma_once) { std::stringstream ss; + ss.imbue(std::locale::classic()); ss << std::uppercase << std::hex << std::setw(8) << std::setfill('0') << hash; std::string include_guard_name = "_JITIFY_INCLUDE_GUARD_" + ss.str() + "\n"; @@ -1385,7 +1394,16 @@ static const char* jitsafe_header_preinclude_h = R"( // WAR to allow exceptions to be parsed #define try #define catch(...) -)"; +)" +#if defined(_WIN32) || defined(_WIN64) +// WAR for NVRTC <= 11.0 not defining _WIN64. +R"( +#ifndef _WIN64 +#define _WIN64 1 +#endif +)" +#endif +; static const char* jitsafe_header_float_h = R"( #pragma once @@ -1403,12 +1421,12 @@ static const char* jitsafe_header_float_h = R"( #define DBL_MAX_EXP 1024 #define FLT_MAX_10_EXP 38 #define DBL_MAX_10_EXP 308 -#define FLT_MAX 3.4028234e38f -#define DBL_MAX 1.7976931348623157e308 -#define FLT_EPSILON 1.19209289e-7f -#define DBL_EPSILON 2.220440492503130e-16 -#define FLT_MIN 1.1754943e-38f; -#define DBL_MIN 2.2250738585072013e-308 +#define FLT_MAX 3.4028234e38f +#define DBL_MAX 1.7976931348623157e308 +#define FLT_EPSILON 1.19209289e-7f +#define DBL_EPSILON 2.220440492503130e-16 +#define FLT_MIN 1.1754943e-38f +#define DBL_MIN 2.2250738585072013e-308 #define FLT_ROUNDS 1 #if defined __cplusplus && __cplusplus >= 201103L #define FLT_EVAL_METHOD 0 @@ -1596,14 +1614,28 @@ struct IntegerLimits { #endif // __cplusplus >= 201103L enum { is_specialized = true, - digits = (Digits == -1) ? (int)(sizeof(T)*8 - (Min != 0)) : Digits, - digits10 = (digits * 30103) / 100000, - is_signed = ((T)(-1)<0), - is_integer = true, - is_exact = true, - radix = 2, - is_bounded = true, - is_modulo = false + digits = (Digits == -1) ? 
(int)(sizeof(T)*8 - (Min != 0)) : Digits, + digits10 = (digits * 30103) / 100000, + is_signed = ((T)(-1)<0), + is_integer = true, + is_exact = true, + has_infinity = false, + has_quiet_NaN = false, + has_signaling_NaN = false, + has_denorm = 0, + has_denorm_loss = false, + round_style = 0, + is_iec559 = false, + is_bounded = true, + is_modulo = !(is_signed || Max == 1 /*is bool*/), + max_digits10 = 0, + radix = 2, + min_exponent = 0, + min_exponent10 = 0, + max_exponent = 0, + max_exponent10 = 0, + tinyness_before = false, + traps = false }; }; } // namespace __jitify_detail @@ -1910,6 +1942,46 @@ static const char* jitsafe_header_type_traits = R"( template struct aligned_storage { struct type { alignas(alignment) char data[len]; }; }; template struct alignment_of : std::integral_constant {}; + template struct make_unsigned; + template <> struct make_unsigned { typedef unsigned char type; }; + template <> struct make_unsigned { typedef unsigned short type; }; + template <> struct make_unsigned { typedef unsigned int type; }; + template <> struct make_unsigned { typedef unsigned long type; }; + template <> struct make_unsigned { typedef unsigned long long type; }; + template <> struct make_unsigned { typedef unsigned char type; }; + template <> struct make_unsigned { typedef unsigned short type; }; + template <> struct make_unsigned { typedef unsigned int type; }; + template <> struct make_unsigned { typedef unsigned long type; }; + template <> struct make_unsigned { typedef unsigned long long type; }; + template <> struct make_unsigned { typedef unsigned char type; }; + #if defined _WIN32 || defined _WIN64 + template <> struct make_unsigned { typedef unsigned short type; }; + #else + template <> struct make_unsigned { typedef unsigned int type; }; + #endif + + template struct make_signed; + template <> struct make_signed { typedef signed char type; }; + template <> struct make_signed { typedef signed short type; }; + template <> struct make_signed { typedef signed 
int type; }; + template <> struct make_signed { typedef signed long type; }; + template <> struct make_signed { typedef signed long long type; }; + template <> struct make_signed { typedef signed char type; }; + template <> struct make_signed { typedef signed short type; }; + template <> struct make_signed { typedef signed int type; }; + template <> struct make_signed { typedef signed long type; }; + template <> struct make_signed { typedef signed long long type; }; + template <> struct make_signed { typedef signed char type; }; + #if defined _WIN32 || defined _WIN64 + template <> struct make_signed { typedef signed short type; }; + #else + template <> struct make_signed { typedef signed int type; }; + #endif + + #if __cplusplus >= 201703L + template< typename... Ts > struct make_void { typedef void type; }; + template< typename... Ts > using void_t = typename make_void::type; + #endif // __cplusplus >= 201703L } // namespace std #endif // c++11 )"; @@ -1949,8 +2021,8 @@ static const char* jitsafe_header_stdint_h = "#define INT8_MIN SCHAR_MIN\n" "#define INT16_MIN SHRT_MIN\n" "#if defined _WIN32 || defined _WIN64\n" - "#define WCHAR_MIN SHRT_MIN\n" - "#define WCHAR_MAX SHRT_MAX\n" + "#define WCHAR_MIN 0\n" + "#define WCHAR_MAX USHRT_MAX\n" "typedef unsigned long long uintptr_t; //optional\n" "#else\n" "#define WCHAR_MIN INT_MIN\n" @@ -2083,24 +2155,33 @@ static const char* jitsafe_header_sstream = "#include \n" "#include \n"; -static const char* jitsafe_header_utility = - "#pragma once\n" - "namespace std {\n" - "template\n" - "struct pair {\n" - " T1 first;\n" - " T2 second;\n" - " inline pair() {}\n" - " inline pair(T1 const& first_, T2 const& second_)\n" - " : first(first_), second(second_) {}\n" - " // TODO: Standard includes many more constructors...\n" - " // TODO: Comparison operators\n" - "};\n" - "template\n" - "pair make_pair(T1 const& first, T2 const& second) {\n" - " return pair(first, second);\n" - "}\n" - "} // namespace std\n"; +static const char* 
jitsafe_header_utility = R"( + #pragma once + namespace std { + template + struct pair { + T1 first; + T2 second; + inline pair() {} + inline pair(T1 const& first_, T2 const& second_): first(first_), second(second_) {} + // TODO: Standard includes many more constructors... + // TODO: Comparison operators + }; + template + pair make_pair(T1 const& first, T2 const& second) { + return pair(first, second); + } + + template + constexpr bool always_false = false; + + template + typename std::add_rvalue_reference::type declval() noexcept + { + static_assert(always_false, "declval not allowed in an evaluated context"); + } + } // namespace std +)"; // TODO: incomplete static const char* jitsafe_header_vector = @@ -2340,14 +2421,81 @@ static const char* jitsafe_header_tuple = R"( #if __cplusplus >= 201103L namespace std { template class tuple; + + template< size_t I, class T > + struct tuple_element; + // recursive case + template< size_t I, class Head, class... Tail > + struct tuple_element> + : tuple_element> { }; + // base case + template< class Head, class... 
Tail > + struct tuple_element<0, tuple> { + using type = Head; + }; } // namespace std #endif )"; +static const char* jitsafe_header_functional = R"( + #pragma once + #if __cplusplus >= 201103L + namespace std { + template + class reference_wrapper + { + public: + // types + using type = T; + reference_wrapper(const reference_wrapper&) noexcept = default; + // assignment + reference_wrapper& operator=(const reference_wrapper& x) noexcept = default; + // access + constexpr operator T& () const noexcept { return *_ptr; } + constexpr T& get() const noexcept { return *_ptr; } + private: + T* _ptr; + }; + } // namespace std + #endif +)"; + +static const char* jitsafe_header_map = R"( + #pragma once + namespace std { + template class map {}; + } // namespace std +)"; + +static const char* jitsafe_header_stack = R"( + #pragma once + namespace std { + template class stack {}; + } // namespace std +)"; + +static const char* jitsafe_header_initializer_list = R"( + #pragma once +)"; + static const char* jitsafe_header_assert = R"( #pragma once )"; +static const char* jitsafe_header_sys_time = R"( + #pragma once + struct timeval { + unsigned long long tv_sec; + unsigned long long tv_usec; + }; + struct timeval it_interval; + struct timeval it_value; + int getitimer(int, struct itimerval *); + int gettimeofday(struct timeval *, void *); + int setitimer(int, const struct itimerval *, struct itimerval *); + int utimes(const char *, const struct timeval [2]); + )"; + // WAR: These need to be pre-included as a workaround for NVRTC implicitly using // /usr/include as an include path. The other built-in headers will be included // lazily as needed. 
@@ -2406,8 +2554,13 @@ static const std::map& get_jitsafe_headers_map() { {"time.h", jitsafe_header_time_h}, {"ctime", jitsafe_header_time_h}, {"tuple", jitsafe_header_tuple}, + {"functional", jitsafe_header_functional}, + {"map", jitsafe_header_map}, + {"stack", jitsafe_header_stack}, + {"initializer_list", jitsafe_header_initializer_list}, {"assert.h", jitsafe_header_assert}, - {"cassert", jitsafe_header_assert}}; + {"cassert", jitsafe_header_assert}, + {"sys/time.h", jitsafe_header_sys_time}}; return jitsafe_headers_map; } @@ -2673,6 +2826,17 @@ inline nvrtcResult compile_kernel(std::string program_name, &nvrtc_program, program_source.c_str(), program_name.c_str(), num_headers, header_sources_c.data(), header_names_c.data())); + // Ensure nvrtc_program gets destroyed. + struct ScopedNvrtcProgramDestroyer { + nvrtcProgram& nvrtc_program_; + ScopedNvrtcProgramDestroyer(nvrtcProgram& nvrtc_program) + : nvrtc_program_(nvrtc_program) {} + ~ScopedNvrtcProgramDestroyer() { nvrtcDestroyProgram(&nvrtc_program_); } + ScopedNvrtcProgramDestroyer(const ScopedNvrtcProgramDestroyer&) = delete; + ScopedNvrtcProgramDestroyer& operator=(const ScopedNvrtcProgramDestroyer&) = + delete; + } nvrtc_program_scope_guard{nvrtc_program}; + #if CUDA_VERSION >= 8000 if (!instantiation.empty()) { CHECK_NVRTC(nvrtcAddNameExpression(nvrtc_program, instantiation.c_str())); @@ -2720,7 +2884,6 @@ inline nvrtcResult compile_kernel(std::string program_name, #endif } - CHECK_NVRTC(nvrtcDestroyProgram(&nvrtc_program)); #undef CHECK_NVRTC return NVRTC_SUCCESS; } @@ -2746,10 +2909,9 @@ inline void load_program(std::string const& cuda_source, // Load program source if (!detail::load_source(cuda_source, *program_sources, "", *include_paths, - file_callback)) { + file_callback, program_name)) { throw std::runtime_error("Source not found: " + cuda_source); } - *program_name = program_sources->begin()->first; // Maps header include names to their full file paths. 
std::map header_fullpaths; @@ -2757,7 +2919,7 @@ inline void load_program(std::string const& cuda_source, // Load header sources for (std::string const& header : headers) { if (!detail::load_source(header, *program_sources, "", *include_paths, - file_callback, &header_fullpaths)) { + file_callback, nullptr, &header_fullpaths)) { // **TODO: Deal with source not found throw std::runtime_error("Source not found: " + header); } @@ -2816,8 +2978,8 @@ inline void load_program(std::string const& cuda_source, std::string include_parent_fullpath = header_fullpaths[include_parent]; std::string include_path = detail::path_base(include_parent_fullpath); if (detail::load_source(include_name, *program_sources, include_path, - *include_paths, file_callback, &header_fullpaths, - is_included_with_quotes)) { + *include_paths, file_callback, nullptr, + &header_fullpaths, is_included_with_quotes)) { #if JITIFY_PRINT_HEADER_PATHS std::cout << "Found #include " << include_name << " from " << include_parent << ":" << line_num << " [" @@ -3067,6 +3229,7 @@ class KernelLauncher { std::unique_ptr _impl; public: + KernelLauncher() = default; inline KernelLauncher(KernelInstantiation const& kernel_inst, dim3 grid, dim3 block, unsigned int smem = 0, cudaStream_t stream = 0); @@ -3135,6 +3298,7 @@ class KernelInstantiation { std::unique_ptr _impl; public: + KernelInstantiation() = default; inline KernelInstantiation(Kernel const& kernel, std::vector const& template_args); @@ -3282,6 +3446,7 @@ class Kernel { std::unique_ptr _impl; public: + Kernel() = default; Kernel(Program const& program, std::string name, jitify::detail::vector options = 0); @@ -3346,6 +3511,7 @@ class Program { std::unique_ptr _impl; public: + Program() = default; Program(JitCache& cache, std::string source, jitify::detail::vector headers = 0, jitify::detail::vector options = 0, From c42794fbe74b4e80ad08b5c2b3f1c755f7252a55 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 31 May 2023 22:25:07 -0700 Subject: [PATCH 
24/99] Rename ColorSpinorField/CloverField::V methods to data, with an optional template cast type --- include/blas_helper.cuh | 6 +- include/blas_quda.h | 2 +- include/clover_field.h | 5 +- include/clover_field_order.h | 16 +- include/color_spinor_field.h | 4 +- include/color_spinor_field_order.h | 17 +- include/dslash_helper.cuh | 12 +- include/kernels/covDev.cuh | 8 +- include/kernels/dslash_staggered.cuh | 8 +- include/kernels/dslash_wilson.cuh | 8 +- include/kernels/laplace.cuh | 9 +- .../staggered_kd_apply_xinv_kernel.cuh | 2 +- include/kernels/staggered_quark_smearing.cuh | 9 +- lib/block_orthogonalize.in.cu | 2 +- lib/clover_field.cpp | 8 +- lib/coarse_op.in.cu | 2 +- lib/color_spinor_field.cpp | 12 +- lib/color_spinor_util.in.cu | 2 +- lib/copy_clover_offset.cu | 4 +- lib/copy_color_spinor_mg.in.hpp | 4 +- lib/dirac.cpp | 2 +- lib/dslash_clover_helper.cu | 8 +- lib/dslash_coarse.hpp | 2 +- lib/dslash_gamma_helper.cu | 4 +- ..._clover_hasenbusch_twist_preconditioned.cu | 10 - lib/interface_quda.cpp | 4 +- lib/inv_gmresdr_quda.cpp | 2 +- lib/inv_mr_quda.cpp | 2 +- lib/max_clover.cu | 2 +- lib/multi_reduce_quda.cu | 2 +- lib/staggered_kd_apply_xinv.cu | 2 +- lib/vector_io.cpp | 4 +- tests/dslash_test_utils.h | 204 +++++++++--------- tests/eigensolve_test.cpp | 6 +- tests/host_reference/covdev_reference.cpp | 22 +- tests/host_reference/dslash_reference.cpp | 12 +- .../staggered_dslash_reference.cpp | 20 +- tests/invert_test.cpp | 12 +- tests/staggered_dslash_test_utils.h | 8 +- tests/staggered_gsmear_test_utils.h | 10 +- tests/staggered_invert_test.cpp | 10 +- tests/utils/staggered_host_utils.cpp | 4 +- 42 files changed, 228 insertions(+), 264 deletions(-) diff --git a/include/blas_helper.cuh b/include/blas_helper.cuh index e2617e879a..80e974c0c9 100644 --- a/include/blas_helper.cuh +++ b/include/blas_helper.cuh @@ -111,7 +111,7 @@ namespace quda {} data_t(const ColorSpinorField &x) : - spinor(static_cast(const_cast(x).V())), + spinor(x.data()), 
stride(x.VolumeCB()), cb_offset(x.Bytes() / (2 * sizeof(store_t) * N)) {} @@ -141,8 +141,8 @@ namespace quda {} data_t(const ColorSpinorField &x) : - spinor(static_cast(const_cast(x).V())), - norm(static_cast(const_cast(x).Norm())), + spinor(x.data()), + norm(static_cast(x.Norm())), stride(x.VolumeCB()), cb_offset(x.Bytes() / (2 * sizeof(store_t) * N)), cb_norm_offset(x.Bytes() / (2 * sizeof(norm_t))) diff --git a/include/blas_quda.h b/include/blas_quda.h index 3fc051d3ff..8df40df452 100644 --- a/include/blas_quda.h +++ b/include/blas_quda.h @@ -33,7 +33,7 @@ namespace quda { inline void copy(ColorSpinorField &dst, const ColorSpinorField &src) { - if (dst.V() == src.V()) { + if (dst.data() == src.data()) { // check the fields are equivalent else error if (ColorSpinorField::are_compatible(dst, src)) return; diff --git a/include/clover_field.h b/include/clover_field.h index 579e7eeb1e..380a399492 100644 --- a/include/clover_field.h +++ b/include/clover_field.h @@ -217,7 +217,10 @@ namespace quda { static CloverField *Create(const CloverFieldParam ¶m); - void *V(bool inverse = false) const { return inverse ? cloverInv.data() : clover.data(); } + template auto data(bool inverse = false) const + { + return inverse ? 
reinterpret_cast(cloverInv.data()) : reinterpret_cast(clover.data()); + } /** @return whether the inverse is explicitly been allocated diff --git a/include/clover_field_order.h b/include/clover_field_order.h index 1464a02629..05b77eee63 100644 --- a/include/clover_field_order.h +++ b/include/clover_field_order.h @@ -312,7 +312,7 @@ namespace quda { static constexpr int N = nColor * nSpin / 2; reconstruct_t recon; FloatNAccessor(const CloverField &A, bool inverse = false) : - a(static_cast(const_cast(A.V(inverse)))), + a(A.data(inverse)), stride(A.VolumeCB()), offset_cb(A.Bytes() / (2 * sizeof(Float))), compressed_block_size(A.compressed_block_size()), @@ -403,9 +403,7 @@ namespace quda { const int N = nSpin * nColor / 2; const complex zero; Accessor(const CloverField &A, bool inverse = false) : - a(static_cast(const_cast(A.V(inverse)))), - offset_cb(A.Bytes() / (2 * sizeof(Float))), - zero(complex(0.0, 0.0)) + a(A.data(inverse)), offset_cb(A.Bytes() / (2 * sizeof(Float))), zero(complex(0.0, 0.0)) { } @@ -639,7 +637,7 @@ namespace quda { if (clover.max_element(is_inverse) == 0.0 && isFixed::value) errorQuda("%p max_element(%d) appears unset", &clover, is_inverse); if (clover.Diagonal() == 0.0 && clover.Reconstruct()) errorQuda("%p diagonal appears unset", &clover); - this->clover = clover_ ? clover_ : (Float *)(clover.V(is_inverse)); + this->clover = clover_ ? clover_ : clover.data(is_inverse); } QudaTwistFlavorType TwistFlavor() const { return twist_flavor; } @@ -844,7 +842,7 @@ namespace quda { if (clover.Order() != QUDA_PACKED_CLOVER_ORDER) { errorQuda("Invalid clover order %d for this accessor", clover.Order()); } - this->clover = clover_ ? clover_ : (Float *)(clover.V(inverse)); + this->clover = clover_ ? 
clover_ : clover.data(inverse); } QudaTwistFlavorType TwistFlavor() const { return twist_flavor; } @@ -892,8 +890,8 @@ namespace quda { if (clover.Order() != QUDA_QDPJIT_CLOVER_ORDER) { errorQuda("Invalid clover order %d for this accessor", clover.Order()); } - offdiag = clover_ ? ((Float **)clover_)[0] : ((Float **)clover.V(inverse))[0]; - diag = clover_ ? ((Float **)clover_)[1] : ((Float **)clover.V(inverse))[1]; + offdiag = clover_ ? ((Float **)clover_)[0] : clover.data(inverse)[0]; + diag = clover_ ? ((Float **)clover_)[1] : clover.data(inverse)[1]; } QudaTwistFlavorType TwistFlavor() const { return twist_flavor; } @@ -970,7 +968,7 @@ namespace quda { if (clover.Order() != QUDA_BQCD_CLOVER_ORDER) { errorQuda("Invalid clover order %d for this accessor", clover.Order()); } - this->clover[0] = clover_ ? clover_ : (Float *)(clover.V(inverse)); + this->clover[0] = clover_ ? clover_ : clover.data(inverse); this->clover[1] = (Float *)((char *)this->clover[0] + clover.Bytes() / 2); } diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h index 8186425d1c..1bb81a450d 100644 --- a/include/color_spinor_field.h +++ b/include/color_spinor_field.h @@ -462,10 +462,10 @@ namespace quda /** @brief Return pointer to the field allocation */ - void *V() const + template auto data() const { if (ghost_only) errorQuda("Not defined for ghost-only field"); - return v.data(); + return reinterpret_cast(v.data()); } /** diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index 48b8d20a62..dab488931f 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -861,14 +861,13 @@ namespace quda FieldOrderCB(const ColorSpinorField &field, int nFace = 1, void *const v_ = 0, void *const *ghost_ = 0) : GhostOrder(field, nFace, ghost_), volumeCB(field.VolumeCB()), accessor(field) { - v.v = v_ ? static_cast *>(const_cast(v_)) : - static_cast *>(const_cast(field.V())); + v.v = v_ ? 
static_cast *>(const_cast(v_)) : field.data *>(); resetScale(field.Scale()); if constexpr (fixed && block_float) { if constexpr (nColor == 3 && nSpin == 1 && nVec == 1 && order == 2) // special case where the norm is packed into the per site struct - v.norm = reinterpret_cast(const_cast(field.V())); + v.norm = field.data(); else v.norm = static_cast(const_cast(field.Norm())); v.norm_offset = field.Bytes() / (2 * sizeof(norm_t)); @@ -1072,7 +1071,7 @@ namespace quda size_t bytes; FloatNOrder(const ColorSpinorField &a, int nFace = 1, Float *buffer = 0, Float **ghost_ = 0) : - field(buffer ? buffer : (Float *)a.V()), + field(buffer ? buffer : a.data()), norm(buffer ? reinterpret_cast(reinterpret_cast(buffer) + a.NormOffset()) : const_cast(reinterpret_cast(a.Norm()))), offset(a.Bytes() / (2 * sizeof(Float) * N)), @@ -1300,7 +1299,7 @@ namespace quda size_t bytes; FloatNOrder(const ColorSpinorField &a, int nFace = 1, Float *buffer = 0, Float **ghost_ = 0) : - field(buffer ? buffer : (Float *)a.V()), + field(buffer ? buffer : a.data()), offset(a.Bytes() / (2 * sizeof(Vector))), volumeCB(a.VolumeCB()), nParity(a.SiteSubset()), @@ -1489,7 +1488,7 @@ namespace quda int faceVolumeCB[4]; int nParity; SpaceColorSpinorOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, float * = 0, Float **ghost_ = 0) : - field(field_ ? field_ : (Float *)a.V()), + field(field_ ? field_ : a.data()), offset(a.Bytes() / (2 * sizeof(Float))), volumeCB(a.VolumeCB()), nParity(a.SiteSubset()) @@ -1573,7 +1572,7 @@ namespace quda int faceVolumeCB[4]; int nParity; SpaceSpinorColorOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, float * = 0, Float **ghost_ = 0) : - field(field_ ? field_ : (Float *)a.V()), + field(field_ ? 
field_ : a.data()), offset(a.Bytes() / (2 * sizeof(Float))), volumeCB(a.VolumeCB()), nParity(a.SiteSubset()) @@ -1652,7 +1651,7 @@ namespace quda int exDim[4]; // full field dimensions PaddedSpaceSpinorColorOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, float * = 0, Float **ghost_ = 0) : - field(field_ ? field_ : (Float *)a.V()), + field(field_ ? field_ : a.data()), volumeCB(a.VolumeCB()), exVolumeCB(1), nParity(a.SiteSubset()), @@ -1747,7 +1746,7 @@ namespace quda int volumeCB; int nParity; QDPJITDiracOrder(const ColorSpinorField &a, int = 1, Float *field_ = 0, float * = 0) : - field(field_ ? field_ : (Float *)a.V()), volumeCB(a.VolumeCB()), nParity(a.SiteSubset()) + field(field_ ? field_ : a.data()), volumeCB(a.VolumeCB()), nParity(a.SiteSubset()) { } diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh index 836b474cf0..e67582b682 100644 --- a/include/dslash_helper.cuh +++ b/include/dslash_helper.cuh @@ -305,8 +305,8 @@ namespace quda #endif // constructor needed for staggered to set xpay from derived class - DslashArg(const ColorSpinorField &in, const GaugeField &U, int parity, bool dagger, bool xpay, int nFace, - int spin_project, const int *comm_override, + DslashArg(const ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, const ColorSpinorField &x, + int parity, bool dagger, bool xpay, int nFace, int spin_project, const int *comm_override, #ifdef NVSHMEM_COMMS int shmem_ = 0) : #else @@ -348,8 +348,14 @@ namespace quda retcount_intra(dslash::get_shmem_retcount_intra()), retcount_inter(dslash::get_shmem_retcount_inter()) #endif - { + if (in.data() == out.data()) errorQuda("Aliasing pointers"); + checkOrder(out, in, x); // check all orders match + checkPrecision(out, in, x, U); // check all precisions match + checkLocation(out, in, x, U); // check all locations match + if (!in.isNative() || !U.isNative()) + errorQuda("Unsupported field order colorspinor=%d gauge=%d combination\n", in.FieldOrder(), 
U.FieldOrder()); + for (int d = 0; d < 4; d++) { commDim[d] = (comm_override[d] == 0) ? 0 : comm_dim_partitioned(d); } diff --git a/include/kernels/covDev.cuh b/include/kernels/covDev.cuh index b86e989bf7..28c52e9b38 100644 --- a/include/kernels/covDev.cuh +++ b/include/kernels/covDev.cuh @@ -37,19 +37,13 @@ namespace quda CovDevArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int mu, int parity, bool dagger, const int *comm_override) : - DslashArg(in, U, parity, dagger, false, 1, spin_project, comm_override), + DslashArg(out, in, U, in, parity, dagger, false, 1, spin_project, comm_override), out(out), in(in), in_pack(in), U(U), mu(mu) { - if (in.V() == out.V()) errorQuda("Aliasing pointers"); - checkOrder(out, in); // check all orders match - checkPrecision(out, in, U); // check all precisions match - checkLocation(out, in, U); // check all locations match - if (!out.isNative() || !in.isNative() || !U.isNative()) - errorQuda("Unsupported field order colorspinor(in)=%d gauge=%d combination\n", in.FieldOrder(), U.FieldOrder()); } }; diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh index 8f772165bf..deb38455f8 100644 --- a/include/kernels/dslash_staggered.cuh +++ b/include/kernels/dslash_staggered.cuh @@ -51,7 +51,7 @@ namespace quda StaggeredArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, const GaugeField &L, double a, const ColorSpinorField &x, int parity, bool dagger, const int *comm_override) : - DslashArg(in, U, parity, dagger, a == 0.0 ? false : true, improved_ ? 3 : 1, spin_project, + DslashArg(out, in, U, x, parity, dagger, a == 0.0 ? false : true, improved_ ? 3 : 1, spin_project, comm_override), out(out), in(in, improved_ ? 3 : 1), @@ -65,12 +65,6 @@ namespace quda is_last_time_slice(comm_coord(3) == comm_dim(3) - 1 ? true : false), dagger_scale(dagger ? 
static_cast(-1.0) : static_cast(1.0)) { - if (in.V() == out.V()) errorQuda("Aliasing pointers"); - checkOrder(out, in, x); // check all orders match - checkPrecision(out, in, x, U); // check all precisions match - checkLocation(out, in, x, U); // check all locations match - if (!in.isNative() || !U.isNative()) - errorQuda("Unsupported field order colorspinor=%d gauge=%d combination\n", in.FieldOrder(), U.FieldOrder()); } }; diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh index cd7575974a..f87e8f9865 100644 --- a/include/kernels/dslash_wilson.cuh +++ b/include/kernels/dslash_wilson.cuh @@ -38,7 +38,7 @@ namespace quda WilsonArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, const ColorSpinorField &x, int parity, bool dagger, const int *comm_override) : - DslashArg(in, U, parity, dagger, a != 0.0 ? true : false, 1, spin_project, comm_override), + DslashArg(out, in, U, x, parity, dagger, a != 0.0 ? true : false, 1, spin_project, comm_override), out(out), in(in), in_pack(in), @@ -46,12 +46,6 @@ namespace quda U(U), a(a) { - if (in.V() == out.V()) errorQuda("Aliasing pointers"); - checkOrder(out, in, x); // check all orders match - checkPrecision(out, in, x, U); // check all precisions match - checkLocation(out, in, x, U); // check all locations match - if (!in.isNative() || !U.isNative()) - errorQuda("Unsupported field order colorspinor=%d gauge=%d combination\n", in.FieldOrder(), U.FieldOrder()); } }; diff --git a/include/kernels/laplace.cuh b/include/kernels/laplace.cuh index ac09ddc5ed..a029242210 100644 --- a/include/kernels/laplace.cuh +++ b/include/kernels/laplace.cuh @@ -40,8 +40,7 @@ namespace quda LaplaceArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int dir, double a, double b, const ColorSpinorField &x, int parity, bool dagger, const int *comm_override) : - - DslashArg(in, U, parity, dagger, a != 0.0 ? 
true : false, 1, false, comm_override), + DslashArg(out, in, U, x, parity, dagger, a != 0.0 ? true : false, 1, false, comm_override), out(out), in(in), in_pack(in), @@ -51,12 +50,6 @@ namespace quda b(b), dir(dir) { - if (in.V() == out.V()) errorQuda("Aliasing pointers"); - checkOrder(out, in, x); // check all orders match - checkPrecision(out, in, x, U); // check all precisions match - checkLocation(out, in, x, U); // check all locations match - if (!in.isNative() || !U.isNative()) - errorQuda("Unsupported field order colorspinor(in)=%d gauge=%d combination\n", in.FieldOrder(), U.FieldOrder()); if (dir < 3 || dir > 4) errorQuda("Unsupported laplace direction %d (must be 3 or 4)", dir); } }; diff --git a/include/kernels/staggered_kd_apply_xinv_kernel.cuh b/include/kernels/staggered_kd_apply_xinv_kernel.cuh index bbe8b70166..f5b137486f 100644 --- a/include/kernels/staggered_kd_apply_xinv_kernel.cuh +++ b/include/kernels/staggered_kd_apply_xinv_kernel.cuh @@ -39,7 +39,7 @@ namespace quda { X0h(out.X()[0]/2), volumeCB(in.VolumeCB()) { - if (in.V() == out.V()) errorQuda("Aliasing pointers"); + if (in.data() == out.data()) errorQuda("Aliasing pointers"); checkOrder(out, in); // check all orders match checkPrecision(out, in, xInv); // check all precisions match checkLocation(out, in, xInv); diff --git a/include/kernels/staggered_quark_smearing.cuh b/include/kernels/staggered_quark_smearing.cuh index 2fdb42f17a..9f4db096e8 100644 --- a/include/kernels/staggered_quark_smearing.cuh +++ b/include/kernels/staggered_quark_smearing.cuh @@ -45,8 +45,7 @@ namespace quda StaggeredQSmearArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int t0, bool is_t0_kernel, int parity, int dir, bool dagger, const int *comm_override) : - - DslashArg(in, U, parity, dagger, false, 3, false, comm_override), + DslashArg(out, in, U, in, parity, dagger, false, 3, false, comm_override), out(out, 3), in(in, 3), in_pack(in, 3), @@ -56,12 +55,6 @@ namespace quda 
is_t0_kernel(is_t0_kernel), t0_offset(is_t0_kernel ? in.VolumeCB() / in.X(3) : 0) { - if (in.V() == out.V()) errorQuda("Aliasing pointers"); - checkOrder(out, in); // check all orders match - checkPrecision(out, in, U); // check all precisions match - checkLocation(out, in, U); // check all locations match - if (!in.isNative() || !U.isNative()) - errorQuda("Unsupported field order colorspinor(in)=%d gauge=%d combination", in.FieldOrder(), U.FieldOrder()); if (dir < 3 || dir > 4) errorQuda("Unsupported laplace direction %d (must be 3 or 4)", dir); for (int i = 0; i < 4; i++) { diff --git a/lib/block_orthogonalize.in.cu b/lib/block_orthogonalize.in.cu index 27b7d68f22..64651fb55f 100644 --- a/lib/block_orthogonalize.in.cu +++ b/lib/block_orthogonalize.in.cu @@ -278,7 +278,7 @@ namespace quda { QUDA_PRECISION, V.Precision(), B[0]->Precision()); if constexpr (is_enabled_multigrid()) { - if (B[0]->V() == nullptr) { + if (B[0]->data() == nullptr) { warningQuda("Trying to BlockOrthogonalize staggered transform, skipping..."); return; } diff --git a/lib/clover_field.cpp b/lib/clover_field.cpp index 2727069224..e91600b6c3 100644 --- a/lib/clover_field.cpp +++ b/lib/clover_field.cpp @@ -170,7 +170,7 @@ namespace quda { if (is_inverse && !src.Inverse() && !dynamic_inverse_copy) errorQuda("Source field's is_inverse=%d component does not exist", is_inverse); - auto src_v = dynamic_inverse_copy ? src.V(false) : src.V(is_inverse); + auto src_v = dynamic_inverse_copy ? 
src.data(false) : src.data(is_inverse); // if we copying to a reconstruction field, we must find the overall scale factor to allow us to reconstruct if (Reconstruct()) { @@ -192,7 +192,7 @@ namespace quda { void *packClover = pool_pinned_malloc(bytes); copyGenericClover(*this, src, is_inverse, QUDA_CPU_FIELD_LOCATION, packClover, src_v); - qudaMemcpy(V(is_inverse), packClover, bytes, qudaMemcpyHostToDevice); + qudaMemcpy(data(is_inverse), packClover, bytes, qudaMemcpyHostToDevice); pool_pinned_free(packClover); } else if (reorder_location() == QUDA_CUDA_FIELD_LOCATION && src.Location() == QUDA_CPU_FIELD_LOCATION) { @@ -217,7 +217,7 @@ namespace quda { void *packClover = pool_device_malloc(bytes); copyGenericClover(*this, src, is_inverse, QUDA_CUDA_FIELD_LOCATION, packClover, src_v); - qudaMemcpy(V(is_inverse), packClover, bytes, qudaMemcpyDeviceToHost); + qudaMemcpy(data(is_inverse), packClover, bytes, qudaMemcpyDeviceToHost); pool_device_free(packClover); } @@ -331,7 +331,7 @@ namespace quda { spinor_param.fieldOrder = colorspinor::getNative(a.Precision(), a.Nspin()); spinor_param.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; spinor_param.create = QUDA_REFERENCE_FIELD_CREATE; - spinor_param.v = a.V(inverse); + spinor_param.v = a.data(inverse); spinor_param.location = a.Location(); return spinor_param; } diff --git a/lib/coarse_op.in.cu b/lib/coarse_op.in.cu index 358c3ba0b9..259da32c98 100644 --- a/lib/coarse_op.in.cu +++ b/lib/coarse_op.in.cu @@ -197,7 +197,7 @@ namespace quda { for (int i = 0; i < cf_param.nDim; i++) cf_param.x[i] = clover ? 
clover->X()[i] : 0; // only create inverse if not doing dynamic clover and one already exists - cf_param.inverse = !clover::dynamic_inverse() && clover && clover->V(true); + cf_param.inverse = !clover::dynamic_inverse() && clover && clover->Inverse(); cf_param.clover = nullptr; cf_param.cloverInv = nullptr; cf_param.create = QUDA_NULL_FIELD_CREATE; diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp index a40f191712..b26897948b 100644 --- a/lib/color_spinor_field.cpp +++ b/lib/color_spinor_field.cpp @@ -428,17 +428,17 @@ namespace quda if (src.FieldOrder() == QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER) { // special case where we use mapped memory to read/write directly from application's array - void *src_d = get_mapped_device_pointer(src.V()); + void *src_d = get_mapped_device_pointer(src.data()); copyGenericColorSpinor(*this, src, QUDA_CUDA_FIELD_LOCATION, v.data(), src_d); } else { void *Src = nullptr, *buffer = nullptr; if (!zeroCopy) { buffer = pool_device_malloc(src.Bytes()); Src = buffer; - qudaMemcpy(Src, src.V(), src.Bytes(), qudaMemcpyDefault); + qudaMemcpy(Src, src.data(), src.Bytes(), qudaMemcpyDefault); } else { buffer = pool_pinned_malloc(src.Bytes()); - memcpy(buffer, src.V(), src.Bytes()); + memcpy(buffer, src.data(), src.Bytes()); Src = get_mapped_device_pointer(buffer); } @@ -465,7 +465,7 @@ namespace quda if (FieldOrder() == QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER) { // special case where we use zero-copy memory to read/write directly from application's array void *dest_d = get_mapped_device_pointer(v.data()); - copyGenericColorSpinor(*this, src, QUDA_CUDA_FIELD_LOCATION, dest_d, src.V()); + copyGenericColorSpinor(*this, src, QUDA_CUDA_FIELD_LOCATION, dest_d, src.data()); } else { void *dst = nullptr, *buffer = nullptr; if (!zeroCopy) { @@ -837,7 +837,7 @@ namespace quda errorQuda("Cannot create an alias to source with lower precision than the alias"); ColorSpinorParam param = param_.init ? 
param_ : ColorSpinorParam(*this); param.create = QUDA_REFERENCE_FIELD_CREATE; - param.v = V(); + param.v = data(); return ColorSpinorField(param); } @@ -848,7 +848,7 @@ namespace quda errorQuda("Cannot create an alias to source with lower precision than the alias"); ColorSpinorParam param(param_); param.create = QUDA_REFERENCE_FIELD_CREATE; - param.v = V(); + param.v = data(); return new ColorSpinorField(param); } diff --git a/lib/color_spinor_util.in.cu b/lib/color_spinor_util.in.cu index 3681438c9f..b018bc0e3f 100644 --- a/lib/color_spinor_util.in.cu +++ b/lib/color_spinor_util.in.cu @@ -417,7 +417,7 @@ namespace quda { param.create = create; if (create == QUDA_COPY_FIELD_CREATE) param.field = &const_cast(src); - else if (create == QUDA_REFERENCE_FIELD_CREATE) param.v = const_cast(src).V(); + else if (create == QUDA_REFERENCE_FIELD_CREATE) param.v = src.data(); resize(v, new_size, param); } diff --git a/lib/copy_clover_offset.cu b/lib/copy_clover_offset.cu index 1300082c24..f29e663c14 100644 --- a/lib/copy_clover_offset.cu +++ b/lib/copy_clover_offset.cu @@ -70,8 +70,8 @@ namespace quda if (pc_type != QUDA_4D_PC) { errorQuda("Gauge field copy must use 4d even-odd preconditioning."); } - if (in.V(true)) { instantiate(out, in, offset, true); } - if (in.V(false)) { instantiate(out, in, offset, false); } + if (in.Inverse()) instantiate(out, in, offset, true); + instantiate(out, in, offset, false); } #else void copyFieldOffset(CloverField &, const CloverField &, CommKey, QudaPCType) diff --git a/lib/copy_color_spinor_mg.in.hpp b/lib/copy_color_spinor_mg.in.hpp index a6678143b4..d28ffa4e80 100644 --- a/lib/copy_color_spinor_mg.in.hpp +++ b/lib/copy_color_spinor_mg.in.hpp @@ -117,14 +117,14 @@ namespace quda { } // set for the source subset ordering - srcFloat *srcEven = Src ? Src : (srcFloat*)src.V(); + srcFloat *srcEven = Src ? 
Src : src.data(); srcFloat *srcOdd = (srcFloat*)((char*)srcEven + src.Bytes()/2); if (src.SiteOrder() == QUDA_ODD_EVEN_SITE_ORDER) { std::swap(srcEven, srcOdd); } // set for the destination subset ordering - dstFloat *dstEven = Dst ? Dst : (dstFloat*)dst.V(); + dstFloat *dstEven = Dst ? Dst : dst.data(); dstFloat *dstOdd = (dstFloat*)((char*)dstEven + dst.Bytes()/2); if (dst.SiteOrder() == QUDA_ODD_EVEN_SITE_ORDER) { std::swap(dstEven, dstOdd); diff --git a/lib/dirac.cpp b/lib/dirac.cpp index 6e0a5912d3..e7be7cdc6d 100644 --- a/lib/dirac.cpp +++ b/lib/dirac.cpp @@ -115,7 +115,7 @@ namespace quda { } void Dirac::checkSpinorAlias(const ColorSpinorField &a, const ColorSpinorField &b) const { - if (a.V() == b.V()) errorQuda("Aliasing pointers"); + if (a.data() == b.data()) errorQuda("Aliasing pointers"); } // Dirac operator factory diff --git a/lib/dslash_clover_helper.cu b/lib/dslash_clover_helper.cu index 7389394ba1..accc50d31a 100644 --- a/lib/dslash_clover_helper.cu +++ b/lib/dslash_clover_helper.cu @@ -35,8 +35,8 @@ namespace quda { launch(tp, stream, CloverArg(out, in, clover, parity)); } - void preTune() { if (out.V() == in.V()) out.backup(); } // Backup if in and out fields alias - void postTune() { if (out.V() == in.V()) out.restore(); } // Restore if the in and out fields alias + void preTune() { if (out.data() == in.data()) out.backup(); } // Backup if in and out fields alias + void postTune() { if (out.data() == in.data()) out.restore(); } // Restore if the in and out fields alias long long flops() const { return in.Volume()*504ll; } long long bytes() const { return out.Bytes() + in.Bytes() + clover.Bytes() / (3 - in.SiteSubset()); } }; @@ -115,8 +115,8 @@ namespace quda { } } - void preTune() { if (out.V() == in.V()) out.backup(); } // Restore if the in and out fields alias - void postTune() { if (out.V() == in.V()) out.restore(); } // Restore if the in and out fields alias + void preTune() { if (out.data() == in.data()) out.backup(); } // Restore if the 
in and out fields alias + void postTune() { if (out.data() == in.data()) out.restore(); } // Restore if the in and out fields alias long long flops() const { return (inverse ? 1056ll : 552ll) * in.Volume(); } long long bytes() const { long long rtn = out.Bytes() + in.Bytes() + clover.Bytes() / (3 - in.SiteSubset()); diff --git a/lib/dslash_coarse.hpp b/lib/dslash_coarse.hpp index a98290d129..9816a9af0c 100644 --- a/lib/dslash_coarse.hpp +++ b/lib/dslash_coarse.hpp @@ -413,7 +413,7 @@ namespace quda { */ inline void operator()(DslashCoarsePolicy policy) { - if (inA[0].V() == out[0].V()) errorQuda("Aliasing pointers"); + if (inA[0].data() == out[0].data()) errorQuda("Aliasing pointers"); // check all precisions match QudaPrecision precision = checkPrecision(out[0], inA[0], inB[0]); diff --git a/lib/dslash_gamma_helper.cu b/lib/dslash_gamma_helper.cu index 4b7ef2458c..2e76504afd 100644 --- a/lib/dslash_gamma_helper.cu +++ b/lib/dslash_gamma_helper.cu @@ -74,8 +74,8 @@ namespace quda { launch(tp, stream, GammaArg(out, in, d, kappa, mu, epsilon, dagger, type)); } - void preTune() { if (out.V() == in.V()) out.backup(); } - void postTune() { if (out.V() == in.V()) out.restore(); } + void preTune() { if (out.data() == in.data()) out.backup(); } + void postTune() { if (out.data() == in.data()) out.restore(); } long long flops() const { return 0; } long long bytes() const { return out.Bytes() + in.Bytes(); } }; diff --git a/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.cu b/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.cu index d169f4f0e1..ca8ce572d9 100644 --- a/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.cu +++ b/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.cu @@ -140,16 +140,6 @@ namespace quda const ColorSpinorField &x, int parity, bool dagger, const int *comm_override, TimeProfile &profile) { - if (in.V() == out.V()) errorQuda("Aliasing pointers"); - if (in.FieldOrder() != out.FieldOrder()) - errorQuda("Field order mismatch 
in = %d, out = %d", in.FieldOrder(), out.FieldOrder()); - - // check all precisions match - checkPrecision(out, in, U, A); - - // check all locations match - checkLocation(out, in, U, A); - instantiate(out, in, U, A, a, b, x, parity, dagger, comm_override, profile); } diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 3c32443205..17b6bd4391 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -3322,7 +3322,7 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col || param->dslash_type == QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH) { if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loading clover field...\n"); } if (collected_clover) { - loadCloverQuda(collected_clover->V(false), collected_clover->V(true), param); + loadCloverQuda(collected_clover->data(false), collected_clover->data(true), param); } else { loadCloverQuda(nullptr, nullptr, param); } @@ -3330,7 +3330,7 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col } for (int n = 0; n < param->num_src_per_sub_partition; n++) { - op(_collect_x[n]->V(), _collect_b[n]->V(), param, args...); + op(_collect_x[n]->data(), _collect_b[n]->data(), param, args...); } profileInvertMultiSrc.TPSTART(QUDA_PROFILE_TOTAL); diff --git a/lib/inv_gmresdr_quda.cpp b/lib/inv_gmresdr_quda.cpp index 62d7685eda..389206853e 100644 --- a/lib/inv_gmresdr_quda.cpp +++ b/lib/inv_gmresdr_quda.cpp @@ -282,7 +282,7 @@ namespace quda { blas::zero(Vm->Component(i)); } - if (Zm->V() != Vm->V()) { + if (Zm->data() != Vm->data()) { std::vector z(Zm->Components()); std::vector vk(args.Vkp1->Components().begin(), args.Vkp1->Components().begin() + args.k); diff --git a/lib/inv_mr_quda.cpp b/lib/inv_mr_quda.cpp index 44078ce783..4f636bf279 100644 --- a/lib/inv_mr_quda.cpp +++ b/lib/inv_mr_quda.cpp @@ -38,7 +38,7 @@ namespace quda bool mixed = param.precision != param.precision_sloppy; if (!mixed) csParam.create = QUDA_REFERENCE_FIELD_CREATE; - csParam.v 
= r.V(); + csParam.v = r.data(); r_sloppy = ColorSpinorField(csParam); init = true; diff --git a/lib/max_clover.cu b/lib/max_clover.cu index 18c84ca7a3..48e9630421 100644 --- a/lib/max_clover.cu +++ b/lib/max_clover.cu @@ -50,7 +50,7 @@ namespace quda { #ifdef GPU_CLOVER_DIRAC double _norm(const CloverField &u, bool inverse, norm_type_ type) { - if (!u.V(inverse)) errorQuda("reqeusted clover is_inverse=%d, but not allocated", inverse); + if (!u.data(inverse)) errorQuda("reqeusted clover is_inverse=%d, but not allocated", inverse); double nrm = 0.0; switch(u.Precision()) { case QUDA_DOUBLE_PRECISION: nrm = _norm(u, inverse, type); break; diff --git a/lib/multi_reduce_quda.cu b/lib/multi_reduce_quda.cu index 6af44e8107..f93ab431e4 100644 --- a/lib/multi_reduce_quda.cu +++ b/lib/multi_reduce_quda.cu @@ -88,7 +88,7 @@ namespace quda { if (NXZ == NYW) { is_norm = true; for (int i = 0; i < NXZ; i++) { - if (x[i].V() != y[i].V() || x[i].V() != z[i].V() || x[i].V() != w[i].V()) { + if (x[i].data() != y[i].data() || x[i].data() != z[i].data() || x[i].data() != w[i].data()) { is_norm = false; break; } diff --git a/lib/staggered_kd_apply_xinv.cu b/lib/staggered_kd_apply_xinv.cu index 60e9034663..247668cb1c 100644 --- a/lib/staggered_kd_apply_xinv.cu +++ b/lib/staggered_kd_apply_xinv.cu @@ -22,7 +22,7 @@ namespace quda { Xinv(Xinv), dagger(dagger) { - if (out.V() == in.V()) errorQuda("Spinor fields cannot alias"); + if (out.data() == in.data()) errorQuda("Spinor fields cannot alias"); if (in.Nspin() != 1 || out.Nspin() != 1) errorQuda("Unsupported nSpin=%d %d", out.Nspin(), in.Nspin()); if (Xinv.Geometry() != QUDA_KDINVERSE_GEOMETRY) errorQuda("Unsupported gauge geometry %d , expected %d for Xinv", Xinv.Geometry(), QUDA_KDINVERSE_GEOMETRY); diff --git a/lib/vector_io.cpp b/lib/vector_io.cpp index 736cc4d84d..52da9b2cb5 100644 --- a/lib/vector_io.cpp +++ b/lib/vector_io.cpp @@ -52,7 +52,7 @@ namespace quda std::vector V(Nvec * Ls); for (int i = 0; i < Nvec; i++) { auto &v = 
create_tmp ? tmp[i] : vecs[i]; - for (int j = 0; j < Ls; j++) { V[i * Ls + j] = static_cast(v.V()) + j * stride; } + for (int j = 0; j < Ls; j++) { V[i * Ls + j] = v.data() + j * stride; } } read_spinor_field(filename.c_str(), V.data(), v0.Precision(), v0.X(), v0.SiteSubset(), @@ -125,7 +125,7 @@ namespace quda std::vector V(Nvec * Ls); for (int i = 0; i < Nvec; i++) { auto &v = create_tmp ? tmp[i] : vecs[i]; - for (int j = 0; j < Ls; j++) { V[i * Ls + j] = static_cast(v.V()) + j * stride; } + for (int j = 0; j < Ls; j++) { V[i * Ls + j] = v.data() + j * stride; } } write_spinor_field(filename.c_str(), V.data(), save_prec, v0.X(), v0.SiteSubset(), diff --git a/tests/dslash_test_utils.h b/tests/dslash_test_utils.h index abc1270cb4..8f01594579 100644 --- a/tests/dslash_test_utils.h +++ b/tests/dslash_test_utils.h @@ -347,51 +347,51 @@ struct DslashTestWrapper { if (dslash_type == QUDA_WILSON_DSLASH) { switch (dtest_type) { case dslash_test_type::Dslash: - wil_dslash(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, inv_param.cpu_prec, gauge_param); + wil_dslash(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, inv_param.cpu_prec, gauge_param); break; case dslash_test_type::MatPC: - wil_matpc(spinorRef.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.matpc_type, inv_param.dagger, + wil_matpc(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param); break; case dslash_test_type::Mat: - wil_mat(spinorRef.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.dagger, inv_param.cpu_prec, gauge_param); + wil_mat(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.dagger, inv_param.cpu_prec, gauge_param); break; case dslash_test_type::MatPCDagMatPC: - wil_matpc(spinorTmp.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.matpc_type, inv_param.dagger, + wil_matpc(spinorTmp.data(), hostGauge, spinor.data(), inv_param.kappa, 
inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param); - wil_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), inv_param.kappa, inv_param.matpc_type, not_dagger, + wil_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), inv_param.kappa, inv_param.matpc_type, not_dagger, inv_param.cpu_prec, gauge_param); break; case dslash_test_type::MatDagMat: - wil_mat(spinorTmp.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.dagger, inv_param.cpu_prec, gauge_param); - wil_mat(spinorRef.V(), hostGauge, spinorTmp.V(), inv_param.kappa, not_dagger, inv_param.cpu_prec, gauge_param); + wil_mat(spinorTmp.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.dagger, inv_param.cpu_prec, gauge_param); + wil_mat(spinorRef.data(), hostGauge, spinorTmp.data(), inv_param.kappa, not_dagger, inv_param.cpu_prec, gauge_param); break; default: printfQuda("Test type not defined\n"); exit(-1); } } else if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { switch (dtest_type) { case dslash_test_type::Dslash: - clover_dslash(spinorRef.V(), hostGauge, hostCloverInv, spinor.V(), parity, inv_param.dagger, inv_param.cpu_prec, + clover_dslash(spinorRef.data(), hostGauge, hostCloverInv, spinor.data(), parity, inv_param.dagger, inv_param.cpu_prec, gauge_param); break; case dslash_test_type::MatPC: - clover_matpc(spinorRef.V(), hostGauge, hostClover, hostCloverInv, spinor.V(), inv_param.kappa, + clover_matpc(spinorRef.data(), hostGauge, hostClover, hostCloverInv, spinor.data(), inv_param.kappa, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param); break; case dslash_test_type::Mat: - clover_mat(spinorRef.V(), hostGauge, hostClover, spinor.V(), inv_param.kappa, inv_param.dagger, + clover_mat(spinorRef.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.dagger, inv_param.cpu_prec, gauge_param); break; case dslash_test_type::MatPCDagMatPC: - clover_matpc(spinorTmp.V(), hostGauge, hostClover, hostCloverInv, spinor.V(), inv_param.kappa, + 
clover_matpc(spinorTmp.data(), hostGauge, hostClover, hostCloverInv, spinor.data(), inv_param.kappa, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param); - clover_matpc(spinorRef.V(), hostGauge, hostClover, hostCloverInv, spinorTmp.V(), inv_param.kappa, + clover_matpc(spinorRef.data(), hostGauge, hostClover, hostCloverInv, spinorTmp.data(), inv_param.kappa, inv_param.matpc_type, not_dagger, inv_param.cpu_prec, gauge_param); break; case dslash_test_type::MatDagMat: - clover_mat(spinorTmp.V(), hostGauge, hostClover, spinor.V(), inv_param.kappa, inv_param.dagger, + clover_mat(spinorTmp.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.dagger, inv_param.cpu_prec, gauge_param); - clover_mat(spinorRef.V(), hostGauge, hostClover, spinorTmp.V(), inv_param.kappa, not_dagger, inv_param.cpu_prec, + clover_mat(spinorRef.data(), hostGauge, hostClover, spinorTmp.data(), inv_param.kappa, not_dagger, inv_param.cpu_prec, gauge_param); break; default: printfQuda("Test type not defined\n"); exit(-1); @@ -401,37 +401,37 @@ struct DslashTestWrapper { switch (dtest_type) { case dslash_test_type::Dslash: // My dslash should be the same as the clover dslash - clover_dslash(spinorRef.V(), hostGauge, hostCloverInv, spinor.V(), parity, inv_param.dagger, inv_param.cpu_prec, + clover_dslash(spinorRef.data(), hostGauge, hostCloverInv, spinor.data(), parity, inv_param.dagger, inv_param.cpu_prec, gauge_param); break; case dslash_test_type::MatPC: // my matpc op - cloverHasenbuschTwist_matpc(spinorRef.V(), hostGauge, spinor.V(), hostClover, hostCloverInv, inv_param.kappa, + cloverHasenbuschTwist_matpc(spinorRef.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param); break; case dslash_test_type::Mat: // my mat - cloverHasenbuchTwist_mat(spinorRef.V(), hostGauge, hostClover, spinor.V(), inv_param.kappa, inv_param.mu, + 
cloverHasenbuchTwist_mat(spinorRef.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.dagger, inv_param.cpu_prec, gauge_param, inv_param.matpc_type); break; case dslash_test_type::MatPCDagMatPC: // matpc^\dagger matpc // my matpc op - cloverHasenbuschTwist_matpc(spinorTmp.V(), hostGauge, spinor.V(), hostClover, hostCloverInv, inv_param.kappa, + cloverHasenbuschTwist_matpc(spinorTmp.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param); - cloverHasenbuschTwist_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), hostClover, hostCloverInv, inv_param.kappa, + cloverHasenbuschTwist_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, inv_param.matpc_type, not_dagger, inv_param.cpu_prec, gauge_param); break; case dslash_test_type::MatDagMat: // my mat - cloverHasenbuchTwist_mat(spinorTmp.V(), hostGauge, hostClover, spinor.V(), inv_param.kappa, inv_param.mu, + cloverHasenbuchTwist_mat(spinorTmp.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.dagger, inv_param.cpu_prec, gauge_param, inv_param.matpc_type); - cloverHasenbuchTwist_mat(spinorRef.V(), hostGauge, hostClover, spinorTmp.V(), inv_param.kappa, inv_param.mu, + cloverHasenbuchTwist_mat(spinorRef.data(), hostGauge, hostClover, spinorTmp.data(), inv_param.kappa, inv_param.mu, not_dagger, inv_param.cpu_prec, gauge_param, inv_param.matpc_type); break; @@ -441,54 +441,54 @@ struct DslashTestWrapper { switch (dtest_type) { case dslash_test_type::Dslash: if (inv_param.twist_flavor == QUDA_TWIST_SINGLET) - tm_dslash(spinorRef.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, parity, + tm_dslash(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, parity, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, 
gauge_param); else { - tm_ndeg_dslash(spinorRef.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.epsilon, parity, + tm_ndeg_dslash(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon, parity, inv_param.dagger, inv_param.matpc_type, inv_param.cpu_prec, gauge_param); } break; case dslash_test_type::MatPC: if (inv_param.twist_flavor == QUDA_TWIST_SINGLET) - tm_matpc(spinorRef.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, + tm_matpc(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param); else { - tm_ndeg_matpc(spinorRef.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.epsilon, + tm_ndeg_matpc(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param); } break; case dslash_test_type::Mat: if (inv_param.twist_flavor == QUDA_TWIST_SINGLET) - tm_mat(spinorRef.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, + tm_mat(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.dagger, inv_param.cpu_prec, gauge_param); else { - tm_ndeg_mat(spinorRef.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.epsilon, + tm_ndeg_mat(spinorRef.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon, inv_param.dagger, inv_param.cpu_prec, gauge_param); } break; case dslash_test_type::MatPCDagMatPC: if (inv_param.twist_flavor == QUDA_TWIST_SINGLET) { - tm_matpc(spinorTmp.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, + tm_matpc(spinorTmp.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, 
gauge_param); - tm_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, + tm_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, not_dagger, inv_param.cpu_prec, gauge_param); } else { - tm_ndeg_matpc(spinorTmp.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.epsilon, + tm_ndeg_matpc(spinorTmp.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param); - tm_ndeg_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), inv_param.kappa, inv_param.mu, inv_param.epsilon, + tm_ndeg_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon, inv_param.matpc_type, not_dagger, inv_param.cpu_prec, gauge_param); } break; case dslash_test_type::MatDagMat: if (inv_param.twist_flavor == QUDA_TWIST_SINGLET) { - tm_mat(spinorTmp.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, + tm_mat(spinorTmp.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.dagger, inv_param.cpu_prec, gauge_param); - tm_mat(spinorRef.V(), hostGauge, spinorTmp.V(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, + tm_mat(spinorRef.data(), hostGauge, spinorTmp.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, not_dagger, inv_param.cpu_prec, gauge_param); } else { - tm_ndeg_mat(spinorTmp.V(), hostGauge, spinor.V(), inv_param.kappa, inv_param.mu, inv_param.epsilon, + tm_ndeg_mat(spinorTmp.data(), hostGauge, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon, inv_param.dagger, inv_param.cpu_prec, gauge_param); - tm_ndeg_mat(spinorRef.V(), hostGauge, spinorTmp.V(), inv_param.kappa, inv_param.mu, inv_param.epsilon, + tm_ndeg_mat(spinorRef.data(), hostGauge, spinorTmp.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon, 
not_dagger, inv_param.cpu_prec, gauge_param); } break; @@ -498,54 +498,54 @@ struct DslashTestWrapper { switch (dtest_type) { case dslash_test_type::Dslash: if (inv_param.twist_flavor == QUDA_TWIST_SINGLET) - tmc_dslash(spinorRef.V(), hostGauge, spinor.V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, + tmc_dslash(spinorRef.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, parity, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param); else - tmc_ndeg_dslash(spinorRef.V(), hostGauge, spinor.V(), hostClover, hostCloverInv, inv_param.kappa, + tmc_ndeg_dslash(spinorRef.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, inv_param.epsilon, parity, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param); break; case dslash_test_type::MatPC: if (inv_param.twist_flavor == QUDA_TWIST_SINGLET) - tmc_matpc(spinorRef.V(), hostGauge, spinor.V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, + tmc_matpc(spinorRef.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param); else - tmc_ndeg_matpc(spinorRef.V(), hostGauge, spinor.V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, + tmc_ndeg_matpc(spinorRef.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, inv_param.epsilon, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param); break; case dslash_test_type::Mat: if (inv_param.twist_flavor == QUDA_TWIST_SINGLET) - tmc_mat(spinorRef.V(), hostGauge, hostClover, spinor.V(), inv_param.kappa, inv_param.mu, + tmc_mat(spinorRef.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.dagger, inv_param.cpu_prec, gauge_param); else - tmc_ndeg_mat(spinorRef.V(), hostGauge, 
hostClover, spinor.V(), inv_param.kappa, inv_param.mu, + tmc_ndeg_mat(spinorRef.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon, inv_param.dagger, inv_param.cpu_prec, gauge_param); break; case dslash_test_type::MatPCDagMatPC: if (inv_param.twist_flavor == QUDA_TWIST_SINGLET) { - tmc_matpc(spinorTmp.V(), hostGauge, spinor.V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, + tmc_matpc(spinorTmp.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param); - tmc_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, + tmc_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, not_dagger, inv_param.cpu_prec, gauge_param); } else { - tmc_ndeg_matpc(spinorTmp.V(), hostGauge, spinor.V(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, + tmc_ndeg_matpc(spinorTmp.data(), hostGauge, spinor.data(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, inv_param.epsilon, inv_param.matpc_type, inv_param.dagger, inv_param.cpu_prec, gauge_param); - tmc_ndeg_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), hostClover, hostCloverInv, inv_param.kappa, + tmc_ndeg_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), hostClover, hostCloverInv, inv_param.kappa, inv_param.mu, inv_param.epsilon, inv_param.matpc_type, not_dagger, inv_param.cpu_prec, gauge_param); } break; case dslash_test_type::MatDagMat: if (inv_param.twist_flavor == QUDA_TWIST_SINGLET) { - tmc_mat(spinorTmp.V(), hostGauge, hostClover, spinor.V(), inv_param.kappa, inv_param.mu, + tmc_mat(spinorTmp.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.dagger, inv_param.cpu_prec, gauge_param); - tmc_mat(spinorRef.V(), 
hostGauge, hostClover, spinorTmp.V(), inv_param.kappa, inv_param.mu, + tmc_mat(spinorRef.data(), hostGauge, hostClover, spinorTmp.data(), inv_param.kappa, inv_param.mu, inv_param.twist_flavor, not_dagger, inv_param.cpu_prec, gauge_param); } else { - tmc_ndeg_mat(spinorTmp.V(), hostGauge, hostClover, spinor.V(), inv_param.kappa, inv_param.mu, + tmc_ndeg_mat(spinorTmp.data(), hostGauge, hostClover, spinor.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon, inv_param.dagger, inv_param.cpu_prec, gauge_param); - tmc_ndeg_mat(spinorRef.V(), hostGauge, hostClover, spinorTmp.V(), inv_param.kappa, inv_param.mu, + tmc_ndeg_mat(spinorRef.data(), hostGauge, hostClover, spinorTmp.data(), inv_param.kappa, inv_param.mu, inv_param.epsilon, not_dagger, inv_param.cpu_prec, gauge_param); } break; @@ -554,25 +554,25 @@ struct DslashTestWrapper { } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) { switch (dtest_type) { case dslash_test_type::Dslash: - dw_dslash(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param, + dw_dslash(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass); break; case dslash_test_type::MatPC: - dw_matpc(spinorRef.V(), hostGauge, spinor.V(), kappa5, inv_param.matpc_type, inv_param.dagger, + dw_matpc(spinorRef.data(), hostGauge, spinor.data(), kappa5, inv_param.matpc_type, inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass); break; case dslash_test_type::Mat: - dw_mat(spinorRef.V(), hostGauge, spinor.V(), kappa5, inv_param.dagger, gauge_param.cpu_prec, gauge_param, + dw_mat(spinorRef.data(), hostGauge, spinor.data(), kappa5, inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass); break; case dslash_test_type::MatPCDagMatPC: - dw_matpc(spinorTmp.V(), hostGauge, spinor.V(), kappa5, inv_param.matpc_type, inv_param.dagger, + dw_matpc(spinorTmp.data(), hostGauge, spinor.data(), kappa5, inv_param.matpc_type, 
inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass); - dw_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), kappa5, inv_param.matpc_type, not_dagger, + dw_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), kappa5, inv_param.matpc_type, not_dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass); break; case dslash_test_type::MatDagMat: - dw_matdagmat(spinorRef.V(), hostGauge, spinor.V(), kappa5, inv_param.dagger, gauge_param.cpu_prec, gauge_param, + dw_matdagmat(spinorRef.data(), hostGauge, spinor.data(), kappa5, inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass); break; default: printf("Test type not supported for domain wall\n"); exit(-1); @@ -582,35 +582,35 @@ struct DslashTestWrapper { for (int xs = 0; xs < Ls; xs++) kappa_5[xs] = kappa5; switch (dtest_type) { case dslash_test_type::Dslash: - dslash_4_4d(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param, + dslash_4_4d(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass); break; case dslash_test_type::M5: - dw_dslash_5_4d(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec, + dw_dslash_5_4d(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass, true); break; case dslash_test_type::M5inv: - dslash_5_inv(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param, + dslash_5_inv(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass, kappa_5); break; case dslash_test_type::MatPC: - dw_4d_matpc(spinorRef.V(), hostGauge, spinor.V(), kappa5, inv_param.matpc_type, inv_param.dagger, + dw_4d_matpc(spinorRef.data(), hostGauge, spinor.data(), kappa5, inv_param.matpc_type, inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass); break; case 
dslash_test_type::Mat: - dw_4d_mat(spinorRef.V(), hostGauge, spinor.V(), kappa5, inv_param.dagger, gauge_param.cpu_prec, gauge_param, + dw_4d_mat(spinorRef.data(), hostGauge, spinor.data(), kappa5, inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass); break; case dslash_test_type::MatPCDagMatPC: - dw_4d_matpc(spinorTmp.V(), hostGauge, spinor.V(), kappa5, inv_param.matpc_type, inv_param.dagger, + dw_4d_matpc(spinorTmp.data(), hostGauge, spinor.data(), kappa5, inv_param.matpc_type, inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass); - dw_4d_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), kappa5, inv_param.matpc_type, not_dagger, + dw_4d_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), kappa5, inv_param.matpc_type, not_dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass); break; case dslash_test_type::MatDagMat: - dw_4d_mat(spinorTmp.V(), hostGauge, spinor.V(), kappa5, inv_param.dagger, gauge_param.cpu_prec, gauge_param, + dw_4d_mat(spinorTmp.data(), hostGauge, spinor.data(), kappa5, inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass); - dw_4d_mat(spinorRef.V(), hostGauge, spinorTmp.V(), kappa5, not_dagger, gauge_param.cpu_prec, gauge_param, + dw_4d_mat(spinorRef.data(), hostGauge, spinorTmp.data(), kappa5, not_dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass); break; default: printf("Test type not supported for domain wall\n"); exit(-1); @@ -629,44 +629,44 @@ struct DslashTestWrapper { } switch (dtest_type) { case dslash_test_type::Dslash: - dslash_4_4d(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param, + dslash_4_4d(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass); break; case dslash_test_type::M5: - mdw_dslash_5(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param, + mdw_dslash_5(spinorRef.data(), hostGauge, spinor.data(), parity, 
inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass, kappa_5, true); break; case dslash_test_type::Dslash4pre: - mdw_dslash_4_pre(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec, + mdw_dslash_4_pre(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5, true); break; case dslash_test_type::M5inv: - mdw_dslash_5_inv(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec, + mdw_dslash_5_inv(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass, kappa_mdwf); break; case dslash_test_type::MatPC: - mdw_matpc(spinorRef.V(), hostGauge, spinor.V(), kappa_b, kappa_c, inv_param.matpc_type, inv_param.dagger, + mdw_matpc(spinorRef.data(), hostGauge, spinor.data(), kappa_b, kappa_c, inv_param.matpc_type, inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5); break; case dslash_test_type::Mat: - mdw_mat(spinorRef.V(), hostGauge, spinor.V(), kappa_b, kappa_c, inv_param.dagger, gauge_param.cpu_prec, + mdw_mat(spinorRef.data(), hostGauge, spinor.data(), kappa_b, kappa_c, inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5); break; case dslash_test_type::MatPCDagMatPC: - mdw_matpc(spinorTmp.V(), hostGauge, spinor.V(), kappa_b, kappa_c, inv_param.matpc_type, inv_param.dagger, + mdw_matpc(spinorTmp.data(), hostGauge, spinor.data(), kappa_b, kappa_c, inv_param.matpc_type, inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5); - mdw_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), kappa_b, kappa_c, inv_param.matpc_type, not_dagger, + mdw_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), kappa_b, kappa_c, inv_param.matpc_type, not_dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.b_5, 
inv_param.c_5); break; case dslash_test_type::MatDagMat: - mdw_mat(spinorTmp.V(), hostGauge, spinor.V(), kappa_b, kappa_c, inv_param.dagger, gauge_param.cpu_prec, + mdw_mat(spinorTmp.data(), hostGauge, spinor.data(), kappa_b, kappa_c, inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5); - mdw_mat(spinorRef.V(), hostGauge, spinorTmp.V(), kappa_b, kappa_c, not_dagger, gauge_param.cpu_prec, + mdw_mat(spinorRef.data(), hostGauge, spinorTmp.data(), kappa_b, kappa_c, not_dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5); break; case dslash_test_type::MatPCDagMatPCLocal: // reference for MdagM local operator - mdw_mdagm_local(spinorRef.V(), hostGauge, spinor.V(), kappa_b, kappa_c, inv_param.matpc_type, + mdw_mdagm_local(spinorRef.data(), hostGauge, spinor.data(), kappa_b, kappa_c, inv_param.matpc_type, gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5); break; default: printf("Test type not supported for Mobius domain wall\n"); exit(-1); @@ -688,48 +688,48 @@ struct DslashTestWrapper { } switch (dtest_type) { case dslash_test_type::Dslash: - dslash_4_4d(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param, + dslash_4_4d(spinorRef.data(), hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass); break; case dslash_test_type::M5: - mdw_eofa_m5(spinorRef.V(), spinor.V(), parity, inv_param.dagger, inv_param.mass, inv_param.m5, + mdw_eofa_m5(spinorRef.data(), spinor.data(), parity, inv_param.dagger, inv_param.mass, inv_param.m5, (__real__ inv_param.b_5[0]), (__real__ inv_param.c_5[0]), inv_param.mq1, inv_param.mq2, inv_param.mq3, inv_param.eofa_pm, inv_param.eofa_shift, gauge_param.cpu_prec); break; case dslash_test_type::Dslash4pre: - mdw_dslash_4_pre(spinorRef.V(), hostGauge, spinor.V(), parity, inv_param.dagger, gauge_param.cpu_prec, + mdw_dslash_4_pre(spinorRef.data(), 
hostGauge, spinor.data(), parity, inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5, true); break; case dslash_test_type::M5inv: - mdw_eofa_m5inv(spinorRef.V(), spinor.V(), parity, inv_param.dagger, inv_param.mass, inv_param.m5, + mdw_eofa_m5inv(spinorRef.data(), spinor.data(), parity, inv_param.dagger, inv_param.mass, inv_param.m5, (__real__ inv_param.b_5[0]), (__real__ inv_param.c_5[0]), inv_param.mq1, inv_param.mq2, inv_param.mq3, inv_param.eofa_pm, inv_param.eofa_shift, gauge_param.cpu_prec); break; case dslash_test_type::Mat: - mdw_eofa_mat(spinorRef.V(), hostGauge, spinor.V(), inv_param.dagger, gauge_param.cpu_prec, gauge_param, + mdw_eofa_mat(spinorRef.data(), hostGauge, spinor.data(), inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.m5, (__real__ inv_param.b_5[0]), (__real__ inv_param.c_5[0]), inv_param.mq1, inv_param.mq2, inv_param.mq3, inv_param.eofa_pm, inv_param.eofa_shift); break; case dslash_test_type::MatDagMat: - mdw_eofa_mat(spinorTmp.V(), hostGauge, spinor.V(), inv_param.dagger, gauge_param.cpu_prec, gauge_param, + mdw_eofa_mat(spinorTmp.data(), hostGauge, spinor.data(), inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.m5, (__real__ inv_param.b_5[0]), (__real__ inv_param.c_5[0]), inv_param.mq1, inv_param.mq2, inv_param.mq3, inv_param.eofa_pm, inv_param.eofa_shift); - mdw_eofa_mat(spinorRef.V(), hostGauge, spinorTmp.V(), not_dagger, gauge_param.cpu_prec, gauge_param, + mdw_eofa_mat(spinorRef.data(), hostGauge, spinorTmp.data(), not_dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.m5, (__real__ inv_param.b_5[0]), (__real__ inv_param.c_5[0]), inv_param.mq1, inv_param.mq2, inv_param.mq3, inv_param.eofa_pm, inv_param.eofa_shift); break; case dslash_test_type::MatPC: - mdw_eofa_matpc(spinorRef.V(), hostGauge, spinor.V(), inv_param.matpc_type, inv_param.dagger, + mdw_eofa_matpc(spinorRef.data(), hostGauge, spinor.data(), 
inv_param.matpc_type, inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.m5, (__real__ inv_param.b_5[0]), (__real__ inv_param.c_5[0]), inv_param.mq1, inv_param.mq2, inv_param.mq3, inv_param.eofa_pm, inv_param.eofa_shift); break; case dslash_test_type::MatPCDagMatPC: - mdw_eofa_matpc(spinorTmp.V(), hostGauge, spinor.V(), inv_param.matpc_type, inv_param.dagger, + mdw_eofa_matpc(spinorTmp.data(), hostGauge, spinor.data(), inv_param.matpc_type, inv_param.dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.m5, (__real__ inv_param.b_5[0]), (__real__ inv_param.c_5[0]), inv_param.mq1, inv_param.mq2, inv_param.mq3, inv_param.eofa_pm, inv_param.eofa_shift); - mdw_eofa_matpc(spinorRef.V(), hostGauge, spinorTmp.V(), inv_param.matpc_type, not_dagger, gauge_param.cpu_prec, + mdw_eofa_matpc(spinorRef.data(), hostGauge, spinorTmp.data(), inv_param.matpc_type, not_dagger, gauge_param.cpu_prec, gauge_param, inv_param.mass, inv_param.m5, (__real__ inv_param.b_5[0]), (__real__ inv_param.c_5[0]), inv_param.mq1, inv_param.mq2, inv_param.mq3, inv_param.eofa_pm, inv_param.eofa_shift); @@ -764,8 +764,8 @@ struct DslashTestWrapper { std::vector _hp_x(inv_param.num_src); std::vector _hp_b(inv_param.num_src); for (int i = 0; i < inv_param.num_src; i++) { - _hp_x[i] = vp_spinorOut[i].V(); - _hp_b[i] = vp_spinor[i].V(); + _hp_x[i] = vp_spinorOut[i].data(); + _hp_b[i] = vp_spinor[i].data(); } if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH @@ -786,21 +786,21 @@ struct DslashTestWrapper { switch (dtest_type) { case dslash_test_type::Dslash: if (transfer) { - dslashQuda_4dpc(spinorOut.V(), spinor.V(), &inv_param, parity, dtest_type); + dslashQuda_4dpc(spinorOut.data(), spinor.data(), &inv_param, parity, dtest_type); } else { static_cast(dirac)->Dslash4(cudaSpinorOut, cudaSpinor, parity); } break; case dslash_test_type::M5: if (transfer) { - dslashQuda_4dpc(spinorOut.V(), spinor.V(), &inv_param, parity, 
dtest_type); + dslashQuda_4dpc(spinorOut.data(), spinor.data(), &inv_param, parity, dtest_type); } else { static_cast(dirac)->Dslash5(cudaSpinorOut, cudaSpinor); } break; case dslash_test_type::M5inv: if (transfer) { - dslashQuda_4dpc(spinorOut.V(), spinor.V(), &inv_param, parity, dtest_type); + dslashQuda_4dpc(spinorOut.data(), spinor.data(), &inv_param, parity, dtest_type); } else { static_cast(dirac)->M5inv(cudaSpinorOut, cudaSpinor); } @@ -808,7 +808,7 @@ struct DslashTestWrapper { case dslash_test_type::MatPC: case dslash_test_type::Mat: if (transfer) { - MatQuda(spinorOut.V(), spinor.V(), &inv_param); + MatQuda(spinorOut.data(), spinor.data(), &inv_param); } else { dirac->M(cudaSpinorOut, cudaSpinor); } @@ -816,7 +816,7 @@ struct DslashTestWrapper { case dslash_test_type::MatPCDagMatPC: case dslash_test_type::MatDagMat: if (transfer) { - MatDagMatQuda(spinorOut.V(), spinor.V(), &inv_param); + MatDagMatQuda(spinorOut.data(), spinor.data(), &inv_param); } else { dirac->MdagM(cudaSpinorOut, cudaSpinor); } @@ -828,28 +828,28 @@ struct DslashTestWrapper { switch (dtest_type) { case dslash_test_type::Dslash: if (transfer) { - dslashQuda_mdwf(spinorOut.V(), spinor.V(), &inv_param, parity, dtest_type); + dslashQuda_mdwf(spinorOut.data(), spinor.data(), &inv_param, parity, dtest_type); } else { static_cast(dirac)->Dslash4(cudaSpinorOut, cudaSpinor, parity); } break; case dslash_test_type::M5: if (transfer) { - dslashQuda_mdwf(spinorOut.V(), spinor.V(), &inv_param, parity, dtest_type); + dslashQuda_mdwf(spinorOut.data(), spinor.data(), &inv_param, parity, dtest_type); } else { static_cast(dirac)->Dslash5(cudaSpinorOut, cudaSpinor); } break; case dslash_test_type::Dslash4pre: if (transfer) { - dslashQuda_mdwf(spinorOut.V(), spinor.V(), &inv_param, parity, dtest_type); + dslashQuda_mdwf(spinorOut.data(), spinor.data(), &inv_param, parity, dtest_type); } else { static_cast(dirac)->Dslash4pre(cudaSpinorOut, cudaSpinor); } break; case dslash_test_type::M5inv: if (transfer) 
{ - dslashQuda_mdwf(spinorOut.V(), spinor.V(), &inv_param, parity, dtest_type); + dslashQuda_mdwf(spinorOut.data(), spinor.data(), &inv_param, parity, dtest_type); } else { static_cast(dirac)->M5inv(cudaSpinorOut, cudaSpinor); } @@ -857,7 +857,7 @@ struct DslashTestWrapper { case dslash_test_type::MatPC: case dslash_test_type::Mat: if (transfer) { - MatQuda(spinorOut.V(), spinor.V(), &inv_param); + MatQuda(spinorOut.data(), spinor.data(), &inv_param); } else { dirac->M(cudaSpinorOut, cudaSpinor); } @@ -865,7 +865,7 @@ struct DslashTestWrapper { case dslash_test_type::MatPCDagMatPC: case dslash_test_type::MatDagMat: if (transfer) { - MatDagMatQuda(spinorOut.V(), spinor.V(), &inv_param); + MatDagMatQuda(spinorOut.data(), spinor.data(), &inv_param); } else { dirac->MdagM(cudaSpinorOut, cudaSpinor); } @@ -940,13 +940,13 @@ struct DslashTestWrapper { case dslash_test_type::Dslash: if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { if (transfer) { - dslashQuda(spinorOut.V(), spinor.V(), &inv_param, parity); + dslashQuda(spinorOut.data(), spinor.data(), &inv_param, parity); } else { dirac->Dslash(cudaSpinorOut, cudaSpinor, parity); } } else { if (transfer) { - dslashQuda(spinorOut.V(), spinor.V(), &inv_param, parity); + dslashQuda(spinorOut.data(), spinor.data(), &inv_param, parity); } else { dirac->Dslash(cudaSpinorOut, cudaSpinor, parity); } @@ -955,7 +955,7 @@ struct DslashTestWrapper { case dslash_test_type::MatPC: case dslash_test_type::Mat: if (transfer) { - MatQuda(spinorOut.V(), spinor.V(), &inv_param); + MatQuda(spinorOut.data(), spinor.data(), &inv_param); } else { dirac->M(cudaSpinorOut, cudaSpinor); } @@ -963,7 +963,7 @@ struct DslashTestWrapper { case dslash_test_type::MatPCDagMatPC: case dslash_test_type::MatDagMat: if (transfer) { - MatDagMatQuda(spinorOut.V(), spinor.V(), &inv_param); + MatDagMatQuda(spinorOut.data(), spinor.data(), &inv_param); } else { dirac->MdagM(cudaSpinorOut, cudaSpinor); } diff --git a/tests/eigensolve_test.cpp 
b/tests/eigensolve_test.cpp index e22879ff92..7c17540a60 100644 --- a/tests/eigensolve_test.cpp +++ b/tests/eigensolve_test.cpp @@ -179,7 +179,7 @@ std::vector eigensolve(test_t test_param) // Allocate host side memory and pointers for (int i = 0; i < n_eig; i++) { evecs[i] = quda::ColorSpinorField(cs_param); - host_evecs_ptr[i] = evecs[i].V(); + host_evecs_ptr[i] = evecs[i].data(); } // Complex eigenvalues @@ -208,12 +208,12 @@ std::vector eigensolve(test_t test_param) for (int i = 0; i < eig_n_conv; i++) { if (eig_param.compute_svd == QUDA_BOOLEAN_TRUE) { double _Complex sigma = evals[i]; - residua[i] = verifyWilsonTypeSingularVector(evecs[i].V(), evecs[i + eig_n_conv].V(), sigma, i, gauge_param, + residua[i] = verifyWilsonTypeSingularVector(evecs[i].data(), evecs[i + eig_n_conv].data(), sigma, i, gauge_param, eig_param, gauge.data(), clover.data(), clover_inv.data()); } else { double _Complex lambda = evals[i]; - residua[i] = verifyWilsonTypeEigenvector(evecs[i].V(), lambda, i, gauge_param, eig_param, gauge.data(), + residua[i] = verifyWilsonTypeEigenvector(evecs[i].data(), lambda, i, gauge_param, eig_param, gauge.data(), clover.data(), clover_inv.data()); } } diff --git a/tests/host_reference/covdev_reference.cpp b/tests/host_reference/covdev_reference.cpp index 97dae09402..05a8fe839b 100644 --- a/tests/host_reference/covdev_reference.cpp +++ b/tests/host_reference/covdev_reference.cpp @@ -87,10 +87,10 @@ void Mat(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField & { // full dslash operator void *data[4] = {link.data(0), link.data(1), link.data(2), link.data(3)}; - covdevReference(reinterpret_cast(out.Odd().V()), reinterpret_cast(data), - reinterpret_cast(in.Even().V()), 1, daggerBit, mu); - covdevReference(reinterpret_cast(out.Even().V()), reinterpret_cast(data), - reinterpret_cast(in.Odd().V()), 0, daggerBit, mu); + covdevReference(reinterpret_cast(out.Odd().data()), reinterpret_cast(data), + reinterpret_cast(in.Even().data()), 1, 
daggerBit, mu); + covdevReference(reinterpret_cast(out.Even().data()), reinterpret_cast(data), + reinterpret_cast(in.Odd().data()), 0, daggerBit, mu); } void mat(ColorSpinorField &out, const GaugeField &link, const ColorSpinorField &in, int dagger_bit, int mu) @@ -178,7 +178,7 @@ void covdevReference_mg4dir(sFloat *res, gFloat **link, gFloat **ghostLink, cons int offset = spinor_site_size * sid; gFloat *lnk = gaugeLink_mg4dir(sid, mu, oddBit, linkEven, linkOdd, ghostLinkEven, ghostLinkOdd, 1, 1); - const sFloat *spinor = spinorNeighbor_mg4dir(sid, mu, oddBit, static_cast(in.V()), fwd_nbr_spinor, + const sFloat *spinor = spinorNeighbor_mg4dir(sid, mu, oddBit, static_cast(in.data()), fwd_nbr_spinor, back_nbr_spinor, 1, 1); sFloat gaugedSpinor[spinor_site_size]; @@ -212,15 +212,15 @@ void covdev_dslash_mg4dir(ColorSpinorField &out, const GaugeField &link, const C if (sPrecision == QUDA_DOUBLE_PRECISION) { if (gPrecision == QUDA_DOUBLE_PRECISION) { - covdevReference_mg4dir((double *)out.V(), reinterpret_cast(data), (double **)ghostLink, in, oddBit, daggerBit, mu); + covdevReference_mg4dir((double *)out.data(), reinterpret_cast(data), (double **)ghostLink, in, oddBit, daggerBit, mu); } else { - covdevReference_mg4dir((double *)out.V(), reinterpret_cast(data), (float **)ghostLink, in, oddBit, daggerBit, mu); + covdevReference_mg4dir((double *)out.data(), reinterpret_cast(data), (float **)ghostLink, in, oddBit, daggerBit, mu); } } else { if (gPrecision == QUDA_DOUBLE_PRECISION) { - covdevReference_mg4dir((float *)out.V(), reinterpret_cast(data), (double **)ghostLink, in, oddBit, daggerBit, mu); + covdevReference_mg4dir((float *)out.data(), reinterpret_cast(data), (double **)ghostLink, in, oddBit, daggerBit, mu); } else { - covdevReference_mg4dir((float *)out.V(), reinterpret_cast(data), (float **)ghostLink, in, oddBit, daggerBit, mu); + covdevReference_mg4dir((float *)out.data(), reinterpret_cast(data), (float **)ghostLink, in, oddBit, daggerBit, mu); } } } @@ -237,7 
+237,7 @@ void Mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinor auto &outOdd = out.Odd(); inEven.exchangeGhost(QUDA_EVEN_PARITY, nFace, daggerBit); - covdevReference_mg4dir(reinterpret_cast(outOdd.V()), reinterpret_cast(data), + covdevReference_mg4dir(reinterpret_cast(outOdd.data()), reinterpret_cast(data), reinterpret_cast(ghostLink), in.Even(), 1, daggerBit, mu); } @@ -246,7 +246,7 @@ void Mat_mg4dir(ColorSpinorField &out, const GaugeField &link, const ColorSpinor auto &outEven = out.Even(); inOdd.exchangeGhost(QUDA_ODD_PARITY, nFace, daggerBit); - covdevReference_mg4dir(reinterpret_cast(outEven.V()), reinterpret_cast(data), + covdevReference_mg4dir(reinterpret_cast(outEven.data()), reinterpret_cast(data), reinterpret_cast(ghostLink), in.Odd(), 0, daggerBit, mu); } } diff --git a/tests/host_reference/dslash_reference.cpp b/tests/host_reference/dslash_reference.cpp index 907a857824..65af1a6680 100644 --- a/tests/host_reference/dslash_reference.cpp +++ b/tests/host_reference/dslash_reference.cpp @@ -766,10 +766,10 @@ double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorFi QUDA_DAG_YES, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type); if (dslash_type == QUDA_LAPLACE_DSLASH) { - xpay(out.V(), kappa, ref.V(), ref.Length(), gauge_param.cpu_prec); - ax(0.5 / kappa, ref.V(), ref.Length(), gauge_param.cpu_prec); + xpay(out.data(), kappa, ref.data(), ref.Length(), gauge_param.cpu_prec); + ax(0.5 / kappa, ref.data(), ref.Length(), gauge_param.cpu_prec); } else { - axpy(2 * mass, out.V(), ref.V(), ref.Length(), gauge_param.cpu_prec); + axpy(2 * mass, out.data(), ref.data(), ref.Length(), gauge_param.cpu_prec); } break; @@ -791,9 +791,9 @@ double verifyStaggeredInversion(quda::ColorSpinorField &tmp, quda::ColorSpinorFi len = Vh; } - mxpy(in.V(), ref.V(), len * stag_spinor_site_size, inv_param.cpu_prec); - double nrm2 = norm_2(ref.V(), len * stag_spinor_site_size, inv_param.cpu_prec); - double src2 = norm_2(in.V(), len 
* stag_spinor_site_size, inv_param.cpu_prec); + mxpy(in.data(), ref.data(), len * stag_spinor_site_size, inv_param.cpu_prec); + double nrm2 = norm_2(ref.data(), len * stag_spinor_site_size, inv_param.cpu_prec); + double src2 = norm_2(in.data(), len * stag_spinor_site_size, inv_param.cpu_prec); double hqr = sqrt(quda::blas::HeavyQuarkResidualNorm(out, ref).z); double l2r = sqrt(nrm2 / src2); diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp index 6fbdf91c42..04fc5d035e 100644 --- a/tests/host_reference/staggered_dslash_reference.cpp +++ b/tests/host_reference/staggered_dslash_reference.cpp @@ -143,22 +143,22 @@ void staggeredDslash(ColorSpinorField &out, void *const *fatlink, void *const *l if (sPrecision == QUDA_DOUBLE_PRECISION) { if (gPrecision == QUDA_DOUBLE_PRECISION) { - staggeredDslashReference((double *)out.V(), (double **)fatlink, (double **)longlink, (double **)ghost_fatlink, - (double **)ghost_longlink, (double *)in.V(), (double **)fwd_nbr_spinor, + staggeredDslashReference((double *)out.data(), (double **)fatlink, (double **)longlink, (double **)ghost_fatlink, + (double **)ghost_longlink, (double *)in.data(), (double **)fwd_nbr_spinor, (double **)back_nbr_spinor, oddBit, daggerBit, dslash_type); } else { - staggeredDslashReference((double *)out.V(), (float **)fatlink, (float **)longlink, (float **)ghost_fatlink, - (float **)ghost_longlink, (double *)in.V(), (double **)fwd_nbr_spinor, + staggeredDslashReference((double *)out.data(), (float **)fatlink, (float **)longlink, (float **)ghost_fatlink, + (float **)ghost_longlink, (double *)in.data(), (double **)fwd_nbr_spinor, (double **)back_nbr_spinor, oddBit, daggerBit, dslash_type); } } else { if (gPrecision == QUDA_DOUBLE_PRECISION) { - staggeredDslashReference((float *)out.V(), (double **)fatlink, (double **)longlink, (double **)ghost_fatlink, - (double **)ghost_longlink, (float *)in.V(), (float **)fwd_nbr_spinor, + 
staggeredDslashReference((float *)out.data(), (double **)fatlink, (double **)longlink, (double **)ghost_fatlink, + (double **)ghost_longlink, (float *)in.data(), (float **)fwd_nbr_spinor, (float **)back_nbr_spinor, oddBit, daggerBit, dslash_type); } else { - staggeredDslashReference((float *)out.V(), (float **)fatlink, (float **)longlink, (float **)ghost_fatlink, - (float **)ghost_longlink, (float *)in.V(), (float **)fwd_nbr_spinor, + staggeredDslashReference((float *)out.data(), (float **)fatlink, (float **)longlink, (float **)ghost_fatlink, + (float **)ghost_longlink, (float *)in.data(), (float **)fwd_nbr_spinor, (float **)back_nbr_spinor, oddBit, daggerBit, dslash_type); } } @@ -189,8 +189,8 @@ void staggeredMatDagMat(ColorSpinorField &out, void *const *fatlink, void *const double msq_x4 = mass * mass * 4; if (sPrecision == QUDA_DOUBLE_PRECISION) { - axmy((double *)in.V(), (double)msq_x4, (double *)out.V(), Vh * stag_spinor_site_size); + axmy((double *)in.data(), (double)msq_x4, (double *)out.data(), Vh * stag_spinor_site_size); } else { - axmy((float *)in.V(), (float)msq_x4, (float *)out.V(), Vh * stag_spinor_site_size); + axmy((float *)in.data(), (float)msq_x4, (float *)out.data(), Vh * stag_spinor_site_size); } } diff --git a/tests/invert_test.cpp b/tests/invert_test.cpp index 2f29a3de08..6bcf0bc380 100644 --- a/tests/invert_test.cpp +++ b/tests/invert_test.cpp @@ -248,7 +248,7 @@ std::vector solve(test_t param) // Allocate memory and set pointers for (int n = 0; n < Nsrc; n++) { out_multishift[n * multishift + i] = quda::ColorSpinorField(cs_param); - _hp_multi_x[n][i] = out_multishift[n * multishift + i].V(); + _hp_multi_x[n][i] = out_multishift[n * multishift + i].data(); } } } @@ -273,9 +273,9 @@ std::vector solve(test_t param) if (inv_deflate) eig_param.preserve_deflation = i < Nsrc - 1 ? 
QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; // Perform QUDA inversions if (multishift > 1) { - invertMultiShiftQuda(_hp_multi_x[i].data(), in[i].V(), &inv_param); + invertMultiShiftQuda(_hp_multi_x[i].data(), in[i].data(), &inv_param); } else { - invertQuda(out[i].V(), in[i].V(), &inv_param); + invertQuda(out[i].data(), in[i].data(), &inv_param); } time[i] = inv_param.secs; @@ -292,8 +292,8 @@ std::vector solve(test_t param) std::vector _hp_x(Nsrc); std::vector _hp_b(Nsrc); for (int i = 0; i < Nsrc; i++) { - _hp_x[i] = out[i].V(); - _hp_b[i] = in[i].V(); + _hp_x[i] = out[i].data(); + _hp_b[i] = in[i].data(); } // Run split grid if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH @@ -326,7 +326,7 @@ std::vector solve(test_t param) // Perform host side verification of inversion if requested if (verify_results) { for (int i = 0; i < Nsrc; i++) { - res[i] = verifyInversion(out[i].V(), _hp_multi_x[i].data(), in[i].V(), check.V(), gauge_param, inv_param, + res[i] = verifyInversion(out[i].data(), _hp_multi_x[i].data(), in[i].data(), check.data(), gauge_param, inv_param, gauge.data(), clover.data(), clover_inv.data()); } } diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h index 5ee2616ad8..5cae0d80c2 100644 --- a/tests/staggered_dslash_test_utils.h +++ b/tests/staggered_dslash_test_utils.h @@ -103,9 +103,9 @@ struct StaggeredDslashTestWrapper { staggeredDslash(spinorRef.Odd(), qdp_fatlink_cpu, qdp_longlink_cpu, ghost_fatlink_cpu, ghost_longlink_cpu, spinor.Even(), QUDA_ODD_PARITY, !dagger, inv_param.cpu_prec, gauge_param.cpu_prec, dslash_type); if (dslash_type == QUDA_LAPLACE_DSLASH) { - xpay(spinor.V(), kappa, spinorRef.V(), spinor.Length(), gauge_param.cpu_prec); + xpay(spinor.data(), kappa, spinorRef.data(), spinor.Length(), gauge_param.cpu_prec); } else { - axpy(2 * mass, spinor.V(), spinorRef.V(), spinor.Length(), gauge_param.cpu_prec); + axpy(2 * mass, spinor.data(), spinorRef.data(), 
spinor.Length(), gauge_param.cpu_prec); } break; default: errorQuda("Test type %d not defined", static_cast(dtest_type)); @@ -364,8 +364,8 @@ struct StaggeredDslashTestWrapper { std::vector _hp_x(inv_param.num_src); std::vector _hp_b(inv_param.num_src); for (int i = 0; i < inv_param.num_src; i++) { - _hp_x[i] = vp_spinor_out[i].V(); - _hp_b[i] = vp_spinor[i].V(); + _hp_x[i] = vp_spinor_out[i].data(); + _hp_b[i] = vp_spinor[i].data(); } dslashMultiSrcStaggeredQuda(_hp_x.data(), _hp_b.data(), &inv_param, parity, milc_fatlink_gpu, milc_longlink_gpu, &gauge_param); diff --git a/tests/staggered_gsmear_test_utils.h b/tests/staggered_gsmear_test_utils.h index 7266844798..b9adfe4361 100644 --- a/tests/staggered_gsmear_test_utils.h +++ b/tests/staggered_gsmear_test_utils.h @@ -134,11 +134,11 @@ struct StaggeredGSmearTestWrapper { // &gauge_param, &inv_param, 1, smear_coeff, smear_t0, gauge_param.cpu_prec); // blas::xpay(*tmp2, -1.0, *spinorRef); - xpay(tmp2.Even().V(), -1.0, spinorRef.Even().V(), spinor.Even().Length(), gauge_param.cpu_prec); - xpay(tmp2.Odd().V(), -1.0, spinorRef.Odd().V(), spinor.Odd().Length(), gauge_param.cpu_prec); + xpay(tmp2.Even().data(), -1.0, spinorRef.Even().data(), spinor.Even().Length(), gauge_param.cpu_prec); + xpay(tmp2.Odd().data(), -1.0, spinorRef.Odd().data(), spinor.Odd().Length(), gauge_param.cpu_prec); // - memset(tmp2.Even().V(), 0, spinor.Even().Length() * gauge_param.cpu_prec); - memset(tmp2.Odd().V(), 0, spinor.Odd().Length() * gauge_param.cpu_prec); + memset(tmp2.Even().data(), 0, spinor.Even().Length() * gauge_param.cpu_prec); + memset(tmp2.Odd().data(), 0, spinor.Odd().Length() * gauge_param.cpu_prec); } break; } @@ -327,7 +327,7 @@ struct StaggeredGSmearTestWrapper { // qsm_param.delete_2link = smear_delete_two_link; qsm_param.t0 = smear_t0; - performTwoLinkGaussianSmearNStep(spinor.V(), &qsm_param); + performTwoLinkGaussianSmearNStep(spinor.data(), &qsm_param); quda_gflops = qsm_param.gflops; diff --git 
a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index 87c574d974..1fab095147 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -335,7 +335,7 @@ int main(int argc, char **argv) if (!use_split_grid) { for (int k = 0; k < Nsrc; k++) { if (inv_deflate) eig_param.preserve_deflation = k < Nsrc - 1 ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - invertQuda(out[k]->V(), in[k]->V(), &inv_param); + invertQuda(out[k]->data(), in[k]->data(), &inv_param); time[k] = inv_param.secs; gflops[k] = inv_param.gflops / inv_param.secs; iter[k] = inv_param.iter; @@ -346,8 +346,8 @@ int main(int argc, char **argv) std::vector _hp_x(Nsrc); std::vector _hp_b(Nsrc); for (int k = 0; k < Nsrc; k++) { - _hp_x[k] = out[k]->V(); - _hp_b[k] = in[k]->V(); + _hp_x[k] = out[k]->data(); + _hp_b[k] = in[k]->data(); } inv_param.num_src = Nsrc; inv_param.num_src_per_sub_partition = Nsrc / num_sub_partition; @@ -389,12 +389,12 @@ int main(int argc, char **argv) inv_param.tol_hq_offset[i] = inv_param.tol_hq; // Allocate memory and set pointers qudaOutArray[i] = ColorSpinorField::Create(cs_param); - outArray[i] = qudaOutArray[i]->V(); + outArray[i] = qudaOutArray[i]->data(); } for (int k = 0; k < Nsrc; k++) { quda::spinorNoise(*in[k], *rng, QUDA_NOISE_UNIFORM); - invertMultiShiftQuda((void **)outArray, in[k]->V(), &inv_param); + invertMultiShiftQuda((void **)outArray, in[k]->data(), &inv_param); time[k] = inv_param.secs; gflops[k] = inv_param.gflops / inv_param.secs; diff --git a/tests/utils/staggered_host_utils.cpp b/tests/utils/staggered_host_utils.cpp index 365781c7d0..cc9148fca5 100644 --- a/tests/utils/staggered_host_utils.cpp +++ b/tests/utils/staggered_host_utils.cpp @@ -462,12 +462,12 @@ void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &out, void *qdp_twolnk if (prec == QUDA_DOUBLE_PRECISION) { { - staggeredTwoLinkGaussianSmear((double *)out.V(), (double **)qdp_twolnk, (double **)ghost, (double *)in.V(), + 
staggeredTwoLinkGaussianSmear((double *)out.data(), (double **)qdp_twolnk, (double **)ghost, (double *)in.data(), (double **)fwd_nbr_spinor, (double **)back_nbr_spinor, t0, oddBit); } } else { { - staggeredTwoLinkGaussianSmear((float *)out.V(), (float **)qdp_twolnk, (float **)ghost, (float *)in.V(), + staggeredTwoLinkGaussianSmear((float *)out.data(), (float **)qdp_twolnk, (float **)ghost, (float *)in.data(), (float **)fwd_nbr_spinor, (float **)back_nbr_spinor, t0, oddBit); } } From 838ff4f711ead8efd538879d7760b76d0094f3dd Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 31 May 2023 23:32:11 -0700 Subject: [PATCH 25/99] Fix clang warning --- include/quda_api.h | 2 +- include/tune_quda.h | 14 ++++++-------- lib/tune.cpp | 12 +----------- 3 files changed, 8 insertions(+), 20 deletions(-) diff --git a/include/quda_api.h b/include/quda_api.h index b3b9f35b69..9feea16297 100644 --- a/include/quda_api.h +++ b/include/quda_api.h @@ -23,7 +23,7 @@ enum qudaMemcpyKind { namespace quda { - class TuneParam; + struct TuneParam; struct qudaStream_t { int idx; diff --git a/include/tune_quda.h b/include/tune_quda.h index 2aacde55f7..ff99826149 100644 --- a/include/tune_quda.h +++ b/include/tune_quda.h @@ -18,17 +18,15 @@ namespace quda { struct TuneParam { - - public: - dim3 block; + dim3 block = {1, 1, 1}; dim3 grid; - unsigned int shared_bytes; - bool set_max_shared_bytes; // whether to opt in to max shared bytes per thread block - int4 aux; // free parameter that can be used as an arbitrary autotuning dimension outside of launch parameters + unsigned int shared_bytes = 0; + bool set_max_shared_bytes = false; // whether to opt in to max shared bytes per thread block + int4 aux = {1, 1, 1, 1}; // free parameter that can be used as an arbitrary autotuning dimension outside of launch parameters std::string comment; - float time; - long long n_calls; + float time = FLT_MAX; + long long n_calls = 0; TuneParam(); TuneParam(const TuneParam &) = default; diff --git 
a/lib/tune.cpp b/lib/tune.cpp index 1d6971db3c..fea2a7b509 100644 --- a/lib/tune.cpp +++ b/lib/tune.cpp @@ -644,17 +644,7 @@ namespace quda } } - TuneParam::TuneParam() : - block(device::warp_size(), 1, 1), - grid(1, 1, 1), - shared_bytes(0), - set_max_shared_bytes(false), - aux(), - time(FLT_MAX), - n_calls(0) - { - aux = make_int4(1, 1, 1, 1); - } + TuneParam::TuneParam() : block(device::warp_size(), 1, 1) { } std::ostream &operator<<(std::ostream &output, const TuneParam &param) { From 9aa20ce752829c4f9093680e4cbba9b8fefd9d3e Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 1 Jun 2023 09:29:35 -0700 Subject: [PATCH 26/99] Remove std::move on temporary quda_ptr objects since this prevents the compiler from doing copy elision --- lib/clover_field.cpp | 10 +++++----- lib/color_spinor_field.cpp | 6 +++--- lib/gauge_field.cpp | 20 ++++++++++---------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/lib/clover_field.cpp b/lib/clover_field.cpp index e91600b6c3..0d859c4fdc 100644 --- a/lib/clover_field.cpp +++ b/lib/clover_field.cpp @@ -72,18 +72,18 @@ namespace quda { if (bytes) { if (create != QUDA_REFERENCE_FIELD_CREATE) { - clover = std::move(quda_ptr(mem_type, bytes)); + clover = quda_ptr(mem_type, bytes); } else { - clover = std::move(quda_ptr(param.clover, mem_type)); + clover = quda_ptr(param.clover, mem_type); } total_bytes += bytes; if (inverse) { if (create != QUDA_REFERENCE_FIELD_CREATE) { - cloverInv = std::move(quda_ptr(mem_type, bytes)); + cloverInv = quda_ptr(mem_type, bytes); } else { - cloverInv = std::move(quda_ptr(param.cloverInv, mem_type)); + cloverInv = quda_ptr(param.cloverInv, mem_type); } total_bytes += bytes; @@ -114,7 +114,7 @@ namespace quda { { if (backup_h.size()) errorQuda("Already allocated host backup"); backup_h.resize(2); - for (auto &b : backup_h) b = std::move(quda_ptr(QUDA_MEMORY_HOST, bytes)); + for (auto &b : backup_h) b = quda_ptr(QUDA_MEMORY_HOST, bytes); backup(false); if (inverse) backup(true); diff --git
a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp index b26897948b..a76a29b0eb 100644 --- a/lib/color_spinor_field.cpp +++ b/lib/color_spinor_field.cpp @@ -148,10 +148,10 @@ namespace quda errorQuda("Subset not implemented"); if (param.create != QUDA_REFERENCE_FIELD_CREATE && param.create != QUDA_GHOST_FIELD_CREATE) { - v = std::move(quda_ptr(mem_type, bytes)); + v = quda_ptr(mem_type, bytes); alloc = true; } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) { - v = std::move(quda_ptr(param.v, mem_type)); + v = quda_ptr(param.v, mem_type); reference = true; } else if (param.create == QUDA_GHOST_FIELD_CREATE) { ghost_only = true; @@ -1480,7 +1480,7 @@ namespace quda { if (backup_h.size()) errorQuda("ColorSpinorField already backed up"); backup_h.resize(1); - backup_h[0] = std::move(quda_ptr(QUDA_MEMORY_HOST, bytes)); + backup_h[0] = quda_ptr(QUDA_MEMORY_HOST, bytes); qudaMemcpy(backup_h[0], v, bytes, qudaMemcpyDefault); } diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp index 51d5b59a47..d1700709fc 100644 --- a/lib/gauge_field.cpp +++ b/lib/gauge_field.cpp @@ -169,18 +169,18 @@ namespace quda { if (isNative()) { if (param.create != QUDA_REFERENCE_FIELD_CREATE) { - gauge = std::move(quda_ptr(mem_type, bytes)); + gauge = quda_ptr(mem_type, bytes); } else { - gauge = std::move(quda_ptr(param.gauge, mem_type)); + gauge = quda_ptr(param.gauge, mem_type); } } else if (is_pointer_array(order)) { size_t nbytes = volume * nInternal * precision; for (int d = 0; d < site_dim; d++) { if (param.create != QUDA_REFERENCE_FIELD_CREATE) { - gauge_array[d] = std::move(quda_ptr(mem_type, nbytes)); + gauge_array[d] = quda_ptr(mem_type, nbytes); } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) { - gauge_array[d] = std::move(quda_ptr(static_cast(param.gauge)[d], mem_type)); + gauge_array[d] = quda_ptr(static_cast(param.gauge)[d], mem_type); } else { errorQuda("Unsupported creation type %d", param.create); } @@ -196,9 +196,9 @@ namespace quda { } if (param.create 
!= QUDA_REFERENCE_FIELD_CREATE) { - gauge = std::move(quda_ptr(mem_type, bytes)); + gauge = quda_ptr(mem_type, bytes); } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) { - gauge = std::move(quda_ptr(param.gauge, mem_type)); + gauge = quda_ptr(param.gauge, mem_type); } else { errorQuda("Unsupported creation type %d", param.create); } @@ -211,8 +211,8 @@ namespace quda { if (!isNative()) { for (int i=0; i Date: Fri, 2 Jun 2023 13:09:54 -0700 Subject: [PATCH 27/99] Move quda_ptr to its own file, and make it generic --- include/malloc_quda.h | 75 ----------------- include/quda_api.h | 2 +- include/quda_ptr.h | 78 ++++++++++++++++++ lib/CMakeLists.txt | 2 +- lib/quda_ptr.cpp | 157 ++++++++++++++++++++++++++++++++++++ lib/targets/cuda/malloc.cpp | 151 ---------------------------------- 6 files changed, 237 insertions(+), 228 deletions(-) create mode 100644 include/quda_ptr.h create mode 100644 lib/quda_ptr.cpp diff --git a/include/malloc_quda.h b/include/malloc_quda.h index 8cbc2fbb47..05a36fcd77 100644 --- a/include/malloc_quda.h +++ b/include/malloc_quda.h @@ -172,78 +172,3 @@ namespace quda { #define pool_device_free(ptr) quda::pool::device_free_(__func__, __FILE__, __LINE__, ptr) #define pool_pinned_malloc(size) quda::pool::pinned_malloc_(__func__, __FILE__, __LINE__, size) #define pool_pinned_free(ptr) quda::pool::pinned_free_(__func__, __FILE__, __LINE__, ptr) - -namespace quda { - - /** - Object that stores a memory allocation with different views for - host or device. 
Depending on the nature of the underlying memory - type, both views may not be defined - - type defined views - QUDA_MEMORY_DEVICE device only - QUDA_MEMORY_DEVICE_PINNED device only - QUDA_MEMORY_HOST host only - QUDA_MEMORY_HOST_PINNED both - QUDA_MEMORY_MAPPED both (pinned to host) - QUDA_MEMORY_MANAGED both - */ - class quda_ptr { - QudaMemoryType type = QUDA_MEMORY_INVALID; - size_t size = 0; - bool pool = false; - void *device = nullptr; - void *host = nullptr; - - public: - quda_ptr() = default; - quda_ptr(quda_ptr &&) = default; - quda_ptr &operator=(quda_ptr &&); - - /** - @brief Constructor for quda_ptr - @param[in] type The memory type of the allocation - @param[in] size The size of the allocation - @param[in] pool Whether the allocation should be in the memory pool (default is true) - */ - quda_ptr(QudaMemoryType type, size_t size, bool pool = true); - - /** - @brief Constructor for quda_ptr where we are wrapping a non-owned pointer - @param[in] ptr Raw base pointer - @param[in] type The memory type of the allocation - */ - quda_ptr(void *ptr, QudaMemoryType type); - - /** - @brief Destructor for the quda_ptr - */ - virtual ~quda_ptr(); - - /** - @return Returns true if allocation is visible to the device - */ - bool is_device() const; - - /** - @return Returns true if allocation is visible to the host - */ - bool is_host() const; - - /** - Return view of the pointer. For mapped memory we return the device view. 
- */ - void *data() const; - - /** - Return the device view of the pointer - */ - void *data_device() const; - - /** - Return the host view of the pointer - */ - void *data_host() const; - }; - -} diff --git a/include/quda_api.h b/include/quda_api.h index 9feea16297..becec68c8b 100644 --- a/include/quda_api.h +++ b/include/quda_api.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include /** @file quda_api.h diff --git a/include/quda_ptr.h b/include/quda_ptr.h new file mode 100644 index 0000000000..3e829f310f --- /dev/null +++ b/include/quda_ptr.h @@ -0,0 +1,78 @@ +#pragma once + +#include "malloc_quda.h" + +namespace quda { + + /** + Object that stores a memory allocation with different views for + host or device. Depending on the nature of the underlying memory + type, both views may not be defined + + type defined views + QUDA_MEMORY_DEVICE device only + QUDA_MEMORY_DEVICE_PINNED device only + QUDA_MEMORY_HOST host only + QUDA_MEMORY_HOST_PINNED both + QUDA_MEMORY_MAPPED both (pinned to host) + QUDA_MEMORY_MANAGED both + */ + class quda_ptr { + QudaMemoryType type = QUDA_MEMORY_INVALID; + size_t size = 0; + bool pool = false; + void *device = nullptr; + void *host = nullptr; + + public: + quda_ptr() = default; + quda_ptr(quda_ptr &&) = default; + quda_ptr &operator=(quda_ptr &&); + + /** + @brief Constructor for quda_ptr + @param[in] type The memory type of the allocation + @param[in] size The size of the allocation + @param[in] pool Whether the allocation should be in the memory pool (default is true) + */ + quda_ptr(QudaMemoryType type, size_t size, bool pool = true); + + /** + @brief Constructor for quda_ptr where we are wrapping a non-owned pointer + @param[in] ptr Raw base pointer + @param[in] type The memory type of the allocation + */ + quda_ptr(void *ptr, QudaMemoryType type); + + /** + @brief Destructor for the quda_ptr + */ + virtual ~quda_ptr(); + + /** + @return Returns true if allocation is visible to the device + */ + bool is_device() 
const; + + /** + @return Returns true if allocation is visible to the host + */ + bool is_host() const; + + /** + Return view of the pointer. For mapped memory we return the device view. + */ + void *data() const; + + /** + Return the device view of the pointer + */ + void *data_device() const; + + /** + Return the host view of the pointer + */ + void *data_host() const; + }; + +} diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 37a83e001c..5050133341 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -84,7 +84,7 @@ set (QUDA_OBJS clover_sigma_outer_product.cu momentum.cu gauge_qcharge.cu deflation.cpp checksum.cu transform_reduce.cu dslash5_mobius_eofa.cu - madwf_ml.cpp + madwf_ml.cpp quda_ptr.cpp instantiate.cpp version.cpp ) # cmake-format: on diff --git a/lib/quda_ptr.cpp b/lib/quda_ptr.cpp new file mode 100644 index 0000000000..8b366afcbb --- /dev/null +++ b/lib/quda_ptr.cpp @@ -0,0 +1,157 @@ +#include "quda_ptr.h" +#include "util_quda.h" +#include "timer.h" + +namespace quda { + + quda_ptr::quda_ptr(QudaMemoryType type, size_t size, bool pool) : + type(type), + size(size), + pool(pool) + { + getProfile().TPSTART(QUDA_PROFILE_INIT); + if (pool && (type != QUDA_MEMORY_DEVICE && type != QUDA_MEMORY_HOST_PINNED && type != QUDA_MEMORY_HOST)) + errorQuda("Memory pool not available for memory type %d", type); + + if (size > 0) { + switch (type) { + case QUDA_MEMORY_DEVICE: + device = pool ? pool_device_malloc(size) : device_malloc(size); + break; + case QUDA_MEMORY_DEVICE_PINNED: + device = device_pinned_malloc(size); + break; + case QUDA_MEMORY_HOST: + host = safe_malloc(size); + break; + case QUDA_MEMORY_HOST_PINNED: + host = pool ? 
pool_pinned_malloc(size) : pinned_malloc(size); + break; + case QUDA_MEMORY_MAPPED: + host = mapped_malloc(size); + device = get_mapped_device_pointer(host); + break; + case QUDA_MEMORY_MANAGED: + host = managed_malloc(size); + device = host; + break; + default: errorQuda("Unknown memory type %d", type); + } + } + getProfile().TPSTOP(QUDA_PROFILE_INIT); + } + + quda_ptr::quda_ptr(void *ptr, QudaMemoryType type) : + type(type) + { + getProfile().TPSTART(QUDA_PROFILE_INIT); + switch (type) { + case QUDA_MEMORY_DEVICE: + case QUDA_MEMORY_DEVICE_PINNED: + device = ptr; + host = nullptr; + break; + case QUDA_MEMORY_HOST: + case QUDA_MEMORY_HOST_PINNED: + device = nullptr; + host = ptr; + break; + case QUDA_MEMORY_MANAGED: + device = ptr; + host = ptr; + break; + default: errorQuda("Unsupported memory type %d", type); + } + getProfile().TPSTOP(QUDA_PROFILE_INIT); + } + + quda_ptr& quda_ptr::operator=(quda_ptr &&other) + { + if (&other != this) { + type = std::exchange(other.type, QUDA_MEMORY_INVALID); + size = std::exchange(other.size, 0); + pool = std::exchange(other.pool, false); + device = std::exchange(other.device, nullptr); + host = std::exchange(other.host, nullptr); + } + return *this; + } + + quda_ptr::~quda_ptr() + { + getProfile().TPSTART(QUDA_PROFILE_FREE); + + if (size > 0) { + switch (type) { + case QUDA_MEMORY_DEVICE: pool ? pool_device_free(device) : device_free(device); break; + case QUDA_MEMORY_DEVICE_PINNED: device_pinned_free(device); break; + case QUDA_MEMORY_HOST: host_free(host); break; + case QUDA_MEMORY_HOST_PINNED: pool ? 
pool_pinned_free(host) : host_free(host); break; + case QUDA_MEMORY_MAPPED: host_free(host); break; + default: errorQuda("Unknown memory type %d", type); + } + } + + device = nullptr; + host = nullptr; + + getProfile().TPSTOP(QUDA_PROFILE_FREE); + } + + bool quda_ptr::is_device() const + { + switch (type) { + case QUDA_MEMORY_DEVICE: + case QUDA_MEMORY_DEVICE_PINNED: + case QUDA_MEMORY_MAPPED: + case QUDA_MEMORY_MANAGED: + return true; + default: return false; + } + } + + bool quda_ptr::is_host() const + { + switch (type) { + case QUDA_MEMORY_HOST: + case QUDA_MEMORY_HOST_PINNED: + case QUDA_MEMORY_MANAGED: + return true; + default: return false; + } + } + + void *quda_ptr::data() const + { + void *ptr = nullptr; + + switch (type) { + case QUDA_MEMORY_DEVICE: + case QUDA_MEMORY_DEVICE_PINNED: + case QUDA_MEMORY_MAPPED: + case QUDA_MEMORY_MANAGED: + ptr = device; + break; + case QUDA_MEMORY_HOST: + case QUDA_MEMORY_HOST_PINNED: + ptr = host; + break; + default: errorQuda("Unknown memory type %d", type); + } + + return ptr; + } + + void *quda_ptr::data_device() const + { + if (!device) errorQuda("Device view not defined"); + return device; + } + + void *quda_ptr::data_host() const + { + if (!host) errorQuda("Host view not defined"); + return host; + } + +} diff --git a/lib/targets/cuda/malloc.cpp b/lib/targets/cuda/malloc.cpp index 1f78d936bc..2b0d3c97ba 100644 --- a/lib/targets/cuda/malloc.cpp +++ b/lib/targets/cuda/malloc.cpp @@ -790,155 +790,4 @@ namespace quda } // namespace pool - - quda_ptr::quda_ptr(QudaMemoryType type, size_t size, bool pool) : - type(type), - size(size), - pool(pool) - { - getProfile().TPSTART(QUDA_PROFILE_INIT); - if (pool && (type != QUDA_MEMORY_DEVICE && type != QUDA_MEMORY_HOST_PINNED && type != QUDA_MEMORY_HOST)) - errorQuda("Memory pool not available for memory type %d", type); - - if (size > 0) { - switch (type) { - case QUDA_MEMORY_DEVICE: - device = pool ? 
pool_device_malloc(size) : device_malloc(size); - break; - case QUDA_MEMORY_DEVICE_PINNED: - device = device_pinned_malloc(size); - break; - case QUDA_MEMORY_HOST: - host = safe_malloc(size); - break; - case QUDA_MEMORY_HOST_PINNED: - host = pool ? pool_pinned_malloc(size) : pinned_malloc(size); - break; - case QUDA_MEMORY_MAPPED: - host = mapped_malloc(size); - device = get_mapped_device_pointer(host); - break; - case QUDA_MEMORY_MANAGED: - host = managed_malloc(size); - device = host; - break; - default: errorQuda("Unknown memory type %d", type); - } - } - getProfile().TPSTOP(QUDA_PROFILE_INIT); - } - - quda_ptr::quda_ptr(void *ptr, QudaMemoryType type) : - type(type) - { - getProfile().TPSTART(QUDA_PROFILE_INIT); - switch (type) { - case QUDA_MEMORY_DEVICE: - case QUDA_MEMORY_DEVICE_PINNED: - device = ptr; - host = nullptr; - break; - case QUDA_MEMORY_HOST: - case QUDA_MEMORY_HOST_PINNED: - device = nullptr; - host = ptr; - break; - case QUDA_MEMORY_MANAGED: - device = ptr; - host = ptr; - break; - default: errorQuda("Unsupported memory type %d", type); - } - getProfile().TPSTOP(QUDA_PROFILE_INIT); - } - - quda_ptr& quda_ptr::operator=(quda_ptr &&other) - { - if (&other != this) { - type = std::exchange(other.type, QUDA_MEMORY_INVALID); - size = std::exchange(other.size, 0); - pool = std::exchange(other.pool, false); - device = std::exchange(other.device, nullptr); - host = std::exchange(other.host, nullptr); - } - return *this; - } - - quda_ptr::~quda_ptr() - { - getProfile().TPSTART(QUDA_PROFILE_FREE); - - if (size > 0) { - switch (type) { - case QUDA_MEMORY_DEVICE: pool ? pool_device_free(device) : device_free(device); break; - case QUDA_MEMORY_DEVICE_PINNED: device_pinned_free(device); break; - case QUDA_MEMORY_HOST: host_free(host); break; - case QUDA_MEMORY_HOST_PINNED: pool ? 
pool_pinned_free(host) : host_free(host); break; - case QUDA_MEMORY_MAPPED: host_free(host); break; - default: errorQuda("Unknown memory type %d", type); - } - } - - device = nullptr; - host = nullptr; - - getProfile().TPSTOP(QUDA_PROFILE_FREE); - } - - bool quda_ptr::is_device() const - { - switch (type) { - case QUDA_MEMORY_DEVICE: - case QUDA_MEMORY_DEVICE_PINNED: - case QUDA_MEMORY_MAPPED: - case QUDA_MEMORY_MANAGED: - return true; - default: return false; - } - } - - bool quda_ptr::is_host() const - { - switch (type) { - case QUDA_MEMORY_HOST: - case QUDA_MEMORY_HOST_PINNED: - case QUDA_MEMORY_MANAGED: - return true; - default: return false; - } - } - - void *quda_ptr::data() const - { - void *ptr = nullptr; - - switch (type) { - case QUDA_MEMORY_DEVICE: - case QUDA_MEMORY_DEVICE_PINNED: - case QUDA_MEMORY_MAPPED: - case QUDA_MEMORY_MANAGED: - ptr = device; - break; - case QUDA_MEMORY_HOST: - case QUDA_MEMORY_HOST_PINNED: - ptr = host; - break; - default: errorQuda("Unknown memory type %d", type); - } - - return ptr; - } - - void *quda_ptr::data_device() const - { - if (!device) errorQuda("Device view not defined"); - return device; - } - - void *quda_ptr::data_host() const - { - if (!host) errorQuda("Host view not defined"); - return host; - } - } // namespace quda From 2516878950c8cb97fd336d65b2760e398b3e9db4 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Sun, 4 Jun 2023 22:31:45 -0700 Subject: [PATCH 28/99] Add missing utility header --- lib/quda_ptr.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/quda_ptr.cpp b/lib/quda_ptr.cpp index 8b366afcbb..7db16b641d 100644 --- a/lib/quda_ptr.cpp +++ b/lib/quda_ptr.cpp @@ -1,3 +1,4 @@ +#include #include "quda_ptr.h" #include "util_quda.h" #include "timer.h" From 27badee5f11d2e3887250a08937b5df785830237 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 23 Jun 2023 10:24:05 -0700 Subject: [PATCH 29/99] Fix issue with Wilson MG --- include/clover_field_order.h | 6 ++++-- 1 file changed, 4 
insertions(+), 2 deletions(-) diff --git a/include/clover_field_order.h b/include/clover_field_order.h index 05b77eee63..65d5ef6cff 100644 --- a/include/clover_field_order.h +++ b/include/clover_field_order.h @@ -312,7 +312,7 @@ namespace quda { static constexpr int N = nColor * nSpin / 2; reconstruct_t recon; FloatNAccessor(const CloverField &A, bool inverse = false) : - a(A.data(inverse)), + a(A.Bytes() ? A.data(inverse) : nullptr), stride(A.VolumeCB()), offset_cb(A.Bytes() / (2 * sizeof(Float))), compressed_block_size(A.compressed_block_size()), @@ -403,7 +403,9 @@ namespace quda { const int N = nSpin * nColor / 2; const complex zero; Accessor(const CloverField &A, bool inverse = false) : - a(A.data(inverse)), offset_cb(A.Bytes() / (2 * sizeof(Float))), zero(complex(0.0, 0.0)) + a(A.Bytes() ? A.data(inverse) : nullptr), + offset_cb(A.Bytes() / (2 * sizeof(Float))), + zero(complex(0.0, 0.0)) { } From 10ed6a4bad7fd581a2cc2a3fa235c8b61533569c Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 18 Jul 2023 13:31:50 -0700 Subject: [PATCH 30/99] Add local block-size dilution support to QUDA, and extend dilution_test to test this --- include/array.h | 2 +- include/color_spinor_field.h | 22 ++--- include/color_spinor_field_order.h | 50 +++++++----- include/enum_quda.h | 1 + include/enum_quda_fortran.h | 1 + include/kernels/spinor_dilute.cuh | 67 ++++++++++++--- lib/CMakeLists.txt | 1 + lib/spinor_dilute.cu | 62 -------------- lib/spinor_dilute.in.cu | 121 ++++++++++++++++++++++++++++ tests/CMakeLists.txt | 2 +- tests/dilution_test.cpp | 35 +++++++- tests/utils/command_line_params.cpp | 6 ++ tests/utils/command_line_params.h | 1 + tests/utils/misc.cpp | 1 + 14 files changed, 261 insertions(+), 111 deletions(-) delete mode 100644 lib/spinor_dilute.cu create mode 100644 lib/spinor_dilute.in.cu diff --git a/include/array.h b/include/array.h index acb4ee4e0a..3d535adeda 100644 --- a/include/array.h +++ b/include/array.h @@ -12,7 +12,7 @@ namespace quda template struct 
array { using value_type = T; static constexpr int N = n; - T data[n]; + T data[n] = {}; constexpr T &operator[](int i) { return data[i]; } constexpr const T &operator[](int i) const { return data[i]; } diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h index 1bb81a450d..3c5ccf194e 100644 --- a/include/color_spinor_field.h +++ b/include/color_spinor_field.h @@ -989,28 +989,30 @@ namespace quda /** @brief Generate a random noise spinor. This variant allows the user to manage the RNG state. - @param src The colorspinorfield - @param randstates Random state - @param type The type of noise to create (QUDA_NOISE_GAUSSIAN or QUDA_NOISE_UNIFORM) + @param[out] src The colorspinorfield + @param[in,out] randstates Random state + @param[in] type The type of noise to create (QUDA_NOISE_GAUSSIAN or QUDA_NOISE_UNIFORM) */ void spinorNoise(ColorSpinorField &src, RNG &randstates, QudaNoiseType type); /** @brief Generate a random noise spinor. This variant just requires a seed and will create and destroy the random number state. - @param src The colorspinorfield - @param seed Seed - @param type The type of noise to create (QUDA_NOISE_GAUSSIAN or QUDA_NOISE_UNIFORM) + @param[out] src The colorspinorfield + @param[in] seed Seed + @param[in] type The type of noise to create (QUDA_NOISE_GAUSSIAN or QUDA_NOISE_UNIFORM) */ void spinorNoise(ColorSpinorField &src, unsigned long long seed, QudaNoiseType type); /** @brief Generate a set of diluted color spinors from a single source. - @param v Diluted vector set - @param src The input source - @param type The type of dilution to apply (QUDA_DILUTION_SPIN_COLOR, etc.) + @param[out] v Diluted vector set + @param[in] src The input source + @param[in] type The type of dilution to apply (QUDA_DILUTION_SPIN_COLOR, etc.) 
+ @param[in] local_block The local block size to use when using QUDA_DILUTION_BLOCK dilution */ - void spinorDilute(std::vector &v, const ColorSpinorField &src, QudaDilutionType type); + void spinorDilute(std::vector &v, const ColorSpinorField &src, QudaDilutionType type, + const lat_dim_t &local_block = {}); /** @brief Helper function for determining if the preconditioning diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index dab488931f..4265884962 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -1058,17 +1058,20 @@ namespace quda using GhostVector = typename VectorType::type; using AllocInt = typename AllocType::type; using norm_type = float; - Float *field; - norm_type *norm; - const AllocInt offset; // offset can be 32-bit or 64-bit - const AllocInt norm_offset; - int volumeCB; - int faceVolumeCB[4]; - mutable Float *ghost[8]; - mutable norm_type *ghost_norm[8]; - int nParity; - void *backup_h; //! host memory for backing up the field when tuning - size_t bytes; + Float *field = nullptr; + norm_type *norm = nullptr; + AllocInt offset = 0; // offset can be 32-bit or 64-bit + AllocInt norm_offset = 0; + int volumeCB = 0; + array faceVolumeCB = {}; + mutable array ghost = {}; + mutable array ghost_norm = {}; + int nParity = 0; + void *backup_h = nullptr; //! host memory for backing up the field when tuning + size_t bytes = 0; + + FloatNOrder() = default; + FloatNOrder(const FloatNOrder &) = default; FloatNOrder(const ColorSpinorField &a, int nFace = 1, Float *buffer = 0, Float **ghost_ = 0) : field(buffer ? buffer : a.data()), @@ -1078,13 +1081,14 @@ namespace quda norm_offset(a.Bytes() / (2 * sizeof(norm_type))), volumeCB(a.VolumeCB()), nParity(a.SiteSubset()), - backup_h(nullptr), bytes(a.Bytes()) { for (int i = 0; i < 4; i++) { faceVolumeCB[i] = a.SurfaceCB(i) * nFace; } resetGhost(ghost_ ? 
(void **)ghost_ : a.Ghost()); } + FloatNOrder &operator=(const FloatNOrder &) = default; + void resetGhost(void *const *ghost_) const { for (int dim = 0; dim < 4; dim++) { @@ -1289,27 +1293,31 @@ namespace quda using GhostVector = int4; // 128-bit packed type using AllocInt = typename AllocType::type; using norm_type = float; - Float *field; - const AllocInt offset; // offset can be 32-bit or 64-bit - int volumeCB; - int faceVolumeCB[4]; - mutable Float *ghost[8]; - int nParity; - void *backup_h; //! host memory for backing up the field when tuning - size_t bytes; + Float *field = nullptr; + const AllocInt offset = 0; // offset can be 32-bit or 64-bit + int volumeCB = 0; + array faceVolumeCB = {}; + mutable array ghost = {}; + int nParity = 0; + void *backup_h = nullptr; //! host memory for backing up the field when tuning + size_t bytes = 0; + + FloatNOrder() = default; + FloatNOrder(const FloatNOrder &) = default; FloatNOrder(const ColorSpinorField &a, int nFace = 1, Float *buffer = 0, Float **ghost_ = 0) : field(buffer ? buffer : a.data()), offset(a.Bytes() / (2 * sizeof(Vector))), volumeCB(a.VolumeCB()), nParity(a.SiteSubset()), - backup_h(nullptr), bytes(a.Bytes()) { for (int i = 0; i < 4; i++) { faceVolumeCB[i] = a.SurfaceCB(i) * nFace; } resetGhost(ghost_ ? 
(void **)ghost_ : a.Ghost()); } + FloatNOrder &operator=(const FloatNOrder &) = default; + void resetGhost(void *const *ghost_) const { for (int dim = 0; dim < 4; dim++) { diff --git a/include/enum_quda.h b/include/enum_quda.h index c3d39d913f..abf3821185 100644 --- a/include/enum_quda.h +++ b/include/enum_quda.h @@ -397,6 +397,7 @@ typedef enum QudaDilutionType_s { QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR, QUDA_DILUTION_SPIN_COLOR_EVEN_ODD, + QUDA_DILUTION_BLOCK, QUDA_DILUTION_INVALID = QUDA_INVALID_ENUM } QudaDilutionType; diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h index e77d5a0e15..bff43e0734 100644 --- a/include/enum_quda_fortran.h +++ b/include/enum_quda_fortran.h @@ -361,6 +361,7 @@ #define QUDA_DILUTION_COLOR 1 #define QUDA_DILUTION_SPIN_COLOR 2 #define QUDA_DILUTION_SPIN_COLOR_EVEN_ODD 3 +#define QUDA_DILUTION_BLOCK 4 #define QUDA_DILUTION_INVALID QUDA_INVALID_ENUM #define QudaProjectionType integer(4) diff --git a/include/kernels/spinor_dilute.cuh b/include/kernels/spinor_dilute.cuh index 538610ff44..d7c5774114 100644 --- a/include/kernels/spinor_dilute.cuh +++ b/include/kernels/spinor_dilute.cuh @@ -18,7 +18,7 @@ namespace quda { case QUDA_DILUTION_COLOR: return nColor; case QUDA_DILUTION_SPIN_COLOR: return nSpin * nColor; case QUDA_DILUTION_SPIN_COLOR_EVEN_ODD: return nSpin * nColor * 2; - default: return 1; + default: return 128; } } @@ -28,10 +28,15 @@ namespace quda { static constexpr int nSpin = nSpin_; static constexpr int nColor = nColor_; static constexpr QudaDilutionType type = type_; - static constexpr int dilution_size = get_size(type); + static constexpr int max_dilution_size = get_size(type); using V = typename colorspinor_mapper::type; - V v[dilution_size]; + int dilution_size; + V v[max_dilution_size]; V src; + int nParity; + lat_dim_t dims = {}; + lat_dim_t dilution_block_dims = {}; + lat_dim_t dilution_block_grid = {}; /** @brief Constructor for the dilution arg @@ -39,14 +44,36 @@ namespace quda { @param 
src The source vector we are diluting */ template - SpinorDiluteArg(std::vector &v, const ColorSpinorField &src, std::index_sequence) : + SpinorDiluteArg(std::vector &v, const ColorSpinorField &src, const lat_dim_t &dilution_block_dims, + std::index_sequence) : kernel_param(dim3(src.VolumeCB(), src.SiteSubset(), 1)), - v{v[S]...}, - src(src) + dilution_size(v.size()), + src(src), + nParity(src.SiteSubset()), + dims(static_cast(src).X()), + dilution_block_dims(dilution_block_dims) { + for (auto i = 0u; i < v.size(); i++) this->v[i] = V(v[i]); + if (nParity == 1) { // dimensions need to be full-field + this->dims[0] *= 2; + this->dilution_block_dims[0] *= 2; + } + for (auto i = 0; i < src.Ndim() && type == QUDA_DILUTION_BLOCK; i++) + dilution_block_grid[i] = (dims[i] * comms_dim[i]) / this->dilution_block_dims[i]; } }; + template + __device__ __host__ void getCoordsGlobal(coord_t &coords, int x_cb, int parity, const Arg &arg) + { + getCoords(coords, x_cb, arg.dims, parity); + + // first 4 dimensions are potentially distributed so include global offsets + for (int i = 0; i < 4; i++) { + coords[i] += arg.comms_coord[i] * arg.dims[i]; // global coordinate + } + } + /** Functor for diluting the src vector */ @@ -78,16 +105,30 @@ namespace quda { using vector = ColorSpinor; vector src = arg.src(x_cb, parity); - for (int i = 0; i < Arg::dilution_size; i++) { - vector v; + if (Arg::type == QUDA_DILUTION_BLOCK) { + lat_dim_t coords; + getCoordsGlobal(coords, x_cb, parity, arg); - for (int s = 0; s < Arg::nSpin; s++) { - for (int c = 0; c < Arg::nColor; c++) { - v(s, c) = write_source(i, s, c, parity) ? 
src(s, c) : complex(0.0, 0.0); - } + lat_dim_t block_coords; + for (int i = 0; i < coords.size(); i++) block_coords[i] = coords[i] / arg.dilution_block_dims[i]; + int block_idx = ((block_coords[3] * arg.dilution_block_grid[2] + block_coords[2]) * arg.dilution_block_grid[1] + block_coords[1]) + * arg.dilution_block_grid[0] + block_coords[0]; + + for (int i = 0; i < arg.dilution_size; i++) { + arg.v[i](x_cb, parity) = i == block_idx ? src : vector(); } + } else { + for (int i = 0; i < Arg::max_dilution_size; i++) { // for these types max = actual size + vector v; - arg.v[i](x_cb, parity) = v; + for (int s = 0; s < Arg::nSpin; s++) { + for (int c = 0; c < Arg::nColor; c++) { + v(s, c) = write_source(i, s, c, parity) ? src(s, c) : complex(0.0, 0.0); + } + } + + arg.v[i](x_cb, parity) = v; + } } } diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 5050133341..523fbe6101 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -166,6 +166,7 @@ configure_file(extract_gauge_ghost.in.cu extract_gauge_ghost.cu @ONLY) configure_file(gauge_noise.in.cu gauge_noise.cu @ONLY) configure_file(gauge_norm.in.cu gauge_norm.cu @ONLY) configure_file(spinor_noise.in.cu spinor_noise.cu @ONLY) +configure_file(spinor_dilute.in.cu spinor_dilute.cu @ONLY) configure_file(copy_color_spinor_mg.in.hpp copy_color_spinor_mg.hpp @ONLY) configure_file(color_spinor_pack.in.cu color_spinor_pack.cu @ONLY) configure_file(color_spinor_util.in.cu color_spinor_util.cu @ONLY) diff --git a/lib/spinor_dilute.cu b/lib/spinor_dilute.cu deleted file mode 100644 index f95011d2ea..0000000000 --- a/lib/spinor_dilute.cu +++ /dev/null @@ -1,62 +0,0 @@ -#include -#include -#include -#include - -namespace quda { - - template - class SpinorDilute : TunableKernel2D { - std::vector &v; - const ColorSpinorField &src; - QudaDilutionType type; - unsigned int minThreads() const { return src.VolumeCB(); } - - public: - SpinorDilute(const ColorSpinorField &src, std::vector &v, QudaDilutionType type) : - 
TunableKernel2D(src, src.SiteSubset()), - v(v), - src(src), - type(type) - { - switch (type) { - case QUDA_DILUTION_SPIN: strcat(aux, ",spin_dilution"); break; - case QUDA_DILUTION_COLOR: strcat(aux, ",color_dilution"); break; - case QUDA_DILUTION_SPIN_COLOR: strcat(aux, ",spin_color_dilution"); break; - case QUDA_DILUTION_SPIN_COLOR_EVEN_ODD: strcat(aux, ",spin_color_even_odd_dilution"); break; - default: errorQuda("Unsupported dilution type %d", type); - } - if (v.size() != static_cast(get_size(type))) - errorQuda("Input container size %lu does not match expected size %d for dilution type", v.size(), get_size(type)); - apply(device::get_default_stream()); - } - - template using Arg = SpinorDiluteArg; - - template - auto constexpr sequence() { return std::make_index_sequence(type)>(); } - - template - void apply(TuneParam &tp, const qudaStream_t &stream) { launch(tp, stream, Arg(v, src, sequence())); } - - void apply(const qudaStream_t &stream) - { - TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); - switch (type) { - case QUDA_DILUTION_SPIN: apply(tp, stream); break; - case QUDA_DILUTION_COLOR: apply(tp, stream); break; - case QUDA_DILUTION_SPIN_COLOR: apply(tp, stream); break; - case QUDA_DILUTION_SPIN_COLOR_EVEN_ODD: apply(tp, stream); break; - default: errorQuda("Dilution type %d not supported", type); - } - } - - long long bytes() const { return v.size() * v[0].Bytes() + src.Bytes(); } - }; - - void spinorDilute(std::vector &v, const ColorSpinorField &src, QudaDilutionType type) - { - instantiateSpinor(src, v, type); - } - -} // namespace quda diff --git a/lib/spinor_dilute.in.cu b/lib/spinor_dilute.in.cu new file mode 100644 index 0000000000..c97f571abc --- /dev/null +++ b/lib/spinor_dilute.in.cu @@ -0,0 +1,121 @@ +#include +#include +#include +#include + +namespace quda { + + template + class SpinorDilute : TunableKernel2D { + std::vector &v; + const ColorSpinorField &src; + QudaDilutionType type; + const lat_dim_t &local_block; + unsigned int 
minThreads() const { return src.VolumeCB(); } + template using Arg = SpinorDiluteArg; + + public: + SpinorDilute(const ColorSpinorField &src, std::vector &v, QudaDilutionType type, const lat_dim_t &local_block) : + TunableKernel2D(src, src.SiteSubset()), + v(v), + src(src), + type(type), + local_block(local_block) + { + switch (type) { + case QUDA_DILUTION_SPIN: strcat(aux, ",spin_dilution"); break; + case QUDA_DILUTION_COLOR: strcat(aux, ",color_dilution"); break; + case QUDA_DILUTION_SPIN_COLOR: strcat(aux, ",spin_color_dilution"); break; + case QUDA_DILUTION_SPIN_COLOR_EVEN_ODD: strcat(aux, ",spin_color_even_odd_dilution"); break; + case QUDA_DILUTION_BLOCK: strcat(aux, ",block_dilution"); break; + default: errorQuda("Unsupported dilution type %d", type); + } + if (type != QUDA_DILUTION_BLOCK && v.size() != static_cast(get_size(type))) + errorQuda("Input container size %lu does not match expected size %d for dilution type", v.size(), get_size(type)); + + size_t block_volume = 1; + for (int i = 0; i < src.Ndim(); i++) block_volume *= local_block[i]; + size_t n_blocks = comm_size() * src.Volume() / block_volume; + if (type == QUDA_DILUTION_BLOCK) { + if (v.size() != n_blocks) + errorQuda("Input container size %lu does not match expected size %lu for dilution block size (%d,%d,%d,%d)", + v.size(), n_blocks, local_block[0], local_block[1], local_block[2], local_block[3]); + if (v.size() > Arg::max_dilution_size) + errorQuda("Container size %lu exceeds maximum size %d", v.size(), Arg::max_dilution_size); + + for (auto i = 0; i < src.Ndim(); i++) { + if (local_block[i] == 0) errorQuda("Dim %d: Dilution block size = 0", i); + if ((src.X(i) * comm_dim(i)) % local_block[i] != 0) + errorQuda("Dim %d: Invalid dilution block size %d for global lattice dim = %d", + i, local_block[i], src.X(i) * comm_dim(i)); + } + } + + apply(device::get_default_stream()); + } + + template + auto constexpr sequence() { return std::make_index_sequence(type)>(); } + + template + void 
apply(TuneParam &tp, const qudaStream_t &stream) { launch(tp, stream, Arg(v, src, local_block, sequence())); } + + void apply(const qudaStream_t &stream) + { + TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); + switch (type) { + case QUDA_DILUTION_SPIN: apply(tp, stream); break; + case QUDA_DILUTION_COLOR: apply(tp, stream); break; + case QUDA_DILUTION_SPIN_COLOR: apply(tp, stream); break; + case QUDA_DILUTION_SPIN_COLOR_EVEN_ODD: apply(tp, stream); break; + case QUDA_DILUTION_BLOCK: apply(tp, stream); break; + default: errorQuda("Dilution type %d not supported", type); + } + } + + long long bytes() const { return v.size() * v[0].Bytes() + src.Bytes(); } + }; + + template struct IntList { }; + + template + void spinorDilute(const ColorSpinorField &src, std::vector &v, QudaDilutionType type, + const lat_dim_t &local_block, IntList) + { + if (src.Ncolor() == Nc) { + SpinorDilute(src, v, type, local_block); + } else { + if constexpr (sizeof...(N) > 0) spinorDilute(src, v, type, local_block, IntList()); + else errorQuda("nColor = %d not implemented", src.Ncolor()); + } + } + + template + void spinorDilute(const ColorSpinorField &src, std::vector &v, QudaDilutionType type, const lat_dim_t &local_block) + { + checkNative(src); + if (!is_enabled_spin(src.Nspin())) + errorQuda("spinorNoise has not been built for nSpin=%d fields", src.Nspin()); + + if (src.Nspin() == 4) { + if constexpr (is_enabled_spin(4)) spinorDilute(src, v, type, local_block, IntList<3>()); + } else if (src.Nspin() == 2) { + if constexpr (is_enabled_spin(2)) spinorDilute(src, v, type, local_block, IntList<3, @QUDA_MULTIGRID_NVEC_LIST@>()); + } else if (src.Nspin() == 1) { + if constexpr (is_enabled_spin(1)) spinorDilute(src, v, type, local_block, IntList<3>()); + } else { + errorQuda("Nspin = %d not implemented", src.Nspin()); + } + } + + void spinorDilute(std::vector &v, const ColorSpinorField &src, QudaDilutionType type, + const lat_dim_t &local_block) + { + switch (src.Precision()) { + 
case QUDA_DOUBLE_PRECISION: spinorDilute(src, v, type, local_block); break; + case QUDA_SINGLE_PRECISION: spinorDilute(src, v, type, local_block); break; + default: errorQuda("Not instantiated %d\n", src.Precision()); + } + } + +} // namespace quda diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 14c1508a82..65135b9def 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1245,7 +1245,7 @@ foreach(prec IN LISTS TEST_PRECS) if (TARGET dilution_test) add_test(NAME dilution_test_${prec} COMMAND ${QUDA_CTEST_LAUNCH} $ ${MPIEXEC_POSTFLAGS} - --dim 4 6 8 10 --prec ${prec} + --dim 4 6 8 10 --dilution-block-size 4 6 4 5 --prec ${prec} --gtest_output=xml:dilution_test_${prec}.xml) endif() diff --git a/tests/dilution_test.cpp b/tests/dilution_test.cpp index 9d7475d114..9bb5993be5 100644 --- a/tests/dilution_test.cpp +++ b/tests/dilution_test.cpp @@ -42,12 +42,24 @@ TEST_P(DilutionTest, verify) ColorSpinorParam param; constructWilsonTestSpinorParam(¶m, &inv_param, &gauge_param); param.siteSubset = site_subset; + if (site_subset == QUDA_PARITY_SITE_SUBSET) param.x[0] /= 2; param.nSpin = nSpin; param.setPrecision(inv_param.cuda_prec, inv_param.cuda_prec, true); // change order to native order param.location = QUDA_CUDA_FIELD_LOCATION; param.create = QUDA_NULL_FIELD_CREATE; ColorSpinorField src(param); + // compute number of blocks when using block dilution + int block_volume = 1; + lat_dim_t block_size = {dilution_block_size[0], dilution_block_size[1], dilution_block_size[2], dilution_block_size[3]}; + if (src.SiteSubset() == QUDA_PARITY_SITE_SUBSET) block_size[0] /= 2; + for (int i = 0; i < src.Ndim(); i++) block_volume *= block_size[i]; + int n_blocks = comm_size() * src.Volume() / block_volume; + if (dilution_type == QUDA_DILUTION_BLOCK) { + logQuda(QUDA_VERBOSE, "Dilution block size = %d x %d x %d x %d\n", block_size[0], block_size[1], block_size[2], block_size[3]); + logQuda(QUDA_VERBOSE, "Number of dilution blocks = %d\n", n_blocks); + } + RNG 
rng(src, 1234); for (int i = 0; i < Nsrc; i++) { @@ -59,11 +71,12 @@ TEST_P(DilutionTest, verify) case QUDA_DILUTION_COLOR: size = src.Ncolor(); break; case QUDA_DILUTION_SPIN_COLOR: size = src.Nspin() * src.Ncolor(); break; case QUDA_DILUTION_SPIN_COLOR_EVEN_ODD: size = src.Nspin() * src.Ncolor() * src.SiteSubset(); break; + case QUDA_DILUTION_BLOCK: size = n_blocks; break; default: errorQuda("Invalid dilution type %d", dilution_type); } std::vector v(size, param); - spinorDilute(v, src, dilution_type); + spinorDilute(v, src, dilution_type, block_size); param.create = QUDA_ZERO_FIELD_CREATE; ColorSpinorField sum(param); @@ -88,13 +101,29 @@ using ::testing::Values; INSTANTIATE_TEST_SUITE_P( WilsonFull, DilutionTest, Combine(Values(QUDA_FULL_SITE_SUBSET), - Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR, QUDA_DILUTION_SPIN_COLOR_EVEN_ODD), + Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR, QUDA_DILUTION_SPIN_COLOR_EVEN_ODD, + QUDA_DILUTION_BLOCK), Values(4)), [](testing::TestParamInfo param) { return get_dilution_type_str(::testing::get<1>(param.param)); }); INSTANTIATE_TEST_SUITE_P(WilsonParity, DilutionTest, Combine(Values(QUDA_PARITY_SITE_SUBSET), - Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR), Values(4)), + Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR, + QUDA_DILUTION_BLOCK), Values(4)), + [](testing::TestParamInfo param) { + return get_dilution_type_str(::testing::get<1>(param.param)); + }); + +INSTANTIATE_TEST_SUITE_P( + CoarseFull, DilutionTest, + Combine(Values(QUDA_FULL_SITE_SUBSET), + Values(QUDA_DILUTION_SPIN, QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR, QUDA_DILUTION_SPIN_COLOR_EVEN_ODD), + Values(2)), + [](testing::TestParamInfo param) { return get_dilution_type_str(::testing::get<1>(param.param)); }); + +INSTANTIATE_TEST_SUITE_P(CoarseParity, DilutionTest, + Combine(Values(QUDA_PARITY_SITE_SUBSET), + Values(QUDA_DILUTION_SPIN, 
QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR), Values(2)), [](testing::TestParamInfo param) { return get_dilution_type_str(::testing::get<1>(param.param)); }); diff --git a/tests/utils/command_line_params.cpp b/tests/utils/command_line_params.cpp index 5677a80543..a0efaae83a 100644 --- a/tests/utils/command_line_params.cpp +++ b/tests/utils/command_line_params.cpp @@ -116,6 +116,7 @@ QudaMatPCType matpc_type = QUDA_MATPC_EVEN_EVEN; QudaSolveType solve_type = QUDA_NORMOP_PC_SOLVE; QudaSolutionType solution_type = QUDA_MAT_SOLUTION; QudaTboundary fermion_t_boundary = QUDA_ANTI_PERIODIC_T; +std::array dilution_block_size = {1, 1, 1, 1}; int mg_levels = 2; @@ -594,6 +595,11 @@ std::shared_ptr make_app(std::string app_description, std::string app_n "The fermoinic temporal boundary conditions (anti-periodic (default), periodic") ->transform(CLI::QUDACheckedTransformer(fermion_t_boundary_map)); + quda_app + ->add_option("--dilution-block-size", dilution_block_size, + "Set the dilution block size in all four dimension (default 1 1 1 1)") + ->expected(4); + quda_app ->add_option("--solve-type", solve_type, "The type of solve to do (direct, direct-pc, normop, normop-pc, normerr, normerr-pc)") diff --git a/tests/utils/command_line_params.h b/tests/utils/command_line_params.h index fc3d07959a..7d1b4840bd 100644 --- a/tests/utils/command_line_params.h +++ b/tests/utils/command_line_params.h @@ -251,6 +251,7 @@ extern QudaMatPCType matpc_type; extern QudaSolveType solve_type; extern QudaSolutionType solution_type; extern QudaTboundary fermion_t_boundary; +extern std::array dilution_block_size; extern int mg_levels; diff --git a/tests/utils/misc.cpp b/tests/utils/misc.cpp index 57eadf7718..fd920e5e16 100644 --- a/tests/utils/misc.cpp +++ b/tests/utils/misc.cpp @@ -371,6 +371,7 @@ std::string get_dilution_type_str(QudaDilutionType type) case QUDA_DILUTION_COLOR: s = std::string("color"); break; case QUDA_DILUTION_SPIN_COLOR: s = std::string("spin_color"); break; case 
QUDA_DILUTION_SPIN_COLOR_EVEN_ODD: s = std::string("spin_color_even_odd"); break; + case QUDA_DILUTION_BLOCK: s = std::string("block"); break; default: fprintf(stderr, "Error: invalid dilution type\n"); exit(1); } return s; From dd66595e1a5e2f61a04d5ec4dbc494b3da5840b4 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 11 Aug 2023 09:18:22 -0700 Subject: [PATCH 31/99] Removed unneeded static_cast --- lib/coarsecoarse_op_mma.in.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/coarsecoarse_op_mma.in.cu b/lib/coarsecoarse_op_mma.in.cu index 8ccd052a1c..eee43a43ac 100644 --- a/lib/coarsecoarse_op_mma.in.cu +++ b/lib/coarsecoarse_op_mma.in.cu @@ -43,7 +43,7 @@ namespace quda { output = new GaugeField(param); if (copy_content) output->copy(X); } - return static_cast(output); + return output; }; auto Y_order = create_gauge_copy(Y, gOrder, false); From e818659c4f3b1056eaad12728eb06b9ea89cce13 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 11 Aug 2023 18:24:22 -0700 Subject: [PATCH 32/99] Fix HIP builds --- lib/targets/hip/quda_api.cpp | 76 ++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 43 deletions(-) diff --git a/lib/targets/hip/quda_api.cpp b/lib/targets/hip/quda_api.cpp index 9191ec16a3..6d9345a884 100644 --- a/lib/targets/hip/quda_api.cpp +++ b/lib/targets/hip/quda_api.cpp @@ -261,6 +261,13 @@ namespace quda QudaMem copy(dst, src, count, qudaMemcpyKindToAPI(kind), device::get_default_stream(), false, func, file, line); } + void qudaMemcpy_(const quda_ptr &dst, const quda_ptr &src, size_t count, qudaMemcpyKind kind, const char *func, const char *file, + const char *line) + { + if (count == 0) return; + QudaMem copy(dst.data(), src.data(), count, qudaMemcpyKindToAPI(kind), device::get_default_stream(), false, func, file, line); + } + void qudaMemcpyAsync_(void *dst, const void *src, size_t count, qudaMemcpyKind kind, const qudaStream_t &stream, const char *func, const char *file, const char *line) { @@ 
-288,6 +295,16 @@ namespace quda QudaMem set(ptr, value, count, device::get_default_stream(), false, func, file, line); } + void qudaMemset_(quda_ptr &ptr, int value, size_t count, const char *func, const char *file, const char *line) + { + if (count == 0) return; + if (ptr.is_device()) { + QudaMem set(ptr.data(), value, count, device::get_default_stream(), false, func, file, line); + } else { + memset(ptr.data(), value, count); + } + } + void qudaMemsetAsync_(void *ptr, int value, size_t count, const qudaStream_t &stream, const char *func, const char *file, const char *line) { @@ -295,18 +312,26 @@ namespace quda QudaMem copy(ptr, value, count, stream, true, func, file, line); } - void qudaMemset2D_(void *ptr, size_t pitch, int value, size_t width, size_t height, const char *func, - const char *file, const char *line) + void qudaMemsetAsync_(quda_ptr &ptr, int value, size_t count, const qudaStream_t &stream, + const char *func, const char *file, const char *line) { - hipError_t error = hipMemset2D(ptr, pitch, value, width, height); - set_runtime_error(error, __func__, func, file, line); + if (count == 0) return; + if (ptr.is_device()) { + QudaMem set(ptr.data(), value, count, stream, true, func, file, line); + } else { + memset(ptr.data(), value, count); + } } - void qudaMemset2DAsync_(void *ptr, size_t pitch, int value, size_t width, size_t height, const qudaStream_t &stream, - const char *func, const char *file, const char *line) + void qudaMemset2DAsync_(quda_ptr &ptr, size_t offset, size_t pitch, int value, size_t width, size_t height, + const qudaStream_t &stream, const char *func, const char *file, const char *line) { - hipError_t error = hipMemset2DAsync(ptr, pitch, value, width, height, get_stream(stream)); - set_runtime_error(error, __func__, func, file, line); + if (ptr.is_device()) { + hipError_t error = hipMemset2DAsync(static_cast(ptr.data()) + offset, pitch, value, width, height, get_stream(stream)); + set_runtime_error(error, __func__, func, file, 
line); + } else { + for (auto i = 0u; i < height; i++) memset(static_cast(ptr.data()) + offset + i * pitch, value, width); + } } void qudaMemPrefetchAsync_(void *, size_t, QudaFieldLocation, const qudaStream_t &, const char *, const char *, @@ -315,41 +340,6 @@ namespace quda // No prefetch } -#if 0 - bool qudaEventQuery_(qudaEvent_t &quda_event, const char *func, const char *file, const char *line) - { - cudaEvent_t &event = reinterpret_cast(quda_event.event); -#ifdef USE_DRIVER_API - PROFILE(CUresult error = cuEventQuery(event), QUDA_PROFILE_EVENT_QUERY); - switch (error) { - case CUDA_SUCCESS: return true; - case CUDA_ERROR_NOT_READY: return false; - default: set_driver_error(error, __func__, func, file, line); - } -#else - PROFILE(cudaError_t error = cudaEventQuery(event), QUDA_PROFILE_EVENT_QUERY); - switch (error) { - case cudaSuccess: return true; - case cudaErrorNotReady: return false; - default: set_runtime_error(error, __func__, func, file, line); - } -#endif - return false; - } - - void qudaEventRecord_(qudaEvent_t &quda_event, qudaStream_t stream, const char *func, const char *file, const char *line) - { - cudaEvent_t &event = reinterpret_cast(quda_event.event); -#ifdef USE_DRIVER_API - PROFILE(CUresult error = cuEventRecord(event, get_stream(stream)), QUDA_PROFILE_EVENT_RECORD); - set_driver_error(error, __func__, func, file, line); -#else - PROFILE(cudaError_t error = cudaEventRecord(event, get_stream(stream)), QUDA_PROFILE_EVENT_RECORD); - set_runtime_error(error, __func__, func, file, line); -#endif - } -#endif - bool qudaEventQuery_(qudaEvent_t &quda_event, const char *func, const char *file, const char *line) { hipEvent_t &event = reinterpret_cast(quda_event.event); From 50987b1b96a55143adae9810e1e45b38b7f93b13 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 18 Aug 2023 15:15:14 -0700 Subject: [PATCH 33/99] Minor review comment --- tests/host_reference/gauge_force_reference.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git 
a/tests/host_reference/gauge_force_reference.cpp b/tests/host_reference/gauge_force_reference.cpp index 83c5251e27..eb18f10568 100644 --- a/tests/host_reference/gauge_force_reference.cpp +++ b/tests/host_reference/gauge_force_reference.cpp @@ -492,8 +492,6 @@ void gauge_force_reference(void *refMom, double eb3, quda::GaugeField &u, int ** param.t_boundary = QUDA_PERIODIC_T; auto qdp_ex = quda::createExtendedGauge((void **)sitelink, param, R); - //quda::TimeProfile dummy("blah"); - //auto qdp_ex = quda::createExtendedGauge(u, R, dummy); lattice_t lat(*qdp_ex); void *sitelink_ex[] = {qdp_ex->data(0), qdp_ex->data(1), qdp_ex->data(2), qdp_ex->data(3)}; From f8b324439be89fd6056c934241186bd47db154d6 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 18 Aug 2023 18:05:15 -0700 Subject: [PATCH 34/99] Add default assignment operator for TimeProfile class --- include/timer.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/timer.h b/include/timer.h index 0d529867cb..b819b81bb2 100644 --- a/include/timer.h +++ b/include/timer.h @@ -205,6 +205,7 @@ namespace quda { public: TimeProfile() = default; TimeProfile(const TimeProfile &) = default; + TimeProfile& operator=(const TimeProfile &) = default; TimeProfile(std::string fname) : fname(fname), switchOff(false), use_global(true) { ; } From 06d2dcbeb259fb35a4b0e95561d594167d278daa Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 29 Aug 2023 15:38:55 -0700 Subject: [PATCH 35/99] Further cleanup and minor fixes --- lib/interface_quda.cpp | 334 +++++++++++++---------------------------- 1 file changed, 101 insertions(+), 233 deletions(-) diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index e63f9c1dc4..19a97983b0 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -429,16 +429,14 @@ void initQudaDevice(int dev) initialized = true; profileInit2End.TPSTART(QUDA_PROFILE_TOTAL); - profileInit.TPSTART(QUDA_PROFILE_TOTAL); + pushProfile(profileInit); 
profileInit.TPSTART(QUDA_PROFILE_INIT); - if (getVerbosity() >= QUDA_SUMMARIZE) { #ifdef GITVERSION - printfQuda("QUDA %s (git %s)\n",quda_version.c_str(),gitversion); + logQuda(QUDA_SUMMARIZE, "QUDA %s (git %s)\n",quda_version.c_str(),gitversion); #else - printfQuda("QUDA %s\n",quda_version.c_str()); + logQuda(QUDA_SUMMARIZE, "QUDA %s\n",quda_version.c_str()); #endif - } #ifdef MULTI_GPU if (dev < 0) { @@ -466,7 +464,7 @@ void initQudaDevice(int dev) } profileInit.TPSTOP(QUDA_PROFILE_INIT); - profileInit.TPSTOP(QUDA_PROFILE_TOTAL); + popProfile(); } /* @@ -474,7 +472,7 @@ void initQudaDevice(int dev) */ void initQudaMemory() { - profileInit.TPSTART(QUDA_PROFILE_TOTAL); + pushProfile(profileInit); profileInit.TPSTART(QUDA_PROFILE_INIT); if (!comms_initialized) init_default_comms(); @@ -498,7 +496,7 @@ void initQudaMemory() for (int d=0; d<4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d)); profileInit.TPSTOP(QUDA_PROFILE_INIT); - profileInit.TPSTOP(QUDA_PROFILE_TOTAL); + pushProfile(profileInit); } void updateR() @@ -564,25 +562,20 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) if (!initialized) errorQuda("QUDA not initialized"); if (getVerbosity() == QUDA_DEBUG_VERBOSE) printQudaGaugeParam(param); - profileGauge.TPSTART(QUDA_PROFILE_INIT); // Set the specific input parameters and create the cpu gauge field GaugeFieldParam gauge_param(*param, h_gauge); if (gauge_param.order <= 4) gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_NO; - GaugeField *in = (param->location == QUDA_CPU_FIELD_LOCATION) ? 
- static_cast(new GaugeField(gauge_param)) : - static_cast(new GaugeField(gauge_param)); + GaugeField *in = GaugeField::Create(gauge_param); if (in->Order() == QUDA_BQCD_GAUGE_ORDER) { static size_t checksum = SIZE_MAX; size_t in_checksum = in->checksum(true); if (in_checksum == checksum) { - if (getVerbosity() >= QUDA_VERBOSE) - printfQuda("Gauge field unchanged - using cached gauge field %lu\n", checksum); - profileGauge.TPSTOP(QUDA_PROFILE_INIT); - profileGauge.TPSTOP(QUDA_PROFILE_TOTAL); + logQuda(QUDA_VERBOSE, "Gauge field unchanged - using cached gauge field %lu\n", checksum); delete in; invalidate_clover = false; + popProfile(); return; } checksum = in_checksum; @@ -627,9 +620,7 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) precise->copy(*gaugePrecise); precise->exchangeGhost(); freeUniqueGaugeQuda(QUDA_WILSON_LINKS); - profileGauge.TPSTOP(QUDA_PROFILE_INIT); } else { - profileGauge.TPSTOP(QUDA_PROFILE_INIT); precise->copy(*in); } @@ -637,10 +628,8 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) if (param->type == QUDA_SMEARED_LINKS) { gaugeSmeared = createExtendedGauge(*precise, R, profileGauge); - profileGauge.TPSTART(QUDA_PROFILE_FREE); delete precise; delete in; - profileGauge.TPSTOP(QUDA_PROFILE_FREE); popProfile(); return; @@ -751,9 +740,7 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) errorQuda("Invalid gauge type %d", param->type); } - profileGauge.TPSTART(QUDA_PROFILE_FREE); delete in; - profileGauge.TPSTOP(QUDA_PROFILE_FREE); if (extendedGaugeResident) { // updated the resident gauge field if needed @@ -809,7 +796,6 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) { pushProfile(profileClover); pushVerbosity(inv_param->verbosity); - profileClover.TPSTART(QUDA_PROFILE_INIT); checkCloverParam(inv_param); bool device_calc = false; // calculate clover and inverse on the device? 
@@ -847,8 +833,6 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) CloverField *in = nullptr; - profileClover.TPSTOP(QUDA_PROFILE_INIT); - bool clover_update = false; // If either of the clover params have changed, trigger a recompute double csw_old = cloverPrecise ? cloverPrecise->Csw() : 0.0; @@ -862,11 +846,10 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) // compute or download clover field only if gauge field has been updated or clover field doesn't exist if (clover_update) { - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating new clover field\n"); + logQuda(QUDA_VERBOSE, "Creating new clover field\n"); freeSloppyCloverQuda(); if (cloverPrecise) delete cloverPrecise; - profileClover.TPSTART(QUDA_PROFILE_INIT); cloverPrecise = new CloverField(clover_param); if (!device_calc || inv_param->return_clover || inv_param->return_clover_inverse) { @@ -882,16 +865,13 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) inParam.reconstruct = false; in = new CloverField(inParam); } - profileClover.TPSTOP(QUDA_PROFILE_INIT); if (!device_calc) { cloverPrecise->copy(*in, false); if ((h_clovinv && !inv_param->compute_clover_inverse) && !clover::dynamic_inverse()) cloverPrecise->copy(*in, true); } else { - profileClover.TPSTOP(QUDA_PROFILE_TOTAL); createCloverQuda(inv_param); - profileClover.TPSTART(QUDA_PROFILE_TOTAL); } if ((!h_clovinv || inv_param->compute_clover_inverse) && !clover::dynamic_inverse()) { @@ -902,7 +882,7 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) } } } else { - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Gauge field unchanged - using cached clover field\n"); + logQuda(QUDA_VERBOSE, "Gauge field unchanged - using cached clover field\n"); } // if requested, copy back the clover / inverse field @@ -929,9 +909,7 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) delete tmp; } - 
profileClover.TPSTART(QUDA_PROFILE_FREE); if (in) delete in; // delete object referencing input field - profileClover.TPSTOP(QUDA_PROFILE_FREE); QudaPrecision prec[] = {inv_param->clover_cuda_prec_sloppy, inv_param->clover_cuda_prec_precondition, inv_param->clover_cuda_prec_refinement_sloppy, inv_param->clover_cuda_prec_eigensolver}; @@ -1355,7 +1333,7 @@ void flushChronoQuda(int i) void endQuda(void) { - profileEnd.TPSTART(QUDA_PROFILE_TOTAL); + pushProfile(profileEnd); if (!initialized) return; @@ -1394,7 +1372,7 @@ void endQuda(void) comm_finalize(); comms_initialized = false; - profileEnd.TPSTOP(QUDA_PROFILE_TOTAL); + popProfile(); profileInit2End.TPSTOP(QUDA_PROFILE_TOTAL); // print out the profile information of the lifetime of the library @@ -1498,15 +1476,11 @@ namespace quda { } memcpy(diracParam.b_5, inv_param->b_5, sizeof(Complex) * inv_param->Ls); memcpy(diracParam.c_5, inv_param->c_5, sizeof(Complex) * inv_param->Ls); - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { - printfQuda("Printing b_5 and c_5 values\n"); - for (int i = 0; i < diracParam.Ls; i++) { - printfQuda("fromQUDA diracParam: b5[%d] = %f + i%f, c5[%d] = %f + i%f\n", i, diracParam.b_5[i].real(), - diracParam.b_5[i].imag(), i, diracParam.c_5[i].real(), diracParam.c_5[i].imag()); - // printfQuda("fromQUDA inv_param: b5[%d] = %f %f c5[%d] = %f %f\n", i, inv_param->b_5[i], i, - // inv_param->c_5[i] ); printfQuda("fromQUDA creal: b5[%d] = %f %f c5[%d] = %f %f \n", i, - // creal(inv_param->b_5[i]), cimag(inv_param->b_5[i]), i, creal(inv_param->c_5[i]), cimag(inv_param->c_5[i]) ); - } + logQuda(QUDA_DEBUG_VERBOSE, "Printing b_5 and c_5 values\n"); + for (int i = 0; i < diracParam.Ls; i++) { + logQuda(QUDA_DEBUG_VERBOSE, "fromQUDA diracParam: b5[%d] = %f + i%f, c5[%d] = %f + i%f\n", + i, diracParam.b_5[i].real(), + diracParam.b_5[i].imag(), i, diracParam.c_5[i].real(), diracParam.c_5[i].imag()); } break; case QUDA_STAGGERED_DSLASH: @@ -1807,7 +1781,6 @@ namespace quda { void dslashQuda(void *h_out, 
void *h_in, QudaInvertParam *inv_param, QudaParity parity) { pushProfile(profileDslash); - profileDslash.TPSTART(QUDA_PROFILE_INIT); const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise; @@ -1835,13 +1808,11 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity DiracParam diracParam; setDiracParam(diracParam, inv_param, pc); - profileDslash.TPSTOP(QUDA_PROFILE_INIT); - in = in_h; profileDslash.TPSTART(QUDA_PROFILE_COMPUTE); - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in)); + logQuda(QUDA_DEBUG_VERBOSE, "In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in)); if (inv_param->mass_normalization == QUDA_KAPPA_NORMALIZATION && (inv_param->dslash_type == QUDA_STAGGERED_DSLASH || @@ -1873,11 +1844,9 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity out_h = out; - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out)); + logQuda(QUDA_DEBUG_VERBOSE, "Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out)); - profileDslash.TPSTART(QUDA_PROFILE_FREE); delete dirac; // clean up - profileDslash.TPSTOP(QUDA_PROFILE_FREE); popVerbosity(); popProfile(); @@ -1906,7 +1875,7 @@ void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) ColorSpinorField in(cudaParam); in = in_h; - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in)); + logQuda(QUDA_DEBUG_VERBOSE, "In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in)); cudaParam.create = QUDA_NULL_FIELD_CREATE; cudaParam.location = QUDA_CUDA_FIELD_LOCATION; @@ -1938,8 +1907,7 @@ void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) ColorSpinorField out_h(cpuParam); out_h = out; - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out)); - + 
logQuda(QUDA_DEBUG_VERBOSE, "Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out)); popVerbosity(); } @@ -1967,7 +1935,7 @@ void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) ColorSpinorField in(cudaParam); in = in_h; - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in)); + logQuda(QUDA_DEBUG_VERBOSE, "In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in)); cudaParam.create = QUDA_NULL_FIELD_CREATE; ColorSpinorField out(cudaParam); @@ -2001,8 +1969,7 @@ void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) ColorSpinorField out_h(cpuParam); out_h = out; - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out)); - + logQuda(QUDA_DEBUG_VERBOSE, "Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out)); popVerbosity(); } @@ -2148,7 +2115,7 @@ void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity ColorSpinorField in(cudaParam); in = in_h; - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in)); + logQuda(QUDA_DEBUG_VERBOSE, "In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in)); cudaParam.create = QUDA_NULL_FIELD_CREATE; ColorSpinorField out(cudaParam); @@ -2175,8 +2142,7 @@ void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity ColorSpinorField out_h(cpuParam); out_h = out; - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out)); - + logQuda(QUDA_DEBUG_VERBOSE, "Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out)); popVerbosity(); } @@ -2184,7 +2150,6 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam { if (!initialized) errorQuda("QUDA not initialized"); pushProfile(profileEigensolve); - profileEigensolve.TPSTART(QUDA_PROFILE_INIT); // Transfer the inv param structure contained in 
eig_param. // This will define the operator to be eigensolved. @@ -2306,8 +2271,6 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam } } //------------------------------------------------------ - profileEigensolve.TPSTOP(QUDA_PROFILE_INIT); - // We must construct the correct Dirac operator type based on the three // options: The normal operator, the daggered operator, and if we pre // multiply by gamma5. Each combination requires a unique Dirac operator @@ -2346,11 +2309,9 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam for (int i = 0; i < n_eig; i++) host_evecs_[i] = kSpace[i]; } - profileEigensolve.TPSTART(QUDA_PROFILE_FREE); delete d; delete dSloppy; delete dPre; - profileEigensolve.TPSTOP(QUDA_PROFILE_FREE); popVerbosity(); @@ -2362,7 +2323,6 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &profile) : profile(profile) { - profile.TPSTART(QUDA_PROFILE_INIT); QudaInvertParam *param = mg_param.invert_param; // set whether we are going use native or generic blas blas_lapack::set_native(param->native_blas_lapack); @@ -2441,22 +2401,19 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &pr // cache is written out even if a long benchmarking job gets interrupted saveTuneCache(); - profile.TPSTOP(QUDA_PROFILE_INIT); } void* newMultigridQuda(QudaMultigridParam *mg_param) { profilerStart(__func__); - + pushProfile(profileInvert); pushVerbosity(mg_param->invert_param->verbosity); - profileInvert.TPSTART(QUDA_PROFILE_TOTAL); auto *mg = new multigrid_solver(*mg_param, profileInvert); - profileInvert.TPSTOP(QUDA_PROFILE_TOTAL); saveTuneCache(); popVerbosity(); - + popProfile(); profilerStop(__func__); return static_cast(mg); } @@ -2468,10 +2425,9 @@ void destroyMultigridQuda(void *mg) { void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param) { 
profilerStart(__func__); - + pushProfile(profileInvert); pushVerbosity(mg_param->invert_param->verbosity); - profileInvert.TPSTART(QUDA_PROFILE_TOTAL); profileInvert.TPSTART(QUDA_PROFILE_PREAMBLE); auto *mg = static_cast(mg_); @@ -2573,18 +2529,17 @@ void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param) saveTuneCache(); profileInvert.TPSTOP(QUDA_PROFILE_PREAMBLE); - profileInvert.TPSTOP(QUDA_PROFILE_TOTAL); popVerbosity(); - + popProfile(); profilerStop(__func__); } void dumpMultigridQuda(void *mg_, QudaMultigridParam *mg_param) { profilerStart(__func__); + pushProfile(profileInvert); pushVerbosity(mg_param->invert_param->verbosity); - profileInvert.TPSTART(QUDA_PROFILE_TOTAL); auto *mg = static_cast(mg_); checkMultigridParam(mg_param); @@ -2592,8 +2547,8 @@ void dumpMultigridQuda(void *mg_, QudaMultigridParam *mg_param) mg->mg->dumpNullVectors(); - profileInvert.TPSTOP(QUDA_PROFILE_TOTAL); popVerbosity(); + popProfile(); profilerStop(__func__); } @@ -2604,8 +2559,6 @@ deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile) if (param->inv_type != QUDA_EIGCG_INVERTER && param->inv_type != QUDA_INC_EIGCG_INVERTER) return; - profile.TPSTART(QUDA_PROFILE_INIT); - GaugeField *cudaGauge = checkGauge(param); eig_param.secs = 0; eig_param.gflops = 0; @@ -2659,16 +2612,12 @@ deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile) deflParam = new DeflationParam(eig_param, RV, *m); defl = new Deflation(*deflParam, profile); - - profile.TPSTOP(QUDA_PROFILE_INIT); } void* newDeflationQuda(QudaEigParam *eig_param) { - profileInvert.TPSTART(QUDA_PROFILE_TOTAL); + pushProfile(profileInvert); auto *defl = new deflated_solver(*eig_param, profileInvert); - - profileInvert.TPSTOP(QUDA_PROFILE_TOTAL); - + popProfile(); saveProfile(__func__); flushProfile(); return static_cast(defl); @@ -2811,17 +2760,8 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) dirac.prepare(in, out, x, b, param->solution_type); - if 
(getVerbosity() >= QUDA_VERBOSE) { - double nin = blas::norm2(*in); - double nout = blas::norm2(*out); - printfQuda("Prepared source = %g\n", nin); - printfQuda("Prepared solution = %g\n", nout); - } - - if (getVerbosity() >= QUDA_VERBOSE) { - double nin = blas::norm2(*in); - printfQuda("Prepared source post mass rescale = %g\n", nin); - } + logQuda(QUDA_VERBOSE, "Prepared source = %g\n", blas::norm2(*in)); + logQuda(QUDA_VERBOSE, "Prepared solution = %g\n", blas::norm2(*out)); // solution_type specifies *what* system is to be solved. // solve_type specifies *how* the system is to be solved. @@ -2968,7 +2908,7 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) solverParam.updateInvertParam(*param); } - if (getVerbosity() >= QUDA_VERBOSE) { printfQuda("Solution = %g\n", blas::norm2(x)); } + logQuda(QUDA_VERBOSE, "Solution = %g\n", blas::norm2(x)); profileInvert.TPSTART(QUDA_PROFILE_EPILOGUE); if (param->chrono_make_resident) { @@ -3026,8 +2966,6 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) } profileInvert.TPSTOP(QUDA_PROFILE_EPILOGUE); - profileInvert.TPSTART(QUDA_PROFILE_FREE); - if (param->use_resident_solution && !param->make_resident_solution) solutionResident.clear(); delete d; @@ -3035,14 +2973,11 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) delete dPre; delete dEig; - profileInvert.TPSTOP(QUDA_PROFILE_FREE); - - popVerbosity(); - // cache is written out even if a long benchmarking job gets interrupted saveTuneCache(); profilerStop(__func__); + popVerbosity(); popProfile(); } @@ -3112,12 +3047,13 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col */ profilerStart(__func__); + pushProfile(profileInvertMultiSrc); CommKey split_key = {param->split_grid[0], param->split_grid[1], param->split_grid[2], param->split_grid[3]}; int num_sub_partition = quda::product(split_key); if (!split_key.is_valid()) { - errorQuda("split_key = [%d,%d,%d,%d] is not valid.\n", split_key[0], 
split_key[1], split_key[2], split_key[3]); + errorQuda("split_key = [%d,%d,%d,%d] is not valid", split_key[0], split_key[1], split_key[2], split_key[3]); } if (num_sub_partition == 1) { // In this case we don't split the grid. @@ -3126,10 +3062,7 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col } else { - profileInvertMultiSrc.TPSTART(QUDA_PROFILE_TOTAL); - profileInvertMultiSrc.TPSTART(QUDA_PROFILE_INIT); - - if (gauge_param == nullptr) { errorQuda("gauge_param == nullptr.\n"); } + if (gauge_param == nullptr) { errorQuda("gauge_param == nullptr"); } // Doing the sub-partition arithmatics if (param->num_src_per_sub_partition * num_sub_partition != param->num_src) { @@ -3143,7 +3076,7 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col if (param->dslash_type == QUDA_DOMAIN_WALL_DSLASH) { pc_type = QUDA_5D_PC; } // Doesn't work for MG yet. - if (param->inv_type_precondition == QUDA_MG_INVERTER) { errorQuda("Split Grid does NOT work with MG yet."); } + if (param->inv_type_precondition == QUDA_MG_INVERTER) errorQuda("Split Grid does NOT work with MG yet"); checkInvertParam(param, _hp_x[0], _hp_b[0]); @@ -3169,14 +3102,14 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col // set up the gauge field params. 
if (!is_staggered) { // not staggered gf_param = new GaugeFieldParam(*gauge_param, h_gauge); - if (gf_param->order <= 4) { gf_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; } + if (gf_param->order <= 4) gf_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; in = GaugeField::Create(*gf_param); } else { // staggered milc_fatlink_param = new GaugeFieldParam(*gauge_param, milc_fatlinks); - if (milc_fatlink_param->order <= 4) { milc_fatlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; } + if (milc_fatlink_param->order <= 4) milc_fatlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; milc_fatlink_field = GaugeField::Create(*milc_fatlink_param); milc_longlink_param = new GaugeFieldParam(*gauge_param, milc_longlinks); - if (milc_longlink_param->order <= 4) { milc_longlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; } + if (milc_longlink_param->order <= 4) milc_longlink_param->ghostExchange = QUDA_GHOST_EXCHANGE_NO; milc_longlink_field = GaugeField::Create(*milc_longlink_param); } @@ -3200,13 +3133,14 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col } // Make the gauge param dimensions larger - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { - printfQuda("Spliting the grid into sub-partitions: (%2d,%2d,%2d,%2d) / (%2d,%2d,%2d,%2d).\n", comm_dim(0), - comm_dim(1), comm_dim(2), comm_dim(3), split_key[0], split_key[1], split_key[2], split_key[3]); - } + logQuda(QUDA_DEBUG_VERBOSE, + "Spliting the grid into sub-partitions: (%2d,%2d,%2d,%2d) / (%2d,%2d,%2d,%2d)\n", + comm_dim(0), comm_dim(1), comm_dim(2), comm_dim(3), + split_key[0], split_key[1], split_key[2], split_key[3]); + for (int d = 0; d < CommKey::n_dim; d++) { if (comm_dim(d) % split_key[d] != 0) { - errorQuda("Split not possible: %2d %% %2d != 0.", comm_dim(d), split_key[d]); + errorQuda("Split not possible: %2d %% %2d != 0", comm_dim(d), split_key[d]); } if (!is_staggered) { gf_param->x[d] *= split_key[d]; @@ -3283,7 +3217,6 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, 
QudaInvertParam *param, // col quda::split_field(*collected_milc_longlink_field, v_g, split_key); } - profileInvertMultiSrc.TPSTOP(QUDA_PROFILE_INIT); profileInvertMultiSrc.TPSTART(QUDA_PROFILE_PREAMBLE); comm_barrier(); @@ -3309,11 +3242,10 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col comm_barrier(); profileInvertMultiSrc.TPSTOP(QUDA_PROFILE_PREAMBLE); - profileInvertMultiSrc.TPSTOP(QUDA_PROFILE_TOTAL); // Load gauge field after pushing the split communicator so the comm buffers, etc are setup according to // the split topology. - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loading gauge field...\n"); } + logQuda(QUDA_DEBUG_VERBOSE, "Split grid loading gauge field...\n"); if (!is_staggered) { loadGaugeQuda(collected_gauge->data(), gauge_param); } else { @@ -3321,24 +3253,23 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col loadFatLongGaugeQuda(param, gauge_param, collected_milc_fatlink_field->data(), collected_milc_longlink_field->data()); } - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loaded gauge field...\n"); } + logQuda(QUDA_DEBUG_VERBOSE, "Split grid loaded gauge field...\n"); if (param->dslash_type == QUDA_CLOVER_WILSON_DSLASH || param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH || param->dslash_type == QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH) { - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loading clover field...\n"); } + logQuda(QUDA_DEBUG_VERBOSE, "Split grid loading clover field...\n"); if (collected_clover) { loadCloverQuda(collected_clover->data(false), collected_clover->data(true), param); } else { loadCloverQuda(nullptr, nullptr, param); } - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Split grid loaded clover field...\n"); } + logQuda(QUDA_DEBUG_VERBOSE, "Split grid loaded clover field...\n"); } for (int n = 0; n < param->num_src_per_sub_partition; n++) { op(_collect_x[n]->data(), _collect_b[n]->data(), 
param, args...); } - profileInvertMultiSrc.TPSTART(QUDA_PROFILE_TOTAL); profileInvertMultiSrc.TPSTART(QUDA_PROFILE_EPILOGUE); push_communicator(default_comm_key); updateR(); @@ -3376,7 +3307,6 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col if (collected_clover) { delete collected_clover; } profileInvertMultiSrc.TPSTOP(QUDA_PROFILE_EPILOGUE); - profileInvertMultiSrc.TPSTOP(QUDA_PROFILE_TOTAL); // Restore the gauge field if (!is_staggered) { @@ -3391,6 +3321,7 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col } } + popProfile(); profilerStop(__func__); } @@ -3453,8 +3384,6 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) pushProfile(profileMulti); profilerStart(__func__); - profileMulti.TPSTART(QUDA_PROFILE_INIT); - if (!initialized) errorQuda("QUDA not initialized"); checkInvertParam(param, hp_x[0], hp_b); @@ -3558,7 +3487,6 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) h_x[i] = std::make_unique(cpuParam); } - profileMulti.TPSTOP(QUDA_PROFILE_INIT); // Now I need a colorSpinorParam for the device ColorSpinorParam cudaParam(cpuParam, *param, QUDA_CUDA_FIELD_LOCATION); // This setting will download a host vector @@ -3566,7 +3494,6 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) cudaParam.field = &h_b; ColorSpinorField b(cudaParam); // Creates b and downloads h_b to it - profileMulti.TPSTART(QUDA_PROFILE_INIT); // Create the solution fields filled with zero cudaParam.create = QUDA_ZERO_FIELD_CREATE; @@ -3586,8 +3513,6 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) std::vector &x = solutionResident; std::vector p; - profileMulti.TPSTOP(QUDA_PROFILE_INIT); - profileMulti.TPSTART(QUDA_PROFILE_PREAMBLE); // Check source norms @@ -3634,10 +3559,8 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) if (param->compute_true_res) { // check each shift has the 
desired tolerance and use sequential CG to refine - profileMulti.TPSTART(QUDA_PROFILE_INIT); cudaParam.create = QUDA_ZERO_FIELD_CREATE; ColorSpinorField r(cudaParam); - profileMulti.TPSTOP(QUDA_PROFILE_INIT); QudaInvertParam refineparam = *param; refineparam.cuda_prec_sloppy = param->cuda_prec_refinement_sloppy; Dirac &dirac = *d; @@ -3667,9 +3590,9 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) const double refine_tol = (param->tol_offset[i] == 0.0 ? iter_tol : param->tol_offset[i]); // refine if either L2 or heavy quark residual tolerances have not been met, only if desired residual is > 0 if (param->true_res_offset[i] > refine_tol || rsd_hq > tol_hq) { - if (getVerbosity() >= QUDA_SUMMARIZE) - printfQuda("Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n", - i, param->true_res_offset[i], param->tol_offset[i], rsd_hq, tol_hq); + logQuda(QUDA_SUMMARIZE, + "Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n", + i, param->true_res_offset[i], param->tol_offset[i], rsd_hq, tol_hq); // for staggered the shift is just a change in mass term (FIXME: for twisted mass also) if (param->dslash_type == QUDA_ASQTAD_DSLASH || @@ -3767,8 +3690,7 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) blas::ax(sqrt(nb), x[i]); } - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Solution %d = %g\n", i, blas::norm2(x[i])); - + logQuda(QUDA_VERBOSE, "Solution %d = %g\n", i, blas::norm2(x[i])); if (!param->make_resident_solution) *h_x[i] = x[i]; } @@ -3778,19 +3700,16 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) profileMulti.TPSTOP(QUDA_PROFILE_EPILOGUE); - profileMulti.TPSTART(QUDA_PROFILE_FREE); delete d; delete dSloppy; delete dPre; delete dRefine; - profileMulti.TPSTOP(QUDA_PROFILE_FREE); - - popVerbosity(); // cache is written out even if a long benchmarking job gets interrupted saveTuneCache(); profilerStop(__func__); + 
popVerbosity(); popProfile(); } @@ -3883,7 +3802,6 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param) gParam.setPrecision(param->cuda_prec, true); gParam.create = QUDA_NULL_FIELD_CREATE; GaugeField cudaInLink(gParam); - profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT); cudaInLink.copy(cpuInLink); cudaInLinkEx = createExtendedGauge(cudaInLink, R, profileGaussianSmear); @@ -3903,7 +3821,7 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param) freeUniqueGaugeQuda(QUDA_SMEARED_LINKS); gaugeSmeared = new GaugeField(gsParam); - + computeTwoLink(*gaugeSmeared, *cudaInLinkEx); gaugeSmeared->exchangeGhost(); @@ -4206,7 +4124,6 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi GaugeField cudaForce(gParam); GaugeField *cudaForce_[2] = {&cudaForce}; - profileStaggeredForce.TPSTART(QUDA_PROFILE_INIT); ColorSpinorParam qParam; qParam.location = QUDA_CUDA_FIELD_LOCATION; qParam.nColor = 3; @@ -4221,7 +4138,6 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi qParam.x[4] = 1; qParam.create = QUDA_NULL_FIELD_CREATE; qParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; - profileStaggeredForce.TPSTOP(QUDA_PROFILE_INIT); // resident gauge field is required if (!gauge_param->use_resident_gauge || !gaugePrecise) errorQuda("Resident gauge field is required"); @@ -4233,8 +4149,6 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi gauge_param->staggered_phase_type, gaugePrecise->StaggeredPhase()); } - profileStaggeredForce.TPSTART(QUDA_PROFILE_INIT); - const int nvector = inv_param->num_offset; std::vector X(nvector); for (int i=0; iuse_resident_solution) solutionResident.clear(); #endif delete dirac; - profileStaggeredForce.TPSTOP(QUDA_PROFILE_FREE); // compute quark-field outer product for (int i=0; imake_resident_mom && !gauge_param->use_resident_mom) std::exchange(momResident, cudaMom); else momResident = GaugeField(); - 
profileStaggeredForce.TPSTART(QUDA_PROFILE_FREE); for (int i=0; i force_coeff(nvector); // loop over different quark fields for(int i=0; iverbosity); if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded"); - pushVerbosity(inv_param->verbosity); if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param); GaugeField *precise = nullptr; if (gaugeSmeared != nullptr) { - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Wuppertal smearing done with gaugeSmeared\n"); + logQuda(QUDA_VERBOSE, "Wuppertal smearing done with gaugeSmeared\n"); GaugeFieldParam gParam(*gaugePrecise); gParam.create = QUDA_NULL_FIELD_CREATE; precise = new GaugeField(gParam); copyExtendedGauge(*precise, *gaugeSmeared, QUDA_CUDA_FIELD_LOCATION); precise->exchangeGhost(); } else { - if (getVerbosity() >= QUDA_VERBOSE) - printfQuda("Wuppertal smearing done with gaugePrecise\n"); + logQuda(QUDA_VERBOSE, "Wuppertal smearing done with gaugePrecise\n"); precise = gaugePrecise; } @@ -5061,11 +4959,7 @@ void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, ColorSpinorField in(cudaParam); in = in_h; - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { - double cpu = blas::norm2(in_h); - double gpu = blas::norm2(in); - printfQuda("In CPU %e CUDA %e\n", cpu, gpu); - } + logQuda(QUDA_DEBUG_VERBOSE, "In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in)); cudaParam.create = QUDA_NULL_FIELD_CREATE; ColorSpinorField out(cudaParam); @@ -5085,10 +4979,7 @@ void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, for (unsigned int i = 0; i < n_steps; i++) { if (i) in = out; ApplyLaplace(out, in, *precise, 3, a, b, in, parity, false, comm_dim, profileWuppertal); - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { - double norm = blas::norm2(out); - printfQuda("Step %d, vector norm %e\n", i, norm); - } + logQuda(QUDA_DEBUG_VERBOSE, "Step %d, vector norm %e\n", i, blas::norm2(out)); } cpuParam.v = h_out; @@ -5096,34 +4987,29 @@ void 
performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, ColorSpinorField out_h(cpuParam); out_h = out; - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { - double cpu = blas::norm2(out_h); - double gpu = blas::norm2(out); - printfQuda("Out CPU %e CUDA %e\n", cpu, gpu); - } + logQuda(QUDA_DEBUG_VERBOSE, "Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out)); - if (gaugeSmeared != nullptr) - delete precise; + if (gaugeSmeared != nullptr) delete precise; popVerbosity(); + popProfile(); } - + void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_param) { if (smear_param->n_steps == 0) return; pushProfile(profileGaussianSmear); - profileGaussianSmear.TPSTART(QUDA_PROFILE_INIT); - + QudaInvertParam *inv_param = smear_param->inv_param; if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded"); - + if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param); if ( gaugeSmeared == nullptr || smear_param->compute_2link != 0 ) { - - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Gaussian smearing done with gaugeSmeared\n"); + + logQuda(QUDA_VERBOSE, "Gaussian smearing done with gaugeSmeared\n"); freeUniqueGaugeQuda(QUDA_SMEARED_LINKS); GaugeFieldParam gParam(*gaugePrecise); @@ -5137,14 +5023,14 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par gParam.pad = gParam.pad*gParam.nFace; // gaugeSmeared = new GaugeField(gParam); - + GaugeField *two_link_ext = createExtendedGauge(*gaugePrecise, R, profileGauge);//aux field - + computeTwoLink(*gaugeSmeared, *two_link_ext); - + gaugeSmeared->exchangeGhost(); - - delete two_link_ext; + + delete two_link_ext; } if (!initialized) errorQuda("QUDA not initialized"); @@ -5152,13 +5038,13 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printQudaInvertParam(inv_param); } checkInvertParam(inv_param); - + // Create device side ColorSpinorField vectors and to pass to 
the // compute function. const lat_dim_t X = gaugeSmeared->X(); - + inv_param->dslash_type = QUDA_ASQTAD_DSLASH; - + ColorSpinorParam cpuParam(h_in, *inv_param, X, QUDA_MAT_SOLUTION, QUDA_CPU_FIELD_LOCATION); cpuParam.nSpin = 1; // QUDA style pointer for host data. @@ -5172,7 +5058,7 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par ColorSpinorField in(cudaParam); ColorSpinorField out(cudaParam); ColorSpinorField temp1(cudaParam); - + // Create the smearing operator //------------------------------------------------------ Dirac *d = nullptr; @@ -5197,10 +5083,9 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par errorQuda("Gauge precision %d does not match requested precision %d\n", diracParam.gauge->Precision(), inv_param->cuda_prec); // d = Dirac::create(diracParam); // create the Dirac operator - + Dirac &dirac = *d; DiracM qsmear_op(dirac); - profileGaussianSmear.TPSTOP(QUDA_PROFILE_INIT); // Copy host data to device in = in_h; @@ -5208,20 +5093,17 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par const double ftmp = -(smear_param->width*smear_param->width)/(4.0*smear_param->n_steps*4.0); /* Extra 4 to compensate for stride 2 */ // Scale up the source to prevent underflow profileGaussianSmear.TPSTART(QUDA_PROFILE_COMPUTE); - - const double msq = 1. / ftmp; + + const double msq = 1. 
/ ftmp; const double a = inv_param->laplace3D * 2.0 + msq; const QudaParity parity = QUDA_INVALID_PARITY; for (int i = 0; i < smear_param->n_steps; i++) { if (i > 0) std::swap(in, out); blas::ax(ftmp, in); blas::axpy(a, in, temp1); - + qsmear_op.Expose()->SmearOp(out, in, a, 0.0, smear_param->t0, parity); - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { - double norm = blas::norm2(out); - printfQuda("Step %d, vector norm %e\n", i, norm); - } + logQuda(QUDA_DEBUG_VERBOSE, "Step %d, vector norm %e\n", i, blas::norm2(out)); blas::xpay(temp1, -1.0, out); blas::zero(temp1); } @@ -5231,12 +5113,8 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par // Copy device data to host. in_h = out; - profileGaussianSmear.TPSTART(QUDA_PROFILE_FREE); - - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Finished 2link Gaussian smearing.\n"); - + logQuda(QUDA_VERBOSE, "Finished 2link Gaussian smearing.\n"); delete d; - profileGaussianSmear.TPSTOP(QUDA_PROFILE_FREE); smear_param->gflops = dirac.Flops(); @@ -5263,9 +5141,7 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable int measurement_n = 0; // The nth measurement to take gaugeObservablesQuda(&obs_param[measurement_n]); - if (getVerbosity() >= QUDA_SUMMARIZE) { - printfQuda("Q charge at step %03d = %+.16e\n", 0, obs_param[measurement_n].qcharge); - } + logQuda(QUDA_SUMMARIZE, "Q charge at step %03d = %+.16e\n", 0, obs_param[measurement_n].qcharge); for (unsigned int i = 0; i < smear_param->n_steps; i++) { switch (smear_param->smear_type) { @@ -5280,9 +5156,7 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable if ((i + 1) % smear_param->meas_interval == 0) { measurement_n++; gaugeObservablesQuda(&obs_param[measurement_n]); - if (getVerbosity() >= QUDA_SUMMARIZE) { - printfQuda("Q charge at step %03d = %+.16e\n", i + 1, obs_param[measurement_n].qcharge); - } + logQuda(QUDA_SUMMARIZE, "Q charge at step %03d = %+.16e\n", i + 1, 
obs_param[measurement_n].qcharge); } } @@ -5314,11 +5188,10 @@ void performWFlowQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam gaugeObservables(in, obs_param[measurement_n]); - if (getVerbosity() >= QUDA_SUMMARIZE) { - printfQuda("flow t, plaquette, E_tot, E_spatial, E_temporal, Q charge\n"); - printfQuda("%le %.16e %+.16e %+.16e %+.16e %+.16e\n", 0.0, obs_param[0].plaquette[0], obs_param[0].energy[0], - obs_param[0].energy[1], obs_param[0].energy[2], obs_param[0].qcharge); - } + logQuda(QUDA_SUMMARIZE, "flow t, plaquette, E_tot, E_spatial, E_temporal, Q charge\n"); + logQuda(QUDA_SUMMARIZE, "%le %.16e %+.16e %+.16e %+.16e %+.16e\n", 0.0, + obs_param[0].plaquette[0], obs_param[0].energy[0], + obs_param[0].energy[1], obs_param[0].energy[2], obs_param[0].qcharge); for (unsigned int i = 0; i < smear_param->n_steps; i++) { // Perform W1, W2, and Vt Wilson Flow steps as defined in @@ -5329,12 +5202,10 @@ void performWFlowQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam if ((i + 1) % smear_param->meas_interval == 0) { measurement_n++; // increment measurements. gaugeObservables(out, obs_param[measurement_n]); - if (getVerbosity() >= QUDA_SUMMARIZE) { - printfQuda("%le %.16e %+.16e %+.16e %+.16e %+.16e\n", smear_param->epsilon * (i + 1), - obs_param[measurement_n].plaquette[0], obs_param[measurement_n].energy[0], - obs_param[measurement_n].energy[1], obs_param[measurement_n].energy[2], - obs_param[measurement_n].qcharge); - } + logQuda(QUDA_SUMMARIZE, "%le %.16e %+.16e %+.16e %+.16e %+.16e\n", smear_param->epsilon * (i + 1), + obs_param[measurement_n].plaquette[0], obs_param[measurement_n].energy[0], + obs_param[measurement_n].energy[1], obs_param[measurement_n].energy[2], + obs_param[measurement_n].qcharge); } } @@ -5430,8 +5301,6 @@ void contractQuda(const void *hp_x, const void *hp_y, void *h_result, const Quda // DMH: Easiest way to construct ColorSpinorField? 
Do we require the user // to declare and fill and invert_param, or can it just be hacked?. - profileContract.TPSTART(QUDA_PROFILE_INIT); - // wrap CPU host side pointers lat_dim_t X_ = {X[0], X[1], X[2], X[3]}; ColorSpinorParam cpuParam((void *)hp_x, *param, X_, false, param->input_location); @@ -5454,7 +5323,6 @@ void contractQuda(const void *hp_x, const void *hp_y, void *h_result, const Quda size_t data_bytes = x[0].Volume() * x[0].Nspin() * x[0].Nspin() * 2 * x[0].Precision(); void *d_result = pool_device_malloc(data_bytes); - profileContract.TPSTOP(QUDA_PROFILE_INIT); x[0] = h_x; y[0] = h_y; From a61fbbaf9e4e32e9e21be2ed1c93f1dcb84ecb2d Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 30 Aug 2023 10:54:12 -0700 Subject: [PATCH 36/99] Fix issues with staggered_invert_test related to gauge-field unification --- tests/staggered_invert_test.cpp | 249 +++++++++++++------------------- 1 file changed, 102 insertions(+), 147 deletions(-) diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index de60f45d41..ea5aab17fd 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -102,60 +102,8 @@ void display_test_info() dimPartitioned(3)); } -int main(int argc, char **argv) +void test(int argc, char **argv) { - setQudaDefaultMgTestParams(); - // Parse command line options - auto app = make_app(); - add_eigen_option_group(app); - add_deflation_option_group(app); - add_multigrid_option_group(app); - add_comms_option_group(app); - CLI::TransformPairs test_type_map {{"full", 0}, {"full_ee_prec", 1}, {"full_oo_prec", 2}, {"even", 3}, - {"odd", 4}, {"mcg_even", 5}, {"mcg_odd", 6}}; - app->add_option("--test", test_type, "Test method")->transform(CLI::CheckedTransformer(test_type_map)); - try { - app->parse(argc, argv); - } catch (const CLI::ParseError &e) { - return app->exit(e); - } - setVerbosity(verbosity); - if (!inv_multigrid) solve_type = QUDA_INVALID_SOLVE; - - if (inv_deflate && inv_multigrid) { - 
printfQuda("Error: Cannot use both deflation and multigrid preconditioners on top level solve.\n"); - exit(0); - } - - // Set values for precisions via the command line. - setQudaPrecisions(); - - // initialize QMP/MPI, QUDA comms grid and RNG (host_utils.cpp) - initComms(argc, argv, gridsize_from_cmdline); - - initRand(); - - // Only these fermions are supported in this file. Ensure a reasonable default, - // ensure that the default is improved staggered - if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) { - printfQuda("dslash_type %s not supported, defaulting to %s\n", get_dslash_str(dslash_type), - get_dslash_str(QUDA_ASQTAD_DSLASH)); - dslash_type = QUDA_ASQTAD_DSLASH; - } - - // Need to add support for LAPLACE MG? - if (inv_multigrid) { - if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH) { - printfQuda("dslash_type %s not supported for multigrid preconditioner\n", get_dslash_str(dslash_type)); - exit(0); - } - } - - // Deduce operator, solution, and operator preconditioning types - if (!inv_multigrid) setQudaStaggeredInvTestParams(); - - display_test_info(); - // Set QUDA internal parameters QudaGaugeParam gauge_param = newQudaGaugeParam(); QudaInvertParam inv_param = newQudaInvertParam(); @@ -167,11 +115,7 @@ int main(int argc, char **argv) QudaEigParam mg_eig_param[mg_levels]; // params related to split grid. 
- inv_param.split_grid[0] = grid_partition[0]; - inv_param.split_grid[1] = grid_partition[1]; - inv_param.split_grid[2] = grid_partition[2]; - inv_param.split_grid[3] = grid_partition[3]; - + for (int i = 0; i < 4; i++) inv_param.split_grid[i] = grid_partition[i]; int num_sub_partition = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3]; bool use_split_grid = num_sub_partition > 1; @@ -205,9 +149,6 @@ int main(int argc, char **argv) inv_param.eig_param = nullptr; } - // This must be before the FaceBuffer is created (this is because it allocates pinned memory - FIXME) - initQuda(device_ordinal); - setDims(gauge_param.X); // Hack: use the domain wall dimensions so we may use the 5th dim for multi indexing dw_setDims(gauge_param.X, 1); @@ -215,29 +156,35 @@ int main(int argc, char **argv) // Staggered Gauge construct START //----------------------------------------------------------------------------------- // Allocate host staggered gauge fields - void *qdp_inlink[4] = {nullptr, nullptr, nullptr, nullptr}; - void *qdp_fatlink[4] = {nullptr, nullptr, nullptr, nullptr}; - void *qdp_longlink[4] = {nullptr, nullptr, nullptr, nullptr}; - void *milc_fatlink = nullptr; - void *milc_longlink = nullptr; - GaugeField *cpuFat = nullptr; - GaugeField *cpuLong = nullptr; - - for (int dir = 0; dir < 4; dir++) { - qdp_inlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - qdp_fatlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - qdp_longlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - } - milc_fatlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); - milc_longlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); - - // For load, etc + gauge_param.type = (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) ? 
+ QUDA_SU3_LINKS : + QUDA_ASQTAD_FAT_LINKS; gauge_param.reconstruct = QUDA_RECONSTRUCT_NO; + gauge_param.location = QUDA_CPU_FIELD_LOCATION; + GaugeFieldParam cpuParam(gauge_param); + cpuParam.create = QUDA_NULL_FIELD_CREATE; + cpuParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; + cpuParam.order = QUDA_QDP_GAUGE_ORDER; + GaugeField cpuIn = GaugeField(cpuParam); + GaugeField cpuFatQDP = GaugeField(cpuParam); + cpuParam.order = QUDA_MILC_GAUGE_ORDER; + GaugeField cpuFatMILC = GaugeField(cpuParam); + + cpuParam.link_type = QUDA_ASQTAD_LONG_LINKS; + cpuParam.nFace = 3; + cpuParam.order = QUDA_QDP_GAUGE_ORDER; + GaugeField cpuLongQDP = GaugeField(cpuParam); + cpuParam.order = QUDA_MILC_GAUGE_ORDER; + GaugeField cpuLongMILC = GaugeField(cpuParam); + + void* qdp_inlink[4] = {cpuIn.data(0), cpuIn.data(1), cpuIn.data(2), cpuIn.data(3)}; + void* qdp_fatlink[4] = {cpuFatQDP.data(0), cpuFatQDP.data(1), cpuFatQDP.data(2), cpuFatQDP.data(3)}; + void* qdp_longlink[4] = {cpuLongQDP.data(0), cpuLongQDP.data(1), cpuLongQDP.data(2), cpuLongQDP.data(3)}; constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv); // Reorder gauge fields to MILC order - reorderQDPtoMILC(milc_fatlink, qdp_fatlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); - reorderQDPtoMILC(milc_longlink, qdp_longlink, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); + cpuFatMILC = cpuFatQDP; + cpuLongMILC = cpuLongQDP; // Compute plaquette. Routine is aware that the gauge fields already have the phases on them. // This needs to be called before `loadFatLongGaugeQuda` because this routine also loads the @@ -252,23 +199,14 @@ int main(int argc, char **argv) printfQuda("Computed fat link plaquette is %e (spatial = %e, temporal = %e)\n", plaq[0], plaq[1], plaq[2]); } - // Create ghost gauge fields in case of multi GPU builds. - gauge_param.type = (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) ? 
- QUDA_SU3_LINKS : - QUDA_ASQTAD_FAT_LINKS; - gauge_param.reconstruct = QUDA_RECONSTRUCT_NO; - gauge_param.location = QUDA_CPU_FIELD_LOCATION; - - GaugeFieldParam cpuFatParam(gauge_param, milc_fatlink); - cpuFatParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; - cpuFat = GaugeField::Create(cpuFatParam); + loadFatLongGaugeQuda(cpuFatMILC.data(), cpuLongMILC.data(), gauge_param); - gauge_param.type = QUDA_ASQTAD_LONG_LINKS; - GaugeFieldParam cpuLongParam(gauge_param, milc_longlink); - cpuLongParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD; - cpuLong = GaugeField::Create(cpuLongParam); - - loadFatLongGaugeQuda(milc_fatlink, milc_longlink, gauge_param); + // now copy back to QDP aliases, since these are used for the reference dslash + cpuFatQDP = cpuFatMILC; + cpuLongQDP = cpuLongMILC; + // ensure QDP alias has exchanged ghosts + cpuFatQDP.exchangeGhost(); + cpuLongQDP.exchangeGhost(); // Staggered Gauge construct END //----------------------------------------------------------------------------------- @@ -283,33 +221,27 @@ int main(int argc, char **argv) // Staggered vector construct START //----------------------------------------------------------------------------------- - std::vector in; - std::vector out; - quda::ColorSpinorField *ref; - quda::ColorSpinorField *tmp; + std::vector in(Nsrc); + std::vector out(Nsrc); quda::ColorSpinorParam cs_param; constructStaggeredTestSpinorParam(&cs_param, &inv_param, &gauge_param); for (int k = 0; k < Nsrc; k++) { - in.emplace_back(quda::ColorSpinorField::Create(cs_param)); - out.emplace_back(quda::ColorSpinorField::Create(cs_param)); + in[k] = quda::ColorSpinorField(cs_param); + out[k] = quda::ColorSpinorField(cs_param); } - ref = quda::ColorSpinorField::Create(cs_param); - tmp = quda::ColorSpinorField::Create(cs_param); + ColorSpinorField ref(cs_param); + ColorSpinorField tmp(cs_param); // Staggered vector construct END //----------------------------------------------------------------------------------- // Prepare rng - auto *rng 
= new quda::RNG(*ref, 1234); + quda::RNG rng(ref, 1234); // Performance measuring std::vector time(Nsrc); std::vector gflops(Nsrc); std::vector iter(Nsrc); - // Pointers for split grid tests - std::vector _h_b(Nsrc, nullptr); - std::vector _h_x(Nsrc, nullptr); - // QUDA invert test //---------------------------------------------------------------------------- @@ -320,17 +252,14 @@ int main(int argc, char **argv) // case 3: // even parity solution, solving EVEN system // case 4: // odd parity solution, solving ODD system - if (multishift != 1) { - printfQuda("Multishift not supported for test %d\n", test_type); - exit(0); - } + if (multishift != 1) errorQuda("Multishift not supported for test %d\n", test_type); - for (int k = 0; k < Nsrc; k++) { quda::spinorNoise(*in[k], *rng, QUDA_NOISE_UNIFORM); } + for (int k = 0; k < Nsrc; k++) { quda::spinorNoise(in[k], rng, QUDA_NOISE_UNIFORM); } if (!use_split_grid) { for (int k = 0; k < Nsrc; k++) { if (inv_deflate) eig_param.preserve_deflation = k < Nsrc - 1 ? 
QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - invertQuda(out[k]->data(), in[k]->data(), &inv_param); + invertQuda(out[k].data(), in[k].data(), &inv_param); time[k] = inv_param.secs; gflops[k] = inv_param.gflops / inv_param.secs; iter[k] = inv_param.iter; @@ -341,13 +270,13 @@ int main(int argc, char **argv) std::vector _hp_x(Nsrc); std::vector _hp_b(Nsrc); for (int k = 0; k < Nsrc; k++) { - _hp_x[k] = out[k]->data(); - _hp_b[k] = in[k]->data(); + _hp_x[k] = out[k].data(); + _hp_b[k] = in[k].data(); } inv_param.num_src = Nsrc; inv_param.num_src_per_sub_partition = Nsrc / num_sub_partition; - invertMultiSrcStaggeredQuda(_hp_x.data(), _hp_b.data(), &inv_param, (void *)milc_fatlink, (void *)milc_longlink, - &gauge_param); + invertMultiSrcStaggeredQuda(_hp_x.data(), _hp_b.data(), &inv_param, cpuFatMILC.data(), + cpuLongMILC.data(), &gauge_param); quda::comm_allreduce_int(inv_param.iter); inv_param.iter /= comm_size() / num_sub_partition; quda::comm_allreduce_sum(inv_param.gflops); @@ -359,7 +288,7 @@ int main(int argc, char **argv) for (int k = 0; k < Nsrc; k++) { if (verify_results) - verifyStaggeredInversion(*tmp, *ref, *in[k], *out[k], mass, *cpuFat, *cpuLong, gauge_param, inv_param, 0); + verifyStaggeredInversion(tmp, ref, in[k], out[k], mass, cpuFatQDP, cpuLongQDP, gauge_param, inv_param, 0); } } else if (test_type == 5 || test_type == 6) { // case 5: // multi mass CG, even parity solution, solving EVEN system @@ -403,8 +332,8 @@ int main(int argc, char **argv) } for (int k = 0; k < Nsrc; k++) { - quda::spinorNoise(*in[k], *rng, QUDA_NOISE_UNIFORM); - invertMultiShiftQuda((void **)outArray.data(), in[k]->data(), &inv_param); + quda::spinorNoise(in[k], rng, QUDA_NOISE_UNIFORM); + invertMultiShiftQuda((void **)outArray.data(), in[k].data(), &inv_param); time[k] = inv_param.secs; gflops[k] = inv_param.gflops / inv_param.secs; @@ -414,7 +343,7 @@ int main(int argc, char **argv) for (int i = 0; i < multishift; i++) { printfQuda("%dth solution: mass=%f, ", i, masses[i]); - 
verifyStaggeredInversion(*tmp, *ref, *in[k], qudaOutArray[i], masses[i], *cpuFat, *cpuLong, gauge_param, inv_param, i); + verifyStaggeredInversion(tmp, ref, in[k], qudaOutArray[i], masses[i], cpuFatQDP, cpuLongQDP, gauge_param, inv_param, i); } } } else { @@ -424,39 +353,65 @@ int main(int argc, char **argv) // Compute timings if (Nsrc > 1 && !use_split_grid) performanceStats(time, gflops, iter); - // Free RNG - delete rng; - // Free the multigrid solver if (inv_multigrid) destroyMultigridQuda(mg_preconditioner); +} + +int main(int argc, char **argv) +{ + setQudaDefaultMgTestParams(); + // Parse command line options + auto app = make_app(); + add_eigen_option_group(app); + add_deflation_option_group(app); + add_multigrid_option_group(app); + add_comms_option_group(app); + CLI::TransformPairs test_type_map {{"full", 0}, {"full_ee_prec", 1}, {"full_oo_prec", 2}, {"even", 3}, + {"odd", 4}, {"mcg_even", 5}, {"mcg_odd", 6}}; + app->add_option("--test", test_type, "Test method")->transform(CLI::CheckedTransformer(test_type_map)); + try { + app->parse(argc, argv); + } catch (const CLI::ParseError &e) { + return app->exit(e); + } + setVerbosity(verbosity); + if (!inv_multigrid) solve_type = QUDA_INVALID_SOLVE; - // Clean up gauge fields - for (int dir = 0; dir < 4; dir++) { - host_free(qdp_inlink[dir]); - host_free(qdp_fatlink[dir]); - host_free(qdp_longlink[dir]); + if (inv_deflate && inv_multigrid) { + errorQuda("Error: Cannot use both deflation and multigrid preconditioners on top level solve"); } - host_free(milc_fatlink); - host_free(milc_longlink); - if (cpuFat != nullptr) { - delete cpuFat; - cpuFat = nullptr; + // Set values for precisions via the command line. + setQudaPrecisions(); + + // initialize QMP/MPI, QUDA comms grid and RNG (host_utils.cpp) + initComms(argc, argv, gridsize_from_cmdline); + + initRand(); + + // Only these fermions are supported in this file. 
Ensure a reasonable default, + // ensure that the default is improved staggered + if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH && dslash_type != QUDA_LAPLACE_DSLASH) { + printfQuda("dslash_type %s not supported, defaulting to %s\n", get_dslash_str(dslash_type), + get_dslash_str(QUDA_ASQTAD_DSLASH)); + dslash_type = QUDA_ASQTAD_DSLASH; } - if (cpuLong != nullptr) { - delete cpuLong; - cpuLong = nullptr; + + // Need to add support for LAPLACE MG? + if (inv_multigrid) { + if (dslash_type != QUDA_STAGGERED_DSLASH && dslash_type != QUDA_ASQTAD_DSLASH) { + errorQuda("dslash_type %s not supported for multigrid preconditioner", get_dslash_str(dslash_type)); + } } - for (auto in_vec : in) { delete in_vec; } - for (auto out_vec : out) { delete out_vec; } - delete ref; - delete tmp; + // Deduce operator, solution, and operator preconditioning types + if (!inv_multigrid) setQudaStaggeredInvTestParams(); - if (use_split_grid) { - for (auto p : _h_b) { delete p; } - for (auto p : _h_x) { delete p; } - } + display_test_info(); + + initQuda(device_ordinal); + + test(argc, argv); // Finalize the QUDA library endQuda(); From 3963f6329ef8af6d3cc56d1ce6b44859bdf77dbb Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 30 Aug 2023 10:58:32 -0700 Subject: [PATCH 37/99] Pushing a profile onto the stack is now handled using an auxiliary container allowing us to use RAII, resulting in auto-popping when the container goes out of scope --- include/timer.h | 17 +++- lib/interface_quda.cpp | 182 +++++++++++++++-------------------------- lib/timer.cpp | 5 +- 3 files changed, 82 insertions(+), 122 deletions(-) diff --git a/include/timer.h b/include/timer.h index b819b81bb2..2de1829c18 100644 --- a/include/timer.h +++ b/include/timer.h @@ -229,10 +229,21 @@ namespace quda { }; - void pushProfile(TimeProfile &profile); - - void popProfile(); + /** + @brief Container that we use for pushing a profile onto the + profile stack. 
While this object is in scope it will exist on + the profile stack, and be popped when its destructor is called. + */ + struct pushProfile { + TimeProfile &profile; + pushProfile(TimeProfile &profile); + virtual ~pushProfile(); + }; + /** + @brief Return a reference to the present profile at the top of + the stack + */ TimeProfile& getProfile(); } // namespace quda diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 19a97983b0..0dbe006026 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -429,7 +429,7 @@ void initQudaDevice(int dev) initialized = true; profileInit2End.TPSTART(QUDA_PROFILE_TOTAL); - pushProfile(profileInit); + auto profile = pushProfile(profileInit); profileInit.TPSTART(QUDA_PROFILE_INIT); #ifdef GITVERSION @@ -464,7 +464,6 @@ void initQudaDevice(int dev) } profileInit.TPSTOP(QUDA_PROFILE_INIT); - popProfile(); } /* @@ -472,7 +471,7 @@ void initQudaDevice(int dev) */ void initQudaMemory() { - pushProfile(profileInit); + auto profile = pushProfile(profileInit); profileInit.TPSTART(QUDA_PROFILE_INIT); if (!comms_initialized) init_default_comms(); @@ -496,7 +495,6 @@ void initQudaMemory() for (int d=0; d<4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d)); profileInit.TPSTOP(QUDA_PROFILE_INIT); - pushProfile(profileInit); } void updateR() @@ -556,7 +554,7 @@ void freeUniqueGaugeUtility(GaugeField *&precise, GaugeField *&sloppy, GaugeFiel void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) { - pushProfile(profileGauge); + auto profile = pushProfile(profileGauge); checkGaugeParam(param); if (!initialized) errorQuda("QUDA not initialized"); @@ -575,7 +573,6 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) logQuda(QUDA_VERBOSE, "Gauge field unchanged - using cached gauge field %lu\n", checksum); delete in; invalidate_clover = false; - popProfile(); return; } checksum = in_checksum; @@ -631,7 +628,6 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) delete precise; delete in; - 
popProfile(); return; } @@ -749,13 +745,11 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) // Use the static R (which is defined at the very beginning of lib/interface_quda.cpp) here extendedGaugeResident = createExtendedGauge(*gaugePrecise, R, profileGauge, false, recon); } - - popProfile(); } void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param) { - pushProfile(profileGauge); + auto profile = pushProfile(profileGauge); if (param->location != QUDA_CPU_FIELD_LOCATION) errorQuda("Non-cpu output location not yet supported"); @@ -785,8 +779,6 @@ void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param) cpuGauge.copy(*cudaGauge); if (param->type == QUDA_SMEARED_LINKS) { delete cudaGauge; } - - popProfile(); } void loadSloppyCloverQuda(const QudaPrecision prec[]); @@ -794,7 +786,7 @@ void freeSloppyCloverQuda(); void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) { - pushProfile(profileClover); + auto profile = pushProfile(profileClover); pushVerbosity(inv_param->verbosity); checkCloverParam(inv_param); @@ -916,7 +908,6 @@ void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) loadSloppyCloverQuda(prec); popVerbosity(); - popProfile(); } void freeSloppyCloverQuda(); @@ -1333,46 +1324,47 @@ void flushChronoQuda(int i) void endQuda(void) { - pushProfile(profileEnd); - if (!initialized) return; - freeGaugeQuda(); - freeCloverQuda(); + { + auto profile = pushProfile(profileEnd); - for (int i = 0; i < QUDA_MAX_CHRONO; i++) flushChronoQuda(i); + freeGaugeQuda(); + freeCloverQuda(); - solutionResident.clear(); + for (int i = 0; i < QUDA_MAX_CHRONO; i++) flushChronoQuda(i); - LatticeField::freeGhostBuffer(); - ColorSpinorField::freeGhostBuffer(); - FieldTmp::destroy(); + solutionResident.clear(); - blas_lapack::generic::destroy(); - blas_lapack::native::destroy(); - reducer::destroy(); + LatticeField::freeGhostBuffer(); + ColorSpinorField::freeGhostBuffer(); + FieldTmp::destroy(); - pool::flush_pinned(); - 
pool::flush_device(); + blas_lapack::generic::destroy(); + blas_lapack::native::destroy(); + reducer::destroy(); - host_free(num_failures_h); - num_failures_h = nullptr; - num_failures_d = nullptr; + pool::flush_pinned(); + pool::flush_device(); - destroyDslashEvents(); + host_free(num_failures_h); + num_failures_h = nullptr; + num_failures_d = nullptr; - saveTuneCache(); - saveProfile(); + destroyDslashEvents(); - // flush any outstanding force monitoring (if enabled) - flushForceMonitor(); + saveTuneCache(); + saveProfile(); - initialized = false; + // flush any outstanding force monitoring (if enabled) + flushForceMonitor(); - comm_finalize(); - comms_initialized = false; + initialized = false; + + comm_finalize(); + comms_initialized = false; + } - popProfile(); profileInit2End.TPSTOP(QUDA_PROFILE_TOTAL); // print out the profile information of the lifetime of the library @@ -1780,7 +1772,7 @@ namespace quda { void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity) { - pushProfile(profileDslash); + auto profile = pushProfile(profileDslash); const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise; @@ -1849,7 +1841,6 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity delete dirac; // clean up popVerbosity(); - popProfile(); } void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) @@ -2149,7 +2140,7 @@ void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam *eig_param) { if (!initialized) errorQuda("QUDA not initialized"); - pushProfile(profileEigensolve); + auto profile = pushProfile(profileEigensolve); // Transfer the inv param structure contained in eig_param. // This will define the operator to be eigensolved. 
@@ -2317,8 +2308,6 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam // cache is written out even if a long benchmarking job gets interrupted saveTuneCache(); - - popProfile(); } multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &profile) @@ -2405,7 +2394,7 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &pr void* newMultigridQuda(QudaMultigridParam *mg_param) { profilerStart(__func__); - pushProfile(profileInvert); + auto profile = pushProfile(profileInvert); pushVerbosity(mg_param->invert_param->verbosity); auto *mg = new multigrid_solver(*mg_param, profileInvert); @@ -2413,7 +2402,6 @@ void* newMultigridQuda(QudaMultigridParam *mg_param) { saveTuneCache(); popVerbosity(); - popProfile(); profilerStop(__func__); return static_cast(mg); } @@ -2425,7 +2413,7 @@ void destroyMultigridQuda(void *mg) { void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param) { profilerStart(__func__); - pushProfile(profileInvert); + auto profile = pushProfile(profileInvert); pushVerbosity(mg_param->invert_param->verbosity); profileInvert.TPSTART(QUDA_PROFILE_PREAMBLE); @@ -2531,14 +2519,13 @@ void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param) profileInvert.TPSTOP(QUDA_PROFILE_PREAMBLE); popVerbosity(); - popProfile(); profilerStop(__func__); } void dumpMultigridQuda(void *mg_, QudaMultigridParam *mg_param) { profilerStart(__func__); - pushProfile(profileInvert); + auto profile = pushProfile(profileInvert); pushVerbosity(mg_param->invert_param->verbosity); auto *mg = static_cast(mg_); @@ -2548,7 +2535,6 @@ void dumpMultigridQuda(void *mg_, QudaMultigridParam *mg_param) mg->mg->dumpNullVectors(); popVerbosity(); - popProfile(); profilerStop(__func__); } @@ -2615,9 +2601,8 @@ deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile) } void* newDeflationQuda(QudaEigParam *eig_param) { - pushProfile(profileInvert); + auto profile = 
pushProfile(profileInvert); auto *defl = new deflated_solver(*eig_param, profileInvert); - popProfile(); saveProfile(__func__); flushProfile(); return static_cast(defl); @@ -2629,7 +2614,7 @@ void destroyDeflationQuda(void *df) { void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) { - pushProfile(profileInvert); + auto profile = pushProfile(profileInvert); profilerStart(__func__); if (!initialized) errorQuda("QUDA not initialized"); @@ -2978,7 +2963,6 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) profilerStop(__func__); popVerbosity(); - popProfile(); } void loadFatLongGaugeQuda(QudaInvertParam *inv_param, QudaGaugeParam *gauge_param, void *milc_fatlinks, @@ -3047,7 +3031,7 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col */ profilerStart(__func__); - pushProfile(profileInvertMultiSrc); + auto profile = pushProfile(profileInvertMultiSrc); CommKey split_key = {param->split_grid[0], param->split_grid[1], param->split_grid[2], param->split_grid[3]}; int num_sub_partition = quda::product(split_key); @@ -3321,7 +3305,6 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col } } - popProfile(); profilerStop(__func__); } @@ -3381,7 +3364,7 @@ void dslashMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param */ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) { - pushProfile(profileMulti); + auto profile = pushProfile(profileMulti); profilerStart(__func__); if (!initialized) errorQuda("QUDA not initialized"); @@ -3710,12 +3693,11 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) profilerStop(__func__); popVerbosity(); - popProfile(); } void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, QudaGaugeParam *param) { - pushProfile(profileFatLink); + auto profile = pushProfile(profileFatLink); checkGaugeParam(param); GaugeFieldParam gParam(*param, fatlink, 
QUDA_GENERAL_LINKS); @@ -3778,12 +3760,11 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, } delete cudaInLinkEx; - popProfile(); } void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param) { - pushProfile(profileGaussianSmear); + auto profile = pushProfile(profileGaussianSmear); checkGaugeParam(param); GaugeFieldParam gParam(*param, inlink, QUDA_ASQTAD_LONG_LINKS); @@ -3829,14 +3810,12 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param) freeUniqueGaugeQuda(QUDA_SMEARED_LINKS); delete cudaInLinkEx; - - popProfile(); } int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int* path_length, double* loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam* qudaGaugeParam) { - pushProfile(profileGaugeForce); + auto profile = pushProfile(profileGaugeForce); checkGaugeParam(qudaGaugeParam); GaugeFieldParam gParam(*qudaGaugeParam, siteLink); @@ -3914,14 +3893,13 @@ int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int delete cudaGauge; } - popProfile(); return 0; } int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int *path_length, double *loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam *qudaGaugeParam) { - pushProfile(profileGaugePath); + auto profile = pushProfile(profileGaugePath); checkGaugeParam(qudaGaugeParam); GaugeFieldParam gParam(*qudaGaugeParam, siteLink); @@ -3977,13 +3955,12 @@ int computeGaugePathQuda(void *out, void *siteLink, int ***input_path_buf, int * delete cudaGauge; } - popProfile(); return 0; } void momResidentQuda(void *mom, QudaGaugeParam *param) { - pushProfile(profileGaugeForce); + auto profile = pushProfile(profileGaugeForce); checkGaugeParam(param); GaugeFieldParam gParamMom(*param, mom, QUDA_ASQTAD_MOM_LINKS); @@ -4014,13 +3991,11 @@ void momResidentQuda(void *mom, QudaGaugeParam *param) cpuMom.copy(momResident); momResident = GaugeField(); } - - 
popProfile(); } void createCloverQuda(QudaInvertParam* invertParam) { - pushProfile(profileClover); + auto profile = pushProfile(profileClover); if (!cloverPrecise) errorQuda("Clover field not allocated"); QudaReconstructType recon = (gaugePrecise->Reconstruct() == QUDA_RECONSTRUCT_8) ? QUDA_RECONSTRUCT_12 : gaugePrecise->Reconstruct(); @@ -4052,7 +4027,6 @@ void createCloverQuda(QudaInvertParam* invertParam) // FIXME always preserve the extended gauge extendedGaugeResident = gauge; - popProfile(); } void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param) @@ -4097,7 +4071,7 @@ void destroyGaugeFieldQuda(void *gauge) void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, void **, QudaGaugeParam *gauge_param, QudaInvertParam *inv_param) { - pushProfile(profileStaggeredForce); + auto profile = pushProfile(profileStaggeredForce); GaugeFieldParam gParam(*gauge_param, h_mom, QUDA_ASQTAD_MOM_LINKS); @@ -4209,8 +4183,6 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi else momResident = GaugeField(); for (int i=0; iuse_resident_solution) solutionResident.clear(); #endif delete dirac; - - popProfile(); } void updateGaugeFieldQuda(void* gauge, void* momentum, double dt, int conj_mom, int exact, QudaGaugeParam* param) { - pushProfile(profileGaugeUpdate); + auto profile = pushProfile(profileGaugeUpdate); checkGaugeParam(param); // create the host fields @@ -4723,13 +4691,11 @@ void updateGaugeFieldQuda(void* gauge, void* momentum, double dt, int conj_mom, if (param->make_resident_mom && !param->use_resident_mom) std::exchange(momResident, cudaMom); else momResident = GaugeField(); - - popProfile(); } void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param) { - pushProfile(profileProject); + auto profile = pushProfile(profileProject); checkGaugeParam(param); // create the gauge field @@ -4763,13 +4729,11 @@ void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param) gaugePrecise 
= new GaugeField(); std::exchange(*gaugePrecise, cudaGauge); } - - popProfile(); } void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param) { - pushProfile(profilePhase); + auto profile = pushProfile(profilePhase); checkGaugeParam(param); // create the gauge field @@ -4802,14 +4766,12 @@ void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param) gaugePrecise = new GaugeField(); std::exchange(*gaugePrecise, cudaGauge); } - - popProfile(); } // evaluate the momentum action double momActionQuda(void* momentum, QudaGaugeParam* param) { - pushProfile(profileMomAction); + auto profile = pushProfile(profileMomAction); checkGaugeParam(param); // create the momentum fields @@ -4834,13 +4796,12 @@ double momActionQuda(void* momentum, QudaGaugeParam* param) std::exchange(momResident, cudaMom); else momResident = GaugeField(); - popProfile(); return action; } void gaussGaugeQuda(unsigned long long seed, double sigma) { - pushProfile(profileGauss); + auto profile = pushProfile(profileGauss); if (!gaugePrecise) errorQuda("Cannot generate Gauss GaugeField as there is no resident gauge field"); quda::gaugeGauss(*gaugePrecise, seed, sigma); @@ -4849,16 +4810,13 @@ void gaussGaugeQuda(unsigned long long seed, double sigma) extendedGaugeResident->copy(*gaugePrecise); extendedGaugeResident->exchangeExtendedGhost(R, profileGauss, redundant_comms); } - - popProfile(); } void gaussMomQuda(unsigned long long seed, double sigma) { - pushProfile(profileGauss); + auto profile = pushProfile(profileGauss); if (!momResident.Volume()) errorQuda("Cannot generate Gauss GaugeField as there is no resident momentum field"); quda::gaugeGauss(momResident, seed, sigma); - popProfile(); } /* @@ -4866,7 +4824,7 @@ void gaussMomQuda(unsigned long long seed, double sigma) */ void plaqQuda(double plaq[3]) { - pushProfile(profilePlaq); + auto profile = pushProfile(profilePlaq); if (!gaugePrecise) errorQuda("Cannot compute plaquette as there is no resident gauge field"); @@ -4877,8 +4835,6 @@ void 
plaqQuda(double plaq[3]) plaq[0] = plaq3.x; plaq[1] = plaq3.y; plaq[2] = plaq3.z; - - popProfile(); } /* @@ -4932,7 +4888,7 @@ void copyExtendedResidentGaugeQuda(void *resident_gauge) void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, unsigned int n_steps, double alpha) { - pushProfile(profileWuppertal); + auto profile = pushProfile(profileWuppertal); pushVerbosity(inv_param->verbosity); if (gaugePrecise == nullptr) errorQuda("Gauge field must be loaded"); @@ -4992,14 +4948,13 @@ void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, if (gaugeSmeared != nullptr) delete precise; popVerbosity(); - popProfile(); } void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_param) { if (smear_param->n_steps == 0) return; - pushProfile(profileGaussianSmear); + auto profile = pushProfile(profileGaussianSmear); QudaInvertParam *inv_param = smear_param->inv_param; @@ -5121,13 +5076,12 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par if (smear_param->delete_2link != 0) { freeUniqueGaugeQuda(QUDA_SMEARED_LINKS); } saveTuneCache(); - popProfile(); } void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam *obs_param) { - pushProfile(profileGaugeSmear); + auto profile = pushProfile(profileGaugeSmear); pushOutputPrefix("performGaugeSmearQuda: "); checkGaugeSmearParam(smear_param); @@ -5161,12 +5115,11 @@ void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservable } popOutputPrefix(); - popProfile(); } void performWFlowQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam *obs_param) { - pushProfile(profileWFlow); + auto profile = pushProfile(profileWFlow); pushOutputPrefix("performWFlowQuda: "); checkGaugeSmearParam(smear_param); @@ -5210,14 +5163,13 @@ void performWFlowQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam } popOutputPrefix(); - popProfile(); } int computeGaugeFixingOVRQuda(void *gauge, 
const unsigned int gauge_dir, const unsigned int Nsteps, const unsigned int verbose_interval, const double relax_boost, const double tolerance, const unsigned int reunit_interval, const unsigned int stopWtheta, QudaGaugeParam *param) { - pushProfile(GaugeFixOVRQuda); + auto profile = pushProfile(GaugeFixOVRQuda); checkGaugeParam(param); GaugeFieldParam gParam(*param, gauge); @@ -5254,7 +5206,6 @@ int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const u delete cudaInGaugeEx; } - popProfile(); return 0; } @@ -5262,7 +5213,7 @@ int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const u const unsigned int verbose_interval, const double alpha, const unsigned int autotune, const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param) { - pushProfile(GaugeFixFFTQuda); + auto profile = pushProfile(GaugeFixFFTQuda); checkGaugeParam(param); GaugeFieldParam gParam(*param, gauge); @@ -5290,14 +5241,13 @@ int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const u std::exchange(*gaugePrecise, cudaInGauge); } - popProfile(); return 0; } void contractQuda(const void *hp_x, const void *hp_y, void *h_result, const QudaContractType cType, QudaInvertParam *param, const int *X) { - pushProfile(profileContract); + auto profile = pushProfile(profileContract); // DMH: Easiest way to construct ColorSpinorField? Do we require the user // to declare and fill and invert_param, or can it just be hacked?. 
@@ -5334,12 +5284,11 @@ void contractQuda(const void *hp_x, const void *hp_y, void *h_result, const Quda profileContract.TPSTOP(QUDA_PROFILE_D2H); pool_device_free(d_result); - popProfile(); } void gaugeObservablesQuda(QudaGaugeObservableParam *param) { - pushProfile(profileGaugeObs); + auto profile = pushProfile(profileGaugeObs); checkGaugeObservableParam(param); if (!gaugePrecise) errorQuda("Cannot compute Polyakov loop as there is no resident gauge field"); @@ -5361,5 +5310,4 @@ void gaugeObservablesQuda(QudaGaugeObservableParam *param) } gaugeObservables(*gauge, *param); - popProfile(); } diff --git a/lib/timer.cpp b/lib/timer.cpp index 2214ebd0ec..986d7b045f 100644 --- a/lib/timer.cpp +++ b/lib/timer.cpp @@ -202,16 +202,17 @@ namespace quda { static std::stack tpstack; - void pushProfile(TimeProfile &profile) + pushProfile::pushProfile(TimeProfile &profile) : profile(profile) { profile.TPSTART(QUDA_PROFILE_TOTAL); tpstack.push(&profile); } - void popProfile() + pushProfile::~pushProfile() { if (tpstack.empty()) errorQuda("popProfile() called with empty stack"); auto &profile = *(tpstack.top()); + if (&(this->profile) != &profile) errorQuda("Popped profile is not the expected one"); tpstack.pop(); profile.TPSTOP(QUDA_PROFILE_TOTAL); } From 426b59a579dd093e76940ecc539f17a3f927b9aa Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 31 Aug 2023 14:38:54 -0700 Subject: [PATCH 38/99] Respond to review comments --- include/gauge_field.h | 6 ++++++ lib/interface_quda.cpp | 21 +++++++++++---------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/include/gauge_field.h b/include/gauge_field.h index bf75bc6bfa..297065842f 100644 --- a/include/gauge_field.h +++ b/include/gauge_field.h @@ -288,6 +288,12 @@ namespace quda { */ GaugeField &operator=(GaugeField &&field); + /** + @brief Returns whether the object is empty (not initialized) + @return true if the object has not been allocated, otherwise false + */ + bool empty() const { return !init; } + /** @brief
Create the communication handlers and buffers @param[in] R The thickness of the extended region in each dimension diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 0dbe006026..48f342a31b 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -3835,7 +3835,7 @@ int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int GaugeField cpuMom = !qudaGaugeParam->use_resident_mom ? GaugeField(gParamMom) : GaugeField(); - if (qudaGaugeParam->use_resident_mom && !momResident.Volume()) errorQuda("No resident momentum field to use"); + if (qudaGaugeParam->use_resident_mom && momResident.empty()) errorQuda("No resident momentum field to use"); gParamMom.location = QUDA_CUDA_FIELD_LOCATION; gParamMom.create = qudaGaugeParam->overwrite_mom ? QUDA_ZERO_FIELD_CREATE : QUDA_COPY_FIELD_CREATE; gParamMom.field = &cpuMom; @@ -4017,7 +4017,7 @@ void createCloverQuda(QudaInvertParam* invertParam) GaugeFieldParam tensorParam(gaugePrecise->X(), ex->Precision(), QUDA_RECONSTRUCT_NO, 0, QUDA_TENSOR_GEOMETRY); tensorParam.location = QUDA_CUDA_FIELD_LOCATION; tensorParam.siteSubset = QUDA_FULL_SITE_SUBSET; - tensorParam.order = QUDA_FLOAT2_GAUGE_ORDER; + tensorParam.setPrecision(tensorParam.Precision(), true); tensorParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO; GaugeField Fmunu(tensorParam); computeFmunu(Fmunu, *ex); @@ -4039,7 +4039,7 @@ void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param) GaugeField *cpuGauge = nullptr; if (gauge) cpuGauge = new GaugeField(gParam); - gParam.order = QUDA_FLOAT2_GAUGE_ORDER; + gParam.setPrecision(gParam.Precision(), true); gParam.create = QUDA_ZERO_FIELD_CREATE; auto* cudaGauge = new GaugeField(gParam); @@ -4087,8 +4087,8 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi gParam.link_type = QUDA_ASQTAD_MOM_LINKS; gParam.create = QUDA_COPY_FIELD_CREATE; gParam.field = &cpuMom; - gParam.order = QUDA_FLOAT2_GAUGE_ORDER; gParam.reconstruct = 
QUDA_RECONSTRUCT_10; + gParam.setPrecision(gParam.Precision(), true); GaugeField cudaMom = gauge_param->use_resident_mom ? momResident.create_alias() : GaugeField(gParam); // create temporary field for quark-field outer product @@ -4355,7 +4355,7 @@ void computeHISQForceQuda(void* const milc_momentum, param.location = QUDA_CUDA_FIELD_LOCATION; param.create = QUDA_ZERO_FIELD_CREATE; - param.order = QUDA_FLOAT2_GAUGE_ORDER; + param.setPrecision(param.Precision(), true); GaugeFieldParam momParam(param); // Create CPU W, V, and U fields @@ -4504,14 +4504,14 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double // create the device momentum field fParam.location = QUDA_CUDA_FIELD_LOCATION; fParam.create = QUDA_ZERO_FIELD_CREATE; - fParam.order = QUDA_FLOAT2_GAUGE_ORDER; + fParam.setPrecision(fParam.Precision(), true); GaugeField cudaMom(fParam); // create the device force field fParam.link_type = QUDA_GENERAL_LINKS; fParam.create = QUDA_ZERO_FIELD_CREATE; - fParam.order = QUDA_FLOAT2_GAUGE_ORDER; fParam.reconstruct = QUDA_RECONSTRUCT_NO; + fParam.setPrecision(fParam.Precision(), true); GaugeField cudaForce(fParam); ColorSpinorParam qParam; @@ -4662,9 +4662,9 @@ void updateGaugeFieldQuda(void* gauge, void* momentum, double dt, int conj_mom, gParam.location = QUDA_CUDA_FIELD_LOCATION; gParam.create = QUDA_COPY_FIELD_CREATE; gParam.field = &cpuMom; - gParam.order = QUDA_FLOAT2_GAUGE_ORDER; gParam.link_type = QUDA_ASQTAD_MOM_LINKS; gParam.reconstruct = QUDA_RECONSTRUCT_10; + gParam.setPrecision(gParam.Precision(), true); gParam.ghostExchange = QUDA_GHOST_EXCHANGE_NO; gParam.pad = 0; GaugeField cudaMom = param->use_resident_mom ? 
momResident.create_alias() : GaugeField(gParam); @@ -4672,6 +4672,7 @@ void updateGaugeFieldQuda(void* gauge, void* momentum, double dt, int conj_mom, if (param->use_resident_gauge && !gaugePrecise) errorQuda("No resident gauge field allocated"); gParam.link_type = QUDA_SU3_LINKS; gParam.reconstruct = param->reconstruct; + gParam.setPrecision(gParam.Precision(), true); gParam.field = &cpuGauge; GaugeField u_in = param->use_resident_gauge ? gaugePrecise->create_alias() : GaugeField(gParam); gParam.create = QUDA_NULL_FIELD_CREATE; @@ -4709,8 +4710,8 @@ void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param) gParam.location = QUDA_CUDA_FIELD_LOCATION; gParam.create = QUDA_COPY_FIELD_CREATE; gParam.field = &cpuGauge; - gParam.order = QUDA_FLOAT2_GAUGE_ORDER; gParam.reconstruct = param->reconstruct; + gParam.setPrecision(gParam.Precision(), true); GaugeField cudaGauge = param->use_resident_gauge ? gaugePrecise->create_alias() : GaugeField(gParam); *num_failures_h = 0; @@ -4747,8 +4748,8 @@ void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param) gParam.location = QUDA_CUDA_FIELD_LOCATION; gParam.create = QUDA_COPY_FIELD_CREATE; gParam.field = &cpuGauge; - gParam.order = QUDA_FLOAT2_GAUGE_ORDER; gParam.reconstruct = param->reconstruct; + gParam.setPrecision(gParam.Precision(), true); GaugeField cudaGauge = param->use_resident_gauge ? 
gaugePrecise->create_alias() : GaugeField(gParam); *num_failures_h = 0; From 63e474d18a5985bcc393942149688b5192e85942 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Mon, 11 Sep 2023 14:30:37 -0700 Subject: [PATCH 39/99] Fix some overflow issues with large volumes --- include/gauge_field_order.h | 38 ++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index 17eb70b42c..c50f216e5e 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -690,7 +690,7 @@ namespace quda { struct GhostAccessor { using wrapper = fieldorder_wrapper; complex *ghost[8] = {}; - const int volumeCB; + const unsigned int volumeCB; unsigned int ghostVolumeCB[8] = {}; Float scale = static_cast(1.0); Float scale_inv = static_cast(1.0); @@ -751,7 +751,7 @@ namespace quda { using wrapper = fieldorder_wrapper; /** An internal reference to the actual field we are accessing */ - const int volumeCB; + const unsigned int volumeCB; const int nDim; const int_fastdiv geometry; const QudaFieldLocation location; @@ -870,10 +870,10 @@ namespace quda { __device__ __host__ inline int Ncolor() const { return nColor; } /** Returns the field volume */ - __device__ __host__ inline int Volume() const { return 2*volumeCB; } + __device__ __host__ inline auto Volume() const { return 2*volumeCB; } /** Returns the field volume */ - __device__ __host__ inline int VolumeCB() const { return volumeCB; } + __device__ __host__ inline auto VolumeCB() const { return volumeCB; } /** Returns the field geometric dimension */ __device__ __host__ inline int Ndim() const { return nDim; } @@ -1526,7 +1526,7 @@ namespace quda { int coords[QUDA_MAX_DIM]; int_fastdiv X[QUDA_MAX_DIM]; int R[QUDA_MAX_DIM]; - const int volumeCB; + const unsigned int volumeCB; int faceVolumeCB[4]; const int stride; const int geometry; @@ -1773,7 +1773,7 @@ namespace quda { using complex = complex; Float *ghost[QUDA_MAX_DIM] = {}; 
int faceVolumeCB[QUDA_MAX_DIM] = {}; - const int volumeCB; + const unsigned int volumeCB; const int stride; const int geometry; const int hasPhase; @@ -1846,7 +1846,7 @@ namespace quda { using real = typename mapper::type; using complex = complex; Float *gauge[QUDA_MAX_DIM]; - const int volumeCB; + const unsigned int volumeCB; QDPOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) : LegacyOrder(u, ghost_), volumeCB(u.VolumeCB()) { @@ -1892,7 +1892,7 @@ namespace quda { using real = typename mapper::type; using complex = complex; Float *gauge[QUDA_MAX_DIM]; - const int volumeCB; + const unsigned int volumeCB; QDPJITOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) : LegacyOrder(u, ghost_), volumeCB(u.VolumeCB()) { @@ -1942,7 +1942,7 @@ namespace quda { using real = typename mapper::type; using complex = complex; Float *gauge; - const int volumeCB; + const unsigned int volumeCB; const int geometry; MILCOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : LegacyOrder(u, ghost_), @@ -1953,10 +1953,10 @@ namespace quda { ; } - __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const - { - auto in = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; - block_load(v, reinterpret_cast(in)); + __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const + { + auto in = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; + block_load(v, reinterpret_cast(in)); } __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity) const @@ -2003,7 +2003,7 @@ namespace quda { using real = typename mapper::type; using complex = complex; Float *gauge; - const int volumeCB; + const unsigned int volumeCB; const int geometry; const size_t offset; const size_t size; @@ -2062,7 +2062,7 @@ namespace quda { using real = typename mapper::type; using complex = complex; Float *gauge; - const int volumeCB; + const 
unsigned int volumeCB; const real anisotropy; const real anisotropy_inv; static constexpr int Nc = 3; @@ -2131,8 +2131,8 @@ namespace quda { using real = typename mapper::type; using complex = complex; Float *gauge; - const int volumeCB; - int exVolumeCB; // extended checkerboard volume + const unsigned int volumeCB; + unsigned int exVolumeCB; // extended checkerboard volume static constexpr int Nc = 3; BQCDOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : LegacyOrder(u, ghost_), gauge(gauge_ ? gauge_ : u.data()), volumeCB(u.VolumeCB()) @@ -2193,7 +2193,7 @@ namespace quda { using real = typename mapper::type; using complex = complex; Float *gauge; - const int volumeCB; + const unsigned int volumeCB; static constexpr int Nc = 3; const real scale; const real scale_inv; @@ -2257,7 +2257,7 @@ namespace quda { using real = typename mapper::type; using complex = complex; Float *gauge; - const int volumeCB; + const unsigned int volumeCB; int exVolumeCB; static constexpr int Nc = 3; const real scale; From 56a719d7d03d54df18bacc4230358988893733b6 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 12 Sep 2023 10:27:22 -0700 Subject: [PATCH 40/99] Fix some overflow issues with tests --- tests/host_reference/gauge_force_reference.cpp | 2 +- tests/host_reference/hisq_force_reference.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/host_reference/gauge_force_reference.cpp b/tests/host_reference/gauge_force_reference.cpp index eb18f10568..a575532731 100644 --- a/tests/host_reference/gauge_force_reference.cpp +++ b/tests/host_reference/gauge_force_reference.cpp @@ -446,7 +446,7 @@ void gauge_force_reference_dir(void *refMom, int dir, double eb3, void *const *s QudaPrecision prec, int **path_dir, int *length, void *loop_coeff, int num_paths, const lattice_t &lat, bool compute_force) { - size_t size = V * 2 * lat.n_color * lat.n_color * prec; + size_t size = size_t(V) * 2 * lat.n_color * lat.n_color * prec; void *staple = 
safe_malloc(size); memset(staple, 0, size); diff --git a/tests/host_reference/hisq_force_reference.cpp b/tests/host_reference/hisq_force_reference.cpp index d0cfc197a2..9cd4ee4d9c 100644 --- a/tests/host_reference/hisq_force_reference.cpp +++ b/tests/host_reference/hisq_force_reference.cpp @@ -1205,9 +1205,9 @@ void hisqStaplesForceCPU(const double *path_coeff, quda::GaugeField &oprod, quda QudaPrecision precision = oprod.Precision(); #ifdef MULTI_GPU - int len = Vh_ex * 2; + uint64_t len = Vh_ex * 2; #else - int len = 1; + uint64_t len = 1; for (int dir = 0; dir < 4; ++dir) len *= X_[dir]; #endif // allocate memory for temporary fields From f14d7ffbee054bb182463273f56f0b8bbf2be62e Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 12 Sep 2023 11:21:02 -0700 Subject: [PATCH 41/99] Minor cleanup of heatbath_test and fix an issue found in testing with saveGaugeQuda --- lib/interface_quda.cpp | 2 +- tests/heatbath_test.cpp | 117 ++++++++++++++++++++-------------------- 2 files changed, 59 insertions(+), 60 deletions(-) diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 48f342a31b..1833a0f766 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -4055,7 +4055,7 @@ void saveGaugeFieldQuda(void *gauge, void *inGauge, QudaGaugeParam *param) { auto* cudaGauge = reinterpret_cast(inGauge); - GaugeFieldParam gParam(*param, gauge, QUDA_GENERAL_LINKS); + GaugeFieldParam gParam(*param, gauge); gParam.geometry = cudaGauge->Geometry(); GaugeField cpuGauge(gParam); diff --git a/tests/heatbath_test.cpp b/tests/heatbath_test.cpp index 4ad648958b..557914b772 100644 --- a/tests/heatbath_test.cpp +++ b/tests/heatbath_test.cpp @@ -53,33 +53,9 @@ void display_test_info() dimPartitioned(3)); } -int main(int argc, char **argv) +void heatbath_test(int argc, char **argv) { - // command line options - auto app = make_app(); - add_heatbath_option_group(app); - try { - app->parse(argc, argv); - } catch (const CLI::ParseError &e) { - return app->exit(e); - } - 
- if (prec_sloppy == QUDA_INVALID_PRECISION) prec_sloppy = prec; - if (link_recon_sloppy == QUDA_RECONSTRUCT_INVALID) link_recon_sloppy = link_recon; - - // initialize QMP/MPI, QUDA comms grid and RNG (host_utils.cpp) - initComms(argc, argv, gridsize_from_cmdline); - - // call srand() with a rank-dependent seed - initRand(); - - display_test_info(); - - // initialize the QUDA library - initQuda(device_ordinal); - // *** QUDA parameters begin here. - QudaGaugeParam gauge_param = newQudaGaugeParam(); setWilsonGaugeParam(gauge_param); gauge_param.t_boundary = QUDA_PERIODIC_T; @@ -91,12 +67,17 @@ int main(int argc, char **argv) // Allocate space on the host (always best to allocate and free in the same scope) for (int dir = 0; dir < 4; dir++) { load_gauge[dir] = safe_malloc(V * gauge_site_size * gauge_param.cpu_prec); } constructHostGaugeField(load_gauge, gauge_param, argc, argv); + + if (prec_sloppy == QUDA_INVALID_PRECISION) prec_sloppy = prec; + if (link_recon_sloppy == QUDA_RECONSTRUCT_INVALID) link_recon_sloppy = link_recon; + // Load the gauge field to the device loadGaugeQuda((void *)load_gauge, &gauge_param); - int *num_failures_h = (int *)mapped_malloc(sizeof(int)); - int *num_failures_d = (int *)get_mapped_device_pointer(num_failures_h); - *num_failures_h = 0; + quda::quda_ptr num_failures(QUDA_MEMORY_MAPPED, sizeof(int), false); + int &num_failures_h = *static_cast(num_failures.data_host()); + int &num_failures_d = *static_cast(num_failures.data_device()); + num_failures_h = 0; // start the timer double time0 = -((double)clock()); @@ -110,7 +91,7 @@ int main(int argc, char **argv) gParam.link_type = gauge_param.type; gParam.reconstruct = gauge_param.reconstruct; gParam.setPrecision(gParam.Precision(), true); - GaugeField *gauge = new GaugeField(gParam); + GaugeField gauge(gParam); int pad = 0; lat_dim_t y; @@ -126,9 +107,9 @@ int main(int argc, char **argv) gParamEx.t_boundary = gParam.t_boundary; gParamEx.nFace = 1; for (int dir = 0; dir < 4; ++dir) 
gParamEx.r[dir] = R[dir]; - GaugeField *gaugeEx = new GaugeField(gParamEx); + GaugeField gaugeEx(gParamEx); // CURAND random generator initialization - RNG *randstates = new RNG(*gauge, 1234); + RNG randstates(gauge, 1234); int nsteps = heatbath_num_steps; int nwarm = heatbath_warmup_steps; @@ -145,21 +126,21 @@ int main(int argc, char **argv) if (latfile.size() > 0) { // We loaded in a gauge field // copy internal extended field to gaugeEx - copyExtendedResidentGaugeQuda((void *)gaugeEx); + copyExtendedResidentGaugeQuda(&gaugeEx); } else { if (coldstart) - InitGaugeField(*gaugeEx); + InitGaugeField(gaugeEx); else - InitGaugeField(*gaugeEx, *randstates); + InitGaugeField(gaugeEx, randstates); // copy into regular field - copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION); + copyExtendedGauge(gauge, gaugeEx, QUDA_CUDA_FIELD_LOCATION); // load the gauge field from gauge - gauge_param.gauge_order = gauge->Order(); + gauge_param.gauge_order = gauge.Order(); gauge_param.location = QUDA_CUDA_FIELD_LOCATION; - loadGaugeQuda(gauge->data(), &gauge_param); + loadGaugeQuda(gauge.data(), &gauge_param); } QudaGaugeObservableParam param = newQudaGaugeObservableParam(); @@ -175,37 +156,37 @@ int main(int argc, char **argv) // Do a warmup if requested if (nwarm > 0) { for (int step = 1; step <= nwarm; ++step) { - Monte(*gaugeEx, *randstates, beta_value, nhbsteps, novrsteps); + Monte(gaugeEx, randstates, beta_value, nhbsteps, novrsteps); - quda::unitarizeLinks(*gaugeEx, num_failures_d); - if (*num_failures_h > 0) errorQuda("Error in the unitarization\n"); + quda::unitarizeLinks(gaugeEx, &num_failures_d); + if (num_failures_h > 0) errorQuda("Error in the unitarization\n"); } } // copy into regular field - copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION); + copyExtendedGauge(gauge, gaugeEx, QUDA_CUDA_FIELD_LOCATION); // load the gauge field from gauge - gauge_param.gauge_order = gauge->Order(); + gauge_param.gauge_order = gauge.Order(); gauge_param.location = 
QUDA_CUDA_FIELD_LOCATION; - loadGaugeQuda(gauge->data(), &gauge_param); + loadGaugeQuda(gauge.data(), &gauge_param); gaugeObservablesQuda(¶m); printfQuda("step=0 plaquette = %e topological charge = %e\n", param.plaquette[0], param.qcharge); freeGaugeQuda(); for (int step = 1; step <= nsteps; ++step) { - Monte(*gaugeEx, *randstates, beta_value, nhbsteps, novrsteps); + Monte(gaugeEx, randstates, beta_value, nhbsteps, novrsteps); // Reunitarize gauge links... - quda::unitarizeLinks(*gaugeEx, num_failures_d); - if (*num_failures_h > 0) errorQuda("Error in the unitarization\n"); + quda::unitarizeLinks(gaugeEx, &num_failures_d); + if (num_failures_h > 0) errorQuda("Error in the unitarization\n"); // copy into regular field - copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION); + copyExtendedGauge(gauge, gaugeEx, QUDA_CUDA_FIELD_LOCATION); - loadGaugeQuda(gauge->data(), &gauge_param); + loadGaugeQuda(gauge.data(), &gauge_param); gaugeObservablesQuda(¶m); printfQuda("step=%d plaquette = %e topological charge = %e\n", step, param.plaquette[0], param.qcharge); @@ -219,14 +200,15 @@ int main(int argc, char **argv) QudaGaugeParam gauge_param = newQudaGaugeParam(); setWilsonGaugeParam(gauge_param); + gauge_param.t_boundary = gauge.TBoundary(); void *cpu_gauge[4]; for (int dir = 0; dir < 4; dir++) { cpu_gauge[dir] = safe_malloc(V * gauge_site_size * gauge_param.cpu_prec); } // copy into regular field - copyExtendedGauge(*gauge, *gaugeEx, QUDA_CUDA_FIELD_LOCATION); + copyExtendedGauge(gauge, gaugeEx, QUDA_CUDA_FIELD_LOCATION); - saveGaugeFieldQuda((void *)cpu_gauge, (void *)gauge, &gauge_param); + saveGaugeFieldQuda((void *)cpu_gauge, &gauge, &gauge_param); write_gauge_field(gauge_outfile.c_str(), cpu_gauge, gauge_param.cpu_prec, gauge_param.X, 0, (char **)0); @@ -235,27 +217,44 @@ int main(int argc, char **argv) printfQuda("No output file specified.\n"); } - delete gauge; - delete gaugeEx; // Release all temporary memory used for data exchange between GPUs in multi-GPU 
mode PGaugeExchangeFree(); - - delete randstates; } // stop the timer time0 += clock(); time0 /= CLOCKS_PER_SEC; - // printfQuda("\nDone: %i iter / %g secs = %g Gflops, total time = %g secs\n", - // inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs, time0); printfQuda("\nDone, total time = %g secs\n", time0); - host_free(num_failures_h); - freeGaugeQuda(); - for (int dir = 0; dir < 4; dir++) host_free(load_gauge[dir]); +} + +int main(int argc, char **argv) +{ + // command line options + auto app = make_app(); + add_heatbath_option_group(app); + try { + app->parse(argc, argv); + } catch (const CLI::ParseError &e) { + return app->exit(e); + } + + // initialize QMP/MPI, QUDA comms grid and RNG (host_utils.cpp) + initComms(argc, argv, gridsize_from_cmdline); + + // call srand() with a rank-dependent seed + initRand(); + + display_test_info(); + + // initialize the QUDA library + initQuda(device_ordinal); + + // run the test + heatbath_test(argc, argv); // finalize the QUDA library endQuda(); From b19fe5443e6b9b8a2787cc684d3e65500cc19268 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 12 Sep 2023 11:24:57 -0700 Subject: [PATCH 42/99] Fix typo --- include/gauge_field.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/gauge_field.h b/include/gauge_field.h index 297065842f..54d446839d 100644 --- a/include/gauge_field.h +++ b/include/gauge_field.h @@ -53,7 +53,7 @@ namespace quda { QudaFieldCreate create = QUDA_REFERENCE_FIELD_CREATE; // used to determine the type of field created - QudaFieldGeometry geometry = QUDA_VECTOR_GEOMETRY; // whether the field is a scale, vector or tensor + QudaFieldGeometry geometry = QUDA_VECTOR_GEOMETRY; // whether the field is a scalar, vector or tensor // whether we need to compute the fat link maxima // FIXME temporary flag until we have a kernel that can do this, then we just do this in copy() From bf29f03d02a404ac384ca34eee4aa3ca570e4682 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: 
Tue, 12 Sep 2023 16:43:52 -0700 Subject: [PATCH 43/99] Fix typo --- lib/llfat_quda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/llfat_quda.cu b/lib/llfat_quda.cu index 8ac2e25d36..cd32a54e6c 100644 --- a/lib/llfat_quda.cu +++ b/lib/llfat_quda.cu @@ -186,7 +186,7 @@ namespace quda { if ( ((fat.X()[0] % 2 != 0) || (fat.X()[1] % 2 != 0) || (fat.X()[2] % 2 != 0) || (fat.X()[3] % 2 != 0)) && (u.Reconstruct() != QUDA_RECONSTRUCT_NO)){ - errorQuda("Reconstruct %d and odd dimensionsize is not supported by link fattening code (yet)\n", + errorQuda("Reconstruct %d and odd dimension size is not supported by link fattening code (yet)", u.Reconstruct()); } From dc5ec219a0b1f1502bc60ea4d0267b5177402485 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 21 Sep 2023 12:52:08 -0700 Subject: [PATCH 44/99] Updates for quda_ptr: add custom exchange function since std::exchange doesn't work; add ostream overload; add reference() query function; move assignment will now fail if destination is already allocated --- include/quda_ptr.h | 36 +++++++++++++++++++++++++++++++----- lib/quda_ptr.cpp | 30 ++++++++++++++++++++++++++---- 2 files changed, 57 insertions(+), 9 deletions(-) diff --git a/include/quda_ptr.h b/include/quda_ptr.h index 3e829f310f..185d852d57 100644 --- a/include/quda_ptr.h +++ b/include/quda_ptr.h @@ -1,5 +1,6 @@ #pragma once +#include #include "malloc_quda.h" namespace quda { @@ -18,16 +19,25 @@ namespace quda { QUDA_MEMORY_MANAGED both */ class quda_ptr { - QudaMemoryType type = QUDA_MEMORY_INVALID; - size_t size = 0; - bool pool = false; - void *device = nullptr; - void *host = nullptr; + friend std::ostream& operator<<(std::ostream& output, const quda_ptr& ptr); + QudaMemoryType type = QUDA_MEMORY_INVALID; /** Memory type of the allocation */ + size_t size = 0; /** Size of the allocation */ + bool pool = false; /** Is the allocation is pooled */ + void *device = nullptr; /** Device-view of the allocation */ + void *host = nullptr; /** 
Host-view of the allocation */ + bool reference = false; /** Is this a reference to another allocation */ + + /** + @brief Internal deallocation routine + */ + void destroy(); public: quda_ptr() = default; quda_ptr(quda_ptr &&) = default; quda_ptr &operator=(quda_ptr &&); + quda_ptr(const quda_ptr &) = delete; + quda_ptr &operator=(const quda_ptr &) = delete; /** @brief Constructor for quda_ptr @@ -49,6 +59,15 @@ namespace quda { */ virtual ~quda_ptr(); + /** + @brief Specialized exchange function to use in place of + std::exchange when exchanging quda_ptr objects: moves obj to + *this, and moves new_value to obj + @param[in,out] obj + @param[in] new_value New value for obj to take + */ + void exchange(quda_ptr &obj, quda_ptr &&new_value); + /** @return Returns true if allocation is visible to the device */ @@ -73,6 +92,13 @@ namespace quda { Return the host view of the pointer */ void *data_host() const; + + /** + Return if the instance is a reference rather than an allocation + */ + bool is_reference() const; }; + std::ostream& operator<<(std::ostream& output, const quda_ptr& ptr); + } diff --git a/lib/quda_ptr.cpp b/lib/quda_ptr.cpp index 7db16b641d..bbb8d88457 100644 --- a/lib/quda_ptr.cpp +++ b/lib/quda_ptr.cpp @@ -43,7 +43,8 @@ namespace quda { } quda_ptr::quda_ptr(void *ptr, QudaMemoryType type) : - type(type) + type(type), + reference(true) { getProfile().TPSTART(QUDA_PROFILE_INIT); switch (type) { @@ -69,6 +70,7 @@ namespace quda { quda_ptr& quda_ptr::operator=(quda_ptr &&other) { if (&other != this) { + if (size > 0) errorQuda("Cannot move to already initialized quda_ptr"); type = std::exchange(other.type, QUDA_MEMORY_INVALID); size = std::exchange(other.size, 0); pool = std::exchange(other.pool, false); @@ -78,10 +80,8 @@ namespace quda { return *this; } - quda_ptr::~quda_ptr() + void quda_ptr::destroy() { - getProfile().TPSTART(QUDA_PROFILE_FREE); - if (size > 0) { switch (type) { case QUDA_MEMORY_DEVICE: pool ? 
pool_device_free(device) : device_free(device); break; @@ -93,12 +93,25 @@ namespace quda { } } + size = 0; device = nullptr; host = nullptr; + } + quda_ptr::~quda_ptr() + { + getProfile().TPSTART(QUDA_PROFILE_FREE); + destroy(); getProfile().TPSTOP(QUDA_PROFILE_FREE); } + void quda_ptr::exchange(quda_ptr &obj, quda_ptr &&new_value) + { + destroy(); + *this = std::move(obj); + obj = std::move(new_value); + } + bool quda_ptr::is_device() const { switch (type) { @@ -155,4 +168,13 @@ namespace quda { return host; } + bool quda_ptr::is_reference() const { return reference; } + + std::ostream& operator<<(std::ostream& output, const quda_ptr& ptr) + { + output << "{type = " << ptr.type << ", size = " << ptr.size << ", pool = " << ptr.pool << ", device = " << ptr.device + << ", host = " << ptr.host << ", reference = " << ptr.reference << "}"; + return output; + } + } From 8d6871e58bb04b1e0909b0122b663dfd5a97b2b5 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 21 Sep 2023 13:07:03 -0700 Subject: [PATCH 45/99] Fix issues with move assignment with GaugeField and ColorSpinorField objects (quda_ptr should use internal exchange, not std::exchange); add ostream overloads for LatticeField and GaugeField. Fix verbosity for llfat_test --- include/gauge_field.h | 26 ++++++++++++++-- include/lattice_field.h | 12 +++++++- lib/color_spinor_field.cpp | 2 +- lib/gauge_field.cpp | 57 ++++++++++++++++++++++++++++++++--- lib/lattice_field.cpp | 61 ++++++++++++++++++++++++++++++++++---- tests/llfat_test.cpp | 1 + 6 files changed, 144 insertions(+), 15 deletions(-) diff --git a/include/gauge_field.h b/include/gauge_field.h index 54d446839d..1c4bdfc852 100644 --- a/include/gauge_field.h +++ b/include/gauge_field.h @@ -104,7 +104,7 @@ namespace quda { staggeredPhaseType(param.staggered_phase_type), staggeredPhaseApplied(param.staggered_phase_applied), i_mu(param.i_mu), - site_offset(param.gauge_offset), + site_offset(link_type == QUDA_ASQTAD_MOM_LINKS ? 
param.mom_offset : param.gauge_offset), site_size(param.site_size) { switch (link_type) { @@ -144,9 +144,12 @@ namespace quda { }; std::ostream& operator<<(std::ostream& output, const GaugeFieldParam& param); + std::ostream& operator<<(std::ostream& output, const GaugeField& param); class GaugeField : public LatticeField { + friend std::ostream& operator<<(std::ostream& output, const GaugeField& param); + private: /** @brief Create the field as specified by the param @@ -290,9 +293,9 @@ namespace quda { /** @brief Returns if the object is empty (not initialized) - @return true if the object has been allocated, otherwise false + @return true if the object has not been allocated, otherwise false */ - bool empty() const { return init; } + bool empty() const { return !init; } /** @brief Create the communication handlers and buffers @@ -605,6 +608,23 @@ namespace quda { */ void copy_from_buffer(void *buffer); + /** + @brief Check if two instances are compatible + @param[in] a Input field + @param[in] b Input field + @return Return true if two fields are compatible + */ + static bool are_compatible(const GaugeField &a, const GaugeField &b); + + /** + @brief Check if two instances are weakly compatible (precision + and order can differ) + @param[in] a Input field + @param[in] b Input field + @return Return true if two fields are compatible + */ + static bool are_compatible_weak(const GaugeField &a, const GaugeField &b); + friend struct GaugeFieldParam; }; diff --git a/include/lattice_field.h b/include/lattice_field.h index e7c43b7d69..6c13df2fda 100644 --- a/include/lattice_field.h +++ b/include/lattice_field.h @@ -68,10 +68,13 @@ namespace quda { /** Array storing the length of dimension */ lat_dim_t x = {}; + /** Padding to be added to the checker-boarded volume (only for native field ordering) */ int pad = 0; + /** Whether the field is full or single parity */ QudaSiteSubset siteSubset = QUDA_INVALID_SITE_SUBSET; + /** The type of memory allocation to use for the field 
*/ QudaMemoryType mem_type = QUDA_MEMORY_INVALID; /** The type of ghost exchange to be done with this field */ @@ -141,15 +144,18 @@ namespace quda { } /** - @brief Contructor for creating LatticeFieldParam from a LatticeField + @brief Constructor for creating LatticeFieldParam from a LatticeField */ LatticeFieldParam(const LatticeField &field); }; std::ostream& operator<<(std::ostream& output, const LatticeFieldParam& param); + std::ostream& operator<<(std::ostream& output, const LatticeField& field); class LatticeField : public Object { + friend std::ostream& operator<<(std::ostream& output, const LatticeField& param); + /** @brief Create the field as specified by the param @param[in] Parameter struct @@ -175,9 +181,13 @@ namespace quda { /** Checkerboarded local volume */ size_t localVolumeCB = 0; + /** Stride used for native field ordering (stride = volumeCB + pad) */ size_t stride = 0; + + /** Padding to be added to the checker-boarded volume (only for native field ordering) */ int pad = 0; + /** Total size of the allocation */ size_t total_bytes = 0; /** Number of field dimensions */ diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp index a76a29b0eb..b1e7aa6060 100644 --- a/lib/color_spinor_field.cpp +++ b/lib/color_spinor_field.cpp @@ -230,7 +230,7 @@ namespace quda pc_type = std::exchange(src.pc_type, QUDA_PC_INVALID); suggested_parity = std::exchange(src.suggested_parity, QUDA_INVALID_PARITY); length = std::exchange(src.length, 0); - v = std::exchange(src.v, {}); + v.exchange(src.v, {}); // cannot use std::exchange for quda_ptr norm_offset = std::exchange(src.norm_offset, 0); ghost = std::exchange(src.ghost, {}); ghostFace = std::exchange(src.ghostFace, {}); diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp index d1700709fc..f9975bb757 100644 --- a/lib/gauge_field.cpp +++ b/lib/gauge_field.cpp @@ -33,6 +33,7 @@ namespace quda { GaugeField &GaugeField::operator=(const GaugeField &src) { + if (src.empty()) errorQuda("Copying from 
empty field"); if (&src != this) { if (!init) { // keep current attributes unless unset LatticeField::operator=(src); @@ -51,7 +52,7 @@ namespace quda { { if (&src != this) { // if field not already initialized then move the field - if (!init) { + if (!init || are_compatible(*this, src) || src.empty()) { LatticeField::operator=(std::move(src)); move(std::move(src)); } else { @@ -237,8 +238,10 @@ namespace quda { void GaugeField::move(GaugeField &&src) { - gauge = std::exchange(src.gauge, {}); - gauge_array = std::exchange(src.gauge_array, {}); + init = std::exchange(src.init, {}); + if (src.gauge.is_reference()) errorQuda("Cannot move a reference allocation"); + gauge.exchange(src.gauge, {}); + for (auto i = 0; i < gauge_array.size(); i++) gauge_array[i].exchange(src.gauge_array[i], {}); bytes = std::exchange(src.bytes, 0); phase_offset = std::exchange(src.phase_offset, 0); phase_bytes = std::exchange(src.phase_bytes, 0); @@ -257,7 +260,7 @@ namespace quda { anisotropy = std::exchange(src.anisotropy, 0.0); tadpole = std::exchange(src.tadpole, 0.0); fat_link_max = std::exchange(src.fat_link_max, 0.0); - ghost = std::exchange(src.ghost, {}); + for (auto i = 0; i < ghost.size(); i++) ghost[i].exchange(src.ghost[i], {}); ghostFace = std::exchange(src.ghostFace, {}); staggeredPhaseType = std::exchange(src.staggeredPhaseType, QUDA_STAGGERED_PHASE_INVALID); staggeredPhaseApplied = std::exchange(src.staggeredPhaseApplied, false); @@ -871,6 +874,17 @@ namespace quda { } + bool GaugeField::are_compatible_weak(const GaugeField &a, const GaugeField &b) + { + return (a.LinkType() == b.LinkType() && a.Ncolor() == b.Ncolor() && a.Nface() == b.Nface() && a.GaugeFixed() == b.GaugeFixed() + && a.TBoundary() == b.TBoundary() && a.Anisotropy() == b.Anisotropy() && a.Tadpole() == b.Tadpole()); + } + + bool GaugeField::are_compatible(const GaugeField &a, const GaugeField &b) + { + return (a.Precision() == b.Precision() && a.Order() == b.Order() && are_compatible_weak(a, b)); + } + void 
GaugeField::checkField(const LatticeField &l) const { LatticeField::checkField(l); try { @@ -1132,6 +1146,40 @@ namespace quda { return output; // for multiple << operators. } + std::ostream& operator<<(std::ostream& output, const GaugeField& field) + { + output << static_cast(field); + output << "init = " << field.init << std::endl; + output << "gauge = " << field.gauge << std::endl; + output << "gauge_array = " << field.gauge_array << std::endl; + output << "bytes = " << field.bytes << std::endl; + output << "phase_offset = " << field.phase_offset << std::endl; + output << "phase_bytes = " << field.phase_bytes << std::endl; + output << "length = " << field.length << std::endl; + output << "real_length = " << field.real_length << std::endl; + output << "nColor = " << field.nColor << std::endl; + output << "nFace = " << field.nFace << std::endl; + output << "geometry = " << field.geometry << std::endl; + output << "site_dim = " << field.geometry << std::endl; + output << "reconstruct = " << field.reconstruct << std::endl; + output << "nInternal = " << field.nInternal << std::endl; + output << "order = " << field.order << std::endl; + output << "fixed = " << field.fixed << std::endl; + output << "link_type = " << field.link_type << std::endl; + output << "t_boundary = " << field.t_boundary << std::endl; + output << "anisotropy = " << field.anisotropy << std::endl; + output << "tadpole = " << field.tadpole << std::endl; + output << "fat_link_max = " << field.fat_link_max << std::endl; + output << "ghost = " << field.ghost << std::endl; + output << "ghostFace = " << field.ghostFace << std::endl; + output << "staggeredPhaseType = " << field.staggeredPhaseType << std::endl; + output << "staggeredPhaseApplied = " << field.staggeredPhaseApplied << std::endl; + output << "i_mu = " << field.i_mu << std::endl; + output << "site_offset = " << field.site_offset << std::endl; + output << "size_size = " << field.site_size << std::endl; + return output; // for multiple << 
operators. + } + void GaugeField::zero() { if (order != QUDA_QDP_GAUGE_ORDER) { @@ -1201,6 +1249,7 @@ namespace quda { errorQuda("Cannot create an alias to source with lower precision than the alias"); GaugeFieldParam param = param_.init ? param_ : GaugeFieldParam(*this); param.create = QUDA_REFERENCE_FIELD_CREATE; + param.gauge = gauge.data(); return GaugeField(param); } diff --git a/lib/lattice_field.cpp b/lib/lattice_field.cpp index b75b1dcff8..d195b7edb3 100644 --- a/lib/lattice_field.cpp +++ b/lib/lattice_field.cpp @@ -613,20 +613,69 @@ namespace quda { std::ostream& operator<<(std::ostream& output, const LatticeFieldParam& param) { output << "nDim = " << param.nDim << std::endl; - for (int i = 0; i < param.nDim; i++) { output << "x[" << i << "] = " << param.x[i] << std::endl; } + output << "x = " << param.x << std::endl; output << "pad = " << param.pad << std::endl; output << "precision = " << param.Precision() << std::endl; output << "ghost_precision = " << param.GhostPrecision() << std::endl; output << "scale = " << param.scale << std::endl; - output << "ghostExchange = " << param.ghostExchange << std::endl; - for (int i=0; imake_resident_mom, param->return_result_mom); @@ -4082,7 +4085,7 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi GaugeField cpuMom(gParam); // create the device momentum field - if (gauge_param->use_resident_mom && !momResident.Volume()) errorQuda("Cannot use resident momentum field since none appears resident"); + if (gauge_param->use_resident_mom && momResident.empty()) errorQuda("Cannot use resident momentum field since none appears resident"); gParam.location = QUDA_CUDA_FIELD_LOCATION; gParam.link_type = QUDA_ASQTAD_MOM_LINKS; gParam.create = QUDA_COPY_FIELD_CREATE; @@ -4180,7 +4183,7 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi if (gauge_param->return_result_mom) cpuMom.copy(cudaMom); if (gauge_param->make_resident_mom && !gauge_param->use_resident_mom) 
std::exchange(momResident, cudaMom); - else momResident = GaugeField(); + else if (!gauge_param->make_resident_mom) momResident = GaugeField(); for (int i=0; ireturn_result_mom) cpuMom.copy(mom); - if (!gParam->make_resident_mom) momResident = GaugeField(); - if (gParam->make_resident_mom && !gParam->use_resident_mom) - std::exchange(momResident, mom); - else - momResident = GaugeField(); + if (gParam->make_resident_mom && !gParam->use_resident_mom) std::exchange(momResident, mom); + else if (!gParam->make_resident_mom) momResident = GaugeField(); } void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double *coeff, double kappa2, double ck, @@ -4658,7 +4658,7 @@ void updateGaugeFieldQuda(void* gauge, void* momentum, double dt, int conj_mom, GaugeField cpuMom = !param->use_resident_mom ? GaugeField(gParamMom) : GaugeField(); // create the device fields - if (param->use_resident_mom && !momResident.Volume()) errorQuda("No resident mom field allocated"); + if (param->use_resident_mom && momResident.empty()) errorQuda("No resident mom field allocated"); gParam.location = QUDA_CUDA_FIELD_LOCATION; gParam.create = QUDA_COPY_FIELD_CREATE; gParam.field = &cpuMom; @@ -4685,13 +4685,13 @@ void updateGaugeFieldQuda(void* gauge, void* momentum, double dt, int conj_mom, if (param->return_result_gauge) cpuGauge.copy(u_out); if (param->make_resident_gauge) { - if (gaugePrecise && !param->use_resident_gauge) freeUniqueGaugeQuda(QUDA_WILSON_LINKS); + if (gaugePrecise) freeUniqueGaugeQuda(QUDA_WILSON_LINKS); gaugePrecise = new GaugeField(); std::exchange(*gaugePrecise, u_out); } if (param->make_resident_mom && !param->use_resident_mom) std::exchange(momResident, cudaMom); - else momResident = GaugeField(); + else if (!param->make_resident_mom) momResident = GaugeField(); } void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param) @@ -4700,7 +4700,7 @@ void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param) checkGaugeParam(param); // 
create the gauge field - GaugeFieldParam gParam(*param, gauge_h, QUDA_GENERAL_LINKS); + GaugeFieldParam gParam(*param, gauge_h, QUDA_SU3_LINKS); gParam.location = QUDA_CPU_FIELD_LOCATION; bool need_cpu = !param->use_resident_gauge || param->return_result_gauge; GaugeField cpuGauge = need_cpu ? GaugeField(gParam) : GaugeField(); @@ -4725,8 +4725,8 @@ void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param) if (param->return_result_gauge) cpuGauge.copy(cudaGauge); - if (param->make_resident_gauge) { - if (gaugePrecise != nullptr && !param->use_resident_gauge) freeUniqueGaugeQuda(QUDA_WILSON_LINKS); + if (param->make_resident_gauge && !param->use_resident_gauge) { + if (gaugePrecise) freeUniqueGaugeQuda(QUDA_WILSON_LINKS); gaugePrecise = new GaugeField(); std::exchange(*gaugePrecise, cudaGauge); } @@ -4762,8 +4762,8 @@ void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param) if (param->return_result_gauge) cpuGauge.copy(cudaGauge); - if (param->make_resident_gauge) { - if (gaugePrecise != nullptr && !param->use_resident_gauge) freeUniqueGaugeQuda(QUDA_WILSON_LINKS); + if (param->make_resident_gauge && !param->use_resident_gauge) { + if (gaugePrecise) freeUniqueGaugeQuda(QUDA_WILSON_LINKS); gaugePrecise = new GaugeField(); std::exchange(*gaugePrecise, cudaGauge); } @@ -4787,15 +4787,14 @@ double momActionQuda(void* momentum, QudaGaugeParam* param) gParam.reconstruct = QUDA_RECONSTRUCT_10; gParam.setPrecision(param->cuda_prec, true); - if (param->use_resident_mom && !momResident.Volume()) errorQuda("No resident mom field allocated"); + if (param->use_resident_mom && momResident.empty()) errorQuda("No resident mom field allocated"); GaugeField cudaMom = param->use_resident_mom ? 
momResident.create_alias() : GaugeField(gParam); // perform the update double action = computeMomAction(cudaMom); - if (param->make_resident_mom && !param->use_resident_mom) - std::exchange(momResident, cudaMom); - else momResident = GaugeField(); + if (param->make_resident_mom && !param->use_resident_mom) std::exchange(momResident, cudaMom); + else if (!param->make_resident_mom) momResident = GaugeField(); return action; } @@ -4816,7 +4815,7 @@ void gaussGaugeQuda(unsigned long long seed, double sigma) void gaussMomQuda(unsigned long long seed, double sigma) { auto profile = pushProfile(profileGauss); - if (!momResident.Volume()) errorQuda("Cannot generate Gauss GaugeField as there is no resident momentum field"); + if (momResident.empty()) errorQuda("Cannot generate Gauss GaugeField as there is no resident momentum field"); quda::gaugeGauss(momResident, seed, sigma); } From 69f73031f9c3bc39be494496bae75aedc3679998 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 21 Sep 2023 13:36:03 -0700 Subject: [PATCH 47/99] Fix #1406 --- include/kernels/dslash_gamma_helper.cuh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/kernels/dslash_gamma_helper.cuh b/include/kernels/dslash_gamma_helper.cuh index 3b5e27492a..5261ea5b32 100644 --- a/include/kernels/dslash_gamma_helper.cuh +++ b/include/kernels/dslash_gamma_helper.cuh @@ -78,11 +78,11 @@ namespace quda { { ColorSpinor in = arg.in(x_cb, parity); switch(arg.d) { - case 0: arg.out(x_cb, parity) = in.gamma(0); - case 1: arg.out(x_cb, parity) = in.gamma(1); - case 2: arg.out(x_cb, parity) = in.gamma(2); - case 3: arg.out(x_cb, parity) = in.gamma(3); - case 4: arg.out(x_cb, parity) = in.gamma(4); + case 0: arg.out(x_cb, parity) = in.gamma(0); break; + case 1: arg.out(x_cb, parity) = in.gamma(1); break; + case 2: arg.out(x_cb, parity) = in.gamma(2); break; + case 3: arg.out(x_cb, parity) = in.gamma(3); break; + case 4: arg.out(x_cb, parity) = in.gamma(4); break; } } }; From 
8aac21a3318fa1015ff9794e6615943feaa07e04 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 21 Sep 2023 13:38:00 -0700 Subject: [PATCH 48/99] Fix 32-bit overflow issue when sizing compressed gauge fields (Thanks to @stevengottlieb) --- lib/gauge_field.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp index f9975bb757..af9cc7bf90 100644 --- a/lib/gauge_field.cpp +++ b/lib/gauge_field.cpp @@ -129,9 +129,9 @@ namespace quda { if (isNative()) { if (reconstruct == QUDA_RECONSTRUCT_9 || reconstruct == QUDA_RECONSTRUCT_13) { // Need to adjust the phase alignment as well. - int half_phase_bytes + size_t half_phase_bytes = (length / (2 * reconstruct)) * precision; // bytes needed to store phases for a single parity - int half_gauge_bytes = (length / 2) * precision + size_t half_gauge_bytes = (length / 2) * precision - half_phase_bytes; // bytes needed to store the gauge field for a single parity excluding the phases // Adjust the alignments for the gauge and phase separately half_phase_bytes = ALIGNMENT_ADJUST(half_phase_bytes); From b961afad6054e43cf622374d179fa64558ea1099 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 26 Sep 2023 16:17:47 -0700 Subject: [PATCH 49/99] Baseline additions to get OpenMP (host) enabled --- include/quda_arch.h | 3 +++ lib/CMakeLists.txt | 1 + 2 files changed, 4 insertions(+) diff --git a/include/quda_arch.h b/include/quda_arch.h index ed88fb0b8e..45a8ed34e4 100644 --- a/include/quda_arch.h +++ b/include/quda_arch.h @@ -14,5 +14,8 @@ #elif defined(QUDA_TARGET_SYCL) #include +#endif +#ifdef QUDA_OPENMP +#include #endif diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index c165214251..3bf5edc6d9 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -469,6 +469,7 @@ endif() if(QUDA_OPENMP) target_link_libraries(quda PUBLIC OpenMP::OpenMP_CXX) + target_compile_definitions(quda PUBLIC QUDA_OPENMP) endif() # set which precisions to enable From 
3a59ef67339d661d63589f1c9604a26012985313 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 26 Sep 2023 16:20:24 -0700 Subject: [PATCH 50/99] OMP parallelization of gauge force / path reference code --- tests/gauge_path_test.cpp | 10 ++++++ .../host_reference/gauge_force_reference.cpp | 35 ++++++++++--------- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/tests/gauge_path_test.cpp b/tests/gauge_path_test.cpp index 7d37c9faad..d9110d899f 100644 --- a/tests/gauge_path_test.cpp +++ b/tests/gauge_path_test.cpp @@ -204,11 +204,15 @@ void gauge_force_test(bool compute_force = true) void *refmom = Mom_ref_milc.data(); int *check_out = compute_force ? &force_check : &path_check; if (verify_results) { + quda::host_timer_t verify_timer; + verify_timer.start(); gauge_force_reference(refmom, eb3, U_qdp, input_path_buf, length, loop_coeff, num_paths, compute_force); *check_out = compare_floats(Mom_milc.data(), refmom, 4 * V * mom_site_size, getTolerance(cuda_prec), gauge_param.cpu_prec); if (compute_force) strong_check_mom(Mom_milc.data(), refmom, 4 * V, gauge_param.cpu_prec); + verify_timer.stop(); + printfQuda("Verification time = %.2f ms\n", verify_timer.last()); } if (compute_force) { @@ -318,6 +322,9 @@ void gauge_loop_test() std::vector traces_ref(num_paths); if (verify_results) { + quda::host_timer_t verify_timer; + verify_timer.start(); + gauge_loop_trace_reference(U_qdp, traces_ref, scale_factor, trace_path_p, trace_loop_length_p, trace_loop_coeff_p, num_paths); @@ -349,6 +356,9 @@ void gauge_loop_test() "Plaquette loop space %e time %e total %e ; plaqQuda space %e time %e total %e ; deviation %e\n", plaq_loop[0], plaq_loop[1], plaq_loop[2], obsParam.plaquette[0], obsParam.plaquette[1], obsParam.plaquette[2], plaq_deviation); + + verify_timer.stop(); + printfQuda("Verification time = %.2f ms\n", verify_timer.last()); } double perf = 1.0 * niter * flops * V / (host_timer.last() * 1e+9); diff --git a/tests/host_reference/gauge_force_reference.cpp
b/tests/host_reference/gauge_force_reference.cpp index a575532731..1b422856f7 100644 --- a/tests/host_reference/gauge_force_reference.cpp +++ b/tests/host_reference/gauge_force_reference.cpp @@ -69,8 +69,15 @@ struct fcomplex { struct dcomplex { double real; double imag; + + void operator+=(const dcomplex &other) { + real += other.real; + imag += other.imag; + } }; +#pragma omp declare reduction(dcomplex_sum: dcomplex: omp_out += omp_in) + struct fsu3_matrix { using real_t = float; using complex_t = fcomplex; @@ -322,9 +329,7 @@ int gf_neighborIndexFullLattice(size_t i, int dx[], const lattice_t &lat) template static su3_matrix compute_gauge_path(su3_matrix **sitelink, int i, int *path, int len, int dx[4], const lattice_t &lat) { - su3_matrix prev_matrix, curr_matrix; - - memset(&curr_matrix, 0, sizeof(curr_matrix)); + su3_matrix prev_matrix, curr_matrix = {}; curr_matrix.e[0][0].real = 1; curr_matrix.e[1][1].real = 1; @@ -366,16 +371,14 @@ template static void compute_path_product(su3_matrix *staple, su3_matrix **sitelink, int *path, int len, Float loop_coeff, int dir, const lattice_t &lat) { - su3_matrix curr_matrix, tmat; - int dx[4]; - +#pragma omp parallel for for (size_t i = 0; i < lat.volume; i++) { - memset(dx, 0, sizeof(dx)); - + int dx[4] = {}; dx[dir] = 1; - curr_matrix = compute_gauge_path(sitelink, i, path, len, dx, lat); + su3_matrix curr_matrix = compute_gauge_path(sitelink, i, path, len, dx, lat); + su3_matrix tmat; su3_adjoint(&curr_matrix, &tmat); scalar_mult_add_su3_matrix(staple + i, &tmat, loop_coeff, staple + i); } // i @@ -384,16 +387,14 @@ static void compute_path_product(su3_matrix *staple, su3_matrix **sitelink, int template static dcomplex compute_loop_trace(su3_matrix **sitelink, int *path, int len, double loop_coeff, const lattice_t &lat) { - su3_matrix tmat; - dcomplex accum; - memset(&accum, 0, sizeof(accum)); - int dx[4]; + dcomplex accum = {}; +#pragma omp parallel for reduction(dcomplex_sum : accum) for (size_t i = 0; i < 
lat.volume; i++) { - memset(dx, 0, sizeof(dx)); - tmat = compute_gauge_path(sitelink, i, path, len, dx, lat); + int dx[4] = {}; + su3_matrix tmat = compute_gauge_path(sitelink, i, path, len, dx, lat); auto tr = trace_su3(&tmat); - CSUM(accum, tr); + accum += dcomplex{tr.real, tr.imag}; } CSCALE(accum, loop_coeff); @@ -405,6 +406,7 @@ template static void update_mom(anti_hermitmat *momentum, int dir, su3_matrix **sitelink, su3_matrix *staple, Float eb3, const lattice_t &lat) { +#pragma omp parallel for for (size_t i = 0; i < lat.volume; i++) { su3_matrix tmat1; su3_matrix tmat2; @@ -426,6 +428,7 @@ template static void update_gauge(su3_matrix *gauge, int dir, su3_matrix **sitelink, su3_matrix *staple, Float eb3, const lattice_t &lat) { +#pragma omp parallel for for (size_t i = 0; i < lat.volume; i++) { su3_matrix tmat; From 1b91ee36a6dd8ef36553b576a9e82bb18260a612 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 26 Sep 2023 16:21:43 -0700 Subject: [PATCH 51/99] OMP parallelization of host reference fat-link construction --- tests/utils/llfat_utils.cpp | 46 ++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/tests/utils/llfat_utils.cpp b/tests/utils/llfat_utils.cpp index 43d2acaffa..c0979b7829 100644 --- a/tests/utils/llfat_utils.cpp +++ b/tests/utils/llfat_utils.cpp @@ -29,10 +29,6 @@ template void llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu, su3_matrix *mulink, su3_matrix **sitelink, void **fatlink, Real coef, int use_staple) { - su3_matrix tmat1, tmat2; - int i; - su3_matrix *fat1; - /* Upper staple */ /* Computes the staple : * mu (B) @@ -46,16 +42,15 @@ void llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu, su3_matr * It also adds the computed staple to the fatlink[mu] with weight coef. 
*/ - int dx[4]; - /* upper staple */ - for (i = 0; i < V; i++) { +#pragma omp parallel for + for (int i = 0; i < V; i++) { - fat1 = ((su3_matrix *)fatlink[mu]) + i; + auto fat1 = ((su3_matrix *)fatlink[mu]) + i; su3_matrix *A = sitelink[nu] + i; - memset(dx, 0, sizeof(dx)); + int dx[4] = {}; dx[nu] = 1; int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]); su3_matrix *B; @@ -70,6 +65,7 @@ void llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu, su3_matr nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]); su3_matrix *C = sitelink[nu] + nbr_idx; + su3_matrix tmat1, tmat2; llfat_mult_su3_nn(A, B, &tmat1); if (staple != NULL) { /* Save the staple */ @@ -89,10 +85,11 @@ void llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu, su3_matr * *********************************************/ - for (i = 0; i < V; i++) { +#pragma omp parallel for + for (int i = 0; i < V; i++) { - fat1 = ((su3_matrix *)fatlink[mu]) + i; - memset(dx, 0, sizeof(dx)); + auto fat1 = ((su3_matrix *)fatlink[mu]) + i; + int dx[4] = {}; dx[nu] = -1; int nbr_idx = neighborIndexFullLattice(i, dx[3], dx[2], dx[1], dx[0]); if (nbr_idx >= V || nbr_idx < 0) { @@ -113,6 +110,7 @@ void llfat_compute_gen_staple_field(su3_matrix *staple, int mu, int nu, su3_matr nbr_idx = neighborIndexFullLattice(nbr_idx, dx[3], dx[2], dx[1], dx[0]); su3_matrix *C = sitelink[nu] + nbr_idx; + su3_matrix tmat1, tmat2; llfat_mult_su3_an(A, B, &tmat1); llfat_mult_su3_nn(&tmat1, C, &tmat2); @@ -148,6 +146,7 @@ void llfat_cpu(void **fatlink, su3_matrix **sitelink, Float *act_path_coeff) for (int dir = XUP; dir <= TUP; dir++) { // Intialize fat links with c_1*U_\mu(x) +#pragma omp parallel for for (int i = 0; i < V; i++) { su3_matrix *fat1 = ((su3_matrix *)fatlink[dir]) + i; llfat_scalar_mult_su3_matrix(sitelink[dir] + i, one_link, fat1); @@ -210,10 +209,6 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m su3_matrix **ghost_mulink, su3_matrix 
**sitelink, su3_matrix **ghost_sitelink, su3_matrix **ghost_sitelink_diag, void **fatlink, Real coef, int use_staple) { - su3_matrix tmat1, tmat2; - int i; - su3_matrix *fat1; - int X1 = Z[0]; int X2 = Z[1]; int X3 = Z[2]; @@ -237,11 +232,10 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m * It also adds the computed staple to the fatlink[mu] with weight coef. */ - int dx[4]; - // upper staple - for (i = 0; i < V; i++) { +#pragma omp parallel for + for (int i = 0; i < V; i++) { int half_index = i; int oddBit = 0; @@ -264,10 +258,10 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m int space_con[4] = {(x4 * X3X2 + x3 * X2 + x2) / 2, (x4 * X3X1 + x3 * X1 + x1) / 2, (x4 * X2X1 + x2 * X1 + x1) / 2, (x3 * X2X1 + x2 * X1 + x1) / 2}; - fat1 = ((su3_matrix *)fatlink[mu]) + i; + auto fat1 = ((su3_matrix *)fatlink[mu]) + i; su3_matrix *A = sitelink[nu] + i; - memset(dx, 0, sizeof(dx)); + int dx[4] = {}; dx[nu] = 1; int nbr_idx; @@ -299,6 +293,7 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m C = sitelink[nu] + nbr_idx; } + su3_matrix tmat1, tmat2; llfat_mult_su3_nn(A, B, &tmat1); if (staple != NULL) { /* Save the staple */ @@ -318,7 +313,8 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m * *********************************************/ - for (i = 0; i < V; i++) { +#pragma omp parallel for + for (int i = 0; i < V; i++) { int half_index = i; int oddBit = 0; @@ -342,11 +338,11 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, su3_m // int x4 = x4_from_full_index(i); - fat1 = ((su3_matrix *)fatlink[mu]) + i; + auto fat1 = ((su3_matrix *)fatlink[mu]) + i; // we could be in the ghost link area if nu is T and we are at low T boundary su3_matrix *A; - memset(dx, 0, sizeof(dx)); + int dx[4] = {}; dx[nu] = -1; int nbr_idx; @@ -412,6 +408,7 @@ void llfat_compute_gen_staple_field_mg(su3_matrix *staple, int mu, int nu, 
su3_m } else { C = sitelink[nu] + nbr_idx; } + su3_matrix tmat1, tmat2; llfat_mult_su3_an(A, B, &tmat1); llfat_mult_su3_nn(&tmat1, C, &tmat2); @@ -455,6 +452,7 @@ void llfat_cpu_mg(void **fatlink, su3_matrix **sitelink, su3_matrix **ghost_site for (int dir = XUP; dir <= TUP; dir++) { // Intialize fat links with c_1*U_\mu(x) +#pragma omp parallel for for (int i = 0; i < V; i++) { su3_matrix *fat1 = ((su3_matrix *)fatlink[dir]) + i; llfat_scalar_mult_su3_matrix(sitelink[dir] + i, one_link, fat1); From 90948556a8686c1c43df79a0918e464c01ec9637 Mon Sep 17 00:00:00 2001 From: SaltyChiang Date: Tue, 3 Oct 2023 02:55:44 +0800 Subject: [PATCH 52/99] Make unitarized fat7 link special unitarized. --- include/kernels/gauge_phase.cuh | 6 ++-- include/kernels/unitarize_links.cuh | 46 ++++++++++++++++++++++++----- lib/cpu_gauge_field.cpp | 2 +- lib/unitarize_links_quda.cu | 14 +++++++-- 4 files changed, 54 insertions(+), 14 deletions(-) diff --git a/include/kernels/gauge_phase.cuh b/include/kernels/gauge_phase.cuh index ef57369cb8..3fe03842a7 100644 --- a/include/kernels/gauge_phase.cuh +++ b/include/kernels/gauge_phase.cuh @@ -67,12 +67,12 @@ namespace quda { if (dim==0) { phase = 1.0; } else if (dim == 1) { - phase = (1.0 - 2.0 * ((1 + x) % 2) ); + phase = (1.0 - 2.0 * ((x) % 2) ); } else if (dim == 2) { - phase = (1.0 - 2.0 * ((1 + x + y) % 2) ); + phase = (1.0 - 2.0 * ((x + y) % 2) ); } else if (dim == 3) { // also apply boundary condition phase = ((t == arg.X[3]-1) ? 
arg.tBoundary : 1.0) * - (1.0 - 2 * ((1 + x + y + z) % 2) ); + (1.0 - 2 * ((x + y + z) % 2) ); } } return phase; diff --git a/include/kernels/unitarize_links.cuh b/include/kernels/unitarize_links.cuh index f0faef8570..742c5a9205 100644 --- a/include/kernels/unitarize_links.cuh +++ b/include/kernels/unitarize_links.cuh @@ -7,19 +7,23 @@ #include #include #include +#include namespace quda { - template + template struct UnitarizeArg : kernel_param<> { - using real = typename mapper::type; + using Float = double; + using real = typename mapper::type; static constexpr int nColor = nColor_; static constexpr QudaReconstructType recon = recon_; - typedef typename gauge_mapper::type Gauge; + static constexpr QudaStaggeredPhase phase = phase_; + typedef typename gauge_mapper::type Gauge; Gauge out; const Gauge in; int X[4]; // grid dimensions + double tBoundary; int *fails; const int max_iter; const double unitarize_eps; @@ -46,6 +50,9 @@ namespace quda { svd_abs_error(svd_abs_error) { for (int dir=0; dir<4; ++dir) X[dir] = in.X()[dir]; + + bool last_node_in_t = (commCoords(3) == commDim(3)-1); + tBoundary = (Float)(last_node_in_t ? 
in.TBoundary() : QUDA_PERIODIC_T); } }; @@ -182,6 +189,16 @@ namespace quda { return true; } // unitarizeMILC + template + __host__ __device__ void specialUnitarizeLinkMILC(mat &out, const mat &in, const Arg &arg) + { + complex det = getDeterminant(in); + real r = exp(-log(abs(det)) / Arg::nColor); + real alpha = atan2(det.imag(), det.real()) / Arg::nColor; + + out = in * polar(r, -alpha); + } // specialUnitarizeLinkMILC + template __host__ __device__ bool unitarizeLinkNewton(mat &out, const mat& in, int max_iter) { @@ -209,16 +226,29 @@ namespace quda { __device__ __host__ inline void operator()(int x_cb, int parity, int mu) { - // result is always in double precision - Matrix,Arg::nColor> v, result; + // v and w are always in double precision + Matrix,Arg::nColor> v, w; Matrix,Arg::nColor> tmp = arg.in(mu, x_cb, parity); v = tmp; - unitarizeLinkMILC(result, v, arg); + unitarizeLinkMILC(w, v, arg); if (arg.check_unitarization) { - if (result.isUnitary(arg.max_error) == false) atomic_fetch_add(arg.fails, 1); + if (w.isUnitary(arg.max_error) == false) atomic_fetch_add(arg.fails, 1); + } + + int x[4]; + getCoords(x, x_cb, arg.X, parity); + + double phase; + switch (mu) { + case 0: phase = getPhase<0>(x[0], x[1], x[2], x[3], arg); break; + case 1: phase = getPhase<1>(x[0], x[1], x[2], x[3], arg); break; + case 2: phase = getPhase<2>(x[0], x[1], x[2], x[3], arg); break; + case 3: phase = getPhase<3>(x[0], x[1], x[2], x[3], arg); break; } - tmp = result; + v = w * phase; + specialUnitarizeLinkMILC(w, v, arg); + tmp = w * phase; arg.out(mu, x_cb, parity) = tmp; } diff --git a/lib/cpu_gauge_field.cpp b/lib/cpu_gauge_field.cpp index f4b27109a8..a8d21c896b 100644 --- a/lib/cpu_gauge_field.cpp +++ b/lib/cpu_gauge_field.cpp @@ -53,7 +53,7 @@ namespace quda { gauge[d] = nbytes ? 
safe_malloc(nbytes) : nullptr; if (create == QUDA_ZERO_FIELD_CREATE && nbytes) memset(gauge[d], 0, nbytes); } else if (create == QUDA_REFERENCE_FIELD_CREATE) { - gauge[d] = ((void **)param.gauge)[d]; + gauge[d] = (param.gauge) ? ((void **)param.gauge)[d] : nullptr; } else { errorQuda("Unsupported creation type %d", create); } diff --git a/lib/unitarize_links_quda.cu b/lib/unitarize_links_quda.cu index fa006f0b4a..0ad0986180 100644 --- a/lib/unitarize_links_quda.cu +++ b/lib/unitarize_links_quda.cu @@ -121,8 +121,18 @@ namespace quda { void apply(const qudaStream_t &stream) { TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); - launch(tp, stream, - UnitarizeArg(out, in, fails, max_iter, unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error)); + if (in.StaggeredPhase() == QUDA_STAGGERED_PHASE_MILC) { + UnitarizeArg arg(out, in, fails, max_iter, unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error); + launch(tp, stream, arg); + } else if (in.StaggeredPhase() == QUDA_STAGGERED_PHASE_CPS) { + UnitarizeArg arg(out, in, fails, max_iter, unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error); + launch(tp, stream, arg); + } else if (in.StaggeredPhase() == QUDA_STAGGERED_PHASE_TIFR) { + UnitarizeArg arg(out, in, fails, max_iter, unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error); + launch(tp, stream, arg); + } else { + errorQuda("Undefined phase type %d", in.StaggeredPhase()); + } } void preTune() { if (in.Gauge_p() == out.Gauge_p()) out.backup(); } From 415a443afae5aea024a3cfc1aa7a389c3c6721b0 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 3 Oct 2023 12:35:16 -0700 Subject: [PATCH 53/99] LatticeFieldParam should set its location from QudaGaugeParam::location --- include/lattice_field.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/lattice_field.h b/include/lattice_field.h index 
6c13df2fda..a7ca3984ee 100644 --- a/include/lattice_field.h +++ b/include/lattice_field.h @@ -126,7 +126,7 @@ namespace quda { @param[in] param Contains the metadata for filling out the LatticeFieldParam */ LatticeFieldParam(const QudaGaugeParam &param) : - location(QUDA_CPU_FIELD_LOCATION), + location(param.location), precision(param.cpu_prec), ghost_precision(param.cpu_prec), init(true), From b211699a7398bac2e3a1169046b9662d57a7356f Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 3 Oct 2023 13:06:53 -0700 Subject: [PATCH 54/99] Fix for QUDA_CTEST_LAUNCH --- tests/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 533de3a8c1..d7532d821c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -265,8 +265,8 @@ if(QUDA_MPI OR QUDA_QMP) if(DEFINED ENV{QUDA_TEST_GRID_SIZE}) get_test_ranks($ENV{QUDA_TEST_GRID_SIZE} QUDA_TEST_NUM_PROCS) endif() - set(QUDA_CTEST_LAUNCH "${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${QUDA_TEST_NUM_PROCS} ${MPIEXEC_PREFLAGS}" - CACHE STRING "CTest Launcher command for QUDA's tests") + set(QUDA_CTEST_LAUNCH ${MPIEXEC_EXECUTABLE};${MPIEXEC_NUMPROC_FLAG};${QUDA_TEST_NUM_PROCS};${MPIEXEC_PREFLAGS} + CACHE STRING "CTest Launcher command for QUDA's tests") endif() # BLAS tests From 830eed0d334f383acee97c1d09c7b8177bb29a72 Mon Sep 17 00:00:00 2001 From: SaltyChiang Date: Wed, 4 Oct 2023 19:46:17 +0800 Subject: [PATCH 55/99] Add QUDA_STAGGERED_PHASE_CHROMA to match Chroma's convention.
--- include/enum_quda.h | 1 + include/enum_quda_fortran.h | 9 ++++---- include/gauge_field_order.h | 3 ++- include/kernels/gauge_phase.cuh | 12 ++++++++++ include/kernels/unitarize_links.cuh | 36 ++++++++++++++++------------- lib/gauge_phase.cu | 3 +++ lib/unitarize_links_quda.cu | 3 +++ 7 files changed, 46 insertions(+), 21 deletions(-) diff --git a/include/enum_quda.h b/include/enum_quda.h index 0aa7966d55..ab90e26067 100644 --- a/include/enum_quda.h +++ b/include/enum_quda.h @@ -534,6 +534,7 @@ typedef enum QudaStaggeredPhase_s { QUDA_STAGGERED_PHASE_MILC = 1, QUDA_STAGGERED_PHASE_CPS = 2, QUDA_STAGGERED_PHASE_TIFR = 3, + QUDA_STAGGERED_PHASE_CHROMA = 4, QUDA_STAGGERED_PHASE_INVALID = QUDA_INVALID_ENUM } QudaStaggeredPhase; diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h index c24dd3b869..65d9f515d4 100644 --- a/include/enum_quda_fortran.h +++ b/include/enum_quda_fortran.h @@ -474,10 +474,11 @@ #define QUDA_GHOST_EXCHANGE_INVALID QUDA_INVALID_ENUM #define QudaStaggeredPhase integer(4) -#define QUDA_STAGGERED_PHASE_NO 0 -#define QUDA_STAGGERED_PHASE_MILC 1 -#define QUDA_STAGGERED_PHASE_CPS 2 -#define QUDA_STAGGERED_PHASE_TIFR 3 +#define QUDA_STAGGERED_PHASE_NO 0 +#define QUDA_STAGGERED_PHASE_MILC 1 +#define QUDA_STAGGERED_PHASE_CPS 2 +#define QUDA_STAGGERED_PHASE_TIFR 3 +#define QUDA_STAGGERED_PHASE_CHROMA 4 #define QUDA_STAGGERED_PHASE_INVALID QUDA_INVALID_ENUM #define QudaContractType integer(4) diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index 899f187ad7..2d5d0c8c6e 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -1502,7 +1502,8 @@ namespace quda { switch (phase) { case QUDA_STAGGERED_PHASE_MILC: case QUDA_STAGGERED_PHASE_CPS: - case QUDA_STAGGERED_PHASE_TIFR: return true; + case QUDA_STAGGERED_PHASE_TIFR: + case QUDA_STAGGERED_PHASE_CHROMA: return true; default: return false; } } diff --git a/include/kernels/gauge_phase.cuh b/include/kernels/gauge_phase.cuh index 
3fe03842a7..41801df74a 100644 --- a/include/kernels/gauge_phase.cuh +++ b/include/kernels/gauge_phase.cuh @@ -74,6 +74,18 @@ namespace quda { phase = ((t == arg.X[3]-1) ? arg.tBoundary : 1.0) * (1.0 - 2 * ((x + y + z) % 2) ); } + } else if (Arg::phase == QUDA_STAGGERED_PHASE_CHROMA) { + // Chroma follows CPS convention, but uses -Dslash instead of Dslash compared to QUDA + if (dim==0) { + phase = -1.0; + } else if (dim == 1) { + phase = (1.0 - 2.0 * ((1 + x) % 2) ); + } else if (dim == 2) { + phase = (1.0 - 2.0 * ((1 + x + y) % 2) ); + } else if (dim == 3) { // also apply boundary condition + phase = ((t == arg.X[3]-1) ? arg.tBoundary : 1.0) * + (1.0 - 2 * ((1 + x + y + z) % 2) ); + } } return phase; } diff --git a/include/kernels/unitarize_links.cuh b/include/kernels/unitarize_links.cuh index 742c5a9205..87f7a7ec02 100644 --- a/include/kernels/unitarize_links.cuh +++ b/include/kernels/unitarize_links.cuh @@ -226,29 +226,33 @@ namespace quda { __device__ __host__ inline void operator()(int x_cb, int parity, int mu) { - // v and w are always in double precision - Matrix,Arg::nColor> v, w; + // result is always in double precision + Matrix,Arg::nColor> v, result; Matrix,Arg::nColor> tmp = arg.in(mu, x_cb, parity); v = tmp; - unitarizeLinkMILC(w, v, arg); + unitarizeLinkMILC(result, v, arg); if (arg.check_unitarization) { - if (w.isUnitary(arg.max_error) == false) atomic_fetch_add(arg.fails, 1); + if (result.isUnitary(arg.max_error) == false) atomic_fetch_add(arg.fails, 1); } - int x[4]; - getCoords(x, x_cb, arg.X, parity); - - double phase; - switch (mu) { - case 0: phase = getPhase<0>(x[0], x[1], x[2], x[3], arg); break; - case 1: phase = getPhase<1>(x[0], x[1], x[2], x[3], arg); break; - case 2: phase = getPhase<2>(x[0], x[1], x[2], x[3], arg); break; - case 3: phase = getPhase<3>(x[0], x[1], x[2], x[3], arg); break; + if (Arg::phase == QUDA_STAGGERED_PHASE_CHROMA) { // Special unitrize the result for Chroma convention + int x[4]; + getCoords(x, x_cb, arg.X, 
parity); + + double phase; + switch (mu) { + case 0: phase = getPhase<0>(x[0], x[1], x[2], x[3], arg); break; + case 1: phase = getPhase<1>(x[0], x[1], x[2], x[3], arg); break; + case 2: phase = getPhase<2>(x[0], x[1], x[2], x[3], arg); break; + case 3: phase = getPhase<3>(x[0], x[1], x[2], x[3], arg); break; + } + v = result * phase; + specialUnitarizeLinkMILC(result, v, arg); + result *= phase; } - v = w * phase; - specialUnitarizeLinkMILC(w, v, arg); - tmp = w * phase; + + tmp = result; arg.out(mu, x_cb, parity) = tmp; } diff --git a/lib/gauge_phase.cu b/lib/gauge_phase.cu index ff959ef0b3..c8149d5a16 100644 --- a/lib/gauge_phase.cu +++ b/lib/gauge_phase.cu @@ -31,6 +31,9 @@ namespace quda { } else if (u.StaggeredPhase() == QUDA_STAGGERED_PHASE_TIFR) { GaugePhaseArg arg(u); launch(tp, stream, arg); + } else if (u.StaggeredPhase() == QUDA_STAGGERED_PHASE_CHROMA) { + GaugePhaseArg arg(u); + launch(tp, stream, arg); } else { errorQuda("Undefined phase type %d", u.StaggeredPhase()); } diff --git a/lib/unitarize_links_quda.cu b/lib/unitarize_links_quda.cu index 0ad0986180..51a7b6d309 100644 --- a/lib/unitarize_links_quda.cu +++ b/lib/unitarize_links_quda.cu @@ -130,6 +130,9 @@ namespace quda { } else if (in.StaggeredPhase() == QUDA_STAGGERED_PHASE_TIFR) { UnitarizeArg arg(out, in, fails, max_iter, unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error); launch(tp, stream, arg); + } else if (in.StaggeredPhase() == QUDA_STAGGERED_PHASE_CHROMA) { + UnitarizeArg arg(out, in, fails, max_iter, unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error); + launch(tp, stream, arg); } else { errorQuda("Undefined phase type %d", in.StaggeredPhase()); } From 37a08acc151f6b1b7f5cd09a56ac031ff6d51e88 Mon Sep 17 00:00:00 2001 From: SaltyChiang Date: Thu, 5 Oct 2023 00:54:42 +0800 Subject: [PATCH 56/99] Fix typo and missing situation. 
--- include/kernels/unitarize_links.cuh | 2 +- lib/unitarize_links_quda.cu | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/kernels/unitarize_links.cuh b/include/kernels/unitarize_links.cuh index 87f7a7ec02..62d526a553 100644 --- a/include/kernels/unitarize_links.cuh +++ b/include/kernels/unitarize_links.cuh @@ -236,7 +236,7 @@ namespace quda { if (result.isUnitary(arg.max_error) == false) atomic_fetch_add(arg.fails, 1); } - if (Arg::phase == QUDA_STAGGERED_PHASE_CHROMA) { // Special unitrize the result for Chroma convention + if constexpr (Arg::phase == QUDA_STAGGERED_PHASE_CHROMA) { // Special unitraize the result for Chroma convention int x[4]; getCoords(x, x_cb, arg.X, parity); diff --git a/lib/unitarize_links_quda.cu b/lib/unitarize_links_quda.cu index 51a7b6d309..eb0a81bacf 100644 --- a/lib/unitarize_links_quda.cu +++ b/lib/unitarize_links_quda.cu @@ -133,6 +133,9 @@ namespace quda { } else if (in.StaggeredPhase() == QUDA_STAGGERED_PHASE_CHROMA) { UnitarizeArg arg(out, in, fails, max_iter, unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error); launch(tp, stream, arg); + } else if (in.StaggeredPhase() == QUDA_STAGGERED_PHASE_NO) { + UnitarizeArg arg(out, in, fails, max_iter, unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error); + launch(tp, stream, arg); } else { errorQuda("Undefined phase type %d", in.StaggeredPhase()); } From dfef80f6b7d3b7707cae49442919591f0d3d943b Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 4 Oct 2023 10:59:36 -0700 Subject: [PATCH 57/99] Fix for modern Fortran compilers --- include/enum_quda_fortran.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h index e77d5a0e15..8a21cf2660 100644 --- a/include/enum_quda_fortran.h +++ b/include/enum_quda_fortran.h @@ -9,7 +9,7 @@ # gfortran). 
#*/ -#define QUDA_INVALID_ENUM (-Z'7fffffff' - 1) +#define QUDA_INVALID_ENUM -int(Z'7FFFFFFF') - 1 #define QudaLinkType integer(4) From c5410be65a168ff26495e05d6c960afbc2e89955 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 4 Oct 2023 11:01:40 -0700 Subject: [PATCH 58/99] When creating momentum field, always use periodic boundary conditions --- include/gauge_field.h | 2 +- lib/interface_quda.cpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/include/gauge_field.h b/include/gauge_field.h index 1c4bdfc852..c85a8bed06 100644 --- a/include/gauge_field.h +++ b/include/gauge_field.h @@ -94,7 +94,7 @@ namespace quda { order(param.gauge_order), fixed(param.gauge_fix), link_type(link_type_ != QUDA_INVALID_LINKS ? link_type_ : param.type), - t_boundary(param.t_boundary), + t_boundary(link_type == QUDA_ASQTAD_MOM_LINKS ? QUDA_PERIODIC_T : param.t_boundary), // if we have momentum field and not using TIFR field, then we always have recon-10 reconstruct(link_type == QUDA_ASQTAD_MOM_LINKS && order != QUDA_TIFR_GAUGE_ORDER && order != QUDA_TIFR_PADDED_GAUGE_ORDER ? 
QUDA_RECONSTRUCT_10 : QUDA_RECONSTRUCT_NO), diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 840d6f5e96..6e4a058e9e 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -4081,7 +4081,6 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi // create the host momentum field gParam.location = QUDA_CPU_FIELD_LOCATION; gParam.reconstruct = gauge_param->reconstruct; - gParam.t_boundary = QUDA_PERIODIC_T; GaugeField cpuMom(gParam); // create the device momentum field From 1f514b85bc89990ec7caa36383e1ee0b728b5d80 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 4 Oct 2023 11:18:00 -0700 Subject: [PATCH 59/99] Small cleanup --- tests/utils/face_gauge.cpp | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/tests/utils/face_gauge.cpp b/tests/utils/face_gauge.cpp index 4b4af5ca62..bebaae5622 100644 --- a/tests/utils/face_gauge.cpp +++ b/tests/utils/face_gauge.cpp @@ -906,15 +906,10 @@ void do_exchange_cpu_staple(Float *staple, Float **ghost_staple, Float **staple_ Float *ghost_staple_back = ghost_staple[dir]; Float *ghost_staple_fwd = ghost_staple[dir] + 2 * Vsh[dir] * gauge_site_size; - MsgHandle *mh_recv_back; - MsgHandle *mh_recv_fwd; - MsgHandle *mh_send_fwd; - MsgHandle *mh_send_back; - - mh_recv_back = comm_declare_receive_relative(ghost_staple_back, dir, -1, 2 * len[dir]); - mh_recv_fwd = comm_declare_receive_relative(ghost_staple_fwd, dir, +1, 2 * len[dir]); - mh_send_fwd = comm_declare_send_relative(staple_fwd_sendbuf[dir], dir, +1, 2 * len[dir]); - mh_send_back = comm_declare_send_relative(staple_back_sendbuf[dir], dir, -1, 2 * len[dir]); + MsgHandle *mh_recv_back = comm_declare_receive_relative(ghost_staple_back, dir, -1, 2 * len[dir]); + MsgHandle *mh_recv_fwd = comm_declare_receive_relative(ghost_staple_fwd, dir, +1, 2 * len[dir]); + MsgHandle *mh_send_fwd = comm_declare_send_relative(staple_fwd_sendbuf[dir], dir, +1, 2 * len[dir]); + MsgHandle *mh_send_back = 
comm_declare_send_relative(staple_back_sendbuf[dir], dir, -1, 2 * len[dir]); comm_start(mh_recv_back); comm_start(mh_recv_fwd); From 7b3044d021a71fc5df8c86f84112069e897b4332 Mon Sep 17 00:00:00 2001 From: SaltyChiang Date: Thu, 5 Oct 2023 10:11:47 +0800 Subject: [PATCH 60/99] Remove `QUDA_STAGGERED_PHASE_CPS` enum. --- include/enum_quda.h | 3 +-- include/enum_quda_fortran.h | 3 +-- include/gauge_field_order.h | 5 ++--- include/kernels/gauge_phase.cuh | 11 ----------- lib/gauge_phase.cu | 7 ++----- lib/unitarize_links_quda.cu | 7 ++----- 6 files changed, 8 insertions(+), 28 deletions(-) diff --git a/include/enum_quda.h b/include/enum_quda.h index ab90e26067..3115ea70af 100644 --- a/include/enum_quda.h +++ b/include/enum_quda.h @@ -532,9 +532,8 @@ typedef enum QudaGhostExchange_s { typedef enum QudaStaggeredPhase_s { QUDA_STAGGERED_PHASE_NO = 0, QUDA_STAGGERED_PHASE_MILC = 1, - QUDA_STAGGERED_PHASE_CPS = 2, + QUDA_STAGGERED_PHASE_CHROMA = 2, QUDA_STAGGERED_PHASE_TIFR = 3, - QUDA_STAGGERED_PHASE_CHROMA = 4, QUDA_STAGGERED_PHASE_INVALID = QUDA_INVALID_ENUM } QudaStaggeredPhase; diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h index 65d9f515d4..f590dc1308 100644 --- a/include/enum_quda_fortran.h +++ b/include/enum_quda_fortran.h @@ -476,9 +476,8 @@ #define QudaStaggeredPhase integer(4) #define QUDA_STAGGERED_PHASE_NO 0 #define QUDA_STAGGERED_PHASE_MILC 1 -#define QUDA_STAGGERED_PHASE_CPS 2 +#define QUDA_STAGGERED_PHASE_CHROMA 2 #define QUDA_STAGGERED_PHASE_TIFR 3 -#define QUDA_STAGGERED_PHASE_CHROMA 4 #define QUDA_STAGGERED_PHASE_INVALID QUDA_INVALID_ENUM #define QudaContractType integer(4) diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index 2d5d0c8c6e..fd5ba863ab 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -1501,9 +1501,8 @@ namespace quda { { switch (phase) { case QUDA_STAGGERED_PHASE_MILC: - case QUDA_STAGGERED_PHASE_CPS: - case QUDA_STAGGERED_PHASE_TIFR: - case 
QUDA_STAGGERED_PHASE_CHROMA: return true; + case QUDA_STAGGERED_PHASE_CHROMA: + case QUDA_STAGGERED_PHASE_TIFR: return true; default: return false; } } diff --git a/include/kernels/gauge_phase.cuh b/include/kernels/gauge_phase.cuh index 41801df74a..73def9ab49 100644 --- a/include/kernels/gauge_phase.cuh +++ b/include/kernels/gauge_phase.cuh @@ -63,17 +63,6 @@ namespace quda { } else if (dim == 3) { // also apply boundary condition phase = (t == arg.X[3]-1) ? arg.tBoundary : 1.0; } - } else if (Arg::phase == QUDA_STAGGERED_PHASE_CPS) { - if (dim==0) { - phase = 1.0; - } else if (dim == 1) { - phase = (1.0 - 2.0 * ((x) % 2) ); - } else if (dim == 2) { - phase = (1.0 - 2.0 * ((x + y) % 2) ); - } else if (dim == 3) { // also apply boundary condition - phase = ((t == arg.X[3]-1) ? arg.tBoundary : 1.0) * - (1.0 - 2 * ((x + y + z) % 2) ); - } } else if (Arg::phase == QUDA_STAGGERED_PHASE_CHROMA) { // Chroma follows CPS convention, but uses -Dslash instead of Dslash compared to QUDA if (dim==0) { diff --git a/lib/gauge_phase.cu b/lib/gauge_phase.cu index c8149d5a16..41c62205dc 100644 --- a/lib/gauge_phase.cu +++ b/lib/gauge_phase.cu @@ -25,15 +25,12 @@ namespace quda { if (u.StaggeredPhase() == QUDA_STAGGERED_PHASE_MILC) { GaugePhaseArg arg(u); launch(tp, stream, arg); - } else if (u.StaggeredPhase() == QUDA_STAGGERED_PHASE_CPS) { - GaugePhaseArg arg(u); + } else if (u.StaggeredPhase() == QUDA_STAGGERED_PHASE_CHROMA) { + GaugePhaseArg arg(u); launch(tp, stream, arg); } else if (u.StaggeredPhase() == QUDA_STAGGERED_PHASE_TIFR) { GaugePhaseArg arg(u); launch(tp, stream, arg); - } else if (u.StaggeredPhase() == QUDA_STAGGERED_PHASE_CHROMA) { - GaugePhaseArg arg(u); - launch(tp, stream, arg); } else { errorQuda("Undefined phase type %d", u.StaggeredPhase()); } diff --git a/lib/unitarize_links_quda.cu b/lib/unitarize_links_quda.cu index eb0a81bacf..6c4d325c4d 100644 --- a/lib/unitarize_links_quda.cu +++ b/lib/unitarize_links_quda.cu @@ -124,15 +124,12 @@ namespace quda { if 
(in.StaggeredPhase() == QUDA_STAGGERED_PHASE_MILC) { UnitarizeArg arg(out, in, fails, max_iter, unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error); launch(tp, stream, arg); - } else if (in.StaggeredPhase() == QUDA_STAGGERED_PHASE_CPS) { - UnitarizeArg arg(out, in, fails, max_iter, unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error); + } else if (in.StaggeredPhase() == QUDA_STAGGERED_PHASE_CHROMA) { + UnitarizeArg arg(out, in, fails, max_iter, unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error); launch(tp, stream, arg); } else if (in.StaggeredPhase() == QUDA_STAGGERED_PHASE_TIFR) { UnitarizeArg arg(out, in, fails, max_iter, unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error); launch(tp, stream, arg); - } else if (in.StaggeredPhase() == QUDA_STAGGERED_PHASE_CHROMA) { - UnitarizeArg arg(out, in, fails, max_iter, unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error); - launch(tp, stream, arg); } else if (in.StaggeredPhase() == QUDA_STAGGERED_PHASE_NO) { UnitarizeArg arg(out, in, fails, max_iter, unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error); launch(tp, stream, arg); From 6e90b49a2d12891f2f68e35b7cfa7defbd506526 Mon Sep 17 00:00:00 2001 From: SaltyChiang Date: Thu, 5 Oct 2023 15:22:18 +0800 Subject: [PATCH 61/99] Use `projectSU3` to implement the projection. Revert changes in unitarize_links_quda.cu and unitarize_links.cuh. 
--- include/kernels/unitarize_links.cuh | 40 +++-------------------------- lib/interface_quda.cpp | 13 ++++++++++ lib/unitarize_links_quda.cu | 17 ++---------- 3 files changed, 18 insertions(+), 52 deletions(-) diff --git a/include/kernels/unitarize_links.cuh b/include/kernels/unitarize_links.cuh index 62d526a553..f0faef8570 100644 --- a/include/kernels/unitarize_links.cuh +++ b/include/kernels/unitarize_links.cuh @@ -7,23 +7,19 @@ #include #include #include -#include namespace quda { - template + template struct UnitarizeArg : kernel_param<> { - using Float = double; - using real = typename mapper::type; + using real = typename mapper::type; static constexpr int nColor = nColor_; static constexpr QudaReconstructType recon = recon_; - static constexpr QudaStaggeredPhase phase = phase_; - typedef typename gauge_mapper::type Gauge; + typedef typename gauge_mapper::type Gauge; Gauge out; const Gauge in; int X[4]; // grid dimensions - double tBoundary; int *fails; const int max_iter; const double unitarize_eps; @@ -50,9 +46,6 @@ namespace quda { svd_abs_error(svd_abs_error) { for (int dir=0; dir<4; ++dir) X[dir] = in.X()[dir]; - - bool last_node_in_t = (commCoords(3) == commDim(3)-1); - tBoundary = (Float)(last_node_in_t ? 
in.TBoundary() : QUDA_PERIODIC_T); } }; @@ -189,16 +182,6 @@ namespace quda { return true; } // unitarizeMILC - template - __host__ __device__ void specialUnitarizeLinkMILC(mat &out, const mat &in, const Arg &arg) - { - complex det = getDeterminant(in); - real r = exp(-log(abs(det)) / Arg::nColor); - real alpha = atan2(det.imag(), det.real()) / Arg::nColor; - - out = in * polar(r, -alpha); - } // specialUnitarizeLinkMILC - template __host__ __device__ bool unitarizeLinkNewton(mat &out, const mat& in, int max_iter) { @@ -235,23 +218,6 @@ namespace quda { if (arg.check_unitarization) { if (result.isUnitary(arg.max_error) == false) atomic_fetch_add(arg.fails, 1); } - - if constexpr (Arg::phase == QUDA_STAGGERED_PHASE_CHROMA) { // Special unitraize the result for Chroma convention - int x[4]; - getCoords(x, x_cb, arg.X, parity); - - double phase; - switch (mu) { - case 0: phase = getPhase<0>(x[0], x[1], x[2], x[3], arg); break; - case 1: phase = getPhase<1>(x[0], x[1], x[2], x[3], arg); break; - case 2: phase = getPhase<2>(x[0], x[1], x[2], x[3], arg); break; - case 3: phase = getPhase<3>(x[0], x[1], x[2], x[3], arg); break; - } - v = result * phase; - specialUnitarizeLinkMILC(result, v, arg); - result *= phase; - } - tmp = result; arg.out(mu, x_cb, parity) = tmp; diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index e019312324..1da16e4895 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -3919,6 +3919,19 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, errorQuda("Error in unitarization component of the hisq fattening: %d failures", *num_failures_h); profileFatLink.TPSTOP(QUDA_PROFILE_COMPUTE); + // project onto SU(3) if using the Chroma convention + if (param->staggered_phase_type == QUDA_STAGGERED_PHASE_CHROMA) { + profileFatLink.TPSTART(QUDA_PROFILE_COMPUTE); + *num_failures_h = 0; + const double tol = cudaUnitarizedLink->Precision() == QUDA_DOUBLE_PRECISION ? 
1e-15 : 2e-6; + if (cudaUnitarizedLink->StaggeredPhaseApplied()) cudaUnitarizedLink->removeStaggeredPhase(); + projectSU3(*cudaUnitarizedLink, tol, num_failures_d); + if (!cudaUnitarizedLink->StaggeredPhaseApplied() && param->staggered_phase_applied) cudaUnitarizedLink->applyStaggeredPhase(); + if(*num_failures_h>0) + errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h); + profileFatLink.TPSTOP(QUDA_PROFILE_COMPUTE); + } + cudaUnitarizedLink->saveCPUField(cpuUnitarizedLink, profileFatLink); profileFatLink.TPSTART(QUDA_PROFILE_FREE); diff --git a/lib/unitarize_links_quda.cu b/lib/unitarize_links_quda.cu index 6c4d325c4d..fa006f0b4a 100644 --- a/lib/unitarize_links_quda.cu +++ b/lib/unitarize_links_quda.cu @@ -121,21 +121,8 @@ namespace quda { void apply(const qudaStream_t &stream) { TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); - if (in.StaggeredPhase() == QUDA_STAGGERED_PHASE_MILC) { - UnitarizeArg arg(out, in, fails, max_iter, unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error); - launch(tp, stream, arg); - } else if (in.StaggeredPhase() == QUDA_STAGGERED_PHASE_CHROMA) { - UnitarizeArg arg(out, in, fails, max_iter, unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error); - launch(tp, stream, arg); - } else if (in.StaggeredPhase() == QUDA_STAGGERED_PHASE_TIFR) { - UnitarizeArg arg(out, in, fails, max_iter, unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error); - launch(tp, stream, arg); - } else if (in.StaggeredPhase() == QUDA_STAGGERED_PHASE_NO) { - UnitarizeArg arg(out, in, fails, max_iter, unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error); - launch(tp, stream, arg); - } else { - errorQuda("Undefined phase type %d", in.StaggeredPhase()); - } + launch(tp, stream, + UnitarizeArg(out, in, fails, max_iter, unitarize_eps, max_error, reunit_allow_svd, reunit_svd_only, 
svd_rel_error, svd_abs_error)); } void preTune() { if (in.Gauge_p() == out.Gauge_p()) out.backup(); } From bc0bdf047f98f67964b0733ff93db7fe7aab8e1a Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 5 Oct 2023 09:58:19 -0700 Subject: [PATCH 62/99] Use same host-constructed gauge field across different partitioning in dslash_ctest --- tests/dslash_ctest.cpp | 13 ++++++- tests/dslash_test.cpp | 13 ++++++- tests/dslash_test_utils.h | 81 +++++++++++++++++++++------------------ 3 files changed, 68 insertions(+), 39 deletions(-) diff --git a/tests/dslash_ctest.cpp b/tests/dslash_ctest.cpp index d75d1f3da9..9443677305 100644 --- a/tests/dslash_ctest.cpp +++ b/tests/dslash_ctest.cpp @@ -99,7 +99,18 @@ class DslashTest : public ::testing::TestWithParam<::testing::tuple 1; - if (test_split_grid) { dtest_type = dslash_test_type::Dslash; } - if (dslash_type == QUDA_ASQTAD_DSLASH || dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) { errorQuda("Asqtad not supported. Please try staggered_dslash_test instead"); } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH || dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH @@ -137,6 +143,34 @@ struct DslashTestWrapper { Ls = 1; } + if (inv_param.cpu_prec != gauge_param.cpu_prec) errorQuda("Gauge and spinor CPU precisions must match"); + + // construct input fields + for (int dir = 0; dir < 4; dir++) hostGauge[dir] = safe_malloc((size_t)V * gauge_site_size * gauge_param.cpu_prec); + + if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH + || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { + hostClover = safe_malloc((size_t)V * clover_site_size * inv_param.clover_cpu_prec); + hostCloverInv = safe_malloc((size_t)V * clover_site_size * inv_param.clover_cpu_prec); + + if (compute_clover) + printfQuda("Computing clover field on GPU\n"); + else { + printfQuda("Sending clover field to GPU\n"); + constructHostCloverField(hostClover, hostCloverInv, inv_param); + } + } + + 
printfQuda("Randomizing fields... "); + constructHostGaugeField(hostGauge, gauge_param, argc, argv); + } + + void init() + { + num_src = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3]; + test_split_grid = num_src > 1; + if (test_split_grid) { dtest_type = dslash_test_type::Dslash; } + inv_param.dagger = dagger ? QUDA_DAG_YES : QUDA_DAG_NO; inv_param.solve_type = (dtest_type == dslash_test_type::Mat || dtest_type == dslash_test_type::MatDagMat) ? QUDA_DIRECT_SOLVE : @@ -177,17 +211,6 @@ struct DslashTestWrapper { } } - if (inv_param.cpu_prec != gauge_param.cpu_prec) errorQuda("Gauge and spinor CPU precisions must match"); - - // construct input fields - for (int dir = 0; dir < 4; dir++) hostGauge[dir] = safe_malloc((size_t)V * gauge_site_size * gauge_param.cpu_prec); - - if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH - || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { - hostClover = safe_malloc((size_t)V * clover_site_size * inv_param.clover_cpu_prec); - hostCloverInv = safe_malloc((size_t)V * clover_site_size * inv_param.clover_cpu_prec); - } - ColorSpinorParam csParam; csParam.nColor = 3; csParam.nSpin = 4; @@ -256,20 +279,11 @@ struct DslashTestWrapper { setVerbosity(verbosity); inv_param.verbosity = verbosity; - printfQuda("Randomizing fields... 
"); - constructHostGaugeField(hostGauge, gauge_param, argc, argv); - printfQuda("Sending gauge field to GPU\n"); loadGaugeQuda(hostGauge, &gauge_param); if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH || dslash_type == QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH) { - if (compute_clover) - printfQuda("Computing clover field on GPU\n"); - else { - printfQuda("Sending clover field to GPU\n"); - constructHostCloverField(hostClover, hostCloverInv, inv_param); - } inv_param.compute_clover = compute_clover; inv_param.return_clover = compute_clover; inv_param.compute_clover_inverse = true; @@ -329,13 +343,6 @@ struct DslashTestWrapper { dirac = nullptr; } } - - for (int dir = 0; dir < 4; dir++) host_free(hostGauge[dir]); - if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH - || dslash_type == QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH) { - host_free(hostClover); - host_free(hostCloverInv); - } } void dslashRef() From a70c935c645650ec7af0641acaaa4de4e063254b Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 5 Oct 2023 10:14:04 -0700 Subject: [PATCH 63/99] Use same host-construct gauge field across different partitioning in staggered_dslash_ctest --- tests/hisq_stencil_test.cpp | 19 ++-- tests/staggered_dslash_ctest.cpp | 18 +++- tests/staggered_dslash_test.cpp | 13 ++- tests/staggered_dslash_test_utils.h | 121 ++++++++++++++++---------- tests/staggered_eigensolve_test.cpp | 3 +- tests/staggered_invert_test.cpp | 3 +- tests/utils/host_utils.h | 4 +- tests/utils/llfat_utils.cpp | 8 +- tests/utils/staggered_gauge_utils.cpp | 61 +++++-------- tests/utils/staggered_gauge_utils.h | 3 + tests/utils/staggered_host_utils.cpp | 74 ++++++---------- 11 files changed, 172 insertions(+), 155 deletions(-) diff --git a/tests/hisq_stencil_test.cpp b/tests/hisq_stencil_test.cpp index cb6572b25b..98c2ae91d3 100644 --- a/tests/hisq_stencil_test.cpp +++ b/tests/hisq_stencil_test.cpp @@ -104,8 +104,10 @@ static void 
hisq_test() double u4 = u2 * u2; double u6 = u4 * u2; + std::array, 3> act_paths; + // First path: create V, W links - double act_path_coeff_1[6] = { + act_paths[0] = { (1.0 / 8.0), /* one link */ u2 * (0.0), /* Naik */ u2 * (-1.0 / 8.0) * 0.5, /* simple staple */ @@ -115,7 +117,7 @@ static void hisq_test() }; // Second path: create X, long links - double act_path_coeff_2[6] = { + act_paths[1] = { ((1.0 / 8.0) + (2.0 * 6.0 / 16.0) + (1.0 / 8.0)), /* one link */ /* One link is 1/8 as in fat7 + 2*3/8 for Lepage + 1/8 for Naik */ (-1.0 / 24.0), /* Naik */ @@ -126,7 +128,7 @@ static void hisq_test() }; // Paths for epsilon corrections. Not used if n_naiks = 1. - double act_path_coeff_3[6] = { + act_paths[2] = { (1.0 / 8.0), /* one link b/c of Naik */ (-1.0 / 24.0), /* Naik */ 0.0, /* simple staple */ @@ -185,7 +187,7 @@ static void hisq_test() // Tuning run... { printfQuda("Tuning...\n"); - computeKSLinkQuda(vlink, longlink, wlink, milc_sitelink, act_path_coeff_2, &qudaGaugeParam); + computeKSLinkQuda(vlink, longlink, wlink, milc_sitelink, act_paths[1].data(), &qudaGaugeParam); } struct timeval t0, t1; @@ -196,11 +198,11 @@ static void hisq_test() // If we create cudaGaugeField objs, we can do this 100% on the GPU, no copying! 
// Create V links (fat7 links) and W links (unitarized V links), 1st path table set - computeKSLinkQuda(vlink, nullptr, wlink, milc_sitelink, act_path_coeff_1, &qudaGaugeParam); + computeKSLinkQuda(vlink, nullptr, wlink, milc_sitelink, act_paths[0].data(), &qudaGaugeParam); if (n_naiks > 1) { // Create Naiks, 3rd path table set - computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_path_coeff_3, &qudaGaugeParam); + computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[2].data(), &qudaGaugeParam); // Rescale+copy Naiks into Naik field cpu_axy(prec, eps_naik, fatlink, fatlink_eps, V * 4 * gauge_site_size); @@ -211,7 +213,7 @@ static void hisq_test() } // Create X and long links, 2nd path table set - computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_path_coeff_2, &qudaGaugeParam); + computeKSLinkQuda(fatlink, longlink, nullptr, wlink, act_paths[1].data(), &qudaGaugeParam); if (n_naiks > 1) { // Add into Naik field @@ -244,9 +246,6 @@ static void hisq_test() } if (verify_results) { - - double *act_paths[3] = {act_path_coeff_1, act_path_coeff_2, act_path_coeff_3}; - computeHISQLinksCPU(fat_reflink, long_reflink, fat_reflink_eps, long_reflink_eps, sitelink, &qudaGaugeParam, act_paths, eps_naik); } diff --git a/tests/staggered_dslash_ctest.cpp b/tests/staggered_dslash_ctest.cpp index 8505b04fe6..f695a92178 100644 --- a/tests/staggered_dslash_ctest.cpp +++ b/tests/staggered_dslash_ctest.cpp @@ -92,7 +92,18 @@ class StaggeredDslashTest : public ::testing::TestWithParam<::testing::tuple= QUDA_HALF_PRECISION) + tol *= 10; // if recon 8, we tolerate a greater deviation ASSERT_LE(deviation, tol) << "Reference CPU and QUDA implementations do not agree"; } diff --git a/tests/staggered_dslash_test.cpp b/tests/staggered_dslash_test.cpp index f61e62d88c..ed8ac2ae0f 100644 --- a/tests/staggered_dslash_test.cpp +++ b/tests/staggered_dslash_test.cpp @@ -37,7 +37,18 @@ class StaggeredDslashTest : public ::testing::Test // Per-test-case tear-down. 
// Called after the last test in this test case. // Can be omitted if not needed. - static void TearDownTestCase() { endQuda(); } + static void TearDownTestCase() + { + for (int dir = 0; dir < 4; dir++) { + if (StaggeredDslashTestWrapper::qdp_inlink[dir]) + host_free(StaggeredDslashTestWrapper::qdp_inlink[dir]); + if (StaggeredDslashTestWrapper::qdp_fatlink_cpu[dir]) + host_free(StaggeredDslashTestWrapper::qdp_fatlink_cpu[dir]); + if (StaggeredDslashTestWrapper::qdp_longlink_cpu[dir]) + host_free(StaggeredDslashTestWrapper::qdp_longlink_cpu[dir]); + } + endQuda(); + } }; TEST_F(StaggeredDslashTest, benchmark) { dslash_test_wrapper.run_test(niter, /**show_metrics =*/true); } diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h index 5cae0d80c2..d23ba31d38 100644 --- a/tests/staggered_dslash_test_utils.h +++ b/tests/staggered_dslash_test_utils.h @@ -44,7 +44,10 @@ struct DslashTime { struct StaggeredDslashTestWrapper { - void *qdp_inlink[4] = {nullptr, nullptr, nullptr, nullptr}; + static inline void *qdp_inlink[4] = {nullptr, nullptr, nullptr, nullptr}; + // In the HISQ case, we include building fat/long links in this unit test + static inline void *qdp_fatlink_cpu[4] = {}; + static inline void *qdp_longlink_cpu[4] = {}; QudaGaugeParam gauge_param; QudaInvertParam inv_param; @@ -65,9 +68,6 @@ struct StaggeredDslashTestWrapper { std::vector vp_spinor; std::vector vp_spinor_out; - // In the HISQ case, we include building fat/long links in this unit test - void *qdp_fatlink_cpu[4] = {}; - void *qdp_longlink_cpu[4] = {}; void *ghost_fatlink_cpu[4] = {}; void *ghost_longlink_cpu[4] = {}; @@ -132,7 +132,12 @@ struct StaggeredDslashTestWrapper { link_recon = link_recon_; - init(argc, argv); + static bool first_time = true; + if (first_time) { + init_host(argc, argv); + first_time = false; + } + init(); } void init_test(int argc, char **argv) @@ -143,10 +148,36 @@ struct StaggeredDslashTestWrapper { setStaggeredGaugeParam(gauge_param); 
setStaggeredInvertParam(inv_param); - init(argc, argv); + static bool first_time = true; + if (first_time) { + init_host(argc, argv); + first_time = false; + } + init(); } - void init(int argc, char **argv) + void init_host(int argc, char **argv) + { + setDims(gauge_param.X); + dw_setDims(gauge_param.X, 1); + if (Nsrc != 1) { + warningQuda("Ignoring Nsrc = %d, setting to 1.", Nsrc); + Nsrc = 1; + } + + for (int dir = 0; dir < 4; dir++) { + qdp_inlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + qdp_fatlink_cpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + qdp_longlink_cpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + } + + bool gauge_loaded = false; + bool compute_on_gpu = false; // reference fat/long fields should be computed on cpu + constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink_cpu, qdp_fatlink_cpu, gauge_param, argc, argv, + gauge_loaded, compute_on_gpu); + } + + void init() { inv_param.split_grid[0] = grid_partition[0]; inv_param.split_grid[1] = grid_partition[1]; @@ -159,36 +190,41 @@ struct StaggeredDslashTestWrapper { inv_param.dagger = dagger ? 
QUDA_DAG_YES : QUDA_DAG_NO; - setDims(gauge_param.X); - dw_setDims(gauge_param.X, 1); - if (Nsrc != 1) { - warningQuda("Ignoring Nsrc = %d, setting to 1.", Nsrc); - Nsrc = 1; - } - - // Allocate a lot of memory because I'm very confused - void *milc_fatlink_cpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); - void *milc_longlink_cpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); - - milc_fatlink_gpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); - milc_longlink_gpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); - + // Prepare the fields to be used for the GPU computation void *qdp_fatlink_gpu[4]; void *qdp_longlink_gpu[4]; - for (int dir = 0; dir < 4; dir++) { - qdp_inlink[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - qdp_fatlink_gpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); qdp_longlink_gpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - - qdp_fatlink_cpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); - qdp_longlink_cpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); + } + // QUDA_STAGGERED_DSLASH follows the same codepath whether or not you + // "compute" the fat/long links or not. 
+ if (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) { + for (int dir = 0; dir < 4; dir++) { + memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size); + memset(qdp_longlink_gpu[dir], 0, V * gauge_site_size * host_gauge_data_type_size); + } + } else { + // QUDA_ASQTAD_DSLASH + if (compute_fatlong) { + computeFatLongGPU(qdp_fatlink_gpu, qdp_longlink_gpu, qdp_inlink, gauge_param, host_gauge_data_type_size, n_naiks, eps_naik); + } else { + // Not computing FatLong + for (int dir = 0; dir < 4; dir++) { + memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size); + memcpy(qdp_longlink_gpu[dir], qdp_longlink_cpu[dir], V * gauge_site_size * host_gauge_data_type_size); + } + } } - bool gauge_loaded = false; - constructStaggeredHostDeviceGaugeField(qdp_inlink, qdp_longlink_cpu, qdp_longlink_gpu, qdp_fatlink_cpu, - qdp_fatlink_gpu, gauge_param, argc, argv, gauge_loaded); + + // Create ghost zones for CPU fields, + // prepare and load the GPU fields + void *milc_fatlink_cpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); + void *milc_longlink_cpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); + + milc_fatlink_gpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); + milc_longlink_gpu = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); // Alright, we've created all the void** links. 
// Create the void* pointers @@ -196,8 +232,6 @@ struct StaggeredDslashTestWrapper { reorderQDPtoMILC(milc_fatlink_cpu, qdp_fatlink_cpu, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); reorderQDPtoMILC(milc_longlink_gpu, qdp_longlink_gpu, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); reorderQDPtoMILC(milc_longlink_cpu, qdp_longlink_cpu, V, gauge_site_size, gauge_param.cpu_prec, gauge_param.cpu_prec); - // Create ghost zones for CPU fields, - // prepare and load the GPU fields #ifdef MULTI_GPU gauge_param.type = (dslash_type == QUDA_ASQTAD_DSLASH) ? QUDA_ASQTAD_FAT_LINKS : QUDA_SU3_LINKS; @@ -304,28 +338,23 @@ struct StaggeredDslashTestWrapper { setDiracParam(diracParam, &inv_param, pc); dirac = Dirac::create(diracParam); - for (int dir = 0; dir < 4; dir++) { - host_free(qdp_fatlink_gpu[dir]); - host_free(qdp_longlink_gpu[dir]); - host_free(qdp_inlink[dir]); - } host_free(milc_fatlink_cpu); host_free(milc_longlink_cpu); - } - void end() - { for (int dir = 0; dir < 4; dir++) { - if (qdp_fatlink_cpu[dir] != nullptr) { - host_free(qdp_fatlink_cpu[dir]); - qdp_fatlink_cpu[dir] = nullptr; + if (qdp_fatlink_gpu[dir] != nullptr) { + host_free(qdp_fatlink_gpu[dir]); + qdp_fatlink_gpu[dir] = nullptr; } - if (qdp_longlink_cpu[dir] != nullptr) { - host_free(qdp_longlink_cpu[dir]); - qdp_longlink_cpu[dir] = nullptr; + if (qdp_longlink_gpu[dir] != nullptr) { + host_free(qdp_longlink_gpu[dir]); + qdp_longlink_gpu[dir] = nullptr; } } + } + void end() + { if (dirac != nullptr) { delete dirac; dirac = nullptr; diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp index 70877d36d2..f31d54de6b 100644 --- a/tests/staggered_eigensolve_test.cpp +++ b/tests/staggered_eigensolve_test.cpp @@ -132,7 +132,8 @@ int main(int argc, char **argv) milc_fatlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); milc_longlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); - 
constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv); + bool gauge_loaded; + constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv, gauge_loaded, true); // Compute plaquette. Routine is aware that the gauge fields already have the phases on them. double plaq[3]; diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index ea5aab17fd..ea55986e8e 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -181,7 +181,8 @@ void test(int argc, char **argv) void* qdp_inlink[4] = {cpuIn.data(0), cpuIn.data(1), cpuIn.data(2), cpuIn.data(3)}; void* qdp_fatlink[4] = {cpuFatQDP.data(0), cpuFatQDP.data(1), cpuFatQDP.data(2), cpuFatQDP.data(3)}; void* qdp_longlink[4] = {cpuLongQDP.data(0), cpuLongQDP.data(1), cpuLongQDP.data(2), cpuLongQDP.data(3)}; - constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv); + bool gauge_loaded; + constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv, gauge_loaded, true); // Reorder gauge fields to MILC order cpuFatMILC = cpuFatQDP; cpuLongMILC = cpuLongQDP; diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h index d6eb26304f..8deaae174e 100644 --- a/tests/utils/host_utils.h +++ b/tests/utils/host_utils.h @@ -51,13 +51,13 @@ void constructStaggeredHostDeviceGaugeField(void **qdp_inlink, void **qdp_longli void **qdp_fatlink_cpu, void **qdp_fatlink_gpu, QudaGaugeParam &gauge_param, int argc, char **argv, bool &gauge_loaded); void constructStaggeredHostGaugeField(void **qdp_inlink, void **qdp_longlink, void **qdp_fatlink, - QudaGaugeParam &gauge_param, int argc, char **argv); + QudaGaugeParam &gauge_param, int argc, char **argv, bool &gauge_loaded, bool compute_on_gpu); void constructFatLongGaugeField(void **fatlink, void **longlink, int type, QudaPrecision precision, QudaGaugeParam *, QudaDslashType dslash_type); void 
loadFatLongGaugeQuda(void *milc_fatlink, void *milc_longlink, QudaGaugeParam &gauge_param); void computeLongLinkCPU(void **longlink, void **sitelink, QudaPrecision prec, void *act_path_coeff); void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, void **longlink_eps, void **sitelink, - void *qudaGaugeParamPtr, double **act_path_coeffs, double eps_naik); + void *qudaGaugeParamPtr, std::array, 3> &act_path_coeffs, double eps_naik); void computeTwoLinkCPU(void **twolink, void **sitelink, QudaGaugeParam *gauge_param); void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &out, void *qdp_twolnk[], const quda::GaugeField &twolnk, quda::ColorSpinorField &in, QudaGaugeParam *qudaGaugeParam, QudaInvertParam *inv_param, const int oddBit, const double width, const int t0, QudaPrecision prec); template diff --git a/tests/utils/llfat_utils.cpp b/tests/utils/llfat_utils.cpp index c0979b7829..db17db10c1 100644 --- a/tests/utils/llfat_utils.cpp +++ b/tests/utils/llfat_utils.cpp @@ -427,12 +427,8 @@ template void llfat_cpu_mg(void **fatlink, su3_matrix **sitelink, su3_matrix **ghost_sitelink, su3_matrix **ghost_sitelink_diag, Float *act_path_coeff) { - QudaPrecision prec; - if (sizeof(Float) == 4) { - prec = QUDA_SINGLE_PRECISION; - } else { - prec = QUDA_DOUBLE_PRECISION; - } + QudaPrecision prec = sizeof(Float) == 4 ? 
QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION; + su3_matrix *staple = (su3_matrix *)safe_malloc(V * sizeof(su3_matrix)); diff --git a/tests/utils/staggered_gauge_utils.cpp b/tests/utils/staggered_gauge_utils.cpp index 2759e3489b..aceba148d1 100644 --- a/tests/utils/staggered_gauge_utils.cpp +++ b/tests/utils/staggered_gauge_utils.cpp @@ -23,8 +23,8 @@ static double max_allowed_error = 1e-11; // Wrap everything for the GPU construction of fat/long links here void computeHISQLinksGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_fatlink_eps, void **qdp_longlink_eps, - void **qdp_inlink, QudaGaugeParam &gauge_param_in, double **act_path_coeffs, double eps_naik, - size_t gSize, int n_naiks) + void **qdp_inlink, QudaGaugeParam &gauge_param_in, + std::array, 3> &act_path_coeffs, double eps_naik, size_t gSize, int n_naiks) { // since a lot of intermediaries can be general matrices, override the recon in `gauge_param_in` auto gauge_param = gauge_param_in; @@ -52,11 +52,11 @@ void computeHISQLinksGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_fat } // Create V links (fat7 links) and W links (unitarized V links), 1st path table set - computeKSLinkQuda(milc_vlink, nullptr, milc_wlink, milc_inlink, act_path_coeffs[0], &gauge_param); + computeKSLinkQuda(milc_vlink, nullptr, milc_wlink, milc_inlink, act_path_coeffs[0].data(), &gauge_param); if (n_naiks > 1) { // Create Naiks, 3rd path table set - computeKSLinkQuda(milc_fatlink, milc_longlink, nullptr, milc_wlink, act_path_coeffs[2], &gauge_param); + computeKSLinkQuda(milc_fatlink, milc_longlink, nullptr, milc_wlink, act_path_coeffs[2].data(), &gauge_param); // Rescale+copy Naiks into Naik field cpu_axy(gauge_param.cpu_prec, eps_naik, milc_fatlink, milc_fatlink_eps, V * 4 * gauge_site_size); @@ -67,7 +67,7 @@ void computeHISQLinksGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_fat } // Create X and long links, 2nd path table set - computeKSLinkQuda(milc_fatlink, milc_longlink, nullptr, milc_wlink, 
act_path_coeffs[1], &gauge_param); + computeKSLinkQuda(milc_fatlink, milc_longlink, nullptr, milc_wlink, act_path_coeffs[1].data(), &gauge_param); if (n_naiks > 1) { // Add into Naik field @@ -98,7 +98,7 @@ void computeHISQLinksGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_fat } } -void setActionPaths(double **act_paths) +template void setActionPaths(T &act_paths) { /////////////////////////// // Set path coefficients // @@ -160,8 +160,7 @@ void setActionPaths(double **act_paths) void computeFatLongGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_inlink, QudaGaugeParam &gauge_param, size_t gSize, int n_naiks, double eps_naik) { - double **act_paths = new double *[3]; - for (int i = 0; i < 3; i++) act_paths[i] = new double[6]; + std::array, 3> act_paths; setActionPaths(act_paths); /////////////////////////////////////////////////////////////////////// @@ -196,17 +195,12 @@ void computeFatLongGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_inlin host_free(qdp_longlink_naik_temp[dir]); } } - - for (int i = 0; i < 3; i++) delete[] act_paths[i]; - delete[] act_paths; } -void computeFatLongGPUandCPU(void **qdp_fatlink_gpu, void **qdp_longlink_gpu, void **qdp_fatlink_cpu, - void **qdp_longlink_cpu, void **qdp_inlink, QudaGaugeParam &gauge_param, size_t gSize, - int n_naiks, double eps_naik) +void computeFatLongCPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_inlink, QudaGaugeParam &gauge_param, + size_t gSize, int n_naiks, double eps_naik) { - double **act_paths = new double *[3]; - for (int i = 0; i < 3; i++) act_paths[i] = new double[6]; + std::array, 3> act_paths; setActionPaths(act_paths); /////////////////////////////////////////////////////////////////////// @@ -229,41 +223,26 @@ void computeFatLongGPUandCPU(void **qdp_fatlink_gpu, void **qdp_longlink_gpu, vo ////////////////////////// // defined in "llfat_reference.cpp" - computeHISQLinksCPU(qdp_fatlink_cpu, qdp_longlink_cpu, (n_naiks == 2) ? 
qdp_fatlink_naik_temp : nullptr, + computeHISQLinksCPU(qdp_fatlink, qdp_longlink, (n_naiks == 2) ? qdp_fatlink_naik_temp : nullptr, (n_naiks == 2) ? qdp_longlink_naik_temp : nullptr, qdp_inlink, &gauge_param, act_paths, eps_naik); if (n_naiks == 2) { // Override the naik fields into the fat/long link fields for (int dir = 0; dir < 4; dir++) { - memcpy(qdp_fatlink_cpu[dir], qdp_fatlink_naik_temp[dir], V * gauge_site_size * gSize); - memcpy(qdp_longlink_cpu[dir], qdp_longlink_naik_temp[dir], V * gauge_site_size * gSize); - memset(qdp_fatlink_naik_temp[dir], 0, V * gauge_site_size * gSize); - memset(qdp_longlink_naik_temp[dir], 0, V * gauge_site_size * gSize); - } - } - - ////////////////////////// - // Create the GPU links // - ////////////////////////// - - // Skip eps field for now - // Note: GPU link creation only works for single and double precision - computeHISQLinksGPU(qdp_fatlink_gpu, qdp_longlink_gpu, (n_naiks == 2) ? qdp_fatlink_naik_temp : nullptr, - (n_naiks == 2) ? qdp_longlink_naik_temp : nullptr, qdp_inlink, gauge_param, act_paths, eps_naik, - gSize, n_naiks); - - if (n_naiks == 2) { - // Override the naik fields into the fat/long link fields - for (int dir = 0; dir < 4; dir++) { - memcpy(qdp_fatlink_gpu[dir], qdp_fatlink_naik_temp[dir], V * gauge_site_size * gSize); - memcpy(qdp_longlink_gpu[dir], qdp_longlink_naik_temp[dir], V * gauge_site_size * gSize); + memcpy(qdp_fatlink[dir], qdp_fatlink_naik_temp[dir], V * gauge_site_size * gSize); + memcpy(qdp_longlink[dir], qdp_longlink_naik_temp[dir], V * gauge_site_size * gSize); host_free(qdp_fatlink_naik_temp[dir]); host_free(qdp_longlink_naik_temp[dir]); } } +} - for (int i = 0; i < 3; i++) delete[] act_paths[i]; - delete[] act_paths; +void computeFatLongGPUandCPU(void **qdp_fatlink_gpu, void **qdp_longlink_gpu, void **qdp_fatlink_cpu, + void **qdp_longlink_cpu, void **qdp_inlink, QudaGaugeParam &gauge_param, size_t gSize, + int n_naiks, double eps_naik) +{ + computeFatLongGPU(qdp_fatlink_gpu, 
qdp_longlink_gpu, qdp_inlink, gauge_param, gSize, n_naiks, eps_naik); + computeFatLongCPU(qdp_fatlink_cpu, qdp_longlink_cpu, qdp_inlink, gauge_param, gSize, n_naiks, eps_naik); } // Routine that takes in a QDP-ordered field and outputs the plaquette. diff --git a/tests/utils/staggered_gauge_utils.h b/tests/utils/staggered_gauge_utils.h index f2cc4e4749..d969d437e1 100644 --- a/tests/utils/staggered_gauge_utils.h +++ b/tests/utils/staggered_gauge_utils.h @@ -18,6 +18,9 @@ void computeHISQLinksGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_fat void computeFatLongGPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_inlink, QudaGaugeParam &gauge_param, size_t gSize, int n_naiks, double eps_naik); +void computeFatLongCPU(void **qdp_fatlink, void **qdp_longlink, void **qdp_inlink, QudaGaugeParam &gauge_param, + size_t gSize, int n_naiks, double eps_naik); + void computeFatLongGPUandCPU(void **qdp_fatlink_gpu, void **qdp_longlink_gpu, void **qdp_fatlink_cpu, void **qdp_longlink_cpu, void **qdp_inlink, QudaGaugeParam &gauge_param, size_t gSize, int n_naiks, double eps_naik); diff --git a/tests/utils/staggered_host_utils.cpp b/tests/utils/staggered_host_utils.cpp index cc9148fca5..9eb4242bcb 100644 --- a/tests/utils/staggered_host_utils.cpp +++ b/tests/utils/staggered_host_utils.cpp @@ -26,25 +26,25 @@ template using complex = std::complex; // Staggered gauge field utils //------------------------------------------------------ -void constructStaggeredHostDeviceGaugeField(void **qdp_inlink, void **qdp_longlink_cpu, void **qdp_longlink_gpu, - void **qdp_fatlink_cpu, void **qdp_fatlink_gpu, QudaGaugeParam &gauge_param, - int argc, char **argv, bool &gauge_loaded) +void constructStaggeredHostGaugeField(void **qdp_inlink, void **qdp_longlink, void **qdp_fatlink, + QudaGaugeParam &gauge_param, int argc, char **argv, bool &gauge_loaded, bool compute_on_gpu) { + gauge_param.reconstruct = QUDA_RECONSTRUCT_NO; + // load a field WITHOUT PHASES if (latfile.size() > 0) { - 
if (!gauge_loaded) { - read_gauge_field(latfile.c_str(), qdp_inlink, gauge_param.cpu_prec, gauge_param.X, argc, argv); - if (dslash_type != QUDA_LAPLACE_DSLASH) { - applyGaugeFieldScaling_long(qdp_inlink, Vh, &gauge_param, QUDA_STAGGERED_DSLASH, gauge_param.cpu_prec); - } - gauge_loaded = true; - } // else it's already been loaded + // load in the command line supplied gauge field using QIO and LIME + read_gauge_field(latfile.c_str(), qdp_inlink, gauge_param.cpu_prec, gauge_param.X, argc, argv); + if (dslash_type != QUDA_LAPLACE_DSLASH) { + applyGaugeFieldScaling_long(qdp_inlink, Vh, &gauge_param, QUDA_STAGGERED_DSLASH, gauge_param.cpu_prec); + } + gauge_loaded = true; } else { int construct_type = (unit_gauge) ? 0 : 1; if (dslash_type == QUDA_LAPLACE_DSLASH) { constructQudaGaugeField(qdp_inlink, construct_type, gauge_param.cpu_prec, &gauge_param); } else { - constructFatLongGaugeField(qdp_inlink, qdp_longlink_cpu, construct_type, gauge_param.cpu_prec, &gauge_param, + constructFatLongGaugeField(qdp_inlink, qdp_longlink, construct_type, gauge_param.cpu_prec, &gauge_param, compute_fatlong ? QUDA_STAGGERED_DSLASH : dslash_type); } } @@ -53,62 +53,46 @@ void constructStaggeredHostDeviceGaugeField(void **qdp_inlink, void **qdp_longli // "compute" the fat/long links or not. 
if (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) { for (int dir = 0; dir < 4; dir++) { - memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size); - memcpy(qdp_fatlink_cpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size); - memset(qdp_longlink_gpu[dir], 0, V * gauge_site_size * host_gauge_data_type_size); - memset(qdp_longlink_cpu[dir], 0, V * gauge_site_size * host_gauge_data_type_size); + memcpy(qdp_fatlink[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size); + memset(qdp_longlink[dir], 0, V * gauge_site_size * host_gauge_data_type_size); } } else { // QUDA_ASQTAD_DSLASH if (compute_fatlong) { - computeFatLongGPUandCPU(qdp_fatlink_gpu, qdp_longlink_gpu, qdp_fatlink_cpu, qdp_longlink_cpu, qdp_inlink, - gauge_param, host_gauge_data_type_size, n_naiks, eps_naik); + if (compute_on_gpu) + computeFatLongGPU(qdp_fatlink, qdp_longlink, qdp_inlink, gauge_param, host_gauge_data_type_size, n_naiks, eps_naik); + else + computeFatLongCPU(qdp_fatlink, qdp_longlink, qdp_inlink, gauge_param, host_gauge_data_type_size, n_naiks, eps_naik); } else { - // Not computing FatLong for (int dir = 0; dir < 4; dir++) { - memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size); - memcpy(qdp_fatlink_cpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size); - memcpy(qdp_longlink_gpu[dir], qdp_longlink_cpu[dir], V * gauge_site_size * host_gauge_data_type_size); + memcpy(qdp_fatlink[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size); } } } } -void constructStaggeredHostGaugeField(void **qdp_inlink, void **qdp_longlink, void **qdp_fatlink, - QudaGaugeParam &gauge_param, int argc, char **argv) +void constructStaggeredHostDeviceGaugeField(void **qdp_inlink, void **qdp_longlink_cpu, void **qdp_longlink_gpu, + void **qdp_fatlink_cpu, void **qdp_fatlink_gpu, QudaGaugeParam &gauge_param, + int argc, char **argv, 
bool &gauge_loaded) { - gauge_param.reconstruct = QUDA_RECONSTRUCT_NO; - - if (latfile.size() > 0) { - // load in the command line supplied gauge field using QIO and LIME - read_gauge_field(latfile.c_str(), qdp_inlink, gauge_param.cpu_prec, gauge_param.X, argc, argv); - if (dslash_type != QUDA_LAPLACE_DSLASH) { - applyGaugeFieldScaling_long(qdp_inlink, Vh, &gauge_param, QUDA_STAGGERED_DSLASH, gauge_param.cpu_prec); - } - } else { - int construct_type = (unit_gauge) ? 0 : 1; - if (dslash_type == QUDA_LAPLACE_DSLASH) { - constructQudaGaugeField(qdp_inlink, construct_type, gauge_param.cpu_prec, &gauge_param); - } else { - constructFatLongGaugeField(qdp_inlink, qdp_longlink, construct_type, gauge_param.cpu_prec, &gauge_param, - compute_fatlong ? QUDA_STAGGERED_DSLASH : dslash_type); - } - } + constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink_cpu, qdp_fatlink_cpu, gauge_param, argc, argv, gauge_loaded, false); // QUDA_STAGGERED_DSLASH follows the same codepath whether or not you // "compute" the fat/long links or not. 
if (dslash_type == QUDA_STAGGERED_DSLASH || dslash_type == QUDA_LAPLACE_DSLASH) { for (int dir = 0; dir < 4; dir++) { - memcpy(qdp_fatlink[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size); - memset(qdp_longlink[dir], 0, V * gauge_site_size * host_gauge_data_type_size); + memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size); + memset(qdp_longlink_gpu[dir], 0, V * gauge_site_size * host_gauge_data_type_size); } } else { // QUDA_ASQTAD_DSLASH if (compute_fatlong) { - computeFatLongGPU(qdp_fatlink, qdp_longlink, qdp_inlink, gauge_param, host_gauge_data_type_size, n_naiks, eps_naik); + computeFatLongGPU(qdp_fatlink_gpu, qdp_longlink_gpu, qdp_inlink, gauge_param, host_gauge_data_type_size, n_naiks, eps_naik); } else { + // Not computing FatLong for (int dir = 0; dir < 4; dir++) { - memcpy(qdp_fatlink[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size); + memcpy(qdp_fatlink_gpu[dir], qdp_inlink[dir], V * gauge_site_size * host_gauge_data_type_size); + memcpy(qdp_longlink_gpu[dir], qdp_longlink_cpu[dir], V * gauge_site_size * host_gauge_data_type_size); } } } @@ -483,7 +467,7 @@ void staggeredTwoLinkGaussianSmear(quda::ColorSpinorField &, void **, const quda // If "eps_naik" is 0, there's no naik correction, // and this routine skips building the paths in "act_path_coeffs[2]" void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, void **longlink_eps, void **sitelink, - void *qudaGaugeParamPtr, double **act_path_coeffs, double eps_naik) + void *qudaGaugeParamPtr, std::array, 3> &act_path_coeffs, double eps_naik) { // Prepare various things QudaGaugeParam &qudaGaugeParam = *((QudaGaugeParam *)qudaGaugeParamPtr); From 8d0c26b7e48179afb6f8fea8ea5a1ffe3e21bc9f Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 5 Oct 2023 11:19:08 -0700 Subject: [PATCH 64/99] Apply some OMP parallelization to staggered_host_utils and remove unnecessary gauge_loaded parameter --- 
tests/staggered_dslash_test_utils.h | 3 +-- tests/staggered_eigensolve_test.cpp | 3 +-- tests/staggered_invert_test.cpp | 3 +-- tests/utils/host_utils.h | 4 ++-- tests/utils/staggered_host_utils.cpp | 22 ++++++++++++++-------- 5 files changed, 19 insertions(+), 16 deletions(-) diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h index d23ba31d38..c633ee3730 100644 --- a/tests/staggered_dslash_test_utils.h +++ b/tests/staggered_dslash_test_utils.h @@ -171,10 +171,9 @@ struct StaggeredDslashTestWrapper { qdp_longlink_cpu[dir] = safe_malloc(V * gauge_site_size * host_gauge_data_type_size); } - bool gauge_loaded = false; bool compute_on_gpu = false; // reference fat/long fields should be computed on cpu constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink_cpu, qdp_fatlink_cpu, gauge_param, argc, argv, - gauge_loaded, compute_on_gpu); + compute_on_gpu); } void init() diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp index f31d54de6b..a495a11251 100644 --- a/tests/staggered_eigensolve_test.cpp +++ b/tests/staggered_eigensolve_test.cpp @@ -132,8 +132,7 @@ int main(int argc, char **argv) milc_fatlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); milc_longlink = safe_malloc(4 * V * gauge_site_size * host_gauge_data_type_size); - bool gauge_loaded; - constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv, gauge_loaded, true); + constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv, true); // Compute plaquette. Routine is aware that the gauge fields already have the phases on them. 
double plaq[3]; diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index ea55986e8e..5b9658717f 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -181,8 +181,7 @@ void test(int argc, char **argv) void* qdp_inlink[4] = {cpuIn.data(0), cpuIn.data(1), cpuIn.data(2), cpuIn.data(3)}; void* qdp_fatlink[4] = {cpuFatQDP.data(0), cpuFatQDP.data(1), cpuFatQDP.data(2), cpuFatQDP.data(3)}; void* qdp_longlink[4] = {cpuLongQDP.data(0), cpuLongQDP.data(1), cpuLongQDP.data(2), cpuLongQDP.data(3)}; - bool gauge_loaded; - constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv, gauge_loaded, true); + constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink, qdp_fatlink, gauge_param, argc, argv, true); // Reorder gauge fields to MILC order cpuFatMILC = cpuFatQDP; cpuLongMILC = cpuLongQDP; diff --git a/tests/utils/host_utils.h b/tests/utils/host_utils.h index 8deaae174e..bd6652df1f 100644 --- a/tests/utils/host_utils.h +++ b/tests/utils/host_utils.h @@ -49,9 +49,9 @@ void setQudaStaggeredInvTestParams(); //------------------------------------------------------ void constructStaggeredHostDeviceGaugeField(void **qdp_inlink, void **qdp_longlink_cpu, void **qdp_longlink_gpu, void **qdp_fatlink_cpu, void **qdp_fatlink_gpu, QudaGaugeParam &gauge_param, - int argc, char **argv, bool &gauge_loaded); + int argc, char **argv); void constructStaggeredHostGaugeField(void **qdp_inlink, void **qdp_longlink, void **qdp_fatlink, - QudaGaugeParam &gauge_param, int argc, char **argv, bool &gauge_loaded, bool compute_on_gpu); + QudaGaugeParam &gauge_param, int argc, char **argv, bool compute_on_gpu); void constructFatLongGaugeField(void **fatlink, void **longlink, int type, QudaPrecision precision, QudaGaugeParam *, QudaDslashType dslash_type); void loadFatLongGaugeQuda(void *milc_fatlink, void *milc_longlink, QudaGaugeParam &gauge_param); diff --git a/tests/utils/staggered_host_utils.cpp 
b/tests/utils/staggered_host_utils.cpp index 9eb4242bcb..8bf18990d1 100644 --- a/tests/utils/staggered_host_utils.cpp +++ b/tests/utils/staggered_host_utils.cpp @@ -27,7 +27,7 @@ template using complex = std::complex; // Staggered gauge field utils //------------------------------------------------------ void constructStaggeredHostGaugeField(void **qdp_inlink, void **qdp_longlink, void **qdp_fatlink, - QudaGaugeParam &gauge_param, int argc, char **argv, bool &gauge_loaded, bool compute_on_gpu) + QudaGaugeParam &gauge_param, int argc, char **argv, bool compute_on_gpu) { gauge_param.reconstruct = QUDA_RECONSTRUCT_NO; @@ -38,7 +38,6 @@ void constructStaggeredHostGaugeField(void **qdp_inlink, void **qdp_longlink, vo if (dslash_type != QUDA_LAPLACE_DSLASH) { applyGaugeFieldScaling_long(qdp_inlink, Vh, &gauge_param, QUDA_STAGGERED_DSLASH, gauge_param.cpu_prec); } - gauge_loaded = true; } else { int construct_type = (unit_gauge) ? 0 : 1; if (dslash_type == QUDA_LAPLACE_DSLASH) { @@ -73,9 +72,9 @@ void constructStaggeredHostGaugeField(void **qdp_inlink, void **qdp_longlink, vo void constructStaggeredHostDeviceGaugeField(void **qdp_inlink, void **qdp_longlink_cpu, void **qdp_longlink_gpu, void **qdp_fatlink_cpu, void **qdp_fatlink_gpu, QudaGaugeParam &gauge_param, - int argc, char **argv, bool &gauge_loaded) + int argc, char **argv) { - constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink_cpu, qdp_fatlink_cpu, gauge_param, argc, argv, gauge_loaded, false); + constructStaggeredHostGaugeField(qdp_inlink, qdp_longlink_cpu, qdp_fatlink_cpu, gauge_param, argc, argv, false); // QUDA_STAGGERED_DSLASH follows the same codepath whether or not you // "compute" the fat/long links or not. 
@@ -172,6 +171,7 @@ void constructFatLongGaugeField(void **fatlink, void **longlink, int type, QudaP // FIXME: may break host comparison if (dslash_type == QUDA_STAGGERED_DSLASH) { for (int dir = 0; dir < 4; ++dir) { +#pragma omp parallel for for (int i = 0; i < V; ++i) { for (auto j = 0lu; j < gauge_site_size; j += 2) { if (precision == QUDA_DOUBLE_PRECISION) { @@ -236,11 +236,12 @@ void loadFatLongGaugeQuda(void *milc_fatlink, void *milc_longlink, QudaGaugePara template void computeLongLinkCPU(void **longlink, su3_matrix **sitelink, Float *act_path_coeff) { - su3_matrix temp; for (int dir = XUP; dir <= TUP; ++dir) { int dx[4] = {0, 0, 0, 0}; +#pragma omp parallel for for (int i = 0; i < V; ++i) { // Initialize the longlinks + su3_matrix temp; su3_matrix *llink = ((su3_matrix *)longlink[dir]) + i; llfat_scalar_mult_su3_matrix(sitelink[dir] + i, act_path_coeff[1], llink); dx[dir] = 1; @@ -251,7 +252,6 @@ void computeLongLinkCPU(void **longlink, su3_matrix **sitelink, Float *act_path_ llfat_mult_su3_nn(&temp, sitelink[dir] + nbr_idx, llink); } } - return; } #else @@ -262,7 +262,7 @@ void computeLongLinkCPU(void **longlink, su3_matrix **sitelinkEx, Float *act_pat for (int dir = 0; dir < 4; ++dir) E[dir] = Z[dir] + 4; const int extended_volume = E[3] * E[2] * E[1] * E[0]; - su3_matrix temp; +#pragma omp parallel for for (int t = 0; t < Z[3]; ++t) { for (int z = 0; z < Z[2]; ++z) { for (int y = 0; y < Z[1]; ++y) { @@ -278,6 +278,7 @@ void computeLongLinkCPU(void **longlink, su3_matrix **sitelinkEx, Float *act_pat llfat_scalar_mult_su3_matrix(sitelinkEx[dir] + large_index, act_path_coeff[1], llink); dx[dir] = 1; int nbr_index = neighborIndexFullLattice(E, large_index, dx); + su3_matrix temp; llfat_mult_su3_nn(llink, sitelinkEx[dir] + nbr_index, &temp); dx[dir] = 2; nbr_index = neighborIndexFullLattice(E, large_index, dx); @@ -287,7 +288,6 @@ void computeLongLinkCPU(void **longlink, su3_matrix **sitelinkEx, Float *act_pat } // y } // z } // t - return; } #endif @@ -383,6 
+383,7 @@ void staggeredTwoLinkGaussianSmear(sFloat *res, gFloat **twolink, gFloat **ghost } { +#pragma omp parallel for for (int i = 0; i < Vh; i++) { // Get local time-slice index: const int local_t = i / Vsh_t; @@ -500,6 +501,7 @@ void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, vo int X3 = Z[2]; int X4 = Z[3]; +#pragma omp parallel for for (int i = 0; i < V_ex; i++) { int sid = i; int oddBit = 0; @@ -639,6 +641,7 @@ void computeHISQLinksCPU(void **fatlink, void **longlink, void **fatlink_eps, vo // Prepare for extended W fields // /////////////////////////////////// +#pragma omp parallel for for (int i = 0; i < V_ex; i++) { int sid = i; int oddBit = 0; @@ -855,6 +858,7 @@ void reorderQDPtoMILC(void *milc_out, void **qdp_in, int V, int siteSize, QudaPr template void reorderMILCtoQDP(Out **qdp_out, In *milc_in, int V, int siteSize) { +#pragma omp parallel for for (int i = 0; i < V; i++) { for (int dir = 0; dir < 4; dir++) { for (int j = 0; j < siteSize; j++) { @@ -909,6 +913,7 @@ void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param, Q for (int d = 0; d < 3; d++) { // even +#pragma omp parallel for for (int i = 0; i < Vh; i++) { int index = fullLatticeIndex(i, 0); @@ -957,6 +962,7 @@ void applyGaugeFieldScaling_long(Float **gauge, int Vh, QudaGaugeParam *param, Q // Apply boundary conditions to temporal links if (param->t_boundary == QUDA_ANTI_PERIODIC_T && last_node_in_t()) { +#pragma omp parallel for for (int j = 0; j < Vh; j++) { int sign = 1; if (dslash_type == QUDA_ASQTAD_DSLASH) { From 535f1a70b63f6bc94d3c0c97a9426cb92787d3d3 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 5 Oct 2023 11:20:04 -0700 Subject: [PATCH 65/99] OMP parallelization to host_utils.cpp --- tests/utils/host_utils.cpp | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/tests/utils/host_utils.cpp b/tests/utils/host_utils.cpp index 65ea318840..70aea9cdc2 100644 --- 
a/tests/utils/host_utils.cpp +++ b/tests/utils/host_utils.cpp @@ -1054,6 +1054,7 @@ template void constructUnitGaugeField(Float **res, QudaGaugePar } for (int dir = 0; dir < 4; dir++) { +#pragma omp parallel for for (int i = 0; i < Vh; i++) { for (int m = 0; m < 3; m++) { for (int n = 0; n < 3; n++) { @@ -1269,14 +1270,21 @@ template static void checkGauge(Float **oldG, Float **newG, dou for (int d = 0; d < 4; d++) { for (int eo = 0; eo < 2; eo++) { +#pragma omp parallel for for (int i = 0; i < Vh; i++) { int ga_idx = (eo * Vh + i); for (int j = 0; j < 18; j++) { double diff = fabs(newG[d][ga_idx * 18 + j] - oldG[d][ga_idx * 18 + j]); /// fabs(oldG[d][ga_idx*18+j]); for (int f = 0; f < fail_check; f++) - if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) fail[d][f]++; - if (diff > epsilon || std::isnan(diff)) iter[d][j]++; + if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) { +#pragma omp atomic + fail[d][f]++; + } + if (diff > epsilon || std::isnan(diff)) { +#pragma omp atomic + iter[d][j]++; + } } } } @@ -1311,6 +1319,7 @@ void createSiteLinkCPU(void *const *link, QudaPrecision precision, int phase) } if (phase == SITELINK_PHASE_MILC) { +#pragma omp parallel for for (int i = 0; i < V; i++) { for (int dir = XUP; dir <= TUP; dir++) { int idx = i; @@ -1459,14 +1468,21 @@ template int compareLink(Float **linkA, Float **linkB, int len) for (int i = 0; i < 18; i++) iter[i] = 0; for (int dir = 0; dir < 4; dir++) { +#pragma omp parallel for for (int i = 0; i < len; i++) { for (int j = 0; j < 18; j++) { int is = i * 18 + j; double diff = fabs(linkA[dir][is] - linkB[dir][is]); for (int f = 0; f < fail_check; f++) - if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) fail[f]++; + if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) { +#pragma omp atomic + fail[f]++; + } // if (diff > 1e-1) printf("%d %d %e\n", i, j, diff); - if (diff > 1e-3 || std::isnan(diff)) iter[j]++; + if (diff > 1e-3 || std::isnan(diff)) { +#pragma omp atomic + iter[j]++; + } } } } @@ -1624,14 
+1640,21 @@ template int compare_mom(Float *momA, Float *momB, int len) int iter[mom_site_size]; for (auto i = 0lu; i < mom_site_size; i++) iter[i] = 0; +#pragma omp parallel for for (int i = 0; i < len; i++) { for (auto j = 0lu; j < mom_site_size - 1; j++) { int is = i * mom_site_size + j; double diff = fabs(momA[is] - momB[is]); for (int f = 0; f < fail_check; f++) - if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) fail[f]++; + if (diff > pow(10.0, -(f + 1)) || std::isnan(diff)) { +#pragma omp atomic + fail[f]++; + } // if (diff > 1e-1) printf("%d %d %e\n", i, j, diff); - if (diff > 1e-3 || std::isnan(diff)) iter[j]++; + if (diff > 1e-3 || std::isnan(diff)) { +#pragma omp atomic + iter[j]++; + } } } From 77958796eab170a8edc75f6cfbd4fddabc19f7e6 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 5 Oct 2023 11:20:38 -0700 Subject: [PATCH 66/99] OMP parallelization to hisq_force_reference --- tests/host_reference/hisq_force_reference.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/host_reference/hisq_force_reference.cpp b/tests/host_reference/hisq_force_reference.cpp index 9cd4ee4d9c..78a5f36f1c 100644 --- a/tests/host_reference/hisq_force_reference.cpp +++ b/tests/host_reference/hisq_force_reference.cpp @@ -99,8 +99,9 @@ template void su3_projector(su3_vecto template void computeLinkOrderedOuterProduct(su3_vector *src, quda::GaugeField &dest, size_t nhops) { - int dx[4]; +#pragma omp parallel for for (int i = 0; i < V; ++i) { + int dx[4]; for (int dir = 0; dir < 4; ++dir) { dx[3] = dx[2] = dx[1] = dx[0] = 0; dx[dir] = nhops; @@ -894,10 +895,12 @@ void computeMiddleLinkField(const int dim[4], const Real *const oprod, const Rea // To keep the code as close to the GPU code as possible, we'll // loop over the even sites first and then the odd sites LoadStore ls(volume); +#pragma omp parallel for for (int site = 0; site < loop_count; ++site) { computeMiddleLinkSite(site, dim, oprod, Qprev, link, sig, mu, coeff, ls, Pmu, 
P3, Qmu, newOprod); } // Loop over odd lattice sites +#pragma omp parallel for for (int site = 0; site < loop_count; ++site) { computeMiddleLinkSite(site, dim, oprod, Qprev, link, sig, mu, coeff, ls, Pmu, P3, Qmu, newOprod); } @@ -988,10 +991,12 @@ void computeSideLinkField(const int dim[4], const Real *const P3, #endif LoadStore ls(volume); +#pragma omp parallel for for (int site = 0; site < loop_count; ++site) { computeSideLinkSite(site, dim, P3, Qprod, link, sig, mu, coeff, accumu_coeff, ls, shortP, newOprod); } +#pragma omp parallel for for (int site = 0; site < loop_count; ++site) { computeSideLinkSite(site, dim, P3, Qprod, link, sig, mu, coeff, accumu_coeff, ls, shortP, newOprod); } @@ -1098,6 +1103,7 @@ void computeAllLinkField(const int dim[4], const Real *const oprod, const Real * #endif LoadStore ls(volume); +#pragma omp parallel for for (int site = 0; site < loop_count; ++site) { computeAllLinkSite(site, dim, oprod, Qprev, link, sig, mu, coeff, accumu_coeff, ls, shortP, newOprod); @@ -1295,10 +1301,12 @@ void computeLongLinkField(const int dim[4], const Real *const oprod, const Real const int half_volume = volume / 2; LoadStore ls(volume); +#pragma omp parallel for for (int site = 0; site < half_volume; ++site) { computeLongLinkSite(site, dim, oprod, link, sig, coeff, ls, output); } // Loop over odd lattice sites +#pragma omp parallel for for (int site = 0; site < half_volume; ++site) { computeLongLinkSite(site, dim, oprod, link, sig, coeff, ls, output); } @@ -1362,7 +1370,9 @@ void completeForceField(const int dim[4], const Real *const oprod, const Real *c const int half_volume = volume / 2; LoadStore ls(volume); +#pragma omp parallel for for (int site = 0; site < half_volume; ++site) { completeForceSite(site, dim, oprod, link, sig, ls, mom); } +#pragma omp parallel for for (int site = 0; site < half_volume; ++site) { completeForceSite(site, dim, oprod, link, sig, ls, mom); } } From 12b9632459ba015f0912354df518ffa10766d38a Mon Sep 17 00:00:00 2001 From: 
maddyscientist Date: Thu, 5 Oct 2023 11:21:04 -0700 Subject: [PATCH 67/99] OMP parallelization to staggered reference dslash --- tests/host_reference/staggered_dslash_reference.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/host_reference/staggered_dslash_reference.cpp b/tests/host_reference/staggered_dslash_reference.cpp index 5716ce5de8..14852a9f22 100644 --- a/tests/host_reference/staggered_dslash_reference.cpp +++ b/tests/host_reference/staggered_dslash_reference.cpp @@ -42,6 +42,7 @@ void staggeredDslashReference(sFloat *res, gFloat **fatlink, gFloat **longlink, sFloat **, sFloat **, int oddBit, int daggerBit, QudaDslashType dslash_type) #endif { +#pragma omp parallel for for (auto i = 0lu; i < Vh * stag_spinor_site_size; i++) res[i] = 0.0; gFloat *fatlinkEven[4], *fatlinkOdd[4]; @@ -66,6 +67,7 @@ void staggeredDslashReference(sFloat *res, gFloat **fatlink, gFloat **longlink, #endif } +#pragma omp parallel for for (int sid = 0; sid < Vh; sid++) { int offset = stag_spinor_site_size * sid; From 4c308f6f72a2239e97f3423137459e049dc56b03 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Sat, 7 Oct 2023 16:11:42 -0700 Subject: [PATCH 68/99] Don't dereference nullptr when creating reference QDP fields --- lib/gauge_field.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp index af9cc7bf90..6a0b4c6bb9 100644 --- a/lib/gauge_field.cpp +++ b/lib/gauge_field.cpp @@ -181,7 +181,7 @@ namespace quda { if (param.create != QUDA_REFERENCE_FIELD_CREATE) { gauge_array[d] = quda_ptr(mem_type, nbytes); } else if (param.create == QUDA_REFERENCE_FIELD_CREATE) { - gauge_array[d] = quda_ptr(static_cast(param.gauge)[d], mem_type); + if (param.gauge) gauge_array[d] = quda_ptr(static_cast(param.gauge)[d], mem_type); } else { errorQuda("Unsupported creation type %d", param.create); } From a16e51c5cae63fba7b8bfdbb7f217ce2a54faf7c Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 11 Oct 2023 16:54:02 -0700 
Subject: [PATCH 69/99] Prevent concurrent timers from running: check if a timer is already running, and if so push it to the stack, and restore after the newly started timer is stopped. Fixes timing issues as noted by Jiqun --- lib/timer.cpp | 49 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/lib/timer.cpp b/lib/timer.cpp index 986d7b045f..2c6f9b21c3 100644 --- a/lib/timer.cpp +++ b/lib/timer.cpp @@ -136,6 +136,8 @@ namespace quda { #define POP_RANGE #endif + static std::stack pt_stack; + void TimeProfile::Start_(const char *func, const char *file, int line, QudaProfileType idx) { // if total timer isn't running, then start it running @@ -144,6 +146,17 @@ namespace quda { switchOff = true; } + // if a timer is already running, stop it and push to stack + for (auto i = 0; i < QUDA_PROFILE_COUNT - 1; i++) { + if (i == static_cast(idx)) continue; + if (profile[i].running) { + if (i == QUDA_PROFILE_COMPUTE || i == QUDA_PROFILE_H2D || i == QUDA_PROFILE_D2H) qudaDeviceSynchronize(); + profile[i].stop(file, func, line); + if (use_global) StopGlobal(func, file, line, static_cast(i)); + pt_stack.push(static_cast(i)); + } + } + profile[idx].start(func, file, line); PUSH_RANGE(fname.c_str(), idx) if (use_global) StartGlobal(func, file, line, idx); @@ -156,12 +169,22 @@ namespace quda { profile[idx].stop(func, file, line); POP_RANGE - // switch off total timer if we need to - if (switchOff && idx != QUDA_PROFILE_TOTAL) { - profile[QUDA_PROFILE_TOTAL].stop(func, file, line); - switchOff = false; + if (pt_stack.empty()) { + // switch off total timer if we need to (only if no timer being popped) + if (switchOff && idx != QUDA_PROFILE_TOTAL) { + profile[QUDA_PROFILE_TOTAL].stop(func, file, line); + switchOff = false; + } + if (use_global) StopGlobal(func, file, line, idx); + } + + // restore any pre-existing timers if needed + if (!pt_stack.empty()) { + auto i = pt_stack.top(); + pt_stack.pop(); + 
profile[i].start(func, file, line); + if (use_global) StartGlobal(func, file, line, i); } - if (use_global) StopGlobal(func, file, line, idx); } #undef PUSH_RANGE @@ -198,28 +221,28 @@ namespace quda { } } - TimeProfile dummy("dummy"); + TimeProfile dummy("default", false); - static std::stack tpstack; + static std::stack tp_stack; pushProfile::pushProfile(TimeProfile &profile) : profile(profile) { profile.TPSTART(QUDA_PROFILE_TOTAL); - tpstack.push(&profile); + tp_stack.push(&profile); } pushProfile::~pushProfile() { - if (tpstack.empty()) errorQuda("popProfile() called with empty stack"); - auto &profile = *(tpstack.top()); + if (tp_stack.empty()) errorQuda("popProfile() called with empty stack"); + auto &profile = *(tp_stack.top()); if (&(this->profile) != &profile) errorQuda("Popped profile is not the expected one"); - tpstack.pop(); + tp_stack.pop(); profile.TPSTOP(QUDA_PROFILE_TOTAL); } TimeProfile& getProfile() { - if (tpstack.empty()) return dummy; - return *(tpstack.top()); + if (tp_stack.empty()) return dummy; + return *(tp_stack.top()); } } From 85292b24463bfade92a4451a9b26943209cb9f92 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 13 Oct 2023 16:22:02 -0700 Subject: [PATCH 70/99] Cleanup of solver timing and flops handling: add global flops counter which is incremented whenever tuneLaunch is called; for solver gflops and timing, we now compute the time and gflop between pushing the present interface profile, this now ensures we include all operations and includes upload/download time --- include/invert_quda.h | 19 ----------------- include/multigrid.h | 5 ----- include/quda.h | 4 +++- include/timer.h | 7 ++++++- include/tune_quda.h | 10 +++++++++ lib/eigensolve_quda.cpp | 2 -- lib/gauge_fix_fft.cu | 10 ++++----- lib/interface_quda.cpp | 40 ++++++++++++------------------------ lib/inv_bicgstab_quda.cpp | 14 ------------- lib/inv_bicgstabl_quda.cpp | 11 ---------- lib/inv_ca_cg.cpp | 27 +----------------------- lib/inv_ca_gcr.cpp | 25 
+---------------------- lib/inv_cg3_quda.cpp | 13 +----------- lib/inv_cg_quda.cpp | 42 +------------------------------------- lib/inv_eigcg_quda.cpp | 24 ---------------------- lib/inv_gcr_quda.cpp | 15 -------------- lib/inv_gmresdr_quda.cpp | 9 -------- lib/inv_mr_quda.cpp | 15 +------------- lib/inv_msrc_cg_quda.cpp | 10 --------- lib/inv_multi_cg_quda.cpp | 9 -------- lib/inv_pcg_quda.cpp | 13 ------------ lib/multigrid.cpp | 28 ------------------------- lib/solver.cpp | 3 --- lib/timer.cpp | 11 +++++++++- lib/tune.cpp | 6 ++++++ 25 files changed, 58 insertions(+), 314 deletions(-) diff --git a/include/invert_quda.h b/include/invert_quda.h index 35a21ce31c..11ac64708e 100644 --- a/include/invert_quda.h +++ b/include/invert_quda.h @@ -225,12 +225,6 @@ namespace quda { /** The type of accelerator type to use for preconditioner */ QudaAcceleratorType accelerator_type_precondition; - /**< The time taken by the solver */ - double secs; - - /**< The Gflops rate of the solver */ - double gflops; - // Incremental EigCG solver parameters /**< The precision of the Ritz vectors */ QudaPrecision precision_ritz;//also search space precision @@ -333,8 +327,6 @@ namespace quda { ca_lambda_max_precondition(param.ca_lambda_max_precondition), schwarz_type(param.schwarz_type), accelerator_type_precondition(param.accelerator_type_precondition), - secs(param.secs), - gflops(param.gflops), precision_ritz(param.cuda_prec_ritz), n_ev(param.n_ev), m(param.max_search_dim), @@ -422,8 +414,6 @@ namespace quda { ca_lambda_max_precondition(param.ca_lambda_max_precondition), schwarz_type(param.schwarz_type), accelerator_type_precondition(param.accelerator_type_precondition), - secs(param.secs), - gflops(param.gflops), precision_ritz(param.precision_ritz), n_ev(param.n_ev), m(param.m), @@ -466,9 +456,6 @@ namespace quda { param.true_res = true_res; param.true_res_hq = true_res_hq; param.iter += iter; - comm_allreduce_sum(gflops); - param.gflops += gflops; - param.secs += secs; if (offset >= 
0) { param.true_res_offset[offset] = true_res_offset[offset]; param.iter_res_offset[offset] = iter_res_offset[offset]; @@ -786,12 +773,6 @@ namespace quda { static void computeCAKrylovSpace(const DiracMatrix &diracm, std::vector &Ap, std::vector &p, int n_krylov, QudaCABasis basis, double m_map, double b_map, Args &&...args); - - /** - * @brief Return flops - * @return flops expended by this operator - */ - virtual double flops() const { return 0; } }; /** diff --git a/include/multigrid.h b/include/multigrid.h index 82a46998c4..e5981baac2 100644 --- a/include/multigrid.h +++ b/include/multigrid.h @@ -486,11 +486,6 @@ namespace quda { */ void buildFreeVectors(std::vector &B); - /** - @brief Return the total flops done on this and all coarser levels. - */ - double flops() const; - /** @brief Return if we're on a fine grid right now */ diff --git a/include/quda.h b/include/quda.h index d6e9ee66aa..b2ddefa72c 100644 --- a/include/quda.h +++ b/include/quda.h @@ -1760,8 +1760,10 @@ extern "C" { int delete_2link; /** Set if the input spinor is on a time slice **/ int t0; + /** Time taken for the smearing operations **/ + double secs; /** Flops count for the smearing operations **/ - int gflops; + double gflops; } QudaQuarkSmearParam; diff --git a/include/timer.h b/include/timer.h index 2de1829c18..8402deb89c 100644 --- a/include/timer.h +++ b/include/timer.h @@ -235,8 +235,13 @@ namespace quda { the profile stack, and be popped when its destructor is called. 
*/ struct pushProfile { + static inline double secs_dummy = 0; + static inline double gflops_dummy = 0; TimeProfile &profile; - pushProfile(TimeProfile &profile); + double &secs; + double &gflops; + uint64_t flops; + pushProfile(TimeProfile &profile, double &secs = secs_dummy, double &gflops = gflops_dummy); virtual ~pushProfile(); }; diff --git a/include/tune_quda.h b/include/tune_quda.h index ff99826149..2750e57e9c 100644 --- a/include/tune_quda.h +++ b/include/tune_quda.h @@ -45,6 +45,10 @@ namespace quda { class Tunable { + friend TuneParam tuneLaunch(Tunable &, QudaTune, QudaVerbosity); + static inline uint64_t _flops_global = 0; + static inline uint64_t _bytes_global = 0; + protected: virtual long long flops() const { return 0; } virtual long long bytes() const { return 0; } @@ -340,6 +344,12 @@ namespace quda { qudaError_t launchError() const { return launch_error; } qudaError_t &launchError() { return launch_error; } + + static void flops_global(uint64_t value) { _flops_global = value; } + static uint64_t flops_global() { return _flops_global; } + + static void bytes_global(uint64_t value) { _bytes_global = value; } + static uint64_t bytes_global() { return _bytes_global; } }; /** diff --git a/lib/eigensolve_quda.cpp b/lib/eigensolve_quda.cpp index 00c888dd88..710d6ac13a 100644 --- a/lib/eigensolve_quda.cpp +++ b/lib/eigensolve_quda.cpp @@ -259,8 +259,6 @@ namespace quda io.save(kSpace, save_prec, n_eig); } - mat.flops(); - logQuda(QUDA_SUMMARIZE, "********************************\n"); logQuda(QUDA_SUMMARIZE, "***** END QUDA EIGENSOLVER *****\n"); logQuda(QUDA_SUMMARIZE, "********************************\n"); diff --git a/lib/gauge_fix_fft.cu b/lib/gauge_fix_fft.cu index 1de3980332..fea9a92623 100644 --- a/lib/gauge_fix_fft.cu +++ b/lib/gauge_fix_fft.cu @@ -217,7 +217,7 @@ namespace quda { GaugeFixQuality gfixquality(argQ, data); gfixquality.apply(device::get_default_stream()); double action0 = argQ.getAction(); - if(getVerbosity() >= QUDA_SUMMARIZE) 
printf("Step: %d\tAction: %.16e\ttheta: %.16e\n", 0, argQ.getAction(), argQ.getTheta()); + logQuda(QUDA_SUMMARIZE, "Step: %d\tAction: %.16e\ttheta: %.16e\n", 0, argQ.getAction(), argQ.getTheta()); double diff = 0.0; int iter = 0; @@ -289,7 +289,7 @@ namespace quda { if ( autotune && ((action - action0) < -1e-14) ) { if ( arg.alpha > 0.01 ) { arg.alpha = 0.95 * arg.alpha; - if(getVerbosity() >= QUDA_SUMMARIZE) printf(">>>>>>>>>>>>>> Warning: changing alpha down -> %.4e\n", arg.alpha); + logQuda(QUDA_SUMMARIZE, ">>>>>>>>>>>>>> Warning: changing alpha down -> %.4e\n", arg.alpha); } } //------------------------------------------------------------------------ @@ -356,7 +356,7 @@ namespace quda { gflops = (gflops * 1e-9) / (secs); gbytes = gbytes / (secs * 1e9); - if (getVerbosity() > QUDA_SUMMARIZE) printfQuda("Time: %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes); + logQuda(QUDA_SUMMARIZE, "Time: %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes); host_free(num_failures_h); } @@ -366,10 +366,10 @@ namespace quda { double alpha, int autotune, double tolerance, int stopWtheta) { if (gauge_dir != 3) { - if (getVerbosity() > QUDA_SUMMARIZE) printfQuda("Starting Landau gauge fixing with FFTs...\n"); + logQuda(QUDA_SUMMARIZE, "Starting Landau gauge fixing with FFTs...\n"); gaugeFixingFFT(data, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta); } else { - if (getVerbosity() > QUDA_SUMMARIZE) printfQuda("Starting Coulomb gauge fixing with FFTs...\n"); + logQuda(QUDA_SUMMARIZE, "Starting Coulomb gauge fixing with FFTs...\n"); gaugeFixingFFT(data, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta); } } diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index f05e633d51..6286bb04a9 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -1773,7 +1773,7 @@ namespace quda { void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity) { - auto profile = 
pushProfile(profileDslash); + auto profile = pushProfile(profileDslash, inv_param->secs, inv_param->gflops); const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise; @@ -2141,12 +2141,13 @@ void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam *eig_param) { if (!initialized) errorQuda("QUDA not initialized"); - auto profile = pushProfile(profileEigensolve); // Transfer the inv param structure contained in eig_param. // This will define the operator to be eigensolved. QudaInvertParam *inv_param = eig_param->invert_param; + auto profile = pushProfile(profileEigensolve, inv_param->secs, inv_param->gflops); + // QUDA can employ even-odd preconditioning to an operator. // For the eigensolver the solution type must match // the solve type, i.e., there is no full solution reconstruction @@ -2179,9 +2180,7 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam // Check that the gauge field is valid GaugeField *cudaGauge = checkGauge(inv_param); - // Set all timing statistics to zero - inv_param->secs = 0; - inv_param->gflops = 0; + // Set iter statistics to zero inv_param->iter = 0; // Dump all eigensolver and invert param variables to stdout if requested. 
@@ -2331,8 +2330,6 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &pr errorQuda("Outer MG solver can only use QUDA_DIRECT_SOLVE at present"); if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaMultigridParam(&mg_param); - mg_param.secs = 0; - mg_param.gflops = 0; bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) || (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); @@ -2395,7 +2392,7 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &pr void* newMultigridQuda(QudaMultigridParam *mg_param) { profilerStart(__func__); - auto profile = pushProfile(profileInvert); + auto profile = pushProfile(profileInvert, mg_param->secs, mg_param->gflops); pushVerbosity(mg_param->invert_param->verbosity); auto *mg = new multigrid_solver(*mg_param, profileInvert); @@ -2414,7 +2411,7 @@ void destroyMultigridQuda(void *mg) { void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param) { profilerStart(__func__); - auto profile = pushProfile(profileInvert); + auto profile = pushProfile(profileInvert, mg_param->secs, mg_param->gflops); pushVerbosity(mg_param->invert_param->verbosity); profileInvert.TPSTART(QUDA_PROFILE_PREAMBLE); @@ -2526,7 +2523,7 @@ void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param) void dumpMultigridQuda(void *mg_, QudaMultigridParam *mg_param) { profilerStart(__func__); - auto profile = pushProfile(profileInvert); + auto profile = pushProfile(profileInvert, mg_param->secs, mg_param->gflops); pushVerbosity(mg_param->invert_param->verbosity); auto *mg = static_cast(mg_); @@ -2547,8 +2544,6 @@ deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile) if (param->inv_type != QUDA_EIGCG_INVERTER && param->inv_type != QUDA_INC_EIGCG_INVERTER) return; GaugeField *cudaGauge = checkGauge(param); - eig_param.secs = 0; - eig_param.gflops = 0; DiracParam diracParam; if(eig_param.cuda_prec_ritz == param->cuda_prec) @@ -2602,7 +2597,7 @@ 
deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile) } void* newDeflationQuda(QudaEigParam *eig_param) { - auto profile = pushProfile(profileInvert); + auto profile = pushProfile(profileInvert, eig_param->secs, eig_param->gflops); auto *defl = new deflated_solver(*eig_param, profileInvert); saveProfile(__func__); flushProfile(); @@ -2615,7 +2610,7 @@ void destroyDeflationQuda(void *df) { void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) { - auto profile = pushProfile(profileInvert); + auto profile = pushProfile(profileInvert, param->secs, param->gflops); profilerStart(__func__); if (!initialized) errorQuda("QUDA not initialized"); @@ -2643,8 +2638,6 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) bool norm_error_solve = (param->solve_type == QUDA_NORMERR_SOLVE) || (param->solve_type == QUDA_NORMERR_PC_SOLVE); - param->secs = 0; - param->gflops = 0; param->iter = 0; Dirac *d = nullptr; @@ -2933,9 +2926,7 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) } profileInvert.TPSTOP(QUDA_PROFILE_EPILOGUE); - if (!param->make_resident_solution) { - h_x = x; - } + if (!param->make_resident_solution) h_x = x; profileInvert.TPSTART(QUDA_PROFILE_EPILOGUE); @@ -3032,7 +3023,7 @@ void callMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, // col */ profilerStart(__func__); - auto profile = pushProfile(profileInvertMultiSrc); + auto profile = pushProfile(profileInvertMultiSrc, param->secs, param->gflops); CommKey split_key = {param->split_grid[0], param->split_grid[1], param->split_grid[2], param->split_grid[3]}; int num_sub_partition = quda::product(split_key); @@ -3365,7 +3356,7 @@ void dslashMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param */ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) { - auto profile = pushProfile(profileMulti); + auto profile = pushProfile(profileMulti, param->secs, param->gflops); profilerStart(__func__); if 
(!initialized) errorQuda("QUDA not initialized"); @@ -3413,9 +3404,6 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) } } - // Timing and FLOP counters - param->secs = 0; - param->gflops = 0; param->iter = 0; for (int i=0; inum_offset-1; i++) { @@ -4963,7 +4951,7 @@ void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_param) { if (smear_param->n_steps == 0) return; - auto profile = pushProfile(profileGaussianSmear); + auto profile = pushProfile(profileGaussianSmear, smear_param->secs, smear_param->gflops); QudaInvertParam *inv_param = smear_param->inv_param; @@ -5080,8 +5068,6 @@ void performTwoLinkGaussianSmearNStep(void *h_in, QudaQuarkSmearParam *smear_par logQuda(QUDA_VERBOSE, "Finished 2link Gaussian smearing.\n"); delete d; - smear_param->gflops = dirac.Flops(); - if (smear_param->delete_2link != 0) { freeUniqueGaugeQuda(QUDA_SMEARED_LINKS); } saveTuneCache(); diff --git a/lib/inv_bicgstab_quda.cpp b/lib/inv_bicgstab_quda.cpp index c5d3bf90a1..4fdf08020a 100644 --- a/lib/inv_bicgstab_quda.cpp +++ b/lib/inv_bicgstab_quda.cpp @@ -214,10 +214,6 @@ namespace quda { PrintStats("BiCGstab", k, r2, b2, heavy_quark_res); - if (!param.is_preconditioner) { // do not do the below if we this is an inner solver - blas::flops = 0; - } - profile.TPSTOP(QUDA_PROFILE_PREAMBLE); profile.TPSTART(QUDA_PROFILE_COMPUTE); @@ -344,10 +340,6 @@ namespace quda { profile.TPSTOP(QUDA_PROFILE_COMPUTE); profile.TPSTART(QUDA_PROFILE_EPILOGUE); - param.secs += profile.Last(QUDA_PROFILE_COMPUTE); - double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matPrecon.flops())*1e-9; - - param.gflops += gflops; param.iter += k; if (k==param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter); @@ -363,12 +355,6 @@ namespace quda { PrintSummary("BiCGstab", k, r2, b2, stop, param.tol_hq); } - // reset the flops counters - blas::flops = 0; - 
mat.flops(); - matSloppy.flops(); - matPrecon.flops(); - profile.TPSTOP(QUDA_PROFILE_EPILOGUE); profile.TPSTART(QUDA_PROFILE_FREE); diff --git a/lib/inv_bicgstabl_quda.cpp b/lib/inv_bicgstabl_quda.cpp index 0393fe308c..b0e00d9ff5 100644 --- a/lib/inv_bicgstabl_quda.cpp +++ b/lib/inv_bicgstabl_quda.cpp @@ -50,7 +50,6 @@ namespace quda { if (!param.is_preconditioner) { profile.TPSTOP(QUDA_PROFILE_COMPUTE); - param.secs += profile.Last(QUDA_PROFILE_COMPUTE); profile.TPSTART(QUDA_PROFILE_EIGEN); } @@ -61,7 +60,6 @@ namespace quda { if (!param.is_preconditioner) { profile.TPSTOP(QUDA_PROFILE_EIGEN); - param.secs += profile.Last(QUDA_PROFILE_EIGEN); profile.TPSTART(QUDA_PROFILE_COMPUTE); } @@ -562,7 +560,6 @@ namespace quda { double heavy_quark_res = use_heavy_quark_res ? sqrt(blas::HeavyQuarkResidualNorm(x, r_full).z) : 0.0; const int heavy_quark_check = param.heavy_quark_check; // how often to check the heavy quark residual - blas::flops = 0; //bool l2_converge = false; //double r2_old = r2; @@ -706,9 +703,6 @@ namespace quda { profile.TPSTOP(QUDA_PROFILE_COMPUTE); profile.TPSTART(QUDA_PROFILE_EPILOGUE); - param.secs += profile.Last(QUDA_PROFILE_COMPUTE); - double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matEig.flops()) * 1e-9; - param.gflops = gflops; param.iter += total_iter; if (total_iter >= param.maxiter) // >= if n_krylov doesn't divide max iter. @@ -726,12 +720,7 @@ namespace quda { param.true_res_hq = use_heavy_quark_res ? sqrt(blas::HeavyQuarkResidualNorm(x, r[0]).z) : 0.0; } - // Reset flops counters. 
- blas::flops = 0; - mat.flops(); - profile.TPSTOP(QUDA_PROFILE_EPILOGUE); - param.secs += profile.Last(QUDA_PROFILE_EPILOGUE); PrintSummary(solver_name.c_str(), total_iter, r2, b2, stop, param.tol_hq); } diff --git a/lib/inv_ca_cg.cpp b/lib/inv_ca_cg.cpp index ec95bf3ffe..445b2acaf3 100644 --- a/lib/inv_ca_cg.cpp +++ b/lib/inv_ca_cg.cpp @@ -184,10 +184,7 @@ namespace quda { Solver::create(x, b); if (!init) { - if (!param.is_preconditioner) { - blas::flops = 0; - profile.TPSTART(QUDA_PROFILE_INIT); - } + if (!param.is_preconditioner) profile.TPSTART(QUDA_PROFILE_INIT); Q_AQandg.resize(param.Nkrylov * (param.Nkrylov + 1)); Q_AS.resize(param.Nkrylov * param.Nkrylov); @@ -248,7 +245,6 @@ namespace quda { if (!param.is_preconditioner) { profile.TPSTOP(QUDA_PROFILE_COMPUTE); - param.secs += profile.Last(QUDA_PROFILE_COMPUTE); profile.TPSTART(QUDA_PROFILE_EIGEN); } @@ -290,7 +286,6 @@ namespace quda if (!param.is_preconditioner) { profile.TPSTOP(QUDA_PROFILE_EIGEN); - param.secs += profile.Last(QUDA_PROFILE_EIGEN); profile.TPSTART(QUDA_PROFILE_COMPUTE); } } @@ -318,7 +313,6 @@ namespace quda { if (!param.is_preconditioner) { profile.TPSTOP(QUDA_PROFILE_COMPUTE); - param.secs += profile.Last(QUDA_PROFILE_COMPUTE); profile.TPSTART(QUDA_PROFILE_EIGEN); } @@ -357,7 +351,6 @@ namespace quda if (!param.is_preconditioner) { profile.TPSTOP(QUDA_PROFILE_EIGEN); - param.secs += profile.Last(QUDA_PROFILE_EIGEN); profile.TPSTART(QUDA_PROFILE_COMPUTE); } } @@ -522,7 +515,6 @@ namespace quda int resIncreaseTotal = 0; if (!param.is_preconditioner) { - blas::flops = 0; profile.TPSTOP(QUDA_PROFILE_PREAMBLE); profile.TPSTART(QUDA_PROFILE_COMPUTE); } @@ -675,25 +667,8 @@ namespace quda } if (!param.is_preconditioner) { - qudaDeviceSynchronize(); // ensure solver is complete before ending timing profile.TPSTOP(QUDA_PROFILE_COMPUTE); - profile.TPSTART(QUDA_PROFILE_EPILOGUE); - param.secs += profile.Last(QUDA_PROFILE_COMPUTE); - - // store flops and reset counters - double gflops = 
(blas::flops + mat.flops() + matSloppy.flops() + matPrecon.flops() + matEig.flops()) * 1e-9; - - param.gflops += gflops; param.iter += total_iter; - - // reset the flops counters - blas::flops = 0; - mat.flops(); - matSloppy.flops(); - matPrecon.flops(); - matEig.flops(); - - profile.TPSTOP(QUDA_PROFILE_EPILOGUE); } PrintSummary("CA-CG", total_iter, r2, b2, stop, param.tol_hq); diff --git a/lib/inv_ca_gcr.cpp b/lib/inv_ca_gcr.cpp index f9e605ea86..5b893bd3fc 100644 --- a/lib/inv_ca_gcr.cpp +++ b/lib/inv_ca_gcr.cpp @@ -28,10 +28,7 @@ namespace quda Solver::create(x, b); if (!init) { - if (!param.is_preconditioner) { - blas::flops = 0; - profile.TPSTART(QUDA_PROFILE_INIT); - } + if (!param.is_preconditioner) profile.TPSTART(QUDA_PROFILE_INIT); alpha.resize(param.Nkrylov); @@ -103,7 +100,6 @@ namespace quda if (!param.is_preconditioner) { profile.TPSTOP(QUDA_PROFILE_COMPUTE); - param.secs += profile.Last(QUDA_PROFILE_COMPUTE); profile.TPSTART(QUDA_PROFILE_EIGEN); } @@ -115,7 +111,6 @@ namespace quda if (!param.is_preconditioner) { profile.TPSTOP(QUDA_PROFILE_EIGEN); - param.secs += profile.Last(QUDA_PROFILE_EIGEN); profile.TPSTART(QUDA_PROFILE_COMPUTE); } } @@ -268,7 +263,6 @@ namespace quda int resIncreaseTotal = 0; if (!param.is_preconditioner) { - blas::flops = 0; profile.TPSTOP(QUDA_PROFILE_PREAMBLE); profile.TPSTART(QUDA_PROFILE_COMPUTE); } @@ -375,25 +369,8 @@ namespace quda } if (!param.is_preconditioner) { - qudaDeviceSynchronize(); // ensure solver is complete before ending timing profile.TPSTOP(QUDA_PROFILE_COMPUTE); - profile.TPSTART(QUDA_PROFILE_EPILOGUE); - param.secs += profile.Last(QUDA_PROFILE_COMPUTE); - - // store flops and reset counters - double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matPrecon.flops() + matMdagM.flops()) * 1e-9; - - param.gflops += gflops; param.iter += total_iter; - - // reset the flops counters - blas::flops = 0; - mat.flops(); - matSloppy.flops(); - matPrecon.flops(); - matMdagM.flops(); - - 
profile.TPSTOP(QUDA_PROFILE_EPILOGUE); } PrintSummary("CA-GCR", total_iter, r2, b2, stop, param.tol_hq); diff --git a/lib/inv_cg3_quda.cpp b/lib/inv_cg3_quda.cpp index 42ab22fcab..9ac9f85b9f 100644 --- a/lib/inv_cg3_quda.cpp +++ b/lib/inv_cg3_quda.cpp @@ -268,8 +268,6 @@ namespace quda { profile.TPSTOP(QUDA_PROFILE_INIT); profile.TPSTART(QUDA_PROFILE_PREAMBLE); - blas::flops = 0; - // compute initial residual depending on whether we have an initial guess or not double r2; if (param.use_init_guess == QUDA_USE_INIT_GUESS_YES) { @@ -474,13 +472,9 @@ namespace quda { profile.TPSTOP(QUDA_PROFILE_COMPUTE); profile.TPSTART(QUDA_PROFILE_EPILOGUE); - param.secs = profile.Last(QUDA_PROFILE_COMPUTE); - double gflops = (blas::flops + mat.flops() + matSloppy.flops())*1e-9; - param.gflops = gflops; param.iter += k; - if (k == param.maxiter) - warningQuda("Exceeded maximum iterations %d", param.maxiter); + if (k == param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter); // compute the true residuals if (!mixed_precision && param.compute_true_res) { @@ -491,11 +485,6 @@ namespace quda { PrintSummary("CG3", k, r2, b2, stop, param.tol_hq); - // reset the flops counters - blas::flops = 0; - mat.flops(); - matSloppy.flops(); - profile.TPSTOP(QUDA_PROFILE_EPILOGUE); } diff --git a/lib/inv_cg_quda.cpp b/lib/inv_cg_quda.cpp index 15bdf23002..40cd15ea1c 100644 --- a/lib/inv_cg_quda.cpp +++ b/lib/inv_cg_quda.cpp @@ -369,7 +369,6 @@ namespace quda { if (!param.is_preconditioner) { profile.TPSTOP(QUDA_PROFILE_PREAMBLE); profile.TPSTART(QUDA_PROFILE_COMPUTE); - blas::flops = 0; } int k = 0; @@ -544,9 +543,6 @@ namespace quda { profile.TPSTOP(QUDA_PROFILE_COMPUTE); profile.TPSTART(QUDA_PROFILE_EPILOGUE); - param.secs = profile.Last(QUDA_PROFILE_COMPUTE); - double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matPrecon.flops() + matEig.flops()) * 1e-9; - param.gflops = gflops; param.iter += k; if (k == param.maxiter) warningQuda("Exceeded maximum iterations %d", 
param.maxiter); @@ -563,15 +559,7 @@ namespace quda { PrintSummary("CG", k, r2, b2, stop, 0.0); - if (!param.is_preconditioner) { - // reset the flops counters - blas::flops = 0; - mat.flops(); - matSloppy.flops(); - matPrecon.flops(); - - profile.TPSTOP(QUDA_PROFILE_EPILOGUE); - } + if (!param.is_preconditioner) profile.TPSTOP(QUDA_PROFILE_EPILOGUE); if (param.is_preconditioner) commGlobalReductionPop(); } @@ -692,7 +680,6 @@ namespace quda { profile.TPSTOP(QUDA_PROFILE_PREAMBLE); profile.TPSTART(QUDA_PROFILE_COMPUTE); - blas::flops = 0; int k = 0; @@ -988,9 +975,6 @@ namespace quda { profile.TPSTOP(QUDA_PROFILE_COMPUTE); profile.TPSTART(QUDA_PROFILE_EPILOGUE); - param.secs = profile.Last(QUDA_PROFILE_COMPUTE); - double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matPrecon.flops() + matEig.flops()) * 1e-9; - param.gflops = gflops; param.iter += k; if (k == param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter); @@ -1006,12 +990,6 @@ namespace quda { PrintSummary("CG", k, r2, b2, stop, param.tol_hq); - // reset the flops counters - blas::flops = 0; - mat.flops(); - matSloppy.flops(); - matPrecon.flops(); - profile.TPSTOP(QUDA_PROFILE_EPILOGUE); } @@ -1163,7 +1141,6 @@ namespace quda { profile.TPSTOP(QUDA_PROFILE_PREAMBLE); profile.TPSTART(QUDA_PROFILE_COMPUTE); - blas::flops = 0; int k = 0; @@ -1311,9 +1288,6 @@ namespace quda { profile.TPSTOP(QUDA_PROFILE_COMPUTE); profile.TPSTART(QUDA_PROFILE_EPILOGUE); - param.secs = profile.Last(QUDA_PROFILE_COMPUTE); - double gflops = (blas::flops + mat.flops() + matSloppy.flops()) * 1e-9; - param.gflops = gflops; param.iter += k; if (k == param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter); @@ -1332,11 +1306,6 @@ namespace quda { PrintSummary("CG", k, r2(i, i).real(), b2[i], stop[i], 0.0); } - // reset the flops counters - blas::flops = 0; - mat.flops(); - matSloppy.flops(); - profile.TPSTOP(QUDA_PROFILE_EPILOGUE); profile.TPSTART(QUDA_PROFILE_FREE); @@ -1533,7 
+1502,6 @@ void CG::solve(ColorSpinorField& x, ColorSpinorField& b) { profile.TPSTOP(QUDA_PROFILE_PREAMBLE); profile.TPSTART(QUDA_PROFILE_COMPUTE); - blas::flops = 0; int k = 0; @@ -1879,9 +1847,6 @@ void CG::solve(ColorSpinorField& x, ColorSpinorField& b) { profile.TPSTOP(QUDA_PROFILE_COMPUTE); profile.TPSTART(QUDA_PROFILE_EPILOGUE); - param.secs = profile.Last(QUDA_PROFILE_COMPUTE); - double gflops = (blas::flops + mat.flops() + matSloppy.flops())*1e-9; - param.gflops = gflops; param.iter += k; if (k == param.maxiter) @@ -1901,11 +1866,6 @@ void CG::solve(ColorSpinorField& x, ColorSpinorField& b) { PrintSummary("CG", k, r2(i,i).real(), b2[i], stop[i], 0.0); } - // reset the flops counters - blas::flops = 0; - mat.flops(); - matSloppy.flops(); - profile.TPSTOP(QUDA_PROFILE_EPILOGUE); profile.TPSTART(QUDA_PROFILE_FREE); diff --git a/lib/inv_eigcg_quda.cpp b/lib/inv_eigcg_quda.cpp index 57a963a20e..decb31af7e 100644 --- a/lib/inv_eigcg_quda.cpp +++ b/lib/inv_eigcg_quda.cpp @@ -179,11 +179,7 @@ namespace quda { inner.delta = 1e-20; // no reliable updates within the inner solver inner.precision = outer.precision_precondition; // preconditioners are uni-precision solvers inner.precision_sloppy = outer.precision_precondition; - inner.iter = 0; - inner.gflops = 0; - inner.secs = 0; - inner.inv_type_precondition = QUDA_INVALID_INVERTER; inner.is_preconditioner = true; // used to tell the inner solver it is an inner solver @@ -193,9 +189,6 @@ namespace quda { // set the required parameters for the initCG solver static void fillInitCGSolverParam(SolverParam &inner, const SolverParam &outer) { inner.iter = 0; - inner.gflops = 0; - inner.secs = 0; - inner.tol = outer.tol; inner.tol_restart = outer.tol_restart; inner.maxiter = outer.maxiter; @@ -460,7 +453,6 @@ namespace quda { profile.TPSTOP(QUDA_PROFILE_PREAMBLE); profile.TPSTART(QUDA_PROFILE_COMPUTE); - blas::flops = 0; double rMinvr = blas::reDotProduct(r,*z); //Begin EigCG iterations: @@ -517,9 +509,6 @@ namespace quda { 
profile.TPSTOP(QUDA_PROFILE_COMPUTE); profile.TPSTART(QUDA_PROFILE_EPILOGUE); - param.secs = profile.Last(QUDA_PROFILE_COMPUTE); - double gflops = (blas::flops + matSloppy.flops())*1e-9; - param.gflops = gflops; param.iter += k; if (k == param.maxiter) @@ -532,10 +521,6 @@ namespace quda { PrintSummary("eigCG", k, r2, b2, args.global_stop, param.tol_hq); - // reset the flops counters - blas::flops = 0; - matSloppy.flops(); - profile.TPSTOP(QUDA_PROFILE_EPILOGUE); profile.TPSTART(QUDA_PROFILE_FREE); @@ -588,20 +573,11 @@ namespace quda { xProj = x; rProj = r; - if(getVerbosity() >= QUDA_VERBOSE) printfQuda("\ninitCG stat: %i iter / %g secs = %g Gflops. \n", Kparam.iter, Kparam.secs, Kparam.gflops); - Kparam.tol *= param.inc_tol; if(restart_idx == (param.max_restart_num-1)) Kparam.tol = full_tol;//do the last solve in the next cycle to full tolerance - - param.secs += Kparam.secs; } - if(getVerbosity() >= QUDA_VERBOSE) printfQuda("\ninitCG stat: %i iter / %g secs = %g Gflops. \n", Kparam.iter, Kparam.secs, Kparam.gflops); - // - param.secs += Kparam.secs; - param.gflops += Kparam.gflops; - k += Kparam.iter; delete rp; diff --git a/lib/inv_gcr_quda.cpp b/lib/inv_gcr_quda.cpp index 9227f573b3..3caf952929 100644 --- a/lib/inv_gcr_quda.cpp +++ b/lib/inv_gcr_quda.cpp @@ -276,8 +276,6 @@ namespace quda { profile.TPSTOP(QUDA_PROFILE_INIT); profile.TPSTART(QUDA_PROFILE_PREAMBLE); - blas::flops = 0; - blas::copy(r_sloppy, r); int total_iter = 0; @@ -386,11 +384,6 @@ namespace quda { profile.TPSTOP(QUDA_PROFILE_COMPUTE); profile.TPSTART(QUDA_PROFILE_EPILOGUE); - param.secs += profile.Last(QUDA_PROFILE_COMPUTE); - - double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matPrecon.flops() + matMdagM.flops()) * 1e-9; - if (K) gflops += K->flops()*1e-9; - if (k >= param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter); logQuda(QUDA_VERBOSE, "GCR: number of restarts = %d\n", restart); @@ -410,16 +403,8 @@ namespace quda { if (0) blas::copy(b, K ? 
r_sloppy : p[k_break]); } - param.gflops += gflops; param.iter += total_iter; - // reset the flops counters - blas::flops = 0; - mat.flops(); - matSloppy.flops(); - matPrecon.flops(); - matMdagM.flops(); - profile.TPSTOP(QUDA_PROFILE_EPILOGUE); profile.TPSTART(QUDA_PROFILE_FREE); diff --git a/lib/inv_gmresdr_quda.cpp b/lib/inv_gmresdr_quda.cpp index 389206853e..ab56176a2a 100644 --- a/lib/inv_gmresdr_quda.cpp +++ b/lib/inv_gmresdr_quda.cpp @@ -143,8 +143,6 @@ namespace quda { inner.precision_sloppy = outer.precision_precondition; inner.iter = 0; - inner.gflops = 0; - inner.secs = 0; inner.inv_type_precondition = QUDA_INVALID_INVERTER; inner.is_preconditioner = true; @@ -469,7 +467,6 @@ namespace quda { profile.TPSTOP(QUDA_PROFILE_PREAMBLE); profile.TPSTART(QUDA_PROFILE_COMPUTE); - blas::flops = 0; const bool use_heavy_quark_res = (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? true : false; @@ -549,9 +546,6 @@ namespace quda { profile.TPSTOP(QUDA_PROFILE_COMPUTE); profile.TPSTART(QUDA_PROFILE_EPILOGUE); - param.secs = profile.Last(QUDA_PROFILE_COMPUTE); - double gflops = (blas::flops + mat.flops()) * 1e-9; - param.gflops = gflops; param.iter += tot_iters; mat(r, x); @@ -560,9 +554,6 @@ namespace quda { PrintSummary("FGMResDR:", tot_iters, r2, b2, stop, param.tol_hq); - blas::flops = 0; - mat.flops(); - profile.TPSTOP(QUDA_PROFILE_EPILOGUE); param.rhs_idx += 1; diff --git a/lib/inv_mr_quda.cpp b/lib/inv_mr_quda.cpp index 4f636bf279..cc69c3cd14 100644 --- a/lib/inv_mr_quda.cpp +++ b/lib/inv_mr_quda.cpp @@ -62,10 +62,7 @@ namespace quda create(x, b); // allocate fields - if (!param.is_preconditioner) { - blas::flops = 0; - profile.TPSTART(QUDA_PROFILE_COMPUTE); - } + if (!param.is_preconditioner) profile.TPSTART(QUDA_PROFILE_COMPUTE); double b2 = blas::norm2(b); // Save norm of b double r2 = 0.0; // if zero source then we will exit immediately doing no work @@ -160,17 +157,7 @@ namespace quda if (!param.is_preconditioner) { profile.TPSTOP(QUDA_PROFILE_COMPUTE); - 
profile.TPSTART(QUDA_PROFILE_EPILOGUE); - param.secs += profile.Last(QUDA_PROFILE_COMPUTE); - - // store flops and reset counters - double gflops = (blas::flops + mat.flops() + matSloppy.flops()) * 1e-9; - - param.gflops += gflops; param.iter += iter; - blas::flops = 0; - - profile.TPSTOP(QUDA_PROFILE_EPILOGUE); } } diff --git a/lib/inv_msrc_cg_quda.cpp b/lib/inv_msrc_cg_quda.cpp index 9a64386095..70bcb9a089 100644 --- a/lib/inv_msrc_cg_quda.cpp +++ b/lib/inv_msrc_cg_quda.cpp @@ -146,7 +146,6 @@ namespace quda { profile.TPSTOP(QUDA_PROFILE_PREAMBLE); profile.TPSTART(QUDA_PROFILE_COMPUTE); - blas::flops = 0; int k=0; @@ -315,10 +314,6 @@ namespace quda { profile.TPSTOP(QUDA_PROFILE_COMPUTE); profile.TPSTART(QUDA_PROFILE_EPILOGUE); - param.secs = profile.Last(QUDA_PROFILE_COMPUTE); - double gflops = (blas::flops + mat.flops() + matSloppy.flops())*1e-9; - reduceDouble(gflops); - param.gflops = gflops; param.iter += k; if (k==param.maxiter) @@ -334,11 +329,6 @@ namespace quda { PrintSummary("CG", k, r2, b2, stop, inv.tol_hq); - // reset the flops counters - blas::flops = 0; - mat.flops(); - matSloppy.flops(); - profile.TPSTOP(QUDA_PROFILE_EPILOGUE); profile.TPSTART(QUDA_PROFILE_FREE); diff --git a/lib/inv_multi_cg_quda.cpp b/lib/inv_multi_cg_quda.cpp index b6757440f4..ada5a3326a 100644 --- a/lib/inv_multi_cg_quda.cpp +++ b/lib/inv_multi_cg_quda.cpp @@ -262,7 +262,6 @@ namespace quda { int k = 0; int rUpdate = 0; - blas::flops = 0; // now create the worker class for updating the shifted solutions and gradient vectors bool aux_update = false; @@ -443,9 +442,6 @@ namespace quda { logQuda(QUDA_VERBOSE, "Reliable updates = %d\n", rUpdate); if (k==param.maxiter) warningQuda("Exceeded maximum iterations %d\n", param.maxiter); - param.secs = profile.Last(QUDA_PROFILE_COMPUTE); - double gflops = (blas::flops + mat.flops() + matSloppy.flops())*1e-9; - param.gflops = gflops; param.iter += k; if (param.compute_true_res) { @@ -490,11 +486,6 @@ namespace quda { } } - // reset the 
flops counters - blas::flops = 0; - mat.flops(); - matSloppy.flops(); - profile.TPSTOP(QUDA_PROFILE_EPILOGUE); popOutputPrefix(); } diff --git a/lib/inv_pcg_quda.cpp b/lib/inv_pcg_quda.cpp index 24d9259ef8..2fe62692de 100644 --- a/lib/inv_pcg_quda.cpp +++ b/lib/inv_pcg_quda.cpp @@ -203,8 +203,6 @@ namespace quda profile.TPSTOP(QUDA_PROFILE_PREAMBLE); profile.TPSTART(QUDA_PROFILE_COMPUTE); - blas::flops = 0; - int k = 0; PrintStats("PCG", k, r2, b2, heavy_quark_res); @@ -378,10 +376,6 @@ namespace quda if (mixed()) copy(x, x_sloppy); xpy(y, x); // x += y - param.secs = profile.Last(QUDA_PROFILE_COMPUTE); - double gflops = (blas::flops + mat.flops() + matSloppy.flops() + matPrecon.flops() + matEig.flops()) * 1e-9; - if (K) gflops += K->flops() * 1e-9; - param.gflops = gflops; param.iter += k; if (k == param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter); @@ -393,13 +387,6 @@ namespace quda double true_res = xmyNorm(b, r); param.true_res = sqrt(true_res / b2); - // reset the flops counters - blas::flops = 0; - mat.flops(); - matSloppy.flops(); - matPrecon.flops(); - matEig.flops(); - profile.TPSTOP(QUDA_PROFILE_EPILOGUE); } diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp index 4defe3878f..475a1d9b3b 100644 --- a/lib/multigrid.cpp +++ b/lib/multigrid.cpp @@ -804,34 +804,6 @@ namespace quda popLevel(); } - // FIXME need to make this more robust (implement Solver::flops() for all solvers) - double MG::flops() const { - double flops = 0; - - if (param_coarse_solver) { - flops += param_coarse_solver->gflops * 1e9; - param_coarse_solver->gflops = 0; - } else if (param.level < param.Nlevel-1) { - flops += coarse->flops(); - } - - if (param_presmooth) { - flops += param_presmooth->gflops * 1e9; - param_presmooth->gflops = 0; - } - - if (param_postsmooth) { - flops += param_postsmooth->gflops * 1e9; - param_postsmooth->gflops = 0; - } - - if (transfer) { - flops += transfer->flops(); - } - - return flops; - } - bool check_deviation(double deviation, 
double tol) { return (deviation > tol || std::isnan(deviation) || std::isinf(deviation)); diff --git a/lib/solver.cpp b/lib/solver.cpp index 8b734e8bc5..12cce8f532 100644 --- a/lib/solver.cpp +++ b/lib/solver.cpp @@ -223,9 +223,6 @@ namespace quda { = (outer.inv_type_precondition == QUDA_MR_INVERTER) ? QUDA_INVALID_RESIDUAL : QUDA_L2_RELATIVE_RESIDUAL; inner.iter = 0; - inner.gflops = 0; - inner.secs = 0; - inner.inv_type_precondition = QUDA_INVALID_INVERTER; inner.is_preconditioner = true; // tell inner solver it is a preconditioner inner.pipeline = true; diff --git a/lib/timer.cpp b/lib/timer.cpp index 2c6f9b21c3..125b9242d4 100644 --- a/lib/timer.cpp +++ b/lib/timer.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #ifdef INTERFACE_NVTX #include "nvtx3/nvToolsExt.h" @@ -225,7 +226,12 @@ namespace quda { static std::stack tp_stack; - pushProfile::pushProfile(TimeProfile &profile) : profile(profile) + pushProfile::pushProfile(TimeProfile &profile, double &secs, double &gflops) : + profile(profile), + secs(secs), + gflops(gflops), + flops(Tunable::flops_global()) + { profile.TPSTART(QUDA_PROFILE_TOTAL); tp_stack.push(&profile); @@ -238,6 +244,9 @@ namespace quda { if (&(this->profile) != &profile) errorQuda("Popped profile is not the expected one"); tp_stack.pop(); profile.TPSTOP(QUDA_PROFILE_TOTAL); + secs = profile.Last(QUDA_PROFILE_TOTAL); + gflops = (Tunable::flops_global() - flops) * 1e-9; + if (&gflops != &gflops_dummy) comm_allreduce_sum(gflops); } TimeProfile& getProfile() diff --git a/lib/tune.cpp b/lib/tune.cpp index fea2a7b509..608a77e3c9 100644 --- a/lib/tune.cpp +++ b/lib/tune.cpp @@ -890,6 +890,8 @@ namespace quda trace_list.push_back(trace_entry); } + Tunable::flops_global(Tunable::flops_global() + tunable.flops()); // increment flops counter + Tunable::bytes_global(Tunable::bytes_global() + tunable.bytes()); // increment bytes counter return param_tuned; } @@ -908,6 +910,8 @@ namespace quda logQuda(QUDA_DEBUG_VERBOSE, "Launching %s with %s at 
vol=%s with %s (untuned)\n", key.name, key.aux, key.volume, tunable.paramString(param_default).c_str()); + Tunable::flops_global(Tunable::flops_global() + tunable.flops()); // increment flops counter + Tunable::bytes_global(Tunable::bytes_global() + tunable.bytes()); // increment bytes counter return param_default; } else if (!tuning) { @@ -1121,6 +1125,8 @@ namespace quda param.n_calls = profile_count ? 1 : 0; + Tunable::flops_global(Tunable::flops_global() + tunable.flops()); // increment flops counter + Tunable::bytes_global(Tunable::bytes_global() + tunable.bytes()); // increment bytes counter return param; } From fe2979807f5d2fda0f10fcda6c1313cc722c72a8 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Fri, 13 Oct 2023 16:22:31 -0700 Subject: [PATCH 71/99] Report MG setup time and performance in invert_test and staggered_invert_test --- tests/invert_test.cpp | 2 ++ tests/staggered_invert_test.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tests/invert_test.cpp b/tests/invert_test.cpp index 9de2ecfe83..20af8bd390 100644 --- a/tests/invert_test.cpp +++ b/tests/invert_test.cpp @@ -214,6 +214,8 @@ std::vector> solve(test_t param) if (use_split_grid) { errorQuda("Split grid does not work with MG yet."); } mg_preconditioner = newMultigridQuda(&mg_param); inv_param.preconditioner = mg_preconditioner; + + printfQuda("MG Setup Done: %g secs, %g Gflops\n", mg_param.secs, mg_param.gflops / mg_param.secs); } // Vector construct START diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp index ea5aab17fd..0d9f2c3e5e 100644 --- a/tests/staggered_invert_test.cpp +++ b/tests/staggered_invert_test.cpp @@ -217,6 +217,8 @@ void test(int argc, char **argv) if (use_split_grid) { errorQuda("Split grid does not work with MG yet."); } mg_preconditioner = newMultigridQuda(&mg_param); inv_param.preconditioner = mg_preconditioner; + + printfQuda("MG Setup Done: %g secs, %g Gflops\n", mg_param.secs, mg_param.gflops / mg_param.secs); } // Staggered 
vector construct START From d3649dd16304446c90da5bfa7f8de842ea08f64f Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Mon, 16 Oct 2023 10:46:13 -0700 Subject: [PATCH 72/99] Remove legacy blas flop and byte counting --- include/blas_quda.h | 3 --- lib/blas_quda.cu | 6 ------ lib/coarse_op_preconditioned.in.cu | 4 ++-- lib/multi_blas_quda.cu | 3 --- lib/multi_reduce_quda.cu | 3 --- lib/reduce_quda.cu | 3 --- lib/staggered_kd_build_xinv.cu | 19 ++++++++----------- tests/blas_test.cpp | 11 ++++++----- 8 files changed, 16 insertions(+), 36 deletions(-) diff --git a/include/blas_quda.h b/include/blas_quda.h index 8df40df452..07b09f1209 100644 --- a/include/blas_quda.h +++ b/include/blas_quda.h @@ -23,9 +23,6 @@ namespace quda { void setParam(int kernel, int prec, int threads, int blocks); - extern unsigned long long flops; - extern unsigned long long bytes; - inline void zero(cvector_ref &x) { for (auto i = 0u; i < x.size(); i++) x[i].zero(); diff --git a/lib/blas_quda.cu b/lib/blas_quda.cu index 4c8719f309..f84f2eeb59 100644 --- a/lib/blas_quda.cu +++ b/lib/blas_quda.cu @@ -7,9 +7,6 @@ namespace quda { namespace blas { - unsigned long long flops; - unsigned long long bytes; - template