lattice · maddyscientist · Jul 29, 2015 · Mar 20, 2015 · Mar 20, 2015 · Mar 20, 2015
@@ -11,13 +11,42 @@ furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
 
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+QUDA is supported by NVIDIA, and includes the NVIDIA-licensed
+libraries cub and generics.
+
+Copyright (c) 2011-2015, NVIDIA Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of NVIDIA Corporation nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA
+CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 QUDA leverages Google Test for unit testing, contained within tests/gtest.h

@@ -618,6 +618,7 @@ BUILD_MULTI_GPU
 DYNAMIC_CLOVER
 BUILD_CONTRACT
 BUILD_SSTEP
+BUILD_GAUGE_ALG
 BUILD_GAUGE_TOOLS
 BUILD_HISQ_FORCE
 BUILD_FERMION_FORCE
@@ -723,6 +724,7 @@ enable_gauge_force
 enable_staggered_force
 enable_hisq_force
 enable_gauge_tools
+enable_gauge_alg
 enable_sstep
 enable_contract
 enable_dynamic_clover
@@ -1406,6 +1408,8 @@ Optional Features:
   --enable-gauge-tools    Build auxilary gauge tools: plaquette, gauge
                           evolver, APE, extended gauge routines (default:
                           disabled)
+  --enable-gauge-alg      Build gauge fixing and pure gauge algorithms
+                          (default: disabled)
   --enable-sstep          Build s-step linear solvers (default: disabled)
   --enable-contract       Build bilinear contraction code (default: disabled)
   --enable-dynamic-clover Invert dynamically the clover term for
@@ -2247,6 +2251,15 @@ else
 fi
 
 
+# Check whether --enable-gauge-alg was given.
+if test "${enable_gauge_alg+set}" = set; then :
+  enableval=$enable_gauge_alg;  build_gauge_alg=${enableval}
+else
+   build_gauge_alg="no"
+
+fi
+
+
 # Check whether --enable-sstep was given.
 if test "${enable_sstep+set}" = set; then :
   enableval=$enable_sstep;  build_sstep=${enableval}
@@ -4174,6 +4187,13 @@ yes|no);;
   ;;
 esac
 
+case ${build_gauge_alg} in
+yes|no);;
+*)
+  as_fn_error $? " invalid value for --enable-gauge-alg " "$LINENO" 5
+  ;;
+esac
+
 case ${build_sstep} in
 yes|no);;
 *)
@@ -4255,6 +4275,11 @@ $as_echo "$as_me: Enabling Multi-GPU" >&6;}
 $as_echo "$as_me: Asqtad fermion force doesn't support multi-GPU yet: disabling " >&6;}
   build_staggered_force="no";
 
+  if test "X${build_gauge_alg}X" = "XyesX"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Gauge fixing with FFTs only supported for single-GPU. Use gauge fixing with overrelaxation in multi-GPU mode. " >&5
+$as_echo "$as_me: WARNING: Gauge fixing with FFTs only supported for single-GPU. Use gauge fixing with overrelaxation in multi-GPU mode. " >&2;}
+  fi
+
 
   if test "X${qmp_home}X" = "XX"; then
 #    if test "X${mpi_home}X" = "XX"; then
@@ -4404,6 +4429,11 @@ $as_echo "$as_me: Setting BUILD_GAUGE_TOOLS = ${build_gauge_tools}  " >&6;}
 BUILD_GAUGE_TOOLS=${build_gauge_tools}
 
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: Setting BUILD_GAUGE_ALG = ${build_gauge_alg}  " >&5
+$as_echo "$as_me: Setting BUILD_GAUGE_ALG = ${build_gauge_alg}  " >&6;}
+BUILD_GAUGE_ALG=${build_gauge_alg}
+
+
 { $as_echo "$as_me:${as_lineno-$LINENO}: Setting BUILD_SSTEP = ${build_sstep}  " >&5
 $as_echo "$as_me: Setting BUILD_SSTEP = ${build_sstep}  " >&6;}
 BUILD_SSTEP=${build_sstep}
@@ -5720,4 +5750,3 @@ if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then
   { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unrecognized options: $ac_unrecognized_opts" >&5
 $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;}
 fi
-
@@ -167,6 +167,12 @@ AC_ARG_ENABLE(gauge-tools,
   [ build_gauge_tools="no" ]
 )
 
+AC_ARG_ENABLE(gauge-alg,
+  AC_HELP_STRING([--enable-gauge-alg], [ Build gauge fixing and pure gauge algorithms (default: disabled)]),
+  [ build_gauge_alg=${enableval} ],
+  [ build_gauge_alg="no" ]
+)
+
 AC_ARG_ENABLE(sstep,
   AC_HELP_STRING([--enable-sstep], [ Build s-step linear solvers (default: disabled)]),
   [ build_sstep=${enableval} ],
@@ -516,14 +522,22 @@ yes|no);;
   ;;
 esac
 
-dnl Build Hisq force
+dnl Build gauge tools
 case ${build_gauge_tools} in
 yes|no);;
 *) 
   AC_MSG_ERROR([ invalid value for --enable-gauge-tools ])
   ;;
 esac
 
+dnl Build gauge algorithms
+case ${build_gauge_alg} in
+yes|no);;
+*) 
+  AC_MSG_ERROR([ invalid value for --enable-gauge-alg ])
+  ;;
+esac
+
 dnl Build sstep
 case ${build_sstep} in
 yes|no);;
@@ -619,6 +633,10 @@ then
 
   AC_MSG_NOTICE([Asqtad fermion force doesn't support multi-GPU yet: disabling ])
   build_staggered_force="no";
+
+  if test "X${build_gauge_alg}X" = "XyesX"; then
+  AC_MSG_WARN([Gauge fixing with FFTs only supported for single-GPU. Use gauge fixing with overrelaxation in multi-GPU mode. ])
+  fi
 
 
   if test "X${qmp_home}X" = "XX"; then
@@ -729,6 +747,9 @@ AC_SUBST( BUILD_HISQ_FORCE, [${build_hisq_force}])
 AC_MSG_NOTICE([Setting BUILD_GAUGE_TOOLS = ${build_gauge_tools} ] )
 AC_SUBST( BUILD_GAUGE_TOOLS, [${build_gauge_tools}])
 
+AC_MSG_NOTICE([Setting BUILD_GAUGE_ALG = ${build_gauge_alg} ] )
+AC_SUBST( BUILD_GAUGE_ALG, [${build_gauge_alg}])
+
 AC_MSG_NOTICE([Setting BUILD_SSTEP = ${build_sstep} ] )
 AC_SUBST( BUILD_SSTEP, [${build_sstep}])
 

@@ -0,0 +1,44 @@
+#pragma once
+
+/**
+   @file atomic.cuh
+
+   @section Description
+
+   Provides definitions of atomic functions that are not native to
+   CUDA.  These are intentionally not declared in the namespace to
+   avoid confusion when resolving the native atomicAdd functions.
+ */
+
+/**
+   Atomically adds val to the double stored at addr.  The update is
+   emulated with a 64-bit compare-and-swap that is retried until no
+   other thread has modified the location in between.
+
+   NOTE(review): newer CUDA toolkits ship a built-in
+   atomicAdd(double*, double) for some architectures - confirm this
+   overload cannot clash with it for the targets being built.
+
+   @param addr Address that stores the atomic variable to be updated
+   @param val Value to be added to the atomic
+   @return The value stored at addr immediately before this update
+*/
+static inline __device__ double atomicAdd(double *addr, double val){
+  // Work on the raw 64-bit pattern; atomicCAS has no double overload.
+  unsigned long long int *word = (unsigned long long int*)addr;
+  unsigned long long int seen = *word;
+  unsigned long long int expected;
+
+  do {
+    expected = seen;
+    seen = atomicCAS(word, expected,
+		     __double_as_longlong(val + __longlong_as_double(expected)));
+    // Bit patterns are compared (not doubles) so that a NaN, which
+    // never compares equal to itself, cannot keep the loop spinning.
+  } while ( expected != seen );
+
+  return __longlong_as_double(seen);
+}
+
+/**
+   Implementation of double2 atomic addition using two
+   double-precision atomic additions, one per component.
+
+   Each component is updated atomically, but the pair is not: another
+   thread may observe (or update) the location between the x and the
+   y update, so the returned x and y are not guaranteed to come from
+   the same instant.
+
+   @param addr Address that stores the atomic variable to be updated
+   @param val Value to be added to the atomic
+   @return The previous x and y components as returned by the two
+   component-wise atomic additions
+*/
+static inline __device__ double2 atomicAdd(double2 *addr, double2 val){
+  // No initial load of *addr: both components are overwritten by the
+  // values returned from the atomic updates below.
+  double2 old;
+  old.x = atomicAdd(&addr->x, val.x);
+  old.y = atomicAdd(&addr->y, val.y);
+  return old;
+}
@@ -61,6 +61,12 @@ namespace quda {
     const void* V(bool inverse=false) const { return inverse ? cloverInv : clover; }
     const void* Norm(bool inverse=false) const { return inverse ? invNorm : norm; }
 
+    /**
+       This function returns true if the field is stored in an
+       internal field order for the given precision.
+    */
+    bool isNative() const;
+
     double* TrLog() const { return trlog; }
 
     QudaCloverFieldOrder Order() const { return order; }

@@ -300,4 +300,17 @@ namespace quda {
     };
 
 
+  /**
+     Trait that selects the FloatNOrder accessor type for a given
+     storage precision, so callers can template on the precision
+     alone instead of on every accessor type.  The primary template
+     is intentionally empty: only the precisions specialised below
+     provide a nested ::type.
+  */
+  template <typename Float, int N = 72>
+  struct clover_mapper { };
+
+  /** Double precision: FloatNOrder with a vector length of 2 */
+  template <int N>
+  struct clover_mapper<double, N> {
+    typedef FloatNOrder<double, N, 2> type;
+  };
+
+  /** Single precision: FloatNOrder with a vector length of 4 */
+  template <int N>
+  struct clover_mapper<float, N> {
+    typedef FloatNOrder<float, N, 4> type;
+  };
+
+  /** Half precision (stored as short): FloatNOrder with a vector length of 4 */
+  template <int N>
+  struct clover_mapper<short, N> {
+    typedef FloatNOrder<short, N, 4> type;
+  };
+
+
 }
@@ -0,0 +1,34 @@
+#pragma once
+#include <cub/cub.cuh>
+
+/**
+   @file cub_helper.cuh
+
+   @section Description
+
+   Provides helper functors for custom datatypes for cub algorithms.
+ */
+
+namespace quda {
+
+  /**
+     Helper functor for generic addition reduction.  Returns a + b
+     using the type's own operator+.
+
+     The call operator is const-qualified so that the functor can be
+     invoked on const instances and through const references.
+
+     @param a Left operand
+     @param b Right operand
+     @return a + b
+  */
+  template <typename T>
+  struct Summ {
+    __host__ __device__ __forceinline__ T operator() (const T &a, const T &b) const {
+      return a + b;
+    }
+  };
+
+  /**
+     Helper functor for double2 addition reduction.  The sum is formed
+     component-wise on x and y rather than through an operator+ on
+     double2.
+
+     The call operator is const-qualified so that the functor can be
+     invoked on const instances and through const references.
+
+     @param a Left operand
+     @param b Right operand
+     @return double2 holding (a.x + b.x, a.y + b.y)
+  */
+  template <>
+  struct Summ<double2>{
+    __host__ __device__ __forceinline__ double2 operator() (const double2 &a, const double2 &b) const {
+      return make_double2(a.x + b.x, a.y + b.y);
+    }
+  };
+
+}
@@ -111,6 +111,18 @@ namespace quda {
 	  else errorQuda("Error: invalid link type(%d)\n", link_type);
 	  for (int d=0; d<nDim; d++) r[d] = 0;
 	}
+
+    /**
+       Sets the precision of this parameter struct and picks the
+       matching field order: FLOAT2 ordering when the precision is
+       double or when no reconstruction is used, FLOAT4 ordering
+       otherwise.
+       @param precision The precision to use
+     */
+    void setPrecision(QudaPrecision precision) {
+      this->precision = precision;
+
+      const bool float2_order =
+	(precision == QUDA_DOUBLE_PRECISION) || (reconstruct == QUDA_RECONSTRUCT_NO);
+
+      if (float2_order) {
+	order = QUDA_FLOAT2_GAUGE_ORDER;
+      } else {
+	order = QUDA_FLOAT4_GAUGE_ORDER;
+      }
+    }
+
   };
 
   std::ostream& operator<<(std::ostream& output, const GaugeFieldParam& param);
@@ -153,12 +165,6 @@ namespace quda {
     /** Whether the staggered phase factor has been applied */
     bool staggeredPhaseApplied;
 
-    /**
-       This function returns true if the field is stored in an
-       internal field order for the given precision.
-    */ 
-    bool isNative() const;
-
   public:
     GaugeField(const GaugeFieldParam &param);
     virtual ~GaugeField();
@@ -178,7 +184,6 @@ namespace quda {
     const int* R() const { return r; }
     QudaGhostExchange GhostExchange() const { return ghostExchange; }
     QudaStaggeredPhase StaggeredPhase() const { return staggeredPhaseType; }
-
     /**
        Apply the staggered phase factors to the gauge field.
     */
@@ -194,6 +199,12 @@ namespace quda {
 
     void checkField(const GaugeField &);
 
+    /**
+       This function returns true if the field is stored in an
+       internal field order for the given precision.
+    */ 
+    bool isNative() const;
+
     size_t Bytes() const { return bytes; }
     size_t PhaseBytes() const { return phase_bytes; }
     size_t PhaseOffset() const { return phase_offset; }