Merge branch 'develop' into feature/milc_interface_cloverinvert
Mathias Wagner committed May 15, 2015
2 parents 9cefdb9 + 0859c72 commit cdf0907
Showing 17 changed files with 2,313 additions and 139 deletions.
66 changes: 66 additions & 0 deletions LICENSE
@@ -52,3 +52,69 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


QUDA leverages LLNL's PMPI wrapper generator to generate NVTX
wrappers, enabling the Visual Profiler to display MPI timeline
information. The file lib/generate/wrap.py is supplied under the
following license:

Copyright (c) 2010, Lawrence Livermore National Security, LLC.
Produced at the Lawrence Livermore National Laboratory Written by Todd
Gamblin, [email protected]. LLNL-CODE-417602 All rights reserved.

This file is part of Libra. For details, see
http://github.com/tgamblin/libra. Please also read the LICENSE file
for further information.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

* Redistributions of source code must retain the above copyright
notice, this list of conditions and the disclaimer below.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the disclaimer (as noted below)
in the documentation and/or other materials provided with the
distribution.
* Neither the name of the LLNS/LLNL nor the names of its contributors
may be used to endorse or promote products derived from this
software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE
LIVERMORE NATIONAL SECURITY, LLC, THE U.S. DEPARTMENT OF ENERGY OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


Additional BSD Notice

1. This notice is required to be provided under our contract with the
U.S. Department of Energy (DOE). This work was produced at
Lawrence Livermore National Laboratory under Contract
No. DE-AC52-07NA27344 with the DOE.

2. Neither the United States Government nor Lawrence Livermore
National Security, LLC nor any of their employees, makes any
warranty, express or implied, or assumes any liability or
responsibility for the accuracy, completeness, or usefulness of any
information, apparatus, product, or process disclosed, or
represents that its use would not infringe privately-owned rights.

3. Also, reference herein to any specific commercial products,
process, or services by trade name, trademark, manufacturer or
otherwise does not necessarily constitute or imply its endorsement,
recommendation, or favoring by the United States Government or
Lawrence Livermore National Security, LLC. The views and opinions
of authors expressed herein do not necessarily state or reflect
those of the United States Government or Lawrence Livermore
National Security, LLC, and shall not be used for advertising or
product endorsement purposes.
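The NVTX wrappers mentioned above follow the standard PMPI interposition pattern: each MPI entry point is redefined to bracket the forwarded `PMPI_*` call with an NVTX range, and `make mpi_nvtx` materializes these as `nvtx_pmpi.c` via `generate/wrap.py`. A minimal, self-contained sketch of that shape — the `fake_*` functions and the simplified `MPI_Send` signature are stand-ins for the real NVTX/PMPI symbols, not QUDA's actual generated code:

```c
#include <assert.h>
#include <string.h>

/* Stand-ins for nvtxRangePushA/nvtxRangePop and PMPI_Send so the
 * sketch compiles without MPI or NVTX headers; a generated wrapper
 * calls the real entry points instead. */
char last_range[64];
int range_depth = 0;
int pmpi_send_calls = 0;

static void fake_nvtxRangePushA(const char *name) {
  strncpy(last_range, name, sizeof(last_range) - 1);
  range_depth++;
}
static void fake_nvtxRangePop(void) { range_depth--; }
static int fake_PMPI_Send(const void *buf, int count) {
  (void)buf; (void)count;
  pmpi_send_calls++;
  return 0; /* MPI_SUCCESS */
}

/* Shape of a generated wrapper: the profiler-facing MPI_Send
 * interposes on the call, opens an NVTX range named after the
 * routine, forwards to the PMPI entry point, then closes the range. */
int MPI_Send(const void *buf, int count) {
  fake_nvtxRangePushA("MPI_Send");
  int rc = fake_PMPI_Send(buf, count);
  fake_nvtxRangePop();
  return rc;
}
```

Because the interposition happens at link time, application code calling `MPI_Send` needs no changes to appear on the profiler timeline.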
3 changes: 3 additions & 0 deletions Makefile
@@ -19,6 +19,9 @@ tune:
gen:
$(MAKE) -C lib/ gen

mpi_nvtx:
$(MAKE) -C lib/ mpi_nvtx

clean:
$(MAKE) -C lib/ clean
$(MAKE) -C tests/ clean
1 change: 1 addition & 0 deletions README
@@ -203,6 +203,7 @@ Authors:
Ronald Babich (NVIDIA)
Kipton Barros (Los Alamos National Laboratory)
Richard Brower (Boston University)
Nuno Cardoso (NCSA)
Mike Clark (NVIDIA)
Justin Foley (University of Utah)
Joel Giedt (Rensselaer Polytechnic Institute)
25 changes: 25 additions & 0 deletions configure
@@ -607,6 +607,7 @@ BUILD_QDPJIT_INTERFACE
BUILD_CPS_INTERFACE
BUILD_MILC_INTERFACE
BUILD_QDP_INTERFACE
MPI_NVTX
GPU_COMMS
GPU_DIRECT
POSIX_THREADS
@@ -725,6 +726,7 @@ enable_contract
enable_multi_gpu
enable_gpu_direct
enable_gpu_comms
enable_mpi_nvtx
enable_device_pack
with_mpi
enable_pthreads
@@ -1407,6 +1409,8 @@ Optional Features:
enabled)
--enable-gpu-comms Enable direct CUDA-NIC communication (experimental,
default: disabled)
--enable-mpi-nvtx Enable NVTX markup for profiling MPI API calls in
the visual profiler (default: disabled)
--enable-device-pack Enable the packing / unpacking of fields on the
device (default: disabled)
--enable-pthreads Enable pthreads in the multi-GPU dslash build
@@ -2281,6 +2285,15 @@ else
fi
# Check whether --enable-mpi-nvtx was given.
if test "${enable_mpi_nvtx+set}" = set; then :
enableval=$enable_mpi_nvtx; mpi_nvtx=${enableval}
else
mpi_nvtx="no"
fi
# Check whether --enable-device-pack was given.
if test "${enable_device_pack+set}" = set; then :
enableval=$enable_device_pack; device_pack=${enableval}
Expand Down Expand Up @@ -4170,6 +4183,13 @@ yes|no);;
;;
esac
case ${mpi_nvtx} in
yes|no);;
*)
as_fn_error $? " invalid value for --enable-mpi-nvtx " "$LINENO" 5
;;
esac
case ${build_qdpjit} in
yes|no);;
*)
@@ -4383,6 +4403,11 @@ $as_echo "$as_me: Setting GPU_COMMS= ${gpu_comms}" >&6;}
GPU_COMMS=${gpu_comms}
{ $as_echo "$as_me:${as_lineno-$LINENO}: Setting MPI_NVTX= ${mpi_nvtx}" >&5
$as_echo "$as_me: Setting MPI_NVTX= ${mpi_nvtx}" >&6;}
MPI_NVTX=${mpi_nvtx}
{ $as_echo "$as_me:${as_lineno-$LINENO}: Setting BUILD_QDP_INTERFACE= ${build_qdp_interface}" >&5
$as_echo "$as_me: Setting BUILD_QDP_INTERFACE= ${build_qdp_interface}" >&6;}
BUILD_QDP_INTERFACE=${build_qdp_interface}
18 changes: 18 additions & 0 deletions configure.ac
@@ -203,6 +203,13 @@ AC_ARG_ENABLE(gpu-comms,
[ gpu_comms="no" ]
)

dnl enable NVTX markup for MPI calls in the visual profiler
AC_ARG_ENABLE(mpi-nvtx,
AC_HELP_STRING([--enable-mpi-nvtx], [ Enable NVTX markup for profiling MPI API calls in the visual profiler (default: disabled)]),
[ mpi_nvtx=${enableval}],
[ mpi_nvtx="no" ]
)

dnl enable packing and unpacking on the device
AC_ARG_ENABLE(device-pack,
AC_HELP_STRING([--enable-device-pack], [ Enable the packing / unpacking of fields on the device (default: disabled)]),
@@ -544,6 +551,14 @@ yes|no);;
;;
esac

dnl enable NVTX markup for MPI calls in the visual profiler
case ${mpi_nvtx} in
yes|no);;
*)
AC_MSG_ERROR([ invalid value for --enable-mpi-nvtx ])
;;
esac

dnl QDP-JIT support
case ${build_qdpjit} in
yes|no);;
@@ -708,6 +723,9 @@ AC_SUBST( GPU_DIRECT, [${gpu_direct}])
AC_MSG_NOTICE([Setting GPU_COMMS= ${gpu_comms}])
AC_SUBST( GPU_COMMS, [${gpu_comms}])

AC_MSG_NOTICE([Setting MPI_NVTX= ${mpi_nvtx}])
AC_SUBST( MPI_NVTX, [${mpi_nvtx}])

AC_MSG_NOTICE([Setting BUILD_QDP_INTERFACE= ${build_qdp_interface}])
AC_SUBST( BUILD_QDP_INTERFACE, [${build_qdp_interface}])

42 changes: 33 additions & 9 deletions include/comm_quda.h
@@ -29,46 +29,70 @@ extern "C" {
int comm_coord(int dim);

/**
Create a persistent message handler for a relative send
Create a persistent message handler for a relative send. This
should not be called directly; instead, call the helper macro
(without the trailing underscore).
@param buffer Buffer from which message will be sent
@param dim Dimension in which message will be sent
@param dir Direction in which message will be sent (0 = backwards, 1 = forwards)
@param nbytes Size of message in bytes
*/
MsgHandle *comm_declare_send_relative(void *buffer, int dim, int dir, size_t nbytes);
MsgHandle *comm_declare_send_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir, size_t nbytes);

#define comm_declare_send_relative(buffer, dim, dir, nbytes) \
comm_declare_send_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, nbytes)

/**
Create a persistent message handler for a relative receive
Create a persistent message handler for a relative receive. This
should not be called directly; instead, call the helper macro
(without the trailing underscore).
@param buffer Buffer into which message will be received
@param dim Dimension from which message will be received
@param dir Direction from which message will be received (0 = backwards, 1 = forwards)
@param nbytes Size of message in bytes
*/
MsgHandle *comm_declare_receive_relative(void *buffer, int dim, int dir, size_t nbytes);
MsgHandle *comm_declare_receive_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir, size_t nbytes);

#define comm_declare_receive_relative(buffer, dim, dir, nbytes) \
comm_declare_receive_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, nbytes)

/**
Create a persistent strided message handler for a relative send
Create a persistent strided message handler for a relative send.
This should not be called directly; instead, call the helper macro
(without the trailing underscore).
@param buffer Buffer from which message will be sent
@param dim Dimension in which message will be sent
@param dir Direction in which message will be sent (0 = backwards, 1 = forwards)
@param blksize Size of block in bytes
@param nblocks Number of blocks
@param stride Stride between blocks in bytes
*/
MsgHandle *comm_declare_strided_send_relative(void *buffer, int dim, int dir,
size_t blksize, int nblocks, size_t stride);
MsgHandle *comm_declare_strided_send_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir,
size_t blksize, int nblocks, size_t stride);

#define comm_declare_strided_send_relative(buffer, dim, dir, blksize, nblocks, stride) \
comm_declare_strided_send_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, blksize, nblocks, stride)

/**
Create a persistent strided message handler for a relative receive.
This should not be called directly; instead, call the helper macro
(without the trailing underscore).
@param buffer Buffer into which message will be received
@param dim Dimension from which message will be received
@param dir Direction from which message will be received (0 = backwards, 1 = forwards)
@param blksize Size of block in bytes
@param nblocks Number of blocks
@param stride Stride between blocks in bytes
*/
MsgHandle *comm_declare_strided_receive_relative(void *buffer, int dim, int dir,
size_t blksize, int nblocks, size_t stride);
MsgHandle *comm_declare_strided_receive_relative_(const char *func, const char *file, int line,
void *buffer, int dim, int dir,
size_t blksize, int nblocks, size_t stride);

#define comm_declare_strided_receive_relative(buffer, dim, dir, blksize, nblocks, stride) \
comm_declare_strided_receive_relative_(__func__, __FILE__, __LINE__, buffer, dim, dir, blksize, nblocks, stride)

void comm_finalize(void);
void comm_dim_partitioned_set(int dim);
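The reworked `comm_declare_*` entry points use a common C idiom: the public name becomes a macro that forwards `__func__`, `__FILE__`, and `__LINE__` into an underscore-suffixed function, so each message handle can be tagged with its call site (which feeds the profiling annotations this branch adds). A minimal sketch of the pattern with a stubbed-out body — the real functions return a `MsgHandle*`, not the buffer pointer used here:

```c
#include <assert.h>
#include <stddef.h>
#include <string.h>

/* Record the call-site context passed in by the macro, mirroring how
 * the underscore-suffixed comm_declare_*_ functions receive it. */
const char *seen_func = NULL;
int seen_line = 0;

void *comm_declare_send_relative_(const char *func, const char *file,
                                  int line, void *buffer, int dim,
                                  int dir, size_t nbytes) {
  (void)file; (void)dim; (void)dir; (void)nbytes;
  seen_func = func;  /* name of the calling function */
  seen_line = line;  /* line of the macro expansion */
  return buffer;     /* stub; the real API returns a MsgHandle* */
}

/* The helper macro injects the caller's context automatically, so
 * call sites look like an ordinary function call. */
#define comm_declare_send_relative(buffer, dim, dir, nbytes) \
  comm_declare_send_relative_(__func__, __FILE__, __LINE__,  \
                              buffer, dim, dir, nbytes)

const char *demo_caller(void) {
  char buf[16];
  comm_declare_send_relative(buf, 0, 1, sizeof(buf));
  return seen_func;
}
```

Because `__func__` and `__LINE__` expand where the macro is used, the wrapped function sees `"demo_caller"` and the caller's line number, not its own.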
3 changes: 3 additions & 0 deletions lib/Makefile
@@ -179,6 +179,9 @@ gen:
$(PYTHON) generate/dw_dslash_4D_cuda_gen.py
$(PYTHON) generate/fused_exterior_dw_dslash_4D_cuda_gen.py

mpi_nvtx:
$(PYTHON) generate/wrap.py -g -o nvtx_pmpi.c generate/nvtx.w

clean:
-rm -f *.o $(QUDA)

47 changes: 21 additions & 26 deletions lib/color_spinor_field.cpp
@@ -52,32 +52,27 @@ namespace quda {
if (getVerbosity() == QUDA_DEBUG_VERBOSE)
printfQuda("Precision = %d, Subset = %d\n", precision, siteSubset);

int num_faces = 1;
int num_norm_faces=2;

// FIXME - this is a hack from hell that needs to be fixed. When
// the TIFR interface is enabled we are forcing naive staggered
// support which breaks asqtad/hisq fermions. The problem occurs
// because the ghost zone is allocated before we know which
// operator (and hence number of faces are needed). One solution
// may be to separate the ghost zone memory allocation from the
// field itself, which has other benefits (1. on multi-gpu
// machines with UVA, we can read the ghost zone directly from the
// neighbouring field and 2.) we can use a single contiguous
// buffer for the ghost zone and its norm which will reduce
// latency for half precision and allow us to enable GPU_COMMS
// support for half precision).
#ifdef BUILD_TIFR_INTERFACE
if (nSpin == 1) { //staggered
num_faces=2;
num_norm_faces=2;
}
#else
if (nSpin == 1) { // improved staggered
num_faces=6;
num_norm_faces=6;
}
#endif
// FIXME - The ghost zone is allocated before we know which
// operator (and hence how many faces) is needed, so we allocate a
// ghost zone large enough for the maximum number of faces. All
// Wilson-like operators involve the exchange of only one face, so
// this is not a problem. For staggered fermions, however, we have
// either nFace=1 or nFace=3, so we allocate using the latter. This
// artificially raises the GPU memory requirements for naive
// staggered fermions. One potential future solution is to separate
// the ghost zone memory allocation from the field itself, which has
// other benefits: 1) on multi-GPU machines with UVA, we can read the
// ghost zone directly from the neighbouring field, and 2) we can use
// a single contiguous buffer for the ghost zone and its norm, which
// will reduce latency for half precision and allow us to enable
// GPU_COMMS support for half precision.
int nFaceGhost = (nSpin == 1) ? 3 : 1;

// For Wilson-like fields, num_faces counts effective faces since
// the fields are spin projected.
int num_faces = ((nSpin == 1) ? 2 : 1) * nFaceGhost;
int num_norm_faces = 2*nFaceGhost;

// calculate size of ghost zone required
int ghostVolume = 0;
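The replacement logic in color_spinor_field.cpp collapses the old `#ifdef` into arithmetic over `nFace`. Pulling that arithmetic out as standalone helpers (hypothetical names, purely for illustration) makes it easy to check the values against the cases the old code hard-coded:

```c
#include <assert.h>

/* Ghost-zone face counts as in the new allocation logic: staggered
 * (nSpin == 1) fields reserve nFace = 3 to cover improved staggered,
 * while everything else exchanges a single face. */
int ghost_nface(int nSpin) { return (nSpin == 1) ? 3 : 1; }

/* Wilson-like fields are spin projected, so num_faces counts
 * effective faces; staggered fields carry a factor of 2. */
int ghost_num_faces(int nSpin) {
  return ((nSpin == 1) ? 2 : 1) * ghost_nface(nSpin);
}

int ghost_num_norm_faces(int nSpin) { return 2 * ghost_nface(nSpin); }
```

For `nSpin == 1` this reproduces the old improved-staggered values (`num_faces = 6`, `num_norm_faces = 6`), and for Wilson-like `nSpin == 4` the old defaults (`num_faces = 1`, `num_norm_faces = 2`); as the rewritten comment notes, naive staggered fermions now over-allocate.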