Merge branch 'develop' into VirtualParticle_MO_batched

QMCPACK · Sep 19, 2023 · 99074f3 · 99074f3
2 parents 7abe3ba + db96869
commit 99074f3
Show file tree

Hide file tree

Showing 12 changed files with 189 additions and 87 deletions.
diff --git a/docs/intro_wavefunction.rst b/docs/intro_wavefunction.rst
@@ -912,6 +912,68 @@ the associated CSF, and the excitation degree relative to the first determinant.
   scf    2022200000000000000000000000000000000000000000000000000000
   excitation degree  2
 
+.. _orbitalrotation:
+
+Orbital Rotation
+----------------
+Orbital rotation mixes orbitals between those occupied by electrons and those unoccupied by electrons.
+Because it changes the orbitals, a well-chosen optimized orbital rotation can improve the trial wavefunction for VMC,
+can change the nodal structure, and can potentially improve the fixed-node DMC energy.
+
+Combining orbitals is complicated by the need to maintain the normalization of the
+orbitals.
+A rotation matrix will preserve the normalization of the vectors in linear combinations.
+However the entries in a rotation matrix are not independent.
+A rotation matrix can alternatively be expressed as the matrix exponential of a skew-symmetric matrix.
+The entries in that skew-symmetric matrix are independent and can form an independent set of optimizable parameters.
+
+Optimizable orbitals are given in the input file by enclosing an SPO
+in an `rotated_sposet` element.  The `determinant` element `id` attribute should reference the name of the rotated sposet.
+
+The `rotated_sposet` element requires use of the updated `sposet_collection` style.
+
+``rotated_sposet`` element:
+
+.. _Table_rotated_sposet:
+.. table::
+
+     +-----------------+--------------------------+
+     | Parent elements | ``sposet_collection``    |
+     +-----------------+--------------------------+
+     | Child elements  | ``sposet``, ``opt_vars`` |
+     +-----------------+--------------------------+
+
+Attribute:
+
++-----------------+----------+----------+---------+-------------------------+
+| Name            | Datatype | Values   | Default | Description             |
++=================+==========+==========+=========+=========================+
+| ``name``        | Text     |          |         | Name of rotated SPOSet  |
++-----------------+----------+----------+---------+-------------------------+
+
+.. code-block::
+   :caption: Orbital Rotation XML element.
+   :name: Listing 1
+
+   <sposet_collection ...>
+     <rotated_sposet name="rot_spo">
+       <sposet name="spo" size="8">
+         ...
+       </sposet>
+     </rotated_sposet>
+   </sposet_collection>
+   <determinantset>
+     <slaterdeterminant>
+       <determinant sposet="rot_spo"/>
+       <determinant sposet="rot_spo"/>
+     </slaterdeterminant>
+   </determinantset>
+
+
+The `opt_vars` element can be used to specify initial rotation parameters.
+The parameters are given as a space-separated list of numbers in the element text.
+The length of this list must match the expected number of rotation parameters.
+
 .. _backflow:
 
 Backflow Wavefunctions

diff --git a/src/Platforms/tests/SYCL/test_SYCLallocator.cpp b/src/Platforms/tests/SYCL/test_SYCLallocator.cpp
@@ -26,15 +26,11 @@ TEST_CASE("SYCL_allocator", "[SYCL]")
   Vector<double, SYCLAllocator<double>> vec(1024);
   Vector<double> vec_h(1024);
 
-  sycl::event e;
-  {
-    double* V = vec.data();
-
-    e = m_queue.parallel_for(sycl::range<1>{1024}, [=](sycl::id<1> item) { V[item] = item + 1; });
-  }
+  double* V = vec.data();
+  m_queue.parallel_for(sycl::range<1>{1024}, [=](sycl::id<1> item) { V[item] = item + 1; });
 
   //copy to host
-  m_queue.memcpy(vec_h.data(), vec.data(), 1024 * sizeof(double), {e}).wait();
+  m_queue.memcpy(vec_h.data(), vec.data(), 1024 * sizeof(double)).wait();
 
   CHECK(vec_h[0] == 1);
   CHECK(vec_h[77] == 78);
@@ -46,10 +42,8 @@ TEST_CASE("SYCL_host_allocator", "[SYCL]")
   // SYCLHostAllocator
   Vector<double, SYCLHostAllocator<double>> vec(1024, 1);
 
-  {
-    double* V = vec.data();
-    m_queue.parallel_for(sycl::range<1>{1024}, [=](sycl::id<1> item) { V[item] += item + 1; }).wait();
-  }
+  double* V = vec.data();
+  m_queue.parallel_for(sycl::range<1>{1024}, [=](sycl::id<1> item) { V[item] += item + 1; }).wait();
 
   CHECK(vec[0] == 2);
   CHECK(vec[77] == 79);

diff --git a/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h b/src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h
@@ -60,7 +60,6 @@ class DelayedUpdateSYCL
   Matrix<T> Ainv_buffer;
 
   sycl::queue m_queue_;
-  sycl::event ainv_event_;
 
   /// reset delay count to 0
   inline void clearDelayCount()
@@ -131,9 +130,7 @@ class DelayedUpdateSYCL
     if (!prefetched_range.checkRange(rowchanged))
     {
       const int last_row = std::min(rowchanged + Ainv_buffer.rows(), Ainv.rows());
-      m_queue_
-          .memcpy(Ainv_buffer.data(), Ainv_gpu[rowchanged], invRow.size() * (last_row - rowchanged) * sizeof(T),
-                  ainv_event_)
+      m_queue_.memcpy(Ainv_buffer.data(), Ainv_gpu[rowchanged], invRow.size() * (last_row - rowchanged) * sizeof(T))
           .wait();
       prefetched_range.setRange(rowchanged, last_row);
     }
@@ -204,33 +201,33 @@ class DelayedUpdateSYCL
       const int norb     = Ainv.rows();
       const int lda_Binv = Binv.cols();
 
-      auto u_event = m_queue_.memcpy(U_gpu.data(), U.data(), norb * delay_count * sizeof(T));
-      auto b_event = m_queue_.memcpy(Binv_gpu.data(), Binv.data(), lda_Binv * delay_count * sizeof(T));
+      m_queue_.memcpy(U_gpu.data(), U.data(), norb * delay_count * sizeof(T));
+      m_queue_.memcpy(Binv_gpu.data(), Binv.data(), lda_Binv * delay_count * sizeof(T));
 
-      auto temp_event = syclBLAS::gemm(m_queue_, 'T', 'N', delay_count, norb, norb, cone, U_gpu.data(), norb,
-                                       Ainv_gpu.data(), norb, czero, temp_gpu.data(), lda_Binv, {u_event});
+      syclBLAS::gemm(m_queue_, 'T', 'N', delay_count, norb, norb, cone, U_gpu.data(), norb, Ainv_gpu.data(), norb,
+                     czero, temp_gpu.data(), lda_Binv);
 
-      auto temp_v_event = applyW_stageV_sycl(m_queue_, {temp_event}, delay_list.data(), delay_count, temp_gpu.data(),
-                                             norb, temp_gpu.cols(), V_gpu.data(), Ainv_gpu.data());
+      applyW_stageV_sycl(m_queue_, delay_list.data(), delay_count, temp_gpu.data(), norb, temp_gpu.cols(), V_gpu.data(),
+                         Ainv_gpu.data());
 
-      u_event = syclBLAS::gemm(m_queue_, 'N', 'N', norb, delay_count, delay_count, cone, V_gpu.data(), norb,
-                               Binv_gpu.data(), lda_Binv, czero, U_gpu.data(), norb, {temp_v_event, b_event});
+      syclBLAS::gemm(m_queue_, 'N', 'N', norb, delay_count, delay_count, cone, V_gpu.data(), norb, Binv_gpu.data(),
+                     lda_Binv, czero, U_gpu.data(), norb);
 
 #ifdef SYCL_BLOCKING
       syclBLAS::gemm(m_queue_, 'N', 'N', norb, norb, delay_count, -cone, U_gpu.data(), norb, temp_gpu.data(), lda_Binv,
-                     cone, Ainv_gpu.data(), norb, {u_event})
+                     cone, Ainv_gpu.data(), norb)
           .wait();
 #else
-      ainv_event_ = syclBLAS::gemm(m_queue_, 'N', 'N', norb, norb, delay_count, -cone, U_gpu.data(), norb,
-                                   temp_gpu.data(), lda_Binv, cone, Ainv_gpu.data(), norb, {u_event});
+      syclBLAS::gemm(m_queue_, 'N', 'N', norb, norb, delay_count, -cone, U_gpu.data(), norb, temp_gpu.data(), lda_Binv,
+                     cone, Ainv_gpu.data(), norb);
 #endif
 
       clearDelayCount();
     }
 
     // transfer Ainv_gpu to Ainv and wait till completion
     if (transfer_to_host)
-      m_queue_.memcpy(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(T), ainv_event_).wait();
+      m_queue_.memcpy(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(T)).wait();
   }
 };
 } // namespace qmcplusplus

diff --git a/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp b/src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp
@@ -67,12 +67,12 @@ class syclSolverInverter
     const int norb = logdetT.rows();
     resize(norb, m_queue);
 
-    auto c_event = m_queue.memcpy(Mat1_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT));
-    auto t_event = syclBLAS::transpose(m_queue, Mat1_gpu.data(), norb, Mat1_gpu.cols(), Ainv_gpu.data(), norb,
-                                       Ainv_gpu.cols(), {c_event});
+    m_queue.memcpy(Mat1_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT));
+    syclBLAS::transpose(m_queue, Mat1_gpu.data(), norb, Mat1_gpu.cols(), Ainv_gpu.data(), norb,
+                                       Ainv_gpu.cols());
     try
     {
-      syclSolver::getrf(m_queue, norb, norb, Ainv_gpu.data(), norb, ipiv.data(), workspace.data(), getrf_ws, {t_event})
+      syclSolver::getrf(m_queue, norb, norb, Ainv_gpu.data(), norb, ipiv.data(), workspace.data(), getrf_ws)
           .wait();
     }
     catch (sycl::exception const& ex)
@@ -86,9 +86,9 @@ class syclSolverInverter
 
     log_value = computeLogDet_sycl<TREAL>(m_queue, norb, Ainv_gpu.cols(), Ainv_gpu.data(), ipiv.data());
 
-    c_event = syclSolver::getri(m_queue, norb, Ainv_gpu.data(), norb, ipiv.data(), workspace.data(), getri_ws);
+    syclSolver::getri(m_queue, norb, Ainv_gpu.data(), norb, ipiv.data(), workspace.data(), getri_ws);
 
-    m_queue.memcpy(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), {c_event}).wait();
+    m_queue.memcpy(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT)).wait();
   }
 
   /** compute the inverse of the transpose of matrix A and its determinant value in log
@@ -105,15 +105,15 @@ class syclSolverInverter
     const int norb = logdetT.rows();
     resize(norb, m_queue);
     //use Ainv_gpu for transpose
-    auto c_event = m_queue.memcpy(Ainv_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT));
+    m_queue.memcpy(Ainv_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT));
     //transpose
-    auto t_event = syclBLAS::transpose(m_queue, Ainv_gpu.data(), norb, Ainv_gpu.cols(), Mat1_gpu.data(), norb,
-                                       Mat1_gpu.cols(), {c_event});
+    syclBLAS::transpose(m_queue, Ainv_gpu.data(), norb, Ainv_gpu.cols(), Mat1_gpu.data(), norb,
+                                       Mat1_gpu.cols());
 
     //getrf (LU) -> getri (inverse)
     try
     {
-      syclSolver::getrf(m_queue, norb, norb, Mat1_gpu.data(), norb, ipiv.data(), workspace.data(), getrf_ws, {t_event})
+      syclSolver::getrf(m_queue, norb, norb, Mat1_gpu.data(), norb, ipiv.data(), workspace.data(), getrf_ws)
           .wait();
     }
     catch (sycl::exception const& ex)
@@ -127,11 +127,11 @@ class syclSolverInverter
 
     log_value = computeLogDet_sycl<TREAL>(m_queue, norb, Mat1_gpu.cols(), Mat1_gpu.data(), ipiv.data());
 
-    c_event = syclSolver::getri(m_queue, norb, Mat1_gpu.data(), norb, ipiv.data(), workspace.data(), getri_ws);
+    syclSolver::getri(m_queue, norb, Mat1_gpu.data(), norb, ipiv.data(), workspace.data(), getri_ws);
 
-    t_event = syclBLAS::copy_n(m_queue, Mat1_gpu.data(), Mat1_gpu.size(), Ainv_gpu.data(), {c_event});
+    syclBLAS::copy_n(m_queue, Mat1_gpu.data(), Mat1_gpu.size(), Ainv_gpu.data());
 
-    m_queue.memcpy(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), {t_event}).wait();
+    m_queue.memcpy(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT)).wait();
   }
 };
 } // namespace qmcplusplus

diff --git a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp
@@ -17,14 +17,14 @@ namespace qmcplusplus
 {
 template<typename T>
 sycl::event applyW_stageV_sycl(sycl::queue& aq,
-                               const std::vector<sycl::event>& dependencies,
                                const int* restrict delay_list_gpu,
                                const int delay_count,
                                T* restrict temp_gpu,
                                const int numorbs,
                                const int ndelay,
                                T* restrict V_gpu,
-                               const T* restrict Ainv)
+                               const T* restrict Ainv,
+                               const std::vector<sycl::event>& dependencies)
 {
   const size_t BS = 128;
   const size_t NB = (numorbs + BS - 1) / BS;
@@ -48,44 +48,44 @@ sycl::event applyW_stageV_sycl(sycl::queue& aq,
 }
 
 template sycl::event applyW_stageV_sycl(sycl::queue& aq,
-                                        const std::vector<sycl::event>& dependencies,
                                         const int* restrict delay_list_gpu,
                                         const int delay_count,
                                         float* restrict temp_gpu,
                                         const int numorbs,
                                         const int ndelay,
                                         float* restrict V_gpu,
-                                        const float* restrict Ainv);
+                                        const float* restrict Ainv,
+                                        const std::vector<sycl::event>& dependencies);
 
 template sycl::event applyW_stageV_sycl(sycl::queue& aq,
-                                        const std::vector<sycl::event>& dependencies,
                                         const int* restrict delay_list_gpu,
                                         const int delay_count,
                                         double* restrict temp_gpu,
                                         const int numorbs,
                                         const int ndelay,
                                         double* restrict V_gpu,
-                                        const double* restrict Ainv);
+                                        const double* restrict Ainv,
+                                        const std::vector<sycl::event>& dependencies);
 
 template sycl::event applyW_stageV_sycl(sycl::queue& aq,
-                                        const std::vector<sycl::event>& dependencies,
                                         const int* restrict delay_list_gpu,
                                         const int delay_count,
                                         std::complex<float>* restrict temp_gpu,
                                         const int numorbs,
                                         const int ndelay,
                                         std::complex<float>* restrict V_gpu,
-                                        const std::complex<float>* restrict Ainv);
+                                        const std::complex<float>* restrict Ainv,
+                                        const std::vector<sycl::event>& dependencies);
 
 template sycl::event applyW_stageV_sycl(sycl::queue& aq,
-                                        const std::vector<sycl::event>& dependencies,
                                         const int* restrict delay_list_gpu,
                                         const int delay_count,
                                         std::complex<double>* restrict temp_gpu,
                                         const int numorbs,
                                         const int ndelay,
                                         std::complex<double>* restrict V_gpu,
-                                        const std::complex<double>* restrict Ainv);
+                                        const std::complex<double>* restrict Ainv,
+                                        const std::vector<sycl::event>& dependencies);
 
 
 template<typename T, typename TMAT, typename INDEX>

diff --git a/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp b/src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp
@@ -22,14 +22,14 @@ namespace qmcplusplus
 {
 template<typename T>
 sycl::event applyW_stageV_sycl(sycl::queue& aq,
-                               const std::vector<sycl::event>& dependencies,
                                const int* delay_list_gpu,
                                const int delay_count,
                                T* temp_gpu,
                                const int numorbs,
                                const int ndelay,
                                T* V_gpu,
-                               const T* Ainv);
+                               const T* Ainv,
+                               const std::vector<sycl::event>& dependencies = {});
 
 template<typename T, typename TMAT, typename INDEX>
 std::complex<T> computeLogDet_sycl(sycl::queue& aq,

diff --git a/src/QMCWaveFunctions/tests/test_ConstantSPOSet.cpp b/src/QMCWaveFunctions/tests/test_ConstantSPOSet.cpp
@@ -114,8 +114,10 @@ TEST_CASE("ConstantSPOSet", "[wavefunction]")
   const int last_index  = 2;
   sposet->evaluate_notranspose(elec, first_index, last_index, phimat, gphimat, lphimat);
 
-  checkMatrix(phimat, spomat);
-  checkMatrix(lphimat, laplspomat);
+  auto check = checkMatrix(phimat, spomat);
+  CHECKED_ELSE(check.result) { FAIL(check.result_message); }
+  check = checkMatrix(lphimat, laplspomat);
+  CHECKED_ELSE(check.result) { FAIL(check.result_message); }
 
   //Test of makeClone()
   auto sposet_vgl2 = sposet->makeClone();
@@ -125,8 +127,10 @@ TEST_CASE("ConstantSPOSet", "[wavefunction]")
 
   sposet_vgl2->evaluate_notranspose(elec, first_index, last_index, phimat, gphimat, lphimat);
 
-  checkMatrix(phimat, spomat);
-  checkMatrix(lphimat, laplspomat);
+  check = checkMatrix(phimat, spomat);
+  CHECKED_ELSE(check.result) { FAIL(check.result_message); }
+  check = checkMatrix(lphimat, laplspomat);
+  CHECKED_ELSE(check.result) { FAIL(check.result_message); }
 
   //Lastly, check if name is correct.
   std::string myname = sposet_vgl2->getClassName();