Skip to content

Commit

Permalink
Merge branch 'develop' into VirtualParticle_MO_batched
Browse files Browse the repository at this point in the history
  • Loading branch information
prckent authored Sep 19, 2023
2 parents 7abe3ba + db96869 commit 99074f3
Show file tree
Hide file tree
Showing 12 changed files with 189 additions and 87 deletions.
62 changes: 62 additions & 0 deletions docs/intro_wavefunction.rst
Original file line number Diff line number Diff line change
Expand Up @@ -912,6 +912,68 @@ the associated CSF, and the excitation degree relative to the first determinant.
scf 2022200000000000000000000000000000000000000000000000000000
excitation degree 2

.. _orbitalrotation:

Orbital Rotation
----------------
Orbital rotation mixes orbitals between those occupied by electrons and those unoccupied by electrons.
Because it changes the orbitals, a well-chosen optimized orbital rotation can improve the trial wavefunction for VMC,
can change the nodal structure, and can potentially improve the fixed-node DMC energy.

Combining orbitals is complicated by the need to maintain the normalization of the
orbitals.
A rotation matrix will preserve the normalization of the vectors in linear combinations.
However the entries in a rotation matrix are not independent.
A rotation matrix can alternatively be expressed as the matrix exponential of a skew-symmetric matrix.
The entries in that skew-symmetric matrix are independent and can form an independent set of optimizable parameters.

Optimizable orbitals are given in the input file by enclosing an SPO
in an `rotated_sposet` element. The `determinant` element `id` attribute should reference the name of the rotated sposet.

The `rotated_sposet` element requires use of the updated `sposet_collection` style.

``rotated_sposet`` element:

.. _Table_rotated_sposet:
.. table::

+-----------------+--------------------------+
| Parent elements | ``sposet_collection`` |
+-----------------+--------------------------+
| Child elements | ``sposet``, ``opt_vars`` |
+-----------------+--------------------------+

Attribute:

+-----------------+----------+----------+---------+-------------------------+
| Name | Datatype | Values | Default | Description |
+=================+==========+==========+=========+=========================+
| ``name`` | Text | | | Name of rotated SPOSet |
+-----------------+----------+----------+---------+-------------------------+

.. code-block::
:caption: Orbital Rotation XML element.
:name: Listing 1
<sposet_collection ...>
<rotated_sposet name="rot_spo">
<sposet name="spo" size="8">
...
</sposet>
</rotated_sposet>
</sposet_collection>
<determinantset>
<slaterdeterminant>
<determinant sposet="rot_spo"/>
<determinant sposet="rot_spo"/>
</slaterdeterminant>
</determinantset>
The `opt_vars` element can be used to specify initial rotation parameters.
The parameters are given as a space-separated list of numbers in the element text.
The length of this list must match the expected number of rotation parameters.

.. _backflow:

Backflow Wavefunctions
Expand Down
16 changes: 5 additions & 11 deletions src/Platforms/tests/SYCL/test_SYCLallocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,11 @@ TEST_CASE("SYCL_allocator", "[SYCL]")
Vector<double, SYCLAllocator<double>> vec(1024);
Vector<double> vec_h(1024);

sycl::event e;
{
double* V = vec.data();

e = m_queue.parallel_for(sycl::range<1>{1024}, [=](sycl::id<1> item) { V[item] = item + 1; });
}
double* V = vec.data();
m_queue.parallel_for(sycl::range<1>{1024}, [=](sycl::id<1> item) { V[item] = item + 1; });

//copy to host
m_queue.memcpy(vec_h.data(), vec.data(), 1024 * sizeof(double), {e}).wait();
m_queue.memcpy(vec_h.data(), vec.data(), 1024 * sizeof(double)).wait();

CHECK(vec_h[0] == 1);
CHECK(vec_h[77] == 78);
Expand All @@ -46,10 +42,8 @@ TEST_CASE("SYCL_host_allocator", "[SYCL]")
// SYCLHostAllocator
Vector<double, SYCLHostAllocator<double>> vec(1024, 1);

{
double* V = vec.data();
m_queue.parallel_for(sycl::range<1>{1024}, [=](sycl::id<1> item) { V[item] += item + 1; }).wait();
}
double* V = vec.data();
m_queue.parallel_for(sycl::range<1>{1024}, [=](sycl::id<1> item) { V[item] += item + 1; }).wait();

CHECK(vec[0] == 2);
CHECK(vec[77] == 79);
Expand Down
29 changes: 13 additions & 16 deletions src/QMCWaveFunctions/Fermion/DelayedUpdateSYCL.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ class DelayedUpdateSYCL
Matrix<T> Ainv_buffer;

sycl::queue m_queue_;
sycl::event ainv_event_;

/// reset delay count to 0
inline void clearDelayCount()
Expand Down Expand Up @@ -131,9 +130,7 @@ class DelayedUpdateSYCL
if (!prefetched_range.checkRange(rowchanged))
{
const int last_row = std::min(rowchanged + Ainv_buffer.rows(), Ainv.rows());
m_queue_
.memcpy(Ainv_buffer.data(), Ainv_gpu[rowchanged], invRow.size() * (last_row - rowchanged) * sizeof(T),
ainv_event_)
m_queue_.memcpy(Ainv_buffer.data(), Ainv_gpu[rowchanged], invRow.size() * (last_row - rowchanged) * sizeof(T))
.wait();
prefetched_range.setRange(rowchanged, last_row);
}
Expand Down Expand Up @@ -204,33 +201,33 @@ class DelayedUpdateSYCL
const int norb = Ainv.rows();
const int lda_Binv = Binv.cols();

auto u_event = m_queue_.memcpy(U_gpu.data(), U.data(), norb * delay_count * sizeof(T));
auto b_event = m_queue_.memcpy(Binv_gpu.data(), Binv.data(), lda_Binv * delay_count * sizeof(T));
m_queue_.memcpy(U_gpu.data(), U.data(), norb * delay_count * sizeof(T));
m_queue_.memcpy(Binv_gpu.data(), Binv.data(), lda_Binv * delay_count * sizeof(T));

auto temp_event = syclBLAS::gemm(m_queue_, 'T', 'N', delay_count, norb, norb, cone, U_gpu.data(), norb,
Ainv_gpu.data(), norb, czero, temp_gpu.data(), lda_Binv, {u_event});
syclBLAS::gemm(m_queue_, 'T', 'N', delay_count, norb, norb, cone, U_gpu.data(), norb, Ainv_gpu.data(), norb,
czero, temp_gpu.data(), lda_Binv);

auto temp_v_event = applyW_stageV_sycl(m_queue_, {temp_event}, delay_list.data(), delay_count, temp_gpu.data(),
norb, temp_gpu.cols(), V_gpu.data(), Ainv_gpu.data());
applyW_stageV_sycl(m_queue_, delay_list.data(), delay_count, temp_gpu.data(), norb, temp_gpu.cols(), V_gpu.data(),
Ainv_gpu.data());

u_event = syclBLAS::gemm(m_queue_, 'N', 'N', norb, delay_count, delay_count, cone, V_gpu.data(), norb,
Binv_gpu.data(), lda_Binv, czero, U_gpu.data(), norb, {temp_v_event, b_event});
syclBLAS::gemm(m_queue_, 'N', 'N', norb, delay_count, delay_count, cone, V_gpu.data(), norb, Binv_gpu.data(),
lda_Binv, czero, U_gpu.data(), norb);

#ifdef SYCL_BLOCKING
syclBLAS::gemm(m_queue_, 'N', 'N', norb, norb, delay_count, -cone, U_gpu.data(), norb, temp_gpu.data(), lda_Binv,
cone, Ainv_gpu.data(), norb, {u_event})
cone, Ainv_gpu.data(), norb)
.wait();
#else
ainv_event_ = syclBLAS::gemm(m_queue_, 'N', 'N', norb, norb, delay_count, -cone, U_gpu.data(), norb,
temp_gpu.data(), lda_Binv, cone, Ainv_gpu.data(), norb, {u_event});
syclBLAS::gemm(m_queue_, 'N', 'N', norb, norb, delay_count, -cone, U_gpu.data(), norb, temp_gpu.data(), lda_Binv,
cone, Ainv_gpu.data(), norb);
#endif

clearDelayCount();
}

// transfer Ainv_gpu to Ainv and wait till completion
if (transfer_to_host)
m_queue_.memcpy(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(T), ainv_event_).wait();
m_queue_.memcpy(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(T)).wait();
}
};
} // namespace qmcplusplus
Expand Down
26 changes: 13 additions & 13 deletions src/QMCWaveFunctions/Fermion/syclSolverInverter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,12 @@ class syclSolverInverter
const int norb = logdetT.rows();
resize(norb, m_queue);

auto c_event = m_queue.memcpy(Mat1_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT));
auto t_event = syclBLAS::transpose(m_queue, Mat1_gpu.data(), norb, Mat1_gpu.cols(), Ainv_gpu.data(), norb,
Ainv_gpu.cols(), {c_event});
m_queue.memcpy(Mat1_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT));
syclBLAS::transpose(m_queue, Mat1_gpu.data(), norb, Mat1_gpu.cols(), Ainv_gpu.data(), norb,
Ainv_gpu.cols());
try
{
syclSolver::getrf(m_queue, norb, norb, Ainv_gpu.data(), norb, ipiv.data(), workspace.data(), getrf_ws, {t_event})
syclSolver::getrf(m_queue, norb, norb, Ainv_gpu.data(), norb, ipiv.data(), workspace.data(), getrf_ws)
.wait();
}
catch (sycl::exception const& ex)
Expand All @@ -86,9 +86,9 @@ class syclSolverInverter

log_value = computeLogDet_sycl<TREAL>(m_queue, norb, Ainv_gpu.cols(), Ainv_gpu.data(), ipiv.data());

c_event = syclSolver::getri(m_queue, norb, Ainv_gpu.data(), norb, ipiv.data(), workspace.data(), getri_ws);
syclSolver::getri(m_queue, norb, Ainv_gpu.data(), norb, ipiv.data(), workspace.data(), getri_ws);

m_queue.memcpy(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), {c_event}).wait();
m_queue.memcpy(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT)).wait();
}

/** compute the inverse of the transpose of matrix A and its determinant value in log
Expand All @@ -105,15 +105,15 @@ class syclSolverInverter
const int norb = logdetT.rows();
resize(norb, m_queue);
//use Ainv_gpu for transpose
auto c_event = m_queue.memcpy(Ainv_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT));
m_queue.memcpy(Ainv_gpu.data(), logdetT.data(), logdetT.size() * sizeof(TMAT));
//transpose
auto t_event = syclBLAS::transpose(m_queue, Ainv_gpu.data(), norb, Ainv_gpu.cols(), Mat1_gpu.data(), norb,
Mat1_gpu.cols(), {c_event});
syclBLAS::transpose(m_queue, Ainv_gpu.data(), norb, Ainv_gpu.cols(), Mat1_gpu.data(), norb,
Mat1_gpu.cols());

//getrf (LU) -> getri (inverse)
try
{
syclSolver::getrf(m_queue, norb, norb, Mat1_gpu.data(), norb, ipiv.data(), workspace.data(), getrf_ws, {t_event})
syclSolver::getrf(m_queue, norb, norb, Mat1_gpu.data(), norb, ipiv.data(), workspace.data(), getrf_ws)
.wait();
}
catch (sycl::exception const& ex)
Expand All @@ -127,11 +127,11 @@ class syclSolverInverter

log_value = computeLogDet_sycl<TREAL>(m_queue, norb, Mat1_gpu.cols(), Mat1_gpu.data(), ipiv.data());

c_event = syclSolver::getri(m_queue, norb, Mat1_gpu.data(), norb, ipiv.data(), workspace.data(), getri_ws);
syclSolver::getri(m_queue, norb, Mat1_gpu.data(), norb, ipiv.data(), workspace.data(), getri_ws);

t_event = syclBLAS::copy_n(m_queue, Mat1_gpu.data(), Mat1_gpu.size(), Ainv_gpu.data(), {c_event});
syclBLAS::copy_n(m_queue, Mat1_gpu.data(), Mat1_gpu.size(), Ainv_gpu.data());

m_queue.memcpy(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT), {t_event}).wait();
m_queue.memcpy(Ainv.data(), Ainv_gpu.data(), Ainv.size() * sizeof(TMAT)).wait();
}
};
} // namespace qmcplusplus
Expand Down
20 changes: 10 additions & 10 deletions src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@ namespace qmcplusplus
{
template<typename T>
sycl::event applyW_stageV_sycl(sycl::queue& aq,
const std::vector<sycl::event>& dependencies,
const int* restrict delay_list_gpu,
const int delay_count,
T* restrict temp_gpu,
const int numorbs,
const int ndelay,
T* restrict V_gpu,
const T* restrict Ainv)
const T* restrict Ainv,
const std::vector<sycl::event>& dependencies)
{
const size_t BS = 128;
const size_t NB = (numorbs + BS - 1) / BS;
Expand All @@ -48,44 +48,44 @@ sycl::event applyW_stageV_sycl(sycl::queue& aq,
}

template sycl::event applyW_stageV_sycl(sycl::queue& aq,
const std::vector<sycl::event>& dependencies,
const int* restrict delay_list_gpu,
const int delay_count,
float* restrict temp_gpu,
const int numorbs,
const int ndelay,
float* restrict V_gpu,
const float* restrict Ainv);
const float* restrict Ainv,
const std::vector<sycl::event>& dependencies);

template sycl::event applyW_stageV_sycl(sycl::queue& aq,
const std::vector<sycl::event>& dependencies,
const int* restrict delay_list_gpu,
const int delay_count,
double* restrict temp_gpu,
const int numorbs,
const int ndelay,
double* restrict V_gpu,
const double* restrict Ainv);
const double* restrict Ainv,
const std::vector<sycl::event>& dependencies);

template sycl::event applyW_stageV_sycl(sycl::queue& aq,
const std::vector<sycl::event>& dependencies,
const int* restrict delay_list_gpu,
const int delay_count,
std::complex<float>* restrict temp_gpu,
const int numorbs,
const int ndelay,
std::complex<float>* restrict V_gpu,
const std::complex<float>* restrict Ainv);
const std::complex<float>* restrict Ainv,
const std::vector<sycl::event>& dependencies);

template sycl::event applyW_stageV_sycl(sycl::queue& aq,
const std::vector<sycl::event>& dependencies,
const int* restrict delay_list_gpu,
const int delay_count,
std::complex<double>* restrict temp_gpu,
const int numorbs,
const int ndelay,
std::complex<double>* restrict V_gpu,
const std::complex<double>* restrict Ainv);
const std::complex<double>* restrict Ainv,
const std::vector<sycl::event>& dependencies);


template<typename T, typename TMAT, typename INDEX>
Expand Down
4 changes: 2 additions & 2 deletions src/QMCWaveFunctions/detail/SYCL/sycl_determinant_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@ namespace qmcplusplus
{
template<typename T>
sycl::event applyW_stageV_sycl(sycl::queue& aq,
const std::vector<sycl::event>& dependencies,
const int* delay_list_gpu,
const int delay_count,
T* temp_gpu,
const int numorbs,
const int ndelay,
T* V_gpu,
const T* Ainv);
const T* Ainv,
const std::vector<sycl::event>& dependencies = {});

template<typename T, typename TMAT, typename INDEX>
std::complex<T> computeLogDet_sycl(sycl::queue& aq,
Expand Down
12 changes: 8 additions & 4 deletions src/QMCWaveFunctions/tests/test_ConstantSPOSet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,10 @@ TEST_CASE("ConstantSPOSet", "[wavefunction]")
const int last_index = 2;
sposet->evaluate_notranspose(elec, first_index, last_index, phimat, gphimat, lphimat);

checkMatrix(phimat, spomat);
checkMatrix(lphimat, laplspomat);
auto check = checkMatrix(phimat, spomat);
CHECKED_ELSE(check.result) { FAIL(check.result_message); }
check = checkMatrix(lphimat, laplspomat);
CHECKED_ELSE(check.result) { FAIL(check.result_message); }

//Test of makeClone()
auto sposet_vgl2 = sposet->makeClone();
Expand All @@ -125,8 +127,10 @@ TEST_CASE("ConstantSPOSet", "[wavefunction]")

sposet_vgl2->evaluate_notranspose(elec, first_index, last_index, phimat, gphimat, lphimat);

checkMatrix(phimat, spomat);
checkMatrix(lphimat, laplspomat);
check = checkMatrix(phimat, spomat);
CHECKED_ELSE(check.result) { FAIL(check.result_message); }
check = checkMatrix(lphimat, laplspomat);
CHECKED_ELSE(check.result) { FAIL(check.result_message); }

//Lastly, check if name is correct.
std::string myname = sposet_vgl2->getClassName();
Expand Down
Loading

0 comments on commit 99074f3

Please sign in to comment.