This repository has been archived by the owner on Jul 26, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathkronmult2.hpp
109 lines (87 loc) · 2.73 KB
/
kronmult2.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#ifndef KRONMULT2_HPP
#define KRONMULT2_HPP 1
#include "kroncommon.hpp"
#include "kgemm_nt.hpp"
#include "kronmult1.hpp"
// -------------------------------------------
// device function to evaluate
// Y = kron(A1,A2)*X as
// W(:,k) = X(:,k) * transpose(A1), k=1:nvec
// Y = A2*W
// -------------------------------------------
template<typename T>
DEVICE_FUNCTION
void kronmult2( int const n,
int const nvec,
T const A1_[],
T const A2_[],
T X_[],
T Y_[],
T W_[],
int const lda_in = 0
)
// -----------------
// note A1 is n by n
// X is (n^3 by nvec)
// -----------------
{
int const lda = (lda_in == 0) ? n : lda_in;
int const n2 = n*n;
int const ldX = n2;
int const ldW = n2;
auto X = [&] (int const i,
int const j) -> T& {
return( X_[ indx2f(i,j,ldX) ] );
};
auto W = [&] (int const i,
int const j) -> T& {
return( W_[ indx2f(i,j,ldW) ] );
};
for(int i=1; i <= nvec; i++) {
T *Xi_ = &( X(1, i) );
T *Wi_ = &( W(1, i) );
int const ldXi = n;
int const ldWi = n;
// ----------------------------
// Xi viewed as (n by n) array
// Wi viewed as (n by n) array
// ----------------------------
auto Xi = [&] (int const ii,
int const jj) -> T& {
return( Xi_[ indx2f(ii,jj,ldXi) ] );
};
auto Wi = [&] (int const ii,
int const jj) -> T& {
return( Wi_[ indx2f(ii,jj,ldWi) ] );
};
// --------------------------------------------------------
// Wi(1:n, 1:n) = Xi(1:n, 1:n) * transpose(A1(1:n,1:n))
// --------------------------------------------------------
int const mm = n;
int const nn = n;
int const kk = n;
T const alpha = 1;
T const beta = 0;
T const * const Ap = &(Xi(1,1));
T const * const Bp = A1_;
T * const Cp = &(Wi(1,1));
int const ld1 = ldXi;
int const ld2 = lda;
int const ld3 = ldWi;
kgemm_nt( mm,nn,kk,
alpha, Ap, ld1,
Bp, ld2,
beta, Cp, ld3 );
};
int const next_nvec = nvec * n;
// --------------------------------
// note now X_ is used as workspace
// --------------------------------
SYNCTHREADS;
{
kronmult1( n, next_nvec,
A2_,
W_, Y_, X_, lda );
}
}
#endif