This repository has been archived by the owner on Jul 26, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathkronmult3.hpp
108 lines (88 loc) · 2.82 KB
/
kronmult3.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#ifndef KRONMULT3_HPP
#define KRONMULT3_HPP 1
#include "kroncommon.hpp"
#include "kgemm_nt.hpp"
#include "kronmult2.hpp"
// -------------------------------------------
// device function to evaluate
// Y = kron(A1,...,A3)*X as
// W(:,k) = X(:,k) * transpose(A1), k=1:nvec
// Y = kron(A2,..,A3) * W
// -------------------------------------------
template<typename T>
DEVICE_FUNCTION
void kronmult3( int const n,
int const nvec,
T const A1_[],
T const A2_[],
T const A3_[],
T X_[],
T Y_[],
T W_[],
int const lda_in = 0 )
// -----------------
// note A1 is n by n
// X is (n^3 by nvec)
// -----------------
{
int const lda = (lda_in == 0) ? n : lda_in;
int const n2 = n*n;
int const n3 = n*n2;
int const ldX = n3;
int const ldW = n3;
auto X = [&] (int const i,
int const j) -> T& {
return( X_[ indx2f(i,j,ldX) ] );
};
auto W = [&] (int const i,
int const j) -> T& {
return( W_[ indx2f(i,j,ldW) ] );
};
for(int i=1; i <= nvec; i++) {
T *Xi_ = &( X(1, i) );
T *Wi_ = &( W(1, i) );
int const ldXi = n2;
int const ldWi = n2;
// ----------------------------
// Xi viewed as (n^2 by n) array
// Wi viewed as (n^2 by n) array
// ----------------------------
auto Xi = [&] (int const ii,
int const jj) -> T& {
return( Xi_[ indx2f(ii,jj,ldXi) ] );
};
auto Wi = [&] (int const ii,
int const jj) -> T& {
return( Wi_[ indx2f(ii,jj,ldWi) ] );
};
// --------------------------------------------------------
// Wi(1:n^2, 1:n) = Xi(1:n^2, 1:n) * transpose(A1(1:n,1:n))
// --------------------------------------------------------
int const mm = n2;
int const nn = n;
int const kk = n;
T const alpha = 1;
T const beta = 0;
T const * const Ap = &(Xi(1,1));
T const * const Bp = A1_;
T * const Cp = &(Wi(1,1));
int const ld1 = ldXi;
int const ld2 = lda;
int const ld3 = ldWi;
kgemm_nt( mm,nn,kk,
alpha, Ap, ld1,
Bp, ld2,
beta, Cp, ld3 );
};
int const next_nvec = nvec * n;
SYNCTHREADS;
// --------------------------------
// note now X_ is used as workspace
// --------------------------------
{
kronmult2( n, next_nvec,
A2_, A3_,
W_, Y_, X_, lda );
}
}
#endif