This repository has been archived by the owner on Jul 26, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathkronmult6_batched.hpp
94 lines (75 loc) · 2.7 KB
/
kronmult6_batched.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#ifndef KRONMULT6_BATCHED_HPP
#define KRONMULT6_BATCHED_HPP 1
#include "kroncommon.hpp"
#include "kronmult6.hpp"
// ====================================================================
// Batched 6-factor Kronecker product:
//
//   Y(:,k) = kron( A1(k), A2(k), ..., A6(k) ) * X(:,k),  k = 1..batchCount
//
// The result for batch entry k is delivered in Y; X and W serve as
// temporary work space and may be overwritten.
//
// Conceptual shapes (addressed with the 1-based indx2f / indx4f helpers):
//   Aarray_     : (n, n, 6, batchCount)
//   X_, Y_, W_  : (n^6, batchCount)
// ====================================================================
template<typename T>
GLOBAL_FUNCTION
void kronmult6_batched(
                       int const n,
                       T const Aarray_[],
                       T X_[],
                       T Y_[],
                       T W_[],
                       int const batchCount)
{
#ifdef USE_GPU
        // ----------------------------------------------------------
        // batch indices are 1-based (matlab convention): this block
        // begins at entry blockIdx.x+1 and advances by gridDim.x.
        // Only a one-dimensional grid is expected.
        // ----------------------------------------------------------
        int const first_batch  = blockIdx.x + 1;
        int const batch_stride = gridDim.x;
        expect( gridDim.y == 1);
        expect( gridDim.z == 1);
#else
        // host build: visit every batch entry 1..batchCount in turn
        int const first_batch  = 1;
        int const batch_stride = 1;
#endif

        // leading dimension of X_, Y_, W_ :  n^6
        int const nsq  = n*n;
        int const ncol = nsq * (nsq*nsq);

#ifndef USE_GPU
        #pragma omp parallel for
#endif
        for(int k = first_batch; k <= batchCount; k += batch_stride) {
                // address of element (1,k) in each of the vector arrays
                T* const xk = &( X_[ indx2f(1, k, ncol) ] );
                T* const yk = &( Y_[ indx2f(1, k, ncol) ] );
                T* const wk = &( W_[ indx2f(1, k, ncol) ] );

                // address of element (1,1,m,k) for each factor m = 1..6
                T const * const a1 = &( Aarray_[ indx4f(1,1,1,k, n,n,6 ) ] );
                T const * const a2 = &( Aarray_[ indx4f(1,1,2,k, n,n,6 ) ] );
                T const * const a3 = &( Aarray_[ indx4f(1,1,3,k, n,n,6 ) ] );
                T const * const a4 = &( Aarray_[ indx4f(1,1,4,k, n,n,6 ) ] );
                T const * const a5 = &( Aarray_[ indx4f(1,1,5,k, n,n,6 ) ] );
                T const * const a6 = &( Aarray_[ indx4f(1,1,6,k, n,n,6 ) ] );

                // kronmult6 is invoked with nvec = 1
                // (presumably a single vector per batch entry -- see kronmult6.hpp)
                int const nvec = 1;
                kronmult6( n, nvec, a1,a2,a3,a4,a5,a6, xk, yk, wk );
        }
}
#endif