-
Notifications
You must be signed in to change notification settings - Fork 15
/
dil_tal_cp.F90
497 lines (491 loc) · 24.7 KB
/
dil_tal_cp.F90
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
!TAL_CP: Tensor Algebra Library Routines for Multicore CPUs and Intel MIC.
!REVISION: 2015/01/15 (started 2013/05)
!Copyright (C) 2015 Dmitry I. Lyakh (email: [email protected])
!LICENSE: BSD 3-Clause
!-------------------------------------------------------------------------------
!NOTES:
! # Fortran 2003 at least + OpenMP.
!OPTIONS:
! # -DUSE_MIC: enables MIC offloading directives (disabled by default).
! # -DNO_OMP: disables OpenMP multithreading (enabled by default).
module dil_tal_cp
use, intrinsic:: ISO_C_BINDING
#ifndef NO_OMP
use omp_lib
#endif
implicit none
!PARAMETERS:
integer, parameter, private:: INTD=4 !default integer kind (size)
integer, parameter, private:: INTL=C_SIZE_T !long integer kind (size), used for addressing
integer(INTD), parameter, private:: INT_BLAS=INTD !default integer size for BLAS/LAPACK
integer(INTD), parameter, private:: MAX_TENSOR_RANK=16 !max allowed tensor rank
integer(INTD), parameter, private:: MAX_THREADS=1024 !max number of CPU threads on multi-core CPU (or MIC)
integer(INTD), private:: CONS_OUT=6 !console output device (defaults to screen)
logical, private:: VERBOSE=.true. !verbosity (for errors)
logical, private:: DIL_DEBUG=.false. !debugging
#ifdef USE_MIC
!DIR$ ATTRIBUTES OFFLOAD:mic:: INTD,INTL,INT_BLAS,MAX_TENSOR_RANK,MAX_THREADS,CONS_OUT,VERBOSE,DIL_DEBUG
!DIR$ ATTRIBUTES ALIGN:128:: INTD,INTL,INT_BLAS,MAX_TENSOR_RANK,MAX_THREADS,CONS_OUT,VERBOSE,DIL_DEBUG
#endif
!FUNCTION visibility:
private divide_segment
private thread_wtime
private permutation_invert
private permutation_trivial
public dil_tensor_slice
public dil_tensor_insert
public dil_tensor_transpose
contains
!------------------------------------------------------------------------
#ifdef USE_MIC
!DIR$ ATTRIBUTES OFFLOAD:mic:: divide_segment
#endif
subroutine divide_segment(seg_range,subseg_num,subseg_sizes,ierr) !SERIAL
!A segment of extent <seg_range> will be divided into <subseg_num> subsegments maximally uniformly.
!The length of each subsegment will be returned in the array <subseg_sizes(1:subseg_num)>.
!Any two subsegments will not differ in length by more than 1, longer subsegments preceding the shorter ones.
implicit none
integer(INTL), intent(in):: seg_range !in: total segment extent
integer(INTL), intent(in):: subseg_num !in: number of subsegments requested
integer(INTL), intent(out):: subseg_sizes(1:subseg_num) !out: lengths of the subsegments
integer(INTD), intent(inout):: ierr !error code (0:success)
integer(INTL):: i,j,k,l,m,n
ierr=0_INTD
if(seg_range.gt.0_INTL.and.subseg_num.gt.0_INTL) then
n=seg_range/subseg_num; m=mod(seg_range,subseg_num)
do i=1_INTL,m; subseg_sizes(i)=n+1_INTL; enddo
do i=m+1_INTL,subseg_num; subseg_sizes(i)=n; enddo
else
ierr=-1_INTD
endif
return
end subroutine divide_segment
!----------------------------------------------------
#ifdef USE_MIC
!DIR$ ATTRIBUTES OFFLOAD:mic:: thread_wtime
#endif
function thread_wtime(time_start) result(thw) !SERIAL
!Returns the time in seconds.
implicit none
real(8), intent(in), optional:: time_start !in: clock start time
real(8):: thw
#ifndef NO_OMP
thw=omp_get_wtime()
#else
call cpu_time(thw)
#endif
if(present(time_start)) thw=thw-time_start
return
end function thread_wtime
!--------------------------------------------------------
#ifdef USE_MIC
!DIR$ ATTRIBUTES OFFLOAD:mic:: permutation_invert
#endif
subroutine permutation_invert(plen,pin,pout,ierr) !SERIAL
!This subroutine returns an inverse for a given permutation.
!Numeration starts from 1.
implicit none
integer(INTD), intent(in):: plen !in: length of the permutation
integer(INTD), intent(in):: pin(1:plen) !in: input permutation (must be valid!)
integer(INTD), intent(out):: pout(1:plen) !out: inverse of the input permutation
integer(INTD), intent(inout):: ierr !out: error code (0:success)
integer(INTD):: i
ierr=0
if(plen.gt.0) then
do i=1,plen; pout(pin(i))=i; enddo
elseif(plen.lt.0) then
ierr=1
endif
return
end subroutine permutation_invert
!-------------------------------------------------------------------
#ifdef USE_MIC
!DIR$ ATTRIBUTES OFFLOAD:mic:: permutation_trivial
#endif
function permutation_trivial(plen,prm,ierr) result(prm_triv) !SERIAL
!This function returns .true. if the given permutation is trivial, .false. otherwise.
!Numeration starts from 1.
implicit none
integer(INTD), intent(in):: plen !in: permutation length
integer(INTD), intent(in):: prm(1:plen) !in: permutation
integer(INTD), intent(inout):: ierr !out: error code (0:success)
logical:: prm_triv !out: result
integer(INTD):: i
ierr=0; prm_triv=.true.
if(plen.ge.0) then
do i=1,plen; if(prm(i).ne.i) then; prm_triv=.false.; exit; endif; enddo
else
ierr=1
endif
return
end function permutation_trivial
!--------------------------------------------------------------------------------------
#ifdef USE_MIC
!DIR$ ATTRIBUTES OFFLOAD:mic:: dil_tensor_slice
#endif
subroutine dil_tensor_slice(dim_num,tens,tens_ext,slice,slice_ext,ext_beg,ierr) !PARALLEL (OMP)
!This subroutine extracts a slice from a tensor.
!Tensor dimensions are assumed beginning from 0: [0..(extent-1)].
implicit none
!---------------------------------------------
integer(INTD), parameter:: real_kind=8
!---------------------------------------------
integer(INTD), intent(in):: dim_num !in: number of tensor dimensions (tensor rank)
real(real_kind), intent(in):: tens(0:*) !in: tensor (Fortran-like linearized storage)
integer(INTD), intent(in):: tens_ext(1:dim_num) !in: dimension extents for <tens>
real(real_kind), intent(inout):: slice(0:*) !out: slice (Fortran-like linearized storage)
integer(INTD), intent(in):: slice_ext(1:dim_num) !in: dimension extents for <slice>
integer(INTD), intent(in):: ext_beg(1:dim_num) !in: beginning dimension offsets for <tens> (numeration starts at 0)
integer(INTD), intent(inout):: ierr !out: error code (0:success)
integer(INTD):: i,j,k,l,m,n,ks,kf,im(1:dim_num)
integer(INTL):: lts,lss,l_in,l_out,lb,le,ll,bases_in(1:dim_num),bases_out(1:dim_num),segs(0:MAX_THREADS)
real(8) time_beg
#ifdef USE_MIC
!DIR$ ATTRIBUTES OFFLOAD:mic:: real_kind
!DIR$ ATTRIBUTES ALIGN:128:: real_kind,im,bases_in,bases_out,segs
#endif
ierr=0
! time_beg=thread_wtime() !debug
if(dim_num.gt.0) then
lts=1_INTL; do i=1,dim_num; bases_in(i)=lts; lts=lts*tens_ext(i); enddo !tensor indexing bases
lss=1_INTL; do i=1,dim_num; bases_out(i)=lss; lss=lss*slice_ext(i); enddo !slice indexing bases
!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,m,n,im,l_in,l_out,lb,le,ll)
#ifndef NO_OMP
n=omp_get_thread_num(); m=omp_get_num_threads()
#else
n=0; m=1
#endif
!$OMP MASTER
segs(0)=0_INTL; call divide_segment(lss,int(m,INTL),segs(1:),ierr); do i=2,m; segs(i)=segs(i)+segs(i-1); enddo
!$OMP END MASTER
!$OMP BARRIER
!$OMP FLUSH(segs)
l_out=segs(n); do i=dim_num,1,-1; im(i)=l_out/bases_out(i); l_out=l_out-im(i)*bases_out(i); enddo
l_in=ext_beg(1); do i=2,dim_num; l_in=l_in+(ext_beg(i)+im(i))*bases_in(i); enddo
lb=int(im(1),INTL); le=int(slice_ext(1)-1,INTL); l_out=segs(n)-lb
sloop: do while(l_out+lb.lt.segs(n+1))
le=min(le,segs(n+1)-1_INTL-l_out)
do ll=lb,le; slice(l_out+ll)=tens(l_in+ll); enddo
l_out=l_out+le+1_INTL; lb=0_INTL
do i=2,dim_num
l_in=l_in+bases_in(i); im(i)=im(i)+1
if(im(i).ge.slice_ext(i)) then
l_in=l_in-im(i)*bases_in(i); im(i)=0
else
exit
endif
enddo
enddo sloop
!$OMP END PARALLEL
else
ierr=1 !zero-rank tensor
endif
! write(CONS_OUT,'("#DEBUG(dil_tensor_algebra::dil_tensor_slice): kernel time/error code: ",F10.4,1x,i3)')&
! &thread_wtime(time_beg),ierr !debug
return
end subroutine dil_tensor_slice
!---------------------------------------------------------------------------------------
#ifdef USE_MIC
!DIR$ ATTRIBUTES OFFLOAD:mic:: dil_tensor_insert
#endif
subroutine dil_tensor_insert(dim_num,tens,tens_ext,slice,slice_ext,ext_beg,ierr) !PARALLEL (OMP)
!This subroutine inserts a slice into a tensor.
!Tensor dimensions are assumed beginning from 0: [0..(extent-1)].
implicit none
!---------------------------------------------
integer(INTD), parameter:: real_kind=8
!---------------------------------------------
integer(INTD), intent(in):: dim_num !in: number of tensor dimensions
real(real_kind), intent(inout):: tens(0:*) !inout: tensor (Fortran-like linearized storage)
integer(INTD), intent(in):: tens_ext(1:dim_num) !in: dimension extents for <tens>
real(real_kind), intent(in):: slice(0:*) !in: slice (Fortran-like linearized storage)
integer(INTD), intent(in):: slice_ext(1:dim_num) !in: dimension extents for <slice>
integer(INTD), intent(in):: ext_beg(1:dim_num) !in: beginning dimension offsets for <tens> (numeration starts at 0)
integer(INTD), intent(inout):: ierr !out: error code (0:success)
integer(INTD):: i,j,k,l,m,n,ks,kf,im(1:dim_num)
integer(INTL):: lts,lss,l_in,l_out,lb,le,ll,bases_in(1:dim_num),bases_out(1:dim_num),segs(0:MAX_THREADS)
real(8) time_beg
#ifdef USE_MIC
!DIR$ ATTRIBUTES OFFLOAD:mic:: real_kind
!DIR$ ATTRIBUTES ALIGN:128:: real_kind,im,bases_in,bases_out,segs
#endif
ierr=0
! time_beg=thread_wtime() !debug
if(dim_num.gt.0) then
lts=1_INTL; do i=1,dim_num; bases_out(i)=lts; lts=lts*tens_ext(i); enddo !tensor indexing bases
lss=1_INTL; do i=1,dim_num; bases_in(i)=lss; lss=lss*slice_ext(i); enddo !slice indexing bases
!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,m,n,im,l_in,l_out,lb,le,ll)
#ifndef NO_OMP
n=omp_get_thread_num(); m=omp_get_num_threads()
#else
n=0; m=1
#endif
!$OMP MASTER
segs(0)=0_INTL; call divide_segment(lss,int(m,INTL),segs(1:),ierr); do i=2,m; segs(i)=segs(i)+segs(i-1); enddo
!$OMP END MASTER
!$OMP BARRIER
!$OMP FLUSH(segs)
l_in=segs(n); do i=dim_num,1,-1; im(i)=l_in/bases_in(i); l_in=l_in-im(i)*bases_in(i); enddo
l_out=ext_beg(1); do i=2,dim_num; l_out=l_out+(ext_beg(i)+im(i))*bases_out(i); enddo
lb=int(im(1),INTL); le=int(slice_ext(1)-1,INTL); l_in=segs(n)-lb
sloop: do while(l_in+lb.lt.segs(n+1))
le=min(le,segs(n+1)-1_INTL-l_in)
do ll=lb,le; tens(l_out+ll)=slice(l_in+ll); enddo
l_in=l_in+le+1_INTL; lb=0_INTL
do i=2,dim_num
l_out=l_out+bases_out(i); im(i)=im(i)+1
if(im(i).ge.slice_ext(i)) then
l_out=l_out-im(i)*bases_out(i); im(i)=0
else
exit
endif
enddo
enddo sloop
!$OMP END PARALLEL
else
ierr=1 !zero-rank tensor
endif
! write(CONS_OUT,'("#DEBUG(dil_tensor_algebra::dil_tensor_insert): kernel time/error code: ",F10.4,1x,i3)')&
! &thread_wtime(time_beg),ierr !debug
return
end subroutine dil_tensor_insert
!--------------------------------------------------------------------------------------------
#ifdef USE_MIC
!DIR$ ATTRIBUTES OFFLOAD:mic:: dil_tensor_transpose
#endif
subroutine dil_tensor_transpose(dim_num,dim_extents,dim_transp,tens_in,tens_out,ierr) !PARALLEL (OMP)
!This subroutine reorders dimensions of an arbitrary, locally stored tensor (out-of-place).
!Algorithm: Lyakh, D.I. Computer Physics Communications, 2014. DOI:10.1016/j.cpc.2014.12.013.
implicit none
!---------------------------------------------
integer(INTD), parameter:: real_kind=8
!---------------------------------------------
logical, parameter:: cache_efficiency=.true. !turns on/off cache-efficiency
integer(INTL), parameter:: cache_line_len=64/real_kind !cache line length (words)
integer(INTL), parameter:: cache_line_min=cache_line_len*2 !lower bound for the input/output minor volume: => L1_cache_line*2
integer(INTL), parameter:: cache_line_lim=cache_line_len*4 !upper bound for the input/output minor volume: <= SQRT(L1_size)
integer(INTL), parameter:: small_tens_size=2**10 !up to this size it is useless to apply cache efficiency (fully fits in L1)
integer(INTL), parameter:: vec_size=2**8 !loop reorganization parameter for direct copy
#ifdef USE_MIC
!DIR$ ATTRIBUTES OFFLOAD:mic:: real_kind,cache_efficiency,cache_line_len,cache_line_min,cache_line_lim,small_tens_size,vec_size
!DIR$ ATTRIBUTES ALIGN:128:: real_kind,cache_efficiency,cache_line_len,cache_line_min,cache_line_lim,small_tens_size,vec_size
#endif
!------------------------------------------
integer(INTD), intent(in):: dim_num !in: tensor rank
integer(INTD), intent(in):: dim_extents(1:dim_num) !in: tensor dimension extents
integer(INTD), intent(in):: dim_transp(1:dim_num) !in: dimension permutation (O2N)
real(real_kind), intent(in):: tens_in(0:*) !in: input tensor (Fortran-like linearized storage)
real(real_kind), intent(out):: tens_out(0:*) !out: output (permuted) tensor (Fortran-like linearized storage)
integer(INTD), intent(inout):: ierr !out: error code (0:success)
integer(INTD):: i,j,k,l,m,n,k1,k2,ks,kf,split_in,split_out
integer(INTD):: im(1:dim_num),n2o(0:dim_num+1),ipr(1:dim_num+1),dim_beg(1:dim_num),dim_end(1:dim_num)
integer(INTL):: bases_in(1:dim_num+1),bases_out(1:dim_num+1),bases_pri(1:dim_num+1),segs(0:MAX_THREADS)
integer(INTL):: bs,l0,l1,l2,l3,ll,lb,le,ls,l_in,l_out,seg_in,seg_out,vol_min,vol_ext
logical:: trivial
real(8):: time_beg,tm
#ifdef USE_MIC
!DIR$ ATTRIBUTES ALIGN:128:: im,n2o,ipr,dim_beg,dim_end,bases_in,bases_out,bases_pri,segs
#endif
ierr=0
! if(DIL_DEBUG) write(CONS_OUT,'(3x,"#DEBUG(DIL): Transposing ... ")',ADVANCE='NO')
! time_beg=thread_wtime() !debug
if(dim_num.lt.0) then; ierr=1; return; elseif(dim_num.eq.0) then; tens_out(0)=tens_in(0); return; endif
!Check the index permutation:
trivial=.true.; do i=1,dim_num; if(dim_transp(i).ne.i) then; trivial=.false.; exit; endif; enddo
if(trivial.and.cache_efficiency) then
!Trivial index permutation (no permutation):
!Compute indexing bases:
bs=1_INTL; do i=1,dim_num; bases_in(i)=bs; bs=bs*dim_extents(i); enddo
!Copy input to output:
!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(l0,l1)
!$OMP DO SCHEDULE(GUIDED)
do l0=0_INTL,bs-1_INTL-mod(bs,vec_size),vec_size
do l1=0_INTL,vec_size-1_INTL; tens_out(l0+l1)=tens_in(l0+l1); enddo
enddo
!$OMP END DO NOWAIT
!$OMP SINGLE
do l0=bs-mod(bs,vec_size),bs-1_INTL; tens_out(l0)=tens_in(l0); enddo
!$OMP END SINGLE
!$OMP END PARALLEL
else
!Non-trivial index permutation:
!Compute indexing bases:
do i=1,dim_num; n2o(dim_transp(i))=i; enddo; n2o(dim_num+1)=dim_num+1 !get the N2O
bs=1_INTL; do i=1,dim_num; bases_in(i)=bs; bs=bs*dim_extents(i); enddo; bases_in(dim_num+1)=bs
bs=1_INTL; do i=1,dim_num; bases_out(n2o(i))=bs; bs=bs*dim_extents(n2o(i)); enddo; bases_out(dim_num+1)=bs
!Configure cache-efficient algorithm:
if(bs.le.small_tens_size.or.(.not.cache_efficiency)) then !tensor block is too small to think hard about it
ipr(1:dim_num+1)=(/(j,j=1,dim_num+1)/); kf=dim_num !trivial priorities, all indices are minor
split_in=kf; seg_in=dim_extents(split_in); split_out=kf; seg_out=dim_extents(split_out)
else
do k1=1,dim_num; if(bases_in(k1+1).ge.cache_line_lim) exit; enddo; k1=k1-1
do k2=1,dim_num; if(bases_out(n2o(k2+1)).ge.cache_line_lim) exit; enddo; k2=k2-1
do j=k1+1,dim_num; if(dim_transp(j).le.k2) then; k1=k1+1; else; exit; endif; enddo
do j=k2+1,dim_num; if(n2o(j).le.k1) then; k2=k2+1; else; exit; endif; enddo
if(bases_in(k1+1).lt.cache_line_min.and.bases_out(n2o(k2+1)).ge.cache_line_min) then !split the last minor input dim
k1=k1+1; split_in=k1; seg_in=(cache_line_lim-1_INTL)/bases_in(split_in)+1_INTL
split_out=n2o(k2); seg_out=dim_extents(split_out)
elseif(bases_in(k1+1).ge.cache_line_min.and.bases_out(n2o(k2+1)).lt.cache_line_min) then !split the last minor output dim
k2=k2+1; split_in=n2o(k2); seg_in=(cache_line_lim-1_INTL)/bases_out(split_in)+1_INTL
split_out=k1; seg_out=dim_extents(split_out)
elseif(bases_in(k1+1).lt.cache_line_min.and.bases_out(n2o(k2+1)).lt.cache_line_min) then !split both
k1=k1+1; k2=k2+1
if(k1.eq.n2o(k2)) then
split_in=k1; seg_in=(cache_line_lim-1_INTL)/min(bases_in(split_in),bases_out(split_in))+1_INTL
split_out=k1; seg_out=dim_extents(split_out)
else
split_in=k1; seg_in=(cache_line_lim-1_INTL)/bases_in(split_in)+1_INTL
split_out=n2o(k2); seg_out=(cache_line_lim-1_INTL)/bases_out(split_out)+1_INTL
endif
else !split none
split_in=k1; seg_in=dim_extents(split_in)
split_out=n2o(k2); seg_out=dim_extents(split_out)
endif
vol_min=1_INTL
if(seg_in.lt.dim_extents(split_in)) vol_min=vol_min*seg_in
if(seg_out.lt.dim_extents(split_out)) vol_min=vol_min*seg_out
if(vol_min.gt.1_INTL) then
do j=1,k1
if(j.ne.split_in.and.j.ne.split_out) vol_min=vol_min*dim_extents(j)
enddo
do j=1,k2
l=n2o(j)
if(l.gt.k1.and.l.ne.split_in.and.l.ne.split_out) vol_min=vol_min*dim_extents(l)
enddo
l=int((cache_line_lim*cache_line_lim)/vol_min,4)
if(l.ge.2) then
if(split_in.eq.split_out) then
seg_in=seg_in*l
else
if(l.gt.4) then
l=int(sqrt(float(l)),4)
seg_in=min(seg_in*l,int(dim_extents(split_in),INTL))
seg_out=min(seg_out*l,int(dim_extents(split_out),INTL))
else
seg_in=min(seg_in*l,int(dim_extents(split_in),INTL))
endif
endif
endif
endif
l=0
do while(l.lt.k1)
l=l+1; ipr(l)=l; if(bases_in(l+1).ge.cache_line_min) exit
enddo
m=l+1
j=0
do while(j.lt.k2)
j=j+1; n=n2o(j)
if(n.ge.m) then; l=l+1; ipr(l)=n; endif
if(bases_out(n2o(j+1)).ge.cache_line_min) exit
enddo
n=j+1
do j=m,k1; if(dim_transp(j).ge.n) then; l=l+1; ipr(l)=j; endif; enddo
do j=n,k2; if(n2o(j).gt.k1) then; l=l+1; ipr(l)=n2o(j); endif; enddo
kf=l
do j=k2+1,dim_num; if(n2o(j).gt.k1) then; l=l+1; ipr(l)=n2o(j); endif; enddo !kf is the length of the combined minor set
ipr(dim_num+1)=dim_num+1 !special setting
endif
vol_ext=1_INTL; do j=kf+1,dim_num; vol_ext=vol_ext*dim_extents(ipr(j)); enddo !external volume
! write(CONS_OUT,'("#DEBUG(dil_tensor_algebra::dil_tensor_transpose): extents:",99(1x,i5))') dim_extents(1:dim_num) !debug
! write(CONS_OUT,'("#DEBUG(dil_tensor_algebra::dil_tensor_transpose): permutation:",99(1x,i2))') dim_transp(1:dim_num) !debug
! write(CONS_OUT,'("#DEBUG(dil_tensor_algebra::dil_tensor_transpose): minor ",i3,": priority:",99(1x,i2))')&
! &kf,ipr(1:dim_num) !debug
! write(CONS_OUT,'("#DEBUG(dil_tensor_algebra::dil_tensor_transpose): vol_ext ",i11,": segs:",4(1x,i5))')&
! &vol_ext,split_in,split_out,seg_in,seg_out !debug
!Transpose:
!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(i,j,m,n,ks,l0,l1,l2,l3,ll,lb,le,ls,l_in,l_out,vol_min,im,dim_beg,dim_end)
#ifndef NO_OMP
n=omp_get_thread_num(); m=omp_get_num_threads() !multi-threaded execution
#else
n=0; m=1 !serial execution
#endif
! if(n.eq.0) write(CONS_OUT,'("#DEBUG(dil_tensor_algebra::dil_tensor_transpose): number of threads = ",i5)') m !debug
if(kf.lt.dim_num) then !external indices present
!$OMP MASTER
segs(0)=0_INTL; call divide_segment(vol_ext,int(m,INTL),segs(1:),i); do j=2,m; segs(j)=segs(j)+segs(j-1); enddo
l0=1_INTL; do i=kf+1,dim_num; bases_pri(ipr(i))=l0; l0=l0*dim_extents(ipr(i)); enddo !priority bases
!$OMP END MASTER
!$OMP BARRIER
!$OMP FLUSH(segs,bases_pri)
dim_beg(1:dim_num)=0; dim_end(1:dim_num)=dim_extents(1:dim_num)-1
l2=dim_end(split_in); l3=dim_end(split_out); ls=bases_out(1)
loop0: do l1=0_INTL,l3,seg_out !output dimension
dim_beg(split_out)=l1; dim_end(split_out)=min(l1+seg_out-1_INTL,l3)
do l0=0_INTL,l2,seg_in !input dimension
dim_beg(split_in)=l0; dim_end(split_in)=min(l0+seg_in-1_INTL,l2)
ll=segs(n); do i=dim_num,kf+1,-1; j=ipr(i); im(j)=ll/bases_pri(j); ll=ll-im(j)*bases_pri(j); enddo
vol_min=1_INTL; do i=1,kf; j=ipr(i); vol_min=vol_min*(dim_end(j)-dim_beg(j)+1); im(j)=dim_beg(j); enddo
l_in=0_INTL; do j=1,dim_num; l_in=l_in+im(j)*bases_in(j); enddo
l_out=0_INTL; do j=1,dim_num; l_out=l_out+im(j)*bases_out(j); enddo
le=dim_end(1)-dim_beg(1); lb=(segs(n+1)-segs(n))*vol_min; ks=0
loop1: do while(lb.gt.0_INTL)
do ll=0_INTL,le
tens_out(l_out+ll*ls)=tens_in(l_in+ll)
enddo
lb=lb-(le+1_INTL)
do i=2,dim_num
j=ipr(i) !old index number
if(im(j).lt.dim_end(j)) then
im(j)=im(j)+1; l_in=l_in+bases_in(j); l_out=l_out+bases_out(j)
ks=ks+1; exit
else
l_in=l_in-(im(j)-dim_beg(j))*bases_in(j); l_out=l_out-(im(j)-dim_beg(j))*bases_out(j); im(j)=dim_beg(j)
endif
enddo !i
ks=ks-1; if(ks.lt.0) exit loop1
enddo loop1
if(lb.ne.0_INTL) then
if(VERBOSE) write(CONS_OUT,'("ERROR(dil_tensor_algebra::dil_tensor_transpose): invalid remainder: ",i11,1x,i4)') lb,n
!$OMP ATOMIC WRITE
ierr=2
exit loop0
endif
enddo !l0
enddo loop0 !l1
else !external indices absent
!$OMP MASTER
l0=1_INTL; do i=kf+1,dim_num; bases_pri(ipr(i))=l0; l0=l0*dim_extents(ipr(i)); enddo !priority bases
!$OMP END MASTER
!$OMP BARRIER
!$OMP FLUSH(bases_pri)
dim_beg(1:dim_num)=0; dim_end(1:dim_num)=dim_extents(1:dim_num)-1
l2=dim_end(split_in); l3=dim_end(split_out); ls=bases_out(1)
!$OMP DO SCHEDULE(DYNAMIC) COLLAPSE(2)
do l1=0_INTL,l3,seg_out !output dimension
do l0=0_INTL,l2,seg_in !input dimension
dim_beg(split_out)=l1; dim_end(split_out)=min(l1+seg_out-1_INTL,l3)
dim_beg(split_in)=l0; dim_end(split_in)=min(l0+seg_in-1_INTL,l2)
vol_min=1_INTL; do i=1,kf; j=ipr(i); vol_min=vol_min*(dim_end(j)-dim_beg(j)+1); im(j)=dim_beg(j); enddo
l_in=0_INTL; do j=1,dim_num; l_in=l_in+im(j)*bases_in(j); enddo
l_out=0_INTL; do j=1,dim_num; l_out=l_out+im(j)*bases_out(j); enddo
le=dim_end(1)-dim_beg(1); lb=vol_min; ks=0
loop2: do while(lb.gt.0_INTL)
do ll=0_INTL,le
tens_out(l_out+ll*ls)=tens_in(l_in+ll)
enddo
lb=lb-(le+1_INTL)
do i=2,dim_num
j=ipr(i) !old index number
if(im(j).lt.dim_end(j)) then
im(j)=im(j)+1; l_in=l_in+bases_in(j); l_out=l_out+bases_out(j)
ks=ks+1; exit
else
l_in=l_in-(im(j)-dim_beg(j))*bases_in(j); l_out=l_out-(im(j)-dim_beg(j))*bases_out(j); im(j)=dim_beg(j)
endif
enddo !i
ks=ks-1; if(ks.lt.0) exit loop2
enddo loop2
enddo !l0
enddo !l1
!$OMP END DO
endif
!$OMP END PARALLEL
endif !trivial or not
! tm=thread_wtime(time_beg) !debug
! if(DIL_DEBUG) write(CONS_OUT,'("Done: ",F10.4," s, ",F10.4," GB/s: stat ",i9)')&
! &tm,dble(2_INTL*bs*real_kind)/(tm*1048576d0*1024d0),ierr !debug
! write(CONS_OUT,'("#DEBUG(dil_tensor_algebra::dil_tensor_transpose): Done: ",F10.4," sec, ",F10.4," GB/s, error ",i3)')&
! &tm,dble(2_INTL*bs*real_kind)/(tm*1024d0*1024d0*1024d0),ierr !debug
return
end subroutine dil_tensor_transpose
end module dil_tal_cp