-
Notifications
You must be signed in to change notification settings - Fork 346
/
lbm.cpp
1350 lines (1272 loc) · 73.1 KB
/
lbm.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#include "lbm.hpp"
Units units; // for unit conversion
#if defined(D2Q9)
const uint velocity_set = 9u;
const uint dimensions = 2u;
const uint transfers = 3u;
#elif defined(D3Q15)
const uint velocity_set = 15u;
const uint dimensions = 3u;
const uint transfers = 5u;
#elif defined(D3Q19)
const uint velocity_set = 19u;
const uint dimensions = 3u;
const uint transfers = 5u;
#elif defined(D3Q27)
const uint velocity_set = 27u;
const uint dimensions = 3u;
const uint transfers = 9u;
#endif // D3Q27
uint bytes_per_cell_host() { // returns the number of Bytes per cell allocated in host memory
uint bytes_per_cell = 17u; // rho, u, flags
#ifdef FORCE_FIELD
bytes_per_cell += 12u; // F
#endif // FORCE_FIELD
#ifdef SURFACE
bytes_per_cell += 4u; // phi
#endif // SURFACE
#ifdef TEMPERATURE
bytes_per_cell += 4u; // T
#endif // TEMPERATURE
return bytes_per_cell;
}
uint bytes_per_cell_device() { // returns the number of Bytes per cell allocated in device memory
uint bytes_per_cell = velocity_set*sizeof(fpxx)+17u; // fi, rho, u, flags
#ifdef FORCE_FIELD
bytes_per_cell += 12u; // F
#endif // FORCE_FIELD
#ifdef SURFACE
bytes_per_cell += 12u; // phi, mass, flags
#endif // SURFACE
#ifdef TEMPERATURE
bytes_per_cell += 7u*sizeof(fpxx)+4u; // gi, T
#endif // TEMPERATURE
return bytes_per_cell;
}
uint bandwidth_bytes_per_cell_device() { // returns the bandwidth in Bytes per cell per time step from/to device memory
uint bandwidth_bytes_per_cell = velocity_set*2u*sizeof(fpxx)+1u; // lattice.set()*2*fi, flags
#ifdef UPDATE_FIELDS
bandwidth_bytes_per_cell += 16u; // rho, u
#ifdef TEMPERATURE
bandwidth_bytes_per_cell += 4u; // T
#endif // TEMPERATURE
#endif // UPDATE_FIELDS
#ifdef FORCE_FIELD
bandwidth_bytes_per_cell += 12u; // F
#endif // FORCE_FIELD
#if defined(MOVING_BOUNDARIES)||defined(SURFACE)||defined(TEMPERATURE)
bandwidth_bytes_per_cell += (velocity_set-1u)*1u; // neighbor flags have to be loaded
#endif // MOVING_BOUNDARIES, SURFACE or TEMPERATURE
#ifdef SURFACE
bandwidth_bytes_per_cell += (1u+(2u*velocity_set-1u)*sizeof(fpxx)+8u+(velocity_set-1u)*4u) + 1u + 1u + (4u+velocity_set+4u+4u+4u); // surface_0 (flags, fi, mass, massex), surface_1 (flags), surface_2 (flags), surface_3 (rho, flags, mass, massex, phi)
#endif // SURFACE
#ifdef TEMPERATURE
bandwidth_bytes_per_cell += 7u*2u*sizeof(fpxx); // 2*gi
#endif // TEMPERATURE
return bandwidth_bytes_per_cell;
}
uint3 resolution(const float3 box_aspect_ratio, const uint memory) { // input: simulation box aspect ratio and VRAM occupation in MB, output: grid resolution
float memory_required = (box_aspect_ratio.x*box_aspect_ratio.y*box_aspect_ratio.z)*(float)bytes_per_cell_device()/1048576.0f; // in MB
float scaling = cbrt((float)memory/memory_required);
return uint3(to_uint(scaling*box_aspect_ratio.x), to_uint(scaling*box_aspect_ratio.y), to_uint(scaling*box_aspect_ratio.z));
}
string default_filename(const string& path, const string& name, const string& extension, const ulong t) { // generate a default filename with timestamp
string time = "00000000"+to_string(t);
time = substring(time, length(time)-9u, 9u);
return create_file_extension((path=="" ? get_exe_path()+"export/" : path)+(name=="" ? "file" : name)+"-"+time, extension);
}
string default_filename(const string& name, const string& extension, const ulong t) { // generate a default filename with timestamp at exe_path/export/
return default_filename("", name, extension, t);
}
LBM_Domain::LBM_Domain(const Device_Info& device_info, const uint Nx, const uint Ny, const uint Nz, const uint Dx, const uint Dy, const uint Dz, const int Ox, const int Oy, const int Oz, const float nu, const float fx, const float fy, const float fz, const float sigma, const float alpha, const float beta, const uint particles_N, const float particles_rho) { // constructor with manual device selection and domain offset
this->Nx = Nx; this->Ny = Ny; this->Nz = Nz;
this->Dx = Dx; this->Dy = Dy; this->Dz = Dz;
this->Ox = Ox; this->Oy = Oy; this->Oz = Oz;
this->nu = nu;
this->fx = fx; this->fy = fy; this->fz = fz;
this->sigma = sigma;
this->alpha = alpha; this->beta = beta;
this->particles_N = particles_N;
this->particles_rho = particles_rho;
string opencl_c_code;
#ifdef GRAPHICS
graphics = Graphics(this);
opencl_c_code = device_defines()+graphics.device_defines()+get_opencl_c_code();
#else // GRAPHICS
opencl_c_code = device_defines()+get_opencl_c_code();
#endif // GRAPHICS
this->device = Device(device_info, opencl_c_code);
print_info("Allocating memory. This may take a few seconds.");
allocate(device); // lbm first
#ifdef GRAPHICS
graphics.allocate(device); // graphics after lbm
#endif // GRAPHICS
}
void LBM_Domain::allocate(Device& device) {
const ulong N = get_N();
fi = Memory<fpxx>(device, N, velocity_set, false);
rho = Memory<float>(device, N, 1u, true, true, 1.0f);
u = Memory<float>(device, N, 3u);
flags = Memory<uchar>(device, N);
kernel_initialize = Kernel(device, N, "initialize", fi, rho, u, flags);
kernel_stream_collide = Kernel(device, N, "stream_collide", fi, rho, u, flags, t, fx, fy, fz);
kernel_update_fields = Kernel(device, N, "update_fields", fi, rho, u, flags, t, fx, fy, fz);
#ifdef FORCE_FIELD
F = Memory<float>(device, N, 3u);
kernel_stream_collide.add_parameters(F);
kernel_update_fields.add_parameters(F);
kernel_calculate_force_on_boundaries = Kernel(device, N, "calculate_force_on_boundaries", fi, flags, t, F);
kernel_reset_force_field = Kernel(device, N, "reset_force_field", F);
#endif // FORCE_FIELD
#ifdef MOVING_BOUNDARIES
kernel_update_moving_boundaries = Kernel(device, N, "update_moving_boundaries", u, flags);
#endif // MOVING_BOUNDARIES
#ifdef SURFACE
phi = Memory<float>(device, N);
mass = Memory<float>(device, N, 1u, false);
massex = Memory<float>(device, N, 1u, false);
kernel_initialize.add_parameters(mass, massex, phi);
kernel_stream_collide.add_parameters(mass);
kernel_surface_0 = Kernel(device, N, "surface_0", fi, rho, u, flags, mass, massex, phi, t, fx, fy, fz);
kernel_surface_1 = Kernel(device, N, "surface_1", flags);
kernel_surface_2 = Kernel(device, N, "surface_2", fi, rho, u, flags, t);
kernel_surface_3 = Kernel(device, N, "surface_3", rho, flags, mass, massex, phi);
#endif // SURFACE
#ifdef TEMPERATURE
gi = Memory<fpxx>(device, N, 7u, false);
T = Memory<float>(device, N, 1u, true, true, 1.0f);
kernel_initialize.add_parameters(gi, T);
kernel_stream_collide.add_parameters(gi, T);
kernel_update_fields.add_parameters(gi, T);
#endif // TEMPERATURE
#ifdef PARTICLES
particles = Memory<float>(device, (ulong)particles_N, 3u);
kernel_integrate_particles = Kernel(device, (ulong)particles_N, "integrate_particles", particles, u, flags, 1.0f);
#ifdef FORCE_FIELD
kernel_integrate_particles.add_parameters(F, fx, fy, fz);
#endif // FORCE_FIELD
#endif // PARTICLES
if(get_D()>1u) allocate_transfer(device);
}
void LBM_Domain::enqueue_initialize() { // call kernel_initialize
kernel_initialize.enqueue_run();
}
void LBM_Domain::enqueue_stream_collide() { // call kernel_stream_collide to perform one LBM time step
kernel_stream_collide.set_parameters(4u, t, fx, fy, fz).enqueue_run();
}
void LBM_Domain::enqueue_update_fields() { // update fields (rho, u, T) manually
#ifndef UPDATE_FIELDS
if(t!=t_last_update_fields) { // only run kernel_update_fields if the time step has changed since last update
kernel_update_fields.set_parameters(4u, t, fx, fy, fz).enqueue_run();
t_last_update_fields = t;
}
#endif // UPDATE_FIELDS
}
#ifdef SURFACE
void LBM_Domain::enqueue_surface_0() {
kernel_surface_0.set_parameters(7u, t, fx, fy, fz).enqueue_run();
}
void LBM_Domain::enqueue_surface_1() {
kernel_surface_1.enqueue_run();
}
void LBM_Domain::enqueue_surface_2() {
kernel_surface_2.set_parameters(4u, t).enqueue_run();
}
void LBM_Domain::enqueue_surface_3() {
kernel_surface_3.enqueue_run();
}
#endif // SURFACE
#ifdef FORCE_FIELD
void LBM_Domain::enqueue_calculate_force_on_boundaries() { // calculate forces from fluid on TYPE_S cells
kernel_calculate_force_on_boundaries.set_parameters(2u, t).enqueue_run();
}
#endif // FORCE_FIELD
#ifdef MOVING_BOUNDARIES
void LBM_Domain::enqueue_update_moving_boundaries() { // mark/unmark cells next to TYPE_S cells with velocity!=0 with TYPE_MS
kernel_update_moving_boundaries.enqueue_run();
}
#endif // MOVING_BOUNDARIES
#ifdef PARTICLES
void LBM_Domain::enqueue_integrate_particles(const uint time_step_multiplicator) { // intgegrate particles forward in time and couple particles to fluid
#ifdef FORCE_FIELD
if(particles_rho!=1.0f) kernel_reset_force_field.enqueue_run(); // only reset force field if particles have buoyancy and apply forces on fluid
kernel_integrate_particles.set_parameters(5u, fx, fy, fz);
#endif // FORCE_FIELD
kernel_integrate_particles.set_parameters(3u, (float)time_step_multiplicator).enqueue_run();
}
#endif // PARTICLES
void LBM_Domain::increment_time_step(const uint steps) {
t += (ulong)steps; // increment time step
#ifdef UPDATE_FIELDS
t_last_update_fields = t;
#endif // UPDATE_FIELDS
}
void LBM_Domain::reset_time_step() {
t = 0ull; // increment time step
#ifdef UPDATE_FIELDS
t_last_update_fields = t;
#endif // UPDATE_FIELDS
}
void LBM_Domain::finish_queue() {
device.finish_queue();
}
uint LBM_Domain::get_velocity_set() const {
return velocity_set;
}
void LBM_Domain::voxelize_mesh_on_device(const Mesh* mesh, const uchar flag, const float3& rotation_center, const float3& linear_velocity, const float3& rotational_velocity) { // voxelize triangle mesh
Memory<float3> p0(device, mesh->triangle_number, 1u, mesh->p0);
Memory<float3> p1(device, mesh->triangle_number, 1u, mesh->p1);
Memory<float3> p2(device, mesh->triangle_number, 1u, mesh->p2);
Memory<float> bounding_box_and_velocity(device, 16u);
const float x0=mesh->pmin.x-2.0f, y0=mesh->pmin.y-2.0f, z0=mesh->pmin.z-2.0f, x1=mesh->pmax.x+2.0f, y1=mesh->pmax.y+2.0f, z1=mesh->pmax.z+2.0f; // use bounding box of mesh to speed up voxelization; add tolerance of 2 cells for re-voxelization of moving objects
bounding_box_and_velocity[ 0] = as_float(mesh->triangle_number);
bounding_box_and_velocity[ 1] = x0;
bounding_box_and_velocity[ 2] = y0;
bounding_box_and_velocity[ 3] = z0;
bounding_box_and_velocity[ 4] = x1;
bounding_box_and_velocity[ 5] = y1;
bounding_box_and_velocity[ 6] = z1;
bounding_box_and_velocity[ 7] = rotation_center.x;
bounding_box_and_velocity[ 8] = rotation_center.y;
bounding_box_and_velocity[ 9] = rotation_center.z;
bounding_box_and_velocity[10] = linear_velocity.x;
bounding_box_and_velocity[11] = linear_velocity.y;
bounding_box_and_velocity[12] = linear_velocity.z;
bounding_box_and_velocity[13] = rotational_velocity.x;
bounding_box_and_velocity[14] = rotational_velocity.y;
bounding_box_and_velocity[15] = rotational_velocity.z;
uint direction = 0u;
if(length(rotational_velocity)==0.0f) { // choose direction of minimum bounding-box cross-section area
float v[3] = { (y1-y0)*(z1-z0), (z1-z0)*(x1-x0), (x1-x0)*(y1-y0) };
float vmin = v[0];
for(uint i=1u; i<3u; i++) {
if(v[i]<vmin) {
vmin = v[i];
direction = i;
}
}
} else { // choose direction closest to rotation axis
float v[3] = { fabsf(rotational_velocity.x), fabsf(rotational_velocity.y), fabsf(rotational_velocity.z) };
float vmax = v[0];
for(uint i=1u; i<3u; i++) {
if(v[i]>vmax) {
vmax = v[i];
direction = i; // find direction of minimum bounding-box cross-section area
}
}
}
const ulong A[3] = { (ulong)Ny*(ulong)Nz, (ulong)Nz*(ulong)Nx, (ulong)Nx*(ulong)Ny };
Kernel kernel_voxelize_mesh(device, A[direction], "voxelize_mesh", direction, fi, u, flags, t+1ull, flag, p0, p1, p2, bounding_box_and_velocity);
#ifdef SURFACE
kernel_voxelize_mesh.add_parameters(mass, massex);
#endif // SURFACE
p0.write_to_device();
p1.write_to_device();
p2.write_to_device();
bounding_box_and_velocity.write_to_device();
kernel_voxelize_mesh.run();
}
void LBM_Domain::enqueue_unvoxelize_mesh_on_device(const Mesh* mesh, const uchar flag) { // remove voxelized triangle mesh from LBM grid
const float x0=mesh->pmin.x, y0=mesh->pmin.y, z0=mesh->pmin.z, x1=mesh->pmax.x, y1=mesh->pmax.y, z1=mesh->pmax.z; // remove all flags in bounding box of mesh
Kernel kernel_unvoxelize_mesh(device, get_N(), "unvoxelize_mesh", flags, flag, x0, y0, z0, x1, y1, z1);
kernel_unvoxelize_mesh.run();
}
string LBM_Domain::device_defines() const { return
"\n #define def_Nx "+to_string(Nx)+"u"
"\n #define def_Ny "+to_string(Ny)+"u"
"\n #define def_Nz "+to_string(Nz)+"u"
"\n #define def_N "+to_string(get_N())+"ul"
"\n #define uxx "+(get_N()<=(ulong)max_uint ? "uint" : "ulong")+"" // switchable data type for index calculation (32-bit uint / 64-bit ulong)
"\n #define def_Dx "+to_string(Dx)+"u"
"\n #define def_Dy "+to_string(Dy)+"u"
"\n #define def_Dz "+to_string(Dz)+"u"
"\n #define def_Ox "+to_string(Ox)+"" // offsets are signed integer!
"\n #define def_Oy "+to_string(Oy)+""
"\n #define def_Oz "+to_string(Oz)+""
"\n #define def_Ax "+to_string(Ny*Nz)+"u"
"\n #define def_Ay "+to_string(Nz*Nx)+"u"
"\n #define def_Az "+to_string(Nx*Ny)+"u"
"\n #define def_domain_offset_x "+to_string(0.5f*(float)((int)Nx+2*Ox+(int)Dx*(2*(int)(Dx>1u)-(int)Nx)))+"f"
"\n #define def_domain_offset_y "+to_string(0.5f*(float)((int)Ny+2*Oy+(int)Dy*(2*(int)(Dy>1u)-(int)Ny)))+"f"
"\n #define def_domain_offset_z "+to_string(0.5f*(float)((int)Nz+2*Oz+(int)Dz*(2*(int)(Dz>1u)-(int)Nz)))+"f"
"\n #define D"+to_string(dimensions)+"Q"+to_string(velocity_set)+"" // D2Q9/D3Q15/D3Q19/D3Q27
"\n #define def_velocity_set "+to_string(velocity_set)+"u" // LBM velocity set (D2Q9/D3Q15/D3Q19/D3Q27)
"\n #define def_dimensions "+to_string(dimensions)+"u" // number spatial dimensions (2D or 3D)
"\n #define def_transfers "+to_string(transfers)+"u" // number of DDFs that are transferred between multiple domains
"\n #define def_c 0.57735027f" // lattice speed of sound c = 1/sqrt(3)*dt
"\n #define def_w " +to_string(1.0f/get_tau())+"f" // relaxation rate w = dt/tau = dt/(nu/c^2+dt/2) = 1/(3*nu+1/2)
#if defined(D2Q9)
"\n #define def_w0 (1.0f/2.25f)" // center (0)
"\n #define def_ws (1.0f/9.0f)" // straight (1-4)
"\n #define def_we (1.0f/36.0f)" // edge (5-8)
#elif defined(D3Q15)
"\n #define def_w0 (1.0f/4.5f)" // center (0)
"\n #define def_ws (1.0f/9.0f)" // straight (1-6)
"\n #define def_wc (1.0f/72.0f)" // corner (7-14)
#elif defined(D3Q19)
"\n #define def_w0 (1.0f/3.0f)" // center (0)
"\n #define def_ws (1.0f/18.0f)" // straight (1-6)
"\n #define def_we (1.0f/36.0f)" // edge (7-18)
#elif defined(D3Q27)
"\n #define def_w0 (1.0f/3.375f)" // center (0)
"\n #define def_ws (1.0f/13.5f)" // straight (1-6)
"\n #define def_we (1.0f/54.0f)" // edge (7-18)
"\n #define def_wc (1.0f/216.0f)" // corner (19-26)
#endif // D3Q27
#if defined(SRT)
"\n #define SRT"
#elif defined(TRT)
"\n #define TRT"
#endif // TRT
"\n #define TYPE_S 0x01" // 0b00000001 // (stationary or moving) solid boundary
"\n #define TYPE_E 0x02" // 0b00000010 // equilibrium boundary (inflow/outflow)
"\n #define TYPE_T 0x04" // 0b00000100 // temperature boundary
"\n #define TYPE_F 0x08" // 0b00001000 // fluid
"\n #define TYPE_I 0x10" // 0b00010000 // interface
"\n #define TYPE_G 0x20" // 0b00100000 // gas
"\n #define TYPE_X 0x40" // 0b01000000 // reserved type X
"\n #define TYPE_Y 0x80" // 0b10000000 // reserved type Y
"\n #define TYPE_MS 0x03" // 0b00000011 // cell next to moving solid boundary
"\n #define TYPE_BO 0x03" // 0b00000011 // any flag bit used for boundaries (temperature excluded)
"\n #define TYPE_IF 0x18" // 0b00011000 // change from interface to fluid
"\n #define TYPE_IG 0x30" // 0b00110000 // change from interface to gas
"\n #define TYPE_GI 0x38" // 0b00111000 // change from gas to interface
"\n #define TYPE_SU 0x38" // 0b00111000 // any flag bit used for SURFACE
#if defined(FP16S)
"\n #define fpxx half" // switchable data type (scaled IEEE-754 16-bit floating-point format: 1-5-10, exp-30, +-1.99902344, +-1.86446416E-9, +-1.81898936E-12, 3.311 digits)
"\n #define fpxx_copy ushort" // switchable data type for direct copying (scaled IEEE-754 16-bit floating-point format: 1-5-10, exp-30, +-1.99902344, +-1.86446416E-9, +-1.81898936E-12, 3.311 digits)
"\n #define load(p,o) vload_half(o,p)*3.0517578E-5f" // special function for loading half
"\n #define store(p,o,x) vstore_half_rte((x)*32768.0f,o,p)" // special function for storing half
#elif defined(FP16C)
"\n #define fpxx ushort" // switchable data type (custom 16-bit floating-point format: 1-4-11, exp-15, +-1.99951168, +-6.10351562E-5, +-2.98023224E-8, 3.612 digits), 12.5% slower than IEEE-754 16-bit
"\n #define fpxx_copy ushort" // switchable data type for direct copying (custom 16-bit floating-point format: 1-4-11, exp-15, +-1.99951168, +-6.10351562E-5, +-2.98023224E-8, 3.612 digits), 12.5% slower than IEEE-754 16-bit
"\n #define load(p,o) half_to_float_custom(p[o])" // special function for loading half
"\n #define store(p,o,x) p[o]=float_to_half_custom(x)" // special function for storing half
#else // FP32
"\n #define fpxx float" // switchable data type (regular 32-bit float)
"\n #define fpxx_copy float" // switchable data type for direct copying (regular 32-bit float)
"\n #define load(p,o) p[o]" // regular float read
"\n #define store(p,o,x) p[o]=x" // regular float write
#endif // FP32
#ifdef UPDATE_FIELDS
"\n #define UPDATE_FIELDS"
#endif // UPDATE_FIELDS
#ifdef VOLUME_FORCE
"\n #define VOLUME_FORCE"
#endif // VOLUME_FORCE
#ifdef MOVING_BOUNDARIES
"\n #define MOVING_BOUNDARIES"
#endif // MOVING_BOUNDARIES
#ifdef EQUILIBRIUM_BOUNDARIES
"\n #define EQUILIBRIUM_BOUNDARIES"
#endif // EQUILIBRIUM_BOUNDARIES
#ifdef FORCE_FIELD
"\n #define FORCE_FIELD"
#endif // FORCE_FIELD
#ifdef SURFACE
"\n #define SURFACE"
"\n #define def_6_sigma "+to_string(6.0f*sigma)+"f" // rho_laplace = 2*o*K, rho = 1-rho_laplace/c^2 = 1-(6*o)*K
#endif // SURFACE
#ifdef TEMPERATURE
"\n #define TEMPERATURE"
"\n #define def_w_T "+to_string(1.0f/(2.0f*alpha+0.5f))+"f" // wT = dt/tauT = 1/(2*alpha+1/2), alpha = thermal diffusion coefficient
"\n #define def_beta "+to_string(beta)+"f" // thermal expansion coefficient
"\n #define def_T_avg "+to_string(T_avg)+"f" // average temperature
#endif // TEMPERATURE
#ifdef SUBGRID
"\n #define SUBGRID"
#endif // SUBGRID
#ifdef PARTICLES
"\n #define PARTICLES"
"\n #define def_particles_N "+to_string(particles_N)+"ul"
"\n #define def_particles_rho "+to_string(particles_rho)+"f"
#endif // PARTICLES
;}
#ifdef GRAPHICS
void LBM_Domain::Graphics::allocate(Device& device) {
bitmap = Memory<int>(device, camera.width*camera.height);
zbuffer = Memory<int>(device, camera.width*camera.height, 1u, lbm->get_D()>1u); // if there are multiple domains, allocate zbuffer also on host side
camera_parameters = Memory<float>(device, 15u);
kernel_clear = Kernel(device, bitmap.length(), "graphics_clear", bitmap, zbuffer);
kernel_graphics_flags = Kernel(device, lbm->get_N(), "graphics_flags", camera_parameters, bitmap, zbuffer, lbm->flags);
kernel_graphics_flags_mc = Kernel(device, lbm->get_N(), "graphics_flags_mc", camera_parameters, bitmap, zbuffer, lbm->flags);
kernel_graphics_field = Kernel(device, lbm->get_D()==1u ? camera.width*camera.height : lbm->get_N(), lbm->get_D()==1u ? "graphics_field_rt" : "graphics_field", camera_parameters, bitmap, zbuffer, 0, lbm->rho, lbm->u, lbm->flags); // raytraced field visualization only works for single-GPU
kernel_graphics_field_slice = Kernel(device, lbm->get_N(), "graphics_field_slice", camera_parameters, bitmap, zbuffer, 0, 0, 0, 0, 0, lbm->rho, lbm->u, lbm->flags);
#ifndef D2Q9
kernel_graphics_streamline = Kernel(device, (lbm->get_Nx()/GRAPHICS_STREAMLINE_SPARSE)*(lbm->get_Ny()/GRAPHICS_STREAMLINE_SPARSE)*(lbm->get_Nz()/GRAPHICS_STREAMLINE_SPARSE), "graphics_streamline", camera_parameters, bitmap, zbuffer, 0, 0, 0, 0, 0, lbm->rho, lbm->u, lbm->flags); // 3D
#else // D2Q9
kernel_graphics_streamline = Kernel(device, (lbm->get_Nx()/GRAPHICS_STREAMLINE_SPARSE)*(lbm->get_Ny()/GRAPHICS_STREAMLINE_SPARSE), "graphics_streamline", camera_parameters, bitmap, zbuffer, 0, 0, 0, 0, 0, lbm->rho, lbm->u, lbm->flags); // 2D
#endif // D2Q9
kernel_graphics_q = Kernel(device, lbm->get_N(), "graphics_q", camera_parameters, bitmap, zbuffer, 0, lbm->rho, lbm->u);
#ifdef FORCE_FIELD
kernel_graphics_flags.add_parameters(lbm->F);
kernel_graphics_flags_mc.add_parameters(lbm->F);
#endif // FORCE_FIELD
#ifdef SURFACE
skybox = Memory<int>(device, skybox_image->width()*skybox_image->height(), 1u, skybox_image->data());
kernel_graphics_rasterize_phi = Kernel(device, lbm->get_N(), "graphics_rasterize_phi", camera_parameters, bitmap, zbuffer, lbm->phi);
kernel_graphics_raytrace_phi = Kernel(device, bitmap.length(), "graphics_raytrace_phi", camera_parameters, bitmap, skybox, lbm->phi, lbm->flags);
kernel_graphics_q.add_parameters(lbm->flags);
#endif // SURFACE
#ifdef TEMPERATURE
kernel_graphics_field.add_parameters(lbm->T);
kernel_graphics_field_slice.add_parameters(lbm->T);
kernel_graphics_streamline.add_parameters(lbm->T);
kernel_graphics_q.add_parameters(lbm->T);
#endif // TEMPERATURE
#ifdef PARTICLES
kernel_graphics_particles = Kernel(device, lbm->particles.length(), "graphics_particles", camera_parameters, bitmap, zbuffer, lbm->particles);
#endif // PARTICLES
}
bool LBM_Domain::Graphics::update_camera() {
camera.update_matrix();
bool change = false;
for(uint i=0u; i<15u; i++) {
const float data = camera.data(i);
change |= (camera_parameters[i]!=data);
camera_parameters[i] = data;
}
return change; // return false if camera parameters remain unchanged
}
bool LBM_Domain::Graphics::enqueue_draw_frame(const int visualization_modes, const int field_mode, const int slice_mode, const int slice_x, const int slice_y, const int slice_z, const bool visualization_change) {
const bool camera_update = update_camera();
#if defined(INTERACTIVE_GRAPHICS)||defined(INTERACTIVE_GRAPHICS_ASCII)
if(!visualization_change&&!camera_update&&lbm->get_t()==t_last_rendered_frame) return false; // don't render a new frame if the scene hasn't changed since last frame
#endif // INTERACTIVE_GRAPHICS||INTERACTIVE_GRAPHICS_ASCII
t_last_rendered_frame = lbm->get_t();
if(camera_update) camera_parameters.enqueue_write_to_device(); // camera_parameters PCIe transfer and kernel_clear execution can happen simulataneously
kernel_clear.enqueue_run();
const int sx=slice_x-lbm->Ox, sy=slice_y-lbm->Oy, sz=slice_z-lbm->Oz; // subtract domain offsets
#ifdef SURFACE
if((visualization_modes&VIS_PHI_RAYTRACE)&&lbm->get_D()==1u) kernel_graphics_raytrace_phi.enqueue_run(); // disable raytracing for multi-GPU (domain decomposition rendering doesn't work for raytracing)
if(visualization_modes&VIS_PHI_RASTERIZE) kernel_graphics_rasterize_phi.enqueue_run();
#endif // SURFACE
if(visualization_modes&VIS_FLAG_LATTICE) kernel_graphics_flags.enqueue_run();
if(visualization_modes&VIS_FLAG_SURFACE) kernel_graphics_flags_mc.enqueue_run();
if(visualization_modes&VIS_STREAMLINES) kernel_graphics_streamline.set_parameters(3u, field_mode, slice_mode, sx, sy, sz).enqueue_run();
if(visualization_modes&VIS_Q_CRITERION) kernel_graphics_q.set_parameters(3u, field_mode).enqueue_run();
#ifdef PARTICLES
if(visualization_modes&VIS_PARTICLES) kernel_graphics_particles.enqueue_run();
#endif // PARTICLES
if(visualization_modes&VIS_FIELD) {
switch(slice_mode) { // 0 (no slice), 1 (x), 2 (y), 3 (z), 4 (xz), 5 (xyz), 6 (yz), 7 (xy)
case 0: // no slice
kernel_graphics_field.set_parameters(3u, field_mode).enqueue_run();
break;
case 1: case 2: case 3: // x/y/z
kernel_graphics_field_slice.set_ranges(lbm->get_area((uint)clamp(slice_mode-1, 0, 2))).set_parameters(3u, field_mode, slice_mode, sx, sy, sz).enqueue_run();
break;
case 4: // xz
kernel_graphics_field_slice.set_ranges(lbm->get_area(0u)).set_parameters(3u, field_mode, 0u+1u, sx, sy, sz).enqueue_run();
kernel_graphics_field_slice.set_ranges(lbm->get_area(2u)).set_parameters(3u, field_mode, 2u+1u, sx, sy, sz).enqueue_run();
break;
case 5: // xyz
kernel_graphics_field_slice.set_ranges(lbm->get_area(0u)).set_parameters(3u, field_mode, 0u+1u, sx, sy, sz).enqueue_run();
kernel_graphics_field_slice.set_ranges(lbm->get_area(1u)).set_parameters(3u, field_mode, 1u+1u, sx, sy, sz).enqueue_run();
kernel_graphics_field_slice.set_ranges(lbm->get_area(2u)).set_parameters(3u, field_mode, 2u+1u, sx, sy, sz).enqueue_run();
break;
case 6: // yz
kernel_graphics_field_slice.set_ranges(lbm->get_area(1u)).set_parameters(3u, field_mode, 1u+1u, sx, sy, sz).enqueue_run();
kernel_graphics_field_slice.set_ranges(lbm->get_area(2u)).set_parameters(3u, field_mode, 2u+1u, sx, sy, sz).enqueue_run();
break;
case 7: // xy
kernel_graphics_field_slice.set_ranges(lbm->get_area(0u)).set_parameters(3u, field_mode, 0u+1u, sx, sy, sz).enqueue_run();
kernel_graphics_field_slice.set_ranges(lbm->get_area(1u)).set_parameters(3u, field_mode, 1u+1u, sx, sy, sz).enqueue_run();
break;
}
}
bitmap.enqueue_read_from_device();
if(lbm->get_D()>1u) zbuffer.enqueue_read_from_device();
return true; // new frame has been rendered
}
int* LBM_Domain::Graphics::get_bitmap() { // returns pointer to zbuffer
return bitmap.data();
}
int* LBM_Domain::Graphics::get_zbuffer() { // returns pointer to zbuffer
return zbuffer.data();
}
string LBM_Domain::Graphics::device_defines() const { return
"\n #define GRAPHICS"
"\n #define def_background_color " +to_string(GRAPHICS_BACKGROUND_COLOR)+""
"\n #define def_screen_width " +to_string(camera.width)+"u"
"\n #define def_screen_height " +to_string(camera.height)+"u"
"\n #define def_scale_u " +to_string(1.0f/(0.57735027f*(GRAPHICS_U_MAX)))+"f"
"\n #define def_scale_rho " +to_string(0.5f/(GRAPHICS_RHO_DELTA))+"f"
"\n #define def_scale_T " +to_string(0.5f/(GRAPHICS_T_DELTA))+"f"
"\n #define def_scale_F " +to_string(0.5f/(GRAPHICS_F_MAX))+"f"
"\n #define def_scale_Q_min " +to_string(GRAPHICS_Q_CRITERION)+"f"
"\n #define def_streamline_sparse "+to_string(GRAPHICS_STREAMLINE_SPARSE)+"u"
"\n #define def_streamline_length "+to_string(GRAPHICS_STREAMLINE_LENGTH)+"u"
"\n #define def_n " +to_string(1.333f)+"f" // refractive index of water for raytracing graphics
"\n #define def_attenuation " +to_string(ln(clamp(GRAPHICS_RAYTRACING_TRANSMITTANCE, 1E-9f, 1.0f))/(float)max(max(lbm->get_Nx(), lbm->get_Ny()), lbm->get_Nz()))+"f" // (negative) attenuation parameter for raytracing graphics
"\n #define def_absorption_color " +to_string(GRAPHICS_RAYTRACING_COLOR)+"" // absorption color of fluid for raytracing graphics
"\n #define COLOR_S (127<<16|127<<8|127)" // (stationary or moving) solid boundary
"\n #define COLOR_E ( 0<<16|255<<8| 0)" // equilibrium boundary (inflow/outflow)
"\n #define COLOR_M (255<<16| 0<<8|255)" // cells next to moving solid boundary
"\n #define COLOR_T (255<<16| 0<<8| 0)" // temperature boundary
"\n #define COLOR_F ( 0<<16| 0<<8|255)" // fluid
"\n #define COLOR_I ( 0<<16|255<<8|255)" // interface
"\n #define COLOR_0 (127<<16|127<<8|127)" // regular cell or gas
"\n #define COLOR_X (255<<16|127<<8| 0)" // reserved type X
"\n #define COLOR_Y (255<<16|255<<8| 0)" // reserved type Y
"\n #define COLOR_P (255<<16|255<<8|191)" // particles
#ifdef GRAPHICS_TRANSPARENCY
"\n #define GRAPHICS_TRANSPARENCY "+to_string(GRAPHICS_TRANSPARENCY)+"f"
#endif // GRAPHICS_TRANSPARENCY
#ifndef SURFACE
"\n #define def_skybox_width 1u"
"\n #define def_skybox_height 1u"
#else // SURFACE
"\n #define def_skybox_width " +to_string(skybox_image->width() )+"u"
"\n #define def_skybox_height "+to_string(skybox_image->height())+"u"
#endif // SURFACE
;}
#endif // GRAPHICS
vector<Device_Info> smart_device_selection(const uint D) {
const vector<Device_Info>& devices = get_devices(); // a vector of all available OpenCL devices
vector<Device_Info> device_infos(D);
const int user_specified_devices = (int)main_arguments.size();
if(user_specified_devices>0) { // user has selevted specific devices as command line arguments
if(user_specified_devices==D) { // as much specified devices as domains
for(uint d=0; d<D; d++) device_infos[d] = select_device_with_id(to_uint(main_arguments[d]), devices); // use list of devices IDs specified by user
} else {
print_warning("Incorrect number of devices specified. Using single fastest device for all domains.");
for(uint d=0; d<D; d++) device_infos[d] = select_device_with_most_flops(devices);
}
} else { // device auto-selection
vector<vector<Device_Info>> device_type_ids; // a vector of all different devices, containing vectors of their device IDs
for(uint i=0u; i<(uint)devices.size(); i++) {
const string name_i = devices[i].name;
bool already_exists = false;
for(uint j=0u; j<(uint)device_type_ids.size(); j++) {
const string name_j = device_type_ids[j][0].name;
if(name_i==name_j) {
device_type_ids[j].push_back(devices[i]);
already_exists = true;
}
}
if(!already_exists) device_type_ids.push_back(vector<Device_Info>(1, devices[i]));
}
float best_value = -1.0f;
int best_j = -1;
for(uint j=0u; j<(uint)device_type_ids.size(); j++) {
const float value = device_type_ids[j][0].tflops;
if((uint)device_type_ids[j].size()>=D && value>best_value) {
best_value = value;
best_j = j;
}
}
if(best_j>=0) { // select all devices of fastest device type with at least D devices of the same type
for(uint d=0; d<D; d++) device_infos[d] = device_type_ids[best_j][d];
} else {
print_warning("Not enough devices of the same type available. Using single fastest device for all domains.");
for(uint d=0; d<D; d++) device_infos[d] = select_device_with_most_flops(devices);
}
//for(uint j=0u; j<(uint)device_type_ids.size(); j++) print_info("Device Type "+to_string(j)+" ("+device_type_ids[j][0].name+"): "+to_string((uint)device_type_ids[j].size())+"x");
}
return device_infos;
}
LBM::LBM(const uint Nx, const uint Ny, const uint Nz, const float nu, const float fx, const float fy, const float fz, const float sigma, const float alpha, const float beta, const uint particles_N, const float particles_rho) // single device
:LBM(Nx, Ny, Nz, 1u, 1u, 1u, nu, fx, fy, fz, sigma, alpha, beta, particles_N, particles_rho) { // delegating constructor
}
LBM::LBM(const uint Nx, const uint Ny, const uint Nz, const float nu, const float fx, const float fy, const float fz, const uint particles_N, const float particles_rho)
:LBM(Nx, Ny, Nz, 1u, 1u, 1u, nu, fx, fy, fz, 0.0f, 0.0f, 0.0f, particles_N, particles_rho) { // delegating constructor
}
LBM::LBM(const uint Nx, const uint Ny, const uint Nz, const float nu, const uint particles_N, const float particles_rho)
:LBM(Nx, Ny, Nz, 1u, 1u, 1u, nu, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, particles_N, particles_rho) { // delegating constructor
}
LBM::LBM(const uint3 N, const uint Dx, const uint Dy, const uint Dz, const float nu, const float fx, const float fy, const float fz, const float sigma, const float alpha, const float beta, const uint particles_N, const float particles_rho)
:LBM(N.x, N.y, N.z, Dx, Dy, Dz, nu, fx, fy, fz, sigma, alpha, beta, particles_N, particles_rho) { // delegating constructor
}
LBM::LBM(const uint3 N, const float nu, const float fx, const float fy, const float fz, const float sigma, const float alpha, const float beta, const uint particles_N, const float particles_rho) // single device
:LBM(N.x, N.y, N.z, 1u, 1u, 1u, nu, fx, fy, fz, sigma, alpha, beta, particles_N, particles_rho) { // delegating constructor
}
LBM::LBM(const uint3 N, const float nu, const uint particles_N, const float particles_rho)
:LBM(N.x, N.y, N.z, 1u, 1u, 1u, nu, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, particles_N, particles_rho) { // delegating constructor
}
LBM::LBM(const uint3 N, const float nu, const float fx, const float fy, const float fz, const uint particles_N, const float particles_rho)
:LBM(N.x, N.y, N.z, 1u, 1u, 1u, nu, fx, fy, fz, 0.0f, 0.0f, 0.0f, particles_N, particles_rho) { // delegating constructor
}
LBM::LBM(const uint Nx, const uint Ny, const uint Nz, const uint Dx, const uint Dy, const uint Dz, const float nu, const float fx, const float fy, const float fz, const float sigma, const float alpha, const float beta, const uint particles_N, const float particles_rho) { // multiple devices
const uint NDx=(Nx/Dx)*Dx, NDy=(Ny/Dy)*Dy, NDz=(Nz/Dz)*Dz; // make resolution equally divisible by domains
if(NDx!=Nx||NDy!=Ny||NDz!=Nz) print_warning("LBM grid ("+to_string(Nx)+"x"+to_string(Ny)+"x"+to_string(Nz)+") is not equally divisible in domains ("+to_string(Dx)+"x"+to_string(Dy)+"x"+to_string(Dz)+"). Changing resolution to ("+to_string(NDx)+"x"+to_string(NDy)+"x"+to_string(NDz)+").");
this->Nx = NDx; this->Ny = NDy; this->Nz = NDz;
this->Dx = Dx; this->Dy = Dy; this->Dz = Dz;
const uint D = Dx*Dy*Dz;
const uint Hx=Dx>1u, Hy=Dy>1u, Hz=Dz>1u; // halo offsets
const vector<Device_Info>& device_infos = smart_device_selection(D);
sanity_checks_constructor(device_infos, this->Nx, this->Ny, this->Nz, Dx, Dy, Dz, nu, fx, fy, fz, sigma, alpha, beta, particles_N, particles_rho);
lbm_domain = new LBM_Domain*[D];
for(uint d=0u; d<D; d++) { // parallel_for((ulong)D, D, [&](ulong d) {
const uint x=((uint)d%(Dx*Dy))%Dx, y=((uint)d%(Dx*Dy))/Dx, z=(uint)d/(Dx*Dy); // d = x+(y+z*Dy)*Dx
lbm_domain[d] = new LBM_Domain(device_infos[d], this->Nx/Dx+2u*Hx, this->Ny/Dy+2u*Hy, this->Nz/Dz+2u*Hz, Dx, Dy, Dz, (int)(x*this->Nx/Dx)-(int)Hx, (int)(y*this->Ny/Dy)-(int)Hy, (int)(z*this->Nz/Dz)-(int)Hz, nu, fx, fy, fz, sigma, alpha, beta, particles_N, particles_rho);
} // });
{
Memory<float>** buffers_rho = new Memory<float>*[D];
for(uint d=0u; d<D; d++) buffers_rho[d] = &(lbm_domain[d]->rho);
rho = Memory_Container(this, buffers_rho, "rho");
} {
Memory<float>** buffers_u = new Memory<float>*[D];
for(uint d=0u; d<D; d++) buffers_u[d] = &(lbm_domain[d]->u);
u = Memory_Container(this, buffers_u, "u");
} {
Memory<uchar>** buffers_flags = new Memory<uchar>*[D];
for(uint d=0u; d<D; d++) buffers_flags[d] = &(lbm_domain[d]->flags);
flags = Memory_Container(this, buffers_flags, "flags");
} {
#ifdef FORCE_FIELD
Memory<float>** buffers_F = new Memory<float>*[D];
for(uint d=0u; d<D; d++) buffers_F[d] = &(lbm_domain[d]->F);
F = Memory_Container(this, buffers_F, "F");
#endif // FORCE_FIELD
} {
#ifdef SURFACE
Memory<float>** buffers_phi = new Memory<float>*[D];
for(uint d=0u; d<D; d++) buffers_phi[d] = &(lbm_domain[d]->phi);
phi = Memory_Container(this, buffers_phi, "phi");
#endif // SURFACE
} {
#ifdef TEMPERATURE
Memory<float>** buffers_T = new Memory<float>*[D];
for(uint d=0u; d<D; d++) buffers_T[d] = &(lbm_domain[d]->T);
T = Memory_Container(this, buffers_T, "T");
#endif // TEMPERATURE
} {
#ifdef PARTICLES
particles = &(lbm_domain[0]->particles);
#endif // PARTICLES
}
#ifdef GRAPHICS
graphics = Graphics(this);
#endif // GRAPHICS
}
LBM::~LBM() {
#ifdef GRAPHICS
camera.allow_rendering = false;
#endif // GRAPHICS
info.print_finalize();
for(uint d=0u; d<get_D(); d++) delete lbm_domain[d];
delete[] lbm_domain;
}
void LBM::sanity_checks_constructor(const vector<Device_Info>& device_infos, const uint Nx, const uint Ny, const uint Nz, const uint Dx, const uint Dy, const uint Dz, const float nu, const float fx, const float fy, const float fz, const float sigma, const float alpha, const float beta, const uint particles_N, const float particles_rho) { // sanity checks on grid resolution and extension support
if((ulong)Nx*(ulong)Ny*(ulong)Nz==0ull) print_error("Grid point number is 0: "+to_string(Nx)+"x"+to_string(Ny)+"x"+to_string(Nz)+" = 0.");
if(Dx*Dy*Dz==0u) print_error("You specified 0 LBM grid domains ("+to_string(Dx)+"x"+to_string(Dy)+"x"+to_string(Dz)+"). There has to be at least 1 domain in every direction. Check your input in LBM constructor.");
const uint local_Nx=Nx/Dx+2u*(Dx>1u), local_Ny=Ny/Dy+2u*(Dy>1u), local_Nz=Nz/Dz+2u*(Dz>1u);
uint memory_available = max_uint; // in MB
for(Device_Info device_info : device_infos) memory_available = min(memory_available, device_info.memory);
uint memory_required = (uint)((ulong)Nx*(ulong)Ny*(ulong)Nz/((ulong)(Dx*Dy*Dz))*(ulong)bytes_per_cell_device()/1048576ull); // in MB
if(memory_required>memory_available) {
float factor = cbrt((float)memory_available/(float)memory_required);
const uint maxNx=(uint)(factor*(float)Nx), maxNy=(uint)(factor*(float)Ny), maxNz=(uint)(factor*(float)Nz);
string message = "Grid resolution ("+to_string(Nx)+", "+to_string(Ny)+", "+to_string(Nz)+") is too large: "+to_string(Dx*Dy*Dz)+"x "+to_string(memory_required)+" MB required, "+to_string(Dx*Dy*Dz)+"x "+to_string(memory_available)+" MB available. Largest possible resolution is ("+to_string(maxNx)+", "+to_string(maxNy)+", "+to_string(maxNz)+"). Restart the simulation with lower resolution or on different device(s) with more memory.";
#if !defined(FP16S)&&!defined(FP16C)
uint memory_required_fp16 = (uint)((ulong)Nx*(ulong)Ny*(ulong)Nz/((ulong)(Dx*Dy*Dz))*(ulong)(bytes_per_cell_device()-velocity_set*2u)/1048576ull); // in MB
float factor_fp16 = cbrt((float)memory_available/(float)memory_required_fp16);
const uint maxNx_fp16=(uint)(factor_fp16*(float)Nx), maxNy_fp16=(uint)(factor_fp16*(float)Ny), maxNz_fp16=(uint)(factor_fp16*(float)Nz);
message += " Consider using FP16S/FP16C memory compression to double maximum grid resolution to a maximum of ("+to_string(maxNx_fp16)+", "+to_string(maxNy_fp16)+", "+to_string(maxNz_fp16)+"); for this, uncomment \"#define FP16S\" or \"#define FP16C\" in defines.hpp.";
#endif // !FP16S&&!FP16C
print_error(message);
}
if(nu==0.0f) print_error("Viscosity cannot be 0. Change it in setup.cpp."); // sanity checks for viscosity
else if(nu<0.0f) print_error("Viscosity cannot be negative. Remove the \"-\" in setup.cpp.");
#ifdef D2Q9
if(Nz!=1u) print_error("D2Q9 is the 2D velocity set. You have to set Nz=1u in the LBM constructor! Currently you have set Nz="+to_string(Nz)+"u.");
#endif // D2Q9
#if !defined(SRT)&&!defined(TRT)
print_error("No LBM collision operator selected. Uncomment either \"#define SRT\" or \"#define TRT\" in defines.hpp");
#elif defined(SRT)&&defined(TRT)
print_error("Too many LBM collision operators selected. Comment out either \"#define SRT\" or \"#define TRT\" in defines.hpp");
#endif // SRT && TRT
#ifndef VOLUME_FORCE
if(fx!=0.0f||fy!=0.0f||fz!=0.0f) print_error("Volume force is set in LBM constructor in main_setup(), but VOLUME_FORCE is not enabled. Uncomment \"#define VOLUME_FORCE\" in defines.hpp.");
#else // VOLUME_FORCE
#ifndef FORCE_FIELD
if(fx==0.0f&&fy==0.0f&&fz==0.0f) print_warning("The VOLUME_FORCE extension is enabled but the volume force in LBM constructor is set to zero. You may disable the extension by commenting out \"#define VOLUME_FORCE\" in defines.hpp.");
#endif // FORCE_FIELD
#endif // VOLUME_FORCE
#ifndef SURFACE
if(sigma!=0.0f) print_error("Surface tension is set in LBM constructor in main_setup(), but SURFACE is not enabled. Uncomment \"#define SURFACE\" in defines.hpp.");
#endif // SURFACE
#ifndef TEMPERATURE
if(alpha!=0.0f||beta!=0.0f) print_error("Thermal diffusion/expansion coefficients are set in LBM constructor in main_setup(), but TEMPERATURE is not enabled. Uncomment \"#define TEMPERATURE\" in defines.hpp.");
#else // TEMPERATURE
if(alpha==0.0f&&beta==0.0f) print_warning("The TEMPERATURE extension is enabled but the thermal diffusion/expansion coefficients alpha/beta in the LBM constructor are both set to zero. You may disable the extension by commenting out \"#define TEMPERATURE\" in defines.hpp.");
#endif // TEMPERATURE
#ifdef PARTICLES
if(particles_N==0u) print_error("The PARTICLES extension is enabled but the number of particles is set to 0. Comment out \"#define PARTICLES\" in defines.hpp.");
if(get_D()>1u) print_error("The PARTICLES extension is not supported in multi-GPU mode.");
#if !defined(VOLUME_FORCE)||!defined(FORCE_FIELD)
if(particles_rho!=1.0f) print_error("Particle density is set unequal to 1, but particle-fluid 2-way-coupling is not enabled. Uncomment both \"#define VOLUME_FORCE\" and \"#define FORCE_FIELD\" in defines.hpp.");
#endif // !VOLUME_FORCE||!FORCE_FIELD
#ifdef FORCE_FIELD
if(particles_rho==1.0f) print_warning("Particle density is set to 1, so particles behave as passive tracers without acting a force on the fluid, but particle-fluid 2-way-coupling is enabled. You may comment out \"#define FORCE_FIELD\" in defines.hpp.");
#endif // FORCE_FIELD
#else // PARTICLES
if(particles_N>0u) print_error("The PARTICLES extension is disabled but the number of particles is set to "+to_string(particles_N)+">0. Uncomment \"#define PARTICLES\" in defines.hpp.");
#endif // PARTICLES
}
void LBM::sanity_checks_initialization() { // sanity checks during initialization on used extensions based on used flags
uchar flags_used = 0u;
bool moving_boundaries_used=false, equilibrium_boundaries_used=false, surface_used=false, temperature_used=false; // identify used extensions based used flags
const uint threads = thread::hardware_concurrency();
vector<uchar> t_flags_used(threads, 0u);
vector<char> t_moving_boundaries_used(threads, false); // don't use vector<bool> as it uses bit-packing which is broken for multithreading
vector<char> t_equilibrium_boundaries_used(threads, false); // don't use vector<bool> as it uses bit-packing which is broken for multithreading
parallel_for(get_N(), threads, [&](ulong n, uint t) {
const uchar flagsn = flags[n];
const uchar flagsn_bo = flagsn&(TYPE_S|TYPE_E);
t_flags_used[t] = t_flags_used[t]|flagsn;
if(flagsn_bo&TYPE_S) t_moving_boundaries_used[t] = t_moving_boundaries_used[t] || (((flagsn_bo==TYPE_S)&&(u.x[n]!=0.0f||u.y[n]!=0.0f||u.z[n]!=0.0f))||(flagsn_bo==(TYPE_S|TYPE_E)));
t_equilibrium_boundaries_used[t] = t_equilibrium_boundaries_used[t] || flagsn_bo==TYPE_E;
});
for(uint t=0u; t<threads; t++) {
flags_used = flags_used|t_flags_used[t];
moving_boundaries_used = moving_boundaries_used || t_moving_boundaries_used[t];
equilibrium_boundaries_used = equilibrium_boundaries_used || t_equilibrium_boundaries_used[t];
}
surface_used = (bool)(flags_used&(TYPE_F|TYPE_I|TYPE_G));
temperature_used = (bool)(flags_used&TYPE_T);
#ifndef MOVING_BOUNDARIES
if(moving_boundaries_used) print_warning("Some boundary cells have non-zero velocity, but MOVING_BOUNDARIES is not enabled. If you intend to use moving boundaries, uncomment \"#define MOVING_BOUNDARIES\" in defines.hpp.");
#else // MOVING_BOUNDARIES
if(!moving_boundaries_used) print_warning("The MOVING_BOUNDARIES extension is enabled but no moving boundary cells (TYPE_S flag and velocity unequal to zero) are placed in the simulation box. You may disable the extension by commenting out \"#define MOVING_BOUNDARIES\" in defines.hpp.");
#endif // MOVING_BOUNDARIES
#ifndef EQUILIBRIUM_BOUNDARIES
if(equilibrium_boundaries_used) print_error("Some cells are set as equilibrium boundaries with the TYPE_E flag, but EQUILIBRIUM_BOUNDARIES is not enabled. Uncomment \"#define EQUILIBRIUM_BOUNDARIES\" in defines.hpp.");
#else // EQUILIBRIUM_BOUNDARIES
if(!equilibrium_boundaries_used) print_warning("The EQUILIBRIUM_BOUNDARIES extension is enabled but no equilibrium boundary cells (TYPE_E flag) are placed in the simulation box. You may disable the extension by commenting out \"#define EQUILIBRIUM_BOUNDARIES\" in defines.hpp.");
#endif // EQUILIBRIUM_BOUNDARIES
#ifndef SURFACE
if(surface_used) print_error("Some cells are set as fluid/interface/gas with the TYPE_F/TYPE_I/TYPE_G flags, but SURFACE is not enabled. Uncomment \"#define SURFACE\" in defines.hpp.");
#else // SURFACE
if(!surface_used) print_error("The SURFACE extension is enabled but no fluid/interface/gas cells (TYPE_F/TYPE_I/TYPE_G flags) are placed in the simulation box. Disable the extension by commenting out \"#define SURFACE\" in defines.hpp.");
#endif // SURFACE
#ifndef TEMPERATURE
if(temperature_used) print_error("Some cells are set as temperature boundary with the TYPE_T flag, but TEMPERATURE is not enabled. Uncomment \"#define TEMPERATURE\" in defines.hpp.");
#endif // TEMPERATURE
}
void LBM::initialize() { // write all data fields to device and call kernel_initialize
#ifndef BENCHMARK
sanity_checks_initialization();
#endif // BENCHMARK
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->rho.enqueue_write_to_device();
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->u.enqueue_write_to_device();
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->flags.enqueue_write_to_device();
#ifdef FORCE_FIELD
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->F.enqueue_write_to_device();
#endif // FORCE_FIELD
#ifdef SURFACE
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->phi.enqueue_write_to_device();
#endif // SURFACE
#ifdef TEMPERATURE
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->T.enqueue_write_to_device();
#endif // TEMPERATURE
#ifdef PARTICLES
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->particles.enqueue_write_to_device();
#endif // PARTICLES
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->increment_time_step(); // the communicate calls at initialization need an odd time step
communicate_rho_u_flags();
#ifdef SURFACE
communicate_phi_massex_flags();
#endif // SURFACE
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->enqueue_initialize(); // odd time step is baked-in the kernel
communicate_rho_u_flags();
#ifdef SURFACE
communicate_phi_massex_flags();
#endif // SURFACE
communicate_fi(); // time step must be odd here
#ifdef TEMPERATURE
communicate_T(); // T halo data is required for field_slice rendering
communicate_gi(); // time step must be odd here
#endif // TEMPERATURE
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->finish_queue();
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->reset_time_step(); // set time step to 0 again
initialized = true;
}
void LBM::do_time_step() { // call kernel_stream_collide to perform one LBM time step
#ifdef SURFACE
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->enqueue_surface_0();
#endif // SURFACE
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->enqueue_stream_collide(); // run LBM stream_collide kernel after domain communication
#if defined(SURFACE) || defined(GRAPHICS)
communicate_rho_u_flags(); // rho/u/flags halo data is required for SURFACE extension, and u halo data is required for Q-criterion rendering
#endif // SURFACE || GRAPHICS
#ifdef SURFACE
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->enqueue_surface_1();
communicate_flags();
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->enqueue_surface_2();
communicate_flags();
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->enqueue_surface_3();
communicate_phi_massex_flags();
#endif // SURFACE
communicate_fi();
#ifdef TEMPERATURE
#ifdef GRAPHICS
communicate_T(); // T halo data is required for field_slice rendering
#endif // GRAPHICS
communicate_gi();
#endif // TEMPERATURE
#ifdef PARTICLES
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->enqueue_integrate_particles(); // intgegrate particles forward in time and couple particles to fluid
#endif // PARTICLES
if(get_D()==1u) for(uint d=0u; d<get_D(); d++) lbm_domain[d]->finish_queue(); // this additional domain synchronization barrier is only required in single-GPU, as communication calls already provide all necessary synchronization barriers in multi-GPU
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->increment_time_step();
}
void LBM::run(const ulong steps, const ulong total_steps) { // initializes the LBM simulation (copies data to device and runs initialize kernel), then runs LBM
info.append(steps, total_steps, get_t()); // total_steps parameter is just for runtime estimation
if(!initialized) {
initialize();
info.print_initialize(this); // only print setup info if the setup is new (run() was not called before)
#ifdef GRAPHICS
camera.allow_rendering = true;
#endif // GRAPHICS
}
Clock clock;
for(ulong i=1ull; i<=steps; i++) {
#if defined(INTERACTIVE_GRAPHICS)||defined(INTERACTIVE_GRAPHICS_ASCII)
while(!key_P&&running) sleep(0.016);
if(!running) break;
#endif // INTERACTIVE_GRAPHICS_ASCII || INTERACTIVE_GRAPHICS
clock.start();
do_time_step();
info.update(clock.stop());
}
if(get_D()>1u) for(uint d=0u; d<get_D(); d++) lbm_domain[d]->finish_queue(); // wait for everything to finish (multi-GPU only)
}
void LBM::update_fields() { // update fields (rho, u, T) manually
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->enqueue_update_fields();
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->finish_queue();
}
void LBM::reset() { // reset simulation (takes effect in following run() call)
initialized = false;
}
#ifdef FORCE_FIELD
void LBM::calculate_force_on_boundaries() { // calculate forces from fluid on TYPE_S cells
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->enqueue_calculate_force_on_boundaries();
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->finish_queue();
}
float3 LBM::calculate_object_center_of_mass(const uchar flag_marker) { // calculate center of mass of all cells flagged with flag_marker
double3 com(0.0, 0.0, 0.0);
ulong counter = 0ull;
const uint threads = thread::hardware_concurrency();
vector<double3> coms(threads, double3(0.0, 0.0, 0.0));
vector<ulong> counters(threads, 0ull);
parallel_for(get_N(), threads, [&](ulong n, uint t) {
if(flags[n]==flag_marker) {
const float3 p = position(n);
coms[t].x += (double)p.x;
coms[t].y += (double)p.y;
coms[t].z += (double)p.z;
counters[t]++;
}
});
for(uint t=0u; t<threads; t++) {
com += coms[t];
counter += counters[t];
}
return float3((float)com.x/(float)counter, (float)com.y/(float)counter, (float)com.z/(float)counter)+center();
}
float3 LBM::calculate_force_on_object(const uchar flag_marker) { // add up force for all cells flagged with flag_marker
double3 force(0.0, 0.0, 0.0);
const uint threads = thread::hardware_concurrency();
vector<double3> forces(threads, double3(0.0, 0.0, 0.0));
parallel_for(get_N(), threads, [&](ulong n, uint t) {
if(flags[n]==flag_marker) {
forces[t].x += (double)F.x[n];
forces[t].y += (double)F.y[n];
forces[t].z += (double)F.z[n];
}
});
for(uint t=0u; t<threads; t++) force += forces[t];
return float3((float)force.x, (float)force.y, (float)force.z);
}
float3 LBM::calculate_torque_on_object(const float3& rotation_center, const uchar flag_marker) { // add up torque around specified rotation center for all cells flagged with flag_marker
double3 torque(0.0, 0.0, 0.0);
const float3 rotation_center_in_box = rotation_center-center();
const uint threads = thread::hardware_concurrency();
vector<double3> torques(threads, double3(0.0, 0.0, 0.0));
parallel_for(get_N(), threads, [&](ulong n, uint t) {
if(flags[n]==flag_marker) {
const float3 torquen = cross(position(n)-rotation_center_in_box, float3(F.x[n], F.y[n], F.z[n]));
torques[t].x += (double)torquen.x;
torques[t].y += (double)torquen.y;
torques[t].z += (double)torquen.z;
}
});
for(uint t=0u; t<threads; t++) torque += torques[t];
return float3((float)torque.x, (float)torque.y, (float)torque.z);
}
#endif // FORCE_FIELD
#ifdef MOVING_BOUNDARIES
void LBM::update_moving_boundaries() { // mark/unmark cells next to TYPE_S cells with velocity!=0 with TYPE_MS
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->enqueue_update_moving_boundaries();
communicate_flags();
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->finish_queue();
#ifdef GRAPHICS
camera.key_update = true; // to prevent flickering of flags in interactive graphics when camera is not moved
#endif // GRAPHICS
}
#endif // MOVING_BOUNDARIES
#if defined(PARTICLES)&&!defined(FORCE_FIELD)
void LBM::integrate_particles(const ulong steps, const ulong total_steps, const uint time_step_multiplicator) { // intgegrate passive tracer particles forward in time in stationary flow field
info.append(steps, total_steps, get_t());
Clock clock;
for(ulong i=1ull; i<=steps; i+=(ulong)time_step_multiplicator) {
#if defined(INTERACTIVE_GRAPHICS)||defined(INTERACTIVE_GRAPHICS_ASCII)
while(!key_P&&running) sleep(0.016);
if(!running) break;
#endif // INTERACTIVE_GRAPHICS_ASCII || INTERACTIVE_GRAPHICS
clock.start();
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->enqueue_integrate_particles(time_step_multiplicator);
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->finish_queue();
for(uint d=0u; d<get_D(); d++) lbm_domain[d]->increment_time_step(time_step_multiplicator);
info.update(clock.stop());
}
}
#endif // PARTICLES&&!FORCE_FIELD
void LBM::write_status(const string& path) { // write LBM status report to a .txt file
string status = "";
status += "Grid Resolution = "+to_string(Nx)+" x "+to_string(Ny)+" x "+to_string(Nz)+" = "+to_string(get_N())+"\n";
status += "Grid Domains = "+to_string(Dx)+" x "+to_string(Dy)+" x "+to_string(Dz)+" = "+to_string(get_D())+"\n";
status += "LBM Type = D"+string(get_velocity_set()==9 ? "2" : "3")+"Q"+to_string(get_velocity_set())+" "+info.collision+"\n";
status += "Memory Usage = CPU "+to_string(info.cpu_mem_required)+" MB, GPU "+to_string(get_D())+"x "+to_string(info.gpu_mem_required)+" MB\n";