diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6faa6ff17..e2efed6dc 100755 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -8,14 +8,14 @@ stages: - compile_debug - compile_no_mpi_threadmultiple - compile_no_openmp - - compile_omptasks - - run_omptasks +# - compile_omptasks +# - run_omptasks install: stage: install only: - develop - + script: # Force workdir cleaning in case of retried - echo "CI_PIPELINE_ID = " $CI_PIPELINE_ID @@ -33,7 +33,7 @@ compile_default: stage: compile_default only: - develop - + script: # Move in test dir - cd /sps3/gitlab-runner/$CI_PIPELINE_ID/smilei/validation @@ -44,7 +44,7 @@ runQuick: stage: run_quick only: - develop - + script: # Move in test dir - cd /sps3/gitlab-runner/$CI_PIPELINE_ID/smilei/validation @@ -55,7 +55,7 @@ run1D: stage: run_default only: - develop - + script: # Move in test dir - cd /sps3/gitlab-runner/$CI_PIPELINE_ID/smilei/validation @@ -67,7 +67,7 @@ run2D: stage: run_default only: - develop - + script: # Move in test dir - cd /sps3/gitlab-runner/$CI_PIPELINE_ID/smilei/validation @@ -81,7 +81,7 @@ run3D: stage: run_default only: - develop - + script: # Move in test dir - cd /sps3/gitlab-runner/$CI_PIPELINE_ID/smilei/validation @@ -96,7 +96,7 @@ runAM: stage: run_default only: - develop - + script: # Move in test dir - cd /sps3/gitlab-runner/$CI_PIPELINE_ID/smilei/validation @@ -108,7 +108,7 @@ runCollisions: stage: run_default only: - develop - + script: # Move in test dir - cd /sps3/gitlab-runner/$CI_PIPELINE_ID/smilei/validation @@ -164,21 +164,21 @@ compile_no_openmp: - make clean - python validation/validation.py -k noopenmp -c -v -compile_omptasks: - stage: compile_omptasks - only: - - develop - - script: - - cd /sps3/gitlab-runner/$CI_PIPELINE_ID/smilei - - make clean - - python validation/validation.py -k omptasks -c -v - -run_omptasks: - stage: run_omptasks - only: - - develop - - script: - - cd /sps3/gitlab-runner/$CI_PIPELINE_ID/smilei/validation - - python validation.py -k omptasks -b "tst2d_tasks_01_radiation_pressure_acc.py" -m 4 -o 4 -n 1 -v +#compile_omptasks: +# stage: compile_omptasks +# only: +# - develop +# +# script: +# - cd /sps3/gitlab-runner/$CI_PIPELINE_ID/smilei +# - make clean +# - python validation/validation.py -k omptasks -c -v +# +#run_omptasks: +# stage: run_omptasks +# only: +# - develop +# +# script: +# - cd /sps3/gitlab-runner/$CI_PIPELINE_ID/smilei/validation +# - python validation.py -k omptasks -b "tst2d_tasks_01_radiation_pressure_acc.py" -m 4 -o 4 -n 1 -v diff --git a/benchmarks/gpu/tst3d_gpu_o2_thermal_plasma_medium.py b/benchmarks/gpu/tst3d_gpu_o2_thermal_plasma_medium.py index a50614236..cb8c8f26a 100644 --- a/benchmarks/gpu/tst3d_gpu_o2_thermal_plasma_medium.py +++ b/benchmarks/gpu/tst3d_gpu_o2_thermal_plasma_medium.py @@ -66,7 +66,7 @@ def InitialChargeDensity(x, y, z): number_of_patches = kPatchPerGridDimension, EM_boundary_conditions = [ ["periodic"] ], print_every = 10, - random_seed = smilei_mpi_rank) + ) Vectorization(mode = "off") diff --git a/benchmarks/gpu/tst3d_gpu_o2_thermal_plasma_short.py b/benchmarks/gpu/tst3d_gpu_o2_thermal_plasma_short.py index 548746977..a627232f9 100644 --- a/benchmarks/gpu/tst3d_gpu_o2_thermal_plasma_short.py +++ b/benchmarks/gpu/tst3d_gpu_o2_thermal_plasma_short.py @@ -61,7 +61,6 @@ gpu_computing = True, # random_seed = 0xDEADBEEF, - random_seed = smilei_mpi_rank, ) Vectorization( diff --git a/benchmarks/gpu/tst3d_v_o2_thermal_plasma_medium.py b/benchmarks/gpu/tst3d_v_o2_thermal_plasma_medium.py index 524f564d0..92c598c19 100644 --- 
a/benchmarks/gpu/tst3d_v_o2_thermal_plasma_medium.py +++ b/benchmarks/gpu/tst3d_v_o2_thermal_plasma_medium.py @@ -67,7 +67,7 @@ def InitialChargeDensity(x, y, z): number_of_patches = kPatchPerGridDimension, EM_boundary_conditions = [ ["periodic"] ], print_every = 10, - random_seed = smilei_mpi_rank) + ) Vectorization(mode = "on") diff --git a/benchmarks/gpu/tst3d_v_o2_thermal_plasma_short.py b/benchmarks/gpu/tst3d_v_o2_thermal_plasma_short.py index 3672fd9d0..bc553aa62 100644 --- a/benchmarks/gpu/tst3d_v_o2_thermal_plasma_short.py +++ b/benchmarks/gpu/tst3d_v_o2_thermal_plasma_short.py @@ -61,7 +61,6 @@ gpu_computing = False, # random_seed = 0xDEADBEEF, - random_seed = smilei_mpi_rank, ) Vectorization( diff --git a/benchmarks/tst1d_24_cir_plane_wave_BTIS3.py b/benchmarks/tst1d_24_cir_plane_wave_BTIS3.py deleted file mode 100755 index cab778662..000000000 --- a/benchmarks/tst1d_24_cir_plane_wave_BTIS3.py +++ /dev/null @@ -1,95 +0,0 @@ -# _____________________________________________________________________________ -# -# Electron trajectory in a plane wave -# with a Gaussian temporal profile. -# -# Validation in the relativist regime -# -# _____________________________________________________________________________ - -import math - -# _____________________________________________________________________________ -# Main parameters - -l0 = 2.0*math.pi # laser wavelength -t0 = l0 # optical cicle -Lx = 80*l0 - -n0 = 1e-8 # particle density - -Tsim = 150.*t0 # duration of the simulation -resx = 64. # nb of cells in one laser wavelength - -dx = l0/resx # space step -dt = 0.95 * dx # timestep (0.95 x CFL) - -a0 = 5 -start = 0 # Laser start -fwhm = 10*t0 # Gaussian time fwhm -duration = 90*t0 # Laser duration -center = duration*0.5 # Laser profile center - -pusher = "borisBTIS3" - -# Density profile for inital location of the particles -def n0_(x): - if (dx`_). -As of April 2024, 181 papers have been published covering a broad range of topics: +As of May 2024, at least 192 papers have been published covering a broad range of topics: * laser-plasma interaction (LPI) / inertial fusion (FCI) * ultra-high intensity (UHI) applications @@ -50,13 +50,74 @@ Following is the distribution of these topics in the listed publications up to N Use the python script doc/doi2publications.py to generate entries from a DOI number, and paste them here You can count the number of papers in the list with the vim command :%s/.. \[//gn. + +.. [Sikorski2024] + + P. Sikorski, A. G. R. Thomas, S. S. Bulanov, M. Zepf and D. Seipt, + `Novel signatures of radiation reaction in electron–laser sidescattering`, + `New Journal of Physics 26 063011 (2024) `_ + +.. [Ivanov2024b] + + K. A. Ivanov, S. A. Shulyapov, D. A. Gorlova, I. P. Tsygvintsev, M. S. Krivokorytov, I. N. Tsymbalov, R. V. Volkov and A. B. Savelev, + `Laser-accelerated MeV-scale collimated electron bunch from a near-critical plasma of a liquid jet target`, + `Laser Physics Letters 21, 7 (2024) `_ + +.. [Malik2024] + + H. K. Malik, S. Kumar, and D. K. Singh, + `Effect of trapezoidal plasma density region in bubble wakefield acceleration`, + `Physica Scripta 99, 075601 (2024) `_ + +.. [Krafft2024b] + + C. Krafft, P. Savoini, and F. J. Polanco-Rodríguez, + `Mechanisms of Fundamental Electromagnetic Wave Radiation in the Solar Wind`, + `The Astrophysical Journal Letters 967, 2 (2024) `_ + +.. [Salgado2024] + + F. C. Salgado, A. Kozan, D. Seipt, D. Hollatz, P. Hilz, M. Kaluza, A. Sävert, A. Seidel, D. Ullmann, Y. Zhao, and M. 
Zepf, + `All-optical source size and emittance measurements of laser-accelerated electron beams`, + `Physical Review Accelerators and Beams 27, 052803 (2024) `_ + +.. [Ivanov2024a] + + K. A. Ivanov, D. A. Gorlova, I. N. Tsymbalov, I. P. Tsygvintsev, S. A. Shulyapov, R. V. Volkov, and A. B. Savel’ev, + `Laser-driven pointed acceleration of electrons with preformed plasma lens`, + `Physical Review Accelerators and Beams 27, 051301 (2024) `_ + +.. [Timmis2024] + + R. J. L. Timmis, R. W. Paddock, I. Ouatu, J. Lee, S. Howard, E. Atonga, R. T. Ruskov, H. Martin, R. H. W. Wang, R. Aboushelbaya, M. W. von der Leyen, E. Gumbrell and P. A. Norreys, + `Attosecond and nano‐Coulomb electron bunches via the Zero Vector Potential mechanism`, + `Scientific Reports volume 14, 10805 (2024) `_ + +.. [Azamoum2024] + + Y. Azamoum, G. A. Becker, S. Keppler, G. Duchateau, S. Skupin, M. Grech, F. Catoire, S. Hell, I. Tamer, M. Hornung, M. Hellwing, A. Kessler, F. Schorcht, and M. C. Kaluza, + `Optical probing of ultrafast laser-induced solid-to-overdense-plasma transitions`, + `Light: Science & Applications 13, 109 (2024) `_ + +.. [Pan2024] + + Z. Pan, J. Liu, P. Wang, Z. Mei, Z. Cao, D. Kong, S. Xu, Z. Liu, Y. Liang, Z. Peng, T. Xu, T. Song, X. Chen, Q. Wu, Y. Zhang, Q. Han, H. Chen, J. Zhao, Y. Gao, S. Chen, Y. Zhao, X. Yan, Y. Shou, W. Ma, + `Electron acceleration and x-ray generation from near-critical-density carbon nanotube foams driven by moderately relativistic lasers`, + `Physics of Plasmas 31, 043108 (2024) `_ + +.. [Yao2024] + + W. Yao, M. Nakatsutsumi, S. Buffechoux, P. Antici, M. Borghesi, A. Ciardi, S. N. Chen, E. d’Humières, L. Gremillet, R. Heathcote, V. Horný, P. McKenna, M. N. Quinn, L. Romagnani, R. Royle, G. Sarri, Y. Sentoku, H.-P. Schlenvoigt, T. Toncian, O. Tresca, L. Vassura, O. Willi, J. Fuchs, + `Optimizing laser coupling, matter heating, and particle acceleration from solids using multiplexed ultraintense lasers`, + `Matter and Radiation at Extremes 9, 047202 (2024) `_ + .. [Luo2024] M. Luo, C. Riconda, I. Pusztai, A. Grassi, J. S. Wurtele, and T. Fülöp, `Control of autoresonant plasma beat-wave wakefield excitation`, - `Phys. Rev. Research 6, 013338 (2024) `_ + `Physical Review Research 6, 013338 (2024) `_ -.. [Krafft2024] +.. [Krafft2024a] C. Krafft and P. Savoini, `Electrostatic Wave Decay in the Randomly Inhomogeneous Solar Wind`, @@ -127,7 +188,13 @@ Following is the distribution of these topics in the listed publications up to N A. Seidel, B. Lei, C. Zepter, M. C. Kaluza, A. Sävert, M. Zepf, and D. Seipt, `Polarization and CEP dependence of the transverse phase space in laser driven accelerators`, `Physical Review Research 6, 013056 (2024) `_ - + +.. [Krishnamurthy2023] + + S. Krishnamurthy, S. Chintalwad, A. P. L. Robinson, R. M. G. M. Trines, and B. Ramakrishna, + `Observation of proton modulations in laser–solid interaction`, + `Plasma Physics and Controlled Fusion 65 085020 (2023) `_ + .. [Gao2023b] X. Gao, @@ -216,7 +283,7 @@ Following is the distribution of these topics in the listed publications up to N E. Starodubtseva, I. Tsymbalov, D. Gorlova, K. Ivanov, and A. Savel'ev, `Low energy electron injection for direct laser acceleration`, - `Phys. Plasmas 30, 083105 (2023) `_ + `Physics of Plasmas 30, 083105 (2023) `_ .. [Maffini2023] @@ -228,7 +295,7 @@ Following is the distribution of these topics in the listed publications up to N S. Yu. Gus'kov, Ph. Korneev, and M. Murakami, `Laser-driven electrodynamic implosion of fast ions in a thin shell`, - `Matter Radiat. 
Extremes 8, 056602 (2023) `_ + `Matter and Radiation at Extremes 8, 056602 (2023) `_ .. [RezaeiPandari2023] @@ -240,19 +307,19 @@ Following is the distribution of these topics in the listed publications up to N J. Jonnerby, A. von Boetticher, J. Holloway, L. Corner, A. Picksley, A. J. Ross, R. J. Shalloo , C. Thornton, N. Bourgeois, R. Walczak, and S. M. Hooker, `Measurement of the decay of laser-driven linear plasma wakefields`, - `Phys. Rev. E 108, 055211 (2023) `_ + `Physical Review E 108, 055211 (2023) `_ .. [Drobniak2023] P. Drobniak, E. Baynard, C. Bruni, K. Cassou, C. Guyot, G. Kane, S. Kazamias, V. Kubytskyi, N. Lericheux, B. Lucas, M. Pittman, F. Massimo, A. Beck, A. Specka, P. Nghiem, and D. Minenna, `Random scan optimization of a laser-plasma electron injector based on fast particle-in-cell simulations`, - `Phys. Rev. Accel. Beams 26, 091302 (2023) `_ + `Physical Review Accelerators and Beams 26, 091302 (2023) `_ .. [Bukharskii2023] N. Bukharskii and Ph. Korneev, `Intense widely controlled terahertz radiation from laser-driven wires`, - `Matter Radiat. Extremes 8, 044401 (2023) `_ + `Matter and Radiation at Extremes 8, 044401 (2023) `_ .. [Schmitz2023] @@ -276,7 +343,7 @@ Following is the distribution of these topics in the listed publications up to N X. Gao, `Ionization dynamics of sub-micrometer-sized clusters in intense ultrafast laser pulses`, - `Phys. Plasmas 30, 052102 (2023) `_ + `Physics of Plasmas 30, 052102 (2023) `_ .. [Krafft2023] @@ -294,7 +361,7 @@ Following is the distribution of these topics in the listed publications up to N A. Ghizzo, D. Del Sarto, and H. Betar, `Collisionless Heating Driven by Vlasov Filamentation in a Counterstreaming Beams Configuration`, - `Phys. Rev. Lett. 131, 035101 (2023) `_ + `Physical Review Letters 131, 035101 (2023) `_ .. [Yang2023] @@ -306,31 +373,31 @@ Following is the distribution of these topics in the listed publications up to N W. Yao, A. Fazzini, S.N. Chen, K. Burdonov, J. Béard, M. Borghesi, A. Ciardi, M. Miceli, S. Orlando, X. Ribeyre, E. d'Humières and J. Fuchs, `Investigating particle acceleration dynamics in interpenetrating magnetized collisionless super-critical shocks`, - `J. Plasma Phys. 89, 915890101 (2023) `_ + `Journal of Plasma Physics 89, 915890101 (2023) `_ .. [Pak2023] T. Pak, M. Rezaei-Pandari, S. B. Kim, G. Lee, D. H. Wi, C. I. Hojbota, M. Mirzaie, H. Kim, J. H. Sung, S. K. Lee, C. Kang and K.-Y. Kim, `Multi-millijoule terahertz emission from laser-wakefield-accelerated electrons`, - `Light Sci Appl 12, 37 (2023) `_ + `Light: Science and Applications 12, 37 (2023) `_ .. [Istokskaia2023] V. Istokskaia, M. Tosca, L. Giuffrida, J. Psikal, F. Grepl, V. Kantarelou, S. Stancek, S. Di Siena, A. Hadjikyriacou, A. McIlvenny, Y. Levy, J. Huynh, M. Cimrman, P. Pleskunov, D. Nikitin, A. Choukourov, F. Belloni, A. Picciotto, S. Kar, M. Borghesi, A. Lucianetti, T. Mocek and D. Margarone, `A multi-MeV alpha particle source via proton-boron fusion driven by a 10-GW tabletop laser`, - `Commun Phys 6, 27 (2023) `_ + `Communications Physics 6, 27 (2023) `_ .. [Yoon2023] Y. D. Yoon, D. E. Wendel and G. S. Yun, `Equilibrium selection via current sheet relaxation and guide field amplification`, - `Nat Commun 14, 139 (2023) `_ + `Nature Communications 14, 139 (2023) `_ .. [Galbiati2023] M. Galbiati, A. Formenti, M. Grech and M. Passoni, `Numerical investigation of non-linear inverse Compton scattering in double-layer targets`, - `Front. Phys. 
11, fphy.2023.1117543 (2023) `_ + `Frontiers in Physics 11, fphy.2023.1117543 (2023) `_ .. [Sakai2023] @@ -342,7 +409,7 @@ Following is the distribution of these topics in the listed publications up to N A. Golovanov, I. Yu. Kostyukov, A. Pukhov and V. Malka, `Energy-Conserving Theory of the Blowout Regime of Plasma Wakefield`, - `Phys. Rev. Lett. 130, 105001 (2023) `_ + `Physical Review Letters 130, 105001 (2023) `_ .. [Miethlinger2023] @@ -354,13 +421,13 @@ Following is the distribution of these topics in the listed publications up to N C. Zepter, A. Seidel, M. Zepf, M. C. Kaluza and A. Sävert, `Role of spatiotemporal couplings in stimulated Raman side scattering`, - `Phys. Rev. Research 5, L012023 (2023) `_ + `Physical Review Research 5, L012023 (2023) `_ .. [Marini2023] S. Marini, M. Grech, P. S. Kleij, M. Raynaud and C. Riconda, `Electron acceleration by laser plasma wedge interaction`, - `Phys. Rev. Research 5, 013115 (2023) `_ + `Physical Review Research 5, 013115 (2023) `_ .. [Blackman2022] @@ -420,7 +487,7 @@ Following is the distribution of these topics in the listed publications up to N D. Margarone, J. Bonvalet, L. Giuffrida, A. Morace, V. Kantarelou, M. Tosca, D. Raffestin, P. Nicolai, A. Picciotto, Y. Abe, Y. Arikawa, S. Fujioka, Y. Kuramitsu, H. Habara and D. Batani, `In-Target Proton–Boron Nuclear Fusion Using a PW-Class Laser`, - `Appl. Sci. 12(3), 1444 (2022) `_ + `Applied Sciences 12(3), 1444 (2022) `_ .. [Kochetkov2022] @@ -432,13 +499,13 @@ Following is the distribution of these topics in the listed publications up to N A. Oudin, A. Debayle, C. Ruyer, D. Benisti, `Cross-beam energy transfer between spatially smoothed laser beams`, - `Phys. Plasmas 29, 112112 (2022) `_ + `Physics of Plasmas 29, 112112 (2022) `_ .. [Chen2022] Q. Chen, D. Maslarova, J. Wang, S. Li, and D. Umstadter, `Injection of electron beams into two laser wakefields and generation of electron rings`, - `Phys. Rev. E 106, 055202 (2022) `_ + `Physical Review E 106, 055202 (2022) `_ .. [Kumar2022b] @@ -450,7 +517,7 @@ Following is the distribution of these topics in the listed publications up to N S. Kumar, D. K. Singh and H. K. Malik, `Comparative study of ultrashort single-pulse and multi-pulse driven laser wakefield acceleration`, - `Laser Phys. Lett. 20, 026001 (2022) `_ + `Laser Physics Letters 20, 026001 (2022) `_ .. [Miloshevsky2022] @@ -474,25 +541,25 @@ Following is the distribution of these topics in the listed publications up to N I. Ouatu, B. T. Spiers, R. Aboushelbaya, Q. Feng, M. W. von der Leyen, R. W. Paddock, R. Timmis, C. Ticos, K. M. Krushelnick and P. A. Norreys, `Ionization states for the multipetawatt laser-QED regime`, - `Phys. Rev. E 106, 015205 (2022) `_ + `Physical Review E 106, 015205 (2022) `_ .. [Beth2022] A. Beth, H. Gunell, C. Simon Wedlund, C. Goetz, H. Nilsson and M. Hamrin, `First investigation of the diamagnetic cavity boundary layer with a 1D3V PIC simulation`, - `A&A 667, A143 (2022) `_ + `Astronomy & Astrophysics 667, A143 (2022) `_ .. [Guo2022] Y. Guo, X. Geng, L. Ji, B. Shen and R. Li, `Improving the accuracy of hard photon emission by sigmoid sampling of the quantum-electrodynamic table in particle-in-cell Monte Carlo simulations`, - `Phys. Rev. E 105, 025309 (2022) `_ + `Physical Review E 105, 025309 (2022) `_ .. [Pae2022] - K. . Pae, C. M. Kim, V. B. Pathak, C.-M. Ryu and C. H. Nam, + K. H. Pae, C. M. Kim, V. B. Pathak, C.-M. Ryu and C. H.
Nam, `Direct laser acceleration of electrons from a plasma mirror by an intense few-cycle Laguerre–Gaussian laser and its dependence on the carrier-envelope phase`, - `Plasma Phys. Control. Fusion 64, 055013 (2022) `_ + `Plasma Physics and Controlled Fusion 64, 055013 (2022) `_ .. [Zhang2022a] @@ -505,43 +572,43 @@ Following is the distribution of these topics in the listed publications up to N Q. Han, X. Geng, B. Shen, Z. Xu and L. Ji, `Ultra-fast polarization of a thin electron layer in the rotational standing-wave field driven by double ultra-intense laser pulses`, - `New J. Phys. 24, 063013 (2022) `_ + `New Journal of Physics 24, 063013 (2022) `_ .. [Gothel2022] I. Göthel, C. Bernert, M. Bussmann, M. Garten, T. Miethlinger, M. Rehwald, K. Zeil, T. Ziegler, T. E. Cowan, U. Schramm and T. Kluge, `Optimized laser ion acceleration at the relativistic critical density surface`, - `Plasma Phys. Control. Fusion 64, 044010 (2022) `_ + `Plasma Physics and Controlled Fusion 64, 044010 (2022) `_ .. [Fazzini2022] A. Fazzini, W. Yao, K. Burdonov, J. Béard, S. N. Chen, A. Ciardi, E. d’Humières, R. Diab, E. D. Filippov, S. Kisyov, V. Lelasseux, M. Miceli, Q. Moreno, S. Orlando, S. Pikuz, X. Ribeyre, M. Starodubtsev, R. Zemskov and J. Fuchs, `Particle energization in colliding subcritical collisionless shocks investigated in the laboratory`, - `A&A 665, A87 (2022) `_ + `Astronomy & Astrophysics 665, A87 (2022) `_ .. [Bykov2022] A. M. Bykov, S. M. Osipov and V. I. Romanskii, `Acceleration of Cosmic Rays to Energies above 1015 eV by Transrelativistic Shocks`, - `J. Exp. Theor. Phys. 134, 487-497 (2022) `_ + `Journal of Experimental and Theoretical Physics 134, 487-497 (2022) `_ .. [Sundstrom2022] A. Sundström, M. Grech, I. Pusztai and C. Riconda, `Stimulated-Raman-scattering amplification of attosecond XUV pulses with pulse-train pumps and application to local in-depth plasma-density measurement`, - `Phys. Rev. E 106, 045208 (2022) `_ + `Physical Review E 106, 045208 (2022) `_ .. [Krafft2022b] C. Krafft and P. Savoini, `Third and Fourth Harmonics of Electromagnetic Emissions by a Weak Beam in a Solar Wind Plasma with Random Density Fluctuations`, - `ApJL 934, L28 (2022) `_ + `The Astrophysical Journal Letters 934, L28 (2022) `_ .. [Krafft2022a] C. Krafft and P. Savoini, `Fundamental Electromagnetic Emissions by a Weak Electron Beam in Solar Wind Plasmas with Density Fluctuations`, - `ApJL 924, L24 (2022) `_ + `The Astrophysical Journal Letters 924, L24 (2022) `_ .. [Kong2022] @@ -553,7 +620,7 @@ Following is the distribution of these topics in the listed publications up to N C. Davidson, Z.-M. Sheng, T. Wilson and P. McKenna, `Theoretical and computational studies of the Weibel instability in several beam–plasma interaction configurations`, - `J. Plasma Phys. 88, 905880206 (2022) `_ + `Journal of Plasma Physics 88, 905880206 (2022) `_ .. [Glek2022] @@ -565,7 +632,7 @@ Following is the distribution of these topics in the listed publications up to N D. Umstadter `Controlled Injection of Electrons for Improved Performance of Laser-Wakefield Acceleration`, - `United States: N. p., (2022) `_ + `United States Department of Energy Technical Report (2022) `_ .. [Massimo2022] @@ -584,7 +651,7 @@ Following is the distribution of these topics in the listed publications up to N P. K. Singh, F.-Y. Li, C.-K. Huang, A. Moreau, R. Hollinger, A. Junghans, A. Favalli, C. Calvi, S. Wang, Y. Wang, H. Song, J. J. Rocca, R. E. Reinovsky and S. 
Palaniyappan, `Vacuum laser acceleration of super-ponderomotive electrons using relativistic transparency injection`, - `Nat Commun 13, 54 (2022) `_ + `Nature Communications 13, 54 (2022) `_ .. [Lobet2022] @@ -615,13 +682,13 @@ Following is the distribution of these topics in the listed publications up to N P. Tomassini, F. Massimo, L. Labate and L. A. Gizzi, `Accurate electron beam phase-space theory for ionization-injection schemes driven by laser pulses`, - `High Pow Laser Sci Eng 10, e15 (2021) `_ + `High Power Laser Science and Engineering 10, e15 (2021) `_ .. [Meinhold2021] T. A. Meinhold and N. Kumar, `Radiation pressure acceleration of protons from structured thin-foil targets`, - `J. Plasma Phys. 87, 905870607 (2021) `_ + `Journal of Plasma Physics 87, 905870607 (2021) `_ .. [Bonvalet2021b] @@ -633,13 +700,13 @@ Following is the distribution of these topics in the listed publications up to N Y. Shi, D. R. Blackman and A. Arefiev, `Electron acceleration using twisted laser wavefronts`, - `Plasma Phys. Control. Fusion 63, 125032 (2021) `_ + `Plasma Physics and Controlled Fusion 63, 125032 (2021) `_ .. [Kumar2021] N. Kumar and B. Reville, `Nonthermal Particle Acceleration at Highly Oblique Nonrelativistic Shocks`, - `ApJL 921, L14 (2021) `_ + `The Astrophysical Journal Letters 921, L14 (2021) `_ .. [Ghaith2021] @@ -651,13 +718,13 @@ Following is the distribution of these topics in the listed publications up to N V. Horný and L. Veisz, `Generation of single attosecond relativistic electron bunch from intense laser interaction with a nanosphere`, - `Plasma Phys. Control. Fusion 63, 125025 (2021) `_ + `Plasma Physics and Controlled Fusion 63, 125025 (2021) `_ .. [Krafft2021] C. Krafft and P. Savoini, `Second Harmonic Electromagnetic Emissions by an Electron Beam in Solar Wind Plasmas with Density Fluctuations`, - `ApJL 917, L23 (2021) `_ + `The Astrophysical Journal Letters 917, L23 (2021) `_ .. [Khalilzadeh2021c] @@ -681,7 +748,7 @@ Following is the distribution of these topics in the listed publications up to N Y. Shou, D. Wang, P. Wang, J. Liu, Z. Cao, Z. Mei, S. Xu, Z. Pan, D. Kong, G. Qi, Z. Liu, Y. Liang, Z. Peng, Y. Gao, S. Chen, J. Zhao, Y. Zhao, H. Xu, J. Zhao, Y. Wu, X. Yan and W. Ma, `High-efficiency generation of narrowband soft x rays from carbon nanotube foams irradiated by relativistic femtosecond lasers`, - `Opt. Lett. 46, 3969 (2021) `_ + `Optics Letters 46, 3969 (2021) `_ .. [Khalilzadeh2021b] @@ -693,67 +760,67 @@ Following is the distribution of these topics in the listed publications up to N H. Hosseinkhani, M. Pishdast, J. Yazdanpanah and S. A. Ghasemi, `Investigation of the classical and quantum radiation reaction effect on interaction of ultra high power laser with near critical plasma`, - `J. Nuclear Sci. Technol. 42, 27-35 (2021) `_ + `Journal of Nuclear Science, Engineering and Technology 42, 27-35 (2021) `_ .. [MercuriBaron2021] A. Mercuri-Baron, M. Grech, F. Niel, A. Grassi, M. Lobet, A. Di Piazza and C. Riconda, `Impact of the laser spatio-temporal shape on Breit–Wheeler pair production`, - `New J. Phys. 23, 085006 (2021) `_ + `New Journal of Physics 23, 085006 (2021) `_ .. [Peng2021] H. Peng, C. Riconda, S. Weber, C.T. Zhou and S.C. Ruan, `Frequency Conversion of Lasers in a Dynamic Plasma Grating`, - `Phys. Rev. Applied 15, 054053 (2021) `_ + `Physical Review Applied 15, 054053 (2021) `_ .. [Shi2021a] Y. Shi, D. Blackman, D. Stutman and A. 
Arefiev, `Generation of Ultrarelativistic Monoenergetic Electron Bunches via a Synergistic Interaction of Longitudinal Electric and Magnetic Fields of a Twisted Laser`, - `Phys. Rev. Lett. 126, 234801 (2021) `_ + `Physical Review Letters 126, 234801 (2021) `_ .. [Bonvalet2021a] J. Bonvalet, Ph. Nicolaï, D. Raffestin, E. D'humieres, D. Batani, V. Tikhonchuk, V. Kantarelou, L. Giuffrida, M. Tosca, G. Korn, A. Picciotto, A. Morace, Y. Abe, Y. Arikawa, S. Fujioka, Y. Fukuda, Y. Kuramitsu, H. Habara and D. Margarone, `Energetic α-particle sources produced through proton-boron reactions by high-energy high-intensity laser beams`, - `Phys. Rev. E 103, 053202 (2021) `_ + `Physical Review E 103, 053202 (2021) `_ .. [Shekhanov2021] S. A. Shekhanov and V. T. Tikhonchuk, `SRS-SBS competition and nonlinear laser energy absorption in a high temperature plasma`, - `Plasma Phys. Control. Fusion 63, 115016 (2021) `_ + `Plasma Physics and Controlled Fusion 63, 115016 (2021) `_ .. [Psikal2021] - J Psikal, + J. Psikal, `Laser-driven ion acceleration from near-critical Gaussian plasma density profile`, - `Plasma Phys. Control. Fusion 63, 064002 (2021) `_ + `Plasma Physics and Controlled Fusion 63, 064002 (2021) `_ .. [Yoon2021b] Y. D. Yoon, G. S. Yun, D. E. Wendel and J. L. Burch, `Collisionless relaxation of a disequilibrated current sheet and implications for bifurcated structures`, - `Nat Commun 12, 3774 (2021) `_ + `Nature Communications 12, 3774 (2021) `_ .. [Lavorenti2021] F. Lavorenti, P. Henri, F. Califano, S. Aizawa and N. André, `Electron acceleration driven by the lower-hybrid-drift instability. An extended quasilinear model`, - `A&A 652, 202141049 (2021) `_ + `Astronomy & Astrophysics 652, 202141049 (2021) `_ .. [Golovanov2021] - A A Golovanov, I Yu Kostyukov, L Reichwein, J Thomas and A Pukhov, + A. A. Golovanov, I. Y. Kostyukov, L. Reichwein, J. Thomas and A. Pukhov, `Excitation of strongly nonlinear plasma wakefield by electron bunches`, - `Plasma Phys. Control. Fusion 63, 085004 (2021) `_ + `Plasma Physics and Controlled Fusion 63, 085004 (2021) `_ .. [Jirka2021] M. Jirka, P. Sasorov, S. S. Bulanov, G. Korn, B. Rus and S. V. Bulanov, `Reaching high laser intensity by a radiating electron`, - `Phys. Rev. A 103, 053114 (2021) `_ + `Physical Review A 103, 053114 (2021) `_ .. [Marques2021] @@ -783,7 +850,7 @@ Following is the distribution of these topics in the listed publications up to N G. Cantono, A. Permogorov, J. Ferri, E. Smetanina, A. Dmitriev, A. Persson, T. Fülöp and C.-G. Wahlström, `Laser-driven proton acceleration from ultrathin foils with nanoholes`, - `Sci Rep 11, 5006 (2021) `_ + `Scientific Reports 11, 5006 (2021) `_ .. [Perez2021] @@ -801,13 +868,13 @@ Following is the distribution of these topics in the listed publications up to N A. Sampath, X. Davoine, S. Corde, L. Gremillet, M. Gilljohann, M. Sangal, C. H. Keitel, R. Ariniello, J. Cary, H. Ekerfelt, C. Emma, F. Fiuza, H. Fujii, M. Hogan, C. Joshi, A. Knetsch, O. Kononenko, V. Lee, M. Litos, K. Marsh, Z. Nie, B. O’Shea, J. R. Peterson, P. San Miguel Claveria, D. Storey, Y. Wu, X. Xu, C. Zhang and M. Tamburini, `Extremely Dense Gamma-Ray Pulses in Electron Beam-Multifoil Collisions`, - `Phys. Rev. Lett. 126, 064801 (2021) `_ + `Physical Review Letters 126, 064801 (2021) `_ .. [Marini2021a] S. Marini, P. S. Kleij, F. Pisani, F. Amiranoff, M. Grech, A. Macchi, M. Raynaud and C. Riconda, `Ultrashort high energy electron bunches from tunable surface plasma waves driven with laser wavefront rotation`, - `Phys. Rev. 
E 103, L021201 (2021) `_ + `Physical Review E 103, L021201 (2021) `_ .. [Yao2021] @@ -819,14 +886,14 @@ Following is the distribution of these topics in the listed publications up to N E. G. Gelfer, A. M, Fedotov and S. Weber, `Radiation induced acceleration of ions in a laser irradiated transparent foil`, - `New J. Phys. 23, 095002 (2021) `_ + `New Journal of Physics 23, 095002 (2021) `_ `arXiv:1907.02621 `_ .. [Siminos2021] E. Siminos, I. Thiele and C. Olofsson, `Laser Wakefield Driven Generation of Isolated Carrier-Envelope-Phase Tunable Intense Subcycle Pulses`, - `Phys. Rev. Lett. 126, 044801 (2021) `_ + `Physical Review Letters 126, 044801 (2021) `_ `arXiv:1902.05014 `_ .. [Budriga2020] @@ -839,13 +906,13 @@ Following is the distribution of these topics in the listed publications up to N P. A. P. Nghiem, R. Assmann, A. Beck et al., `Toward a plasma-based accelerator at high beam energy with high beam charge and high beam quality`, - `Phys. Rev. Accel. Beams 23, 031301 (2020) `_ + `Physical Review Accelerators and Beams 23, 031301 (2020) `_ .. [Pisarczyk2020] T. Pisarczyk, M. Kalal, S. Yu. Gus'kov et al., `Hot electron retention in laser plasma created under terawatt subnanosecond irradiation of Cu targets`, - `Plasma Phys. Control. Fusion 62, 115020 (2020) `_ + `Plasma Physics and Controlled Fusion 62, 115020 (2020) `_ .. [Pagano2020] @@ -863,25 +930,25 @@ Following is the distribution of these topics in the listed publications up to N H. Peng, C. Riconda, M. Grech, C.-T. Zhou and S. Weber, `Dynamical aspects of plasma gratings driven by a static ponderomotive potential`, - `Plasma Phys. Control. Fusion 62, 115015 (2020) `_ + `Plasma Physics and Controlled Fusion 62, 115015 (2020) `_ .. [Glek2020] P. B. Glek, A. A. Voronin, V. Ya. Panchenko and A. M. Zheltikov, `Relativistic electron bunches locked to attosecond optical field waveforms: an attosecond light–matter bound state`, - `Laser Phys. Lett. 17 055401 (2020) `_ + `Laser Physics Letters 17 055401 (2020) `_ .. [Margarone2020] D. Margarone, A. Morace, J. Bonvalet et al., `Generation of α-Particle Beams With a Multi-kJ, Peta-Watt Class Laser System`, - `Front. Phys. 8, 343 (2020) `_ + `Frontiers in Physics 8, 343 (2020) `_ .. [Sinha2020] U. Sinha and N. Kumar, `Pair-beam propagation in a magnetized plasma for modeling the polarized radiation emission from gamma-ray bursts in laboratory astrophysics experiments`, - `Phys. Rev. E 101, 063204 (2020) `_ + `Physical Review E 101, 063204 (2020) `_ .. [Mitrofanov2020] @@ -893,81 +960,81 @@ Following is the distribution of these topics in the listed publications up to N B. T. Spiers, M. P. Hill, C. Brown, L. Ceurvorst, N. Ratan, A. F. Savin, P. Allan, E. Floyd, J. Fyrth, L. Hobbs, S. James, J. Luis, M. Ramsay, N. Sircombe, J. Skidmore, R. Aboushelbaya, M. W. Mayr, R. Paddock, R. H. W. Wang and P. A. Norreys, `Whole-beam self-focusing in fusion-relevant plasma`, - `Phil. Trans. R. Soc. A379, 20200159 `_ + `Philosophical Transactions of the Royal Society A379, 20200159 `_ .. [Derouillat2020] J. Derouillat and A. Beck, `Single Domain Multiple Decompositions for Particle-in-Cell simulations`, - `J. Phys.: Conf. Ser. 1596, 012052 (2020) `_ + `Journal of Physics: Conference Series 1596, 012052 (2020) `_ `arXiv:1912.04064 `_ .. [Zemzemi2020] I. Zemzemi, F. Massimo and A. Beck, `Azimuthal decomposition study of a realistic laser profile for efficient modeling of Laser WakeField Acceleration`, - `J. Phys.: Conf. Ser. 
1596, 012055 (2020) `_ + `Journal of Physics: Conference Series 1596, 012055 (2020) `_ .. [Massimo2020b] F. Massimo, I. Zemzemi, A. Beck, J. Derouillat and A. Specka, `Efficient cylindrical envelope modeling for laser wakefield acceleration`, - `J. Phys.: Conf. Ser. 1596, 012054 (2020) `_ + `Journal of Physics: Conference Series 1596, 012054 (2020) `_ `arXiv:1912.04674 `_ .. [Massimo2020a] F. Massimo, A. Beck, J. Derouillat, I. Zemzemi and A. Specka, `Numerical modeling of laser tunneling ionization in particle-in-cell codes with a laser envelope model`, - `Phys. Rev. E 102, 033204 (2020) `_ + `Physical Review E 102, 033204 (2020) `_ `arXiv:2006.04433 `_ .. [Marcowith2020] A. Marcowith, G. Ferrand, M. Grech, Z. Meliani, I. Plotnikov and R. Walder, `Multi-scale simulations of particle acceleration in astrophysical systems`, - `Living Rev Comput Astrophys 6, 1 (2020) `_ + `Living Reviews in Computational Astrophysics 6, 1 (2020) `_ `arXiv:2002.09411 `_ .. [Dargent2020] J. Dargent, N. Aunai, B. Lavraud, S. Toledo‐Redondo and F. Califano, `Simulation of Plasmaspheric Plume Impact on Dayside Magnetic Reconnection`, - `Geophys. Res. Lett. 47, 2019GL086546 (2020) `_ + `Geophysical Research Letters 47, 2019GL086546 (2020) `_ `arXiv:2002.02243 `_ .. [Sundström2020b] A. Sundström, L. Gremillet, E. Siminos and I. Pusztai, `Collisional effects on the electrostatic shock dynamics in thin-foil targets driven by an ultraintense short pulse laser`, - `Plasma Phys. Control. Fusion 62, 085015 (2020) `_ + `Plasma Physics and Controlled Fusion 62, 085015 (2020) `_ .. [Sundström2020a] A. Sundström, L. Gremillet, E. Siminos and I. Pusztai, `Fast collisional electron heating and relaxation in thin foils driven by a circularly polarized ultraintense short-pulse laser`, - `J. Plasma Phys. 86, 755860201 (2020) `_ + `Journal of Plasma Physics 86, 755860201 (2020) `_ `arXiv:1911.09562 `_ .. [Gelfer2020] E. G. Gelfer, A. M. Fedotov, O. Klimo and S. Weber, `Absorption and opacity threshold for a thin foil in a strong circularly polarized laser field`, - `Phys. Rev. E 101, 033204 (2020) `_ + `Physical Review E 101, 033204 (2020) `_ `arXiv:1906.05902 `_ .. [Ferri2020] J. Ferri, I. Thiele, E. Siminos, L. Gremillet, E. Smetanina, A. Dmitriev, G. Cantono, C.-G. Wahlström and T. Fülöp, `Enhancement of laser-driven ion acceleration in non-periodic nanostructured targets`, - `J. Plasma Phys. 86, 905860101 (2020) `_ + `Journal of Plasma Physics 86, 905860101 (2020) `_ `arXiv:1905.11131 `_ .. [Marques2019] J.-R. Marquès, L. Lancia, T. Gangolf, M. Blecher, S. Bolaños, J. Fuchs, O. Willi, F. Amiranoff, R. L. Berger, M. Chiaramello, S. Weber, and C. Riconda, `Joule-Level High-Efficiency Energy Transfer to Subpicosecond Laser Pulses by a Plasma-Based Amplifier`, - `Phys. Rev. X 9, 021008 (2019) `_ + `Physical Review X 9, 021008 (2019) `_ .. [Plotnikov2019] I. Plotnikov and L. Sironi, @@ -990,39 +1057,39 @@ Following is the distribution of these topics in the listed publications up to N X. S. Geng, L. L. Ji, B. F. Shen et al., `Quantum reflection above the classical radiation-reaction barrier in the quantum electro-dynamics regime`, - `Commun. Phys. 2, 66 (2019) `_ + `Communications Physics 2, 66 (2019) `_ .. [Sinha2019] U. Sinha, C. H. Keitel, and N. Kumar, `Polarized Light from the Transportation of a Matter-Antimatter Beam in a Plasma`, - `Phys. Rev. Lett. 122, 204801 (2019) `_ + `Physical Review Letters 122, 204801 (2019) `_ .. [Malko2019] S. Malko, X. Vaisseau, F. Perez, D. Batani, A. Curcio, M. Ehret, J. Honrubia, K. 
Jakubowska, A. Morace, J. J. Santos and L. Volpe, `Enhanced relativistic-electron beam collimation using two consecutive laser pulses`, - `Sci Rep 9, 14061 (2019) `_ + `Scientific Reports 9, 14061 (2019) `_ .. [Peng2019] H. Peng, C. Riconda, M. Grech, J.-Q. Su and S. Weber, `Nonlinear dynamics of laser-generated ion-plasma gratings: A unified description`, - `Phys. Rev. E 100, 061201 (2019) `_ + `Physical Review E 100, 061201 (2019) `_ `arXiv:1911.03440 `_ .. [Fang2019] J. Fang, C.-Y. Lu, J.-W. Yan and H. Yu, `Early acceleration of electrons and protons at the nonrelativistic quasiparallel shocks with different obliquity angles`, - `Res. Astron. Astrophys. 19, 182 (2019) `_ + `Research in Astronomy and Astrophysics 19, 182 (2019) `_ `arXiv:1908.08170 `_ .. [Yoon2019b] Y. Yoon and P. M. Bellan, `Kinetic Verification of the Stochastic Ion Heating Mechanism in Collisionless Magnetic Reconnection`, - `ApJ 887, L29 (2019) `_ + `The Astrophysical Journal Letters 887, L29 (2019) `_ .. [Yoon2019a] @@ -1034,7 +1101,7 @@ Following is the distribution of these topics in the listed publications up to N F. Massimo, A. Beck, J. Derouillat, M. Grech, M. Lobet, F. Pérez, I. Zemzemi and A Specka, `Efficient start-to-end 3D envelope modeling for two-stage laser wakefield acceleration experiments`, - `Plasma Phys. Control. Fusion 61, 124001 (2019) `_ + `Plasma Physics and Controlled Fusion 61, 124001 (2019) `_ `arXiv:1912.04127 `_ .. [Beck2019] @@ -1048,14 +1115,14 @@ Following is the distribution of these topics in the listed publications up to N F. Pérez and M. Grech, `Oblique-incidence, arbitrary-profile wave injection for electromagnetic simulations`, - `Phys. Rev. E 99, 033307 (2019) `_ + `Physical Review E 99, 033307 (2019) `_ `arXiv:1809.04435 `_ .. [Thiele2019] I. Thiele, E. Siminos and T. Fülöp, `Electron Beam Driven Generation of Frequency-Tunable Isolated Relativistic Subcycle Pulses`, - `Phys. Rev. Lett. 122, 104803 (2019) `_ + `Physical Review Letters 122, 104803 (2019) `_ `arXiv:1806.04976 `_ .. [Golovanov2018] @@ -1068,19 +1135,19 @@ Following is the distribution of these topics in the listed publications up to N S. Toledo-Redondo, J. Dargent, N. Aunai, B. Lavraud, M. André, W. Li, B. Giles, P.-A. Lindvist, R. E. Ergun, C. T. Russel and J. L. Burch, `Perpendicular Current Reduction Caused by Cold Ions of Ionospheric Origin in Magnetic Reconnection at the Magnetopause: Particle-in-Cell Simulations and Spacecraft Observations`, - `Geophys. Res. Lett. 45, 10,033 (2018) `_ + `Geophysical Research Letters 45, 10,033 (2018) `_ .. [Gelfer2018] E. Gelfer, N. Elkina and A. Fedotov, `Unexpected impact of radiation friction: enhancing production of longitudinal plasma waves`, - `Sci. Rep. 8, 6478 (2018) `_ + `Scientific Reports 8, 6478 (2018) `_ .. [Niel2018b] F. Niel, C. Riconda, F. Amiranoff, M. Lobet, J. Derouillat, F. Pérez, T. Vinci and M. Grech, `From quantum to classical modeling of radiation reaction: a focus on the radiation spectrum`, - `Plasma Phys. Control. Fusion 60, 094002 (2018) `_ + `Plasma Physics and Controlled Fusion 60, 094002 (2018) `_ `arXiv:1802.02927 `_ .. [Plotnikov2018] @@ -1094,21 +1161,21 @@ Following is the distribution of these topics in the listed publications up to N F. Niel, C. Riconda, F. Amiranoff, R. Duclous and M. Grech, `From quantum to classical modeling of radiation reaction: A focus on stochasticity effects`, - `Phys. Rev. E 97, 043209 (2018) `_ + `Physical Review E 97, 043209 (2018) `_ `arXiv:1707.02618 `_ .. [Grassi2017b] A. Grassi, M. Grech, F. Amiranoff, A. 
Macchi and C. Riconda, `Radiation-pressure-driven ion Weibel instability and collisionless shocks`, - `Phys. Rev. E 96, 033204 (2017) `_ + `Physical Review E 96, 033204 (2017) `_ `arXiv:1705.05402 `_ .. [Fedeli2017] L. Fedeli, A. Formenti, L. Cialfi, A. Sgattoni, G. Cantono and M. Passoni, `Structured targets for advanced laser-driven sources`, - `Plasma Phys. Control. Fusion 60, 014013 (2017) `_ + `Plasma Physics and Controlled Fusion 60, 014013 (2017) `_ .. [Golovanov2017] @@ -1120,19 +1187,19 @@ Following is the distribution of these topics in the listed publications up to N J. Dargent, N. Aunai, B. Lavraud, S. Toledo-Redondo, M. A. Shay, P. A. Cassak and K. Malakit, `Kinetic simulation of asymmetric magnetic reconnection with cold ions`, - `J. Geophys. Res. Space Physics 122, 5290-5306 (2017) `_ + `Journal of Geophysical Research: Space Physics 122, 5290-5306 (2017) `_ .. [Grassi2017a] A. Grassi, M. Grech, F. Amiranoff, F. Pegoraro, A. Macchi and C. Riconda, `Electron Weibel instability in relativistic counterstreaming plasmas with flow-aligned external magnetic fields`, - `Phys. Rev. E 95, 023203 (2017) `_ + `Physical Review E 95, 023203 (2017) `_ .. [Dargent2016] J. Dargent, N. Aunai, G. Belmont, N. Dorville, B. Lavraud and M. Hesse, `Full particle-in-cell simulations of kinetic equilibria and the role of the initial current sheet on steady asymmetric magnetic reconnection`, - `J. Plasma Phys. 82, 905820305 (2016) `_ + `Journal of Plasma Physics 82, 905820305 (2016) `_ .. [Chiaramello2016] @@ -1144,10 +1211,10 @@ Following is the distribution of these topics in the listed publications up to N A. Beck, J.T. Frederiksen and J. Dérouillat, `Load management strategy for Particle-In-Cell simulations in high energy particle acceleration`, - `Nucl. Inst. Meth. in Phys. Res. A 829, 418-421 (2016) `_ + `Nuclear Instruments and Methods in Physics Research A 829, 418-421 (2016) `_ .. [Lancia2016] L. Lancia, A. Giribono, L. Vassura, M. Chiaramello, C. Riconda, S. Weber, A. Castan, A. Chatelain, A. Frank, T. Gangolf, M. N. Quinn, J. Fuchs and J.-R. Marquès, `Signatures of the Self-Similar Regime of Strongly Coupled Stimulated Brillouin Scattering for Efficient Short Laser Pulse Amplification`, - `Phys. Rev. Lett.
116, 075001 (2016) `_ + `Physical Review Letters 116, 075001 (2016) `_ diff --git a/doc/Sphinx/Overview/partners.rst b/doc/Sphinx/Overview/partners.rst index 69b87e746..87d9d978a 100755 --- a/doc/Sphinx/Overview/partners.rst +++ b/doc/Sphinx/Overview/partners.rst @@ -52,11 +52,11 @@ Partners | | `Maison de la Simulation `_ (MdlS), USR 3441 | | | | + +---------------------------------------------------------------------------------------------------------+ -| | * `Olga Abramkina `_ | -| | * `Julien Dérouillat `_ | +| | * `Olga Abramkina `_ (Developer) | +| | * `Julien Dérouillat `_ (Cofounder) | | | * `Haithem Kallala `_ | -| | * `Mathieu Lobet `_ | -| | * `Charles Prouveur `_ | +| | * `Mathieu Lobet `_ (Developer) | +| | * `Charles Prouveur `_ (Architect) | | | | +------------+---------------------------------------------------------------------------------------------------------+ @@ -67,11 +67,11 @@ Partners | | `Laboratoire pour l'Utilisation des Lasers Intenses `_ (LULI), UMR 7605 | | | | + +-------------------------------------------------------------------------------------------------------------+ -| | * `Mickael Grech `_ | -| | * `Tommaso Vinci `_ | +| | * `Mickael Grech `_ (Founder) | +| | * `Tommaso Vinci `_ (Developer) | | | * `Marco Chiaramello `_ | | | * `Anna Grassi `_ | -| | * `Frédéric Pérez `_ | +| | * `Frédéric Pérez `_ (Community manager, Developer) | | | * `Caterina Riconda `_ | | | | +------------+-------------------------------------------------------------------------------------------------------------+ @@ -83,9 +83,9 @@ Partners | | `Laboratoire Leprince-Ringuet `_ (LLR), UMR 7638 | + +---------------------------------------------------------------------------------------------------------+ | | | -| | * `Arnaud Beck `_ | +| | * `Arnaud Beck `_ (Project Coordinator, Cofounder, Developer) | | | * `Imen Zemzemi `_ | -| | * `Guillaume Bouchard `_ | +| | * `Guillaume Bouchard `_ (Developer) | +------------+---------------------------------------------------------------------------------------------------------+ .. rst-class:: noborder @@ -95,7 +95,7 @@ Partners | | `Laboratoire de Physique des Gaz et des Plasmas `_ (LPGP), UMR 8578 | + +----------------------------------------------------------------------------------------------------------------------+ | | | -| | * `Francesco Massimo `_ | +| | * `Francesco Massimo `_ (Developer) | +------------+----------------------------------------------------------------------------------------------------------------------+ .. rst-class:: noborder @@ -105,7 +105,7 @@ Partners | | `Institut du developpement et des ressources en informatique scientifique `_ (IDRIS), UPS 851 | + +----------------------------------------------------------------------------------------------------------------------+ | | | -| | * `Olga Abramkina `_ | +| | * `Olga Abramkina `_ (Developer) | | | * `Marie Flé `_ | +------------+----------------------------------------------------------------------------------------------------------------------+ diff --git a/doc/Sphinx/Overview/releases.rst b/doc/Sphinx/Overview/releases.rst index 8027d2f3d..e271b32c5 100755 --- a/doc/Sphinx/Overview/releases.rst +++ b/doc/Sphinx/Overview/releases.rst @@ -16,22 +16,53 @@ Get Smilei You can find older, `unsupported versions here `_ +.. +.. ---- + +.. .. _latestVersion: + +.. Changes made in the repository (not released) +.. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ---- -.. 
_latestVersion: +Ongoing projects +^^^^^^^^^^^^^^^^ + +* Already available, but experimental: + + * Particle merging + * Nuclear reactions + * Perfectly Matched Layers + * NewParticles diagnostic + +* In preparation: + + * Spectral solvers + -Changes made in the repository (not released) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +---- + +Release 5.1 +^^^^^^^^^^^^^^^^^^^^^ -* GPU: +* **GPU**: + * ``1Dcartesian`` geometry now available. * Compilation simplified and better documented. + * Improved performance of particle sorting. -* Features: +* **Features**: * Relativistic field initialization now supports multiple species and both direction propagations. + * Added the argument ``phase_offset`` in laser definitions such as ``LaserGaussian2D``. + * The ``LaserGaussianAM`` definition will only use one coordinate for its ``focus`` argument + (the transverse coordinate of the focus in this geometry is zero). + * Small improvements in PML for envelope model (AM and 2D). + * Deprecated ``smilei_rand_max``. + * New namelist variables ``smilei_omp_threads`` and ``smilei_total_cores``. -* Happi: +* **Happi**: * In ``Scalar``, it is now possible to make an operation on scalars such as ``"Uelm+Ukin"``. * The list of available scalars can be obtained from ``getScalars()``. @@ -40,16 +71,11 @@ Changes made in the repository (not released) * Changed coordinate reference for 2D probe in 3D or AM geometry (zero is the box origin projected orthogonally on the probe plane). -* Documentation: +* **Documentation**: * Dark theme (click the switch on the bottom left, or set browser preferences). -* Added the argument ``phase_offset`` in laser definitions such as ``LaserGaussian2D``. -* The ``LaserGaussianAM`` definition will only use one coordinate for its ``focus`` argument - (the transverse coordinate of the focus in this geometry is zero). -* Small improvements in PML for envelope model (AM and 2D). - -* Bug fixes: +* **Bug fixes**: * ``dump_minutes`` often failed to write some checkpoint files. * ``"auto"`` limits in ``ParticleBinning`` could fail with only one side on ``"auto"``. @@ -57,23 +83,6 @@ ---- -Projects -^^^^^^^^^^^^^^^^ - -* Already available, but experimental: - - * Particle merging - * Nuclear reactions - * Perfectly Matched Layers - * NewParticles diagnostic - -* In preparation: - - * Spectral solvers - - ----- - Release 5.0 ^^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/Sphinx/Use/namelist.rst b/doc/Sphinx/Use/namelist.rst index ad318954c..a07f19005 100755 --- a/doc/Sphinx/Use/namelist.rst +++ b/doc/Sphinx/Use/namelist.rst @@ -60,7 +60,8 @@ for each MPI process). The following steps are executed: * The rank of the current MPI process as :py:data:`smilei_mpi_rank`. * The total number of MPI processes as :py:data:`smilei_mpi_size`. - * The maximum random integer as :py:data:`smilei_rand_max`. + * The number of OpenMP threads per MPI process as :py:data:`smilei_omp_threads`. + * The total number of cores as :py:data:`smilei_total_cores`. #. The namelist(s) is executed. @@ -1147,6 +1148,9 @@ Each species has to be defined in a ``Species`` block:: :ref:`tracking `. The available fields are ``"Ex"``, ``"Ey"``, ``"Ez"``, ``"Bx"``, ``"By"`` and ``"Bz"``. + Note that magnetic field components, as they originate from the interpolator, + are shifted by half a timestep compared to those from the *Fields* diagnostics. + Additionally, the work done by each component of the electric field is available as ``"Wx"``, ``"Wy"`` and ``"Wz"``.
Contrary to the other interpolated fields, these quantities are accumulated over time. @@ -2715,7 +2719,8 @@ or several points arranged in a 2-D or 3-D grid. * **In "AMcylindrical" geometry**, probes are defined with 3D Cartesian coordinates and cannot be separated per mode. Use Field diagnostics for cylindrical coordinates and information per mode. - + * **Probes rely on the particle interpolator to compute fields** so that the + magnetic field is shifted by half a timestep compared to that of *Fields* diagnostics. To add one probe diagnostic, include the block ``DiagProbe``:: @@ -3342,19 +3347,20 @@ for instance:: def my_filter(particles): return (particles.px>-1.)*(particles.px<1.) + (particles.pz>3.) -.. Warning:: The ``px``, ``py`` and ``pz`` quantities are not exactly the momenta. - They are actually the velocities multiplied by the lorentz factor, i.e., - :math:`\gamma v_x`, :math:`\gamma v_y` and :math:`\gamma v_z`. This is true only - inside the ``filter`` function (not for the output of the diagnostic). - -.. Note:: The ``id`` attribute contains the :doc:`particles identification number`. - This number is set to 0 at the beginning of the simulation. **Only after particles have - passed the filter**, they acquire a positive ``id``. - -.. Note:: For advanced filtration, Smilei provides the quantity ``Main.iteration``, - accessible within the ``filter`` function. Its value is always equal to the current - iteration number of the PIC loop. The current time of the simulation is thus - ``Main.iteration * Main.timestep``. +.. Note:: + + * In the ``filter`` function only, the ``px``, ``py`` and ``pz`` quantities + are not exactly the momenta. + They are actually the velocities multiplied by the Lorentz factor, i.e., + :math:`\gamma v_x`, :math:`\gamma v_y` and :math:`\gamma v_z`. + This is *not* true for the output of the diagnostic. + * The ``id`` attribute contains the :doc:`particles identification number`. + This number is set to 0 at the beginning of the simulation. **Only after particles have + passed the filter**, they acquire a positive ``id``. + * For advanced filtration, Smilei provides the quantity ``Main.iteration``, + accessible within the ``filter`` function. Its value is always equal to the current + iteration number of the PIC loop. The current time of the simulation is thus + ``Main.iteration * Main.timestep``. .. py:data:: attributes @@ -3367,6 +3373,11 @@ for instance:: (``"chi"``, only for species with radiation losses) or the fields interpolated at their positions (``"Ex"``, ``"Ey"``, ``"Ez"``, ``"Bx"``, ``"By"``, ``"Bz"``). +.. Note:: Here, interpolated fields are normally computed after the Maxwell solver. + They may thus differ by half a timestep from those computed at the middle of the + timestep to push particles. When exact values are needed, use the option + :py:data:`keep_interpolated_fields`. + ---- .. rst-class:: experimental @@ -3619,9 +3630,15 @@ namelist. They should not be re-defined by the user! The total number of MPI processes. -.. - <> - .. py:data:: smilei_rand_max +.. py:data:: smilei_omp_threads + + The number of OpenMP threads per MPI process. + +.. py:data:: smilei_total_cores - The largest random integer. + The total number of cores. +.. note:: + + These variables can be accessed during ``happi`` post-processing, e.g. + ``S.namelist.smilei_mpi_size``.
\ No newline at end of file diff --git a/doc/Sphinx/implementation.rst b/doc/Sphinx/implementation.rst index 46bf953e9..aab91c2c9 100644 --- a/doc/Sphinx/implementation.rst +++ b/doc/Sphinx/implementation.rst @@ -10,10 +10,10 @@ and conveniency for non-advanced C++ users. The repository is composed of the following directories: - ``Licence``: contains code licence information -- ``doc``: conatins the Sphinx doc files +- ``doc``: contains the Sphinx doc files - ``src``: contains all source files - ``happi``: contains the sources of the happi Python tool for visualization -- ``benchmarks``: contains the benchmarks used by the validation process. these becnhamrks are also examples for users. +- ``benchmarks``: contains the benchmarks used by the validation process; these benchmarks are also examples for users. - ``scripts``: contains multiple tool scripts for compilation and more - ``compile_tools``: contains scripts and machine files used by the makefile for compilation @@ -23,7 +23,7 @@ The repository is composed of the following directories: The source files directory is as well composed of several sub-directories to organise the `.cpp` and `.h` files by related thematics. The main is the file `Smilei.cpp`. -There is always only one class definition per file and the file name correcponds to the class name. +There is always only one class definition per file and the file name corresponds to the class name. The general implementation is later summarized in :numref:`smilei_main_loop` @@ -54,10 +54,10 @@ Notion of operators An operator is a class that operates on input data to provide a processed information. Input data can be parameters and data containers. Output data can be processed data from data containers or updated data containers. -An operator is a class functor (overloadind of the ``()`` ). -Sometime, operator provides additional methods called wrappers to provide differents simplified or adapted interfaces. -An operator do not store data or temporarely. -for instance, the particle interpolation, push and proection are operators. +An operator is a class functor (overloading of the ``()`` ). +Sometimes, an operator provides additional methods called wrappers to provide different simplified or adapted interfaces. +An operator does not store data, even temporarily. +For instance, the particle interpolation, push and projection are operators. .. _operator: @@ -71,7 +71,7 @@ Notion of domain parts Domain parts are classes that represents some specific levels of the domain decomposition. They can be seen as high-level data container or container of data container. -They contain some methods to handle, manange and access the local data. +They contain some methods to handle, manage and access the local data. For instance, patches and ``Species`` are domain parts: - ``Species`` contains the particles. @@ -80,10 +80,10 @@ For instance, patches and ``Species`` are domain parts: Notion of factory ------------------------------------ -Some objects such as operators or data containers have sereral variations. +Some objects such as operators or data containers have several variations. For this we use inheritance. A base class is used for common parameters and methods and derived classes are used for all variations. -The factory uses user-defined input parameters to determine the right derive class to choose and initiate them as shown in :numref:`factory`. +The factory uses user-defined input parameters to determine the right derived class to choose and instantiate them as shown in :numref:`factory`.
For instance, there are several ``push`` operators implemented all derived from a base ``push`` class. The ``push`` factory will determine the right one to use. @@ -97,7 +97,7 @@ The ``push`` factory will determine the right one to use. Other ------------------------------------ -Some classes are used for specific actions in the code such as the initilization process. +Some classes are used for specific actions in the code such as the initialization process. ----------------------------------------------------------------- @@ -106,7 +106,7 @@ III. Domain decomposition and parallelism The simulation domain is divided multiple times following a succession of decomposition levels. The whole domain is the superimposition of different grids for each electromagnetic field component -and macro-particules. +and macro-particles. Let us represent schematically the domain as an array of cells as in Fig. :numref:`full_domain`. Each cell contains a certain population of particles (that can differ from cell to cell). @@ -127,8 +127,8 @@ The domain becomes a collection of patches as shown in :numref:`patch_domain_dec The domain in :program:`Smilei` is a collection of patches. -A patch is an independant piece of the whole simulation domain. -It therefore owns local electrmognatic grids and list of macro-particles. +A patch is an independent piece of the whole simulation domain. +It therefore owns the local electromagnetic grids and list of macro-particles. Electromagnetic grids have ghost cells that represent the information located in the neighboring patches (not shown in :numref:`patch_domain_decomposition`). All patches have the same spatial size .i.e. the same number of cells. The size of a patch is calculated so that all local field grids (ghost cells included) can fit in L2 cache. @@ -144,7 +144,7 @@ The distribution can be ensured in an equal cartesian way or using a load balanc Patches are then distributed among MPI processes in so-called MPI patch collections. Inside MPI patch collection, OpenMP loop directives are used to distribute the computation of the patches among the available threads. -Since each patch have a different number of particles, this approach enables a dynamic scheduling depending on the specified OpenMP scheduler. +Since each patch has a different number of particles, this approach enables a dynamic scheduling depending on the specified OpenMP scheduler. As shown in :numref:`smilei_main_loop`, a synchronization step is required to exchange grid ghost cells and particles traveling from patch to patch. The patch granularity is used for: @@ -163,7 +163,7 @@ The patch can be decomposed into bins as shown in :numref:`bin_decomposition`. Bin decomposition. -Contrary to patch, a bin is not an independant data structure with its own arrays. +Contrary to patch, a bin is not an independent data structure with its own arrays. It represents a smaller portion of the patch grids through specific start and end indexes. For the macro-particles, a sorting algorithm is used to ensure that in the macro-particles located in the same bin are grouped and contiguous in memory. @@ -288,7 +288,7 @@ located in the file `src/Tools.h`. - `ERROR_NAMELIST`: this function should be used for namelist error. It takes in argument a simple message and a link to the documentation. It throws as well a SIGABRT signal. - `MESSAGE`: this function should be used to output an information message (it uses `std::cout`). 
- `DEBUG` : should be used for debugging messages (for the so-called DEBUG mode) -- `WARNING` : should be used to thrown a warning. A warning alerts the users of a possible issue or to be carreful with some parameters without stoping the program. +- `WARNING` : should be used to throw a warning. A warning alerts the user of a possible issue or advises caution with some parameters, without stopping the program. -------------------------------------------------------------------------------- @@ -547,7 +547,7 @@ file ``Smilei.cpp`` thought calls to different ``vecPatches`` methods. .. code-block:: c++ - vecPatches.finalizeAndSortParticles( params, &smpi, simWindow, + vecPatches.finalizeExchParticlesAndSort( params, &smpi, simWindow, time_dual, timers, itime ); * **Particle merging**: merging process for particles (still experimental) @@ -618,7 +618,7 @@ We first loop on the patches and then the species of each patch ``ipatch``: ``(*this )( ipatch )->vecSpecies.size()``. For each species, the method ``Species::dynamics`` is called to perform the dynamic step of the respective particles. -The OpenMP parallelism is explicitely applied in ``vecPatches::dynamics`` on the patch loop as shown +The OpenMP parallelism is explicitly applied in ``vecPatches::dynamics`` on the patch loop as shown in the following pieces of code. .. code-block:: c++ diff --git a/doc/Sphinx/smilei_theme/layout.html b/doc/Sphinx/smilei_theme/layout.html index 592f7e532..1bed82e81 100755 --- a/doc/Sphinx/smilei_theme/layout.html +++ b/doc/Sphinx/smilei_theme/layout.html @@ -97,7 +97,7 @@
diff --git a/doc/Sphinx/smilei_theme/static/smilei_theme.css_t b/doc/Sphinx/smilei_theme/static/smilei_theme.css_t index fdf918810..4de1a7428 100755 --- a/doc/Sphinx/smilei_theme/static/smilei_theme.css_t +++ b/doc/Sphinx/smilei_theme/static/smilei_theme.css_t @@ -172,6 +172,10 @@ a:hover { text-decoration: underline; } +a:visited { + color:{{ theme_main_color_bold }}; color: var(--main_bold); +} + div.body h1, div.body h2, div.body h3, @@ -402,6 +406,14 @@ table.footnote td { padding: 0.3em 0.5em; } +table.noborder { + width: 100%; +} + +table.noborder tr:first-child td:first-child { + width: 7em; +} + table.noborder, table.noborder td { border:0 !important; } diff --git a/happi/_Diagnostics/TrackParticles.py b/happi/_Diagnostics/TrackParticles.py index 253bb2958..0825eb0f3 100755 --- a/happi/_Diagnostics/TrackParticles.py +++ b/happi/_Diagnostics/TrackParticles.py @@ -447,8 +447,9 @@ def _orderFiles( self, fileOrdered, chunksize, sort ): for k, name in self._short_properties_from_raw.items(): if k not in group: continue ordered = self._np.empty((nparticles_to_write, ), dtype=group[k].dtype) - if k == "id": ordered.fill(0) - else : ordered.fill(self._np.nan) + if k == "id" : ordered.fill(0) + elif k == "charge": ordered.fill(9999) + else : ordered.fill(self._np.nan) ordered[locs] = group[k][()][selectedIndices] f0[name].write_direct(ordered, dest_sel=self._np.s_[it,:]) @@ -461,8 +462,9 @@ def _orderFiles( self, fileOrdered, chunksize, sort ): for first_o, last_o, npart_o in ChunkedRange(nparticles_to_write, chunksize): for k, name in self._short_properties_from_raw.items(): if k not in group: continue - if k == "id": data[k].fill(0) - else : data[k].fill(self._np.nan) + if k == "id" : data[k].fill(0) + elif k == "charge": data[k].fill(9999) + else : data[k].fill(self._np.nan) # Loop chunks of the input for first_i, last_i, npart_i in ChunkedRange(nparticles, chunksize): # Obtain IDs @@ -538,7 +540,10 @@ def _generateRawData(self, times=None): data[it,:] -= self._XmovedForTime[time] else: data = self._readUnstructuredH5(self._h5items[axis], self.selectedParticles, first_time, last_time) - data[deadParticles] = self._np.nan + if data.dtype == float: + data[deadParticles] = self._np.nan + else: + data[deadParticles] = 9999 self._rawData[axis] = data if self._verbose: print("Process broken lines ...") diff --git a/happi/_Utils.py b/happi/_Utils.py index 9fd35a757..070046786 100755 --- a/happi/_Utils.py +++ b/happi/_Utils.py @@ -42,7 +42,10 @@ def updateMatplotLibColormaps(): if "smilei" in matplotlib.pyplot.colormaps(): return def register(name, d): cmap = matplotlib.colors.LinearSegmentedColormap(name, d, N=256, gamma=1.0) - matplotlib.pyplot.register_cmap(cmap=cmap) + try: + matplotlib.pyplot.register_cmap(cmap=cmap) + except Exception as e: + matplotlib.colormaps.register(cmap) register(u"smilei", { 'red' :((0., 0., 0.), (0.0625 , 0.091, 0.091), (0.09375, 0.118, 0.118), (0.125 , 0.127, 0.127), (0.1875 , 0.135, 0.135), (0.21875, 0.125, 0.125), (0.28125, 0.034, 0.034), (0.3125 , 0.010, 0.010), (0.34375, 0.009, 0.009), (0.4375 , 0.049, 0.049), (0.46875, 0.057, 0.057), (0.5 , 0.058, 0.058), (0.59375, 0.031, 0.031), (0.625 , 0.028, 0.028), (0.65625, 0.047, 0.047), (0.71875, 0.143, 0.143), (0.78125, 0.294, 0.294), (0.84375, 0.519, 0.519), (0.90625, 0.664, 0.664), (0.9375 , 0.760, 0.760), (0.96875, 0.880, 0.880), (1., 1., 1. 
)), 'green':((0., 0., 0.), (0.21875, 0.228, 0.228), (0.78125, 0.827, 0.827), (0.8125 , 0.852, 0.852), (0.84375, 0.869, 0.869), (0.9375 , 0.937, 0.937), (0.96875, 0.967, 0.967), (1. , 1. , 1. )), @@ -398,7 +401,11 @@ def __init__(self, operation, QuantityTranslator, ureg): raise Exception("Quantity "+q+" not understood") # Calculate the total units and its inverse locals().update(self.imports) - units = eval("".join(basic_op)).units + units = eval("".join(basic_op)) + if isinstance(units, (int, float)): + units = ureg.Quantity(1) # dimensionless + else: + units = units.units self.translated_units = units.format_babel(locale="en") # Make the operation string self.translated_operation = "".join(full_op) diff --git a/makefile b/makefile index 3aaff0201..277a2237d 100755 --- a/makefile +++ b/makefile @@ -52,7 +52,7 @@ DIRS := $(shell find src -type d) SRCS := $(shell find src/* -name \*.cpp) OBJS := $(addprefix $(BUILD_DIR)/, $(SRCS:.cpp=.o)) DEPS := $(addprefix $(BUILD_DIR)/, $(SRCS:.cpp=.d)) -SITEDIR = $(shell $(PYTHONEXE) -c 'import site; site._script()' --user-site) +SITEDIR = $(shell d=`$(PYTHONEXE) -m site --user-site` && echo $$d || $(PYTHONEXE) -c "import sysconfig; print(sysconfig.get_path('purelib'))") # Smilei tools TABLES_DIR := tools/tables @@ -202,9 +202,9 @@ endif ifneq (,$(call parse_config,gpu_nvidia)) override config += noopenmp # Prevent openmp for nvidia - CXXFLAGS += -DSMILEI_ACCELERATOR_MODE -DSMILEI_OPENACC_MODE + CXXFLAGS += -DSMILEI_ACCELERATOR_GPU -DSMILEI_ACCELERATOR_GPU_OACC GPU_COMPILER ?= nvcc - GPU_COMPILER_FLAGS += -x cu -DSMILEI_ACCELERATOR_MODE -DSMILEI_OPENACC_MODE $(DIRS:%=-I%) + GPU_COMPILER_FLAGS += -x cu -DSMILEI_ACCELERATOR_GPU -DSMILEI_ACCELERATOR_GPU_OACC $(DIRS:%=-I%) GPU_COMPILER_FLAGS += -I$(BUILD_DIR)/src/Python $(PY_CXXFLAGS) GPU_KERNEL_SRCS := $(shell find src/* -name \*.cu) GPU_KERNEL_OBJS := $(addprefix $(BUILD_DIR)/, $(GPU_KERNEL_SRCS:.cu=.o)) @@ -214,9 +214,9 @@ endif # AMD GPUs ifneq (,$(call parse_config,gpu_amd)) - CXXFLAGS += -DSMILEI_ACCELERATOR_MODE + CXXFLAGS += -DSMILEI_ACCELERATOR_GPU -DSMILEI_ACCELERATOR_GPU_OMP GPU_COMPILER ?= $(CC) - GPU_COMPILER_FLAGS += -x hip -DSMILEI_ACCELERATOR_MODE -std=c++14 $(DIRS:%=-I%) #$(PY_FLAGS) + GPU_COMPILER_FLAGS += -x hip -DSMILEI_ACCELERATOR_GPU -DSMILEI_ACCELERATOR_GPU_OMP -std=c++14 $(DIRS:%=-I%) GPU_COMPILER_FLAGS += -I$(BUILD_DIR)/src/Python $(PY_CXXFLAGS) GPU_KERNEL_SRCS := $(shell find src/* -name \*.cu) GPU_KERNEL_OBJS := $(addprefix $(BUILD_DIR)/, $(GPU_KERNEL_SRCS:.cu=.o)) diff --git a/scripts/compile_tools/machine/adastra b/scripts/compile_tools/machine/adastra index 7aab184ce..14c2a975a 100644 --- a/scripts/compile_tools/machine/adastra +++ b/scripts/compile_tools/machine/adastra @@ -85,7 +85,6 @@ ADASTRA_DEBUG_FLAGS := -g -ggdb $(ADASTRA_DEBUG_SANITIZER_FLAGS) -v # ifneq (,$(call parse_config,gpu_amd)) # When using OMP - ADASTRA_ACCELERATOR_GPU_OMP_DEFINE_FLAGS := -DSMILEI_ACCELERATOR_GPU_OMP=1 # ADASTRA_ACCELERATOR_GPU_TARGET := gfx908 # ADASTRA_ACCELERATOR_GPU_TARGET := gfx908:xnack- diff --git a/scripts/compile_tools/machine/jean_zay_gpu_V100 b/scripts/compile_tools/machine/jean_zay_gpu_V100 index 7fa7ce513..cc9d15c8b 100644 --- a/scripts/compile_tools/machine/jean_zay_gpu_V100 +++ b/scripts/compile_tools/machine/jean_zay_gpu_V100 @@ -5,12 +5,25 @@ # Documentation: # http://www.idris.fr/jean-zay # +# Use the following commented commands to have the proper environment for compilation and running +# +# module purge +# module load anaconda-py3/2020.11 +# module load 
nvidia-compilers/23.11 +# module load cuda/12.2.0 +# module load openmpi/4.1.5-cuda +# module load hdf5/1.12.0-mpi-cuda +# export HDF5_ROOT_DIR=/gpfslocalsup/spack_soft/hdf5/1.12.0/nvhpc-23.11-i5lyakq3iu254ru3eqe2yukvg7airopl +# export I_MPI_CXX=pgc++ +# export SMILEICXX=mpic++ +# export CICCFLAG="--c++14" + SMILEICXX_DEPS = g++ #GPU_COMPILER = nvcc CXXFLAGS += -w -CXXFLAGS += -ta=tesla:cc70 -std=c++14 -lcurand -Minfo=accel # what is offloaded/copied +CXXFLAGS += -acc=gpu -gpu=cc70 -std=c++14 -lcurand -Minfo=accel # what is offloaded/copied # CXXFLAGS += -Minfo=all # very verbose output CXXFLAGS += -D__GCC_ATOMIC_TEST_AND_SET_TRUEVAL=1 @@ -18,4 +31,4 @@ CXXFLAGS += -D__GCC_ATOMIC_TEST_AND_SET_TRUEVAL=1 GPU_COMPILER_FLAGS += -O3 --std c++14 -arch=sm_70 GPU_COMPILER_FLAGS += --expt-relaxed-constexpr -LDFLAGS += -ta=tesla:cc70 -std=c++14 -Mcudalib=curand -lcudart -lcurand -lacccuda +LDFLAGS += -acc=gpu -gpu=cc70 -std=c++14 -cudalib=curand -lcudart -lcurand -lacccuda diff --git a/scripts/compile_tools/machine/ruche_gpu2 b/scripts/compile_tools/machine/ruche_gpu2 index a9406d60d..80cf09198 100644 --- a/scripts/compile_tools/machine/ruche_gpu2 +++ b/scripts/compile_tools/machine/ruche_gpu2 @@ -26,7 +26,7 @@ GPU_COMPILER_FLAGS += -arch=sm_80 #sm_89 # first compile completely with sm_80 t CXXFLAGS += -Minfo=accel # what is offloaded/copied # CXXFLAGS += -Minfo=all # very verbose output -# To turn on the OpenMP support, uncomment these 3 lines and comment the line just above defining 'SMILEI_OPENACC_MODE' +# To turn on the OpenMP support, uncomment these 3 lines and comment the line just above defining 'SMILEI_ACCELERATOR_GPU_OACC' # CXXFLAGS += -mp=gpu -DSMILEI_ACCELERATOR_GPU_OMP # GPU_COMPILER_FLAGS += -DSMILEI_ACCELERATOR_GPU_OMP # Can't we pass the -mp=gpu to nvcc when compiling a .cu file ? # LDFLAGS += -mp=gpu diff --git a/src/Checkpoint/Checkpoint.cpp b/src/Checkpoint/Checkpoint.cpp index 13c3d28a5..3cbb6c12a 100755 --- a/src/Checkpoint/Checkpoint.cpp +++ b/src/Checkpoint/Checkpoint.cpp @@ -233,7 +233,7 @@ void Checkpoint::dumpAll( VectorPatch &vecPatches, Region ®ion, unsigned int MESSAGE( " Checkpoint #" << num_dump << " at iteration " << itime << " dumped" ); #endif -#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC ) MESSAGE( " Copying device data in main memory" ); // TODO(Etienne M): This may very well be redundant if we did a diagnostic // during the last iteration. 
Indeed, we copy everything from the device to @@ -478,8 +478,8 @@ void Checkpoint::dumpPatch( Patch *patch, Params ¶ms, H5Write &g ) name << setfill( '0' ) << setw( 2 ) << bcId; string groupName=Tools::merge( "EM_boundary-species-", name.str() ); H5Write b = g.group( groupName ); - b.attr( "By_val", embc->By_val ); - b.attr( "Bz_val", embc->Bz_val ); + b.attr( "By_val", embc->By_val_ ); + b.attr( "Bz_val", embc->Bz_val_ ); } else if( dynamic_cast( EMfields->emBoundCond[bcId] ) ) { ElectroMagnBC2D_SM *embc = static_cast( EMfields->emBoundCond[bcId] ); ostringstream name( "" ); @@ -889,8 +889,8 @@ void Checkpoint::restartPatch( Patch *patch, Params ¶ms, H5Read &g ) name << setfill( '0' ) << setw( 2 ) << bcId; string groupName = Tools::merge( "EM_boundary-species-", name.str() ); H5Read b = g.group( groupName ); - b.attr( "By_val", embc->By_val ); - b.attr( "Bz_val", embc->Bz_val ); + b.attr( "By_val", embc->By_val_ ); + b.attr( "Bz_val", embc->Bz_val_ ); } else if( dynamic_cast( EMfields->emBoundCond[bcId] ) ) { ElectroMagnBC2D_SM *embc = static_cast( EMfields->emBoundCond[bcId] ); ostringstream name( "" ); diff --git a/src/Diagnostic/DiagnosticProbes.cpp b/src/Diagnostic/DiagnosticProbes.cpp index 5e79eecc9..e66c684e7 100755 --- a/src/Diagnostic/DiagnosticProbes.cpp +++ b/src/Diagnostic/DiagnosticProbes.cpp @@ -740,7 +740,7 @@ void DiagnosticProbes::run( SmileiMPI *smpi, VectorPatch &vecPatches, int itime, // Interpolate all usual fields on probe ("fake") particles of current patch unsigned int iPart_MPI = offset_in_MPI[ipatch]; unsigned int maxPart_MPI = offset_in_MPI[ipatch] + npart; -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) smpi->resizeDeviceBuffers( ithread, nDim_particle, npart ); diff --git a/src/Diagnostic/DiagnosticScalar.cpp b/src/Diagnostic/DiagnosticScalar.cpp index fe88f47d9..9b8b17409 100755 --- a/src/Diagnostic/DiagnosticScalar.cpp +++ b/src/Diagnostic/DiagnosticScalar.cpp @@ -436,7 +436,7 @@ void DiagnosticScalar::compute( Patch *patch, int ) const unsigned int nPart=vecSpecies[ispec]->getNbrOfParticles(); // number of particles -// #if defined( SMILEI_ACCELERATOR_MODE ) +// #if defined( SMILEI_ACCELERATOR_GPU ) const double *const __restrict__ weight_ptr = vecSpecies[ispec]->particles->getPtrWeight(); const short *const __restrict__ charge_ptr = vecSpecies[ispec]->particles->getPtrCharge(); const double *const __restrict__ momentum_x = vecSpecies[ispec]->particles->getPtrMomentum(0); @@ -447,14 +447,14 @@ void DiagnosticScalar::compute( Patch *patch, int ) if( vecSpecies[ispec]->mass_ > 0 ) { // GPU mode -#ifdef SMILEI_ACCELERATOR_MODE +#ifdef SMILEI_ACCELERATOR_GPU #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target teams distribute parallel for \ map(tofrom: density) \ is_device_ptr(weight_ptr) \ reduction(+:density) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel deviceptr(weight_ptr) #pragma acc loop gang worker vector reduction(+:density) #endif @@ -468,7 +468,7 @@ void DiagnosticScalar::compute( Patch *patch, int ) map(tofrom: charge) \ is_device_ptr( charge_ptr, weight_ptr) \ reduction(+:charge) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel deviceptr(weight_ptr, charge_ptr) #pragma acc loop gang worker vector reduction(+:charge) #endif @@ -484,7 +484,7 @@ void DiagnosticScalar::compute( Patch *patch, int ) momentum_y /* [istart:particle_number] */, \ momentum_z /* [istart:particle_number] */) \ 
reduction(+:ener_tot) -#elif defined(SMILEI_OPENACC_MODE) +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc parallel deviceptr(weight_ptr, \ momentum_x, \ momentum_y, \ @@ -525,14 +525,14 @@ void DiagnosticScalar::compute( Patch *patch, int ) } else if( vecSpecies[ispec]->mass_ == 0 ) { // GPU mode -#ifdef SMILEI_ACCELERATOR_MODE +#ifdef SMILEI_ACCELERATOR_GPU #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target teams distribute parallel for \ map(tofrom: density) \ is_device_ptr(weight_ptr) \ reduction(+:density) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel deviceptr(weight_ptr) #pragma acc loop gang worker vector reduction(+:density) #endif @@ -548,7 +548,7 @@ void DiagnosticScalar::compute( Patch *patch, int ) momentum_y /* [istart:particle_number] */, \ momentum_z /* [istart:particle_number] */) \ reduction(+:ener_tot) -#elif defined(SMILEI_OPENACC_MODE) +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc parallel deviceptr(weight_ptr, \ momentum_x, \ momentum_y, \ @@ -667,7 +667,7 @@ void DiagnosticScalar::compute( Patch *patch, int ) // total energy in current field double Uem = 0.; if( ! AM ) { -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) Uem = field->norm2OnDevice( EMfields->istart, EMfields->bufsize ); #else Uem = field->norm2( EMfields->istart, EMfields->bufsize ); @@ -751,7 +751,7 @@ void DiagnosticScalar::compute( Patch *patch, int ) j_max = iFieldStart[1]; k_max = iFieldStart[2]; -#if defined( SMILEI_ACCELERATOR_MODE) +#if defined( SMILEI_ACCELERATOR_GPU) // We use scalar rather than arrays because omp target // sometime fails to pass them to the device const unsigned int ixstart = iFieldStart[0]; @@ -776,7 +776,7 @@ void DiagnosticScalar::compute( Patch *patch, int ) map(tofrom: minval, maxval, i_min, i_max, j_min, j_max, k_min, k_max) \ map(to: ny, nz, ixstart, ixend, iystart, iyend, izstart, izend) //reduction(min:minval) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present(field_data) //deviceptr( data_ ) #pragma acc loop gang worker vector collapse(3) #endif diff --git a/src/Diagnostic/DiagnosticTrack.cpp b/src/Diagnostic/DiagnosticTrack.cpp index 16ac325e9..583caab94 100755 --- a/src/Diagnostic/DiagnosticTrack.cpp +++ b/src/Diagnostic/DiagnosticTrack.cpp @@ -188,7 +188,7 @@ void DiagnosticTrack::setIDs( Patch *patch ) for( unsigned int iPart=0; iPartvecSpecies[species_index_]->particles->id( iPart ) = ++latest_Id; } -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) patch->vecSpecies[species_index_]->particles->initializeIDsOnDevice(); #endif } diff --git a/src/ElectroMagn/ElectroMagn.cpp b/src/ElectroMagn/ElectroMagn.cpp index 2c75bc6a4..02467ecd4 100755 --- a/src/ElectroMagn/ElectroMagn.cpp +++ b/src/ElectroMagn/ElectroMagn.cpp @@ -555,7 +555,7 @@ void ElectroMagn::applyAntenna( unsigned int iAntenna, double intensity ) //! Compute the total density and currents from species density and currents on Device //! This function is valid wathever the geometry // --------------------------------------------------------------------------------------------------------------------- -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) void ElectroMagn::computeTotalRhoJOnDevice() { @@ -577,7 +577,7 @@ void ElectroMagn::computeTotalRhoJOnDevice() double *const __restrict__ rhosp = rho_s[ispec] ? 
rho_s[ispec]->data() : nullptr; -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( \ Jxp[0:Jx_size], \ Jyp[0:Jy_size], \ @@ -594,7 +594,7 @@ void ElectroMagn::computeTotalRhoJOnDevice() #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc loop gang worker vector #endif for( unsigned int i=0 ; i( Bx_ ); - Field1D *By1D = static_cast( By_ ); - Field1D *Bz1D = static_cast( Bz_ ); - Field1D *Bx1D_m = static_cast( Bx_m ); - Field1D *By1D_m = static_cast( By_m ); - Field1D *Bz1D_m = static_cast( Bz_m ); + const double *const __restrict__ Bx1D = Bx_->data(); + const double *const __restrict__ By1D = By_->data(); + const double *const __restrict__ Bz1D = Bz_->data(); + double *const __restrict__ Bx1D_m = Bx_m->data(); + double *const __restrict__ By1D_m = By_m->data(); + double *const __restrict__ Bz1D_m = Bz_m->data(); + const unsigned int nx_p = dimPrim[0]; + const unsigned int nx_d = dimDual[0]; + // for Bx^(p) - for( unsigned int i=0 ; isize(); + const int sizeofBy = By_->size(); + const int sizeofBz = Bz_->size(); + + #pragma acc parallel present(Bx1D[0:sizeofBx],Bx1D_m[0:sizeofBx]) + #pragma acc loop gang worker vector +#elif defined( SMILEI_ACCELERATOR_GPU_OMP ) + #pragma omp target + #pragma omp teams distribute parallel for //simd +#endif + for( unsigned int i=0 ; i( By_mBTIS3 ); - Field1D *Bz_oldBTIS3 = static_cast( Bz_mBTIS3 ); - - for( unsigned int i=0 ; idata(); + double *const Bz1D_oldBTIS3 = Bz_mBTIS3->data(); +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) + const int sizeofByBTIS3 = By_mBTIS3->size(); + const int sizeofBzBTIS3 = Bz_mBTIS3->size(); + #pragma acc parallel present(By1D_oldBTIS3[0:sizeofByBTIS3],By1D[0:sizeofBy],Bz1D_oldBTIS3[0:sizeofBzBTIS3],Bz1D[0:sizeofBz]) + #pragma acc loop gang vector +#elif defined( SMILEI_ACCELERATOR_GPU_OMP ) + #pragma omp target + #pragma omp teams distribute parallel for +#endif +#if !defined( SMILEI_ACCELERATOR_GPU ) + #pragma omp simd +#endif + for( unsigned int i=0 ; idata(); // Magnetic field Bx^(p,d) -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) const int sizeofBx = Bx_->size(); const int sizeofBy = By_->size(); const int sizeofBz = Bz_->size(); @@ -1229,10 +1229,10 @@ void ElectroMagn2D::centerMagneticFields() #pragma omp teams distribute parallel for collapse( 2 ) #endif for( unsigned int x = 0; x < nx_p; ++x ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) #pragma omp simd #endif for( unsigned int y = 0; y < ny_d; ++y ) { @@ -1241,7 +1241,7 @@ void ElectroMagn2D::centerMagneticFields() } // Magnetic field By^(d,p) -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present(By2D[0:sizeofBy],By2D_m[0:sizeofBy]) #pragma acc loop gang worker #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) @@ -1249,10 +1249,10 @@ void ElectroMagn2D::centerMagneticFields() #pragma omp teams distribute parallel for collapse( 2 ) #endif for( unsigned int x = 0; x < ( nx_p + 1 ); ++x ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) #pragma omp simd #endif for( unsigned int y = 0; y < ny_p; ++y ) { @@ -1260,7 +1260,7 @@ void 
ElectroMagn2D::centerMagneticFields() } } // Magnetic field Bz^(d,d) -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present(Bz2D[0:sizeofBz],Bz2D_m[0:sizeofBz]) #pragma acc loop gang worker #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) @@ -1268,10 +1268,10 @@ void ElectroMagn2D::centerMagneticFields() #pragma omp teams distribute parallel for collapse( 2 ) #endif for( unsigned int x = 0; x < ( nx_p + 1 ); ++x ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) #pragma omp simd #endif for( unsigned int y = 0; y < ny_d; ++y ) { @@ -1282,7 +1282,7 @@ void ElectroMagn2D::centerMagneticFields() double *const By2D_oldBTIS3 = By_mBTIS3->data(); double *const Bz2D_oldBTIS3 = Bz_mBTIS3->data(); -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) const int sizeofByBTIS3 = By_mBTIS3->size(); #pragma acc parallel present(By2D_oldBTIS3[0:sizeofByBTIS3],By2D[0:sizeofBy]) #pragma acc loop gang @@ -1291,17 +1291,17 @@ void ElectroMagn2D::centerMagneticFields() #pragma omp teams distribute parallel for collapse( 2 ) #endif for( unsigned int x = 0; x < ( nx_p - 1 ); ++x ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) #pragma omp simd #endif for( unsigned int y = 0; y < ny_p; ++y ) { By2D_oldBTIS3[x * ny_p + y] = ( By2D[(x+1) * ny_p + y] + By2D_oldBTIS3[x * ny_p + y] ) * 0.5; } } -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) const int sizeofBzBTIS3 = Bz_mBTIS3->size(); #pragma acc parallel present(Bz2D_oldBTIS3[0:sizeofBz],Bz2D[0:sizeofBz]) #pragma acc loop gang @@ -1310,10 +1310,10 @@ void ElectroMagn2D::centerMagneticFields() #pragma omp teams distribute parallel for collapse( 2 ) #endif for( unsigned int x = 0; x < ( nx_p - 1 ); ++x ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) #pragma omp simd #endif for( unsigned int y = 0; y < ny_d; ++y ) { @@ -1392,7 +1392,7 @@ void ElectroMagn2D::computeTotalRhoJ() //END computeTotalRhoJ } -// #if defined( SMILEI_ACCELERATOR_MODE ) +// #if defined( SMILEI_ACCELERATOR_GPU ) // //! Method used to compute the total charge density and currents by summing over all species on Device // void ElectroMagn2D::computeTotalRhoJOnDevice() // { diff --git a/src/ElectroMagn/ElectroMagn2D.h b/src/ElectroMagn/ElectroMagn2D.h index aecb87ab8..d8cdfb031 100755 --- a/src/ElectroMagn/ElectroMagn2D.h +++ b/src/ElectroMagn/ElectroMagn2D.h @@ -115,7 +115,7 @@ class ElectroMagn2D : public ElectroMagn //! Method used to compute the total charge density and currents by summing over all species void computeTotalRhoJ() override; -// #if defined( SMILEI_ACCELERATOR_MODE ) +// #if defined( SMILEI_ACCELERATOR_GPU ) // //! 
Method used to compute the total charge density and currents by summing over all species on Device // void computeTotalRhoJOnDevice() override; // #endif diff --git a/src/ElectroMagn/ElectroMagn3D.cpp b/src/ElectroMagn/ElectroMagn3D.cpp index c8994d75c..41ba9cc58 100755 --- a/src/ElectroMagn/ElectroMagn3D.cpp +++ b/src/ElectroMagn/ElectroMagn3D.cpp @@ -4,7 +4,7 @@ #include #include -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #include #endif @@ -1207,7 +1207,7 @@ void ElectroMagn3D::centerMagneticFields() double *const __restrict__ Bz3D_m = Bz_m->data(); // Magnetic field Bx^(p,d,d) -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) const int sizeofBx = Bx_->size(); const int sizeofBy = By_->size(); const int sizeofBz = Bz_->size(); @@ -1219,11 +1219,11 @@ void ElectroMagn3D::centerMagneticFields() #pragma omp teams distribute parallel for collapse( 3 ) #endif for( unsigned int i=0 ; idata(); double *const __restrict__ BzmBTIS3 = Bz_mBTIS3->data(); -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) const int sizeofByBTIS3 = By_mBTIS3->size(); #pragma acc parallel present(By3D[0:sizeofBy],BymBTIS3[0:sizeofByBTIS3]) #pragma acc loop gang @@ -1305,11 +1305,11 @@ void ElectroMagn3D::centerMagneticFields() #pragma omp teams distribute parallel for collapse( 3 ) #endif for( unsigned int i=0 ; isize(); #pragma acc parallel present(Bz3D[0:sizeofBz],BzmBTIS3[0:sizeofBzBTIS3]) #pragma acc loop gang @@ -1332,11 +1332,11 @@ void ElectroMagn3D::centerMagneticFields() #pragma omp teams distribute parallel for collapse( 3 ) #endif for( unsigned int i=0 ; icopyFrom( Br_[imode] ); if (input[2] && copy[2]) Bt_m[imode]->copyFrom( Bt_[imode] ); } - ElectroMagnAM *emAM = static_cast( patch->EMfields ); + // ElectroMagnAM *emAM = static_cast( patch->EMfields ); //emAM->compute_B_m_fromEB(); } @@ -1900,7 +1900,7 @@ void ElectroMagnAM::compute_B_m_fromEB() { const unsigned int nl_p = dimPrim[0]; const unsigned int nl_d = dimDual[0]; - const unsigned int nr_p = dimPrim[1]; + // const unsigned int nr_p = dimPrim[1]; const unsigned int nr_d = dimDual[1]; const unsigned int Nmodes = El_.size(); diff --git a/src/ElectroMagn/ElectroMagnAM.h b/src/ElectroMagn/ElectroMagnAM.h index 979581b4c..cd3063113 100755 --- a/src/ElectroMagn/ElectroMagnAM.h +++ b/src/ElectroMagn/ElectroMagnAM.h @@ -157,7 +157,7 @@ class ElectroMagnAM : public ElectroMagn void computeTotalRhoJ() override; -// #if defined( SMILEI_ACCELERATOR_MODE ) +// #if defined( SMILEI_ACCELERATOR_GPU ) // //! Method used to compute the total charge density and currents by summing over all species on Device // void computeTotalRhoJOnDevice() override ; // #endif diff --git a/src/ElectroMagnBC/ElectroMagnBC1D_SM.cpp b/src/ElectroMagnBC/ElectroMagnBC1D_SM.cpp index d00c3cdb1..ff767bc12 100755 --- a/src/ElectroMagnBC/ElectroMagnBC1D_SM.cpp +++ b/src/ElectroMagnBC/ElectroMagnBC1D_SM.cpp @@ -17,23 +17,23 @@ ElectroMagnBC1D_SM::ElectroMagnBC1D_SM( Params ¶ms, Patch *patch, unsigned i : ElectroMagnBC1D( params, patch, i_boundary ) { // Parameters for the Silver-Mueller boundary conditions - Alpha = 2./( 1.+dt_ov_d[0] ); - Beta = ( dt_ov_d[0]-1. )/( 1.+dt_ov_d[0] ); - Gamma = 4./( 1.+dt_ov_d[0] ); + Alpha_ = 2. / ( 1. + dt_ov_d[0] ); + Beta_ = ( dt_ov_d[0] - 1. ) / ( 1. + dt_ov_d[0] ); + Gamma_ = 4. / ( 1. 
+ dt_ov_d[0] ); - By_val = 0.; - Bz_val = 0.; + By_val_ = 0.; + Bz_val_ = 0.; sign_ = (double) (i_boundary_ % 2) *2 - 1.; // -1 or 1 for min or max if( i_boundary == 0 ) { - iE = 0; - iB = 0; - iB_old = 1; + iE_ = 0; + iB_ = 0; + iB_old_ = 1; } else { - iE = n_p[0] - 1; - iB = n_d[0] - 1; - iB_old = iB - 1; + iE_ = n_p[0] - 1; + iB_ = n_d[0] - 1; + iB_old_ = iB_ - 1; } } @@ -50,15 +50,15 @@ void ElectroMagnBC1D_SM::save_fields( Field *my_field, Patch *patch ) if( i_boundary_ == 0 && patch->isXmin() ) { if( field1D->name=="By" ) { - By_val = ( *my_field )( 0 ); + By_val_ = ( *my_field )( 0 ); } else if( field1D->name=="Bz" ) { - Bz_val = ( *my_field )( 0 ); + Bz_val_ = ( *my_field )( 0 ); } } else if( i_boundary_ == 1 && patch->isXmax() ) { if( field1D->name=="By" ) { - By_val = ( *my_field )( field1D->dims()[0]-1 ); + By_val_ = ( *my_field )( field1D->dims()[0]-1 ); } else if( field1D->name=="Bz" ) { - Bz_val = ( *my_field )( field1D->dims()[0]-1 ); + Bz_val_ = ( *my_field )( field1D->dims()[0]-1 ); } } @@ -74,11 +74,17 @@ void ElectroMagnBC1D_SM::apply( ElectroMagn *EMfields, double time_dual, Patch * if( patch->isBoundary( i_boundary_ ) ) { //Field1D* Ex1D = static_cast(EMfields->Ex_); - Field1D *Ey1D = static_cast( EMfields->Ey_ ); + /*Field1D *Ey1D = static_cast( EMfields->Ey_ ); Field1D *Ez1D = static_cast( EMfields->Ez_ ); Field1D *By1D = static_cast( EMfields->By_ ); - Field1D *Bz1D = static_cast( EMfields->Bz_ ); + Field1D *Bz1D = static_cast( EMfields->Bz_ );*/ + const Field *E[3]{ EMfields->Ex_, EMfields->Ey_, EMfields->Ez_ }; + const Field *B[3]{ EMfields->Bx_, EMfields->By_, EMfields->Bz_ }; + const double *const __restrict__ E1 = E[1]->data_; + const double *const __restrict__ E2 = E[2]->data_; + double *const __restrict__ B1 = B[1]->data_; + double *const __restrict__ B2 = B[2]->data_; // Lasers double by = 0., bz = 0.; vector pos( 1 ); @@ -88,11 +94,25 @@ void ElectroMagnBC1D_SM::apply( ElectroMagn *EMfields, double time_dual, Patch * bz += vecLaser[ilaser]->getAmplitude1( pos, time_dual, 0, 0 ); } +#ifdef SMILEI_ACCELERATOR_GPU_OACC + const int sizeofE1 = E[1]->number_of_points_; + const int sizeofE2 = E[2]->number_of_points_; + const int sizeofB1 = B[1]->number_of_points_; + const int sizeofB2 = B[2]->number_of_points_; +#endif // Apply Silver-Mueller EM boundary condition at x=xmin or xmax - ( *By1D )( iB ) = -sign_*Alpha*( *Ez1D )( iE ) + Beta*( ( *By1D )( iB_old )-By_val ) + Gamma*by + By_val; - ( *Bz1D )( iB ) = sign_*Alpha*( *Ey1D )( iE ) + Beta*( ( *Bz1D )( iB_old )-Bz_val ) + Gamma*bz + Bz_val; - +#ifdef SMILEI_ACCELERATOR_GPU_OACC + #pragma acc parallel present(E1[0:sizeofE1],E2[0:sizeofE2],B1[0:sizeofB1],B2[0:sizeofB2]) +#elif defined( SMILEI_ACCELERATOR_GPU_OMP ) + #pragma omp target +#endif + { + //( *By1D )( iB_ ) = -sign_*Alpha_*( *Ez1D )( iE_ ) + Beta_*( ( *By1D )( iB_old_ )-By_val_ ) + Gamma_*by + By_val_; + //( *Bz1D )( iB_ ) = sign_*Alpha_*( *Ey1D )( iE_ ) + Beta_*( ( *Bz1D )( iB_old_ )-Bz_val_ ) + Gamma_*bz + Bz_val_; + B1[ iB_ ] = -sign_ * Alpha_ * E2[iE_] + Beta_ * ( B1[iB_old_] - By_val_) + Gamma_ * by + By_val_; + B2[ iB_ ] = sign_ * Alpha_ * E1[iE_] + Beta_ * ( B2[iB_old_] - Bz_val_) + Gamma_ * bz + Bz_val_; + } } } diff --git a/src/ElectroMagnBC/ElectroMagnBC1D_SM.h b/src/ElectroMagnBC/ElectroMagnBC1D_SM.h index ac17f856d..ccbc499c1 100755 --- a/src/ElectroMagnBC/ElectroMagnBC1D_SM.h +++ b/src/ElectroMagnBC/ElectroMagnBC1D_SM.h @@ -17,16 +17,16 @@ class ElectroMagnBC1D_SM : public ElectroMagnBC1D void save_fields( Field *, Patch *patch ) override; - 
double By_val, Bz_val; + double By_val_, Bz_val_; private: //! Constants used for the Silver-Mueller boundary conditions - double Alpha, Beta, Gamma; + double Alpha_, Beta_, Gamma_; //! Locations to apply the profile - unsigned int iE, iB, iB_old; + unsigned int iE_, iB_, iB_old_; int sign_; }; diff --git a/src/ElectroMagnBC/ElectroMagnBC2D_SM.cpp b/src/ElectroMagnBC/ElectroMagnBC2D_SM.cpp index 42ce8c381..2d257cbd5 100755 --- a/src/ElectroMagnBC/ElectroMagnBC2D_SM.cpp +++ b/src/ElectroMagnBC/ElectroMagnBC2D_SM.cpp @@ -68,9 +68,9 @@ ElectroMagnBC2D_SM::ElectroMagnBC2D_SM( Params ¶ms, Patch *patch, unsigned i ElectroMagnBC2D_SM::~ElectroMagnBC2D_SM() { - for (int i=0 ; inumber_of_points_; const int sizeofE1 = E[1]->number_of_points_; const int sizeofE2 = E[2]->number_of_points_; @@ -182,7 +182,7 @@ void ElectroMagnBC2D_SM::apply( ElectroMagn *EMfields, double time_dual, Patch * smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocateAndCopyHostToDevice( db1, b1_size ); if( axis0_ == 0 ) { // for By^(d,p) -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc parallel present(E2[0:sizeofE2],B0[0:sizeofB0],B1[0:sizeofB1],B_ext1[0:B_ext_size1],B_ext0[0:B_ext_size0],db1[0:b1_size]) #pragma acc loop gang worker vector #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) @@ -199,7 +199,7 @@ void ElectroMagnBC2D_SM::apply( ElectroMagn *EMfields, double time_dual, Patch * + B_ext1[j]; } } else { // for Bx^(p,d) -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc parallel present(E2[0:sizeofE2],B0[0:sizeofB0],B1[0:sizeofB1],B_ext1[0:B_ext_size1],B_ext0[0:B_ext_size0],db1[0:b1_size]) #pragma acc loop gang worker vector #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) @@ -234,7 +234,7 @@ void ElectroMagnBC2D_SM::apply( ElectroMagn *EMfields, double time_dual, Patch * // for Bz^(d,d) if( axis0_ == 0 ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc parallel present(E1[0:sizeofE1],B2[0:sizeofB2],B_ext2[0:B_ext_size2],db2[0:b2_size]) #pragma acc loop gang worker vector #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) @@ -247,7 +247,7 @@ void ElectroMagnBC2D_SM::apply( ElectroMagn *EMfields, double time_dual, Patch * } } else { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc parallel present(E0[0:sizeofE0],B2[0:sizeofB2],B_ext2[0:B_ext_size2],db2[0:b2_size]) #pragma acc loop gang worker vector #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) diff --git a/src/ElectroMagnBC/ElectroMagnBC3D_SM.cpp b/src/ElectroMagnBC/ElectroMagnBC3D_SM.cpp index 3ae113e60..ba4e61b28 100755 --- a/src/ElectroMagnBC/ElectroMagnBC3D_SM.cpp +++ b/src/ElectroMagnBC/ElectroMagnBC3D_SM.cpp @@ -186,7 +186,7 @@ void ElectroMagnBC3D_SM::apply( ElectroMagn *EMfields, double time_dual, Patch * const int isBoundary2min = patch->isBoundary( axis2_, 0 ); const int isBoundary2max = patch->isBoundary( axis2_, 1 ); -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC const int sizeofE0 = E[axis0_]->number_of_points_; const int sizeofE1 = E[axis1_]->number_of_points_; const int sizeofE2 = E[axis2_]->number_of_points_; @@ -217,7 +217,7 @@ void ElectroMagnBC3D_SM::apply( ElectroMagn *EMfields, double time_dual, Patch * // B1 if( axis0_ == 0 ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc parallel present(E2[0:sizeofE2],B0[0:sizeofB0],B1[0:sizeofB1],B_ext1[0:B_ext_size1],B_ext0[0:B_ext_size0],db1[0:b1_size]) #pragma acc loop gang #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) @@ -225,7 +225,7 @@ void 
ElectroMagnBC3D_SM::apply( ElectroMagn *EMfields, double time_dual, Patch * #pragma omp teams distribute parallel for collapse( 2 ) #endif for( unsigned int j=isBoundary1min; j( fields->Ex_ ); Field1D *Ey1D = static_cast( fields->Ey_ ); Field1D *Ez1D = static_cast( fields->Ez_ ); - Field1D *Bx1D = static_cast( fields->Bx_ ); + // Field1D *Bx1D = static_cast( fields->Bx_ ); Field1D *By1D = static_cast( fields->By_ ); Field1D *Bz1D = static_cast( fields->Bz_ ); Field1D *Jx1D = static_cast( fields->Jx_ ); diff --git a/src/ElectroMagnSolver/MA_Solver1D_norm.cpp b/src/ElectroMagnSolver/MA_Solver1D_norm.cpp index 7e04123f4..803ffc6cb 100755 --- a/src/ElectroMagnSolver/MA_Solver1D_norm.cpp +++ b/src/ElectroMagnSolver/MA_Solver1D_norm.cpp @@ -17,26 +17,56 @@ void MA_Solver1D_norm::operator()( ElectroMagn *fields ) { const unsigned int nx_p = fields->dimPrim[0]; const unsigned int nx_d = fields->dimDual[0]; - Field1D *Ex1D = static_cast( fields->Ex_ ); - Field1D *Ey1D = static_cast( fields->Ey_ ); - Field1D *Ez1D = static_cast( fields->Ez_ ); - Field1D *By1D = static_cast( fields->By_ ); - Field1D *Bz1D = static_cast( fields->Bz_ ); - Field1D *Jx1D = static_cast( fields->Jx_ ); - Field1D *Jy1D = static_cast( fields->Jy_ ); - Field1D *Jz1D = static_cast( fields->Jz_ ); - + + double *const __restrict__ Ex1D = fields->Ex_->data(); // [x] : dual in x primal in y,z + double *const __restrict__ Ey1D = fields->Ey_->data(); // [x] : dual in y primal in x,z + double *const __restrict__ Ez1D = fields->Ez_->data(); // [x] : dual in z primal in x,y + //const double *const __restrict__ Bx1D = fields->Bx_->data(); // [x] : dual in y,z primal in x + const double *const __restrict__ By1D = fields->By_->data(); // [x] : dual in x,z primal in y + const double *const __restrict__ Bz1D = fields->Bz_->data(); // [x] : dual in x,y primal in z + const double *const __restrict__ Jx1D = fields->Jx_->data(); // [x] : dual in x primal in y,z + const double *const __restrict__ Jy1D = fields->Jy_->data(); // [x] : dual in y primal in x,z + const double *const __restrict__ Jz1D = fields->Jz_->data(); // [x] : dual in z primal in x,y + // -------------------- // Solve Maxwell-Ampere // -------------------- // Calculate the electrostatic field ex on the dual grid - for( unsigned int ix=0 ; ixEx_->number_of_points_; + const int sizeofEy = fields->Ey_->number_of_points_; + const int sizeofEz = fields->Ez_->number_of_points_; + //const int sizeofBx = fields->Bx_->number_of_points_; + const int sizeofBy = fields->By_->number_of_points_; + const int sizeofBz = fields->Bz_->number_of_points_; + #pragma acc parallel present( Ex1D[0:sizeofEx], Jx1D[0:sizeofEx] ) + #pragma acc loop gang worker vector +#elif defined( SMILEI_ACCELERATOR_GPU_OMP ) + #pragma omp target + #pragma omp teams distribute parallel for +#endif +#if !defined( SMILEI_ACCELERATOR_GPU ) + #pragma omp simd +#endif + for( unsigned int ix=0 ; ixEx_->number_of_points_; const int sizeofEy = fields->Ey_->number_of_points_; const int sizeofEz = fields->Ez_->number_of_points_; @@ -52,10 +52,10 @@ void MA_Solver2D_norm::operator()( ElectroMagn *fields ) #pragma omp teams distribute parallel for collapse( 2 ) #endif for( unsigned int x = 0; x < nx_d; ++x ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop worker #endif -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) #pragma omp simd #endif for( unsigned int y = 0; y < ny_p; ++y ) { @@ -64,7 +64,7 @@ void MA_Solver2D_norm::operator()( ElectroMagn *fields ) } // 
Electric field Ey^(p,d) -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( Ey2D[0:sizeofEy], Jy2D[0:sizeofEy], Bz2D[0:sizeofBz] ) #pragma acc loop gang #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) @@ -72,10 +72,10 @@ void MA_Solver2D_norm::operator()( ElectroMagn *fields ) #pragma omp teams distribute parallel for collapse( 2 ) #endif for( unsigned int x = 0; x < nx_p; ++x ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop worker #endif -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) #pragma omp simd #endif for( unsigned int y = 0; y < ny_d; ++y ) { @@ -84,7 +84,7 @@ void MA_Solver2D_norm::operator()( ElectroMagn *fields ) } // Electric field Ez^(p,p) -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( Ez2D[0:sizeofEz], Jz2D[0:sizeofEz], Bx2D[0:sizeofBx], By2D[0:sizeofBy] ) #pragma acc loop gang #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) @@ -92,10 +92,10 @@ void MA_Solver2D_norm::operator()( ElectroMagn *fields ) #pragma omp teams distribute parallel for collapse( 2 ) #endif for( unsigned int x = 0; x < nx_p; ++x ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop worker #endif -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) #pragma omp simd #endif for( unsigned int y = 0; y < ny_p; ++y ) { diff --git a/src/ElectroMagnSolver/MA_Solver3D_norm.cpp b/src/ElectroMagnSolver/MA_Solver3D_norm.cpp index 9b2a089cc..7ffea26c0 100755 --- a/src/ElectroMagnSolver/MA_Solver3D_norm.cpp +++ b/src/ElectroMagnSolver/MA_Solver3D_norm.cpp @@ -35,7 +35,7 @@ void MA_Solver3D_norm::operator()( ElectroMagn *fields ) const unsigned int nz_d = fields->dimDual[2]; // Electric field Ex^(d,p,p) -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) const int sizeofEx = fields->Ex_->number_of_points_; const int sizeofEy = fields->Ey_->number_of_points_; const int sizeofEz = fields->Ez_->number_of_points_; @@ -50,11 +50,11 @@ void MA_Solver3D_norm::operator()( ElectroMagn *fields ) #pragma omp teams distribute parallel for collapse( 3 ) #endif for( unsigned int i=0 ; idimPrim[0]; const unsigned int nx_d = fields->dimDual[0]; - // Static-cast of the fields - Field1D* Ey1D; - Field1D* Ez1D; - if (isEFilterApplied) { - Ey1D = static_cast(fields->filter_->Ey_[0]); - Ez1D = static_cast(fields->filter_->Ez_[0]); - } else { - Ey1D = static_cast(fields->Ey_); - Ez1D = static_cast(fields->Ez_); - } - Field1D *By1D = static_cast( fields->By_ ); - Field1D *Bz1D = static_cast( fields->Bz_ ); + const double *const __restrict__ Ey1D = isEFilterApplied ? fields->filter_->Ey_[0]->data() : + fields->Ey_->data(); // [ix] : dual in y primal in x,z + const double *const __restrict__ Ez1D = isEFilterApplied ? 
fields->filter_->Ez_[0]->data() : + fields->Ez_->data();// [ix] : dual in z primal in x,y + + double *const __restrict__ By1D = fields->By_->data();// [ix] : dual in x,z primal in y + double *const __restrict__ Bz1D = fields->Bz_->data();// [ix] : dual in x,y primal in z + // --------------------- // Solve Maxwell-Faraday // --------------------- // NB: bx is given in 1d and defined when initializing the fields (here put to 0) // Transverse fields by & bz are defined on the dual grid - //for (unsigned int ix=1 ; ixEy_->number_of_points_; + const int sizeofEz = fields->Ez_->number_of_points_; + const int sizeofBy = fields->By_->number_of_points_; + const int sizeofBz = fields->Bz_->number_of_points_; + #pragma acc parallel present( By1D[0:sizeofBy], Bz1D[0:sizeofBz],Ey1D[0:sizeofEy],Ez1D[0:sizeofEz] ) + #pragma acc loop gang vector +#elif defined( SMILEI_ACCELERATOR_GPU_OMP ) + #pragma omp target + #pragma omp teams distribute parallel for +#endif +#if !defined( SMILEI_ACCELERATOR_GPU ) + #pragma omp simd +#endif for( unsigned int ix=1 ; ixBz_->data(); // [x * ny_d + y] : dual in x,y primal in z // Magnetic field Bx^(p,d) -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) const int sizeofEx = fields->Ex_->number_of_points_; const int sizeofEy = fields->Ey_->number_of_points_; const int sizeofEz = fields->Ez_->number_of_points_; @@ -48,10 +48,10 @@ void MF_Solver2D_Yee::operator()( ElectroMagn *fields ) #pragma omp teams distribute parallel for collapse( 2 ) #endif for( unsigned int x = 0; x < nx_d - 1; ++x ) { -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) #pragma omp simd #endif -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( unsigned int y = 1; y < ny_d - 1; ++y ) { @@ -59,7 +59,7 @@ void MF_Solver2D_Yee::operator()( ElectroMagn *fields ) } } // Magnetic field By^(d,p) -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( By2D[0:sizeofBy], Ez2D[0:sizeofEz] ) #pragma acc loop gang #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) @@ -67,10 +67,10 @@ void MF_Solver2D_Yee::operator()( ElectroMagn *fields ) #pragma omp teams distribute parallel for collapse( 2 ) #endif for( unsigned int x = 1; x < nx_d - 1; ++x ) { -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) #pragma omp simd #endif -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( unsigned int y = 0; y < ny_p; ++y ) { @@ -79,7 +79,7 @@ void MF_Solver2D_Yee::operator()( ElectroMagn *fields ) } // Magnetic field Bz^(d,d) -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( Bz2D[0:sizeofBy], Ex2D[0:sizeofEx], Ey2D[0:sizeofEz] ) #pragma acc loop gang #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) @@ -87,10 +87,10 @@ void MF_Solver2D_Yee::operator()( ElectroMagn *fields ) #pragma omp teams distribute parallel for collapse( 2 ) #endif for( unsigned int x = 1; x < nx_d - 1; ++x ) { -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) #pragma omp simd #endif -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( unsigned int y = 1; y < ny_d - 1; ++y ) { diff --git a/src/ElectroMagnSolver/MF_Solver3D_Yee.cpp b/src/ElectroMagnSolver/MF_Solver3D_Yee.cpp index 5930af3e1..f70159699 100755 --- a/src/ElectroMagnSolver/MF_Solver3D_Yee.cpp +++ b/src/ElectroMagnSolver/MF_Solver3D_Yee.cpp 
@@ -34,7 +34,7 @@ void MF_Solver3D_Yee::operator()( ElectroMagn *fields ) const double * __restrict__ Ez3D = isEFilterApplied ? fields->filter_->Ez_[0]->data() : fields->Ez_->data(); // Magnetic field Bx^(p,d,d) -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) const int sizeofEx = fields->Ex_->number_of_points_; const int sizeofEy = fields->Ey_->number_of_points_; const int sizeofEz = fields->Ez_->number_of_points_; @@ -49,11 +49,11 @@ void MF_Solver3D_Yee::operator()( ElectroMagn *fields ) #pragma omp teams distribute parallel for collapse( 3 ) #endif for( unsigned int i=0 ; i dA_over_dx_fdtd = ( ( *A_n_pml )( i+1, j )-( *A_n_pml )( i-1, j ) )/(2.*dx) ; - std::complex dA_over_dx = dA_over_dx_fdtd - + i1*k0*( *A_n_pml )( i, j ) ; + // std::complex dA_over_dx = dA_over_dx_fdtd + // + i1*k0*( *A_n_pml )( i, j ) ; // d2A/dx^2 = d2A/dx^2 + 2ik0 dA/dx - k0^2 A std::complex d2A_over_dx2_fdtd = ( ( *A_n_pml )( i-1, j )-2.*( *A_n_pml )( i, j )+( *A_n_pml )( i+1, j ) )/(dx*dx) ; std::complex d2A_over_dx2 = d2A_over_dx2_fdtd @@ -590,8 +590,8 @@ void PML_Solver2D_Envelope::compute_A_from_G( LaserEnvelope *envelope, int iDim, // ---- // dA/dx = dA/dx + ik0 A std::complex dA_over_dx_fdtd = ( ( *A_n_pml )( i+1, j )-( *A_n_pml )( i-1, j ) )/(2.*dx) ; - std::complex dA_over_dx = dA_over_dx_fdtd - + i1*k0*( *A_n_pml )( i, j ) ; + // std::complex dA_over_dx = dA_over_dx_fdtd + // + i1*k0*( *A_n_pml )( i, j ) ; // d2A/dx^2 = d2A/dx^2 + 2ik0 dA/dx - k0^2 A std::complex d2A_over_dx2_fdtd = ( ( *A_n_pml )( i-1, j )-2.*( *A_n_pml )( i, j )+( *A_n_pml )( i+1, j ) )/(dx*dx) ; std::complex d2A_over_dx2 = d2A_over_dx2_fdtd diff --git a/src/ElectroMagnSolver/PML_SolverAM_Envelope.cpp b/src/ElectroMagnSolver/PML_SolverAM_Envelope.cpp index 7e4e740c7..d8c65645a 100644 --- a/src/ElectroMagnSolver/PML_SolverAM_Envelope.cpp +++ b/src/ElectroMagnSolver/PML_SolverAM_Envelope.cpp @@ -395,7 +395,6 @@ void PML_SolverAM_Envelope::compute_A_from_G( LaserEnvelope *envelope, int iDim, double k0 = 1.; // laser wavenumber std::complex source_term_x ; std::complex source_term_y ; - double mpml_ratio = 0.00; if (iDim == 0) { for( unsigned int k=0 ; k<1 ; k++ ) { @@ -405,7 +404,7 @@ void PML_SolverAM_Envelope::compute_A_from_G( LaserEnvelope *envelope, int iDim, // dA/dx = dA/dx + ik0 A // r dA/dx = r dA/dx + ik0 rA <=> dG/dx = dG/dx + ik0 G std::complex dG_over_dx_fdtd = ( ( *G_n_pml )( i+1, j )-( *G_n_pml )( i-1, j ) )/(2.*dl) ; - std::complex dG_over_dx = dG_over_dx_fdtd + i1*k0*( *G_n_pml )( i, j ) ; + // std::complex dG_over_dx = dG_over_dx_fdtd + i1*k0*( *G_n_pml )( i, j ) ; // d2A/dx^2 = d2A/dx^2 + 2ik0 dA/dx - k0^2 A // r d2A/dx^2 = r d2A/dx^2 + r 2ik0 dA/dx - r k0^2 A <=> d2G/dx^2 = d2G/dx^2 + 2ik0 dG/dx - k0^2 G std::complex d2G_over_dx2_fdtd = ( ( *G_n_pml )( i-1, j )-2.*( *G_n_pml )( i, j )+( *G_n_pml )( i+1, j ) )/(dl*dl) ; @@ -494,7 +493,7 @@ void PML_SolverAM_Envelope::compute_A_from_G( LaserEnvelope *envelope, int iDim, // dA/dx = dA/dx + ik0 A // r dA/dx = r dA/dx + ik0 rA <=> dG/dx = dG/dx + ik0 G std::complex dA_over_dx_fdtd = ( ( *A_n_pml )( i+1, j )-( *A_n_pml )( i-1, j ) )/(2.*dl) ; - std::complex dA_over_dx = dA_over_dx_fdtd + i1*k0*( *A_n_pml )( i, j ) ; + // std::complex dA_over_dx = dA_over_dx_fdtd + i1*k0*( *A_n_pml )( i, j ) ; // d2A/dx^2 = d2A/dx^2 + 2ik0 dA/dx - k0^2 A // r d2A/dx^2 = r d2A/dx^2 + r 2ik0 dA/dx - r k0^2 A <=> d2G/dx^2 = d2G/dx^2 + 2ik0 dG/dx - k0^2 G std::complex d2A_over_dx2_fdtd = ( ( *A_n_pml )( i-1, j )-2.*( *A_n_pml )( i, j )+( *A_n_pml )( i+1, j ) 
)/(dl*dl) ; @@ -635,8 +634,8 @@ void PML_SolverAM_Envelope::compute_A_from_G( LaserEnvelope *envelope, int iDim, for( unsigned int j=solvermin ; j < solvermax ; j++ ) { // y loop // r dA/dx = r dA/dx + ik0 rA <=> dG/dx = dG/dx + ik0 G std::complex dG_over_dx_fdtd = ( ( *G_n_pml )( i+1, j )-( *G_n_pml )( i-1, j ) )/(2.*dl) ; - std::complex dG_over_dx = dG_over_dx_fdtd - + i1*k0*( *G_n_pml )( i, j ) ; + // std::complex dG_over_dx = dG_over_dx_fdtd + // + i1*k0*( *G_n_pml )( i, j ) ; // r d2A/dx^2 = r d2A/dx^2 + r 2ik0 dA/dx - r k0^2 A <=> d2G/dx^2 = d2G/dx^2 + 2ik0 dG/dx - k0^2 G std::complex d2G_over_dx2_fdtd = ( ( *G_n_pml )( i-1, j )-2.*( *G_n_pml )( i, j )+( *G_n_pml )( i+1, j ) )/(dl*dl) ; std::complex d2G_over_dx2 = d2G_over_dx2_fdtd diff --git a/src/ElectroMagnSolver/PML_SolverAM_EnvelopeReducedDispersion.cpp b/src/ElectroMagnSolver/PML_SolverAM_EnvelopeReducedDispersion.cpp index 771f12e37..c2a5c4087 100644 --- a/src/ElectroMagnSolver/PML_SolverAM_EnvelopeReducedDispersion.cpp +++ b/src/ElectroMagnSolver/PML_SolverAM_EnvelopeReducedDispersion.cpp @@ -400,7 +400,6 @@ void PML_SolverAM_EnvelopeReducedDispersion::compute_A_from_G( LaserEnvelope *en double k0 = 1.; // laser wavenumber std::complex source_term_x ; std::complex source_term_y ; - double mpml_ratio = 0.00; if (iDim == 0) { for( unsigned int k=0 ; k<1 ; k++ ) { @@ -410,7 +409,7 @@ void PML_SolverAM_EnvelopeReducedDispersion::compute_A_from_G( LaserEnvelope *en // dA/dx = dA/dx + ik0 A // r dA/dx = r dA/dx + ik0 rA <=> dG/dx = dG/dx + ik0 G std::complex dG_over_dx_fdtd = (1.+delta)*( ( *G_n_pml )( i+1, j )-( *G_n_pml )( i-1, j ) )/(2.*dl) - delta*( ( *G_n_pml )( i+2, j )-( *G_n_pml )( i-2, j ) )/(4.*dl) ; - std::complex dG_over_dx = dG_over_dx_fdtd + i1*k0*( *G_n_pml )( i, j ) ; + // std::complex dG_over_dx = dG_over_dx_fdtd + i1*k0*( *G_n_pml )( i, j ) ; // d2A/dx^2 = d2A/dx^2 + 2ik0 dA/dx - k0^2 A // r d2A/dx^2 = r d2A/dx^2 + r 2ik0 dA/dx - r k0^2 A <=> d2G/dx^2 = d2G/dx^2 + 2ik0 dG/dx - k0^2 G std::complex d2G_over_dx2_fdtd = (1.+delta)*( ( *G_n_pml )( i-1, j )-2.*( *G_n_pml )( i, j )+( *G_n_pml )( i+1, j ) )/(dl*dl)-delta*( ( *G_n_pml )( i-2, j )-2.*( *G_n_pml )( i, j )+( *G_n_pml )( i+2, j ) )/(4.*dl*dl) ; @@ -490,7 +489,7 @@ void PML_SolverAM_EnvelopeReducedDispersion::compute_A_from_G( LaserEnvelope *en for( unsigned int i=solvermin ; i dA_over_dx_fdtd = (1.+delta)*( ( *A_n_pml )( i+1, j )-( *A_n_pml )( i-1, j ) )/(2.*dl) - delta*( ( *A_n_pml )( i+2, j )-( *A_n_pml )( i-2, j ) )/(4.*dl) ; - std::complex dA_over_dx = dA_over_dx_fdtd + i1*k0*( *A_n_pml )( i, j ) ; + // std::complex dA_over_dx = dA_over_dx_fdtd + i1*k0*( *A_n_pml )( i, j ) ; // d2A/dx^2 = d2A/dx^2 + 2ik0 dA/dx - k0^2 A // r d2A/dx^2 = r d2A/dx^2 + r 2ik0 dA/dx - r k0^2 A <=> d2G/dx^2 = d2G/dx^2 + 2ik0 dG/dx - k0^2 G std::complex d2A_over_dx2_fdtd = (1.+delta)*( ( *A_n_pml )( i-1, j )-2.*( *A_n_pml )( i, j )+( *A_n_pml )( i+1, j ) )/(dl*dl)-delta*( ( *A_n_pml )( i-2, j )-2.*( *A_n_pml )( i, j )+( *A_n_pml )( i+2, j ) )/(4.*dl*dl) ; @@ -591,7 +590,7 @@ void PML_SolverAM_EnvelopeReducedDispersion::compute_A_from_G( LaserEnvelope *en for( unsigned int i=2 ; i dG_over_dx_fdtd = (1.+delta)*( ( *G_n_pml )( i+1, j )-( *G_n_pml )( i-1, j ) )/(2.*dl) - delta*( ( *G_n_pml )( i+2, j )-( *G_n_pml )( i-2, j ) )/(4.*dl) ; - std::complex dG_over_dx = dG_over_dx_fdtd + i1*k0*( *G_n_pml )( i, j ) ; + // std::complex dG_over_dx = dG_over_dx_fdtd + i1*k0*( *G_n_pml )( i, j ) ; // d2A/dx^2 = d2A/dx^2 + 2ik0 dA/dx - k0^2 A // r d2A/dx^2 = r d2A/dx^2 + r 2ik0 dA/dx - r k0^2 A 
<=> d2G/dx^2 = d2G/dx^2 + 2ik0 dG/dx - k0^2 G std::complex d2G_over_dx2_fdtd = (1.+delta)*( ( *G_n_pml )( i-1, j )-2.*( *G_n_pml )( i, j )+( *G_n_pml )( i+1, j ) )/(dl*dl)-delta*( ( *G_n_pml )( i-2, j )-2.*( *G_n_pml )( i, j )+( *G_n_pml )( i+2, j ) )/(4.*dl*dl) ; diff --git a/src/Field/Field.cpp b/src/Field/Field.cpp index 19c820d1d..0d8427f1e 100644 --- a/src/Field/Field.cpp +++ b/src/Field/Field.cpp @@ -5,14 +5,14 @@ void Field::put_to( double val ) { SMILEI_ASSERT( data_ != nullptr ); -#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC ) const bool is_hostptr_mapped_on_device = smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( data_ ); #endif // NVCC's OpenACC needs that redundant pointer value double* an_other_data_pointer = data_; -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) // Test if data exists on GPU, put_to can be used on CPU and GPU during a simulation #pragma acc parallel present( an_other_data_pointer [0:size()] ) if( is_hostptr_mapped_on_device ) #pragma acc loop gang worker vector @@ -25,7 +25,7 @@ void Field::put_to( double val ) } } -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) //! copy the field array from Host to Device void Field::copyFromHostToDevice() { diff --git a/src/Field/Field.h b/src/Field/Field.h index 669106245..563705ab1 100755 --- a/src/Field/Field.h +++ b/src/Field/Field.h @@ -188,7 +188,7 @@ class Field virtual double norm2( unsigned int istart[3][2], unsigned int bufsize[3][2] ) = 0; -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) //! Compute the norm2OnDevice of the field virtual double norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3][2] ) = 0; #endif @@ -234,7 +234,7 @@ class Field return sum; } -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) inline double __attribute__((always_inline)) normOnDevice() { @@ -245,7 +245,7 @@ class Field #pragma omp target teams distribute parallel for \ map(tofrom: sum) map(to: number_of_points_) \ reduction(+:sum) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present(field) //deviceptr( data_ ) #pragma acc loop gang worker vector reduction(+:sum) #endif @@ -279,7 +279,7 @@ class Field virtual void extract_fields_sum ( int iDim, int iNeighbor, int ghost_size ) = 0; virtual void inject_fields_sum ( int iDim, int iNeighbor, int ghost_size ) = 0; -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) //! copy the field from Host to Device void copyFromHostToDevice(); diff --git a/src/Field/Field1D.cpp b/src/Field/Field1D.cpp index d0fa18b2f..194660ce6 100755 --- a/src/Field/Field1D.cpp +++ b/src/Field/Field1D.cpp @@ -188,12 +188,37 @@ double Field1D::norm2( unsigned int istart[3][2], unsigned int bufsize[3][2] ) return nrj; } - //! Perform the norm2 on Device -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) double Field1D::norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3][2] ) { - ERROR("Not implemented"); + + double nrj( 0. 
); + + int idxlocalstart[1]; + int idxlocalend[1]; + idxlocalstart[0] = istart[0][isDual_[0]]; + idxlocalend[0] = istart[0][isDual_[0]]+bufsize[0][isDual_[0]]; + + const double *const __restrict__ field = data(); + +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) + #pragma omp target teams distribute parallel for\ + map(tofrom: nrj) \ + map(to: idxlocalstart[0]) \ + /* is_device_ptr( data_ )*/ \ + reduction(+:nrj) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) + #pragma acc parallel present(field) //deviceptr( data_ ) + #pragma acc loop gang worker vector reduction(+:nrj) +#endif + + for( unsigned int i=idxlocalstart[0] ; i< idxlocalend[0] ; i++) { + nrj += field[i]*field[i]; + } + + return nrj; + } #endif @@ -246,15 +271,23 @@ void Field1D::create_sub_fields ( int iDim, int iNeighbor, int ghost_size ) if ( sendFields_[iDim*2+iNeighbor] == NULL ) { sendFields_[iDim*2+iNeighbor] = new Field1D(size); recvFields_[iDim*2+iNeighbor] = new Field1D(size); +#if defined( SMILEI_ACCELERATOR_GPU ) + if( ( name[0] == 'B' ) || ( name[0] == 'J' || name[0] == 'R' ) ) { + sendFields_[iDim * 2 + iNeighbor]->allocateAndCopyFromHostToDevice(); + recvFields_[iDim * 2 + iNeighbor]->allocateAndCopyFromHostToDevice(); + } +#endif } else if( ghost_size != (int) sendFields_[iDim*2+iNeighbor]->dims_[iDim] ) { +#if defined( SMILEI_ACCELERATOR_GPU ) + ERROR( "To Do GPU : envelope" ); +#endif delete sendFields_[iDim*2+iNeighbor]; sendFields_[iDim*2+iNeighbor] = new Field1D(size); delete recvFields_[iDim*2+iNeighbor]; recvFields_[iDim*2+iNeighbor] = new Field1D(size); } } - void Field1D::extract_fields_exch( int iDim, int iNeighbor, int ghost_size ) { std::vector size = dims_; @@ -267,13 +300,30 @@ void Field1D::extract_fields_exch( int iDim, int iNeighbor, int ghost_size ) unsigned int NX = size[0]; - double* sub = sendFields_[iDim*2+iNeighbor]->data_; - double* field = data_; + double *__restrict__ sub = sendFields_[iDim*2+iNeighbor]->data_; + const double*__restrict__ field = data_; + +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) + // At initialization, this data is NOT on the GPU + const bool should_manipulate_gpu_memory = name[0] == 'B' && + smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( sub ); + SMILEI_ASSERT( smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( field ) == + smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( sub ) ); + const unsigned field_first = ix; + const unsigned field_last = ix + NX - 1; + #pragma omp target if( should_manipulate_gpu_memory ) + #pragma omp teams distribute parallel for +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) + const int subSize = sendFields_[iDim*2+iNeighbor]->size(); + const int fSize = number_of_points_; + bool fieldName( (name.substr(0,1) == "B") ); + #pragma acc parallel present( field[0:fSize], sub[0:subSize] ) if (fieldName) + #pragma acc loop gang worker vector +#endif for( unsigned int i=0; i size = dims_; @@ -286,8 +336,25 @@ void Field1D::inject_fields_exch ( int iDim, int iNeighbor, int ghost_size ) unsigned int NX = size[0]; - double* sub = recvFields_[iDim*2+(iNeighbor+1)%2]->data_; - double* field = data_; + const double *__restrict__ sub = recvFields_[iDim*2+(iNeighbor+1)%2]->data_; + double *__restrict__ field = data_; + +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) + // At initialization, this data is NOT on the GPU + const bool should_manipulate_gpu_memory = name[0] == 'B' && + smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( sub ); + const unsigned 
field_first = ix; + const unsigned field_last = ix + NX - 1; + #pragma omp target if( should_manipulate_gpu_memory ) \ + map( tofrom : field [field_first:field_last - field_first] ) + #pragma omp teams distribute parallel for +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) + int subSize = recvFields_[iDim*2+(iNeighbor+1)%2]->size(); + const int fSize = number_of_points_; + bool fieldName( name.substr(0,1) == "B" ); + #pragma acc parallel present( field[0:fSize], sub[0:subSize] ) if (fieldName) + #pragma acc loop gang worker vector +#endif for( unsigned int i=0; idata_; - double* field = data_; + double *__restrict__ sub = sendFields_[iDim*2+iNeighbor]->data_; + const double *__restrict__ field = data_; + +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) + // At initialization, this data is NOT on the GPU + const bool should_manipulate_gpu_memory = (name[0] == 'J' || name[0] == 'R') && + smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( sub ); + const unsigned field_first = ix; + const unsigned field_last = ix + NX - 1; + #pragma omp target if( should_manipulate_gpu_memory ) \ + map( to : field [field_first:field_last - field_first] ) + #pragma omp teams distribute parallel for +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) + const int subSize = sendFields_[iDim*2+iNeighbor]->size(); + const int fSize = number_of_points_; + bool fieldName( ((name.substr(0,1) == "J") || (name.substr(0,1) == "R") ) && smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( sub )); + #pragma acc parallel copy(field[0:fSize]) present( sub[0:subSize] ) if (fieldName) + #pragma acc loop gang worker vector +#endif for( unsigned int i=0; i size = dims_; @@ -324,9 +407,27 @@ void Field1D::inject_fields_sum ( int iDim, int iNeighbor, int ghost_size ) unsigned int NX = size[0]; - double* sub = recvFields_[iDim*2+(iNeighbor+1)%2]->data_; - double* field = data_; + const double *__restrict__ sub = recvFields_[iDim*2+(iNeighbor+1)%2]->data_; + double *__restrict__ field = data_; + +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) + // At initialization, this data is NOT on the GPU + const bool should_manipulate_gpu_memory = (name[0] == 'J' || name[0] == 'R') && + smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( sub ); + const unsigned field_first = ix; + const unsigned field_last = ix + NX - 1; + #pragma omp target if( should_manipulate_gpu_memory ) \ + map( tofrom : field [field_first:field_last - field_first] ) + #pragma omp teams distribute parallel for +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) + int subSize = recvFields_[iDim*2+(iNeighbor+1)%2]->size(); + int fSize = number_of_points_; + bool fieldName( name.substr(0,1) == "J" || name.substr(0,1) == "R"); + #pragma acc parallel copy(field[0:fSize]) present( sub[0:subSize] ) if (fieldName) + #pragma acc loop gang worker vector +#endif for( unsigned int i=0; iisOnDevice() ) { sendFields_[iside]->deleteOnDevice(); @@ -220,7 +220,7 @@ double Field2D::norm2( unsigned int istart[3][2], unsigned int bufsize[3][2] ) } //! 
Perform the norm2 on Device -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) double Field2D::norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3][2] ) { @@ -247,7 +247,7 @@ double Field2D::norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3 map(to: ny, idxlocalstart[0], idxlocalstart[1], iystart, iyend) \ /* is_device_ptr( data_ )*/ \ reduction(+:nrj) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present(field) //deviceptr( data_ ) #pragma acc loop gang worker vector collapse(2) reduction(+:nrj) #endif @@ -333,7 +333,7 @@ void Field2D::create_sub_fields( int iDim, int iNeighbor, int ghost_size ) sendFields_[iDim*2+iNeighbor] = new Field2D(size); recvFields_[iDim*2+iNeighbor] = new Field2D(size); -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) if( ( name[0] == 'B' ) || ( name[0] == 'J' || name[0] == 'R' ) ) { sendFields_[iDim * 2 + iNeighbor]->allocateAndCopyFromHostToDevice(); recvFields_[iDim * 2 + iNeighbor]->allocateAndCopyFromHostToDevice(); @@ -341,7 +341,7 @@ void Field2D::create_sub_fields( int iDim, int iNeighbor, int ghost_size ) #endif } else if ( ghost_size != (int)(sendFields_[iDim*2+iNeighbor]->dims_[iDim]) ) { -#if defined( SMILEI_OPENACC_MODE ) || defined( SMILEI_ACCELERATOR_GPU_OMP ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) || defined( SMILEI_ACCELERATOR_GPU_OMP ) ERROR( "To Do GPU : envelope" ); #endif delete sendFields_[iDim*2+iNeighbor]; @@ -381,7 +381,7 @@ void Field2D::extract_fields_exch( int iDim, int iNeighbor, int ghost_size ) #pragma omp target if( should_manipulate_gpu_memory ) #pragma omp teams distribute parallel for collapse( 2 ) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) const int subSize = sendFields_[iDim*2+iNeighbor]->size(); const int fSize = number_of_points_; bool fieldName( (name.substr(0,1) == "B") ); @@ -389,7 +389,7 @@ void Field2D::extract_fields_exch( int iDim, int iNeighbor, int ghost_size ) #pragma acc loop gang #endif for( unsigned int i=0; isize(); const int fSize = number_of_points_; bool fieldName( name.substr(0,1) == "B" ); @@ -437,7 +437,7 @@ void Field2D::inject_fields_exch ( int iDim, int iNeighbor, int ghost_size ) #pragma acc loop gang #endif for( unsigned int i=0; isize(); const int fSize = number_of_points_; bool fieldName( ((name.substr(0,1) == "J") || (name.substr(0,1) == "R") ) && smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( sub )); @@ -486,7 +486,7 @@ void Field2D::extract_fields_sum ( int iDim, int iNeighbor, int ghost_size ) #pragma acc loop gang #endif for( unsigned int i=0; isize(); int fSize = number_of_points_; bool fieldName( name.substr(0,1) == "J" || name.substr(0,1) == "R"); @@ -535,7 +535,7 @@ void Field2D::inject_fields_sum ( int iDim, int iNeighbor, int ghost_size ) #pragma acc loop gang #endif for( unsigned int i=0; i #include -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #include #endif @@ -81,7 +81,7 @@ Field3D::~Field3D() for( unsigned int iside=0 ; isideisOnDevice() ) { @@ -102,7 +102,9 @@ Field3D::~Field3D() } } if( data_!=NULL ) { +#if defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc exit data delete (data_[0:number_of_points_]) if (acc_deviceptr(data_) != NULL) +#endif delete [] data_; for( unsigned int i=0; idata_3D[i]; @@ -248,7 +250,7 @@ double Field3D::norm2( unsigned int istart[3][2], unsigned int bufsize[3][2] ) } // Perform the norm2 on Device -#if 
defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) double Field3D::norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3][2] ) { double nrj( 0. ); @@ -277,7 +279,7 @@ double Field3D::norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3 map(to: ny, nz, ixstart, ixend, iystart, iyend, izstart, izend) \ /*is_device_ptr( data_ ) */ \ reduction(+:nrj) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present(field[0:number_of_points_]) //deviceptr( data_ ) #pragma acc loop gang worker vector collapse(3) reduction(+:nrj) #endif @@ -405,7 +407,7 @@ void Field3D::create_sub_fields ( int iDim, int iNeighbor, int ghost_size ) sendFields_[iDim*2+iNeighbor] = new Field3D(size); recvFields_[iDim*2+iNeighbor] = new Field3D(size); -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) if( ( name[0] == 'B' ) || ( name[0] == 'J' || name[0] == 'R' ) ) { @@ -427,7 +429,7 @@ void Field3D::create_sub_fields ( int iDim, int iNeighbor, int ghost_size ) } else if( ghost_size != (int) sendFields_[iDim*2+iNeighbor]->dims_[iDim] ) { -#if defined( SMILEI_OPENACC_MODE ) || defined( SMILEI_ACCELERATOR_GPU_OMP ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) || defined( SMILEI_ACCELERATOR_GPU_OMP ) ERROR( "To Do GPU : envelope" ); #endif delete sendFields_[iDim*2+iNeighbor]; @@ -463,7 +465,7 @@ void Field3D::extract_fields_exch( int iDim, int iNeighbor, int ghost_size ) #pragma omp target if( is_the_right_field ) #pragma omp teams distribute parallel for collapse( 3 ) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) const int subSize = sendFields_[iDim*2+iNeighbor]->size(); const int fSize = number_of_points_; bool fieldName( (name.substr(0,1) == "B") ); @@ -471,11 +473,11 @@ void Field3D::extract_fields_exch( int iDim, int iNeighbor, int ghost_size ) #pragma acc loop gang #endif for( unsigned int i=0; i<(unsigned int)NX; i++ ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop worker #endif for( unsigned int j=0; j<(unsigned int)NY; j++ ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( unsigned int k=0; k<(unsigned int)NZ; k++ ) { @@ -514,7 +516,7 @@ void Field3D::inject_fields_exch ( int iDim, int iNeighbor, int ghost_size ) map( tofrom \ : field [0:fSize] ) #pragma omp teams distribute parallel for collapse( 3 ) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) int subSize = recvFields_[iDim*2+(iNeighbor+1)%2]->size(); const int fSize = number_of_points_; bool fieldName( name.substr(0,1) == "B" ); @@ -522,11 +524,11 @@ void Field3D::inject_fields_exch ( int iDim, int iNeighbor, int ghost_size ) #pragma acc loop gang #endif for( unsigned int i=0; i<(unsigned int)NX; i++ ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop worker #endif for( unsigned int j=0; j<(unsigned int)NY; j++ ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( unsigned int k=0; k<(unsigned int)NZ; k++ ) { @@ -566,7 +568,7 @@ void Field3D::extract_fields_sum ( int iDim, int iNeighbor, int ghost_size ) map( to \ : field [0:fSize] ) #pragma omp teams distribute parallel for collapse( 3 ) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) const int subSize = sendFields_[iDim*2+iNeighbor]->size(); const int fSize = number_of_points_; bool fieldName( (name.substr(0,1) 
== "J") || (name.substr(0,1) == "R")); @@ -575,11 +577,11 @@ void Field3D::extract_fields_sum ( int iDim, int iNeighbor, int ghost_size ) #pragma acc loop gang #endif for( unsigned int i=0; i<(unsigned int)NX; i++ ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop worker #endif for( unsigned int j=0; j<(unsigned int)NY; j++ ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( unsigned int k=0; k<(unsigned int)NZ; k++ ) { @@ -618,7 +620,7 @@ void Field3D::inject_fields_sum ( int iDim, int iNeighbor, int ghost_size ) map( tofrom \ : field [0:fSize] ) #pragma omp teams distribute parallel for collapse( 3 ) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) int subSize = recvFields_[iDim*2+(iNeighbor+1)%2]->size(); int fSize = number_of_points_; bool fieldName( name.substr(0,1) == "J" || name.substr(0,1) == "R"); @@ -627,11 +629,11 @@ void Field3D::inject_fields_sum ( int iDim, int iNeighbor, int ghost_size ) #pragma acc loop gang #endif for( unsigned int i=0; i<(unsigned int)NX; i++ ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop worker #endif for( unsigned int j=0; j<(unsigned int)NY; j++ ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( unsigned int k=0; k<(unsigned int)NZ; k++ ) { diff --git a/src/Field/Field3D.h b/src/Field/Field3D.h index cc9524790..9f9ce4c9a 100755 --- a/src/Field/Field3D.h +++ b/src/Field/Field3D.h @@ -100,7 +100,7 @@ class Field3D : public Field virtual double norm2( unsigned int istart[3][2], unsigned int bufsize[3][2] ) override; //! Compute the norm2OnDevice of the field -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) virtual double norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3][2] ) override final; #endif diff --git a/src/Field/cField.h b/src/Field/cField.h index c37aa9514..d76de6ed7 100755 --- a/src/Field/cField.h +++ b/src/Field/cField.h @@ -63,7 +63,7 @@ class cField : public Field virtual double norm2( unsigned int istart[3][2], unsigned int bufsize[3][2] ) override = 0; //! Compute the norm2OnDevice of the field -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) virtual double norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3][2] ) = 0; #endif diff --git a/src/Field/cField1D.cpp b/src/Field/cField1D.cpp index 77b0c2685..6a79da95a 100755 --- a/src/Field/cField1D.cpp +++ b/src/Field/cField1D.cpp @@ -191,7 +191,7 @@ double cField1D::norm2( unsigned int istart[3][2], unsigned int bufsize[3][2] ) } //! Perform the norm2 on Device -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) double cField1D::norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3][2] ) { ERROR("Not implemented"); diff --git a/src/Field/cField1D.h b/src/Field/cField1D.h index 43f2030e3..27b15bfc1 100755 --- a/src/Field/cField1D.h +++ b/src/Field/cField1D.h @@ -94,7 +94,7 @@ class cField1D : public cField virtual double norm2( unsigned int istart[3][2], unsigned int bufsize[3][2] ) override; //! 
Compute the norm2OnDevice of the field -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) virtual double norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3][2] ) override final; #endif diff --git a/src/Field/cField2D.cpp b/src/Field/cField2D.cpp index e1ca5560a..57ff6ea81 100755 --- a/src/Field/cField2D.cpp +++ b/src/Field/cField2D.cpp @@ -219,7 +219,7 @@ double cField2D::norm2( unsigned int istart[3][2], unsigned int bufsize[3][2] ) } //! Perform the norm2 on Device -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) double cField2D::norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3][2] ) { ERROR("Not implemented"); diff --git a/src/Field/cField2D.h b/src/Field/cField2D.h index d447d4f2e..26ee995c9 100755 --- a/src/Field/cField2D.h +++ b/src/Field/cField2D.h @@ -84,7 +84,7 @@ class cField2D : public cField virtual double norm2( unsigned int istart[3][2], unsigned int bufsize[3][2] ) override; //! Compute the norm2OnDevice of the field -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) virtual double norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3][2] ) override final; #endif diff --git a/src/Field/cField3D.cpp b/src/Field/cField3D.cpp index 84510f401..f4249e134 100755 --- a/src/Field/cField3D.cpp +++ b/src/Field/cField3D.cpp @@ -218,7 +218,7 @@ double cField3D::norm2( unsigned int istart[3][2], unsigned int bufsize[3][2] ) } //! Perform the norm2 on Device -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) double cField3D::norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3][2] ) { ERROR("Not implemented"); diff --git a/src/Field/cField3D.h b/src/Field/cField3D.h index a81f293fc..0db1f6835 100755 --- a/src/Field/cField3D.h +++ b/src/Field/cField3D.h @@ -84,7 +84,7 @@ class cField3D : public cField virtual double norm2( unsigned int istart[3][2], unsigned int bufsize[3][2] ) override; //! Compute the norm2OnDevice of the field -#if defined(SMILEI_ACCELERATOR_MODE) +#if defined(SMILEI_ACCELERATOR_GPU) virtual double norm2OnDevice( unsigned int istart[3][2], unsigned int bufsize[3][2] ) override final; #endif diff --git a/src/Interpolator/Interpolator1D.cpp b/src/Interpolator/Interpolator1D.cpp index e10b611bd..cdf84992c 100755 --- a/src/Interpolator/Interpolator1D.cpp +++ b/src/Interpolator/Interpolator1D.cpp @@ -11,7 +11,7 @@ Interpolator1D::Interpolator1D( Patch *patch ) : Interpolator() { - index_domain_begin = patch->getCellStartingGlobalIndex( 0 ); + i_domain_begin_ = patch->getCellStartingGlobalIndex( 0 ); } diff --git a/src/Interpolator/Interpolator1D.h b/src/Interpolator/Interpolator1D.h index c1324e0a3..408b6ac3a 100755 --- a/src/Interpolator/Interpolator1D.h +++ b/src/Interpolator/Interpolator1D.h @@ -22,7 +22,7 @@ class Interpolator1D : public Interpolator protected: //! 
Inverse of the spatial-step double dx_inv_; - unsigned int index_domain_begin; + unsigned int i_domain_begin_; }; #endif diff --git a/src/Interpolator/Interpolator1D2Order.cpp b/src/Interpolator/Interpolator1D2Order.cpp index e867b29be..f85e735de 100755 --- a/src/Interpolator/Interpolator1D2Order.cpp +++ b/src/Interpolator/Interpolator1D2Order.cpp @@ -8,13 +8,11 @@ #include "Particles.h" #include "LaserEnvelope.h" - using namespace std; Interpolator1D2Order::Interpolator1D2Order( Params ¶ms, Patch *patch ) : Interpolator1D( patch ) { dx_inv_ = 1.0/params.cell_length[0]; - } // --------------------------------------------------------------------------------------------------------------------- @@ -23,31 +21,45 @@ Interpolator1D2Order::Interpolator1D2Order( Params ¶ms, Patch *patch ) : Int void Interpolator1D2Order::fields( ElectroMagn *EMfields, Particles &particles, int ipart, int nparts, double *ELoc, double *BLoc ) { // Static cast of the electromagnetic fields - Field1D *Ex1D = static_cast( EMfields->Ex_ ); - Field1D *Ey1D = static_cast( EMfields->Ey_ ); - Field1D *Ez1D = static_cast( EMfields->Ez_ ); - Field1D *Bx1D_m = static_cast( EMfields->Bx_m ); - Field1D *By1D_m = static_cast( EMfields->By_m ); - Field1D *Bz1D_m = static_cast( EMfields->Bz_m ); + Field1D *Ex1D = static_cast( EMfields->Ex_ ); + Field1D *Ey1D = static_cast( EMfields->Ey_ ); + Field1D *Ez1D = static_cast( EMfields->Ez_ ); + Field1D *Bx1D = static_cast( EMfields->Bx_m ); + Field1D *By1D = static_cast( EMfields->By_m ); + Field1D *Bz1D = static_cast( EMfields->Bz_m ); // Particle position (in units of the spatial-step) - double xpn = particles.position( 0, ipart )*dx_inv_; + double xpn = particles.position( 0, ipart ) * dx_inv_; // Calculate coeffs int idx_p[1], idx_d[1]; double delta_p[1]; double coeffxp[3]; double coeffxd[3]; + coeffs( xpn, idx_p, idx_d, coeffxp, coeffxd, delta_p ); + // Interpolation of Ex^(d) + /*ELoc[0*nparts+ipart] = compute( &coeffxd[0], Ex1D, idx_d[0] ); + // Interpolation of Ey^(p) + ELoc[1*nparts+ipart] = compute( &coeffxp[0], Ey1D, idx_p[0] ); + // Interpolation of Ez^(p) + ELoc[2*nparts+ipart] = compute( &coeffxp[0], Ez1D, idx_p[0] ); + // Interpolation of Bx^(p) + BLoc[0*nparts+ipart] = compute( &coeffxp[0], Bx1D, idx_p[0] ); + // Interpolation of By^(d) + BLoc[1*nparts+ipart] = compute( &coeffxd[0], By1D, idx_d[0] ); + // Interpolation of Bz^(d) + BLoc[2*nparts+ipart] = compute( &coeffxd[0], Bz1D, idx_d[0] );*/ + // Interpolate the fields from the Dual grid : Ex, By, Bz - *( ELoc+0*nparts ) = compute( coeffxd, Ex1D, idx_d[0] ); - *( BLoc+1*nparts ) = compute( coeffxd, By1D_m, idx_d[0] ); - *( BLoc+2*nparts ) = compute( coeffxd, Bz1D_m, idx_d[0] ); + *( ELoc+0*nparts ) = compute( coeffxd, Ex1D, idx_d[0] ); + *( BLoc+1*nparts ) = compute( coeffxd, By1D, idx_d[0] ); + *( BLoc+2*nparts ) = compute( coeffxd, Bz1D, idx_d[0] ); // Interpolate the fields from the Primal grid : Ey, Ez, Bx - *( ELoc+1*nparts ) = compute( coeffxp, Ey1D, idx_p[0] ); - *( ELoc+2*nparts ) = compute( coeffxp, Ez1D, idx_p[0] ); - *( BLoc+0*nparts ) = compute( coeffxp, Bx1D_m, idx_p[0] ); + *( ELoc+1*nparts ) = compute( coeffxp, Ey1D, idx_p[0] ); + *( ELoc+2*nparts ) = compute( coeffxp, Ez1D, idx_p[0] ); + *( BLoc+0*nparts ) = compute( coeffxp, Bx1D, idx_p[0] ); }//END Interpolator1D2Order @@ -65,16 +77,16 @@ void Interpolator1D2Order::fieldsAndCurrents( ElectroMagn *EMfields, Particles & } // Static cast of the electromagnetic fields - Field1D *Ex1D = static_cast( EMfields->Ex_ ); - Field1D *Ey1D = static_cast( 
EMfields->Ey_ ); - Field1D *Ez1D = static_cast( EMfields->Ez_ ); - Field1D *Bx1D_m = static_cast( EMfields->Bx_m ); - Field1D *By1D_m = static_cast( EMfields->By_m ); - Field1D *Bz1D_m = static_cast( EMfields->Bz_m ); - Field1D *Jx1D = static_cast( EMfields->Jx_ ); - Field1D *Jy1D = static_cast( EMfields->Jy_ ); - Field1D *Jz1D = static_cast( EMfields->Jz_ ); - Field1D *Rho1D = static_cast( EMfields->rho_ ); + Field1D *Ex1D = static_cast( EMfields->Ex_ ); + Field1D *Ey1D = static_cast( EMfields->Ey_ ); + Field1D *Ez1D = static_cast( EMfields->Ez_ ); + Field1D *Bx1D = static_cast( EMfields->Bx_m ); + Field1D *By1D = static_cast( EMfields->By_m ); + Field1D *Bz1D = static_cast( EMfields->Bz_m ); + Field1D *Jx1D = static_cast( EMfields->Jx_ ); + Field1D *Jy1D = static_cast( EMfields->Jy_ ); + Field1D *Jz1D = static_cast( EMfields->Jz_ ); + Field1D *Rho1D = static_cast( EMfields->rho_ ); Field1D *By1DBTIS3; Field1D *Bz1DBTIS3; if (smpi->use_BTIS3){ @@ -89,19 +101,34 @@ void Interpolator1D2Order::fieldsAndCurrents( ElectroMagn *EMfields, Particles & double delta_p[1]; double coeffxp[3]; double coeffxd[3]; + coeffs( xpn, idx_p, idx_d, coeffxp, coeffxd, delta_p ); int nparts( particles.numberOfParticles() ); + // Interpolation of Ex^(d) + /*ELoc[0*nparts+ipart] = compute( &coeffxd[0], Ex1D, idx_d[0] ); + // Interpolation of Ey^(p) + ELoc[1*nparts+ipart] = compute( &coeffxp[0], Ey1D, idx_p[0] ); + // Interpolation of Ez^(p) + ELoc[2*nparts+ipart] = compute( &coeffxp[0], Ez1D, idx_p[0] ); + // Interpolation of Bx^(p) + BLoc[0*nparts+ipart] = compute( &coeffxp[0], Bx1D, idx_p[0] ); + // Interpolation of By^(d) + BLoc[1*nparts+ipart] = compute( &coeffxd[0], By1D, idx_d[0] ); + // Interpolation of Bz^(d) + BLoc[2*nparts+ipart] = compute( &coeffxd[0], Bz1D, idx_d[0] );*/ + + // Interpolate the fields from the Dual grid : Ex, By, Bz - *( ELoc+0*nparts ) = compute( coeffxd, Ex1D, idx_d[0] ); - *( BLoc+1*nparts ) = compute( coeffxd, By1D_m, idx_d[0] ); - *( BLoc+2*nparts ) = compute( coeffxd, Bz1D_m, idx_d[0] ); + *( ELoc+0*nparts ) = compute( &coeffxd[0], Ex1D, idx_d[0] ); + *( BLoc+1*nparts ) = compute( &coeffxd[0], By1D, idx_d[0] ); + *( BLoc+2*nparts ) = compute( &coeffxd[0], Bz1D, idx_d[0] ); // Interpolate the fields from the Primal grid : Ey, Ez, Bx - *( ELoc+1*nparts ) = compute( coeffxp, Ey1D, idx_p[0] ); - *( ELoc+2*nparts ) = compute( coeffxp, Ez1D, idx_p[0] ); - *( BLoc+0*nparts ) = compute( coeffxp, Bx1D_m, idx_p[0] ); + *( ELoc+1*nparts ) = compute( &coeffxp[0], Ey1D, idx_p[0] ); + *( ELoc+2*nparts ) = compute( &coeffxp[0], Ez1D, idx_p[0] ); + *( BLoc+0*nparts ) = compute( &coeffxp[0], Bx1D, idx_p[0] ); // Interpolate the fields from the Primal grid : Jy, Jz, Rho JLoc->y = compute( coeffxp, Jy1D, idx_p[0] ); @@ -115,7 +142,6 @@ void Interpolator1D2Order::fieldsAndCurrents( ElectroMagn *EMfields, Particles & *( BLocyBTIS3+0*nparts ) = compute( coeffxp, By1DBTIS3, idx_p[0] ); *( BLoczBTIS3+0*nparts ) = compute( coeffxp, Bz1DBTIS3, idx_p[0] ); } - } // Interpolator on another field than the basic ones @@ -127,7 +153,7 @@ void Interpolator1D2Order::oneField( Field **field, Particles &particles, int *i double coeffxp[3]; double coeffxd[3]; double *coeff = F->isDual( 0 ) ? coeffxd : coeffxp; - int *i = F->isDual( 0 ) ? &idx_d[0] : &idx_p[0]; + int *i = F->isDual( 0 ) ? 
&idx_d[0] : &idx_p[0]; for( int ipart=*istart ; ipart<*iend; ipart++ ) { double xpn = particles.position( 0, ipart )*dx_inv_; @@ -136,26 +162,133 @@ void Interpolator1D2Order::oneField( Field **field, Particles &particles, int *i } } -void Interpolator1D2Order::fieldsWrapper( ElectroMagn *EMfields, Particles &particles, SmileiMPI *smpi, int *istart, int *iend, int ithread, unsigned int, int ) +void Interpolator1D2Order::fieldsWrapper( ElectroMagn *EMfields, + Particles &particles, SmileiMPI *smpi, + int *istart, int *iend, int ithread, unsigned int, int ) { - double *Epart = &( smpi->dynamics_Epart[ithread][0] ); - double *Bpart = &( smpi->dynamics_Bpart[ithread][0] ); - int *iold = &( smpi->dynamics_iold[ithread][0] ); - double *delta = &( smpi->dynamics_deltaold[ithread][0] ); + double *const __restrict__ ELoc = smpi->dynamics_Epart[ithread].data(); + double *const __restrict__ BLoc = smpi->dynamics_Bpart[ithread].data(); - // Static cast of the electromagnetic fields - Field1D *Ex1D = static_cast( EMfields->Ex_ ); - Field1D *Ey1D = static_cast( EMfields->Ey_ ); - Field1D *Ez1D = static_cast( EMfields->Ez_ ); - Field1D *Bx1D = static_cast( EMfields->Bx_m ); - Field1D *By1D = static_cast( EMfields->By_m ); - Field1D *Bz1D = static_cast( EMfields->Bz_m ); + int *const __restrict__ iold = smpi->dynamics_iold[ithread].data(); + double *const __restrict__ delta = smpi->dynamics_deltaold[ithread].data(); + const double *const __restrict__ position_x = particles.getPtrPosition( 0 ); + + // Static cast of the electromagnetic fields + const double *const __restrict__ Ex1D = static_cast( EMfields->Ex_ )->data(); + const double *const __restrict__ Ey1D = static_cast( EMfields->Ey_ )->data(); + const double *const __restrict__ Ez1D = static_cast( EMfields->Ez_ )->data(); + const double *const __restrict__ Bx1D = static_cast( EMfields->Bx_m )->data(); + const double *const __restrict__ By1D = static_cast( EMfields->By_m )->data(); + const double *const __restrict__ Bz1D = static_cast( EMfields->Bz_m )->data(); + +#if defined(SMILEI_ACCELERATOR_GPU_OACC) + const int sizeofEx = EMfields->Ex_->size(); + const int sizeofEy = EMfields->Ey_->size(); + const int sizeofEz = EMfields->Ez_->size(); + const int sizeofBx = EMfields->Bx_m->size(); + const int sizeofBy = EMfields->By_m->size(); + const int sizeofBz = EMfields->Bz_m->size(); +#endif //Loop on bin particles - int nparts = particles.numberOfParticles(); - - if (!smpi->use_BTIS3){ // without BTIS-3 interpolation + const int nparts = particles.numberOfParticles(); + const int first_index = *istart; + const int last_index = *iend; + double accdx_inv[2]; + accdx_inv[0]= dx_inv_; + + if (!smpi->use_BTIS3){ +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) + #pragma omp target map( to : i_domain_begin_) is_device_ptr (position_x) + #pragma omp teams distribute parallel for +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) + #pragma acc enter data create(this) + #pragma acc update device(this) + size_t interpolation_range_size = ( last_index + 0 * nparts ) - first_index; + #pragma acc parallel present(ELoc [first_index:interpolation_range_size],\ + BLoc [first_index:interpolation_range_size],\ + iold [first_index:interpolation_range_size],\ + delta [first_index:interpolation_range_size],\ + Ex1D [0:sizeofEx],\ + Ey1D [0:sizeofEy],\ + Ez1D [0:sizeofEz],\ + Bx1D [0:sizeofBx],\ + By1D [0:sizeofBy],\ + Bz1D [0:sizeofBz])\ + deviceptr(position_x) \ + copyin(accdx_inv[0:2]) //copyin(dx_inv_[:1]) //copyin(dx_inv_) + #pragma acc loop gang worker vector +#endif + for( int ipart 
= first_index; ipart < last_index; ipart++ ) { + // Normalized particle position + const double xpn = position_x[ipart] * accdx_inv[0]; + // Calculate coeffs + int idx_p[1], idx_d[1]; + double delta_p[1]; + double coeffxp[3]; + double coeffxd[3]; + + coeffs( xpn, idx_p, idx_d, coeffxp, coeffxd, delta_p ); + + // Interpolation of Ex^(d) + ELoc[0*nparts+ipart] = compute( &coeffxd[0], Ex1D, idx_d[0] ); + // Interpolation of Ey^(p) + ELoc[1*nparts+ipart] = compute( &coeffxp[0], Ey1D, idx_p[0] ); + // Interpolation of Ez^(p) + ELoc[2*nparts+ipart] = compute( &coeffxp[0], Ez1D, idx_p[0] ); + // Interpolation of Bx^(p) + BLoc[0*nparts+ipart] = compute( &coeffxp[0], Bx1D, idx_p[0] ); + // Interpolation of By^(d) + BLoc[1*nparts+ipart] = compute( &coeffxd[0], By1D, idx_d[0] ); + // Interpolation of Bz^(d) + BLoc[2*nparts+ipart] = compute( &coeffxd[0], Bz1D, idx_d[0] ); + + //Buffering of iol and delta + iold[0*nparts+ipart] = idx_p[0]; + delta[0*nparts+ipart] = delta_p[0]; + + } // end ipart loop + #if defined(SMILEI_ACCELERATOR_GPU_OACC) + #pragma acc exit data delete(this) + #endif + + }else { + + double *BypartBTIS3 = &( smpi->dynamics_Bpart_yBTIS3[ithread][0] ); + double *BzpartBTIS3 = &( smpi->dynamics_Bpart_zBTIS3[ithread][0] ); //*/ + //double *const __restrict__ BypartBTIS3 = smpi->dynamics_Bpart_yBTIS3[ithread].data(); + //double *const __restrict__ BzpartBTIS3 = smpi->dynamics_Bpart_zBTIS3[ithread].data(); + + const double *const __restrict__ By1D_mBTIS3 = static_cast( EMfields->By_mBTIS3 )->data(); + const double *const __restrict__ Bz1D_mBTIS3 = static_cast( EMfields->Bz_mBTIS3 )->data(); + +/* +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) + #pragma omp target map( to : i_domain_begin_) is_device_ptr ( position_x) + #pragma omp teams distribute parallel for +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) + #pragma acc enter data create(this) + #pragma acc update device(this) + size_t interpolation_range_size = ( last_index + 0 * nparts ) - first_index; + #pragma acc parallel present(ELoc [first_index:interpolation_range_size],\ + BLoc [first_index:interpolation_range_size],\ + BypartBTIS3 [first_index:interpolation_range_size],\ + BzpartBTIS3 [first_index:interpolation_range_size],\ + iold [first_index:interpolation_range_size],\ + delta [first_index:interpolation_range_size],\ + Ex1D [0:sizeofEx],\ + Ey1D [0:sizeofEy],\ + Ez1D [0:sizeofEz],\ + Bx1D [0:sizeofBx],\ + By1D [0:sizeofBy],\ + Bz1D [0:sizeofBz],\ + By1D_mBTIS3 [0:sizeofEz],\ + Bz1D_mBTIS3 [0:sizeofEy])\ + deviceptr(position_x) \ + copyin(d_inv_) + #pragma acc loop gang worker vector +#endif //*/ for (int ipart=*istart; ipart < *iend; ipart++){ // Normalized particle position @@ -166,70 +299,39 @@ void Interpolator1D2Order::fieldsWrapper( ElectroMagn *EMfields, Particles &part double delta_p[1]; double coeffxp[3]; double coeffxd[3]; + coeffs( xpn, idx_p, idx_d, coeffxp, coeffxd, delta_p ); // Interpolation of Ex^(d) - *( Epart+0*nparts+ipart ) = compute( coeffxd, Ex1D, idx_d[0] ); + ELoc[0*nparts+ipart] = compute( coeffxd, Ex1D, idx_d[0] ); // Interpolation of Ey^(p) - *( Epart+1*nparts+ipart ) = compute( coeffxp, Ey1D, idx_p[0] ); + ELoc[1*nparts+ipart] = compute( coeffxp, Ey1D, idx_p[0] ); // Interpolation of Ez^(p) - *( Epart+2*nparts+ipart ) = compute( coeffxp, Ez1D, idx_p[0] ); + ELoc[2*nparts+ipart] = compute( coeffxp, Ez1D, idx_p[0] ); // Interpolation of Bx^(p) - *( Bpart+0*nparts+ipart ) = compute( coeffxp, Bx1D, idx_p[0] ); + BLoc[0*nparts+ipart] = compute( coeffxp, Bx1D, idx_p[0] ); // Interpolation of By^(d) - *( 
Bpart+1*nparts+ipart ) = compute( coeffxd, By1D, idx_d[0] ); + BLoc[1*nparts+ipart] = compute( coeffxd, By1D, idx_d[0] ); // Interpolation of Bz^(d) - *( Bpart+2*nparts+ipart ) = compute( coeffxd, Bz1D, idx_d[0] ); + BLoc[2*nparts+ipart] = compute( coeffxd, Bz1D, idx_d[0] ); + // Interpolation of ByBTIS3^(p) + *( BypartBTIS3+0*nparts+ipart ) = compute( coeffxp, By1D_mBTIS3, idx_p[0] ); + // Interpolation of BzBTIS3^(p) + *( BzpartBTIS3+0*nparts+ipart ) = compute( coeffxp, Bz1D_mBTIS3, idx_p[0] ); + // Interpolation of ByBTIS3^(p) + //BypartBTIS3[0*nparts+ipart ] = compute( coeffxp, By1D_mBTIS3, idx_p[0] ); + // Interpolation of BzBTIS3^(p) + //BzpartBTIS3[0*nparts+ipart ] = compute( coeffxp, Bz1D_mBTIS3, idx_p[0] ); //Buffering of iol and delta - *( iold+0*nparts+ipart) = idx_p[0]; - *( delta+0*nparts+ipart) = delta_p[0]; - + iold[0*nparts+ipart] = idx_p[0]; + delta[0*nparts+ipart] = delta_p[0]; } // end ipart loop - } else { // with B-TIS3 interpolation - - Field1D *By1D_mBTIS3 = static_cast( EMfields->By_mBTIS3 ); - Field1D *Bz1D_mBTIS3 = static_cast( EMfields->Bz_mBTIS3 ); - double *BypartBTIS3 = &( smpi->dynamics_Bpart_yBTIS3[ithread][0] ); - double *BzpartBTIS3 = &( smpi->dynamics_Bpart_zBTIS3[ithread][0] ); - - for (int ipart=*istart; ipart < *iend; ipart++){ - - // Normalized particle position - double xpn = particles.position( 0, ipart )*dx_inv_; - - // Calculate coeffs - int idx_p[1], idx_d[1]; - double delta_p[1]; - double coeffxp[3]; - double coeffxd[3]; - - coeffs( xpn, idx_p, idx_d, coeffxp, coeffxd, delta_p ); - - // Interpolation of Ex^(d) - *( Epart+0*nparts+ipart ) = compute( coeffxd, Ex1D, idx_d[0] ); - // Interpolation of Ey^(p) - *( Epart+1*nparts+ipart ) = compute( coeffxp, Ey1D, idx_p[0] ); - // Interpolation of Ez^(p) - *( Epart+2*nparts+ipart ) = compute( coeffxp, Ez1D, idx_p[0] ); - // Interpolation of Bx^(p) - *( Bpart+0*nparts+ipart ) = compute( coeffxp, Bx1D, idx_p[0] ); - // Interpolation of By^(d) - *( Bpart+1*nparts+ipart ) = compute( coeffxd, By1D, idx_d[0] ); - // Interpolation of Bz^(d) - *( Bpart+2*nparts+ipart ) = compute( coeffxd, Bz1D, idx_d[0] ); - // Interpolation of ByBTIS3^(p) - *( BypartBTIS3+0*nparts ) = compute( coeffxp, By1D_mBTIS3, idx_p[0] ); - // Interpolation of BzBTIS3^(p) - *( BzpartBTIS3+0*nparts ) = compute( coeffxp, Bz1D_mBTIS3, idx_p[0] ); - - //Buffering of iol and delta - *( iold+0*nparts+ipart) = idx_p[0]; - *( delta+0*nparts+ipart) = delta_p[0]; - - } // end ipart loop - - } + #if defined(SMILEI_ACCELERATOR_GPU_OACC) + #pragma acc exit data delete(this) + #endif + } // end with B-TIS interpolation + } // Interpolator specific to tracked particles. 
A selection of particles may be provided @@ -350,9 +452,9 @@ void Interpolator1D2Order::fieldsAndEnvelope( ElectroMagn *EMfields, Particles & // Interpolation of Bz^(d) *( Bpart+2*nparts+ipart ) = compute( coeffxd, Bz1D, idx_d[0] ); // Interpolation of ByBTIS3^(p) - *( BypartBTIS3+0*nparts ) = compute( coeffxp, By1D_mBTIS3, idx_p[0] ); + *( BypartBTIS3+0*nparts) = compute( coeffxp, By1D_mBTIS3, idx_p[0] ); // Interpolation of BzBTIS3^(p) - *( BzpartBTIS3+0*nparts ) = compute( coeffxp, Bz1D_mBTIS3, idx_p[0] ); + *( BzpartBTIS3+0*nparts) = compute( coeffxp, Bz1D_mBTIS3, idx_p[0] ); // Interpolation of Phi^(p) *( PHIpart+0*nparts+ipart ) = compute( coeffxp, Phi1D, idx_p[0] ); // Interpolation of GradPhix^(p) @@ -394,11 +496,12 @@ void Interpolator1D2Order::timeCenteredEnvelope( ElectroMagn *EMfields, Particle // Calculate coeffs - int idx_p[1]; + int idx_p[1], idx_d[1]; double delta_p[1]; double coeffxp[3]; + double coeffxd[3]; - coeffs( xpn, idx_p, NULL, coeffxp, NULL, delta_p ); + coeffs( xpn, idx_p, idx_d, coeffxp, coeffxd, delta_p ); // Interpolation of Phi^(p) *( PHI_mpart+0*nparts+ipart ) = compute( coeffxp, Phi_m1D, idx_p[0] ); @@ -428,31 +531,46 @@ void Interpolator1D2Order::envelopeAndSusceptibility( ElectroMagn *EMfields, Par // Normalized particle position double xpn = particles.position( 0, ipart )*dx_inv_; - // Indexes of the central nodes - int idx_p[1]; - double delta_p[1]; + // Calculate coeffs double coeffxp[3]; - coeffs( xpn, idx_p, NULL, coeffxp, NULL, delta_p ); + + // Indexes of the central nodes + int ip = round( xpn ); + + // Declaration and calculation of the coefficient for interpolation + double deltax, delta2; + + deltax = xpn - ( double )ip; + delta2 = deltax*deltax; + coeffxp[0] = 0.5 * ( delta2-deltax+0.25 ); + coeffxp[1] = 0.75 - delta2; + coeffxp[2] = 0.5 * ( delta2+deltax+0.25 ); + + + //!\todo CHECK if this is correct for both primal & dual grids !!! 
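Editor's note: the coefficients that envelopeAndSusceptibility now computes inline are the standard 2nd-order (quadratic-spline) weights on three primal nodes, the same formulas used by coeffs(). The following stand-alone sketch is illustrative only (the helper name, the test field and the main() are not part of Smilei); it shows how the three weights are built from the normalized position and how a value is gathered from nodes ip-1, ip, ip+1.

#include <cmath>
#include <cstdio>

// Illustrative sketch: 2nd-order interpolation of a 1D primal field at xpn (in cell units).
// The weight formulas match the ones in the patch above; everything else is an assumption.
static double interp1d_order2( const double *field, double xpn, int i_domain_begin )
{
    const int    ip = static_cast<int>( std::round( xpn ) ); // central node (global index)
    const double d  = xpn - static_cast<double>( ip );       // normalized distance to that node
    const double d2 = d * d;

    const double c0 = 0.5 * ( d2 - d + 0.25 );
    const double c1 = 0.75 - d2;
    const double c2 = 0.5 * ( d2 + d + 0.25 );                // c0 + c1 + c2 == 1 for any d

    const int i = ip - i_domain_begin;                        // shift to patch-local index
    return c0 * field[i-1] + c1 * field[i] + c2 * field[i+1];
}

int main()
{
    const double f[5] = { 0., 1., 2., 3., 4. };               // linear field => interpolation is exact
    std::printf( "%f\n", interp1d_order2( f, 2.3, 0 ) );      // prints 2.300000
}

Because the weights reproduce linear fields exactly, this kind of one-liner is a convenient sanity check when touching the coefficient code.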
+ // First index for summation + ip = ip - i_domain_begin_; // ------------------------- // Interpolation of Env_A_abs_^(p) // ------------------------- - *( Env_A_abs_Loc ) = compute( coeffxp, Env_A_abs_1D, idx_p[0] ); + *( Env_A_abs_Loc ) = compute( coeffxp, Env_A_abs_1D, ip ); //compute( &coeffp_[1], Env_A_abs_1D, ip_ ); // ------------------------- // Interpolation of Env_Chi_^(p) // ------------------------- - *( Env_Chi_Loc ) = compute( coeffxp, Env_Chi_1D, idx_p[0] ); + *( Env_Chi_Loc ) = compute( coeffxp, Env_Chi_1D, ip ); //compute( &coeffp_[1], Env_Chi_1D, ip_ ); // ------------------------- // Interpolation of Env_E_abs_^(p) // ------------------------- - *( Env_E_abs_Loc ) = compute( coeffxp, Env_E_abs_1D, idx_p[0] ); + *( Env_E_abs_Loc ) = compute( coeffxp, Env_E_abs_1D, ip ); // compute( &coeffp_[1], Env_E_abs_1D, ip_ ); // ------------------------- // Interpolation of Env_Ex_abs_^(p) // ------------------------- - *( Env_Ex_abs_Loc ) = compute( coeffxp, Env_Ex_abs_1D, idx_p[0] ); + *( Env_Ex_abs_Loc ) = compute( coeffxp, Env_Ex_abs_1D, ip ); // compute( &coeffp_[1], Env_Ex_abs_1D, ip_ ); + } // END Interpolator1D2Order @@ -466,13 +584,26 @@ void Interpolator1D2Order::envelopeFieldForIonization( ElectroMagn *EMfields, Pa //Loop on bin particles for( int ipart=*istart ; ipart<*iend; ipart++ ) { - // Normalized particle position - double xpn = particles.position( 0, ipart )*dx_inv_; - int idx_p[1]; double delta_p[1]; double coeffxp[3]; - coeffs( xpn, idx_p, NULL, coeffxp, NULL, delta_p ); + + // Normalized particle position + double xpn = particles.position( 0, ipart )*dx_inv_; + + double delta2; + + // Primal + idx_p[0] = round( xpn ); // index of the central point + delta_p[0] = xpn -( double )idx_p[0]; // normalized distance to the central node + delta2 = pow( delta_p[0], 2 ); // square of the normalized distance to the central node + + // 2nd order interpolation on 3 nodes + coeffxp[0] = 0.5 * ( delta2-delta_p[0]+0.25 ); + coeffxp[1] = ( 0.75-delta2 ); + coeffxp[2] = 0.5 * ( delta2+delta_p[0]+0.25 ); + + idx_p[0] -= i_domain_begin_; // --------------------------------- // Interpolation of Env_E_abs^(p) diff --git a/src/Interpolator/Interpolator1D2Order.h b/src/Interpolator/Interpolator1D2Order.h index 268e33b5a..44e6651d4 100755 --- a/src/Interpolator/Interpolator1D2Order.h +++ b/src/Interpolator/Interpolator1D2Order.h @@ -5,6 +5,8 @@ #include "Interpolator1D.h" #include "Field1D.h" +#include "gpu.h" + // -------------------------------------------------------------------------------------------------------------------- //! 
Class for 2nd order interpolator for 1Dcartesian simulations // -------------------------------------------------------------------------------------------------------------------- @@ -13,7 +15,7 @@ class Interpolator1D2Order final : public Interpolator1D public: Interpolator1D2Order( Params &, Patch * ); - ~Interpolator1D2Order() override final {}; + ~Interpolator1D2Order() override {}; //final inline void __attribute__((always_inline)) fields( ElectroMagn *EMfields, Particles &particles, int ipart, int nparts, double *ELoc, double *BLoc ); inline void __attribute__((always_inline)) fieldsForTasks( ElectroMagn *EMfields, Particles &particles, int ipart, int nparts, double *ELoc, double *BLoc, int *iold, double *delta ); @@ -22,11 +24,23 @@ class Interpolator1D2Order final : public Interpolator1D void fieldsSelection( ElectroMagn *EMfields, Particles &particles, double *buffer, int offset, std::vector *selection ) override final; void oneField( Field **field, Particles &particles, int *istart, int *iend, double *FieldLoc, double *l1=NULL, double *l2=NULL, double *l3=NULL ) override final; - inline double __attribute__((always_inline)) compute( double *coeff, Field1D *f, int idx ) + inline double __attribute__((always_inline)) + compute( double *coeff, Field1D *f, int idx ) { double interp_res = coeff[0] * ( *f )( idx-1 ) + coeff[1] * ( *f )( idx ) + coeff[2] * ( *f )( idx+1 ); return interp_res; - }; + } + + SMILEI_ACCELERATOR_DECLARE_ROUTINE + static inline double __attribute__((always_inline)) + compute( const double *__restrict__ coeff, + const double *__restrict__ f, + int idx ) + { + double interp_res = coeff[0] * f[idx-1] + coeff[1] * f[idx] + coeff[2] * f[idx+1]; + return interp_res; + } + SMILEI_ACCELERATOR_DECLARE_ROUTINE_END void fieldsAndEnvelope( ElectroMagn *EMfields, Particles &particles, SmileiMPI *smpi, int *istart, int *iend, int ithread, int ipart_ref = 0 ) override final; void timeCenteredEnvelope( ElectroMagn *EMfields, Particles &particles, SmileiMPI *smpi, int *istart, int *iend, int ithread, int ipart_ref = 0 ) override final; @@ -34,39 +48,39 @@ class Interpolator1D2Order final : public Interpolator1D void envelopeFieldForIonization( ElectroMagn *EMfields, Particles &particles, SmileiMPI *smpi, int *istart, int *iend, int ithread, int ipart_ref = 0 ) override final; private: - inline void coeffs( double xpn, int* idx_p, int* idx_d, - double *coeffxp, double *coeffxd, double* delta_p ) + + // 2nd order interpolation on 3 nodes + SMILEI_ACCELERATOR_DECLARE_ROUTINE + inline void __attribute__( ( always_inline ) ) + coeffs( double xpn, int* idx_p, int* idx_d, + double *coeffxp, double *coeffxd, double* delta_p ) const { double delta, delta2; - // Primal - idx_p[0] = round( xpn ); // index of the central point - delta_p[0] = xpn -( double )idx_p[0]; // normalized distance to the central node - delta2 = pow( delta_p[0], 2 ); // square of the normalized distance to the central node - - // 2nd order interpolation on 3 nodes - coeffxp[0] = 0.5 * ( delta2-delta_p[0]+0.25 ); - coeffxp[1] = ( 0.75-delta2 ); - coeffxp[2] = 0.5 * ( delta2+delta_p[0]+0.25 ); + // index of the central point + idx_p[0] = std::round( xpn ); + idx_d[0] = std::round( xpn + 0.5 ); + + delta = xpn - static_cast( idx_d[0] ) + 0.5; // normalized distance to the central node + delta2 = delta * delta; // square of the normalized distance to the central node - idx_p[0] -= index_domain_begin; + coeffxd[0] = 0.5 * ( delta2 - delta + 0.25 ); + coeffxd[1] = ( 0.75 - delta2 ); + coeffxd[2] = 0.5 * ( delta2 + delta 
+ 0.25 ); + + delta = xpn - static_cast( idx_p[0] ); + delta2 = delta * delta; // pow( delta_p[0], 2 ); // square of the normalized distance to the central node - if(idx_d){ - // Dual - idx_d[0] = round( xpn+0.5 ); // index of the central point - delta = xpn - ( double )idx_d[0] +0.5; // normalized distance to the central node - delta2 = delta*delta; // square of the normalized distance to the central node - - // 2nd order interpolation on 3 nodes - coeffxd[0] = 0.5 * ( delta2-delta+0.25 ); - coeffxd[1] = ( 0.75-delta2 ); - coeffxd[2] = 0.5 * ( delta2+delta+0.25 ); - - idx_d[0] -= index_domain_begin; - } + delta_p[0] = delta; // normalized distance to the central node + coeffxp[0] = 0.5 * ( delta2 - delta_p[0] + 0.25 ); + coeffxp[1] = ( 0.75 - delta2 ); + coeffxp[2] = 0.5 * ( delta2 + delta_p[0] + 0.25 ); + + idx_p[0] = idx_p[0] - i_domain_begin_; + idx_d[0] = idx_d[0] - i_domain_begin_; } - + SMILEI_ACCELERATOR_DECLARE_ROUTINE_END };//END class #endif diff --git a/src/Interpolator/Interpolator1D2OrderV.cpp b/src/Interpolator/Interpolator1D2OrderV.cpp old mode 100644 new mode 100755 index 31c3b7d4c..2b99cc66b --- a/src/Interpolator/Interpolator1D2OrderV.cpp +++ b/src/Interpolator/Interpolator1D2OrderV.cpp @@ -176,7 +176,7 @@ void Interpolator1D2OrderV::fieldsWrapper( ElectroMagn *EMfields, Particles &par coeffd[1] = ( 0.75-xjmxi2 ); coeffd[2] = 0.5 * ( xjmxi2+xjmxi+0.25 ); - idx -= index_domain_begin; + idx -= i_domain_begin_; // Primal ipx = round( xjn ); // index of the central point @@ -188,7 +188,7 @@ void Interpolator1D2OrderV::fieldsWrapper( ElectroMagn *EMfields, Particles &par coeffp[1] = ( 0.75-xjmxi2 ); coeffp[2] = 0.5 * ( xjmxi2+xjmxi+0.25 ); - ipx -= index_domain_begin; + ipx -= i_domain_begin_; // // Interpolate the fields from the Dual grid : Ex, By, Bz Epart_x[ipart] = coeffd[0] * Ex[idx-1] + coeffd[1] * Ex[idx] + coeffd[2] * Ex[idx+1]; @@ -329,7 +329,7 @@ void Interpolator1D2OrderV::timeCenteredEnvelope( ElectroMagn *EMfields, Particl //!\todo CHECK if this is correct for both primal & dual grids !!! // First index for summation - ip_ = ip_ - index_domain_begin; + ip_ = ip_ - i_domain_begin_; // ------------------------- // Interpolation of Phiold^(p) @@ -388,7 +388,7 @@ void Interpolator1D2OrderV::envelopeAndSusceptibility( ElectroMagn *EMfields, Pa //!\todo CHECK if this is correct for both primal & dual grids !!! // First index for summation - ip_ = ip_ - index_domain_begin; + ip_ = ip_ - i_domain_begin_; // ------------------------- // Interpolation of Env_A_abs_^(p) @@ -441,7 +441,7 @@ void Interpolator1D2OrderV::envelopeFieldForIonization( ElectroMagn *EMfields, P //!\todo CHECK if this is correct for both primal & dual grids !!! 
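Editor's note: the refactored coeffs() computes the dual-grid weights first (the dual grid is staggered by half a cell, so its central node is round(xpn + 0.5) and the distance is xpn - idx_d + 0.5) and then the primal weights. A quick stand-alone check of that construction is sketched below; the function and variable names are illustrative, not Smilei API.

#include <cassert>
#include <cmath>

// Mirrors the primal/dual weight construction from coeffs() above, outside the class.
static void coeffs1d_order2( double xpn, int &ip, int &id, double cp[3], double cd[3] )
{
    ip = static_cast<int>( std::round( xpn ) );         // primal central node
    id = static_cast<int>( std::round( xpn + 0.5 ) );   // dual grid shifted by half a cell

    double d  = xpn - static_cast<double>( id ) + 0.5;  // distance to the dual node
    double d2 = d * d;
    cd[0] = 0.5 * ( d2 - d + 0.25 );
    cd[1] = 0.75 - d2;
    cd[2] = 0.5 * ( d2 + d + 0.25 );

    d  = xpn - static_cast<double>( ip );                // distance to the primal node
    d2 = d * d;
    cp[0] = 0.5 * ( d2 - d + 0.25 );
    cp[1] = 0.75 - d2;
    cp[2] = 0.5 * ( d2 + d + 0.25 );
}

int main()
{
    for( double xpn = 0.0; xpn < 10.0; xpn += 0.01 ) {
        int ip, id;
        double cp[3], cd[3];
        coeffs1d_order2( xpn, ip, id, cp, cd );
        assert( std::abs( cp[0] + cp[1] + cp[2] - 1.0 ) < 1e-12 );  // partition of unity (primal)
        assert( std::abs( cd[0] + cd[1] + cd[2] - 1.0 ) < 1e-12 );  // partition of unity (dual)
    }
}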
// First index for summation - ip_ = ip_ - index_domain_begin; + ip_ = ip_ - i_domain_begin_; // --------------------------------- // Interpolation of Env_E_abs^(p) diff --git a/src/Interpolator/Interpolator1D2OrderV.h b/src/Interpolator/Interpolator1D2OrderV.h old mode 100644 new mode 100755 index b7dce6588..7c72f9ca2 --- a/src/Interpolator/Interpolator1D2OrderV.h +++ b/src/Interpolator/Interpolator1D2OrderV.h @@ -48,7 +48,7 @@ class Interpolator1D2OrderV final : public Interpolator1D coeffd_[1] = ( 0.75-xjmxi2 ); coeffd_[2] = 0.5 * ( xjmxi2+xjmxi+0.25 ); - id_ -= index_domain_begin; + id_ -= i_domain_begin_; // Primal ip_ = round( xjn ); // index of the central point @@ -60,7 +60,7 @@ class Interpolator1D2OrderV final : public Interpolator1D coeffp_[1] = ( 0.75-xjmxi2 ); coeffp_[2] = 0.5 * ( xjmxi2+xjmxi+0.25 ); - ip_ -= index_domain_begin; + ip_ -= i_domain_begin_; } // Last prim index computed diff --git a/src/Interpolator/Interpolator1D3Order.h b/src/Interpolator/Interpolator1D3Order.h index e9c821925..3228ed39b 100755 --- a/src/Interpolator/Interpolator1D3Order.h +++ b/src/Interpolator/Interpolator1D3Order.h @@ -42,7 +42,7 @@ class Interpolator1D3Order final : public Interpolator1D coeffd_[2] = dble_1ov6 + 0.5*( xi+xi2-xi3 ); coeffd_[3] = xi3*dble_1ov6; - id_ -= index_domain_begin; + id_ -= i_domain_begin_; // Primal ip_ = ( int )xjn; // index of the 2nd node @@ -56,7 +56,7 @@ class Interpolator1D3Order final : public Interpolator1D coeffp_[2] = dble_1ov6 + 0.5*( xi+xi2-xi3 ); coeffp_[3] = xi3*dble_1ov6; - ip_ -= index_domain_begin; + ip_ -= i_domain_begin_; } inline void coeffs( double xpn, int* idx_p, int* idx_d, @@ -77,7 +77,7 @@ class Interpolator1D3Order final : public Interpolator1D coeffxd[2] = dble_1ov6 + 0.5*( xi+xi2-xi3 ); coeffxd[3] = xi3*dble_1ov6; - idx_d[0] -= index_domain_begin; + idx_d[0] -= i_domain_begin_; // Primal idx_p[0] = ( int )xpn; // index of the 2nd node @@ -92,7 +92,7 @@ class Interpolator1D3Order final : public Interpolator1D coeffxp[2] = dble_1ov6 + 0.5*( xi+xi2-xi3 ); coeffxp[3] = xi3*dble_1ov6; - idx_p[0] -= index_domain_begin; + idx_p[0] -= i_domain_begin_; } // Last prim index computed diff --git a/src/Interpolator/Interpolator1D4Order.h b/src/Interpolator/Interpolator1D4Order.h index f8bd48ee4..0e8831091 100755 --- a/src/Interpolator/Interpolator1D4Order.h +++ b/src/Interpolator/Interpolator1D4Order.h @@ -53,7 +53,7 @@ class Interpolator1D4Order final : public Interpolator1D coeffxp[3] = dble_19_ov_96 + dble_11_ov_24 * delta_p[0] + dble_1_ov_4 * delta2 - dble_1_ov_6 * delta3 - dble_1_ov_6 * delta4; coeffxp[4] = dble_1_ov_384 + dble_1_ov_48 * delta_p[0] + dble_1_ov_16 * delta2 + dble_1_ov_12 * delta3 + dble_1_ov_24 * delta4; - idx_p[0] -= index_domain_begin; + idx_p[0] -= i_domain_begin_; if(idx_d){ // Dual @@ -70,7 +70,7 @@ class Interpolator1D4Order final : public Interpolator1D coeffxd[3] = dble_19_ov_96 + dble_11_ov_24 * delta + dble_1_ov_4 * delta2 - dble_1_ov_6 * delta3 - dble_1_ov_6 * delta4; coeffxd[4] = dble_1_ov_384 + dble_1_ov_48 * delta + dble_1_ov_16 * delta2 + dble_1_ov_12 * delta3 + dble_1_ov_24 * delta4; - idx_d[0] -= index_domain_begin; + idx_d[0] -= i_domain_begin_; } } diff --git a/src/Interpolator/Interpolator1DWT2Order.cpp b/src/Interpolator/Interpolator1DWT2Order.cpp index 2ba3881b5..4bc058096 100755 --- a/src/Interpolator/Interpolator1DWT2Order.cpp +++ b/src/Interpolator/Interpolator1DWT2Order.cpp @@ -239,7 +239,7 @@ void Interpolator1DWT2Order::timeCenteredEnvelope( ElectroMagn *EMfields, Partic //!\todo CHECK if this is 
correct for both primal & dual grids !!! // First index for summation - ip_ = ip_ - index_domain_begin; + ip_ = ip_ - i_domain_begin_; // ------------------------- // Interpolation of Phiold^(p) @@ -298,7 +298,7 @@ void Interpolator1DWT2Order::envelopeAndSusceptibility( ElectroMagn *EMfields, P //!\todo CHECK if this is correct for both primal & dual grids !!! // First index for summation - ip_ = ip_ - index_domain_begin; + ip_ = ip_ - i_domain_begin_; // ------------------------- // Interpolation of Env_A_abs_^(p) @@ -351,7 +351,7 @@ void Interpolator1DWT2Order::envelopeFieldForIonization( ElectroMagn *EMfields, //!\todo CHECK if this is correct for both primal & dual grids !!! // First index for summation - ip_ = ip_ - index_domain_begin; + ip_ = ip_ - i_domain_begin_; // --------------------------------- // Interpolation of Env_E_abs^(p) diff --git a/src/Interpolator/Interpolator1DWT2Order.h b/src/Interpolator/Interpolator1DWT2Order.h index 19ea0ee7d..ff45230cf 100755 --- a/src/Interpolator/Interpolator1DWT2Order.h +++ b/src/Interpolator/Interpolator1DWT2Order.h @@ -47,7 +47,7 @@ class Interpolator1DWT2Order final : public Interpolator1D coeffd_[1] = ( 0.75-var1 ); coeffd_[2] = 0.5 * ( var1+xjmxi+0.25 ); - id_ -= index_domain_begin; + id_ -= i_domain_begin_; // Primal ip_ = round( xjn ); // index of the central point @@ -65,7 +65,7 @@ class Interpolator1DWT2Order final : public Interpolator1D coeffpt_[1] = 1.0 - 2.0 * var1; coeffpt_[2] = var1 + 0.5 * xjmxi; - ip_ -= index_domain_begin; + ip_ -= i_domain_begin_; } // Coefficients for WT diff --git a/src/Interpolator/Interpolator1DWT2OrderV.cpp b/src/Interpolator/Interpolator1DWT2OrderV.cpp index c64433035..40dd63589 100755 --- a/src/Interpolator/Interpolator1DWT2OrderV.cpp +++ b/src/Interpolator/Interpolator1DWT2OrderV.cpp @@ -178,7 +178,7 @@ void Interpolator1DWT2OrderV::fieldsWrapper( ElectroMagn *EMfields, Particles &p coeffd[1] = ( 0.75-var1 ); coeffd[2] = 0.5 * ( var1+xjmxi+0.25 ); - idx -= index_domain_begin; + idx -= i_domain_begin_; // Primal ipx = round( xjn ); // index of the central point @@ -190,7 +190,7 @@ void Interpolator1DWT2OrderV::fieldsWrapper( ElectroMagn *EMfields, Particles &p coeffpt[1] = 1.0 - 2.0 * var1; coeffpt[2] = var1 + 0.5 * xjmxi; - ipx -= index_domain_begin; + ipx -= i_domain_begin_; // // Interpolate the fields from the Dual grid : Ex, By, Bz Epart_x[ipart] = coeffd[0] * Ex[idx-1] + coeffd[1] * Ex[idx] + coeffd[2] * Ex[idx+1]; @@ -331,7 +331,7 @@ void Interpolator1DWT2OrderV::timeCenteredEnvelope( ElectroMagn *EMfields, Parti //!\todo CHECK if this is correct for both primal & dual grids !!! // First index for summation - ip_ = ip_ - index_domain_begin; + ip_ = ip_ - i_domain_begin_; // ------------------------- // Interpolation of Phiold^(p) @@ -390,7 +390,7 @@ void Interpolator1DWT2OrderV::envelopeAndSusceptibility( ElectroMagn *EMfields, //!\todo CHECK if this is correct for both primal & dual grids !!! // First index for summation - ip_ = ip_ - index_domain_begin; + ip_ = ip_ - i_domain_begin_; // ------------------------- // Interpolation of Env_A_abs_^(p) @@ -443,7 +443,7 @@ void Interpolator1DWT2OrderV::envelopeFieldForIonization( ElectroMagn *EMfields, //!\todo CHECK if this is correct for both primal & dual grids !!! 
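Editor's note: the index_domain_begin -> i_domain_begin_ change in all these interpolators is purely a naming-convention rename (trailing underscore for members); the member still holds the global index of the patch's first cell, as set from patch->getCellStartingGlobalIndex(0). The minimal sketch below, with illustrative names only, shows the local-index shift that every interpolator performs with it.

#include <cmath>

// Minimal sketch of the patch-local index shift; 'i_domain_begin_' stands for the value
// returned by patch->getCellStartingGlobalIndex(0), the rest is illustrative.
struct PatchLocalIndex1D
{
    double dx_inv_;          // inverse cell size
    int    i_domain_begin_;  // global index of the patch's first cell

    int primalIndex( double x ) const
    {
        const double xpn = x * dx_inv_;                                   // position in cell units
        return static_cast<int>( std::round( xpn ) ) - i_domain_begin_;  // patch-local node index
    }

    int dualIndex( double x ) const
    {
        const double xpn = x * dx_inv_;
        return static_cast<int>( std::round( xpn + 0.5 ) ) - i_domain_begin_;
    }
};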
// First index for summation - ip_ = ip_ - index_domain_begin; + ip_ = ip_ - i_domain_begin_; // --------------------------------- // Interpolation of Env_E_abs^(p) diff --git a/src/Interpolator/Interpolator1DWT2OrderV.h b/src/Interpolator/Interpolator1DWT2OrderV.h index 87a083fa5..4f20849c1 100755 --- a/src/Interpolator/Interpolator1DWT2OrderV.h +++ b/src/Interpolator/Interpolator1DWT2OrderV.h @@ -48,7 +48,7 @@ class Interpolator1DWT2OrderV final : public Interpolator1D coeffd_[1] = ( 0.75-var1 ); coeffd_[2] = 0.5 * ( var1+xjmxi+0.25 ); - id_ -= index_domain_begin; + id_ -= i_domain_begin_; // Primal ip_ = round( xjn ); // index of the central point @@ -66,7 +66,7 @@ class Interpolator1DWT2OrderV final : public Interpolator1D coeffpt_[1] = 1.0 - 2.0 * var1; coeffpt_[2] = var1 + 0.5 * xjmxi; - ip_ -= index_domain_begin; + ip_ -= i_domain_begin_; } // Coefficients for WT diff --git a/src/Interpolator/Interpolator1DWT4Order.h b/src/Interpolator/Interpolator1DWT4Order.h index dd5e78b13..6bc889885 100755 --- a/src/Interpolator/Interpolator1DWT4Order.h +++ b/src/Interpolator/Interpolator1DWT4Order.h @@ -55,7 +55,7 @@ class Interpolator1DWT4Order final : public Interpolator1D coeffd_[3] = dble_19_ov_96 + var1 + var3 * ( 1.5-xjmxi -var2 ); coeffd_[4] = dble_1_ov_24 * var5 * var5; - id_ -= index_domain_begin; + id_ -= i_domain_begin_; // Primal ip_ = round( xjn ); // index of the central point @@ -94,7 +94,7 @@ class Interpolator1DWT4Order final : public Interpolator1D coeffpt_[4] = var3 + var2 - var1; - ip_ -= index_domain_begin; + ip_ -= i_domain_begin_; } double dble_1_ov_6 ; diff --git a/src/Interpolator/Interpolator2D2Order.cpp b/src/Interpolator/Interpolator2D2Order.cpp index 0254294f5..795ab996d 100755 --- a/src/Interpolator/Interpolator2D2Order.cpp +++ b/src/Interpolator/Interpolator2D2Order.cpp @@ -180,7 +180,7 @@ void Interpolator2D2Order::fieldsWrapper( ElectroMagn *EMfields, const double *const __restrict__ By2D = static_cast( EMfields->By_m )->data(); const double *const __restrict__ Bz2D = static_cast( EMfields->Bz_m )->data(); -#if defined(SMILEI_OPENACC_MODE) +#if defined(SMILEI_ACCELERATOR_GPU_OACC) const int sizeofEx = EMfields->Ex_->size(); const int sizeofEy = EMfields->Ey_->size(); const int sizeofEz = EMfields->Ez_->size(); @@ -207,7 +207,7 @@ void Interpolator2D2Order::fieldsWrapper( ElectroMagn *EMfields, position_x /* [first_index:npart_range_size] */, \ position_y /* [first_index:npart_range_size] */ ) #pragma omp teams distribute parallel for -#elif defined(SMILEI_OPENACC_MODE) +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc enter data create(this) #pragma acc update device(this) size_t interpolation_range_size = ( last_index + 1 * nparts ) - first_index; @@ -260,7 +260,7 @@ void Interpolator2D2Order::fieldsWrapper( ElectroMagn *EMfields, delta[1*nparts+ipart] = delta_p[1]; } - #if defined(SMILEI_OPENACC_MODE) + #if defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc exit data delete(this) #endif } else{ // with B-TIS3 interpolation @@ -276,7 +276,7 @@ void Interpolator2D2Order::fieldsWrapper( ElectroMagn *EMfields, position_x /* [first_index:npart_range_size] */, \ position_y /* [first_index:npart_range_size] */ ) #pragma omp teams distribute parallel for -#elif defined(SMILEI_OPENACC_MODE) +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc enter data create(this) #pragma acc update device(this) size_t interpolation_range_size = ( last_index + 1 * nparts ) - first_index; @@ -337,7 +337,7 @@ void Interpolator2D2Order::fieldsWrapper( ElectroMagn *EMfields, 
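Editor's note: the SMILEI_OPENACC_MODE -> SMILEI_ACCELERATOR_GPU_OACC (and SMILEI_ACCELERATOR_MODE -> SMILEI_ACCELERATOR_GPU) renames do not change the offload logic: the same particle loop is compiled three ways, with OpenMP target offload, with OpenACC, or as a plain host loop, depending on which guard is defined. The skeleton below uses only the macro names introduced by this patch; the function, its arguments and the loop body are placeholders, not Smilei code.

void gather_fields( double *__restrict__ out,
                    const double *__restrict__ field,
                    const double *__restrict__ position_x,
                    double dx_inv, int i_domain_begin,
                    int first, int last, int field_size )
{
#if defined( SMILEI_ACCELERATOR_GPU_OMP )
    #pragma omp target teams distribute parallel for \
        map( to: field[0:field_size] ) \
        map( from: out[first:last-first] ) \
        is_device_ptr( position_x )
#elif defined( SMILEI_ACCELERATOR_GPU_OACC )
    #pragma acc parallel loop gang worker vector \
        present( field[0:field_size], out[first:last-first] ) \
        deviceptr( position_x )
#endif
    for( int ipart = first; ipart < last; ipart++ ) {
        const double xpn = position_x[ipart] * dx_inv;
        const int    ip  = static_cast<int>( xpn + 0.5 ) - i_domain_begin; // nearest node (x >= 0 assumed)
        out[ipart] = field[ip];                                            // placeholder gather
    }
}

When neither macro is defined the pragmas vanish and the function is an ordinary sequential loop, which is exactly how the CPU build of the wrappers behaves.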
delta[1*nparts+ipart] = delta_p[1]; } // end ipart loop - #if defined(SMILEI_OPENACC_MODE) + #if defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc exit data delete(this) #endif } // end with B-TIS interpolation diff --git a/src/Interpolator/Interpolator3D2Order.cpp b/src/Interpolator/Interpolator3D2Order.cpp index 9e594f20b..f40239836 100755 --- a/src/Interpolator/Interpolator3D2Order.cpp +++ b/src/Interpolator/Interpolator3D2Order.cpp @@ -185,8 +185,6 @@ void Interpolator3D2Order::fieldsWrapper( ElectroMagn *EMfields, Particles &part int *const __restrict__ iold = smpi->dynamics_iold[ithread].data(); double *const __restrict__ delta = smpi->dynamics_deltaold[ithread].data(); - unsigned int buffer_size = smpi->dynamics_Epart[ithread].size(); - const double *const __restrict__ position_x = particles.getPtrPosition( 0 ); const double *const __restrict__ position_y = particles.getPtrPosition( 1 ); const double *const __restrict__ position_z = particles.getPtrPosition( 2 ); @@ -198,7 +196,7 @@ void Interpolator3D2Order::fieldsWrapper( ElectroMagn *EMfields, Particles &part const double *const __restrict__ By3D = EMfields->By_m->data_; const double *const __restrict__ Bz3D = EMfields->Bz_m->data_; -#if defined(SMILEI_OPENACC_MODE) +#if defined(SMILEI_ACCELERATOR_GPU_OACC) const int sizeofEx = EMfields->Ex_->size(); const int sizeofEy = EMfields->Ey_->size(); const int sizeofEz = EMfields->Ez_->size(); @@ -224,7 +222,7 @@ void Interpolator3D2Order::fieldsWrapper( ElectroMagn *EMfields, Particles &part position_y /* [first_index:npart_range_size] */, \ position_z /* [first_index:npart_range_size] */ ) #pragma omp teams distribute parallel for -#elif defined(SMILEI_OPENACC_MODE) +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc enter data create(this) #pragma acc update device(this) size_t interpolation_range_size = ( last_index + 2 * nparts ) - first_index; @@ -282,7 +280,7 @@ void Interpolator3D2Order::fieldsWrapper( ElectroMagn *EMfields, Particles &part delta[1*nparts+ipart] = delta_p[1]; delta[2*nparts+ipart] = delta_p[2]; } - #if defined(SMILEI_OPENACC_MODE) + #if defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc exit data delete(this) #endif } else { // with B-TIS3 interpolation @@ -302,7 +300,7 @@ void Interpolator3D2Order::fieldsWrapper( ElectroMagn *EMfields, Particles &part position_y /* [first_index:npart_range_size] */, \ position_z /* [first_index:npart_range_size] */ ) #pragma omp teams distribute parallel for -#elif defined(SMILEI_OPENACC_MODE) +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc enter data create(this) #pragma acc update device(this) size_t interpolation_range_size = ( last_index + 2 * nparts ) - first_index; @@ -368,7 +366,7 @@ void Interpolator3D2Order::fieldsWrapper( ElectroMagn *EMfields, Particles &part delta[ipart+0*nparts] = delta_p[0]; delta[ipart+1*nparts] = delta_p[1]; delta[ipart+2*nparts] = delta_p[2]; - #if defined(SMILEI_OPENACC_MODE) + #if defined(SMILEI_ACCELERATOR_GPU_OACC) #pragma acc exit data delete(this) #endif } // end ipart loop diff --git a/src/Interpolator/Interpolator3D2Order.h b/src/Interpolator/Interpolator3D2Order.h index 52f0335a0..1fa07438d 100755 --- a/src/Interpolator/Interpolator3D2Order.h +++ b/src/Interpolator/Interpolator3D2Order.h @@ -59,7 +59,7 @@ class Interpolator3D2Order : public Interpolator3D int idx, int idy, int idz, - int nx, + int /*nx*/, int ny, int nz ) { diff --git a/src/Interpolator/InterpolatorFactory.h b/src/Interpolator/InterpolatorFactory.h index f2cbd7c19..37e1042fb 100755 --- 
a/src/Interpolator/InterpolatorFactory.h +++ b/src/Interpolator/InterpolatorFactory.h @@ -48,12 +48,22 @@ class InterpolatorFactory // 1Dcartesian simulation // --------------- if( ( params.geometry == "1Dcartesian" ) && ( params.interpolation_order == 2 ) ) { + if( !vectorization ) { + if ( params.interpolator_ == "momentum-conserving" ) { + Interp = new Interpolator1D2Order( params, patch ); + } + else if ( params.interpolator_ == "wt" ) { + Interp = new Interpolator1DWT2Order( params, patch ); + } + } + else { if ( params.interpolator_ == "momentum-conserving" ) { Interp = new Interpolator1D2OrderV( params, patch ); } else if ( params.interpolator_ == "wt" ) { Interp = new Interpolator1DWT2OrderV( params, patch ); } + } } else if( ( params.geometry == "1Dcartesian" ) && ( params.interpolation_order == 4 ) ) { if( params.interpolator_ == "momentum-conserving" ) { Interp = new Interpolator1D4Order( params, patch ); diff --git a/src/MovWindow/SimWindow.cpp b/src/MovWindow/SimWindow.cpp index 08ffada69..4ee9781c7 100755 --- a/src/MovWindow/SimWindow.cpp +++ b/src/MovWindow/SimWindow.cpp @@ -383,15 +383,10 @@ void SimWindow::shift( VectorPatch &vecPatches, SmileiMPI *smpi, Params ¶ms, } // end loop nSpecies -#if defined ( SMILEI_ACCELERATOR_MODE ) - if ( params.gpu_computing ) { - // ADD NEW PARTS ON GPU - for( unsigned int ispec=0 ; ispecvecSpecies[ispec]->particles_to_move->clear(); - // mypatch->vecSpecies[ispec]->particles->copyParticles( 0, mypatch->vecSpecies[ispec]->getNbrOfParticles(), - // *mypatch->vecSpecies[ispec]->particles_to_move, 0 ); - mypatch->vecSpecies[ispec]->particles->initializeDataOnDevice(); - mypatch->vecSpecies[ispec]->particles_to_move->initializeDataOnDevice(); +#if defined ( SMILEI_ACCELERATOR_GPU ) + if( params.gpu_computing ) { + for( auto spec: mypatch->vecSpecies ) { + spec->allocateParticlesOnDevice(); } } #endif @@ -403,7 +398,7 @@ void SimWindow::shift( VectorPatch &vecPatches, SmileiMPI *smpi, Params ¶ms, } // end test patch_particle_created[ithread][j] -#if defined ( SMILEI_ACCELERATOR_MODE ) +#if defined ( SMILEI_ACCELERATOR_GPU ) // if ( params.gpu_computing ) { // Initializes only field data structures, particle data structure are initialized separately mypatch->allocateAndCopyFieldsOnDevice(); diff --git a/src/MultiphotonBreitWheeler/MultiphotonBreitWheeler.cpp b/src/MultiphotonBreitWheeler/MultiphotonBreitWheeler.cpp index 6f7b9e0df..8136f36ff 100755 --- a/src/MultiphotonBreitWheeler/MultiphotonBreitWheeler.cpp +++ b/src/MultiphotonBreitWheeler/MultiphotonBreitWheeler.cpp @@ -10,7 +10,7 @@ #include "MultiphotonBreitWheeler.h" #include "Species.h" -#if defined(SMILEI_OPENACC_MODE) +#if defined(SMILEI_ACCELERATOR_GPU_OACC) #define __HIP_PLATFORM_NVCC__ #define __HIP_PLATFORM_NVIDIA__ #include "gpuRandom.h" @@ -248,7 +248,7 @@ void MultiphotonBreitWheeler::operator()( Particles &particles, double *const __restrict__ pair1_chi = new_pair[1]->has_quantum_parameter ? new_pair[1]->getPtrChi() : nullptr; double *const __restrict__ pair1_tau = new_pair[1]->has_Monte_Carlo_process ? 
new_pair[1]->getPtrTau() : nullptr; -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC // Parameters for random generator unsigned long long seed; unsigned long long seq; @@ -325,7 +325,7 @@ void MultiphotonBreitWheeler::operator()( Particles &particles, Ex[ipart-ipart_ref], Ey[ipart-ipart_ref], Ez[ipart-ipart_ref], Bx[ipart-ipart_ref], By[ipart-ipart_ref], Bz[ipart-ipart_ref] ); -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC } @@ -349,7 +349,7 @@ void MultiphotonBreitWheeler::operator()( Particles &particles, while( tau[ipart] <= epsilon_tau_ ) { //tau[ipart] = -log( 1.-Rand::uniform() ); -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC tau[ipart] = -std::log( 1.-rand_->uniform() ); #else @@ -406,7 +406,7 @@ void MultiphotonBreitWheeler::operator()( Particles &particles, double pair_chi[2]; // Draw random number in [0,1[ -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC const double random_number = rand_->uniform(); #else seed_curand_2 = (int) (ipart + 1)*(initial_seed_2 + 1); //Seed for linear generator @@ -431,7 +431,7 @@ void MultiphotonBreitWheeler::operator()( Particles &particles, SMILEI_UNUSED( ibin ); // Creation of new electrons in the temporary array new_pair[0] new_pair[0]->createParticles( mBW_pair_creation_sampling_[0] ); -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC // Final size int nparticles = new_pair[0]->size(); @@ -442,7 +442,7 @@ void MultiphotonBreitWheeler::operator()( Particles &particles, #endif // For all new paticles -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC #pragma omp simd #endif for( int ipair=i_pair_start; ipair < i_pair_start+mBW_pair_creation_sampling_[0]; ipair++ ) { @@ -466,7 +466,7 @@ void MultiphotonBreitWheeler::operator()( Particles &particles, } // + new_pair[k].momentum(i,ipair)*remaining_dt*inv_gamma; -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC // Old positions if( particles.keepOldPositions() ) { pair0_position_old_x[ipair]=position_x[ipart] ; @@ -494,7 +494,7 @@ void MultiphotonBreitWheeler::operator()( Particles &particles, // Create particle for the second pair species new_pair[1]->createParticles( mBW_pair_creation_sampling_[1] ); -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC // Final size nparticles = new_pair[1]->size(); @@ -505,7 +505,7 @@ void MultiphotonBreitWheeler::operator()( Particles &particles, #endif // For all new paticles -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC #pragma omp simd #endif for( auto ipair=i_pair_start; ipair < i_pair_start + mBW_pair_creation_sampling_[1]; ipair++ ) { @@ -530,7 +530,7 @@ void MultiphotonBreitWheeler::operator()( Particles &particles, } // + new_pair[k].momentum(i,ipair)*remaining_dt*inv_gamma; -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC // Old positions if( particles.keepOldPositions() ) { pair1_position_old_x[ipair]=position_x[ipart] ; @@ -629,7 +629,7 @@ void MultiphotonBreitWheeler::operator()( Particles &particles, } } // end ipart loop -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC } #endif } @@ -795,7 +795,7 @@ void MultiphotonBreitWheeler::removeDecayedPhotonsWithoutBinCompression( if( ipart < last_photon_index ) { // The last existing photon comes to the position of // the deleted photon -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC particles.overwriteParticle( last_photon_index, ipart ); #else #endif diff --git 
a/src/MultiphotonBreitWheeler/MultiphotonBreitWheeler.h b/src/MultiphotonBreitWheeler/MultiphotonBreitWheeler.h index 6e14a37f3..71315d79a 100755 --- a/src/MultiphotonBreitWheeler/MultiphotonBreitWheeler.h +++ b/src/MultiphotonBreitWheeler/MultiphotonBreitWheeler.h @@ -115,7 +115,7 @@ class MultiphotonBreitWheeler //! \param bmin Pointer toward the first particle index of the bin in the Particles object //! \param bmax Pointer toward the last particle index of the bin in the Particles object //! \param ithread Thread index -//#ifdef SMILEI_OPENACC_MODE +//#ifdef SMILEI_ACCELERATOR_GPU_OACC // #pragma acc routine seq //#endif void removeDecayedPhotonsWithoutBinCompression( diff --git a/src/MultiphotonBreitWheeler/MultiphotonBreitWheelerTables.h b/src/MultiphotonBreitWheeler/MultiphotonBreitWheelerTables.h index 4f7f1ce72..9bef108b6 100755 --- a/src/MultiphotonBreitWheeler/MultiphotonBreitWheelerTables.h +++ b/src/MultiphotonBreitWheeler/MultiphotonBreitWheelerTables.h @@ -54,7 +54,7 @@ class MultiphotonBreitWheelerTables //! the multiphoton Breit-Wheeler pair creation //! \param photon_chi photon quantum parameter //! \param[out] pair_chi quantum parameters of the pair -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif void computePairQuantumParameter( const double photon_chi, @@ -71,7 +71,7 @@ class MultiphotonBreitWheelerTables //! \param photon_chi photon quantum parameter //! \param gamma photon normalized energy // ----------------------------------------------------------------------------- -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif double computeBreitWheelerPairProductionRate( diff --git a/src/Params/Params.cpp b/src/Params/Params.cpp index bc9fb8ed4..69973d104 100755 --- a/src/Params/Params.cpp +++ b/src/Params/Params.cpp @@ -129,16 +129,20 @@ Params::Params( SmileiMPI *smpi, std::vector namelistsFiles ) : PyObject_SetAttrString( Py_main, "_test_mode", Py_False ); PyTools::checkPyError(); - // here we add the rank, in case some script need it + // we add the rank, in case some script needs it PyModule_AddIntConstant( Py_main, "smilei_mpi_rank", smpi->getRank() ); - // here we add the MPI size, in case some script need it + // we add the MPI size, in case some script needs it PyModule_AddIntConstant( Py_main, "smilei_mpi_size", smpi->getSize() ); namelist += string( "smilei_mpi_size = " ) + to_string( smpi->getSize() ) + "\n"; - // here we add the larget int, important to get a valid seed for randomization - PyModule_AddIntConstant( Py_main, "smilei_rand_max", RAND_MAX ); - namelist += string( "smilei_rand_max = " ) + to_string( RAND_MAX ) + "\n\n"; + // we add the openMP size, in case some script needs it + PyModule_AddIntConstant( Py_main, "smilei_omp_threads", smpi->getOMPMaxThreads() ); + namelist += string( "smilei_omp_threads = " ) + to_string( smpi->getOMPMaxThreads() ) + "\n"; + + // we add the total number of cores, in case some script needs it + PyModule_AddIntConstant( Py_main, "smilei_total_cores", smpi->getGlobalNumCores() ); + namelist += string( "smilei_total_cores = " ) + to_string( smpi->getGlobalNumCores() ) + "\n"; // Running pyprofiles.py runScript( string( reinterpret_cast( pyprofiles_py ), pyprofiles_py_len ), "pyprofiles.py", globals ); @@ -833,7 +837,7 @@ Params::Params( SmileiMPI *smpi, std::vector namelistsFiles ) : PyTools::extract( "gpu_computing", gpu_computing, "Main" ); if( gpu_computing ) { -#if( defined( SMILEI_OPENACC_MODE ) && defined( _OPENACC ) ) || 
defined( SMILEI_ACCELERATOR_GPU_OMP ) +#if( defined( SMILEI_ACCELERATOR_GPU_OACC ) && defined( _OPENACC ) ) || defined( SMILEI_ACCELERATOR_GPU_OMP ) // If compiled for GPU and asking for GPU MESSAGE( 1, "Smilei will run on GPU devices" ); #else @@ -1051,27 +1055,26 @@ Params::Params( SmileiMPI *smpi, std::vector namelistsFiles ) : // Extract the list of profiles and verify their content PyObject *p = PyTools::extract_py( "_profiles", "Laser", i_laser ); vector profiles; - vector profiles_n = {1, 2}; if( ! PyTools::py2pyvector( p, profiles ) ) { ERROR_NAMELIST( "For LaserOffset #" << n_laser_offset << ": space_time_profile must be a list of 2 profiles", LINK_NAMELIST + std::string("#lasers") ); } Py_DECREF( p ); - if( profiles.size()!=2 ) { + if( profiles.size() != 2 ) { ERROR_NAMELIST( "For LaserOffset #" << n_laser_offset << ": space_time_profile needs 2 profiles.", LINK_NAMELIST + std::string("#lasers") ); } - if( profiles[1] == Py_None ) { - profiles .pop_back(); - profiles_n.pop_back(); - } - if( profiles[0] == Py_None ) { - profiles .erase( profiles .begin() ); - profiles_n.erase( profiles_n.begin() ); + vector profiles_n; + vector profiles_kept; + for( unsigned int i = 0; i < 2; i++ ) { + if( profiles[i] != Py_None ) { + profiles_kept.push_back( profiles[i] ); + profiles_n.push_back( i + 1 ); + } } - if( profiles.size() == 0 ) { + if( profiles_kept.size() == 0 ) { ERROR_NAMELIST( "For LaserOffset #" << n_laser_offset << ": space_time_profile cannot be [None, None]", LINK_NAMELIST + std::string("#lasers") ); } - for( unsigned int i=0; i namelistsFiles ) : // Make the propagation happen and write out the file if( ! smpi->test_mode ) { - propagateX( profiles, profiles_n, offset, file, keep_n_strongest_modes, angle_z ); + propagateX( profiles_kept, profiles_n, offset, file, keep_n_strongest_modes, angle_z ); } } - + + for( auto p: profiles ) { + Py_DECREF( p ); + } + n_laser_offset ++; } } @@ -1223,7 +1230,7 @@ void Params::compute() // Set cluster_width_ if not set by the user if( cluster_width_ == -1 ) { -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) cluster_width_ = patch_size_[0]; // On GPU, dont do the CPU automatic cluster_width computation, only one // bin is expected. @@ -1272,7 +1279,7 @@ void Params::compute() // Verify that cluster_width_ divides patch_size_[0] or patch_size_[n] in GPU mode -#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC ) const int kClusterWidth = getGPUClusterWidth(); if( kClusterWidth < 0 ) { @@ -1882,7 +1889,7 @@ string Params::speciesField( string field_name ) return ""; } -#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC ) bool Params::isGPUParticleBinningAvailable() const { @@ -1899,7 +1906,7 @@ bool Params::isGPUParticleBinningAvailable() const #endif -#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC ) int Params::getGPUClusterWidth() const { diff --git a/src/Params/Params.h b/src/Params/Params.h index e2b0603e6..f22dec0cb 100755 --- a/src/Params/Params.h +++ b/src/Params/Params.h @@ -386,7 +386,7 @@ class Params //! 
bool isGPUParticleBinningAvailable() const; -#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC ) //! Given dimension_id in [0, 3), return for dimension_id == : //! 1: the 1D value (not implemented) @@ -407,7 +407,7 @@ class Params //#if defined( SMILEI_ACCELERATOR_GPU_OMP ) switch( dimension_id ) { case 1: - return -1; + return 4; // check for optimal value case 2: return 4; case 3: diff --git a/src/ParticleBC/BoundaryConditionType.cpp b/src/ParticleBC/BoundaryConditionType.cpp index 318b6b289..304656eca 100755 --- a/src/ParticleBC/BoundaryConditionType.cpp +++ b/src/ParticleBC/BoundaryConditionType.cpp @@ -18,7 +18,7 @@ void internal_inf( Species *species, int imin, int imax, int direction, double l energy_change = 0.; // no energy loss during exchange const double* const position = species->particles->getPtrPosition( direction ); int* const cell_keys = species->particles->getPtrCellKeys(); -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel deviceptr(position,cell_keys) #pragma acc loop gang worker vector #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) @@ -28,9 +28,9 @@ void internal_inf( Species *species, int imin, int imax, int direction, double l cell_keys /* [imin:imax - imin] */ ) #pragma omp teams distribute parallel for #endif - for (int ipart=imin ; ipart= 0 && position[ ipart ] < limit_inf ) { + cell_keys[ ipart ] = -2 - 2 * direction; } } } @@ -40,7 +40,7 @@ void internal_sup( Species *species, int imin, int imax, int direction, double l energy_change = 0.; // no energy loss during exchange const double* const position = species->particles->getPtrPosition( direction ); int* const cell_keys = species->particles->getPtrCellKeys(); -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel deviceptr(position,cell_keys) #pragma acc loop gang worker vector #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) @@ -50,9 +50,9 @@ void internal_sup( Species *species, int imin, int imax, int direction, double l cell_keys /* [imin:imax - imin] */ ) #pragma omp teams distribute parallel for #endif - for (int ipart=imin ; ipart= limit_sup) { - cell_keys[ ipart ] = -1; + for( int ipart=imin ; ipart= 0 && position[ ipart ] >= limit_sup ) { + cell_keys[ ipart ] = -3 - 2 * direction; } } } @@ -63,10 +63,11 @@ void internal_inf_AM( Species *species, int imin, int imax, int /*direction*/, d double* position_y = species->particles->getPtrPosition(1); double* position_z = species->particles->getPtrPosition(2); int* cell_keys = species->particles->getPtrCellKeys(); - for (int ipart=imin ; ipart= 0 && distance2ToAxis < limit_inf2 ) { + cell_keys[ ipart ] = -4; } } } @@ -77,10 +78,11 @@ void internal_sup_AM( Species *species, int imin, int imax, int /*direction*/, d double* position_y = species->particles->getPtrPosition(1); double* position_z = species->particles->getPtrPosition(2); int* cell_keys = species->particles->getPtrCellKeys(); - for (int ipart=imin ; ipart= limit_sup*limit_sup ) { - cell_keys[ ipart ] = -1; + if( cell_keys[ ipart ] >= 0 && distance2ToAxis >= limit_sup2 ) { + cell_keys[ ipart ] = -5; } } } @@ -90,15 +92,15 @@ void reflect_particle_inf( Species *species, int imin, int imax, int direction, energy_change = 0.; // no energy loss during reflection double* position = species->particles->getPtrPosition(direction); double* momentum = species->particles->getPtrMomentum(direction); -#ifdef 
SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc parallel deviceptr(position,momentum) #pragma acc loop gang worker vector #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target is_device_ptr( position, momentum ) #pragma omp teams distribute parallel for #endif - for (int ipart=imin ; ipartparticles->getPtrPosition(direction); double* momentum = species->particles->getPtrMomentum(direction); -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc parallel deviceptr(position,momentum) #pragma acc loop gang worker vector #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) @@ -187,9 +189,9 @@ void remove_particle_inf( Species* species, int imin, int imax, int direction, double limit_inf, - double dt, - std::vector& invgf, - Random* rand, + double /*dt*/, + std::vector& /*invgf*/, + Random* /*rand*/, double& energy_change ) { @@ -208,7 +210,7 @@ void remove_particle_inf( Species* species, : change_in_energy ) #pragma omp teams distribute parallel for reduction( + \ : change_in_energy ) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel deviceptr(position,momentum_x,momentum_y,momentum_z,weight,charge,cell_keys) #pragma acc loop gang worker vector reduction(+ : change_in_energy) #else @@ -233,9 +235,9 @@ void remove_particle_sup( Species* species, int imin, int imax, int direction, double limit_sup, - double dt, - std::vector& invgf, - Random* rand, + double /*dt*/, + std::vector& /*invgf*/, + Random* /*rand*/, double& energy_change ) { @@ -254,7 +256,7 @@ void remove_particle_sup( Species* species, : change_in_energy ) #pragma omp teams distribute parallel for reduction( + \ : change_in_energy ) -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel deviceptr(position,momentum_x,momentum_y,momentum_z,weight,charge,cell_keys) #pragma acc loop gang worker vector reduction(+ : change_in_energy) #else diff --git a/src/ParticleBC/PartBoundCond.h b/src/ParticleBC/PartBoundCond.h index 47ab7e235..7afd6ca9c 100755 --- a/src/ParticleBC/PartBoundCond.h +++ b/src/ParticleBC/PartBoundCond.h @@ -44,7 +44,7 @@ class PartBoundCond } else { int *const cell_keys = species->particles->getPtrCellKeys(); -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel deviceptr( cell_keys ) #pragma acc loop gang worker vector #elif defined( SMILEI_ACCELERATOR_GPU_OMP ) diff --git a/src/Particles/Particles.cpp b/src/Particles/Particles.cpp index 308ee4319..dae056745 100755 --- a/src/Particles/Particles.cpp +++ b/src/Particles/Particles.cpp @@ -413,6 +413,51 @@ void Particles::copyParticles( unsigned int iPart, unsigned int nPart, Particles } } +// --------------------------------------------------------------------------------------------------------------------- +//! Copy particles indexed by array 'indices' to dest_id in dest_parts +//! The array 'indices' must be sorted in increasing order +//! 
cell keys not affected +// --------------------------------------------------------------------------------------------------------------------- +void Particles::copyParticles( vector indices, Particles &dest_parts, int dest_id ) +{ + const size_t transfer_size = indices.size(); + const size_t dest_new_size = dest_parts.size() + transfer_size; + const size_t displaced_size = dest_parts.size() - dest_id; + + for( unsigned int iprop=0 ; ipropresize( dest_new_size ); + auto loc = dest_parts.double_prop_[iprop]->begin() + dest_id; + move_backward( loc, loc + displaced_size, dest_parts.double_prop_[iprop]->end() ); + // Copy data + for( size_t i = 0; i < transfer_size; i++ ) { + ( *dest_parts.double_prop_[iprop] )[dest_id+i] = ( *double_prop_[iprop] )[indices[i]]; + } + } + + for( unsigned int iprop=0 ; ipropresize( dest_new_size ); + auto loc = dest_parts.short_prop_[iprop]->begin() + dest_id; + move_backward( loc, loc + displaced_size, dest_parts.short_prop_[iprop]->end() ); + // Copy data + for( size_t i = 0; i < transfer_size; i++ ) { + ( *dest_parts.short_prop_[iprop] )[dest_id+i] = ( *short_prop_[iprop] )[indices[i]]; + } + } + + for( unsigned int iprop=0 ; ipropresize( dest_new_size ); + auto loc = dest_parts.uint64_prop_[iprop]->begin() + dest_id; + move_backward( loc, loc + displaced_size, dest_parts.uint64_prop_[iprop]->end() ); + // Copy data + for( size_t i = 0; i < transfer_size; i++ ) { + ( *dest_parts.uint64_prop_[iprop] )[dest_id+i] = ( *uint64_prop_[iprop] )[indices[i]]; + } + } +} + // --------------------------------------------------------------------------------------------------------------------- //! Make a new particle at the position of another //! cell keys not affected @@ -529,6 +574,70 @@ void Particles::eraseParticle( unsigned int ipart, unsigned int npart, bool comp } + +// --------------------------------------------------------------------------------------------------------------------- +//! Erase particles indexed by array 'indices' to dest_id in dest_parts +//! The array 'indices' must be sorted in increasing order +//! cell keys not affected +// --------------------------------------------------------------------------------------------------------------------- +void Particles::eraseParticles( vector indices ) +{ + const size_t indices_size = indices.size(); + const size_t initial_size = size(); + + if( indices_size > 0 ) { + + for( auto prop : double_prop_ ) { + // Relocate data to fill erased space + size_t j = 1, stop = ( 1 == indices_size ) ? initial_size : indices[1], to = indices[0]; + for( size_t from = indices[0]+1; from < initial_size; from++ ) { + if( from < stop ) { + ( *prop )[to] = ( *prop )[from]; + to++; + } else { + j++; + stop = ( j == indices_size ) ? initial_size : indices[j]; + } + } + // Resize + prop->resize( initial_size - indices_size ); + } + + for( auto prop : short_prop_ ) { + // Relocate data to fill erased space + size_t j = 1, stop = ( 1 == indices_size ) ? initial_size : indices[1], to = indices[0]; + for( size_t from = indices[0]+1; from < initial_size; from++ ) { + if( from < stop ) { + ( *prop )[to] = ( *prop )[from]; + to++; + } else { + j++; + stop = ( j == indices_size ) ? initial_size : indices[j]; + } + } + // Resize + prop->resize( initial_size - indices_size ); + } + + for( auto prop : uint64_prop_ ) { + // Relocate data to fill erased space + size_t j = 1, stop = ( 1 == indices_size ) ? 
initial_size : indices[1], to = indices[0]; + for( size_t from = indices[0]+1; from < initial_size; from++ ) { + if( from < stop ) { + ( *prop )[to] = ( *prop )[from]; + to++; + } else { + j++; + stop = ( j == indices_size ) ? initial_size : indices[j]; + } + } + // Resize + prop->resize( initial_size - indices_size ); + } + + } +} + // --------------------------------------------------------------------------------------------------------------------- // Print parameters of particle iPart // --------------------------------------------------------------------------------------------------------------------- @@ -1190,21 +1299,61 @@ void Particles::copyFromHostToDevice() { ERROR( "Device only feature, should not have come here!" ); } -void Particles::copyFromDeviceToHost() +void Particles::copyFromDeviceToHost( bool ) { ERROR( "Device only feature, should not have come here!" ); } -void Particles::extractParticles( Particles* particles_to_move ) +// Loop all particles and copy the outgoing ones to buffers +void Particles::copyLeavingParticlesToBuffers( const vector copy, const vector buffer ) { - particles_to_move->clear(); - for ( int ipart=0 ; ipart indices; + for( size_t ipart = 0; ipart < buffer[0]->size(); ipart++ ) { + int direction = -buffer[0]->cell_keys[ipart] - 2; + if( direction > 0 ) { + if( copy[direction] ) { + buffer[0]->copyParticle( ipart, *buffer[direction] ); + } + indices.push_back( ipart ); } } + buffer[0]->eraseParticles( indices ); + +#else + + // CPU + + for( size_t ipart = 0; ipart < size(); ipart++ ) { + if( cell_keys[ipart] < -1 ) { + int direction = -cell_keys[ipart] - 2; + if( copy[direction] ) { + copyParticle( ipart, *buffer[direction] ); + } + } + } + +#endif } +void Particles::copyLeavingParticlesToBuffer( Particles* ) +{ + ERROR( "Device only feature, should not have come here!" ); +} + + void Particles::savePositions() { unsigned int ndim = Position.size(), npart = size(); double *p[3], *pold[3]; @@ -1249,13 +1398,12 @@ int Particles::eraseLeavingParticles() return 0; } -int Particles::injectParticles( Particles *particles_to_inject ) +int Particles::addParticles( Particles* particles_to_inject ) { ERROR( "Device only feature, should not have come here! On CPU it's done in sortParticles." ); - return 0; } -void Particles::importAndSortParticles( Particles *particles_to_inject ) +void Particles::importAndSortParticles( Particles */*particles_to_inject*/ ) { ERROR( "Device only feature, should not have come here! On CPU it's done in sortParticles." ); } diff --git a/src/Particles/Particles.h b/src/Particles/Particles.h index 1f67ab1cc..20b9c2ea6 100755 --- a/src/Particles/Particles.h +++ b/src/Particles/Particles.h @@ -143,6 +143,8 @@ class Particles //! Insert nPart particles starting at ipart to dest_id in dest_parts void copyParticles( unsigned int iPart, unsigned int nPart, Particles &dest_parts, int dest_id ); + //! Transfer particles indexed by array indices to dest_id in dest_parts + void copyParticles( std::vector indices, Particles &dest_parts, int dest_id ); //! Make a new particle at the position of another void makeParticleAt( Particles &source_particles, unsigned int ipart, double w, short q=0., double px=0., double py=0., double pz=0. ); @@ -151,6 +153,8 @@ class Particles void eraseParticle( unsigned int iPart, bool compute_cell_keys = false ); //! Suppress nPart particles from iPart void eraseParticle( unsigned int iPart, unsigned int nPart, bool compute_cell_keys = false ); + //! 
Suppress indexed particles + void eraseParticles( std::vector indices ); //! Suppress all particles from iPart to the end of particle array void eraseParticleTrail( unsigned int iPart, bool compute_cell_keys = false ); @@ -431,7 +435,7 @@ class Particles virtual void initializeDataOnDevice(); virtual void initializeIDsOnDevice(); virtual void copyFromHostToDevice(); - virtual void copyFromDeviceToHost(); + virtual void copyFromDeviceToHost( bool copy_keys = false ); //! Return the pointer toward the Position[idim] vector virtual double* getPtrPosition( int idim ) { @@ -469,10 +473,10 @@ class Particles // Accelerator specific virtual functions // ----------------------------------------------------------------------------- - //! Extract particles from the Particles object and put - //! them in the Particles object `particles_to_move` + //! Extract particles leaving the box to buffers // ----------------------------------------------------------------------------- - virtual void extractParticles( Particles *particles_to_move ); + void copyLeavingParticlesToBuffers( const std::vector copy, const std::vector buffer ); + virtual void copyLeavingParticlesToBuffer( Particles* buffer ); // ----------------------------------------------------------------------------- //! Erase particles leaving the patch object on device @@ -480,11 +484,9 @@ class Particles virtual int eraseLeavingParticles(); // ----------------------------------------------------------------------------- - //! Inject particles from particles_to_move object and put - //! them in the Particles object - //! \param[in,out] particles_to_inject Particles object containing particles to inject - virtual int injectParticles( Particles *particles_to_inject ); - + //! Resize & Copy particles from particles_to_inject to the end of the vectors + virtual int addParticles( Particles* particles_to_inject ); + //! Implementation of a somewhat efficient particle injection, sorting //! (including removing leaving particles) and binning for GPU if //! available for the configuration of offloading technology diff --git a/src/Particles/ParticlesFactory.cpp b/src/Particles/ParticlesFactory.cpp index 00f51bbb0..34e9a3a83 100755 --- a/src/Particles/ParticlesFactory.cpp +++ b/src/Particles/ParticlesFactory.cpp @@ -7,7 +7,7 @@ // ----------------------------------------------------------------------------- #include "ParticlesFactory.h" -#if defined( SMILEI_OPENACC_MODE ) || defined( SMILEI_ACCELERATOR_GPU_OMP ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) || defined( SMILEI_ACCELERATOR_GPU_OMP ) extern "C" void* CreateGPUParticles( const void* parameters, const void* a_parent_patch ); #endif @@ -22,7 +22,7 @@ Particles* ParticlesFactory::create( const Params& parameters, // We export a C interface to avoid potential ABI problems // that could occur when using two different compilers (e.g., one to // compile cuda/hip and another one for the host code). 
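// Illustrative sketch only (not part of this patch): on the CUDA/HIP side, the
// extern "C" entry point declared above can simply forward to the device
// particle class constructor, keeping the device type out of the host headers:
//
//   extern "C" void* CreateGPUParticles( const void* parameters, const void* a_parent_patch )
//   {
//       return new nvidiaParticles( *static_cast<const Params*>( parameters ),
//                                   *static_cast<const Patch*>( a_parent_patch ) );
//   }
//
// The real definition lives in the GPU translation unit and may differ in detail.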
-#if defined( SMILEI_OPENACC_MODE ) || defined( SMILEI_ACCELERATOR_GPU_OMP ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) || defined( SMILEI_ACCELERATOR_GPU_OMP ) particles = static_cast( CreateGPUParticles( ¶meters, &a_parent_patch ) ); #else SMILEI_UNUSED( a_parent_patch ); diff --git a/src/Particles/nvidiaParticles.cu b/src/Particles/nvidiaParticles.cu old mode 100644 new mode 100755 index d7a63f0b3..a307455ea --- a/src/Particles/nvidiaParticles.cu +++ b/src/Particles/nvidiaParticles.cu @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include "Patch.h" @@ -33,14 +35,25 @@ // Cell key manipulation functor definition //////////////////////////////////////////////////////////////////////////////// -//! Structure with specific function count_if_out for thrust::tuple operator -//! Return True if the entry is -1 as in the cell keys vector for instance -struct count_if_out +//! Predicate for cell_keys +//! Return True if the entry is equal to `code` +template +struct cellKeyEquals { constexpr __host__ __device__ bool operator()( const int& x ) const { - return x == -1; + return x == code; + } +}; + +template +struct cellKeyBelow +{ + constexpr __host__ __device__ bool + operator()( const int& x ) const + { + return x < key; } }; @@ -71,12 +84,6 @@ namespace detail { const Params& parameters, const Patch& a_parent_patch ); - //! Sort the particle on GPU by their cluster/cell key. - //! - static inline void - sortParticleByKey( nvidiaParticles& particle_container, - const Params& parameters ); - //! precondition: //! - nvidia_cell_keys_ shall be sorted in non decreasing order //! - last_index.data() is a pointer mapped to GPU via @@ -107,36 +114,45 @@ namespace detail { InputIterator last, ClusterType cluster_type ); - template - static void - doSortParticleByKey( RandomAccessIterator0 key_first, - RandomAccessIterator0 key_last, - RandomAccessIterator1 value_first ); + }; - template + template + struct Cluster1D : public Cluster + { + public: + Cluster1D( double inverse_x_cell_dimension, + SizeType local_x_dimension_in_cell, + int CellStartingGlobalIndex_for_x); + + //! Compute the cell key of a_particle. a_particle shall be a tuple (from a + //! zipiterator). + //! The first value of a_particle is the cell key value, the other values are + //! the positions x + template + __host__ __device__ IDType + Index( const Tuple& a_particle ) const; + + //! Compute the cell key of a particle range. + //! static void - doImportAndSortParticles( nvidiaParticles& particle_container, - nvidiaParticles& particle_to_inject, - ClusterType cluster_type, - ParticleIteratorProvider particle_iterator_provider, - ParticleNoKeyIteratorProvider particle_no_key_iterator_provider ); - }; + computeParticleClusterKey( nvidiaParticles& particle_container, + const Params& parameters, + const Patch& a_parent_patch ); + double inverse_of_x_cell_dimension_; + int CellStartingGlobalIndex_for_x_; + }; template struct Cluster2D : public Cluster { - public: public: Cluster2D( double inverse_x_cell_dimension, double inverse_y_cell_dimension, SizeType local_x_dimension_in_cell, SizeType local_y_dimension_in_cell, - int CellStartingGlobalIndex_for_x, - int CellStartingGlobalIndex_for_y); + int CellStartingGlobalIndex_for_x, + int CellStartingGlobalIndex_for_y); //! Compute the cell key of a_particle. a_particle shall be a tuple (from a //! zipiterator). 
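For reference, the new Cluster1D follows the same key scheme as the existing 2D/3D variants, reduced to a single coordinate. A minimal sketch of the arithmetic performed by Cluster1D::Index(), assuming the cluster width of 4 cells that the revised Params::getGPUClusterWidth() returns for the 1D case (names and the helper below are illustrative, not part of the patch):

    // Toy reproduction of the 1D cluster key; x is the particle position,
    // inv_dx = parameters.res_space[0], cell_start_x = CellStartingGlobalIndex_for_x_.
    inline int cluster1DKey( double x, double inv_dx, int cell_start_x, int cluster_width = 4 )
    {
        const int local_cell = static_cast<int>( x * inv_dx ) - cell_start_x; // cell index inside the patch
        return local_cell / cluster_width;                                     // integer division -> cluster index along x
    }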
@@ -154,28 +170,17 @@ namespace detail { const Params& parameters, const Patch& a_parent_patch ); - static void - sortParticleByKey( nvidiaParticles& particle_container, - const Params& parameters ); - - static void - importAndSortParticles( nvidiaParticles& particle_container, - nvidiaParticles& particle_to_inject, - const Params& parameters, - const Patch& a_parent_patch ); - public: double inverse_of_x_cell_dimension_; double inverse_of_y_cell_dimension_; SizeType local_y_dimension_in_cluster_; - int CellStartingGlobalIndex_for_x_; + int CellStartingGlobalIndex_for_x_; int CellStartingGlobalIndex_for_y_; }; template struct Cluster3D : public Cluster { - public: public: Cluster3D( double inverse_x_cell_dimension, double inverse_y_cell_dimension, @@ -183,7 +188,7 @@ namespace detail { SizeType local_x_dimension_in_cell, SizeType local_y_dimension_in_cell, SizeType local_z_dimension_in_cell, - int CellStartingGlobalIndex_for_x, + int CellStartingGlobalIndex_for_x, int CellStartingGlobalIndex_for_y, int CellStartingGlobalIndex_for_z); @@ -203,16 +208,6 @@ namespace detail { const Params& parameters, const Patch& a_parent_patch ); - static void - sortParticleByKey( nvidiaParticles& particle_container, - const Params& parameters ); - - static void - importAndSortParticles( nvidiaParticles& particle_container, - nvidiaParticles& particle_to_inject, - const Params& parameters, - const Patch& a_parent_patch ); - public: double inverse_of_x_cell_dimension_; double inverse_of_y_cell_dimension_; @@ -220,7 +215,7 @@ namespace detail { SizeType local_y_dimension_in_cluster_; SizeType local_z_dimension_in_cluster_; int CellStartingGlobalIndex_for_x_; - int CellStartingGlobalIndex_for_y_; + int CellStartingGlobalIndex_for_y_; int CellStartingGlobalIndex_for_z_; }; @@ -230,46 +225,17 @@ namespace detail { template class AssignClusterIndex { - public: public: AssignClusterIndex( ClusterType cluster_type ) : cluster_type_{ cluster_type } { - // EMPTY } template __host__ __device__ void operator()( Tuple& a_particle ) const { - thrust::get<0>( a_particle ) /* cluster key */ = cluster_type_.Index( a_particle ); - } - - protected: - ClusterType cluster_type_; - }; - - - //! This functor assign a cluster key to a_particle. - //! - template - struct OutOfClusterPredicate - { - public: - public: - OutOfClusterPredicate( ClusterType cluster_type ) - : cluster_type_{ cluster_type } - { - // EMPTY - } - - template - __host__ __device__ bool - operator()( const Tuple& a_particle ) const - { - // NOTE: its ub to set the cluster key to wrongly keyed particles - // now.. - return thrust::get<0>( a_particle ) /* cluster key */ != cluster_type_.Index( a_particle ); + thrust::get<0>( a_particle ) = cluster_type_.Index( a_particle ); //cluster key } protected: @@ -277,20 +243,6 @@ namespace detail { }; - //! If the particle's cell/cluster key is -1 it means that it needs to be - //! evicted. - //! - struct OutOfBoundaryPredicate - { - template - __host__ __device__ bool - operator()( const Tuple& a_particle ) const - { - return thrust::get<0>( a_particle ) /* cluster key */ == -1; - } - }; - - //////////////////////////////////////////////////////////////////////////////// // Cluster manipulation functor method definitions //////////////////////////////////////////////////////////////////////////////// @@ -304,6 +256,12 @@ namespace detail { // dimensions. 
switch( particle_container.dimension() ) { + case 1: { + Cluster1D::computeParticleClusterKey( particle_container, + parameters, + a_parent_patch ); + break; + } case 2: { Cluster2D::computeParticleClusterKey( particle_container, parameters, @@ -317,32 +275,7 @@ namespace detail { break; } default: - // Not implemented, only Cartesian 2D or 3D for the moment - SMILEI_ASSERT( false ); - break; - } - } - - inline void - Cluster::sortParticleByKey( nvidiaParticles& particle_container, - const Params& parameters ) - { - // This is where we do a runtime dispatch depending on the simulation's - // dimensions. - - switch( particle_container.dimension() ) { - case 2: { - Cluster2D::sortParticleByKey( particle_container, - parameters ); - break; - } - case 3: { - Cluster3D::sortParticleByKey( particle_container, - parameters ); - break; - } - default: - // Not implemented, only Cartesian 2D or 3D for the moment + // Not implemented, only Cartesian 1D, 2D or 3D for the moment SMILEI_ASSERT( false ); break; } @@ -388,30 +321,46 @@ namespace detail { const Params& parameters, const Patch& a_parent_patch ) { - // This is where we do a runtime dispatch depending on the simulation's - // dimensions. + const auto initial_count = particle_container.deviceSize(); + const auto inject_count = particle_to_inject.deviceSize(); - switch( particle_container.dimension() ) { - case 2: { - Cluster2D::importAndSortParticles( particle_container, - particle_to_inject, - parameters, - a_parent_patch ); - break; - } - case 3: { - Cluster3D::importAndSortParticles( particle_container, - particle_to_inject, - parameters, - a_parent_patch ); - break; - } - - default: - // Not implemented, only 2D for the moment - SMILEI_ASSERT( false ); - break; + // Locate out-of-bounds particles in array "available_places" + const auto keys = particle_container.getPtrCellKeys(); + const auto erased_count = thrust::count_if( thrust::device, keys, keys + initial_count, cellKeyBelow<0>() ); + thrust::device_vector available_places( erased_count ); + thrust::copy_if( thrust::device, + thrust::counting_iterator{0}, + thrust::counting_iterator{ (int) initial_count }, + keys, + available_places.begin(), + cellKeyBelow<0>() ); + + const auto new_count = initial_count + inject_count - erased_count; + + // Copy the imported particles to available places + particle_to_inject.scatterParticles( particle_container, available_places ); + // If there are more imported particles than places, copy the remaining imported particles at the end + if( inject_count >= erased_count ) { + particle_container.deviceResize( new_count ); + particle_container.pasteParticles( &particle_to_inject, initial_count, erased_count ); + // If there are more places than imported particles, the remaining places should be filled + } else { + const auto last_filled = available_places[inject_count]; + particle_container.eraseParticlesByPredicate( cellKeyBelow<0>(), last_filled ); + particle_container.deviceResize( new_count ); } + + // Compute keys of particles + computeParticleClusterKey( particle_container, parameters, a_parent_patch ); + + // Sort particles by keys + // using particle_to_inject as a buffer (it is swapped with particle_container after sorting) + particle_to_inject.deviceReserve( new_count ); // reserve a bit more memory for the final arrays + particle_to_inject.deviceResize( new_count ); + particle_container.sortParticleByKey( particle_to_inject ); + + // Recompute bin locations + computeBinIndex( particle_container ); } template { cluster_type } ); } - template - 
void - Cluster::doSortParticleByKey( RandomAccessIterator0 key_first, - RandomAccessIterator0 key_last, - RandomAccessIterator1 value_first ) - { - thrust::sort_by_key( thrust::device, - key_first, key_last, - value_first ); - } + //////////////////////////////////////////////////////////////////////////////// + // Cluster method definitions + //////////////////////////////////////////////////////////////////////////////// - template - void - Cluster::doImportAndSortParticles( nvidiaParticles& particle_container, - nvidiaParticles& particle_to_inject, - ClusterType cluster_type, - ParticleIteratorProvider particle_iterator_provider, - ParticleNoKeyIteratorProvider particle_no_key_iterator_provider ) + template + Cluster1D::Cluster1D( double inverse_x_cell_dimension, + SizeType local_x_dimension_in_cell, + int CellStartingGlobalIndex_for_x) + : inverse_of_x_cell_dimension_{ inverse_x_cell_dimension } + , CellStartingGlobalIndex_for_x_{CellStartingGlobalIndex_for_x} { - const auto first_particle = particle_iterator_provider( particle_container ); - - auto last_particle = first_particle + - particle_container.deviceSize(); // Obviously, we use half open ranges - - // Remove out of bound particles - // Using more memory, we could use the faster remove_copy_if - // NOTE: remove_if is stable. - last_particle = thrust::remove_if( thrust::device, - first_particle, - last_particle, - OutOfBoundaryPredicate{} ); - - // Idea 1: - remove_copy_if instead of copy_if - // - sort(the_particles_to_inject) - // - merge - // - compute bins - // NOTE: This method consumes a lot of memory ! O(N) - - const auto new_particle_to_inject_count = particle_to_inject.deviceSize(); - const auto current_local_particles_count = std::distance( first_particle, last_particle ); - const auto new_particle_count = new_particle_to_inject_count + current_local_particles_count; - - // NOTE: We really want a non-initializing vector here! - // It's possible to give a custom allocator to thrust::device_vector. - // Create one with construct(<>) as a noop and derive from - // thrust::device_malloc_allocator. For now we do an explicit resize. - particle_to_inject.softReserve( new_particle_count ); - particle_to_inject.resize( new_particle_count ); // We probably invalidated the iterators - - // Copy out of cluster/tile/chunk particles - // partition_copy is way slower than copy_if/remove_copy_if on rocthrust - // https://github.com/ROCmSoftwarePlatform/rocThrust/issues/247 - - const auto first_particle_to_inject = particle_iterator_provider( particle_to_inject ); - - // NOTE: copy_if/remove_copy_if are stable. - const auto partitioned_particles_bounds_true = thrust::copy_if( thrust::device, - first_particle, last_particle, - // Dont overwrite the particle_to_inject (at the start of the array) - first_particle_to_inject + new_particle_to_inject_count, - OutOfClusterPredicate{ cluster_type } ); - const auto partitioned_particles_bounds_false = thrust::remove_copy_if( thrust::device, - first_particle, last_particle, - // Do the copy with a destination - // starting from partitioned_particles_bounds_true - partitioned_particles_bounds_true, - OutOfClusterPredicate{ cluster_type } ); - - // Compute or recompute the cluster index of the particle_to_inject - // NOTE: - // - we can "save" some work here if cluster index is already computed - // for the new particles to inject (not the one we got with copy_if). 
- // - doComputeParticleClusterKey( first_particle_to_inject, - partitioned_particles_bounds_true, - cluster_type ); - - const auto first_particle_to_inject_no_key = particle_no_key_iterator_provider( particle_to_inject ); - const auto particle_to_rekey_count = std::distance( first_particle_to_inject, - partitioned_particles_bounds_true ); - - doSortParticleByKey( particle_to_inject.getPtrCellKeys(), - particle_to_inject.getPtrCellKeys() + particle_to_rekey_count, - first_particle_to_inject_no_key ); - - // This free generates a lot of memory fragmentation. - // particle_container.free(); - // Same as for particle_to_inject, non-initializing vector is best. - particle_container.softReserve( new_particle_count ); - particle_container.resize( new_particle_count ); - - // Merge by key - // NOTE: Dont merge in place on GPU. That means we need an other large buffer! - // - thrust::merge_by_key( thrust::device, - particle_to_inject.getPtrCellKeys(), // Input range 1, first key - particle_to_inject.getPtrCellKeys() + particle_to_rekey_count, // Input range 1, last key - particle_to_inject.getPtrCellKeys() + particle_to_rekey_count, // Input range 2, first key - particle_to_inject.getPtrCellKeys() + new_particle_count, // Input range 2, last key - first_particle_to_inject_no_key, // Input range 1, first value - first_particle_to_inject_no_key + particle_to_rekey_count, // Input range 2, first value - particle_container.getPtrCellKeys(), // Output range first key - particle_no_key_iterator_provider( particle_container ) ); // Output range first value - - // Recompute bins - computeBinIndex( particle_container ); - - // This free generates a lot of memory fragmentation. If we enable it we - // reduce significantly the memory usage over time but a memory spike - // will still be present. Unfortunately, this free generates soo much - // fragmentation (like the one above) that at some point the GPU memory - // allocator will fail! 
- // particle_to_inject.free(); } - - //////////////////////////////////////////////////////////////////////////////// - // Cluster2D method definitions - //////////////////////////////////////////////////////////////////////////////// - template Cluster2D::Cluster2D( double inverse_x_cell_dimension, double inverse_y_cell_dimension, SizeType local_x_dimension_in_cell, SizeType local_y_dimension_in_cell, - int CellStartingGlobalIndex_for_x, int CellStartingGlobalIndex_for_y ) + int CellStartingGlobalIndex_for_x, int CellStartingGlobalIndex_for_y ) : inverse_of_x_cell_dimension_{ inverse_x_cell_dimension } , inverse_of_y_cell_dimension_{ inverse_y_cell_dimension } , local_y_dimension_in_cluster_{ local_y_dimension_in_cell / kClusterWidth } , CellStartingGlobalIndex_for_x_{CellStartingGlobalIndex_for_x} - , CellStartingGlobalIndex_for_y_{CellStartingGlobalIndex_for_y} + , CellStartingGlobalIndex_for_y_{CellStartingGlobalIndex_for_y} { - // EMPTY } template @@ -571,7 +409,7 @@ namespace detail { SizeType local_x_dimension_in_cell, SizeType local_y_dimension_in_cell, SizeType local_z_dimension_in_cell, - int CellStartingGlobalIndex_for_x, + int CellStartingGlobalIndex_for_x, int CellStartingGlobalIndex_for_y, int CellStartingGlobalIndex_for_z ) : inverse_of_x_cell_dimension_{ inverse_x_cell_dimension } , inverse_of_y_cell_dimension_{ inverse_y_cell_dimension } @@ -582,7 +420,30 @@ namespace detail { , CellStartingGlobalIndex_for_y_{CellStartingGlobalIndex_for_y} , CellStartingGlobalIndex_for_z_{CellStartingGlobalIndex_for_z} { - // EMPTY + } + + template + template + __host__ __device__ typename Cluster1D::IDType + Cluster1D::Index( const Tuple& a_particle ) const + { + const SizeType local_x_particle_coordinate_in_cell = static_cast( thrust::get<1>( a_particle ) * + inverse_of_x_cell_dimension_ ) - + CellStartingGlobalIndex_for_x_; + + // These divisions will be optimized. + // The integer division rounding behavior is expected. + + // NOTE: Flat tiles have been studied but were not as efficient for the + // projection. The square provides the minimal perimeter (and thus ghost + // cell amount) for a given area. 
+ static constexpr SizeType x_cluster_dimension_in_cell = kClusterWidth; + + const SizeType local_x_particle_cluster_coordinate_in_cluster = local_x_particle_coordinate_in_cell / x_cluster_dimension_in_cell; + + const SizeType cluster_index = local_x_particle_cluster_coordinate_in_cluster; + + return static_cast( cluster_index ); } template @@ -658,6 +519,22 @@ namespace detail { return static_cast( cluster_index ); } + template + void + Cluster1D::computeParticleClusterKey( nvidiaParticles& particle_container, + const Params& parameters, + const Patch& a_parent_patch ) + { + const auto first = thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrCellKeys(), + static_cast( particle_container.getPtrPosition( 0 ) ) ) ); + const auto last = first + particle_container.deviceSize(); + int CellStartingGlobalIndex_for_x = a_parent_patch.getCellStartingGlobalIndex_noGC(0); + doComputeParticleClusterKey( first, last, + Cluster1D{ parameters.res_space[0], + parameters.patch_size_[0], + CellStartingGlobalIndex_for_x} ); + } + template void Cluster2D::computeParticleClusterKey( nvidiaParticles& particle_container, @@ -670,7 +547,7 @@ namespace detail { const auto last = first + particle_container.deviceSize(); int CellStartingGlobalIndex_for_x = a_parent_patch.getCellStartingGlobalIndex_noGC(0); int CellStartingGlobalIndex_for_y = a_parent_patch.getCellStartingGlobalIndex_noGC(1); - doComputeParticleClusterKey( first, last, + doComputeParticleClusterKey( first, last, Cluster2D{ parameters.res_space[0], parameters.res_space[1], parameters.patch_size_[0], @@ -693,7 +570,7 @@ namespace detail { int CellStartingGlobalIndex_for_x = a_parent_patch.getCellStartingGlobalIndex_noGC(0); int CellStartingGlobalIndex_for_y = a_parent_patch.getCellStartingGlobalIndex_noGC(1); int CellStartingGlobalIndex_for_z = a_parent_patch.getCellStartingGlobalIndex_noGC(2); - doComputeParticleClusterKey( first, last, + doComputeParticleClusterKey( first, last, Cluster3D{ parameters.res_space[0], parameters.res_space[1], parameters.res_space[2], @@ -705,277 +582,6 @@ namespace detail { CellStartingGlobalIndex_for_z } ); } - template - void - Cluster2D::sortParticleByKey( nvidiaParticles& particle_container, - const Params& ) - { - // This is where we do a runtime dispatch depending on the simulation's - // qed/radiation settings. - - // NOTE: For now we support dont support qed/radiations. Performance - // comes from specialization. - - // TODO(Etienne M): Find a better way to dispatch at runtime. This is - // complex to read and to maintain. 
- - if( particle_container.has_quantum_parameter ) { - if( particle_container.has_Monte_Carlo_process ) { - SMILEI_ASSERT( false ); - } else { - SMILEI_ASSERT( false ); - } - } else { - if( particle_container.has_Monte_Carlo_process ) { - SMILEI_ASSERT( false ); - } else { - // The appropriate thrust::zip_iterator for the current - // simulation's parameters - - const auto value_first = thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrPosition( 0 ), - particle_container.getPtrPosition( 1 ), - particle_container.getPtrMomentum( 0 ), - particle_container.getPtrMomentum( 1 ), - particle_container.getPtrMomentum( 2 ), - particle_container.getPtrWeight(), - particle_container.getPtrCharge() ) ); - - doSortParticleByKey( particle_container.getPtrCellKeys(), - particle_container.getPtrCellKeys() + particle_container.deviceSize(), - value_first ); - } - } - } - - template - void - Cluster3D::sortParticleByKey( nvidiaParticles& particle_container, - const Params& ) - { - // This is where we do a runtime dispatch depending on the simulation's - // qed/radiation settings. - - // NOTE: For now we support dont support qed/radiations. Performance - // comes from specialization. - - // TODO(Etienne M): Find a better way to dispatch at runtime. This is - // complex to read and to maintain. - - if( particle_container.has_quantum_parameter ) { - if( particle_container.has_Monte_Carlo_process ) { - SMILEI_ASSERT( false ); - } else { - SMILEI_ASSERT( false ); - } - } else { - if( particle_container.has_Monte_Carlo_process ) { - SMILEI_ASSERT( false ); - } else { - // The appropriate thrust::zip_iterator for the current - // simulation's parameters - - if (particle_container.tracked) { - const auto value_first = thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrPosition( 0 ), - particle_container.getPtrPosition( 1 ), - particle_container.getPtrPosition( 2 ), - particle_container.getPtrMomentum( 0 ), - particle_container.getPtrMomentum( 1 ), - particle_container.getPtrMomentum( 2 ), - particle_container.getPtrWeight(), - particle_container.getPtrCharge(), - particle_container.getPtrId() ) ); - doSortParticleByKey( particle_container.getPtrCellKeys(), - particle_container.getPtrCellKeys() + particle_container.deviceSize(), - value_first ); - - } - else { - const auto value_first = thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrPosition( 0 ), - particle_container.getPtrPosition( 1 ), - particle_container.getPtrPosition( 2 ), - particle_container.getPtrMomentum( 0 ), - particle_container.getPtrMomentum( 1 ), - particle_container.getPtrMomentum( 2 ), - particle_container.getPtrWeight(), - particle_container.getPtrCharge() ) ); - doSortParticleByKey( particle_container.getPtrCellKeys(), - particle_container.getPtrCellKeys() + particle_container.deviceSize(), - value_first ); - } - } - } - } - - template - void - Cluster2D::importAndSortParticles( nvidiaParticles& particle_container, - nvidiaParticles& particle_to_inject, - const Params& parameters, - const Patch& a_parent_patch ) - { - // This is where we do a runtime dispatch depending on the simulation's - // qed/radiation settings. - - // NOTE: For now we support dont support qed/radiations. Performance - // comes from specialization. - - // TODO(Etienne M): Find a better way to dispatch at runtime. This is - // complex to read and to maintain. 
- int CellStartingGlobalIndex_for_x = a_parent_patch.getCellStartingGlobalIndex_noGC(0); - int CellStartingGlobalIndex_for_y = a_parent_patch.getCellStartingGlobalIndex_noGC(1); - - const Cluster2D cluster_manipulator{ parameters.res_space[0], - parameters.res_space[1], - parameters.patch_size_[0], - parameters.patch_size_[1], - CellStartingGlobalIndex_for_x, CellStartingGlobalIndex_for_y}; - - if( particle_container.has_quantum_parameter ) { - if( particle_container.has_Monte_Carlo_process ) { - SMILEI_ASSERT( false ); - } else { - SMILEI_ASSERT( false ); - } - } else { - if( particle_container.has_Monte_Carlo_process ) { - SMILEI_ASSERT( false ); - } else { - // Returns the appropriate thrust::zip_iterator for the - // current simulation's parameters - const auto particle_iterator_provider = []( nvidiaParticles& particle_container ) { - return thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrCellKeys(), - particle_container.getPtrPosition( 0 ), - particle_container.getPtrPosition( 1 ), - particle_container.getPtrMomentum( 0 ), - particle_container.getPtrMomentum( 1 ), - particle_container.getPtrMomentum( 2 ), - particle_container.getPtrWeight(), - particle_container.getPtrCharge() ) ); - }; - - const auto particle_no_key_iterator_provider = []( nvidiaParticles& particle_container ) { - return thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrPosition( 0 ), - particle_container.getPtrPosition( 1 ), - particle_container.getPtrMomentum( 0 ), - particle_container.getPtrMomentum( 1 ), - particle_container.getPtrMomentum( 2 ), - particle_container.getPtrWeight(), - particle_container.getPtrCharge() ) ); - }; - - doImportAndSortParticles( particle_container, - particle_to_inject, - cluster_manipulator, - particle_iterator_provider, - particle_no_key_iterator_provider ); - } - } - } - - template - void - Cluster3D::importAndSortParticles( nvidiaParticles& particle_container, - nvidiaParticles& particle_to_inject, - const Params& parameters, - const Patch& a_parent_patch ) - { - // This is where we do a runtime dispatch depending on the simulation's - // qed/radiation settings. - - // NOTE: For now we support dont support qed/radiations. Performance - // comes from specialization. - - // TODO(Etienne M): Find a better way to dispatch at runtime. This is - // complex to read and to maintain. 
- int CellStartingGlobalIndex_for_x = a_parent_patch.getCellStartingGlobalIndex_noGC(0); - int CellStartingGlobalIndex_for_y = a_parent_patch.getCellStartingGlobalIndex_noGC(1); - int CellStartingGlobalIndex_for_z = a_parent_patch.getCellStartingGlobalIndex_noGC(2); - - const Cluster3D cluster_manipulator{ parameters.res_space[0], - parameters.res_space[1], - parameters.res_space[2], - parameters.patch_size_[0], - parameters.patch_size_[1], - parameters.patch_size_[2], - CellStartingGlobalIndex_for_x, - CellStartingGlobalIndex_for_y, CellStartingGlobalIndex_for_z}; - - if( particle_container.has_quantum_parameter ) { - if( particle_container.has_Monte_Carlo_process ) { - SMILEI_ASSERT( false ); - } else { - SMILEI_ASSERT( false ); - } - } else { - if( particle_container.has_Monte_Carlo_process ) { - SMILEI_ASSERT( false ); - } else { - // Returns the appropriate thrust::zip_iterator for the - // current simulation's parameters - if (particle_container.tracked) { - const auto particle_iterator_provider = []( nvidiaParticles& particle_container ) { - return thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrCellKeys(), - particle_container.getPtrPosition( 0 ), - particle_container.getPtrPosition( 1 ), - particle_container.getPtrPosition( 2 ), - particle_container.getPtrMomentum( 0 ), - particle_container.getPtrMomentum( 1 ), - particle_container.getPtrMomentum( 2 ), - particle_container.getPtrWeight(), - particle_container.getPtrCharge(), - particle_container.getPtrId() ) ); - }; - const auto particle_no_key_iterator_provider = []( nvidiaParticles& particle_container ) { - return thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrPosition( 0 ), - particle_container.getPtrPosition( 1 ), - particle_container.getPtrPosition( 2 ), - particle_container.getPtrMomentum( 0 ), - particle_container.getPtrMomentum( 1 ), - particle_container.getPtrMomentum( 2 ), - particle_container.getPtrWeight(), - particle_container.getPtrCharge(), - particle_container.getPtrId() ) ); - }; - doImportAndSortParticles( particle_container, - particle_to_inject, - cluster_manipulator, - particle_iterator_provider, - particle_no_key_iterator_provider ); - } - else { - const auto particle_iterator_provider = []( nvidiaParticles& particle_container ) { - return thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrCellKeys(), - particle_container.getPtrPosition( 0 ), - particle_container.getPtrPosition( 1 ), - particle_container.getPtrPosition( 2 ), - particle_container.getPtrMomentum( 0 ), - particle_container.getPtrMomentum( 1 ), - particle_container.getPtrMomentum( 2 ), - particle_container.getPtrWeight(), - particle_container.getPtrCharge() ) ); - }; - - const auto particle_no_key_iterator_provider = []( nvidiaParticles& particle_container ) { - return thrust::make_zip_iterator( thrust::make_tuple( particle_container.getPtrPosition( 0 ), - particle_container.getPtrPosition( 1 ), - particle_container.getPtrPosition( 2 ), - particle_container.getPtrMomentum( 0 ), - particle_container.getPtrMomentum( 1 ), - particle_container.getPtrMomentum( 2 ), - particle_container.getPtrWeight(), - particle_container.getPtrCharge() ) ); - }; - - doImportAndSortParticles( particle_container, - particle_to_inject, - cluster_manipulator, - particle_iterator_provider, - particle_no_key_iterator_provider ); - } - } - } - } - } // namespace detail @@ -990,7 +596,6 @@ nvidiaParticles::nvidiaParticles( const Params& parameters, , parent_patch_{ &a_parent_patch } , gpu_nparts_{} { - // 
EMPTY } nvidiaParticles::~nvidiaParticles() { @@ -1000,13 +605,7 @@ nvidiaParticles::~nvidiaParticles() { } } -void nvidiaParticles::resizeDimensions( unsigned int nDim ) -{ - nvidia_position_.resize( nDim ); - nvidia_momentum_.resize( 3 ); -} - -void nvidiaParticles::softReserve( unsigned int particle_count, float growth_factor ) +void nvidiaParticles::deviceReserve( unsigned int particle_count, float growth_factor ) { if( particle_count <= deviceCapacity() ) { // Dont reserve, for now we have enough capacity. @@ -1015,23 +614,12 @@ void nvidiaParticles::softReserve( unsigned int particle_count, float growth_fac const unsigned int new_capacity = static_cast( particle_count * growth_factor ); - for( unsigned int idim = 0; idim < nvidia_position_.size(); idim++ ) { - nvidia_position_[idim].reserve( new_capacity ); - } - - for( unsigned int idim = 0; idim < 3; idim++ ) { - nvidia_momentum_[idim].reserve( new_capacity ); - } - - nvidia_weight_.reserve( new_capacity ); - nvidia_charge_.reserve( new_capacity ); - - if( has_quantum_parameter ) { - nvidia_chi_.reserve( new_capacity ); + for( auto prop: nvidia_double_prop_ ) { + prop->reserve( new_capacity ); } - if( has_Monte_Carlo_process ) { - nvidia_tau_.reserve( new_capacity ); + for( auto prop: nvidia_short_prop_ ) { + prop->reserve( new_capacity ); } if( tracked ) { @@ -1041,137 +629,41 @@ void nvidiaParticles::softReserve( unsigned int particle_count, float growth_fac nvidia_cell_keys_.reserve( new_capacity ); } -void nvidiaParticles::reserve( unsigned int particle_count ) -{ - for( unsigned int idim = 0; idim < nvidia_position_.size(); idim++ ) { - nvidia_position_[idim].reserve( particle_count ); - } - - for( unsigned int idim = 0; idim < 3; idim++ ) { - nvidia_momentum_[idim].reserve( particle_count ); - } - - nvidia_weight_.reserve( particle_count ); - nvidia_charge_.reserve( particle_count ); - - if( has_quantum_parameter ) { - nvidia_chi_.reserve( particle_count ); - } - - if( has_Monte_Carlo_process ) { - nvidia_tau_.reserve( particle_count ); - } - - if( tracked ) { - nvidia_id_.reserve( particle_count ); - } - - nvidia_cell_keys_.reserve( particle_count ); -} - -void nvidiaParticles::resize( unsigned int particle_count ) -{ - - // TODO(Etienne M): Use non-initializing vector/allocator (dont pay the cost - // of what you dont use) ? 
- - for( int idim = 0; idim < nvidia_position_.size(); idim++ ) { - nvidia_position_[idim].resize( particle_count ); - } - - for( int idim = 0; idim < 3; idim++ ) { - nvidia_momentum_[idim].resize( particle_count ); - } - - nvidia_weight_.resize( particle_count ); - nvidia_charge_.resize( particle_count ); - - if( has_quantum_parameter ) { - nvidia_chi_.resize( particle_count ); - } - - if( has_Monte_Carlo_process ) { - nvidia_tau_.resize( particle_count ); - } - - if( tracked ) { - nvidia_id_.resize( particle_count ); - } - - nvidia_cell_keys_.resize( particle_count ); - - gpu_nparts_ = particle_count; -} - -void nvidiaParticles::free() +void nvidiaParticles::deviceFree() { - for( auto& a_vector : nvidia_position_ ) { - thrust::device_vector a_dummy_vector{}; - std::swap( a_vector, a_dummy_vector ); - } - - for( auto& a_vector : nvidia_momentum_ ) { - thrust::device_vector a_dummy_vector{}; - std::swap( a_vector, a_dummy_vector ); + for( auto prop: nvidia_double_prop_ ) { + thrust::device_vector().swap( *prop ); } - { - thrust::device_vector a_dummy_vector{}; - std::swap( nvidia_weight_, a_dummy_vector ); - } - - { - thrust::device_vector a_dummy_vector{}; - std::swap( nvidia_charge_, a_dummy_vector ); - } - - if( has_quantum_parameter ) { - thrust::device_vector a_dummy_vector{}; - std::swap( nvidia_chi_, a_dummy_vector ); - } - - if( has_Monte_Carlo_process ) { - thrust::device_vector a_dummy_vector{}; - std::swap( nvidia_tau_, a_dummy_vector ); + for( auto prop: nvidia_short_prop_ ) { + thrust::device_vector().swap( *prop ); } if( tracked ) { - thrust::device_vector a_dummy_vector{}; - std::swap( nvidia_id_, a_dummy_vector ); + thrust::device_vector().swap( nvidia_id_ ); } - { - thrust::device_vector a_dummy_vector{}; - std::swap( nvidia_cell_keys_, a_dummy_vector ); - } + thrust::device_vector().swap( nvidia_cell_keys_ ); gpu_nparts_ = 0; } -// --------------------------------------------------------------------------------------------------------------------- -//! Resize particle vectors -// --------------------------------------------------------------------------------------------------------------------- void nvidiaParticles::deviceResize( unsigned int new_size ) { - for( unsigned int iprop=0 ; ipropresize( new_size ); } - for( unsigned int iprop=0 ; ipropresize( new_size ); } - // - // for( unsigned int iprop=0 ; ipropclear(); + for( auto prop: nvidia_double_prop_ ) { + prop->clear(); } - for( unsigned int iprop = 0; iprop < nvidia_short_prop_.size(); iprop++ ) { - nvidia_short_prop_[iprop]->clear(); + for( auto prop: nvidia_short_prop_ ) { + prop->clear(); } // TODO(Etienne M): Clear cell keys too ? - if (tracked) { + if( tracked ) { nvidia_id_.clear(); } - + gpu_nparts_ = 0; } @@ -1215,23 +707,18 @@ void nvidiaParticles::initializeDataOnDevice() // The world shall end if we call this function multiple times SMILEI_ASSERT( nvidia_double_prop_.empty() ); - const auto kPositionDimension = Position.size(); - // We sure that we have as many say, position dimension as the base class. 
- resizeDimensions( kPositionDimension ); + nvidia_position_.resize( Position.size() ); + nvidia_momentum_.resize( 3 ); // Initialize the list of pointers - - for( unsigned int i = 0; i < kPositionDimension; i++ ) { - nvidia_double_prop_.push_back( &nvidia_position_[i] ); + for( auto &pos: nvidia_position_ ) { + nvidia_double_prop_.push_back( &pos ); } - - for( unsigned int i = 0; i < 3; i++ ) { - nvidia_double_prop_.push_back( &nvidia_momentum_[i] ); + for( auto &mom: nvidia_momentum_ ) { + nvidia_double_prop_.push_back( &mom ); } - nvidia_double_prop_.push_back( &nvidia_weight_ ); - nvidia_short_prop_.push_back( &nvidia_charge_ ); // Quantum parameter (for QED effects): @@ -1248,9 +735,9 @@ void nvidiaParticles::initializeDataOnDevice() nvidia_double_prop_.push_back( &nvidia_tau_ ); } - const auto kHostParticleCount = Position[0].size(); + const auto hostParticleCount = Position[0].size(); - if( kHostParticleCount == 0 ) { + if( hostParticleCount == 0 ) { // Should we reserve some space ? // reserve( 100 ); } else { @@ -1271,14 +758,13 @@ void nvidiaParticles::initializeDataOnDevice() // setHostBinIndex(); } else { - + // At this point, a copy of the host particles and last_index is on the // device and we know we support the space dimension. - detail::Cluster::computeParticleClusterKey( *this, *parameters_, *parent_patch_ ); // The particles are not correctly sorted when created. - detail::Cluster::sortParticleByKey( *this, *parameters_ ); + sortParticleByKey(); detail::Cluster::computeBinIndex( *this ); setHostBinIndex(); @@ -1299,7 +785,7 @@ void nvidiaParticles::initializeIDsOnDevice() // ------------------------------------------------------------------------------------------------- void nvidiaParticles::copyFromHostToDevice() { - resize( Position[0].size() ); + deviceResize( Position[0].size() ); for( int idim = 0; idim < Position.size(); idim++ ) { thrust::copy( Position[idim].begin(), Position[idim].end(), nvidia_position_[idim].begin() ); @@ -1308,7 +794,6 @@ void nvidiaParticles::copyFromHostToDevice() for( int idim = 0; idim < Momentum.size(); idim++ ) { thrust::copy( Momentum[idim].begin(), Momentum[idim].end(), nvidia_momentum_[idim].begin() ); } - thrust::copy( Weight.begin(), Weight.end(), nvidia_weight_.begin() ); thrust::copy( Charge.begin(), Charge.end(), nvidia_charge_.begin() ); @@ -1329,7 +814,7 @@ void nvidiaParticles::copyFromHostToDevice() // ------------------------------------------------------------------------------------------------- //! 
Copy device to host // ------------------------------------------------------------------------------------------------- -void nvidiaParticles::copyFromDeviceToHost() +void nvidiaParticles::copyFromDeviceToHost( bool copy_keys ) { for (int idim=0;idim( particles_to_move ); - const int nparts = gpu_nparts_; - const int position_dimension_count = nvidia_position_.size(); - - const int nparts_to_move = thrust::count_if( thrust::device, - nvidia_cell_keys_.cbegin(), - nvidia_cell_keys_.cbegin() + nparts, - count_if_out() ); - - // Resize it, if too small (copy_if do not resize) - cp_parts->resize( nparts_to_move ); - - // Iterator of the main data structure - // NOTE: https://nvidia.github.io/thrust/api/classes/classthrust_1_1zip__iterator.html#class-thrustzip_iterator - const auto source_iterator_first = thrust::make_zip_iterator( thrust::make_tuple( nvidia_position_[0].begin(), - nvidia_momentum_[0].begin(), - nvidia_momentum_[1].begin(), - nvidia_momentum_[2].begin(), - nvidia_weight_.begin(), - nvidia_charge_.begin() ) ); - const auto source_iterator_last = source_iterator_first + nparts; // std::advance - const auto destination_iterator_first = thrust::make_zip_iterator( thrust::make_tuple( cp_parts->nvidia_position_[0].begin(), - cp_parts->nvidia_momentum_[0].begin(), - cp_parts->nvidia_momentum_[1].begin(), - cp_parts->nvidia_momentum_[2].begin(), - cp_parts->nvidia_weight_.begin(), - cp_parts->nvidia_charge_.begin() ) ); - - // Copy send particles in dedicated data structure if nvidia_cell_keys_=0 (currently = 1 if keeped, new PartBoundCond::apply(...)) - thrust::copy_if( thrust::device, - source_iterator_first, - source_iterator_last, - // Copy depending on count_if_out()(nvidia_cell_keys_[i]) - nvidia_cell_keys_.cbegin(), - destination_iterator_first, - count_if_out() ); - - // Copy the other position values depending on the simulation's grid - // dimensions - for( int i = 1; i < position_dimension_count; ++i ) { - thrust::copy_if( thrust::device, - nvidia_position_[i].cbegin(), - nvidia_position_[i].cbegin() + nparts, - nvidia_cell_keys_.cbegin(), - cp_parts->nvidia_position_[i].begin(), - count_if_out() ); - } + copyParticlesByPredicate( buffer, cellKeyBelow<-1>() ); + buffer->copyFromDeviceToHost( true ); +} - // Special treatment for chi if radiation emission - if( has_quantum_parameter ) { - thrust::copy_if( thrust::device, - nvidia_chi_.cbegin(), - nvidia_chi_.cbegin() + nparts, - nvidia_cell_keys_.cbegin(), - cp_parts->nvidia_chi_.begin(), - count_if_out() ); - } - if( has_Monte_Carlo_process ) { - thrust::copy_if( thrust::device, - nvidia_tau_.cbegin(), - nvidia_tau_.cbegin() + nparts, - nvidia_cell_keys_.cbegin(), - cp_parts->nvidia_tau_.begin(), - count_if_out() ); +//! 
Copy particles which satisfy some predicate +template <typename Predicate> +void nvidiaParticles::copyParticlesByPredicate( Particles* buffer, Predicate pred ) +{ +    // Count particles satisfying the predicate +    const auto keys = getPtrCellKeys(); +    const int nparts_to_copy = thrust::count_if( thrust::device, keys, keys + gpu_nparts_, pred ); + +    // Resize destination buffer (copy_if does not resize) +    nvidiaParticles* const dest = static_cast<nvidiaParticles*>( buffer ); +    dest->deviceResize( nparts_to_copy ); + +    if( nparts_to_copy ) { +        // Copy the particles to the destination +        for( int ip = 0; ip < nvidia_double_prop_.size(); ip++ ) { +            const auto in = nvidia_double_prop_[ip]->begin(); +            const auto out = dest->nvidia_double_prop_[ip]->begin(); +            thrust::copy_if( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, in + gpu_nparts_, keys, out, pred ); +        } +        for( int ip = 0; ip < nvidia_short_prop_.size(); ip++ ) { +            const auto in = nvidia_short_prop_[ip]->begin(); +            const auto out = dest->nvidia_short_prop_[ip]->begin(); +            thrust::copy_if( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, in + gpu_nparts_, keys, out, pred ); +        } +        if( tracked ) { +            const auto in = nvidia_id_.begin(); +            const auto out = dest->nvidia_id_.begin(); +            thrust::copy_if( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, in + gpu_nparts_, keys, out, pred ); +        } +        const auto in = nvidia_cell_keys_.begin(); +        const auto out = dest->nvidia_cell_keys_.begin(); +        thrust::copy_if( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, in + gpu_nparts_, keys, out, pred ); +        SMILEI_ACCELERATOR_DEVICE_SYNC(); +    } +} +int nvidiaParticles::addParticles( Particles* particles_to_inject ) +{ +    const auto nparts = gpu_nparts_; +    nvidiaParticles* to_inject = static_cast<nvidiaParticles*>( particles_to_inject ); +    deviceResize( nparts + to_inject->gpu_nparts_ ); +    pasteParticles( to_inject, nparts, 0 ); +    return to_inject->gpu_nparts_; +} + +void nvidiaParticles::pasteParticles( nvidiaParticles* particles_to_inject, size_t offset_in_output, size_t offset_in_input ) +{ +    const auto n = particles_to_inject->gpu_nparts_ - (int) offset_in_input; + +    // Copy the particles to the destination +    for( int ip = 0; ip < nvidia_double_prop_.size(); ip++ ) { +        const auto in = particles_to_inject->nvidia_double_prop_[ip]->begin() + offset_in_input; +        const auto out = nvidia_double_prop_[ip]->begin() + offset_in_output; +        thrust::copy_n( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, n, out ); +    } +    for( int ip = 0; ip < nvidia_short_prop_.size(); ip++ ) { +        const auto in = particles_to_inject->nvidia_short_prop_[ip]->begin() + offset_in_input; +        const auto out = nvidia_short_prop_[ip]->begin() + offset_in_output; +        thrust::copy_n( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, n, out ); +    } if( tracked ) { -        thrust::copy_if( thrust::device, -                         nvidia_id_.cbegin(), -                         nvidia_id_.cbegin() + nparts, -                         nvidia_cell_keys_.cbegin(), -                         cp_parts->nvidia_id_.begin(), -                         count_if_out() ); +        const auto in = particles_to_inject->nvidia_id_.begin() + offset_in_input; +        const auto out = nvidia_id_.begin() + offset_in_output; +        thrust::copy_n( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, n, out ); } - -    particles_to_move->copyFromDeviceToHost(); +    SMILEI_ACCELERATOR_DEVICE_SYNC(); } - // ----------------------------------------------------------------------------- //!
Erase `npart` particles from `ipart` // ----------------------------------------------------------------------------- @@ -1475,158 +950,49 @@ void nvidiaParticles::extractParticles( Particles* particles_to_move ) // std::begin( nvidia_position_[i] ), // std::begin( nvidia_position_[i] ) + nparts, // std::cbegin( nvidia_cell_keys_ ), -// count_if_out() ); +// cellKeyEquals<-1>() ); // } // //} // ----------------------------------------------------------------------------- -//! Erase particles leaving the patch object on device +//! Erase particles leaving the patch on device // ----------------------------------------------------------------------------- int nvidiaParticles::eraseLeavingParticles() { - const int position_dimension_count = nvidia_position_.size(); - const int nparts = gpu_nparts_; - const int nparts_to_remove = thrust::count_if( thrust::device, - nvidia_cell_keys_.begin(), - nvidia_cell_keys_.begin() + nparts, - count_if_out() ); - - - if( nparts_to_remove > 0 ) { - const auto first_particle = thrust::make_zip_iterator( thrust::make_tuple( nvidia_position_[0].begin(), - nvidia_momentum_[0].begin(), - nvidia_momentum_[1].begin(), - nvidia_momentum_[2].begin(), - nvidia_weight_.begin(), - nvidia_charge_.begin() ) ); - - const auto last_particle = first_particle + nparts; - - // Remove particles which leaves current patch - thrust::remove_if( thrust::device, - first_particle, - last_particle, - nvidia_cell_keys_.cbegin(), - count_if_out() ); - - // Remove the other position values depending on the simulation's grid - // dimensions - for( int i = 1; i < position_dimension_count; ++i ) { - thrust::remove_if( thrust::device, - nvidia_position_[i].begin(), - nvidia_position_[i].begin() + nparts, - nvidia_cell_keys_.cbegin(), - count_if_out() ); - } - - if( has_quantum_parameter ) { - thrust::remove_if( thrust::device, - nvidia_chi_.begin(), - nvidia_chi_.begin() + nparts, - nvidia_cell_keys_.cbegin(), - count_if_out() ); - } - - if( has_Monte_Carlo_process ) { - thrust::remove_if( thrust::device, - nvidia_tau_.begin(), - nvidia_tau_.begin() + nparts, - nvidia_cell_keys_.cbegin(), - count_if_out() ); - } - - if( tracked ) { - thrust::remove_if( thrust::device, - nvidia_id_.begin(), - nvidia_id_.begin() + nparts, - nvidia_cell_keys_.cbegin(), - count_if_out() ); - } - - // Update current number of particles - gpu_nparts_ -= nparts_to_remove; - - // Resize data structures (remove_if does not resize) - resize( gpu_nparts_ ); - } - - return nparts_to_remove; + const auto nremoved = eraseParticlesByPredicate( cellKeyBelow<0>(), 0 ); + deviceResize( gpu_nparts_ - nremoved ); + return nremoved; } -int nvidiaParticles::injectParticles( Particles* particles_to_inject ) +//! "Erase" particles but does not resize the arrays! 
+template +int nvidiaParticles::eraseParticlesByPredicate( Predicate pred, size_t offset ) { - const int nparts = gpu_nparts_; - - // Manage the recv data structure - nvidiaParticles* const cp_parts = static_cast( particles_to_inject ); - - const int nparts_add = cp_parts->gpu_nparts_; - const int tot_parts = nparts + nparts_add; - - const int position_dimension_count = nvidia_position_.size(); - - // Resize main data structure, if too small (copy_n do not resize) - resize( tot_parts ); - - const auto source_iterator_first = thrust::make_zip_iterator( thrust::make_tuple( cp_parts->nvidia_position_[0].cbegin(), - cp_parts->nvidia_momentum_[0].cbegin(), - cp_parts->nvidia_momentum_[1].cbegin(), - cp_parts->nvidia_momentum_[2].cbegin(), - cp_parts->nvidia_weight_.cbegin(), - cp_parts->nvidia_charge_.cbegin() ) ); - - // Iterator of the main data structure (once it has been resized) - const auto destination_iterator_first = thrust::make_zip_iterator( thrust::make_tuple( nvidia_position_[0].begin(), - nvidia_momentum_[0].begin(), - nvidia_momentum_[1].begin(), - nvidia_momentum_[2].begin(), - nvidia_weight_.begin(), - nvidia_charge_.begin() ) ) + - nparts; - - // Copy recv particles in main data structure - thrust::copy_n( thrust::device, - source_iterator_first, - nparts_add, - destination_iterator_first ); - - // Remove the other position values depending on the simulation's grid - // dimensions - for( int i = 1; i < position_dimension_count; ++i ) { - thrust::copy_n( thrust::device, - cp_parts->nvidia_position_[i].cbegin(), - nparts_add, - nvidia_position_[i].begin() + nparts ); + const auto keys = getPtrCellKeys(); + const int nparts_to_remove = thrust::count_if( thrust::device, keys + offset, keys + gpu_nparts_, pred ); + + // Copy the particles to the destination + // Using more memory, we could use the faster remove_copy_if + // NOTE: remove_if is stable. + for( auto prop: nvidia_double_prop_ ) { + const auto in = prop->begin(); + thrust::remove_if( SMILEI_ACCELERATOR_ASYNC_POLYCY, in + offset, in + gpu_nparts_, keys + offset, pred ); } - - if( has_quantum_parameter ) { - thrust::copy_n( thrust::device, - cp_parts->nvidia_chi_.cbegin(), - nparts_add, - nvidia_chi_.begin() + nparts ); + for( auto prop: nvidia_short_prop_ ) { + const auto in = prop->begin(); + thrust::remove_if( SMILEI_ACCELERATOR_ASYNC_POLYCY, in + offset, in + gpu_nparts_, keys + offset, pred ); } - - if( has_Monte_Carlo_process ) { - thrust::copy_n( thrust::device, - cp_parts->nvidia_tau_.cbegin(), - nparts_add, - nvidia_tau_.begin() + nparts ); - } - if( tracked ) { - thrust::copy_n( thrust::device, - cp_parts->nvidia_id_.cbegin(), - nparts_add, - nvidia_id_.begin() + nparts ); + const auto in = nvidia_id_.begin(); + thrust::remove_if( SMILEI_ACCELERATOR_ASYNC_POLYCY, in + offset, in + gpu_nparts_, keys + offset, pred ); } + SMILEI_ACCELERATOR_DEVICE_SYNC(); - // No more particles to move - cp_parts->resize( 0 ); - - return nparts_add; + return nparts_to_remove; } + // --------------------------------------------------------------------------------------------------------------------- //! Create n_additional_particles new particles at the end of vectors //! 
Fill the new elements with 0 @@ -1635,29 +1001,22 @@ void nvidiaParticles::createParticles( int n_additional_particles ) { int n_particles = gpu_nparts_; int new_size = n_particles + n_additional_particles; - for( unsigned int iprop=0 ; ipropbegin() + n_particles, prop->begin() + new_size, 0); } - - for( unsigned int iprop=0 ; ipropbegin() + n_particles, prop->begin() + new_size, 0); } - - // for( unsigned int iprop=0 ; iprop index( gpu_nparts_ ); + thrust::sequence( thrust::device, index.begin(), index.end() ); + thrust::sort_by_key( thrust::device, nvidia_cell_keys_.begin(), nvidia_cell_keys_.end(), index.begin() ); + + // Sort particles using thrust::gather, according to the sorting map + thrust::device_vector buffer( gpu_nparts_ ); + for( auto prop: nvidia_double_prop_ ) { + thrust::gather( thrust::device, index.begin(), index.end(), prop->begin(), buffer.begin() ); + prop->swap( buffer ); + } + buffer.clear(); + thrust::device_vector buffer_short( gpu_nparts_ ); + for( auto prop: nvidia_short_prop_ ) { + thrust::gather( thrust::device, index.begin(), index.end(), prop->begin(), buffer_short.begin() ); + prop->swap( buffer_short ); + } + buffer_short.clear(); + if( tracked ) { + thrust::device_vector buffer_uint64( gpu_nparts_ ); + thrust::gather( thrust::device, index.begin(), index.end(), nvidia_id_.begin(), buffer_uint64.begin() ); + nvidia_id_.swap( buffer_uint64 ); + buffer_uint64.clear(); + } +} + +//! Sort by cell_keys_ +//! This version is asynchronous, but requires a buffer of equal size to be provided +void nvidiaParticles::sortParticleByKey( nvidiaParticles &buffer ) +{ + // Make a sorting map using the cell keys (like numpy.argsort) + thrust::device_vector index( gpu_nparts_ ); + thrust::sequence( thrust::device, index.begin(), index.end() ); + thrust::sort_by_key( thrust::device, nvidia_cell_keys_.begin(), nvidia_cell_keys_.end(), index.begin() ); + + // Sort particles using thrust::gather, according to the sorting map + for( int ip = 0; ip < nvidia_double_prop_.size(); ip++ ) { + thrust::gather( SMILEI_ACCELERATOR_ASYNC_POLYCY, index.begin(), index.end(), nvidia_double_prop_[ip]->begin(), buffer.nvidia_double_prop_[ip]->begin() ); + } + for( int ip = 0; ip < nvidia_short_prop_.size(); ip++ ) { + thrust::gather( SMILEI_ACCELERATOR_ASYNC_POLYCY, index.begin(), index.end(), nvidia_short_prop_[ip]->begin(), buffer.nvidia_short_prop_[ip]->begin() ); + } + if( tracked ) { + thrust::gather( SMILEI_ACCELERATOR_ASYNC_POLYCY, index.begin(), index.end(), nvidia_id_.begin(), buffer.nvidia_id_.begin() ); + } + SMILEI_ACCELERATOR_DEVICE_SYNC(); + + // Swap properties with their buffer + for( int iprop = 0; iprop < nvidia_double_prop_.size(); iprop++ ) { + nvidia_double_prop_[iprop]->swap( *buffer.nvidia_double_prop_[iprop] ); + } + for( int iprop = 0; iprop < nvidia_short_prop_.size(); iprop++ ) { + nvidia_short_prop_[iprop]->swap( *buffer.nvidia_short_prop_[iprop] ); + } + if( tracked ) { + nvidia_id_.swap( buffer.nvidia_id_ ); + } +} + + +void nvidiaParticles::scatterParticles( nvidiaParticles &dest, const thrust::device_vector &index ) +{ + const auto n = std::min( (int) index.size(), gpu_nparts_ ); + for( int ip = 0; ip < nvidia_double_prop_.size(); ip++ ) { + const auto in = nvidia_double_prop_[ip]->begin(); + thrust::scatter( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, in + n, index.begin(), dest.nvidia_double_prop_[ip]->begin() ); + } + for( int ip = 0; ip < nvidia_short_prop_.size(); ip++ ) { + const auto in = nvidia_short_prop_[ip]->begin(); + thrust::scatter( 
SMILEI_ACCELERATOR_ASYNC_POLYCY, in, in + n, index.begin(), dest.nvidia_short_prop_[ip]->begin() ); + } + if( tracked ) { + const auto in = nvidia_id_.begin(); + thrust::scatter( SMILEI_ACCELERATOR_ASYNC_POLYCY, in, in + n, index.begin(), dest.nvidia_id_.begin() ); + } + SMILEI_ACCELERATOR_DEVICE_SYNC(); +} + int nvidiaParticles::prepareBinIndex() { if( first_index.size() == 0 ) { - // Some Particles object like particles_to_move do not have allocated - // bins, we skip theses. + // Some Particles object do not have allocated bins, we skip theses. return -1; } @@ -1740,7 +1180,10 @@ void nvidiaParticles::naiveImportAndSortParticles( nvidiaParticles* particles_to eraseLeavingParticles(); // Inject newly arrived particles in particles_to_inject - injectParticles( particles_to_inject ); + const size_t current_size = gpu_nparts_; + deviceResize( current_size + particles_to_inject->size() ); + pasteParticles( particles_to_inject, current_size, 0 ); + particles_to_inject->clear(); } extern "C" diff --git a/src/Particles/nvidiaParticles.h b/src/Particles/nvidiaParticles.h index 249a9fcf2..a02edffc8 100644 --- a/src/Particles/nvidiaParticles.h +++ b/src/Particles/nvidiaParticles.h @@ -34,33 +34,20 @@ class nvidiaParticles : public Particles //! Destructor for nvidiaParticles ~nvidiaParticles(); - //! Allocate the right amount of position and momentum dimensions - void resizeDimensions( unsigned int nDim ); - //! Reserve space for (particle_count * growth_factor) particles only if //! particle_count >= deviceCapacity(). Must be called after //! allocateDimensions() - void softReserve( unsigned int particle_count, float growth_factor = 1.3F ); - - //! Reserve space for particle_count particles. Must be called after - //! allocateDimensions() - void reserve( unsigned int particle_count ); - - //! Allocate particle_count particles. Must be called after - //! allocateDimensions() - //! Set the size (deviceSize) of nvidiaParticles to particle_count. - //! - void resize( unsigned int particle_count ); + void deviceReserve( unsigned int particle_count, float growth_factor = 1.3F ); //! Assures that the memory holden by the nvidia_[position|momentum|weight| //! charge|chi|tau|cell_keys]_ is freed. This is not something you can //! achieve via a naive resize. //! The pointers in nvidia_[double|short]_prop_ are not invalidated. //! - void free(); + void deviceFree(); //! Resize Particle vectors on device - void deviceResize(unsigned int new_size); + void deviceResize( unsigned int new_size ); //! Remove all particles void deviceClear(); @@ -78,7 +65,7 @@ class nvidiaParticles : public Particles void copyFromHostToDevice() override; //! Update the particles from device to host - void copyFromDeviceToHost() override; + void copyFromDeviceToHost( bool copy_keys = false ) override; unsigned int deviceCapacity() const override; @@ -113,21 +100,27 @@ class nvidiaParticles : public Particles }; // ----------------------------------------------------------------------------- - //! Extract particles from the Particles object and put - //! them in the Particles object `particles_to_move` + //! Move leaving particles to the buffers // ----------------------------------------------------------------------------- - void extractParticles( Particles* particles_to_move ) override; + void copyLeavingParticlesToBuffer( Particles* buffer ) override; + + template + void copyParticlesByPredicate( Particles* buffer, Predicate pred ); + + //! 
Resize & Copy particles from particles_to_inject to end of vectors + int addParticles( Particles* particles_to_inject ) override; + + //! Copy particles from particles_to_inject to specific offset + void pasteParticles( nvidiaParticles* particles_to_inject, size_t offset_out, size_t offset_in ); // ----------------------------------------------------------------------------- //! Erase particles leaving the patch object on device and returns the number of particle removed // ----------------------------------------------------------------------------- int eraseLeavingParticles() override; - // ----------------------------------------------------------------------------- - //! Inject particles from particles_to_move into *this and return he number of particle added - // ----------------------------------------------------------------------------- - int injectParticles( Particles* particles_to_inject ) override; - + template + int eraseParticlesByPredicate( Predicate pred, size_t offset ); + // --------------------------------------------------------------------------------------------------------------------- //! Create n_additional_particles new particles at the end of vectors //! Fill the new elements with 0 @@ -137,6 +130,14 @@ class nvidiaParticles : public Particles //! See the Particles class for documentation. void importAndSortParticles( Particles* particles_to_inject ) override; + //! Sort by cell_keys_ + //! This version synchronizes for every vector, but uses less buffers + void sortParticleByKey(); + //! This version is asynchronous, but requires a buffer of equal size to be provided + void sortParticleByKey( nvidiaParticles& buffer ); + + void scatterParticles( nvidiaParticles &particles_to_import, const thrust::device_vector &index ); + protected: //! Redefine first_index and last_index according to the binning algorithm //! used on GPU. 
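Note on the pattern introduced in nvidiaParticles above: copyParticlesByPredicate, eraseParticlesByPredicate and sortParticleByKey all apply the same cell-key stencil (or sorting map) to every per-particle property vector, so the structure-of-arrays layout stays aligned after a selective copy, removal or sort. The following is a minimal, self-contained Thrust sketch of the stencil-driven copy only; it is illustrative, not Smilei code: the key_below functor, the property names and the copy_selected helper are assumptions made for the example, whereas the real implementation loops over nvidia_double_prop_ / nvidia_short_prop_ and uses cellKeyBelow<N>() with the SMILEI_ACCELERATOR_ASYNC_POLYCY execution policy.

#include <thrust/device_vector.h>
#include <thrust/count.h>
#include <thrust/copy.h>
#include <thrust/execution_policy.h>

// Predicate on the per-particle cell key (hypothetical stand-in for cellKeyBelow<N>())
struct key_below {
    int threshold;
    __host__ __device__ bool operator()( int key ) const { return key < threshold; }
};

// Copy every particle whose cell key is < threshold into the destination vectors,
// using the cell keys as a common stencil so all properties stay aligned.
void copy_selected( const thrust::device_vector<int>    &keys,
                    const thrust::device_vector<double> &weight,
                    const thrust::device_vector<short>  &charge,
                    thrust::device_vector<double>       &dst_weight,
                    thrust::device_vector<short>        &dst_charge,
                    int threshold )
{
    const key_below pred{ threshold };

    // Count the selected particles, then resize the destination
    // (thrust::copy_if does not resize its output).
    const int n_out = thrust::count_if( thrust::device, keys.begin(), keys.end(), pred );
    dst_weight.resize( n_out );
    dst_charge.resize( n_out );

    // The same stencil drives the copy of every property, so the i-th selected
    // weight still belongs to the same particle as the i-th selected charge.
    thrust::copy_if( thrust::device, weight.begin(), weight.end(), keys.begin(), dst_weight.begin(), pred );
    thrust::copy_if( thrust::device, charge.begin(), charge.end(), keys.begin(), dst_charge.begin(), pred );
}

The same stencil idea carries over to removal (thrust::remove_if with the keys as stencil, as in eraseParticlesByPredicate) and to sorting (argsort the keys with thrust::sort_by_key on an index vector, then thrust::gather each property, as in sortParticleByKey).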
diff --git a/src/Patch/Patch.cpp b/src/Patch/Patch.cpp index b8ed401d9..ca76c6ece 100755 --- a/src/Patch/Patch.cpp +++ b/src/Patch/Patch.cpp @@ -445,7 +445,7 @@ void Patch::setLocationAndAllocateFields( Params ¶ms, DomainDecomposition *d Patch::~Patch() { -#ifdef SMILEI_ACCELERATOR_MODE +#ifdef SMILEI_ACCELERATOR_GPU deleteFieldsOnDevice(); #endif @@ -517,220 +517,155 @@ void Patch::updateMPIenv( SmileiMPI *smpi ) // --------------------------------------------------------------------------------------------------------------------- void Patch::cleanMPIBuffers( int ispec, Params ¶ms ) { - int ndim = params.nDim_field; + size_t ndim = params.nDim_field; + SpeciesMPIbuffers &buffer = vecSpecies[ispec]->MPI_buffer_; - for( int iDim=0 ; iDim < ndim ; iDim++ ) { + for( size_t iDim=0 ; iDim < ndim ; iDim++ ) { for( int iNeighbor=0 ; iNeighborMPI_buffer_.partRecv[iDim][iNeighbor].clear();//resize(0,ndim); - vecSpecies[ispec]->MPI_buffer_.partSend[iDim][iNeighbor].clear();//resize(0,ndim); - vecSpecies[ispec]->MPI_buffer_.part_index_send[iDim][iNeighbor].clear(); - //vecSpecies[ispec]->MPI_buffer_.part_index_send[iDim][iNeighbor].resize(0); - vecSpecies[ispec]->MPI_buffer_.part_index_recv_sz[iDim][iNeighbor] = 0; + buffer.partRecv[iDim][iNeighbor]->clear(); + buffer.partSend[iDim][iNeighbor]->clear(); } } } // cleanMPIBuffers // --------------------------------------------------------------------------------------------------------------------- -// Split particles Id to send in per direction and per patch neighbor dedicated buffers -// Apply periodicity if necessary +// Copy particles to be exchanged to buffers // --------------------------------------------------------------------------------------------------------------------- -void Patch::initExchParticles( int ispec, Params ¶ms ) +void Patch::copyExchParticlesToBuffers( int ispec, Params ¶ms ) { - Particles &cuParticles = ( *vecSpecies[ispec]->particles_to_move ); - int ndim = params.nDim_field; - int idim, check; -// double xmax[3]; - - for( int iDim=0 ; iDim < ndim ; iDim++ ) { - for( int iNeighbor=0 ; iNeighborMPI_buffer_.partRecv[iDim][iNeighbor].clear();//resize(0,ndim); - vecSpecies[ispec]->MPI_buffer_.partSend[iDim][iNeighbor].clear();//resize(0,ndim); - vecSpecies[ispec]->MPI_buffer_.part_index_send[iDim][iNeighbor].resize( 0 ); - vecSpecies[ispec]->MPI_buffer_.part_index_recv_sz[iDim][iNeighbor] = 0; - } + SpeciesMPIbuffers &buffer = vecSpecies[ispec]->MPI_buffer_; + Particles &part = *vecSpecies[ispec]->particles; + + cleanMPIBuffers( ispec, params ); + + // Make a list of buffers + vector copy( params.nDim_field*2, false ); + vector sendBuffer( params.nDim_field*2, nullptr ); + for( size_t iDim = 0; iDim < params.nDim_field; iDim++ ) { + copy[2*iDim+0] = neighbor_[iDim][0] != MPI_PROC_NULL; + copy[2*iDim+1] = neighbor_[iDim][1] != MPI_PROC_NULL; + sendBuffer[2*iDim+0] = buffer.partSend[iDim][0]; + sendBuffer[2*iDim+1] = buffer.partSend[iDim][1]; } - - int n_part_send = cuParticles.size(); - - int iPart; - - // Define where particles are going - //Put particles in the send buffer it belongs to. Priority to lower dimensions. - if( params.geometry != "AMcylindrical" ) { - for( int i=0 ; iMPI_buffer_.part_index_send[idim][0].push_back( iPart ); - } - //If particle is outside of the global domain (has no neighbor), it will not be put in a send buffer and will simply be deleted. 
- check = 1; - } else if( cuParticles.position( idim, iPart ) >= max_local_[idim] ) { - if( neighbor_[idim][1]!=MPI_PROC_NULL ) { - vecSpecies[ispec]->MPI_buffer_.part_index_send[idim][1].push_back( iPart ); - } - check = 1; - } - idim++; - } - } - } else { //if (geometry == "AMcylindrical") - double r_min2, r_max2; - r_max2 = max_local_[1] * max_local_[1] ; - r_min2 = min_local_[1] * min_local_[1] ; - for( int i=0 ; iboundary_conditions_[0][0]!="periodic" ) ) { - continue; - } - vecSpecies[ispec]->MPI_buffer_.part_index_send[0][0].push_back( iPart ); - //MESSAGE("Sending particle to the left x= " << cuParticles.position(0,iPart) << " xmin = " << min_local_[0] ); - } - //If particle is outside of the global domain (has no neighbor), it will not be put in a send buffer and will simply be deleted. - } else if( cuParticles.position( 0, iPart ) >= max_local_[0] ) { - if ( (Pcoordinates[0]==params.number_of_patches[0]-1) && ( vecSpecies[ispec]->boundary_conditions_[0][1]!="periodic" ) ) { - continue; - } - if( neighbor_[0][1]!=MPI_PROC_NULL ) { - vecSpecies[ispec]->MPI_buffer_.part_index_send[0][1].push_back( iPart ); - // MESSAGE("Sending particle to the right x= " << cuParticles.position(0,iPart) << " xmax = " << max_local_[0] ); - } - } else if( cuParticles.distance2ToAxis( iPart ) < r_min2 ) { - if( neighbor_[1][0]!=MPI_PROC_NULL ) { - vecSpecies[ispec]->MPI_buffer_.part_index_send[1][0].push_back( iPart ); - //MESSAGE("Sending particle to the south r= " << cuParticles.distance2ToAxis(iPart) << " rmin2 = " << r_min2 ); - } - } else if( cuParticles.distance2ToAxis( iPart ) >= r_max2 ) { - if( neighbor_[1][1]!=MPI_PROC_NULL ) { - vecSpecies[ispec]->MPI_buffer_.part_index_send[1][1].push_back( iPart ); - //MESSAGE("Sending particle to the north r= " << cuParticles.distance2ToAxis(iPart) << " rmax2 = " << r_max2 << " rmin2= " << r_min2 ); - } - } - - } + if( params.geometry == "AMcylindrical" ) { + copy[0] = copy[0] && ( Pcoordinates[0]!=0 || vecSpecies[ispec]->boundary_conditions_[0][0]=="periodic" ); + copy[1] = copy[1] && ( Pcoordinates[0]!=params.number_of_patches[0]-1 || vecSpecies[ispec]->boundary_conditions_[0][1]=="periodic" ); } - -} // initExchParticles(... iDim) + + part.copyLeavingParticlesToBuffers( copy, sendBuffer ); + +} // copyExchParticlesToBuffers(... 
iDim) // --------------------------------------------------------------------------------------------------------------------- -// For direction iDim, start exchange of number of particles -// - vecPatch : used for intra-MPI process comm (direct copy using Particels::copyParticles) -// - smpi : inhereted from previous SmileiMPI::exchangeParticles() +// Exchange number of particles to exchange to establish or not a communication // --------------------------------------------------------------------------------------------------------------------- void Patch::exchNbrOfParticles( SmileiMPI *smpi, int ispec, Params &, int iDim, VectorPatch *vecPatch ) { - int h0 = ( *vecPatch )( 0 )->hindex; - /********************************************************************************/ - // Exchange number of particles to exchange to establish or not a communication - /********************************************************************************/ + SpeciesMPIbuffers &buffer = vecSpecies[ispec]->MPI_buffer_; + for( int iNeighbor=0 ; iNeighborsize(); + + // Send number of particles from neighbor if( neighbor_[iDim][iNeighbor]!=MPI_PROC_NULL ) { - vecSpecies[ispec]->MPI_buffer_.part_index_send_sz[iDim][iNeighbor] = ( vecSpecies[ispec]->MPI_buffer_.part_index_send[iDim][iNeighbor] ).size(); - if( is_a_MPI_neighbor( iDim, iNeighbor ) ) { - //If neighbour is MPI ==> I send him the number of particles I'll send later. int local_hindex = hindex - vecPatch->refHindex_; int tag = buildtag( local_hindex, iDim+1, iNeighbor+3 ); - MPI_Isend( &( vecSpecies[ispec]->MPI_buffer_.part_index_send_sz[iDim][iNeighbor] ), 1, MPI_INT, MPI_neighbor_[iDim][iNeighbor], tag, MPI_COMM_WORLD, &( vecSpecies[ispec]->MPI_buffer_.srequest[iDim][iNeighbor] ) ); + MPI_Isend( &buffer.partSendSize[iDim][iNeighbor], 1, MPI_INT, MPI_neighbor_[iDim][iNeighbor], tag, MPI_COMM_WORLD, &buffer.srequest[iDim][iNeighbor] ); } else { - //Else, I directly set the receive size to the correct value. - ( *vecPatch )( neighbor_[iDim][iNeighbor]- h0 )->vecSpecies[ispec]->MPI_buffer_.part_index_recv_sz[iDim][( iNeighbor+1 )%2] = vecSpecies[ispec]->MPI_buffer_.part_index_send_sz[iDim][iNeighbor]; + // If the destination is in the same MPI, directly set the number at destination + int destination_hindex = neighbor_[iDim][iNeighbor] - vecPatch->refHindex_; + SpeciesMPIbuffers &destination_buffer = ( *vecPatch )( destination_hindex )->vecSpecies[ispec]->MPI_buffer_; + destination_buffer.partRecvSize[iDim][iOppositeNeighbor] = buffer.partSendSize[iDim][iNeighbor]; } - } // END of Send - - if( neighbor_[iDim][( iNeighbor+1 )%2]!=MPI_PROC_NULL ) { - if( is_a_MPI_neighbor( iDim, ( iNeighbor+1 )%2 ) ) { - //If other neighbour is MPI ==> I receive the number of particles I'll receive later. 
- int local_hindex = neighbor_[iDim][( iNeighbor+1 )%2] - smpi->patch_refHindexes[ MPI_neighbor_[iDim][( iNeighbor+1 )%2] ]; + } + + // Receive number of particles from neighbor + if( neighbor_[iDim][iOppositeNeighbor]!=MPI_PROC_NULL ) { + if( is_a_MPI_neighbor( iDim, iOppositeNeighbor ) ) { + int local_hindex = neighbor_[iDim][iOppositeNeighbor] - smpi->patch_refHindexes[ MPI_neighbor_[iDim][iOppositeNeighbor] ]; int tag = buildtag( local_hindex, iDim+1, iNeighbor+3 ); - MPI_Irecv( &( vecSpecies[ispec]->MPI_buffer_.part_index_recv_sz[iDim][( iNeighbor+1 )%2] ), 1, MPI_INT, MPI_neighbor_[iDim][( iNeighbor+1 )%2], tag, MPI_COMM_WORLD, &( vecSpecies[ispec]->MPI_buffer_.rrequest[iDim][( iNeighbor+1 )%2] ) ); + MPI_Irecv( &buffer.partRecvSize[iDim][iOppositeNeighbor], 1, MPI_INT, MPI_neighbor_[iDim][iOppositeNeighbor], tag, MPI_COMM_WORLD, &buffer.rrequest[iDim][iOppositeNeighbor] ); } } - }//end loop on nb_neighbors. - + + } + } // exchNbrOfParticles(... iDim) +// --------------------------------------------------------------------------------------------------------------------- +// Wait for end of communications over number of particles +// --------------------------------------------------------------------------------------------------------------------- void Patch::endNbrOfParticles( int ispec, int iDim ) { - Particles &cuParticles = ( *vecSpecies[ispec]->particles_to_move ); - - /********************************************************************************/ - // Wait for end of communications over number of particles - /********************************************************************************/ + SpeciesMPIbuffers &buffer = vecSpecies[ispec]->MPI_buffer_; + for( int iNeighbor=0 ; iNeighborMPI_buffer_.srequest[iDim][iNeighbor] ), &( sstat[iNeighbor] ) ); - } + int iOppositeNeighbor = ( iNeighbor+1 )%2; + + MPI_Status sstat[2]; + MPI_Status rstat[2]; + if( is_a_MPI_neighbor( iDim, iNeighbor ) ) { + MPI_Wait( &( buffer.srequest[iDim][iNeighbor] ), &( sstat[iNeighbor] ) ); } - if( neighbor_[iDim][( iNeighbor+1 )%2]!=MPI_PROC_NULL ) { - if( is_a_MPI_neighbor( iDim, ( iNeighbor+1 )%2 ) ) { - MPI_Wait( &( vecSpecies[ispec]->MPI_buffer_.rrequest[iDim][( iNeighbor+1 )%2] ), &( rstat[( iNeighbor+1 )%2] ) ); - if( vecSpecies[ispec]->MPI_buffer_.part_index_recv_sz[iDim][( iNeighbor+1 )%2]!=0 ) { - //If I receive particles over MPI, I initialize my receive buffer with the appropriate size. - vecSpecies[ispec]->MPI_buffer_.partRecv[iDim][( iNeighbor+1 )%2].initialize( vecSpecies[ispec]->MPI_buffer_.part_index_recv_sz[iDim][( iNeighbor+1 )%2], cuParticles ); - } - } + if( is_a_MPI_neighbor( iDim, iOppositeNeighbor ) ) { + MPI_Wait( &( buffer.rrequest[iDim][iOppositeNeighbor] ), &( rstat[iOppositeNeighbor] ) ); } } - } // END endNbrOfParticles(... 
iDim) // --------------------------------------------------------------------------------------------------------------------- -// For direction iDim, finalize receive of number of particles and really send particles +// For direction iDim, prepare particles to be sent // - vecPatch : used for intra-MPI process comm (direct copy using Particels::copyParticles) // - smpi : used smpi->periods_ // --------------------------------------------------------------------------------------------------------------------- void Patch::prepareParticles( SmileiMPI *smpi, int ispec, Params ¶ms, int iDim, VectorPatch *vecPatch ) { - Particles &cuParticles = ( *vecSpecies[ispec]->particles_to_move ); - - int n_part_send; - int h0 = ( *vecPatch )( 0 )->hindex; double x_max = params.cell_length[iDim]*( params.global_size_[iDim] ); - + SpeciesMPIbuffers &buffer = vecSpecies[ispec]->MPI_buffer_; + for( int iNeighbor=0 ; iNeighborMPI_buffer_.part_index_send[iDim][iNeighbor] ).size(); - if( ( neighbor_[iDim][iNeighbor]!=MPI_PROC_NULL ) && ( n_part_send!=0 ) ) { - // Enabled periodicity - if( smpi->periods_[iDim]==1 ) { - for( int iPart=0 ; iPartMPI_buffer_.part_index_send[iDim][iNeighbor][iPart] ) < 0. ) ) { - cuParticles.position( iDim, vecSpecies[ispec]->MPI_buffer_.part_index_send[iDim][iNeighbor][iPart] ) += x_max; - } else if( ( iNeighbor==1 ) && ( Pcoordinates[iDim] == params.number_of_patches[iDim]-1 ) && ( cuParticles.position( iDim, vecSpecies[ispec]->MPI_buffer_.part_index_send[iDim][iNeighbor][iPart] ) >= x_max ) ) { - cuParticles.position( iDim, vecSpecies[ispec]->MPI_buffer_.part_index_send[iDim][iNeighbor][iPart] ) -= x_max; + + Particles &partSend = *buffer.partSend[iDim][iNeighbor]; + + // Enabled periodicity + if( neighbor_[iDim][iNeighbor] != MPI_PROC_NULL ) { + if( partSend.size() > 0 && smpi->periods_[iDim]==1 ) { + if( iNeighbor == 0 && Pcoordinates[iDim] == 0 ) { + for( size_t iPart=0; iPart < partSend.size(); iPart++ ) { + if( partSend.position( iDim, iPart ) < 0. ) { + partSend.position( iDim, iPart ) += x_max; + } + } + } + if( iNeighbor == 1 && Pcoordinates[iDim] == params.number_of_patches[iDim]-1 ) { + for( size_t iPart=0; iPart < partSend.size(); iPart++ ) { + if( partSend.position( iDim, iPart ) >= x_max ) { + partSend.position( iDim, iPart ) -= x_max; + } } } } - // Send particles + + // Initialize receive buffer with the appropriate size if( is_a_MPI_neighbor( iDim, iNeighbor ) ) { - // If MPI comm, first copy particles in the sendbuffer - for( int iPart=0 ; iPartMPI_buffer_.part_index_send[iDim][iNeighbor][iPart], vecSpecies[ispec]->MPI_buffer_.partSend[iDim][iNeighbor] ); + if( buffer.partRecvSize[iDim][iNeighbor]!=0 ) { + buffer.partRecv[iDim][iNeighbor]->initialize( buffer.partRecvSize[iDim][iNeighbor], *vecSpecies[ispec]->particles ); } + // Swap particles to other patch directly if it belongs to the same MPI } else { - //If not MPI comm, copy particles directly in the receive buffer - for( int iPart=0 ; iPartMPI_buffer_.part_index_send[iDim][iNeighbor][iPart], ( ( *vecPatch )( neighbor_[iDim][iNeighbor]- h0 )->vecSpecies[ispec]->MPI_buffer_.partRecv[iDim][( iNeighbor+1 )%2] ) ); - } + int iOppositeNeighbor = ( iNeighbor+1 )%2; + SpeciesMPIbuffers &neighbor_buffer = ( *vecPatch )( neighbor_[iDim][iNeighbor]- vecPatch->refHindex_ )->vecSpecies[ispec]->MPI_buffer_; + swap( buffer.partSend[iDim][iNeighbor], neighbor_buffer.partRecv[iDim][iOppositeNeighbor] ); } - } // END of Send - + } + } // END for iNeighbor } // END prepareParticles(... 
iDim) @@ -738,169 +673,133 @@ void Patch::prepareParticles( SmileiMPI *smpi, int ispec, Params ¶ms, int iD void Patch::exchParticles( SmileiMPI *smpi, int ispec, Params &, int iDim, VectorPatch *vecPatch ) { - int n_part_send, n_part_recv; - - for( int iNeighbor=0 ; iNeighborMPI_buffer_.part_index_send[iDim][iNeighbor] ).size(); - if( ( neighbor_[iDim][iNeighbor]!=MPI_PROC_NULL ) && ( n_part_send!=0 ) ) { - // Send particles - if( is_a_MPI_neighbor( iDim, iNeighbor ) ) { - // Then send particles - int local_hindex = hindex - vecPatch->refHindex_; - int tag = buildtag( local_hindex, iDim+1, iNeighbor+3 ); - vecSpecies[ispec]->typePartSend[( iDim*2 )+iNeighbor] = smpi->createMPIparticles( &( vecSpecies[ispec]->MPI_buffer_.partSend[iDim][iNeighbor] ) ); - MPI_Isend( &( ( vecSpecies[ispec]->MPI_buffer_.partSend[iDim][iNeighbor] ).position( 0, 0 ) ), 1, vecSpecies[ispec]->typePartSend[( iDim*2 )+iNeighbor], MPI_neighbor_[iDim][iNeighbor], tag, MPI_COMM_WORLD, &( vecSpecies[ispec]->MPI_buffer_.srequest[iDim][iNeighbor] ) ); - } - } // END of Send - - n_part_recv = vecSpecies[ispec]->MPI_buffer_.part_index_recv_sz[iDim][( iNeighbor+1 )%2]; - if( ( neighbor_[iDim][( iNeighbor+1 )%2]!=MPI_PROC_NULL ) && ( n_part_recv!=0 ) ) { - if( is_a_MPI_neighbor( iDim, ( iNeighbor+1 )%2 ) ) { - // If MPI comm, receive particles in the recv buffer previously initialized. - vecSpecies[ispec]->typePartRecv[( iDim*2 )+iNeighbor] = smpi->createMPIparticles( &( vecSpecies[ispec]->MPI_buffer_.partRecv[iDim][( iNeighbor+1 )%2] ) ); - int local_hindex = neighbor_[iDim][( iNeighbor+1 )%2] - smpi->patch_refHindexes[ MPI_neighbor_[iDim][( iNeighbor+1 )%2] ]; - int tag = buildtag( local_hindex, iDim+1, iNeighbor+3 ); - MPI_Irecv( &( ( vecSpecies[ispec]->MPI_buffer_.partRecv[iDim][( iNeighbor+1 )%2] ).position( 0, 0 ) ), 1, vecSpecies[ispec]->typePartRecv[( iDim*2 )+iNeighbor], MPI_neighbor_[iDim][( iNeighbor+1 )%2], tag, MPI_COMM_WORLD, &( vecSpecies[ispec]->MPI_buffer_.rrequest[iDim][( iNeighbor+1 )%2] ) ); - } - - } // END of Recv - - } // END for iNeighbor - + SpeciesMPIbuffers &buffer = vecSpecies[ispec]->MPI_buffer_; + + for( int iNeighbor=0; iNeighborrefHindex_; + int tag = buildtag( local_hindex, iDim+1, iNeighbor+3 ); + vecSpecies[ispec]->typePartSend[( iDim*2 )+iNeighbor] = smpi->createMPIparticles( &partSend ); + MPI_Isend( &partSend.position( 0, 0 ), 1, vecSpecies[ispec]->typePartSend[( iDim*2 )+iNeighbor], MPI_neighbor_[iDim][iNeighbor], tag, MPI_COMM_WORLD, &( buffer.srequest[iDim][iNeighbor] ) ); + } + + // Receive + int iOppositeNeighbor = ( iNeighbor+1 )%2; + Particles &partRecv = *buffer.partRecv[iDim][iOppositeNeighbor]; + if( partRecv.size() != 0 && is_a_MPI_neighbor( iDim, iOppositeNeighbor ) ) { + vecSpecies[ispec]->typePartRecv[( iDim*2 )+iNeighbor] = smpi->createMPIparticles( &partRecv ); + int local_hindex = neighbor_[iDim][iOppositeNeighbor] - smpi->patch_refHindexes[ MPI_neighbor_[iDim][iOppositeNeighbor] ]; + int tag = buildtag( local_hindex, iDim+1, iNeighbor+3 ); + MPI_Irecv( &partRecv.position( 0, 0 ), 1, vecSpecies[ispec]->typePartRecv[( iDim*2 )+iNeighbor], MPI_neighbor_[iDim][iOppositeNeighbor], tag, MPI_COMM_WORLD, &buffer.rrequest[iDim][iOppositeNeighbor] ); + } + + } + } // END exchParticles(... iDim) // --------------------------------------------------------------------------------------------------------------------- -// For direction iDim, finalize receive of particles, temporary store particles if diagonalParticles -// And store recv particles at their definitive place. 
-// Call Patch::cleanupSentParticles -// - vecPatch : used for intra-MPI process comm (direct copy using Particels::copyParticles) -// - smpi : used smpi->periods_ +// For direction iDim, wait receive of particles // --------------------------------------------------------------------------------------------------------------------- -void Patch::finalizeExchParticles( int ispec, int iDim ) +void Patch::waitExchParticles( int ispec, int iDim ) { - - int n_part_send, n_part_recv; - - /********************************************************************************/ - // Wait for end of communications over Particles - /********************************************************************************/ + SpeciesMPIbuffers &buffer = vecSpecies[ispec]->MPI_buffer_; + for( int iNeighbor=0 ; iNeighborMPI_buffer_.part_index_send[iDim][iNeighbor].size(); - n_part_recv = vecSpecies[ispec]->MPI_buffer_.part_index_recv_sz[iDim][( iNeighbor+1 )%2]; - - if( ( neighbor_[iDim][iNeighbor]!=MPI_PROC_NULL ) && ( n_part_send!=0 ) ) { - if( is_a_MPI_neighbor( iDim, iNeighbor ) ) { - MPI_Wait( &( vecSpecies[ispec]->MPI_buffer_.srequest[iDim][iNeighbor] ), &( sstat[iNeighbor] ) ); - MPI_Type_free( &( vecSpecies[ispec]->typePartSend[( iDim*2 )+iNeighbor] ) ); - } + + int iOppositeNeighbor = ( iNeighbor+1 )%2; + Particles &partSend = *buffer.partSend[iDim][iNeighbor]; + Particles &partRecv = *buffer.partRecv[iDim][iOppositeNeighbor]; + + if( partSend.size() != 0 && is_a_MPI_neighbor( iDim, iNeighbor ) ) { + MPI_Wait( &buffer.srequest[iDim][iNeighbor], &sstat[iNeighbor] ); + MPI_Type_free( &vecSpecies[ispec]->typePartSend[( iDim*2 )+iNeighbor] ); } - if( ( neighbor_[iDim][( iNeighbor+1 )%2]!=MPI_PROC_NULL ) && ( n_part_recv!=0 ) ) { - if( is_a_MPI_neighbor( iDim, ( iNeighbor+1 )%2 ) ) { - MPI_Wait( &( vecSpecies[ispec]->MPI_buffer_.rrequest[iDim][( iNeighbor+1 )%2] ), &( rstat[( iNeighbor+1 )%2] ) ); - MPI_Type_free( &( vecSpecies[ispec]->typePartRecv[( iDim*2 )+iNeighbor] ) ); - } + if( partRecv.size() != 0 && is_a_MPI_neighbor( iDim, iOppositeNeighbor ) ) { + MPI_Wait( &buffer.rrequest[iDim][iOppositeNeighbor], &rstat[iOppositeNeighbor] ); + MPI_Type_free( &vecSpecies[ispec]->typePartRecv[( iDim*2 )+iNeighbor] ); } } } void Patch::cornersParticles( int ispec, Params ¶ms, int iDim ) { - int ndim = params.nDim_field; - int idim, check; - - Particles &cuParticles = ( *vecSpecies[ispec]->particles_to_move ); - - int n_part_recv; - - /********************************************************************************/ - // Wait for end of communications over Particles - /********************************************************************************/ + SpeciesMPIbuffers &buffer = vecSpecies[ispec]->MPI_buffer_; + + // No need to treat diag particles at last dimension + if( iDim == ndim-1 ) { + return; + } + for( int iNeighbor=0 ; iNeighborMPI_buffer_.part_index_recv_sz[iDim][( iNeighbor+1 )%2]; - - if( ( neighbor_[iDim][( iNeighbor+1 )%2]!=MPI_PROC_NULL ) && ( n_part_recv!=0 ) ) { - - // Treat diagonalParticles - if( iDim < ndim-1 ) { // No need to treat diag particles at last dimension. - if( params.geometry != "AMcylindrical" ) { - for( int iPart=n_part_recv-1 ; iPart>=0; iPart-- ) { - check = 0; - idim = iDim+1;//We check next dimension - while( check == 0 && idimMPI_buffer_.partRecv[iDim][( iNeighbor+1 )%2] ).position( idim, iPart ) < min_local_[idim] ) { - if( neighbor_[idim][0]!=MPI_PROC_NULL ) { //if neighbour exists - //... copy it at the back of the local particle vector ... 
- ( vecSpecies[ispec]->MPI_buffer_.partRecv[iDim][( iNeighbor+1 )%2] ).copyParticle( iPart, cuParticles ); - //...adjust particles->last_index or cell_keys ... - //vecSpecies[ispec]->addSpaceForOneParticle(); - //... and add its index to the particles to be sent later... - vecSpecies[ispec]->MPI_buffer_.part_index_send[idim][0].push_back( cuParticles.size()-1 ); - } - //Remove it from receive buffer. - ( vecSpecies[ispec]->MPI_buffer_.partRecv[iDim][( iNeighbor+1 )%2] ).eraseParticle( iPart ); - vecSpecies[ispec]->MPI_buffer_.part_index_recv_sz[iDim][( iNeighbor+1 )%2]--; - check = 1; - } - //Other side of idim - else if( ( vecSpecies[ispec]->MPI_buffer_.partRecv[iDim][( iNeighbor+1 )%2] ).position( idim, iPart ) >= max_local_[idim] ) { - if( neighbor_[idim][1]!=MPI_PROC_NULL ) { //if neighbour exists - ( vecSpecies[ispec]->MPI_buffer_.partRecv[iDim][( iNeighbor+1 )%2] ).copyParticle( iPart, cuParticles ); - //...adjust particles->last_index or cell_keys ... - //vecSpecies[ispec]->addSpaceForOneParticle(); - vecSpecies[ispec]->MPI_buffer_.part_index_send[idim][1].push_back( cuParticles.size()-1 ); - } - ( vecSpecies[ispec]->MPI_buffer_.partRecv[iDim][( iNeighbor+1 )%2] ).eraseParticle( iPart ); - vecSpecies[ispec]->MPI_buffer_.part_index_recv_sz[iDim][( iNeighbor+1 )%2]--; - check = 1; - } - idim++; + + Particles &partRecv = *buffer.partRecv[iDim][iNeighbor]; + + vector> indices_corner_min( ndim-iDim-1 ); + vector> indices_corner_max( ndim-iDim-1 ); + vector indices_all_corners; + + if( neighbor_[iDim][iNeighbor] != MPI_PROC_NULL && partRecv.size() != 0 ) { + + // Find corner particles and store their indices + if( params.geometry != "AMcylindrical" ) { + + for( size_t iPart = 0; iPart < partRecv.size(); iPart++ ) { + for( size_t otherDim = iDim+1; otherDim < (size_t) ndim; otherDim++ ) { + if( partRecv.position( otherDim, iPart ) < min_local_[otherDim] ) { + indices_corner_min[otherDim-iDim-1].push_back( iPart ); + indices_all_corners.push_back( iPart ); + break; + } else if( partRecv.position( otherDim, iPart ) >= max_local_[otherDim] ) { + indices_corner_max[otherDim-iDim-1].push_back( iPart ); + indices_all_corners.push_back( iPart ); + break; } } - } else { //In AM geometry - //In this case, iDim = 0 and idim = iDim + 1 = 1. We only have to check potential comms along R. - double r_min2, r_max2; - r_min2 = min_local_[1]*min_local_[1]; - r_max2 = max_local_[1]*max_local_[1]; - for( int iPart=n_part_recv-1 ; iPart>=0; iPart-- ) { - //MESSAGE("test particle diag r2 = " << (vecSpecies[ispec]->MPI_buffer_.partRecv[0][(iNeighbor+1)%2]).distance2ToAxis(iPart) << "rmin2 = " << r_min2 << " rmax2 = " << r_max2 ); - if( ( vecSpecies[ispec]->MPI_buffer_.partRecv[0][( iNeighbor+1 )%2] ).distance2ToAxis( iPart ) < r_min2 ) { - if( neighbor_[1][0]!=MPI_PROC_NULL ) { //if neighbour exists - //... copy it at the back of the local particle vector ... - ( vecSpecies[ispec]->MPI_buffer_.partRecv[0][( iNeighbor+1 )%2] ).copyParticle( iPart, cuParticles ); - //...adjust particles->last_index or cell_keys ... - //vecSpecies[ispec]->addSpaceForOneParticle(); - //... and add its index to the particles to be sent later... - vecSpecies[ispec]->MPI_buffer_.part_index_send[1][0].push_back( cuParticles.size()-1 ); - //..without forgeting to add it to the list of particles to clean. - } - //Remove it from receive buffer. 
- ( vecSpecies[ispec]->MPI_buffer_.partRecv[0][( iNeighbor+1 )%2] ).eraseParticle( iPart ); - vecSpecies[ispec]->MPI_buffer_.part_index_recv_sz[0][( iNeighbor+1 )%2]--; - } - //Other side of idim - else if( ( vecSpecies[ispec]->MPI_buffer_.partRecv[0][( iNeighbor+1 )%2] ).distance2ToAxis( iPart ) >= r_max2 ) { - if( neighbor_[1][1]!=MPI_PROC_NULL ) { //if neighbour exists - //MESSAGE("particle diag +R"); - ( vecSpecies[ispec]->MPI_buffer_.partRecv[0][( iNeighbor+1 )%2] ).copyParticle( iPart, cuParticles ); - //...adjust particles->last_index or cell_keys ... - //vecSpecies[ispec]->addSpaceForOneParticle(); - vecSpecies[ispec]->MPI_buffer_.part_index_send[1][1].push_back( cuParticles.size()-1 ); - } - ( vecSpecies[ispec]->MPI_buffer_.partRecv[0][( iNeighbor+1 )%2] ).eraseParticle( iPart ); - vecSpecies[ispec]->MPI_buffer_.part_index_recv_sz[0][( iNeighbor+1 )%2]--; - } + } + + } else { //In AM geometry + + //In this case, iDim = 0 and idim = iDim + 1 = 1. We only have to check potential comms along R. + double r_min2 = min_local_[1]*min_local_[1]; + double r_max2 = max_local_[1]*max_local_[1]; + + for( size_t iPart = 0; iPart < partRecv.size(); iPart++ ) { + if( partRecv.distance2ToAxis( iPart ) < r_min2 ) { + indices_corner_min[0].push_back( iPart ); + indices_all_corners.push_back( iPart ); + break; + } else if( partRecv.distance2ToAxis( iPart ) >= r_max2 ) { + indices_corner_max[0].push_back( iPart ); + indices_all_corners.push_back( iPart ); + break; } } - }//If not last dim for diagonal particles. + + } + + // Copy corner particles to the end of the particles to be sent for the following dimension + for( size_t otherDim = iDim+1; otherDim < (size_t) ndim; otherDim++ ) { + if( indices_corner_min[otherDim-iDim-1].size() > 0 && neighbor_[otherDim][0] != MPI_PROC_NULL ) { + partRecv.copyParticles( indices_corner_min[otherDim-iDim-1], *buffer.partSend[otherDim][0], buffer.partSend[otherDim][0]->size() ); + } + if( indices_corner_max[otherDim-iDim-1].size() > 0 && neighbor_[otherDim][1] != MPI_PROC_NULL ) { + partRecv.copyParticles( indices_corner_max[otherDim-iDim-1], *buffer.partSend[otherDim][1], buffer.partSend[otherDim][1]->size() ); + } + } + + // Erase corner particles from the current recv array + if( indices_all_corners.size() > 0 ) { + partRecv.eraseParticles( indices_all_corners ); + } + } //If received something } //loop i Neighbor } @@ -925,22 +824,20 @@ void Patch::importAndSortParticles( int ispec, Params ¶ms ) void Patch::cleanParticlesOverhead( Params ¶ms ) { - int ndim = params.nDim_field; + for( unsigned int ispec=0 ; ispecparticles ); - - for( int idim = 0; idim < ndim; idim++ ) { + SpeciesMPIbuffers &buffer = vecSpecies[ispec]->MPI_buffer_; + + for( size_t idim = 0; idim < params.nDim_field; idim++ ) { for( int iNeighbor=0 ; iNeighborMPI_buffer_.partRecv[idim][iNeighbor].clear(); - vecSpecies[ispec]->MPI_buffer_.partRecv[idim][iNeighbor].shrinkToFit( ); - vecSpecies[ispec]->MPI_buffer_.partSend[idim][iNeighbor].clear(); - vecSpecies[ispec]->MPI_buffer_.partSend[idim][iNeighbor].shrinkToFit( ); - vecSpecies[ispec]->MPI_buffer_.part_index_send[idim][iNeighbor].clear(); - vector( vecSpecies[ispec]->MPI_buffer_.part_index_send[idim][iNeighbor] ).swap( vecSpecies[ispec]->MPI_buffer_.part_index_send[idim][iNeighbor] ); + buffer.partRecv[idim][iNeighbor]->clear(); + buffer.partRecv[idim][iNeighbor]->shrinkToFit( ); + buffer.partSend[idim][iNeighbor]->clear(); + buffer.partSend[idim][iNeighbor]->shrinkToFit( ); } } - - cuParticles.shrinkToFit( ); + + 
vecSpecies[ispec]->particles->shrinkToFit( ); } } @@ -1256,7 +1153,7 @@ void Patch::computePoynting() { } } -#ifdef SMILEI_ACCELERATOR_MODE +#ifdef SMILEI_ACCELERATOR_GPU // --------------------------------------------------------------------------------------------------------------------- // Allocate data on device @@ -1414,7 +1311,6 @@ void Patch::deleteFieldsOnDevice() // for( unsigned int ispec=0 ; ispec<( *this )( ipatch )->vecSpecies.size() ; ispec++ ) { // Species *spec = species( ipatch, ispec ); // spec->particles->initializeDataOnDevice(); -// spec->particles_to_move->initializeDataOnDevice(); // //#pragma acc enter data copyin(spec->nrj_radiation) // } diff --git a/src/Patch/Patch.h b/src/Patch/Patch.h index 6fc3f7578..8d06d21c2 100755 --- a/src/Patch/Patch.h +++ b/src/Patch/Patch.h @@ -174,7 +174,7 @@ class Patch //! Clean the MPI buffers for communications void cleanMPIBuffers( int ispec, Params ¶ms ); //! manage Idx of particles per direction, - void initExchParticles( int ispec, Params ¶ms ); + void copyExchParticlesToBuffers( int ispec, Params ¶ms ); //! init comm nbr of particles void exchNbrOfParticles( SmileiMPI *smpi, int ispec, Params ¶ms, int iDim, VectorPatch *vecPatch ); //! finalize comm / nbr of particles, init exch / particles @@ -184,7 +184,7 @@ class Patch //! effective exchange of particles void exchParticles( SmileiMPI *smpi, int ispec, Params ¶ms, int iDim, VectorPatch *vecPatch ); //! finalize exch / particles - void finalizeExchParticles( int ispec, int iDim ); + void waitExchParticles( int ispec, int iDim ); //! Treat diagonalParticles void cornersParticles( int ispec, Params ¶ms, int iDim ); //! inject particles received in main data structure and particles sorting @@ -194,7 +194,7 @@ class Patch //! delete Particles included in the index of particles to exchange. Assumes indexes are sorted. void cleanupSentParticles( int ispec, std::vector *indexes_of_particles_to_exchange ); -#ifdef SMILEI_ACCELERATOR_MODE +#ifdef SMILEI_ACCELERATOR_GPU //! 
Allocate and copy all the field grids on device void allocateAndCopyFieldsOnDevice(); diff --git a/src/Patch/SyncVectorPatch.cpp b/src/Patch/SyncVectorPatch.cpp index 09817b201..5e1c39694 100755 --- a/src/Patch/SyncVectorPatch.cpp +++ b/src/Patch/SyncVectorPatch.cpp @@ -2,7 +2,7 @@ #include "SyncVectorPatch.h" #include -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #include #endif #include "Params.h" @@ -24,52 +24,34 @@ template void SyncVectorPatch::exchangeAlongAllDirections,cField template void SyncVectorPatch::exchangeAlongAllDirectionsNoOMP( std::vector fields, VectorPatch &vecPatches, SmileiMPI *smpi ); template void SyncVectorPatch::exchangeAlongAllDirectionsNoOMP,cField>( std::vector fields, VectorPatch &vecPatches, SmileiMPI *smpi ); -void SyncVectorPatch::exchangeParticles( VectorPatch &vecPatches, int ispec, Params ¶ms, SmileiMPI *smpi ) +void SyncVectorPatch::initExchParticles( VectorPatch &vecPatches, int ispec, Params ¶ms, SmileiMPI *smpi ) { #pragma omp for schedule(runtime) for( unsigned int ipatch=0 ; ipatchextractParticles(); - vecPatches( ipatch )->initExchParticles( ispec, params ); - } - - // Init comm in direction 0 -#ifndef _NO_MPI_TM - #pragma omp for schedule(runtime) -#else - #pragma omp single -#endif - for( unsigned int ipatch=0 ; ipatchexchNbrOfParticles( smpi, ispec, params, 0, &vecPatches ); + vecPatches( ipatch )->copyExchParticlesToBuffers( ispec, params ); } + + // Start exchange along dimension 0 only + SyncVectorPatch::initExchParticlesAlongDimension( vecPatches, ispec, 0, params, smpi ); } // --------------------------------------------------------------------------------------------------------------------- //! This function performs: -//! - the exhcange of particles for each direction using the diagonal trick. +//! - the exchange of particles for each direction using the diagonal trick. //! - the importation of the new particles in the particle property arrays //! 
- the sorting of particles // --------------------------------------------------------------------------------------------------------------------- -void SyncVectorPatch::finalizeAndSortParticles( VectorPatch &vecPatches, int ispec, Params ¶ms, SmileiMPI *smpi ) +void SyncVectorPatch::finalizeExchParticlesAndSort( VectorPatch &vecPatches, int ispec, Params ¶ms, SmileiMPI *smpi ) { - SyncVectorPatch::finalizeExchangeParticles( vecPatches, ispec, 0, params, smpi ); - - // Per direction + // finish exchange along dimension 0 only + SyncVectorPatch::finalizeExchParticlesAlongDimension( vecPatches, ispec, 0, params, smpi ); + + // Other directions for( unsigned int iDim=1 ; iDimexchNbrOfParticles( smpi, ispec, params, iDim, &vecPatches ); - } - - SyncVectorPatch::finalizeExchangeParticles( vecPatches, ispec, iDim, params, smpi ); + SyncVectorPatch::initExchParticlesAlongDimension( vecPatches, ispec, iDim, params, smpi ); + SyncVectorPatch::finalizeExchParticlesAlongDimension( vecPatches, ispec, iDim, params, smpi ); } - + #pragma omp for schedule(runtime) for( unsigned int ipatch=0 ; ipatchimportAndSortParticles( ispec, params ); @@ -108,8 +90,20 @@ void SyncVectorPatch::finalizeAndSortParticles( VectorPatch &vecPatches, int isp } +void SyncVectorPatch::initExchParticlesAlongDimension( VectorPatch &vecPatches, int ispec, int iDim, Params ¶ms, SmileiMPI *smpi ) +{ + // Exchange numbers of particles in direction 0 only +#ifndef _NO_MPI_TM + #pragma omp for schedule(runtime) +#else + #pragma omp single +#endif + for( unsigned int ipatch=0 ; ipatchexchNbrOfParticles( smpi, ispec, params, iDim, &vecPatches ); + } +} -void SyncVectorPatch::finalizeExchangeParticles( VectorPatch &vecPatches, int ispec, int iDim, Params ¶ms, SmileiMPI *smpi ) +void SyncVectorPatch::finalizeExchParticlesAlongDimension( VectorPatch &vecPatches, int ispec, int iDim, Params ¶ms, SmileiMPI *smpi ) { #ifndef _NO_MPI_TM #pragma omp for schedule(runtime) @@ -140,7 +134,7 @@ void SyncVectorPatch::finalizeExchangeParticles( VectorPatch &vecPatches, int is #pragma omp single #endif for( unsigned int ipatch=0 ; ipatchfinalizeExchParticles( ispec, iDim ); + vecPatches( ipatch )->waitExchParticles( ispec, iDim ); } #pragma omp for schedule(runtime) @@ -275,7 +269,7 @@ void SyncVectorPatch::sumAllComponents( std::vector &fields, VectorPatc vecPatches.densitiesMPIx[ifield ]->extract_fields_sum( 0, iNeighbor, oversize[0] ); vecPatches.densitiesMPIx[ifield+nPatchMPIx ]->extract_fields_sum( 0, iNeighbor, oversize[0] ); vecPatches.densitiesMPIx[ifield+2*nPatchMPIx]->extract_fields_sum( 0, iNeighbor, oversize[0] ); -// #ifdef SMILEI_OPENACC_MODE +// #ifdef SMILEI_ACCELERATOR_GPU_OACC // Field* field = vecPatches.densitiesMPIx[ifield ]; // double* Jx = field->sendFields_[iNeighbor]->data_; // int sizeofJx = field->sendFields_[iNeighbor]->size(); @@ -297,7 +291,7 @@ void SyncVectorPatch::sumAllComponents( std::vector &fields, VectorPatc // iDim = 0, local const int nFieldLocalx = vecPatches.densitiesLocalx.size() / 3; -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) // At initialization, we may get a CPU buffer than needs to be handled on the host. 
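// A minimal sketch of the gating idiom used just below, assuming a device-mappable
// buffer `ptr` of length `n` and a second buffer `halo` (both names are placeholders):
//
//     const bool on_device = smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( ptr );
//     #pragma acc parallel if( on_device ) present( ptr[0:n], halo[0:n] )
//     #pragma acc loop gang worker vector
//     for( unsigned int i = 0; i < n; ++i ) {
//         ptr[i] += halo[i]; // stand-in for the actual ghost-cell summation
//     }
//
// With the OpenACC `if` clause, the same loop falls back to host execution whenever the
// buffer has not been mapped to the device yet, e.g. during initialization.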
const bool is_memory_on_device = vecPatches.densitiesLocalx.size() > 0 && smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( vecPatches.densitiesLocalx[0]->data() ); @@ -330,9 +324,9 @@ void SyncVectorPatch::sumAllComponents( std::vector &fields, VectorPatc pt2 = &( vecPatches.densitiesLocalx[ifield]->data_[0] ); //Sum 2 ==> 1 - const int last = gsp[0] * ny_ * nz_; + const unsigned int last = gsp[0] * ny_ * nz_; -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) int ptsize = vecPatches.densitiesLocalx[ifield]->size(); int nspace0 = size[0]; #pragma acc parallel if ( is_memory_on_device) present(pt1[0-nspace0*ny_*nz_:ptsize],pt2[0:ptsize]) @@ -364,7 +358,7 @@ void SyncVectorPatch::sumAllComponents( std::vector &fields, VectorPatc vecPatches( ipatch )->finalizeSumField( vecPatches.densitiesMPIx[ifield+2*nPatchMPIx], 0 ); // Jz for (int iNeighbor=0 ; iNeighbor<2 ; iNeighbor++) { if ( vecPatches( ipatch )->is_a_MPI_neighbor( 0, ( iNeighbor+1 )%2 ) ) { -// #ifdef SMILEI_OPENACC_MODE +// #ifdef SMILEI_ACCELERATOR_GPU_OACC // Field* field = vecPatches.densitiesMPIx[ifield ]; // double* Jx = field->recvFields_[(iNeighbor+1)%2]->data_; // int sizeofJx = field->recvFields_[(iNeighbor+1)%2]->size(); @@ -408,7 +402,7 @@ void SyncVectorPatch::sumAllComponents( std::vector &fields, VectorPatc vecPatches.densitiesMPIy[ifield ]->extract_fields_sum( 1, iNeighbor, oversize[1] ); vecPatches.densitiesMPIy[ifield+nPatchMPIy ]->extract_fields_sum( 1, iNeighbor, oversize[1] ); vecPatches.densitiesMPIy[ifield+2*nPatchMPIy]->extract_fields_sum( 1, iNeighbor, oversize[1] ); -// #ifdef SMILEI_OPENACC_MODE +// #ifdef SMILEI_ACCELERATOR_GPU_OACC // Field* field = vecPatches.densitiesMPIy[ifield ]; // double* Jx = field->sendFields_[iNeighbor+2]->data_; // int sizeofJx = field->sendFields_[iNeighbor+2]->size(); @@ -430,7 +424,7 @@ void SyncVectorPatch::sumAllComponents( std::vector &fields, VectorPatc // iDim = 1, const int nFieldLocaly = vecPatches.densitiesLocaly.size() / 3; -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) const bool is_memory_on_device = vecPatches.densitiesLocaly.size() > 0 && smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( vecPatches.densitiesLocaly[0]->data() ); #endif @@ -463,11 +457,11 @@ void SyncVectorPatch::sumAllComponents( std::vector &fields, VectorPatc pt1 = &( fields[vecPatches( ipatch )->neighbor_[1][0]-h0+icomp*nPatches]->data_[size[1]*nz_] ); pt2 = &( vecPatches.densitiesLocaly[ifield]->data_[0] ); - const int outer_last = nx_ * ny_ * nz_; - const int outer_stride = ny_ * nz_; - const int inner_last = gsp[1] * nz_; + const unsigned int outer_last = nx_ * ny_ * nz_; + const unsigned int outer_stride = ny_ * nz_; + const unsigned int inner_last = gsp[1] * nz_; -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) int ptsize = vecPatches.densitiesLocaly[ifield]->size(); int blabla = size[1]; #pragma acc parallel if (is_memory_on_device) present(pt1[0-blabla*nz_:ptsize],pt2[0:ptsize]) @@ -502,7 +496,7 @@ void SyncVectorPatch::sumAllComponents( std::vector &fields, VectorPatc vecPatches( ipatch )->finalizeSumField( vecPatches.densitiesMPIy[ifield+2*nPatchMPIy], 1 ); // Jz for (int iNeighbor=0 ; iNeighbor<2 ; iNeighbor++) { if ( vecPatches( ipatch )->is_a_MPI_neighbor( 1, ( iNeighbor+1 )%2 ) ) { -// #ifdef SMILEI_OPENACC_MODE +// #ifdef SMILEI_ACCELERATOR_GPU_OACC // Field* field = vecPatches.densitiesMPIy[ifield ]; // double* Jx = 
field->recvFields_[(iNeighbor+1)%2+2]->data_; // int sizeofJx = field->recvFields_[(iNeighbor+1)%2+2]->size(); @@ -544,7 +538,7 @@ void SyncVectorPatch::sumAllComponents( std::vector &fields, VectorPatc vecPatches.densitiesMPIz[ifield ]->extract_fields_sum( 2, iNeighbor, oversize[2] ); vecPatches.densitiesMPIz[ifield+nPatchMPIz ]->extract_fields_sum( 2, iNeighbor, oversize[2] ); vecPatches.densitiesMPIz[ifield+2*nPatchMPIz]->extract_fields_sum( 2, iNeighbor, oversize[2] ); -// #ifdef SMILEI_OPENACC_MODE +// #ifdef SMILEI_ACCELERATOR_GPU_OACC // Field* field = vecPatches.densitiesMPIz[ifield ]; // double* Jx = field->sendFields_[iNeighbor+4]->data_; // int sizeofJx = field->sendFields_[iNeighbor+4]->size(); @@ -566,7 +560,7 @@ void SyncVectorPatch::sumAllComponents( std::vector &fields, VectorPatc // iDim = 2 local const int nFieldLocalz = vecPatches.densitiesLocalz.size() / 3; -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) const bool is_memory_on_device = vecPatches.densitiesLocalz.size() > 0 && smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( vecPatches.densitiesLocalz[0]->data() ); #endif @@ -600,11 +594,11 @@ void SyncVectorPatch::sumAllComponents( std::vector &fields, VectorPatc pt1 = &( fields[vecPatches( ipatch )->neighbor_[2][0]-h0+icomp*nPatches]->data_[size[2]] ); pt2 = &( vecPatches.densitiesLocalz[ifield]->data_[0] ); - const int outer_last = nx_ * ny_ * nz_; - const int outer_stride = nz_; - const int inner_last = gsp[2]; + const unsigned int outer_last = nx_ * ny_ * nz_; + const unsigned int outer_stride = nz_; + const unsigned int inner_last = gsp[2]; -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) int ptsize = vecPatches.densitiesLocalz[ifield]->size(); int blabla = size[2]; #pragma acc parallel if (is_memory_on_device) present(pt1[0-blabla:ptsize],pt2[0:ptsize]) @@ -636,7 +630,7 @@ void SyncVectorPatch::sumAllComponents( std::vector &fields, VectorPatc vecPatches( ipatch )->finalizeSumField( vecPatches.densitiesMPIz[ifield+2*nPatchMPIz], 2 ); // Jz for (int iNeighbor=0 ; iNeighbor<2 ; iNeighbor++) { if ( vecPatches( ipatch )->is_a_MPI_neighbor( 2, ( iNeighbor+1 )%2 ) ) { -// #ifdef SMILEI_OPENACC_MODE +// #ifdef SMILEI_ACCELERATOR_GPU_OACC // Field* field = vecPatches.densitiesMPIz[ifield ]; // double* Jx = field->recvFields_[(iNeighbor+1)%2+4]->data_; // int sizeofJx = field->recvFields_[(iNeighbor+1)%2+4]->size(); @@ -803,7 +797,7 @@ void SyncVectorPatch::exchangeE( Params &, VectorPatch &vecPatches, int imode, S SyncVectorPatch::finalizeExchangeAlongAllDirections( vecPatches.listEt_[imode], vecPatches ); } -void SyncVectorPatch::exchangeBmBTIS3( Params ¶ms, VectorPatch &vecPatches, int imode, SmileiMPI *smpi ) +void SyncVectorPatch::exchangeBmBTIS3( Params &/*params*/, VectorPatch &vecPatches, int imode, SmileiMPI *smpi ) { SyncVectorPatch::exchangeAlongAllDirections,cField>( vecPatches.listBr_mBTIS3[imode], vecPatches, smpi ); SyncVectorPatch::finalizeExchangeAlongAllDirections( vecPatches.listBr_mBTIS3[imode], vecPatches ); @@ -887,7 +881,7 @@ void SyncVectorPatch::exchangeEnvEx( Params ¶ms, VectorPatch &vecPatches, Sm } } -void SyncVectorPatch::exchangeBmBTIS3( Params ¶ms, VectorPatch &vecPatches, SmileiMPI *smpi ) +void SyncVectorPatch::exchangeBmBTIS3( Params &/*params*/, VectorPatch &vecPatches, SmileiMPI *smpi ) { // exchange BmBTIS3 in Cartesian geometries // exchange ByBTIS3 @@ -1493,7 +1487,7 @@ void SyncVectorPatch::exchangeAllComponentsAlongX( std::vector &fields, 
vecPatches.B_MPIx[ifield ]->extract_fields_exch( 0, iNeighbor, oversize ); vecPatches.B_MPIx[ifield+nMPIx]->create_sub_fields ( 0, iNeighbor, oversize ); vecPatches.B_MPIx[ifield+nMPIx]->extract_fields_exch( 0, iNeighbor, oversize ); -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC Field* field = vecPatches.B_MPIx[ifield ]; double* By = field->sendFields_[iNeighbor]->data_; int sizeofBy = field->sendFields_[iNeighbor]->size(); @@ -1586,7 +1580,7 @@ void SyncVectorPatch::finalizeExchangeAllComponentsAlongX( VectorPatch &vecPatch vecPatches( ipatch )->finalizeExchange( vecPatches.B_MPIx[ifield+nMPIx], 0 ); // Bz for (int iNeighbor=0 ; iNeighbor<2 ; iNeighbor++) { if ( vecPatches( ipatch )->is_a_MPI_neighbor( 0, ( iNeighbor+1 )%2 ) ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC Field* field = vecPatches.B_MPIx[ifield ]; double* By = field->recvFields_[(iNeighbor+1)%2]->data_; int sizeofBy = field->recvFields_[(iNeighbor+1)%2]->size(); @@ -1629,7 +1623,7 @@ void SyncVectorPatch::exchangeAllComponentsAlongY( std::vector &fields, vecPatches.B1_MPIy[ifield ]->extract_fields_exch( 1, iNeighbor, oversize ); vecPatches.B1_MPIy[ifield+nMPIy]->create_sub_fields ( 1, iNeighbor, oversize ); vecPatches.B1_MPIy[ifield+nMPIy]->extract_fields_exch( 1, iNeighbor, oversize ); -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC Field* field = vecPatches.B1_MPIy[ifield ]; double* Bx = field->sendFields_[iNeighbor+2]->data_; int sizeofBx = field->sendFields_[iNeighbor+2]->size(); @@ -1677,7 +1671,7 @@ void SyncVectorPatch::exchangeAllComponentsAlongY( std::vector &fields, if( vecPatches( ipatch )->MPI_me_ == vecPatches( ipatch )->MPI_neighbor_[1][0] ) { pt1 = &( fields[vecPatches( ipatch )->neighbor_[1][0]-h0+icomp*nPatches]->data_[size*nz_] ); pt2 = &( vecPatches.B1_localy[ifield]->data_[0] ); -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC int ptsize = vecPatches.B1_localy[ifield]->size(); #pragma acc parallel present(pt1[0-size*nz_:ptsize],pt2[0:ptsize]) #pragma acc loop gang worker vector @@ -1717,7 +1711,7 @@ void SyncVectorPatch::finalizeExchangeAllComponentsAlongY( VectorPatch &vecPatch vecPatches( ipatch )->finalizeExchange( vecPatches.B1_MPIy[ifield+nMPIy], 1 ); // Bz for (int iNeighbor=0 ; iNeighbor<2 ; iNeighbor++) { if ( vecPatches( ipatch )->is_a_MPI_neighbor( 1, ( iNeighbor+1 )%2 ) ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC Field* field = vecPatches.B1_MPIy[ifield ]; double* Bx = field->recvFields_[(iNeighbor+1)%2+2]->data_; int sizeofBx = field->recvFields_[(iNeighbor+1)%2+2]->size(); @@ -1760,7 +1754,7 @@ void SyncVectorPatch::exchangeAllComponentsAlongZ( std::vector fields, vecPatches.B2_MPIz[ifield ]->extract_fields_exch( 2, iNeighbor, oversize ); vecPatches.B2_MPIz[ifield+nMPIz]->create_sub_fields ( 2, iNeighbor, oversize ); vecPatches.B2_MPIz[ifield+nMPIz]->extract_fields_exch( 2, iNeighbor, oversize ); -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC Field* field = vecPatches.B2_MPIz[ifield ]; double* Bx = field->sendFields_[iNeighbor+4]->data_; int sizeofBx = field->sendFields_[iNeighbor+4]->size(); @@ -1805,7 +1799,7 @@ void SyncVectorPatch::exchangeAllComponentsAlongZ( std::vector fields, if( vecPatches( ipatch )->MPI_me_ == vecPatches( ipatch )->MPI_neighbor_[2][0] ) { pt1 = &( fields[vecPatches( ipatch )->neighbor_[2][0]-h0+icomp*nPatches]->data_[size] ); pt2 = &( vecPatches.B2_localz[ifield]->data_[0] ); -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC int ptsize = 
vecPatches.B2_localz[ifield]->size(); #pragma acc parallel present(pt1[0-size:ptsize],pt2[0:ptsize]) #pragma acc loop gang worker vector @@ -1845,7 +1839,7 @@ void SyncVectorPatch::finalizeExchangeAllComponentsAlongZ( VectorPatch &vecPatch vecPatches( ipatch )->finalizeExchange( vecPatches.B2_MPIz[ifield+nMPIz], 2 ); // By for (int iNeighbor=0 ; iNeighbor<2 ; iNeighbor++) { if ( vecPatches( ipatch )->is_a_MPI_neighbor( 2, ( iNeighbor+1 )%2 ) ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC Field* field = vecPatches.B2_MPIz[ifield ]; double* Bx = field->recvFields_[(iNeighbor+1)%2+4]->data_; int sizeofBx = field->recvFields_[(iNeighbor+1)%2+4]->size(); diff --git a/src/Patch/SyncVectorPatch.h b/src/Patch/SyncVectorPatch.h index 0ce868cae..07435cd49 100755 --- a/src/Patch/SyncVectorPatch.h +++ b/src/Patch/SyncVectorPatch.h @@ -17,9 +17,10 @@ class SyncVectorPatch public : //! Particles synchronization - static void exchangeParticles( VectorPatch &vecPatches, int ispec, Params ¶ms, SmileiMPI *smpi ); - static void finalizeAndSortParticles( VectorPatch &vecPatches, int ispec, Params ¶ms, SmileiMPI *smpi ); - static void finalizeExchangeParticles( VectorPatch &vecPatches, int ispec, int iDim, Params ¶ms, SmileiMPI *smpi ); + static void initExchParticles( VectorPatch &vecPatches, int ispec, Params ¶ms, SmileiMPI *smpi ); + static void finalizeExchParticlesAndSort( VectorPatch &vecPatches, int ispec, Params ¶ms, SmileiMPI *smpi ); + static void initExchParticlesAlongDimension( VectorPatch &vecPatches, int ispec, int iDim, Params ¶ms, SmileiMPI *smpi ); + static void finalizeExchParticlesAlongDimension( VectorPatch &vecPatches, int ispec, int iDim, Params ¶ms, SmileiMPI *smpi ); //! Densities synchronization static void sumRhoJ( Params ¶ms, VectorPatch &vecPatches, SmileiMPI *smpi ); @@ -72,7 +73,7 @@ public : if ( vecPatches( ipatch )->is_a_MPI_neighbor( 0, iNeighbor ) ) { fields[ifield]->create_sub_fields ( 0, iNeighbor, 2*oversize[0]+1+fields[ifield]->isDual_[0] ); fields[ifield]->extract_fields_sum( 0, iNeighbor, oversize[0] ); -// #ifdef SMILEI_OPENACC_MODE +// #ifdef SMILEI_ACCELERATOR_GPU_OACC // double * pointer = fields[ifield]->sendFields_[iNeighbor]->data_; // int size = fields[ifield]->size(); // #endif @@ -86,7 +87,7 @@ public : // iDim = 0, local -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) // At initialization, we may get a CPU buffer than needs to be handled on the host. 
const bool is_memory_on_device = fields.size() > 0 && smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( fields[0]->data() ); @@ -122,7 +123,7 @@ public : const unsigned int last = gsp[0] * ny_ * nz_; -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) int ptsize = fields[ifield]->size(); int nspace0 = size[0]; #pragma acc parallel if ( is_memory_on_device) present(pt1[0-nspace0*ny_*nz_:ptsize],pt2[0:ptsize]) @@ -176,7 +177,7 @@ public : if ( vecPatches( ipatch )->is_a_MPI_neighbor( 1, iNeighbor ) ) { fields[ifield]->create_sub_fields ( 1, iNeighbor, 2*oversize[1]+1+fields[ifield]->isDual_[1] ); fields[ifield]->extract_fields_sum( 1, iNeighbor, oversize[1] ); -// #ifdef SMILEI_OPENACC_MODE +// #ifdef SMILEI_ACCELERATOR_GPU_OACC // double* pointer = fields[ifield]->recvFields_[(iNeighbor+1)%2]->data_; // int size = fields[ifield]->recvFields_[(iNeighbor+1)%2]->size(); // //#pragma acc update device( Jx[0:sizeofJx], Jy[0:sizeofJy], Jz[0:sizeofJz] ) @@ -191,7 +192,7 @@ public : // iDim = 1, local -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) const bool is_memory_on_device = fields.size() > 0 && smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( fields[0]->data() ); #endif @@ -219,11 +220,11 @@ public : pt1 = &( *field1 )( size[1]*nz_ ); pt2 = &( *field2 )( 0 ); - const int outer_last = nx_ * ny_ * nz_; - const int outer_stride = ny_ * nz_; - const int inner_last = gsp[1] * nz_; + const unsigned int outer_last = nx_ * ny_ * nz_; + const unsigned int outer_stride = ny_ * nz_; + const unsigned int inner_last = gsp[1] * nz_; -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) int ptsize = fields[ifield]->size(); int blabla = size[1]; #pragma acc parallel if (is_memory_on_device) present(pt1[0-blabla*nz_:ptsize],pt2[0:ptsize]) @@ -281,7 +282,7 @@ public : if ( vecPatches( ipatch )->is_a_MPI_neighbor( 2, iNeighbor ) ) { fields[ifield]->create_sub_fields ( 2, iNeighbor, 2*oversize[2]+1+fields[ifield]->isDual_[2] ); fields[ifield]->extract_fields_sum( 2, iNeighbor, oversize[2] ); -// #ifdef SMILEI_OPENACC_MODE +// #ifdef SMILEI_ACCELERATOR_GPU_OACC // double* pointer = fields[ifield]->recvFields_[(iNeighbor+1)%2+2]->data_; // int size = fields[ifield]->recvFields_[(iNeighbor+1)%2+2]->size(); // #endif @@ -292,7 +293,7 @@ public : // iDim = 2 local -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) const bool is_memory_on_device = fields.size() > 0 && smilei::tools::gpu::HostDeviceMemoryManagement::IsHostPointerMappedOnDevice( fields[0]->data() ); #endif @@ -320,11 +321,11 @@ public : pt1 = &( *field1 )( size[2] ); pt2 = &( *field2 )( 0 ); - const int outer_last = nx_ * ny_ * nz_; - const int outer_stride = nz_; - const int inner_last = gsp[2]; + const unsigned int outer_last = nx_ * ny_ * nz_; + const unsigned int outer_stride = nz_; + const unsigned int inner_last = gsp[2]; -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) int ptsize = fields[ifield]->size(); int blabla = size[2]; #pragma acc parallel if (is_memory_on_device) present(pt1[0-blabla:ptsize],pt2[0:ptsize]) diff --git a/src/Patch/VectorPatch.cpp b/src/Patch/VectorPatch.cpp index 8b239b905..42f4dd3d8 100755 --- a/src/Patch/VectorPatch.cpp +++ b/src/Patch/VectorPatch.cpp @@ -301,7 +301,7 @@ void VectorPatch::reconfiguration( Params ¶ms, Timers &timers, int itime ) // 
--------------------------------------------------------------------------------------------------------------------- void VectorPatch::initialParticleSorting( Params ¶ms ) { -#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE) +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC) // Initially I wanted to control the GPU particle sorting/bin initialization // here. In the end it was put in initializeDataOnDevice which is more // meaningful. @@ -322,7 +322,7 @@ void VectorPatch::initialParticleSorting( Params ¶ms ) } // --------------------------------------------------------------------------------------------------------------------- -// For all patches, move particles (restartRhoJ(s), dynamics and exchangeParticles) +// For all patches, move particles (restartRhoJ(s), dynamics and initExchParticles) // --------------------------------------------------------------------------------------------------------------------- void VectorPatch::dynamics( Params ¶ms, SmileiMPI *smpi, @@ -402,7 +402,7 @@ void VectorPatch::dynamics( Params ¶ms, for( unsigned int ispec=0 ; ispec<( *this )( 0 )->vecSpecies.size(); ispec++ ) { Species *spec = species( 0, ispec ); if ( (!params.Laser_Envelope_model) && (spec->isProj( time_dual, simWindow )) ){ - SyncVectorPatch::exchangeParticles( ( *this ), ispec, params, smpi ); // Included sortParticles + SyncVectorPatch::initExchParticles( ( *this ), ispec, params, smpi ); // Included sortParticles } // end condition on Species and on envelope model } // end loop on species //MESSAGE("exchange particles"); @@ -460,7 +460,7 @@ void VectorPatch::projectionForDiags( Params ¶ms, // --------------------------------------------------------------------------------------------------------------------- //! For all patches, exchange particles and sort them. // --------------------------------------------------------------------------------------------------------------------- -void VectorPatch::finalizeAndSortParticles( Params ¶ms, SmileiMPI *smpi, SimWindow *simWindow, +void VectorPatch::finalizeExchParticlesAndSort( Params ¶ms, SmileiMPI *smpi, SimWindow *simWindow, double time_dual, Timers &timers, int itime ) { timers.syncPart.restart(); @@ -471,7 +471,7 @@ void VectorPatch::finalizeAndSortParticles( Params ¶ms, SmileiMPI *smpi, Sim for( unsigned int ispec=0 ; ispec<( *this )( 0 )->vecSpecies.size(); ispec++ ) { if( ( *this )( 0 )->vecSpecies[ispec]->isProj( time_dual, simWindow ) ) { - SyncVectorPatch::finalizeAndSortParticles( ( *this ), ispec, params, smpi ); // Included sortParticles + SyncVectorPatch::finalizeExchParticlesAndSort( ( *this ), ispec, params, smpi ); // Included sortParticles } } @@ -491,7 +491,7 @@ void VectorPatch::finalizeAndSortParticles( Params ¶ms, SmileiMPI *smpi, Sim timers.syncPart.update( params.printNow( itime ) ); -} // END finalizeAndSortParticles +} // END finalizeExchParticlesAndSort //! 
Perform the particles merging on all patches @@ -853,7 +853,7 @@ void VectorPatch::sumDensities( Params ¶ms, double time_dual, Timers &timers #pragma omp for schedule(static) for( unsigned int ipatch=0 ; ipatchsize() ; ipatch++ ) { // Per species in global, Attention if output -> Sync / per species fields -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) // At itime == 0, data is still located on the Host if (itime == 0) { ( *this )( ipatch )->EMfields->computeTotalRhoJ(); @@ -1269,7 +1269,7 @@ void VectorPatch::closeAllDiags( SmileiMPI *smpi ) // --------------------------------------------------------------------------------------------------------------------- void VectorPatch::runAllDiags( Params &/*params*/, SmileiMPI *smpi, unsigned int itime, Timers &timers, SimWindow *simWindow ) { -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) bool data_on_cpu_updated = false; #endif @@ -1277,7 +1277,7 @@ void VectorPatch::runAllDiags( Params &/*params*/, SmileiMPI *smpi, unsigned int timers.diags.restart(); // Determine which data is required from the device -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) bool need_particles = false; bool need_fields = false; @@ -1346,7 +1346,7 @@ void VectorPatch::runAllDiags( Params &/*params*/, SmileiMPI *smpi, unsigned int for( unsigned int idiag = 0 ; idiag < globalDiags.size() ; idiag++ ) { diag_timers_[idiag]->restart(); -// #if defined( SMILEI_ACCELERATOR_MODE) +// #if defined( SMILEI_ACCELERATOR_GPU) // if( globalDiags[idiag]->timeSelection->theTimeIsNow( itime ) && // !data_on_cpu_updated && // ( itime > 0 ) ) { @@ -1462,7 +1462,7 @@ void VectorPatch::runAllDiags( Params &/*params*/, SmileiMPI *smpi, unsigned int for( unsigned int idiag = 0 ; idiag < localDiags.size() ; idiag++ ) { diag_timers_[globalDiags.size()+idiag]->restart(); -// #if defined( SMILEI_ACCELERATOR_MODE ) +// #if defined( SMILEI_ACCELERATOR_GPU ) // if( localDiags[idiag]->timeSelection->theTimeIsNow( itime ) && // !data_on_cpu_updated && // ( itime > 0 ) ) { @@ -1496,7 +1496,7 @@ void VectorPatch::runAllDiags( Params &/*params*/, SmileiMPI *smpi, unsigned int for( unsigned int ipatch=0 ; ipatchEMfields->restartRhoJs(); -#if defined (SMILEI_ACCELERATOR_MODE) +#if defined (SMILEI_ACCELERATOR_GPU) // Delete species current and rho grids from device for( unsigned int ispec = 0; ispec < ( *this )( ipatch )->vecSpecies.size(); ispec++ ) { ( *this )( ipatch )->vecSpecies[ispec]->Species::deleteSpeciesCurrentAndChargeOnDevice(ispec, ( *this )( ipatch )->EMfields); @@ -2973,7 +2973,7 @@ void VectorPatch::createPatches( Params ¶ms, SmileiMPI *smpi, SimWindow *sim // Set Index of the 1st patch of the vector yet on current MPI rank // Is this really necessary ? It should be done already ... 
- refHindex_ = ( *this )( 0 )->Hindex(); + setRefHindex(); // Current number of patch int nPatches_now = this->size() ; @@ -4402,7 +4402,7 @@ void VectorPatch::moveWindow( // Bring all particles and field grids to the Host (except species grids) // This part can be optimized by copying only the patch to be destructed -#if defined( SMILEI_ACCELERATOR_MODE) +#if defined( SMILEI_ACCELERATOR_GPU) if( simWindow->isMoving( time_dual ) || itime == simWindow->getAdditionalShiftsIteration() ) { copyParticlesFromDeviceToHost(); copyFieldsFromDeviceToHost(); @@ -4412,10 +4412,11 @@ void VectorPatch::moveWindow( simWindow->shift( (*this), smpi, params, itime, time_dual, region ); - if (itime == simWindow->getAdditionalShiftsIteration() ) { + if( itime == (int) simWindow->getAdditionalShiftsIteration() ) { int adjust = simWindow->isMoving(time_dual)?0:1; - for (unsigned int n=0;n < simWindow->getNumberOfAdditionalShifts()-adjust; n++) + for( unsigned int n=0; n < simWindow->getNumberOfAdditionalShifts()-adjust; n++ ) { simWindow->shift( (*this), smpi, params, itime, time_dual, region ); + } } // Copy all Fields and Particles to the device @@ -4423,7 +4424,7 @@ void VectorPatch::moveWindow( // let's try initialising like we do at the start: -/*#if defined( SMILEI_ACCELERATOR_MODE ) +/*#if defined( SMILEI_ACCELERATOR_GPU ) // Allocate particle and field arrays // Also copy particle array content on device vecPatches.allocateDataOnDevice( params, &smpi, @@ -4434,7 +4435,7 @@ void VectorPatch::moveWindow( #endif*/ // does not do anything? - /*#if defined( SMILEI_ACCELERATOR_MODE) + /*#if defined( SMILEI_ACCELERATOR_GPU) if( simWindow->isMoving( time_dual ) || itime == simWindow->getAdditionalShiftsIteration() ) { copyFieldsFromHostToDevice(); copyParticlesFromHostToDevice(); @@ -4588,7 +4589,7 @@ void VectorPatch::ponderomotiveUpdatePositionAndCurrents( Params ¶ms, timers.syncPart.restart(); for( unsigned int ispec=0 ; ispec<( *this )( 0 )->vecSpecies.size(); ispec++ ) { if( ( *this )( 0 )->vecSpecies[ispec]->isProj( time_dual, simWindow ) ) { - SyncVectorPatch::exchangeParticles( ( *this ), ispec, params, smpi ); // Included sortParticles + SyncVectorPatch::initExchParticles( ( *this ), ispec, params, smpi ); // Included sortParticles } // end condition on species } // end loop on species timers.syncPart.update( params.printNow( itime ) ); @@ -4609,91 +4610,26 @@ void VectorPatch::initNewEnvelope( Params & ) } // END initNewEnvelope +#if defined( SMILEI_ACCELERATOR_GPU ) void VectorPatch::allocateDataOnDevice(Params ¶ms, SmileiMPI *smpi, RadiationTables *radiation_tables, MultiphotonBreitWheelerTables *multiphoton_Breit_Wheeler_tables) { - -#if defined( SMILEI_ACCELERATOR_MODE ) // TODO(Etienne M): FREE. If we have load balancing or other patch // creation/destruction available (which is not the case on GPU ATM), // we should be taking care of freeing this GPU memory. 
- const int npatches = this->size(); - - // const int sizeofJx = patches_[0]->EMfields->Jx_->size(); - // const int sizeofJy = patches_[0]->EMfields->Jy_->size(); - // const int sizeofJz = patches_[0]->EMfields->Jz_->size(); - // const int sizeofRho = patches_[0]->EMfields->rho_->size(); - - // const int sizeofEx = patches_[0]->EMfields->Ex_->size(); - // const int sizeofEy = patches_[0]->EMfields->Ey_->size(); - // const int sizeofEz = patches_[0]->EMfields->Ez_->size(); - - // const int sizeofBx = patches_[0]->EMfields->Bx_->size(); - // const int sizeofBy = patches_[0]->EMfields->By_->size(); - // const int sizeofBz = patches_[0]->EMfields->Bz_->size(); - - for( int ipatch=0 ; ipatchvecSpecies.size(); ispec++ ) { - Species *spec = species( ipatch, ispec ); - spec->particles->initializeDataOnDevice(); - spec->particles_to_move->initializeDataOnDevice(); - - // Create photon species on the device - if ( spec->radiation_model_ == "mc" && spec->photon_species_) { - spec->radiated_photons_->initializeDataOnDevice(); - } - - // Create pair species on the device - if ( spec->mBW_pair_species_[0] && spec->mBW_pair_species_[1]) { - spec->mBW_pair_particles_[0]->initializeDataOnDevice(); - spec->mBW_pair_particles_[1]->initializeDataOnDevice(); - } - - //#pragma acc enter data copyin(spec->nrj_radiation) + for( auto spec: patch->vecSpecies ) { + spec->allocateParticlesOnDevice(); } // Allocate field data structures on GPU - patches_[ipatch]->allocateFieldsOnDevice(); - - // const double *const Jx = patches_[ipatch]->EMfields->Jx_->data(); - // const double *const Jy = patches_[ipatch]->EMfields->Jy_->data(); - // const double *const Jz = patches_[ipatch]->EMfields->Jz_->data(); - // const double *const Rho = patches_[ipatch]->EMfields->rho_->data(); - - // smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocate( Jx, sizeofJx ); - // smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocate( Jy, sizeofJy ); - // smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocate( Jz, sizeofJz ); - // smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocate( Rho, sizeofRho ); - - // const double *const Ex = patches_[ipatch]->EMfields->Ex_->data(); - // const double *const Ey = patches_[ipatch]->EMfields->Ey_->data(); - // const double *const Ez = patches_[ipatch]->EMfields->Ez_->data(); + patch->allocateFieldsOnDevice(); - // smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocate( Ex, sizeofEx ); - // smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocate( Ey, sizeofEy ); - // smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocate( Ez, sizeofEz ); - - // const double *const Bmx = patches_[ipatch]->EMfields->Bx_m->data(); - // const double *const Bmy = patches_[ipatch]->EMfields->By_m->data(); - // const double *const Bmz = patches_[ipatch]->EMfields->Bz_m->data(); - - // smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocate( Bmx, sizeofBx ); - // smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocate( Bmy, sizeofBy ); - // smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocate( Bmz, sizeofBz ); - - // const double *const Bx = patches_[ipatch]->EMfields->Bx_->data(); - // const double *const By = patches_[ipatch]->EMfields->By_->data(); - // const double *const Bz = patches_[ipatch]->EMfields->Bz_->data(); - - // smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocateAndCopyHostToDevice( Bx, sizeofBx ); - // smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocateAndCopyHostToDevice( By, sizeofBy ); - // 
smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocateAndCopyHostToDevice( Bz, sizeofBz ); - } // end patch loop // TODO(Etienne M): We should create a function that does the copy of the radiation table. @@ -4745,17 +4681,24 @@ void VectorPatch::allocateDataOnDevice(Params ¶ms, smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocateAndCopyHostToDevice( min_particle_chi_table, min_particle_chi_size ); smilei::tools::gpu::HostDeviceMemoryManagement::DeviceAllocateAndCopyHostToDevice( xi_table, xi_table_size ); } +} #else +void VectorPatch::allocateDataOnDevice(Params &, + SmileiMPI *, + RadiationTables *, + MultiphotonBreitWheelerTables *) +{ ERROR( "GPU related code should not be reached in CPU mode!" ); -#endif } +#endif + //! Clean data allocated on device +#if defined( SMILEI_ACCELERATOR_GPU ) void VectorPatch::cleanDataOnDevice( Params ¶ms, SmileiMPI *smpi, RadiationTables *radiation_tables, MultiphotonBreitWheelerTables *multiphoton_Breit_Wheeler_tables) { -#if defined( SMILEI_OPENACC_MODE ) || defined( SMILEI_ACCELERATOR_GPU_OMP ) const int npatches = this->size(); @@ -4865,12 +4808,17 @@ void VectorPatch::cleanDataOnDevice( Params ¶ms, SmileiMPI *smpi, smilei::tools::gpu::HostDeviceMemoryManagement::DeviceFree( xi_table, xi_table_size ); } +} #else +void VectorPatch::cleanDataOnDevice( Params &, SmileiMPI *, + RadiationTables *, + MultiphotonBreitWheelerTables *) +{ ERROR( "GPU related code should not be reached in CPU mode!" ); -#endif } +#endif -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) //! Field Synchronization from the GPU (Device) to the CPU //! This function updates the data on the host from the data located on the device @@ -4910,9 +4858,7 @@ void VectorPatch::copyFieldsFromHostToDevice() } } -#endif -#if defined( SMILEI_ACCELERATOR_MODE) //! Sync all fields from device to host void VectorPatch::copyFieldsFromDeviceToHost() @@ -4925,10 +4871,6 @@ VectorPatch::copyFieldsFromDeviceToHost() } } -#endif - - -#if defined( SMILEI_ACCELERATOR_MODE) //! Copy all species particles from Host to devices void VectorPatch::copyParticlesFromHostToDevice() @@ -4940,9 +4882,6 @@ void VectorPatch::copyParticlesFromHostToDevice() } } } -#endif - -#if defined( SMILEI_ACCELERATOR_MODE) //! copy all patch Particles from device to Host void @@ -4955,9 +4894,7 @@ VectorPatch::copyParticlesFromDeviceToHost() for( int ipatch = 0; ipatch < npatches; ipatch++ ) { for( unsigned int ispec = 0; ispec < ( *this )( ipatch )->vecSpecies.size(); ispec++ ) { species( ipatch, ispec )->particles->copyFromDeviceToHost(); -#if defined ( SMILEI_ACCELERATOR_GPU_OMP ) || defined ( SMILEI_ACCELERATOR_MODE ) species( ipatch, ispec )->particles->setHostBinIndex(); -#endif // std::cerr // << "ipatch: " << ipatch // << " ispec: " << ispec @@ -4970,9 +4907,6 @@ VectorPatch::copyParticlesFromDeviceToHost() } } -#endif - -#if defined( SMILEI_ACCELERATOR_MODE) //! 
Sync all fields from device to host void VectorPatch::copySpeciesFieldsFromDeviceToHost() @@ -5052,7 +4986,7 @@ void VectorPatch::dynamicsWithoutTasks( Params ¶ms, if( spec->isProj( time_dual, simWindow ) || diag_flag ) { -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) if (diag_flag) { spec->Species::prepareSpeciesCurrentAndChargeOnDevice( ispec, @@ -5364,7 +5298,7 @@ void VectorPatch::dynamicsWithTasks( Params ¶ms, Species *spec_task = species( ipatch, ispec ); for( unsigned int scell = 0 ; scell < spec_task->Ncells ; scell++ ) { for( unsigned int iPart=spec_task->particles->first_index[scell] ; ( int )iPartparticles->last_index[scell]; iPart++ ) { - if ( spec_task->particles->cell_keys[iPart] != -1 ) { + if ( spec_task->particles->cell_keys[iPart] >= 0 ) { //First reduction of the count sort algorithm. Lost particles are not included. spec_task->count[spec_task->particles->cell_keys[iPart]] ++; } @@ -5380,7 +5314,7 @@ void VectorPatch::dynamicsWithTasks( Params ¶ms, Species *spec_task = species( ipatch, ispec ); for( unsigned int scell = 0 ; scell < spec_task->Ncells ; scell++ ) { for( unsigned int iPart=spec_task->particles->first_index[scell] ; ( int )iPartparticles->last_index[scell]; iPart++ ) { - if ( spec_task->particles->cell_keys[iPart] != -1 ) { + if ( spec_task->particles->cell_keys[iPart] >= 0 ) { //First reduction of the count sort algorithm. Lost particles are not included. spec_task->count[spec_task->particles->cell_keys[iPart]] ++; } @@ -5600,7 +5534,7 @@ void VectorPatch::ponderomotiveUpdatePositionAndCurrentsWithTasks( Params ¶m Species *spec_task = species( ipatch, ispec ); for( unsigned int scell = 0 ; scell < spec_task->Ncells ; scell++ ) { for( unsigned int iPart=spec_task->particles->first_index[scell] ; ( int )iPartparticles->last_index[scell]; iPart++ ) { - if ( spec_task->particles->cell_keys[iPart] != -1 ) { + if ( spec_task->particles->cell_keys[iPart] >= 0 ) { //First reduction of the count sort algorithm. Lost particles are not included. spec_task->count[spec_task->particles->cell_keys[iPart]] ++; } @@ -5618,7 +5552,7 @@ void VectorPatch::ponderomotiveUpdatePositionAndCurrentsWithTasks( Params ¶m Species *spec_task = species( ipatch, ispec ); for( unsigned int scell = 0 ; scell < spec_task->Ncells ; scell++ ) { for( unsigned int iPart=spec_task->particles->first_index[scell] ; ( int )iPartparticles->last_index[scell]; iPart++ ) { - if ( spec_task->particles->cell_keys[iPart] != -1 ) { + if ( spec_task->particles->cell_keys[iPart] >= 0 ) { //First reduction of the count sort algorithm. Lost particles are not included. spec_task->count[spec_task->particles->cell_keys[iPart]] ++; } diff --git a/src/Patch/VectorPatch.h b/src/Patch/VectorPatch.h index 01ec195c2..051d78276 100755 --- a/src/Patch/VectorPatch.h +++ b/src/Patch/VectorPatch.h @@ -138,7 +138,7 @@ public : //! Particle sorting for all patches. This is done at initialization time. void initialParticleSorting( Params ¶ms ); - //! For all patch, move particles (restartRhoJ(s), dynamics and exchangeParticles) + //! For all patch, move particles (restartRhoJ(s), dynamics and initExchParticles) void dynamics( Params ¶ms, SmileiMPI *smpi, SimWindow *simWindow, @@ -157,7 +157,7 @@ public : Timers &timers, int itime ); //! For all patches, exchange particles and sort them. 
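    //! Rough call order (sketch; `vecPatches` is a placeholder for the VectorPatch instance,
    //! the other arguments are those received by dynamics()/finalizeExchParticlesAndSort()):
    //!   SyncVectorPatch::initExchParticles( vecPatches, ispec, params, smpi );            // in dynamics(): fill send buffers, start dim-0 comms
    //!   SyncVectorPatch::finalizeExchParticlesAndSort( vecPatches, ispec, params, smpi ); // here: finish dim 0, treat the other dims, import and sort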
- void finalizeAndSortParticles( Params ¶ms, SmileiMPI *smpi, SimWindow *simWindow, + void finalizeExchParticlesAndSort( Params ¶ms, SmileiMPI *smpi, SimWindow *simWindow, double time_dual, Timers &timers, int itime ); void finalizeSyncAndBCFields( Params ¶ms, SmileiMPI *smpi, SimWindow *simWindow, @@ -510,7 +510,7 @@ public : RadiationTables * radiation_tables, MultiphotonBreitWheelerTables *multiphoton_Breit_Wheeler_tables ); -#if defined( SMILEI_ACCELERATOR_MODE) +#if defined( SMILEI_ACCELERATOR_GPU) //! Field Synchronization from the GPU (Device) to the host (CPU) diff --git a/src/Projector/Projector1D.h b/src/Projector/Projector1D.h index d51327bb7..c08c0e9a8 100755 --- a/src/Projector/Projector1D.h +++ b/src/Projector/Projector1D.h @@ -18,21 +18,19 @@ class Projector1D : public Projector virtual ~Projector1D() {}; virtual void mv_win( unsigned int shift ) { - index_domain_begin+=shift; + i_domain_begin_ += shift; } virtual void setMvWinLimits( unsigned int shift ) { - index_domain_begin = shift; + i_domain_begin_ = shift; } protected: //! Inverse of the spatial step 1/dx double dx_inv_; - int index_domain_begin; + double dx_ov_dt_; + int i_domain_begin_; double *Jx_, *Jy_, *Jz_, *rho_; - -private: - }; #endif diff --git a/src/Projector/Projector1D2Order.cpp b/src/Projector/Projector1D2Order.cpp index cd587dc71..451bca539 100755 --- a/src/Projector/Projector1D2Order.cpp +++ b/src/Projector/Projector1D2Order.cpp @@ -18,14 +18,12 @@ using namespace std; Projector1D2Order::Projector1D2Order( Params ¶ms, Patch *patch ) : Projector1D( params, patch ) { dx_inv_ = 1.0/params.cell_length[0]; - dx_ov_dt = params.cell_length[0] / params.timestep; + dx_ov_dt_ = params.cell_length[0] / params.timestep; - index_domain_begin = patch->getCellStartingGlobalIndex( 0 ); - - dt = params.timestep; - dts2 = params.timestep/2.; - dts4 = params.timestep/4.; + i_domain_begin_ = patch->getCellStartingGlobalIndex( 0 ); + dts2_ = params.timestep/2.; + dts4_ = params.timestep/4.; } @@ -43,7 +41,7 @@ void Projector1D2Order::currents( double *Jx, double *Jy, double *Jz, Particles int ip_m_ipo; double charge_weight = inv_cell_volume * ( double )( particles.charge( ipart ) )*particles.weight( ipart ); double xjn, xj_m_xipo, xj_m_xipo2, xj_m_xip, xj_m_xip2; - double crx_p = charge_weight*dx_ov_dt; // current density for particle moving in the x-direction + double crx_p = charge_weight*dx_ov_dt_; // current density for particle moving in the x-direction double cry_p = charge_weight*particles.momentum( 1, ipart )*invgf; // current density in the y-direction of the macroparticle double crz_p = charge_weight*particles.momentum( 2, ipart )*invgf; // current density allow the y-direction of the macroparticle double S0[5], S1[5], Wl[5], Wt[5], Jx_p[5]; // arrays used for the Esirkepov projection method @@ -76,7 +74,7 @@ void Projector1D2Order::currents( double *Jx, double *Jy, double *Jz, Particles // coefficients 2nd order interpolation on 3 nodes ipo = *iold; // index of the central node - ip_m_ipo = ip-ipo-index_domain_begin; + ip_m_ipo = ip-ipo-i_domain_begin_; S1[ip_m_ipo+1] = 0.5 * ( xj_m_xip2-xj_m_xip+0.25 ); S1[ip_m_ipo+2] = ( 0.75-xj_m_xip2 ); S1[ip_m_ipo+3] = 0.5 * ( xj_m_xip2+xj_m_xip+0.25 ); @@ -115,7 +113,7 @@ void Projector1D2Order::currentsAndDensity( double *Jx, double *Jy, double *Jz, int ip_m_ipo; double charge_weight = inv_cell_volume * ( double )( particles.charge( ipart ) )*particles.weight( ipart ); double xjn, xj_m_xipo, xj_m_xipo2, xj_m_xip, xj_m_xip2; - double crx_p = charge_weight*dx_ov_dt; // 
current density for particle moving in the x-direction + double crx_p = charge_weight*dx_ov_dt_; // current density for particle moving in the x-direction double cry_p = charge_weight*particles.momentum( 1, ipart )*invgf; // current density in the y-direction of the macroparticle double crz_p = charge_weight*particles.momentum( 2, ipart )*invgf; // current density allow the y-direction of the macroparticle double S0[5], S1[5], Wl[5], Wt[5], Jx_p[5]; // arrays used for the Esirkepov projection method @@ -132,7 +130,7 @@ void Projector1D2Order::currentsAndDensity( double *Jx, double *Jy, double *Jz, // Locate particle old position on the primal grid xj_m_xipo = *deltaold; // normalized distance to the nearest grid point - xj_m_xipo2 = xj_m_xipo*xj_m_xipo; // square of the normalized distance to the nearest grid point + xj_m_xipo2 = xj_m_xipo*xj_m_xipo; // square of the normalized distance to the nearest grid point // Locate particle new position on the primal grid xjn = particles.position( 0, ipart ) * dx_inv_; @@ -142,16 +140,16 @@ void Projector1D2Order::currentsAndDensity( double *Jx, double *Jy, double *Jz, // coefficients 2nd order interpolation on 3 nodes - S0[1] = 0.5 * ( xj_m_xipo2-xj_m_xipo+0.25 ); - S0[2] = ( 0.75-xj_m_xipo2 ); - S0[3] = 0.5 * ( xj_m_xipo2+xj_m_xipo+0.25 ); + S0[1] = 0.5 * ( xj_m_xipo2 - xj_m_xipo + 0.25 ); + S0[2] = ( 0.75 - xj_m_xipo2 ); + S0[3] = 0.5 * ( xj_m_xipo2 + xj_m_xipo + 0.25 ); // coefficients 2nd order interpolation on 3 nodes ipo = *iold; - ip_m_ipo = ip-ipo-index_domain_begin; - S1[ip_m_ipo+1] = 0.5 * ( xj_m_xip2-xj_m_xip+0.25 ); - S1[ip_m_ipo+2] = ( 0.75-xj_m_xip2 ); - S1[ip_m_ipo+3] = 0.5 * ( xj_m_xip2+xj_m_xip+0.25 ); + ip_m_ipo = ip-ipo-i_domain_begin_; + S1[ip_m_ipo+1] = 0.5 * ( xj_m_xip2 - xj_m_xip + 0.25 ); + S1[ip_m_ipo+2] = ( 0.75 - xj_m_xip2 ); + S1[ip_m_ipo+3] = 0.5 * ( xj_m_xip2 + xj_m_xip + 0.25 ); // coefficients used in the Esirkepov method for( unsigned int i=0; i<5; i++ ) { @@ -228,7 +226,7 @@ void Projector1D2Order::basic( double *rhoj, Particles &particles, unsigned int S1[2] = ( 0.75-xj_m_xip2 ); S1[3] = 0.5 * ( xj_m_xip2+xj_m_xip+0.25 ); - ip -= index_domain_begin + 2 + bin_shift; + ip -= i_domain_begin_ + 2 + bin_shift; // 2nd order projection for charge density // At the 2nd order, oversize = 2. @@ -270,7 +268,7 @@ void Projector1D2Order::ionizationCurrents( Field *Jx, Field *Jy, Field *Jz, Par xjmxi = xjn - ( double )i + 0.5; // normalized distance to the nearest grid point xjmxi2 = xjmxi*xjmxi; // square of the normalized distance to the nearest grid point - i -= index_domain_begin; + i -= i_domain_begin_; im1 = i-1; ip1 = i+1; @@ -291,7 +289,7 @@ void Projector1D2Order::ionizationCurrents( Field *Jx, Field *Jy, Field *Jz, Par xjmxi = xjn - ( double )i; // normalized distance to the nearest grid point xjmxi2 = xjmxi*xjmxi; // square of the normalized distance to the nearest grid point - i -= index_domain_begin; + i -= i_domain_begin_; im1 = i-1; ip1 = i+1; @@ -377,9 +375,9 @@ void Projector1D2Order::susceptibility( ElectroMagn *EMfields, Particles &partic for( int ipart=istart ; ipartJx_/Jy_/Jz_) inline void currents( double *Jx, double *Jy, double *Jz, Particles &particles, unsigned int ipart, double invgf, int *iold, double *deltaold, int bin_shift = 0 ); //! 
Project global current densities (EMfields->Jx_/Jy_/Jz_/rho), diagFields timestep - inline void currentsAndDensity( double *Jx, double *Jy, double *Jz, double *rho, Particles &particles, unsigned int ipart, double invgf, int *iold, double *deltaold, int bin_shift = 0 ); + inline void __attribute__((always_inline)) currentsAndDensity( double *Jx, double *Jy, double *Jz, double *rho, Particles &particles, unsigned int ipart, double invgf, int *iold, double *deltaold, int bin_shift = 0 ); //! Project global current charge (EMfields->rho_ , J), for initialization and diags void basic( double *rhoj, Particles &particles, unsigned int ipart, unsigned int type, int bin_shift = 0 ) override final; @@ -36,8 +36,7 @@ class Projector1D2Order : public Projector1D void susceptibilityOnBuffer( ElectroMagn *EMfields, double *b_Chi, int bin_shift, int bdim0, Particles &particles, double species_mass, SmileiMPI *smpi, int istart, int iend, int ithread, int icell = 0, int ipart_ref = 0 ) override final; private: - double dx_ov_dt; - double dt, dts2, dts4; + double dts2_, dts4_; }; #endif diff --git a/src/Projector/Projector1D2OrderGPU.cpp b/src/Projector/Projector1D2OrderGPU.cpp new file mode 100755 index 000000000..19493ef8d --- /dev/null +++ b/src/Projector/Projector1D2OrderGPU.cpp @@ -0,0 +1,294 @@ + + +#if defined( SMILEI_ACCELERATOR_GPU ) +#include "Projector1D2OrderGPUKernelCUDAHIP.h" +#include +#include "Tools.h" +#endif + +#include "Projector1D2OrderGPU.h" + +#include "ElectroMagn.h" +#include "Patch.h" +#include "gpu.h" + + +Projector1D2OrderGPU::Projector1D2OrderGPU( Params ¶meters, Patch *a_patch ) + : Projector1D{ parameters, a_patch } +{ + Projector1D::dx_inv_ = 1.0 / parameters.cell_length[0]; + Projector1D::dx_ov_dt_ = parameters.cell_length[0] / parameters.timestep; + Projector1D::i_domain_begin_ = a_patch->getCellStartingGlobalIndex( 0 ); + + not_spectral_ = !parameters.is_pxr; + dts2_ = parameters.timestep / 2.0; + dts4_ = dts2_ / 2.0; +#if defined( SMILEI_ACCELERATOR_GPU ) + x_dimension_bin_count_ = parameters.getGPUBinCount( 1 ); +#else + ERROR( "Only usable in GPU mode! " ); +#endif +} + +Projector1D2OrderGPU::~Projector1D2OrderGPU() +{ +} +#if defined( SMILEI_ACCELERATOR_GPU ) + + +//! Project global current densities (EMfields->Jx_/Jy_/Jz_) +extern "C" void +currentDepositionKernel1DOnDevice( double *__restrict__ host_Jx, + double *__restrict__ host_Jy, + double *__restrict__ host_Jz, + int Jx_size, + int Jy_size, + int Jz_size, + const double *__restrict__ device_particle_position_x, + const double *__restrict__ device_particle_momentum_y, + const double *__restrict__ device_particle_momentum_z, + const short *__restrict__ device_particle_charge, + const double *__restrict__ device_particle_weight, + const int *__restrict__ host_bin_index, + unsigned int x_dimension_bin_count_, + const double *__restrict__ host_invgf_, + const int *__restrict__ host_iold_, + const double *__restrict__ host_deltaold_, + double inv_cell_volume, + double dx_inv_, + double dx_ov_dt_, + int i_domain_begin_, + int not_spectral_ ) +{ + cudahip1d::currentDepositionKernel1D( host_Jx, host_Jy, host_Jz, + Jx_size, Jy_size, Jz_size, + device_particle_position_x, device_particle_momentum_y, + device_particle_momentum_z, + device_particle_charge, + device_particle_weight, + host_bin_index, + x_dimension_bin_count_, + host_invgf_, + host_iold_, host_deltaold_, + inv_cell_volume, + dx_inv_, + dx_ov_dt_, + i_domain_begin_, + not_spectral_ ); +} + + +//! 
Project global current and charge densities (EMfields->Jx_/Jy_/Jz_/rho_) +//! +extern "C" void +currentAndDensityDepositionKernel1DOnDevice( double *__restrict__ host_Jx, + double *__restrict__ host_Jy, + double *__restrict__ host_Jz, + double *__restrict__ host_rho, + int Jx_size, + int Jy_size, + int Jz_size, + int rho_size, + const double *__restrict__ device_particle_position_x, + const double *__restrict__ device_particle_momentum_y, + const double *__restrict__ device_particle_momentum_z, + const short *__restrict__ device_particle_charge, + const double *__restrict__ device_particle_weight, + const int *__restrict__ host_bin_index, + unsigned int x_dimension_bin_count_, + const double *__restrict__ host_invgf_, + const int *__restrict__ host_iold_, + const double *__restrict__ host_deltaold_, + double inv_cell_volume, + double dx_inv_, + double dx_ov_dt_, + int i_domain_begin_, + int not_spectral_ ) +{ + cudahip1d::currentAndDensityDepositionKernel1D( host_Jx, host_Jy, host_Jz, host_rho, + Jx_size, Jy_size, Jz_size, rho_size, + device_particle_position_x, device_particle_momentum_y, + device_particle_momentum_z, + device_particle_charge, + device_particle_weight, + host_bin_index, + x_dimension_bin_count_, + host_invgf_, + host_iold_, host_deltaold_, + inv_cell_volume, + dx_inv_, + dx_ov_dt_, + i_domain_begin_, + not_spectral_ ); +} +#endif + +// --------------------------------------------------------------------------------------------------------------------- +//! Project charge : frozen & diagFields timstep +// --------------------------------------------------------------------------------------------------------------------- +void Projector1D2OrderGPU::basic( double *rhoj, Particles &particles, unsigned int ipart, unsigned int type, int bin_shift ) +{ + + //Warning : this function is used for frozen species or initialization only and doesn't use the standard scheme. + //rho type = 0 + //Jx type = 1 + //Jy type = 2 + //Jz type = 3 + + // The variable bin received is number of bin * cluster width. + // Declare local variables + int ip; + double xjn, xj_m_xip, xj_m_xip2; + double S1[5]; // arrays used for the Esirkepov projection method + + double charge_weight = inv_cell_volume * ( double )( particles.charge( ipart ) )*particles.weight( ipart ); + if( type > 0 ) { + charge_weight *= 1./sqrt( 1.0 + particles.momentum( 0, ipart )*particles.momentum( 0, ipart ) + + particles.momentum( 1, ipart )*particles.momentum( 1, ipart ) + + particles.momentum( 2, ipart )*particles.momentum( 2, ipart ) ); + + if( type == 1 ) { + charge_weight *= particles.momentum( 0, ipart ); + } else if( type == 2 ) { + charge_weight *= particles.momentum( 1, ipart ); + } else { + charge_weight *= particles.momentum( 2, ipart ); + } + } + + // Initialize variables + for( unsigned int i=0; i<5; i++ ) { + S1[i]=0.; + }//i + + // Locate particle new position on the primal grid + xjn = particles.position( 0, ipart ) * dx_inv_; + ip = round( xjn + 0.5 * ( type==1 ) ); // index of the central node + xj_m_xip = xjn - ( double )ip; // normalized distance to the nearest grid point + xj_m_xip2 = xj_m_xip * xj_m_xip; // square of the normalized distance to the nearest grid point + + // coefficients 2nd order interpolation on 3 nodes + //ip_m_ipo = ip-ipo; + S1[1] = 0.5 * ( xj_m_xip2 - xj_m_xip + 0.25 ); + S1[2] = ( 0.75 - xj_m_xip2 ); + S1[3] = 0.5 * ( xj_m_xip2 + xj_m_xip + 0.25 ); + + ip -= i_domain_begin_ + 2 + bin_shift; + + // 2nd order projection for charge density + // At the 2nd order, oversize = 2. 
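    // Note: only S1[1..3] can be non-zero; they are the usual quadratic-spline (2nd-order)
    // shape factors and sum to 1 for any xj_m_xip in [-0.5, 0.5] (e.g. 0.125, 0.75, 0.125
    // for a particle sitting exactly on a node). Shifting ip by i_domain_begin_ + 2 + bin_shift
    // maps the global primal index to a local array index and re-centres the 5-point stencil
    // by its half-width of 2, consistent with oversize = 2 at this order.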
+ for( unsigned int i=0; i<5; i++ ) { + rhoj[i + ip ] += charge_weight * S1[i]; + } + +} + + +void Projector1D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields, + Particles &particles, + SmileiMPI *smpi, + int, + int, + int ithread, + bool diag_flag, + bool is_spectral, + int ispec, + int icell, + int ipart_ref ) +{ + std::vector &iold = smpi->dynamics_iold[ithread]; + std::vector &delta = smpi->dynamics_deltaold[ithread]; + std::vector &invgf = smpi->dynamics_invgf[ithread]; + + if( diag_flag ) { + + double *const __restrict__ b_Jx = EMfields->Jx_s[ispec] ? EMfields->Jx_s[ispec]->data() : EMfields->Jx_->data(); + unsigned int Jx_size = EMfields->Jx_s[ispec] ? EMfields->Jx_s[ispec]->size() : EMfields->Jx_->size(); + + double *const __restrict__ b_Jy = EMfields->Jy_s[ispec] ? EMfields->Jy_s[ispec]->data() : EMfields->Jy_->data(); + unsigned int Jy_size = EMfields->Jy_s[ispec] ? EMfields->Jy_s[ispec]->size() : EMfields->Jy_->size(); + + double *const __restrict__ b_Jz = EMfields->Jz_s[ispec] ? EMfields->Jz_s[ispec]->data() : EMfields->Jz_->data(); + unsigned int Jz_size = EMfields->Jz_s[ispec] ? EMfields->Jz_s[ispec]->size() : EMfields->Jz_->size(); + + double *const __restrict__ b_rho = EMfields->rho_s[ispec] ? EMfields->rho_s[ispec]->data() : EMfields->rho_->data(); + unsigned int rho_size = EMfields->rho_s[ispec] ? EMfields->rho_s[ispec]->size() : EMfields->rho_->size(); + + // Does not compute Rho ! + +#if defined( SMILEI_ACCELERATOR_GPU ) + + currentAndDensityDepositionKernel1DOnDevice( b_Jx,b_Jy,b_Jz,b_rho, + Jx_size, Jy_size, Jz_size, rho_size, + particles.getPtrPosition( 0 ), + particles.getPtrMomentum( 1 ), + particles.getPtrMomentum( 2 ), + particles.getPtrCharge(), + particles.getPtrWeight(), + particles.last_index.data(), + x_dimension_bin_count_, + invgf.data(), + iold.data(), + delta.data(), + inv_cell_volume, + dx_inv_, + dx_ov_dt_, + i_domain_begin_, + not_spectral_ ); + +#else + SMILEI_ASSERT( false ); +#endif + } else { + if( is_spectral ) { + ERROR( "Not implemented on GPU" ); + } + else{ + +#if defined( SMILEI_ACCELERATOR_GPU ) + currentDepositionKernel1DOnDevice(Jx_, Jy_, Jz_, + EMfields->Jx_->size(), EMfields->Jy_->size(), EMfields->Jz_->size(), + particles.getPtrPosition( 0 ), + particles.getPtrMomentum( 1 ), + particles.getPtrMomentum( 2 ), + particles.getPtrCharge(), + particles.getPtrWeight(), + particles.last_index.data(), + x_dimension_bin_count_, + invgf.data(), + iold.data(), + delta.data(), + inv_cell_volume, + dx_inv_, + dx_ov_dt_, + i_domain_begin_, + not_spectral_ ); +#else + SMILEI_ASSERT( false ); +#endif + } + } +} + +void Projector1D2OrderGPU::ionizationCurrents( Field *Jx, + Field *Jy, + Field *Jz, + Particles &particles, + int ipart, + LocalFields Jion ) +{ + ERROR( "Projector1D2OrderGPU::ionizationCurrents(): Not implemented !" ); +} + +void Projector1D2OrderGPU::susceptibility( ElectroMagn *EMfields, + Particles &particles, + double species_mass, + SmileiMPI *smpi, + int istart, + int iend, + int ithread, + int icell, + int ipart_ref ) +{ + ERROR( "Projector1D2OrderGPU::susceptibility(): Not implemented !" 
); +} diff --git a/src/Projector/Projector1D2OrderGPU.h b/src/Projector/Projector1D2OrderGPU.h new file mode 100755 index 000000000..f35e8e4ee --- /dev/null +++ b/src/Projector/Projector1D2OrderGPU.h @@ -0,0 +1,127 @@ +#ifndef SMILEI_PROJECTOR_PROJECTOR1D2ORDERGPU_H +#define SMILEI_PROJECTOR_PROJECTOR1D2ORDERGPU_H + +#include "Projector1D.h" + + +class Projector1D2OrderGPU : public Projector1D +{ +public: + Projector1D2OrderGPU( Params ¶meters, Patch *a_patch ); + ~Projector1D2OrderGPU(); + + /// For initialization and diags, doesn't use the standard scheme + void basic( double *rhoj, + Particles &particles, + unsigned int ipart, + unsigned int type, + int bin_shift = 0 ) override; + /// Projection wrapper + void currentsAndDensityWrapper( ElectroMagn *EMfields, + Particles &particles, + SmileiMPI *smpi, + int istart, + int iend, + int ithread, + bool diag_flag, + bool is_spectral, + int ispec, + int icell = 0, + int ipart_ref = 0 ) override; + + void susceptibility( ElectroMagn *EMfields, + Particles &particles, + double species_mass, + SmileiMPI *smpi, + int istart, + int iend, + int ithread, + int icell = 0, + int ipart_ref = 0 ) override; + + void ionizationCurrents( Field *Jx, + Field *Jy, + Field *Jz, + Particles &particles, + int ipart, + LocalFields Jion ) override; + + + //!Wrapper for task-based implementation of Smilei + //! compiler complains otherwise even if it is completely useless + void currentsAndDensityWrapperOnBuffers( double *b_Jx, + double *b_Jy, + double *b_Jz, + double *b_rho, + int bin_width, + Particles &particles, + SmileiMPI *smpi, + int istart, + int iend, + int ithread, + bool diag_flag, + bool is_spectral, + int ispec, + int icell = 0, + int ipart_ref = 0 ) override {}; +/*#if defined( SMILEI_ACCELERATOR_GPU ) + +extern "C" void +currentDepositionKernel1DOnDevice( double *__restrict__ Jx, + double *__restrict__ Jy, + double *__restrict__ Jz, + int Jx_size, + int Jy_size, + int Jz_size, + const double *__restrict__ particle_position_x, + const double *__restrict__ particle_momentum_y, + const double *__restrict__ particle_momentum_z, + const short *__restrict__ particle_charge, + const double *__restrict__ particle_weight, + const int *__restrict__ host_bin_index, + unsigned int x_dimension_bin_count, + const double *__restrict__ invgf_, + const int *__restrict__ iold_, + const double *__restrict__ deltaold_, + double inv_cell_volume, + double dx_inv, + double dx_ov_dt, + int i_domain_begin, + int not_spectral_ ); + +extern "C" void +currentAndDensityDepositionKernel1DOnDevice( double *__restrict__ Jx, + double *__restrict__ Jy, + double *__restrict__ Jz, + double *__restrict__ rho, + int Jx_size, + int Jy_size, + int Jz_size, + int rho_size, + const double *__restrict__ particle_position_x, + const double *__restrict__ particle_momentum_y, + const double *__restrict__ particle_momentum_z, + const short *__restrict__ particle_charge, + const double *__restrict__ particle_weight, + const int *__restrict__ host_bin_index, + unsigned int x_dimension_bin_count, + const double *__restrict__ invgf_, + const int *__restrict__ iold_, + const double *__restrict__ deltaold_, + double inv_cell_volume, + double dx_inv, + double dx_ov_dt, + int i_domain_begin, + int not_spectral_ ); + +#endif*/ + + +protected: + double dts2_; + double dts4_; + int not_spectral_; + unsigned int x_dimension_bin_count_; +}; + +#endif \ No newline at end of file diff --git a/src/Projector/Projector1D2OrderGPUKernelCUDAHIP.cu b/src/Projector/Projector1D2OrderGPUKernelCUDAHIP.cu new file mode 
100755 index 000000000..0a77a63db --- /dev/null +++ b/src/Projector/Projector1D2OrderGPUKernelCUDAHIP.cu @@ -0,0 +1,1070 @@ + + +#if defined( __HIP__ ) + #include +#elif defined( __NVCC__ ) + #include + #include +#endif + +#include "Params.h" +#include "gpu.h" +#include + +#if defined( __HIP__ ) + // HIP compiler support enabled (for .cu files) +#else + #define PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION 1 +#endif + +#if defined( PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION ) + #include + #include "Tools.h" +#else + #include + + #include "Params.h" + #include "gpu.h" +#endif + +// #if defined( PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION ) + +//namespace naive { +// +// void //static inline void +// currentDepositionKernel2D( double *__restrict__ Jx, +// double *__restrict__ Jy, +// double *__restrict__ Jz, +// int Jx_size, +// int Jy_size, +// int Jz_size, +// const double *__restrict__ device_particle_position_x, +// const double *__restrict__ device_particle_momentum_y, +// const double *__restrict__ device_particle_momentum_z, +// const short *__restrict__ device_particle_charge, +// const double *__restrict__ device_particle_weight, +// const int *__restrict__ host_bin_index, +// unsigned int x_dimension_bin_count, +// const double *__restrict__ invgf_, +// const int *__restrict__ iold_, +// const double *__restrict__ deltaold_, +// double inv_cell_volume, +// double dx_inv, +// double dx_ov_dt, +// int i_domain_begin, +// int not_spectral_ ) +// { +// // The OMP implementation is NOT bin aware. As per the precondition on +// // host_bin_index, index zero always contains the number of particles. +// // See nvidiaParticles::prepareBinIndex / setHostBinIndex. +// const unsigned int bin_count = 1; +// const int particle_count = host_bin_index[bin_count - 1]; +// +// #if defined( SMILEI_ACCELERATOR_GPU_OMP ) +// #pragma omp target is_device_ptr /* map */ ( /* to: */ \ +// device_particle_position_x /* [0:particle_count] */, \ +// device_particle_momentum_y /* [0:particle_count] */, \ +// device_particle_momentum_z /* [0:particle_count] */, \ +// device_particle_charge /* [0:particle_count] */, \ +// device_particle_weight /* [0:particle_count] */ ) +// #pragma omp teams thread_limit( 64 ) distribute parallel for +// #elif defined( SMILEI_ACCELERATOR_GPU_OACC ) +// #pragma acc parallel \ +// deviceptr( device_particle_position_x, \ +// device_particle_momentum_y, \ +// device_particle_momentum_z, \ +// device_particle_charge, \ +// device_particle_weight ) \ +// present( iold [0:3 * particle_count], \ +// deltaold [0:3 * particle_count] ) +// #pragma acc loop gang worker vector +// #endif +// for( int particle_index = 0; particle_index < particle_count; ++particle_index ) { +// const double invgf = invgf_[particle_index]; +// const int *const __restrict__ iold = &iold_[particle_index]; +// const double *const __restrict__ deltaold = &deltaold_[particle_index]; +// +// double Sx0[5]; +// double Sx1[5]; +// +// // Variable declaration & initialization +// // Esirkepov's paper: https://arxiv.org/pdf/physics/9901047.pdf +// +// // Locate the particle on the primal grid at former time-step & calculate coeff. S0 +// { +// const double delta = deltaold[0 * particle_count]; +// const double delta2 = delta * delta; +// Sx0[0] = 0.0; +// Sx0[1] = 0.5 * ( delta2 - delta + 0.25 ); +// Sx0[2] = 0.75 - delta2; +// Sx0[3] = 0.5 * ( delta2 + delta + 0.25 ); +// Sx0[4] = 0.0; +// } +// +// // Locate the particle on the primal grid at current time-step & calculate coeff. 
S1 +// { +// const double xpn = device_particle_position_x[particle_index] * dx_inv; +// const int ip = std::round( xpn ); +// const int ipo = iold[0 * particle_count]; +// const int ip_m_ipo = ip - ipo - i_domain_begin; +// const double delta = xpn - static_cast( ip ); +// const double delta2 = delta * delta; +// +// Sx1[0] = 0.0; +// Sx1[1] = 0.0; +// // Sx1[2] = 0.0; // Always set below +// Sx1[3] = 0.0; +// Sx1[4] = 0.0; +// +// Sx1[ip_m_ipo + 1] = 0.5 * ( delta2 - delta + 0.25 ); +// Sx1[ip_m_ipo + 2] = 0.75 - delta2; +// Sx1[ip_m_ipo + 3] = 0.5 * ( delta2 + delta + 0.25 ); +// } +// +// // (x,y,z) components of the current density for the macro-particle +// const double charge_weight = inv_cell_volume * static_cast( device_particle_charge[particle_index] ) * device_particle_weight[particle_index]; +// const double crx_p = charge_weight * dx_ov_dt; +// const double cry_p = charge_weight * dy_ov_dt; +// const double crz_p = charge_weight * ( 1.0 / 3.0 ) * device_particle_momentum_z[particle_index] * invgf; +// +// // This is the particle position as grid index +// // This minus 2 come from the order 2 scheme, based on a 5 points stencil from -2 to +2. +// const int ipo = iold[0 * particle_count] - 2; +// +// for( unsigned int i = 0; i < 1; ++i ) { +// const int iloc = ( i + ipo ) ; +// /* Jx[iloc] += tmpJx[0]; */ +// +// SMILEI_ACCELERATOR_ATOMIC +// Jz[iloc] += crz_p * ( Sy1[0] * ( /* 0.5 * Sx0[i] + */ Sx1[i] ) ); +// double tmp = 0.0; +// for( unsigned int j = 1; j < 5; j++ ) { +// tmp -= cry_p * ( Sy1[j - 1] - Sy0[j - 1] ) * ( Sx0[i] + 0.5 * ( Sx1[i] - Sx0[i] ) ); +// +// SMILEI_ACCELERATOR_ATOMIC +// Jy[iloc + j + not_spectral_ * ( /* i + */ ipo )] += tmp; +// +// SMILEI_ACCELERATOR_ATOMIC +// Jz[iloc + j] += crz_p * ( Sy0[j] * ( 0.5 * Sx1[i] /* + Sx0[i] */ ) + +// Sy1[j] * ( /* 0.5 * Sx0[i] + */ Sx1[i] ) ); +// } +// } +// +// double tmpJx[5]{}; +// +// for( unsigned int i = 1; i < 5; ++i ) { +// const int iloc = ( i + ipo ) ; +// tmpJx[0] -= crx_p * ( Sx1[i - 1] - Sx0[i - 1] ) * ( 0.5 * ( Sy1[0] - Sy0[0] ) ); +// SMILEI_ACCELERATOR_ATOMIC +// Jx[iloc] += tmpJx[0]; +// SMILEI_ACCELERATOR_ATOMIC +// Jz[iloc] += crz_p * ( Sy1[0] * ( 0.5 * Sx0[i] + Sx1[i] ) ); +// double tmp = 0.0; +// for( unsigned int j = 1; j < 5; ++j ) { +// tmpJx[j] -= crx_p * ( Sx1[i - 1] - Sx0[i - 1] ) * ( Sy0[j] + 0.5 * ( Sy1[j] - Sy0[j] ) ); +// SMILEI_ACCELERATOR_ATOMIC +// Jx[iloc + j] += tmpJx[j]; +// tmp -= cry_p * ( Sy1[j - 1] - Sy0[j - 1] ) * ( Sx0[i] + 0.5 * ( Sx1[i] - Sx0[i] ) ); +// SMILEI_ACCELERATOR_ATOMIC +// Jy[iloc + j + not_spectral_ * ( i + ipo )] += tmp; +// +// SMILEI_ACCELERATOR_ATOMIC +// Jz[iloc + j] += crz_p * ( Sy0[j] * ( 0.5 * Sx1[i] + Sx0[i] ) + +// Sy1[j] * ( 0.5 * Sx0[i] + Sx1[i] ) ); +// } +// } +// } +// } // end currentDepositionKernel +// +// //static inline +// void +// currentAndDensityDepositionKernel( double *__restrict__ Jx, +// double *__restrict__ Jy, +// double *__restrict__ Jz, +// double *__restrict__ rho, +// int Jx_size, +// int Jy_size, +// int Jz_size, +// int rho_size, +// const double *__restrict__ device_particle_position_x, +// const double *__restrict__ device_particle_momentum_y, +// const double *__restrict__ device_particle_momentum_z, +// const short *__restrict__ device_particle_charge, +// const double *__restrict__ device_particle_weight, +// const int *__restrict__ host_bin_index, +// unsigned int, +// unsigned int, +// const double *__restrict__ invgf_, +// const int *__restrict__ iold_, +// const double *__restrict__ deltaold_, +// double 
inv_cell_volume, +// double dx_inv, +// double dx_ov_dt, +// int i_domain_begin, +// int not_spectral_ ) +// { +// // The OMP implementation is NOT bin aware. As per the precondition on +// // host_bin_index, index zero always contains the number of particles. +// // See nvidiaParticles::prepareBinIndex / setHostBinIndex. +// const unsigned int bin_count = 1; +// const int particle_count = host_bin_index[bin_count - 1]; +// +// #if defined( SMILEI_ACCELERATOR_GPU_OMP ) +// #pragma omp target is_device_ptr /* map */ ( /* to: */ \ +// device_particle_position_x /* [0:particle_count] */, \ +// device_particle_momentum_y /* [0:particle_count] */, \ +// device_particle_momentum_z /* [0:particle_count] */, \ +// device_particle_charge /* [0:particle_count] */, \ +// device_particle_weight /* [0:particle_count] */ ) +// #pragma omp teams thread_limit( 64 ) distribute parallel for +// #elif defined( SMILEI_ACCELERATOR_GPU_OACC ) +// #pragma acc parallel \ +// deviceptr( device_particle_position_x, \ +// device_particle_momentum_y, \ +// device_particle_momentum_z, \ +// device_particle_charge, \ +// device_particle_weight ) \ +// present( iold [0:3 * particle_count], \ +// deltaold [0:3 * particle_count] ) +// #pragma acc loop gang worker vector +// #endif +// for( int particle_index = 0; particle_index < particle_count; ++particle_index ) { +// const double invgf = invgf_[particle_index]; +// const int *const __restrict__ iold = &iold_[particle_index]; +// const double *const __restrict__ deltaold = &deltaold_[particle_index]; +// +// double Sx0[5]; +// double Sx1[5]; +// double Sy0[5]; +// double Sy1[5]; +// +// // Variable declaration & initialization +// // Esirkepov's paper: https://arxiv.org/pdf/physics/9901047.pdf +// +// // Locate the particle on the primal grid at former time-step & calculate coeff. S0 +// { +// const double delta = deltaold[0 * particle_count]; +// const double delta2 = delta * delta; +// Sx0[0] = 0.0; +// Sx0[1] = 0.5 * ( delta2 - delta + 0.25 ); +// Sx0[2] = 0.75 - delta2; +// Sx0[3] = 0.5 * ( delta2 + delta + 0.25 ); +// Sx0[4] = 0.0; +// } +// // Locate the particle on the primal grid at current time-step & calculate coeff. S1 +// { +// const double xpn = device_particle_position_x[particle_index] * dx_inv; +// const int ip = std::round( xpn ); +// const int ipo = iold[0 * particle_count]; +// const int ip_m_ipo = ip - ipo - i_domain_begin; +// const double delta = xpn - static_cast( ip ); +// const double delta2 = delta * delta; +// +// Sx1[0] = 0.0; +// Sx1[1] = 0.0; +// // Sx1[2] = 0.0; // Always set below +// Sx1[3] = 0.0; +// Sx1[4] = 0.0; +// +// Sx1[ip_m_ipo + 1] = 0.5 * ( delta2 - delta + 0.25 ); +// Sx1[ip_m_ipo + 2] = 0.75 - delta2; +// Sx1[ip_m_ipo + 3] = 0.5 * ( delta2 + delta + 0.25 ); +// } +// +// // (x,y,z) components of the current density for the macro-particle +// const double charge_weight = inv_cell_volume * static_cast( device_particle_charge[particle_index] ) * device_particle_weight[particle_index]; +// const double crx_p = charge_weight * dx_ov_dt; +// const double cry_p = charge_weight * dy_ov_dt; +// const double crz_p = charge_weight * ( 1.0 / 3.0 ) * device_particle_momentum_z[particle_index] * invgf; +// +// // This is the particle position as grid index +// // This minus 2 come from the order 2 scheme, based on a 5 points stencil from -2 to +2. 
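The "5 points stencil from -2 to +2" mentioned just above is shared by this commented-out reference code and the live 1D kernels further down: the old-time weights S0 occupy the three middle slots of a 5-slot array, while the new-time weights S1 are written at an offset ip - ipo that can only be -1, 0 or +1 because a particle moves by less than one cell per timestep. A small standalone sketch of that indexing (the numbers are hypothetical and the i_domain_begin shift of the real code is dropped for brevity):

// stencil_shift_sketch.cpp -- illustrative only.
#include <cmath>
#include <cstdio>

// Fill the old-time (S0) and new-time (S1) quadratic weights on a common
// 5-point stencil centred on the old cell index ipo.
void fillShapes( double delta_old, double xpn, int ipo, double S0[5], double S1[5] )
{
    for( int i = 0; i < 5; ++i ) { S0[i] = 0.; S1[i] = 0.; }

    double d = delta_old, d2 = d * d;
    S0[1] = 0.5 * ( d2 - d + 0.25 );
    S0[2] = 0.75 - d2;
    S0[3] = 0.5 * ( d2 + d + 0.25 );

    const int ip       = static_cast<int>( std::lround( xpn ) );
    const int ip_m_ipo = ip - ipo;           // -1, 0 or +1 under the CFL condition
    d = xpn - ip; d2 = d * d;
    S1[ip_m_ipo + 1] = 0.5 * ( d2 - d + 0.25 );
    S1[ip_m_ipo + 2] = 0.75 - d2;
    S1[ip_m_ipo + 3] = 0.5 * ( d2 + d + 0.25 );
}

int main()
{
    double S0[5], S1[5];
    // Particle that moved from cell 10 (delta = 0.4) to xpn = 10.6, i.e. nearest node 11.
    fillShapes( 0.4, 10.6, 10, S0, S1 );
    for( int i = 0; i < 5; ++i ) {
        std::printf( "slot %d : S0=%.4f  S1=%.4f\n", i, S0[i], S1[i] );
    }
}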
+// const int ipo = iold[0 * particle_count] - 2; +// const int jpo = iold[1 * particle_count] - 2; +// +// // case i =0 +// for( unsigned int i = 0; i < 1; ++i ) { +// const int iloc = ( i + ipo ) ; +// /* Jx[iloc] += tmpJx[0]; */ +// +// SMILEI_ACCELERATOR_ATOMIC +// Jz[iloc] += crz_p * ( Sy1[0] * ( /* 0.5 * Sx0[i] + */ Sx1[i] ) ); +// +// SMILEI_ACCELERATOR_ATOMIC +// rho[iloc] += charge_weight * Sx1[0] * Sy1[0]; +// double tmp = 0.0; +// for( unsigned int j = 1; j < 5; j++ ) { +// tmp -= cry_p * ( Sy1[j - 1] - Sy0[j - 1] ) * ( Sx0[i] + 0.5 * ( Sx1[i] - Sx0[i] ) ); +// +// SMILEI_ACCELERATOR_ATOMIC +// Jy[iloc + j + not_spectral_ * ( /* i + */ ipo )] += tmp; +// +// SMILEI_ACCELERATOR_ATOMIC +// Jz[iloc + j] += crz_p * ( Sy0[j] * ( 0.5 * Sx1[i] /* + Sx0[i] */ ) + +// Sy1[j] * ( /* 0.5 * Sx0[i] + */ Sx1[i] ) ); +// SMILEI_ACCELERATOR_ATOMIC +// rho[iloc + j] += charge_weight * Sx1[0] * Sy1[j]; +// } +// } +// +// double tmpJx[5]{}; +// +// // case i> 0 +// for( unsigned int i = 1; i < 5; ++i ) { +// const int iloc = i + ipo ; +// tmpJx[0] -= crx_p * ( Sx1[i - 1] - Sx0[i - 1] ); +// +// SMILEI_ACCELERATOR_ATOMIC +// Jx[iloc] += tmpJx[0]; +// +// SMILEI_ACCELERATOR_ATOMIC +// Jz[iloc] += crz_p * ( Sy1[0] * ( 0.5 * Sx0[i] + Sx1[i] ) ); +// +// SMILEI_ACCELERATOR_ATOMIC +// rho[iloc] += charge_weight * Sx1[i] * Sy1[0]; +// +// double tmp = 0.0; +// for( unsigned int j = 1; j < 5; ++j ) { +// tmpJx[j] -= crx_p * ( Sx1[i - 1] - Sx0[i - 1] ) * ( Sy0[j] + 0.5 * ( Sy1[j] - Sy0[j] ) ); +// +// SMILEI_ACCELERATOR_ATOMIC +// Jx[iloc + j] += tmpJx[j]; +// tmp -= cry_p * ( Sy1[j - 1] - Sy0[j - 1] ) * ( Sx0[i] + 0.5 * ( Sx1[i] - Sx0[i] ) ); +// +// SMILEI_ACCELERATOR_ATOMIC +// Jy[iloc + j + not_spectral_ * ( i + ipo )] += tmp; +// +// SMILEI_ACCELERATOR_ATOMIC +// Jz[iloc + j] += crz_p * ( Sy0[j] * ( 0.5 * Sx1[i] + Sx0[i] ) + +// Sy1[j] * ( 0.5 * Sx0[i] + Sx1[i] ) ); +// +// SMILEI_ACCELERATOR_ATOMIC +// rho[iloc + j] += charge_weight * Sx1[i] * Sy1[j]; +// } +// } +// } +// } // end currentDepositionKernel +// +// +//} // namespace naive +// +// #else + +namespace cudahip1d { + namespace detail { +#if defined( __HIP__ ) + static inline void + checkErrors( ::hipError_t an_error_code, + const char *file_name, + int line ) + { + if( an_error_code != ::hipError_t::hipSuccess ) { + std::cout << "HIP error at " << file_name << ":" << line + << " -> " << ::hipGetErrorString( an_error_code ) << std::endl; + std::exit( EXIT_FAILURE ); + } + } +// For NVIDIA compiler +#elif defined( __NVCC__ ) + static inline void + checkErrors( ::cudaError_t an_error_code, + const char *file_name, + int line ) + { + if( an_error_code != ::cudaError_t::cudaSuccess ) { + std::cout << "CUDA error at " << file_name << ":" << line << " -> " << ::cudaGetErrorString( an_error_code ) << std::endl; + std::exit( EXIT_FAILURE ); + } + } +#endif + + } // namespace detail + + #define checkHIPErrors( an_expression ) \ + do { \ + detail::checkErrors( an_expression, __FILE__, __LINE__ ); \ + } while( 0 ) + + namespace kernel { + namespace atomic { + namespace LDS { + __device__ void + AddNoReturn( float *a_pointer, float a_value ) + { + #if defined( __gfx90a__ ) + ::unsafeAtomicAdd( a_pointer, a_value ); + #else + ::atomicAdd( a_pointer, a_value ); + #endif + } + + __device__ void + AddNoReturn( double *a_pointer, double a_value ) + { + #if defined( __gfx90a__ ) + ::unsafeAtomicAdd( a_pointer, a_value ); + #else + ::atomicAdd( a_pointer, a_value ); + #endif + } + } // namespace LDS + + namespace GDS { + __device__ void + AddNoReturn( double 
*a_pointer, double a_value ) + { + #if defined( __gfx90a__ ) + ::unsafeAtomicAdd( a_pointer, a_value ); + #else + ::atomicAdd( a_pointer, a_value ); + #endif + } + } // namespace GDS + } // namespace atomic + + + template + __device__ void inline __attribute__((always_inline)) init_S0(const ComputeFloat delta, ComputeFloat *__restrict__ S0) + { + const ComputeFloat delta2 = delta * delta; + S0[0] = static_cast( 0.5 ) * ( delta2 - delta + static_cast( 0.25 ) ); + S0[1] = static_cast( 0.75 ) - delta2; + S0[2] = static_cast( 0.5 ) * ( delta2 + delta + static_cast( 0.25 ) ); + S0[3] = static_cast( 0.0 ) ; + } + + template + __device__ void inline __attribute__((always_inline)) init_S1(const ComputeFloat xpn, const int ipo, const int i_domain_begin, + ComputeFloat *__restrict__ S1) + { + // const int ip = static_cast( xpn + 0.5 ); // std::round | rounding approximation which is correct enough and faster in this case + const int ip = std::round( xpn ); + const int ip_m_ipo = ip - ipo - i_domain_begin; + const ComputeFloat delta = xpn - static_cast( ip ); + const ComputeFloat delta2 = delta * delta; + + S1[0] = static_cast( 0.0 ); + S1[1] = static_cast( 0.0 ); // S1[2] = 0.0; // Always set below + S1[3] = static_cast( 0.0 ); + S1[4] = static_cast( 0.0 ); + + S1[ip_m_ipo + 1] = static_cast( 0.5 ) * ( delta2 - delta + static_cast( 0.25 ) ); + S1[ip_m_ipo + 2] = static_cast( 0.75 ) - delta2; + S1[ip_m_ipo + 3] = static_cast( 0.5 ) * ( delta2 + delta + static_cast( 0.25 ) ); + } + + + template + __global__ void + // __launch_bounds__(kWorkgroupSize, 1) + DepositCurrentDensity_1D_Order2( double *__restrict__ device_Jx, + double *__restrict__ device_Jy, + double *__restrict__ device_Jz, + int Jx_size, + int Jy_size, + int Jz_size, + const double *__restrict__ device_particle_position_x, + const double *__restrict__ device_particle_momentum_y, + const double *__restrict__ device_particle_momentum_z, + const short *__restrict__ device_particle_charge, + const double *__restrict__ device_particle_weight, + const int *__restrict__ device_bin_index, + const double *__restrict__ device_invgf_, + const int *__restrict__ device_iold_, + const double *__restrict__ device_deltaold_, + ComputeFloat inv_cell_volume, + ComputeFloat dx_inv, + ComputeFloat dx_ov_dt, + int i_domain_begin, + int not_spectral_ ) + { + // TODO(Etienne M): refactor this function. Break it into smaller + // pieces (lds init/store, coeff computation, deposition etc..) + // TODO(Etienne M): __ldg could be used to slightly improve GDS load + // speed. This would only have an effect on Nvidia cards as this + // operation is a no op on AMD. + const unsigned int workgroup_size = kWorkgroupSize; // blockDim.x; + const unsigned int bin_count = gridDim.x; + const unsigned int loop_stride = workgroup_size; // This stride should enable better memory access coalescing + + const unsigned int x_cluster_coordinate = blockIdx.x; + const unsigned int workgroup_dedicated_bin_index = x_cluster_coordinate; + const unsigned int thread_index_offset = threadIdx.x; + + // The unit is the cell + const unsigned int global_x_scratch_space_coordinate_offset = x_cluster_coordinate * Params::getGPUClusterWidth( 1 /* 1D */ ); + const int GPUClusterWithGCWidth = Params::getGPUClusterWithGhostCellWidth( 1 /* 1D */, 2 /* 2nd order interpolation */ ); + + // NOTE: We gain from the particles not being sorted inside a + // cluster because it reduces the bank conflicts one gets when + // multiple threads access the same part of the shared memory. 
Such + // "conflicted" accesses are serialized ! + // NOTE: We use a bit to much LDS. For Jx, the first row could be + // discarded, for Jy we could remove the first column. + + static constexpr unsigned int kFieldScratchSpaceSize = Params::getGPUInterpolationClusterCellVolume( 1 /* 1D */, 2 /* 2nd order interpolation */ ); + + // kWorkgroupSize, bin_count, loop_stride, x_cluster_coordinate, workgroup_dedicated_bin_index, thread_index_offset, Params::getGPUClusterWidth(1), GPUClusterWithGCWidth, kFieldScratchSpaceSize, global_x_scratch_space_coordinate_offset); + // NOTE: I tried having only one cache and reusing it. Doing that + // requires you to iterate multiple time over the particle which is + // possible but cost more bandwidth. The speedup was ~x0.92. + __shared__ ReductionFloat Jx_scratch_space[kFieldScratchSpaceSize]; + __shared__ ReductionFloat Jy_scratch_space[kFieldScratchSpaceSize]; + __shared__ ReductionFloat Jz_scratch_space[kFieldScratchSpaceSize]; + + // Init the shared memory + + for( unsigned int field_index = thread_index_offset; + field_index < kFieldScratchSpaceSize; + field_index += workgroup_size ) { + Jx_scratch_space[field_index] = static_cast( 0.0 ); + Jy_scratch_space[field_index] = static_cast( 0.0 ); + Jz_scratch_space[field_index] = static_cast( 0.0 ); + } + + __syncthreads(); + + const unsigned int particle_count = device_bin_index[bin_count - 1]; + + // This workgroup has to process distance(last_particle, + // first_particle) particles + const unsigned int first_particle = workgroup_dedicated_bin_index == 0 ? 0 : device_bin_index[workgroup_dedicated_bin_index - 1]; + const unsigned int last_particle = device_bin_index[workgroup_dedicated_bin_index]; + + for( unsigned int particle_index = first_particle + thread_index_offset; + particle_index < last_particle; + particle_index += loop_stride ) { + const ComputeFloat invgf = static_cast( device_invgf_[particle_index] ); + const int *const __restrict__ iold = &device_iold_[particle_index]; + const double *const __restrict__ deltaold = &device_deltaold_[particle_index]; + + ComputeFloat Sx0[5]; + ComputeFloat Sx1[5]; + + // Variable declaration & initialization + // Esirkepov's paper: https://arxiv.org/pdf/physics/9901047.pdf + + // Locate the particle on the primal grid at former time-step & calculate coeff. S0 + { + const ComputeFloat delta = deltaold[0 * particle_count]; + const ComputeFloat delta2 = delta * delta; + + Sx0[0] = static_cast( 0.0 ); + Sx0[1] = static_cast( 0.5 ) * ( delta2 - delta + static_cast( 0.25 ) ); + Sx0[2] = static_cast( 0.75 ) - delta2; + Sx0[3] = static_cast( 0.5 ) * ( delta2 + delta + static_cast( 0.25 ) ); + Sx0[4] = static_cast( 0.0 ); + } + //init_S0(deltaold[0 * particle_count], Sx0); + //init_S0(deltaold[1 * particle_count], Sy0); + + // Locate the particle on the primal grid at current time-step & calculate coeff. 
S1 + { + // const int ip = static_cast( xpn + 0.5 ); // std::round | rounding approximation which is correct enough and faster in this case + const ComputeFloat xpn = static_cast( device_particle_position_x[particle_index] ) * dx_inv; + const int ip = std::round( xpn ); + const int ipo = iold[0 * particle_count]; + const int ip_m_ipo = ip - ipo - i_domain_begin; + const ComputeFloat delta = xpn - static_cast( ip ); + const ComputeFloat delta2 = delta * delta; + + Sx1[0] = static_cast( 0.0 ); + Sx1[1] = static_cast( 0.0 ); + // Sx1[2] = 0.0; // Always set below + Sx1[3] = static_cast( 0.0 ); + Sx1[4] = static_cast( 0.0 ); + + Sx1[ip_m_ipo + 1] = static_cast( 0.5 ) * ( delta2 - delta + static_cast( 0.25 ) ); + Sx1[ip_m_ipo + 2] = static_cast( 0.75 ) - delta2; + Sx1[ip_m_ipo + 3] = static_cast( 0.5 ) * ( delta2 + delta + static_cast( 0.25 ) ); + } + + // (x,y,z) components of the current density for the macro-particle + const ComputeFloat charge_weight = inv_cell_volume * static_cast( device_particle_charge[particle_index] ) * static_cast( device_particle_weight[particle_index] ); + const ComputeFloat crx_p = charge_weight * dx_ov_dt; + const ComputeFloat cry_p = charge_weight * static_cast( device_particle_momentum_y[particle_index] ) * invgf; + const ComputeFloat crz_p = charge_weight * static_cast( device_particle_momentum_z[particle_index] ) * invgf; + + // This is the particle position as grid index + // This minus 2 come from the order 2 scheme, based on a 5 points stencil from -2 to +2. + const int ipo = iold[0 * particle_count] - + 2 /* Offset so we dont uses negative numbers in the loop */ - + global_x_scratch_space_coordinate_offset /* Offset to get cluster relative coordinates */; + + // Jx + ComputeFloat tmpJx[5]{}; + for( unsigned int i = 1; i < 5; ++i ) { + const int iloc = i + ipo; + tmpJx[i] = tmpJx[i-1] + crx_p * (Sx0[i-1] - Sx1[i-1]); + atomic::LDS::AddNoReturn( &Jx_scratch_space[iloc], static_cast( tmpJx[i] ) ); + } + + // Jy + for( unsigned int i = 0; i < 5; ++i ) { + const int iloc = i + ipo; + tmpJx[i] = cry_p * 0.5 * (Sx0[i] - Sx1[i]); + atomic::LDS::AddNoReturn( &Jy_scratch_space[iloc], static_cast( tmpJx[i] ) ); + } + + // Jz + for( unsigned int i = 0; i < 5; ++i ) { + const int iloc = i + ipo; + tmpJx[i] = crz_p * 0.5 * (Sx0[i] - Sx1[i]); + atomic::LDS::AddNoReturn( &Jz_scratch_space[iloc], static_cast( tmpJx[i] ) ); + } + } // particle_index + + __syncthreads(); + + for( unsigned int field_index = thread_index_offset; field_index < kFieldScratchSpaceSize; field_index += workgroup_size ) { + const unsigned int local_x_scratch_space_coordinate = field_index % GPUClusterWithGCWidth; // /GPUClusterWithGCWidth + const unsigned int global_x_scratch_space_coordinate = global_x_scratch_space_coordinate_offset + local_x_scratch_space_coordinate; + + const unsigned int global_memory_index = global_x_scratch_space_coordinate; + const unsigned int scratch_space_index = field_index; // local_x_scratch_space_coordinate * GPUClusterWithGCWidth + local_y_scratch_space_coordinate; + + // These atomics are basically free (very few of them). 
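The Jx loop above is the 1D Esirkepov update: the current on each cell face is a running sum of crx_p * (Sx0 - Sx1), which enforces the discrete continuity equation between the old and new charge densities. A standalone check of that property (unit charge weight, dx = dt = 1, and both shapes chosen arbitrarily for a particle that stays in its cell):

// continuity_check_sketch.cpp -- illustrative only.
#include <cstdio>

int main()
{
    const double crx_p = 1.0;                 // stands in for charge_weight * dx_ov_dt
    double S0[5] = {}, S1[5] = {};
    auto shape = []( double d, double *S ) {  // same quadratic weights as the kernel
        const double d2 = d * d;
        S[1] = 0.5 * ( d2 - d + 0.25 );
        S[2] = 0.75 - d2;
        S[3] = 0.5 * ( d2 + d + 0.25 );
    };
    shape( -0.1, S0 );   // old position
    shape(  0.1, S1 );   // new position (moved by +0.2 cells)

    // Same running sum as the Jx loop: J[i] lives on the left face of cell i.
    double J[6] = {};
    for( int i = 1; i < 6; ++i ) {
        J[i] = J[i-1] + crx_p * ( S0[i-1] - S1[i-1] );
    }
    // Discrete continuity, cell by cell: (rho_new - rho_old) + (J_right - J_left) = 0
    for( int i = 0; i < 5; ++i ) {
        std::printf( "cell %d : dQ + dJ = % .3e\n", i,
                     ( S1[i] - S0[i] ) + ( J[i+1] - J[i] ) );
    }
    std::printf( "current leaving the 5-cell stencil: % .3e\n", J[5] );
}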
+ atomic::GDS::AddNoReturn( &device_Jx[global_memory_index], static_cast( Jx_scratch_space[scratch_space_index] ) ); + atomic::GDS::AddNoReturn( &device_Jy[global_memory_index + not_spectral_ * global_x_scratch_space_coordinate], static_cast( Jy_scratch_space[scratch_space_index] ) ); // We handle the FTDT/picsar + atomic::GDS::AddNoReturn( &device_Jz[global_memory_index], static_cast( Jz_scratch_space[scratch_space_index] ) ); + } + } // end DepositCurrent + + + template + __global__ void + // __launch_bounds__(kWorkgroupSize, 1) + DepositCurrentAndDensity_1D_Order2( double *__restrict__ device_Jx, + double *__restrict__ device_Jy, + double *__restrict__ device_Jz, + double *__restrict__ device_rho, + int Jx_size, + int Jy_size, + int Jz_size, + int rho_size, + const double *__restrict__ device_particle_position_x, + const double *__restrict__ device_particle_momentum_y, + const double *__restrict__ device_particle_momentum_z, + const short *__restrict__ device_particle_charge, + const double *__restrict__ device_particle_weight, + const int *__restrict__ device_bin_index, + const double *__restrict__ device_invgf_, + const int *__restrict__ device_iold_, + const double *__restrict__ device_deltaold_, + ComputeFloat inv_cell_volume, + ComputeFloat dx_inv, + ComputeFloat dx_ov_dt, + int i_domain_begin, + int not_spectral_ ) + { + // TODO(Etienne M): refactor this function. Break it into smaller + // pieces (lds init/store, coeff computation, deposition etc..) + // TODO(Etienne M): __ldg could be used to slightly improve GDS load + // speed. This would only have an effect on Nvidia cards as this + // operation is a no op on AMD. + const unsigned int workgroup_size = kWorkgroupSize; // blockDim.x; + const unsigned int bin_count = gridDim.x; + const unsigned int loop_stride = workgroup_size; // This stride should enable better memory access coalescing + + const unsigned int x_cluster_coordinate = blockIdx.x; + const unsigned int workgroup_dedicated_bin_index = x_cluster_coordinate ; + const unsigned int thread_index_offset = threadIdx.x; + + // The unit is the cell + const unsigned int global_x_scratch_space_coordinate_offset = x_cluster_coordinate * Params::getGPUClusterWidth( 1 /* 1D */ ); + + // NOTE: We gain from the particles not being sorted inside a + // cluster because it reduces the bank conflicts one gets when + // multiple threads access the same part of the shared memory. Such + // "conflicted" accesses are serialized ! + // NOTE: We use a bit to much LDS. For Jx, the first row could be + // discarded, for Jy we could remove the first column. + + const int GPUClusterWithGCWidth = Params::getGPUClusterWithGhostCellWidth( 1 /* 1D */, 2 /* 2nd order interpolation */ ); + static constexpr unsigned int kFieldScratchSpaceSize = Params::getGPUInterpolationClusterCellVolume( 1 /* 1D */, 2 /* 2nd order interpolation */ ); + + // NOTE: I tried having only one cache and reusing it. Doing that + // requires you to iterate multiple time over the particle which is + // possible but cost more bandwidth. The speedup was ~x0.92. 
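Both kernels use the same cluster decomposition: one GPU block per particle bin, with the cumulative particle counts stored in device_bin_index, so each block recovers its particle range from at most two reads. A host-side sketch of that mapping (the bin contents below are invented for the example):

// bin_mapping_sketch.cpp -- illustrative only.
#include <cstdio>

int main()
{
    // Assumed layout, as described in the kernel comments: bin_index holds the
    // cumulative particle count per cluster, so the last entry is the total.
    const int bin_index[4] = { 3, 3, 8, 10 };   // cluster 1 is empty, 10 particles total
    const int bin_count    = 4;

    for( int block = 0; block < bin_count; ++block ) {   // one GPU block per cluster
        const int first = block == 0 ? 0 : bin_index[block - 1];
        const int last  = bin_index[block];
        std::printf( "block %d handles particles [%d, %d)\n", block, first, last );
        // In the kernel, the block's threads then stride over [first, last)
        // with particle_index = first + threadIdx.x, stepping by blockDim.x.
    }
}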
+ __shared__ ReductionFloat Jx_scratch_space[kFieldScratchSpaceSize]; + __shared__ ReductionFloat Jy_scratch_space[kFieldScratchSpaceSize]; + __shared__ ReductionFloat Jz_scratch_space[kFieldScratchSpaceSize]; + __shared__ ReductionFloat rho_scratch_space[kFieldScratchSpaceSize]; + + // Init the shared memory + + for( unsigned int field_index = thread_index_offset; + field_index < kFieldScratchSpaceSize; + field_index += workgroup_size ) { + Jx_scratch_space[field_index] = static_cast( 0.0 ); + Jy_scratch_space[field_index] = static_cast( 0.0 ); + Jz_scratch_space[field_index] = static_cast( 0.0 ); + rho_scratch_space[field_index] = static_cast( 0.0 ); + } + + __syncthreads(); + + const unsigned int particle_count = device_bin_index[bin_count - 1]; + + // This workgroup has to process distance(last_particle, + // first_particle) particles + const unsigned int first_particle = workgroup_dedicated_bin_index == 0 ? 0 : device_bin_index[workgroup_dedicated_bin_index - 1]; + const unsigned int last_particle = device_bin_index[workgroup_dedicated_bin_index]; + + for( unsigned int particle_index = first_particle + thread_index_offset; + particle_index < last_particle; + particle_index += loop_stride ) { + const ComputeFloat invgf = static_cast( device_invgf_[particle_index] ); + const int *const __restrict__ iold = &device_iold_[particle_index]; + const double *const __restrict__ deltaold = &device_deltaold_[particle_index]; + + ComputeFloat Sx0[5]; + ComputeFloat Sx1[5]; + + // Variable declaration & initialization + // Esirkepov's paper: https://arxiv.org/pdf/physics/9901047.pdf + + // Locate the particle on the primal grid at former time-step & calculate coeff. S0 + { + const ComputeFloat delta = deltaold[0 * particle_count]; + const ComputeFloat delta2 = delta * delta; + + Sx0[0] = static_cast( 0.0 ); + Sx0[1] = static_cast( 0.5 ) * ( delta2 - delta + static_cast( 0.25 ) ); + Sx0[2] = static_cast( 0.75 ) - delta2; + Sx0[3] = static_cast( 0.5 ) * ( delta2 + delta + static_cast( 0.25 ) ); + Sx0[4] = static_cast( 0.0 ); + } + + // Locate the particle on the primal grid at current time-step & calculate coeff. 
S1 + { + // const int ip = static_cast( xpn + 0.5 ); // std::round | rounding approximation which is correct enough and faster in this case + const ComputeFloat xpn = static_cast( device_particle_position_x[particle_index] ) * dx_inv; + const int ip = std::round( xpn ); + const int ipo = iold[0 * particle_count]; + const int ip_m_ipo = ip - ipo - i_domain_begin; + const ComputeFloat delta = xpn - static_cast( ip ); + const ComputeFloat delta2 = delta * delta; + + Sx1[0] = static_cast( 0.0 ); + Sx1[1] = static_cast( 0.0 ); + // Sx1[2] = 0.0; // Always set below + Sx1[3] = static_cast( 0.0 ); + Sx1[4] = static_cast( 0.0 ); + + Sx1[ip_m_ipo + 1] = static_cast( 0.5 ) * ( delta2 - delta + static_cast( 0.25 ) ); + Sx1[ip_m_ipo + 2] = static_cast( 0.75 ) - delta2; + Sx1[ip_m_ipo + 3] = static_cast( 0.5 ) * ( delta2 + delta + static_cast( 0.25 ) ); + } + + // (x,y,z) components of the current density for the macro-particle + const ComputeFloat charge_weight = inv_cell_volume * static_cast( device_particle_charge[particle_index] ) * static_cast( device_particle_weight[particle_index] ); + const ComputeFloat crx_p = charge_weight * dx_ov_dt; + const ComputeFloat cry_p = charge_weight * static_cast( device_particle_momentum_y[particle_index] ) * invgf; + const ComputeFloat crz_p = charge_weight * static_cast( device_particle_momentum_z[particle_index] ) * invgf; + + // This is the particle position as grid index + // This minus 2 come from the order 2 scheme, based on a 5 points stencil from -2 to +2. + const int ipo = iold[0 * particle_count] - + 2 /* Offset so we dont uses negative numbers in the loop */ - + global_x_scratch_space_coordinate_offset /* Offset to get cluster relative coordinates */; + + // Jx + ComputeFloat tmpJx[5]{}; + for( unsigned int i = 1; i < 5; ++i ) { + const int iloc = i + ipo; + tmpJx[i] = tmpJx[i-1] + crx_p * (Sx0[i-1] - Sx1[i-1]); + atomic::LDS::AddNoReturn( &Jx_scratch_space[iloc], static_cast( tmpJx[i] ) ); + } + + // Jy + for( unsigned int i = 0; i < 5; ++i ) { + const int iloc = i + ipo; + tmpJx[i] = cry_p * 0.5 * (Sx0[i] - Sx1[i]); + atomic::LDS::AddNoReturn( &Jy_scratch_space[iloc], static_cast( tmpJx[i] ) ); + } + + // Jz + for( unsigned int i = 0; i < 5; ++i ) { + const int iloc = i + ipo; + tmpJx[i] = crz_p * 0.5 * (Sx0[i] - Sx1[i]); + atomic::LDS::AddNoReturn( &Jz_scratch_space[iloc], static_cast( tmpJx[i] ) ); + } + + // Rho + for( unsigned int i = 0; i < 5; ++i ) { + const int iloc = i + ipo; + atomic::LDS::AddNoReturn( &rho_scratch_space[iloc], static_cast( charge_weight * Sx1[i] ) ); + } + + // improvements ideas: 1. unrolling to reduce the size of Sx0 and Sx1 + // 2. 
combine the loops + + /* + // + { + //ComputeFloat tmp = 0.5 * (Sx0[0] - Sx1[0]); // = - 0.5 * Sx1[0] + atomic::LDS::AddNoReturn( &Jy_scratch_space[ipo], static_cast( -cry_p * 0.5 * Sx1[0] ) ); + atomic::LDS::AddNoReturn( &Jz_scratch_space[ipo], static_cast( -crz_p * 0.5 * Sx1[0] ) ); + atomic::LDS::AddNoReturn( &rho_scratch_space[ipo], static_cast( charge_weight * Sx1[0] ) ); + }*/ + /*for( unsigned int i = 1; i < 4; ++i ) { + const int iloc = i + ipo; + tmpJx[i] = tmpJx[i-1] + crx_p * (Sx0[i-1] - Sx1[i-1]); + ComputeFloat tmp = 0.5 * (Sx0[i] - Sx1[i]); + atomic::LDS::AddNoReturn( &Jx_scratch_space[iloc], static_cast( tmpJx[i] ) ); + atomic::LDS::AddNoReturn( &Jy_scratch_space[iloc], static_cast( cry_p * tmp ) ); + atomic::LDS::AddNoReturn( &Jz_scratch_space[iloc], static_cast( crz_p * tmp ) ); + atomic::LDS::AddNoReturn( &rho_scratch_space[iloc], static_cast( charge_weight * Sx1[i] ) ); + }*/ + /* i=4 + { + const int iloc = i + ipo; + tmpJx[4] = tmpJx[3] + crx_p * (Sx0[i-1] - Sx1[i-1]); // can save some registers by tmpJx[0] instead of tmpJx[4] ? reducing its size from 5 to 4? + //ComputeFloat tmp = 0.5 * (Sx0[4] - Sx1[4]); // = -0.5 * Sx1[4] + atomic::LDS::AddNoReturn( &Jx_scratch_space[iloc], static_cast( tmpJx[i] ) ); + atomic::LDS::AddNoReturn( &Jy_scratch_space[iloc], static_cast( -cry_p * 0.5 * Sx1[4] ) ); //null + atomic::LDS::AddNoReturn( &Jz_scratch_space[iloc], static_cast( -crz_p * 0.5 * Sx1[4] ) ); //null + atomic::LDS::AddNoReturn( &rho_scratch_space[iloc], static_cast( charge_weight * Sx1[4] ) ); //null + } + + + */ + + } // particle_index + + __syncthreads(); + + for( unsigned int field_index = thread_index_offset; + field_index < kFieldScratchSpaceSize; + field_index += workgroup_size ) { + + const unsigned int local_x_scratch_space_coordinate = field_index % GPUClusterWithGCWidth; + const unsigned int global_x_scratch_space_coordinate = global_x_scratch_space_coordinate_offset + local_x_scratch_space_coordinate; + + const unsigned int global_memory_index = global_x_scratch_space_coordinate; + const unsigned int scratch_space_index = field_index; // local_x_scratch_space_coordinate * GPUClusterWithGCWidth + local_y_scratch_space_coordinate; + + // These atomics are basically free (very few of them). 
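Both deposition kernels follow the two-level reduction the comments above describe: accumulate into per-block __shared__ scratch arrays with shared-memory atomics, synchronize, then flush each scratch cell to global memory with a single global atomic. A heavily simplified CUDA sketch of that pattern (all names, sizes and the p % tile cell mapping are invented for illustration; it needs native double-precision atomicAdd, e.g. nvcc -arch=sm_60):

// lds_reduction_sketch.cu -- illustrative only.
#include <cstdio>
#include <cuda_runtime.h>

// One block owns one tile of the output array: threads first accumulate into
// fast shared memory, synchronize, then flush the tile with few global atomics.
__global__ void depositTiled( double *grid_J, const double *values, const int *counts, int tile )
{
    extern __shared__ double scratch[];
    for( int i = threadIdx.x; i < tile; i += blockDim.x ) scratch[i] = 0.0;
    __syncthreads();

    const int first = blockIdx.x == 0 ? 0 : counts[blockIdx.x - 1];
    const int last  = counts[blockIdx.x];
    for( int p = first + threadIdx.x; p < last; p += blockDim.x ) {
        atomicAdd( &scratch[p % tile], values[p] );          // shared-memory atomic
    }
    __syncthreads();

    for( int i = threadIdx.x; i < tile; i += blockDim.x ) {
        atomicAdd( &grid_J[blockIdx.x * tile + i], scratch[i] );   // global atomic
    }
}

int main()
{
    const int tile = 8, blocks = 2, n = 10;
    double *g, *v; int *c;
    cudaMallocManaged( &g, blocks * tile * sizeof( double ) );
    cudaMallocManaged( &v, n * sizeof( double ) );
    cudaMallocManaged( &c, blocks * sizeof( int ) );
    for( int i = 0; i < blocks * tile; ++i ) g[i] = 0.0;
    for( int i = 0; i < n; ++i ) v[i] = 1.0;
    c[0] = 6; c[1] = 10;                        // cumulative particle counts per block
    depositTiled<<<blocks, 64, tile * sizeof( double )>>>( g, v, c, tile );
    cudaDeviceSynchronize();
    std::printf( "g[0]=%g g[8]=%g\n", g[0], g[8] );
    cudaFree( g ); cudaFree( v ); cudaFree( c );
}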
+ atomic::GDS::AddNoReturn( &device_Jx[global_memory_index], static_cast( Jx_scratch_space[scratch_space_index] ) ); + atomic::GDS::AddNoReturn( &device_Jy[global_memory_index + /* We handle the FTDT/picsar */ not_spectral_ * global_x_scratch_space_coordinate], static_cast( Jy_scratch_space[scratch_space_index] ) ); + atomic::GDS::AddNoReturn( &device_Jz[global_memory_index], static_cast( Jz_scratch_space[scratch_space_index] ) ); + atomic::GDS::AddNoReturn( &device_rho[global_memory_index], static_cast( rho_scratch_space[scratch_space_index] ) ); + } + } + } // namespace kernel + + + //static inline + void + currentDepositionKernel1D( double *__restrict__ host_Jx, + double *__restrict__ host_Jy, + double *__restrict__ host_Jz, + int Jx_size, + int Jy_size, + int Jz_size, + const double *__restrict__ device_particle_position_x, + const double *__restrict__ device_particle_momentum_y, + const double *__restrict__ device_particle_momentum_z, + const short *__restrict__ device_particle_charge, + const double *__restrict__ device_particle_weight, + const int *__restrict__ host_bin_index, + unsigned int x_dimension_bin_count, + const double *__restrict__ host_invgf_, + const int *__restrict__ host_iold_, + const double *__restrict__ host_deltaold_, + double inv_cell_volume, + double dx_inv, + double dx_ov_dt, + int i_domain_begin, + int not_spectral_ ) + { + SMILEI_ASSERT( Params::getGPUClusterWidth( 1 /* 1D */ ) != -1 && + Params::getGPUClusterGhostCellBorderWidth( 2 /* 2nd order interpolation */ ) != -1 ); + + // NOTE: + // This cluster is very strongly bound by atomic operations in LDS (shared memory) + // TODO(Etienne M): Find a way to lessen the atomic usage + + const ::dim3 kGridDimension { static_cast( x_dimension_bin_count ), 1, 1 }; + + static constexpr std::size_t kWorkgroupSize = 128; + const ::dim3 kBlockDimension{ static_cast( kWorkgroupSize ), 1, 1 }; + + // NOTE: On cards lacking hardware backed Binary64 atomic operations, + // falling back to Binary32 (supposing hardware support for atomic + // operations) can lead to drastic performance improvement. + // One just need to assign 'float' to ReductionFloat. 
+ // + using ComputeFloat = double; + using ReductionFloat = double; + + auto KernelFunction = kernel::DepositCurrentDensity_1D_Order2; +#if defined ( __HIP__ ) + hipLaunchKernelGGL( KernelFunction, + kGridDimension, + kBlockDimension, + 0, // Shared memory + 0, // Stream + // Kernel arguments + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_Jx ), + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_Jy ), + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_Jz ), + Jx_size, Jy_size, Jz_size, + device_particle_position_x, + device_particle_momentum_y, + device_particle_momentum_z, + device_particle_charge, + device_particle_weight, + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_bin_index ), + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_invgf_ ), + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_iold_ ), + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_deltaold_ ), + inv_cell_volume, + dx_inv, + dx_ov_dt, + i_domain_begin, + not_spectral_ ); + + checkHIPErrors( ::hipDeviceSynchronize() ); +#elif defined ( __NVCC__ ) + KernelFunction <<< + kGridDimension, + kBlockDimension, + 0, // Shared memory + 0 // Stream + >>> + ( + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_Jx ), + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_Jy ), + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_Jz ), + Jx_size, Jy_size, Jz_size, + device_particle_position_x, + device_particle_momentum_y, + device_particle_momentum_z, + device_particle_charge, + device_particle_weight, + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_bin_index ), + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_invgf_ ), + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_iold_ ), + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_deltaold_ ), + inv_cell_volume, + dx_inv, + dx_ov_dt, + i_domain_begin, + not_spectral_ + ); + checkHIPErrors( ::cudaDeviceSynchronize() ); +#endif + } + + //static inline + void + currentAndDensityDepositionKernel1D( double *__restrict__ host_Jx, + double *__restrict__ host_Jy, + double *__restrict__ host_Jz, + double *__restrict__ host_rho, + int Jx_size, + int Jy_size, + int Jz_size, + int rho_size, + const double *__restrict__ device_particle_position_x, + const double *__restrict__ device_particle_momentum_y, + const double *__restrict__ device_particle_momentum_z, + const short *__restrict__ device_particle_charge, + const double *__restrict__ device_particle_weight, + const int *__restrict__ host_bin_index, + unsigned int x_dimension_bin_count, + const double *__restrict__ host_invgf_, + const int *__restrict__ host_iold_, + const double *__restrict__ host_deltaold_, + double inv_cell_volume, + double dx_inv, + double dx_ov_dt, + int i_domain_begin, + int not_spectral_ ) + { + // & because one 1D ; 2 because of 2nd order interpolation + SMILEI_ASSERT( Params::getGPUClusterWidth( 1 ) != -1 && + Params::getGPUClusterGhostCellBorderWidth( 2 ) != -1 ); + + const ::dim3 kGridDimension { static_cast( x_dimension_bin_count ), 1, 1 }; + + static constexpr std::size_t kWorkgroupSize = 128; + const ::dim3 kBlockDimension{ static_cast( kWorkgroupSize ), 1, 1 }; + + // NOTE: On cards lacking hardware backed Binary64 atomic operations, + // falling back to Binary32 (supposing hardware support for atomic + 
// operations) can lead to drastic performance improvement. + // One just need to assign 'float' to ReductionFloat. + // + using ComputeFloat = double; + using ReductionFloat = double; + auto KernelFunction = kernel::DepositCurrentAndDensity_1D_Order2; +#if defined ( __HIP__ ) + hipLaunchKernelGGL( KernelFunction, + kGridDimension, + kBlockDimension, + 0, // Shared memory + 0, // Stream + // Kernel arguments + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_Jx ), + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_Jy ), + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_Jz ), + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_rho ), + Jx_size, Jy_size, Jz_size, rho_size, + device_particle_position_x, + device_particle_momentum_y, + device_particle_momentum_z, + device_particle_charge, + device_particle_weight, + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_bin_index ), + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_invgf_ ), + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_iold_ ), + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_deltaold_ ), + inv_cell_volume, + dx_inv, + dx_ov_dt, + i_domain_begin, + not_spectral_ ); + + checkHIPErrors( ::hipDeviceSynchronize() ); +#elif defined ( __NVCC__ ) + KernelFunction <<< + kGridDimension, + kBlockDimension, + 0, // Shared memory + 0 // Stream + >>> + ( + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_Jx ), + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_Jy ), + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_Jz ), + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_rho ), + Jx_size, Jy_size, Jz_size, rho_size, + device_particle_position_x, + device_particle_momentum_y, + device_particle_momentum_z, + device_particle_charge, + device_particle_weight, + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_bin_index ), + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_invgf_ ), + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_iold_ ), + smilei::tools::gpu::HostDeviceMemoryManagement::GetDevicePointer( host_deltaold_ ), + inv_cell_volume, + dx_inv, + dx_ov_dt, + i_domain_begin, + not_spectral_ + ); + checkHIPErrors( ::cudaDeviceSynchronize() ); +#endif + } + +} // namespace cudahip1D + + diff --git a/src/Projector/Projector1D2OrderGPUKernelCUDAHIP.h b/src/Projector/Projector1D2OrderGPUKernelCUDAHIP.h new file mode 100755 index 000000000..f5e64e408 --- /dev/null +++ b/src/Projector/Projector1D2OrderGPUKernelCUDAHIP.h @@ -0,0 +1,71 @@ +//! 
HIP CUDA implementation + +#ifndef Projector1D2OrderGPUKernelCUDAHIP_H +#define Projector1D2OrderGPUKernelCUDAHIP_H + +#if defined( SMILEI_ACCELERATOR_GPU ) + +#if defined( __HIP__ ) + #include +#elif defined( __NVCC__ ) + #include + #include +#endif + +#include "Params.h" +#include "gpu.h" + +namespace cudahip1d { + +void currentDepositionKernel1D( double *__restrict__ host_Jx, + double *__restrict__ host_Jy, + double *__restrict__ host_Jz, + int Jx_size, + int Jy_size, + int Jz_size, + const double *__restrict__ device_particle_position_x, + const double *__restrict__ device_particle_momentum_y, + const double *__restrict__ device_particle_momentum_z, + const short *__restrict__ device_particle_charge, + const double *__restrict__ device_particle_weight, + const int *__restrict__ host_bin_index, + unsigned int x_dimension_bin_count, + const double *__restrict__ host_invgf_, + const int *__restrict__ host_iold_, + const double *__restrict__ host_deltaold_, + double inv_cell_volume, + double dx_inv, + double dx_ov_dt, + int i_domain_begin, + int not_spectral_ ); + +void currentAndDensityDepositionKernel1D( + double *__restrict__ host_Jx, + double *__restrict__ host_Jy, + double *__restrict__ host_Jz, + double *__restrict__ host_rho, + int Jx_size, + int Jy_size, + int Jz_size, + int rho_size, + const double *__restrict__ device_particle_position_x, + const double *__restrict__ device_particle_momentum_y, + const double *__restrict__ device_particle_momentum_z, + const short *__restrict__ device_particle_charge, + const double *__restrict__ device_particle_weight, + const int *__restrict__ host_bin_index, + unsigned int x_dimension_bin_count, + const double *__restrict__ host_invgf_, + const int *__restrict__ host_iold_, + const double *__restrict__ host_deltaold_, + double inv_cell_volume, + double dx_inv, + double dx_ov_dt, + int i_domain_begin, + int not_spectral_ ); + +} // namespace cudahip1d + +#endif +#endif + diff --git a/src/Projector/Projector1D4Order.cpp b/src/Projector/Projector1D4Order.cpp index e78ddea67..ea4eafa4a 100755 --- a/src/Projector/Projector1D4Order.cpp +++ b/src/Projector/Projector1D4Order.cpp @@ -19,11 +19,11 @@ Projector1D4Order::Projector1D4Order( Params ¶ms, Patch *patch ) : Projector1D( params, patch ) { dx_inv_ = 1.0/params.cell_length[0]; - dx_ov_dt = params.cell_length[0] / params.timestep; + dx_ov_dt_ = params.cell_length[0] / params.timestep; //double defined for use in coefficients - index_domain_begin = patch->getCellStartingGlobalIndex( 0 ); + i_domain_begin_ = patch->getCellStartingGlobalIndex( 0 ); DEBUG( "cell_length "<< params.cell_length[0] ); @@ -43,7 +43,7 @@ void Projector1D4Order::currents( double *Jx, double *Jy, double *Jz, Particles int ip_m_ipo; double charge_weight = inv_cell_volume * ( double )( particles.charge( ipart ) )*particles.weight( ipart ); double xjn, xj_m_xipo, xj_m_xipo2, xj_m_xipo3, xj_m_xipo4, xj_m_xip, xj_m_xip2, xj_m_xip3, xj_m_xip4; - double crx_p = charge_weight*dx_ov_dt; // current density for particle moving in the x-direction + double crx_p = charge_weight*dx_ov_dt_; // current density for particle moving in the x-direction double cry_p = charge_weight*particles.momentum( 1, ipart )*invgf; // current density in the y-direction of the macroparticle double crz_p = charge_weight*particles.momentum( 2, ipart )*invgf; // current density allow the y-direction of the macroparticle double S0[7], S1[7], Wl[7], Wt[7], Jx_p[7]; // arrays used for the Esirkepov projection method @@ -82,7 +82,7 @@ void Projector1D4Order::currents( 
double *Jx, double *Jy, double *Jz, Particles // coefficients 2nd order interpolation on 5 nodes ipo = *iold; // index of the central node - ip_m_ipo = ip-ipo-index_domain_begin; + ip_m_ipo = ip-ipo-i_domain_begin_; S1[ip_m_ipo+1] = dble_1_ov_384 - dble_1_ov_48 * xj_m_xip + dble_1_ov_16 * xj_m_xip2 - dble_1_ov_12 * xj_m_xip3 + dble_1_ov_24 * xj_m_xip4; S1[ip_m_ipo+2] = dble_19_ov_96 - dble_11_ov_24 * xj_m_xip + dble_1_ov_4 * xj_m_xip2 + dble_1_ov_6 * xj_m_xip3 - dble_1_ov_6 * xj_m_xip4; @@ -125,7 +125,7 @@ void Projector1D4Order::currentsAndDensity( double *Jx, double *Jy, double *Jz, int ip_m_ipo; double charge_weight = inv_cell_volume * ( double )( particles.charge( ipart ) )*particles.weight( ipart ); double xjn, xj_m_xipo, xj_m_xipo2, xj_m_xipo3, xj_m_xipo4, xj_m_xip, xj_m_xip2, xj_m_xip3, xj_m_xip4; - double crx_p = charge_weight*dx_ov_dt; // current density for particle moving in the x-direction + double crx_p = charge_weight*dx_ov_dt_; // current density for particle moving in the x-direction double cry_p = charge_weight*particles.momentum( 1, ipart )*invgf; // current density in the y-direction of the macroparticle double crz_p = charge_weight*particles.momentum( 2, ipart )*invgf; // current density allow the y-direction of the macroparticle double S0[7], S1[7], Wl[7], Wt[7], Jx_p[7]; // arrays used for the Esirkepov projection method @@ -164,7 +164,7 @@ void Projector1D4Order::currentsAndDensity( double *Jx, double *Jy, double *Jz, // coefficients 2nd order interpolation on 5 nodes ipo = *iold; // index of the central node - ip_m_ipo = ip-ipo-index_domain_begin; + ip_m_ipo = ip-ipo-i_domain_begin_; S1[ip_m_ipo+1] = dble_1_ov_384 - dble_1_ov_48 * xj_m_xip + dble_1_ov_16 * xj_m_xip2 - dble_1_ov_12 * xj_m_xip3 + dble_1_ov_24 * xj_m_xip4; S1[ip_m_ipo+2] = dble_19_ov_96 - dble_11_ov_24 * xj_m_xip + dble_1_ov_4 * xj_m_xip2 + dble_1_ov_6 * xj_m_xip3 - dble_1_ov_6 * xj_m_xip4; @@ -253,7 +253,7 @@ void Projector1D4Order::basic( double *rhoj, Particles &particles, unsigned int S1[4] = dble_19_ov_96 + dble_11_ov_24 * xj_m_xip + dble_1_ov_4 * xj_m_xip2 - dble_1_ov_6 * xj_m_xip3 - dble_1_ov_6 * xj_m_xip4; S1[5] = dble_1_ov_384 + dble_1_ov_48 * xj_m_xip + dble_1_ov_16 * xj_m_xip2 + dble_1_ov_12 * xj_m_xip3 + dble_1_ov_24 * xj_m_xip4; - ip -= index_domain_begin + 3 + bin_shift ; + ip -= i_domain_begin_ + 3 + bin_shift ; // 4th order projection for the charge density // At the 4th order, oversize = 3. 
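For reference, the Projector1D4Order hunks above only rename members (dx_ov_dt to dx_ov_dt_, index_domain_begin to i_domain_begin_); the 4th-order weights themselves are unchanged. They span five primal nodes and, like the 2nd-order ones, sum to one. A quick standalone check (the centre coefficient below is the standard quartic B-spline value; it sits outside the lines quoted in this hunk and is assumed here):

// shape_order4_sketch.cpp -- illustrative only.
#include <cstdio>

int main()
{
    const double d = 0.31, d2 = d*d, d3 = d2*d, d4 = d2*d2;   // arbitrary offset in [-0.5, 0.5]

    // The five 4th-order weights; the outer four match the dble_* constants
    // used by Projector1D4Order.
    const double S[5] = {
        1.0/384.0  - d/48.0       + d2/16.0    - d3/12.0 + d4/24.0,
        19.0/96.0  - 11.0*d/24.0  + d2/4.0     + d3/6.0  - d4/6.0,
        115.0/192.0               - 5.0*d2/8.0            + d4/4.0,
        19.0/96.0  + 11.0*d/24.0  + d2/4.0     - d3/6.0  - d4/6.0,
        1.0/384.0  + d/48.0       + d2/16.0    + d3/12.0 + d4/24.0,
    };

    double sum = 0.0;
    for( double s : S ) sum += s;
    std::printf( "sum of 4th-order weights = %.15f\n", sum );   // 1 up to rounding
}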
@@ -299,7 +299,7 @@ void Projector1D4Order::ionizationCurrents( Field *Jx, Field *Jy, Field *Jz, Par xjmxi3 = xjmxi2*xjmxi; // cube xjmxi4 = xjmxi2*xjmxi2; // fourth-power - i -= index_domain_begin; + i -= i_domain_begin_; im2 = i-2; im1 = i-1; ip1 = i+1; @@ -326,7 +326,7 @@ void Projector1D4Order::ionizationCurrents( Field *Jx, Field *Jy, Field *Jz, Par xjmxi = xjn - ( double )i; // normalized distance to the nearest grid point xjmxi2 = xjmxi*xjmxi; // square of the normalized distance to the nearest grid point - i -= index_domain_begin; + i -= i_domain_begin_; im2 = i-2; im1 = i-1; ip1 = i+1; @@ -476,7 +476,7 @@ void Projector1D4Order::ionizationCurrentsForTasks( double *b_Jx, double *b_Jy, Sxd[3] = dble_19_ov_96 + dble_11_ov_24 * xpmxid + dble_1_ov_4 * xpmxid2 - dble_1_ov_6 * xpmxid3 - dble_1_ov_6 * xpmxid4; Sxd[4] = dble_1_ov_384 + dble_1_ov_48 * xpmxid + dble_1_ov_16 * xpmxid2 + dble_1_ov_12 * xpmxid3 + dble_1_ov_24 * xpmxid4; - ip -= index_domain_begin+bin_shift; + ip -= i_domain_begin_+bin_shift; // id -= i_domain_begin; for (unsigned int i=0 ; i<5 ; i++) { diff --git a/src/Projector/Projector1D4Order.h b/src/Projector/Projector1D4Order.h index 6cd570d62..3ef38a7c7 100755 --- a/src/Projector/Projector1D4Order.h +++ b/src/Projector/Projector1D4Order.h @@ -33,7 +33,6 @@ class Projector1D4Order : public Projector1D void susceptibility( ElectroMagn *EMfields, Particles &particles, double species_mass, SmileiMPI *smpi, int istart, int iend, int ithread, int icell = 0, int ipart_ref = 0 ) override final; private: - double dx_ov_dt; static constexpr double dble_1_ov_384 = 1.0/384.0; static constexpr double dble_1_ov_48 = 1.0/48.0 ; static constexpr double dble_1_ov_16 = 1.0/16.0 ; diff --git a/src/Projector/Projector2D2OrderGPU.cpp b/src/Projector/Projector2D2OrderGPU.cpp index cfe20eb7d..a91a29dde 100755 --- a/src/Projector/Projector2D2OrderGPU.cpp +++ b/src/Projector/Projector2D2OrderGPU.cpp @@ -21,12 +21,12 @@ Projector2D2OrderGPU::Projector2D2OrderGPU( Params ¶meters, Patch *a_patch ) // initialize it's member variable) we better initialize // Projector2D2OrderGPU's member variable after explicitly initializing // Projector2D. - not_spectral = !parameters.is_pxr; + not_spectral_ = !parameters.is_pxr; dt = parameters.timestep; dts2 = dt / 2.0; dts4 = dts2 / 2.0; -#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined ( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined ( SMILEI_ACCELERATOR_GPU_OACC ) // When sorting is disabled, these values are invalid (-1) and the HIP // implementation can't be used. 
x_dimension_bin_count_ = parameters.getGPUBinCount( 1 ); @@ -41,7 +41,7 @@ Projector2D2OrderGPU::~Projector2D2OrderGPU() // EMPTY } -#if defined( SMILEI_ACCELERATOR_MODE ) //SMILEI_ACCELERATOR_GPU_OMP ) +#if defined( SMILEI_ACCELERATOR_GPU ) //SMILEI_ACCELERATOR_GPU_OMP ) extern "C" void currentDepositionKernel2DOnDevice( double *__restrict__ Jx, @@ -72,7 +72,7 @@ currentDepositionKernel2DOnDevice( double *__restrict__ Jx, int not_spectral ); extern "C" void -currentAndDensityDepositionKernelOnDevice( double *__restrict__ Jx, +currentAndDensityDepositionKernel2DOnDevice( double *__restrict__ Jx, double *__restrict__ Jy, double *__restrict__ Jz, double *__restrict__ rho, @@ -109,6 +109,7 @@ namespace { // Unnamed namespace == static == internal linkage == no exported sy /// Project global current densities (EMfields->Jx_/Jy_/Jz_) /// /* inline */ void +#if defined( SMILEI_ACCELERATOR_GPU )//SMILEI_ACCELERATOR_GPU_OMP ) currents( double *__restrict__ Jx, double *__restrict__ Jy, double *__restrict__ Jz, @@ -132,7 +133,6 @@ namespace { // Unnamed namespace == static == internal linkage == no exported sy double, int not_spectral ) { -#if defined( SMILEI_ACCELERATOR_MODE )//SMILEI_ACCELERATOR_GPU_OMP ) currentDepositionKernel2DOnDevice( Jx, Jy, Jz, @@ -159,15 +159,22 @@ namespace { // Unnamed namespace == static == internal linkage == no exported sy j_domain_begin, nprimy, not_spectral ); + } #else + currents( double *__restrict__ , double *__restrict__ , double *__restrict__ , int, int, int, + Particles &, unsigned int , unsigned int ,const double *__restrict__ , + const int *__restrict__ , const double *__restrict__ , double , double , double , + double , double , int , int , int , double, int ) + { SMILEI_ASSERT( false ); -#endif } +#endif /// Like currents(), project the particle current on the grid (Jx_/Jy_/Jz_) /// but also compute global current densities rho used for diagFields timestep /// /* inline */ void +#if defined( SMILEI_ACCELERATOR_GPU )//SMILEI_ACCELERATOR_GPU_OMP ) currentsAndDensity( double *__restrict__ Jx, double *__restrict__ Jy, double *__restrict__ Jz, @@ -193,8 +200,7 @@ namespace { // Unnamed namespace == static == internal linkage == no exported sy double, int not_spectral ) { -#if defined( SMILEI_ACCELERATOR_MODE )//SMILEI_ACCELERATOR_GPU_OMP ) - currentAndDensityDepositionKernelOnDevice( Jx, + currentAndDensityDepositionKernel2DOnDevice( Jx, Jy, Jz, rho, @@ -222,10 +228,16 @@ namespace { // Unnamed namespace == static == internal linkage == no exported sy j_domain_begin, nprimy, not_spectral ); + } #else + currentsAndDensity( double *__restrict__ , double *__restrict__ , double *__restrict__ , double *__restrict__ , + int , int , int , int , Particles &, unsigned int , unsigned int , + const double *__restrict__ , const int *__restrict__ , const double *__restrict__ , + double , double , double , double , double , int , int , int , double, int ) + { SMILEI_ASSERT( false ); -#endif } +#endif } // namespace @@ -233,7 +245,7 @@ void Projector2D2OrderGPU::basic( double *rhoj, Particles &particles, unsigned int ipart, unsigned int type, - int bin_shift ) + int /*bin_shift*/ ) { // Warning : this function is used for frozen species only. It is assumed that position = position_old !!! 
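The restructuring of the currents() / currentsAndDensity() helpers above moves the #if guard outside the whole function: a GPU build compiles the real body, while every other build compiles a stub with the same signature that only asserts, so call sites keep compiling and linking. A minimal sketch of that pattern (macro and function names are placeholders, not the Smilei ones):

// guard_pattern_sketch.cpp -- illustrative only.
#include <cassert>

#if defined( EXAMPLE_ACCELERATOR_GPU )
static void deposit( double *J, int size )
{
    for( int i = 0; i < size; ++i ) J[i] += 1.0;   // stands in for the device kernel launch
}
#else
static void deposit( double *, int )
{
    assert( false && "GPU-only code path reached in a CPU build" );
}
#endif

int main()
{
    double J[8] = {};
    deposit( J, 8 );   // in a CPU build this trips the assert, by design
}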
@@ -306,12 +318,12 @@ void Projector2D2OrderGPU::basic( double *rhoj, } } -void Projector2D2OrderGPU::ionizationCurrents( Field *Jx, - Field *Jy, - Field *Jz, - Particles &particles, - int ipart, - LocalFields Jion ) +void Projector2D2OrderGPU::ionizationCurrents( Field */*Jx*/, + Field */*Jy*/, + Field */*Jz*/, + Particles &/*particles*/, + int /*ipart*/, + LocalFields /*Jion */) { ERROR( "Projector2D2OrderGPU::ionizationCurrents(): Not implemented !" ); } @@ -325,8 +337,8 @@ void Projector2D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields, bool diag_flag, bool is_spectral, int ispec, - int icell, - int ipart_ref ) + int /*icell*/, + int /*ipart_ref */) { std::vector &iold = smpi->dynamics_iold[ithread]; std::vector &delta = smpi->dynamics_deltaold[ithread]; @@ -368,7 +380,7 @@ void Projector2D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields, // i_domain_begin_, j_domain_begin_, // nprimy, // one_third, - // not_spectral ); + // not_spectral_ ); // } // Does not compute Rho ! @@ -385,7 +397,7 @@ void Projector2D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields, i_domain_begin_, j_domain_begin_, nprimy, one_third, - not_spectral ); + not_spectral_ ); } else { // If no field diagnostics this timestep, then the projection is done directly on the total arrays @@ -401,7 +413,7 @@ void Projector2D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields, // i_domain_begin_, j_domain_begin_, // nprimy, // one_third, - // not_spectral ); + // not_spectral_ ); // } } else { @@ -420,25 +432,25 @@ void Projector2D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields, i_domain_begin_, j_domain_begin_, nprimy, one_third, - not_spectral ); + not_spectral_ ); } } } -void Projector2D2OrderGPU::susceptibility( ElectroMagn *EMfields, - Particles &particles, - double species_mass, - SmileiMPI *smpi, - int istart, - int iend, - int ithread, - int icell, - int ipart_ref ) +void Projector2D2OrderGPU::susceptibility( ElectroMagn */*EMfields*/, + Particles &/*particles*/, + double /*species_mass*/, + SmileiMPI */*smpi*/, + int /*istart*/, + int /*iend*/, + int /*ithread*/, + int /*icell*/, + int /*ipart_ref */) { ERROR( "Projector2D2OrderGPU::susceptibility(): Not implemented !" ); } -//#if defined( SMILEI_ACCELERATOR_MODE ) +//#if defined( SMILEI_ACCELERATOR_GPU ) ////! Project global current densities (EMfields->Jx_/Jy_/Jz_) ////! 
//extern "C" void @@ -467,7 +479,7 @@ void Projector2D2OrderGPU::susceptibility( ElectroMagn *EMfields, // int i_domain_begin, // int j_domain_begin, // int nprimy, -// int not_spectral ) +// int not_spectral_ ) //{ // #if defined( PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION ) // naive:: // the naive, OMP version serves as a reference along with the CPU version @@ -490,7 +502,7 @@ void Projector2D2OrderGPU::susceptibility( ElectroMagn *EMfields, // dx_ov_dt, dy_ov_dt, // i_domain_begin, j_domain_begin, // nprimy, -// not_spectral ); +// not_spectral_ ); //} // // @@ -524,7 +536,7 @@ void Projector2D2OrderGPU::susceptibility( ElectroMagn *EMfields, // int i_domain_begin, // int j_domain_begin, // int nprimy, -// int not_spectral ) +// int not_spectral_ ) //{ // #if defined( PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION ) // naive:: // the naive, OMP version serves as a reference along with the CPU version @@ -547,7 +559,7 @@ void Projector2D2OrderGPU::susceptibility( ElectroMagn *EMfields, // dx_ov_dt, dy_ov_dt, // i_domain_begin, j_domain_begin, // nprimy, -// not_spectral ); +// not_spectral_ ); //} //#endif diff --git a/src/Projector/Projector2D2OrderGPU.h b/src/Projector/Projector2D2OrderGPU.h index 9a799f9b5..5e555b8f2 100755 --- a/src/Projector/Projector2D2OrderGPU.h +++ b/src/Projector/Projector2D2OrderGPU.h @@ -46,21 +46,21 @@ class Projector2D2OrderGPU : public Projector2D int ipart_ref = 0 ) override; //!Wrapper for task-based implementation of Smilei - void currentsAndDensityWrapperOnBuffers( double *b_Jx, - double *b_Jy, - double *b_Jz, - double *b_rho, - int bin_width, - Particles &particles, - SmileiMPI *smpi, - int istart, - int iend, - int ithread, - bool diag_flag, - bool is_spectral, - int ispec, - int icell = 0, - int ipart_ref = 0 ) override {}; + void currentsAndDensityWrapperOnBuffers( double * /*b_Jx*/, + double * /*b_Jy*/, + double * /*b_Jz*/, + double * /*b_rho*/, + int /*bin_width*/, + Particles &/*particles*/, + SmileiMPI */*smpi*/, + int /*istart*/, + int /*iend*/, + int /*ithread*/, + bool /*diag_flag*/, + bool /*is_spectral*/, + int /*ispec*/, + int /*icell*/ = 0, + int /*ipart_ref*/ = 0 ) override {}; /// Project susceptibility, used as source term in envelope equation /// @@ -78,7 +78,7 @@ class Projector2D2OrderGPU : public Projector2D double dt; double dts2; double dts4; - int not_spectral; + int not_spectral_; unsigned int x_dimension_bin_count_; unsigned int y_dimension_bin_count_; }; diff --git a/src/Projector/Projector2D2OrderGPUKernel.cpp b/src/Projector/Projector2D2OrderGPUKernel.cpp old mode 100644 new mode 100755 index 8f38f52fe..85814d54c --- a/src/Projector/Projector2D2OrderGPUKernel.cpp +++ b/src/Projector/Projector2D2OrderGPUKernel.cpp @@ -1,4 +1,4 @@ -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) #include "Projector2D2OrderGPUKernelCUDAHIP.h" #include @@ -33,7 +33,7 @@ currentDepositionKernel2DOnDevice( double *__restrict__ host_Jx, int i_domain_begin, int j_domain_begin, int nprimy, - int not_spectral ) + int not_spectral_ ) { //#if defined( PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION ) //naive:: // the naive, OMP version serves as a reference along with the CPU version @@ -56,14 +56,14 @@ currentDepositionKernel2DOnDevice( double *__restrict__ host_Jx, dx_ov_dt, dy_ov_dt, i_domain_begin, j_domain_begin, nprimy, - not_spectral ); + not_spectral_ ); } //! Project global current and charge densities (EMfields->Jx_/Jy_/Jz_/rho_) //! 
extern "C" void -currentAndDensityDepositionKernelOnDevice( double *__restrict__ host_Jx, +currentAndDensityDepositionKernel2DOnDevice( double *__restrict__ host_Jx, double *__restrict__ host_Jy, double *__restrict__ host_Jz, double *__restrict__ host_rho, @@ -90,14 +90,14 @@ currentAndDensityDepositionKernelOnDevice( double *__restrict__ host_Jx, int i_domain_begin, int j_domain_begin, int nprimy, - int not_spectral ) + int not_spectral_ ) { //#if defined( PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION ) //naive:: // the naive, OMP version serves as a reference along with the CPU version //#else cudahip2d:: //#endif - currentAndDensityDepositionKernel( host_Jx, host_Jy, host_Jz, host_rho, + currentAndDensityDepositionKernel2D( host_Jx, host_Jy, host_Jz, host_rho, Jx_size, Jy_size, Jz_size, rho_size, device_particle_position_x, device_particle_position_y, device_particle_momentum_z, @@ -113,7 +113,7 @@ currentAndDensityDepositionKernelOnDevice( double *__restrict__ host_Jx, dx_ov_dt, dy_ov_dt, i_domain_begin, j_domain_begin, nprimy, - not_spectral ); + not_spectral_ ); } #endif diff --git a/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.cu b/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.cu old mode 100644 new mode 100755 index 666a409f4..7c177c206 --- a/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.cu +++ b/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.cu @@ -20,20 +20,20 @@ #if defined( __HIP__ ) // HIP compiler support enabled (for .cu files) - #else - #define PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION 1 - #endif +#else + #define PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION 1 +#endif - #if defined( PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION ) - #include +#if defined( PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION ) + #include - #include "Tools.h" - #else - #include + #include "Tools.h" +#else + #include - #include "Params.h" - #include "gpu.h" - #endif + #include "Params.h" + #include "gpu.h" +#endif // #if defined( PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION ) @@ -65,7 +65,7 @@ // int i_domain_begin, // int j_domain_begin, // int nprimy, -// int not_spectral ) +// int not_spectral_ ) // { // // The OMP implementation is NOT bin aware. As per the precondition on // // host_bin_index, index zero always contains the number of particles. 
@@ -81,7 +81,7 @@ // device_particle_charge /* [0:particle_count] */, \ // device_particle_weight /* [0:particle_count] */ ) // #pragma omp teams thread_limit( 64 ) distribute parallel for -// #elif defined( SMILEI_OPENACC_MODE ) +// #elif defined( SMILEI_ACCELERATOR_GPU_OACC ) // #pragma acc parallel \ // deviceptr( device_particle_position_x, \ // device_particle_position_y, \ @@ -185,7 +185,7 @@ // tmp -= cry_p * ( Sy1[j - 1] - Sy0[j - 1] ) * ( Sx0[i] + 0.5 * ( Sx1[i] - Sx0[i] ) ); // // SMILEI_ACCELERATOR_ATOMIC -// Jy[iloc + j + not_spectral * ( /* i + */ ipo )] += tmp; +// Jy[iloc + j + not_spectral_ * ( /* i + */ ipo )] += tmp; // // SMILEI_ACCELERATOR_ATOMIC // Jz[iloc + j] += crz_p * ( Sy0[j] * ( 0.5 * Sx1[i] /* + Sx0[i] */ ) + @@ -209,7 +209,7 @@ // Jx[iloc + j] += tmpJx[j]; // tmp -= cry_p * ( Sy1[j - 1] - Sy0[j - 1] ) * ( Sx0[i] + 0.5 * ( Sx1[i] - Sx0[i] ) ); // SMILEI_ACCELERATOR_ATOMIC -// Jy[iloc + j + not_spectral * ( i + ipo )] += tmp; +// Jy[iloc + j + not_spectral_ * ( i + ipo )] += tmp; // // SMILEI_ACCELERATOR_ATOMIC // Jz[iloc + j] += crz_p * ( Sy0[j] * ( 0.5 * Sx1[i] + Sx0[i] ) + @@ -248,7 +248,7 @@ // int i_domain_begin, // int j_domain_begin, // int nprimy, -// int not_spectral ) +// int not_spectral_ ) // { // // The OMP implementation is NOT bin aware. As per the precondition on // // host_bin_index, index zero always contains the number of particles. @@ -264,7 +264,7 @@ // device_particle_charge /* [0:particle_count] */, \ // device_particle_weight /* [0:particle_count] */ ) // #pragma omp teams thread_limit( 64 ) distribute parallel for -// #elif defined( SMILEI_OPENACC_MODE ) +// #elif defined( SMILEI_ACCELERATOR_GPU_OACC ) // #pragma acc parallel \ // deviceptr( device_particle_position_x, \ // device_particle_position_y, \ @@ -372,7 +372,7 @@ // tmp -= cry_p * ( Sy1[j - 1] - Sy0[j - 1] ) * ( Sx0[i] + 0.5 * ( Sx1[i] - Sx0[i] ) ); // // SMILEI_ACCELERATOR_ATOMIC -// Jy[iloc + j + not_spectral * ( /* i + */ ipo )] += tmp; +// Jy[iloc + j + not_spectral_ * ( /* i + */ ipo )] += tmp; // // SMILEI_ACCELERATOR_ATOMIC // Jz[iloc + j] += crz_p * ( Sy0[j] * ( 0.5 * Sx1[i] /* + Sx0[i] */ ) + @@ -407,7 +407,7 @@ // tmp -= cry_p * ( Sy1[j - 1] - Sy0[j - 1] ) * ( Sx0[i] + 0.5 * ( Sx1[i] - Sx0[i] ) ); // // SMILEI_ACCELERATOR_ATOMIC -// Jy[iloc + j + not_spectral * ( i + ipo )] += tmp; +// Jy[iloc + j + not_spectral_ * ( i + ipo )] += tmp; // // SMILEI_ACCELERATOR_ATOMIC // Jz[iloc + j] += crz_p * ( Sy0[j] * ( 0.5 * Sx1[i] + Sx0[i] ) + @@ -567,7 +567,7 @@ namespace cudahip2d { int i_domain_begin, int j_domain_begin, int nprimy, - int not_spectral ) + int not_spectral_ ) { // TODO(Etienne M): refactor this function. Break it into smaller // pieces (lds init/store, coeff computation, deposition etc..) @@ -867,7 +867,7 @@ namespace cudahip2d { // These atomics are basically free (very few of them). 
atomic::GDS::AddNoReturn( &device_Jx[global_memory_index], static_cast( Jx_scratch_space[scratch_space_index] ) ); - atomic::GDS::AddNoReturn( &device_Jy[global_memory_index + /* We handle the FTDT/picsar */ not_spectral * global_x_scratch_space_coordinate], static_cast( Jy_scratch_space[scratch_space_index] ) ); + atomic::GDS::AddNoReturn( &device_Jy[global_memory_index + /* We handle the FTDT/picsar */ not_spectral_ * global_x_scratch_space_coordinate], static_cast( Jy_scratch_space[scratch_space_index] ) ); atomic::GDS::AddNoReturn( &device_Jz[global_memory_index], static_cast( Jz_scratch_space[scratch_space_index] ) ); } } // end DepositCurrent @@ -903,7 +903,7 @@ namespace cudahip2d { int i_domain_begin, int j_domain_begin, int nprimy, - int not_spectral ) + int not_spectral_ ) { // TODO(Etienne M): refactor this function. Break it into smaller // pieces (lds init/store, coeff computation, deposition etc..) @@ -1146,7 +1146,7 @@ namespace cudahip2d { // These atomics are basically free (very few of them). atomic::GDS::AddNoReturn( &device_Jx[global_memory_index], static_cast( Jx_scratch_space[scratch_space_index] ) ); - atomic::GDS::AddNoReturn( &device_Jy[global_memory_index + /* We handle the FTDT/picsar */ not_spectral * global_x_scratch_space_coordinate], static_cast( Jy_scratch_space[scratch_space_index] ) ); + atomic::GDS::AddNoReturn( &device_Jy[global_memory_index + /* We handle the FTDT/picsar */ not_spectral_ * global_x_scratch_space_coordinate], static_cast( Jy_scratch_space[scratch_space_index] ) ); atomic::GDS::AddNoReturn( &device_Jz[global_memory_index], static_cast( Jz_scratch_space[scratch_space_index] ) ); atomic::GDS::AddNoReturn( &device_rho[global_memory_index], static_cast( rho_scratch_space[scratch_space_index] ) ); } @@ -1181,7 +1181,7 @@ namespace cudahip2d { int i_domain_begin, int j_domain_begin, int nprimy, - int not_spectral ) + int not_spectral_ ) { SMILEI_ASSERT( Params::getGPUClusterWidth( 2 /* 2D */ ) != -1 && Params::getGPUClusterGhostCellBorderWidth( 2 /* 2nd order interpolation */ ) != -1 ); @@ -1229,7 +1229,7 @@ namespace cudahip2d { dx_ov_dt, dy_ov_dt, i_domain_begin, j_domain_begin, nprimy, - not_spectral ); + not_spectral_ ); checkHIPErrors( ::hipDeviceSynchronize() ); #elif defined ( __NVCC__ ) @@ -1258,7 +1258,7 @@ namespace cudahip2d { dx_ov_dt, dy_ov_dt, i_domain_begin, j_domain_begin, nprimy, - not_spectral + not_spectral_ ); checkHIPErrors( ::cudaDeviceSynchronize() ); #endif @@ -1266,7 +1266,7 @@ namespace cudahip2d { //static inline void - currentAndDensityDepositionKernel( double *__restrict__ host_Jx, + currentAndDensityDepositionKernel2D( double *__restrict__ host_Jx, double *__restrict__ host_Jy, double *__restrict__ host_Jz, double *__restrict__ host_rho, @@ -1293,7 +1293,7 @@ namespace cudahip2d { int i_domain_begin, int j_domain_begin, int nprimy, - int not_spectral ) + int not_spectral_ ) { SMILEI_ASSERT( Params::getGPUClusterWidth( 2 /* 2D */ ) != -1 && Params::getGPUClusterGhostCellBorderWidth( 2 /* 2nd order interpolation */ ) != -1 ); @@ -1341,7 +1341,7 @@ namespace cudahip2d { dx_ov_dt, dy_ov_dt, i_domain_begin, j_domain_begin, nprimy, - not_spectral ); + not_spectral_ ); checkHIPErrors( ::hipDeviceSynchronize() ); #elif defined ( __NVCC__ ) @@ -1371,7 +1371,7 @@ namespace cudahip2d { dx_ov_dt, dy_ov_dt, i_domain_begin, j_domain_begin, nprimy, - not_spectral + not_spectral_ ); checkHIPErrors( ::cudaDeviceSynchronize() ); #endif @@ -1409,7 +1409,7 @@ namespace cudahip2d { // int i_domain_begin, // int j_domain_begin, // int 
nprimy, -// int not_spectral ) +// int not_spectral_ ) //{ // #if defined( PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION ) // naive:: // the naive, OMP version serves as a reference along with the CPU version @@ -1432,7 +1432,7 @@ namespace cudahip2d { // dx_ov_dt, dy_ov_dt, // i_domain_begin, j_domain_begin, // nprimy, -// not_spectral ); +// not_spectral_ ); //} // ////! Project global current and charge densities (EMfields->Jx_/Jy_/Jz_/rho_) @@ -1465,7 +1465,7 @@ namespace cudahip2d { // int i_domain_begin, // int j_domain_begin, // int nprimy, -// int not_spectral ) +// int not_spectral_ ) //{ // #if defined( PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION ) // naive:: // the naive, OMP version serves as a reference along with the CPU version @@ -1488,6 +1488,6 @@ namespace cudahip2d { // dx_ov_dt, dy_ov_dt, // i_domain_begin, j_domain_begin, // nprimy, -// not_spectral ); +// not_spectral_ ); //} diff --git a/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.h b/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.h old mode 100644 new mode 100755 index d607a4ab4..d789796ab --- a/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.h +++ b/src/Projector/Projector2D2OrderGPUKernelCUDAHIP.h @@ -3,8 +3,7 @@ #ifndef Projector2D2OrderGPUKernelCUDAHIP_H #define Projector2D2OrderGPUKernelCUDAHIP_H - -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) #if defined( __HIP__ ) #include @@ -20,8 +19,7 @@ namespace cudahip2d { //static -void - currentDepositionKernel2D( double *__restrict__ host_Jx, +void currentDepositionKernel2D( double *__restrict__ host_Jx, double *__restrict__ host_Jy, double *__restrict__ host_Jz, int Jx_size, @@ -46,11 +44,10 @@ void int i_domain_begin, int j_domain_begin, int nprimy, - int not_spectral ); + int not_spectral_ ); //static -inline void - currentAndDensityDepositionKernel( +void currentAndDensityDepositionKernel2D( double *__restrict__ host_Jx, double *__restrict__ host_Jy, double *__restrict__ host_Jz, @@ -78,7 +75,7 @@ inline void int i_domain_begin, int j_domain_begin, int nprimy, - int not_spectral ); + int not_spectral_ ); } // namespace cudahip2d diff --git a/src/Projector/Projector3D2OrderGPU.cpp b/src/Projector/Projector3D2OrderGPU.cpp index 39342b204..f27d7b1e1 100755 --- a/src/Projector/Projector3D2OrderGPU.cpp +++ b/src/Projector/Projector3D2OrderGPU.cpp @@ -25,18 +25,18 @@ Projector3D2OrderGPU::Projector3D2OrderGPU( Params ¶meters, Patch *a_patch ) // initialize it's member variable) we better initialize // Projector2D2OrderGPU's member variable after explicitly initializing // Projector2D. - not_spectral = !parameters.is_pxr; + not_spectral_ = !parameters.is_pxr; dt = parameters.timestep; dts2 = dt / 2.0; dts4 = dts2 / 2.0; -#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined ( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined ( SMILEI_ACCELERATOR_GPU_OACC ) // When sorting is disabled, these values are invalid (-1) and the HIP // implementation can't be used. 
x_dimension_bin_count_ = parameters.getGPUBinCount( 1 ); y_dimension_bin_count_ = parameters.getGPUBinCount( 2 ); z_dimension_bin_count_ = parameters.getGPUBinCount( 3 ); -//#elif defined( SMILEI_OPENACC_MODE ) +//#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) // x_dimension_bin_count_ = 1; // y_dimension_bin_count_ = 1; // z_dimension_bin_count_ = 1; @@ -50,7 +50,7 @@ Projector3D2OrderGPU::~Projector3D2OrderGPU() // EMPTY } -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) extern "C" void currentDeposition3DOnDevice( double *__restrict__ Jx, double *__restrict__ Jy, @@ -122,6 +122,8 @@ namespace { // Unnamed namespace == static == internal linkage == no exported sy /// Project global current densities (EMfields->Jx_/Jy_/Jz_) /// /* inline */ void + +#if defined( SMILEI_ACCELERATOR_GPU ) currents( double *__restrict__ Jx, double *__restrict__ Jy, double *__restrict__ Jz, @@ -150,72 +152,77 @@ namespace { // Unnamed namespace == static == internal linkage == no exported sy double, int not_spectral ) { -#if defined( SMILEI_ACCELERATOR_MODE ) currentDeposition3DOnDevice( Jx, - Jy, - Jz, - Jx_size, - Jy_size, - Jz_size, - particles.getPtrPosition( 0 ), - particles.getPtrPosition( 1 ), - particles.getPtrPosition( 2 ), - particles.getPtrCharge(), - particles.getPtrWeight(), - particles.last_index.data(), - x_dimension_bin_count, - y_dimension_bin_count, - z_dimension_bin_count, - invgf_, - iold_, - deltaold_, - particles.deviceSize(), - inv_cell_volume, - dx_inv, - dy_inv, - dz_inv, - dx_ov_dt, - dy_ov_dt, - dz_ov_dt, - i_domain_begin, - j_domain_begin, - k_domain_begin, - nprimy, nprimz, - not_spectral ); + Jy, + Jz, + Jx_size, + Jy_size, + Jz_size, + particles.getPtrPosition( 0 ), + particles.getPtrPosition( 1 ), + particles.getPtrPosition( 2 ), + particles.getPtrCharge(), + particles.getPtrWeight(), + particles.last_index.data(), + x_dimension_bin_count, + y_dimension_bin_count, + z_dimension_bin_count, + invgf_, + iold_, + deltaold_, + particles.deviceSize(), + inv_cell_volume, + dx_inv, + dy_inv, + dz_inv, + dx_ov_dt, + dy_ov_dt, + dz_ov_dt, + i_domain_begin, + j_domain_begin, + k_domain_begin, + nprimy, nprimz, + not_spectral ); + } #else + currents( double *__restrict__ , double *__restrict__ , double *__restrict__ , int, int, int, + Particles &, unsigned int , unsigned int , unsigned int , const double *__restrict__ , + const int *__restrict__ , const double *__restrict__ , double , double , double , double , + double , double , double , int , int , int , int , int , double, int ) + { SMILEI_ASSERT( false ); -#endif } +#endif //! 
Project density /* inline */ void +#if defined( SMILEI_ACCELERATOR_GPU ) density( - double *__restrict__ rho, - int rho_size, - Particles &particles, - unsigned int x_dimension_bin_count, - unsigned int y_dimension_bin_count, - unsigned int z_dimension_bin_count, - const double *__restrict__ invgf_, - const int *__restrict__ iold_, - const double *__restrict__ deltaold_, - double inv_cell_volume, - double dx_inv, - double dy_inv, - double dz_inv, - double dx_ov_dt, - double dy_ov_dt, - double dz_ov_dt, - int i_domain_begin, - int j_domain_begin, - int k_domain_begin, - int nprimy, - int nprimz, - double, - int not_spectral ) + double *__restrict__ rho, + int rho_size, + Particles &particles, + unsigned int x_dimension_bin_count, + unsigned int y_dimension_bin_count, + unsigned int z_dimension_bin_count, + const double *__restrict__ invgf_, + const int *__restrict__ iold_, + const double *__restrict__ deltaold_, + double inv_cell_volume, + double dx_inv, + double dy_inv, + double dz_inv, + double dx_ov_dt, + double dy_ov_dt, + double dz_ov_dt, + int i_domain_begin, + int j_domain_begin, + int k_domain_begin, + int nprimy, + int nprimz, + double, + int not_spectral ) { -#if defined( SMILEI_ACCELERATOR_MODE ) densityDeposition3DOnDevice( rho, rho_size, @@ -244,10 +251,16 @@ namespace { // Unnamed namespace == static == internal linkage == no exported sy k_domain_begin, nprimy, nprimz, not_spectral ); + } #else + density( double *__restrict__ , int , Particles &, unsigned int , unsigned int , unsigned int , + const double *__restrict__ , const int *__restrict__ , const double *__restrict__ , + double , double , double , double , double , double , double , + int, int, int, int, int, double, int ) + { SMILEI_ASSERT( false ); -#endif } +#endif } // namespace @@ -255,7 +268,7 @@ void Projector3D2OrderGPU::basic( double *rhoj, Particles &particles, unsigned int ipart, unsigned int type, - int bin_shift ) + int /*bin_shift*/ ) { @@ -347,12 +360,12 @@ void Projector3D2OrderGPU::basic( double *rhoj, } } -void Projector3D2OrderGPU::ionizationCurrents( Field *Jx, - Field *Jy, - Field *Jz, - Particles &particles, - int ipart, - LocalFields Jion ) +void Projector3D2OrderGPU::ionizationCurrents( Field */*Jx*/, + Field */*Jy*/, + Field */*Jz*/, + Particles &/*particles*/, + int /*ipart*/, + LocalFields /*Jion */) { ERROR( "Projector3D2OrderGPU::ionizationCurrents(): Not implemented !" ); } @@ -366,8 +379,8 @@ void Projector3D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields, bool diag_flag, bool is_spectral, int ispec, - int icell, - int ipart_ref ) + int /*icell*/, + int /*ipart_ref*/ ) { if( is_spectral ) { @@ -401,7 +414,7 @@ void Projector3D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields, i_domain_begin_, j_domain_begin_, k_domain_begin_, nprimy, nprimz, one_third, - not_spectral ); + not_spectral_ ); double *const __restrict__ b_rho = EMfields->rho_s[ispec] ? EMfields->rho_s[ispec]->data() : EMfields->rho_->data(); unsigned int rho_size = EMfields->rho_s[ispec] ? 
EMfields->rho_s[ispec]->size() : EMfields->rho_->size(); @@ -416,7 +429,7 @@ void Projector3D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields, i_domain_begin_, j_domain_begin_, k_domain_begin_, nprimy, nprimz, one_third, - not_spectral ); + not_spectral_ ); // If requested performs then the charge density deposition } else { @@ -440,7 +453,7 @@ void Projector3D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields, i_domain_begin_, j_domain_begin_, k_domain_begin_, nprimy, nprimz, one_third, - not_spectral ); + not_spectral_ ); } // TODO(Etienne M): DIAGS. Find a way to get rho. We could: @@ -463,15 +476,15 @@ void Projector3D2OrderGPU::currentsAndDensityWrapper( ElectroMagn *EMfields, //std::cerr << sum << " " << sum2 << " " << sum_Jxs << " " << sum_Jx << std::endl; } -void Projector3D2OrderGPU::susceptibility( ElectroMagn *EMfields, - Particles &particles, - double species_mass, - SmileiMPI *smpi, - int istart, - int iend, - int ithread, - int icell, - int ipart_ref ) +void Projector3D2OrderGPU::susceptibility( ElectroMagn */*EMfields*/, + Particles &/*particles*/, + double /*species_mass*/, + SmileiMPI */*smpi*/, + int /*istart*/, + int /*iend*/, + int /*ithread*/, + int /*icell*/, + int /*ipart_ref */) { ERROR( "Projector3D2OrderGPU::susceptibility(): Not implemented !" ); } diff --git a/src/Projector/Projector3D2OrderGPU.cpp.backup b/src/Projector/Projector3D2OrderGPU.cpp.backup index 39ce7a4a5..761e6ae31 100755 --- a/src/Projector/Projector3D2OrderGPU.cpp.backup +++ b/src/Projector/Projector3D2OrderGPU.cpp.backup @@ -2,7 +2,7 @@ #include #include -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #include #include #endif @@ -136,7 +136,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( position_y /* [istart_pack:current_pack_size] */, \ position_z /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ deltaold [0:3 * nparts], \ Sx0 [0:kTmpArraySize], \ @@ -262,7 +262,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( DSx [0:kTmpArraySize], sumX [0:kTmpArraySize] ) // #pragma acc parallel deviceptr( DSx, sumX ) @@ -287,7 +287,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( charge /* [istart_pack:current_pack_size] */, \ weight /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ Jx [0:Jx_size], \ Sy0 [0:kTmpArraySize], \ @@ -310,7 +310,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( const double crx_p = dx_ov_dt_inv_cell_volume * static_cast( charge[ipart] ) * weight[ipart]; const int linindex0 = iold[ipart+0*packsize]*yz_size0+iold[ipart+1*packsize]*z_size0+iold[ipart+2*packsize]; -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( int k=0 ; k<5 ; k++ ) { @@ -326,7 +326,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp atomic update -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc atomic #endif Jx [ jdx ] += val; @@ -339,7 +339,7 @@ 
Projector3D2OrderGPU::currentsAndDensityGPU( #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( DSy [0:kTmpArraySize], \ sumX [0:kTmpArraySize] ) @@ -365,7 +365,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( charge /* [istart_pack:current_pack_size] */, \ weight /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ Jy [0:Jy_size], \ Sx0 [0:kTmpArraySize], \ @@ -388,7 +388,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( const double cry_p = dy_ov_dt_inv_cell_volume * static_cast( charge[ipart] ) * weight[ipart]; const int linindex1 = iold[ipart+0*packsize]*yz_size1+iold[ipart+1*packsize]*z_size1+iold[ipart+2*packsize]; -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( int k=0 ; k<5 ; k++ ) { @@ -404,7 +404,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp atomic update -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc atomic #endif Jy [ jdx ] += val; @@ -417,7 +417,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( DSz [0:kTmpArraySize], \ sumX [0:kTmpArraySize] ) @@ -443,7 +443,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( charge /* [istart_pack:current_pack_size] */, \ weight /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ Jz [0:Jz_size], \ Sx0 [0:kTmpArraySize], \ @@ -466,7 +466,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( const double crz_p = dz_ov_dt_inv_cell_volume * static_cast( charge[ipart] ) * weight[ipart]; const int linindex2 = iold[ipart+0*packsize]*yz_size2+iold[ipart+1*packsize]*z_size2+iold[ipart+2*packsize]; -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( int k=1 ; k<5 ; k++ ) { @@ -482,7 +482,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp atomic update -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc atomic #endif Jz[ jdx ] += val; @@ -498,7 +498,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( charge /* [istart_pack:current_pack_size] */, \ weight /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ rho [0:rho_size], \ Sx1 [0:kTmpArraySize], \ @@ -523,7 +523,7 @@ Projector3D2OrderGPU::currentsAndDensityGPU( int jdx = idx + k; #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp atomic update -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc atomic #endif rho[ jdx ] += charge_weight * Sx1[ipart_pack+i*packsize]*Sy1[ipart_pack+j*packsize]*Sz1[ipart_pack+k*packsize]; diff --git a/src/Projector/Projector3D2OrderGPU.h 
b/src/Projector/Projector3D2OrderGPU.h index 2fac2402e..c8ebd0ae8 100755 --- a/src/Projector/Projector3D2OrderGPU.h +++ b/src/Projector/Projector3D2OrderGPU.h @@ -46,21 +46,21 @@ class Projector3D2OrderGPU : public Projector3D int ipart_ref = 0 ) override; //!Wrapper for task-based implementation of Smilei - void currentsAndDensityWrapperOnBuffers( double *b_Jx, - double *b_Jy, - double *b_Jz, - double *b_rho, - int bin_width, - Particles &particles, - SmileiMPI *smpi, - int istart, - int iend, - int ithread, - bool diag_flag, - bool is_spectral, - int ispec, - int icell = 0, - int ipart_ref = 0 ) override {}; + void currentsAndDensityWrapperOnBuffers( double * /*b_Jx*/, + double * /*b_Jy*/, + double * /*b_Jz*/, + double * /*b_rho*/, + int /*bin_width*/, + Particles &/*particles*/, + SmileiMPI */*smpi*/, + int /*istart*/, + int /*iend*/, + int /*ithread*/, + bool /*diag_flag*/, + bool /*is_spectral*/, + int /*ispec*/, + int /*icell*/ = 0, + int /*ipart_ref*/ = 0 ) override {}; /// Project susceptibility, used as source term in envelope equation /// @@ -78,7 +78,7 @@ class Projector3D2OrderGPU : public Projector3D double dt; double dts2; double dts4; - int not_spectral; + int not_spectral_; unsigned int x_dimension_bin_count_; unsigned int y_dimension_bin_count_; unsigned int z_dimension_bin_count_; diff --git a/src/Projector/Projector3D2OrderGPUKernel.cpp b/src/Projector/Projector3D2OrderGPUKernel.cpp index f77a4fda3..f9465dc2a 100644 --- a/src/Projector/Projector3D2OrderGPUKernel.cpp +++ b/src/Projector/Projector3D2OrderGPUKernel.cpp @@ -5,7 +5,7 @@ // issues (!). -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) //! Simple switch to jump between the reference (omp) implementation and the //! hip one. @@ -71,7 +71,6 @@ currentDeposition3DOnDevice( double *__restrict__ host_Jx, int nprimz, int not_spectral ) { - // printf("We are doing current deposition on GPU \n"); //#if defined( PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION ) //acc:: // OpenMP or OpenACC version serves as a reference along with the CPU version //#else @@ -134,8 +133,6 @@ densityDeposition3DOnDevice( int nprimz, int not_spectral ) { - //printf("We are doing a densitydeposition on GPU \n"); - //#if defined( PRIVATE_SMILEI_USE_OPENMP_PROJECTION_IMPLEMENTATION ) //acc:: // OpenMP or OpenACC version serves as a reference along with the CPU version //#else diff --git a/src/Projector/Projector3D2OrderGPUKernelAcc.h b/src/Projector/Projector3D2OrderGPUKernelAcc.h index 9cf3b224d..43bff1cce 100644 --- a/src/Projector/Projector3D2OrderGPUKernelAcc.h +++ b/src/Projector/Projector3D2OrderGPUKernelAcc.h @@ -1,6 +1,6 @@ //! 
Optimized Acc projection (from Julien Derouillat) -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) #include #include "Tools.h" @@ -110,7 +110,7 @@ namespace acc { position_y /* [istart_pack:current_pack_size] */, \ position_z /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ deltaold [0:3 * nparts], \ Sx0 [0:kTmpArraySize], \ @@ -236,7 +236,7 @@ namespace acc { #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( DSx [0:kTmpArraySize], sumX [0:kTmpArraySize] ) // #pragma acc parallel deviceptr( DSx, sumX ) @@ -261,7 +261,7 @@ namespace acc { charge /* [istart_pack:current_pack_size] */, \ weight /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ Jx [0:Jx_size], \ Sy0 [0:kTmpArraySize], \ @@ -284,7 +284,7 @@ namespace acc { const double crx_p = dx_ov_dt_inv_cell_volume * static_cast( charge[ipart] ) * weight[ipart]; const int linindex0 = iold[ipart+0*packsize]*yz_size0+iold[ipart+1*packsize]*z_size0+iold[ipart+2*packsize]; -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( int k=0 ; k<5 ; k++ ) { @@ -309,7 +309,7 @@ namespace acc { #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( DSy [0:kTmpArraySize], \ sumX [0:kTmpArraySize] ) @@ -335,7 +335,7 @@ namespace acc { charge /* [istart_pack:current_pack_size] */, \ weight /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ Jy [0:Jy_size], \ Sx0 [0:kTmpArraySize], \ @@ -358,7 +358,7 @@ namespace acc { const double cry_p = dy_ov_dt_inv_cell_volume * static_cast( charge[ipart] ) * weight[ipart]; const int linindex1 = iold[ipart+0*packsize]*yz_size1+iold[ipart+1*packsize]*z_size1+iold[ipart+2*packsize]; -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( int k=0 ; k<5 ; k++ ) { @@ -383,7 +383,7 @@ namespace acc { #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( DSz [0:kTmpArraySize], \ sumX [0:kTmpArraySize] ) @@ -409,7 +409,7 @@ namespace acc { charge /* [istart_pack:current_pack_size] */, \ weight /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ Jz [0:Jz_size], \ Sx0 [0:kTmpArraySize], \ @@ -432,7 +432,7 @@ namespace acc { const double crz_p = dz_ov_dt_inv_cell_volume * static_cast( charge[ipart] ) * weight[ipart]; const int linindex2 = iold[ipart+0*packsize]*yz_size2+iold[ipart+1*packsize]*z_size2+iold[ipart+2*packsize]; -#ifdef SMILEI_OPENACC_MODE +#ifdef 
SMILEI_ACCELERATOR_GPU_OACC #pragma acc loop vector #endif for( int k=1 ; k<5 ; k++ ) { @@ -536,7 +536,7 @@ namespace acc { position_y /* [istart_pack:current_pack_size] */, \ position_z /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ deltaold [0:3 * nparts], \ Sx1 [0:kTmpArraySize], \ @@ -630,7 +630,7 @@ namespace acc { charge /* [istart_pack:current_pack_size] */, \ weight /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ rho [0:rho_size], \ Sx1 [0:kTmpArraySize], \ diff --git a/src/Projector/Projector3D2OrderGPUKernelCUDAHIP.cu b/src/Projector/Projector3D2OrderGPUKernelCUDAHIP.cu old mode 100644 new mode 100755 index 195a02667..0883bdafd --- a/src/Projector/Projector3D2OrderGPUKernelCUDAHIP.cu +++ b/src/Projector/Projector3D2OrderGPUKernelCUDAHIP.cu @@ -1,6 +1,6 @@ //! HIP CUDA implementation -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) //#include "Projector3D2OrderGPUKernelCUDAHIP.h" @@ -162,7 +162,7 @@ namespace cudahip { int k_domain_begin, int nprimy, int nprimz, - int not_spectral ) + int not_spectral_ ) { // Potential future work for optimization: Break the kernel into smaller // pieces (lds init/store, coeff computation, deposition etc..) @@ -238,9 +238,6 @@ namespace cudahip { device_bin_index[workgroup_dedicated_bin_index - 1]; const unsigned int last_particle = device_bin_index[workgroup_dedicated_bin_index]; -//std::cout << first_particle << std::endl; -//printf("%d \n",first_particle); - for( unsigned int particle_index = first_particle + thread_index_offset; particle_index < last_particle; particle_index += loop_stride ) { @@ -501,8 +498,8 @@ namespace cudahip { // These atomics are basically free (very few of them). atomic::GDS::AddNoReturn( &device_Jx[global_memory_index], static_cast( Jx_scratch_space[field_index] ) ); - atomic::GDS::AddNoReturn( &device_Jy[global_memory_index + /* We handle the FTDT/picsar */ not_spectral * global_x_scratch_space_coordinate * nprimz], static_cast( Jy_scratch_space[field_index] ) ); - atomic::GDS::AddNoReturn( &device_Jz[global_memory_index + /* We handle the FTDT/picsar */ not_spectral * (global_x_scratch_space_coordinate * nprimy + global_y_scratch_space_coordinate)], static_cast( Jz_scratch_space[field_index] ) ); + atomic::GDS::AddNoReturn( &device_Jy[global_memory_index + /* We handle the FTDT/picsar */ not_spectral_ * global_x_scratch_space_coordinate * nprimz], static_cast( Jy_scratch_space[field_index] ) ); + atomic::GDS::AddNoReturn( &device_Jz[global_memory_index + /* We handle the FTDT/picsar */ not_spectral_ * (global_x_scratch_space_coordinate * nprimy + global_y_scratch_space_coordinate)], static_cast( Jz_scratch_space[field_index] ) ); } } // end DepositCurrent @@ -536,7 +533,7 @@ namespace cudahip { int k_domain_begin, int nprimy, int nprimz, - int not_spectral ) + int not_spectral_ ) { // TODO(Etienne M): refactor this function. Break it into smaller // pieces (lds init/store, coeff computation, deposition etc..) 
@@ -716,7 +713,7 @@ namespace cudahip { int k_domain_begin, int nprimy, int nprimz, - int not_spectral ) + int not_spectral_ ) { SMILEI_ASSERT( Params::getGPUClusterWidth( 3 /* 2D */ ) != -1 && Params::getGPUClusterGhostCellBorderWidth( 2 /* 2nd order interpolation */ ) != -1 ); @@ -767,7 +764,7 @@ namespace cudahip { dx_ov_dt, dy_ov_dt, dz_ov_dt, i_domain_begin, j_domain_begin, k_domain_begin, nprimy, nprimz, - not_spectral + not_spectral_ ); checkHIPErrors( ::hipDeviceSynchronize() ); @@ -799,7 +796,7 @@ namespace cudahip { dx_ov_dt, dy_ov_dt, dz_ov_dt, i_domain_begin, j_domain_begin, k_domain_begin, nprimy, nprimz, - not_spectral + not_spectral_ ); checkHIPErrors( ::cudaDeviceSynchronize() ); #endif @@ -836,7 +833,7 @@ namespace cudahip { int k_domain_begin, int nprimy, int nprimz, - int not_spectral ) + int not_spectral_ ) { SMILEI_ASSERT( Params::getGPUClusterWidth( 3 /* 2D */ ) != -1 && Params::getGPUClusterGhostCellBorderWidth( 2 /* 2nd order interpolation */ ) != -1 ); @@ -886,7 +883,7 @@ namespace cudahip { dx_ov_dt, dy_ov_dt, dz_ov_dt, i_domain_begin, j_domain_begin, k_domain_begin, nprimy, nprimz, - not_spectral ); + not_spectral_ ); checkHIPErrors( ::hipDeviceSynchronize() ); #elif defined ( __NVCC__ ) @@ -914,7 +911,7 @@ namespace cudahip { dx_ov_dt, dy_ov_dt, dz_ov_dt, i_domain_begin, j_domain_begin, k_domain_begin, nprimy, nprimz, - not_spectral + not_spectral_ ); checkHIPErrors( ::cudaDeviceSynchronize() ); #endif diff --git a/src/Projector/Projector3D2OrderGPUKernelCUDAHIP.h b/src/Projector/Projector3D2OrderGPUKernelCUDAHIP.h old mode 100644 new mode 100755 index 94368f4dd..cbd9729c3 --- a/src/Projector/Projector3D2OrderGPUKernelCUDAHIP.h +++ b/src/Projector/Projector3D2OrderGPUKernelCUDAHIP.h @@ -4,7 +4,7 @@ #define Projector3D2OrderGPUKernelCUDAHIP_H -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) #if defined( __HIP__ ) #include @@ -17,9 +17,8 @@ #include "gpu.h" namespace cudahip { -//static -inline void - currentDepositionKernel3D( double *__restrict__ host_Jx, +//static inline +void currentDepositionKernel3D( double *__restrict__ host_Jx, double *__restrict__ host_Jy, double *__restrict__ host_Jz, int Jx_size, @@ -50,11 +49,10 @@ inline void int k_domain_begin, int nprimy, int nprimz, - int not_spectral ); + int not_spectral_ ); -//static -inline void - densityDepositionKernel3D( +//static inline +void densityDepositionKernel3D( double *__restrict__ host_rho, int rho_size, const double *__restrict__ device_particle_position_x, @@ -82,7 +80,7 @@ inline void int k_domain_begin, int nprimy, int nprimz, - int not_spectral ); + int not_spectral_ ); } // namespace cudahip diff --git a/src/Projector/Projector3D2OrderGPUKernelNaive.h b/src/Projector/Projector3D2OrderGPUKernelNaive.h index b6cfac080..a261af40b 100644 --- a/src/Projector/Projector3D2OrderGPUKernelNaive.h +++ b/src/Projector/Projector3D2OrderGPUKernelNaive.h @@ -1,6 +1,6 @@ //! 
Naive ACC/OMP implementation -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) #include #include "Tools.h" @@ -66,7 +66,7 @@ namespace acc { position_y /* [istart_pack:current_pack_size] */, \ position_z /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ deltaold [0:3 * nparts], \ Jx[0:Jx_size], \ @@ -344,7 +344,7 @@ namespace acc { position_y /* [istart_pack:current_pack_size] */, \ position_z /* [istart_pack:current_pack_size] */ ) #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( iold [0:3 * nparts], \ deltaold [0:3 * nparts], \ rho[0:rho_size] \ diff --git a/src/Projector/ProjectorAM2OrderV.cpp b/src/Projector/ProjectorAM2OrderV.cpp index b222aa4ee..890d37332 100755 --- a/src/Projector/ProjectorAM2OrderV.cpp +++ b/src/Projector/ProjectorAM2OrderV.cpp @@ -673,10 +673,6 @@ void ProjectorAM2OrderV::susceptibility( ElectroMagn *EMfields, Particles &parti double charge_weight[8] __attribute__( ( aligned( 64 ) ) ); // double r_bar[8] __attribute__( ( aligned( 64 ) ) ); - //double *invR_local = &(invR_[jpom2]); - // double *invRd_local = &(invRd_[jpom2]); - - double *invR_local = &(invR_[jpom2]); // Pointer for GPU and vectorization on ARM processors double * __restrict__ position_x = particles.getPtrPosition(0); double * __restrict__ position_y = particles.getPtrPosition(1); diff --git a/src/Projector/ProjectorFactory.h b/src/Projector/ProjectorFactory.h index db8c39e1f..5b1f50e37 100755 --- a/src/Projector/ProjectorFactory.h +++ b/src/Projector/ProjectorFactory.h @@ -3,6 +3,7 @@ #include "Projector.h" #include "Projector1D2Order.h" +#include "Projector1D2OrderGPU.h" #include "Projector1D4Order.h" #include "Projector2D2Order.h" #include "Projector2D2OrderGPU.h" @@ -33,7 +34,11 @@ class ProjectorFactory // 1Dcartesian simulation // --------------- if( ( params.geometry == "1Dcartesian" ) && ( params.interpolation_order == ( unsigned int )2 ) ) { - Proj = new Projector1D2Order( params, patch ); + #if defined( SMILEI_ACCELERATOR_GPU ) + Proj = new Projector1D2OrderGPU( params, patch ); + #else + Proj = new Projector1D2Order( params, patch ); + #endif } else if( ( params.geometry == "1Dcartesian" ) && ( params.interpolation_order == ( unsigned int )4 ) ) { Proj = new Projector1D4Order( params, patch ); } @@ -42,7 +47,7 @@ class ProjectorFactory // --------------- else if( ( params.geometry == "2Dcartesian" ) && ( params.interpolation_order == ( unsigned int )2 ) ) { if( !vectorization ) { - #if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE ) + #if defined( SMILEI_ACCELERATOR_GPU ) Proj = new Projector2D2OrderGPU( params, patch ); #else Proj = new Projector2D2Order( params, patch ); @@ -64,7 +69,7 @@ class ProjectorFactory // --------------- else if( ( params.geometry == "3Dcartesian" ) && ( params.interpolation_order == ( unsigned int )2 ) ) { if( !vectorization ) { - #if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE ) + #if defined( SMILEI_ACCELERATOR_GPU ) Proj = new Projector3D2OrderGPU( params, patch ); #else Proj = new Projector3D2Order( params, patch ); diff --git a/src/Pusher/PusherBoris.cpp b/src/Pusher/PusherBoris.cpp index 536def7a9..8f70a6cc3 100755 --- a/src/Pusher/PusherBoris.cpp +++ b/src/Pusher/PusherBoris.cpp @@ -57,7 +57,7 @@ void 
PusherBoris::operator()( Particles &particles, SmileiMPI *smpi, int istart, position_y /* [istart:particle_number] */, \ position_z /* [istart:particle_number] */ ) #pragma omp teams distribute parallel for -#elif defined(SMILEI_OPENACC_MODE) +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) const int istart_offset = istart - ipart_buffer_offset; const int particle_number = iend - istart; diff --git a/src/Pusher/PusherBorisNR.cpp b/src/Pusher/PusherBorisNR.cpp index 84f072e1f..df4a3277b 100755 --- a/src/Pusher/PusherBorisNR.cpp +++ b/src/Pusher/PusherBorisNR.cpp @@ -57,7 +57,7 @@ void PusherBorisNR::operator()( Particles &particles, SmileiMPI *smpi, int istar position_y /* [istart:particle_number] */, \ position_z /* [istart:particle_number] */ ) #pragma omp teams distribute parallel for -#elif defined(SMILEI_OPENACC_MODE) +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) const int istart_offset = istart - ipart_buffer_offset; const int particle_number = iend - istart; diff --git a/src/Pusher/PusherHigueraCary.cpp b/src/Pusher/PusherHigueraCary.cpp index 2ab234ae1..c85189fff 100755 --- a/src/Pusher/PusherHigueraCary.cpp +++ b/src/Pusher/PusherHigueraCary.cpp @@ -68,7 +68,7 @@ void PusherHigueraCary::operator()( Particles &particles, SmileiMPI *smpi, int i position_y /* [istart:particle_number] */, \ position_z /* [istart:particle_number] */ ) #pragma omp teams distribute parallel for -#elif defined(SMILEI_OPENACC_MODE) +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) const int istart_offset = istart - ipart_buffer_offset; const int particle_number = iend - istart; diff --git a/src/Pusher/PusherPhoton.cpp b/src/Pusher/PusherPhoton.cpp index a94a521e3..5feb7823d 100755 --- a/src/Pusher/PusherPhoton.cpp +++ b/src/Pusher/PusherPhoton.cpp @@ -53,7 +53,7 @@ void PusherPhoton::operator()( Particles &particles, SmileiMPI *smpi, position_y /* [istart:particle_number] */, \ position_z /* [istart:particle_number] */ ) #pragma omp teams distribute parallel for -#elif defined(SMILEI_OPENACC_MODE) +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) const int istart_offset = istart - ipart_ref; const int particle_number = iend - istart; diff --git a/src/Pusher/PusherPonderomotiveBoris.cpp b/src/Pusher/PusherPonderomotiveBoris.cpp index 41afa42e6..9d151dabb 100755 --- a/src/Pusher/PusherPonderomotiveBoris.cpp +++ b/src/Pusher/PusherPonderomotiveBoris.cpp @@ -55,7 +55,7 @@ void PusherPonderomotiveBoris::operator()( Particles &particles, SmileiMPI *smpi const double *const __restrict__ GradPhiz = &( ( *GradPhipart )[2*nparts] ); //double *inv_gamma_ponderomotive = &( ( *dynamics_inv_gamma_ponderomotive )[0*nparts] ); - #ifndef SMILEI_OPENACC_MODE + #ifndef SMILEI_ACCELERATOR_GPU_OACC #pragma omp simd #else int np = iend-istart; diff --git a/src/Pusher/PusherPonderomotiveBorisBTIS3.cpp b/src/Pusher/PusherPonderomotiveBorisBTIS3.cpp index 379f41763..a32f359cb 100644 --- a/src/Pusher/PusherPonderomotiveBorisBTIS3.cpp +++ b/src/Pusher/PusherPonderomotiveBorisBTIS3.cpp @@ -31,7 +31,6 @@ void PusherPonderomotiveBorisBTIS3::operator()( Particles &particles, SmileiMPI double charge_over_mass_dts2, charge_sq_over_mass_sq_dts4; double umx, umy, umz, upx, upy, upz; double alpha; - double TxTy, TyTz, TzTx; double pxsm, pysm, pzsm; //double one_ov_gamma_ponderomotive; diff --git a/src/Pusher/PusherPonderomotivePositionBoris.cpp b/src/Pusher/PusherPonderomotivePositionBoris.cpp index 16a4e6c69..9b9bea639 100755 --- a/src/Pusher/PusherPonderomotivePositionBoris.cpp +++ b/src/Pusher/PusherPonderomotivePositionBoris.cpp @@ -52,7 +52,7 @@ void 
PusherPonderomotivePositionBoris::operator()( Particles &particles, SmileiM const double *const __restrict__ GradPhi_my = &( ( *GradPhi_mpart )[1*nparts] ); const double *const __restrict__ GradPhi_mz = &( ( *GradPhi_mpart )[2*nparts] ); - #ifndef SMILEI_OPENACC_MODE + #ifndef SMILEI_ACCELERATOR_GPU_OACC #pragma omp simd #else int np = iend-istart; diff --git a/src/Pusher/PusherVay.cpp b/src/Pusher/PusherVay.cpp index c1ba76693..83debaae4 100755 --- a/src/Pusher/PusherVay.cpp +++ b/src/Pusher/PusherVay.cpp @@ -67,7 +67,7 @@ void PusherVay::operator()( Particles &particles, SmileiMPI *smpi, int istart, i position_y /* [istart:particle_number] */, \ position_z /* [istart:particle_number] */ ) #pragma omp teams distribute parallel for -#elif defined(SMILEI_OPENACC_MODE) +#elif defined(SMILEI_ACCELERATOR_GPU_OACC) const int istart_offset = istart - ipart_buffer_offset; const int particle_number = iend - istart; diff --git a/src/Python/pyinit.py b/src/Python/pyinit.py index 56febc475..f5aeeb7e1 100755 --- a/src/Python/pyinit.py +++ b/src/Python/pyinit.py @@ -645,7 +645,8 @@ class MultiphotonBreitWheeler(SmileiComponent): # Smilei-defined smilei_mpi_rank = 0 smilei_mpi_size = 1 -smilei_rand_max = 2**31-1 +smilei_omp_threads = 1 +smilei_total_cores = 1 # Variable to set to False for the actual run (useful for the test mode) _test_mode = True diff --git a/src/Python/pyprofiles.py b/src/Python/pyprofiles.py index 0e122a1a9..2fff14c1f 100755 --- a/src/Python/pyprofiles.py +++ b/src/Python/pyprofiles.py @@ -702,7 +702,7 @@ def LaserGaussianAM( box_side="xmin", a0=1., omega=1., focus=None, waist=3., print("ERROR: focus should be a list of length 1") exit(1) elif (len(focus)==2): - print("WARNING: deprecated focus in LaserEnvelopeGaussianAM should be a list of length 1") + print("WARNING: deprecated focus in LaserGaussianAM should be a list of length 1") # Polarization and amplitude [dephasing, amplitudeY, amplitudeZ] = transformPolarization(polarization_phi, ellipticity) amplitudeY *= a0 * omega diff --git a/src/Radiation/RadiationCorrLandauLifshitz.cpp b/src/Radiation/RadiationCorrLandauLifshitz.cpp index 16c7b01fe..ebb0e54dd 100755 --- a/src/Radiation/RadiationCorrLandauLifshitz.cpp +++ b/src/Radiation/RadiationCorrLandauLifshitz.cpp @@ -96,7 +96,7 @@ void RadiationCorrLandauLifshitz::operator()( // cumulative Radiated energy from istart to iend double radiated_energy_loc = 0; -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC // Local vector to store the radiated energy double * rad_norm_energy = new double [iend-istart]; // double * rad_norm_energy = (double*) aligned_alloc(64, (iend-istart)*sizeof(double)); @@ -112,7 +112,7 @@ void RadiationCorrLandauLifshitz::operator()( // Computation // NVIDIA GPUs - #if defined (SMILEI_OPENACC_MODE) + #if defined (SMILEI_ACCELERATOR_GPU_OACC) const int istart_offset = istart - ipart_ref; const int np = iend-istart; #pragma acc parallel \ @@ -185,7 +185,7 @@ void RadiationCorrLandauLifshitz::operator()( // _______________________________________________________________ // Computation of the thread radiated energy -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC // Exact energy loss due to the radiation rad_norm_energy[ipart-istart] = gamma - std::sqrt( 1.0 @@ -210,7 +210,7 @@ void RadiationCorrLandauLifshitz::operator()( // _______________________________________________________________ // Update of the quantum parameter -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC #pragma omp simd for( int ipart=istart ; 
ipart #include -#if defined(SMILEI_OPENACC_MODE) +#if defined(SMILEI_ACCELERATOR_GPU_OACC) #define __HIP_PLATFORM_NVCC__ #define __HIP_PLATFORM_NVIDIA__ #include "gpuRandom.h" @@ -103,7 +103,7 @@ void RadiationMonteCarlo::operator()( // Temporary double parameter double temp; -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC unsigned long long seed; // Parameters for CUDA generator unsigned long long seq; unsigned long long offset; @@ -152,7 +152,7 @@ void RadiationMonteCarlo::operator()( // Number of photons int nphotons; -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC int nphotons_start; #endif @@ -160,7 +160,7 @@ void RadiationMonteCarlo::operator()( const double photon_buffer_size_per_particle = radiation_photon_sampling_ * max_photon_emissions_; if (photons) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC // We reserve a large number of potential photons on device since we can't reallocate nphotons_start = photons->deviceSize(); //static_cast(photons)->deviceReserve( nphotons + (iend - istart) * photon_buffer_size_per_particle ); @@ -199,13 +199,13 @@ void RadiationMonteCarlo::operator()( double *const __restrict__ photon_tau = photons ? (photons->has_Monte_Carlo_process ? photons->getPtrTau() : nullptr) : nullptr; -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC // Cell keys as a mask int *const __restrict__ photon_cell_keys = photons ? photons->getPtrCellKeys() : nullptr; #endif // Table properties ---------------------------------------------------------------- -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC // Size of tables // int size_of_Table_integfochi = RadiationTables.integfochi_.size_particle_chi_; // int size_of_Table_min_photon_chi = RadiationTables.xi_.size_particle_chi_; @@ -221,7 +221,7 @@ void RadiationMonteCarlo::operator()( // _______________________________________________________________ // Computation -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC // Management of the data on GPU though this data region int np = iend-istart; @@ -342,7 +342,7 @@ void RadiationMonteCarlo::operator()( // New final optical depth to reach for emision while( tau[ipart] <= epsilon_tau_ ) { //tau[ipart] = -log( 1.-Rand::uniform() ); - #ifndef SMILEI_OPENACC_MODE + #ifndef SMILEI_ACCELERATOR_GPU_OACC tau[ipart] = -std::log( 1.-rand_->uniform() ); #else seed_curand_1 = (int) (ipart+1)*(initial_seed_1+1); //Seed for linear generator @@ -385,7 +385,7 @@ void RadiationMonteCarlo::operator()( // Draw random number in [0,1[ - #ifndef SMILEI_OPENACC_MODE + #ifndef SMILEI_ACCELERATOR_GPU_OACC random_number = rand_->uniform(); #else seed_curand_2 = (int) (ipart + 1)*(initial_seed_2 + 1); //Seed for linear generator @@ -433,7 +433,7 @@ void RadiationMonteCarlo::operator()( && ( i_photon_emission < max_photon_emissions_)) { // CPU implementation (non-threaded implementation) -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC // Creation of new photons in the temporary array photons photons->createParticles( radiation_photon_sampling_ ); @@ -611,14 +611,14 @@ void RadiationMonteCarlo::operator()( } // end while } // end for -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC } // end acc parallel #endif //if (photons) std::cerr << photons->deviceSize() << std::endl; // Remove extra space to save memory -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC if (photons) { photons->shrinkToFit( true ); } @@ -631,7 +631,7 @@ void RadiationMonteCarlo::operator()( // 
____________________________________________________ // Update of the quantum parameter chi -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC #pragma omp simd #else int np = iend-istart; @@ -660,11 +660,11 @@ void RadiationMonteCarlo::operator()( } - #ifdef SMILEI_OPENACC_MODE + #ifdef SMILEI_ACCELERATOR_GPU_OACC } // end acc parallel #endif -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC } // end acc data #endif diff --git a/src/Radiation/RadiationMonteCarlo.h b/src/Radiation/RadiationMonteCarlo.h index 34b8c31db..4e84f169d 100755 --- a/src/Radiation/RadiationMonteCarlo.h +++ b/src/Radiation/RadiationMonteCarlo.h @@ -16,7 +16,7 @@ #include "Radiation.h" #include "userFunctions.h" -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #include // This is wrong. Dont include nvidiaParticles, it may cause problem! // See particle factory. diff --git a/src/Radiation/RadiationNiel.cpp b/src/Radiation/RadiationNiel.cpp index 6e61f3759..dff292df4 100755 --- a/src/Radiation/RadiationNiel.cpp +++ b/src/Radiation/RadiationNiel.cpp @@ -127,7 +127,7 @@ void RadiationNiel::operator()( double radiated_energy_loc = 0; // Parameters for linear alleatory number generator - #ifdef SMILEI_OPENACC_MODE + #ifdef SMILEI_ACCELERATOR_GPU_OACC // Initialize initial seed for linear generator double initial_seed = rand_->uniform(); @@ -144,7 +144,7 @@ void RadiationNiel::operator()( //double t0 = MPI_Wtime(); // 1) Vectorized computation of gamma and the particle quantum parameter -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC #pragma omp simd #else @@ -190,12 +190,12 @@ void RadiationNiel::operator()( Ex[ipart-ipart_ref], Ey[ipart-ipart_ref], Ez[ipart-ipart_ref], Bx[ipart-ipart_ref], By[ipart-ipart_ref], Bz[ipart-ipart_ref] ); -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC } //finish cycle #endif //double t1 = MPI_Wtime(); - #ifdef SMILEI_OPENACC_MODE + #ifdef SMILEI_ACCELERATOR_GPU_OACC if( particle_chi[ipart] > minimum_chi_continuous ) { seed_curand = (int) (ipart+1)*(initial_seed+1); //Seed for linear generator @@ -297,7 +297,7 @@ void RadiationNiel::operator()( if( niel_computation_method == 0 ) { - #ifndef SMILEI_OPENACC_MODE + #ifndef SMILEI_ACCELERATOR_GPU_OACC for( ipart=istart ; ipart minimum_chi_continuous ) { @@ -310,7 +310,7 @@ void RadiationNiel::operator()( diffusion[ipart-istart] = std::sqrt( factor_classical_radiated_power*gamma[ipart-ipart_ref]*temp )*random_numbers[ipart-istart]; - #ifndef SMILEI_OPENACC_MODE + #ifndef SMILEI_ACCELERATOR_GPU_OACC } } #endif @@ -318,7 +318,7 @@ void RadiationNiel::operator()( // Using the fit at order 5 (vectorized) else if( niel_computation_method == 1 ) { - #ifndef SMILEI_OPENACC_MODE + #ifndef SMILEI_ACCELERATOR_GPU_OACC #pragma omp simd private(temp) for( ipart=istart ; ipart #endif diff --git a/src/Radiation/RadiationTables.h b/src/Radiation/RadiationTables.h index bc5003966..77bcac8e2 100755 --- a/src/Radiation/RadiationTables.h +++ b/src/Radiation/RadiationTables.h @@ -58,7 +58,7 @@ class RadiationTables //! param[in] particle_chi particle quantum parameter //! param[in] particle_gamma particle Lorentz factor //! param[in] integfochi_table table of the discretized integrated f/chi function for Photon production yield computation -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif double computePhotonProductionYield( const double particle_chi, @@ -77,7 +77,7 @@ class RadiationTables //! \param[in] xi //! 
\param[in] table_min_photon_chi //! \param[in] table_xi -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif double computeRandomPhotonChiWithInterpolation( double particle_chi, @@ -95,7 +95,7 @@ class RadiationTables //! from the computed table niel_.table //! \param particle_chi particle quantum parameter -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif double getHNielFromTable( double particle_chi, double * tableNiel); @@ -116,7 +116,7 @@ class RadiationTables //! \param particle_chi particle quantum parameter //! \param dt time step //#pragma omp declare simd -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif inline double __attribute__((always_inline)) getRidgersCorrectedRadiatedEnergy( const double particle_chi, @@ -138,7 +138,7 @@ class RadiationTables //! Get of the classical continuous radiated energy during dt //! \param particle_chi particle quantum parameter //! \param dt time step -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif inline double __attribute__((always_inline)) getClassicalRadiatedEnergy( double particle_chi, double dt ) @@ -148,7 +148,7 @@ class RadiationTables //! Return the minimum_chi_discontinuous_ value //! Under this value, no discontinuous radiation reaction -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif inline double __attribute__((always_inline)) getMinimumChiDiscontinuous() @@ -158,7 +158,7 @@ class RadiationTables //! Return the minimum_chi_continuous_ value //! Under this value, no continuous radiation reaction -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif inline double __attribute__((always_inline)) getMinimumChiContinuous() diff --git a/src/Radiation/RadiationTools.h b/src/Radiation/RadiationTools.h index 33cb5f501..1746c894e 100644 --- a/src/Radiation/RadiationTools.h +++ b/src/Radiation/RadiationTools.h @@ -32,7 +32,7 @@ class RadiationTools { //! Valid between particle_chi in 1E-3 and 1E1 //! \param particle_chi particle quantum parameter // ----------------------------------------------------------------------------- -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif static inline double __attribute__((always_inline)) getHNielFitOrder10(double particle_chi) @@ -62,7 +62,7 @@ class RadiationTools { //! Valid between particle_chi in 1E-3 and 1E1 //! \param particle_chi particle quantum parameter // ----------------------------------------------------------------------------- -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif static inline double __attribute__((always_inline)) getHNielFitOrder5(double particle_chi) @@ -86,7 +86,7 @@ class RadiationTools { //! Ridgers et al., ArXiv 1708.04511 (2017) //! \param particle_chi particle quantum parameter // ----------------------------------------------------------------------------- -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif static inline double __attribute__((always_inline)) getHNielFitRidgers(double particle_chi) @@ -104,7 +104,7 @@ class RadiationTools { //! approximation formulae //! 
\param particle_chi particle quantum parameter //#pragma omp declare simd -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif static inline double __attribute__((always_inline)) computeGRidgers(double particle_chi) @@ -117,7 +117,7 @@ class RadiationTools { //! Return f1(nu) = Int_nu^\infty K_{5/3}(y) dy //! used in computed synchrotron power spectrum // ----------------------------------------------------------------------------- -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif static inline double __attribute__((always_inline)) computeF1Nu(double nu) @@ -155,7 +155,7 @@ class RadiationTools { //! Return f2(nu) = BesselK_{2/3}(nu) //! used in computed synchrotron power spectrum // ----------------------------------------------------------------------------- -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif static inline double __attribute__((always_inline)) computeF2Nu(double nu) @@ -194,7 +194,7 @@ class RadiationTools { //! = Int_nu^\infty K_{5/3}(y) dy + cst * BesselK_{2/3}(nu) //! used in computed synchrotron power spectrum // ----------------------------------------------------------------------------- -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif static inline double __attribute__((always_inline)) computeBesselPartsRadiatedPower(double nu, double cst) diff --git a/src/Radiation/Table.h b/src/Radiation/Table.h index 8b74aeeaa..a028d4df3 100644 --- a/src/Radiation/Table.h +++ b/src/Radiation/Table.h @@ -45,7 +45,7 @@ class Table void compute_parameters(); //! get value using linear interpolation at position x -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc routine seq #endif double get(double x); diff --git a/src/Smilei.cpp b/src/Smilei.cpp index 15cd7b047..81ba6c258 100755 --- a/src/Smilei.cpp +++ b/src/Smilei.cpp @@ -20,7 +20,7 @@ #include #include #include -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #include #endif @@ -44,7 +44,7 @@ using namespace std; // MAIN CODE // --------------------------------------------------------------------------------------------------------------------- -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #ifdef _OPENACC void initialization_openacc() { @@ -80,7 +80,7 @@ int main( int argc, char *argv[] ) // ------------------------- // Create the OpenACC environment -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC initialization_openacc(); #endif @@ -124,7 +124,7 @@ int main( int argc, char *argv[] ) // oblivious to the program (only one, the one by default). // This could be a missed but very advanced optimization for some // kernels/exchange. - ERROR( "Simlei needs only one accelerator (GPU). Look for HIP_VISIBLE_DEVICES or 'gpu-bind=closest' in your SLURM script or use a custom binding script." ); + ERROR( "Smilei needs only one accelerator (GPU). Look for HIP_VISIBLE_DEVICES or 'gpu-bind=closest' in your SLURM script or use a custom binding script." 
); } else { // ::omp_set_default_device(0); } @@ -248,7 +248,7 @@ int main( int argc, char *argv[] ) checkpoint.restartAll( vecPatches, region, &smpi, params ); -#if !defined( SMILEI_ACCELERATOR_MODE ) +#if !defined( SMILEI_ACCELERATOR_GPU ) // CPU only, its too early to sort on GPU vecPatches.initialParticleSorting( params ); #endif @@ -271,7 +271,7 @@ int main( int argc, char *argv[] ) PatchesFactory::createVector( vecPatches, params, &smpi, openPMD, &radiation_tables_, 0 ); -#if !(defined( SMILEI_ACCELERATOR_MODE )) +#if !(defined( SMILEI_ACCELERATOR_GPU )) // CPU only, its too early to sort on GPU vecPatches.initialParticleSorting( params ); #endif @@ -407,7 +407,7 @@ int main( int argc, char *argv[] ) } } -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) TITLE( "GPU allocation and copy of the fields and particles" ); // Allocate particle and field arrays // Also copy particle array content on device @@ -629,7 +629,7 @@ int main( int argc, char *argv[] ) #pragma omp parallel shared (time_dual,smpi,params, vecPatches, region, simWindow, checkpoint, itime) { // finalize particle exchanges and sort particles - vecPatches.finalizeAndSortParticles( params, &smpi, simWindow, + vecPatches.finalizeExchParticlesAndSort( params, &smpi, simWindow, time_dual, timers, itime ); // Particle merging @@ -685,7 +685,7 @@ int main( int argc, char *argv[] ) } //End omp parallel region if( params.has_load_balancing && params.load_balancing_time_selection->theTimeIsNow( itime ) ) { -// #if defined( SMILEI_ACCELERATOR_MODE ) +// #if defined( SMILEI_ACCELERATOR_GPU ) // ERROR( "Load balancing not tested on GPU !" ); // #endif count_dlb++; @@ -777,7 +777,7 @@ int main( int argc, char *argv[] ) region.clean(); } -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) vecPatches.cleanDataOnDevice( params, &smpi, &radiation_tables_, &multiphoton_Breit_Wheeler_tables_ ); #endif diff --git a/src/SmileiMPI/AsyncMPIbuffers.cpp b/src/SmileiMPI/AsyncMPIbuffers.cpp index 0f7cebe9d..ff8efb17f 100755 --- a/src/SmileiMPI/AsyncMPIbuffers.cpp +++ b/src/SmileiMPI/AsyncMPIbuffers.cpp @@ -1,5 +1,6 @@ #include "AsyncMPIbuffers.h" +#include "ParticlesFactory.h" #include "Field.h" #include "Patch.h" @@ -66,30 +67,47 @@ SpeciesMPIbuffers::SpeciesMPIbuffers() SpeciesMPIbuffers::~SpeciesMPIbuffers() { + for( size_t i=0 ; i > partRecv; + std::vector< std::vector > partRecv; //! ndim vectors of 2 received packets of particles (1 per direction) - std::vector< std::vector > partSend; + std::vector< std::vector > partSend; - //! ndim vectors of 2 vectors of index particles to send (1 per direction) - //! - not sent - // - used to sort Species::indexes_of_particles_to_exchange built in Species::dynamics - std::vector< std::vector< std::vector > > part_index_send; //! ndim vectors of 2 numbers of particles to send (1 per direction) - std::vector< std::vector< unsigned int > > part_index_send_sz; + std::vector< std::vector< unsigned int > > partSendSize; //! 
ndim vectors of 2 numbers of particles to receive (1 per direction) - std::vector< std::vector< unsigned int > > part_index_recv_sz; + std::vector< std::vector< unsigned int > > partRecvSize; }; diff --git a/src/SmileiMPI/SmileiMPI.cpp b/src/SmileiMPI/SmileiMPI.cpp index c35a69fe9..5e3a6b2da 100755 --- a/src/SmileiMPI/SmileiMPI.cpp +++ b/src/SmileiMPI/SmileiMPI.cpp @@ -763,7 +763,7 @@ void SmileiMPI::isend_species( Patch *patch, int to, int &irequest, int tag, Par irequest ++; } -#if defined( SMILEI_ACCELERATOR_MODE) +#if defined( SMILEI_ACCELERATOR_GPU) // For the particles for( unsigned int ispec=0; ispecvecSpecies[ispec]->particles, from, tag+2*ispec, recvParts ); MPI_Type_free( &( recvParts ) ); } - patch->vecSpecies[ispec]->particles->initializeDataOnDevice(); - patch->vecSpecies[ispec]->particles_to_move->initializeDataOnDevice(); + patch->vecSpecies[ispec]->allocateParticlesOnDevice(); } @@ -1210,7 +1209,7 @@ void SmileiMPI::send_PML(ElectroMagn *EM, Tpml embc, int bcId, int to, int &ire void SmileiMPI::isend( ElectroMagn *EM, int to, int &irequest, vector &requests, int tag, bool send_xmax_bc ) { -// #if defined (SMILEI_ACCELERATOR_MODE) +// #if defined (SMILEI_ACCELERATOR_GPU) // isendOnDevice( EM->Ex_, to, tag+irequest, requests[irequest] ); // irequest++; @@ -1319,9 +1318,9 @@ void SmileiMPI::isend( ElectroMagn *EM, int to, int &irequest, vector( EM->emBoundCond[bcId] ) ) { ElectroMagnBC1D_SM *embc = static_cast( EM->emBoundCond[bcId] ); - MPI_Isend( &( embc->By_val ), 1, MPI_DOUBLE, to, tag+irequest, MPI_COMM_WORLD, &requests[irequest] ); + MPI_Isend( &( embc->By_val_ ), 1, MPI_DOUBLE, to, tag+irequest, MPI_COMM_WORLD, &requests[irequest] ); irequest++; - MPI_Isend( &( embc->Bz_val ), 1, MPI_DOUBLE, to, tag+irequest, MPI_COMM_WORLD, &requests[irequest] ); + MPI_Isend( &( embc->Bz_val_ ), 1, MPI_DOUBLE, to, tag+irequest, MPI_COMM_WORLD, &requests[irequest] ); irequest++; } else if( dynamic_cast( EM->emBoundCond[bcId] ) ) { // BCs at the x-border @@ -1746,7 +1745,7 @@ int SmileiMPI::recv_PML(ElectroMagn *EM, Tpml embc, int bcId, int from, int tag void SmileiMPI::recv( ElectroMagn *EM, int from, int &tag, bool recv_xmin_bc ) { -// #if defined (SMILEI_ACCELERATOR_MODE) +// #if defined (SMILEI_ACCELERATOR_GPU) // recvOnDevice( EM->Ex_, from, tag ); // tag++; @@ -1855,9 +1854,9 @@ void SmileiMPI::recv( ElectroMagn *EM, int from, int &tag, bool recv_xmin_bc ) if( dynamic_cast( EM->emBoundCond[bcId] ) ) { ElectroMagnBC1D_SM *embc = static_cast( EM->emBoundCond[bcId] ); MPI_Status status; - MPI_Recv( &( embc->By_val ), 1, MPI_DOUBLE, from, tag, MPI_COMM_WORLD, &status ); + MPI_Recv( &( embc->By_val_ ), 1, MPI_DOUBLE, from, tag, MPI_COMM_WORLD, &status ); tag++; - MPI_Recv( &( embc->Bz_val ), 1, MPI_DOUBLE, from, tag, MPI_COMM_WORLD, &status ); + MPI_Recv( &( embc->Bz_val_ ), 1, MPI_DOUBLE, from, tag, MPI_COMM_WORLD, &status ); tag++; } else if( dynamic_cast( EM->emBoundCond[bcId] ) ) { // BCs at the x-border @@ -2122,7 +2121,7 @@ void SmileiMPI::isend( Field *field, int to, int tag, MPI_Request &request ) } // End isend ( Field ) -#if defined (SMILEI_ACCELERATOR_MODE) +#if defined (SMILEI_ACCELERATOR_GPU) //! 
Sends the whole Field Device to Device (assuming MPI enables it) void SmileiMPI::isendOnDevice( Field *field, int to, int tag, MPI_Request &request ) { @@ -2195,7 +2194,7 @@ void SmileiMPI::recv( Field *field, int from, int tag ) } // End recv ( Field ) -#if defined (SMILEI_ACCELERATOR_MODE) +#if defined (SMILEI_ACCELERATOR_GPU) void SmileiMPI::recvOnDevice( Field *field, int from, int tag ) { @@ -2525,7 +2524,7 @@ void SmileiMPI::eraseBufferParticleTrail( const int ndim, const int istart, cons } -#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC ) template static inline void diff --git a/src/SmileiMPI/SmileiMPI.h b/src/SmileiMPI/SmileiMPI.h index 13cacc416..2785921de 100755 --- a/src/SmileiMPI/SmileiMPI.h +++ b/src/SmileiMPI/SmileiMPI.h @@ -103,7 +103,7 @@ class SmileiMPI //! Sends the whole Field void isend( Field *field, int to, int tag, MPI_Request &request ); //! Sends the whole Field Device to Device (assuming MPI enables it) -#if defined (SMILEI_ACCELERATOR_MODE) +#if defined (SMILEI_ACCELERATOR_GPU) void isendOnDevice( Field *field, int to, int tag, MPI_Request &request ); #endif @@ -114,7 +114,7 @@ class SmileiMPI //! Receives the whole Field void recv( Field *field, int from, int tag); //! Receives the whole Field Device to Device (assuming MPI enables it) -#if defined (SMILEI_ACCELERATOR_MODE) +#if defined (SMILEI_ACCELERATOR_GPU) void recvOnDevice( Field *field, int from, int tag); #endif @@ -248,7 +248,7 @@ class SmileiMPI //! Erase Particles from istart ot the end in the buffers of thread ithread void eraseBufferParticleTrail( const int ndim, const int istart, const int ithread, bool isAM = false ); -#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC ) //! Map CPU buffers onto the GPU to at least accommodate particle_count //! particles. This method tries to reduce the number of //! 
allocation/deallocation which produces a lot of fragmentation on some diff --git a/src/Species/Species.cpp b/src/Species/Species.cpp index 37462566f..7555cb778 100755 --- a/src/Species/Species.cpp +++ b/src/Species/Species.cpp @@ -90,7 +90,6 @@ Species::Species( Params ¶ms, Patch *patch ) : { // &particles_sorted[0] particles = ParticlesFactory::create( params, *patch ); - particles_to_move = ParticlesFactory::create( params, *patch ); regular_number_array_.clear(); partBoundCond = NULL; @@ -104,7 +103,7 @@ Species::Species( Params ¶ms, Patch *patch ) : dx_inv_[1] = 1./cell_length[1]; dx_inv_[2] = 1./cell_length[2]; - initCluster( params ); + initCluster( params, patch ); inv_nDim_particles = 1./( ( double )nDim_particle ); length_[0]=0; @@ -123,7 +122,7 @@ Species::Species( Params ¶ms, Patch *patch ) : }//END Species creator -void Species::initCluster( Params ¶ms ) +void Species::initCluster( Params ¶ms, Patch *patch ) { // NOTE: On GPU we dont use first_index, it would contain redundant data but // we are forced to initialize it due to ParticleCreator::create() and the @@ -252,7 +251,7 @@ void Species::initCluster( Params ¶ms ) #endif //Initialize specMPI - MPI_buffer_.allocate( nDim_field ); + MPI_buffer_.allocate( params, patch ); //ener_tot = 0.; nrj_bc_lost = 0.; @@ -378,18 +377,14 @@ void Species::initOperators( Params ¶ms, Patch *patch ) partBoundCond = new PartBoundCond( params, this, patch ); for( unsigned int iDim=0 ; iDim < nDim_field ; iDim++ ) { for( unsigned int iNeighbor=0 ; iNeighbor<2 ; iNeighbor++ ) { - MPI_buffer_.partRecv[iDim][iNeighbor].initialize( 0, ( *particles ) ); - MPI_buffer_.partSend[iDim][iNeighbor].initialize( 0, ( *particles ) ); - MPI_buffer_.part_index_send[iDim][iNeighbor].resize( 0 ); - MPI_buffer_.part_index_recv_sz[iDim][iNeighbor] = 0; - MPI_buffer_.part_index_send_sz[iDim][iNeighbor] = 0; + MPI_buffer_.partRecv[iDim][iNeighbor]->initialize( 0, ( *particles ) ); + MPI_buffer_.partSend[iDim][iNeighbor]->initialize( 0, ( *particles ) ); } } typePartSend.resize( nDim_field*2, MPI_DATATYPE_NULL ); typePartRecv.resize( nDim_field*2, MPI_DATATYPE_NULL ); exchangePatch = MPI_DATATYPE_NULL; - particles_to_move->initialize( 0, *particles ); } @@ -399,7 +394,6 @@ void Species::initOperators( Params ¶ms, Patch *patch ) Species::~Species() { delete particles; - delete particles_to_move; delete Push; delete Interp; @@ -506,7 +500,7 @@ Species::~Species() } -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) //! 
Prepare the species Current and Rho grids on Device void Species::prepareSpeciesCurrentAndChargeOnDevice( @@ -546,7 +540,7 @@ Species::prepareSpeciesCurrentAndChargeOnDevice( } -#if defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc parallel present( Jx_s[0:Jx_size], \ Jy_s[0:Jy_size], \ Jz_s[0:Jz_size], \ @@ -557,7 +551,7 @@ Species::prepareSpeciesCurrentAndChargeOnDevice( #if defined( SMILEI_ACCELERATOR_GPU_OMP ) #pragma omp target #pragma omp teams distribute parallel for -#elif defined( SMILEI_OPENACC_MODE ) +#elif defined( SMILEI_ACCELERATOR_GPU_OACC ) #pragma acc loop gang worker vector #endif for( unsigned int i=0 ; iinitializeDataOnDevice(); + + // The first send/recv buffers are also on device + MPI_buffer_.partSend[0][0]->initializeDataOnDevice(); + MPI_buffer_.partSend[0][1]->initializeDataOnDevice(); + MPI_buffer_.partRecv[0][0]->initializeDataOnDevice(); + MPI_buffer_.partRecv[0][1]->initializeDataOnDevice(); + + // Create photon species on the device + if( radiation_model_ == "mc" && photon_species_ ) { + radiated_photons_->initializeDataOnDevice(); + } + + // Create pair species on the device + if( mBW_pair_species_[0] && mBW_pair_species_[1] ) { + mBW_pair_particles_[0]->initializeDataOnDevice(); + mBW_pair_particles_[1]->initializeDataOnDevice(); + } +} + + //! Copy particles from host to device void Species::copyParticlesFromHostToDevice() @@ -641,7 +659,7 @@ Species::copyParticlesFromHostToDevice() particles->copyFromHostToDevice(); } -#endif // end if SMILEI_ACCELERATOR_MODE +#endif // end if SMILEI_ACCELERATOR_GPU // --------------------------------------------------------------------------------------------------------------------- //! Method calculating the Particle dynamics (interpolation, pusher, projection and more) @@ -682,7 +700,7 @@ void Species::dynamics( double time_dual, if( time_dual>time_frozen_ || Ionize) { // moving particle // Prepare temporary buffers for this iteration -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) smpi->resizeDeviceBuffers( ithread, nDim_field, particles->numberOfParticles() ); @@ -695,7 +713,7 @@ void Species::dynamics( double time_dual, patch->startFineTimer(mBW_timer_id_); -#if defined( SMILEI_OPENACC_MODE) +#if defined( SMILEI_ACCELERATOR_GPU_OACC) static_cast(mBW_pair_particles_[0])->deviceResize( particles->deviceSize() * Multiphoton_Breit_Wheeler_process->getPairCreationSampling(0) ); static_cast(mBW_pair_particles_[0])->resetCellKeys(); static_cast(mBW_pair_particles_[1])->deviceResize( particles->deviceSize() * Multiphoton_Breit_Wheeler_process->getPairCreationSampling(1) ); @@ -708,7 +726,7 @@ void Species::dynamics( double time_dual, patch->stopFineTimer(mBW_timer_id_); } -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) // Make sure some bin preconditions are respected SMILEI_ASSERT( particles->first_index.size() == 1 ); SMILEI_ASSERT( particles->last_index.size() >= 1 ); @@ -814,7 +832,7 @@ void Species::dynamics( double time_dual, // Compression of the bins if necessary if( Multiphoton_Breit_Wheeler_process ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC removeTaggedParticles(smpi, &particles->first_index[0], &particles->last_index[0], @@ -1672,14 +1690,14 @@ void Species::dynamicsImportParticles( double time_dual, Params ¶ms, Patch * // Radiation losses if( Radiate && photon_species_ ) { // If creation of macro-photon, we add them to photon_species -#ifdef SMILEI_OPENACC_MODE +#ifdef 
SMILEI_ACCELERATOR_GPU_OACC // We first erase empty slots in the buffer of photons // radiation_photons_->cell_keys is used as a mask static_cast(radiated_photons_)->eraseLeavingParticles(); #endif photon_species_->importParticles( params, patch, *radiated_photons_, localDiags, time_dual ); -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC // We explicitely clear the device Particles static_cast(radiated_photons_)->deviceClear(); #endif @@ -1691,7 +1709,7 @@ void Species::dynamicsImportParticles( double time_dual, Params ¶ms, Patch * // Addition of the electron-positron particles for( int k=0; k<2; k++ ) { -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC // We first erase empty slots in the buffer of photons // radiation_photons_->cell_keys is used as a mask static_cast(mBW_pair_particles_[k])->eraseLeavingParticles(); @@ -1699,7 +1717,7 @@ void Species::dynamicsImportParticles( double time_dual, Params ¶ms, Patch * mBW_pair_species_[k]->importParticles( params, patch, *mBW_pair_particles_[k], localDiags, time_dual ); -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC // We explicitely clear the device Particles static_cast(mBW_pair_particles_[k])->deviceClear(); #endif @@ -1747,53 +1765,32 @@ void Species::computeCharge( ElectroMagn *EMfields, bool old /*=false*/ ) }//END computeCharge -void Species::extractParticles() -{ - particles->extractParticles( particles_to_move ); -} - -// void Species::injectParticles( Params ¶ms ) -// { -// } - - // --------------------------------------------------------------------------------------------------------------------- //! Sort particles // --------------------------------------------------------------------------------------------------------------------- void Species::sortParticles( Params ¶ms ) { -#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC ) // ----------------------------- // GPU version - - // particles_to_move contains, up to here, send particles - // clean it to manage recv particles - particles_to_move->clear(); // Clear on the host - // Merge all MPI_buffer_.partRecv in particles_to_move - for( int idim = 0; idim < params.nDim_field; idim++ ) { - for( int iNeighbor = 0; iNeighbor < 2; iNeighbor++ ) { - int n_part_recv = MPI_buffer_.part_index_recv_sz[idim][iNeighbor]; - if( ( n_part_recv != 0 ) ) { - // insert n_part_recv in particles_to_move from 0 - MPI_buffer_.partRecv[idim][iNeighbor].copyParticles( 0, - n_part_recv, - *particles_to_move, - particles_to_move->size() ); + + // Merge all MPI_buffer_.partRecv in the first one + Particles * first_buffer = MPI_buffer_.partRecv[0][0]; + for( auto &partRecvs: MPI_buffer_.partRecv ) { + for( auto partRecv: partRecvs ) { + if( partRecv != first_buffer && partRecv->size() > 0 ) { + partRecv->copyParticles( 0, partRecv->size(), *first_buffer, first_buffer->size() ); + partRecv->clear(); } } } - - particles_to_move->copyFromHostToDevice(); - - // // Erase particles that leaves this patch - // particles->last_index[0] = particles->eraseLeavingParticles(); - // - // // Inject newly arrived particles in particles_to_move - // particles->last_index[0] += particles->injectParticles( particles_to_move ); - - particles->importAndSortParticles( particles_to_move ); + + first_buffer->copyFromHostToDevice(); + + particles->importAndSortParticles( first_buffer ); + #else // -------------------------- @@ -1804,28 +1801,10 @@ void Species::sortParticles( 
Params ¶ms ) int ndim = params.nDim_field; int idim; - // Compute total number of particles received - // int total_number_part_recv = 0; - //Merge all MPI_buffer_.partRecv in particles_to_move - // for( int idim = 0; idim < ndim; idim++ ) { - // for( int iNeighbor=0 ; iNeighbor<2 ; iNeighbor++ ) { - // int n_part_recv = MPI_buffer_.part_index_recv_sz[idim][iNeighbor]; - // if( ( n_part_recv!=0 ) ) { - // // insert n_part_recv in particles_to_move from 0 - // //MPI_buffer_.partRecv[idim][iNeighbor].copyParticles( 0, n_part_recv, *particles_to_move, 0 ); - // total_number_part_recv += n_part_recv; - // //particles->last_index[particles->last_index.size()-1] += n_part_recv; - // //particles->cell_keys.resize(particles->cell_keys.size()+n_part_recv); - // } - // } - // } - //cout << "\t Species id : " << species_number_ << " - nparticles recv : " << blabla << endl; - - // Sort to adapt do cell_keys usage std::vector indexes_of_particles_to_exchange; for ( int ipart=0 ; ipart< (int)(getNbrOfParticles()) ; ipart++ ) { - if ( particles->cell_keys[ipart] == -1 ) { + if ( particles->cell_keys[ipart] < 0 ) { indexes_of_particles_to_exchange.push_back( ipart ); } } @@ -1900,15 +1879,15 @@ void Species::sortParticles( Params ¶ms ) //Evaluation of the necessary shift of all bins.2 //idim=0 - shift[1] += MPI_buffer_.part_index_recv_sz[0][0];//Particles coming from xmin all go to bin 0 and shift all the other bins. - shift[particles->last_index.size()] += MPI_buffer_.part_index_recv_sz[0][1];//Used only to count the total number of particles arrived. + shift[1] += MPI_buffer_.partRecv[0][0]->size();//Particles coming from xmin all go to bin 0 and shift all the other bins. + shift[particles->last_index.size()] += MPI_buffer_.partRecv[0][1]->size();//Used only to count the total number of particles arrived. //idim>0 for( idim = 1; idim < ndim; idim++ ) { for( int iNeighbor=0 ; iNeighborsize(); for( unsigned int j=0; j<( unsigned int )n_part_recv ; j++ ) { //We first evaluate how many particles arrive in each bin. - ii = int( ( MPI_buffer_.partRecv[idim][iNeighbor].position( 0, j )-min_loc )/dbin ); //bin in which the particle goes. + ii = int( ( MPI_buffer_.partRecv[idim][iNeighbor]->position( 0, j )-min_loc )/dbin ); //bin in which the particle goes. shift[ii+1]++; // It makes the next bins shift. } } @@ -1943,11 +1922,11 @@ void Species::sortParticles( Params ¶ms ) //Space has been made now to write the arriving particles into the correct bins //idim == 0 is the easy case, when particles arrive either in first or last bin. for( int iNeighbor=0 ; iNeighborsize(); //if ( (neighbor_[0][iNeighbor]!=MPI_PROC_NULL) && (n_part_recv!=0) ) { if( ( n_part_recv!=0 ) ) { ii = iNeighbor*( particles->last_index.size()-1 ); //0 if iNeighbor=0(particles coming from Xmin) and particles->last_index.size()-1 otherwise. 
- MPI_buffer_.partRecv[0][iNeighbor].overwriteParticle( 0, *particles, particles->last_index[ii], n_part_recv ); + MPI_buffer_.partRecv[0][iNeighbor]->overwriteParticle( 0, *particles, particles->last_index[ii], n_part_recv ); particles->last_index[ii] += n_part_recv ; } } @@ -1955,12 +1934,12 @@ void Species::sortParticles( Params ¶ms ) for( idim = 1; idim < ndim; idim++ ) { //if (idim!=iDim) continue; for( int iNeighbor=0 ; iNeighborsize(); //if ( (neighbor_[idim][iNeighbor]!=MPI_PROC_NULL) && (n_part_recv!=0) ) { if( ( n_part_recv!=0 ) ) { for( unsigned int j=0; j<( unsigned int )n_part_recv; j++ ) { - ii = int( ( MPI_buffer_.partRecv[idim][iNeighbor].position( 0, j )-min_loc )/dbin ); //bin in which the particle goes. - MPI_buffer_.partRecv[idim][iNeighbor].overwriteParticle( j, *particles, particles->last_index[ii] ); + ii = int( ( MPI_buffer_.partRecv[idim][iNeighbor]->position( 0, j )-min_loc )/dbin ); //bin in which the particle goes. + MPI_buffer_.partRecv[idim][iNeighbor]->overwriteParticle( j, *particles, particles->last_index[ii] ); particles->last_index[ii] ++ ; } } @@ -2117,15 +2096,16 @@ void Species::countSortParticles( Params ¶ms ) // Move all particles from another species to this one void Species::importParticles( Params ¶ms, Patch *patch, Particles &source_particles, vector &localDiags, double time_dual, Ionization *I ) { -#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_OPENACC_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU_OMP ) || defined( SMILEI_ACCELERATOR_GPU_OACC ) // --------------------------------------------------- // GPU version // Warning: the GPU version does not handle bin and sorting // Warning: the current GPU version does not handle tracked particles // Inject particles from source_particles - particles->last_index.back() += particles->injectParticles( &source_particles ); + particles->last_index.back() += particles->addParticles( &source_particles ); particles->last_index[0] = particles->last_index.back(); + source_particles.resize( 0 ); #else // --------------------------------------------------- @@ -2228,7 +2208,7 @@ void Species::compress(SmileiMPI *smpi, int ithread, bool compute_cell_keys) { const int nparts = smpi->dynamics_Epart[ithread].size()/3; -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC double *const __restrict__ weight = particles->getPtrWeight(); @@ -2267,7 +2247,7 @@ void Species::compress(SmileiMPI *smpi, int ithread, bool compute_cell_keys) { const int nbin = particles->numberOfBins(); -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc parallel \ present(Ex[0:nparts],Ey[0:nparts],Ez[0:nparts], \ Bx[0:nparts], By[0:nparts], Bz[0:nparts], \ @@ -2312,7 +2292,7 @@ void Species::compress(SmileiMPI *smpi, int ithread, bool compute_cell_keys) { if (copy_particle_number>0) { -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC particles->overwriteParticle(copy_first_index, particles->last_index[ibin], copy_particle_number, compute_cell_keys ); #else for (auto ipart = 0 ; ipart < copy_particle_number ; ipart ++) { @@ -2367,7 +2347,7 @@ void Species::compress(SmileiMPI *smpi, int ithread, bool compute_cell_keys) { } } -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC if (thetaold) { for( unsigned int ipart = 0 ; ipart < copy_particle_number ; ipart ++ ) { thetaold[copy_first_index + ipart] = thetaold[particles->last_index[ibin] + ipart]; @@ -2405,7 +2385,7 @@ void Species::compress(SmileiMPI *smpi, int ithread, bool compute_cell_keys) { } } -#ifdef 
SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC } // end parallel region #endif @@ -2439,7 +2419,7 @@ void Species::removeTaggedParticlesPerBin( // Weight shortcut double *const __restrict__ weight = particles->getPtrWeight(); -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC double *const __restrict__ position_x = particles->getPtrPosition( 0 ); double *const __restrict__ position_y = nDim_particle > 1 ? particles->getPtrPosition( 1 ) : nullptr; double *const __restrict__ position_z = nDim_particle > 2 ? particles->getPtrPosition( 2 ) : nullptr; @@ -2457,7 +2437,7 @@ void Species::removeTaggedParticlesPerBin( // Total number of bins / cells const int nbin = particles->numberOfBins(); -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #pragma acc parallel \ present(Epart[0:nparts*3],\ Bpart[0:nparts*3], \ @@ -2499,7 +2479,7 @@ void Species::removeTaggedParticlesPerBin( if( ipart < last_photon_index ) { // The last existing photon comes to the position of // the deleted photon -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC particles->overwriteParticle( last_photon_index, ipart, compute_cell_keys ); #else weight[ipart] = weight[last_photon_index]; @@ -2533,7 +2513,7 @@ void Species::removeTaggedParticlesPerBin( } gamma[ipart] = gamma[0*nparts+last_photon_index]; -#ifndef SMILEI_OPENACC_MODE +#ifndef SMILEI_ACCELERATOR_GPU_OACC if (thetaold) { thetaold[0*nparts+ipart] = thetaold[0*nparts+last_photon_index]; } @@ -2560,13 +2540,14 @@ void Species::removeTaggedParticlesPerBin( } // if last_index[ibin] > first_index[ibin] } // end loop over the bins -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC } // end parallel region #endif } //! This method removes particles with a negative weight //! when a single bin is used +#ifdef SMILEI_ACCELERATOR_GPU_OACC void Species::removeTaggedParticles( SmileiMPI *smpi, int *const first_index, @@ -2575,8 +2556,6 @@ void Species::removeTaggedParticles( bool compute_cell_keys) { -#ifdef SMILEI_OPENACC_MODE - unsigned int new_n_parts = 0; unsigned int nb_deleted = 0; @@ -2644,7 +2623,7 @@ void Species::removeTaggedParticles( // that will not be erased // Backward loop over the tagged particles to fill holes in the photon particle array (at the bin level only) -//#ifdef SMILEI_OPENACC_MODE +//#ifdef SMILEI_ACCELERATOR_GPU_OACC // #pragma acc loop seq //#endif for( int ipart=last_moving_index-1 ; ipart>=*first_index; ipart-- ) { @@ -2721,9 +2700,9 @@ void Species::removeTaggedParticles( } } // if nparts > 0 +} #endif -} // ------------------------------------------------ // Set position when using restart & moving window diff --git a/src/Species/Species.h b/src/Species/Species.h index 56c693d65..d4af3bf9d 100755 --- a/src/Species/Species.h +++ b/src/Species/Species.h @@ -6,7 +6,7 @@ // #include "PyTools.h" #include "Particles.h" -#ifdef SMILEI_OPENACC_MODE +#ifdef SMILEI_ACCELERATOR_GPU_OACC #include "nvidiaParticles.h" #endif #include "Params.h" @@ -147,8 +147,6 @@ class Species //! Vector containing all Particles of the considered Species Particles *particles; - //! Data structure through which passes particles which move from one patch to another - Particles *particles_to_move; Particles particles_sorted[2]; //std::vector index_of_particles_to_exchange; @@ -344,7 +342,7 @@ class Species // ----------------------------------------------------------------------------- // 5. 
Methods - virtual void initCluster( Params & ); + virtual void initCluster( Params &, Patch * ); virtual void resizeCluster( Params & ); @@ -384,7 +382,9 @@ class Species return particles->capacity(); } -#if defined( SMILEI_ACCELERATOR_MODE ) +#if defined( SMILEI_ACCELERATOR_GPU ) + + void allocateParticlesOnDevice(); //! Copy particles from host to device void @@ -482,12 +482,6 @@ class Species //! Method calculating the Particle charge on the grid (projection) virtual void computeCharge( ElectroMagn *EMfields, bool old=false ); - //! Method used to select particles which will change of patches - virtual void extractParticles(); - - //! Method used to integrate particles which come from another patches - // virtual void injectParticles( Params ¶ms ); - //! Method used to inject and sort particles virtual void sortParticles( Params ¶m ); @@ -572,12 +566,14 @@ class Species //! This method removes particles with a negative weight //! when a single bin is used +#ifdef SMILEI_ACCELERATOR_GPU_OACC void removeTaggedParticles( SmileiMPI *smpi, int *const first_index, int *const last_index, int ithread, bool compute_cell_keys = false); +#endif //! Moving window boundary conditions managment void disableXmax(); diff --git a/src/Species/SpeciesV.cpp b/src/Species/SpeciesV.cpp index 98d5d9dbb..4a4199b63 100755 --- a/src/Species/SpeciesV.cpp +++ b/src/Species/SpeciesV.cpp @@ -46,7 +46,7 @@ using namespace std; SpeciesV::SpeciesV( Params ¶ms, Patch *patch ) : Species( params, patch ) { - initCluster( params ); + initCluster( params, patch ); npack_ = 0 ; packsize_ = 0; @@ -106,7 +106,7 @@ SpeciesV::~SpeciesV() } -void SpeciesV::initCluster( Params ¶ms ) +void SpeciesV::initCluster( Params ¶ms, Patch *patch ) { int ncells = 1; for( unsigned int iDim=0 ; iDimfirst_index[ipack*packsize_+scell] ; iPartlast_index[ipack*packsize_+scell]; iPart++ ) { - // if ( particles->cell_keys[iPart] != -1 ) { + // if ( particles->cell_keys[iPart] >= 0 ) { // //Compute cell_keys of remaining particles // for( unsigned int i = 0 ; icell_keys[iPart] *= length_[i]; @@ -552,7 +552,7 @@ void SpeciesV::dynamics( double time_dual, unsigned int ispec, // if( mass_>0 ) { // for( iPart=particles->first_index[ipack*packsize_+scell] ; iPartlast_index[ipack*packsize_+scell]; iPart++ ) { - // if ( particles->cell_keys[iPart] != -1 ) { + // if ( particles->cell_keys[iPart] >= 0 ) { // //Compute cell_keys of remaining particles // for( unsigned int i = 0 ; icell_keys[iPart] *= this->length_[i]; @@ -564,7 +564,7 @@ void SpeciesV::dynamics( double time_dual, unsigned int ispec, // } // for( iPart=particles->first_index[ipack*packsize_+scell] ; iPartlast_index[ipack*packsize_+scell]; iPart++ ) { - // if ( particles->cell_keys[iPart] != -1 ) { + // if ( particles->cell_keys[iPart] >= 0 ) { // //Compute cell_keys of remaining particles // for( unsigned int i = 0 ; icell_keys[iPart] *= this->length_[i]; @@ -1053,7 +1053,7 @@ void SpeciesV::dynamicsTasks( double time_dual, unsigned int ispec, if( mass_>0 ) { for( int scell = first_cell_of_bin[ibin] ; scell <= last_cell_of_bin[ibin] ; scell++ ) { for( int iPart=particles->first_index[ipack*packsize_+scell] ; ( int )iPartlast_index[ipack*packsize_+scell]; iPart++ ) { - if ( particles->cell_keys[iPart] != -1 ) { + if ( particles->cell_keys[iPart] >= 0 ) { //Compute cell_keys of remaining particles for( unsigned int i = 0 ; icell_keys[iPart] *= this->length_[i]; @@ -1067,7 +1067,7 @@ void SpeciesV::dynamicsTasks( double time_dual, unsigned int ispec, } else if( mass_==0 ) { for( int scell = 
first_cell_of_bin[ibin] ; scell <= last_cell_of_bin[ibin] ; scell++ ) { for( int iPart=particles->first_index[scell] ; ( int )iPartlast_index[scell]; iPart++ ) { - if ( particles->cell_keys[iPart] != -1 ) { + if ( particles->cell_keys[iPart] >= 0 ) { //Compute cell_keys of remaining particles for( unsigned int i = 0 ; icell_keys[iPart] *= length[i]; @@ -1366,27 +1366,27 @@ void SpeciesV::sortParticles( Params ¶ms ) //Loop over just arrived particles to compute their cell keys and contribution to count for( unsigned int idim=0; idim < nDim_field ; idim++ ) { for( unsigned int ineighbor=0 ; ineighbor < 2 ; ineighbor++ ) { - buf_cell_keys[idim][ineighbor].resize( MPI_buffer_.part_index_recv_sz[idim][ineighbor] ); + buf_cell_keys[idim][ineighbor].resize( MPI_buffer_.partRecv[idim][ineighbor]->size() ); // #pragma omp simd - // for( unsigned int ip=0; ip < MPI_buffer_.part_index_recv_sz[idim][ineighbor]; ip++ ) { + // for( unsigned int ip=0; ip < MPI_buffer_.partRecv[idim][ineighbor]->size(); ip++ ) { // for( unsigned int ipos=0; ipos < nDim_field ; ipos++ ) { - // double X = ((this)->*(distance[ipos]))(&MPI_buffer_.partRecv[idim][ineighbor], ipos, ip); + // double X = ((this)->*(distance[ipos]))(MPI_buffer_.partRecv[idim][ineighbor], ipos, ip); // int IX = round( X * dx_inv_[ipos] ); // buf_cell_keys[idim][ineighbor][ip] = buf_cell_keys[idim][ineighbor][ip] * length_[ipos] + IX; // } // } // // not vectorizable because random access to count - // for( unsigned int ip=0; ip < MPI_buffer_.part_index_recv_sz[idim][ineighbor]; ip++ ) { + // for( unsigned int ip=0; ip < MPI_buffer_.partRecv[idim][ineighbor]->size(); ip++ ) { // count[buf_cell_keys[idim][ineighbor][ip]] ++; // } computeParticleCellKeys( params, - &MPI_buffer_.partRecv[idim][ineighbor], + MPI_buffer_.partRecv[idim][ineighbor], &buf_cell_keys[idim][ineighbor][0], &count[0], 0, - MPI_buffer_.part_index_recv_sz[idim][ineighbor] ); + MPI_buffer_.partRecv[idim][ineighbor]->size() ); } } @@ -1403,8 +1403,8 @@ void SpeciesV::sortParticles( Params ¶ms ) //Now proceed to the cycle sort - if( MPI_buffer_.partRecv[0][0].size() == 0 ) { - MPI_buffer_.partRecv[0][0].initialize( 0, *particles ); //Is this correct ? + if( MPI_buffer_.partRecv[0][0]->size() == 0 ) { + MPI_buffer_.partRecv[0][0]->initialize( 0, *particles ); //Is this correct ? } // Resize the particle vector @@ -1418,7 +1418,7 @@ void SpeciesV::sortParticles( Params ¶ms ) //Copy all particles from MPI buffers back to the writable particles via cycle sort pass. for( unsigned int idim=0; idim < nDim_field ; idim++ ) { for( unsigned int ineighbor=0 ; ineighbor < 2 ; ineighbor++ ) { - for( unsigned int ip=0; ip < MPI_buffer_.part_index_recv_sz[idim][ineighbor]; ip++ ) { + for( unsigned int ip=0; ip < MPI_buffer_.partRecv[idim][ineighbor]->size(); ip++ ) { cycle.resize( 1 ); cell_target = buf_cell_keys[idim][ineighbor][ip]; ip_dest = particles->first_index[cell_target]; @@ -1429,7 +1429,7 @@ void SpeciesV::sortParticles( Params ¶ms ) cycle[0] = ip_dest; cell_target = particles->cell_keys[ip_dest]; //As long as the particle is not erased, we can build up the cycle. - while( cell_target != -1 ) { + while( cell_target >= 0 ) { ip_dest = particles->first_index[cell_target]; while( particles->cell_keys[ip_dest] == cell_target ) { ip_dest++; @@ -1441,7 +1441,7 @@ void SpeciesV::sortParticles( Params ¶ms ) //Last target_cell is -1, the particle must be erased: particles->translateParticles( cycle ); //Eventually copy particle from the MPI buffer into the particle vector. 
- MPI_buffer_.partRecv[idim][ineighbor].overwriteParticle( ip, *particles, cycle[0] ); + MPI_buffer_.partRecv[idim][ineighbor]->overwriteParticle( ip, *particles, cycle[0] ); } } } @@ -1450,14 +1450,14 @@ void SpeciesV::sortParticles( Params ¶ms ) for( unsigned int ip=( unsigned int )particles->last_index.back(); ip < npart; ip++ ) { cell_target = particles->cell_keys[ip]; - if( cell_target == -1 ) { + if( cell_target < 0 ) { continue; } cycle.resize( 0 ); cycle.push_back( ip ); //As long as the particle is not erased, we can build up the cycle. - while( cell_target != -1 ) { + while( cell_target >= 0 ) { ip_dest = particles->first_index[cell_target]; @@ -1533,7 +1533,7 @@ void SpeciesV::computeParticleCellKeys( Params & params, #pragma omp simd for( iPart=istart; iPart < iend ; iPart++ ) { - if ( cell_keys[iPart] != -1 ) { + if ( cell_keys[iPart] >= 0 ) { //Compute cell_keys particles cell_keys[iPart] = std::round( position_x[iPart] * dx_inv_[0]) - min_loc_l ; cell_keys[iPart] *= length_[1]; @@ -1553,7 +1553,7 @@ void SpeciesV::computeParticleCellKeys( Params & params, #pragma omp simd for( iPart=istart; iPart < iend ; iPart++ ) { - if ( cell_keys[iPart] != -1 ) { + if ( cell_keys[iPart] >= 0 ) { //Compute cell_keys of remaining particles cell_keys[iPart] = std::round(position_x[iPart] * dx_inv_[0] )- min_loc_x ; cell_keys[iPart] *= length_[1]; @@ -1573,7 +1573,7 @@ void SpeciesV::computeParticleCellKeys( Params & params, #pragma omp simd for( iPart=istart; iPart < iend ; iPart++ ) { - if ( cell_keys[iPart] != -1 ) { + if ( cell_keys[iPart] >= 0 ) { //Compute cell_keys of remaining particles cell_keys[iPart] = std::round(position_x[iPart] * dx_inv_[0] )- min_loc_x ; cell_keys[iPart] *= length_[1]; @@ -1589,7 +1589,7 @@ void SpeciesV::computeParticleCellKeys( Params & params, #pragma omp simd for( iPart=istart; iPart < iend ; iPart++ ) { - if ( cell_keys[iPart] != -1 ) { + if ( cell_keys[iPart] >= 0 ) { //Compute cell_keys of remaining particles cell_keys[iPart] = round(position_x[iPart] * dx_inv_[0] )- min_loc_x ; } @@ -1598,7 +1598,7 @@ void SpeciesV::computeParticleCellKeys( Params & params, } for( iPart=istart; iPart < iend ; iPart++ ) { - if ( cell_keys[iPart] != -1 ) { + if ( cell_keys[iPart] >= 0 ) { count[cell_keys[iPart]] ++; } } @@ -2526,7 +2526,7 @@ void SpeciesV::ponderomotiveUpdatePositionAndCurrentsTasks( double time_dual, un smpi->traceEventIfDiagTracing(diag_PartEventTracing, Tools::getOMPThreadNum(),0,11); for( int iPart=particles->first_index[scell] ; iPartlast_index[scell]; iPart++ ) { - if ( particles->cell_keys[iPart] != -1 ) { + if ( particles->cell_keys[iPart] >= 0 ) { //First reduction of the count sort algorithm. Lost particles are not included. for( int i = 0 ; i<( int )nDim_field; i++ ) { particles->cell_keys[iPart] *= length_[i]; diff --git a/src/Species/SpeciesV.h b/src/Species/SpeciesV.h index 39dc45089..7f5fe587c 100755 --- a/src/Species/SpeciesV.h +++ b/src/Species/SpeciesV.h @@ -26,7 +26,7 @@ class SpeciesV : public Species //! Species destructor virtual ~SpeciesV(); - void initCluster( Params ¶ms ) override; + void initCluster( Params ¶ms, Patch *patch ) override; //! 
Method calculating the Particle dynamics (interpolation, pusher, projection) void dynamics( double time, unsigned int ispec, diff --git a/src/Species/SpeciesVAdaptive.cpp b/src/Species/SpeciesVAdaptive.cpp index b24d86711..273362561 100755 --- a/src/Species/SpeciesVAdaptive.cpp +++ b/src/Species/SpeciesVAdaptive.cpp @@ -46,7 +46,7 @@ using namespace std; SpeciesVAdaptive::SpeciesVAdaptive( Params ¶ms, Patch *patch ) : SpeciesV( params, patch ) { - initCluster( params ); + initCluster( params, patch ); npack_ = 0 ; packsize_ = 0; }//END SpeciesVAdaptive creator @@ -275,7 +275,7 @@ void SpeciesVAdaptive::scalarDynamics( double time_dual, unsigned int ispec, // if( mass_>0 ) { // // for( iPart=particles->first_index[scell] ; ( int )iPartlast_index[scell]; iPart++ ) { - // if ( particles->cell_keys[iPart] != -1 ) { + // if ( particles->cell_keys[iPart] >= 0 ) { // //Compute cell_keys of remaining particles // for( unsigned int i = 0 ; icell_keys[iPart] *= this->length_[i]; @@ -289,7 +289,7 @@ void SpeciesVAdaptive::scalarDynamics( double time_dual, unsigned int ispec, // } else if( mass_==0 ) { // // for( iPart=particles->first_index[scell] ; ( int )iPartlast_index[scell]; iPart++ ) { - // if ( particles->cell_keys[iPart] != -1 ) { + // if ( particles->cell_keys[iPart] >= 0 ) { // //Compute cell_keys of remaining particles // for( unsigned int i = 0 ; icell_keys[iPart] *= this->length_[i]; @@ -754,7 +754,7 @@ void SpeciesVAdaptive::scalarDynamicsTasks( double time_dual, unsigned int ispec if( mass_>0 ) { for( int iPart=particles->first_index[ipack*packsize_+scell] ; ( int )iPartlast_index[ipack*packsize_+scell]; iPart++ ) { - if ( particles->cell_keys[iPart] != -1 ) { + if ( particles->cell_keys[iPart] >= 0 ) { //Compute cell_keys of remaining particles for( unsigned int i = 0 ; icell_keys[iPart] *= this->length_[i]; @@ -768,7 +768,7 @@ void SpeciesVAdaptive::scalarDynamicsTasks( double time_dual, unsigned int ispec } else if( mass_==0 ) { for( int iPart=particles->first_index[scell] ; ( int )iPartlast_index[scell]; iPart++ ) { - if ( particles->cell_keys[iPart] != -1 ) { + if ( particles->cell_keys[iPart] >= 0 ) { //Compute cell_keys of remaining particles for( unsigned int i = 0 ; icell_keys[iPart] *= length[i]; @@ -1662,7 +1662,7 @@ void SpeciesVAdaptive::scalarPonderomotiveUpdatePositionAndCurrentsTasks( double smpi->traceEventIfDiagTracing(diag_PartEventTracing, Tools::getOMPThreadNum(),0,11); for( int iPart=particles->first_index[first_cell_of_bin[ibin]] ; iPartlast_index[last_cell_of_bin[ibin]]; iPart++ ) { - if ( particles->cell_keys[iPart] != -1 ) { + if ( particles->cell_keys[iPart] >= 0 ) { //First reduction of the count sort algorithm. Lost particles are not included. 
                     for( int i = 0 ; i<( int )nDim_field; i++ ) {
                         particles->cell_keys[iPart] *= length_[i];
diff --git a/src/Species/SpeciesVAdaptiveMixedSort.cpp b/src/Species/SpeciesVAdaptiveMixedSort.cpp
index cc809d8c3..1889f0cd8 100755
--- a/src/Species/SpeciesVAdaptiveMixedSort.cpp
+++ b/src/Species/SpeciesVAdaptiveMixedSort.cpp
@@ -46,7 +46,7 @@ using namespace std;
 SpeciesVAdaptiveMixedSort::SpeciesVAdaptiveMixedSort( Params &params, Patch *patch ) :
     SpeciesV( params, patch )
 {
-    initCluster( params );
+    initCluster( params, patch );
     npack_ = 0 ;
     packsize_ = 0;
diff --git a/src/Tools/Pragma.h b/src/Tools/Pragma.h
index b1a81cdae..0fb5e1e9d 100644
--- a/src/Tools/Pragma.h
+++ b/src/Tools/Pragma.h
@@ -31,7 +31,7 @@
 #if defined ( SMILEI_ACCELERATOR_GPU_OMP )
     #define ATOMIC(mode) \
         _Pragma( TOSTRING(omp atomic mode))
-#elif defined ( SMILEI_OPENACC_MODE )
+#elif defined ( SMILEI_ACCELERATOR_GPU_OACC )
     #define ATOMIC(mode) \
         _Pragma( TOSTRING(acc atomic mode))
 #endif
diff --git a/src/Tools/Timers.cpp b/src/Tools/Timers.cpp
index 0cd6dac0c..d3edda0e4 100755
--- a/src/Tools/Timers.cpp
+++ b/src/Tools/Timers.cpp
@@ -18,7 +18,7 @@ Timers::Timers( SmileiMPI *smpi ) :
     collisions( "Collisions" ), // Call to Collisions methods
     movWindow( "Mov window" ), // Moving Window
     loadBal( "Load balancing" ), // Load balancing
-    syncPart( "Sync Particles" ), // Call exchangeParticles (MPI & Patch sync)
+    syncPart( "Sync Particles" ), // Call initExchParticles (MPI & Patch sync)
     syncField( "Sync Fields" ), // Call sumRhoJ(s), exchangeB (MPI & Patch sync)
     syncDens( "Sync Densities" ), // If necessary the following timers can be reintroduced
     particleMerging( "Part Merging" ), // Particle merging
diff --git a/src/Tools/gpu.cpp b/src/Tools/gpu.cpp
index 7ce000e03..497786096 100644
--- a/src/Tools/gpu.cpp
+++ b/src/Tools/gpu.cpp
@@ -1,6 +1,6 @@
 #include "gpu.h"
 
-#if defined( SMILEI_ACCELERATOR_GPU_OMP ) && defined( SMILEI_OPENACC_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU_OMP ) && defined( SMILEI_ACCELERATOR_GPU_OACC )
 #error "You can not enable both OpenACC and OpenMP GPU support"
 #endif
@@ -29,7 +29,7 @@
     #else
         #error "Asking for OpenMP support without enabling compiler support for OpenMP"
     #endif
-#elif defined( SMILEI_OPENACC_MODE )
+#elif defined( SMILEI_ACCELERATOR_GPU_OACC )
     #if defined( _OPENACC )
         #include
     #else
@@ -46,11 +46,12 @@ namespace smilei {
 #if defined( SMILEI_ACCELERATOR_GPU_OMP )
     #pragma omp target enter data map( alloc \
                                        : byte_array [0:a_count * an_object_size] )
-#elif defined( SMILEI_OPENACC_MODE )
+#elif defined( SMILEI_ACCELERATOR_GPU_OACC )
     #pragma acc enter data create( byte_array [0:a_count * an_object_size] )
 #else
     SMILEI_UNUSED( a_host_pointer );
     SMILEI_UNUSED( a_count );
+    SMILEI_UNUSED( an_object_size );
     SMILEI_UNUSED( byte_array );
 #endif
 }
@@ -61,11 +62,12 @@ namespace smilei {
 #if defined( SMILEI_ACCELERATOR_GPU_OMP )
     #pragma omp target enter data map( to \
                                        : byte_array [0:a_count * an_object_size] )
-#elif defined( SMILEI_OPENACC_MODE )
+#elif defined( SMILEI_ACCELERATOR_GPU_OACC )
     #pragma acc enter data copyin( byte_array [0:a_count * an_object_size] )
 #else
     SMILEI_UNUSED( a_host_pointer );
     SMILEI_UNUSED( a_count );
+    SMILEI_UNUSED( an_object_size );
     SMILEI_UNUSED( byte_array );
 #endif
 }
@@ -75,11 +77,12 @@ namespace smilei {
     const unsigned char* byte_array = static_cast( a_host_pointer );
 #if defined( SMILEI_ACCELERATOR_GPU_OMP )
     #pragma omp target update to( byte_array [0:a_count * an_object_size] )
-#elif defined( SMILEI_OPENACC_MODE )
+#elif defined( SMILEI_ACCELERATOR_GPU_OACC )
     #pragma acc update device( byte_array [0:a_count * an_object_size] )
 #else
     SMILEI_UNUSED( a_host_pointer );
     SMILEI_UNUSED( a_count );
+    SMILEI_UNUSED( an_object_size );
     SMILEI_UNUSED( byte_array );
 #endif
 }
@@ -89,11 +92,12 @@ namespace smilei {
     unsigned char* byte_array = static_cast( a_host_pointer );
 #if defined( SMILEI_ACCELERATOR_GPU_OMP )
     #pragma omp target update from( byte_array [0:a_count * an_object_size] )
-#elif defined( SMILEI_OPENACC_MODE )
+#elif defined( SMILEI_ACCELERATOR_GPU_OACC )
     #pragma acc update host( byte_array [0:a_count * an_object_size] )
 #else
     SMILEI_UNUSED( a_host_pointer );
     SMILEI_UNUSED( a_count );
+    SMILEI_UNUSED( an_object_size );
     SMILEI_UNUSED( byte_array );
 #endif
 }
@@ -104,11 +108,12 @@ namespace smilei {
 #if defined( SMILEI_ACCELERATOR_GPU_OMP )
     #pragma omp target exit data map( from \
                                       : byte_array [0:a_count * an_object_size] )
-#elif defined( SMILEI_OPENACC_MODE )
+#elif defined( SMILEI_ACCELERATOR_GPU_OACC )
     #pragma acc exit data copyout( byte_array [0:a_count * an_object_size] )
 #else
     SMILEI_UNUSED( a_host_pointer );
     SMILEI_UNUSED( a_count );
+    SMILEI_UNUSED( an_object_size );
     SMILEI_UNUSED( byte_array );
 #endif
 }
@@ -119,11 +124,12 @@ namespace smilei {
 #if defined( SMILEI_ACCELERATOR_GPU_OMP )
     #pragma omp target exit data map( delete \
                                       : byte_array [0:a_count * an_object_size] )
-#elif defined( SMILEI_OPENACC_MODE )
+#elif defined( SMILEI_ACCELERATOR_GPU_OACC )
     #pragma acc exit data delete( byte_array [0:a_count * an_object_size] )
 #else
     SMILEI_UNUSED( a_host_pointer );
     SMILEI_UNUSED( a_count );
+    SMILEI_UNUSED( an_object_size );
     SMILEI_UNUSED( byte_array );
 #endif
 }
@@ -154,7 +160,7 @@ namespace smilei {
     SMILEI_ASSERT( a_device_pointer != nullptr );
     return const_cast( a_device_pointer );
-#elif defined( SMILEI_OPENACC_MODE )
+#elif defined( SMILEI_ACCELERATOR_GPU_OACC )
     //return const_cast( ::acc_deviceptr( a_host_pointer ) );
     return ::acc_deviceptr( const_cast(a_host_pointer) ) ;
 #else
@@ -171,7 +177,7 @@ namespace smilei {
                             a_count * an_object_size, 0, 0, device_num, device_num ) != 0 ) {
         ERROR( "omp_target_memcpy failed" );
     }
-#elif defined( SMILEI_OPENACC_MODE )
+#elif defined( SMILEI_ACCELERATOR_GPU_OACC )
     // It seems that the interface of ::acc_memcpy_device does not accept ptr to array of const type !
     // https://www.openacc.org/sites/default/files/inline-files/OpenACC.2.7.pdf
     // void acc_memcpy_device( d_void* dest, d_void* src, size_t bytes );
diff --git a/src/Tools/gpu.h b/src/Tools/gpu.h
index 28a8c98da..bb8e6c472 100644
--- a/src/Tools/gpu.h
+++ b/src/Tools/gpu.h
@@ -19,10 +19,14 @@ namespace smilei {
     #define SMILEI_ACCELERATOR_DECLARE_ROUTINE _Pragma( "omp declare target" )
     #define SMILEI_ACCELERATOR_DECLARE_ROUTINE_END _Pragma( "omp end declare target" )
     #define SMILEI_ACCELERATOR_ATOMIC _Pragma( "omp atomic update" )
-#elif defined( SMILEI_OPENACC_MODE )
+    #define SMILEI_ACCELERATOR_ASYNC_POLYCY thrust::hip::par_nosync
+    #define SMILEI_ACCELERATOR_DEVICE_SYNC() hipDeviceSynchronize()
+#elif defined( SMILEI_ACCELERATOR_GPU_OACC )
     #define SMILEI_ACCELERATOR_DECLARE_ROUTINE _Pragma( "acc routine seq" )
     #define SMILEI_ACCELERATOR_DECLARE_ROUTINE_END
     #define SMILEI_ACCELERATOR_ATOMIC _Pragma( "acc atomic" )
+    #define SMILEI_ACCELERATOR_ASYNC_POLYCY thrust::cuda::par_nosync
+    #define SMILEI_ACCELERATOR_DEVICE_SYNC() cudaDeviceSynchronize()
 #else
     #define SMILEI_ACCELERATOR_DECLARE_ROUTINE
     #define SMILEI_ACCELERATOR_DECLARE_ROUTINE_END
diff --git a/src/Tools/gpuRandom.h b/src/Tools/gpuRandom.h
index 916a7b8f8..bdb9aca59 100644
--- a/src/Tools/gpuRandom.h
+++ b/src/Tools/gpuRandom.h
@@ -1,7 +1,7 @@
 #ifndef GPU_RANDOM
 #define GPU_RANDOM
 
-#if defined( SMILEI_OPENACC_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU_OACC )
     // #include
     #include "curand_kernel.h"
 #elif defined( SMILEI_ACCELERATOR_GPU_OMP )
@@ -29,7 +29,7 @@ namespace smilei {
     {
     protected:
         using State =
-#if defined( SMILEI_OPENACC_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU_OACC )
             ::curandState_t;
 #elif defined( SMILEI_ACCELERATOR_GPU_OMP )
             // TODO
@@ -42,7 +42,7 @@
     public:
         Random()
-#if defined( SMILEI_OPENACC_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU_OACC )
 #elif defined( SMILEI_ACCELERATOR_GPU_OMP )
             : a_state_{ 0xDEADBEEFU }
 #else
@@ -53,26 +53,36 @@
         }
 
         // Initialization
+#if defined( SMILEI_ACCELERATOR_GPU_OACC )
         void init( unsigned long long seed,
                    unsigned long long seq,
                    unsigned long long offset )
         {
-#if defined( SMILEI_OPENACC_MODE )
             // Cuda generator initialization
             ::curand_init( seed, seq, offset, &a_state_ );
+        }
 #elif defined( SMILEI_ACCELERATOR_GPU_OMP )
+        void init( unsigned long long seed,
+                   unsigned long long ,
+                   unsigned long long )
+        {
             // Hip generator initialization
             // ::hiprand_init( seed, seq, offset, &state );
             a_state_ = State{ static_cast( seed ) };
+        }
 #else
+        void init( unsigned long long seed,
+                   unsigned long long ,
+                   unsigned long long )
+        {
             a_state_ = State{ static_cast( seed ) };
-#endif
         }
+#endif
 
         // Initialization
         double uniform()
         {
-#if defined( SMILEI_OPENACC_MODE )
+#if defined( SMILEI_ACCELERATOR_GPU_OACC )
             return ::curand_uniform( &a_state_ );
 #elif defined( SMILEI_ACCELERATOR_GPU_OMP )
             // TODO
diff --git a/src/Tools/userFunctions.h b/src/Tools/userFunctions.h
index 63753fb20..d9525723d 100755
--- a/src/Tools/userFunctions.h
+++ b/src/Tools/userFunctions.h
@@ -1,5 +1,5 @@
 
-#ifdef SMILEI_OPENACC_MODE
+#ifdef SMILEI_ACCELERATOR_GPU_OACC
     #include
 #endif
 
@@ -36,7 +36,7 @@ class userFunctions
     //! \param array array in which to find the value
     //! \param elem element to be found
     //! \param nb_elem number of elements
-#ifdef SMILEI_OPENACC_MODE
+#ifdef SMILEI_ACCELERATOR_GPU_OACC
     #pragma acc routine seq
 #endif
     template
diff --git a/validation/analyses/validate_tst2d_v_o2_qed_cascade_vranic_cartesian.py b/validation/analyses/validate_tst2d_v_o2_qed_cascade_vranic_cartesian.py
index ee807d65b..8d5b8ddb1 100644
--- a/validation/analyses/validate_tst2d_v_o2_qed_cascade_vranic_cartesian.py
+++ b/validation/analyses/validate_tst2d_v_o2_qed_cascade_vranic_cartesian.py
@@ -247,7 +247,7 @@ def adaptive_error(values, statistics, thresholds):
 
 thresholds = {}
 thresholds["points"] = np.array([0. ,10 ,100,1000])
-thresholds["factor"] = np.array([1e9, 1.,0.5, 0.2])
+thresholds["factor"] = np.array([1e9, 1.,0.7, 0.2])
 
 Validate("Average gamma for the electrons vs time", average_gamma["electron"], adaptive_error(average_gamma["electron"], Nelectron, thresholds))
 Validate("Average gamma for the positrons vs time", average_gamma["positron"], adaptive_error(average_gamma["positron"], Npositron, thresholds))
diff --git a/validation/references/tst2d_04_laser_wake.py.txt b/validation/references/tst2d_04_laser_wake.py.txt
index 48d9eaeca..094e7c366 100755
Binary files a/validation/references/tst2d_04_laser_wake.py.txt and b/validation/references/tst2d_04_laser_wake.py.txt differ