diff --git a/tardis/spectrum/formal_integral_cuda.py b/tardis/spectrum/formal_integral_cuda.py index b1d7f38b93c..78e5e6fad2e 100644 --- a/tardis/spectrum/formal_integral_cuda.py +++ b/tardis/spectrum/formal_integral_cuda.py @@ -12,6 +12,28 @@ H_CGS = 6.62606957e-27 +@cuda.jit +def cuda_vector_integrator(L, I_nu, N, R_max): + """ + The CUDA Vectorized integrator over second axis + + Parameters + ---------- + L : array(float64, 1d, C) + Output Array + I_nu : array(floagt64, 2d, C) + Input Array + N : int64 + R_max : float64 + + """ + + nu_idx = cuda.grid(1) + L[nu_idx] = ( + 8 * M_PI * M_PI * trapezoid_integration_cuda(I_nu[nu_idx], R_max / N) + ) + + @cuda.jit def cuda_formal_integral( r_inner, @@ -27,7 +49,6 @@ def cuda_formal_integral( tau_sobolev, electron_density, N, - L, pp, exp_tau, I_nu, @@ -41,44 +62,59 @@ def cuda_formal_integral( Parameters ---------- r_inner : array(float64, 1d, C) - self.geometry.r_inner + inner radius of each shell r_outer : array(float64, 1d, C) - self.geometry.r_outer + outer radius of each shell time_explosion: float64 - self.geometry.time_explosion + geometrical explosion time line_list_nu : array(float64, 1d, A) - self.plasma.line_list_nu + List of line transition frequencies iT : np.float64 + interpolated temperture in cgs units inu : np.float64 + interpolated frequencies in cgs units inu_size : int64 + size of inu array att_S_ul : array(float64, 1d, C) + attentuated source function Jred_lu : array(float64, 1d, C) + J estimator from red end of the line from lower to upper level Jblue_lu : array(float64, 1d, C) + J estimator from blue end of the line from lower to upper level tau_sobolev : array(float64, 2d, C) + Sobolev Optical depth for each line in each shell electron_density : array(float64, 1d, C) + electron density in each shell N : int64 + Number of impact parameter values (p) L : array(float64, 1d, C) + Luminosity density at each frequency This is where the results will be stored pp : array(float64, 1d, C) + Impact parameter arrays exp_tau : array(float64, 1d, C) + $\exp{-tau}$ array to speed up computation I_nu array(floatt64, 2d, C) + Radiative intensity per unit frequency per impact parameter z : array(float64, 2d, C) + Ray intersections with the shells shell_id : array(int64, 2d, C) + List of shells for each thread """ - # todo: add all the original todos - # global read-only values size_line, size_shell = tau_sobolev.shape - size_tau = size_line * size_shell R_ph = r_inner[0] # make sure these are cgs - R_max = r_outer[size_shell - 1] - nu_idx = cuda.grid(1) + nu_idx, p_idx = cuda.grid(2) # 2D Cuda Grid, nu x p + p_idx += 1 # Because the iteration starts at 1 # Check to see if CUDA is out of bounds if nu_idx >= inu_size: return + if p_idx >= N: + return + # These all have their own version for each thread to avoid race conditions I_nu_thread = I_nu[nu_idx] z_thread = z[nu_idx] @@ -106,107 +142,105 @@ def cuda_formal_integral( pline = 0 nu = inu[nu_idx] + escat_contrib = 0.0 + p = pp[p_idx] + + # initialize z intersections for p values + size_z = populate_z_cuda( + r_inner, r_outer, time_explosion, p, z_thread, shell_id_thread + ) + if p <= R_ph: + I_nu_thread[p_idx] = intensity_black_body_cuda(nu * z_thread[0], iT) + else: + I_nu_thread[p_idx] = 0 + nu_start = nu * z_thread[0] + nu_end = nu * z_thread[1] + + idx_nu_start = line_search_cuda(line_list_nu, nu_start, size_line) + offset = shell_id_thread[0] * size_line + # start tracking accumulated e-scattering optical depth + zstart = time_explosion / C_INV * (1.0 - z_thread[0]) + # Initialize "pointers" + pline = int(idx_nu_start) + pexp_tau = int(offset + idx_nu_start) + patt_S_ul = int(offset + idx_nu_start) + pJred_lu = int(offset + idx_nu_start) + pJblue_lu = int(offset + idx_nu_start) + + # loop over all interactions + for i in range(size_z - 1): + escat_op = electron_density[int(shell_id_thread[i])] * SIGMA_THOMSON + nu_end = ( + nu * z_thread[i + 1] + ) # +1 is the offset as the original is from z[1:] + + nu_end_idx = line_search_cuda(line_list_nu, nu_end, len(line_list_nu)) + zend = time_explosion / C_INV * (1.0 - line_list_nu[pline] / nu) + escat_contrib += ( + (zend - zstart) + * escat_op + * (Jblue_lu[pJblue_lu] - I_nu_thread[p_idx]) + ) + pJred_lu += 1 + I_nu_thread[p_idx] += escat_contrib + # // Lucy 1999, Eq 26 + I_nu_thread[p_idx] *= exp_tau[pexp_tau] + I_nu_thread[p_idx] += att_S_ul[patt_S_ul] - # now loop over discrete values along line - for p_idx in range(1, N): + # // reset e-scattering opacity escat_contrib = 0.0 - p = pp[p_idx] + zstart = zend - # initialize z intersections for p values - size_z = populate_z_cuda( - r_inner, r_outer, time_explosion, p, z_thread, shell_id_thread - ) - if p <= R_ph: - I_nu_thread[p_idx] = intensity_black_body_cuda(nu * z_thread[0], iT) - else: - I_nu_thread[p_idx] = 0 - nu_start = nu * z_thread[0] - nu_end = nu * z_thread[1] - - idx_nu_start = line_search_cuda(line_list_nu, nu_start, size_line) - offset = shell_id_thread[0] * size_line - # start tracking accumulated e-scattering optical depth - zstart = time_explosion / C_INV * (1.0 - z_thread[0]) - # Initialize "pointers" - pline = int(idx_nu_start) - pexp_tau = int(offset + idx_nu_start) - patt_S_ul = int(offset + idx_nu_start) - pJred_lu = int(offset + idx_nu_start) - pJblue_lu = int(offset + idx_nu_start) - - # flag for first contribution to integration on current p-ray - first = 1 - - # loop over all interactions - for i in range(size_z - 1): - escat_op = electron_density[int(shell_id_thread[i])] * SIGMA_THOMSON - nu_end = ( - nu * z_thread[i + 1] - ) # +1 is the offset as the original is from z[1:] - - nu_end_idx = line_search_cuda( - line_list_nu, nu_end, len(line_list_nu) - ) + pline += 1 + pexp_tau += 1 + patt_S_ul += 1 + pJblue_lu += 1 - for _ in range(max(nu_end_idx - pline, 0)): - # calculate e-scattering optical depth to next resonance point - zend = time_explosion / C_INV * (1.0 - line_list_nu[pline] / nu) - if first == 1: - # first contribution to integration - # NOTE: this treatment of I_nu_b (given - # by boundary conditions) is not in Lucy 1999; - # should be re-examined carefully - escat_contrib += ( - (zend - zstart) - * escat_op - * (Jblue_lu[pJblue_lu] - I_nu_thread[p_idx]) - ) - first = 0 - else: - # Account for e-scattering, c.f. Eqs 27, 28 in Lucy 1999 - Jkkp = 0.5 * (Jred_lu[pJred_lu] + Jblue_lu[pJblue_lu]) - escat_contrib += ( - (zend - zstart) * escat_op * (Jkkp - I_nu_thread[p_idx]) - ) - # this introduces the necessary offset of one element between - # pJblue_lu and pJred_lu - pJred_lu += 1 - I_nu_thread[p_idx] += escat_contrib - # // Lucy 1999, Eq 26 - I_nu_thread[p_idx] *= exp_tau[pexp_tau] - I_nu_thread[p_idx] += att_S_ul[patt_S_ul] - - # // reset e-scattering opacity - escat_contrib = 0.0 - zstart = zend - - pline += 1 - pexp_tau += 1 - patt_S_ul += 1 - pJblue_lu += 1 - - # calculate e-scattering optical depth to grid cell boundary + for _ in range(1, max(nu_end_idx - pline, 1)): + # calculate e-scattering optical depth to next resonance point + zend = time_explosion / C_INV * (1.0 - line_list_nu[pline] / nu) + # Account for e-scattering, c.f. Eqs 27, 28 in Lucy 1999 Jkkp = 0.5 * (Jred_lu[pJred_lu] + Jblue_lu[pJblue_lu]) - zend = time_explosion / C_INV * (1.0 - nu_end / nu) escat_contrib += ( (zend - zstart) * escat_op * (Jkkp - I_nu_thread[p_idx]) ) + # this introduces the necessary offset of one element between + # pJblue_lu and pJred_lu + pJred_lu += 1 + I_nu_thread[p_idx] += escat_contrib + # // Lucy 1999, Eq 26 + I_nu_thread[p_idx] *= exp_tau[pexp_tau] + I_nu_thread[p_idx] += att_S_ul[patt_S_ul] + + # // reset e-scattering opacity + escat_contrib = 0.0 zstart = zend - # advance pointers - direction = int( - (shell_id_thread[i + 1] - shell_id_thread[i]) * size_line - ) - pexp_tau += direction - patt_S_ul += direction - pJred_lu += direction - pJblue_lu += direction + pline += 1 + pexp_tau += 1 + patt_S_ul += 1 + pJblue_lu += 1 - I_nu_thread[p_idx] *= p - L[nu_idx] = ( - 8 * M_PI * M_PI * trapezoid_integration_cuda(I_nu_thread, R_max / N) - ) + # calculate e-scattering optical depth to grid cell boundary + + Jkkp = 0.5 * (Jred_lu[pJred_lu] + Jblue_lu[pJblue_lu]) + zend = time_explosion / C_INV * (1.0 - nu_end / nu) + escat_contrib += ( + (zend - zstart) * escat_op * (Jkkp - I_nu_thread[p_idx]) + ) + zstart = zend + + # advance pointers + direction = int( + (shell_id_thread[i + 1] - shell_id_thread[i]) * size_line + ) + pexp_tau += direction + patt_S_ul += direction + pJred_lu += direction + pJblue_lu += direction + + I_nu_thread[p_idx] *= p class CudaFormalIntegrator(object): @@ -236,7 +270,6 @@ def formal_integral( """ Simple wrapper for the CUDA implementation of the formal integral """ - L = np.zeros(inu_size, dtype=np.float64) # array(float64, 1d, C) # global read-only values size_line, size_shell = tau_sobolev.shape # int64, int64 size_tau = size_line * size_shell # int64 @@ -247,26 +280,51 @@ def formal_integral( pp[::] = calculate_p_values( self.geometry.r_outer[size_shell - 1], N ) # array(float64, 1d, C) - - I_nu = np.zeros( - (inu_size, N), dtype=np.float64 - ) # array(float64, 1d, C) z = np.zeros( (inu_size, 2 * size_shell), dtype=np.float64 ) # array(float64, 1d, C) shell_id = np.zeros( (inu_size, 2 * size_shell), dtype=np.int64 ) # array(int64, 1d, C) - THREADS_PER_BLOCK = 32 - blocks_per_grid = (inu_size // THREADS_PER_BLOCK) + 1 - cuda_formal_integral[blocks_per_grid, THREADS_PER_BLOCK]( - self.geometry.r_inner, - self.geometry.r_outer, + # These get separate names since they'll be copied back + # These are device objects stored on the GPU + # for the Luminosity density and Radiative intensity + d_L = cuda.device_array((inu_size,), dtype=np.float64) + d_I_nu = cuda.device_array((inu_size, N), dtype=np.float64) + + # Copy these arrays to the device, we don't need them again + # But they must be initialized with zeros + z = cuda.to_device(z) + shell_id = cuda.to_device(shell_id) + pp = cuda.to_device(pp) + exp_tau = cuda.to_device(exp_tau) + r_inner = cuda.to_device(self.geometry.r_inner) + r_outer = cuda.to_device(self.geometry.r_outer) + line_list_nu = cuda.to_device(self.plasma.line_list_nu) + inu = cuda.to_device(inu.value) + att_S_ul = cuda.to_device(att_S_ul) + Jred_lu = cuda.to_device(Jred_lu) + Jblue_lu = cuda.to_device(Jblue_lu) + tau_sobolev = cuda.to_device(tau_sobolev) + electron_density = cuda.to_device(electron_density) + + # Thread/Block Allocation, this seems to work + THREADS_PER_BLOCK_NU = 32 + THREADS_PER_BLOCK_P = 16 + blocks_per_grid_nu = (inu_size // THREADS_PER_BLOCK_NU) + 1 + blocks_per_grid_p = ((N - 1) // THREADS_PER_BLOCK_P) + 1 + + cuda_formal_integral[ + (blocks_per_grid_nu, blocks_per_grid_p), + (THREADS_PER_BLOCK_NU, THREADS_PER_BLOCK_P), + ]( + r_inner, + r_outer, self.time_explosion, - self.plasma.line_list_nu, + line_list_nu, iT.value, - inu.value, + inu, inu_size, att_S_ul, Jred_lu, @@ -274,14 +332,20 @@ def formal_integral( tau_sobolev, electron_density, N, - L, pp, exp_tau, - I_nu, + d_I_nu, z, shell_id, ) + R_max = self.geometry.r_outer[size_shell - 1] + cuda_vector_integrator[blocks_per_grid_nu, THREADS_PER_BLOCK_NU]( + d_L, d_I_nu, N, R_max + ) + L = d_L.copy_to_host() + I_nu = d_I_nu.copy_to_host() + return L, I_nu