Skip to content

Commit

Permalink
fix GPU memory address alignment errors, add comments/debug code
Browse files Browse the repository at this point in the history
  • Loading branch information
bwmeyers committed Dec 18, 2023
1 parent b87e165 commit 2a93cdf
Showing 1 changed file with 30 additions and 19 deletions.
49 changes: 30 additions & 19 deletions src/form_beam.cu
Original file line number Diff line number Diff line change
Expand Up @@ -255,19 +255,25 @@ __global__ void vmBeamform_kernel( cuDoubleComplex *Jv_Q,
int ant = threadIdx.x; /* The (ant)enna number */
int nant = blockDim.x; /* The (n)_umber of (ant)ennas */

/*// GPU profiling
clock_t start, stop;
double setup_t, detect_t, sum_t, stokes_t;
if ((p == 0) && (ant == 0) && (c == 0) && (s == 0)) start = clock();*/

// Organise dynamically allocated shared arrays (see tag 11NSTATION for kernel call)
extern __shared__ double arrays[];

cuDoubleComplex *ex = (cuDoubleComplex *)(&arrays[1*nant]);
cuDoubleComplex *ey = (cuDoubleComplex *)(&arrays[3*nant]);
cuDoubleComplex *Nxx = (cuDoubleComplex *)(&arrays[5*nant]);
cuDoubleComplex *Nxy = (cuDoubleComplex *)(&arrays[7*nant]);
cuDoubleComplex *Nyy = (cuDoubleComplex *)(&arrays[9*nant]);
// These SHOULD be *aligned* on integer numbers of 4-byte blocks.

// NOTE: Because these are complex-doubles, each takes up 2*sizeof(double), hence the stride of 2 here.
/* Given that we need to ensure access alignment on the 4-byte boundaries (CUDA requirement), we have
to make sure that the array access corresponds to an integer number of sizeof(double), which
means we need to use even indexes to access memory. This ensures for all `nant` that we access
within the memory boundaries. This StackOverflow post helped us figure this out:
https://stackoverflow.com/questions/70765553/cuda-shared-memory-alignement-in-documentation
(Previously, the indexes were: 1*nant, 3*nant, etc.)
*/
cuDoubleComplex *ex = (cuDoubleComplex *)(&arrays[0*nant]);
cuDoubleComplex *ey = (cuDoubleComplex *)(&arrays[2*nant]);
cuDoubleComplex *Nxx = (cuDoubleComplex *)(&arrays[4*nant]);
cuDoubleComplex *Nxy = (cuDoubleComplex *)(&arrays[6*nant]);
cuDoubleComplex *Nyy = (cuDoubleComplex *)(&arrays[8*nant]);
// (Nyx is not needed as it's degenerate with Nxy)

// Calculate the beam and the noise floor
Expand All @@ -278,25 +284,22 @@ __global__ void vmBeamform_kernel( cuDoubleComplex *Jv_Q,

ex[ant] = make_cuDoubleComplex( 0.0, 0.0 );
ey[ant] = make_cuDoubleComplex( 0.0, 0.0 );

Nxx[ant] = make_cuDoubleComplex( 0.0, 0.0 );
Nxy[ant] = make_cuDoubleComplex( 0.0, 0.0 );
//Nyx[ant] = make_cuDoubleComplex( 0.0, 0.0 );
Nyy[ant] = make_cuDoubleComplex( 0.0, 0.0 );
__syncthreads();

// Calculate beamform products for each antenna, and then add them together
// Calculate the coherent beam (B = J*phi*D)
ex[ant] = cuCmul( phi[PHI_IDX(p,ant,c,nant,nc)], Jv_Q[Jv_IDX(p,s,c,ant,ns,nc,nant)] );
ey[ant] = cuCmul( phi[PHI_IDX(p,ant,c,nant,nc)], Jv_P[Jv_IDX(p,s,c,ant,ns,nc,nant)] );

ex[ant] = cuCmul( phi[PHI_IDX(p,ant,c,nant,nc)], Jv_Q[Jv_IDX(p,s,c,ant,ns,nc,nant)] );
ey[ant] = cuCmul( phi[PHI_IDX(p,ant,c,nant,nc)], Jv_P[Jv_IDX(p,s,c,ant,ns,nc,nant)] );
Nxx[ant] = cuCmul( ex[ant], cuConj(ex[ant]) );
Nxy[ant] = cuCmul( ex[ant], cuConj(ey[ant]) );
Nyy[ant] = cuCmul( ey[ant], cuConj(ey[ant]) );
__syncthreads();

// Detect the coherent beam
// The safest, slowest option: Just get one thread to do it
__syncthreads();
if ( ant == 0 )
{
for (int i = 1; i < nant; i++)
Expand All @@ -305,7 +308,6 @@ __global__ void vmBeamform_kernel( cuDoubleComplex *Jv_Q,
ey[0] = cuCadd( ey[0], ey[i] );
Nxx[0] = cuCadd( Nxx[0], Nxx[i] );
Nxy[0] = cuCadd( Nxy[0], Nxy[i] );
//Nyx[0]=cuCadd( Nyx[0], Nyx[i] );
Nyy[0] = cuCadd( Nyy[0], Nyy[i] );
}
}
Expand Down Expand Up @@ -543,16 +545,26 @@ void vmBeamformChunk( vcsbeam_context *vm )
{
uintptr_t shared_array_size = 11 * vm->obs_metadata->num_ants * sizeof(double);
// (To see how the 11*STATION double arrays are used, go to this code tag: 11NSTATION)
#ifdef DEBUG
fprintf( stderr, "shared_array_size=%d bytes\n", 11 * vm->obs_metadata->num_ants * sizeof(double));
#endif

// Define GPU compute frame sizes
dim3 chan_samples( vm->nfine_chan, vm->fine_sample_rate / vm->chunks_per_second );
dim3 stat( vm->obs_metadata->num_ants );

// Get the "chunk" number
int chunk = vm->chunk_to_load % vm->chunks_per_second;
gpuErrchk( cudaDeviceSynchronize() );

// Send off a parallel CUDA stream for each pointing
int p;
for (p = 0; p < vm->npointing; p++ )
{
#ifdef DEBUG
fprintf(stderr, "vm->npointing=%d pointing=%d\n", vm->npointing, p);
fprintf(stderr, "chan_samples=(%d,%d,%d) stat=(%d,%d,%d)\n", chan_samples.x, chan_samples.y, chan_samples.z, stat.x, stat.y, stat.z);
#endif
// Call the beamformer kernel
vmBeamform_kernel<<<chan_samples, stat, shared_array_size, vm->streams[p]>>>(
vm->d_Jv_Q,
Expand All @@ -570,7 +582,6 @@ void vmBeamformChunk( vcsbeam_context *vm )
cudaCheckErrors( "vmBeamformChunk: vmBeamform_kernel failed" );
}
gpuErrchk( cudaDeviceSynchronize() );

}

/**
Expand Down

0 comments on commit 2a93cdf

Please sign in to comment.