Merge pull request #270 from randompast/convolve_1d_2o3o

[REVIEW] Implemented convolve1d{2/3}o 'valid', needed={tests,benchmarks,optimizations}
rapidsai · Nov 18, 2020 · 53bdc01 · 53bdc01
2 parents a3e5293 + ef3756d
commit 53bdc01
Show file tree

Hide file tree

Showing 6 changed files with 606 additions and 0 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,7 @@
 
 ## New Features
 - PR #241 - Add inverse_complex_cepstrum and minimum_phase to acoustics module
+- PR #270 - Add second- and third-order convolutions as convolve1d2o and convolve1d3o
 
 ## Improvements
 - PR #267 - Various optimization across all functions

diff --git a/cpp/src/convolution/_convolution.cu b/cpp/src/convolution/_convolution.cu
@@ -576,3 +576,221 @@ extern "C" __global__ void __launch_bounds__( 256 )
                                   const int pick ) {
     _cupy_correlate2D<thrust::complex<double>>( inp, inpW, inpH, kernel, kerW, kerH, S0, S1, out, outW, outH, pick );
 }
+
+
+///////////////////////////////////////////////////////////////////////////////
+//                              CONVOLVE 1D2O                                //
+///////////////////////////////////////////////////////////////////////////////
+
+/**
+ * Device-side body shared by every _cupy_convolve1D2O_* kernel.
+ *
+ * Each thread starts at its global index and advances by
+ * blockDim.x * gridDim.x until outW is reached. For every visited output
+ * index n, when mode == 0 ("Valid"):
+ *
+ *   out[n] = sum_{i < kerW} sum_{j < kerH}
+ *              inp[n + kerW - 1 - i] * inp[n + kerH - 1 - j] * kernel[i * kerH + j]
+ *
+ * For any other mode, or when the window starting at n would run past
+ * inpW, a value-initialised T is written to out[n] instead.
+ *
+ * @param inp    input samples (inpW elements)
+ * @param inpW   number of elements in inp
+ * @param kernel kerW x kerH coefficient table, read in row-major order
+ * @param kerW   extent of the first kernel axis
+ * @param kerH   extent of the second kernel axis
+ * @param mode   only 0 ("Valid") performs a computation
+ * @param out    output buffer (outW elements); every element is written
+ * @param outW   number of output elements
+ */
+template<typename T>
+__device__ void _cupy_convolve1D2O( const T *__restrict__ inp,
+                                const int inpW,
+                                const T *__restrict__ kernel,
+                                const int  kerW,
+                                const int  kerH,
+                                const int  mode,
+                                T *__restrict__ out,
+                                const int outW ) {
+
+    const int tx { static_cast<int>( blockIdx.x * blockDim.x + threadIdx.x ) };
+    const int stride { static_cast<int>( blockDim.x * gridDim.x ) };
+
+    for ( int tid = tx; tid < outW; tid += stride ) {
+
+        T temp {};
+
+        if ( mode == 0 ) {  // Valid
+            // The furthest sample touched is inp[tid + max(kerW, kerH) - 1];
+            // only evaluate windows that lie completely inside inp.
+            if ( tid >= 0 && tid + kerW <= inpW && tid + kerH <= inpW ) {
+                for ( int i = 0; i < kerW; i++ ) {
+                    // Does not depend on j, so load it once per row.
+                    const T first = inp[tid + kerW - i - 1];
+                    for ( int j = 0; j < kerH; j++ ) {
+                        // Row-major offset: the row pitch of a kerW x kerH
+                        // table is kerH, which keeps non-square kernels correct.
+                        temp += first * inp[tid + kerH - j - 1] * kernel[i * kerH + j];
+                    }
+                }
+            }
+        }
+        out[tid] = temp;
+    }
+
+}
+
+// C-linkage kernel entry point for the 1D2O convolution on `int` data.
+// Forwards every argument unchanged to _cupy_convolve1D2O<int>.
+// Compiled with __launch_bounds__( 512 ).
+extern "C" __global__ void __launch_bounds__( 512 )
+    _cupy_convolve1D2O_int32( const int *__restrict__ inp,
+                              const int inpW,
+                              const int *__restrict__ kernel,
+                              const int kerW,
+                              const int kerH,
+                              const int mode,
+                              int *__restrict__ out,
+                              const int outW ) {
+    _cupy_convolve1D2O<int>( inp, inpW, kernel, kerW, kerH, mode, out, outW );
+}
+
+// C-linkage kernel entry point for the 1D2O convolution on `long int` data.
+// Forwards every argument unchanged to _cupy_convolve1D2O<long int>.
+// Compiled with __launch_bounds__( 512 ).
+extern "C" __global__ void __launch_bounds__( 512 )
+    _cupy_convolve1D2O_int64( const long int *__restrict__ inp,
+                              const int inpW,
+                              const long int *__restrict__ kernel,
+                              const int kerW,
+                              const int kerH,
+                              const int mode,
+                              long int *__restrict__ out,
+                              const int outW ) {
+    _cupy_convolve1D2O<long int>( inp, inpW, kernel, kerW, kerH, mode, out, outW );
+}
+
+// C-linkage kernel entry point for the 1D2O convolution on `float` data.
+// Forwards every argument unchanged to _cupy_convolve1D2O<float>.
+// Compiled with __launch_bounds__( 512 ).
+extern "C" __global__ void __launch_bounds__( 512 )
+    _cupy_convolve1D2O_float32( const float *__restrict__ inp,
+                                const int inpW,
+                                const float *__restrict__ kernel,
+                                const int kerW,
+                                const int kerH,
+                                const int mode,
+                                float *__restrict__ out,
+                                const int outW ) {
+    _cupy_convolve1D2O<float>( inp, inpW, kernel, kerW, kerH, mode, out, outW );
+}
+
+// C-linkage kernel entry point for the 1D2O convolution on `double` data.
+// Forwards every argument unchanged to _cupy_convolve1D2O<double>.
+// Compiled with __launch_bounds__( 512 ).
+extern "C" __global__ void __launch_bounds__( 512 )
+    _cupy_convolve1D2O_float64( const double *__restrict__ inp,
+                                const int inpW,
+                                const double *__restrict__ kernel,
+                                const int kerW,
+                                const int kerH,
+                                const int mode,
+                                double *__restrict__ out,
+                                const int outW ) {
+    _cupy_convolve1D2O<double>( inp, inpW, kernel, kerW, kerH, mode, out, outW );
+}
+
+// C-linkage kernel entry point for the 1D2O convolution on
+// thrust::complex<float> data. Forwards every argument unchanged to
+// _cupy_convolve1D2O<thrust::complex<float>>.
+// Compiled with __launch_bounds__( 512 ).
+// inp and kernel are only read, so they are const-qualified like the
+// pointers of the other _cupy_convolve1D2O_* entry points.
+extern "C" __global__ void __launch_bounds__( 512 )
+    _cupy_convolve1D2O_complex64( const thrust::complex<float> *__restrict__ inp,
+                              const int inpW,
+                              const thrust::complex<float> *__restrict__ kernel,
+                              const int  kerW,
+                              const int  kerH,
+                              const int  mode,
+                              thrust::complex<float> *__restrict__ out,
+                              const int outW ) {
+    _cupy_convolve1D2O<thrust::complex<float>>( inp, inpW, kernel, kerW, kerH, mode, out, outW );
+}
+
+// C-linkage kernel entry point for the 1D2O convolution on
+// thrust::complex<double> data. Forwards every argument unchanged to the
+// _cupy_convolve1D2O template. Compiled with __launch_bounds__( 512 ).
+extern "C" __global__ void __launch_bounds__( 512 ) _cupy_convolve1D2O_complex128(
+    const thrust::complex<double> *__restrict__ inp,
+    const int inpW,
+    const thrust::complex<double> *__restrict__ kernel,
+    const int kerW,
+    const int kerH,
+    const int mode,
+    thrust::complex<double> *__restrict__ out,
+    const int outW ) {
+    using value_t = thrust::complex<double>;
+    _cupy_convolve1D2O<value_t>( inp, inpW, kernel, kerW, kerH, mode, out, outW );
+}
+
+
+
+///////////////////////////////////////////////////////////////////////////////
+//                              CONVOLVE 1D3O                                //
+///////////////////////////////////////////////////////////////////////////////
+
+/**
+ * Device-side body shared by every _cupy_convolve1D3O_* kernel.
+ *
+ * Each thread starts at its global index and advances by
+ * blockDim.x * gridDim.x until outW is reached. For every visited output
+ * index n, when mode == 0 ("Valid"):
+ *
+ *   out[n] = sum_{i < kerW} sum_{j < kerH} sum_{k < kerD}
+ *              inp[n + kerW - 1 - i] * inp[n + kerH - 1 - j] * inp[n + kerD - 1 - k]
+ *              * kernel[(i * kerH + j) * kerD + k]
+ *
+ * For any other mode, or when the window starting at n would run past
+ * inpW, a value-initialised T is written to out[n] instead.
+ *
+ * @param inp    input samples (inpW elements)
+ * @param inpW   number of elements in inp
+ * @param kernel kerW x kerH x kerD coefficient table, read in row-major order
+ * @param kerW   extent of the first kernel axis
+ * @param kerH   extent of the second kernel axis
+ * @param kerD   extent of the third kernel axis
+ * @param mode   only 0 ("Valid") performs a computation
+ * @param out    output buffer (outW elements); every element is written
+ * @param outW   number of output elements
+ */
+template<typename T>
+__device__ void _cupy_convolve1D3O( const T *__restrict__ inp,
+                                const int inpW,
+                                const T *__restrict__ kernel,
+                                const int  kerW,
+                                const int  kerH,
+                                const int  kerD,
+                                const int  mode,
+                                T *__restrict__ out,
+                                const int outW ) {
+
+    const int tx { static_cast<int>( blockIdx.x * blockDim.x + threadIdx.x ) };
+    const int stride { static_cast<int>( blockDim.x * gridDim.x ) };
+
+    for ( int tid = tx; tid < outW; tid += stride ) {
+
+        T temp {};
+
+        if ( mode == 0 ) {  // Valid
+            // The furthest sample touched is inp[tid + max(kerW, kerH, kerD) - 1];
+            // only evaluate windows that lie completely inside inp.
+            if ( tid >= 0 && tid + kerW <= inpW && tid + kerH <= inpW && tid + kerD <= inpW ) {
+                for ( int i = 0; i < kerW; i++ ) {
+                    // Does not depend on j or k, so load it once per i.
+                    const T first = inp[tid + kerW - i - 1];
+                    for ( int j = 0; j < kerH; j++ ) {
+                        // Partial product shared by every k; keeps the original
+                        // left-to-right multiplication order.
+                        const T pair = first * inp[tid + kerH - j - 1];
+                        // Row-major offset of element (i, j, 0) in a
+                        // kerW x kerH x kerD table; correct for non-cubic kernels.
+                        const int base = ( i * kerH + j ) * kerD;
+                        for ( int k = 0; k < kerD; k++ ) {
+                            temp += pair * inp[tid + kerD - k - 1] * kernel[base + k];
+                        }
+                    }
+                }
+            }
+        }
+        out[tid] = temp;
+    }
+
+}
+
+// C-linkage kernel entry point for the 1D3O convolution on `int` data.
+// Forwards every argument unchanged to _cupy_convolve1D3O<int>.
+// Compiled with __launch_bounds__( 512 ).
+extern "C" __global__ void __launch_bounds__( 512 )
+    _cupy_convolve1D3O_int32( const int *__restrict__ inp,
+                              const int inpW,
+                              const int *__restrict__ kernel,
+                              const int kerW,
+                              const int kerH,
+                              const int kerD,
+                              const int mode,
+                              int *__restrict__ out,
+                              const int outW ) {
+    _cupy_convolve1D3O<int>( inp, inpW, kernel, kerW, kerH, kerD, mode, out, outW );
+}
+
+// C-linkage kernel entry point for the 1D3O convolution on `long int` data.
+// Forwards every argument unchanged to _cupy_convolve1D3O<long int>.
+// Compiled with __launch_bounds__( 512 ).
+extern "C" __global__ void __launch_bounds__( 512 )
+    _cupy_convolve1D3O_int64( const long int *__restrict__ inp,
+                              const int inpW,
+                              const long int *__restrict__ kernel,
+                              const int kerW,
+                              const int kerH,
+                              const int kerD,
+                              const int mode,
+                              long int *__restrict__ out,
+                              const int outW ) {
+    _cupy_convolve1D3O<long int>( inp, inpW, kernel, kerW, kerH, kerD, mode, out, outW );
+}
+
+// C-linkage kernel entry point for the 1D3O convolution on `float` data.
+// Forwards every argument unchanged to _cupy_convolve1D3O<float>.
+// Compiled with __launch_bounds__( 512 ).
+extern "C" __global__ void __launch_bounds__( 512 )
+    _cupy_convolve1D3O_float32( const float *__restrict__ inp,
+                                const int inpW,
+                                const float *__restrict__ kernel,
+                                const int kerW,
+                                const int kerH,
+                                const int kerD,
+                                const int mode,
+                                float *__restrict__ out,
+                                const int outW ) {
+    _cupy_convolve1D3O<float>( inp, inpW, kernel, kerW, kerH, kerD, mode, out, outW );
+}
+
+// C-linkage kernel entry point for the 1D3O convolution on `double` data.
+// Forwards every argument unchanged to _cupy_convolve1D3O<double>.
+// Compiled with __launch_bounds__( 512 ).
+extern "C" __global__ void __launch_bounds__( 512 )
+    _cupy_convolve1D3O_float64( const double *__restrict__ inp,
+                                const int inpW,
+                                const double *__restrict__ kernel,
+                                const int kerW,
+                                const int kerH,
+                                const int kerD,
+                                const int mode,
+                                double *__restrict__ out,
+                                const int outW ) {
+    _cupy_convolve1D3O<double>( inp, inpW, kernel, kerW, kerH, kerD, mode, out, outW );
+}
+
+// C-linkage kernel entry point for the 1D3O convolution on
+// thrust::complex<float> data. Forwards every argument unchanged to
+// _cupy_convolve1D3O<thrust::complex<float>>.
+// Compiled with __launch_bounds__( 512 ).
+// inp and kernel are only read, so they are const-qualified like the
+// pointers of the other _cupy_convolve1D3O_* entry points.
+extern "C" __global__ void __launch_bounds__( 512 )
+    _cupy_convolve1D3O_complex64( const thrust::complex<float> *__restrict__ inp,
+                              const int inpW,
+                              const thrust::complex<float> *__restrict__ kernel,
+                              const int  kerW,
+                              const int  kerH,
+                              const int  kerD,
+                              const int  mode,
+                              thrust::complex<float> *__restrict__ out,
+                              const int outW ) {
+    _cupy_convolve1D3O<thrust::complex<float>>( inp, inpW, kernel, kerW, kerH, kerD, mode, out, outW );
+}
+
+// C-linkage kernel entry point for the 1D3O convolution on
+// thrust::complex<double> data. Forwards every argument unchanged to the
+// _cupy_convolve1D3O template. Compiled with __launch_bounds__( 512 ).
+extern "C" __global__ void __launch_bounds__( 512 ) _cupy_convolve1D3O_complex128(
+    const thrust::complex<double> *__restrict__ inp,
+    const int inpW,
+    const thrust::complex<double> *__restrict__ kernel,
+    const int kerW,
+    const int kerH,
+    const int kerD,
+    const int mode,
+    thrust::complex<double> *__restrict__ out,
+    const int outW ) {
+    using value_t = thrust::complex<double>;
+    _cupy_convolve1D3O<value_t>( inp, inpW, kernel, kerW, kerH, kerD, mode, out, outW );
+}
diff --git a/python/cusignal/__init__.py b/python/cusignal/__init__.py
@@ -40,6 +40,8 @@
     choose_conv_method,
     convolve,
     convolve2d,
+    convolve1d2o,
+    convolve1d3o,
 )
 from cusignal.filter_design.fir_filter_design import (
     kaiser_beta,

diff --git a/python/cusignal/convolution/__init__.py b/python/cusignal/convolution/__init__.py
@@ -16,5 +16,7 @@
     fftconvolve,
     convolve2d,
     choose_conv_method,
+    convolve1d2o,
+    convolve1d3o,
 )
 from cusignal.convolution.correlate import correlate, correlate2d