From 2cca8c0d8b96e555c9a39b6d484ab87dd51ab9b1 Mon Sep 17 00:00:00 2001 From: randompast Date: Mon, 19 Oct 2020 00:01:51 -0400 Subject: [PATCH 1/5] Implemented convolve{1d2o,1d3o} mode='valid', shape={square,cube}, needed={tests,validations,benchmarks,optimizations} --- cpp/src/convolution/_convolution.cu | 218 ++++++++++++++++++ python/cusignal/__init__.py | 2 + python/cusignal/convolution/__init__.py | 2 + .../cusignal/convolution/_convolution_cuda.py | 197 ++++++++++++++++ python/cusignal/convolution/convolve.py | 186 +++++++++++++++ 5 files changed, 605 insertions(+) diff --git a/cpp/src/convolution/_convolution.cu b/cpp/src/convolution/_convolution.cu index 9d6959c6..7dba54ce 100644 --- a/cpp/src/convolution/_convolution.cu +++ b/cpp/src/convolution/_convolution.cu @@ -576,3 +576,221 @@ extern "C" __global__ void __launch_bounds__( 256 ) const int pick ) { _cupy_correlate2D>( inp, inpW, inpH, kernel, kerW, kerH, S0, S1, out, outW, outH, pick ); } + + +/////////////////////////////////////////////////////////////////////////////// +// CONVOLVE 1D2O // +/////////////////////////////////////////////////////////////////////////////// + +template +__device__ void _cupy_convolve1D2O( const T *__restrict__ inp, + const int inpW, + const T *__restrict__ kernel, + const int kerW, + const int kerH, + const int mode, + T *__restrict__ out, + const int outW ) { + + const int tx { static_cast( blockIdx.x * blockDim.x + threadIdx.x ) }; + const int stride { static_cast( blockDim.x * gridDim.x ) }; + + for ( int tid = tx; tid < outW; tid += stride ) { + + T temp {}; + + if ( mode == 0 ) { // Valid + if ( tid >= 0 && tid < inpW ) { + for ( int i = 0; i < kerW; i++ ) { + for ( int j = 0; j < kerH; j++ ) { + temp += inp[tid + kerW - i - 1] * inp[tid + kerH - j - 1] * kernel[ kerW * i + j]; + } + } + } + } + out[tid] = temp; + } + +} + +extern "C" __global__ void __launch_bounds__( 512 ) _cupy_convolve1D2O_int32( const int *__restrict__ inp, + const int inpW, + const int *__restrict__ kernel, + const int kerW, + const int kerH, + const int mode, + int *__restrict__ out, + const int outW ) { + _cupy_convolve1D2O( inp, inpW, kernel, kerW, kerH, mode, out, outW ); +} + +extern "C" __global__ void __launch_bounds__( 512 ) _cupy_convolve1D2O_int64( const long int *__restrict__ inp, + const int inpW, + const long int *__restrict__ kernel, + const int kerW, + const int kerH, + const int mode, + long int *__restrict__ out, + const int outW ) { + _cupy_convolve1D2O( inp, inpW, kernel, kerW, kerH, mode, out, outW ); +} + +extern "C" __global__ void __launch_bounds__( 512 ) _cupy_convolve1D2O_float32( const float *__restrict__ inp, + const int inpW, + const float *__restrict__ kernel, + const int kerW, + const int kerH, + const int mode, + float *__restrict__ out, + const int outW ) { + _cupy_convolve1D2O( inp, inpW, kernel, kerW, kerH, mode, out, outW ); +} + +extern "C" __global__ void __launch_bounds__( 512 ) _cupy_convolve1D2O_float64( const double *__restrict__ inp, + const int inpW, + const double *__restrict__ kernel, + const int kerW, + const int kerH, + const int mode, + double *__restrict__ out, + const int outW ) { + _cupy_convolve1D2O( inp, inpW, kernel, kerW, kerH, mode, out, outW ); +} + +extern "C" __global__ void __launch_bounds__( 512 ) + _cupy_convolve1D2O_complex64( thrust::complex *__restrict__ inp, + const int inpW, + thrust::complex *__restrict__ kernel, + const int kerW, + const int kerH, + const int mode, + thrust::complex *__restrict__ out, + const int outW ) { + _cupy_convolve1D2O>( inp, inpW, kernel, kerW, kerH, mode, out, outW ); +} + +extern "C" __global__ void __launch_bounds__( 512 ) + _cupy_convolve1D2O_complex128( const thrust::complex *__restrict__ inp, + const int inpW, + const thrust::complex *__restrict__ kernel, + const int kerW, + const int kerH, + const int mode, + thrust::complex *__restrict__ out, + const int outW ) { + _cupy_convolve1D2O>( inp, inpW, kernel, kerW, kerH, mode, out, outW ); +} + + + +/////////////////////////////////////////////////////////////////////////////// +// CONVOLVE 1D3O // +/////////////////////////////////////////////////////////////////////////////// + +template +__device__ void _cupy_convolve1D3O( const T *__restrict__ inp, + const int inpW, + const T *__restrict__ kernel, + const int kerW, + const int kerH, + const int kerD, + const int mode, + T *__restrict__ out, + const int outW ) { + + const int tx { static_cast( blockIdx.x * blockDim.x + threadIdx.x ) }; + const int stride { static_cast( blockDim.x * gridDim.x ) }; + + for ( int tid = tx; tid < outW; tid += stride ) { + + T temp {}; + + if ( mode == 0 ) { // Valid + if ( tid >= 0 && tid < inpW ) { + for ( int i = 0; i < kerW; i++ ) { + for ( int j = 0; j < kerH; j++ ) { + for ( int k = 0; k < kerD; k++ ) { + temp += inp[tid + kerW - i - 1] * inp[tid + kerH - j - 1] * inp[tid + kerD - k - 1] * kernel[ (kerW * i + j) * kerH + k ]; + } + } + } + } + } + out[tid] = temp; + } + +} + +extern "C" __global__ void __launch_bounds__( 512 ) _cupy_convolve1D3O_int32( const int *__restrict__ inp, + const int inpW, + const int *__restrict__ kernel, + const int kerW, + const int kerH, + const int kerD, + const int mode, + int *__restrict__ out, + const int outW ) { + _cupy_convolve1D3O( inp, inpW, kernel, kerW, kerH, kerD, mode, out, outW ); +} + +extern "C" __global__ void __launch_bounds__( 512 ) _cupy_convolve1D3O_int64( const long int *__restrict__ inp, + const int inpW, + const long int *__restrict__ kernel, + const int kerW, + const int kerH, + const int kerD, + const int mode, + long int *__restrict__ out, + const int outW ) { + _cupy_convolve1D3O( inp, inpW, kernel, kerW, kerH, kerD, mode, out, outW ); +} + +extern "C" __global__ void __launch_bounds__( 512 ) _cupy_convolve1D3O_float32( const float *__restrict__ inp, + const int inpW, + const float *__restrict__ kernel, + const int kerW, + const int kerH, + const int kerD, + const int mode, + float *__restrict__ out, + const int outW ) { + _cupy_convolve1D3O( inp, inpW, kernel, kerW, kerH, kerD, mode, out, outW ); +} + +extern "C" __global__ void __launch_bounds__( 512 ) _cupy_convolve1D3O_float64( const double *__restrict__ inp, + const int inpW, + const double *__restrict__ kernel, + const int kerW, + const int kerH, + const int kerD, + const int mode, + double *__restrict__ out, + const int outW ) { + _cupy_convolve1D3O( inp, inpW, kernel, kerW, kerH, kerD, mode, out, outW ); +} + +extern "C" __global__ void __launch_bounds__( 512 ) + _cupy_convolve1D3O_complex64( thrust::complex *__restrict__ inp, + const int inpW, + thrust::complex *__restrict__ kernel, + const int kerW, + const int kerH, + const int kerD, + const int mode, + thrust::complex *__restrict__ out, + const int outW ) { + _cupy_convolve1D3O>( inp, inpW, kernel, kerW, kerH, kerD, mode, out, outW ); +} + +extern "C" __global__ void __launch_bounds__( 512 ) + _cupy_convolve1D3O_complex128( const thrust::complex *__restrict__ inp, + const int inpW, + const thrust::complex *__restrict__ kernel, + const int kerW, + const int kerH, + const int kerD, + const int mode, + thrust::complex *__restrict__ out, + const int outW ) { + _cupy_convolve1D3O>( inp, inpW, kernel, kerW, kerH, kerD, mode, out, outW ); +} diff --git a/python/cusignal/__init__.py b/python/cusignal/__init__.py index 6a1b4d3c..25dcb09a 100644 --- a/python/cusignal/__init__.py +++ b/python/cusignal/__init__.py @@ -40,6 +40,8 @@ choose_conv_method, convolve, convolve2d, + convolve1d2o, + convolve1d3o, ) from cusignal.filter_design.fir_filter_design import ( kaiser_beta, diff --git a/python/cusignal/convolution/__init__.py b/python/cusignal/convolution/__init__.py index f46d4dce..c42f4229 100644 --- a/python/cusignal/convolution/__init__.py +++ b/python/cusignal/convolution/__init__.py @@ -16,5 +16,7 @@ fftconvolve, convolve2d, choose_conv_method, + convolve1d2o, + convolve1d3o, ) from cusignal.convolution.correlate import correlate, correlate2d diff --git a/python/cusignal/convolution/_convolution_cuda.py b/python/cusignal/convolution/_convolution_cuda.py index dff444cd..edf2baa9 100644 --- a/python/cusignal/convolution/_convolution_cuda.py +++ b/python/cusignal/convolution/_convolution_cuda.py @@ -73,6 +73,73 @@ def __call__( self.kernel(self.grid, self.block, kernel_args) +class _cupy_convolve_1d2o_wrapper(object): + def __init__(self, grid, block, kernel): + if isinstance(grid, int): + grid = (grid,) + if isinstance(block, int): + block = (block,) + + self.grid = grid + self.block = block + self.kernel = kernel + + def __call__( + self, + d_inp, + d_kernel, + mode, + out, + ): + + kernel_args = ( + d_inp, + d_inp.shape[0], + d_kernel, + d_kernel.shape[0], + d_kernel.shape[1], + mode, + out, + out.shape[0], + ) + + self.kernel(self.grid, self.block, kernel_args) + + +class _cupy_convolve_1d3o_wrapper(object): + def __init__(self, grid, block, kernel): + if isinstance(grid, int): + grid = (grid,) + if isinstance(block, int): + block = (block,) + + self.grid = grid + self.block = block + self.kernel = kernel + + def __call__( + self, + d_inp, + d_kernel, + mode, + out, + ): + + kernel_args = ( + d_inp, + d_inp.shape[0], + d_kernel, + d_kernel.shape[0], + d_kernel.shape[1], + d_kernel.shape[2], + mode, + out, + out.shape[0], + ) + + self.kernel(self.grid, self.block, kernel_args) + + class _cupy_convolve_2d_wrapper(object): def __init__(self, grid, block, kernel): if isinstance(grid, int): @@ -145,6 +212,10 @@ def _get_backend_kernel( return _cupy_convolve_wrapper(grid, block, kernel) elif k_type == "convolve2D" or k_type == "correlate2D": return _cupy_convolve_2d_wrapper(grid, block, kernel) + elif k_type == "convolve1D2O": + return _cupy_convolve_1d2o_wrapper(grid, block, kernel) + elif k_type == "convolve1D3O": + return _cupy_convolve_1d3o_wrapper(grid, block, kernel) else: raise NotImplementedError( "No CuPY kernel found for k_type {}, datatype {}".format( @@ -439,3 +510,129 @@ def _convolve2d(in1, in2, use_convolve, mode, boundary, fillvalue): ) return out + + +def _convolve1d2o_gpu( + inp, + out, + ker, + mode, +): + + d_inp = cp.array(inp) + d_kernel = cp.array(ker) + + threadsperblock, blockspergrid = _get_tpb_bpg() + + k_type = "convolve1D2O" + + _populate_kernel_cache(out.dtype, k_type) + + kernel = _get_backend_kernel( + out.dtype, + blockspergrid, + threadsperblock, + k_type, + ) + + kernel(d_inp, d_kernel, mode, out) + + _print_atts(kernel) + + return out + + +def _convolve1d2o(in1, in2, mode): + + val = _valfrommode(mode) + + # Promote inputs + promType = cp.promote_types(in1.dtype, in2.dtype) + in1 = in1.astype(promType) + in2 = in2.astype(promType) + + # Create empty array to hold number of aout dimensions + out_dimens = np.empty(in1.ndim, np.int) + if val == VALID: + for i in range(in1.ndim): + out_dimens[i] = in1.shape[i] - in2.shape[i] + 1 + if out_dimens[i] < 0: + raise Exception( + "no part of the output is valid, use option 1 (same) or 2 \ + (full) for third argument" + ) + + # Create empty array out on GPU + out = cp.empty(out_dimens.tolist(), in1.dtype) + + out = _convolve1d2o_gpu( + in1, + out, + in2, + val, + ) + + return out + + +def _convolve1d3o_gpu( + inp, + out, + ker, + mode, +): + + d_inp = cp.array(inp) + d_kernel = cp.array(ker) + + threadsperblock, blockspergrid = _get_tpb_bpg() + + k_type = "convolve1D3O" + + _populate_kernel_cache(out.dtype, k_type) + + kernel = _get_backend_kernel( + out.dtype, + blockspergrid, + threadsperblock, + k_type, + ) + + kernel(d_inp, d_kernel, mode, out) + + _print_atts(kernel) + + return out + + +def _convolve1d3o(in1, in2, mode): + + val = _valfrommode(mode) + + # Promote inputs + promType = cp.promote_types(in1.dtype, in2.dtype) + in1 = in1.astype(promType) + in2 = in2.astype(promType) + + # Create empty array to hold number of aout dimensions + out_dimens = np.empty(in1.ndim, np.int) + if val == VALID: + for i in range(in1.ndim): + out_dimens[i] = in1.shape[i] - in2.shape[i] + 1 + if out_dimens[i] < 0: + raise Exception( + "no part of the output is valid, use option 1 (same) or 2 \ + (full) for third argument" + ) + + # Create empty array out on GPU + out = cp.empty(out_dimens.tolist(), in1.dtype) + + out = _convolve1d3o_gpu( + in1, + out, + in2, + val, + ) + + return out \ No newline at end of file diff --git a/python/cusignal/convolution/convolve.py b/python/cusignal/convolution/convolve.py index ad0fb571..3a80baa3 100644 --- a/python/cusignal/convolution/convolve.py +++ b/python/cusignal/convolution/convolve.py @@ -526,3 +526,189 @@ def choose_conv_method(in1, in2, mode="full", measure=False): return "fft" return "direct" + + +def convolve1d2o( + in1, + in2, + mode="valid", + method="direct", +): + """ + Convolve a 1-dimensional arrays with a 2nd order filter. + This results in a second order convolution. + + Convolve `in1` and `in2`, with the output size determined by the + `mode` argument. + + Parameters + ---------- + in1 : array_like + First input. + in2 : array_like + Second input. Should have the same number of dimensions as `in1`. + mode : str {'full', 'valid', 'same'}, optional + A string indicating the size of the output: + + ``full`` + The output is the full discrete linear convolution + of the inputs. (Default) + ``valid`` + The output consists only of those elements that do not + rely on the zero-padding. In 'valid' mode, either `in1` or `in2` + must be at least as large as the other in every dimension. + ``same`` + The output is the same size as `in1`, centered + with respect to the 'full' output. + method : str {'auto', 'direct', 'fft'}, optional + A string indicating which method to use to calculate the convolution. + + ``direct`` + The convolution is determined directly from sums, the definition of + convolution. + ``fft`` + The Fourier Transform is used to perform the convolution by calling + `fftconvolve`. + ``auto`` + Automatically chooses direct or Fourier method based on an estimate + of which is faster (default). + + Returns + ------- + out : ndarray + A 1-dimensional array containing a subset of the discrete linear + convolution of `in1` with `in2`. + + See Also + -------- + convolve + convolve1d2o + convolve1d3o + + Examples + -------- + Convolution of a 2nd order filter on a 1d signal + + >>> import cusignal as cs + >>> import numpy as np + >>> d = 50 + >>> a = np.random.uniform(-1,1,(200)) + >>> b = np.random.uniform(-1,1,(d,d)) + >>> c = cs.convolve1d3o(a,b) + + """ + + signal = cp.asarray(in1) + kernel = cp.asarray(in2) + + if mode == "valid" and signal.shape[0] < kernel.shape[0]: + # Convolution is commutative + # order doesn't have any effect on output + signal, kernel = kernel, signal + + if mode in ["same", "full"]: + raise NotImplementedError( + "Mode == {} not implemented".format( + mode + ) + ) + + if method == "direct": + return _convolution_cuda._convolve1d2o( + signal, kernel, mode + ) + else: + raise NotImplementedError("Only Direct method implemented") + + +def convolve1d3o( + in1, + in2, + mode="valid", + method="direct", +): + """ + Convolve a 1-dimensional array with a 3rd order filter. + This results in a second order convolution. + + Convolve `in1` and `in2`, with the output size determined by the + `mode` argument. + + Parameters + ---------- + in1 : array_like + First input. + in2 : array_like + Second input. Should have the same number of dimensions as `in1`. + mode : str {'full', 'valid', 'same'}, optional + A string indicating the size of the output: + + ``full`` + The output is the full discrete linear convolution + of the inputs. (Default) + ``valid`` + The output consists only of those elements that do not + rely on the zero-padding. In 'valid' mode, either `in1` or `in2` + must be at least as large as the other in every dimension. + ``same`` + The output is the same size as `in1`, centered + with respect to the 'full' output. + method : str {'auto', 'direct', 'fft'}, optional + A string indicating which method to use to calculate the convolution. + + ``direct`` + The convolution is determined directly from sums, the definition of + convolution. + ``fft`` + The Fourier Transform is used to perform the convolution by calling + `fftconvolve`. + ``auto`` + Automatically chooses direct or Fourier method based on an estimate + of which is faster (default). + + Returns + ------- + out : ndarray + A 1-dimensional array containing a subset of the discrete linear + convolution of `in1` with `in2`. + + See Also + -------- + convolve + convolve1d2o + convolve1d3o + + Examples + -------- + Convolution of a 3rd order filter on a 1d signal + + >>> import cusignal as cs + >>> import numpy as np + >>> d = 50 + >>> a = np.random.uniform(-1,1,(200)) + >>> b = np.random.uniform(-1,1,(d,d,d)) + >>> c = cs.convolve1d3o(a,b) + + """ + + signal = cp.asarray(in1) + kernel = cp.asarray(in2) + + if mode == "valid" and signal.shape[0] < kernel.shape[0]: + # Convolution is commutative + # order doesn't have any effect on output + signal, kernel = kernel, signal + + if mode in ["same", "full"]: + raise NotImplementedError( + "Mode == {} not implemented".format( + mode + ) + ) + + if method == "direct": + return _convolution_cuda._convolve1d3o( + signal, kernel, mode + ) + else: + raise NotImplementedError("Only Direct method implemented") From 87cdfea5f091d99c5b59352363d8de42267d19b6 Mon Sep 17 00:00:00 2001 From: randompast Date: Sat, 24 Oct 2020 01:01:50 -0400 Subject: [PATCH 2/5] changed changedlog.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index db633072..1e72448e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## New Features - PR #241 - Add inverse_complex_cepstrum and minimum_phase to acoustics module +- PR #270 - Add convolve1d2o and convolve1d3o ## Improvements From b8869d36c7b42a217c0f35c1bc9a9d03e3e11dd1 Mon Sep 17 00:00:00 2001 From: randompast Date: Sat, 24 Oct 2020 01:05:07 -0400 Subject: [PATCH 3/5] improved #270 changelog.md message --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1e72448e..c7d5ca54 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ ## New Features - PR #241 - Add inverse_complex_cepstrum and minimum_phase to acoustics module -- PR #270 - Add convolve1d2o and convolve1d3o +- PR #270 - Add second and third order convolutions as convolve1d2o and convolve1d3o ## Improvements From efe63ab523c50b579bb8ea06127e66b5c82dab7e Mon Sep 17 00:00:00 2001 From: BradReesWork Date: Wed, 18 Nov 2020 09:12:16 -0500 Subject: [PATCH 4/5] style fix --- python/cusignal/convolution/_convolution_cuda.py | 3 ++- python/cusignal/convolution/convolve.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/cusignal/convolution/_convolution_cuda.py b/python/cusignal/convolution/_convolution_cuda.py index edf2baa9..8fa88228 100644 --- a/python/cusignal/convolution/_convolution_cuda.py +++ b/python/cusignal/convolution/_convolution_cuda.py @@ -635,4 +635,5 @@ def _convolve1d3o(in1, in2, mode): val, ) - return out \ No newline at end of file + return out + diff --git a/python/cusignal/convolution/convolve.py b/python/cusignal/convolution/convolve.py index 3a80baa3..68c1fa2c 100644 --- a/python/cusignal/convolution/convolve.py +++ b/python/cusignal/convolution/convolve.py @@ -584,7 +584,7 @@ def convolve1d2o( convolve convolve1d2o convolve1d3o - + Examples -------- Convolution of a 2nd order filter on a 1d signal From ef3756db7ae31a1cd24d92a0bc7f83468af34286 Mon Sep 17 00:00:00 2001 From: BradReesWork Date: Wed, 18 Nov 2020 09:14:18 -0500 Subject: [PATCH 5/5] style --- python/cusignal/convolution/_convolution_cuda.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cusignal/convolution/_convolution_cuda.py b/python/cusignal/convolution/_convolution_cuda.py index 8fa88228..501cc6ce 100644 --- a/python/cusignal/convolution/_convolution_cuda.py +++ b/python/cusignal/convolution/_convolution_cuda.py @@ -636,4 +636,3 @@ def _convolve1d3o(in1, in2, mode): ) return out -