Skip to content

Commit

Permalink
fix for size = N*256+1
Browse files Browse the repository at this point in the history
  • Loading branch information
VinInn authored and fwyzard committed Jul 15, 2022
1 parent 4db3807 commit d5faa3d
Show file tree
Hide file tree
Showing 2 changed files with 118 additions and 104 deletions.
2 changes: 1 addition & 1 deletion HeterogeneousCore/CUDAUtilities/interface/radixSort.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ __device__ __forceinline__ void radixSortImpl(
// broadcast
ibs = size - 1;
__syncthreads();
while (__syncthreads_and(ibs > 0)) {
while (__syncthreads_and(ibs >= 0)) {
int i = ibs - threadIdx.x;
if (threadIdx.x < sb) {
cu[threadIdx.x] = -1;
Expand Down
220 changes: 117 additions & 103 deletions HeterogeneousCore/CUDAUtilities/test/oneRadixSort_t.cu
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
// #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"


#include <cstddef>
#include <cstdint>
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h"
#include <algorithm>


using FLOAT = double;

// A templated unsigned integer type with N bytes
Expand Down Expand Up @@ -43,7 +41,7 @@ using uintT_t = uintN_t<sizeof(T) * 8>;

// Keep only the `N` most significant bytes of `t`, and set the others to zero
template <int N, typename T, typename SFINAE = std::enable_if_t<N <= sizeof(T)>>
__device__ __host__ T truncate(T const & t) {
__device__ __host__ T truncate(T const& t) {
const int shift = 8 * (sizeof(T) - N);
union {
T t;
Expand All @@ -55,136 +53,152 @@ __device__ __host__ T truncate(T const & t) {
}

namespace {
__global__ void testKernel(FLOAT * gpu_input, int * gpu_product, int elements){
size_t firstElement = threadIdx.x + blockIdx.x * blockDim.x; // This is going to be the track index
size_t gridSize = blockDim.x * gridDim.x;

// radix sort works in a single block
assert(1==gridDim.x);
assert(0==blockIdx.x);
assert(elements<=2048);

__shared__ uint16_t order[2048];
__shared__ uint16_t sws[2048];
__shared__ float z[2048];
__shared__ int iz[2048];
for (unsigned int itrack=firstElement; itrack<elements; itrack+=gridSize) {
z[itrack] = gpu_input[itrack];
iz[itrack] = 10000*gpu_input[itrack];
// order[itrack] = itrack;
}
__syncthreads();
radixSort<float, 2>(z, order, sws, elements);
__syncthreads();


//verify
for (unsigned int itrack=firstElement; itrack<(elements-1); itrack+=gridSize) {
auto ntrack = order[itrack];
auto mtrack = order[itrack+1];
assert(truncate<2>(z[ntrack])<=truncate<2>(z[mtrack]));
__global__ void testKernel(FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) {
size_t firstElement = threadIdx.x + blockIdx.x * blockDim.x; // This is going to be the track index
size_t gridSize = blockDim.x * gridDim.x;

// radix sort works in a single block
assert(1 == gridDim.x);
assert(0 == blockIdx.x);
assert(elements <= 2048);

__shared__ uint16_t order[2048];
__shared__ uint16_t sws[2048];
__shared__ float z[2048];
__shared__ int iz[2048];
for (unsigned int itrack = firstElement; itrack < elements; itrack += gridSize) {
z[itrack] = gpu_input[itrack];
iz[itrack] = 10000 * gpu_input[itrack];
// order[itrack] = itrack;
}
__syncthreads();
radixSort<float, 2>(z, order, sws, elements);
__syncthreads();

//verify
for (unsigned int itrack = firstElement; itrack < (elements - 1); itrack += gridSize) {
auto ntrack = order[itrack];
auto mtrack = order[itrack + 1];
assert(truncate<2>(z[ntrack]) <= truncate<2>(z[mtrack]));
}

__syncthreads();

if (doPrint)
if (threadIdx.x == 0 && blockIdx.x == 0) {
for (unsigned int itrackO = 0; itrackO < elements; itrackO++) {
int itrack = order[itrackO];
printf(
"Radix sort with %i elements: At position %i, track position at input %i with z at input %f, z fed to "
"radixSort %f\n",
elements,
itrackO,
itrack,
gpu_input[itrack],
z[itrack]);
gpu_product[itrackO] = itrack;
}
}

__syncthreads();
__syncthreads();
radixSort<int, 4>(iz, order, sws, elements);
__syncthreads();

for (unsigned int itrack = firstElement; itrack < (elements - 1); itrack += gridSize) {
auto ntrack = order[itrack];
auto mtrack = order[itrack + 1];
assert(iz[ntrack] <= iz[mtrack]);
}

/*
if (doPrint)
if (threadIdx.x == 0 && blockIdx.x == 0) {
for (unsigned int itrackO=0; itrackO < elements; itrackO++){
int itrack = order[itrackO];
printf("Radix sort with %i elements: At position %i, track position at input %i with z at input %f, z fed to radixSort %f\n", elements, itrackO, itrack, gpu_input[itrack], z[itrack]);
gpu_product[itrackO] = itrack;
}
}
*/

__syncthreads();
radixSort<int, 4>(iz, order, sws, elements);
__syncthreads();

for (unsigned int itrack=firstElement; itrack<(elements-1); itrack+=gridSize) {
auto ntrack = order[itrack];
auto mtrack = order[itrack+1];
assert(iz[ntrack]<=iz[mtrack]);
for (unsigned int itrackO = 0; itrackO < elements; itrackO++) {
int itrack = order[itrackO];
printf(
"Radix sort with %i elements: At position %i, track position at input %i with z at input %f, z fed to "
"radixSort %d\n",
elements,
itrackO,
itrack,
gpu_input[itrack],
iz[itrack]);
gpu_product[itrackO] = itrack;
}
}
}

/*
if (threadIdx.x == 0 && blockIdx.x == 0) {
for (unsigned int itrackO=0; itrackO < elements; itrackO++){
int itrack = order[itrackO];
printf("Radix sort with %i elements: At position %i, track position at input %i with z at input %f, z fed to radixSort %d\n", elements, itrackO, itrack, gpu_input[itrack], iz[itrack]);
gpu_product[itrackO] = itrack;
}
}
*/


}


void testWrapper(FLOAT * gpu_input, int * gpu_product, int elements){
auto blockSize = 512; // somewhat arbitrary
auto gridSize = 1; // round up to cover the sample size
testKernel<<<gridSize, blockSize>>>(gpu_input, gpu_product, elements);
void testWrapper(FLOAT* gpu_input, int* gpu_product, int elements, bool doPrint) {
auto blockSize = 512; // somewhat arbitrary
auto gridSize = 1; // round up to cover the sample size
testKernel<<<gridSize, blockSize>>>(gpu_input, gpu_product, elements, doPrint);
cudaCheck(cudaGetLastError());
}
}

}
} // namespace

#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"

int main() {

cms::cudatest::requireDevices();


FLOAT* gpu_input;
int* gpu_product;


int nmax = 4*260;
FLOAT input[nmax] = {30.0, 30.0, -4.4, -7.1860761642, -6.6870317459, 1.8010582924, 2.2535820007, 2.2666890621, 2.2677690983, 2.2794606686, 2.2802586555, 2.2821085453, 2.2852313519, 2.2877883911, 2.2946476936, 2.2960267067, 2.3006286621, 2.3245604038, 2.6755006313, 2.7229132652, 2.783257246, 2.8440306187, 2.9017834663, 2.9252648354, 2.9254128933, 2.927520752, 2.9422419071, 2.9453969002, 2.9457902908, 2.9465973377, 2.9492356777, 2.9573802948, 2.9575133324, 2.9575304985, 2.9586606026, 2.9605507851, 2.9622797966, 2.9625515938, 2.9641008377, 2.9646151066, 2.9676523209, 2.9708273411, 2.974111557, 2.9742531776, 2.9772830009, 2.9877333641, 2.9960610867, 3.013969183, 3.0187871456, 3.0379793644, 3.0407221317, 3.0415751934, 3.0470511913, 3.0560519695, 3.0592908859, 3.0599737167, 3.0607066154, 3.0629007816, 3.0632448196, 3.0633215904, 3.0643932819, 3.0645000935, 3.0666446686, 3.068046093, 3.0697011948, 3.0717656612, 3.0718104839, 3.0718348026, 3.0733406544, 3.0738227367, 3.0738801956, 3.0738828182, 3.0744686127, 3.0753741264, 3.0758397579, 3.0767207146, 3.0773906708, 3.0778541565, 3.0780284405, 3.0780889988, 3.0782799721, 3.0789675713, 3.0792205334, 3.0793278217, 3.0795567036, 3.0797944069, 3.0806643963, 3.0809247494, 3.0815284252, 3.0817306042, 3.0819730759, 3.0820026398, 3.0838682652, 3.084009409, 3.0848178864, 3.0853257179, 3.0855510235, 3.0856611729, 3.0873703957, 3.0884618759, 3.0891149044, 3.0893011093, 3.0895674229, 3.0901503563, 3.0903317928, 3.0912668705, 3.0920717716, 3.0954346657, 3.096424818, 3.0995628834, 3.1001036167, 3.1173279285, 3.1185023785, 3.1195163727, 3.1568386555, 3.1675374508, 3.1676850319, 3.1886672974, 3.3769197464, 3.3821125031, 3.4780933857, 3.4822063446, 3.4989323616, 3.5076274872, 3.5225863457, 3.5271244049, 3.5298995972, 3.5417425632, 3.5444457531, 3.5465917587, 3.5473103523, 3.5480232239, 3.5526945591, 3.5531234741, 3.5538012981, 3.5544877052, 3.5547749996, 3.5549693108, 3.5550665855, 3.5558729172, 3.5560717583, 3.5560848713, 3.5584278107, 3.558681488, 3.5587313175, 3.5592217445, 3.559384346, 3.5604712963, 3.5634038448, 3.563803196, 3.564593792, 3.5660364628, 3.5683133602, 3.5696356297, 3.569729805, 3.5740811825, 3.5757565498, 3.5760207176, 3.5760478973, 3.5836098194, 3.5839796066, 3.5852358341, 3.5901627541, 3.6141786575, 3.6601481438, 3.7187042236, 3.9741659164, 4.4111995697, 4.5337572098, 4.6292567253, 4.6748633385, 4.6806583405, 4.6868157387, 4.6868577003, 4.6879930496, 4.6888813972, 4.6910686493, 4.6925001144, 4.6957530975, 4.698094368, 4.6997032166, 4.7017259598, 4.7020640373, 4.7024269104, 4.7036352158, 4.7038679123, 4.7042069435, 4.7044086456, 4.7044372559, 4.7050771713, 4.7055773735, 4.7060651779, 4.7062759399, 4.7065420151, 4.70657444, 4.7066287994, 4.7066788673, 4.7067341805, 4.7072944641, 4.7074551582, 4.7075614929, 4.7075891495, 4.7076044083, 4.7077374458, 4.7080879211, 4.70819664, 4.7086658478, 4.708937645, 4.7092385292, 4.709479332, 4.7095656395, 4.7100076675, 4.7102108002, 4.7104525566, 4.7105507851, 4.71118927, 4.7113513947, 4.7115578651, 4.7116270065, 4.7116751671, 4.7117190361, 4.7117333412, 4.7117910385, 4.7119007111, 4.7120013237, 4.712003231, 4.712044239, 4.7122926712, 4.7135767937, 4.7143669128, 4.7145690918, 4.7148418427, 4.7149815559, 4.7159647942, 4.7161884308, 4.7177276611, 4.717815876, 4.718059063, 4.7188801765, 4.7190728188, 4.7199850082, 4.7213058472, 4.7239775658, 4.7243933678, 4.7243990898, 4.7273659706, 4.7294125557, 4.7296204567, 4.7325615883, 4.7356877327, 4.740146637, 4.742254734, 4.7433848381, 4.7454957962, 4.7462964058, 4.7692604065, 4.7723139628, 4.774812736, 4.8577151299, 4.890037536};
for (int i=0; i<260; i++) {
input[i+260]=-input[i];
input[i+2*260]=input[i]+10;
input[i+3*260]=-input[i]-10;

int nmax = 4 * 260;
FLOAT input[nmax] = {
30.0, 30.0, -4.4, -7.1860761642, -6.6870317459, 1.8010582924, 2.2535820007, 2.2666890621,
2.2677690983, 2.2794606686, 2.2802586555, 2.2821085453, 2.2852313519, 2.2877883911, 2.2946476936, 2.2960267067,
2.3006286621, 2.3245604038, 2.6755006313, 2.7229132652, 2.783257246, 2.8440306187, 2.9017834663, 2.9252648354,
2.9254128933, 2.927520752, 2.9422419071, 2.9453969002, 2.9457902908, 2.9465973377, 2.9492356777, 2.9573802948,
2.9575133324, 2.9575304985, 2.9586606026, 2.9605507851, 2.9622797966, 2.9625515938, 2.9641008377, 2.9646151066,
2.9676523209, 2.9708273411, 2.974111557, 2.9742531776, 2.9772830009, 2.9877333641, 2.9960610867, 3.013969183,
3.0187871456, 3.0379793644, 3.0407221317, 3.0415751934, 3.0470511913, 3.0560519695, 3.0592908859, 3.0599737167,
3.0607066154, 3.0629007816, 3.0632448196, 3.0633215904, 3.0643932819, 3.0645000935, 3.0666446686, 3.068046093,
3.0697011948, 3.0717656612, 3.0718104839, 3.0718348026, 3.0733406544, 3.0738227367, 3.0738801956, 3.0738828182,
3.0744686127, 3.0753741264, 3.0758397579, 3.0767207146, 3.0773906708, 3.0778541565, 3.0780284405, 3.0780889988,
3.0782799721, 3.0789675713, 3.0792205334, 3.0793278217, 3.0795567036, 3.0797944069, 3.0806643963, 3.0809247494,
3.0815284252, 3.0817306042, 3.0819730759, 3.0820026398, 3.0838682652, 3.084009409, 3.0848178864, 3.0853257179,
3.0855510235, 3.0856611729, 3.0873703957, 3.0884618759, 3.0891149044, 3.0893011093, 3.0895674229, 3.0901503563,
3.0903317928, 3.0912668705, 3.0920717716, 3.0954346657, 3.096424818, 3.0995628834, 3.1001036167, 3.1173279285,
3.1185023785, 3.1195163727, 3.1568386555, 3.1675374508, 3.1676850319, 3.1886672974, 3.3769197464, 3.3821125031,
3.4780933857, 3.4822063446, 3.4989323616, 3.5076274872, 3.5225863457, 3.5271244049, 3.5298995972, 3.5417425632,
3.5444457531, 3.5465917587, 3.5473103523, 3.5480232239, 3.5526945591, 3.5531234741, 3.5538012981, 3.5544877052,
3.5547749996, 3.5549693108, 3.5550665855, 3.5558729172, 3.5560717583, 3.5560848713, 3.5584278107, 3.558681488,
3.5587313175, 3.5592217445, 3.559384346, 3.5604712963, 3.5634038448, 3.563803196, 3.564593792, 3.5660364628,
3.5683133602, 3.5696356297, 3.569729805, 3.5740811825, 3.5757565498, 3.5760207176, 3.5760478973, 3.5836098194,
3.5839796066, 3.5852358341, 3.5901627541, 3.6141786575, 3.6601481438, 3.7187042236, 3.9741659164, 4.4111995697,
4.5337572098, 4.6292567253, 4.6748633385, 4.6806583405, 4.6868157387, 4.6868577003, 4.6879930496, 4.6888813972,
4.6910686493, 4.6925001144, 4.6957530975, 4.698094368, 4.6997032166, 4.7017259598, 4.7020640373, 4.7024269104,
4.7036352158, 4.7038679123, 4.7042069435, 4.7044086456, 4.7044372559, 4.7050771713, 4.7055773735, 4.7060651779,
4.7062759399, 4.7065420151, 4.70657444, 4.7066287994, 4.7066788673, 4.7067341805, 4.7072944641, 4.7074551582,
4.7075614929, 4.7075891495, 4.7076044083, 4.7077374458, 4.7080879211, 4.70819664, 4.7086658478, 4.708937645,
4.7092385292, 4.709479332, 4.7095656395, 4.7100076675, 4.7102108002, 4.7104525566, 4.7105507851, 4.71118927,
4.7113513947, 4.7115578651, 4.7116270065, 4.7116751671, 4.7117190361, 4.7117333412, 4.7117910385, 4.7119007111,
4.7120013237, 4.712003231, 4.712044239, 4.7122926712, 4.7135767937, 4.7143669128, 4.7145690918, 4.7148418427,
4.7149815559, 4.7159647942, 4.7161884308, 4.7177276611, 4.717815876, 4.718059063, 4.7188801765, 4.7190728188,
4.7199850082, 4.7213058472, 4.7239775658, 4.7243933678, 4.7243990898, 4.7273659706, 4.7294125557, 4.7296204567,
4.7325615883, 4.7356877327, 4.740146637, 4.742254734, 4.7433848381, 4.7454957962, 4.7462964058, 4.7692604065,
4.7723139628, 4.774812736, 4.8577151299, 4.890037536};
for (int i = 0; i < 260; i++) {
input[i + 260] = -input[i];
input[i + 2 * 260] = input[i] + 10;
input[i + 3 * 260] = -input[i] - 10;
}
cudaCheck(cudaMalloc(&gpu_input, sizeof(FLOAT) * nmax));
cudaCheck(cudaMalloc(&gpu_product, sizeof(int) * nmax));
// copy the input data to the GPU
cudaCheck(cudaMemcpy(gpu_input, input, sizeof(FLOAT) * nmax, cudaMemcpyHostToDevice));


for (int k=2; k<=nmax; k++) {
if (k==257) continue;
if (k==513) continue;
if (k==769) continue;
if (k==1025) continue;
std::random_shuffle(input,input+k);
printf("Test with %d items\n",k);
for (int k = 2; k <= nmax; k++) {
std::random_shuffle(input, input + k);
printf("Test with %d items\n", k);
// sort on the GPU
testWrapper(gpu_input, gpu_product, k);
testWrapper(gpu_input, gpu_product, k, false);
cudaCheck(cudaDeviceSynchronize());
}



FLOAT input2[255] = {-7.1860761642, -6.6870317459, 1.8010582924, 2.2535820007, 2.2666890621, 2.2677690983, 2.2794606686, 2.2802586555, 2.2821085453, 2.2852313519, 2.2877883911, 2.2946476936, 2.2960267067, 2.3006286621, 2.3245604038, 2.6755006313, 2.7229132652, 2.783257246, 2.8440306187, 2.9017834663, 2.9252648354, 2.9254128933, 2.927520752, 2.9422419071, 2.9453969002, 2.9457902908, 2.9465973377, 2.9492356777, 2.9573802948, 2.9575133324, 2.9575304985, 2.9586606026, 2.9605507851, 2.9622797966, 2.9625515938, 2.9641008377, 2.9646151066, 2.9676523209, 2.9708273411, 2.974111557, 2.9742531776, 2.9772830009, 2.9877333641, 2.9960610867, 3.013969183, 3.0187871456, 3.0379793644, 3.0407221317, 3.0415751934, 3.0470511913, 3.0560519695, 3.0592908859, 3.0599737167, 3.0607066154, 3.0629007816, 3.0632448196, 3.0633215904, 3.0643932819, 3.0645000935, 3.0666446686, 3.068046093, 3.0697011948, 3.0717656612, 3.0718104839, 3.0718348026, 3.0733406544, 3.0738227367, 3.0738801956, 3.0738828182, 3.0744686127, 3.0753741264, 3.0758397579, 3.0767207146, 3.0773906708, 3.0778541565, 3.0780284405, 3.0780889988, 3.0782799721, 3.0789675713, 3.0792205334, 3.0793278217, 3.0795567036, 3.0797944069, 3.0806643963, 3.0809247494, 3.0815284252, 3.0817306042, 3.0819730759, 3.0820026398, 3.0838682652, 3.084009409, 3.0848178864, 3.0853257179, 3.0855510235, 3.0856611729, 3.0873703957, 3.0884618759, 3.0891149044, 3.0893011093, 3.0895674229, 3.0901503563, 3.0903317928, 3.0912668705, 3.0920717716, 3.0954346657, 3.096424818, 3.0995628834, 3.1001036167, 3.1173279285, 3.1185023785, 3.1195163727, 3.1568386555, 3.1675374508, 3.1676850319, 3.1886672974, 3.3769197464, 3.3821125031, 3.4780933857, 3.4822063446, 3.4989323616, 3.5076274872, 3.5225863457, 3.5271244049, 3.5298995972, 3.5417425632, 3.5444457531, 3.5465917587, 3.5473103523, 3.5480232239, 3.5526945591, 3.5531234741, 3.5538012981, 3.5544877052, 3.5547749996, 3.5549693108, 3.5550665855, 3.5558729172, 3.5560717583, 3.5560848713, 3.5584278107, 3.558681488, 3.5587313175, 3.5592217445, 3.559384346, 3.5604712963, 3.5634038448, 3.563803196, 3.564593792, 3.5660364628, 3.5683133602, 3.5696356297, 3.569729805, 3.5740811825, 3.5757565498, 3.5760207176, 3.5760478973, 3.5836098194, 3.5839796066, 3.5852358341, 3.5901627541, 3.6141786575, 3.6601481438, 3.7187042236, 3.9741659164, 4.4111995697, 4.5337572098, 4.6292567253, 4.6748633385, 4.6806583405, 4.6868157387, 4.6868577003, 4.6879930496, 4.6888813972, 4.6910686493, 4.6925001144, 4.6957530975, 4.698094368, 4.6997032166, 4.7017259598, 4.7020640373, 4.7024269104, 4.7036352158, 4.7038679123, 4.7042069435, 4.7044086456, 4.7044372559, 4.7050771713, 4.7055773735, 4.7060651779, 4.7062759399, 4.7065420151, 4.70657444, 4.7066287994, 4.7066788673, 4.7067341805, 4.7072944641, 4.7074551582, 4.7075614929, 4.7075891495, 4.7076044083, 4.7077374458, 4.7080879211, 4.70819664, 4.7086658478, 4.708937645, 4.7092385292, 4.709479332, 4.7095656395, 4.7100076675, 4.7102108002, 4.7104525566, 4.7105507851, 4.71118927, 4.7113513947, 4.7115578651, 4.7116270065, 4.7116751671, 4.7117190361, 4.7117333412, 4.7117910385, 4.7119007111, 4.7120013237, 4.712003231, 4.712044239, 4.7122926712, 4.7135767937, 4.7143669128, 4.7145690918, 4.7148418427, 4.7149815559, 4.7159647942, 4.7161884308, 4.7177276611, 4.717815876, 4.718059063, 4.7188801765, 4.7190728188, 4.7199850082, 4.7213058472, 4.7239775658, 4.7243933678, 4.7243990898, 4.7273659706, 4.7294125557, 4.7296204567, 4.7325615883, 4.7356877327, 4.740146637, 4.742254734, 4.7433848381, 4.7454957962, 4.7462964058, 4.7692604065, 4.7884969711, 4.8024044037};
int elements = 255;
std::random_shuffle(input2,input2+255);
printf("Second test, with 255 items\n");
// copy the input data to the GPU
cudaCheck(cudaMemcpy(gpu_input, input2, sizeof(FLOAT) * elements, cudaMemcpyHostToDevice));

// convert the vectors from cylindrical to cartesian coordinates, on the GPU
testWrapper(gpu_input, gpu_product, elements);
cudaCheck(cudaDeviceSynchronize());

// free the GPU memory
cudaCheck(cudaFree(gpu_input));
cudaCheck(cudaFree(gpu_product));


return 0;
}

0 comments on commit d5faa3d

Please sign in to comment.