Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix radix sort test #34929

Merged
merged 1 commit into from
Aug 18, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 49 additions & 9 deletions HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstdint>
#include <iomanip>
#include <iostream>
#include <limits>
#include <memory>
#include <random>
#include <set>
#include <type_traits>

#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
Expand All @@ -32,6 +34,50 @@ struct RS<float> {
static constexpr int imax = std::numeric_limits<int>::max();
};

// A templated unsigned integer type with N bytes
template <int N>
struct uintN;

template <>
struct uintN<8> {
using type = uint8_t;
};

template <>
struct uintN<16> {
using type = uint16_t;
};

template <>
struct uintN<32> {
using type = uint32_t;
};

template <>
struct uintN<64> {
using type = uint64_t;
};

template <int N>
using uintN_t = typename uintN<N>::type;

// A templated unsigned integer type with the same size as T
template <typename T>
using uintT_t = uintN_t<sizeof(T) * 8>;

// Keep only the `N` most significant bytes of `t`, and set the others to zero
template <int N, typename T, typename SFINAE = std::enable_if_t<N <= sizeof(T)>>
void truncate(T& t) {
const int shift = 8 * (sizeof(T) - N);
union {
T t;
uintT_t<T> u;
} c;
c.t = t;
c.u = c.u >> shift << shift;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm pretty sure this is actually undefined behavior as you are not supposed to set a union using one type and read the union using another type. I believe the recommend way to do this is to use memcpy.

uintT_t<T> u;
memcpy(&u, &t, sizeof(u));
u = u >>shift <<shift;
memcpy(&t, &u, sizeof(t));

I played around with that recently on godbolt (for reasons other than this PR) and found compilers know about memcpy and optimize out the calls and just do the 'right' thing.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think section 11.5.6.3 of the C++ 20 standard demonstrates that it is undefined behavior.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm pretty sure the union pattern is used elsewhere in CMSSW too, so we should address those as well.

t = c.t;
}

template <typename T, int NS = sizeof(T), typename U = T, typename LL = long long>
void go(bool useShared) {
std::mt19937 eng;
Expand Down Expand Up @@ -100,7 +146,7 @@ void go(bool useShared) {
cudaCheck(cudaMemcpy(off_d.get(), offsets, 4 * (blocks + 1), cudaMemcpyHostToDevice));

if (i < 2)
std::cout << "lauch for " << offsets[blocks] << std::endl;
std::cout << "launch for " << offsets[blocks] << std::endl;

auto ntXBl __attribute__((unused)) = 1 == i % 4 ? 256 : 256;

Expand Down Expand Up @@ -138,14 +184,8 @@ void go(bool useShared) {
auto a = v + offsets[ib];
auto k1 = a[ind[j]];
auto k2 = a[ind[j - 1]];
auto sh = sizeof(uint64_t) - NS;
sh *= 8;
auto shorten = [sh](T& t) {
auto k = (uint64_t*)(&t);
*k = (*k >> sh) << sh;
};
shorten(k1);
shorten(k2);
truncate<NS>(k1);
truncate<NS>(k2);
if (k1 < k2)
std::cout << ib << " not ordered at " << ind[j] << " : " << a[ind[j]] << ' ' << a[ind[j - 1]] << std::endl;
}
Expand Down