Skip to content

Commit

Permalink
Delete some obsolete cruft
Browse files Browse the repository at this point in the history
The SSE/AVX code was written in 2012 and mostly hasn't been touched since then.
In the time since then, runtime checking for SSE4.2 support has become
pointless (CPUs supporting it were first released in 2008, so it was somewhat
recent in 2012 and now is not at all recent), and we never actually wrote
anything which requires AVX. There was also some SSE code which has been
disabled since 2012.

set_string_compare_method() and everyhing related to it has never actually been
used by any SDK, and is not really the correct solution to the problem anyway.

The encryption code used `ssize_t` in a few places because it was originally
written to use `pread` and friends, but the functions it now calls use `size_t`
and `ssize_t` doesn't exist on Windows. The use of ssize_t in the Windows
networking code was simply incorrect; the winsock functions return `int`.
  • Loading branch information
tgoyne authored and ironage committed Mar 15, 2023
1 parent 134c9c4 commit 5f2dda1
Show file tree
Hide file tree
Showing 15 changed files with 346 additions and 1,504 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@
### Compatibility
* Fileformat: Generates files with format v24. Reads and automatically upgrade from fileformat v10. If you want to upgrade from an earlier file format version you will have to use RealmCore v13.x.y or earlier.

-----------

### Internals
* Remove `set_string_compare_method()` and everything related to it, which has never actually been used and was a bad solution to the problem it tried to solve.
* Remove runtime CPUID checking as the most recent CPU features we rely on are now 15 years old.

----------------------------------------------
# 13.4.2 Release notes

Expand Down
177 changes: 79 additions & 98 deletions src/realm/array.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -686,23 +686,6 @@ int64_t Array::sum(size_t start, size_t end) const

for (size_t t = 0; t < chunks; t++) {
if (w == 1) {

#if 0
#if defined(USE_SSE42) && defined(_MSC_VER) && defined(REALM_PTR_64)
s += __popcnt64(data[t]);
#elif !defined(_MSC_VER) && defined(USE_SSE42) && defined(REALM_PTR_64)
s += __builtin_popcountll(data[t]);
#else
uint64_t a = data[t];
const uint64_t m1 = 0x5555555555555555ULL;
a -= (a >> 1) & m1;
a = (a & m2) + ((a >> 2) & m2);
a = (a + (a >> 4)) & m4;
a = (a * h01) >> 56;
s += a;
#endif
#endif

s += fast_popcount64(data[t]);
}
else if (w == 2) {
Expand All @@ -724,91 +707,89 @@ int64_t Array::sum(size_t start, size_t end) const
}

#ifdef REALM_COMPILER_SSE
if (sseavx<42>()) {

// 2000 items summed 500000 times, 8/16/32 bits, miliseconds:
// Naive, templated get<>: 391 371 374
// SSE: 97 148 282

if ((w == 8 || w == 16 || w == 32) && end - start > sizeof(__m128i) * 8 / no0(w)) {
__m128i* data = reinterpret_cast<__m128i*>(m_data + start * w / 8);
__m128i sum_result = {0};
__m128i sum2;

size_t chunks = (end - start) * w / 8 / sizeof(__m128i);

for (size_t t = 0; t < chunks; t++) {
if (w == 8) {
/*
// 469 ms AND disadvantage of handling max 64k elements before overflow
__m128i vl = _mm_cvtepi8_epi16(data[t]);
__m128i vh = data[t];
vh.m128i_i64[0] = vh.m128i_i64[1];
vh = _mm_cvtepi8_epi16(vh);
sum_result = _mm_add_epi16(sum_result, vl);
sum_result = _mm_add_epi16(sum_result, vh);
*/

/*
// 424 ms
__m128i vl = _mm_unpacklo_epi8(data[t], _mm_set1_epi8(0));
__m128i vh = _mm_unpackhi_epi8(data[t], _mm_set1_epi8(0));
sum_result = _mm_add_epi32(sum_result, _mm_madd_epi16(vl, _mm_set1_epi16(1)));
sum_result = _mm_add_epi32(sum_result, _mm_madd_epi16(vh, _mm_set1_epi16(1)));
*/

__m128i vl = _mm_cvtepi8_epi16(data[t]); // sign extend lower words 8->16
__m128i vh = data[t];
vh = _mm_srli_si128(vh, 8); // v >>= 64
vh = _mm_cvtepi8_epi16(vh); // sign extend lower words 8->16
__m128i sum1 = _mm_add_epi16(vl, vh);
__m128i sumH = _mm_cvtepi16_epi32(sum1);
__m128i sumL = _mm_srli_si128(sum1, 8); // v >>= 64
sumL = _mm_cvtepi16_epi32(sumL);
sum_result = _mm_add_epi32(sum_result, sumL);
sum_result = _mm_add_epi32(sum_result, sumH);
}
else if (w == 16) {
// todo, can overflow for array size > 2^32
__m128i vl = _mm_cvtepi16_epi32(data[t]); // sign extend lower words 16->32
__m128i vh = data[t];
vh = _mm_srli_si128(vh, 8); // v >>= 64
vh = _mm_cvtepi16_epi32(vh); // sign extend lower words 16->32
sum_result = _mm_add_epi32(sum_result, vl);
sum_result = _mm_add_epi32(sum_result, vh);
}
else if (w == 32) {
__m128i v = data[t];
__m128i v0 = _mm_cvtepi32_epi64(v); // sign extend lower dwords 32->64
v = _mm_srli_si128(v, 8); // v >>= 64
__m128i v1 = _mm_cvtepi32_epi64(v); // sign extend lower dwords 32->64
sum_result = _mm_add_epi64(sum_result, v0);
sum_result = _mm_add_epi64(sum_result, v1);

/*
__m128i m = _mm_set1_epi32(0xc000); // test if overflow could happen (still need
underflow test).
__m128i mm = _mm_and_si128(data[t], m);
zz = _mm_or_si128(mm, zz);
sum_result = _mm_add_epi32(sum_result, data[t]);
*/
}
}
start += sizeof(__m128i) * 8 / no0(w) * chunks;

// prevent taking address of 'state' to make the compiler keep it in SSE register in above loop
// (vc2010/gcc4.6)
sum2 = sum_result;
// 2000 items summed 500000 times, 8/16/32 bits, miliseconds:
// Naive, templated get<>: 391 371 374
// SSE: 97 148 282

if ((w == 8 || w == 16 || w == 32) && end - start > sizeof(__m128i) * 8 / no0(w)) {
__m128i* data = reinterpret_cast<__m128i*>(m_data + start * w / 8);
__m128i sum_result = {0};
__m128i sum2;

// Avoid aliasing bug where sum2 might not yet be initialized when accessed by get_universal
char sum3[sizeof sum2];
memcpy(&sum3, &sum2, sizeof sum2);
size_t chunks = (end - start) * w / 8 / sizeof(__m128i);

// Sum elements of sum
for (size_t t = 0; t < sizeof(__m128i) * 8 / ((w == 8 || w == 16) ? 32 : 64); ++t) {
int64_t v = get_universal < (w == 8 || w == 16) ? 32 : 64 > (reinterpret_cast<char*>(&sum3), t);
s += v;
for (size_t t = 0; t < chunks; t++) {
if (w == 8) {
/*
// 469 ms AND disadvantage of handling max 64k elements before overflow
__m128i vl = _mm_cvtepi8_epi16(data[t]);
__m128i vh = data[t];
vh.m128i_i64[0] = vh.m128i_i64[1];
vh = _mm_cvtepi8_epi16(vh);
sum_result = _mm_add_epi16(sum_result, vl);
sum_result = _mm_add_epi16(sum_result, vh);
*/

/*
// 424 ms
__m128i vl = _mm_unpacklo_epi8(data[t], _mm_set1_epi8(0));
__m128i vh = _mm_unpackhi_epi8(data[t], _mm_set1_epi8(0));
sum_result = _mm_add_epi32(sum_result, _mm_madd_epi16(vl, _mm_set1_epi16(1)));
sum_result = _mm_add_epi32(sum_result, _mm_madd_epi16(vh, _mm_set1_epi16(1)));
*/

__m128i vl = _mm_cvtepi8_epi16(data[t]); // sign extend lower words 8->16
__m128i vh = data[t];
vh = _mm_srli_si128(vh, 8); // v >>= 64
vh = _mm_cvtepi8_epi16(vh); // sign extend lower words 8->16
__m128i sum1 = _mm_add_epi16(vl, vh);
__m128i sumH = _mm_cvtepi16_epi32(sum1);
__m128i sumL = _mm_srli_si128(sum1, 8); // v >>= 64
sumL = _mm_cvtepi16_epi32(sumL);
sum_result = _mm_add_epi32(sum_result, sumL);
sum_result = _mm_add_epi32(sum_result, sumH);
}
else if (w == 16) {
// todo, can overflow for array size > 2^32
__m128i vl = _mm_cvtepi16_epi32(data[t]); // sign extend lower words 16->32
__m128i vh = data[t];
vh = _mm_srli_si128(vh, 8); // v >>= 64
vh = _mm_cvtepi16_epi32(vh); // sign extend lower words 16->32
sum_result = _mm_add_epi32(sum_result, vl);
sum_result = _mm_add_epi32(sum_result, vh);
}
else if (w == 32) {
__m128i v = data[t];
__m128i v0 = _mm_cvtepi32_epi64(v); // sign extend lower dwords 32->64
v = _mm_srli_si128(v, 8); // v >>= 64
__m128i v1 = _mm_cvtepi32_epi64(v); // sign extend lower dwords 32->64
sum_result = _mm_add_epi64(sum_result, v0);
sum_result = _mm_add_epi64(sum_result, v1);

/*
__m128i m = _mm_set1_epi32(0xc000); // test if overflow could happen (still need
underflow test).
__m128i mm = _mm_and_si128(data[t], m);
zz = _mm_or_si128(mm, zz);
sum_result = _mm_add_epi32(sum_result, data[t]);
*/
}
}
start += sizeof(__m128i) * 8 / no0(w) * chunks;

// prevent taking address of 'state' to make the compiler keep it in SSE register in above loop
// (vc2010/gcc4.6)
sum2 = sum_result;

// Avoid aliasing bug where sum2 might not yet be initialized when accessed by get_universal
char sum3[sizeof sum2];
memcpy(&sum3, &sum2, sizeof sum2);

// Sum elements of sum
for (size_t t = 0; t < sizeof(__m128i) * 8 / ((w == 8 || w == 16) ? 32 : 64); ++t) {
int64_t v = get_universal < (w == 8 || w == 16) ? 32 : 64 > (reinterpret_cast<char*>(&sum3), t);
s += v;
}
}
#endif
Expand Down
1 change: 0 additions & 1 deletion src/realm/array_with_find.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,5 +79,4 @@ size_t ArrayWithFind::first_set_bit64(int64_t v) const
return first_set_bit(v1) + 32;
}


} // namespace realm
26 changes: 8 additions & 18 deletions src/realm/array_with_find.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -369,11 +369,10 @@ bool ArrayWithFind::find_optimized(int64_t value, size_t start, size_t end, size
REALM_ASSERT_3(m_array.m_width, !=, 0);

#if defined(REALM_COMPILER_SSE)
// Only use SSE if payload is at least one SSE chunk (128 bits) in size. Also note taht SSE doesn't support
// Only use SSE if payload is at least one SSE chunk (128 bits) in size. Also note that SSE doesn't support
// Less-than comparison for 64-bit values.
if ((!(std::is_same<cond, Less>::value && m_array.m_width == 64)) && end - start2 >= sizeof(__m128i) &&
m_array.m_width >= 8 &&
(sseavx<42>() || (sseavx<30>() && std::is_same<cond, Equal>::value && m_array.m_width < 64))) {
if ((!(std::is_same_v<cond, Less> && m_array.m_width == 64)) && end - start2 >= sizeof(__m128i) &&
m_array.m_width >= 8) {

// find_sse() must start2 at 16-byte boundary, so search area before that using compare_equality()
__m128i* const a =
Expand All @@ -388,19 +387,10 @@ bool ArrayWithFind::find_optimized(int64_t value, size_t start, size_t end, size

// Search aligned area with SSE
if (b > a) {
if (sseavx<42>()) {
if (!find_sse<cond, bitwidth, Callback>(
value, a, b - a, state,
baseindex + ((reinterpret_cast<char*>(a) - m_array.m_data) * 8 / no0(bitwidth)), callback))
return false;
}
else if (sseavx<30>()) {

if (!find_sse<Equal, bitwidth, Callback>(
value, a, b - a, state,
baseindex + ((reinterpret_cast<char*>(a) - m_array.m_data) * 8 / no0(bitwidth)), callback))
return false;
}
if (!find_sse<cond, bitwidth, Callback>(
value, a, b - a, state,
baseindex + ((reinterpret_cast<char*>(a) - m_array.m_data) * 8 / no0(bitwidth)), callback))
return false;
}

// Search remainder with compare_equality()
Expand Down Expand Up @@ -909,7 +899,7 @@ bool ArrayWithFind::compare_leafs_4(const Array* foreign, size_t start, size_t e


#if defined(REALM_COMPILER_SSE)
if (sseavx<42>() && width == foreign_width && (width == 8 || width == 16 || width == 32)) {
if (width == foreign_width && (width == 8 || width == 16 || width == 32)) {
// We can only use SSE if both bitwidths are equal and above 8 bits and all values are signed
// and the two arrays are aligned the same way
if ((reinterpret_cast<size_t>(m_array.m_data) & 0xf) == (reinterpret_cast<size_t>(foreign_m_data) & 0xf)) {
Expand Down
14 changes: 0 additions & 14 deletions src/realm/group.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,20 +39,6 @@
using namespace realm;
using namespace realm::util;

namespace {

class Initialization {
public:
Initialization()
{
realm::cpuid_init();
}
};

Initialization initialization;

} // anonymous namespace

Group::Group()
: m_local_alloc(new SlabAlloc)
, m_alloc(*m_local_alloc) // Throws
Expand Down
81 changes: 0 additions & 81 deletions src/realm/realm_nmmintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,87 +36,6 @@

namespace realm {

#if 0
#ifdef REALM_COMPILER_AVX
typedef float __m256 __attribute__((__vector_size__(32), __may_alias__));
typedef double __m256d __attribute__((__vector_size__(32), __may_alias__));

const int _CMP_EQ_OQ = 0x00; // Equal (ordered, non-signaling)
const int _CMP_NEQ_OQ = 0x0c; // Not-equal (ordered, non-signaling)
const int _CMP_LT_OQ = 0x11; // Less-than (ordered, non-signaling)
const int _CMP_LE_OQ = 0x12; // Less-than-or-equal (ordered, non-signaling)
const int _CMP_GE_OQ = 0x1d; // Greater-than-or-equal (ordered, non-signaling)
const int _CMP_GT_OQ = 0x1e; // Greater-than (ordered, non-signaling)


template<int op>
static int movemask_cmp_ps(__m256* y1, __m256* y2)
{
int ret;
__asm__("vmovaps %0, %%ymm0" : : "m"(*y1) : "%xmm0" );
__asm__("vmovaps %0, %%ymm1" : : "m"(*y2) : "%xmm1" );
__asm__("vcmpps %0, %%ymm0, %%ymm1, %%ymm0" : : "I"(op) : "%xmm0" );
__asm__("vmovmskps %%ymm0, %0" : "=r"(ret) : : );
return ret;
}

template<int op>
static inline int movemask_cmp_pd(__m256d* y1, __m256d* y2)
{
int ret;
__asm__("vmovapd %0, %%ymm0" : : "m"(*y1) : "%xmm0" );
__asm__("vmovapd %0, %%ymm1" : : "m"(*y2) : "%xmm1" );
__asm__("vcmppd %0, %%ymm0, %%ymm1, %%ymm0" : : "I"(op) : "%xmm0" );
__asm__("vmovmskpd %%ymm0, %0" : "=r"(ret) : : );
return ret;
}



static inline int movemask_cmp_ps(__m256* y1, __m256* y2, int op)
{
// todo, use constexpr;
if (op == _CMP_EQ_OQ)
return movemask_cmp_ps<_CMP_NEQ_OQ>(y1, y2);
else if (op == _CMP_NEQ_OQ)
return movemask_cmp_ps<_CMP_NEQ_OQ>(y1, y2);
else if (op == _CMP_LT_OQ)
return movemask_cmp_ps<_CMP_LT_OQ>(y1, y2);
else if (op == _CMP_LE_OQ)
return movemask_cmp_ps<_CMP_LE_OQ>(y1, y2);
else if (op == _CMP_GE_OQ)
return movemask_cmp_ps<_CMP_GE_OQ>(y1, y2);
else if (op == _CMP_GT_OQ)
return movemask_cmp_ps<_CMP_GT_OQ>(y1, y2);

REALM_ASSERT(false);
return 0;
}

static inline int movemask_cmp_pd(__m256d* y1, __m256d* y2, int op)
{
// todo, use constexpr;
if (op == _CMP_EQ_OQ)
return movemask_cmp_pd<_CMP_NEQ_OQ>(y1, y2);
else if (op == _CMP_NEQ_OQ)
return movemask_cmp_pd<_CMP_NEQ_OQ>(y1, y2);
else if (op == _CMP_LT_OQ)
return movemask_cmp_pd<_CMP_LT_OQ>(y1, y2);
else if (op == _CMP_LE_OQ)
return movemask_cmp_pd<_CMP_LE_OQ>(y1, y2);
else if (op == _CMP_GE_OQ)
return movemask_cmp_pd<_CMP_GE_OQ>(y1, y2);
else if (op == _CMP_GT_OQ)
return movemask_cmp_pd<_CMP_GT_OQ>(y1, y2);

REALM_ASSERT(false);
return 0;
}


#endif
#endif

// Instructions introduced by SSE 3 and 4.2
static inline __m128i _mm_cmpgt_epi64(__m128i xmm1, __m128i xmm2)
{
Expand Down
4 changes: 2 additions & 2 deletions src/realm/sync/network/network.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1943,7 +1943,7 @@ std::size_t Service::Descriptor::read_some(char* buffer, std::size_t size, std::
for (;;) {
int flags = 0;
#ifdef _WIN32
ssize_t ret = ::recv(m_fd, buffer, int(size), flags);
int ret = ::recv(m_fd, buffer, int(size), flags);
if (ret == SOCKET_ERROR) {
int err = WSAGetLastError();
// Retry on interruption by system signal
Expand Down Expand Up @@ -2028,7 +2028,7 @@ std::size_t Service::Descriptor::write_some(const char* data, std::size_t size,
flags |= MSG_NOSIGNAL;
#endif
#ifdef _WIN32
ssize_t ret = ::send(m_fd, data, int(size), flags);
int ret = ::send(m_fd, data, int(size), flags);
if (ret == SOCKET_ERROR) {
int err = WSAGetLastError();
// Retry on interruption by system signal
Expand Down
Loading

0 comments on commit 5f2dda1

Please sign in to comment.