Skip to content

Commit

Permalink
Changes needed to build AVX2 with clang-cl on Windows
Browse files Browse the repository at this point in the history
A few __clang__ checks, and switching _mm_prefetch to operate on const char * as
its signature requires on Windows

# Conflicts:
#	faiss/utils/distances_simd.cpp
  • Loading branch information
borrrden committed Feb 13, 2024
1 parent ebb5f84 commit 9dfcd39
Show file tree
Hide file tree
Showing 5 changed files with 21 additions and 15 deletions.
4 changes: 3 additions & 1 deletion faiss/impl/LocalSearchQuantizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -628,7 +628,9 @@ void LocalSearchQuantizer::icm_encode_step(
{
size_t binary_idx = (other_m + 1) * M * K * K +
m * K * K + code2 * K + code;
_mm_prefetch(binaries + binary_idx, _MM_HINT_T0);
_mm_prefetch(
(const char*)(binaries + binary_idx),
_MM_HINT_T0);
}
}
#endif
Expand Down
4 changes: 4 additions & 0 deletions faiss/impl/platform_macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,13 @@

#include <intrin.h>

#ifndef __clang__
inline int __builtin_ctzll(uint64_t x) {
unsigned long ret;
_BitScanForward64(&ret, x);
return (int)ret;
}
#endif

// cudatoolkit provides __builtin_ctz for NVCC >= 11.0
#if !defined(__CUDACC__) || __CUDACC_VER_MAJOR__ < 11
Expand All @@ -55,9 +57,11 @@ inline int __builtin_ctz(unsigned long x) {
}
#endif

#ifndef __clang__
inline int __builtin_clzll(uint64_t x) {
return (int)__lzcnt64(x);
}
#endif

#define __builtin_popcount __popcnt
#define __builtin_popcountl __popcnt64
Expand Down
8 changes: 4 additions & 4 deletions faiss/utils/distances.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -417,8 +417,8 @@ void exhaustive_L2sqr_blas_cmax_avx2(
for (int64_t i = i0; i < i1; i++) {
float* ip_line = ip_block.get() + (i - i0) * (j1 - j0);

_mm_prefetch(ip_line, _MM_HINT_NTA);
_mm_prefetch(ip_line + 16, _MM_HINT_NTA);
_mm_prefetch((const char*)ip_line, _MM_HINT_NTA);
_mm_prefetch((const char*)(ip_line + 16), _MM_HINT_NTA);

// constant
const __m256 mul_minus2 = _mm256_set1_ps(-2);
Expand All @@ -445,8 +445,8 @@ void exhaustive_L2sqr_blas_cmax_avx2(

// process 16 elements per loop
for (; idx_j < (count / 16) * 16; idx_j += 16, ip_line += 16) {
_mm_prefetch(ip_line + 32, _MM_HINT_NTA);
_mm_prefetch(ip_line + 48, _MM_HINT_NTA);
_mm_prefetch((const char*)(ip_line + 32), _MM_HINT_NTA);
_mm_prefetch((const char*)(ip_line + 48), _MM_HINT_NTA);

// load values for norms
const __m256 y_norm_0 =
Expand Down
2 changes: 1 addition & 1 deletion faiss/utils/distances_fused/simdlib_based.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ void kernel(

// prefetch the next point
#if defined(__AVX2__)
_mm_prefetch(xd_0 + DIM * sizeof(float), _MM_HINT_NTA);
_mm_prefetch((const char*)(xd_0 + DIM * sizeof(float)), _MM_HINT_NTA);
#endif

// load a single point from x
Expand Down
18 changes: 9 additions & 9 deletions faiss/utils/distances_simd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -439,14 +439,14 @@ void fvec_op_ny_D2<ElementOpIP>(

if (ny8 > 0) {
// process 8 D2-vectors per loop.
_mm_prefetch(y, _MM_HINT_T0);
_mm_prefetch(y + 16, _MM_HINT_T0);
_mm_prefetch((const char*)y, _MM_HINT_T0);
_mm_prefetch((const char*)(y + 16), _MM_HINT_T0);

const __m256 m0 = _mm256_set1_ps(x[0]);
const __m256 m1 = _mm256_set1_ps(x[1]);

for (i = 0; i < ny8 * 8; i += 8) {
_mm_prefetch(y + 32, _MM_HINT_T0);
_mm_prefetch((const char*)(y + 32), _MM_HINT_T0);

// load 8x2 matrix and transpose it in registers.
// the typical bottleneck is memory access, so
Expand Down Expand Up @@ -496,14 +496,14 @@ void fvec_op_ny_D2<ElementOpL2>(

if (ny8 > 0) {
// process 8 D2-vectors per loop.
_mm_prefetch(y, _MM_HINT_T0);
_mm_prefetch(y + 16, _MM_HINT_T0);
_mm_prefetch((const char*)y, _MM_HINT_T0);
_mm_prefetch((const char*)(y + 16), _MM_HINT_T0);

const __m256 m0 = _mm256_set1_ps(x[0]);
const __m256 m1 = _mm256_set1_ps(x[1]);

for (i = 0; i < ny8 * 8; i += 8) {
_mm_prefetch(y + 32, _MM_HINT_T0);
_mm_prefetch((const char*)(y + 32), _MM_HINT_T0);

// load 8x2 matrix and transpose it in registers.
// the typical bottleneck is memory access, so
Expand Down Expand Up @@ -1084,8 +1084,8 @@ size_t fvec_L2sqr_ny_nearest_D2(
// process 8 D2-vectors per loop.
const size_t ny8 = ny / 8;
if (ny8 > 0) {
_mm_prefetch(y, _MM_HINT_T0);
_mm_prefetch(y + 16, _MM_HINT_T0);
_mm_prefetch((const char*)y, _MM_HINT_T0);
_mm_prefetch((const char*)(y + 16), _MM_HINT_T0);

// track min distance and the closest vector independently
// for each of 8 AVX2 components.
Expand All @@ -1100,7 +1100,7 @@ size_t fvec_L2sqr_ny_nearest_D2(
const __m256 m1 = _mm256_set1_ps(x[1]);

for (; i < ny8 * 8; i += 8) {
_mm_prefetch(y + 32, _MM_HINT_T0);
_mm_prefetch((const char*)(y + 32), _MM_HINT_T0);

__m256 v0;
__m256 v1;
Expand Down

0 comments on commit 9dfcd39

Please sign in to comment.