From 686a3db3d3aa5bf013cffeb86dcf0c8d66902ffb Mon Sep 17 00:00:00 2001 From: Johannes M Dieterich Date: Fri, 3 Nov 2023 16:54:44 -0500 Subject: [PATCH] Progress for WF32. --- faiss/hip/CMakeLists.txt | 2 +- faiss/hip/GpuIndex.hip | 2 +- faiss/hip/GpuResources.cpp | 2 +- faiss/hip/StandardGpuResources.cpp | 21 +- faiss/hip/impl/BinaryDistance.hip | 2 + faiss/hip/impl/BroadcastSum.hip | 6 +- faiss/hip/impl/DistanceUtils.h | 2 +- faiss/hip/impl/GeneralDistance.h | 3 +- faiss/hip/impl/IVFAppend.hip | 13 +- faiss/hip/impl/IVFFlatScan.hip | 7 +- faiss/hip/impl/IVFInterleaved.hip | 1 + faiss/hip/impl/IVFUtils.hip | 4 +- faiss/hip/impl/IVFUtilsSelect1.hip | 11 +- faiss/hip/impl/IVFUtilsSelect2.hip | 11 +- faiss/hip/impl/IcmEncoder.hip | 6 + faiss/hip/impl/L2Norm.hip | 4 +- faiss/hip/impl/L2Select.hip | 4 +- faiss/hip/impl/PQCodeDistances-inl.h | 6 +- faiss/hip/impl/PQCodeLoad.h | 244 ++++++------------ .../impl/PQScanMultiPassNoPrecomputed-inl.h | 8 +- faiss/hip/impl/PQScanMultiPassPrecomputed.hip | 10 +- faiss/hip/impl/VectorResidual.hip | 6 +- faiss/hip/impl/scan/IVFInterleavedImpl.h | 2 +- .../scan/IVFInterleavedScanKernelTemplate.hip | 1 + faiss/hip/perf/PerfBinaryFlat.hip | 6 +- faiss/hip/perf/PerfClustering.cpp | 6 +- faiss/hip/perf/PerfFlat.hip | 6 +- faiss/hip/perf/PerfIVFFlat.hip | 6 +- faiss/hip/perf/PerfIVFPQ.hip | 8 +- faiss/hip/perf/PerfIVFPQAdd.cpp | 6 +- faiss/hip/test/TestGpuMemoryException.cpp | 2 +- faiss/hip/utils/BlockSelectFloat.hip | 8 +- faiss/hip/utils/CopyUtils.h | 4 +- faiss/hip/utils/DeviceDefs.h | 2 +- faiss/hip/utils/DeviceTensor-inl.h | 2 +- faiss/hip/utils/DeviceUtils.h | 20 +- faiss/hip/utils/DeviceUtils.hip | 28 +- faiss/hip/utils/DeviceVector.h | 14 +- faiss/hip/utils/LoadStoreOperators.h | 29 ++- faiss/hip/utils/MatrixMult-inl.h | 4 +- faiss/hip/utils/MergeNetworkWarp.h | 11 +- faiss/hip/utils/PtxUtils.h | 2 +- faiss/hip/utils/Select.h | 8 - faiss/hip/utils/Tensor-inl.h | 12 +- faiss/hip/utils/Tensor.h | 16 -- faiss/hip/utils/Timer.cpp | 16 +- faiss/hip/utils/Transpose.h | 8 +- faiss/hip/utils/WarpSelectFloat.hip | 12 +- faiss/hip/utils/WarpShuffles.h | 5 - faiss/hip/utils/blockselect/BlockSelectImpl.h | 4 +- faiss/hip/utils/warpselect/WarpSelectImpl.h | 2 +- 51 files changed, 267 insertions(+), 358 deletions(-) diff --git a/faiss/hip/CMakeLists.txt b/faiss/hip/CMakeLists.txt index 5a3f95689b..4e36ca81f0 100644 --- a/faiss/hip/CMakeLists.txt +++ b/faiss/hip/CMakeLists.txt @@ -191,7 +191,7 @@ function(generate_ivf_interleaved_code) "128|1|1" "128|128|3" "128|256|4" - "128|64|2" #"128|32|2" TODO won't compile with a warpsize of 64. Changed to 64 + "128|32|2" #"128|32|2" TODO won't compile with a warpsize of 64. 
Re-enabled for wavefront size 32.
     "128|512|8"
     "128|64|3"
     "64|2048|8"
diff --git a/faiss/hip/GpuIndex.hip b/faiss/hip/GpuIndex.hip
index 821c873ad5..cba5e57268 100644
--- a/faiss/hip/GpuIndex.hip
+++ b/faiss/hip/GpuIndex.hip
@@ -397,7 +397,7 @@ void GpuIndex::searchFromCpuPaged_(
             eventPrev->streamWaitOnEvent(copyStream);
         }
 
-        CUDA_VERIFY(hipMemcpyAsync(
+        HIP_VERIFY(hipMemcpyAsync(
                 bufGpus[cur2BufIndex]->data(),
                 bufPinned[cur2BufIndex],
                 numToCopy * this->d * sizeof(float),
diff --git a/faiss/hip/GpuResources.cpp b/faiss/hip/GpuResources.cpp
index d3e4eed749..a9b58cd4a5 100644
--- a/faiss/hip/GpuResources.cpp
+++ b/faiss/hip/GpuResources.cpp
@@ -183,7 +183,7 @@ hipStream_t GpuResources::getAsyncCopyStreamCurrentDevice() {
 }
 
 void GpuResources::syncDefaultStream(int device) {
-    CUDA_VERIFY(hipStreamSynchronize(getDefaultStream(device)));
+    HIP_VERIFY(hipStreamSynchronize(getDefaultStream(device)));
 }
 
 void GpuResources::syncDefaultStreamCurrentDevice() {
diff --git a/faiss/hip/StandardGpuResources.cpp b/faiss/hip/StandardGpuResources.cpp
index df94db79c7..75448188c2 100644
--- a/faiss/hip/StandardGpuResources.cpp
+++ b/faiss/hip/StandardGpuResources.cpp
@@ -134,21 +134,21 @@ StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
         DeviceScope scope(entry.first);
 
         // We created these streams, so are responsible for destroying them
-        CUDA_VERIFY(hipStreamDestroy(entry.second));
+        HIP_VERIFY(hipStreamDestroy(entry.second));
     }
 
     for (auto& entry : alternateStreams_) {
         DeviceScope scope(entry.first);
 
         for (auto stream : entry.second) {
-            CUDA_VERIFY(hipStreamDestroy(stream));
+            HIP_VERIFY(hipStreamDestroy(stream));
         }
     }
 
     for (auto& entry : asyncCopyStreams_) {
         DeviceScope scope(entry.first);
 
-        CUDA_VERIFY(hipStreamDestroy(entry.second));
+        HIP_VERIFY(hipStreamDestroy(entry.second));
     }
 
     for (auto& entry : blasHandles_) {
@@ -349,20 +349,21 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
             prop.minor);
 
     // Our code is pre-built with and expects warpSize == 32, validate that
-    FAISS_ASSERT_FMT(
-            prop.warpSize == 64,
-            "Device id %d does not have expected warpSize of 64",
-            device);
+    // We can have either warp size (32 or 64), so this check is disabled
+    //FAISS_ASSERT_FMT(
+    //        prop.warpSize == 64,
+    //        "Device id %d does not have expected warpSize of 64",
+    //        device);
 
     // Create streams
     hipStream_t defaultStream = 0;
-    CUDA_VERIFY(
+    HIP_VERIFY(
             hipStreamCreateWithFlags(&defaultStream, hipStreamNonBlocking));
 
    defaultStreams_[device] = defaultStream;
 
    hipStream_t asyncCopyStream = 0;
-    CUDA_VERIFY(
+    HIP_VERIFY(
            hipStreamCreateWithFlags(&asyncCopyStream, hipStreamNonBlocking));
 
    asyncCopyStreams_[device] = asyncCopyStream;
 
@@ -370,7 +371,7 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
     std::vector<hipStream_t> deviceStreams;
     for (int j = 0; j < kNumStreams; ++j) {
         hipStream_t stream = 0;
-        CUDA_VERIFY(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
+        HIP_VERIFY(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
 
         deviceStreams.push_back(stream);
     }
diff --git a/faiss/hip/impl/BinaryDistance.hip b/faiss/hip/impl/BinaryDistance.hip
index d1c2327d65..a1e31bf9e0 100644
--- a/faiss/hip/impl/BinaryDistance.hip
+++ b/faiss/hip/impl/BinaryDistance.hip
@@ -228,6 +228,7 @@ void runBinaryDistanceAnySize(
                 <<<grid, block, 0, stream>>>(vecs, query, outK, outV, k);
     }
 #endif
+    HIP_TEST_ERROR();
 }
 
 template
@@ -269,6 +270,7 @@ void runBinaryDistanceLimitSize(
                 <<<grid, block, 0, stream>>>(vecs, query, outK, outV, k);
     }
 #endif
+    HIP_TEST_ERROR();
 }
 
 void runBinaryDistance(
diff --git a/faiss/hip/impl/BroadcastSum.hip b/faiss/hip/impl/BroadcastSum.hip
index 
91dfd70f20..0f9a4d0208 100644 --- a/faiss/hip/impl/BroadcastSum.hip +++ b/faiss/hip/impl/BroadcastSum.hip @@ -265,7 +265,7 @@ void runSumAlongColumns( <<>>(input, output); } - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } void runSumAlongColumns( @@ -320,7 +320,7 @@ void runAssignAlongColumns( <<>>(input, output); } - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } void runAssignAlongColumns( @@ -356,7 +356,7 @@ void runSumAlongRows( sumAlongRows<<>>(input, output); } - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } void runSumAlongRows( diff --git a/faiss/hip/impl/DistanceUtils.h b/faiss/hip/impl/DistanceUtils.h index d8a35efbb8..070a43eab4 100644 --- a/faiss/hip/impl/DistanceUtils.h +++ b/faiss/hip/impl/DistanceUtils.h @@ -326,7 +326,7 @@ void runIncrementIndex( auto block = std::min(k, getMaxThreadsCurrentDevice()); incrementIndex<<>>(indices, k, increment); - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } // If the inner size (dim) of the vectors is small, we want a larger query tile diff --git a/faiss/hip/impl/GeneralDistance.h b/faiss/hip/impl/GeneralDistance.h index 095dc10cff..529d829422 100644 --- a/faiss/hip/impl/GeneralDistance.h +++ b/faiss/hip/impl/GeneralDistance.h @@ -250,6 +250,7 @@ void runGeneralDistanceKernel( dim3 block(kWarpSize, kWarpSize); generalDistance<<>>(query, vecs, op, out); + HIP_TEST_ERROR(); } template @@ -449,7 +450,7 @@ void runGeneralDistance( FAISS_THROW_MSG("interrupted"); } - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } } // namespace hip diff --git a/faiss/hip/impl/IVFAppend.hip b/faiss/hip/impl/IVFAppend.hip index e7f3e712b3..053f951050 100644 --- a/faiss/hip/impl/IVFAppend.hip +++ b/faiss/hip/impl/IVFAppend.hip @@ -71,7 +71,7 @@ void runUpdateListPointers( listCodes.data(), listIndices.data()); - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } // Appends new indices for vectors being added to the IVF indices lists @@ -124,7 +124,7 @@ void runIVFIndicesAppend( ivfIndicesAppend<<>>( listIds, listOffset, indices, opt, listIndices.data()); - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } } @@ -259,7 +259,7 @@ void runIVFFlatAppend( } } - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); #undef RUN_APPEND } @@ -308,7 +308,7 @@ void runIVFPQAppend( ivfpqAppend<<>>( listIds, listOffset, encodings, listCodes.data()); - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } // @@ -341,6 +341,7 @@ void runSQEncode( idx_t blocks = vecs.getSize(0); sqEncode<<>>(vecs, encodedVecs, codec); + HIP_TEST_ERROR(); } // Handles appending encoded vectors (one per EncodeT word) packed into @@ -584,7 +585,7 @@ void runIVFFlatInterleavedAppend( } #undef RUN_APPEND - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } void runIVFPQInterleavedAppend( @@ -640,7 +641,7 @@ void runIVFPQInterleavedAppend( } #undef RUN_APPEND - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } } // namespace hip diff --git a/faiss/hip/impl/IVFFlatScan.hip b/faiss/hip/impl/IVFFlatScan.hip index f829b67914..d5100b999e 100644 --- a/faiss/hip/impl/IVFFlatScan.hip +++ b/faiss/hip/impl/IVFFlatScan.hip @@ -229,6 +229,7 @@ void runIVFFlatScanTile( metric, \ prefixSumOffsets, \ allDistances); \ + HIP_TEST_ERROR(); \ } while (0) #define HANDLE_METRICS \ @@ -291,7 +292,7 @@ void runIVFFlatScanTile( } } - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); #undef HANDLE_METRICS #undef RUN_IVF_FLAT @@ -414,9 +415,9 @@ void runIVFFlatScan( // Make sure the element before prefixSumOffsets is 0, since we // depend upon simple, boundary-less indexing to get proper results - CUDA_VERIFY(hipMemsetAsync( + HIP_VERIFY(hipMemsetAsync( prefixSumOffsetSpace1.data(), 0, sizeof(idx_t), stream)); - 
CUDA_VERIFY(hipMemsetAsync( + HIP_VERIFY(hipMemsetAsync( prefixSumOffsetSpace2.data(), 0, sizeof(idx_t), stream)); DeviceTensor allDistances1( diff --git a/faiss/hip/impl/IVFInterleaved.hip b/faiss/hip/impl/IVFInterleaved.hip index 6b06518758..17088babf9 100644 --- a/faiss/hip/impl/IVFInterleaved.hip +++ b/faiss/hip/impl/IVFInterleaved.hip @@ -172,6 +172,7 @@ void runIVFInterleavedScan2( IVF_SCAN_2(64, 2048, 8); } #endif + HIP_TEST_ERROR(); } void runIVFInterleavedScan( diff --git a/faiss/hip/impl/IVFUtils.hip b/faiss/hip/impl/IVFUtils.hip index 80faa619ba..d1ed23b324 100644 --- a/faiss/hip/impl/IVFUtils.hip +++ b/faiss/hip/impl/IVFUtils.hip @@ -62,7 +62,7 @@ void runCalcListOffsets( getResultLengths<<>>( ivfListIds, listLengths.data(), totalSize, prefixSumOffsets); - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); // Prefix sum of the indices, so we know where the intermediate // results should be maintained @@ -76,7 +76,7 @@ void runCalcListOffsets( prefixSumOffsets.data(), prefixSumOffsets.data() + totalSize, prefixSumOffsets.data()); - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } } // namespace hip diff --git a/faiss/hip/impl/IVFUtilsSelect1.hip b/faiss/hip/impl/IVFUtilsSelect1.hip index da49763ae2..97c2fd2371 100644 --- a/faiss/hip/impl/IVFUtilsSelect1.hip +++ b/faiss/hip/impl/IVFUtilsSelect1.hip @@ -125,6 +125,7 @@ void runPass1SelectLists( k, \ heapDistances, \ heapIndices); \ + HIP_TEST_ERROR(); \ return; /* success */ \ } while (0) @@ -135,8 +136,8 @@ void runPass1SelectLists( do { \ if (k == 1) { \ RUN_PASS(INDEX_T, 128, 1, 1, DIR); \ - /*} else if (k <= 32) { */ \ - /* RUN_PASS(INDEX_T, 128, 32, 2, DIR); */ \ + } else if (k <= 32) { \ + RUN_PASS(INDEX_T, 128, 32, 2, DIR); \ } else if (k <= 64) { \ RUN_PASS(INDEX_T, 128, 64, 3, DIR); \ } else if (k <= 128) { \ @@ -158,8 +159,8 @@ void runPass1SelectLists( do { \ if (k == 1) { \ RUN_PASS(INDEX_T, 128, 1, 1, DIR); \ - /*} else if (k <= 32) { */ \ - /* RUN_PASS(INDEX_T, 128, 32, 2, DIR); */ \ + } else if (k <= 32) { \ + RUN_PASS(INDEX_T, 128, 32, 2, DIR); \ } else if (k <= 64) { \ RUN_PASS(INDEX_T, 128, 64, 3, DIR); \ } else if (k <= 128) { \ @@ -192,7 +193,7 @@ void runPass1SelectLists( #undef RUN_PASS_DIR #undef RUN_PASS - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } } // namespace hip diff --git a/faiss/hip/impl/IVFUtilsSelect2.hip b/faiss/hip/impl/IVFUtilsSelect2.hip index c3e8668ac5..739876837a 100644 --- a/faiss/hip/impl/IVFUtilsSelect2.hip +++ b/faiss/hip/impl/IVFUtilsSelect2.hip @@ -184,6 +184,7 @@ void runPass2SelectLists( indicesOptions, \ outDistances, \ outIndices); \ + HIP_TEST_ERROR(); \ } while (0) #if GPU_MAX_SELECTION_K >= 2048 @@ -193,8 +194,8 @@ void runPass2SelectLists( do { \ if (k == 1) { \ RUN_PASS(INDEX_T, 128, 1, 1, DIR); \ - /*} else if (k <= 32) { */ \ - /* RUN_PASS(INDEX_T, 128, 32, 2, DIR);*/ \ + } else if (k <= 32) { \ + RUN_PASS(INDEX_T, 128, 32, 2, DIR); \ } else if (k <= 64) { \ RUN_PASS(INDEX_T, 128, 64, 3, DIR); \ } else if (k <= 128) { \ @@ -216,8 +217,8 @@ void runPass2SelectLists( do { \ if (k == 1) { \ RUN_PASS(INDEX_T, 128, 1, 1, DIR); \ - /*} else if (k <= 32) { */ \ - /* RUN_PASS(INDEX_T, 128, 32, 2, DIR); */ \ + } else if (k <= 32) { \ + RUN_PASS(INDEX_T, 128, 32, 2, DIR); \ } else if (k <= 64) { \ RUN_PASS(INDEX_T, 128, 64, 3, DIR); \ } else if (k <= 128) { \ @@ -250,7 +251,7 @@ void runPass2SelectLists( #undef RUN_PASS_DIR #undef RUN_PASS - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } } // namespace hip diff --git a/faiss/hip/impl/IcmEncoder.hip b/faiss/hip/impl/IcmEncoder.hip index 1f0db2491a..78fefb9989 
100644 --- a/faiss/hip/impl/IcmEncoder.hip +++ b/faiss/hip/impl/IcmEncoder.hip @@ -241,6 +241,7 @@ void IcmEncoderImpl::computeUnaryTerms( auto uPtr = uterm + m * n * K; auto nPtr = norm.data() + m * K; runNormAddition<<>>(uPtr, nPtr, K); + HIP_TEST_ERROR(); } } @@ -329,6 +330,7 @@ void IcmEncoderImpl::encode( M, K, dims); + HIP_TEST_ERROR(); int blockSize = 256; int numBlocks = (n + blockSize - 1) / blockSize; @@ -336,6 +338,7 @@ void IcmEncoderImpl::encode( for (int i = 0; i < ilsIters; i++) { runCodesPerturbation<<>>( gen(), codes.data(), n, M, K, nperts); + HIP_TEST_ERROR(); // perform icm encoding for (int j = 0; j < icmIters; j++) { @@ -347,6 +350,7 @@ void IcmEncoderImpl::encode( M, K, m); + HIP_TEST_ERROR(); } } @@ -360,6 +364,7 @@ void IcmEncoderImpl::encode( M, K, dims); + HIP_TEST_ERROR(); // if objs[i] < best_objs[i], replace best_codes[i] with codes[i] runCodesSelection<<>>( @@ -369,6 +374,7 @@ void IcmEncoderImpl::encode( objs.data(), n, M); + HIP_TEST_ERROR(); codes.copyFrom(bestCodes, stream); } diff --git a/faiss/hip/impl/L2Norm.hip b/faiss/hip/impl/L2Norm.hip index c66ad6b52c..61c31ffe52 100644 --- a/faiss/hip/impl/L2Norm.hip +++ b/faiss/hip/impl/L2Norm.hip @@ -239,6 +239,7 @@ void runL2Norm( <<>>(INPUT, output); \ } \ } \ + HIP_TEST_ERROR(); \ } while (0) if (inputRowMajor) { @@ -295,11 +296,12 @@ void runL2Norm( } else { l2NormColMajor<<>>(input, output); } + HIP_TEST_ERROR(); } #undef RUN_L2 - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } void runL2Norm( diff --git a/faiss/hip/impl/L2Select.hip b/faiss/hip/impl/L2Select.hip index 35975e41de..f56632eee8 100644 --- a/faiss/hip/impl/L2Select.hip +++ b/faiss/hip/impl/L2Select.hip @@ -215,6 +215,7 @@ void runL2SelectMin( centroidDistances, outDistances, outIndices); + HIP_TEST_ERROR(); } else { auto grid = dim3(outDistances.getSize(0)); @@ -240,6 +241,7 @@ void runL2SelectMin( } else { \ L2_KERNEL(int32_t, BLOCK, NUM_WARP_Q, NUM_THREAD_Q); \ } \ + HIP_TEST_ERROR(); \ } while (false) // block size 128 for everything <= 1024 @@ -269,7 +271,7 @@ void runL2SelectMin( #undef L2_KERNEL #undef RUN_L2_SELECT - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } void runL2SelectMin( diff --git a/faiss/hip/impl/PQCodeDistances-inl.h b/faiss/hip/impl/PQCodeDistances-inl.h index bafff11552..345e7abf29 100644 --- a/faiss/hip/impl/PQCodeDistances-inl.h +++ b/faiss/hip/impl/PQCodeDistances-inl.h @@ -350,7 +350,7 @@ void runPQResidualVector( residual); } - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } template @@ -391,6 +391,7 @@ void runPQDistanceIPCorrection( pqDistanceIPCorrection<<>>( codeView, coarseDistances, codeDistances.getSize(2)); + HIP_TEST_ERROR(); } // This is a general purpose implementation that leverages GEMM to calculate @@ -665,6 +666,7 @@ void runPQCodeDistances( coarseIndices, \ outCodeDistancesT); \ } \ + HIP_TEST_ERROR(); \ } while (0) #define CODE_L2(DIMS) \ @@ -729,7 +731,7 @@ void runPQCodeDistances( #undef RUN_CODE #undef CODE_L2 - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } } // namespace hip diff --git a/faiss/hip/impl/PQCodeLoad.h b/faiss/hip/impl/PQCodeLoad.h index d9cf1a8cd6..967f3ad22e 100644 --- a/faiss/hip/impl/PQCodeLoad.h +++ b/faiss/hip/impl/PQCodeLoad.h @@ -45,7 +45,10 @@ struct LoadCode32<1> { uint8_t* p, int offset) { p += offset * 1; - code32[0] = *p; + using T = uint8_t __attribute__((ext_vector_type(1))); + T* t = reinterpret_cast(p); + T* u = reinterpret_cast(code32); + u[0] = __builtin_nontemporal_load(t); //DONE asm("ld.global.cs.u8 {%0}, [%1];" : "=r"(code32[0]) : "l"(p)); } }; @@ -57,7 +60,10 @@ struct 
LoadCode32<2> {
             uint8_t* p,
             int offset) {
         p += offset * 2;
-        code32[0] = *p;
+        using T = uint8_t __attribute__((ext_vector_type(2)));
+        T* t = reinterpret_cast<T*>(p);
+        T* u = reinterpret_cast<T*>(code32);
+        u[0] = __builtin_nontemporal_load(t);
 //DONE        asm("ld.global.cs.u16 {%0}, [%1];" : "=r"(code32[0]) : "l"(p));
     }
 };
 
@@ -69,24 +75,18 @@ struct LoadCode32<3> {
             uint8_t* p,
             int offset) {
         p += offset * 3;
-        unsigned int a;
-        unsigned int b;
-        unsigned int c;
+        // Three byte loads, repacked exactly as before; a 3-wide byte vector
+        // would be padded to four bytes and read one byte past the code
+        uint32_t a = __builtin_nontemporal_load(p);
+        uint32_t b = __builtin_nontemporal_load(p + 1);
+        uint32_t c = __builtin_nontemporal_load(p + 2);
+        code32[0] = (c << 16) | (b << 8) | a;
         // FIXME: this is a non-coalesced, unaligned, non-vectorized load
         // unfortunately need to reorganize memory layout by warp
 //DONE        asm("ld.global.cs.u8 {%0}, [%1 + 0];" : "=r"(a) : "l"(p));
 //DONE        asm("ld.global.cs.u8 {%0}, [%1 + 1];" : "=r"(b) : "l"(p));
 //DONE        asm("ld.global.cs.u8 {%0}, [%1 + 2];" : "=r"(c) : "l"(p));
-        a = *p;
-        p += 1;
-        b = *p;
-        p += 1;
-        c = *p;
         // FIXME: this is also slow, since we have to recover the
         // individual bytes loaded
-        code32[0] = (c << 16) | (b << 8) | a;
-        p -= 2;
     }
 };
 
@@ -97,8 +97,11 @@ struct LoadCode32<4> {
             uint8_t* p,
             int offset) {
         p += offset * 4;
-        code32[0] = *p;
-//DONE asm("ld.global.cs.u32 {%0}, [%1];" : "=r"(code32[0]) : "l"(p));
+        using T = uint32_t __attribute__((ext_vector_type(1)));
+        T* t = reinterpret_cast<T*>(p);
+        T* u = reinterpret_cast<T*>(code32);
+        u[0] = __builtin_nontemporal_load(t);
+        //DONE asm("ld.global.cs.u32 {%0}, [%1];" : "=r"(code32[0]) : "l"(p));
     }
 };
 
@@ -109,11 +112,13 @@ struct LoadCode32<8> {
             uint8_t* p,
             int offset) {
         p += offset * 8;
-//DONE asm("ld.global.cs.v2.u32 {%0, %1}, [%2];"
+        using T = uint32_t __attribute__((ext_vector_type(2)));
+        T* t = reinterpret_cast<T*>(p);
+        T* u = reinterpret_cast<T*>(code32);
+        u[0] = __builtin_nontemporal_load(t);
+        //DONE asm("ld.global.cs.v2.u32 {%0, %1}, [%2];"
 //DONE        : "=r"(code32[0]), "=r"(code32[1])
 //DONE        : "l"(p));
-        code32[0] = *p;
-        code32[1] = *p;
     }
 };
 
@@ -124,14 +129,15 @@ struct LoadCode32<12> {
             uint8_t* p,
             int offset) {
         p += offset * 12;
+        // Three scalar u32 loads; clang widens a 3-wide u32 vector to a
+        // 16-byte access, which would overrun the 3-word code32 output
+        const uint32_t* q = reinterpret_cast<const uint32_t*>(p);
+        code32[0] = __builtin_nontemporal_load(q);
+        code32[1] = __builtin_nontemporal_load(q + 1);
+        code32[2] = __builtin_nontemporal_load(q + 2);
         // FIXME: this is a non-coalesced, unaligned, non-vectorized load
         // unfortunately need to reorganize memory layout by warp
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 0];" : "=r"(code32[0]) : "l"(p));
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 4];" : "=r"(code32[1]) : "l"(p));
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 8];" : "=r"(code32[2]) : "l"(p));
-        code32[0] = *p;
-        code32[1] = *p;
-        code32[2] = *p;
     }
 };
 
@@ -142,10 +148,10 @@ struct LoadCode32<16> {
             uint8_t* p,
             int offset) {
         p += offset * 16;
-        code32[0] = *p;
-        code32[1] = *p;
-        code32[2] = *p;
-        code32[3] = *p;
+        using T = uint32_t __attribute__((ext_vector_type(4)));
+        T* t = reinterpret_cast<T*>(p);
+        T* u = reinterpret_cast<T*>(code32);
+        u[0] = __builtin_nontemporal_load(t);
 //DONE        asm("ld.global.cs.v4.u32 {%0, %1, %2, %3}, [%4];"
 //DONE        : "=r"(code32[0]), "=r"(code32[1]), "=r"(code32[2]), "=r"(code32[3])
 //DONE        : "l"(p));
     }
 };
 
@@ -159,23 +165,25 @@ struct LoadCode32<20> {
             uint8_t* p,
             int offset) {
         p += offset * 20;
-        // FIXME: this is a non-coalesced, unaligned, non-vectorized load
+        // Five scalar u32 loads; these must go through a uint32_t pointer,
+        // since loading through the raw uint8_t* would fetch only one byte
+        // per 32-bit word
+        const uint32_t* q = reinterpret_cast<const uint32_t*>(p);
+        code32[0] = __builtin_nontemporal_load(q);
+        code32[1] = __builtin_nontemporal_load(q + 1);
+        code32[2] = __builtin_nontemporal_load(q + 2);
+        code32[3] = __builtin_nontemporal_load(q + 3);
+        code32[4] = __builtin_nontemporal_load(q + 4);
+
+        // FIXME: this is a non-coalesced, unaligned, non-vectorized load
         // unfortunately need to reorganize memory layout by warp
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 0];" : "=r"(code32[0]) : "l"(p));
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 4];" : "=r"(code32[1]) : "l"(p));
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 8];" : "=r"(code32[2]) : "l"(p));
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 12];" : "=r"(code32[3]) : "l"(p));
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 16];" : "=r"(code32[4]) : "l"(p));
-        code32[0] = *p;
-        p += 4;
-        code32[1] = *p;
-        p += 4;
-        code32[2] = *p;
-        p += 4;
-        code32[3] = *p;
-        p += 4;
-        code32[4] = *p;
-        p -= 16;
     }
 };
 
@@ -186,6 +194,10 @@ struct LoadCode32<24> {
             uint8_t* p,
             int offset) {
         p += offset * 24;
+        using T = uint32_t __attribute__((ext_vector_type(6)));
+        T* t = reinterpret_cast<T*>(p);
+        T* u = reinterpret_cast<T*>(code32);
+        u[0] = __builtin_nontemporal_load(t);
         // FIXME: this is a non-coalesced, unaligned, 2-vectorized load
         // unfortunately need to reorganize memory layout by warp
 //DONE        asm(LD_NC_V2 " {%0, %1}, [%2 + 0];"
 //DONE        : "=r"(code32[0]), "=r"(code32[1])
 //DONE        : "l"(p));
 //DONE        asm(LD_NC_V2 " {%0, %1}, [%2 + 8];"
 //DONE        : "=r"(code32[2]), "=r"(code32[3])
 //DONE        : "l"(p));
 //DONE        asm(LD_NC_V2 " {%0, %1}, [%2 + 16];"
 //DONE        : "=r"(code32[4]), "=r"(code32[5])
 //DONE        : "l"(p));
-        code32[0] = *p;
-        code32[1] = *p;
-        p += 8;
-        code32[2] = *p;
-        code32[3] = *p;
-        p += 8;
-        code32[4] = *p;
-        code32[5] = *p;
-        p -= 16;
     }
 };
 
@@ -216,6 +219,10 @@ struct LoadCode32<28> {
             uint8_t* p,
             int offset) {
         p += offset * 28;
+        using T = uint32_t __attribute__((ext_vector_type(7)));
+        T* t = reinterpret_cast<T*>(p);
+        T* u = reinterpret_cast<T*>(code32);
+        u[0] = __builtin_nontemporal_load(t);
         // FIXME: this is a non-coalesced, unaligned, non-vectorized load
         // unfortunately need to reorganize memory layout by warp
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 0];" : "=r"(code32[0]) : "l"(p));
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 4];" : "=r"(code32[1]) : "l"(p));
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 8];" : "=r"(code32[2]) : "l"(p));
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 12];" : "=r"(code32[3]) : "l"(p));
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 16];" : "=r"(code32[4]) : "l"(p));
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 20];" : "=r"(code32[5]) : "l"(p));
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 24];" : "=r"(code32[6]) : "l"(p));
-        code32[0] = *p;
-        p += 4;
-        code32[1] = *p;
-        p += 4;
-        code32[2] = *p;
-        p += 4;
-        code32[3] = *p;
-        p += 4;
-        code32[4] = *p;
-        p += 4;
-        code32[5] = *p;
-        p += 4;
-        code32[6] = *p;
-        p -= 24;
     }
 };
 
@@ -249,6 +242,10 @@ struct LoadCode32<32> {
             uint8_t* p,
             int offset) {
         p += offset * 32;
+        using T = uint32_t __attribute__((ext_vector_type(8)));
+        T* t = reinterpret_cast<T*>(p);
+        T* u = reinterpret_cast<T*>(code32);
+        u[0] = __builtin_nontemporal_load(t);
         // FIXME: this is a non-coalesced load
         // unfortunately need to reorganize memory layout by warp
 //DONE        asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];"
 //DONE        : "=r"(code32[0]), "=r"(code32[1]), "=r"(code32[2]), "=r"(code32[3])
 //DONE        : "l"(p));
 //DONE        asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 16];"
 //DONE        : "=r"(code32[4]), "=r"(code32[5]), "=r"(code32[6]), "=r"(code32[7])
 //DONE        : "l"(p));
-        code32[0] = *p;
-        code32[1] = *p;
-        code32[2] = *p;
-        code32[3] = *p;
-        p += 16;
-        code32[4] = *p;
-        code32[5] = *p;
-        code32[6] = *p;
-        code32[7] = *p;
-        p -= 16;
     }
 };
 
@@ -277,6 +264,10 @@ struct LoadCode32<40> {
             uint8_t* p,
             int offset) {
         p += offset * 40;
+        using T = uint32_t __attribute__((ext_vector_type(10)));
+        T* t = reinterpret_cast<T*>(p);
+        T* u = reinterpret_cast<T*>(code32);
+        u[0] = __builtin_nontemporal_load(t);
         // FIXME: this is a non-coalesced, unaligned, 2-vectorized load
         // unfortunately need to reorganize memory layout by warp
 //DONE        asm(LD_NC_V2 " {%0, %1}, [%2 + 0];"
@@ 
-294,21 +285,6 @@ struct LoadCode32<40> { //DONE asm(LD_NC_V2 " {%0, %1}, [%2 + 32];" //DONE : "=r"(code32[8]), "=r"(code32[9]) //DONE : "l"(p)); - code32[0] = *p; - code32[1] = *p; - p += 8; - code32[2] = *p; - code32[3] = *p; - p += 8; - code32[4] = *p; - code32[5] = *p; - p += 8; - code32[6] = *p; - code32[7] = *p; - p += 8; - code32[8] = *p; - code32[9] = *p; - p -= 32; } }; @@ -319,6 +295,10 @@ struct LoadCode32<48> { uint8_t* p, int offset) { p += offset * 48; + using T = uint32_t __attribute__((ext_vector_type(12))); + T* t = reinterpret_cast(p); + T* u = reinterpret_cast(code32); + u[0] = __builtin_nontemporal_load(t); // FIXME: this is a non-coalesced load // unfortunately need to reorganize memory layout by warp //DONE asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" @@ -333,21 +313,6 @@ struct LoadCode32<48> { //DONE "=r"(code32[10]), //DONE "=r"(code32[11]) //DONE : "l"(p)); - code32[0] = *p; - code32[1] = *p; - code32[2] = *p; - code32[3] = *p; - p += 16; - code32[4] = *p; - code32[5] = *p; - code32[6] = *p; - code32[7] = *p; - p += 16; - code32[8] = *p; - code32[9] = *p; - code32[10] = *p; - code32[11] = *p; - p -= 32; } }; @@ -358,6 +323,10 @@ struct LoadCode32<56> { uint8_t* p, int offset) { p += offset * 56; + using T = uint32_t __attribute__((ext_vector_type(14))); + T* t = reinterpret_cast(p); + T* u = reinterpret_cast(code32); + u[0] = __builtin_nontemporal_load(t); // FIXME: this is a non-coalesced, unaligned, 2-vectorized load // unfortunately need to reorganize memory layout by warp //DONE asm(LD_NC_V2 " {%0, %1}, [%2 + 0];" @@ -381,27 +350,6 @@ struct LoadCode32<56> { //DONE asm(LD_NC_V2 " {%0, %1}, [%2 + 48];" //DONE : "=r"(code32[12]), "=r"(code32[13]) //DONE : "l"(p)); - code32[0] = *p; - code32[1] = *p; - p += 8; - code32[2] = *p; - code32[3] = *p; - p += 8; - code32[4] = *p; - code32[5] = *p; - p += 8; - code32[6] = *p; - code32[7] = *p; - p += 8; - code32[8] = *p; - code32[9] = *p; - p += 8; - code32[10] = *p; - code32[11] = *p; - p += 8; - code32[12] = *p; - code32[13] = *p; - p -= 48; } }; @@ -412,6 +360,10 @@ struct LoadCode32<64> { uint8_t* p, int offset) { p += offset * 64; + using T = uint32_t __attribute__((ext_vector_type(16))); + T* t = reinterpret_cast(p); + T* u = reinterpret_cast(code32); + u[0] = __builtin_nontemporal_load(t); // FIXME: this is a non-coalesced load // unfortunately need to reorganize memory layout by warp //DONE asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" @@ -432,26 +384,6 @@ struct LoadCode32<64> { //DONE "=r"(code32[14]), //DONE "=r"(code32[15]) //DONE : "l"(p)); - code32[0] = *p; - code32[1] = *p; - code32[2] = *p; - code32[3] = *p; - p += 16; - code32[4] = *p; - code32[5] = *p; - code32[6] = *p; - code32[7] = *p; - p += 16; - code32[8] = *p; - code32[9] = *p; - code32[10] = *p; - code32[11] = *p; - p += 16; - code32[12] = *p; - code32[13] = *p; - code32[14] = *p; - code32[15] = *p; - p -= 48; } }; @@ -462,6 +394,10 @@ struct LoadCode32<96> { uint8_t* p, int offset) { p += offset * 96; + using T = uint32_t __attribute__((ext_vector_type(24))); + T* t = reinterpret_cast(p); + T* u = reinterpret_cast(code32); + u[0] = __builtin_nontemporal_load(t); // FIXME: this is a non-coalesced load // unfortunately need to reorganize memory layout by warp //DONE asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" @@ -494,36 +430,6 @@ struct LoadCode32<96> { //DONE "=r"(code32[22]), //DONE "=r"(code32[23]) //DONE : "l"(p)); - code32[0] = *p; - code32[1] = *p; - code32[2] = *p; - code32[3] = *p; - p += 16; - code32[4] = *p; - code32[5] = *p; - code32[6] = *p; - 
code32[7] = *p; - p += 16; - code32[8] = *p; - code32[9] = *p; - code32[10] = *p; - code32[11] = *p; - p += 16; - code32[12] = *p; - code32[13] = *p; - code32[14] = *p; - code32[15] = *p; - p += 16; - code32[16] = *p; - code32[17] = *p; - code32[18] = *p; - code32[19] = *p; - p += 16; - code32[20] = *p; - code32[21] = *p; - code32[22] = *p; - code32[23] = *p; - p -= 80; } }; diff --git a/faiss/hip/impl/PQScanMultiPassNoPrecomputed-inl.h b/faiss/hip/impl/PQScanMultiPassNoPrecomputed-inl.h index 2c180589d3..d2078e18ca 100644 --- a/faiss/hip/impl/PQScanMultiPassNoPrecomputed-inl.h +++ b/faiss/hip/impl/PQScanMultiPassNoPrecomputed-inl.h @@ -353,6 +353,7 @@ void runMultiPassTile( listLengths.data(), \ prefixSumOffsets, \ allDistances); \ + HIP_TEST_ERROR(); \ } while (0) if (useFloat16Lookup) { @@ -424,6 +425,7 @@ void runMultiPassTile( listLengths.data(), \ prefixSumOffsets, \ allDistances); \ + HIP_TEST_ERROR(); \ } while (0) #define RUN_PQ(NUM_SUB_Q) \ @@ -493,7 +495,7 @@ void runMultiPassTile( #undef RUN_PQ_OPT } - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); // k-select the output in chunks, to increase parallelism runPass1SelectLists( @@ -624,9 +626,9 @@ void runPQScanMultiPassNoPrecomputed( // Make sure the element before prefixSumOffsets is 0, since we // depend upon simple, boundary-less indexing to get proper results - CUDA_VERIFY(hipMemsetAsync( + HIP_VERIFY(hipMemsetAsync( prefixSumOffsetSpace1.data(), 0, sizeof(idx_t), stream)); - CUDA_VERIFY(hipMemsetAsync( + HIP_VERIFY(hipMemsetAsync( prefixSumOffsetSpace2.data(), 0, sizeof(idx_t), stream)); idx_t codeDistanceTypeSize = diff --git a/faiss/hip/impl/PQScanMultiPassPrecomputed.hip b/faiss/hip/impl/PQScanMultiPassPrecomputed.hip index 519227033f..7b08f196e6 100644 --- a/faiss/hip/impl/PQScanMultiPassPrecomputed.hip +++ b/faiss/hip/impl/PQScanMultiPassPrecomputed.hip @@ -357,6 +357,7 @@ void runMultiPassTile( listLengths.data(), \ prefixSumOffsets, \ allDistances); \ + HIP_TEST_ERROR(); \ } while (0) if (useFloat16Lookup) { @@ -432,6 +433,7 @@ void runMultiPassTile( listLengths.data(), \ prefixSumOffsets, \ allDistances); \ + HIP_TEST_ERROR(); \ } while (0) #define RUN_PQ(NUM_SUB_Q) \ @@ -497,7 +499,7 @@ void runMultiPassTile( break; } - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); #undef RUN_PQ #undef RUN_PQ_OPT @@ -534,7 +536,7 @@ void runMultiPassTile( outIndices, stream); - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } void runPQScanMultiPassPrecomputed( @@ -633,9 +635,9 @@ void runPQScanMultiPassPrecomputed( // Make sure the element before prefixSumOffsets is 0, since we // depend upon simple, boundary-less indexing to get proper results - CUDA_VERIFY(hipMemsetAsync( + HIP_VERIFY(hipMemsetAsync( prefixSumOffsetSpace1.data(), 0, sizeof(idx_t), stream)); - CUDA_VERIFY(hipMemsetAsync( + HIP_VERIFY(hipMemsetAsync( prefixSumOffsetSpace2.data(), 0, sizeof(idx_t), stream)); DeviceTensor allDistances1( diff --git a/faiss/hip/impl/VectorResidual.hip b/faiss/hip/impl/VectorResidual.hip index b0a92cd28d..29f3f5d9ea 100644 --- a/faiss/hip/impl/VectorResidual.hip +++ b/faiss/hip/impl/VectorResidual.hip @@ -82,7 +82,7 @@ void calcResidual( vecs, centroids, vecToCentroid, residuals); } - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } void runCalcResidual( @@ -152,7 +152,7 @@ void gatherReconstructByIds( gatherReconstructByIds<<>>(ids, vecs, out); - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } template @@ -175,7 +175,7 @@ void gatherReconstructByRange( gatherReconstructByRange <<>>(start, num, vecs, out); - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } void runReconstruct( 
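
Note: the PQCodeLoad.h rewrite above follows one pattern throughout: the NVIDIA
ld.global.cs inline assembly is replaced by a clang extended vector type plus
__builtin_nontemporal_load, so a single builtin streams a whole group of code
words through the cache-bypassing path. A minimal sketch of the pattern for the
16-byte case, mirroring LoadCode32<16> above; the helper name loadCode16B is
illustrative and not part of the patch:

    #include <hip/hip_runtime.h>
    #include <cstdint>

    // Loads 16 bytes (4 x u32) of PQ code with one nontemporal vector load.
    // Assumes p is 16-byte aligned, the same requirement the old
    // ld.global.cs.v4.u32 asm had.
    __device__ inline void loadCode16B(uint32_t code32[4], const uint8_t* p) {
        using Vec4 = uint32_t __attribute__((ext_vector_type(4)));
        const Vec4* src = reinterpret_cast<const Vec4*>(p);
        *reinterpret_cast<Vec4*>(code32) = __builtin_nontemporal_load(src);
    }

For the sizes that fall back to scalar loads (the 20-byte case, for example),
the loads go through a uint32_t pointer, since reading through the raw uint8_t
pointer would fetch only one byte per 32-bit word.
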
diff --git a/faiss/hip/impl/scan/IVFInterleavedImpl.h b/faiss/hip/impl/scan/IVFInterleavedImpl.h index b62880bc95..aa48082f7f 100644 --- a/faiss/hip/impl/scan/IVFInterleavedImpl.h +++ b/faiss/hip/impl/scan/IVFInterleavedImpl.h @@ -252,7 +252,7 @@ void ivfInterleavedScanImpl( FAISS_ASSERT(false); } - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } } // namespace hip diff --git a/faiss/hip/impl/scan/IVFInterleavedScanKernelTemplate.hip b/faiss/hip/impl/scan/IVFInterleavedScanKernelTemplate.hip index 0821ed4a48..5ca6d76c30 100644 --- a/faiss/hip/impl/scan/IVFInterleavedScanKernelTemplate.hip +++ b/faiss/hip/impl/scan/IVFInterleavedScanKernelTemplate.hip @@ -60,6 +60,7 @@ void IVFINT_RUN< distanceTemp, indicesTemp, useResidual); + HIP_TEST_ERROR(); runIVFInterleavedScan2( distanceTemp, diff --git a/faiss/hip/perf/PerfBinaryFlat.hip b/faiss/hip/perf/PerfBinaryFlat.hip index 99a2d397dd..6cd42f8ce6 100644 --- a/faiss/hip/perf/PerfBinaryFlat.hip +++ b/faiss/hip/perf/PerfBinaryFlat.hip @@ -92,7 +92,7 @@ int main(int argc, char** argv) { HostTensor gpuDistances({numQueries, FLAGS_k}); HostTensor gpuIndices({numQueries, FLAGS_k}); - CUDA_VERIFY(hipProfilerStart()); + HIP_VERIFY(hipProfilerStart()); faiss::hip::synchronizeAllDevices(); float gpuTime = 0.0f; @@ -113,10 +113,10 @@ int main(int argc, char** argv) { gpuTime = timer.elapsedMilliseconds(); } - CUDA_VERIFY(hipProfilerStop()); + HIP_VERIFY(hipProfilerStop()); printf("GPU time %.3f ms\n", gpuTime); - CUDA_VERIFY(hipDeviceSynchronize()); + HIP_VERIFY(hipDeviceSynchronize()); return 0; } diff --git a/faiss/hip/perf/PerfClustering.cpp b/faiss/hip/perf/PerfClustering.cpp index 5e52bf7ce8..f47573c249 100644 --- a/faiss/hip/perf/PerfClustering.cpp +++ b/faiss/hip/perf/PerfClustering.cpp @@ -86,7 +86,7 @@ int main(int argc, char** argv) { IndexWrapper gpuIndex(FLAGS_num_gpus, initFn); - CUDA_VERIFY(cudaProfilerStart()); + HIP_VERIFY(cudaProfilerStart()); faiss::hip::synchronizeAllDevices(); float gpuTime = 0.0f; @@ -112,10 +112,10 @@ int main(int argc, char** argv) { gpuTime = timer.elapsedMilliseconds(); } - CUDA_VERIFY(cudaProfilerStop()); + HIP_VERIFY(cudaProfilerStop()); printf("k-means time %.3f ms\n", gpuTime); - CUDA_VERIFY(cudaDeviceSynchronize()); + HIP_VERIFY(cudaDeviceSynchronize()); return 0; } diff --git a/faiss/hip/perf/PerfFlat.hip b/faiss/hip/perf/PerfFlat.hip index b04002354b..160fd707b3 100644 --- a/faiss/hip/perf/PerfFlat.hip +++ b/faiss/hip/perf/PerfFlat.hip @@ -116,7 +116,7 @@ int main(int argc, char** argv) { HostTensor gpuDistances({numQueries, FLAGS_k}); HostTensor gpuIndices({numQueries, FLAGS_k}); - CUDA_VERIFY(hipProfilerStart()); + HIP_VERIFY(hipProfilerStart()); faiss::hip::synchronizeAllDevices(); float gpuTime = 0.0f; @@ -137,7 +137,7 @@ int main(int argc, char** argv) { gpuTime = timer.elapsedMilliseconds(); } - CUDA_VERIFY(hipProfilerStop()); + HIP_VERIFY(hipProfilerStop()); printf("GPU time %.3f ms\n", gpuTime); if (FLAGS_cpu) { @@ -154,7 +154,7 @@ int main(int argc, char** argv) { false); } - CUDA_VERIFY(hipDeviceSynchronize()); + HIP_VERIFY(hipDeviceSynchronize()); return 0; } diff --git a/faiss/hip/perf/PerfIVFFlat.hip b/faiss/hip/perf/PerfIVFFlat.hip index ef47f728ab..b9e200eaf7 100644 --- a/faiss/hip/perf/PerfIVFFlat.hip +++ b/faiss/hip/perf/PerfIVFFlat.hip @@ -115,7 +115,7 @@ int main(int argc, char** argv) { HostTensor gpuDistances({numQueries, FLAGS_k}); HostTensor gpuIndices({numQueries, FLAGS_k}); - CUDA_VERIFY(hipProfilerStart()); + HIP_VERIFY(hipProfilerStart()); faiss::hip::synchronizeAllDevices(); float 
gpuTime = 0.0f; @@ -136,7 +136,7 @@ int main(int argc, char** argv) { gpuTime = timer.elapsedMilliseconds(); } - CUDA_VERIFY(hipProfilerStop()); + HIP_VERIFY(hipProfilerStop()); printf("GPU time %.3f ms\n", gpuTime); compareLists( @@ -151,7 +151,7 @@ int main(int argc, char** argv) { FLAGS_diff, false); - CUDA_VERIFY(hipDeviceSynchronize()); + HIP_VERIFY(hipDeviceSynchronize()); return 0; } diff --git a/faiss/hip/perf/PerfIVFPQ.hip b/faiss/hip/perf/PerfIVFPQ.hip index 85be2f7451..61731ca12f 100644 --- a/faiss/hip/perf/PerfIVFPQ.hip +++ b/faiss/hip/perf/PerfIVFPQ.hip @@ -40,7 +40,7 @@ using namespace faiss::hip; int main(int argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); - CUDA_VERIFY(hipProfilerStop()); + HIP_VERIFY(hipProfilerStop()); auto seed = FLAGS_seed != -1L ? FLAGS_seed : time(nullptr); printf("using seed %ld\n", seed); @@ -125,7 +125,7 @@ int main(int argc, char** argv) { HostTensor gpuDistances({numQueries, FLAGS_k}); HostTensor gpuIndices({numQueries, FLAGS_k}); - CUDA_VERIFY(hipProfilerStart()); + HIP_VERIFY(hipProfilerStart()); faiss::hip::synchronizeAllDevices(); float gpuTime = 0.0f; @@ -146,7 +146,7 @@ int main(int argc, char** argv) { gpuTime = timer.elapsedMilliseconds(); } - CUDA_VERIFY(hipProfilerStop()); + HIP_VERIFY(hipProfilerStop()); printf("GPU time %.3f ms\n", gpuTime); compareLists( @@ -161,7 +161,7 @@ int main(int argc, char** argv) { FLAGS_diff, false); - CUDA_VERIFY(hipDeviceSynchronize()); + HIP_VERIFY(hipDeviceSynchronize()); return 0; } diff --git a/faiss/hip/perf/PerfIVFPQAdd.cpp b/faiss/hip/perf/PerfIVFPQAdd.cpp index e43f09b7c0..7650bbb6c1 100644 --- a/faiss/hip/perf/PerfIVFPQAdd.cpp +++ b/faiss/hip/perf/PerfIVFPQAdd.cpp @@ -74,7 +74,7 @@ int main(int argc, char** argv) { } cudaDeviceSynchronize(); - CUDA_VERIFY(cudaProfilerStart()); + HIP_VERIFY(cudaProfilerStart()); float totalGpuTime = 0.0f; float totalCpuTime = 0.0f; @@ -91,7 +91,7 @@ int main(int argc, char** argv) { if (FLAGS_time_gpu) { faiss::hip::CpuTimer timer; gpuIndex.add(FLAGS_batch_size, addVecs.data()); - CUDA_VERIFY(cudaDeviceSynchronize()); + HIP_VERIFY(cudaDeviceSynchronize()); auto time = timer.elapsedMilliseconds(); totalGpuTime += time; @@ -122,7 +122,7 @@ int main(int argc, char** argv) { } } - CUDA_VERIFY(cudaProfilerStop()); + HIP_VERIFY(cudaProfilerStop()); int total = FLAGS_batch_size * FLAGS_batches; diff --git a/faiss/hip/test/TestGpuMemoryException.cpp b/faiss/hip/test/TestGpuMemoryException.cpp index 0af8c3bca8..976d1655aa 100644 --- a/faiss/hip/test/TestGpuMemoryException.cpp +++ b/faiss/hip/test/TestGpuMemoryException.cpp @@ -20,7 +20,7 @@ TEST(TestGpuMemoryException, AddException) { size_t devFree = 0; size_t devTotal = 0; - CUDA_VERIFY(hipMemGetInfo(&devFree, &devTotal)); + HIP_VERIFY(hipMemGetInfo(&devFree, &devTotal)); // Figure out the dimensionality needed to get at least greater than // devTotal diff --git a/faiss/hip/utils/BlockSelectFloat.hip b/faiss/hip/utils/BlockSelectFloat.hip index de6ac40b3e..bf2db1bdda 100644 --- a/faiss/hip/utils/BlockSelectFloat.hip +++ b/faiss/hip/utils/BlockSelectFloat.hip @@ -55,7 +55,7 @@ void runBlockSelect( if (dir) { if (k == 1) { BLOCK_SELECT_CALL(float, true, 1); - //} else if (k <= 32) { + //} else if (k <= 32) { //(kWarpSize == 32 && k <= 32) // BLOCK_SELECT_CALL(float, true, 32); } else if (k <= 64) { BLOCK_SELECT_CALL(float, true, 64); @@ -75,7 +75,7 @@ void runBlockSelect( } else { if (k == 1) { BLOCK_SELECT_CALL(float, false, 1); - //} else if (k <= 32) { + //} else if (k <= 32) { // (kWarpSize == 
32 && k <= 32)
 //            BLOCK_SELECT_CALL(float, false, 32);
         } else if (k <= 64) {
             BLOCK_SELECT_CALL(float, false, 64);
diff --git a/faiss/hip/utils/CopyUtils.h b/faiss/hip/utils/CopyUtils.h
index 0c6cb6a44e..59158ba4b9 100644
--- a/faiss/hip/utils/CopyUtils.h
+++ b/faiss/hip/utils/CopyUtils.h
@@ -117,10 +117,10 @@ inline void fromDevice(T* src, T* dst, size_t num, hipStream_t stream) {
     int dev = getDeviceForAddress(dst);
 
     if (dev == -1) {
-        CUDA_VERIFY(hipMemcpyAsync(
+        HIP_VERIFY(hipMemcpyAsync(
                 dst, src, num * sizeof(T), hipMemcpyDeviceToHost, stream));
     } else {
-        CUDA_VERIFY(hipMemcpyAsync(
+        HIP_VERIFY(hipMemcpyAsync(
                 dst, src, num * sizeof(T), hipMemcpyDeviceToDevice, stream));
     }
 }
diff --git a/faiss/hip/utils/DeviceDefs.h b/faiss/hip/utils/DeviceDefs.h
index 77b195d6dc..fb5bb53c74 100644
--- a/faiss/hip/utils/DeviceDefs.h
+++ b/faiss/hip/utils/DeviceDefs.h
@@ -14,7 +14,7 @@ namespace faiss {
 namespace hip {
 
 // We validate this against the actual architecture in device initialization
-constexpr int kWarpSize = warpSize; // = 64 (Defined in hip_runtime.h)
+constexpr int kWarpSize = 32; // __AMDGCN_WAVEFRONT_SIZE (defined in hip_runtime.h) is either 32 or 64; pinned to 32 for the WF32 bring-up
 
 // This is a memory barrier for intra-warp writes to shared memory. 
__forceinline__ __device__ void warpFence() { diff --git a/faiss/hip/utils/DeviceTensor-inl.h b/faiss/hip/utils/DeviceTensor-inl.h index 9776e5a221..5b37f69048 100644 --- a/faiss/hip/utils/DeviceTensor-inl.h +++ b/faiss/hip/utils/DeviceTensor-inl.h @@ -181,7 +181,7 @@ __host__ DeviceTensor& DeviceTensor< // Region must be contiguous FAISS_ASSERT(this->isContiguous()); - CUDA_VERIFY(hipMemsetAsync( + HIP_VERIFY(hipMemsetAsync( this->data_, 0, this->getSizeInBytes(), stream)); } diff --git a/faiss/hip/utils/DeviceUtils.h b/faiss/hip/utils/DeviceUtils.h index 74a3bf92c9..3b69f32c9f 100644 --- a/faiss/hip/utils/DeviceUtils.h +++ b/faiss/hip/utils/DeviceUtils.h @@ -134,8 +134,8 @@ class CudaEvent { hipEvent_t event_; }; -/// Wrapper to test return status of CUDA functions -#define CUDA_VERIFY(X) \ +/// Wrapper to test return status of HIP functions +#define HIP_VERIFY(X) \ do { \ auto err__ = (X); \ FAISS_ASSERT_FMT( \ @@ -149,14 +149,14 @@ class CudaEvent { // #define FAISS_GPU_SYNC_ERROR 1 #ifdef FAISS_GPU_SYNC_ERROR -#define CUDA_TEST_ERROR() \ +#define HIP_TEST_ERROR() \ do { \ - CUDA_VERIFY(hipDeviceSynchronize()); \ + HIP_VERIFY(hipDeviceSynchronize()); \ } while (0) #else -#define CUDA_TEST_ERROR() \ +#define HIP_TEST_ERROR() \ do { \ - CUDA_VERIFY(hipGetLastError()); \ + HIP_VERIFY(hipGetLastError()); \ } while (0) #endif @@ -167,20 +167,20 @@ void streamWaitBase(const L1& listWaiting, const L2& listWaitOn) { std::vector events; for (auto& stream : listWaitOn) { hipEvent_t event; - CUDA_VERIFY(hipEventCreateWithFlags(&event, hipEventDisableTiming)); - CUDA_VERIFY(hipEventRecord(event, stream)); + HIP_VERIFY(hipEventCreateWithFlags(&event, hipEventDisableTiming)); + HIP_VERIFY(hipEventRecord(event, stream)); events.push_back(event); } // For all the streams that are waiting, issue a wait for (auto& stream : listWaiting) { for (auto& event : events) { - CUDA_VERIFY(hipStreamWaitEvent(stream, event, 0)); + HIP_VERIFY(hipStreamWaitEvent(stream, event, 0)); } } for (auto& event : events) { - CUDA_VERIFY(hipEventDestroy(event)); + HIP_VERIFY(hipEventDestroy(event)); } } diff --git a/faiss/hip/utils/DeviceUtils.hip b/faiss/hip/utils/DeviceUtils.hip index b48a6250bc..600bcfea79 100644 --- a/faiss/hip/utils/DeviceUtils.hip +++ b/faiss/hip/utils/DeviceUtils.hip @@ -18,14 +18,14 @@ namespace hip { int getCurrentDevice() { int dev = -1; - CUDA_VERIFY(hipGetDevice(&dev)); + HIP_VERIFY(hipGetDevice(&dev)); FAISS_ASSERT(dev != -1); return dev; } void setCurrentDevice(int device) { - CUDA_VERIFY(hipSetDevice(device)); + HIP_VERIFY(hipSetDevice(device)); } int getNumDevices() { @@ -34,7 +34,7 @@ int getNumDevices() { if (hipErrorNoDevice == err) { numDev = 0; } else { - CUDA_VERIFY(err); + HIP_VERIFY(err); } FAISS_ASSERT(numDev != -1); @@ -42,18 +42,18 @@ int getNumDevices() { } void profilerStart() { - CUDA_VERIFY(hipProfilerStart()); + HIP_VERIFY(hipProfilerStart()); } void profilerStop() { - CUDA_VERIFY(hipProfilerStop()); + HIP_VERIFY(hipProfilerStop()); } void synchronizeAllDevices() { for (int i = 0; i < getNumDevices(); ++i) { DeviceScope scope(i); - CUDA_VERIFY(hipDeviceSynchronize()); + HIP_VERIFY(hipDeviceSynchronize()); } } @@ -66,7 +66,7 @@ const hipDeviceProp_t& getDeviceProperties(int device) { auto it = properties.find(device); if (it == properties.end()) { hipDeviceProp_t prop; - CUDA_VERIFY(hipGetDeviceProperties(&prop, device)); + HIP_VERIFY(hipGetDeviceProperties(&prop, device)); properties[device] = prop; it = properties.find(device); @@ -167,7 +167,7 @@ size_t getFreeMemory(int 
device) {
     size_t free = 0;
     size_t total = 0;
 
-    CUDA_VERIFY(hipMemGetInfo(&free, &total));
+    HIP_VERIFY(hipMemGetInfo(&free, &total));
 
     return free;
 }
 
@@ -176,7 +176,7 @@ size_t getFreeMemoryCurrentDevice() {
     size_t free = 0;
     size_t total = 0;
 
-    CUDA_VERIFY(hipMemGetInfo(&free, &total));
+    HIP_VERIFY(hipMemGetInfo(&free, &total));
 
     return free;
 }
 
@@ -213,10 +213,9 @@ CublasHandleScope::~CublasHandleScope() {
 }
 
 CudaEvent::CudaEvent(hipStream_t stream, bool timer) : event_(0) {
-    CUDA_VERIFY(hipEventCreateWithFlags(
+    HIP_VERIFY(hipEventCreateWithFlags(
             &event_, timer ? hipEventDefault : hipEventDisableTiming));
-    printf("------------------hipEventRecord1");
-    CUDA_VERIFY(hipEventRecord(event_, stream));
+    HIP_VERIFY(hipEventRecord(event_, stream));
 }
 
 CudaEvent::CudaEvent(CudaEvent&& event) noexcept
@@ -226,7 +226,7 @@ CudaEvent::~CudaEvent() {
     if (event_) {
-        CUDA_VERIFY(hipEventDestroy(event_));
+        HIP_VERIFY(hipEventDestroy(event_));
     }
 }
 
@@ -238,11 +238,11 @@ CudaEvent& CudaEvent::operator=(CudaEvent&& event) noexcept {
     return *this;
 }
 
 void CudaEvent::streamWaitOnEvent(hipStream_t stream) {
-    CUDA_VERIFY(hipStreamWaitEvent(stream, event_, 0));
+    HIP_VERIFY(hipStreamWaitEvent(stream, event_, 0));
 }
 
 void CudaEvent::cpuWaitOnEvent() {
-    CUDA_VERIFY(hipEventSynchronize(event_));
+    HIP_VERIFY(hipEventSynchronize(event_));
 }
 
 } // namespace hip
diff --git a/faiss/hip/utils/DeviceVector.h b/faiss/hip/utils/DeviceVector.h
index d79d46a634..6d552bbee5 100644
--- a/faiss/hip/utils/DeviceVector.h
+++ b/faiss/hip/utils/DeviceVector.h
@@ -78,7 +78,7 @@ class DeviceVector {
 
         if (num_ > 0) {
             FAISS_ASSERT(data());
-            CUDA_VERIFY(hipMemcpyAsync(
+            HIP_VERIFY(hipMemcpyAsync(
                     out.data(),
                     data(),
                     num_ * sizeof(T),
@@ -109,14 +109,14 @@ class DeviceVector {
         int dev = getDeviceForAddress(d);
 
         if (dev == -1) {
-            CUDA_VERIFY(hipMemcpyAsync(
+            HIP_VERIFY(hipMemcpyAsync(
                     data() + num_,
                     d,
                     n * sizeof(T),
                     hipMemcpyHostToDevice,
                     stream));
         } else {
-            CUDA_VERIFY(hipMemcpyAsync(
+            HIP_VERIFY(hipMemcpyAsync(
                     data() + num_,
                     d,
                     n * sizeof(T),
@@ -155,7 +155,7 @@ class DeviceVector {
     // Set the specific value at a given index to `value`
     void setAt(size_t idx, const T& value, hipStream_t stream) {
         FAISS_ASSERT(idx < num_);
-        CUDA_VERIFY(hipMemcpyAsync(
+        HIP_VERIFY(hipMemcpyAsync(
                 data() + idx,
                 &value,
                 sizeof(T),
@@ -168,7 +168,7 @@ class DeviceVector {
         FAISS_ASSERT(idx < num_);
         T out;
-        CUDA_VERIFY(hipMemcpyAsync(
+        HIP_VERIFY(hipMemcpyAsync(
                 &out, data() + idx, sizeof(T), hipMemcpyDeviceToHost, stream));
     }
 
@@ -228,7 +228,7 @@ class DeviceVector {
                 AllocRequest(allocInfo_, newSizeInBytes));
 
         // Copy over any old data
-        CUDA_VERIFY(hipMemcpyAsync(
+        HIP_VERIFY(hipMemcpyAsync(
                 newAlloc.data,
                 data(),
                 oldSizeInBytes,
@@ -236,7 +236,7 @@ class DeviceVector {
                 stream));
 
         // Zero out the new space past the data we just copied
-        CUDA_VERIFY(hipMemsetAsync(
+        HIP_VERIFY(hipMemsetAsync(
                 (uint8_t*)newAlloc.data + oldSizeInBytes,
                 0,
                 newSizeInBytes - oldSizeInBytes,
diff --git a/faiss/hip/utils/LoadStoreOperators.h b/faiss/hip/utils/LoadStoreOperators.h
index 82e3a7111e..3ca9168676 100644
--- a/faiss/hip/utils/LoadStoreOperators.h
+++ b/faiss/hip/utils/LoadStoreOperators.h
@@ -33,9 +33,9 @@ template <>
 struct LoadStore<Half4> {
     static inline __device__ Half4 load(void* p) {
         Half4 out;
-        out.a.x = *static_cast(p);
-        out.b.x = *static_cast(p);
-
+        half2* t = reinterpret_cast<half2*>(p);
+        out.a = t[0];
+        out.b = t[1];
 //DONE asm("ld.global.v2.u32 {%0, %1}, [%2];"
 //DONE        : "=r"(__HALF2_TO_UI(out.a)), "=r"(__HALF2_TO_UI(out.b)) 
//DONE        : "l"(p));
@@ -48,8 +49,9 @@
     }
 
     static inline __device__ void store(void* p, Half4& v) {
-        v.a.x = *static_cast(p);
-        v.b.x = *static_cast(p);
+        // This is a store: write both halves of v out to p
+        half2* t = reinterpret_cast<half2*>(p);
+        t[0] = v.a;
+        t[1] = v.b;
 //#if CUDA_VERSION >= 9000
 //DONE asm("st.v2.u32 [%0], {%1, %2};"
 //DONE        :
@@ -64,10 +65,11 @@ template <>
 struct LoadStore<Half8> {
     static inline __device__ Half8 load(void* p) {
         Half8 out;
-        out.a.a.x = *static_cast(p);
-        out.a.b.x = *static_cast(p);
-        out.b.a.x = *static_cast(p);
-        out.b.b.x = *static_cast(p);
+        half2* t = reinterpret_cast<half2*>(p);
+        out.a.a = t[0];
+        out.a.b = t[1];
+        out.b.a = t[2];
+        out.b.b = t[3];
 //#if CUDA_VERSION >= 9000
 //DONE asm("ld.global.v4.u32 {%0, %1, %2, %3}, [%4];"
 //DONE        : "=r"(__HALF2_TO_UI(out.a.a)),
 //DONE        "=r"(__HALF2_TO_UI(out.a.b)),
 //DONE        "=r"(__HALF2_TO_UI(out.b.a)),
 //DONE        "=r"(__HALF2_TO_UI(out.b.b))
 //DONE        : "l"(p));
@@ -84,10 +86,11 @@
     }
 
     static inline __device__ void store(void* p, Half8& v) {
-        v.a.a.x = *static_cast(p);
-        v.a.b.x = *static_cast(p);
-        v.b.a.x = *static_cast(p);
-        v.b.b.x = *static_cast(p);
+        // As above, a store: write all four half2 words of v out to p
+        half2* t = reinterpret_cast<half2*>(p);
+        t[0] = v.a.a;
+        t[1] = v.a.b;
+        t[2] = v.b.a;
+        t[3] = v.b.b;
 //#if CUDA_VERSION >= 9000
 //DONE asm("st.v4.u32 [%0], {%1, %2, %3, %4};"
 //DONE        :
diff --git a/faiss/hip/utils/MatrixMult-inl.h b/faiss/hip/utils/MatrixMult-inl.h
index 58f2135b76..3b7d8812fd 100644
--- a/faiss/hip/utils/MatrixMult-inl.h
+++ b/faiss/hip/utils/MatrixMult-inl.h
@@ -252,7 +252,7 @@ void runMatrixMult(
             lda,
             ldb,
             ldc);
-    CUDA_TEST_ERROR();
+    HIP_TEST_ERROR();
 }
 
 template
@@ -355,7 +355,7 @@ void runBatchMatrixMult(
             err == HIPBLAS_STATUS_SUCCESS,
             "hipblasGemmStridedBatchedEx failed (%d)",
             (int)err);
-    CUDA_TEST_ERROR();
+    HIP_TEST_ERROR();
 }
 
 } // namespace hip
diff --git a/faiss/hip/utils/MergeNetworkWarp.h b/faiss/hip/utils/MergeNetworkWarp.h
index 9ae5266f8c..7f52c515ae 100644
--- a/faiss/hip/utils/MergeNetworkWarp.h
+++ b/faiss/hip/utils/MergeNetworkWarp.h
@@ -93,7 +93,7 @@ template <
         bool IsBitonic>
 inline __device__ void warpBitonicMergeLE16(K& k, V& v) {
     static_assert(utils::isPowerOf2(L), "L must be a power-of-2");
-    static_assert(L <= kWarpSize / 2, "merge list size must be <= 32");
+    static_assert(L <= kWarpSize / 2, "merge list size must be <= kWarpSize / 2 (16 for wave32, 32 for wave64)");
 
     int laneId = getLaneId();
 
@@ -529,14 +529,17 @@ struct BitonicSortStep {
     static inline __device__ void sort(K k[1], V v[1]) {
         // Update this code if this changes
         // should go from 1 -> kWarpSize in multiples of 2
-        static_assert(kWarpSize == 64, "unexpected warp size");
-
+        // warp size for us is either 32 or 64
         warpBitonicMergeLE16(k[0], v[0]);
         warpBitonicMergeLE16(k[0], v[0]);
         warpBitonicMergeLE16(k[0], v[0]);
         warpBitonicMergeLE16(k[0], v[0]);
         warpBitonicMergeLE16(k[0], v[0]);
-        warpBitonicMergeLE16(k[0], v[0]);
+#if !(__gfx1010__ || __gfx1011__ || __gfx1012__ || __gfx1030__ || __gfx1031__)
+//TODO needs to be fixed
+//#warning(including wider merge)
+//        warpBitonicMergeLE16(k[0], v[0]);
+#endif
     }
 };
 
diff --git a/faiss/hip/utils/PtxUtils.h b/faiss/hip/utils/PtxUtils.h
index f31768e3cc..b15882fef6 100644
--- a/faiss/hip/utils/PtxUtils.h
+++ b/faiss/hip/utils/PtxUtils.h
@@ -44,7 +44,7 @@ getBitfield(uint64_t val, int pos, int len) {
 }
 
 __device__ __forceinline__ int getLaneId() {
-    return threadIdx.x & 63;
+    return threadIdx.x & (32 - 1); //(__AMDGCN_WAVEFRONT_SIZE - 1);
 }
 
 } // namespace hip
diff --git a/faiss/hip/utils/Select.h b/faiss/hip/utils/Select.h
index 9032e8556b..fb409c8baf 100644
--- a/faiss/hip/utils/Select.h
+++ b/faiss/hip/utils/Select.h
@@ -207,11 +207,7 @@ struct BlockSelect {
 
     __device__ inline void checkThreadQ() {
         bool 
needSort = (numVals == NumThreadQ); -#if CUDA_VERSION >= 9000 - needSort = __any_sync(0xffffffff, needSort); -#else needSort = __any(needSort); -#endif if (!needSort) { // no lanes have triggered a sort @@ -484,11 +480,7 @@ struct WarpSelect { __device__ inline void checkThreadQ() { bool needSort = (numVals == NumThreadQ); -#if CUDA_VERSION >= 9000 - needSort = __any_sync(0xffffffff, needSort); -#else needSort = __any(needSort); -#endif if (!needSort) { // no lanes have triggered a sort diff --git a/faiss/hip/utils/Tensor-inl.h b/faiss/hip/utils/Tensor-inl.h index 062856a195..15d23685be 100644 --- a/faiss/hip/utils/Tensor-inl.h +++ b/faiss/hip/utils/Tensor-inl.h @@ -197,7 +197,7 @@ __host__ void Tensor::copyFrom( int tDev = getDeviceForAddress(t.data()); if (tDev == -1) { - CUDA_VERIFY(hipMemcpyAsync( + HIP_VERIFY(hipMemcpyAsync( this->data_, t.data(), this->getSizeInBytes(), @@ -205,7 +205,7 @@ __host__ void Tensor::copyFrom( : hipMemcpyHostToDevice, stream)); } else { - CUDA_VERIFY(hipMemcpyAsync( + HIP_VERIFY(hipMemcpyAsync( this->data_, t.data(), this->getSizeInBytes(), @@ -242,7 +242,7 @@ __host__ void Tensor::copyTo( int tDev = getDeviceForAddress(t.data()); if (tDev == -1) { - CUDA_VERIFY(hipMemcpyAsync( + HIP_VERIFY(hipMemcpyAsync( t.data(), this->data_, this->getSizeInBytes(), @@ -250,7 +250,7 @@ __host__ void Tensor::copyTo( : hipMemcpyDeviceToHost, stream)); } else { - CUDA_VERIFY(hipMemcpyAsync( + HIP_VERIFY(hipMemcpyAsync( t.data(), this->data_, this->getSizeInBytes(), @@ -281,7 +281,7 @@ __host__ void Tensor::copyFrom( GPU_FAISS_ASSERT(this->data_); int ourDev = getDeviceForAddress(this->data_); - CUDA_VERIFY(hipMemcpyAsync( + HIP_VERIFY(hipMemcpyAsync( this->data_, v.data(), this->getSizeInBytes(), @@ -311,7 +311,7 @@ __host__ std::vector Tensor:: std::memcpy( out.data(), this->data_, this->numElements() * sizeof(T)); } else { - CUDA_VERIFY(hipMemcpyAsync( + HIP_VERIFY(hipMemcpyAsync( out.data(), this->data_, this->numElements() * sizeof(T), diff --git a/faiss/hip/utils/Tensor.h b/faiss/hip/utils/Tensor.h index 06de4bec60..cddf22ff08 100644 --- a/faiss/hip/utils/Tensor.h +++ b/faiss/hip/utils/Tensor.h @@ -470,21 +470,13 @@ class SubTensor { /// Use the texture cache for reads __device__ inline typename TensorType::DataType ldg() const { -#if __CUDA_ARCH__ >= 350 return __ldg(data_); -#else - return *data_; -#endif } /// Use the texture cache for reads; cast as a particular type template __device__ inline T ldgAs() const { -#if __CUDA_ARCH__ >= 350 return __ldg(dataAs()); -#else - return as(); -#endif } protected: @@ -606,21 +598,13 @@ class SubTensor { /// Use the texture cache for reads __device__ inline typename TensorType::DataType ldg() const { -#if __CUDA_ARCH__ >= 350 return __ldg(data_); -#else - return *data_; -#endif } /// Use the texture cache for reads; cast as a particular type template __device__ inline T ldgAs() const { -#if __CUDA_ARCH__ >= 350 return __ldg(dataAs()); -#else - return as(); -#endif } /// Returns a tensor that is a view of the SubDim-dimensional slice diff --git a/faiss/hip/utils/Timer.cpp b/faiss/hip/utils/Timer.cpp index 1be22895a4..af5e55ab8f 100644 --- a/faiss/hip/utils/Timer.cpp +++ b/faiss/hip/utils/Timer.cpp @@ -15,25 +15,25 @@ namespace hip { KernelTimer::KernelTimer(hipStream_t stream) : startEvent_(0), stopEvent_(0), stream_(stream), valid_(true) { - CUDA_VERIFY(hipEventCreate(&startEvent_)); - CUDA_VERIFY(hipEventCreate(&stopEvent_)); + HIP_VERIFY(hipEventCreate(&startEvent_)); + HIP_VERIFY(hipEventCreate(&stopEvent_)); - 
CUDA_VERIFY(hipEventRecord(startEvent_, stream_)); + HIP_VERIFY(hipEventRecord(startEvent_, stream_)); } KernelTimer::~KernelTimer() { - CUDA_VERIFY(hipEventDestroy(startEvent_)); - CUDA_VERIFY(hipEventDestroy(stopEvent_)); + HIP_VERIFY(hipEventDestroy(startEvent_)); + HIP_VERIFY(hipEventDestroy(stopEvent_)); } float KernelTimer::elapsedMilliseconds() { FAISS_ASSERT(valid_); - CUDA_VERIFY(hipEventRecord(stopEvent_, stream_)); - CUDA_VERIFY(hipEventSynchronize(stopEvent_)); + HIP_VERIFY(hipEventRecord(stopEvent_, stream_)); + HIP_VERIFY(hipEventSynchronize(stopEvent_)); auto time = 0.0f; - CUDA_VERIFY(hipEventElapsedTime(&time, startEvent_, stopEvent_)); + HIP_VERIFY(hipEventElapsedTime(&time, startEvent_, stopEvent_)); valid_ = false; return time; diff --git a/faiss/hip/utils/Transpose.h b/faiss/hip/utils/Transpose.h index 6e802ac0fd..d18b9f6af0 100644 --- a/faiss/hip/utils/Transpose.h +++ b/faiss/hip/utils/Transpose.h @@ -79,17 +79,13 @@ template __global__ void transposeAny( TensorInfo input, TensorInfo output, - idx_t totalSize) { + idx_t totalSize) __attribute__((amdgpu_flat_work_group_size(1,1024))) { for (idx_t i = idx_t(blockIdx.x) * blockDim.x + threadIdx.x; i < totalSize; i += gridDim.x * blockDim.x) { auto inputOffset = TensorInfoOffset::get(input, i); auto outputOffset = TensorInfoOffset::get(output, i); -#if __CUDA_ARCH__ >= 350 output.data[outputOffset] = __ldg(&input.data[inputOffset]); -#else - output.data[outputOffset] = input.data[inputOffset]; -#endif } } @@ -187,7 +183,7 @@ void runTransposeAny( <<>>(inInfo, outInfo, totalSize); } - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } } // namespace hip diff --git a/faiss/hip/utils/WarpSelectFloat.hip b/faiss/hip/utils/WarpSelectFloat.hip index 0c9375f5aa..20298c003b 100644 --- a/faiss/hip/utils/WarpSelectFloat.hip +++ b/faiss/hip/utils/WarpSelectFloat.hip @@ -55,8 +55,10 @@ void runWarpSelect( if (dir) { if (k == 1) { WARP_SELECT_CALL(float, true, 1); - //} else if (k <= 32) { - // WARP_SELECT_CALL(float, true, 32); +#if (__gfx1010__ || __gfx1011__ || __gfx1012__ || __gfx1030__ || __gfx1031__) + } else if (k <= 32) { + WARP_SELECT_CALL(float, true, 32); +#endif } else if (k <= 64) { WARP_SELECT_CALL(float, true, 64); } else if (k <= 128) { @@ -75,8 +77,10 @@ void runWarpSelect( } else { if (k == 1) { WARP_SELECT_CALL(float, false, 1); - //} else if (k <= 32) { - // WARP_SELECT_CALL(float, false, 32); +#if (__gfx1010__ || __gfx1011__ || __gfx1012__ || __gfx1030__ || __gfx1031__) + } else if (k <= 32) { + WARP_SELECT_CALL(float, false, 32); +#endif } else if (k <= 64) { WARP_SELECT_CALL(float, false, 64); } else if (k <= 128) { diff --git a/faiss/hip/utils/WarpShuffles.h b/faiss/hip/utils/WarpShuffles.h index 05a3ab2af3..5f2d9c398d 100644 --- a/faiss/hip/utils/WarpShuffles.h +++ b/faiss/hip/utils/WarpShuffles.h @@ -14,12 +14,7 @@ namespace faiss { namespace hip { // defines to simplify the SASS assembly structure file/line in the profiler -#if CUDA_VERSION >= 9000 -#define SHFL_SYNC(VAL, SRC_LANE, WIDTH) \ - __shfl_sync(0xffffffff, VAL, SRC_LANE, WIDTH) -#else #define SHFL_SYNC(VAL, SRC_LANE, WIDTH) __shfl(VAL, SRC_LANE, WIDTH) -#endif template inline __device__ T shfl(const T val, int srcLane, int width = kWarpSize) { diff --git a/faiss/hip/utils/blockselect/BlockSelectImpl.h b/faiss/hip/utils/blockselect/BlockSelectImpl.h index 0b6334a2aa..86f53edba6 100644 --- a/faiss/hip/utils/blockselect/BlockSelectImpl.h +++ b/faiss/hip/utils/blockselect/BlockSelectImpl.h @@ -60,7 +60,7 @@ THREAD_Q, \ kBlockSelectNumThreads> \ <<>>(in, outK, 
outV, kInit, vInit, k); \ - CUDA_TEST_ERROR(); \ + HIP_TEST_ERROR(); \ } \ \ void runBlockSelectPair_##TYPE##_##DIR##_##WARP_Q##_( \ @@ -93,7 +93,7 @@ THREAD_Q, \ kBlockSelectNumThreads><<>>( \ inK, inV, outK, outV, kInit, vInit, k); \ - CUDA_TEST_ERROR(); \ + HIP_TEST_ERROR(); \ } #define BLOCK_SELECT_CALL(TYPE, DIR, WARP_Q) \ diff --git a/faiss/hip/utils/warpselect/WarpSelectImpl.h b/faiss/hip/utils/warpselect/WarpSelectImpl.h index 6c8ec39fdd..6fb48a3cd0 100644 --- a/faiss/hip/utils/warpselect/WarpSelectImpl.h +++ b/faiss/hip/utils/warpselect/WarpSelectImpl.h @@ -38,7 +38,7 @@ \ warpSelect \ <<>>(in, outK, outV, kInit, vInit, k); \ - CUDA_TEST_ERROR(); \ + HIP_TEST_ERROR(); \ } #define WARP_SELECT_CALL(TYPE, DIR, WARP_Q) \
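
Note: several of the changes above (DeviceDefs.h, PtxUtils.h, and the k <= 32
select paths gated on the gfx10 targets) encode the same assumption: the code
is compiled for a 32-wide wavefront. A minimal sketch of how the two hard-coded
spots relate; getLaneIdSketch is illustrative and not part of the patch:

    #include <hip/hip_runtime.h>

    // Matches DeviceDefs.h above: pinned to 32 for the WF32 bring-up, while
    // __AMDGCN_WAVEFRONT_SIZE would report the actual compile-target width.
    constexpr int kWarpSize = 32;

    // Equivalent to the PtxUtils.h change, with the mask derived from
    // kWarpSize instead of a literal 32, so a later wave64 build would only
    // have to change the constant in one place.
    __device__ __forceinline__ int getLaneIdSketch() {
        static_assert((kWarpSize & (kWarpSize - 1)) == 0,
                      "kWarpSize must be a power of two");
        return threadIdx.x & (kWarpSize - 1);
    }
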