From 686a3db3d3aa5bf013cffeb86dcf0c8d66902ffb Mon Sep 17 00:00:00 2001 From: Johannes M Dieterich Date: Fri, 3 Nov 2023 16:54:44 -0500 Subject: [PATCH] Progress for WF32. --- faiss/hip/CMakeLists.txt | 2 +- faiss/hip/GpuIndex.hip | 2 +- faiss/hip/GpuResources.cpp | 2 +- faiss/hip/StandardGpuResources.cpp | 21 +- faiss/hip/impl/BinaryDistance.hip | 2 + faiss/hip/impl/BroadcastSum.hip | 6 +- faiss/hip/impl/DistanceUtils.h | 2 +- faiss/hip/impl/GeneralDistance.h | 3 +- faiss/hip/impl/IVFAppend.hip | 13 +- faiss/hip/impl/IVFFlatScan.hip | 7 +- faiss/hip/impl/IVFInterleaved.hip | 1 + faiss/hip/impl/IVFUtils.hip | 4 +- faiss/hip/impl/IVFUtilsSelect1.hip | 11 +- faiss/hip/impl/IVFUtilsSelect2.hip | 11 +- faiss/hip/impl/IcmEncoder.hip | 6 + faiss/hip/impl/L2Norm.hip | 4 +- faiss/hip/impl/L2Select.hip | 4 +- faiss/hip/impl/PQCodeDistances-inl.h | 6 +- faiss/hip/impl/PQCodeLoad.h | 244 ++++++------------ .../impl/PQScanMultiPassNoPrecomputed-inl.h | 8 +- faiss/hip/impl/PQScanMultiPassPrecomputed.hip | 10 +- faiss/hip/impl/VectorResidual.hip | 6 +- faiss/hip/impl/scan/IVFInterleavedImpl.h | 2 +- .../scan/IVFInterleavedScanKernelTemplate.hip | 1 + faiss/hip/perf/PerfBinaryFlat.hip | 6 +- faiss/hip/perf/PerfClustering.cpp | 6 +- faiss/hip/perf/PerfFlat.hip | 6 +- faiss/hip/perf/PerfIVFFlat.hip | 6 +- faiss/hip/perf/PerfIVFPQ.hip | 8 +- faiss/hip/perf/PerfIVFPQAdd.cpp | 6 +- faiss/hip/test/TestGpuMemoryException.cpp | 2 +- faiss/hip/utils/BlockSelectFloat.hip | 8 +- faiss/hip/utils/CopyUtils.h | 4 +- faiss/hip/utils/DeviceDefs.h | 2 +- faiss/hip/utils/DeviceTensor-inl.h | 2 +- faiss/hip/utils/DeviceUtils.h | 20 +- faiss/hip/utils/DeviceUtils.hip | 28 +- faiss/hip/utils/DeviceVector.h | 14 +- faiss/hip/utils/LoadStoreOperators.h | 29 ++- faiss/hip/utils/MatrixMult-inl.h | 4 +- faiss/hip/utils/MergeNetworkWarp.h | 11 +- faiss/hip/utils/PtxUtils.h | 2 +- faiss/hip/utils/Select.h | 8 - faiss/hip/utils/Tensor-inl.h | 12 +- faiss/hip/utils/Tensor.h | 16 -- faiss/hip/utils/Timer.cpp | 16 +- faiss/hip/utils/Transpose.h | 8 +- faiss/hip/utils/WarpSelectFloat.hip | 12 +- faiss/hip/utils/WarpShuffles.h | 5 - faiss/hip/utils/blockselect/BlockSelectImpl.h | 4 +- faiss/hip/utils/warpselect/WarpSelectImpl.h | 2 +- 51 files changed, 267 insertions(+), 358 deletions(-) diff --git a/faiss/hip/CMakeLists.txt b/faiss/hip/CMakeLists.txt index 5a3f95689b..4e36ca81f0 100644 --- a/faiss/hip/CMakeLists.txt +++ b/faiss/hip/CMakeLists.txt @@ -191,7 +191,7 @@ function(generate_ivf_interleaved_code) "128|1|1" "128|128|3" "128|256|4" - "128|64|2" #"128|32|2" TODO won't compile with a warpsize of 64. Changed to 64 + "128|32|2" #"128|32|2" TODO won't compile with a warpsize of 64. 
Re-enabled for wavefront size 32.
     "128|512|8"
     "128|64|3"
     "64|2048|8"
diff --git a/faiss/hip/GpuIndex.hip b/faiss/hip/GpuIndex.hip
index 821c873ad5..cba5e57268 100644
--- a/faiss/hip/GpuIndex.hip
+++ b/faiss/hip/GpuIndex.hip
@@ -397,7 +397,7 @@ void GpuIndex::searchFromCpuPaged_(
             eventPrev->streamWaitOnEvent(copyStream);
         }
 
-        CUDA_VERIFY(hipMemcpyAsync(
+        HIP_VERIFY(hipMemcpyAsync(
                 bufGpus[cur2BufIndex]->data(),
                 bufPinned[cur2BufIndex],
                 numToCopy * this->d * sizeof(float),
diff --git a/faiss/hip/GpuResources.cpp b/faiss/hip/GpuResources.cpp
index d3e4eed749..a9b58cd4a5 100644
--- a/faiss/hip/GpuResources.cpp
+++ b/faiss/hip/GpuResources.cpp
@@ -183,7 +183,7 @@ hipStream_t GpuResources::getAsyncCopyStreamCurrentDevice() {
 }
 
 void GpuResources::syncDefaultStream(int device) {
-    CUDA_VERIFY(hipStreamSynchronize(getDefaultStream(device)));
+    HIP_VERIFY(hipStreamSynchronize(getDefaultStream(device)));
 }
 
 void GpuResources::syncDefaultStreamCurrentDevice() {
diff --git a/faiss/hip/StandardGpuResources.cpp b/faiss/hip/StandardGpuResources.cpp
index df94db79c7..75448188c2 100644
--- a/faiss/hip/StandardGpuResources.cpp
+++ b/faiss/hip/StandardGpuResources.cpp
@@ -134,21 +134,21 @@ StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
         DeviceScope scope(entry.first);
 
         // We created these streams, so are responsible for destroying them
-        CUDA_VERIFY(hipStreamDestroy(entry.second));
+        HIP_VERIFY(hipStreamDestroy(entry.second));
     }
 
     for (auto& entry : alternateStreams_) {
         DeviceScope scope(entry.first);
 
         for (auto stream : entry.second) {
-            CUDA_VERIFY(hipStreamDestroy(stream));
+            HIP_VERIFY(hipStreamDestroy(stream));
         }
     }
 
     for (auto& entry : asyncCopyStreams_) {
         DeviceScope scope(entry.first);
 
-        CUDA_VERIFY(hipStreamDestroy(entry.second));
+        HIP_VERIFY(hipStreamDestroy(entry.second));
     }
 
     for (auto& entry : blasHandles_) {
@@ -349,20 +349,21 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
             prop.minor);
 
     // Our code is pre-built with and expects warpSize == 32, validate that
-    FAISS_ASSERT_FMT(
-            prop.warpSize == 64,
-            "Device id %d does not have expected warpSize of 64",
-            device);
+    // We can have either warp size (32 or 64), so this check is disabled
+    //FAISS_ASSERT_FMT(
+    //        prop.warpSize == 64,
+    //        "Device id %d does not have expected warpSize of 64",
+    //        device);
 
     // Create streams
     hipStream_t defaultStream = 0;
-    CUDA_VERIFY(
+    HIP_VERIFY(
             hipStreamCreateWithFlags(&defaultStream, hipStreamNonBlocking));
 
    defaultStreams_[device] = defaultStream;
 
    hipStream_t asyncCopyStream = 0;
-    CUDA_VERIFY(
+    HIP_VERIFY(
            hipStreamCreateWithFlags(&asyncCopyStream, hipStreamNonBlocking));
 
    asyncCopyStreams_[device] = asyncCopyStream;
 
@@ -370,7 +371,7 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
     std::vector<hipStream_t> deviceStreams;
     for (int j = 0; j < kNumStreams; ++j) {
         hipStream_t stream = 0;
-        CUDA_VERIFY(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
+        HIP_VERIFY(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
 
         deviceStreams.push_back(stream);
     }
diff --git a/faiss/hip/impl/BinaryDistance.hip b/faiss/hip/impl/BinaryDistance.hip
index d1c2327d65..a1e31bf9e0 100644
--- a/faiss/hip/impl/BinaryDistance.hip
+++ b/faiss/hip/impl/BinaryDistance.hip
@@ -228,6 +228,7 @@ void runBinaryDistanceAnySize(
                 <<<grid, block, 0, stream>>>(vecs, query, outK, outV, k);
     }
 #endif
+    HIP_TEST_ERROR();
 }
 
 template
@@ -269,6 +270,7 @@ void runBinaryDistanceLimitSize(
                 <<<grid, block, 0, stream>>>(vecs, query, outK, outV, k);
     }
 #endif
+    HIP_TEST_ERROR();
 }
 
 void runBinaryDistance(
diff --git a/faiss/hip/impl/BroadcastSum.hip b/faiss/hip/impl/BroadcastSum.hip
index 
91dfd70f20..0f9a4d0208 100644 --- a/faiss/hip/impl/BroadcastSum.hip +++ b/faiss/hip/impl/BroadcastSum.hip @@ -265,7 +265,7 @@ void runSumAlongColumns( <<>>(input, output); } - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } void runSumAlongColumns( @@ -320,7 +320,7 @@ void runAssignAlongColumns( <<>>(input, output); } - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } void runAssignAlongColumns( @@ -356,7 +356,7 @@ void runSumAlongRows( sumAlongRows<<>>(input, output); } - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } void runSumAlongRows( diff --git a/faiss/hip/impl/DistanceUtils.h b/faiss/hip/impl/DistanceUtils.h index d8a35efbb8..070a43eab4 100644 --- a/faiss/hip/impl/DistanceUtils.h +++ b/faiss/hip/impl/DistanceUtils.h @@ -326,7 +326,7 @@ void runIncrementIndex( auto block = std::min(k, getMaxThreadsCurrentDevice()); incrementIndex<<>>(indices, k, increment); - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } // If the inner size (dim) of the vectors is small, we want a larger query tile diff --git a/faiss/hip/impl/GeneralDistance.h b/faiss/hip/impl/GeneralDistance.h index 095dc10cff..529d829422 100644 --- a/faiss/hip/impl/GeneralDistance.h +++ b/faiss/hip/impl/GeneralDistance.h @@ -250,6 +250,7 @@ void runGeneralDistanceKernel( dim3 block(kWarpSize, kWarpSize); generalDistance<<>>(query, vecs, op, out); + HIP_TEST_ERROR(); } template @@ -449,7 +450,7 @@ void runGeneralDistance( FAISS_THROW_MSG("interrupted"); } - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } } // namespace hip diff --git a/faiss/hip/impl/IVFAppend.hip b/faiss/hip/impl/IVFAppend.hip index e7f3e712b3..053f951050 100644 --- a/faiss/hip/impl/IVFAppend.hip +++ b/faiss/hip/impl/IVFAppend.hip @@ -71,7 +71,7 @@ void runUpdateListPointers( listCodes.data(), listIndices.data()); - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } // Appends new indices for vectors being added to the IVF indices lists @@ -124,7 +124,7 @@ void runIVFIndicesAppend( ivfIndicesAppend<<>>( listIds, listOffset, indices, opt, listIndices.data()); - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } } @@ -259,7 +259,7 @@ void runIVFFlatAppend( } } - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); #undef RUN_APPEND } @@ -308,7 +308,7 @@ void runIVFPQAppend( ivfpqAppend<<>>( listIds, listOffset, encodings, listCodes.data()); - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } // @@ -341,6 +341,7 @@ void runSQEncode( idx_t blocks = vecs.getSize(0); sqEncode<<>>(vecs, encodedVecs, codec); + HIP_TEST_ERROR(); } // Handles appending encoded vectors (one per EncodeT word) packed into @@ -584,7 +585,7 @@ void runIVFFlatInterleavedAppend( } #undef RUN_APPEND - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } void runIVFPQInterleavedAppend( @@ -640,7 +641,7 @@ void runIVFPQInterleavedAppend( } #undef RUN_APPEND - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } } // namespace hip diff --git a/faiss/hip/impl/IVFFlatScan.hip b/faiss/hip/impl/IVFFlatScan.hip index f829b67914..d5100b999e 100644 --- a/faiss/hip/impl/IVFFlatScan.hip +++ b/faiss/hip/impl/IVFFlatScan.hip @@ -229,6 +229,7 @@ void runIVFFlatScanTile( metric, \ prefixSumOffsets, \ allDistances); \ + HIP_TEST_ERROR(); \ } while (0) #define HANDLE_METRICS \ @@ -291,7 +292,7 @@ void runIVFFlatScanTile( } } - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); #undef HANDLE_METRICS #undef RUN_IVF_FLAT @@ -414,9 +415,9 @@ void runIVFFlatScan( // Make sure the element before prefixSumOffsets is 0, since we // depend upon simple, boundary-less indexing to get proper results - CUDA_VERIFY(hipMemsetAsync( + HIP_VERIFY(hipMemsetAsync( prefixSumOffsetSpace1.data(), 0, sizeof(idx_t), stream)); - 
CUDA_VERIFY(hipMemsetAsync( + HIP_VERIFY(hipMemsetAsync( prefixSumOffsetSpace2.data(), 0, sizeof(idx_t), stream)); DeviceTensor allDistances1( diff --git a/faiss/hip/impl/IVFInterleaved.hip b/faiss/hip/impl/IVFInterleaved.hip index 6b06518758..17088babf9 100644 --- a/faiss/hip/impl/IVFInterleaved.hip +++ b/faiss/hip/impl/IVFInterleaved.hip @@ -172,6 +172,7 @@ void runIVFInterleavedScan2( IVF_SCAN_2(64, 2048, 8); } #endif + HIP_TEST_ERROR(); } void runIVFInterleavedScan( diff --git a/faiss/hip/impl/IVFUtils.hip b/faiss/hip/impl/IVFUtils.hip index 80faa619ba..d1ed23b324 100644 --- a/faiss/hip/impl/IVFUtils.hip +++ b/faiss/hip/impl/IVFUtils.hip @@ -62,7 +62,7 @@ void runCalcListOffsets( getResultLengths<<>>( ivfListIds, listLengths.data(), totalSize, prefixSumOffsets); - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); // Prefix sum of the indices, so we know where the intermediate // results should be maintained @@ -76,7 +76,7 @@ void runCalcListOffsets( prefixSumOffsets.data(), prefixSumOffsets.data() + totalSize, prefixSumOffsets.data()); - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } } // namespace hip diff --git a/faiss/hip/impl/IVFUtilsSelect1.hip b/faiss/hip/impl/IVFUtilsSelect1.hip index da49763ae2..97c2fd2371 100644 --- a/faiss/hip/impl/IVFUtilsSelect1.hip +++ b/faiss/hip/impl/IVFUtilsSelect1.hip @@ -125,6 +125,7 @@ void runPass1SelectLists( k, \ heapDistances, \ heapIndices); \ + HIP_TEST_ERROR(); \ return; /* success */ \ } while (0) @@ -135,8 +136,8 @@ void runPass1SelectLists( do { \ if (k == 1) { \ RUN_PASS(INDEX_T, 128, 1, 1, DIR); \ - /*} else if (k <= 32) { */ \ - /* RUN_PASS(INDEX_T, 128, 32, 2, DIR); */ \ + } else if (k <= 32) { \ + RUN_PASS(INDEX_T, 128, 32, 2, DIR); \ } else if (k <= 64) { \ RUN_PASS(INDEX_T, 128, 64, 3, DIR); \ } else if (k <= 128) { \ @@ -158,8 +159,8 @@ void runPass1SelectLists( do { \ if (k == 1) { \ RUN_PASS(INDEX_T, 128, 1, 1, DIR); \ - /*} else if (k <= 32) { */ \ - /* RUN_PASS(INDEX_T, 128, 32, 2, DIR); */ \ + } else if (k <= 32) { \ + RUN_PASS(INDEX_T, 128, 32, 2, DIR); \ } else if (k <= 64) { \ RUN_PASS(INDEX_T, 128, 64, 3, DIR); \ } else if (k <= 128) { \ @@ -192,7 +193,7 @@ void runPass1SelectLists( #undef RUN_PASS_DIR #undef RUN_PASS - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } } // namespace hip diff --git a/faiss/hip/impl/IVFUtilsSelect2.hip b/faiss/hip/impl/IVFUtilsSelect2.hip index c3e8668ac5..739876837a 100644 --- a/faiss/hip/impl/IVFUtilsSelect2.hip +++ b/faiss/hip/impl/IVFUtilsSelect2.hip @@ -184,6 +184,7 @@ void runPass2SelectLists( indicesOptions, \ outDistances, \ outIndices); \ + HIP_TEST_ERROR(); \ } while (0) #if GPU_MAX_SELECTION_K >= 2048 @@ -193,8 +194,8 @@ void runPass2SelectLists( do { \ if (k == 1) { \ RUN_PASS(INDEX_T, 128, 1, 1, DIR); \ - /*} else if (k <= 32) { */ \ - /* RUN_PASS(INDEX_T, 128, 32, 2, DIR);*/ \ + } else if (k <= 32) { \ + RUN_PASS(INDEX_T, 128, 32, 2, DIR); \ } else if (k <= 64) { \ RUN_PASS(INDEX_T, 128, 64, 3, DIR); \ } else if (k <= 128) { \ @@ -216,8 +217,8 @@ void runPass2SelectLists( do { \ if (k == 1) { \ RUN_PASS(INDEX_T, 128, 1, 1, DIR); \ - /*} else if (k <= 32) { */ \ - /* RUN_PASS(INDEX_T, 128, 32, 2, DIR); */ \ + } else if (k <= 32) { \ + RUN_PASS(INDEX_T, 128, 32, 2, DIR); \ } else if (k <= 64) { \ RUN_PASS(INDEX_T, 128, 64, 3, DIR); \ } else if (k <= 128) { \ @@ -250,7 +251,7 @@ void runPass2SelectLists( #undef RUN_PASS_DIR #undef RUN_PASS - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } } // namespace hip diff --git a/faiss/hip/impl/IcmEncoder.hip b/faiss/hip/impl/IcmEncoder.hip index 1f0db2491a..78fefb9989 
100644 --- a/faiss/hip/impl/IcmEncoder.hip +++ b/faiss/hip/impl/IcmEncoder.hip @@ -241,6 +241,7 @@ void IcmEncoderImpl::computeUnaryTerms( auto uPtr = uterm + m * n * K; auto nPtr = norm.data() + m * K; runNormAddition<<>>(uPtr, nPtr, K); + HIP_TEST_ERROR(); } } @@ -329,6 +330,7 @@ void IcmEncoderImpl::encode( M, K, dims); + HIP_TEST_ERROR(); int blockSize = 256; int numBlocks = (n + blockSize - 1) / blockSize; @@ -336,6 +338,7 @@ void IcmEncoderImpl::encode( for (int i = 0; i < ilsIters; i++) { runCodesPerturbation<<>>( gen(), codes.data(), n, M, K, nperts); + HIP_TEST_ERROR(); // perform icm encoding for (int j = 0; j < icmIters; j++) { @@ -347,6 +350,7 @@ void IcmEncoderImpl::encode( M, K, m); + HIP_TEST_ERROR(); } } @@ -360,6 +364,7 @@ void IcmEncoderImpl::encode( M, K, dims); + HIP_TEST_ERROR(); // if objs[i] < best_objs[i], replace best_codes[i] with codes[i] runCodesSelection<<>>( @@ -369,6 +374,7 @@ void IcmEncoderImpl::encode( objs.data(), n, M); + HIP_TEST_ERROR(); codes.copyFrom(bestCodes, stream); } diff --git a/faiss/hip/impl/L2Norm.hip b/faiss/hip/impl/L2Norm.hip index c66ad6b52c..61c31ffe52 100644 --- a/faiss/hip/impl/L2Norm.hip +++ b/faiss/hip/impl/L2Norm.hip @@ -239,6 +239,7 @@ void runL2Norm( <<>>(INPUT, output); \ } \ } \ + HIP_TEST_ERROR(); \ } while (0) if (inputRowMajor) { @@ -295,11 +296,12 @@ void runL2Norm( } else { l2NormColMajor<<>>(input, output); } + HIP_TEST_ERROR(); } #undef RUN_L2 - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } void runL2Norm( diff --git a/faiss/hip/impl/L2Select.hip b/faiss/hip/impl/L2Select.hip index 35975e41de..f56632eee8 100644 --- a/faiss/hip/impl/L2Select.hip +++ b/faiss/hip/impl/L2Select.hip @@ -215,6 +215,7 @@ void runL2SelectMin( centroidDistances, outDistances, outIndices); + HIP_TEST_ERROR(); } else { auto grid = dim3(outDistances.getSize(0)); @@ -240,6 +241,7 @@ void runL2SelectMin( } else { \ L2_KERNEL(int32_t, BLOCK, NUM_WARP_Q, NUM_THREAD_Q); \ } \ + HIP_TEST_ERROR(); \ } while (false) // block size 128 for everything <= 1024 @@ -269,7 +271,7 @@ void runL2SelectMin( #undef L2_KERNEL #undef RUN_L2_SELECT - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } void runL2SelectMin( diff --git a/faiss/hip/impl/PQCodeDistances-inl.h b/faiss/hip/impl/PQCodeDistances-inl.h index bafff11552..345e7abf29 100644 --- a/faiss/hip/impl/PQCodeDistances-inl.h +++ b/faiss/hip/impl/PQCodeDistances-inl.h @@ -350,7 +350,7 @@ void runPQResidualVector( residual); } - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } template @@ -391,6 +391,7 @@ void runPQDistanceIPCorrection( pqDistanceIPCorrection<<>>( codeView, coarseDistances, codeDistances.getSize(2)); + HIP_TEST_ERROR(); } // This is a general purpose implementation that leverages GEMM to calculate @@ -665,6 +666,7 @@ void runPQCodeDistances( coarseIndices, \ outCodeDistancesT); \ } \ + HIP_TEST_ERROR(); \ } while (0) #define CODE_L2(DIMS) \ @@ -729,7 +731,7 @@ void runPQCodeDistances( #undef RUN_CODE #undef CODE_L2 - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } } // namespace hip diff --git a/faiss/hip/impl/PQCodeLoad.h b/faiss/hip/impl/PQCodeLoad.h index d9cf1a8cd6..967f3ad22e 100644 --- a/faiss/hip/impl/PQCodeLoad.h +++ b/faiss/hip/impl/PQCodeLoad.h @@ -45,7 +45,10 @@ struct LoadCode32<1> { uint8_t* p, int offset) { p += offset * 1; - code32[0] = *p; + using T = uint8_t __attribute__((ext_vector_type(1))); + T* t = reinterpret_cast(p); + T* u = reinterpret_cast(code32); + u[0] = __builtin_nontemporal_load(t); //DONE asm("ld.global.cs.u8 {%0}, [%1];" : "=r"(code32[0]) : "l"(p)); } }; @@ -57,7 +60,10 @@ struct 
LoadCode32<2> {
             uint8_t* p,
             int offset) {
         p += offset * 2;
-        code32[0] = *p;
+        using T = uint8_t __attribute__((ext_vector_type(2)));
+        T* t = reinterpret_cast<T*>(p);
+        T* u = reinterpret_cast<T*>(code32);
+        u[0] = __builtin_nontemporal_load(t);
 //DONE        asm("ld.global.cs.u16 {%0}, [%1];" : "=r"(code32[0]) : "l"(p));
     }
 };
 
@@ -69,24 +75,18 @@ struct LoadCode32<3> {
             uint8_t* p,
             int offset) {
         p += offset * 3;
-        unsigned int a;
-        unsigned int b;
-        unsigned int c;
+        // Three byte loads, repacked exactly as before; a 3-wide byte vector
+        // would be padded to four bytes and read one byte past the code
+        uint32_t a = __builtin_nontemporal_load(p);
+        uint32_t b = __builtin_nontemporal_load(p + 1);
+        uint32_t c = __builtin_nontemporal_load(p + 2);
+        code32[0] = (c << 16) | (b << 8) | a;
         // FIXME: this is a non-coalesced, unaligned, non-vectorized load
         // unfortunately need to reorganize memory layout by warp
 //DONE        asm("ld.global.cs.u8 {%0}, [%1 + 0];" : "=r"(a) : "l"(p));
 //DONE        asm("ld.global.cs.u8 {%0}, [%1 + 1];" : "=r"(b) : "l"(p));
 //DONE        asm("ld.global.cs.u8 {%0}, [%1 + 2];" : "=r"(c) : "l"(p));
-        a = *p;
-        p += 1;
-        b = *p;
-        p += 1;
-        c = *p;
         // FIXME: this is also slow, since we have to recover the
         // individual bytes loaded
-        code32[0] = (c << 16) | (b << 8) | a;
-        p -= 2;
     }
 };
 
@@ -97,8 +97,11 @@ struct LoadCode32<4> {
             uint8_t* p,
             int offset) {
         p += offset * 4;
-        code32[0] = *p;
-//DONE asm("ld.global.cs.u32 {%0}, [%1];" : "=r"(code32[0]) : "l"(p));
+        using T = uint32_t __attribute__((ext_vector_type(1)));
+        T* t = reinterpret_cast<T*>(p);
+        T* u = reinterpret_cast<T*>(code32);
+        u[0] = __builtin_nontemporal_load(t);
+        //DONE asm("ld.global.cs.u32 {%0}, [%1];" : "=r"(code32[0]) : "l"(p));
     }
 };
 
@@ -109,11 +112,13 @@ struct LoadCode32<8> {
             uint8_t* p,
             int offset) {
         p += offset * 8;
-//DONE asm("ld.global.cs.v2.u32 {%0, %1}, [%2];"
+        using T = uint32_t __attribute__((ext_vector_type(2)));
+        T* t = reinterpret_cast<T*>(p);
+        T* u = reinterpret_cast<T*>(code32);
+        u[0] = __builtin_nontemporal_load(t);
+        //DONE asm("ld.global.cs.v2.u32 {%0, %1}, [%2];"
 //DONE        : "=r"(code32[0]), "=r"(code32[1])
 //DONE        : "l"(p));
-        code32[0] = *p;
-        code32[1] = *p;
     }
 };
 
@@ -124,14 +129,15 @@ struct LoadCode32<12> {
             uint8_t* p,
             int offset) {
         p += offset * 12;
+        // Three scalar u32 loads; clang widens a 3-wide u32 vector to a
+        // 16-byte access, which would overrun the 3-word code32 output
+        const uint32_t* q = reinterpret_cast<const uint32_t*>(p);
+        code32[0] = __builtin_nontemporal_load(q);
+        code32[1] = __builtin_nontemporal_load(q + 1);
+        code32[2] = __builtin_nontemporal_load(q + 2);
         // FIXME: this is a non-coalesced, unaligned, non-vectorized load
         // unfortunately need to reorganize memory layout by warp
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 0];" : "=r"(code32[0]) : "l"(p));
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 4];" : "=r"(code32[1]) : "l"(p));
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 8];" : "=r"(code32[2]) : "l"(p));
-        code32[0] = *p;
-        code32[1] = *p;
-        code32[2] = *p;
     }
 };
 
@@ -142,10 +148,10 @@ struct LoadCode32<16> {
             uint8_t* p,
             int offset) {
         p += offset * 16;
-        code32[0] = *p;
-        code32[1] = *p;
-        code32[2] = *p;
-        code32[3] = *p;
+        using T = uint32_t __attribute__((ext_vector_type(4)));
+        T* t = reinterpret_cast<T*>(p);
+        T* u = reinterpret_cast<T*>(code32);
+        u[0] = __builtin_nontemporal_load(t);
 //DONE        asm("ld.global.cs.v4.u32 {%0, %1, %2, %3}, [%4];"
 //DONE        : "=r"(code32[0]), "=r"(code32[1]), "=r"(code32[2]), "=r"(code32[3])
 //DONE        : "l"(p));
     }
 };
 
@@ -159,23 +165,25 @@ struct LoadCode32<20> {
             uint8_t* p,
             int offset) {
         p += offset * 20;
-        // FIXME: this is a non-coalesced, unaligned, non-vectorized load
+        // Five scalar u32 loads; these must go through a uint32_t pointer,
+        // since loading through the raw uint8_t* would fetch only one byte
+        // per 32-bit word
+        const uint32_t* q = reinterpret_cast<const uint32_t*>(p);
+        code32[0] = __builtin_nontemporal_load(q);
+        code32[1] = __builtin_nontemporal_load(q + 1);
+        code32[2] = __builtin_nontemporal_load(q + 2);
+        code32[3] = __builtin_nontemporal_load(q + 3);
+        code32[4] = __builtin_nontemporal_load(q + 4);
+
+        // FIXME: this is a non-coalesced, unaligned, non-vectorized load
         // unfortunately need to reorganize memory layout by warp
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 0];" : "=r"(code32[0]) : "l"(p));
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 4];" : "=r"(code32[1]) : "l"(p));
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 8];" : "=r"(code32[2]) : "l"(p));
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 12];" : "=r"(code32[3]) : "l"(p));
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 16];" : "=r"(code32[4]) : "l"(p));
-        code32[0] = *p;
-        p += 4;
-        code32[1] = *p;
-        p += 4;
-        code32[2] = *p;
-        p += 4;
-        code32[3] = *p;
-        p += 4;
-        code32[4] = *p;
-        p -= 16;
     }
 };
 
@@ -186,6 +194,10 @@ struct LoadCode32<24> {
             uint8_t* p,
             int offset) {
         p += offset * 24;
+        using T = uint32_t __attribute__((ext_vector_type(6)));
+        T* t = reinterpret_cast<T*>(p);
+        T* u = reinterpret_cast<T*>(code32);
+        u[0] = __builtin_nontemporal_load(t);
         // FIXME: this is a non-coalesced, unaligned, 2-vectorized load
         // unfortunately need to reorganize memory layout by warp
 //DONE        asm(LD_NC_V2 " {%0, %1}, [%2 + 0];"
 //DONE        : "=r"(code32[0]), "=r"(code32[1])
 //DONE        : "l"(p));
 //DONE        asm(LD_NC_V2 " {%0, %1}, [%2 + 8];"
 //DONE        : "=r"(code32[2]), "=r"(code32[3])
 //DONE        : "l"(p));
 //DONE        asm(LD_NC_V2 " {%0, %1}, [%2 + 16];"
 //DONE        : "=r"(code32[4]), "=r"(code32[5])
 //DONE        : "l"(p));
-        code32[0] = *p;
-        code32[1] = *p;
-        p += 8;
-        code32[2] = *p;
-        code32[3] = *p;
-        p += 8;
-        code32[4] = *p;
-        code32[5] = *p;
-        p -= 16;
     }
 };
 
@@ -216,6 +219,10 @@ struct LoadCode32<28> {
             uint8_t* p,
             int offset) {
         p += offset * 28;
+        using T = uint32_t __attribute__((ext_vector_type(7)));
+        T* t = reinterpret_cast<T*>(p);
+        T* u = reinterpret_cast<T*>(code32);
+        u[0] = __builtin_nontemporal_load(t);
         // FIXME: this is a non-coalesced, unaligned, non-vectorized load
         // unfortunately need to reorganize memory layout by warp
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 0];" : "=r"(code32[0]) : "l"(p));
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 4];" : "=r"(code32[1]) : "l"(p));
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 8];" : "=r"(code32[2]) : "l"(p));
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 12];" : "=r"(code32[3]) : "l"(p));
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 16];" : "=r"(code32[4]) : "l"(p));
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 20];" : "=r"(code32[5]) : "l"(p));
 //DONE        asm(LD_NC_V1 " {%0}, [%1 + 24];" : "=r"(code32[6]) : "l"(p));
-        code32[0] = *p;
-        p += 4;
-        code32[1] = *p;
-        p += 4;
-        code32[2] = *p;
-        p += 4;
-        code32[3] = *p;
-        p += 4;
-        code32[4] = *p;
-        p += 4;
-        code32[5] = *p;
-        p += 4;
-        code32[6] = *p;
-        p -= 24;
     }
 };
 
@@ -249,6 +242,10 @@ struct LoadCode32<32> {
             uint8_t* p,
             int offset) {
         p += offset * 32;
+        using T = uint32_t __attribute__((ext_vector_type(8)));
+        T* t = reinterpret_cast<T*>(p);
+        T* u = reinterpret_cast<T*>(code32);
+        u[0] = __builtin_nontemporal_load(t);
         // FIXME: this is a non-coalesced load
         // unfortunately need to reorganize memory layout by warp
 //DONE        asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];"
 //DONE        : "=r"(code32[0]), "=r"(code32[1]), "=r"(code32[2]), "=r"(code32[3])
 //DONE        : "l"(p));
 //DONE        asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4 + 16];"
 //DONE        : "=r"(code32[4]), "=r"(code32[5]), "=r"(code32[6]), "=r"(code32[7])
 //DONE        : "l"(p));
-        code32[0] = *p;
-        code32[1] = *p;
-        code32[2] = *p;
-        code32[3] = *p;
-        p += 16;
-        code32[4] = *p;
-        code32[5] = *p;
-        code32[6] = *p;
-        code32[7] = *p;
-        p -= 16;
     }
 };
 
@@ -277,6 +264,10 @@ struct LoadCode32<40> {
             uint8_t* p,
             int offset) {
         p += offset * 40;
+        using T = uint32_t __attribute__((ext_vector_type(10)));
+        T* t = reinterpret_cast<T*>(p);
+        T* u = reinterpret_cast<T*>(code32);
+        u[0] = __builtin_nontemporal_load(t);
         // FIXME: this is a non-coalesced, unaligned, 2-vectorized load
         // unfortunately need to reorganize memory layout by warp
 //DONE        asm(LD_NC_V2 " {%0, %1}, [%2 + 0];"
@@ 
-294,21 +285,6 @@ struct LoadCode32<40> { //DONE asm(LD_NC_V2 " {%0, %1}, [%2 + 32];" //DONE : "=r"(code32[8]), "=r"(code32[9]) //DONE : "l"(p)); - code32[0] = *p; - code32[1] = *p; - p += 8; - code32[2] = *p; - code32[3] = *p; - p += 8; - code32[4] = *p; - code32[5] = *p; - p += 8; - code32[6] = *p; - code32[7] = *p; - p += 8; - code32[8] = *p; - code32[9] = *p; - p -= 32; } }; @@ -319,6 +295,10 @@ struct LoadCode32<48> { uint8_t* p, int offset) { p += offset * 48; + using T = uint32_t __attribute__((ext_vector_type(12))); + T* t = reinterpret_cast(p); + T* u = reinterpret_cast(code32); + u[0] = __builtin_nontemporal_load(t); // FIXME: this is a non-coalesced load // unfortunately need to reorganize memory layout by warp //DONE asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" @@ -333,21 +313,6 @@ struct LoadCode32<48> { //DONE "=r"(code32[10]), //DONE "=r"(code32[11]) //DONE : "l"(p)); - code32[0] = *p; - code32[1] = *p; - code32[2] = *p; - code32[3] = *p; - p += 16; - code32[4] = *p; - code32[5] = *p; - code32[6] = *p; - code32[7] = *p; - p += 16; - code32[8] = *p; - code32[9] = *p; - code32[10] = *p; - code32[11] = *p; - p -= 32; } }; @@ -358,6 +323,10 @@ struct LoadCode32<56> { uint8_t* p, int offset) { p += offset * 56; + using T = uint32_t __attribute__((ext_vector_type(14))); + T* t = reinterpret_cast(p); + T* u = reinterpret_cast(code32); + u[0] = __builtin_nontemporal_load(t); // FIXME: this is a non-coalesced, unaligned, 2-vectorized load // unfortunately need to reorganize memory layout by warp //DONE asm(LD_NC_V2 " {%0, %1}, [%2 + 0];" @@ -381,27 +350,6 @@ struct LoadCode32<56> { //DONE asm(LD_NC_V2 " {%0, %1}, [%2 + 48];" //DONE : "=r"(code32[12]), "=r"(code32[13]) //DONE : "l"(p)); - code32[0] = *p; - code32[1] = *p; - p += 8; - code32[2] = *p; - code32[3] = *p; - p += 8; - code32[4] = *p; - code32[5] = *p; - p += 8; - code32[6] = *p; - code32[7] = *p; - p += 8; - code32[8] = *p; - code32[9] = *p; - p += 8; - code32[10] = *p; - code32[11] = *p; - p += 8; - code32[12] = *p; - code32[13] = *p; - p -= 48; } }; @@ -412,6 +360,10 @@ struct LoadCode32<64> { uint8_t* p, int offset) { p += offset * 64; + using T = uint32_t __attribute__((ext_vector_type(16))); + T* t = reinterpret_cast(p); + T* u = reinterpret_cast(code32); + u[0] = __builtin_nontemporal_load(t); // FIXME: this is a non-coalesced load // unfortunately need to reorganize memory layout by warp //DONE asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" @@ -432,26 +384,6 @@ struct LoadCode32<64> { //DONE "=r"(code32[14]), //DONE "=r"(code32[15]) //DONE : "l"(p)); - code32[0] = *p; - code32[1] = *p; - code32[2] = *p; - code32[3] = *p; - p += 16; - code32[4] = *p; - code32[5] = *p; - code32[6] = *p; - code32[7] = *p; - p += 16; - code32[8] = *p; - code32[9] = *p; - code32[10] = *p; - code32[11] = *p; - p += 16; - code32[12] = *p; - code32[13] = *p; - code32[14] = *p; - code32[15] = *p; - p -= 48; } }; @@ -462,6 +394,10 @@ struct LoadCode32<96> { uint8_t* p, int offset) { p += offset * 96; + using T = uint32_t __attribute__((ext_vector_type(24))); + T* t = reinterpret_cast(p); + T* u = reinterpret_cast(code32); + u[0] = __builtin_nontemporal_load(t); // FIXME: this is a non-coalesced load // unfortunately need to reorganize memory layout by warp //DONE asm(LD_NC_V4 " {%0, %1, %2, %3}, [%4];" @@ -494,36 +430,6 @@ struct LoadCode32<96> { //DONE "=r"(code32[22]), //DONE "=r"(code32[23]) //DONE : "l"(p)); - code32[0] = *p; - code32[1] = *p; - code32[2] = *p; - code32[3] = *p; - p += 16; - code32[4] = *p; - code32[5] = *p; - code32[6] = *p; - 
code32[7] = *p; - p += 16; - code32[8] = *p; - code32[9] = *p; - code32[10] = *p; - code32[11] = *p; - p += 16; - code32[12] = *p; - code32[13] = *p; - code32[14] = *p; - code32[15] = *p; - p += 16; - code32[16] = *p; - code32[17] = *p; - code32[18] = *p; - code32[19] = *p; - p += 16; - code32[20] = *p; - code32[21] = *p; - code32[22] = *p; - code32[23] = *p; - p -= 80; } }; diff --git a/faiss/hip/impl/PQScanMultiPassNoPrecomputed-inl.h b/faiss/hip/impl/PQScanMultiPassNoPrecomputed-inl.h index 2c180589d3..d2078e18ca 100644 --- a/faiss/hip/impl/PQScanMultiPassNoPrecomputed-inl.h +++ b/faiss/hip/impl/PQScanMultiPassNoPrecomputed-inl.h @@ -353,6 +353,7 @@ void runMultiPassTile( listLengths.data(), \ prefixSumOffsets, \ allDistances); \ + HIP_TEST_ERROR(); \ } while (0) if (useFloat16Lookup) { @@ -424,6 +425,7 @@ void runMultiPassTile( listLengths.data(), \ prefixSumOffsets, \ allDistances); \ + HIP_TEST_ERROR(); \ } while (0) #define RUN_PQ(NUM_SUB_Q) \ @@ -493,7 +495,7 @@ void runMultiPassTile( #undef RUN_PQ_OPT } - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); // k-select the output in chunks, to increase parallelism runPass1SelectLists( @@ -624,9 +626,9 @@ void runPQScanMultiPassNoPrecomputed( // Make sure the element before prefixSumOffsets is 0, since we // depend upon simple, boundary-less indexing to get proper results - CUDA_VERIFY(hipMemsetAsync( + HIP_VERIFY(hipMemsetAsync( prefixSumOffsetSpace1.data(), 0, sizeof(idx_t), stream)); - CUDA_VERIFY(hipMemsetAsync( + HIP_VERIFY(hipMemsetAsync( prefixSumOffsetSpace2.data(), 0, sizeof(idx_t), stream)); idx_t codeDistanceTypeSize = diff --git a/faiss/hip/impl/PQScanMultiPassPrecomputed.hip b/faiss/hip/impl/PQScanMultiPassPrecomputed.hip index 519227033f..7b08f196e6 100644 --- a/faiss/hip/impl/PQScanMultiPassPrecomputed.hip +++ b/faiss/hip/impl/PQScanMultiPassPrecomputed.hip @@ -357,6 +357,7 @@ void runMultiPassTile( listLengths.data(), \ prefixSumOffsets, \ allDistances); \ + HIP_TEST_ERROR(); \ } while (0) if (useFloat16Lookup) { @@ -432,6 +433,7 @@ void runMultiPassTile( listLengths.data(), \ prefixSumOffsets, \ allDistances); \ + HIP_TEST_ERROR(); \ } while (0) #define RUN_PQ(NUM_SUB_Q) \ @@ -497,7 +499,7 @@ void runMultiPassTile( break; } - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); #undef RUN_PQ #undef RUN_PQ_OPT @@ -534,7 +536,7 @@ void runMultiPassTile( outIndices, stream); - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } void runPQScanMultiPassPrecomputed( @@ -633,9 +635,9 @@ void runPQScanMultiPassPrecomputed( // Make sure the element before prefixSumOffsets is 0, since we // depend upon simple, boundary-less indexing to get proper results - CUDA_VERIFY(hipMemsetAsync( + HIP_VERIFY(hipMemsetAsync( prefixSumOffsetSpace1.data(), 0, sizeof(idx_t), stream)); - CUDA_VERIFY(hipMemsetAsync( + HIP_VERIFY(hipMemsetAsync( prefixSumOffsetSpace2.data(), 0, sizeof(idx_t), stream)); DeviceTensor allDistances1( diff --git a/faiss/hip/impl/VectorResidual.hip b/faiss/hip/impl/VectorResidual.hip index b0a92cd28d..29f3f5d9ea 100644 --- a/faiss/hip/impl/VectorResidual.hip +++ b/faiss/hip/impl/VectorResidual.hip @@ -82,7 +82,7 @@ void calcResidual( vecs, centroids, vecToCentroid, residuals); } - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } void runCalcResidual( @@ -152,7 +152,7 @@ void gatherReconstructByIds( gatherReconstructByIds<<>>(ids, vecs, out); - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } template @@ -175,7 +175,7 @@ void gatherReconstructByRange( gatherReconstructByRange <<>>(start, num, vecs, out); - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } void runReconstruct( 
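
Note: the PQCodeLoad.h rewrite above follows one pattern throughout: the NVIDIA
ld.global.cs inline assembly is replaced by a clang extended vector type plus
__builtin_nontemporal_load, so a single builtin streams a whole group of code
words through the cache-bypassing path. A minimal sketch of the pattern for the
16-byte case, mirroring LoadCode32<16> above; the helper name loadCode16B is
illustrative and not part of the patch:

    #include <hip/hip_runtime.h>
    #include <cstdint>

    // Loads 16 bytes (4 x u32) of PQ code with one nontemporal vector load.
    // Assumes p is 16-byte aligned, the same requirement the old
    // ld.global.cs.v4.u32 asm had.
    __device__ inline void loadCode16B(uint32_t code32[4], const uint8_t* p) {
        using Vec4 = uint32_t __attribute__((ext_vector_type(4)));
        const Vec4* src = reinterpret_cast<const Vec4*>(p);
        *reinterpret_cast<Vec4*>(code32) = __builtin_nontemporal_load(src);
    }

For the sizes that fall back to scalar loads (the 20-byte case, for example),
the loads go through a uint32_t pointer, since reading through the raw uint8_t
pointer would fetch only one byte per 32-bit word.
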
diff --git a/faiss/hip/impl/scan/IVFInterleavedImpl.h b/faiss/hip/impl/scan/IVFInterleavedImpl.h index b62880bc95..aa48082f7f 100644 --- a/faiss/hip/impl/scan/IVFInterleavedImpl.h +++ b/faiss/hip/impl/scan/IVFInterleavedImpl.h @@ -252,7 +252,7 @@ void ivfInterleavedScanImpl( FAISS_ASSERT(false); } - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } } // namespace hip diff --git a/faiss/hip/impl/scan/IVFInterleavedScanKernelTemplate.hip b/faiss/hip/impl/scan/IVFInterleavedScanKernelTemplate.hip index 0821ed4a48..5ca6d76c30 100644 --- a/faiss/hip/impl/scan/IVFInterleavedScanKernelTemplate.hip +++ b/faiss/hip/impl/scan/IVFInterleavedScanKernelTemplate.hip @@ -60,6 +60,7 @@ void IVFINT_RUN< distanceTemp, indicesTemp, useResidual); + HIP_TEST_ERROR(); runIVFInterleavedScan2( distanceTemp, diff --git a/faiss/hip/perf/PerfBinaryFlat.hip b/faiss/hip/perf/PerfBinaryFlat.hip index 99a2d397dd..6cd42f8ce6 100644 --- a/faiss/hip/perf/PerfBinaryFlat.hip +++ b/faiss/hip/perf/PerfBinaryFlat.hip @@ -92,7 +92,7 @@ int main(int argc, char** argv) { HostTensor gpuDistances({numQueries, FLAGS_k}); HostTensor gpuIndices({numQueries, FLAGS_k}); - CUDA_VERIFY(hipProfilerStart()); + HIP_VERIFY(hipProfilerStart()); faiss::hip::synchronizeAllDevices(); float gpuTime = 0.0f; @@ -113,10 +113,10 @@ int main(int argc, char** argv) { gpuTime = timer.elapsedMilliseconds(); } - CUDA_VERIFY(hipProfilerStop()); + HIP_VERIFY(hipProfilerStop()); printf("GPU time %.3f ms\n", gpuTime); - CUDA_VERIFY(hipDeviceSynchronize()); + HIP_VERIFY(hipDeviceSynchronize()); return 0; } diff --git a/faiss/hip/perf/PerfClustering.cpp b/faiss/hip/perf/PerfClustering.cpp index 5e52bf7ce8..f47573c249 100644 --- a/faiss/hip/perf/PerfClustering.cpp +++ b/faiss/hip/perf/PerfClustering.cpp @@ -86,7 +86,7 @@ int main(int argc, char** argv) { IndexWrapper gpuIndex(FLAGS_num_gpus, initFn); - CUDA_VERIFY(cudaProfilerStart()); + HIP_VERIFY(cudaProfilerStart()); faiss::hip::synchronizeAllDevices(); float gpuTime = 0.0f; @@ -112,10 +112,10 @@ int main(int argc, char** argv) { gpuTime = timer.elapsedMilliseconds(); } - CUDA_VERIFY(cudaProfilerStop()); + HIP_VERIFY(cudaProfilerStop()); printf("k-means time %.3f ms\n", gpuTime); - CUDA_VERIFY(cudaDeviceSynchronize()); + HIP_VERIFY(cudaDeviceSynchronize()); return 0; } diff --git a/faiss/hip/perf/PerfFlat.hip b/faiss/hip/perf/PerfFlat.hip index b04002354b..160fd707b3 100644 --- a/faiss/hip/perf/PerfFlat.hip +++ b/faiss/hip/perf/PerfFlat.hip @@ -116,7 +116,7 @@ int main(int argc, char** argv) { HostTensor gpuDistances({numQueries, FLAGS_k}); HostTensor gpuIndices({numQueries, FLAGS_k}); - CUDA_VERIFY(hipProfilerStart()); + HIP_VERIFY(hipProfilerStart()); faiss::hip::synchronizeAllDevices(); float gpuTime = 0.0f; @@ -137,7 +137,7 @@ int main(int argc, char** argv) { gpuTime = timer.elapsedMilliseconds(); } - CUDA_VERIFY(hipProfilerStop()); + HIP_VERIFY(hipProfilerStop()); printf("GPU time %.3f ms\n", gpuTime); if (FLAGS_cpu) { @@ -154,7 +154,7 @@ int main(int argc, char** argv) { false); } - CUDA_VERIFY(hipDeviceSynchronize()); + HIP_VERIFY(hipDeviceSynchronize()); return 0; } diff --git a/faiss/hip/perf/PerfIVFFlat.hip b/faiss/hip/perf/PerfIVFFlat.hip index ef47f728ab..b9e200eaf7 100644 --- a/faiss/hip/perf/PerfIVFFlat.hip +++ b/faiss/hip/perf/PerfIVFFlat.hip @@ -115,7 +115,7 @@ int main(int argc, char** argv) { HostTensor gpuDistances({numQueries, FLAGS_k}); HostTensor gpuIndices({numQueries, FLAGS_k}); - CUDA_VERIFY(hipProfilerStart()); + HIP_VERIFY(hipProfilerStart()); faiss::hip::synchronizeAllDevices(); float 
gpuTime = 0.0f; @@ -136,7 +136,7 @@ int main(int argc, char** argv) { gpuTime = timer.elapsedMilliseconds(); } - CUDA_VERIFY(hipProfilerStop()); + HIP_VERIFY(hipProfilerStop()); printf("GPU time %.3f ms\n", gpuTime); compareLists( @@ -151,7 +151,7 @@ int main(int argc, char** argv) { FLAGS_diff, false); - CUDA_VERIFY(hipDeviceSynchronize()); + HIP_VERIFY(hipDeviceSynchronize()); return 0; } diff --git a/faiss/hip/perf/PerfIVFPQ.hip b/faiss/hip/perf/PerfIVFPQ.hip index 85be2f7451..61731ca12f 100644 --- a/faiss/hip/perf/PerfIVFPQ.hip +++ b/faiss/hip/perf/PerfIVFPQ.hip @@ -40,7 +40,7 @@ using namespace faiss::hip; int main(int argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); - CUDA_VERIFY(hipProfilerStop()); + HIP_VERIFY(hipProfilerStop()); auto seed = FLAGS_seed != -1L ? FLAGS_seed : time(nullptr); printf("using seed %ld\n", seed); @@ -125,7 +125,7 @@ int main(int argc, char** argv) { HostTensor gpuDistances({numQueries, FLAGS_k}); HostTensor gpuIndices({numQueries, FLAGS_k}); - CUDA_VERIFY(hipProfilerStart()); + HIP_VERIFY(hipProfilerStart()); faiss::hip::synchronizeAllDevices(); float gpuTime = 0.0f; @@ -146,7 +146,7 @@ int main(int argc, char** argv) { gpuTime = timer.elapsedMilliseconds(); } - CUDA_VERIFY(hipProfilerStop()); + HIP_VERIFY(hipProfilerStop()); printf("GPU time %.3f ms\n", gpuTime); compareLists( @@ -161,7 +161,7 @@ int main(int argc, char** argv) { FLAGS_diff, false); - CUDA_VERIFY(hipDeviceSynchronize()); + HIP_VERIFY(hipDeviceSynchronize()); return 0; } diff --git a/faiss/hip/perf/PerfIVFPQAdd.cpp b/faiss/hip/perf/PerfIVFPQAdd.cpp index e43f09b7c0..7650bbb6c1 100644 --- a/faiss/hip/perf/PerfIVFPQAdd.cpp +++ b/faiss/hip/perf/PerfIVFPQAdd.cpp @@ -74,7 +74,7 @@ int main(int argc, char** argv) { } cudaDeviceSynchronize(); - CUDA_VERIFY(cudaProfilerStart()); + HIP_VERIFY(cudaProfilerStart()); float totalGpuTime = 0.0f; float totalCpuTime = 0.0f; @@ -91,7 +91,7 @@ int main(int argc, char** argv) { if (FLAGS_time_gpu) { faiss::hip::CpuTimer timer; gpuIndex.add(FLAGS_batch_size, addVecs.data()); - CUDA_VERIFY(cudaDeviceSynchronize()); + HIP_VERIFY(cudaDeviceSynchronize()); auto time = timer.elapsedMilliseconds(); totalGpuTime += time; @@ -122,7 +122,7 @@ int main(int argc, char** argv) { } } - CUDA_VERIFY(cudaProfilerStop()); + HIP_VERIFY(cudaProfilerStop()); int total = FLAGS_batch_size * FLAGS_batches; diff --git a/faiss/hip/test/TestGpuMemoryException.cpp b/faiss/hip/test/TestGpuMemoryException.cpp index 0af8c3bca8..976d1655aa 100644 --- a/faiss/hip/test/TestGpuMemoryException.cpp +++ b/faiss/hip/test/TestGpuMemoryException.cpp @@ -20,7 +20,7 @@ TEST(TestGpuMemoryException, AddException) { size_t devFree = 0; size_t devTotal = 0; - CUDA_VERIFY(hipMemGetInfo(&devFree, &devTotal)); + HIP_VERIFY(hipMemGetInfo(&devFree, &devTotal)); // Figure out the dimensionality needed to get at least greater than // devTotal diff --git a/faiss/hip/utils/BlockSelectFloat.hip b/faiss/hip/utils/BlockSelectFloat.hip index de6ac40b3e..bf2db1bdda 100644 --- a/faiss/hip/utils/BlockSelectFloat.hip +++ b/faiss/hip/utils/BlockSelectFloat.hip @@ -55,7 +55,7 @@ void runBlockSelect( if (dir) { if (k == 1) { BLOCK_SELECT_CALL(float, true, 1); - //} else if (k <= 32) { + //} else if (k <= 32) { //(kWarpSize == 32 && k <= 32) // BLOCK_SELECT_CALL(float, true, 32); } else if (k <= 64) { BLOCK_SELECT_CALL(float, true, 64); @@ -75,7 +75,7 @@ void runBlockSelect( } else { if (k == 1) { BLOCK_SELECT_CALL(float, false, 1); - //} else if (k <= 32) { + //} else if (k <= 32) { // (kWarpSize == 
32 && k <= 32)
 //            BLOCK_SELECT_CALL(float, false, 32);
         } else if (k <= 64) {
             BLOCK_SELECT_CALL(float, false, 64);
diff --git a/faiss/hip/utils/CopyUtils.h b/faiss/hip/utils/CopyUtils.h
index 0c6cb6a44e..59158ba4b9 100644
--- a/faiss/hip/utils/CopyUtils.h
+++ b/faiss/hip/utils/CopyUtils.h
@@ -117,10 +117,10 @@ inline void fromDevice(T* src, T* dst, size_t num, hipStream_t stream) {
     int dev = getDeviceForAddress(dst);
 
     if (dev == -1) {
-        CUDA_VERIFY(hipMemcpyAsync(
+        HIP_VERIFY(hipMemcpyAsync(
                 dst, src, num * sizeof(T), hipMemcpyDeviceToHost, stream));
     } else {
-        CUDA_VERIFY(hipMemcpyAsync(
+        HIP_VERIFY(hipMemcpyAsync(
                 dst, src, num * sizeof(T), hipMemcpyDeviceToDevice, stream));
     }
 }
diff --git a/faiss/hip/utils/DeviceDefs.h b/faiss/hip/utils/DeviceDefs.h
index 77b195d6dc..fb5bb53c74 100644
--- a/faiss/hip/utils/DeviceDefs.h
+++ b/faiss/hip/utils/DeviceDefs.h
@@ -14,7 +14,7 @@ namespace faiss {
 namespace hip {
 
 // We validate this against the actual architecture in device initialization
-constexpr int kWarpSize = warpSize; // = 64 (Defined in hip_runtime.h)
+constexpr int kWarpSize = 32; // __AMDGCN_WAVEFRONT_SIZE (defined in hip_runtime.h) is either 32 or 64; pinned to 32 for the WF32 bring-up
 
 // This is a memory barrier for intra-warp writes to shared memory. 
__forceinline__ __device__ void warpFence() { diff --git a/faiss/hip/utils/DeviceTensor-inl.h b/faiss/hip/utils/DeviceTensor-inl.h index 9776e5a221..5b37f69048 100644 --- a/faiss/hip/utils/DeviceTensor-inl.h +++ b/faiss/hip/utils/DeviceTensor-inl.h @@ -181,7 +181,7 @@ __host__ DeviceTensor& DeviceTensor< // Region must be contiguous FAISS_ASSERT(this->isContiguous()); - CUDA_VERIFY(hipMemsetAsync( + HIP_VERIFY(hipMemsetAsync( this->data_, 0, this->getSizeInBytes(), stream)); } diff --git a/faiss/hip/utils/DeviceUtils.h b/faiss/hip/utils/DeviceUtils.h index 74a3bf92c9..3b69f32c9f 100644 --- a/faiss/hip/utils/DeviceUtils.h +++ b/faiss/hip/utils/DeviceUtils.h @@ -134,8 +134,8 @@ class CudaEvent { hipEvent_t event_; }; -/// Wrapper to test return status of CUDA functions -#define CUDA_VERIFY(X) \ +/// Wrapper to test return status of HIP functions +#define HIP_VERIFY(X) \ do { \ auto err__ = (X); \ FAISS_ASSERT_FMT( \ @@ -149,14 +149,14 @@ class CudaEvent { // #define FAISS_GPU_SYNC_ERROR 1 #ifdef FAISS_GPU_SYNC_ERROR -#define CUDA_TEST_ERROR() \ +#define HIP_TEST_ERROR() \ do { \ - CUDA_VERIFY(hipDeviceSynchronize()); \ + HIP_VERIFY(hipDeviceSynchronize()); \ } while (0) #else -#define CUDA_TEST_ERROR() \ +#define HIP_TEST_ERROR() \ do { \ - CUDA_VERIFY(hipGetLastError()); \ + HIP_VERIFY(hipGetLastError()); \ } while (0) #endif @@ -167,20 +167,20 @@ void streamWaitBase(const L1& listWaiting, const L2& listWaitOn) { std::vector events; for (auto& stream : listWaitOn) { hipEvent_t event; - CUDA_VERIFY(hipEventCreateWithFlags(&event, hipEventDisableTiming)); - CUDA_VERIFY(hipEventRecord(event, stream)); + HIP_VERIFY(hipEventCreateWithFlags(&event, hipEventDisableTiming)); + HIP_VERIFY(hipEventRecord(event, stream)); events.push_back(event); } // For all the streams that are waiting, issue a wait for (auto& stream : listWaiting) { for (auto& event : events) { - CUDA_VERIFY(hipStreamWaitEvent(stream, event, 0)); + HIP_VERIFY(hipStreamWaitEvent(stream, event, 0)); } } for (auto& event : events) { - CUDA_VERIFY(hipEventDestroy(event)); + HIP_VERIFY(hipEventDestroy(event)); } } diff --git a/faiss/hip/utils/DeviceUtils.hip b/faiss/hip/utils/DeviceUtils.hip index b48a6250bc..600bcfea79 100644 --- a/faiss/hip/utils/DeviceUtils.hip +++ b/faiss/hip/utils/DeviceUtils.hip @@ -18,14 +18,14 @@ namespace hip { int getCurrentDevice() { int dev = -1; - CUDA_VERIFY(hipGetDevice(&dev)); + HIP_VERIFY(hipGetDevice(&dev)); FAISS_ASSERT(dev != -1); return dev; } void setCurrentDevice(int device) { - CUDA_VERIFY(hipSetDevice(device)); + HIP_VERIFY(hipSetDevice(device)); } int getNumDevices() { @@ -34,7 +34,7 @@ int getNumDevices() { if (hipErrorNoDevice == err) { numDev = 0; } else { - CUDA_VERIFY(err); + HIP_VERIFY(err); } FAISS_ASSERT(numDev != -1); @@ -42,18 +42,18 @@ int getNumDevices() { } void profilerStart() { - CUDA_VERIFY(hipProfilerStart()); + HIP_VERIFY(hipProfilerStart()); } void profilerStop() { - CUDA_VERIFY(hipProfilerStop()); + HIP_VERIFY(hipProfilerStop()); } void synchronizeAllDevices() { for (int i = 0; i < getNumDevices(); ++i) { DeviceScope scope(i); - CUDA_VERIFY(hipDeviceSynchronize()); + HIP_VERIFY(hipDeviceSynchronize()); } } @@ -66,7 +66,7 @@ const hipDeviceProp_t& getDeviceProperties(int device) { auto it = properties.find(device); if (it == properties.end()) { hipDeviceProp_t prop; - CUDA_VERIFY(hipGetDeviceProperties(&prop, device)); + HIP_VERIFY(hipGetDeviceProperties(&prop, device)); properties[device] = prop; it = properties.find(device); @@ -167,7 +167,7 @@ size_t getFreeMemory(int 
device) {
     size_t free = 0;
     size_t total = 0;
 
-    CUDA_VERIFY(hipMemGetInfo(&free, &total));
+    HIP_VERIFY(hipMemGetInfo(&free, &total));
 
     return free;
 }
 
@@ -176,7 +176,7 @@ size_t getFreeMemoryCurrentDevice() {
     size_t free = 0;
     size_t total = 0;
 
-    CUDA_VERIFY(hipMemGetInfo(&free, &total));
+    HIP_VERIFY(hipMemGetInfo(&free, &total));
 
     return free;
 }
 
@@ -213,10 +213,9 @@ CublasHandleScope::~CublasHandleScope() {
 }
 
 CudaEvent::CudaEvent(hipStream_t stream, bool timer) : event_(0) {
-    CUDA_VERIFY(hipEventCreateWithFlags(
+    HIP_VERIFY(hipEventCreateWithFlags(
             &event_, timer ? hipEventDefault : hipEventDisableTiming));
-    printf("------------------hipEventRecord1");
-    CUDA_VERIFY(hipEventRecord(event_, stream));
+    HIP_VERIFY(hipEventRecord(event_, stream));
 }
 
 CudaEvent::CudaEvent(CudaEvent&& event) noexcept
@@ -226,7 +226,7 @@ CudaEvent::~CudaEvent() {
     if (event_) {
-        CUDA_VERIFY(hipEventDestroy(event_));
+        HIP_VERIFY(hipEventDestroy(event_));
     }
 }
 
@@ -238,11 +238,11 @@ CudaEvent& CudaEvent::operator=(CudaEvent&& event) noexcept {
     return *this;
 }
 
 void CudaEvent::streamWaitOnEvent(hipStream_t stream) {
-    CUDA_VERIFY(hipStreamWaitEvent(stream, event_, 0));
+    HIP_VERIFY(hipStreamWaitEvent(stream, event_, 0));
 }
 
 void CudaEvent::cpuWaitOnEvent() {
-    CUDA_VERIFY(hipEventSynchronize(event_));
+    HIP_VERIFY(hipEventSynchronize(event_));
 }
 
 } // namespace hip
diff --git a/faiss/hip/utils/DeviceVector.h b/faiss/hip/utils/DeviceVector.h
index d79d46a634..6d552bbee5 100644
--- a/faiss/hip/utils/DeviceVector.h
+++ b/faiss/hip/utils/DeviceVector.h
@@ -78,7 +78,7 @@ class DeviceVector {
 
         if (num_ > 0) {
             FAISS_ASSERT(data());
-            CUDA_VERIFY(hipMemcpyAsync(
+            HIP_VERIFY(hipMemcpyAsync(
                     out.data(),
                     data(),
                     num_ * sizeof(T),
@@ -109,14 +109,14 @@ class DeviceVector {
         int dev = getDeviceForAddress(d);
 
         if (dev == -1) {
-            CUDA_VERIFY(hipMemcpyAsync(
+            HIP_VERIFY(hipMemcpyAsync(
                     data() + num_,
                     d,
                     n * sizeof(T),
                     hipMemcpyHostToDevice,
                     stream));
         } else {
-            CUDA_VERIFY(hipMemcpyAsync(
+            HIP_VERIFY(hipMemcpyAsync(
                     data() + num_,
                     d,
                     n * sizeof(T),
@@ -155,7 +155,7 @@ class DeviceVector {
     // Set the specific value at a given index to `value`
     void setAt(size_t idx, const T& value, hipStream_t stream) {
         FAISS_ASSERT(idx < num_);
-        CUDA_VERIFY(hipMemcpyAsync(
+        HIP_VERIFY(hipMemcpyAsync(
                 data() + idx,
                 &value,
                 sizeof(T),
@@ -168,7 +168,7 @@ class DeviceVector {
         FAISS_ASSERT(idx < num_);
         T out;
-        CUDA_VERIFY(hipMemcpyAsync(
+        HIP_VERIFY(hipMemcpyAsync(
                 &out, data() + idx, sizeof(T), hipMemcpyDeviceToHost, stream));
     }
 
@@ -228,7 +228,7 @@ class DeviceVector {
                 AllocRequest(allocInfo_, newSizeInBytes));
 
         // Copy over any old data
-        CUDA_VERIFY(hipMemcpyAsync(
+        HIP_VERIFY(hipMemcpyAsync(
                 newAlloc.data,
                 data(),
                 oldSizeInBytes,
@@ -236,7 +236,7 @@ class DeviceVector {
                 stream));
 
         // Zero out the new space past the data we just copied
-        CUDA_VERIFY(hipMemsetAsync(
+        HIP_VERIFY(hipMemsetAsync(
                 (uint8_t*)newAlloc.data + oldSizeInBytes,
                 0,
                 newSizeInBytes - oldSizeInBytes,
diff --git a/faiss/hip/utils/LoadStoreOperators.h b/faiss/hip/utils/LoadStoreOperators.h
index 82e3a7111e..3ca9168676 100644
--- a/faiss/hip/utils/LoadStoreOperators.h
+++ b/faiss/hip/utils/LoadStoreOperators.h
@@ -33,9 +33,9 @@ template <>
 struct LoadStore<Half4> {
     static inline __device__ Half4 load(void* p) {
         Half4 out;
-        out.a.x = *static_cast(p);
-        out.b.x = *static_cast(p);
-
+        half2* t = reinterpret_cast<half2*>(p);
+        out.a = t[0];
+        out.b = t[1];
 //DONE asm("ld.global.v2.u32 {%0, %1}, [%2];"
 //DONE        : "=r"(__HALF2_TO_UI(out.a)), "=r"(__HALF2_TO_UI(out.b)) 
//DONE        : "l"(p));
@@ -48,8 +49,9 @@
     }
 
     static inline __device__ void store(void* p, Half4& v) {
-        v.a.x = *static_cast(p);
-        v.b.x = *static_cast(p);
+        // This is a store: write both halves of v out to p
+        half2* t = reinterpret_cast<half2*>(p);
+        t[0] = v.a;
+        t[1] = v.b;
 //#if CUDA_VERSION >= 9000
 //DONE asm("st.v2.u32 [%0], {%1, %2};"
 //DONE        :
@@ -64,10 +65,11 @@ template <>
 struct LoadStore<Half8> {
     static inline __device__ Half8 load(void* p) {
         Half8 out;
-        out.a.a.x = *static_cast(p);
-        out.a.b.x = *static_cast(p);
-        out.b.a.x = *static_cast(p);
-        out.b.b.x = *static_cast(p);
+        half2* t = reinterpret_cast<half2*>(p);
+        out.a.a = t[0];
+        out.a.b = t[1];
+        out.b.a = t[2];
+        out.b.b = t[3];
 //#if CUDA_VERSION >= 9000
 //DONE asm("ld.global.v4.u32 {%0, %1, %2, %3}, [%4];"
 //DONE        : "=r"(__HALF2_TO_UI(out.a.a)),
 //DONE        "=r"(__HALF2_TO_UI(out.a.b)),
 //DONE        "=r"(__HALF2_TO_UI(out.b.a)),
 //DONE        "=r"(__HALF2_TO_UI(out.b.b))
 //DONE        : "l"(p));
@@ -84,10 +86,11 @@
     }
 
     static inline __device__ void store(void* p, Half8& v) {
-        v.a.a.x = *static_cast(p);
-        v.a.b.x = *static_cast(p);
-        v.b.a.x = *static_cast(p);
-        v.b.b.x = *static_cast(p);
+        // As above, a store: write all four half2 words of v out to p
+        half2* t = reinterpret_cast<half2*>(p);
+        t[0] = v.a.a;
+        t[1] = v.a.b;
+        t[2] = v.b.a;
+        t[3] = v.b.b;
 //#if CUDA_VERSION >= 9000
 //DONE asm("st.v4.u32 [%0], {%1, %2, %3, %4};"
 //DONE        :
diff --git a/faiss/hip/utils/MatrixMult-inl.h b/faiss/hip/utils/MatrixMult-inl.h
index 58f2135b76..3b7d8812fd 100644
--- a/faiss/hip/utils/MatrixMult-inl.h
+++ b/faiss/hip/utils/MatrixMult-inl.h
@@ -252,7 +252,7 @@ void runMatrixMult(
             lda,
             ldb,
             ldc);
-    CUDA_TEST_ERROR();
+    HIP_TEST_ERROR();
 }
 
 template
@@ -355,7 +355,7 @@ void runBatchMatrixMult(
             err == HIPBLAS_STATUS_SUCCESS,
             "hipblasGemmStridedBatchedEx failed (%d)",
             (int)err);
-    CUDA_TEST_ERROR();
+    HIP_TEST_ERROR();
 }
 
 } // namespace hip
diff --git a/faiss/hip/utils/MergeNetworkWarp.h b/faiss/hip/utils/MergeNetworkWarp.h
index 9ae5266f8c..7f52c515ae 100644
--- a/faiss/hip/utils/MergeNetworkWarp.h
+++ b/faiss/hip/utils/MergeNetworkWarp.h
@@ -93,7 +93,7 @@ template <
         bool IsBitonic>
 inline __device__ void warpBitonicMergeLE16(K& k, V& v) {
     static_assert(utils::isPowerOf2(L), "L must be a power-of-2");
-    static_assert(L <= kWarpSize / 2, "merge list size must be <= 32");
+    static_assert(L <= kWarpSize / 2, "merge list size must be <= kWarpSize / 2 (16 for wave32, 32 for wave64)");
 
     int laneId = getLaneId();
 
@@ -529,14 +529,17 @@ struct BitonicSortStep {
     static inline __device__ void sort(K k[1], V v[1]) {
         // Update this code if this changes
         // should go from 1 -> kWarpSize in multiples of 2
-        static_assert(kWarpSize == 64, "unexpected warp size");
-
+        // warp size for us is either 32 or 64
         warpBitonicMergeLE16(k[0], v[0]);
         warpBitonicMergeLE16(k[0], v[0]);
         warpBitonicMergeLE16(k[0], v[0]);
         warpBitonicMergeLE16(k[0], v[0]);
         warpBitonicMergeLE16(k[0], v[0]);
-        warpBitonicMergeLE16(k[0], v[0]);
+#if !(__gfx1010__ || __gfx1011__ || __gfx1012__ || __gfx1030__ || __gfx1031__)
+//TODO needs to be fixed
+//#warning(including wider merge)
+//        warpBitonicMergeLE16(k[0], v[0]);
+#endif
     }
 };
 
diff --git a/faiss/hip/utils/PtxUtils.h b/faiss/hip/utils/PtxUtils.h
index f31768e3cc..b15882fef6 100644
--- a/faiss/hip/utils/PtxUtils.h
+++ b/faiss/hip/utils/PtxUtils.h
@@ -44,7 +44,7 @@ getBitfield(uint64_t val, int pos, int len) {
 }
 
 __device__ __forceinline__ int getLaneId() {
-    return threadIdx.x & 63;
+    return threadIdx.x & (32 - 1); //(__AMDGCN_WAVEFRONT_SIZE - 1);
 }
 
 } // namespace hip
diff --git a/faiss/hip/utils/Select.h b/faiss/hip/utils/Select.h
index 9032e8556b..fb409c8baf 100644
--- a/faiss/hip/utils/Select.h
+++ b/faiss/hip/utils/Select.h
@@ -207,11 +207,7 @@ struct BlockSelect {
 
     __device__ inline void checkThreadQ() {
         bool 
needSort = (numVals == NumThreadQ); -#if CUDA_VERSION >= 9000 - needSort = __any_sync(0xffffffff, needSort); -#else needSort = __any(needSort); -#endif if (!needSort) { // no lanes have triggered a sort @@ -484,11 +480,7 @@ struct WarpSelect { __device__ inline void checkThreadQ() { bool needSort = (numVals == NumThreadQ); -#if CUDA_VERSION >= 9000 - needSort = __any_sync(0xffffffff, needSort); -#else needSort = __any(needSort); -#endif if (!needSort) { // no lanes have triggered a sort diff --git a/faiss/hip/utils/Tensor-inl.h b/faiss/hip/utils/Tensor-inl.h index 062856a195..15d23685be 100644 --- a/faiss/hip/utils/Tensor-inl.h +++ b/faiss/hip/utils/Tensor-inl.h @@ -197,7 +197,7 @@ __host__ void Tensor::copyFrom( int tDev = getDeviceForAddress(t.data()); if (tDev == -1) { - CUDA_VERIFY(hipMemcpyAsync( + HIP_VERIFY(hipMemcpyAsync( this->data_, t.data(), this->getSizeInBytes(), @@ -205,7 +205,7 @@ __host__ void Tensor::copyFrom( : hipMemcpyHostToDevice, stream)); } else { - CUDA_VERIFY(hipMemcpyAsync( + HIP_VERIFY(hipMemcpyAsync( this->data_, t.data(), this->getSizeInBytes(), @@ -242,7 +242,7 @@ __host__ void Tensor::copyTo( int tDev = getDeviceForAddress(t.data()); if (tDev == -1) { - CUDA_VERIFY(hipMemcpyAsync( + HIP_VERIFY(hipMemcpyAsync( t.data(), this->data_, this->getSizeInBytes(), @@ -250,7 +250,7 @@ __host__ void Tensor::copyTo( : hipMemcpyDeviceToHost, stream)); } else { - CUDA_VERIFY(hipMemcpyAsync( + HIP_VERIFY(hipMemcpyAsync( t.data(), this->data_, this->getSizeInBytes(), @@ -281,7 +281,7 @@ __host__ void Tensor::copyFrom( GPU_FAISS_ASSERT(this->data_); int ourDev = getDeviceForAddress(this->data_); - CUDA_VERIFY(hipMemcpyAsync( + HIP_VERIFY(hipMemcpyAsync( this->data_, v.data(), this->getSizeInBytes(), @@ -311,7 +311,7 @@ __host__ std::vector Tensor:: std::memcpy( out.data(), this->data_, this->numElements() * sizeof(T)); } else { - CUDA_VERIFY(hipMemcpyAsync( + HIP_VERIFY(hipMemcpyAsync( out.data(), this->data_, this->numElements() * sizeof(T), diff --git a/faiss/hip/utils/Tensor.h b/faiss/hip/utils/Tensor.h index 06de4bec60..cddf22ff08 100644 --- a/faiss/hip/utils/Tensor.h +++ b/faiss/hip/utils/Tensor.h @@ -470,21 +470,13 @@ class SubTensor { /// Use the texture cache for reads __device__ inline typename TensorType::DataType ldg() const { -#if __CUDA_ARCH__ >= 350 return __ldg(data_); -#else - return *data_; -#endif } /// Use the texture cache for reads; cast as a particular type template __device__ inline T ldgAs() const { -#if __CUDA_ARCH__ >= 350 return __ldg(dataAs()); -#else - return as(); -#endif } protected: @@ -606,21 +598,13 @@ class SubTensor { /// Use the texture cache for reads __device__ inline typename TensorType::DataType ldg() const { -#if __CUDA_ARCH__ >= 350 return __ldg(data_); -#else - return *data_; -#endif } /// Use the texture cache for reads; cast as a particular type template __device__ inline T ldgAs() const { -#if __CUDA_ARCH__ >= 350 return __ldg(dataAs()); -#else - return as(); -#endif } /// Returns a tensor that is a view of the SubDim-dimensional slice diff --git a/faiss/hip/utils/Timer.cpp b/faiss/hip/utils/Timer.cpp index 1be22895a4..af5e55ab8f 100644 --- a/faiss/hip/utils/Timer.cpp +++ b/faiss/hip/utils/Timer.cpp @@ -15,25 +15,25 @@ namespace hip { KernelTimer::KernelTimer(hipStream_t stream) : startEvent_(0), stopEvent_(0), stream_(stream), valid_(true) { - CUDA_VERIFY(hipEventCreate(&startEvent_)); - CUDA_VERIFY(hipEventCreate(&stopEvent_)); + HIP_VERIFY(hipEventCreate(&startEvent_)); + HIP_VERIFY(hipEventCreate(&stopEvent_)); - 
CUDA_VERIFY(hipEventRecord(startEvent_, stream_)); + HIP_VERIFY(hipEventRecord(startEvent_, stream_)); } KernelTimer::~KernelTimer() { - CUDA_VERIFY(hipEventDestroy(startEvent_)); - CUDA_VERIFY(hipEventDestroy(stopEvent_)); + HIP_VERIFY(hipEventDestroy(startEvent_)); + HIP_VERIFY(hipEventDestroy(stopEvent_)); } float KernelTimer::elapsedMilliseconds() { FAISS_ASSERT(valid_); - CUDA_VERIFY(hipEventRecord(stopEvent_, stream_)); - CUDA_VERIFY(hipEventSynchronize(stopEvent_)); + HIP_VERIFY(hipEventRecord(stopEvent_, stream_)); + HIP_VERIFY(hipEventSynchronize(stopEvent_)); auto time = 0.0f; - CUDA_VERIFY(hipEventElapsedTime(&time, startEvent_, stopEvent_)); + HIP_VERIFY(hipEventElapsedTime(&time, startEvent_, stopEvent_)); valid_ = false; return time; diff --git a/faiss/hip/utils/Transpose.h b/faiss/hip/utils/Transpose.h index 6e802ac0fd..d18b9f6af0 100644 --- a/faiss/hip/utils/Transpose.h +++ b/faiss/hip/utils/Transpose.h @@ -79,17 +79,13 @@ template __global__ void transposeAny( TensorInfo input, TensorInfo output, - idx_t totalSize) { + idx_t totalSize) __attribute__((amdgpu_flat_work_group_size(1,1024))) { for (idx_t i = idx_t(blockIdx.x) * blockDim.x + threadIdx.x; i < totalSize; i += gridDim.x * blockDim.x) { auto inputOffset = TensorInfoOffset::get(input, i); auto outputOffset = TensorInfoOffset::get(output, i); -#if __CUDA_ARCH__ >= 350 output.data[outputOffset] = __ldg(&input.data[inputOffset]); -#else - output.data[outputOffset] = input.data[inputOffset]; -#endif } } @@ -187,7 +183,7 @@ void runTransposeAny( <<>>(inInfo, outInfo, totalSize); } - CUDA_TEST_ERROR(); + HIP_TEST_ERROR(); } } // namespace hip diff --git a/faiss/hip/utils/WarpSelectFloat.hip b/faiss/hip/utils/WarpSelectFloat.hip index 0c9375f5aa..20298c003b 100644 --- a/faiss/hip/utils/WarpSelectFloat.hip +++ b/faiss/hip/utils/WarpSelectFloat.hip @@ -55,8 +55,10 @@ void runWarpSelect( if (dir) { if (k == 1) { WARP_SELECT_CALL(float, true, 1); - //} else if (k <= 32) { - // WARP_SELECT_CALL(float, true, 32); +#if (__gfx1010__ || __gfx1011__ || __gfx1012__ || __gfx1030__ || __gfx1031__) + } else if (k <= 32) { + WARP_SELECT_CALL(float, true, 32); +#endif } else if (k <= 64) { WARP_SELECT_CALL(float, true, 64); } else if (k <= 128) { @@ -75,8 +77,10 @@ void runWarpSelect( } else { if (k == 1) { WARP_SELECT_CALL(float, false, 1); - //} else if (k <= 32) { - // WARP_SELECT_CALL(float, false, 32); +#if (__gfx1010__ || __gfx1011__ || __gfx1012__ || __gfx1030__ || __gfx1031__) + } else if (k <= 32) { + WARP_SELECT_CALL(float, false, 32); +#endif } else if (k <= 64) { WARP_SELECT_CALL(float, false, 64); } else if (k <= 128) { diff --git a/faiss/hip/utils/WarpShuffles.h b/faiss/hip/utils/WarpShuffles.h index 05a3ab2af3..5f2d9c398d 100644 --- a/faiss/hip/utils/WarpShuffles.h +++ b/faiss/hip/utils/WarpShuffles.h @@ -14,12 +14,7 @@ namespace faiss { namespace hip { // defines to simplify the SASS assembly structure file/line in the profiler -#if CUDA_VERSION >= 9000 -#define SHFL_SYNC(VAL, SRC_LANE, WIDTH) \ - __shfl_sync(0xffffffff, VAL, SRC_LANE, WIDTH) -#else #define SHFL_SYNC(VAL, SRC_LANE, WIDTH) __shfl(VAL, SRC_LANE, WIDTH) -#endif template inline __device__ T shfl(const T val, int srcLane, int width = kWarpSize) { diff --git a/faiss/hip/utils/blockselect/BlockSelectImpl.h b/faiss/hip/utils/blockselect/BlockSelectImpl.h index 0b6334a2aa..86f53edba6 100644 --- a/faiss/hip/utils/blockselect/BlockSelectImpl.h +++ b/faiss/hip/utils/blockselect/BlockSelectImpl.h @@ -60,7 +60,7 @@ THREAD_Q, \ kBlockSelectNumThreads> \ <<>>(in, outK, 
outV, kInit, vInit, k); \ - CUDA_TEST_ERROR(); \ + HIP_TEST_ERROR(); \ } \ \ void runBlockSelectPair_##TYPE##_##DIR##_##WARP_Q##_( \ @@ -93,7 +93,7 @@ THREAD_Q, \ kBlockSelectNumThreads><<>>( \ inK, inV, outK, outV, kInit, vInit, k); \ - CUDA_TEST_ERROR(); \ + HIP_TEST_ERROR(); \ } #define BLOCK_SELECT_CALL(TYPE, DIR, WARP_Q) \ diff --git a/faiss/hip/utils/warpselect/WarpSelectImpl.h b/faiss/hip/utils/warpselect/WarpSelectImpl.h index 6c8ec39fdd..6fb48a3cd0 100644 --- a/faiss/hip/utils/warpselect/WarpSelectImpl.h +++ b/faiss/hip/utils/warpselect/WarpSelectImpl.h @@ -38,7 +38,7 @@ \ warpSelect \ <<>>(in, outK, outV, kInit, vInit, k); \ - CUDA_TEST_ERROR(); \ + HIP_TEST_ERROR(); \ } #define WARP_SELECT_CALL(TYPE, DIR, WARP_Q) \
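
Note: several of the changes above (DeviceDefs.h, PtxUtils.h, and the k <= 32
select paths gated on the gfx10 targets) encode the same assumption: the code
is compiled for a 32-wide wavefront. A minimal sketch of how the two hard-coded
spots relate; getLaneIdSketch is illustrative and not part of the patch:

    #include <hip/hip_runtime.h>

    // Matches DeviceDefs.h above: pinned to 32 for the WF32 bring-up, while
    // __AMDGCN_WAVEFRONT_SIZE would report the actual compile-target width.
    constexpr int kWarpSize = 32;

    // Equivalent to the PtxUtils.h change, with the mask derived from
    // kWarpSize instead of a literal 32, so a later wave64 build would only
    // have to change the constant in one place.
    __device__ __forceinline__ int getLaneIdSketch() {
        static_assert((kWarpSize & (kWarpSize - 1)) == 0,
                      "kWarpSize must be a power of two");
        return threadIdx.x & (kWarpSize - 1);
    }
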