Skip to content

Commit

Permalink
Merge pull request facebookresearch#1 from iotamudelta/wf32
Browse files Browse the repository at this point in the history
Progress for WF32.
  • Loading branch information
ItsPitt authored Nov 6, 2023
2 parents 6329d16 + 686a3db commit f464f77
Show file tree
Hide file tree
Showing 51 changed files with 267 additions and 358 deletions.
2 changes: 1 addition & 1 deletion faiss/hip/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ function(generate_ivf_interleaved_code)
"128|1|1"
"128|128|3"
"128|256|4"
"128|64|2" #"128|32|2" TODO won't compile with a warpsize of 64. Changed to 64
"128|32|2" #"128|32|2" TODO won't compile with a warpsize of 64. Changed to 64
"128|512|8"
"128|64|3"
"64|2048|8"
Expand Down
2 changes: 1 addition & 1 deletion faiss/hip/GpuIndex.hip
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,7 @@ void GpuIndex::searchFromCpuPaged_(
eventPrev->streamWaitOnEvent(copyStream);
}

CUDA_VERIFY(hipMemcpyAsync(
HIP_VERIFY(hipMemcpyAsync(
bufGpus[cur2BufIndex]->data(),
bufPinned[cur2BufIndex],
numToCopy * this->d * sizeof(float),
Expand Down
2 changes: 1 addition & 1 deletion faiss/hip/GpuResources.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ hipStream_t GpuResources::getAsyncCopyStreamCurrentDevice() {
}

void GpuResources::syncDefaultStream(int device) {
CUDA_VERIFY(hipStreamSynchronize(getDefaultStream(device)));
HIP_VERIFY(hipStreamSynchronize(getDefaultStream(device)));
}

void GpuResources::syncDefaultStreamCurrentDevice() {
Expand Down
21 changes: 11 additions & 10 deletions faiss/hip/StandardGpuResources.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,21 +134,21 @@ StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
DeviceScope scope(entry.first);

// We created these streams, so are responsible for destroying them
CUDA_VERIFY(hipStreamDestroy(entry.second));
HIP_VERIFY(hipStreamDestroy(entry.second));
}

for (auto& entry : alternateStreams_) {
DeviceScope scope(entry.first);

for (auto stream : entry.second) {
CUDA_VERIFY(hipStreamDestroy(stream));
HIP_VERIFY(hipStreamDestroy(stream));
}
}

for (auto& entry : asyncCopyStreams_) {
DeviceScope scope(entry.first);

CUDA_VERIFY(hipStreamDestroy(entry.second));
HIP_VERIFY(hipStreamDestroy(entry.second));
}

for (auto& entry : blasHandles_) {
Expand Down Expand Up @@ -349,28 +349,29 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
prop.minor);

// Our code is pre-built with and expects warpSize == 32, validate that
FAISS_ASSERT_FMT(
prop.warpSize == 64,
"Device id %d does not have expected warpSize of 64",
device);
// we can have either warp size 32 or 64 - disable this check
//FAISS_ASSERT_FMT(
// prop.warpSize == 64,
// "Device id %d does not have expected warpSize of 64",
// device);

// Create streams
hipStream_t defaultStream = 0;
CUDA_VERIFY(
HIP_VERIFY(
hipStreamCreateWithFlags(&defaultStream, hipStreamNonBlocking));

defaultStreams_[device] = defaultStream;

hipStream_t asyncCopyStream = 0;
CUDA_VERIFY(
HIP_VERIFY(
hipStreamCreateWithFlags(&asyncCopyStream, hipStreamNonBlocking));

asyncCopyStreams_[device] = asyncCopyStream;

std::vector<hipStream_t> deviceStreams;
for (int j = 0; j < kNumStreams; ++j) {
hipStream_t stream = 0;
CUDA_VERIFY(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
HIP_VERIFY(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));

deviceStreams.push_back(stream);
}
Expand Down
2 changes: 2 additions & 0 deletions faiss/hip/impl/BinaryDistance.hip
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ void runBinaryDistanceAnySize(
<<<grid, block, 0, stream>>>(vecs, query, outK, outV, k);
}
#endif
HIP_TEST_ERROR();
}

template <typename BinaryType, int ReductionLimit>
Expand Down Expand Up @@ -269,6 +270,7 @@ void runBinaryDistanceLimitSize(
<<<grid, block, 0, stream>>>(vecs, query, outK, outV, k);
}
#endif
HIP_TEST_ERROR();
}

void runBinaryDistance(
Expand Down
6 changes: 3 additions & 3 deletions faiss/hip/impl/BroadcastSum.hip
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ void runSumAlongColumns(
<<<grid, block, 0, stream>>>(input, output);
}

CUDA_TEST_ERROR();
HIP_TEST_ERROR();
}

void runSumAlongColumns(
Expand Down Expand Up @@ -320,7 +320,7 @@ void runAssignAlongColumns(
<<<grid, block, 0, stream>>>(input, output);
}

CUDA_TEST_ERROR();
HIP_TEST_ERROR();
}

void runAssignAlongColumns(
Expand Down Expand Up @@ -356,7 +356,7 @@ void runSumAlongRows(
sumAlongRows<T, false><<<grid, block, 0, stream>>>(input, output);
}

CUDA_TEST_ERROR();
HIP_TEST_ERROR();
}

void runSumAlongRows(
Expand Down
2 changes: 1 addition & 1 deletion faiss/hip/impl/DistanceUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,7 @@ void runIncrementIndex(
auto block = std::min(k, getMaxThreadsCurrentDevice());

incrementIndex<<<grid, block, 0, stream>>>(indices, k, increment);
CUDA_TEST_ERROR();
HIP_TEST_ERROR();
}

// If the inner size (dim) of the vectors is small, we want a larger query tile
Expand Down
3 changes: 2 additions & 1 deletion faiss/hip/impl/GeneralDistance.h
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ void runGeneralDistanceKernel(
dim3 block(kWarpSize, kWarpSize);

generalDistance<<<grid, block, 0, stream>>>(query, vecs, op, out);
HIP_TEST_ERROR();
}

template <typename T, typename DistanceOp, bool InnerContig>
Expand Down Expand Up @@ -449,7 +450,7 @@ void runGeneralDistance(
FAISS_THROW_MSG("interrupted");
}

CUDA_TEST_ERROR();
HIP_TEST_ERROR();
}

} // namespace hip
Expand Down
13 changes: 7 additions & 6 deletions faiss/hip/impl/IVFAppend.hip
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ void runUpdateListPointers(
listCodes.data(),
listIndices.data());

CUDA_TEST_ERROR();
HIP_TEST_ERROR();
}

// Appends new indices for vectors being added to the IVF indices lists
Expand Down Expand Up @@ -124,7 +124,7 @@ void runIVFIndicesAppend(
ivfIndicesAppend<<<blocks, threads, 0, stream>>>(
listIds, listOffset, indices, opt, listIndices.data());

CUDA_TEST_ERROR();
HIP_TEST_ERROR();
}
}

Expand Down Expand Up @@ -259,7 +259,7 @@ void runIVFFlatAppend(
}
}

CUDA_TEST_ERROR();
HIP_TEST_ERROR();

#undef RUN_APPEND
}
Expand Down Expand Up @@ -308,7 +308,7 @@ void runIVFPQAppend(
ivfpqAppend<<<threads, blocks, 0, stream>>>(
listIds, listOffset, encodings, listCodes.data());

CUDA_TEST_ERROR();
HIP_TEST_ERROR();
}

//
Expand Down Expand Up @@ -341,6 +341,7 @@ void runSQEncode(
idx_t blocks = vecs.getSize(0);

sqEncode<<<blocks, threads, 0, stream>>>(vecs, encodedVecs, codec);
HIP_TEST_ERROR();
}

// Handles appending encoded vectors (one per EncodeT word) packed into
Expand Down Expand Up @@ -584,7 +585,7 @@ void runIVFFlatInterleavedAppend(
}

#undef RUN_APPEND
CUDA_TEST_ERROR();
HIP_TEST_ERROR();
}

void runIVFPQInterleavedAppend(
Expand Down Expand Up @@ -640,7 +641,7 @@ void runIVFPQInterleavedAppend(
}

#undef RUN_APPEND
CUDA_TEST_ERROR();
HIP_TEST_ERROR();
}

} // namespace hip
Expand Down
7 changes: 4 additions & 3 deletions faiss/hip/impl/IVFFlatScan.hip
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@ void runIVFFlatScanTile(
metric, \
prefixSumOffsets, \
allDistances); \
HIP_TEST_ERROR(); \
} while (0)

#define HANDLE_METRICS \
Expand Down Expand Up @@ -291,7 +292,7 @@ void runIVFFlatScanTile(
}
}

CUDA_TEST_ERROR();
HIP_TEST_ERROR();

#undef HANDLE_METRICS
#undef RUN_IVF_FLAT
Expand Down Expand Up @@ -414,9 +415,9 @@ void runIVFFlatScan(

// Make sure the element before prefixSumOffsets is 0, since we
// depend upon simple, boundary-less indexing to get proper results
CUDA_VERIFY(hipMemsetAsync(
HIP_VERIFY(hipMemsetAsync(
prefixSumOffsetSpace1.data(), 0, sizeof(idx_t), stream));
CUDA_VERIFY(hipMemsetAsync(
HIP_VERIFY(hipMemsetAsync(
prefixSumOffsetSpace2.data(), 0, sizeof(idx_t), stream));

DeviceTensor<float, 1, true> allDistances1(
Expand Down
1 change: 1 addition & 0 deletions faiss/hip/impl/IVFInterleaved.hip
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ void runIVFInterleavedScan2(
IVF_SCAN_2(64, 2048, 8);
}
#endif
HIP_TEST_ERROR();
}

void runIVFInterleavedScan(
Expand Down
4 changes: 2 additions & 2 deletions faiss/hip/impl/IVFUtils.hip
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ void runCalcListOffsets(

getResultLengths<<<grid, block, 0, stream>>>(
ivfListIds, listLengths.data(), totalSize, prefixSumOffsets);
CUDA_TEST_ERROR();
HIP_TEST_ERROR();

// Prefix sum of the indices, so we know where the intermediate
// results should be maintained
Expand All @@ -76,7 +76,7 @@ void runCalcListOffsets(
prefixSumOffsets.data(),
prefixSumOffsets.data() + totalSize,
prefixSumOffsets.data());
CUDA_TEST_ERROR();
HIP_TEST_ERROR();
}

} // namespace hip
Expand Down
11 changes: 6 additions & 5 deletions faiss/hip/impl/IVFUtilsSelect1.hip
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ void runPass1SelectLists(
k, \
heapDistances, \
heapIndices); \
HIP_TEST_ERROR(); \
return; /* success */ \
} while (0)

Expand All @@ -135,8 +136,8 @@ void runPass1SelectLists(
do { \
if (k == 1) { \
RUN_PASS(INDEX_T, 128, 1, 1, DIR); \
/*} else if (k <= 32) { */ \
/* RUN_PASS(INDEX_T, 128, 32, 2, DIR); */ \
} else if (k <= 32) { \
RUN_PASS(INDEX_T, 128, 32, 2, DIR); \
} else if (k <= 64) { \
RUN_PASS(INDEX_T, 128, 64, 3, DIR); \
} else if (k <= 128) { \
Expand All @@ -158,8 +159,8 @@ void runPass1SelectLists(
do { \
if (k == 1) { \
RUN_PASS(INDEX_T, 128, 1, 1, DIR); \
/*} else if (k <= 32) { */ \
/* RUN_PASS(INDEX_T, 128, 32, 2, DIR); */ \
} else if (k <= 32) { \
RUN_PASS(INDEX_T, 128, 32, 2, DIR); \
} else if (k <= 64) { \
RUN_PASS(INDEX_T, 128, 64, 3, DIR); \
} else if (k <= 128) { \
Expand Down Expand Up @@ -192,7 +193,7 @@ void runPass1SelectLists(
#undef RUN_PASS_DIR
#undef RUN_PASS

CUDA_TEST_ERROR();
HIP_TEST_ERROR();
}

} // namespace hip
Expand Down
11 changes: 6 additions & 5 deletions faiss/hip/impl/IVFUtilsSelect2.hip
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,7 @@ void runPass2SelectLists(
indicesOptions, \
outDistances, \
outIndices); \
HIP_TEST_ERROR(); \
} while (0)

#if GPU_MAX_SELECTION_K >= 2048
Expand All @@ -193,8 +194,8 @@ void runPass2SelectLists(
do { \
if (k == 1) { \
RUN_PASS(INDEX_T, 128, 1, 1, DIR); \
/*} else if (k <= 32) { */ \
/* RUN_PASS(INDEX_T, 128, 32, 2, DIR);*/ \
} else if (k <= 32) { \
RUN_PASS(INDEX_T, 128, 32, 2, DIR); \
} else if (k <= 64) { \
RUN_PASS(INDEX_T, 128, 64, 3, DIR); \
} else if (k <= 128) { \
Expand All @@ -216,8 +217,8 @@ void runPass2SelectLists(
do { \
if (k == 1) { \
RUN_PASS(INDEX_T, 128, 1, 1, DIR); \
/*} else if (k <= 32) { */ \
/* RUN_PASS(INDEX_T, 128, 32, 2, DIR); */ \
} else if (k <= 32) { \
RUN_PASS(INDEX_T, 128, 32, 2, DIR); \
} else if (k <= 64) { \
RUN_PASS(INDEX_T, 128, 64, 3, DIR); \
} else if (k <= 128) { \
Expand Down Expand Up @@ -250,7 +251,7 @@ void runPass2SelectLists(
#undef RUN_PASS_DIR
#undef RUN_PASS

CUDA_TEST_ERROR();
HIP_TEST_ERROR();
}

} // namespace hip
Expand Down
Loading

0 comments on commit f464f77

Please sign in to comment.