From 50f43c769acecc4e9d06b4ce76f6f755986d954b Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 3 Dec 2021 18:26:07 -0500 Subject: [PATCH] Disabling fused l2 knn from bfknn (#407) It appears the recent changes to the fused l2 knn have somehow broken a few things in cuml, such as rbc, trustworthines, and UMAP. Authors: - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/raft/pull/407 --- .../knn/detail/knn_brute_force_faiss.cuh | 120 +++++++++--------- cpp/test/spatial/ball_cover.cu | 14 +- 2 files changed, 68 insertions(+), 66 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh index 0b89377630..d154e5f92a 100644 --- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh @@ -293,69 +293,69 @@ void brute_force_knn_impl( cudaStream_t stream = raft::select_stream(userStream, internalStreams, n_int_streams, i); - // TODO: Enable this once we figure out why it's causing pytest failures in cuml. - if (k <= 64 && rowMajorQuery == rowMajorIndex && rowMajorQuery == true && - (metric == raft::distance::DistanceType::L2Unexpanded || - metric == raft::distance::DistanceType::L2SqrtUnexpanded //|| - // metric == raft::distance::DistanceType::L2Expanded || - // metric == raft::distance::DistanceType::L2SqrtExpanded) - )) { - fusedL2Knn(D, - out_i_ptr, - out_d_ptr, - input[i], - search_items, - sizes[i], - n, - k, - rowMajorIndex, - rowMajorQuery, - stream, - metric); - } else { - switch (metric) { - case raft::distance::DistanceType::Haversine: - - ASSERT(D == 2, - "Haversine distance requires 2 dimensions " - "(latitude / longitude)."); - - haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n, k, stream); - break; - default: - faiss::MetricType m = build_faiss_metric(metric); - - faiss::gpu::StandardGpuResources gpu_res; - - gpu_res.noTempMemory(); - gpu_res.setDefaultStream(device, stream); - - faiss::gpu::GpuDistanceParams args; - args.metric = m; - args.metricArg = metricArg; - args.k = k; - args.dims = D; - args.vectors = input[i]; - args.vectorsRowMajor = rowMajorIndex; - args.numVectors = sizes[i]; - args.queries = search_items; - args.queriesRowMajor = rowMajorQuery; - args.numQueries = n; - args.outDistances = out_d_ptr; - args.outIndices = out_i_ptr; - - /** - * @todo: Until FAISS supports pluggable allocation strategies, - * we will not reap the benefits of the pool allocator for - * avoiding device-wide synchronizations from cudaMalloc/cudaFree - */ - bfKnn(&gpu_res, args); - } + // // TODO: Enable this once we figure out why it's causing pytest failures in cuml. + // if (k <= 64 && rowMajorQuery == rowMajorIndex && rowMajorQuery == true && + // (metric == raft::distance::DistanceType::L2Unexpanded || + // metric == raft::distance::DistanceType::L2SqrtUnexpanded //|| + // // metric == raft::distance::DistanceType::L2Expanded || + // // metric == raft::distance::DistanceType::L2SqrtExpanded) + // )) { + // fusedL2Knn(D, + // out_i_ptr, + // out_d_ptr, + // input[i], + // search_items, + // sizes[i], + // n, + // k, + // rowMajorIndex, + // rowMajorQuery, + // stream, + // metric); + // } else { + switch (metric) { + case raft::distance::DistanceType::Haversine: + + ASSERT(D == 2, + "Haversine distance requires 2 dimensions " + "(latitude / longitude)."); + + haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n, k, stream); + break; + default: + faiss::MetricType m = build_faiss_metric(metric); + + faiss::gpu::StandardGpuResources gpu_res; + + gpu_res.noTempMemory(); + gpu_res.setDefaultStream(device, stream); + + faiss::gpu::GpuDistanceParams args; + args.metric = m; + args.metricArg = metricArg; + args.k = k; + args.dims = D; + args.vectors = input[i]; + args.vectorsRowMajor = rowMajorIndex; + args.numVectors = sizes[i]; + args.queries = search_items; + args.queriesRowMajor = rowMajorQuery; + args.numQueries = n; + args.outDistances = out_d_ptr; + args.outIndices = out_i_ptr; + + /** + * @todo: Until FAISS supports pluggable allocation strategies, + * we will not reap the benefits of the pool allocator for + * avoiding device-wide synchronizations from cudaMalloc/cudaFree + */ + bfKnn(&gpu_res, args); } - - CUDA_CHECK(cudaPeekAtLastError()); } + CUDA_CHECK(cudaPeekAtLastError()); + // } + // Sync internal streams if used. We don't need to // sync the user stream because we'll already have // fully serial execution. diff --git a/cpp/test/spatial/ball_cover.cu b/cpp/test/spatial/ball_cover.cu index ab85e7fe8f..0a1680badc 100644 --- a/cpp/test/spatial/ball_cover.cu +++ b/cpp/test/spatial/ball_cover.cu @@ -52,12 +52,14 @@ __global__ void count_discrepancies_kernel(value_idx* actual_idx, if (row < m) { for (uint32_t i = 0; i < n; i++) { value_t d = actual[row * n + i] - expected[row * n + i]; - bool matches = fabsf(d) <= thres; - if (!matches) { - // printf("row=%d, actual_idx=%ld, actual=%f, expected_id=%ld, expected=%f\n", - // row, actual_idx[row*n+i], actual[row*n+i], expected_idx[row*n+i], - // expected[row*n+i]); - } + bool matches = (fabsf(d) <= thres) || (actual_idx[row * n + i] == expected_idx[row * n + i] && + actual_idx[row * n + i] == row); + // if (!matches) { + // printf("row=%d, actual_idx=%ld, actual=%f, expected_id=%ld, + // expected=%f\n", + // row, actual_idx[row*n+i], actual[row*n+i], expected_idx[row*n+i], + // expected[row*n+i]); + // } n_diffs += !matches; out[row] = n_diffs;