From 7acf5d29fea79869e5dc73d286ac00ab2298f2a8 Mon Sep 17 00:00:00 2001 From: Heemin Kim Date: Wed, 20 Dec 2023 13:42:50 -0800 Subject: [PATCH] Add patch to support multi vector in faiss Signed-off-by: Heemin Kim --- .github/workflows/CI.yml | 9 + CHANGELOG.md | 1 + jni/CMakeLists.txt | 16 +- ...Custom-patch-to-support-multi-vector.patch | 281 ++++++++++++++++++ 4 files changed, 305 insertions(+), 2 deletions(-) create mode 100644 jni/patches/faiss/0001-Custom-patch-to-support-multi-vector.patch diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index c807e348e6..c1b9a85ac1 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -38,6 +38,15 @@ jobs: with: submodules: true + # Git functionality in CMAKE file does not work with given ubuntu image. Therefore, handling it here. + - name: Apply Git Patch + # Deleting file at the end to skip `git apply` inside CMAKE file + run: | + cd jni/external/faiss + git apply --ignore-space-change --ignore-whitespace --3way ../../patches/faiss/0001-Custom-patch-to-support-multi-vector.patch + rm ../../patches/faiss/0001-Custom-patch-to-support-multi-vector.patch + working-directory: ${{ github.workspace }} + - name: Setup Java ${{ matrix.java }} uses: actions/setup-java@v1 with: diff --git a/CHANGELOG.md b/CHANGELOG.md index 3328dda520..9e40762156 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,3 +28,4 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), * Upgrade urllib to 1.26.18 [#1319](https://github.com/opensearch-project/k-NN/pull/1319) * Upgrade guava to 32.1.3 [#1319](https://github.com/opensearch-project/k-NN/pull/1319) ### Refactoring +* Add patch to support multi vector in faiss [#1358](https://github.com/opensearch-project/k-NN/pull/1358) diff --git a/jni/CMakeLists.txt b/jni/CMakeLists.txt index 29a844ee07..6e66e17ac7 100644 --- a/jni/CMakeLists.txt +++ b/jni/CMakeLists.txt @@ -79,7 +79,7 @@ list(APPEND TARGET_LIBS ${TARGET_LIB_COMMON}) # 
---------------------------------- NMSLIB ---------------------------------- if (${CONFIG_NMSLIB} STREQUAL ON OR ${CONFIG_ALL} STREQUAL ON OR ${CONFIG_TEST} STREQUAL ON) # Check if nmslib exists - find_path(NMS_REPO_DIR NAMES similarity_search PATHS ${CMAKE_CURRENT_SOURCE_DIR}/external/nmslib) + find_path(NMS_REPO_DIR NAMES similarity_search PATHS ${CMAKE_CURRENT_SOURCE_DIR}/external/nmslib NO_DEFAULT_PATH) # If not, pull the updated submodule if (NOT EXISTS ${NMS_REPO_DIR}) @@ -134,7 +134,7 @@ if (${CONFIG_FAISS} STREQUAL ON OR ${CONFIG_ALL} STREQUAL ON OR ${CONFIG_TEST} S find_package(LAPACK REQUIRED) # Check if faiss exists - find_path(FAISS_REPO_DIR NAMES faiss PATHS ${CMAKE_CURRENT_SOURCE_DIR}/external/faiss) + find_path(FAISS_REPO_DIR NAMES faiss PATHS ${CMAKE_CURRENT_SOURCE_DIR}/external/faiss NO_DEFAULT_PATH) # If not, pull the updated submodule if (NOT EXISTS ${FAISS_REPO_DIR}) @@ -142,6 +142,18 @@ if (${CONFIG_FAISS} STREQUAL ON OR ${CONFIG_ALL} STREQUAL ON OR ${CONFIG_TEST} S execute_process(COMMAND git submodule update --init -- external/faiss WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif () + # Check if the patch exists; this is to skip git apply during the CI build. See CI.yml with ubuntu. 
+ find_path(PATCH_FILE NAMES 0001-Custom-patch-to-support-multi-vector.patch PATHS ${CMAKE_CURRENT_SOURCE_DIR}/patches/faiss NO_DEFAULT_PATH) + + # If it exists, apply patches + if (EXISTS ${PATCH_FILE}) + message(STATUS "Applying custom patches.") + execute_process(COMMAND git apply --ignore-space-change --ignore-whitespace --3way ${CMAKE_CURRENT_SOURCE_DIR}/patches/faiss/0001-Custom-patch-to-support-multi-vector.patch WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/external/faiss ERROR_VARIABLE ERROR_MSG RESULT_VARIABLE RESULT_CODE) + if(RESULT_CODE) + message(FATAL_ERROR "Failed to apply patch:\n${ERROR_MSG}") + endif() + endif() + set(FAISS_ENABLE_GPU OFF) set(FAISS_ENABLE_PYTHON OFF) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/external/faiss EXCLUDE_FROM_ALL) diff --git a/jni/patches/faiss/0001-Custom-patch-to-support-multi-vector.patch b/jni/patches/faiss/0001-Custom-patch-to-support-multi-vector.patch new file mode 100644 index 0000000000..97411c203d --- /dev/null +++ b/jni/patches/faiss/0001-Custom-patch-to-support-multi-vector.patch @@ -0,0 +1,281 @@ +From 8f89fbf1cf445a8216b9cc4ee52e7b82e24e906e Mon Sep 17 00:00:00 2001 +From: Heemin Kim +Date: Wed, 6 Dec 2023 16:33:52 -0800 +Subject: [PATCH] Custom patch to support multi-vector + +Signed-off-by: Heemin Kim +--- + faiss/CMakeLists.txt | 2 + + faiss/Index.h | 6 ++- + faiss/IndexIDMap.cpp | 20 ++++++++++ + faiss/IndexIDMap.h | 1 + + faiss/impl/HNSW.cpp | 25 ++++++++----- + faiss/impl/ResultCollector.h | 58 +++++++++++++++++++++++++++++ + faiss/impl/ResultCollectorFactory.h | 29 +++++++++++++++ + 7 files changed, 129 insertions(+), 12 deletions(-) + create mode 100644 faiss/impl/ResultCollector.h + create mode 100644 faiss/impl/ResultCollectorFactory.h + +diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt +index 27701586..af682a05 100644 +--- a/faiss/CMakeLists.txt ++++ b/faiss/CMakeLists.txt +@@ -162,6 +162,8 @@ set(FAISS_HEADERS + impl/ProductQuantizer.h + impl/Quantizer.h + impl/ResidualQuantizer.h 
++ impl/ResultCollector.h ++ impl/ResultCollectorFactory.h + impl/ResultHandler.h + impl/ScalarQuantizer.h + impl/ThreadedIndex-inl.h +diff --git a/faiss/Index.h b/faiss/Index.h +index 4b4b302b..13eab0c0 100644 +--- a/faiss/Index.h ++++ b/faiss/Index.h +@@ -38,11 +38,12 @@ + + namespace faiss { + +-/// Forward declarations see impl/AuxIndexStructures.h, impl/IDSelector.h and +-/// impl/DistanceComputer.h ++/// Forward declarations see impl/AuxIndexStructures.h, impl/IDSelector.h, ++/// impl/DistanceComputer.h, and impl/ResultCollectorFactory.h + struct IDSelector; + struct RangeSearchResult; + struct DistanceComputer; ++struct ResultCollectorFactory; + + /** Parent class for the optional search paramenters. + * +@@ -52,6 +53,7 @@ struct DistanceComputer; + struct SearchParameters { + /// if non-null, only these IDs will be considered during search. + IDSelector* sel = nullptr; ++ ResultCollectorFactory* col = nullptr; + /// make sure we can dynamic_cast this + virtual ~SearchParameters() {} + }; +diff --git a/faiss/IndexIDMap.cpp b/faiss/IndexIDMap.cpp +index 7972bec9..a5c017a9 100644 +--- a/faiss/IndexIDMap.cpp ++++ b/faiss/IndexIDMap.cpp +@@ -102,6 +102,20 @@ struct ScopedSelChange { + } + }; + ++/// RAII object to reset the ResultCollectorFactory in the params object ++struct ScopedColChange { ++ SearchParameters* params = nullptr; ++ void set(SearchParameters* params, const std::vector* id_map) { ++ this->params = params; ++ params->col->id_map = id_map; ++ } ++ ~ScopedColChange() { ++ if (params) { ++ params->col->id_map = nullptr; ++ } ++ } ++}; ++ + } // namespace + + template +@@ -114,6 +128,7 @@ void IndexIDMapTemplate::search( + const SearchParameters* params) const { + IDSelectorTranslated this_idtrans(this->id_map, nullptr); + ScopedSelChange sel_change; ++ ScopedColChange col_change; + + if (params && params->sel) { + auto idtrans = dynamic_cast(params->sel); +@@ -131,6 +146,11 @@ void IndexIDMapTemplate::search( + sel_change.set(params_non_const, 
&this_idtrans); + } + } ++ ++ if (params && params->col) { ++ auto params_non_const = const_cast(params); ++ col_change.set(params_non_const, &this->id_map); ++ } + index->search(n, x, k, distances, labels, params); + idx_t* li = labels; + #pragma omp parallel for +diff --git a/faiss/IndexIDMap.h b/faiss/IndexIDMap.h +index 2d164123..c6a1be73 100644 +--- a/faiss/IndexIDMap.h ++++ b/faiss/IndexIDMap.h +@@ -10,6 +10,7 @@ + #include + #include + #include ++#include + + #include + #include +diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp +index 9fc201ea..540210a6 100644 +--- a/faiss/impl/HNSW.cpp ++++ b/faiss/impl/HNSW.cpp +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -530,6 +531,15 @@ int search_from_candidates( + int level, + int nres_in = 0, + const SearchParametersHNSW* params = nullptr) { ++ ResultCollectorFactory defaultFactory; ++ ResultCollectorFactory* collectorFactory; ++ if (params == nullptr || params->col == nullptr) { ++ collectorFactory = &defaultFactory; ++ } else { ++ collectorFactory = params->col; ++ } ++ ResultCollector* collector = collectorFactory->newCollector(); ++ + int nres = nres_in; + int ndis = 0; + +@@ -544,11 +554,7 @@ int search_from_candidates( + float d = candidates.dis[i]; + FAISS_ASSERT(v1 >= 0); + if (!sel || sel->is_member(v1)) { +- if (nres < k) { +- faiss::maxheap_push(++nres, D, I, d, v1); +- } else if (d < D[0]) { +- faiss::maxheap_replace_top(nres, D, I, d, v1); +- } ++ collector->collect(k, nres, D, I, d, v1); + } + vt.set(v1); + } +@@ -612,11 +618,7 @@ int search_from_candidates( + + auto add_to_heap = [&](const size_t idx, const float dis) { + if (!sel || sel->is_member(idx)) { +- if (nres < k) { +- faiss::maxheap_push(++nres, D, I, dis, idx); +- } else if (dis < D[0]) { +- faiss::maxheap_replace_top(nres, D, I, dis, idx); +- } ++ collector->collect(k, nres, D, I, dis, idx); + } + candidates.push(idx, dis); + }; +@@ -660,6 +662,9 @@ int search_from_candidates( + } + } + ++ 
collector->finalize(nres, I); ++ collectorFactory->deleteCollector(collector); ++ + if (level == 0) { + stats.n1++; + if (candidates.size() == 0) { +diff --git a/faiss/impl/ResultCollector.h b/faiss/impl/ResultCollector.h +new file mode 100644 +index 00000000..3e4dac34 +--- /dev/null ++++ b/faiss/impl/ResultCollector.h +@@ -0,0 +1,58 @@ ++/** ++ * Copyright (c) Facebook, Inc. and its affiliates. ++ * ++ * This source code is licensed under the MIT license found in the ++ * LICENSE file in the root directory of this source tree. ++ */ ++ ++#pragma once ++ ++#include ++#include ++ ++#include ++#include ++ ++/** ResultCollector is intended to define how to collect search results */ ++ ++namespace faiss { ++ ++/** Interface that decides how each search result is stored. */ ++struct ResultCollector { ++ // For each result, the collect method is called to store it ++ virtual void collect( ++ int k, ++ int& nres, ++ float* bh_val, ++ idx_t* bh_ids, ++ float val, ++ idx_t ids) = 0; ++ ++ // This method is called after all results are collected ++ virtual void finalize(idx_t nres, idx_t* bh_ids) = 0; ++ virtual ~ResultCollector() {} ++}; ++ ++struct DefaultCollector : ResultCollector { ++ void collect( ++ int k, ++ int& nres, ++ float* bh_val, ++ idx_t* bh_ids, ++ float val, ++ idx_t ids) override { ++ if (nres < k) { ++ faiss::maxheap_push(++nres, bh_val, bh_ids, val, ids); ++ } else if (val < bh_val[0]) { ++ faiss::maxheap_replace_top(nres, bh_val, bh_ids, val, ids); ++ } ++ } ++ ++ void finalize(idx_t nres, idx_t* bh_ids) override { ++ // Do nothing ++ } ++ ++ ~DefaultCollector() override {} ++}; ++ ++} // namespace faiss +diff --git a/faiss/impl/ResultCollectorFactory.h b/faiss/impl/ResultCollectorFactory.h +new file mode 100644 +index 00000000..4d903f8d +--- /dev/null ++++ b/faiss/impl/ResultCollectorFactory.h +@@ -0,0 +1,29 @@ ++/** ++ * Copyright (c) Facebook, Inc. and its affiliates. 
++ * ++ * This source code is licensed under the MIT license found in the ++ * LICENSE file in the root directory of this source tree. ++ */ ++ ++#pragma once ++#include ++namespace faiss { ++ ++/** ResultCollectorFactory supplies the ResultCollector used by a search */ ++struct ResultCollectorFactory { ++ DefaultCollector default_collector; ++ const std::vector* id_map; ++ ++ // Returns the collector to use; by default the shared default_collector ++ virtual ResultCollector* newCollector() { ++ return &default_collector; ++ } ++ ++ virtual void deleteCollector(ResultCollector* collector) { ++ // Do nothing ++ } ++ // Virtual destructor so that subclasses can be destroyed via this type ++ virtual ~ResultCollectorFactory() {} ++}; ++ ++} // namespace faiss +-- +2.39.3 (Apple Git-145) +