From 610c0dcc463d71ffe6165cf11973af24892e5caa Mon Sep 17 00:00:00 2001 From: Heemin Kim Date: Wed, 20 Dec 2023 13:42:50 -0800 Subject: [PATCH] Add patch to support multi vector in faiss Signed-off-by: Heemin Kim --- .github/workflows/CI.yml | 9 + CHANGELOG.md | 1 + jni/CMakeLists.txt | 23 +- ...Custom-patch-to-support-multi-vector.patch | 221 ++++++++++++++++++ 4 files changed, 251 insertions(+), 3 deletions(-) create mode 100644 jni/patches/faiss/0001-Custom-patch-to-support-multi-vector.patch diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index c807e348e6..f932a24bd5 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -38,6 +38,15 @@ jobs: with: submodules: true + # Git functionality in CMAKE file does not work with given ubuntu image. Therefore, handling it here. + - name: Apply Git Patch + run: | + cd jni/external/faiss + git config --global core.autocrlf input + git apply --ignore-space-change --ignore-whitespace --3way ../../patches/faiss/0001-Custom-patch-to-support-multi-vector.patch + rm ../../patches/faiss/0001-Custom-patch-to-support-multi-vector.patch + working-directory: ${{ github.workspace }} + - name: Setup Java ${{ matrix.java }} uses: actions/setup-java@v1 with: diff --git a/CHANGELOG.md b/CHANGELOG.md index 3328dda520..9e40762156 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,3 +28,4 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), * Upgrade urllib to 1.26.18 [#1319](https://github.com/opensearch-project/k-NN/pull/1319) * Upgrade guava to 32.1.3 [#1319](https://github.com/opensearch-project/k-NN/pull/1319) ### Refactoring +* Add patch to support multi vector in faiss [#1358](https://github.com/opensearch-project/k-NN/pull/1358) diff --git a/jni/CMakeLists.txt b/jni/CMakeLists.txt index 29a844ee07..6cb8080457 100644 --- a/jni/CMakeLists.txt +++ b/jni/CMakeLists.txt @@ -58,6 +58,13 @@ if (${CMAKE_SYSTEM_PROCESSOR} STREQUAL aarch64) elseif(${CMAKE_SYSTEM_PROCESSOR} 
STREQUAL x86_64) set(MACH_ARCH x64) endif() + +# Set git config to make `git apply` work across platforms +if (NOT "${WIN32}" STREQUAL "") + execute_process(COMMAND git config --global core.autocrlf input) +else() + execute_process(COMMAND git config --global core.autocrlf true) +endif() # ---------------------------------------------------------------------------- # ---------------------------------- COMMON ---------------------------------- @@ -79,7 +86,7 @@ list(APPEND TARGET_LIBS ${TARGET_LIB_COMMON}) # ---------------------------------- NMSLIB ---------------------------------- if (${CONFIG_NMSLIB} STREQUAL ON OR ${CONFIG_ALL} STREQUAL ON OR ${CONFIG_TEST} STREQUAL ON) # Check if nmslib exists - find_path(NMS_REPO_DIR NAMES similarity_search PATHS ${CMAKE_CURRENT_SOURCE_DIR}/external/nmslib) + find_path(NMS_REPO_DIR NAMES similarity_search PATHS ${CMAKE_CURRENT_SOURCE_DIR}/external/nmslib NO_DEFAULT_PATH) # If not, pull the updated submodule if (NOT EXISTS ${NMS_REPO_DIR}) @@ -134,14 +141,24 @@ if (${CONFIG_FAISS} STREQUAL ON OR ${CONFIG_ALL} STREQUAL ON OR ${CONFIG_TEST} S find_package(LAPACK REQUIRED) # Check if faiss exists - find_path(FAISS_REPO_DIR NAMES faiss PATHS ${CMAKE_CURRENT_SOURCE_DIR}/external/faiss) + find_path(FAISS_REPO_DIR NAMES faiss PATHS ${CMAKE_CURRENT_SOURCE_DIR}/external/faiss NO_DEFAULT_PATH) - # If not, pull the updated submodule + # If not, pull the updated submodule and apply patches if (NOT EXISTS ${FAISS_REPO_DIR}) message(STATUS "Could not find faiss. Pulling updated submodule.") execute_process(COMMAND git submodule update --init -- external/faiss WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif () + # Check if the patch exists; this is to skip git apply during CI build. See CI.yml with ubuntu. 
+ find_path(PATCH_FILE NAMES 0001-Custom-patch-to-support-multi-vector.patch PATHS ${CMAKE_CURRENT_SOURCE_DIR}/patches/faiss NO_DEFAULT_PATH) + if (EXISTS ${PATCH_FILE}) + message(STATUS "Applying custom patches.") + execute_process(COMMAND git apply --ignore-space-change --ignore-whitespace --3way ${CMAKE_CURRENT_SOURCE_DIR}/patches/faiss/0001-Custom-patch-to-support-multi-vector.patch WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/external/faiss ERROR_VARIABLE ERROR_MSG RESULT_VARIABLE RESULT_CODE) + if(RESULT_CODE) + message(FATAL_ERROR "Failed to apply patch:\n${ERROR_MSG}") + endif() + endif() + set(FAISS_ENABLE_GPU OFF) set(FAISS_ENABLE_PYTHON OFF) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/external/faiss EXCLUDE_FROM_ALL) diff --git a/jni/patches/faiss/0001-Custom-patch-to-support-multi-vector.patch b/jni/patches/faiss/0001-Custom-patch-to-support-multi-vector.patch new file mode 100644 index 0000000000..83ff36f15b --- /dev/null +++ b/jni/patches/faiss/0001-Custom-patch-to-support-multi-vector.patch @@ -0,0 +1,221 @@ +From 864c1abe7bdced5d306e871ea2bd73e1e35987fd Mon Sep 17 00:00:00 2001 +From: Heemin Kim +Date: Wed, 6 Dec 2023 16:33:52 -0800 +Subject: [PATCH] Custom patch to support multi-vector + +Signed-off-by: Heemin Kim +--- + faiss/CMakeLists.txt | 2 + + faiss/Index.h | 6 ++- + faiss/impl/HNSW.cpp | 25 ++++++++----- + faiss/impl/ResultCollector.h | 58 +++++++++++++++++++++++++++++ + faiss/impl/ResultCollectorFactory.h | 28 ++++++++++++++ + 5 files changed, 107 insertions(+), 12 deletions(-) + create mode 100644 faiss/impl/ResultCollector.h + create mode 100644 faiss/impl/ResultCollectorFactory.h + +diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt +index 27701586..af682a05 100644 +--- a/faiss/CMakeLists.txt ++++ b/faiss/CMakeLists.txt +@@ -162,6 +162,8 @@ set(FAISS_HEADERS + impl/ProductQuantizer.h + impl/Quantizer.h + impl/ResidualQuantizer.h ++ impl/ResultCollector.h ++ impl/ResultCollectorFactory.h + impl/ResultHandler.h + 
impl/ScalarQuantizer.h + impl/ThreadedIndex-inl.h +diff --git a/faiss/Index.h b/faiss/Index.h +index 4b4b302b..13eab0c0 100644 +--- a/faiss/Index.h ++++ b/faiss/Index.h +@@ -38,11 +38,12 @@ + + namespace faiss { + +-/// Forward declarations see impl/AuxIndexStructures.h, impl/IDSelector.h and +-/// impl/DistanceComputer.h ++/// Forward declarations see impl/AuxIndexStructures.h, impl/IDSelector.h, ++/// impl/DistanceComputer.h, and impl/ResultCollectorFactory.h + struct IDSelector; + struct RangeSearchResult; + struct DistanceComputer; ++struct ResultCollectorFactory; + + /** Parent class for the optional search paramenters. + * +@@ -52,6 +53,7 @@ struct DistanceComputer; + struct SearchParameters { + /// if non-null, only these IDs will be considered during search. + IDSelector* sel = nullptr; ++ ResultCollectorFactory* col = nullptr; + /// make sure we can dynamic_cast this + virtual ~SearchParameters() {} + }; +diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp +index 9fc201ea..540210a6 100644 +--- a/faiss/impl/HNSW.cpp ++++ b/faiss/impl/HNSW.cpp +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -530,6 +531,15 @@ int search_from_candidates( + int level, + int nres_in = 0, + const SearchParametersHNSW* params = nullptr) { ++ ResultCollectorFactory defaultFactory; ++ ResultCollectorFactory* collectorFactory; ++ if (params == nullptr || params->col == nullptr) { ++ collectorFactory = &defaultFactory; ++ } else { ++ collectorFactory = params->col; ++ } ++ ResultCollector* collector = collectorFactory->newCollector(); ++ + int nres = nres_in; + int ndis = 0; + +@@ -544,11 +554,7 @@ int search_from_candidates( + float d = candidates.dis[i]; + FAISS_ASSERT(v1 >= 0); + if (!sel || sel->is_member(v1)) { +- if (nres < k) { +- faiss::maxheap_push(++nres, D, I, d, v1); +- } else if (d < D[0]) { +- faiss::maxheap_replace_top(nres, D, I, d, v1); +- } ++ collector->collect(k, nres, D, I, d, v1); + } + vt.set(v1); + } +@@ -612,11 
+618,7 @@ int search_from_candidates( + + auto add_to_heap = [&](const size_t idx, const float dis) { + if (!sel || sel->is_member(idx)) { +- if (nres < k) { +- faiss::maxheap_push(++nres, D, I, dis, idx); +- } else if (dis < D[0]) { +- faiss::maxheap_replace_top(nres, D, I, dis, idx); +- } ++ collector->collect(k, nres, D, I, dis, idx); + } + candidates.push(idx, dis); + }; +@@ -660,6 +662,9 @@ int search_from_candidates( + } + } + ++ collector->finalize(nres, I); ++ collectorFactory->deleteCollector(collector); ++ + if (level == 0) { + stats.n1++; + if (candidates.size() == 0) { +diff --git a/faiss/impl/ResultCollector.h b/faiss/impl/ResultCollector.h +new file mode 100644 +index 00000000..3e4dac34 +--- /dev/null ++++ b/faiss/impl/ResultCollector.h +@@ -0,0 +1,58 @@ ++/** ++ * Copyright (c) Facebook, Inc. and its affiliates. ++ * ++ * This source code is licensed under the MIT license found in the ++ * LICENSE file in the root directory of this source tree. ++ */ ++ ++#pragma once ++ ++#include ++#include ++ ++#include ++#include ++ ++/** ResultCollector defines how search results are collected */ ++ ++namespace faiss { ++ ++/** Interface with collect() to store each result and finalize() to run once afterwards. 
*/ ++struct ResultCollector { ++ // Called once for each result to store it ++ virtual void collect( ++ int k, ++ int& nres, ++ float* bh_val, ++ idx_t* bh_ids, ++ float val, ++ idx_t ids) = 0; ++ ++ // Called once after all results have been collected ++ virtual void finalize(idx_t nres, idx_t* bh_ids) = 0; ++ virtual ~ResultCollector() {} ++}; ++ ++struct DefaultCollector : ResultCollector { ++ void collect( ++ int k, ++ int& nres, ++ float* bh_val, ++ idx_t* bh_ids, ++ float val, ++ idx_t ids) override { ++ if (nres < k) { ++ faiss::maxheap_push(++nres, bh_val, bh_ids, val, ids); ++ } else if (val < bh_val[0]) { ++ faiss::maxheap_replace_top(nres, bh_val, bh_ids, val, ids); ++ } ++ } ++ ++ void finalize(idx_t nres, idx_t* bh_ids) override { ++ // Do nothing ++ } ++ ++ ~DefaultCollector() override {} ++}; ++ ++} // namespace faiss +diff --git a/faiss/impl/ResultCollectorFactory.h b/faiss/impl/ResultCollectorFactory.h +new file mode 100644 +index 00000000..6a15208a +--- /dev/null ++++ b/faiss/impl/ResultCollectorFactory.h +@@ -0,0 +1,28 @@ ++/** ++ * Copyright (c) Facebook, Inc. and its affiliates. ++ * ++ * This source code is licensed under the MIT license found in the ++ * LICENSE file in the root directory of this source tree. ++ */ ++ ++#pragma once ++#include ++namespace faiss { ++ ++/** ResultCollectorFactory hands out and releases ResultCollector instances */ ++struct ResultCollectorFactory { ++ DefaultCollector defaultCollector; ++ ++ // Returns the collector to use; this default returns the address of defaultCollector ++ virtual ResultCollector* newCollector() { ++ return &defaultCollector; ++ } ++ ++ virtual void deleteCollector(ResultCollector* collector) { ++ // Do nothing ++ } ++ // Virtual destructor, matching the virtual methods declared above ++ virtual ~ResultCollectorFactory() {} ++}; ++ ++} // namespace faiss +-- +2.39.3 (Apple Git-145) +