diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index b58741b921..04ec15fd41 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -38,6 +38,15 @@ jobs: with: submodules: true + # Git functionality in CMAKE file does not work with given ubuntu image. Therefore, handling it here. + - name: Apply Git Patch + # Deleting file at the end to skip `git apply` inside CMAKE file + run: | + cd jni/external/faiss + git apply --ignore-space-change --ignore-whitespace --3way ../../patches/faiss/0001-Custom-patch-to-support-multi-vector.patch + rm ../../patches/faiss/0001-Custom-patch-to-support-multi-vector.patch + working-directory: ${{ github.workspace }} + - name: Setup Java ${{ matrix.java }} uses: actions/setup-java@v1 with: diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md index 2e7f7322c8..a62437a8b7 100644 --- a/DEVELOPER_GUIDE.md +++ b/DEVELOPER_GUIDE.md @@ -229,6 +229,13 @@ For users that want to get the most out of the libraries, they should follow [th and build the libraries from source in their production environment, so that if their environment has optimized instruction sets, they take advantage of them. +### Custom patch on JNI Library +If you want to make a custom patch on JNI library +1. Make a change on top of current version of JNI library and push the commit locally. +2. Create a patch file for the change using `git format-patch -o patches HEAD^` +3. Place the patch file under `jni/patches` +4. Make a change in `jni/CmakeLists.txt`, `.github/workflows/CI.yml` to apply the patch during build + ## Run OpenSearch k-NN ### Run Single-node Cluster Locally diff --git a/jni/CMakeLists.txt b/jni/CMakeLists.txt index 29a844ee07..6e66e17ac7 100644 --- a/jni/CMakeLists.txt +++ b/jni/CMakeLists.txt @@ -79,7 +79,7 @@ list(APPEND TARGET_LIBS ${TARGET_LIB_COMMON}) # ---------------------------------- NMSLIB ---------------------------------- if (${CONFIG_NMSLIB} STREQUAL ON OR ${CONFIG_ALL} STREQUAL ON OR ${CONFIG_TEST} STREQUAL ON) # Check if nmslib exists - find_path(NMS_REPO_DIR NAMES similarity_search PATHS ${CMAKE_CURRENT_SOURCE_DIR}/external/nmslib) + find_path(NMS_REPO_DIR NAMES similarity_search PATHS ${CMAKE_CURRENT_SOURCE_DIR}/external/nmslib NO_DEFAULT_PATH) # If not, pull the updated submodule if (NOT EXISTS ${NMS_REPO_DIR}) @@ -134,7 +134,7 @@ if (${CONFIG_FAISS} STREQUAL ON OR ${CONFIG_ALL} STREQUAL ON OR ${CONFIG_TEST} S find_package(LAPACK REQUIRED) # Check if faiss exists - find_path(FAISS_REPO_DIR NAMES faiss PATHS ${CMAKE_CURRENT_SOURCE_DIR}/external/faiss) + find_path(FAISS_REPO_DIR NAMES faiss PATHS ${CMAKE_CURRENT_SOURCE_DIR}/external/faiss NO_DEFAULT_PATH) # If not, pull the updated submodule if (NOT EXISTS ${FAISS_REPO_DIR}) @@ -142,6 +142,18 @@ if (${CONFIG_FAISS} STREQUAL ON OR ${CONFIG_ALL} STREQUAL ON OR ${CONFIG_TEST} S execute_process(COMMAND git submodule update --init -- external/faiss WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif () + # Check if patch exist, this is to skip git apply during CI build. See CI.yml with ubuntu. + find_path(PATCH_FILE NAMES 0001-Custom-patch-to-support-multi-vector.patch PATHS ${CMAKE_CURRENT_SOURCE_DIR}/patches/faiss NO_DEFAULT_PATH) + + # If it exists, apply patches + if (EXISTS ${PATCH_FILE}) + message(STATUS "Applying custom patches.") + execute_process(COMMAND git apply --ignore-space-change --ignore-whitespace --3way ${CMAKE_CURRENT_SOURCE_DIR}/patches/faiss/0001-Custom-patch-to-support-multi-vector.patch WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/external/faiss ERROR_VARIABLE ERROR_MSG RESULT_VARIABLE RESULT_CODE) + if(RESULT_CODE) + message(FATAL_ERROR "Failed to apply patch:\n${ERROR_MSG}") + endif() + endif() + set(FAISS_ENABLE_GPU OFF) set(FAISS_ENABLE_PYTHON OFF) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/external/faiss EXCLUDE_FROM_ALL) diff --git a/jni/patches/faiss/0001-Custom-patch-to-support-multi-vector.patch b/jni/patches/faiss/0001-Custom-patch-to-support-multi-vector.patch new file mode 100644 index 0000000000..8bd42a1f56 --- /dev/null +++ b/jni/patches/faiss/0001-Custom-patch-to-support-multi-vector.patch @@ -0,0 +1,301 @@ +From 70c9b5d6408bb85fd1503002d29f9428fe0d48dc Mon Sep 17 00:00:00 2001 +From: Heemin Kim +Date: Wed, 6 Dec 2023 16:33:52 -0800 +Subject: [PATCH] Custom patch to support multi-vector + +Signed-off-by: Heemin Kim +--- + faiss/CMakeLists.txt | 2 + + faiss/Index.h | 6 ++- + faiss/IndexIDMap.cpp | 24 ++++++++++ + faiss/impl/HNSW.cpp | 27 +++++++---- + faiss/impl/ResultCollector.h | 74 +++++++++++++++++++++++++++++ + faiss/impl/ResultCollectorFactory.h | 33 +++++++++++++ + 6 files changed, 154 insertions(+), 12 deletions(-) + create mode 100644 faiss/impl/ResultCollector.h + create mode 100644 faiss/impl/ResultCollectorFactory.h + +diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt +index 27701586..af682a05 100644 +--- a/faiss/CMakeLists.txt ++++ b/faiss/CMakeLists.txt +@@ -162,6 +162,8 @@ set(FAISS_HEADERS + impl/ProductQuantizer.h + impl/Quantizer.h + impl/ResidualQuantizer.h ++ impl/ResultCollector.h ++ impl/ResultCollectorFactory.h + impl/ResultHandler.h + impl/ScalarQuantizer.h + impl/ThreadedIndex-inl.h +diff --git a/faiss/Index.h b/faiss/Index.h +index 4b4b302b..13eab0c0 100644 +--- a/faiss/Index.h ++++ b/faiss/Index.h +@@ -38,11 +38,12 @@ + + namespace faiss { + +-/// Forward declarations see impl/AuxIndexStructures.h, impl/IDSelector.h and +-/// impl/DistanceComputer.h ++/// Forward declarations see impl/AuxIndexStructures.h, impl/IDSelector.h, ++/// impl/DistanceComputer.h, and impl/ResultCollectorFactory.h + struct IDSelector; + struct RangeSearchResult; + struct DistanceComputer; ++struct ResultCollectorFactory; + + /** Parent class for the optional search paramenters. + * +@@ -52,6 +53,7 @@ struct DistanceComputer; + struct SearchParameters { + /// if non-null, only these IDs will be considered during search. + IDSelector* sel = nullptr; ++ ResultCollectorFactory* col = nullptr; + /// make sure we can dynamic_cast this + virtual ~SearchParameters() {} + }; +diff --git a/faiss/IndexIDMap.cpp b/faiss/IndexIDMap.cpp +index 7972bec9..0f82a17c 100644 +--- a/faiss/IndexIDMap.cpp ++++ b/faiss/IndexIDMap.cpp +@@ -18,6 +18,7 @@ + #include + #include + #include ++#include + + namespace faiss { + +@@ -102,6 +103,24 @@ struct ScopedSelChange { + } + }; + ++// RAII object to reset the id_map parameter in ResultCollectorFactory object ++// This object make sure to reset the id_map parameter in ResultCollectorFactory ++// once the program exist current method scope. ++struct ScopedColChange { ++ ResultCollectorFactory* collector_factory = nullptr; ++ void set( ++ ResultCollectorFactory* collector_factory, ++ const std::vector* id_map) { ++ this->collector_factory = collector_factory; ++ collector_factory->id_map = id_map; ++ } ++ ~ScopedColChange() { ++ if (collector_factory) { ++ collector_factory->id_map = nullptr; ++ } ++ } ++}; ++ + } // namespace + + template +@@ -114,6 +133,7 @@ void IndexIDMapTemplate::search( + const SearchParameters* params) const { + IDSelectorTranslated this_idtrans(this->id_map, nullptr); + ScopedSelChange sel_change; ++ ScopedColChange col_change; + + if (params && params->sel) { + auto idtrans = dynamic_cast(params->sel); +@@ -131,6 +151,10 @@ void IndexIDMapTemplate::search( + sel_change.set(params_non_const, &this_idtrans); + } + } ++ ++ if (params && params->col && !params->col->id_map) { ++ col_change.set(params->col, &this->id_map); ++ } + index->search(n, x, k, distances, labels, params); + idx_t* li = labels; + #pragma omp parallel for +diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp +index 9fc201ea..5b5900d1 100644 +--- a/faiss/impl/HNSW.cpp ++++ b/faiss/impl/HNSW.cpp +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -530,6 +531,15 @@ int search_from_candidates( + int level, + int nres_in = 0, + const SearchParametersHNSW* params = nullptr) { ++ ResultCollectorFactory defaultFactory; ++ ResultCollectorFactory* collectorFactory; ++ if (params == nullptr || params->col == nullptr) { ++ collectorFactory = &defaultFactory; ++ } else { ++ collectorFactory = params->col; ++ } ++ ResultCollector* collector = collectorFactory->new_collector(); ++ + int nres = nres_in; + int ndis = 0; + +@@ -544,11 +554,7 @@ int search_from_candidates( + float d = candidates.dis[i]; + FAISS_ASSERT(v1 >= 0); + if (!sel || sel->is_member(v1)) { +- if (nres < k) { +- faiss::maxheap_push(++nres, D, I, d, v1); +- } else if (d < D[0]) { +- faiss::maxheap_replace_top(nres, D, I, d, v1); +- } ++ collector->collect(k, nres, D, I, d, v1); + } + vt.set(v1); + } +@@ -612,11 +618,7 @@ int search_from_candidates( + + auto add_to_heap = [&](const size_t idx, const float dis) { + if (!sel || sel->is_member(idx)) { +- if (nres < k) { +- faiss::maxheap_push(++nres, D, I, dis, idx); +- } else if (dis < D[0]) { +- faiss::maxheap_replace_top(nres, D, I, dis, idx); +- } ++ collector->collect(k, nres, D, I, dis, idx); + } + candidates.push(idx, dis); + }; +@@ -660,6 +662,11 @@ int search_from_candidates( + } + } + ++ // Completed collection of result. Run post processor. ++ collector->post_process(nres, I); ++ // Collector completed its task. Release all resource of the collector. ++ collectorFactory->delete_collector(collector); ++ + if (level == 0) { + stats.n1++; + if (candidates.size() == 0) { +diff --git a/faiss/impl/ResultCollector.h b/faiss/impl/ResultCollector.h +new file mode 100644 +index 00000000..a0489fd6 +--- /dev/null ++++ b/faiss/impl/ResultCollector.h +@@ -0,0 +1,74 @@ ++/** ++ * Copyright (c) Facebook, Inc. and its affiliates. ++ * ++ * This source code is licensed under the MIT license found in the ++ * LICENSE file in the root directory of this source tree. ++ */ ++ ++#pragma once ++ ++#include ++#include ++ ++#include ++#include ++ ++/** ++ * ResultCollector is intended to define how to collect search result ++ * For each single search result, collect method will be called. ++ * After every results are collected, post_process method is called at the end. ++ */ ++ ++namespace faiss { ++ ++/** Encapsulates a set of ids to handle. */ ++struct ResultCollector { ++ /** ++ * For each result, collect method is called to store result ++ * @param k number of vectors to search ++ * @param nres number of results in queue ++ * @param bh_val search result, distances from query ++ * @param bh_ids search result, ids of vectors ++ * @param val distance from query for current vector ++ * @param ids id of current vector ++ */ ++ virtual void collect( ++ int k, ++ int& nres, ++ float* bh_val, ++ idx_t* bh_ids, ++ float val, ++ idx_t ids) = 0; ++ ++ // This method is called after all result is collected ++ virtual void post_process(idx_t nres, idx_t* bh_ids) = 0; ++ virtual ~ResultCollector() {} ++}; ++ ++struct DefaultCollector : ResultCollector { ++ void collect( ++ int k, ++ int& nres, ++ float* bh_val, ++ idx_t* bh_ids, ++ float val, ++ idx_t ids) override { ++ if (nres < k) { ++ faiss::maxheap_push(++nres, bh_val, bh_ids, val, ids); ++ } else if (val < bh_val[0]) { ++ faiss::maxheap_replace_top(nres, bh_val, bh_ids, val, ids); ++ } ++ } ++ ++ // This method is called once all result is collected so that final post ++ // processing can be done For example, if the result is collected using ++ // group id, the group id can be converted back to its original id inside ++ // this method ++ void post_process(idx_t nres, idx_t* bh_ids) override { ++ // Do nothing ++ } ++ ++ ~DefaultCollector() override {} ++}; ++ ++} // namespace faiss +diff --git a/faiss/impl/ResultCollectorFactory.h b/faiss/impl/ResultCollectorFactory.h +new file mode 100644 +index 00000000..70918266 +--- /dev/null ++++ b/faiss/impl/ResultCollectorFactory.h +@@ -0,0 +1,33 @@ ++/** ++ * Copyright (c) Facebook, Inc. and its affiliates. ++ * ++ * This source code is licensed under the MIT license found in the ++ * LICENSE file in the root directory of this source tree. ++ */ ++ ++#pragma once ++#include ++namespace faiss { ++ ++/** ResultCollectorFactory to create a ResultCollector object */ ++struct ResultCollectorFactory { ++ DefaultCollector default_collector; ++ const std::vector* id_map; ++ ++ // Create a new ResultCollector object ++ virtual ResultCollector* new_collector() { ++ return &default_collector; ++ } ++ ++ // For default case, the factory share single object and no need to delete ++ // the object. For other case, the factory can create a new object which ++ // need to be deleted later. We have deleteCollector method to handle both ++ // case as factory class knows how to release resource that it created ++ virtual void delete_collector(ResultCollector* collector) { ++ // Do nothing ++ } ++ ++ virtual ~ResultCollectorFactory() {} ++}; ++ ++} // namespace faiss +-- +2.39.3 (Apple Git-145) +