diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index c807e348e6..c1b9a85ac1 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -38,6 +38,15 @@ jobs: with: submodules: true + # Git functionality in CMAKE file does not work with given ubuntu image. Therefore, handling it here. + - name: Apply Git Patch + # Deleting file at the end to skip `git apply` inside CMAKE file + run: | + cd jni/external/faiss + git apply --ignore-space-change --ignore-whitespace --3way ../../patches/faiss/0001-Custom-patch-to-support-multi-vector.patch + rm ../../patches/faiss/0001-Custom-patch-to-support-multi-vector.patch + working-directory: ${{ github.workspace }} + - name: Setup Java ${{ matrix.java }} uses: actions/setup-java@v1 with: diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md index cbfaf0f4c6..b0329d70bb 100644 --- a/DEVELOPER_GUIDE.md +++ b/DEVELOPER_GUIDE.md @@ -229,6 +229,13 @@ For users that want to get the most out of the libraries, they should follow [th and build the libraries from source in their production environment, so that if their environment has optimized instruction sets, they take advantage of them. +### Custom patch on JNI Library +If you want to make a custom patch on a JNI library: +1. Make a change on top of the current version of the JNI library and commit it locally. +2. Create a patch file for the change using `git format-patch -o patches HEAD^` +3. Place the patch file under `jni/patches` +4. 
Make a change in `jni/CMakeLists.txt` and `.github/workflows/CI.yml` to apply the patch during build + ## Run OpenSearch k-NN ### Run Single-node Cluster Locally diff --git a/jni/CMakeLists.txt b/jni/CMakeLists.txt index 29a844ee07..6e66e17ac7 100644 --- a/jni/CMakeLists.txt +++ b/jni/CMakeLists.txt @@ -79,7 +79,7 @@ list(APPEND TARGET_LIBS ${TARGET_LIB_COMMON}) # ---------------------------------- NMSLIB ---------------------------------- if (${CONFIG_NMSLIB} STREQUAL ON OR ${CONFIG_ALL} STREQUAL ON OR ${CONFIG_TEST} STREQUAL ON) # Check if nmslib exists - find_path(NMS_REPO_DIR NAMES similarity_search PATHS ${CMAKE_CURRENT_SOURCE_DIR}/external/nmslib) + find_path(NMS_REPO_DIR NAMES similarity_search PATHS ${CMAKE_CURRENT_SOURCE_DIR}/external/nmslib NO_DEFAULT_PATH) # If not, pull the updated submodule if (NOT EXISTS ${NMS_REPO_DIR}) @@ -134,7 +134,7 @@ if (${CONFIG_FAISS} STREQUAL ON OR ${CONFIG_ALL} STREQUAL ON OR ${CONFIG_TEST} S find_package(LAPACK REQUIRED) # Check if faiss exists - find_path(FAISS_REPO_DIR NAMES faiss PATHS ${CMAKE_CURRENT_SOURCE_DIR}/external/faiss) + find_path(FAISS_REPO_DIR NAMES faiss PATHS ${CMAKE_CURRENT_SOURCE_DIR}/external/faiss NO_DEFAULT_PATH) # If not, pull the updated submodule if (NOT EXISTS ${FAISS_REPO_DIR}) @@ -142,6 +142,18 @@ if (${CONFIG_FAISS} STREQUAL ON OR ${CONFIG_ALL} STREQUAL ON OR ${CONFIG_TEST} S execute_process(COMMAND git submodule update --init -- external/faiss WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif () + # Check if the patch exists; this is to skip git apply during the CI build. See CI.yml with ubuntu. 
+ set(PATCH_FILE "${CMAKE_CURRENT_SOURCE_DIR}/patches/faiss/0001-Custom-patch-to-support-multi-vector.patch") + + # Apply the patch only if the file is present; EXISTS is re-evaluated on every configure, so no stale cached find result is used + if (EXISTS "${PATCH_FILE}") + message(STATUS "Applying custom patches.") + execute_process(COMMAND git apply --ignore-space-change --ignore-whitespace --3way "${PATCH_FILE}" WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/external/faiss ERROR_VARIABLE ERROR_MSG RESULT_VARIABLE RESULT_CODE) + if(RESULT_CODE) + message(FATAL_ERROR "Failed to apply patch:\n${ERROR_MSG}") + endif() + endif() + set(FAISS_ENABLE_GPU OFF) set(FAISS_ENABLE_PYTHON OFF) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/external/faiss EXCLUDE_FROM_ALL) diff --git a/jni/patches/faiss/0001-Custom-patch-to-support-multi-vector.patch b/jni/patches/faiss/0001-Custom-patch-to-support-multi-vector.patch new file mode 100644 index 0000000000..8c637314df --- /dev/null +++ b/jni/patches/faiss/0001-Custom-patch-to-support-multi-vector.patch @@ -0,0 +1,305 @@ +From c59bc53ffeea69b3fcc66902cefed1601644095f Mon Sep 17 00:00:00 2001 +From: Heemin Kim +Date: Wed, 6 Dec 2023 16:33:52 -0800 +Subject: [PATCH] Custom patch to support multi-vector + +Signed-off-by: Heemin Kim +--- + faiss/CMakeLists.txt | 2 + + faiss/Index.h | 6 ++- + faiss/IndexIDMap.cpp | 23 +++++++++++ + faiss/IndexIDMap.h | 1 + + faiss/impl/HNSW.cpp | 31 +++++++++------ + faiss/impl/ResultCollector.h | 62 +++++++++++++++++++++++++++++ + faiss/impl/ResultCollectorFactory.h | 33 +++++++++++++++ + 7 files changed, 144 insertions(+), 14 deletions(-) + create mode 100644 faiss/impl/ResultCollector.h + create mode 100644 faiss/impl/ResultCollectorFactory.h + +diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt +index 27701586..af682a05 100644 +--- a/faiss/CMakeLists.txt ++++ b/faiss/CMakeLists.txt +@@ -162,6 +162,8 @@ set(FAISS_HEADERS + impl/ProductQuantizer.h + impl/Quantizer.h + 
impl/ResidualQuantizer.h ++ impl/ResultCollector.h ++ impl/ResultCollectorFactory.h + impl/ResultHandler.h + impl/ScalarQuantizer.h + impl/ThreadedIndex-inl.h +diff --git a/faiss/Index.h b/faiss/Index.h +index 4b4b302b..13eab0c0 100644 +--- a/faiss/Index.h ++++ b/faiss/Index.h +@@ -38,11 +38,12 @@ + + namespace faiss { + +-/// Forward declarations see impl/AuxIndexStructures.h, impl/IDSelector.h and +-/// impl/DistanceComputer.h ++/// Forward declarations see impl/AuxIndexStructures.h, impl/IDSelector.h, ++/// impl/DistanceComputer.h, and impl/ResultCollectorFactory.h + struct IDSelector; + struct RangeSearchResult; + struct DistanceComputer; ++struct ResultCollectorFactory; + + /** Parent class for the optional search paramenters. + * +@@ -52,6 +53,7 @@ struct DistanceComputer; + struct SearchParameters { + /// if non-null, only these IDs will be considered during search. + IDSelector* sel = nullptr; ++ ResultCollectorFactory* col = nullptr; + /// make sure we can dynamic_cast this + virtual ~SearchParameters() {} + }; +diff --git a/faiss/IndexIDMap.cpp b/faiss/IndexIDMap.cpp +index 7972bec9..7fe200c4 100644 +--- a/faiss/IndexIDMap.cpp ++++ b/faiss/IndexIDMap.cpp +@@ -102,6 +102,24 @@ struct ScopedSelChange { + } + }; + ++// RAII object to reset the id_map parameter in ResultCollectorFactory object ++// This object makes sure to reset the id_map parameter in ResultCollectorFactory ++// once the program exits the current method scope. 
++struct ScopedColChange { ++ ResultCollectorFactory* collector_factory = nullptr; ++ void set( ++ ResultCollectorFactory* collector_factory, ++ const std::vector* id_map) { ++ this->collector_factory = collector_factory; ++ collector_factory->id_map = id_map; ++ } ++ ~ScopedColChange() { ++ if (collector_factory) { ++ collector_factory->id_map = nullptr; ++ } ++ } ++}; ++ + } // namespace + + template +@@ -114,6 +132,7 @@ void IndexIDMapTemplate::search( + const SearchParameters* params) const { + IDSelectorTranslated this_idtrans(this->id_map, nullptr); + ScopedSelChange sel_change; ++ ScopedColChange col_change; + + if (params && params->sel) { + auto idtrans = dynamic_cast(params->sel); +@@ -131,6 +150,10 @@ void IndexIDMapTemplate::search( + sel_change.set(params_non_const, &this_idtrans); + } + } ++ ++ if (params && params->col && !params->col->id_map) { ++ col_change.set(params->col, &this->id_map); ++ } + index->search(n, x, k, distances, labels, params); + idx_t* li = labels; + #pragma omp parallel for +diff --git a/faiss/IndexIDMap.h b/faiss/IndexIDMap.h +index 2d164123..c6a1be73 100644 +--- a/faiss/IndexIDMap.h ++++ b/faiss/IndexIDMap.h +@@ -10,6 +10,7 @@ + #include + #include + #include ++#include + + #include + #include +diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp +index 9fc201ea..1ebdd12a 100644 +--- a/faiss/impl/HNSW.cpp ++++ b/faiss/impl/HNSW.cpp +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -111,8 +112,8 @@ void HNSW::print_neighbor_stats(int level) const { + level, + nb_neighbors(level)); + size_t tot_neigh = 0, tot_common = 0, tot_reciprocal = 0, n_node = 0; +-#pragma omp parallel for reduction(+: tot_neigh) reduction(+: tot_common) \ +- reduction(+: tot_reciprocal) reduction(+: n_node) ++#pragma omp parallel for reduction(+ : tot_neigh) reduction(+ : tot_common) \ ++ reduction(+ : tot_reciprocal) reduction(+ : n_node) + for (int i = 0; i < levels.size(); i++) { + if (levels[i] > level) { + 
n_node++; +@@ -530,6 +531,15 @@ int search_from_candidates( + int level, + int nres_in = 0, + const SearchParametersHNSW* params = nullptr) { ++ ResultCollectorFactory defaultFactory; ++ ResultCollectorFactory* collectorFactory; ++ if (params == nullptr || params->col == nullptr) { ++ collectorFactory = &defaultFactory; ++ } else { ++ collectorFactory = params->col; ++ } ++ ResultCollector* collector = collectorFactory->newCollector(); ++ + int nres = nres_in; + int ndis = 0; + +@@ -544,11 +554,7 @@ int search_from_candidates( + float d = candidates.dis[i]; + FAISS_ASSERT(v1 >= 0); + if (!sel || sel->is_member(v1)) { +- if (nres < k) { +- faiss::maxheap_push(++nres, D, I, d, v1); +- } else if (d < D[0]) { +- faiss::maxheap_replace_top(nres, D, I, d, v1); +- } ++ collector->collect(k, nres, D, I, d, v1); + } + vt.set(v1); + } +@@ -612,11 +618,7 @@ int search_from_candidates( + + auto add_to_heap = [&](const size_t idx, const float dis) { + if (!sel || sel->is_member(idx)) { +- if (nres < k) { +- faiss::maxheap_push(++nres, D, I, dis, idx); +- } else if (dis < D[0]) { +- faiss::maxheap_replace_top(nres, D, I, dis, idx); +- } ++ collector->collect(k, nres, D, I, dis, idx); + } + candidates.push(idx, dis); + }; +@@ -660,6 +662,11 @@ int search_from_candidates( + } + } + ++ // Completed collection of result. Run post processor. ++ collector->post_process(nres, I); ++ // Collector completed its task. Release all resource of the collector. ++ collectorFactory->deleteCollector(collector); ++ + if (level == 0) { + stats.n1++; + if (candidates.size() == 0) { +diff --git a/faiss/impl/ResultCollector.h b/faiss/impl/ResultCollector.h +new file mode 100644 +index 00000000..d4aa29f7 +--- /dev/null ++++ b/faiss/impl/ResultCollector.h +@@ -0,0 +1,62 @@ ++/** ++ * Copyright (c) Facebook, Inc. and its affiliates. ++ * ++ * This source code is licensed under the MIT license found in the ++ * LICENSE file in the root directory of this source tree. 
++ */ ++ ++#pragma once ++ ++#include ++#include ++ ++#include ++#include ++ ++/** ResultCollector defines how search results are collected */ ++ ++namespace faiss { ++ ++/** Interface for storing each search result and post-processing the ids. */ ++struct ResultCollector { ++ // Called once for each result to store it ++ virtual void collect( ++ int k, ++ int& nres, ++ float* bh_val, ++ idx_t* bh_ids, ++ float val, ++ idx_t ids) = 0; ++ ++ // Called after all results have been collected ++ virtual void post_process(idx_t nres, idx_t* bh_ids) = 0; ++ virtual ~ResultCollector() {} ++}; ++ ++struct DefaultCollector : ResultCollector { ++ void collect( ++ int k, ++ int& nres, ++ float* bh_val, ++ idx_t* bh_ids, ++ float val, ++ idx_t ids) override { ++ if (nres < k) { ++ faiss::maxheap_push(++nres, bh_val, bh_ids, val, ids); ++ } else if (val < bh_val[0]) { ++ faiss::maxheap_replace_top(nres, bh_val, bh_ids, val, ids); ++ } ++ } ++ ++ // This method is called once all results are collected so that final post ++ // processing can be done. For example, if the results are collected using ++ // group id, the group id can be converted back to its original id inside ++ // this method ++ void post_process(idx_t nres, idx_t* bh_ids) override { ++ // Do nothing ++ } ++ ++ ~DefaultCollector() override {} ++}; ++ ++} // namespace faiss +diff --git a/faiss/impl/ResultCollectorFactory.h b/faiss/impl/ResultCollectorFactory.h +new file mode 100644 +index 00000000..d9a667ac +--- /dev/null ++++ b/faiss/impl/ResultCollectorFactory.h +@@ -0,0 +1,33 @@ ++/** ++ * Copyright (c) Facebook, Inc. and its affiliates. ++ * ++ * This source code is licensed under the MIT license found in the ++ * LICENSE file in the root directory of this source tree. 
++ */ ++ ++#pragma once ++#include ++namespace faiss { ++ ++/** Factory that hands out the ResultCollector used by a search call */ ++struct ResultCollectorFactory { ++ DefaultCollector default_collector; ++ const std::vector* id_map = nullptr; // null until a caller sets it ++ ++ // Return the collector to use; by default the shared default_collector ++ virtual ResultCollector* newCollector() { ++ return &default_collector; ++ } ++ ++ // In the default case the factory shares a single object, so there is ++ // nothing to delete. In other cases the factory can create a new object ++ // that must be released later. deleteCollector handles both cases, as ++ // the factory class knows how to release the resources that it created ++ virtual void deleteCollector(ResultCollector* collector) { ++ // Do nothing ++ } ++ ++ virtual ~ResultCollectorFactory() {} ++}; ++ ++} // namespace faiss +-- +2.39.3 (Apple Git-145) +