Skip to content

Commit

Permalink
Add patch to support multi vector in faiss (opensearch-project#1358)
Browse files Browse the repository at this point in the history
Signed-off-by: Heemin Kim <[email protected]>
  • Loading branch information
heemin32 committed Jan 16, 2024
1 parent 45282e0 commit 04a46c4
Show file tree
Hide file tree
Showing 4 changed files with 331 additions and 2 deletions.
9 changes: 9 additions & 0 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,15 @@ jobs:
with:
submodules: true

# Git functionality in CMAKE file does not work with given ubuntu image. Therefore, handling it here.
- name: Apply Git Patch
# Deleting file at the end to skip `git apply` inside CMAKE file
run: |
cd jni/external/faiss
git apply --ignore-space-change --ignore-whitespace --3way ../../patches/faiss/0001-Custom-patch-to-support-multi-vector.patch
rm ../../patches/faiss/0001-Custom-patch-to-support-multi-vector.patch
working-directory: ${{ github.workspace }}

- name: Setup Java ${{ matrix.java }}
uses: actions/setup-java@v1
with:
Expand Down
7 changes: 7 additions & 0 deletions DEVELOPER_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,13 @@ For users that want to get the most out of the libraries, they should follow [th
and build the libraries from source in their production environment, so that if their environment has optimized
instruction sets, they take advantage of them.

### Custom patch on JNI Library
If you want to make a custom patch on JNI library
1. Make a change on top of current version of JNI library and push the commit locally.
2. Create a patch file for the change using `git format-patch -o patches HEAD^`
3. Place the patch file under `jni/patches`
4. Make a change in `jni/CmakeLists.txt`, `.github/workflows/CI.yml` to apply the patch during build

## Run OpenSearch k-NN

### Run Single-node Cluster Locally
Expand Down
16 changes: 14 additions & 2 deletions jni/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ list(APPEND TARGET_LIBS ${TARGET_LIB_COMMON})
# ---------------------------------- NMSLIB ----------------------------------
if (${CONFIG_NMSLIB} STREQUAL ON OR ${CONFIG_ALL} STREQUAL ON OR ${CONFIG_TEST} STREQUAL ON)
# Check if nmslib exists
find_path(NMS_REPO_DIR NAMES similarity_search PATHS ${CMAKE_CURRENT_SOURCE_DIR}/external/nmslib)
find_path(NMS_REPO_DIR NAMES similarity_search PATHS ${CMAKE_CURRENT_SOURCE_DIR}/external/nmslib NO_DEFAULT_PATH)

# If not, pull the updated submodule
if (NOT EXISTS ${NMS_REPO_DIR})
Expand Down Expand Up @@ -134,14 +134,26 @@ if (${CONFIG_FAISS} STREQUAL ON OR ${CONFIG_ALL} STREQUAL ON OR ${CONFIG_TEST} S
find_package(LAPACK REQUIRED)

# Check if faiss exists
find_path(FAISS_REPO_DIR NAMES faiss PATHS ${CMAKE_CURRENT_SOURCE_DIR}/external/faiss)
find_path(FAISS_REPO_DIR NAMES faiss PATHS ${CMAKE_CURRENT_SOURCE_DIR}/external/faiss NO_DEFAULT_PATH)

# If not, pull the updated submodule
if (NOT EXISTS ${FAISS_REPO_DIR})
message(STATUS "Could not find faiss. Pulling updated submodule.")
execute_process(COMMAND git submodule update --init -- external/faiss WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif ()

# Check if patch exist, this is to skip git apply during CI build. See CI.yml with ubuntu.
find_path(PATCH_FILE NAMES 0001-Custom-patch-to-support-multi-vector.patch PATHS ${CMAKE_CURRENT_SOURCE_DIR}/patches/faiss NO_DEFAULT_PATH)

# If it exists, apply patches
if (EXISTS ${PATCH_FILE})
message(STATUS "Applying custom patches.")
execute_process(COMMAND git apply --ignore-space-change --ignore-whitespace --3way ${CMAKE_CURRENT_SOURCE_DIR}/patches/faiss/0001-Custom-patch-to-support-multi-vector.patch WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/external/faiss ERROR_VARIABLE ERROR_MSG RESULT_VARIABLE RESULT_CODE)
if(RESULT_CODE)
message(FATAL_ERROR "Failed to apply patch:\n${ERROR_MSG}")
endif()
endif()

set(FAISS_ENABLE_GPU OFF)
set(FAISS_ENABLE_PYTHON OFF)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/external/faiss EXCLUDE_FROM_ALL)
Expand Down
301 changes: 301 additions & 0 deletions jni/patches/faiss/0001-Custom-patch-to-support-multi-vector.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,301 @@
From 70c9b5d6408bb85fd1503002d29f9428fe0d48dc Mon Sep 17 00:00:00 2001
From: Heemin Kim <[email protected]>
Date: Wed, 6 Dec 2023 16:33:52 -0800
Subject: [PATCH] Custom patch to support multi-vector

Signed-off-by: Heemin Kim <[email protected]>
---
faiss/CMakeLists.txt | 2 +
faiss/Index.h | 6 ++-
faiss/IndexIDMap.cpp | 24 ++++++++++
faiss/impl/HNSW.cpp | 27 +++++++----
faiss/impl/ResultCollector.h | 74 +++++++++++++++++++++++++++++
faiss/impl/ResultCollectorFactory.h | 33 +++++++++++++
6 files changed, 154 insertions(+), 12 deletions(-)
create mode 100644 faiss/impl/ResultCollector.h
create mode 100644 faiss/impl/ResultCollectorFactory.h

diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt
index 27701586..af682a05 100644
--- a/faiss/CMakeLists.txt
+++ b/faiss/CMakeLists.txt
@@ -162,6 +162,8 @@ set(FAISS_HEADERS
impl/ProductQuantizer.h
impl/Quantizer.h
impl/ResidualQuantizer.h
+ impl/ResultCollector.h
+ impl/ResultCollectorFactory.h
impl/ResultHandler.h
impl/ScalarQuantizer.h
impl/ThreadedIndex-inl.h
diff --git a/faiss/Index.h b/faiss/Index.h
index 4b4b302b..13eab0c0 100644
--- a/faiss/Index.h
+++ b/faiss/Index.h
@@ -38,11 +38,12 @@

namespace faiss {

-/// Forward declarations see impl/AuxIndexStructures.h, impl/IDSelector.h and
-/// impl/DistanceComputer.h
+/// Forward declarations see impl/AuxIndexStructures.h, impl/IDSelector.h,
+/// impl/DistanceComputer.h, and impl/ResultCollectorFactory.h
struct IDSelector;
struct RangeSearchResult;
struct DistanceComputer;
+struct ResultCollectorFactory;

/** Parent class for the optional search paramenters.
*
@@ -52,6 +53,7 @@ struct DistanceComputer;
struct SearchParameters {
/// if non-null, only these IDs will be considered during search.
IDSelector* sel = nullptr;
+ ResultCollectorFactory* col = nullptr;
/// make sure we can dynamic_cast this
virtual ~SearchParameters() {}
};
diff --git a/faiss/IndexIDMap.cpp b/faiss/IndexIDMap.cpp
index 7972bec9..0f82a17c 100644
--- a/faiss/IndexIDMap.cpp
+++ b/faiss/IndexIDMap.cpp
@@ -18,6 +18,7 @@
#include <faiss/impl/FaissAssert.h>
#include <faiss/utils/Heap.h>
#include <faiss/utils/WorkerThread.h>
+#include <faiss/impl/ResultCollectorFactory.h>

namespace faiss {

@@ -102,6 +103,24 @@ struct ScopedSelChange {
}
};

+// RAII object to reset the id_map parameter in ResultCollectorFactory object
+// This object make sure to reset the id_map parameter in ResultCollectorFactory
+// once the program exist current method scope.
+struct ScopedColChange {
+ ResultCollectorFactory* collector_factory = nullptr;
+ void set(
+ ResultCollectorFactory* collector_factory,
+ const std::vector<int64_t>* id_map) {
+ this->collector_factory = collector_factory;
+ collector_factory->id_map = id_map;
+ }
+ ~ScopedColChange() {
+ if (collector_factory) {
+ collector_factory->id_map = nullptr;
+ }
+ }
+};
+
} // namespace

template <typename IndexT>
@@ -114,6 +133,7 @@ void IndexIDMapTemplate<IndexT>::search(
const SearchParameters* params) const {
IDSelectorTranslated this_idtrans(this->id_map, nullptr);
ScopedSelChange sel_change;
+ ScopedColChange col_change;

if (params && params->sel) {
auto idtrans = dynamic_cast<const IDSelectorTranslated*>(params->sel);
@@ -131,6 +151,10 @@ void IndexIDMapTemplate<IndexT>::search(
sel_change.set(params_non_const, &this_idtrans);
}
}
+
+ if (params && params->col && !params->col->id_map) {
+ col_change.set(params->col, &this->id_map);
+ }
index->search(n, x, k, distances, labels, params);
idx_t* li = labels;
#pragma omp parallel for
diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp
index 9fc201ea..5b5900d1 100644
--- a/faiss/impl/HNSW.cpp
+++ b/faiss/impl/HNSW.cpp
@@ -14,6 +14,7 @@
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/DistanceComputer.h>
#include <faiss/impl/IDSelector.h>
+#include <faiss/impl/ResultCollectorFactory.h>
#include <faiss/utils/prefetch.h>

#include <faiss/impl/platform_macros.h>
@@ -530,6 +531,15 @@ int search_from_candidates(
int level,
int nres_in = 0,
const SearchParametersHNSW* params = nullptr) {
+ ResultCollectorFactory defaultFactory;
+ ResultCollectorFactory* collectorFactory;
+ if (params == nullptr || params->col == nullptr) {
+ collectorFactory = &defaultFactory;
+ } else {
+ collectorFactory = params->col;
+ }
+ ResultCollector* collector = collectorFactory->new_collector();
+
int nres = nres_in;
int ndis = 0;

@@ -544,11 +554,7 @@ int search_from_candidates(
float d = candidates.dis[i];
FAISS_ASSERT(v1 >= 0);
if (!sel || sel->is_member(v1)) {
- if (nres < k) {
- faiss::maxheap_push(++nres, D, I, d, v1);
- } else if (d < D[0]) {
- faiss::maxheap_replace_top(nres, D, I, d, v1);
- }
+ collector->collect(k, nres, D, I, d, v1);
}
vt.set(v1);
}
@@ -612,11 +618,7 @@ int search_from_candidates(

auto add_to_heap = [&](const size_t idx, const float dis) {
if (!sel || sel->is_member(idx)) {
- if (nres < k) {
- faiss::maxheap_push(++nres, D, I, dis, idx);
- } else if (dis < D[0]) {
- faiss::maxheap_replace_top(nres, D, I, dis, idx);
- }
+ collector->collect(k, nres, D, I, dis, idx);
}
candidates.push(idx, dis);
};
@@ -660,6 +662,11 @@ int search_from_candidates(
}
}

+ // Completed collection of result. Run post processor.
+ collector->post_process(nres, I);
+ // Collector completed its task. Release all resource of the collector.
+ collectorFactory->delete_collector(collector);
+
if (level == 0) {
stats.n1++;
if (candidates.size() == 0) {
diff --git a/faiss/impl/ResultCollector.h b/faiss/impl/ResultCollector.h
new file mode 100644
index 00000000..a0489fd6
--- /dev/null
+++ b/faiss/impl/ResultCollector.h
@@ -0,0 +1,74 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <unordered_set>
+#include <vector>
+
+#include <faiss/MetricType.h>
+#include <faiss/utils/Heap.h>
+
+/**
+ * ResultCollector is intended to define how to collect search result
+ * For each single search result, collect method will be called.
+ * After every results are collected, post_process method is called at the end.
+ */
+
+namespace faiss {
+
+/** Encapsulates a set of ids to handle. */
+struct ResultCollector {
+ /**
+ * For each result, collect method is called to store result
+ * @param k number of vectors to search
+ * @param nres number of results in queue
+ * @param bh_val search result, distances from query
+ * @param bh_ids search result, ids of vectors
+ * @param val distance from query for current vector
+ * @param ids id of current vector
+ */
+ virtual void collect(
+ int k,
+ int& nres,
+ float* bh_val,
+ idx_t* bh_ids,
+ float val,
+ idx_t ids) = 0;
+
+ // This method is called after all result is collected
+ virtual void post_process(idx_t nres, idx_t* bh_ids) = 0;
+ virtual ~ResultCollector() {}
+};
+
+struct DefaultCollector : ResultCollector {
+ void collect(
+ int k,
+ int& nres,
+ float* bh_val,
+ idx_t* bh_ids,
+ float val,
+ idx_t ids) override {
+ if (nres < k) {
+ faiss::maxheap_push(++nres, bh_val, bh_ids, val, ids);
+ } else if (val < bh_val[0]) {
+ faiss::maxheap_replace_top(nres, bh_val, bh_ids, val, ids);
+ }
+ }
+
+ // This method is called once all result is collected so that final post
+ // processing can be done For example, if the result is collected using
+ // group id, the group id can be converted back to its original id inside
+ // this method
+ void post_process(idx_t nres, idx_t* bh_ids) override {
+ // Do nothing
+ }
+
+ ~DefaultCollector() override {}
+};
+
+} // namespace faiss
diff --git a/faiss/impl/ResultCollectorFactory.h b/faiss/impl/ResultCollectorFactory.h
new file mode 100644
index 00000000..70918266
--- /dev/null
+++ b/faiss/impl/ResultCollectorFactory.h
@@ -0,0 +1,33 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+#include <faiss/impl/ResultCollector.h>
+namespace faiss {
+
+/** ResultCollectorFactory to create a ResultCollector object */
+struct ResultCollectorFactory {
+ DefaultCollector default_collector;
+ const std::vector<int64_t>* id_map;
+
+ // Create a new ResultCollector object
+ virtual ResultCollector* new_collector() {
+ return &default_collector;
+ }
+
+ // For default case, the factory share single object and no need to delete
+ // the object. For other case, the factory can create a new object which
+ // need to be deleted later. We have deleteCollector method to handle both
+ // case as factory class knows how to release resource that it created
+ virtual void delete_collector(ResultCollector* collector) {
+ // Do nothing
+ }
+
+ virtual ~ResultCollectorFactory() {}
+};
+
+} // namespace faiss
--
2.39.3 (Apple Git-145)

0 comments on commit 04a46c4

Please sign in to comment.