From 44445deb24bbf16f1a676a25dcab77d1f8e68e64 Mon Sep 17 00:00:00 2001
From: Dax Pryce
Date: Wed, 26 Jul 2023 14:58:39 -0700
Subject: [PATCH] DynamicMemoryIndex bug fixes (#404)

* While simply creating a unit test to repro Issue #400, I found a number of
  bugs that I needed to address just to get it to work the way I had intended.
  This does not yet have what I would consider a comprehensive suite of test
  coverage for the DynamicMemoryIndex, but we at least save the index with its
  metadata file, we can load it correctly, and save() now *always* runs
  consolidate_delete() first if any items have been marked for deletion.

* We cannot actually save without compacting first anyway. Removing the
  compact_before_save parameter from save() and hardcoding it to True until we
  can actually support uncompacted saves.

* Addressing some PR comments and readying a 0.5.0.rc5 release
---
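Reviewer note: the sketch below is illustrative only and is not part of the patch.
It assumes `ann_dir` is a placeholder for a directory that already contains a saved
diskannpy index of 10-dimensional float32 vectors, and the tag values and load
parameters simply mirror the unit test added in this change. The point it shows is
that save() now consolidates any pending deletes and writes the metadata file
itself, so from_file() can reload the result without a manual consolidate_delete()
call beforehand.

    import tempfile

    import numpy as np
    import diskannpy as dap

    # `ann_dir` is a placeholder for a directory containing an existing saved index
    index = dap.DynamicMemoryIndex.from_file(
        index_directory=ann_dir,
        num_threads=16,
        initial_search_complexity=32,
        max_vectors=10100,
        complexity=64,
        graph_degree=32,
    )

    index.insert(np.array([1.0] * 10, dtype=np.single), 10099)  # vector + tag
    index.mark_deleted(10099)  # lazy delete: the tag is only marked for removal

    with tempfile.TemporaryDirectory() as save_dir:
        # save() warns, runs consolidate_delete() automatically, then writes the
        # index files (prefix "ann") plus the metadata file used by from_file()
        index.save(save_dir, index_prefix="ann")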
 pyproject.toml                            |  2 +-
 python/include/dynamic_memory_index.h     |  2 +
 python/src/_dynamic_memory_index.py       | 28 ++++++++++++--
 python/src/dynamic_memory_index.cpp       |  5 +++
 python/src/module.cpp                     |  3 +-
 python/tests/test_dynamic_memory_index.py | 45 ++++++++++++++++++++++-
 6 files changed, 79 insertions(+), 6 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index bc0d1666d..55e8a465a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "diskannpy"
-version = "0.5.0.rc4"
+version = "0.5.0.rc5"
 description = "DiskANN Python extension module"
 readme = "python/README.md"
diff --git a/python/include/dynamic_memory_index.h b/python/include/dynamic_memory_index.h
index 9bfbbec2d..02d6b8cce 100644
--- a/python/include/dynamic_memory_index.h
+++ b/python/include/dynamic_memory_index.h
@@ -41,6 +41,8 @@ class DynamicMemoryIndex
                       uint64_t num_queries, uint64_t knn, uint64_t complexity, uint32_t num_threads);
     void consolidate_delete();
 
+    size_t num_points();
+
   private:
     const uint32_t _initial_search_complexity;
diff --git a/python/src/_dynamic_memory_index.py b/python/src/_dynamic_memory_index.py
index a25587475..dc7fd2978 100644
--- a/python/src/_dynamic_memory_index.py
+++ b/python/src/_dynamic_memory_index.py
@@ -1,6 +1,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT license.
 
+import os
 import warnings
 
 import numpy as np
@@ -21,12 +22,14 @@
     _assert,
     _assert_2d,
     _assert_dtype,
+    _assert_existing_directory,
     _assert_is_nonnegative_uint32,
     _assert_is_positive_uint32,
     _castable_dtype_or_raise,
     _ensure_index_metadata,
     _valid_metric,
     _valid_index_prefix,
+    _write_index_metadata
 )
 from ._diskannpy import defaults
@@ -158,6 +161,7 @@ def __init__(
         """
         dap_metric = _valid_metric(distance_metric)
+        self._dap_metric = dap_metric
         _assert_dtype(vector_dtype)
         _assert_is_positive_uint32(dimensions, "dimensions")
@@ -199,6 +203,7 @@ def __init__(
             search_threads=search_threads,
             concurrent_consolidation=concurrent_consolidation
         )
+        self._points_deleted = False
 
     def search(
         self, query: VectorLike, k_neighbors: int, complexity: int
@@ -293,16 +298,31 @@ def batch_search(
             num_threads=num_threads,
         )
 
-    def save(self, save_path: str, compact_before_save: bool = True):
+    def save(self, save_path: str, index_prefix: str = "ann"):
         """
         Saves this index to file.
 
         :param save_path: The path to save these index files to.
         :type save_path: str
-        :param compact_before_save:
+        :param index_prefix: The prefix to use for the index files. Default is "ann".
+        :type index_prefix: str
         """
         if save_path == "":
             raise ValueError("save_path cannot be empty")
-        self._index.save(save_path=save_path, compact_before_save=compact_before_save)
+        if index_prefix == "":
+            raise ValueError("index_prefix cannot be empty")
+        _assert_existing_directory(save_path, "save_path")
+        save_path = os.path.join(save_path, index_prefix)
+        if self._points_deleted is True:
+            warnings.warn(
+                "DynamicMemoryIndex.save() currently requires DynamicMemoryIndex.consolidate_delete() to be called "
+                "prior to save when items have been marked for deletion. This is being done automatically now, though "
+                "it will increase the time it takes to save; on large sets of data it can take a substantial amount of "
+                "time. In the future, we will implement a faster save with unconsolidated deletes, but for now this is "
+                "required."
+            )
+            self._index.consolidate_delete()
+        self._index.save(save_path=save_path, compact_before_save=True)  # we do not yet support uncompacted saves
+        _write_index_metadata(save_path, self._vector_dtype, self._dap_metric, self._index.num_points(), self._dimensions)
 
     def insert(self, vector: VectorLike, vector_id: VectorIdentifier):
         """
@@ -349,6 +369,7 @@ def mark_deleted(self, vector_id: VectorIdentifier):
         :type vector_id: int
         """
         _assert_is_positive_uint32(vector_id, "vector_id")
+        self._points_deleted = True
         self._index.mark_deleted(np.uintc(vector_id))
 
     def consolidate_delete(self):
@@ -356,3 +377,4 @@
         This method actually restructures the DiskANN index to remove the items that have been marked for deletion.
         """
         self._index.consolidate_delete()
+        self._points_deleted = False
diff --git a/python/src/dynamic_memory_index.cpp b/python/src/dynamic_memory_index.cpp
index 98cf5b628..af276b85f 100644
--- a/python/src/dynamic_memory_index.cpp
+++ b/python/src/dynamic_memory_index.cpp
@@ -159,6 +159,11 @@ template <typename DT> void DynamicMemoryIndex<DT>::consolidate_delete()
     _index.consolidate_deletes(_write_parameters);
 }
 
+template <typename DT> size_t DynamicMemoryIndex<DT>::num_points()
+{
+    return _index.get_num_points();
+}
+
 template class DynamicMemoryIndex<float>;
 template class DynamicMemoryIndex<uint8_t>;
 template class DynamicMemoryIndex<int8_t>;
diff --git a/python/src/module.cpp b/python/src/module.cpp
index 9f7337002..7aea9fc03 100644
--- a/python/src/module.cpp
+++ b/python/src/module.cpp
@@ -79,7 +79,8 @@ template <typename T> inline void add_variant(py::module_ &m, const Variant &var
         .def("save", &diskannpy::DynamicMemoryIndex<T>::save, "save_path"_a = "", "compact_before_save"_a = false)
         .def("insert", &diskannpy::DynamicMemoryIndex<T>::insert, "vector"_a, "id"_a)
         .def("mark_deleted", &diskannpy::DynamicMemoryIndex<T>::mark_deleted, "id"_a)
-        .def("consolidate_delete", &diskannpy::DynamicMemoryIndex<T>::consolidate_delete);
+        .def("consolidate_delete", &diskannpy::DynamicMemoryIndex<T>::consolidate_delete)
+        .def("num_points", &diskannpy::DynamicMemoryIndex<T>::num_points);
 
     py::class_<diskannpy::StaticDiskIndex<T>>(m, variant.static_disk_index_name.c_str())
         .def(py::init(),
diff --git a/python/tests/test_dynamic_memory_index.py b/python/tests/test_dynamic_memory_index.py
index d555e1234..ec3737702 100644
--- a/python/tests/test_dynamic_memory_index.py
+++ b/python/tests/test_dynamic_memory_index.py
@@ -1,8 +1,8 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT license.
 
-import os
 import shutil
+import tempfile
 import unittest
 
 import diskannpy as dap
@@ -296,3 +296,46 @@ def test_value_ranges_batch_search(self):
             index.batch_search(
                 queries=np.array([[]], dtype=np.single), **kwargs
             )
+
+    # Issue #400
+    def test_issue400(self):
+        _, _, _, index_vectors, ann_dir, _, generated_tags = self._test_matrix[0]
+
+        deletion_tag = generated_tags[10]  # arbitrary choice
+        deletion_vector = index_vectors[10]
+
+        index = dap.DynamicMemoryIndex.from_file(
+            index_directory=ann_dir,
+            num_threads=16,
+            initial_search_complexity=32,
+            max_vectors=10100,
+            complexity=64,
+            graph_degree=32
+        )
+        index.insert(np.array([1.0] * 10, dtype=np.single), 10099)
+        index.insert(np.array([2.0] * 10, dtype=np.single), 10050)
+        index.insert(np.array([3.0] * 10, dtype=np.single), 10053)
+        tags, distances = index.search(np.array([3.0] * 10, dtype=np.single), k_neighbors=5, complexity=64)
+        self.assertIn(10053, tags)
+        tags, distances = index.search(deletion_vector, k_neighbors=5, complexity=64)
+        self.assertIn(deletion_tag, tags, "deletion_tag should exist, as we have not deleted yet")
+        index.mark_deleted(deletion_tag)
+        tags, distances = index.search(deletion_vector, k_neighbors=5, complexity=64)
+        self.assertNotIn(deletion_tag, tags, "deletion_tag should not exist, as we have marked it for deletion")
+        with tempfile.TemporaryDirectory() as tmpdir:
+            index.save(tmpdir)
+
+            index2 = dap.DynamicMemoryIndex.from_file(
+                index_directory=tmpdir,
+                num_threads=16,
+                initial_search_complexity=32,
+                max_vectors=10100,
+                complexity=64,
+                graph_degree=32
+            )
+            tags, distances = index2.search(deletion_vector, k_neighbors=5, complexity=64)
+            self.assertNotIn(
+                deletion_tag,
+                tags,
+                "deletion_tag should not exist, as we saved and reloaded the index without it"
+            )