From 755646d5a6c09ecb0273710a69366b0fcde44db0 Mon Sep 17 00:00:00 2001 From: Ben Frederickson <4302519+difyrrwrzd@users.noreply.github.com> Date: Thu, 30 May 2024 08:36:30 -0700 Subject: [PATCH] Expose serialization to the python / c-api (#164) Authors: - Ben Frederickson (https://github.com/benfred) - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/cuvs/pull/164 --- cpp/include/cuvs/neighbors/cagra.h | 45 +++++++++++ cpp/src/neighbors/cagra_c.cpp | 76 ++++++++++++++++-- docs/source/working_with_ann_indexes_c.rst | 4 - docs/source/working_with_ann_indexes_cpp.rst | 6 -- .../working_with_ann_indexes_python.rst | 5 -- docs/source/working_with_ann_indexes_rust.rst | 8 -- python/cuvs/cuvs/neighbors/cagra/__init__.py | 4 + python/cuvs/cuvs/neighbors/cagra/cagra.pxd | 10 +++ python/cuvs/cuvs/neighbors/cagra/cagra.pyx | 80 ++++++++++++++++++- python/cuvs/cuvs/test/test_cagra.py | 50 ++++++++++++ 10 files changed, 258 insertions(+), 30 deletions(-) diff --git a/cpp/include/cuvs/neighbors/cagra.h b/cpp/include/cuvs/neighbors/cagra.h index 727c39c..c821957 100644 --- a/cpp/include/cuvs/neighbors/cagra.h +++ b/cpp/include/cuvs/neighbors/cagra.h @@ -18,6 +18,7 @@ #include #include +#include #include #ifdef __cplusplus @@ -382,6 +383,50 @@ cuvsError_t cuvsCagraSearch(cuvsResources_t res, * @} */ +/** + * @defgroup cagra_c_serialize CAGRA C-API serialize functions + * @{ + */ +/** + * Save the index to file. + * + * Experimental, both the API and the serialization format are subject to change. 
+ * + * @code{.cpp} + * #include <cuvs/neighbors/cagra.h> + * + * // Create cuvsResources_t + * cuvsResources_t res; + * cuvsError_t res_create_status = cuvsResourcesCreate(&res); + * + * // create an index with `cuvsCagraBuild` + * cuvsCagraSerialize(res, "/path/to/index", index, true); + * @endcode + * + * @param[in] res cuvsResources_t opaque C handle + * @param[in] filename the file name for saving the index + * @param[in] index CAGRA index + * @param[in] include_dataset Whether or not to write out the dataset to the file. + * + */ +cuvsError_t cuvsCagraSerialize(cuvsResources_t res, + const char* filename, + cuvsCagraIndex_t index, + bool include_dataset); + +/** + * Load index from file. + * + * Experimental, both the API and the serialization format are subject to change. + * + * @param[in] res cuvsResources_t opaque C handle + * @param[in] filename the name of the file that stores the index + * @param[out] index CAGRA index loaded from disk + */ +cuvsError_t cuvsCagraDeserialize(cuvsResources_t res, const char* filename, cuvsCagraIndex_t index); +/** + * @} + */ #ifdef __cplusplus } #endif diff --git a/cpp/src/neighbors/cagra_c.cpp b/cpp/src/neighbors/cagra_c.cpp index cadda19..630a505 100644 --- a/cpp/src/neighbors/cagra_c.cpp +++ b/cpp/src/neighbors/cagra_c.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -104,6 +105,27 @@ void _search(cuvsResources_t res, *res_ptr, search_params, *index_ptr, queries_mds, neighbors_mds, distances_mds); } +template +void _serialize(cuvsResources_t res, + const char* filename, + cuvsCagraIndex_t index, + bool include_dataset) +{ + auto res_ptr = reinterpret_cast(res); + auto index_ptr = reinterpret_cast*>(index->addr); + cuvs::neighbors::cagra::serialize_file( + *res_ptr, std::string(filename), *index_ptr, include_dataset); +} + +template +void* _deserialize(cuvsResources_t res, const char* filename) +{ + auto res_ptr = reinterpret_cast(res); + auto index = new cuvs::neighbors::cagra::index(*res_ptr); + 
cuvs::neighbors::cagra::deserialize_file(*res_ptr, std::string(filename), index); + return index; +} + } // namespace extern "C" cuvsError_t cuvsCagraIndexCreate(cuvsCagraIndex_t* index) @@ -140,15 +162,13 @@ extern "C" cuvsError_t cuvsCagraBuild(cuvsResources_t res, { return cuvs::core::translate_exceptions([=] { auto dataset = dataset_tensor->dl_tensor; + index->dtype = dataset.dtype; if (dataset.dtype.code == kDLFloat && dataset.dtype.bits == 32) { - index->addr = reinterpret_cast(_build(res, *params, dataset_tensor)); - index->dtype.code = kDLFloat; + index->addr = reinterpret_cast(_build(res, *params, dataset_tensor)); } else if (dataset.dtype.code == kDLInt && dataset.dtype.bits == 8) { - index->addr = reinterpret_cast(_build(res, *params, dataset_tensor)); - index->dtype.code = kDLInt; + index->addr = reinterpret_cast(_build(res, *params, dataset_tensor)); } else if (dataset.dtype.code == kDLUInt && dataset.dtype.bits == 8) { index->addr = reinterpret_cast(_build(res, *params, dataset_tensor)); - index->dtype.code = kDLUInt; } else { RAFT_FAIL("Unsupported dataset DLtensor dtype: %d and bits: %d", dataset.dtype.code, @@ -247,3 +267,49 @@ extern "C" cuvsError_t cuvsCagraSearchParamsDestroy(cuvsCagraSearchParams_t para { return cuvs::core::translate_exceptions([=] { delete params; }); } + +extern "C" cuvsError_t cuvsCagraDeserialize(cuvsResources_t res, + const char* filename, + cuvsCagraIndex_t index) +{ + return cuvs::core::translate_exceptions([=] { + // read the numpy dtype from the beginning of the file + std::ifstream is(filename, std::ios::in | std::ios::binary); + if (!is) { RAFT_FAIL("Cannot open file %s", filename); } + char dtype_string[4]; + is.read(dtype_string, 4); + auto dtype = raft::detail::numpy_serializer::parse_descr(std::string(dtype_string, 4)); + + index->dtype.bits = dtype.itemsize * 8; + if (dtype.kind == 'f' && dtype.itemsize == 4) { + index->addr = reinterpret_cast(_deserialize(res, filename)); + index->dtype.code = kDLFloat; + } 
else if (dtype.kind == 'i' && dtype.itemsize == 1) { + index->addr = reinterpret_cast(_deserialize(res, filename)); + index->dtype.code = kDLInt; + } else if (dtype.kind == 'u' && dtype.itemsize == 1) { + index->addr = reinterpret_cast(_deserialize(res, filename)); + index->dtype.code = kDLUInt; + } else { + RAFT_FAIL("Unsupported dtype in file %s", filename); + } + }); +} + +extern "C" cuvsError_t cuvsCagraSerialize(cuvsResources_t res, + const char* filename, + cuvsCagraIndex_t index, + bool include_dataset) +{ + return cuvs::core::translate_exceptions([=] { + if (index->dtype.code == kDLFloat && index->dtype.bits == 32) { + _serialize(res, filename, index, include_dataset); + } else if (index->dtype.code == kDLInt && index->dtype.bits == 8) { + _serialize(res, filename, index, include_dataset); + } else if (index->dtype.code == kDLUInt && index->dtype.bits == 8) { + _serialize(res, filename, index, include_dataset); + } else { + RAFT_FAIL("Unsupported index dtype: %d and bits: %d", index->dtype.code, index->dtype.bits); + } + }); +} diff --git a/docs/source/working_with_ann_indexes_c.rst b/docs/source/working_with_ann_indexes_c.rst index 89f368a..9d0c17a 100644 --- a/docs/source/working_with_ann_indexes_c.rst +++ b/docs/source/working_with_ann_indexes_c.rst @@ -62,7 +62,3 @@ Searching an index cuvsCagraIndexDestroy(index); cuvsCagraIndexParamsDestroy(index_params); cuvsResourcesDestroy(res); - - -Serializing an index --------------------- \ No newline at end of file diff --git a/docs/source/working_with_ann_indexes_cpp.rst b/docs/source/working_with_ann_indexes_cpp.rst index 1d3225c..ead6410 100644 --- a/docs/source/working_with_ann_indexes_cpp.rst +++ b/docs/source/working_with_ann_indexes_cpp.rst @@ -43,9 +43,3 @@ Searching an index cagra::search_params search_params; cagra::search(res, search_params, index, queries, neighbors, distances); - - -Serializing an index --------------------- - - diff --git a/docs/source/working_with_ann_indexes_python.rst 
b/docs/source/working_with_ann_indexes_python.rst index e9aba18..c075dea 100644 --- a/docs/source/working_with_ann_indexes_python.rst +++ b/docs/source/working_with_ann_indexes_python.rst @@ -33,8 +33,3 @@ Searching an index index = // ... build index ... neighbors, distances = cagra.search(search_params, index, queries, k) - - - -Serializing an index --------------------- \ No newline at end of file diff --git a/docs/source/working_with_ann_indexes_rust.rst b/docs/source/working_with_ann_indexes_rust.rst index e7da496..ba284d1 100644 --- a/docs/source/working_with_ann_indexes_rust.rst +++ b/docs/source/working_with_ann_indexes_rust.rst @@ -33,11 +33,3 @@ Building an index Ok(()) } - - -Searching an index ------------------- - - -Serializing an index --------------------- \ No newline at end of file diff --git a/python/cuvs/cuvs/neighbors/cagra/__init__.py b/python/cuvs/cuvs/neighbors/cagra/__init__.py index 0b487d7..dd87a64 100644 --- a/python/cuvs/cuvs/neighbors/cagra/__init__.py +++ b/python/cuvs/cuvs/neighbors/cagra/__init__.py @@ -19,6 +19,8 @@ IndexParams, SearchParams, build_index, + load, + save, search, ) @@ -28,5 +30,7 @@ "IndexParams", "SearchParams", "build_index", + "load", + "save", "search", ] diff --git a/python/cuvs/cuvs/neighbors/cagra/cagra.pxd b/python/cuvs/cuvs/neighbors/cagra/cagra.pxd index d229a3b..b23c2a4 100644 --- a/python/cuvs/cuvs/neighbors/cagra/cagra.pxd +++ b/python/cuvs/cuvs/neighbors/cagra/cagra.pxd @@ -23,6 +23,7 @@ from libc.stdint cimport ( uint64_t, uintptr_t, ) +from libcpp cimport bool from cuvs.common.c_api cimport cuvsError_t, cuvsResources_t from cuvs.common.cydlpack cimport DLDataType, DLManagedTensor @@ -110,3 +111,12 @@ cdef extern from "cuvs/neighbors/cagra.h" nogil: DLManagedTensor* queries, DLManagedTensor* neighbors, DLManagedTensor* distances) except + + + cuvsError_t cuvsCagraSerialize(cuvsResources_t res, + const char * filename, + cuvsCagraIndex_t index, + bool include_dataset) except + + + cuvsError_t 
cuvsCagraDeserialize(cuvsResources_t res, + const char * filename, + cuvsCagraIndex_t index) except + diff --git a/python/cuvs/cuvs/neighbors/cagra/cagra.pyx b/python/cuvs/cuvs/neighbors/cagra/cagra.pyx index 56439dd..3cdff37 100644 --- a/python/cuvs/cuvs/neighbors/cagra/cagra.pyx +++ b/python/cuvs/cuvs/neighbors/cagra/cagra.pyx @@ -23,6 +23,7 @@ from cuvs.common.resources import auto_sync_resources from cython.operator cimport dereference as deref from libcpp cimport bool, cast +from libcpp.string cimport string from cuvs.common cimport cydlpack @@ -282,7 +283,6 @@ def build_index(IndexParams index_params, dataset, resources=None): np.dtype('ubyte')]) cdef Index idx = Index() - cdef cuvsError_t build_status cdef cydlpack.DLManagedTensor* dataset_dlpack = \ cydlpack.dlpack_c(dataset_ai) cdef cuvsCagraIndexParams* params = index_params.params @@ -544,7 +544,6 @@ def search(SearchParams search_params, exp_rows=n_queries, exp_cols=k) cdef cuvsCagraSearchParams* params = &search_params.params - cdef cuvsError_t search_status cdef cydlpack.DLManagedTensor* queries_dlpack = \ cydlpack.dlpack_c(queries_cai) cdef cydlpack.DLManagedTensor* neighbors_dlpack = \ @@ -564,3 +563,80 @@ def search(SearchParams search_params, )) return (distances, neighbors) + + +@auto_sync_resources +def save(filename, Index index, bool include_dataset=True, resources=None): + """ + Saves the index to a file. + + Saving / loading the index is experimental. The serialization format is + subject to change. + + Parameters + ---------- + filename : string + Name of the file. + index : Index + Trained CAGRA index. + include_dataset : bool + Whether or not to write out the dataset along with the index. Including + the dataset in the serialized index will use extra disk space, and + might not be desired if you already have a copy of the dataset on + disk. If this option is set to false, you will have to call + `index.update_dataset(dataset)` after loading the index. 
+ {resources_docstring} + + Examples + -------- + >>> import cupy as cp + >>> from cuvs.neighbors import cagra + >>> n_samples = 50000 + >>> n_features = 50 + >>> dataset = cp.random.random_sample((n_samples, n_features), + ... dtype=cp.float32) + >>> # Build index + >>> index = cagra.build_index(cagra.IndexParams(), dataset) + >>> # Serialize and deserialize the cagra index built + >>> cagra.save("my_index.bin", index) + >>> index_loaded = cagra.load("my_index.bin") + """ + cdef string c_filename = filename.encode('utf-8') + cdef cuvsResources_t res = resources.get_c_obj() + check_cuvs(cuvsCagraSerialize(res, + c_filename.c_str(), + index.index, + include_dataset)) + + +@auto_sync_resources +def load(filename, resources=None): + """ + Loads index from file. + + Saving / loading the index is experimental. The serialization format is + subject to change, therefore loading an index saved with a previous + version of cuvs is not guaranteed to work. + + Parameters + ---------- + filename : string + Name of the file. 
+ {resources_docstring} + + Returns + ------- + index : Index + + """ + cdef Index idx = Index() + cdef cuvsResources_t res = resources.get_c_obj() + cdef string c_filename = filename.encode('utf-8') + + check_cuvs(cuvsCagraDeserialize( + res, + c_filename.c_str(), + idx.index + )) + idx.trained = True + return idx diff --git a/python/cuvs/cuvs/test/test_cagra.py b/python/cuvs/cuvs/test/test_cagra.py index a2b0184..15711c8 100644 --- a/python/cuvs/cuvs/test/test_cagra.py +++ b/python/cuvs/cuvs/test/test_cagra.py @@ -183,3 +183,53 @@ def test_cagra_vpq_compression(): run_cagra_build_search_test( n_cols=dim, compression=cagra.CompressionParams(pq_dim=dim / pq_len) ) + + +@pytest.mark.parametrize("dtype", [np.float32, np.int8, np.ubyte]) +# TODO: expose update_dataset +# @pytest.mark.parametrize("include_dataset", [True, False]) +@pytest.mark.parametrize("include_dataset", [True]) +def test_save_load(dtype, include_dataset): + n_rows = 10000 + n_cols = 50 + n_queries = 1000 + + dataset = generate_data((n_rows, n_cols), dtype) + dataset_device = device_ndarray(dataset) + + build_params = cagra.IndexParams() + index = cagra.build_index(build_params, dataset_device) + + assert index.trained + filename = "my_index.bin" + cagra.save(filename, index, include_dataset=include_dataset) + loaded_index = cagra.load(filename) + + # if we didn't save the dataset with the index, we need to update the + # index with an already loaded copy + if not include_dataset: + loaded_index.update_dataset(dataset) + + queries = generate_data((n_queries, n_cols), dtype) + + queries_device = device_ndarray(queries) + search_params = cagra.SearchParams() + k = 10 + + distance_dev, neighbors_dev = cagra.search( + search_params, index, queries_device, k + ) + + neighbors = neighbors_dev.copy_to_host() + dist = distance_dev.copy_to_host() + del index + + distance_dev, neighbors_dev = cagra.search( + search_params, loaded_index, queries_device, k + ) + + neighbors2 = neighbors_dev.copy_to_host() + 
dist2 = distance_dev.copy_to_host() + + assert np.all(neighbors == neighbors2) + assert np.allclose(dist, dist2, rtol=1e-6)