diff --git a/benchmarks/perf-tool/README.md b/benchmarks/perf-tool/README.md index 9c1c189182..91bdcf16be 100644 --- a/benchmarks/perf-tool/README.md +++ b/benchmarks/perf-tool/README.md @@ -13,18 +13,36 @@ file. ## Install Prerequisites -### Python +### Setup -Python 3.7 or above is required. +The k-NN perf tool requires Python 3.8 or greater to be installed. One of +the easier ways to do this is through Conda, a package and environment +management system for Python. -### Pip +First, follow the +[installation instructions](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html) +to install Conda on your system. -Use pip to install the necessary requirements: +Next, create a Python 3.8 environment: +``` +conda create -n knn-perf python=3.8 +``` +After the environment is created, activate it: ``` +conda activate knn-perf +``` + +Lastly, clone the k-NN repo and install all required Python packages: +``` +git clone https://github.com/opensearch-project/k-NN.git +cd k-NN/benchmarks/perf-tool pip install -r requirements.txt ``` +After all of this completes, you should be ready to run your first performance benchmarks!
+ + ## Usage ### Quick Start diff --git a/benchmarks/perf-tool/okpt/io/config/parsers/test.py b/benchmarks/perf-tool/okpt/io/config/parsers/test.py index 34b1752c72..d0ef4c02fe 100644 --- a/benchmarks/perf-tool/okpt/io/config/parsers/test.py +++ b/benchmarks/perf-tool/okpt/io/config/parsers/test.py @@ -23,6 +23,7 @@ class TestConfig: test_name: str test_id: str endpoint: str + port: int num_runs: int show_runs: bool setup: List[Step] @@ -48,6 +49,9 @@ def parse(self, file_obj: TextIOWrapper) -> TestConfig: if 'endpoint' in config_obj: implicit_step_config['endpoint'] = config_obj['endpoint'] + if 'port' in config_obj: + implicit_step_config['port'] = config_obj['port'] + # Each step should have its own parse - take the config object and check if its valid setup = [] if 'setup' in config_obj: @@ -62,6 +66,7 @@ def parse(self, file_obj: TextIOWrapper) -> TestConfig: test_config = TestConfig( endpoint=config_obj['endpoint'], + port=config_obj['port'], test_name=config_obj['test_name'], test_id=config_obj['test_id'], num_runs=config_obj['num_runs'], diff --git a/benchmarks/perf-tool/okpt/io/config/schemas/test.yml b/benchmarks/perf-tool/okpt/io/config/schemas/test.yml index 1939a8a311..713c656d08 100644 --- a/benchmarks/perf-tool/okpt/io/config/schemas/test.yml +++ b/benchmarks/perf-tool/okpt/io/config/schemas/test.yml @@ -9,6 +9,9 @@ endpoint: type: string default: "localhost" +port: + type: integer + default: 80 test_name: type: string test_id: diff --git a/benchmarks/perf-tool/okpt/test/steps/steps.py b/benchmarks/perf-tool/okpt/test/steps/steps.py index 0de61078fc..b04a4af4de 100644 --- a/benchmarks/perf-tool/okpt/test/steps/steps.py +++ b/benchmarks/perf-tool/okpt/test/steps/steps.py @@ -5,7 +5,7 @@ # compatible open source license. """Provides steps for OpenSearch tests. 
-Some of the OpenSearch operations return a `took` field in the response body, +Some OpenSearch operations return a `took` field in the response body, so the profiling decorators aren't needed for some functions. """ import json @@ -454,8 +454,7 @@ def _action(self): results['took'] = [ float(query_response['took']) for query_response in query_responses ] - port = 9200 if self.endpoint == 'localhost' else 80 - results['memory_kb'] = get_cache_size_in_kb(self.endpoint, port) + results['memory_kb'] = get_cache_size_in_kb(self.endpoint, self.port) if self.calculate_recall: ids = [[int(hit['_id']) @@ -614,7 +613,6 @@ def _action(self): num_of_search_segments = 0; for shard_key in shards.keys(): for segment in shards[shard_key]: - num_of_committed_segments += segment["num_committed_segments"] num_of_search_segments += segment["num_search_segments"] @@ -689,12 +687,13 @@ def delete_model(endpoint, port, model_id): return response.json() -def get_opensearch_client(endpoint: str, port: int): +def get_opensearch_client(endpoint: str, port: int, timeout=60): """ Get an opensearch client from an endpoint and port Args: endpoint: Endpoint OpenSearch is running on port: Port OpenSearch is running on + timeout: timeout for OpenSearch client, default value 60 Returns: OpenSearch client @@ -708,7 +707,7 @@ def get_opensearch_client(endpoint: str, port: int): use_ssl=False, verify_certs=False, connection_class=RequestsHttpConnection, - timeout=60, + timeout=timeout, ) diff --git a/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/relaxed-filter/index.json b/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/relaxed-filter/index.json new file mode 100644 index 0000000000..b8f591176c --- /dev/null +++ b/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/relaxed-filter/index.json @@ -0,0 +1,26 @@ +{ + "settings": { + "index": { + "knn": true, + "number_of_shards": 24, + "number_of_replicas": 1 + } + }, + "mappings": { + "properties": { + "target_field": { + 
"type": "knn_vector", + "dimension": 128, + "method": { + "name": "hnsw", + "space_type": "l2", + "engine": "faiss", + "parameters": { + "ef_construction": 256, + "m": 16 + } + } + } + } + } +} diff --git a/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/relaxed-filter/relaxed-filter-spec.json b/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/relaxed-filter/relaxed-filter-spec.json new file mode 100644 index 0000000000..fecde03928 --- /dev/null +++ b/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/relaxed-filter/relaxed-filter-spec.json @@ -0,0 +1,42 @@ +{ + "bool": + { + "should": + [ + { + "range": + { + "age": + { + "gte": 30, + "lte": 70 + } + } + }, + { + "term": + { + "color": "green" + } + }, + { + "term": + { + "color": "blue" + } + }, + { + "term": + { + "color": "yellow" + } + }, + { + "term": + { + "color": "sweet" + } + } + ] + } +} \ No newline at end of file diff --git a/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/relaxed-filter/relaxed-filter-test.yml b/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/relaxed-filter/relaxed-filter-test.yml new file mode 100644 index 0000000000..348ee4a8e3 --- /dev/null +++ b/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/relaxed-filter/relaxed-filter-test.yml @@ -0,0 +1,35 @@ +endpoint: "navneev.aka.corp.amazon.com" +port: 9900 +test_name: "Faiss HNSW Relaxed Filter Test" +test_id: "Faiss HNSW Relaxed Filter Test" +num_runs: 10 +show_runs: false +steps: + - name: delete_index + index_name: target_index + - name: create_index + index_name: target_index + index_spec: [INDEX_SPEC_PATH]/relaxed-filter/index.json + - name: ingest_multi_field + index_name: target_index + field_name: target_field + bulk_size: 500 + dataset_format: hdf5 + dataset_path: [DATASET_PATH]/sift-128-euclidean-with-attr.hdf5 + attributes_dataset_name: attributes + attribute_spec: [ { name: 'color', type: 'str' }, { name: 'taste', type: 'str' }, { name: 'age', type: 'int' } ] + - name: 
refresh_index + index_name: target_index + - name: query_with_filter + k: 100 + r: 1 + calculate_recall: true + index_name: target_index + field_name: target_field + dataset_format: hdf5 + dataset_path: [DATASET_PATH]/sift-128-euclidean-with-attr.hdf5 + neighbors_format: hdf5 + neighbors_path: [DATASET_PATH]/sift-128-euclidean-with-filters.hdf5 + neighbors_dataset: neighbors_filter_5 + filter_spec: [INDEX_SPEC_PATH]/relaxed-filter-spec.json + filter_type: FILTER diff --git a/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/restrictive-filter/index.json b/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/restrictive-filter/index.json new file mode 100644 index 0000000000..b8f591176c --- /dev/null +++ b/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/restrictive-filter/index.json @@ -0,0 +1,26 @@ +{ + "settings": { + "index": { + "knn": true, + "number_of_shards": 24, + "number_of_replicas": 1 + } + }, + "mappings": { + "properties": { + "target_field": { + "type": "knn_vector", + "dimension": 128, + "method": { + "name": "hnsw", + "space_type": "l2", + "engine": "faiss", + "parameters": { + "ef_construction": 256, + "m": 16 + } + } + } + } + } +} diff --git a/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/restrictive-filter/restrictive-filter-spec.json b/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/restrictive-filter/restrictive-filter-spec.json new file mode 100644 index 0000000000..9e6356f1c7 --- /dev/null +++ b/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/restrictive-filter/restrictive-filter-spec.json @@ -0,0 +1,44 @@ +{ + "bool": + { + "must": + [ + { + "range": + { + "age": + { + "gte": 30, + "lte": 60 + } + } + }, + { + "term": + { + "taste": "bitter" + } + }, + { + "bool": + { + "should": + [ + { + "term": + { + "color": "blue" + } + }, + { + "term": + { + "color": "green" + } + } + ] + } + } + ] + } +} \ No newline at end of file diff --git 
a/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/restrictive-filter/restrictive-filter-test.yml b/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/restrictive-filter/restrictive-filter-test.yml new file mode 100644 index 0000000000..bf02144ac5 --- /dev/null +++ b/benchmarks/perf-tool/release-configs/faiss-hnsw/filtering/restrictive-filter/restrictive-filter-test.yml @@ -0,0 +1,37 @@ +endpoint: [ENDPOINT] +test_name: "Faiss HNSW Restrictive Filter Test" +test_id: "Faiss HNSW Restrictive Filter Test" +num_runs: 10 +show_runs: false +steps: + - name: delete_index + index_name: target_index + - name: create_index + index_name: target_index + index_spec: [INDEX_SPEC_PATH]/index.json + - name: ingest_multi_field + index_name: target_index + field_name: target_field + bulk_size: 500 + dataset_format: hdf5 + dataset_path: [DATASET_PATH]/sift-128-euclidean-with-attr.hdf5 + attributes_dataset_name: attributes + attribute_spec: [ { name: 'color', type: 'str' }, { name: 'taste', type: 'str' }, { name: 'age', type: 'int' } ] + - name: refresh_index + index_name: target_index + - name: force_merge + index_name: target_index + max_num_segments: 1 + - name: query_with_filter + k: 100 + r: 1 + calculate_recall: true + index_name: target_index + field_name: target_field + dataset_format: hdf5 + dataset_path: [DATASET_PATH]/sift-128-euclidean-with-attr.hdf5 + neighbors_format: hdf5 + neighbors_path: [DATASET_PATH]/sift-128-euclidean-with-filters.hdf5 + neighbors_dataset: neighbors_filter_4 + filter_spec: [INDEX_SPEC_PATH]/restrictive-filter-spec.json + filter_type: FILTER diff --git a/benchmarks/perf-tool/release-configs/faiss-hnsw/index.json b/benchmarks/perf-tool/release-configs/faiss-hnsw/index.json new file mode 100644 index 0000000000..b8f591176c --- /dev/null +++ b/benchmarks/perf-tool/release-configs/faiss-hnsw/index.json @@ -0,0 +1,26 @@ +{ + "settings": { + "index": { + "knn": true, + "number_of_shards": 24, + "number_of_replicas": 1 + } + }, + 
"mappings": { + "properties": { + "target_field": { + "type": "knn_vector", + "dimension": 128, + "method": { + "name": "hnsw", + "space_type": "l2", + "engine": "faiss", + "parameters": { + "ef_construction": 256, + "m": 16 + } + } + } + } + } +} diff --git a/benchmarks/perf-tool/release-configs/faiss-hnsw/test.yml b/benchmarks/perf-tool/release-configs/faiss-hnsw/test.yml new file mode 100644 index 0000000000..f3e976cf3c --- /dev/null +++ b/benchmarks/perf-tool/release-configs/faiss-hnsw/test.yml @@ -0,0 +1,32 @@ +endpoint: localhost +test_name: "Faiss HNSW Test" +test_id: "Faiss HNSW Test" +num_runs: 10 +show_runs: false +steps: + - name: delete_index + index_name: target_index + - name: create_index + index_name: target_index + index_spec: /home/ec2-user/[PATH]/index.json + - name: ingest + index_name: target_index + field_name: target_field + bulk_size: 500 + dataset_format: hdf5 + dataset_path: [DATASET_PATH]/sift-128-euclidean.hdf5 + - name: refresh_index + index_name: target_index + - name: force_merge + index_name: target_index + max_num_segments: 1 + - name: query + k: 100 + r: 1 + calculate_recall: true + index_name: target_index + field_name: target_field + dataset_format: hdf5 + dataset_path: [DATASET_PATH]/sift-128-euclidean.hdf5 + neighbors_format: hdf5 + neighbors_path: [DATASET_PATH]/sift-128-euclidean.hdf5 diff --git a/benchmarks/perf-tool/release-configs/lucene-hnsw/filtering/relaxed-filter/relaxed-filter-test.yml b/benchmarks/perf-tool/release-configs/lucene-hnsw/filtering/relaxed-filter/relaxed-filter-test.yml index f20fba2031..44ed8e66e2 100644 --- a/benchmarks/perf-tool/release-configs/lucene-hnsw/filtering/relaxed-filter/relaxed-filter-test.yml +++ b/benchmarks/perf-tool/release-configs/lucene-hnsw/filtering/relaxed-filter/relaxed-filter-test.yml @@ -1,6 +1,6 @@ endpoint: [ENDPOINT] -test_name: "index-workflow" -test_id: "Index workflow" +test_name: "Lucene HNSW Relaxed Filter Test" +test_id: "Lucene HNSW Relaxed Filter Test" num_runs: 10 
show_runs: false steps: diff --git a/benchmarks/perf-tool/release-configs/lucene-hnsw/filtering/restrictive-filter/restrictive-filter-test.yml b/benchmarks/perf-tool/release-configs/lucene-hnsw/filtering/restrictive-filter/restrictive-filter-test.yml index b1d7b60d7b..d7f451a48e 100644 --- a/benchmarks/perf-tool/release-configs/lucene-hnsw/filtering/restrictive-filter/restrictive-filter-test.yml +++ b/benchmarks/perf-tool/release-configs/lucene-hnsw/filtering/restrictive-filter/restrictive-filter-test.yml @@ -1,6 +1,6 @@ endpoint: [ENDPOINT] -test_name: "index-workflow" -test_id: "Index workflow" +test_name: "Lucene HNSW Restrictive Filter Test" +test_id: "Lucene HNSW Restrictive Filter Test" num_runs: 10 show_runs: false steps: @@ -8,17 +8,20 @@ steps: index_name: target_index - name: create_index index_name: target_index - index_spec: [INDEX_SPEC_PATH]/index.json + index_spec: /home/ec2-user/k-NN/benchmarks/perf-tool/release-configs/lucene-hnsw/filtering/restrictive-filter/index.json - name: ingest_multi_field index_name: target_index field_name: target_field bulk_size: 500 dataset_format: hdf5 - dataset_path: [DATASET_PATH]/sift-128-euclidean-with-attr.hdf5 + dataset_path: /home/ec2-user/k-NN/benchmarks/perf-tool/dataset/sift-128-euclidean-with-attr.hdf5 attributes_dataset_name: attributes attribute_spec: [ { name: 'color', type: 'str' }, { name: 'taste', type: 'str' }, { name: 'age', type: 'int' } ] - name: refresh_index index_name: target_index + - name: force_merge + index_name: target_index + max_num_segments: 1 - name: query_with_filter k: 100 r: 1 @@ -26,9 +29,9 @@ steps: index_name: target_index field_name: target_field dataset_format: hdf5 - dataset_path: [DATASET_PATH]/sift-128-euclidean-with-attr.hdf5 + dataset_path: /home/ec2-user/k-NN/benchmarks/perf-tool/dataset/sift-128-euclidean-with-attr.hdf5 neighbors_format: hdf5 - neighbors_path: [DATASET_PATH]/sift-128-euclidean-with-filters.hdf5 + neighbors_path: 
/home/ec2-user/k-NN/benchmarks/perf-tool/dataset/sift-128-euclidean-with-filters.hdf5 neighbors_dataset: neighbors_filter_4 - filter_spec: [INDEX_SPEC_PATH]/restrictive-filter-spec.json + filter_spec: /home/ec2-user/k-NN/benchmarks/perf-tool/release-configs/lucene-hnsw/filtering/restrictive-filter/restrictive-filter-spec.json filter_type: FILTER diff --git a/jni/src/faiss_wrapper.cpp b/jni/src/faiss_wrapper.cpp index 2e626f9c6c..a1bbb96352 100644 --- a/jni/src/faiss_wrapper.cpp +++ b/jni/src/faiss_wrapper.cpp @@ -256,7 +256,6 @@ jobjectArray knn_jni::faiss_wrapper::QueryIndex_WithFilter(knn_jni::JNIUtilInter jniUtil->ReleaseIntArrayElements(env, filterIdsJ, filteredIdsArray, JNI_ABORT); } else { try { - std::cout << "Doing query" << std::endl; indexReader->search(1, rawQueryvector, kJ, dis.data(), ids.data()); } catch (...) { jniUtil->ReleaseFloatArrayElements(env, queryVectorJ, rawQueryvector, JNI_ABORT); @@ -286,33 +285,6 @@ jobjectArray knn_jni::faiss_wrapper::QueryIndex_WithFilter(knn_jni::JNIUtilInter return results; } -/** - * Based on the type of the index reader we need to return the SearchParameters. The way we do this by dynamically - * casting the IndexReader. - * @param indexReader - * @param idSelector - * @return SearchParameters - */ -std::unique_ptr<faiss::SearchParameters> buildSearchParams(const faiss::IndexIDMap *indexReader, faiss::IDSelector* idSelector) { - auto hnswReader = dynamic_cast<const faiss::IndexHNSW*>(indexReader->index); - if(hnswReader) { - // we need to make this variable unique_ptr so that the scope can be shared with caller function. - std::unique_ptr<faiss::SearchParametersHNSW> hnswParams(new faiss::SearchParametersHNSW); - hnswParams->sel = idSelector; - return hnswParams; - } - - auto ivfReader = dynamic_cast<const faiss::IndexIVF*>(indexReader->index); - auto ivfFlatReader = dynamic_cast<const faiss::IndexIVFFlat*>(indexReader->index); - if(ivfReader || ivfFlatReader) { - // we need to make this variable unique_ptr so that the scope can be shared with caller function. 
- std::unique_ptr<faiss::SearchParametersIVF> ivfParams(new faiss::SearchParametersIVF); - ivfParams->sel = idSelector; - return ivfParams; - } - throw std::runtime_error("Invalid Index Type supported for Filtered Search on Faiss"); -} - void knn_jni::faiss_wrapper::Free(jlong indexPointer) { auto *indexWrapper = reinterpret_cast<faiss::Index*>(indexPointer); delete indexWrapper; @@ -499,3 +471,33 @@ void buildFilterIdsBitMap(const int* filterIds, int filterIdsLength, uint8_t* bi bitsetVector[bitsetArrayIndex] = bitsetVector[bitsetArrayIndex] | (1 << (value & 7)); } } + +/** + * Based on the type of the index reader we need to return the SearchParameters. The way we do this is by dynamically + * casting the IndexReader. + * @param indexReader + * @param idSelector + * @return SearchParameters + */ +std::unique_ptr<faiss::SearchParameters> buildSearchParams(const faiss::IndexIDMap *indexReader, faiss::IDSelector* idSelector) { + auto hnswReader = dynamic_cast<const faiss::IndexHNSW*>(indexReader->index); + if(hnswReader) { + // we need to make this variable unique_ptr so that the scope can be shared with caller function. + std::unique_ptr<faiss::SearchParametersHNSW> hnswParams(new faiss::SearchParametersHNSW); + // Set ef_search to the value that was provided during index creation. Without this, the + // SearchParametersHNSW default of ef_search = 16 would be used for the search instead. + hnswParams->efSearch = hnswReader->hnsw.efSearch; + hnswParams->sel = idSelector; + return hnswParams; + } + + auto ivfReader = dynamic_cast<const faiss::IndexIVF*>(indexReader->index); + auto ivfFlatReader = dynamic_cast<const faiss::IndexIVFFlat*>(indexReader->index); + if(ivfReader || ivfFlatReader) { + // we need to make this variable unique_ptr so that the scope can be shared with caller function. + std::unique_ptr<faiss::SearchParametersIVF> ivfParams(new faiss::SearchParametersIVF); + ivfParams->sel = idSelector; + return ivfParams; + } + throw std::runtime_error("Invalid Index Type supported for Filtered Search on Faiss"); +}