From fb8f3458053499fb414a4cf659d577b0727610c5 Mon Sep 17 00:00:00 2001 From: Martin Gaievski Date: Thu, 27 Oct 2022 17:32:54 -0700 Subject: [PATCH] Adding bool query post-filtering option Signed-off-by: Martin Gaievski --- benchmarks/perf-tool/README.md | 32 +++++++++---------- benchmarks/perf-tool/okpt/test/steps/steps.py | 21 +++++++++++- 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/benchmarks/perf-tool/README.md b/benchmarks/perf-tool/README.md index 239563e22..2a0e34f07 100644 --- a/benchmarks/perf-tool/README.md +++ b/benchmarks/perf-tool/README.md @@ -286,22 +286,22 @@ Runs a set of queries with filter against an index. ##### Parameters -| Parameter Name | Description | Default | -| ----------- |---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------| -| k | Number of neighbors to return on search | 100 | -| r | r value in Recall@R | 1 | -| index_name | Name of index to search | No default | -| field_name | Name field to search | No default | -| calculate_recall | Whether to calculate recall values | False | -| dataset_format | Format the dataset is in. Currently hdf5 and bigann is supported. The hdf5 file must be organized in the same way that the ann-benchmarks organizes theirs. | 'hdf5' | -| dataset_path | Path to dataset | No default | -| neighbors_format | Format the neighbors dataset is in. Currently hdf5 and bigann is supported. The hdf5 file must be organized in the same way that the ann-benchmarks organizes theirs. | 'hdf5' | -| neighbors_path | Path to neighbors dataset | No default | -| neighbors_dataset | Name of filter dataset inside the neighbors dataset | No default | -| filter_spec | Path to filter specification | No default | -| filter_type | Type of filter format, we do support following types:
FILTER inner filter format for approximate k-NN search
SCRIPT score scripting style with exact k-NN search | SCRIPT | -| score_script_similarity | Similarity function that has been used to index dataset. Used for SCRIPT filter type and ignored for others | l2 | -| query_count | Number of queries to create from data-set | Size of the data-set | +| Parameter Name | Description | Default | +| ----------- |-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------| +| k | Number of neighbors to return on search | 100 | +| r | r value in Recall@R | 1 | +| index_name | Name of index to search | No default | +| field_name | Name field to search | No default | +| calculate_recall | Whether to calculate recall values | False | +| dataset_format | Format the dataset is in. Currently hdf5 and bigann is supported. The hdf5 file must be organized in the same way that the ann-benchmarks organizes theirs. | 'hdf5' | +| dataset_path | Path to dataset | No default | +| neighbors_format | Format the neighbors dataset is in. Currently hdf5 and bigann is supported. The hdf5 file must be organized in the same way that the ann-benchmarks organizes theirs. | 'hdf5' | +| neighbors_path | Path to neighbors dataset | No default | +| neighbors_dataset | Name of filter dataset inside the neighbors dataset | No default | +| filter_spec | Path to filter specification | No default | +| filter_type | Type of filter format, we do support following types:
FILTER: inner filter format for approximate k-NN search
SCRIPT: score scripting with exact k-NN search and pre-filtering
BOOL_POST_FILTER Bool query with post-filtering | SCRIPT | +| score_script_similarity | Similarity function that has been used to index dataset. Used for SCRIPT filter type and ignored for others | l2 | +| query_count | Number of queries to create from data-set | Size of the data-set | ##### Metrics diff --git a/benchmarks/perf-tool/okpt/test/steps/steps.py b/benchmarks/perf-tool/okpt/test/steps/steps.py index 2fc35ca2d..d62f2858a 100644 --- a/benchmarks/perf-tool/okpt/test/steps/steps.py +++ b/benchmarks/perf-tool/okpt/test/steps/steps.py @@ -555,6 +555,25 @@ def get_body_filter(vec): } } } + elif self.filter_type == 'BOOL_POST_FILTER': + return { + 'size': self.k, + 'query': { + 'bool': { + 'filter': filter_json, + 'must': [ + { + 'knn': { + self.field_name: { + 'vector': vec, + 'k': self.k + } + } + } + ] + } + } + } else: raise ConfigurationError('Not supported filter type {}'.format(self.filter_type)) @@ -573,7 +592,7 @@ def get_body_filter(vec): results['took'] = [ float(query_response['took']) for query_response in query_responses ] - results['memory_kb'] = get_cache_size_in_kb(self.endpoint, 9200) + results['memory_kb'] = get_cache_size_in_kb(self.endpoint, 80) if self.calculate_recall: ids = [[int(hit['_id'])