Adding workflow for benchmarking queries with filters

Signed-off-by: Martin Gaievski <[email protected]>
(cherry picked from commit 79ae6c2)
Co-authored-by: Martin Gaievski <[email protected]>
1 parent 9d4cf3f · commit a5a4b6b · 15 changed files with 752 additions and 42 deletions
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
""" | ||
Script builds complex dataset with additional attributes from exiting dataset that has only vectors. | ||
Additional attributes are predefined in the script: color, taste, age. Only HDF5 format of vector dataset is supported. | ||
Output dataset file will have additional dataset 'attributes' with multiple columns, each column corresponds to one attribute | ||
from an attribute set, and value is generated at random, e.g.: | ||
0: green None 71 | ||
1: green bitter 28 | ||
there is no explicit index reference in 'attributes' dataset, index of the row corresponds to a document id. | ||
For instance, in example above two rows of fields mapped to documents with ids '0' and '1'. | ||
If 'generate_filters' flag is set script generates additional dataset of neighbours (ground truth) for each filter type. | ||
Output is a new file with several datasets, each dataset corresponds to one filter. Datasets are named 'neighbour_filter_X' | ||
where X is 1 based index of particular filter. | ||
Each dataset has rows with array of integers, where integer corresponds to | ||
a document id from original dataset with additional fields. Array ca have -1 values that are treated as null, this is because | ||
subset of filtered documents is same of smaller than original set. | ||
For example, dataset file content may look like : | ||
neighbour_filter_1: [[ 2, 5, -1], | ||
[ 3, 1, -1], | ||
[ 2 5, 7]] | ||
neighbour_filter_2: [[-1, -1, -1], | ||
[ 5, 6, -1], | ||
[ 4, 2, 1]] | ||
In this case we do have datasets for two filters, 3 query results for each. [2, 5, -1] indicates that for first query | ||
if filter 1 is used most similar document is with id 2, next similar is 5, and the rest do not pass filter 1 criteria. | ||
Example of script usage: | ||
create new hdf5 file with attribute dataset | ||
add-filters-to-dataset.py ~/dev/opensearch/k-NN/benchmarks/perf-tool/dataset/data.hdf5 ~/dev/opensearch/datasets/data-with-attr True False | ||
create new hdf5 file with filter datasets | ||
add-filters-to-dataset.py ~/dev/opensearch/k-NN/benchmarks/perf-tool/dataset/data-with-attr.hdf5 ~/dev/opensearch/datasets/data-with-filters False True | ||
""" | ||

import getopt
import os
import random
import sys

import h5py

from osb.extensions.data_set import HDF5DataSet


class _Dataset:
    """Type of dataset container for data with additional attributes"""
    DEFAULT_TYPE = HDF5DataSet.FORMAT_NAME

    def create_dataset(self, source_dataset_path, out_file_path, generate_attrs: bool, generate_filters: bool) -> None:
        path_elements = os.path.split(os.path.abspath(source_dataset_path))
        data_set_dir = path_elements[0]

        # read the existing dataset
        data_hdf5 = os.path.join(os.path.dirname(os.path.realpath('/')), source_dataset_path)

        with h5py.File(data_hdf5, "r") as hf:

            if generate_attrs:
                data_set_w_attr = self.create_dataset_file(out_file_path, self.DEFAULT_TYPE, data_set_dir)

                possible_colors = ['red', 'green', 'yellow', 'blue', None]
                possible_tastes = ['sweet', 'salty', 'sour', 'bitter', None]
                max_age = 100

                # copy the original vector datasets over unchanged
                for key in hf.keys():
                    if key not in ['neighbors', 'test', 'train']:
                        continue
                    data_set_w_attr.create_dataset(key, data=hf[key][()])

                # one attribute row per training vector: [color, taste, age]
                attributes = []
                for i in range(len(hf['train'])):
                    attr = [random.choice(possible_colors), random.choice(possible_tastes),
                            random.randint(0, max_age)]  # randint is inclusive on both ends
                    attributes.append(attr)

                data_set_w_attr.create_dataset('attributes', (len(attributes), 3), 'S10', data=attributes)

                data_set_w_attr.flush()
                data_set_w_attr.close()

            if generate_filters:
                attributes = hf['attributes'][()]
                expected_neighbors = hf['neighbors'][()]

                data_set_filters = self.create_dataset_file(out_file_path, self.DEFAULT_TYPE, data_set_dir)

                # filter 1 - color = 'red' and age >= 20
                def filter1(attributes, vector_idx):
                    return attributes[vector_idx][0].decode() == 'red' \
                           and int(attributes[vector_idx][2].decode()) >= 20

                self.apply_filter(expected_neighbors, attributes, data_set_filters, 'neighbors_filter_1', filter1)

                # filter 2 - color = 'blue' or None, and taste = 'salty'
                def filter2(attributes, vector_idx):
                    return (attributes[vector_idx][0].decode() == 'blue'
                            or attributes[vector_idx][0].decode() == 'None') \
                           and attributes[vector_idx][1].decode() == 'salty'

                self.apply_filter(expected_neighbors, attributes, data_set_filters, 'neighbors_filter_2', filter2)

                # filter 3 - color and taste are not None, and age is between 20 and 80
                def filter3(attributes, vector_idx):
                    return attributes[vector_idx][0].decode() != 'None' \
                           and attributes[vector_idx][1].decode() != 'None' \
                           and 20 <= int(attributes[vector_idx][2].decode()) <= 80

                self.apply_filter(expected_neighbors, attributes, data_set_filters, 'neighbors_filter_3', filter3)

                # filter 4 - color is 'green' or 'blue', taste is 'bitter', and age is between 30 and 60
                def filter4(attributes, vector_idx):
                    return (attributes[vector_idx][0].decode() == 'green'
                            or attributes[vector_idx][0].decode() == 'blue') \
                           and attributes[vector_idx][1].decode() == 'bitter' \
                           and 30 <= int(attributes[vector_idx][2].decode()) <= 60

                self.apply_filter(expected_neighbors, attributes, data_set_filters, 'neighbors_filter_4', filter4)

                # filter 5 - color is 'green', 'blue' or 'yellow', or taste = 'sweet', or age is between 30 and 70
                def filter5(attributes, vector_idx):
                    return attributes[vector_idx][0].decode() in ('green', 'blue', 'yellow') \
                           or attributes[vector_idx][1].decode() == 'sweet' \
                           or 30 <= int(attributes[vector_idx][2].decode()) <= 70

                self.apply_filter(expected_neighbors, attributes, data_set_filters, 'neighbors_filter_5', filter5)

                data_set_filters.flush()
                data_set_filters.close()

    def apply_filter(self, expected_neighbors, attributes, data_set_w_filtering, filter_name, filter_func):
        neighbors_filter = []
        filtered_count = 0
        for expected_neighbors_row in expected_neighbors:
            # neighbours that pass the filter are compacted to the front of the row; the rest stay -1 (null)
            neighbors_filter_row = [-1] * len(expected_neighbors_row)
            idx = 0
            for vector_idx in expected_neighbors_row:
                if filter_func(attributes, vector_idx):
                    neighbors_filter_row[idx] = vector_idx
                    idx += 1
                    filtered_count += 1
            neighbors_filter.append(neighbors_filter_row)
        overall_count = len(expected_neighbors) * len(expected_neighbors[0])
        perc = float(filtered_count / overall_count) * 100
        print('ground truth size for {} is {}, percentage {}'.format(filter_name, filtered_count, perc))
        data_set_w_filtering.create_dataset(filter_name, data=neighbors_filter)
        return expected_neighbors

    def create_dataset_file(self, file_name, extension, data_set_dir) -> h5py.File:
        data_set_file_name = "{}.{}".format(file_name, extension)
        data_set_path = os.path.join(data_set_dir, data_set_file_name)

        data_set_w_filtering = h5py.File(data_set_path, 'a')

        return data_set_w_filtering


def main(argv):
    opts, args = getopt.getopt(argv, "")
    in_file_path = args[0]
    out_file_path = args[1]
    generate_attr = str2bool(args[2])
    generate_filters = str2bool(args[3])

    worker = _Dataset()
    worker.create_dataset(in_file_path, out_file_path, generate_attr, generate_filters)


def str2bool(v):
    return v.lower() in ("yes", "true", "t", "1")


if __name__ == "__main__":
    main(sys.argv[1:])
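
For context, here is a minimal sketch of how the files generated by this script could be consumed, e.g. to spot-check the 'attributes' rows and to compute recall for a filtered query against a 'neighbors_filter_X' ground truth. The file paths and the retrieved ids are hypothetical placeholders, and the recall helper is illustrative rather than part of this commit:

    import h5py
    import numpy as np

    # Hypothetical paths; substitute the outputs of the two invocations shown in the docstring.
    attr_file = 'data-with-attr.hdf5'
    filters_file = 'data-with-filters.hdf5'

    with h5py.File(attr_file, 'r') as hf:
        attributes = hf['attributes'][()]
        # Row index is the document id; values are 'S10' byte strings: [color, taste, age].
        for doc_id in range(2):
            color, taste, age = (v.decode() for v in attributes[doc_id])
            print(doc_id, color, taste, age)

    with h5py.File(filters_file, 'r') as hf:
        ground_truth = hf['neighbors_filter_1'][()]

    def filtered_recall(retrieved_row, truth_row):
        """Recall against one ground-truth row, ignoring the -1 (null) padding."""
        valid = truth_row[truth_row != -1]
        if len(valid) == 0:
            # nothing passes the filter, so an empty result is trivially correct
            return 1.0
        return len(np.intersect1d(retrieved_row, valid)) / len(valid)

    # e.g. recall of a hypothetical filtered k-NN result for the first query vector:
    print(filtered_recall(np.array([2, 5, 9]), ground_truth[0]))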