
Commit

Cleaning up code, adding docs
Signed-off-by: Martin Gaievski <[email protected]>
martin-gaievski committed Nov 4, 2022
1 parent fb8f345 commit bc09d85
Showing 3 changed files with 56 additions and 34 deletions.
82 changes: 55 additions & 27 deletions benchmarks/perf-tool/add-filters-to-dataset.py
@@ -1,19 +1,41 @@
import getopt
import os
import random
import sys

import h5py

from osb.extensions.data_set import Context, HDF5DataSet

# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
"""
Script builds a complex dataset with additional attributes from an existing dataset that has only vectors.
Additional attributes are predefined in the script: color, taste, age. Only the HDF5 format of vector datasets is supported.
Script generates an additional dataset of neighbours (ground truth) for each filter type.
The output dataset file will have an additional dataset 'attributes' with multiple columns; each column corresponds to one attribute
from the attribute set, and values are generated at random, e.g.:
0: green None 71
1: green bitter 28
There is no explicit index reference in the 'attributes' dataset; the index of a row corresponds to a document id.
For instance, in the example above the two rows of fields are mapped to documents with ids '0' and '1'.
If the 'generate_filters' flag is set, the script generates an additional dataset of neighbours (ground truth) for each filter type.
Output is a new file with several datasets, where each dataset corresponds to one filter. Datasets are named 'neighbour_filter_X',
where X is the 1-based index of a particular filter.
Each dataset has rows with arrays of integers, where each integer corresponds to
a document id from the original dataset with additional fields. An array can have -1 values that are treated as null; this is because
the subset of filtered documents is the same size as or smaller than the original set.
For example, dataset file content may look like:
neighbour_filter_1: [[ 2, 5, -1],
[ 3, 1, -1],
[ 2, 5, 7]]
neighbour_filter_2: [[-1, -1, -1],
[ 5, 6, -1],
[ 4, 2, 1]]
Example of usage:
In this case we have datasets for two filters, with 3 query results for each. [2, 5, -1] indicates that, for the first query,
if filter 1 is used the most similar document has id 2, the next most similar has id 5, and the rest do not pass the filter 1 criteria.
Example of script usage:
create a new hdf5 file with an attributes dataset
add-filters-to-dataset.py ~/dev/opensearch/k-NN/benchmarks/perf-tool/dataset/data.hdf5 ~/dev/opensearch/datasets/data-with-attr True False
@@ -22,16 +44,21 @@
add-filters-to-dataset.py ~/dev/opensearch/k-NN/benchmarks/perf-tool/dataset/data-with-attr.hdf5 ~/dev/opensearch/datasets/data-with-filters False True
"""

class Dataset():
DEFAULT_INDEX_NAME = "test-index"
DEFAULT_FIELD_NAME = "test-field"
DEFAULT_CONTEXT = Context.INDEX
import getopt
import os
import random
import sys

import h5py

from osb.extensions.data_set import HDF5DataSet


class _Dataset:
"""Type of dataset container for data with additional attributes"""
DEFAULT_TYPE = HDF5DataSet.FORMAT_NAME
DEFAULT_NUM_VECTORS = 10
DEFAULT_DIMENSION = 10
DEFAULT_RANDOM_STRING_LENGTH = 8

def createDataset(self, source_dataset_path, out_file_path, generate_attrs: bool, generate_filters: bool) -> None:
def create_dataset(self, source_dataset_path, out_file_path, generate_attrs: bool, generate_filters: bool) -> None:
path_elements = os.path.split(os.path.abspath(source_dataset_path))
data_set_dir = path_elements[0]

@@ -104,8 +131,8 @@ def filter3(attributes, vector_idx):
# filter 4 - color green or blue and taste is bitter and age is between (30, 60)
def filter4(attributes, vector_idx):
if (attributes[vector_idx][0].decode() == 'green' or attributes[vector_idx][0].decode() == 'blue') \
and (attributes[vector_idx][1].decode() == 'bitter') \
and 30 <= int(attributes[vector_idx][2].decode()) <= 60:
and (attributes[vector_idx][1].decode() == 'bitter') \
and 30 <= int(attributes[vector_idx][2].decode()) <= 60:
return True
else:
return False
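To illustrate the predicate above: attribute values are stored as byte strings, hence the .decode() calls. A quick check with hand-made rows (the values are illustrative, and filter4 is the function defined above):

attributes = [
    [b'green', b'bitter', b'45'],  # passes: color, taste and age all match
    [b'red', b'bitter', b'45'],    # fails: color is neither green nor blue
]
print(filter4(attributes, 0))  # True
print(filter4(attributes, 1))  # False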
@@ -140,7 +167,7 @@ def apply_filter(self, expected_neighbors, attributes, data_set_w_filtering, fil
filtered_count += 1
neighbors_filter.append(neighbors_filter_row)
overall_count = len(expected_neighbors) * len(expected_neighbors[0])
perc = float(filtered_count/overall_count) * 100
perc = float(filtered_count / overall_count) * 100
print('ground truth size for {} is {}, percentage {}'.format(filter_name, filtered_count, perc))
data_set_w_filtering.create_dataset(filter_name, data=neighbors_filter)
return expected_neighbors
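A condensed sketch of the padding scheme apply_filter implements (the helper name is hypothetical, not from the script): neighbor ids that pass the filter are kept in order, and the row is padded back to its original length with -1.

def pad_filtered_row(neighbor_ids, passes_filter, row_len):
    # Keep ids that satisfy the filter, then pad with -1 ("null") entries.
    kept = [doc_id for doc_id in neighbor_ids if passes_filter(doc_id)]
    return kept + [-1] * (row_len - len(kept))

print(pad_filtered_row([2, 5, 7], lambda doc_id: doc_id != 7, 3))  # [2, 5, -1]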
@@ -161,12 +188,13 @@ def main(argv):
generate_attr = str2bool(args[2])
generate_filters = str2bool(args[3])

worker = Dataset()
worker.createDataset(in_file_path, out_file_path, generate_attr, generate_filters)
worker = _Dataset()
worker.create_dataset(in_file_path, out_file_path, generate_attr, generate_filters)


def str2bool(v):
return v.lower() in ("yes", "true", "t", "1")
return v.lower() in ("yes", "true", "t", "1")

if __name__ == "__main__":
main(sys.argv[1:])

if __name__ == "__main__":
main(sys.argv[1:])
2 changes: 1 addition & 1 deletion benchmarks/perf-tool/okpt/io/dataset.py
@@ -94,7 +94,7 @@ def _parse_context(context: Context, custom_context=None) -> str:
return "neighbors"

if context == Context.INDEX:
return "train"
return "neighbors"

if context == Context.QUERY:
return "test"
@@ -36,12 +36,6 @@
{
"color": "sweet"
}
},
{
"term":
{
"taste": "bitter"
}
}
]
}
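For reference, a sketch of the remaining filter after this change, assuming the term clauses sit in a bool query's clause array (the enclosing structure is not shown in the diff); built as a Python dict to keep the snippet self-contained:

import json

remaining_filter = {
    "bool": {
        "should": [
            {"term": {"color": "sweet"}}  # the clause kept by this change
        ]
    }
}
print(json.dumps(remaining_filter, indent=2))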
