diff --git a/benchmarks/perf-tool/add-filters-to-dataset.py b/benchmarks/perf-tool/add-filters-to-dataset.py index 3e02a0d0b8..b2d9625b3d 100644 --- a/benchmarks/perf-tool/add-filters-to-dataset.py +++ b/benchmarks/perf-tool/add-filters-to-dataset.py @@ -5,15 +5,41 @@ import h5py -from osb.extensions.data_set import Context, HDF5DataSet +from osb.extensions.data_set import HDF5DataSet """ Script builds complex dataset with additional attributes from exiting dataset that has only vectors. Additional attributes are predefined in the script: color, taste, age. Only HDF5 format of vector dataset is supported. -Script generates additional dataset of neighbours (ground truth) for each filter type. +Output dataset file will have additional dataset 'attributes' with multiple columns, each column corresponds to one attribute +from an attribute set, and value is generated at random, e.g.: -Example of usage: +0: green None 71 +1: green bitter 28 + +there is no explicit index reference in 'attributes' dataset, index of the row corresponds to a document id. +For instance, in example above two rows of fields mapped to documents with ids '0' and '1'. + +If 'generate_filters' flag is set script generates additional dataset of neighbours (ground truth) for each filter type. +Output is a new file with several datasets, each dataset corresponds to one filter. Datasets are named 'neighbour_filter_X' +where X is 1 based index of particular filter. +Each dataset has rows with array of integers, where integer corresponds to +a document id from original dataset with additional fields. Array ca have -1 values that are treated as null, this is because +subset of filtered documents is same of smaller than original set. + +For example, dataset file content may look like : + +neighbour_filter_1: [[ 2, 5, -1], + [ 3, 1, -1], + [ 2 5, 7]] +neighbour_filter_2: [[-1, -1, -1], + [ 5, 6, -1], + [ 4, 2, 1]] + +In this case we do have datasets for two filters, 3 query results for each. [2, 5, -1] indicates that for first query +if filter 1 is used most similar document is with id 2, next similar is 5, and the rest do not pass filter 1 criteria. + +Example of script usage: create new hdf5 file with attribute dataset add-filters-to-dataset.py ~/dev/opensearch/k-NN/benchmarks/perf-tool/dataset/data.hdf5 ~/dev/opensearch/datasets/data-with-attr True False @@ -22,16 +48,12 @@ add-filters-to-dataset.py ~/dev/opensearch/k-NN/benchmarks/perf-tool/dataset/data-with-attr.hdf5 ~/dev/opensearch/datasets/data-with-filters False True """ -class Dataset(): - DEFAULT_INDEX_NAME = "test-index" - DEFAULT_FIELD_NAME = "test-field" - DEFAULT_CONTEXT = Context.INDEX + +class _Dataset: + """Type of dataset container for data with additional attributes""" DEFAULT_TYPE = HDF5DataSet.FORMAT_NAME - DEFAULT_NUM_VECTORS = 10 - DEFAULT_DIMENSION = 10 - DEFAULT_RANDOM_STRING_LENGTH = 8 - def createDataset(self, source_dataset_path, out_file_path, generate_attrs: bool, generate_filters: bool) -> None: + def create_dataset(self, source_dataset_path, out_file_path, generate_attrs: bool, generate_filters: bool) -> None: path_elements = os.path.split(os.path.abspath(source_dataset_path)) data_set_dir = path_elements[0] @@ -104,8 +126,8 @@ def filter3(attributes, vector_idx): # filter 4 - color green or blue and taste is bitter and age is between (30, 60) def filter4(attributes, vector_idx): if (attributes[vector_idx][0].decode() == 'green' or attributes[vector_idx][0].decode() == 'blue') \ - and (attributes[vector_idx][1].decode() == 'bitter') \ - and 30 <= int(attributes[vector_idx][2].decode()) <= 60: + and (attributes[vector_idx][1].decode() == 'bitter') \ + and 30 <= int(attributes[vector_idx][2].decode()) <= 60: return True else: return False @@ -140,7 +162,7 @@ def apply_filter(self, expected_neighbors, attributes, data_set_w_filtering, fil filtered_count += 1 neighbors_filter.append(neighbors_filter_row) overall_count = len(expected_neighbors) * len(expected_neighbors[0]) - perc = float(filtered_count/overall_count) * 100 + perc = float(filtered_count / overall_count) * 100 print('ground truth size for {} is {}, percentage {}'.format(filter_name, filtered_count, perc)) data_set_w_filtering.create_dataset(filter_name, data=neighbors_filter) return expected_neighbors @@ -161,12 +183,13 @@ def main(argv): generate_attr = str2bool(args[2]) generate_filters = str2bool(args[3]) - worker = Dataset() - worker.createDataset(in_file_path, out_file_path, generate_attr, generate_filters) + worker = _Dataset() + worker.create_dataset(in_file_path, out_file_path, generate_attr, generate_filters) + def str2bool(v): - return v.lower() in ("yes", "true", "t", "1") + return v.lower() in ("yes", "true", "t", "1") -if __name__ == "__main__": - main(sys.argv[1:]) +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/benchmarks/perf-tool/okpt/io/dataset.py b/benchmarks/perf-tool/okpt/io/dataset.py index 001563babb..046b6eded6 100644 --- a/benchmarks/perf-tool/okpt/io/dataset.py +++ b/benchmarks/perf-tool/okpt/io/dataset.py @@ -94,7 +94,7 @@ def _parse_context(context: Context, custom_context=None) -> str: return "neighbors" if context == Context.INDEX: - return "train" + return "neighbors" if context == Context.QUERY: return "test" diff --git a/benchmarks/perf-tool/sample-configs/filter-spec/filter-5-spec.json b/benchmarks/perf-tool/sample-configs/filter-spec/filter-5-spec.json index 08acded4b5..fecde03928 100644 --- a/benchmarks/perf-tool/sample-configs/filter-spec/filter-5-spec.json +++ b/benchmarks/perf-tool/sample-configs/filter-spec/filter-5-spec.json @@ -36,12 +36,6 @@ { "color": "sweet" } - }, - { - "term": - { - "taste": "bitter" - } } ] }