
Commit

Cleaning up code, adding docs
Signed-off-by: Martin Gaievski <[email protected]>
martin-gaievski committed Nov 4, 2022
1 parent fb8f345 commit bc09d85
Showing 3 changed files with 56 additions and 34 deletions.
82 changes: 55 additions & 27 deletions benchmarks/perf-tool/add-filters-to-dataset.py
@@ -1,19 +1,41 @@
import getopt
import os
import random
import sys

import h5py

from osb.extensions.data_set import Context, HDF5DataSet

# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
"""
Script builds a complex dataset with additional attributes from an existing dataset that has only vectors.
Additional attributes are predefined in the script: color, taste, age. Only the HDF5 format of vector datasets is supported.
Script generates an additional dataset of neighbours (ground truth) for each filter type.
The output dataset file will have an additional dataset 'attributes' with multiple columns; each column corresponds to one attribute
from the attribute set, and values are generated at random, e.g.:
0: green None 71
1: green bitter 28
There is no explicit index reference in the 'attributes' dataset; the index of a row corresponds to a document id.
For instance, in the example above the two rows of fields are mapped to documents with ids '0' and '1'.
If the 'generate_filters' flag is set, the script generates an additional dataset of neighbours (ground truth) for each filter type.
Output is a new file with several datasets, where each dataset corresponds to one filter. Datasets are named 'neighbour_filter_X',
where X is the 1-based index of a particular filter.
Each dataset has rows with arrays of integers, where each integer corresponds to
a document id from the original dataset with additional fields. An array can have -1 values that are treated as null; this is because
the subset of filtered documents is the same size as or smaller than the original set.
For example, dataset file content may look like:
neighbour_filter_1: [[ 2, 5, -1],
[ 3, 1, -1],
[ 2, 5, 7]]
neighbour_filter_2: [[-1, -1, -1],
[ 5, 6, -1],
[ 4, 2, 1]]
Example of usage:
In this case we have datasets for two filters, with 3 query results for each. [2, 5, -1] indicates that, for the first query,
if filter 1 is used the most similar document has id 2, the next most similar has id 5, and the rest do not pass the filter 1 criteria.
Example of script usage:
create a new hdf5 file with an attributes dataset
add-filters-to-dataset.py ~/dev/opensearch/k-NN/benchmarks/perf-tool/dataset/data.hdf5 ~/dev/opensearch/datasets/data-with-attr True False
@@ -22,16 +44,21 @@
add-filters-to-dataset.py ~/dev/opensearch/k-NN/benchmarks/perf-tool/dataset/data-with-attr.hdf5 ~/dev/opensearch/datasets/data-with-filters False True
"""

class Dataset():
DEFAULT_INDEX_NAME = "test-index"
DEFAULT_FIELD_NAME = "test-field"
DEFAULT_CONTEXT = Context.INDEX
import getopt
import os
import random
import sys

import h5py

from osb.extensions.data_set import HDF5DataSet


class _Dataset:
"""Type of dataset container for data with additional attributes"""
DEFAULT_TYPE = HDF5DataSet.FORMAT_NAME
DEFAULT_NUM_VECTORS = 10
DEFAULT_DIMENSION = 10
DEFAULT_RANDOM_STRING_LENGTH = 8

def createDataset(self, source_dataset_path, out_file_path, generate_attrs: bool, generate_filters: bool) -> None:
def create_dataset(self, source_dataset_path, out_file_path, generate_attrs: bool, generate_filters: bool) -> None:
path_elements = os.path.split(os.path.abspath(source_dataset_path))
data_set_dir = path_elements[0]

@@ -104,8 +131,8 @@ def filter3(attributes, vector_idx):
# filter 4 - color green or blue and taste is bitter and age is between (30, 60)
def filter4(attributes, vector_idx):
if (attributes[vector_idx][0].decode() == 'green' or attributes[vector_idx][0].decode() == 'blue') \
and (attributes[vector_idx][1].decode() == 'bitter') \
and 30 <= int(attributes[vector_idx][2].decode()) <= 60:
and (attributes[vector_idx][1].decode() == 'bitter') \
and 30 <= int(attributes[vector_idx][2].decode()) <= 60:
return True
else:
return False
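To illustrate the predicate above: attribute values are stored as byte strings, hence the .decode() calls. A quick check with hand-made rows (the values are illustrative, and filter4 is the function defined above):

attributes = [
    [b'green', b'bitter', b'45'],  # passes: color, taste and age all match
    [b'red', b'bitter', b'45'],    # fails: color is neither green nor blue
]
print(filter4(attributes, 0))  # True
print(filter4(attributes, 1))  # False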
@@ -140,7 +167,7 @@ def apply_filter(self, expected_neighbors, attributes, data_set_w_filtering, fil
filtered_count += 1
neighbors_filter.append(neighbors_filter_row)
overall_count = len(expected_neighbors) * len(expected_neighbors[0])
perc = float(filtered_count/overall_count) * 100
perc = float(filtered_count / overall_count) * 100
print('ground truth size for {} is {}, percentage {}'.format(filter_name, filtered_count, perc))
data_set_w_filtering.create_dataset(filter_name, data=neighbors_filter)
return expected_neighbors
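A condensed sketch of the padding scheme apply_filter implements (the helper name is hypothetical, not from the script): neighbor ids that pass the filter are kept in order, and the row is padded back to its original length with -1.

def pad_filtered_row(neighbor_ids, passes_filter, row_len):
    # Keep ids that satisfy the filter, then pad with -1 ("null") entries.
    kept = [doc_id for doc_id in neighbor_ids if passes_filter(doc_id)]
    return kept + [-1] * (row_len - len(kept))

print(pad_filtered_row([2, 5, 7], lambda doc_id: doc_id != 7, 3))  # [2, 5, -1]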
@@ -161,12 +188,13 @@ def main(argv):
generate_attr = str2bool(args[2])
generate_filters = str2bool(args[3])

worker = Dataset()
worker.createDataset(in_file_path, out_file_path, generate_attr, generate_filters)
worker = _Dataset()
worker.create_dataset(in_file_path, out_file_path, generate_attr, generate_filters)


def str2bool(v):
return v.lower() in ("yes", "true", "t", "1")
return v.lower() in ("yes", "true", "t", "1")

if __name__ == "__main__":
main(sys.argv[1:])

if __name__ == "__main__":
main(sys.argv[1:])
2 changes: 1 addition & 1 deletion benchmarks/perf-tool/okpt/io/dataset.py
@@ -94,7 +94,7 @@ def _parse_context(context: Context, custom_context=None) -> str:
return "neighbors"

if context == Context.INDEX:
return "train"
return "neighbors"

if context == Context.QUERY:
return "test"
@@ -36,12 +36,6 @@
{
"color": "sweet"
}
},
{
"term":
{
"taste": "bitter"
}
}
]
}
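For reference, a sketch of the remaining filter after this change, assuming the term clauses sit in a bool query's clause array (the enclosing structure is not shown in the diff); built as a Python dict to keep the snippet self-contained:

import json

remaining_filter = {
    "bool": {
        "should": [
            {"term": {"color": "sweet"}}  # the clause kept by this change
        ]
    }
}
print(json.dumps(remaining_filter, indent=2))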
