Skip to content

Commit

Permalink
Merge pull request #5 from divyegala/python-ann-bench-use-gbench
Browse files Browse the repository at this point in the history
Add `data_export.py`
  • Loading branch information
cjnolet authored Aug 29, 2023
2 parents c28326c + ef7b7f9 commit 39dd3f4
Show file tree
Hide file tree
Showing 6 changed files with 147 additions and 72 deletions.
65 changes: 65 additions & 0 deletions bench/ann/data_export.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import argparse
import pandas as pd
import os
import json


def read_file(dataset, dataset_path, method):
    """Yield benchmark results found under ``<dataset_path>/<dataset>/result/<method>``.

    Scans the directory for Google Benchmark JSON output files and, for
    each one, yields the parsed results.  The algorithm name is taken from
    the file name up to the first ``-`` (run.py names these files
    ``<algo>-k{k}-batch_size{batch_size}.json``).

    Parameters
    ----------
    dataset : str
        Name of the dataset whose results should be read.
    dataset_path : str
        Root folder containing one sub-folder per dataset.
    method : str
        Result category to read, e.g. ``"build"`` or ``"search"``.

    Yields
    ------
    tuple
        ``(json_file_path, algorithm_name, benchmarks_dataframe)`` where the
        DataFrame is built from the JSON file's ``"benchmarks"`` records.
    """
    # NOTE: the original used 'dir' and 'file' as names, shadowing builtins;
    # renamed and the joined path is now computed once.
    result_dir = os.path.join(dataset_path, dataset, "result", method)
    for file_name in os.listdir(result_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(result_dir, file_name)
            with open(file_path, "r") as f:
                data = json.load(f)
                df = pd.DataFrame(data["benchmarks"])
                yield (file_path, file_name.split('-')[0], df)

def convert_json_to_csv_build(dataset, dataset_path):
    """Convert every build-result JSON file for *dataset* into a CSV file.

    For each JSON file produced by run.py under ``.../result/build``, writes
    a CSV with the same base name next to it, containing the columns
    ``algo_name``, ``index_name`` and ``time``.
    """
    for json_path, algo_name, frame in read_file(dataset, dataset_path, "build"):
        # Keep only the index-name portion before the first '/'.
        frame['name'] = frame['name'].str.split('/').str[0]
        out = pd.DataFrame(
            {
                'algo_name': [algo_name] * len(frame),
                'index_name': frame['name'],
                'time': frame['real_time'],
            }
        )
        csv_path = json_path.replace('.json', '.csv')
        out.to_csv(csv_path, index=False)


def convert_json_to_csv_search(dataset, dataset_path):
    """Convert every search-result JSON file for *dataset* into a CSV file.

    For each JSON file produced by run.py under ``.../result/search``, writes
    a CSV with the same base name next to it, containing the columns
    ``algo_name``, ``index_name``, ``recall`` and ``qps``.
    """
    for json_path, algo_name, frame in read_file(dataset, dataset_path, "search"):
        # Keep only the index-name portion before the first '/'.
        frame['name'] = frame['name'].str.split('/').str[0]
        out = pd.DataFrame(
            {
                'algo_name': [algo_name] * len(frame),
                'index_name': frame['name'],
                'recall': frame['Recall'],
                'qps': frame['items_per_second'],
            }
        )
        csv_path = json_path.replace('.json', '.csv')
        out.to_csv(csv_path, index=False)


def main():
    """Parse command-line arguments and export build/search results to CSV.

    Raises
    ------
    RuntimeError
        If ``--dataset-path`` is not supplied and the ``RAFT_HOME``
        environment variable is unset.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--dataset",
                        help="name of the dataset to export results for",
                        default="glove-100-inner")
    # Default is resolved lazily below: the original evaluated
    # os.getenv("RAFT_HOME") eagerly, so an unset RAFT_HOME crashed with a
    # TypeError even when --dataset-path was given explicitly.
    parser.add_argument("--dataset-path",
                        help="path to dataset folder "
                             "(default: ${RAFT_HOME}/bench/ann/data)",
                        default=None)
    args = parser.parse_args()

    dataset_path = args.dataset_path
    if dataset_path is None:
        raft_home = os.getenv("RAFT_HOME")
        if raft_home is None:
            raise RuntimeError(
                "RAFT_HOME environment variable must be set when "
                "--dataset-path is not provided")
        dataset_path = os.path.join(raft_home, "bench", "ann", "data")

    convert_json_to_csv_build(args.dataset, dataset_path)
    convert_json_to_csv_search(args.dataset, dataset_path)


# Allow the module to be executed directly as a script.
if __name__ == "__main__":
    main()
74 changes: 31 additions & 43 deletions bench/ann/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,18 +203,18 @@ def create_plot_build(build_results, search_results, linestyles, fn_out,
xn = "k-nn"
yn = "qps"

recall_85 = [-1] * len(linestyles)
qps_85 = [-1] * len(linestyles)
bt_85 = [0] * len(linestyles)
i_85 = [-1] * len(linestyles)
recall_90 = [-1] * len(linestyles)

qps_90 = [-1] * len(linestyles)
bt_90 = [0] * len(linestyles)
i_90 = [-1] * len(linestyles)
recall_95 = [-1] * len(linestyles)

qps_95 = [-1] * len(linestyles)
bt_95 = [0] * len(linestyles)
i_95 = [-1] * len(linestyles)

data = OrderedDict()
colors = OrderedDict()

Expand Down Expand Up @@ -248,7 +248,7 @@ def mean_y(algo):
plt.figure(figsize=(12, 9))
ax = df.plot.bar(rot=0, color=colors)
fig = ax.get_figure()
print(f"writing search output to {fn_out}")
print(f"writing build output to {fn_out}")
plt.title("Build Time for Highest QPS")
plt.suptitle(f"{dataset} k={k} batch_size={batch_size}")
plt.ylabel("Build Time (s)")
Expand All @@ -258,45 +258,33 @@ def mean_y(algo):
def load_lines(results_path, result_files, method, index_key):
results = dict()

linebreaker = "name,iterations"

for result_filename in result_files:
with open(os.path.join(results_path, result_filename), 'r') as f:
lines = f.readlines()
lines = lines[:-1] if lines[-1] == "\n" else lines
idx = 0
for pos, line in enumerate(lines):
if linebreaker in line:
idx = pos
break

if method == "build":
if "hnswlib" in result_filename:
if result_filename.endswith('.csv'):
with open(os.path.join(results_path, result_filename), 'r') as f:
lines = f.readlines()
lines = lines[:-1] if lines[-1] == "\n" else lines

if method == "build":
key_idx = [2]
else:
key_idx = [10]
elif method == "search":
if "hnswlib" in result_filename:
key_idx = [10, 6]
else:
key_idx = [12, 10]

for line in lines[idx+1:]:
split_lines = line.split(',')

algo_name = split_lines[0].split('.')[0].strip("\"")
index_name = split_lines[0].split('/')[0].strip("\"")

if index_key == "algo":
dict_key = algo_name
elif index_key == "index":
dict_key = (algo_name, index_name)
if dict_key not in results:
results[dict_key] = []
to_add = [algo_name, index_name]
for key_i in key_idx:
to_add.append(float(split_lines[key_i]))
results[dict_key].append(to_add)
elif method == "search":
key_idx = [2, 3]

for line in lines[1:]:
split_lines = line.split(',')

algo_name = split_lines[0]
index_name = split_lines[1]

if index_key == "algo":
dict_key = algo_name
elif index_key == "index":
dict_key = (algo_name, index_name)
if dict_key not in results:
results[dict_key] = []
to_add = [algo_name, index_name]
for key_i in key_idx:
to_add.append(float(split_lines[key_i]))
results[dict_key].append(to_add)

return results

Expand Down Expand Up @@ -375,8 +363,8 @@ def main():
build = args.build
search = args.search

search_output_filepath = os.path.join(args.output_filepath, f"search-{args.dataset}-{k}-{batch_size}.png")
build_output_filepath = os.path.join(args.output_filepath, f"build-{args.dataset}-{k}-{batch_size}.png")
search_output_filepath = os.path.join(args.output_filepath, f"search-{args.dataset}-k{k}-batch_size{batch_size}.png")
build_output_filepath = os.path.join(args.output_filepath, f"build-{args.dataset}-k{k}-batch_size{batch_size}.png")

search_results = load_all_results(
os.path.join(args.dataset_path, args.dataset),
Expand Down
13 changes: 6 additions & 7 deletions bench/ann/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@ def find_executable(algos_conf, algo, k, batch_size):
executable)
build_path = os.path.join(os.getenv("RAFT_HOME"), "cpp", "build", executable)
if os.path.exists(conda_path):
return (executable, conda_path, f"{algo}-{k}-{batch_size}")
return (executable, conda_path, f"{algo}-k{k}-batch_size{batch_size}")
elif os.path.exists(build_path):
return (executable, build_path, f"{algo}-{k}-{batch_size}")
return (executable, build_path, f"{algo}-k{k}-batch_size{batch_size}")
else:
raise FileNotFoundError(executable)

Expand Down Expand Up @@ -72,8 +72,8 @@ def run_build_and_search(conf_file, conf_filename, conf_filedir,
cmd = [ann_executable_path,
"--build",
"--data_prefix="+dataset_path,
"--benchmark_out_format=csv",
f"--benchmark_out={os.path.join(build_folder, f'{algo}.csv')}"]
"--benchmark_out_format=json",
f"--benchmark_out={os.path.join(build_folder, f'{algo}.json')}"]
if force:
cmd = cmd + ["--overwrite"]
cmd = cmd + [temp_conf_filepath]
Expand All @@ -90,9 +90,8 @@ def run_build_and_search(conf_file, conf_filename, conf_filedir,
"--benchmark_counters_tabular",
"--override_kv=k:%s" % k,
"--override_kv=n_queries:%s" % batch_size,
"--benchmark_min_warmup_time=0.01",
"--benchmark_out_format=csv",
f"--benchmark_out={os.path.join(search_folder, f'{algo}.csv')}"]
"--benchmark_out_format=json",
f"--benchmark_out={os.path.join(search_folder, f'{algo}.json')}"]
if force:
cmd = cmd + ["--overwrite"]
cmd = cmd + [temp_conf_filepath]
Expand Down
13 changes: 2 additions & 11 deletions cpp/bench/ann/src/common/benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,17 +52,8 @@ auto load_lib(const std::string& algo) -> void*
auto found = libs.find(algo);

if (found != libs.end()) { return found->second.handle; }
auto lib_name = "lib" + algo + "_ann_bench.so";
std::string lib_path = "";
if (std::getenv("CONDA_PREFIX") != nullptr) {
auto conda_path = std::string(std::getenv("CONDA_PREFIX")) + "/bin" + "/ann/";
if (std::filesystem::exists(conda_path + "ANN_BENCH")) { lib_path = conda_path; }
}
if (std::getenv("RAFT_HOME") != nullptr) {
auto build_path = std::string(std::getenv("RAFT_HOME")) + "/cpp" + "/build/";
if (std::filesystem::exists(build_path + "ANN_BENCH")) { lib_path = build_path; }
}
return libs.emplace(algo, lib_path + lib_name).first->second.handle;
auto lib_name = "lib" + algo + "_ann_bench.so";
return libs.emplace(algo, lib_name).first->second.handle;
}

auto get_fun_name(void* addr) -> std::string
Expand Down
2 changes: 1 addition & 1 deletion dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,8 @@ dependencies:
- benchmark>=1.8.2
- faiss-proc=*=cuda
- matplotlib
- pyyaml
- pandas
- pyyaml

cudatoolkit:
specific:
Expand Down
52 changes: 42 additions & 10 deletions docs/source/raft_ann_benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,11 @@ Please see the [build instructions](ann_benchmarks_build.md) to build the benchm
## Running the benchmarks

### Usage
There are 3 general steps to running the benchmarks and vizualizing the results:
There are 4 general steps to running the benchmarks and visualizing the results:
1. Prepare Dataset
2. Build Index and Search Index
3. Plot Results
3. Data Export
4. Plot Results

We provide a collection of lightweight Python scripts that are wrappers over
lower level scripts and executables to run our benchmarks. Either Python scripts or
Expand All @@ -46,7 +47,10 @@ python bench/ann/get_dataset.py --dataset deep-image-96-angular --normalize
# (2) build and search index
python bench/ann/run.py --dataset deep-image-96-inner

# (3) plot results
# (3) export data
python bench/ann/data_export.py --dataset deep-image-96-inner

# (4) plot results
python bench/ann/plot.py --dataset deep-image-96-inner
```

Expand Down Expand Up @@ -82,7 +86,10 @@ python bench/ann/split_groundtruth.py --groundtruth bench/ann/data/deep-1B/deep_
# (2) build and search index
python bench/ann/run.py --dataset deep-1B

# (3) plot results
# (3) export data
python bench/ann/data_export.py --dataset deep-1B

# (4) plot results
python bench/ann/plot.py --dataset deep-1B
```

Expand Down Expand Up @@ -160,27 +167,44 @@ options:
The configuration file should be named as `<dataset>.json`. It is optional if the name of the dataset is
provided with the `dataset` argument, in which case
a configuration file will be searched for as `${RAFT_HOME}/bench/ann/conf/<dataset>.json`.
For every algorithm run by this script, it outputs an index build statistics CSV file in `<dataset-path/<dataset>/result/build/<algo.csv>
and an index search statistics CSV file in `<dataset-path/<dataset>/result/search/<algo.csv>.
For every algorithm run by this script, it outputs an index build statistics JSON file in `<dataset-path>/<dataset>/result/build/<algo-k{k}-batch_size{batch_size}.json>`
and an index search statistics JSON file in `<dataset-path>/<dataset>/result/search/<algo-k{k}-batch_size{batch_size}.json>`.

`dataset-path` :
1. data is read from `<dataset-path>/<dataset>`
2. indices are built in `<dataset-path>/<dataset>/index`
3. search results are stored in `<dataset-path>/<dataset>/result`
3. build/search results are stored in `<dataset-path>/<dataset>/result`

`build` and `search` : if both parameters are not supplied to the script then
it is assumed both are `True`.

`indices` and `algorithms` : these parameters ensure that the algorithm specified for an index
is available in `algos.yaml` and not disabled, as well as having an associated executable.

#### Step 3: Plot Results
#### Step 3: Data Export
The script `bench/ann/data_export.py` will convert the intermediate JSON outputs produced by `bench/ann/run.py` to more
easily readable CSV files, which are needed to build charts made by `bench/ann/plot.py`.

```bash
usage: data_export.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH]
options:
-h, --help show this help message and exit
--dataset DATASET dataset to download (default: glove-100-inner)
--dataset-path DATASET_PATH
path to dataset folder (default: ${RAFT_HOME}/bench/ann/data)
```
Build statistics CSV file is stored in `<dataset-path>/<dataset>/result/build/<algo-k{k}-batch_size{batch_size}.csv>`
and index search statistics CSV file in `<dataset-path>/<dataset>/result/search/<algo-k{k}-batch_size{batch_size}.csv>`.

#### Step 4: Plot Results
The script `bench/ann/plot.py` will plot results for all algorithms found in index search statistics
CSV file in `<dataset-path/<dataset>/search/result/<algo.csv>.
CSV file in `<dataset-path>/<dataset>/result/search/<algo-k{k}-batch_size{batch_size}.csv>`.

The usage of this script is:
```bash
usage: plot.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--raw]
usage: plot.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--algorithms ALGORITHMS] [-k COUNT] [-bs BATCH_SIZE] [--build] [--search]
[--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--raw]
options:
-h, --help show this help message and exit
Expand All @@ -189,6 +213,14 @@ options:
path to dataset folder (default: ${RAFT_HOME}/bench/ann/data)
--output-filepath OUTPUT_FILEPATH
directory for PNG to be saved (default: os.getcwd())
--algorithms ALGORITHMS
plot only comma separated list of named algorithms (default: None)
-k COUNT, --count COUNT
the number of nearest neighbors to search for (default: 10)
-bs BATCH_SIZE, --batch-size BATCH_SIZE
number of query vectors to use in each query trial (default: 10000)
--build
--search
--x-scale X_SCALE Scale to use when drawing the X-axis. Typically linear, logit or a2 (default: linear)
--y-scale {linear,log,symlog,logit}
Scale to use when drawing the Y-axis (default: linear)
Expand Down

0 comments on commit 39dd3f4

Please sign in to comment.