Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add compare.py to compare the output of multiple benchmarks #5655

Merged
merged 5 commits into from
Mar 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 45 additions & 1 deletion benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,56 @@ cargo run --release --bin tpch -- convert --input ./data --output /mnt/tpch-parq

Or if you want to verify and run all the queries in the benchmark, you can just run `cargo test`.

### Machine readable benchmark summary
### Comparing results between runs

Any `tpch` execution with the `-o <dir>` argument will produce a summary file right under the `<dir>`
directory. It is a JSON serialized form of all the runs that happened as well as the runtime metadata
(number of cores, DataFusion version, etc.).

```shell
$ git checkout main
# generate a summary file in /tmp/output_main
$ mkdir -p /tmp/output_main
$ cargo run --release --bin tpch -- benchmark datafusion --iterations 5 --path ./data --format parquet -o /tmp/output_main
# generate a summary file in /tmp/output_branch
$ mkdir -p /tmp/output_branch
$ git checkout my_branch
$ cargo run --release --bin tpch -- benchmark datafusion --iterations 5 --path ./data --format parquet -o /tmp/output_branch
# compare the results:
$ ./compare.py /tmp/output_main/tpch-summary--1679330119.json /tmp/output_branch/tpch-summary--1679328405.json
```

This will produce output like:

```
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Query ┃ /home/alamb… ┃ /home/alamb… ┃ Change ┃
┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ Q1 │ 16252.56ms │ 16031.82ms │ no change │
│ Q2 │ 3994.56ms │ 4353.75ms │ 1.09x slower │
│ Q3 │ 5572.06ms │ 5620.27ms │ no change │
│ Q4 │ 2144.14ms │ 2194.67ms │ no change │
│ Q5 │ 7796.93ms │ 7646.74ms │ no change │
│ Q6 │ 4382.32ms │ 4327.16ms │ no change │
│ Q7 │ 18702.50ms │ 19922.74ms │ 1.07x slower │
│ Q8 │ 7383.74ms │ 7616.21ms │ no change │
│ Q9 │ 13855.17ms │ 14408.42ms │ no change │
│ Q10 │ 7446.05ms │ 8030.00ms │ 1.08x slower │
│ Q11 │ 3414.81ms │ 3850.34ms │ 1.13x slower │
│ Q12 │ 3027.16ms │ 3085.89ms │ no change │
│ Q13 │ 18859.06ms │ 18627.02ms │ no change │
│ Q14 │ 4157.91ms │ 4140.22ms │ no change │
│ Q15 │ 5293.05ms │ 5369.17ms │ no change │
│ Q16 │ 6512.42ms │ 3011.58ms │ +2.16x faster │
│ Q17 │ 86253.33ms │ 76036.06ms │ +1.13x faster │
│ Q18 │ 45101.99ms │ 49717.76ms │ 1.10x slower │
│ Q19 │ 7323.15ms │ 7409.85ms │ no change │
│ Q20 │ 19902.39ms │ 20965.94ms │ 1.05x slower │
│ Q21 │ 22040.06ms │ 23184.84ms │ 1.05x slower │
│ Q22 │ 2011.87ms │ 2143.62ms │ 1.07x slower │
└──────────────┴──────────────┴──────────────┴───────────────┘
```

## Expected output

The result of query 1 should produce the following output when executed against the SF=1 dataset.
Expand Down
184 changes: 184 additions & 0 deletions benchmarks/compare.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
#!/usr/bin/env python
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


from __future__ import annotations

import json
import statistics
from dataclasses import dataclass
from typing import Dict, List, Any
from pathlib import Path
from argparse import ArgumentParser

try:
from rich.console import Console
from rich.table import Table
except ImportError:
print("Try `pip install rich` for using this script.")
raise

# Minimum number of iterations a query needs before QueryRun.execution_time
# reports the median; with fewer samples it reports the mean instead.
MEAN_THRESHOLD = 5


@dataclass
class QueryResult:
    """Measurements recorded for a single iteration of a query."""

    # Value of the "elapsed" entry of the iteration record.
    elapsed: float
    # Value of the "row_count" entry of the iteration record.
    row_count: int

    @classmethod
    def load_from(cls, data: Dict[str, Any]) -> QueryResult:
        """Build a result from the ``elapsed`` and ``row_count`` keys of *data*.

        Keys other than those two are ignored. A missing key raises
        ``KeyError``.
        """
        elapsed = data["elapsed"]
        row_count = data["row_count"]
        return cls(elapsed, row_count)


@dataclass
class QueryRun:
    """Every recorded iteration of one benchmark query."""

    # Query identifier, taken from the "query" entry.
    query: int
    # One QueryResult per entry of the "iterations" list.
    iterations: List[QueryResult]
    # Value of the "start_time" entry.
    start_time: int

    @classmethod
    def load_from(cls, data: Dict[str, Any]) -> QueryRun:
        """Build a run from the ``query``, ``iterations`` and ``start_time`` keys.

        Each iteration record goes through ``QueryResult.load_from`` so that
        all parsing of iteration records lives in one place and unknown extra
        keys in a record are ignored rather than raising ``TypeError``.
        """
        return cls(
            query=data["query"],
            iterations=[
                QueryResult.load_from(iteration) for iteration in data["iterations"]
            ],
            start_time=data["start_time"],
        )

    @property
    def execution_time(self) -> float:
        """Representative elapsed time across all iterations.

        Returns the mean when there are fewer than ``MEAN_THRESHOLD``
        iterations and the median otherwise.
        """
        assert len(self.iterations) >= 1

        samples = [iteration.elapsed for iteration in self.iterations]

        # If we don't have enough samples, median() is probably
        # going to be a worse measure than just an average.
        if len(samples) < MEAN_THRESHOLD:
            return statistics.mean(samples)
        return statistics.median(samples)


@dataclass
class Context:
    """Runtime metadata stored alongside the query results of a run."""

    benchmark_version: str
    datafusion_version: str
    num_cpus: int
    start_time: int
    # Command-line arguments recorded for the run.
    arguments: List[str]
    # Label identifying the run, derived from ``arguments``.
    branch: str

    @staticmethod
    def _label_from_arguments(arguments: List[str]) -> str:
        """Pick the label that identifies a run from its recorded arguments.

        Preference order:
        1. the token following ``-o`` / ``--output``;
        2. the tenth argument, which is what earlier versions always used;
        3. the placeholder ``"unknown"`` when neither is available.
        """
        # NOTE(review): ``--output`` as the long spelling of ``-o`` for the
        # benchmark subcommand is an assumption; confirm against the CLI.
        for position, argument in enumerate(arguments[:-1]):
            if argument in ("-o", "--output"):
                return arguments[position + 1]

        # Legacy fallback: a fixed position in the argument list.
        legacy_index = 9
        if len(arguments) > legacy_index:
            return arguments[legacy_index]

        return "unknown"

    @classmethod
    def load_from(cls, data: Dict[str, Any]) -> Context:
        """Build a context from the metadata keys of *data*.

        Reads ``benchmark_version``, ``datafusion_version``, ``num_cpus``,
        ``start_time`` and ``arguments``; a missing key raises ``KeyError``.
        ``branch`` is derived from ``arguments`` and no longer raises
        ``IndexError`` when fewer than ten arguments were recorded.
        """
        arguments = data["arguments"]
        return cls(
            benchmark_version=data["benchmark_version"],
            datafusion_version=data["datafusion_version"],
            num_cpus=data["num_cpus"],
            start_time=data["start_time"],
            arguments=arguments,
            branch=cls._label_from_arguments(arguments),
        )


@dataclass
class BenchmarkRun:
    """A complete benchmark summary: run metadata plus per-query results."""

    context: Context
    queries: List[QueryRun]

    @classmethod
    def load_from(cls, data: Dict[str, Any]) -> BenchmarkRun:
        """Build a run from the ``context`` and ``queries`` keys of *data*."""
        return cls(
            context=Context.load_from(data["context"]),
            queries=[QueryRun.load_from(result) for result in data["queries"]],
        )

    @classmethod
    def load_from_file(cls, path: Path) -> BenchmarkRun:
        """Parse the JSON summary file at *path* into a ``BenchmarkRun``."""
        # JSON is UTF-8; do not depend on the platform's locale encoding.
        with open(path, "r", encoding="utf-8") as f:
            return cls.load_from(json.load(f))


def _describe_change(
    baseline_time: float,
    comparison_time: float,
    noise_threshold: float,
) -> str:
    """Return the "Change" cell text for one query.

    The ratio ``comparison_time / baseline_time`` is reported as
    ``"no change"`` when it lies within ``1.0 +/- noise_threshold``, as
    ``"+N.NNx faster"`` when below 1.0 and as ``"N.NNx slower"`` otherwise.
    Raises ``ZeroDivisionError`` when the ratio cannot be formed (either
    time is zero and outside the noise band).
    """
    ratio = comparison_time / baseline_time
    if (1.0 - noise_threshold) <= ratio <= (1.0 + noise_threshold):
        return "no change"
    if ratio < 1.0:
        return f"+{(1 / ratio):.2f}x faster"
    return f"{ratio:.2f}x slower"


def compare(
    baseline_path: Path,
    comparison_path: Path,
    noise_threshold: float,
) -> None:
    """Print a table comparing per-query execution times of two summary files.

    Args:
        baseline_path: summary file used as the reference run.
        comparison_path: summary file measured against the baseline.
        noise_threshold: relative difference (e.g. ``0.05``) below which a
            query is reported as "no change".

    Queries are paired by their query id rather than by position, so runs
    that cover different or differently ordered query sets are still compared
    correctly. Queries present in only one run are listed after the table
    instead of being silently dropped.
    """
    baseline = BenchmarkRun.load_from_file(baseline_path)
    comparison = BenchmarkRun.load_from_file(comparison_path)

    console = Console()

    table = Table(show_header=True, header_style="bold magenta")
    table.add_column("Query", style="dim", width=12)
    table.add_column(baseline.context.branch, justify="right", style="dim", width=12)
    table.add_column(comparison.context.branch, justify="right", style="dim", width=12)
    table.add_column("Change", justify="right", style="dim")

    comparison_by_query = {run.query: run for run in comparison.queries}
    baseline_query_ids = {run.query for run in baseline.queries}

    for baseline_result in baseline.queries:
        comparison_result = comparison_by_query.get(baseline_result.query)
        if comparison_result is None:
            continue

        # execution_time is recomputed on every access; read it once per row.
        baseline_time = baseline_result.execution_time
        comparison_time = comparison_result.execution_time

        table.add_row(
            f"Q{baseline_result.query}",
            f"{baseline_time:.2f}ms",
            f"{comparison_time:.2f}ms",
            _describe_change(baseline_time, comparison_time, noise_threshold),
        )

    console.print(table)

    unmatched = sorted(baseline_query_ids ^ set(comparison_by_query))
    if unmatched:
        skipped = ", ".join(f"Q{query}" for query in unmatched)
        console.print(f"Skipped queries not present in both runs: {skipped}")


def main(argv: List[str] | None = None) -> None:
    """Parse the command line and print the comparison table.

    Args:
        argv: arguments to parse instead of ``sys.argv[1:]``; ``None`` (the
            default) keeps the normal command-line behaviour.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "baseline_path",
        type=Path,
        help="Path to the baseline summary file.",
    )
    parser.add_argument(
        "comparison_path",
        type=Path,
        help="Path to the comparison summary file.",
    )
    parser.add_argument(
        "--noise-threshold",
        type=float,
        default=0.05,
        # argparse applies %-formatting to help strings, so a literal percent
        # sign must be written as "%%"; a bare "%5)" makes --help raise.
        help="The threshold for statistically insignificant results (+/- 5%%).",
    )

    options = parser.parse_args(argv)

    compare(options.baseline_path, options.comparison_path, options.noise_threshold)



# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()