diff --git a/benchmarks/README.md b/benchmarks/README.md
index 0d13ed9764f7..d397def8f8e2 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -76,12 +76,56 @@ cargo run --release --bin tpch -- convert --input ./data --output /mnt/tpch-parq
 
 Or if you want to verify and run all the queries in the benchmark, you can just run `cargo test`.
 
-### Machine readable benchmark summary
+### Comparing results between runs
 
 Any `tpch` execution with the `-o <dir>` argument will produce a summary file right under the `<dir>` directory. It is a JSON serialized form of all the runs that happened as well as the runtime metadata (number of cores, DataFusion version, etc.).
 
+```shell
+$ git checkout main
+# run the benchmark and write a summary to /tmp/output_main
+$ mkdir -p /tmp/output_main
+$ cargo run --release --bin tpch -- benchmark datafusion --iterations 5 --path ./data --format parquet -o /tmp/output_main
+# run the same benchmark on my_branch and write a summary to /tmp/output_branch
+$ mkdir -p /tmp/output_branch
+$ git checkout my_branch
+$ cargo run --release --bin tpch -- benchmark datafusion --iterations 5 --path ./data --format parquet -o /tmp/output_branch
+# compare the results:
+$ ./compare.py /tmp/output_main/tpch-summary--1679330119.json /tmp/output_branch/tpch-summary--1679328405.json
+```
+
+This will produce output like:
+
+```
+┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
+┃ Query        ┃ /home/alamb… ┃ /home/alamb… ┃        Change ┃
+┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
+│ Q1           │   16252.56ms │   16031.82ms │     no change │
+│ Q2           │    3994.56ms │    4353.75ms │  1.09x slower │
+│ Q3           │    5572.06ms │    5620.27ms │     no change │
+│ Q4           │    2144.14ms │    2194.67ms │     no change │
+│ Q5           │    7796.93ms │    7646.74ms │     no change │
+│ Q6           │    4382.32ms │    4327.16ms │     no change │
+│ Q7           │   18702.50ms │   19922.74ms │  1.07x slower │
+│ Q8           │    7383.74ms │    7616.21ms │     no change │
+│ Q9           │   13855.17ms │   14408.42ms │     no change │
+│ Q10          │    7446.05ms │    8030.00ms │  1.08x slower │
+│ Q11          │    3414.81ms │    3850.34ms │  1.13x slower │
+│ Q12          │    3027.16ms │    3085.89ms │     no change │
+│ Q13          │   18859.06ms │   18627.02ms │     no change │
+│ Q14          │    4157.91ms │    4140.22ms │     no change │
+│ Q15          │    5293.05ms │    5369.17ms │     no change │
+│ Q16          │    6512.42ms │    3011.58ms │ +2.16x faster │
+│ Q17          │   86253.33ms │   76036.06ms │ +1.13x faster │
+│ Q18          │   45101.99ms │   49717.76ms │  1.10x slower │
+│ Q19          │    7323.15ms │    7409.85ms │     no change │
+│ Q20          │   19902.39ms │   20965.94ms │  1.05x slower │
+│ Q21          │   22040.06ms │   23184.84ms │  1.05x slower │
+│ Q22          │    2011.87ms │    2143.62ms │  1.07x slower │
+└──────────────┴──────────────┴──────────────┴───────────────┘
+```
+
 ## Expected output
 
 The result of query 1 should produce the following output when executed against the SF=1 dataset.
diff --git a/benchmarks/compare.py b/benchmarks/compare.py
new file mode 100755
index 000000000000..aa3871c18b23
--- /dev/null
+++ b/benchmarks/compare.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+from __future__ import annotations
+
+import json
+import statistics
+from dataclasses import dataclass
+from typing import Dict, List, Any
+from pathlib import Path
+from argparse import ArgumentParser
+
+try:
+    from rich.console import Console
+    from rich.table import Table
+except ImportError:
+    print("This script requires the `rich` package; install it with `pip install rich`.")
+    raise
+
+# Use the mean when there are fewer iterations than this; otherwise use the median.
+MEAN_THRESHOLD = 5
+
+
+@dataclass
+class QueryResult:
+    elapsed: float
+    row_count: int
+
+    @classmethod
+    def load_from(cls, data: Dict[str, Any]) -> QueryResult:
+        return cls(elapsed=data["elapsed"], row_count=data["row_count"])
+
+
+@dataclass
+class QueryRun:
+    query: int
+    iterations: List[QueryResult]
+    start_time: int
+
+    @classmethod
+    def load_from(cls, data: Dict[str, Any]) -> QueryRun:
+        return cls(
+            query=data["query"],
+            iterations=[QueryResult(**iteration) for iteration in data["iterations"]],
+            start_time=data["start_time"],
+        )
+
+    @property
+    def execution_time(self) -> float:
+        assert len(self.iterations) >= 1
+
+        # If we don't have enough samples, median() is probably
+        # going to be a worse measure than just an average.
+        if len(self.iterations) < MEAN_THRESHOLD:
+            method = statistics.mean
+        else:
+            method = statistics.median
+
+        return method(iteration.elapsed for iteration in self.iterations)
+
+
+@dataclass
+class Context:
+    benchmark_version: str
+    datafusion_version: str
+    num_cpus: int
+    start_time: int
+    arguments: List[str]
+    branch: str
+
+    @classmethod
+    def load_from(cls, data: Dict[str, Any]) -> Context:
+        return cls(
+            benchmark_version=data["benchmark_version"],
+            datafusion_version=data["datafusion_version"],
+            num_cpus=data["num_cpus"],
+            start_time=data["start_time"],
+            arguments=data["arguments"],
+            # Fragile: assumes the invocation shown in the README, where
+            # arguments[9] is the value passed to `-o` (the output directory).
+            # It is only used to label a column in the comparison table.
+            branch=data["arguments"][9],
+        )
+
+
+@dataclass
+class BenchmarkRun:
+    context: Context
+    queries: List[QueryRun]
+
+    @classmethod
+    def load_from(cls, data: Dict[str, Any]) -> BenchmarkRun:
+        return cls(
+            context=Context.load_from(data["context"]),
+            queries=[QueryRun.load_from(result) for result in data["queries"]],
+        )
+
+    @classmethod
+    def load_from_file(cls, path: Path) -> BenchmarkRun:
+        with open(path, "r") as f:
+            return cls.load_from(json.load(f))
+
+
+def compare(
+    baseline_path: Path,
+    comparison_path: Path,
+    noise_threshold: float,
+) -> None:
+    baseline = BenchmarkRun.load_from_file(baseline_path)
+    baseline_branch = baseline.context.branch
+
+    comparison = BenchmarkRun.load_from_file(comparison_path)
+    comparison_branch = comparison.context.branch
+
+    console = Console()
+
+    table = Table(show_header=True, header_style="bold magenta")
+    table.add_column("Query", style="dim", width=12)
+    table.add_column(baseline_branch, justify="right", style="dim", width=12)
+    table.add_column(comparison_branch, justify="right", style="dim", width=12)
+    table.add_column("Change", justify="right", style="dim")
+
+    for baseline_result, comparison_result in zip(baseline.queries, comparison.queries):
+        assert baseline_result.query == comparison_result.query
+
+        change = comparison_result.execution_time / baseline_result.execution_time
+
+        if (1.0 - noise_threshold) <= change <= (1.0 + noise_threshold):
+            change = "no change"
+        elif change < 1.0:
+            change = f"+{(1 / change):.2f}x faster"
+        else:
+            change = f"{change:.2f}x slower"
+
+        table.add_row(
+            f"Q{baseline_result.query}",
+            f"{baseline_result.execution_time:.2f}ms",
+            f"{comparison_result.execution_time:.2f}ms",
+            change,
+        )
+
+    console.print(table)
+
+
+def main() -> None:
+    parser = ArgumentParser()
+    parser.add_argument(
+        "baseline_path",
+        type=Path,
+        help="Path to the baseline summary file.",
+    )
+    parser.add_argument(
+        "comparison_path",
+        type=Path,
+        help="Path to the comparison summary file.",
+    )
+    parser.add_argument(
+        "--noise-threshold",
+        type=float,
+        default=0.05,
+        help="The threshold for statistically insignificant results (+/- 5 percent).",
+    )
+
+    options = parser.parse_args()
+
+    compare(options.baseline_path, options.comparison_path, options.noise_threshold)
+
+
+if __name__ == "__main__":
+    main()
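
For reference, below is a minimal sketch (not part of the patch) of the summary-file shape that `compare.py` reads, inferred from the dataclasses above. The field values, the `/tmp/example-summary.json` path, and the row counts are made up for illustration, and the snippet assumes it is run from the `benchmarks/` directory so that `compare` is importable:

```python
# Illustrative only: build a minimal summary that satisfies compare.py's loader,
# then load it back. Real summary files are produced by the tpch binary with -o.
import json
from pathlib import Path

from compare import BenchmarkRun  # assumes compare.py is on the import path

summary = {
    "context": {
        "benchmark_version": "0.1.0",       # illustrative value
        "datafusion_version": "20.0.0",     # illustrative value
        "num_cpus": 8,
        "start_time": 1679330119,
        # Context.load_from uses arguments[9] as the column label, i.e. the
        # value that follows "-o" when tpch is invoked as shown in the README.
        "arguments": [
            "benchmark", "datafusion", "--iterations", "5", "--path", "./data",
            "--format", "parquet", "-o", "/tmp/output_main",
        ],
    },
    "queries": [
        {
            "query": 1,
            "start_time": 1679330119,
            "iterations": [
                {"elapsed": 16252.56, "row_count": 4},
                {"elapsed": 16031.82, "row_count": 4},
            ],
        },
    ],
}

path = Path("/tmp/example-summary.json")
path.write_text(json.dumps(summary))

run = BenchmarkRun.load_from_file(path)
print(run.context.branch)             # /tmp/output_main
print(run.queries[0].execution_time)  # 16142.19 (mean, since fewer than MEAN_THRESHOLD samples)
```

In practice you would point `./compare.py` at two real summary files, as in the README example above; this sketch only documents the fields the loader expects to find.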