diff --git a/benchmarks/README.md b/benchmarks/README.md
index 0d13ed9764f7..d397def8f8e2 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -76,12 +76,56 @@ cargo run --release --bin tpch -- convert --input ./data --output /mnt/tpch-parq
Or if you want to verify and run all the queries in the benchmark, you can just run `cargo test`.
-### Machine readable benchmark summary
+### Comparing results between runs
Any `tpch` execution with the `-o <dir>` argument will produce a summary file right under the
`<dir>` directory. It is a JSON serialized form of all the runs that happened as well as the runtime
metadata (number of cores, DataFusion version, etc.).
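+
+A minimal sketch of inspecting such a summary from Python (the field names are the
+ones `compare.py` below consumes; the file name is taken from the example run below):
+
+```python
+import json
+
+with open("/tmp/output_main/tpch-summary--1679330119.json") as f:
+    summary = json.load(f)
+
+# runtime metadata recorded alongside the results
+ctx = summary["context"]
+print(ctx["datafusion_version"], ctx["num_cpus"], "cores")
+
+# one entry per query; each iteration records its elapsed time in milliseconds
+for q in summary["queries"]:
+    times = [it["elapsed"] for it in q["iterations"]]
+    print(f"Q{q['query']}: fastest {min(times):.2f}ms of {len(times)} iterations")
+```
+
+For example, to compare a run on `main` against one on a feature branch: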
+```shell
+$ git checkout main
+# generate a summary in /tmp/output_main
+$ mkdir -p /tmp/output_main
+$ cargo run --release --bin tpch -- benchmark datafusion --iterations 5 --path ./data --format parquet -o /tmp/output_main
+$ git checkout my_branch
+# generate a summary in /tmp/output_branch
+$ mkdir -p /tmp/output_branch
+$ cargo run --release --bin tpch -- benchmark datafusion --iterations 5 --path ./data --format parquet -o /tmp/output_branch
+# compare the results:
+$ ./compare.py /tmp/output_main/tpch-summary--1679330119.json /tmp/output_branch/tpch-summary--1679328405.json
+```
+
+This will produce output like:
+
+```
+┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
+┃ Query ┃ /home/alamb… ┃ /home/alamb… ┃ Change ┃
+┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
+│ Q1 │ 16252.56ms │ 16031.82ms │ no change │
+│ Q2 │ 3994.56ms │ 4353.75ms │ 1.09x slower │
+│ Q3 │ 5572.06ms │ 5620.27ms │ no change │
+│ Q4 │ 2144.14ms │ 2194.67ms │ no change │
+│ Q5 │ 7796.93ms │ 7646.74ms │ no change │
+│ Q6 │ 4382.32ms │ 4327.16ms │ no change │
+│ Q7 │ 18702.50ms │ 19922.74ms │ 1.07x slower │
+│ Q8 │ 7383.74ms │ 7616.21ms │ no change │
+│ Q9 │ 13855.17ms │ 14408.42ms │ no change │
+│ Q10 │ 7446.05ms │ 8030.00ms │ 1.08x slower │
+│ Q11 │ 3414.81ms │ 3850.34ms │ 1.13x slower │
+│ Q12 │ 3027.16ms │ 3085.89ms │ no change │
+│ Q13 │ 18859.06ms │ 18627.02ms │ no change │
+│ Q14 │ 4157.91ms │ 4140.22ms │ no change │
+│ Q15 │ 5293.05ms │ 5369.17ms │ no change │
+│ Q16 │ 6512.42ms │ 3011.58ms │ +2.16x faster │
+│ Q17 │ 86253.33ms │ 76036.06ms │ +1.13x faster │
+│ Q18 │ 45101.99ms │ 49717.76ms │ 1.10x slower │
+│ Q19 │ 7323.15ms │ 7409.85ms │ no change │
+│ Q20 │ 19902.39ms │ 20965.94ms │ 1.05x slower │
+│ Q21 │ 22040.06ms │ 23184.84ms │ 1.05x slower │
+│ Q22 │ 2011.87ms │ 2143.62ms │ 1.07x slower │
+└──────────────┴──────────────┴──────────────┴───────────────┘
+```
+
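+By default, a change within ±5% of the baseline is reported as `no change`; the cutoff
+can be adjusted with the `--noise-threshold` flag of `compare.py`. For example, Q22
+above went from 2011.87ms to 2143.62ms, a ratio of 1.07x, which falls outside the 5%
+band and is therefore reported as `1.07x slower`.
+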
## Expected output
The result of query 1 should produce the following output when executed against the SF=1 dataset.
diff --git a/benchmarks/compare.py b/benchmarks/compare.py
new file mode 100755
index 000000000000..aa3871c18b23
--- /dev/null
+++ b/benchmarks/compare.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+from __future__ import annotations
+
+import json
+import statistics
+from dataclasses import dataclass
+from typing import Dict, List, Any
+from pathlib import Path
+from argparse import ArgumentParser
+
+try:
+ from rich.console import Console
+ from rich.table import Table
+except ImportError:
+    print("This script requires the `rich` package. Try `pip install rich`.")
+ raise
+
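+# Runs with fewer than this many iterations are summarized with the mean; runs with
+# at least this many use the median (see QueryRun.execution_time)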
+MEAN_THRESHOLD = 5
+
+
+@dataclass
+class QueryResult:
+ elapsed: float
+ row_count: int
+
+ @classmethod
+ def load_from(cls, data: Dict[str, Any]) -> QueryResult:
+ return cls(elapsed=data["elapsed"], row_count=data["row_count"])
+
+
+@dataclass
+class QueryRun:
+ query: int
+ iterations: List[QueryResult]
+ start_time: int
+
+ @classmethod
+ def load_from(cls, data: Dict[str, Any]) -> QueryRun:
+ return cls(
+ query=data["query"],
+            iterations=[QueryResult.load_from(iteration) for iteration in data["iterations"]],
+ start_time=data["start_time"],
+ )
+
+ @property
+ def execution_time(self) -> float:
+ assert len(self.iterations) >= 1
+
+ # If we don't have enough samples, median() is probably
+ # going to be a worse measure than just an average.
+ if len(self.iterations) < MEAN_THRESHOLD:
+ method = statistics.mean
+ else:
+ method = statistics.median
+
+ return method(iteration.elapsed for iteration in self.iterations)
+
+
+@dataclass
+class Context:
+ benchmark_version: str
+ datafusion_version: str
+ num_cpus: int
+ start_time: int
+ arguments: List[str]
+ branch: str
+
+ @classmethod
+ def load_from(cls, data: Dict[str, Any]) -> Context:
+ return cls(
+ benchmark_version=data["benchmark_version"],
+ datafusion_version=data["datafusion_version"],
+ num_cpus=data["num_cpus"],
+ start_time=data["start_time"],
+ arguments=data["arguments"],
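+            # NOTE: assumes the output directory passed via `-o` is the 10th CLI
+            # argument, as in the invocation shown in the README, and uses it to
+            # label this run's column in the comparison table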
+            branch=data["arguments"][9],
+ )
+
+
+@dataclass
+class BenchmarkRun:
+ context: Context
+ queries: List[QueryRun]
+
+ @classmethod
+ def load_from(cls, data: Dict[str, Any]) -> BenchmarkRun:
+ return cls(
+ context=Context.load_from(data["context"]),
+ queries=[QueryRun.load_from(result) for result in data["queries"]],
+ )
+
+ @classmethod
+ def load_from_file(cls, path: Path) -> BenchmarkRun:
+ with open(path, "r") as f:
+ return cls.load_from(json.load(f))
+
+
+def compare(
+ baseline_path: Path,
+ comparison_path: Path,
+ noise_threshold: float,
+) -> None:
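+    """Load two benchmark summary files and print a per-query comparison table."""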
+ baseline = BenchmarkRun.load_from_file(baseline_path)
+    baseline_branch = baseline.context.branch
+
+ comparison = BenchmarkRun.load_from_file(comparison_path)
+    comparison_branch = comparison.context.branch
+
+ console = Console()
+
+ table = Table(show_header=True, header_style="bold magenta")
+ table.add_column("Query", style="dim", width=12)
+    table.add_column(baseline_branch, justify="right", style="dim", width=12)
+    table.add_column(comparison_branch, justify="right", style="dim", width=12)
+ table.add_column("Change", justify="right", style="dim")
+
+ for baseline_result, comparison_result in zip(baseline.queries, comparison.queries):
+ assert baseline_result.query == comparison_result.query
+
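+        # ratio of runtimes: > 1.0 means the comparison run was slower than the baseline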
+ change = comparison_result.execution_time / baseline_result.execution_time
+
+ if (1.0 - noise_threshold) <= change <= (1.0 + noise_threshold):
+ change = "no change"
+ elif change < 1.0:
+ change = f"+{(1 / change):.2f}x faster"
+ else:
+ change = f"{change:.2f}x slower"
+
+ table.add_row(
+ f"Q{baseline_result.query}",
+ f"{baseline_result.execution_time:.2f}ms",
+ f"{comparison_result.execution_time:.2f}ms",
+ change,
+ )
+
+ console.print(table)
+
+
+def main() -> None:
+    parser = ArgumentParser()
+    parser.add_argument(
+        "baseline_path",
+        type=Path,
+        help="Path to the baseline summary file.",
+    )
+    parser.add_argument(
+        "comparison_path",
+        type=Path,
+        help="Path to the comparison summary file.",
+    )
+    parser.add_argument(
+        "--noise-threshold",
+        type=float,
+        default=0.05,
+        help="The threshold below which a change is considered noise (+/- 5% by default).",
+    )
+
+ options = parser.parse_args()
+
+ compare(options.baseline_path, options.comparison_path, options.noise_threshold)
+
+
+if __name__ == "__main__":
+ main()