diff --git a/benchmarks/README.md b/benchmarks/README.md
index 0d13ed9764f7..d397def8f8e2 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -76,12 +76,56 @@ cargo run --release --bin tpch -- convert --input ./data --output /mnt/tpch-parq
Or if you want to verify and run all the queries in the benchmark, you can just run `cargo test`.
-### Machine readable benchmark summary
+### Comparing results between runs
Any `tpch` execution with the `-o <dir>` argument will produce a summary file right under the
`<dir>` directory. It is a JSON serialized form of all the runs that happened as well as the runtime
metadata (number of cores, DataFusion version, etc.).
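+
+A minimal sketch of inspecting such a summary from Python (the field names are the
+ones `compare.py` below consumes; the file name is taken from the example run below):
+
+```python
+import json
+
+with open("/tmp/output_main/tpch-summary--1679330119.json") as f:
+    summary = json.load(f)
+
+# runtime metadata recorded alongside the results
+ctx = summary["context"]
+print(ctx["datafusion_version"], ctx["num_cpus"], "cores")
+
+# one entry per query; each iteration records its elapsed time in milliseconds
+for q in summary["queries"]:
+    times = [it["elapsed"] for it in q["iterations"]]
+    print(f"Q{q['query']}: fastest {min(times):.2f}ms of {len(times)} iterations")
+```
+
+For example, to compare a run on `main` against one on a feature branch: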
+```shell
+$ git checkout main
+# generate a summary in /tmp/output_main
+$ mkdir -p /tmp/output_main
+$ cargo run --release --bin tpch -- benchmark datafusion --iterations 5 --path ./data --format parquet -o /tmp/output_main
+$ git checkout my_branch
+# generate a summary in /tmp/output_branch
+$ mkdir -p /tmp/output_branch
+$ cargo run --release --bin tpch -- benchmark datafusion --iterations 5 --path ./data --format parquet -o /tmp/output_branch
+# compare the results:
+$ ./compare.py /tmp/output_main/tpch-summary--1679330119.json /tmp/output_branch/tpch-summary--1679328405.json
+```
+
+This will produce output like:
+
+```
+┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
+┃ Query ┃ /home/alamb… ┃ /home/alamb… ┃ Change ┃
+┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
+│ Q1 │ 16252.56ms │ 16031.82ms │ no change │
+│ Q2 │ 3994.56ms │ 4353.75ms │ 1.09x slower │
+│ Q3 │ 5572.06ms │ 5620.27ms │ no change │
+│ Q4 │ 2144.14ms │ 2194.67ms │ no change │
+│ Q5 │ 7796.93ms │ 7646.74ms │ no change │
+│ Q6 │ 4382.32ms │ 4327.16ms │ no change │
+│ Q7 │ 18702.50ms │ 19922.74ms │ 1.07x slower │
+│ Q8 │ 7383.74ms │ 7616.21ms │ no change │
+│ Q9 │ 13855.17ms │ 14408.42ms │ no change │
+│ Q10 │ 7446.05ms │ 8030.00ms │ 1.08x slower │
+│ Q11 │ 3414.81ms │ 3850.34ms │ 1.13x slower │
+│ Q12 │ 3027.16ms │ 3085.89ms │ no change │
+│ Q13 │ 18859.06ms │ 18627.02ms │ no change │
+│ Q14 │ 4157.91ms │ 4140.22ms │ no change │
+│ Q15 │ 5293.05ms │ 5369.17ms │ no change │
+│ Q16 │ 6512.42ms │ 3011.58ms │ +2.16x faster │
+│ Q17 │ 86253.33ms │ 76036.06ms │ +1.13x faster │
+│ Q18 │ 45101.99ms │ 49717.76ms │ 1.10x slower │
+│ Q19 │ 7323.15ms │ 7409.85ms │ no change │
+│ Q20 │ 19902.39ms │ 20965.94ms │ 1.05x slower │
+│ Q21 │ 22040.06ms │ 23184.84ms │ 1.05x slower │
+│ Q22 │ 2011.87ms │ 2143.62ms │ 1.07x slower │
+└──────────────┴──────────────┴──────────────┴───────────────┘
+```
+
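+By default, a change within ±5% of the baseline is reported as `no change`; the cutoff
+can be adjusted with the `--noise-threshold` flag of `compare.py`. For example, Q22
+above went from 2011.87ms to 2143.62ms, a ratio of 1.07x, which falls outside the 5%
+band and is therefore reported as `1.07x slower`.
+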
## Expected output
The result of query 1 should produce the following output when executed against the SF=1 dataset.
diff --git a/benchmarks/compare.py b/benchmarks/compare.py
new file mode 100755
index 000000000000..aa3871c18b23
--- /dev/null
+++ b/benchmarks/compare.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+from __future__ import annotations
+
+import json
+import statistics
+from dataclasses import dataclass
+from typing import Dict, List, Any
+from pathlib import Path
+from argparse import ArgumentParser
+
+try:
+ from rich.console import Console
+ from rich.table import Table
+except ImportError:
+    print("This script requires the `rich` package. Try `pip install rich`.")
+ raise
+
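+# Runs with fewer than this many iterations are summarized with the mean; runs with
+# at least this many use the median (see QueryRun.execution_time)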
+MEAN_THRESHOLD = 5
+
+
+@dataclass
+class QueryResult:
+ elapsed: float
+ row_count: int
+
+ @classmethod
+ def load_from(cls, data: Dict[str, Any]) -> QueryResult:
+ return cls(elapsed=data["elapsed"], row_count=data["row_count"])
+
+
+@dataclass
+class QueryRun:
+ query: int
+ iterations: List[QueryResult]
+ start_time: int
+
+ @classmethod
+ def load_from(cls, data: Dict[str, Any]) -> QueryRun:
+ return cls(
+ query=data["query"],
+            iterations=[QueryResult.load_from(iteration) for iteration in data["iterations"]],
+ start_time=data["start_time"],
+ )
+
+ @property
+ def execution_time(self) -> float:
+ assert len(self.iterations) >= 1
+
+ # If we don't have enough samples, median() is probably
+ # going to be a worse measure than just an average.
+ if len(self.iterations) < MEAN_THRESHOLD:
+ method = statistics.mean
+ else:
+ method = statistics.median
+
+ return method(iteration.elapsed for iteration in self.iterations)
+
+
+@dataclass
+class Context:
+ benchmark_version: str
+ datafusion_version: str
+ num_cpus: int
+ start_time: int
+ arguments: List[str]
+ branch: str
+
+ @classmethod
+ def load_from(cls, data: Dict[str, Any]) -> Context:
+ return cls(
+ benchmark_version=data["benchmark_version"],
+ datafusion_version=data["datafusion_version"],
+ num_cpus=data["num_cpus"],
+ start_time=data["start_time"],
+ arguments=data["arguments"],
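+            # NOTE: assumes the output directory passed via `-o` is the 10th CLI
+            # argument, as in the invocation shown in the README, and uses it to
+            # label this run's column in the comparison table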
+            branch=data["arguments"][9],
+ )
+
+
+@dataclass
+class BenchmarkRun:
+ context: Context
+ queries: List[QueryRun]
+
+ @classmethod
+ def load_from(cls, data: Dict[str, Any]) -> BenchmarkRun:
+ return cls(
+ context=Context.load_from(data["context"]),
+ queries=[QueryRun.load_from(result) for result in data["queries"]],
+ )
+
+ @classmethod
+ def load_from_file(cls, path: Path) -> BenchmarkRun:
+ with open(path, "r") as f:
+ return cls.load_from(json.load(f))
+
+
+def compare(
+ baseline_path: Path,
+ comparison_path: Path,
+ noise_threshold: float,
+) -> None:
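+    """Load two benchmark summary files and print a per-query comparison table."""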
+ baseline = BenchmarkRun.load_from_file(baseline_path)
+    baseline_branch = baseline.context.branch
+
+ comparison = BenchmarkRun.load_from_file(comparison_path)
+    comparison_branch = comparison.context.branch
+
+ console = Console()
+
+ table = Table(show_header=True, header_style="bold magenta")
+ table.add_column("Query", style="dim", width=12)
+    table.add_column(baseline_branch, justify="right", style="dim", width=12)
+    table.add_column(comparison_branch, justify="right", style="dim", width=12)
+ table.add_column("Change", justify="right", style="dim")
+
+ for baseline_result, comparison_result in zip(baseline.queries, comparison.queries):
+ assert baseline_result.query == comparison_result.query
+
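+        # ratio of runtimes: > 1.0 means the comparison run was slower than the baseline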
+ change = comparison_result.execution_time / baseline_result.execution_time
+
+ if (1.0 - noise_threshold) <= change <= (1.0 + noise_threshold):
+ change = "no change"
+ elif change < 1.0:
+ change = f"+{(1 / change):.2f}x faster"
+ else:
+ change = f"{change:.2f}x slower"
+
+ table.add_row(
+ f"Q{baseline_result.query}",
+ f"{baseline_result.execution_time:.2f}ms",
+ f"{comparison_result.execution_time:.2f}ms",
+ change,
+ )
+
+ console.print(table)
+
+
+def main() -> None:
+    parser = ArgumentParser()
+    parser.add_argument(
+        "baseline_path",
+        type=Path,
+        help="Path to the baseline summary file.",
+    )
+    parser.add_argument(
+        "comparison_path",
+        type=Path,
+        help="Path to the comparison summary file.",
+    )
+    parser.add_argument(
+        "--noise-threshold",
+        type=float,
+        default=0.05,
+        help="The threshold below which a change is considered noise (+/- 5% by default).",
+    )
+
+ options = parser.parse_args()
+
+ compare(options.baseline_path, options.comparison_path, options.noise_threshold)
+
+
+if __name__ == "__main__":
+ main()