diff --git a/.github/workflows/bench-upload.yml b/.github/workflows/bench-upload.yml new file mode 100644 index 000000000000..69d8d44f504f --- /dev/null +++ b/.github/workflows/bench-upload.yml @@ -0,0 +1,46 @@ +# This file is manually managed. It is used to upload benchmarks to to the +# https://github.com/enso-org/engine-benchmark-results repository. + +name: Benchmarks upload +on: + workflow_run: + workflows: ["Benchmark Engine", "Benchmark Standard Libraries"] + types: + - completed +jobs: + upload-benchmarks: + name: Upload benchmarks + runs-on: ubuntu-latest + steps: + - name: Checkout enso repository + uses: actions/checkout@v4 + with: + repository: enso-org/enso + path: enso + - name: Checkout engine-benchmark-results repository + uses: actions/checkout@v4 + with: + repository: enso-org/engine-benchmark-results + path: engine-benchmark-results + token: ${{ secrets.ENSO_BENCHMARK_RESULTS_TOKEN }} + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y \ + python3 \ + python3-jinja2 \ + python3-numpy \ + python3-pandas + sudo apt-get install -y gh + - name: Set up git + run: | + git config --global user.email "ci@enso.org" + git config --global user.name "Enso CI Bot" + - name: Upload benchmarks + run: | + cd enso/tools/performance/engine-benchmarks + python3 website_regen.py \ + -v \ + --local-repo ${{ github.workspace }}/engine-benchmark-results + env: + GITHUB_TOKEN: ${{ secrets.ENSO_BENCHMARK_RESULTS_TOKEN }} diff --git a/tools/performance/engine-benchmarks/README.md b/tools/performance/engine-benchmarks/README.md index c37cfbe060af..3b7bba729079 100644 --- a/tools/performance/engine-benchmarks/README.md +++ b/tools/performance/engine-benchmarks/README.md @@ -4,6 +4,12 @@ This directory contains a python script `bench_download.py` for downloading Engine and stdlib benchmark results from GitHub, and `Engine_Benchs` Enso project for analysing the downloaded data. +Note that for convenience, there is `bench_tool` directory that is a Python +package. The `bench_download.py` script uses this package. + +To run all the Python tests for that package, run `python -m unittest` in this +directory. + Dependencies for `bench_download.py`: - python >= 3.7 diff --git a/tools/performance/engine-benchmarks/bench_download.py b/tools/performance/engine-benchmarks/bench_download.py index 7743adb5627a..9aa6b3c081d4 100755 --- a/tools/performance/engine-benchmarks/bench_download.py +++ b/tools/performance/engine-benchmarks/bench_download.py @@ -45,451 +45,44 @@ - Used as a template engine for the HTML. 
""" +import sys + +from bench_tool.bench_results import get_bench_runs, fetch_job_reports +from bench_tool.remote_cache import ReadonlyRemoteCache +from bench_tool.utils import gather_all_bench_labels, sort_job_reports + +if not (sys.version_info.major >= 3 and sys.version_info.minor >= 7): + print("ERROR: python version lower than 3.7") + exit(1) + import asyncio -import json import logging import logging.config -import math import os -import re import shutil -import subprocess -import sys import tempfile -import zipfile from argparse import ArgumentParser, RawDescriptionHelpFormatter from csv import DictWriter from datetime import datetime, timedelta -from enum import Enum from os import path -from typing import List, Dict, Optional, Any, Union, Set -from dataclasses import dataclass -import xml.etree.ElementTree as ET -from urllib.parse import urlencode +from typing import List, Dict, Optional, Set +from bench_tool import DATE_FORMAT, GENERATED_SITE_DIR, GH_ARTIFACT_RETENTION_PERIOD, TEMPLATES_DIR, \ + JINJA_TEMPLATE, JobRun, JobReport, \ + TemplateBenchData, JinjaData, Source +from bench_tool.gh import ensure_gh_installed +from bench_tool.template_render import create_template_data, render_html -if not (sys.version_info.major >= 3 and sys.version_info.minor >= 7): - print("ERROR: python version lower than 3.7") - exit(1) try: import pandas as pd import numpy as np import jinja2 except ModuleNotFoundError as err: print("ERROR: One of pandas, numpy, or jinja2 packages not installed", file=sys.stderr) + print("Install either with `pip install pandas numpy jinja2` or " + "with `apt-get install python3-pandas python3-numpy python3-jinja2`", file=sys.stderr) exit(1) -DATE_FORMAT = "%Y-%m-%d" -ENGINE_BENCH_WORKFLOW_ID = 29450898 -""" -Workflow ID of engine benchmarks, got via `gh api -'/repos/enso-org/enso/actions/workflows'`. -The name of the workflow is 'Benchmark Engine' -""" -NEW_ENGINE_BENCH_WORKFLOW_ID = 67075764 -""" -Workflow ID for 'Benchmark Engine' workflow, which is the new workflow -since 2023-08-22. -""" -STDLIBS_BENCH_WORKFLOW_ID = 66661001 -""" -Workflow ID of stdlibs benchmarks, got via `gh api -'/repos/enso-org/enso/actions/workflows'`. -The name is 'Benchmark Standard Libraries' -""" -GH_DATE_FORMAT = "%Y-%m-%dT%H:%M:%SZ" -""" Date format as returned from responses in GH API""" -ENSO_COMMIT_BASE_URL = "https://github.com/enso-org/enso/commit/" -JINJA_TEMPLATE = "templates/template_jinja.html" -""" Path to the Jinja HTML template """ -TEMPLATES_DIR = "templates" -GENERATED_SITE_DIR = "generated_site" -GH_ARTIFACT_RETENTION_PERIOD = timedelta(days=90) - - -class Source(Enum): - ENGINE = "engine" - STDLIB = "stdlib" - - def workflow_ids(self) -> List[int]: - if self == Source.ENGINE: - return [ENGINE_BENCH_WORKFLOW_ID, NEW_ENGINE_BENCH_WORKFLOW_ID] - elif self == Source.STDLIB: - return [STDLIBS_BENCH_WORKFLOW_ID] - else: - raise ValueError(f"Unknown source {self}") - - -@dataclass -class Author: - name: str - - -@dataclass -class Commit: - """ Corresponds to the commit from GH API """ - id: str - author: Author - timestamp: str - message: str - - -@dataclass -class JobRun: - """ - Gathered via the GH API. Defines a single run of an Engine benchmark job. - """ - id: str - display_title: str - html_url: str - run_attempt: int - """ An event as defined by the GitHub API, for example 'push' or 'schedule' """ - event: str - head_commit: Commit - - -@dataclass -class JobReport: - """ - Gathered via the GH API - a report that is pushed as an aritfact to the job. 
- Contains a XML file with scores for all the benchmarks. - """ - label_score_dict: Dict[str, float] - """ A mapping of benchmark labels to their scores """ - bench_run: JobRun - - -@dataclass -class BenchmarkData: - """ - Data for a single benchmark compiled from all the job reports. - """ - - @dataclass - class Entry: - score: float - commit: Commit - bench_run_url: str - bench_run_event: str - - label: str - """ Label for the benchmark, as reported by org.enso.interpreter.bench.BenchmarksRunner """ - entries: List[Entry] - """ Entries sorted by timestamps """ - - -@dataclass -class BenchDatapoint: - """ - A single datapoint that will be on the chart. `timestamp` is on X axis, - `score` on Y axis, and the rest of the fields is used either for the tooltip, - or for the selection info. - """ - timestamp: datetime - score: float - score_diff: str - """ Difference of the score with previous datapoint, or NaN """ - score_diff_perc: str - tooltip: str - bench_run_url: str - commit_id: str - commit_msg: str - commit_author: str - commit_url: str - - -@dataclass -class TemplateBenchData: - """ Data for one benchmark label (with a unique name and ID) """ - id: str - """ ID of the benchmark, must not contain dots """ - name: str - """ Human readable name of the benchmark """ - branches_datapoints: Dict[str, List[BenchDatapoint]] - """ Mapping of branches to datapoints for that branch """ - - -@dataclass -class JinjaData: - bench_source: Source - bench_datas: List[TemplateBenchData] - branches: List[str] - since: datetime - until: datetime - display_since: datetime - """ The date from which all the datapoints are first displayed """ - - -def _parse_bench_run_from_json(obj: Dict[Any, Any]) -> JobRun: - return JobRun( - id=str(obj["id"]), - html_url=obj["html_url"], - run_attempt=int(obj["run_attempt"]), - event=obj["event"], - display_title=obj["display_title"], - head_commit=Commit( - id=obj["head_commit"]["id"], - message=obj["head_commit"]["message"], - timestamp=obj["head_commit"]["timestamp"], - author=Author( - name=obj["head_commit"]["author"]["name"] - ) - ) - ) - - -def _parse_bench_report_from_json(obj: Dict[Any, Any]) -> JobReport: - return JobReport( - bench_run=_parse_bench_run_from_json(obj["bench_run"]), - label_score_dict=obj["label_score_dict"] - ) - - -def _bench_report_to_json(bench_report: JobReport) -> Dict[Any, Any]: - return { - "bench_run": { - "id": bench_report.bench_run.id, - "html_url": bench_report.bench_run.html_url, - "run_attempt": bench_report.bench_run.run_attempt, - "event": bench_report.bench_run.event, - "display_title": bench_report.bench_run.display_title, - "head_commit": { - "id": bench_report.bench_run.head_commit.id, - "message": bench_report.bench_run.head_commit.message, - "timestamp": bench_report.bench_run.head_commit.timestamp, - "author": { - "name": bench_report.bench_run.head_commit.author.name - } - } - }, - "label_score_dict": bench_report.label_score_dict - } - - -def _parse_bench_report_from_xml(bench_report_xml_path: str, bench_run: JobRun) -> "JobReport": - logging.debug(f"Parsing BenchReport from {bench_report_xml_path}") - tree = ET.parse(bench_report_xml_path) - root = tree.getroot() - label_score_dict: Dict[str, float] = dict() - for cases in root: - assert cases.tag == "cases" - for case in cases: - assert case.tag == "case" - label = case.findtext("label").strip() - scores = case.find("scores") - scores_float = [float(score.text.strip()) for score in scores] - if len(scores_float) > 1: - logging.warning(f"More than one score for 
benchmark {label}, " - f"using the last one (the newest one).") - label_score_dict[label] = scores_float[len(scores_float) - 1] - return JobReport( - label_score_dict=label_score_dict, - bench_run=bench_run - ) - - -def _is_benchrun_id(name: str) -> bool: - return re.match("\d{9}", name) is not None - - -def _read_json(json_file: str) -> Dict[Any, Any]: - assert path.exists(json_file) and path.isfile(json_file) - with open(json_file, "r") as f: - return json.load(f) - - -async def _invoke_gh_api(endpoint: str, - query_params: Dict[str, str] = {}, - result_as_text: bool = True) -> Union[Dict[str, Any], bytes]: - urlencode(query_params) - cmd = [ - "gh", - "api", - f"/repos/enso-org/enso{endpoint}" + "?" + urlencode(query_params) - ] - logging.info(f"Starting subprocess `{' '.join(cmd)}`") - proc = await asyncio.create_subprocess_exec("gh", *cmd[1:], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - out, err = await proc.communicate() - logging.info(f"Finished subprocess `{' '.join(cmd)}`") - if proc.returncode != 0: - print("Command `" + " ".join(cmd) + "` FAILED with errcode " + str( - proc.returncode)) - print(err.decode()) - exit(proc.returncode) - if result_as_text: - return json.loads(out.decode()) - else: - return out - - -class Cache: - """ - Cache is a directory filled with json files that have name of format .json, and - in every json, there is `BenchReport` dataclass serialized. - """ - - def __init__(self, dirname: str): - assert path.exists(dirname) and path.isdir(dirname) - self._dir = dirname - # Keys are BenchRun ids - self._items: Dict[str, JobReport] = {} - for fname in os.listdir(dirname): - fname_without_ext, ext = path.splitext(fname) - if _is_benchrun_id(fname_without_ext) and ext == ".json": - logging.debug(f"Loading into cache from {fname}") - bench_report = _parse_bench_report_from_json( - _read_json(path.join(dirname, fname)) - ) - self._items[fname_without_ext] = bench_report - - def __len__(self) -> int: - return len(self._items) - - def __contains__(self, key: str) -> bool: - assert _is_benchrun_id(key) - return key in self._items - - def __getitem__(self, item: str) -> Optional[JobReport]: - if not _is_benchrun_id(item): - return None - else: - return self._items[item] - - def __setitem__(self, bench_run_id: str, bench_report: JobReport) -> None: - assert isinstance(bench_report, JobReport) - assert isinstance(bench_run_id, str) - assert _is_benchrun_id(bench_run_id) - self._items[bench_run_id] = bench_report - json_fname = path.join(self._dir, bench_run_id + ".json") - logging.debug(f"Putting {bench_run_id} into cache {json_fname}") - with open(json_fname, "w") as json_file: - json.dump( - _bench_report_to_json(bench_report), - json_file, - indent=2, - ensure_ascii=False - ) - - def __str__(self) -> str: - return str(self._items) - - def contains(self, bench_run_id: str) -> bool: - return bench_run_id in self._items - - -class FakeCache: - def __getitem__(self, item): - return None - - def __setitem__(self, key, value): - pass - - def __contains__(self, item): - return False - - def __len__(self): - return 0 - - -async def get_bench_runs(since: datetime, until: datetime, branch: str, workflow_id: int) -> List[JobRun]: - """ - Fetches the list of all the job runs from the GH API for the specified `branch`. 
- """ - logging.info(f"Looking for all successful Engine benchmark workflow run " - f"actions from {since} to {until} for branch {branch} " - f"and workflow ID {workflow_id}") - query_fields = { - "branch": branch, - "status": "success", - "created": since.strftime(DATE_FORMAT) + ".." + until.strftime(DATE_FORMAT), - # Start with 1, just to determine the total count - "per_page": "1" - } - res = await _invoke_gh_api(f"/actions/workflows/{workflow_id}/runs", query_fields) - total_count = int(res["total_count"]) - per_page = 3 - logging.debug(f"Total count of all runs: {total_count} for workflow ID " - f"{workflow_id}. Will process {per_page} runs per page") - - async def get_and_parse_run(page: int, parsed_bench_runs) -> None: - _query_fields = query_fields.copy() - _query_fields["page"] = str(page) - res = await _invoke_gh_api(f"/actions/workflows/{workflow_id}/runs", _query_fields) - bench_runs_json = res["workflow_runs"] - _parsed_bench_runs = [_parse_bench_run_from_json(bench_run_json) - for bench_run_json in bench_runs_json] - parsed_bench_runs.extend(_parsed_bench_runs) - - # Now we know the total count, so we can fetch all the runs - query_fields["per_page"] = str(per_page) - num_queries = math.ceil(total_count / per_page) - parsed_bench_runs = [] - - tasks = [] - # Page is indexed from 1 - for page in range(1, num_queries + 1): - tasks.append(get_and_parse_run(page, parsed_bench_runs)) - await asyncio.gather(*tasks) - - return parsed_bench_runs - - -async def get_bench_report(bench_run: JobRun, cache: Cache, temp_dir: str) -> Optional[JobReport]: - """ - Extracts some data from the given bench_run, which was fetched via the GH API, - optionally getting it from the cache. - An artifact in GH can expire, in such case, returns None. - :param bench_run: - :param cache: - :param temp_dir: Used for downloading and unzipping artifacts. - :return: None if the corresponding artifact expired. - """ - if bench_run.id in cache: - logging.info(f"Getting bench run with ID {bench_run.id} from cache") - return cache[bench_run.id] - - # There might be multiple artifacts in the artifact list for a benchmark run - # We are looking for the one named 'Runtime Benchmark Report', which will - # be downloaded as a ZIP file. - obj: Dict[str, Any] = await _invoke_gh_api(f"/actions/runs/{bench_run.id}/artifacts") - artifacts = obj["artifacts"] - assert len(artifacts) == 1, "There should be exactly one artifact for a benchmark run" - bench_report_artifact = artifacts[0] - assert bench_report_artifact, "Benchmark Report artifact not found" - artifact_id = str(bench_report_artifact["id"]) - if bench_report_artifact["expired"]: - created_at = bench_report_artifact["created_at"] - updated_at = bench_report_artifact["updated_at"] - expires_at = bench_report_artifact["expires_at"] - logging.warning(f"Artifact with ID {artifact_id} from bench report {bench_run.id} has expired. 
" - f"created_at={created_at}, updated_at={updated_at}, expires_at={expires_at}") - return None - - # Get contents of the ZIP artifact file - artifact_ret = await _invoke_gh_api(f"/actions/artifacts/{artifact_id}/zip", result_as_text=False) - zip_file_name = os.path.join(temp_dir, artifact_id + ".zip") - logging.debug(f"Writing artifact ZIP content into {zip_file_name}") - with open(zip_file_name, "wb") as zip_file: - zip_file.write(artifact_ret) - - extracted_dirname = os.path.join(temp_dir, artifact_id) - if os.path.exists(extracted_dirname): - shutil.rmtree(extracted_dirname) - os.mkdir(extracted_dirname) - - logging.debug(f"Extracting {zip_file_name} into {extracted_dirname}") - zip_file = zipfile.ZipFile(zip_file_name, "r") - zip_file.extractall(extracted_dirname) - bench_report_xml = path.join(extracted_dirname, "bench-report.xml") - assert path.exists(bench_report_xml) - - bench_report_parsed = _parse_bench_report_from_xml(bench_report_xml, bench_run) - cache[bench_run.id] = bench_report_parsed - return bench_report_parsed - - CSV_FIELDNAMES = [ "label", "score", @@ -524,171 +117,9 @@ def write_bench_reports_to_csv(bench_reports: List[JobReport], csv_fname: str) - }) -def populate_cache(cache_dir: str) -> Cache: - """ - Initializes cache from `cache_dir`, if there are any items. - See docs of `Cache`. - - :param cache_dir: Path to the cache directory. Does not have to exist - :return: Populated cache. Might be empty. - """ - if not path.exists(cache_dir): - logging.info(f"No cache at {cache_dir}, creating the cache directory") - os.mkdir(cache_dir) - logging.debug(f"Initializing cache from {cache_dir}") - cache = Cache(cache_dir) - logging.debug(f"Cache populated with {len(cache)} items") - return cache - - -def create_template_data( - job_reports_per_branch: Dict[str, List[JobReport]], - bench_labels: Set[str]) -> List[TemplateBenchData]: - """ - Creates all the necessary data for the Jinja template from all collected - benchmark job reports. - :param job_reports_per_branch: Mapping of branch name to list of job reports. - job reports should be sorted by the commit date, otherwise the difference - between scores might be wrongly computed. 
- :param bench_labels: - :return: - """ - - def pct_to_str(score_diff_perc: float) -> str: - if not np.isnan(score_diff_perc): - buff = "+" if score_diff_perc > 0 else "" - buff += "{:.5f}".format(score_diff_perc * 100) - buff += "%" - return buff - else: - return "NaN" - - def diff_str(score_diff: float, score_diff_perc: float) -> str: - if not np.isnan(score_diff): - diff_str = "+" if score_diff > 0 else "" - diff_str += "{:.5f}".format(score_diff) - diff_str += " (" - diff_str += pct_to_str(score_diff_perc) - diff_str += ")" - return diff_str - else: - return "NA" - - template_bench_datas: List[TemplateBenchData] = [] - for bench_label in bench_labels: - logging.debug(f"Creating template data for benchmark {bench_label}") - branch_datapoints: Dict[str, List[BenchDatapoint]] = {} - for branch, job_reports in job_reports_per_branch.items(): - logging.debug(f"Creating datapoints for branch {branch} from {len(job_reports)} job reports") - datapoints: List[BenchDatapoint] = [] - for job_report in job_reports: - prev_datapoint: Optional[BenchDatapoint] = \ - datapoints[-1] if len(datapoints) > 0 else None - if bench_label in job_report.label_score_dict: - score = job_report.label_score_dict[bench_label] - commit = job_report.bench_run.head_commit - timestamp = datetime.strptime( - commit.timestamp, - GH_DATE_FORMAT - ) - commit_msg_header = \ - commit.message.splitlines()[0].replace('"', "'") - series = pd.Series([ - prev_datapoint.score if prev_datapoint else None, - score - ]) - score_diff = series.diff()[1] - score_diff_perc = series.pct_change()[1] - tooltip = "score = " + str(score) + "\\n" - tooltip += "date = " + str(timestamp) + "\\n" - tooltip += "branch = " + branch + "\\n" - tooltip += "diff = " + diff_str(score_diff, score_diff_perc) - author_name = commit.author.name\ - .replace('"', '\\"')\ - .replace("'", "\\'") - datapoints.append(BenchDatapoint( - timestamp=timestamp, - score=score, - score_diff=str(score_diff), - score_diff_perc=pct_to_str(score_diff_perc), - tooltip=tooltip, - bench_run_url=job_report.bench_run.html_url, - commit_id=commit.id, - commit_msg=commit_msg_header, - commit_author=author_name, - commit_url=ENSO_COMMIT_BASE_URL + commit.id, - )) - logging.debug(f"{len(datapoints)} datapoints created for branch {branch}") - branch_datapoints[branch] = datapoints.copy() - logging.debug(f"Template data for benchmark {bench_label} created") - template_bench_datas.append(TemplateBenchData( - id=_label_to_id(bench_label), - name=_label_to_name(bench_label), - branches_datapoints=branch_datapoints, - )) - return template_bench_datas - - -def _label_to_id(label: str) -> str: - return label.replace(".", "_") - - -def _label_to_name(label: str) -> str: - items = label.split(".") - assert len(items) >= 2 - filtered_items = \ - [item for item in items if item not in ( - "org", - "enso", - "benchmark", - "benchmarks", - "semantic", - "interpreter", - "bench" - )] - return "_".join(filtered_items) - - -def _gather_all_bench_labels(job_reports: List[JobReport]) -> Set[str]: - """ - Iterates through all the job reports and gathers all the benchmark labels - found. Note that every job report can have a different set of benchmark labels. - :return: List of benchmark labels. 
- """ - all_labels = set() - for job_report in job_reports: - for labels in job_report.label_score_dict.keys(): - all_labels.add(labels) - return all_labels - - -def render_html(jinja_data: JinjaData, template_file: str, html_out_fname: str) -> None: - jinja_env = jinja2.Environment(loader=jinja2.FileSystemLoader(".")) - jinja_template = jinja_env.get_template(template_file) - generated_html = jinja_template.render(jinja_data.__dict__) - if path.exists(html_out_fname): - logging.info(f"{html_out_fname} already exist, rewritting") - with open(html_out_fname, "w") as html_file: - html_file.write(generated_html) - - -def ensure_gh_installed() -> None: - try: - out = subprocess.run(["gh", "--version"], check=True, capture_output=True) - if out.returncode != 0: - print("`gh` command not found - GH CLI utility is not installed. " - "See https://cli.github.com/", file=sys.stderr) - exit(1) - except subprocess.CalledProcessError: - print("`gh` command not found - GH CLI utility is not installed. " - "See https://cli.github.com/", file=sys.stderr) - exit(1) - - async def main(): default_since: datetime = (datetime.now() - timedelta(days=14)) default_until: datetime = datetime.now() - default_cache_dir = path.expanduser("~/.cache/enso_bench_download") default_csv_out = "Engine_Benchs/data/benchs.csv" date_format_help = DATE_FORMAT.replace("%", "%%") @@ -724,17 +155,6 @@ def _parse_bench_source(_bench_source: str) -> Source: help=f"The date until which the benchmark results will be gathered. " f"Format is {date_format_help}. " f"The default is today") - arg_parser.add_argument("--use-cache", - default=False, - metavar="(true|false)", - type=lambda input: True if input in ("true", "True") else False, - help="Whether the cache directory should be used. The default is False.") - arg_parser.add_argument("-c", "--cache", action="store", - default=default_cache_dir, - metavar="CACHE_DIR", - help=f"Cache directory. Makes sense only iff specified with --use-cache argument. " - f"The default is {default_cache_dir}. If there are any troubles with the " - f"cache, just do `rm -rf {default_cache_dir}`.") arg_parser.add_argument("-b", "--branches", action="store", nargs="+", default=["develop"], @@ -766,20 +186,17 @@ def _parse_bench_source(_bench_source: str) -> Source: since: datetime = args.since until: datetime = args.until - cache_dir: str = args.cache if not args.tmp_dir: temp_dir: str = tempfile.mkdtemp() else: temp_dir: str = args.tmp_dir - use_cache: bool = args.use_cache - assert cache_dir and temp_dir bench_source: Source = args.source csv_output: str = args.csv_output create_csv: bool = args.create_csv branches: List[str] = args.branches labels_override: Set[str] = args.labels - logging.debug(f"parsed args: since={since}, until={until}, cache_dir={cache_dir}, " - f"temp_dir={temp_dir}, use_cache={use_cache}, bench_source={bench_source}, " + logging.debug(f"parsed args: since={since}, until={until}, " + f"temp_dir={temp_dir}, bench_source={bench_source}, " f"csv_output={csv_output}, " f"create_csv={create_csv}, branches={branches}, " f"labels_override={labels_override}") @@ -789,22 +206,15 @@ def _parse_bench_source(_bench_source: str) -> Source: # If the user requires benchmarks for which artifacts are not retained # anymore, then cache should be used. 
min_since_without_cache = datetime.today() - GH_ARTIFACT_RETENTION_PERIOD - if not use_cache and since < min_since_without_cache: - logging.warning(f"The default GH artifact retention period is " + if since < min_since_without_cache: + logging.info(f"The default GH artifact retention period is " f"{GH_ARTIFACT_RETENTION_PERIOD.days} days. " f"This means that all the artifacts older than " f"{min_since_without_cache.date()} are expired." - f"The use_cache parameter is set to False, so no " - f"expired artifacts will be fetched.") - logging.warning(f"The `since` parameter is reset to " - f"{min_since_without_cache.date()} to prevent " - f"unnecessary GH API queries.") - since = min_since_without_cache - - if use_cache: - cache = populate_cache(cache_dir) - else: - cache = FakeCache() + f"The since date was set to {since}, so the remote cache is enabled, " + f"and the older artifacts will be fetched from the cache.") + + remote_cache = ReadonlyRemoteCache() bench_labels: Optional[Set[str]] = None """ Set of all gathered benchmark labels from all the job reports """ @@ -821,18 +231,7 @@ def _parse_bench_source(_bench_source: str) -> Source: f" until {until} for branch {branch}") exit(1) - job_reports: List[JobReport] = [] - - async def _process_report(_bench_run): - _job_report = await get_bench_report(_bench_run, cache, temp_dir) - if _job_report: - job_reports.append(_job_report) - - tasks = [] - for bench_run in bench_runs: - tasks.append(_process_report(bench_run)) - await asyncio.gather(*tasks) - + job_reports = await fetch_job_reports(bench_runs, remote_cache) logging.debug(f"Got {len(job_reports)} job reports for branch {branch}") if len(job_reports) == 0: print(f"There were 0 job_reports in the specified time interval, " @@ -841,14 +240,7 @@ async def _process_report(_bench_run): exit(1) logging.debug("Sorting job_reports by commit date") - - def _get_timestamp(job_report: JobReport) -> datetime: - return datetime.strptime( - job_report.bench_run.head_commit.timestamp, - GH_DATE_FORMAT - ) - - job_reports.sort(key=lambda report: _get_timestamp(report)) + sort_job_reports(job_reports) if create_csv: write_bench_reports_to_csv(job_reports, csv_output) @@ -858,7 +250,7 @@ def _get_timestamp(job_report: JobReport) -> datetime: # Gather all the benchmark labels from all the job reports if bench_labels is None: - all_bench_labels = _gather_all_bench_labels(job_reports) + all_bench_labels = gather_all_bench_labels(job_reports) if len(labels_override) > 0: logging.info(f"Subset of labels specified: {labels_override}") if not set(labels_override).issubset(all_bench_labels): @@ -883,6 +275,7 @@ def _get_timestamp(job_report: JobReport) -> datetime: bench_datas=template_bench_datas, bench_source=bench_source, branches=branches, + timestamp=datetime.now() ) # Render Jinja template with jinja_data @@ -890,10 +283,9 @@ def _get_timestamp(job_report: JobReport) -> datetime: os.mkdir(GENERATED_SITE_DIR) logging.debug(f"Rendering HTML from {JINJA_TEMPLATE} to {GENERATED_SITE_DIR}") - site_path = path.join(GENERATED_SITE_DIR, bench_source.value + "-benchs.html") + site_path = GENERATED_SITE_DIR.joinpath(bench_source.value + "-benchs.html") render_html( jinja_data, - JINJA_TEMPLATE, site_path ) logging.debug(f"Copying static site content from {TEMPLATES_DIR} to {GENERATED_SITE_DIR}") diff --git a/tools/performance/engine-benchmarks/bench_tool/__init__.py b/tools/performance/engine-benchmarks/bench_tool/__init__.py new file mode 100644 index 000000000000..f3318a98c0c7 --- /dev/null +++ 
b/tools/performance/engine-benchmarks/bench_tool/__init__.py @@ -0,0 +1,220 @@ +import os +from dataclasses import dataclass +from datetime import timedelta, datetime +from enum import Enum +from pathlib import Path +from typing import List, Dict, Any + + +def pkg_dir() -> Path: + """ Directory of this package """ + return Path(os.path.dirname(os.path.realpath(__file__))) + + +ENSO_REPO = "enso-org/enso" +BENCH_REPO = "enso-org/engine-benchmark-results" +BRANCH_DEVELOP = "develop" +DATE_FORMAT = "%Y-%m-%d" +GH_DATE_FORMAT = "%Y-%m-%dT%H:%M:%SZ" +ENGINE_BENCH_WORKFLOW_ID = 29450898 +""" +Workflow ID of engine benchmarks, got via `gh api +'/repos/enso-org/enso/actions/workflows'`. +The name of the workflow is 'Benchmark Engine' +""" +NEW_ENGINE_BENCH_WORKFLOW_ID = 67075764 +""" +Workflow ID for 'Benchmark Engine' workflow, which is the new workflow +since 2023-08-22. +""" +STDLIBS_BENCH_WORKFLOW_ID = 66661001 +""" +Workflow ID of stdlibs benchmarks, got via `gh api +'/repos/enso-org/enso/actions/workflows'`. +The name is 'Benchmark Standard Libraries' +""" +""" Date format as returned from responses in GH API""" +ENSO_COMMIT_BASE_URL = "https://github.com/enso-org/enso/commit/" + +GH_ARTIFACT_RETENTION_PERIOD = timedelta(days=90) + +GENERATED_SITE_DIR = pkg_dir().parent.joinpath("generated_site") +TEMPLATES_DIR = pkg_dir().parent.joinpath("templates") +JINJA_TEMPLATE = TEMPLATES_DIR.joinpath("template_jinja.html") + +assert TEMPLATES_DIR.exists() +assert JINJA_TEMPLATE.exists() + + +class Source(Enum): + ENGINE = "engine" + STDLIB = "stdlib" + + def workflow_ids(self) -> List[int]: + if self == Source.ENGINE: + return [ENGINE_BENCH_WORKFLOW_ID, NEW_ENGINE_BENCH_WORKFLOW_ID] + elif self == Source.STDLIB: + return [STDLIBS_BENCH_WORKFLOW_ID] + else: + raise ValueError(f"Unknown source {self}") + + def artifact_names(self) -> List[str]: + if self == Source.ENGINE: + return ["Runtime Benchmark Report"] + elif self == Source.STDLIB: + return ["Enso JMH Benchmark Report"] + else: + raise ValueError(f"Unknown source {self}") + + +@dataclass +class Author: + name: str + + +@dataclass +class Commit: + """ Corresponds to the commit from GH API """ + id: str + author: Author + timestamp: str + message: str + + +@dataclass +class JobRun: + """ + Gathered via the GH API. Defines a single run of an Engine benchmark job. + """ + id: str + display_title: str + html_url: str + run_attempt: int + """ An event as defined by the GitHub API, for example 'push' or 'schedule' """ + event: str + head_commit: Commit + + @staticmethod + def from_dict(obj: Dict[Any, Any]) -> "JobRun": + return JobRun( + id=str(obj["id"]), + html_url=obj["html_url"], + run_attempt=int(obj["run_attempt"]), + event=obj["event"], + display_title=obj["display_title"], + head_commit=Commit( + id=obj["head_commit"]["id"], + message=obj["head_commit"]["message"], + timestamp=obj["head_commit"]["timestamp"], + author=Author( + name=obj["head_commit"]["author"]["name"] + ) + ) + ) + + def to_dict(self) -> Dict[Any, Any]: + return { + "id": self.id, + "html_url": self.html_url, + "run_attempt": self.run_attempt, + "event": self.event, + "display_title": self.display_title, + "head_commit": { + "id": self.head_commit.id, + "message": self.head_commit.message, + "timestamp": self.head_commit.timestamp, + "author": { + "name": self.head_commit.author.name + } + } + } + + +@dataclass +class JobReport: + """ + Gathered via the GH API - a report that is pushed as an aritfact to the job. + Contains a XML file with scores for all the benchmarks. 
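+
+    When serialized via `to_dict` (this is also the JSON form kept in the
+    remote cache), a report looks roughly as follows (illustrative values
+    taken from the test fixtures):
+
+        {
+            "bench_run": <output of JobRun.to_dict()>,
+            "label_score_dict": {"test_label": 1.0}
+        }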
+ """ + label_score_dict: Dict[str, float] + """ A mapping of benchmark labels to their scores """ + bench_run: JobRun + + @staticmethod + def from_dict(obj: Dict[Any, Any]) -> "JobReport": + return JobReport( + bench_run=JobRun.from_dict(obj["bench_run"]), + label_score_dict=obj["label_score_dict"] + ) + + def to_dict(self) -> Dict[Any, Any]: + return { + "bench_run": self.bench_run.to_dict(), + "label_score_dict": self.label_score_dict + } + + +@dataclass +class BenchmarkData: + """ + Data for a single benchmark compiled from all the job reports. + """ + + @dataclass + class Entry: + score: float + commit: Commit + bench_run_url: str + bench_run_event: str + + label: str + """ Label for the benchmark, as reported by org.enso.interpreter.bench.BenchmarksRunner """ + entries: List[Entry] + """ Entries sorted by timestamps """ + + +@dataclass +class BenchDatapoint: + """ + A single datapoint that will be on the chart. `timestamp` is on X axis, + `score` on Y axis, and the rest of the fields is used either for the tooltip, + or for the selection info. + """ + timestamp: datetime + score: float + score_diff: str + """ Difference of the score with previous datapoint, or NaN """ + score_diff_perc: str + tooltip: str + bench_run_url: str + commit_id: str + commit_msg: str + commit_author: str + commit_url: str + + +@dataclass +class TemplateBenchData: + """ Data for one benchmark label (with a unique name and ID) """ + id: str + """ ID of the benchmark, must not contain dots """ + name: str + """ Human readable name of the benchmark """ + branches_datapoints: Dict[str, List[BenchDatapoint]] + """ Mapping of branches to datapoints for that branch """ + + +@dataclass +class JinjaData: + bench_source: Source + bench_datas: List[TemplateBenchData] + branches: List[str] + since: datetime + until: datetime + display_since: datetime + """ The date from which all the datapoints are first displayed """ + timestamp: datetime + """ The time when the website was generated """ + + + diff --git a/tools/performance/engine-benchmarks/bench_tool/bench_results.py b/tools/performance/engine-benchmarks/bench_tool/bench_results.py new file mode 100644 index 000000000000..5337203334c1 --- /dev/null +++ b/tools/performance/engine-benchmarks/bench_tool/bench_results.py @@ -0,0 +1,194 @@ +import asyncio +import logging +import math +import os +import shutil +import zipfile +from datetime import datetime +from os import path +from typing import List, Dict, Optional, Any +from xml.etree import ElementTree as ET + +from bench_tool import JobRun, DATE_FORMAT, ENSO_REPO, JobReport, Source +from bench_tool.gh import invoke_gh_api +from bench_tool.remote_cache import RemoteCache +from bench_tool.utils import WithTempDir + +ARTIFACT_ID = "Runtime Benchmark Report" + +_logger = logging.getLogger(__name__) + + +async def get_bench_runs(since: datetime, until: datetime, branch: str, workflow_id: int) -> List[JobRun]: + """ + Fetches the list of all the SUCCESSFUL job runs from the GH API for the specified `branch`. + + :param since: The date from which the benchmark results will be gathered. + :param until: The date until which the benchmark results will be gathered. + :param branch: The branch for which the benchmark results will be gathered. + :param workflow_id: The ID of the workflow for which the benchmark results will be gathered. 
+ """ + _logger.info(f"Looking for all successful Engine benchmark workflow run " + f"actions from {since} to {until} for branch {branch} " + f"and workflow ID {workflow_id}") + query_fields = { + "branch": branch, + "status": "success", + "created": since.strftime(DATE_FORMAT) + ".." + until.strftime(DATE_FORMAT), + # Start with 1, just to determine the total count + "per_page": "1" + } + res = await invoke_gh_api(ENSO_REPO, f"/actions/workflows/{workflow_id}/runs", query_fields) + total_count = int(res["total_count"]) + per_page = 3 + _logger.debug(f"Total count of all runs: {total_count} for workflow ID " + f"{workflow_id}. Will process {per_page} runs per page") + + async def get_and_parse_run(page: int, parsed_bench_runs) -> None: + _query_fields = query_fields.copy() + _query_fields["page"] = str(page) + res = await invoke_gh_api(ENSO_REPO, f"/actions/workflows/{workflow_id}/runs", _query_fields) + bench_runs_json = res["workflow_runs"] + _parsed_bench_runs = [JobRun.from_dict(bench_run_json) + for bench_run_json in bench_runs_json] + parsed_bench_runs.extend(_parsed_bench_runs) + + # Now we know the total count, so we can fetch all the runs + query_fields["per_page"] = str(per_page) + num_queries = math.ceil(total_count / per_page) + parsed_bench_runs = [] + + tasks = [] + # Page is indexed from 1 + for page in range(1, num_queries + 1): + tasks.append(get_and_parse_run(page, parsed_bench_runs)) + await asyncio.gather(*tasks) + + return parsed_bench_runs + + +async def fetch_job_reports( + bench_runs: List[JobRun], + remote_cache: RemoteCache +) -> List[JobReport]: + """ + Fetches all benchmark reports for the given benchmark runs. Benchmark runs are basically + just IDs of artifacts, and the reports are the actual benchmark results. These results are + either on the GH as artifacts, or are fetched from the cache if the artifact is expired. + All the runs are fetched in parallel. + :param bench_runs: + :param remote_cache: + :return: + """ + job_reports: List[JobReport] = [] + + async def _process_report(_bench_run: JobRun): + with WithTempDir("bench_download") as temp_dir: + _job_report = await get_bench_report(_bench_run, temp_dir, remote_cache) + if _job_report: + job_reports.append(_job_report) + + tasks = [] + for bench_run in bench_runs: + tasks.append(_process_report(bench_run)) + await asyncio.gather(*tasks) + return job_reports + + +def _known_artifact_names() -> List[str]: + return Source.STDLIB.artifact_names() + Source.ENGINE.artifact_names() + + +async def get_bench_report(bench_run: JobRun, temp_dir: str, remote_cache: RemoteCache) -> Optional[JobReport]: + """ + Extracts some data from the given bench_run, which was fetched via the GH API, + optionally getting it from the cache. + An artifact in GH can expire, in such case, returns None. + :param bench_run: + :param temp_dir: Used for downloading and unzipping artifacts. + :return: None if the corresponding artifact cannot be found, neither as a GH artifact, neither from the remote cache. + """ + assert os.path.exists(temp_dir) and os.path.isdir(temp_dir) + + # There might be multiple artifacts in the artifact list for a benchmark run + # We are looking for the one named 'Runtime Benchmark Report', which will + # be downloaded as a ZIP file. 
+ obj: Dict[str, Any] = await invoke_gh_api(ENSO_REPO, f"/actions/runs/{bench_run.id}/artifacts") + artifacts = obj["artifacts"] + artifacts_by_names = {artifact["name"]: artifact for artifact in artifacts} + # At this point, we don't know the source of the benchmark - either it is from + # Engine, or from stdlib. Thus, we don't know exactly which artifact name we + # are looking for. But we know, there must be exactly one of the artifact names. + bench_report_artifact = None + for known_name in _known_artifact_names(): + if known_name in artifacts_by_names: + bench_report_artifact = artifacts_by_names[known_name] + if bench_report_artifact is None: + _logger.warning(f"Bench run {bench_run.id} does not contain any of the known artifact names: " + f"{_known_artifact_names()}, but it is a successful run.") + return None + assert bench_report_artifact, "Benchmark Report artifact not found" + artifact_id = str(bench_report_artifact["id"]) + created_at = bench_report_artifact["created_at"] + updated_at = bench_report_artifact["updated_at"] + expires_at = bench_report_artifact["expires_at"] + is_expired = bench_report_artifact["expired"] + _logger.debug(f"Got artifact with ID {artifact_id}, from bench run {bench_run.id}: " + f"created_at={created_at}, updated_at={updated_at}, expires_at={expires_at}, " + f"is_expired={is_expired}") + + job_report = await remote_cache.fetch(bench_run.id) + if is_expired and job_report is None: + _logger.error( + f"Artifact {artifact_id} from bench run {bench_run.id} is expired, and it is not in the remote cache") + return None + if job_report: + _logger.debug(f"Got job report from the cache for {bench_run.id}") + return job_report + + assert not is_expired + + # Get contents of the ZIP artifact file + artifact_ret = await invoke_gh_api(ENSO_REPO, f"/actions/artifacts/{artifact_id}/zip", result_as_json=False) + zip_file_name = os.path.join(temp_dir, artifact_id + ".zip") + _logger.debug(f"Writing artifact ZIP content into {zip_file_name}") + with open(zip_file_name, "wb") as zip_file: + zip_file.write(artifact_ret) + + extracted_dirname = os.path.join(temp_dir, artifact_id) + if os.path.exists(extracted_dirname): + shutil.rmtree(extracted_dirname) + os.mkdir(extracted_dirname) + + _logger.debug(f"Extracting {zip_file_name} into {extracted_dirname}") + zip_file = zipfile.ZipFile(zip_file_name, "r") + zip_file.extractall(extracted_dirname) + bench_report_xml = path.join(extracted_dirname, "bench-report.xml") + assert path.exists(bench_report_xml) + + bench_report_parsed = _parse_bench_report_from_xml(bench_report_xml, bench_run) + await remote_cache.put(bench_run.id, bench_report_parsed) + return bench_report_parsed + + +def _parse_bench_report_from_xml(bench_report_xml_path: str, bench_run: JobRun) -> "JobReport": + _logger.debug(f"Parsing BenchReport from {bench_report_xml_path}") + tree = ET.parse(bench_report_xml_path) + root = tree.getroot() + label_score_dict: Dict[str, float] = dict() + for cases in root: + assert cases.tag == "cases" + for case in cases: + assert case.tag == "case" + label = case.findtext("label").strip() + scores = case.find("scores") + scores_float = [float(score.text.strip()) for score in scores] + if len(scores_float) > 1: + _logger.warning(f"More than one score for benchmark {label}, " + f"using the last one (the newest one).") + label_score_dict[label] = scores_float[len(scores_float) - 1] + return JobReport( + label_score_dict=label_score_dict, + bench_run=bench_run + ) + diff --git 
a/tools/performance/engine-benchmarks/bench_tool/gh.py b/tools/performance/engine-benchmarks/bench_tool/gh.py new file mode 100644 index 000000000000..d8899e15b22d --- /dev/null +++ b/tools/performance/engine-benchmarks/bench_tool/gh.py @@ -0,0 +1,107 @@ +import asyncio +import base64 +import json +import logging +import subprocess +import sys +from typing import Dict, Optional, Union, Any +from urllib.parse import urlencode + +_logger = logging.getLogger(__name__) + +MAX_BACKOFF_SECONDS = 120 + + +def ensure_gh_installed() -> None: + try: + out = subprocess.run(["gh", "--version"], check=True, + capture_output=True) + if out.returncode != 0: + print("`gh` command not found - GH CLI utility is not installed. " + "See https://cli.github.com/", file=sys.stderr) + exit(1) + except subprocess.CalledProcessError: + print("`gh` command not found - GH CLI utility is not installed. " + "See https://cli.github.com/", file=sys.stderr) + exit(1) + + +async def invoke_gh_api( + repo: str, + endpoint: str, + query_params: Dict[str, str] = {}, + fields: Dict[str, str] = {}, + result_as_json: bool = True, + method: str = "GET", + backoff: int = 0, +) -> Optional[Union[Dict[str, Any], bytes]]: + """ + Invokes the GitHub API using the `gh` command line tool. + :param repo: Repository name in the form `owner/repo` + :param endpoint: Endpoint of the query. Must start with `/`. + :param query_params: Additional query parameters. + :param fields: Additional fields to be added to the query. add static + string parameters to the request payload. + :param result_as_json: If result should be parsed as JSON. + If false, the raw bytes are returned. + :param method: HTTP method to use, 'GET' by default. + :param backoff: Number of seconds to wait before retrying the request. + If higher than 0, it means that the request has already been retried, + try to do it again, with a higher backoff. + :return: None if the query fails + """ + assert endpoint.startswith("/") + if len(fields) > 0 and method != "POST": + raise ValueError("Fields can be used only with POST method") + urlencode(query_params) + cmd = [ + "gh", + "api", + "--method", method, + f"/repos/{repo}{endpoint}" + "?" + urlencode(query_params) + ] + for k, v in fields.items(): + cmd.append("-f") + cmd.append(f"{k}='{v}'") + if 0 < backoff <= MAX_BACKOFF_SECONDS: + _logger.debug(f"Backing off for {backoff} seconds") + await asyncio.sleep(backoff) + elif backoff > MAX_BACKOFF_SECONDS: + _logger.error(f"Backoff of {backoff} seconds is too high, giving up.") + return None + _logger.debug("Invoking gh API with `%s`", " ".join(cmd)) + proc = await asyncio.create_subprocess_exec("gh", *cmd[1:], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + out, err = await proc.communicate() + _logger.debug("Finished gh API `%s`", " ".join(cmd)) + if proc.returncode != 0: + # Special handling of rate limit exceeded - just try to make the + # request one more time after some backoff. 
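+        # (The retry backoff starts at 10 seconds and doubles on every further
+        # retry; once it exceeds MAX_BACKOFF_SECONDS the request is given up.)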
+ if "You have exceeded a secondary rate limit" in err.decode(): + new_backoff = 10 if backoff == 0 else backoff * 2 + _logger.warning(f"Trying to retry the request with a new backoff " + f"of {new_backoff} seconds.") + return await invoke_gh_api(repo, endpoint, query_params, fields, + result_as_json, method, new_backoff) + else: + _logger.error("Command `%s` FAILED with errcode %d", + " ".join(cmd), + proc.returncode) + _logger.error(" stdout: %s", out.decode()) + _logger.error(" stderr: %s", err.decode()) + return None + if result_as_json: + return json.loads(out.decode()) + else: + return out + + +async def fetch_file(repo: str, file_path: str) -> Optional[str]: + ret = await invoke_gh_api(repo, f"/contents/{file_path}", + result_as_json=True) + if ret is None: + _logger.warning("File %s not found in %s", file_path, repo) + return None + file_content = base64.b64decode(ret["content"]).decode() + return file_content diff --git a/tools/performance/engine-benchmarks/bench_tool/git.py b/tools/performance/engine-benchmarks/bench_tool/git.py new file mode 100644 index 000000000000..8e3529f05cd3 --- /dev/null +++ b/tools/performance/engine-benchmarks/bench_tool/git.py @@ -0,0 +1,119 @@ +import asyncio +import logging +import subprocess +from dataclasses import dataclass +from pathlib import Path +from typing import Set + +_logger = logging.getLogger(__name__) + + +@dataclass +class GitStatus: + modified: Set[str] + untracked: Set[str] + added: Set[str] + + +async def clone(repo: str, dest: Path) -> None: + _logger.debug("Cloning %s to %s", repo, dest) + dest_abs_path = str(dest.absolute()) + args = ["clone", f"git@github.com:{repo}.git", dest_abs_path] + proc = await asyncio.create_subprocess_exec("git", *args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + ret = await proc.wait() + if ret != 0: + stdout, stderr = await proc.communicate() + out = stdout.decode() + stderr.decode() + raise RuntimeError(f"Failed to clone {repo}: {out}") + assert dest.exists() + + +async def pull(repo: Path) -> None: + _logger.debug("Pulling %s", repo) + # Avoid unnecessary merge commits by using `--ff-only` + args = ["pull", "--ff-only"] + proc = await asyncio.create_subprocess_exec("git", *args, cwd=repo, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + ret = await proc.wait() + if ret != 0: + stdout, stderr = await proc.communicate() + out = stdout.decode() + stderr.decode() + raise RuntimeError(f"Failed to pull {repo}: {out}") + + +async def status(repo: Path) -> GitStatus: + assert repo.exists() + proc = await asyncio.create_subprocess_exec("git", "status", "--porcelain", cwd=repo, + stdout=subprocess.PIPE) + out, _ = await proc.communicate() + lines = out.decode().splitlines() + untracked: Set[str] = set() + modified: Set[str] = set() + added: Set[str] = set() + for line in lines: + line = line.strip() + if line.startswith("??"): + untracked.add(line.split()[1]) + elif line.startswith("M "): + modified.add(line.split()[1]) + elif line.startswith("A "): + added.add(line.split()[1]) + return GitStatus(modified, untracked, added) + + +async def add(repo: Path, files: Set[str]) -> None: + _logger.debug("Adding %s to %s", files, repo) + assert len(files) > 0 + args = ["add"] + list(files) + proc = await asyncio.create_subprocess_exec("git", *args, cwd=repo, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + ret = await proc.wait() + if ret != 0: + out, err = await proc.communicate() + all_out = out.decode() + err.decode() + raise RuntimeError(f"Failed to add {files} to {repo}. 
Output: {all_out}") + + +async def commit(repo: Path, msg: str) -> None: + _logger.debug("Committing %s with message '%s'", repo, msg) + stat = await status(repo) + assert len(stat.added) > 0 or len(stat.modified) > 0 + args = ["commit", "-m", msg] + proc = await asyncio.create_subprocess_exec("git", *args, cwd=repo, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + ret = await proc.wait() + if ret != 0: + out, err = await proc.communicate() + all_out = out.decode() + err.decode() + raise RuntimeError(f"Failed to commit {repo}. Output: {all_out}") + + +async def push(repo: Path) -> None: + _logger.debug("Pushing to %s", repo) + args = ["push"] + proc = await asyncio.create_subprocess_exec("git", *args, cwd=repo, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + ret = await proc.wait() + if ret != 0: + out, err = await proc.communicate() + all_out = out.decode() + err.decode() + raise RuntimeError(f"Failed to push {repo}. Output: {all_out}") + + +async def init(repo: Path) -> None: + _logger.debug("Initializing git repo in %s", repo) + assert repo.exists() + args = ["init"] + proc = await asyncio.create_subprocess_exec("git", *args, cwd=repo, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + ret = await proc.wait() + if ret != 0: + out, err = await proc.communicate() + all_out = out.decode() + err.decode() + raise RuntimeError(f"Failed to init {repo}. Output: {all_out}") + + +async def head_commit(repo: Path) -> str: + args = ["rev-parse", "HEAD"] + proc = await asyncio.create_subprocess_exec("git", *args, cwd=repo, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + ret = await proc.wait() + out, err = await proc.communicate() + if ret != 0: + raise RuntimeError(f"Failed to get HEAD commit of {repo}: {err.decode()}") + else: + return out.decode().strip() diff --git a/tools/performance/engine-benchmarks/bench_tool/remote_cache.py b/tools/performance/engine-benchmarks/bench_tool/remote_cache.py new file mode 100644 index 000000000000..b6627fb639cd --- /dev/null +++ b/tools/performance/engine-benchmarks/bench_tool/remote_cache.py @@ -0,0 +1,174 @@ +""" +A remote cache is located inhttps://github.com/enso-org/engine-benchmark-results/tree/main/cache. +It is just a bunch of JSON files, each representing a single job report. +""" +import abc +import json +import logging +import os +import re +import tempfile +from pathlib import Path +from typing import Dict, Optional + +from . import gh, JobReport, BENCH_REPO, git + +_logger = logging.getLogger(__name__) + +CACHE_REMOTE_DIR = "cache" +ENGINE_INDEX_HTML = "engine-benchs.html" +STDLIB_INDEX_HTML = "stdlib-benchs.html" + + +class RemoteCache(abc.ABC): + + @abc.abstractmethod + async def fetch(self, bench_id: str) -> Optional[JobReport]: + """ + Fetches a job report for the given bench ID from the remote cache + :param bench_id: + :return: None if the report does not exist + """ + raise NotImplementedError + + @abc.abstractmethod + async def put(self, bench_id: str, job_report: JobReport) -> None: + """ + Puts a job report to the remote cache, or to the internal data structures. + :param bench_id: + :param job_report: + :return: + """ + raise NotImplementedError + + +class ReadonlyRemoteCache(RemoteCache): + """ + Only fetches the artifacts from the remote cache, does not push anything. 
+ """ + + def __init__(self): + self._fetched_items: Dict[str, JobReport] = {} + + async def fetch(self, bench_id: str) -> Optional[JobReport]: + """ Fetches a job report for the given bench ID from the remote cache """ + if bench_id in self._fetched_items: + return self._fetched_items[bench_id] + if not _is_benchrun_id(bench_id): + _logger.warning("Invalid bench ID: %s", bench_id) + return None + remote_path = self._get_remote_path(bench_id) + _logger.debug("Fetching cache from %s", remote_path) + content = await gh.fetch_file(BENCH_REPO, remote_path) + if content is None: + _logger.warning("Cache not found for %s", bench_id) + return None + bench_report = JobReport.from_dict( + json.loads(content) + ) + assert bench_id not in self._fetched_items + self._fetched_items[bench_id] = bench_report + return bench_report + + async def put(self, bench_id: str, job_report: JobReport) -> None: + assert _is_benchrun_id(bench_id) + assert bench_id not in self._fetched_items + self._fetched_items[bench_id] = job_report + + def _get_remote_path(self, bench_id: str) -> str: + assert _is_benchrun_id(bench_id) + return os.path.join(CACHE_REMOTE_DIR, bench_id + ".json") + + +class SyncRemoteCache(RemoteCache): + """ + Fetches and pushes the artifacts to the remote cache. Needs a write permissions to the repo. + """ + + def __init__(self, local_root_dir: Optional[Path] = None): + if local_root_dir is not None: + assert local_root_dir.exists() + assert local_root_dir.is_dir() + assert local_root_dir.joinpath(".git").exists() + self._repo_root_dir = local_root_dir + self._should_clone = False + else: + self._repo_root_dir = Path(tempfile.mkdtemp(prefix="bench_tool_remote_cache")) + self._should_clone = True + assert self._repo_root_dir.exists() + assert self._repo_root_dir.is_dir() + self._cache_dir = self._repo_root_dir.joinpath(CACHE_REMOTE_DIR) + + def repo_root_dir(self) -> Path: + return self._repo_root_dir + + def cache_dir(self) -> Path: + return self._cache_dir + + def engine_index_html(self) -> Path: + return self._repo_root_dir.joinpath(ENGINE_INDEX_HTML) + + def stdlib_index_html(self) -> Path: + return self._repo_root_dir.joinpath(STDLIB_INDEX_HTML) + + async def initialize(self) -> None: + """ + Make sure the repo is up-to-date + :return: + """ + if self._should_clone: + await git.clone(BENCH_REPO, self._repo_root_dir) + else: + await git.pull(self._repo_root_dir) + assert self._repo_root_dir.exists() + assert self._cache_dir.exists() + + async def fetch(self, bench_id: str) -> Optional[JobReport]: + assert self._cache_dir.exists() + path = self._cache_dir.joinpath(bench_id + ".json") + if path.exists(): + with path.open() as f: + return JobReport.from_dict(json.load(f)) + return None + + async def put(self, bench_id: str, job_report: JobReport) -> None: + assert self._cache_dir.exists() + path = self._cache_dir.joinpath(bench_id + ".json") + assert not path.exists() + with path.open("w") as f: + json.dump( + job_report.to_dict(), + f, + ensure_ascii=True, + indent=2 + ) + + async def sync(self) -> None: + """ + Synchronizes the local repo state with upstream. That means, pushes if some untracked or + modified files are in the local directory. 
+ :return: + """ + status = await git.status(self._repo_root_dir) + is_repo_dirty = len(status.modified) > 0 or len(status.added) > 0 + if is_repo_dirty: + _logger.info("Untracked or modified files found in the repo: %s", self._repo_root_dir) + commit_msg = "Regenerate websites" + if len(status.modified) > 0: + _logger.debug("Modified files: %s", status.modified) + await git.add(self._repo_root_dir, status.modified) + if len(status.untracked) > 0: + _logger.debug("Untracked files: %s", status.untracked) + await git.add(self._repo_root_dir, status.untracked) + commit_msg += f" - Add {len(status.untracked)} new reports." + else: + commit_msg += "." + await git.commit(self._repo_root_dir, commit_msg) + await git.push(self._repo_root_dir) + + +def _is_benchrun_id(name: str) -> bool: + return re.match(r"\d{9}", name) is not None + + + diff --git a/tools/performance/engine-benchmarks/bench_tool/requirements.txt b/tools/performance/engine-benchmarks/bench_tool/requirements.txt new file mode 100644 index 000000000000..d9ac381d2764 --- /dev/null +++ b/tools/performance/engine-benchmarks/bench_tool/requirements.txt @@ -0,0 +1,3 @@ + +Jinja2 == 3.1.2 +numpy == 1.24.2 diff --git a/tools/performance/engine-benchmarks/bench_tool/template_render.py b/tools/performance/engine-benchmarks/bench_tool/template_render.py new file mode 100644 index 000000000000..57a626487111 --- /dev/null +++ b/tools/performance/engine-benchmarks/bench_tool/template_render.py @@ -0,0 +1,133 @@ +import logging +from pathlib import Path +from typing import List, Dict, Optional, Set + +import jinja2 +import numpy as np +import pandas as pd + +from bench_tool import JobReport, TemplateBenchData, BenchDatapoint, ENSO_COMMIT_BASE_URL, JinjaData, \ + JINJA_TEMPLATE, TEMPLATES_DIR +from bench_tool.utils import parse_commit_timestamp + +_logger = logging.getLogger(__name__) + + +def create_template_data( + job_reports_per_branch: Dict[str, List[JobReport]], + bench_labels: Set[str]) -> List[TemplateBenchData]: + """ + Creates all the necessary data for the Jinja template from all collected + benchmark job reports. + :param job_reports_per_branch: Mapping of branch name to list of job reports. + job reports should be sorted by the commit date, otherwise the difference + between scores might be wrongly computed. 
+ :param bench_labels: + :return: + """ + + def pct_to_str(score_diff_perc: float) -> str: + if not np.isnan(score_diff_perc): + buff = "+" if score_diff_perc > 0 else "" + buff += "{:.5f}".format(score_diff_perc * 100) + buff += "%" + return buff + else: + return "NaN" + + def diff_str(score_diff: float, score_diff_perc: float) -> str: + if not np.isnan(score_diff): + diff_str = "+" if score_diff > 0 else "" + diff_str += "{:.5f}".format(score_diff) + diff_str += " (" + diff_str += pct_to_str(score_diff_perc) + diff_str += ")" + return diff_str + else: + return "NA" + + template_bench_datas: List[TemplateBenchData] = [] + for bench_label in bench_labels: + _logger.debug("Creating template data for benchmark %s", bench_label) + branch_datapoints: Dict[str, List[BenchDatapoint]] = {} + for branch, job_reports in job_reports_per_branch.items(): + _logger.debug("Creating datapoints for branch %s from %d job reports", + branch, len(job_reports)) + datapoints: List[BenchDatapoint] = [] + for job_report in job_reports: + prev_datapoint: Optional[BenchDatapoint] = \ + datapoints[-1] if len(datapoints) > 0 else None + if bench_label in job_report.label_score_dict: + score = job_report.label_score_dict[bench_label] + commit = job_report.bench_run.head_commit + timestamp = parse_commit_timestamp(commit) + commit_msg_header = \ + commit.message.splitlines()[0].replace('"', "'") + series = pd.Series([ + prev_datapoint.score if prev_datapoint else None, + score + ]) + score_diff = series.diff()[1] + score_diff_perc = series.pct_change()[1] + tooltip = "score = " + str(score) + "\\n" + tooltip += "date = " + str(timestamp) + "\\n" + tooltip += "branch = " + branch + "\\n" + tooltip += "diff = " + diff_str(score_diff, score_diff_perc) + author_name = commit.author.name \ + .replace('"', '\\"') \ + .replace("'", "\\'") + datapoints.append(BenchDatapoint( + timestamp=timestamp, + score=score, + score_diff=str(score_diff), + score_diff_perc=pct_to_str(score_diff_perc), + tooltip=tooltip, + bench_run_url=job_report.bench_run.html_url, + commit_id=commit.id, + commit_msg=commit_msg_header, + commit_author=author_name, + commit_url=ENSO_COMMIT_BASE_URL + commit.id, + )) + _logger.debug("%d datapoints created for branch %s", + len(datapoints), branch) + branch_datapoints[branch] = datapoints.copy() + _logger.debug("Template data for benchmark %s created", bench_label) + template_bench_datas.append(TemplateBenchData( + id=_label_to_id(bench_label), + name=_label_to_name(bench_label), + branches_datapoints=branch_datapoints, + )) + return template_bench_datas + + +def render_html(jinja_data: JinjaData, html_out: Path) -> None: + jinja_env = jinja2.Environment( + loader=jinja2.FileSystemLoader(TEMPLATES_DIR) + ) + template_name = str(JINJA_TEMPLATE.name) + jinja_template = jinja_env.get_template(template_name) + generated_html = jinja_template.render(jinja_data.__dict__) + if html_out.exists(): + _logger.info("%s already exist, rewriting", html_out) + with html_out.open("w") as html_file: + html_file.write(generated_html) + + +def _label_to_id(label: str) -> str: + return label.replace(".", "_") + + +def _label_to_name(label: str) -> str: + items = label.split(".") + assert len(items) >= 2 + filtered_items = \ + [item for item in items if item not in ( + "org", + "enso", + "benchmark", + "benchmarks", + "semantic", + "interpreter", + "bench" + )] + return "_".join(filtered_items) diff --git a/tools/performance/engine-benchmarks/bench_tool/test_bench_results.py 
b/tools/performance/engine-benchmarks/bench_tool/test_bench_results.py new file mode 100644 index 000000000000..0c45ba70ae05 --- /dev/null +++ b/tools/performance/engine-benchmarks/bench_tool/test_bench_results.py @@ -0,0 +1,78 @@ +import json +import unittest +from datetime import datetime + +from bench_tool import ENGINE_BENCH_WORKFLOW_ID, JobReport, JobRun, Commit, \ + Author +from .bench_results import get_bench_report, get_bench_runs +from .remote_cache import ReadonlyRemoteCache +from .utils import parse_commit_timestamp, WithTempDir + +# A single ID for a benchmark run between 2023-05-01 and 2023-05-05 +# We know for sure that this workflow run is on the GH. +BENCH_RUN_ID = "4888453297" + +sample_job_report = JobReport( + label_score_dict={ + "test_label": 1.0 + }, + bench_run=JobRun( + id="123456789", + display_title="Test", + html_url="https://github.com/enso-org/enso/actions/runs/123456789", + run_attempt=1, + event="push", + head_commit=Commit( + id="a67297aebf6a094d1ad0b0d88cf7438dbf8bd8fe", + message="Test commit", + timestamp="2021-06-01T12:00:00Z", + author=Author( + name="Pavel Marek" + ) + ) + ) +) + + +class TestBenchResults(unittest.IsolatedAsyncioTestCase): + def test_job_report_is_serializable(self): + s = json.dumps(sample_job_report.to_dict()) + self.assertIsNotNone(s) + self.assertGreater(len(s), 0) + + def test_job_report_is_deserializable(self): + d = sample_job_report.to_dict() + job_report = JobReport.from_dict(d) + self.assertEqual(sample_job_report, job_report) + + async def test_get_bench_run(self): + """ + Bench run does not need remote cache - it fetches just some metadata about GH artifacts. + :return: + """ + since = datetime.fromisoformat("2023-05-01") + until = datetime.fromisoformat("2023-05-05") + bench_runs = await get_bench_runs(since, until, "develop", ENGINE_BENCH_WORKFLOW_ID) + self.assertEqual(1, len(bench_runs)) + # There is just a single bench run between 2023-05-01 and 2023-05-05 + bench_run = bench_runs[0] + self.assertEqual(BENCH_RUN_ID, bench_run.id) + commit_ts = parse_commit_timestamp(bench_run.head_commit) + self.assertLess(since, commit_ts) + self.assertGreater(until, commit_ts) + + async def test_get_bench_report(self): + # We choose an old date on purpose, so that the remote cache must be used, and is thus + # transitively tested. + since = datetime.fromisoformat("2023-05-01") + until = datetime.fromisoformat("2023-05-05") + bench_runs = await get_bench_runs(since, until, "develop", ENGINE_BENCH_WORKFLOW_ID) + self.assertEqual(1, len(bench_runs)) + bench_run = bench_runs[0] + remote_cache = ReadonlyRemoteCache() + with WithTempDir("test_get_bench_report") as temp_dir: + bench_report = await get_bench_report(bench_run, temp_dir, remote_cache) + self.assertIsNotNone(bench_report) + self.assertEqual(bench_run, bench_report.bench_run) + self.assertEqual(55, len(bench_report.label_score_dict)) + diff --git a/tools/performance/engine-benchmarks/bench_tool/test_gh.py b/tools/performance/engine-benchmarks/bench_tool/test_gh.py new file mode 100644 index 000000000000..1882390fd601 --- /dev/null +++ b/tools/performance/engine-benchmarks/bench_tool/test_gh.py @@ -0,0 +1,34 @@ +import unittest + +from bench_tool import ENSO_REPO, Source +from . 
import gh + + +class TestGH(unittest.IsolatedAsyncioTestCase): + async def test_ensure_gh_installed(self): + self.assertIsNone(gh.ensure_gh_installed()) + + async def test_file_fetch(self): + content = await gh.fetch_file(ENSO_REPO, "README.md") + self.assertIsNotNone(content) + self.assertIsInstance(content, str) + self.assertGreater(len(content), 0) + + async def test_fetch_non_existing_file(self): + content = await gh.fetch_file(ENSO_REPO, "non_existing_file") + self.assertIsNone(content) + + async def test_wrong_gh_query_should_not_fail(self): + res = await gh.invoke_gh_api("non_existing_repo", "/non_existing_endpoint") + self.assertIsNone(res) + + async def test_get_stdlib_bench_run(self): + # This bench run ID does not contain the "Runtime Benchmark Report" artifact name, + # but it is a successful run. There should be a special handling for this case + # https://github.com/enso-org/enso/actions/runs/7909011591 + bench_run_id = "7909011591" + obj = await gh.invoke_gh_api(ENSO_REPO, f"/actions/runs/{bench_run_id}/artifacts") + artifacts = obj["artifacts"] + stdlib_artifact_name = Source.STDLIB.artifact_names()[0] + self.assertEqual(1, len(artifacts)) + self.assertEqual(stdlib_artifact_name, artifacts[0]["name"]) diff --git a/tools/performance/engine-benchmarks/bench_tool/test_git.py b/tools/performance/engine-benchmarks/bench_tool/test_git.py new file mode 100644 index 000000000000..61a635786a0f --- /dev/null +++ b/tools/performance/engine-benchmarks/bench_tool/test_git.py @@ -0,0 +1,59 @@ +import shutil +import tempfile +import unittest +from pathlib import Path + +from . import git + + +class TestGit(unittest.IsolatedAsyncioTestCase): + def setUp(self): + self.repo_root = Path(tempfile.mkdtemp()) + + def tearDown(self): + shutil.rmtree(self.repo_root) + + async def test_init(self): + await git.init(self.repo_root) + status = await git.status(self.repo_root) + self.assertEqual(0, len(status.added)) + self.assertEqual(0, len(status.modified)) + self.assertEqual(0, len(status.untracked)) + + async def test_add_file(self): + await git.init(self.repo_root) + self.repo_root.joinpath("README.md").write_text("Hello") + status = await git.status(self.repo_root) + self.assertEqual(1, len(status.untracked)) + + async def test_commit(self): + await git.init(self.repo_root) + self.repo_root.joinpath("README.md").write_text("Hello") + await git.add(self.repo_root, {"README.md"}) + await git.commit(self.repo_root, "Initial commit") + status = await git.status(self.repo_root) + self.assertEqual(0, len(status.added)) + self.assertEqual(0, len(status.modified)) + self.assertEqual(0, len(status.untracked)) + + async def test_modify_file(self): + await git.init(self.repo_root) + self.repo_root.joinpath("README.md").write_text("Hello") + await git.add(self.repo_root, {"README.md"}) + await git.commit(self.repo_root, "Initial commit") + self.repo_root.joinpath("README.md").write_text("Hello World") + status = await git.status(self.repo_root) + self.assertEqual(0, len(status.added)) + self.assertEqual(1, len(status.modified)) + self.assertEqual(0, len(status.untracked)) + + async def test_add_more_files(self): + await git.init(self.repo_root) + self.repo_root.joinpath("README.md").write_text("Hello") + self.repo_root.joinpath("pom.xml").write_text("") + status = await git.status(self.repo_root) + self.assertEqual(2, len(status.untracked)) + await git.add(self.repo_root, {"README.md", "pom.xml"}) + status = await git.status(self.repo_root) + self.assertEqual(2, len(status.added)) + diff --git 
a/tools/performance/engine-benchmarks/bench_tool/test_remote_cache.py b/tools/performance/engine-benchmarks/bench_tool/test_remote_cache.py new file mode 100644 index 000000000000..18e046c12700 --- /dev/null +++ b/tools/performance/engine-benchmarks/bench_tool/test_remote_cache.py @@ -0,0 +1,114 @@ +import unittest +from pathlib import Path + +from . import JobReport, JobRun, Commit, Author +from .bench_results import fetch_job_reports +from .remote_cache import ReadonlyRemoteCache, SyncRemoteCache + + +sample_job_report = JobReport( + label_score_dict={ + "test_label": 1.0 + }, + bench_run=JobRun( + id="123456789", + display_title="Test", + html_url="https://github.com/enso-org/enso/actions/runs/123456789", + run_attempt=1, + event="push", + head_commit=Commit( + id="a67297aebf6a094d1ad0b0d88cf7438dbf8bd8fe", + message="Test commit", + timestamp="2021-06-01T12:00:00Z", + author=Author( + name="Pavel Marek" + ) + ) + ) +) + +stdlib_bench_run = JobRun( + id='7879611014', + display_title='Benchmark Standard Libraries', + html_url='https://github.com/enso-org/enso/actions/runs/7879611014', + run_attempt=1, + event='schedule', + head_commit=Commit( + id='eb59b475f68146f03fc3cef1092ee56eaaa1600a', + author=Author(name='Radosław Waśko'), + timestamp='2024-02-12T19:04:13Z', + message='Write support for S3 (#8921)\n\n- Closes #8809' + ) +) + + +class TestReadonlyRemoteCache(unittest.IsolatedAsyncioTestCase): + async def test_fetch_some_cache(self): + remote_cache = ReadonlyRemoteCache() + # This ID is definitely in the cache + bench_id = "3686412302" + job_report = await remote_cache.fetch(bench_id) + self.assertIsNotNone(job_report) + self.assertEqual(1, job_report.bench_run.run_attempt) + self.assertEqual(bench_id, job_report.bench_run.id) + self.assertEqual("Jaroslav Tulach", job_report.bench_run.head_commit.author.name) + + async def test_non_existing_cache_should_not_fail(self): + remote_cache = ReadonlyRemoteCache() + bench_id = "FOOOO BAR" + job_report = await remote_cache.fetch(bench_id) + self.assertIsNone(job_report) + + async def test_put_job_report_into_cache(self): + remote_cache = ReadonlyRemoteCache() + bench_id = sample_job_report.bench_run.id + await remote_cache.put(bench_id, sample_job_report) + job_report = await remote_cache.fetch(bench_id) + self.assertIsNotNone(job_report) + self.assertEqual(bench_id, job_report.bench_run.id) + + async def test_fetch_stdlib_report(self): + remote_cache = ReadonlyRemoteCache() + job_reports = await fetch_job_reports([stdlib_bench_run], remote_cache) + self.assertIsNotNone(job_reports) + self.assertEqual(1, len(job_reports)) + + +class TestSyncRemoteCache(unittest.IsolatedAsyncioTestCase): + LOCAL_REPO_ROOT = Path("/home/pavel/dev/engine-benchmark-results") + + async def test_init_sync_remote_cache_from_local_repo(self): + if not self.LOCAL_REPO_ROOT.exists(): + self.skipTest(f"Local repo {self.LOCAL_REPO_ROOT} does not exist") + remote_cache = SyncRemoteCache(self.LOCAL_REPO_ROOT) + await remote_cache.initialize() + root_dir = remote_cache.repo_root_dir() + self.assertTrue(root_dir.exists()) + self.assertTrue(root_dir.is_dir()) + cache_dir = remote_cache.cache_dir() + self.assertTrue(cache_dir.exists()) + self.assertTrue(cache_dir.is_dir()) + self.assertTrue(remote_cache.engine_index_html().exists()) + self.assertTrue(remote_cache.stdlib_index_html().exists()) + + async def test_clone_sync_remote_cache(self): + self.skipTest("TODO: Takes too long") + remote_cache = SyncRemoteCache() + await remote_cache.initialize() +
root_dir = remote_cache.repo_root_dir() + self.assertTrue(root_dir.exists()) + self.assertTrue(root_dir.is_dir()) + cache_dir = remote_cache.cache_dir() + self.assertTrue(cache_dir.exists()) + self.assertTrue(cache_dir.is_dir()) + self.assertTrue(remote_cache.engine_index_html().exists()) + self.assertTrue(remote_cache.stdlib_index_html().exists()) + + async def test_fetch_stdlib_report(self): + if not self.LOCAL_REPO_ROOT.exists(): + self.skipTest(f"Local repo {self.LOCAL_REPO_ROOT} does not exist") + remote_cache = SyncRemoteCache(self.LOCAL_REPO_ROOT) + await remote_cache.initialize() + job_reports = await fetch_job_reports([stdlib_bench_run], remote_cache) + self.assertIsNotNone(job_reports) + self.assertEqual(1, len(job_reports)) diff --git a/tools/performance/engine-benchmarks/bench_tool/test_website_regen.py b/tools/performance/engine-benchmarks/bench_tool/test_website_regen.py new file mode 100644 index 000000000000..567533d4d5d2 --- /dev/null +++ b/tools/performance/engine-benchmarks/bench_tool/test_website_regen.py @@ -0,0 +1,31 @@ +import unittest +from pathlib import Path +from datetime import datetime + +from bench_tool import Source +from bench_tool.remote_cache import SyncRemoteCache +from bench_tool.utils import WithTempDir +from bench_tool.website import generate_bench_website + + +class TestWebsiteRegen(unittest.IsolatedAsyncioTestCase): + LOCAL_REPO_ROOT = Path("/home/pavel/dev/engine-benchmark-results") + + async def test_engine_website_regen(self): + if not self.LOCAL_REPO_ROOT.exists(): + self.skipTest(f"Local repo {self.LOCAL_REPO_ROOT} does not exist") + remote_cache = SyncRemoteCache(self.LOCAL_REPO_ROOT) + # Pull the repo if necessary + await remote_cache.initialize() + since = datetime.fromisoformat("2023-02-01") + until = datetime.fromisoformat("2023-02-25") + with WithTempDir("test_engine_website_regen") as temp_dir: + temp_dir_path = Path(temp_dir) + html_out = temp_dir_path.joinpath("engine-benchs.html") + await generate_bench_website(Source.ENGINE, remote_cache, since, until, html_out) + self.assertTrue(html_out.exists()) + self.assertGreater( + html_out.stat().st_size, 100 * 1024, + "The generated HTML file should have size bigger than 100 KB" + ) + pass diff --git a/tools/performance/engine-benchmarks/bench_tool/utils.py b/tools/performance/engine-benchmarks/bench_tool/utils.py new file mode 100644 index 000000000000..0a04f0784a87 --- /dev/null +++ b/tools/performance/engine-benchmarks/bench_tool/utils.py @@ -0,0 +1,54 @@ +import logging +import shutil +import tempfile +from datetime import datetime +from typing import List, Set + +from bench_tool import JobReport, GH_DATE_FORMAT, Commit + +_logger = logging.getLogger(__name__) + + +class WithTempDir: + def __init__(self, prefix: str): + self.prefix = prefix + self.temp_dir = None + + def __enter__(self): + self.temp_dir = tempfile.mkdtemp(prefix=self.prefix) + return self.temp_dir + + def __exit__(self, exc_type, exc_val, exc_tb): + shutil.rmtree(self.temp_dir, ignore_errors=True) + + +def gather_all_bench_labels(job_reports: List[JobReport]) -> Set[str]: + """ + Iterates through all the job reports and gathers all the benchmark labels + found. Note that every job report can have a different set of benchmark labels. + :return: List of benchmark labels. 
+ """ + all_labels = set() + for job_report in job_reports: + for labels in job_report.label_score_dict.keys(): + all_labels.add(labels) + return all_labels + + +def parse_commit_timestamp(commit: Commit) -> datetime: + """ Parses the timestamp from the commit based on the GH's formatting. """ + return datetime.strptime(commit.timestamp, GH_DATE_FORMAT) + + +def sort_job_reports( + job_reports: List[JobReport] +) -> None: + """ + Sorts the job reports in place by the commit date. + :param job_reports: + :return: + """ + def _get_timestamp(job_report: JobReport) -> datetime: + return parse_commit_timestamp(job_report.bench_run.head_commit) + + job_reports.sort(key=lambda report: _get_timestamp(report)) diff --git a/tools/performance/engine-benchmarks/bench_tool/website.py b/tools/performance/engine-benchmarks/bench_tool/website.py new file mode 100644 index 000000000000..57f6f6da29d3 --- /dev/null +++ b/tools/performance/engine-benchmarks/bench_tool/website.py @@ -0,0 +1,66 @@ +import logging +from datetime import datetime, timedelta +from pathlib import Path +from typing import List, Dict, Set + +from bench_tool import JobRun, BRANCH_DEVELOP, Source, JobReport, TemplateBenchData, JinjaData +from bench_tool.bench_results import get_bench_runs, fetch_job_reports +from bench_tool.remote_cache import SyncRemoteCache +from bench_tool.template_render import create_template_data, render_html +from bench_tool.utils import sort_job_reports, gather_all_bench_labels + +_logger = logging.getLogger(__name__) + + +async def generate_bench_website( + bench_source: Source, + remote_cache: SyncRemoteCache, + since: datetime, + until: datetime, + generated_html: Path +) -> None: + """ + Generates single `index.html` website with the benchmark results. + + :param bench_source: Source of the benchmarks, either engine or stdlib + :param remote_cache: Remote cache used for fetching the job reports. 
+ :param since: Date since when the benchmarks should be considered + :param until: Date until when the benchmarks should be considered + :param generated_html: Path to the generated HTML file + :return: + """ + bench_runs: List[JobRun] = [] + for workflow_id in bench_source.workflow_ids(): + bench_runs.extend( + await get_bench_runs(since, until, BRANCH_DEVELOP, workflow_id) + ) + assert len(bench_runs) > 0, "No benchmark runs found" + + job_reports = await fetch_job_reports(bench_runs, remote_cache) + _logger.debug(f"Gathered {len(job_reports)} job reports") + assert len(job_reports) > 0, "No job reports found" + + _logger.debug("Sorting job_reports by commit date") + sort_job_reports(job_reports) + + all_bench_labels: Set[str] = gather_all_bench_labels(job_reports) + _logger.debug(f"Found {len(all_bench_labels)} unique benchmark labels") + + job_reports_per_branch: Dict[str, List[JobReport]] = { + BRANCH_DEVELOP: job_reports + } + template_bench_datas: List[TemplateBenchData] = \ + create_template_data(job_reports_per_branch, all_bench_labels) + template_bench_datas.sort(key=lambda data: data.id) + + jinja_data = JinjaData( + since=since, + display_since=max(until - timedelta(days=30), since), + until=until, + bench_datas=template_bench_datas, + bench_source=bench_source, + branches=[BRANCH_DEVELOP], + timestamp=datetime.now() + ) + _logger.debug(f"Rendering HTML to {generated_html}") + render_html(jinja_data, generated_html) diff --git a/tools/performance/engine-benchmarks/templates/template_jinja.html b/tools/performance/engine-benchmarks/templates/template_jinja.html index 97311c0d8af8..838f0968c368 100644 --- a/tools/performance/engine-benchmarks/templates/template_jinja.html +++ b/tools/performance/engine-benchmarks/templates/template_jinja.html @@ -280,7 +280,8 @@


- Generated by the bench_download.py script. + Generated by the bench_download.py script at + {{ timestamp }}.
@@ -334,7 +335,10 @@

Applied filters

{% for bench_data in bench_datas %}
-

{{ bench_data.id }}

+ +

+ {{ bench_data.id }} +

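The template hunks above only add a `{{ timestamp }}` placeholder and adjust how `{{ bench_data.id }}` is displayed; the values come from the `JinjaData` object that `render_html` passes to the template as `jinja_data.__dict__`, so every dataclass field name becomes a template variable. Below is a minimal sketch (not part of this diff; it uses an inline template string as a stand-in for the real `templates/template_jinja.html`) of how the new `{{ timestamp }}` placeholder gets resolved:

```python
# Sketch only: mirrors what render_html() does with the real JinjaData and the
# real template file. JinjaDataSketch is a stand-in for bench_tool.JinjaData.
from dataclasses import dataclass
from datetime import datetime

import jinja2


@dataclass
class JinjaDataSketch:
    timestamp: datetime  # field whose value fills the new {{ timestamp }} placeholder


env = jinja2.Environment(loader=jinja2.BaseLoader())
template = env.from_string(
    "Generated by the bench_download.py script at {{ timestamp }}."
)
# render_html() calls jinja_template.render(jinja_data.__dict__) in the same way.
print(template.render(JinjaDataSketch(timestamp=datetime.now()).__dict__))
```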
diff --git a/tools/performance/engine-benchmarks/website_regen.py b/tools/performance/engine-benchmarks/website_regen.py new file mode 100644 index 000000000000..be408fa6f7eb --- /dev/null +++ b/tools/performance/engine-benchmarks/website_regen.py @@ -0,0 +1,66 @@ +""" +IMPORTANT NOTE: Should be run only on the CI!! + +This script regenerates the benchmark results website, hosted as GH web pages on the +https://github.com/enso-org/engine-benchmark-results repo. +""" +import asyncio +import logging +from argparse import ArgumentParser +from datetime import datetime +from pathlib import Path +from typing import Optional + +from bench_tool import Source +from bench_tool.remote_cache import SyncRemoteCache +from bench_tool.website import generate_bench_website + +# The inception dates of the benchmarks, i.e., the dates of the first benchmark runs. +ENGINE_SINCE = datetime.fromisoformat("2022-12-01") +STDLIB_SINCE = datetime.fromisoformat("2023-08-22") + +_logger = logging.getLogger("website_regen") + + +async def main(): + arg_parser = ArgumentParser(description="Regenerate the benchmark results website") + arg_parser.add_argument("-v", "--verbose", action="store_true") + arg_parser.add_argument("-n", "--dry-run", action="store_true") + arg_parser.add_argument("--local-repo", + type=str, + help="Path to the local clone of the engine-benchmark-results repo") + args = arg_parser.parse_args() + dry_run: bool = args.dry_run + verbose: bool = args.verbose + local_repo: Optional[Path] = Path(args.local_repo) if args.local_repo else None + logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO) + _logger.debug(f"Args: dry_run={dry_run}, verbose={verbose}, local_repo={local_repo}") + remote_cache = SyncRemoteCache(local_repo) + _logger.info("Initializing the bench results repo, this might take some time") + await remote_cache.initialize() + _logger.info("Bench results repo initialized") + + now = datetime.now() + engine_html_task = generate_bench_website( + Source.ENGINE, + remote_cache, + ENGINE_SINCE, + now, + remote_cache.engine_index_html() + ) + stdlib_html_task = generate_bench_website( + Source.STDLIB, + remote_cache, + STDLIB_SINCE, + now, + remote_cache.stdlib_index_html() + ) + await asyncio.gather(engine_html_task, stdlib_html_task) + if dry_run: + _logger.info("Dry-run, not syncing the remote cache") + else: + await remote_cache.sync() + + +if __name__ == "__main__": + asyncio.run(main())
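Taken together, `website_regen.py` is a thin CLI wrapper around the `bench_tool` package: it initializes a `SyncRemoteCache`, regenerates both index pages via `generate_bench_website`, and, unless `--dry-run` is given, pushes the result back with `sync()`. For local experiments the same pipeline can be driven with the read-only cache, which needs no write access to the results repo. Below is a minimal sketch, not part of this diff; the date range is an arbitrary example and the GitHub CLI (`gh`) must be installed and authenticated, as `gh.ensure_gh_installed()` checks elsewhere in the package:

```python
# Sketch only: fetch engine benchmark runs for a small date range and resolve their
# reports through the read-only cache, similar to the tests in test_bench_results.py.
import asyncio
from datetime import datetime

from bench_tool import BRANCH_DEVELOP, ENGINE_BENCH_WORKFLOW_ID
from bench_tool.bench_results import fetch_job_reports, get_bench_runs
from bench_tool.remote_cache import ReadonlyRemoteCache
from bench_tool.utils import gather_all_bench_labels, sort_job_reports


async def main() -> None:
    # Arbitrary example range; test_bench_results.py uses the same one.
    since = datetime.fromisoformat("2023-05-01")
    until = datetime.fromisoformat("2023-05-05")
    bench_runs = await get_bench_runs(since, until, BRANCH_DEVELOP, ENGINE_BENCH_WORKFLOW_ID)
    # ReadonlyRemoteCache keeps fetched reports in memory only, so nothing is
    # written back to the engine-benchmark-results repo.
    job_reports = await fetch_job_reports(bench_runs, ReadonlyRemoteCache())
    sort_job_reports(job_reports)
    labels = gather_all_bench_labels(job_reports)
    print(f"Fetched {len(job_reports)} report(s) covering {len(labels)} benchmark label(s)")


if __name__ == "__main__":
    asyncio.run(main())
```

On CI, the same generation step runs through `SyncRemoteCache` instead, so the regenerated `index.html` files and any newly cached reports are committed and pushed by `sync()`.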