From 180f19a8fb31bf12823673b685ebf7039c7f8009 Mon Sep 17 00:00:00 2001
From: Jake Lishman <jake.lishman@ibm.com>
Date: Wed, 6 Sep 2023 20:50:06 +0100
Subject: [PATCH 1/2] Use `SmallVec` in `NeighborTable` for cache locality
 (#10784)

* Use `SmallVec` in `NeighborTable` for cache locality

A reasonable chunk of our time in Sabre is spent reading through the
`NeighborTable` to find the candidate swaps for a given layout.  Most
coupling maps that we care about have a relatively low number of edges
between qubits, yet we currently need to redirect to the heap for each
individual physical-qubit lookup.

This switches from using a `Vec` (which is always a fat pointer to heap
memory) to `SmallVec` with an inline buffer space of four qubits.
With the qubit type being `u32`, the `SmallVec` now takes up the same
stack size as a `Vec` but can store (usually) all the swaps directly
inline in the outer `Vec` of qubits.  This means that most lookups of
the available swaps are looking in the same region (up to relatively
small offsets) of memory, which makes the access patterns much easier
for prefetching to optimise for.

* Pickle via `PyList` instead of duplicate conversion

`SmallVec` doesn't have implementations of the PyO3 conversion trait, so
it needs to be done manually.  The previous state used to convert to a
Rust-space `Vec` that then needed to have its data moved from the Python
heap to the Rust heap.  This instead changes the conversions to interact
directly with Python lists, rather than using intermediary structures.
---
 Cargo.lock                                    |  1 +
 crates/accelerate/Cargo.toml                  |  4 ++
 .../src/sabre_swap/neighbor_table.rs          | 60 +++++++++++++------
 3 files changed, 46 insertions(+), 19 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 97ca32bae717..78421193ca79 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -442,6 +442,7 @@ dependencies = [
  "rand_pcg",
  "rayon",
  "rustworkx-core",
+ "smallvec",
 ]
 
 [[package]]
diff --git a/crates/accelerate/Cargo.toml b/crates/accelerate/Cargo.toml
index a0b59e6460de..a5db75b52102 100644
--- a/crates/accelerate/Cargo.toml
+++ b/crates/accelerate/Cargo.toml
@@ -25,6 +25,10 @@ num-complex = "0.4"
 num-bigint = "0.4"
 rustworkx-core = "0.13"
 
+[dependencies.smallvec]
+version = "1.11"
+features = ["union"]
+
 [dependencies.pyo3]
 workspace = true
 features = ["hashbrown", "indexmap", "num-complex", "num-bigint"]
diff --git a/crates/accelerate/src/sabre_swap/neighbor_table.rs b/crates/accelerate/src/sabre_swap/neighbor_table.rs
index 6cb44536dc0e..577466b514db 100644
--- a/crates/accelerate/src/sabre_swap/neighbor_table.rs
+++ b/crates/accelerate/src/sabre_swap/neighbor_table.rs
@@ -14,8 +14,10 @@ use crate::getenv_use_multiple_threads;
 use ndarray::prelude::*;
 use numpy::PyReadonlyArray2;
 use pyo3::prelude::*;
+use pyo3::types::PyList;
 use rayon::prelude::*;
 use rustworkx_core::petgraph::prelude::*;
+use smallvec::SmallVec;
 
 use crate::nlayout::PhysicalQubit;
 
@@ -32,7 +34,11 @@ use crate::nlayout::PhysicalQubit;
 #[pyclass(module = "qiskit._accelerate.sabre_swap")]
 #[derive(Clone, Debug)]
 pub struct NeighborTable {
-    neighbors: Vec<Vec<PhysicalQubit>>,
+    // The choice of 4 `PhysicalQubit`s in the stack-allocated region is because a) this causes the
+    // `SmallVec<T>` to be the same width as a `Vec` on 64-bit systems (three machine words == 24
+    // bytes); b) the majority of coupling maps we're likely to encounter have a degree of 3 (heavy
+    // hex) or 4 (grid / heavy square).
+    neighbors: Vec<SmallVec<[PhysicalQubit; 4]>>,
 }
 
 impl NeighborTable {
@@ -63,21 +69,22 @@ impl NeighborTable {
         let neighbors = match adjacency_matrix {
             Some(adjacency_matrix) => {
                 let adj_mat = adjacency_matrix.as_array();
-                let build_neighbors = |row: ArrayView1<f64>| -> PyResult<Vec<PhysicalQubit>> {
-                    row.iter()
-                        .enumerate()
-                        .filter_map(|(row_index, value)| {
-                            if *value == 0. {
-                                None
-                            } else {
-                                Some(match row_index.try_into() {
-                                    Ok(index) => Ok(PhysicalQubit::new(index)),
-                                    Err(err) => Err(err.into()),
-                                })
-                            }
-                        })
-                        .collect()
-                };
+                let build_neighbors =
+                    |row: ArrayView1<f64>| -> PyResult<SmallVec<[PhysicalQubit; 4]>> {
+                        row.iter()
+                            .enumerate()
+                            .filter_map(|(row_index, value)| {
+                                if *value == 0. {
+                                    None
+                                } else {
+                                    Some(match row_index.try_into() {
+                                        Ok(index) => Ok(PhysicalQubit::new(index)),
+                                        Err(err) => Err(err.into()),
+                                    })
+                                }
+                            })
+                            .collect()
+                    };
                 if run_in_parallel {
                     adj_mat
                         .axis_iter(Axis(0))
@@ -96,11 +103,26 @@ impl NeighborTable {
         Ok(NeighborTable { neighbors })
     }
 
-    fn __getstate__(&self) -> Vec<Vec<PhysicalQubit>> {
-        self.neighbors.clone()
+    fn __getstate__(&self, py: Python<'_>) -> Py<PyList> {
+        PyList::new(
+            py,
+            self.neighbors
+                .iter()
+                .map(|v| PyList::new(py, v.iter()).to_object(py)),
+        )
+        .into()
     }
 
-    fn __setstate__(&mut self, state: Vec<Vec<PhysicalQubit>>) {
+    fn __setstate__(&mut self, state: &PyList) -> PyResult<()> {
         self.neighbors = state
+            .iter()
+            .map(|v| {
+                v.downcast::<PyList>()?
+                    .iter()
+                    .map(PyAny::extract)
+                    .collect::<PyResult<_>>()
+            })
+            .collect::<PyResult<_>>()?;
+        Ok(())
     }
 }

From 7252db3723c286f6c4b4865c654061911ff33e3b Mon Sep 17 00:00:00 2001
From: Jake Lishman <jake.lishman@ibm.com>
Date: Wed, 6 Sep 2023 20:50:33 +0100
Subject: [PATCH 2/2] Reuse scratch space for Sabre best-swap choice (#10783)

* Reuse scratch space for Sabre best-swap choice

This re-uses the same growable scratch space for storing the temporary
swaps for each choice.  This particular space typically needed to grow
several times for each swap choice, which was taking up a non-negligible
amount of the time we spent in Rust space, especially for large circuits
on big coupling maps.

* Add comment on scratch space
---
 crates/accelerate/src/sabre_swap/mod.rs | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/crates/accelerate/src/sabre_swap/mod.rs b/crates/accelerate/src/sabre_swap/mod.rs
index 792313699bfa..2a7c7317f650 100644
--- a/crates/accelerate/src/sabre_swap/mod.rs
+++ b/crates/accelerate/src/sabre_swap/mod.rs
@@ -383,6 +383,10 @@ fn swap_map_trial(
     // Main logic loop; the front layer only becomes empty when all nodes have been routed.  At
     // each iteration of this loop, we route either one or two gates.
     let mut routable_nodes = Vec::<NodeIndex>::with_capacity(2);
+    // Reusable allocated storage space for choosing the best swap.  This is owned outside of the
+    // `choose_best_swap` function so that we don't need to reallocate and then re-grow the
+    // collection on every entry.
+    let mut swap_scratch = Vec::<[VirtualQubit; 2]>::new();
     while !front_layer.is_empty() {
         let mut current_swaps: Vec<[VirtualQubit; 2]> = Vec::new();
         // Swap-mapping loop.  This is the main part of the algorithm, which we repeat until we
@@ -397,6 +401,7 @@ fn swap_map_trial(
                 &qubits_decay,
                 heuristic,
                 &mut rng,
+                &mut swap_scratch,
             );
             front_layer.routable_after(&mut routable_nodes, &best_swap, &layout, coupling_graph);
             current_swaps.push(best_swap);
@@ -688,9 +693,10 @@ fn choose_best_swap(
     qubits_decay: &[f64],
     heuristic: &Heuristic,
     rng: &mut Pcg64Mcg,
+    best_swaps: &mut Vec<[VirtualQubit; 2]>,
 ) -> [VirtualQubit; 2] {
+    best_swaps.clear();
     let mut min_score = f64::MAX;
-    let mut best_swaps: Vec<[VirtualQubit; 2]> = Vec::new();
     // The decay heuristic is the only one that actually needs the absolute score.
     let absolute_score = match heuristic {
         Heuristic::Decay => {