diff --git a/Cargo.lock b/Cargo.lock
index 97ca32bae717..78421193ca79 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -442,6 +442,7 @@ dependencies = [
  "rand_pcg",
  "rayon",
  "rustworkx-core",
+ "smallvec",
 ]
 
 [[package]]
diff --git a/crates/accelerate/Cargo.toml b/crates/accelerate/Cargo.toml
index a0b59e6460de..a5db75b52102 100644
--- a/crates/accelerate/Cargo.toml
+++ b/crates/accelerate/Cargo.toml
@@ -25,6 +25,10 @@ num-complex = "0.4"
 num-bigint = "0.4"
 rustworkx-core = "0.13"
 
+[dependencies.smallvec]
+version = "1.11"
+features = ["union"]
+
 [dependencies.pyo3]
 workspace = true
 features = ["hashbrown", "indexmap", "num-complex", "num-bigint"]
diff --git a/crates/accelerate/src/sabre_swap/mod.rs b/crates/accelerate/src/sabre_swap/mod.rs
index 792313699bfa..2a7c7317f650 100644
--- a/crates/accelerate/src/sabre_swap/mod.rs
+++ b/crates/accelerate/src/sabre_swap/mod.rs
@@ -383,6 +383,10 @@ fn swap_map_trial(
     // Main logic loop; the front layer only becomes empty when all nodes have been routed.  At
     // each iteration of this loop, we route either one or two gates.
     let mut routable_nodes = Vec::<NodeIndex>::with_capacity(2);
+    // Reusable allocated storage space for choosing the best swap.  This is owned outside of the
+    // `choose_best_swap` function so that we don't need to reallocate and then re-grow the
+    // collection on every entry.
+    let mut swap_scratch = Vec::<[VirtualQubit; 2]>::new();
     while !front_layer.is_empty() {
         let mut current_swaps: Vec<[VirtualQubit; 2]> = Vec::new();
         // Swap-mapping loop.  This is the main part of the algorithm, which we repeat until we
@@ -397,6 +401,7 @@ fn swap_map_trial(
                 &qubits_decay,
                 heuristic,
                 &mut rng,
+                &mut swap_scratch,
             );
             front_layer.routable_after(&mut routable_nodes, &best_swap, &layout, coupling_graph);
             current_swaps.push(best_swap);
@@ -688,9 +693,10 @@ fn choose_best_swap(
     qubits_decay: &[f64],
     heuristic: &Heuristic,
     rng: &mut Pcg64Mcg,
+    best_swaps: &mut Vec<[VirtualQubit; 2]>,
 ) -> [VirtualQubit; 2] {
+    best_swaps.clear();
     let mut min_score = f64::MAX;
-    let mut best_swaps: Vec<[VirtualQubit; 2]> = Vec::new();
     // The decay heuristic is the only one that actually needs the absolute score.
     let absolute_score = match heuristic {
         Heuristic::Decay => {
diff --git a/crates/accelerate/src/sabre_swap/neighbor_table.rs b/crates/accelerate/src/sabre_swap/neighbor_table.rs
index 6cb44536dc0e..577466b514db 100644
--- a/crates/accelerate/src/sabre_swap/neighbor_table.rs
+++ b/crates/accelerate/src/sabre_swap/neighbor_table.rs
@@ -14,8 +14,10 @@ use crate::getenv_use_multiple_threads;
 use ndarray::prelude::*;
 use numpy::PyReadonlyArray2;
 use pyo3::prelude::*;
+use pyo3::types::PyList;
 use rayon::prelude::*;
 use rustworkx_core::petgraph::prelude::*;
+use smallvec::SmallVec;
 
 use crate::nlayout::PhysicalQubit;
 
@@ -32,7 +34,11 @@ use crate::nlayout::PhysicalQubit;
 #[pyclass(module = "qiskit._accelerate.sabre_swap")]
 #[derive(Clone, Debug)]
 pub struct NeighborTable {
-    neighbors: Vec<Vec<PhysicalQubit>>,
+    // The choice of 4 `PhysicalQubit`s in the inline (non-spilled) region is because a) this makes
+    // the `SmallVec` the same width as a `Vec` on 64-bit systems (three machine words == 24
+    // bytes); b) the majority of coupling maps we're likely to encounter have a degree of 3 (heavy
+    // hex) or 4 (grid / heavy square).
+    neighbors: Vec<SmallVec<[PhysicalQubit; 4]>>,
 }
 
 impl NeighborTable {
@@ -63,21 +69,22 @@ impl NeighborTable {
         let neighbors = match adjacency_matrix {
             Some(adjacency_matrix) => {
                 let adj_mat = adjacency_matrix.as_array();
-                let build_neighbors = |row: ArrayView1<f64>| -> PyResult<Vec<PhysicalQubit>> {
-                    row.iter()
-                        .enumerate()
-                        .filter_map(|(row_index, value)| {
-                            if *value == 0. {
-                                None
-                            } else {
-                                Some(match row_index.try_into() {
-                                    Ok(index) => Ok(PhysicalQubit::new(index)),
-                                    Err(err) => Err(err.into()),
-                                })
-                            }
-                        })
-                        .collect()
-                };
+                let build_neighbors =
+                    |row: ArrayView1<f64>| -> PyResult<SmallVec<[PhysicalQubit; 4]>> {
+                        row.iter()
+                            .enumerate()
+                            .filter_map(|(row_index, value)| {
+                                if *value == 0. {
+                                    None
+                                } else {
+                                    Some(match row_index.try_into() {
+                                        Ok(index) => Ok(PhysicalQubit::new(index)),
+                                        Err(err) => Err(err.into()),
+                                    })
+                                }
+                            })
+                            .collect()
+                    };
                 if run_in_parallel {
                     adj_mat
                         .axis_iter(Axis(0))
@@ -96,11 +103,26 @@ impl NeighborTable {
         Ok(NeighborTable { neighbors })
     }
 
-    fn __getstate__(&self) -> Vec<Vec<PhysicalQubit>> {
-        self.neighbors.clone()
+    fn __getstate__(&self, py: Python<'_>) -> Py<PyList> {
+        PyList::new(
+            py,
+            self.neighbors
+                .iter()
+                .map(|v| PyList::new(py, v.iter()).to_object(py)),
+        )
+        .into()
     }
 
-    fn __setstate__(&mut self, state: Vec<Vec<PhysicalQubit>>) {
+    fn __setstate__(&mut self, state: &PyList) -> PyResult<()> {
         self.neighbors = state
+            .iter()
+            .map(|v| {
+                v.downcast::<PyList>()?
+                    .iter()
+                    .map(PyAny::extract)
+                    .collect::<PyResult<_>>()
+            })
+            .collect::<PyResult<_>>()?;
+        Ok(())
     }
 }