diff --git a/Cargo.lock b/Cargo.lock
index 35d786f10407..c08aea5b98ce 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -415,6 +415,7 @@ dependencies = [
  "ndarray",
  "num-bigint",
  "num-complex",
+ "num-traits",
  "numpy",
  "pyo3",
  "rand",
diff --git a/crates/accelerate/Cargo.toml b/crates/accelerate/Cargo.toml
index f50d9deb594d..a1bcb7ed0532 100644
--- a/crates/accelerate/Cargo.toml
+++ b/crates/accelerate/Cargo.toml
@@ -21,6 +21,7 @@ rand = "0.8"
 rand_pcg = "0.3"
 rand_distr = "0.4.3"
 ahash = "0.8.6"
+num-traits = "0.2"
 num-complex = "0.4"
 num-bigint = "0.4"
 rustworkx-core = "0.13"
diff --git a/crates/accelerate/src/sparse_pauli_op.rs b/crates/accelerate/src/sparse_pauli_op.rs
index 7ed3beb4dd82..aa97dd64b180 100644
--- a/crates/accelerate/src/sparse_pauli_op.rs
+++ b/crates/accelerate/src/sparse_pauli_op.rs
@@ -10,13 +10,16 @@
 // copyright notice, and modified files need to carry a notice indicating
 // that they have been altered from the originals.
 
+use pyo3::exceptions::PyValueError;
 use pyo3::prelude::*;
 use pyo3::wrap_pyfunction;
 use pyo3::Python;
 
 use hashbrown::HashMap;
-use ndarray::{ArrayView1, Axis};
-use numpy::{IntoPyArray, PyReadonlyArray2};
+use ndarray::{s, Array1, Array2, ArrayView1, ArrayView2, Axis};
+use num_complex::Complex64;
+use num_traits::Zero;
+use numpy::{IntoPyArray, PyArray1, PyArray2, PyReadonlyArray2};
 
 /// Find the unique elements of an array.
 ///
@@ -60,8 +63,204 @@ pub fn unordered_unique(py: Python, array: PyReadonlyArray2<u16>) -> (PyObject,
     )
 }
 
+#[derive(Clone, Copy)]
+enum Pauli {
+    I,
+    X,
+    Y,
+    Z,
+}
+
+/// A complete ZX-convention representation of a Pauli decomposition.  This is all the components
+/// necessary to construct a Qiskit-space :class:`.SparsePauliOp`, where :attr:`phases` is in the
+/// ZX convention.
+#[pyclass(module = "qiskit._accelerate.sparse_pauli_op")]
+pub struct ZXPaulis {
+    #[pyo3(get)]
+    pub z: Py<PyArray2<bool>>,
+    #[pyo3(get)]
+    pub x: Py<PyArray2<bool>>,
+    #[pyo3(get)]
+    pub phases: Py<PyArray1<u8>>,
+    #[pyo3(get)]
+    pub coeffs: Py<PyArray1<Complex64>>,
+}
+
+/// Decompose a dense complex operator into the symplectic Pauli representation in the
+/// ZX-convention.
+///
+/// This is an implementation of the "tensorized Pauli decomposition" presented in
+/// `Hantzko, Binkowski and Gupta (2023) <https://arxiv.org/abs/2310.13421>`__.
+#[pyfunction]
+pub fn decompose_dense(
+    py: Python,
+    operator: PyReadonlyArray2<Complex64>,
+    tolerance: f64,
+) -> PyResult<ZXPaulis> {
+    let num_qubits = operator.shape()[0].ilog2() as usize;
+    let size = 1 << num_qubits;
+    if operator.shape() != [size, size] {
+        return Err(PyValueError::new_err(format!(
+            "input with shape {:?} cannot be interpreted as a multiqubit operator",
+            operator.shape()
+        )));
+    }
+    let mut paulis = vec![];
+    let mut coeffs = vec![];
+    if num_qubits > 0 {
+        decompose_dense_inner(
+            Complex64::new(1.0, 0.0),
+            num_qubits,
+            &[],
+            operator.as_array(),
+            &mut paulis,
+            &mut coeffs,
+            tolerance * tolerance,
+        );
+    }
+    if coeffs.is_empty() {
+        Ok(ZXPaulis {
+            z: PyArray2::zeros(py, [0, num_qubits], false).into(),
+            x: PyArray2::zeros(py, [0, num_qubits], false).into(),
+            phases: PyArray1::zeros(py, [0], false).into(),
+            coeffs: PyArray1::zeros(py, [0], false).into(),
+        })
+    } else {
+        // Constructing several arrays of different shapes at once is rather awkward in iterator
+        // logic, so we just loop manually.
+        let mut z = Array2::<bool>::uninit([paulis.len(), num_qubits]);
+        let mut x = Array2::<bool>::uninit([paulis.len(), num_qubits]);
+        let mut phases = Array1::<u8>::uninit(paulis.len());
+        for (i, paulis) in paulis.drain(..).enumerate() {
+            let mut phase = 0u8;
+            for (j, pauli) in paulis.into_iter().rev().enumerate() {
+                match pauli {
+                    Pauli::I => {
+                        z[[i, j]].write(false);
+                        x[[i, j]].write(false);
+                    }
+                    Pauli::X => {
+                        z[[i, j]].write(false);
+                        x[[i, j]].write(true);
+                    }
+                    Pauli::Y => {
+                        z[[i, j]].write(true);
+                        x[[i, j]].write(true);
+                        phase = phase.wrapping_add(1);
+                    }
+                    Pauli::Z => {
+                        z[[i, j]].write(true);
+                        x[[i, j]].write(false);
+                    }
+                }
+            }
+            phases[i].write(phase % 4);
+        }
+        // These are safe because the above loops write into every element.  It's guaranteed that
+        // each of the elements of the `paulis` vec will have `num_qubits` because they're all
+        // reading from the same base array.
+        let z = unsafe { z.assume_init() };
+        let x = unsafe { x.assume_init() };
+        let phases = unsafe { phases.assume_init() };
+        Ok(ZXPaulis {
+            z: z.into_pyarray(py).into(),
+            x: x.into_pyarray(py).into(),
+            phases: phases.into_pyarray(py).into(),
+            coeffs: PyArray1::from_vec(py, coeffs).into(),
+        })
+    }
+}
+
+/// Recurse worker routine of `decompose_dense`.  Should be called with at least one qubit.
+fn decompose_dense_inner(
+    factor: Complex64,
+    num_qubits: usize,
+    paulis: &[Pauli],
+    block: ArrayView2<Complex64>,
+    out_paulis: &mut Vec<Vec<Pauli>>,
+    out_coeffs: &mut Vec<Complex64>,
+    square_tolerance: f64,
+) {
+    if num_qubits == 0 {
+        // It would be safe to `return` here, but if it's unreachable then LLVM is allowed to
+        // optimise out this branch entirely in release mode, which is good for a ~2% speedup.
+        unreachable!("should not call this with an empty operator")
+    }
+    // Base recursion case.
+    if num_qubits == 1 {
+        let mut push_if_nonzero = |extra: Pauli, value: Complex64| {
+            if value.norm_sqr() <= square_tolerance {
+                return;
+            }
+            let paulis = {
+                let mut vec = Vec::with_capacity(paulis.len() + 1);
+                vec.extend_from_slice(paulis);
+                vec.push(extra);
+                vec
+            };
+            out_paulis.push(paulis);
+            out_coeffs.push(value);
+        };
+        push_if_nonzero(Pauli::I, 0.5 * factor * (block[[0, 0]] + block[[1, 1]]));
+        push_if_nonzero(Pauli::X, 0.5 * factor * (block[[0, 1]] + block[[1, 0]]));
+        push_if_nonzero(
+            Pauli::Y,
+            0.5 * Complex64::i() * factor * (block[[0, 1]] - block[[1, 0]]),
+        );
+        push_if_nonzero(Pauli::Z, 0.5 * factor * (block[[0, 0]] - block[[1, 1]]));
+        return;
+    }
+    let mut recurse_if_nonzero = |extra: Pauli, factor: Complex64, values: Array2<Complex64>| {
+        let mut is_zero = true;
+        for value in values.iter() {
+            if !value.is_zero() {
+                is_zero = false;
+                break;
+            }
+        }
+        if is_zero {
+            return;
+        }
+        let mut new_paulis = Vec::with_capacity(paulis.len() + 1);
+        new_paulis.extend_from_slice(paulis);
+        new_paulis.push(extra);
+        decompose_dense_inner(
+            factor,
+            num_qubits - 1,
+            &new_paulis,
+            values.view(),
+            out_paulis,
+            out_coeffs,
+            square_tolerance,
+        );
+    };
+    let mid = 1usize << (num_qubits - 1);
+    recurse_if_nonzero(
+        Pauli::I,
+        0.5 * factor,
+        &block.slice(s![..mid, ..mid]) + &block.slice(s![mid.., mid..]),
+    );
+    recurse_if_nonzero(
+        Pauli::X,
+        0.5 * factor,
+        &block.slice(s![..mid, mid..]) + &block.slice(s![mid.., ..mid]),
+    );
+    recurse_if_nonzero(
+        Pauli::Y,
+        0.5 * Complex64::i() * factor,
+        &block.slice(s![..mid, mid..]) - &block.slice(s![mid.., ..mid]),
+    );
+    recurse_if_nonzero(
+        Pauli::Z,
+        0.5 * factor,
+        &block.slice(s![..mid, ..mid]) - &block.slice(s![mid.., mid..]),
+    );
+}
+
 #[pymodule]
 pub fn sparse_pauli_op(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_wrapped(wrap_pyfunction!(unordered_unique))?;
+    m.add_wrapped(wrap_pyfunction!(decompose_dense))?;
+    m.add_class::<ZXPaulis>()?;
     Ok(())
 }
diff --git a/qiskit/quantum_info/operators/symplectic/sparse_pauli_op.py b/qiskit/quantum_info/operators/symplectic/sparse_pauli_op.py
index 65938410c28c..84c1e09636e4 100644
--- a/qiskit/quantum_info/operators/symplectic/sparse_pauli_op.py
+++ b/qiskit/quantum_info/operators/symplectic/sparse_pauli_op.py
@@ -24,7 +24,7 @@
 import numpy as np
 import rustworkx as rx
 
-from qiskit._accelerate.sparse_pauli_op import unordered_unique
+from qiskit._accelerate.sparse_pauli_op import unordered_unique, decompose_dense
 from qiskit.circuit.parameter import Parameter
 from qiskit.circuit.parameterexpression import ParameterExpression
 from qiskit.circuit.parametertable import ParameterView
@@ -35,7 +35,6 @@
 from qiskit.quantum_info.operators.operator import Operator
 from qiskit.quantum_info.operators.symplectic.pauli import BasePauli
 from qiskit.quantum_info.operators.symplectic.pauli_list import PauliList
-from qiskit.quantum_info.operators.symplectic.pauli_utils import pauli_basis
 from qiskit.quantum_info.operators.symplectic.pauli import Pauli
 
 
@@ -131,8 +130,8 @@ def __init__(
 
         pauli_list = PauliList(data.copy() if copy and hasattr(data, "copy") else data)
 
-        if isinstance(coeffs, np.ndarray) and coeffs.dtype == object:
-            dtype = object
+        if isinstance(coeffs, np.ndarray):
+            dtype = object if coeffs.dtype == object else complex
         elif coeffs is not None:
             if not isinstance(coeffs, (np.ndarray, Sequence)):
                 coeffs = [coeffs]
@@ -727,15 +726,20 @@ def from_operator(
     ) -> SparsePauliOp:
         """Construct from an Operator objector.
 
-        Note that the cost of this construction is exponential as it involves
-        taking inner products with every element of the N-qubit Pauli basis.
+        Note that the cost of this construction is exponential in general because the number of
+        possible Pauli terms in the decomposition is exponential in the number of qubits.
+
+        Internally this uses an implementation of the "tensorized Pauli decomposition" presented in
+        `Hantzko, Binkowski and Gupta (2023) <https://arxiv.org/abs/2310.13421>`__.
 
         Args:
             obj (Operator): an N-qubit operator.
-            atol (float): Optional. Absolute tolerance for checking if
-                          coefficients are zero (Default: 1e-8).
-            rtol (float): Optional. relative tolerance for checking if
-                          coefficients are zero (Default: 1e-5).
+            atol (float): Optional. Absolute tolerance for checking if coefficients are zero
+                (Default: 1e-8).  Since the comparison is to zero, in effect the tolerance used is
+                the maximum of ``atol`` and ``rtol``.
+            rtol (float): Optional. relative tolerance for checking if coefficients are zero
+                (Default: 1e-5).  Since the comparison is to zero, in effect the tolerance used is
+                the maximum of ``atol`` and ``rtol``.
 
         Returns:
             SparsePauliOp: the SparsePauliOp representation of the operator.
@@ -748,6 +752,7 @@ def from_operator(
             atol = SparsePauliOp.atol
         if rtol is None:
             rtol = SparsePauliOp.rtol
+        tol = max((atol, rtol))
 
         if not isinstance(obj, Operator):
             obj = Operator(obj)
@@ -756,24 +761,11 @@ def from_operator(
         num_qubits = obj.num_qubits
         if num_qubits is None:
             raise QiskitError("Input Operator is not an N-qubit operator.")
-        data = obj.data
-
-        # Index of non-zero basis elements
-        inds = []
-        # Non-zero coefficients
-        coeffs = []
-        # Non-normalized basis factor
-        denom = 2**num_qubits
-        # Compute coefficients from basis
-        basis = pauli_basis(num_qubits)
-        for i, mat in enumerate(basis.matrix_iter()):
-            coeff = np.trace(mat.dot(data)) / denom
-            if not np.isclose(coeff, 0, atol=atol, rtol=rtol):
-                inds.append(i)
-                coeffs.append(coeff)
-        # Get Non-zero coeff PauliList terms
-        paulis = basis[inds]
-        return SparsePauliOp(paulis, coeffs, copy=False)
+        zx_paulis = decompose_dense(obj.data, tol)
+        # Indirection through `BasePauli` is because we're already supplying the phase in the ZX
+        # convention.
+        pauli_list = PauliList(BasePauli(zx_paulis.z, zx_paulis.x, zx_paulis.phases))
+        return SparsePauliOp(pauli_list, zx_paulis.coeffs, ignore_pauli_phase=True, copy=False)
 
     @staticmethod
     def from_list(
diff --git a/releasenotes/notes/fast-sparse-pauli-operator-b41cacf11e8c4e0e.yaml b/releasenotes/notes/fast-sparse-pauli-operator-b41cacf11e8c4e0e.yaml
new file mode 100644
index 000000000000..625231d2fb10
--- /dev/null
+++ b/releasenotes/notes/fast-sparse-pauli-operator-b41cacf11e8c4e0e.yaml
@@ -0,0 +1,8 @@
+---
+features:
+  - |
+    :meth:`.SparsePauliOp.from_operator` now uses an implementation of the
+    "tensorized Pauli decomposition algorithm" presented in 
+    Hatznko, Binkowski and Gupta (2023) <https://arxiv.org/abs/2310.13421>`__.  The method is now
+    several orders of magnitude faster; for example, it is possible to decompose a random
+    10-qubit operator in around 250ms on a consumer Macbook Pro (Intel i7, 2020).