diff --git a/CHANGELOG.md b/CHANGELOG.md
index 514a6636..075de2e9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,7 +13,11 @@
 - Add the support for linear shape-cast (`query::time_of_impact`) for heightfields.
 - Make the convex polyhedron scaling more forgiving regarding normals to avoid frequent unjustified panics.
 - Fix panic happening when building a convex polyhedron with empty inputs.
-
+- Add the support for Heightfields on CUDA kernels written in Rust using the `cust` crate.
+- Add the `rkyv-serialize` feature that enables the implementation of `rkyv` serialization/deserialization
+  for most shapes.
+- Add the `parallel` feature that enables methods for the parallel traversal of QBVH trees: `QBVH::traverse_bvtt_parallel`,
+  `QBVH::traverse_bvtt_node_parallel`, `QBVH::traverse_depth_first_parallel`, `QBVH::traverse_depth_first_node_parallel`.
 
 ### Fixed
 - Fix the application of non-uniform scaling to balls.
diff --git a/Cargo.toml b/Cargo.toml
index 6f4f0f82..904c7f75 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,4 +8,6 @@ parry3d = { path = "crates/parry3d" }
 parry2d-f64 = { path = "crates/parry2d-f64" }
 parry3d-f64 = { path = "crates/parry3d-f64" }
 
+#simba = { path = "../simba" }
+#simba = { git = "https://github.com/dimforge/simba", rev = "b1392df62a0f4cf91e397bbb6bd41b7731afb6ab" }
 # nalgebra = { git = "https://github.com/dimforge/nalgebra" }
\ No newline at end of file
diff --git a/crates/parry2d-f64/Cargo.toml b/crates/parry2d-f64/Cargo.toml
index 1457a2c4..dc151597 100644
--- a/crates/parry2d-f64/Cargo.toml
+++ b/crates/parry2d-f64/Cargo.toml
@@ -23,10 +23,12 @@ std     = [ "nalgebra/std", "slab", "rustc-hash", "simba/std", "arrayvec/std", "
 dim2    = [ ]
 f64     = [ ]
 serde-serialize = [ "serde", "nalgebra/serde-serialize", "arrayvec/serde" ]
+rkyv-serialize = [ "rkyv", "nalgebra/rkyv-serialize", "simba/rkyv-serialize" ]
 simd-stable = [ "simba/wide", "simd-is-enabled" ]
 simd-nightly = [ "simba/packed_simd", "simd-is-enabled" ]
 enhanced-determinism = [ "simba/libm_force", "indexmap" ]
 cuda   = [ "cust_core", "cust", "nalgebra/cuda" ]
+parallel = [ "rayon" ]
 
 # Do not enable this feature directly. It is automatically
 # enabled with the "simd-stable" or "simd-nightly" feature.
@@ -45,15 +47,17 @@ num-traits      = { version = "0.2", default-features = false }
 smallvec        = "1"
 slab            = { version = "0.4", optional = true }
 arrayvec        = { version = "0.7", default-features = false }
-simba           = { version = "0.7", default-features = false }
+simba           = { version = "^0.7.2", default-features = false }
 nalgebra        = { version = "0.31", default-features = false, features = [ "libm" ] }
 approx          = { version = "0.5", default-features = false }
-serde           = { version = "1.0", optional = true, features = ["derive"]}
+serde           = { version = "1.0", optional = true, features = ["derive"] }
+rkyv            = { version = "0.7", optional = true }
 num-derive      = "0.3"
 indexmap        = { version = "1", features = [ "serde-1" ], optional = true }
 rustc-hash      = { version = "1", optional = true }
 cust_core       = { version = "0.1", optional = true }
 spade           = { version = "2", optional = true } # Make this optional?
+rayon           = { version = "1", optional = true }
 
 [target.'cfg(not(target_os = "cuda"))'.dependencies]
 cust       = { version = "0.3", optional = true }
diff --git a/crates/parry2d/Cargo.toml b/crates/parry2d/Cargo.toml
index c76a251d..5befc83e 100644
--- a/crates/parry2d/Cargo.toml
+++ b/crates/parry2d/Cargo.toml
@@ -23,10 +23,12 @@ std     = [ "nalgebra/std", "slab", "rustc-hash", "simba/std", "arrayvec/std", "
 dim2    = [ ]
 f32     = [ ]
 serde-serialize = [ "serde", "nalgebra/serde-serialize", "arrayvec/serde" ]
+rkyv-serialize = [ "rkyv", "nalgebra/rkyv-serialize", "simba/rkyv-serialize" ]
 simd-stable = [ "simba/wide", "simd-is-enabled" ]
 simd-nightly = [ "simba/packed_simd", "simd-is-enabled" ]
 enhanced-determinism = [ "simba/libm_force", "indexmap" ]
-cuda   = [ "cust_core", "cust", "nalgebra/cuda" ]
+cuda = [ "cust_core", "cust", "nalgebra/cuda" ]
+parallel = [ "rayon" ]
 
 # Do not enable this feature directly. It is automatically
 # enabled with the "simd-stable" or "simd-nightly" feature.
@@ -45,15 +47,17 @@ num-traits      = { version = "0.2", default-features = false }
 smallvec        = "1"
 slab            = { version = "0.4", optional = true }
 arrayvec        = { version = "0.7", default-features = false }
-simba           = { version = "0.7", default-features = false }
+simba           = { version = "^0.7.2", default-features = false }
 nalgebra        = { version = "0.31", default-features = false, features = [ "libm" ] }
 approx          = { version = "0.5", default-features = false }
-serde           = { version = "1.0", optional = true, features = ["derive"]}
+serde           = { version = "1.0", optional = true, features = ["derive"] }
+rkyv            = { version = "0.7", optional = true }
 num-derive      = "0.3"
 indexmap        = { version = "1", features = [ "serde-1" ], optional = true }
 rustc-hash      = { version = "1", optional = true }
 cust_core       = { version = "0.1", optional = true }
-spade           = { version = "2", optional = true } # Make this optional?
+spade           = { version = "2", optional = true }
+rayon           = { version = "1", optional = true }
 
 [target.'cfg(not(target_os = "cuda"))'.dependencies]
 cust = { version = "0.3", optional = true }
diff --git a/crates/parry3d-f64/Cargo.toml b/crates/parry3d-f64/Cargo.toml
index e57b9f0d..6bf56967 100644
--- a/crates/parry3d-f64/Cargo.toml
+++ b/crates/parry3d-f64/Cargo.toml
@@ -23,10 +23,12 @@ std     = [ "nalgebra/std", "slab", "rustc-hash", "simba/std", "arrayvec/std", "
 dim3    = [ ]
 f64     = [ ]
 serde-serialize = [ "serde", "nalgebra/serde-serialize" ]
+rkyv-serialize = [ "rkyv", "nalgebra/rkyv-serialize", "simba/rkyv-serialize" ]
 simd-stable = [ "simba/wide", "simd-is-enabled" ]
 simd-nightly = [ "simba/packed_simd", "simd-is-enabled" ]
 enhanced-determinism = [ "simba/libm_force", "indexmap" ]
 cuda   = [ "cust_core", "cust", "nalgebra/cuda" ]
+parallel = [ "rayon" ]
 
 # Do not enable this feature directly. It is automatically
 # enabled with the "simd-stable" or "simd-nightly" feature.
@@ -45,15 +47,17 @@ num-traits = { version = "0.2", default-features = false }
 smallvec   = "1"
 slab       = { version = "0.4", optional = true }
 arrayvec   = { version = "0.7", default-features = false }
-simba      = { version = "0.7", default-features = false }
+simba      = { version = "^0.7.2", default-features = false }
 nalgebra   = { version = "0.31", default-features = false, features = [ "libm" ] }
 approx     = { version = "0.5", default-features = false }
 serde      = { version = "1.0", optional = true, features = ["derive", "rc"]}
+rkyv       = { version = "0.7", optional = true }
 num-derive = "0.3"
 indexmap   = { version = "1", features = [ "serde-1" ], optional = true }
 rustc-hash = { version = "1", optional = true }
 cust_core  = { version = "0.1", optional = true }
 spade      = { version = "2", optional = true } # Make this optional?
+rayon      = { version = "1", optional = true }
 
 [target.'cfg(not(target_os = "cuda"))'.dependencies]
 cust       = { version = "0.3", optional = true }
diff --git a/crates/parry3d/Cargo.toml b/crates/parry3d/Cargo.toml
index 32d69184..e7331f5f 100644
--- a/crates/parry3d/Cargo.toml
+++ b/crates/parry3d/Cargo.toml
@@ -23,10 +23,13 @@ std     = [ "nalgebra/std", "slab", "rustc-hash", "simba/std", "arrayvec/std", "
 dim3    = [ ]
 f32     = [ ]
 serde-serialize = [ "serde", "nalgebra/serde-serialize" ]
+rkyv-serialize = [ "rkyv", "nalgebra/rkyv-serialize", "simba/rkyv-serialize" ]
+
 simd-stable = [ "simba/wide", "simd-is-enabled" ]
 simd-nightly = [ "simba/packed_simd", "simd-is-enabled" ]
 enhanced-determinism = [ "simba/libm_force", "indexmap" ]
 cuda   = [ "cust_core", "cust", "nalgebra/cuda" ]
+parallel = [ "rayon" ]
 
 # Do not enable this feature directly. It is automatically
 # enabled with the "simd-stable" or "simd-nightly" feature.
@@ -45,15 +48,17 @@ num-traits = { version = "0.2", default-features = false }
 smallvec   = "1"
 slab       = { version = "0.4", optional = true }
 arrayvec   = { version = "0.7", default-features = false }
-simba      = { version = "0.7", default-features = false }
+simba      = { version = "^0.7.2", default-features = false }
 nalgebra   = { version = "0.31", default-features = false, features = [ "libm" ] }
 approx     = { version = "0.5", default-features = false }
 serde      = { version = "1.0", optional = true, features = ["derive", "rc"]}
+rkyv       = { version = "0.7", optional = true }
 num-derive = "0.3"
 indexmap   = { version = "1", features = [ "serde-1" ], optional = true }
 rustc-hash = { version = "1", optional = true }
 cust_core  = { version = "0.1", optional = true }
 spade      = { version = "2", optional = true } # Make this optional?
+rayon      = { version = "1", optional = true }
 
 [target.'cfg(not(target_os = "cuda"))'.dependencies]
 cust       = { version = "0.3", optional = true }
diff --git a/src/bounding_volume/aabb.rs b/src/bounding_volume/aabb.rs
index 8286d199..617a8c83 100644
--- a/src/bounding_volume/aabb.rs
+++ b/src/bounding_volume/aabb.rs
@@ -13,6 +13,10 @@ use na::ComplexField; // for .abs()
 
 /// An Axis Aligned Bounding Box.
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[cfg_attr(feature = "cuda", derive(cust_core::DeviceCopy))]
 #[derive(Debug, PartialEq, Copy, Clone)]
 pub struct AABB {
diff --git a/src/bounding_volume/bounding_sphere.rs b/src/bounding_volume/bounding_sphere.rs
index d644d8cd..93640430 100644
--- a/src/bounding_volume/bounding_sphere.rs
+++ b/src/bounding_volume/bounding_sphere.rs
@@ -7,6 +7,10 @@ use num::Zero;
 
 /// A Bounding Sphere.
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[derive(Debug, PartialEq, Copy, Clone)]
 pub struct BoundingSphere {
     pub center: Point<Real>,
diff --git a/src/bounding_volume/simd_aabb.rs b/src/bounding_volume/simd_aabb.rs
index b31641b6..76e0ecff 100644
--- a/src/bounding_volume/simd_aabb.rs
+++ b/src/bounding_volume/simd_aabb.rs
@@ -7,6 +7,10 @@ use simba::simd::{SimdPartialOrd, SimdValue};
 
 /// Four AABB represented as a single SoA AABB with SIMD components.
 #[derive(Debug, Copy, Clone)]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 pub struct SimdAABB {
     /// The min coordinates of the AABBs.
     pub mins: Point<SimdReal>,
diff --git a/src/lib.rs b/src/lib.rs
index ad398f99..30f7ad43 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -28,10 +28,6 @@ std::compile_error!("The `simd-is-enabled` feature should not be enabled explici
 std::compile_error!(
     "SIMD cannot be enabled when the `enhanced-determinism` feature is also enabled."
 );
-#[cfg(all(feature = "simd-is-enabled", feature = "f64"))]
-std::compile_error!(
-    "Explicit SIMD optimization are not yet supported when the f64 feature is enabled."
-);
 
 macro_rules! array(
     ($callback: expr; SIMD_WIDTH) => {
@@ -252,36 +248,18 @@ mod simd {
 
 #[cfg(feature = "simd-is-enabled")]
 mod simd {
-    #[allow(unused_imports)]
-    #[cfg(feature = "simd-nightly")]
-    use simba::simd::{f32x16, f32x4, f32x8, m32x16, m32x4, m32x8, u8x16, u8x4, u8x8};
-    #[cfg(feature = "simd-stable")]
-    use simba::simd::{WideBoolF32x4, WideF32x4};
+    #[cfg(all(feature = "simd-nightly", feature = "f32"))]
+    pub use simba::simd::{f32x4 as SimdReal, m32x4 as SimdBool};
+    #[cfg(all(feature = "simd-stable", feature = "f32"))]
+    pub use simba::simd::{WideBoolF32x4 as SimdBool, WideF32x4 as SimdReal};
+
+    #[cfg(all(feature = "simd-nightly", feature = "f64"))]
+    pub use simba::simd::{f64x4 as SimdReal, m64x4 as SimdBool};
+    #[cfg(all(feature = "simd-stable", feature = "f64"))]
+    pub use simba::simd::{WideBoolF64x4 as SimdBool, WideF64x4 as SimdReal};
 
     /// The number of lanes of a SIMD number.
     pub const SIMD_WIDTH: usize = 4;
     /// SIMD_WIDTH - 1
     pub const SIMD_LAST_INDEX: usize = 3;
-    #[cfg(not(feature = "simd-nightly"))]
-    /// A SIMD float with SIMD_WIDTH lanes.
-    pub type SimdReal = WideF32x4;
-    #[cfg(not(feature = "simd-nightly"))]
-    /// A SIMD bool with SIMD_WIDTH lanes.
-    pub type SimdBool = WideBoolF32x4;
-    #[cfg(feature = "simd-nightly")]
-    /// A SIMD float with SIMD_WIDTH lanes.
-    pub type SimdReal = f32x4;
-    #[cfg(feature = "simd-nightly")]
-    /// A bool float with SIMD_WIDTH lanes.
-    pub type SimdBool = m32x4;
-
-    // pub const SIMD_WIDTH: usize = 8;
-    // pub const SIMD_LAST_INDEX: usize = 7;
-    // pub type SimdReal = f32x8;
-    // pub type SimdBool = m32x8;
-
-    // pub const SIMD_WIDTH: usize = 16;
-    // pub const SIMD_LAST_INDEX: usize = 15;
-    // pub type SimdReal = f32x16;
-    // pub type SimdBool = m32x16;
 }
diff --git a/src/mass_properties/mass_properties.rs b/src/mass_properties/mass_properties.rs
index 2093314a..ae1c70ae 100644
--- a/src/mass_properties/mass_properties.rs
+++ b/src/mass_properties/mass_properties.rs
@@ -10,6 +10,10 @@ const EPSILON: Real = f32::EPSILON as Real;
 
 #[derive(Copy, Clone, Debug, Default, PartialEq)]
 #[cfg_attr(feature = "serde-serialize", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 /// The local mass properties of a rigid-body.
 pub struct MassProperties {
     /// The center of mass of a rigid-body expressed in its local-space.
diff --git a/src/partitioning/mod.rs b/src/partitioning/mod.rs
index 9fc083ab..caace136 100644
--- a/src/partitioning/mod.rs
+++ b/src/partitioning/mod.rs
@@ -1,6 +1,11 @@
 //! Spatial partitioning tools.
 
-pub use self::qbvh::{IndexedData, QBVHDataGenerator, QbvhNonOverlappingDataSplitter, QBVH};
+pub use self::qbvh::{
+    CenterDataSplitter, IndexedData, NodeIndex, QBVHDataGenerator, QBVHNode, QBVHProxy,
+    QbvhNonOverlappingDataSplitter, SimdNodeIndex, QBVH,
+};
+#[cfg(feature = "parallel")]
+pub use self::visitor::{ParallelSimdSimultaneousVisitor, ParallelSimdVisitor};
 pub use self::visitor::{
     SimdBestFirstVisitStatus, SimdBestFirstVisitor, SimdSimultaneousVisitStatus,
     SimdSimultaneousVisitor, SimdVisitStatus, SimdVisitor,
diff --git a/src/partitioning/qbvh/build.rs b/src/partitioning/qbvh/build.rs
index 5a17238f..954ff30e 100644
--- a/src/partitioning/qbvh/build.rs
+++ b/src/partitioning/qbvh/build.rs
@@ -8,15 +8,15 @@ use simba::simd::SimdValue;
 use super::utils::split_indices_wrt_dim;
 use super::{IndexedData, NodeIndex, QBVHNode, QBVHProxy, QBVH};
 
-pub struct BuilderProxies<'a, T> {
-    proxies: &'a mut Vec<QBVHProxy<T>>,
+pub struct BuilderProxies<'a, LeafData> {
+    proxies: &'a mut Vec<QBVHProxy<LeafData>>,
     aabbs: &'a mut Vec<AABB>,
 }
 
-impl<'a, T> BuilderProxies<'a, T> {
-    fn insert(&mut self, data: T, aabb: AABB)
+impl<'a, LeafData> BuilderProxies<'a, LeafData> {
+    fn insert(&mut self, data: LeafData, aabb: AABB)
     where
-        T: IndexedData,
+        LeafData: IndexedData,
     {
         let index = data.index();
 
@@ -30,18 +30,23 @@ impl<'a, T> BuilderProxies<'a, T> {
     }
 }
 
-pub trait QBVHDataSplitter<T> {
+pub trait QBVHDataSplitter<LeafData> {
     fn split_dataset<'idx>(
         &mut self,
         subdiv_dims: [usize; 2],
         center: Point<Real>,
         indices: &'idx mut [usize],
         indices_workspace: &'idx mut Vec<usize>,
-        proxies: BuilderProxies<T>,
+        proxies: BuilderProxies<LeafData>,
     ) -> [&'idx mut [usize]; 4];
 }
 
-struct CenterDataSplitter {
+/// A data splitter that arranges a set of AABBs in two sets based on their center’s coordinate
+/// along the split axis.
+pub struct CenterDataSplitter {
+    /// If all the AABB centers have the same coordinate values along the splitting axis,
+    /// setting this to `true` will allow the splitter to split the AABB set into two
+    /// subsets arbitrarily.
     pub enable_fallback_split: bool,
 }
 
@@ -53,26 +58,26 @@ impl Default for CenterDataSplitter {
     }
 }
 
-impl<T> QBVHDataSplitter<T> for CenterDataSplitter {
+impl<LeafData> QBVHDataSplitter<LeafData> for CenterDataSplitter {
     fn split_dataset<'idx>(
         &mut self,
         subdiv_dims: [usize; 2],
         center: Point<Real>,
         indices: &'idx mut [usize],
         _: &'idx mut Vec<usize>,
-        proxies: BuilderProxies<T>,
+        proxies: BuilderProxies<LeafData>,
     ) -> [&'idx mut [usize]; 4] {
         self.split_dataset_wo_workspace(subdiv_dims, center, indices, proxies)
     }
 }
 
 impl CenterDataSplitter {
-    fn split_dataset_wo_workspace<'idx, T>(
+    fn split_dataset_wo_workspace<'idx, LeafData>(
         &mut self,
         subdiv_dims: [usize; 2],
         center: Point<Real>,
         indices: &'idx mut [usize],
-        proxies: BuilderProxies<T>,
+        proxies: BuilderProxies<LeafData>,
     ) -> [&'idx mut [usize]; 4] {
         // TODO: should we split wrt. the median instead of the average?
         // TODO: we should ensure each subslice contains at least 4 elements each (or less if
@@ -117,10 +122,10 @@ pub struct QbvhNonOverlappingDataSplitter<F> {
     pub epsilon: Real,
 }
 
-impl<T, F> QBVHDataSplitter<T> for QbvhNonOverlappingDataSplitter<F>
+impl<LeafData, F> QBVHDataSplitter<LeafData> for QbvhNonOverlappingDataSplitter<F>
 where
-    T: IndexedData,
-    F: FnMut(T, usize, Real, Real, AABB, AABB) -> SplitResult<(T, AABB)>,
+    LeafData: IndexedData,
+    F: FnMut(LeafData, usize, Real, Real, AABB, AABB) -> SplitResult<(LeafData, AABB)>,
 {
     fn split_dataset<'idx>(
         &mut self,
@@ -128,7 +133,7 @@ where
         center: Point<Real>,
         indices: &'idx mut [usize],
         indices_workspace: &'idx mut Vec<usize>,
-        mut proxies: BuilderProxies<T>,
+        mut proxies: BuilderProxies<LeafData>,
     ) -> [&'idx mut [usize]; 4] {
         // 1. Snap the spliting point to one fo the AABB min/max,
         // such that at least one AABB isn’t split along each dimension.
@@ -228,36 +233,36 @@ where
 }
 
 /// Trait used for generating the content of the leaves of the QBVH acceleration structure.
-pub trait QBVHDataGenerator<T> {
+pub trait QBVHDataGenerator<LeafData> {
     /// Gives an idea of the number of elements this generator contains.
     ///
     /// This is primarily used for pre-allocating some arrays for better performances.
     fn size_hint(&self) -> usize;
     /// Iterate through all the elements of this generator.
-    fn for_each(&mut self, f: impl FnMut(T, AABB));
+    fn for_each(&mut self, f: impl FnMut(LeafData, AABB));
 }
 
-impl<T, F> QBVHDataGenerator<T> for F
+impl<LeafData, F> QBVHDataGenerator<LeafData> for F
 where
-    F: ExactSizeIterator<Item = (T, AABB)>,
+    F: ExactSizeIterator<Item = (LeafData, AABB)>,
 {
     fn size_hint(&self) -> usize {
         self.len()
     }
 
     #[inline(always)]
-    fn for_each(&mut self, mut f: impl FnMut(T, AABB)) {
+    fn for_each(&mut self, mut f: impl FnMut(LeafData, AABB)) {
         for (elt, aabb) in self {
             f(elt, aabb)
         }
     }
 }
 
-impl<T: IndexedData> QBVH<T> {
+impl<LeafData: IndexedData> QBVH<LeafData> {
     /// Clears this quaternary BVH and rebuilds it from a new set of data and AABBs.
     pub fn clear_and_rebuild(
         &mut self,
-        data_gen: impl QBVHDataGenerator<T>,
+        data_gen: impl QBVHDataGenerator<LeafData>,
         dilation_factor: Real,
     ) {
         self.clear_and_rebuild_with_splitter(
@@ -268,12 +273,12 @@ impl<T: IndexedData> QBVH<T> {
     }
 }
 
-impl<T: IndexedData> QBVH<T> {
+impl<LeafData: IndexedData> QBVH<LeafData> {
     /// Clears this quaternary BVH and rebuilds it from a new set of data and AABBs.
     pub fn clear_and_rebuild_with_splitter(
         &mut self,
-        mut data_gen: impl QBVHDataGenerator<T>,
-        mut splitter: impl QBVHDataSplitter<T>,
+        mut data_gen: impl QBVHDataGenerator<LeafData>,
+        mut splitter: impl QBVHDataSplitter<LeafData>,
         dilation_factor: Real,
     ) {
         self.nodes.clear();
@@ -326,7 +331,7 @@ impl<T: IndexedData> QBVH<T> {
 
     fn do_recurse_build_generic(
         &mut self,
-        splitter: &mut impl QBVHDataSplitter<T>,
+        splitter: &mut impl QBVHDataSplitter<LeafData>,
         indices: &mut [usize],
         aabbs: &mut Vec<AABB>,
         parent: NodeIndex,
@@ -408,7 +413,7 @@ impl<T: IndexedData> QBVH<T> {
         // Split the set along the two subdiv_dims dimensions.
         let proxies = BuilderProxies {
             proxies: &mut self.proxies,
-            aabbs: aabbs,
+            aabbs,
         };
 
         // Recurse!
diff --git a/src/partitioning/qbvh/mod.rs b/src/partitioning/qbvh/mod.rs
index 6b470bd0..ab420039 100644
--- a/src/partitioning/qbvh/mod.rs
+++ b/src/partitioning/qbvh/mod.rs
@@ -1,7 +1,7 @@
-pub use self::build::{BuilderProxies, QBVHDataGenerator, QbvhNonOverlappingDataSplitter};
-pub use self::qbvh::{IndexedData, QBVH};
-
-pub(self) use self::qbvh::*;
+pub use self::build::{
+    BuilderProxies, CenterDataSplitter, QBVHDataGenerator, QbvhNonOverlappingDataSplitter,
+};
+pub use self::qbvh::{IndexedData, NodeIndex, QBVHNode, QBVHProxy, SimdNodeIndex, QBVH};
 
 mod build;
 mod qbvh;
diff --git a/src/partitioning/qbvh/qbvh.rs b/src/partitioning/qbvh/qbvh.rs
index 7dff9cfc..d77397e6 100644
--- a/src/partitioning/qbvh/qbvh.rs
+++ b/src/partitioning/qbvh/qbvh.rs
@@ -1,7 +1,6 @@
 use crate::bounding_volume::{SimdAABB, AABB};
 use crate::math::{Real, Vector};
 use na::SimdValue;
-use std::collections::VecDeque;
 
 /// A data to which an index is associated.
 pub trait IndexedData: Copy {
@@ -40,12 +39,22 @@ impl IndexedData for u64 {
     }
 }
 
+/// The index of an internal SIMD node of a QBVH.
+pub type SimdNodeIndex = u32;
+
 /// The index of a node part of a QBVH.
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 #[cfg_attr(feature = "serde-serialize", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
+/// The index of one specific node of a QBVH.
 pub struct NodeIndex {
-    pub(super) index: u32, // Index of the addressed node in the `nodes` array.
-    pub(super) lane: u8,   // SIMD lane of the addressed node.
+    /// The index of the SIMD node containing the addressed node.
+    pub index: SimdNodeIndex, // Index of the addressed node in the `nodes` array.
+    /// The SIMD lane the addressed node is associated to.
+    pub lane: u8, // SIMD lane of the addressed node.
 }
 
 impl NodeIndex {
@@ -66,6 +75,10 @@ impl NodeIndex {
 /// This groups four nodes of the QBVH.
 #[derive(Copy, Clone, Debug)]
 #[cfg_attr(feature = "serde-serialize", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 pub struct QBVHNode {
     /// The AABBs of the qbvh nodes represented by this node.
     pub simd_aabb: SimdAABB,
@@ -74,30 +87,37 @@ pub struct QBVHNode {
     pub children: [u32; 4],
     /// The index of the node parent to the 4 nodes represented by `self`.
     pub parent: NodeIndex,
-    /// Are the four nodes represneted by `self` leaves of the `QBVH`?
+    /// Are the four nodes represented by `self` leaves of the `QBVH`?
     pub leaf: bool, // TODO: pack this with the NodexIndex.lane?
     pub(super) dirty: bool, // TODO: move this to a separate bitvec?
 }
 
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 #[cfg_attr(feature = "serde-serialize", derive(Serialize, Deserialize))]
-pub struct QBVHProxy<T> {
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
+/// Combination of a leaf data and its associated node’s index.
+pub struct QBVHProxy<LeafData> {
+    /// Index of the leaf node the leaf data is associated to.
     pub node: NodeIndex,
-    pub data: T, // The collider data. TODO: only set the collider generation here?
+    /// The data contained in this node.
+    pub data: LeafData, // The collider data. TODO: only set the collider generation here?
 }
 
-impl<T> QBVHProxy<T> {
+impl<LeafData> QBVHProxy<LeafData> {
     pub(super) fn invalid() -> Self
     where
-        T: IndexedData,
+        LeafData: IndexedData,
     {
         Self {
             node: NodeIndex::invalid(),
-            data: T::default(),
+            data: LeafData::default(),
         }
     }
 
-    pub(super) fn detached(data: T) -> Self {
+    pub(super) fn detached(data: LeafData) -> Self {
         Self {
             node: NodeIndex::invalid(),
             data,
@@ -109,32 +129,36 @@ impl<T> QBVHProxy<T> {
 ///
 /// This is a bounding-volume-hierarchy where each node has either four children or none.
 #[cfg_attr(feature = "serde-serialize", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[derive(Clone, Debug)]
-pub struct QBVH<T> {
+pub struct QBVH<LeafData> {
     pub(super) root_aabb: AABB,
     pub(super) nodes: Vec<QBVHNode>,
-    pub(super) dirty_nodes: VecDeque<u32>,
-    pub(super) proxies: Vec<QBVHProxy<T>>,
+    pub(super) dirty_nodes: Vec<u32>,
+    pub(super) proxies: Vec<QBVHProxy<LeafData>>,
 }
 
-impl<T: IndexedData> QBVH<T> {
+impl<LeafData: IndexedData> QBVH<LeafData> {
     /// Initialize an empty QBVH.
     pub fn new() -> Self {
         QBVH {
             root_aabb: AABB::new_invalid(),
             nodes: Vec::new(),
-            dirty_nodes: VecDeque::new(),
+            dirty_nodes: Vec::new(),
             proxies: Vec::new(),
         }
     }
 
     /// Iterates mutably through all the leaf data in this QBVH.
-    pub fn iter_data_mut(&mut self) -> impl Iterator<Item = (NodeIndex, &mut T)> {
+    pub fn iter_data_mut(&mut self) -> impl Iterator<Item = (NodeIndex, &mut LeafData)> {
         self.proxies.iter_mut().map(|p| (p.node, &mut p.data))
     }
 
     /// Iterate through all the leaf data in this QBVH.
-    pub fn iter_data(&self) -> impl Iterator<Item = (NodeIndex, &T)> {
+    pub fn iter_data(&self) -> impl Iterator<Item = (NodeIndex, &LeafData)> {
         self.proxies.iter().map(|p| (p.node, &p.data))
     }
 
@@ -153,7 +177,7 @@ impl<T: IndexedData> QBVH<T> {
     /// Returns the data associated to a given leaf.
     ///
     /// Returns `None` if the provided node ID does not identify a leaf.
-    pub fn leaf_data(&mut self, node_id: NodeIndex) -> Option<T> {
+    pub fn leaf_data(&mut self, node_id: NodeIndex) -> Option<LeafData> {
         let node = self.nodes.get(node_id.index as usize)?;
 
         if !node.leaf {
@@ -180,7 +204,7 @@ impl<T: IndexedData> QBVH<T> {
     /// If this QBVH isn’t empty, the first element of the returned slice is the root of the
     /// tree. The other elements are not arranged in any particular order.
     /// The more high-level traversal methods should be used instead of this.
-    pub fn raw_proxies(&self) -> &[QBVHProxy<T>] {
+    pub fn raw_proxies(&self) -> &[QBVHProxy<LeafData>] {
         &self.proxies
     }
 
diff --git a/src/partitioning/qbvh/traversal.rs b/src/partitioning/qbvh/traversal.rs
index 109a3093..2eb7b487 100644
--- a/src/partitioning/qbvh/traversal.rs
+++ b/src/partitioning/qbvh/traversal.rs
@@ -10,28 +10,70 @@ use crate::utils::WeightedValue;
 use num::Bounded;
 use simba::simd::SimdBool;
 use std::collections::BinaryHeap;
+#[cfg(feature = "parallel")]
+use {
+    crate::partitioning::{ParallelSimdSimultaneousVisitor, ParallelSimdVisitor},
+    arrayvec::ArrayVec,
+    rayon::prelude::*,
+    std::sync::atomic::{AtomicBool, Ordering as AtomicOrdering},
+};
 
 use super::{IndexedData, NodeIndex, QBVH};
 
-impl<T: IndexedData> QBVH<T> {
+impl<LeafData: IndexedData> QBVH<LeafData> {
     /// Performs a depth-first traversal on the BVH.
-    pub fn traverse_depth_first(&self, visitor: &mut impl SimdVisitor<T, SimdAABB>) {
-        self.traverse_depth_first_with_stack(visitor, &mut Vec::new())
+    ///
+    /// # Return
+    ///
+    /// Returns `false` if the traversal exited early, and `true` otherwise.
+    pub fn traverse_depth_first(&self, visitor: &mut impl SimdVisitor<LeafData, SimdAABB>) -> bool {
+        self.traverse_depth_first_node(visitor, 0)
+    }
+
+    /// Performs a depth-first traversal on the BVH, starting at the given node.
+    ///
+    /// # Return
+    ///
+    /// Returns `false` if the traversal exited early, and `true` otherwise.
+    pub fn traverse_depth_first_node(
+        &self,
+        visitor: &mut impl SimdVisitor<LeafData, SimdAABB>,
+        start_node: u32,
+    ) -> bool {
+        self.traverse_depth_first_node_with_stack(visitor, &mut Vec::new(), start_node)
     }
 
     /// Performs a depth-first traversal on the BVH.
+    ///
+    /// # Return
+    ///
+    /// Returns `false` if the traversal exited early, and `true` otherwise.
     pub fn traverse_depth_first_with_stack(
         &self,
-        visitor: &mut impl SimdVisitor<T, SimdAABB>,
+        visitor: &mut impl SimdVisitor<LeafData, SimdAABB>,
         stack: &mut Vec<u32>,
-    ) {
+    ) -> bool {
+        self.traverse_depth_first_node_with_stack(visitor, stack, 0)
+    }
+
+    /// Performs a depth-first traversal on the BVH, starting at the given node.
+    ///
+    /// # Return
+    ///
+    /// Returns `false` if the traversal exited early, and `true` otherwise.
+    pub fn traverse_depth_first_node_with_stack(
+        &self,
+        visitor: &mut impl SimdVisitor<LeafData, SimdAABB>,
+        stack: &mut Vec<u32>,
+        start_node: u32,
+    ) -> bool {
         stack.clear();
 
         if !self.nodes.is_empty() {
-            stack.push(0);
+            stack.push(start_node);
         }
         while let Some(entry) = stack.pop() {
-            let node = self.nodes[entry as usize];
+            let node = &self.nodes[entry as usize];
             let leaf_data = if node.leaf {
                 Some(
                     array![|ii| Some(&self.proxies.get(node.children[ii] as usize)?.data); SIMD_WIDTH],
@@ -42,7 +84,7 @@ impl<T: IndexedData> QBVH<T> {
 
             match visitor.visit(&node.simd_aabb, leaf_data) {
                 SimdVisitStatus::ExitEarly => {
-                    return;
+                    return false;
                 }
                 SimdVisitStatus::MaybeContinue(mask) => {
                     let bitmask = mask.bitmask();
@@ -62,6 +104,8 @@ impl<T: IndexedData> QBVH<T> {
                 }
             }
         }
+
+        true
     }
 
     /// Performs a best-first-search on the BVH.
@@ -70,18 +114,35 @@ impl<T: IndexedData> QBVH<T> {
     /// user-defined type.
     pub fn traverse_best_first<BFS>(&self, visitor: &mut BFS) -> Option<(NodeIndex, BFS::Result)>
     where
-        BFS: SimdBestFirstVisitor<T, SimdAABB>,
+        BFS: SimdBestFirstVisitor<LeafData, SimdAABB>,
         BFS::Result: Clone, // Because we cannot move out of an array…
     {
         if self.nodes.is_empty() {
             return None;
         }
 
+        self.traverse_best_first_node(visitor, 0, Real::max_value())
+    }
+
+    /// Performs a best-first-search on the BVH, starting at the given node.
+    ///
+    /// Returns the content of the leaf with the smallest associated cost, and a result of
+    /// user-defined type.
+    pub fn traverse_best_first_node<BFS>(
+        &self,
+        visitor: &mut BFS,
+        start_node: u32,
+        init_cost: Real,
+    ) -> Option<(NodeIndex, BFS::Result)>
+    where
+        BFS: SimdBestFirstVisitor<LeafData, SimdAABB>,
+        BFS::Result: Clone, // Because we cannot move out of an array…
+    {
         let mut queue: BinaryHeap<WeightedValue<u32>> = BinaryHeap::new();
 
-        let mut best_cost = Real::max_value();
+        let mut best_cost = init_cost;
         let mut best_result = None;
-        queue.push(WeightedValue::new(0, -best_cost / 2.0));
+        queue.push(WeightedValue::new(start_node, -best_cost / 2.0));
 
         while let Some(entry) = queue.pop() {
             if -entry.cost >= best_cost {
@@ -89,7 +150,7 @@ impl<T: IndexedData> QBVH<T> {
                 break; // Solution found.
             }
 
-            let node = self.nodes[entry.value as usize];
+            let node = &self.nodes[entry.value as usize];
             let leaf_data = if node.leaf {
                 Some(
                     array![|ii| Some(&self.proxies.get(node.children[ii] as usize)?.data); SIMD_WIDTH],
@@ -144,7 +205,7 @@ impl<T: IndexedData> QBVH<T> {
     /// the given AABB:
     // FIXME: implement a visitor pattern to merge intersect_aabb
     // and intersect_ray into a single method.
-    pub fn intersect_aabb(&self, aabb: &AABB, out: &mut Vec<T>) {
+    pub fn intersect_aabb(&self, aabb: &AABB, out: &mut Vec<LeafData>) {
         if self.nodes.is_empty() {
             return;
         }
@@ -153,7 +214,7 @@ impl<T: IndexedData> QBVH<T> {
         let mut stack = vec![0u32];
         let simd_aabb = SimdAABB::splat(*aabb);
         while let Some(inode) = stack.pop() {
-            let node = self.nodes[inode as usize];
+            let node = &self.nodes[inode as usize];
             let intersections = node.simd_aabb.intersects(&simd_aabb);
             let bitmask = intersections.bitmask();
 
@@ -179,19 +240,19 @@ impl<T: IndexedData> QBVH<T> {
     }
 
     /// Performs a simultaneous traversal of two QBVH.
-    pub fn traverse_bvtt<T2: IndexedData>(
+    pub fn traverse_bvtt<LeafData2: IndexedData>(
         &self,
-        qbvh2: &QBVH<T2>,
-        visitor: &mut impl SimdSimultaneousVisitor<T, T2, SimdAABB>,
+        qbvh2: &QBVH<LeafData2>,
+        visitor: &mut impl SimdSimultaneousVisitor<LeafData, LeafData2, SimdAABB>,
     ) {
         self.traverse_bvtt_with_stack(qbvh2, visitor, &mut Vec::new())
     }
 
     /// Performs a simultaneous traversal of two QBVH.
-    pub fn traverse_bvtt_with_stack<T2: IndexedData>(
+    pub fn traverse_bvtt_with_stack<LeafData2: IndexedData>(
         &self,
-        qbvh2: &QBVH<T2>,
-        visitor: &mut impl SimdSimultaneousVisitor<T, T2, SimdAABB>,
+        qbvh2: &QBVH<LeafData2>,
+        visitor: &mut impl SimdSimultaneousVisitor<LeafData, LeafData2, SimdAABB>,
         stack: &mut Vec<(u32, u32)>,
     ) {
         let qbvh1 = self;
@@ -202,8 +263,8 @@ impl<T: IndexedData> QBVH<T> {
         }
 
         while let Some(entry) = stack.pop() {
-            let node1 = qbvh1.nodes[entry.0 as usize];
-            let node2 = qbvh2.nodes[entry.1 as usize];
+            let node1 = &qbvh1.nodes[entry.0 as usize];
+            let node2 = &qbvh2.nodes[entry.1 as usize];
 
             let leaf_data1 = if node1.leaf {
                 Some(
@@ -274,3 +335,186 @@ impl<T: IndexedData> QBVH<T> {
         }
     }
 }
+
+#[cfg(feature = "parallel")]
+impl<LeafData: IndexedData + Sync> QBVH<LeafData> {
+    /// Performs a depth-first traversal of this QBVH using
+    /// parallelism internally for better performance with large trees.
+    pub fn traverse_depth_first_parallel(&self, visitor: &impl ParallelSimdVisitor<LeafData>) {
+        if !self.nodes.is_empty() {
+            let exit_early = AtomicBool::new(false);
+            self.traverse_depth_first_node_parallel(visitor, &exit_early, 0);
+        }
+    }
+
+    /// Runs a parallel depth-first traversal of the sub-tree starting at the given node.
+    ///
+    /// Returns immediately if `exit_early` is set; sets it when the visitor exits early.
+    pub fn traverse_depth_first_node_parallel(
+        &self,
+        visitor: &impl ParallelSimdVisitor<LeafData>,
+        exit_early: &AtomicBool,
+        entry: u32,
+    ) {
+        if exit_early.load(AtomicOrdering::Relaxed) {
+            return;
+        }
+
+        let mut stack: ArrayVec<u32, SIMD_WIDTH> = ArrayVec::new();
+        let node = &self.nodes[entry as usize];
+        let leaf_data = if node.leaf {
+            Some(array![|ii| Some(&self.proxies.get(node.children[ii] as usize)?.data); SIMD_WIDTH])
+        } else {
+            None
+        };
+
+        match visitor.visit(entry, node, leaf_data) {
+            SimdVisitStatus::ExitEarly => {
+                exit_early.store(true, AtomicOrdering::Relaxed);
+                return;
+            }
+            SimdVisitStatus::MaybeContinue(mask) => {
+                let bitmask = mask.bitmask();
+
+                for ii in 0..SIMD_WIDTH {
+                    // Internal node: visit the selected child. Unfortunately, invalid AABBs return a
+                    // hit as well, so skip child indices that are out of bounds (`>= nodes.len()`).
+                    if !node.leaf
+                        && (bitmask & (1 << ii)) != 0
+                        && (node.children[ii] as usize) < self.nodes.len()
+                    {
+                        stack.push(node.children[ii]);
+                    }
+                }
+            }
+        }
+
+        stack
+            .as_slice()
+            .par_iter()
+            .copied()
+            .for_each(|entry| self.traverse_depth_first_node_parallel(visitor, exit_early, entry));
+    }
+
+    /// Performs a simultaneous traversal of two QBVH using
+    /// parallelism internally for better performance with large trees.
+    pub fn traverse_bvtt_parallel<
+        LeafData2: IndexedData + Sync,
+        Visitor: ParallelSimdSimultaneousVisitor<LeafData, LeafData2>,
+    >(
+        &self,
+        qbvh2: &QBVH<LeafData2>,
+        visitor: &Visitor,
+    ) {
+        if !self.nodes.is_empty() && !qbvh2.nodes.is_empty() {
+            let exit_early = AtomicBool::new(false);
+            self.traverse_bvtt_node_parallel(
+                qbvh2,
+                visitor,
+                &exit_early,
+                Visitor::Data::default(),
+                (0, 0),
+            );
+        }
+    }
+
+    /// Runs a parallel simultaneous traversal of the sub-tree starting at the given nodes.
+    ///
+    /// `data` is given to the visitor; the value it returns is forwarded to the child pairs.
+    pub fn traverse_bvtt_node_parallel<
+        LeafData2: IndexedData + Sync,
+        Visitor: ParallelSimdSimultaneousVisitor<LeafData, LeafData2>,
+    >(
+        &self,
+        qbvh2: &QBVH<LeafData2>,
+        visitor: &Visitor,
+        exit_early: &AtomicBool,
+        data: Visitor::Data,
+        entry: (u32, u32),
+    ) {
+        if exit_early.load(AtomicOrdering::Relaxed) {
+            return;
+        }
+
+        let qbvh1 = self;
+        let node1 = &qbvh1.nodes[entry.0 as usize];
+        let node2 = &qbvh2.nodes[entry.1 as usize];
+
+        const SQUARE_SIMD_WIDTH: usize = SIMD_WIDTH * SIMD_WIDTH;
+        let mut stack: ArrayVec<(u32, u32), SQUARE_SIMD_WIDTH> = ArrayVec::new();
+
+        let leaf_data1 = if node1.leaf {
+            Some(
+                array![|ii| Some(&qbvh1.proxies.get(node1.children[ii] as usize)?.data); SIMD_WIDTH],
+            )
+        } else {
+            None
+        };
+
+        let leaf_data2 = if node2.leaf {
+            Some(
+                array![|ii| Some(&qbvh2.proxies.get(node2.children[ii] as usize)?.data); SIMD_WIDTH],
+            )
+        } else {
+            None
+        };
+
+        let (status, data) = visitor.visit(
+            entry.0, node1, leaf_data1, entry.1, node2, leaf_data2, data,
+        );
+
+        match status {
+            SimdSimultaneousVisitStatus::ExitEarly => {
+                exit_early.store(true, AtomicOrdering::Relaxed);
+                return;
+            }
+            SimdSimultaneousVisitStatus::MaybeContinue(mask) => {
+                // Only push child indices that are in bounds (strictly below `nodes.len()`).
+                match (node1.leaf, node2.leaf) {
+                    (true, true) => { /* Can’t go deeper. */ }
+                    (true, false) => {
+                        let mut bitmask = 0;
+                        for ii in 0..SIMD_WIDTH {
+                            bitmask |= mask[ii].bitmask();
+                        }
+
+                        for jj in 0..SIMD_WIDTH {
+                            if (bitmask & (1 << jj)) != 0
+                                && (node2.children[jj] as usize) < qbvh2.nodes.len()
+                            {
+                                stack.push((entry.0, node2.children[jj]));
+                            }
+                        }
+                    }
+                    (false, true) => {
+                        for ii in 0..SIMD_WIDTH {
+                            let bitmask = mask[ii].bitmask();
+
+                            if bitmask != 0 && (node1.children[ii] as usize) < qbvh1.nodes.len() {
+                                stack.push((node1.children[ii], entry.1));
+                            }
+                        }
+                    }
+                    (false, false) => {
+                        for ii in 0..SIMD_WIDTH {
+                            let bitmask = mask[ii].bitmask();
+
+                            for jj in 0..SIMD_WIDTH {
+                                if (bitmask & (1 << jj)) != 0
+                                    && (node1.children[ii] as usize) < qbvh1.nodes.len()
+                                    && (node2.children[jj] as usize) < qbvh2.nodes.len()
+                                {
+                                    stack.push((node1.children[ii], node2.children[jj]));
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        stack.as_slice().par_iter().copied().for_each(|entry| {
+            self.traverse_bvtt_node_parallel(qbvh2, visitor, exit_early, data, entry)
+        });
+    }
+}
diff --git a/src/partitioning/qbvh/update.rs b/src/partitioning/qbvh/update.rs
index 564cf0d7..e45c4614 100644
--- a/src/partitioning/qbvh/update.rs
+++ b/src/partitioning/qbvh/update.rs
@@ -15,15 +15,15 @@ struct QBVHIncrementalBuilderStep {
 }
 
 #[allow(dead_code)]
-struct QBVHIncrementalBuilder<T> {
-    qbvh: QBVH<T>,
+struct QBVHIncrementalBuilder<LeafData> {
+    qbvh: QBVH<LeafData>,
     to_insert: Vec<QBVHIncrementalBuilderStep>,
     aabbs: Vec<AABB>,
     indices: Vec<usize>,
 }
 
 #[allow(dead_code)]
-impl<T: IndexedData> QBVHIncrementalBuilder<T> {
+impl<LeafData: IndexedData> QBVHIncrementalBuilder<LeafData> {
     pub fn new() -> Self {
         Self {
             qbvh: QBVH::new(),
@@ -158,28 +158,28 @@ impl<T: IndexedData> QBVHIncrementalBuilder<T> {
     }
 }
 
-impl<T: IndexedData> QBVH<T> {
+impl<LeafData: IndexedData> QBVH<LeafData> {
     /// Marks a piece of data as dirty so it can be updated during the next
     /// call to `self.update`.
-    pub fn pre_update(&mut self, data: T) {
+    pub fn pre_update(&mut self, data: LeafData) {
         let id = data.index();
         let node_id = self.proxies[id].node.index;
         let node = &mut self.nodes[node_id as usize];
         if !node.dirty {
             node.dirty = true;
-            self.dirty_nodes.push_back(node_id);
+            self.dirty_nodes.push(node_id);
         }
     }
 
     /// Update all the nodes that have been marked as dirty by `self.pre_update`.
     pub fn update<F>(&mut self, aabb_builder: F, dilation_factor: Real)
     where
-        F: Fn(&T) -> AABB,
+        F: Fn(&LeafData) -> AABB,
     {
         // Loop on the dirty leaves.
         let dilation_factor = SimdReal::splat(dilation_factor);
 
-        while let Some(id) = self.dirty_nodes.pop_front() {
+        while let Some(id) = self.dirty_nodes.pop() {
             // NOTE: this will data the case where we reach the root of the tree.
             if let Some(node) = self.nodes.get(id as usize) {
                 // Compute the new aabb.
@@ -203,7 +203,7 @@ impl<T: IndexedData> QBVH<T> {
                 if !node.simd_aabb.contains(&new_simd_aabb).all() {
                     node.simd_aabb = new_simd_aabb;
                     node.simd_aabb.dilate_by_factor(dilation_factor);
-                    self.dirty_nodes.push_back(node.parent.index);
+                    self.dirty_nodes.push(node.parent.index);
                 }
                 node.dirty = false;
             }
diff --git a/src/partitioning/visitor.rs b/src/partitioning/visitor.rs
index 9f864ecd..f364a5c7 100644
--- a/src/partitioning/visitor.rs
+++ b/src/partitioning/visitor.rs
@@ -1,4 +1,6 @@
 use crate::math::{Real, SimdBool, SimdReal, SIMD_WIDTH};
+use crate::partitioning::qbvh::QBVHNode;
+use crate::partitioning::SimdNodeIndex;
 
 /// The next action to be taken by a BVH traversal algorithm after having visited a node with some data.
 pub enum SimdBestFirstVisitStatus<Res> {
@@ -20,7 +22,7 @@ pub enum SimdBestFirstVisitStatus<Res> {
 }
 
 /// Trait implemented by cost functions used by the best-first search on a `BVT`.
-pub trait SimdBestFirstVisitor<T, SimdBV> {
+pub trait SimdBestFirstVisitor<LeafData, SimdBV> {
     /// The result of a best-first traversal.
     type Result;
 
@@ -29,7 +31,7 @@ pub trait SimdBestFirstVisitor<T, SimdBV> {
         &mut self,
         best_cost_so_far: Real,
         bv: &SimdBV,
-        value: Option<[Option<&T>; SIMD_WIDTH]>,
+        value: Option<[Option<&LeafData>; SIMD_WIDTH]>,
     ) -> SimdBestFirstVisitStatus<Self::Result>;
 }
 
@@ -52,23 +54,30 @@ pub enum SimdSimultaneousVisitStatus {
 }
 
 /// Trait implemented by visitor called during the traversal of a spatial partitioning data structure.
-pub trait SimdVisitor<T, SimdBV> {
+pub trait SimdVisitor<LeafData, SimdBV> {
     /// Execute an operation on the content of a node of the spatial partitioning structure.
     ///
     /// Returns whether the traversal should continue on the node's children, if it should not continue
     /// on those children, or if the whole traversal should be exited early.
-    fn visit(&mut self, bv: &SimdBV, data: Option<[Option<&T>; SIMD_WIDTH]>) -> SimdVisitStatus;
+    fn visit(
+        &mut self,
+        bv: &SimdBV,
+        data: Option<[Option<&LeafData>; SIMD_WIDTH]>,
+    ) -> SimdVisitStatus;
 }
 
-impl<F, T, SimdBV> SimdVisitor<T, SimdBV> for F
+impl<F, LeafData, SimdBV> SimdVisitor<LeafData, SimdBV> for F
 where
-    F: FnMut(&SimdBV, Option<[Option<&T>; SIMD_WIDTH]>) -> SimdVisitStatus,
+    F: FnMut(&SimdBV, Option<[Option<&LeafData>; SIMD_WIDTH]>) -> SimdVisitStatus,
 {
-    fn visit(&mut self, bv: &SimdBV, data: Option<[Option<&T>; SIMD_WIDTH]>) -> SimdVisitStatus {
+    fn visit(
+        &mut self,
+        bv: &SimdBV,
+        data: Option<[Option<&LeafData>; SIMD_WIDTH]>,
+    ) -> SimdVisitStatus {
         (self)(bv, data)
     }
 }
-
 /// Trait implemented by visitor called during a simultaneous spatial partitioning data structure tarversal.
 pub trait SimdSimultaneousVisitor<T1, T2, SimdBV> {
     /// Execute an operation on the content of two nodes, one from each structure.
@@ -83,3 +92,60 @@ pub trait SimdSimultaneousVisitor<T1, T2, SimdBV> {
         right_data: Option<[Option<&T2>; SIMD_WIDTH]>,
     ) -> SimdSimultaneousVisitStatus;
 }
+
+/*
+ *
+ * Parallel visitors below.
+ *
+ */
+
+/// Trait implemented by visitor called during the parallel traversal of a spatial partitioning data structure.
+pub trait ParallelSimdVisitor<LeafData>: Sync {
+    /// Execute an operation on the node `bv` (of index `node_id`) and its leaf `data`, if any.
+    ///
+    /// Returns whether the traversal should continue on the node's children, if it should not continue
+    /// on those children, or if the whole traversal should be exited early.
+    fn visit(
+        &self,
+        node_id: SimdNodeIndex,
+        bv: &QBVHNode,
+        data: Option<[Option<&LeafData>; SIMD_WIDTH]>,
+    ) -> SimdVisitStatus;
+}
+
+impl<F, LeafData> ParallelSimdVisitor<LeafData> for F
+where
+    F: Sync + Fn(&QBVHNode, Option<[Option<&LeafData>; SIMD_WIDTH]>) -> SimdVisitStatus,
+{
+    fn visit(
+        &self,
+        _node_id: SimdNodeIndex,
+        node: &QBVHNode,
+        data: Option<[Option<&LeafData>; SIMD_WIDTH]>,
+    ) -> SimdVisitStatus {
+        (self)(node, data)
+    }
+}
+
+/// Trait implemented by visitor called during a parallel simultaneous spatial partitioning
+/// data structure traversal.
+#[cfg(feature = "parallel")]
+pub trait ParallelSimdSimultaneousVisitor<LeafData1, LeafData2>: Sync {
+    /// Visitor state data that will be passed down the recursion.
+    type Data: Copy + Sync + Default;
+
+    /// Execute an operation on the content of two nodes, one from each structure.
+    ///
+    /// Returns whether the traversal should continue on the nodes' children, if it should not continue
+    /// on those children, or if the whole traversal should be exited early.
+    fn visit(
+        &self,
+        left_node_id: SimdNodeIndex,
+        left_node: &QBVHNode,
+        left_data: Option<[Option<&LeafData1>; SIMD_WIDTH]>,
+        right_node_id: SimdNodeIndex,
+        right_node: &QBVHNode,
+        right_data: Option<[Option<&LeafData2>; SIMD_WIDTH]>,
+        visitor_data: Self::Data,
+    ) -> (SimdSimultaneousVisitStatus, Self::Data);
+}
diff --git a/src/query/closest_points/closest_points.rs b/src/query/closest_points/closest_points.rs
index cab6bce6..991d360a 100644
--- a/src/query/closest_points/closest_points.rs
+++ b/src/query/closest_points/closest_points.rs
@@ -5,6 +5,10 @@ use std::mem;
 /// Closest points information.
 #[derive(Debug, PartialEq, Clone, Copy)]
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 pub enum ClosestPoints {
     /// The two objects are intersecting.
     Intersecting,
diff --git a/src/query/contact/contact.rs b/src/query/contact/contact.rs
index 22f6637b..a6f7821f 100644
--- a/src/query/contact/contact.rs
+++ b/src/query/contact/contact.rs
@@ -5,6 +5,10 @@ use std::mem;
 /// Geometric description of a contact.
 #[derive(Debug, PartialEq, Copy, Clone)]
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 pub struct Contact {
     /// Position of the contact on the first object.
     pub point1: Point<Real>,
diff --git a/src/query/contact/contact_composite_shape_shape.rs b/src/query/contact/contact_composite_shape_shape.rs
index 9a8078fe..40418991 100644
--- a/src/query/contact/contact_composite_shape_shape.rs
+++ b/src/query/contact/contact_composite_shape_shape.rs
@@ -41,7 +41,7 @@ where
     };
 
     let mut visitor = BoundingVolumeIntersectionsVisitor::new(&ls_aabb2, &mut leaf_callback);
-    g1.qbvh().traverse_depth_first(&mut visitor);
+    let _ = g1.qbvh().traverse_depth_first(&mut visitor);
     res
 }
 
diff --git a/src/query/contact_manifolds/contact_manifold.rs b/src/query/contact_manifolds/contact_manifold.rs
index e206c9ee..d3d30c5e 100644
--- a/src/query/contact_manifolds/contact_manifold.rs
+++ b/src/query/contact_manifolds/contact_manifold.rs
@@ -2,6 +2,10 @@ use crate::math::{Isometry, Point, Real, Vector};
 
 #[derive(Copy, Clone, Debug)]
 #[cfg_attr(feature = "serde-serialize", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 /// A single contact between two shape.
 pub struct TrackedContact<Data> {
     /// The contact point in the local-space of the first shape.
diff --git a/src/query/contact_manifolds/contact_manifolds_composite_shape_composite_shape.rs b/src/query/contact_manifolds/contact_manifolds_composite_shape_composite_shape.rs
index f31b7dc1..f3680bbb 100644
--- a/src/query/contact_manifolds/contact_manifolds_composite_shape_composite_shape.rs
+++ b/src/query/contact_manifolds/contact_manifolds_composite_shape_composite_shape.rs
@@ -12,6 +12,10 @@ use crate::utils::hashmap::{Entry, HashMap};
 use crate::utils::IsometryOpt;
 
 #[cfg_attr(feature = "serde-serialize", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[derive(Clone)]
 struct SubDetector {
     manifold_id: usize,
@@ -169,14 +173,14 @@ pub fn contact_manifolds_composite_shape_composite_shape<'a, ManifoldData, Conta
             let mut visitor2 =
                 BoundingVolumeIntersectionsVisitor::new(&ls_part_aabb1_2, &mut leaf_fn2);
 
-            qbvh2.traverse_depth_first_with_stack(&mut visitor2, &mut stack2);
+            let _ = qbvh2.traverse_depth_first_with_stack(&mut visitor2, &mut stack2);
         });
 
         true
     };
 
     let mut visitor1 = BoundingVolumeIntersectionsVisitor::new(&ls_aabb2_1, &mut leaf_fn1);
-    qbvh1.traverse_depth_first(&mut visitor1);
+    let _ = qbvh1.traverse_depth_first(&mut visitor1);
 
     workspace
         .sub_detectors
diff --git a/src/query/contact_manifolds/contact_manifolds_composite_shape_shape.rs b/src/query/contact_manifolds/contact_manifolds_composite_shape_shape.rs
index 7c2dc195..1f3ca8f6 100644
--- a/src/query/contact_manifolds/contact_manifolds_composite_shape_shape.rs
+++ b/src/query/contact_manifolds/contact_manifolds_composite_shape_shape.rs
@@ -12,6 +12,10 @@ use crate::utils::hashmap::{Entry, HashMap};
 use crate::utils::IsometryOpt;
 
 #[cfg_attr(feature = "serde-serialize", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[derive(Clone)]
 struct SubDetector {
     manifold_id: usize,
@@ -140,7 +144,7 @@ pub fn contact_manifolds_composite_shape_shape<ManifoldData, ContactData>(
     };
 
     let mut visitor1 = BoundingVolumeIntersectionsVisitor::new(&ls_aabb2_1, &mut leaf1_fn);
-    composite1.qbvh().traverse_depth_first(&mut visitor1);
+    let _ = composite1.qbvh().traverse_depth_first(&mut visitor1);
 
     workspace
         .sub_detectors
diff --git a/src/query/contact_manifolds/contact_manifolds_heightfield_composite_shape.rs b/src/query/contact_manifolds/contact_manifolds_heightfield_composite_shape.rs
index c2fcecb3..e09187e5 100644
--- a/src/query/contact_manifolds/contact_manifolds_heightfield_composite_shape.rs
+++ b/src/query/contact_manifolds/contact_manifolds_heightfield_composite_shape.rs
@@ -14,6 +14,10 @@ use crate::utils::hashmap::{Entry, HashMap};
 use crate::utils::IsometryOpt;
 
 #[cfg_attr(feature = "serde-serialize", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[derive(Clone)]
 struct SubDetector {
     manifold_id: usize,
@@ -147,7 +151,7 @@ pub fn contact_manifolds_heightfield_composite_shape<ManifoldData, ContactData>(
         };
 
         let mut visitor2 = BoundingVolumeIntersectionsVisitor::new(&ls_aabb1_2, &mut leaf_fn2);
-        qbvh2.traverse_depth_first_with_stack(&mut visitor2, &mut stack2);
+        let _ = qbvh2.traverse_depth_first_with_stack(&mut visitor2, &mut stack2);
     });
 
     workspace
diff --git a/src/query/contact_manifolds/contact_manifolds_heightfield_shape.rs b/src/query/contact_manifolds/contact_manifolds_heightfield_shape.rs
index c7020045..15e9d9a9 100644
--- a/src/query/contact_manifolds/contact_manifolds_heightfield_shape.rs
+++ b/src/query/contact_manifolds/contact_manifolds_heightfield_shape.rs
@@ -12,6 +12,10 @@ use crate::shape::{HeightField, Shape};
 use crate::utils::hashmap::{Entry, HashMap};
 
 #[cfg_attr(feature = "serde-serialize", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[derive(Clone)]
 struct SubDetector {
     manifold_id: usize,
diff --git a/src/query/contact_manifolds/contact_manifolds_trimesh_shape.rs b/src/query/contact_manifolds/contact_manifolds_trimesh_shape.rs
index 1c0f560e..2a00837a 100644
--- a/src/query/contact_manifolds/contact_manifolds_trimesh_shape.rs
+++ b/src/query/contact_manifolds/contact_manifolds_trimesh_shape.rs
@@ -9,6 +9,10 @@ use crate::query::ContactManifold;
 use crate::shape::{Shape, TriMesh};
 
 #[cfg_attr(feature = "serde-serialize", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[derive(Clone)]
 pub struct TriMeshShapeContactManifoldsWorkspace {
     interferences: Vec<u32>,
diff --git a/src/query/point/point_composite_shape.rs b/src/query/point/point_composite_shape.rs
index 6e57445c..d618dbfc 100644
--- a/src/query/point/point_composite_shape.rs
+++ b/src/query/point/point_composite_shape.rs
@@ -39,7 +39,7 @@ impl PointQuery for Polyline {
     #[inline]
     fn contains_local_point(&self, point: &Point<Real>) -> bool {
         let mut visitor = CompositePointContainmentTest::new(self, point);
-        self.qbvh().traverse_depth_first(&mut visitor);
+        let _ = self.qbvh().traverse_depth_first(&mut visitor);
         visitor.found
     }
 }
@@ -84,7 +84,7 @@ impl PointQuery for TriMesh {
         }
 
         let mut visitor = CompositePointContainmentTest::new(self, point);
-        self.qbvh().traverse_depth_first(&mut visitor);
+        let _ = self.qbvh().traverse_depth_first(&mut visitor);
         visitor.found
     }
 }
@@ -107,7 +107,7 @@ impl PointQuery for Compound {
     #[inline]
     fn contains_local_point(&self, point: &Point<Real>) -> bool {
         let mut visitor = CompositePointContainmentTest::new(self, point);
-        self.qbvh().traverse_depth_first(&mut visitor);
+        let _ = self.qbvh().traverse_depth_first(&mut visitor);
         visitor.found
     }
 }
diff --git a/src/query/point/point_query.rs b/src/query/point/point_query.rs
index f1e37d1b..6bc5e4c6 100644
--- a/src/query/point/point_query.rs
+++ b/src/query/point/point_query.rs
@@ -5,6 +5,10 @@ use na;
 /// Description of the projection of a point on a shape.
 #[derive(Copy, Clone, Debug)]
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 pub struct PointProjection {
     /// Whether or not the point to project was inside of the shape.
     pub is_inside: bool,
diff --git a/src/query/ray/ray.rs b/src/query/ray/ray.rs
index 23ed9427..b28c985d 100644
--- a/src/query/ray/ray.rs
+++ b/src/query/ray/ray.rs
@@ -6,6 +6,10 @@ use crate::shape::FeatureId;
 /// A Ray.
 #[derive(Debug, Clone, Copy)]
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[cfg_attr(feature = "cuda", derive(cust_core::DeviceCopy))]
 #[repr(C)]
 pub struct Ray {
@@ -54,6 +58,10 @@ impl Ray {
 /// Structure containing the result of a successful ray cast.
 #[derive(Copy, Clone, Debug)]
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 pub struct RayIntersection {
     /// The time of impact of the ray with the object.  The exact contact point can be computed
     /// with: `ray.point_at(toi)` or equivalently `origin + dir * toi` where `origin` is the origin of the ray;
diff --git a/src/query/split/split_trimesh.rs b/src/query/split/split_trimesh.rs
index 38909b92..e91ff2d9 100644
--- a/src/query/split/split_trimesh.rs
+++ b/src/query/split/split_trimesh.rs
@@ -419,7 +419,7 @@ impl TriMesh {
             intersecting_tris.push(*id);
             true
         });
-        self.qbvh().traverse_depth_first(&mut visitor);
+        let _ = self.qbvh().traverse_depth_first(&mut visitor);
 
         if intersecting_tris.is_empty() {
             return None;
diff --git a/src/query/visitors/bounding_volume_intersections_simultaneous_visitor.rs b/src/query/visitors/bounding_volume_intersections_simultaneous_visitor.rs
index b5131a04..1d5098b1 100644
--- a/src/query/visitors/bounding_volume_intersections_simultaneous_visitor.rs
+++ b/src/query/visitors/bounding_volume_intersections_simultaneous_visitor.rs
@@ -5,6 +5,9 @@ use na::SimdValue;
 use simba::simd::SimdBool as _;
 use std::marker::PhantomData;
 
+#[cfg(feature = "parallel")]
+use crate::partitioning::{QBVHNode, SimdNodeIndex};
+
 /// Spatial partitioning data structure visitor collecting interferences with a given bounding volume.
 pub struct BoundingVolumeIntersectionsSimultaneousVisitor<T1, T2, F> {
     pos12: Option<Isometry<SimdReal>>,
@@ -12,10 +15,7 @@ pub struct BoundingVolumeIntersectionsSimultaneousVisitor<T1, T2, F> {
     _phantom: PhantomData<(T1, T2)>,
 }
 
-impl<T1, T2, F> BoundingVolumeIntersectionsSimultaneousVisitor<T1, T2, F>
-where
-    F: FnMut(&T1, &T2) -> bool,
-{
+impl<T1, T2, F> BoundingVolumeIntersectionsSimultaneousVisitor<T1, T2, F> {
     /// Creates a new `BoundingVolumeIntersectionsSimultaneousVisitor`.
     #[inline]
     pub fn new(callback: F) -> BoundingVolumeIntersectionsSimultaneousVisitor<T1, T2, F> {
@@ -77,3 +77,52 @@ where
         SimdSimultaneousVisitStatus::MaybeContinue(mask)
     }
 }
+
+#[cfg(feature = "parallel")]
+impl<LeafData1: Sync, LeafData2: Sync, F>
+    crate::partitioning::ParallelSimdSimultaneousVisitor<LeafData1, LeafData2>
+    for BoundingVolumeIntersectionsSimultaneousVisitor<LeafData1, LeafData2, F>
+where
+    F: Sync + Fn(&LeafData1, &LeafData2) -> bool,
+{
+    type Data = ();
+
+    #[inline]
+    fn visit(
+        &self,
+        _: SimdNodeIndex,
+        left_node: &QBVHNode,
+        left_data: Option<[Option<&LeafData1>; SIMD_WIDTH]>,
+        _: SimdNodeIndex,
+        right_node: &QBVHNode,
+        right_data: Option<[Option<&LeafData2>; SIMD_WIDTH]>,
+        _: (),
+    ) -> (SimdSimultaneousVisitStatus, ()) {
+        // Test the left AABBs against the right ones (transformed by `pos12` when it is set).
+        let left_bv = &left_node.simd_aabb;
+        let mask = match &self.pos12 {
+            Some(pos12) => {
+                left_bv.intersects_permutations(&right_node.simd_aabb.transform_by(pos12))
+            }
+            None => left_bv.intersects_permutations(&right_node.simd_aabb),
+        };
+
+        if let (Some(leaves1), Some(leaves2)) = (left_data, right_data) {
+            for ii in 0..SIMD_WIDTH {
+                let bitmask = mask[ii].bitmask();
+
+                for jj in 0..SIMD_WIDTH {
+                    if (bitmask & (1 << jj)) != 0 {
+                        if let (Some(leaf1), Some(leaf2)) = (leaves1[ii], leaves2[jj]) {
+                            if !(self.callback)(leaf1, leaf2) {
+                                return (SimdSimultaneousVisitStatus::ExitEarly, ());
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        (SimdSimultaneousVisitStatus::MaybeContinue(mask), ())
+    }
+}
diff --git a/src/shape/ball.rs b/src/shape/ball.rs
index 47897d1c..3f5342f8 100644
--- a/src/shape/ball.rs
+++ b/src/shape/ball.rs
@@ -7,6 +7,10 @@ use crate::shape::SupportMap;
 
 /// A Ball shape.
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+// #[cfg_attr(
+//     feature = "rkyv",
+//     derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+// )]
 #[cfg_attr(feature = "cuda", derive(cust_core::DeviceCopy))]
 #[derive(PartialEq, Debug, Copy, Clone)]
 #[repr(C)]
diff --git a/src/shape/capsule.rs b/src/shape/capsule.rs
index d01e142d..ed4793df 100644
--- a/src/shape/capsule.rs
+++ b/src/shape/capsule.rs
@@ -6,7 +6,11 @@ use na::Unit;
 use either::Either;
 
 #[derive(Copy, Clone, Debug)]
-#[cfg_attr(feature = "serde-serialize", derive(Serialize, Deserialize))]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[cfg_attr(feature = "cuda", derive(cust_core::DeviceCopy))]
 #[repr(C)]
 /// A capsule shape defined as a round segment.
diff --git a/src/shape/cone.rs b/src/shape/cone.rs
index 2abc4438..7c1b76cb 100644
--- a/src/shape/cone.rs
+++ b/src/shape/cone.rs
@@ -13,6 +13,10 @@ use na::RealField; // for .copysign()
 
 /// Cone shape with its principal axis aligned with the `y` axis.
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[cfg_attr(feature = "cuda", derive(cust_core::DeviceCopy))]
 #[derive(PartialEq, Debug, Copy, Clone)]
 #[repr(C)]
diff --git a/src/shape/convex_polygon.rs b/src/shape/convex_polygon.rs
index 3c52678c..915089f1 100644
--- a/src/shape/convex_polygon.rs
+++ b/src/shape/convex_polygon.rs
@@ -5,6 +5,10 @@ use na::{self, ComplexField, RealField, Unit};
 
 /// A 2D convex polygon.
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[derive(Clone, Debug)]
 pub struct ConvexPolygon {
     points: Vec<Point<Real>>,
diff --git a/src/shape/convex_polyhedron.rs b/src/shape/convex_polyhedron.rs
index 1ebc7fd9..f64a3cb0 100644
--- a/src/shape/convex_polyhedron.rs
+++ b/src/shape/convex_polyhedron.rs
@@ -10,6 +10,10 @@ use std::f64;
 use na::ComplexField; // for .abs()
 
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[derive(PartialEq, Debug, Copy, Clone)]
 pub struct Vertex {
     pub first_adj_face_or_edge: u32,
@@ -17,6 +21,10 @@ pub struct Vertex {
 }
 
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[derive(PartialEq, Debug, Copy, Clone)]
 pub struct Edge {
     pub vertices: Point2<u32>,
@@ -36,6 +44,10 @@ impl Edge {
 }
 
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[derive(PartialEq, Debug, Copy, Clone)]
 pub struct Face {
     pub first_vertex_or_edge: u32,
@@ -44,6 +56,10 @@ pub struct Face {
 }
 
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[derive(PartialEq, Debug, Copy, Clone)]
 struct Triangle {
     vertices: [u32; 3],
@@ -66,6 +82,10 @@ impl Triangle {
 }
 
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[derive(PartialEq, Debug, Clone)]
 /// A convex polyhedron without degenerate faces.
 pub struct ConvexPolyhedron {
diff --git a/src/shape/cuboid.rs b/src/shape/cuboid.rs
index 4fc3eb14..44ac3364 100644
--- a/src/shape/cuboid.rs
+++ b/src/shape/cuboid.rs
@@ -12,6 +12,10 @@ use na::RealField; // for .copysign()
 
 /// Shape of a box.
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[cfg_attr(feature = "cuda", derive(cust_core::DeviceCopy))]
 #[derive(PartialEq, Debug, Copy, Clone)]
 #[repr(C)]
diff --git a/src/shape/cylinder.rs b/src/shape/cylinder.rs
index f3f32316..11f5febc 100644
--- a/src/shape/cylinder.rs
+++ b/src/shape/cylinder.rs
@@ -13,6 +13,10 @@ use na::RealField; // for .copysign()
 
 /// Cylinder shape with its principal axis aligned with the `y` axis.
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[cfg_attr(feature = "cuda", derive(cust_core::DeviceCopy))]
 #[derive(PartialEq, Debug, Copy, Clone)]
 #[repr(C)]
diff --git a/src/shape/feature_id.rs b/src/shape/feature_id.rs
index 1ce05b7f..a54e4189 100644
--- a/src/shape/feature_id.rs
+++ b/src/shape/feature_id.rs
@@ -4,6 +4,10 @@
 /// allows an efficient retrieval of the geometric information of the
 /// feature.
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[derive(Copy, Clone, Debug, Hash, PartialEq, Eq)]
 pub enum FeatureId {
     /// Shape-dependent identifier of a vertex.
diff --git a/src/shape/half_space.rs b/src/shape/half_space.rs
index 351512d4..24c80524 100644
--- a/src/shape/half_space.rs
+++ b/src/shape/half_space.rs
@@ -5,6 +5,10 @@ use na::Unit;
 /// A half-space delimited by an infinite plane.
 #[derive(PartialEq, Debug, Clone, Copy)]
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[cfg_attr(feature = "cuda", derive(cust_core::DeviceCopy))]
 #[repr(C)]
 pub struct HalfSpace {
diff --git a/src/shape/heightfield2.rs b/src/shape/heightfield2.rs
index a1e89a52..7d8481db 100644
--- a/src/shape/heightfield2.rs
+++ b/src/shape/heightfield2.rs
@@ -50,6 +50,10 @@ impl<T: Scalar> HeightFieldStorage for DVector<T> {
 }
 
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[cfg_attr(feature = "cuda", derive(cust_core::DeviceCopy))]
 #[derive(Copy, Clone, Debug)]
 #[repr(C)] // Needed for Cuda.
diff --git a/src/shape/heightfield3.rs b/src/shape/heightfield3.rs
index 9123a664..a36d2c68 100644
--- a/src/shape/heightfield3.rs
+++ b/src/shape/heightfield3.rs
@@ -14,6 +14,10 @@ use na::ComplexField;
 
 bitflags! {
     #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+    #[cfg_attr(
+        feature = "rkyv",
+        derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+    )]
     #[cfg_attr(feature = "cuda", derive(cust_core::DeviceCopy))]
     #[derive(Default)]
     /// The status of the cell of an heightfield.
@@ -69,6 +73,10 @@ impl<T: Scalar> HeightFieldStorage for DMatrix<T> {
 }
 
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[cfg_attr(feature = "cuda", derive(cust_core::DeviceCopy))]
 #[derive(Copy, Clone, Debug)]
 #[repr(C)] // Needed for Cuda.
diff --git a/src/shape/polygon.rs b/src/shape/polygon.rs
index f4a7384c..c04870d6 100644
--- a/src/shape/polygon.rs
+++ b/src/shape/polygon.rs
@@ -5,6 +5,10 @@ use parry::bounding_volume::AABB;
 
 #[derive(Clone)]
 #[cfg_attr(feature = "serde-serialize", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 /// A convex planar polygon.
 pub struct Polygon {
     pub(crate) vertices: Vec<Point<Real>>,
diff --git a/src/shape/polyline.rs b/src/shape/polyline.rs
index 67839bab..e8ac52b7 100644
--- a/src/shape/polyline.rs
+++ b/src/shape/polyline.rs
@@ -10,6 +10,10 @@ use na::ComplexField; // for .abs()
 
 #[derive(Clone)]
 #[cfg_attr(feature = "serde-serialize", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 /// A polyline.
 pub struct Polyline {
     qbvh: QBVH<u32>,
diff --git a/src/shape/round_shape.rs b/src/shape/round_shape.rs
index f681281e..ecaf9889 100644
--- a/src/shape/round_shape.rs
+++ b/src/shape/round_shape.rs
@@ -3,6 +3,10 @@ use crate::shape::SupportMap;
 use na::Unit;
 
 #[cfg_attr(feature = "serde-serialize", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[cfg_attr(feature = "cuda", derive(cust_core::DeviceCopy))]
 #[derive(Copy, Clone, Debug)]
 #[repr(C)]
diff --git a/src/shape/segment.rs b/src/shape/segment.rs
index d048cac0..9a6fd048 100644
--- a/src/shape/segment.rs
+++ b/src/shape/segment.rs
@@ -8,6 +8,10 @@ use std::mem;
 
 /// A segment shape.
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[cfg_attr(feature = "cuda", derive(cust_core::DeviceCopy))]
 #[derive(PartialEq, Debug, Copy, Clone)]
 #[repr(C)]
diff --git a/src/shape/tetrahedron.rs b/src/shape/tetrahedron.rs
index d4dbcd6e..7d5e7393 100644
--- a/src/shape/tetrahedron.rs
+++ b/src/shape/tetrahedron.rs
@@ -11,6 +11,10 @@ use na::ComplexField; // for .abs()
 
 /// A tetrahedron with 4 vertices.
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[cfg_attr(feature = "cuda", derive(cust_core::DeviceCopy))]
 #[derive(Copy, Clone, Debug)]
 #[repr(C)]
diff --git a/src/shape/triangle.rs b/src/shape/triangle.rs
index bd91ded1..265918e1 100644
--- a/src/shape/triangle.rs
+++ b/src/shape/triangle.rs
@@ -13,6 +13,10 @@ use std::mem;
 
 /// A triangle shape.
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[cfg_attr(feature = "cuda", derive(cust_core::DeviceCopy))]
 #[derive(PartialEq, Debug, Copy, Clone, Default)]
 #[repr(C)]
diff --git a/src/shape/trimesh.rs b/src/shape/trimesh.rs
index 6de6f559..bb4ca5e2 100644
--- a/src/shape/trimesh.rs
+++ b/src/shape/trimesh.rs
@@ -54,6 +54,10 @@ impl std::error::Error for TopologyError {}
 /// DOI: 10.1109/TVCG.2005.49
 #[derive(Clone)]
 #[cfg_attr(feature = "serde-serialize", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 #[cfg(feature = "dim3")]
 pub struct TriMeshPseudoNormals {
     /// The pseudo-normals of the vertices.
@@ -75,6 +79,10 @@ impl Default for TriMeshPseudoNormals {
 /// The connected-components of a triangle mesh.
 #[derive(Clone, Debug)]
 #[cfg_attr(feature = "serde-serialize", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 pub struct TriMeshConnectedComponents {
     /// The `face_colors[i]` gives the connected-component index
     /// of the i-th face.
@@ -96,6 +104,10 @@ impl TriMeshConnectedComponents {
 /// A vertex of a triangle-mesh’s half-edge topology.
 #[derive(Clone, Copy, Debug)]
 #[cfg_attr(feature = "serde-serialize", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 pub struct TopoVertex {
     /// One of the half-edge with this vertex as endpoint.
     pub half_edge: u32,
@@ -104,6 +116,10 @@ pub struct TopoVertex {
 /// A face of a triangle-mesh’s half-edge topology.
 #[derive(Clone, Copy, Debug)]
 #[cfg_attr(feature = "serde-serialize", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 pub struct TopoFace {
     /// The half-edge adjascent to this face, whith a starting point equal
     /// to the first point of this face.
@@ -113,6 +129,10 @@ pub struct TopoFace {
 /// A half-edge of a triangle-mesh’s half-edge topology.
 #[derive(Clone, Copy, Debug)]
 #[cfg_attr(feature = "serde-serialize", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 pub struct TopoHalfEdge {
     /// The next half-edge.
     pub next: u32,
@@ -129,6 +149,10 @@ pub struct TopoHalfEdge {
 /// The half-edge topology information of a triangle mesh.
 #[derive(Clone, Default)]
 #[cfg_attr(feature = "serde-serialize", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 pub struct TriMeshTopology {
     /// The vertices of this half-edge representation.
     pub vertices: Vec<TopoVertex>,
@@ -155,6 +179,10 @@ impl TriMeshTopology {
 
 bitflags::bitflags! {
     #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+    #[cfg_attr(
+        feature = "rkyv",
+        derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+    )]
     #[cfg_attr(feature = "cuda", derive(cust_core::DeviceCopy))]
     #[derive(Default)]
     /// The status of the cell of an heightfield.
@@ -195,6 +223,10 @@ bitflags::bitflags! {
 
 #[derive(Clone)]
 #[cfg_attr(feature = "serde-serialize", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 /// A triangle mesh.
 pub struct TriMesh {
     qbvh: QBVH<u32>,
diff --git a/src/transformation/mesh_intersection/mesh_intersection.rs b/src/transformation/mesh_intersection/mesh_intersection.rs
index b56e747e..3a030654 100644
--- a/src/transformation/mesh_intersection/mesh_intersection.rs
+++ b/src/transformation/mesh_intersection/mesh_intersection.rs
@@ -35,11 +35,13 @@ pub fn intersect_meshes(
 
     // 1: collect all the potential triangle-triangle intersections.
     let mut intersections = vec![];
-    let mut visitor =
-        BoundingVolumeIntersectionsSimultaneousVisitor::with_relative_pos(pos12, |tri1, tri2| {
+    let mut visitor = BoundingVolumeIntersectionsSimultaneousVisitor::with_relative_pos(
+        pos12,
+        |tri1: &u32, tri2: &u32| {
             intersections.push((*tri1, *tri2));
             true
-        });
+        },
+    );
 
     mesh1.qbvh().traverse_bvtt(mesh2.qbvh(), &mut visitor);
 
diff --git a/src/utils/cuda_array.rs b/src/utils/cuda_array.rs
index 43453686..ac43699e 100644
--- a/src/utils/cuda_array.rs
+++ b/src/utils/cuda_array.rs
@@ -12,6 +12,7 @@ use cust_core::DeviceCopy;
  *
 */
 #[cfg(feature = "std")]
+/// A 2D array residing on GPU memory.
 pub struct CudaArray2<T: ?Sized + DeviceCopy> {
     data: DeviceBuffer<T>,
     nrows: usize,
@@ -20,6 +21,7 @@ pub struct CudaArray2<T: ?Sized + DeviceCopy> {
 
 #[cfg(feature = "std")]
 impl<T: ?Sized + DeviceCopy> CudaArray2<T> {
+    /// Initialize a 2D cuda array on the GPU.
     pub fn new(data: &[T], nrows: usize, ncols: usize) -> CudaResult<Self> {
         assert_eq!(
             data.len(),
@@ -29,10 +31,12 @@ impl<T: ?Sized + DeviceCopy> CudaArray2<T> {
         DeviceBuffer::from_slice(data).map(|data| Self { data, nrows, ncols })
     }
 
+    /// Initialize, using a matrix, a 2D cuda array on the GPU.
     pub fn from_matrix(mat: &na::DMatrix<T>) -> CudaResult<Self> {
         Self::new(mat.as_slice(), mat.nrows(), mat.ncols())
     }
 
+    /// Gets the device pointer to the CUDA memory.
     pub fn as_device_ptr(&self) -> CudaArrayPointer2<T> {
         CudaArrayPointer2 {
             data: self.data.as_device_ptr(),
@@ -44,6 +48,7 @@ impl<T: ?Sized + DeviceCopy> CudaArray2<T> {
 
 #[repr(C)]
 #[derive(Copy, Clone, cust_core::DeviceCopy)]
+/// A pointer to a 2D CUDA array.
 pub struct CudaArrayPointer2<T: ?Sized + DeviceCopy> {
     data: DevicePointer<T>,
     nrows: usize,
@@ -83,20 +88,24 @@ impl<T: ?Sized + DeviceCopy> HeightFieldStorage for CudaArrayPointer2<T> {
  *
  */
 #[cfg(feature = "std")]
+/// A 1D array residing on GPU memory.
 pub struct CudaArray1<T: ?Sized + DeviceCopy> {
     data: DeviceBuffer<T>,
 }
 
 #[cfg(feature = "std")]
 impl<T: ?Sized + DeviceCopy> CudaArray1<T> {
+    /// Initialize a 1D cuda array on the GPU.
     pub fn new(data: &[T]) -> CudaResult<Self> {
         DeviceBuffer::from_slice(data).map(|data| Self { data })
     }
 
+    /// Initialize a 1D cuda array on the GPU using a dynamically-sized vector.
     pub fn from_vector(vect: &na::DVector<T>) -> CudaResult<Self> {
         Self::new(vect.as_slice())
     }
 
+    /// Gets the device pointer to the CUDA memory.
     pub fn as_device_ptr(&self) -> CudaArrayPointer1<T> {
         CudaArrayPointer1 {
             data: self.data.as_device_ptr(),
@@ -107,11 +116,54 @@ impl<T: ?Sized + DeviceCopy> CudaArray1<T> {
 
 #[repr(C)]
 #[derive(Copy, Clone, cust_core::DeviceCopy)]
+/// A pointer to a 1D CUDA array.
 pub struct CudaArrayPointer1<T: ?Sized + DeviceCopy> {
     data: DevicePointer<T>,
     len: usize,
 }
 
+#[cfg(target_os = "cuda")]
+impl<T: ?Sized + DeviceCopy> CudaArrayPointer1<T> {
+    /// The number of elements of this array.
+    pub fn len(&self) -> usize {
+        self.len
+    }
+
+    /// Returns `true` if this array has no elements.
+    pub fn is_empty(&self) -> bool {
+        self.len == 0
+    }
+
+    /// Reads the `i`-th element of this array.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `i >= self.len()`.
+    pub fn get(&self, i: usize) -> T {
+        assert!(i < self.len);
+        // SAFETY: `i` was just checked against `len`; this relies on `data`
+        // pointing to at least `len` initialized elements.
+        unsafe { *self.data.as_ptr().add(i) }
+    }
+
+    /// Writes `val` to the `i`-th element of this array.
+    ///
+    /// This pointer type is `Copy`, so `&mut self` alone does not prevent
+    /// another copy from accessing the same element concurrently.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `i >= self.len()`.
+    pub fn set(&mut self, i: usize, val: T) {
+        assert!(i < self.len);
+        // SAFETY: `i` was just checked against `len`; this relies on `data`
+        // pointing to at least `len` writable elements.
+        unsafe {
+            *self.data.as_mut_ptr().add(i) = val;
+        }
+    }
+}
+
 #[cfg(all(feature = "dim2", target_os = "cuda"))]
 impl<T: ?Sized + DeviceCopy> HeightFieldStorage for CudaArrayPointer1<T> {
     type Item = T;
diff --git a/src/utils/sdp_matrix.rs b/src/utils/sdp_matrix.rs
index 203203d8..901376ee 100644
--- a/src/utils/sdp_matrix.rs
+++ b/src/utils/sdp_matrix.rs
@@ -5,6 +5,10 @@ use std::ops::{Add, Mul};
 /// A 2x2 symmetric-definite-positive matrix.
 #[derive(Copy, Clone, Debug, PartialEq)]
 #[cfg_attr(feature = "serde-serialize", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 pub struct SdpMatrix2<N> {
     /// The component at the first row and first column of this matrix.
     pub m11: N,
@@ -106,6 +110,10 @@ impl Mul<Real> for SdpMatrix2<Real> {
 /// A 3x3 symmetric-definite-positive matrix.
 #[derive(Copy, Clone, Debug, PartialEq)]
 #[cfg_attr(feature = "serde-serialize", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 pub struct SdpMatrix3<N> {
     /// The component at the first row and first column of this matrix.
     pub m11: N,
diff --git a/src/utils/sorted_pair.rs b/src/utils/sorted_pair.rs
index 3820ec26..f87c86f2 100644
--- a/src/utils/sorted_pair.rs
+++ b/src/utils/sorted_pair.rs
@@ -5,6 +5,10 @@ use std::ops::Deref;
 /// A pair of elements sorted in increasing order.
 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
 #[cfg_attr(feature = "serde-serialize", derive(Serialize, Deserialize))]
+#[cfg_attr(
+    feature = "rkyv",
+    derive(rkyv::Archive, rkyv::Deserialize, rkyv::Serialize)
+)]
 pub struct SortedPair<T: PartialOrd>([T; 2]);
 
 impl<T: PartialOrd> SortedPair<T> {
@@ -25,3 +29,33 @@ impl<T: PartialOrd> Deref for SortedPair<T> {
         unsafe { mem::transmute(self) }
     }
 }
+
+// TODO: can we avoid these manual impls of Hash/PartialEq/Eq for the archived types?
+// Hashing delegates to field `0`; the `where` clause requires the array of archived elements to be `Hash`.
+#[cfg(feature = "rkyv")]
+impl<T: rkyv::Archive + PartialOrd> std::hash::Hash for ArchivedSortedPair<T>
+where
+    [<T as rkyv::Archive>::Archived; 2]: std::hash::Hash,
+{
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.0.hash(state)
+    }
+}
+
+// Equality delegates to field `0`; the `where` clause requires the array of archived elements to be `PartialEq`.
+#[cfg(feature = "rkyv")]
+impl<T: rkyv::Archive + PartialOrd> PartialEq for ArchivedSortedPair<T>
+where
+    [<T as rkyv::Archive>::Archived; 2]: PartialEq,
+{
+    fn eq(&self, other: &Self) -> bool {
+        self.0 == other.0
+    }
+}
+
+// Marker impl with an empty body, available when the array of archived elements is `Eq`.
+#[cfg(feature = "rkyv")]
+impl<T: rkyv::Archive + PartialOrd> Eq for ArchivedSortedPair<T> where
+    [<T as rkyv::Archive>::Archived; 2]: Eq
+{
+}