diff --git a/deno_webgpu/src/buffer.rs b/deno_webgpu/src/buffer.rs
index 45eab483d1..1eb331863e 100644
--- a/deno_webgpu/src/buffer.rs
+++ b/deno_webgpu/src/buffer.rs
@@ -128,7 +128,7 @@ pub async fn op_webgpu_buffer_get_map_async(
         {
             let state = state.borrow();
             let instance = state.borrow::();
-            gfx_select!(device => instance.device_poll(device, false)).unwrap();
+            gfx_select!(device => instance.device_poll(device, false, None)).unwrap();
         }
         tokio::time::sleep(Duration::from_millis(10)).await;
     }
diff --git a/player/src/bin/play.rs b/player/src/bin/play.rs
index 524c859430..f05e66f56d 100644
--- a/player/src/bin/play.rs
+++ b/player/src/bin/play.rs
@@ -95,7 +95,7 @@ fn main() {
         }
 
         gfx_select!(device => global.device_stop_capture(device));
-        gfx_select!(device => global.device_poll(device, true)).unwrap();
+        gfx_select!(device => global.device_poll(device, true, None)).unwrap();
     }
     #[cfg(feature = "winit")]
     {
@@ -181,7 +181,7 @@ fn main() {
            },
            Event::LoopDestroyed => {
                log::info!("Closing");
-               gfx_select!(device => global.device_poll(device, true)).unwrap();
+               gfx_select!(device => global.device_poll(device, true, None)).unwrap();
            }
            _ => {}
        }
diff --git a/player/tests/test.rs b/player/tests/test.rs
index 6fa3395ae2..fd96f32173 100644
--- a/player/tests/test.rs
+++ b/player/tests/test.rs
@@ -120,7 +120,7 @@ impl Test<'_> {
         }
 
         println!("\t\t\tWaiting...");
-        wgc::gfx_select!(device => global.device_poll(device, true)).unwrap();
+        wgc::gfx_select!(device => global.device_poll(device, true, None)).unwrap();
 
         for expect in self.expectations {
             println!("\t\t\tChecking {}", expect.name);
diff --git a/wgpu-core/src/device/life.rs b/wgpu-core/src/device/life.rs
index 8c6705276f..27f1708860 100644
--- a/wgpu-core/src/device/life.rs
+++ b/wgpu-core/src/device/life.rs
@@ -224,6 +224,8 @@ struct ActiveSubmission {
 pub enum WaitIdleError {
     #[error(transparent)]
     Device(#[from] DeviceError),
+    #[error("Tried to wait using a submission index from the wrong device. Submission index is from device {0:?}. Called poll on device {1:?}.")]
+    WrongSubmissionIndex(id::QueueId, id::DeviceId),
     #[error("GPU got stuck :(")]
     StuckGpu,
 }
diff --git a/wgpu-core/src/device/mod.rs b/wgpu-core/src/device/mod.rs
index 24fa72e571..495c62d299 100644
--- a/wgpu-core/src/device/mod.rs
+++ b/wgpu-core/src/device/mod.rs
@@ -440,6 +440,7 @@ impl Device {
         &'this self,
         hub: &Hub,
         force_wait: bool,
+        submission_index: Option,
         token: &mut Token<'token, Self>,
     ) -> Result<(UserClosures, bool), WaitIdleError> {
         profiling::scope!("maintain", "Device");
@@ -464,13 +465,20 @@ impl Device {
         life_tracker.triage_mapped(hub, token);
 
         let last_done_index = if force_wait {
-            let current_index = self.active_submission_index;
+            let index_to_wait_for = match submission_index {
+                Some(submission_index) => {
+                    // We don't need to check to see if the queue id matches
+                    // as we already checked this from inside the poll call.
+                    submission_index.index
+                }
+                None => self.active_submission_index,
+            };
             unsafe {
                 self.raw
-                    .wait(&self.fence, current_index, CLEANUP_WAIT_MS)
+                    .wait(&self.fence, index_to_wait_for, CLEANUP_WAIT_MS)
                     .map_err(DeviceError::from)?
             };
-            current_index
+            index_to_wait_for
         } else {
             unsafe {
                 self.raw
@@ -4968,15 +4976,25 @@ impl Global {
         &self,
         device_id: id::DeviceId,
         force_wait: bool,
+        submission_index: Option,
     ) -> Result {
         let (closures, queue_empty) = {
+            if let Some(submission_index) = submission_index {
+                if submission_index.queue_id != device_id {
+                    return Err(WaitIdleError::WrongSubmissionIndex(
+                        submission_index.queue_id,
+                        device_id,
+                    ));
+                }
+            }
+
             let hub = A::hub(self);
             let mut token = Token::root();
             let (device_guard, mut token) = hub.devices.read(&mut token);
             device_guard
                 .get(device_id)
                 .map_err(|_| DeviceError::Invalid)?
-                .maintain(hub, force_wait, &mut token)?
+                .maintain(hub, force_wait, submission_index, &mut token)?
         };
         unsafe {
             closures.fire();
         }
@@ -5004,7 +5022,7 @@ impl Global {
         let (device_guard, mut token) = hub.devices.read(&mut token);
 
         for (id, device) in device_guard.iter(A::VARIANT) {
-            let (cbs, queue_empty) = device.maintain(hub, force_wait, &mut token)?;
+            let (cbs, queue_empty) = device.maintain(hub, force_wait, None, &mut token)?;
             all_queue_empty = all_queue_empty && queue_empty;
 
             // If the device's own `RefCount` clone is the only one left, and
diff --git a/wgpu-core/src/device/queue.rs b/wgpu-core/src/device/queue.rs
index e76255f772..7246199c2b 100644
--- a/wgpu-core/src/device/queue.rs
+++ b/wgpu-core/src/device/queue.rs
@@ -12,7 +12,7 @@ use crate::{
     id,
     init_tracker::{has_copy_partial_init_tracker_coverage, TextureInitRange},
     resource::{BufferAccessError, BufferMapState, TextureInner},
-    track, FastHashSet,
+    track, FastHashSet, SubmissionIndex,
 };
 
 use hal::{CommandEncoder as _, Device as _, Queue as _};
@@ -39,6 +39,13 @@ pub struct SubmittedWorkDoneClosure {
 unsafe impl Send for SubmittedWorkDoneClosure {}
 unsafe impl Sync for SubmittedWorkDoneClosure {}
 
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct WrappedSubmissionIndex {
+    pub queue_id: id::QueueId,
+    pub index: SubmissionIndex,
+}
+
 struct StagingData {
     buffer: A::Buffer,
 }
@@ -580,10 +587,10 @@ impl Global {
         &self,
         queue_id: id::QueueId,
         command_buffer_ids: &[id::CommandBufferId],
-    ) -> Result<(), QueueSubmitError> {
+    ) -> Result {
         profiling::scope!("submit", "Queue");
 
-        let callbacks = {
+        let (submit_index, callbacks) = {
            let hub = A::hub(self);
            let mut token = Token::root();
@@ -918,24 +925,28 @@ impl Global {
            // This will schedule destruction of all resources that are no longer needed
            // by the user but used in the command stream, among other things.
-           let (closures, _) = match device.maintain(hub, false, &mut token) {
+           let (closures, _) = match device.maintain(hub, false, None, &mut token) {
                Ok(closures) => closures,
                Err(WaitIdleError::Device(err)) => return Err(QueueSubmitError::Queue(err)),
                Err(WaitIdleError::StuckGpu) => return Err(QueueSubmitError::StuckGpu),
+               Err(WaitIdleError::WrongSubmissionIndex(..)) => unreachable!(),
            };
 
            device.pending_writes.temp_resources = pending_write_resources;
            device.temp_suspected.clear();
            device.lock_life(&mut token).post_submit();
 
-           closures
+           (submit_index, closures)
        };
 
        // the closures should execute with nothing locked!
        unsafe {
            callbacks.fire();
        }
 
-       Ok(())
+       Ok(WrappedSubmissionIndex {
+           queue_id,
+           index: submit_index,
+       })
    }
 
    pub fn queue_get_timestamp_period(
diff --git a/wgpu-hal/src/lib.rs b/wgpu-hal/src/lib.rs
index 82a3243986..0737053514 100644
--- a/wgpu-hal/src/lib.rs
+++ b/wgpu-hal/src/lib.rs
@@ -304,6 +304,7 @@ pub trait Device: Send + Sync {
     unsafe fn create_fence(&self) -> Result;
     unsafe fn destroy_fence(&self, fence: A::Fence);
     unsafe fn get_fence_value(&self, fence: &A::Fence) -> Result;
+    /// Calling wait with a lower value than the current fence value will immediately return.
     unsafe fn wait(
         &self,
         fence: &A::Fence,
diff --git a/wgpu/examples/capture/main.rs b/wgpu/examples/capture/main.rs
index 2f4b2a56a5..52aa8fad2c 100644
--- a/wgpu/examples/capture/main.rs
+++ b/wgpu/examples/capture/main.rs
@@ -5,7 +5,7 @@ use std::env;
 use std::fs::File;
 use std::io::Write;
 use std::mem::size_of;
-use wgpu::{Buffer, Device};
+use wgpu::{Buffer, Device, SubmissionIndex};
 
 async fn run(png_output_path: &str) {
     let args: Vec<_> = env::args().collect();
@@ -20,14 +20,22 @@ async fn run(png_output_path: &str) {
             return;
         }
     };
-    let (device, buffer, buffer_dimensions) = create_red_image_with_dimensions(width, height).await;
-    create_png(png_output_path, device, buffer, &buffer_dimensions).await;
+    let (device, buffer, buffer_dimensions, submission_index) =
+        create_red_image_with_dimensions(width, height).await;
+    create_png(
+        png_output_path,
+        device,
+        buffer,
+        &buffer_dimensions,
+        submission_index,
+    )
+    .await;
 }
 
 async fn create_red_image_with_dimensions(
     width: usize,
     height: usize,
-) -> (Device, Buffer, BufferDimensions) {
+) -> (Device, Buffer, BufferDimensions, SubmissionIndex) {
     let adapter = wgpu::Instance::new(
         wgpu::util::backend_bits_from_env().unwrap_or_else(wgpu::Backends::all),
     )
@@ -114,8 +122,8 @@ async fn create_red_image_with_dimensions(
         encoder.finish()
     };
 
-    queue.submit(Some(command_buffer));
-    (device, output_buffer, buffer_dimensions)
+    let index = queue.submit(Some(command_buffer));
+    (device, output_buffer, buffer_dimensions, index)
 }
 
 async fn create_png(
@@ -123,6 +131,7 @@
     device: Device,
     output_buffer: Buffer,
     buffer_dimensions: &BufferDimensions,
+    submission_index: SubmissionIndex,
 ) {
     // Note that we're not calling `.await` here.
     let buffer_slice = output_buffer.slice(..);
@@ -131,7 +140,9 @@
     // Poll the device in a blocking manner so that our future resolves.
     // In an actual application, `device.poll(...)` should
     // be called in an event loop or on another thread.
-    device.poll(wgpu::Maintain::Wait);
+    //
+    // We pass our submission index so we don't need to wait for any other possible submissions.
+    device.poll(wgpu::Maintain::Wait(Some(submission_index)));
     // If a file system is available, write the buffer as a PNG
     let has_file_system_available = cfg!(not(target_arch = "wasm32"));
     if !has_file_system_available {
diff --git a/wgpu/examples/framework.rs b/wgpu/examples/framework.rs
index 0ed135f1cd..ddab7ce68f 100644
--- a/wgpu/examples/framework.rs
+++ b/wgpu/examples/framework.rs
@@ -518,7 +518,7 @@ pub fn test(mut params: FrameworkRefTest) {
     let dst_buffer_slice = dst_buffer.slice(..);
     let _ = dst_buffer_slice.map_async(wgpu::MapMode::Read);
-    ctx.device.poll(wgpu::Maintain::Wait);
+    ctx.device.poll(wgpu::Maintain::Wait(None));
     let bytes = dst_buffer_slice.get_mapped_range().to_vec();
 
     test_common::image::compare_image_output(
diff --git a/wgpu/examples/hello-compute/main.rs b/wgpu/examples/hello-compute/main.rs
index bbff9130f4..c5d58921ab 100644
--- a/wgpu/examples/hello-compute/main.rs
+++ b/wgpu/examples/hello-compute/main.rs
@@ -153,7 +153,7 @@ async fn execute_gpu_inner(
     // Poll the device in a blocking manner so that our future resolves.
     // In an actual application, `device.poll(...)` should
     // be called in an event loop or on another thread.
-    device.poll(wgpu::Maintain::Wait);
+    device.poll(wgpu::Maintain::Wait(None));
 
     // Awaits until `buffer_future` can be read from
     if let Ok(()) = buffer_future.await {
diff --git a/wgpu/examples/mipmap/main.rs b/wgpu/examples/mipmap/main.rs
index 9603895d57..3570bdaf46 100644
--- a/wgpu/examples/mipmap/main.rs
+++ b/wgpu/examples/mipmap/main.rs
@@ -386,7 +386,7 @@ impl framework::Example for Example {
                 .slice(..)
                 .map_async(wgpu::MapMode::Read);
             // Wait for device to be done rendering mipmaps
-            device.poll(wgpu::Maintain::Wait);
+            device.poll(wgpu::Maintain::Wait(None));
             // This is guaranteed to be ready.
             let timestamp_view = query_sets
                 .data_buffer
diff --git a/wgpu/src/backend/direct.rs b/wgpu/src/backend/direct.rs
index fdd599e57b..72dd696759 100644
--- a/wgpu/src/backend/direct.rs
+++ b/wgpu/src/backend/direct.rs
@@ -800,6 +800,7 @@ impl crate::Context for Context {
     type SurfaceId = Surface;
     type SurfaceOutputDetail = SurfaceOutputDetail;
 
+    type SubmissionIndex = wgc::device::queue::WrappedSubmissionIndex;
     type RequestAdapterFuture = Ready>;
 
     #[allow(clippy::type_complexity)]
@@ -1571,7 +1572,7 @@
         #[cfg(any(not(target_arch = "wasm32"), feature = "emscripten"))]
         {
-            match wgc::gfx_select!(device.id => global.device_poll(device.id, true)) {
+            match wgc::gfx_select!(device.id => global.device_poll(device.id, true, None)) {
                 Ok(_) => (),
                 Err(err) => self.handle_error_fatal(err, "Device::drop"),
             }
@@ -1582,12 +1583,14 @@
     fn device_poll(&self, device: &Self::DeviceId, maintain: crate::Maintain) -> bool {
         let global = &self.0;
+        let (wait, index) = match maintain {
+            crate::Maintain::Poll => (false, None),
+            crate::Maintain::Wait(index) => (true, index.map(|i| i.0)),
+        };
         match wgc::gfx_select!(device.id => global.device_poll(
             device.id,
-            match maintain {
-                crate::Maintain::Poll => false,
-                crate::Maintain::Wait => true,
-            }
+            wait,
+            index
         )) {
             Ok(queue_empty) => queue_empty,
             Err(err) => self.handle_error_fatal(err, "Device::poll"),
@@ -2190,12 +2193,12 @@
         &self,
         queue: &Self::QueueId,
         command_buffers: I,
-    ) {
+    ) -> Self::SubmissionIndex {
         let temp_command_buffers = command_buffers.collect::>();
         let global = &self.0;
 
         match wgc::gfx_select!(*queue => global.queue_submit(*queue, &temp_command_buffers)) {
-            Ok(()) => (),
+            Ok(index) => index,
             Err(err) => self.handle_error_fatal(err, "Queue::submit"),
         }
     }
diff --git a/wgpu/src/backend/web.rs b/wgpu/src/backend/web.rs
index 18b84cb25c..48738f4f18 100644
--- a/wgpu/src/backend/web.rs
+++ b/wgpu/src/backend/web.rs
@@ -995,6 +995,7 @@ impl crate::Context for Context {
     type SurfaceId = Sendable;
     type SurfaceOutputDetail = SurfaceOutputDetail;
 
+    type SubmissionIndex = ();
     type RequestAdapterFuture = MakeSendFuture<
         wasm_bindgen_futures::JsFuture,
@@ -2200,10 +2201,12 @@ impl crate::Context for Context {
         &self,
         queue: &Self::QueueId,
         command_buffers: I,
-    ) {
+    ) -> Self::SubmissionIndex {
         let temp_command_buffers = command_buffers.map(|i| i.0).collect::();
 
         queue.0.submit(&temp_command_buffers);
+
+        // SubmissionIndex is (), so just let this function end
     }
 
     fn queue_get_timestamp_period(&self, _queue: &Self::QueueId) -> f32 {
diff --git a/wgpu/src/lib.rs b/wgpu/src/lib.rs
index 15408d8a7d..899cc1a925 100644
--- a/wgpu/src/lib.rs
+++ b/wgpu/src/lib.rs
@@ -187,6 +187,7 @@ trait Context: Debug + Send + Sized + Sync {
     type SurfaceId: Debug + Send + Sync + 'static;
     type SurfaceOutputDetail: Send;
 
+    type SubmissionIndex: Debug + Copy + Clone + Send + 'static;
     type RequestAdapterFuture: Future> + Send;
     type RequestDeviceFuture: Future> + Send;
@@ -490,7 +491,7 @@ trait Context: Debug + Send + Sized + Sync {
         &self,
         queue: &Self::QueueId,
         command_buffers: I,
-    );
+    ) -> Self::SubmissionIndex;
     fn queue_get_timestamp_period(&self, queue: &Self::QueueId) -> f32;
     fn queue_on_submitted_work_done(
         &self,
@@ -550,13 +551,24 @@
 pub struct Device {
     id: ::DeviceId,
 }
 
-/// Passed to [`Device::poll`] to control if it should block or not. This has no effect on
-/// the web.
-#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+/// Identifier for a particular call to [`Queue::submit`]. Can be used
+/// as part of an argument to [`Device::poll`] to block for a particular
+/// submission to finish.
+#[derive(Debug, Copy, Clone)]
+pub struct SubmissionIndex(::SubmissionIndex);
+
+/// Passed to [`Device::poll`] to control how and if it should block.
+#[derive(Clone)]
 pub enum Maintain {
-    /// Block
-    Wait,
-    /// Don't block
+    /// Block until the callbacks for the given submission will resolve on their own.
+    ///
+    /// If the submission index is None it will wait for the most recent submission.
+    ///
+    /// On native this will block the thread until the submission is finished.
+    ///
+    /// On web this is a no-op but all the callbacks will automatically fire.
+    Wait(Option),
+    /// Check the device for a single time without blocking.
     Poll,
 }
@@ -3366,14 +3378,19 @@ impl Queue {
     }
 
     /// Submits a series of finished command buffers for execution.
-    pub fn submit>(&self, command_buffers: I) {
-        Context::queue_submit(
+    pub fn submit>(
+        &self,
+        command_buffers: I,
+    ) -> SubmissionIndex {
+        let raw = Context::queue_submit(
             &*self.context,
             &self.id,
             command_buffers
                 .into_iter()
                 .map(|mut comb| comb.id.take().unwrap()),
         );
+
+        SubmissionIndex(raw)
     }
 
     /// Gets the amount of nanoseconds each tick of a timestamp query represents.
diff --git a/wgpu/tests/vertex_indices/mod.rs b/wgpu/tests/vertex_indices/mod.rs
index fa85ae62d9..1bc362f7db 100644
--- a/wgpu/tests/vertex_indices/mod.rs
+++ b/wgpu/tests/vertex_indices/mod.rs
@@ -124,7 +124,7 @@ fn pulling_common(
     ctx.queue.submit(Some(encoder.finish()));
     let slice = buffer.slice(..);
     let _ = slice.map_async(wgpu::MapMode::Read);
-    ctx.device.poll(wgpu::Maintain::Wait);
+    ctx.device.poll(wgpu::Maintain::Wait(None));
 
     let data: Vec = bytemuck::cast_slice(&*slice.get_mapped_range()).to_vec();
     assert_eq!(data, expected);
diff --git a/wgpu/tests/zero_init_texture_after_discard.rs b/wgpu/tests/zero_init_texture_after_discard.rs
index ec77e909f0..488778aff1 100644
--- a/wgpu/tests/zero_init_texture_after_discard.rs
+++ b/wgpu/tests/zero_init_texture_after_discard.rs
@@ -283,7 +283,7 @@ fn assert_buffer_is_zero(readback_buffer: &wgpu::Buffer, device: &wgpu::Device) {
     let buffer_slice = readback_buffer.slice(..);
     let _ = buffer_slice.map_async(wgpu::MapMode::Read);
-    device.poll(wgpu::Maintain::Wait);
+    device.poll(wgpu::Maintain::Wait(None));
 
     let buffer_view = buffer_slice.get_mapped_range();
     assert!(
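
Taken together, the changes above mean `Queue::submit` now returns a `SubmissionIndex` that can be fed back into `Device::poll` through `Maintain::Wait`, so a caller blocks only on its own submission instead of the most recent one. A minimal usage sketch (not part of the patch; the helper name and the command-buffer argument are illustrative only):

```rust
// Usage sketch only; `submit_and_block` is a hypothetical helper, not part of the patch.
fn submit_and_block(device: &wgpu::Device, queue: &wgpu::Queue, buf: wgpu::CommandBuffer) {
    // `Queue::submit` now returns an index identifying this particular submission.
    let index: wgpu::SubmissionIndex = queue.submit(Some(buf));

    // Block only until that submission has finished; `Maintain::Wait(None)`
    // keeps the old behaviour of waiting for the most recent submission.
    device.poll(wgpu::Maintain::Wait(Some(index)));
}
```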