diff --git a/Cargo.lock b/Cargo.lock index e2bb4476..47e65990 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -641,13 +641,10 @@ dependencies = [ [[package]] name = "imago" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "301cd8e9a7de4545ed7387d5d563f4cc5adbdc44d8e658ffbb548bccb5bfe194" +version = "0.1.3" dependencies = [ "async-trait", "bincode", - "futures", "libc", "miniz_oxide", "rustc_version", @@ -655,7 +652,6 @@ dependencies = [ "tokio", "tracing", "vm-memory", - "windows-sys 0.59.0", ] [[package]] diff --git a/src/devices/Cargo.toml b/src/devices/Cargo.toml index f703da64..7e516346 100644 --- a/src/devices/Cargo.toml +++ b/src/devices/Cargo.toml @@ -34,7 +34,7 @@ utils = { path = "../utils" } polly = { path = "../polly" } rutabaga_gfx = { path = "../rutabaga_gfx", features = ["virgl_renderer", "virgl_renderer_next"], optional = true } -imago = { version = "0.1.2", features = ["sync-wrappers", "vm-memory"] } +imago = { path = "../imago", features = ["sync-wrappers", "vm-memory"] } [target.'cfg(target_os = "macos")'.dependencies] hvf = { path = "../hvf" } diff --git a/src/imago/.cargo_vcs_info.json b/src/imago/.cargo_vcs_info.json new file mode 100644 index 00000000..4ff846d6 --- /dev/null +++ b/src/imago/.cargo_vcs_info.json @@ -0,0 +1,6 @@ +{ + "git": { + "sha1": "6d4fbca7dd85c4d740261c91f0350d3403cc6ee5" + }, + "path_in_vcs": "" +} \ No newline at end of file diff --git a/src/imago/.gitignore b/src/imago/.gitignore new file mode 100644 index 00000000..ea8c4bf7 --- /dev/null +++ b/src/imago/.gitignore @@ -0,0 +1 @@ +/target diff --git a/src/imago/Cargo.toml b/src/imago/Cargo.toml new file mode 100644 index 00000000..75b041d8 --- /dev/null +++ b/src/imago/Cargo.toml @@ -0,0 +1,71 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. + +[package] +edition = "2021" +name = "imago" +version = "0.1.3" +build = "build.rs" +autolib = false +autobins = false +autoexamples = false +autotests = false +autobenches = false +description = "A library for accessing virtual machine disk images." 
+readme = "README.md" +license = "MIT" +repository = "https://gitlab.com/hreitz/imago" + +[package.metadata.docs.rs] +all-features = true + +[features] +default = [] +sync-wrappers = [] +vm-memory = ["dep:vm-memory"] + +[lib] +name = "imago" +path = "src/lib.rs" + +[dependencies.async-trait] +version = "0.1" + +[dependencies.bincode] +version = "1.3" + +[dependencies.miniz_oxide] +version = "0.8" +features = ["std"] + +[dependencies.serde] +version = "1.0" +features = ["derive"] + +[dependencies.tokio] +version = "1" +features = [ + "rt", + "sync", +] + +[dependencies.tracing] +version = "0.1" + +[dependencies.vm-memory] +version = "0.16" +optional = true + +[build-dependencies.rustc_version] +version = "0.4.0" + +[target."cfg(unix)".dependencies.libc] +version = "0.2" diff --git a/src/imago/LICENSE b/src/imago/LICENSE new file mode 100644 index 00000000..c8f51b0b --- /dev/null +++ b/src/imago/LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2024 imago contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/src/imago/README.md b/src/imago/README.md new file mode 100644 index 00000000..2849f443 --- /dev/null +++ b/src/imago/README.md @@ -0,0 +1,76 @@ +# Imago + +Provides access to VM image formats. 
+
+Simple example (requires the `sync-wrappers` feature):
+```rust
+use imago::file::File;
+use imago::qcow2::Qcow2;
+use imago::SyncFormatAccess;
+use std::fs::OpenOptions;
+
+// Produce read-only qcow2 instance using purely `File` for storage
+let mut qcow2 = Qcow2::<File>::open_path_sync("image.qcow2", false)?;
+qcow2.open_implicit_dependencies_sync()?;
+
+let qcow2 = SyncFormatAccess::new(qcow2)?;
+
+let mut buf = vec![0u8; 512];
+qcow2.read(&mut buf, 0)?;
+```
+
+Another example, using the native async interface instead of sync wrapper functions, explicitly
+overriding the implicit references contained in qcow2 files, and showcasing the use of different
+types of storage (specifically normal files and null storage):
+```rust
+use imago::file::File;
+use imago::null::Null;
+use imago::qcow2::Qcow2;
+use imago::raw::Raw;
+use imago::{DynStorage, FormatAccess, Storage, StorageOpenOptions};
+use std::sync::Arc;
+
+let qcow2_file_opts = StorageOpenOptions::new()
+    .write(true)
+    .filename(String::from("image.qcow2"));
+let qcow2_file = File::open(qcow2_file_opts).await?;
+
+// Produce qcow2 instance with arbitrary (and potentially mixed) storage instances
+let mut qcow2 =
+    Qcow2::<Box<dyn DynStorage>, Arc<FormatAccess<Box<dyn DynStorage>>>>::open_image(Box::new(qcow2_file), true)
+        .await?;
+
+let backing_storage: Box<dyn DynStorage> = Box::new(Null::new(0));
+let backing = Raw::open_image(backing_storage, false).await?;
+let backing = Arc::new(FormatAccess::new(backing));
+qcow2.set_backing(Some(Arc::clone(&backing)));
+
+// Open potentially remaining dependencies (like an external data file)
+qcow2.open_implicit_dependencies().await?;
+
+let qcow2 = FormatAccess::new(qcow2);
+
+let mut buf = vec![0u8; 512];
+qcow2.read(&mut buf, 0).await?;
+
+qcow2.flush().await?;
+```
+
+# Flushing
+
+Given that `AsyncDrop` is not stable yet (and probably will not be stable for a long time),
+callers must ensure that images are properly flushed before dropping them, i.e. call
+`.flush().await` on any image that is not read-only.
+
+(The synchronous wrapper `SyncFormatAccess` does perform a synchronous flush in its `Drop`
+implementation.)
+
+# Features
+
+- `sync-wrappers`: Provide synchronous wrappers for the native `async` interface.  Note that
+  these build a `tokio` runtime in which they run the `async` functions, so using the `async`
+  interface is definitely preferred.
+
+- `vm-memory`: Provide conversion functions `IoVector::from_volatile_slice` and
+  `IoVectorMut::from_volatile_slice` to convert the vm-memory crate’s `[VolatileSlice]` arrays into
+  imago’s native I/O vectors.
diff --git a/src/imago/build.rs b/src/imago/build.rs
new file mode 100644
index 00000000..667d1dc9
--- /dev/null
+++ b/src/imago/build.rs
@@ -0,0 +1,9 @@
+use rustc_version::{version_meta, Channel};
+
+fn main() {
+    println!("cargo:rustc-check-cfg=cfg(nightly)");
+
+    if version_meta().unwrap().channel == Channel::Nightly {
+        println!("cargo:rustc-cfg=nightly");
+    }
+}
diff --git a/src/imago/rustfmt.toml b/src/imago/rustfmt.toml
new file mode 100644
index 00000000..48b16b35
--- /dev/null
+++ b/src/imago/rustfmt.toml
@@ -0,0 +1,3 @@
+edition = "2021"
+format_code_in_doc_comments = true
+imports_granularity = "Module"
diff --git a/src/imago/src/annotated.rs b/src/imago/src/annotated.rs
new file mode 100644
index 00000000..f9d016ff
--- /dev/null
+++ b/src/imago/src/annotated.rs
@@ -0,0 +1,172 @@
+//! Annotating wrapper around storage objects.
+//!
+//! Wraps other storage objects, adding an arbitrary tag to them.
+//!
+//!
This may be useful when using the “mapping” interface, to identify the storage objects returned +//! in raw mappings. +//! +//! Example: +//! ``` +//! # use imago::{FormatAccess, Mapping}; +//! # use imago::annotated::Annotated; +//! # use imago::null::Null; +//! # use imago::raw::Raw; +//! # tokio::runtime::Builder::new_current_thread() +//! # .build() +//! # .unwrap() +//! # .block_on(async move { +//! # +//! const TEST_TAG: u32 = 42; +//! +//! let disk_size = 16 << 30; +//! let test_offset = 1 << 30; +//! +//! let inner_storage = Null::new(disk_size); +//! let annotated_storage = Annotated::new(inner_storage, TEST_TAG); +//! let image = Raw::open_image(annotated_storage, false).await?; +//! let image = FormatAccess::new(image); +//! +//! let mapping = image.get_mapping(test_offset, 1).await?.0; +//! let Mapping::Raw { +//! storage, +//! offset, +//! writable, +//! } = mapping +//! else { +//! panic!("Raw mapping expected"); +//! }; +//! assert_eq!(*storage.tag(), TEST_TAG); +//! assert_eq!(offset, test_offset); +//! # +//! # Ok::<(), std::io::Error>(()) +//! # }).unwrap() +//! ``` + +use crate::io_buffers::{IoVector, IoVectorMut}; +use crate::storage::drivers::CommonStorageHelper; +use crate::{Storage, StorageOpenOptions}; +use std::fmt::{self, Debug, Display, Formatter}; +use std::io; +use std::ops::{Deref, DerefMut}; +use std::path::{Path, PathBuf}; + +/// Annotating wrapper around storage objects. +/// +/// Wraps other storage objects, adding an arbitrary tag to them. +// TODO: Remove the `Default` requirement. We want to implement `Storage::open()` if `Default` is +// implemented, though, but return an error if it is not. Doing that probably requires +// specialization, though. +#[derive(Debug)] +pub struct Annotated { + /// Wrapped storage object. + inner: S, + + /// Tag. + tag: Tag, +} + +impl Annotated { + /// Wrap `storage`, adding the tag `tag`. + pub fn new(storage: S, tag: T) -> Self { + Annotated { + inner: storage, + tag, + } + } + + /// Get the tag. + pub fn tag(&self) -> &T { + &self.tag + } + + /// Allow modifying or changing the tag. 
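+    ///
+    /// A minimal usage sketch (for illustration only; assumes `Null` storage and
+    /// a plain `u32` tag, neither of which is required by this method):
+    ///
+    /// ```no_run
+    /// # use imago::annotated::Annotated;
+    /// # use imago::null::Null;
+    /// let mut annotated = Annotated::new(Null::new(512), 1u32);
+    /// *annotated.tag_mut() = 2;
+    /// assert_eq!(*annotated.tag(), 2);
+    /// ```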
+ pub fn tag_mut(&mut self) -> &mut T { + &mut self.tag + } +} + +impl From for Annotated { + fn from(storage: S) -> Self { + Self::new(storage, T::default()) + } +} + +impl Storage for Annotated { + async fn open(opts: StorageOpenOptions) -> io::Result { + Ok(S::open(opts).await?.into()) + } + + #[cfg(feature = "sync-wrappers")] + fn open_sync(opts: StorageOpenOptions) -> io::Result { + Ok(S::open_sync(opts)?.into()) + } + + fn mem_align(&self) -> usize { + self.inner.mem_align() + } + + fn req_align(&self) -> usize { + self.inner.req_align() + } + + fn size(&self) -> io::Result { + self.inner.size() + } + + fn resolve_relative_path>(&self, relative: P) -> io::Result { + self.inner.resolve_relative_path(relative) + } + + async unsafe fn pure_readv(&self, bufv: IoVectorMut<'_>, offset: u64) -> io::Result<()> { + // Caller guarantees safety + unsafe { self.inner.pure_readv(bufv, offset) }.await + } + + async unsafe fn pure_writev(&self, bufv: IoVector<'_>, offset: u64) -> io::Result<()> { + // Caller guarantees safety + unsafe { self.inner.pure_writev(bufv, offset) }.await + } + + async unsafe fn pure_write_zeroes(&self, offset: u64, length: u64) -> io::Result<()> { + // Caller guarantees safety + unsafe { self.inner.pure_write_zeroes(offset, length) }.await + } + + async unsafe fn pure_discard(&self, offset: u64, length: u64) -> io::Result<()> { + // Caller guarantees safety + unsafe { self.inner.pure_discard(offset, length) }.await + } + + async fn flush(&self) -> io::Result<()> { + self.inner.flush().await + } + + async fn sync(&self) -> io::Result<()> { + self.inner.sync().await + } + + fn get_storage_helper(&self) -> &CommonStorageHelper { + // Share storage helper from inner (to e.g. get same request serialization) + self.inner.get_storage_helper() + } +} + +impl Deref for Annotated { + type Target = S; + + fn deref(&self) -> &S { + &self.inner + } +} + +impl DerefMut for Annotated { + fn deref_mut(&mut self) -> &mut S { + &mut self.inner + } +} + +impl Display for Annotated { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "annotated({})[{}]", self.tag, self.inner) + } +} diff --git a/src/imago/src/async_lru_cache.rs b/src/imago/src/async_lru_cache.rs new file mode 100644 index 00000000..72c66438 --- /dev/null +++ b/src/imago/src/async_lru_cache.rs @@ -0,0 +1,429 @@ +//! Provides a least-recently-used cache with async access. +//! +//! To operate, this cache is bound to an I/O back-end object that provides the loading and +//! flushing of cache entries. +//! +//! Also supports inter-cache dependency, e.g. for when the qcow2 L2 table cache needs to be +//! flushed before the refblock cache, because some clusters were freed (so the L2 references need +//! to be cleared before the clusters are deallocated). + +#![allow(dead_code)] + +use crate::vector_select::FutureVector; +use async_trait::async_trait; +use std::collections::HashMap; +use std::fmt::Debug; +use std::hash::Hash; +use std::io; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; +use tokio::sync::{Mutex, MutexGuard, RwLock, RwLockWriteGuard}; +use tracing::{error, span, trace, Level}; + +/// Cache entry structure, wrapping the cached object. +pub(crate) struct AsyncLruCacheEntry { + /// Cached object. + /// + /// Always set during operation, only cleared when trying to unwrap the `Arc` on eviction. + value: Option>, + + /// When this entry was last accessed. + last_used: AtomicUsize, +} + +/// Least-recently-used cache with async access. 
+struct AsyncLruCacheInner< + Key: Clone + Copy + Debug + PartialEq + Eq + Hash + Send + Sync, + Value: Send + Sync, + IoBackend: AsyncLruCacheBackend, +> { + /// I/O back-end that performs loading and flushing of cache entries. + backend: IoBackend, + + /// Cache entries. + map: RwLock>>, + + /// Flush dependencies (flush these first). + flush_before: Mutex>>, + + /// Monotonically increasing counter to generate “timestamps”. + lru_timer: AtomicUsize, + + /// Upper limit of how many entries to cache. + limit: usize, +} + +/// Least-recently-used cache with async access. +/// +/// Keeps the least recently used entries up to a limited count. Accessing and flushing is +/// async-aware. +/// +/// `K` is the key used to uniquely identify cache entries, `V` is the cached data. +pub(crate) struct AsyncLruCache< + K: Clone + Copy + Debug + PartialEq + Eq + Hash + Send + Sync, + V: Send + Sync, + B: AsyncLruCacheBackend, +>(Arc>); + +/// Internal trait used to implement inter-cache flush dependencies. +#[async_trait(?Send)] +trait FlushableCache: Send + Sync { + /// Flush the cache. + async fn flush(&self) -> io::Result<()>; + + /// Check of circular dependencies. + /// + /// Return `true` if (and only if) `other` is already a transitive dependency of `self`. + async fn check_circular(&self, other: &Arc) -> bool; +} + +/// Provides loading and flushing for cache entries. +pub(crate) trait AsyncLruCacheBackend: Send + Sync { + /// Key type. + type Key: Clone + Copy + Debug + PartialEq + Eq + Hash + Send + Sync; + /// Value (object) type. + type Value: Send + Sync; + + /// Load the given object. + #[allow(async_fn_in_trait)] // No need for Send + async fn load(&self, key: Self::Key) -> io::Result; + + /// Flush the given object. + /// + /// The implementation should itself check whether the object is dirty; `flush()` is called for + /// all evicted cache entries, regardless of whether they actually are dirty or not. + #[allow(async_fn_in_trait)] // No need for Send + async fn flush(&self, key: Self::Key, value: Arc) -> io::Result<()>; +} + +impl< + K: Clone + Copy + Debug + PartialEq + Eq + Hash + Send + Sync, + V: Send + Sync, + B: AsyncLruCacheBackend, + > AsyncLruCache +{ + /// Create a new cache. + /// + /// `size` is the maximum number of entries to keep in the cache. + pub fn new(backend: B, size: usize) -> Self { + AsyncLruCache(Arc::new(AsyncLruCacheInner { + backend, + map: Default::default(), + flush_before: Default::default(), + lru_timer: AtomicUsize::new(0), + limit: size, + })) + } + + /// Retrieve an entry from the cache. + /// + /// If there is no entry yet, run `read()` to generate it. If then there are more entries in + /// the cache than its limit, flush out the oldest entry via `flush()`. + pub async fn get_or_insert(&self, key: K) -> io::Result> { + self.0.get_or_insert(key).await + } + + /// Force-insert the given object into the cache. + /// + /// If there is an existing object under that key, it is flushed first. + pub async fn insert(&self, key: K, value: Arc) -> io::Result<()> { + self.0.insert(key, value).await + } + + /// Flush all cache entries. + /// + /// Those entries are not evicted, but remain in the cache. + pub async fn flush(&self) -> io::Result<()> { + self.0.flush().await + } +} + +impl< + K: Clone + Copy + Debug + PartialEq + Eq + Hash + Send + Sync + 'static, + V: Send + Sync + 'static, + B: AsyncLruCacheBackend + 'static, + > AsyncLruCache +{ + /// Set up a flush dependency. 
+ /// + /// Ensure that before anything in this cache is flushed, `flush_before` is flushed first. + pub async fn depend_on< + K2: Clone + Copy + Debug + PartialEq + Eq + Hash + Send + Sync + 'static, + V2: Send + Sync + 'static, + B2: AsyncLruCacheBackend + 'static, + >( + &self, + other: &AsyncLruCache, + ) -> io::Result<()> { + let _span = span!( + Level::TRACE, + "AsyncLruCache::depend_on", + self = Arc::as_ptr(&self.0) as usize, + other = Arc::as_ptr(&other.0) as usize + ) + .entered(); + + let cloned: Arc> = Arc::clone(&other.0); + let cloned: Arc = cloned; + + loop { + { + let mut locked = self.0.flush_before.lock().await; + // Shouldn’t be long, so linear search seems fine + if locked.iter().any(|x| Arc::ptr_eq(x, &cloned)) { + break; + } + + let self_arc: Arc> = Arc::clone(&self.0); + let self_arc: Arc = self_arc; + if !other.0.check_circular(&self_arc).await { + trace!("No circular dependency, entering new dependency"); + locked.push(cloned); + break; + } + } + + trace!("Circular dependency detected, flushing other cache first"); + + other.0.flush().await?; + } + + Ok(()) + } +} + +impl< + K: Clone + Copy + Debug + PartialEq + Eq + Hash + Send + Sync, + V: Send + Sync, + B: AsyncLruCacheBackend, + > AsyncLruCacheInner +{ + /// Flush all dependencies. + /// + /// Flush all caches that must be flushed before this one. Remove all successfully flushed + /// caches from our dependency list. + /// + /// Call with a guard that should be dropped only after this cache is flushed, so that no new + /// dependencies can enter while we are still flushing this cache. + async fn flush_dependencies( + flush_before: &mut MutexGuard<'_, Vec>>, + ) -> io::Result<()> { + let _span = span!(Level::TRACE, "AsyncLruCache::flush_dependencies").entered(); + + while let Some(dep) = flush_before.pop() { + trace!("Flushing dependency {:?}", Arc::as_ptr(&dep) as *const _); + if let Err(err) = dep.flush().await { + flush_before.push(dep); + return Err(err); + } + } + Ok(()) + } + + /// Ensure there is at least one free entry in the cache. + /// + /// Do this by evicting (flushing) existing entries, if necessary. + async fn ensure_free_entry( + &self, + map: &mut RwLockWriteGuard<'_, HashMap>>, + ) -> io::Result<()> { + let _span = span!( + Level::TRACE, + "AsyncLruCache::ensure_free_entry", + self = &self as *const _ as usize + ) + .entered(); + + while map.len() >= self.limit { + trace!("{} / {} used", map.len(), self.limit); + + let now = self.lru_timer.load(Ordering::Relaxed); + let (evicted_object, key, last_used) = loop { + let oldest = map.iter().fold((0, None), |oldest, (key, entry)| { + // Cannot drop entries that are in use + if Arc::strong_count(entry.value()) > 1 { + return oldest; + } + + let age = now.wrapping_sub(entry.last_used.load(Ordering::Relaxed)); + if age >= oldest.0 { + (age, Some(*key)) + } else { + oldest + } + }); + + let Some(oldest_key) = oldest.1 else { + error!("Cannot evict entry from cache; everything is in use"); + return Err(io::Error::other( + "Cannot evict entry from cache; everything is in use", + )); + }; + + trace!( + "Removing entry with key {:?}, aged {}", + oldest_key, + oldest.0 + ); + + let mut oldest_entry = map.remove(&oldest_key).unwrap(); + match Arc::try_unwrap(oldest_entry.value.take().unwrap()) { + Ok(object) => { + break ( + object, + oldest_key, + oldest_entry.last_used.load(Ordering::Relaxed), + ) + } + Err(arc) => { + trace!("Entry is still in use, retrying"); + + // Found a race, retry. 
+ // (`Arc::strong_count()` should return `1` in the next iteration, + // filtering this entry out.) + oldest_entry.value = Some(arc); + } + } + }; + + let mut dep_guard = self.flush_before.lock().await; + Self::flush_dependencies(&mut dep_guard).await?; + let obj = Arc::new(evicted_object); + trace!("Flushing {key:?}"); + if let Err(err) = self.backend.flush(key, Arc::clone(&obj)).await { + map.insert( + key, + AsyncLruCacheEntry { + value: Some(obj), + last_used: last_used.into(), + }, + ); + return Err(err); + } + let _ = Arc::into_inner(obj).expect("flush() must not clone the object"); + } + + Ok(()) + } + + /// Retrieve an entry from the cache. + /// + /// If there is no entry yet, run `read()` to generate it. If then there are more entries in + /// the cache than its limit, flush out the oldest entry via `flush()`. + async fn get_or_insert(&self, key: K) -> io::Result> { + { + let map = self.map.read().await; + if let Some(entry) = map.get(&key) { + entry.last_used.store( + self.lru_timer.fetch_add(1, Ordering::Relaxed), + Ordering::Relaxed, + ); + return Ok(Arc::clone(entry.value())); + } + } + + let mut map = self.map.write().await; + if let Some(entry) = map.get(&key) { + entry.last_used.store( + self.lru_timer.fetch_add(1, Ordering::Relaxed), + Ordering::Relaxed, + ); + return Ok(Arc::clone(entry.value())); + } + + self.ensure_free_entry(&mut map).await?; + + let object = Arc::new(self.backend.load(key).await?); + + let new_entry = AsyncLruCacheEntry { + value: Some(Arc::clone(&object)), + last_used: AtomicUsize::new(self.lru_timer.fetch_add(1, Ordering::Relaxed)), + }; + map.insert(key, new_entry); + + Ok(object) + } + + /// Force-insert the given object into the cache. + /// + /// If there is an existing object under that key, it is flushed first. + async fn insert(&self, key: K, value: Arc) -> io::Result<()> { + let mut map = self.map.write().await; + if let Some(entry) = map.get_mut(&key) { + entry.last_used.store( + self.lru_timer.fetch_add(1, Ordering::Relaxed), + Ordering::Relaxed, + ); + let mut dep_guard = self.flush_before.lock().await; + Self::flush_dependencies(&mut dep_guard).await?; + self.backend.flush(key, Arc::clone(entry.value())).await?; + entry.value = Some(value); + } else { + self.ensure_free_entry(&mut map).await?; + + let new_entry = AsyncLruCacheEntry { + value: Some(value), + last_used: AtomicUsize::new(self.lru_timer.fetch_add(1, Ordering::Relaxed)), + }; + map.insert(key, new_entry); + } + + Ok(()) + } + + /// Flush all cache entries. + /// + /// Those entries are not evicted, but remain in the cache. + async fn flush(&self) -> io::Result<()> { + let _span = span!( + Level::TRACE, + "AsyncLruCache::flush", + self = &self as *const _ as usize + ) + .entered(); + + let mut futs = FutureVector::new(); + + let mut dep_guard = self.flush_before.lock().await; + Self::flush_dependencies(&mut dep_guard).await?; + + let map = self.map.read().await; + for (key, entry) in map.iter() { + let key = *key; + let object = Arc::clone(entry.value()); + trace!("Flushing {key:?}"); + futs.push(Box::pin(self.backend.flush(key, object))); + } + + futs.discarding_join().await + } +} + +impl AsyncLruCacheEntry { + /// Return the cached object. 
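+    ///
+    /// The value is only ever `None` transiently during eviction (see
+    /// `ensure_free_entry()`), after the entry has already been removed from the
+    /// map, so the `unwrap()` here is expected not to fail.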
+ fn value(&self) -> &Arc { + self.value.as_ref().unwrap() + } +} + +#[async_trait(?Send)] +impl< + K: Clone + Copy + Debug + PartialEq + Eq + Hash + Send + Sync, + V: Send + Sync, + B: AsyncLruCacheBackend, + > FlushableCache for AsyncLruCacheInner +{ + async fn flush(&self) -> io::Result<()> { + AsyncLruCacheInner::::flush(self).await + } + + async fn check_circular(&self, other: &Arc) -> bool { + let deps = self.flush_before.lock().await; + for dep in deps.iter() { + if Arc::ptr_eq(dep, other) { + return true; + } + } + false + } +} diff --git a/src/imago/src/file.rs b/src/imago/src/file.rs new file mode 100644 index 00000000..75a859e4 --- /dev/null +++ b/src/imago/src/file.rs @@ -0,0 +1,456 @@ +//! Use a plain as storage. + +use crate::io_buffers::{IoVector, IoVectorMut}; +use crate::storage::drivers::CommonStorageHelper; +use crate::{Storage, StorageOpenOptions}; +use std::fmt::{self, Display, Formatter}; +use std::fs; +use std::io::{self, Seek, SeekFrom, Write}; +#[cfg(any(target_os = "linux", target_os = "macos"))] +use std::os::fd::AsRawFd; +#[cfg(all(unix, not(target_os = "macos")))] +use std::os::unix::fs::OpenOptionsExt; +#[cfg(windows)] +use std::os::windows::fs::{FileExt, OpenOptionsExt}; +#[cfg(windows)] +use std::os::windows::io::AsRawHandle; +use std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::RwLock; +#[cfg(windows)] +use windows_sys::Win32::System::Ioctl::{FILE_ZERO_DATA_INFORMATION, FSCTL_SET_ZERO_DATA}; +#[cfg(windows)] +use windows_sys::Win32::System::IO::DeviceIoControl; + +/// Use a plain file as storage objects. +#[derive(Debug)] +pub struct File { + /// The file. + file: RwLock, + + /// Whether we are using direct I/O. + direct_io: bool, + + /// For debug purposes, and to resolve relative filenames. + filename: Option, + + /// Cached file length. + /// + /// Third parties changing the length concurrently is pretty certain to break things anyway. + size: AtomicU64, + + /// Storage helper. + common_storage_helper: CommonStorageHelper, +} + +impl TryFrom for File { + type Error = io::Error; + + /// Use the given existing `std::fs::File`. + /// + /// Convert the given existing `std::fs::File` object into an imago storage object. + /// + /// When using this, the resulting object will not know its own filename. That makes it + /// impossible to auto-resolve relative paths to it, e.g. qcow2 backing file names. + fn try_from(mut file: fs::File) -> io::Result { + let size = file.seek(SeekFrom::End(0))?; + + Ok(File { + file: RwLock::new(file), + // TODO: Find out, or better yet, drop `direct_io` and just probe the alignment. 
+ direct_io: false, + filename: None, + size: AtomicU64::new(size), + common_storage_helper: Default::default(), + }) + } +} + +impl Storage for File { + async fn open(opts: StorageOpenOptions) -> io::Result { + Self::do_open_sync(opts) + } + + #[cfg(feature = "sync-wrappers")] + fn open_sync(opts: StorageOpenOptions) -> io::Result { + Self::do_open_sync(opts) + } + + fn mem_align(&self) -> usize { + // TODO: Probe + if self.direct_io { + 4096 + } else { + 1 + } + } + + fn req_align(&self) -> usize { + // TODO: Probe + if self.direct_io { + 4096 + } else { + 1 + } + } + + fn size(&self) -> io::Result { + Ok(self.size.load(Ordering::Relaxed)) + } + + fn resolve_relative_path>(&self, relative: P) -> io::Result { + let relative = relative.as_ref(); + + if relative.is_absolute() { + return Ok(relative.to_path_buf()); + } + + let filename = self + .filename + .as_ref() + .ok_or_else(|| io::Error::other("No filename set for base image"))?; + + let dirname = filename + .parent() + .ok_or_else(|| io::Error::other("Invalid base image filename set"))?; + + Ok(dirname.join(relative)) + } + + #[cfg(unix)] + async unsafe fn pure_readv( + &self, + mut bufv: IoVectorMut<'_>, + mut offset: u64, + ) -> io::Result<()> { + while !bufv.is_empty() { + let iovec = unsafe { bufv.as_iovec() }; + let result = unsafe { + libc::preadv( + self.file.read().unwrap().as_raw_fd(), + iovec.as_ptr(), + iovec.len() as libc::c_int, + offset + .try_into() + .map_err(|_| io::Error::other("Read offset overflow"))?, + ) + }; + + let len = if result < 0 { + let err = io::Error::last_os_error(); + if err.raw_os_error() == Some(libc::EINTR) { + continue; + } + return Err(err); + } else { + result as u64 + }; + + if len == 0 { + // End of file + bufv.fill(0); + break; + } + + bufv = bufv.split_tail_at(len); + offset = offset + .checked_add(len) + .ok_or_else(|| io::Error::other("Read offset overflow"))?; + } + + Ok(()) + } + + #[cfg(windows)] + async unsafe fn pure_readv(&self, bufv: IoVectorMut<'_>, mut offset: u64) -> io::Result<()> { + for mut buffer in bufv.into_inner() { + let mut buffer: &mut [u8] = &mut buffer; + while !buffer.is_empty() { + let len = if offset >= self.size.load(Ordering::Relaxed) { + buffer.fill(0); + buffer.len() + } else { + self.file.write().unwrap().seek_read(buffer, offset)? + }; + offset = offset + .checked_add(len as u64) + .ok_or_else(|| io::Error::other("Read offset overflow"))?; + buffer = buffer.split_at_mut(len).1; + } + } + Ok(()) + } + + #[cfg(unix)] + async unsafe fn pure_writev(&self, mut bufv: IoVector<'_>, mut offset: u64) -> io::Result<()> { + while !bufv.is_empty() { + let iovec = unsafe { bufv.as_iovec() }; + let result = unsafe { + libc::pwritev( + self.file.read().unwrap().as_raw_fd(), + iovec.as_ptr(), + iovec.len() as libc::c_int, + offset + .try_into() + .map_err(|_| io::Error::other("Write offset overflow"))?, + ) + }; + + let len = if result < 0 { + let err = io::Error::last_os_error(); + if err.raw_os_error() == Some(libc::EINTR) { + continue; + } + return Err(err); + } else { + result as u64 + }; + + if result == 0 { + // Should not happen, i.e. 
is an error + return Err(io::ErrorKind::WriteZero.into()); + } + + bufv = bufv.split_tail_at(len); + offset = offset + .checked_add(len) + .ok_or_else(|| io::Error::other("Write offset overflow"))?; + self.size.fetch_max(offset, Ordering::Relaxed); + } + + Ok(()) + } + + #[cfg(windows)] + async unsafe fn pure_writev(&self, bufv: IoVector<'_>, mut offset: u64) -> io::Result<()> { + for buffer in bufv.into_inner() { + let mut buffer: &[u8] = &buffer; + while !buffer.is_empty() { + let len = self.file.write().unwrap().seek_write(buffer, offset)?; + offset = offset + .checked_add(len as u64) + .ok_or_else(|| io::Error::other("Write offset overflow"))?; + self.size.fetch_max(offset, Ordering::Relaxed); + buffer = buffer.split_at(len).1; + } + } + Ok(()) + } + + #[cfg(any(target_os = "linux", windows, target_os = "macos"))] + async unsafe fn pure_write_zeroes(&self, offset: u64, length: u64) -> io::Result<()> { + // All of our discard methods also ensure the range reads back as zeroes + unsafe { self.pure_discard(offset, length) }.await + } + + // Beware when adding new discard methods: This is called by `pure_write_zeroes()`, so the + // current expectation is that discarded ranges will read back as zeroes. If the new method + // does not guarantee that, you will need to modify `pure_write_zeroes()`. + #[cfg(target_os = "linux")] + async unsafe fn pure_discard(&self, offset: u64, length: u64) -> io::Result<()> { + if self.try_discard_by_truncate(offset, length)? { + return Ok(()); + } + + // If offset or length are too big, just skip discarding. + let Ok(offset) = libc::off_t::try_from(offset) else { + return Ok(()); + }; + let Ok(length) = libc::off_t::try_from(length) else { + return Ok(()); + }; + + let file = self.file.read().unwrap(); + // Safe: File descriptor is valid, and the rest are simple integer parameters. + let ret = unsafe { + libc::fallocate( + file.as_raw_fd(), + libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE, + offset, + length, + ) + }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + + Ok(()) + } + + // Beware when adding new discard methods: This is called by `pure_write_zeroes()`, so the + // current expectation is that discarded ranges will read back as zeroes. If the new method + // does not guarantee that, you will need to modify `pure_write_zeroes()`. + #[cfg(windows)] + async unsafe fn pure_discard(&self, offset: u64, length: u64) -> io::Result<()> { + if self.try_discard_by_truncate(offset, length)? { + return Ok(()); + } + + // If offset or length are too big, just skip discarding. + let Ok(offset) = i64::try_from(offset) else { + return Ok(()); + }; + let Ok(length) = i64::try_from(length) else { + return Ok(()); + }; + + let end = offset.saturating_add(length).saturating_add(1); + let params = FILE_ZERO_DATA_INFORMATION { + FileOffset: offset, + BeyondFinalZero: end, + }; + let mut _returned = 0; + let file = self.file.read().unwrap(); + // Safe: File handle is valid, mandatory pointers (input, returned length) are passed and + // valid, the parameter type matches the call, and the input size matches the object + // passed. 
+ let ret = unsafe { + DeviceIoControl( + file.as_raw_handle(), + FSCTL_SET_ZERO_DATA, + (¶ms as *const FILE_ZERO_DATA_INFORMATION).cast::(), + size_of_val(¶ms) as u32, + std::ptr::null_mut(), + 0, + &mut _returned, + std::ptr::null_mut(), + ) + }; + if ret == 0 { + return Err(io::Error::last_os_error()); + } + + Ok(()) + } + + // Beware when adding new discard methods: This is called by `pure_write_zeroes()`, so the + // current expectation is that discarded ranges will read back as zeroes. If the new method + // does not guarantee that, you will need to modify `pure_write_zeroes()`. + #[cfg(target_os = "macos")] + async unsafe fn pure_discard(&self, offset: u64, length: u64) -> io::Result<()> { + if self.try_discard_by_truncate(offset, length)? { + return Ok(()); + } + + // If offset or length are too big, just skip discarding. + let Ok(offset) = libc::off_t::try_from(offset) else { + return Ok(()); + }; + let Ok(length) = libc::off_t::try_from(length) else { + return Ok(()); + }; + + let params = libc::fpunchhole_t { + fp_flags: 0, + reserved: 0, + fp_offset: offset, + fp_length: length, + }; + let file = self.file.read().unwrap(); + // Safe: FD is valid, passed pointer is valid and its type matches the call. + let ret = unsafe { libc::fcntl(file.as_raw_fd(), libc::F_PUNCHHOLE, ¶ms) }; + if ret < 0 { + return Err(io::Error::last_os_error()); + } + + Ok(()) + } + + async fn flush(&self) -> io::Result<()> { + self.file.write().unwrap().flush() + } + + async fn sync(&self) -> io::Result<()> { + self.file.write().unwrap().sync_all() + } + + fn get_storage_helper(&self) -> &CommonStorageHelper { + &self.common_storage_helper + } +} + +impl File { + /// Implementation for [`File::open()`] and [`File::open_sync()`]. + fn do_open_sync(opts: StorageOpenOptions) -> io::Result { + let Some(filename) = opts.filename else { + return Err(io::Error::other("Filename required")); + }; + + let mut file_opts = fs::OpenOptions::new(); + file_opts.read(true).write(opts.writable); + #[cfg(not(target_os = "macos"))] + if opts.direct { + file_opts.custom_flags( + #[cfg(unix)] + libc::O_DIRECT, + #[cfg(windows)] + windows_sys::Win32::Storage::FileSystem::FILE_FLAG_NO_BUFFERING, + ); + } + + let filename_owned = filename.to_owned(); + let mut file = file_opts.open(filename)?; + + let size = file.seek(SeekFrom::End(0))?; + + #[cfg(target_os = "macos")] + if opts.direct { + // Safe: We check the return value. + let ret = unsafe { libc::fcntl(file.as_raw_fd(), libc::F_NOCACHE, 1) }; + if ret < 0 { + let err = io::Error::last_os_error(); + return Err(io::Error::new( + err.kind(), + format!("Failed to disable host cache: {err}"), + )); + } + } + + Ok(File { + file: RwLock::new(file), + direct_io: opts.direct, + filename: Some(filename_owned), + size: AtomicU64::new(size), + common_storage_helper: Default::default(), + }) + } + + /// Attempt to discard range by truncating the file. + /// + /// If the given range is at the end of the file, discard it by simply truncating the file. + /// Return `true` on success. + /// + /// If the range is not at the end of the file, i.e. another method of discarding is needed, + /// return `false`. 
+ fn try_discard_by_truncate(&self, offset: u64, length: u64) -> io::Result { + // Prevent modifications to the file length + #[allow(clippy::readonly_write_lock)] + let file = self.file.write().unwrap(); + + let size = self.size.load(Ordering::Relaxed); + if offset >= size { + // Nothing to do + return Ok(true); + } + + // If `offset + length` overflows, we can just assume it ends at `size`. (Anything past + // `size is irrelevant anyway.) + let end = offset.checked_add(length).unwrap_or(size); + if end < size { + return Ok(false); + } + + file.set_len(offset)?; + Ok(true) + } +} + +impl Display for File { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + if let Some(filename) = self.filename.as_ref() { + write!(f, "file:{filename:?}") + } else { + write!(f, "file:") + } + } +} diff --git a/src/imago/src/format/access.rs b/src/imago/src/format/access.rs new file mode 100644 index 00000000..5d16ab36 --- /dev/null +++ b/src/imago/src/format/access.rs @@ -0,0 +1,438 @@ +//! Actual public image access functionality. +//! +//! Provides access to different image formats via `FormatAccess` objects. + +use super::drivers::{self, FormatDriverInstance}; +use crate::io_buffers::{IoVector, IoVectorMut}; +use crate::vector_select::FutureVector; +use crate::{Storage, StorageExt}; +use std::fmt::{self, Display, Formatter}; +use std::{cmp, io, ptr}; + +/// Provides access to a disk image. +#[derive(Debug)] +pub struct FormatAccess { + /// Image format driver. + inner: Box>, + + /// Whether this image may be modified. + writable: bool, + + /// How many asynchronous requests to perform per read request in parallel. + read_parallelization: usize, + + /// How many asynchronous requests to perform per write request in parallel. + write_parallelization: usize, +} + +/// Fully recursive mapping information. +/// +/// Mapping information that resolves down to the storage object layer (except for special data). +#[derive(Debug)] +pub enum Mapping<'a, S: Storage> { + /// Raw data. + Raw { + /// Storage object where this data is stored. + storage: &'a S, + + /// Offset in `storage` where this data is stored. + offset: u64, + + /// Whether this mapping may be written to. + /// + /// If `true`, you can directly write to `offset` on `storage` to change the disk image’s + /// data accordingly. + /// + /// If `false`, the disk image format does not allow writing to `offset` on `storage`; a + /// new mapping must be allocated first. + writable: bool, + }, + + /// Range is to be read as zeroes. + Zero, + + /// End of file reached. + /// + /// The accompanying length is always 0. + Eof, + + /// Data is encoded in some manner, e.g. compressed or encrypted. + /// + /// Such data cannot be accessed directly, but must be interpreted by the image format driver. + Special { + /// Format layer where this special data was encountered. + layer: &'a FormatAccess, + + /// Original (“guest”) offset on `layer` to pass to `readv_special()`. + offset: u64, + }, +} + +// When adding new public methods, don’t forget to add them to sync_wrappers, too. +impl FormatAccess { + /// Wrap a format driver instance in `FormatAccess`. + /// + /// `FormatAccess` provides I/O access to disk images, based on the functionality offered by + /// the individual format drivers via `FormatDriverInstance`. + pub fn new + 'static>(inner: D) -> Self { + let writable = inner.writable(); + FormatAccess { + inner: Box::new(inner), + read_parallelization: 1, + write_parallelization: 1, + writable, + } + } + + /// Return the disk size in bytes. 
+ pub fn size(&self) -> u64 { + self.inner.size() + } + + /// Set the number of simultaneous async requests per read. + /// + /// When issuing read requests, issue this many async requests in parallel (still in a single + /// thread). The default count is `1`, i.e. no parallel requests. + pub fn set_async_read_parallelization(&mut self, count: usize) { + self.read_parallelization = count; + } + + /// Set the number of simultaneous async requests per write. + /// + /// When issuing write requests, issue this many async requests in parallel (still in a single + /// thread). The default count is `1`, i.e. no parallel requests. + pub fn set_async_write_parallelization(&mut self, count: usize) { + self.write_parallelization = count; + } + + /// Return all storage dependencies of this image. + /// + /// Includes recursive dependencies, i.e. those from other image dependencies like backing + /// images. + pub(crate) fn collect_storage_dependencies(&self) -> Vec<&S> { + self.inner.collect_storage_dependencies() + } + + /// Minimal I/O alignment, for both length and offset. + /// + /// All requests to this image should be aligned to this value, both in length and offset. + /// + /// Requests that do not match this alignment will be realigned internally, which requires + /// creating bounce buffers and read-modify-write cycles for write requests, which is costly, + /// so should be avoided. + pub fn req_align(&self) -> usize { + self.inner + .collect_storage_dependencies() + .into_iter() + .fold(1, |max, s| cmp::max(max, s.req_align())) + } + + /// Minimal memory buffer alignment, for both address and length. + /// + /// All buffers used in requests to this image should be aligned to this value, both their + /// address and length. + /// + /// Request buffers that do not match this alignment will be realigned internally, which + /// requires creating bounce buffers, which is costly, so should be avoided. + pub fn mem_align(&self) -> usize { + self.inner + .collect_storage_dependencies() + .into_iter() + .fold(1, |max, s| cmp::max(max, s.mem_align())) + } + + /// Read the data from the given mapping. + async fn read_chunk( + &self, + mut bufv: IoVectorMut<'_>, + mapping: Mapping<'_, S>, + ) -> io::Result<()> { + match mapping { + Mapping::Raw { + storage, + offset, + writable: _, + } => storage.readv(bufv, offset).await, + + Mapping::Zero | Mapping::Eof => { + bufv.fill(0); + Ok(()) + } + + // FIXME: TOCTTOU problem. Not sure how to fully fix it, if possible at all. + // (Concurrent writes can change the mapping, but the driver will have to reload the + // mapping because it cannot pass it in `NonRecursiveMapping::Special`. It may then + // find that this is no longer a “special” range. Even passing the low-level mapping + // information in `Mapping::Special` wouldn’t fully fix it, though: If concurrent + // writes change the low-level cluster type, and the driver then tries to e.g. + // decompress the data that was there, that may well fail.) + Mapping::Special { layer, offset } => layer.inner.readv_special(bufv, offset).await, + } + } + + /// Return the mapping at `offset`. + /// + /// Find what `offset` is mapped to, return that mapping information, and the length of that + /// continuous mapping (from `offset`). 
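+    ///
+    /// An illustrative sketch (mirrors the doc example in the `annotated` module;
+    /// assumes a `Null`-backed raw image purely for demonstration):
+    ///
+    /// ```no_run
+    /// # use imago::FormatAccess;
+    /// # use imago::null::Null;
+    /// # use imago::raw::Raw;
+    /// # tokio::runtime::Builder::new_current_thread()
+    /// #     .build()
+    /// #     .unwrap()
+    /// #     .block_on(async move {
+    /// let image = FormatAccess::new(Raw::open_image(Null::new(16 << 30), false).await?);
+    /// let (mapping, length) = image.get_mapping(0, 512).await?;
+    /// assert!(length > 0 && !mapping.is_eof());
+    /// # Ok::<(), std::io::Error>(())
+    /// # }).unwrap()
+    /// ```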
+ pub async fn get_mapping( + &self, + mut offset: u64, + mut max_length: u64, + ) -> io::Result<(Mapping<'_, S>, u64)> { + let mut format_layer = self; + let mut writable_gate = true; + + loop { + let (mapping, length) = format_layer.inner.get_mapping(offset, max_length).await?; + let length = std::cmp::min(length, max_length); + + match mapping { + drivers::Mapping::Raw { + storage, + offset, + writable, + } => { + return Ok(( + Mapping::Raw { + storage, + offset, + writable: writable && writable_gate, + }, + length, + )) + } + + drivers::Mapping::Indirect { + layer: recurse_layer, + offset: recurse_offset, + writable: recurse_writable, + } => { + format_layer = recurse_layer; + offset = recurse_offset; + writable_gate = recurse_writable; + max_length = length; + } + + drivers::Mapping::Zero => return Ok((Mapping::Zero, length)), + + drivers::Mapping::Eof => { + // Return EOF only on top layer, zero otherwise + return if ptr::eq(format_layer, self) { + Ok((Mapping::Eof, 0)) + } else { + Ok((Mapping::Zero, max_length)) + }; + } + + drivers::Mapping::Special { offset } => { + return Ok(( + Mapping::Special { + layer: format_layer, + offset, + }, + length, + )); + } + } + } + } + + /// Create a raw data mapping at `offset`. + /// + /// Ensure that `offset` is directly mapped to some storage object, up to a length of `length`. + /// Return the storage object, the corresponding offset there, and the continuous length that + /// we were able to map (less than or equal to `length`). + /// + /// If `overwrite` is true, the contents in the range are supposed to be overwritten and may be + /// discarded. Otherwise, they are kept. + pub async fn ensure_data_mapping( + &self, + offset: u64, + length: u64, + overwrite: bool, + ) -> io::Result<(&S, u64, u64)> { + let (storage, mapped_offset, mapped_length) = self + .inner + .ensure_data_mapping(offset, length, overwrite) + .await?; + let mapped_length = cmp::min(length, mapped_length); + assert!(mapped_length > 0); + Ok((storage, mapped_offset, mapped_length)) + } + + /// Read data at `offset` into `bufv`. + /// + /// Reads until `bufv` is filled completely, i.e. will not do short reads. When reaching the + /// end of file, the rest of `bufv` is filled with 0. + pub async fn readv(&self, mut bufv: IoVectorMut<'_>, mut offset: u64) -> io::Result<()> { + let mut workers = (self.read_parallelization > 1).then(FutureVector::new); + + while !bufv.is_empty() { + let (mapping, chunk_length) = self.get_mapping(offset, bufv.len()).await?; + if chunk_length == 0 { + assert!(mapping.is_eof()); + bufv.fill(0); + break; + } + + if let Some(workers) = workers.as_mut() { + while workers.len() >= self.read_parallelization { + workers.select().await?; + } + } + + let (chunk, remainder) = bufv.split_at(chunk_length); + bufv = remainder; + offset += chunk_length; + + if let Some(workers) = workers.as_mut() { + workers.push(Box::pin(self.read_chunk(chunk, mapping))); + } else { + self.read_chunk(chunk, mapping).await?; + } + } + + if let Some(mut workers) = workers { + workers.discarding_join().await?; + } + + Ok(()) + } + + /// Read data at `offset` into `buf`. + /// + /// Reads until `buf` is filled completely, i.e. will not do short reads. When reaching the + /// end of file, the rest of `buf` is filled with 0. + pub async fn read(&self, buf: impl Into>, offset: u64) -> io::Result<()> { + self.readv(buf.into(), offset).await + } + + /// Write data from `bufv` to `offset`. + /// + /// Writes all data from `bufv` (or returns an error), i.e. will not do short writes. 
Reaching + /// the end of file before the end of the buffer results in an error. + pub async fn writev(&self, mut bufv: IoVector<'_>, mut offset: u64) -> io::Result<()> { + if !self.writable { + return Err(io::Error::other("Image is read-only")); + } + + // Limit to disk size + let disk_size = self.inner.size(); + if offset >= disk_size { + return Ok(()); + } + if bufv.len() > disk_size - offset { + bufv = bufv.split_at(disk_size - offset).0; + } + + let mut workers = (self.write_parallelization > 1).then(FutureVector::new); + + while !bufv.is_empty() { + let (storage, st_offset, st_length) = + self.ensure_data_mapping(offset, bufv.len(), true).await?; + + if let Some(workers) = workers.as_mut() { + while workers.len() >= self.write_parallelization { + workers.select().await?; + } + } + + let (chunk, remainder) = bufv.split_at(st_length); + bufv = remainder; + offset += st_length; + + if let Some(workers) = workers.as_mut() { + workers.push(Box::pin(storage.writev(chunk, st_offset))); + } else { + storage.writev(chunk, st_offset).await?; + } + } + + if let Some(mut workers) = workers { + workers.discarding_join().await?; + } + + Ok(()) + } + + /// Write data from `buf` to `offset`. + /// + /// Writes all data from `bufv` (or returns an error), i.e. will not do short writes. Reaching + /// the end of file before the end of the buffer results in an error. + pub async fn write(&self, buf: impl Into>, offset: u64) -> io::Result<()> { + self.writev(buf.into(), offset).await + } + + /// Flush internal buffers. Always call this before drop! + /// + /// Does not necessarily sync those buffers to disk. When using `flush()`, consider whether + /// you want to call `sync()` afterwards. + /// + /// Because of the current lack of stable `async_drop`, you must manually call this before + /// dropping a `FormatAccess` instance! (Not necessarily for read-only images, though.) + #[allow(async_fn_in_trait)] // No need for Send + pub async fn flush(&self) -> io::Result<()> { + self.inner.flush().await + } + + /// Sync data already written to the storage hardware. + /// + /// This does not necessarily include flushing internal buffers, i.e. `flush`. When using + /// `sync()`, consider whether you want to call `flush()` before it. + #[allow(async_fn_in_trait)] // No need for Send + pub async fn sync(&self) -> io::Result<()> { + self.inner.sync().await + } +} + +impl Mapping<'_, S> { + /// Return `true` if and only if this mapping signifies the end of file. 
+ pub fn is_eof(&self) -> bool { + matches!(self, Mapping::Eof) + } +} + +impl Display for FormatAccess { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + self.inner.fmt(f) + } +} + +impl Display for Mapping<'_, S> { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + Mapping::Raw { + storage, + offset, + writable, + } => { + let writable = if *writable { "rw" } else { "ro" }; + write!(f, "{storage}:0x{offset:x}/{writable}") + } + + Mapping::Zero => write!(f, ""), + + Mapping::Eof => write!(f, ""), + + Mapping::Special { layer, offset } => { + write!(f, "") + } + } + } +} + +/* +#[cfg(feature = "async-drop")] +impl std::future::AsyncDrop for FormatAccess { + type Dropper<'a> = std::pin::Pin + 'a>> where S: 'a; + + fn async_drop(self: std::pin::Pin<&mut Self>) -> Self::Dropper<'_> { + Box::pin(async move { + if let Err(err) = self.flush().await { + let inner = &self.inner; + tracing::error!("Failed to flush {inner}: {err}"); + } + }) + } +} +*/ diff --git a/src/imago/src/format/drivers.rs b/src/imago/src/format/drivers.rs new file mode 100644 index 00000000..2aa27994 --- /dev/null +++ b/src/imago/src/format/drivers.rs @@ -0,0 +1,141 @@ +//! Internal image format driver interface. +//! +//! Provides the internal interface for image format drivers to provide their services, on which +//! the publically visible interface [`FormatAccess`] is built. + +use crate::io_buffers::IoVectorMut; +use crate::{FormatAccess, Storage}; +use async_trait::async_trait; +use std::fmt::{Debug, Display}; +use std::io; + +/// Implementation of a disk image format. +#[async_trait(?Send)] +pub trait FormatDriverInstance: Debug + Display + Send + Sync { + /// Type of storage used. + type Storage: Storage; + + /// Size of the disk represented by this image. + fn size(&self) -> u64; + + /// Recursively collect all storage objects associated with this image. + /// + /// “Recursive” means to recurse to other images like e.g. a backing file. + fn collect_storage_dependencies(&self) -> Vec<&Self::Storage>; + + /// Return whether this image may be modified. + /// + /// This state must not change via interior mutability, i.e. as long as this FDI is wrapped in + /// a `FormatAccess`, its writability must remain constant. + fn writable(&self) -> bool; + + /// Return the mapping at `offset`. + /// + /// Find what `offset` is mapped to, return that mapping information, and the length of that + /// continuous mapping (from `offset`). + /// + /// To determine that continuous mapping length, drivers should not perform additional I/O + /// beyond what is necessary to get mapping information for `offset` itself. + /// + /// `max_length` is a hint how long of a range is required at all, but the returned length may + /// exceed that value if that simplifies the implementation. + /// + /// The returned length must only be 0 if `Mapping::Eof` is returned. + async fn get_mapping<'a>( + &'a self, + offset: u64, + max_length: u64, + ) -> io::Result<(Mapping<'a, Self::Storage>, u64)>; + + /// Ensure that `offset` is directly mapped to some storage object, up to a length of `length`. + /// + /// Return the storage object, the corresponding offset there, and the continuous length that + /// the driver was able to map (less than or equal to `length`). + /// + /// If the returned length is less than `length`, drivers can expect subsequent calls to + /// allocate the rest of the original range. Therefore, if a driver knows in advance that it + /// is impossible to fully map the given range (e.g. 
because it lies partially or fully beyond + /// the end of the disk), it should return an error immediately. + /// + /// If `overwrite` is true, the contents in the range are supposed to be overwritten and may be + /// discarded. Otherwise, they must be kept. + async fn ensure_data_mapping<'a>( + &'a self, + offset: u64, + length: u64, + overwrite: bool, + ) -> io::Result<(&'a Self::Storage, u64, u64)>; + + /// Read data from a `Mapping::Special` area. + async fn readv_special(&self, _bufv: IoVectorMut<'_>, _offset: u64) -> io::Result<()> { + Err(io::ErrorKind::Unsupported.into()) + } + + /// Flush internal buffers. + /// + /// Does not need to ensure those buffers are synced to disk (hardware). + async fn flush(&self) -> io::Result<()>; + + /// Sync data already written to the storage hardware. + /// + /// Does not need to ensure internal buffers are written, i.e. should generally just be passed + /// through to `Storage::sync()` for all underlying storage objects. + async fn sync(&self) -> io::Result<()>; +} + +/// Non-recursive mapping information. +/// +/// Mapping information as returned by `FormatDriverInstance::get_mapping()`, only looking at that +/// format layer’s information. +#[derive(Debug)] +pub enum Mapping<'a, S: Storage> { + /// Raw data. + Raw { + /// Storage object where this data is stored. + storage: &'a S, + + /// Offset in `storage` where this data is stored. + offset: u64, + + /// Whether this mapping may be written to. + /// + /// If `true`, you can directly write to `offset` on `storage` to change the disk image’s + /// data accordingly. + /// + /// If `false`, the disk image format does not allow writing to `offset` on `storage`; a + /// new mapping must be allocated first. + writable: bool, + }, + + /// Data lives in a different disk image (e.g. a backing file). + Indirect { + /// Format instance where this data can be obtained. + layer: &'a FormatAccess, + + /// Offset in `layer` where this data can be obtained. + offset: u64, + + /// Whether this mapping may be written to. + /// + /// If `true`, you can directly write to `offset` on `layer` to change the disk image’s + /// data accordingly. + /// + /// If `false`, the disk image format does not allow writing to `offset` on `layer`; a new + /// mapping must be allocated first. + writable: bool, + }, + + /// Range is to be read as zeroes. + Zero, + + /// End of file reached. + Eof, + + /// Data is encoded in some manner, e.g. compressed or encrypted. + /// + /// Such data cannot be accessed directly, but must be interpreted by the image format driver. + Special { + /// Original (“guest”) offset to pass to `FormatDriverInstance::readv_special()`. + offset: u64, + }, +} diff --git a/src/imago/src/format/mod.rs b/src/imago/src/format/mod.rs new file mode 100644 index 00000000..a863ca19 --- /dev/null +++ b/src/imago/src/format/mod.rs @@ -0,0 +1,9 @@ +//! Core functionality. +//! +//! Provides access to different image formats via `FormatAccess` objects. + +pub mod access; +pub mod drivers; +#[cfg(feature = "sync-wrappers")] +pub mod sync_wrappers; +pub mod wrapped; diff --git a/src/imago/src/format/sync_wrappers.rs b/src/imago/src/format/sync_wrappers.rs new file mode 100644 index 00000000..2f5f4e9b --- /dev/null +++ b/src/imago/src/format/sync_wrappers.rs @@ -0,0 +1,186 @@ +//! Synchronous wrapper around [`FormatAccess`]. 
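+//!
+//! A minimal usage sketch (requires the `sync-wrappers` feature; mirrors the
+//! first README example and assumes an existing `image.qcow2` file):
+//!
+//! ```no_run
+//! # use imago::file::File;
+//! # use imago::qcow2::Qcow2;
+//! # use imago::SyncFormatAccess;
+//! # fn main() -> std::io::Result<()> {
+//! let mut qcow2 = Qcow2::<File>::open_path_sync("image.qcow2", false)?;
+//! qcow2.open_implicit_dependencies_sync()?;
+//! let qcow2 = SyncFormatAccess::new(qcow2)?;
+//!
+//! let mut buf = vec![0u8; 512];
+//! qcow2.read(&mut buf, 0)?;
+//! # Ok(())
+//! # }
+//! ```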
+ +use super::drivers::FormatDriverInstance; +use crate::io_buffers::{IoVector, IoVectorMut}; +use crate::{FormatAccess, Mapping, Storage}; +use std::io; + +/// Synchronous wrapper around [`FormatAccess`]. +/// +/// Creates and keeps a tokio runtime in which to run I/O. +pub struct SyncFormatAccess { + /// Wrapped asynchronous [`FormatAccess`]. + inner: FormatAccess, + + /// Tokio runtime in which I/O is run. + runtime: tokio::runtime::Runtime, +} + +impl SyncFormatAccess { + /// Like [`FormatAccess::new()`], but create a synchronous wrapper. + pub fn new + 'static>(inner: D) -> io::Result { + FormatAccess::new(inner).try_into() + } + + /// Get a reference to the contained async [`FormatAccess`] object. + pub fn inner(&self) -> &FormatAccess { + &self.inner + } + + /// Return the disk size in bytes. + pub fn size(&self) -> u64 { + self.inner.size() + } + + /// Set the number of simultaneous async requests per read. + /// + /// When issuing read requests, issue this many async requests in parallel (still in a single + /// thread). The default count is `1`, i.e. no parallel requests. + /// + /// Note that inside of this synchronous wrapper, we still run async functions, so this setting + /// is valid even for [`SyncFormatAccess`]. + pub fn set_async_read_parallelization(&mut self, count: usize) { + self.inner.set_async_read_parallelization(count) + } + + /// Set the number of simultaneous async requests per write. + /// + /// When issuing write requests, issue this many async requests in parallel (still in a single + /// thread). The default count is `1`, i.e. no parallel requests. + /// + /// Note that inside of this synchronous wrapper, we still run async functions, so this setting + /// is valid even for [`SyncFormatAccess`]. + pub fn set_async_write_parallelization(&mut self, count: usize) { + self.inner.set_async_write_parallelization(count) + } + + /// Minimal I/O alignment, for both length and offset. + /// + /// All requests to this image should be aligned to this value, both in length and offset. + /// + /// Requests that do not match this alignment will be realigned internally, which requires + /// creating bounce buffers and read-modify-write cycles for write requests, which is costly, + /// so should be avoided. + pub fn req_align(&self) -> usize { + self.inner.req_align() + } + + /// Minimal memory buffer alignment, for both address and length. + /// + /// All buffers used in requests to this image should be aligned to this value, both their + /// address and length. + /// + /// Request buffers that do not match this alignment will be realigned internally, which + /// requires creating bounce buffers, which is costly, so should be avoided. + pub fn mem_align(&self) -> usize { + self.inner.mem_align() + } + + /// Return the mapping at `offset`. + /// + /// Find what `offset` is mapped to, return that mapping information, and the length of that + /// continuous mapping (from `offset`). + pub fn get_mapping_sync( + &self, + offset: u64, + max_length: u64, + ) -> io::Result<(Mapping<'_, S>, u64)> { + self.runtime + .block_on(self.inner.get_mapping(offset, max_length)) + } + + /// Create a raw data mapping at `offset`. + /// + /// Ensure that `offset` is directly mapped to some storage object, up to a length of `length`. + /// Return the storage object, the corresponding offset there, and the continuous length that + /// we were able to map (less than or equal to `length`). 
+ /// + /// If `overwrite` is true, the contents in the range are supposed to be overwritten and may be + /// discarded. Otherwise, they are kept. + pub fn ensure_data_mapping( + &self, + offset: u64, + length: u64, + overwrite: bool, + ) -> io::Result<(&S, u64, u64)> { + self.runtime + .block_on(self.inner.ensure_data_mapping(offset, length, overwrite)) + } + + /// Read data at `offset` into `bufv`. + /// + /// Reads until `bufv` is filled completely, i.e. will not do short reads. When reaching the + /// end of file, the rest of `bufv` is filled with 0. + pub fn readv(&self, bufv: IoVectorMut<'_>, offset: u64) -> io::Result<()> { + self.runtime.block_on(self.inner.readv(bufv, offset)) + } + + /// Read data at `offset` into `buf`. + /// + /// Reads until `buf` is filled completely, i.e. will not do short reads. When reaching the + /// end of file, the rest of `buf` is filled with 0. + pub fn read<'a>(&'a self, buf: impl Into>, offset: u64) -> io::Result<()> { + self.readv(buf.into(), offset) + } + + /// Write data from `bufv` to `offset`. + /// + /// Writes all data from `bufv` (or returns an error), i.e. will not do short writes. Reaching + /// the end of file before the end of the buffer results in an error. + pub fn writev(&self, bufv: IoVector<'_>, offset: u64) -> io::Result<()> { + self.runtime.block_on(self.inner.writev(bufv, offset)) + } + + /// Write data from `buf` to `offset`. + /// + /// Writes all data from `bufv` (or returns an error), i.e. will not do short writes. Reaching + /// the end of file before the end of the buffer results in an error. + pub fn write<'a>(&'a self, buf: impl Into>, offset: u64) -> io::Result<()> { + self.writev(buf.into(), offset) + } + + /// Flush internal buffers. + /// + /// Does not necessarily sync those buffers to disk. When using `flush()`, consider whether + /// you want to call `sync()` afterwards. + pub fn flush(&self) -> io::Result<()> { + self.runtime.block_on(self.inner.flush()) + } + + /// Sync data already written to the storage hardware. + /// + /// This does not necessarily include flushing internal buffers, i.e. `flush`. When using + /// `sync()`, consider whether you want to call `flush()` before it. + pub fn sync(&self) -> io::Result<()> { + self.runtime.block_on(self.inner.sync()) + } +} + +impl TryFrom> for SyncFormatAccess { + type Error = io::Error; + + fn try_from(async_access: FormatAccess) -> io::Result { + let runtime = tokio::runtime::Builder::new_current_thread() + .build() + .map_err(|err| { + io::Error::other(format!( + "Failed to create a tokio runtime for synchronous image access: {err}" + )) + })?; + + Ok(SyncFormatAccess { + inner: async_access, + runtime, + }) + } +} + +// #[cfg(not(feature = "async-drop"))] +impl Drop for SyncFormatAccess { + fn drop(&mut self) { + if let Err(err) = self.flush() { + let inner = &self.inner; + tracing::error!("Failed to flush {inner}: {err}"); + } + } +} diff --git a/src/imago/src/format/wrapped.rs b/src/imago/src/format/wrapped.rs new file mode 100644 index 00000000..6031c1c0 --- /dev/null +++ b/src/imago/src/format/wrapped.rs @@ -0,0 +1,59 @@ +//! Allows using [`FormatAccess`] in containers. +//! +//! Users may want to wrap [`FormatAccess`] objects e.g. in `Arc` and then assign them as +//! dependencies to other objects (e.g. as a backing image). The [`WrappedFormat`] trait provided +//! here allows images to use other images (`FormatAccess` objects) regardless of whether they are +//! wrapped in such containers or not. 
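+//!
+//! A small illustrative sketch (assumptions: the trait takes the storage type as a generic
+//! parameter, and `FormatAccess::size()` is available as used elsewhere in this crate):
+//! ```no_run
+//! use imago::format::wrapped::WrappedFormat;
+//! use imago::{FormatAccess, Storage};
+//!
+//! // Works for a bare `FormatAccess` as well as for wrapped forms such as
+//! // `Arc<FormatAccess<_>>`, because all of them implement `WrappedFormat`.
+//! fn disk_size<S: Storage, W: WrappedFormat<S>>(image: &W) -> u64 {
+//!     image.unwrap().size()
+//! }
+//! ```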
+ +use crate::{FormatAccess, Storage}; +use std::fmt::{Debug, Display}; +use std::ops::Deref; +use std::sync::Arc; +use tokio::sync::{OwnedRwLockReadGuard, RwLock}; + +/// Represents [`FormatAccess`] wrapped in e.g. `Arc`, `Box`, or nothing at all. +/// +/// This struct is necessary so that we can reference format instances regardless of whether the +/// user decides to wrap them or not. +pub trait WrappedFormat: Debug + Display + Send + Sync { + /// Construct this `WrappedFormat`. + fn wrap(inner: FormatAccess) -> Self; + + /// Access the inner format instance. + fn unwrap(&self) -> &FormatAccess; +} + +impl< + S: Storage, + D: Deref> + Debug + Display + From> + Send + Sync, + > WrappedFormat for D +{ + fn wrap(inner: FormatAccess) -> Self { + Self::from(inner) + } + + fn unwrap(&self) -> &FormatAccess { + self.deref() + } +} + +impl WrappedFormat for FormatAccess { + fn wrap(inner: FormatAccess) -> Self { + inner + } + + fn unwrap(&self) -> &FormatAccess { + self + } +} + +impl WrappedFormat for OwnedRwLockReadGuard> { + fn wrap(inner: FormatAccess) -> Self { + // Ugly, but works. + Arc::new(RwLock::new(inner)).try_read_owned().unwrap() + } + + fn unwrap(&self) -> &FormatAccess { + self.deref() + } +} diff --git a/src/imago/src/io_buffers.rs b/src/imago/src/io_buffers.rs new file mode 100644 index 00000000..ff8fb7e6 --- /dev/null +++ b/src/imago/src/io_buffers.rs @@ -0,0 +1,1118 @@ +//! Types for I/O buffers. +//! +//! This module provides: +//! - buffer types that can be allocated with arbitrary alignment, +//! - references to buffers that more or less ensure the content is read only once (because it can +//! change for buffers owned by VM guests), +//! - buffer vector types. + +use crate::macros::passthrough_trait_fn; +#[cfg(feature = "vm-memory")] +use crate::misc_helpers::ImagoAsRef; +use std::alloc::{self, GlobalAlloc}; +use std::fmt::{self, Debug, Formatter}; +use std::io::{IoSlice, IoSliceMut}; +use std::marker::PhantomData; +#[cfg(unix)] +use std::mem; +use std::mem::{size_of, size_of_val}; +use std::ops::Range; +use std::{cmp, io, ptr, slice}; + +/// Owned memory buffer. +pub struct IoBuffer { + /// Raw pointer to the start of the buffer. + pointer: *mut u8, + + /// Size in bytes. + size: usize, + + /// Allocation layout. `None` only for null buffers. + layout: Option, +} + +/// Reference to any immutable memory buffer. +pub struct IoBufferRef<'a> { + /// Raw pointer to the start of the buffer. + pointer: *const u8, + + /// Size in bytes. + size: usize, + + /// Lifetime marker. + _lifetime: PhantomData<&'a [u8]>, +} + +/// Reference to any mutable memory buffer. +pub struct IoBufferMut<'a> { + /// Raw pointer to the start of the buffer. + pointer: *mut u8, + + /// Size in bytes. + size: usize, + + /// Lifetime marker. + _lifetime: PhantomData<&'a mut [u8]>, +} + +// Blocked because of the pointer, but we want this to be usable across threads +unsafe impl Send for IoBuffer {} +unsafe impl Sync for IoBuffer {} +unsafe impl Send for IoBufferRef<'_> {} +unsafe impl Sync for IoBufferRef<'_> {} +unsafe impl Send for IoBufferMut<'_> {} +unsafe impl Sync for IoBufferMut<'_> {} + +impl IoBuffer { + /// Create a new owned buffer, containing uninitialized data. + /// + /// Do note that the returned buffer contains uninitialized data, which however is perfectly + /// fine for an I/O buffer. 
+ pub fn new(size: usize, alignment: usize) -> io::Result { + let layout = alloc::Layout::from_size_align(size, alignment).map_err(io::Error::other)?; + Self::new_with_layout(layout) + } + + /// Create a new owned buffer, containing uninitialized data, with the given `layout`. + pub fn new_with_layout(layout: alloc::Layout) -> io::Result { + if layout.size() == 0 { + return Ok(IoBuffer { + pointer: ptr::null_mut(), + size: 0, + layout: None, + }); + } + + // We guarantee the size not to be 0 and do not care about the memory being uninitialized, + // so this is safe + let pointer = unsafe { alloc::System.alloc(layout) }; + + if pointer.is_null() { + return Err(io::Error::new( + io::ErrorKind::OutOfMemory, + format!( + "Failed to allocate memory (size={}, alignment={})", + layout.size(), + layout.align(), + ), + )); + } + + Ok(IoBuffer { + pointer, + size: layout.size(), + layout: Some(layout), + }) + } + + /// Length in bytes. + pub fn len(&self) -> usize { + self.size + } + + /// Whether this is a null buffer (length is 0). + pub fn is_empty(&self) -> bool { + self.size == 0 + } + + /// Generate an immutable reference. + pub fn as_ref(&self) -> IoBufferRef<'_> { + IoBufferRef { + pointer: self.pointer as *const u8, + size: self.size, + _lifetime: PhantomData, + } + } + + /// Generate an immutable reference to a sub-range. + pub fn as_ref_range(&self, range: Range) -> IoBufferRef<'_> { + IoBufferRef::from_slice(&self.as_ref().into_slice()[range]) + } + + /// Generate a mutable reference. + pub fn as_mut(&mut self) -> IoBufferMut<'_> { + IoBufferMut { + pointer: self.pointer, + size: self.size, + _lifetime: PhantomData, + } + } + + /// Generate a mutable reference to a sub-range. + pub fn as_mut_range(&mut self, range: Range) -> IoBufferMut<'_> { + (&mut self.as_mut().into_slice()[range]).into() + } +} + +impl Drop for IoBuffer { + /// Free this buffer. + fn drop(&mut self) { + if let Some(layout) = self.layout { + // Safe because we have allocated this buffer using `alloc::System` + unsafe { + alloc::System.dealloc(self.pointer, layout); + } + } + } +} + +/// Common functions for both `IoBufferRef` and `IoBufferMut`. +pub trait IoBufferRefTrait<'a>: Sized { + /// `&[T]` or `&mut [T]`. + type SliceType; + + /// `*const T` or `*mut T`. + type PointerType; + + /// Create a reference to a slice. + fn from_slice(slice: Self::SliceType) -> Self; + + /// Create an owned [`IoBuffer`] with the same data (copied). + fn try_into_owned(self, alignment: usize) -> io::Result; + + /// Size in bytes. + fn len(&self) -> usize; + + /// Whether the length is 0. + fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Return the pointer to the start of the buffer. + fn as_ptr(&self) -> Self::PointerType; + + /// Turn this reference into a slice. + /// + /// References to `IoBuffer`s must not be copied/cloned (so they can only be accessed once; + /// they are considered volatile due to potential VM guest accesses), so this consumes the + /// object. + fn into_slice(self) -> Self::SliceType { + // Alignment requirement is always met, resulting data is pure binary data + unsafe { self.into_typed_slice::() } + } + + /// Turn this reference into a slice with the given element type. + /// + /// # Safety + /// Caller must ensure that alignment and length requirements are met and that the resulting + /// data is valid. + unsafe fn into_typed_slice(self) -> Self::SliceType; + + /// Split the buffer at `mid`. + /// + /// Return `&self[..mid]` and `&self[mid..]`. 
+ /// + /// If `mid > self.len()`, return `&self[..]` and `[]`. + fn split_at(self, mid: usize) -> (Self, Self); + + /// Make this reference immutable. + fn into_ref(self) -> IoBufferRef<'a>; +} + +impl<'a> IoBufferRef<'a> { + /// Create a reference to a slice. + pub fn from_slice(slice: &'a [u8]) -> Self { + IoBufferRef { + pointer: slice.as_ptr(), + size: size_of_val(slice), + _lifetime: PhantomData, + } + } + + /// Create an owned [`IoBuffer`] with the same data (copied). + pub fn try_into_owned(self, alignment: usize) -> io::Result { + let mut new_buf = IoBuffer::new(self.len(), alignment)?; + new_buf + .as_mut() + .into_slice() + .copy_from_slice(self.into_slice()); + Ok(new_buf) + } + + /// Size in bytes. + pub fn len(&self) -> usize { + self.size + } + + /// Whether the length is 0. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Return the pointer to the start of the buffer. + pub fn as_ptr(&self) -> *const u8 { + self.pointer + } + + /// Turn this reference into a slice. + /// + /// References to `IoBuffer`s must not be copied/cloned (so they can only be accessed once; + /// they are considered volatile due to potential VM guest accesses), so this consumes the + /// object. + pub fn into_slice(self) -> &'a [u8] { + // Alignment requirement is always met, resulting data is pure binary data + unsafe { self.into_typed_slice::() } + } + + /// Turn this reference into a slice with the given element type. + /// + /// # Safety + /// Caller must ensure that alignment and length requirements are met and that the resulting + /// data is valid. + pub unsafe fn into_typed_slice(self) -> &'a [T] { + // Safety ensured by the caller; we ensure that nothing outside of this buffer will be part + // of the slice + unsafe { slice::from_raw_parts(self.as_ptr() as *const T, self.len() / size_of::()) } + } + + /// Split the buffer at `mid`. + /// + /// Return `&self[..mid]` and `&self[mid..]`. + /// + /// If `mid > self.len()`, return `&self[..]` and `[]`. + pub fn split_at(self, mid: usize) -> (IoBufferRef<'a>, IoBufferRef<'a>) { + let head_len = cmp::min(mid, self.size); + + ( + IoBufferRef { + pointer: self.pointer, + size: head_len, + _lifetime: PhantomData, + }, + IoBufferRef { + // Safe because we have limited this to `self.size` + pointer: unsafe { self.pointer.add(head_len) }, + size: self.size - head_len, + _lifetime: PhantomData, + }, + ) + } + + /// Make this reference immutable. + pub fn into_ref(self) -> IoBufferRef<'a> { + self + } +} + +impl<'a> IoBufferRefTrait<'a> for IoBufferRef<'a> { + type SliceType = &'a [T]; + type PointerType = *const T; + + passthrough_trait_fn! { fn from_slice(slice: Self::SliceType) -> Self; } + passthrough_trait_fn! { fn try_into_owned(self, alignment: usize) -> io::Result; } + passthrough_trait_fn! { fn len(&self) -> usize; } + passthrough_trait_fn! { fn as_ptr(&self) -> Self::PointerType; } + passthrough_trait_fn! { fn split_at(self, mid: usize) -> (Self, Self); } + passthrough_trait_fn! { fn into_ref(self) -> IoBufferRef<'a>; } + + unsafe fn into_typed_slice(self) -> Self::SliceType { + Self::into_typed_slice(self) + } +} + +impl<'a> From> for IoBufferRef<'a> { + fn from(slice: IoSlice<'a>) -> Self { + IoBufferRef { + pointer: slice.as_ptr(), + size: slice.len(), + _lifetime: PhantomData, + } + } +} + +impl<'a> From> for IoSlice<'a> { + fn from(buf: IoBufferRef<'a>) -> Self { + IoSlice::new(buf.into_slice()) + } +} + +impl<'a> IoBufferMut<'a> { + /// Create a reference to a slice. 
+ pub fn from_slice(slice: &'a mut [u8]) -> Self { + IoBufferMut { + pointer: slice.as_mut_ptr(), + size: size_of_val(slice), + _lifetime: PhantomData, + } + } + + /// Create an owned [`IoBuffer`] with the same data (copied). + pub fn try_into_owned(self, alignment: usize) -> io::Result { + let mut new_buf = IoBuffer::new(self.len(), alignment)?; + new_buf + .as_mut() + .into_slice() + .copy_from_slice(self.into_slice()); + Ok(new_buf) + } + + /// Size in bytes. + pub fn len(&self) -> usize { + self.size + } + + /// Whether the length is 0. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Return the pointer to the start of the buffer. + pub fn as_ptr(&self) -> *mut u8 { + self.pointer + } + + /// Turn this reference into a slice. + /// + /// References to `IoBuffer`s must not be copied/cloned (so they can only be accessed once; + /// they are considered volatile due to potential VM guest accesses), so this consumes the + /// object. + pub fn into_slice(self) -> &'a mut [u8] { + // Alignment requirement is always met, resulting data is pure binary data + unsafe { self.into_typed_slice::() } + } + + /// Turn this reference into a slice with the given element type. + /// + /// # Safety + /// Caller must ensure that alignment and length requirements are met and that the resulting + /// data is valid. + pub unsafe fn into_typed_slice(self) -> &'a mut [T] { + // Safety ensured by the caller; we ensure that nothing outside of this buffer will be part + // of the slice + unsafe { slice::from_raw_parts_mut(self.as_ptr() as *mut T, self.len() / size_of::()) } + } + + /// Split the buffer at `mid`. + /// + /// Return `&self[..mid]` and `&self[mid..]`. + /// + /// If `mid > self.len()`, return `&self[..]` and `[]`. + pub fn split_at(self, mid: usize) -> (IoBufferMut<'a>, IoBufferMut<'a>) { + let head_len = cmp::min(mid, self.size); + + ( + IoBufferMut { + pointer: self.pointer, + size: head_len, + _lifetime: PhantomData, + }, + IoBufferMut { + // Safe because we have limited this to `self.size` + pointer: unsafe { self.pointer.add(head_len) }, + size: self.size - head_len, + _lifetime: PhantomData, + }, + ) + } + + /// Make this reference immutable. + pub fn into_ref(self) -> IoBufferRef<'a> { + IoBufferRef { + pointer: self.pointer, + size: self.size, + _lifetime: PhantomData, + } + } +} + +impl<'a> IoBufferRefTrait<'a> for IoBufferMut<'a> { + type SliceType = &'a mut [T]; + type PointerType = *mut T; + + passthrough_trait_fn! { fn from_slice(slice: Self::SliceType) -> Self; } + passthrough_trait_fn! { fn try_into_owned(self, alignment: usize) -> io::Result; } + passthrough_trait_fn! { fn len(&self) -> usize; } + passthrough_trait_fn! { fn as_ptr(&self) -> Self::PointerType; } + passthrough_trait_fn! { fn split_at(self, mid: usize) -> (Self, Self); } + passthrough_trait_fn! 
{ fn into_ref(self) -> IoBufferRef<'a>; } + + unsafe fn into_typed_slice(self) -> Self::SliceType { + Self::into_typed_slice(self) + } +} + +impl<'a, T: Sized> From<&'a mut [T]> for IoBufferMut<'a> { + fn from(slice: &'a mut [T]) -> Self { + IoBufferMut { + pointer: slice.as_mut_ptr() as *mut u8, + size: size_of_val(slice), + _lifetime: PhantomData, + } + } +} + +impl<'a> From> for IoBufferMut<'a> { + fn from(mut slice: IoSliceMut<'a>) -> Self { + IoBufferMut { + pointer: slice.as_mut_ptr(), + size: slice.len(), + _lifetime: PhantomData, + } + } +} + +impl<'a> From> for IoSliceMut<'a> { + fn from(buf: IoBufferMut<'a>) -> Self { + IoSliceMut::new(buf.into_slice()) + } +} + +/// Common functions for both `IoVector` and `IoVectorMut`. +#[allow(dead_code)] +pub(crate) trait IoVectorTrait: Sized { + /// `&[u8]` or `&mut [u8]`. + type SliceType; + + /// `IoSlice` or `IoSliceMut`. + type BufferType; + + /// Create an empty vector. + fn new() -> Self; + + /// Create an empty vector, pre-allocating space for `cap` buffers. + /// + /// This does not allocate an memory buffer, only space in the buffer vector. + fn with_capacity(cap: usize) -> Self; + + /// Append a slice. + fn push(&mut self, slice: Self::SliceType); + + /// Append a slice. + fn push_ioslice(&mut self, ioslice: Self::BufferType); + + /// Insert a slice at the given `index` in the buffer vector. + fn insert(&mut self, index: usize, slice: Self::SliceType); + + /// Return the sum total length in bytes of all buffers in this vector. + fn len(&self) -> u64; + + /// Return the number of buffers in this vector. + fn buffer_count(&self) -> usize; + + /// Return `true` if and only if this vector’s length is zero. + /// + /// Synonymous with whether this vector’s buffer count is zero. + fn is_empty(&self) -> bool { + debug_assert!((self.len() == 0) == (self.buffer_count() == 0)); + self.len() == 0 + } + + /// Append all buffers from the given other vector to this vector. + fn append(&mut self, other: Self); + + /// Split the vector into two. + /// + /// The first returned vector contains the bytes in the `[..mid]` range, and the second one + /// covers the `[mid..]` range. + fn split_at(self, mid: u64) -> (Self, Self); + + /// Like [`IoVectorTrait::split_at()`], but discards the head, only returning the tail. + /// + /// More efficient than to use `self.split_at(mid).1` because the former requires creating a + /// new `Vec` object for the head, which this version skips. + fn split_tail_at(self, mid: u64) -> Self; + + /// Copy the data from `self` into `slice`. + /// + /// Both must have the same length. + fn copy_into_slice(&self, slice: &mut [u8]); + + /// Create a single owned [`IoBuffer`] with the same data (copied). + fn try_into_owned(self, alignment: usize) -> io::Result; + + /// Return a corresponding `&[libc::iovec]`. + /// + /// # Safety + /// `iovec` has no lifetime information. Callers must ensure no elements in the returned slice + /// are used beyond the lifetime `'_`. + #[cfg(unix)] + unsafe fn as_iovec<'a>(&'a self) -> &'a [libc::iovec] + where + Self: 'a; + + /// Check whether `self` is aligned. + /// + /// Each buffer must be aligned to `mem_alignment`, and each buffer’s length must be aligned to + /// both `mem_alignment` and `req_alignment` (the I/O request offset/size alignment). + fn is_aligned(&self, mem_alignment: usize, req_alignment: usize) -> bool; + + /// Return the internal vector of `IoSlice` objects. + fn into_inner(self) -> Vec; +} + +/// Implement most of both `IoVector` and `IoVectorMut`. +macro_rules! 
impl_io_vector { + ($type:tt, $inner_type:tt, $buffer_type:tt, $slice_type:ty, $slice_type_lifetime_b:ty) => { + /// Vector of memory buffers. + pub struct $type<'a> { + /// Buffer list. + vector: Vec<$inner_type<'a>>, + + /// Complete size in bytes. + total_size: u64, + } + + impl<'a> $type<'a> { + /// Create an empty vector. + pub fn new() -> Self { + Self::default() + } + + /// Create an empty vector, pre-allocating space for `cap` buffers. + /// + /// This does not allocate an memory buffer, only space in the buffer vector. + pub fn with_capacity(cap: usize) -> Self { + $type { + vector: Vec::with_capacity(cap), + total_size: 0, + } + } + + /// Append a slice. + pub fn push(&mut self, slice: $slice_type) { + debug_assert!(!slice.is_empty()); + self.total_size += slice.len() as u64; + self.vector.push($inner_type::new(slice)); + } + + /// Append a slice. + pub fn push_ioslice(&mut self, ioslice: $inner_type<'a>) { + debug_assert!(!ioslice.is_empty()); + self.total_size += ioslice.len() as u64; + self.vector.push(ioslice); + } + + /// Insert a slice at the given `index` in the buffer vector. + pub fn insert(&mut self, index: usize, slice: $slice_type) { + debug_assert!(!slice.is_empty()); + self.total_size += slice.len() as u64; + self.vector.insert(index, $inner_type::new(slice)); + } + + /// Return the sum total length in bytes of all buffers in this vector. + pub fn len(&self) -> u64 { + self.total_size + } + + /// Return the number of buffers in this vector. + pub fn buffer_count(&self) -> usize { + self.vector.len() + } + + /// Return `true` if and only if this vector’s length is zero. + /// + /// Synonymous with whether this vector’s buffer count is zero. + pub fn is_empty(&self) -> bool { + debug_assert!((self.len() == 0) == (self.buffer_count() == 0)); + self.len() == 0 + } + + /// Append all buffers from the given other vector to this vector. + pub fn append(&mut self, mut other: Self) { + self.total_size += other.total_size; + self.vector.append(&mut other.vector); + } + + /// Split the vector into two. + /// + /// The first returned vector contains the bytes in the `[..mid]` range, and the second + /// one covers the `[mid..]` range. + pub fn split_at(self, mid: u64) -> (Self, Self) { + let (head, tail) = self.do_split_at(mid, true); + (head.unwrap(), tail) + } + + /// Like [`Self::split_at()`], but discards the head, only returning the tail. + /// + /// More efficient than to use `self.split_at(mid).1` because the former requires + /// creating a new `Vec` object for the head, which this version skips. + pub fn split_tail_at(self, mid: u64) -> Self { + self.do_split_at(mid, false).1 + } + + /// Copy the data from `self` into `slice`. + /// + /// Both must have the same length. + pub fn copy_into_slice(&self, slice: &mut [u8]) { + if slice.len() as u64 != self.total_size { + panic!("IoVectorTrait::copy_into_slice() called on a slice of different length from the vector"); + } + + assert!(self.total_size <= usize::MAX as u64); + + let mut offset = 0usize; + for elem in self.vector.iter() { + let next_offset = offset + elem.len(); + slice[offset..next_offset].copy_from_slice(&elem[..]); + offset = next_offset; + } + } + + /// Create a single owned [`IoBuffer`] with the same data (copied). 
+ pub fn try_into_owned(self, alignment: usize) -> io::Result { + let size = self.total_size.try_into().map_err(|_| { + io::Error::other(format!("Buffer is too big ({})", self.total_size)) + })?; + let mut new_buf = IoBuffer::new(size, alignment)?; + self.copy_into_slice(new_buf.as_mut().into_slice()); + Ok(new_buf) + } + + /// Return a corresponding `&[libc::iovec]`. + /// + /// # Safety + /// `iovec` has no lifetime information. Callers must ensure no elements in the + /// returned slice are used beyond the lifetime `'_`. + #[cfg(unix)] + pub unsafe fn as_iovec<'b>(&'b self) -> &'b [libc::iovec] where Self: 'b { + // IoSlice and IoSliceMut are defined to have the same representation in memory as + // libc::iovec does + unsafe { + mem::transmute::<&'b [$inner_type<'b>], &'b [libc::iovec]>(&self.vector[..]) + } + } + + /// Check whether `self` is aligned. + /// + /// Each buffer must be aligned to `mem_alignment`, and each buffer’s length must be + /// aligned to both `mem_alignment` and `req_alignment` (the I/O request offset/size + /// alignment). + pub fn is_aligned(&self, mem_alignment: usize, req_alignment: usize) -> bool { + // Trivial case + if mem_alignment == 1 && req_alignment == 1 { + return true; + } + + debug_assert!(mem_alignment.is_power_of_two() && req_alignment.is_power_of_two()); + let base_align_mask = mem_alignment - 1; + let len_align_mask = base_align_mask | (req_alignment - 1); + + self.vector.iter().all(|buf| { + buf.as_ptr() as usize & base_align_mask == 0 && + buf.len() & len_align_mask == 0 + }) + } + + /// Return the internal vector of `IoSlice` objects. + pub fn into_inner(self) -> Vec<$inner_type<'a>> { + self.vector + } + + /// Same as [`Self::push()`], but takes ownership of `self`. + /// + /// By taking ownership of `self` and returning it, this method allows reducing the + /// lifetime of `self` to that of `slice`, if necessary. + pub fn with_pushed<'b>(self, slice: $slice_type_lifetime_b) -> $type<'b> + where + 'a: 'b, + { + let mut vec: $type<'b> = self; + vec.push(slice); + vec + } + + /// Same as [`Self::insert()`], but takes ownership of `self.` + /// + /// By taking ownership of `self` and returning it, this method allows reducing the + /// lifetime of `self` to that of `slice`, if necessary. + pub fn with_inserted<'b>(self, index: usize, slice: $slice_type_lifetime_b) -> $type<'b> + where + 'a: 'b, + { + let mut vec: $type<'b> = self; + vec.insert(index, slice); + vec + } + + /// Implementation for [`Self::split_at()`] and [`Self::split_tail_at()`]. + /// + /// If `keep_head` is true, both head and tail are returned ([`Self::split_at()`]). + /// Otherwise, the head is discarded ([`Self::split_tail_at()`]). 
+ fn do_split_at(mut self, mid: u64, keep_head: bool) -> (Option<$type<'a>>, $type<'a>) { + if mid >= self.total_size { + // Special case: Empty tail + return ( + keep_head.then_some(self), + $type { + vector: Vec::new(), + total_size: 0, + }, + ); + } + + let mut i = 0; // Current element index + let mut offset = 0u64; // Current element offset + let (vec_head, vec_tail) = loop { + if offset == mid { + // Clean split: `i` is fully behind `mid`, the rest is fully ahead + if keep_head { + let mut vec_head = self.vector; + let vec_tail = vec_head.split_off(i); + break (Some(vec_head), vec_tail); + } else { + break (None, self.vector.split_off(i)); + } + } + + let post_elm_offset = offset + self.vector[i].len() as u64; + + if post_elm_offset > mid { + // Not so clean split: The beginning of this element was before `mid`, the end is + // behind it, so we must split this element between head and tail + let mut vec_head = self.vector; + let mut tail_iter = vec_head.drain(i..); + + // This is the current element (at `i`), which must be present + let mid_elm = tail_iter.next().unwrap(); + let mid_elm: $buffer_type<'a> = mid_elm.into(); + + // Each element's length is of type usize, so this must fit into usize + let mid_elm_head_len: usize = (mid - offset).try_into().unwrap(); + let (mid_head, mid_tail) = mid_elm.split_at(mid_elm_head_len); + + let mut vec_tail: Vec<$inner_type<'a>> = vec![mid_tail.into()]; + vec_tail.extend(tail_iter); + + if keep_head { + vec_head.push(mid_head.into()); + break (Some(vec_head), vec_tail); + } else { + break (None, vec_tail); + } + } + + offset = post_elm_offset; + + i += 1; + // We know that `mid < self.total_size`, so we must encounter `mid before the end of + // the vector + assert!(i < self.vector.len()); + }; + + let head = keep_head.then(|| $type { + vector: vec_head.unwrap(), + total_size: mid, + }); + let tail = $type { + vector: vec_tail, + total_size: self.total_size - mid, + }; + + (head, tail) + } + } + + impl<'a> IoVectorTrait for $type<'a> { + type SliceType = $slice_type; + type BufferType = $inner_type<'a>; + + passthrough_trait_fn! { fn new() -> Self; } + passthrough_trait_fn! { fn with_capacity(cap: usize) -> Self; } + passthrough_trait_fn! { fn push(&mut self, slice: Self::SliceType); } + passthrough_trait_fn! { fn push_ioslice(&mut self, ioslice: Self::BufferType); } + passthrough_trait_fn! { fn insert(&mut self, index: usize, slice: Self::SliceType); } + passthrough_trait_fn! { fn len(&self) -> u64; } + passthrough_trait_fn! { fn buffer_count(&self) -> usize; } + passthrough_trait_fn! { fn append(&mut self, other: Self); } + passthrough_trait_fn! { fn split_at(self, mid: u64) -> (Self, Self); } + passthrough_trait_fn! { fn split_tail_at(self, mid: u64) -> Self; } + passthrough_trait_fn! { fn copy_into_slice(&self, slice: &mut [u8]); } + passthrough_trait_fn! { fn try_into_owned(self, alignment: usize) -> io::Result; } + passthrough_trait_fn! { fn is_aligned(&self, mem_alignment: usize, req_alignment: usize) -> bool; } + passthrough_trait_fn! 
{ fn into_inner(self) -> Vec; } + + #[cfg(unix)] + unsafe fn as_iovec<'b>(&'b self) -> &'b [libc::iovec] + where + Self: 'b + { + Self::as_iovec(self) + } + } + + impl<'a> From>> for $type<'a> { + fn from(vector: Vec<$inner_type<'a>>) -> Self { + let total_size = vector + .iter() + .map(|e| e.len()) + .fold(0u64, |sum, e| sum + e as u64); + + $type { vector, total_size } + } + } + + impl<'a> From<$buffer_type<'a>> for $type<'a> { + fn from(buffer: $buffer_type<'a>) -> Self { + let total_size = buffer.len() as u64; + if total_size > 0 { + $type { + vector: vec![buffer.into()], + total_size, + } + } else { + $type { + vector: Vec::new(), + total_size: 0, + } + } + } + } + + impl<'a> From<$slice_type> for $type<'a> { + fn from(slice: $slice_type) -> Self { + let total_size = slice.len() as u64; + if total_size > 0 { + $type { + vector: vec![$inner_type::new(slice)], + total_size, + } + } else { + $type { + vector: Vec::new(), + total_size: 0, + } + } + } + } + + impl<'a> Default for $type<'a> { + fn default() -> Self { + $type { + vector: Vec::new(), + total_size: 0, + } + } + } + + impl Debug for $type<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct(std::stringify!($type)) + .field("vector.len()", &self.vector.len()) + .field("total_size", &self.total_size) + .finish() + } + } + }; +} + +impl_io_vector!(IoVector, IoSlice, IoBufferRef, &'a [u8], &'b [u8]); +impl_io_vector!( + IoVectorMut, + IoSliceMut, + IoBufferMut, + &'a mut [u8], + &'b mut [u8] +); + +#[cfg(feature = "vm-memory")] +impl<'a> IoVector<'a> { + /// Converts a `VolatileSlice` array (from vm-memory) into an `IoVector`. + /// + /// In addition to a the vector, return a guard that ensures that the memory in `slices` is + /// indeed mapped while in use. This guard must not be dropped while this vector is in use! + pub fn from_volatile_slice< + B: vm_memory::bitmap::BitmapSlice, + I: IntoIterator< + Item: ImagoAsRef<'a, vm_memory::VolatileSlice<'a, B>>, + IntoIter: ExactSizeIterator, + >, + >( + slices: I, + ) -> ( + Self, + VolatileSliceGuard<'a, vm_memory::volatile_memory::PtrGuard, B>, + ) { + let ptr_guards = slices + .into_iter() + .map(|slice| slice.as_ref().ptr_guard()) + .collect::>(); + let buffers = ptr_guards + .iter() + .map(|pg| { + // Safe because this whole module basically exists to follow the same design concepts + // as `VolatileSlice`. + let slice = unsafe { std::slice::from_raw_parts(pg.as_ptr(), pg.len()) }; + IoSlice::new(slice) + }) + .collect::>(); + + let vector = IoVector::from(buffers); + let guard = VolatileSliceGuard { + _ptr_guards: ptr_guards, + // `IoVector` is immutable, so no need to dirty + dirty_on_drop: None, + }; + + (vector, guard) + } +} + +impl IoVectorMut<'_> { + /// Fill all buffers in the vector with the given byte pattern. + pub fn fill(&mut self, value: u8) { + for slice in self.vector.iter_mut() { + slice.fill(value); + } + } + + /// Copy data from `slice` into the buffers in this vector. + /// + /// The vector and the slice must have the same total length. 
+ pub fn copy_from_slice(&mut self, slice: &[u8]) { + if slice.len() as u64 != self.total_size { + panic!("IoVectorMut::copy_from_slice() called on a slice of different length from the vector"); + } + + assert!(self.total_size <= usize::MAX as u64); + + let mut offset = 0usize; + for elem in self.vector.iter_mut() { + let next_offset = offset + elem.len(); + elem.copy_from_slice(&slice[offset..next_offset]); + offset = next_offset; + } + } +} + +#[cfg(feature = "vm-memory")] +impl<'a> IoVectorMut<'a> { + /// Converts a `VolatileSlice` array (from vm-memory) into an `IoVectorMut`. + /// + /// In addition to a the vector, return a guard that ensures that the memory in `slices` is + /// indeed mapped while in use. This guard must not be dropped while this vector is in use! + pub fn from_volatile_slice< + B: vm_memory::bitmap::BitmapSlice, + I: IntoIterator< + Item: ImagoAsRef<'a, vm_memory::VolatileSlice<'a, B>>, + IntoIter: ExactSizeIterator, + >, + >( + slices: I, + ) -> ( + Self, + VolatileSliceGuard<'a, vm_memory::volatile_memory::PtrGuardMut, B>, + ) { + let slices = slices.into_iter(); + let slice_count = slices.len(); + let mut ptr_guards = Vec::with_capacity(slice_count); + let mut dirty_on_drop = Vec::with_capacity(slice_count); + + for slice in slices { + let slice = slice.as_ref(); + ptr_guards.push(slice.ptr_guard_mut()); + // `IoVector` is mutable, so we can assume it will all be written + dirty_on_drop.push((slice.bitmap(), slice.len())); + } + + let buffers = ptr_guards + .iter() + .map(|pg| { + // Safe because this whole module basically exists to follow the same design concepts + // as `VolatileSlice`. + let slice = unsafe { std::slice::from_raw_parts_mut(pg.as_ptr(), pg.len()) }; + IoSliceMut::new(slice) + }) + .collect::>(); + + let vector = IoVectorMut::from(buffers); + let guard = VolatileSliceGuard { + _ptr_guards: ptr_guards, + dirty_on_drop: Some(dirty_on_drop), + }; + + (vector, guard) + } +} + +impl<'a> From<&'a Vec> for IoVector<'a> { + fn from(vec: &'a Vec) -> Self { + vec.as_slice().into() + } +} + +impl<'a> From<&'a IoBuffer> for IoVector<'a> { + fn from(buf: &'a IoBuffer) -> Self { + buf.as_ref().into_slice().into() + } +} + +impl<'a> From<&'a mut Vec> for IoVectorMut<'a> { + fn from(vec: &'a mut Vec) -> Self { + vec.as_mut_slice().into() + } +} + +impl<'a> From<&'a mut IoBuffer> for IoVectorMut<'a> { + fn from(buf: &'a mut IoBuffer) -> Self { + buf.as_mut().into_slice().into() + } +} + +/// Ensures an I/O vector’s validity when created from `[VolatileSlice]`. +/// +/// `[VolatileSlice]` arrays may require being explicitly mapped before use (and unmapped after), +/// and this guard ensures that the memory is mapped until it is dropped. +/// +/// Further, for mutable vectors ([`IoVectorMut`]), it will also dirty the corresponding bitmap +/// slices when dropped, assuming the whole vector has been written. +#[cfg(feature = "vm-memory")] +pub struct VolatileSliceGuard<'a, PtrGuardType, BitmapType: vm_memory::bitmap::Bitmap> { + /// vm-memory’s pointer guards ensuring the memory remains mapped while used. + _ptr_guards: Vec, + + /// If given, mark the given dirty bitmap range as dirty when dropping this guard. + /// + /// `.1` is the length of the respective `VolatileSlice` (i.e. the length of the area to + /// dirty). 
+ dirty_on_drop: Option>, +} + +#[cfg(feature = "vm-memory")] +impl Drop for VolatileSliceGuard<'_, P, B> { + fn drop(&mut self) { + if let Some(dirty_on_drop) = self.dirty_on_drop.take() { + for (bitmap, len) in dirty_on_drop { + // Every bitmap is a window into the full bitmap for its specific `VolatileSlice`, + // so marking the whole thing is dirty is correct. + bitmap.mark_dirty(0, len); + } + } + } +} + +#[cfg(all(test, feature = "vm-memory"))] +mod vm_memory_test { + use crate::io_buffers::{IoVector, IoVectorMut}; + use vm_memory::bitmap::BitmapSlice; + use vm_memory::VolatileSlice; + + pub fn do_test_volatile_slice_owned(slices: &[VolatileSlice]) { + { + let _vec = IoVector::from_volatile_slice(slices); + } + { + let _vec = IoVectorMut::from_volatile_slice(slices); + } + } + + #[test] + fn test_volatile_slice_owned() { + let empty: Vec> = Vec::new(); + do_test_volatile_slice_owned(&empty); + } + + pub fn do_test_volatile_slice_ref(slices: &[&VolatileSlice]) { + { + let _vec = IoVector::from_volatile_slice(slices); + } + { + let _vec = IoVectorMut::from_volatile_slice(slices); + } + } + + #[test] + fn test_volatile_slice_ref() { + let empty: Vec<&vm_memory::VolatileSlice<()>> = Vec::new(); + do_test_volatile_slice_ref(&empty); + } +} diff --git a/src/imago/src/lib.rs b/src/imago/src/lib.rs new file mode 100644 index 00000000..bd6d9f11 --- /dev/null +++ b/src/imago/src/lib.rs @@ -0,0 +1,106 @@ +// #![feature(async_drop)] -- enable with async-drop +#![cfg_attr(all(doc, nightly), feature(doc_auto_cfg))] // expect nightly for doc +#![warn(missing_docs)] +#![warn(clippy::missing_docs_in_private_items)] + +//! Provides access to VM image formats. +//! +//! Simple example (requires the `sync-wrappers` feature): +//! ```no_run +//! # #[cfg(feature = "sync-wrappers")] +//! # || -> std::io::Result<()> { +//! use imago::file::File; +//! use imago::qcow2::Qcow2; +//! use imago::SyncFormatAccess; +//! use std::fs::OpenOptions; +//! +//! // Produce read-only qcow2 instance using purely `File` for storage +//! let mut qcow2 = Qcow2::::open_path_sync("image.qcow2", false)?; +//! qcow2.open_implicit_dependencies_sync()?; +//! +//! let qcow2 = SyncFormatAccess::new(qcow2)?; +//! +//! let mut buf = vec![0u8; 512]; +//! qcow2.read(&mut buf, 0)?; +//! # Ok::<(), std::io::Error>(()) +//! # }; +//! ``` +//! +//! Another example, using the native async interface instead of sync wrapper functions, explicitly +//! overriding the implicit references contained in qcow2 files, and showcasing using different +//! types of storage (specifically normal files and null storage): +//! ```no_run +//! # let _ = async { +//! use imago::file::File; +//! use imago::null::Null; +//! use imago::qcow2::Qcow2; +//! use imago::raw::Raw; +//! use imago::{DynStorage, FormatAccess, Storage, StorageOpenOptions}; +//! use std::sync::Arc; +//! +//! let qcow2_file_opts = StorageOpenOptions::new() +//! .write(true) +//! .filename(String::from("image.qcow2")); +//! let qcow2_file = File::open(qcow2_file_opts).await?; +//! +//! // Produce qcow2 instance with arbitrary (and potentially mixed) storage instances +//! let mut qcow2 = +//! Qcow2::, Arc>>::open_image(Box::new(qcow2_file), true) +//! .await?; +//! +//! let backing_storage: Box = Box::new(Null::new(0)); +//! let backing = Raw::open_image(backing_storage, false).await?; +//! let backing = Arc::new(FormatAccess::new(backing)); +//! qcow2.set_backing(Some(Arc::clone(&backing))); +//! +//! // Open potentially remaining dependencies (like an external data file) +//! 
qcow2.open_implicit_dependencies().await?; +//! +//! let qcow2 = FormatAccess::new(qcow2); +//! +//! let mut buf = vec![0u8; 512]; +//! qcow2.read(&mut buf, 0).await?; +//! +//! qcow2.flush().await?; +//! # Ok::<(), std::io::Error>(()) +//! # }; +//! ``` +//! +//! # Flushing +//! +//! Given that `AsyncDrop` is not stable yet (and probably will not be stable for a long time), +//! callers must ensure that images are properly flushed before dropping them, i.e. call +//! `.flush().await` on any image that is not read-only. +//! +//! (The synchronous wrapper [`SyncFormatAccess`] does perform a synchronous flush in its `Drop` +//! implementation.) +//! +//! # Features +//! +//! - `sync-wrappers`: Provide synchronous wrappers for the native `async` interface. Note that +//! these build a `tokio` runtime in which they run the `async` functions, so using the `async` +//! interface is definitely preferred. +//! +//! - `vm-memory`: Provide conversion functions +//! [`IoVector::from_volatile_slice`](io_buffers::IoVector::from_volatile_slice) and +//! [`IoVectorMut::from_volatile_slice`](io_buffers::IoVectorMut::from_volatile_slice) to convert +//! the vm-memory crate’s `[VolatileSlice]` arrays into imago’s native I/O vectors. + +pub mod annotated; +mod async_lru_cache; +pub mod file; +pub mod format; +pub mod io_buffers; +mod macros; +mod misc_helpers; +pub mod null; +pub mod qcow2; +pub mod raw; +pub mod storage; +mod vector_select; + +pub use format::access::*; +#[cfg(feature = "sync-wrappers")] +pub use format::sync_wrappers::*; +pub use storage::ext::StorageExt; +pub use storage::*; diff --git a/src/imago/src/macros.rs b/src/imago/src/macros.rs new file mode 100644 index 00000000..4c31c0b3 --- /dev/null +++ b/src/imago/src/macros.rs @@ -0,0 +1,85 @@ +//! Helper macros. + +/// Implements `TryFrom` for enums from their numerical representation. +macro_rules! numerical_enum { + ( + $(#[$attr:meta])* + $vis:vis enum $enum_name:ident as $repr:tt { + $( + $(#[$id_attr:meta])* + $identifier:ident = $value:expr, + )+ + } + ) => { + $(#[$attr])* + #[derive(Copy, Clone, Debug, Eq, PartialEq)] + #[repr($repr)] + $vis enum $enum_name { + $( + $(#[$id_attr])* + $identifier = $value, + )+ + } + + impl TryFrom<$repr> for $enum_name { + type Error = std::io::Error; + + fn try_from(val: $repr) -> std::io::Result { + match val { + $(x if x == $value => Ok($enum_name::$identifier),)* + _ => Err(std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!( + "Invalid value for {}: {:x}", + stringify!($enum_name), + val, + ), + )), + } + } + } + } +} + +pub(crate) use numerical_enum; + +/// Implements a function as itself. +/// +/// For traits that generalize interfaces that duplicate what we have on the struct itself, too. +/// For example, we want to have `IoVectorTrait`, but not export it; requiring users to import that +/// trait just for `.len()` is silly. So `.len()` is implemented directly on both `IoVector` and +/// `IoVectorMut` -- still, we want to have a generic `IoVectorTrait::len()`, too. This is what +/// this macro implements. +macro_rules! passthrough_trait_fn { + { fn $name:ident($($param:ident: $type:ty),*) -> $ret:ty; } => { + fn $name($($param: $type),*) -> $ret { + Self::$name($($param),*) + } + }; + + { fn $name:ident(self$(, $param:ident: $type:ty)*) -> $ret:ty; } => { + passthrough_trait_fn! { fn $name(self: Self$(, $param: $type)*) -> $ret; } + }; + + { fn $name:ident(&self$(, $param:ident: $type:ty)*) -> $ret:ty; } => { + passthrough_trait_fn! 
{ fn $name(self: &Self$(, $param: $type)*) -> $ret; } + }; + + { fn $name:ident(&mut self$(, $param:ident: $type:ty)*) -> $ret:ty; } => { + passthrough_trait_fn! { fn $name(self: &mut Self$(, $param: $type)*) -> $ret; } + }; + + { fn $name:ident(self$(, $param:ident: $type:ty)*); } => { + passthrough_trait_fn! { fn $name(self$(, $param: $type)*) -> (); } + }; + + { fn $name:ident(&self$(, $param:ident: $type:ty)*); } => { + passthrough_trait_fn! { fn $name(&self$(, $param: $type)*) -> (); } + }; + + { fn $name:ident(&mut self$(, $param:ident: $type:ty)*); } => { + passthrough_trait_fn! { fn $name(&mut self$(, $param: $type)*) -> (); } + }; +} + +pub(crate) use passthrough_trait_fn; diff --git a/src/imago/src/misc_helpers.rs b/src/imago/src/misc_helpers.rs new file mode 100644 index 00000000..22fe3d93 --- /dev/null +++ b/src/imago/src/misc_helpers.rs @@ -0,0 +1,81 @@ +//! Miscellaneous helper functions. + +use std::io; +use std::ops::Range; + +/// Checks whether something overlaps with something else. +pub(crate) trait Overlaps { + /// Does this overlap with `other`? + fn overlaps(&self, other: &Self) -> bool; +} + +impl Overlaps for Range { + fn overlaps(&self, other: &Self) -> bool { + self.start < other.end && other.start < self.end + } +} + +/// Prepend `Error` messages by context. +/// +/// Trait for `Error` objects that allows prepending their error messages by something that gives +/// context. +pub(crate) trait ErrorContext { + /// Prepend the error by `context`. + fn context(self, context: C) -> Self; +} + +impl ErrorContext for io::Error { + fn context(self, context: C) -> Self { + io::Error::new(self.kind(), format!("{context}: {self}")) + } +} + +/// Give results context in case of error. +/// +/// Lifts the `ErrorContext` trait to `Result` types. +pub(crate) trait ResultErrorContext { + /// Give context if `self` is an error. + /// + /// If `self` is an error, prepend the given `context`. + fn err_context C>(self, context: F) -> Self; +} + +impl ResultErrorContext for Result { + fn err_context C>(self, context: F) -> Self { + self.map_err(|err| err.context(context())) + } +} + +/// Similar to `AsRef`, but for types where `AsRef` is not implemented. +/// +/// When we need `AsRef` for a type but it is not implemented in its origin crate, there is no way +/// but to provide a local trait that we can implement here. Because there are no negative trait +/// bounds, we cannot implement this for `AsRef` (to have a common trait). +/// +/// Also includes a lifetime so that it is possible to borrow things for longer. +pub trait ImagoAsRef<'a, T: ?Sized> { + /// Return a simple reference for `self`. + fn as_ref(&self) -> &'a T; +} + +impl<'a, T: ?Sized, U: ImagoAsRef<'a, T>> ImagoAsRef<'a, T> for &'a U { + fn as_ref(&self) -> &'a T { + >::as_ref(self) + } +} + +#[cfg(feature = "vm-memory")] +impl<'a, B: vm_memory::bitmap::BitmapSlice> ImagoAsRef<'a, vm_memory::VolatileSlice<'a, B>> + for &'a vm_memory::VolatileSlice<'a, B> +{ + fn as_ref(&self) -> &'a vm_memory::VolatileSlice<'a, B> { + self + } +} + +/// Generate an `io::Error` of kind `InvalidData`. +pub(crate) fn invalid_data>>( + error: E, +) -> io::Error { + io::Error::new(io::ErrorKind::InvalidData, error) +} diff --git a/src/imago/src/null.rs b/src/imago/src/null.rs new file mode 100644 index 00000000..d6fa379c --- /dev/null +++ b/src/imago/src/null.rs @@ -0,0 +1,82 @@ +//! Null storage. +//! +//! Discard all written data, and return zeroes when read. 
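+//!
+//! A hedged example: this passes the `Null` object directly to `Raw::open_image()` (the
+//! crate-level docs use a boxed storage there, so this direct form is an assumption), and it
+//! must run inside an async (tokio) context.
+//! ```no_run
+//! # let _ = async {
+//! use imago::null::Null;
+//! use imago::raw::Raw;
+//! use imago::FormatAccess;
+//!
+//! // 1 MiB of virtual storage: reads return zeroes, writes are discarded.
+//! let null = Null::new(1 << 20);
+//! let raw = Raw::open_image(null, false).await?;
+//! let image = FormatAccess::new(raw);
+//!
+//! let mut buf = vec![0u8; 512];
+//! image.read(&mut buf, 0).await?;
+//! assert!(buf.iter().all(|&b| b == 0));
+//! # Ok::<(), std::io::Error>(())
+//! # };
+//! ```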
+ +use crate::io_buffers::{IoVector, IoVectorMut}; +use crate::storage::drivers::CommonStorageHelper; +use crate::Storage; +use std::fmt::{self, Display, Formatter}; +use std::io; +use std::sync::atomic::{AtomicU64, Ordering}; + +/// Null storage object. +/// +/// Reading from this will always return zeroes, writing to it does nothing (except to potentially +/// grow its virtual “file length”). +#[derive(Debug)] +pub struct Null { + /// Virtual “file length”. + size: AtomicU64, + + /// Storage helper. + common_storage_helper: CommonStorageHelper, +} + +impl Null { + /// Create a new null storage object with the given initial virtual size. + pub fn new(size: u64) -> Self { + Null { + size: size.into(), + common_storage_helper: Default::default(), + } + } +} + +impl Storage for Null { + fn size(&self) -> io::Result { + Ok(self.size.load(Ordering::Relaxed)) + } + + async unsafe fn pure_readv(&self, mut bufv: IoVectorMut<'_>, _offset: u64) -> io::Result<()> { + bufv.fill(0); + Ok(()) + } + + async unsafe fn pure_writev(&self, bufv: IoVector<'_>, offset: u64) -> io::Result<()> { + let Some(end) = offset.checked_add(bufv.len()) else { + return Err(io::Error::other("Write too long")); + }; + + self.size.fetch_max(end, Ordering::Relaxed); + Ok(()) + } + + async unsafe fn pure_write_zeroes(&self, offset: u64, length: u64) -> io::Result<()> { + let Some(end) = offset.checked_add(length) else { + return Err(io::Error::other("Write too long")); + }; + + self.size.fetch_max(end, Ordering::Relaxed); + Ok(()) + } + + async fn flush(&self) -> io::Result<()> { + // Nothing to do, there are no buffers + Ok(()) + } + + async fn sync(&self) -> io::Result<()> { + // Nothing to do, there is no hardware + Ok(()) + } + + fn get_storage_helper(&self) -> &CommonStorageHelper { + &self.common_storage_helper + } +} + +impl Display for Null { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "null:[{}B]", self.size.load(Ordering::Relaxed)) + } +} diff --git a/src/imago/src/qcow2/allocation.rs b/src/imago/src/qcow2/allocation.rs new file mode 100644 index 00000000..6c08716b --- /dev/null +++ b/src/imago/src/qcow2/allocation.rs @@ -0,0 +1,534 @@ +//! Cluster allocation. +//! +//! Functionality for allocating single clusters and ranges of clusters, and general handling of +//! refcount structures. + +use super::cache::RefBlockCacheBackend; +use super::*; +use std::mem; +use tokio::sync::MutexGuard; +use tracing::{event, warn, Level}; + +/// Central facility for cluster allocation. +pub(super) struct Allocator { + /// Qcow2 metadata file. + file: Arc, + + /// Qcow2 refcount table. + reftable: RefTable, + + /// The first free cluster index in the qcow2 file, to speed up allocation. + first_free_cluster: HostCluster, + + /// Qcow2 image header. + header: Arc
, + + /// Refblock cache. + rb_cache: AsyncLruCache>, +} + +impl + 'static> Qcow2 { + /// Return the central allocator instance. + /// + /// Returns an error for read-only images. + async fn allocator(&self) -> io::Result>> { + Ok(self + .allocator + .as_ref() + .ok_or_else(|| io::Error::other("Image is read-only"))? + .lock() + .await) + } + + /// Allocate one metadata cluster. + /// + /// Metadata clusters are allocated exclusively in the metadata (image) file. + pub(super) async fn allocate_meta_cluster(&self) -> io::Result { + self.allocate_meta_clusters(ClusterCount(1)).await + } + + /// Allocate multiple continuous metadata clusters. + /// + /// Useful e.g. for the L1 table or refcount table. + pub(super) async fn allocate_meta_clusters( + &self, + count: ClusterCount, + ) -> io::Result { + self.allocator().await?.allocate_clusters(count, None).await + } + + /// Allocate one data clusters for the given guest cluster. + /// + /// Without an external data file, data clusters are allocated in the image file, just like + /// metadata clusters. + /// + /// With an external data file, data clusters aren’t really allocated, but just put there at + /// the same offset as their guest offset. Their refcount is not tracked by the qcow2 metadata + /// structures (which only cover the metadata (image) file). + pub(super) async fn allocate_data_cluster( + &self, + guest_cluster: GuestCluster, + ) -> io::Result { + if self.header.external_data_file() { + Ok(HostCluster(guest_cluster.0)) + } else { + let mut allocator = self.allocator().await?; + + // Allocate clusters before setting up L2 entries + self.l2_cache.depend_on(&allocator.rb_cache).await?; + + allocator.allocate_clusters(ClusterCount(1), None).await + } + } + + /// Allocate the data cluster with the given index. + /// + /// Without a `mandatory_host_cluster` given, this is the same as + /// [`Qcow2::allocate_data_cluster()`]. + /// + /// With a `mandatory_host_cluster` given, try to allocate that cluster. If that is not + /// possible because it is already allocated, return `Ok(None)`. + pub(super) async fn allocate_data_cluster_at( + &self, + guest_cluster: GuestCluster, + mandatory_host_cluster: Option, + ) -> io::Result> { + let Some(mandatory_host_cluster) = mandatory_host_cluster else { + return self.allocate_data_cluster(guest_cluster).await.map(Some); + }; + + if self.header.external_data_file() { + let cluster = HostCluster(guest_cluster.0); + Ok((cluster == mandatory_host_cluster).then_some(cluster)) + } else { + let mut allocator = self.allocator().await?; + + // Allocate clusters before setting up L2 entries + self.l2_cache.depend_on(&allocator.rb_cache).await?; + + let cluster = allocator + .allocate_cluster_at(mandatory_host_cluster) + .await? + .then_some(mandatory_host_cluster); + Ok(cluster) + } + } + + /// Free metadata clusters (i.e. decrement their refcount). + /// + /// Best-effort operation. On error, the given clusters may be leaked, but no errors are ever + /// returned (because there is no good way to handle such errors anyway). + pub(super) async fn free_meta_clusters(&self, cluster: HostCluster, count: ClusterCount) { + if let Ok(mut allocator) = self.allocator().await { + allocator.free_clusters(cluster, count).await + } + } + + /// Free data clusters (i.e. decrement their refcount). + /// + /// Best-effort operation. On error, the given clusters may be leaked, but no errors are ever + /// returned (because there is no good way to handle such errors anyway). 
+ pub(super) async fn free_data_clusters(&self, cluster: HostCluster, count: ClusterCount) { + if !self.header.external_data_file() { + if let Ok(mut allocator) = self.allocator().await { + // Clear L2 entries before deallocating clusters + if let Err(err) = allocator.rb_cache.depend_on(&self.l2_cache).await { + warn!("Leaking clusters; cannot set up cache inter-dependency with L2 cache: {err}"); + return; + } + + allocator.free_clusters(cluster, count).await; + } + } + } +} + +impl Allocator { + /// Create a new allocator for the given image file. + pub async fn new(image: Arc, header: Arc
) -> io::Result { + let cb = header.cluster_bits(); + let rt_offset = header.reftable_offset(); + let rt_cluster = rt_offset + .checked_cluster(cb) + .ok_or_else(|| invalid_data(format!("Unaligned refcount table: {rt_offset}")))?; + + let reftable = RefTable::load( + image.as_ref(), + &header, + rt_cluster, + header.reftable_entries(), + ) + .await?; + + let rb_cache_backend = RefBlockCacheBackend::new(Arc::clone(&image), Arc::clone(&header)); + let rb_cache = AsyncLruCache::new(rb_cache_backend, 32); + + Ok(Allocator { + file: image, + reftable, + first_free_cluster: HostCluster(0), + header, + rb_cache, + }) + } + + /// Flush the refcount block cache. + pub async fn flush_rb_cache(&self) -> io::Result<()> { + self.rb_cache.flush().await + } + + /// Allocate clusters in the image file. + /// + /// `end_cluster` should only be used when allocating refblocks. When reaching this cluster + /// index, abort trying to allocate. (This is used for allocating refblocks, to prevent + /// infinite recursion and speed things up.) + async fn allocate_clusters( + &mut self, + count: ClusterCount, + end_cluster: Option, + ) -> io::Result { + let mut index = self.first_free_cluster; + loop { + if end_cluster == Some(index) { + return Err(io::Error::other("Maximum cluster index reached")); + } + + let alloc_count = self.allocate_clusters_at(index, count).await?; + if alloc_count == count { + return Ok(index); + } + + index += alloc_count + ClusterCount(1); + if index.offset(self.header.cluster_bits()) > MAX_OFFSET { + return Err(io::Error::other("Cannot grow qcow2 file any further")); + } + } + } + + /// Allocate the given clusters in the image file. + /// + /// Allocate up to `count` unallocated clusters starting from `index`. When encountering an + /// already allocated cluster (or any other error), stop, and free the clusters that were just + /// newly allocated. + /// + /// Returns the number of clusters that could be allocated (starting from `index`), which may + /// be 0 if `index` has already been allocated. Note again that in case this is less than + /// `count`, those clusters will have been freed again already, so this is just a hint to + /// callers that the cluster at `index + count` is already allocated. + async fn allocate_clusters_at( + &mut self, + mut index: HostCluster, + mut count: ClusterCount, + ) -> io::Result { + let start_index = index; + + while count > ClusterCount(0) { + // Note that `ensure_rb()` in `allocate_cluster_at()` may allocate clusters (new + // refblocks), and also a new refcount table. This can interfere with us allocating a + // large continuous region like so (A is our allocation, R is a refblock, imagine a + // refblock covers four clusters): + // + // |AAAA| -- allocated four clusters need new refblock + // |AAAA|R | -- made refblock self-describing, but now allocation cannot go on + // + // This gets resolved by us retrying, and future refblocks using the region that has + // now become free but already has refblocks to cover it: + // + // | |RAAA| -- retry after refblock; need a new refblock again + // |R |RAAA|AAAA| -- the new refblock allocates itself in the region we abandoned + // + // However, eventually, the new refblocks will run into the new start of our allocation + // again: + // + // |RRRR|RAAA|AAAA|AAAA|AAAA|AAAA| -- need new refblock + // |RRRR|RAAA|AAAA|AAAA|AAAA|AAAA|R | -- allocation cannot go on, again + // |RRRR|R | | | | |RAAA| -- another attempt + // |RRRR|RRRR|R...| | | |RAAA|AAAA|AAAA|AAAA|AAAA|... 
+ // + // As you can see, the hole we leave behind gets larger each time. So eventually, this + // must converge. + // + // The same applies to the refcount table being allocated instead of just refblocks. + + let result = self.allocate_cluster_at(index).await; + if !matches!(result, Ok(true)) { + // Already allocated, or some real error occurred; free everything allocated so far + self.free_clusters(start_index, index - start_index).await; + return result.map(|_| index - start_index); + } + + count -= ClusterCount(1); + index += ClusterCount(1); + } + + Ok(index - start_index) + } + + /// Allocate the given cluster in the image file. + /// + /// Return `Ok(true)` if allocation was successful, or `Ok(false)` if the cluster was already + /// allocated before. + async fn allocate_cluster_at(&mut self, index: HostCluster) -> io::Result { + let rb_bits = self.header.rb_bits(); + let (rt_index, rb_index) = index.rt_rb_indices(rb_bits); + + let rb = self.ensure_rb(rt_index).await?; + let mut rb = rb.lock_write().await; + let can_allocate = rb.is_zero(rb_index); + if can_allocate { + rb.increment(rb_index)?; + } + + // We now know this is allocated + if index == self.first_free_cluster { + self.first_free_cluster = index + ClusterCount(1); + } + + Ok(can_allocate) + } + + /// Get the refblock referenced by the given reftable index, if any. + /// + /// If there is no refblock for the given reftable index, return `Ok(None)`. + async fn get_rb(&mut self, rt_index: usize) -> io::Result>> { + let rt_entry = self.reftable.get(rt_index); + if let Some(rb_offset) = rt_entry.refblock_offset() { + let cb = self.header.cluster_bits(); + let rb_cluster = rb_offset.checked_cluster(cb).ok_or_else(|| { + invalid_data(format!("Unaligned refcount block with index {rt_index}; refcount table entry: {rt_entry:?}")) + })?; + + self.rb_cache.get_or_insert(rb_cluster).await.map(Some) + } else { + Ok(None) + } + } + + /// Get a refblock for the given reftable index. + /// + /// If there already is a refblock at that index, return it. Otherwise, create one and hook it + /// up. + async fn ensure_rb(&mut self, rt_index: usize) -> io::Result> { + if let Some(rb) = self.get_rb(rt_index).await? { + return Ok(rb); + } + + if !self.reftable.in_bounds(rt_index) { + self.grow_reftable(rt_index).await?; + // `grow_reftable` will allocate new refblocks, so check the index again + if let Some(rb) = self.get_rb(rt_index).await? { + return Ok(rb); + } + } + + let mut new_rb = RefBlock::new_cleared(self.file.as_ref(), &self.header)?; + + // This is the first cluster covered by the new refblock + let rb_cluster = HostCluster::from_ref_indices(rt_index, 0, self.header.rb_bits()); + + // Try to allocate a cluster in the already existing refcount structures. + // By stopping looking for clusters at `rb_cluster`, we ensure that we will not land here + // in this exact function again, trying to allocate the very same refblock (it is possible + // we allocate one before the current one, though), and so prevent any possible infinite + // recursion. + // Recursion is possible, though, so the future must be boxed. + // false`), so must be boxed. 
+ if let Ok(new_rb_cluster) = + Box::pin(self.allocate_clusters(ClusterCount(1), Some(rb_cluster))).await + { + new_rb.set_cluster(new_rb_cluster); + } else { + // Place the refblock such that it covers itself + new_rb.set_cluster(rb_cluster); + new_rb.lock_write().await.increment(0)?; + } + new_rb.write(self.file.as_ref()).await?; + + self.reftable.enter_refblock(rt_index, &new_rb)?; + self.reftable + .write_entry(self.file.as_ref(), rt_index) + .await?; + + let new_rb = Arc::new(new_rb); + self.rb_cache + .insert(new_rb.get_cluster().unwrap(), Arc::clone(&new_rb)) + .await?; + Ok(new_rb) + } + + /// Create a new refcount table covering at least `at_least_index`. + /// + /// Create a new reftable of the required size, copy all existing refblock references into it, + /// ensure it is refcounted itself (also creating new refblocks if necessary), and have the + /// image header reference the new refcount table. + async fn grow_reftable(&mut self, at_least_index: usize) -> io::Result<()> { + let cb = self.header.cluster_bits(); + let rb_bits = self.header.rb_bits(); + let rb_entries = 1 << rb_bits; + + let mut new_rt = self.reftable.clone_and_grow(&self.header, at_least_index)?; + let rt_clusters = ClusterCount::from_byte_size(new_rt.byte_size() as u64, cb); + + // Find free range + let (mut rt_index, mut rb_index) = self.first_free_cluster.rt_rb_indices(rb_bits); + let mut free_cluster_index: Option = None; + let mut free_cluster_count = ClusterCount(0); + + // Number of clusters required to allocate both the new reftable and all new refblocks. + // Note that `clone_and_grow()` *guarantees* we can fit the final count in there. + let mut required_clusters = rt_clusters; + + while free_cluster_count < required_clusters { + // `clone_and_grow()` guarantees it can fit + assert!(new_rt.in_bounds(rt_index)); + + let rt_entry = new_rt.get(rt_index); + let Some(rb_offset) = rt_entry.refblock_offset() else { + let start_index = HostCluster::from_ref_indices(rt_index, 0, rb_bits); + free_cluster_index.get_or_insert(start_index); + free_cluster_count += ClusterCount(rb_entries as u64); + // Need to allocate this RB + required_clusters += ClusterCount(1); + continue; + }; + + let rb_cluster = rb_offset.checked_cluster(cb).ok_or_else(|| { + invalid_data(format!("Unaligned refcount block with index {rt_index}; refcount table entry: {rt_entry:?}")) + })?; + + let rb = self.rb_cache.get_or_insert(rb_cluster).await?; + for i in rb_index..rb_entries { + if rb.is_zero(i) { + let index = HostCluster::from_ref_indices(rt_index, i, rb_bits); + free_cluster_index.get_or_insert(index); + free_cluster_count += ClusterCount(1); + + if free_cluster_count >= required_clusters { + break; + } + } else if free_cluster_index.is_some() { + free_cluster_index.take(); + free_cluster_count = ClusterCount(0); + required_clusters = rt_clusters; // reset + } + } + + rb_index = 0; + rt_index += 1; + } + + let mut index = free_cluster_index.unwrap(); + let mut count = required_clusters; + + // Put refblocks first + let rt_index_start = index.rt_index(rb_bits); + let rt_index_end = (index + count).0.div_ceil(rb_entries as u64) as usize; + + let mut refblocks = Vec::>::new(); + for rt_i in rt_index_start..rt_index_end { + if let Some(rb_offset) = new_rt.get(rt_i).refblock_offset() { + // Checked in the loop above + let rb_cluster = rb_offset.checked_cluster(cb).unwrap(); + let rb = self.rb_cache.get_or_insert(rb_cluster).await?; + refblocks.push(rb); + continue; + } + + let mut rb = RefBlock::new_cleared(self.file.as_ref(), 
&self.header)?; + rb.set_cluster(index); + new_rt.enter_refblock(rt_i, &rb)?; + let rb = Arc::new(rb); + self.rb_cache.insert(index, Arc::clone(&rb)).await?; + refblocks.push(rb); + index += ClusterCount(1); + count -= ClusterCount(1); + } + + assert!(count >= rt_clusters); + new_rt.set_cluster(index); + + // Now set allocation information + let start_index = free_cluster_index.unwrap(); + let end_index = index + rt_clusters; + + for index in start_index.0..end_index.0 { + let index = HostCluster(index); + let (rt_i, rb_i) = index.rt_rb_indices(rb_bits); + + // `refblocks[0]` is for `rt_index_start` + let rb_vec_i = rt_i - rt_index_start; + // Incrementing from 0 to 1 must succeed + refblocks[rb_vec_i] + .lock_write() + .await + .increment(rb_i) + .unwrap(); + } + + // Any errors from here on may lead to leaked clusters if there are refblocks in + // `refblocks` that are already part of the old reftable. + // TODO: Try to clean that up, though it seems quite hard for little gain. + self.rb_cache.flush().await?; + new_rt.write(self.file.as_ref()).await?; + + self.header.set_reftable(&new_rt)?; + self.header + .write_reftable_pointer(self.file.as_ref()) + .await?; + + // Must set new reftable before calling `free_clusters()` + let mut old_reftable = mem::replace(&mut self.reftable, new_rt); + if let Some(old_rt_cluster) = old_reftable.get_cluster() { + let old_rt_size = old_reftable.cluster_count(); + old_reftable.unset_cluster(); + self.free_clusters(old_rt_cluster, old_rt_size).await; + } + + Ok(()) + } + + /// Free clusters (i.e. decrement their refcount). + /// + /// Best-effort operation. On error, the given clusters may be leaked, but no errors are ever + /// returned (because there is no good way to handle such errors anyway). + async fn free_clusters(&mut self, start: HostCluster, mut count: ClusterCount) { + if count.0 == 0 { + return; + } + + if start < self.first_free_cluster { + self.first_free_cluster = start; + } + + let rb_bits = self.header.rb_bits(); + let rb_entries = 1 << rb_bits; + let (mut rt_index, mut rb_index) = start.rt_rb_indices(rb_bits); + + while count > ClusterCount(0) { + let in_rb_count = cmp::min((rb_entries - rb_index) as u64, count.0) as usize; + + match self.get_rb(rt_index).await { + Ok(Some(rb)) => { + let mut rb = rb.lock_write().await; + for i in rb_index..(rb_index + in_rb_count) { + if let Err(err) = rb.decrement(i) { + event!(Level::WARN, "Failed to free cluster: {err}"); + } + } + } + + Ok(None) => { + event!( + Level::WARN, + "Failed to free {in_rb_count} clusters: Not allocated" + ) + } + Err(err) => event!(Level::WARN, "Failed to free {in_rb_count} clusters: {err}"), + } + + count -= ClusterCount(in_rb_count as u64); + rb_index = 0; + rt_index += 1; + } + } +} diff --git a/src/imago/src/qcow2/cache.rs b/src/imago/src/qcow2/cache.rs new file mode 100644 index 00000000..e61b757a --- /dev/null +++ b/src/imago/src/qcow2/cache.rs @@ -0,0 +1,84 @@ +//! Provides functionality for the L2 and refblock caches. + +use super::*; +use crate::async_lru_cache::AsyncLruCacheBackend; +use tracing::trace; + +/// I/O back-end for the L2 table cache. +pub(super) struct L2CacheBackend { + /// Qcow2 metadata file. + file: Arc, + + /// Qcow2 header. + header: Arc
+,
+}
+
+/// I/O back-end for the refblock cache.
+pub(super) struct RefBlockCacheBackend<S: Storage> {
+    /// Qcow2 metadata file.
+    file: Arc<S>,
+
+    /// Qcow2 header.
+    header: Arc<Header>,
+}
+
+impl<S: Storage> L2CacheBackend<S> {
+    /// Create a new `L2CacheBackend`.
+    ///
+    /// `file` is the qcow2 metadata (image) file.
+    pub fn new(file: Arc<S>, header: Arc<Header>) -> Self {
+        L2CacheBackend { file, header }
+    }
+}
+
+impl<S: Storage> AsyncLruCacheBackend for L2CacheBackend<S> {
+    type Key = HostCluster;
+    type Value = L2Table;
+
+    async fn load(&self, l2_cluster: HostCluster) -> io::Result<L2Table> {
+        trace!("Loading L2 table");
+
+        L2Table::load(
+            self.file.as_ref(),
+            &self.header,
+            l2_cluster,
+            self.header.l2_entries(),
+        )
+        .await
+    }
+
+    async fn flush(&self, l2_cluster: HostCluster, l2_table: Arc<L2Table>) -> io::Result<()> {
+        trace!("Flushing L2 table");
+        if l2_table.is_modified() {
+            assert!(l2_table.get_cluster().unwrap() == l2_cluster);
+            l2_table.write(self.file.as_ref()).await?;
+        }
+        Ok(())
+    }
+}
+
+impl<S: Storage> RefBlockCacheBackend<S> {
+    /// Create a new `RefBlockCacheBackend`.
+    ///
+    /// `file` is the qcow2 metadata (image) file.
+    pub fn new(file: Arc<S>, header: Arc<Header>
) -> Self { + RefBlockCacheBackend { file, header } + } +} + +impl AsyncLruCacheBackend for RefBlockCacheBackend { + type Key = HostCluster; + type Value = RefBlock; + + async fn load(&self, rb_cluster: HostCluster) -> io::Result { + RefBlock::load(self.file.as_ref(), &self.header, rb_cluster).await + } + + async fn flush(&self, rb_cluster: HostCluster, refblock: Arc) -> io::Result<()> { + if refblock.is_modified() { + assert!(refblock.get_cluster().unwrap() == rb_cluster); + refblock.write(self.file.as_ref()).await?; + } + Ok(()) + } +} diff --git a/src/imago/src/qcow2/compressed.rs b/src/imago/src/qcow2/compressed.rs new file mode 100644 index 00000000..654b9c38 --- /dev/null +++ b/src/imago/src/qcow2/compressed.rs @@ -0,0 +1,55 @@ +//! Support for compressed clusters. + +use super::*; +use crate::io_buffers::IoBuffer; +use miniz_oxide::inflate::core::{decompress as inflate, DecompressorOxide}; +use miniz_oxide::inflate::TINFLStatus; + +impl + 'static> Qcow2 { + /// Read one compressed cluster. + /// + /// Read the compressed data at `compressed_offset` of length `compressed_length` (which must + /// be the values from the L2 compressed cluster descriptor) into a bounce buffer, then + /// decompress it into `buf` (which must have a length of exactly one cluster). + pub(super) async fn read_compressed_cluster( + &self, + buf: &mut [u8], + compressed_offset: HostOffset, + compressed_length: u64, + ) -> io::Result<()> { + debug_assert!(buf.len() == self.header.cluster_size()); + + let storage = self.storage(); + + // Must fit (really shouldn’t be compressed if this exceeds the cluster size anyway) + let compressed_length = compressed_length.try_into().map_err(io::Error::other)?; + let mut compressed_buf = IoBuffer::new(compressed_length, storage.mem_align())?; + storage + .read(&mut compressed_buf, compressed_offset.0) + .await?; + + let mut dec_ox = DecompressorOxide::new(); + let (status, _read, written) = + inflate(&mut dec_ox, compressed_buf.as_ref().into_slice(), buf, 0, 0); + + // Because `compressed_length` will generally exceed the actual length, `HasMoreOutput` is + // expected and can be ignored + if status != TINFLStatus::Done && status != TINFLStatus::HasMoreOutput { + return Err(io::Error::other(format!( + "Failed to decompress cluster (host offset {}+{}): {:?}", + compressed_offset, compressed_length, status + ))); + } + if written < buf.len() { + return Err(io::Error::other(format!( + "Failed to decompress cluster (host offset {}+{}): Decompressed {} bytes, expected {}", + compressed_offset, + compressed_length, + written, + buf.len(), + ))); + } + + Ok(()) + } +} diff --git a/src/imago/src/qcow2/cow.rs b/src/imago/src/qcow2/cow.rs new file mode 100644 index 00000000..8c31b729 --- /dev/null +++ b/src/imago/src/qcow2/cow.rs @@ -0,0 +1,295 @@ +//! Copy-on-write operations. +//! +//! Implements copy-on-write when writing to clusters that are not simple allocated data clusters. + +use super::*; +use crate::io_buffers::IoBuffer; + +impl> Qcow2 { + /// Do copy-on-write for the given guest cluster, if necessary. + /// + /// If the given guest cluster is backed by an allocated copied data cluster, return that + /// cluster, so it can just be written into. + /// + /// Otherwise, allocate a new data cluster and copy the previously visible cluster contents + /// there: + /// - For non-copied data clusters, copy the cluster contents. + /// - For zero clusters, write zeroes. + /// - For unallocated clusters, copy data from the backing file (if any, zeroes otherwise). 
+ /// - For compressed clusters, decompress the data and write it into the new cluster. + /// + /// Return the new cluster, if any was allocated, or the old cluster in case it was already + /// safe to write to. I.e., the returned cluster is where data for `cluster` may be written + /// to. + /// + /// `cluster` is the guest cluster to COW. + /// + /// `mandatory_host_cluster` may specify the cluster that must be used for the new allocation, + /// or that an existing data cluster allocation must match. If it does not match, or that + /// cluster is already allocated and cannot be used, return `Ok(None)`. + /// + /// `partial_skip_cow` may give an in-cluster range that is supposed to be overwritten + /// immediately anyway, i.e. that need not be copied. + /// + /// `l2_table` is the L2 table for `offset`. + /// + /// If a previously existing allocation is replaced, the old one will be put into + /// `leaked_allocations`. The caller must free it. + pub(super) async fn cow_cluster( + &self, + cluster: GuestCluster, + mandatory_host_cluster: Option, + partial_skip_cow: Option>, + l2_table: &mut L2TableWriteGuard<'_>, + leaked_allocations: &mut Vec<(HostCluster, ClusterCount)>, + ) -> io::Result> { + // No need to do COW when writing the full cluster + let full_skip_cow = if let Some(skip) = partial_skip_cow.as_ref() { + skip.start == 0 && skip.end == self.header.cluster_size() + } else { + false + }; + + let existing_mapping = l2_table.get_mapping(cluster)?; + if let L2Mapping::DataFile { + host_cluster, + copied: true, + } = existing_mapping + { + if let Some(mandatory_host_cluster) = mandatory_host_cluster { + if host_cluster != mandatory_host_cluster { + return Ok(None); + } + } + return Ok(Some(host_cluster)); + }; + + self.need_writable()?; + + let new_cluster = if let L2Mapping::Zero { + host_cluster: Some(host_cluster), + copied: true, + } = existing_mapping + { + if let Some(mandatory_host_cluster) = mandatory_host_cluster { + if host_cluster == mandatory_host_cluster { + Some(host_cluster) + } else { + // Discard existing mapping + self.allocate_data_cluster_at(cluster, Some(mandatory_host_cluster)) + .await? + } + } else { + Some(host_cluster) + } + } else { + self.allocate_data_cluster_at(cluster, mandatory_host_cluster) + .await? + }; + let Some(new_cluster) = new_cluster else { + // Allocation at `mandatory_host_cluster` failed + return Ok(None); + }; + + if !full_skip_cow { + match existing_mapping { + L2Mapping::DataFile { + host_cluster: _, + copied: true, + } => unreachable!(), + + L2Mapping::DataFile { + host_cluster, + copied: false, + } => { + self.cow_copy_storage( + self.storage(), + host_cluster, + new_cluster, + partial_skip_cow, + ) + .await? + } + + L2Mapping::Backing { backing_offset } => { + if let Some(backing) = self.backing.as_ref() { + self.cow_copy_format(backing, backing_offset, new_cluster, partial_skip_cow) + .await? + } else { + self.cow_zero(new_cluster, partial_skip_cow).await? + } + } + + L2Mapping::Zero { + host_cluster: _, + copied: _, + } => self.cow_zero(new_cluster, partial_skip_cow).await?, + + L2Mapping::Compressed { + host_offset, + length, + } => { + self.cow_compressed(host_offset, length, new_cluster) + .await? + } + } + } + + let l2i = cluster.l2_index(self.header.cluster_bits()); + if let Some(leaked) = l2_table.map_cluster(l2i, new_cluster) { + leaked_allocations.push(leaked); + } + + Ok(Some(new_cluster)) + } + + /// Calculate what range of a cluster we need to COW. 
+ /// + /// Given potentially a range to skip, calculate what we should COW. The range will only be + /// taken into account if it is at one end of the cluster, to always yield a continuous range + /// to COW (one without a hole in the middle). + /// + /// The returned range is also aligned to `alignment` if possible. + fn get_cow_range( + &self, + partial_skip_cow: Option>, + alignment: usize, + ) -> Option> { + let mut copy_range = 0..self.header.cluster_size(); + if let Some(partial_skip_cow) = partial_skip_cow { + if partial_skip_cow.start == copy_range.start { + copy_range.start = partial_skip_cow.end; + } else if partial_skip_cow.end == copy_range.end { + copy_range.end = partial_skip_cow.start; + } + } + + if copy_range.is_empty() { + return None; + } + + let alignment = cmp::min(alignment, self.header.cluster_size()); + debug_assert!(alignment.is_power_of_two()); + let mask = alignment - 1; + + if copy_range.start & mask != 0 { + copy_range.start &= !mask; + } + if copy_range.end & mask != 0 { + copy_range.end = (copy_range.end & !mask) + alignment; + } + + Some(copy_range) + } + + /// Copy data from one data file cluster to another. + /// + /// Used for COW on non-copied data clusters. + async fn cow_copy_storage( + &self, + from: &S, + from_cluster: HostCluster, + to_cluster: HostCluster, + partial_skip_cow: Option>, + ) -> io::Result<()> { + let to = self.storage(); + + let align = cmp::max(from.req_align(), to.req_align()); + let Some(cow_range) = self.get_cow_range(partial_skip_cow, align) else { + return Ok(()); + }; + + let mut buf = IoBuffer::new(cow_range.end - cow_range.start, from.mem_align())?; + + let cb = self.header.cluster_bits(); + let from_offset = from_cluster.offset(cb); + let to_offset = to_cluster.offset(cb); + + from.read(&mut buf, from_offset.0 + cow_range.start as u64) + .await?; + + to.write(&buf, to_offset.0 + cow_range.start as u64).await?; + + Ok(()) + } + + /// Copy data from another image into our data file. + /// + /// Used for COW on clusters served by a backing image. + async fn cow_copy_format( + &self, + from: &F, + from_offset: u64, + to_cluster: HostCluster, + partial_skip_cow: Option>, + ) -> io::Result<()> { + let to = self.storage(); + let from = from.unwrap(); + + let align = cmp::max(from.req_align(), to.req_align()); + let Some(cow_range) = self.get_cow_range(partial_skip_cow, align) else { + return Ok(()); + }; + + let mut buf = IoBuffer::new(cow_range.end - cow_range.start, from.mem_align())?; + + let to_offset = to_cluster.offset(self.header.cluster_bits()); + + from.read(&mut buf, from_offset + cow_range.start as u64) + .await?; + + to.write(&buf, to_offset.0 + cow_range.start as u64).await?; + + Ok(()) + } + + /// Fill the given cluster with zeroes. + /// + /// Used for COW on zero clusters. + async fn cow_zero( + &self, + to_cluster: HostCluster, + partial_skip_cow: Option>, + ) -> io::Result<()> { + let to = self.storage(); + + let align = to.req_align(); + let Some(cow_range) = self.get_cow_range(partial_skip_cow, align) else { + return Ok(()); + }; + + let to_offset = to_cluster.offset(self.header.cluster_bits()); + to.write_zeroes( + to_offset.0 + cow_range.start as u64, + (cow_range.end - cow_range.start) as u64, + ) + .await?; + + Ok(()) + } + + /// Decompress a cluster into the target cluster. + /// + /// Used for COW on compressed clusters. 
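+    //
+    // Illustrative aside (not part of imago): the range widening performed by
+    // `get_cow_range()` above amounts to rounding the start down and the end
+    // up to the given power-of-two alignment, e.g.:
+    //
+    //     fn widen(mut r: std::ops::Range<usize>, alignment: usize) -> std::ops::Range<usize> {
+    //         let mask = alignment - 1;
+    //         r.start &= !mask;
+    //         if r.end & mask != 0 {
+    //             r.end = (r.end & !mask) + alignment;
+    //         }
+    //         r
+    //     }
+    //
+    //     assert_eq!(widen(700..1800, 512), 512..2048);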
+ async fn cow_compressed( + &self, + compressed_offset: HostOffset, + compressed_length: u64, + to_cluster: HostCluster, + ) -> io::Result<()> { + let to = self.storage(); + + let mut buf = IoBuffer::new(self.header.cluster_size(), to.mem_align())?; + self.read_compressed_cluster( + buf.as_mut().into_slice(), + compressed_offset, + compressed_length, + ) + .await?; + + let to_offset = to_cluster.offset(self.header.cluster_bits()); + to.write(&buf, to_offset.0).await?; + + Ok(()) + } +} diff --git a/src/imago/src/qcow2/io_func.rs b/src/imago/src/qcow2/io_func.rs new file mode 100644 index 00000000..d1076d98 --- /dev/null +++ b/src/imago/src/qcow2/io_func.rs @@ -0,0 +1,81 @@ +//! Special I/O functions. +//! +//! Most of I/O should be implemented in the generic +//! [`imago::format::access`](crate::format::access) module, but some I/O needs to be done directly +//! by image drivers (like handling compression). + +use super::*; +use crate::io_buffers::IoBuffer; + +impl> Qcow2 { + /// Read the special range at `offset`. + /// + /// Currently, the only special range we have are compressed clusters. + pub(super) async fn do_readv_special( + &self, + mut bufv: IoVectorMut<'_>, + mut offset: GuestOffset, + ) -> io::Result<()> { + let mut saved_l2_table: Option> = None; + let cb = self.header.cluster_bits(); + + // Do everything cluster by cluster. + while !bufv.is_empty() { + let l2_table = if let Some(saved) = saved_l2_table.as_ref() { + saved + } else { + let new_l2 = self + .get_l2(offset, false) + .await? + .ok_or(io::ErrorKind::Other)?; + saved_l2_table.get_or_insert(new_l2) + }; + + let chunk_length = offset.remaining_in_cluster(cb); + let (chunk, remainder) = bufv.split_at(chunk_length); + bufv = remainder; + + let mut bounce_buffer_and_chunk = None; + let need_bounce_buffer = chunk.buffer_count() != 1 + || offset.in_cluster_offset(cb) != 0 + || chunk.len() != self.header.cluster_size() as u64; + + let slice = if need_bounce_buffer { + let bounce_buffer = IoBuffer::new(self.header.cluster_size(), 1)?; + bounce_buffer_and_chunk = Some((bounce_buffer, chunk)); + bounce_buffer_and_chunk.as_mut().unwrap().0.as_mut() + } else { + chunk.into_inner().pop().unwrap().into() + }; + + let guest_cluster = offset.cluster(cb); + match l2_table.get_mapping(guest_cluster)? { + L2Mapping::Compressed { + host_offset, + length, + } => { + self.read_compressed_cluster(slice.into_slice(), host_offset, length) + .await?; + } + + _ => return Err(io::ErrorKind::Other.into()), + } + + if let Some((bounce_buffer, mut chunk)) = bounce_buffer_and_chunk { + let ofs = offset.in_cluster_offset(cb); + let end = ofs + chunk.len() as usize; + chunk.copy_from_slice(bounce_buffer.as_ref_range(ofs..end).into_slice()); + } + + let next_cluster = if let Some(next) = guest_cluster.next_in_l2(cb) { + next + } else { + saved_l2_table.take(); + guest_cluster.first_in_next_l2(cb) + }; + offset = next_cluster.offset(cb); + } + + Ok(()) + } +} diff --git a/src/imago/src/qcow2/mappings.rs b/src/imago/src/qcow2/mappings.rs new file mode 100644 index 00000000..80dcaf02 --- /dev/null +++ b/src/imago/src/qcow2/mappings.rs @@ -0,0 +1,346 @@ +//! Get and establish cluster mappings. + +use super::*; +use tokio::sync::RwLockWriteGuard; + +impl> Qcow2 { + /// Get the given range’s mapping information. + /// + /// Underlying implementation for [`Qcow2::get_mapping()`]. 
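+    ///
+    /// # Example
+    ///
+    /// Hypothetical sketch of how a caller might consume the result (not a
+    /// doctest; the image handle and the numbers are made up):
+    ///
+    /// ```ignore
+    /// let (mapping, len) = qcow2.do_get_mapping(GuestOffset(0), 65536).await?;
+    /// match mapping {
+    ///     Mapping::Raw { offset, writable, .. } => { /* `len` bytes at `offset` in the data file */ }
+    ///     Mapping::Indirect { layer, offset, .. } => { /* forward to the backing `layer` */ }
+    ///     Mapping::Zero => { /* the range reads as zeroes */ }
+    ///     Mapping::Special { .. } => { /* e.g. compressed; handled by `do_readv_special()` */ }
+    /// }
+    /// ```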
+ pub(super) async fn do_get_mapping( + &self, + offset: GuestOffset, + max_length: u64, + ) -> io::Result<(Mapping<'_, S>, u64)> { + let Some(l2_table) = self.get_l2(offset, false).await? else { + let cb = self.header.cluster_bits(); + let len = cmp::min(offset.remaining_in_l2_table(cb), max_length); + let mapping = if let Some(backing) = self.backing.as_ref() { + Mapping::Indirect { + layer: backing.unwrap(), + offset: offset.0, + writable: false, + } + } else { + Mapping::Zero + }; + return Ok((mapping, len)); + }; + + self.do_get_mapping_with_l2(offset, max_length, &l2_table) + .await + } + + /// Get the given range’s mapping information, when we already have the L2 table. + pub(super) async fn do_get_mapping_with_l2( + &self, + offset: GuestOffset, + max_length: u64, + l2_table: &L2Table, + ) -> io::Result<(Mapping<'_, S>, u64)> { + let cb = self.header.cluster_bits(); + + // Get mapping at `offset` + let mut current_guest_cluster = offset.cluster(cb); + let first_mapping = l2_table.get_mapping(current_guest_cluster)?; + let return_mapping = match first_mapping { + L2Mapping::DataFile { + host_cluster, + copied, + } => Mapping::Raw { + storage: self.storage(), + offset: host_cluster.relative_offset(offset, cb).0, + writable: copied, + }, + + L2Mapping::Backing { backing_offset } => { + if let Some(backing) = self.backing.as_ref() { + Mapping::Indirect { + layer: backing.unwrap(), + offset: backing_offset + offset.in_cluster_offset(cb) as u64, + writable: false, + } + } else { + Mapping::Zero + } + } + + L2Mapping::Zero { + host_cluster: _, + copied: _, + } => Mapping::Zero, + + L2Mapping::Compressed { + host_offset: _, + length: _, + } => Mapping::Special { offset: offset.0 }, + }; + + // Find out how long this consecutive mapping is, but only within the current L2 table + let mut consecutive_length = offset.remaining_in_cluster(cb); + let mut preceding_mapping = first_mapping; + while consecutive_length < max_length { + let Some(next) = current_guest_cluster.next_in_l2(cb) else { + break; + }; + current_guest_cluster = next; + + let mapping = l2_table.get_mapping(current_guest_cluster)?; + if !mapping.is_consecutive(&preceding_mapping, cb) { + break; + } + + preceding_mapping = mapping; + consecutive_length += self.header.cluster_size() as u64; + } + + consecutive_length = cmp::min(consecutive_length, max_length); + Ok((return_mapping, consecutive_length)) + } + + /// Make the given range be mapped by data clusters. + /// + /// Underlying implementation for [`Qcow2::ensure_data_mapping()`]. + pub(super) async fn do_ensure_data_mapping( + &self, + offset: GuestOffset, + length: u64, + overwrite: bool, + ) -> io::Result<(&S, u64, u64)> { + let l2_table = self.ensure_l2(offset).await?; + + // Fast path for if everything is already allocated, which should be the common case at + // runtime. + // It must really be everything, though; we know our caller will want to have everything + // allocated eventually, so if anything is missing, go down to the allocation path so we + // try to allocate clusters such that they are not fragmented (if possible) and we can + // return as big of a single mapping as possible. 
+ let existing = self + .do_get_mapping_with_l2(offset, length, &l2_table) + .await?; + if let Mapping::Raw { + storage, + offset, + writable: true, + } = existing.0 + { + if existing.1 >= length { + return Ok((storage, offset, existing.1)); + } + } + + let l2_table = l2_table.lock_write().await; + let mut leaked_allocations = Vec::<(HostCluster, ClusterCount)>::new(); + + let res = self + .ensure_data_mapping_no_cleanup( + offset, + length, + overwrite, + l2_table, + &mut leaked_allocations, + ) + .await; + + for alloc in leaked_allocations { + self.free_data_clusters(alloc.0, alloc.1).await; + } + let (host_offset, length) = res?; + + Ok((self.storage(), host_offset, length)) + } + + /// Get the L2 table referenced by the given L1 table index, if any. + /// + /// `writable` says whether the L2 table should be modifiable. + /// + /// If the L1 table index does not point to any L2 table, or the existing entry is not + /// modifiable but `writable` is true, return `Ok(None)`. + pub(super) async fn get_l2( + &self, + offset: GuestOffset, + writable: bool, + ) -> io::Result>> { + let cb = self.header.cluster_bits(); + + let l1_entry = self.l1_table.read().await.get(offset.l1_index(cb)); + if let Some(l2_offset) = l1_entry.l2_offset() { + if writable && !l1_entry.is_copied() { + return Ok(None); + } + let l2_cluster = l2_offset.checked_cluster(cb).ok_or_else(|| { + invalid_data(format!( + "Unaligned L2 table for {offset:?}; L1 entry: {l1_entry:?}" + )) + })?; + + self.l2_cache.get_or_insert(l2_cluster).await.map(Some) + } else { + Ok(None) + } + } + + /// Get a L2 table for the given L1 table index. + /// + /// If there already is an L2 table at that index, return it. Otherwise, create one and hook + /// it up. + pub(super) async fn ensure_l2(&self, offset: GuestOffset) -> io::Result> { + let cb = self.header.cluster_bits(); + + if let Some(l2) = self.get_l2(offset, true).await? { + return Ok(l2); + } + + self.need_writable()?; + + let mut l1_locked = self.l1_table.write().await; + let l1_index = offset.l1_index(cb); + if !l1_locked.in_bounds(l1_index) { + l1_locked = self.grow_l1_table(l1_locked, l1_index).await?; + } + + let l1_entry = l1_locked.get(l1_index); + let mut l2_table = if let Some(l2_offset) = l1_entry.l2_offset() { + let l2_cluster = l2_offset.checked_cluster(cb).ok_or_else(|| { + invalid_data(format!( + "Unaligned L2 table for {offset:?}; L1 entry: {l1_entry:?}" + )) + })?; + + let l2 = self.l2_cache.get_or_insert(l2_cluster).await?; + if l1_entry.is_copied() { + return Ok(l2); + } + + L2Table::clone(&l2) + } else { + L2Table::new_cleared(&self.header) + }; + + let l2_cluster = self.allocate_meta_cluster().await?; + l2_table.set_cluster(l2_cluster); + l2_table.write(self.metadata.as_ref()).await?; + + l1_locked.enter_l2_table(l1_index, &l2_table)?; + l1_locked + .write_entry(self.metadata.as_ref(), l1_index) + .await?; + + // Free old L2 table, if any + if let Some(l2_offset) = l1_entry.l2_offset() { + self.free_meta_clusters(l2_offset.cluster(cb), ClusterCount(1)) + .await; + } + + let l2_table = Arc::new(l2_table); + self.l2_cache + .insert(l2_cluster, Arc::clone(&l2_table)) + .await?; + Ok(l2_table) + } + + /// Create a new L1 table covering at least `at_least_index`. + /// + /// Create a new L1 table of the required size with all the entries of the previous L1 table. 
+ async fn grow_l1_table<'a>( + &self, + mut l1_locked: RwLockWriteGuard<'a, L1Table>, + at_least_index: usize, + ) -> io::Result> { + let mut new_l1 = l1_locked.clone_and_grow(at_least_index, &self.header)?; + + let l1_start = self.allocate_meta_clusters(new_l1.cluster_count()).await?; + + new_l1.set_cluster(l1_start); + new_l1.write(self.metadata.as_ref()).await?; + + self.header.set_l1_table(&new_l1)?; + self.header + .write_l1_table_pointer(self.metadata.as_ref()) + .await?; + + if let Some(old_l1_cluster) = l1_locked.get_cluster() { + let old_l1_size = l1_locked.cluster_count(); + l1_locked.unset_cluster(); + self.free_meta_clusters(old_l1_cluster, old_l1_size).await; + } + + *l1_locked = new_l1; + + Ok(l1_locked) + } + + /// Inner implementation for [`Qcow2::do_ensure_data_mapping()`]. + /// + /// Does not do any clean-up: The L2 table will probably be modified, but not written to disk. + /// Any existing allocations that have been removed from it (and are thus leaked) are entered + /// into `leaked_allocations`, but not freed. + /// + /// The caller must do both, ensuring it is done both in case of success and in case of error. + async fn ensure_data_mapping_no_cleanup( + &self, + offset: GuestOffset, + full_length: u64, + overwrite: bool, + mut l2_table: L2TableWriteGuard<'_>, + leaked_allocations: &mut Vec<(HostCluster, ClusterCount)>, + ) -> io::Result<(u64, u64)> { + let cb = self.header.cluster_bits(); + + let partial_skip_cow = overwrite.then(|| { + let start = offset.in_cluster_offset(cb); + let end = cmp::min(start as u64 + full_length, 1 << cb) as usize; + start..end + }); + + let mut current_guest_cluster = offset.cluster(cb); + + // Without a mandatory host offset, this should never return `Ok(None)` + let host_cluster = self + .cow_cluster( + current_guest_cluster, + None, + partial_skip_cow, + &mut l2_table, + leaked_allocations, + ) + .await? + .ok_or_else(|| io::Error::other("Internal allocation error"))?; + + let host_offset_start = host_cluster.relative_offset(offset, cb); + let mut allocated_length = offset.remaining_in_cluster(cb); + let mut current_host_cluster = host_cluster; + + while allocated_length < full_length { + let Some(next) = current_guest_cluster.next_in_l2(cb) else { + break; + }; + current_guest_cluster = next; + + let chunk_length = cmp::min(full_length - allocated_length, 1 << cb) as usize; + let partial_skip_cow = overwrite.then(|| 0..chunk_length); + + let next_host_cluster = current_host_cluster + ClusterCount(1); + let host_cluster = self + .cow_cluster( + current_guest_cluster, + Some(next_host_cluster), + partial_skip_cow, + &mut l2_table, + leaked_allocations, + ) + .await?; + + let Some(host_cluster) = host_cluster else { + // Cannot continue continuous mapping range + break; + }; + assert!(host_cluster == next_host_cluster); + current_host_cluster = host_cluster; + + allocated_length += chunk_length as u64; + } + + Ok((host_offset_start.0, allocated_length)) + } +} diff --git a/src/imago/src/qcow2/metadata.rs b/src/imago/src/qcow2/metadata.rs new file mode 100644 index 00000000..e32a2a40 --- /dev/null +++ b/src/imago/src/qcow2/metadata.rs @@ -0,0 +1,2545 @@ +//! Functionality for working with qcow2 metadata. 
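+//!
+//! As a small self-contained illustration (plain `std`, not imago API): the
+//! magic checked below is the big-endian encoding of `0x514649fb`, i.e. the
+//! ASCII bytes "QFI" followed by `0xfb`:
+//!
+//! ```
+//! let magic: u32 = 0x5146_49fb;
+//! assert_eq!(&magic.to_be_bytes()[..3], b"QFI");
+//! assert_eq!(magic.to_be_bytes()[3], 0xfb);
+//! ```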
+ +use super::types::*; +use crate::io_buffers::IoBuffer; +use crate::macros::numerical_enum; +use crate::misc_helpers::invalid_data; +use crate::{Storage, StorageExt}; +use bincode::Options; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::mem::size_of; +use std::num::TryFromIntError; +use std::sync::atomic::{AtomicBool, AtomicU16, AtomicU32, AtomicU64, AtomicU8, Ordering}; +use std::{cmp, io}; +use tokio::sync::{Mutex, MutexGuard}; +use tracing::error; + +/// Qcow header magic ("QFI\xfb"). +const MAGIC: u32 = 0x51_46_49_fb; + +/// Maximum file length. +const MAX_FILE_LENGTH: u64 = 0x0100_0000_0000_0000u64; + +/// Maximum permissible host offset. +pub(super) const MAX_OFFSET: HostOffset = HostOffset(MAX_FILE_LENGTH - 512); + +/// Minimum cluster size. +/// +/// Defined by the specification. +pub(super) const MIN_CLUSTER_SIZE: usize = 512; + +/// Maximum cluster size. +/// +/// This is QEMU’s limit, so we can apply it, too. +pub(super) const MAX_CLUSTER_SIZE: usize = 2 * 1024 * 1024; + +/// Minimum number of bits per refcount entry. +pub(super) const MIN_REFCOUNT_WIDTH: usize = 1; + +/// Maximum number of bits per refcount entry. +pub(super) const MAX_REFCOUNT_WIDTH: usize = 64; + +/// Qcow2 v2 header. +#[derive(Deserialize, Serialize)] +struct V2Header { + /// Qcow magic string ("QFI\xfb"). + magic: u32, + + /// Version number (valid values are 2 and 3). + version: u32, + + /// Offset into the image file at which the backing file name is stored (NB: The string is not + /// null terminated). 0 if the image doesn’t have a backing file. + /// + /// Note: backing files are incompatible with raw external data files (auto-clear feature bit + /// 1). + backing_file_offset: u64, + + /// Length of the backing file name in bytes. Must not be longer than 1023 bytes. Undefined + /// if the image doesn’t have a backing file. + backing_file_size: u32, + + /// Number of bits that are used for addressing an offset within a cluster (`1 << cluster_bits` + /// is the cluster size). Must not be less than 9 (i.e. 512 byte clusters). + /// + /// Note: qemu as of today has an implementation limit of 2 MB as the maximum cluster size and + /// won’t be able to open images with larger cluster sizes. + /// + /// Note: if the image has Extended L2 Entries then `cluster_bits` must be at least 14 (i.e. + /// 16384 byte clusters). + cluster_bits: u32, + + /// Virtual disk size in bytes. + /// + /// Note: qemu has an implementation limit of 32 MB as the maximum L1 table size. With a 2 MB + /// cluster size, it is unable to populate a virtual cluster beyond 2 EB (61 bits); with a 512 + /// byte cluster size, it is unable to populate a virtual size larger than 128 GB (37 bits). + /// Meanwhile, L1/L2 table layouts limit an image to no more than 64 PB (56 bits) of populated + /// clusters, and an image may hit other limits first (such as a file system’s maximum size). + size: u64, + + /// Encryption method: + /// + /// 0. no encryption + /// 1. AES encryption + /// 2. LUKS encryption + crypt_method: u32, + + /// Number of entries in the active L1 table. + l1_size: AtomicU32, + + /// Offset into the image file at which the active L1 table starts. Must be aligned to a + /// cluster boundary. + l1_table_offset: AtomicU64, + + /// Offset into the image file at which the refcount table starts. Must be aligned to a + /// cluster boundary. + refcount_table_offset: AtomicU64, + + /// Number of clusters that the refcount table occupies. 
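+    ///
+    /// Illustrative arithmetic (not imago API): with the defaults of 64 KiB
+    /// clusters and 16-bit refcounts, each 8-byte reftable entry references a
+    /// refblock covering 32768 clusters, so a single reftable cluster (8192
+    /// entries) covers 16 TiB of host file:
+    ///
+    /// ```
+    /// assert_eq!(8192u64 * 32768 * 65536, 16u64 << 40);
+    /// ```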
+ refcount_table_clusters: AtomicU32, + + /// Number of snapshots contained in the image. + nb_snapshots: u32, + + /// Offset into the image file at which the snapshot table starts. Must be aligned to a + /// cluster boundary. + snapshots_offset: u64, +} + +impl V2Header { + /// Raw v2 header length. + const RAW_SIZE: usize = 72; +} + +/// Qcow2 v3 header. +#[derive(Deserialize, Serialize)] +struct V3HeaderBase { + /// Bitmask of incompatible features. An implementation must fail to open an image if an + /// unknown bit is set. + /// + /// 0. Dirty bit. If this bit is set then refcounts may be inconsistent, make sure to scan + /// L1/L2 tables to repair refcounts before accessing the image. + /// 1. Corrupt bit. If this bit is set then any data structure may be corrupt and the image + /// must not be written to (unless for regaining consistency). + /// 2. External data file bit. If this bit is set, an external data file is used. Guest + /// clusters are then stored in the external data file. For such images, clusters in the + /// external data file are not refcounted. The offset field in the Standard Cluster + /// Descriptor must match the guest offset and neither compressed clusters nor internal + /// snapshots are supported. An External Data File Name header extension may be present if + /// this bit is set. + /// 3. Compression type bit. If this bit is set, a non-default compression is used for + /// compressed clusters. The compression_type field must be present and not zero. + /// 4. Extended L2 Entries. If this bit is set then L2 table entries use an extended format + /// that allows subcluster-based allocation. See the Extended L2 Entries section for more + /// details. + /// + /// Bits 5-63 are reserved (set to 0). + incompatible_features: u64, + + /// Bitmask of compatible features. An implementation can safely ignore any unknown bits that + /// are set. + /// + /// 0. Lazy refcounts bit. If this bit is set then lazy refcount updates can be used. This + /// means marking the image file dirty and postponing refcount metadata updates. + /// + /// Bits 1-63 are reserved (set to 0). + compatible_features: u64, + + /// Bitmask of auto-clear features. An implementation may only write to an image with unknown + /// auto-clear features if it clears the respective bits from this field first. + /// + /// 0. Bitmaps extension bit. This bit indicates consistency for the bitmaps extension data. + /// It is an error if this bit is set without the bitmaps extension present. If the bitmaps + /// extension is present but this bit is unset, the bitmaps extension data must be + /// considered inconsistent. + /// 1. Raw external data bit. If this bit is set, the external data file can be read as a + /// consistent standalone raw image without looking at the qcow2 metadata. Setting this bit + /// has a performance impact for some operations on the image (e.g. writing zeros requires + /// writing to the data file instead of only setting the zero flag in the L2 table entry) + /// and conflicts with backing files. This bit may only be set if the External Data File + /// bit (incompatible feature bit 1) is also set. + /// + /// Bits 2-63 are reserved (set to 0). + autoclear_features: u64, + + /// Describes the width of a reference count block entry (width in bits: `refcount_bits = 1 << + /// refcount_order`). For version 2 images, the order is always assumed to be 4 (i.e. + /// `refcount_bits = 16`). This value may not exceed 6 (i.e. `refcount_bits = 64`). 
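+    ///
+    /// Illustrative arithmetic (not imago API): with the default
+    /// `refcount_order = 4` and 64 KiB clusters, refcounts are 16 bits wide
+    /// and one refcount block holds 32768 entries:
+    ///
+    /// ```
+    /// let cluster_bits = 16u32;
+    /// let refcount_order = 4u32;
+    /// assert_eq!(1u32 << refcount_order, 16); // bits per refcount entry
+    /// assert_eq!(1u64 << (cluster_bits - (refcount_order - 3)), 32768); // entries per refblock
+    /// ```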
+ refcount_order: u32, + + /// Length of the header structure in bytes. For version 2 images, the length is always + /// assumed to be 72 bytes. For version 3 it’s at least 104 bytes and must be a multiple of 8. + header_length: u32, +} + +impl V3HeaderBase { + /// Raw v3 header length beyond the v2 header. + const RAW_SIZE: usize = 104 - V2Header::RAW_SIZE; +} + +impl Default for V3HeaderBase { + fn default() -> Self { + V3HeaderBase { + incompatible_features: 0, + compatible_features: 0, + autoclear_features: 0, + refcount_order: 4, + header_length: (V2Header::RAW_SIZE + V3HeaderBase::RAW_SIZE) as u32, + } + } +} + +numerical_enum! { + /// Incompatible feature bits. + pub(super) enum IncompatibleFeatures as u64 { + Dirty = 1 << 0, + Corrupt = 1 << 1, + ExternalDataFile = 1 << 2, + CompressionType = 1 << 3, + ExtendedL2Entries = 1 << 4, + } +} + +numerical_enum! { + /// Extension type IDs. + pub(super) enum HeaderExtensionType as u32 { + /// End of extension list. + End = 0, + + /// Backing file format string. + BackingFileFormat = 0xe2792aca, + + /// Map of feature bits to human-readable names. + FeatureNameTable = 0x6803f857, + + /// External data file filename string. + ExternalDataFileName = 0x44415441, + } +} + +/// Header for a header extension. +#[derive(Default, Deserialize, Serialize)] +struct HeaderExtensionHeader { + /// Type code of the header extension. + extension_type: u32, + + /// Data length. + length: u32, +} + +impl HeaderExtensionHeader { + /// Raw struct length. + const RAW_SIZE: usize = 8; +} + +numerical_enum! { + /// Feature type ID for the feature name table. + #[derive(Hash)] + pub(super) enum FeatureType as u8 { + Incompatible = 0, + Compatible = 1, + Autoclear = 2, + } +} + +/// Header extensions (high-level representation). +#[derive(Debug, Clone, Eq, PartialEq)] +pub(super) enum HeaderExtension { + /// Backing file format string. + BackingFileFormat(String), + + /// Map of feature bits to human-readable names. + FeatureNameTable(HashMap<(FeatureType, u8), String>), + + /// External data file filename string. + ExternalDataFileName(String), + + /// Unknown extension. + Unknown { + /// Type. + extension_type: u32, + /// Data (as read). + data: Vec, + }, +} + +/// Integrated header representation. +pub(super) struct Header { + /// v2 part of the header. + v2: V2Header, + + /// Base v3 part of the header. + v3: V3HeaderBase, + + /// Unrecognized header fields. + unknown_header_fields: Vec, + + /// Backing filename string. + backing_filename: Option, + + /// Extensions. + extensions: Vec, + + /// Whether an external data file is required. + external_data_file: bool, +} + +impl Header { + /// Load the qcow2 header from disk. + /// + /// If `writable` is false, do not perform any modifications (e.g. clearing auto-clear bits). + pub async fn load(image: &S, writable: bool) -> io::Result { + let bincode = bincode::DefaultOptions::new() + .with_fixint_encoding() + .with_big_endian(); + + let mut header_buf = vec![0u8; V2Header::RAW_SIZE]; + image.read(header_buf.as_mut_slice(), 0).await?; + + let header: V2Header = bincode.deserialize(&header_buf).map_err(invalid_data)?; + if header.magic != MAGIC { + return Err(invalid_data("Not a qcow2 file")); + } + + let v3header_base = if header.version == 2 { + V3HeaderBase::default() + } else if header.version == 3 { + let mut header_buf = vec![0u8; V3HeaderBase::RAW_SIZE]; + image + .read(header_buf.as_mut_slice(), V2Header::RAW_SIZE as u64) + .await?; + bincode.deserialize(&header_buf).map_err(invalid_data)? 
+ } else { + return Err(invalid_data(format!( + "qcow2 v{} is not supported", + header.version + ))); + }; + + let cluster_size = 1usize.checked_shl(header.cluster_bits).ok_or_else(|| { + invalid_data(format!("Invalid cluster size: 2^{}", header.cluster_bits)) + })?; + if !(MIN_CLUSTER_SIZE..=MAX_CLUSTER_SIZE).contains(&cluster_size) { + return Err(invalid_data(format!( + "Invalid cluster size: {}; must be between {} and {}", + cluster_size, MIN_CLUSTER_SIZE, MAX_CLUSTER_SIZE, + ))); + } + + let min_header_size = V2Header::RAW_SIZE + V3HeaderBase::RAW_SIZE; + if (v3header_base.header_length as usize) < min_header_size { + return Err(invalid_data(format!( + "qcow2 header too short: {} < {}", + v3header_base.header_length, min_header_size, + ))); + } else if (v3header_base.header_length as usize) > cluster_size { + return Err(invalid_data(format!( + "qcow2 header too big: {} > {}", + v3header_base.header_length, cluster_size, + ))); + } + + let unknown_header_fields = if header.version == 2 { + Vec::new() + } else { + let mut unknown_header_fields = + vec![0u8; v3header_base.header_length as usize - min_header_size]; + image + .read(&mut unknown_header_fields, min_header_size as u64) + .await?; + unknown_header_fields + }; + + let l1_offset = HostOffset(header.l1_table_offset.load(Ordering::Relaxed)); + l1_offset + .checked_cluster(header.cluster_bits) + .ok_or_else(|| invalid_data(format!("Unaligned L1 table: {l1_offset}")))?; + + let rt_offset = HostOffset(header.refcount_table_offset.load(Ordering::Relaxed)); + rt_offset + .checked_cluster(header.cluster_bits) + .ok_or_else(|| invalid_data(format!("Unaligned refcount table: {rt_offset}")))?; + + let rc_width = 1usize + .checked_shl(v3header_base.refcount_order) + .ok_or_else(|| { + invalid_data(format!( + "Invalid refcount width: 2^{}", + v3header_base.refcount_order + )) + })?; + if !(MIN_REFCOUNT_WIDTH..=MAX_REFCOUNT_WIDTH).contains(&rc_width) { + return Err(invalid_data(format!( + "Invalid refcount width: {}; must be between {} and {}", + rc_width, MIN_REFCOUNT_WIDTH, MAX_REFCOUNT_WIDTH, + ))); + } + + let backing_filename = if header.backing_file_offset != 0 { + let (offset, length) = (header.backing_file_offset, header.backing_file_size); + if length > 1023 { + return Err(invalid_data(format!( + "Backing file name is too long ({length}, must not exceed 1023)" + ))); + } + + let end = offset.checked_add(length as u64).ok_or(invalid_data( + "Backing file name offset is invalid (too high)", + ))?; + if end >= cluster_size as u64 { + return Err(invalid_data( + "Backing file name offset is invalid (beyond first cluster)", + )); + } + + let mut backing_buf = vec![0; length as usize]; + image.read(&mut backing_buf, offset).await?; + + Some( + String::from_utf8(backing_buf) + .map_err(|err| invalid_data(format!("Backing file name is invalid: {err}")))?, + ) + } else { + None + }; + + let extensions = if header.version == 2 { + Vec::new() + } else { + let mut ext_offset: u64 = v3header_base.header_length as u64; + let mut extensions = Vec::::new(); + loop { + if ext_offset + HeaderExtensionHeader::RAW_SIZE as u64 > cluster_size as u64 { + return Err(invalid_data("Header extensions exceed the first cluster")); + } + + let mut ext_hdr_buf = vec![0; HeaderExtensionHeader::RAW_SIZE]; + image.read(&mut ext_hdr_buf, ext_offset).await?; + + ext_offset += HeaderExtensionHeader::RAW_SIZE as u64; + + let ext_hdr: HeaderExtensionHeader = + bincode.deserialize(&ext_hdr_buf).map_err(invalid_data)?; + let ext_end = ext_offset + .checked_add(ext_hdr.length 
as u64) + .ok_or_else(|| invalid_data("Header size overflow"))?; + if ext_end > cluster_size as u64 { + return Err(invalid_data("Header extensions exceed the first cluster")); + } + + let mut ext_data = vec![0; ext_hdr.length as usize]; + image.read(&mut ext_data, ext_offset).await?; + + ext_offset += (ext_hdr.length as u64).next_multiple_of(8); + + let Some(extension) = + HeaderExtension::deserialize(ext_hdr.extension_type, ext_data)? + else { + break; + }; + + extensions.push(extension); + } + extensions + }; + + // Check for header extension conflicts + let backing_fmt = extensions + .iter() + .find(|ext| matches!(ext, HeaderExtension::BackingFileFormat(_))); + if let Some(backing_fmt) = backing_fmt { + let conflicting = extensions.iter().find(|ext| { + matches!(ext, HeaderExtension::BackingFileFormat(_)) && ext != &backing_fmt + }); + if let Some(conflicting) = conflicting { + return Err(io::Error::other(format!( + "Found conflicting backing file formats: {:?} != {:?}", + backing_fmt, conflicting + ))); + } + } + let ext_data_file = extensions + .iter() + .find(|ext| matches!(ext, HeaderExtension::ExternalDataFileName(_))); + if let Some(ext_data_file) = ext_data_file { + let conflicting = extensions.iter().find(|ext| { + matches!(ext, HeaderExtension::ExternalDataFileName(_)) && ext != &ext_data_file + }); + if let Some(conflicting) = conflicting { + return Err(io::Error::other(format!( + "Found conflicting external data file names: {:?} != {:?}", + ext_data_file, conflicting + ))); + } + } + + let mut incompatible_features = v3header_base.incompatible_features; + let autoclear_features = v3header_base.autoclear_features; + + let external_data_file = + incompatible_features & IncompatibleFeatures::ExternalDataFile as u64 != 0; + incompatible_features &= !(IncompatibleFeatures::ExternalDataFile as u64); + + let mut header = Header { + v2: header, + v3: v3header_base, + unknown_header_fields, + backing_filename, + extensions, + external_data_file, + }; + + // No need to clear autoclear features for read-only images + if autoclear_features != 0 && writable { + header.v3.autoclear_features = 0; + header.write(image).await?; + } + + if incompatible_features != 0 { + let feats = (0..64) + .filter(|bit| header.v3.incompatible_features & (1u64 << bit) != 0) + .map(|bit| { + if let Some(name) = header.feature_name(FeatureType::Incompatible, bit) { + format!("{bit} ({name})") + } else { + format!("{bit}") + } + }) + .collect::>(); + + return Err(invalid_data(format!( + "Unrecognized incompatible feature(s) {}", + feats.join(", ") + ))); + } + + Ok(header) + } + + /// Write the qcow2 header to disk. 
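+    ///
+    /// Illustrative arithmetic (not imago API): a v3 header without unknown
+    /// trailing fields serializes to 72 bytes (v2 part) plus 32 bytes (v3
+    /// base), which is already the 8-byte-aligned minimum of 104 bytes:
+    ///
+    /// ```
+    /// assert_eq!((72usize + 32).next_multiple_of(8), 104);
+    /// ```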
+ pub async fn write(&mut self, image: &S) -> io::Result<()> { + let bincode = bincode::DefaultOptions::new() + .with_fixint_encoding() + .with_big_endian(); + + let header_len = if self.v2.version > 2 { + let len = bincode.serialized_size(&self.v2).unwrap() as usize + + bincode.serialized_size(&self.v3).unwrap() as usize + + self.unknown_header_fields.len(); + let len = len.next_multiple_of(8); + self.v3.header_length = len as u32; + len + } else { + V2Header::RAW_SIZE + }; + + let mut header_exts = self.serialize_extensions()?; + + if let Some(backing) = self.backing_filename.as_ref() { + let offset = header_len + header_exts.len(); + let size = backing.len(); // length in bytes + let end = offset.checked_add(size).ok_or_else(|| { + io::Error::other("Header plus header extensions plus backing filename is too long") + })?; + if end > self.cluster_size() { + return Err(io::Error::other( + "Header plus header extensions plus backing filename is too long", + ))?; + } + self.v2.backing_file_offset = offset as u64; + self.v2.backing_file_size = size as u32; + } else { + self.v2.backing_file_offset = 0; + self.v2.backing_file_size = 0; + } + + let mut full_buf = bincode.serialize(&self.v2).map_err(invalid_data)?; + if self.v2.version > 2 { + full_buf.append(&mut bincode.serialize(&self.v3).map_err(invalid_data)?); + full_buf.extend_from_slice(&self.unknown_header_fields); + full_buf.resize(full_buf.len().next_multiple_of(8), 0); + } + + full_buf.append(&mut header_exts); + + if let Some(backing) = self.backing_filename.as_ref() { + full_buf.extend_from_slice(backing.as_bytes()); + } + + if full_buf.len() > self.cluster_size() { + return Err(io::Error::other(format!( + "Header is too big to write ({}, larger than a cluster ({}))", + full_buf.len(), + self.cluster_size(), + ))); + } + + image.write(&full_buf, 0).await + } + + /// Guest disk size. + pub fn size(&self) -> u64 { + self.v2.size + } + + /// log2 of the cluster size. + pub fn cluster_bits(&self) -> u32 { + self.v2.cluster_bits + } + + /// Cluster size in bytes. + pub fn cluster_size(&self) -> usize { + 1 << self.cluster_bits() + } + + /// Number of entries per L2 table. + pub fn l2_entries(&self) -> usize { + // 3 == log2(size_of::()) + 1 << (self.cluster_bits() - 3) + } + + /// log2 of the number of entries per refcount block. + pub fn rb_bits(&self) -> u32 { + // log2(cluster_size >> (refcount_order - 3)) + self.cluster_bits() - (self.refcount_order() - 3) + } + + /// Number of entries per refcount block. + pub fn rb_entries(&self) -> usize { + 1 << self.rb_bits() + } + + /// log2 of the refcount bits. + pub fn refcount_order(&self) -> u32 { + self.v3.refcount_order + } + + /// Offset of the L1 table. + pub fn l1_table_offset(&self) -> HostOffset { + HostOffset(self.v2.l1_table_offset.load(Ordering::Relaxed)) + } + + /// Number of entries in the L1 table. + pub fn l1_table_entries(&self) -> usize { + self.v2.l1_size.load(Ordering::Relaxed) as usize + } + + /// Enter a new L1 table in the image header. 
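+    ///
+    /// # Example
+    ///
+    /// Hypothetical sketch of the intended call sequence (not a doctest; the
+    /// variable names are made up, the flow mirrors `grow_l1_table()`):
+    ///
+    /// ```ignore
+    /// new_l1.set_cluster(l1_start);
+    /// new_l1.write(image).await?;
+    /// header.set_l1_table(&new_l1)?;
+    /// header.write_l1_table_pointer(image).await?;
+    /// ```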
+ pub fn set_l1_table(&self, l1_table: &L1Table) -> io::Result<()> { + let offset = l1_table.get_offset().ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidInput, + "New L1 table has no assigned offset", + ) + })?; + + let entries = l1_table.entries(); + let entries = entries + .try_into() + .map_err(|err| invalid_data(format!("Too many L1 entries ({entries}): {err}")))?; + + self.v2.l1_table_offset.store(offset.0, Ordering::Relaxed); + + self.v2.l1_size.store(entries, Ordering::Relaxed); + + Ok(()) + } + + /// Offset of the refcount table. + pub fn reftable_offset(&self) -> HostOffset { + HostOffset(self.v2.refcount_table_offset.load(Ordering::Relaxed)) + } + + /// Number of clusters occupied by the refcount table. + pub fn reftable_clusters(&self) -> ClusterCount { + ClusterCount(self.v2.refcount_table_clusters.load(Ordering::Relaxed) as u64) + } + + /// Number of entries in the refcount table. + pub fn reftable_entries(&self) -> usize { + // 3 == log2(size_of::()) + (self.reftable_clusters().byte_size(self.cluster_bits()) >> 3) as usize + } + + /// Enter a new refcount table in the image header. + pub fn set_reftable(&self, reftable: &RefTable) -> io::Result<()> { + let offset = reftable.get_offset().ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidInput, + "New refcount table has no assigned offset", + ) + })?; + + let clusters = reftable.cluster_count(); + let clusters = clusters.0.try_into().map_err(|err| { + invalid_data(format!("Too many reftable clusters ({clusters}): {err}")) + })?; + + self.v2 + .refcount_table_clusters + .store(clusters, Ordering::Relaxed); + + self.v2 + .refcount_table_offset + .store(offset.0, Ordering::Relaxed); + + Ok(()) + } + + /// Backing filename from the image header (if any). + pub fn backing_filename(&self) -> Option<&String> { + self.backing_filename.as_ref() + } + + /// Backing format string from the image header (if any). + pub fn backing_format(&self) -> Option<&String> { + self.extensions.iter().find_map(|e| match e { + HeaderExtension::BackingFileFormat(fmt) => Some(fmt), + _ => None, + }) + } + + /// Whether this image requires an external data file. + pub fn external_data_file(&self) -> bool { + self.external_data_file + } + + /// External data file filename from the image header (if any). + pub fn external_data_filename(&self) -> Option<&String> { + self.extensions.iter().find_map(|e| match e { + HeaderExtension::ExternalDataFileName(filename) => Some(filename), + _ => None, + }) + } + + /// Translate a feature bit to a human-readable name. + /// + /// Uses the feature name table from the image header, if present. + pub fn feature_name(&self, feat_type: FeatureType, bit: u32) -> Option<&String> { + for e in &self.extensions { + if let HeaderExtension::FeatureNameTable(names) = e { + if let Some(name) = names.get(&(feat_type, bit as u8)) { + return Some(name); + } + } + } + + None + } + + /// Serialize all header extensions. 
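+    ///
+    /// Illustrative sketch of the resulting on-disk layout (plain `std`, not
+    /// imago API): a backing-file-format extension carrying "qcow2" is a
+    /// 4-byte big-endian type ID, a 4-byte big-endian length, the data, and
+    /// zero padding up to the next multiple of 8:
+    ///
+    /// ```
+    /// let mut ext = Vec::new();
+    /// ext.extend_from_slice(&0xe279_2aca_u32.to_be_bytes()); // BackingFileFormat
+    /// ext.extend_from_slice(&5u32.to_be_bytes());            // data length
+    /// ext.extend_from_slice(b"qcow2");
+    /// while ext.len() % 8 != 0 {
+    ///     ext.push(0);
+    /// }
+    /// assert_eq!(ext.len(), 16);
+    /// ```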
+ fn serialize_extensions(&self) -> io::Result> { + let bincode = bincode::DefaultOptions::new() + .with_fixint_encoding() + .with_big_endian(); + + let mut result = Vec::new(); + for e in &self.extensions { + let mut data = e.serialize_data()?; + let ext_hdr = HeaderExtensionHeader { + extension_type: e.extension_type(), + length: data.len().try_into().map_err(|err| { + invalid_data(format!( + "Header extension too long ({}): {}", + data.len(), + err + )) + })?, + }; + result.append(&mut bincode.serialize(&ext_hdr).map_err(invalid_data)?); + result.append(&mut data); + result.resize(result.len().next_multiple_of(8), 0); + } + + let end_ext = HeaderExtensionHeader { + extension_type: HeaderExtensionType::End as u32, + length: 0, + }; + result.append(&mut bincode.serialize(&end_ext).map_err(invalid_data)?); + result.resize(result.len().next_multiple_of(8), 0); + + Ok(result) + } + + /// Helper for functions that just need to change little bits in the v2 header part. + async fn write_v2_header(&self, image: &S) -> io::Result<()> { + let bincode = bincode::DefaultOptions::new() + .with_fixint_encoding() + .with_big_endian(); + + let v2_header = bincode.serialize(&self.v2).map_err(invalid_data)?; + image.write(&v2_header, 0).await + } + + /// Write the refcount table pointer (offset and size) to disk. + pub async fn write_reftable_pointer(&self, image: &S) -> io::Result<()> { + // TODO: Just write the reftable offset and size + self.write_v2_header(image).await + } + + /// Write the L1 table pointer (offset and size) to disk. + pub async fn write_l1_table_pointer(&self, image: &S) -> io::Result<()> { + // TODO: Just write the L1 table offset and size + self.write_v2_header(image).await + } +} + +impl HeaderExtension { + /// Parse an extension from its type and data. Unrecognized types are stored as `Unknown` + /// extensions, encountering the end of extensions returns `Ok(None)`. + fn deserialize(ext_type: u32, data: Vec) -> io::Result> { + let ext = if let Ok(ext_type) = HeaderExtensionType::try_from(ext_type) { + match ext_type { + HeaderExtensionType::End => return Ok(None), + HeaderExtensionType::BackingFileFormat => { + let fmt = String::from_utf8(data).map_err(|err| { + invalid_data(format!("Invalid backing file format: {err}")) + })?; + HeaderExtension::BackingFileFormat(fmt) + } + HeaderExtensionType::FeatureNameTable => { + let mut feats = HashMap::new(); + for feat in data.chunks(48) { + let feat_type: FeatureType = match feat[0].try_into() { + Ok(ft) => ft, + Err(_) => continue, // skip unrecognized entries + }; + // Cannot use CStr to parse this, as it may not be NUL-terminated. + // Use this to remove everything from the first NUL byte. + let feat_name_bytes = feat[2..].split(|c| *c == 0).next().unwrap(); + // Then just use it as a UTF-8 string. + let feat_name = String::from_utf8_lossy(feat_name_bytes); + feats.insert((feat_type, feat[1]), feat_name.to_string()); + } + HeaderExtension::FeatureNameTable(feats) + } + HeaderExtensionType::ExternalDataFileName => { + let filename = String::from_utf8(data).map_err(|err| { + invalid_data(format!("Invalid external data file name: {err}")) + })?; + HeaderExtension::ExternalDataFileName(filename) + } + } + } else { + HeaderExtension::Unknown { + extension_type: ext_type, + data, + } + }; + + Ok(Some(ext)) + } + + /// Return the extension type ID. 
+ fn extension_type(&self) -> u32 { + match self { + HeaderExtension::BackingFileFormat(_) => HeaderExtensionType::BackingFileFormat as u32, + HeaderExtension::FeatureNameTable(_) => HeaderExtensionType::FeatureNameTable as u32, + HeaderExtension::ExternalDataFileName(_) => { + HeaderExtensionType::ExternalDataFileName as u32 + } + HeaderExtension::Unknown { + extension_type, + data: _, + } => *extension_type, + } + } + + /// Serialize this extension’s data (exclusing its header). + fn serialize_data(&self) -> io::Result> { + match self { + HeaderExtension::BackingFileFormat(fmt) => Ok(fmt.as_bytes().into()), + HeaderExtension::FeatureNameTable(map) => { + let mut result = Vec::new(); + for (bit, name) in map { + result.push(bit.0 as u8); + result.push(bit.1); + + let mut padded_name = vec![0; 46]; + let name_bytes = name.as_bytes(); + // Might truncate in the middle of a multibyte character, but getting that + // right is complicated and probably not worth it + let truncated_len = cmp::min(name_bytes.len(), 46); + padded_name[..truncated_len].copy_from_slice(&name_bytes[..truncated_len]); + result.extend_from_slice(&padded_name); + } + Ok(result) + } + HeaderExtension::ExternalDataFileName(filename) => Ok(filename.as_bytes().into()), + HeaderExtension::Unknown { + extension_type: _, + data, + } => Ok(data.clone()), + } + } +} + +/// L1 table entry. +/// +/// - Bit 0 - 8: Reserved (set to 0) +/// - Bit 9 – 55: Bits 9-55 of the offset into the image file at which the L2 table starts. Must +/// be aligned to a cluster boundary. If the offset is 0, the L2 table and all clusters +/// described by this L2 table are unallocated. +/// - Bit 56 - 62: Reserved (set to 0) +/// - Bit 63: 0 for an L2 table that is unused or requires COW, 1 if its refcount is exactly one. +/// This information is only accurate in the active L1 table. +#[derive(Copy, Clone, Default, Debug)] +pub(super) struct L1Entry(u64); + +impl L1Entry { + /// Offset of the L2 table, if any. + pub fn l2_offset(&self) -> Option { + let ofs = self.0 & 0x00ff_ffff_ffff_fe00u64; + if ofs == 0 { + None + } else { + Some(HostOffset(ofs)) + } + } + + /// Whether the L2 table’s cluster is “copied”. + /// + /// `true` means is refcount is one, `false` means modifying it will require COW. + pub fn is_copied(&self) -> bool { + self.0 & (1u64 << 63) != 0 + } + + /// Return all reserved bits. + pub fn reserved_bits(&self) -> u64 { + self.0 & 0x7f00_0000_0000_01feu64 + } +} + +impl TableEntry for L1Entry { + fn try_from_plain(value: u64, header: &Header) -> io::Result { + let entry = L1Entry(value); + + if entry.reserved_bits() != 0 { + return Err(invalid_data(format!( + "Invalid L1 entry 0x{:x}, reserved bits set (0x{:x})", + value, + entry.reserved_bits(), + ))); + } + + if let Some(l2_ofs) = entry.l2_offset() { + if l2_ofs.in_cluster_offset(header.cluster_bits()) != 0 { + return Err(invalid_data(format!( + "Invalid L1 entry 0x{:x}, offset ({}) is not aligned to cluster size (0x{:x})", + value, + l2_ofs, + header.cluster_size(), + ))); + } + } + + Ok(entry) + } + + fn to_plain(&self) -> u64 { + self.0 + } +} + +/// L1 table. +#[derive(Debug)] +pub(super) struct L1Table { + /// First cluster in the image file. + cluster: Option, + + /// Table data. + data: Box<[L1Entry]>, + + /// log2 of the cluster size. + cluster_bits: u32, + + /// Whether this table has been modified since it was last written. + modified: AtomicBool, +} + +impl L1Table { + /// Create a clone that covers at least `at_least_index`. 
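To make the L1 entry bit layout above concrete, here is a decode of one hypothetical entry value using the same masks as `L1Entry` (standalone sketch, plain bit math):

// Sketch: decoding a hypothetical L1 entry value.
fn decode_l1_entry_example() {
    let value: u64 = (1 << 63) | 0x5_0000; // COPIED flag set, L2 table at offset 0x50000
    let l2_offset = value & 0x00ff_ffff_ffff_fe00; // -> 0x50000
    let copied = value & (1 << 63) != 0;           // -> true
    let reserved = value & 0x7f00_0000_0000_01fe;  // -> 0, so the entry is valid
    assert_eq!((l2_offset, copied, reserved), (0x5_0000, true, 0));
}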
+ pub fn clone_and_grow(&self, at_least_index: usize, header: &Header) -> io::Result { + let new_entry_count = cmp::max(at_least_index + 1, self.data.len()); + let new_entry_count = + new_entry_count.next_multiple_of(header.cluster_size() / size_of::()); + + if new_entry_count > ::MAX_ENTRIES { + return Err(io::Error::other( + "Cannot grow the image to this size; L1 table would become too big", + )); + } + + let mut new_data = vec![L1Entry::default(); new_entry_count]; + new_data[..self.data.len()].copy_from_slice(&self.data); + + Ok(Self { + cluster: None, + data: new_data.into_boxed_slice(), + cluster_bits: header.cluster_bits(), + modified: true.into(), + }) + } + + /// Check whether `index` is in bounds. + pub fn in_bounds(&self, index: usize) -> bool { + index < self.data.len() + } + + /// Enter the given L2 table into this L1 table. + pub fn enter_l2_table(&mut self, index: usize, l2: &L2Table) -> io::Result<()> { + let l2_offset = l2.get_offset().ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidInput, + "L2 table has no assigned offset", + ) + })?; + + let l1entry = L1Entry((1 << 63) | l2_offset.0); + debug_assert!(l1entry.reserved_bits() == 0); + self.data[index] = l1entry; + self.modified.store(true, Ordering::Relaxed); + + Ok(()) + } +} + +impl Table for L1Table { + type InternalEntry = L1Entry; + type Entry = L1Entry; + const NAME: &'static str = "L1 table"; + + /// Maximum number of L1 table entries. + /// + /// Limit taken from QEMU; if QEMU rejects this, we can, too. + const MAX_ENTRIES: usize = 4 * 1024 * 1024; + + fn from_data(data: Box<[L1Entry]>, header: &Header) -> Self { + Self { + cluster: None, + data, + cluster_bits: header.cluster_bits(), + modified: true.into(), + } + } + + fn entries(&self) -> usize { + self.data.len() + } + + fn get_ref(&self, index: usize) -> Option<&L1Entry> { + self.data.get(index) + } + + fn get(&self, index: usize) -> L1Entry { + self.data.get(index).copied().unwrap_or(L1Entry(0)) + } + + fn get_cluster(&self) -> Option { + self.cluster + } + + fn get_offset(&self) -> Option { + self.cluster.map(|index| index.offset(self.cluster_bits)) + } + + fn set_cluster(&mut self, cluster: HostCluster) { + self.cluster = Some(cluster); + self.modified.store(true, Ordering::Relaxed); + } + + fn unset_cluster(&mut self) { + self.cluster = None; + } + + fn is_modified(&self) -> bool { + self.modified.load(Ordering::Relaxed) + } + + fn clear_modified(&self) { + self.modified.store(false, Ordering::Relaxed); + } + + fn set_modified(&self) { + self.modified.store(true, Ordering::Relaxed); + } + + fn cluster_bits(&self) -> u32 { + self.cluster_bits + } +} + +/// L2 table entry. +/// +/// - Bit 0 - 61: Cluster descriptor +/// - Bit 62: 0 for standard clusters, 1 for compressed clusters +/// - Bit 63: 0 for clusters that are unused, compressed or require COW. 1 for standard clusters +/// whose refcount is exactly one. This information is only accurate in L2 tables that are +/// reachable from the active L1 table. With external data files, all guest clusters have an +/// implicit refcount of 1 (because of the fixed host = guest mapping for guest cluster offsets), +/// so this bit should be 1 for all allocated clusters. +/// +/// Standard Cluster Descriptor: +/// - Bit 0: If set to 1, the cluster reads as all zeros. The host cluster offset can be used to +/// describe a preallocation, but it won’t be used for reading data from this cluster, nor is +/// data read from the backing file if the cluster is unallocated. 
With version 2 or with +/// extended L2 entries (see the next section), this is always 0. +/// - Bit 1 – 8: Reserved (set to 0) +/// - Bit 9 – 55: Bits 9-55 of host cluster offset. Must be aligned to a cluster boundary. If the +/// offset is 0 and bit 63 is clear, the cluster is unallocated. The offset may only be 0 with +/// bit 63 set (indicating a host cluster offset of 0) when an external data file is used. +/// - Bit 56 - 61: Reserved (set to 0) +/// +/// Compressed Cluster Descriptor (`x = 62 - (cluster_bits - 8)`): +/// - Bit 0 - x-1: Host cluster offset. This is usually _not_ aligned to a cluster or sector +/// boundary! If cluster_bits is small enough that this field includes bits beyond 55, those +/// upper bits must be set to 0. +/// - Bit x - 61: Number of additional 512-byte sectors used for the compressed data, beyond the +/// sector containing the offset in the previous field. Some of these sectors may reside in the +/// next contiguous host cluster. Note that the compressed data does not necessarily occupy all +/// of the bytes in the final sector; rather, decompression stops when it has produced a cluster +/// of data. Another compressed cluster may map to the tail of the final sector used by this +/// compressed cluster. +#[derive(Copy, Clone, Default, Debug)] +pub(super) struct L2Entry(u64); + +/// Internal actual type of L2 entries. +/// +/// Using atomic allows flushing L2 tables from the cache while they are write-locked. +#[derive(Default, Debug)] +pub(super) struct AtomicL2Entry(AtomicU64); + +/// High-level representation of an L2 entry. +#[derive(Debug, Clone)] +pub(super) enum L2Mapping { + /// Data is in the data file. + DataFile { + /// Cluster in the data file. + host_cluster: HostCluster, + + /// Whether the cluster has a refcount of exactly 1. + copied: bool, + }, + + /// Data is in the backing file. + Backing { + /// Guest cluster index. + backing_offset: u64, + }, + + /// Data is zero. + Zero { + /// Preallocated cluster in the data file, if any. + host_cluster: Option, + + /// Whether the preallocated cluster has a refcount of exactly 1. + copied: bool, + }, + + /// Data is compressed. + Compressed { + /// Offset in the data file. + host_offset: HostOffset, + + /// Upper limit on the number of bytes that comprise the compressed data. + length: u64, + }, +} + +impl L2Entry { + /// Offset of the data cluster, if any. + /// + /// Assumes the L2 entry references a data cluster, not a compressed cluster. + /// + /// `external_data_file` must be true when using an external data file; in this case, offset 0 + /// is a valid offset, and can only be distinguished from “unallocated” by whether the COPIED + /// flag is set or not (which it always is when using an external data file). + pub fn cluster_offset(&self, external_data_file: bool) -> Option { + let ofs = self.0 & 0x00ff_ffff_ffff_fe00u64; + if ofs != 0 || (external_data_file && self.is_copied()) { + Some(HostOffset(ofs)) + } else { + None + } + } + + /// Whether the cluster is compressed. + pub fn is_compressed(&self) -> bool { + self.0 & (1u64 << 62) != 0 + } + + /// Whether the cluster is “copied”. + /// + /// `true` means is refcount is one, `false` means modifying it will require COW. + pub fn is_copied(&self) -> bool { + self.0 & (1u64 << 63) != 0 + } + + /// Clear “copied” flag. + #[must_use] + pub fn without_copied(self) -> Self { + L2Entry(self.0 & !(1u64 << 63)) + } + + /// Whether the cluster is a zero cluster. + /// + /// Assumes the L2 entry references a data cluster, not a compressed cluster. 
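As a worked example of the standard cluster descriptor described above, the following sketch classifies a few hypothetical entry values with plain bit math (it does not use the private `L2Entry` type and ignores validity checks):

// Sketch: classifying hypothetical standard (non-compressed) L2 entry values.
fn classify_l2_entry_example(value: u64) -> &'static str {
    let offset = value & 0x00ff_ffff_ffff_fe00;
    let copied = value & (1 << 63) != 0;
    let zero = value & 1 != 0;
    match (zero, offset, copied) {
        (true, _, _) => "reads as zeros (possibly with a preallocated host cluster)",
        (false, 0, false) => "unallocated: falls through to the backing file",
        (false, _, _) if copied => "data cluster with refcount 1 (writable in place)",
        _ => "data cluster that needs COW before writing",
    }
}
// classify_l2_entry_example((1 << 63) | 0x10000) -> "data cluster with refcount 1 ..."
// classify_l2_entry_example(0)                   -> "unallocated: falls through ..."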
+ pub fn is_zero(&self) -> bool { + self.0 & (1u64 << 0) != 0 + } + + /// Return all reserved bits. + pub fn reserved_bits(&self) -> u64 { + if self.is_compressed() { + self.0 & 0x8000_0000_0000_0000u64 + } else { + self.0 & 0x3f00_0000_0000_01feu64 + } + } + + /// Return the full compressed cluster descriptor. + pub fn compressed_descriptor(&self) -> u64 { + self.0 & 0x3fff_ffff_ffff_ffffu64 + } + + /// If this entry is compressed, return the start host offset and upper limit on the compressed + /// number of bytes. + pub fn compressed_range(&self, cluster_bits: u32) -> Option<(HostOffset, u64)> { + if self.is_compressed() { + let desc = self.compressed_descriptor(); + let compressed_offset_bits = 62 - (cluster_bits - 8); + let offset = desc & ((1 << compressed_offset_bits) - 1) & 0x00ff_ffff_ffff_ffffu64; + let sectors = desc >> compressed_offset_bits; + // The first sector is not considered in `sectors`, so we add it and subtract the + // number of bytes there that do not belong to this compressed cluster + let length = (sectors + 1) * 512 - (offset & 511); + + Some((HostOffset(offset), length)) + } else { + None + } + } + + /// If this entry is allocated, return the first host cluster and the number of clusters it + /// references. + /// + /// `external_data_file` must be true when using an external data file. + fn allocation( + &self, + cluster_bits: u32, + external_data_file: bool, + ) -> Option<(HostCluster, ClusterCount)> { + if let Some((offset, length)) = self.compressed_range(cluster_bits) { + // Compressed clusters can cross host cluster boundaries, and thus occupy two clusters + let first_cluster = offset.cluster(cluster_bits); + let cluster_count = ClusterCount::from_byte_size( + offset + length - first_cluster.offset(cluster_bits), + cluster_bits, + ); + Some((first_cluster, cluster_count)) + } else { + self.cluster_offset(external_data_file) + .map(|ofs| (ofs.cluster(cluster_bits), ClusterCount(1))) + } + } + + /// Return the high-level `L2Mapping` representation. + /// + /// `guest_cluster` is the guest cluster being accessed, `cluster_bits` is log2 of the cluster + /// size. `external_data_file` must be true when using an external data file. + fn into_mapping( + self, + guest_cluster: GuestCluster, + cluster_bits: u32, + external_data_file: bool, + ) -> io::Result { + let mapping = if let Some((offset, length)) = self.compressed_range(cluster_bits) { + L2Mapping::Compressed { + host_offset: offset, + length, + } + } else if self.is_zero() { + let host_cluster = self + .cluster_offset(external_data_file) + .map(|ofs| { + ofs.checked_cluster(cluster_bits).ok_or_else(|| { + let offset = guest_cluster.offset(cluster_bits); + io::Error::other(format!( + "Unaligned pre-allocated zero cluster at {offset}; L2 entry: {self:?}" + )) + }) + }) + .transpose()?; + + L2Mapping::Zero { + host_cluster, + copied: host_cluster.is_some() && self.is_copied(), + } + } else if let Some(host_offset) = self.cluster_offset(external_data_file) { + let host_cluster = host_offset.checked_cluster(cluster_bits).ok_or_else(|| { + let offset = guest_cluster.offset(cluster_bits); + io::Error::other(format!( + "Unaligned data cluster at {offset}; L2 entry: {self:?}" + )) + })?; + + L2Mapping::DataFile { + host_cluster, + copied: self.is_copied(), + } + } else { + L2Mapping::Backing { + backing_offset: guest_cluster.offset(cluster_bits).0, + } + }; + + Ok(mapping) + } + + /// Create an L2 entry from its high-level `L2Mapping` representation. 
+ fn from_mapping(value: L2Mapping, cluster_bits: u32) -> Self { + let num_val: u64 = match value { + L2Mapping::DataFile { + host_cluster, + copied, + } => { + debug_assert!(host_cluster.offset(cluster_bits) <= MAX_OFFSET); + if copied { + (1 << 63) | host_cluster.offset(cluster_bits).0 + } else { + host_cluster.offset(cluster_bits).0 + } + } + + L2Mapping::Backing { backing_offset: _ } => 0, + + L2Mapping::Zero { + host_cluster, + copied, + } => { + let host_offset = host_cluster.map(|hc| hc.offset(cluster_bits)); + debug_assert!(host_offset.unwrap_or(HostOffset(0)) <= MAX_OFFSET); + if copied { + (1 << 63) | host_offset.unwrap().0 | 0x1 + } else { + host_offset.unwrap_or(HostOffset(0)).0 | 0x1 + } + } + + L2Mapping::Compressed { + host_offset, + length, + } => { + let compressed_offset_bits = 62 - (cluster_bits - 8); + assert!(length < 1 << cluster_bits); + assert!(host_offset.0 < 1 << compressed_offset_bits); + + // The first sector is not considered, so we subtract the number of bytes in it + // that belong to this compressed cluster from `length`: + // ceil((length - (512 - (host_offset & 511))) / 512) + // = (length + 511 - 512 + (host_offset & 511)) / 512 + let sectors = (length - 1 + (host_offset.0 & 511)) / 512; + + (1 << 62) | (sectors << compressed_offset_bits) | host_offset.0 + } + }; + + let entry = L2Entry(num_val); + debug_assert!(entry.reserved_bits() == 0); + entry + } +} + +impl AtomicL2Entry { + /// Get the contained value. + fn get(&self) -> L2Entry { + L2Entry(self.0.load(Ordering::Relaxed)) + } + + /// Exchange the contained value. + /// + /// # Safety + /// Caller must ensure that: + /// (1) No reader sees invalid intermediate states. + /// (2) Updates are done atomically (do not depend on prior state of the L2 table), or there is + /// only one writer at a time. + unsafe fn swap(&self, l2e: L2Entry) -> L2Entry { + L2Entry(self.0.swap(l2e.0, Ordering::Relaxed)) + } +} + +impl TableEntry for AtomicL2Entry { + fn try_from_plain(value: u64, header: &Header) -> io::Result { + let entry = L2Entry(value); + + if entry.reserved_bits() != 0 { + return Err(invalid_data(format!( + "Invalid L2 entry 0x{:x}, reserved bits set (0x{:x})", + value, + entry.reserved_bits(), + ))); + } + + if let Some(offset) = entry.cluster_offset(header.external_data_file()) { + if !entry.is_compressed() && offset.in_cluster_offset(header.cluster_bits()) != 0 { + return Err(invalid_data(format!( + "Invalid L2 entry 0x{:x}, offset ({}) is not aligned to cluster size (0x{:x})", + value, + offset, + header.cluster_size(), + ))); + } + } + + Ok(AtomicL2Entry(AtomicU64::new(entry.0))) + } + + fn to_plain(&self) -> u64 { + self.get().0 + } +} + +impl L2Mapping { + /// Check whether two mappings are consecutive. + /// + /// Given the `preceding` mapping, check whether `self` is consecutive to it, i.e. is the same + /// kind of mapping, and the offsets are consecutive. 
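A numeric round trip through the compressed-cluster math above may help; this standalone sketch mirrors `from_mapping()` and `compressed_range()` for assumed values and `cluster_bits = 16`:

// Sketch: encode and decode a hypothetical compressed cluster descriptor.
fn compressed_round_trip_example() {
    let cluster_bits = 16u32;
    let x = 62 - (cluster_bits - 8); // 54 offset bits
    let host_offset: u64 = 0x1_0200; // start of the compressed data
    let length: u64 = 1300;          // actual compressed byte count

    // Encode (as in from_mapping): additional 512-byte sectors beyond the first one.
    let sectors = (length - 1 + (host_offset & 511)) / 512; // -> 2
    let descriptor = (1 << 62) | (sectors << x) | host_offset;

    // Decode (as in compressed_range): recovers an upper bound on the length.
    let desc = descriptor & 0x3fff_ffff_ffff_ffff;
    let dec_offset = desc & ((1 << x) - 1) & 0x00ff_ffff_ffff_ffff;
    let dec_sectors = desc >> x;
    let dec_length = (dec_sectors + 1) * 512 - (dec_offset & 511);
    assert_eq!((dec_offset, dec_length), (0x1_0200, 1536)); // 1536 >= 1300
}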
+ pub fn is_consecutive(&self, preceding: &L2Mapping, cluster_bits: u32) -> bool { + match preceding { + L2Mapping::DataFile { + host_cluster: prior_cluster, + copied, + } => { + if let L2Mapping::DataFile { + host_cluster: next_cluster, + copied: next_copied, + } = self + { + *next_cluster == *prior_cluster + ClusterCount(1) && *next_copied == *copied + } else { + false + } + } + + L2Mapping::Backing { + backing_offset: prior_backing_offset, + } => { + let Some(expected_next) = prior_backing_offset.checked_add(1 << cluster_bits) + else { + return false; + }; + + if let L2Mapping::Backing { + backing_offset: next_offset, + } = self + { + *next_offset == expected_next + } else { + false + } + } + + L2Mapping::Zero { + host_cluster: _, + copied: _, + } => { + // Cluster and copied do not matter; every read is continuous regardless (always + // zero), and every write is, too (always allocate) + matches!( + self, + L2Mapping::Zero { + host_cluster: _, + copied: _, + } + ) + } + + L2Mapping::Compressed { + host_offset: _, + length: _, + } => { + // Not really true, but in practice it is. Reads need to go through a special + // function anyway, and every write will need COW anyway. + matches!( + self, + L2Mapping::Compressed { + host_offset: _, + length: _, + } + ) + } + } + } +} + +/// L2 table. +#[derive(Debug)] +pub(super) struct L2Table { + /// Cluster of the L2 table. + cluster: Option, + + /// Table data. + data: Box<[AtomicL2Entry]>, + + /// log2 of the cluster size. + cluster_bits: u32, + + /// Whether this image uses an external data file. + external_data_file: bool, + + /// Whether this table has been modified since it was last written. + modified: AtomicBool, + + /// Lock for creating `L2TableWriteGuard`. + writer_lock: Mutex<()>, +} + +/// Write guard for an L2 table. +#[derive(Debug)] +pub(super) struct L2TableWriteGuard<'a> { + /// Referenced L2 table. + table: &'a L2Table, + + /// Held guard mutex on that L2 table. + _lock: MutexGuard<'a, ()>, +} + +impl L2Table { + /// Create a new zeroed L2 table. + pub fn new_cleared(header: &Header) -> Self { + let mut data = Vec::with_capacity(header.l2_entries()); + data.resize_with(header.l2_entries(), Default::default); + + L2Table { + cluster: None, + data: data.into_boxed_slice(), + cluster_bits: header.cluster_bits(), + external_data_file: header.external_data_file(), + modified: true.into(), + writer_lock: Default::default(), + } + } + + /// Look up a cluster mapping. + pub fn get_mapping(&self, lookup_cluster: GuestCluster) -> io::Result { + self.get(lookup_cluster.l2_index(self.cluster_bits)) + .into_mapping(lookup_cluster, self.cluster_bits, self.external_data_file) + } + + /// Allow modifying this L2 table. + /// + /// Note that readers are allowed to exist while modifications are happening. + pub async fn lock_write(&self) -> L2TableWriteGuard<'_> { + L2TableWriteGuard { + table: self, + _lock: self.writer_lock.lock().await, + } + } +} + +impl L2TableWriteGuard<'_> { + /// Look up a cluster mapping. + pub fn get_mapping(&self, lookup_cluster: GuestCluster) -> io::Result { + self.table.get_mapping(lookup_cluster) + } + + /// Enter the given raw data cluster mapping into the L2 table. + /// + /// If the previous entry pointed to an allocated cluster, return the old allocation so its + /// refcount can be decreased (offset of the first cluster and number of clusters -- compressed + /// clusters can span across host cluster boundaries). 
+ /// + /// If the allocation is reused, `None` is returned, so this function only returns `Some(_)` if + /// some cluster is indeed leaked. + #[must_use] + pub fn map_cluster( + &mut self, + index: usize, + host_cluster: HostCluster, + ) -> Option<(HostCluster, ClusterCount)> { + let new = L2Entry::from_mapping( + L2Mapping::DataFile { + host_cluster, + copied: true, + }, + self.table.cluster_bits, + ); + // Safe: We set a full valid mapping, and there is only one writer (thanks to + // `L2TableWriteGuard`). + let l2e = unsafe { self.table.data[index].swap(new) }; + self.table.modified.store(true, Ordering::Relaxed); + + let allocation = l2e.allocation(self.table.cluster_bits, self.table.external_data_file); + if let Some((a_cluster, a_count)) = allocation { + if a_cluster == host_cluster && a_count == ClusterCount(1) { + None + } else { + allocation + } + } else { + None + } + } +} + +impl Table for L2Table { + type InternalEntry = AtomicL2Entry; + type Entry = L2Entry; + const NAME: &'static str = "L2 table"; + const MAX_ENTRIES: usize = MAX_CLUSTER_SIZE / 8; + + fn from_data(data: Box<[AtomicL2Entry]>, header: &Header) -> Self { + assert!(data.len() == header.l2_entries()); + + Self { + cluster: None, + data, + cluster_bits: header.cluster_bits(), + external_data_file: header.external_data_file(), + modified: true.into(), + writer_lock: Default::default(), + } + } + + fn entries(&self) -> usize { + self.data.len() + } + + fn get_ref(&self, index: usize) -> Option<&AtomicL2Entry> { + self.data.get(index) + } + + fn get(&self, index: usize) -> L2Entry { + self.data + .get(index) + .map(|l2e| l2e.get()) + .unwrap_or(L2Entry(0)) + } + + fn get_cluster(&self) -> Option { + self.cluster + } + + fn get_offset(&self) -> Option { + self.cluster.map(|index| index.offset(self.cluster_bits)) + } + + fn set_cluster(&mut self, cluster: HostCluster) { + self.cluster = Some(cluster); + self.modified.store(true, Ordering::Relaxed); + } + + fn unset_cluster(&mut self) { + self.cluster = None; + } + + fn is_modified(&self) -> bool { + self.modified.load(Ordering::Relaxed) + } + + fn clear_modified(&self) { + self.modified.store(false, Ordering::Relaxed); + } + + fn set_modified(&self) { + self.modified.store(true, Ordering::Relaxed); + } + + fn cluster_bits(&self) -> u32 { + self.cluster_bits + } +} + +impl Clone for L2Table { + fn clone(&self) -> Self { + let mut data = Vec::with_capacity(self.data.len()); + for entry in &self.data { + // None of these can be `copied` + let entry = entry.get().without_copied(); + data.push(AtomicL2Entry(AtomicU64::new(entry.0))); + } + + let modified = AtomicBool::new(self.is_modified()); + + L2Table { + cluster: None, + data: data.into_boxed_slice(), + cluster_bits: self.cluster_bits, + external_data_file: self.external_data_file, + modified, + writer_lock: Default::default(), + } + } +} + +impl Drop for L2Table { + fn drop(&mut self) { + if self.is_modified() { + error!("L2 table dropped while modified; was the image closed before being flushed?"); + } + } +} + +/// Refcount table entry. +#[derive(Copy, Clone, Default, Debug)] +pub(super) struct RefTableEntry(u64); + +impl RefTableEntry { + /// Offset of the referenced refblock, if any. + pub fn refblock_offset(&self) -> Option { + let ofs = self.0 & 0xffff_ffff_ffff_fe00u64; + if ofs == 0 { + None + } else { + Some(HostOffset(ofs)) + } + } + + /// Return all reserved bits. 
+ pub fn reserved_bits(&self) -> u64 { + self.0 & 0x0000_0000_0000_01ffu64 + } +} + +impl TableEntry for RefTableEntry { + fn try_from_plain(value: u64, header: &Header) -> io::Result { + let entry = RefTableEntry(value); + + if entry.reserved_bits() != 0 { + return Err(invalid_data(format!( + "Invalid reftable entry 0x{:x}, reserved bits set (0x{:x})", + value, + entry.reserved_bits(), + ))); + } + + if let Some(rb_ofs) = entry.refblock_offset() { + if rb_ofs.in_cluster_offset(header.cluster_bits()) != 0 { + return Err(invalid_data( + format!( + "Invalid reftable entry 0x{:x}, offset ({}) is not aligned to cluster size (0x{:x})", + value, + rb_ofs, + header.cluster_size(), + ), + )); + } + } + + Ok(entry) + } + + fn to_plain(&self) -> u64 { + self.0 + } +} + +/// Refcount table. +#[derive(Debug)] +pub(super) struct RefTable { + /// First cluster in the image file. + cluster: Option, + + /// Table data. + data: Box<[RefTableEntry]>, + + /// log2 of the cluster size. + cluster_bits: u32, + + /// Whether this table has been modified since it was last written. + modified: AtomicBool, +} + +impl RefTable { + /// Create a clone that covers at least `at_least_index`. + /// + /// Also ensure that beyond `at_least_index`, there are enough entries to self-describe the new + /// refcount table (so that it can actually be allocated). + pub fn clone_and_grow(&self, header: &Header, at_least_index: usize) -> io::Result { + let cluster_size = header.cluster_size(); + let rb_entries = header.rb_entries(); + + // There surely is an optimal O(1) solution, but probably would look less clear, and this + // is not a hot path. + let mut extra_rbs = 1; + let new_entry_count = loop { + let entry_count = cmp::max(at_least_index + 1 + extra_rbs, self.data.len()); + let entry_count = entry_count.next_multiple_of(cluster_size / size_of::()); + let size = entry_count * size_of::(); + // Full number of clusters needed to both the new reftable *and* the `extra_rbs` + let refcount_clusters = size / cluster_size + extra_rbs; + let rbs_needed = refcount_clusters.div_ceil(rb_entries); + if extra_rbs == rbs_needed { + break entry_count; + } + extra_rbs = rbs_needed; + }; + + if new_entry_count > ::MAX_ENTRIES { + return Err(io::Error::other( + "Cannot grow the image to this size; refcount table would become too big", + )); + } + + let mut new_data = vec![RefTableEntry::default(); new_entry_count]; + new_data[..self.data.len()].copy_from_slice(&self.data); + + Ok(Self { + cluster: None, + data: new_data.into_boxed_slice(), + cluster_bits: header.cluster_bits(), + modified: true.into(), + }) + } + + /// Check whether `index` is in bounds. + pub fn in_bounds(&self, index: usize) -> bool { + index < self.data.len() + } + + /// Enter the given refcount block into this refcount table. + pub fn enter_refblock(&mut self, index: usize, rb: &RefBlock) -> io::Result<()> { + let rb_offset = rb.get_offset().ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidInput, + "Refcount block as no assigned offset", + ) + })?; + + let rt_entry = RefTableEntry(rb_offset.0); + debug_assert!(rt_entry.reserved_bits() == 0); + self.data[index] = rt_entry; + self.modified.store(true, Ordering::Relaxed); + + Ok(()) + } +} + +impl Table for RefTable { + type InternalEntry = RefTableEntry; + type Entry = RefTableEntry; + const NAME: &'static str = "Refcount table"; + + /// Maximum number of refcount table entries. + /// + /// Not in QEMU, but makes sense to limit to the same as the L1 table. 
Note that refcount + /// blocks usually cover more clusters than an L2 table, so this generally allows larger image + /// files than would be necessary for the maximum guest disk size determined by the maximum + /// number of L1 entries. + const MAX_ENTRIES: usize = ::MAX_ENTRIES; + + fn from_data(data: Box<[RefTableEntry]>, header: &Header) -> Self { + Self { + cluster: None, + data, + cluster_bits: header.cluster_bits(), + modified: true.into(), + } + } + + fn entries(&self) -> usize { + self.data.len() + } + + fn get_ref(&self, index: usize) -> Option<&RefTableEntry> { + self.data.get(index) + } + + fn get(&self, index: usize) -> RefTableEntry { + self.data.get(index).copied().unwrap_or(RefTableEntry(0)) + } + + fn get_cluster(&self) -> Option { + self.cluster + } + + fn get_offset(&self) -> Option { + self.cluster.map(|index| index.offset(self.cluster_bits)) + } + + fn set_cluster(&mut self, cluster: HostCluster) { + self.cluster = Some(cluster); + self.modified.store(true, Ordering::Relaxed); + } + + fn unset_cluster(&mut self) { + self.cluster = None; + } + + fn is_modified(&self) -> bool { + self.modified.load(Ordering::Relaxed) + } + + fn clear_modified(&self) { + self.modified.store(false, Ordering::Relaxed); + } + + fn set_modified(&self) { + self.modified.store(true, Ordering::Relaxed); + } + + fn cluster_bits(&self) -> u32 { + self.cluster_bits + } +} + +/// Refcount block. +pub(super) struct RefBlock { + /// Cluster in the image file. + cluster: Option, + + /// Raw table data (big endian). + raw_data: IoBuffer, + + /// log2 of the refcount bits. + refcount_order: u32, + + /// log2 of the cluster size. + cluster_bits: u32, + + /// Whether this block has been modified since it was last written. + modified: AtomicBool, + + /// Lock for creating `RefBlockWriteGuard`. + writer_lock: Mutex<()>, +} + +/// Write guard for a refblock. +pub(super) struct RefBlockWriteGuard<'a> { + /// Referenced refblock. + rb: &'a RefBlock, + + /// Held guard mutex on that refblock. + _lock: MutexGuard<'a, ()>, +} + +impl RefBlock { + /// Create a new zeroed refcount block. + pub fn new_cleared(for_image: &S, header: &Header) -> io::Result { + let mut raw_data = IoBuffer::new(header.cluster_size(), for_image.mem_align())?; + raw_data.as_mut().into_slice().fill(0); + + Ok(RefBlock { + cluster: None, + raw_data, + refcount_order: header.refcount_order(), + cluster_bits: header.cluster_bits(), + modified: true.into(), + writer_lock: Default::default(), + }) + } + + /// Load a refcount block from disk. + pub async fn load( + image: &S, + header: &Header, + cluster: HostCluster, + ) -> io::Result { + let cluster_bits = header.cluster_bits(); + let cluster_size = 1 << cluster_bits; + let refcount_order = header.refcount_order(); + let offset = cluster.offset(cluster_bits); + + check_table( + "Refcount block", + offset.0, + cluster_size, + 1, + MAX_CLUSTER_SIZE, + cluster_size, + )?; + + let mut raw_data = + IoBuffer::new(cluster_size, cmp::max(image.mem_align(), size_of::()))?; + image.read(&mut raw_data, offset.0).await?; + + Ok(RefBlock { + cluster: Some(cluster), + raw_data, + refcount_order, + cluster_bits, + modified: false.into(), + writer_lock: Default::default(), + }) + } + + /// Write a refcount block to disk. 
+ pub async fn write(&self, image: &S) -> io::Result<()> { + let offset = self + .get_offset() + .ok_or_else(|| io::Error::other("Cannot write qcow2 refcount block, no offset set"))?; + + self.clear_modified(); + if let Err(err) = image.write(self.raw_data.as_ref(), offset.0).await { + self.set_modified(); + return Err(err); + } + + Ok(()) + } + + /// Get the block’s cluster in the image file. + pub fn get_cluster(&self) -> Option { + self.cluster + } + + /// Get the block’s offset in the image file. + pub fn get_offset(&self) -> Option { + self.cluster.map(|index| index.offset(self.cluster_bits)) + } + + /// Change the block’s cluster in the image file (for writing). + pub fn set_cluster(&mut self, cluster: HostCluster) { + self.cluster = Some(cluster); + self.set_modified(); + } + + /// Calculate sub-byte refcount access parameters. + /// + /// For a given refcount index, return its: + /// - byte index, + /// - access mask, + /// - in-byte shift. + fn sub_byte_refcount_access(&self, index: usize) -> (usize, u8, usize) { + let order = self.refcount_order; + debug_assert!(order < 3); + + // Note that `order` is in bits, i.e. `1 << order` is the number of bits. `index` is in + // units of refcounts, so `index << order` is the bit index, and `index << (order - 3)` is + // then the byte index, which is equal to `index >> (3 - order)`. + let byte_index = index >> (3 - order); + // `1 << order` is the bits per refcount (bprc), so `(1 << bprc) - 1` is the mask for one + // refcount (its maximum value). + let mask = (1 << (1 << order)) - 1; + // `index` is in units of refcounts, so `index << order` is the bit index. `% 8`, we get + // the base index inside of a byte. + let shift = (index << order) % 8; + + (byte_index, mask, shift) + } + + /// Get the given cluster’s refcount. + pub fn get(&self, index: usize) -> u64 { + match self.refcount_order { + // refcount_bits == 1, 2, 4 + 0..=2 => { + let (index, mask, shift) = self.sub_byte_refcount_access(index); + let raw_data_slice = unsafe { self.raw_data.as_ref().into_typed_slice::() }; + let atomic = + unsafe { AtomicU8::from_ptr(&raw_data_slice[index] as *const u8 as *mut u8) }; + ((atomic.load(Ordering::Relaxed) >> shift) & mask) as u64 + } + + // refcount_bits == 8 + 3 => { + let raw_data_slice = unsafe { self.raw_data.as_ref().into_typed_slice::() }; + let atomic = + unsafe { AtomicU8::from_ptr(&raw_data_slice[index] as *const u8 as *mut u8) }; + atomic.load(Ordering::Relaxed) as u64 + } + + // refcount_bits == 16 + 4 => { + let raw_data_slice = unsafe { self.raw_data.as_ref().into_typed_slice::() }; + let atomic = unsafe { + AtomicU16::from_ptr(&raw_data_slice[index] as *const u16 as *mut u16) + }; + u16::from_be(atomic.load(Ordering::Relaxed)) as u64 + } + + // refcount_bits == 32 + 5 => { + let raw_data_slice = unsafe { self.raw_data.as_ref().into_typed_slice::() }; + let atomic = unsafe { + AtomicU32::from_ptr(&raw_data_slice[index] as *const u32 as *mut u32) + }; + u32::from_be(atomic.load(Ordering::Relaxed)) as u64 + } + + // refcount_bits == 64 + 6 => { + let raw_data_slice = unsafe { self.raw_data.as_ref().into_typed_slice::() }; + let atomic = unsafe { + AtomicU64::from_ptr(&raw_data_slice[index] as *const u64 as *mut u64) + }; + u64::from_be(atomic.load(Ordering::Relaxed)) + } + + _ => unreachable!(), + } + } + + /// Allow modifying this refcount block. + /// + /// Note that readers are allowed to exist while modifications are happening. 
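To illustrate the sub-byte refcount arithmetic above, here is a standalone sketch assuming `refcount_order = 1` (2-bit refcounts); it reproduces the index/mask/shift computation of `sub_byte_refcount_access()` and the read-modify-write done under the write guard:

// Sketch: locating and bumping a 2-bit refcount at index 13.
fn two_bit_refcount_example() {
    let order = 1u32;
    let index = 13usize;
    let byte_index = index >> (3 - order);  // 13 >> 2 = 3: fourth byte of the refblock
    let mask: u8 = (1 << (1 << order)) - 1; // 0b11: value mask for one refcount
    let shift = (index << order) % 8;       // 26 % 8 = 2: position inside that byte

    let mut byte = 0b0110_0100u8;           // hypothetical contents of byte 3
    let old = (byte >> shift) & mask;       // -> 0b01 = 1
    let new = old + 1;                      // increment; the real code errors if new > mask
    byte = (byte & !(mask << shift)) | (new << shift);
    assert_eq!((byte_index, old, byte), (3, 1, 0b0110_1000));
}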
+ pub async fn lock_write(&self) -> RefBlockWriteGuard<'_> { + RefBlockWriteGuard { + rb: self, + _lock: self.writer_lock.lock().await, + } + } + + /// Check whether this block has been modified since it was last written. + pub fn is_modified(&self) -> bool { + self.modified.load(Ordering::Relaxed) + } + + /// Clear the modified flag. + pub fn clear_modified(&self) { + self.modified.store(false, Ordering::Relaxed); + } + + /// Set the modified flag. + pub fn set_modified(&self) { + self.modified.store(true, Ordering::Relaxed); + } + + /// Check whether the given cluster’s refcount is 0. + pub fn is_zero(&self, index: usize) -> bool { + self.get(index) == 0 + } +} + +impl RefBlockWriteGuard<'_> { + /// # Safety + /// Caller must ensure there are no concurrent writers. + unsafe fn fetch_update_bitset( + bitset: &AtomicU8, + change: i64, + base_mask: u8, + shift: usize, + ) -> io::Result { + let mask = base_mask << shift; + + // load + store is OK without concurrent writers + let full = bitset.load(Ordering::Relaxed); + let old = (full & mask) >> shift; + let new = if change > 0 { + let change = change.try_into().map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidInput, + format!("Requested refcount change of {change} is too big for the image’s refcount width"), + ) + })?; + old.checked_add(change) + } else { + let change = (-change).try_into().map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidInput, + format!("Requested refcount change of {change} is too big for the image’s refcount width"), + ) + })?; + old.checked_sub(change) + }; + let new = new.ok_or_else(|| { + invalid_data(format!( + "Changing refcount from {old} by {change} would overflow" + )) + })?; + if new > base_mask { + return Err(invalid_data(format!( + "Changing refcount from {old} to {new} (by {change}) would overflow" + ))); + } + + let full = (full & !mask) | (new << shift); + bitset.store(full, Ordering::Relaxed); + Ok(old as u64) + } + + /// # Safety + /// Caller must ensure there are no concurrent writers. + unsafe fn fetch_update_full< + T, + L: FnOnce(&T) -> u64, + S: FnOnce(&T, u64) -> Result<(), TryFromIntError>, + >( + atomic: &T, + change: i64, + load: L, + store: S, + ) -> io::Result { + // load + store is OK without concurrent writers + let old = load(atomic); + + let new = if change > 0 { + old.checked_add(change as u64) + } else { + old.checked_sub(-change as u64) + }; + let new = new.ok_or_else(|| { + invalid_data(format!( + "Changing refcount from {old} by {change} would overflow" + )) + })?; + + store(atomic, new).map_err(|_| { + invalid_data(format!( + "Changing refcount from {old} to {new} (by {change}) would overflow" + )) + })?; + + Ok(old) + } + + /// Modify the given cluster’s refcount. + fn modify(&mut self, index: usize, change: i64) -> io::Result { + let result = match self.rb.refcount_order { + // refcount_bits == 1, 2, 4 + 0..=2 => { + let (index, mask, shift) = self.rb.sub_byte_refcount_access(index); + let raw_data_slice = unsafe { self.rb.raw_data.as_ref().into_typed_slice::() }; + let atomic = + unsafe { AtomicU8::from_ptr(&raw_data_slice[index] as *const u8 as *mut u8) }; + // Safe: `RefBlockWriteGuard` ensures there are no concurrent writers. 
+ unsafe { Self::fetch_update_bitset(atomic, change, mask, shift) } + } + + // refcount_bits == 8 + 3 => { + let raw_data_slice = unsafe { self.rb.raw_data.as_ref().into_typed_slice::() }; + let atomic = + unsafe { AtomicU8::from_ptr(&raw_data_slice[index] as *const u8 as *mut u8) }; + // Safe: `RefBlockWriteGuard` ensures there are no concurrent writers. + unsafe { + Self::fetch_update_full( + atomic, + change, + |a| a.load(Ordering::Relaxed) as u64, + |a, v| { + a.store(v.try_into()?, Ordering::Relaxed); + Ok(()) + }, + ) + } + } + + // refcount_bits == 16 + 4 => { + let raw_data_slice = unsafe { self.rb.raw_data.as_ref().into_typed_slice::() }; + let atomic = unsafe { + AtomicU16::from_ptr(&raw_data_slice[index] as *const u16 as *mut u16) + }; + unsafe { + Self::fetch_update_full( + atomic, + change, + |a| u16::from_be(a.load(Ordering::Relaxed)) as u64, + |a, v| { + a.store(u16::try_from(v)?.to_be(), Ordering::Relaxed); + Ok(()) + }, + ) + } + } + + // refcount_bits == 32 + 5 => { + let raw_data_slice = unsafe { self.rb.raw_data.as_ref().into_typed_slice::() }; + let atomic = unsafe { + AtomicU32::from_ptr(&raw_data_slice[index] as *const u32 as *mut u32) + }; + unsafe { + Self::fetch_update_full( + atomic, + change, + |a| u32::from_be(a.load(Ordering::Relaxed)) as u64, + |a, v| { + a.store(u32::try_from(v)?.to_be(), Ordering::Relaxed); + Ok(()) + }, + ) + } + } + + // refcount_bits == 64 + 6 => { + let raw_data_slice = unsafe { self.rb.raw_data.as_ref().into_typed_slice::() }; + let atomic = unsafe { + AtomicU64::from_ptr(&raw_data_slice[index] as *const u64 as *mut u64) + }; + unsafe { + Self::fetch_update_full( + atomic, + change, + |a| u64::from_be(a.load(Ordering::Relaxed)), + |a, v| { + a.store(v.to_be(), Ordering::Relaxed); + Ok(()) + }, + ) + } + } + + _ => unreachable!(), + }; + + let result = result?; + self.rb.modified.store(true, Ordering::Relaxed); + Ok(result) + } + + /// Increment the given cluster’s refcount. + /// + /// Returns the old value. + pub fn increment(&mut self, index: usize) -> io::Result { + self.modify(index, 1) + } + + /// Decrement the given cluster’s refcount. + /// + /// Returns the old value. + pub fn decrement(&mut self, index: usize) -> io::Result { + self.modify(index, -1) + } + + /// Check whether the given cluster’s refcount is 0. + pub fn is_zero(&self, index: usize) -> bool { + self.rb.is_zero(index) + } +} + +impl Drop for RefBlock { + fn drop(&mut self) { + if self.is_modified() { + error!( + "Refcount block dropped while modified; was the image closed before being flushed?" + ); + } + } +} + +/// Generic trait for qcow2 table entries (L1, L2, refcount table). +pub trait TableEntry +where + Self: Sized, +{ + /// Load the given raw value, checking it for validity. + fn try_from_plain(value: u64, header: &Header) -> io::Result; + + /// Return the contained raw value. + fn to_plain(&self) -> u64; +} + +/// Generic trait for qcow2 metadata tables (L1, L2, refcount table). +pub trait Table: Sized { + /// Internal type for each table entry. + type InternalEntry: TableEntry; + /// Externally visible type for each table entry. + type Entry: Copy; + /// User-readable struct name. + const NAME: &'static str; + /// Maximum allowable number of entries. + const MAX_ENTRIES: usize; + + /// Create a new table with the given contents + fn from_data(data: Box<[Self::InternalEntry]>, header: &Header) -> Self; + + /// Number of entries. + fn entries(&self) -> usize; + /// Get the given entry (as reference). 
+ fn get_ref(&self, index: usize) -> Option<&Self::InternalEntry>; + /// Get the given entry (copied). + fn get(&self, index: usize) -> Self::Entry; + /// Get this table’s (first) cluster in the image file. + fn get_cluster(&self) -> Option; + /// Get this table’s offset in the image file. + fn get_offset(&self) -> Option; + /// Set this table’s (first) cluster in the image file (for writing). + fn set_cluster(&mut self, cluster: HostCluster); + /// Remove the table’s association with any cluster in the image file. + fn unset_cluster(&mut self); + + /// Return log2 of the cluster size. + /// + /// All tables store this anyway. + fn cluster_bits(&self) -> u32; + + /// Check whether this table has been modified since it was last written. + fn is_modified(&self) -> bool; + /// Clear the modified flag. + fn clear_modified(&self); + /// Set the modified flag. + fn set_modified(&self); + + /// Table size in bytes. + fn byte_size(&self) -> usize { + self.entries() * size_of::() + } + + /// Number of clusters used by this table. + fn cluster_count(&self) -> ClusterCount { + ClusterCount::from_byte_size(self.byte_size() as u64, self.cluster_bits()) + } + + /// Load a table from the image file. + async fn load( + image: &S, + header: &Header, + cluster: HostCluster, + entries: usize, + ) -> io::Result { + let offset = cluster.offset(header.cluster_bits()); + + check_table( + Self::NAME, + offset.0, + entries, + size_of::(), + Self::MAX_ENTRIES, + header.cluster_size(), + )?; + + let byte_size = entries * size_of::(); + let mut buffer = IoBuffer::new(byte_size, cmp::max(image.mem_align(), size_of::()))?; + + image.read(&mut buffer, offset.0).await?; + + // Safe because `u64` is a plain type, and the alignment fits + let raw_table = unsafe { buffer.as_ref().into_typed_slice::() }; + + let mut table = Vec::::with_capacity(entries); + for be_value in raw_table { + table.push(Self::InternalEntry::try_from_plain( + u64::from_be(*be_value), + header, + )?) + } + + let mut table = Self::from_data(table.into_boxed_slice(), header); + table.set_cluster(cluster); + table.clear_modified(); + Ok(table) + } + + /// Write a table to the image file. + /// + /// Callers must ensure the table is copied, i.e. its refcount is 1. + async fn write(&self, image: &S) -> io::Result<()> { + let offset = self + .get_offset() + .ok_or_else(|| io::Error::other("Cannot write qcow2 metadata table, no offset set"))?; + + check_table( + Self::NAME, + offset.0, + self.entries(), + size_of::(), + Self::MAX_ENTRIES, + 1 << self.cluster_bits(), + )?; + + let byte_size = self.byte_size(); + let mut buffer = IoBuffer::new(byte_size, cmp::max(image.mem_align(), size_of::()))?; + + self.clear_modified(); + + // Safe because we have just allocated this, and it fits the alignment + let raw_table = unsafe { buffer.as_mut().into_typed_slice::() }; + for (i, be_value) in raw_table.iter_mut().enumerate() { + // 0 always works, that’s by design. + *be_value = self.get_ref(i).map(|e| e.to_plain()).unwrap_or(0).to_be(); + } + + if let Err(err) = image.write(&buffer, offset.0).await { + self.set_modified(); + return Err(err); + } + + Ok(()) + } + + /// Write at least the given single (modified) entry to the image file. + /// + /// Potentially writes more of the table, if alignment requirements ask for that. + async fn write_entry(&self, image: &S, index: usize) -> io::Result<()> { + // This alignment calculation code implicitly assumes that the cluster size is aligned to + // the storage’s request/memory alignment, but that is often fair. 
If that is not the + // case, there is not much we can do anyway. + let byte_size = self.byte_size(); + let power_of_two_up_to_byte_size = if byte_size.is_power_of_two() { + byte_size + } else { + ((byte_size + 1) / 2).next_power_of_two() + }; + let alignment = cmp::min( + power_of_two_up_to_byte_size, + cmp::max( + cmp::max(image.mem_align(), image.req_align()), + size_of::(), + ), + ); + let alignment_in_entries = alignment / size_of::(); + + let offset = self + .get_offset() + .ok_or_else(|| io::Error::other("Cannot write qcow2 metadata table, no offset set"))?; + + check_table( + Self::NAME, + offset.0, + self.entries(), + size_of::(), + Self::MAX_ENTRIES, + 1 << self.cluster_bits(), + )?; + + let mut buffer = IoBuffer::new(alignment, cmp::max(image.mem_align(), size_of::()))?; + + // Safe because we have just allocated this, and it fits the alignment + let raw_entries = unsafe { buffer.as_mut().into_typed_slice::() }; + let first_index = (index / alignment_in_entries) * alignment_in_entries; + #[allow(clippy::needless_range_loop)] + for i in 0..alignment_in_entries { + // 0 always works, that’s by design. + raw_entries[i] = self + .get_ref(first_index + i) + .map(|e| e.to_plain()) + .unwrap_or(0) + .to_be(); + } + + image + .write(&buffer, offset.0 + (first_index * size_of::()) as u64) + .await + } +} + +/// Check whether the given table offset/size is valid. +/// +/// Also works for refcount blocks (with cheating, because their entry size can be less than a +/// byte), which is why it is outside of [`Table`]. +fn check_table( + name: &str, + offset: u64, + entries: usize, + entry_size: usize, + max_entries: usize, + cluster_size: usize, +) -> io::Result<()> { + if entries > max_entries { + return Err(invalid_data(format!( + "{name} too big: {entries} > {max_entries}", + ))); + } + + if offset % (cluster_size as u64) != 0 { + return Err(invalid_data(format!("{name}: Unaligned offset: {offset}"))); + } + + let byte_size = entries + .checked_mul(entry_size) + .ok_or_else(|| invalid_data(format!("{name} size overflow: {entries} * {entry_size}")))?; + let end_offset = offset + .checked_add(byte_size as u64) + .ok_or_else(|| invalid_data(format!("{name} offset overflow: {offset} + {byte_size}")))?; + if end_offset > MAX_FILE_LENGTH { + return Err(invalid_data(format!( + "{name}: Invalid end offset: {end_offset} > {MAX_FILE_LENGTH}" + ))); + } + + Ok(()) +} diff --git a/src/imago/src/qcow2/mod.rs b/src/imago/src/qcow2/mod.rs new file mode 100644 index 00000000..9922e4df --- /dev/null +++ b/src/imago/src/qcow2/mod.rs @@ -0,0 +1,425 @@ +//! Qcow2 implementation. + +mod allocation; +mod cache; +mod compressed; +mod cow; +mod io_func; +mod mappings; +mod metadata; +#[cfg(feature = "sync-wrappers")] +mod sync_wrappers; +mod types; + +use crate::async_lru_cache::AsyncLruCache; +use crate::format::drivers::{FormatDriverInstance, Mapping}; +use crate::format::wrapped::WrappedFormat; +use crate::io_buffers::IoVectorMut; +use crate::misc_helpers::{invalid_data, ResultErrorContext}; +use crate::raw::Raw; +use crate::{FormatAccess, Storage, StorageExt, StorageOpenOptions}; +use allocation::Allocator; +use async_trait::async_trait; +use cache::L2CacheBackend; +use metadata::*; +use std::fmt::{self, Debug, Display, Formatter}; +use std::ops::Range; +use std::path::Path; +use std::sync::Arc; +use std::{cmp, io}; +use tokio::sync::{Mutex, RwLock}; +use types::*; + +/// Access qcow2 images. 
+/// +/// Allows access to qcow2 images (v2 and v3), referencing the following objects: +/// - Metadata storage object: The image file itself +/// - Data file (storage object): May be the image file itself, or an external data file +/// - Backing image `WrappedFormat`: A backing disk image in any format +#[must_use = "qcow2 images must be flushed before closing"] +pub struct Qcow2<S: Storage + 'static, F: WrappedFormat<S> + 'static = FormatAccess<S>> { + /// Image file (which contains the qcow2 metadata). + metadata: Arc<S>, + + /// Whether this image may be modified. + writable: bool, + + /// Whether the user explicitly assigned a data file storage object (or `None`). + storage_set: bool, + /// Data file storage object; will use `metadata` if `None`. + storage: Option<S>, + /// Whether the user explicitly assigned a backing file (or `None`). + backing_set: bool, + /// Backing image. + backing: Option<F>, + + /// Qcow2 header. + header: Arc<Header>
, + /// L1 table. + l1_table: RwLock, + + /// L2 table cache. + l2_cache: AsyncLruCache>, + + /// Allocates clusters. + /// + /// Is `None` for read-only images. + allocator: Option>>, +} + +impl + 'static> Qcow2 { + /// Opens a qcow2 file. + /// + /// `metadata` is the file containing the qcow2 metadata. If `writable` is not set, no + /// modifications are permitted. + /// + /// This will not open any other storage objects needed, i.e. no backing image, no external + /// data file. If you want to handle those manually, check whether an external data file is + /// needed via [`Qcow2::requires_external_data_file()`], and, if necessary, assign one via + /// [`Qcow2::set_data_file()`]; and assign a backing image via [`Qcow2::set_backing()`]. + /// + /// If you want to use the implicit references given in the image header, use + /// [`Qcow2::open_implicit_dependencies()`]. + pub async fn open_image(metadata: S, writable: bool) -> io::Result { + let header = Arc::new(Header::load(&metadata, writable).await?); + + let cb = header.cluster_bits(); + let l1_offset = header.l1_table_offset(); + let l1_cluster = l1_offset + .checked_cluster(cb) + .ok_or_else(|| invalid_data("Unaligned L1 table: {l1_offset}"))?; + + let l1_table = + L1Table::load(&metadata, &header, l1_cluster, header.l1_table_entries()).await?; + + let metadata = Arc::new(metadata); + + let allocator = if writable { + let allocator = Allocator::new(Arc::clone(&metadata), Arc::clone(&header)).await?; + Some(Mutex::new(allocator)) + } else { + None + }; + + let l2_cache_backend = L2CacheBackend::new(Arc::clone(&metadata), Arc::clone(&header)); + let l2_cache = AsyncLruCache::new(l2_cache_backend, 128); + + Ok(Qcow2 { + metadata, + + writable, + + storage_set: false, + storage: None, + backing_set: false, + backing: None, + + header, + l1_table: RwLock::new(l1_table), + + l2_cache, + allocator, + }) + } + + /// Open a qcow2 file at the given path. + /// + /// Open the file as a storage object via [`Storage::open()`], with write access if specified, + /// then pass that object to [`Qcow2::open_image()`]. + /// + /// This will not open any other storage objects needed, i.e. no backing image, no external + /// data file. If you want to handle those manually, check whether an external data file is + /// needed via [`Qcow2::requires_external_data_file()`], and, if necessary, assign one via + /// [`Qcow2::set_data_file()`]; and assign a backing image via [`Qcow2::set_backing()`]. + /// + /// If you want to use the implicit references given in the image header, use + /// [`Qcow2::open_implicit_dependencies()`]. + pub async fn open_path>(path: P, writable: bool) -> io::Result { + let storage_opts = StorageOpenOptions::new().write(writable).filename(path); + let metadata = S::open(storage_opts).await?; + Self::open_image(metadata, writable).await + } + + /// Check whether the given image file is a qcow2 file. + pub(crate) async fn probe(metadata: &S) -> io::Result<()> { + Header::load(metadata, true).await?; + Ok(()) + } + + /// Does this qcow2 image require an external data file? + /// + /// Conversely, if this is `false`, this image must not use an external data file. + pub fn requires_external_data_file(&self) -> bool { + self.header.external_data_file() + } + + /// External data file filename given in the image header. + /// + /// Note that even if an image requires an external data file, the header may not contain its + /// filename. In this case, an external data file must be set explicitly via + /// [`Qcow2::set_data_file()`]. 
+ pub fn implicit_external_data_file(&self) -> Option<&String> { + self.header.external_data_filename() + } + + /// Backing image filename given in the image header. + pub fn implicit_backing_file(&self) -> Option<&String> { + self.header.backing_filename() + } + + /// Backing image format given in the image header. + /// + /// If this is `None`, the backing image’s format should be probed. Note that this may be + /// dangerous if guests have write access to the backing file: Given a raw image, a guest can + /// write a qcow2 header into it, resulting in the image being opened as qcow2 the next time, + /// allowing the guest to read arbitrary files (e.g. by setting them as backing files). + pub fn implicit_backing_format(&self) -> Option<&String> { + self.header.backing_format() + } + + /// Assign the data file. + /// + /// `None` means using the same data storage for both metadata and data, which should be used + /// if [`Qcow2::requires_external_data_file()`] is `false`. + pub fn set_data_file(&mut self, file: Option) { + self.storage = file; + self.storage_set = true; + } + + /// Assign a backing image. + /// + /// `None` means no backing image, i.e. reading from unallocated areas will produce zeroes. + pub fn set_backing(&mut self, backing: Option) { + self.backing = backing; + self.backing_set = true; + } + + /// Get the data storage object. + /// + /// If we have an external data file, return that. Otherwise, return the image (metadata) + /// file. + fn storage(&self) -> &S { + self.storage.as_ref().unwrap_or(&self.metadata) + } + + /// Return the image’s implicit data file (as given in the image header). + async fn open_implicit_data_file(&self) -> io::Result> { + if !self.header.external_data_file() { + return Ok(None); + } + + let Some(filename) = self.header.external_data_filename() else { + return Err(io::Error::other( + "Image requires external data file, but no filename given", + )); + }; + + let absolute = self + .metadata + .resolve_relative_path(filename) + .err_context(|| format!("Cannot resolve external data file name {filename}"))?; + + let opts = StorageOpenOptions::new() + .write(true) + .filename(absolute.clone()); + + Ok(Some(S::open(opts).await.err_context(|| { + format!("External data file {absolute:?}") + })?)) + } + + /// Wrap `file` in the `Raw` format. Helper for [`Qcow2::implicit_backing_file()`]. + async fn open_raw_backing_file(&self, file: S) -> io::Result { + let raw = Raw::open_image(file, false).await?; + Ok(F::wrap(FormatAccess::new(raw))) + } + + /// Wrap `file` in the `Qcow2` format. Helper for [`Qcow2::implicit_backing_file()`]. + async fn open_qcow2_backing_file(&self, file: S) -> io::Result { + let mut qcow2 = Self::open_image(file, false).await?; + // Recursive, so needs to be boxed + Box::pin(qcow2.open_implicit_dependencies()).await?; + Ok(F::wrap(FormatAccess::new(qcow2))) + } + + /// Return the image’s implicit backing image (as given in the image header). 
+ async fn open_implicit_backing_file(&self) -> io::Result> { + let Some(filename) = self.header.backing_filename() else { + return Ok(None); + }; + + let absolute = self + .metadata + .resolve_relative_path(filename) + .err_context(|| format!("Cannot resolve backing file name {filename}"))?; + + let opts = StorageOpenOptions::new().filename(absolute.clone()); + let file = S::open(opts) + .await + .err_context(|| format!("Backing file {absolute:?}"))?; + + let result = match self.header.backing_format().map(|f| f.as_str()) { + Some("qcow2") => self.open_qcow2_backing_file(file).await.map(Some), + Some("raw") | Some("file") => self.open_raw_backing_file(file).await.map(Some), + + Some(fmt) => Err(io::Error::other(format!("Unknown backing format {fmt}"))), + + None => { + if Self::probe(&file).await.is_ok() { + self.open_qcow2_backing_file(file).await.map(Some) + } else { + self.open_raw_backing_file(file).await.map(Some) + } + } + }; + + result.err_context(|| format!("Backing file {absolute:?}")) + } + + /// Open all implicit dependencies. + /// + /// Qcow2 images have dependencies: + /// - The metadata file, which is the image file itself. + /// - The data file, which may be the same as the metadata file, or may be an external data + /// file. + /// - A backing disk image in any format. + /// + /// All of this can be set explicitly: + /// - The metadata file is always given explicitly to [`Qcow2::open_image()`]. + /// - The data file can be set via [`Qcow2::set_data_file()`]. + /// - The backing image can be set via [`Qcow2::set_backing()`]. + /// + /// But the image header can also provide “default” references to the data file and a backing + /// image, which we call *implicit* dependencies. This function opens all such implicit + /// dependencies if they have not been overridden with prior calls to + /// [`Qcow2::set_data_file()`] or [`Qcow2::set_backing()`], respectively. + pub async fn open_implicit_dependencies(&mut self) -> io::Result<()> { + if !self.storage_set { + self.storage = self.open_implicit_data_file().await?; + self.storage_set = true; + } + + if !self.backing_set { + self.backing = self.open_implicit_backing_file().await?; + self.backing_set = true; + } + + Ok(()) + } + + /// Require write access, i.e. return an error for read-only images. 
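For orientation, a typical open sequence built from the methods above might look as follows; this is a minimal sketch over a generic storage type `S` (assuming the default `FormatAccess` wrapper), with the eventual `FormatAccess` wrapping for I/O omitted:

// Sketch: open a qcow2 image read-only and let the header's implicit references
// supply the external data file and backing image, if any.
async fn open_readonly_example<S: Storage + 'static>(
    path: &std::path::Path,
) -> std::io::Result<Qcow2<S>> {
    let mut qcow2 = Qcow2::<S>::open_path(path, false).await?;
    qcow2.open_implicit_dependencies().await?;
    Ok(qcow2)
}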
+ fn need_writable(&self) -> io::Result<()> { + self.writable + .then_some(()) + .ok_or_else(|| io::Error::other("Image is read-only")) + } +} + +#[async_trait(?Send)] +impl> FormatDriverInstance for Qcow2 { + type Storage = S; + + fn size(&self) -> u64 { + self.header.size() + } + + fn collect_storage_dependencies(&self) -> Vec<&S> { + let mut v = self + .backing + .as_ref() + .map(|b| b.unwrap().collect_storage_dependencies()) + .unwrap_or_default(); + + v.push(&self.metadata); + if let Some(storage) = self.storage.as_ref() { + v.push(storage); + } + + v + } + + fn writable(&self) -> bool { + self.writable + } + + async fn get_mapping<'a>( + &'a self, + offset: u64, + max_length: u64, + ) -> io::Result<(Mapping<'a, S>, u64)> { + let length_until_eof = match self.header.size().checked_sub(offset) { + None | Some(0) => return Ok((Mapping::Eof, 0)), + Some(length) => length, + }; + + let max_length = cmp::min(max_length, length_until_eof); + let offset = GuestOffset(offset); + self.do_get_mapping(offset, max_length).await + } + + async fn ensure_data_mapping<'a>( + &'a self, + offset: u64, + length: u64, + overwrite: bool, + ) -> io::Result<(&'a S, u64, u64)> { + let length_until_eof = self.header.size().saturating_sub(offset); + if length_until_eof < length { + return Err(io::Error::other("Cannot allocate beyond the disk size")); + } + + if length == 0 { + return Ok((self.storage(), 0, 0)); + } + + self.need_writable()?; + let offset = GuestOffset(offset); + self.do_ensure_data_mapping(offset, length, overwrite).await + } + + async fn readv_special(&self, bufv: IoVectorMut<'_>, offset: u64) -> io::Result<()> { + let offset = GuestOffset(offset); + self.do_readv_special(bufv, offset).await + } + + async fn flush(&self) -> io::Result<()> { + self.l2_cache.flush().await?; + if let Some(allocator) = self.allocator.as_ref() { + allocator.lock().await.flush_rb_cache().await?; + } + + self.metadata.flush().await?; + if let Some(storage) = self.storage.as_ref() { + storage.flush().await?; + } + // Backing file is read-only, so need not be flushed from us. + Ok(()) + } + + async fn sync(&self) -> io::Result<()> { + self.metadata.sync().await?; + if let Some(storage) = self.storage.as_ref() { + storage.sync().await?; + } + // Backing file is read-only, so need not be synced from us. + Ok(()) + } +} + +impl> Debug for Qcow2 { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct("Qcow2") + .field("metadata", &self.metadata) + .field("storage_set", &self.storage_set) + .field("storage", &self.storage) + .field("backing_set", &self.backing_set) + .field("backing", &self.backing) + .finish() + } +} + +impl> Display for Qcow2 { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "qcow2[{}]", self.metadata) + } +} diff --git a/src/imago/src/qcow2/sync_wrappers.rs b/src/imago/src/qcow2/sync_wrappers.rs new file mode 100644 index 00000000..f78d41bf --- /dev/null +++ b/src/imago/src/qcow2/sync_wrappers.rs @@ -0,0 +1,32 @@ +//! Synchronous wrapper around qcow2 functions. + +use super::*; + +impl + 'static> Qcow2 { + /// Synchronous wrapper around [`Qcow2::open_image()`]. + /// + /// Runs the async function in an ephemeral tokio runtime. + pub fn open_image_sync(metadata: S, writable: bool) -> io::Result { + tokio::runtime::Builder::new_current_thread() + .build()? + .block_on(Self::open_image(metadata, writable)) + } + + /// Synchronous wrapper around [`Qcow2::open_path()`]. + /// + /// Runs the async function in an ephemeral tokio runtime. 
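+    ///
+    /// A minimal sketch (path, storage type, and type parameters are illustrative;
+    /// `StdFile` stands in for whichever storage driver is used):
+    ///
+    /// ```ignore
+    /// let mut img: Qcow2<StdFile> = Qcow2::open_path_sync("image.qcow2", false)?;
+    /// img.open_implicit_dependencies_sync()?;
+    /// ```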
+ pub fn open_path_sync>(path: P, writable: bool) -> io::Result { + tokio::runtime::Builder::new_current_thread() + .build()? + .block_on(Self::open_path(path, writable)) + } + + /// Synchronous wrapper around [`Qcow2::open_implicit_dependencies()`]. + /// + /// Runs the async function in an ephemeral tokio runtime. + pub fn open_implicit_dependencies_sync(&mut self) -> io::Result<()> { + tokio::runtime::Builder::new_current_thread() + .build()? + .block_on(self.open_implicit_dependencies()) + } +} diff --git a/src/imago/src/qcow2/types.rs b/src/imago/src/qcow2/types.rs new file mode 100644 index 00000000..a80f6ea7 --- /dev/null +++ b/src/imago/src/qcow2/types.rs @@ -0,0 +1,294 @@ +//! Helper types. +//! +//! Contains types like `GuestOffset` or `HostCluster`. This strong typing ensures there is no +//! confusion between what is what. + +use super::*; +use std::fmt::{self, Display, Formatter}; +use std::ops::{Add, AddAssign, Sub, SubAssign}; + +/// Guest offset. +#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub(super) struct GuestOffset(pub u64); + +/// Guest cluster index. +#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub(super) struct GuestCluster(pub u64); + +/// Host cluster offset. +#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub(super) struct HostOffset(pub u64); + +/// Host cluster index. +#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub(super) struct HostCluster(pub u64); + +/// Cluster count. +#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)] +pub(super) struct ClusterCount(pub u64); + +impl GuestOffset { + /// Return the offset from the start of the containing guest clusters. + pub fn in_cluster_offset(self, cluster_bits: u32) -> usize { + (self.0 % (1 << cluster_bits)) as usize + } + + /// Return the containing cluster’s index in its L2 table. + pub fn l2_index(self, cluster_bits: u32) -> usize { + self.cluster(cluster_bits).l2_index(cluster_bits) + } + + /// Return the containing cluster’s L2 table’s index in the L1 table. + pub fn l1_index(self, cluster_bits: u32) -> usize { + self.cluster(cluster_bits).l1_index(cluster_bits) + } + + /// Return the containing cluster’s index. + pub fn cluster(self, cluster_bits: u32) -> GuestCluster { + GuestCluster(self.0 >> cluster_bits) + } + + /// How many bytes remain in this cluster after this offset. + pub fn remaining_in_cluster(self, cluster_bits: u32) -> u64 { + ((1 << cluster_bits) - self.in_cluster_offset(cluster_bits)) as u64 + } + + /// How many bytes remain in this L2 table after this offset. + pub fn remaining_in_l2_table(self, cluster_bits: u32) -> u64 { + // See `Header::l2_entries()` + let l2_entries = 1 << (cluster_bits - 3); + let after_this = ((l2_entries - (self.l2_index(cluster_bits) + 1)) as u64) << cluster_bits; + self.remaining_in_cluster(cluster_bits) + after_this + } +} + +impl GuestCluster { + /// Return this cluster’s offset. + pub fn offset(self, cluster_bits: u32) -> GuestOffset { + GuestOffset(self.0 << cluster_bits) + } + + /// Return this cluster’s index in its L2 table. + pub fn l2_index(self, cluster_bits: u32) -> usize { + // See `Header::l2_entries()` + let l2_entries = 1 << (cluster_bits - 3); + (self.0 % l2_entries) as usize + } + + /// Return this cluster’s L2 table’s index in the L1 table. + pub fn l1_index(self, cluster_bits: u32) -> usize { + let l2_entries_shift = cluster_bits - 3; + (self.0 >> l2_entries_shift) as usize + } + + /// Return the cluster at the given L1 and L2 table indices. 
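+    ///
+    /// For example, with 64 KiB clusters (`cluster_bits == 16`) there are
+    /// 2^(16 - 3) = 8192 L2 entries per table, so L1 index 1 and L2 index 5 yield
+    /// guest cluster 8197.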
+ pub fn from_l1_l2_indices(l1_index: usize, l2_index: usize, cluster_bits: u32) -> Self { + let l2_entries_shift = cluster_bits - 3; + GuestCluster(((l1_index as u64) << l2_entries_shift) + l2_index as u64) + } + + /// Return the next cluster in this L2 table, if any. + /// + /// Return `None` if this is the last cluster in this L2 table. + pub fn next_in_l2(self, cluster_bits: u32) -> Option { + // See `Header::l2_entries()` + let l2_entries = 1 << (cluster_bits - 3); + let l1_index = self.l1_index(cluster_bits); + let l2_index = self.l2_index(cluster_bits); + let l2_index = l2_index.checked_add(1)?; + if l2_index >= l2_entries { + None + } else { + Some(GuestCluster::from_l1_l2_indices( + l1_index, + l2_index, + cluster_bits, + )) + } + } + + /// Return the first cluster in the next L2 table. + pub fn first_in_next_l2(self, cluster_bits: u32) -> GuestCluster { + let l2_entries = 1 << (cluster_bits - 3); + GuestCluster((self.0 + 1).next_multiple_of(l2_entries)) + } +} + +impl HostOffset { + /// Return the offset from the start of the containing host cluster. + pub fn in_cluster_offset(self, cluster_bits: u32) -> usize { + (self.0 % (1 << cluster_bits)) as usize + } + + /// Return the containing cluster’s index. + pub fn cluster(self, cluster_bits: u32) -> HostCluster { + HostCluster(self.0 >> cluster_bits) + } + + /// If this offset points to the start of a cluster, get its index. + /// + /// If this offset points inside of a cluster, return `None`. As oposed to just `cluster()`, + /// this will not discard information: `self.checked_cluster(cb).unwrap().offset() == self`, + /// because there is no in-cluster offset that could be lost. + pub fn checked_cluster(self, cluster_bits: u32) -> Option { + (self.in_cluster_offset(cluster_bits) == 0).then_some(self.cluster(cluster_bits)) + } +} + +impl HostCluster { + /// Return this cluster’s offset. + pub fn offset(self, cluster_bits: u32) -> HostOffset { + HostOffset(self.0 << cluster_bits) + } + + /// Get this cluster’s index in its refcount block. + pub fn rb_index(self, rb_bits: u32) -> usize { + let rb_entries = 1 << rb_bits; + (self.0 % rb_entries) as usize + } + + /// Get this cluster’s refcount block’s index in the refcount table. + pub fn rt_index(self, rb_bits: u32) -> usize { + (self.0 >> rb_bits) as usize + } + + /// Get both the reftable and refblock indices for this cluster. + pub fn rt_rb_indices(self, rb_bits: u32) -> (usize, usize) { + (self.rt_index(rb_bits), self.rb_index(rb_bits)) + } + + /// Construct a cluster index from its reftable and refblock indices. + pub fn from_ref_indices(rt_index: usize, rb_index: usize, rb_bits: u32) -> Self { + HostCluster(((rt_index as u64) << rb_bits) + rb_index as u64) + } + + /// Returns the host offset corresponding to `guest_offset`. + /// + /// Assuming `guest_offset.cluster()` is mapped to `self`, return the exact host offset + /// matching `guest_offset`. + /// + /// Same as `self.offset(cb) + guest_offset.in_cluster_offset`. + pub fn relative_offset(self, guest_offset: GuestOffset, cluster_bits: u32) -> HostOffset { + self.offset(cluster_bits) + guest_offset.in_cluster_offset(cluster_bits) as u64 + } +} + +impl ClusterCount { + /// Get how many clusters are required to cover `byte_size`. + /// + /// This rounds up. + pub fn from_byte_size(byte_size: u64, cluster_bits: u32) -> Self { + ClusterCount(byte_size.div_ceil(1 << cluster_bits)) + } + + /// Return the full byte size of this many clusters. 
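+    ///
+    /// For example, with 64 KiB clusters (`cluster_bits == 16`),
+    /// `ClusterCount::from_byte_size(65537, 16)` rounds up to 2 clusters, and
+    /// `ClusterCount(3).byte_size(16)` is 3 * 64 KiB = 196608 bytes.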
+ pub fn byte_size(self, cluster_bits: u32) -> u64 { + self.0 << cluster_bits + } +} + +impl Add for HostCluster { + type Output = Self; + + fn add(self, rhs: ClusterCount) -> Self { + HostCluster(self.0 + rhs.0) + } +} + +impl AddAssign for HostCluster { + fn add_assign(&mut self, rhs: ClusterCount) { + self.0 += rhs.0; + } +} + +impl Sub for HostCluster { + type Output = Self; + + fn sub(self, rhs: ClusterCount) -> Self { + HostCluster(self.0 - rhs.0) + } +} + +impl SubAssign for HostCluster { + fn sub_assign(&mut self, rhs: ClusterCount) { + self.0 -= rhs.0; + } +} + +impl Sub for HostCluster { + type Output = ClusterCount; + + fn sub(self, rhs: Self) -> ClusterCount { + ClusterCount(self.0 - rhs.0) + } +} + +impl Add for ClusterCount { + type Output = Self; + + fn add(self, rhs: ClusterCount) -> Self { + ClusterCount(self.0 + rhs.0) + } +} + +impl AddAssign for ClusterCount { + fn add_assign(&mut self, rhs: ClusterCount) { + self.0 += rhs.0; + } +} + +impl Sub for ClusterCount { + type Output = Self; + + fn sub(self, rhs: ClusterCount) -> Self { + ClusterCount(self.0 - rhs.0) + } +} + +impl SubAssign for ClusterCount { + fn sub_assign(&mut self, rhs: ClusterCount) { + self.0 -= rhs.0; + } +} + +impl Add for HostOffset { + type Output = Self; + + fn add(self, rhs: u64) -> Self { + HostOffset(self.0 + rhs) + } +} + +impl Sub for HostOffset { + type Output = Self; + + fn sub(self, rhs: u64) -> Self { + HostOffset(self.0 - rhs) + } +} + +impl Sub for HostOffset { + type Output = u64; + + fn sub(self, rhs: Self) -> u64 { + self.0 - rhs.0 + } +} + +impl Display for GuestOffset { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "0x{:x}", self.0) + } +} + +impl Display for HostOffset { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "0x{:x}", self.0) + } +} + +impl Display for ClusterCount { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.0) + } +} diff --git a/src/imago/src/raw.rs b/src/imago/src/raw.rs new file mode 100644 index 00000000..ed1a2531 --- /dev/null +++ b/src/imago/src/raw.rs @@ -0,0 +1,128 @@ +//! Access generic files as images. +//! +//! Allows accessing generic storage objects (`Storage`) as images (i.e. `FormatAccess`). + +use crate::format::drivers::{FormatDriverInstance, Mapping}; +use crate::{Storage, StorageOpenOptions}; +use async_trait::async_trait; +use std::fmt::{self, Display, Formatter}; +use std::io; +use std::path::Path; + +/// Wraps a storage object without any translation. +#[derive(Debug)] +pub struct Raw { + /// Wrapped storage object. + inner: S, + + /// Whether this image may be modified. + writable: bool, + + /// Disk size, which is the file size when this object was created. + size: u64, +} + +impl Raw { + /// Wrap `inner`, allowing it to be used as a disk image in raw format. + pub async fn open_image(inner: S, writable: bool) -> io::Result { + let size = inner.size()?; + Ok(Raw { + inner, + writable, + size, + }) + } + + /// Open the given path as a storage object, and wrap it in `Raw`. + pub async fn open_path>(path: P, writable: bool) -> io::Result { + let storage_opts = StorageOpenOptions::new().write(writable).filename(path); + let inner = S::open(storage_opts).await?; + Self::open_image(inner, writable).await + } + + /// Wrap `inner`, allowing it to be used as a disk image in raw format. 
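+    ///
+    /// Unlike the other `*_sync` wrappers, no tokio runtime is needed here, because
+    /// only `inner.size()` is queried.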
+ #[cfg(feature = "sync-wrappers")] + pub fn open_image_sync(inner: S, writable: bool) -> io::Result { + let size = inner.size()?; + Ok(Raw { + inner, + writable, + size, + }) + } + + /// Synchronous wrapper around [`Raw::open_path()`]. + pub fn open_path_sync>(path: P, writable: bool) -> io::Result { + tokio::runtime::Builder::new_current_thread() + .build()? + .block_on(Self::open_path(path, writable)) + } +} + +#[async_trait(?Send)] +impl FormatDriverInstance for Raw { + type Storage = S; + + fn size(&self) -> u64 { + self.size + } + + fn collect_storage_dependencies(&self) -> Vec<&S> { + vec![&self.inner] + } + + fn writable(&self) -> bool { + self.writable + } + + async fn get_mapping<'a>( + &'a self, + offset: u64, + max_length: u64, + ) -> io::Result<(Mapping<'a, S>, u64)> { + let remaining = match self.size.checked_sub(offset) { + None | Some(0) => return Ok((Mapping::Eof, 0)), + Some(remaining) => remaining, + }; + + Ok(( + Mapping::Raw { + storage: &self.inner, + offset, + writable: true, + }, + std::cmp::min(max_length, remaining), + )) + } + + async fn ensure_data_mapping<'a>( + &'a self, + offset: u64, + length: u64, + _overwrite: bool, + ) -> io::Result<(&'a S, u64, u64)> { + let Some(remaining) = self.size.checked_sub(offset) else { + return Err(io::Error::other("Cannot allocate past the end of file")); + }; + if length > remaining { + return Err(io::Error::other("Cannot allocate past the end of file")); + } + + Ok((&self.inner, offset, length)) + } + + async fn flush(&self) -> io::Result<()> { + // No internal buffers to flush + self.inner.flush().await + } + + async fn sync(&self) -> io::Result<()> { + self.inner.sync().await + } +} + +impl Display for Raw { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "raw[{}]", self.inner) + } +} diff --git a/src/imago/src/storage/drivers.rs b/src/imago/src/storage/drivers.rs new file mode 100644 index 00000000..1edd5377 --- /dev/null +++ b/src/imago/src/storage/drivers.rs @@ -0,0 +1,184 @@ +//! Internal functionality for storage drivers. + +use crate::misc_helpers::Overlaps; +use crate::vector_select::FutureVector; +use std::ops::Range; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; +use tokio::sync::oneshot; + +/// Helper object for the [`StorageExt`](crate::StorageExt) implementation. +/// +/// State such as write blockers needs to be kept somewhere, and instead of introducing a wrapper +/// (that might be bypassed), we store it directly in the [`Storage`](crate::Storage) objects so it +/// cannot be bypassed (at least when using the [`StorageExt`](crate::StorageExt) methods). +#[derive(Debug, Default)] +pub struct CommonStorageHelper { + /// Current in-flight write that allow concurrent writes to the same region. + /// + /// Normal non-async RwLock, so do not await while locked! + weak_write_blockers: std::sync::RwLock, + + /// Current in-flight write that do not allow concurrent writes to the same region. + strong_write_blockers: std::sync::RwLock, +} + +/// A list of ranges blocked for some kind of concurrent access. +/// +/// Depending on the use, some will block all concurrent access (i.e. serializing writes will block +/// both serializing and non-serializing writes (strong blockers)), while others will only block a +/// subset (non-serializing writes will only block serializing writes (weak blockers)). +#[derive(Debug, Default)] +struct RangeBlockedList { + /// The list of ranges. 
+ /// + /// Serializing writes (strong write blockers) are supposed to be rare, so it is important that + /// entering and removing items into/from this list is cheap, not that iterating it is. + blocked: Vec>, +} + +/// A range blocked for some kind of concurrent access. +#[derive(Debug)] +struct RangeBlocked { + /// The range. + range: Range, + + /// List of requests awaiting the range to become unblocked. + /// + /// When the corresponding `RangeBlockedGuard` is dropped, these will all be awoken (via + /// `oneshot::Sender::send(())`). + /// + /// Normal non-async mutex, so do not await while locked! + waitlist: std::sync::Mutex>>, + + /// Index in the corresponding `RangeBlockedList.blocked` list, so it can be dropped quickly. + /// + /// (When the corresponding `RangeBlockedGuard` is dropped, this entry is swap-removed from the + /// `blocked` list, and the other entry taking its place has its `index` updated.) + /// + /// Only access under `blocked` lock! + index: AtomicUsize, +} + +/// Keeps a `RangeBlocked` alive. +/// +/// When dropped, removes the `RangeBlocked` from its list, and wakes all requests in the `waitlist`. +#[derive(Debug)] +pub struct RangeBlockedGuard<'a> { + /// List where this blocker resides. + list: &'a std::sync::RwLock, + + /// `Option`, so `drop()` can `take()` it and unwrap the `Arc`. + /// + /// Consequently, do not clone: Must have refcount 1 when dropped. (The only clone must be in + /// `self.list.blocked`, under index `self.block.index`.) + block: Option>, +} + +impl CommonStorageHelper { + /// Await concurrent strong write blockers for the given range. + /// + /// Strong write blockers are set up for writes that must not be intersected by any other + /// write. Await such intersecting concurrent write requests, and return a guard that will + /// delay such new writes until the guard is dropped. + pub async fn weak_write_blocker(&self, range: Range) -> RangeBlockedGuard<'_> { + let mut intersecting = FutureVector::new(); + + let range_block = { + // Acquire write lock first + let mut weak = self.weak_write_blockers.write().unwrap(); + let strong = self.strong_write_blockers.read().unwrap(); + + strong.collect_intersecting_await_futures(&range, &mut intersecting); + weak.block(range) + }; + + intersecting.discarding_join().await.unwrap(); + + RangeBlockedGuard { + list: &self.weak_write_blockers, + block: Some(range_block), + } + } + + /// Await any concurrent write request for the given range. + /// + /// Block the given range for any concurrent write requests until the returned guard object is + /// dropped. Existing requests are awaited, and new ones will be delayed. + pub async fn strong_write_blocker(&self, range: Range) -> RangeBlockedGuard<'_> { + let mut intersecting = FutureVector::new(); + + let range_block = { + // Acquire write lock first + let mut strong = self.strong_write_blockers.write().unwrap(); + let weak = self.weak_write_blockers.read().unwrap(); + + weak.collect_intersecting_await_futures(&range, &mut intersecting); + strong.collect_intersecting_await_futures(&range, &mut intersecting); + strong.block(range) + }; + + intersecting.discarding_join().await.unwrap(); + + RangeBlockedGuard { + list: &self.strong_write_blockers, + block: Some(range_block), + } + } +} + +impl RangeBlockedList { + /// Collects futures to await intersecting request. + /// + /// Adds a future to `future_vector` for every intersecting request; awaiting that future will + /// await the request. 
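+    ///
+    /// Each such future is a `oneshot::Receiver` whose sender sits on the blocker's
+    /// waitlist, so it resolves once the corresponding guard is dropped.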
+ fn collect_intersecting_await_futures( + &self, + check_range: &Range, + future_vector: &mut FutureVector<(), oneshot::error::RecvError, oneshot::Receiver<()>>, + ) { + for range_block in self.blocked.iter() { + if range_block.range.overlaps(check_range) { + let (s, r) = oneshot::channel::<()>(); + range_block.waitlist.lock().unwrap().push(s); + future_vector.push(r); + } + } + } + + /// Enter a new blocked range into the list. + /// + /// This only blocks new requests, old requests must separately be awaited by awaiting all + /// futures returned by `collect_intersecting_await_futures()`. + fn block(&mut self, range: Range) -> Arc { + let range_block = Arc::new(RangeBlocked { + range, + waitlist: Default::default(), + index: self.blocked.len().into(), + }); + self.blocked.push(Arc::clone(&range_block)); + range_block + } +} + +impl Drop for RangeBlockedGuard<'_> { + fn drop(&mut self) { + let block = self.block.take().unwrap(); + + { + let mut list = self.list.write().unwrap(); + let i = block.index.load(Ordering::Relaxed); + let removed = list.blocked.swap_remove(i); + debug_assert!(Arc::ptr_eq(&removed, &block)); + if let Some(block) = list.blocked.get(i) { + block.index.store(i, Ordering::Relaxed); + } + } + + let block = Arc::into_inner(block).unwrap(); + let waitlist = block.waitlist.into_inner().unwrap(); + for waiting in waitlist { + waiting.send(()).unwrap(); + } + } +} diff --git a/src/imago/src/storage/ext.rs b/src/imago/src/storage/ext.rs new file mode 100644 index 00000000..54b4ab8b --- /dev/null +++ b/src/imago/src/storage/ext.rs @@ -0,0 +1,338 @@ +//! Provides the `StorageExt` struct for more convenient access. +//! +//! `Storage` is provided by the driver, so is supposed to be simple and only contain what’s +//! necessary. `StorageExt` builds on that to provide more convenient access, e.g. allows +//! unaligned requests and provides write serialization. + +use super::drivers::RangeBlockedGuard; +use crate::io_buffers::{IoBuffer, IoVector, IoVectorMut, IoVectorTrait}; +use crate::Storage; +use std::ops::Range; +use std::{cmp, io}; +use tracing::trace; + +/// Helper methods for storage objects. +/// +/// Provides some more convenient methods for accessing storage objects. +pub trait StorageExt: Storage { + /// Read data at `offset` into `bufv`. + /// + /// Reads until `bufv` is filled completely, i.e. will not do short reads. When reaching the + /// end of file, the rest of `bufv` is filled with 0. + /// + /// Checks alignment. If anything does not meet the requirements, enforces it (using ephemeral + /// bounce buffers). + #[allow(async_fn_in_trait)] // No need for Send + async fn readv(&self, bufv: IoVectorMut<'_>, offset: u64) -> io::Result<()>; + + /// Write data from `bufv` to `offset`. + /// + /// Writes all data from `bufv`, i.e. will not do short writes. When reaching the end of file, + /// it is grown as necessary so that the new end of file will be at `offset + bufv.len()`. + /// + /// If growing is not possible, expect writes beyond the end of file (even if only partially) + /// to fail. + /// + /// Checks alignment. If anything does not meet the requirements, enforces it using bounce + /// buffers and a read-modify-write cycle that blocks concurrent writes to the affected area. + #[allow(async_fn_in_trait)] // No need for Send + async fn writev(&self, bufv: IoVector<'_>, offset: u64) -> io::Result<()>; + + /// Read data at `offset` into `buf`. + /// + /// Reads until `buf` is filled completely, i.e. will not do short reads. 
When reaching the + /// end of file, the rest of `buf` is filled with 0. + /// + /// Checks alignment. If anything does not meet the requirements, enforces it (using ephemeral + /// bounce buffers). + #[allow(async_fn_in_trait)] // No need for Send + async fn read(&self, buf: impl Into>, offset: u64) -> io::Result<()>; + + /// Write data from `buf` to `offset`. + /// + /// Writes all data from `buf`, i.e. will not do short writes. When reaching the end of file, + /// it is grown as necessary so that the new end of file will be at `offset + buf.len()`. + /// + /// If growing is not possible, expect writes beyond the end of file (even if only partially) + /// to fail. + /// + /// Checks alignment. If anything does not meet the requirements, enforces it using bounce + /// buffers and a read-modify-write cycle that blocks concurrent writes to the affected area. + #[allow(async_fn_in_trait)] // No need for Send + async fn write(&self, buf: impl Into>, offset: u64) -> io::Result<()>; + + /// Ensure the given range reads back as zeroes. + #[allow(async_fn_in_trait)] // No need for Send + async fn write_zeroes(&self, offset: u64, length: u64) -> io::Result<()>; + + /// Discard the given range, with undefined contents when read back. + /// + /// Tell the storage layer this range is no longer needed and need not be backed by actual + /// storage. When read back, the data read will be undefined, i.e. not necessarily zeroes. + #[allow(async_fn_in_trait)] // No need for Send + async fn discard(&self, offset: u64, length: u64) -> io::Result<()>; + + /// Await concurrent strong write blockers for the given range. + /// + /// Strong write blockers are set up for writes that must not be intersected by any other + /// write. Await such intersecting concurrent write requests, and return a guard that will + /// delay such new writes until the guard is dropped. + #[allow(async_fn_in_trait)] // No need for Send + async fn weak_write_blocker(&self, range: Range) -> RangeBlockedGuard<'_>; + + /// Await any concurrent write request for the given range. + /// + /// Block the given range for any concurrent write requests until the returned guard object is + /// dropped. Existing requests are awaited, and new ones will be delayed. + #[allow(async_fn_in_trait)] // No need for Send + async fn strong_write_blocker(&self, range: Range) -> RangeBlockedGuard<'_>; +} + +impl StorageExt for S { + async fn readv(&self, mut bufv: IoVectorMut<'_>, offset: u64) -> io::Result<()> { + if bufv.is_empty() { + return Ok(()); + } + + let mem_align = self.mem_align(); + let req_align = self.req_align(); + + if is_aligned(&bufv, offset, mem_align, req_align) { + // Safe: Alignment checked + return unsafe { self.pure_readv(bufv, offset) }.await; + } + + trace!( + "Unaligned read: 0x{:x} + {} (size: {:#x})", + offset, + bufv.len(), + self.size().unwrap() + ); + + let req_align_mask = req_align as u64 - 1; + // Length must be aligned to both memory and request alignments + let len_align_mask = req_align_mask | (mem_align as u64 - 1); + debug_assert!((len_align_mask + 1) % (req_align as u64) == 0); + + let unpadded_end = offset + bufv.len(); + let padded_offset = offset & !req_align_mask; + // This will over-align at the end of file (aligning to exactly the end of file would be + // sufficient), but it is easier this way. 
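+        // (Illustrative: with `req_align == 512`, a read at offset 520 of length 100
+        // is padded to at least the byte range [512, 1024); memory alignment can widen
+        // the length further.)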
+ let padded_end = (unpadded_end + req_align_mask) & !req_align_mask; + // Now also align to memory alignment + let padded_len = (padded_end - padded_offset + len_align_mask) & !(len_align_mask); + let padded_end = padded_offset + padded_len; + + let padded_len: usize = (padded_end - padded_offset) + .try_into() + .map_err(|e| io::Error::other(format!("Cannot realign read: {e}")))?; + + trace!("Padded read: {padded_offset:#x} + {padded_len}"); + + let mut bounce_buf = IoBuffer::new(padded_len, mem_align)?; + + // Safe: Alignment enforced + unsafe { self.pure_readv(bounce_buf.as_mut().into(), padded_offset) }.await?; + + let in_buf_ofs = (offset - padded_offset) as usize; + // Must fit in `usize` because `padded_len: usize` + let in_buf_end = (unpadded_end - padded_offset) as usize; + + bufv.copy_from_slice(bounce_buf.as_ref_range(in_buf_ofs..in_buf_end).into_slice()); + + Ok(()) + } + + async fn writev(&self, bufv: IoVector<'_>, offset: u64) -> io::Result<()> { + if bufv.is_empty() { + return Ok(()); + } + + let mem_align = self.mem_align(); + let req_align = self.req_align(); + + if is_aligned(&bufv, offset, mem_align, req_align) { + let _sw_guard = self.weak_write_blocker(offset..(offset + bufv.len())).await; + + // Safe: Alignment checked, and weak write blocker set up + return unsafe { self.pure_writev(bufv, offset) }.await; + } + + trace!( + "Unaligned write: {:#x} + {} (size: {:#x})", + offset, + bufv.len(), + self.size().unwrap() + ); + + let req_align_mask = req_align - 1; + // Length must be aligned to both memory and request alignments + let len_align_mask = req_align_mask | (mem_align - 1); + let len_align = req_align_mask + 1; + debug_assert!(len_align % req_align == 0); + + let unpadded_end = offset + bufv.len(); + let padded_offset = offset & !(req_align_mask as u64); + // This will over-align at the end of file (aligning to exactly the end of file would be + // sufficient), but it is easier this way. Small TODO, as this will indeed increase the + // file length (which the over-alignment in `unaligned_readv()` does not). + let padded_end = (unpadded_end + req_align_mask as u64) & !(req_align_mask as u64); + // Now also align to memory alignment + let padded_len = + (padded_end - padded_offset + len_align_mask as u64) & !(len_align_mask as u64); + let padded_end = padded_offset + padded_len; + + let padded_len: usize = (padded_end - padded_offset) + .try_into() + .map_err(|e| io::Error::other(format!("Cannot realign write: {e}")))?; + + trace!("Padded write: {padded_offset:#x} + {padded_len}"); + + let mut bounce_buf = IoBuffer::new(padded_len, mem_align)?; + assert!(padded_len >= len_align && padded_len & len_align_mask == 0); + + // For the strong blocker, just the RMW regions (head and tail) would be enough. However, + // we don’t expect any concurrent writes to the non-RMW (pure write) regions (it is + // unlikely that the guest would write to the same area twice concurrently), so we don’t + // need to optimize for it. On the other hand, writes to the RMW regions are likely + // (adjacent writes), so those will be blocked either way. + // Instating fewer blockers makes them less expensive to check, though. 
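+        // (Illustrative: with `req_align == 4096`, a write at offset 5000 of length 100
+        // becomes a read-modify-write over at least [4096, 8192).)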
+ let _sw_guard = self.strong_write_blocker(padded_offset..padded_end).await; + + let in_buf_ofs = (offset - padded_offset) as usize; + // Must fit in `usize` because `padded_len: usize` + let in_buf_end = (unpadded_end - padded_offset) as usize; + + // RMW part 1: Read + + let head_len = in_buf_ofs; + let aligned_head_len = (head_len + len_align_mask) & !len_align_mask; + + let tail_len = padded_len - in_buf_end; + let aligned_tail_len = (tail_len + len_align_mask) & !len_align_mask; + + if aligned_head_len + aligned_tail_len == padded_len { + // Must read the whole bounce buffer + // Safe: Alignment enforced + unsafe { self.pure_readv(bounce_buf.as_mut().into(), padded_offset) }.await?; + } else { + if aligned_head_len > 0 { + let head_bufv = bounce_buf.as_mut_range(0..aligned_head_len).into(); + // Safe: Alignment enforced + unsafe { self.pure_readv(head_bufv, padded_offset) }.await?; + } + if aligned_tail_len > 0 { + let tail_start = padded_len - aligned_tail_len; + let tail_bufv = bounce_buf.as_mut_range(tail_start..padded_len).into(); + // Safe: Alignment enforced + unsafe { self.pure_readv(tail_bufv, padded_offset + tail_start as u64) }.await?; + } + } + + // RMW part 2: Modify + bufv.copy_into_slice(bounce_buf.as_mut_range(in_buf_ofs..in_buf_end).into_slice()); + + // RMW part 3: Write + // Safe: Alignment enforced, and strong write blocker set up + unsafe { self.pure_writev(bounce_buf.as_ref().into(), padded_offset) }.await + } + + async fn read(&self, buf: impl Into>, offset: u64) -> io::Result<()> { + self.readv(buf.into(), offset).await + } + + async fn write(&self, buf: impl Into>, offset: u64) -> io::Result<()> { + self.writev(buf.into(), offset).await + } + + async fn write_zeroes(&self, offset: u64, length: u64) -> io::Result<()> { + let zero_align = self.zero_align(); + debug_assert!(zero_align.is_power_of_two()); + let align_mask = zero_align as u64 - 1; + + let unaligned_end = offset + .checked_add(length) + .ok_or_else(|| io::Error::other("Zero-write wrap-around"))?; + let aligned_offset = (offset + align_mask) & !align_mask; + let aligned_end = unaligned_end & !align_mask; + + if aligned_end > aligned_offset { + let _sw_guard = self.weak_write_blocker(aligned_offset..aligned_end).await; + // Safe: Alignment checked, and weak write blocker set up + unsafe { self.pure_write_zeroes(aligned_offset, aligned_end - aligned_offset) }.await?; + } + + let zero_buf = if aligned_offset > offset || aligned_end < unaligned_end { + let mut buf = IoBuffer::new( + cmp::max(aligned_offset - offset, unaligned_end - aligned_end) as usize, + self.mem_align(), + )?; + buf.as_mut().into_slice().fill(0); + Some(buf) + } else { + None + }; + + if aligned_offset > offset { + let buf = zero_buf + .as_ref() + .unwrap() + .as_ref_range(0..((aligned_offset - offset) as usize)); + self.write(buf, offset).await?; + } + if aligned_end < unaligned_end { + let buf = zero_buf + .as_ref() + .unwrap() + .as_ref_range(0..((unaligned_end - aligned_end) as usize)); + self.write(buf, aligned_end).await?; + } + + Ok(()) + } + + async fn discard(&self, offset: u64, length: u64) -> io::Result<()> { + let discard_align = self.discard_align(); + debug_assert!(discard_align.is_power_of_two()); + let align_mask = discard_align as u64 - 1; + + let unaligned_end = offset + .checked_add(length) + .ok_or_else(|| io::Error::other("Discard wrap-around"))?; + let aligned_offset = (offset + align_mask) & !align_mask; + let aligned_end = unaligned_end & !align_mask; + + if aligned_end > aligned_offset { + let _sw_guard = 
self.weak_write_blocker(offset..(offset + length)).await; + // Safe: Alignment checked, and weak write blocker set up + unsafe { self.pure_discard(offset, length) }.await?; + } + + // Nothing to do for the unaligned part; discarding is always just advisory. + + Ok(()) + } + + async fn weak_write_blocker(&self, range: Range) -> RangeBlockedGuard<'_> { + self.get_storage_helper().weak_write_blocker(range).await + } + + async fn strong_write_blocker(&self, range: Range) -> RangeBlockedGuard<'_> { + self.get_storage_helper().strong_write_blocker(range).await + } +} + +/// Check whether the given request is aligned. +fn is_aligned(bufv: &V, offset: u64, mem_align: usize, req_align: usize) -> bool { + debug_assert!(mem_align.is_power_of_two() && req_align.is_power_of_two()); + + let req_align_mask = req_align as u64 - 1; + + if offset & req_align_mask != 0 { + false + } else if bufv.len() & req_align_mask == 0 { + bufv.is_aligned(mem_align, req_align) + } else { + false + } +} diff --git a/src/imago/src/storage/mod.rs b/src/imago/src/storage/mod.rs new file mode 100644 index 00000000..f464e11e --- /dev/null +++ b/src/imago/src/storage/mod.rs @@ -0,0 +1,535 @@ +//! Helper functionality to access storage. +//! +//! While not the primary purpose of this crate, to open VM images, we need to be able to access +//! different kinds of storage objects. Such objects are abstracted behind the `Storage` trait. + +pub(crate) mod drivers; +pub mod ext; + +use crate::io_buffers::{IoBuffer, IoVector, IoVectorMut}; +use drivers::CommonStorageHelper; +use ext::StorageExt; +use std::fmt::{Debug, Display}; +use std::future::Future; +use std::path::{Path, PathBuf}; +use std::pin::Pin; +use std::sync::Arc; +use std::{cmp, io}; + +/// Parameters from which a storage object can be constructed. +#[derive(Clone, Default)] +pub struct StorageOpenOptions { + /// Filename to open. + pub(crate) filename: Option, + + /// Whether the object should be opened as writable or read-only. + pub(crate) writable: bool, + + /// Whether to bypass the host page cache (if applicable). + pub(crate) direct: bool, +} + +/// Implementation for storage objects. +pub trait Storage: Debug + Display + Send + Sized + Sync { + /// Open a storage object. + /// + /// Different storage implementations may require different options. + #[allow(async_fn_in_trait)] // No need for Send + async fn open(_opts: StorageOpenOptions) -> io::Result { + Err(io::Error::new( + io::ErrorKind::Unsupported, + format!( + "Cannot open storage objects of type {}", + std::any::type_name::() + ), + )) + } + + /// Synchronous wrapper around [`Storage::open()`]. + #[cfg(feature = "sync-wrappers")] + fn open_sync(opts: StorageOpenOptions) -> io::Result { + tokio::runtime::Builder::new_current_thread() + .build()? + .block_on(Self::open(opts)) + } + + /// Minimum required alignment for memory buffers. + fn mem_align(&self) -> usize { + 1 + } + + /// Minimum required alignment for offsets and lengths. + fn req_align(&self) -> usize { + 1 + } + + /// Minimum required alignment for zero writes. + fn zero_align(&self) -> usize { + 1 + } + + /// Minimum required alignment for effective discards. + fn discard_align(&self) -> usize { + 1 + } + + /// Storage object length. + fn size(&self) -> io::Result; + + /// Resolve the given path relative to this storage object. + /// + /// `relative` need not really be a relative path; it is up to the storage driver to check + /// whether it is an absolute path that does not need to be changed, or a relative path that + /// needs to be resolved. 
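+    /// Typically this means joining `relative` onto the directory containing the
+    /// storage object's own filename, but the exact behavior is driver-defined.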
+ /// + /// Must not return a relative path. + /// + /// The returned `PathBuf` should be usable with `StorageOpenOptions::filename()`. + fn resolve_relative_path>(&self, _relative: P) -> io::Result { + Err(io::ErrorKind::Unsupported.into()) + } + + /// Read data at `offset` into `bufv`. + /// + /// Reads until `bufv` is filled completely, i.e. will not do short reads. When reaching the + /// end of file, the rest of `bufv` is filled with 0. + /// + /// # Safety + /// This is a pure read from storage. The request must be fully aligned to + /// [`Self::mem_align()`] and [`Self::req_align()`], and safeguards we want to implement for + /// safe concurrent access may not be available. + /// + /// Use [`StorageExt::readv()`] instead. + #[allow(async_fn_in_trait)] // No need for Send + async unsafe fn pure_readv(&self, bufv: IoVectorMut<'_>, offset: u64) -> io::Result<()>; + + /// Write data from `bufv` to `offset`. + /// + /// Writes all data from `bufv`, i.e. will not do short writes. When reaching the end of file, + /// grow it as necessary so that the new end of file will be at `offset + bufv.len()`. + /// + /// If growing is not possible, writes beyond the end of file (even if only partially) should + /// fail. + /// + /// # Safety + /// This is a pure write to storage. The request must be fully aligned to + /// [`Self::mem_align()`] and [`Self::req_align()`], and safeguards we want to implement for + /// safe concurrent access may not be available. + /// + /// Use [`StorageExt::writev()`] instead. + #[allow(async_fn_in_trait)] // No need for Send + async unsafe fn pure_writev(&self, bufv: IoVector<'_>, offset: u64) -> io::Result<()>; + + /// Ensure the given range reads back as zeroes. + /// + /// The default implementation writes actual zeroes as data, which is inefficient. Storage + /// drivers should override it with a more efficient implementation. + /// + /// # Safety + /// This is a pure write to storage. The request must be fully aligned to + /// [`Self::zero_align()`], and safeguards we want to implement for safe concurrent access may + /// not be available. + /// + /// Use [`StorageExt::write_zeroes()`] instead. + #[allow(async_fn_in_trait)] // No need for Send + async unsafe fn pure_write_zeroes(&self, mut offset: u64, mut length: u64) -> io::Result<()> { + let buflen = cmp::min(length, 1048576) as usize; + let mut buf = IoBuffer::new(buflen, self.mem_align())?; + buf.as_mut().into_slice().fill(0); + + while length > 0 { + let chunk_length = cmp::min(length, 1048576) as usize; + self.writev(buf.as_ref_range(0..chunk_length).into(), offset) + .await?; + offset += chunk_length as u64; + length -= chunk_length as u64; + } + + Ok(()) + } + + /// Discard the given range, with undefined contents when read back. + /// + /// Tell the storage layer this range is no longer needed and need not be backed by actual + /// storage. When read back, the data read will be undefined, i.e. not necessarily zeroes. + /// + /// No-op implementations therefore explicitly fulfill the interface contract. + /// + /// # Safety + /// This is a pure write to storage. The request must be fully aligned to + /// [`Self::discard_align()`], and safeguards we want to implement for safe concurrent access + /// may not be available. + /// + /// Use [`StorageExt::discard()`] instead. + #[allow(async_fn_in_trait)] // No need for Send + async unsafe fn pure_discard(&self, _offset: u64, _length: u64) -> io::Result<()> { + Ok(()) + } + + /// Flush internal buffers. 
+ /// + /// Does not necessarily sync those buffers to disk. When using `flush()`, consider whether + /// you want to call `sync()` afterwards. + #[allow(async_fn_in_trait)] // No need for Send + async fn flush(&self) -> io::Result<()>; + + /// Sync data already written to the storage hardware. + /// + /// This does not necessarily include flushing internal buffers, i.e. `flush`. When using + /// `sync()`, consider whether you want to call `flush()` before it. + #[allow(async_fn_in_trait)] // No need for Send + async fn sync(&self) -> io::Result<()>; + + /// Return the storage helper object (used by the [`StorageExt`] implementation). + fn get_storage_helper(&self) -> &CommonStorageHelper; +} + +/// Allow dynamic use of storage objects (i.e. is object safe). +/// +/// When using normal `Storage` objects, they must all be of the same type within a single disk +/// image chain. For example, every storage object underneath a `FormatAccess` object +/// must be of type `StdFile`. +/// +/// `DynStorage` allows the use of `Box`, which implements `Storage`, to allow +/// mixed storage object types. Therefore, a `FormatAccess>` allows e.g. the +/// use of both `Box` and `Box` storage objects together. (`Arc` instead of `Box` +/// works, too.) +/// +/// Async functions in `DynStorage` return boxed futures (`Pin>`), which makes them +/// slighly less efficient than async functions in `Storage`, hence the distinction. +pub trait DynStorage: Debug + Display + Send + Sync { + /// Wrapper around [`Storage::mem_align()`]. + fn dyn_mem_align(&self) -> usize; + + /// Wrapper around [`Storage::req_align()`]. + fn dyn_req_align(&self) -> usize; + + /// Wrapper around [`Storage::zero_align()`]. + fn dyn_zero_align(&self) -> usize; + + /// Wrapper around [`Storage::discard_align()`]. + fn dyn_discard_align(&self) -> usize; + + /// Wrapper around [`Storage::size()`]. + fn dyn_size(&self) -> io::Result; + + /// Wrapper around [`Storage::resolve_relative_path()`]. + fn dyn_resolve_relative_path(&self, relative: &Path) -> io::Result; + + /// Object-safe wrapper around [`Storage::pure_readv()`]. + /// + /// # Safety + /// Same considerations are for [`Storage::pure_readv()`] apply. + unsafe fn dyn_pure_readv<'a>( + &'a self, + bufv: IoVectorMut<'a>, + offset: u64, + ) -> Pin> + 'a>>; + + /// Object-safe wrapper around [`Storage::pure_writev()`]. + /// + /// # Safety + /// Same considerations are for [`Storage::pure_writev()`] apply. + unsafe fn dyn_pure_writev<'a>( + &'a self, + bufv: IoVector<'a>, + offset: u64, + ) -> Pin> + 'a>>; + + /// Object-safe wrapper around [`Storage::pure_write_zeroes()`]. + /// + /// # Safety + /// Same considerations are for [`Storage::pure_write_zeroes()`] apply. + unsafe fn dyn_pure_write_zeroes( + &self, + offset: u64, + length: u64, + ) -> Pin> + '_>>; + + /// Object-safe wrapper around [`Storage::pure_discard()`]. + /// + /// # Safety + /// Same considerations are for [`Storage::pure_discard()`] apply. + unsafe fn dyn_pure_discard( + &self, + offset: u64, + length: u64, + ) -> Pin> + '_>>; + + /// Object-safe wrapper around [`Storage::flush()`]. + fn dyn_flush(&self) -> Pin> + '_>>; + + /// Object-safe wrapper around [`Storage::sync()`]. + fn dyn_sync(&self) -> Pin> + '_>>; + + /// Wrapper around [`Storage::get_storage_helper()`]. 
+ fn dyn_get_storage_helper(&self) -> &CommonStorageHelper; +} + +impl Storage for &S { + fn mem_align(&self) -> usize { + (*self).mem_align() + } + + fn req_align(&self) -> usize { + (*self).req_align() + } + + fn zero_align(&self) -> usize { + (*self).zero_align() + } + + fn discard_align(&self) -> usize { + (*self).discard_align() + } + + fn size(&self) -> io::Result { + (*self).size() + } + + fn resolve_relative_path>(&self, relative: P) -> io::Result { + (*self).resolve_relative_path(relative) + } + + async unsafe fn pure_readv(&self, bufv: IoVectorMut<'_>, offset: u64) -> io::Result<()> { + unsafe { (*self).pure_readv(bufv, offset).await } + } + + async unsafe fn pure_writev(&self, bufv: IoVector<'_>, offset: u64) -> io::Result<()> { + unsafe { (*self).pure_writev(bufv, offset).await } + } + + async unsafe fn pure_write_zeroes(&self, offset: u64, length: u64) -> io::Result<()> { + unsafe { (*self).pure_write_zeroes(offset, length).await } + } + + async unsafe fn pure_discard(&self, offset: u64, length: u64) -> io::Result<()> { + unsafe { (*self).pure_discard(offset, length).await } + } + + async fn flush(&self) -> io::Result<()> { + (*self).flush().await + } + + async fn sync(&self) -> io::Result<()> { + (*self).sync().await + } + + fn get_storage_helper(&self) -> &CommonStorageHelper { + (*self).get_storage_helper() + } +} + +impl DynStorage for S { + fn dyn_mem_align(&self) -> usize { + ::mem_align(self) + } + + fn dyn_req_align(&self) -> usize { + ::req_align(self) + } + + fn dyn_zero_align(&self) -> usize { + ::zero_align(self) + } + + fn dyn_discard_align(&self) -> usize { + ::discard_align(self) + } + + fn dyn_size(&self) -> io::Result { + ::size(self) + } + + fn dyn_resolve_relative_path(&self, relative: &Path) -> io::Result { + ::resolve_relative_path(self, relative) + } + + unsafe fn dyn_pure_readv<'a>( + &'a self, + bufv: IoVectorMut<'a>, + offset: u64, + ) -> Pin> + 'a>> { + Box::pin(unsafe { ::pure_readv(self, bufv, offset) }) + } + + unsafe fn dyn_pure_writev<'a>( + &'a self, + bufv: IoVector<'a>, + offset: u64, + ) -> Pin> + 'a>> { + Box::pin(unsafe { ::pure_writev(self, bufv, offset) }) + } + + unsafe fn dyn_pure_write_zeroes( + &self, + offset: u64, + length: u64, + ) -> Pin> + '_>> { + Box::pin(unsafe { ::pure_write_zeroes(self, offset, length) }) + } + + unsafe fn dyn_pure_discard( + &self, + offset: u64, + length: u64, + ) -> Pin> + '_>> { + Box::pin(unsafe { ::pure_discard(self, offset, length) }) + } + + fn dyn_flush(&self) -> Pin> + '_>> { + Box::pin(::flush(self)) + } + + fn dyn_sync(&self) -> Pin> + '_>> { + Box::pin(::sync(self)) + } + + fn dyn_get_storage_helper(&self) -> &CommonStorageHelper { + ::get_storage_helper(self) + } +} + +impl Storage for Box { + async fn open(opts: StorageOpenOptions) -> io::Result { + // TODO: When we have more drivers, choose different defaults depending on the options + // given. Right now, only `File` really supports being opened through options, so it is an + // obvious choice. 
+ Ok(Box::new(crate::file::File::open(opts).await?)) + } + + fn mem_align(&self) -> usize { + self.as_ref().dyn_mem_align() + } + + fn req_align(&self) -> usize { + self.as_ref().dyn_req_align() + } + + fn zero_align(&self) -> usize { + self.as_ref().dyn_zero_align() + } + + fn discard_align(&self) -> usize { + self.as_ref().dyn_discard_align() + } + + fn size(&self) -> io::Result { + self.as_ref().dyn_size() + } + + fn resolve_relative_path>(&self, relative: P) -> io::Result { + self.as_ref().dyn_resolve_relative_path(relative.as_ref()) + } + + async unsafe fn pure_readv(&self, bufv: IoVectorMut<'_>, offset: u64) -> io::Result<()> { + unsafe { self.as_ref().dyn_pure_readv(bufv, offset).await } + } + + async unsafe fn pure_writev(&self, bufv: IoVector<'_>, offset: u64) -> io::Result<()> { + unsafe { self.as_ref().dyn_pure_writev(bufv, offset).await } + } + + async unsafe fn pure_write_zeroes(&self, offset: u64, length: u64) -> io::Result<()> { + unsafe { self.as_ref().dyn_pure_write_zeroes(offset, length).await } + } + + async unsafe fn pure_discard(&self, offset: u64, length: u64) -> io::Result<()> { + unsafe { self.as_ref().dyn_pure_discard(offset, length).await } + } + + async fn flush(&self) -> io::Result<()> { + self.as_ref().dyn_flush().await + } + + async fn sync(&self) -> io::Result<()> { + self.as_ref().dyn_sync().await + } + + fn get_storage_helper(&self) -> &CommonStorageHelper { + self.as_ref().dyn_get_storage_helper() + } +} + +impl Storage for Arc { + async fn open(opts: StorageOpenOptions) -> io::Result { + Box::::open(opts).await.map(Into::into) + } + + fn mem_align(&self) -> usize { + self.as_ref().dyn_mem_align() + } + + fn req_align(&self) -> usize { + self.as_ref().dyn_req_align() + } + + fn zero_align(&self) -> usize { + self.as_ref().dyn_zero_align() + } + + fn discard_align(&self) -> usize { + self.as_ref().dyn_discard_align() + } + + fn size(&self) -> io::Result { + self.as_ref().dyn_size() + } + + fn resolve_relative_path>(&self, relative: P) -> io::Result { + self.as_ref().dyn_resolve_relative_path(relative.as_ref()) + } + + async unsafe fn pure_readv(&self, bufv: IoVectorMut<'_>, offset: u64) -> io::Result<()> { + unsafe { self.as_ref().dyn_pure_readv(bufv, offset) }.await + } + + async unsafe fn pure_writev(&self, bufv: IoVector<'_>, offset: u64) -> io::Result<()> { + unsafe { self.as_ref().dyn_pure_writev(bufv, offset) }.await + } + + async unsafe fn pure_write_zeroes(&self, offset: u64, length: u64) -> io::Result<()> { + unsafe { self.as_ref().dyn_pure_write_zeroes(offset, length) }.await + } + + async unsafe fn pure_discard(&self, offset: u64, length: u64) -> io::Result<()> { + unsafe { self.as_ref().dyn_pure_discard(offset, length) }.await + } + + async fn flush(&self) -> io::Result<()> { + self.as_ref().dyn_flush().await + } + + async fn sync(&self) -> io::Result<()> { + self.as_ref().dyn_sync().await + } + + fn get_storage_helper(&self) -> &CommonStorageHelper { + self.as_ref().dyn_get_storage_helper() + } +} + +impl StorageOpenOptions { + /// Create default options. + pub fn new() -> Self { + StorageOpenOptions::default() + } + + /// Set a filename to open. + pub fn filename>(mut self, filename: P) -> Self { + self.filename = Some(filename.as_ref().to_owned()); + self + } + + /// Whether the storage should be writable or not. + pub fn write(mut self, write: bool) -> Self { + self.writable = write; + self + } + + /// Whether to bypass the host page cache (if applicable). 
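+    ///
+    /// Builder-style usage (illustrative):
+    ///
+    /// ```ignore
+    /// let opts = StorageOpenOptions::new()
+    ///     .filename("disk.raw")
+    ///     .write(true)
+    ///     .direct(true);
+    /// ```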
+ pub fn direct(mut self, direct: bool) -> Self { + self.direct = direct; + self + } +} diff --git a/src/imago/src/vector_select.rs b/src/imago/src/vector_select.rs new file mode 100644 index 00000000..1252b85c --- /dev/null +++ b/src/imago/src/vector_select.rs @@ -0,0 +1,114 @@ +//! Async select over future vectors. +//! +//! Allows collecting `dyn Future` objects (i.e. async function instances) in a vector, and +//! `select`ing (awaiting one) or `join`ing (awaiting all) them. + +use std::future::Future; +use std::marker::Unpin; +use std::pin::Pin; +use std::task::{Context, Poll}; + +/// Collect futures and await one or all of them. +pub(crate) struct FutureVector> + Unpin> { + /// Pending futures. + vec: Vec, +} + +/// Await a single future. +pub(crate) struct FutureVectorSelect<'a, R, E, F: Future> + Unpin>( + &'a mut FutureVector, +); + +/// Await all futures, discarding successful results. +pub(crate) struct FutureVectorDiscardingJoin<'a, R, E, F: Future> + Unpin>( + &'a mut FutureVector, +); + +impl> + Unpin> FutureVector { + /// Create a new `FutureVector`. + pub fn new() -> Self { + FutureVector { vec: Vec::new() } + } + + /// Add a future. + pub fn push(&mut self, future: F) { + self.vec.push(future); + } + + /// `true` if and only if there are no pending futures. + pub fn is_empty(&self) -> bool { + self.vec.is_empty() + } + + /// Number of pending futures. + pub fn len(&self) -> usize { + self.vec.len() + } + + /// Await any one future. + /// + /// Return the result of the first future that becomes ready, removing it from the vector. + /// + /// Functionally, behaves like: + /// ```ignore + /// async fn select(&mut self) -> Result; + /// ``` + pub fn select(&mut self) -> FutureVectorSelect<'_, R, E, F> { + FutureVectorSelect(self) + } + + /// Join all futures, discarding successful results. + /// + /// If an error occurs, return it immediately. All pending futures remain. + /// + /// Functionally, behaves like: + /// ```ignore + /// async fn discarding_join(&mut self) -> Result<(), E>; + /// ``` + pub fn discarding_join(&mut self) -> FutureVectorDiscardingJoin<'_, R, E, F> { + FutureVectorDiscardingJoin(self) + } +} + +impl> + Unpin> Future for FutureVectorSelect<'_, R, E, F> { + type Output = F::Output; + + fn poll(mut self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll { + assert!(!self.0.is_empty()); + + for (i, fut) in self.0.vec.iter_mut().enumerate() { + if let Poll::Ready(result) = F::poll(Pin::new(fut), ctx) { + self.0.vec.swap_remove(i); + return Poll::Ready(result); + } + } + + Poll::Pending + } +} + +impl> + Unpin> Future + for FutureVectorDiscardingJoin<'_, R, E, F> +{ + type Output = Result<(), E>; + + fn poll(mut self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll> { + let mut i = 0; + while i < self.0.len() { + if let Poll::Ready(result) = F::poll(Pin::new(&mut self.0.vec[i]), ctx) { + self.0.vec.swap_remove(i); + if let Err(err) = result { + return Poll::Ready(Err(err)); + } + } else { + i += 1; + } + } + + if self.0.is_empty() { + Poll::Ready(Ok(())) + } else { + Poll::Pending + } + } +}
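+
+// A minimal illustrative test of the intended usage (not part of the upstream file;
+// it only relies on the tokio current-thread runtime that the sync wrappers already
+// use, and on `oneshot` receivers, which are `Unpin` futures).
+#[cfg(test)]
+mod tests {
+    use super::FutureVector;
+    use tokio::sync::oneshot;
+
+    #[test]
+    fn discarding_join_awaits_all() {
+        let rt = tokio::runtime::Builder::new_current_thread()
+            .build()
+            .unwrap();
+        rt.block_on(async {
+            let mut v = FutureVector::new();
+            let (s1, r1) = oneshot::channel::<()>();
+            let (s2, r2) = oneshot::channel::<()>();
+            v.push(r1);
+            v.push(r2);
+            assert_eq!(v.len(), 2);
+
+            // Both senders fire before we await, so the join completes immediately.
+            s1.send(()).unwrap();
+            s2.send(()).unwrap();
+            v.discarding_join().await.unwrap();
+            assert!(v.is_empty());
+        });
+    }
+}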