From 8e3bab73f158d9ccd1d2969a716941818ad9bc5c Mon Sep 17 00:00:00 2001
From: Matthew Ahrens
Date: Tue, 8 Feb 2022 18:38:08 -0800
Subject: [PATCH] healing reads should not bypass object cache (#179)

The performance of healing reads is sometimes relevant. For example,
during `zpool import` we do healing reads of the blocks written in the
past few TXGs. The performance of `zpool scrub` may also matter.

The current behavior is to bypass all caching for healing reads,
including the object cache. However, this has a huge performance
impact, since we can see ~700x read inflation: we read every object
once for each block in it. This inflation is guaranteed for `zpool
import`, since the last few TXGs are necessarily densely packed into
objects.

This commit changes the behavior of healing reads to bypass only the
zettacache, not the object cache, by default. If the old behavior is
desired, the tunable can be changed, or the agent can be restarted to
clear the object cache. I would recommend that a later commit remove
the tunable and simplify the code.
---
 .../zettaobject/src/object_access.rs | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/cmd/zfs_object_agent/zettaobject/src/object_access.rs b/cmd/zfs_object_agent/zettaobject/src/object_access.rs
index 3e03ff71a7d1..fc5bd92787cf 100644
--- a/cmd/zfs_object_agent/zettaobject/src/object_access.rs
+++ b/cmd/zfs_object_agent/zettaobject/src/object_access.rs
@@ -53,6 +53,7 @@ lazy_static! {
     static ref LONG_OPERATION_DURATION: Duration =
         Duration::from_secs(get_tunable("long_operation_secs", 2));
     pub static ref OBJECT_DELETION_BATCH_SIZE: usize = get_tunable("object_deletion_batch_size", 1000);
+    pub static ref OBJECT_CACHE_IS_BYPASSABLE: bool = get_tunable("object_cache_is_bypassable", false);
 }
 
 #[derive(Debug, Enum, Copy, Clone)]
@@ -542,14 +543,18 @@ impl ObjectAccess {
         key: String,
         stat_type: ObjectAccessStatType,
     ) -> Result<Bytes> {
-        let bytes = self.get_object_impl(key.clone(), stat_type, None).await?;
-        // Note: we *should* have the same data from S3 (in the `vec`) and in
-        // the cache, so this invalidation is normally not necessary. However,
-        // in case a bug (or undetected RAM error) resulted in incorrect cached
-        // data, we want to invalidate the cache so that we won't get the bad
-        // cached data again.
-        Self::invalidate_cache(key);
-        Ok(bytes)
+        if *OBJECT_CACHE_IS_BYPASSABLE {
+            let bytes = self.get_object_impl(key.clone(), stat_type, None).await?;
+            // Note: we *should* have the same data from S3 (in the `vec`) and in
+            // the cache, so this invalidation is normally not necessary. However,
+            // in case a bug (or undetected RAM error) resulted in incorrect cached
+            // data, we want to invalidate the cache so that we won't get the bad
+            // cached data again.
+            Self::invalidate_cache(key);
+            Ok(bytes)
+        } else {
+            self.get_object(key, stat_type).await
+        }
     }
 
     pub async fn get_object(&self, key: String, stat_type: ObjectAccessStatType) -> Result<Bytes> {
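
Aside for reviewers (not part of the patch): the change boils down to a
tunable-gated choice between two read paths, which can be illustrated
outside the agent. Below is a minimal, self-contained sketch in plain
(non-async) Rust; ObjectStore, fetch_from_backend, and the
environment-variable stand-in for get_tunable() are hypothetical names
for illustration only, not the agent's real API.

// healing_read_sketch.rs: minimal model of the tunable-gated cache
// bypass. The real code is async and lives in object_access.rs.
use std::collections::HashMap;
use std::sync::Mutex;

// Stand-in for get_tunable("object_cache_is_bypassable", false):
// defaults to false, matching the patch's new default behavior.
fn object_cache_is_bypassable() -> bool {
    std::env::var("OBJECT_CACHE_IS_BYPASSABLE").map_or(false, |v| v == "true")
}

struct ObjectStore {
    cache: Mutex<HashMap<String, Vec<u8>>>,
}

impl ObjectStore {
    // Stand-in for get_object_impl(): always goes to backing storage.
    fn fetch_from_backend(&self, key: &str) -> Vec<u8> {
        format!("object-data-for-{key}").into_bytes()
    }

    // Normal read path: served from the object cache when possible.
    fn get_object(&self, key: &str) -> Vec<u8> {
        let mut cache = self.cache.lock().unwrap();
        if let Some(bytes) = cache.get(key) {
            return bytes.clone();
        }
        let bytes = self.fetch_from_backend(key);
        cache.insert(key.to_string(), bytes.clone());
        bytes
    }

    // Healing read path: with the tunable set, bypass the cache and
    // invalidate any (possibly corrupt) cached copy; otherwise fall back
    // to the cached read, avoiding the ~700x read inflation described in
    // the commit message.
    fn get_object_uncached(&self, key: &str) -> Vec<u8> {
        if object_cache_is_bypassable() {
            let bytes = self.fetch_from_backend(key);
            self.cache.lock().unwrap().remove(key);
            bytes
        } else {
            self.get_object(key)
        }
    }
}

fn main() {
    let store = ObjectStore { cache: Mutex::new(HashMap::new()) };
    // With OBJECT_CACHE_IS_BYPASSABLE unset, repeated healing reads of
    // the same object hit the cache instead of re-fetching per block.
    let first = store.get_object_uncached("s0/object-123");
    let second = store.get_object_uncached("s0/object-123");
    assert_eq!(first, second);
}

In the sketch, the default leaves healing reads on the cached path,
mirroring this patch's behavior change; setting the hypothetical
OBJECT_CACHE_IS_BYPASSABLE variable restores the old bypass-and-
invalidate semantics.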