DLPX-87528 migration device removal held up by currently syncing TXG (openzfs#1132)

= Problem

The current removal code is sub-optimal for our use case of evacuating
to the object store. While copying data, once `vca_outstanding_bytes`
fills up we wait for the next txg, but we never force that txg to be
pushed out, so we may have to wait for the txg timeout before making
progress.
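
For context, the pre-patch throttle in `spa_vdev_remove_thread()` looks
roughly like this (a condensed excerpt of the lines the diff below removes):

```
/*
 * Sleep until outstanding copy i/o drains. Nothing here pushes the
 * open txg out, so when the copies only complete in syncing context,
 * this wait can last until the txg timeout fires.
 */
mutex_enter(&svr->svr_vca.vca_lock);
while (svr->svr_vca.vca_outstanding_bytes >
    zfs_remove_max_copy_bytes) {
	cv_wait(&svr->svr_vca.vca_cv, &svr->svr_vca.vca_lock);
}
mutex_exit(&svr->svr_vca.vca_lock);
```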

= This Patch

While the logic remains the same for block-to-block removals,
evacuations to the object store now:

1] Push out a TXG explicitly, so we don't have to wait for a TXG timeout.

2] Adhere to a new global tunable, `zfs_remove_max_txg_bytes`, which is
   compared against `svr_bytes_done` so the limit is enforced on a
   per-TXG basis (see the sketch after this list).
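
Condensed from the vdev_removal.c hunk below, the new per-TXG throttle
for object-store evacuations is roughly:

```
while (svr->svr_bytes_done[txg & TXG_MASK] > zfs_remove_max_txg_bytes &&
    spa_is_object_based(spa)) {
	/*
	 * The writes complete in syncing context, so once this txg has
	 * absorbed zfs_remove_max_txg_bytes of copied data, commit our
	 * tx and explicitly push the txg out rather than waiting for
	 * the txg timeout.
	 */
	dmu_tx_commit(tx);
	txg_wait_open(spa_get_dsl(spa), txg + 1, B_TRUE);
	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	txg = dmu_tx_get_txg(tx);
}
```

Since the tunable is registered with `ZMOD_RW`, it should also be
adjustable at runtime (on Linux presumably via
`/sys/module/zfs/parameters/zfs_remove_max_txg_bytes`), which is how the
ZTS changes below drive it with `set_tunable32`.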

= Notes

Matt was the one who first identified this issue and came up
with the fix in this patch. I just propagated some of these
changes to the ZTS code.

= Perf results

On an i3en.2xlarge VM I evacuated ~22.4GB of data with and without
this patch.

Without The Patch:
```
  pool: test
 state: ONLINE
remove: Evacuation of /dev/nvme2n1p1 in progress since Mon Aug 21 18:35:59 2023
        22.2G copied out of 22.4G at 15.0M/s, 99.25% done, 0h0m to go
config:

        NAME                 STATE     READ WRITE CKSUM
        test                 ONLINE       0     0     0
          nvme2n1            ONLINE       0     0     0  (removing)
          cloudburst-data-2  ONLINE       0     0     0

errors: No known data errors

  pool: test
 state: ONLINE
remove: Removal of vdev 0 copied 22.4G in 0h25m, completed on Mon Aug 21 19:01:27 2023
        36.0K memory used for removed device mappings
config:

        NAME                 STATE     READ WRITE CKSUM
        test                 ONLINE       0     0     0
          cloudburst-data-2  ONLINE       0     0     0
```

With This Patch:
```
  pool: test
 state: ONLINE
remove: Evacuation of /dev/nvme2n1p1 in progress since Mon Aug 21 19:41:38 2023
        21.6G copied out of 22.4G at 96.6M/s, 96.63% done, 0h0m to go
config:

        NAME                 STATE     READ WRITE CKSUM
        test                 ONLINE       0     0     0
          nvme2n1            ONLINE       0     0     0  (removing)
          cloudburst-data-2  ONLINE       0     0     0

errors: No known data errors

  pool: test
 state: ONLINE
remove: Removal of vdev 0 copied 22.4G in 0h3m, completed on Mon Aug 21 19:45:34 2023
        36.0K memory used for removed device mappings
config:

        NAME                 STATE     READ WRITE CKSUM
        test                 ONLINE       0     0     0
          cloudburst-data-2  ONLINE       0     0     0
```
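
Bottom line: the same 22.4G evacuation finished in 0h3m instead of 0h25m,
and the copy rate reported by `zpool status` went from 15.0M/s to
96.6M/s (roughly a 6.4x improvement).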
sdimitro authored Aug 23, 2023
1 parent 133238a commit 0feaee0
Showing 4 changed files with 57 additions and 11 deletions.
59 changes: 52 additions & 7 deletions module/zfs/vdev_removal.c
@@ -88,6 +88,18 @@
 */
uint_t zfs_remove_max_copy_bytes = 64 * 1024 * 1024;

+/*
+ * The maximum amount of memory to use for outstanding i/o, for each txg,
+ * while doing a device removal to an object store vdev. When removing
+ * to the object store, this is used instead of zfs_remove_max_copy_bytes.
+ *
+ * Note: block-to-block removals don't need to push TXGs to make progress,
+ * but removals to the object store do, which can affect their performance.
+ * To provide similar performance for removals to the object store, we set
+ * this tunable higher than zfs_remove_max_copy_bytes.
+ */
+uint_t zfs_remove_max_txg_bytes = 128 * 1024 * 1024;
+
/*
 * The largest contiguous segment that we will attempt to allocate when
 * removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If
@@ -2432,20 +2444,47 @@ spa_vdev_remove_thread(void *arg)
		    !svr->svr_thread_exit)
			delay(hz);

-		mutex_enter(&svr->svr_vca.vca_lock);
-		while (svr->svr_vca.vca_outstanding_bytes >
-		    zfs_remove_max_copy_bytes) {
-			cv_wait(&svr->svr_vca.vca_cv,
-			    &svr->svr_vca.vca_lock);
+		/*
+		 * Memory usage during removal to object store is
+		 * controlled by zfs_remove_max_txg_bytes, below.
+		 */
+		if (!spa_is_object_based(spa)) {
+			mutex_enter(&svr->svr_vca.vca_lock);
+			while (svr->svr_vca.vca_outstanding_bytes >
+			    zfs_remove_max_copy_bytes) {
+				cv_wait(&svr->svr_vca.vca_cv,
+				    &svr->svr_vca.vca_lock);
+			}
+			mutex_exit(&svr->svr_vca.vca_lock);
		}
-		mutex_exit(&svr->svr_vca.vca_lock);

		dmu_tx_t *tx =
		    dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);

		VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
		uint64_t txg = dmu_tx_get_txg(tx);

+		while (svr->svr_bytes_done[txg & TXG_MASK] >
+		    zfs_remove_max_txg_bytes &&
+		    spa_is_object_based(spa)) {
+			/*
+			 * When writing to object store, the writes
+			 * complete in syncing context. If we have
+			 * reached the limit of memory usage, ensure
+			 * we are pushing out a txg so that we don't
+			 * have to wait for the txg timeout to make
+			 * progress.
+			 */
+			dmu_tx_commit(tx);
+
+			txg_wait_open(spa_get_dsl(spa),
+			    txg + 1, B_TRUE);
+
+			tx = dmu_tx_create_dd(
+			    spa_get_dsl(spa)->dp_mos_dir);
+			VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+			txg = dmu_tx_get_txg(tx);
+		}

		/*
		 * Reacquire the vdev_config lock. The vdev_t
		 * that we're removing may have changed, e.g. due
@@ -3383,6 +3422,12 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_, remove_max_copy_bytes, UINT, ZMOD_RW,
    "I/Os for device removal");
/* END CSTYLED */

+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_vdev, zfs_, remove_max_txg_bytes, UINT, ZMOD_RW,
+    "Maximum amount of memory that can be used for in flight "
+    "I/Os for device removal to object store, in each txg");
+/* END CSTYLED */
+
ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_ignore_errors, INT, ZMOD_RW,
    "Ignore hard IO errors when removing device");

1 change: 1 addition & 0 deletions tests/zfs-tests/include/tunables.cfg
@@ -66,6 +66,7 @@ PREFETCH_DISABLE prefetch.disable zfs_prefetch_disable
REBUILD_SCRUB_ENABLED rebuild_scrub_enabled zfs_rebuild_scrub_enabled
REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress
REMOVE_MAX_COPY_BYTES remove_max_copy_bytes zfs_remove_max_copy_bytes
+REMOVE_MAX_TXG_BYTES remove_max_txg_bytes zfs_remove_max_txg_bytes
REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment
RESILVER_MIN_TIME_MS resilver_min_time_ms zfs_resilver_min_time_ms
SCAN_LEGACY scan_legacy zfs_scan_legacy
@@ -94,7 +94,7 @@ function cleanup_testpool
log_must set_tunable64 METASLAB_FORCE_GANGING 16777217
log_must set_tunable64 SCAN_VDEV_LIMIT $ZFS_SCAN_VDEV_LIMIT_DEFAULT
log_must set_tunable32 REMOVE_MAX_SEGMENT 16777216
-log_must set_tunable32 REMOVE_MAX_COPY_BYTES 67108864
+log_must set_tunable32 REMOVE_MAX_TXG_BYTES 134217728
log_must set_tunable32 BLOCK_BOUNDARY_MEMLIMIT_PCT 75
log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0
log_must set_tunable32 REMOVAL_SUSPEND_PROGRESS 0
@@ -39,7 +39,7 @@
# 9. Reset scan_vdev_limit tunable and wait for scan to finish.
# 10. Export the pool and re-import it once the scan has
# finished but the evacuation hasn't made any progress.
-# 11. Set remove_max_segment and remove_max_copy_bytes to slow
+# 11. Set remove_max_segment and remove_max_txg_bytes to slow
# the removal down and unset the tunable that had suspended
# it.
# 12. Export the pool and re-import it while the removal is
@@ -164,7 +164,7 @@ for ashift in 9 12; do
# tunable
#
log_must set_tunable32 REMOVE_MAX_SEGMENT 16384
-log_must set_tunable32 REMOVE_MAX_COPY_BYTES 1048576
+log_must set_tunable32 REMOVE_MAX_TXG_BYTES 1048576
log_must set_tunable32 REMOVAL_SUSPEND_PROGRESS 0

#
@@ -188,7 +188,7 @@ for ashift in 9 12; do
# Reset removal speed and wait for it to finish
#
log_must set_tunable32 REMOVE_MAX_SEGMENT 16777216
-log_must set_tunable32 REMOVE_MAX_COPY_BYTES 67108864
+log_must set_tunable32 REMOVE_MAX_TXG_BYTES 134217728
log_must wait_for_removal $TESTPOOL

#