From 5d54281dbce4c27f4518c31a29ebe2d6da73f6d0 Mon Sep 17 00:00:00 2001
From: Rob Norris
Date: Fri, 1 Mar 2024 15:48:53 +1100
Subject: [PATCH] zts: test for correct fsync() response to ZIL flush failure

If fsync() (zil_commit()) writes successfully, but then the flush fails,
fsync() should not return success, but instead should fall into a full
transaction wait.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Rob Norris
---
 tests/runfiles/linux.run                      |   4 +
 tests/test-runner/bin/zts-report.py.in        |   1 +
 tests/zfs-tests/include/blkdev.shlib          |  10 +-
 tests/zfs-tests/tests/Makefile.am             |   3 +
 .../tests/functional/flush/cleanup.ksh        |  28 ++
 .../tests/functional/flush/setup.ksh          |  30 ++
 .../functional/flush/zil_flush_fallback.ksh   | 265 ++++++++++++++++++
 7 files changed, 337 insertions(+), 4 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/flush/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/flush/setup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/flush/zil_flush_fallback.ksh

diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 92ce09ec6fcb..6bfbf822a64d 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -124,6 +124,10 @@ tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos',
     'scrub_after_resilver', 'zpool_status_-s']
 tags = ['functional', 'fault']
 
+[tests/functional/flush:Linux]
+tests = ['zil_flush_fallback']
+tags = ['functional', 'flush']
+
 [tests/functional/features/large_dnode:Linux]
 tests = ['large_dnode_002_pos', 'large_dnode_006_pos', 'large_dnode_008_pos']
 tags = ['functional', 'features', 'large_dnode']
diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in
index 5ca130931364..4e47dc55639f 100755
--- a/tests/test-runner/bin/zts-report.py.in
+++ b/tests/test-runner/bin/zts-report.py.in
@@ -379,6 +379,7 @@ if os.environ.get('CI') == 'true':
     'fault/auto_replace_002_pos': ['SKIP', ci_reason],
     'fault/auto_spare_ashift': ['SKIP', ci_reason],
     'fault/auto_spare_shared': ['SKIP', ci_reason],
+    'flush/zil_flush_fallback': ['SKIP', ci_reason],
     'procfs/pool_state': ['SKIP', ci_reason],
 })
 
diff --git a/tests/zfs-tests/include/blkdev.shlib b/tests/zfs-tests/include/blkdev.shlib
index 51eff3023e73..73ff546dd090 100644
--- a/tests/zfs-tests/include/blkdev.shlib
+++ b/tests/zfs-tests/include/blkdev.shlib
@@ -462,13 +462,15 @@ function unload_scsi_debug
 # Get scsi_debug device name.
-# Returns basename of scsi_debug device (for example "sdb").
+# Returns basenames of up to $1 scsi_debug devices (for example "sdb").
 #
-function get_debug_device
+function get_debug_device #num
 {
+	typeset num=${1:-1}
+
 	for i in {1..10} ; do
-		val=$(lsscsi | awk '/scsi_debug/ {print $6; exit}' | cut -d/ -f3)
+		val=$(lsscsi | awk '/scsi_debug/ {print $6}' | cut -d/ -f3 | head -n$num)
 
 		# lsscsi can take time to settle
-		if [ "$val" != "-" ] ; then
+		if [[ ! "$val" =~ "-" ]] ; then
 			break
 		fi
 		sleep 1
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index 44eedcf6fae5..559d12a3c862 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1499,6 +1499,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/features/large_dnode/large_dnode_008_pos.ksh \
 	functional/features/large_dnode/large_dnode_009_pos.ksh \
 	functional/features/large_dnode/setup.ksh \
+	functional/flush/cleanup.ksh \
+	functional/flush/setup.ksh \
+	functional/flush/zil_flush_fallback.ksh \
 	functional/grow/grow_pool_001_pos.ksh \
 	functional/grow/grow_replicas_001_pos.ksh \
 	functional/history/cleanup.ksh \
diff --git a/tests/zfs-tests/tests/functional/flush/cleanup.ksh b/tests/zfs-tests/tests/functional/flush/cleanup.ksh
new file mode 100755
index 000000000000..f9359ed9b2aa
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/flush/cleanup.ksh
@@ -0,0 +1,28 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+
+#
+# Copyright (c) 2024, Klara Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+default_cleanup
diff --git a/tests/zfs-tests/tests/functional/flush/setup.ksh b/tests/zfs-tests/tests/functional/flush/setup.ksh
new file mode 100755
index 000000000000..25aa2120f3fc
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/flush/setup.ksh
@@ -0,0 +1,30 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+
+#
+# Copyright (c) 2024, Klara Inc.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "global"
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/flush/zil_flush_fallback.ksh b/tests/zfs-tests/tests/functional/flush/zil_flush_fallback.ksh
new file mode 100755
index 000000000000..4f3ba6ff2218
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/flush/zil_flush_fallback.ksh
@@ -0,0 +1,265 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2024, Klara Inc.
+#
+
+#
+# This tests that if the ZIL write sequence fails, it correctly falls back
+# and waits until the transaction has fully committed before returning.
+#
+# When this test was written, the ZIL had a flaw - it assumed that if its
+# writes succeeded, the data was definitely on disk and available for
+# replay if the pool failed. It issued a flush immediately after the
+# write, but did not check its result. If a disk fails after the data has
+# been accepted into the disk cache, but before it can be written to
+# permanent storage, then fsync() will have returned success even though
+# the data is not stored in the ZIL for replay.
+#
+# If the main pool then fails before the transaction can be written, then
+# data is lost, and fsync() returning success was premature.
+#
+# To prove this, we create a pool with a separate log device. We inject
+# two faults:
+#
+# - ZIL writes appear to succeed, but never make it to disk
+# - ZIL flushes fail, and return an error
+#
+# We then remove the main pool device, and do a write+fsync. This goes to
+# the ZIL, and appears to succeed. When the txg closes, the write will
+# fail, and the pool suspends.
+#
+# Then, we simulate a reboot by copying the content of the pool devices
+# aside. We restore the pool devices, bring the pool back online, and
+# export it - we don't need it anymore, but we have to clean up properly.
+# Then we restore the copied content and import the pool, in whatever
+# state it was in when it suspended.
+#
+# Finally, we check the content of the file we wrote to. If it matches
+# what we wrote, then the fsync() was correct, and all is well. If it
+# doesn't match, then the flaw is present, and the test fails.
+#
+# We run the test twice: once without the log device injections, once
+# with. The first confirms the expected behaviour of the ZIL - when the
+# pool is imported, the log is replayed. The second fails as above. When
+# the flaw is corrected, both will succeed, and the overall test succeeds.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+TMPDIR=${TMPDIR:-$TEST_BASE_DIR}
+
+BACKUP_MAIN="$TMPDIR/backup_main"
+BACKUP_LOG="$TMPDIR/backup_log"
+
+LOOP_LOG="$TMPDIR/loop_log"
+
+DATA_FILE="$TMPDIR/data_file"
+
+verify_runnable "global"
+
+function cleanup
+{
+	zinject -c all
+	destroy_pool $TESTPOOL
+	unload_scsi_debug
+	losetup -d /dev/$ld
+	rm -f $BACKUP_MAIN $BACKUP_LOG $LOOP_LOG $DATA_FILE
+}
+
+log_assert "verify fsync() waits if the ZIL commit fails"
+
+log_onexit cleanup
+
+# create 128K of random data, and take its checksum. we do this up front
+# to ensure we don't get messed up by any latency from reading /dev/random
+# or checksumming the file on the pool
+log_must dd if=/dev/random of=$DATA_FILE bs=128K count=1
+typeset sum=$(cat $DATA_FILE | md5sum)
+
+# create a virtual scsi device with two device nodes. these are backed by
+# the same memory. we do this because we need to be able to take the
+# device offline properly in order to get the pool to suspend; fault
+# injection on a loop device can't do it. once offline, we can use the
+# second node to take a copy of its state.
+load_scsi_debug 100 1 2 1 '512b'
+set -A sd $(get_debug_device 2)
+
+# create a loop device for the log.
+truncate -s 100M $LOOP_LOG
+typeset ld=$(basename $(losetup -f))
+log_must losetup /dev/$ld $LOOP_LOG
+
+# this function runs the entire test sequence. the argument decides
+# whether faults are injected on the slog device, mimicking the trigger
+# situation that causes the fsync() bug to occur
+function test_fsync
+{
+	typeset -i do_fault_log="$1"
+
+	log_note "setting up test"
+
+	# create the pool. the main data store is on the scsi device, with
+	# the log on a loopback. we bias the ZIL towards the log device to
+	# try to ensure that fsync() never involves the main device
+	log_must zpool create -f -O logbias=latency $TESTPOOL ${sd[0]} log $ld
+
+	# create the file ahead of time. the ZIL head structure is created
+	# on first use, and does a full txg wait, which we need to avoid
+	log_must dd if=/dev/zero of=/$TESTPOOL/data_file \
+	    bs=128k count=1 conv=fsync
+	log_must zpool sync
+
+	# arrange for writes to the log device to appear to succeed, and
+	# flushes to fail. this simulates a loss of the device between it
+	# accepting the write into its cache, but before it can be written
+	# out
+	if [[ $do_fault_log != 0 ]] ; then
+		log_note "injecting log device faults"
+		log_must zinject -d $ld -e noop -T write $TESTPOOL
+		log_must zinject -d $ld -e io -T flush $TESTPOOL
+	fi
+
+	# take the main device offline. there is no IO in flight, so ZFS
+	# won't notice immediately
+	log_note "taking main pool device offline"
+	log_must eval "echo offline > /sys/block/${sd[0]}/device/state"
+
+	# write out some data, then call fsync(). there are three possible
+	# results:
+	#
+	# - if the bug is present, fsync() will return success, and dd will
+	#   succeed "immediately", before the pool suspends
+	# - if the bug is fixed, fsync() will block, the pool will suspend,
+	#   and dd will return success after the pool returns to service
+	# - if something else goes wrong, dd will fail; this may happen
+	#   before or after the pool suspends or returns. this shouldn't
+	#   happen, and should abort the test
+	#
+	# we have to put dd in the background, otherwise if it blocks we
+	# will block with it. what we're interested in is whether or not it
+	# succeeds before the pool is suspended. if it does, then we expect
+	# that after the suspended pool is reimported, the data will have
+	# been written
+	log_note "running dd in background to write data and call fsync()"
+	dd if=$DATA_FILE of=/$TESTPOOL/data_file bs=128k count=1 conv=fsync &
+	fsync_pid=$!
+
+	# wait for the pool to suspend. this should happen within ~5s, when
+	# the txg sync tries to write the change to the main device
+	log_note "waiting for pool to suspend"
+	typeset -i tries=10
+	until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) == "SUSPENDED" ]] ; do
+		if ((tries-- == 0)); then
+			log_fail "pool didn't suspend"
+		fi
+		sleep 1
+	done
+
+	# the pool is suspended. see if dd is still present; if it is, then
+	# it's blocked in fsync(), and we have no expectation that the write
+	# made it to disk.
+	# if dd has exited, then its return code will tell us whether
+	# fsync() returned success, or whether it failed for some other
+	# reason
+	typeset -i fsync_success=0
+	if kill -0 $fsync_pid ; then
+		log_note "dd is blocked; fsync() has not returned"
+	else
+		log_note "dd has finished, ensuring it was successful"
+		log_must wait $fsync_pid
+		fsync_success=1
+	fi
+
+	# pool is suspended. if we online the main device right now, it
+	# will retry writing the transaction, which will succeed, and
+	# everything will continue as it's supposed to. that's the opposite
+	# of what we want; we want to do an import, as if after a reboot,
+	# to force the pool to try to replay the ZIL, so we can compare the
+	# final result against what fsync() told us
+	#
+	# however, right now the pool is wedged. we need to get it back
+	# online so we can export it, and then do the import. so first we
+	# copy the entire pool state away. for the scsi device, we can do
+	# this through the second device node. for the loopback, we can
+	# copy it directly
+	log_note "taking copy of suspended pool"
+	log_must cp /dev/${sd[1]} $BACKUP_MAIN
+	log_must cp /dev/$ld $BACKUP_LOG
+
+	# bring the entire pool back online, by clearing error injections
+	# and restoring the main device. this will unblock anything still
+	# waiting on it, and tidy up all the internals so we can reset it
+	log_note "bringing pool back online"
+	if [[ $do_fault_log != 0 ]] ; then
+		log_must zinject -c all
+	fi
+	log_must eval "echo running > /sys/block/${sd[0]}/device/state"
+	log_must zpool clear $TESTPOOL
+
+	# now the pool is back online. if dd was blocked, it should now
+	# complete successfully. make sure that's true
+	if [[ $fsync_success == 0 ]] ; then
+		log_note "ensuring blocked dd has now finished"
+		log_must wait $fsync_pid
+	fi
+
+	log_note "exporting pool"
+
+	# pool now clean, export it
+	log_must zpool export $TESTPOOL
+
+	log_note "reverting pool to suspended state"
+
+	# restore the pool to the suspended state, mimicking a reboot
+	log_must cp $BACKUP_MAIN /dev/${sd[0]}
+	log_must cp $BACKUP_LOG /dev/$ld
+
+	# import the crashed pool
+	log_must zpool import $TESTPOOL
+
+	# if fsync() succeeded before the pool suspended, then the ZIL
+	# should have replayed properly and the data is now available on
+	# the pool
+	#
+	# note that we don't check the alternative; fsync() blocking does
+	# not mean that the data _didn't_ make it to disk, just that ZFS
+	# never claimed that it did. in that case we can't know what
+	# _should_ be on disk right now, so we can't check
+	if [[ $fsync_success == 1 ]] ; then
+		log_note "fsync() succeeded earlier; checking data was written correctly"
+		typeset newsum=$(cat /$TESTPOOL/data_file | md5sum)
+		log_must test "$sum" = "$newsum"
+	fi
+
+	log_note "test finished, cleaning up"
+	log_must zpool destroy -f $TESTPOOL
+}
+
+log_note "first run: ZIL commit succeeds, and the log is replayed at import"
+test_fsync 0
+
+log_note "second run: ZIL commit fails, and falls back to txg sync"
+test_fsync 1
+
+log_pass "fsync() waits if the ZIL commit fails"
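
A quick way to poke at this failure mode outside the test harness is to run
the same fault injections by hand. This is a minimal sketch, not part of the
patch: it assumes a throwaway pool "tank" whose main vdev is "sdX" and whose
slog is the loop device "loopN" (both hypothetical names), and it reuses only
commands that already appear in the test above.

    # ZIL writes appear to succeed but never reach the media, and the
    # cache flush that follows them returns an error
    zinject -d loopN -e noop -T write tank
    zinject -d loopN -e io -T flush tank

    # take the main device away so the txg sync cannot complete
    echo offline > /sys/block/sdX/device/state

    # with the flaw, this returns success even though neither the log nor
    # the main pool holds the data; with the fix, it blocks until the
    # device is restored and the transaction commits
    dd if=/dev/urandom of=/tank/data bs=128k count=1 conv=fsync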