From e93b150f963063aedb2f2f34761470e1ad1de50e Mon Sep 17 00:00:00 2001
From: Prakash Surya
Date: Fri, 17 Nov 2017 08:15:51 -0800
Subject: [PATCH] fix: itx was being added to the wrong lwb

This should address the following panic seen when running on a system
with 8 SSDs and 32 vCPUs:

[ 1806.416279] VERIFY3(itx->itx_wr_state == WR_INDIRECT) failed (4294967295 == 0)
[ 1806.416315] PANIC at zil.c:1506:zil_lwb_commit()
[ 1806.416333] Showing stack for process 28365
[ 1806.416336] CPU: 5 PID: 28365 Comm: fio Tainted: P OE 4.4.0-93-generic #116-Ubuntu
[ 1806.416337] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 09/21/2015
[ 1806.416338] 0000000000000286 16a1c125c5c4eec7 ffff887cd21678c0 ffffffff813f9f83
[ 1806.416341] ffffffffc136d454 00000000000005e2 ffff887cd21678d0 ffffffffc02d6fd2
[ 1806.416342] ffff887cd2167a58 ffffffffc02d70a5 ffff887b665d75d0 ffff887c00000030
[ 1806.416344] Call Trace:
[ 1806.416350] [] dump_stack+0x63/0x90
[ 1806.416362] [] spl_dumpstack+0x42/0x50 [spl]
[ 1806.416366] [] spl_panic+0xc5/0x100 [spl]
[ 1806.416428] [] ? zio_rewrite+0x6e/0x70 [zfs]
[ 1806.416469] [] ? zil_lwb_write_open+0x520/0x520 [zfs]
[ 1806.416509] [] ? zil_lwb_write_open+0xff/0x520 [zfs]
[ 1806.416551] [] ? zil_lwb_write_open+0x520/0x520 [zfs]
[ 1806.416591] [] zil_commit+0x14ba/0x1d90 [zfs]
[ 1806.416635] [] zfs_write+0xe0d/0x1020 [zfs]
[ 1806.416677] [] zpl_write_common_iovec+0x91/0x130 [zfs]
[ 1806.416717] [] zpl_iter_write+0xb1/0xf0 [zfs]
[ 1806.416721] [] new_sync_write+0x9b/0xe0
[ 1806.416723] [] __vfs_write+0x26/0x40
[ 1806.416724] [] vfs_write+0xa9/0x1a0
[ 1806.416726] [] SyS_pwrite64+0x95/0xb0
[ 1806.416729] [] ? fpu__restore+0xf3/0x150
[ 1806.416734] [] entry_SYSCALL_64_fastpath+0x16/0x71

The problem was that we'd add the itx to the incorrect lwb, causing the
itx to be free'd while it was still being used. Specifically, we'd add
the itx to the lwb that was passed into zil_lwb_commit, but it's
possible that this lwb will not be the one that the itx is committed
to. If there is insufficient space in that lwb for this itx, that lwb
will be issued to disk, and the itx will be committed to the next lwb.

In this scenario, the itx would have been linked into the itx list of
the lwb that was just issued. When that lwb completes, the itx will be
free'd. If the lwb happens to complete before the call to
zil_lwb_commit returns, the itx will be free'd while it's still being
actively used in zil_lwb_commit.

I believe this issue becomes easier to trigger when the underlying
storage is "fast", which would help explain why I've only managed to
trigger this crash when using SSDs.

Signed-off-by: Prakash Surya
---
 module/zfs/zil.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index ff146db6cbd5..2016b788b9e8 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -1409,16 +1409,6 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
 	lrc = &itx->itx_lr;
 	lrw = (lr_write_t *)lrc;
 
-	/*
-	 * TODO: What happens when the data for this itx is later (via
-	 * the logic further down in this function) gets split between
-	 * multiple lwb's? We want to add the itx and the waiter to that
-	 * "last" lwb for this itx, not the "first" like we're currently
-	 * doing, right? This seems bad.
-	 */
-
-	list_insert_tail(&lwb->lwb_itxs, itx);
-
 	/*
 	 * A commit itx doesn't represent any on-disk state; instead
 	 * it's simply used as a place holder on the commit list, and
@@ -2075,6 +2065,11 @@ zil_process_commit_list(zilog_t *zilog)
 		if (!synced || frozen) {
 			if (lwb != NULL) {
 				lwb = zil_lwb_commit(zilog, itx, lwb);
+
+				if (lwb == NULL)
+					list_insert_tail(&nolwb_itxs, itx);
+				else
+					list_insert_tail(&lwb->lwb_itxs, itx);
 			} else {
 				ASSERT3P(lwb, ==, NULL);
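
Note (not part of the patch): below is a minimal, self-contained C sketch of
the ownership pattern this change moves to. The types and names (record_t,
buffer_t, buffer_commit, buffer_issue) are hypothetical stand-ins, not the
real ZFS structures or functions. It only models the idea mirrored from the
patch: zil_lwb_commit now returns the lwb the record actually landed in, and
the caller links the itx onto that lwb's list afterwards (or onto nolwb_itxs
when no lwb is available), so the itx can never be freed out from under the
commit path by an earlier, already-issued lwb.

#include <stdio.h>
#include <stdlib.h>

/*
 * Hypothetical stand-ins for itx_t and lwb_t; NOT the real ZFS
 * structures, just enough state to model the list-ownership question.
 */
typedef struct record {
	struct record	*next;
	size_t		size;
} record_t;

typedef struct buffer {
	record_t	*records;	/* analogous to lwb->lwb_itxs */
	size_t		used;
	size_t		capacity;
} buffer_t;

static buffer_t *
buffer_alloc(size_t capacity)
{
	buffer_t *b = calloc(1, sizeof (*b));
	b->capacity = capacity;
	return (b);
}

/*
 * Analogous to an lwb being issued and completing: every record that
 * was linked onto the buffer is freed along with it.
 */
static void
buffer_issue(buffer_t *b)
{
	record_t *r = b->records;
	while (r != NULL) {
		record_t *next = r->next;
		free(r);
		r = next;
	}
	free(b);
}

/*
 * Analogous to zil_lwb_commit() after the patch: if the current buffer
 * is too full, it is issued and a new one is opened, and whichever
 * buffer the record was copied into is returned.  Crucially, the
 * record is not linked onto any list here; the caller does that
 * against the returned buffer, so the record can never sit on the
 * list of a buffer that has already been issued and freed.
 */
static buffer_t *
buffer_commit(buffer_t *b, record_t *r)
{
	if (b->used + r->size > b->capacity) {
		buffer_issue(b);		/* the old buffer is gone */
		b = buffer_alloc(1024);		/* open the next buffer */
	}
	b->used += r->size;
	return (b);
}

int
main(void)
{
	buffer_t *b = buffer_alloc(1024);

	for (int i = 0; i < 10; i++) {
		record_t *r = calloc(1, sizeof (*r));
		r->size = 300;

		b = buffer_commit(b, r);

		/* Link the record to the buffer it actually landed in. */
		r->next = b->records;
		b->records = r;
	}

	buffer_issue(b);
	printf("all records committed and freed exactly once\n");
	return (0);
}

In the pre-patch arrangement, the equivalent of the r->next/b->records link
was made inside buffer_commit() before the capacity check, so a record could
be freed by buffer_issue() and then used immediately afterwards; that is the
use-after-free the panic above points at.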