From 36cb97654ef633779ee02fd8301b3334d32636c7 Mon Sep 17 00:00:00 2001
From: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Date: Sun, 16 Jun 2019 21:48:14 +0900
Subject: [PATCH] Fix race in parallel mount's thread dispatching algorithm

The strategy of parallel mount is as follows.

1) Initial thread dispatching selects sets of mount points that don't
 have dependencies on other sets, hence threads can and should run
 lock-less and shouldn't race with other threads over other sets. Each
 dispatched thread corresponds to a top-level directory which may or
 may not have datasets mounted on its sub-directories.

2) Subsequent recursive thread dispatching for each thread from 1)
 mounts the datasets within its set of mount points. The mount points
 within each set have dependencies (i.e. child directories), so child
 directories are processed only after their parent directory completes
 (see the sketch below).
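
A minimal, single-threaded C sketch of these two phases. This is a
standalone illustration, not the libzfs code: the mountpoint list and
the helper name path_contains() are made up for the example, and the
real zfs_foreach_mountpoint() hands each set and each child directory
to a thread pool instead of looping serially.

#include <stdio.h>
#include <string.h>

/* Sorted as mountpoint_cmp() would sort them: parents before children. */
static const char *mntpts[] = {
	"/a", "/a/b", "/a/b/c", "/d", "/d/e",
};
static const int nmnt = sizeof (mntpts) / sizeof (mntpts[0]);

/* Pre-fix predicate: is path2 a child of path1? */
static int
path_contains(const char *path1, const char *path2)
{
	return (strstr(path2, path1) == path2 && path2[strlen(path1)] == '/');
}

int
main(void)
{
	/*
	 * Phase 1: walk the sorted list; each entry not contained in
	 * the current root starts a new independent set.  The real
	 * code dispatches one thread per set here.
	 */
	for (int i = 0; i < nmnt; ) {
		const char *root = mntpts[i];
		printf("dispatch: set rooted at %s\n", root);
		/* Phase 2 (per thread): mount parents before children. */
		printf("  mount %s\n", mntpts[i]);
		for (i++; i < nmnt && path_contains(root, mntpts[i]); i++)
			printf("  mount %s\n", mntpts[i]);
	}
	return (0);
}

Note that with the pre-fix predicate above, a list containing the same
mountpoint twice (e.g. "/a", "/a") would start two sets rooted at the
same directory, which is exactly the race described next.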

The problem is that the initial thread dispatching can spawn more than
one thread for datasets with the same top-level directory, which puts
those threads in a race condition. This appeared as a mount issue on
ZoL, because ZoL has different timing around mount(2) execution due to
the fork(2)/exec(2) of mount(8) on mount. Fix it by making
libzfs_path_contains() return true for identical paths.
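
A standalone before/after comparison of the predicate (the names
old_path_contains() and new_path_contains() are illustrative, not part
of libzfs; only the latter matches the fixed code below):

#include <assert.h>
#include <stdio.h>
#include <string.h>

static int
old_path_contains(const char *path1, const char *path2)
{
	return (strstr(path2, path1) == path2 && path2[strlen(path1)] == '/');
}

static int
new_path_contains(const char *path1, const char *path2)
{
	return ((strcmp(path1, path2) == 0) ||
	    (strstr(path2, path1) == path2 && path2[strlen(path1)] == '/'));
}

int
main(void)
{
	/* Child paths behave the same before and after the fix. */
	assert(old_path_contains("/a", "/a/b"));
	assert(new_path_contains("/a", "/a/b"));
	/* "/ab" is not a child of "/a": the '/' check rejects it. */
	assert(!old_path_contains("/a", "/ab"));
	assert(!new_path_contains("/a", "/ab"));
	/* Identical paths: only the fixed predicate groups them. */
	assert(!old_path_contains("/a", "/a"));
	assert(new_path_contains("/a", "/a"));
	printf("ok\n");
	return (0);
}

Grouping identical paths into one set means a single thread owns that
top-level directory, so the duplicate-mountpoint dispatch race cannot
occur.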

Closes #8450
Closes #8833

Signed-off-by: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
---
 lib/libzfs/libzfs_mount.c                     |   5 +-
 tests/runfiles/linux.run                      |   3 +-
 .../functional/cli_root/zfs_mount/Makefile.am |   1 +
 .../cli_root/zfs_mount/zfs_mount_test_race.sh | 111 ++++++++++++++++++
 4 files changed, 117 insertions(+), 3 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_test_race.sh

diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c
index 7497a72233ad..8c7a7eabed45 100644
--- a/lib/libzfs/libzfs_mount.c
+++ b/lib/libzfs/libzfs_mount.c
@@ -1336,12 +1336,13 @@ mountpoint_cmp(const void *arga, const void *argb)
 }
 
 /*
- * Return true if path2 is a child of path1.
+ * Return true if path2 is a child of path1 or path2 equals path1.
  */
 static boolean_t
 libzfs_path_contains(const char *path1, const char *path2)
 {
-	return (strstr(path2, path1) == path2 && path2[strlen(path1)] == '/');
+	return ((strcmp(path1, path2) == 0) ||
+	    (strstr(path2, path1) == path2 && path2[strlen(path1)] == '/'));
 }
 
 /*
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 5d33b2058124..53a3cdef9dd2 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -182,7 +182,8 @@ tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos',
     'zfs_mount_007_pos', 'zfs_mount_008_pos', 'zfs_mount_009_neg',
     'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_neg',
     'zfs_mount_all_001_pos', 'zfs_mount_encrypted', 'zfs_mount_remount',
-    'zfs_multi_mount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints']
+    'zfs_multi_mount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints',
+    'zfs_mount_test_race']
 tags = ['functional', 'cli_root', 'zfs_mount']
 
 [tests/functional/cli_root/zfs_program]
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am
index b2de98934b74..c208a1c378dc 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am
@@ -19,6 +19,7 @@ dist_pkgdata_SCRIPTS = \
 	zfs_mount_all_mountpoints.ksh \
 	zfs_mount_encrypted.ksh \
 	zfs_mount_remount.ksh \
+	zfs_mount_test_race.sh \
 	zfs_multi_mount.ksh
 
 dist_pkgdata_DATA = \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_test_race.sh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_test_race.sh
new file mode 100755
index 000000000000..28e3c02c054a
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_test_race.sh
@@ -0,0 +1,111 @@
+#!/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zfs_mount/zfs_mount.cfg
+
+#
+# DESCRIPTION:
+# Verify parallel mount ordering is consistent.
+# See https://github.com/zfsonlinux/zfs/issues/8833 for details.
+#
+# STRATEGY:
+# 1. Create pools and filesystems.
+# 2. Set the same mount point for more than one dataset.
+# 3. Unmount all datasets.
+# 4. Mount all datasets.
+# 5. Unmount all datasets.
+#
+
+verify_runnable "both"
+
+TMPDIR=${TMPDIR:-$TEST_BASE_DIR}
+MNTPT=$TMPDIR/zfs_mount_test_race_mntpt
+DISK1="$TMPDIR/zfs_mount_test_race_disk1"
+DISK2="$TMPDIR/zfs_mount_test_race_disk2"
+
+TESTPOOL1=zfs_mount_test_race_tp1
+TESTPOOL2=zfs_mount_test_race_tp2
+
+export __ZFS_POOL_RESTRICT="$TESTPOOL1 $TESTPOOL2"
+log_must zfs $unmountall
+unset __ZFS_POOL_RESTRICT
+
+function cleanup
+{
+	zpool destroy $TESTPOOL1
+	zpool destroy $TESTPOOL2
+	rm -rf $MNTPT
+	rm -rf /$TESTPOOL1
+	rm -rf /$TESTPOOL2
+	rm -f $DISK1
+	rm -f $DISK2
+	export __ZFS_POOL_RESTRICT="$TESTPOOL1 $TESTPOOL2"
+	log_must zfs $mountall
+	unset __ZFS_POOL_RESTRICT
+}
+log_onexit cleanup
+
+log_note "Verify parallel mount ordering is consistent"
+
+log_must truncate -s $MINVDEVSIZE $DISK1
+log_must truncate -s $MINVDEVSIZE $DISK2
+
+log_must zpool create -f $TESTPOOL1 $DISK1
+log_must zpool create -f $TESTPOOL2 $DISK2
+
+log_must zfs create $TESTPOOL1/$TESTFS1
+log_must zfs create $TESTPOOL2/$TESTFS2
+
+log_must zfs set mountpoint=none $TESTPOOL1
+log_must zfs set mountpoint=$MNTPT $TESTPOOL1/$TESTFS1
+
+# Note that unmount can fail (due to the race condition in `zfs mount -a`)
+# with or without `canmount=off`.  The race has nothing to do with the
+# canmount property; it is off here only to simplify the mount layout.
+log_must zfs set canmount=off $TESTPOOL2
+log_must zfs set mountpoint=$MNTPT $TESTPOOL2
+
+# At this point, the layout of datasets in the two pools looks like below.
+# Previously, on the next `zfs mount -a`, the pthreads assigned to TESTFS1
+# and TESTFS2 could race, and TESTFS2 usually (in fact always) won in ZoL.
+# Note that the problem is that two or more threads could initially be
+# assigned to the same top-level directory, not this specific layout, which
+# is just one example that reproduces the race and is the one in #8833.
+#
+# NAME                  MOUNTED  MOUNTPOINT
+# ----------------------------------------------
+# /$TESTPOOL1           no       none
+# /$TESTPOOL1/$TESTFS1  yes      $MNTPT
+# /$TESTPOOL2           no       $MNTPT
+# /$TESTPOOL2/$TESTFS2  yes      $MNTPT/$TESTFS2
+
+# Both datasets must be mounted at this point.
+log_must ismounted $TESTPOOL1/$TESTFS1
+log_must ismounted $TESTPOOL2/$TESTFS2
+# This unmount always succeeds, because the potential race hasn't happened yet.
+log_must zfs unmount -a
+# This mount always succeeds, whether or not the threads hit the race.
+log_must zfs mount -a
+
+# Verify the datasets are mounted (TESTFS2 fails if the race broke mount order).
+log_must ismounted $TESTPOOL1/$TESTFS1
+log_must ismounted $TESTPOOL2/$TESTFS2
+# Verify unmount succeeds (it fails if the race broke the mount order).
+log_must zfs unmount -a
+
+log_pass "Parallel mount ordering is consistent"