Skip to content

Commit

Permalink
Remove zfs_vdev_elevator module option
Browse files Browse the repository at this point in the history
Originally the zfs_vdev_elevator module option was added as a
convenience so the requested elevator would be automatically set
on the underlying block devices.  At the time this was simple
because the kernel provided an API function which did exactly this.

This API was then removed in the Linux 4.12 kernel which prompted
us to add compatibly code to set the elevator via a usermodehelper.
While well intentioned this can result in a system hang if the
usermodehelper does not return.  This is because it may be called
with the spa config lock held as a writter.  For example via the
zfs_ioc_pool_scan() -> spa_scan() -> spa_scan() -> vdev_reopen()
callpath.

Additionally, there's a reasonable argument that setting the elevator
is the resonsiblity of whatever software is being used to configure
the system.  Therefore, it's proposed that the zfs_vdev_elevator
be removed.

Signed-off-by: Brian Behlendorf <[email protected]>
Issue openzfs#8664
  • Loading branch information
behlendorf committed Sep 11, 2019
1 parent d666206 commit f9a619c
Show file tree
Hide file tree
Showing 5 changed files with 0 additions and 154 deletions.
25 changes: 0 additions & 25 deletions config/kernel-elevator-change.m4

This file was deleted.

1 change: 0 additions & 1 deletion config/kernel.m4
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,6 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
ZFS_AC_KERNEL_S_D_OP
ZFS_AC_KERNEL_BDI
ZFS_AC_KERNEL_SET_NLINK
ZFS_AC_KERNEL_ELEVATOR_CHANGE
ZFS_AC_KERNEL_5ARG_SGET
ZFS_AC_KERNEL_LSEEK_EXECUTE
ZFS_AC_KERNEL_VFS_ITERATE
Expand Down
8 changes: 0 additions & 8 deletions include/os/linux/kernel/linux/blkdev_compat.h
Original file line number Diff line number Diff line change
Expand Up @@ -638,14 +638,6 @@ blk_queue_discard_secure(struct request_queue *q)
#endif
}

/*
* Default Linux IO Scheduler,
* Setting the scheduler to noop will allow the Linux IO scheduler to
* still perform front and back merging, while leaving the request
* ordering and prioritization to the ZFS IO scheduler.
*/
#define VDEV_SCHEDULER "noop"

/*
* A common holder for vdev_bdev_open() is used to relax the exclusive open
* semantics slightly. Internal vdev disk callers may pass VDEV_HOLDER to
Expand Down
12 changes: 0 additions & 12 deletions man/man5/zfs-module-parameters.5
Original file line number Diff line number Diff line change
Expand Up @@ -3170,18 +3170,6 @@ threshold.
Default value: \fB32,768\fR.
.RE

.sp
.ne 2
.na
\fBzfs_vdev_scheduler\fR (charp)
.ad
.RS 12n
Set the Linux I/O scheduler on whole disk vdevs to this scheduler. Valid options
are noop, cfq, bfq & deadline
.sp
Default value: \fBnoop\fR.
.RE

.sp
.ne 2
.na
Expand Down
108 changes: 0 additions & 108 deletions module/os/linux/zfs/vdev_disk.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@
#include <linux/msdos_fs.h>
#include <linux/vfs_compat.h>

char *zfs_vdev_scheduler = VDEV_SCHEDULER;
static void *zfs_vdev_holder = VDEV_HOLDER;

/* size of the "reserved" partition, in blocks */
Expand Down Expand Up @@ -159,75 +158,6 @@ vdev_disk_error(zio_t *zio)
zio->io_flags);
}

/*
* Use the Linux 'noop' elevator for zfs managed block devices. This
* strikes the ideal balance by allowing the zfs elevator to do all
* request ordering and prioritization. While allowing the Linux
* elevator to do the maximum front/back merging allowed by the
* physical device. This yields the largest possible requests for
* the device with the lowest total overhead.
*/
static void
vdev_elevator_switch(vdev_t *v, char *elevator)
{
vdev_disk_t *vd = v->vdev_tsd;
struct request_queue *q;
char *device;
int error;

for (int c = 0; c < v->vdev_children; c++)
vdev_elevator_switch(v->vdev_child[c], elevator);

if (!v->vdev_ops->vdev_op_leaf || vd->vd_bdev == NULL)
return;

q = bdev_get_queue(vd->vd_bdev);
device = vd->vd_bdev->bd_disk->disk_name;

/*
* Skip devices which are not whole disks (partitions).
* Device-mapper devices are excepted since they may be whole
* disks despite the vdev_wholedisk flag, in which case we can
* and should switch the elevator. If the device-mapper device
* does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
* "Skip devices without schedulers" check below will fail.
*/
if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
return;

/* Leave existing scheduler when set to "none" */
if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
return;

/*
* The elevator_change() function was available in kernels from
* 2.6.36 to 4.11. When not available fall back to using the user
* mode helper functionality to set the elevator via sysfs. This
* requires /bin/echo and sysfs to be mounted which may not be true
* early in the boot process.
*/
#ifdef HAVE_ELEVATOR_CHANGE
error = elevator_change(q, elevator);
#else
#define SET_SCHEDULER_CMD \
"exec 0</dev/null " \
" 1>/sys/block/%s/queue/scheduler " \
" 2>/dev/null; " \
"echo %s"

char *argv[] = { "/bin/sh", "-c", NULL, NULL };
char *envp[] = { NULL };

argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
strfree(argv[2]);
#endif /* HAVE_ELEVATOR_CHANGE */
if (error) {
zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d",
elevator, v->vdev_path, device, error);
}
}

static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
uint64_t *ashift)
Expand Down Expand Up @@ -359,9 +289,6 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
/* Based on the minimum sector size set the block size */
*ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;

/* Try to set the io scheduler elevator algorithm */
(void) vdev_elevator_switch(v, zfs_vdev_scheduler);

return (0);
}

Expand Down Expand Up @@ -902,37 +829,6 @@ vdev_disk_rele(vdev_t *vd)
/* XXX: Implement me as a vnode rele for the device */
}

static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
spa_t *spa = NULL;
char *p;

if (val == NULL)
return (SET_ERROR(-EINVAL));

if ((p = strchr(val, '\n')) != NULL)
*p = '\0';

if (spa_mode_global != 0) {
mutex_enter(&spa_namespace_lock);
while ((spa = spa_next(spa)) != NULL) {
if (spa_state(spa) != POOL_STATE_ACTIVE ||
!spa_writeable(spa) || spa_suspended(spa))
continue;

spa_open_ref(spa, FTAG);
mutex_exit(&spa_namespace_lock);
vdev_elevator_switch(spa->spa_root_vdev, (char *)val);
mutex_enter(&spa_namespace_lock);
spa_close(spa, FTAG);
}
mutex_exit(&spa_namespace_lock);
}

return (param_set_charp(val, kp));
}

vdev_ops_t vdev_disk_ops = {
.vdev_op_open = vdev_disk_open,
.vdev_op_close = vdev_disk_close,
Expand All @@ -948,7 +844,3 @@ vdev_ops_t vdev_disk_ops = {
.vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */
.vdev_op_leaf = B_TRUE /* leaf vdev */
};

module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");

0 comments on commit f9a619c

Please sign in to comment.