Skip to content

Commit

Permalink
6531 Provide mechanism to artificially limit disk performance
Browse files Browse the repository at this point in the history
Reviewed by: Paul Dagnelie <[email protected]>
Reviewed by: Matthew Ahrens <[email protected]>
Reviewed by: George Wilson <[email protected]>
Approved by: Dan McDonald <[email protected]>
  • Loading branch information
Prakash Surya authored and ahrens committed Jan 11, 2016
1 parent 6b4a8fe commit 97e8130
Show file tree
Hide file tree
Showing 8 changed files with 413 additions and 26 deletions.
111 changes: 108 additions & 3 deletions usr/src/cmd/zinject/zinject.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/

/*
Expand Down Expand Up @@ -225,21 +225,57 @@ usage(void)
"\t\tall records if 'all' is specificed.\n"
"\n"
"\tzinject -p <function name> pool\n"
"\n"
"\t\tInject a panic fault at the specified function. Only \n"
"\t\tfunctions which call spa_vdev_config_exit(), or \n"
"\t\tspa_vdev_exit() will trigger a panic.\n"
"\n"
"\tzinject -d device [-e errno] [-L <nvlist|uber|pad1|pad2>] [-F]\n"
"\t [-T <read|write|free|claim|all> pool\n"
"\n"
"\t\tInject a fault into a particular device or the device's\n"
"\t\tlabel. Label injection can either be 'nvlist', 'uber',\n "
"\t\t'pad1', or 'pad2'.\n"
"\t\t'errno' can be 'nxio' (the default), 'io', or 'dtl'.\n"
"\n"
"\tzinject -d device -A <degrade|fault> pool\n"
"\n"
"\t\tPerform a specific action on a particular device\n"
"\n"
"\tzinject -d device -D latency:lanes pool\n"
"\n"
"\t\tAdd an artificial delay to IO requests on a particular\n"
"\t\tdevice, such that the requests take a minimum of 'latency'\n"
"\t\tmilliseconds to complete. Each delay has an associated\n"
"\t\tnumber of 'lanes' which defines the number of concurrent\n"
"\t\tIO requests that can be processed.\n"
"\n"
"\t\tFor example, with a single lane delay of 10 ms (-D 10:1),\n"
"\t\tthe device will only be able to service a single IO request\n"
"\t\tat a time with each request taking 10 ms to complete. So,\n"
"\t\tif only a single request is submitted every 10 ms, the\n"
"\t\taverage latency will be 10 ms; but if more than one request\n"
"\t\tis submitted every 10 ms, the average latency will be more\n"
"\t\tthan 10 ms.\n"
"\n"
"\t\tSimilarly, if a delay of 10 ms is specified to have two\n"
"\t\tlanes (-D 10:2), then the device will be able to service\n"
"\t\ttwo requests at a time, each with a minimum latency of\n"
"\t\t10 ms. So, if two requests are submitted every 10 ms, then\n"
"\t\tthe average latency will be 10 ms; but if more than two\n"
"\t\trequests are submitted every 10 ms, the average latency\n"
"\t\twill be more than 10 ms.\n"
"\n"
"\t\tAlso note, these delays are additive. So two invocations\n"
"\t\tof '-D 10:1', is roughly equivalent to a single invocation\n"
"\t\tof '-D 10:2'. This also means, one can specify multiple\n"
"\t\tlanes with differing target latencies. For example, an\n"
"\t\tinvocation of '-D 10:1' followed by '-D 25:2' will\n"
"\t\tcreate 3 lanes on the device; one lane with a latency\n"
"\t\tof 10 ms and two lanes with a 25 ms latency.\n"
"\n"
"\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
"\n"
"\t\tCause the pool to stop writing blocks yet not\n"
"\t\treport errors for a duration. Simulates buggy hardware\n"
"\t\tthat fails to honor cache flush requests.\n"
Expand Down Expand Up @@ -353,6 +389,9 @@ print_device_handler(int id, const char *pool, zinject_record_t *record,
if (record->zi_guid == 0 || record->zi_func[0] != '\0')
return (0);

if (record->zi_cmd == ZINJECT_DELAY_IO)
return (0);

if (*count == 0) {
(void) printf("%3s %-15s %s\n", "ID", "POOL", "GUID");
(void) printf("--- --------------- ----------------\n");
Expand All @@ -366,6 +405,35 @@ print_device_handler(int id, const char *pool, zinject_record_t *record,
return (0);
}

/*
 * iter_handlers() callback that prints one row for each device record
 * describing an artificial I/O delay (ZINJECT_DELAY_IO).  Every other kind
 * of record is silently ignored.
 *
 * 'data' points to an int holding the number of rows printed so far; the
 * column header is emitted just before the first row, and the counter is
 * bumped for every row printed.  Always returns 0.
 */
static int
print_delay_handler(int id, const char *pool, zinject_record_t *record,
    void *data)
{
	int *nprinted = data;

	/*
	 * Only records that name a device (non-zero guid), carry no function
	 * name, and are delay injections belong in this table.
	 */
	if (record->zi_guid == 0 || record->zi_func[0] != '\0' ||
	    record->zi_cmd != ZINJECT_DELAY_IO)
		return (0);

	/* First matching record: emit the table header ahead of the row. */
	if (*nprinted == 0) {
		(void) printf("%3s  %-15s  %-15s  %-15s  %s\n",
		    "ID", "POOL", "DELAY (ms)", "LANES", "GUID");
		(void) printf("---  ---------------  ---------------  "
		    "---------------  ----------------\n");
	}

	(*nprinted)++;

	/* zi_timer is converted with NSEC2MSEC() for the "DELAY (ms)" column */
	(void) printf("%3d  %-15s  %-15llu  %-15llu  %llx\n", id, pool,
	    (u_longlong_t)NSEC2MSEC(record->zi_timer),
	    (u_longlong_t)record->zi_nlanes,
	    (u_longlong_t)record->zi_guid);

	return (0);
}

static int
print_panic_handler(int id, const char *pool, zinject_record_t *record,
void *data)
Expand Down Expand Up @@ -403,6 +471,13 @@ print_all_handlers(void)
count = 0;
}

(void) iter_handlers(print_delay_handler, &count);
if (count > 0) {
total += count;
(void) printf("\n");
count = 0;
}

(void) iter_handlers(print_data_handler, &count);
if (count > 0) {
total += count;
Expand Down Expand Up @@ -545,6 +620,35 @@ perform_action(const char *pool, zinject_record_t *record, int cmd)
return (1);
}

/*
 * Parse the argument of the '-D' option, which has the form
 * "<latency>:<lanes>".
 *
 * Both fields must be plain, non-zero decimal numbers: no leading
 * whitespace or sign, nothing but the ':' between them, and nothing after
 * the lane count.  'latency' is given in milliseconds.
 *
 * On success the latency, scaled to nanoseconds, is stored in *delay, the
 * lane count is stored in *nlanes, and 0 is returned.  On malformed or
 * out-of-range input 1 is returned and neither output is written.
 */
static int
parse_delay(char *str, uint64_t *delay, uint64_t *nlanes)
{
	/*
	 * Largest millisecond count whose nanosecond equivalent still fits
	 * in a signed 64-bit quantity.  The kernel side carries the delay
	 * around as an hrtime_t (presumably signed 64-bit -- confirm), so
	 * anything larger would wrap once scaled.
	 */
	const uint64_t max_msec =
	    (~(uint64_t)0 >> 1) / (uint64_t)MSEC2NSEC(1);
	unsigned long long scan_delay;
	unsigned long long scan_nlanes;
	char *end;

	/*
	 * strtoull() skips leading whitespace and accepts a sign (a
	 * negative value silently wraps), so insist on a leading digit.
	 */
	if (str[0] < '0' || str[0] > '9')
		return (1);

	errno = 0;
	scan_delay = strtoull(str, &end, 10);
	if (errno != 0 || *end != ':')
		return (1);

	str = end + 1;
	if (str[0] < '0' || str[0] > '9')
		return (1);

	errno = 0;
	scan_nlanes = strtoull(str, &end, 10);
	if (errno != 0 || *end != '\0')
		return (1);

	/*
	 * We explicitly disallow a delay of zero here, because we key
	 * off this value being non-zero in translate_device(), to
	 * determine if the fault is a ZINJECT_DELAY_IO fault or not.
	 */
	if (scan_delay == 0 || scan_delay > max_msec)
		return (1);

	/*
	 * The lane count is the number of I/O requests the delay can
	 * service concurrently; a delay with no lanes is meaningless.
	 */
	if (scan_nlanes == 0)
		return (1);

	/*
	 * The unit of the CLI delay parameter is milliseconds, but the
	 * value passed to the kernel is interpreted as nanoseconds, so
	 * scale it here before handing it back to the caller.
	 */
	*delay = MSEC2NSEC(scan_delay);
	*nlanes = scan_nlanes;

	return (0);
}

int
main(int argc, char **argv)
{
Expand Down Expand Up @@ -628,8 +732,9 @@ main(int argc, char **argv)
device = optarg;
break;
case 'D':
record.zi_timer = strtoull(optarg, &end, 10);
if (errno != 0 || *end != '\0') {
ret = parse_delay(optarg, &record.zi_timer,
&record.zi_nlanes);
if (ret != 0) {
(void) fprintf(stderr, "invalid i/o delay "
"value: '%s'\n", optarg);
usage();
Expand Down
1 change: 1 addition & 0 deletions usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,7 @@ typedef struct zinject_record {
uint32_t zi_iotype;
int32_t zi_duration;
uint64_t zi_timer;
uint64_t zi_nlanes;
uint32_t zi_cmd;
uint32_t zi_pad;
} zinject_record_t;
Expand Down
5 changes: 4 additions & 1 deletion usr/src/uts/common/fs/zfs/sys/zio.h
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,7 @@ struct zio {

uint64_t io_offset;
hrtime_t io_timestamp;
hrtime_t io_target_timestamp;
avl_node_t io_queue_node;
avl_node_t io_offset_node;

Expand Down Expand Up @@ -506,6 +507,8 @@ extern int zio_wait(zio_t *zio);
extern void zio_nowait(zio_t *zio);
extern void zio_execute(zio_t *zio);
extern void zio_interrupt(zio_t *zio);
extern void zio_delay_init(zio_t *zio);
extern void zio_delay_interrupt(zio_t *zio);

extern zio_t *zio_walk_parents(zio_t *cio);
extern zio_t *zio_walk_children(zio_t *pio);
Expand Down Expand Up @@ -567,7 +570,7 @@ extern int zio_handle_fault_injection(zio_t *zio, int error);
extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
extern int zio_handle_label_injection(zio_t *zio, int error);
extern void zio_handle_ignored_writes(zio_t *zio);
extern uint64_t zio_handle_io_delay(zio_t *zio);
extern hrtime_t zio_handle_io_delay(zio_t *zio);

/*
* Checksum ereport functions
Expand Down
5 changes: 3 additions & 2 deletions usr/src/uts/common/fs/zfs/vdev_disk.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2013 Joyent, Inc. All rights reserved.
*/
Expand Down Expand Up @@ -691,7 +691,7 @@ vdev_disk_io_intr(buf_t *bp)

kmem_free(vb, sizeof (vdev_buf_t));

zio_interrupt(zio);
zio_delay_interrupt(zio);
}

static void
Expand Down Expand Up @@ -797,6 +797,7 @@ vdev_disk_io_start(zio_t *zio)
}

ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
zio->io_target_timestamp = zio_handle_io_delay(zio);

vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);

Expand Down
5 changes: 3 additions & 2 deletions usr/src/uts/common/fs/zfs/vdev_file.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/

#include <sys/zfs_context.h>
Expand Down Expand Up @@ -158,7 +158,7 @@ vdev_file_io_intr(buf_t *bp)
zio->io_error = SET_ERROR(ENOSPC);

kmem_free(vb, sizeof (vdev_buf_t));
zio_interrupt(zio);
zio_delay_interrupt(zio);
}

static void
Expand Down Expand Up @@ -212,6 +212,7 @@ vdev_file_io_start(zio_t *zio)
}

ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
zio->io_target_timestamp = zio_handle_io_delay(zio);

vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);

Expand Down
3 changes: 0 additions & 3 deletions usr/src/uts/common/fs/zfs/vdev_queue.c
Original file line number Diff line number Diff line change
Expand Up @@ -729,9 +729,6 @@ vdev_queue_io_done(zio_t *zio)
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
zio_t *nio;

if (zio_injection_enabled)
delay(SEC_TO_TICK(zio_handle_io_delay(zio)));

mutex_enter(&vq->vq_lock);

vdev_queue_pending_remove(vq, zio);
Expand Down
52 changes: 52 additions & 0 deletions usr/src/uts/common/fs/zfs/zio.c
Original file line number Diff line number Diff line change
Expand Up @@ -1352,6 +1352,58 @@ zio_interrupt(zio_t *zio)
zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
}

/*
 * Completion entry point used by the vdev interrupt handlers in place of a
 * direct zio_interrupt() call.  If a target completion time was recorded in
 * zio->io_target_timestamp and has not yet been reached, the hand-off to
 * zio_interrupt() is deferred until then; otherwise the zio is handed over
 * immediately.
 */
void
zio_delay_interrupt(zio_t *zio)
{
#ifdef _KERNEL
	/*
	 * Delays are only honored in the kernel.  timeout_generic() isn't
	 * defined in userspace, so rather than implementing it there,
	 * userspace builds always fall through to the "skip" path below.
	 *
	 * A zero io_target_timestamp means no delay was registered for this
	 * IO, which also takes the "skip" path.
	 */
	if (zio->io_target_timestamp != 0) {
		hrtime_t now = gethrtime();
		hrtime_t remaining;

		if (zio->io_target_timestamp <= now) {
			/*
			 * "Miss": the IO has already taken at least as long
			 * as the target delay, so don't hold it any longer.
			 * This is likely due to the target latency being
			 * lower than what the underlying hardware can
			 * satisfy (e.g. delay set to 1ms, but the disks take
			 * 10ms to complete an IO request).
			 */
			DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
			    hrtime_t, now);

			zio_interrupt(zio);
			return;
		}

		/*
		 * "Hit": the IO finished early; arrange for
		 * zio_interrupt() to be called on it via timeout_generic()
		 * using the time still remaining until the target.
		 */
		remaining = zio->io_target_timestamp - now;

		DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
		    hrtime_t, now, hrtime_t, remaining);

		(void) timeout_generic(CALLOUT_NORMAL,
		    (void (*)(void *))zio_interrupt, zio, remaining, 1, 0);
		return;
	}
#endif

	/* "Skip": nothing to wait for, issue directly to the zio layer. */
	DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
	zio_interrupt(zio);
}

/*
* Execute the I/O pipeline until one of the following occurs:
*
Expand Down
Loading

0 comments on commit 97e8130

Please sign in to comment.