Skip to content

Commit

Permalink
9425 allow channel programs to be stopped via signals
Browse files Browse the repository at this point in the history
Reviewed by: Sebastien Roy <[email protected]>
Reviewed by: Serapheim Dimitropoulos <[email protected]>
Reviewed by: Matt Ahrens <[email protected]>
Approved by: Robert Mustacchi <[email protected]>
  • Loading branch information
Don Brady authored and Prakash Surya committed Feb 20, 2019
1 parent 9d1587b commit d0cb1fb
Show file tree
Hide file tree
Showing 9 changed files with 289 additions and 89 deletions.
3 changes: 3 additions & 0 deletions usr/src/pkg/manifests/system-test-zfstest.mf
Original file line number Diff line number Diff line change
Expand Up @@ -605,6 +605,9 @@ file \
file \
path=opt/zfs-tests/tests/functional/channel_program/synctask_core/tst.snapshot_simple.zcp \
mode=0444
file \
path=opt/zfs-tests/tests/functional/channel_program/synctask_core/tst.terminate_by_signal \
mode=0555
$(i386_ONLY)file path=opt/zfs-tests/tests/functional/checksum/edonr_test.amd64 \
mode=0555
$(i386_ONLY)file path=opt/zfs-tests/tests/functional/checksum/edonr_test.i386 \
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#

#
# Copyright (c) 2017 by Delphix. All rights reserved.
#
. $STF_SUITE/tests/functional/channel_program/channel_common.kshlib

#
# DESCRIPTION: Execute a long-running zfs channel program and attempt to
# cancel it by sending a signal.
#

verify_runnable "global"

# Parent dataset under which the test creates its child file systems.
rootfs=$TESTPOOL/$TESTFS
# Snapshot name handed to the channel program as its second argument.
snapname=snap
# Value passed to 'zfs program -t' when the channel program is started.
limit=50000000

#
# Test teardown: if $rootfs still exists, destroy it together with all of
# its dependents (zfs destroy -R), which removes the child file systems and
# any snapshots the channel program managed to create.
#
function cleanup
{
datasetexists $rootfs && log_must zfs destroy -R $rootfs
}

# Register cleanup with the test library (presumably run when the test exits).
log_onexit cleanup

#
# Build the working set: 100 child file systems, child1 .. child100,
# directly under $rootfs.
#
i=0
while ((i < 100)); do
	((i += 1))
	log_must zfs create "$rootfs/child$i"
done

#
# Attempt to create 100 snapshots with zfs.sync.snapshot() along with some
# time consuming efforts. We use loops of zfs.check.* (dry run operations)
# to consume instructions before the next zfs.sync.snapshot() occurs.
#
# Without a signal interruption this ZCP would take several minutes and
# generate over 30 million Lua instructions.
#
# chan_prog feeds the Lua program below to 'zfs program' on stdin ('-'),
# passing $rootfs and $snapname as its two arguments and $limit as the -t
# value. For every child of $rootfs the program takes one real snapshot
# and then runs 20000 iterations of dry-run checks. The final return
# statement is only reached if the program runs to completion, which this
# test expects not to happen.
#
function chan_prog
{
zfs program -t $limit $TESTPOOL - $rootfs $snapname <<-EOF
arg = ...
fs = arg["argv"][1]
snap = arg["argv"][2]
for child in zfs.list.children(fs) do
local snapname = child .. "@" .. snap
zfs.check.snapshot(snapname)
zfs.sync.snapshot(snapname)
for i=1,20000,1 do
zfs.check.snapshot(snapname)
zfs.check.destroy(snapname)
zfs.check.destroy(fs)
end
end
return "should not have reached here"
EOF
}

log_note "Executing a long-running zfs program in the background"
chan_prog &
# PID of the background job that is running chan_prog.
CHILD=$!

#
# After waiting, send a kill signal to the channel program process.
# This should stop the ZCP near a million instructions but still have
# created some of the snapshots. Note that since the above zfs program
# command might get wrapped, we first signal the child processes of the
# background job (pkill -P) and then the background job itself.
#
sleep 10
log_pos pkill -P $CHILD
log_pos kill $CHILD

#
# Make sure the channel program did not fully complete by enforcing
# that not all of the snapshots were created.
#
snap_count=$(zfs list -t snapshot | grep $TESTPOOL | wc -l)
log_note "$snap_count snapshots created by ZCP"

#
# Zero snapshots means the program never got as far as its first
# zfs.sync.snapshot(); more than 50 (of the 100 possible) means the
# signal did not stop it soon enough to count as a cancel.
#
if [ "$snap_count" -eq 0 ]; then
	log_fail "Channel program failed to run."
elif [ "$snap_count" -gt 50 ]; then
	log_fail "Too many snapshots after a cancel ($snap_count)."
else
	log_pass "Canceling a long-running channel program works."
fi
24 changes: 21 additions & 3 deletions usr/src/uts/common/fs/zfs/dsl_synctask.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ dsl_null_checkfunc(void *arg, dmu_tx_t *tx)

static int
dsl_sync_task_common(const char *pool, dsl_checkfunc_t *checkfunc,
dsl_syncfunc_t *syncfunc, void *arg,
dsl_syncfunc_t *syncfunc, dsl_sigfunc_t *sigfunc, void *arg,
int blocks_modified, zfs_space_check_t space_check, boolean_t early)
{
spa_t *spa;
Expand Down Expand Up @@ -85,6 +85,11 @@ dsl_sync_task_common(const char *pool, dsl_checkfunc_t *checkfunc,

dmu_tx_commit(tx);

if (sigfunc != NULL && txg_wait_synced_sig(dp, dst.dst_txg)) {
/* current contract is to call func once */
sigfunc(arg, tx);
sigfunc = NULL; /* in case of an EAGAIN retry */
}
txg_wait_synced(dp, dst.dst_txg);

if (dst.dst_error == EAGAIN) {
Expand Down Expand Up @@ -124,7 +129,7 @@ dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
dsl_syncfunc_t *syncfunc, void *arg,
int blocks_modified, zfs_space_check_t space_check)
{
return (dsl_sync_task_common(pool, checkfunc, syncfunc, arg,
return (dsl_sync_task_common(pool, checkfunc, syncfunc, NULL, arg,
blocks_modified, space_check, B_FALSE));
}

Expand All @@ -146,10 +151,23 @@ dsl_early_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
dsl_syncfunc_t *syncfunc, void *arg,
int blocks_modified, zfs_space_check_t space_check)
{
return (dsl_sync_task_common(pool, checkfunc, syncfunc, arg,
return (dsl_sync_task_common(pool, checkfunc, syncfunc, NULL, arg,
blocks_modified, space_check, B_TRUE));
}

/*
 * Signal-aware variant of the standard synctask. Should the calling
 * thread be signaled while it waits for the task to sync, sigfunc is
 * invoked a single time. Returns whatever dsl_sync_task_common() returns.
 */
int
dsl_sync_task_sig(const char *pool, dsl_checkfunc_t *checkfunc,
    dsl_syncfunc_t *syncfunc, dsl_sigfunc_t *sigfunc, void *arg,
    int blocks_modified, zfs_space_check_t space_check)
{
	int error;

	/* B_FALSE selects the regular (non-early) synctask path. */
	error = dsl_sync_task_common(pool, checkfunc, syncfunc, sigfunc, arg,
	    blocks_modified, space_check, B_FALSE);

	return (error);
}

static void
dsl_sync_task_nowait_common(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx,
Expand Down
3 changes: 3 additions & 0 deletions usr/src/uts/common/fs/zfs/sys/dsl_synctask.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ struct dsl_pool;

/* Synctask check callback; returns an int status. */
typedef int (dsl_checkfunc_t)(void *, dmu_tx_t *);
/* Synctask sync callback; returns nothing. */
typedef void (dsl_syncfunc_t)(void *, dmu_tx_t *);
/*
 * Signal callback used by dsl_sync_task_sig(): called once, with the task's
 * arg, if the thread is signaled while waiting for the task to sync.
 */
typedef void (dsl_sigfunc_t)(void *, dmu_tx_t *);

typedef enum zfs_space_check {
/*
Expand Down Expand Up @@ -116,6 +117,8 @@ int dsl_early_sync_task(const char *, dsl_checkfunc_t *,
dsl_syncfunc_t *, void *, int, zfs_space_check_t);
void dsl_early_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *,
void *, int, zfs_space_check_t, dmu_tx_t *);
/*
 * Standard synctask that can be interrupted by a signal; the dsl_sigfunc_t
 * is called once if a signal arrives while waiting for the task to sync.
 */
int dsl_sync_task_sig(const char *, dsl_checkfunc_t *, dsl_syncfunc_t *,
dsl_sigfunc_t *, void *, int, zfs_space_check_t);

#ifdef __cplusplus
}
Expand Down
5 changes: 5 additions & 0 deletions usr/src/uts/common/fs/zfs/sys/txg.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,11 @@ extern void txg_kick(struct dsl_pool *dp);
*/
extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg);

/*
 * Wait as above, except that the wait can be interrupted by a signal.
 * Returns B_TRUE if the thread was signaled while waiting, in which case
 * the txg may not have synced yet and the caller may wait again; returns
 * B_FALSE otherwise.
 */
extern boolean_t txg_wait_synced_sig(struct dsl_pool *dp, uint64_t txg);

/*
* Wait until the given transaction group, or one after it, is
* the open transaction group. Try to make this happen as soon
Expand Down
31 changes: 31 additions & 0 deletions usr/src/uts/common/fs/zfs/sys/zcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,12 @@ typedef struct zcp_cleanup_handler {
list_node_t zch_node;
} zcp_cleanup_handler_t;

/*
 * Arguments for the Lua memory allocator used by a channel program.
 * NOTE(review): the field notes below are inferred from the names only;
 * confirm against the allocator implementation.
 */
typedef struct zcp_alloc_arg {
/* presumably: allocation requests may not fail while this is set */
boolean_t aa_must_succeed;
/* presumably: amount of memory still available to the program */
int64_t aa_alloc_remaining;
/* presumably: total memory limit for the program */
int64_t aa_alloc_limit;
} zcp_alloc_arg_t;

typedef struct zcp_run_info {
dsl_pool_t *zri_pool;

Expand Down Expand Up @@ -93,6 +99,11 @@ typedef struct zcp_run_info {
*/
boolean_t zri_timed_out;

/*
* Channel program was canceled by user
*/
boolean_t zri_canceled;

/*
* Boolean indicating whether or not we are running in syncing
* context.
Expand All @@ -104,6 +115,26 @@ typedef struct zcp_run_info {
* triggered in the event of a fatal error.
*/
list_t zri_cleanup_handlers;

/*
* The Lua state context of our channel program.
*/
lua_State *zri_state;

/*
* Lua memory allocator arguments.
*/
zcp_alloc_arg_t *zri_allocargs;

/*
* Contains output values from zcp script or error string.
*/
nvlist_t *zri_outnvl;

/*
* The errno number returned to caller of zcp_eval().
*/
int zri_result;
} zcp_run_info_t;

zcp_run_info_t *zcp_run_info(lua_State *);
Expand Down
36 changes: 33 additions & 3 deletions usr/src/uts/common/fs/zfs/txg.c
Original file line number Diff line number Diff line change
Expand Up @@ -632,8 +632,8 @@ txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
mutex_exit(&tx->tx_sync_lock);
}

void
txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
static boolean_t
txg_wait_synced_impl(dsl_pool_t *dp, uint64_t txg, boolean_t wait_sig)
{
tx_state_t *tx = &dp->dp_tx;

Expand All @@ -652,9 +652,39 @@ txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
"tx_synced=%llu waiting=%llu dp=%p\n",
tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
cv_broadcast(&tx->tx_sync_more_cv);
cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
if (wait_sig) {
/*
* Condition wait here but stop if the thread receives a
* signal. The caller may call txg_wait_synced*() again
* to resume waiting for this txg.
*/
if (cv_wait_sig(&tx->tx_sync_done_cv,
&tx->tx_sync_lock) == 0) {
mutex_exit(&tx->tx_sync_lock);
return (B_TRUE);
}
} else {
cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
}
}
mutex_exit(&tx->tx_sync_lock);
return (B_FALSE);
}

/*
 * Wait for the given txg to sync without being interruptible by signals.
 * With wait_sig set to B_FALSE the implementation only reports B_TRUE from
 * its signal-wait branch, which is never taken here, so the result must be
 * B_FALSE; VERIFY0 enforces that.
 */
void
txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
{
VERIFY0(txg_wait_synced_impl(dp, txg, B_FALSE));
}

/*
 * Signal-interruptible flavor of txg_wait_synced(). The result is B_TRUE
 * when a signal cut the wait short and B_FALSE otherwise.
 */
boolean_t
txg_wait_synced_sig(dsl_pool_t *dp, uint64_t txg)
{
	boolean_t signaled = txg_wait_synced_impl(dp, txg, B_TRUE);

	return (signaled);
}

void
Expand Down
Loading

0 comments on commit d0cb1fb

Please sign in to comment.