Skip to content

Commit

Permalink
block, scsi: Make SCSI quiesce and resume work reliably
Browse files Browse the repository at this point in the history
The contexts from which a SCSI device can be quiesced or resumed are:
* Writing into /sys/class/scsi_device/*/device/state.
* SCSI parallel (SPI) domain validation.
* The SCSI device power management methods. See also scsi_bus_pm_ops.

It is essential during suspend and resume that neither the filesystem
state nor the filesystem metadata in RAM changes. This is why while
the hibernation image is being written or restored that SCSI devices
are quiesced. The SCSI core quiesces devices through scsi_device_quiesce()
and scsi_device_resume(). In the SDEV_QUIESCE state execution of
non-preempt requests is deferred. This is realized by returning
BLKPREP_DEFER from inside scsi_prep_state_check() for quiesced SCSI
devices. Avoid that a full queue prevents power management requests
to be submitted by deferring allocation of non-preempt requests for
devices in the quiesced state. This patch has been tested by running
the following commands and by verifying that after each resume the
fio job was still running:

for ((i=0; i<10; i++)); do
  (
    cd /sys/block/md0/md &&
    while true; do
      [ "$(<sync_action)" = "idle" ] && echo check > sync_action
      sleep 1
    done
  ) &
  pids=($!)
  for d in /sys/class/block/sd*[a-z]; do
    bdev=${d#/sys/class/block/}
    hcil=$(readlink "$d/device")
    hcil=${hcil#../../../}
    echo 4 > "$d/queue/nr_requests"
    echo 1 > "/sys/class/scsi_device/$hcil/device/queue_depth"
    fio --name="$bdev" --filename="/dev/$bdev" --buffered=0 --bs=512 \
      --rw=randread --ioengine=libaio --numjobs=4 --iodepth=16       \
      --iodepth_batch=1 --thread --loops=$((2**31)) &
    pids+=($!)
  done
  sleep 1
  echo "$(date) Hibernating ..." >>hibernate-test-log.txt
  systemctl hibernate
  sleep 10
  kill "${pids[@]}"
  echo idle > /sys/block/md0/md/sync_action
  wait
  echo "$(date) Done." >>hibernate-test-log.txt
done

Reported-by: Oleksandr Natalenko <[email protected]>
References: "I/O hangs after resuming from suspend-to-ram" (https://marc.info/?l=linux-block&m=150340235201348).
Signed-off-by: Bart Van Assche <[email protected]>
Reviewed-by: Hannes Reinecke <[email protected]>
Tested-by: Martin Steigerwald <[email protected]>
Tested-by: Oleksandr Natalenko <[email protected]>
Cc: Martin K. Petersen <[email protected]>
Cc: Ming Lei <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Johannes Thumshirn <[email protected]>
Signed-off-by: Jens Axboe <[email protected]>
  • Loading branch information
KAGA-KOKO authored and axboe committed Nov 11, 2017
1 parent c9254f2 commit 3a0a529
Show file tree
Hide file tree
Showing 6 changed files with 70 additions and 25 deletions.
42 changes: 34 additions & 8 deletions block/blk-core.c
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,7 @@ void blk_clear_preempt_only(struct request_queue *q)

spin_lock_irqsave(q->queue_lock, flags);
queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q);
wake_up_all(&q->mq_freeze_wq);
spin_unlock_irqrestore(q->queue_lock, flags);
}
EXPORT_SYMBOL_GPL(blk_clear_preempt_only);
Expand Down Expand Up @@ -795,15 +796,38 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
}
EXPORT_SYMBOL(blk_alloc_queue);

int blk_queue_enter(struct request_queue *q, bool nowait)
/**
* blk_queue_enter() - try to increase q->q_usage_counter
* @q: request queue pointer
* @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT
*/
int blk_queue_enter(struct request_queue *q, unsigned int flags)
{
const bool preempt = flags & BLK_MQ_REQ_PREEMPT;

while (true) {
bool success = false;
int ret;

if (percpu_ref_tryget_live(&q->q_usage_counter))
rcu_read_lock_sched();
if (percpu_ref_tryget_live(&q->q_usage_counter)) {
/*
* The code that sets the PREEMPT_ONLY flag is
* responsible for ensuring that that flag is globally
* visible before the queue is unfrozen.
*/
if (preempt || !blk_queue_preempt_only(q)) {
success = true;
} else {
percpu_ref_put(&q->q_usage_counter);
}
}
rcu_read_unlock_sched();

if (success)
return 0;

if (nowait)
if (flags & BLK_MQ_REQ_NOWAIT)
return -EBUSY;

/*
Expand All @@ -816,7 +840,8 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
smp_rmb();

ret = wait_event_interruptible(q->mq_freeze_wq,
!atomic_read(&q->mq_freeze_depth) ||
(atomic_read(&q->mq_freeze_depth) == 0 &&
(preempt || !blk_queue_preempt_only(q))) ||
blk_queue_dying(q));
if (blk_queue_dying(q))
return -ENODEV;
Expand Down Expand Up @@ -1445,8 +1470,7 @@ static struct request *blk_old_get_request(struct request_queue *q,
/* create ioc upfront */
create_io_context(gfp_mask, q->node);

ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM) ||
(op & REQ_NOWAIT));
ret = blk_queue_enter(q, flags);
if (ret)
return ERR_PTR(ret);
spin_lock_irq(q->queue_lock);
Expand Down Expand Up @@ -2267,8 +2291,10 @@ blk_qc_t generic_make_request(struct bio *bio)
current->bio_list = bio_list_on_stack;
do {
struct request_queue *q = bio->bi_disk->queue;
unsigned int flags = bio->bi_opf & REQ_NOWAIT ?
BLK_MQ_REQ_NOWAIT : 0;

if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) {
if (likely(blk_queue_enter(q, flags) == 0)) {
struct bio_list lower, same;

/* Create a fresh bio_list for all subordinate requests */
Expand Down Expand Up @@ -2327,7 +2353,7 @@ blk_qc_t direct_make_request(struct bio *bio)
if (!generic_make_request_checks(bio))
return BLK_QC_T_NONE;

if (unlikely(blk_queue_enter(q, nowait))) {
if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) {
if (nowait && !blk_queue_dying(q))
bio->bi_status = BLK_STS_AGAIN;
else
Expand Down
4 changes: 2 additions & 2 deletions block/blk-mq.c
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
struct request *rq;
int ret;

ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
ret = blk_queue_enter(q, flags);
if (ret)
return ERR_PTR(ret);

Expand Down Expand Up @@ -428,7 +428,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
if (hctx_idx >= q->nr_hw_queues)
return ERR_PTR(-EIO);

ret = blk_queue_enter(q, true);
ret = blk_queue_enter(q, flags);
if (ret)
return ERR_PTR(ret);

Expand Down
42 changes: 30 additions & 12 deletions drivers/scsi/scsi_lib.c
Original file line number Diff line number Diff line change
Expand Up @@ -2947,21 +2947,37 @@ static void scsi_wait_for_queuecommand(struct scsi_device *sdev)
int
scsi_device_quiesce(struct scsi_device *sdev)
{
struct request_queue *q = sdev->request_queue;
int err;

/*
* It is allowed to call scsi_device_quiesce() multiple times from
* the same context but concurrent scsi_device_quiesce() calls are
* not allowed.
*/
WARN_ON_ONCE(sdev->quiesced_by && sdev->quiesced_by != current);

blk_set_preempt_only(q);

blk_mq_freeze_queue(q);
/*
* Ensure that the effect of blk_set_preempt_only() will be visible
* for percpu_ref_tryget() callers that occur after the queue
* unfreeze even if the queue was already frozen before this function
* was called. See also https://lwn.net/Articles/573497/.
*/
synchronize_rcu();
blk_mq_unfreeze_queue(q);

mutex_lock(&sdev->state_mutex);
err = scsi_device_set_state(sdev, SDEV_QUIESCE);
if (err == 0)
sdev->quiesced_by = current;
else
blk_clear_preempt_only(q);
mutex_unlock(&sdev->state_mutex);

if (err)
return err;

scsi_run_queue(sdev->request_queue);
while (atomic_read(&sdev->device_busy)) {
msleep_interruptible(200);
scsi_run_queue(sdev->request_queue);
}
return 0;
return err;
}
EXPORT_SYMBOL(scsi_device_quiesce);

Expand All @@ -2981,9 +2997,11 @@ void scsi_device_resume(struct scsi_device *sdev)
* device deleted during suspend)
*/
mutex_lock(&sdev->state_mutex);
if (sdev->sdev_state == SDEV_QUIESCE &&
scsi_device_set_state(sdev, SDEV_RUNNING) == 0)
scsi_run_queue(sdev->request_queue);
WARN_ON_ONCE(!sdev->quiesced_by);
sdev->quiesced_by = NULL;
blk_clear_preempt_only(sdev->request_queue);
if (sdev->sdev_state == SDEV_QUIESCE)
scsi_device_set_state(sdev, SDEV_RUNNING);
mutex_unlock(&sdev->state_mutex);
}
EXPORT_SYMBOL(scsi_device_resume);
Expand Down
4 changes: 2 additions & 2 deletions fs/block_dev.c
Original file line number Diff line number Diff line change
Expand Up @@ -662,7 +662,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
if (!ops->rw_page || bdev_get_integrity(bdev))
return result;

result = blk_queue_enter(bdev->bd_queue, false);
result = blk_queue_enter(bdev->bd_queue, 0);
if (result)
return result;
result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, false);
Expand Down Expand Up @@ -698,7 +698,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,

if (!ops->rw_page || bdev_get_integrity(bdev))
return -EOPNOTSUPP;
result = blk_queue_enter(bdev->bd_queue, false);
result = blk_queue_enter(bdev->bd_queue, 0);
if (result)
return result;

Expand Down
2 changes: 1 addition & 1 deletion include/linux/blkdev.h
Original file line number Diff line number Diff line change
Expand Up @@ -959,7 +959,7 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t,
extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
struct scsi_ioctl_command __user *);

extern int blk_queue_enter(struct request_queue *q, bool nowait);
extern int blk_queue_enter(struct request_queue *q, unsigned int flags);
extern void blk_queue_exit(struct request_queue *q);
extern void blk_start_queue(struct request_queue *q);
extern void blk_start_queue_async(struct request_queue *q);
Expand Down
1 change: 1 addition & 0 deletions include/scsi/scsi_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,7 @@ struct scsi_device {
unsigned char access_state;
struct mutex state_mutex;
enum scsi_device_state sdev_state;
struct task_struct *quiesced_by;
unsigned long sdev_data[0];
} __attribute__((aligned(sizeof(unsigned long))));

Expand Down

0 comments on commit 3a0a529

Please sign in to comment.