5987 zfs prefetch code needs work
Reviewed by: Adam Leventhal <[email protected]>
Reviewed by: George Wilson <[email protected]>
Reviewed by: Paul Dagnelie <[email protected]>
Approved by: Gordon Ross <[email protected]>
ahrens committed Sep 9, 2015
1 parent e7c874a · commit cf6106c
Showing 8 changed files with 265 additions and 653 deletions.
76 changes: 68 additions & 8 deletions usr/src/uts/common/fs/zfs/arc.c
@@ -495,6 +495,8 @@ typedef struct arc_stats {
kstat_named_t arcstat_meta_limit;
kstat_named_t arcstat_meta_max;
kstat_named_t arcstat_meta_min;
kstat_named_t arcstat_sync_wait_for_async;
kstat_named_t arcstat_demand_hit_predictive_prefetch;
} arc_stats_t;

static arc_stats_t arc_stats = {
@@ -580,7 +582,9 @@ static arc_stats_t arc_stats = {
{ "arc_meta_used", KSTAT_DATA_UINT64 },
{ "arc_meta_limit", KSTAT_DATA_UINT64 },
{ "arc_meta_max", KSTAT_DATA_UINT64 },
{ "arc_meta_min", KSTAT_DATA_UINT64 }
{ "arc_meta_min", KSTAT_DATA_UINT64 },
{ "sync_wait_for_async", KSTAT_DATA_UINT64 },
{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
};

#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
@@ -4004,6 +4008,36 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,

if (HDR_IO_IN_PROGRESS(hdr)) {

if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
priority == ZIO_PRIORITY_SYNC_READ) {
/*
* This sync read must wait for an
* in-progress async read (e.g. a predictive
* prefetch). Async reads are queued
* separately at the vdev_queue layer, so
* this is a form of priority inversion.
* Ideally, we would "inherit" the demand
* i/o's priority by moving the i/o from
* the async queue to the synchronous queue,
* but there is currently no mechanism to do
* so. Track this so that we can evaluate
* the magnitude of this potential performance
* problem.
*
* Note that if the prefetch i/o is already
* active (has been issued to the device),
* the prefetch improved performance, because
* we issued it sooner than we would have
* without the prefetch.
*/
DTRACE_PROBE1(arc__sync__wait__for__async,
arc_buf_hdr_t *, hdr);
ARCSTAT_BUMP(arcstat_sync_wait_for_async);
}
if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
}

if (*arc_flags & ARC_FLAG_WAIT) {
cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
mutex_exit(hash_lock);
@@ -4012,7 +4046,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
ASSERT(*arc_flags & ARC_FLAG_NOWAIT);

if (done) {
arc_callback_t *acb = NULL;
arc_callback_t *acb = NULL;

acb = kmem_zalloc(sizeof (arc_callback_t),
KM_SLEEP);
@@ -4037,6 +4071,19 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
hdr->b_l1hdr.b_state == arc_mfu);

if (done) {
if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
/*
* This is a demand read which does not have to
* wait for i/o because we did a predictive
* prefetch i/o for it, which has completed.
*/
DTRACE_PROBE1(
arc__demand__hit__predictive__prefetch,
arc_buf_hdr_t *, hdr);
ARCSTAT_BUMP(
arcstat_demand_hit_predictive_prefetch);
hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
}
add_reference(hdr, hash_lock, private);
/*
* If this block is already in use, create a new
@@ -4099,12 +4146,16 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
goto top; /* restart the IO request */
}

/* if this is a prefetch, we don't have a reference */
if (*arc_flags & ARC_FLAG_PREFETCH) {
/*
* If there is a callback, we pass our reference to
* it; otherwise we remove our reference.
*/
if (done == NULL) {
(void) remove_reference(hdr, hash_lock,
private);
hdr->b_flags |= ARC_FLAG_PREFETCH;
}
if (*arc_flags & ARC_FLAG_PREFETCH)
hdr->b_flags |= ARC_FLAG_PREFETCH;
if (*arc_flags & ARC_FLAG_L2CACHE)
hdr->b_flags |= ARC_FLAG_L2CACHE;
if (*arc_flags & ARC_FLAG_L2COMPRESS)
@@ -4127,11 +4178,13 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);

/* if this is a prefetch, we don't have a reference */
/*
* If there is a callback, we pass a reference to it.
*/
if (done != NULL)
add_reference(hdr, hash_lock, private);
if (*arc_flags & ARC_FLAG_PREFETCH)
hdr->b_flags |= ARC_FLAG_PREFETCH;
else
add_reference(hdr, hash_lock, private);
if (*arc_flags & ARC_FLAG_L2CACHE)
hdr->b_flags |= ARC_FLAG_L2CACHE;
if (*arc_flags & ARC_FLAG_L2COMPRESS)
@@ -4149,6 +4202,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
arc_access(hdr, hash_lock);
}

if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
hdr->b_flags |= ARC_FLAG_PREDICTIVE_PREFETCH;
ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));

acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
@@ -4188,6 +4243,11 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
data, metadata, misses);

if (priority == ZIO_PRIORITY_ASYNC_READ)
hdr->b_flags |= ARC_FLAG_PRIO_ASYNC_READ;
else
hdr->b_flags &= ~ARC_FLAG_PRIO_ASYNC_READ;

if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
/*
* Read from the L2ARC if the following are true:
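
Aside: the arc.c hunks above add two counters and the flag handling around them. Below is a minimal standalone C model of that accounting; it is not illumos code, and the header struct, function names, and simplified state machine are all illustrative. A demand read that finds an async predictive prefetch still in flight records a sync_wait_for_async event (the priority-inversion case the comment describes); one that finds the prefetched block already cached records a demand_hit_predictive_prefetch event and clears the flag so the hit is counted only once.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define	FLAG_PREDICTIVE_PREFETCH	(1u << 0)	/* read was speculative */
#define	FLAG_PRIO_ASYNC_READ		(1u << 1)	/* queued at async priority */

static uint64_t sync_wait_for_async;
static uint64_t demand_hit_predictive_prefetch;

struct hdr {
	uint32_t flags;
	bool io_in_progress;
	bool cached;
};

/* A predictive prefetch issues an async read and tags the header. */
static void
prefetch(struct hdr *h)
{
	h->flags |= FLAG_PREDICTIVE_PREFETCH | FLAG_PRIO_ASYNC_READ;
	h->io_in_progress = true;
}

/* Simulate the prefetch i/o completing. */
static void
io_done(struct hdr *h)
{
	h->io_in_progress = false;
	h->cached = true;
}

/* A sync-priority demand read applies the same accounting as arc_read(). */
static void
demand_read(struct hdr *h)
{
	if (h->io_in_progress) {
		/* Sync read stuck behind an async prefetch: priority inversion. */
		if (h->flags & FLAG_PRIO_ASYNC_READ)
			sync_wait_for_async++;
		h->flags &= ~FLAG_PREDICTIVE_PREFETCH;
	} else if (h->cached && (h->flags & FLAG_PREDICTIVE_PREFETCH)) {
		/* The prefetch finished first; the demand read waits for nothing. */
		demand_hit_predictive_prefetch++;
		h->flags &= ~FLAG_PREDICTIVE_PREFETCH;
	}
}

int
main(void)
{
	struct hdr a = { 0 }, b = { 0 };

	prefetch(&a);
	demand_read(&a);	/* arrives while the prefetch is in flight */

	prefetch(&b);
	io_done(&b);
	demand_read(&b);	/* arrives after the prefetch completed */

	printf("sync_wait_for_async = %llu\n",
	    (unsigned long long)sync_wait_for_async);
	printf("demand_hit_predictive_prefetch = %llu\n",
	    (unsigned long long)demand_hit_predictive_prefetch);
	return (0);
}

On a live system the real counters should be readable with kstat, e.g. kstat -p zfs:0:arcstats:sync_wait_for_async (assuming the usual arcstats kstat naming), and the two DTRACE_PROBE1 sites surface as sdt probes.
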
21 changes: 9 additions & 12 deletions usr/src/uts/common/fs/zfs/dbuf.c
@@ -618,7 +618,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
}

static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
dnode_t *dn;
zbookmark_phys_t zb;
@@ -664,7 +664,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
db->db.db_size, db, type));
bzero(db->db.db_data, db->db.db_size);
db->db_state = DB_CACHED;
*flags |= DB_RF_CACHED;
mutex_exit(&db->db_mtx);
return;
}
@@ -687,10 +686,8 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)

(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
(*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
(flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
&aflags, &zb);
if (aflags & ARC_FLAG_CACHED)
*flags |= DB_RF_CACHED;
}

int
@@ -723,8 +720,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
if (db->db_state == DB_CACHED) {
mutex_exit(&db->db_mtx);
if (prefetch)
dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
db->db.db_size, TRUE);
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_exit(&dn->dn_struct_rwlock);
DB_DNODE_EXIT(db);
@@ -733,13 +729,12 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)

if (zio == NULL)
zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
dbuf_read_impl(db, zio, &flags);
dbuf_read_impl(db, zio, flags);

/* dbuf_read_impl has dropped db_mtx for us */

if (prefetch)
dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
db->db.db_size, flags & DB_RF_CACHED);
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);

if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_exit(&dn->dn_struct_rwlock);
@@ -758,8 +753,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
*/
mutex_exit(&db->db_mtx);
if (prefetch)
dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
db->db.db_size, TRUE);
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_exit(&dn->dn_struct_rwlock);
DB_DNODE_EXIT(db);
@@ -2059,6 +2053,9 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
ASSERT(blkid != DMU_BONUS_BLKID);
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

if (blkid > dn->dn_maxblkid)
return;

if (dnode_block_freed(dn, blkid))
return;

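
Aside: the dbuf.c hunks switch the prefetcher's interface from byte terms to block terms. dbuf_read() now notifies zfetch with dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1) instead of an offset, size, and cached flag, and dbuf_prefetch() now bails out for block ids beyond dn_maxblkid. Below is a minimal standalone sketch, not illumos code, of the offset-to-block-id mapping the new interface relies on, assuming a power-of-two data block size as dbuf_whichblock() does at level 0.

#include <stdio.h>
#include <stdint.h>

/* Level-0 analogue of dbuf_whichblock(): which block holds this offset? */
static uint64_t
whichblock(uint64_t offset, int datablkshift)
{
	return (offset >> datablkshift);
}

int
main(void)
{
	int datablkshift = 17;		/* 128K data blocks */
	uint64_t offset = 300000;	/* byte offset of a demand read */

	/*
	 * Where dbuf_read() used to pass (offset, size, cached) to
	 * dmu_zfetch(), it now passes the block id and a count of 1.
	 */
	printf("dmu_zfetch(zf, blkid=%llu, nblks=1)\n",
	    (unsigned long long)whichblock(offset, datablkshift));
	return (0);
}
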
35 changes: 21 additions & 14 deletions usr/src/uts/common/fs/zfs/dmu.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/
/* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */
@@ -386,7 +386,7 @@ dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
*/
static int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
{
dmu_buf_t **dbp;
uint64_t blkid, nblks, i;
@@ -396,15 +396,19 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,

ASSERT(length <= DMU_MAX_ACCESS);

dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
dbuf_flags |= DB_RF_NOPREFETCH;
/*
* Note: We directly notify the prefetch code of this read, so that
* we can tell it about the multi-block read. dbuf_read() only knows
* about the one block it is accessing.
*/
dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
DB_RF_NOPREFETCH;

rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_datablkshift) {
int blkshift = dn->dn_datablkshift;
nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
} else {
if (offset + length > dn->dn_datablksz) {
zfs_panic_recover("zfs: accessing past end of object "
@@ -423,19 +427,24 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
blkid = dbuf_whichblock(dn, 0, offset);
for (i = 0; i < nblks; i++) {
dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
if (db == NULL) {
rw_exit(&dn->dn_struct_rwlock);
dmu_buf_rele_array(dbp, nblks, tag);
zio_nowait(zio);
return (SET_ERROR(EIO));
}

/* initiate async i/o */
if (read) {
if (read)
(void) dbuf_read(db, zio, dbuf_flags);
}
dbp[i] = &db->db;
}

if ((flags & DMU_READ_NO_PREFETCH) == 0 && read &&
length < zfetch_array_rd_sz) {
dmu_zfetch(&dn->dn_zfetch, blkid, nblks);
}
rw_exit(&dn->dn_struct_rwlock);

/* wait for async i/o */
@@ -489,7 +498,8 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,

int
dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
uint64_t length, boolean_t read, void *tag, int *numbufsp,
dmu_buf_t ***dbpp)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
dnode_t *dn;
@@ -537,9 +547,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
uint64_t blkid;
int nblks, err;

if (zfs_prefetch_disable)
return;

if (len == 0) { /* they're interested in the bonus buffer */
dn = DMU_META_DNODE(os);

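
Aside: the new comment in dmu_buf_hold_array_by_dnode() explains the design change in dmu.c: every dbuf_read() in the loop is issued with DB_RF_NOPREFETCH, and the prefetcher is instead told about the entire multi-block range with a single dmu_zfetch(&dn->dn_zfetch, blkid, nblks) call. The nblks arithmetic is worth a worked example; the standalone sketch below copies the P2ALIGN/P2ROUNDUP round-down/round-up helpers (their <sys/sysmacros.h> definitions) and counts the blocks a byte range touches.

#include <stdio.h>
#include <stdint.h>

#define	P2ALIGN(x, align)	((x) & -(align))	/* round down */
#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))	/* round up */

int
main(void)
{
	uint64_t offset = 5000, length = 10000;
	int blkshift = 12;			/* 4K blocks */
	uint64_t bs = 1ULL << blkshift;

	/* Count every block the byte range [offset, offset+length) touches. */
	uint64_t nblks = (P2ROUNDUP(offset + length, bs) -
	    P2ALIGN(offset, bs)) >> blkshift;

	/* Bytes 5000..14999 touch blocks 1, 2, and 3, so nblks == 3. */
	printf("nblks = %llu\n", (unsigned long long)nblks);
	return (0);
}
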