Skip to content

Commit

Permalink
zfetch: Don't issue new streams when old have not completed
Browse files Browse the repository at this point in the history
The current dmu_zfetch code implicitly assumes that I/Os complete
within min_sec_reap seconds. With async DMU and a read-only workload
such as L2ARC rebuild (and thus no exponential backoff in operations
from the "write throttle"), it is possible to saturate the drives
with I/O requests, which are then effectively compounded by prefetch
requests.

This change reference-counts streams and prevents them from being
recycled after their min_sec_reap timeout while they still have
outstanding I/Os.

Reviewed-by: Alexander Motin <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Matt Macy <[email protected]>
Closes #10900
  • Loading branch information
mattmacy authored Sep 28, 2020
1 parent cf26677 commit af20b97
Show file tree
Hide file tree
Showing 4 changed files with 166 additions and 44 deletions.
7 changes: 6 additions & 1 deletion include/sys/dbuf.h
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,8 @@ typedef struct dbuf_hash_table {
kmutex_t hash_mutexes[DBUF_MUTEXES];
} dbuf_hash_table_t;

/*
 * Completion callback for a dbuf prefetch.  It is invoked with the opaque
 * argument supplied by the caller and a flag telling whether the prefetch
 * actually went through an I/O (B_TRUE) or ended without one (B_FALSE).
 */
typedef void (*dbuf_prefetch_fn)(void *arg, boolean_t io_done);

uint64_t dbuf_whichblock(const struct dnode *di, const int64_t level,
const uint64_t offset);

Expand All @@ -324,7 +326,10 @@ int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid,
boolean_t fail_sparse, boolean_t fail_uncached,
void *tag, dmu_buf_impl_t **dbp);

void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid,
/*
 * Prefetch entry points.
 *
 * dbuf_prefetch_impl() takes, in addition to the block to prefetch and the
 * I/O priority/flags, an optional completion callback (cb) and the opaque
 * argument (arg) handed back to it.  It returns 1 when a prefetch was
 * issued and 0 when nothing was issued; in the latter case a non-NULL cb
 * is invoked immediately with B_FALSE.
 *
 * dbuf_prefetch() is the callback-less form: it forwards to
 * dbuf_prefetch_impl() with a NULL callback and argument and returns its
 * result.
 */
int dbuf_prefetch_impl(struct dnode *dn, int64_t level, uint64_t blkid,
zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb,
void *arg);
int dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid,
zio_priority_t prio, arc_flags_t aflags);

void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
Expand Down
16 changes: 10 additions & 6 deletions include/sys/dmu_zfetch.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,13 @@ extern unsigned long zfetch_array_rd_sz;

struct dnode; /* so we can reference dnode */

/*
 * Prefetch state attached to a dnode: the list of prefetch streams
 * (zstream_t), a count of those streams, a back pointer to the owning
 * dnode, and the lock that protects the structure.
 */
typedef struct zfetch {
kmutex_t zf_lock; /* protects zfetch structure */
list_t zf_stream; /* list of zstream_t's */
struct dnode *zf_dnode; /* dnode that owns this zfetch */
int zf_numstreams; /* number of zstream_t's */
} zfetch_t;

typedef struct zstream {
uint64_t zs_blkid; /* expect next access at this blkid */
uint64_t zs_pf_blkid; /* next block to prefetch */
Expand All @@ -52,15 +59,12 @@ typedef struct zstream {

kmutex_t zs_lock; /* protects stream */
hrtime_t zs_atime; /* time last prefetch issued */
hrtime_t zs_start_time; /* start of last prefetch */
list_node_t zs_node; /* link for zf_stream */
zfetch_t *zs_fetch; /* parent fetch */
zfs_refcount_t zs_blocks; /* number of pending blocks in the stream */
} zstream_t;

/*
 * Prefetch state attached to a dnode: the list of prefetch streams
 * (zstream_t), a back pointer to the owning dnode, and the lock that
 * protects the structure.
 */
typedef struct zfetch {
kmutex_t zf_lock; /* protects zfetch structure */
list_t zf_stream; /* list of zstream_t's */
struct dnode *zf_dnode; /* dnode that owns this zfetch */
} zfetch_t;

void zfetch_init(void);
void zfetch_fini(void);

Expand Down
71 changes: 52 additions & 19 deletions module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -3003,8 +3003,29 @@ typedef struct dbuf_prefetch_arg {
zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
dbuf_prefetch_fn dpa_cb; /* prefetch completion callback */
void *dpa_arg; /* prefetch completion arg */
} dbuf_prefetch_arg_t;

/*
 * Finish a prefetch: notify the registered completion callback, if any,
 * passing along whether an I/O was performed, and then release the
 * prefetch argument structure.  dpa must not be used after this returns.
 */
static void
dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done)
{
	dbuf_prefetch_fn done_cb = dpa->dpa_cb;

	if (done_cb != NULL) {
		done_cb(dpa->dpa_arg, io_done);
	}

	kmem_free(dpa, sizeof (dbuf_prefetch_arg_t));
}

/*
 * arc_read() completion callback for the final prefetch read issued by
 * dbuf_issue_final_prefetch().
 *
 * zio, zb and iobp are unused.  abuf is the buffer returned by the read
 * and may be NULL.  private is the dbuf_prefetch_arg_t for this prefetch;
 * the same pointer also serves as the tag under which abuf is released.
 *
 * The buffer is released before dbuf_prefetch_fini() runs, because fini
 * frees the dbuf_prefetch_arg_t and the tag pointer must not be used once
 * the memory it refers to has been freed.  This matches the ordering used
 * on the error path of dbuf_prefetch_indirect_done().
 */
static void
dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb,
    const blkptr_t *iobp, arc_buf_t *abuf, void *private)
{
	dbuf_prefetch_arg_t *dpa = private;

	if (abuf != NULL)
		arc_buf_destroy(abuf, private);

	/* Runs the completion callback (io_done == B_TRUE) and frees dpa. */
	dbuf_prefetch_fini(dpa, B_TRUE);
}

/*
* Actually issue the prefetch read for the block given.
*/
Expand All @@ -3017,7 +3038,7 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
SPA_FEATURE_REDACTED_DATASETS));

if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
return;
return (dbuf_prefetch_fini(dpa, B_FALSE));

int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
arc_flags_t aflags =
Expand All @@ -3031,7 +3052,8 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
ASSERT(dpa->dpa_zio != NULL);
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp,
dbuf_issue_final_prefetch_done, dpa,
dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb);
}

Expand All @@ -3051,8 +3073,7 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,

if (abuf == NULL) {
ASSERT(zio == NULL || zio->io_error != 0);
kmem_free(dpa, sizeof (*dpa));
return;
return (dbuf_prefetch_fini(dpa, B_TRUE));
}
ASSERT(zio == NULL || zio->io_error == 0);

Expand Down Expand Up @@ -3084,11 +3105,9 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
dpa->dpa_curlevel, curblkid, FTAG);
if (db == NULL) {
kmem_free(dpa, sizeof (*dpa));
arc_buf_destroy(abuf, private);
return;
return (dbuf_prefetch_fini(dpa, B_TRUE));
}

(void) dbuf_read(db, NULL,
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
dbuf_rele(db, FTAG);
Expand All @@ -3105,11 +3124,10 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
dpa->dpa_dnode->dn_objset->os_dsl_dataset,
SPA_FEATURE_REDACTED_DATASETS));
if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
kmem_free(dpa, sizeof (*dpa));
dbuf_prefetch_fini(dpa, B_TRUE);
} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
dbuf_issue_final_prefetch(dpa, bp);
kmem_free(dpa, sizeof (*dpa));
} else {
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
zbookmark_phys_t zb;
Expand Down Expand Up @@ -3139,9 +3157,10 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
* complete. Note that the prefetch might fail if the dataset is encrypted and
* the encryption key is unmapped before the IO completes.
*/
void
dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
arc_flags_t aflags)
int
dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,
zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb,
void *arg)
{
blkptr_t bp;
int epbs, nlevels, curlevel;
Expand All @@ -3151,22 +3170,22 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

if (blkid > dn->dn_maxblkid)
return;
goto no_issue;

if (level == 0 && dnode_block_freed(dn, blkid))
return;
goto no_issue;

/*
* This dnode hasn't been written to disk yet, so there's nothing to
* prefetch.
*/
nlevels = dn->dn_phys->dn_nlevels;
if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
return;
goto no_issue;

epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
return;
goto no_issue;

dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
level, blkid);
Expand All @@ -3176,7 +3195,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
* This dbuf already exists. It is either CACHED, or
* (we assume) about to be read or filled.
*/
return;
goto no_issue;
}

/*
Expand Down Expand Up @@ -3212,7 +3231,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset,
SPA_FEATURE_REDACTED_DATASETS));
if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp))
return;
goto no_issue;

ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));

Expand All @@ -3230,6 +3249,8 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
dpa->dpa_dnode = dn;
dpa->dpa_epbs = epbs;
dpa->dpa_zio = pio;
dpa->dpa_cb = cb;
dpa->dpa_arg = arg;

/* flag if L2ARC eligible, l2arc_noprefetch then decides */
if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
Expand All @@ -3245,7 +3266,6 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
if (curlevel == level) {
ASSERT3U(curblkid, ==, blkid);
dbuf_issue_final_prefetch(dpa, &bp);
kmem_free(dpa, sizeof (*dpa));
} else {
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
zbookmark_phys_t zb;
Expand All @@ -3266,6 +3286,19 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
* dpa may have already been freed.
*/
zio_nowait(pio);
return (1);
no_issue:
if (cb != NULL)
cb(arg, B_FALSE);
return (0);
}

/*
 * Prefetch the given block without a completion notification: this simply
 * forwards to dbuf_prefetch_impl() with no callback and no callback
 * argument, and hands back whatever that call returns.
 */
int
dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
    arc_flags_t aflags)
{
	int issued;

	issued = dbuf_prefetch_impl(dn, level, blkid, prio, aflags,
	    NULL, NULL);
	return (issued);
}

/*
Expand Down
Loading

0 comments on commit af20b97

Please sign in to comment.