Skip to content

Commit

Permalink
Two more speculative prefetcher improvements.
Browse files Browse the repository at this point in the history
Delete prefetch streams when they reach the ends of their files.  This saves
up to 1KB of RAM per file and reduces searches through the stream list.

Block data prefetch (speculation and indirect block prefetch are still
done, since they are cheaper) if all dbufs of the stream are already
in the DMU cache.  The first cache miss immediately fires all the prefetch
that would have been done for the stream by that time.  This saves some CPU
time when the same files, fitting within the DMU cache capacity, are read
over and over.

Signed-off-by: Alexander Motin <[email protected]>
Sponsored-By: iXsystems, Inc.
  • Loading branch information
amotin committed Mar 1, 2021
1 parent f586c6e commit a9e69ec
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 35 deletions.
5 changes: 3 additions & 2 deletions include/sys/dmu_zfetch.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ typedef struct zstream {
hrtime_t zs_atime; /* time last prefetch issued */
hrtime_t zs_start_time; /* start of last prefetch */
zfetch_t *zs_fetch; /* parent fetch */
boolean_t zs_missed; /* stream saw cache misses */
zfs_refcount_t zs_callers; /* number of pending callers */
zfs_refcount_t zs_blocks; /* number of pending blocks in the stream */
} zstream_t;
Expand All @@ -74,8 +75,8 @@ void dmu_zfetch_init(zfetch_t *, struct dnode *);
void dmu_zfetch_fini(zfetch_t *);
zstream_t *dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t,
boolean_t);
void dmu_zfetch_run(zstream_t *, boolean_t);
void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t,
void dmu_zfetch_run(zstream_t *, boolean_t, boolean_t);
void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t,
boolean_t);


Expand Down
6 changes: 3 additions & 3 deletions module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -1640,7 +1640,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
mutex_exit(&db->db_mtx);
if (err == 0 && prefetch) {
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
flags & DB_RF_HAVESTRUCT);
B_FALSE, flags & DB_RF_HAVESTRUCT);
}
DB_DNODE_EXIT(db);
DBUF_STAT_BUMP(hash_hits);
Expand All @@ -1662,7 +1662,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
*/
if (!err && prefetch) {
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
flags & DB_RF_HAVESTRUCT);
B_TRUE, flags & DB_RF_HAVESTRUCT);
}

DB_DNODE_EXIT(db);
Expand Down Expand Up @@ -1691,7 +1691,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
mutex_exit(&db->db_mtx);
if (prefetch) {
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
flags & DB_RF_HAVESTRUCT);
B_TRUE, flags & DB_RF_HAVESTRUCT);
}
DB_DNODE_EXIT(db);
DBUF_STAT_BUMP(hash_misses);
Expand Down
10 changes: 7 additions & 3 deletions module/zfs/dmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
uint32_t dbuf_flags;
int err;
zio_t *zio = NULL;
boolean_t missed = B_FALSE;

ASSERT(length <= DMU_MAX_ACCESS);

Expand Down Expand Up @@ -546,7 +547,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
if (db == NULL) {
if (zs)
dmu_zfetch_run(zs, B_TRUE);
dmu_zfetch_run(zs, missed, B_TRUE);
rw_exit(&dn->dn_struct_rwlock);
dmu_buf_rele_array(dbp, nblks, tag);
if (read)
Expand All @@ -555,16 +556,19 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
}

/* initiate async i/o */
if (read)
if (read) {
(void) dbuf_read(db, zio, dbuf_flags);
if (db->db_state != DB_CACHED)
missed = B_TRUE;
}
dbp[i] = &db->db;
}

if (!read)
zfs_racct_write(length, nblks);

if (zs)
dmu_zfetch_run(zs, B_TRUE);
dmu_zfetch_run(zs, missed, B_TRUE);
rw_exit(&dn->dn_struct_rwlock);

if (read) {
Expand Down
71 changes: 44 additions & 27 deletions module/zfs/dmu_zfetch.c
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,6 @@ dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
{
ASSERT(MUTEX_HELD(&zf->zf_lock));
list_remove(&zf->zf_stream, zs);
zs->zs_fetch = NULL;
zf->zf_numstreams--;
if (zfs_refcount_remove(&zs->zs_blocks, NULL) == 0)
dmu_zfetch_stream_fini(zs);
Expand Down Expand Up @@ -216,6 +215,7 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
zs->zs_ipf_blkid = blkid;
zs->zs_atime = now;
zs->zs_fetch = zf;
zs->zs_missed = B_FALSE;
zfs_refcount_create(&zs->zs_callers);
zfs_refcount_create(&zs->zs_blocks);
/* One reference for zf_stream. */
Expand Down Expand Up @@ -257,7 +257,7 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
{
zstream_t *zs;
int64_t pf_start, ipf_start;
int64_t pf_ahead_blks, max_blks;
int64_t pf_ahead_blks, max_blks, maxblkid;
int max_dist_blks, pf_nblks, ipf_nblks;
uint64_t end_of_access_blkid;
end_of_access_blkid = blkid + nblks;
Expand Down Expand Up @@ -289,7 +289,7 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
* A fast path for small files for which no prefetch will
* happen.
*/
if (zf->zf_dnode->dn_maxblkid < 2) {
if ((maxblkid = zf->zf_dnode->dn_maxblkid) < 2) {
if (!have_lock)
rw_exit(&zf->zf_dnode->dn_struct_rwlock);
return (NULL);
Expand All @@ -308,30 +308,37 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
} else if (blkid + 1 == zs->zs_blkid) {
blkid++;
nblks--;
if (nblks == 0) {
/* Already prefetched this before. */
mutex_exit(&zf->zf_lock);
if (!have_lock) {
rw_exit(&zf->zf_dnode->
dn_struct_rwlock);
}
return (NULL);
}
break;
}
}

/*
* If the file is ending, remove the matching stream if found.
* If not found then it is too late to create a new one now.
*/
if (end_of_access_blkid >= maxblkid) {
if (zs != NULL)
dmu_zfetch_stream_remove(zf, zs);
done:
mutex_exit(&zf->zf_lock);
if (!have_lock)
rw_exit(&zf->zf_dnode->dn_struct_rwlock);
return (NULL);
}

if (nblks == 0)
goto done; /* Already prefetched this before. */

if (zs == NULL) {
/*
* This access is not part of any existing stream. Create
* a new stream for it.
*/
ZFETCHSTAT_BUMP(zfetchstat_misses);

dmu_zfetch_stream_create(zf, end_of_access_blkid);
mutex_exit(&zf->zf_lock);
if (!have_lock)
rw_exit(&zf->zf_dnode->dn_struct_rwlock);
ZFETCHSTAT_BUMP(zfetchstat_misses);
return (NULL);
}

Expand All @@ -348,6 +355,8 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
if (zs->zs_pf_blkid1 < end_of_access_blkid)
zs->zs_pf_blkid1 = end_of_access_blkid;
if (zs->zs_ipf_blkid1 < end_of_access_blkid)
zs->zs_ipf_blkid1 = end_of_access_blkid;

/*
* Double our amount of prefetched data, but don't let the
Expand Down Expand Up @@ -377,8 +386,6 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
* that point to them).
*/
ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
if (zs->zs_ipf_blkid1 < zs->zs_pf_blkid)
zs->zs_ipf_blkid1 = zs->zs_pf_blkid;
max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
/*
* We want to double our distance ahead of the data prefetch
Expand Down Expand Up @@ -409,33 +416,38 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
}

void
dmu_zfetch_run(zstream_t *zs, boolean_t have_lock)
dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
{
zfetch_t *zf = zs->zs_fetch;
int64_t pf_start, pf_end, ipf_start, ipf_end;
int epbs, issued;

if (missed)
zs->zs_missed = missed;

/*
* Postpone the prefetch if there are more concurrent callers.
* It happens when multiple requests are waiting for the same
* indirect block. The last one will run the prefetch for all.
*/
if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) {
/* Drop reference taken in dmu_zfetch_prepare(). */
VERIFY3S(zfs_refcount_remove(&zs->zs_blocks, NULL), >, 0);
if (zfs_refcount_remove(&zs->zs_blocks, NULL) == 0)
dmu_zfetch_stream_fini(zs);
return;
}

mutex_enter(&zf->zf_lock);
pf_start = zs->zs_pf_blkid1;
pf_end = zs->zs_pf_blkid1 = zs->zs_pf_blkid;
ipf_start = zs->zs_ipf_blkid1;
if (zs->zs_missed) {
pf_start = zs->zs_pf_blkid1;
pf_end = zs->zs_pf_blkid1 = zs->zs_pf_blkid;
} else {
pf_start = pf_end = 0;
}
ipf_start = MAX(zs->zs_pf_blkid1, zs->zs_ipf_blkid1);
ipf_end = zs->zs_ipf_blkid1 = zs->zs_ipf_blkid;
mutex_exit(&zf->zf_lock);

if (!have_lock)
rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);

epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs;
Expand All @@ -445,9 +457,14 @@ dmu_zfetch_run(zstream_t *zs, boolean_t have_lock)
zfs_refcount_add_many(&zs->zs_blocks, issued - 1, NULL);
} else if (issued == 0) {
/* Some other thread has done our work, so drop the ref. */
VERIFY3S(zfs_refcount_remove(&zs->zs_blocks, NULL), >, 0);
if (zfs_refcount_remove(&zs->zs_blocks, NULL) == 0)
dmu_zfetch_stream_fini(zs);
return;
}

if (!have_lock)
rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);

issued = 0;
for (int64_t blk = pf_start; blk < pf_end; blk++) {
issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
Expand All @@ -469,13 +486,13 @@ dmu_zfetch_run(zstream_t *zs, boolean_t have_lock)

void
dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
boolean_t have_lock)
boolean_t missed, boolean_t have_lock)
{
zstream_t *zs;

zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
if (zs)
dmu_zfetch_run(zs, have_lock);
dmu_zfetch_run(zs, missed, have_lock);
}

/* BEGIN CSTYLED */
Expand Down

0 comments on commit a9e69ec

Please sign in to comment.