Skip to content

Commit

Permalink
Implement uncached prefetch
Browse files Browse the repository at this point in the history
Previously the primarycache property was handled only in the dbuf
layer. Since the speculative prefetcher is implemented in the ARC,
it had to be disabled for uncacheable buffers.

This change gives the ARC knowledge about uncacheable buffers
via  arc_read() and arc_write(). So when remove_reference() drops
the last reference on the ARC header, it can either immediately destroy
it, or if it is marked as prefetch, put it into a new arc_uncached state. 
That state is scanned every second, evicting stale buffers that were
not demand read.

This change also tracks dbufs that were read from the beginning,
but not to the end.  It is assumed that such buffers may receive further
reads, and so they are stored in dbuf cache. If a following
reads reaches the end of the buffer, it is immediately evicted.
Otherwise it will follow regular dbuf cache eviction.  Since the dbuf
layer does not know actual file sizes, this logic is not applied to
the final buffer of a dnode.

Since uncacheable buffers should no longer stay in the ARC for long,
this patch also tries to optimize I/O by allocating ARC physical
buffers as linear to allow buffer sharing.

Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: George Wilson <[email protected]>
Reviewed-by: Ryan Moeller <[email protected]>
Signed-off-by: Alexander Motin <[email protected]>
Sponsored by: iXsystems, Inc.
Closes openzfs#14243
  • Loading branch information
amotin authored and lundman committed Mar 3, 2023
1 parent a097059 commit 393cf4b
Show file tree
Hide file tree
Showing 11 changed files with 243 additions and 139 deletions.
2 changes: 2 additions & 0 deletions include/os/linux/zfs/sys/trace_arc.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__evict);
DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__delete);
DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mru);
DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mfu);
DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__uncached);
DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__async__upgrade__sync);
DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__hit);
DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__miss);
Expand Down Expand Up @@ -392,6 +393,7 @@ DEFINE_DTRACE_PROBE1(arc__evict);
DEFINE_DTRACE_PROBE1(arc__delete);
DEFINE_DTRACE_PROBE1(new_state__mru);
DEFINE_DTRACE_PROBE1(new_state__mfu);
DEFINE_DTRACE_PROBE1(new_state__uncached);
DEFINE_DTRACE_PROBE1(arc__async__upgrade__sync);
DEFINE_DTRACE_PROBE1(l2arc__hit);
DEFINE_DTRACE_PROBE1(l2arc__miss);
Expand Down
6 changes: 4 additions & 2 deletions include/sys/arc.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ typedef enum arc_flags
ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */
ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */
ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */
ARC_FLAG_UNCACHED = 1 << 5, /* evict after use */
ARC_FLAG_PRESCIENT_PREFETCH = 1 << 6, /* long min lifespan */

/*
Expand Down Expand Up @@ -228,6 +229,7 @@ typedef enum arc_state_type {
ARC_STATE_MFU,
ARC_STATE_MFU_GHOST,
ARC_STATE_L2C_ONLY,
ARC_STATE_UNCACHED,
ARC_STATE_NUMTYPES
} arc_state_type_t;

Expand Down Expand Up @@ -301,8 +303,8 @@ int arc_referenced(arc_buf_t *buf);
int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
arc_read_done_func_t *done, void *priv, zio_priority_t priority,
int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb);
zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
arc_buf_t *buf, boolean_t uncached, boolean_t l2arc, const zio_prop_t *zp,
arc_write_done_func_t *ready, arc_write_done_func_t *child_ready,
arc_write_done_func_t *physdone, arc_write_done_func_t *done,
void *priv, zio_priority_t priority, int zio_flags,
Expand Down
19 changes: 19 additions & 0 deletions include/sys/arc_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ extern "C" {
* ARC_mru_ghost - recently used, no longer in cache
* ARC_mfu - frequently used, currently cached
* ARC_mfu_ghost - frequently used, no longer in cache
* ARC_uncached - uncacheable prefetch, to be evicted
* ARC_l2c_only - exists in L2ARC but not other states
* When there are no active references to the buffer, they are
* are linked onto a list in one of these arc states. These are
Expand Down Expand Up @@ -542,6 +543,7 @@ typedef struct arc_stats {
kstat_named_t arcstat_mru_ghost_hits;
kstat_named_t arcstat_mfu_hits;
kstat_named_t arcstat_mfu_ghost_hits;
kstat_named_t arcstat_uncached_hits;
kstat_named_t arcstat_deleted;
/*
* Number of buffers that could not be evicted because the hash lock
Expand Down Expand Up @@ -744,6 +746,21 @@ typedef struct arc_stats {
* ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
*/
kstat_named_t arcstat_mfu_ghost_evictable_metadata;
/*
* Total number of bytes that are going to be evicted from ARC due to
* ARC_FLAG_UNCACHED being set.
*/
kstat_named_t arcstat_uncached_size;
/*
* Number of data bytes that are going to be evicted from ARC due to
* ARC_FLAG_UNCACHED being set.
*/
kstat_named_t arcstat_uncached_evictable_data;
/*
* Number of metadata bytes that that are going to be evicted from ARC
* due to ARC_FLAG_UNCACHED being set.
*/
kstat_named_t arcstat_uncached_evictable_metadata;
kstat_named_t arcstat_l2_hits;
kstat_named_t arcstat_l2_misses;
/*
Expand Down Expand Up @@ -900,6 +917,7 @@ typedef struct arc_sums {
wmsum_t arcstat_mru_ghost_hits;
wmsum_t arcstat_mfu_hits;
wmsum_t arcstat_mfu_ghost_hits;
wmsum_t arcstat_uncached_hits;
wmsum_t arcstat_deleted;
wmsum_t arcstat_mutex_miss;
wmsum_t arcstat_access_skip;
Expand Down Expand Up @@ -1006,6 +1024,7 @@ typedef struct arc_evict_waiter {
#define arc_mfu (&ARC_mfu)
#define arc_mfu_ghost (&ARC_mfu_ghost)
#define arc_l2c_only (&ARC_l2c_only)
#define arc_uncached (&ARC_uncached)

extern taskq_t *arc_prune_taskq;
extern arc_stats_t arc_stats;
Expand Down
5 changes: 5 additions & 0 deletions include/sys/dbuf.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ extern "C" {
#define DB_RF_NEVERWAIT (1 << 4)
#define DB_RF_CACHED (1 << 5)
#define DB_RF_NO_DECRYPT (1 << 6)
#define DB_RF_PARTIAL_FIRST (1 << 7)
#define DB_RF_PARTIAL_MORE (1 << 8)

/*
* The simplified state transition diagram for dbufs looks like:
Expand Down Expand Up @@ -321,6 +323,9 @@ typedef struct dmu_buf_impl {
uint8_t db_pending_evict;

uint8_t db_dirtycnt;

/* The buffer was partially read. More reads may follow. */
uint8_t db_partial_read;
} dmu_buf_impl_t;

#define DBUF_HASH_MUTEX(h, idx) \
Expand Down
8 changes: 2 additions & 6 deletions include/sys/dnode.h
Original file line number Diff line number Diff line change
Expand Up @@ -457,15 +457,11 @@ void dnode_free_interior_slots(dnode_t *dn);
#define DNODE_IS_DIRTY(_dn) \
((_dn)->dn_dirty_txg >= spa_syncing_txg((_dn)->dn_objset->os_spa))

#define DNODE_IS_CACHEABLE(_dn) \
#define DNODE_LEVEL_IS_CACHEABLE(_dn, _level) \
((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
(DMU_OT_IS_METADATA((_dn)->dn_type) && \
(((_level) > 0 || DMU_OT_IS_METADATA((_dn)->dn_type)) && \
(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA))

#define DNODE_META_IS_CACHEABLE(_dn) \
((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)

/*
* Used for dnodestats kstat.
*/
Expand Down
17 changes: 15 additions & 2 deletions module/os/freebsd/zfs/sysctl_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -366,10 +366,10 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
&ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
&ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
"size of anonymous state");
"size of metadata in anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
&ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
"size of anonymous state");
"size of data in anonymous state");
/* END CSTYLED */

extern arc_state_t ARC_mru;
Expand Down Expand Up @@ -424,6 +424,19 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
"size of data in mfu ghost state");
/* END CSTYLED */

extern arc_state_t ARC_uncached;

/* BEGIN CSTYLED */
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_size, CTLFLAG_RD,
&ARC_uncached.arcs_size.rc_count, 0, "size of uncached state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_metadata_esize, CTLFLAG_RD,
&ARC_uncached.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
"size of metadata in uncached state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_data_esize, CTLFLAG_RD,
&ARC_uncached.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
"size of data in uncached state");
/* END CSTYLED */

extern arc_state_t ARC_l2c_only;

/* BEGIN CSTYLED */
Expand Down
Loading

0 comments on commit 393cf4b

Please sign in to comment.