Skip to content

Commit

Permalink
[dweeezil] Lock contention on arcs mtx openzfs#3115
Browse files Browse the repository at this point in the history
I'm posting this as a pull request now so that it gets some wider review and so that the buildbots get a chance to chew on it.

I've given it some pretty intense testing, mainly with fio, over the past few days, and there don't seem to be any obvious regressions.

I still consider this a work-in-progress. The dbufs kstat still needs to be fixed but I don't expect that to be terribly difficult.
  • Loading branch information
kernelOfTruth committed Mar 13, 2015
1 parent 305040e commit b68105e
Show file tree
Hide file tree
Showing 27 changed files with 2,719 additions and 1,482 deletions.
2 changes: 0 additions & 2 deletions cmd/arcstat/arcstat.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@
"mrug": [4, 1000, "MRU Ghost List hits per second"],
"eskip": [5, 1000, "evict_skip per second"],
"mtxmis": [6, 1000, "mutex_miss per second"],
"rmis": [4, 1000, "recycle_miss per second"],
"dread": [5, 1000, "Demand accesses per second"],
"pread": [5, 1000, "Prefetch accesses per second"],
"l2hits": [6, 1000, "L2ARC hits per second"],
Expand Down Expand Up @@ -406,7 +405,6 @@ def calculate():
v["mrug"] = d["mru_ghost_hits"] / sint
v["mfug"] = d["mfu_ghost_hits"] / sint
v["eskip"] = d["evict_skip"] / sint
v["rmis"] = d["recycle_miss"] / sint
v["mtxmis"] = d["mutex_miss"] / sint

if l2exist:
Expand Down
2 changes: 1 addition & 1 deletion cmd/zdb/zdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -1169,7 +1169,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
print_indirect(bp, zb, dnp);

if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
uint32_t flags = ARC_WAIT;
arc_flags_t flags = ARC_FLAG_WAIT;
int i;
blkptr_t *cbp;
int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
Expand Down
9 changes: 5 additions & 4 deletions cmd/ztest/ztest.c
Original file line number Diff line number Diff line change
Expand Up @@ -4051,7 +4051,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
* assign an arcbuf to a dbuf.
*/
for (j = 0; j < s; j++) {
if (i != 5) {
if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
bigbuf_arcbufs[j] =
dmu_request_arcbuf(bonus_db, chunksize);
} else {
Expand All @@ -4075,7 +4075,8 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
umem_free(packbuf, packsize);
umem_free(bigbuf, bigsize);
for (j = 0; j < s; j++) {
if (i != 5) {
if (i != 5 ||
chunksize < (SPA_MINBLOCKSIZE * 2)) {
dmu_return_arcbuf(bigbuf_arcbufs[j]);
} else {
dmu_return_arcbuf(
Expand Down Expand Up @@ -4120,7 +4121,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
}
for (off = bigoff, j = 0; j < s; j++, off += chunksize) {
dmu_buf_t *dbt;
if (i != 5) {
if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
bcopy((caddr_t)bigbuf + (off - bigoff),
bigbuf_arcbufs[j]->b_data, chunksize);
} else {
Expand All @@ -4137,7 +4138,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
VERIFY(dmu_buf_hold(os, bigobj, off,
FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0);
}
if (i != 5) {
if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
dmu_assign_arcbuf(bonus_db, off,
bigbuf_arcbufs[j], tx);
} else {
Expand Down
1 change: 1 addition & 0 deletions include/sys/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ COMMON_H = \
$(top_srcdir)/include/sys/efi_partition.h \
$(top_srcdir)/include/sys/metaslab.h \
$(top_srcdir)/include/sys/metaslab_impl.h \
$(top_srcdir)/include/sys/multilist.h \
$(top_srcdir)/include/sys/nvpair.h \
$(top_srcdir)/include/sys/nvpair_impl.h \
$(top_srcdir)/include/sys/range_tree.h \
Expand Down
68 changes: 57 additions & 11 deletions include/sys/arc.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ extern "C" {
#include <sys/spa.h>
#include <sys/refcount.h>

/*
 * Used by arc_flush() to inform arc_evict_state() that it should evict
 * all available buffers from the arc state being passed in.
 *
 * The value is an all-ones unsigned long long. The replacement list is
 * parenthesized so the macro always expands as a single operand, whatever
 * expression it is pasted into.
 */
#define ARC_EVICT_ALL (-1ULL)

typedef struct arc_buf_hdr arc_buf_hdr_t;
typedef struct arc_buf arc_buf_t;
typedef struct arc_prune arc_prune_t;
Expand All @@ -57,6 +63,55 @@ struct arc_prune {
refcount_t p_refcnt;
};

/*
 * Bit flags used by the ARC. Each enumerator occupies exactly one bit;
 * the values are written out as hexadecimal masks.
 */
typedef enum arc_flags {
	/*
	 * Public flags: external consumers may pass these into the ARC.
	 *
	 * Note that ARC_FLAG_NONE is not zero; it occupies bit 0.
	 */
	ARC_FLAG_NONE		= 0x00000001,	/* No flags set */
	ARC_FLAG_WAIT		= 0x00000002,	/* perform sync I/O */
	ARC_FLAG_NOWAIT		= 0x00000004,	/* perform async I/O */
	ARC_FLAG_PREFETCH	= 0x00000008,	/* I/O is a prefetch */
	ARC_FLAG_CACHED		= 0x00000010,	/* I/O was in cache */
	ARC_FLAG_L2CACHE	= 0x00000020,	/* cache in L2ARC */
	ARC_FLAG_L2COMPRESS	= 0x00000040,	/* compress in L2ARC */

	/*
	 * Private flags: these appear in b_flags of the arc_buf_hdr_t and
	 * should only ever be set by ARC code itself.
	 */
	ARC_FLAG_IN_HASH_TABLE	= 0x00000080,	/* buffer is hashed */
	ARC_FLAG_IO_IN_PROGRESS	= 0x00000100,	/* I/O in progress */
	ARC_FLAG_IO_ERROR	= 0x00000200,	/* I/O failed for buf */
	ARC_FLAG_FREED_IN_READ	= 0x00000400,	/* freed during read */
	ARC_FLAG_BUF_AVAILABLE	= 0x00000800,	/* block not in use */
	ARC_FLAG_INDIRECT	= 0x00001000,	/* indirect block */
	ARC_FLAG_L2_WRITING	= 0x00002000,	/* write in progress */
	ARC_FLAG_L2_EVICTED	= 0x00004000,	/* evicted during I/O */
	ARC_FLAG_L2_WRITE_HEAD	= 0x00008000,	/* head of write list */
	/* set when the buffer holds metadata; clear when it holds data */
	ARC_FLAG_BUFC_METADATA	= 0x00010000,

	/* Tell whether the optional sub-header fields of a hdr are defined */
	ARC_FLAG_HAS_L1HDR	= 0x00020000,
	ARC_FLAG_HAS_L2HDR	= 0x00040000,

	/*
	 * The buffer's compression mode lives in the upper bits of the
	 * flags field (bits 24 through 30 here). These placeholder
	 * enumerators exist only so that MDB can interpret the enum
	 * properly.
	 */
	ARC_FLAG_COMPRESS_0	= 0x01000000,
	ARC_FLAG_COMPRESS_1	= 0x02000000,
	ARC_FLAG_COMPRESS_2	= 0x04000000,
	ARC_FLAG_COMPRESS_3	= 0x08000000,
	ARC_FLAG_COMPRESS_4	= 0x10000000,
	ARC_FLAG_COMPRESS_5	= 0x20000000,
	ARC_FLAG_COMPRESS_6	= 0x40000000
} arc_flags_t;

struct arc_buf {
arc_buf_hdr_t *b_hdr;
arc_buf_t *b_next;
Expand All @@ -71,15 +126,6 @@ typedef enum arc_buf_contents {
ARC_BUFC_METADATA, /* buffer contains metadata */
ARC_BUFC_NUMTYPES
} arc_buf_contents_t;
/*
* These are the flags we pass into calls to the arc
*/
#define ARC_WAIT (1 << 1) /* perform I/O synchronously */
#define ARC_NOWAIT (1 << 2) /* perform I/O asynchronously */
#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */
#define ARC_CACHED (1 << 4) /* I/O was already in cache */
#define ARC_L2CACHE (1 << 5) /* cache in L2ARC */
#define ARC_L2COMPRESS (1 << 6) /* compress in L2ARC */

/*
* The following breakdowns of arc_size exist for kstat only.
Expand Down Expand Up @@ -146,7 +192,7 @@ int arc_referenced(arc_buf_t *buf);

int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
arc_done_func_t *done, void *private, zio_priority_t priority, int flags,
uint32_t *arc_flags, const zbookmark_phys_t *zb);
arc_flags_t *arc_flags, const zbookmark_phys_t *zb);
zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
Expand All @@ -160,7 +206,7 @@ void arc_freed(spa_t *spa, const blkptr_t *bp);
void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private);
boolean_t arc_clear_callback(arc_buf_t *buf);

void arc_flush(spa_t *spa);
void arc_flush(spa_t *spa, boolean_t retry);
void arc_tempreserve_clear(uint64_t reserve);
int arc_tempreserve_space(uint64_t reserve, uint64_t txg);

Expand Down
120 changes: 90 additions & 30 deletions include/sys/arc_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,15 +67,25 @@ extern "C" {
*/

typedef struct arc_state {
list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
uint64_t arcs_size; /* total amount of data in this state */
kmutex_t arcs_mtx;
/*
* list of evictable buffers
*/
multilist_t arcs_list[ARC_BUFC_NUMTYPES];
/*
* total amount of evictable data in this state
*/
uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];
/*
* total amount of data in this state; this includes: evictable,
* non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
*/
uint64_t arcs_size;
/*
* supports the "dbufs" kstat
*/
arc_state_type_t arcs_state;
} arc_state_t;

typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;

typedef struct arc_callback arc_callback_t;

struct arc_callback {
Expand All @@ -96,31 +106,49 @@ struct arc_write_callback {
arc_buf_t *awcb_buf;
};

struct arc_buf_hdr {
/* protected by hash lock */
dva_t b_dva;
uint64_t b_birth;
uint64_t b_cksum0;

/*
* ARC buffers are separated into multiple structs as a memory saving measure:
* - Common fields struct, always defined, and embedded within it:
* - L2-only fields, always allocated but undefined when not in L2ARC
* - L1-only fields, only allocated when in L1ARC
*
* Buffer in L1 Buffer only in L2
* +------------------------+ +------------------------+
* | arc_buf_hdr_t | | arc_buf_hdr_t |
* | | | |
* | | | |
* | | | |
* +------------------------+ +------------------------+
* | l2arc_buf_hdr_t | | l2arc_buf_hdr_t |
* | (undefined if L1-only) | | |
* +------------------------+ +------------------------+
* | l1arc_buf_hdr_t |
* | |
* | |
* | |
* | |
* +------------------------+
*
* Because it's possible for the L2ARC to become extremely large, we can wind
* up eating a lot of memory in L2ARC buffer headers, so the size of a header
* is minimized by only allocating the fields necessary for an L1-cached buffer
* when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
* l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
* words in pointers. arc_hdr_realloc() is used to switch a header between
* these two allocation states.
*/
typedef struct l1arc_buf_hdr {
kmutex_t b_freeze_lock;
zio_cksum_t *b_freeze_cksum;

arc_buf_hdr_t *b_hash_next;
arc_buf_t *b_buf;
uint32_t b_flags;
uint32_t b_datacnt;

arc_callback_t *b_acb;
/* for waiting on writes to complete */
kcondvar_t b_cv;

/* immutable */
arc_buf_contents_t b_type;
uint64_t b_size;
uint64_t b_spa;

/* protected by arc state mutex */
arc_state_t *b_state;
list_node_t b_arc_node;
multilist_node_t b_arc_node;

/* updated atomically */
clock_t b_arc_access;
Expand All @@ -133,9 +161,10 @@ struct arc_buf_hdr {
/* self protecting */
refcount_t b_refcnt;

l2arc_buf_hdr_t *b_l2hdr;
list_node_t b_l2node;
};
arc_callback_t *b_acb;
/* temporary buffer holder for in-flight compressed data */
void *b_tmp_cdata;
} l1arc_buf_hdr_t;

typedef struct l2arc_dev {
vdev_t *l2ad_vdev; /* vdev */
Expand All @@ -146,15 +175,46 @@ typedef struct l2arc_dev {
uint64_t l2ad_evict; /* last addr eviction reached */
boolean_t l2ad_first; /* first sweep through */
boolean_t l2ad_writing; /* currently writing */
list_t *l2ad_buflist; /* buffer list */
kmutex_t l2ad_mtx; /* lock for buffer list */
list_t l2ad_buflist; /* buffer list */
list_node_t l2ad_node; /* device list node */
} l2arc_dev_t;

typedef struct l2arc_write_callback {
l2arc_dev_t *l2wcb_dev; /* device info */
arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
} l2arc_write_callback_t;
/*
 * L2ARC-specific portion of an ARC buffer header. It is embedded in
 * struct arc_buf_hdr as the b_l2hdr member.
 */
typedef struct l2arc_buf_hdr {
/* protected by arc_buf_hdr mutex */
l2arc_dev_t *b_dev; /* L2ARC device */
uint64_t b_daddr; /* disk address, offset byte */
/* NOTE(review): presumably a hit counter for this buffer -- confirm in arc.c */
uint32_t b_hits;
/*
 * real alloc'd buffer size depending on b_compress applied
 * NOTE(review): this comment previously sat above b_hits; it appears to
 * describe b_asize -- confirm.
 */
int32_t b_asize;

/* NOTE(review): presumably links this header into an l2arc_dev_t buffer list -- confirm */
list_node_t b_l2node;
} l2arc_buf_hdr_t;

/*
 * Common ARC buffer header. Holds the fields shared by every header and
 * embeds both the L2ARC sub-header (b_l2hdr) and the L1ARC sub-header
 * (b_l1hdr) directly rather than pointing at them.
 */
struct arc_buf_hdr {
/* protected by hash lock */
dva_t b_dva;
uint64_t b_birth;
/*
 * Even though this checksum is only set/verified when a buffer is in
 * the L1 cache, it needs to be in the set of common fields because it
 * must be preserved from the time before a buffer is written out to
 * L2ARC until after it is read back in.
 */
zio_cksum_t *b_freeze_cksum;

/* NOTE(review): presumably the next header on the same hash chain -- confirm */
arc_buf_hdr_t *b_hash_next;
/* ARC_FLAG_* bits; see arc_flags_t */
arc_flags_t b_flags;

/* immutable */
int32_t b_size;
uint64_t b_spa;

/* L2ARC fields. Undefined when not in L2ARC. */
l2arc_buf_hdr_t b_l2hdr;
/* L1ARC fields. Undefined when in l2arc_only state */
l1arc_buf_hdr_t b_l1hdr;
};
#ifdef __cplusplus
}
#endif
Expand Down
2 changes: 0 additions & 2 deletions include/sys/ddt.h
Original file line number Diff line number Diff line change
Expand Up @@ -216,8 +216,6 @@ extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len);
extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp);
extern void ddt_enter(ddt_t *ddt);
extern void ddt_exit(ddt_t *ddt);
extern void ddt_init(void);
extern void ddt_fini(void);
extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add);
extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp);
extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde);
Expand Down
Loading

0 comments on commit b68105e

Please sign in to comment.