Skip to content

Commit

Permalink
Merge branch 'lock-contention-on-arcs_mtx-final'
Browse files Browse the repository at this point in the history
Signed-off-by: Tim Chase <[email protected]>
Signed-off-by: Brian Behlendorf
Closes #3115
Closes #3481
  • Loading branch information
behlendorf committed Jun 11, 2015
2 parents 44de2f0 + 121b3ca commit 06358ea
Show file tree
Hide file tree
Showing 33 changed files with 3,020 additions and 1,486 deletions.
4 changes: 0 additions & 4 deletions cmd/arc_summary/arc_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,12 +191,10 @@ def get_arc_summary(Kstat):
### ARC Misc. ###
deleted = Kstat["kstat.zfs.misc.arcstats.deleted"]
mutex_miss = Kstat["kstat.zfs.misc.arcstats.mutex_miss"]
recycle_miss = Kstat["kstat.zfs.misc.arcstats.recycle_miss"]

### ARC Misc. ###
output["arc_misc"] = {}
output["arc_misc"]["deleted"] = fHits(deleted)
output["arc_misc"]['recycle_miss'] = fHits(recycle_miss)
output["arc_misc"]['mutex_miss'] = fHits(mutex_miss)
output["arc_misc"]['evict_skips'] = fHits(mutex_miss)

Expand Down Expand Up @@ -302,8 +300,6 @@ def _arc_summary(Kstat):
### ARC Misc. ###
sys.stdout.write("ARC Misc:\n")
sys.stdout.write("\tDeleted:\t\t\t\t%s\n" % arc['arc_misc']['deleted'])
sys.stdout.write("\tRecycle Misses:\t\t\t\t%s\n" %
arc['arc_misc']['recycle_miss'])
sys.stdout.write("\tMutex Misses:\t\t\t\t%s\n" %
arc['arc_misc']['mutex_miss'])
sys.stdout.write("\tEvict Skips:\t\t\t\t%s\n" %
Expand Down
2 changes: 0 additions & 2 deletions cmd/arcstat/arcstat.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@
"mrug": [4, 1000, "MRU Ghost List hits per second"],
"eskip": [5, 1000, "evict_skip per second"],
"mtxmis": [6, 1000, "mutex_miss per second"],
"rmis": [4, 1000, "recycle_miss per second"],
"dread": [5, 1000, "Demand accesses per second"],
"pread": [5, 1000, "Prefetch accesses per second"],
"l2hits": [6, 1000, "L2ARC hits per second"],
Expand Down Expand Up @@ -406,7 +405,6 @@ def calculate():
v["mrug"] = d["mru_ghost_hits"] / sint
v["mfug"] = d["mfu_ghost_hits"] / sint
v["eskip"] = d["evict_skip"] / sint
v["rmis"] = d["recycle_miss"] / sint
v["mtxmis"] = d["mutex_miss"] / sint

if l2exist:
Expand Down
2 changes: 1 addition & 1 deletion cmd/zdb/zdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -1250,7 +1250,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
print_indirect(bp, zb, dnp);

if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
uint32_t flags = ARC_WAIT;
arc_flags_t flags = ARC_FLAG_WAIT;
int i;
blkptr_t *cbp;
int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
Expand Down
9 changes: 5 additions & 4 deletions cmd/ztest/ztest.c
Original file line number Diff line number Diff line change
Expand Up @@ -4042,7 +4042,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
* assign an arcbuf to a dbuf.
*/
for (j = 0; j < s; j++) {
if (i != 5) {
if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
bigbuf_arcbufs[j] =
dmu_request_arcbuf(bonus_db, chunksize);
} else {
Expand All @@ -4066,7 +4066,8 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
umem_free(packbuf, packsize);
umem_free(bigbuf, bigsize);
for (j = 0; j < s; j++) {
if (i != 5) {
if (i != 5 ||
chunksize < (SPA_MINBLOCKSIZE * 2)) {
dmu_return_arcbuf(bigbuf_arcbufs[j]);
} else {
dmu_return_arcbuf(
Expand Down Expand Up @@ -4111,7 +4112,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
}
for (off = bigoff, j = 0; j < s; j++, off += chunksize) {
dmu_buf_t *dbt;
if (i != 5) {
if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
bcopy((caddr_t)bigbuf + (off - bigoff),
bigbuf_arcbufs[j]->b_data, chunksize);
} else {
Expand All @@ -4128,7 +4129,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
VERIFY(dmu_buf_hold(os, bigobj, off,
FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0);
}
if (i != 5) {
if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
dmu_assign_arcbuf(bonus_db, off,
bigbuf_arcbufs[j], tx);
} else {
Expand Down
2 changes: 2 additions & 0 deletions include/sys/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ COMMON_H = \
$(top_srcdir)/include/sys/efi_partition.h \
$(top_srcdir)/include/sys/metaslab.h \
$(top_srcdir)/include/sys/metaslab_impl.h \
$(top_srcdir)/include/sys/multilist.h \
$(top_srcdir)/include/sys/nvpair.h \
$(top_srcdir)/include/sys/nvpair_impl.h \
$(top_srcdir)/include/sys/range_tree.h \
Expand All @@ -53,6 +54,7 @@ COMMON_H = \
$(top_srcdir)/include/sys/trace_dbuf.h \
$(top_srcdir)/include/sys/trace_dmu.h \
$(top_srcdir)/include/sys/trace_dnode.h \
$(top_srcdir)/include/sys/trace_multilist.h \
$(top_srcdir)/include/sys/trace_txg.h \
$(top_srcdir)/include/sys/trace_zil.h \
$(top_srcdir)/include/sys/trace_zrlock.h \
Expand Down
75 changes: 63 additions & 12 deletions include/sys/arc.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ extern "C" {
#include <sys/spa.h>
#include <sys/refcount.h>

/*
* Used by arc_flush() to inform arc_evict_state() that it should evict
* all available buffers from the arc state being passed in.
*/
#define ARC_EVICT_ALL -1ULL

typedef struct arc_buf_hdr arc_buf_hdr_t;
typedef struct arc_buf arc_buf_t;
typedef struct arc_prune arc_prune_t;
Expand All @@ -53,10 +59,65 @@ arc_done_func_t arc_getbuf_func;
/*
* Record describing one prune callback: the function to call and the
* private argument that accompanies it, plus list linkage and a
* reference count for the record itself.
*/
struct arc_prune {
arc_prune_func_t *p_pfunc; /* callback function pointer */
void *p_private; /* opaque argument stored alongside p_pfunc */
/* NOTE(review): meaning/units of p_adjust are not visible here -- confirm */
uint64_t p_adjust;
list_node_t p_node; /* linkage for membership in a list_t */
refcount_t p_refcnt; /* reference count on this record */
};

/*
* Eviction strategy selector: either restrict eviction to metadata
* buffers, or additionally allow data buffers to be evicted when needed.
*/
typedef enum arc_strategy {
ARC_STRATEGY_META_ONLY = 0, /* Evict only meta data buffers */
ARC_STRATEGY_META_BALANCED = 1, /* Evict data buffers if needed */
} arc_strategy_t;

/*
* Bit flags used by the ARC. The first group is public and may be passed
* in by external consumers; the remaining groups are private to the ARC
* and appear in the b_flags field of arc_buf_hdr_t.
*/
typedef enum arc_flags
{
/*
* Public flags that can be passed into the ARC by external consumers.
*/
/*
* NOTE(review): despite its comment, ARC_FLAG_NONE is bit 0 (value 1),
* not 0 -- confirm no caller treats it as an empty mask.
*/
ARC_FLAG_NONE = 1 << 0, /* No flags set */
ARC_FLAG_WAIT = 1 << 1, /* perform sync I/O */
ARC_FLAG_NOWAIT = 1 << 2, /* perform async I/O */
ARC_FLAG_PREFETCH = 1 << 3, /* I/O is a prefetch */
ARC_FLAG_CACHED = 1 << 4, /* I/O was in cache */
ARC_FLAG_L2CACHE = 1 << 5, /* cache in L2ARC */
ARC_FLAG_L2COMPRESS = 1 << 6, /* compress in L2ARC */

/*
* Private ARC flags. These flags are private ARC only flags that
* will show up in b_flags in the arc_buf_hdr_t. These flags should
* only be set by ARC code.
*/
ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */
ARC_FLAG_IO_IN_PROGRESS = 1 << 8, /* I/O in progress */
ARC_FLAG_IO_ERROR = 1 << 9, /* I/O failed for buf */
ARC_FLAG_FREED_IN_READ = 1 << 10, /* freed during read */
ARC_FLAG_BUF_AVAILABLE = 1 << 11, /* block not in use */
ARC_FLAG_INDIRECT = 1 << 12, /* indirect block */
ARC_FLAG_L2_WRITING = 1 << 13, /* write in progress */
ARC_FLAG_L2_EVICTED = 1 << 14, /* evicted during I/O */
ARC_FLAG_L2_WRITE_HEAD = 1 << 15, /* head of write list */
/* indicates that the buffer contains metadata (otherwise, data) */
ARC_FLAG_BUFC_METADATA = 1 << 16,

/* Flags specifying whether optional hdr struct fields are defined */
ARC_FLAG_HAS_L1HDR = 1 << 17,
ARC_FLAG_HAS_L2HDR = 1 << 18,

/*
* The arc buffer's compression mode is stored in the top 7 bits of the
* flags field, so these dummy flags are included so that MDB can
* interpret the enum properly.
* The dummy values below occupy bits 24 through 30.
*/
ARC_FLAG_COMPRESS_0 = 1 << 24,
ARC_FLAG_COMPRESS_1 = 1 << 25,
ARC_FLAG_COMPRESS_2 = 1 << 26,
ARC_FLAG_COMPRESS_3 = 1 << 27,
ARC_FLAG_COMPRESS_4 = 1 << 28,
ARC_FLAG_COMPRESS_5 = 1 << 29,
ARC_FLAG_COMPRESS_6 = 1 << 30

} arc_flags_t;

struct arc_buf {
arc_buf_hdr_t *b_hdr;
arc_buf_t *b_next;
Expand All @@ -71,15 +132,6 @@ typedef enum arc_buf_contents {
ARC_BUFC_METADATA, /* buffer contains metadata */
ARC_BUFC_NUMTYPES
} arc_buf_contents_t;
/*
* These are the flags we pass into calls to the arc
*/
#define ARC_WAIT (1 << 1) /* perform I/O synchronously */
#define ARC_NOWAIT (1 << 2) /* perform I/O asynchronously */
#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */
#define ARC_CACHED (1 << 4) /* I/O was already in cache */
#define ARC_L2CACHE (1 << 5) /* cache in L2ARC */
#define ARC_L2COMPRESS (1 << 6) /* compress in L2ARC */

/*
* The following breakdowns of arc_size exist for kstat only.
Expand All @@ -106,7 +158,6 @@ typedef enum arc_state_type {
typedef struct arc_buf_info {
arc_state_type_t abi_state_type;
arc_buf_contents_t abi_state_contents;
uint64_t abi_state_index;
uint32_t abi_flags;
uint32_t abi_datacnt;
uint64_t abi_size;
Expand Down Expand Up @@ -146,7 +197,7 @@ int arc_referenced(arc_buf_t *buf);

int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
arc_done_func_t *done, void *private, zio_priority_t priority, int flags,
uint32_t *arc_flags, const zbookmark_phys_t *zb);
arc_flags_t *arc_flags, const zbookmark_phys_t *zb);
zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
Expand All @@ -160,7 +211,7 @@ void arc_freed(spa_t *spa, const blkptr_t *bp);
void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private);
boolean_t arc_clear_callback(arc_buf_t *buf);

void arc_flush(spa_t *spa);
void arc_flush(spa_t *spa, boolean_t retry);
void arc_tempreserve_clear(uint64_t reserve);
int arc_tempreserve_space(uint64_t reserve, uint64_t txg);

Expand Down
117 changes: 91 additions & 26 deletions include/sys/arc_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,15 +67,25 @@ extern "C" {
*/

typedef struct arc_state {
list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */
uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
uint64_t arcs_size; /* total amount of data in this state */
kmutex_t arcs_mtx;
/*
* list of evictable buffers
*/
multilist_t arcs_list[ARC_BUFC_NUMTYPES];
/*
* total amount of evictable data in this state
*/
uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];
/*
* total amount of data in this state; this includes: evictable,
* non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
*/
uint64_t arcs_size;
/*
* supports the "dbufs" kstat
*/
arc_state_type_t arcs_state;
} arc_state_t;

typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;

typedef struct arc_callback arc_callback_t;

struct arc_callback {
Expand All @@ -96,31 +106,49 @@ struct arc_write_callback {
arc_buf_t *awcb_buf;
};

struct arc_buf_hdr {
/* protected by hash lock */
dva_t b_dva;
uint64_t b_birth;
uint64_t b_cksum0;

/*
* ARC buffers are separated into multiple structs as a memory saving measure:
* - Common fields struct, always defined, and embedded within it:
* - L2-only fields, always allocated but undefined when not in L2ARC
* - L1-only fields, only allocated when in L1ARC
*
* Buffer in L1 Buffer only in L2
* +------------------------+ +------------------------+
* | arc_buf_hdr_t | | arc_buf_hdr_t |
* | | | |
* | | | |
* | | | |
* +------------------------+ +------------------------+
* | l2arc_buf_hdr_t | | l2arc_buf_hdr_t |
* | (undefined if L1-only) | | |
* +------------------------+ +------------------------+
* | l1arc_buf_hdr_t |
* | |
* | |
* | |
* | |
* +------------------------+
*
* Because it's possible for the L2ARC to become extremely large, we can wind
* up eating a lot of memory in L2ARC buffer headers, so the size of a header
* is minimized by only allocating the fields necessary for an L1-cached buffer
* when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
* l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
* words in pointers. arc_hdr_realloc() is used to switch a header between
* these two allocation states.
*/
typedef struct l1arc_buf_hdr {
kmutex_t b_freeze_lock;
zio_cksum_t *b_freeze_cksum;

arc_buf_hdr_t *b_hash_next;
arc_buf_t *b_buf;
uint32_t b_flags;
uint32_t b_datacnt;

arc_callback_t *b_acb;
/* for waiting on writes to complete */
kcondvar_t b_cv;

/* immutable */
arc_buf_contents_t b_type;
uint64_t b_size;
uint64_t b_spa;

/* protected by arc state mutex */
arc_state_t *b_state;
list_node_t b_arc_node;
multilist_node_t b_arc_node;

/* updated atomically */
clock_t b_arc_access;
Expand All @@ -133,9 +161,10 @@ struct arc_buf_hdr {
/* self protecting */
refcount_t b_refcnt;

l2arc_buf_hdr_t *b_l2hdr;
list_node_t b_l2node;
};
arc_callback_t *b_acb;
/* temporary buffer holder for in-flight compressed data */
void *b_tmp_cdata;
} l1arc_buf_hdr_t;

typedef struct l2arc_dev {
vdev_t *l2ad_vdev; /* vdev */
Expand All @@ -146,15 +175,51 @@ typedef struct l2arc_dev {
uint64_t l2ad_evict; /* last addr eviction reached */
boolean_t l2ad_first; /* first sweep through */
boolean_t l2ad_writing; /* currently writing */
list_t *l2ad_buflist; /* buffer list */
kmutex_t l2ad_mtx; /* lock for buffer list */
list_t l2ad_buflist; /* buffer list */
list_node_t l2ad_node; /* device list node */
} l2arc_dev_t;

/*
* L2ARC-specific portion of a buffer header: which cache device holds the
* buffer, where on that device it lives, and its linkage into a list.
*/
typedef struct l2arc_buf_hdr {
/* protected by arc_buf_hdr mutex */
l2arc_dev_t *b_dev; /* L2ARC device */
uint64_t b_daddr; /* disk address, offset byte */
/*
* real alloc'd buffer size depending on b_compress applied
* NOTE(review): this text describes a size, so it presumably documents
* b_asize below rather than b_hits -- confirm and move if so.
*/
uint32_t b_hits;
int32_t b_asize;

list_node_t b_l2node; /* list linkage for this header */
} l2arc_buf_hdr_t;

/*
* Context for an L2ARC write: the target device and the header that
* marks the head of the write buflist.
* NOTE(review): presumably handed to the write-completion callback --
* confirm against the caller.
*/
typedef struct l2arc_write_callback {
l2arc_dev_t *l2wcb_dev; /* device info */
arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
} l2arc_write_callback_t;

/*
* Common ARC buffer header. It holds the fields shared by every header,
* followed by the L2ARC and L1ARC sub-headers, both embedded by value
* rather than referenced through pointers.
*/
struct arc_buf_hdr {
/* protected by hash lock */
dva_t b_dva;
uint64_t b_birth;
/*
* Even though this checksum is only set/verified when a buffer is in
* the L1 cache, it needs to be in the set of common fields because it
* must be preserved from the time before a buffer is written out to
* L2ARC until after it is read back in.
*/
zio_cksum_t *b_freeze_cksum;

arc_buf_hdr_t *b_hash_next; /* next header in the same hash chain */
arc_flags_t b_flags; /* ARC_FLAG_* bits */

/* immutable */
int32_t b_size;
uint64_t b_spa;

/* L2ARC fields. Undefined when not in L2ARC. */
l2arc_buf_hdr_t b_l2hdr;
/* L1ARC fields. Undefined when in l2arc_only state */
l1arc_buf_hdr_t b_l1hdr;
};
#ifdef __cplusplus
}
#endif
Expand Down
Loading

0 comments on commit 06358ea

Please sign in to comment.