From 98ab38d1096079d82247350f526f0d7268956fb5 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 30 Sep 2013 11:45:58 -0700 Subject: [PATCH 1/7] Revert "Add new kstat for monitoring time in dmu_tx_assign" This reverts commit 92334b14ec378b1693573b52c09816bbade9cf3e. Signed-off-by: Brian Behlendorf --- include/sys/dsl_pool.h | 5 ---- module/zfs/dmu_tx.c | 8 ------ module/zfs/dsl_pool.c | 59 ------------------------------------------ 3 files changed, 72 deletions(-) diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index 51b588e6aecc..807d76b11f9f 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -90,7 +90,6 @@ typedef struct dsl_pool { uint64_t dp_root_dir_obj; struct taskq *dp_iput_taskq; kstat_t *dp_txg_kstat; - kstat_t *dp_tx_assign_kstat; /* No lock needed - sync context only */ blkptr_t dp_meta_rootbp; @@ -113,8 +112,6 @@ typedef struct dsl_pool { uint64_t dp_mos_uncompressed_delta; uint64_t dp_txg_history_size; list_t dp_txg_history; - uint64_t dp_tx_assign_size; - kstat_named_t *dp_tx_assign_buckets; /* Has its own locking */ @@ -171,8 +168,6 @@ int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **); int dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp); void dsl_pool_rele(dsl_pool_t *dp, void *tag); -void dsl_pool_tx_assign_add_usecs(dsl_pool_t *dp, uint64_t usecs); - txg_history_t *dsl_pool_txg_history_add(dsl_pool_t *dp, uint64_t txg); txg_history_t *dsl_pool_txg_history_get(dsl_pool_t *dp, uint64_t txg); void dsl_pool_txg_history_put(txg_history_t *th); diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 3e46a02f8e57..17982a04eca4 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -1077,15 +1077,12 @@ dmu_tx_unassign(dmu_tx_t *tx) int dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how) { - hrtime_t before, after; int err; ASSERT(tx->tx_txg == 0); ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT); ASSERT(!dsl_pool_sync_context(tx->tx_pool)); - before = gethrtime(); - /* If we might wait, we must not hold the config lock. */ ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool)); @@ -1100,11 +1097,6 @@ dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how) txg_rele_to_quiesce(&tx->tx_txgh); - after = gethrtime(); - - dsl_pool_tx_assign_add_usecs(tx->tx_pool, - (after - before) / NSEC_PER_USEC); - return (0); } diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index da4a9df85449..f81d843543a3 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -59,63 +59,6 @@ kmutex_t zfs_write_limit_lock; static pgcnt_t old_physmem = 0; -static void -dsl_pool_tx_assign_init(dsl_pool_t *dp, unsigned int ndata) -{ - kstat_named_t *ks; - char name[KSTAT_STRLEN]; - int i, data_size = ndata * sizeof(kstat_named_t); - - (void) snprintf(name, KSTAT_STRLEN, "dmu_tx_assign-%s", - spa_name(dp->dp_spa)); - - dp->dp_tx_assign_size = ndata; - - if (data_size) - dp->dp_tx_assign_buckets = kmem_alloc(data_size, KM_SLEEP); - else - dp->dp_tx_assign_buckets = NULL; - - for (i = 0; i < dp->dp_tx_assign_size; i++) { - ks = &dp->dp_tx_assign_buckets[i]; - ks->data_type = KSTAT_DATA_UINT64; - ks->value.ui64 = 0; - (void) snprintf(ks->name, KSTAT_STRLEN, "%u us", 1 << i); - } - - dp->dp_tx_assign_kstat = kstat_create("zfs", 0, name, "misc", - KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL); - - if (dp->dp_tx_assign_kstat) { - dp->dp_tx_assign_kstat->ks_data = dp->dp_tx_assign_buckets; - dp->dp_tx_assign_kstat->ks_ndata = dp->dp_tx_assign_size; - dp->dp_tx_assign_kstat->ks_data_size = data_size; - kstat_install(dp->dp_tx_assign_kstat); - } -} - -static void -dsl_pool_tx_assign_destroy(dsl_pool_t *dp) -{ - if (dp->dp_tx_assign_buckets) - kmem_free(dp->dp_tx_assign_buckets, - dp->dp_tx_assign_size * sizeof(kstat_named_t)); - - if (dp->dp_tx_assign_kstat) - kstat_delete(dp->dp_tx_assign_kstat); -} - -void -dsl_pool_tx_assign_add_usecs(dsl_pool_t *dp, uint64_t usecs) -{ - uint64_t idx = 0; - - while (((1 << idx) < usecs) && (idx < dp->dp_tx_assign_size - 1)) - idx++; - - atomic_inc_64(&dp->dp_tx_assign_buckets[idx].value.ui64); -} - static int dsl_pool_txg_history_update(kstat_t *ksp, int rw) { @@ -296,7 +239,6 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) 1, 4, 0); dsl_pool_txg_history_init(dp, txg); - dsl_pool_tx_assign_init(dp, 32); return (dp); } @@ -438,7 +380,6 @@ dsl_pool_close(dsl_pool_t *dp) arc_flush(dp->dp_spa); txg_fini(dp); dsl_scan_fini(dp); - dsl_pool_tx_assign_destroy(dp); dsl_pool_txg_history_destroy(dp); rrw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); From 76463d4026e0fa4b3d7b96acd58cb5fb79c49af7 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 30 Sep 2013 11:51:20 -0700 Subject: [PATCH 2/7] Revert "Add txgs- kstat file" This reverts commit e95853a331529a6cb96fdf10476c53441e59f4e1. --- include/sys/dsl_pool.h | 15 ---- lib/libspl/include/sys/kstat.h | 27 +------ module/zfs/dsl_pool.c | 144 --------------------------------- module/zfs/txg.c | 46 ----------- 4 files changed, 1 insertion(+), 231 deletions(-) diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index 807d76b11f9f..25abd9c03db1 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -71,13 +71,6 @@ typedef struct zfs_all_blkstats { zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1]; } zfs_all_blkstats_t; -typedef struct txg_history { - kstat_txg_t th_kstat; - vdev_stat_t th_vs1; - vdev_stat_t th_vs2; - kmutex_t th_lock; - list_node_t th_link; -} txg_history_t; typedef struct dsl_pool { /* Immutable */ @@ -89,7 +82,6 @@ typedef struct dsl_pool { struct dsl_dataset *dp_origin_snap; uint64_t dp_root_dir_obj; struct taskq *dp_iput_taskq; - kstat_t *dp_txg_kstat; /* No lock needed - sync context only */ blkptr_t dp_meta_rootbp; @@ -110,9 +102,6 @@ typedef struct dsl_pool { uint64_t dp_mos_used_delta; uint64_t dp_mos_compressed_delta; uint64_t dp_mos_uncompressed_delta; - uint64_t dp_txg_history_size; - list_t dp_txg_history; - /* Has its own locking */ tx_state_t dp_tx; @@ -168,10 +157,6 @@ int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **); int dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp); void dsl_pool_rele(dsl_pool_t *dp, void *tag); -txg_history_t *dsl_pool_txg_history_add(dsl_pool_t *dp, uint64_t txg); -txg_history_t *dsl_pool_txg_history_get(dsl_pool_t *dp, uint64_t txg); -void dsl_pool_txg_history_put(txg_history_t *th); - #ifdef __cplusplus } #endif diff --git a/lib/libspl/include/sys/kstat.h b/lib/libspl/include/sys/kstat.h index 6bd2ec878a9c..fcd3ed98b382 100644 --- a/lib/libspl/include/sys/kstat.h +++ b/lib/libspl/include/sys/kstat.h @@ -228,10 +228,8 @@ typedef struct kstat32 { /* ks_ndata == 1 */ #define KSTAT_TYPE_TIMER 4 /* event timer */ /* ks_ndata >= 1 */ -#define KSTAT_TYPE_TXG 5 /* txg statistics */ - /* ks_ndata >= 0 */ -#define KSTAT_NUM_TYPES 6 +#define KSTAT_NUM_TYPES 5 /* * kstat class @@ -700,29 +698,6 @@ typedef struct kstat_timer { #define KSTAT_TIMER_PTR(kptr) ((kstat_timer_t *)(kptr)->ks_data) -/* - * TXG statistics - bytes read/written and iops performed - */ -typedef enum kstat_txg_state { - TXG_STATE_OPEN = 1, - TXG_STATE_QUIESCING = 2, - TXG_STATE_SYNCING = 3, - TXG_STATE_COMMITTED = 4, -} kstat_txg_state_t; - -typedef struct kstat_txg { - u_longlong_t txg; /* txg id */ - kstat_txg_state_t state; /* txg state */ - hrtime_t birth; /* birth time stamp */ - u_longlong_t nread; /* number of bytes read */ - u_longlong_t nwritten; /* number of bytes written */ - uint_t reads; /* number of read operations */ - uint_t writes; /* number of write operations */ - hrtime_t open_time; /* open time */ - hrtime_t quiesce_time; /* quiesce time */ - hrtime_t sync_time; /* sync time */ -} kstat_txg_t; - #if defined(_KERNEL) #include diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index f81d843543a3..ffef9be75a78 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -48,7 +48,6 @@ int zfs_no_write_throttle = 0; int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ int zfs_txg_synctime_ms = 1000; /* target millisecs to sync a txg */ -int zfs_txg_history = 60; /* statistics for the last N txgs */ unsigned long zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */ unsigned long zfs_write_limit_max = 0; /* max data payload per txg */ @@ -59,143 +58,6 @@ kmutex_t zfs_write_limit_lock; static pgcnt_t old_physmem = 0; -static int -dsl_pool_txg_history_update(kstat_t *ksp, int rw) -{ - dsl_pool_t *dp = ksp->ks_private; - txg_history_t *th; - int i = 0; - - if (rw == KSTAT_WRITE) - return (EACCES); - - if (ksp->ks_data) - kmem_free(ksp->ks_data, ksp->ks_data_size); - - mutex_enter(&dp->dp_lock); - - ksp->ks_ndata = dp->dp_txg_history_size; - ksp->ks_data_size = dp->dp_txg_history_size * sizeof(kstat_txg_t); - if (ksp->ks_data_size > 0) - ksp->ks_data = kmem_alloc(ksp->ks_data_size, KM_PUSHPAGE); - - /* Traversed oldest to youngest for the most readable kstat output */ - for (th = list_tail(&dp->dp_txg_history); th != NULL; - th = list_prev(&dp->dp_txg_history, th)) { - mutex_enter(&th->th_lock); - ASSERT3S(i + sizeof(kstat_txg_t), <=, ksp->ks_data_size); - memcpy(ksp->ks_data + i, &th->th_kstat, sizeof(kstat_txg_t)); - i += sizeof(kstat_txg_t); - mutex_exit(&th->th_lock); - } - - mutex_exit(&dp->dp_lock); - - return (0); -} - -static void -dsl_pool_txg_history_init(dsl_pool_t *dp, uint64_t txg) -{ - char name[KSTAT_STRLEN]; - - list_create(&dp->dp_txg_history, sizeof (txg_history_t), - offsetof(txg_history_t, th_link)); - dsl_pool_txg_history_add(dp, txg); - - (void) snprintf(name, KSTAT_STRLEN, "txgs-%s", spa_name(dp->dp_spa)); - dp->dp_txg_kstat = kstat_create("zfs", 0, name, "misc", - KSTAT_TYPE_TXG, 0, KSTAT_FLAG_VIRTUAL); - if (dp->dp_txg_kstat) { - dp->dp_txg_kstat->ks_data = NULL; - dp->dp_txg_kstat->ks_private = dp; - dp->dp_txg_kstat->ks_update = dsl_pool_txg_history_update; - kstat_install(dp->dp_txg_kstat); - } -} - -static void -dsl_pool_txg_history_destroy(dsl_pool_t *dp) -{ - txg_history_t *th; - - if (dp->dp_txg_kstat) { - if (dp->dp_txg_kstat->ks_data) - kmem_free(dp->dp_txg_kstat->ks_data, - dp->dp_txg_kstat->ks_data_size); - - kstat_delete(dp->dp_txg_kstat); - } - - mutex_enter(&dp->dp_lock); - while ((th = list_remove_head(&dp->dp_txg_history))) { - dp->dp_txg_history_size--; - mutex_destroy(&th->th_lock); - kmem_free(th, sizeof(txg_history_t)); - } - - ASSERT3U(dp->dp_txg_history_size, ==, 0); - list_destroy(&dp->dp_txg_history); - mutex_exit(&dp->dp_lock); -} - -txg_history_t * -dsl_pool_txg_history_add(dsl_pool_t *dp, uint64_t txg) -{ - txg_history_t *th, *rm; - - th = kmem_zalloc(sizeof(txg_history_t), KM_PUSHPAGE); - mutex_init(&th->th_lock, NULL, MUTEX_DEFAULT, NULL); - th->th_kstat.txg = txg; - th->th_kstat.state = TXG_STATE_OPEN; - th->th_kstat.birth = gethrtime(); - - mutex_enter(&dp->dp_lock); - - list_insert_head(&dp->dp_txg_history, th); - dp->dp_txg_history_size++; - - while (dp->dp_txg_history_size > zfs_txg_history) { - dp->dp_txg_history_size--; - rm = list_remove_tail(&dp->dp_txg_history); - mutex_destroy(&rm->th_lock); - kmem_free(rm, sizeof(txg_history_t)); - } - - mutex_exit(&dp->dp_lock); - - return (th); -} - -/* - * Traversed youngest to oldest because lookups are only done for open - * or syncing txgs which are guaranteed to be at the head of the list. - * The txg_history_t structure will be returned locked. - */ -txg_history_t * -dsl_pool_txg_history_get(dsl_pool_t *dp, uint64_t txg) -{ - txg_history_t *th; - - mutex_enter(&dp->dp_lock); - for (th = list_head(&dp->dp_txg_history); th != NULL; - th = list_next(&dp->dp_txg_history, th)) { - if (th->th_kstat.txg == txg) { - mutex_enter(&th->th_lock); - break; - } - } - mutex_exit(&dp->dp_lock); - - return (th); -} - -void -dsl_pool_txg_history_put(txg_history_t *th) -{ - mutex_exit(&th->th_lock); -} - int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) { @@ -238,8 +100,6 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) dp->dp_iput_taskq = taskq_create("zfs_iput_taskq", 1, minclsyspri, 1, 4, 0); - dsl_pool_txg_history_init(dp, txg); - return (dp); } @@ -380,7 +240,6 @@ dsl_pool_close(dsl_pool_t *dp) arc_flush(dp->dp_spa); txg_fini(dp); dsl_scan_fini(dp); - dsl_pool_txg_history_destroy(dp); rrw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); taskq_destroy(dp->dp_iput_taskq); @@ -1182,9 +1041,6 @@ MODULE_PARM_DESC(zfs_write_limit_shift, "log2(fraction of memory) per txg"); module_param(zfs_txg_synctime_ms, int, 0644); MODULE_PARM_DESC(zfs_txg_synctime_ms, "Target milliseconds between txg sync"); -module_param(zfs_txg_history, int, 0644); -MODULE_PARM_DESC(zfs_txg_history, "Historic statistics for the last N txgs"); - module_param(zfs_write_limit_min, ulong, 0444); MODULE_PARM_DESC(zfs_write_limit_min, "Min txg write limit"); diff --git a/module/zfs/txg.c b/module/zfs/txg.c index b3e537f459c4..440353a50e2e 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -31,7 +31,6 @@ #include #include #include -#include /* * ZFS Transaction Groups @@ -351,8 +350,6 @@ txg_rele_to_sync(txg_handle_t *th) static void txg_quiesce(dsl_pool_t *dp, uint64_t txg) { - hrtime_t start; - txg_history_t *th; tx_state_t *tx = &dp->dp_tx; int g = txg & TXG_MASK; int c; @@ -373,20 +370,9 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) for (c = 0; c < max_ncpus; c++) mutex_exit(&tx->tx_cpu[c].tc_lock); - /* - * Measure how long the txg was open and replace the kstat. - */ - th = dsl_pool_txg_history_get(dp, txg); - th->th_kstat.open_time = gethrtime() - th->th_kstat.birth; - th->th_kstat.state = TXG_STATE_QUIESCING; - dsl_pool_txg_history_put(th); - dsl_pool_txg_history_add(dp, tx->tx_open_txg); - /* * Quiesce the transaction group by waiting for everyone to txg_exit(). */ - start = gethrtime(); - for (c = 0; c < max_ncpus; c++) { tx_cpu_t *tc = &tx->tx_cpu[c]; mutex_enter(&tc->tc_lock); @@ -394,13 +380,6 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) cv_wait(&tc->tc_cv[g], &tc->tc_lock); mutex_exit(&tc->tc_lock); } - - /* - * Measure how long the txg took to quiesce. - */ - th = dsl_pool_txg_history_get(dp, txg); - th->th_kstat.quiesce_time = gethrtime() - start; - dsl_pool_txg_history_put(th); } static void @@ -487,8 +466,6 @@ txg_sync_thread(dsl_pool_t *dp) start = delta = 0; for (;;) { - hrtime_t hrstart; - txg_history_t *th; uint64_t timer, timeout; uint64_t txg; @@ -535,17 +512,11 @@ txg_sync_thread(dsl_pool_t *dp) tx->tx_syncing_txg = txg; cv_broadcast(&tx->tx_quiesce_more_cv); - th = dsl_pool_txg_history_get(dp, txg); - th->th_kstat.state = TXG_STATE_SYNCING; - vdev_get_stats(spa->spa_root_vdev, &th->th_vs1); - dsl_pool_txg_history_put(th); - dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); mutex_exit(&tx->tx_sync_lock); start = ddi_get_lbolt(); - hrstart = gethrtime(); spa_sync(spa, txg); delta = ddi_get_lbolt() - start; @@ -558,23 +529,6 @@ txg_sync_thread(dsl_pool_t *dp) * Dispatch commit callbacks to worker threads. */ txg_dispatch_callbacks(dp, txg); - - /* - * Measure the txg sync time determine the amount of I/O done. - */ - th = dsl_pool_txg_history_get(dp, txg); - vdev_get_stats(spa->spa_root_vdev, &th->th_vs2); - th->th_kstat.sync_time = gethrtime() - hrstart; - th->th_kstat.nread = th->th_vs2.vs_bytes[ZIO_TYPE_READ] - - th->th_vs1.vs_bytes[ZIO_TYPE_READ]; - th->th_kstat.nwritten = th->th_vs2.vs_bytes[ZIO_TYPE_WRITE] - - th->th_vs1.vs_bytes[ZIO_TYPE_WRITE]; - th->th_kstat.reads = th->th_vs2.vs_ops[ZIO_TYPE_READ] - - th->th_vs1.vs_ops[ZIO_TYPE_READ]; - th->th_kstat.writes = th->th_vs2.vs_ops[ZIO_TYPE_WRITE] - - th->th_vs1.vs_ops[ZIO_TYPE_WRITE]; - th->th_kstat.state = TXG_STATE_COMMITTED; - dsl_pool_txg_history_put(th); } } From 1421c89142376bfd41e4de22ed7c7846b9e41f95 Mon Sep 17 00:00:00 2001 From: Prakash Surya Date: Fri, 6 Sep 2013 16:09:05 -0700 Subject: [PATCH 3/7] Add visibility in to arc_read This change is an attempt to add visibility into the arc_read calls occurring on a system, in real time. To do this, a list was added to the in memory SPA data structure for a pool, with each element on the list corresponding to a call to arc_read. These entries are then exported through the kstat interface, which can then be interpreted in userspace. For each arc_read call, the following information is exported: * A unique identifier (uint64_t) * The time the entry was added to the list (hrtime_t) (*not* wall clock time; relative to the other entries on the list) * The objset ID (uint64_t) * The object number (uint64_t) * The indirection level (uint64_t) * The block ID (uint64_t) * The name of the function originating the arc_read call (char[24]) * The arc_flags from the arc_read call (uint32_t) * The PID of the reading thread (pid_t) * The command or name of thread originating read (char[16]) From this exported information one can see, in real time, exactly what is being read, what function is generating the read, and whether or not the read was found to be already cached. There is still some work to be done, but this should serve as a good starting point. Specifically, dbuf_read's are not accounted for in the currently exported information. Thus, a follow up patch should probably be added to export these calls that never call into arc_read (they only hit the dbuf hash table). In addition, it might be nice to create a utility similar to "arcstat.py" to digest the exported information and display it in a more readable format. Or perhaps, log the information and allow for it to be "replayed" at a later time. Signed-off-by: Prakash Surya Signed-off-by: Brian Behlendorf --- include/sys/spa.h | 21 ++++ include/sys/spa_impl.h | 2 + include/sys/zfs_context.h | 5 + include/sys/zio.h | 6 +- lib/libzpool/Makefile.am | 1 + lib/libzpool/kernel.c | 8 ++ module/zfs/Makefile.in | 1 + module/zfs/arc.c | 20 +-- module/zfs/dmu_traverse.c | 7 +- module/zfs/spa_misc.c | 4 +- module/zfs/spa_stats.c | 248 ++++++++++++++++++++++++++++++++++++++ 11 files changed, 310 insertions(+), 13 deletions(-) create mode 100644 module/zfs/spa_stats.c diff --git a/include/sys/spa.h b/include/sys/spa.h index 401ae8343e96..a4a717ac0450 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -51,6 +51,8 @@ typedef struct zilog zilog_t; typedef struct spa_aux_vdev spa_aux_vdev_t; typedef struct ddt ddt_t; typedef struct ddt_entry ddt_entry_t; +typedef struct zbookmark zbookmark_t; + struct dsl_pool; struct dsl_dataset; @@ -534,6 +536,25 @@ extern boolean_t spa_refcount_zero(spa_t *spa); #define SCL_ALL ((1 << SCL_LOCKS) - 1) #define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO) +/* Historical pool statistics */ +typedef struct spa_stats_history { + kmutex_t lock; + uint64_t count; + uint64_t size; + kstat_t *kstat; + void *private; + list_t list; +} spa_stats_history_t; + +typedef struct spa_stats { + spa_stats_history_t read_history; +} spa_stats_t; + +extern void spa_stats_init(spa_t *spa); +extern void spa_stats_destroy(spa_t *spa); +extern void spa_read_history_add(spa_t *spa, const zbookmark_t *zb, + uint32_t aflags); + /* Pool configuration locks */ extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 47dfe432eed2..107d14a76cfd 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -236,6 +236,8 @@ struct spa { uint64_t spa_deadman_calls; /* number of deadman calls */ uint64_t spa_sync_starttime; /* starting time fo spa_sync */ uint64_t spa_deadman_synctime; /* deadman expiration timer */ + spa_stats_t spa_stats; /* assorted spa statistics */ + /* * spa_refcnt & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index a126c058ebd9..dfd11d9f1e5e 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -210,6 +210,7 @@ typedef struct kthread { } kthread_t; #define curthread zk_thread_current() +#define getcomm() "unknown" #define thread_exit zk_thread_exit #define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ zk_thread_create(stk, stksize, (thread_func_t)func, arg, \ @@ -347,6 +348,10 @@ extern kstat_t *kstat_create(char *, int, char *, char *, uchar_t, ulong_t, uchar_t); extern void kstat_install(kstat_t *); extern void kstat_delete(kstat_t *); +extern void kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void *(*addr)(kstat_t *ksp, loff_t index)); /* * Kernel memory diff --git a/include/sys/zio.h b/include/sys/zio.h index 189966bef54c..f5a128e0b319 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -256,12 +256,13 @@ extern char *zio_type_name[ZIO_TYPES]; * Therefore it must not change size or alignment between 32/64 bit * compilation options. */ -typedef struct zbookmark { +struct zbookmark { uint64_t zb_objset; uint64_t zb_object; int64_t zb_level; uint64_t zb_blkid; -} zbookmark_t; + char * zb_func; +}; #define SET_BOOKMARK(zb, objset, object, level, blkid) \ { \ @@ -269,6 +270,7 @@ typedef struct zbookmark { (zb)->zb_object = object; \ (zb)->zb_level = level; \ (zb)->zb_blkid = blkid; \ + (zb)->zb_func = FTAG; \ } #define ZB_DESTROYED_OBJSET (-1ULL) diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index da81659d240c..b2d6394546a8 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -62,6 +62,7 @@ libzpool_la_SOURCES = \ $(top_srcdir)/module/zfs/spa_errlog.c \ $(top_srcdir)/module/zfs/spa_history.c \ $(top_srcdir)/module/zfs/spa_misc.c \ + $(top_srcdir)/module/zfs/spa_stats.c \ $(top_srcdir)/module/zfs/space_map.c \ $(top_srcdir)/module/zfs/txg.c \ $(top_srcdir)/module/zfs/uberblock.c \ diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index e4d645cf7797..57a3739e23e8 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -240,6 +240,14 @@ void kstat_delete(kstat_t *ksp) {} +/*ARGSUSED*/ +void +kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void *(*addr)(kstat_t *ksp, loff_t index)) +{} + /* * ========================================================================= * mutexes diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index e71228454110..6f0f6ef05ae1 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -44,6 +44,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/spa_config.o $(MODULE)-objs += @top_srcdir@/module/zfs/spa_errlog.o $(MODULE)-objs += @top_srcdir@/module/zfs/spa_history.o $(MODULE)-objs += @top_srcdir@/module/zfs/spa_misc.o +$(MODULE)-objs += @top_srcdir@/module/zfs/spa_stats.o $(MODULE)-objs += @top_srcdir@/module/zfs/space_map.o $(MODULE)-objs += @top_srcdir@/module/zfs/txg.o $(MODULE)-objs += @top_srcdir@/module/zfs/uberblock.o diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 1298c5b91bc6..366f5bf9b5da 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -2943,6 +2943,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, kmutex_t *hash_lock; zio_t *rzio; uint64_t guid = spa_load_guid(spa); + int rc = 0; top: hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), @@ -2976,10 +2977,10 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, hdr->b_acb = acb; add_reference(hdr, hash_lock, private); mutex_exit(hash_lock); - return (0); + goto out; } mutex_exit(hash_lock); - return (0); + goto out; } ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); @@ -3174,12 +3175,12 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, if (*arc_flags & ARC_NOWAIT) { zio_nowait(rzio); - return (0); + goto out; } ASSERT(*arc_flags & ARC_WAIT); if (zio_wait(rzio) == 0) - return (0); + goto out; /* l2arc read error; goto zio_read() */ } else { @@ -3203,13 +3204,18 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, rzio = zio_read(pio, spa, bp, buf->b_data, size, arc_read_done, buf, priority, zio_flags, zb); - if (*arc_flags & ARC_WAIT) - return (zio_wait(rzio)); + if (*arc_flags & ARC_WAIT) { + rc = zio_wait(rzio); + goto out; + } ASSERT(*arc_flags & ARC_NOWAIT); zio_nowait(rzio); } - return (0); + +out: + spa_read_history_add(spa, zb, *arc_flags); + return (rc); } arc_prune_t * diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 32b3e50fc005..1827ffc7413f 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -517,6 +517,9 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL); + SET_BOOKMARK(czb, td->td_objset, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + /* See comment on ZIL traversal in dsl_scan_visitds. */ if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) { uint32_t flags = ARC_WAIT; @@ -525,7 +528,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, err = arc_read(NULL, td->td_spa, rootbp, arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL); + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, czb); if (err != 0) return (err); @@ -539,8 +542,6 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, td, TQ_NOQUEUE)) pd->pd_exited = B_TRUE; - SET_BOOKMARK(czb, td->td_objset, - ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); err = traverse_visitbp(td, NULL, rootbp, czb); mutex_enter(&pd->pd_mtx); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 9ba6e896338b..a0b4797c97cb 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -48,6 +48,7 @@ #include #include #include +#include #include "zfs_prop.h" #include "zfeature_common.h" @@ -253,7 +254,6 @@ unsigned long zfs_deadman_synctime = 1000ULL; */ int zfs_deadman_enabled = 1; - /* * ========================================================================== * SPA config locking @@ -495,6 +495,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); + spa_stats_init(spa); avl_add(&spa_namespace_avl, spa); @@ -580,6 +581,7 @@ spa_remove(spa_t *spa) refcount_destroy(&spa->spa_refcount); + spa_stats_destroy(spa); spa_config_lock_destroy(spa); for (t = 0; t < TXG_SIZE; t++) diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c new file mode 100644 index 000000000000..271a3c312f7e --- /dev/null +++ b/module/zfs/spa_stats.c @@ -0,0 +1,248 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include + +/* + * Keeps stats on last N reads per spa_t, disabled by default. + */ +int zfs_read_history = 0; + +/* + * Include cache hits in history, disabled by default. + */ +int zfs_read_history_hits = 0; + +/* + * ========================================================================== + * SPA Read History Routines + * ========================================================================== + */ + +/* + * Read statistics - Information exported regarding each arc_read call + */ +typedef struct spa_read_history { + uint64_t uid; /* unique identifier */ + hrtime_t start; /* time read completed */ + uint64_t objset; /* read from this objset */ + uint64_t object; /* read of this object number */ + uint64_t level; /* block's indirection level */ + uint64_t blkid; /* read of this block id */ + char origin[24]; /* read originated from here */ + uint32_t aflags; /* ARC flags (cached, prefetch, etc.) */ + pid_t pid; /* PID of task doing read */ + char comm[16]; /* process name of task doing read */ + list_node_t srh_link; +} spa_read_history_t; + +static int +spa_read_history_headers(char *buf, size_t size) +{ + size = snprintf(buf, size - 1, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s " + "%-24s %-8s %-16s\n", "UID", "start", "objset", "object", + "level", "blkid", "aflags", "origin", "pid", "process"); + buf[size] = '\0'; + + return (0); +} + +static int +spa_read_history_data(char *buf, size_t size, void *data) +{ + spa_read_history_t *srh = (spa_read_history_t *)data; + + size = snprintf(buf, size - 1, "%-8llu %-16llu 0x%-6llx " + "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n", + (u_longlong_t)srh->uid, srh->start, + (longlong_t)srh->objset, (longlong_t)srh->object, + (longlong_t)srh->level, (longlong_t)srh->blkid, + srh->aflags, srh->origin, srh->pid, srh->comm); + buf[size] = '\0'; + + return (0); +} + +/* + * Calculate the address for the next spa_stats_history_t entry. The + * ssh->lock will be held until ksp->ks_ndata entries are processed. + */ +static void * +spa_read_history_addr(kstat_t *ksp, loff_t n) +{ + spa_t *spa = ksp->ks_private; + spa_stats_history_t *ssh = &spa->spa_stats.read_history; + + ASSERT(MUTEX_HELD(&ssh->lock)); + + if (n == 0) + ssh->private = list_tail(&ssh->list); + else if (ssh->private) + ssh->private = list_prev(&ssh->list, ssh->private); + + return (ssh->private); +} + +/* + * When the kstat is written discard all spa_read_history_t entires. The + * ssh->lock will be held until ksp->ks_ndata entries are processed. + */ +static int +spa_read_history_update(kstat_t *ksp, int rw) +{ + spa_t *spa = ksp->ks_private; + spa_stats_history_t *ssh = &spa->spa_stats.read_history; + + if (rw == KSTAT_WRITE) { + spa_read_history_t *srh; + + while ((srh = list_remove_head(&ssh->list))) { + ssh->size--; + kmem_free(srh, sizeof(spa_read_history_t)); + } + + ASSERT3U(ssh->size, ==, 0); + } + + ksp->ks_ndata = ssh->size; + ksp->ks_data_size = ssh->size * sizeof(spa_read_history_t); + + return (0); +} + +static void +spa_read_history_init(spa_t *spa) +{ + spa_stats_history_t *ssh = &spa->spa_stats.read_history; + char name[KSTAT_STRLEN]; + kstat_t *ksp; + + mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&ssh->list, sizeof (spa_read_history_t), + offsetof(spa_read_history_t, srh_link)); + + ssh->count = 0; + ssh->size = 0; + ssh->private = NULL; + + (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); + name[KSTAT_STRLEN-1] = '\0'; + + ksp = kstat_create(name, 0, "reads", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + ssh->kstat = ksp; + + if (ksp) { + ksp->ks_lock = &ssh->lock; + ksp->ks_data = NULL; + ksp->ks_private = spa; + ksp->ks_update = spa_read_history_update; + kstat_set_raw_ops(ksp, spa_read_history_headers, + spa_read_history_data, spa_read_history_addr); + kstat_install(ksp); + } +} + +static void +spa_read_history_destroy(spa_t *spa) +{ + spa_stats_history_t *ssh = &spa->spa_stats.read_history; + spa_read_history_t *srh; + kstat_t *ksp; + + ksp = ssh->kstat; + if (ksp) + kstat_delete(ksp); + + mutex_enter(&ssh->lock); + while ((srh = list_remove_head(&ssh->list))) { + ssh->size--; + kmem_free(srh, sizeof(spa_read_history_t)); + } + + ASSERT3U(ssh->size, ==, 0); + list_destroy(&ssh->list); + mutex_exit(&ssh->lock); + + mutex_destroy(&ssh->lock); +} + +void +spa_read_history_add(spa_t *spa, const zbookmark_t *zb, uint32_t aflags) +{ + spa_stats_history_t *ssh = &spa->spa_stats.read_history; + spa_read_history_t *srh, *rm; + + ASSERT3P(spa, !=, NULL); + ASSERT3P(zb, !=, NULL); + + if (zfs_read_history == 0 && ssh->size == 0) + return; + + if (zfs_read_history_hits == 0 && (aflags & ARC_CACHED)) + return; + + srh = kmem_zalloc(sizeof(spa_read_history_t), KM_PUSHPAGE); + strlcpy(srh->origin, zb->zb_func, sizeof(srh->origin)); + strlcpy(srh->comm, getcomm(), sizeof(srh->comm)); + srh->start = gethrtime(); + srh->objset = zb->zb_objset; + srh->object = zb->zb_object; + srh->level = zb->zb_level; + srh->blkid = zb->zb_blkid; + srh->aflags = aflags; + srh->pid = getpid(); + + mutex_enter(&ssh->lock); + + srh->uid = ssh->count++; + list_insert_head(&ssh->list, srh); + ssh->size++; + + while (ssh->size > zfs_read_history) { + ssh->size--; + rm = list_remove_tail(&ssh->list); + kmem_free(rm, sizeof(spa_read_history_t)); + } + + mutex_exit(&ssh->lock); +} + +void +spa_stats_init(spa_t *spa) +{ + spa_read_history_init(spa); +} + +void +spa_stats_destroy(spa_t *spa) +{ + spa_read_history_destroy(spa); +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +module_param(zfs_read_history, int, 0644); +MODULE_PARM_DESC(zfs_read_history, "Historic statistics for the last N reads"); + +module_param(zfs_read_history_hits, int, 0644); +MODULE_PARM_DESC(zfs_read_history_hits, "Include cache hits in read history"); +#endif From 0b1401ee911c5a0c0bdb7a8e6ad36840cea3af24 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 1 Oct 2013 09:50:50 -0700 Subject: [PATCH 4/7] Add visibility in to txg sync behavior This change is an attempt to add visibility in to how txgs are being formed on a system, in real time. To do this, a list was added to the in memory SPA data structure for a pool, with each element on the list corresponding to txg. These entries are then exported through the kstat interface, which can then be interpreted in userspace. For each txg, the following information is exported: * Unique txg number (uint64_t) * The time the txd was born (hrtime_t) (*not* wall clock time; relative to the other entries on the list) * The current txg state ((O)pen/(Q)uiescing/(S)yncing/(C)ommitted) * The number of reserved bytes for the txg (uint64_t) * The number of bytes read during the txg (uint64_t) * The number of bytes written during the txg (uint64_t) * The number of read operations during the txg (uint64_t) * The number of write operations during the txg (uint64_t) * The time the txg was closed (hrtime_t) * The time the txg was quiesced (hrtime_t) * The time the txg was synced (hrtime_t) Note that while the raw kstat now stores relative hrtimes for the open, quiesce, and sync times. Those relative times are used to calculate how long each state took and these deltas and printed by output handlers. Signed-off-by: Brian Behlendorf --- include/sys/spa.h | 14 ++ module/zfs/spa_stats.c | 282 +++++++++++++++++++++++++++++++++++++++++ module/zfs/txg.c | 27 +++- 3 files changed, 322 insertions(+), 1 deletion(-) diff --git a/include/sys/spa.h b/include/sys/spa.h index a4a717ac0450..85a62588aedd 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -548,12 +548,26 @@ typedef struct spa_stats_history { typedef struct spa_stats { spa_stats_history_t read_history; + spa_stats_history_t txg_history; } spa_stats_t; +typedef enum txg_state { + TXG_STATE_BIRTH = 0, + TXG_STATE_OPEN = 1, + TXG_STATE_QUIESCED = 2, + TXG_STATE_SYNCED = 3, + TXG_STATE_COMMITTED = 4, +} txg_state_t; + extern void spa_stats_init(spa_t *spa); extern void spa_stats_destroy(spa_t *spa); extern void spa_read_history_add(spa_t *spa, const zbookmark_t *zb, uint32_t aflags); +extern void spa_txg_history_add(spa_t *spa, uint64_t txg); +extern int spa_txg_history_set(spa_t *spa, uint64_t txg, + txg_state_t completed_state, hrtime_t completed_time); +extern int spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, + uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t nreserved); /* Pool configuration locks */ extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c index 271a3c312f7e..16ccbb0ad53a 100644 --- a/module/zfs/spa_stats.c +++ b/module/zfs/spa_stats.c @@ -32,6 +32,11 @@ int zfs_read_history = 0; */ int zfs_read_history_hits = 0; +/* + * Keeps stats on the last N txgs, disabled by default. + */ +int zfs_txg_history = 0; + /* * ========================================================================== * SPA Read History Routines @@ -227,15 +232,289 @@ spa_read_history_add(spa_t *spa, const zbookmark_t *zb, uint32_t aflags) mutex_exit(&ssh->lock); } +/* + * ========================================================================== + * SPA TXG History Routines + * ========================================================================== + */ + +/* + * Txg statistics - Information exported regarding each txg sync + */ + +typedef struct spa_txg_history { + uint64_t txg; /* txg id */ + txg_state_t state; /* active txg state */ + uint64_t nread; /* number of bytes read */ + uint64_t nwritten; /* number of bytes written */ + uint64_t reads; /* number of read operations */ + uint64_t writes; /* number of write operations */ + uint64_t nreserved; /* number of bytes reserved */ + hrtime_t times[TXG_STATE_COMMITTED]; /* completion times */ + list_node_t sth_link; +} spa_txg_history_t; + +static int +spa_txg_history_headers(char *buf, size_t size) +{ + size = snprintf(buf, size - 1, "%-8s %-16s %-5s %-12s %-12s %-12s " + "%-8s %-8s %-12s %-12s %-12s\n", "txg", "birth", "state", + "nreserved", "nread", "nwritten", "reads", "writes", + "otime", "qtime", "stime"); + buf[size] = '\0'; + + return (0); +} + +static int +spa_txg_history_data(char *buf, size_t size, void *data) +{ + spa_txg_history_t *sth = (spa_txg_history_t *)data; + uint64_t open = 0, quiesce = 0, sync = 0; + char state; + + switch (sth->state) { + case TXG_STATE_BIRTH: state = 'B'; break; + case TXG_STATE_OPEN: state = 'O'; break; + case TXG_STATE_QUIESCED: state = 'Q'; break; + case TXG_STATE_SYNCED: state = 'S'; break; + case TXG_STATE_COMMITTED: state = 'C'; break; + default: state = '?'; break; + } + + if (sth->times[TXG_STATE_OPEN]) + open = sth->times[TXG_STATE_OPEN] - + sth->times[TXG_STATE_BIRTH]; + + if (sth->times[TXG_STATE_QUIESCED]) + quiesce = sth->times[TXG_STATE_QUIESCED] - + sth->times[TXG_STATE_OPEN]; + + if (sth->times[TXG_STATE_SYNCED]) + sync = sth->times[TXG_STATE_SYNCED] - + sth->times[TXG_STATE_QUIESCED]; + + size = snprintf(buf, size - 1, "%-8llu %-16llu %-5c %-12llu " + "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu\n", + (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state, + (u_longlong_t)sth->nreserved, + (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten, + (u_longlong_t)sth->reads, (u_longlong_t)sth->writes, + (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)sync); + buf[size] = '\0'; + + return (0); +} + +/* + * Calculate the address for the next spa_stats_history_t entry. The + * ssh->lock will be held until ksp->ks_ndata entries are processed. + */ +static void * +spa_txg_history_addr(kstat_t *ksp, loff_t n) +{ + spa_t *spa = ksp->ks_private; + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + + ASSERT(MUTEX_HELD(&ssh->lock)); + + if (n == 0) + ssh->private = list_tail(&ssh->list); + else if (ssh->private) + ssh->private = list_prev(&ssh->list, ssh->private); + + return (ssh->private); +} + +/* + * When the kstat is written discard all spa_txg_history_t entires. The + * ssh->lock will be held until ksp->ks_ndata entries are processed. + */ +static int +spa_txg_history_update(kstat_t *ksp, int rw) +{ + spa_t *spa = ksp->ks_private; + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + + ASSERT(MUTEX_HELD(&ssh->lock)); + + if (rw == KSTAT_WRITE) { + spa_txg_history_t *sth; + + while ((sth = list_remove_head(&ssh->list))) { + ssh->size--; + kmem_free(sth, sizeof(spa_txg_history_t)); + } + + ASSERT3U(ssh->size, ==, 0); + } + + ksp->ks_ndata = ssh->size; + ksp->ks_data_size = ssh->size * sizeof(spa_txg_history_t); + + return (0); +} + +static void +spa_txg_history_init(spa_t *spa) +{ + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + char name[KSTAT_STRLEN]; + kstat_t *ksp; + + mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&ssh->list, sizeof (spa_txg_history_t), + offsetof(spa_txg_history_t, sth_link)); + + ssh->count = 0; + ssh->size = 0; + ssh->private = NULL; + + (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); + name[KSTAT_STRLEN-1] = '\0'; + + ksp = kstat_create(name, 0, "txgs", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + ssh->kstat = ksp; + + if (ksp) { + ksp->ks_lock = &ssh->lock; + ksp->ks_data = NULL; + ksp->ks_private = spa; + ksp->ks_update = spa_txg_history_update; + kstat_set_raw_ops(ksp, spa_txg_history_headers, + spa_txg_history_data, spa_txg_history_addr); + kstat_install(ksp); + } +} + +static void +spa_txg_history_destroy(spa_t *spa) +{ + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + spa_txg_history_t *sth; + kstat_t *ksp; + + ksp = ssh->kstat; + if (ksp) + kstat_delete(ksp); + + mutex_enter(&ssh->lock); + while ((sth = list_remove_head(&ssh->list))) { + ssh->size--; + kmem_free(sth, sizeof(spa_txg_history_t)); + } + + ASSERT3U(ssh->size, ==, 0); + list_destroy(&ssh->list); + mutex_exit(&ssh->lock); + + mutex_destroy(&ssh->lock); +} + +/* + * Add a new txg to historical record. + */ +void +spa_txg_history_add(spa_t *spa, uint64_t txg) +{ + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + spa_txg_history_t *sth, *rm; + + if (zfs_txg_history == 0 && ssh->size == 0) + return; + + sth = kmem_zalloc(sizeof(spa_txg_history_t), KM_PUSHPAGE); + sth->txg = txg; + sth->state = TXG_STATE_OPEN; + sth->times[TXG_STATE_BIRTH] = gethrtime(); + + mutex_enter(&ssh->lock); + + list_insert_head(&ssh->list, sth); + ssh->size++; + + while (ssh->size > zfs_txg_history) { + ssh->size--; + rm = list_remove_tail(&ssh->list); + kmem_free(rm, sizeof(spa_txg_history_t)); + } + + mutex_exit(&ssh->lock); +} + +/* + * Set txg state completion time and increment current state. + */ +int +spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, + hrtime_t completed_time) +{ + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + spa_txg_history_t *sth; + int error = ENOENT; + + if (zfs_txg_history == 0) + return (0); + + mutex_enter(&ssh->lock); + for (sth = list_head(&ssh->list); sth != NULL; + sth = list_next(&ssh->list, sth)) { + if (sth->txg == txg) { + sth->times[completed_state] = completed_time; + sth->state++; + error = 0; + break; + } + } + mutex_exit(&ssh->lock); + + return (error); +} + +/* + * Set txg IO stats. + */ +int +spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, + uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t nreserved) +{ + spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + spa_txg_history_t *sth; + int error = ENOENT; + + if (zfs_txg_history == 0) + return (0); + + mutex_enter(&ssh->lock); + for (sth = list_head(&ssh->list); sth != NULL; + sth = list_next(&ssh->list, sth)) { + if (sth->txg == txg) { + sth->nread = nread; + sth->nwritten = nwritten; + sth->reads = reads; + sth->writes = writes; + sth->nreserved = nreserved; + error = 0; + break; + } + } + mutex_exit(&ssh->lock); + + return (error); +} + void spa_stats_init(spa_t *spa) { spa_read_history_init(spa); + spa_txg_history_init(spa); } void spa_stats_destroy(spa_t *spa) { + spa_txg_history_destroy(spa); spa_read_history_destroy(spa); } @@ -245,4 +524,7 @@ MODULE_PARM_DESC(zfs_read_history, "Historic statistics for the last N reads"); module_param(zfs_read_history_hits, int, 0644); MODULE_PARM_DESC(zfs_read_history_hits, "Include cache hits in read history"); + +module_param(zfs_txg_history, int, 0644); +MODULE_PARM_DESC(zfs_txg_history, "Historic statistics for the last N txgs"); #endif diff --git a/module/zfs/txg.c b/module/zfs/txg.c index 440353a50e2e..7a3da8647d17 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -363,6 +364,9 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) ASSERT(txg == tx->tx_open_txg); tx->tx_open_txg++; + spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, gethrtime()); + spa_txg_history_add(dp->dp_spa, tx->tx_open_txg); + /* * Now that we've incremented tx_open_txg, we can let threads * enter the next transaction group. @@ -380,6 +384,8 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) cv_wait(&tc->tc_cv[g], &tc->tc_lock); mutex_exit(&tc->tc_lock); } + + spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_QUIESCED, gethrtime()); } static void @@ -451,6 +457,7 @@ txg_sync_thread(dsl_pool_t *dp) spa_t *spa = dp->dp_spa; tx_state_t *tx = &dp->dp_tx; callb_cpr_t cpr; + vdev_stat_t *vs1, *vs2; uint64_t start, delta; #ifdef _KERNEL @@ -464,6 +471,9 @@ txg_sync_thread(dsl_pool_t *dp) txg_thread_enter(tx, &cpr); + vs1 = kmem_alloc(sizeof(vdev_stat_t), KM_PUSHPAGE); + vs2 = kmem_alloc(sizeof(vdev_stat_t), KM_PUSHPAGE); + start = delta = 0; for (;;) { uint64_t timer, timeout; @@ -499,8 +509,13 @@ txg_sync_thread(dsl_pool_t *dp) txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); } - if (tx->tx_exiting) + if (tx->tx_exiting) { + kmem_free(vs2, sizeof(vdev_stat_t)); + kmem_free(vs1, sizeof(vdev_stat_t)); txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); + } + + vdev_get_stats(spa->spa_root_vdev, vs1); /* * Consume the quiesced txg which has been handed off to @@ -529,6 +544,16 @@ txg_sync_thread(dsl_pool_t *dp) * Dispatch commit callbacks to worker threads. */ txg_dispatch_callbacks(dp, txg); + + vdev_get_stats(spa->spa_root_vdev, vs2); + spa_txg_history_set_io(spa, txg, + vs2->vs_bytes[ZIO_TYPE_READ]-vs1->vs_bytes[ZIO_TYPE_READ], + vs2->vs_bytes[ZIO_TYPE_WRITE]-vs1->vs_bytes[ZIO_TYPE_WRITE], + vs2->vs_ops[ZIO_TYPE_READ]-vs1->vs_ops[ZIO_TYPE_READ], + vs2->vs_ops[ZIO_TYPE_WRITE]-vs1->vs_ops[ZIO_TYPE_WRITE], + dp->dp_space_towrite[txg & TXG_MASK] + + dp->dp_tempreserved[txg & TXG_MASK] / 2); + spa_txg_history_set(spa, txg, TXG_STATE_SYNCED, gethrtime()); } } From 2d37239a28b8b2ddc0e8312093f8d8810c6351fa Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Wed, 2 Oct 2013 11:43:52 -0700 Subject: [PATCH 5/7] Add visibility in to dmu_tx_assign times This change adds a new kstat to gain some visibility into the amount of time spent in each call to dmu_tx_assign. A histogram is exported via the new dmu_tx_assign file. The information contained in this histogram is the frequency dmu_tx_assign took to complete given an interval range. Signed-off-by: Prakash Surya Signed-off-by: Brian Behlendorf --- include/sys/spa.h | 2 + module/zfs/dmu_tx.c | 5 ++ module/zfs/spa_stats.c | 106 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+) diff --git a/include/sys/spa.h b/include/sys/spa.h index 85a62588aedd..cb3ce11bca6d 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -549,6 +549,7 @@ typedef struct spa_stats_history { typedef struct spa_stats { spa_stats_history_t read_history; spa_stats_history_t txg_history; + spa_stats_history_t tx_assign_histogram; } spa_stats_t; typedef enum txg_state { @@ -568,6 +569,7 @@ extern int spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, hrtime_t completed_time); extern int spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t nreserved); +extern void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs); /* Pool configuration locks */ extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 17982a04eca4..7b62239567f5 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -1077,12 +1077,15 @@ dmu_tx_unassign(dmu_tx_t *tx) int dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how) { + hrtime_t before; int err; ASSERT(tx->tx_txg == 0); ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT); ASSERT(!dsl_pool_sync_context(tx->tx_pool)); + before = gethrtime(); + /* If we might wait, we must not hold the config lock. */ ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool)); @@ -1097,6 +1100,8 @@ dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how) txg_rele_to_quiesce(&tx->tx_txgh); + spa_tx_assign_add_nsecs(tx->tx_pool->dp_spa, gethrtime() - before); + return (0); } diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c index 16ccbb0ad53a..789e8c3e61b7 100644 --- a/module/zfs/spa_stats.c +++ b/module/zfs/spa_stats.c @@ -504,16 +504,122 @@ spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, return (error); } +/* + * ========================================================================== + * SPA TX Assign Histogram Routines + * ========================================================================== + */ + +/* + * Tx statistics - Information exported regarding dmu_tx_assign time. + */ + +/* + * When the kstat is written zero all buckets. When the kstat is read + * count the number of trailing buckets set to zero and update ks_ndata + * such that they are not output. + */ +static int +spa_tx_assign_update(kstat_t *ksp, int rw) +{ + spa_t *spa = ksp->ks_private; + spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; + int i; + + if (rw == KSTAT_WRITE) { + for (i = 0; i < ssh->count; i++) + ((kstat_named_t *)ssh->private)[i].value.ui64 = 0; + } + + for (i = ssh->count; i > 0; i--) + if (((kstat_named_t *)ssh->private)[i-1].value.ui64 != 0) + break; + + ksp->ks_ndata = i; + ksp->ks_data_size = i * sizeof(kstat_named_t); + + return (0); +} + +static void +spa_tx_assign_init(spa_t *spa) +{ + spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; + char name[KSTAT_STRLEN]; + kstat_named_t *ks; + kstat_t *ksp; + int i; + + mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); + + ssh->count = 42; /* power of two buckets for 1ns to 2,199s */ + ssh->size = ssh->count * sizeof(kstat_named_t); + ssh->private = kmem_alloc(ssh->size, KM_SLEEP); + + (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); + name[KSTAT_STRLEN-1] = '\0'; + + for (i = 0; i < ssh->count; i++) { + ks = &((kstat_named_t *)ssh->private)[i]; + ks->data_type = KSTAT_DATA_UINT64; + ks->value.ui64 = 0; + (void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns", + (u_longlong_t)1 << i); + } + + ksp = kstat_create(name, 0, "dmu_tx_assign", "misc", + KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL); + ssh->kstat = ksp; + + if (ksp) { + ksp->ks_lock = &ssh->lock; + ksp->ks_data = ssh->private; + ksp->ks_ndata = ssh->count; + ksp->ks_data_size = ssh->size; + ksp->ks_private = spa; + ksp->ks_update = spa_tx_assign_update; + kstat_install(ksp); + } +} + +static void +spa_tx_assign_destroy(spa_t *spa) +{ + spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; + kstat_t *ksp; + + ksp = ssh->kstat; + if (ksp) + kstat_delete(ksp); + + kmem_free(ssh->private, ssh->size); + mutex_destroy(&ssh->lock); +} + +void +spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs) +{ + spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; + uint64_t idx = 0; + + while (((1 << idx) < nsecs) && (idx < ssh->size - 1)) + idx++; + + atomic_inc_64(&((kstat_named_t *)ssh->private)[idx].value.ui64); +} + void spa_stats_init(spa_t *spa) { spa_read_history_init(spa); spa_txg_history_init(spa); + spa_tx_assign_init(spa); } void spa_stats_destroy(spa_t *spa) { + spa_tx_assign_destroy(spa); spa_txg_history_destroy(spa); spa_read_history_destroy(spa); } From e0b0ca983d6897bcddf05af2c0e5d01ff66f90db Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Wed, 2 Oct 2013 17:11:19 -0700 Subject: [PATCH 6/7] Add visibility in to cached dbufs Currently there is no mechanism to inspect which dbufs are being cached by the system. There are some coarse counters in arcstats by they only give a rough idea of what's being cached. This patch aims to improve the current situation by adding a new dbufs kstat. When read this new kstat will walk all cached dbufs linked in to the dbuf_hash. For each dbuf it will dump detailed information about the buffer. It will also dump additional information about the referenced arc buffer and its related dnode. This provides a more complete view in to exactly what is being cached. With this generic infrastructure in place utilities can be written to post-process the data to understand exactly how the caching is working. For example, the data could be processed to show a list of all cached dnodes and how much space they're consuming. Or a similar list could be generated based on dnode type. Many other ways to interpret the data exist based on what kinds of questions you're trying to answer. Signed-off-by: Brian Behlendorf Signed-off-by: Prakash Surya --- include/sys/arc.h | 31 ++++++ include/sys/dbuf.h | 3 + include/sys/dmu.h | 1 + lib/libzpool/Makefile.am | 1 + module/zfs/Makefile.in | 1 + module/zfs/arc.c | 87 ++++++++++++++- module/zfs/dbuf.c | 4 + module/zfs/dbuf_stats.c | 230 +++++++++++++++++++++++++++++++++++++++ module/zfs/dmu.c | 18 +-- 9 files changed, 368 insertions(+), 8 deletions(-) create mode 100644 module/zfs/dbuf_stats.c diff --git a/include/sys/arc.h b/include/sys/arc.h index 8c10d947c386..efafb551acfb 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -92,6 +92,36 @@ typedef enum arc_space_type { ARC_SPACE_NUMTYPES } arc_space_type_t; +typedef enum arc_state_type { + ARC_STATE_ANON, + ARC_STATE_MRU, + ARC_STATE_MRU_GHOST, + ARC_STATE_MFU, + ARC_STATE_MFU_GHOST, + ARC_STATE_L2C_ONLY, + ARC_STATE_NUMTYPES +} arc_state_type_t; + +typedef struct arc_buf_info { + arc_state_type_t abi_state_type; + arc_buf_contents_t abi_state_contents; + uint64_t abi_state_index; + uint32_t abi_flags; + uint32_t abi_datacnt; + uint64_t abi_size; + uint64_t abi_spa; + uint64_t abi_access; + uint32_t abi_mru_hits; + uint32_t abi_mru_ghost_hits; + uint32_t abi_mfu_hits; + uint32_t abi_mfu_ghost_hits; + uint32_t abi_l2arc_hits; + uint32_t abi_holds; + uint64_t abi_l2arc_dattr; + uint64_t abi_l2arc_asize; + enum zio_compress abi_l2arc_compress; +} arc_buf_info_t; + void arc_space_consume(uint64_t space, arc_space_type_t type); void arc_space_return(uint64_t space, arc_space_type_t type); arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, @@ -101,6 +131,7 @@ void arc_return_buf(arc_buf_t *buf, void *tag); void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); void arc_buf_add_ref(arc_buf_t *buf, void *tag); boolean_t arc_buf_remove_ref(arc_buf_t *buf, void *tag); +void arc_buf_info(arc_buf_t *buf, arc_buf_info_t *abi, int state_index); int arc_buf_size(arc_buf_t *buf); void arc_release(arc_buf_t *buf, void *tag); int arc_released(arc_buf_t *buf); diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 8cd1fde01f9c..85e967d0fe06 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -282,6 +282,9 @@ void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end, void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); +void dbuf_stats_init(dbuf_hash_table_t *hash); +void dbuf_stats_destroy(void); + #define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode) #define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock) #define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db))) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index b0db7604da7e..fd1460d5c47c 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -660,6 +660,7 @@ extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; * If doi is NULL, just indicates whether the object exists. */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); +void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index b2d6394546a8..e4189a37885a 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -25,6 +25,7 @@ libzpool_la_SOURCES = \ $(top_srcdir)/module/zfs/bpobj.c \ $(top_srcdir)/module/zfs/bptree.c \ $(top_srcdir)/module/zfs/dbuf.c \ + $(top_srcdir)/module/zfs/dbuf_stats.c \ $(top_srcdir)/module/zfs/ddt.c \ $(top_srcdir)/module/zfs/ddt_zap.c \ $(top_srcdir)/module/zfs/dmu.c \ diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 6f0f6ef05ae1..5552436adffa 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -8,6 +8,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/arc.o $(MODULE)-objs += @top_srcdir@/module/zfs/bplist.o $(MODULE)-objs += @top_srcdir@/module/zfs/bpobj.o $(MODULE)-objs += @top_srcdir@/module/zfs/dbuf.o +$(MODULE)-objs += @top_srcdir@/module/zfs/dbuf_stats.o $(MODULE)-objs += @top_srcdir@/module/zfs/bptree.o $(MODULE)-objs += @top_srcdir@/module/zfs/ddt.o $(MODULE)-objs += @top_srcdir@/module/zfs/ddt_zap.o diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 366f5bf9b5da..9098988fddb8 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -232,6 +232,7 @@ typedef struct arc_state { uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ uint64_t arcs_size; /* total amount of data in this state */ kmutex_t arcs_mtx; + arc_state_type_t arcs_state; } arc_state_t; /* The 6 states: */ @@ -534,6 +535,11 @@ struct arc_buf_hdr { /* updated atomically */ clock_t b_arc_access; + uint32_t b_mru_hits; + uint32_t b_mru_ghost_hits; + uint32_t b_mfu_hits; + uint32_t b_mfu_ghost_hits; + uint32_t b_l2_hits; /* self protecting */ refcount_t b_refcnt; @@ -709,7 +715,8 @@ struct l2arc_buf_hdr { /* compression applied to buffer data */ enum zio_compress b_compress; /* real alloc'd buffer size depending on b_compress applied */ - int b_asize; + uint32_t b_asize; + uint32_t b_hits; /* temporary buffer holder for in-flight compressed data */ void *b_tmp_cdata; }; @@ -1137,6 +1144,54 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) return (cnt); } +/* + * Returns detailed information about a specific arc buffer. When the + * state_index argument is set the function will calculate the arc header + * list position for its arc state. Since this requires a linear traversal + * callers are strongly encourage not to do this. However, it can be helpful + * for targeted analysis so the functionality is provided. + */ +void +arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) +{ + arc_buf_hdr_t *hdr = ab->b_hdr; + arc_state_t *state = hdr->b_state; + + memset(abi, 0, sizeof(arc_buf_info_t)); + abi->abi_flags = hdr->b_flags; + abi->abi_datacnt = hdr->b_datacnt; + abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON; + abi->abi_state_contents = hdr->b_type; + abi->abi_state_index = -1; + abi->abi_size = hdr->b_size; + abi->abi_access = hdr->b_arc_access; + abi->abi_mru_hits = hdr->b_mru_hits; + abi->abi_mru_ghost_hits = hdr->b_mru_ghost_hits; + abi->abi_mfu_hits = hdr->b_mfu_hits; + abi->abi_mfu_ghost_hits = hdr->b_mfu_ghost_hits; + abi->abi_holds = refcount_count(&hdr->b_refcnt); + + if (hdr->b_l2hdr) { + abi->abi_l2arc_dattr = hdr->b_l2hdr->b_daddr; + abi->abi_l2arc_asize = hdr->b_l2hdr->b_asize; + abi->abi_l2arc_compress = hdr->b_l2hdr->b_compress; + abi->abi_l2arc_hits = hdr->b_l2hdr->b_hits; + } + + if (state && state_index && list_link_active(&hdr->b_arc_node)) { + list_t *list = &state->arcs_list[hdr->b_type]; + arc_buf_hdr_t *h; + + mutex_enter(&state->arcs_mtx); + for (h = list_head(list); h != NULL; h = list_next(list, h)) { + abi->abi_state_index++; + if (h == hdr) + break; + } + mutex_exit(&state->arcs_mtx); + } +} + /* * Move the supplied buffer to the indicated state. The mutex * for the buffer must be held by the caller. @@ -1298,6 +1353,11 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) hdr->b_spa = spa_load_guid(spa); hdr->b_state = arc_anon; hdr->b_arc_access = 0; + hdr->b_mru_hits = 0; + hdr->b_mru_ghost_hits = 0; + hdr->b_mfu_hits = 0; + hdr->b_mfu_ghost_hits = 0; + hdr->b_l2_hits = 0; buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; @@ -2670,6 +2730,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) ASSERT(list_link_active(&buf->b_arc_node)); } else { buf->b_flags &= ~ARC_PREFETCH; + atomic_inc_32(&buf->b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); } buf->b_arc_access = now; @@ -2691,6 +2752,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); arc_change_state(arc_mfu, buf, hash_lock); } + atomic_inc_32(&buf->b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); } else if (buf->b_state == arc_mru_ghost) { arc_state_t *new_state; @@ -2713,6 +2775,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) buf->b_arc_access = ddi_get_lbolt(); arc_change_state(new_state, buf, hash_lock); + atomic_inc_32(&buf->b_mru_ghost_hits); ARCSTAT_BUMP(arcstat_mru_ghost_hits); } else if (buf->b_state == arc_mfu) { /* @@ -2728,6 +2791,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) ASSERT(refcount_count(&buf->b_refcnt) == 0); ASSERT(list_link_active(&buf->b_arc_node)); } + atomic_inc_32(&buf->b_mfu_hits); ARCSTAT_BUMP(arcstat_mfu_hits); buf->b_arc_access = ddi_get_lbolt(); } else if (buf->b_state == arc_mfu_ghost) { @@ -2751,6 +2815,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); arc_change_state(new_state, buf, hash_lock); + atomic_inc_32(&buf->b_mfu_ghost_hits); ARCSTAT_BUMP(arcstat_mfu_ghost_hits); } else if (buf->b_state == arc_l2c_only) { /* @@ -3134,6 +3199,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_hits); + atomic_inc_32(&hdr->b_l2hdr->b_hits); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_PUSHPAGE); @@ -3469,6 +3535,11 @@ arc_release(arc_buf_t *buf, void *tag) nhdr->b_buf = buf; nhdr->b_state = arc_anon; nhdr->b_arc_access = 0; + nhdr->b_mru_hits = 0; + nhdr->b_mru_ghost_hits = 0; + nhdr->b_mfu_hits = 0; + nhdr->b_mfu_ghost_hits = 0; + nhdr->b_l2_hits = 0; nhdr->b_flags = flags & ARC_L2_WRITING; nhdr->b_l2hdr = NULL; nhdr->b_datacnt = 1; @@ -3485,6 +3556,11 @@ arc_release(arc_buf_t *buf, void *tag) if (hdr->b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); hdr->b_arc_access = 0; + hdr->b_mru_hits = 0; + hdr->b_mru_ghost_hits = 0; + hdr->b_mfu_hits = 0; + hdr->b_mfu_ghost_hits = 0; + hdr->b_l2_hits = 0; if (hash_lock) mutex_exit(hash_lock); @@ -3902,6 +3978,13 @@ arc_init(void) list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + arc_anon->arcs_state = ARC_STATE_ANON; + arc_mru->arcs_state = ARC_STATE_MRU; + arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST; + arc_mfu->arcs_state = ARC_STATE_MFU; + arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST; + arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY; + buf_init(); arc_thread_exit = 0; @@ -4785,6 +4868,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, l2hdr->b_compress = ZIO_COMPRESS_OFF; l2hdr->b_asize = ab->b_size; l2hdr->b_tmp_cdata = ab->b_buf->b_data; + l2hdr->b_hits = 0; buf_sz = ab->b_size; ab->b_l2hdr = l2hdr; @@ -5317,6 +5401,7 @@ l2arc_stop(void) #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(arc_read); EXPORT_SYMBOL(arc_buf_remove_ref); +EXPORT_SYMBOL(arc_buf_info); EXPORT_SYMBOL(arc_getbuf_func); EXPORT_SYMBOL(arc_add_prune_callback); EXPORT_SYMBOL(arc_remove_prune_callback); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index d655d66212ce..44e9419b7ab2 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -317,6 +317,8 @@ dbuf_init(void) for (i = 0; i < DBUF_MUTEXES; i++) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); + + dbuf_stats_init(h); } void @@ -325,6 +327,8 @@ dbuf_fini(void) dbuf_hash_table_t *h = &dbuf_hash_table; int i; + dbuf_stats_destroy(); + for (i = 0; i < DBUF_MUTEXES; i++) mutex_destroy(&h->hash_mutexes[i]); #if defined(_KERNEL) && defined(HAVE_SPL) diff --git a/module/zfs/dbuf_stats.c b/module/zfs/dbuf_stats.c new file mode 100644 index 000000000000..ef760eabaf7d --- /dev/null +++ b/module/zfs/dbuf_stats.c @@ -0,0 +1,230 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include + +/* + * Calculate the index of the arc header for the state, disabled by default. + */ +int zfs_dbuf_state_index = 0; + +/* + * ========================================================================== + * Dbuf Hash Read Routines + * ========================================================================== + */ +typedef struct dbuf_stats_t { + kmutex_t lock; + kstat_t *kstat; + dbuf_hash_table_t *hash; + int idx; +} dbuf_stats_t; + +static dbuf_stats_t dbuf_stats_hash_table; + +static int +dbuf_stats_hash_table_headers(char *buf, size_t size) +{ + size = snprintf(buf, size - 1, + "%-88s | %-124s | %s\n" + "%-16s %-8s %-8s %-8s %-8s %-8s %-8s %-5s %-5s %5s | " + "%-5s %-5s %-6s %-8s %-6s %-8s %-12s " + "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-5s | " + "%-6s %-6s %-8s %-8s %-6s %-6s %-5s %-8s %-8s\n", + "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level", + "blkid", "offset", "dbsize", "meta", "state", "dbholds", "list", + "atype", "index", "flags", "count", "asize", "access", "mru", "gmru", + "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds", + "dtype", "btype", "data_bs", "meta_bs", "bsize", + "lvls", "dholds", "blocks", "dsize"); + buf[size] = '\0'; + + return (0); +} + +int +__dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db) +{ + arc_buf_info_t abi = { 0 }; + dmu_object_info_t doi = { 0 }; + dnode_t *dn = DB_DNODE(db); + + if (db->db_buf) + arc_buf_info(db->db_buf, &abi, zfs_dbuf_state_index); + + if (dn) + __dmu_object_info_from_dnode(dn, &doi); + + size = snprintf(buf, size - 1, + "%-16s %-8llu %-8lld %-8lld %-8lld %-8llu %-8llu %-5d %-5d %-5lu | " + "%-5d %-5d %-6lld 0x%-6x %-6lu %-8llu %-12llu " + "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-5lu | " + "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-5lu %-8llu %-8llu\n", + /* dmu_buf_impl_t */ + spa_name(dn->dn_objset->os_spa), + (u_longlong_t)dmu_objset_id(db->db_objset), + (longlong_t)db->db.db_object, + (longlong_t)db->db_level, + (longlong_t)db->db_blkid, + (u_longlong_t)db->db.db_offset, + (u_longlong_t)db->db.db_size, + !!dbuf_is_metadata(db), + db->db_state, + (ulong_t)refcount_count(&db->db_holds), + /* arc_buf_info_t */ + abi.abi_state_type, + abi.abi_state_contents, + (longlong_t)abi.abi_state_index, + abi.abi_flags, + (ulong_t)abi.abi_datacnt, + (u_longlong_t)abi.abi_size, + (u_longlong_t)abi.abi_access, + (ulong_t)abi.abi_mru_hits, + (ulong_t)abi.abi_mru_ghost_hits, + (ulong_t)abi.abi_mfu_hits, + (ulong_t)abi.abi_mfu_ghost_hits, + (ulong_t)abi.abi_l2arc_hits, + (u_longlong_t)abi.abi_l2arc_dattr, + (u_longlong_t)abi.abi_l2arc_asize, + abi.abi_l2arc_compress, + (ulong_t)abi.abi_holds, + /* dmu_object_info_t */ + doi.doi_type, + doi.doi_bonus_type, + (ulong_t)doi.doi_data_block_size, + (ulong_t)doi.doi_metadata_block_size, + (u_longlong_t)doi.doi_bonus_size, + (ulong_t)doi.doi_indirection, + (ulong_t)refcount_count(&dn->dn_holds), + (u_longlong_t)doi.doi_fill_count, + (u_longlong_t)doi.doi_max_offset); + buf[size] = '\0'; + + return (size); +} + +static int +dbuf_stats_hash_table_data(char *buf, size_t size, void *data) +{ + dbuf_stats_t *dsh = (dbuf_stats_t *)data; + dbuf_hash_table_t *h = dsh->hash; + dmu_buf_impl_t *db; + int length, error = 0; + + ASSERT3S(dsh->idx, >=, 0); + ASSERT3S(dsh->idx, <=, h->hash_table_mask); + memset(buf, 0, size); + + mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); + for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) { + /* + * Returning ENOMEM will cause the data and header functions + * to be called with a larger scratch buffers. + */ + if (size < 512) { + error = ENOMEM; + break; + } + + mutex_enter(&db->db_mtx); + mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); + + length = __dbuf_stats_hash_table_data(buf, size, db); + buf += length; + size -= length; + + mutex_exit(&db->db_mtx); + mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); + } + mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); + + return (error); +} + +static void * +dbuf_stats_hash_table_addr(kstat_t *ksp, loff_t n) +{ + dbuf_stats_t *dsh = ksp->ks_private; + + ASSERT(MUTEX_HELD(&dsh->lock)); + + if (n <= dsh->hash->hash_table_mask) { + dsh->idx = n; + return (dsh); + } + + return (NULL); +} + +static void +dbuf_stats_hash_table_init(dbuf_hash_table_t *hash) +{ + dbuf_stats_t *dsh = &dbuf_stats_hash_table; + kstat_t *ksp; + + mutex_init(&dsh->lock, NULL, MUTEX_DEFAULT, NULL); + dsh->hash = hash; + + ksp = kstat_create("zfs", 0, "dbufs", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + dsh->kstat = ksp; + + if (ksp) { + ksp->ks_lock = &dsh->lock; + ksp->ks_ndata = UINT32_MAX; + ksp->ks_private = dsh; + kstat_set_raw_ops(ksp, dbuf_stats_hash_table_headers, + dbuf_stats_hash_table_data, dbuf_stats_hash_table_addr); + kstat_install(ksp); + } +} + +static void +dbuf_stats_hash_table_destroy(void) +{ + dbuf_stats_t *dsh = &dbuf_stats_hash_table; + kstat_t *ksp; + + ksp = dsh->kstat; + if (ksp) + kstat_delete(ksp); + + mutex_destroy(&dsh->lock); +} + +void +dbuf_stats_init(dbuf_hash_table_t *hash) +{ + dbuf_stats_hash_table_init(hash); +} + +void +dbuf_stats_destroy(void) +{ + dbuf_stats_hash_table_destroy(); +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +module_param(zfs_dbuf_state_index, int, 0644); +MODULE_PARM_DESC(zfs_dbuf_state_index, "Calculate arc header index"); +#endif diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index cbf4790b1799..4ec9cb46ad32 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1815,16 +1815,11 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) } void -dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) +__dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { - dnode_phys_t *dnp; + dnode_phys_t *dnp = dn->dn_phys; int i; - rw_enter(&dn->dn_struct_rwlock, RW_READER); - mutex_enter(&dn->dn_mtx); - - dnp = dn->dn_phys; - doi->doi_data_block_size = dn->dn_datablksz; doi->doi_metadata_block_size = dn->dn_indblkshift ? 1ULL << dn->dn_indblkshift : 0; @@ -1839,6 +1834,15 @@ dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) doi->doi_fill_count = 0; for (i = 0; i < dnp->dn_nblkptr; i++) doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill; +} + +void +dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) +{ + rw_enter(&dn->dn_struct_rwlock, RW_READER); + mutex_enter(&dn->dn_mtx); + + __dmu_object_info_from_dnode(dn, doi); mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); From d738d34da5b25b5e5daef966c29386468fd16263 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 25 Oct 2013 13:58:45 -0700 Subject: [PATCH 7/7] Add dbufstat.py command The dbufstat.py command was added to provide a conveniant way to easily determine what ZFS is caching. The script consumes the raw /proc/spl/kstat/zfs/dbufs kstat data can consolidates it in to a more human readable form. This was designed primarily as a tool to aid developers but it may also be useful for advanced users who want more visibility in to what the ARC is caching. When run without options dbufstat.py will default to showing a list of all objects with at least one buffer present in the cache. The total cache space consumed by that object will be printed on the right along with the object type. Similar to the arcstats.py command the -x option may used to display additional fields. Two other modes of operation are also supported by dbufstat.py and the expectation is additional display modes may be added as needed. The -t option will summerize the total number of bytes cached for each object type, and the -b option will show every dbuf currently cached. The script was designed to be consistent with arcstat.py and includes most of the same options and funcationality. Signed-off-by: Prakash Surya Signed-off-by: Brian Behlendorf --- cmd/Makefile.am | 2 +- cmd/arcstat/arcstat.py | 10 +- cmd/dbufstat/Makefile.am | 2 + cmd/dbufstat/dbufstat.py | 535 +++++++++++++++++++++++++++++++++++++++ configure.ac | 1 + 5 files changed, 544 insertions(+), 6 deletions(-) create mode 100644 cmd/dbufstat/Makefile.am create mode 100755 cmd/dbufstat/dbufstat.py diff --git a/cmd/Makefile.am b/cmd/Makefile.am index bad1af63eab9..4c3e4bbb2d24 100644 --- a/cmd/Makefile.am +++ b/cmd/Makefile.am @@ -1,2 +1,2 @@ SUBDIRS = zfs zpool zdb zhack zinject zstreamdump ztest zpios -SUBDIRS += mount_zfs fsck_zfs zvol_id vdev_id arcstat +SUBDIRS += mount_zfs fsck_zfs zvol_id vdev_id arcstat dbufstat diff --git a/cmd/arcstat/arcstat.py b/cmd/arcstat/arcstat.py index e01dd8b4c866..f703e33cd179 100755 --- a/cmd/arcstat/arcstat.py +++ b/cmd/arcstat/arcstat.py @@ -106,7 +106,7 @@ sep = " " # Default separator is 2 spaces version = "0.4" l2exist = False -cmd = ("Usage: arcstat [-hvx] [-f fields] [-o file] [-s string] [interval " +cmd = ("Usage: arcstat.py [-hvx] [-f fields] [-o file] [-s string] [interval " "[count]]\n") cur = {} d = {} @@ -136,10 +136,10 @@ def usage(): sys.stderr.write("\t -s : Override default field separator with custom " "character or string\n") sys.stderr.write("\nExamples:\n") - sys.stderr.write("\tarcstat -o /tmp/a.log 2 10\n") - sys.stderr.write("\tarcstat -s \",\" -o /tmp/a.log 2 10\n") - sys.stderr.write("\tarcstat -v\n") - sys.stderr.write("\tarcstat -f time,hit%,dh%,ph%,mh% 1\n") + sys.stderr.write("\tarcstat.py -o /tmp/a.log 2 10\n") + sys.stderr.write("\tarcstat.py -s \",\" -o /tmp/a.log 2 10\n") + sys.stderr.write("\tarcstat.py -v\n") + sys.stderr.write("\tarcstat.py -f time,hit%,dh%,ph%,mh% 1\n") sys.stderr.write("\n") sys.exit(1) diff --git a/cmd/dbufstat/Makefile.am b/cmd/dbufstat/Makefile.am new file mode 100644 index 000000000000..0548b240c308 --- /dev/null +++ b/cmd/dbufstat/Makefile.am @@ -0,0 +1,2 @@ +bin_SCRIPTS = dbufstat.py +EXTRA_DIST = $(bin_SCRIPTS) diff --git a/cmd/dbufstat/dbufstat.py b/cmd/dbufstat/dbufstat.py new file mode 100755 index 000000000000..80569b1d451e --- /dev/null +++ b/cmd/dbufstat/dbufstat.py @@ -0,0 +1,535 @@ +#!/usr/bin/python +# +# Print out statistics for all cached dmu buffers. This information +# is available through the dbufs kstat and may be post-processed as +# needed by the script. +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright (C) 2013 Lawrence Livermore National Security, LLC. +# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). +# + +import sys +import getopt +import errno + +bhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize"] +bxhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize", + "meta", "state", "dbholds", "list", "atype", "index", "flags", "count", + "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", + "l2_asize", "l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs", + "bsize", "lvls", "dholds", "blocks", "dsize"] +bincompat = ["cached", "direct", "indirect", "bonus", "spill"] + +dhdr = ["pool", "objset", "object", "dtype", "cached"] +dxhdr = ["pool", "objset", "object", "dtype", "btype", "data_bs", "meta_bs", + "bsize", "lvls", "dholds", "blocks", "dsize", "cached", "direct", + "indirect", "bonus", "spill"] +dincompat = ["level", "blkid", "offset", "dbsize", "meta", "state", "dbholds", + "list", "atype", "index", "flags", "count", "asize", "access", "mru", + "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", "aholds"] + +thdr = ["pool", "objset", "dtype", "cached"] +txhdr = ["pool", "objset", "dtype", "cached", "direct", "indirect", + "bonus", "spill"] +tincompat = ["object", "level", "blkid", "offset", "dbsize", "meta", "state", + "dbholds", "list", "atype", "index", "flags", "count", "asize", "access", + "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", "l2_comp", + "aholds", "btype", "data_bs", "meta_bs", "bsize", "lvls", "dholds", + "blocks", "dsize"] + +cols = { + # hdr: [size, scale, description] + "pool": [15, -1, "pool name"], + "objset": [6, -1, "dataset identification number"], + "object": [10, -1, "object number"], + "level": [5, -1, "indirection level of buffer"], + "blkid": [8, -1, "block number of buffer"], + "offset": [12, 1024, "offset in object of buffer"], + "dbsize": [7, 1024, "size of buffer"], + "meta": [4, -1, "is this buffer metadata?"], + "state": [5, -1, "state of buffer (read, cached, etc)"], + "dbholds": [7, 1000, "number of holds on buffer"], + "list": [4, -1, "which ARC list contains this buffer"], + "atype": [7, -1, "ARC header type (data or metadata)"], + "index": [5, -1, "buffer's index into its ARC list"], + "flags": [8, -1, "ARC read flags"], + "count": [5, -1, "ARC data count"], + "asize": [7, 1024, "size of this ARC buffer"], + "access": [10, -1, "time this ARC buffer was last accessed"], + "mru": [5, 1000, "hits while on the ARC's MRU list"], + "gmru": [5, 1000, "hits while on the ARC's MRU ghost list"], + "mfu": [5, 1000, "hits while on the ARC's MFU list"], + "gmfu": [5, 1000, "hits while on the ARC's MFU ghost list"], + "l2": [5, 1000, "hits while on the L2ARC"], + "l2_dattr": [8, -1, "L2ARC disk address/offset"], + "l2_asize": [8, 1024, "L2ARC alloc'd size (depending on compression)"], + "l2_comp": [21, -1, "L2ARC compression algorithm for buffer"], + "aholds": [6, 1000, "number of holds on this ARC buffer"], + "dtype": [27, -1, "dnode type"], + "btype": [27, -1, "bonus buffer type"], + "data_bs": [7, 1024, "data block size"], + "meta_bs": [7, 1024, "metadata block size"], + "bsize": [6, 1024, "bonus buffer size"], + "lvls": [6, -1, "number of indirection levels"], + "dholds": [6, 1000, "number of holds on dnode"], + "blocks": [8, 1000, "number of allocated blocks"], + "dsize": [12, 1024, "size of dnode"], + "cached": [6, 1024, "bytes cached for all blocks"], + "direct": [6, 1024, "bytes cached for direct blocks"], + "indirect": [8, 1024, "bytes cached for indirect blocks"], + "bonus": [5, 1024, "bytes cached for bonus buffer"], + "spill": [5, 1024, "bytes cached for spill block"], +} + +hdr = None +xhdr = None +sep = " " # Default separator is 2 spaces +cmd = ("Usage: dbufstat.py [-bdhrtvx] [-i file] [-f fields] [-o file] " + "[-s string]\n") +raw = 0 + +def print_incompat_helper(incompat): + cnt = 0 + for key in sorted(incompat): + if cnt is 0: + sys.stderr.write("\t") + elif cnt > 8: + sys.stderr.write(",\n\t") + cnt = 0 + else: + sys.stderr.write(", ") + + sys.stderr.write("%s" % key) + cnt += 1 + + sys.stderr.write("\n\n") + +def detailed_usage(): + sys.stderr.write("%s\n" % cmd) + + sys.stderr.write("Field definitions incompatible with '-b' option:\n") + print_incompat_helper(bincompat) + + sys.stderr.write("Field definitions incompatible with '-d' option:\n") + print_incompat_helper(dincompat) + + sys.stderr.write("Field definitions incompatible with '-t' option:\n") + print_incompat_helper(tincompat) + + sys.stderr.write("Field definitions are as follows:\n") + for key in sorted(cols.keys()): + sys.stderr.write("%11s : %s\n" % (key, cols[key][2])) + sys.stderr.write("\n") + + sys.exit(1) + +def usage(): + sys.stderr.write("%s\n" % cmd) + sys.stderr.write("\t -b : Print table of information for each dbuf\n") + sys.stderr.write("\t -d : Print table of information for each dnode\n") + sys.stderr.write("\t -h : Print this help message\n") + sys.stderr.write("\t -r : Print raw values\n") + sys.stderr.write("\t -t : Print table of information for each dnode type\n") + sys.stderr.write("\t -v : List all possible field headers and definitions" + "\n") + sys.stderr.write("\t -x : Print extended stats\n") + sys.stderr.write("\t -i : Redirect input from the specified file\n") + sys.stderr.write("\t -f : Specify specific fields to print (see -v)\n") + sys.stderr.write("\t -o : Redirect output to the specified file\n") + sys.stderr.write("\t -s : Override default field separator with custom " + "character or string\n") + sys.stderr.write("\nExamples:\n") + sys.stderr.write("\tdbufstat.py -d -o /tmp/d.log\n") + sys.stderr.write("\tdbufstat.py -t -s \",\" -o /tmp/t.log\n") + sys.stderr.write("\tdbufstat.py -v\n") + sys.stderr.write("\tdbufstat.py -d -f pool,object,objset,dsize,cached\n") + sys.stderr.write("\n") + + sys.exit(1) + +def prettynum(sz, scale, num=0): + global raw + + suffix = [' ', 'K', 'M', 'G', 'T', 'P', 'E', 'Z'] + index = 0 + save = 0 + + if raw or scale == -1: + return "%*s" % (sz, num) + + # Rounding error, return 0 + elif num > 0 and num < 1: + num = 0 + + while num > scale and index < 5: + save = num + num = num / scale + index += 1 + + if index == 0: + return "%*d" % (sz, num) + + if (save / scale) < 10: + return "%*.1f%s" % (sz - 1, num, suffix[index]) + else: + return "%*d%s" % (sz - 1, num, suffix[index]) + +def print_values(v): + global hdr + global sep + + try: + for col in hdr: + sys.stdout.write("%s%s" % ( + prettynum(cols[col][0], cols[col][1], v[col]), sep)) + sys.stdout.write("\n") + except IOError as e: + if e.errno == errno.EPIPE: + sys.exit(1) + +def print_header(): + global hdr + global sep + + try: + for col in hdr: + sys.stdout.write("%*s%s" % (cols[col][0], col, sep)) + sys.stdout.write("\n") + except IOError as e: + if e.errno == errno.EPIPE: + sys.exit(1) + +def get_typestring(t): + type_strings = ["DMU_OT_NONE", + # general: + "DMU_OT_OBJECT_DIRECTORY", "DMU_OT_OBJECT_ARRAY", + "DMU_OT_PACKED_NVLIST", "DMU_OT_PACKED_NVLIST_SIZE", + "DMU_OT_BPOBJ", "DMU_OT_BPOBJ_HDR", + # spa: + "DMU_OT_SPACE_MAP_HEADER", "DMU_OT_SPACE_MAP", + # zil: + "DMU_OT_INTENT_LOG", + # dmu: + "DMU_OT_DNODE", "DMU_OT_OBJSET", + # dsl: + "DMU_OT_DSL_DIR", "DMU_OT_DSL_DIR_CHILD_MAP", + "DMU_OT_DSL_DS_SNAP_MAP", "DMU_OT_DSL_PROPS", + "DMU_OT_DSL_DATASET", + # zpl: + "DMU_OT_ZNODE", "DMU_OT_OLDACL", "DMU_OT_PLAIN_FILE_CONTENTS", + "DMU_OT_DIRECTORY_CONTENTS", "DMU_OT_MASTER_NODE", + "DMU_OT_UNLINKED_SET", + # zvol: + "DMU_OT_ZVOL", "DMU_OT_ZVOL_PROP", + # other; for testing only! + "DMU_OT_PLAIN_OTHER", "DMU_OT_UINT64_OTHER", "DMU_OT_ZAP_OTHER", + # new object types: + "DMU_OT_ERROR_LOG", "DMU_OT_SPA_HISTORY", + "DMU_OT_SPA_HISTORY_OFFSETS", "DMU_OT_POOL_PROPS", + "DMU_OT_DSL_PERMS", "DMU_OT_ACL", "DMU_OT_SYSACL", + "DMU_OT_FUID", "DMU_OT_FUID_SIZE", "DMU_OT_NEXT_CLONES", + "DMU_OT_SCAN_QUEUE", "DMU_OT_USERGROUP_USED", + "DMU_OT_USERGROUP_QUOTA", "DMU_OT_USERREFS", "DMU_OT_DDT_ZAP", + "DMU_OT_DDT_STATS", "DMU_OT_SA", "DMU_OT_SA_MASTER_NODE", + "DMU_OT_SA_ATTR_REGISTRATION", "DMU_OT_SA_ATTR_LAYOUTS", + "DMU_OT_SCAN_XLATE", "DMU_OT_DEDUP", "DMU_OT_DEADLIST", + "DMU_OT_DEADLIST_HDR", "DMU_OT_DSL_CLONES", + "DMU_OT_BPOBJ_SUBOBJ"] + + # If "-rr" option is used, don't convert to string representation + if raw > 1: + return "%i" % t + + try: + return type_strings[t]; + except IndexError: + return "%i" % t + +def get_compstring(c): + comp_strings = ["ZIO_COMPRESS_INHERIT", "ZIO_COMPRESS_ON", + "ZIO_COMPRESS_OFF", "ZIO_COMPRESS_LZJB", + "ZIO_COMPRESS_EMPTY", "ZIO_COMPRESS_GZIP_1", + "ZIO_COMPRESS_GZIP_2", "ZIO_COMPRESS_GZIP_3", + "ZIO_COMPRESS_GZIP_4", "ZIO_COMPRESS_GZIP_5", + "ZIO_COMPRESS_GZIP_6", "ZIO_COMPRESS_GZIP_7", + "ZIO_COMPRESS_GZIP_8", "ZIO_COMPRESS_GZIP_9", + "ZIO_COMPRESS_ZLE", "ZIO_COMPRESS_LZ4", + "ZIO_COMPRESS_FUNCTION"] + + # If "-rr" option is used, don't convert to string representation + if raw > 1: + return "%i" % c + + try: + return comp_strings[c]; + except IndexError: + return "%i" % c + +def parse_line(line, labels): + global hdr + + new = dict() + val = None + for col in hdr: + # These are "special" fields computed in the update_dict + # function, prevent KeyError exception on labels[col] for these. + if col not in ['bonus', 'cached', 'direct', 'indirect', 'spill']: + val = line[labels[col]] + + if col in ['pool', 'flags']: + new[col] = str(val) + elif col in ['dtype', 'btype']: + new[col] = get_typestring(int(val)) + elif col in ['l2_comp']: + new[col] = get_compstring(int(val)) + else: + new[col] = int(val) + + return new + +def update_dict(d, k, line, labels): + pool = line[labels['pool']] + objset = line[labels['objset']] + key = line[labels[k]] + + dbsize = int(line[labels['dbsize']]) + blkid = int(line[labels['blkid']]) + level = int(line[labels['level']]) + + if pool not in d: + d[pool] = dict() + + if objset not in d[pool]: + d[pool][objset] = dict() + + if key not in d[pool][objset]: + d[pool][objset][key] = parse_line(line, labels) + d[pool][objset][key]['bonus'] = 0 + d[pool][objset][key]['cached'] = 0 + d[pool][objset][key]['direct'] = 0 + d[pool][objset][key]['indirect'] = 0 + d[pool][objset][key]['spill'] = 0 + + d[pool][objset][key]['cached'] += dbsize + + if blkid == -1: + d[pool][objset][key]['bonus'] += dbsize + elif blkid == -2: + d[pool][objset][key]['spill'] += dbsize + else: + if level == 0: + d[pool][objset][key]['direct'] += dbsize + else: + d[pool][objset][key]['indirect'] += dbsize + + return d + +def print_dict(d): + print_header() + for pool in d.iterkeys(): + for objset in d[pool].iterkeys(): + for v in d[pool][objset].itervalues(): + print_values(v) + +def dnodes_build_dict(filehandle): + labels = dict() + dnodes = dict() + + # First 3 lines are header information, skip the first two + for i in range(0, 2): + next(filehandle) + + # The third line contains the labels and index locations + for i, v in enumerate(next(filehandle).split()): + labels[v] = i + + # The rest of the file is buffer information + for line in filehandle: + update_dict(dnodes, 'object', line.split(), labels) + + return dnodes + +def types_build_dict(filehandle): + labels = dict() + types = dict() + + # First 3 lines are header information, skip the first two + for i in range(0, 2): + next(filehandle) + + # The third line contains the labels and index locations + for i, v in enumerate(next(filehandle).split()): + labels[v] = i + + # The rest of the file is buffer information + for line in filehandle: + update_dict(types, 'dtype', line.split(), labels) + + return types + +def buffers_print_all(filehandle): + labels = dict() + + # First 3 lines are header information, skip the first two + for i in range(0, 2): + next(filehandle) + + # The third line contains the labels and index locations + for i, v in enumerate(next(filehandle).split()): + labels[v] = i + + print_header() + + # The rest of the file is buffer information + for line in filehandle: + print_values(parse_line(line.split(), labels)) + +def main(): + global hdr + global sep + global raw + + desired_cols = None + bflag = False + dflag = False + hflag = False + ifile = None + ofile = None + tflag = False + vflag = False + xflag = False + + try: + opts, args = getopt.getopt( + sys.argv[1:], + "bdf:hi:o:rs:tvx", + [ + "buffers", + "dnodes", + "columns", + "help", + "infile", + "outfile", + "seperator", + "types", + "verbose", + "extended" + ] + ) + except getopt.error: + usage() + + for opt, arg in opts: + if opt in ('-b', '--buffers'): + bflag = True + if opt in ('-d', '--dnodes'): + dflag = True + if opt in ('-f', '--columns'): + desired_cols = arg + if opt in ('-h', '--help'): + hflag = True + if opt in ('-i', '--infile'): + ifile = arg + if opt in ('-o', '--outfile'): + ofile = arg + if opt in ('-r', '--raw'): + raw += 1 + if opt in ('-s', '--seperator'): + sep = arg + if opt in ('-t', '--types'): + tflag = True + if opt in ('-v', '--verbose'): + vflag = True + if opt in ('-x', '--extended'): + xflag = True + + if hflag or (xflag and desired_cols): + usage() + + if vflag: + detailed_usage() + + # Ensure at most only one of b, d, or t flags are set + if (bflag and dflag) or (bflag and tflag) or (dflag and tflag): + usage() + + if bflag: + hdr = bxhdr if xflag else bhdr + elif tflag: + hdr = txhdr if xflag else thdr + else: # Even if dflag is False, it's the default if none set + dflag = True + hdr = dxhdr if xflag else dhdr + + if desired_cols: + hdr = desired_cols.split(",") + + invalid = [] + incompat = [] + for ele in hdr: + if ele not in cols: + invalid.append(ele) + elif ((bflag and bincompat and ele in bincompat) or + (dflag and dincompat and ele in dincompat) or + (tflag and tincompat and ele in tincompat)): + incompat.append(ele) + + if len(invalid) > 0: + sys.stderr.write("Invalid column definition! -- %s\n" % invalid) + usage() + + if len(incompat) > 0: + sys.stderr.write("Incompatible field specified! -- %s\n" % incompat) + usage() + + if ofile: + try: + tmp = open(ofile, "w") + sys.stdout = tmp + + except: + sys.stderr.write("Cannot open %s for writing\n", ofile) + sys.exit(1) + + if not ifile: + ifile = '/proc/spl/kstat/zfs/dbufs' + + if ifile is not "-": + try: + tmp = open(ifile, "r") + sys.stdin = tmp + except: + sys.stderr.write("Cannot open %s for reading\n" % ifile) + sys.exit(1) + + if bflag: + buffers_print_all(sys.stdin) + + if dflag: + print_dict(dnodes_build_dict(sys.stdin)) + + if tflag: + print_dict(types_build_dict(sys.stdin)) + +if __name__ == '__main__': + main() diff --git a/configure.ac b/configure.ac index 08b807e3f16c..7c480942625d 100644 --- a/configure.ac +++ b/configure.ac @@ -104,6 +104,7 @@ AC_CONFIG_FILES([ cmd/zvol_id/Makefile cmd/vdev_id/Makefile cmd/arcstat/Makefile + cmd/dbufstat/Makefile module/Makefile module/avl/Makefile module/nvpair/Makefile