From 1421c89142376bfd41e4de22ed7c7846b9e41f95 Mon Sep 17 00:00:00 2001 From: Prakash Surya Date: Fri, 6 Sep 2013 16:09:05 -0700 Subject: [PATCH] Add visibility in to arc_read This change is an attempt to add visibility into the arc_read calls occurring on a system, in real time. To do this, a list was added to the in memory SPA data structure for a pool, with each element on the list corresponding to a call to arc_read. These entries are then exported through the kstat interface, which can then be interpreted in userspace. For each arc_read call, the following information is exported: * A unique identifier (uint64_t) * The time the entry was added to the list (hrtime_t) (*not* wall clock time; relative to the other entries on the list) * The objset ID (uint64_t) * The object number (uint64_t) * The indirection level (uint64_t) * The block ID (uint64_t) * The name of the function originating the arc_read call (char[24]) * The arc_flags from the arc_read call (uint32_t) * The PID of the reading thread (pid_t) * The command or name of thread originating read (char[16]) From this exported information one can see, in real time, exactly what is being read, what function is generating the read, and whether or not the read was found to be already cached. There is still some work to be done, but this should serve as a good starting point. Specifically, dbuf_read's are not accounted for in the currently exported information. Thus, a follow up patch should probably be added to export these calls that never call into arc_read (they only hit the dbuf hash table). In addition, it might be nice to create a utility similar to "arcstat.py" to digest the exported information and display it in a more readable format. Or perhaps, log the information and allow for it to be "replayed" at a later time. Signed-off-by: Prakash Surya Signed-off-by: Brian Behlendorf --- include/sys/spa.h | 21 ++++ include/sys/spa_impl.h | 2 + include/sys/zfs_context.h | 5 + include/sys/zio.h | 6 +- lib/libzpool/Makefile.am | 1 + lib/libzpool/kernel.c | 8 ++ module/zfs/Makefile.in | 1 + module/zfs/arc.c | 20 +-- module/zfs/dmu_traverse.c | 7 +- module/zfs/spa_misc.c | 4 +- module/zfs/spa_stats.c | 248 ++++++++++++++++++++++++++++++++++++++ 11 files changed, 310 insertions(+), 13 deletions(-) create mode 100644 module/zfs/spa_stats.c diff --git a/include/sys/spa.h b/include/sys/spa.h index 401ae8343e96..a4a717ac0450 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -51,6 +51,8 @@ typedef struct zilog zilog_t; typedef struct spa_aux_vdev spa_aux_vdev_t; typedef struct ddt ddt_t; typedef struct ddt_entry ddt_entry_t; +typedef struct zbookmark zbookmark_t; + struct dsl_pool; struct dsl_dataset; @@ -534,6 +536,25 @@ extern boolean_t spa_refcount_zero(spa_t *spa); #define SCL_ALL ((1 << SCL_LOCKS) - 1) #define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO) +/* Historical pool statistics */ +typedef struct spa_stats_history { + kmutex_t lock; + uint64_t count; + uint64_t size; + kstat_t *kstat; + void *private; + list_t list; +} spa_stats_history_t; + +typedef struct spa_stats { + spa_stats_history_t read_history; +} spa_stats_t; + +extern void spa_stats_init(spa_t *spa); +extern void spa_stats_destroy(spa_t *spa); +extern void spa_read_history_add(spa_t *spa, const zbookmark_t *zb, + uint32_t aflags); + /* Pool configuration locks */ extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 47dfe432eed2..107d14a76cfd 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -236,6 +236,8 @@ struct spa { uint64_t spa_deadman_calls; /* number of deadman calls */ uint64_t spa_sync_starttime; /* starting time fo spa_sync */ uint64_t spa_deadman_synctime; /* deadman expiration timer */ + spa_stats_t spa_stats; /* assorted spa statistics */ + /* * spa_refcnt & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index a126c058ebd9..dfd11d9f1e5e 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -210,6 +210,7 @@ typedef struct kthread { } kthread_t; #define curthread zk_thread_current() +#define getcomm() "unknown" #define thread_exit zk_thread_exit #define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ zk_thread_create(stk, stksize, (thread_func_t)func, arg, \ @@ -347,6 +348,10 @@ extern kstat_t *kstat_create(char *, int, char *, char *, uchar_t, ulong_t, uchar_t); extern void kstat_install(kstat_t *); extern void kstat_delete(kstat_t *); +extern void kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void *(*addr)(kstat_t *ksp, loff_t index)); /* * Kernel memory diff --git a/include/sys/zio.h b/include/sys/zio.h index 189966bef54c..f5a128e0b319 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -256,12 +256,13 @@ extern char *zio_type_name[ZIO_TYPES]; * Therefore it must not change size or alignment between 32/64 bit * compilation options. */ -typedef struct zbookmark { +struct zbookmark { uint64_t zb_objset; uint64_t zb_object; int64_t zb_level; uint64_t zb_blkid; -} zbookmark_t; + char * zb_func; +}; #define SET_BOOKMARK(zb, objset, object, level, blkid) \ { \ @@ -269,6 +270,7 @@ typedef struct zbookmark { (zb)->zb_object = object; \ (zb)->zb_level = level; \ (zb)->zb_blkid = blkid; \ + (zb)->zb_func = FTAG; \ } #define ZB_DESTROYED_OBJSET (-1ULL) diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index da81659d240c..b2d6394546a8 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -62,6 +62,7 @@ libzpool_la_SOURCES = \ $(top_srcdir)/module/zfs/spa_errlog.c \ $(top_srcdir)/module/zfs/spa_history.c \ $(top_srcdir)/module/zfs/spa_misc.c \ + $(top_srcdir)/module/zfs/spa_stats.c \ $(top_srcdir)/module/zfs/space_map.c \ $(top_srcdir)/module/zfs/txg.c \ $(top_srcdir)/module/zfs/uberblock.c \ diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index e4d645cf7797..57a3739e23e8 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -240,6 +240,14 @@ void kstat_delete(kstat_t *ksp) {} +/*ARGSUSED*/ +void +kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void *(*addr)(kstat_t *ksp, loff_t index)) +{} + /* * ========================================================================= * mutexes diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index e71228454110..6f0f6ef05ae1 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -44,6 +44,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/spa_config.o $(MODULE)-objs += @top_srcdir@/module/zfs/spa_errlog.o $(MODULE)-objs += @top_srcdir@/module/zfs/spa_history.o $(MODULE)-objs += @top_srcdir@/module/zfs/spa_misc.o +$(MODULE)-objs += @top_srcdir@/module/zfs/spa_stats.o $(MODULE)-objs += @top_srcdir@/module/zfs/space_map.o $(MODULE)-objs += @top_srcdir@/module/zfs/txg.o $(MODULE)-objs += @top_srcdir@/module/zfs/uberblock.o diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 1298c5b91bc6..366f5bf9b5da 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -2943,6 +2943,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, kmutex_t *hash_lock; zio_t *rzio; uint64_t guid = spa_load_guid(spa); + int rc = 0; top: hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), @@ -2976,10 +2977,10 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, hdr->b_acb = acb; add_reference(hdr, hash_lock, private); mutex_exit(hash_lock); - return (0); + goto out; } mutex_exit(hash_lock); - return (0); + goto out; } ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); @@ -3174,12 +3175,12 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, if (*arc_flags & ARC_NOWAIT) { zio_nowait(rzio); - return (0); + goto out; } ASSERT(*arc_flags & ARC_WAIT); if (zio_wait(rzio) == 0) - return (0); + goto out; /* l2arc read error; goto zio_read() */ } else { @@ -3203,13 +3204,18 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, rzio = zio_read(pio, spa, bp, buf->b_data, size, arc_read_done, buf, priority, zio_flags, zb); - if (*arc_flags & ARC_WAIT) - return (zio_wait(rzio)); + if (*arc_flags & ARC_WAIT) { + rc = zio_wait(rzio); + goto out; + } ASSERT(*arc_flags & ARC_NOWAIT); zio_nowait(rzio); } - return (0); + +out: + spa_read_history_add(spa, zb, *arc_flags); + return (rc); } arc_prune_t * diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 32b3e50fc005..1827ffc7413f 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -517,6 +517,9 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL); + SET_BOOKMARK(czb, td->td_objset, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + /* See comment on ZIL traversal in dsl_scan_visitds. */ if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) { uint32_t flags = ARC_WAIT; @@ -525,7 +528,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, err = arc_read(NULL, td->td_spa, rootbp, arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL); + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, czb); if (err != 0) return (err); @@ -539,8 +542,6 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, td, TQ_NOQUEUE)) pd->pd_exited = B_TRUE; - SET_BOOKMARK(czb, td->td_objset, - ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); err = traverse_visitbp(td, NULL, rootbp, czb); mutex_enter(&pd->pd_mtx); diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 9ba6e896338b..a0b4797c97cb 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -48,6 +48,7 @@ #include #include #include +#include #include "zfs_prop.h" #include "zfeature_common.h" @@ -253,7 +254,6 @@ unsigned long zfs_deadman_synctime = 1000ULL; */ int zfs_deadman_enabled = 1; - /* * ========================================================================== * SPA config locking @@ -495,6 +495,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); + spa_stats_init(spa); avl_add(&spa_namespace_avl, spa); @@ -580,6 +581,7 @@ spa_remove(spa_t *spa) refcount_destroy(&spa->spa_refcount); + spa_stats_destroy(spa); spa_config_lock_destroy(spa); for (t = 0; t < TXG_SIZE; t++) diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c new file mode 100644 index 000000000000..271a3c312f7e --- /dev/null +++ b/module/zfs/spa_stats.c @@ -0,0 +1,248 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include + +/* + * Keeps stats on last N reads per spa_t, disabled by default. + */ +int zfs_read_history = 0; + +/* + * Include cache hits in history, disabled by default. + */ +int zfs_read_history_hits = 0; + +/* + * ========================================================================== + * SPA Read History Routines + * ========================================================================== + */ + +/* + * Read statistics - Information exported regarding each arc_read call + */ +typedef struct spa_read_history { + uint64_t uid; /* unique identifier */ + hrtime_t start; /* time read completed */ + uint64_t objset; /* read from this objset */ + uint64_t object; /* read of this object number */ + uint64_t level; /* block's indirection level */ + uint64_t blkid; /* read of this block id */ + char origin[24]; /* read originated from here */ + uint32_t aflags; /* ARC flags (cached, prefetch, etc.) */ + pid_t pid; /* PID of task doing read */ + char comm[16]; /* process name of task doing read */ + list_node_t srh_link; +} spa_read_history_t; + +static int +spa_read_history_headers(char *buf, size_t size) +{ + size = snprintf(buf, size - 1, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s " + "%-24s %-8s %-16s\n", "UID", "start", "objset", "object", + "level", "blkid", "aflags", "origin", "pid", "process"); + buf[size] = '\0'; + + return (0); +} + +static int +spa_read_history_data(char *buf, size_t size, void *data) +{ + spa_read_history_t *srh = (spa_read_history_t *)data; + + size = snprintf(buf, size - 1, "%-8llu %-16llu 0x%-6llx " + "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n", + (u_longlong_t)srh->uid, srh->start, + (longlong_t)srh->objset, (longlong_t)srh->object, + (longlong_t)srh->level, (longlong_t)srh->blkid, + srh->aflags, srh->origin, srh->pid, srh->comm); + buf[size] = '\0'; + + return (0); +} + +/* + * Calculate the address for the next spa_stats_history_t entry. The + * ssh->lock will be held until ksp->ks_ndata entries are processed. + */ +static void * +spa_read_history_addr(kstat_t *ksp, loff_t n) +{ + spa_t *spa = ksp->ks_private; + spa_stats_history_t *ssh = &spa->spa_stats.read_history; + + ASSERT(MUTEX_HELD(&ssh->lock)); + + if (n == 0) + ssh->private = list_tail(&ssh->list); + else if (ssh->private) + ssh->private = list_prev(&ssh->list, ssh->private); + + return (ssh->private); +} + +/* + * When the kstat is written discard all spa_read_history_t entires. The + * ssh->lock will be held until ksp->ks_ndata entries are processed. + */ +static int +spa_read_history_update(kstat_t *ksp, int rw) +{ + spa_t *spa = ksp->ks_private; + spa_stats_history_t *ssh = &spa->spa_stats.read_history; + + if (rw == KSTAT_WRITE) { + spa_read_history_t *srh; + + while ((srh = list_remove_head(&ssh->list))) { + ssh->size--; + kmem_free(srh, sizeof(spa_read_history_t)); + } + + ASSERT3U(ssh->size, ==, 0); + } + + ksp->ks_ndata = ssh->size; + ksp->ks_data_size = ssh->size * sizeof(spa_read_history_t); + + return (0); +} + +static void +spa_read_history_init(spa_t *spa) +{ + spa_stats_history_t *ssh = &spa->spa_stats.read_history; + char name[KSTAT_STRLEN]; + kstat_t *ksp; + + mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&ssh->list, sizeof (spa_read_history_t), + offsetof(spa_read_history_t, srh_link)); + + ssh->count = 0; + ssh->size = 0; + ssh->private = NULL; + + (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); + name[KSTAT_STRLEN-1] = '\0'; + + ksp = kstat_create(name, 0, "reads", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + ssh->kstat = ksp; + + if (ksp) { + ksp->ks_lock = &ssh->lock; + ksp->ks_data = NULL; + ksp->ks_private = spa; + ksp->ks_update = spa_read_history_update; + kstat_set_raw_ops(ksp, spa_read_history_headers, + spa_read_history_data, spa_read_history_addr); + kstat_install(ksp); + } +} + +static void +spa_read_history_destroy(spa_t *spa) +{ + spa_stats_history_t *ssh = &spa->spa_stats.read_history; + spa_read_history_t *srh; + kstat_t *ksp; + + ksp = ssh->kstat; + if (ksp) + kstat_delete(ksp); + + mutex_enter(&ssh->lock); + while ((srh = list_remove_head(&ssh->list))) { + ssh->size--; + kmem_free(srh, sizeof(spa_read_history_t)); + } + + ASSERT3U(ssh->size, ==, 0); + list_destroy(&ssh->list); + mutex_exit(&ssh->lock); + + mutex_destroy(&ssh->lock); +} + +void +spa_read_history_add(spa_t *spa, const zbookmark_t *zb, uint32_t aflags) +{ + spa_stats_history_t *ssh = &spa->spa_stats.read_history; + spa_read_history_t *srh, *rm; + + ASSERT3P(spa, !=, NULL); + ASSERT3P(zb, !=, NULL); + + if (zfs_read_history == 0 && ssh->size == 0) + return; + + if (zfs_read_history_hits == 0 && (aflags & ARC_CACHED)) + return; + + srh = kmem_zalloc(sizeof(spa_read_history_t), KM_PUSHPAGE); + strlcpy(srh->origin, zb->zb_func, sizeof(srh->origin)); + strlcpy(srh->comm, getcomm(), sizeof(srh->comm)); + srh->start = gethrtime(); + srh->objset = zb->zb_objset; + srh->object = zb->zb_object; + srh->level = zb->zb_level; + srh->blkid = zb->zb_blkid; + srh->aflags = aflags; + srh->pid = getpid(); + + mutex_enter(&ssh->lock); + + srh->uid = ssh->count++; + list_insert_head(&ssh->list, srh); + ssh->size++; + + while (ssh->size > zfs_read_history) { + ssh->size--; + rm = list_remove_tail(&ssh->list); + kmem_free(rm, sizeof(spa_read_history_t)); + } + + mutex_exit(&ssh->lock); +} + +void +spa_stats_init(spa_t *spa) +{ + spa_read_history_init(spa); +} + +void +spa_stats_destroy(spa_t *spa) +{ + spa_read_history_destroy(spa); +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +module_param(zfs_read_history, int, 0644); +MODULE_PARM_DESC(zfs_read_history, "Historic statistics for the last N reads"); + +module_param(zfs_read_history_hits, int, 0644); +MODULE_PARM_DESC(zfs_read_history_hits, "Include cache hits in read history"); +#endif