From f490a4a3718faa4aaf28c773ce8d8f427795ebd6 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Thu, 1 Oct 2015 13:39:46 -0400 Subject: [PATCH] Revert "Revert "Illumos 5056 - ZFS deadlock on db_mtx and dn_holds"" This reverts commit f346623d401c387ccf7d0df36071b0ec8df31e0c. --- include/sys/dbuf.h | 5 +- include/sys/dmu.h | 132 ++++++++++++++++++++++----- include/sys/dmu_objset.h | 13 ++- include/sys/dnode.h | 3 +- include/sys/dsl_dataset.h | 10 +-- include/sys/dsl_dir.h | 3 + include/sys/sa.h | 1 - include/sys/sa_impl.h | 3 +- include/sys/spa.h | 4 + include/sys/spa_impl.h | 3 + include/sys/zap_impl.h | 3 +- include/sys/zap_leaf.h | 1 + module/zfs/dbuf.c | 182 +++++++++++++++++++++++++++----------- module/zfs/dmu_objset.c | 112 ++++++++++++++--------- module/zfs/dmu_send.c | 8 +- module/zfs/dmu_traverse.c | 2 +- module/zfs/dnode.c | 82 ++++++++++------- module/zfs/dnode_sync.c | 69 ++++++--------- module/zfs/dsl_bookmark.c | 2 +- module/zfs/dsl_dataset.c | 48 +++++----- module/zfs/dsl_deadlist.c | 2 + module/zfs/dsl_deleg.c | 2 +- module/zfs/dsl_destroy.c | 8 +- module/zfs/dsl_dir.c | 37 +++++--- module/zfs/dsl_pool.c | 2 + module/zfs/dsl_prop.c | 14 ++- module/zfs/dsl_scan.c | 8 +- module/zfs/dsl_userhold.c | 2 +- module/zfs/sa.c | 41 +++++---- module/zfs/spa.c | 19 +++- module/zfs/spa_misc.c | 47 ++++++++++ module/zfs/zap.c | 34 +++---- module/zfs/zap_micro.c | 8 +- module/zfs/zfs_ioctl.c | 4 +- module/zfs/zfs_sa.c | 3 +- module/zfs/zil.c | 2 +- 36 files changed, 613 insertions(+), 306 deletions(-) diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 8ade4c9c0b79..94d326d5716c 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -227,9 +227,8 @@ typedef struct dmu_buf_impl { /* Data which is unique to data (leaf) blocks: */ - /* stuff we store for the user (see dmu_buf_set_user) */ - void *db_user_ptr; - dmu_buf_evict_func_t *db_evict_func; + /* User callback information. */ + dmu_buf_user_t *db_user; uint8_t db_immediate_evict; uint8_t db_freed_in_flight; diff --git a/include/sys/dmu.h b/include/sys/dmu.h index de85b51b598b..855ba5cd456b 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -40,11 +40,9 @@ * dmu_spa.h. */ +#include #include -#include -#include #include -#include #include #include @@ -290,8 +288,6 @@ typedef struct dmu_buf { void *db_data; /* data in buffer */ } dmu_buf_t; -typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); - /* * The names of zap entries in the DIRECTORY_OBJECT of the MOS. */ @@ -493,36 +489,126 @@ int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); +typedef void dmu_buf_evict_func_t(void *user_ptr); + +/* + * A DMU buffer user object may be associated with a dbuf for the + * duration of its lifetime. This allows the user of a dbuf (client) + * to attach private data to a dbuf (e.g. in-core only data such as a + * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified + * when that dbuf has been evicted. Clients typically respond to the + * eviction notification by freeing their private data, thus ensuring + * the same lifetime for both dbuf and private data. + * + * The mapping from a dmu_buf_user_t to any client private data is the + * client's responsibility. All current consumers of the API with private + * data embed a dmu_buf_user_t as the first member of the structure for + * their private data. This allows conversions between the two types + * with a simple cast. Since the DMU buf user API never needs access + * to the private data, other strategies can be employed if necessary + * or convenient for the client (e.g. using container_of() to do the + * conversion for private data that cannot have the dmu_buf_user_t as + * its first member). + * + * Eviction callbacks are executed without the dbuf mutex held or any + * other type of mechanism to guarantee that the dbuf is still available. + * For this reason, users must assume the dbuf has already been freed + * and not reference the dbuf from the callback context. + * + * Users requesting "immediate eviction" are notified as soon as the dbuf + * is only referenced by dirty records (dirties == holds). Otherwise the + * notification occurs after eviction processing for the dbuf begins. + */ +typedef struct dmu_buf_user { + /* + * Asynchronous user eviction callback state. + */ + taskq_ent_t dbu_tqent; + + /* This instance's eviction function pointer. */ + dmu_buf_evict_func_t *dbu_evict_func; +#ifdef ZFS_DEBUG + /* + * Pointer to user's dbuf pointer. NULL for clients that do + * not associate a dbuf with their user data. + * + * The dbuf pointer is cleared upon eviction so as to catch + * use-after-evict bugs in clients. + */ + dmu_buf_t **dbu_clear_on_evict_dbufp; +#endif +} dmu_buf_user_t; + +/* + * Initialize the given dmu_buf_user_t instance with the eviction function + * evict_func, to be called when the user is evicted. + * + * NOTE: This function should only be called once on a given dmu_buf_user_t. + * To allow enforcement of this, dbu must already be zeroed on entry. + */ +#ifdef __lint +/* Very ugly, but it beats issuing suppression directives in many Makefiles. */ +extern void +dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func, + dmu_buf_t **clear_on_evict_dbufp); +#else /* __lint */ +static inline void +dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func, + dmu_buf_t **clear_on_evict_dbufp) +{ + ASSERT(dbu->dbu_evict_func == NULL); + ASSERT(evict_func != NULL); + dbu->dbu_evict_func = evict_func; +#ifdef ZFS_DEBUG + dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp; +#endif +} +#endif /* __lint */ + /* - * Returns NULL on success, or the existing user ptr if it's already - * been set. + * Attach user data to a dbuf and mark it for normal (when the dbuf's + * data is cleared or its reference count goes to zero) eviction processing. * - * user_ptr is for use by the user and can be obtained via dmu_buf_get_user(). + * Returns NULL on success, or the existing user if another user currently + * owns the buffer. + */ +void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user); + +/* + * Attach user data to a dbuf and mark it for immediate (its dirty and + * reference counts are equal) eviction processing. * - * If non-NULL, pageout func will be called when this buffer is being - * excised from the cache, so that you can clean up the data structure - * pointed to by user_ptr. + * Returns NULL on success, or the existing user if another user currently + * owns the buffer. + */ +void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user); + +/* + * Replace the current user of a dbuf. * - * dmu_evict_user() will call the pageout func for all buffers in a - * objset with a given pageout func. + * If given the current user of a dbuf, replaces the dbuf's user with + * "new_user" and returns the user data pointer that was replaced. + * Otherwise returns the current, and unmodified, dbuf user pointer. */ -void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, - dmu_buf_evict_func_t *pageout_func); +void *dmu_buf_replace_user(dmu_buf_t *db, + dmu_buf_user_t *old_user, dmu_buf_user_t *new_user); + /* - * set_user_ie is the same as set_user, but request immediate eviction - * when hold count goes to zero. + * Remove the specified user data for a DMU buffer. + * + * Returns the user that was removed on success, or the current user if + * another user currently owns the buffer. */ -void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr, - dmu_buf_evict_func_t *pageout_func); -void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, - void *user_ptr, dmu_buf_evict_func_t *pageout_func); -void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func); +void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); /* - * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set. + * Returns the user data (dmu_buf_user_t *) associated with this dbuf. */ void *dmu_buf_get_user(dmu_buf_t *db); +/* Block until any in-progress dmu buf user evictions complete. */ +void dmu_buf_user_evict_wait(void); + /* * Returns the blkptr associated with this dbuf, or NULL if not set. */ diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index fac41b264e8e..252df6e76a61 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -75,22 +75,25 @@ struct objset { arc_buf_t *os_phys_buf; objset_phys_t *os_phys; /* - * The following "special" dnodes have no parent and are exempt from - * dnode_move(), but they root their descendents in this objset using - * handles anyway, so that all access to dnodes from dbufs consistently - * uses handles. + * The following "special" dnodes have no parent, are exempt + * from dnode_move(), and are not recorded in os_dnodes, but they + * root their descendents in this objset using handles anyway, so + * that all access to dnodes from dbufs consistently uses handles. */ dnode_handle_t os_meta_dnode; dnode_handle_t os_userused_dnode; dnode_handle_t os_groupused_dnode; zilog_t *os_zil; + list_node_t os_evicting_node; + /* can change, under dsl_dir's locks: */ enum zio_checksum os_checksum; enum zio_compress os_compress; uint8_t os_copies; enum zio_checksum os_dedup_checksum; boolean_t os_dedup_verify; + boolean_t os_evicting; zfs_logbias_op_t os_logbias; zfs_cache_type_t os_primary_cache; zfs_cache_type_t os_secondary_cache; @@ -172,6 +175,8 @@ int dmu_objset_userspace_upgrade(objset_t *os); boolean_t dmu_objset_userspace_present(objset_t *os); int dmu_fsname(const char *snapname, char *buf); +void dmu_objset_evict_done(objset_t *os); + /* Code for handling userspace interface */ extern const char *dmu_objset_types[]; diff --git a/include/sys/dnode.h b/include/sys/dnode.h index b727f213f707..50e01155903a 100644 --- a/include/sys/dnode.h +++ b/include/sys/dnode.h @@ -278,6 +278,7 @@ typedef struct dnode_handle { } dnode_handle_t; typedef struct dnode_children { + dmu_buf_user_t dnc_dbu; /* User evict data */ size_t dnc_count; /* number of children */ dnode_handle_t dnc_children[]; /* sized dynamically */ } dnode_children_t; @@ -288,7 +289,7 @@ typedef struct free_range { uint64_t fr_nblks; } free_range_t; -dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp, +void dnode_special_open(struct objset *dd, dnode_phys_t *dnp, uint64_t object, dnode_handle_t *dnh); void dnode_special_close(dnode_handle_t *dnh); diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h index 5bfa3b40abc0..a56e344a099d 100644 --- a/include/sys/dsl_dataset.h +++ b/include/sys/dsl_dataset.h @@ -133,11 +133,14 @@ typedef struct dsl_dataset_phys { } dsl_dataset_phys_t; typedef struct dsl_dataset { + dmu_buf_user_t ds_dbu; + /* Immutable: */ struct dsl_dir *ds_dir; dmu_buf_t *ds_dbuf; uint64_t ds_object; uint64_t ds_fsid_guid; + boolean_t ds_is_snapshot; /* only used in syncing context, only valid for non-snapshots: */ struct dsl_dataset *ds_prev; @@ -198,11 +201,8 @@ dsl_dataset_phys(dsl_dataset_t *ds) */ #define MAX_TAG_PREFIX_LEN 17 -static inline boolean_t -dsl_dataset_is_snapshot(dsl_dataset_t *ds) -{ - return (dsl_dataset_phys(ds)->ds_num_children != 0); -} +#define dsl_dataset_is_snapshot(ds) \ + (dsl_dataset_phys(ds)->ds_num_children != 0) #define DS_UNIQUE_IS_ACCURATE(ds) \ ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0) diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h index 041d5cd0786b..b16b1b2a3a28 100644 --- a/include/sys/dsl_dir.h +++ b/include/sys/dsl_dir.h @@ -85,6 +85,8 @@ typedef struct dsl_dir_phys { } dsl_dir_phys_t; struct dsl_dir { + dmu_buf_user_t dd_dbu; + /* These are immutable; no lock needed: */ uint64_t dd_object; dsl_pool_t *dd_pool; @@ -123,6 +125,7 @@ struct dsl_dataset; typedef struct dsl_dataset dsl_dataset_t; void dsl_dir_rele(dsl_dir_t *dd, void *tag); +void dsl_dir_async_rele(dsl_dir_t *dd, void *tag); int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, dsl_dir_t **, const char **tail); int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, diff --git a/include/sys/sa.h b/include/sys/sa.h index 7b5b03a5629f..48e3bcd7cdf3 100644 --- a/include/sys/sa.h +++ b/include/sys/sa.h @@ -133,7 +133,6 @@ int sa_update_from_cb(sa_handle_t *, sa_attr_type_t, uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *); void sa_object_info(sa_handle_t *, dmu_object_info_t *); void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *); -void sa_update_user(sa_handle_t *, sa_handle_t *); void *sa_get_userdata(sa_handle_t *); void sa_set_userp(sa_handle_t *, void *); dmu_buf_t *sa_get_db(sa_handle_t *); diff --git a/include/sys/sa_impl.h b/include/sys/sa_impl.h index 24a7ad04230c..6f2f1db6dcf9 100644 --- a/include/sys/sa_impl.h +++ b/include/sys/sa_impl.h @@ -209,11 +209,12 @@ typedef enum sa_data_op { */ struct sa_handle { + dmu_buf_user_t sa_dbu; kmutex_t sa_lock; dmu_buf_t *sa_bonus; dmu_buf_t *sa_spill; objset_t *sa_os; - void *sa_userp; + void *sa_userp; sa_idx_tab_t *sa_bonus_tab; /* idx of bonus */ sa_idx_tab_t *sa_spill_tab; /* only present if spill activated */ }; diff --git a/include/sys/spa.h b/include/sys/spa.h index 118e427c8fe4..5dc9084dad6b 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -690,6 +690,7 @@ extern spa_t *spa_next(spa_t *prev); /* Refcount functions */ extern void spa_open_ref(spa_t *spa, void *tag); extern void spa_close(spa_t *spa, void *tag); +extern void spa_async_close(spa_t *spa, void *tag); extern boolean_t spa_refcount_zero(spa_t *spa); #define SCL_NONE 0x00 @@ -800,6 +801,9 @@ extern uint64_t spa_version(spa_t *spa); extern boolean_t spa_deflate(spa_t *spa); extern metaslab_class_t *spa_normal_class(spa_t *spa); extern metaslab_class_t *spa_log_class(spa_t *spa); +extern void spa_evicting_os_register(spa_t *, objset_t *os); +extern void spa_evicting_os_deregister(spa_t *, objset_t *os); +extern void spa_evicting_os_wait(spa_t *spa); extern int spa_max_replication(spa_t *spa); extern int spa_prev_software_version(spa_t *spa); extern uint8_t spa_get_failmode(spa_t *spa); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index ad4597bfff81..0b49c7147b10 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -145,6 +145,9 @@ struct spa { uint64_t spa_claim_max_txg; /* highest claimed birth txg */ timespec_t spa_loaded_ts; /* 1st successful open time */ objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */ + kmutex_t spa_evicting_os_lock; /* Evicting objset list lock */ + list_t spa_evicting_os_list; /* Objsets being evicted. */ + kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ vdev_t *spa_root_vdev; /* top-level vdev container */ int spa_min_ashift; /* of vdevs in normal class */ diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h index 3ba617f2988d..bfd43e31da80 100644 --- a/include/sys/zap_impl.h +++ b/include/sys/zap_impl.h @@ -140,6 +140,7 @@ typedef struct zap_phys { typedef struct zap_table_phys zap_table_phys_t; typedef struct zap { + dmu_buf_user_t zap_dbu; objset_t *zap_objset; uint64_t zap_object; struct dmu_buf *zap_dbuf; @@ -196,7 +197,7 @@ boolean_t zap_match(zap_name_t *zn, const char *matchname); int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp); void zap_unlockdir(zap_t *zap); -void zap_evict(dmu_buf_t *db, void *vmzap); +void zap_evict(void *dbu); zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt); void zap_name_free(zap_name_t *zn); int zap_hashbits(zap_t *zap); diff --git a/include/sys/zap_leaf.h b/include/sys/zap_leaf.h index d78ec21941cf..e784c5963b2e 100644 --- a/include/sys/zap_leaf.h +++ b/include/sys/zap_leaf.h @@ -153,6 +153,7 @@ typedef union zap_leaf_chunk { } zap_leaf_chunk_t; typedef struct zap_leaf { + dmu_buf_user_t l_dbu; krwlock_t l_rwlock; uint64_t l_blkid; /* 1<db_user == NULL) + return; + + /* Only data blocks support the attachment of user data. */ + ASSERT(db->db_level == 0); + + /* Clients must resolve a dbuf before attaching user data. */ + ASSERT(db->db.db_data != NULL); + ASSERT3U(db->db_state, ==, DB_CACHED); + + holds = refcount_count(&db->db_holds); + if (verify_type == DBVU_EVICTING) { + /* + * Immediate eviction occurs when holds == dirtycnt. + * For normal eviction buffers, holds is zero on + * eviction, except when dbuf_fix_old_data() calls + * dbuf_clear_data(). However, the hold count can grow + * during eviction even though db_mtx is held (see + * dmu_bonus_hold() for an example), so we can only + * test the generic invariant that holds >= dirtycnt. + */ + ASSERT3U(holds, >=, db->db_dirtycnt); + } else { + if (db->db_immediate_evict == TRUE) + ASSERT3U(holds, >=, db->db_dirtycnt); + else + ASSERT3U(holds, >, 0); + } +#endif +} + static void dbuf_evict_user(dmu_buf_impl_t *db) { + dmu_buf_user_t *dbu = db->db_user; + ASSERT(MUTEX_HELD(&db->db_mtx)); - if (db->db_level != 0 || db->db_evict_func == NULL) + if (dbu == NULL) return; - db->db_evict_func(&db->db, db->db_user_ptr); - db->db_user_ptr = NULL; - db->db_evict_func = NULL; + dbuf_verify_user(db, DBVU_EVICTING); + db->db_user = NULL; + +#ifdef ZFS_DEBUG + if (dbu->dbu_clear_on_evict_dbufp != NULL) + *dbu->dbu_clear_on_evict_dbufp = NULL; +#endif + + /* + * Invoke the callback from a taskq to avoid lock order reversals + * and limit stack depth. + */ + taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0, + &dbu->dbu_tqent); } boolean_t @@ -348,6 +409,12 @@ dbuf_init(void) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); dbuf_stats_init(h); + + /* + * All entries are queued via taskq_dispatch_ent(), so min/maxalloc + * configuration is not required. + */ + dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0); } void @@ -370,6 +437,7 @@ dbuf_fini(void) kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); #endif kmem_cache_destroy(dbuf_cache); + taskq_destroy(dbu_evict_taskq); } /* @@ -487,22 +555,28 @@ dbuf_verify(dmu_buf_impl_t *db) } #endif +static void +dbuf_clear_data(dmu_buf_impl_t *db) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + dbuf_evict_user(db); + db->db_buf = NULL; + db->db.db_data = NULL; + if (db->db_state != DB_NOFILL) + db->db_state = DB_UNCACHED; +} + static void dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) { ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(buf != NULL); + db->db_buf = buf; - if (buf != NULL) { - ASSERT(buf->b_data != NULL); - db->db.db_data = buf->b_data; - if (!arc_released(buf)) - arc_set_callback(buf, dbuf_do_evict, db); - } else { - dbuf_evict_user(db); - db->db.db_data = NULL; - if (db->db_state != DB_NOFILL) - db->db_state = DB_UNCACHED; - } + ASSERT(buf->b_data != NULL); + db->db.db_data = buf->b_data; + if (!arc_released(buf)) + arc_set_callback(buf, dbuf_do_evict, db); } /* @@ -524,7 +598,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); - dbuf_set_data(db, NULL); + dbuf_clear_data(db); mutex_exit(&db->db_mtx); } return (abuf); @@ -764,7 +838,7 @@ dbuf_noread(dmu_buf_impl_t *db) dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); db->db_state = DB_FILL; } else if (db->db_state == DB_NOFILL) { - dbuf_set_data(db, NULL); + dbuf_clear_data(db); } else { ASSERT3U(db->db_state, ==, DB_CACHED); } @@ -820,7 +894,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); } else { - dbuf_set_data(db, NULL); + dbuf_clear_data(db); } } @@ -871,7 +945,8 @@ void dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, dmu_tx_t *tx) { - dmu_buf_impl_t *db, *db_next, *db_search; + dmu_buf_impl_t *db_search; + dmu_buf_impl_t *db, *db_next; uint64_t txg = tx->tx_txg; avl_index_t where; boolean_t freespill = @@ -1455,7 +1530,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) arc_buf_t *buf = db->db_buf; ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); - dbuf_set_data(db, NULL); + dbuf_clear_data(db); VERIFY(arc_buf_remove_ref(buf, db)); dbuf_evict(db); return (B_TRUE); @@ -1804,8 +1879,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_parent = parent; db->db_blkptr = blkptr; - db->db_user_ptr = NULL; - db->db_evict_func = NULL; + db->db_user = NULL; db->db_immediate_evict = 0; db->db_freed_in_flight = 0; @@ -2316,7 +2390,7 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) /* * This dbuf has anonymous data associated with it. */ - dbuf_set_data(db, NULL); + dbuf_clear_data(db); VERIFY(arc_buf_remove_ref(buf, db)); dbuf_evict(db); } else { @@ -2349,7 +2423,8 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) } else { dbuf_clear(db); } - } else if (arc_buf_eviction_needed(db->db_buf)) { + } else if (db->db_objset->os_evicting || + arc_buf_eviction_needed(db->db_buf)) { dbuf_clear(db); } else { mutex_exit(&db->db_mtx); @@ -2368,51 +2443,57 @@ dbuf_refcount(dmu_buf_impl_t *db) } void * -dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, - dmu_buf_evict_func_t *evict_func) +dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, + dmu_buf_user_t *new_user) { - return (dmu_buf_update_user(db_fake, NULL, user_ptr, evict_func)); + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + mutex_enter(&db->db_mtx); + dbuf_verify_user(db, DBVU_NOT_EVICTING); + if (db->db_user == old_user) + db->db_user = new_user; + else + old_user = db->db_user; + dbuf_verify_user(db, DBVU_NOT_EVICTING); + mutex_exit(&db->db_mtx); + + return (old_user); } void * -dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, - dmu_buf_evict_func_t *evict_func) +dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - db->db_immediate_evict = TRUE; - return (dmu_buf_update_user(db_fake, NULL, user_ptr, evict_func)); + return (dmu_buf_replace_user(db_fake, NULL, user)); } void * -dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, - dmu_buf_evict_func_t *evict_func) +dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - ASSERT(db->db_level == 0); - - ASSERT((user_ptr == NULL) == (evict_func == NULL)); - - mutex_enter(&db->db_mtx); - if (db->db_user_ptr == old_user_ptr) { - db->db_user_ptr = user_ptr; - db->db_evict_func = evict_func; - } else { - old_user_ptr = db->db_user_ptr; - } + db->db_immediate_evict = TRUE; + return (dmu_buf_set_user(db_fake, user)); +} - mutex_exit(&db->db_mtx); - return (old_user_ptr); +void * +dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) +{ + return (dmu_buf_replace_user(db_fake, user, NULL)); } void * dmu_buf_get_user(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - ASSERT(!refcount_is_zero(&db->db_holds)); - return (db->db_user_ptr); + dbuf_verify_user(db, DBVU_NOT_EVICTING); + return (db->db_user); +} + +void +dmu_buf_user_evict_wait() +{ + taskq_wait(dbu_evict_taskq); } boolean_t @@ -3084,7 +3165,6 @@ EXPORT_SYMBOL(dbuf_refcount); EXPORT_SYMBOL(dbuf_sync_list); EXPORT_SYMBOL(dmu_buf_set_user); EXPORT_SYMBOL(dmu_buf_set_user_ie); -EXPORT_SYMBOL(dmu_buf_update_user); EXPORT_SYMBOL(dmu_buf_get_user); EXPORT_SYMBOL(dmu_buf_freeable); EXPORT_SYMBOL(dmu_buf_get_blkptr); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 589a5cf64e4e..49bf273e4d30 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -371,7 +371,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os); } - if (!dsl_dataset_is_snapshot(ds)) { + if (!ds->ds_is_snapshot) { if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_CHECKSUM), @@ -433,7 +433,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, os->os_secondary_cache = ZFS_CACHE_ALL; } - if (ds == NULL || !dsl_dataset_is_snapshot(ds)) + if (ds == NULL || !ds->ds_is_snapshot) os->os_zil_header = os->os_phys->os_zil_header; os->os_zil = zil_alloc(os, &os->os_zil_header); @@ -448,20 +448,19 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); + list_link_init(&os->os_evicting_node); + mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); - DMU_META_DNODE(os) = dnode_special_open(os, - &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT, - &os->os_meta_dnode); + dnode_special_open(os, &os->os_phys->os_meta_dnode, + DMU_META_DNODE_OBJECT, &os->os_meta_dnode); if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { - DMU_USERUSED_DNODE(os) = dnode_special_open(os, - &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT, - &os->os_userused_dnode); - DMU_GROUPUSED_DNODE(os) = dnode_special_open(os, - &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT, - &os->os_groupused_dnode); + dnode_special_open(os, &os->os_phys->os_userused_dnode, + DMU_USERUSED_OBJECT, &os->os_userused_dnode); + dnode_special_open(os, &os->os_phys->os_groupused_dnode, + DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode); } *osp = os; @@ -629,41 +628,57 @@ dmu_objset_disown(objset_t *os, void *tag) void dmu_objset_evict_dbufs(objset_t *os) { + dnode_t *dn_marker; dnode_t *dn; - mutex_enter(&os->os_lock); + dn_marker = kmem_alloc(sizeof (dnode_t), KM_SLEEP); - /* process the mdn last, since the other dnodes have holds on it */ - list_remove(&os->os_dnodes, DMU_META_DNODE(os)); - list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os)); + mutex_enter(&os->os_lock); + dn = list_head(&os->os_dnodes); + while (dn != NULL) { + /* + * Skip dnodes without holds. We have to do this dance + * because dnode_add_ref() only works if there is already a + * hold. If the dnode has no holds, then it has no dbufs. + */ + if (dnode_add_ref(dn, FTAG)) { + list_insert_after(&os->os_dnodes, dn, dn_marker); + mutex_exit(&os->os_lock); - /* - * Find the first dnode with holds. We have to do this dance - * because dnode_add_ref() only works if you already have a - * hold. If there are no holds then it has no dbufs so OK to - * skip. - */ - for (dn = list_head(&os->os_dnodes); - dn && !dnode_add_ref(dn, FTAG); - dn = list_next(&os->os_dnodes, dn)) - continue; + dnode_evict_dbufs(dn); + dnode_rele(dn, FTAG); - while (dn) { - dnode_t *next_dn = dn; + mutex_enter(&os->os_lock); + dn = list_next(&os->os_dnodes, dn_marker); + list_remove(&os->os_dnodes, dn_marker); + } else { + dn = list_next(&os->os_dnodes, dn); + } + } + mutex_exit(&os->os_lock); - do { - next_dn = list_next(&os->os_dnodes, next_dn); - } while (next_dn && !dnode_add_ref(next_dn, FTAG)); + kmem_free(dn_marker, sizeof (dnode_t)); - mutex_exit(&os->os_lock); - dnode_evict_dbufs(dn); - dnode_rele(dn, FTAG); - mutex_enter(&os->os_lock); - dn = next_dn; + if (DMU_USERUSED_DNODE(os) != NULL) { + dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os)); + dnode_evict_dbufs(DMU_USERUSED_DNODE(os)); } - mutex_exit(&os->os_lock); + dnode_evict_dbufs(DMU_META_DNODE(os)); } +/* + * Objset eviction processing is split into into two pieces. + * The first marks the objset as evicting, evicts any dbufs that + * have a refcount of zero, and then queues up the objset for the + * second phase of eviction. Once os->os_dnodes has been cleared by + * dnode_buf_pageout()->dnode_destroy(), the second phase is executed. + * The second phase closes the special dnodes, dequeues the objset from + * the list of those undergoing eviction, and finally frees the objset. + * + * NOTE: Due to asynchronous eviction processing (invocation of + * dnode_buf_pageout()), it is possible for the meta dnode for the + * objset to have no holds even though os->os_dnodes is not empty. + */ void dmu_objset_evict(objset_t *os) { @@ -675,7 +690,7 @@ dmu_objset_evict(objset_t *os) ASSERT(!dmu_objset_is_dirty(os, t)); if (ds) { - if (!dsl_dataset_is_snapshot(ds)) { + if (!ds->ds_is_snapshot) { VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum_changed_cb, os)); @@ -712,8 +727,24 @@ dmu_objset_evict(objset_t *os) if (os->os_sa) sa_tear_down(os); + os->os_evicting = B_TRUE; dmu_objset_evict_dbufs(os); + mutex_enter(&os->os_lock); + spa_evicting_os_register(os->os_spa, os); + if (list_is_empty(&os->os_dnodes)) { + mutex_exit(&os->os_lock); + dmu_objset_evict_done(os); + } else { + mutex_exit(&os->os_lock); + } +} + +void +dmu_objset_evict_done(objset_t *os) +{ + ASSERT3P(list_head(&os->os_dnodes), ==, NULL); + dnode_special_close(&os->os_meta_dnode); if (DMU_USERUSED_DNODE(os)) { dnode_special_close(&os->os_userused_dnode); @@ -721,8 +752,6 @@ dmu_objset_evict(objset_t *os) } zil_free(os->os_zil); - ASSERT3P(list_head(&os->os_dnodes), ==, NULL); - VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); /* @@ -737,6 +766,7 @@ dmu_objset_evict(objset_t *os) mutex_destroy(&os->os_lock); mutex_destroy(&os->os_obj_lock); mutex_destroy(&os->os_user_ptr_lock); + spa_evicting_os_deregister(os->os_spa, os); kmem_free(os, sizeof (objset_t)); } @@ -935,7 +965,7 @@ dmu_objset_clone_check(void *arg, dmu_tx_t *tx) return (error); /* You can only clone snapshots, not the head datasets. */ - if (!dsl_dataset_is_snapshot(origin)) { + if (!origin->ds_is_snapshot) { dsl_dataset_rele(origin, FTAG); return (SET_ERROR(EINVAL)); } @@ -1515,7 +1545,7 @@ int dmu_objset_is_snapshot(objset_t *os) { if (os->os_dsl_dataset != NULL) - return (dsl_dataset_is_snapshot(os->os_dsl_dataset)); + return (os->os_dsl_dataset->ds_is_snapshot); else return (B_FALSE); } diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index a53732eb8752..b2d844eb4256 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -639,7 +639,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, fromtxg = fromzb->zbm_creation_txg; } dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); - if (!dsl_dataset_is_snapshot(ds)) { + if (!ds->ds_is_snapshot) { (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", sizeof (drr->drr_u.drr_begin.drr_toname)); } @@ -879,11 +879,11 @@ dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep) ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); /* tosnap must be a snapshot */ - if (!dsl_dataset_is_snapshot(ds)) + if (!ds->ds_is_snapshot) return (SET_ERROR(EINVAL)); /* fromsnap, if provided, must be a snapshot */ - if (fromds != NULL && !dsl_dataset_is_snapshot(fromds)) + if (fromds != NULL && !fromds->ds_is_snapshot) return (SET_ERROR(EINVAL)); /* @@ -1158,7 +1158,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) dsl_dataset_rele(ds, FTAG); return (error); } - if (!dsl_dataset_is_snapshot(origin)) { + if (!origin->ds_is_snapshot) { dsl_dataset_rele(origin, FTAG); dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 38c3e318aeeb..12d099bfd414 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -546,7 +546,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); /* See comment on ZIL traversal in dsl_scan_visitds. */ - if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) { + if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) { uint32_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; arc_buf_t *buf; diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 71ec8782c3e7..2858bbfb492e 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -404,8 +404,9 @@ static dnode_t * dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, uint64_t object, dnode_handle_t *dnh) { - dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); + dnode_t *dn; + dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); ASSERT(!POINTER_IS_VALID(dn->dn_objset)); dn->dn_moved = 0; @@ -442,13 +443,31 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); mutex_enter(&os->os_lock); - list_insert_head(&os->os_dnodes, dn); + if (dnh->dnh_dnode != NULL) { + /* Lost the allocation race. */ + mutex_exit(&os->os_lock); + kmem_cache_free(dnode_cache, dn); + return (dnh->dnh_dnode); + } + + /* + * Exclude special dnodes from os_dnodes so an empty os_dnodes + * signifies that the special dnodes have no references from + * their children (the entries in os_dnodes). This allows + * dnode_destroy() to easily determine if the last child has + * been removed and then complete eviction of the objset. + */ + if (!DMU_OBJECT_IS_SPECIAL(object)) + list_insert_head(&os->os_dnodes, dn); membar_producer(); + /* - * Everything else must be valid before assigning dn_objset makes the - * dnode eligible for dnode_move(). + * Everything else must be valid before assigning dn_objset + * makes the dnode eligible for dnode_move(). */ dn->dn_objset = os; + + dnh->dnh_dnode = dn; mutex_exit(&os->os_lock); arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER); @@ -462,12 +481,18 @@ static void dnode_destroy(dnode_t *dn) { objset_t *os = dn->dn_objset; + boolean_t complete_os_eviction = B_FALSE; ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0); mutex_enter(&os->os_lock); POINTER_INVALIDATE(&dn->dn_objset); - list_remove(&os->os_dnodes, dn); + if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { + list_remove(&os->os_dnodes, dn); + complete_os_eviction = + list_is_empty(&os->os_dnodes) && + list_link_active(&os->os_evicting_node); + } mutex_exit(&os->os_lock); /* the dnode can no longer move, so we can release the handle */ @@ -502,6 +527,9 @@ dnode_destroy(dnode_t *dn) dmu_zfetch_rele(&dn->dn_zfetch); kmem_cache_free(dnode_cache, dn); arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER); + + if (complete_os_eviction) + dmu_objset_evict_done(os); } void @@ -968,33 +996,32 @@ dnode_special_close(dnode_handle_t *dnh) */ while (refcount_count(&dn->dn_holds) > 0) delay(1); + ASSERT(dn->dn_dbuf == NULL || + dmu_buf_get_user(&dn->dn_dbuf->db) == NULL); zrl_add(&dnh->dnh_zrlock); dnode_destroy(dn); /* implicit zrl_remove() */ zrl_destroy(&dnh->dnh_zrlock); dnh->dnh_dnode = NULL; } -dnode_t * +void dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, dnode_handle_t *dnh) { - dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh); - dnh->dnh_dnode = dn; + dnode_t *dn; + + dn = dnode_create(os, dnp, NULL, object, dnh); zrl_init(&dnh->dnh_zrlock); DNODE_VERIFY(dn); - return (dn); } static void -dnode_buf_pageout(dmu_buf_t *db, void *arg) +dnode_buf_pageout(void *dbu) { - dnode_children_t *children_dnodes = arg; + dnode_children_t *children_dnodes = dbu; int i; - int epb = db->db_size >> DNODE_SHIFT; - ASSERT(epb == children_dnodes->dnc_count); - - for (i = 0; i < epb; i++) { + for (i = 0; i < children_dnodes->dnc_count; i++) { dnode_handle_t *dnh = &children_dnodes->dnc_children[i]; dnode_t *dn; @@ -1024,7 +1051,7 @@ dnode_buf_pageout(dmu_buf_t *db, void *arg) dnh->dnh_dnode = NULL; } kmem_free(children_dnodes, sizeof (dnode_children_t) + - epb * sizeof (dnode_handle_t)); + children_dnodes->dnc_count * sizeof (dnode_handle_t)); } /* @@ -1108,16 +1135,17 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, if (children_dnodes == NULL) { int i; dnode_children_t *winner; - children_dnodes = kmem_alloc(sizeof (dnode_children_t) + + children_dnodes = kmem_zalloc(sizeof (dnode_children_t) + epb * sizeof (dnode_handle_t), KM_SLEEP); children_dnodes->dnc_count = epb; dnh = &children_dnodes->dnc_children[0]; for (i = 0; i < epb; i++) { zrl_init(&dnh[i].dnh_zrlock); - dnh[i].dnh_dnode = NULL; } - if ((winner = dmu_buf_set_user(&db->db, children_dnodes, - dnode_buf_pageout))) { + dmu_buf_init_user(&children_dnodes->dnc_dbu, + dnode_buf_pageout, NULL); + winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu); + if (winner != NULL) { for (i = 0; i < epb; i++) { zrl_destroy(&dnh[i].dnh_zrlock); @@ -1132,17 +1160,11 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, dnh = &children_dnodes->dnc_children[idx]; zrl_add(&dnh->dnh_zrlock); - if ((dn = dnh->dnh_dnode) == NULL) { + dn = dnh->dnh_dnode; + if (dn == NULL) { dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx; - dnode_t *winner; dn = dnode_create(os, phys, db, object, dnh); - winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn); - if (winner != NULL) { - zrl_add(&dnh->dnh_zrlock); - dnode_destroy(dn); /* implicit zrl_remove() */ - dn = winner; - } } mutex_enter(&dn->dn_mtx); @@ -1156,10 +1178,10 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, dbuf_rele(db, FTAG); return (type == DMU_OT_NONE ? ENOENT : EEXIST); } - mutex_exit(&dn->dn_mtx); - if (refcount_add(&dn->dn_holds, tag) == 1) dbuf_add_ref(db, dnh); + mutex_exit(&dn->dn_mtx); + /* Now we can rely on the hold to prevent the dnode from moving. */ zrl_remove(&dnh->dnh_zrlock); diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index d896deb58a3f..a8fa9a9527a9 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -404,53 +404,41 @@ dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks) void dnode_evict_dbufs(dnode_t *dn) { - int progress; - int pass = 0; + dmu_buf_impl_t *db_marker; + dmu_buf_impl_t *db, *db_next; - do { - dmu_buf_impl_t *db, *db_next; - int evicting = FALSE; + db_marker = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP); + + mutex_enter(&dn->dn_dbufs_mtx); + for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) { - progress = FALSE; - mutex_enter(&dn->dn_dbufs_mtx); - for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) { - db_next = AVL_NEXT(&dn->dn_dbufs, db); #ifdef DEBUG - DB_DNODE_ENTER(db); - ASSERT3P(DB_DNODE(db), ==, dn); - DB_DNODE_EXIT(db); + DB_DNODE_ENTER(db); + ASSERT3P(DB_DNODE(db), ==, dn); + DB_DNODE_EXIT(db); #endif /* DEBUG */ - mutex_enter(&db->db_mtx); - if (db->db_state == DB_EVICTING) { - progress = TRUE; - evicting = TRUE; - mutex_exit(&db->db_mtx); - } else if (refcount_is_zero(&db->db_holds)) { - progress = TRUE; - dbuf_clear(db); /* exits db_mtx for us */ - } else { - mutex_exit(&db->db_mtx); - } - + mutex_enter(&db->db_mtx); + if (db->db_state != DB_EVICTING && + refcount_is_zero(&db->db_holds)) { + db_marker->db_level = db->db_level; + db_marker->db_blkid = db->db_blkid; + db_marker->db_state = DB_SEARCH; + avl_insert_here(&dn->dn_dbufs, db_marker, db, + AVL_BEFORE); + + dbuf_clear(db); + + db_next = AVL_NEXT(&dn->dn_dbufs, db_marker); + avl_remove(&dn->dn_dbufs, db_marker); + } else { + mutex_exit(&db->db_mtx); + db_next = AVL_NEXT(&dn->dn_dbufs, db); } - /* - * NB: we need to drop dn_dbufs_mtx between passes so - * that any DB_EVICTING dbufs can make progress. - * Ideally, we would have some cv we could wait on, but - * since we don't, just wait a bit to give the other - * thread a chance to run. - */ - mutex_exit(&dn->dn_dbufs_mtx); - if (evicting) - delay(1); - pass++; - if ((pass % 100) == 0) - dprintf("Exceeded %d passes evicting dbufs\n", pass); - } while (progress); + } + mutex_exit(&dn->dn_dbufs_mtx); - if (pass >= 100) - dprintf("Required %d passes to evict dbufs\n", pass); + kmem_free(db_marker, sizeof (dmu_buf_impl_t)); dnode_evict_bonus(dn); } @@ -515,7 +503,6 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); dnode_evict_dbufs(dn); ASSERT(avl_is_empty(&dn->dn_dbufs)); - ASSERT3P(dn->dn_bonus, ==, NULL); /* * XXX - It would be nice to assert this, but we may still diff --git a/module/zfs/dsl_bookmark.c b/module/zfs/dsl_bookmark.c index f793dbc0ec62..447a3a2dc3a2 100644 --- a/module/zfs/dsl_bookmark.c +++ b/module/zfs/dsl_bookmark.c @@ -120,7 +120,7 @@ dsl_bookmark_create_check_impl(dsl_dataset_t *snapds, const char *bookmark_name, int error; zfs_bookmark_phys_t bmark_phys; - if (!dsl_dataset_is_snapshot(snapds)) + if (!snapds->ds_is_snapshot) return (SET_ERROR(EINVAL)); error = dsl_bookmark_hold_ds(dp, bookmark_name, diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 32014f2864d3..ad805eb34493 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -72,7 +72,6 @@ int zfs_max_recordsize = 1 * 1024 * 1024; #define DS_REF_MAX (1ULL << 62) extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds); -extern inline boolean_t dsl_dataset_is_snapshot(dsl_dataset_t *ds); /* * Figure out how much of this delta should be propogated to the dsl_dir @@ -158,7 +157,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, } ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); - ASSERT(!dsl_dataset_is_snapshot(ds)); + ASSERT(!ds->ds_is_snapshot); dmu_buf_will_dirty(ds->ds_dbuf, tx); if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { @@ -256,14 +255,15 @@ dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp, return (B_TRUE); } -/* ARGSUSED */ static void -dsl_dataset_evict(dmu_buf_t *db, void *dsv) +dsl_dataset_evict(void *dbu) { - dsl_dataset_t *ds = dsv; + dsl_dataset_t *ds = dbu; ASSERT(ds->ds_owner == NULL); + ds->ds_dbuf = NULL; + unique_remove(ds->ds_fsid_guid); if (ds->ds_objset != NULL) @@ -275,10 +275,10 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) } bplist_destroy(&ds->ds_pending_deadlist); - if (dsl_dataset_phys(ds)->ds_deadlist_obj != 0) + if (ds->ds_deadlist.dl_os != NULL) dsl_deadlist_close(&ds->ds_deadlist); if (ds->ds_dir) - dsl_dir_rele(ds->ds_dir, ds); + dsl_dir_async_rele(ds->ds_dir, ds); ASSERT(!list_link_active(&ds->ds_synced_link)); @@ -416,6 +416,7 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); ds->ds_dbuf = dbuf; ds->ds_object = dsobj; + ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0; list_link_init(&ds->ds_synced_link); mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); @@ -456,7 +457,7 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, return (err); } - if (!dsl_dataset_is_snapshot(ds)) { + if (!ds->ds_is_snapshot) { ds->ds_snapname[0] = '\0'; if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { err = dsl_dataset_hold_obj(dp, @@ -483,7 +484,7 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, } } - if (err == 0 && !dsl_dataset_is_snapshot(ds)) { + if (err == 0 && !ds->ds_is_snapshot) { err = dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &ds->ds_reserved); @@ -496,8 +497,11 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds->ds_reserved = ds->ds_quota = 0; } - if (err != 0 || (winner = dmu_buf_set_user_ie(dbuf, ds, - dsl_dataset_evict)) != NULL) { + dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict, &ds->ds_dbuf); + if (err == 0) + winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu); + + if (err != 0 || winner != NULL) { bplist_destroy(&ds->ds_pending_deadlist); dsl_deadlist_close(&ds->ds_deadlist); if (ds->ds_prev) @@ -877,7 +881,7 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) uint64_t mrs_used; uint64_t dlused, dlcomp, dluncomp; - ASSERT(!dsl_dataset_is_snapshot(ds)); + ASSERT(!ds->ds_is_snapshot); if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes; @@ -1635,7 +1639,7 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED, dsl_dataset_phys(ds)->ds_uncompressed_bytes); - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, dsl_dataset_phys(ds)->ds_unique_bytes); @@ -1699,7 +1703,7 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT; stat->dds_guid = dsl_dataset_phys(ds)->ds_guid; stat->dds_origin[0] = '\0'; - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { stat->dds_is_snapshot = B_TRUE; stat->dds_num_clones = dsl_dataset_phys(ds)->ds_num_children - 1; @@ -1985,7 +1989,7 @@ dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx) return (error); /* must not be a snapshot */ - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } @@ -2563,7 +2567,7 @@ promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag) return (error); dd = ddpa->ddpa_clone->ds_dir; - if (dsl_dataset_is_snapshot(ddpa->ddpa_clone) || + if (ddpa->ddpa_clone->ds_is_snapshot || !dsl_dir_is_clone(dd)) { dsl_dataset_rele(ddpa->ddpa_clone, tag); return (SET_ERROR(EINVAL)); @@ -2655,8 +2659,8 @@ dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, int64_t unused_refres_delta; /* they should both be heads */ - if (dsl_dataset_is_snapshot(clone) || - dsl_dataset_is_snapshot(origin_head)) + if (clone->ds_is_snapshot || + origin_head->ds_is_snapshot) return (SET_ERROR(EINVAL)); /* if we are not forcing, the branch point should be just before them */ @@ -3158,8 +3162,8 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, uint64_t snapobj; dsl_pool_t *dp = firstsnap->ds_dir->dd_pool; - ASSERT(dsl_dataset_is_snapshot(firstsnap)); - ASSERT(dsl_dataset_is_snapshot(lastsnap)); + ASSERT(firstsnap->ds_is_snapshot); + ASSERT(lastsnap->ds_is_snapshot); /* * Check that the snapshots are in the same dsl_dir, and firstsnap @@ -3285,12 +3289,12 @@ dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier, dsl_dataset_t *origin; ASSERT(dsl_pool_config_held(dp)); - ASSERT(dsl_dataset_is_snapshot(earlier) || earlier_txg != 0); + ASSERT(earlier->ds_is_snapshot || earlier_txg != 0); if (earlier_txg == 0) earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg; - if (dsl_dataset_is_snapshot(later) && + if (later->ds_is_snapshot && earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg) return (B_FALSE); diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 098e9419b8c1..8da77ebd7b6e 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -122,6 +122,8 @@ dsl_deadlist_close(dsl_deadlist_t *dl) void *cookie = NULL; dsl_deadlist_entry_t *dle; + dl->dl_os = NULL; + if (dl->dl_oldfmt) { dl->dl_oldfmt = B_FALSE; bpobj_close(&dl->dl_bpobj); diff --git a/module/zfs/dsl_deleg.c b/module/zfs/dsl_deleg.c index b32da3ec0af6..952422be2381 100644 --- a/module/zfs/dsl_deleg.c +++ b/module/zfs/dsl_deleg.c @@ -570,7 +570,7 @@ dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr) SPA_VERSION_DELEGATED_PERMS) return (SET_ERROR(EPERM)); - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { /* * Snapshots are treated as descendents only, * local permissions do not apply. diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index 91afec4a4d8a..0145a3c795d4 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -51,7 +51,7 @@ typedef struct dmu_snapshots_destroy_arg { int dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer) { - if (!dsl_dataset_is_snapshot(ds)) + if (!ds->ds_is_snapshot) return (SET_ERROR(EINVAL)); if (dsl_dataset_long_held(ds)) @@ -360,7 +360,7 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) dsl_dataset_remove_clones_key(ds, dsl_dataset_phys(ds)->ds_creation_txg, tx); - if (dsl_dataset_is_snapshot(ds_next)) { + if (ds_next->ds_is_snapshot) { dsl_dataset_t *ds_nextnext; /* @@ -609,8 +609,8 @@ dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds) uint64_t count; objset_t *mos; - ASSERT(!dsl_dataset_is_snapshot(ds)); - if (dsl_dataset_is_snapshot(ds)) + ASSERT(!ds->ds_is_snapshot); + if (ds->ds_is_snapshot) return (SET_ERROR(EINVAL)); if (refcount_count(&ds->ds_longholds) != expected_holds) diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index fc58bdf78c62..bacf54ff9dcc 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -127,14 +127,15 @@ extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd); static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); -/* ARGSUSED */ static void -dsl_dir_evict(dmu_buf_t *db, void *arg) +dsl_dir_evict(void *dbu) { - dsl_dir_t *dd = arg; + dsl_dir_t *dd = dbu; int t; ASSERTV(dsl_pool_t *dp = dd->dd_pool); + dd->dd_dbuf = NULL; + for (t = 0; t < TXG_SIZE; t++) { ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); ASSERT(dd->dd_tempreserved[t] == 0); @@ -142,9 +143,9 @@ dsl_dir_evict(dmu_buf_t *db, void *arg) } if (dd->dd_parent) - dsl_dir_rele(dd->dd_parent, dd); + dsl_dir_async_rele(dd->dd_parent, dd); - spa_close(dd->dd_pool->dp_spa, dd); + spa_async_close(dd->dd_pool->dp_spa, dd); /* * The props callback list should have been cleaned up by @@ -240,8 +241,9 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, dmu_buf_rele(origin_bonus, FTAG); } - winner = dmu_buf_set_user_ie(dbuf, dd, dsl_dir_evict); - if (winner) { + dmu_buf_init_user(&dd->dd_dbu, dsl_dir_evict, &dd->dd_dbuf); + winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu); + if (winner != NULL) { if (dd->dd_parent) dsl_dir_rele(dd->dd_parent, dd); mutex_destroy(&dd->dd_lock); @@ -285,6 +287,21 @@ dsl_dir_rele(dsl_dir_t *dd, void *tag) dmu_buf_rele(dd->dd_dbuf, tag); } +/* + * Remove a reference to the given dsl dir that is being asynchronously + * released. Async releases occur from a taskq performing eviction of + * dsl datasets and dirs. This process is identical to a normal release + * with the exception of using the async API for releasing the reference on + * the spa. + */ +void +dsl_dir_async_rele(dsl_dir_t *dd, void *tag) +{ + dprintf_dd(dd, "%s\n", ""); + spa_async_close(dd->dd_pool->dp_spa, tag); + dmu_buf_rele(dd->dd_dbuf, tag); +} + /* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */ void dsl_dir_name(dsl_dir_t *dd, char *buf) @@ -418,7 +435,7 @@ dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, } while (next != NULL) { - dsl_dir_t *child_ds; + dsl_dir_t *child_dd; err = getcomponent(next, buf, &nextnext); if (err != 0) break; @@ -437,11 +454,11 @@ dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, break; } - err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_ds); + err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd); if (err != 0) break; dsl_dir_rele(dd, tag); - dd = child_ds; + dd = child_dd; next = nextnext; } diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 03c59c0168b6..1c0014196a01 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -327,6 +327,8 @@ dsl_pool_close(dsl_pool_t *dp) txg_fini(dp); dsl_scan_fini(dp); + dmu_buf_user_evict_wait(); + rrw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); taskq_destroy(dp->dp_iput_taskq); diff --git a/module/zfs/dsl_prop.c b/module/zfs/dsl_prop.c index a3864efde91e..dd02c02c6433 100644 --- a/module/zfs/dsl_prop.c +++ b/module/zfs/dsl_prop.c @@ -163,19 +163,17 @@ dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, { zfs_prop_t prop = zfs_name_to_prop(propname); boolean_t inheritable; - boolean_t snapshot; uint64_t zapobj; ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); - snapshot = dsl_dataset_is_snapshot(ds); zapobj = dsl_dataset_phys(ds)->ds_props_obj; if (zapobj != 0) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; int err; - ASSERT(snapshot); + ASSERT(ds->ds_is_snapshot); /* Check for a local value. */ err = zap_lookup(mos, zapobj, propname, intsz, numints, buf); @@ -215,7 +213,7 @@ dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, } return (dsl_prop_get_dd(ds->ds_dir, propname, - intsz, numints, buf, setpoint, snapshot)); + intsz, numints, buf, setpoint, ds->ds_is_snapshot)); } /* @@ -577,7 +575,7 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, isint = (dodefault(propname, 8, 1, &intval) == 0); - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { ASSERT(version >= SPA_VERSION_SNAP_PROPS); if (dsl_dataset_phys(ds)->ds_props_obj == 0) { dmu_buf_will_dirty(ds->ds_dbuf, tx); @@ -674,7 +672,7 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, if (isint) { VERIFY0(dsl_prop_get_int_ds(ds, propname, &intval)); - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { dsl_prop_cb_record_t *cbr; /* * It's a snapshot; nothing can inherit this @@ -897,7 +895,7 @@ dsl_props_set_check(void *arg, dmu_tx_t *tx) } } - if (dsl_dataset_is_snapshot(ds) && version < SPA_VERSION_SNAP_PROPS) { + if (ds->ds_is_snapshot && version < SPA_VERSION_SNAP_PROPS) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENOTSUP)); } @@ -1134,7 +1132,7 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp, VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - if (dsl_dataset_is_snapshot(ds)) + if (ds->ds_is_snapshot) flags |= DSL_PROP_GET_SNAPSHOT; ASSERT(dsl_pool_config_held(dp)); diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 58e760449255..6c8d34fbc04c 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -415,7 +415,7 @@ static uint64_t dsl_scan_ds_maxtxg(dsl_dataset_t *ds) { uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; - if (dsl_dataset_is_snapshot(ds)) + if (ds->ds_is_snapshot) return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg)); return (smt); } @@ -865,7 +865,7 @@ dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) return; if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { /* Note, scn_cur_{min,max}_txg stays the same. */ scn->scn_phys.scn_bookmark.zb_objset = dsl_dataset_phys(ds)->ds_next_snap_obj; @@ -887,7 +887,7 @@ dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { /* * We keep the same mintxg; it could be > * ds_creation_txg if the previous snapshot was @@ -1073,7 +1073,7 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) * ZIL here, rather than in scan_recurse(), because the regular * snapshot block-sharing rules don't apply to it. */ - if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds)) + if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !ds->ds_is_snapshot) dsl_scan_zil(dp, &os->os_zil_header); /* diff --git a/module/zfs/dsl_userhold.c b/module/zfs/dsl_userhold.c index 007f525d1928..1b234ed480f9 100644 --- a/module/zfs/dsl_userhold.c +++ b/module/zfs/dsl_userhold.c @@ -355,7 +355,7 @@ dsl_dataset_user_release_check_one(dsl_dataset_user_release_arg_t *ddura, objset_t *mos; int numholds; - if (!dsl_dataset_is_snapshot(ds)) + if (!ds->ds_is_snapshot) return (SET_ERROR(EINVAL)); if (nvlist_empty(holds)) diff --git a/module/zfs/sa.c b/module/zfs/sa.c index 94730d020c63..2383252e2447 100644 --- a/module/zfs/sa.c +++ b/module/zfs/sa.c @@ -1301,10 +1301,10 @@ sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype) } /*ARGSUSED*/ -void -sa_evict(dmu_buf_t *db, void *sap) +static void +sa_evict(void *dbu) { - panic("evicting sa dbuf %p\n", (void *)db); + panic("evicting sa dbuf\n"); } static void @@ -1356,9 +1356,10 @@ sa_spill_rele(sa_handle_t *hdl) void sa_handle_destroy(sa_handle_t *hdl) { + dmu_buf_t *db = hdl->sa_bonus; + mutex_enter(&hdl->sa_lock); - (void) dmu_buf_update_user((dmu_buf_t *)hdl->sa_bonus, hdl, - NULL, NULL); + (void) dmu_buf_remove_user(db, &hdl->sa_dbu); if (hdl->sa_bonus_tab) sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); @@ -1380,7 +1381,7 @@ sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp, sa_handle_type_t hdl_type, sa_handle_t **handlepp) { int error = 0; - sa_handle_t *handle; + sa_handle_t *handle = NULL; #ifdef ZFS_DEBUG dmu_object_info_t doi; @@ -1391,10 +1392,14 @@ sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp, /* find handle, if it exists */ /* if one doesn't exist then create a new one, and initialize it */ - handle = (hdl_type == SA_HDL_SHARED) ? dmu_buf_get_user(db) : NULL; + if (hdl_type == SA_HDL_SHARED) + handle = dmu_buf_get_user(db); + if (handle == NULL) { - sa_handle_t *newhandle; + sa_handle_t *winner = NULL; + handle = kmem_cache_alloc(sa_cache, KM_SLEEP); + handle->sa_dbu.dbu_evict_func = NULL; handle->sa_userp = userp; handle->sa_bonus = db; handle->sa_os = os; @@ -1403,12 +1408,15 @@ sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp, handle->sa_spill_tab = NULL; error = sa_build_index(handle, SA_BONUS); - newhandle = (hdl_type == SA_HDL_SHARED) ? - dmu_buf_set_user_ie(db, handle, sa_evict) : NULL; - if (newhandle != NULL) { + if (hdl_type == SA_HDL_SHARED) { + dmu_buf_init_user(&handle->sa_dbu, sa_evict, NULL); + winner = dmu_buf_set_user_ie(db, &handle->sa_dbu); + } + + if (winner != NULL) { kmem_cache_free(sa_cache, handle); - handle = newhandle; + handle = winner; } } *handlepp = handle; @@ -1940,14 +1948,6 @@ sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks) blksize, nblocks); } -void -sa_update_user(sa_handle_t *newhdl, sa_handle_t *oldhdl) -{ - (void) dmu_buf_update_user((dmu_buf_t *)newhdl->sa_bonus, - oldhdl, newhdl, sa_evict); - oldhdl->sa_bonus = NULL; -} - void sa_set_userp(sa_handle_t *hdl, void *ptr) { @@ -2046,7 +2046,6 @@ EXPORT_SYMBOL(sa_size); EXPORT_SYMBOL(sa_update_from_cb); EXPORT_SYMBOL(sa_object_info); EXPORT_SYMBOL(sa_object_size); -EXPORT_SYMBOL(sa_update_user); EXPORT_SYMBOL(sa_get_userdata); EXPORT_SYMBOL(sa_set_userp); EXPORT_SYMBOL(sa_get_db); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index b9a81d371113..07a48c13a0ac 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1125,6 +1125,8 @@ spa_activate(spa_t *spa, int mode) list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); + list_create(&spa->spa_evicting_os_list, sizeof (objset_t), + offsetof(objset_t, os_evicting_node)); list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_state_dirty_node)); @@ -1153,9 +1155,12 @@ spa_deactivate(spa_t *spa) ASSERT(spa->spa_async_zio_root == NULL); ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); + spa_evicting_os_wait(spa); + txg_list_destroy(&spa->spa_vdev_txg_list); list_destroy(&spa->spa_config_dirty_list); + list_destroy(&spa->spa_evicting_os_list); list_destroy(&spa->spa_state_dirty_list); taskq_cancel_id(system_taskq, spa->spa_deadman_tqid); @@ -2168,6 +2173,11 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, mosconfig, &ereport); } + /* + * Don't count references from objsets that are already closed + * and are making their way through the eviction process. + */ + spa_evicting_os_wait(spa); spa->spa_minref = refcount_count(&spa->spa_refcount); if (error) { if (error != EEXIST) { @@ -3809,6 +3819,11 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa_history_log_version(spa, "create"); + /* + * Don't count references from objsets that are already closed + * and are making their way through the eviction process. + */ + spa_evicting_os_wait(spa); spa->spa_minref = refcount_count(&spa->spa_refcount); mutex_exit(&spa_namespace_lock); @@ -4365,8 +4380,10 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, * modify its state. Objsets may be open only because they're dirty, * so we have to force it to sync before checking spa_refcnt. */ - if (spa->spa_sync_on) + if (spa->spa_sync_on) { txg_wait_synced(spa->spa_dsl_pool, 0); + spa_evicting_os_wait(spa); + } /* * A pool cannot be exported or destroyed if there are active diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index a0e7b47f1054..409dce121212 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -551,6 +551,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); @@ -560,6 +561,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); @@ -649,6 +651,7 @@ spa_remove(spa_t *spa) ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); + ASSERT3U(refcount_count(&spa->spa_refcount), ==, 0); nvlist_free(spa->spa_config_splitting); @@ -681,6 +684,7 @@ spa_remove(spa_t *spa) bplist_destroy(&spa->spa_free_bplist[t]); cv_destroy(&spa->spa_async_cv); + cv_destroy(&spa->spa_evicting_os_cv); cv_destroy(&spa->spa_proc_cv); cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); @@ -688,6 +692,7 @@ spa_remove(spa_t *spa) mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlog_lock); + mutex_destroy(&spa->spa_evicting_os_lock); mutex_destroy(&spa->spa_history_lock); mutex_destroy(&spa->spa_proc_lock); mutex_destroy(&spa->spa_props_lock); @@ -744,6 +749,20 @@ spa_close(spa_t *spa, void *tag) (void) refcount_remove(&spa->spa_refcount, tag); } +/* + * Remove a reference to the given spa_t held by a dsl dir that is + * being asynchronously released. Async releases occur from a taskq + * performing eviction of dsl datasets and dirs. The namespace lock + * isn't held and the hold by the object being evicted may contribute to + * spa_minref (e.g. dataset or directory released during pool export), + * so the asserts in spa_close() do not apply. + */ +void +spa_async_close(spa_t *spa, void *tag) +{ + (void) refcount_remove(&spa->spa_refcount, tag); +} + /* * Check to see if the spa refcount is zero. Must be called with * spa_namespace_lock held. We really compare against spa_minref, which is the @@ -1639,6 +1658,34 @@ spa_log_class(spa_t *spa) return (spa->spa_log_class); } +void +spa_evicting_os_register(spa_t *spa, objset_t *os) +{ + mutex_enter(&spa->spa_evicting_os_lock); + list_insert_head(&spa->spa_evicting_os_list, os); + mutex_exit(&spa->spa_evicting_os_lock); +} + +void +spa_evicting_os_deregister(spa_t *spa, objset_t *os) +{ + mutex_enter(&spa->spa_evicting_os_lock); + list_remove(&spa->spa_evicting_os_list, os); + cv_broadcast(&spa->spa_evicting_os_cv); + mutex_exit(&spa->spa_evicting_os_lock); +} + +void +spa_evicting_os_wait(spa_t *spa) +{ + mutex_enter(&spa->spa_evicting_os_lock); + while (!list_is_empty(&spa->spa_evicting_os_list)) + cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock); + mutex_exit(&spa->spa_evicting_os_lock); + + dmu_buf_user_evict_wait(); +} + int spa_max_replication(spa_t *spa) { diff --git a/module/zfs/zap.c b/module/zfs/zap.c index bc1d57899fc1..c5ea392b6a1d 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -53,7 +53,6 @@ int fzap_default_block_shift = 14; /* 16k blocksize */ extern inline zap_phys_t *zap_f_phys(zap_t *zap); -static void zap_leaf_pageout(dmu_buf_t *db, void *vl); static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); void @@ -82,7 +81,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); zap->zap_ismicro = FALSE; - (void) dmu_buf_update_user(zap->zap_dbuf, zap, zap, zap_evict); + zap->zap_dbu.dbu_evict_func = zap_evict; mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1; @@ -388,11 +387,20 @@ zap_allocate_blocks(zap_t *zap, int nblocks) return (newblk); } +static void +zap_leaf_pageout(void *dbu) +{ + zap_leaf_t *l = dbu; + + rw_destroy(&l->l_rwlock); + kmem_free(l, sizeof (zap_leaf_t)); +} + static zap_leaf_t * zap_create_leaf(zap_t *zap, dmu_tx_t *tx) { void *winner; - zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP); + zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); @@ -404,7 +412,8 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx) VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, DMU_READ_NO_PREFETCH)); - winner = dmu_buf_set_user(l->l_dbuf, l, zap_leaf_pageout); + dmu_buf_init_user(&l->l_dbu, zap_leaf_pageout, &l->l_dbuf); + winner = dmu_buf_set_user(l->l_dbuf, &l->l_dbu); ASSERT(winner == NULL); dmu_buf_will_dirty(l->l_dbuf, tx); @@ -436,16 +445,6 @@ zap_put_leaf(zap_leaf_t *l) dmu_buf_rele(l->l_dbuf, NULL); } -_NOTE(ARGSUSED(0)) -static void -zap_leaf_pageout(dmu_buf_t *db, void *vl) -{ - zap_leaf_t *l = vl; - - rw_destroy(&l->l_rwlock); - kmem_free(l, sizeof (zap_leaf_t)); -} - static zap_leaf_t * zap_open_leaf(uint64_t blkid, dmu_buf_t *db) { @@ -453,19 +452,20 @@ zap_open_leaf(uint64_t blkid, dmu_buf_t *db) ASSERT(blkid != 0); - l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP); + l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); rw_init(&l->l_rwlock, NULL, RW_DEFAULT, NULL); rw_enter(&l->l_rwlock, RW_WRITER); l->l_blkid = blkid; l->l_bs = highbit64(db->db_size) - 1; l->l_dbuf = db; - winner = dmu_buf_set_user(db, l, zap_leaf_pageout); + dmu_buf_init_user(&l->l_dbu, zap_leaf_pageout, &l->l_dbuf); + winner = dmu_buf_set_user(db, &l->l_dbu); rw_exit(&l->l_rwlock); if (winner != NULL) { /* someone else set it first */ - zap_leaf_pageout(NULL, l); + zap_leaf_pageout(&l->l_dbu); l = winner; } diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index b6013b2e6983..29406e660c5b 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -388,7 +388,8 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) * it, because zap_lockdir() checks zap_ismicro without the lock * held. */ - winner = dmu_buf_set_user(db, zap, zap_evict); + dmu_buf_init_user(&zap->zap_dbu, zap_evict, &zap->zap_dbuf); + winner = dmu_buf_set_user(db, &zap->zap_dbu); if (winner != NULL) { rw_exit(&zap->zap_rwlock); @@ -677,11 +678,10 @@ zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) return (dmu_object_free(os, zapobj, tx)); } -_NOTE(ARGSUSED(0)) void -zap_evict(dmu_buf_t *db, void *vzap) +zap_evict(void *dbu) { - zap_t *zap = vzap; + zap_t *zap = dbu; rw_destroy(&zap->zap_rwlock); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 0fa944076fe7..9d9d2edec350 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -5392,7 +5392,7 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) return (error); error = dsl_dataset_hold(dp, lastsnap, FTAG, &new); - if (error == 0 && !dsl_dataset_is_snapshot(new)) { + if (error == 0 && !new->ds_is_snapshot) { dsl_dataset_rele(new, FTAG); error = SET_ERROR(EINVAL); } @@ -5401,7 +5401,7 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) return (error); } error = dsl_dataset_hold(dp, firstsnap, FTAG, &old); - if (error == 0 && !dsl_dataset_is_snapshot(old)) { + if (error == 0 && !old->ds_is_snapshot) { dsl_dataset_rele(old, FTAG); error = SET_ERROR(EINVAL); } diff --git a/module/zfs/zfs_sa.c b/module/zfs/zfs_sa.c index 257ab4254bbd..c9a9da7528d7 100644 --- a/module/zfs/zfs_sa.c +++ b/module/zfs/zfs_sa.c @@ -22,8 +22,7 @@ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ -#include -#include +#include #include #include #include diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 2b8fc548ffc9..175d2c66b4a4 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -501,7 +501,7 @@ zilog_dirty(zilog_t *zilog, uint64_t txg) dsl_pool_t *dp = zilog->zl_dmu_pool; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); - if (dsl_dataset_is_snapshot(ds)) + if (ds->ds_is_snapshot) panic("dirtying snapshot!"); if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) {