From b1d5794f84a7d28b218303d8ae52425430b511d2 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Thu, 1 Oct 2015 10:29:06 -0400 Subject: [PATCH] Revert "Illumos 5056 - ZFS deadlock on db_mtx and dn_holds" This reverts commit 0c66c32d1d8b64a261cceb5f50a9e86777c5d0b2. It caused a runtime failure: https://clusterhq.atlassian.net/browse/ZFS-37 The original deadlock must be resolved differently. All Spectralogic copyright notices introduced in the reverted commit have been retained due to either additional changes that depend on them that we presently have or additional changes that depend on them that we have yet to have. Signed-off-by: Richard Yao --- include/sys/dbuf.h | 5 +- include/sys/dmu.h | 132 +++++---------------------- include/sys/dmu_objset.h | 13 +-- include/sys/dnode.h | 3 +- include/sys/dsl_dataset.h | 10 +-- include/sys/dsl_dir.h | 3 - include/sys/sa.h | 1 + include/sys/sa_impl.h | 3 +- include/sys/spa.h | 4 - include/sys/spa_impl.h | 3 - include/sys/zap_impl.h | 3 +- include/sys/zap_leaf.h | 1 - module/zfs/dbuf.c | 182 +++++++++++--------------------------- module/zfs/dmu_objset.c | 112 +++++++++-------------- module/zfs/dmu_send.c | 8 +- module/zfs/dmu_traverse.c | 2 +- module/zfs/dnode.c | 82 +++++++---------- module/zfs/dnode_sync.c | 69 +++++++++------ module/zfs/dsl_bookmark.c | 2 +- module/zfs/dsl_dataset.c | 48 +++++----- module/zfs/dsl_deadlist.c | 2 - module/zfs/dsl_deleg.c | 2 +- module/zfs/dsl_destroy.c | 8 +- module/zfs/dsl_dir.c | 37 +++----- module/zfs/dsl_pool.c | 2 - module/zfs/dsl_prop.c | 14 +-- module/zfs/dsl_scan.c | 8 +- module/zfs/dsl_userhold.c | 2 +- module/zfs/sa.c | 41 ++++----- module/zfs/spa.c | 19 +--- module/zfs/spa_misc.c | 47 ---------- module/zfs/zap.c | 34 +++---- module/zfs/zap_micro.c | 8 +- module/zfs/zfs_ioctl.c | 4 +- module/zfs/zfs_sa.c | 3 +- module/zfs/zil.c | 2 +- 36 files changed, 306 insertions(+), 613 deletions(-) diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 94d326d5716c..8ade4c9c0b79 100644 --- 
a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -227,8 +227,9 @@ typedef struct dmu_buf_impl { /* Data which is unique to data (leaf) blocks: */ - /* User callback information. */ - dmu_buf_user_t *db_user; + /* stuff we store for the user (see dmu_buf_set_user) */ + void *db_user_ptr; + dmu_buf_evict_func_t *db_evict_func; uint8_t db_immediate_evict; uint8_t db_freed_in_flight; diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 855ba5cd456b..de85b51b598b 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -40,9 +40,11 @@ * dmu_spa.h. */ -#include #include +#include +#include #include +#include #include #include @@ -288,6 +290,8 @@ typedef struct dmu_buf { void *db_data; /* data in buffer */ } dmu_buf_t; +typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); + /* * The names of zap entries in the DIRECTORY_OBJECT of the MOS. */ @@ -489,126 +493,36 @@ int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); -typedef void dmu_buf_evict_func_t(void *user_ptr); - -/* - * A DMU buffer user object may be associated with a dbuf for the - * duration of its lifetime. This allows the user of a dbuf (client) - * to attach private data to a dbuf (e.g. in-core only data such as a - * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified - * when that dbuf has been evicted. Clients typically respond to the - * eviction notification by freeing their private data, thus ensuring - * the same lifetime for both dbuf and private data. - * - * The mapping from a dmu_buf_user_t to any client private data is the - * client's responsibility. All current consumers of the API with private - * data embed a dmu_buf_user_t as the first member of the structure for - * their private data. This allows conversions between the two types - * with a simple cast. 
Since the DMU buf user API never needs access - * to the private data, other strategies can be employed if necessary - * or convenient for the client (e.g. using container_of() to do the - * conversion for private data that cannot have the dmu_buf_user_t as - * its first member). - * - * Eviction callbacks are executed without the dbuf mutex held or any - * other type of mechanism to guarantee that the dbuf is still available. - * For this reason, users must assume the dbuf has already been freed - * and not reference the dbuf from the callback context. - * - * Users requesting "immediate eviction" are notified as soon as the dbuf - * is only referenced by dirty records (dirties == holds). Otherwise the - * notification occurs after eviction processing for the dbuf begins. - */ -typedef struct dmu_buf_user { - /* - * Asynchronous user eviction callback state. - */ - taskq_ent_t dbu_tqent; - - /* This instance's eviction function pointer. */ - dmu_buf_evict_func_t *dbu_evict_func; -#ifdef ZFS_DEBUG - /* - * Pointer to user's dbuf pointer. NULL for clients that do - * not associate a dbuf with their user data. - * - * The dbuf pointer is cleared upon eviction so as to catch - * use-after-evict bugs in clients. - */ - dmu_buf_t **dbu_clear_on_evict_dbufp; -#endif -} dmu_buf_user_t; - -/* - * Initialize the given dmu_buf_user_t instance with the eviction function - * evict_func, to be called when the user is evicted. - * - * NOTE: This function should only be called once on a given dmu_buf_user_t. - * To allow enforcement of this, dbu must already be zeroed on entry. - */ -#ifdef __lint -/* Very ugly, but it beats issuing suppression directives in many Makefiles. 
*/ -extern void -dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func, - dmu_buf_t **clear_on_evict_dbufp); -#else /* __lint */ -static inline void -dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func, - dmu_buf_t **clear_on_evict_dbufp) -{ - ASSERT(dbu->dbu_evict_func == NULL); - ASSERT(evict_func != NULL); - dbu->dbu_evict_func = evict_func; -#ifdef ZFS_DEBUG - dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp; -#endif -} -#endif /* __lint */ - /* - * Attach user data to a dbuf and mark it for normal (when the dbuf's - * data is cleared or its reference count goes to zero) eviction processing. + * Returns NULL on success, or the existing user ptr if it's already + * been set. * - * Returns NULL on success, or the existing user if another user currently - * owns the buffer. - */ -void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user); - -/* - * Attach user data to a dbuf and mark it for immediate (its dirty and - * reference counts are equal) eviction processing. + * user_ptr is for use by the user and can be obtained via dmu_buf_get_user(). * - * Returns NULL on success, or the existing user if another user currently - * owns the buffer. - */ -void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user); - -/* - * Replace the current user of a dbuf. + * If non-NULL, pageout func will be called when this buffer is being + * excised from the cache, so that you can clean up the data structure + * pointed to by user_ptr. * - * If given the current user of a dbuf, replaces the dbuf's user with - * "new_user" and returns the user data pointer that was replaced. - * Otherwise returns the current, and unmodified, dbuf user pointer. + * dmu_evict_user() will call the pageout func for all buffers in a + * objset with a given pageout func. 
*/ -void *dmu_buf_replace_user(dmu_buf_t *db, - dmu_buf_user_t *old_user, dmu_buf_user_t *new_user); - +void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, + dmu_buf_evict_func_t *pageout_func); /* - * Remove the specified user data for a DMU buffer. - * - * Returns the user that was removed on success, or the current user if - * another user currently owns the buffer. + * set_user_ie is the same as set_user, but request immediate eviction + * when hold count goes to zero. */ -void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); +void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr, + dmu_buf_evict_func_t *pageout_func); +void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, + void *user_ptr, dmu_buf_evict_func_t *pageout_func); +void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func); /* - * Returns the user data (dmu_buf_user_t *) associated with this dbuf. + * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set. */ void *dmu_buf_get_user(dmu_buf_t *db); -/* Block until any in-progress dmu buf user evictions complete. */ -void dmu_buf_user_evict_wait(void); - /* * Returns the blkptr associated with this dbuf, or NULL if not set. */ diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index 252df6e76a61..fac41b264e8e 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -75,25 +75,22 @@ struct objset { arc_buf_t *os_phys_buf; objset_phys_t *os_phys; /* - * The following "special" dnodes have no parent, are exempt - * from dnode_move(), and are not recorded in os_dnodes, but they - * root their descendents in this objset using handles anyway, so - * that all access to dnodes from dbufs consistently uses handles. + * The following "special" dnodes have no parent and are exempt from + * dnode_move(), but they root their descendents in this objset using + * handles anyway, so that all access to dnodes from dbufs consistently + * uses handles. 
*/ dnode_handle_t os_meta_dnode; dnode_handle_t os_userused_dnode; dnode_handle_t os_groupused_dnode; zilog_t *os_zil; - list_node_t os_evicting_node; - /* can change, under dsl_dir's locks: */ enum zio_checksum os_checksum; enum zio_compress os_compress; uint8_t os_copies; enum zio_checksum os_dedup_checksum; boolean_t os_dedup_verify; - boolean_t os_evicting; zfs_logbias_op_t os_logbias; zfs_cache_type_t os_primary_cache; zfs_cache_type_t os_secondary_cache; @@ -175,8 +172,6 @@ int dmu_objset_userspace_upgrade(objset_t *os); boolean_t dmu_objset_userspace_present(objset_t *os); int dmu_fsname(const char *snapname, char *buf); -void dmu_objset_evict_done(objset_t *os); - /* Code for handling userspace interface */ extern const char *dmu_objset_types[]; diff --git a/include/sys/dnode.h b/include/sys/dnode.h index 50e01155903a..b727f213f707 100644 --- a/include/sys/dnode.h +++ b/include/sys/dnode.h @@ -278,7 +278,6 @@ typedef struct dnode_handle { } dnode_handle_t; typedef struct dnode_children { - dmu_buf_user_t dnc_dbu; /* User evict data */ size_t dnc_count; /* number of children */ dnode_handle_t dnc_children[]; /* sized dynamically */ } dnode_children_t; @@ -289,7 +288,7 @@ typedef struct free_range { uint64_t fr_nblks; } free_range_t; -void dnode_special_open(struct objset *dd, dnode_phys_t *dnp, +dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp, uint64_t object, dnode_handle_t *dnh); void dnode_special_close(dnode_handle_t *dnh); diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h index a56e344a099d..5bfa3b40abc0 100644 --- a/include/sys/dsl_dataset.h +++ b/include/sys/dsl_dataset.h @@ -133,14 +133,11 @@ typedef struct dsl_dataset_phys { } dsl_dataset_phys_t; typedef struct dsl_dataset { - dmu_buf_user_t ds_dbu; - /* Immutable: */ struct dsl_dir *ds_dir; dmu_buf_t *ds_dbuf; uint64_t ds_object; uint64_t ds_fsid_guid; - boolean_t ds_is_snapshot; /* only used in syncing context, only valid for non-snapshots: */ struct dsl_dataset 
*ds_prev; @@ -201,8 +198,11 @@ dsl_dataset_phys(dsl_dataset_t *ds) */ #define MAX_TAG_PREFIX_LEN 17 -#define dsl_dataset_is_snapshot(ds) \ - (dsl_dataset_phys(ds)->ds_num_children != 0) +static inline boolean_t +dsl_dataset_is_snapshot(dsl_dataset_t *ds) +{ + return (dsl_dataset_phys(ds)->ds_num_children != 0); +} #define DS_UNIQUE_IS_ACCURATE(ds) \ ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0) diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h index b16b1b2a3a28..041d5cd0786b 100644 --- a/include/sys/dsl_dir.h +++ b/include/sys/dsl_dir.h @@ -85,8 +85,6 @@ typedef struct dsl_dir_phys { } dsl_dir_phys_t; struct dsl_dir { - dmu_buf_user_t dd_dbu; - /* These are immutable; no lock needed: */ uint64_t dd_object; dsl_pool_t *dd_pool; @@ -125,7 +123,6 @@ struct dsl_dataset; typedef struct dsl_dataset dsl_dataset_t; void dsl_dir_rele(dsl_dir_t *dd, void *tag); -void dsl_dir_async_rele(dsl_dir_t *dd, void *tag); int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, dsl_dir_t **, const char **tail); int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, diff --git a/include/sys/sa.h b/include/sys/sa.h index 48e3bcd7cdf3..7b5b03a5629f 100644 --- a/include/sys/sa.h +++ b/include/sys/sa.h @@ -133,6 +133,7 @@ int sa_update_from_cb(sa_handle_t *, sa_attr_type_t, uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *); void sa_object_info(sa_handle_t *, dmu_object_info_t *); void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *); +void sa_update_user(sa_handle_t *, sa_handle_t *); void *sa_get_userdata(sa_handle_t *); void sa_set_userp(sa_handle_t *, void *); dmu_buf_t *sa_get_db(sa_handle_t *); diff --git a/include/sys/sa_impl.h b/include/sys/sa_impl.h index 6f2f1db6dcf9..24a7ad04230c 100644 --- a/include/sys/sa_impl.h +++ b/include/sys/sa_impl.h @@ -209,12 +209,11 @@ typedef enum sa_data_op { */ struct sa_handle { - dmu_buf_user_t sa_dbu; kmutex_t sa_lock; dmu_buf_t *sa_bonus; dmu_buf_t *sa_spill; objset_t *sa_os; - void 
*sa_userp; + void *sa_userp; sa_idx_tab_t *sa_bonus_tab; /* idx of bonus */ sa_idx_tab_t *sa_spill_tab; /* only present if spill activated */ }; diff --git a/include/sys/spa.h b/include/sys/spa.h index 5dc9084dad6b..118e427c8fe4 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -690,7 +690,6 @@ extern spa_t *spa_next(spa_t *prev); /* Refcount functions */ extern void spa_open_ref(spa_t *spa, void *tag); extern void spa_close(spa_t *spa, void *tag); -extern void spa_async_close(spa_t *spa, void *tag); extern boolean_t spa_refcount_zero(spa_t *spa); #define SCL_NONE 0x00 @@ -801,9 +800,6 @@ extern uint64_t spa_version(spa_t *spa); extern boolean_t spa_deflate(spa_t *spa); extern metaslab_class_t *spa_normal_class(spa_t *spa); extern metaslab_class_t *spa_log_class(spa_t *spa); -extern void spa_evicting_os_register(spa_t *, objset_t *os); -extern void spa_evicting_os_deregister(spa_t *, objset_t *os); -extern void spa_evicting_os_wait(spa_t *spa); extern int spa_max_replication(spa_t *spa); extern int spa_prev_software_version(spa_t *spa); extern uint8_t spa_get_failmode(spa_t *spa); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 0b49c7147b10..ad4597bfff81 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -145,9 +145,6 @@ struct spa { uint64_t spa_claim_max_txg; /* highest claimed birth txg */ timespec_t spa_loaded_ts; /* 1st successful open time */ objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */ - kmutex_t spa_evicting_os_lock; /* Evicting objset list lock */ - list_t spa_evicting_os_list; /* Objsets being evicted. 
*/ - kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ vdev_t *spa_root_vdev; /* top-level vdev container */ int spa_min_ashift; /* of vdevs in normal class */ diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h index bfd43e31da80..3ba617f2988d 100644 --- a/include/sys/zap_impl.h +++ b/include/sys/zap_impl.h @@ -140,7 +140,6 @@ typedef struct zap_phys { typedef struct zap_table_phys zap_table_phys_t; typedef struct zap { - dmu_buf_user_t zap_dbu; objset_t *zap_objset; uint64_t zap_object; struct dmu_buf *zap_dbuf; @@ -197,7 +196,7 @@ boolean_t zap_match(zap_name_t *zn, const char *matchname); int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp); void zap_unlockdir(zap_t *zap); -void zap_evict(void *dbu); +void zap_evict(dmu_buf_t *db, void *vmzap); zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt); void zap_name_free(zap_name_t *zn); int zap_hashbits(zap_t *zap); diff --git a/include/sys/zap_leaf.h b/include/sys/zap_leaf.h index e784c5963b2e..d78ec21941cf 100644 --- a/include/sys/zap_leaf.h +++ b/include/sys/zap_leaf.h @@ -153,7 +153,6 @@ typedef union zap_leaf_chunk { } zap_leaf_chunk_t; typedef struct zap_leaf { - dmu_buf_user_t l_dbu; krwlock_t l_rwlock; uint64_t l_blkid; /* 1<db_user == NULL) - return; - - /* Only data blocks support the attachment of user data. */ - ASSERT(db->db_level == 0); - - /* Clients must resolve a dbuf before attaching user data. */ - ASSERT(db->db.db_data != NULL); - ASSERT3U(db->db_state, ==, DB_CACHED); - - holds = refcount_count(&db->db_holds); - if (verify_type == DBVU_EVICTING) { - /* - * Immediate eviction occurs when holds == dirtycnt. - * For normal eviction buffers, holds is zero on - * eviction, except when dbuf_fix_old_data() calls - * dbuf_clear_data(). 
However, the hold count can grow - * during eviction even though db_mtx is held (see - * dmu_bonus_hold() for an example), so we can only - * test the generic invariant that holds >= dirtycnt. - */ - ASSERT3U(holds, >=, db->db_dirtycnt); - } else { - if (db->db_immediate_evict == TRUE) - ASSERT3U(holds, >=, db->db_dirtycnt); - else - ASSERT3U(holds, >, 0); - } -#endif -} - static void dbuf_evict_user(dmu_buf_impl_t *db) { - dmu_buf_user_t *dbu = db->db_user; - ASSERT(MUTEX_HELD(&db->db_mtx)); - if (dbu == NULL) + if (db->db_level != 0 || db->db_evict_func == NULL) return; - dbuf_verify_user(db, DBVU_EVICTING); - db->db_user = NULL; - -#ifdef ZFS_DEBUG - if (dbu->dbu_clear_on_evict_dbufp != NULL) - *dbu->dbu_clear_on_evict_dbufp = NULL; -#endif - - /* - * Invoke the callback from a taskq to avoid lock order reversals - * and limit stack depth. - */ - taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0, - &dbu->dbu_tqent); + db->db_evict_func(&db->db, db->db_user_ptr); + db->db_user_ptr = NULL; + db->db_evict_func = NULL; } boolean_t @@ -409,12 +348,6 @@ dbuf_init(void) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); dbuf_stats_init(h); - - /* - * All entries are queued via taskq_dispatch_ent(), so min/maxalloc - * configuration is not required. 
- */ - dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0); } void @@ -437,7 +370,6 @@ dbuf_fini(void) kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); #endif kmem_cache_destroy(dbuf_cache); - taskq_destroy(dbu_evict_taskq); } /* @@ -555,28 +487,22 @@ dbuf_verify(dmu_buf_impl_t *db) } #endif -static void -dbuf_clear_data(dmu_buf_impl_t *db) -{ - ASSERT(MUTEX_HELD(&db->db_mtx)); - dbuf_evict_user(db); - db->db_buf = NULL; - db->db.db_data = NULL; - if (db->db_state != DB_NOFILL) - db->db_state = DB_UNCACHED; -} - static void dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) { ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(buf != NULL); - db->db_buf = buf; - ASSERT(buf->b_data != NULL); - db->db.db_data = buf->b_data; - if (!arc_released(buf)) - arc_set_callback(buf, dbuf_do_evict, db); + if (buf != NULL) { + ASSERT(buf->b_data != NULL); + db->db.db_data = buf->b_data; + if (!arc_released(buf)) + arc_set_callback(buf, dbuf_do_evict, db); + } else { + dbuf_evict_user(db); + db->db.db_data = NULL; + if (db->db_state != DB_NOFILL) + db->db_state = DB_UNCACHED; + } } /* @@ -598,7 +524,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); - dbuf_clear_data(db); + dbuf_set_data(db, NULL); mutex_exit(&db->db_mtx); } return (abuf); @@ -838,7 +764,7 @@ dbuf_noread(dmu_buf_impl_t *db) dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); db->db_state = DB_FILL; } else if (db->db_state == DB_NOFILL) { - dbuf_clear_data(db); + dbuf_set_data(db, NULL); } else { ASSERT3U(db->db_state, ==, DB_CACHED); } @@ -894,7 +820,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); } else { - dbuf_clear_data(db); + dbuf_set_data(db, NULL); } } @@ -945,8 +871,7 @@ void dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, dmu_tx_t *tx) { - dmu_buf_impl_t *db_search; - 
dmu_buf_impl_t *db, *db_next; + dmu_buf_impl_t *db, *db_next, *db_search; uint64_t txg = tx->tx_txg; avl_index_t where; boolean_t freespill = @@ -1530,7 +1455,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) arc_buf_t *buf = db->db_buf; ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); - dbuf_clear_data(db); + dbuf_set_data(db, NULL); VERIFY(arc_buf_remove_ref(buf, db)); dbuf_evict(db); return (B_TRUE); @@ -1879,7 +1804,8 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_parent = parent; db->db_blkptr = blkptr; - db->db_user = NULL; + db->db_user_ptr = NULL; + db->db_evict_func = NULL; db->db_immediate_evict = 0; db->db_freed_in_flight = 0; @@ -2390,7 +2316,7 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) /* * This dbuf has anonymous data associated with it. */ - dbuf_clear_data(db); + dbuf_set_data(db, NULL); VERIFY(arc_buf_remove_ref(buf, db)); dbuf_evict(db); } else { @@ -2423,8 +2349,7 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) } else { dbuf_clear(db); } - } else if (db->db_objset->os_evicting || - arc_buf_eviction_needed(db->db_buf)) { + } else if (arc_buf_eviction_needed(db->db_buf)) { dbuf_clear(db); } else { mutex_exit(&db->db_mtx); @@ -2443,57 +2368,51 @@ dbuf_refcount(dmu_buf_impl_t *db) } void * -dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, - dmu_buf_user_t *new_user) +dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, + dmu_buf_evict_func_t *evict_func) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - mutex_enter(&db->db_mtx); - dbuf_verify_user(db, DBVU_NOT_EVICTING); - if (db->db_user == old_user) - db->db_user = new_user; - else - old_user = db->db_user; - dbuf_verify_user(db, DBVU_NOT_EVICTING); - mutex_exit(&db->db_mtx); - - return (old_user); + return (dmu_buf_update_user(db_fake, NULL, user_ptr, evict_func)); } void * -dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) -{ - return (dmu_buf_replace_user(db_fake, NULL, user)); -} - -void * 
-dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) +dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, + dmu_buf_evict_func_t *evict_func) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; db->db_immediate_evict = TRUE; - return (dmu_buf_set_user(db_fake, user)); + return (dmu_buf_update_user(db_fake, NULL, user_ptr, evict_func)); } void * -dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) +dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, + dmu_buf_evict_func_t *evict_func) { - return (dmu_buf_replace_user(db_fake, user, NULL)); + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT(db->db_level == 0); + + ASSERT((user_ptr == NULL) == (evict_func == NULL)); + + mutex_enter(&db->db_mtx); + + if (db->db_user_ptr == old_user_ptr) { + db->db_user_ptr = user_ptr; + db->db_evict_func = evict_func; + } else { + old_user_ptr = db->db_user_ptr; + } + + mutex_exit(&db->db_mtx); + return (old_user_ptr); } void * dmu_buf_get_user(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + ASSERT(!refcount_is_zero(&db->db_holds)); - dbuf_verify_user(db, DBVU_NOT_EVICTING); - return (db->db_user); -} - -void -dmu_buf_user_evict_wait() -{ - taskq_wait(dbu_evict_taskq); + return (db->db_user_ptr); } boolean_t @@ -3165,6 +3084,7 @@ EXPORT_SYMBOL(dbuf_refcount); EXPORT_SYMBOL(dbuf_sync_list); EXPORT_SYMBOL(dmu_buf_set_user); EXPORT_SYMBOL(dmu_buf_set_user_ie); +EXPORT_SYMBOL(dmu_buf_update_user); EXPORT_SYMBOL(dmu_buf_get_user); EXPORT_SYMBOL(dmu_buf_freeable); EXPORT_SYMBOL(dmu_buf_get_blkptr); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 49bf273e4d30..589a5cf64e4e 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -371,7 +371,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os); } - if (!ds->ds_is_snapshot) { + if (!dsl_dataset_is_snapshot(ds)) { if (err == 0) { err = 
dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_CHECKSUM), @@ -433,7 +433,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, os->os_secondary_cache = ZFS_CACHE_ALL; } - if (ds == NULL || !ds->ds_is_snapshot) + if (ds == NULL || !dsl_dataset_is_snapshot(ds)) os->os_zil_header = os->os_phys->os_zil_header; os->os_zil = zil_alloc(os, &os->os_zil_header); @@ -448,19 +448,20 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); - list_link_init(&os->os_evicting_node); - mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); - dnode_special_open(os, &os->os_phys->os_meta_dnode, - DMU_META_DNODE_OBJECT, &os->os_meta_dnode); + DMU_META_DNODE(os) = dnode_special_open(os, + &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT, + &os->os_meta_dnode); if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { - dnode_special_open(os, &os->os_phys->os_userused_dnode, - DMU_USERUSED_OBJECT, &os->os_userused_dnode); - dnode_special_open(os, &os->os_phys->os_groupused_dnode, - DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode); + DMU_USERUSED_DNODE(os) = dnode_special_open(os, + &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT, + &os->os_userused_dnode); + DMU_GROUPUSED_DNODE(os) = dnode_special_open(os, + &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT, + &os->os_groupused_dnode); } *osp = os; @@ -628,57 +629,41 @@ dmu_objset_disown(objset_t *os, void *tag) void dmu_objset_evict_dbufs(objset_t *os) { - dnode_t *dn_marker; dnode_t *dn; - dn_marker = kmem_alloc(sizeof (dnode_t), KM_SLEEP); - mutex_enter(&os->os_lock); - dn = list_head(&os->os_dnodes); - while (dn != NULL) { - /* - * Skip dnodes without holds. We have to do this dance - * because dnode_add_ref() only works if there is already a - * hold. 
If the dnode has no holds, then it has no dbufs. - */ - if (dnode_add_ref(dn, FTAG)) { - list_insert_after(&os->os_dnodes, dn, dn_marker); - mutex_exit(&os->os_lock); - dnode_evict_dbufs(dn); - dnode_rele(dn, FTAG); + /* process the mdn last, since the other dnodes have holds on it */ + list_remove(&os->os_dnodes, DMU_META_DNODE(os)); + list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os)); - mutex_enter(&os->os_lock); - dn = list_next(&os->os_dnodes, dn_marker); - list_remove(&os->os_dnodes, dn_marker); - } else { - dn = list_next(&os->os_dnodes, dn); - } - } - mutex_exit(&os->os_lock); + /* + * Find the first dnode with holds. We have to do this dance + * because dnode_add_ref() only works if you already have a + * hold. If there are no holds then it has no dbufs so OK to + * skip. + */ + for (dn = list_head(&os->os_dnodes); + dn && !dnode_add_ref(dn, FTAG); + dn = list_next(&os->os_dnodes, dn)) + continue; - kmem_free(dn_marker, sizeof (dnode_t)); + while (dn) { + dnode_t *next_dn = dn; - if (DMU_USERUSED_DNODE(os) != NULL) { - dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os)); - dnode_evict_dbufs(DMU_USERUSED_DNODE(os)); + do { + next_dn = list_next(&os->os_dnodes, next_dn); + } while (next_dn && !dnode_add_ref(next_dn, FTAG)); + + mutex_exit(&os->os_lock); + dnode_evict_dbufs(dn); + dnode_rele(dn, FTAG); + mutex_enter(&os->os_lock); + dn = next_dn; } - dnode_evict_dbufs(DMU_META_DNODE(os)); + mutex_exit(&os->os_lock); } -/* - * Objset eviction processing is split into into two pieces. - * The first marks the objset as evicting, evicts any dbufs that - * have a refcount of zero, and then queues up the objset for the - * second phase of eviction. Once os->os_dnodes has been cleared by - * dnode_buf_pageout()->dnode_destroy(), the second phase is executed. - * The second phase closes the special dnodes, dequeues the objset from - * the list of those undergoing eviction, and finally frees the objset. 
- * - * NOTE: Due to asynchronous eviction processing (invocation of - * dnode_buf_pageout()), it is possible for the meta dnode for the - * objset to have no holds even though os->os_dnodes is not empty. - */ void dmu_objset_evict(objset_t *os) { @@ -690,7 +675,7 @@ dmu_objset_evict(objset_t *os) ASSERT(!dmu_objset_is_dirty(os, t)); if (ds) { - if (!ds->ds_is_snapshot) { + if (!dsl_dataset_is_snapshot(ds)) { VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum_changed_cb, os)); @@ -727,24 +712,8 @@ dmu_objset_evict(objset_t *os) if (os->os_sa) sa_tear_down(os); - os->os_evicting = B_TRUE; dmu_objset_evict_dbufs(os); - mutex_enter(&os->os_lock); - spa_evicting_os_register(os->os_spa, os); - if (list_is_empty(&os->os_dnodes)) { - mutex_exit(&os->os_lock); - dmu_objset_evict_done(os); - } else { - mutex_exit(&os->os_lock); - } -} - -void -dmu_objset_evict_done(objset_t *os) -{ - ASSERT3P(list_head(&os->os_dnodes), ==, NULL); - dnode_special_close(&os->os_meta_dnode); if (DMU_USERUSED_DNODE(os)) { dnode_special_close(&os->os_userused_dnode); @@ -752,6 +721,8 @@ dmu_objset_evict_done(objset_t *os) } zil_free(os->os_zil); + ASSERT3P(list_head(&os->os_dnodes), ==, NULL); + VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); /* @@ -766,7 +737,6 @@ dmu_objset_evict_done(objset_t *os) mutex_destroy(&os->os_lock); mutex_destroy(&os->os_obj_lock); mutex_destroy(&os->os_user_ptr_lock); - spa_evicting_os_deregister(os->os_spa, os); kmem_free(os, sizeof (objset_t)); } @@ -965,7 +935,7 @@ dmu_objset_clone_check(void *arg, dmu_tx_t *tx) return (error); /* You can only clone snapshots, not the head datasets. 
*/ - if (!origin->ds_is_snapshot) { + if (!dsl_dataset_is_snapshot(origin)) { dsl_dataset_rele(origin, FTAG); return (SET_ERROR(EINVAL)); } @@ -1545,7 +1515,7 @@ int dmu_objset_is_snapshot(objset_t *os) { if (os->os_dsl_dataset != NULL) - return (os->os_dsl_dataset->ds_is_snapshot); + return (dsl_dataset_is_snapshot(os->os_dsl_dataset)); else return (B_FALSE); } diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index b2d844eb4256..a53732eb8752 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -639,7 +639,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, fromtxg = fromzb->zbm_creation_txg; } dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); - if (!ds->ds_is_snapshot) { + if (!dsl_dataset_is_snapshot(ds)) { (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", sizeof (drr->drr_u.drr_begin.drr_toname)); } @@ -879,11 +879,11 @@ dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep) ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); /* tosnap must be a snapshot */ - if (!ds->ds_is_snapshot) + if (!dsl_dataset_is_snapshot(ds)) return (SET_ERROR(EINVAL)); /* fromsnap, if provided, must be a snapshot */ - if (fromds != NULL && !fromds->ds_is_snapshot) + if (fromds != NULL && !dsl_dataset_is_snapshot(fromds)) return (SET_ERROR(EINVAL)); /* @@ -1158,7 +1158,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) dsl_dataset_rele(ds, FTAG); return (error); } - if (!origin->ds_is_snapshot) { + if (!dsl_dataset_is_snapshot(origin)) { dsl_dataset_rele(origin, FTAG); dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 12d099bfd414..38c3e318aeeb 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -546,7 +546,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); /* See comment on ZIL traversal in dsl_scan_visitds. 
*/ - if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) { + if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) { uint32_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; arc_buf_t *buf; diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 2858bbfb492e..71ec8782c3e7 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -404,9 +404,8 @@ static dnode_t * dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, uint64_t object, dnode_handle_t *dnh) { - dnode_t *dn; + dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); - dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); ASSERT(!POINTER_IS_VALID(dn->dn_objset)); dn->dn_moved = 0; @@ -443,31 +442,13 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); mutex_enter(&os->os_lock); - if (dnh->dnh_dnode != NULL) { - /* Lost the allocation race. */ - mutex_exit(&os->os_lock); - kmem_cache_free(dnode_cache, dn); - return (dnh->dnh_dnode); - } - - /* - * Exclude special dnodes from os_dnodes so an empty os_dnodes - * signifies that the special dnodes have no references from - * their children (the entries in os_dnodes). This allows - * dnode_destroy() to easily determine if the last child has - * been removed and then complete eviction of the objset. - */ - if (!DMU_OBJECT_IS_SPECIAL(object)) - list_insert_head(&os->os_dnodes, dn); + list_insert_head(&os->os_dnodes, dn); membar_producer(); - /* - * Everything else must be valid before assigning dn_objset - * makes the dnode eligible for dnode_move(). + * Everything else must be valid before assigning dn_objset makes the + * dnode eligible for dnode_move(). 
*/ dn->dn_objset = os; - - dnh->dnh_dnode = dn; mutex_exit(&os->os_lock); arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER); @@ -481,18 +462,12 @@ static void dnode_destroy(dnode_t *dn) { objset_t *os = dn->dn_objset; - boolean_t complete_os_eviction = B_FALSE; ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0); mutex_enter(&os->os_lock); POINTER_INVALIDATE(&dn->dn_objset); - if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { - list_remove(&os->os_dnodes, dn); - complete_os_eviction = - list_is_empty(&os->os_dnodes) && - list_link_active(&os->os_evicting_node); - } + list_remove(&os->os_dnodes, dn); mutex_exit(&os->os_lock); /* the dnode can no longer move, so we can release the handle */ @@ -527,9 +502,6 @@ dnode_destroy(dnode_t *dn) dmu_zfetch_rele(&dn->dn_zfetch); kmem_cache_free(dnode_cache, dn); arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER); - - if (complete_os_eviction) - dmu_objset_evict_done(os); } void @@ -996,32 +968,33 @@ dnode_special_close(dnode_handle_t *dnh) */ while (refcount_count(&dn->dn_holds) > 0) delay(1); - ASSERT(dn->dn_dbuf == NULL || - dmu_buf_get_user(&dn->dn_dbuf->db) == NULL); zrl_add(&dnh->dnh_zrlock); dnode_destroy(dn); /* implicit zrl_remove() */ zrl_destroy(&dnh->dnh_zrlock); dnh->dnh_dnode = NULL; } -void +dnode_t * dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, dnode_handle_t *dnh) { - dnode_t *dn; - - dn = dnode_create(os, dnp, NULL, object, dnh); + dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh); + dnh->dnh_dnode = dn; zrl_init(&dnh->dnh_zrlock); DNODE_VERIFY(dn); + return (dn); } static void -dnode_buf_pageout(void *dbu) +dnode_buf_pageout(dmu_buf_t *db, void *arg) { - dnode_children_t *children_dnodes = dbu; + dnode_children_t *children_dnodes = arg; int i; + int epb = db->db_size >> DNODE_SHIFT; - for (i = 0; i < children_dnodes->dnc_count; i++) { + ASSERT(epb == children_dnodes->dnc_count); + + for (i = 0; i < epb; i++) { dnode_handle_t *dnh = &children_dnodes->dnc_children[i]; dnode_t *dn; @@ 
-1051,7 +1024,7 @@ dnode_buf_pageout(void *dbu) dnh->dnh_dnode = NULL; } kmem_free(children_dnodes, sizeof (dnode_children_t) + - children_dnodes->dnc_count * sizeof (dnode_handle_t)); + epb * sizeof (dnode_handle_t)); } /* @@ -1135,17 +1108,16 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, if (children_dnodes == NULL) { int i; dnode_children_t *winner; - children_dnodes = kmem_zalloc(sizeof (dnode_children_t) + + children_dnodes = kmem_alloc(sizeof (dnode_children_t) + epb * sizeof (dnode_handle_t), KM_SLEEP); children_dnodes->dnc_count = epb; dnh = &children_dnodes->dnc_children[0]; for (i = 0; i < epb; i++) { zrl_init(&dnh[i].dnh_zrlock); + dnh[i].dnh_dnode = NULL; } - dmu_buf_init_user(&children_dnodes->dnc_dbu, - dnode_buf_pageout, NULL); - winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu); - if (winner != NULL) { + if ((winner = dmu_buf_set_user(&db->db, children_dnodes, + dnode_buf_pageout))) { for (i = 0; i < epb; i++) { zrl_destroy(&dnh[i].dnh_zrlock); @@ -1160,11 +1132,17 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, dnh = &children_dnodes->dnc_children[idx]; zrl_add(&dnh->dnh_zrlock); - dn = dnh->dnh_dnode; - if (dn == NULL) { + if ((dn = dnh->dnh_dnode) == NULL) { dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx; + dnode_t *winner; dn = dnode_create(os, phys, db, object, dnh); + winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn); + if (winner != NULL) { + zrl_add(&dnh->dnh_zrlock); + dnode_destroy(dn); /* implicit zrl_remove() */ + dn = winner; + } } mutex_enter(&dn->dn_mtx); @@ -1178,10 +1156,10 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, dbuf_rele(db, FTAG); return (type == DMU_OT_NONE ? ENOENT : EEXIST); } - if (refcount_add(&dn->dn_holds, tag) == 1) - dbuf_add_ref(db, dnh); mutex_exit(&dn->dn_mtx); + if (refcount_add(&dn->dn_holds, tag) == 1) + dbuf_add_ref(db, dnh); /* Now we can rely on the hold to prevent the dnode from moving. 
*/ zrl_remove(&dnh->dnh_zrlock); diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index a8fa9a9527a9..d896deb58a3f 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -404,41 +404,53 @@ dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks) void dnode_evict_dbufs(dnode_t *dn) { - dmu_buf_impl_t *db_marker; - dmu_buf_impl_t *db, *db_next; + int progress; + int pass = 0; - db_marker = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP); - - mutex_enter(&dn->dn_dbufs_mtx); - for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) { + do { + dmu_buf_impl_t *db, *db_next; + int evicting = FALSE; + progress = FALSE; + mutex_enter(&dn->dn_dbufs_mtx); + for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) { + db_next = AVL_NEXT(&dn->dn_dbufs, db); #ifdef DEBUG - DB_DNODE_ENTER(db); - ASSERT3P(DB_DNODE(db), ==, dn); - DB_DNODE_EXIT(db); + DB_DNODE_ENTER(db); + ASSERT3P(DB_DNODE(db), ==, dn); + DB_DNODE_EXIT(db); #endif /* DEBUG */ - mutex_enter(&db->db_mtx); - if (db->db_state != DB_EVICTING && - refcount_is_zero(&db->db_holds)) { - db_marker->db_level = db->db_level; - db_marker->db_blkid = db->db_blkid; - db_marker->db_state = DB_SEARCH; - avl_insert_here(&dn->dn_dbufs, db_marker, db, - AVL_BEFORE); - - dbuf_clear(db); - - db_next = AVL_NEXT(&dn->dn_dbufs, db_marker); - avl_remove(&dn->dn_dbufs, db_marker); - } else { - mutex_exit(&db->db_mtx); - db_next = AVL_NEXT(&dn->dn_dbufs, db); + mutex_enter(&db->db_mtx); + if (db->db_state == DB_EVICTING) { + progress = TRUE; + evicting = TRUE; + mutex_exit(&db->db_mtx); + } else if (refcount_is_zero(&db->db_holds)) { + progress = TRUE; + dbuf_clear(db); /* exits db_mtx for us */ + } else { + mutex_exit(&db->db_mtx); + } + } - } - mutex_exit(&dn->dn_dbufs_mtx); + /* + * NB: we need to drop dn_dbufs_mtx between passes so + * that any DB_EVICTING dbufs can make progress. 
+ * Ideally, we would have some cv we could wait on, but + * since we don't, just wait a bit to give the other + * thread a chance to run. + */ + mutex_exit(&dn->dn_dbufs_mtx); + if (evicting) + delay(1); + pass++; + if ((pass % 100) == 0) + dprintf("Exceeded %d passes evicting dbufs\n", pass); + } while (progress); - kmem_free(db_marker, sizeof (dmu_buf_impl_t)); + if (pass >= 100) + dprintf("Required %d passes to evict dbufs\n", pass); dnode_evict_bonus(dn); } @@ -503,6 +515,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); dnode_evict_dbufs(dn); ASSERT(avl_is_empty(&dn->dn_dbufs)); + ASSERT3P(dn->dn_bonus, ==, NULL); /* * XXX - It would be nice to assert this, but we may still diff --git a/module/zfs/dsl_bookmark.c b/module/zfs/dsl_bookmark.c index 447a3a2dc3a2..f793dbc0ec62 100644 --- a/module/zfs/dsl_bookmark.c +++ b/module/zfs/dsl_bookmark.c @@ -120,7 +120,7 @@ dsl_bookmark_create_check_impl(dsl_dataset_t *snapds, const char *bookmark_name, int error; zfs_bookmark_phys_t bmark_phys; - if (!snapds->ds_is_snapshot) + if (!dsl_dataset_is_snapshot(snapds)) return (SET_ERROR(EINVAL)); error = dsl_bookmark_hold_ds(dp, bookmark_name, diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index ad805eb34493..32014f2864d3 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -72,6 +72,7 @@ int zfs_max_recordsize = 1 * 1024 * 1024; #define DS_REF_MAX (1ULL << 62) extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds); +extern inline boolean_t dsl_dataset_is_snapshot(dsl_dataset_t *ds); /* * Figure out how much of this delta should be propogated to the dsl_dir @@ -157,7 +158,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, } ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); - ASSERT(!ds->ds_is_snapshot); + ASSERT(!dsl_dataset_is_snapshot(ds)); dmu_buf_will_dirty(ds->ds_dbuf, tx); if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { @@ 
-255,15 +256,14 @@ dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp, return (B_TRUE); } +/* ARGSUSED */ static void -dsl_dataset_evict(void *dbu) +dsl_dataset_evict(dmu_buf_t *db, void *dsv) { - dsl_dataset_t *ds = dbu; + dsl_dataset_t *ds = dsv; ASSERT(ds->ds_owner == NULL); - ds->ds_dbuf = NULL; - unique_remove(ds->ds_fsid_guid); if (ds->ds_objset != NULL) @@ -275,10 +275,10 @@ dsl_dataset_evict(void *dbu) } bplist_destroy(&ds->ds_pending_deadlist); - if (ds->ds_deadlist.dl_os != NULL) + if (dsl_dataset_phys(ds)->ds_deadlist_obj != 0) dsl_deadlist_close(&ds->ds_deadlist); if (ds->ds_dir) - dsl_dir_async_rele(ds->ds_dir, ds); + dsl_dir_rele(ds->ds_dir, ds); ASSERT(!list_link_active(&ds->ds_synced_link)); @@ -416,7 +416,6 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); ds->ds_dbuf = dbuf; ds->ds_object = dsobj; - ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0; list_link_init(&ds->ds_synced_link); mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); @@ -457,7 +456,7 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, return (err); } - if (!ds->ds_is_snapshot) { + if (!dsl_dataset_is_snapshot(ds)) { ds->ds_snapname[0] = '\0'; if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { err = dsl_dataset_hold_obj(dp, @@ -484,7 +483,7 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, } } - if (err == 0 && !ds->ds_is_snapshot) { + if (err == 0 && !dsl_dataset_is_snapshot(ds)) { err = dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &ds->ds_reserved); @@ -497,11 +496,8 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds->ds_reserved = ds->ds_quota = 0; } - dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict, &ds->ds_dbuf); - if (err == 0) - winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu); - - if (err != 0 || winner != NULL) { + if (err != 0 || (winner = dmu_buf_set_user_ie(dbuf, ds, + dsl_dataset_evict)) != NULL) { 
bplist_destroy(&ds->ds_pending_deadlist); dsl_deadlist_close(&ds->ds_deadlist); if (ds->ds_prev) @@ -881,7 +877,7 @@ dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) uint64_t mrs_used; uint64_t dlused, dlcomp, dluncomp; - ASSERT(!ds->ds_is_snapshot); + ASSERT(!dsl_dataset_is_snapshot(ds)); if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes; @@ -1639,7 +1635,7 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED, dsl_dataset_phys(ds)->ds_uncompressed_bytes); - if (ds->ds_is_snapshot) { + if (dsl_dataset_is_snapshot(ds)) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, dsl_dataset_phys(ds)->ds_unique_bytes); @@ -1703,7 +1699,7 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT; stat->dds_guid = dsl_dataset_phys(ds)->ds_guid; stat->dds_origin[0] = '\0'; - if (ds->ds_is_snapshot) { + if (dsl_dataset_is_snapshot(ds)) { stat->dds_is_snapshot = B_TRUE; stat->dds_num_clones = dsl_dataset_phys(ds)->ds_num_children - 1; @@ -1989,7 +1985,7 @@ dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx) return (error); /* must not be a snapshot */ - if (ds->ds_is_snapshot) { + if (dsl_dataset_is_snapshot(ds)) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } @@ -2567,7 +2563,7 @@ promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag) return (error); dd = ddpa->ddpa_clone->ds_dir; - if (ddpa->ddpa_clone->ds_is_snapshot || + if (dsl_dataset_is_snapshot(ddpa->ddpa_clone) || !dsl_dir_is_clone(dd)) { dsl_dataset_rele(ddpa->ddpa_clone, tag); return (SET_ERROR(EINVAL)); @@ -2659,8 +2655,8 @@ dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, int64_t unused_refres_delta; /* they should both be heads */ - if (clone->ds_is_snapshot || - origin_head->ds_is_snapshot) + if (dsl_dataset_is_snapshot(clone) || + 
dsl_dataset_is_snapshot(origin_head)) return (SET_ERROR(EINVAL)); /* if we are not forcing, the branch point should be just before them */ @@ -3162,8 +3158,8 @@ dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, uint64_t snapobj; dsl_pool_t *dp = firstsnap->ds_dir->dd_pool; - ASSERT(firstsnap->ds_is_snapshot); - ASSERT(lastsnap->ds_is_snapshot); + ASSERT(dsl_dataset_is_snapshot(firstsnap)); + ASSERT(dsl_dataset_is_snapshot(lastsnap)); /* * Check that the snapshots are in the same dsl_dir, and firstsnap @@ -3289,12 +3285,12 @@ dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier, dsl_dataset_t *origin; ASSERT(dsl_pool_config_held(dp)); - ASSERT(earlier->ds_is_snapshot || earlier_txg != 0); + ASSERT(dsl_dataset_is_snapshot(earlier) || earlier_txg != 0); if (earlier_txg == 0) earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg; - if (later->ds_is_snapshot && + if (dsl_dataset_is_snapshot(later) && earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg) return (B_FALSE); diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 8da77ebd7b6e..098e9419b8c1 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -122,8 +122,6 @@ dsl_deadlist_close(dsl_deadlist_t *dl) void *cookie = NULL; dsl_deadlist_entry_t *dle; - dl->dl_os = NULL; - if (dl->dl_oldfmt) { dl->dl_oldfmt = B_FALSE; bpobj_close(&dl->dl_bpobj); diff --git a/module/zfs/dsl_deleg.c b/module/zfs/dsl_deleg.c index 952422be2381..b32da3ec0af6 100644 --- a/module/zfs/dsl_deleg.c +++ b/module/zfs/dsl_deleg.c @@ -570,7 +570,7 @@ dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr) SPA_VERSION_DELEGATED_PERMS) return (SET_ERROR(EPERM)); - if (ds->ds_is_snapshot) { + if (dsl_dataset_is_snapshot(ds)) { /* * Snapshots are treated as descendents only, * local permissions do not apply. 
diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index 0145a3c795d4..91afec4a4d8a 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -51,7 +51,7 @@ typedef struct dmu_snapshots_destroy_arg { int dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer) { - if (!ds->ds_is_snapshot) + if (!dsl_dataset_is_snapshot(ds)) return (SET_ERROR(EINVAL)); if (dsl_dataset_long_held(ds)) @@ -360,7 +360,7 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) dsl_dataset_remove_clones_key(ds, dsl_dataset_phys(ds)->ds_creation_txg, tx); - if (ds_next->ds_is_snapshot) { + if (dsl_dataset_is_snapshot(ds_next)) { dsl_dataset_t *ds_nextnext; /* @@ -609,8 +609,8 @@ dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds) uint64_t count; objset_t *mos; - ASSERT(!ds->ds_is_snapshot); - if (ds->ds_is_snapshot) + ASSERT(!dsl_dataset_is_snapshot(ds)); + if (dsl_dataset_is_snapshot(ds)) return (SET_ERROR(EINVAL)); if (refcount_count(&ds->ds_longholds) != expected_holds) diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index bacf54ff9dcc..fc58bdf78c62 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -127,15 +127,14 @@ extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd); static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); +/* ARGSUSED */ static void -dsl_dir_evict(void *dbu) +dsl_dir_evict(dmu_buf_t *db, void *arg) { - dsl_dir_t *dd = dbu; + dsl_dir_t *dd = arg; int t; ASSERTV(dsl_pool_t *dp = dd->dd_pool); - dd->dd_dbuf = NULL; - for (t = 0; t < TXG_SIZE; t++) { ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); ASSERT(dd->dd_tempreserved[t] == 0); @@ -143,9 +142,9 @@ dsl_dir_evict(void *dbu) } if (dd->dd_parent) - dsl_dir_async_rele(dd->dd_parent, dd); + dsl_dir_rele(dd->dd_parent, dd); - spa_async_close(dd->dd_pool->dp_spa, dd); + spa_close(dd->dd_pool->dp_spa, dd); /* * The props callback list should have been cleaned up by @@ -241,9 +240,8 @@ dsl_dir_hold_obj(dsl_pool_t 
*dp, uint64_t ddobj, dmu_buf_rele(origin_bonus, FTAG); } - dmu_buf_init_user(&dd->dd_dbu, dsl_dir_evict, &dd->dd_dbuf); - winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu); - if (winner != NULL) { + winner = dmu_buf_set_user_ie(dbuf, dd, dsl_dir_evict); + if (winner) { if (dd->dd_parent) dsl_dir_rele(dd->dd_parent, dd); mutex_destroy(&dd->dd_lock); @@ -287,21 +285,6 @@ dsl_dir_rele(dsl_dir_t *dd, void *tag) dmu_buf_rele(dd->dd_dbuf, tag); } -/* - * Remove a reference to the given dsl dir that is being asynchronously - * released. Async releases occur from a taskq performing eviction of - * dsl datasets and dirs. This process is identical to a normal release - * with the exception of using the async API for releasing the reference on - * the spa. - */ -void -dsl_dir_async_rele(dsl_dir_t *dd, void *tag) -{ - dprintf_dd(dd, "%s\n", ""); - spa_async_close(dd->dd_pool->dp_spa, tag); - dmu_buf_rele(dd->dd_dbuf, tag); -} - /* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */ void dsl_dir_name(dsl_dir_t *dd, char *buf) @@ -435,7 +418,7 @@ dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, } while (next != NULL) { - dsl_dir_t *child_dd; + dsl_dir_t *child_ds; err = getcomponent(next, buf, &nextnext); if (err != 0) break; @@ -454,11 +437,11 @@ dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, break; } - err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd); + err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_ds); if (err != 0) break; dsl_dir_rele(dd, tag); - dd = child_dd; + dd = child_ds; next = nextnext; } diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 1c0014196a01..03c59c0168b6 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -327,8 +327,6 @@ dsl_pool_close(dsl_pool_t *dp) txg_fini(dp); dsl_scan_fini(dp); - dmu_buf_user_evict_wait(); - rrw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); taskq_destroy(dp->dp_iput_taskq); diff --git a/module/zfs/dsl_prop.c b/module/zfs/dsl_prop.c index 
dd02c02c6433..a3864efde91e 100644 --- a/module/zfs/dsl_prop.c +++ b/module/zfs/dsl_prop.c @@ -163,17 +163,19 @@ dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, { zfs_prop_t prop = zfs_name_to_prop(propname); boolean_t inheritable; + boolean_t snapshot; uint64_t zapobj; ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); + snapshot = dsl_dataset_is_snapshot(ds); zapobj = dsl_dataset_phys(ds)->ds_props_obj; if (zapobj != 0) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; int err; - ASSERT(ds->ds_is_snapshot); + ASSERT(snapshot); /* Check for a local value. */ err = zap_lookup(mos, zapobj, propname, intsz, numints, buf); @@ -213,7 +215,7 @@ dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, } return (dsl_prop_get_dd(ds->ds_dir, propname, - intsz, numints, buf, setpoint, ds->ds_is_snapshot)); + intsz, numints, buf, setpoint, snapshot)); } /* @@ -575,7 +577,7 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, isint = (dodefault(propname, 8, 1, &intval) == 0); - if (ds->ds_is_snapshot) { + if (dsl_dataset_is_snapshot(ds)) { ASSERT(version >= SPA_VERSION_SNAP_PROPS); if (dsl_dataset_phys(ds)->ds_props_obj == 0) { dmu_buf_will_dirty(ds->ds_dbuf, tx); @@ -672,7 +674,7 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, if (isint) { VERIFY0(dsl_prop_get_int_ds(ds, propname, &intval)); - if (ds->ds_is_snapshot) { + if (dsl_dataset_is_snapshot(ds)) { dsl_prop_cb_record_t *cbr; /* * It's a snapshot; nothing can inherit this @@ -895,7 +897,7 @@ dsl_props_set_check(void *arg, dmu_tx_t *tx) } } - if (ds->ds_is_snapshot && version < SPA_VERSION_SNAP_PROPS) { + if (dsl_dataset_is_snapshot(ds) && version < SPA_VERSION_SNAP_PROPS) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENOTSUP)); } @@ -1132,7 +1134,7 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp, VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - if (ds->ds_is_snapshot) + if 
(dsl_dataset_is_snapshot(ds)) flags |= DSL_PROP_GET_SNAPSHOT; ASSERT(dsl_pool_config_held(dp)); diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 6c8d34fbc04c..58e760449255 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -415,7 +415,7 @@ static uint64_t dsl_scan_ds_maxtxg(dsl_dataset_t *ds) { uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; - if (ds->ds_is_snapshot) + if (dsl_dataset_is_snapshot(ds)) return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg)); return (smt); } @@ -865,7 +865,7 @@ dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) return; if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { - if (ds->ds_is_snapshot) { + if (dsl_dataset_is_snapshot(ds)) { /* Note, scn_cur_{min,max}_txg stays the same. */ scn->scn_phys.scn_bookmark.zb_objset = dsl_dataset_phys(ds)->ds_next_snap_obj; @@ -887,7 +887,7 @@ dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); - if (ds->ds_is_snapshot) { + if (dsl_dataset_is_snapshot(ds)) { /* * We keep the same mintxg; it could be > * ds_creation_txg if the previous snapshot was @@ -1073,7 +1073,7 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) * ZIL here, rather than in scan_recurse(), because the regular * snapshot block-sharing rules don't apply to it. 
*/ - if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !ds->ds_is_snapshot) + if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds)) dsl_scan_zil(dp, &os->os_zil_header); /* diff --git a/module/zfs/dsl_userhold.c b/module/zfs/dsl_userhold.c index 1b234ed480f9..007f525d1928 100644 --- a/module/zfs/dsl_userhold.c +++ b/module/zfs/dsl_userhold.c @@ -355,7 +355,7 @@ dsl_dataset_user_release_check_one(dsl_dataset_user_release_arg_t *ddura, objset_t *mos; int numholds; - if (!ds->ds_is_snapshot) + if (!dsl_dataset_is_snapshot(ds)) return (SET_ERROR(EINVAL)); if (nvlist_empty(holds)) diff --git a/module/zfs/sa.c b/module/zfs/sa.c index 2383252e2447..94730d020c63 100644 --- a/module/zfs/sa.c +++ b/module/zfs/sa.c @@ -1301,10 +1301,10 @@ sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype) } /*ARGSUSED*/ -static void -sa_evict(void *dbu) +void +sa_evict(dmu_buf_t *db, void *sap) { - panic("evicting sa dbuf\n"); + panic("evicting sa dbuf %p\n", (void *)db); } static void @@ -1356,10 +1356,9 @@ sa_spill_rele(sa_handle_t *hdl) void sa_handle_destroy(sa_handle_t *hdl) { - dmu_buf_t *db = hdl->sa_bonus; - mutex_enter(&hdl->sa_lock); - (void) dmu_buf_remove_user(db, &hdl->sa_dbu); + (void) dmu_buf_update_user((dmu_buf_t *)hdl->sa_bonus, hdl, + NULL, NULL); if (hdl->sa_bonus_tab) sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); @@ -1381,7 +1380,7 @@ sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp, sa_handle_type_t hdl_type, sa_handle_t **handlepp) { int error = 0; - sa_handle_t *handle = NULL; + sa_handle_t *handle; #ifdef ZFS_DEBUG dmu_object_info_t doi; @@ -1392,14 +1391,10 @@ sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp, /* find handle, if it exists */ /* if one doesn't exist then create a new one, and initialize it */ - if (hdl_type == SA_HDL_SHARED) - handle = dmu_buf_get_user(db); - + handle = (hdl_type == SA_HDL_SHARED) ? 
dmu_buf_get_user(db) : NULL; if (handle == NULL) { - sa_handle_t *winner = NULL; - + sa_handle_t *newhandle; handle = kmem_cache_alloc(sa_cache, KM_SLEEP); - handle->sa_dbu.dbu_evict_func = NULL; handle->sa_userp = userp; handle->sa_bonus = db; handle->sa_os = os; @@ -1408,15 +1403,12 @@ sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp, handle->sa_spill_tab = NULL; error = sa_build_index(handle, SA_BONUS); + newhandle = (hdl_type == SA_HDL_SHARED) ? + dmu_buf_set_user_ie(db, handle, sa_evict) : NULL; - if (hdl_type == SA_HDL_SHARED) { - dmu_buf_init_user(&handle->sa_dbu, sa_evict, NULL); - winner = dmu_buf_set_user_ie(db, &handle->sa_dbu); - } - - if (winner != NULL) { + if (newhandle != NULL) { kmem_cache_free(sa_cache, handle); - handle = winner; + handle = newhandle; } } *handlepp = handle; @@ -1948,6 +1940,14 @@ sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks) blksize, nblocks); } +void +sa_update_user(sa_handle_t *newhdl, sa_handle_t *oldhdl) +{ + (void) dmu_buf_update_user((dmu_buf_t *)newhdl->sa_bonus, + oldhdl, newhdl, sa_evict); + oldhdl->sa_bonus = NULL; +} + void sa_set_userp(sa_handle_t *hdl, void *ptr) { @@ -2046,6 +2046,7 @@ EXPORT_SYMBOL(sa_size); EXPORT_SYMBOL(sa_update_from_cb); EXPORT_SYMBOL(sa_object_info); EXPORT_SYMBOL(sa_object_size); +EXPORT_SYMBOL(sa_update_user); EXPORT_SYMBOL(sa_get_userdata); EXPORT_SYMBOL(sa_set_userp); EXPORT_SYMBOL(sa_get_db); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 07a48c13a0ac..b9a81d371113 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1125,8 +1125,6 @@ spa_activate(spa_t *spa, int mode) list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); - list_create(&spa->spa_evicting_os_list, sizeof (objset_t), - offsetof(objset_t, os_evicting_node)); list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_state_dirty_node)); @@ -1155,12 +1153,9 @@ spa_deactivate(spa_t *spa) 
ASSERT(spa->spa_async_zio_root == NULL); ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); - spa_evicting_os_wait(spa); - txg_list_destroy(&spa->spa_vdev_txg_list); list_destroy(&spa->spa_config_dirty_list); - list_destroy(&spa->spa_evicting_os_list); list_destroy(&spa->spa_state_dirty_list); taskq_cancel_id(system_taskq, spa->spa_deadman_tqid); @@ -2173,11 +2168,6 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, mosconfig, &ereport); } - /* - * Don't count references from objsets that are already closed - * and are making their way through the eviction process. - */ - spa_evicting_os_wait(spa); spa->spa_minref = refcount_count(&spa->spa_refcount); if (error) { if (error != EEXIST) { @@ -3819,11 +3809,6 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa_history_log_version(spa, "create"); - /* - * Don't count references from objsets that are already closed - * and are making their way through the eviction process. - */ - spa_evicting_os_wait(spa); spa->spa_minref = refcount_count(&spa->spa_refcount); mutex_exit(&spa_namespace_lock); @@ -4380,10 +4365,8 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, * modify its state. Objsets may be open only because they're dirty, * so we have to force it to sync before checking spa_refcnt. 
*/ - if (spa->spa_sync_on) { + if (spa->spa_sync_on) txg_wait_synced(spa->spa_dsl_pool, 0); - spa_evicting_os_wait(spa); - } /* * A pool cannot be exported or destroyed if there are active diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 409dce121212..a0e7b47f1054 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -551,7 +551,6 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); @@ -561,7 +560,6 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); - cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); @@ -651,7 +649,6 @@ spa_remove(spa_t *spa) ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); - ASSERT3U(refcount_count(&spa->spa_refcount), ==, 0); nvlist_free(spa->spa_config_splitting); @@ -684,7 +681,6 @@ spa_remove(spa_t *spa) bplist_destroy(&spa->spa_free_bplist[t]); cv_destroy(&spa->spa_async_cv); - cv_destroy(&spa->spa_evicting_os_cv); cv_destroy(&spa->spa_proc_cv); cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); @@ -692,7 +688,6 @@ spa_remove(spa_t *spa) mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlog_lock); - mutex_destroy(&spa->spa_evicting_os_lock); 
mutex_destroy(&spa->spa_history_lock); mutex_destroy(&spa->spa_proc_lock); mutex_destroy(&spa->spa_props_lock); @@ -749,20 +744,6 @@ spa_close(spa_t *spa, void *tag) (void) refcount_remove(&spa->spa_refcount, tag); } -/* - * Remove a reference to the given spa_t held by a dsl dir that is - * being asynchronously released. Async releases occur from a taskq - * performing eviction of dsl datasets and dirs. The namespace lock - * isn't held and the hold by the object being evicted may contribute to - * spa_minref (e.g. dataset or directory released during pool export), - * so the asserts in spa_close() do not apply. - */ -void -spa_async_close(spa_t *spa, void *tag) -{ - (void) refcount_remove(&spa->spa_refcount, tag); -} - /* * Check to see if the spa refcount is zero. Must be called with * spa_namespace_lock held. We really compare against spa_minref, which is the @@ -1658,34 +1639,6 @@ spa_log_class(spa_t *spa) return (spa->spa_log_class); } -void -spa_evicting_os_register(spa_t *spa, objset_t *os) -{ - mutex_enter(&spa->spa_evicting_os_lock); - list_insert_head(&spa->spa_evicting_os_list, os); - mutex_exit(&spa->spa_evicting_os_lock); -} - -void -spa_evicting_os_deregister(spa_t *spa, objset_t *os) -{ - mutex_enter(&spa->spa_evicting_os_lock); - list_remove(&spa->spa_evicting_os_list, os); - cv_broadcast(&spa->spa_evicting_os_cv); - mutex_exit(&spa->spa_evicting_os_lock); -} - -void -spa_evicting_os_wait(spa_t *spa) -{ - mutex_enter(&spa->spa_evicting_os_lock); - while (!list_is_empty(&spa->spa_evicting_os_list)) - cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock); - mutex_exit(&spa->spa_evicting_os_lock); - - dmu_buf_user_evict_wait(); -} - int spa_max_replication(spa_t *spa) { diff --git a/module/zfs/zap.c b/module/zfs/zap.c index c5ea392b6a1d..bc1d57899fc1 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -53,6 +53,7 @@ int fzap_default_block_shift = 14; /* 16k blocksize */ extern inline zap_phys_t *zap_f_phys(zap_t *zap); +static void 
zap_leaf_pageout(dmu_buf_t *db, void *vl); static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); void @@ -81,7 +82,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); zap->zap_ismicro = FALSE; - zap->zap_dbu.dbu_evict_func = zap_evict; + (void) dmu_buf_update_user(zap->zap_dbuf, zap, zap, zap_evict); mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1; @@ -387,20 +388,11 @@ zap_allocate_blocks(zap_t *zap, int nblocks) return (newblk); } -static void -zap_leaf_pageout(void *dbu) -{ - zap_leaf_t *l = dbu; - - rw_destroy(&l->l_rwlock); - kmem_free(l, sizeof (zap_leaf_t)); -} - static zap_leaf_t * zap_create_leaf(zap_t *zap, dmu_tx_t *tx) { void *winner; - zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); + zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); @@ -412,8 +404,7 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx) VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, DMU_READ_NO_PREFETCH)); - dmu_buf_init_user(&l->l_dbu, zap_leaf_pageout, &l->l_dbuf); - winner = dmu_buf_set_user(l->l_dbuf, &l->l_dbu); + winner = dmu_buf_set_user(l->l_dbuf, l, zap_leaf_pageout); ASSERT(winner == NULL); dmu_buf_will_dirty(l->l_dbuf, tx); @@ -445,6 +436,16 @@ zap_put_leaf(zap_leaf_t *l) dmu_buf_rele(l->l_dbuf, NULL); } +_NOTE(ARGSUSED(0)) +static void +zap_leaf_pageout(dmu_buf_t *db, void *vl) +{ + zap_leaf_t *l = vl; + + rw_destroy(&l->l_rwlock); + kmem_free(l, sizeof (zap_leaf_t)); +} + static zap_leaf_t * zap_open_leaf(uint64_t blkid, dmu_buf_t *db) { @@ -452,20 +453,19 @@ zap_open_leaf(uint64_t blkid, dmu_buf_t *db) ASSERT(blkid != 0); - l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); + l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP); rw_init(&l->l_rwlock, NULL, RW_DEFAULT, NULL); rw_enter(&l->l_rwlock, RW_WRITER); l->l_blkid = blkid; 
l->l_bs = highbit64(db->db_size) - 1; l->l_dbuf = db; - dmu_buf_init_user(&l->l_dbu, zap_leaf_pageout, &l->l_dbuf); - winner = dmu_buf_set_user(db, &l->l_dbu); + winner = dmu_buf_set_user(db, l, zap_leaf_pageout); rw_exit(&l->l_rwlock); if (winner != NULL) { /* someone else set it first */ - zap_leaf_pageout(&l->l_dbu); + zap_leaf_pageout(NULL, l); l = winner; } diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index 29406e660c5b..b6013b2e6983 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -388,8 +388,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) * it, because zap_lockdir() checks zap_ismicro without the lock * held. */ - dmu_buf_init_user(&zap->zap_dbu, zap_evict, &zap->zap_dbuf); - winner = dmu_buf_set_user(db, &zap->zap_dbu); + winner = dmu_buf_set_user(db, zap, zap_evict); if (winner != NULL) { rw_exit(&zap->zap_rwlock); @@ -678,10 +677,11 @@ zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) return (dmu_object_free(os, zapobj, tx)); } +_NOTE(ARGSUSED(0)) void -zap_evict(void *dbu) +zap_evict(dmu_buf_t *db, void *vzap) { - zap_t *zap = dbu; + zap_t *zap = vzap; rw_destroy(&zap->zap_rwlock); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 9d9d2edec350..0fa944076fe7 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -5392,7 +5392,7 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) return (error); error = dsl_dataset_hold(dp, lastsnap, FTAG, &new); - if (error == 0 && !new->ds_is_snapshot) { + if (error == 0 && !dsl_dataset_is_snapshot(new)) { dsl_dataset_rele(new, FTAG); error = SET_ERROR(EINVAL); } @@ -5401,7 +5401,7 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) return (error); } error = dsl_dataset_hold(dp, firstsnap, FTAG, &old); - if (error == 0 && !old->ds_is_snapshot) { + if (error == 0 && !dsl_dataset_is_snapshot(old)) { dsl_dataset_rele(old, FTAG); error = SET_ERROR(EINVAL); } diff --git 
a/module/zfs/zfs_sa.c b/module/zfs/zfs_sa.c index c9a9da7528d7..257ab4254bbd 100644 --- a/module/zfs/zfs_sa.c +++ b/module/zfs/zfs_sa.c @@ -22,7 +22,8 @@ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ -#include <sys/zfs_context.h> +#include <sys/types.h> +#include <sys/param.h> #include <sys/vnode.h> #include <sys/sa.h> #include <sys/zfs_acl.h> diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 175d2c66b4a4..2b8fc548ffc9 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -501,7 +501,7 @@ zilog_dirty(zilog_t *zilog, uint64_t txg) dsl_pool_t *dp = zilog->zl_dmu_pool; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); - if (ds->ds_is_snapshot) + if (dsl_dataset_is_snapshot(ds)) panic("dirtying snapshot!"); if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) {