LU-6155 osd-zfs: dbuf_hold_impl() called without the lock
The osd-zfs osd_count_not_mapped() calls dbuf_hold_impl() without
the required lock. In addition, dbuf_hold_impl() is an internal
function and has the expensive side effect of reading the block
from disk, which would convert a full-block write into a
read-modify-write.

Since space estimation with ZFS is complicated anyway, just use a
rough worst-case estimate in which a snapshot holds all current
blocks, i.e. no old space can be freed after the COW.
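
For illustration (hypothetical numbers, not part of the patch): with a
128KiB dataset blocksize, a 4KiB write at file offset 126KiB is now
charged roundup(4KiB + 126KiB % 128KiB, 128KiB) = roundup(130KiB, 128KiB)
= 256KiB per copy, i.e. two full blocks, regardless of how much old
space the overwrite could actually free.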

Skip test sanity-quota/23 on ZFS because overwrites on ZFS are not
guaranteed to be space neutral, and the new worst-case assumptions will
always cause this test to fail.

Change-Id: Idf6f2ff80ff185ca8c0f38e1002ff90e457c3ca0
Signed-off-by: Isaac Huang <[email protected]>
Signed-off-by: Nathaniel Clark <[email protected]>
Reviewed-on: http://review.whamcloud.com/13541
Tested-by: Jenkins
Reviewed-by: Alex Zhuravlev <[email protected]>
Tested-by: Maloo <[email protected]>
Reviewed-by: Oleg Drokin <[email protected]>
huangheintel authored and Oleg Drokin committed Oct 16, 2015
1 parent bee9c18 commit 10ebe81
Showing 3 changed files with 38 additions and 72 deletions.
6 changes: 3 additions & 3 deletions libcfs/include/libcfs/libcfs.h
@@ -87,9 +87,9 @@ static inline int __is_po2(unsigned long long val)
return !(val & (val - 1));
}

-#define IS_PO2(val) __is_po2((unsigned long long)(val))
-
-#define LOWEST_BIT_SET(x) ((x) & ~((x) - 1))
+#define IS_PO2(val) __is_po2((unsigned long long)(val))
+#define PO2_ROUNDUP_TYPED(x, po2, type) (-(-(type)(x) & -(type)(po2)))
+#define LOWEST_BIT_SET(x) ((x) & ~((x) - 1))

/* Sparse annotations */
#ifdef __KERNEL__
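
A minimal user-space sketch of what the new PO2_ROUNDUP_TYPED() macro
computes, rounding a value up to a power-of-two boundary with
two's-complement arithmetic (the main() harness and test values are
illustrative, not part of the patch):

    /* Round x up to the next multiple of a power-of-two po2: -(-x & -po2). */
    #include <stdint.h>
    #include <stdio.h>

    #define PO2_ROUNDUP_TYPED(x, po2, type) (-(-(type)(x) & -(type)(po2)))

    int main(void)
    {
            /* 130KiB rounded up to a 128KiB boundary is 256KiB */
            uint64_t r = PO2_ROUNDUP_TYPED(133120, 131072, uint64_t);

            printf("%llu\n", (unsigned long long)r); /* prints 262144 */
            return 0;
    }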
100 changes: 31 additions & 69 deletions lustre/osd-zfs/osd_io.c
@@ -537,84 +537,44 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt,
return 0;
}

-/* Return number of blocks that aren't mapped in the [start, start + size]
- * region */
-static int osd_count_not_mapped(struct osd_object *obj, uint64_t start,
-				uint32_t size)
-{
-	dmu_buf_impl_t	*dbi = (dmu_buf_impl_t *)obj->oo_db;
-	dmu_buf_impl_t	*db;
-	dnode_t		*dn;
-	uint32_t	 blkshift;
-	uint64_t	 end, blkid;
-	int		 rc;
-	ENTRY;
-
-	DB_DNODE_ENTER(dbi);
-	dn = DB_DNODE(dbi);
-
-	if (dn->dn_maxblkid == 0) {
-		if (start + size <= dn->dn_datablksz)
-			GOTO(out, size = 0);
-		if (start < dn->dn_datablksz)
-			start = dn->dn_datablksz;
-		/* assume largest block size */
-		blkshift = osd_spa_maxblockshift(
-			dmu_objset_spa(osd_obj2dev(obj)->od_os));
-	} else {
-		/* blocksize can't change */
-		blkshift = dn->dn_datablkshift;
-	}
-
-	/* compute address of last block */
-	end = (start + size - 1) >> blkshift;
-	/* align start on block boundaries */
-	start >>= blkshift;
-
-	/* size is null, can't be mapped */
-	if (obj->oo_attr.la_size == 0 || dn->dn_maxblkid == 0)
-		GOTO(out, size = (end - start + 1) << blkshift);
-
-	/* beyond EOF, can't be mapped */
-	if (start > dn->dn_maxblkid)
-		GOTO(out, size = (end - start + 1) << blkshift);
-
-	size = 0;
-	for (blkid = start; blkid <= end; blkid++) {
-		if (blkid == dn->dn_maxblkid)
-			/* this one is mapped for sure */
-			continue;
-		if (blkid > dn->dn_maxblkid) {
-			size += (end - blkid + 1) << blkshift;
-			GOTO(out, size);
-		}
-
-		rc = dbuf_hold_impl(dn, 0, blkid, TRUE, FTAG, &db);
-		if (rc) {
-			/* for ENOENT (block not mapped) and any other errors,
-			 * assume the block isn't mapped */
-			size += 1 << blkshift;
-			continue;
-		}
-		dbuf_rele(db, FTAG);
-	}
-
-	GOTO(out, size);
-out:
-	DB_DNODE_EXIT(dbi);
-	return size;
-}
+static inline uint32_t osd_get_blocksz(struct osd_object *obj)
+{
+	uint32_t blksz;
+	u_longlong_t unused;
+
+	LASSERT(obj->oo_db);
+
+	dmu_object_size_from_db(obj->oo_db, &blksz, &unused);
+	return blksz;
+}
+
+static inline uint64_t osd_roundup2blocksz(uint64_t size,
+					   uint64_t offset,
+					   uint32_t blksz)
+{
+	LASSERT(blksz > 0);
+
+	size += offset % blksz;
+
+	if (likely(IS_PO2(blksz)))
+		return PO2_ROUNDUP_TYPED(size, blksz, uint64_t);
+
+	size += blksz - 1;
+	do_div(size, blksz);
+	return size * blksz;
+}

static int osd_declare_write_commit(const struct lu_env *env,
-				struct dt_object *dt,
-				struct niobuf_local *lnb, int npages,
-				struct thandle *th)
+				    struct dt_object *dt,
+				    struct niobuf_local *lnb, int npages,
+				    struct thandle *th)
{
	struct osd_object *obj = osd_dt_obj(dt);
	struct osd_device *osd = osd_obj2dev(obj);
	struct osd_thandle *oh;
	uint64_t offset = 0;
	uint32_t size = 0;
+	uint32_t blksz = osd_get_blocksz(obj);
	int i, rc, flags = 0;
	bool ignore_quota = false, synced = false;
	long long space = 0;
@@ -667,11 +627,13 @@ static int osd_declare_write_commit(const struct lu_env *env,

	dmu_tx_hold_write(oh->ot_tx, obj->oo_db->db_object,
			  offset, size);
-	/* estimating space that will be consumed by a write is rather
+	/* Estimating space to be consumed by a write is rather
	 * complicated with ZFS. As a consequence, we don't account for
-	 * indirect blocks and quota overrun will be adjusted once the
-	 * operation is committed, if required. */
-	space += osd_count_not_mapped(obj, offset, size);
+	 * indirect blocks and just use as a rough estimate the worse
+	 * case where the old space is being held by a snapshot. Quota
+	 * overrun will be adjusted once the operation is committed, if
+	 * required. */
+	space += osd_roundup2blocksz(size, offset, blksz);

	offset = lnb[i].lnb_file_offset;
	size = lnb[i].lnb_len;
@@ -680,7 +642,7 @@ static int osd_declare_write_commit(const struct lu_env *env,
	if (size) {
		dmu_tx_hold_write(oh->ot_tx, obj->oo_db->db_object,
				  offset, size);
-		space += osd_count_not_mapped(obj, offset, size);
+		space += osd_roundup2blocksz(size, offset, blksz);
	}

dmu_tx_hold_sa(oh->ot_tx, obj->oo_sa_hdl, 0);
@@ -691,8 +653,8 @@ static int osd_declare_write_commit(const struct lu_env *env,
	 * copies */
	space *= osd->od_os->os_copies;
	space = toqb(space);
-	CDEBUG(D_QUOTA, "writting %d pages, reserving "LPD64"K of quota "
-	       "space\n", npages, space);
+	CDEBUG(D_QUOTA, "writing %d pages, reserving "LPD64"K of quota space\n",
+	       npages, space);

record_start_io(osd, WRITE, discont_pages);
retry:
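
For context, a standalone user-space sketch of the arithmetic the new
osd_roundup2blocksz() helper performs (the Lustre/ZFS types, LASSERT,
IS_PO2, PO2_ROUNDUP_TYPED and do_div are replaced by plain C stand-ins;
the test values and main() harness are illustrative, not part of the
patch):

    /* Charge a write for whole blocks: the worst case where the blocks
     * it overwrites stay pinned by a snapshot. */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t roundup2blocksz(uint64_t size, uint64_t offset,
                                    uint32_t blksz)
    {
            assert(blksz > 0);

            /* include the partial leading block touched by the write */
            size += offset % blksz;

            if ((blksz & (blksz - 1)) == 0) /* power-of-two blocksize */
                    return (size + blksz - 1) & ~((uint64_t)blksz - 1);

            /* non-power-of-two blocksize: round up by division */
            return ((size + blksz - 1) / blksz) * blksz;
    }

    int main(void)
    {
            /* a 4KiB write at offset 126KiB with 128KiB blocks covers 2 blocks */
            printf("%llu\n", (unsigned long long)
                   roundup2blocksz(4096, 129024, 131072)); /* prints 262144 */
            return 0;
    }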
4 changes: 4 additions & 0 deletions lustre/tests/sanity-quota.sh
@@ -1952,6 +1952,10 @@ test_23_sub() {
}

test_23() {
+	[ $(facet_fstype ost1) == "zfs" ] &&
+		skip "Overwrite in place is not guaranteed to be " \
+			"space neutral on ZFS" && return
+
local OST0_MIN=$((6 * 1024)) # 6MB, extra space for meta blocks.
check_whether_skip && return 0
log "run for 4MB test file"
