From 69766a55f87b8f06d3d58f779f5222dfa84594ae Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 19 Nov 2019 12:07:33 +0000 Subject: [PATCH 01/47] Btrfs: fix missing hole after hole punching and fsync when using NO_HOLES When using the NO_HOLES feature, if we punch a hole into a file and then fsync it, there are cases where a subsequent fsync will miss the fact that a hole was punched, resulting in the holes not existing after replaying the log tree. Essentially these cases all imply that, tree-log.c:copy_items(), is not invoked for the leafs that delimit holes, because nothing changed those leafs in the current transaction. And it's precisely copy_items() where we currenly detect and log holes, which works as long as the holes are between file extent items in the input leaf or between the beginning of input leaf and the previous leaf or between the last item in the leaf and the next leaf. First example where we miss a hole: *) The extent items of the inode span multiple leafs; *) The punched hole covers a range that affects only the extent items of the first leaf; *) The fsync operation is done in full mode (BTRFS_INODE_NEEDS_FULL_SYNC is set in the inode's runtime flags). That results in the hole not existing after replaying the log tree. For example, if the fs/subvolume tree has the following layout for a particular inode: Leaf N, generation 10: [ ... INODE_ITEM INODE_REF EXTENT_ITEM (0 64K) EXTENT_ITEM (64K 128K) ] Leaf N + 1, generation 10: [ EXTENT_ITEM (128K 64K) ... ] If at transaction 11 we punch a hole coverting the range [0, 128K[, we end up dropping the two extent items from leaf N, but we don't touch the other leaf, so we end up in the following state: Leaf N, generation 11: [ ... INODE_ITEM INODE_REF ] Leaf N + 1, generation 10: [ EXTENT_ITEM (128K 64K) ... ] A full fsync after punching the hole will only process leaf N because it was modified in the current transaction, but not leaf N + 1, since it was not modified in the current transaction (generation 10 and not 11). As a result the fsync will not log any holes, because it didn't process any leaf with extent items. Second example where we will miss a hole: *) An inode as its items spanning 5 (or more) leafs; *) A hole is punched and it covers only the extents items of the 3rd leaf. This resulsts in deleting the entire leaf and not touching any of the other leafs. So the only leaf that is modified in the current transaction, when punching the hole, is the first leaf, which contains the inode item. During the full fsync, the only leaf that is passed to copy_items() is that first leaf, and that's not enough for the hole detection code in copy_items() to determine there's a hole between the last file extent item in the 2nd leaf and the first file extent item in the 3rd leaf (which was the 4th leaf before punching the hole). Fix this by scanning all leafs and punch holes as necessary when doing a full fsync (less common than a non-full fsync) when the NO_HOLES feature is enabled. The lack of explicit file extent items to mark holes makes it necessary to scan existing extents to determine if holes exist. A test case for fstests follows soon. Fixes: 16e7549f045d33 ("Btrfs: incompatible format change to remove hole extents") CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 388 ++++++++++++-------------------------------- 1 file changed, 100 insertions(+), 288 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d3f115909ff05a..44480762003cd3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3935,7 +3935,7 @@ static int log_csums(struct btrfs_trans_handle *trans, static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *dst_path, - struct btrfs_path *src_path, u64 *last_extent, + struct btrfs_path *src_path, int start_slot, int nr, int inode_only, u64 logged_isize) { @@ -3946,7 +3946,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item *extent; struct btrfs_inode_item *inode_item; struct extent_buffer *src = src_path->nodes[0]; - struct btrfs_key first_key, last_key, key; int ret; struct btrfs_key *ins_keys; u32 *ins_sizes; @@ -3954,9 +3953,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, int i; struct list_head ordered_sums; int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; - bool has_extents = false; - bool need_find_last_extent = true; - bool done = false; INIT_LIST_HEAD(&ordered_sums); @@ -3965,8 +3961,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, if (!ins_data) return -ENOMEM; - first_key.objectid = (u64)-1; - ins_sizes = (u32 *)ins_data; ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); @@ -3987,9 +3981,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src_offset = btrfs_item_ptr_offset(src, start_slot + i); - if (i == nr - 1) - last_key = ins_keys[i]; - if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_path->slots[0], @@ -4003,20 +3994,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src_offset, ins_sizes[i]); } - /* - * We set need_find_last_extent here in case we know we were - * processing other items and then walk into the first extent in - * the inode. If we don't hit an extent then nothing changes, - * we'll do the last search the next time around. - */ - if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { - has_extents = true; - if (first_key.objectid == (u64)-1) - first_key = ins_keys[i]; - } else { - need_find_last_extent = false; - } - /* take a reference on file data extents so that truncates * or deletes of this inode don't have to relog the inode * again @@ -4082,167 +4059,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, kfree(sums); } - if (!has_extents) - return ret; - - if (need_find_last_extent && *last_extent == first_key.offset) { - /* - * We don't have any leafs between our current one and the one - * we processed before that can have file extent items for our - * inode (and have a generation number smaller than our current - * transaction id). - */ - need_find_last_extent = false; - } - - /* - * Because we use btrfs_search_forward we could skip leaves that were - * not modified and then assume *last_extent is valid when it really - * isn't. So back up to the previous leaf and read the end of the last - * extent before we go and fill in holes. - */ - if (need_find_last_extent) { - u64 len; - - ret = btrfs_prev_leaf(inode->root, src_path); - if (ret < 0) - return ret; - if (ret) - goto fill_holes; - if (src_path->slots[0]) - src_path->slots[0]--; - src = src_path->nodes[0]; - btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); - if (key.objectid != btrfs_ino(inode) || - key.type != BTRFS_EXTENT_DATA_KEY) - goto fill_holes; - extent = btrfs_item_ptr(src, src_path->slots[0], - struct btrfs_file_extent_item); - if (btrfs_file_extent_type(src, extent) == - BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_ram_bytes(src, extent); - *last_extent = ALIGN(key.offset + len, - fs_info->sectorsize); - } else { - len = btrfs_file_extent_num_bytes(src, extent); - *last_extent = key.offset + len; - } - } -fill_holes: - /* So we did prev_leaf, now we need to move to the next leaf, but a few - * things could have happened - * - * 1) A merge could have happened, so we could currently be on a leaf - * that holds what we were copying in the first place. - * 2) A split could have happened, and now not all of the items we want - * are on the same leaf. - * - * So we need to adjust how we search for holes, we need to drop the - * path and re-search for the first extent key we found, and then walk - * forward until we hit the last one we copied. - */ - if (need_find_last_extent) { - /* btrfs_prev_leaf could return 1 without releasing the path */ - btrfs_release_path(src_path); - ret = btrfs_search_slot(NULL, inode->root, &first_key, - src_path, 0, 0); - if (ret < 0) - return ret; - ASSERT(ret == 0); - src = src_path->nodes[0]; - i = src_path->slots[0]; - } else { - i = start_slot; - } - - /* - * Ok so here we need to go through and fill in any holes we may have - * to make sure that holes are punched for those areas in case they had - * extents previously. - */ - while (!done) { - u64 offset, len; - u64 extent_end; - - if (i >= btrfs_header_nritems(src_path->nodes[0])) { - ret = btrfs_next_leaf(inode->root, src_path); - if (ret < 0) - return ret; - ASSERT(ret == 0); - src = src_path->nodes[0]; - i = 0; - need_find_last_extent = true; - } - - btrfs_item_key_to_cpu(src, &key, i); - if (!btrfs_comp_cpu_keys(&key, &last_key)) - done = true; - if (key.objectid != btrfs_ino(inode) || - key.type != BTRFS_EXTENT_DATA_KEY) { - i++; - continue; - } - extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(src, extent) == - BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_ram_bytes(src, extent); - extent_end = ALIGN(key.offset + len, - fs_info->sectorsize); - } else { - len = btrfs_file_extent_num_bytes(src, extent); - extent_end = key.offset + len; - } - i++; - - if (*last_extent == key.offset) { - *last_extent = extent_end; - continue; - } - offset = *last_extent; - len = key.offset - *last_extent; - ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), - offset, 0, 0, len, 0, len, 0, 0, 0); - if (ret) - break; - *last_extent = extent_end; - } - - /* - * Check if there is a hole between the last extent found in our leaf - * and the first extent in the next leaf. If there is one, we need to - * log an explicit hole so that at replay time we can punch the hole. - */ - if (ret == 0 && - key.objectid == btrfs_ino(inode) && - key.type == BTRFS_EXTENT_DATA_KEY && - i == btrfs_header_nritems(src_path->nodes[0])) { - ret = btrfs_next_leaf(inode->root, src_path); - need_find_last_extent = true; - if (ret > 0) { - ret = 0; - } else if (ret == 0) { - btrfs_item_key_to_cpu(src_path->nodes[0], &key, - src_path->slots[0]); - if (key.objectid == btrfs_ino(inode) && - key.type == BTRFS_EXTENT_DATA_KEY && - *last_extent < key.offset) { - const u64 len = key.offset - *last_extent; - - ret = btrfs_insert_file_extent(trans, log, - btrfs_ino(inode), - *last_extent, 0, - 0, len, 0, len, - 0, 0, 0); - *last_extent += len; - } - } - } - /* - * Need to let the callers know we dropped the path so they should - * re-search. - */ - if (!ret && need_find_last_extent) - ret = 1; return ret; } @@ -4407,7 +4223,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, const u64 i_size = i_size_read(&inode->vfs_inode); const u64 ino = btrfs_ino(inode); struct btrfs_path *dst_path = NULL; - u64 last_extent = (u64)-1; + bool dropped_extents = false; int ins_nr = 0; int start_slot; int ret; @@ -4429,8 +4245,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, if (slot >= btrfs_header_nritems(leaf)) { if (ins_nr > 0) { ret = copy_items(trans, inode, dst_path, path, - &last_extent, start_slot, - ins_nr, 1, 0); + start_slot, ins_nr, 1, 0); if (ret < 0) goto out; ins_nr = 0; @@ -4454,8 +4269,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, path->slots[0]++; continue; } - if (last_extent == (u64)-1) { - last_extent = key.offset; + if (!dropped_extents) { /* * Avoid logging extent items logged in past fsync calls * and leading to duplicate keys in the log tree. @@ -4469,6 +4283,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, } while (ret == -EAGAIN); if (ret) goto out; + dropped_extents = true; } if (ins_nr == 0) start_slot = slot; @@ -4483,7 +4298,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, } } if (ins_nr > 0) { - ret = copy_items(trans, inode, dst_path, path, &last_extent, + ret = copy_items(trans, inode, dst_path, path, start_slot, ins_nr, 1, 0); if (ret > 0) ret = 0; @@ -4670,13 +4485,8 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, if (slot >= nritems) { if (ins_nr > 0) { - u64 last_extent = 0; - ret = copy_items(trans, inode, dst_path, path, - &last_extent, start_slot, - ins_nr, 1, 0); - /* can't be 1, extent items aren't processed */ - ASSERT(ret <= 0); + start_slot, ins_nr, 1, 0); if (ret < 0) return ret; ins_nr = 0; @@ -4700,13 +4510,8 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, cond_resched(); } if (ins_nr > 0) { - u64 last_extent = 0; - ret = copy_items(trans, inode, dst_path, path, - &last_extent, start_slot, - ins_nr, 1, 0); - /* can't be 1, extent items aren't processed */ - ASSERT(ret <= 0); + start_slot, ins_nr, 1, 0); if (ret < 0) return ret; } @@ -4715,100 +4520,119 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, } /* - * If the no holes feature is enabled we need to make sure any hole between the - * last extent and the i_size of our inode is explicitly marked in the log. This - * is to make sure that doing something like: - * - * 1) create file with 128Kb of data - * 2) truncate file to 64Kb - * 3) truncate file to 256Kb - * 4) fsync file - * 5) - * 6) mount fs and trigger log replay - * - * Will give us a file with a size of 256Kb, the first 64Kb of data match what - * the file had in its first 64Kb of data at step 1 and the last 192Kb of the - * file correspond to a hole. The presence of explicit holes in a log tree is - * what guarantees that log replay will remove/adjust file extent items in the - * fs/subvol tree. - * - * Here we do not need to care about holes between extents, that is already done - * by copy_items(). We also only need to do this in the full sync path, where we - * lookup for extents from the fs/subvol tree only. In the fast path case, we - * lookup the list of modified extent maps and if any represents a hole, we - * insert a corresponding extent representing a hole in the log tree. + * When using the NO_HOLES feature if we punched a hole that causes the + * deletion of entire leafs or all the extent items of the first leaf (the one + * that contains the inode item and references) we may end up not processing + * any extents, because there are no leafs with a generation matching the + * current transaction that have extent items for our inode. So we need to find + * if any holes exist and then log them. We also need to log holes after any + * truncate operation that changes the inode's size. */ -static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_inode *inode, - struct btrfs_path *path) +static int btrfs_log_holes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_inode *inode, + struct btrfs_path *path) { struct btrfs_fs_info *fs_info = root->fs_info; - int ret; struct btrfs_key key; - u64 hole_start; - u64 hole_size; - struct extent_buffer *leaf; - struct btrfs_root *log = root->log_root; const u64 ino = btrfs_ino(inode); const u64 i_size = i_size_read(&inode->vfs_inode); + u64 prev_extent_end = 0; + int ret; - if (!btrfs_fs_incompat(fs_info, NO_HOLES)) + if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) return 0; key.objectid = ino; key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = (u64)-1; + key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - ASSERT(ret != 0); if (ret < 0) return ret; - ASSERT(path->slots[0] > 0); - path->slots[0]--; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - - if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { - /* inode does not have any extents */ - hole_start = 0; - hole_size = i_size; - } else { + while (true) { struct btrfs_file_extent_item *extent; + struct extent_buffer *leaf = path->nodes[0]; u64 len; - /* - * If there's an extent beyond i_size, an explicit hole was - * already inserted by copy_items(). - */ - if (key.offset >= i_size) - return 0; + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + if (ret > 0) { + ret = 0; + break; + } + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) + break; + + /* We have a hole, log it. */ + if (prev_extent_end < key.offset) { + const u64 hole_len = key.offset - prev_extent_end; + + /* + * Release the path to avoid deadlocks with other code + * paths that search the root while holding locks on + * leafs from the log root. + */ + btrfs_release_path(path); + ret = btrfs_insert_file_extent(trans, root->log_root, + ino, prev_extent_end, 0, + 0, hole_len, 0, hole_len, + 0, 0, 0); + if (ret < 0) + return ret; + + /* + * Search for the same key again in the root. Since it's + * an extent item and we are holding the inode lock, the + * key must still exist. If it doesn't just emit warning + * and return an error to fall back to a transaction + * commit. + */ + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + if (WARN_ON(ret > 0)) + return -ENOENT; + leaf = path->nodes[0]; + } extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, extent) == - BTRFS_FILE_EXTENT_INLINE) - return 0; + BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_ram_bytes(leaf, extent); + prev_extent_end = ALIGN(key.offset + len, + fs_info->sectorsize); + } else { + len = btrfs_file_extent_num_bytes(leaf, extent); + prev_extent_end = key.offset + len; + } - len = btrfs_file_extent_num_bytes(leaf, extent); - /* Last extent goes beyond i_size, no need to log a hole. */ - if (key.offset + len > i_size) - return 0; - hole_start = key.offset + len; - hole_size = i_size - hole_start; + path->slots[0]++; + cond_resched(); } - btrfs_release_path(path); - /* Last extent ends at i_size. */ - if (hole_size == 0) - return 0; + if (prev_extent_end < i_size) { + u64 hole_len; - hole_size = ALIGN(hole_size, fs_info->sectorsize); - ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, - hole_size, 0, hole_size, 0, 0, 0); - return ret; + btrfs_release_path(path); + hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); + ret = btrfs_insert_file_extent(trans, root->log_root, + ino, prev_extent_end, 0, 0, + hole_len, 0, hole_len, + 0, 0, 0); + if (ret < 0) + return ret; + } + + return 0; } /* @@ -5110,7 +4934,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key min_key; struct btrfs_key max_key; struct btrfs_root *log = root->log_root; - u64 last_extent = 0; int err = 0; int ret; int nritems; @@ -5288,7 +5111,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, ins_start_slot = path->slots[0]; } ret = copy_items(trans, inode, dst_path, path, - &last_extent, ins_start_slot, + ins_start_slot, ins_nr, inode_only, logged_isize); if (ret < 0) { @@ -5311,17 +5134,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (ins_nr == 0) goto next_slot; ret = copy_items(trans, inode, dst_path, path, - &last_extent, ins_start_slot, + ins_start_slot, ins_nr, inode_only, logged_isize); if (ret < 0) { err = ret; goto out_unlock; } ins_nr = 0; - if (ret) { - btrfs_release_path(path); - continue; - } goto next_slot; } @@ -5334,18 +5153,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, goto next_slot; } - ret = copy_items(trans, inode, dst_path, path, &last_extent, + ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ins_nr, inode_only, logged_isize); if (ret < 0) { err = ret; goto out_unlock; } - if (ret) { - ins_nr = 0; - btrfs_release_path(path); - continue; - } ins_nr = 1; ins_start_slot = path->slots[0]; next_slot: @@ -5359,13 +5173,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, - &last_extent, ins_start_slot, + ins_start_slot, ins_nr, inode_only, logged_isize); if (ret < 0) { err = ret; goto out_unlock; } - ret = 0; ins_nr = 0; } btrfs_release_path(path); @@ -5380,14 +5193,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } } if (ins_nr) { - ret = copy_items(trans, inode, dst_path, path, &last_extent, + ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ins_nr, inode_only, logged_isize); if (ret < 0) { err = ret; goto out_unlock; } - ret = 0; ins_nr = 0; } @@ -5400,7 +5212,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { btrfs_release_path(path); btrfs_release_path(dst_path); - err = btrfs_log_trailing_hole(trans, root, inode, path); + err = btrfs_log_holes(trans, root, inode, path); if (err) goto out_unlock; } From 1bbbde10ac37409600e366d22b5d318649f8286d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 14 May 2019 01:59:54 +0200 Subject: [PATCH 02/47] btrfs: use raid_attr table in calc_stripe_length for nparity The table is already used for ncopies, replace open coding of stripes with the raid_attr value. Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a6d3f08bfff37b..4881b9b8ad89ef 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6476,19 +6476,14 @@ static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) { int index = btrfs_bg_flags_to_raid_index(type); int ncopies = btrfs_raid_array[index].ncopies; + const int nparity = btrfs_raid_array[index].nparity; int data_stripes; - switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { - case BTRFS_BLOCK_GROUP_RAID5: - data_stripes = num_stripes - 1; - break; - case BTRFS_BLOCK_GROUP_RAID6: - data_stripes = num_stripes - 2; - break; - default: + if (nparity) + data_stripes = num_stripes - nparity; + else data_stripes = num_stripes / ncopies; - break; - } + return div_u64(chunk_len, data_stripes); } From bee1249440aaa969abfcb2041fa6815373236ee0 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 25 Nov 2019 15:34:48 +0100 Subject: [PATCH 03/47] btrfs: fill ncopies for all raid table entries Make the number of copies explicit even for entries that use the default 0 value for consistency. Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4881b9b8ad89ef..c565650639eeda 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -66,6 +66,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 2, .devs_increment = 3, .ncopies = 3, + .nparity = 0, .raid_name = "raid1c3", .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3, .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET, @@ -78,6 +79,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 3, .devs_increment = 4, .ncopies = 4, + .nparity = 0, .raid_name = "raid1c4", .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4, .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET, From 722bd06817fdb71460df15d7ce205bf14eb17464 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 21 Nov 2019 17:33:30 +0800 Subject: [PATCH 04/47] btrfs: sysfs, rename devices kobject holder to devices_kobj The struct member btrfs_device::device_dir_kobj holds the kobj of the sysfs directory /sys/fs/btrfs/UUID/devices, so rename it from device_dir_kobj to devices_kobj. No functional changes. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 26 ++++++++++++-------------- fs/btrfs/volumes.h | 2 +- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 5ebbe8a5ee76d6..3799af42ee293f 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -734,10 +734,10 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) { - if (fs_devs->device_dir_kobj) { - kobject_del(fs_devs->device_dir_kobj); - kobject_put(fs_devs->device_dir_kobj); - fs_devs->device_dir_kobj = NULL; + if (fs_devs->devices_kobj) { + kobject_del(fs_devs->devices_kobj); + kobject_put(fs_devs->devices_kobj); + fs_devs->devices_kobj = NULL; } if (fs_devs->fsid_kobj.state_initialized) { @@ -969,15 +969,14 @@ int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, struct hd_struct *disk; struct kobject *disk_kobj; - if (!fs_devices->device_dir_kobj) + if (!fs_devices->devices_kobj) return -EINVAL; if (one_device && one_device->bdev) { disk = one_device->bdev->bd_part; disk_kobj = &part_to_dev(disk)->kobj; - sysfs_remove_link(fs_devices->device_dir_kobj, - disk_kobj->name); + sysfs_remove_link(fs_devices->devices_kobj, disk_kobj->name); } if (one_device) @@ -990,8 +989,7 @@ int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, disk = one_device->bdev->bd_part; disk_kobj = &part_to_dev(disk)->kobj; - sysfs_remove_link(fs_devices->device_dir_kobj, - disk_kobj->name); + sysfs_remove_link(fs_devices->devices_kobj, disk_kobj->name); } return 0; @@ -999,11 +997,11 @@ int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs) { - if (!fs_devs->device_dir_kobj) - fs_devs->device_dir_kobj = kobject_create_and_add("devices", - &fs_devs->fsid_kobj); + if (!fs_devs->devices_kobj) + fs_devs->devices_kobj = kobject_create_and_add("devices", + &fs_devs->fsid_kobj); - if (!fs_devs->device_dir_kobj) + if (!fs_devs->devices_kobj) return -ENOMEM; return 0; @@ -1028,7 +1026,7 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, disk = dev->bdev->bd_part; disk_kobj = &part_to_dev(disk)->kobj; - error = sysfs_create_link(fs_devices->device_dir_kobj, + error = sysfs_create_link(fs_devices->devices_kobj, disk_kobj, disk_kobj->name); if (error) break; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index fc1b564b9cfe57..3c56ef571b00be 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -255,7 +255,7 @@ struct btrfs_fs_devices { struct btrfs_fs_info *fs_info; /* sysfs kobjects */ struct kobject fsid_kobj; - struct kobject *device_dir_kobj; + struct kobject *devices_kobj; struct completion kobj_unregister; }; From 61fa4d77fef8bbfabc0415d5bf899716975916ca Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 21 Nov 2019 17:33:32 +0800 Subject: [PATCH 05/47] btrfs: sysfs, btrfs_sysfs_add_fsid() drop unused argument parent Commit 24bd69cb ("Btrfs: sysfs: add support to add parent for fsid") added parent argument in preparation to show the seed fsid under the sprout fsid as in the patch [1] in the mailing list. [1] Btrfs: sysfs: support seed devices in the sysfs layout But later this idea was superseded by another idea to rename the fsid as in the commit f93c39970b1d ("btrfs: factor out sysfs code for updating sprout fsid"). So we don't need parent argument anymore. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/sysfs.c | 11 ++++++----- fs/btrfs/sysfs.h | 5 ++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e0edfdc9c82bea..6391043266910e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3082,7 +3082,7 @@ int __cold open_ctree(struct super_block *sb, btrfs_free_extra_devids(fs_devices, 1); - ret = btrfs_sysfs_add_fsid(fs_devices, NULL); + ret = btrfs_sysfs_add_fsid(fs_devices); if (ret) { btrfs_err(fs_info, "failed to init sysfs fsid interface: %d", ret); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 3799af42ee293f..1d8ee57da164a7 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1065,18 +1065,19 @@ void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, static struct kset *btrfs_kset; /* + * Creates: + * /sys/fs/btrfs/UUID + * * Can be called by the device discovery thread. - * And parent can be specified for seed device */ -int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, - struct kobject *parent) +int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs) { int error; init_completion(&fs_devs->kobj_unregister); fs_devs->fsid_kobj.kset = btrfs_kset; - error = kobject_init_and_add(&fs_devs->fsid_kobj, - &btrfs_ktype, parent, "%pU", fs_devs->fsid); + error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, NULL, + "%pU", fs_devs->fsid); if (error) { kobject_put(&fs_devs->fsid_kobj); return error; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index e10c3adfc30fc6..a0454947f41a28 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -18,9 +18,8 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); -int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, - struct kobject *parent); -int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs); +int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); +int btrfs_sysfs_add_devices_kobj(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, const u8 *fsid); From 3d852c80097c5ac4f83f4d763f295d6242cab0b4 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 21 Nov 2019 17:33:33 +0800 Subject: [PATCH 06/47] btrfs: sysfs, rename btrfs_sysfs_add_device() btrfs_sysfs_add_device() creates the directory /sys/fs/btrfs/UUID/devices but its function name is misleading. Rename it to btrfs_sysfs_add_devices_kobj() instead. No functional changes. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/sysfs.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6391043266910e..65cfd0fd0f832a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3089,7 +3089,7 @@ int __cold open_ctree(struct super_block *sb, goto fail_block_groups; } - ret = btrfs_sysfs_add_device(fs_devices); + ret = btrfs_sysfs_add_devices_kobj(fs_devices); if (ret) { btrfs_err(fs_info, "failed to init sysfs device interface: %d", ret); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 1d8ee57da164a7..c76c38d749724b 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -995,7 +995,7 @@ int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, return 0; } -int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs) +int btrfs_sysfs_add_devices_kobj(struct btrfs_fs_devices *fs_devs) { if (!fs_devs->devices_kobj) fs_devs->devices_kobj = kobject_create_and_add("devices", From 7fc36f24a290adca9152006f0e703701aa25566f Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 21 Nov 2019 17:33:34 +0800 Subject: [PATCH 07/47] btrfs: sysfs, merge btrfs_sysfs_add devices_kobj and fsid Merge btrfs_sysfs_add_fsid() and btrfs_sysfs_add_devices_kobj() functions as these two are small and they are called one after the other. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 7 ------- fs/btrfs/sysfs.c | 21 +++++++++------------ fs/btrfs/sysfs.h | 1 - 3 files changed, 9 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 65cfd0fd0f832a..ab888d89d844c2 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3089,13 +3089,6 @@ int __cold open_ctree(struct super_block *sb, goto fail_block_groups; } - ret = btrfs_sysfs_add_devices_kobj(fs_devices); - if (ret) { - btrfs_err(fs_info, "failed to init sysfs device interface: %d", - ret); - goto fail_fsdev_sysfs; - } - ret = btrfs_sysfs_add_mounted(fs_info); if (ret) { btrfs_err(fs_info, "failed to init sysfs interface: %d", ret); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c76c38d749724b..16379f491ca1db 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -995,18 +995,6 @@ int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, return 0; } -int btrfs_sysfs_add_devices_kobj(struct btrfs_fs_devices *fs_devs) -{ - if (!fs_devs->devices_kobj) - fs_devs->devices_kobj = kobject_create_and_add("devices", - &fs_devs->fsid_kobj); - - if (!fs_devs->devices_kobj) - return -ENOMEM; - - return 0; -} - int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { @@ -1083,6 +1071,15 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs) return error; } + fs_devs->devices_kobj = kobject_create_and_add("devices", + &fs_devs->fsid_kobj); + if (!fs_devs->devices_kobj) { + btrfs_err(fs_devs->fs_info, + "failed to init sysfs device interface"); + kobject_put(&fs_devs->fsid_kobj); + return -ENOMEM; + } + return 0; } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index a0454947f41a28..3d27b39eaf9459 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -19,7 +19,6 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); -int btrfs_sysfs_add_devices_kobj(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, const u8 *fsid); From d31055abec27ac721092217a053dba64b88be2cd Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Mon, 25 Nov 2019 21:58:50 -0300 Subject: [PATCH 08/47] btrfs: qgroup: remove one-time use variables for quota_root checks Remove some variables that are set only to be checked later, and never used. Reviewed-by: Qu Wenruo Signed-off-by: Marcos Paulo de Souza Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index d4282e12f2a64a..417fafb4b4f6fc 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1243,7 +1243,6 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *quota_root; struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; @@ -1259,8 +1258,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, return -ENOMEM; mutex_lock(&fs_info->qgroup_ioctl_lock); - quota_root = fs_info->quota_root; - if (!quota_root) { + if (!fs_info->quota_root) { ret = -EINVAL; goto out; } @@ -1307,7 +1305,6 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *quota_root; struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; @@ -1320,8 +1317,7 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, if (!tmp) return -ENOMEM; - quota_root = fs_info->quota_root; - if (!quota_root) { + if (!fs_info->quota_root) { ret = -EINVAL; goto out; } @@ -1387,11 +1383,11 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) int ret = 0; mutex_lock(&fs_info->qgroup_ioctl_lock); - quota_root = fs_info->quota_root; - if (!quota_root) { + if (!fs_info->quota_root) { ret = -EINVAL; goto out; } + quota_root = fs_info->quota_root; qgroup = find_qgroup_rb(fs_info, qgroupid); if (qgroup) { ret = -EEXIST; @@ -1416,14 +1412,12 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; struct btrfs_qgroup_list *list; int ret = 0; mutex_lock(&fs_info->qgroup_ioctl_lock); - quota_root = fs_info->quota_root; - if (!quota_root) { + if (!fs_info->quota_root) { ret = -EINVAL; goto out; } @@ -1465,7 +1459,6 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, struct btrfs_qgroup_limit *limit) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; int ret = 0; /* Sometimes we would want to clear the limit on this qgroup. @@ -1475,8 +1468,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, const u64 CLEAR_VALUE = -1; mutex_lock(&fs_info->qgroup_ioctl_lock); - quota_root = fs_info->quota_root; - if (!quota_root) { + if (!fs_info->quota_root) { ret = -EINVAL; goto out; } @@ -2578,10 +2570,9 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) int btrfs_run_qgroups(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *quota_root = fs_info->quota_root; int ret = 0; - if (!quota_root) + if (!fs_info->quota_root) return ret; spin_lock(&fs_info->qgroup_lock); @@ -2875,7 +2866,6 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info, static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, enum btrfs_qgroup_rsv_type type) { - struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; struct btrfs_fs_info *fs_info = root->fs_info; u64 ref_root = root->root_key.objectid; @@ -2894,8 +2884,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, enforce = false; spin_lock(&fs_info->qgroup_lock); - quota_root = fs_info->quota_root; - if (!quota_root) + if (!fs_info->quota_root) goto out; qgroup = find_qgroup_rb(fs_info, ref_root); @@ -2962,7 +2951,6 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes, enum btrfs_qgroup_rsv_type type) { - struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; struct ulist_node *unode; struct ulist_iterator uiter; @@ -2980,8 +2968,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, } spin_lock(&fs_info->qgroup_lock); - quota_root = fs_info->quota_root; - if (!quota_root) + if (!fs_info->quota_root) goto out; qgroup = find_qgroup_rb(fs_info, ref_root); @@ -3681,7 +3668,6 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, int num_bytes) { - struct btrfs_root *quota_root = fs_info->quota_root; struct btrfs_qgroup *qgroup; struct ulist_node *unode; struct ulist_iterator uiter; @@ -3689,7 +3675,7 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, if (num_bytes == 0) return; - if (!quota_root) + if (!fs_info->quota_root) return; spin_lock(&fs_info->qgroup_lock); From 90cf1deba5eabc23c4e115d672f8c5e954c832c7 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Mon, 25 Nov 2019 21:58:51 -0300 Subject: [PATCH 09/47] btrfs: qgroup: return ENOTCONN instead of EINVAL when quotas are not enabled [PROBLEM] qgroup create/remove code is currently returning EINVAL when the user tries to create a qgroup on a subvolume without quota enabled. EINVAL is already being used for too many error scenarios so that is hard to depict what is the problem. [FIX] Currently scrub and balance code return -ENOTCONN when the user tries to cancel/pause and no scrub or balance is currently running for the desired subvolume. Do the same here by returning -ENOTCONN when a user tries to create/delete/assing/list a qgroup on a subvolume without quota enabled. Reviewed-by: Qu Wenruo Signed-off-by: Marcos Paulo de Souza Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 417fafb4b4f6fc..b046b04d7cce48 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1259,7 +1259,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) { - ret = -EINVAL; + ret = -ENOTCONN; goto out; } member = find_qgroup_rb(fs_info, src); @@ -1318,7 +1318,7 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, return -ENOMEM; if (!fs_info->quota_root) { - ret = -EINVAL; + ret = -ENOTCONN; goto out; } @@ -1384,7 +1384,7 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) { - ret = -EINVAL; + ret = -ENOTCONN; goto out; } quota_root = fs_info->quota_root; @@ -1418,7 +1418,7 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) { - ret = -EINVAL; + ret = -ENOTCONN; goto out; } @@ -1469,7 +1469,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) { - ret = -EINVAL; + ret = -ENOTCONN; goto out; } From a48967bf044e912a9104f16fc078fd83320e73a5 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 21 Nov 2019 14:03:29 +0200 Subject: [PATCH 10/47] btrfs: Don't discard unwritten extents All callers of btrfs_free_reserved_extent (respectively __btrfs_free_reserved_extent with in set to 0) pass in extents which have only been reserved but not yet written to. Namely, * in cow_file_range that function is called only if create_io_em fails or btrfs_add_ordered_extent fail, both of which happen _before_ any IO is submitted to the newly reserved range * in submit_compressed_extents the code flow is similar - out_free_reserve can be called only before btrfs_submit_compressed_write which is where any writes to the range could occur * btrfs_new_extent_direct also calls btrfs_free_reserved_extent only if extent_map fails, before any IO is issued * __btrfs_prealloc_file_range also calls btrfs_free_reserved_extent in case insertion of the metadata fails * btrfs_alloc_tree_block again can only be called in case in-memory operations fail, before any IO is submitted * btrfs_finish_ordered_io - this is the only caller where discarding the extent could have a material effect, since it can be called for an extent which was partially written. With this change the submission of discards is optimised since discards are now not being created for extents which are known to not have been touched on disk. Reviewed-by: Filipe Manana Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 -- fs/btrfs/inode.c | 11 ++++++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 274318e9114eee..3495710c72b82a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4182,8 +4182,6 @@ static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, if (pin) pin_down_extent(cache, start, len, 1); else { - if (btrfs_test_opt(fs_info, DISCARD)) - ret = btrfs_discard_extent(fs_info, start, len, NULL); btrfs_add_free_space(cache, start, len); btrfs_free_reserved_bytes(cache, len, delalloc); trace_btrfs_reserved_extent_free(fs_info, start, len); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e3c76645cad76e..e120221cc076f3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3250,10 +3250,19 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if ((ret || !logical_len) && clear_reserved_extent && !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && - !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { + /* + * Discard the range before returning it back to the + * free space pool + */ + if (ret && btrfs_test_opt(fs_info, DISCARD)) + btrfs_discard_extent(fs_info, + ordered_extent->start, + ordered_extent->disk_len, NULL); btrfs_free_reserved_extent(fs_info, ordered_extent->start, ordered_extent->disk_len, 1); + } } From 6a376729166861251e28c11923b49ddc2cbbd4a3 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 21 Nov 2019 14:03:30 +0200 Subject: [PATCH 11/47] btrfs: Open code __btrfs_free_reserved_extent in btrfs_free_reserved_extent __btrfs_free_reserved_extent performs 2 entirely different operations depending on whether its 'pin' argument is true or false. This patch lifts the 2nd case (pin is false) into it's sole caller btrfs_free_reserved_extent. No semantics changes. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3495710c72b82a..215dceaacaead1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4179,14 +4179,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, return -ENOSPC; } - if (pin) - pin_down_extent(cache, start, len, 1); - else { - btrfs_add_free_space(cache, start, len); - btrfs_free_reserved_bytes(cache, len, delalloc); - trace_btrfs_reserved_extent_free(fs_info, start, len); - } - + ret = pin_down_extent(cache, start, len, 1); btrfs_put_block_group(cache); return ret; } @@ -4194,7 +4187,20 @@ static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, int delalloc) { - return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc); + struct btrfs_block_group *cache; + + cache = btrfs_lookup_block_group(fs_info, start); + if (!cache) { + btrfs_err(fs_info, "unable to find block group for %llu", start); + return -ENOSPC; + } + + btrfs_add_free_space(cache, start, len); + btrfs_free_reserved_bytes(cache, len, delalloc); + trace_btrfs_reserved_extent_free(fs_info, start, len); + + btrfs_put_block_group(cache); + return 0; } int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, From 1da8bc05b2482c29979951b41678ea990e87ce92 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 21 Nov 2019 14:03:31 +0200 Subject: [PATCH 12/47] btrfs: Rename __btrfs_free_reserved_extent to btrfs_pin_reserved_extent __btrfs_free_reserved_extent now performs the actions of btrfs_free_and_pin_reserved_extent. But this name is a bit of a misnomer, since the extent is not really freed but just pinned. Reflect this in the new name. No semantics changes. Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 4 ++-- fs/btrfs/extent-tree.c | 30 +++++++++++------------------- fs/btrfs/tree-log.c | 12 +++++------- 3 files changed, 18 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 54efb21c272726..ea49e4b52cd26a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2449,8 +2449,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, int delalloc); -int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len); +int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, + u64 len); void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 215dceaacaead1..2a7dff22c3b743 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4165,12 +4165,10 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, return ret; } -static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len, - int pin, int delalloc) +int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 len, int delalloc) { struct btrfs_block_group *cache; - int ret = 0; cache = btrfs_lookup_block_group(fs_info, start); if (!cache) { @@ -4179,15 +4177,18 @@ static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, return -ENOSPC; } - ret = pin_down_extent(cache, start, len, 1); + btrfs_add_free_space(cache, start, len); + btrfs_free_reserved_bytes(cache, len, delalloc); + trace_btrfs_reserved_extent_free(fs_info, start, len); + btrfs_put_block_group(cache); - return ret; + return 0; } -int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len, int delalloc) +int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { struct btrfs_block_group *cache; + int ret = 0; cache = btrfs_lookup_block_group(fs_info, start); if (!cache) { @@ -4195,18 +4196,9 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, return -ENOSPC; } - btrfs_add_free_space(cache, start, len); - btrfs_free_reserved_bytes(cache, len, delalloc); - trace_btrfs_reserved_extent_free(fs_info, start, len); - + ret = pin_down_extent(cache, start, len, 1); btrfs_put_block_group(cache); - return 0; -} - -int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len) -{ - return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0); + return ret; } static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 44480762003cd3..47f4c6eea85129 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2732,9 +2732,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_and_pin_reserved_extent( - fs_info, bytenr, - blocksize); + ret = btrfs_pin_reserved_extent(fs_info, + bytenr, blocksize); if (ret) { free_extent_buffer(next); return ret; @@ -2815,8 +2814,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, } WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_and_pin_reserved_extent( - fs_info, + ret = btrfs_pin_reserved_extent(fs_info, path->nodes[*level]->start, path->nodes[*level]->len); if (ret) @@ -2898,8 +2896,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, WARN_ON(log->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_and_pin_reserved_extent(fs_info, - next->start, next->len); + ret = btrfs_pin_reserved_extent(fs_info, next->start, + next->len); if (ret) goto out; } From 42d84f021e113fd490f96dce74b06bc95e691a34 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 2 Dec 2019 11:40:13 +0200 Subject: [PATCH 13/47] btrfs: Remove WARN_ON in walk_log_tree The log_root passed to walk_log_tree is guaranteed to have its root_key.objectid always be BTRFS_TREE_LOG_OBJECTID. This is by merit that all log roots of an ordinary root are allocated in alloc_log_tree which hard-codes objectid to be BTRFS_TREE_LOG_OBJECTID. In case walk_log_tree is called for a log tree found by btrfs_read_fs_root in btrfs_recover_log_trees, that function already ensures found_key.objectid is BTRFS_TREE_LOG_OBJECTID. No functional changes. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 47f4c6eea85129..9a6b1303fcabce 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2894,8 +2894,6 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, clear_extent_buffer_dirty(next); } - WARN_ON(log->root_key.objectid != - BTRFS_TREE_LOG_OBJECTID); ret = btrfs_pin_reserved_extent(fs_info, next->start, next->len); if (ret) From aaa8e35be7cbf737547a7e59d02d9cbf94857b51 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 2 Dec 2019 11:40:14 +0200 Subject: [PATCH 14/47] btrfs: Remove redundant WARN_ON in walk_down_log_tree level <0 and level >= BTRFS_MAX_LEVEL are already performed upon extent buffer read by tree checker in btrfs_check_node. go. As far as 'level <= 0' we are guaranteed that level is '> 0' because the value of level _before_ reading 'next' is larger than 1 (otherwise we wouldn't have executed that code at all) this in turn guarantees that 'level' after btrfs_read_buffer is 'level - 1' since we verify this invariant in: btrfs_read_buffer btree_read_extent_buffer_pages btrfs_verify_level_key This guarantees that level can never be '<= 0' so the warn on is never triggered. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9a6b1303fcabce..a2bae5c230e19c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2674,14 +2674,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, u32 blocksize; int ret = 0; - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); - while (*level > 0) { struct btrfs_key first_key; - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); cur = path->nodes[*level]; WARN_ON(btrfs_header_level(cur) != *level); @@ -2748,7 +2743,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, return ret; } - WARN_ON(*level <= 0); if (path->nodes[*level-1]) free_extent_buffer(path->nodes[*level-1]); path->nodes[*level-1] = next; @@ -2756,9 +2750,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, path->slots[*level] = 0; cond_resched(); } - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); - path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); cond_resched(); From 52434fecd1d31d533f00322779b2a3e08570020d Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Tue, 3 Dec 2019 16:59:25 +0800 Subject: [PATCH 15/47] btrfs: remove unused condition check in btrfs_page_mkwrite() The condition '!ret2' is always true. commit 717beb96d969 ("Btrfs: fix regression in btrfs_page_mkwrite() from vm_fault_t conversion") left behind the check after moving this code out of the goto, so remove the unused condition check. Reviewed-by: Omar Sandoval Signed-off-by: Yunfeng Ye Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e120221cc076f3..acf2d8edaf6ccc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9080,7 +9080,6 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) ret = VM_FAULT_SIGBUS; goto out_unlock; } - ret2 = 0; /* page is wholly or partially inside EOF */ if (page_start + PAGE_SIZE > size) @@ -9104,12 +9103,10 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) unlock_extent_cached(io_tree, page_start, page_end, &cached_state); - if (!ret2) { - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - sb_end_pagefault(inode->i_sb); - extent_changeset_free(data_reserved); - return VM_FAULT_LOCKED; - } + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); + return VM_FAULT_LOCKED; out_unlock: unlock_page(page); From 502f9baeb9ea149c1a2ab7f4a4cbb49b5e6a9483 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 29 Nov 2019 12:40:59 +0800 Subject: [PATCH 16/47] btrfs: relocation: Output current relocation stage at btrfs_relocate_block_group() There are two relocation stages but both print the same message. Add the description of the stage. This can help debugging or provides informative message to users. BTRFS info (device dm-5): balance: start -d -m -s BTRFS info (device dm-5): relocating block group 30408704 flags metadata|dup BTRFS info (device dm-5): found 2 extents, stage: move data extents BTRFS info (device dm-5): relocating block group 22020096 flags system|dup BTRFS info (device dm-5): found 1 extents, stage: move data extents BTRFS info (device dm-5): relocating block group 13631488 flags data BTRFS info (device dm-5): found 1 extents, stage: move data extents BTRFS info (device dm-5): found 1 extents, stage: update data pointers BTRFS info (device dm-5): balance: ended with status: 0 Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index c58245797f30af..73f93f78d84bf2 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4291,6 +4291,15 @@ static void describe_relocation(struct btrfs_fs_info *fs_info, block_group->start, buf); } +static const char *stage_to_string(int stage) +{ + if (stage == MOVE_DATA_EXTENTS) + return "move data extents"; + if (stage == UPDATE_DATA_PTRS) + return "update data pointers"; + return "unknown"; +} + /* * function to relocate all extents in a block group. */ @@ -4365,12 +4374,15 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) rc->block_group->length); while (1) { + int finishes_stage; + mutex_lock(&fs_info->cleaner_mutex); ret = relocate_block_group(rc); mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) err = ret; + finishes_stage = rc->stage; /* * We may have gotten ENOSPC after we already dirtied some * extents. If writeout happens while we're relocating a @@ -4396,8 +4408,8 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) if (rc->extents_found == 0) break; - btrfs_info(fs_info, "found %llu extents", rc->extents_found); - + btrfs_info(fs_info, "found %llu extents, stage: %s", + rc->extents_found, stage_to_string(finishes_stage)); } WARN_ON(rc->block_group->pinned > 0); From a922659af74f60f7a5511bbaf448d32135068050 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 29 Nov 2019 11:38:13 +0200 Subject: [PATCH 17/47] btrfs: Opencode ordered_data_tree_panic It's a simple wrapper over btrfs_panic and is called only once. Just open code it. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index fb09bc2f8e4d5a..ed29c30802abc2 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -52,14 +52,6 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, return NULL; } -static void ordered_data_tree_panic(struct inode *inode, int errno, - u64 offset) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - btrfs_panic(fs_info, errno, - "Inconsistency in ordered tree at offset %llu", offset); -} - /* * look for a given offset in the tree, and if it can't be found return the * first lesser offset @@ -219,7 +211,9 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, node = tree_insert(&tree->tree, file_offset, &entry->rb_node); if (node) - ordered_data_tree_panic(inode, -EEXIST, file_offset); + btrfs_panic(fs_info, -EEXIST, + "inconsistency in ordered tree at offset %llu", + file_offset); spin_unlock_irq(&tree->lock); spin_lock(&root->ordered_extent_lock); From bfac80db7e288a4c7fd4906056ab78f25cd6feed Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Nov 2019 11:25:53 -0500 Subject: [PATCH 18/47] btrfs: don't pass system_chunk into can_overcommit We have the space_info, we can just check its flags to see if it's the system chunk space info. Reviewed-by: Nikolay Borisov Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 42 +++++++++++++++--------------------------- 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index f09aa6ee911322..537bc310a673f1 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -161,8 +161,7 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) static int can_overcommit(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 bytes, - enum btrfs_reserve_flush_enum flush, - bool system_chunk) + enum btrfs_reserve_flush_enum flush) { u64 profile; u64 avail; @@ -173,7 +172,7 @@ static int can_overcommit(struct btrfs_fs_info *fs_info, if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) return 0; - if (system_chunk) + if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) profile = btrfs_system_alloc_profile(fs_info); else profile = btrfs_metadata_alloc_profile(fs_info); @@ -227,8 +226,7 @@ void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, /* Check and see if our ticket can be satisified now. */ if ((used + ticket->bytes <= space_info->total_bytes) || - can_overcommit(fs_info, space_info, ticket->bytes, flush, - false)) { + can_overcommit(fs_info, space_info, ticket->bytes, flush)) { btrfs_space_info_update_bytes_may_use(fs_info, space_info, ticket->bytes); @@ -626,8 +624,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, static inline u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - bool system_chunk) + struct btrfs_space_info *space_info) { struct reserve_ticket *ticket; u64 used; @@ -643,13 +640,12 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); if (can_overcommit(fs_info, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL, system_chunk)) + BTRFS_RESERVE_FLUSH_ALL)) return 0; used = btrfs_space_info_used(space_info, true); - if (can_overcommit(fs_info, space_info, SZ_1M, - BTRFS_RESERVE_FLUSH_ALL, system_chunk)) + if (can_overcommit(fs_info, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL)) expected = div_factor_fine(space_info->total_bytes, 95); else expected = div_factor_fine(space_info->total_bytes, 90); @@ -665,7 +661,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, - u64 used, bool system_chunk) + u64 used) { u64 thresh = div_factor_fine(space_info->total_bytes, 98); @@ -673,8 +669,7 @@ static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) return 0; - if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info, - system_chunk)) + if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info)) return 0; return (used >= thresh && !btrfs_fs_closing(fs_info) && @@ -765,8 +760,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, - false); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); if (!to_reclaim) { space_info->flush = 0; spin_unlock(&space_info->lock); @@ -785,8 +779,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) return; } to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, - space_info, - false); + space_info); if (last_tickets_id == space_info->tickets_id) { flush_state++; } else { @@ -858,8 +851,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, int flush_state; spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, - false); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); if (!to_reclaim) { spin_unlock(&space_info->lock); return; @@ -990,8 +982,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 orig_bytes, - enum btrfs_reserve_flush_enum flush, - bool system_chunk) + enum btrfs_reserve_flush_enum flush) { struct reserve_ticket ticket; u64 used; @@ -1013,8 +1004,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, */ if (!pending_tickets && ((used + orig_bytes <= space_info->total_bytes) || - can_overcommit(fs_info, space_info, orig_bytes, flush, - system_chunk))) { + can_overcommit(fs_info, space_info, orig_bytes, flush))) { btrfs_space_info_update_bytes_may_use(fs_info, space_info, orig_bytes); ret = 0; @@ -1054,8 +1044,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * the async reclaim as we will panic. */ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && - need_do_async_reclaim(fs_info, space_info, - used, system_chunk) && + need_do_async_reclaim(fs_info, space_info, used) && !work_busy(&fs_info->async_reclaim_work)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); @@ -1092,10 +1081,9 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; int ret; - bool system_chunk = (root == fs_info->chunk_root); ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, - orig_bytes, flush, system_chunk); + orig_bytes, flush); if (ret == -ENOSPC && unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { if (block_rsv != global_rsv && From 3a0b0b4aa7ef4edf40635f972258ab64e224c685 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Nov 2019 11:25:54 -0500 Subject: [PATCH 19/47] btrfs: kill min_allocable_bytes in inc_block_group_ro This is a relic from a time before we had a proper reservation mechanism and you could end up with really full chunks at chunk allocation time. This doesn't make sense anymore, so just kill it. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 6934a5b8708fed..66fa39632cdea7 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1185,21 +1185,8 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) struct btrfs_space_info *sinfo = cache->space_info; u64 num_bytes; u64 sinfo_used; - u64 min_allocable_bytes; int ret = -ENOSPC; - /* - * We need some metadata space and system metadata space for - * allocating chunks in some corner cases until we force to set - * it to be readonly. - */ - if ((sinfo->flags & - (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && - !force) - min_allocable_bytes = SZ_1M; - else - min_allocable_bytes = 0; - spin_lock(&sinfo->lock); spin_lock(&cache->lock); @@ -1217,10 +1204,9 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) * sinfo_used + num_bytes should always <= sinfo->total_bytes. * * Here we make sure if we mark this bg RO, we still have enough - * free space as buffer (if min_allocable_bytes is not 0). + * free space as buffer. */ - if (sinfo_used + num_bytes + min_allocable_bytes <= - sinfo->total_bytes) { + if (sinfo_used + num_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; cache->ro++; list_add_tail(&cache->ro_list, &sinfo->ro_bgs); @@ -1233,8 +1219,8 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) btrfs_info(cache->fs_info, "unable to make block group %llu ro", cache->start); btrfs_info(cache->fs_info, - "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu", - sinfo_used, num_bytes, min_allocable_bytes); + "sinfo_used=%llu bg_num_bytes=%llu", + sinfo_used, num_bytes); btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0); } return ret; From 92c886f0ea7d7e83c8c34a2e6e2f727a4517bb0a Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 5 Dec 2019 14:19:57 +0100 Subject: [PATCH 20/47] btrfs: fix possible NULL-pointer dereference in integrity checks A user reports a possible NULL-pointer dereference in btrfsic_process_superblock(). We are assigning state->fs_info to a local fs_info variable and afterwards checking for the presence of state. While we would BUG_ON() a NULL state anyways, we can also just remove the local fs_info copy, as fs_info is only used once as the first argument for btrfs_num_copies(). There we can just pass in state->fs_info as well. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=205003 Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/check-integrity.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 0b52ab4cb96490..72c70f59fc605e 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -629,7 +629,6 @@ static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev, static int btrfsic_process_superblock(struct btrfsic_state *state, struct btrfs_fs_devices *fs_devices) { - struct btrfs_fs_info *fs_info = state->fs_info; struct btrfs_super_block *selected_super; struct list_head *dev_head = &fs_devices->devices; struct btrfs_device *device; @@ -700,7 +699,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, break; } - num_copies = btrfs_num_copies(fs_info, next_bytenr, + num_copies = btrfs_num_copies(state->fs_info, next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) pr_info("num_copies(log_bytenr=%llu) = %d\n", From 20ced9072b1a87a611892161415a10b2b85a406f Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 5 Dec 2019 14:19:58 +0100 Subject: [PATCH 21/47] btrfs: remove superfluous BUG_ON() in integrity checks btrfsic_process_superblock() BUG_ON()s if 'state' is NULL. But this can never happen as the only caller from btrfsic_process_superblock() is btrfsic_mount() which allocates 'state' some lines above calling btrfsic_process_superblock() and checks for the allocation to succeed. Let's just remove the impossible to hit BUG_ON(). Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/check-integrity.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 72c70f59fc605e..a0ce69f2d27cd9 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -636,7 +636,6 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, int ret = 0; int pass; - BUG_ON(NULL == state); selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS); if (NULL == selected_super) { pr_info("btrfsic: error, kmalloc failed!\n"); From f74c8f6146d18dacb273db7175a522ca977342cd Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 5 Dec 2019 14:19:59 +0100 Subject: [PATCH 22/47] btrfs: remove impossible WARN_ON in btrfs_destroy_dev_replace_tgtdev() We have a user report, that cppcheck is complaining about a possible NULL-pointer dereference in btrfs_destroy_dev_replace_tgtdev(). We're first dereferencing the 'tgtdev' variable and the later check for the validity of the pointer with a WARN_ON(!tgtdev); But all callers of btrfs_destroy_dev_replace_tgtdev() either explicitly check if 'tgtdev' is non-NULL or directly allocate 'tgtdev', so the WARN_ON() is impossible to hit. Just remove it to silence the checker's complains. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=205003 Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c565650639eeda..f5c0c401c3307c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2132,7 +2132,6 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) { struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; - WARN_ON(!tgtdev); mutex_lock(&fs_devices->device_list_mutex); btrfs_sysfs_rm_device_link(fs_devices, tgtdev); From 937a79d63aea1dd5c10a04d39d6277d3b267688a Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 5 Dec 2019 10:36:04 -0800 Subject: [PATCH 23/47] btrfs: use simple_dir_inode_operations for placeholder subvolume directory When you snapshot a subvolume containing a subvolume, you get a placeholder directory where the subvolume would be. These directories have their own btrfs_dir_ro_inode_operations. Al pointed out [1] that these directories can use simple_lookup() instead of btrfs_lookup(), as they are always empty. Furthermore, they can use the default generic_permission() instead of btrfs_permission(); the additional checks in the latter don't matter because we can't write to the directory anyways. Finally, they can use the default generic_update_time() instead of btrfs_update_time(), as the inode doesn't exist on disk and doesn't need any special handling. All together, this means that we can get rid of btrfs_dir_ro_inode_operations and use simple_dir_inode_operations instead. 1: https://lore.kernel.org/linux-btrfs/20190929052934.GY26530@ZenIV.linux.org.uk/ Cc: Al Viro Reviewed-by: Josef Bacik Signed-off-by: Omar Sandoval Reviewed-by: David Sterba [ add comment ] Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index acf2d8edaf6ccc..a34a1ae89c963d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -64,7 +64,6 @@ struct btrfs_dio_data { static const struct inode_operations btrfs_dir_inode_operations; static const struct inode_operations btrfs_symlink_inode_operations; -static const struct inode_operations btrfs_dir_ro_inode_operations; static const struct inode_operations btrfs_special_inode_operations; static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; @@ -5850,7 +5849,11 @@ static struct inode *new_simple_dir(struct super_block *s, set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; - inode->i_op = &btrfs_dir_ro_inode_operations; + /* + * We only need lookup, the rest is read-only and there's no inode + * associated with the dentry + */ + inode->i_op = &simple_dir_inode_operations; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; @@ -11005,11 +11008,6 @@ static const struct inode_operations btrfs_dir_inode_operations = { .update_time = btrfs_update_time, .tmpfile = btrfs_tmpfile, }; -static const struct inode_operations btrfs_dir_ro_inode_operations = { - .lookup = btrfs_lookup, - .permission = btrfs_permission, - .update_time = btrfs_update_time, -}; static const struct file_operations btrfs_dir_file_operations = { .llseek = generic_file_llseek, From 16c3a76b6e16e1807e0753d3326778313dfacb33 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 26 Nov 2019 09:40:05 +0100 Subject: [PATCH 24/47] btrfs: decrement number of open devices after closing the device not before In btrfs_close_one_device we're decrementing the number of open devices before we're calling btrfs_close_bdev(). As there is no intermediate exit between these points in this function it is technically OK to do so, but it makes the code a bit harder to understand. Move both operations closer together and move the decrement step after btrfs_close_bdev(). Reviewed-by: Qu Wenruo Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f5c0c401c3307c..7d535756ba1b78 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1069,9 +1069,6 @@ static void btrfs_close_one_device(struct btrfs_device *device) struct btrfs_device *new_device; struct rcu_string *name; - if (device->bdev) - fs_devices->open_devices--; - if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { list_del_init(&device->dev_alloc_list); @@ -1082,6 +1079,8 @@ static void btrfs_close_one_device(struct btrfs_device *device) fs_devices->missing_devices--; btrfs_close_bdev(device); + if (device->bdev) + fs_devices->open_devices--; new_device = btrfs_alloc_device(NULL, &device->devid, device->uuid); From 2ca855b71c49a25cce519da793a21ee4767f082f Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 4 Dec 2019 14:36:39 +0100 Subject: [PATCH 25/47] btrfs: reset device back to allocation state when removing When closing a device, btrfs_close_one_device() first allocates a new device, copies the device to close's name, replaces it in the dev_list with the copy and then finally frees it. This involves two memory allocation, which can potentially fail. As this code path is tricky to unwind, the allocation failures where handled by BUG_ON()s. But this copying isn't strictly needed, all that is needed is resetting the device in question to it's state it had after the allocation. Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7d535756ba1b78..66377e678504c3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1066,8 +1066,6 @@ static void btrfs_close_bdev(struct btrfs_device *device) static void btrfs_close_one_device(struct btrfs_device *device) { struct btrfs_fs_devices *fs_devices = device->fs_devices; - struct btrfs_device *new_device; - struct rcu_string *name; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { @@ -1079,25 +1077,21 @@ static void btrfs_close_one_device(struct btrfs_device *device) fs_devices->missing_devices--; btrfs_close_bdev(device); - if (device->bdev) + if (device->bdev) { fs_devices->open_devices--; - - new_device = btrfs_alloc_device(NULL, &device->devid, - device->uuid); - BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ - - /* Safe because we are under uuid_mutex */ - if (device->name) { - name = rcu_string_strdup(device->name->str, GFP_NOFS); - BUG_ON(!name); /* -ENOMEM */ - rcu_assign_pointer(new_device->name, name); + device->bdev = NULL; } + clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); - list_replace_rcu(&device->dev_list, &new_device->dev_list); - new_device->fs_devices = device->fs_devices; + atomic_set(&device->dev_stats_ccnt, 0); + extent_io_tree_release(&device->alloc_state); - synchronize_rcu(); - btrfs_free_device(device); + /* Verify the device is back in a pristine state */ + ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); + ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); + ASSERT(list_empty(&device->dev_alloc_list)); + ASSERT(list_empty(&device->post_commit_list)); + ASSERT(atomic_read(&device->reada_in_flight) == 0); } static int close_fs_devices(struct btrfs_fs_devices *fs_devices) From b9e4db501b894003fd4483693b90498579fa7e52 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 2 Dec 2019 17:34:17 -0800 Subject: [PATCH 26/47] btrfs: get rid of trivial __btrfs_lookup_bio_sums() wrappers Currently, we have two wrappers for __btrfs_lookup_bio_sums(): btrfs_lookup_bio_sums_dio(), which is used for direct I/O, and btrfs_lookup_bio_sums(), which is used everywhere else. The only difference is that the _dio variant looks up csums starting at the given offset instead of using the page index, which isn't actually direct I/O-specific. Let's clean up the signature and return value of __btrfs_lookup_bio_sums(), rename it to btrfs_lookup_bio_sums(), and get rid of the trivial helpers. Reviewed-by: Nikolay Borisov Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/compression.c | 4 ++-- fs/btrfs/ctree.h | 4 +--- fs/btrfs/file-item.c | 35 +++++++++++++++++------------------ fs/btrfs/inode.c | 6 +++--- 4 files changed, 23 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index ee834ef7beb4a3..03eb50727038fd 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -758,7 +758,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_lookup_bio_sums(inode, comp_bio, - sums); + false, 0, sums); BUG_ON(ret); /* -ENOMEM */ } @@ -786,7 +786,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, BUG_ON(ret); /* -ENOMEM */ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - ret = btrfs_lookup_bio_sums(inode, comp_bio, sums); + ret = btrfs_lookup_bio_sums(inode, comp_bio, false, 0, sums); BUG_ON(ret); /* -ENOMEM */ } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ea49e4b52cd26a..b1c971cea33a82 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2789,9 +2789,7 @@ struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, - u8 *dst); -blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, - u64 logical_offset); + bool at_offset, u64 offset, u8 *dst); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index b1bfdc5c1387ae..b7f5394c37a1fa 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -148,8 +148,21 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, return ret; } -static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, - u64 logical_offset, u8 *dst, int dio) +/** + * btrfs_lookup_bio_sums - Look up checksums for a bio. + * @inode: inode that the bio is for. + * @bio: bio embedded in btrfs_io_bio. + * @at_offset: If true, look up checksums for the extent at @offset. + * If false, use the page offsets from the bio. + * @offset: If @at_offset is true, offset in file to look up checksums for. + * Ignored otherwise. + * @dst: Buffer of size btrfs_super_csum_size() used to return checksum. If + * NULL, the checksum is returned in btrfs_io_bio(bio)->csum instead. + * + * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. + */ +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, + bool at_offset, u64 offset, u8 *dst) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct bio_vec bvec; @@ -159,7 +172,6 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_path *path; u8 *csum; - u64 offset = 0; u64 item_start_offset = 0; u64 item_last_offset = 0; u64 disk_bytenr; @@ -205,15 +217,13 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio } disk_bytenr = (u64)bio->bi_iter.bi_sector << 9; - if (dio) - offset = logical_offset; bio_for_each_segment(bvec, bio, iter) { page_bytes_left = bvec.bv_len; if (count) goto next; - if (!dio) + if (!at_offset) offset = page_offset(bvec.bv_page) + bvec.bv_offset; count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, csum, nblocks); @@ -285,18 +295,7 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio WARN_ON_ONCE(count); btrfs_free_path(path); - return 0; -} - -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, - u8 *dst) -{ - return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0); -} - -blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset) -{ - return __btrfs_lookup_bio_sums(inode, bio, offset, NULL, 1); + return BLK_STS_OK; } int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a34a1ae89c963d..f1165e87f65867 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2127,7 +2127,7 @@ static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, bio_flags); goto out; } else if (!skip_sum) { - ret = btrfs_lookup_bio_sums(inode, bio, NULL); + ret = btrfs_lookup_bio_sums(inode, bio, false, 0, NULL); if (ret) goto out; } @@ -8368,8 +8368,8 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, * contention. */ if (dip->logical_offset == file_offset) { - ret = btrfs_lookup_bio_sums_dio(inode, dip->orig_bio, - file_offset); + ret = btrfs_lookup_bio_sums(inode, dip->orig_bio, true, + file_offset, NULL); if (ret) return ret; } From 2092e012d560e1d90d04460c27da81a239b19802 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 10 Dec 2019 10:37:35 -0800 Subject: [PATCH 27/47] btrfs: get rid of at_offset parameter to btrfs_lookup_bio_sums() We can encode this in the offset parameter: -1 means use the page offsets, anything else is a valid offset. Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/compression.c | 4 ++-- fs/btrfs/ctree.h | 2 +- fs/btrfs/file-item.c | 11 +++++------ fs/btrfs/inode.c | 6 +++--- 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 03eb50727038fd..ed05e527739982 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -758,7 +758,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_lookup_bio_sums(inode, comp_bio, - false, 0, sums); + (u64)-1, sums); BUG_ON(ret); /* -ENOMEM */ } @@ -786,7 +786,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, BUG_ON(ret); /* -ENOMEM */ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - ret = btrfs_lookup_bio_sums(inode, comp_bio, false, 0, sums); + ret = btrfs_lookup_bio_sums(inode, comp_bio, (u64)-1, sums); BUG_ON(ret); /* -ENOMEM */ } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b1c971cea33a82..f895fb490b7570 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2789,7 +2789,7 @@ struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, - bool at_offset, u64 offset, u8 *dst); + u64 offset, u8 *dst); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index b7f5394c37a1fa..b670014bfc1cf8 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -152,17 +152,15 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, * btrfs_lookup_bio_sums - Look up checksums for a bio. * @inode: inode that the bio is for. * @bio: bio embedded in btrfs_io_bio. - * @at_offset: If true, look up checksums for the extent at @offset. - * If false, use the page offsets from the bio. - * @offset: If @at_offset is true, offset in file to look up checksums for. - * Ignored otherwise. + * @offset: Unless (u64)-1, look up checksums for this offset in the file. + * If (u64)-1, use the page offsets from the bio instead. * @dst: Buffer of size btrfs_super_csum_size() used to return checksum. If * NULL, the checksum is returned in btrfs_io_bio(bio)->csum instead. * * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. */ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, - bool at_offset, u64 offset, u8 *dst) + u64 offset, u8 *dst) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct bio_vec bvec; @@ -171,6 +169,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, struct btrfs_csum_item *item = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_path *path; + const bool page_offsets = (offset == (u64)-1); u8 *csum; u64 item_start_offset = 0; u64 item_last_offset = 0; @@ -223,7 +222,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, if (count) goto next; - if (!at_offset) + if (page_offsets) offset = page_offset(bvec.bv_page) + bvec.bv_offset; count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, csum, nblocks); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f1165e87f65867..bab8661e5aaced 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2127,7 +2127,7 @@ static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, bio_flags); goto out; } else if (!skip_sum) { - ret = btrfs_lookup_bio_sums(inode, bio, false, 0, NULL); + ret = btrfs_lookup_bio_sums(inode, bio, (u64)-1, NULL); if (ret) goto out; } @@ -8368,8 +8368,8 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, * contention. */ if (dip->logical_offset == file_offset) { - ret = btrfs_lookup_bio_sums(inode, dip->orig_bio, true, - file_offset, NULL); + ret = btrfs_lookup_bio_sums(inode, dip->orig_bio, file_offset, + NULL); if (ret) return ret; } From b1b7009c6f1fc9b2f1b1ac14ab03bcb4d3a4157e Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 2 Dec 2019 17:34:18 -0800 Subject: [PATCH 28/47] btrfs: remove dead snapshot-aware defrag code Snapshot-aware defrag has been disabled since commit 8101c8dbf624 ("Btrfs: disable snapshot aware defrag for now") almost 6 years ago. Let's remove the dead code. If someone is up to the task of bringing it back, they can dig it up from git. This is logically a revert of commit 38c227d87c49 ("Btrfs: snapshot-aware defrag") except that now we have to clear the EXTENT_DEFRAG bit to avoid need_force_cow() returning true forever. The reasons to disable were caused by runtime problems (like long stalls or memory consumption) on heavily referenced extents (eg. thousands of snapshots). There were attempts to fix that but never finished. Current defrag breaks the extent references and some users prefer that behaviour over the one implemented by snapshot aware (ie. keeping links for defragmentation). To enable both usecases we'd need to extend defrag ioctl but let's do that properly from scratch. Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Omar Sandoval Reviewed-by: David Sterba [ enhance ] Signed-off-by: David Sterba --- fs/btrfs/inode.c | 695 +---------------------------------------------- 1 file changed, 11 insertions(+), 684 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bab8661e5aaced..40963d27520702 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -44,7 +44,6 @@ #include "locking.h" #include "free-space-cache.h" #include "inode-map.h" -#include "backref.h" #include "props.h" #include "qgroup.h" #include "delalloc-space.h" @@ -2393,649 +2392,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, return ret; } -/* snapshot-aware defrag */ -struct sa_defrag_extent_backref { - struct rb_node node; - struct old_sa_defrag_extent *old; - u64 root_id; - u64 inum; - u64 file_pos; - u64 extent_offset; - u64 num_bytes; - u64 generation; -}; - -struct old_sa_defrag_extent { - struct list_head list; - struct new_sa_defrag_extent *new; - - u64 extent_offset; - u64 bytenr; - u64 offset; - u64 len; - int count; -}; - -struct new_sa_defrag_extent { - struct rb_root root; - struct list_head head; - struct btrfs_path *path; - struct inode *inode; - u64 file_pos; - u64 len; - u64 bytenr; - u64 disk_len; - u8 compress_type; -}; - -static int backref_comp(struct sa_defrag_extent_backref *b1, - struct sa_defrag_extent_backref *b2) -{ - if (b1->root_id < b2->root_id) - return -1; - else if (b1->root_id > b2->root_id) - return 1; - - if (b1->inum < b2->inum) - return -1; - else if (b1->inum > b2->inum) - return 1; - - if (b1->file_pos < b2->file_pos) - return -1; - else if (b1->file_pos > b2->file_pos) - return 1; - - /* - * [------------------------------] ===> (a range of space) - * |<--->| |<---->| =============> (fs/file tree A) - * |<---------------------------->| ===> (fs/file tree B) - * - * A range of space can refer to two file extents in one tree while - * refer to only one file extent in another tree. - * - * So we may process a disk offset more than one time(two extents in A) - * and locate at the same extent(one extent in B), then insert two same - * backrefs(both refer to the extent in B). - */ - return 0; -} - -static void backref_insert(struct rb_root *root, - struct sa_defrag_extent_backref *backref) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct sa_defrag_extent_backref *entry; - int ret; - - while (*p) { - parent = *p; - entry = rb_entry(parent, struct sa_defrag_extent_backref, node); - - ret = backref_comp(backref, entry); - if (ret < 0) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; - } - - rb_link_node(&backref->node, parent, p); - rb_insert_color(&backref->node, root); -} - -/* - * Note the backref might has changed, and in this case we just return 0. - */ -static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, - void *ctx) -{ - struct btrfs_file_extent_item *extent; - struct old_sa_defrag_extent *old = ctx; - struct new_sa_defrag_extent *new = old->new; - struct btrfs_path *path = new->path; - struct btrfs_key key; - struct btrfs_root *root; - struct sa_defrag_extent_backref *backref; - struct extent_buffer *leaf; - struct inode *inode = new->inode; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int slot; - int ret; - u64 extent_offset; - u64 num_bytes; - - if (BTRFS_I(inode)->root->root_key.objectid == root_id && - inum == btrfs_ino(BTRFS_I(inode))) - return 0; - - key.objectid = root_id; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(root)) { - if (PTR_ERR(root) == -ENOENT) - return 0; - WARN_ON(1); - btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu", - inum, offset, root_id); - return PTR_ERR(root); - } - - key.objectid = inum; - key.type = BTRFS_EXTENT_DATA_KEY; - if (offset > (u64)-1 << 32) - key.offset = 0; - else - key.offset = offset; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (WARN_ON(ret < 0)) - return ret; - ret = 0; - - while (1) { - cond_resched(); - - leaf = path->nodes[0]; - slot = path->slots[0]; - - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - goto out; - } else if (ret > 0) { - ret = 0; - goto out; - } - continue; - } - - path->slots[0]++; - - btrfs_item_key_to_cpu(leaf, &key, slot); - - if (key.objectid > inum) - goto out; - - if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY) - continue; - - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - - if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr) - continue; - - /* - * 'offset' refers to the exact key.offset, - * NOT the 'offset' field in btrfs_extent_data_ref, ie. - * (key.offset - extent_offset). - */ - if (key.offset != offset) - continue; - - extent_offset = btrfs_file_extent_offset(leaf, extent); - num_bytes = btrfs_file_extent_num_bytes(leaf, extent); - - if (extent_offset >= old->extent_offset + old->offset + - old->len || extent_offset + num_bytes <= - old->extent_offset + old->offset) - continue; - break; - } - - backref = kmalloc(sizeof(*backref), GFP_NOFS); - if (!backref) { - ret = -ENOENT; - goto out; - } - - backref->root_id = root_id; - backref->inum = inum; - backref->file_pos = offset; - backref->num_bytes = num_bytes; - backref->extent_offset = extent_offset; - backref->generation = btrfs_file_extent_generation(leaf, extent); - backref->old = old; - backref_insert(&new->root, backref); - old->count++; -out: - btrfs_release_path(path); - WARN_ON(ret); - return ret; -} - -static noinline bool record_extent_backrefs(struct btrfs_path *path, - struct new_sa_defrag_extent *new) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb); - struct old_sa_defrag_extent *old, *tmp; - int ret; - - new->path = path; - - list_for_each_entry_safe(old, tmp, &new->head, list) { - ret = iterate_inodes_from_logical(old->bytenr + - old->extent_offset, fs_info, - path, record_one_backref, - old, false); - if (ret < 0 && ret != -ENOENT) - return false; - - /* no backref to be processed for this extent */ - if (!old->count) { - list_del(&old->list); - kfree(old); - } - } - - if (list_empty(&new->head)) - return false; - - return true; -} - -static int relink_is_mergable(struct extent_buffer *leaf, - struct btrfs_file_extent_item *fi, - struct new_sa_defrag_extent *new) -{ - if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr) - return 0; - - if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) - return 0; - - if (btrfs_file_extent_compression(leaf, fi) != new->compress_type) - return 0; - - if (btrfs_file_extent_encryption(leaf, fi) || - btrfs_file_extent_other_encoding(leaf, fi)) - return 0; - - return 1; -} - -/* - * Note the backref might has changed, and in this case we just return 0. - */ -static noinline int relink_extent_backref(struct btrfs_path *path, - struct sa_defrag_extent_backref *prev, - struct sa_defrag_extent_backref *backref) -{ - struct btrfs_file_extent_item *extent; - struct btrfs_file_extent_item *item; - struct btrfs_ordered_extent *ordered; - struct btrfs_trans_handle *trans; - struct btrfs_ref ref = { 0 }; - struct btrfs_root *root; - struct btrfs_key key; - struct extent_buffer *leaf; - struct old_sa_defrag_extent *old = backref->old; - struct new_sa_defrag_extent *new = old->new; - struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb); - struct inode *inode; - struct extent_state *cached = NULL; - int ret = 0; - u64 start; - u64 len; - u64 lock_start; - u64 lock_end; - bool merge = false; - int index; - - if (prev && prev->root_id == backref->root_id && - prev->inum == backref->inum && - prev->file_pos + prev->num_bytes == backref->file_pos) - merge = true; - - /* step 1: get root */ - key.objectid = backref->root_id; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - index = srcu_read_lock(&fs_info->subvol_srcu); - - root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(root)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); - if (PTR_ERR(root) == -ENOENT) - return 0; - return PTR_ERR(root); - } - - if (btrfs_root_readonly(root)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); - return 0; - } - - /* step 2: get inode */ - key.objectid = backref->inum; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - - inode = btrfs_iget(fs_info->sb, &key, root); - if (IS_ERR(inode)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); - return 0; - } - - srcu_read_unlock(&fs_info->subvol_srcu, index); - - /* step 3: relink backref */ - lock_start = backref->file_pos; - lock_end = backref->file_pos + backref->num_bytes - 1; - lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, - &cached); - - ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); - if (ordered) { - btrfs_put_ordered_extent(ordered); - goto out_unlock; - } - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out_unlock; - } - - key.objectid = backref->inum; - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = backref->file_pos; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { - goto out_free_path; - } else if (ret > 0) { - ret = 0; - goto out_free_path; - } - - extent = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - - if (btrfs_file_extent_generation(path->nodes[0], extent) != - backref->generation) - goto out_free_path; - - btrfs_release_path(path); - - start = backref->file_pos; - if (backref->extent_offset < old->extent_offset + old->offset) - start += old->extent_offset + old->offset - - backref->extent_offset; - - len = min(backref->extent_offset + backref->num_bytes, - old->extent_offset + old->offset + old->len); - len -= max(backref->extent_offset, old->extent_offset + old->offset); - - ret = btrfs_drop_extents(trans, root, inode, start, - start + len, 1); - if (ret) - goto out_free_path; -again: - key.objectid = btrfs_ino(BTRFS_I(inode)); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = start; - - path->leave_spinning = 1; - if (merge) { - struct btrfs_file_extent_item *fi; - u64 extent_len; - struct btrfs_key found_key; - - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - if (ret < 0) - goto out_free_path; - - path->slots[0]--; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - extent_len = btrfs_file_extent_num_bytes(leaf, fi); - - if (extent_len + found_key.offset == start && - relink_is_mergable(leaf, fi, new)) { - btrfs_set_file_extent_num_bytes(leaf, fi, - extent_len + len); - btrfs_mark_buffer_dirty(leaf); - inode_add_bytes(inode, len); - - ret = 1; - goto out_free_path; - } else { - merge = false; - btrfs_release_path(path); - goto again; - } - } - - ret = btrfs_insert_empty_item(trans, root, path, &key, - sizeof(*extent)); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_free_path; - } - - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr); - btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len); - btrfs_set_file_extent_offset(leaf, item, start - new->file_pos); - btrfs_set_file_extent_num_bytes(leaf, item, len); - btrfs_set_file_extent_ram_bytes(leaf, item, new->len); - btrfs_set_file_extent_generation(leaf, item, trans->transid); - btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); - btrfs_set_file_extent_compression(leaf, item, new->compress_type); - btrfs_set_file_extent_encryption(leaf, item, 0); - btrfs_set_file_extent_other_encoding(leaf, item, 0); - - btrfs_mark_buffer_dirty(leaf); - inode_add_bytes(inode, len); - btrfs_release_path(path); - - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new->bytenr, - new->disk_len, 0); - btrfs_init_data_ref(&ref, backref->root_id, backref->inum, - new->file_pos); /* start - extent_offset */ - ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_free_path; - } - - ret = 1; -out_free_path: - btrfs_release_path(path); - path->leave_spinning = 0; - btrfs_end_transaction(trans); -out_unlock: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, - &cached); - iput(inode); - return ret; -} - -static void free_sa_defrag_extent(struct new_sa_defrag_extent *new) -{ - struct old_sa_defrag_extent *old, *tmp; - - if (!new) - return; - - list_for_each_entry_safe(old, tmp, &new->head, list) { - kfree(old); - } - kfree(new); -} - -static void relink_file_extents(struct new_sa_defrag_extent *new) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb); - struct btrfs_path *path; - struct sa_defrag_extent_backref *backref; - struct sa_defrag_extent_backref *prev = NULL; - struct rb_node *node; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return; - - if (!record_extent_backrefs(path, new)) { - btrfs_free_path(path); - goto out; - } - btrfs_release_path(path); - - while (1) { - node = rb_first(&new->root); - if (!node) - break; - rb_erase(node, &new->root); - - backref = rb_entry(node, struct sa_defrag_extent_backref, node); - - ret = relink_extent_backref(path, prev, backref); - WARN_ON(ret < 0); - - kfree(prev); - - if (ret == 1) - prev = backref; - else - prev = NULL; - cond_resched(); - } - kfree(prev); - - btrfs_free_path(path); -out: - free_sa_defrag_extent(new); - - atomic_dec(&fs_info->defrag_running); - wake_up(&fs_info->transaction_wait); -} - -static struct new_sa_defrag_extent * -record_old_file_extents(struct inode *inode, - struct btrfs_ordered_extent *ordered) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_path *path; - struct btrfs_key key; - struct old_sa_defrag_extent *old; - struct new_sa_defrag_extent *new; - int ret; - - new = kmalloc(sizeof(*new), GFP_NOFS); - if (!new) - return NULL; - - new->inode = inode; - new->file_pos = ordered->file_offset; - new->len = ordered->len; - new->bytenr = ordered->start; - new->disk_len = ordered->disk_len; - new->compress_type = ordered->compress_type; - new->root = RB_ROOT; - INIT_LIST_HEAD(&new->head); - - path = btrfs_alloc_path(); - if (!path) - goto out_kfree; - - key.objectid = btrfs_ino(BTRFS_I(inode)); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = new->file_pos; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out_free_path; - if (ret > 0 && path->slots[0] > 0) - path->slots[0]--; - - /* find out all the old extents for the file range */ - while (1) { - struct btrfs_file_extent_item *extent; - struct extent_buffer *l; - int slot; - u64 num_bytes; - u64 offset; - u64 end; - u64 disk_bytenr; - u64 extent_offset; - - l = path->nodes[0]; - slot = path->slots[0]; - - if (slot >= btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out_free_path; - else if (ret > 0) - break; - continue; - } - - btrfs_item_key_to_cpu(l, &key, slot); - - if (key.objectid != btrfs_ino(BTRFS_I(inode))) - break; - if (key.type != BTRFS_EXTENT_DATA_KEY) - break; - if (key.offset >= new->file_pos + new->len) - break; - - extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item); - - num_bytes = btrfs_file_extent_num_bytes(l, extent); - if (key.offset + num_bytes < new->file_pos) - goto next; - - disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent); - if (!disk_bytenr) - goto next; - - extent_offset = btrfs_file_extent_offset(l, extent); - - old = kmalloc(sizeof(*old), GFP_NOFS); - if (!old) - goto out_free_path; - - offset = max(new->file_pos, key.offset); - end = min(new->file_pos + new->len, key.offset + num_bytes); - - old->bytenr = disk_bytenr; - old->extent_offset = extent_offset; - old->offset = offset - key.offset; - old->len = end - offset; - old->new = new; - old->count = 0; - list_add_tail(&old->list, &new->head); -next: - path->slots[0]++; - cond_resched(); - } - - btrfs_free_path(path); - atomic_inc(&fs_info->defrag_running); - - return new; - -out_free_path: - btrfs_free_path(path); -out_kfree: - free_sa_defrag_extent(new); - return NULL; -} - static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, u64 start, u64 len) { @@ -3063,7 +2419,6 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) struct btrfs_trans_handle *trans = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; - struct new_sa_defrag_extent *new = NULL; int compress_type = 0; int ret = 0; u64 logical_len = ordered_extent->len; @@ -3072,6 +2427,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) bool range_locked = false; bool clear_new_delalloc_bytes = false; bool clear_reserved_extent = true; + unsigned int clear_bits; if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && @@ -3130,20 +2486,6 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ordered_extent->file_offset + ordered_extent->len - 1, &cached_state); - ret = test_range_bit(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + ordered_extent->len - 1, - EXTENT_DEFRAG, 0, cached_state); - if (ret) { - u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); - if (0 && last_snapshot >= BTRFS_I(inode)->generation) - /* the inode is shared */ - new = record_old_file_extents(inode, ordered_extent); - - clear_extent_bit(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + ordered_extent->len - 1, - EXTENT_DEFRAG, 0, 0, &cached_state); - } - if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); else @@ -3204,21 +2546,16 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) } ret = 0; out: - if (range_locked || clear_new_delalloc_bytes) { - unsigned int clear_bits = 0; - - if (range_locked) - clear_bits |= EXTENT_LOCKED; - if (clear_new_delalloc_bytes) - clear_bits |= EXTENT_DELALLOC_NEW; - clear_extent_bit(&BTRFS_I(inode)->io_tree, - ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1, - clear_bits, - (clear_bits & EXTENT_LOCKED) ? 1 : 0, - 0, &cached_state); - } + clear_bits = EXTENT_DEFRAG; + if (range_locked) + clear_bits |= EXTENT_LOCKED; + if (clear_new_delalloc_bytes) + clear_bits |= EXTENT_DELALLOC_NEW; + clear_extent_bit(&BTRFS_I(inode)->io_tree, + ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + clear_bits, (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0, + &cached_state); if (trans) btrfs_end_transaction(trans); @@ -3271,16 +2608,6 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) */ btrfs_remove_ordered_extent(inode, ordered_extent); - /* for snapshot-aware defrag */ - if (new) { - if (ret) { - free_sa_defrag_extent(new); - atomic_dec(&fs_info->defrag_running); - } else { - relink_file_extents(new); - } - } - /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ From 807c41c0c0c784aad33fd51be3583bd9956abd5d Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 2 Dec 2019 17:34:19 -0800 Subject: [PATCH 29/47] btrfs: make btrfs_ordered_extent naming consistent with btrfs_file_extent_item ordered->start, ordered->len, and ordered->disk_len correspond to fi->disk_bytenr, fi->num_bytes, and fi->disk_num_bytes, respectively. It's confusing to translate between the two naming schemes. Since a btrfs_ordered_extent is basically a pending btrfs_file_extent_item, let's make the former use the naming from the latter. Note that I didn't touch the names in tracepoints just in case there are scripts depending on the current naming. Reviewed-by: Johannes Thumshirn Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 4 +-- fs/btrfs/file.c | 6 ++-- fs/btrfs/inode.c | 70 +++++++++++++++++------------------- fs/btrfs/ordered-data.c | 69 ++++++++++++++++++----------------- fs/btrfs/ordered-data.h | 26 +++++++------- fs/btrfs/relocation.c | 4 +-- include/trace/events/btrfs.h | 6 ++-- 7 files changed, 90 insertions(+), 95 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index b670014bfc1cf8..bb374042d29786 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -481,8 +481,8 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, - 1); for (i = 0; i < nr_sectors; i++) { - if (offset >= ordered->file_offset + ordered->len || - offset < ordered->file_offset) { + if (offset >= ordered->file_offset + ordered->num_bytes || + offset < ordered->file_offset) { unsigned long bytes_left; sums->len = this_sum_bytes; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8d47c76b7bd107..76c68c70d3e284 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1501,7 +1501,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, ordered = btrfs_lookup_ordered_range(inode, start_pos, last_pos - start_pos + 1); if (ordered && - ordered->file_offset + ordered->len > start_pos && + ordered->file_offset + ordered->num_bytes > start_pos && ordered->file_offset <= last_pos) { unlock_extent_cached(&inode->io_tree, start_pos, last_pos, cached_state); @@ -2426,7 +2426,7 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, * we need to try again. */ if ((!ordered || - (ordered->file_offset + ordered->len <= lockstart || + (ordered->file_offset + ordered->num_bytes <= lockstart || ordered->file_offset > lockend)) && !filemap_range_has_page(inode->i_mapping, lockstart, lockend)) { @@ -3248,7 +3248,7 @@ static long btrfs_fallocate(struct file *file, int mode, ordered = btrfs_lookup_first_ordered_extent(inode, locked_end); if (ordered && - ordered->file_offset + ordered->len > alloc_start && + ordered->file_offset + ordered->num_bytes > alloc_start && ordered->file_offset < alloc_end) { btrfs_put_ordered_extent(ordered); unlock_extent_cached(&BTRFS_I(inode)->io_tree, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 40963d27520702..9aa7583a5d7b5a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2419,9 +2419,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) struct btrfs_trans_handle *trans = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; + u64 start, end; int compress_type = 0; int ret = 0; - u64 logical_len = ordered_extent->len; + u64 logical_len = ordered_extent->num_bytes; bool freespace_inode; bool truncated = false; bool range_locked = false; @@ -2429,6 +2430,9 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) bool clear_reserved_extent = true; unsigned int clear_bits; + start = ordered_extent->file_offset; + end = start + ordered_extent->num_bytes - 1; + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags)) @@ -2441,10 +2445,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - btrfs_free_io_failure_record(BTRFS_I(inode), - ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1); + btrfs_free_io_failure_record(BTRFS_I(inode), start, end); if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; @@ -2462,8 +2463,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) * space for NOCOW range. * As NOCOW won't cause a new delayed ref, just free the space */ - btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset, - ordered_extent->len); + btrfs_qgroup_free_data(inode, NULL, start, + ordered_extent->num_bytes); btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); @@ -2482,9 +2483,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) } range_locked = true; - lock_extent_bits(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + ordered_extent->len - 1, - &cached_state); + lock_extent_bits(io_tree, start, end, &cached_state); if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); @@ -2502,31 +2501,30 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { BUG_ON(compress_type); - btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset, - ordered_extent->len); + btrfs_qgroup_free_data(inode, NULL, start, + ordered_extent->num_bytes); ret = btrfs_mark_extent_written(trans, BTRFS_I(inode), ordered_extent->file_offset, ordered_extent->file_offset + logical_len); } else { BUG_ON(root == fs_info->tree_root); - ret = insert_reserved_file_extent(trans, inode, - ordered_extent->file_offset, - ordered_extent->start, - ordered_extent->disk_len, + ret = insert_reserved_file_extent(trans, inode, start, + ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes, logical_len, logical_len, compress_type, 0, 0, BTRFS_FILE_EXTENT_REG); if (!ret) { clear_reserved_extent = false; btrfs_release_delalloc_bytes(fs_info, - ordered_extent->start, - ordered_extent->disk_len); + ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); } } unpin_extent_cache(&BTRFS_I(inode)->extent_tree, - ordered_extent->file_offset, ordered_extent->len, - trans->transid); + ordered_extent->file_offset, + ordered_extent->num_bytes, trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; @@ -2551,27 +2549,22 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) clear_bits |= EXTENT_LOCKED; if (clear_new_delalloc_bytes) clear_bits |= EXTENT_DELALLOC_NEW; - clear_extent_bit(&BTRFS_I(inode)->io_tree, - ordered_extent->file_offset, - ordered_extent->file_offset + ordered_extent->len - 1, - clear_bits, (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0, + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, + (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0, &cached_state); if (trans) btrfs_end_transaction(trans); if (ret || truncated) { - u64 start, end; + u64 unwritten_start = start; if (truncated) - start = ordered_extent->file_offset + logical_len; - else - start = ordered_extent->file_offset; - end = ordered_extent->file_offset + ordered_extent->len - 1; - clear_extent_uptodate(io_tree, start, end, NULL); + unwritten_start += logical_len; + clear_extent_uptodate(io_tree, unwritten_start, end, NULL); /* Drop the cache for the part of the extent we didn't write. */ - btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0); + btrfs_drop_extent_cache(BTRFS_I(inode), unwritten_start, end, 0); /* * If the ordered extent had an IOERR or something else went @@ -2593,15 +2586,15 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) */ if (ret && btrfs_test_opt(fs_info, DISCARD)) btrfs_discard_extent(fs_info, - ordered_extent->start, - ordered_extent->disk_len, NULL); + ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes, + NULL); btrfs_free_reserved_extent(fs_info, - ordered_extent->start, - ordered_extent->disk_len, 1); + ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes, 1); } } - /* * This needs to be done to make sure anybody waiting knows we are done * updating everything for this ordered extent. @@ -8209,7 +8202,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, page_end - start + 1); if (ordered) { - end = min(page_end, ordered->file_offset + ordered->len - 1); + end = min(page_end, + ordered->file_offset + ordered->num_bytes - 1); /* * IO on this page will never be started, so we need * to account for any ordered extents now @@ -8734,7 +8728,7 @@ void btrfs_destroy_inode(struct inode *inode) else { btrfs_err(fs_info, "found ordered extent %llu %llu on inode cleanup", - ordered->file_offset, ordered->len); + ordered->file_offset, ordered->num_bytes); btrfs_remove_ordered_extent(inode, ordered); btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ed29c30802abc2..ecb9fb6a6fe07f 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -20,9 +20,9 @@ static struct kmem_cache *btrfs_ordered_extent_cache; static u64 entry_end(struct btrfs_ordered_extent *entry) { - if (entry->file_offset + entry->len < entry->file_offset) + if (entry->file_offset + entry->num_bytes < entry->file_offset) return (u64)-1; - return entry->file_offset + entry->len; + return entry->file_offset + entry->num_bytes; } /* returns NULL if the insertion worked, or it returns the node it did find @@ -112,7 +112,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) { if (file_offset < entry->file_offset || - entry->file_offset + entry->len <= file_offset) + entry->file_offset + entry->num_bytes <= file_offset) return 0; return 1; } @@ -121,7 +121,7 @@ static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, u64 len) { if (file_offset + len <= entry->file_offset || - entry->file_offset + entry->len <= file_offset) + entry->file_offset + entry->num_bytes <= file_offset) return 0; return 1; } @@ -153,19 +153,14 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, } /* allocate and add a new ordered_extent into the per-inode tree. - * file_offset is the logical offset in the file - * - * start is the disk block number of an extent already reserved in the - * extent allocation tree - * - * len is the length of the extent * * The tree is given a single reference on the ordered extent that was * inserted. */ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, - int type, int dio, int compress_type) + u64 disk_bytenr, u64 num_bytes, + u64 disk_num_bytes, int type, int dio, + int compress_type) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -179,10 +174,10 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, return -ENOMEM; entry->file_offset = file_offset; - entry->start = start; - entry->len = len; - entry->disk_len = disk_len; - entry->bytes_left = len; + entry->disk_bytenr = disk_bytenr; + entry->num_bytes = num_bytes; + entry->disk_num_bytes = disk_num_bytes; + entry->bytes_left = num_bytes; entry->inode = igrab(inode); entry->compress_type = compress_type; entry->truncated_len = (u64)-1; @@ -190,7 +185,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, set_bit(type, &entry->flags); if (dio) { - percpu_counter_add_batch(&fs_info->dio_bytes, len, + percpu_counter_add_batch(&fs_info->dio_bytes, num_bytes, fs_info->delalloc_batch); set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); } @@ -241,27 +236,30 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, } int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int type) + u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, + int type) { - return __btrfs_add_ordered_extent(inode, file_offset, start, len, - disk_len, type, 0, + return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, + num_bytes, disk_num_bytes, type, 0, BTRFS_COMPRESS_NONE); } int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int type) + u64 disk_bytenr, u64 num_bytes, + u64 disk_num_bytes, int type) { - return __btrfs_add_ordered_extent(inode, file_offset, start, len, - disk_len, type, 1, + return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, + num_bytes, disk_num_bytes, type, 1, BTRFS_COMPRESS_NONE); } int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, - int type, int compress_type) + u64 disk_bytenr, u64 num_bytes, + u64 disk_num_bytes, int type, + int compress_type) { - return __btrfs_add_ordered_extent(inode, file_offset, start, len, - disk_len, type, 0, + return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, + num_bytes, disk_num_bytes, type, 0, compress_type); } @@ -322,8 +320,8 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, } dec_start = max(*file_offset, entry->file_offset); - dec_end = min(*file_offset + io_size, entry->file_offset + - entry->len); + dec_end = min(*file_offset + io_size, + entry->file_offset + entry->num_bytes); *file_offset = dec_end; if (dec_start > dec_end) { btrfs_crit(fs_info, "bad ordering dec_start %llu end %llu", @@ -465,10 +463,11 @@ void btrfs_remove_ordered_extent(struct inode *inode, btrfs_mod_outstanding_extents(btrfs_inode, -1); spin_unlock(&btrfs_inode->lock); if (root != fs_info->tree_root) - btrfs_delalloc_release_metadata(btrfs_inode, entry->len, false); + btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes, + false); if (test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) - percpu_counter_add_batch(&fs_info->dio_bytes, -entry->len, + percpu_counter_add_batch(&fs_info->dio_bytes, -entry->num_bytes, fs_info->delalloc_batch); tree = &btrfs_inode->ordered_tree; @@ -528,8 +527,8 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, ordered = list_first_entry(&splice, struct btrfs_ordered_extent, root_extent_list); - if (range_end <= ordered->start || - ordered->start + ordered->disk_len <= range_start) { + if (range_end <= ordered->disk_bytenr || + ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) { list_move_tail(&ordered->root_extent_list, &skipped); cond_resched_lock(&root->ordered_extent_lock); continue; @@ -613,7 +612,7 @@ void btrfs_start_ordered_extent(struct inode *inode, int wait) { u64 start = entry->file_offset; - u64 end = start + entry->len - 1; + u64 end = start + entry->num_bytes - 1; trace_btrfs_ordered_extent_start(inode, entry); @@ -674,7 +673,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) btrfs_put_ordered_extent(ordered); break; } - if (ordered->file_offset + ordered->len <= start) { + if (ordered->file_offset + ordered->num_bytes <= start) { btrfs_put_ordered_extent(ordered); break; } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 4eb0319a86d7a8..3beb4da4ab419a 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -67,14 +67,13 @@ struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; - /* disk byte number */ - u64 start; - - /* ram length of the extent in bytes */ - u64 len; - - /* extent length on disk */ - u64 disk_len; + /* + * These fields directly correspond to the same fields in + * btrfs_file_extent_item. + */ + u64 disk_bytenr; + u64 num_bytes; + u64 disk_num_bytes; /* number of bytes that still need writing */ u64 bytes_left; @@ -161,12 +160,15 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, u64 *file_offset, u64 io_size, int uptodate); int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int type); + u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, + int type); int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int type); + u64 disk_bytenr, u64 num_bytes, + u64 disk_num_bytes, int type); int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, - int type, int compress_type); + u64 disk_bytenr, u64 num_bytes, + u64 disk_num_bytes, int type, + int compress_type); void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 73f93f78d84bf2..af4dd49a71c78e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4627,7 +4627,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) LIST_HEAD(list); ordered = btrfs_lookup_ordered_extent(inode, file_pos); - BUG_ON(ordered->file_offset != file_pos || ordered->len != len); + BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len); disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr, @@ -4651,7 +4651,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) * disk_len vs real len like with real inodes since it's all * disk length. */ - new_bytenr = ordered->start + (sums->bytenr - disk_bytenr); + new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr; sums->bytenr = new_bytenr; btrfs_add_ordered_sum(ordered, sums); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 620bf1b38fba78..17088a112ed0bb 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -496,9 +496,9 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent, TP_fast_assign_btrfs(btrfs_sb(inode->i_sb), __entry->ino = btrfs_ino(BTRFS_I(inode)); __entry->file_offset = ordered->file_offset; - __entry->start = ordered->start; - __entry->len = ordered->len; - __entry->disk_len = ordered->disk_len; + __entry->start = ordered->disk_bytenr; + __entry->len = ordered->num_bytes; + __entry->disk_len = ordered->disk_num_bytes; __entry->bytes_left = ordered->bytes_left; __entry->flags = ordered->flags; __entry->compress_type = ordered->compress_type; From 3facc994ca75dc09f1bc20e72191f0f4a663bd5f Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 2 Dec 2019 17:34:20 -0800 Subject: [PATCH 30/47] btrfs: remove unnecessary pg_offset assignments in __extent_writepage() We're initializing pg_offset to 0, setting it immediately, then reassigning it to 0 again after. The former became unnecessary in 211c17f51f46 ("Fix corners in writepage and btrfs_truncate_page"). The latter is a leftover that should've been removed in 40f765805f08 ("Btrfs: split up __extent_writepage to lower stack usage"). Remove both. Reviewed-by: Johannes Thumshirn Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2f4802f405a246..b2d0418a4c5583 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3562,7 +3562,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, u64 page_end = start + PAGE_SIZE - 1; int ret; int nr = 0; - size_t pg_offset = 0; + size_t pg_offset; loff_t i_size = i_size_read(inode); unsigned long end_index = i_size >> PAGE_SHIFT; unsigned long nr_written = 0; @@ -3591,8 +3591,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, flush_dcache_page(page); } - pg_offset = 0; - set_page_extent_mapped(page); if (!epd->extent_locked) { From 54f358cc219e3a9022015109458f537e33271955 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 2 Dec 2019 17:34:21 -0800 Subject: [PATCH 31/47] btrfs: remove trivial goto label in __extent_writepage() Since 40f765805f08 ("Btrfs: split up __extent_writepage to lower stack usage"), done_unlocked is simply a return 0. Get rid of it. Mid-statement block returns don seem to make the code less readable here. Reviewed-by: Johannes Thumshirn Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b2d0418a4c5583..9475e81dc41988 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3596,7 +3596,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (!epd->extent_locked) { ret = writepage_delalloc(inode, page, wbc, start, &nr_written); if (ret == 1) - goto done_unlocked; + return 0; if (ret) goto done; } @@ -3604,7 +3604,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, ret = __extent_writepage_io(inode, page, wbc, epd, i_size, nr_written, &nr); if (ret == 1) - goto done_unlocked; + return 0; done: if (nr == 0) { @@ -3619,9 +3619,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, unlock_page(page); ASSERT(ret <= 0); return ret; - -done_unlocked: - return 0; } void wait_on_extent_buffer_writeback(struct extent_buffer *eb) From a92bcc203149bda774d22b5161b0737c4365612a Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 2 Dec 2019 17:34:22 -0800 Subject: [PATCH 32/47] btrfs: remove redundant i_size check in __extent_writepage_io() In __extent_writepage_io(), we check whether i_size <= page_offset(page). Note that if i_size < page_offset(page), then i_size >> PAGE_SHIFT < page->index. If i_size == page_offset(page), then i_size >> PAGE_SHIFT == page->index && offset_in_page(i_size) == 0. __extent_writepage() already has a check for these cases that returns without calling __extent_writepage_io(): end_index = i_size >> PAGE_SHIFT pg_offset = offset_in_page(i_size); if (page->index > end_index || (page->index == end_index && !pg_offset)) { page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); unlock_page(page); return 0; } Get rid of the one in __extent_writepage_io(), which was obsoleted in 211c17f51f46 ("Fix corners in writepage and btrfs_truncate_page"). Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9475e81dc41988..00ddefcb54c887 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3455,11 +3455,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, update_nr_written(wbc, nr_written + 1); end = page_end; - if (i_size <= start) { - btrfs_writepage_endio_finish_ordered(page, start, page_end, 1); - goto done; - } - blocksize = inode->i_sb->s_blocksize; while (cur <= end) { @@ -3540,7 +3535,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, pg_offset += iosize; nr++; } -done: *nr_ret = nr; return ret; } From 931bb7f970a60c1f2a0a72936c3d21a6853ededc Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 2 Dec 2019 17:34:23 -0800 Subject: [PATCH 33/47] btrfs: drop create parameter to btrfs_get_extent() We only pass this as 1 from __extent_writepage_io(). The parameter basically means "pretend I didn't pass in a page". This is silly since we can simply not pass in the page. Get rid of the parameter from btrfs_get_extent(), and since it's used as a get_extent_t callback, remove it from get_extent_t and btree_get_extent(), neither of which need it. While we're here, let's document btrfs_get_extent(). Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/disk-io.h | 4 ++-- fs/btrfs/extent_io.c | 6 ++--- fs/btrfs/extent_io.h | 6 ++--- fs/btrfs/file.c | 17 +++++++------- fs/btrfs/inode.c | 41 +++++++++++++++++++-------------- fs/btrfs/ioctl.c | 2 +- fs/btrfs/tests/inode-tests.c | 44 +++++++++++++++++------------------- 9 files changed, 64 insertions(+), 62 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f895fb490b7570..e416ef6c9415a7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2875,7 +2875,7 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, - u64 start, u64 end, int create); + u64 start, u64 end); int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ab888d89d844c2..881aba162e4e23 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -202,8 +202,8 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, * that covers the entire device */ struct extent_map *btree_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, u64 start, u64 len, - int create) + struct page *page, size_t pg_offset, + u64 start, u64 len) { struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 76f123ebb29217..8c2d6cf1ce5969 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -134,8 +134,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); struct extent_map *btree_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, u64 start, u64 len, - int create); + struct page *page, size_t pg_offset, + u64 start, u64 len); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int __init btrfs_end_io_wq_init(void); void __cold btrfs_end_io_wq_exit(void); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 00ddefcb54c887..bbfb102d65b86f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3043,7 +3043,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, *em_cached = NULL; } - em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0); + em = get_extent(BTRFS_I(inode), page, pg_offset, start, len); if (em_cached && !IS_ERR_OR_NULL(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); @@ -3466,8 +3466,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, page_end, 1); break; } - em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur, - end - cur + 1, 1); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur, + end - cur + 1); if (IS_ERR_OR_NULL(em)) { SetPageError(page); ret = PTR_ERR_OR_ZERO(em); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a8551a1f56e247..5d205bbaafdcb1 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -183,10 +183,8 @@ static inline int extent_compress_type(unsigned long bio_flags) struct extent_map_tree; typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, - struct page *page, - size_t pg_offset, - u64 start, u64 len, - int create); + struct page *page, size_t pg_offset, + u64 start, u64 len); int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 76c68c70d3e284..a16da274c9aad0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -477,8 +477,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, u64 em_len; int ret = 0; - em = btrfs_get_extent(inode, NULL, 0, search_start, - search_len, 0); + em = btrfs_get_extent(inode, NULL, 0, search_start, search_len); if (IS_ERR(em)) return PTR_ERR(em); @@ -2390,7 +2389,7 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, round_down(*start, fs_info->sectorsize), - round_up(*len, fs_info->sectorsize), 0); + round_up(*len, fs_info->sectorsize)); if (IS_ERR(em)) return PTR_ERR(em); @@ -2957,7 +2956,7 @@ static int btrfs_zero_range_check_range_boundary(struct inode *inode, int ret; offset = round_down(offset, sectorsize); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) return PTR_ERR(em); @@ -2990,8 +2989,8 @@ static int btrfs_zero_range(struct inode *inode, inode_dio_wait(inode); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, - alloc_start, alloc_end - alloc_start, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, + alloc_end - alloc_start); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -3034,8 +3033,8 @@ static int btrfs_zero_range(struct inode *inode, if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, - alloc_start, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, + sectorsize); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -3273,7 +3272,7 @@ static long btrfs_fallocate(struct file *file, int mode, INIT_LIST_HEAD(&reserve_list); while (cur_offset < alloc_end) { em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, - alloc_end - cur_offset, 0); + alloc_end - cur_offset); if (IS_ERR(em)) { ret = PTR_ERR(em); break; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9aa7583a5d7b5a..f7c91aea6a6dc9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4485,7 +4485,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) cur_offset = hole_start; while (1) { em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, - block_end - cur_offset, 0); + block_end - cur_offset); if (IS_ERR(em)) { err = PTR_ERR(em); em = NULL; @@ -6264,18 +6264,27 @@ static noinline int uncompress_inline(struct btrfs_path *path, return ret; } -/* - * a bit scary, this does extent mapping from logical file offset to the disk. - * the ugly parts come from merging extents from the disk with the in-ram - * representation. This gets more complex because of the data=ordered code, - * where the in-ram extents might be locked pending data=ordered completion. +/** + * btrfs_get_extent - Lookup the first extent overlapping a range in a file. + * @inode: file to search in + * @page: page to read extent data into if the extent is inline + * @pg_offset: offset into @page to copy to + * @start: file offset + * @len: length of range starting at @start + * + * This returns the first &struct extent_map which overlaps with the given + * range, reading it from the B-tree and caching it if necessary. Note that + * there may be more extents which overlap the given range after the returned + * extent_map. * - * This also copies inline extents directly into the page. + * If @page is not NULL and the extent is inline, this also reads the extent + * data directly into the page and marks the extent up to date in the io_tree. + * + * Return: ERR_PTR on error, non-NULL extent_map on success. */ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, - size_t pg_offset, u64 start, u64 len, - int create) + struct page *page, size_t pg_offset, + u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret; @@ -6292,7 +6301,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct extent_map *em = NULL; struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_io_tree *io_tree = &inode->io_tree; - const bool new_inline = !page || create; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); @@ -6415,8 +6423,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, goto insert; } - btrfs_extent_item_to_extent_map(inode, path, item, - new_inline, em); + btrfs_extent_item_to_extent_map(inode, path, item, !page, em); if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { @@ -6428,7 +6435,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, size_t extent_offset; size_t copy_size; - if (new_inline) + if (!page) goto out; size = btrfs_file_extent_ram_bytes(leaf, item); @@ -6511,7 +6518,7 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 delalloc_end; int err = 0; - em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + em = btrfs_get_extent(inode, NULL, 0, start, len); if (IS_ERR(em)) return em; /* @@ -7136,7 +7143,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, goto err; } - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto unlock_err; @@ -10161,7 +10168,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct btrfs_block_group *bg; u64 len = isize - start; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 18e328ce4b5496..0fa1c386d02064 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1128,7 +1128,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) /* get the big lock and read metadata off disk */ lock_extent_bits(io_tree, start, end, &cached); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); unlock_extent_cached(io_tree, start, end, &cached); if (IS_ERR(em)) diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 09ecf7dc7b0829..24a8c714f56cf2 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -263,7 +263,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) /* First with no extents */ BTRFS_I(inode)->root = root; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize); if (IS_ERR(em)) { em = NULL; test_err("got an error when we shouldn't have"); @@ -283,7 +283,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) */ setup_file_extents(root, sectorsize); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -305,7 +305,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -333,7 +333,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -356,7 +356,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Regular extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -384,7 +384,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* The next 3 are split extents */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -413,7 +413,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -435,7 +435,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -469,7 +469,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Prealloc extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -498,7 +498,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* The next 3 are a half written prealloc extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -528,7 +528,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -561,7 +561,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -596,7 +596,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Now for the compressed extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -630,7 +630,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Split compressed extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -665,7 +665,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -692,7 +692,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -727,8 +727,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* A hole between regular extents but no hole extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, - sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -755,7 +754,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -788,7 +787,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -872,7 +871,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) insert_inode_item_key(root); insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize, sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -894,8 +893,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) } free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, - 2 * sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; From f19d3a0114e0f104ebe12a9f54bde6f1e2c2f47a Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 2 Dec 2019 17:34:24 -0800 Subject: [PATCH 34/47] btrfs: simplify compressed/inline check in __extent_writepage_io() Commit 7087a9d8db88 ("btrfs: Remove extent_io_ops::writepage_end_io_hook") left this logic in a confusing state. Simplify it. Reviewed-by: Johannes Thumshirn Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bbfb102d65b86f..394beb474a693e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3492,22 +3492,11 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, */ if (compressed || block_start == EXTENT_MAP_HOLE || block_start == EXTENT_MAP_INLINE) { - /* - * end_io notification does not happen here for - * compressed extents - */ - if (!compressed) - btrfs_writepage_endio_finish_ordered(page, cur, - cur + iosize - 1, - 1); - else if (compressed) { - /* we don't want to end_page_writeback on - * a compressed extent. this happens - * elsewhere - */ + if (compressed) nr++; - } - + else + btrfs_writepage_endio_finish_ordered(page, cur, + cur + iosize - 1, 1); cur += iosize; pg_offset += iosize; continue; From fceadba130807e9dd27250402b508d298d77cc5f Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 2 Dec 2019 17:34:25 -0800 Subject: [PATCH 35/47] btrfs: remove struct find_free_extent.ram_bytes This hasn't been used since it was first introduced in commit b4bd745d1230 ("btrfs: Introduce find_free_extent_ctl structure for later rework"). Passing that to btrfs_add_reserved_bytes in find_free_extent is not strictly necessary and using the local ram_bytes instead seems cleaner. Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2a7dff22c3b743..180b9b81d01aa0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3438,7 +3438,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache, */ struct find_free_extent_ctl { /* Basic allocation info */ - u64 ram_bytes; u64 num_bytes; u64 empty_size; u64 flags; @@ -3810,7 +3809,6 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, WARN_ON(num_bytes < fs_info->sectorsize); - ffe_ctl.ram_bytes = ram_bytes; ffe_ctl.num_bytes = num_bytes; ffe_ctl.empty_size = empty_size; ffe_ctl.flags = flags; From dccad8b6223e666db6636a544143567a80c451b1 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Wed, 11 Dec 2019 16:07:06 -0800 Subject: [PATCH 36/47] btrfs: punt all bios created in btrfs_submit_compressed_write() Compressed writes happen in the background via kworkers. However, this causes bios to be attributed to root bypassing any cgroup limits from the actual writer. We tag the first bio with REQ_CGROUP_PUNT, which will punt the bio to an appropriate cgroup specific workqueue and attribute the IO properly. However, if btrfs_submit_compressed_write() creates a new bio, we don't tag it the same way. Add the appropriate tagging for subsequent bios. Fixes: ec39f7696ccfa ("Btrfs: use REQ_CGROUP_PUNT for worker thread submitted bios") Reviewed-by: Chris Mason Signed-off-by: Dennis Zhou Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index ed05e527739982..4ce81571f0cde4 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -491,6 +491,10 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; + if (blkcg_css) { + bio->bi_opf |= REQ_CGROUP_PUNT; + bio_associate_blkg_from_css(bio, blkcg_css); + } bio_add_page(bio, page, PAGE_SIZE, 0); } if (bytes_left < PAGE_SIZE) { From e089b2feea3ea59ede903b92c34db8cdf0e95625 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Wed, 11 Dec 2019 15:20:15 -0800 Subject: [PATCH 37/47] btrfs: fix compressed write bio blkcg attribution Bio attribution is handled at bio_set_dev() as once we have a device, we have a corresponding request_queue and then can derive the current css. In special cases, we want to attribute to bio to someone else. This can be done by calling bio_associate_blkg_from_css() or kthread_associate_blkcg() depending on the scenario. Btrfs does this for compressed writeback as they are handled by kworkers, so the latter can be done here. Commit 1a41802701ec ("btrfs: drop bio_set_dev where not needed") removes early bio_set_dev() calls prior to submit_stripe_bio(). This breaks the above assumption that we'll have a request_queue when we are doing association. To fix this, switch to using kthread_associate_blkcg(). Without this, we crash in btrfs/024: [ 3052.093088] BUG: kernel NULL pointer dereference, address: 0000000000000510 [ 3052.107013] #PF: supervisor read access in kernel mode [ 3052.107014] #PF: error_code(0x0000) - not-present page [ 3052.107015] PGD 0 P4D 0 [ 3052.107021] Oops: 0000 [#1] SMP [ 3052.138904] CPU: 42 PID: 201270 Comm: kworker/u161:0 Kdump: loaded Not tainted 5.5.0-rc1-00062-g4852d8ac90a9 #712 [ 3052.138905] Hardware name: Quanta Tioga Pass Single Side 01-0032211004/Tioga Pass Single Side, BIOS F08_3A18 12/20/2018 [ 3052.138912] Workqueue: btrfs-delalloc btrfs_work_helper [ 3052.191375] RIP: 0010:bio_associate_blkg_from_css+0x1e/0x3c0 [ 3052.191379] RSP: 0018:ffffc900210cfc90 EFLAGS: 00010282 [ 3052.191380] RAX: 0000000000000000 RBX: ffff88bfe5573c00 RCX: 0000000000000000 [ 3052.191382] RDX: ffff889db48ec2f0 RSI: ffff88bfe5573c00 RDI: ffff889db48ec2f0 [ 3052.191386] RBP: 0000000000000800 R08: 0000000000203bb0 R09: ffff889db16b2400 [ 3052.293364] R10: 0000000000000000 R11: ffff88a07fffde80 R12: ffff889db48ec2f0 [ 3052.293365] R13: 0000000000001000 R14: ffff889de82bc000 R15: ffff889e2b7bdcc8 [ 3052.293367] FS: 0000000000000000(0000) GS:ffff889ffba00000(0000) knlGS:0000000000000000 [ 3052.293368] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3052.293369] CR2: 0000000000000510 CR3: 0000000002611001 CR4: 00000000007606e0 [ 3052.293370] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 3052.293371] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 3052.293372] PKRU: 55555554 [ 3052.293376] Call Trace: [ 3052.402552] btrfs_submit_compressed_write+0x137/0x390 [ 3052.402558] submit_compressed_extents+0x40f/0x4c0 [ 3052.422401] btrfs_work_helper+0x246/0x5a0 [ 3052.422408] process_one_work+0x200/0x570 [ 3052.438601] ? process_one_work+0x180/0x570 [ 3052.438605] worker_thread+0x4c/0x3e0 [ 3052.438614] kthread+0x103/0x140 [ 3052.460735] ? process_one_work+0x570/0x570 [ 3052.460737] ? kthread_mod_delayed_work+0xc0/0xc0 [ 3052.460744] ret_from_fork+0x24/0x30 Fixes: 1a41802701ec ("btrfs: drop bio_set_dev where not needed") Reported-by: Chris Murphy Signed-off-by: Dennis Zhou Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 4ce81571f0cde4..de95ad27722f70 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -447,7 +447,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, if (blkcg_css) { bio->bi_opf |= REQ_CGROUP_PUNT; - bio_associate_blkg_from_css(bio, blkcg_css); + kthread_associate_blkcg(blkcg_css); } refcount_set(&cb->pending_bios, 1); @@ -491,10 +491,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; - if (blkcg_css) { + if (blkcg_css) bio->bi_opf |= REQ_CGROUP_PUNT; - bio_associate_blkg_from_css(bio, blkcg_css); - } bio_add_page(bio, page, PAGE_SIZE, 0); } if (bytes_left < PAGE_SIZE) { @@ -521,6 +519,9 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_endio(bio); } + if (blkcg_css) + kthread_associate_blkcg(NULL); + return 0; } From fe8510c709bba7a4822bc72610f63e570aab8706 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 11 Dec 2019 09:01:40 +0000 Subject: [PATCH 38/47] Btrfs: fix infinite loop during nocow writeback due to race When starting writeback for a range that covers part of a preallocated extent, due to a race with writeback for another range that also covers another part of the same preallocated extent, we can end up in an infinite loop. Consider the following example where for inode 280 we have two dirty ranges: range A, from 294912 to 303103, 8192 bytes range B, from 348160 to 438271, 90112 bytes and we have the following file extent item layout for our inode: leaf 38895616 gen 24544 total ptrs 29 free space 13820 owner 5 (...) item 27 key (280 108 200704) itemoff 14598 itemsize 53 extent data disk bytenr 0 nr 0 type 1 (regular) extent data offset 0 nr 94208 ram 94208 item 28 key (280 108 294912) itemoff 14545 itemsize 53 extent data disk bytenr 10433052672 nr 81920 type 2 (prealloc) extent data offset 0 nr 81920 ram 81920 Then the following happens: 1) Writeback starts for range B (from 348160 to 438271), execution of run_delalloc_nocow() starts; 2) The first iteration of run_delalloc_nocow()'s whil loop leaves us at the extent item at slot 28, pointing to the prealloc extent item covering the range from 294912 to 376831. This extent covers part of our range; 3) An ordered extent is created against that extent, covering the file range from 348160 to 376831 (28672 bytes); 4) We adjust 'cur_offset' to 376832 and move on to the next iteration of the while loop; 5) The call to btrfs_lookup_file_extent() leaves us at the same leaf, pointing to slot 29, 1 slot after the last item (the extent item we processed in the previous iteration); 6) Because we are a slot beyond the last item, we call btrfs_next_leaf(), which releases the search path before doing a another search for the last key of the leaf (280 108 294912); 7) Right after btrfs_next_leaf() released the path, and before it did another search for the last key of the leaf, writeback for the range A (from 294912 to 303103) completes (it was previously started at some point); 8) Upon completion of the ordered extent for range A, the prealloc extent we previously found got split into two extent items, one covering the range from 294912 to 303103 (8192 bytes), with a type of regular extent (and no longer prealloc) and another covering the range from 303104 to 376831 (73728 bytes), with a type of prealloc and an offset of 8192 bytes. So our leaf now has the following layout: leaf 38895616 gen 24544 total ptrs 31 free space 13664 owner 5 (...) item 27 key (280 108 200704) itemoff 14598 itemsize 53 extent data disk bytenr 0 nr 0 type 1 extent data offset 0 nr 8192 ram 94208 item 28 key (280 108 208896) itemoff 14545 itemsize 53 extent data disk bytenr 10433142784 nr 86016 type 1 extent data offset 0 nr 86016 ram 86016 item 29 key (280 108 294912) itemoff 14492 itemsize 53 extent data disk bytenr 10433052672 nr 81920 type 1 extent data offset 0 nr 8192 ram 81920 item 30 key (280 108 303104) itemoff 14439 itemsize 53 extent data disk bytenr 10433052672 nr 81920 type 2 extent data offset 8192 nr 73728 ram 81920 9) After btrfs_next_leaf() returns, we have our path pointing to that same leaf and at slot 30, since it has a key we didn't have before and it's the first key greater then the key that was previously the last key of the leaf (key (280 108 294912)); 10) The extent item at slot 30 covers the range from 303104 to 376831 which is in our target range, so we process it, despite having already created an ordered extent against this extent for the file range from 348160 to 376831. This is because we skip to the next extent item only if its end is less than or equals to the start of our delalloc range, and not less than or equals to the current offset ('cur_offset'); 11) As a result we compute 'num_bytes' as: num_bytes = min(end + 1, extent_end) - cur_offset; = min(438271 + 1, 376832) - 376832 = 0 12) We then call create_io_em() for a 0 bytes range starting at offset 376832; 13) Then create_io_em() enters an infinite loop because its calls to btrfs_drop_extent_cache() do nothing due to the 0 length range passed to it. So no existing extent maps that cover the offset 376832 get removed, and therefore calls to add_extent_mapping() return -EEXIST, resulting in an infinite loop. This loop from create_io_em() is the following: do { btrfs_drop_extent_cache(BTRFS_I(inode), em->start, em->start + em->len - 1, 0); write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 1); write_unlock(&em_tree->lock); /* * The caller has taken lock_extent(), who could race with us * to add em? */ } while (ret == -EEXIST); Also, each call to btrfs_drop_extent_cache() triggers a warning because the start offset passed to it (376832) is smaller then the end offset (376832 - 1) passed to it by -1, due to the 0 length: [258532.052621] ------------[ cut here ]------------ [258532.052643] WARNING: CPU: 0 PID: 9987 at fs/btrfs/file.c:602 btrfs_drop_extent_cache+0x3f4/0x590 [btrfs] (...) [258532.052672] CPU: 0 PID: 9987 Comm: fsx Tainted: G W 5.4.0-rc7-btrfs-next-64 #1 [258532.052673] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-0-ga698c8995f-prebuilt.qemu.org 04/01/2014 [258532.052691] RIP: 0010:btrfs_drop_extent_cache+0x3f4/0x590 [btrfs] (...) [258532.052695] RSP: 0018:ffffb4be0153f860 EFLAGS: 00010287 [258532.052700] RAX: ffff975b445ee360 RBX: ffff975b44eb3e08 RCX: 0000000000000000 [258532.052700] RDX: 0000000000038fff RSI: 0000000000039000 RDI: ffff975b445ee308 [258532.052700] RBP: 0000000000038fff R08: 0000000000000000 R09: 0000000000000001 [258532.052701] R10: ffff975b513c5c10 R11: 00000000e3c0cfa9 R12: 0000000000039000 [258532.052703] R13: ffff975b445ee360 R14: 00000000ffffffef R15: ffff975b445ee308 [258532.052705] FS: 00007f86a821de80(0000) GS:ffff975b76a00000(0000) knlGS:0000000000000000 [258532.052707] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [258532.052708] CR2: 00007fdacf0f3ab4 CR3: 00000001f9d26002 CR4: 00000000003606f0 [258532.052712] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [258532.052717] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [258532.052717] Call Trace: [258532.052718] ? preempt_schedule_common+0x32/0x70 [258532.052722] ? ___preempt_schedule+0x16/0x20 [258532.052741] create_io_em+0xff/0x180 [btrfs] [258532.052767] run_delalloc_nocow+0x942/0xb10 [btrfs] [258532.052791] btrfs_run_delalloc_range+0x30b/0x520 [btrfs] [258532.052812] ? find_lock_delalloc_range+0x221/0x250 [btrfs] [258532.052834] writepage_delalloc+0xe4/0x140 [btrfs] [258532.052855] __extent_writepage+0x110/0x4e0 [btrfs] [258532.052876] extent_write_cache_pages+0x21c/0x480 [btrfs] [258532.052906] extent_writepages+0x52/0xb0 [btrfs] [258532.052911] do_writepages+0x23/0x80 [258532.052915] __filemap_fdatawrite_range+0xd2/0x110 [258532.052938] btrfs_fdatawrite_range+0x1b/0x50 [btrfs] [258532.052954] start_ordered_ops+0x57/0xa0 [btrfs] [258532.052973] ? btrfs_sync_file+0x225/0x490 [btrfs] [258532.052988] btrfs_sync_file+0x225/0x490 [btrfs] [258532.052997] __x64_sys_msync+0x199/0x200 [258532.053004] do_syscall_64+0x5c/0x250 [258532.053007] entry_SYSCALL_64_after_hwframe+0x49/0xbe [258532.053010] RIP: 0033:0x7f86a7dfd760 (...) [258532.053014] RSP: 002b:00007ffd99af0368 EFLAGS: 00000246 ORIG_RAX: 000000000000001a [258532.053016] RAX: ffffffffffffffda RBX: 0000000000000ec9 RCX: 00007f86a7dfd760 [258532.053017] RDX: 0000000000000004 RSI: 000000000000836c RDI: 00007f86a8221000 [258532.053019] RBP: 0000000000021ec9 R08: 0000000000000003 R09: 00007f86a812037c [258532.053020] R10: 0000000000000001 R11: 0000000000000246 R12: 00000000000074a3 [258532.053021] R13: 00007f86a8221000 R14: 000000000000836c R15: 0000000000000001 [258532.053032] irq event stamp: 1653450494 [258532.053035] hardirqs last enabled at (1653450493): [] _raw_spin_unlock_irq+0x29/0x50 [258532.053037] hardirqs last disabled at (1653450494): [] trace_hardirqs_off_thunk+0x1a/0x20 [258532.053039] softirqs last enabled at (1653449852): [] __do_softirq+0x466/0x6bd [258532.053042] softirqs last disabled at (1653449845): [] irq_exit+0xec/0x120 [258532.053043] ---[ end trace 8476fce13d9ce20a ]--- Which results in flooding dmesg/syslog since btrfs_drop_extent_cache() uses WARN_ON() and not WARN_ON_ONCE(). So fix this issue by changing run_delalloc_nocow()'s loop to move to the next extent item when the current extent item ends at at offset less than or equals to the current offset instead of the start offset. Fixes: 80ff385665b7fc ("Btrfs: update nodatacow code v2") CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f7c91aea6a6dc9..dbc9bcaf583121 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1477,10 +1477,10 @@ static noinline int run_delalloc_nocow(struct inode *inode, disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); /* - * If extent we got ends before our range starts, skip - * to next extent + * If the extent we got ends before our current offset, + * skip to the next extent. */ - if (extent_end <= start) { + if (extent_end <= cur_offset) { path->slots[0]++; goto next_slot; } From 4eb110d9d1e0ef5a920fc53459bd54507909ed9c Mon Sep 17 00:00:00 2001 From: zhengbin Date: Thu, 19 Dec 2019 17:25:34 +0800 Subject: [PATCH 39/47] btrfs: Remove unneeded semicolon Fixes coccicheck warning: fs/btrfs/print-tree.c:320:3-4: Unneeded semicolon Reported-by: Hulk Robot Signed-off-by: zhengbin Signed-off-by: David Sterba --- fs/btrfs/print-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 873b6b694107a1..61f44e78e3c9e7 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -317,7 +317,7 @@ void btrfs_print_leaf(struct extent_buffer *l) print_uuid_item(l, btrfs_item_ptr_offset(l, i), btrfs_item_size_nr(l, i)); break; - }; + } } } From 51f2e54e971d500adaa1c27536cc526a140e9f59 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 17 Dec 2019 18:58:20 +0800 Subject: [PATCH 40/47] btrfs: tree-checker: Check leaf chunk item size Inspired by btrfs-progs github issue #208, where chunk item in chunk tree has invalid num_stripes (0). Although that can already be caught by current btrfs_check_chunk_valid(), that function doesn't really check item size as it needs to handle chunk item in super block sys_chunk_array(). This patch will add two extra checks for chunk items in chunk tree: - Basic chunk item size If the item is smaller than btrfs_chunk (which already contains one stripe), exit right now as reading num_stripes may even go beyond eb boundary. - Item size check against num_stripes If item size doesn't match with calculated chunk size, then either the item size or the num_stripes is corrupted. Error out anyway. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 97f3520b8d98d2..16b7b5408bd4c9 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -738,6 +738,44 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, return 0; } +/* + * Enhanced version of chunk item checker. + * + * The common btrfs_check_chunk_valid() doesn't check item size since it needs + * to work on super block sys_chunk_array which doesn't have full item ptr. + */ +static int check_leaf_chunk_item(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + struct btrfs_key *key, int slot) +{ + int num_stripes; + + if (btrfs_item_size_nr(leaf, slot) < sizeof(struct btrfs_chunk)) { + chunk_err(leaf, chunk, key->offset, + "invalid chunk item size: have %u expect [%zu, %u)", + btrfs_item_size_nr(leaf, slot), + sizeof(struct btrfs_chunk), + BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); + return -EUCLEAN; + } + + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + /* Let btrfs_check_chunk_valid() handle this error type */ + if (num_stripes == 0) + goto out; + + if (btrfs_chunk_item_size(num_stripes) != + btrfs_item_size_nr(leaf, slot)) { + chunk_err(leaf, chunk, key->offset, + "invalid chunk item size: have %u expect %lu", + btrfs_item_size_nr(leaf, slot), + btrfs_chunk_item_size(num_stripes)); + return -EUCLEAN; + } +out: + return btrfs_check_chunk_valid(leaf, chunk, key->offset); +} + __printf(3, 4) __cold static void dev_item_err(const struct extent_buffer *eb, int slot, @@ -1384,7 +1422,7 @@ static int check_leaf_item(struct extent_buffer *leaf, break; case BTRFS_CHUNK_ITEM_KEY: chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); - ret = btrfs_check_chunk_valid(leaf, chunk, key->offset); + ret = check_leaf_chunk_item(leaf, chunk, key, slot); break; case BTRFS_DEV_ITEM_KEY: ret = check_dev_item(leaf, key, slot); From 21b1a8e5a3cc398fbaeeaf37af347c0ffcba48d5 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 9 Dec 2019 18:54:32 +0800 Subject: [PATCH 41/47] btrfs: tree-checker: Clean up fs_info parameter from error message wrapper The @fs_info parameter can be extracted from extent_buffer structure, and there are already some wrappers getting rid of the @fs_info parameter. Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 16b7b5408bd4c9..b6f3dc713d5ed5 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -839,7 +839,7 @@ static int check_dev_item(struct extent_buffer *leaf, } /* Inode item error output has the same format as dir_item_err() */ -#define inode_item_err(fs_info, eb, slot, fmt, ...) \ +#define inode_item_err(eb, slot, fmt, ...) \ dir_item_err(eb, slot, fmt, __VA_ARGS__) static int check_inode_item(struct extent_buffer *leaf, @@ -864,7 +864,7 @@ static int check_inode_item(struct extent_buffer *leaf, return -EUCLEAN; } if (key->offset != 0) { - inode_item_err(fs_info, leaf, slot, + inode_item_err(leaf, slot, "invalid key offset: has %llu expect 0", key->offset); return -EUCLEAN; @@ -873,7 +873,7 @@ static int check_inode_item(struct extent_buffer *leaf, /* Here we use super block generation + 1 to handle log tree */ if (btrfs_inode_generation(leaf, iitem) > super_gen + 1) { - inode_item_err(fs_info, leaf, slot, + inode_item_err(leaf, slot, "invalid inode generation: has %llu expect (0, %llu]", btrfs_inode_generation(leaf, iitem), super_gen + 1); @@ -881,7 +881,7 @@ static int check_inode_item(struct extent_buffer *leaf, } /* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */ if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) { - inode_item_err(fs_info, leaf, slot, + inode_item_err(leaf, slot, "invalid inode generation: has %llu expect [0, %llu]", btrfs_inode_transid(leaf, iitem), super_gen + 1); return -EUCLEAN; @@ -894,7 +894,7 @@ static int check_inode_item(struct extent_buffer *leaf, */ mode = btrfs_inode_mode(leaf, iitem); if (mode & ~valid_mask) { - inode_item_err(fs_info, leaf, slot, + inode_item_err(leaf, slot, "unknown mode bit detected: 0x%x", mode & ~valid_mask); return -EUCLEAN; @@ -907,20 +907,20 @@ static int check_inode_item(struct extent_buffer *leaf, */ if (!has_single_bit_set(mode & S_IFMT)) { if (!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode)) { - inode_item_err(fs_info, leaf, slot, + inode_item_err(leaf, slot, "invalid mode: has 0%o expect valid S_IF* bit(s)", mode & S_IFMT); return -EUCLEAN; } } if (S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1) { - inode_item_err(fs_info, leaf, slot, + inode_item_err(leaf, slot, "invalid nlink: has %u expect no more than 1 for dir", btrfs_inode_nlink(leaf, iitem)); return -EUCLEAN; } if (btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK) { - inode_item_err(fs_info, leaf, slot, + inode_item_err(leaf, slot, "unknown flags detected: 0x%llx", btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK); @@ -1340,8 +1340,8 @@ static int check_extent_data_ref(struct extent_buffer *leaf, return 0; } -#define inode_ref_err(fs_info, eb, slot, fmt, args...) \ - inode_item_err(fs_info, eb, slot, fmt, ##args) +#define inode_ref_err(eb, slot, fmt, args...) \ + inode_item_err(eb, slot, fmt, ##args) static int check_inode_ref(struct extent_buffer *leaf, struct btrfs_key *key, struct btrfs_key *prev_key, int slot) @@ -1354,7 +1354,7 @@ static int check_inode_ref(struct extent_buffer *leaf, return -EUCLEAN; /* namelen can't be 0, so item_size == sizeof() is also invalid */ if (btrfs_item_size_nr(leaf, slot) <= sizeof(*iref)) { - inode_ref_err(fs_info, leaf, slot, + inode_ref_err(leaf, slot, "invalid item size, have %u expect (%zu, %u)", btrfs_item_size_nr(leaf, slot), sizeof(*iref), BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); @@ -1367,7 +1367,7 @@ static int check_inode_ref(struct extent_buffer *leaf, u16 namelen; if (ptr + sizeof(iref) > end) { - inode_ref_err(fs_info, leaf, slot, + inode_ref_err(leaf, slot, "inode ref overflow, ptr %lu end %lu inode_ref_size %zu", ptr, end, sizeof(iref)); return -EUCLEAN; @@ -1376,7 +1376,7 @@ static int check_inode_ref(struct extent_buffer *leaf, iref = (struct btrfs_inode_ref *)ptr; namelen = btrfs_inode_ref_name_len(leaf, iref); if (ptr + sizeof(*iref) + namelen > end) { - inode_ref_err(fs_info, leaf, slot, + inode_ref_err(leaf, slot, "inode ref overflow, ptr %lu end %lu namelen %u", ptr, end, namelen); return -EUCLEAN; From a02b5c63a83ff174d50e8a19ebe64e5962b599d5 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 9 Dec 2019 18:54:33 +0800 Subject: [PATCH 42/47] btrfs: tree-checker: Refactor inode key check into seperate function Inode key check is not as easy as several lines, and it will be called in more than one location (INODE_ITEM check and DIR_ITEM/DIR_INDEX/XATTR_ITEM location key check). So here refactor such check into check_inode_key(). And add extra checks for XATTR_ITEM. Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 78 +++++++++++++++++++++++++++++++---------- 1 file changed, 60 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index b6f3dc713d5ed5..285c362b31219a 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -373,6 +373,61 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, return 0; } +/* Inode item error output has the same format as dir_item_err() */ +#define inode_item_err(eb, slot, fmt, ...) \ + dir_item_err(eb, slot, fmt, __VA_ARGS__) + +static int check_inode_key(struct extent_buffer *leaf, struct btrfs_key *key, + int slot) +{ + struct btrfs_key item_key; + bool is_inode_item; + + btrfs_item_key_to_cpu(leaf, &item_key, slot); + is_inode_item = (item_key.type == BTRFS_INODE_ITEM_KEY); + + /* For XATTR_ITEM, location key should be all 0 */ + if (item_key.type == BTRFS_XATTR_ITEM_KEY) { + if (key->type != 0 || key->objectid != 0 || key->offset != 0) + return -EUCLEAN; + return 0; + } + + if ((key->objectid < BTRFS_FIRST_FREE_OBJECTID || + key->objectid > BTRFS_LAST_FREE_OBJECTID) && + key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID && + key->objectid != BTRFS_FREE_INO_OBJECTID) { + if (is_inode_item) { + generic_err(leaf, slot, + "invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu", + key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID, + BTRFS_FIRST_FREE_OBJECTID, + BTRFS_LAST_FREE_OBJECTID, + BTRFS_FREE_INO_OBJECTID); + } else { + dir_item_err(leaf, slot, +"invalid location key objectid: has %llu expect %llu or [%llu, %llu] or %llu", + key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID, + BTRFS_FIRST_FREE_OBJECTID, + BTRFS_LAST_FREE_OBJECTID, + BTRFS_FREE_INO_OBJECTID); + } + return -EUCLEAN; + } + if (key->offset != 0) { + if (is_inode_item) + inode_item_err(leaf, slot, + "invalid key offset: has %llu expect 0", + key->offset); + else + dir_item_err(leaf, slot, + "invalid location key offset:has %llu expect 0", + key->offset); + return -EUCLEAN; + } + return 0; +} + static int check_dir_item(struct extent_buffer *leaf, struct btrfs_key *key, struct btrfs_key *prev_key, int slot) @@ -850,25 +905,12 @@ static int check_inode_item(struct extent_buffer *leaf, u64 super_gen = btrfs_super_generation(fs_info->super_copy); u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777); u32 mode; + int ret; + + ret = check_inode_key(leaf, key, slot); + if (ret < 0) + return ret; - if ((key->objectid < BTRFS_FIRST_FREE_OBJECTID || - key->objectid > BTRFS_LAST_FREE_OBJECTID) && - key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID && - key->objectid != BTRFS_FREE_INO_OBJECTID) { - generic_err(leaf, slot, - "invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu", - key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID, - BTRFS_FIRST_FREE_OBJECTID, - BTRFS_LAST_FREE_OBJECTID, - BTRFS_FREE_INO_OBJECTID); - return -EUCLEAN; - } - if (key->offset != 0) { - inode_item_err(leaf, slot, - "invalid key offset: has %llu expect 0", - key->offset); - return -EUCLEAN; - } iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item); /* Here we use super block generation + 1 to handle log tree */ From 44de0889a1b10d3c083f71772e5f39c321726e15 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 9 Dec 2019 18:54:34 +0800 Subject: [PATCH 43/47] btrfs: tree-checker: Refactor root key check into separate function ROOT_ITEM key check itself is not as simple as single line check, and will be reused for both ROOT_ITEM and DIR_ITEM/DIR_INDEX location key check, so refactor such check into check_root_key(). Also since we are here, fix a comment error about ROOT_ITEM offset, which is transid of snapshot creation, not some "older kernel behavior". Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 62 +++++++++++++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 285c362b31219a..9fc438be4bd936 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -428,6 +428,49 @@ static int check_inode_key(struct extent_buffer *leaf, struct btrfs_key *key, return 0; } +static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key, + int slot) +{ + struct btrfs_key item_key; + bool is_root_item; + + btrfs_item_key_to_cpu(leaf, &item_key, slot); + is_root_item = (item_key.type == BTRFS_ROOT_ITEM_KEY); + + /* No such tree id */ + if (key->objectid == 0) { + if (is_root_item) + generic_err(leaf, slot, "invalid root id 0"); + else + dir_item_err(leaf, slot, + "invalid location key root id 0"); + return -EUCLEAN; + } + + /* DIR_ITEM/INDEX/INODE_REF is not allowed to point to non-fs trees */ + if (!is_fstree(key->objectid) && !is_root_item) { + dir_item_err(leaf, slot, + "invalid location key objectid, have %llu expect [%llu, %llu]", + key->objectid, BTRFS_FIRST_FREE_OBJECTID, + BTRFS_LAST_FREE_OBJECTID); + return -EUCLEAN; + } + + /* + * ROOT_ITEM with non-zero offset means this is a snapshot, created at + * @offset transid. + * Furthermore, for location key in DIR_ITEM, its offset is always -1. + * + * So here we only check offset for reloc tree whose key->offset must + * be a valid tree. + */ + if (key->objectid == BTRFS_TREE_RELOC_OBJECTID && key->offset == 0) { + generic_err(leaf, slot, "invalid root id 0 for reloc tree"); + return -EUCLEAN; + } + return 0; +} + static int check_dir_item(struct extent_buffer *leaf, struct btrfs_key *key, struct btrfs_key *prev_key, int slot) @@ -978,22 +1021,11 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, struct btrfs_root_item ri; const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY | BTRFS_ROOT_SUBVOL_DEAD; + int ret; - /* No such tree id */ - if (key->objectid == 0) { - generic_err(leaf, slot, "invalid root id 0"); - return -EUCLEAN; - } - - /* - * Some older kernel may create ROOT_ITEM with non-zero offset, so here - * we only check offset for reloc tree whose key->offset must be a - * valid tree. - */ - if (key->objectid == BTRFS_TREE_RELOC_OBJECTID && key->offset == 0) { - generic_err(leaf, slot, "invalid root id 0 for reloc tree"); - return -EUCLEAN; - } + ret = check_root_key(leaf, key, slot); + if (ret < 0) + return ret; if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) { generic_err(leaf, slot, From a9183d7d153dc122ae8c1bbb9fa2dba7bb92990e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 9 Dec 2019 18:54:35 +0800 Subject: [PATCH 44/47] btrfs: tree-checker: Verify location key for DIR_ITEM/DIR_INDEX [PROBLEM] There is a user report in the mail list, showing the following corrupted tree blocks: item 62 key (486836 DIR_ITEM 2543451757) itemoff 6273 itemsize 74 location key (4065004 INODE_ITEM 1073741824) type FILE transid 21397 data_len 0 name_len 44 name: FILENAME Note that location key, its offset should be 0 for all INODE_ITEMS. This caused failed lookup of the inode. [CAUSE] That offending value, 1073741824, is 0x40000000. So this looks like a memory bit flip. [FIX] This patch will enhance tree-checker to check location key of DIR_INDEX/DIR_ITEM/XATTR_ITEM. There are several different combinations needs to check: - item_key.type == DIR_INDEX/DIR_ITEM * location_key.type == BTRFS_INODE_ITEM_KEY This location_key should follow the check in inode_item check. * location_key.type == BTRFS_ROOT_ITEM_KEY Despite the existing check, DIR_INDEX/DIR_ITEM can only points to subvolume trees. * All other keys are not allowed. - item_key.type == XATTR_ITEM location_key should be all 0. Reported-by: Mike Gilbert Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 9fc438be4bd936..a92f8a6dd19291 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -484,12 +484,14 @@ static int check_dir_item(struct extent_buffer *leaf, return -EUCLEAN; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); while (cur < item_size) { + struct btrfs_key location_key; u32 name_len; u32 data_len; u32 max_name_len; u32 total_size; u32 name_hash; u8 dir_type; + int ret; /* header itself should not cross item boundary */ if (cur + sizeof(*di) > item_size) { @@ -499,6 +501,25 @@ static int check_dir_item(struct extent_buffer *leaf, return -EUCLEAN; } + /* Location key check */ + btrfs_dir_item_key_to_cpu(leaf, di, &location_key); + if (location_key.type == BTRFS_ROOT_ITEM_KEY) { + ret = check_root_key(leaf, &location_key, slot); + if (ret < 0) + return ret; + } else if (location_key.type == BTRFS_INODE_ITEM_KEY || + location_key.type == 0) { + ret = check_inode_key(leaf, &location_key, slot); + if (ret < 0) + return ret; + } else { + dir_item_err(leaf, slot, + "invalid location key type, have %u, expect %u or %u", + location_key.type, BTRFS_ROOT_ITEM_KEY, + BTRFS_INODE_ITEM_KEY); + return -EUCLEAN; + } + /* dir type check */ dir_type = btrfs_dir_type(leaf, di); if (dir_type >= BTRFS_FT_MAX) { From 9c120e4e45615e494989736a0588ded592577ea1 Mon Sep 17 00:00:00 2001 From: Stijn Rutjens Date: Sat, 4 Jan 2020 20:47:54 +0100 Subject: [PATCH 45/47] btrfs: Allow storing compression level per extent --- fs/btrfs/compression.c | 4 ++-- fs/btrfs/inode.c | 33 +++++++++++++++++++++++++-------- fs/btrfs/ioctl.c | 10 +++++++--- fs/btrfs/tree-checker.c | 4 ++-- 4 files changed, 36 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index de95ad27722f70..3663c3128f1ed5 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1201,7 +1201,7 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) { struct list_head *workspace; int ret; - int type = cb->compress_type; + int type = cb->compress_type & 0xF; workspace = get_workspace(type, 0); ret = compression_decompress_bio(type, workspace, cb); @@ -1220,7 +1220,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, { struct list_head *workspace; int ret; - + type = type & 0xF; workspace = get_workspace(type, 0); ret = compression_decompress(type, workspace, data_in, dest_page, start_byte, srclen, destlen); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dbc9bcaf583121..76a4f037aa6268 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -573,14 +573,31 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) } /* Compression level is applied here and only here */ - ret = btrfs_compress_pages( - compress_type | (fs_info->compress_level << 4), - inode->i_mapping, start, - pages, - &nr_pages, - &total_in, - &total_compressed); - + /* + * Check if the upper bits are set, and if so, + * take them as the compression level. + * the inode compression level takes precendence, if set + */ + if ((compress_type & 0xF) == compress_type) { + ret = btrfs_compress_pages( + compress_type | (fs_info->compress_level << 4), + inode->i_mapping, start, + pages, + &nr_pages, + &total_in, + &total_compressed); + } else { + int compress_level = btrfs_compress_set_level( + compress_type & 0xF, + compress_type>>4); + ret = btrfs_compress_pages( + compress_type | (compress_level << 4), + inode->i_mapping, start, + pages, + &nr_pages, + &total_in, + &total_compressed); + } if (!ret) { unsigned long offset = offset_in_page(total_compressed); struct page *page = pages[nr_pages - 1]; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0fa1c386d02064..f8ff1563e9db13 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1414,7 +1414,11 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, return -EINVAL; if (do_compress) { - if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) + /* + * The bottom 4 bits of compress_type are for used for the + * compression type, the other bits for the compression level + */ + if ((range->compress_type & 0xF) >= BTRFS_NR_COMPRESS_TYPES) return -EINVAL; if (range->compress_type) compress_type = range->compress_type; @@ -1572,9 +1576,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, filemap_flush(inode->i_mapping); } - if (range->compress_type == BTRFS_COMPRESS_LZO) { + if ((range->compress_type & 0xF) == BTRFS_COMPRESS_LZO) { btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); - } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) { + } else if ((range->compress_type & 0xF) == BTRFS_COMPRESS_ZSTD) { btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); } diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index a92f8a6dd19291..54057d5207b7b6 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -244,10 +244,10 @@ static int check_extent_data_item(struct extent_buffer *leaf, * Support for new compression/encryption must introduce incompat flag, * and must be caught in open_ctree(). */ - if (btrfs_file_extent_compression(leaf, fi) >= BTRFS_NR_COMPRESS_TYPES) { + if ((btrfs_file_extent_compression(leaf, fi) & 0xF) >= BTRFS_NR_COMPRESS_TYPES) { file_extent_err(leaf, slot, "invalid compression for file extent, have %u expect range [0, %u]", - btrfs_file_extent_compression(leaf, fi), + btrfs_file_extent_compression(leaf, fi) & 0xF, BTRFS_NR_COMPRESS_TYPES - 1); return -EUCLEAN; } From b8998e5dd182342414e33004ee485e5ef759b2a2 Mon Sep 17 00:00:00 2001 From: qwerty123443 Date: Sat, 11 Apr 2020 22:32:24 +0200 Subject: [PATCH 46/47] Update inode.c --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 76a4f037aa6268..baf813672257d2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -227,7 +227,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, compressed_size -= cur_size; } btrfs_set_file_extent_compression(leaf, ei, - compress_type); + compress_type & 0XF); } else { page = find_get_page(inode->i_mapping, start >> PAGE_SHIFT); From 4c6f10d8f62e1f2ebcd097d48900c8c2e756e9a9 Mon Sep 17 00:00:00 2001 From: qwerty123443 Date: Sat, 11 Apr 2020 22:33:52 +0200 Subject: [PATCH 47/47] Update tree-checker.c --- fs/btrfs/tree-checker.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 54057d5207b7b6..a92f8a6dd19291 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -244,10 +244,10 @@ static int check_extent_data_item(struct extent_buffer *leaf, * Support for new compression/encryption must introduce incompat flag, * and must be caught in open_ctree(). */ - if ((btrfs_file_extent_compression(leaf, fi) & 0xF) >= BTRFS_NR_COMPRESS_TYPES) { + if (btrfs_file_extent_compression(leaf, fi) >= BTRFS_NR_COMPRESS_TYPES) { file_extent_err(leaf, slot, "invalid compression for file extent, have %u expect range [0, %u]", - btrfs_file_extent_compression(leaf, fi) & 0xF, + btrfs_file_extent_compression(leaf, fi), BTRFS_NR_COMPRESS_TYPES - 1); return -EUCLEAN; }