linux_dsm_epyc7002/fs/btrfs/inode-map.c

582 lines
15 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
*/
#include <linux/kthread.h>
#include <linux/pagemap.h>
#include "ctree.h"
#include "disk-io.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "transaction.h"
#include "delalloc-space.h"
static void fail_caching_thread(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
btrfs_warn(fs_info, "failed to start inode caching task");
btrfs_clear_pending_and_info(fs_info, INODE_MAP_CACHE,
"disabling inode map caching");
spin_lock(&root->ino_cache_lock);
root->ino_cache_state = BTRFS_CACHE_ERROR;
spin_unlock(&root->ino_cache_lock);
wake_up(&root->ino_cache_wait);
}
static int caching_kthread(void *data)
{
struct btrfs_root *root = data;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
struct btrfs_key key;
struct btrfs_path *path;
struct extent_buffer *leaf;
u64 last = (u64)-1;
int slot;
int ret;
if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
return 0;
path = btrfs_alloc_path();
if (!path) {
fail_caching_thread(root);
return -ENOMEM;
}
/* Since the commit root is read-only, we can safely skip locking. */
path->skip_locking = 1;
path->search_commit_root = 1;
path->reada = READA_FORWARD;
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
key.offset = 0;
key.type = BTRFS_INODE_ITEM_KEY;
again:
/* need to make sure the commit_root doesn't disappear */
down_read(&fs_info->commit_root_sem);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
while (1) {
if (btrfs_fs_closing(fs_info))
goto out;
leaf = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto out;
else if (ret > 0)
break;
if (need_resched() ||
btrfs_transaction_in_commit(fs_info)) {
leaf = path->nodes[0];
if (WARN_ON(btrfs_header_nritems(leaf) == 0))
break;
/*
* Save the key so we can advances forward
* in the next search.
*/
btrfs_item_key_to_cpu(leaf, &key, 0);
btrfs_release_path(path);
root->ino_cache_progress = last;
up_read(&fs_info->commit_root_sem);
schedule_timeout(1);
goto again;
} else
continue;
}
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.type != BTRFS_INODE_ITEM_KEY)
goto next;
if (key.objectid >= root->highest_objectid)
break;
if (last != (u64)-1 && last + 1 != key.objectid) {
__btrfs_add_free_space(fs_info, ctl, last + 1,
key.objectid - last - 1, 0);
wake_up(&root->ino_cache_wait);
}
last = key.objectid;
next:
path->slots[0]++;
}
if (last < root->highest_objectid - 1) {
__btrfs_add_free_space(fs_info, ctl, last + 1,
root->highest_objectid - last - 1, 0);
}
spin_lock(&root->ino_cache_lock);
root->ino_cache_state = BTRFS_CACHE_FINISHED;
spin_unlock(&root->ino_cache_lock);
root->ino_cache_progress = (u64)-1;
btrfs_unpin_free_ino(root);
out:
wake_up(&root->ino_cache_wait);
up_read(&fs_info->commit_root_sem);
btrfs_free_path(path);
return ret;
}
static void start_caching(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
struct task_struct *tsk;
int ret;
u64 objectid;
if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
return;
spin_lock(&root->ino_cache_lock);
if (root->ino_cache_state != BTRFS_CACHE_NO) {
spin_unlock(&root->ino_cache_lock);
return;
}
root->ino_cache_state = BTRFS_CACHE_STARTED;
spin_unlock(&root->ino_cache_lock);
ret = load_free_ino_cache(fs_info, root);
if (ret == 1) {
spin_lock(&root->ino_cache_lock);
root->ino_cache_state = BTRFS_CACHE_FINISHED;
spin_unlock(&root->ino_cache_lock);
Btrfs: fix hang when loading existing inode cache off disk If we are able to load an existing inode cache off disk, we set the state of the cache to BTRFS_CACHE_FINISHED, but we don't wake up any one waiting for the cache to be available. This means that anyone waiting for the cache to be available, waiting on the condition that either its state is BTRFS_CACHE_FINISHED or its available free space is greather than zero, can hang forever. This could be observed running fstests with MOUNT_OPTIONS="-o inode_cache", in particular test case generic/161 triggered it very frequently for me, producing a trace like the following: [63795.739712] BTRFS info (device sdc): enabling inode map caching [63795.739714] BTRFS info (device sdc): disk space caching is enabled [63795.739716] BTRFS info (device sdc): has skinny extents [64036.653886] INFO: task btrfs-transacti:3917 blocked for more than 120 seconds. [64036.654079] Not tainted 5.2.0-rc4-btrfs-next-50 #1 [64036.654143] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [64036.654232] btrfs-transacti D 0 3917 2 0x80004000 [64036.654239] Call Trace: [64036.654258] ? __schedule+0x3ae/0x7b0 [64036.654271] schedule+0x3a/0xb0 [64036.654325] btrfs_commit_transaction+0x978/0xae0 [btrfs] [64036.654339] ? remove_wait_queue+0x60/0x60 [64036.654395] transaction_kthread+0x146/0x180 [btrfs] [64036.654450] ? btrfs_cleanup_transaction+0x620/0x620 [btrfs] [64036.654456] kthread+0x103/0x140 [64036.654464] ? kthread_create_worker_on_cpu+0x70/0x70 [64036.654476] ret_from_fork+0x3a/0x50 [64036.654504] INFO: task xfs_io:3919 blocked for more than 120 seconds. [64036.654568] Not tainted 5.2.0-rc4-btrfs-next-50 #1 [64036.654617] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [64036.654685] xfs_io D 0 3919 3633 0x00000000 [64036.654691] Call Trace: [64036.654703] ? __schedule+0x3ae/0x7b0 [64036.654716] schedule+0x3a/0xb0 [64036.654756] btrfs_find_free_ino+0xa9/0x120 [btrfs] [64036.654764] ? remove_wait_queue+0x60/0x60 [64036.654809] btrfs_create+0x72/0x1f0 [btrfs] [64036.654822] lookup_open+0x6bc/0x790 [64036.654849] path_openat+0x3bc/0xc00 [64036.654854] ? __lock_acquire+0x331/0x1cb0 [64036.654869] do_filp_open+0x99/0x110 [64036.654884] ? __alloc_fd+0xee/0x200 [64036.654895] ? do_raw_spin_unlock+0x49/0xc0 [64036.654909] ? do_sys_open+0x132/0x220 [64036.654913] do_sys_open+0x132/0x220 [64036.654926] do_syscall_64+0x60/0x1d0 [64036.654933] entry_SYSCALL_64_after_hwframe+0x49/0xbe Fix this by adding a wake_up() call right after setting the cache state to BTRFS_CACHE_FINISHED, at start_caching(), when we are able to load the cache from disk. Fixes: 82d5902d9c681b ("Btrfs: Support reading/writing on disk free ino cache") Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-07-04 22:24:09 +07:00
wake_up(&root->ino_cache_wait);
return;
}
/*
* It can be quite time-consuming to fill the cache by searching
* through the extent tree, and this can keep ino allocation path
* waiting. Therefore at start we quickly find out the highest
* inode number and we know we can use inode numbers which fall in
* [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID].
*/
ret = btrfs_find_free_objectid(root, &objectid);
if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) {
__btrfs_add_free_space(fs_info, ctl, objectid,
BTRFS_LAST_FREE_OBJECTID - objectid + 1,
0);
wake_up(&root->ino_cache_wait);
}
tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu",
root->root_key.objectid);
if (IS_ERR(tsk))
fail_caching_thread(root);
}
int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
{
if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
return btrfs_find_free_objectid(root, objectid);
again:
*objectid = btrfs_find_ino_for_alloc(root);
if (*objectid != 0)
return 0;
start_caching(root);
wait_event(root->ino_cache_wait,
root->ino_cache_state == BTRFS_CACHE_FINISHED ||
root->ino_cache_state == BTRFS_CACHE_ERROR ||
root->free_ino_ctl->free_space > 0);
if (root->ino_cache_state == BTRFS_CACHE_FINISHED &&
root->free_ino_ctl->free_space == 0)
return -ENOSPC;
else if (root->ino_cache_state == BTRFS_CACHE_ERROR)
return btrfs_find_free_objectid(root, objectid);
else
goto again;
}
void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
return;
again:
if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
__btrfs_add_free_space(fs_info, pinned, objectid, 1, 0);
} else {
down_write(&fs_info->commit_root_sem);
spin_lock(&root->ino_cache_lock);
if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
spin_unlock(&root->ino_cache_lock);
up_write(&fs_info->commit_root_sem);
goto again;
}
spin_unlock(&root->ino_cache_lock);
start_caching(root);
__btrfs_add_free_space(fs_info, pinned, objectid, 1, 0);
up_write(&fs_info->commit_root_sem);
}
}
/*
* When a transaction is committed, we'll move those inode numbers which are
* smaller than root->ino_cache_progress from pinned tree to free_ino tree, and
* others will just be dropped, because the commit root we were searching has
* changed.
*
* Must be called with root->fs_info->commit_root_sem held
*/
void btrfs_unpin_free_ino(struct btrfs_root *root)
{
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset;
Btrfs: fix race between caching kthread and returning inode to inode cache While the inode cache caching kthread is calling btrfs_unpin_free_ino(), we could have a concurrent call to btrfs_return_ino() that adds a new entry to the root's free space cache of pinned inodes. This concurrent call does not acquire the fs_info->commit_root_sem before adding a new entry if the caching state is BTRFS_CACHE_FINISHED, which is a problem because the caching kthread calls btrfs_unpin_free_ino() after setting the caching state to BTRFS_CACHE_FINISHED and therefore races with the task calling btrfs_return_ino(), which is adding a new entry, while the former (caching kthread) is navigating the cache's rbtree, removing and freeing nodes from the cache's rbtree without acquiring the spinlock that protects the rbtree. This race resulted in memory corruption due to double free of struct btrfs_free_space objects because both tasks can end up doing freeing the same objects. Note that adding a new entry can result in merging it with other entries in the cache, in which case those entries are freed. This is particularly important as btrfs_free_space structures are also used for the block group free space caches. This memory corruption can be detected by a debugging kernel, which reports it with the following trace: [132408.501148] slab error in verify_redzone_free(): cache `btrfs_free_space': double free detected [132408.505075] CPU: 15 PID: 12248 Comm: btrfs-ino-cache Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [132408.505075] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [132408.505075] ffff880023e7d320 ffff880163d73cd8 ffffffff8145eec7 ffffffff81095dce [132408.505075] ffff880009735d40 ffff880163d73ce8 ffffffff81154e1e ffff880163d73d68 [132408.505075] ffffffff81155733 ffffffffa054a95a ffff8801b6099f00 ffffffffa0505b5f [132408.505075] Call Trace: [132408.505075] [<ffffffff8145eec7>] dump_stack+0x4f/0x7b [132408.505075] [<ffffffff81095dce>] ? console_unlock+0x356/0x3a2 [132408.505075] [<ffffffff81154e1e>] __slab_error.isra.28+0x25/0x36 [132408.505075] [<ffffffff81155733>] __cache_free+0xe2/0x4b6 [132408.505075] [<ffffffffa054a95a>] ? __btrfs_add_free_space+0x2f0/0x343 [btrfs] [132408.505075] [<ffffffffa0505b5f>] ? btrfs_unpin_free_ino+0x8e/0x99 [btrfs] [132408.505075] [<ffffffff810f3b30>] ? time_hardirqs_off+0x15/0x28 [132408.505075] [<ffffffff81084d42>] ? trace_hardirqs_off+0xd/0xf [132408.505075] [<ffffffff811563a1>] ? kfree+0xb6/0x14e [132408.505075] [<ffffffff811563d0>] kfree+0xe5/0x14e [132408.505075] [<ffffffffa0505b5f>] btrfs_unpin_free_ino+0x8e/0x99 [btrfs] [132408.505075] [<ffffffffa0505e08>] caching_kthread+0x29e/0x2d9 [btrfs] [132408.505075] [<ffffffffa0505b6a>] ? btrfs_unpin_free_ino+0x99/0x99 [btrfs] [132408.505075] [<ffffffff8106698f>] kthread+0xef/0xf7 [132408.505075] [<ffffffff810f3b08>] ? time_hardirqs_on+0x15/0x28 [132408.505075] [<ffffffff810668a0>] ? __kthread_parkme+0xad/0xad [132408.505075] [<ffffffff814653d2>] ret_from_fork+0x42/0x70 [132408.505075] [<ffffffff810668a0>] ? __kthread_parkme+0xad/0xad [132408.505075] ffff880023e7d320: redzone 1:0x9f911029d74e35b, redzone 2:0x9f911029d74e35b. [132409.501654] slab: double free detected in cache 'btrfs_free_space', objp ffff880023e7d320 [132409.503355] ------------[ cut here ]------------ [132409.504241] kernel BUG at mm/slab.c:2571! Therefore fix this by having btrfs_unpin_free_ino() acquire the lock that protects the rbtree while doing the searches and removing entries. Fixes: 1c70d8fb4dfa ("Btrfs: fix inode caching vs tree log") Cc: stable@vger.kernel.org Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-13 12:52:57 +07:00
spinlock_t *rbroot_lock = &root->free_ino_pinned->tree_lock;
struct btrfs_free_space *info;
struct rb_node *n;
u64 count;
if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
return;
while (1) {
Btrfs: fix race between caching kthread and returning inode to inode cache While the inode cache caching kthread is calling btrfs_unpin_free_ino(), we could have a concurrent call to btrfs_return_ino() that adds a new entry to the root's free space cache of pinned inodes. This concurrent call does not acquire the fs_info->commit_root_sem before adding a new entry if the caching state is BTRFS_CACHE_FINISHED, which is a problem because the caching kthread calls btrfs_unpin_free_ino() after setting the caching state to BTRFS_CACHE_FINISHED and therefore races with the task calling btrfs_return_ino(), which is adding a new entry, while the former (caching kthread) is navigating the cache's rbtree, removing and freeing nodes from the cache's rbtree without acquiring the spinlock that protects the rbtree. This race resulted in memory corruption due to double free of struct btrfs_free_space objects because both tasks can end up doing freeing the same objects. Note that adding a new entry can result in merging it with other entries in the cache, in which case those entries are freed. This is particularly important as btrfs_free_space structures are also used for the block group free space caches. This memory corruption can be detected by a debugging kernel, which reports it with the following trace: [132408.501148] slab error in verify_redzone_free(): cache `btrfs_free_space': double free detected [132408.505075] CPU: 15 PID: 12248 Comm: btrfs-ino-cache Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [132408.505075] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [132408.505075] ffff880023e7d320 ffff880163d73cd8 ffffffff8145eec7 ffffffff81095dce [132408.505075] ffff880009735d40 ffff880163d73ce8 ffffffff81154e1e ffff880163d73d68 [132408.505075] ffffffff81155733 ffffffffa054a95a ffff8801b6099f00 ffffffffa0505b5f [132408.505075] Call Trace: [132408.505075] [<ffffffff8145eec7>] dump_stack+0x4f/0x7b [132408.505075] [<ffffffff81095dce>] ? console_unlock+0x356/0x3a2 [132408.505075] [<ffffffff81154e1e>] __slab_error.isra.28+0x25/0x36 [132408.505075] [<ffffffff81155733>] __cache_free+0xe2/0x4b6 [132408.505075] [<ffffffffa054a95a>] ? __btrfs_add_free_space+0x2f0/0x343 [btrfs] [132408.505075] [<ffffffffa0505b5f>] ? btrfs_unpin_free_ino+0x8e/0x99 [btrfs] [132408.505075] [<ffffffff810f3b30>] ? time_hardirqs_off+0x15/0x28 [132408.505075] [<ffffffff81084d42>] ? trace_hardirqs_off+0xd/0xf [132408.505075] [<ffffffff811563a1>] ? kfree+0xb6/0x14e [132408.505075] [<ffffffff811563d0>] kfree+0xe5/0x14e [132408.505075] [<ffffffffa0505b5f>] btrfs_unpin_free_ino+0x8e/0x99 [btrfs] [132408.505075] [<ffffffffa0505e08>] caching_kthread+0x29e/0x2d9 [btrfs] [132408.505075] [<ffffffffa0505b6a>] ? btrfs_unpin_free_ino+0x99/0x99 [btrfs] [132408.505075] [<ffffffff8106698f>] kthread+0xef/0xf7 [132408.505075] [<ffffffff810f3b08>] ? time_hardirqs_on+0x15/0x28 [132408.505075] [<ffffffff810668a0>] ? __kthread_parkme+0xad/0xad [132408.505075] [<ffffffff814653d2>] ret_from_fork+0x42/0x70 [132408.505075] [<ffffffff810668a0>] ? __kthread_parkme+0xad/0xad [132408.505075] ffff880023e7d320: redzone 1:0x9f911029d74e35b, redzone 2:0x9f911029d74e35b. [132409.501654] slab: double free detected in cache 'btrfs_free_space', objp ffff880023e7d320 [132409.503355] ------------[ cut here ]------------ [132409.504241] kernel BUG at mm/slab.c:2571! Therefore fix this by having btrfs_unpin_free_ino() acquire the lock that protects the rbtree while doing the searches and removing entries. Fixes: 1c70d8fb4dfa ("Btrfs: fix inode caching vs tree log") Cc: stable@vger.kernel.org Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-13 12:52:57 +07:00
spin_lock(rbroot_lock);
n = rb_first(rbroot);
Btrfs: fix race between caching kthread and returning inode to inode cache While the inode cache caching kthread is calling btrfs_unpin_free_ino(), we could have a concurrent call to btrfs_return_ino() that adds a new entry to the root's free space cache of pinned inodes. This concurrent call does not acquire the fs_info->commit_root_sem before adding a new entry if the caching state is BTRFS_CACHE_FINISHED, which is a problem because the caching kthread calls btrfs_unpin_free_ino() after setting the caching state to BTRFS_CACHE_FINISHED and therefore races with the task calling btrfs_return_ino(), which is adding a new entry, while the former (caching kthread) is navigating the cache's rbtree, removing and freeing nodes from the cache's rbtree without acquiring the spinlock that protects the rbtree. This race resulted in memory corruption due to double free of struct btrfs_free_space objects because both tasks can end up doing freeing the same objects. Note that adding a new entry can result in merging it with other entries in the cache, in which case those entries are freed. This is particularly important as btrfs_free_space structures are also used for the block group free space caches. This memory corruption can be detected by a debugging kernel, which reports it with the following trace: [132408.501148] slab error in verify_redzone_free(): cache `btrfs_free_space': double free detected [132408.505075] CPU: 15 PID: 12248 Comm: btrfs-ino-cache Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [132408.505075] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [132408.505075] ffff880023e7d320 ffff880163d73cd8 ffffffff8145eec7 ffffffff81095dce [132408.505075] ffff880009735d40 ffff880163d73ce8 ffffffff81154e1e ffff880163d73d68 [132408.505075] ffffffff81155733 ffffffffa054a95a ffff8801b6099f00 ffffffffa0505b5f [132408.505075] Call Trace: [132408.505075] [<ffffffff8145eec7>] dump_stack+0x4f/0x7b [132408.505075] [<ffffffff81095dce>] ? console_unlock+0x356/0x3a2 [132408.505075] [<ffffffff81154e1e>] __slab_error.isra.28+0x25/0x36 [132408.505075] [<ffffffff81155733>] __cache_free+0xe2/0x4b6 [132408.505075] [<ffffffffa054a95a>] ? __btrfs_add_free_space+0x2f0/0x343 [btrfs] [132408.505075] [<ffffffffa0505b5f>] ? btrfs_unpin_free_ino+0x8e/0x99 [btrfs] [132408.505075] [<ffffffff810f3b30>] ? time_hardirqs_off+0x15/0x28 [132408.505075] [<ffffffff81084d42>] ? trace_hardirqs_off+0xd/0xf [132408.505075] [<ffffffff811563a1>] ? kfree+0xb6/0x14e [132408.505075] [<ffffffff811563d0>] kfree+0xe5/0x14e [132408.505075] [<ffffffffa0505b5f>] btrfs_unpin_free_ino+0x8e/0x99 [btrfs] [132408.505075] [<ffffffffa0505e08>] caching_kthread+0x29e/0x2d9 [btrfs] [132408.505075] [<ffffffffa0505b6a>] ? btrfs_unpin_free_ino+0x99/0x99 [btrfs] [132408.505075] [<ffffffff8106698f>] kthread+0xef/0xf7 [132408.505075] [<ffffffff810f3b08>] ? time_hardirqs_on+0x15/0x28 [132408.505075] [<ffffffff810668a0>] ? __kthread_parkme+0xad/0xad [132408.505075] [<ffffffff814653d2>] ret_from_fork+0x42/0x70 [132408.505075] [<ffffffff810668a0>] ? __kthread_parkme+0xad/0xad [132408.505075] ffff880023e7d320: redzone 1:0x9f911029d74e35b, redzone 2:0x9f911029d74e35b. [132409.501654] slab: double free detected in cache 'btrfs_free_space', objp ffff880023e7d320 [132409.503355] ------------[ cut here ]------------ [132409.504241] kernel BUG at mm/slab.c:2571! Therefore fix this by having btrfs_unpin_free_ino() acquire the lock that protects the rbtree while doing the searches and removing entries. Fixes: 1c70d8fb4dfa ("Btrfs: fix inode caching vs tree log") Cc: stable@vger.kernel.org Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-13 12:52:57 +07:00
if (!n) {
spin_unlock(rbroot_lock);
break;
Btrfs: fix race between caching kthread and returning inode to inode cache While the inode cache caching kthread is calling btrfs_unpin_free_ino(), we could have a concurrent call to btrfs_return_ino() that adds a new entry to the root's free space cache of pinned inodes. This concurrent call does not acquire the fs_info->commit_root_sem before adding a new entry if the caching state is BTRFS_CACHE_FINISHED, which is a problem because the caching kthread calls btrfs_unpin_free_ino() after setting the caching state to BTRFS_CACHE_FINISHED and therefore races with the task calling btrfs_return_ino(), which is adding a new entry, while the former (caching kthread) is navigating the cache's rbtree, removing and freeing nodes from the cache's rbtree without acquiring the spinlock that protects the rbtree. This race resulted in memory corruption due to double free of struct btrfs_free_space objects because both tasks can end up doing freeing the same objects. Note that adding a new entry can result in merging it with other entries in the cache, in which case those entries are freed. This is particularly important as btrfs_free_space structures are also used for the block group free space caches. This memory corruption can be detected by a debugging kernel, which reports it with the following trace: [132408.501148] slab error in verify_redzone_free(): cache `btrfs_free_space': double free detected [132408.505075] CPU: 15 PID: 12248 Comm: btrfs-ino-cache Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [132408.505075] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [132408.505075] ffff880023e7d320 ffff880163d73cd8 ffffffff8145eec7 ffffffff81095dce [132408.505075] ffff880009735d40 ffff880163d73ce8 ffffffff81154e1e ffff880163d73d68 [132408.505075] ffffffff81155733 ffffffffa054a95a ffff8801b6099f00 ffffffffa0505b5f [132408.505075] Call Trace: [132408.505075] [<ffffffff8145eec7>] dump_stack+0x4f/0x7b [132408.505075] [<ffffffff81095dce>] ? console_unlock+0x356/0x3a2 [132408.505075] [<ffffffff81154e1e>] __slab_error.isra.28+0x25/0x36 [132408.505075] [<ffffffff81155733>] __cache_free+0xe2/0x4b6 [132408.505075] [<ffffffffa054a95a>] ? __btrfs_add_free_space+0x2f0/0x343 [btrfs] [132408.505075] [<ffffffffa0505b5f>] ? btrfs_unpin_free_ino+0x8e/0x99 [btrfs] [132408.505075] [<ffffffff810f3b30>] ? time_hardirqs_off+0x15/0x28 [132408.505075] [<ffffffff81084d42>] ? trace_hardirqs_off+0xd/0xf [132408.505075] [<ffffffff811563a1>] ? kfree+0xb6/0x14e [132408.505075] [<ffffffff811563d0>] kfree+0xe5/0x14e [132408.505075] [<ffffffffa0505b5f>] btrfs_unpin_free_ino+0x8e/0x99 [btrfs] [132408.505075] [<ffffffffa0505e08>] caching_kthread+0x29e/0x2d9 [btrfs] [132408.505075] [<ffffffffa0505b6a>] ? btrfs_unpin_free_ino+0x99/0x99 [btrfs] [132408.505075] [<ffffffff8106698f>] kthread+0xef/0xf7 [132408.505075] [<ffffffff810f3b08>] ? time_hardirqs_on+0x15/0x28 [132408.505075] [<ffffffff810668a0>] ? __kthread_parkme+0xad/0xad [132408.505075] [<ffffffff814653d2>] ret_from_fork+0x42/0x70 [132408.505075] [<ffffffff810668a0>] ? __kthread_parkme+0xad/0xad [132408.505075] ffff880023e7d320: redzone 1:0x9f911029d74e35b, redzone 2:0x9f911029d74e35b. [132409.501654] slab: double free detected in cache 'btrfs_free_space', objp ffff880023e7d320 [132409.503355] ------------[ cut here ]------------ [132409.504241] kernel BUG at mm/slab.c:2571! Therefore fix this by having btrfs_unpin_free_ino() acquire the lock that protects the rbtree while doing the searches and removing entries. Fixes: 1c70d8fb4dfa ("Btrfs: fix inode caching vs tree log") Cc: stable@vger.kernel.org Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-13 12:52:57 +07:00
}
info = rb_entry(n, struct btrfs_free_space, offset_index);
BUG_ON(info->bitmap); /* Logic error */
if (info->offset > root->ino_cache_progress)
count = 0;
else
count = min(root->ino_cache_progress - info->offset + 1,
info->bytes);
rb_erase(&info->offset_index, rbroot);
Btrfs: fix race between caching kthread and returning inode to inode cache While the inode cache caching kthread is calling btrfs_unpin_free_ino(), we could have a concurrent call to btrfs_return_ino() that adds a new entry to the root's free space cache of pinned inodes. This concurrent call does not acquire the fs_info->commit_root_sem before adding a new entry if the caching state is BTRFS_CACHE_FINISHED, which is a problem because the caching kthread calls btrfs_unpin_free_ino() after setting the caching state to BTRFS_CACHE_FINISHED and therefore races with the task calling btrfs_return_ino(), which is adding a new entry, while the former (caching kthread) is navigating the cache's rbtree, removing and freeing nodes from the cache's rbtree without acquiring the spinlock that protects the rbtree. This race resulted in memory corruption due to double free of struct btrfs_free_space objects because both tasks can end up doing freeing the same objects. Note that adding a new entry can result in merging it with other entries in the cache, in which case those entries are freed. This is particularly important as btrfs_free_space structures are also used for the block group free space caches. This memory corruption can be detected by a debugging kernel, which reports it with the following trace: [132408.501148] slab error in verify_redzone_free(): cache `btrfs_free_space': double free detected [132408.505075] CPU: 15 PID: 12248 Comm: btrfs-ino-cache Tainted: G W 4.1.0-rc5-btrfs-next-10+ #1 [132408.505075] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 [132408.505075] ffff880023e7d320 ffff880163d73cd8 ffffffff8145eec7 ffffffff81095dce [132408.505075] ffff880009735d40 ffff880163d73ce8 ffffffff81154e1e ffff880163d73d68 [132408.505075] ffffffff81155733 ffffffffa054a95a ffff8801b6099f00 ffffffffa0505b5f [132408.505075] Call Trace: [132408.505075] [<ffffffff8145eec7>] dump_stack+0x4f/0x7b [132408.505075] [<ffffffff81095dce>] ? console_unlock+0x356/0x3a2 [132408.505075] [<ffffffff81154e1e>] __slab_error.isra.28+0x25/0x36 [132408.505075] [<ffffffff81155733>] __cache_free+0xe2/0x4b6 [132408.505075] [<ffffffffa054a95a>] ? __btrfs_add_free_space+0x2f0/0x343 [btrfs] [132408.505075] [<ffffffffa0505b5f>] ? btrfs_unpin_free_ino+0x8e/0x99 [btrfs] [132408.505075] [<ffffffff810f3b30>] ? time_hardirqs_off+0x15/0x28 [132408.505075] [<ffffffff81084d42>] ? trace_hardirqs_off+0xd/0xf [132408.505075] [<ffffffff811563a1>] ? kfree+0xb6/0x14e [132408.505075] [<ffffffff811563d0>] kfree+0xe5/0x14e [132408.505075] [<ffffffffa0505b5f>] btrfs_unpin_free_ino+0x8e/0x99 [btrfs] [132408.505075] [<ffffffffa0505e08>] caching_kthread+0x29e/0x2d9 [btrfs] [132408.505075] [<ffffffffa0505b6a>] ? btrfs_unpin_free_ino+0x99/0x99 [btrfs] [132408.505075] [<ffffffff8106698f>] kthread+0xef/0xf7 [132408.505075] [<ffffffff810f3b08>] ? time_hardirqs_on+0x15/0x28 [132408.505075] [<ffffffff810668a0>] ? __kthread_parkme+0xad/0xad [132408.505075] [<ffffffff814653d2>] ret_from_fork+0x42/0x70 [132408.505075] [<ffffffff810668a0>] ? __kthread_parkme+0xad/0xad [132408.505075] ffff880023e7d320: redzone 1:0x9f911029d74e35b, redzone 2:0x9f911029d74e35b. [132409.501654] slab: double free detected in cache 'btrfs_free_space', objp ffff880023e7d320 [132409.503355] ------------[ cut here ]------------ [132409.504241] kernel BUG at mm/slab.c:2571! Therefore fix this by having btrfs_unpin_free_ino() acquire the lock that protects the rbtree while doing the searches and removing entries. Fixes: 1c70d8fb4dfa ("Btrfs: fix inode caching vs tree log") Cc: stable@vger.kernel.org Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-06-13 12:52:57 +07:00
spin_unlock(rbroot_lock);
if (count)
__btrfs_add_free_space(root->fs_info, ctl,
info->offset, count, 0);
kmem_cache_free(btrfs_free_space_cachep, info);
}
}
#define INIT_THRESHOLD ((SZ_32K / 2) / sizeof(struct btrfs_free_space))
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 19:29:47 +07:00
#define INODES_PER_BITMAP (PAGE_SIZE * 8)
/*
* The goal is to keep the memory used by the free_ino tree won't
* exceed the memory if we use bitmaps only.
*/
static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
struct btrfs_free_space *info;
struct rb_node *n;
int max_ino;
int max_bitmaps;
n = rb_last(&ctl->free_space_offset);
if (!n) {
ctl->extents_thresh = INIT_THRESHOLD;
return;
}
info = rb_entry(n, struct btrfs_free_space, offset_index);
/*
* Find the maximum inode number in the filesystem. Note we
* ignore the fact that this can be a bitmap, because we are
* not doing precise calculation.
*/
max_ino = info->bytes - 1;
max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP;
if (max_bitmaps <= ctl->total_bitmaps) {
ctl->extents_thresh = 0;
return;
}
ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) *
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 19:29:47 +07:00
PAGE_SIZE / sizeof(*info);
}
/*
* We don't fall back to bitmap, if we are below the extents threshold
* or this chunk of inode numbers is a big one.
*/
static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
if (ctl->free_extents < ctl->extents_thresh ||
info->bytes > INODES_PER_BITMAP / 10)
return false;
return true;
}
static const struct btrfs_free_space_op free_ino_op = {
.recalc_thresholds = recalculate_thresholds,
.use_bitmap = use_bitmap,
};
static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl)
{
}
static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
/*
* We always use extents for two reasons:
*
* - The pinned tree is only used during the process of caching
* work.
* - Make code simpler. See btrfs_unpin_free_ino().
*/
return false;
}
static const struct btrfs_free_space_op pinned_free_ino_op = {
.recalc_thresholds = pinned_recalc_thresholds,
.use_bitmap = pinned_use_bitmap,
};
void btrfs_init_free_ino_ctl(struct btrfs_root *root)
{
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
spin_lock_init(&ctl->tree_lock);
ctl->unit = 1;
ctl->start = 0;
ctl->private = NULL;
ctl->op = &free_ino_op;
Btrfs: fix race between writing free space cache and trimming Trimming is completely transactionless, and the way it operates consists of hiding free space entries from a block group, perform the trim/discard and then make the free space entries visible again. Therefore while a free space entry is being trimmed, we can have free space cache writing running in parallel (as part of a transaction commit) which will miss the free space entry. This means that an unmount (or crash/reboot) after that transaction commit and mount again before another transaction starts/commits after the discard finishes, we will have some free space that won't be used again unless the free space cache is rebuilt. After the unmount, fsck (btrfsck, btrfs check) reports the issue like the following example: *** fsck.btrfs output *** checking extents checking free space cache There is no free space entry for 521764864-521781248 There is no free space entry for 521764864-1103101952 cache appears valid but isnt 29360128 Checking filesystem on /dev/sdc UUID: b4789e27-4774-4626-98e9-ae8dfbfb0fb5 found 1235681286 bytes used err is -22 (...) Another issue caused by this race is a crash while writing bitmap entries to the cache, because while the cache writeout task accesses the bitmaps, the trim task can be concurrently modifying the bitmap or worse might be freeing the bitmap. The later case results in the following crash: [55650.804460] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC [55650.804835] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc loop parport_pc parport i2c_piix4 psmouse evdev pcspkr microcode processor i2ccore serio_raw thermal_sys button ext4 crc16 jbd2 mbcache sg sd_mod crc_t10dif sr_mod cdrom crct10dif_generic crct10dif_common ata_generic virtio_scsi floppy ata_piix libata virtio_pci virtio_ring virtio scsi_mod e1000 [last unloaded: btrfs] [55650.806169] CPU: 1 PID: 31002 Comm: btrfs-transacti Tainted: G W 3.17.0-rc5-btrfs-next-1+ #1 [55650.806493] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014 [55650.806867] task: ffff8800b12f6410 ti: ffff880071538000 task.ti: ffff880071538000 [55650.807166] RIP: 0010:[<ffffffffa037cf45>] [<ffffffffa037cf45>] write_bitmap_entries+0x65/0xbb [btrfs] [55650.807514] RSP: 0018:ffff88007153bc30 EFLAGS: 00010246 [55650.807687] RAX: 000000005d1ec000 RBX: ffff8800a665df08 RCX: 0000000000000400 [55650.807885] RDX: ffff88005d1ec000 RSI: 6b6b6b6b6b6b6b6b RDI: ffff88005d1ec000 [55650.808017] RBP: ffff88007153bc58 R08: 00000000ddd51536 R09: 00000000000001e0 [55650.808017] R10: 0000000000000000 R11: 0000000000000037 R12: 6b6b6b6b6b6b6b6b [55650.808017] R13: ffff88007153bca8 R14: 6b6b6b6b6b6b6b6b R15: ffff88007153bc98 [55650.808017] FS: 0000000000000000(0000) GS:ffff88023ec80000(0000) knlGS:0000000000000000 [55650.808017] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [55650.808017] CR2: 0000000002273b88 CR3: 00000000b18f6000 CR4: 00000000000006e0 [55650.808017] Stack: [55650.808017] ffff88020e834e00 ffff880172d68db0 0000000000000000 ffff88019257c800 [55650.808017] ffff8801d42ea720 ffff88007153bd10 ffffffffa037d2fa ffff880224e99180 [55650.808017] ffff8801469a6188 ffff880224e99140 ffff880172d68c50 00000003000000b7 [55650.808017] Call Trace: [55650.808017] [<ffffffffa037d2fa>] __btrfs_write_out_cache+0x1ea/0x37f [btrfs] [55650.808017] [<ffffffffa037d959>] btrfs_write_out_cache+0xa1/0xd8 [btrfs] [55650.808017] [<ffffffffa033936b>] btrfs_write_dirty_block_groups+0x4b5/0x505 [btrfs] [55650.808017] [<ffffffffa03aa98e>] commit_cowonly_roots+0x15e/0x1f7 [btrfs] [55650.808017] [<ffffffff813eb9c7>] ? _raw_spin_lock+0xe/0x10 [55650.808017] [<ffffffffa0346e46>] btrfs_commit_transaction+0x411/0x882 [btrfs] [55650.808017] [<ffffffffa03432a4>] transaction_kthread+0xf2/0x1a4 [btrfs] [55650.808017] [<ffffffffa03431b2>] ? btrfs_cleanup_transaction+0x3d8/0x3d8 [btrfs] [55650.808017] [<ffffffff8105966b>] kthread+0xb7/0xbf [55650.808017] [<ffffffff810595b4>] ? __kthread_parkme+0x67/0x67 [55650.808017] [<ffffffff813ebeac>] ret_from_fork+0x7c/0xb0 [55650.808017] [<ffffffff810595b4>] ? __kthread_parkme+0x67/0x67 [55650.808017] Code: 4c 89 ef 8d 70 ff e8 d4 fc ff ff 41 8b 45 34 41 39 45 30 7d 5c 31 f6 4c 89 ef e8 80 f6 ff ff 49 8b 7d 00 4c 89 f6 b9 00 04 00 00 <f3> a5 4c 89 ef 41 8b 45 30 8d 70 ff e8 a3 fc ff ff 41 8b 45 34 [55650.808017] RIP [<ffffffffa037cf45>] write_bitmap_entries+0x65/0xbb [btrfs] [55650.808017] RSP <ffff88007153bc30> [55650.815725] ---[ end trace 1c032e96b149ff86 ]--- Fix this by serializing both tasks in such a way that cache writeout doesn't wait for the trim/discard of free space entries to finish and doesn't miss any free space entry. Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-12-02 00:04:09 +07:00
INIT_LIST_HEAD(&ctl->trimming_ranges);
mutex_init(&ctl->cache_writeout_mutex);
/*
* Initially we allow to use 16K of ram to cache chunks of
* inode numbers before we resort to bitmaps. This is somewhat
* arbitrary, but it will be adjusted in runtime.
*/
ctl->extents_thresh = INIT_THRESHOLD;
spin_lock_init(&pinned->tree_lock);
pinned->unit = 1;
pinned->start = 0;
pinned->private = NULL;
pinned->extents_thresh = 0;
pinned->op = &pinned_free_ino_op;
}
int btrfs_save_ino_cache(struct btrfs_root *root,
struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
struct btrfs_path *path;
struct inode *inode;
Btrfs: fix no reserved space for writing out inode cache I-node cache forgets to reserve the space when writing out it. And when we do some stress test, such as synctest, it will trigger WARN_ON() in use_block_rsv(). WARNING: at fs/btrfs/extent-tree.c:5718 btrfs_alloc_free_block+0xbf/0x281 [btrfs]() ... Call Trace: [<ffffffff8104df86>] warn_slowpath_common+0x80/0x98 [<ffffffff8104dfb3>] warn_slowpath_null+0x15/0x17 [<ffffffffa0369c60>] btrfs_alloc_free_block+0xbf/0x281 [btrfs] [<ffffffff810cbcb8>] ? __set_page_dirty_nobuffers+0xfe/0x108 [<ffffffffa035c040>] __btrfs_cow_block+0x118/0x3b5 [btrfs] [<ffffffffa035c7ba>] btrfs_cow_block+0x103/0x14e [btrfs] [<ffffffffa035e4c4>] btrfs_search_slot+0x249/0x6a4 [btrfs] [<ffffffffa036d086>] btrfs_lookup_inode+0x2a/0x8a [btrfs] [<ffffffffa03788b7>] btrfs_update_inode+0xaa/0x141 [btrfs] [<ffffffffa036d7ec>] btrfs_save_ino_cache+0xea/0x202 [btrfs] [<ffffffffa03a761e>] ? btrfs_update_reloc_root+0x17e/0x197 [btrfs] [<ffffffffa0373867>] commit_fs_roots+0xaa/0x158 [btrfs] [<ffffffffa03746a6>] btrfs_commit_transaction+0x405/0x731 [btrfs] [<ffffffff810690df>] ? wake_up_bit+0x25/0x25 [<ffffffffa039d652>] ? btrfs_log_dentry_safe+0x43/0x51 [btrfs] [<ffffffffa0381c5f>] btrfs_sync_file+0x16a/0x198 [btrfs] [<ffffffff81122806>] ? mntput+0x21/0x23 [<ffffffff8112d150>] vfs_fsync_range+0x18/0x21 [<ffffffff8112d170>] vfs_fsync+0x17/0x19 [<ffffffff8112d316>] do_fsync+0x29/0x3e [<ffffffff8112d348>] sys_fsync+0xb/0xf [<ffffffff81468352>] system_call_fastpath+0x16/0x1b Sometimes it causes BUG_ON() in the reservation code of the delayed inode is triggered. So we must reserve enough space for inode cache. Note: If we can not reserve the enough space for inode cache, we will give up writing out it. Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-11-11 08:45:04 +07:00
struct btrfs_block_rsv *rsv;
struct extent_changeset *data_reserved = NULL;
Btrfs: fix no reserved space for writing out inode cache I-node cache forgets to reserve the space when writing out it. And when we do some stress test, such as synctest, it will trigger WARN_ON() in use_block_rsv(). WARNING: at fs/btrfs/extent-tree.c:5718 btrfs_alloc_free_block+0xbf/0x281 [btrfs]() ... Call Trace: [<ffffffff8104df86>] warn_slowpath_common+0x80/0x98 [<ffffffff8104dfb3>] warn_slowpath_null+0x15/0x17 [<ffffffffa0369c60>] btrfs_alloc_free_block+0xbf/0x281 [btrfs] [<ffffffff810cbcb8>] ? __set_page_dirty_nobuffers+0xfe/0x108 [<ffffffffa035c040>] __btrfs_cow_block+0x118/0x3b5 [btrfs] [<ffffffffa035c7ba>] btrfs_cow_block+0x103/0x14e [btrfs] [<ffffffffa035e4c4>] btrfs_search_slot+0x249/0x6a4 [btrfs] [<ffffffffa036d086>] btrfs_lookup_inode+0x2a/0x8a [btrfs] [<ffffffffa03788b7>] btrfs_update_inode+0xaa/0x141 [btrfs] [<ffffffffa036d7ec>] btrfs_save_ino_cache+0xea/0x202 [btrfs] [<ffffffffa03a761e>] ? btrfs_update_reloc_root+0x17e/0x197 [btrfs] [<ffffffffa0373867>] commit_fs_roots+0xaa/0x158 [btrfs] [<ffffffffa03746a6>] btrfs_commit_transaction+0x405/0x731 [btrfs] [<ffffffff810690df>] ? wake_up_bit+0x25/0x25 [<ffffffffa039d652>] ? btrfs_log_dentry_safe+0x43/0x51 [btrfs] [<ffffffffa0381c5f>] btrfs_sync_file+0x16a/0x198 [btrfs] [<ffffffff81122806>] ? mntput+0x21/0x23 [<ffffffff8112d150>] vfs_fsync_range+0x18/0x21 [<ffffffff8112d170>] vfs_fsync+0x17/0x19 [<ffffffff8112d316>] do_fsync+0x29/0x3e [<ffffffff8112d348>] sys_fsync+0xb/0xf [<ffffffff81468352>] system_call_fastpath+0x16/0x1b Sometimes it causes BUG_ON() in the reservation code of the delayed inode is triggered. So we must reserve enough space for inode cache. Note: If we can not reserve the enough space for inode cache, we will give up writing out it. Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-11-11 08:45:04 +07:00
u64 num_bytes;
u64 alloc_hint = 0;
int ret;
int prealloc;
bool retry = false;
/* only fs tree and subvol/snap needs ino cache */
if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID &&
(root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID ||
root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID))
return 0;
/* Don't save inode cache if we are deleting this root */
if (btrfs_root_refs(&root->root_item) == 0)
return 0;
if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
return 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
Btrfs: fix no reserved space for writing out inode cache I-node cache forgets to reserve the space when writing out it. And when we do some stress test, such as synctest, it will trigger WARN_ON() in use_block_rsv(). WARNING: at fs/btrfs/extent-tree.c:5718 btrfs_alloc_free_block+0xbf/0x281 [btrfs]() ... Call Trace: [<ffffffff8104df86>] warn_slowpath_common+0x80/0x98 [<ffffffff8104dfb3>] warn_slowpath_null+0x15/0x17 [<ffffffffa0369c60>] btrfs_alloc_free_block+0xbf/0x281 [btrfs] [<ffffffff810cbcb8>] ? __set_page_dirty_nobuffers+0xfe/0x108 [<ffffffffa035c040>] __btrfs_cow_block+0x118/0x3b5 [btrfs] [<ffffffffa035c7ba>] btrfs_cow_block+0x103/0x14e [btrfs] [<ffffffffa035e4c4>] btrfs_search_slot+0x249/0x6a4 [btrfs] [<ffffffffa036d086>] btrfs_lookup_inode+0x2a/0x8a [btrfs] [<ffffffffa03788b7>] btrfs_update_inode+0xaa/0x141 [btrfs] [<ffffffffa036d7ec>] btrfs_save_ino_cache+0xea/0x202 [btrfs] [<ffffffffa03a761e>] ? btrfs_update_reloc_root+0x17e/0x197 [btrfs] [<ffffffffa0373867>] commit_fs_roots+0xaa/0x158 [btrfs] [<ffffffffa03746a6>] btrfs_commit_transaction+0x405/0x731 [btrfs] [<ffffffff810690df>] ? wake_up_bit+0x25/0x25 [<ffffffffa039d652>] ? btrfs_log_dentry_safe+0x43/0x51 [btrfs] [<ffffffffa0381c5f>] btrfs_sync_file+0x16a/0x198 [btrfs] [<ffffffff81122806>] ? mntput+0x21/0x23 [<ffffffff8112d150>] vfs_fsync_range+0x18/0x21 [<ffffffff8112d170>] vfs_fsync+0x17/0x19 [<ffffffff8112d316>] do_fsync+0x29/0x3e [<ffffffff8112d348>] sys_fsync+0xb/0xf [<ffffffff81468352>] system_call_fastpath+0x16/0x1b Sometimes it causes BUG_ON() in the reservation code of the delayed inode is triggered. So we must reserve enough space for inode cache. Note: If we can not reserve the enough space for inode cache, we will give up writing out it. Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-11-11 08:45:04 +07:00
rsv = trans->block_rsv;
trans->block_rsv = &fs_info->trans_block_rsv;
Btrfs: fix no reserved space for writing out inode cache I-node cache forgets to reserve the space when writing out it. And when we do some stress test, such as synctest, it will trigger WARN_ON() in use_block_rsv(). WARNING: at fs/btrfs/extent-tree.c:5718 btrfs_alloc_free_block+0xbf/0x281 [btrfs]() ... Call Trace: [<ffffffff8104df86>] warn_slowpath_common+0x80/0x98 [<ffffffff8104dfb3>] warn_slowpath_null+0x15/0x17 [<ffffffffa0369c60>] btrfs_alloc_free_block+0xbf/0x281 [btrfs] [<ffffffff810cbcb8>] ? __set_page_dirty_nobuffers+0xfe/0x108 [<ffffffffa035c040>] __btrfs_cow_block+0x118/0x3b5 [btrfs] [<ffffffffa035c7ba>] btrfs_cow_block+0x103/0x14e [btrfs] [<ffffffffa035e4c4>] btrfs_search_slot+0x249/0x6a4 [btrfs] [<ffffffffa036d086>] btrfs_lookup_inode+0x2a/0x8a [btrfs] [<ffffffffa03788b7>] btrfs_update_inode+0xaa/0x141 [btrfs] [<ffffffffa036d7ec>] btrfs_save_ino_cache+0xea/0x202 [btrfs] [<ffffffffa03a761e>] ? btrfs_update_reloc_root+0x17e/0x197 [btrfs] [<ffffffffa0373867>] commit_fs_roots+0xaa/0x158 [btrfs] [<ffffffffa03746a6>] btrfs_commit_transaction+0x405/0x731 [btrfs] [<ffffffff810690df>] ? wake_up_bit+0x25/0x25 [<ffffffffa039d652>] ? btrfs_log_dentry_safe+0x43/0x51 [btrfs] [<ffffffffa0381c5f>] btrfs_sync_file+0x16a/0x198 [btrfs] [<ffffffff81122806>] ? mntput+0x21/0x23 [<ffffffff8112d150>] vfs_fsync_range+0x18/0x21 [<ffffffff8112d170>] vfs_fsync+0x17/0x19 [<ffffffff8112d316>] do_fsync+0x29/0x3e [<ffffffff8112d348>] sys_fsync+0xb/0xf [<ffffffff81468352>] system_call_fastpath+0x16/0x1b Sometimes it causes BUG_ON() in the reservation code of the delayed inode is triggered. So we must reserve enough space for inode cache. Note: If we can not reserve the enough space for inode cache, we will give up writing out it. Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-11-11 08:45:04 +07:00
num_bytes = trans->bytes_reserved;
/*
* 1 item for inode item insertion if need
* 4 items for inode item update (in the worst case)
* 1 items for slack space if we need do truncation
Btrfs: fix no reserved space for writing out inode cache I-node cache forgets to reserve the space when writing out it. And when we do some stress test, such as synctest, it will trigger WARN_ON() in use_block_rsv(). WARNING: at fs/btrfs/extent-tree.c:5718 btrfs_alloc_free_block+0xbf/0x281 [btrfs]() ... Call Trace: [<ffffffff8104df86>] warn_slowpath_common+0x80/0x98 [<ffffffff8104dfb3>] warn_slowpath_null+0x15/0x17 [<ffffffffa0369c60>] btrfs_alloc_free_block+0xbf/0x281 [btrfs] [<ffffffff810cbcb8>] ? __set_page_dirty_nobuffers+0xfe/0x108 [<ffffffffa035c040>] __btrfs_cow_block+0x118/0x3b5 [btrfs] [<ffffffffa035c7ba>] btrfs_cow_block+0x103/0x14e [btrfs] [<ffffffffa035e4c4>] btrfs_search_slot+0x249/0x6a4 [btrfs] [<ffffffffa036d086>] btrfs_lookup_inode+0x2a/0x8a [btrfs] [<ffffffffa03788b7>] btrfs_update_inode+0xaa/0x141 [btrfs] [<ffffffffa036d7ec>] btrfs_save_ino_cache+0xea/0x202 [btrfs] [<ffffffffa03a761e>] ? btrfs_update_reloc_root+0x17e/0x197 [btrfs] [<ffffffffa0373867>] commit_fs_roots+0xaa/0x158 [btrfs] [<ffffffffa03746a6>] btrfs_commit_transaction+0x405/0x731 [btrfs] [<ffffffff810690df>] ? wake_up_bit+0x25/0x25 [<ffffffffa039d652>] ? btrfs_log_dentry_safe+0x43/0x51 [btrfs] [<ffffffffa0381c5f>] btrfs_sync_file+0x16a/0x198 [btrfs] [<ffffffff81122806>] ? mntput+0x21/0x23 [<ffffffff8112d150>] vfs_fsync_range+0x18/0x21 [<ffffffff8112d170>] vfs_fsync+0x17/0x19 [<ffffffff8112d316>] do_fsync+0x29/0x3e [<ffffffff8112d348>] sys_fsync+0xb/0xf [<ffffffff81468352>] system_call_fastpath+0x16/0x1b Sometimes it causes BUG_ON() in the reservation code of the delayed inode is triggered. So we must reserve enough space for inode cache. Note: If we can not reserve the enough space for inode cache, we will give up writing out it. Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-11-11 08:45:04 +07:00
* 1 item for free space object
* 3 items for pre-allocation
*/
trans->bytes_reserved = btrfs_calc_insert_metadata_size(fs_info, 10);
ret = btrfs_block_rsv_add(root, trans->block_rsv,
trans->bytes_reserved,
BTRFS_RESERVE_NO_FLUSH);
Btrfs: fix no reserved space for writing out inode cache I-node cache forgets to reserve the space when writing out it. And when we do some stress test, such as synctest, it will trigger WARN_ON() in use_block_rsv(). WARNING: at fs/btrfs/extent-tree.c:5718 btrfs_alloc_free_block+0xbf/0x281 [btrfs]() ... Call Trace: [<ffffffff8104df86>] warn_slowpath_common+0x80/0x98 [<ffffffff8104dfb3>] warn_slowpath_null+0x15/0x17 [<ffffffffa0369c60>] btrfs_alloc_free_block+0xbf/0x281 [btrfs] [<ffffffff810cbcb8>] ? __set_page_dirty_nobuffers+0xfe/0x108 [<ffffffffa035c040>] __btrfs_cow_block+0x118/0x3b5 [btrfs] [<ffffffffa035c7ba>] btrfs_cow_block+0x103/0x14e [btrfs] [<ffffffffa035e4c4>] btrfs_search_slot+0x249/0x6a4 [btrfs] [<ffffffffa036d086>] btrfs_lookup_inode+0x2a/0x8a [btrfs] [<ffffffffa03788b7>] btrfs_update_inode+0xaa/0x141 [btrfs] [<ffffffffa036d7ec>] btrfs_save_ino_cache+0xea/0x202 [btrfs] [<ffffffffa03a761e>] ? btrfs_update_reloc_root+0x17e/0x197 [btrfs] [<ffffffffa0373867>] commit_fs_roots+0xaa/0x158 [btrfs] [<ffffffffa03746a6>] btrfs_commit_transaction+0x405/0x731 [btrfs] [<ffffffff810690df>] ? wake_up_bit+0x25/0x25 [<ffffffffa039d652>] ? btrfs_log_dentry_safe+0x43/0x51 [btrfs] [<ffffffffa0381c5f>] btrfs_sync_file+0x16a/0x198 [btrfs] [<ffffffff81122806>] ? mntput+0x21/0x23 [<ffffffff8112d150>] vfs_fsync_range+0x18/0x21 [<ffffffff8112d170>] vfs_fsync+0x17/0x19 [<ffffffff8112d316>] do_fsync+0x29/0x3e [<ffffffff8112d348>] sys_fsync+0xb/0xf [<ffffffff81468352>] system_call_fastpath+0x16/0x1b Sometimes it causes BUG_ON() in the reservation code of the delayed inode is triggered. So we must reserve enough space for inode cache. Note: If we can not reserve the enough space for inode cache, we will give up writing out it. Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-11-11 08:45:04 +07:00
if (ret)
goto out;
trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid,
trans->bytes_reserved, 1);
again:
inode = lookup_free_ino_inode(root, path);
if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) {
ret = PTR_ERR(inode);
Btrfs: fix no reserved space for writing out inode cache I-node cache forgets to reserve the space when writing out it. And when we do some stress test, such as synctest, it will trigger WARN_ON() in use_block_rsv(). WARNING: at fs/btrfs/extent-tree.c:5718 btrfs_alloc_free_block+0xbf/0x281 [btrfs]() ... Call Trace: [<ffffffff8104df86>] warn_slowpath_common+0x80/0x98 [<ffffffff8104dfb3>] warn_slowpath_null+0x15/0x17 [<ffffffffa0369c60>] btrfs_alloc_free_block+0xbf/0x281 [btrfs] [<ffffffff810cbcb8>] ? __set_page_dirty_nobuffers+0xfe/0x108 [<ffffffffa035c040>] __btrfs_cow_block+0x118/0x3b5 [btrfs] [<ffffffffa035c7ba>] btrfs_cow_block+0x103/0x14e [btrfs] [<ffffffffa035e4c4>] btrfs_search_slot+0x249/0x6a4 [btrfs] [<ffffffffa036d086>] btrfs_lookup_inode+0x2a/0x8a [btrfs] [<ffffffffa03788b7>] btrfs_update_inode+0xaa/0x141 [btrfs] [<ffffffffa036d7ec>] btrfs_save_ino_cache+0xea/0x202 [btrfs] [<ffffffffa03a761e>] ? btrfs_update_reloc_root+0x17e/0x197 [btrfs] [<ffffffffa0373867>] commit_fs_roots+0xaa/0x158 [btrfs] [<ffffffffa03746a6>] btrfs_commit_transaction+0x405/0x731 [btrfs] [<ffffffff810690df>] ? wake_up_bit+0x25/0x25 [<ffffffffa039d652>] ? btrfs_log_dentry_safe+0x43/0x51 [btrfs] [<ffffffffa0381c5f>] btrfs_sync_file+0x16a/0x198 [btrfs] [<ffffffff81122806>] ? mntput+0x21/0x23 [<ffffffff8112d150>] vfs_fsync_range+0x18/0x21 [<ffffffff8112d170>] vfs_fsync+0x17/0x19 [<ffffffff8112d316>] do_fsync+0x29/0x3e [<ffffffff8112d348>] sys_fsync+0xb/0xf [<ffffffff81468352>] system_call_fastpath+0x16/0x1b Sometimes it causes BUG_ON() in the reservation code of the delayed inode is triggered. So we must reserve enough space for inode cache. Note: If we can not reserve the enough space for inode cache, we will give up writing out it. Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-11-11 08:45:04 +07:00
goto out_release;
}
if (IS_ERR(inode)) {
BUG_ON(retry); /* Logic error */
retry = true;
ret = create_free_ino_inode(root, trans, path);
if (ret)
Btrfs: fix no reserved space for writing out inode cache I-node cache forgets to reserve the space when writing out it. And when we do some stress test, such as synctest, it will trigger WARN_ON() in use_block_rsv(). WARNING: at fs/btrfs/extent-tree.c:5718 btrfs_alloc_free_block+0xbf/0x281 [btrfs]() ... Call Trace: [<ffffffff8104df86>] warn_slowpath_common+0x80/0x98 [<ffffffff8104dfb3>] warn_slowpath_null+0x15/0x17 [<ffffffffa0369c60>] btrfs_alloc_free_block+0xbf/0x281 [btrfs] [<ffffffff810cbcb8>] ? __set_page_dirty_nobuffers+0xfe/0x108 [<ffffffffa035c040>] __btrfs_cow_block+0x118/0x3b5 [btrfs] [<ffffffffa035c7ba>] btrfs_cow_block+0x103/0x14e [btrfs] [<ffffffffa035e4c4>] btrfs_search_slot+0x249/0x6a4 [btrfs] [<ffffffffa036d086>] btrfs_lookup_inode+0x2a/0x8a [btrfs] [<ffffffffa03788b7>] btrfs_update_inode+0xaa/0x141 [btrfs] [<ffffffffa036d7ec>] btrfs_save_ino_cache+0xea/0x202 [btrfs] [<ffffffffa03a761e>] ? btrfs_update_reloc_root+0x17e/0x197 [btrfs] [<ffffffffa0373867>] commit_fs_roots+0xaa/0x158 [btrfs] [<ffffffffa03746a6>] btrfs_commit_transaction+0x405/0x731 [btrfs] [<ffffffff810690df>] ? wake_up_bit+0x25/0x25 [<ffffffffa039d652>] ? btrfs_log_dentry_safe+0x43/0x51 [btrfs] [<ffffffffa0381c5f>] btrfs_sync_file+0x16a/0x198 [btrfs] [<ffffffff81122806>] ? mntput+0x21/0x23 [<ffffffff8112d150>] vfs_fsync_range+0x18/0x21 [<ffffffff8112d170>] vfs_fsync+0x17/0x19 [<ffffffff8112d316>] do_fsync+0x29/0x3e [<ffffffff8112d348>] sys_fsync+0xb/0xf [<ffffffff81468352>] system_call_fastpath+0x16/0x1b Sometimes it causes BUG_ON() in the reservation code of the delayed inode is triggered. So we must reserve enough space for inode cache. Note: If we can not reserve the enough space for inode cache, we will give up writing out it. Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-11-11 08:45:04 +07:00
goto out_release;
goto again;
}
BTRFS_I(inode)->generation = 0;
ret = btrfs_update_inode(trans, root, inode);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_put;
}
if (i_size_read(inode) > 0) {
ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
if (ret) {
if (ret != -ENOSPC)
btrfs_abort_transaction(trans, ret);
goto out_put;
}
}
spin_lock(&root->ino_cache_lock);
if (root->ino_cache_state != BTRFS_CACHE_FINISHED) {
ret = -1;
spin_unlock(&root->ino_cache_lock);
goto out_put;
}
spin_unlock(&root->ino_cache_lock);
spin_lock(&ctl->tree_lock);
prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 19:29:47 +07:00
prealloc = ALIGN(prealloc, PAGE_SIZE);
prealloc += ctl->total_bitmaps * PAGE_SIZE;
spin_unlock(&ctl->tree_lock);
/* Just to make sure we have enough space */
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 19:29:47 +07:00
prealloc += 8 * PAGE_SIZE;
ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 0, prealloc);
if (ret)
goto out_put;
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
prealloc, prealloc, &alloc_hint);
if (ret) {
btrfs: qgroup: Always free PREALLOC META reserve in btrfs_delalloc_release_extents() [Background] Btrfs qgroup uses two types of reserved space for METADATA space, PERTRANS and PREALLOC. PERTRANS is metadata space reserved for each transaction started by btrfs_start_transaction(). While PREALLOC is for delalloc, where we reserve space before joining a transaction, and finally it will be converted to PERTRANS after the writeback is done. [Inconsistency] However there is inconsistency in how we handle PREALLOC metadata space. The most obvious one is: In btrfs_buffered_write(): btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes, true); We always free qgroup PREALLOC meta space. While in btrfs_truncate_block(): btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0)); We only free qgroup PREALLOC meta space when something went wrong. [The Correct Behavior] The correct behavior should be the one in btrfs_buffered_write(), we should always free PREALLOC metadata space. The reason is, the btrfs_delalloc_* mechanism works by: - Reserve metadata first, even it's not necessary In btrfs_delalloc_reserve_metadata() - Free the unused metadata space Normally in: btrfs_delalloc_release_extents() |- btrfs_inode_rsv_release() Here we do calculation on whether we should release or not. E.g. for 64K buffered write, the metadata rsv works like: /* The first page */ reserve_meta: num_bytes=calc_inode_reservations() free_meta: num_bytes=0 total: num_bytes=calc_inode_reservations() /* The first page caused one outstanding extent, thus needs metadata rsv */ /* The 2nd page */ reserve_meta: num_bytes=calc_inode_reservations() free_meta: num_bytes=calc_inode_reservations() total: not changed /* The 2nd page doesn't cause new outstanding extent, needs no new meta rsv, so we free what we have reserved */ /* The 3rd~16th pages */ reserve_meta: num_bytes=calc_inode_reservations() free_meta: num_bytes=calc_inode_reservations() total: not changed (still space for one outstanding extent) This means, if btrfs_delalloc_release_extents() determines to free some space, then those space should be freed NOW. So for qgroup, we should call btrfs_qgroup_free_meta_prealloc() other than btrfs_qgroup_convert_reserved_meta(). The good news is: - The callers are not that hot The hottest caller is in btrfs_buffered_write(), which is already fixed by commit 336a8bb8e36a ("btrfs: Fix wrong btrfs_delalloc_release_extents parameter"). Thus it's not that easy to cause false EDQUOT. - The trans commit in advance for qgroup would hide the bug Since commit f5fef4593653 ("btrfs: qgroup: Make qgroup async transaction commit more aggressive"), when btrfs qgroup metadata free space is slow, it will try to commit transaction and free the wrongly converted PERTRANS space, so it's not that easy to hit such bug. [FIX] So to fix the problem, remove the @qgroup_free parameter for btrfs_delalloc_release_extents(), and always pass true to btrfs_inode_rsv_release(). Reported-by: Filipe Manana <fdmanana@suse.com> Fixes: 43b18595d660 ("btrfs: qgroup: Use separate meta reservation type for delalloc") CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-10-14 13:34:51 +07:00
btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
Btrfs: fix inode cache block reserve leak on failure to allocate data space If we failed to allocate the data extent(s) for the inode space cache, we were bailing out without releasing the previously reserved metadata. This was triggering the following warnings when unmounting a filesystem: $ cat -n fs/btrfs/inode.c (...) 9268 void btrfs_destroy_inode(struct inode *inode) 9269 { (...) 9276 WARN_ON(BTRFS_I(inode)->block_rsv.reserved); 9277 WARN_ON(BTRFS_I(inode)->block_rsv.size); (...) 9281 WARN_ON(BTRFS_I(inode)->csum_bytes); 9282 WARN_ON(BTRFS_I(inode)->defrag_bytes); (...) Several fstests test cases triggered this often, such as generic/083, generic/102, generic/172, generic/269 and generic/300 at least, producing stack traces like the following in dmesg/syslog: [82039.079546] WARNING: CPU: 2 PID: 13167 at fs/btrfs/inode.c:9276 btrfs_destroy_inode+0x203/0x270 [btrfs] (...) [82039.081543] CPU: 2 PID: 13167 Comm: umount Tainted: G W 5.2.0-rc4-btrfs-next-50 #1 [82039.081912] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 [82039.082673] RIP: 0010:btrfs_destroy_inode+0x203/0x270 [btrfs] (...) [82039.083913] RSP: 0018:ffffac0b426a7d30 EFLAGS: 00010206 [82039.084320] RAX: ffff8ddf77691158 RBX: ffff8dde29b34660 RCX: 0000000000000002 [82039.084736] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff8dde29b34660 [82039.085156] RBP: ffff8ddf5fbec000 R08: 0000000000000000 R09: 0000000000000000 [82039.085578] R10: ffffac0b426a7c90 R11: ffffffffb9aad768 R12: ffffac0b426a7db0 [82039.086000] R13: ffff8ddf5fbec0a0 R14: dead000000000100 R15: 0000000000000000 [82039.086416] FS: 00007f8db96d12c0(0000) GS:ffff8de036b00000(0000) knlGS:0000000000000000 [82039.086837] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [82039.087253] CR2: 0000000001416108 CR3: 00000002315cc001 CR4: 00000000003606e0 [82039.087672] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [82039.088089] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [82039.088504] Call Trace: [82039.088918] destroy_inode+0x3b/0x70 [82039.089340] btrfs_free_fs_root+0x16/0xa0 [btrfs] [82039.089768] btrfs_free_fs_roots+0xd8/0x160 [btrfs] [82039.090183] ? wait_for_completion+0x65/0x1a0 [82039.090607] close_ctree+0x172/0x370 [btrfs] [82039.091021] generic_shutdown_super+0x6c/0x110 [82039.091427] kill_anon_super+0xe/0x30 [82039.091832] btrfs_kill_super+0x12/0xa0 [btrfs] [82039.092233] deactivate_locked_super+0x3a/0x70 [82039.092636] cleanup_mnt+0x3b/0x80 [82039.093039] task_work_run+0x93/0xc0 [82039.093457] exit_to_usermode_loop+0xfa/0x100 [82039.093856] do_syscall_64+0x162/0x1d0 [82039.094244] entry_SYSCALL_64_after_hwframe+0x49/0xbe [82039.094634] RIP: 0033:0x7f8db8fbab37 (...) [82039.095876] RSP: 002b:00007ffdce35b468 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [82039.096290] RAX: 0000000000000000 RBX: 0000560d20b00060 RCX: 00007f8db8fbab37 [82039.096700] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000560d20b00240 [82039.097110] RBP: 0000560d20b00240 R08: 0000560d20b00270 R09: 0000000000000015 [82039.097522] R10: 00000000000006b4 R11: 0000000000000246 R12: 00007f8db94bce64 [82039.097937] R13: 0000000000000000 R14: 0000000000000000 R15: 00007ffdce35b6f0 [82039.098350] irq event stamp: 0 [82039.098750] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [82039.099150] hardirqs last disabled at (0): [<ffffffffb7884ff2>] copy_process.part.33+0x7f2/0x1f00 [82039.099545] softirqs last enabled at (0): [<ffffffffb7884ff2>] copy_process.part.33+0x7f2/0x1f00 [82039.099925] softirqs last disabled at (0): [<0000000000000000>] 0x0 [82039.100292] ---[ end trace f2521afa616ddccc ]--- [82039.100707] WARNING: CPU: 2 PID: 13167 at fs/btrfs/inode.c:9277 btrfs_destroy_inode+0x1ac/0x270 [btrfs] (...) [82039.103050] CPU: 2 PID: 13167 Comm: umount Tainted: G W 5.2.0-rc4-btrfs-next-50 #1 [82039.103428] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 [82039.104203] RIP: 0010:btrfs_destroy_inode+0x1ac/0x270 [btrfs] (...) [82039.105461] RSP: 0018:ffffac0b426a7d30 EFLAGS: 00010206 [82039.105866] RAX: ffff8ddf77691158 RBX: ffff8dde29b34660 RCX: 0000000000000002 [82039.106270] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff8dde29b34660 [82039.106673] RBP: ffff8ddf5fbec000 R08: 0000000000000000 R09: 0000000000000000 [82039.107078] R10: ffffac0b426a7c90 R11: ffffffffb9aad768 R12: ffffac0b426a7db0 [82039.107487] R13: ffff8ddf5fbec0a0 R14: dead000000000100 R15: 0000000000000000 [82039.107894] FS: 00007f8db96d12c0(0000) GS:ffff8de036b00000(0000) knlGS:0000000000000000 [82039.108309] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [82039.108723] CR2: 0000000001416108 CR3: 00000002315cc001 CR4: 00000000003606e0 [82039.109146] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [82039.109567] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [82039.109989] Call Trace: [82039.110405] destroy_inode+0x3b/0x70 [82039.110830] btrfs_free_fs_root+0x16/0xa0 [btrfs] [82039.111257] btrfs_free_fs_roots+0xd8/0x160 [btrfs] [82039.111675] ? wait_for_completion+0x65/0x1a0 [82039.112101] close_ctree+0x172/0x370 [btrfs] [82039.112519] generic_shutdown_super+0x6c/0x110 [82039.112988] kill_anon_super+0xe/0x30 [82039.113439] btrfs_kill_super+0x12/0xa0 [btrfs] [82039.113861] deactivate_locked_super+0x3a/0x70 [82039.114278] cleanup_mnt+0x3b/0x80 [82039.114685] task_work_run+0x93/0xc0 [82039.115083] exit_to_usermode_loop+0xfa/0x100 [82039.115476] do_syscall_64+0x162/0x1d0 [82039.115863] entry_SYSCALL_64_after_hwframe+0x49/0xbe [82039.116254] RIP: 0033:0x7f8db8fbab37 (...) [82039.117463] RSP: 002b:00007ffdce35b468 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [82039.117882] RAX: 0000000000000000 RBX: 0000560d20b00060 RCX: 00007f8db8fbab37 [82039.118330] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000560d20b00240 [82039.118743] RBP: 0000560d20b00240 R08: 0000560d20b00270 R09: 0000000000000015 [82039.119159] R10: 00000000000006b4 R11: 0000000000000246 R12: 00007f8db94bce64 [82039.119574] R13: 0000000000000000 R14: 0000000000000000 R15: 00007ffdce35b6f0 [82039.119987] irq event stamp: 0 [82039.120387] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [82039.120787] hardirqs last disabled at (0): [<ffffffffb7884ff2>] copy_process.part.33+0x7f2/0x1f00 [82039.121182] softirqs last enabled at (0): [<ffffffffb7884ff2>] copy_process.part.33+0x7f2/0x1f00 [82039.121563] softirqs last disabled at (0): [<0000000000000000>] 0x0 [82039.121933] ---[ end trace f2521afa616ddccd ]--- [82039.122353] WARNING: CPU: 2 PID: 13167 at fs/btrfs/inode.c:9278 btrfs_destroy_inode+0x1bc/0x270 [btrfs] (...) [82039.124606] CPU: 2 PID: 13167 Comm: umount Tainted: G W 5.2.0-rc4-btrfs-next-50 #1 [82039.125008] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 [82039.125801] RIP: 0010:btrfs_destroy_inode+0x1bc/0x270 [btrfs] (...) [82039.126998] RSP: 0018:ffffac0b426a7d30 EFLAGS: 00010202 [82039.127399] RAX: ffff8ddf77691158 RBX: ffff8dde29b34660 RCX: 0000000000000002 [82039.127803] RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffff8dde29b34660 [82039.128206] RBP: ffff8ddf5fbec000 R08: 0000000000000000 R09: 0000000000000000 [82039.128611] R10: ffffac0b426a7c90 R11: ffffffffb9aad768 R12: ffffac0b426a7db0 [82039.129020] R13: ffff8ddf5fbec0a0 R14: dead000000000100 R15: 0000000000000000 [82039.129428] FS: 00007f8db96d12c0(0000) GS:ffff8de036b00000(0000) knlGS:0000000000000000 [82039.129846] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [82039.130261] CR2: 0000000001416108 CR3: 00000002315cc001 CR4: 00000000003606e0 [82039.130684] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [82039.131142] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [82039.131561] Call Trace: [82039.131990] destroy_inode+0x3b/0x70 [82039.132417] btrfs_free_fs_root+0x16/0xa0 [btrfs] [82039.132844] btrfs_free_fs_roots+0xd8/0x160 [btrfs] [82039.133262] ? wait_for_completion+0x65/0x1a0 [82039.133688] close_ctree+0x172/0x370 [btrfs] [82039.134157] generic_shutdown_super+0x6c/0x110 [82039.134575] kill_anon_super+0xe/0x30 [82039.134997] btrfs_kill_super+0x12/0xa0 [btrfs] [82039.135415] deactivate_locked_super+0x3a/0x70 [82039.135832] cleanup_mnt+0x3b/0x80 [82039.136239] task_work_run+0x93/0xc0 [82039.136637] exit_to_usermode_loop+0xfa/0x100 [82039.137029] do_syscall_64+0x162/0x1d0 [82039.137418] entry_SYSCALL_64_after_hwframe+0x49/0xbe [82039.137812] RIP: 0033:0x7f8db8fbab37 (...) [82039.139059] RSP: 002b:00007ffdce35b468 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [82039.139475] RAX: 0000000000000000 RBX: 0000560d20b00060 RCX: 00007f8db8fbab37 [82039.139890] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000560d20b00240 [82039.140302] RBP: 0000560d20b00240 R08: 0000560d20b00270 R09: 0000000000000015 [82039.140719] R10: 00000000000006b4 R11: 0000000000000246 R12: 00007f8db94bce64 [82039.141138] R13: 0000000000000000 R14: 0000000000000000 R15: 00007ffdce35b6f0 [82039.141597] irq event stamp: 0 [82039.142043] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [82039.142443] hardirqs last disabled at (0): [<ffffffffb7884ff2>] copy_process.part.33+0x7f2/0x1f00 [82039.142839] softirqs last enabled at (0): [<ffffffffb7884ff2>] copy_process.part.33+0x7f2/0x1f00 [82039.143220] softirqs last disabled at (0): [<0000000000000000>] 0x0 [82039.143588] ---[ end trace f2521afa616ddcce ]--- [82039.167472] WARNING: CPU: 3 PID: 13167 at fs/btrfs/extent-tree.c:10120 btrfs_free_block_groups+0x30d/0x460 [btrfs] (...) [82039.173800] CPU: 3 PID: 13167 Comm: umount Tainted: G W 5.2.0-rc4-btrfs-next-50 #1 [82039.174847] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 [82039.177031] RIP: 0010:btrfs_free_block_groups+0x30d/0x460 [btrfs] (...) [82039.180397] RSP: 0018:ffffac0b426a7dd8 EFLAGS: 00010206 [82039.181574] RAX: ffff8de010a1db40 RBX: ffff8de010a1db40 RCX: 0000000000170014 [82039.182711] RDX: ffff8ddff4380040 RSI: ffff8de010a1da58 RDI: 0000000000000246 [82039.183817] RBP: ffff8ddf5fbec000 R08: 0000000000000000 R09: 0000000000000000 [82039.184925] R10: ffff8de036404380 R11: ffffffffb8a5ea00 R12: ffff8de010a1b2b8 [82039.186090] R13: ffff8de010a1b2b8 R14: 0000000000000000 R15: dead000000000100 [82039.187208] FS: 00007f8db96d12c0(0000) GS:ffff8de036b80000(0000) knlGS:0000000000000000 [82039.188345] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [82039.189481] CR2: 00007fb044005170 CR3: 00000002315cc006 CR4: 00000000003606e0 [82039.190674] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [82039.191829] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [82039.192978] Call Trace: [82039.194160] close_ctree+0x19a/0x370 [btrfs] [82039.195315] generic_shutdown_super+0x6c/0x110 [82039.196486] kill_anon_super+0xe/0x30 [82039.197645] btrfs_kill_super+0x12/0xa0 [btrfs] [82039.198696] deactivate_locked_super+0x3a/0x70 [82039.199619] cleanup_mnt+0x3b/0x80 [82039.200559] task_work_run+0x93/0xc0 [82039.201505] exit_to_usermode_loop+0xfa/0x100 [82039.202436] do_syscall_64+0x162/0x1d0 [82039.203339] entry_SYSCALL_64_after_hwframe+0x49/0xbe [82039.204091] RIP: 0033:0x7f8db8fbab37 (...) [82039.206360] RSP: 002b:00007ffdce35b468 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [82039.207132] RAX: 0000000000000000 RBX: 0000560d20b00060 RCX: 00007f8db8fbab37 [82039.207906] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000560d20b00240 [82039.208621] RBP: 0000560d20b00240 R08: 0000560d20b00270 R09: 0000000000000015 [82039.209285] R10: 00000000000006b4 R11: 0000000000000246 R12: 00007f8db94bce64 [82039.209984] R13: 0000000000000000 R14: 0000000000000000 R15: 00007ffdce35b6f0 [82039.210642] irq event stamp: 0 [82039.211306] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [82039.211971] hardirqs last disabled at (0): [<ffffffffb7884ff2>] copy_process.part.33+0x7f2/0x1f00 [82039.212643] softirqs last enabled at (0): [<ffffffffb7884ff2>] copy_process.part.33+0x7f2/0x1f00 [82039.213304] softirqs last disabled at (0): [<0000000000000000>] 0x0 [82039.213875] ---[ end trace f2521afa616ddccf ]--- Fix this by releasing the reserved metadata on failure to allocate data extent(s) for the inode cache. Fixes: 69fe2d75dd91d0 ("btrfs: make the delalloc block rsv per inode") Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-07-04 22:24:19 +07:00
btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc, true);
goto out_put;
}
ret = btrfs_write_out_ino_cache(root, trans, path, inode);
btrfs: qgroup: Always free PREALLOC META reserve in btrfs_delalloc_release_extents() [Background] Btrfs qgroup uses two types of reserved space for METADATA space, PERTRANS and PREALLOC. PERTRANS is metadata space reserved for each transaction started by btrfs_start_transaction(). While PREALLOC is for delalloc, where we reserve space before joining a transaction, and finally it will be converted to PERTRANS after the writeback is done. [Inconsistency] However there is inconsistency in how we handle PREALLOC metadata space. The most obvious one is: In btrfs_buffered_write(): btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes, true); We always free qgroup PREALLOC meta space. While in btrfs_truncate_block(): btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0)); We only free qgroup PREALLOC meta space when something went wrong. [The Correct Behavior] The correct behavior should be the one in btrfs_buffered_write(), we should always free PREALLOC metadata space. The reason is, the btrfs_delalloc_* mechanism works by: - Reserve metadata first, even it's not necessary In btrfs_delalloc_reserve_metadata() - Free the unused metadata space Normally in: btrfs_delalloc_release_extents() |- btrfs_inode_rsv_release() Here we do calculation on whether we should release or not. E.g. for 64K buffered write, the metadata rsv works like: /* The first page */ reserve_meta: num_bytes=calc_inode_reservations() free_meta: num_bytes=0 total: num_bytes=calc_inode_reservations() /* The first page caused one outstanding extent, thus needs metadata rsv */ /* The 2nd page */ reserve_meta: num_bytes=calc_inode_reservations() free_meta: num_bytes=calc_inode_reservations() total: not changed /* The 2nd page doesn't cause new outstanding extent, needs no new meta rsv, so we free what we have reserved */ /* The 3rd~16th pages */ reserve_meta: num_bytes=calc_inode_reservations() free_meta: num_bytes=calc_inode_reservations() total: not changed (still space for one outstanding extent) This means, if btrfs_delalloc_release_extents() determines to free some space, then those space should be freed NOW. So for qgroup, we should call btrfs_qgroup_free_meta_prealloc() other than btrfs_qgroup_convert_reserved_meta(). The good news is: - The callers are not that hot The hottest caller is in btrfs_buffered_write(), which is already fixed by commit 336a8bb8e36a ("btrfs: Fix wrong btrfs_delalloc_release_extents parameter"). Thus it's not that easy to cause false EDQUOT. - The trans commit in advance for qgroup would hide the bug Since commit f5fef4593653 ("btrfs: qgroup: Make qgroup async transaction commit more aggressive"), when btrfs qgroup metadata free space is slow, it will try to commit transaction and free the wrongly converted PERTRANS space, so it's not that easy to hit such bug. [FIX] So to fix the problem, remove the @qgroup_free parameter for btrfs_delalloc_release_extents(), and always pass true to btrfs_inode_rsv_release(). Reported-by: Filipe Manana <fdmanana@suse.com> Fixes: 43b18595d660 ("btrfs: qgroup: Use separate meta reservation type for delalloc") CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-10-14 13:34:51 +07:00
btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
out_put:
iput(inode);
Btrfs: fix no reserved space for writing out inode cache I-node cache forgets to reserve the space when writing out it. And when we do some stress test, such as synctest, it will trigger WARN_ON() in use_block_rsv(). WARNING: at fs/btrfs/extent-tree.c:5718 btrfs_alloc_free_block+0xbf/0x281 [btrfs]() ... Call Trace: [<ffffffff8104df86>] warn_slowpath_common+0x80/0x98 [<ffffffff8104dfb3>] warn_slowpath_null+0x15/0x17 [<ffffffffa0369c60>] btrfs_alloc_free_block+0xbf/0x281 [btrfs] [<ffffffff810cbcb8>] ? __set_page_dirty_nobuffers+0xfe/0x108 [<ffffffffa035c040>] __btrfs_cow_block+0x118/0x3b5 [btrfs] [<ffffffffa035c7ba>] btrfs_cow_block+0x103/0x14e [btrfs] [<ffffffffa035e4c4>] btrfs_search_slot+0x249/0x6a4 [btrfs] [<ffffffffa036d086>] btrfs_lookup_inode+0x2a/0x8a [btrfs] [<ffffffffa03788b7>] btrfs_update_inode+0xaa/0x141 [btrfs] [<ffffffffa036d7ec>] btrfs_save_ino_cache+0xea/0x202 [btrfs] [<ffffffffa03a761e>] ? btrfs_update_reloc_root+0x17e/0x197 [btrfs] [<ffffffffa0373867>] commit_fs_roots+0xaa/0x158 [btrfs] [<ffffffffa03746a6>] btrfs_commit_transaction+0x405/0x731 [btrfs] [<ffffffff810690df>] ? wake_up_bit+0x25/0x25 [<ffffffffa039d652>] ? btrfs_log_dentry_safe+0x43/0x51 [btrfs] [<ffffffffa0381c5f>] btrfs_sync_file+0x16a/0x198 [btrfs] [<ffffffff81122806>] ? mntput+0x21/0x23 [<ffffffff8112d150>] vfs_fsync_range+0x18/0x21 [<ffffffff8112d170>] vfs_fsync+0x17/0x19 [<ffffffff8112d316>] do_fsync+0x29/0x3e [<ffffffff8112d348>] sys_fsync+0xb/0xf [<ffffffff81468352>] system_call_fastpath+0x16/0x1b Sometimes it causes BUG_ON() in the reservation code of the delayed inode is triggered. So we must reserve enough space for inode cache. Note: If we can not reserve the enough space for inode cache, we will give up writing out it. Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-11-11 08:45:04 +07:00
out_release:
trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid,
trans->bytes_reserved, 0);
btrfs_block_rsv_release(fs_info, trans->block_rsv,
trans->bytes_reserved, NULL);
out:
Btrfs: fix no reserved space for writing out inode cache I-node cache forgets to reserve the space when writing out it. And when we do some stress test, such as synctest, it will trigger WARN_ON() in use_block_rsv(). WARNING: at fs/btrfs/extent-tree.c:5718 btrfs_alloc_free_block+0xbf/0x281 [btrfs]() ... Call Trace: [<ffffffff8104df86>] warn_slowpath_common+0x80/0x98 [<ffffffff8104dfb3>] warn_slowpath_null+0x15/0x17 [<ffffffffa0369c60>] btrfs_alloc_free_block+0xbf/0x281 [btrfs] [<ffffffff810cbcb8>] ? __set_page_dirty_nobuffers+0xfe/0x108 [<ffffffffa035c040>] __btrfs_cow_block+0x118/0x3b5 [btrfs] [<ffffffffa035c7ba>] btrfs_cow_block+0x103/0x14e [btrfs] [<ffffffffa035e4c4>] btrfs_search_slot+0x249/0x6a4 [btrfs] [<ffffffffa036d086>] btrfs_lookup_inode+0x2a/0x8a [btrfs] [<ffffffffa03788b7>] btrfs_update_inode+0xaa/0x141 [btrfs] [<ffffffffa036d7ec>] btrfs_save_ino_cache+0xea/0x202 [btrfs] [<ffffffffa03a761e>] ? btrfs_update_reloc_root+0x17e/0x197 [btrfs] [<ffffffffa0373867>] commit_fs_roots+0xaa/0x158 [btrfs] [<ffffffffa03746a6>] btrfs_commit_transaction+0x405/0x731 [btrfs] [<ffffffff810690df>] ? wake_up_bit+0x25/0x25 [<ffffffffa039d652>] ? btrfs_log_dentry_safe+0x43/0x51 [btrfs] [<ffffffffa0381c5f>] btrfs_sync_file+0x16a/0x198 [btrfs] [<ffffffff81122806>] ? mntput+0x21/0x23 [<ffffffff8112d150>] vfs_fsync_range+0x18/0x21 [<ffffffff8112d170>] vfs_fsync+0x17/0x19 [<ffffffff8112d316>] do_fsync+0x29/0x3e [<ffffffff8112d348>] sys_fsync+0xb/0xf [<ffffffff81468352>] system_call_fastpath+0x16/0x1b Sometimes it causes BUG_ON() in the reservation code of the delayed inode is triggered. So we must reserve enough space for inode cache. Note: If we can not reserve the enough space for inode cache, we will give up writing out it. Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-11-11 08:45:04 +07:00
trans->block_rsv = rsv;
trans->bytes_reserved = num_bytes;
btrfs_free_path(path);
extent_changeset_free(data_reserved);
return ret;
}
Btrfs: Initialize btrfs_root->highest_objectid when loading tree root and subvolume roots The following call trace is seen when btrfs/031 test is executed in a loop, [ 158.661848] ------------[ cut here ]------------ [ 158.662634] WARNING: CPU: 2 PID: 890 at /home/chandan/repos/linux/fs/btrfs/ioctl.c:558 create_subvol+0x3d1/0x6ea() [ 158.664102] BTRFS: Transaction aborted (error -2) [ 158.664774] Modules linked in: [ 158.665266] CPU: 2 PID: 890 Comm: btrfs Not tainted 4.4.0-rc6-g511711a #2 [ 158.666251] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 [ 158.667392] ffffffff81c0a6b0 ffff8806c7c4f8e8 ffffffff81431fc8 ffff8806c7c4f930 [ 158.668515] ffff8806c7c4f920 ffffffff81051aa1 ffff880c85aff000 ffff8800bb44d000 [ 158.669647] ffff8808863b5c98 0000000000000000 00000000fffffffe ffff8806c7c4f980 [ 158.670769] Call Trace: [ 158.671153] [<ffffffff81431fc8>] dump_stack+0x44/0x5c [ 158.671884] [<ffffffff81051aa1>] warn_slowpath_common+0x81/0xc0 [ 158.672769] [<ffffffff81051b27>] warn_slowpath_fmt+0x47/0x50 [ 158.673620] [<ffffffff813bc98d>] create_subvol+0x3d1/0x6ea [ 158.674440] [<ffffffff813777c9>] btrfs_mksubvol.isra.30+0x369/0x520 [ 158.675376] [<ffffffff8108a4aa>] ? percpu_down_read+0x1a/0x50 [ 158.676235] [<ffffffff81377a81>] btrfs_ioctl_snap_create_transid+0x101/0x180 [ 158.677268] [<ffffffff81377b52>] btrfs_ioctl_snap_create+0x52/0x70 [ 158.678183] [<ffffffff8137afb4>] btrfs_ioctl+0x474/0x2f90 [ 158.678975] [<ffffffff81144b8e>] ? vma_merge+0xee/0x300 [ 158.679751] [<ffffffff8115be31>] ? alloc_pages_vma+0x91/0x170 [ 158.680599] [<ffffffff81123f62>] ? lru_cache_add_active_or_unevictable+0x22/0x70 [ 158.681686] [<ffffffff813d99cf>] ? selinux_file_ioctl+0xff/0x1d0 [ 158.682581] [<ffffffff8117b791>] do_vfs_ioctl+0x2c1/0x490 [ 158.683399] [<ffffffff813d3cde>] ? security_file_ioctl+0x3e/0x60 [ 158.684297] [<ffffffff8117b9d4>] SyS_ioctl+0x74/0x80 [ 158.685051] [<ffffffff819b2bd7>] entry_SYSCALL_64_fastpath+0x12/0x6a [ 158.685958] ---[ end trace 4b63312de5a2cb76 ]--- [ 158.686647] BTRFS: error (device loop0) in create_subvol:558: errno=-2 No such entry [ 158.709508] BTRFS info (device loop0): forced readonly [ 158.737113] BTRFS info (device loop0): disk space caching is enabled [ 158.738096] BTRFS error (device loop0): Remounting read-write after error is not allowed [ 158.851303] BTRFS error (device loop0): cleaner transaction attach returned -30 This occurs because, Mount filesystem Create subvol with ID 257 Unmount filesystem Mount filesystem Delete subvol with ID 257 btrfs_drop_snapshot() Add root corresponding to subvol 257 into btrfs_transaction->dropped_roots list Create new subvol (i.e. create_subvol()) 257 is returned as the next free objectid btrfs_read_fs_root_no_name() Finds the btrfs_root instance corresponding to the old subvol with ID 257 in btrfs_fs_info->fs_roots_radix. Returns error since btrfs_root_item->refs has the value of 0. To fix the issue the commit initializes tree root's and subvolume root's highest_objectid when loading the roots from disk. Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com> Signed-off-by: David Sterba <dsterba@suse.com>
2016-01-07 20:26:59 +07:00
int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
{
struct btrfs_path *path;
int ret;
struct extent_buffer *l;
struct btrfs_key search_key;
struct btrfs_key found_key;
int slot;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
search_key.type = -1;
search_key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
goto error;
BUG_ON(ret == 0); /* Corruption */
if (path->slots[0] > 0) {
slot = path->slots[0] - 1;
l = path->nodes[0];
btrfs_item_key_to_cpu(l, &found_key, slot);
*objectid = max_t(u64, found_key.objectid,
BTRFS_FIRST_FREE_OBJECTID - 1);
} else {
*objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
}
ret = 0;
error:
btrfs_free_path(path);
return ret;
}
int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
{
int ret;
mutex_lock(&root->objectid_mutex);
if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
btrfs_warn(root->fs_info,
"the objectid of root %llu reaches its highest value",
root->root_key.objectid);
ret = -ENOSPC;
goto out;
}
*objectid = ++root->highest_objectid;
ret = 0;
out:
mutex_unlock(&root->objectid_mutex);
return ret;
}