linux_dsm_epyc7002/fs/btrfs/inode-map.c
Filipe Manana 55507ce361 Btrfs: fix race between writing free space cache and trimming
Trimming is completely transactionless, and the way it operates consists
of hiding free space entries from a block group, perform the trim/discard
and then make the free space entries visible again.
Therefore while a free space entry is being trimmed, we can have free space
cache writing running in parallel (as part of a transaction commit) which
will miss the free space entry. This means that an unmount (or crash/reboot)
after that transaction commit and mount again before another transaction
starts/commits after the discard finishes, we will have some free space
that won't be used again unless the free space cache is rebuilt. After the
unmount, fsck (btrfsck, btrfs check) reports the issue like the following
example:

        *** fsck.btrfs output ***
        checking extents
        checking free space cache
        There is no free space entry for 521764864-521781248
        There is no free space entry for 521764864-1103101952
        cache appears valid but isnt 29360128
        Checking filesystem on /dev/sdc
        UUID: b4789e27-4774-4626-98e9-ae8dfbfb0fb5
        found 1235681286 bytes used err is -22
        (...)

Another issue caused by this race is a crash while writing bitmap entries
to the cache, because while the cache writeout task accesses the bitmaps,
the trim task can be concurrently modifying the bitmap or worse might
be freeing the bitmap. The later case results in the following crash:

[55650.804460] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC
[55650.804835] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc loop parport_pc parport i2c_piix4 psmouse evdev pcspkr microcode processor i2ccore serio_raw thermal_sys button ext4 crc16 jbd2 mbcache sg sd_mod crc_t10dif sr_mod cdrom crct10dif_generic crct10dif_common ata_generic virtio_scsi floppy ata_piix libata virtio_pci virtio_ring virtio scsi_mod e1000 [last unloaded: btrfs]
[55650.806169] CPU: 1 PID: 31002 Comm: btrfs-transacti Tainted: G        W      3.17.0-rc5-btrfs-next-1+ #1
[55650.806493] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014
[55650.806867] task: ffff8800b12f6410 ti: ffff880071538000 task.ti: ffff880071538000
[55650.807166] RIP: 0010:[<ffffffffa037cf45>]  [<ffffffffa037cf45>] write_bitmap_entries+0x65/0xbb [btrfs]
[55650.807514] RSP: 0018:ffff88007153bc30  EFLAGS: 00010246
[55650.807687] RAX: 000000005d1ec000 RBX: ffff8800a665df08 RCX: 0000000000000400
[55650.807885] RDX: ffff88005d1ec000 RSI: 6b6b6b6b6b6b6b6b RDI: ffff88005d1ec000
[55650.808017] RBP: ffff88007153bc58 R08: 00000000ddd51536 R09: 00000000000001e0
[55650.808017] R10: 0000000000000000 R11: 0000000000000037 R12: 6b6b6b6b6b6b6b6b
[55650.808017] R13: ffff88007153bca8 R14: 6b6b6b6b6b6b6b6b R15: ffff88007153bc98
[55650.808017] FS:  0000000000000000(0000) GS:ffff88023ec80000(0000) knlGS:0000000000000000
[55650.808017] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[55650.808017] CR2: 0000000002273b88 CR3: 00000000b18f6000 CR4: 00000000000006e0
[55650.808017] Stack:
[55650.808017]  ffff88020e834e00 ffff880172d68db0 0000000000000000 ffff88019257c800
[55650.808017]  ffff8801d42ea720 ffff88007153bd10 ffffffffa037d2fa ffff880224e99180
[55650.808017]  ffff8801469a6188 ffff880224e99140 ffff880172d68c50 00000003000000b7
[55650.808017] Call Trace:
[55650.808017]  [<ffffffffa037d2fa>] __btrfs_write_out_cache+0x1ea/0x37f [btrfs]
[55650.808017]  [<ffffffffa037d959>] btrfs_write_out_cache+0xa1/0xd8 [btrfs]
[55650.808017]  [<ffffffffa033936b>] btrfs_write_dirty_block_groups+0x4b5/0x505 [btrfs]
[55650.808017]  [<ffffffffa03aa98e>] commit_cowonly_roots+0x15e/0x1f7 [btrfs]
[55650.808017]  [<ffffffff813eb9c7>] ? _raw_spin_lock+0xe/0x10
[55650.808017]  [<ffffffffa0346e46>] btrfs_commit_transaction+0x411/0x882 [btrfs]
[55650.808017]  [<ffffffffa03432a4>] transaction_kthread+0xf2/0x1a4 [btrfs]
[55650.808017]  [<ffffffffa03431b2>] ? btrfs_cleanup_transaction+0x3d8/0x3d8 [btrfs]
[55650.808017]  [<ffffffff8105966b>] kthread+0xb7/0xbf
[55650.808017]  [<ffffffff810595b4>] ? __kthread_parkme+0x67/0x67
[55650.808017]  [<ffffffff813ebeac>] ret_from_fork+0x7c/0xb0
[55650.808017]  [<ffffffff810595b4>] ? __kthread_parkme+0x67/0x67
[55650.808017] Code: 4c 89 ef 8d 70 ff e8 d4 fc ff ff 41 8b 45 34 41 39 45 30 7d 5c 31 f6 4c 89 ef e8 80 f6 ff ff 49 8b 7d 00 4c 89 f6 b9 00 04 00 00 <f3> a5 4c 89 ef 41 8b 45 30 8d 70 ff e8 a3 fc ff ff 41 8b 45 34
[55650.808017] RIP  [<ffffffffa037cf45>] write_bitmap_entries+0x65/0xbb [btrfs]
[55650.808017]  RSP <ffff88007153bc30>
[55650.815725] ---[ end trace 1c032e96b149ff86 ]---

Fix this by serializing both tasks in such a way that cache writeout
doesn't wait for the trim/discard of free space entries to finish and
doesn't miss any free space entry.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-12-02 18:35:09 -08:00

569 lines
14 KiB
C

/*
* Copyright (C) 2007 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/pagemap.h>
#include "ctree.h"
#include "disk-io.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "transaction.h"
static int caching_kthread(void *data)
{
struct btrfs_root *root = data;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
struct btrfs_key key;
struct btrfs_path *path;
struct extent_buffer *leaf;
u64 last = (u64)-1;
int slot;
int ret;
if (!btrfs_test_opt(root, INODE_MAP_CACHE))
return 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
/* Since the commit root is read-only, we can safely skip locking. */
path->skip_locking = 1;
path->search_commit_root = 1;
path->reada = 2;
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
key.offset = 0;
key.type = BTRFS_INODE_ITEM_KEY;
again:
/* need to make sure the commit_root doesn't disappear */
down_read(&fs_info->commit_root_sem);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
while (1) {
if (btrfs_fs_closing(fs_info))
goto out;
leaf = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto out;
else if (ret > 0)
break;
if (need_resched() ||
btrfs_transaction_in_commit(fs_info)) {
leaf = path->nodes[0];
if (WARN_ON(btrfs_header_nritems(leaf) == 0))
break;
/*
* Save the key so we can advances forward
* in the next search.
*/
btrfs_item_key_to_cpu(leaf, &key, 0);
btrfs_release_path(path);
root->ino_cache_progress = last;
up_read(&fs_info->commit_root_sem);
schedule_timeout(1);
goto again;
} else
continue;
}
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.type != BTRFS_INODE_ITEM_KEY)
goto next;
if (key.objectid >= root->highest_objectid)
break;
if (last != (u64)-1 && last + 1 != key.objectid) {
__btrfs_add_free_space(ctl, last + 1,
key.objectid - last - 1);
wake_up(&root->ino_cache_wait);
}
last = key.objectid;
next:
path->slots[0]++;
}
if (last < root->highest_objectid - 1) {
__btrfs_add_free_space(ctl, last + 1,
root->highest_objectid - last - 1);
}
spin_lock(&root->ino_cache_lock);
root->ino_cache_state = BTRFS_CACHE_FINISHED;
spin_unlock(&root->ino_cache_lock);
root->ino_cache_progress = (u64)-1;
btrfs_unpin_free_ino(root);
out:
wake_up(&root->ino_cache_wait);
up_read(&fs_info->commit_root_sem);
btrfs_free_path(path);
return ret;
}
static void start_caching(struct btrfs_root *root)
{
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
struct task_struct *tsk;
int ret;
u64 objectid;
if (!btrfs_test_opt(root, INODE_MAP_CACHE))
return;
spin_lock(&root->ino_cache_lock);
if (root->ino_cache_state != BTRFS_CACHE_NO) {
spin_unlock(&root->ino_cache_lock);
return;
}
root->ino_cache_state = BTRFS_CACHE_STARTED;
spin_unlock(&root->ino_cache_lock);
ret = load_free_ino_cache(root->fs_info, root);
if (ret == 1) {
spin_lock(&root->ino_cache_lock);
root->ino_cache_state = BTRFS_CACHE_FINISHED;
spin_unlock(&root->ino_cache_lock);
return;
}
/*
* It can be quite time-consuming to fill the cache by searching
* through the extent tree, and this can keep ino allocation path
* waiting. Therefore at start we quickly find out the highest
* inode number and we know we can use inode numbers which fall in
* [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID].
*/
ret = btrfs_find_free_objectid(root, &objectid);
if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) {
__btrfs_add_free_space(ctl, objectid,
BTRFS_LAST_FREE_OBJECTID - objectid + 1);
}
tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu",
root->root_key.objectid);
if (IS_ERR(tsk)) {
btrfs_warn(root->fs_info, "failed to start inode caching task");
btrfs_clear_pending_and_info(root->fs_info, INODE_MAP_CACHE,
"disabling inode map caching");
}
}
int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
{
if (!btrfs_test_opt(root, INODE_MAP_CACHE))
return btrfs_find_free_objectid(root, objectid);
again:
*objectid = btrfs_find_ino_for_alloc(root);
if (*objectid != 0)
return 0;
start_caching(root);
wait_event(root->ino_cache_wait,
root->ino_cache_state == BTRFS_CACHE_FINISHED ||
root->free_ino_ctl->free_space > 0);
if (root->ino_cache_state == BTRFS_CACHE_FINISHED &&
root->free_ino_ctl->free_space == 0)
return -ENOSPC;
else
goto again;
}
void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
{
struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
if (!btrfs_test_opt(root, INODE_MAP_CACHE))
return;
again:
if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
__btrfs_add_free_space(pinned, objectid, 1);
} else {
down_write(&root->fs_info->commit_root_sem);
spin_lock(&root->ino_cache_lock);
if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
spin_unlock(&root->ino_cache_lock);
up_write(&root->fs_info->commit_root_sem);
goto again;
}
spin_unlock(&root->ino_cache_lock);
start_caching(root);
__btrfs_add_free_space(pinned, objectid, 1);
up_write(&root->fs_info->commit_root_sem);
}
}
/*
* When a transaction is committed, we'll move those inode numbers which are
* smaller than root->ino_cache_progress from pinned tree to free_ino tree, and
* others will just be dropped, because the commit root we were searching has
* changed.
*
* Must be called with root->fs_info->commit_root_sem held
*/
void btrfs_unpin_free_ino(struct btrfs_root *root)
{
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset;
struct btrfs_free_space *info;
struct rb_node *n;
u64 count;
if (!btrfs_test_opt(root, INODE_MAP_CACHE))
return;
while (1) {
n = rb_first(rbroot);
if (!n)
break;
info = rb_entry(n, struct btrfs_free_space, offset_index);
BUG_ON(info->bitmap); /* Logic error */
if (info->offset > root->ino_cache_progress)
goto free;
else if (info->offset + info->bytes > root->ino_cache_progress)
count = root->ino_cache_progress - info->offset + 1;
else
count = info->bytes;
__btrfs_add_free_space(ctl, info->offset, count);
free:
rb_erase(&info->offset_index, rbroot);
kfree(info);
}
}
#define INIT_THRESHOLD (((1024 * 32) / 2) / sizeof(struct btrfs_free_space))
#define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8)
/*
* The goal is to keep the memory used by the free_ino tree won't
* exceed the memory if we use bitmaps only.
*/
static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
struct btrfs_free_space *info;
struct rb_node *n;
int max_ino;
int max_bitmaps;
n = rb_last(&ctl->free_space_offset);
if (!n) {
ctl->extents_thresh = INIT_THRESHOLD;
return;
}
info = rb_entry(n, struct btrfs_free_space, offset_index);
/*
* Find the maximum inode number in the filesystem. Note we
* ignore the fact that this can be a bitmap, because we are
* not doing precise calculation.
*/
max_ino = info->bytes - 1;
max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP;
if (max_bitmaps <= ctl->total_bitmaps) {
ctl->extents_thresh = 0;
return;
}
ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) *
PAGE_CACHE_SIZE / sizeof(*info);
}
/*
* We don't fall back to bitmap, if we are below the extents threshold
* or this chunk of inode numbers is a big one.
*/
static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
if (ctl->free_extents < ctl->extents_thresh ||
info->bytes > INODES_PER_BITMAP / 10)
return false;
return true;
}
static struct btrfs_free_space_op free_ino_op = {
.recalc_thresholds = recalculate_thresholds,
.use_bitmap = use_bitmap,
};
static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl)
{
}
static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
/*
* We always use extents for two reasons:
*
* - The pinned tree is only used during the process of caching
* work.
* - Make code simpler. See btrfs_unpin_free_ino().
*/
return false;
}
static struct btrfs_free_space_op pinned_free_ino_op = {
.recalc_thresholds = pinned_recalc_thresholds,
.use_bitmap = pinned_use_bitmap,
};
void btrfs_init_free_ino_ctl(struct btrfs_root *root)
{
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
spin_lock_init(&ctl->tree_lock);
ctl->unit = 1;
ctl->start = 0;
ctl->private = NULL;
ctl->op = &free_ino_op;
INIT_LIST_HEAD(&ctl->trimming_ranges);
mutex_init(&ctl->cache_writeout_mutex);
/*
* Initially we allow to use 16K of ram to cache chunks of
* inode numbers before we resort to bitmaps. This is somewhat
* arbitrary, but it will be adjusted in runtime.
*/
ctl->extents_thresh = INIT_THRESHOLD;
spin_lock_init(&pinned->tree_lock);
pinned->unit = 1;
pinned->start = 0;
pinned->private = NULL;
pinned->extents_thresh = 0;
pinned->op = &pinned_free_ino_op;
}
int btrfs_save_ino_cache(struct btrfs_root *root,
struct btrfs_trans_handle *trans)
{
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
struct btrfs_path *path;
struct inode *inode;
struct btrfs_block_rsv *rsv;
u64 num_bytes;
u64 alloc_hint = 0;
int ret;
int prealloc;
bool retry = false;
/* only fs tree and subvol/snap needs ino cache */
if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID &&
(root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID ||
root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID))
return 0;
/* Don't save inode cache if we are deleting this root */
if (btrfs_root_refs(&root->root_item) == 0)
return 0;
if (!btrfs_test_opt(root, INODE_MAP_CACHE))
return 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
rsv = trans->block_rsv;
trans->block_rsv = &root->fs_info->trans_block_rsv;
num_bytes = trans->bytes_reserved;
/*
* 1 item for inode item insertion if need
* 4 items for inode item update (in the worst case)
* 1 items for slack space if we need do truncation
* 1 item for free space object
* 3 items for pre-allocation
*/
trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 10);
ret = btrfs_block_rsv_add(root, trans->block_rsv,
trans->bytes_reserved,
BTRFS_RESERVE_NO_FLUSH);
if (ret)
goto out;
trace_btrfs_space_reservation(root->fs_info, "ino_cache",
trans->transid, trans->bytes_reserved, 1);
again:
inode = lookup_free_ino_inode(root, path);
if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) {
ret = PTR_ERR(inode);
goto out_release;
}
if (IS_ERR(inode)) {
BUG_ON(retry); /* Logic error */
retry = true;
ret = create_free_ino_inode(root, trans, path);
if (ret)
goto out_release;
goto again;
}
BTRFS_I(inode)->generation = 0;
ret = btrfs_update_inode(trans, root, inode);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto out_put;
}
if (i_size_read(inode) > 0) {
ret = btrfs_truncate_free_space_cache(root, trans, inode);
if (ret) {
if (ret != -ENOSPC)
btrfs_abort_transaction(trans, root, ret);
goto out_put;
}
}
spin_lock(&root->ino_cache_lock);
if (root->ino_cache_state != BTRFS_CACHE_FINISHED) {
ret = -1;
spin_unlock(&root->ino_cache_lock);
goto out_put;
}
spin_unlock(&root->ino_cache_lock);
spin_lock(&ctl->tree_lock);
prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
prealloc = ALIGN(prealloc, PAGE_CACHE_SIZE);
prealloc += ctl->total_bitmaps * PAGE_CACHE_SIZE;
spin_unlock(&ctl->tree_lock);
/* Just to make sure we have enough space */
prealloc += 8 * PAGE_CACHE_SIZE;
ret = btrfs_delalloc_reserve_space(inode, prealloc);
if (ret)
goto out_put;
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
prealloc, prealloc, &alloc_hint);
if (ret) {
btrfs_delalloc_release_space(inode, prealloc);
goto out_put;
}
btrfs_free_reserved_data_space(inode, prealloc);
ret = btrfs_write_out_ino_cache(root, trans, path, inode);
out_put:
iput(inode);
out_release:
trace_btrfs_space_reservation(root->fs_info, "ino_cache",
trans->transid, trans->bytes_reserved, 0);
btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
out:
trans->block_rsv = rsv;
trans->bytes_reserved = num_bytes;
btrfs_free_path(path);
return ret;
}
static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
{
struct btrfs_path *path;
int ret;
struct extent_buffer *l;
struct btrfs_key search_key;
struct btrfs_key found_key;
int slot;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
search_key.type = -1;
search_key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
goto error;
BUG_ON(ret == 0); /* Corruption */
if (path->slots[0] > 0) {
slot = path->slots[0] - 1;
l = path->nodes[0];
btrfs_item_key_to_cpu(l, &found_key, slot);
*objectid = max_t(u64, found_key.objectid,
BTRFS_FIRST_FREE_OBJECTID - 1);
} else {
*objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
}
ret = 0;
error:
btrfs_free_path(path);
return ret;
}
int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
{
int ret;
mutex_lock(&root->objectid_mutex);
if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_find_highest_objectid(root,
&root->highest_objectid);
if (ret)
goto out;
}
if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
ret = -ENOSPC;
goto out;
}
*objectid = ++root->highest_objectid;
ret = 0;
out:
mutex_unlock(&root->objectid_mutex);
return ret;
}