linux_dsm_epyc7002/fs/btrfs/tests/btrfs-tests.c
Filipe Manana 7227ff4de5 Btrfs: fix race between adding and putting tree mod seq elements and nodes
There is a race between adding and removing elements to the tree mod log
list and rbtree that can lead to use-after-free problems.

Consider the following example that explains how/why the problems happens:

1) Task A has mod log element with sequence number 200. It currently is
   the only element in the mod log list;

2) Task A calls btrfs_put_tree_mod_seq() because it no longer needs to
   access the tree mod log. When it enters the function, it initializes
   'min_seq' to (u64)-1. Then it acquires the lock 'tree_mod_seq_lock'
   before checking if there are other elements in the mod seq list.
   Since the list it empty, 'min_seq' remains set to (u64)-1. Then it
   unlocks the lock 'tree_mod_seq_lock';

3) Before task A acquires the lock 'tree_mod_log_lock', task B adds
   itself to the mod seq list through btrfs_get_tree_mod_seq() and gets a
   sequence number of 201;

4) Some other task, name it task C, modifies a btree and because there
   elements in the mod seq list, it adds a tree mod elem to the tree
   mod log rbtree. That node added to the mod log rbtree is assigned
   a sequence number of 202;

5) Task B, which is doing fiemap and resolving indirect back references,
   calls btrfs get_old_root(), with 'time_seq' == 201, which in turn
   calls tree_mod_log_search() - the search returns the mod log node
   from the rbtree with sequence number 202, created by task C;

6) Task A now acquires the lock 'tree_mod_log_lock', starts iterating
   the mod log rbtree and finds the node with sequence number 202. Since
   202 is less than the previously computed 'min_seq', (u64)-1, it
   removes the node and frees it;

7) Task B still has a pointer to the node with sequence number 202, and
   it dereferences the pointer itself and through the call to
   __tree_mod_log_rewind(), resulting in a use-after-free problem.

This issue can be triggered sporadically with the test case generic/561
from fstests, and it happens more frequently with a higher number of
duperemove processes. When it happens to me, it either freezes the VM or
it produces a trace like the following before crashing:

  [ 1245.321140] general protection fault: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI
  [ 1245.321200] CPU: 1 PID: 26997 Comm: pool Not tainted 5.5.0-rc6-btrfs-next-52 #1
  [ 1245.321235] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-0-ga698c8995f-prebuilt.qemu.org 04/01/2014
  [ 1245.321287] RIP: 0010:rb_next+0x16/0x50
  [ 1245.321307] Code: ....
  [ 1245.321372] RSP: 0018:ffffa151c4d039b0 EFLAGS: 00010202
  [ 1245.321388] RAX: 6b6b6b6b6b6b6b6b RBX: ffff8ae221363c80 RCX: 6b6b6b6b6b6b6b6b
  [ 1245.321409] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff8ae221363c80
  [ 1245.321439] RBP: ffff8ae20fcc4688 R08: 0000000000000002 R09: 0000000000000000
  [ 1245.321475] R10: ffff8ae20b120910 R11: 00000000243f8bb1 R12: 0000000000000038
  [ 1245.321506] R13: ffff8ae221363c80 R14: 000000000000075f R15: ffff8ae223f762b8
  [ 1245.321539] FS:  00007fdee1ec7700(0000) GS:ffff8ae236c80000(0000) knlGS:0000000000000000
  [ 1245.321591] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  [ 1245.321614] CR2: 00007fded4030c48 CR3: 000000021da16003 CR4: 00000000003606e0
  [ 1245.321642] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  [ 1245.321668] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  [ 1245.321706] Call Trace:
  [ 1245.321798]  __tree_mod_log_rewind+0xbf/0x280 [btrfs]
  [ 1245.321841]  btrfs_search_old_slot+0x105/0xd00 [btrfs]
  [ 1245.321877]  resolve_indirect_refs+0x1eb/0xc60 [btrfs]
  [ 1245.321912]  find_parent_nodes+0x3dc/0x11b0 [btrfs]
  [ 1245.321947]  btrfs_check_shared+0x115/0x1c0 [btrfs]
  [ 1245.321980]  ? extent_fiemap+0x59d/0x6d0 [btrfs]
  [ 1245.322029]  extent_fiemap+0x59d/0x6d0 [btrfs]
  [ 1245.322066]  do_vfs_ioctl+0x45a/0x750
  [ 1245.322081]  ksys_ioctl+0x70/0x80
  [ 1245.322092]  ? trace_hardirqs_off_thunk+0x1a/0x1c
  [ 1245.322113]  __x64_sys_ioctl+0x16/0x20
  [ 1245.322126]  do_syscall_64+0x5c/0x280
  [ 1245.322139]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
  [ 1245.322155] RIP: 0033:0x7fdee3942dd7
  [ 1245.322177] Code: ....
  [ 1245.322258] RSP: 002b:00007fdee1ec6c88 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
  [ 1245.322294] RAX: ffffffffffffffda RBX: 00007fded40210d8 RCX: 00007fdee3942dd7
  [ 1245.322314] RDX: 00007fded40210d8 RSI: 00000000c020660b RDI: 0000000000000004
  [ 1245.322337] RBP: 0000562aa89e7510 R08: 0000000000000000 R09: 00007fdee1ec6d44
  [ 1245.322369] R10: 0000000000000073 R11: 0000000000000246 R12: 00007fdee1ec6d48
  [ 1245.322390] R13: 00007fdee1ec6d40 R14: 00007fded40210d0 R15: 00007fdee1ec6d50
  [ 1245.322423] Modules linked in: ....
  [ 1245.323443] ---[ end trace 01de1e9ec5dff3cd ]---

Fix this by ensuring that btrfs_put_tree_mod_seq() computes the minimum
sequence number and iterates the rbtree while holding the lock
'tree_mod_log_lock' in write mode. Also get rid of the 'tree_mod_seq_lock'
lock, since it is now redundant.

Fixes: bd989ba359 ("Btrfs: add tree modification log functions")
Fixes: 097b8a7c9e ("Btrfs: join tree mod log code with the code holding back delayed refs")
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-01-31 14:01:20 +01:00

325 lines
8.0 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2013 Fusion IO. All rights reserved.
*/
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/magic.h>
#include "btrfs-tests.h"
#include "../ctree.h"
#include "../free-space-cache.h"
#include "../free-space-tree.h"
#include "../transaction.h"
#include "../volumes.h"
#include "../disk-io.h"
#include "../qgroup.h"
#include "../block-group.h"
static struct vfsmount *test_mnt = NULL;
const char *test_error[] = {
[TEST_ALLOC_FS_INFO] = "cannot allocate fs_info",
[TEST_ALLOC_ROOT] = "cannot allocate root",
[TEST_ALLOC_EXTENT_BUFFER] = "cannot extent buffer",
[TEST_ALLOC_PATH] = "cannot allocate path",
[TEST_ALLOC_INODE] = "cannot allocate inode",
[TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group",
[TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map",
};
static const struct super_operations btrfs_test_super_ops = {
.alloc_inode = btrfs_alloc_inode,
.destroy_inode = btrfs_test_destroy_inode,
};
static int btrfs_test_init_fs_context(struct fs_context *fc)
{
struct pseudo_fs_context *ctx = init_pseudo(fc, BTRFS_TEST_MAGIC);
if (!ctx)
return -ENOMEM;
ctx->ops = &btrfs_test_super_ops;
return 0;
}
static struct file_system_type test_type = {
.name = "btrfs_test_fs",
.init_fs_context = btrfs_test_init_fs_context,
.kill_sb = kill_anon_super,
};
struct inode *btrfs_new_test_inode(void)
{
struct inode *inode;
inode = new_inode(test_mnt->mnt_sb);
if (inode)
inode_init_owner(inode, NULL, S_IFREG);
return inode;
}
static int btrfs_init_test_fs(void)
{
int ret;
ret = register_filesystem(&test_type);
if (ret) {
printk(KERN_ERR "btrfs: cannot register test file system\n");
return ret;
}
test_mnt = kern_mount(&test_type);
if (IS_ERR(test_mnt)) {
printk(KERN_ERR "btrfs: cannot mount test file system\n");
unregister_filesystem(&test_type);
return PTR_ERR(test_mnt);
}
return 0;
}
static void btrfs_destroy_test_fs(void)
{
kern_unmount(test_mnt);
unregister_filesystem(&test_type);
}
struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info)
{
struct btrfs_device *dev;
dev = kzalloc(sizeof(*dev), GFP_KERNEL);
if (!dev)
return ERR_PTR(-ENOMEM);
extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
INIT_LIST_HEAD(&dev->dev_list);
list_add(&dev->dev_list, &fs_info->fs_devices->devices);
return dev;
}
static void btrfs_free_dummy_device(struct btrfs_device *dev)
{
extent_io_tree_release(&dev->alloc_state);
kfree(dev);
}
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
{
struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
GFP_KERNEL);
if (!fs_info)
return fs_info;
fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices),
GFP_KERNEL);
if (!fs_info->fs_devices) {
kfree(fs_info);
return NULL;
}
fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block),
GFP_KERNEL);
if (!fs_info->super_copy) {
kfree(fs_info->fs_devices);
kfree(fs_info);
return NULL;
}
fs_info->nodesize = nodesize;
fs_info->sectorsize = sectorsize;
if (init_srcu_struct(&fs_info->subvol_srcu)) {
kfree(fs_info->fs_devices);
kfree(fs_info->super_copy);
kfree(fs_info);
return NULL;
}
spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->qgroup_lock);
spin_lock_init(&fs_info->super_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock);
mutex_init(&fs_info->qgroup_ioctl_lock);
mutex_init(&fs_info->qgroup_rescan_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
fs_info->running_transaction = NULL;
fs_info->qgroup_tree = RB_ROOT;
fs_info->qgroup_ulist = NULL;
atomic64_set(&fs_info->tree_mod_seq, 0);
INIT_LIST_HEAD(&fs_info->dirty_qgroups);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_LIST_HEAD(&fs_info->fs_devices->devices);
INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
extent_io_tree_init(fs_info, &fs_info->freed_extents[0],
IO_TREE_FS_INFO_FREED_EXTENTS0, NULL);
extent_io_tree_init(fs_info, &fs_info->freed_extents[1],
IO_TREE_FS_INFO_FREED_EXTENTS1, NULL);
extent_map_tree_init(&fs_info->mapping_tree);
fs_info->pinned_extents = &fs_info->freed_extents[0];
set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
test_mnt->mnt_sb->s_fs_info = fs_info;
return fs_info;
}
void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
{
struct radix_tree_iter iter;
void **slot;
struct btrfs_device *dev, *tmp;
if (!fs_info)
return;
if (WARN_ON(!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO,
&fs_info->fs_state)))
return;
test_mnt->mnt_sb->s_fs_info = NULL;
spin_lock(&fs_info->buffer_lock);
radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
struct extent_buffer *eb;
eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
if (!eb)
continue;
/* Shouldn't happen but that kind of thinking creates CVE's */
if (radix_tree_exception(eb)) {
if (radix_tree_deref_retry(eb))
slot = radix_tree_iter_retry(&iter);
continue;
}
slot = radix_tree_iter_resume(slot, &iter);
spin_unlock(&fs_info->buffer_lock);
free_extent_buffer_stale(eb);
spin_lock(&fs_info->buffer_lock);
}
spin_unlock(&fs_info->buffer_lock);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices,
dev_list) {
btrfs_free_dummy_device(dev);
}
btrfs_free_qgroup_config(fs_info);
btrfs_free_fs_roots(fs_info);
cleanup_srcu_struct(&fs_info->subvol_srcu);
kfree(fs_info->super_copy);
kfree(fs_info->fs_devices);
kfree(fs_info);
}
void btrfs_free_dummy_root(struct btrfs_root *root)
{
if (!root)
return;
/* Will be freed by btrfs_free_fs_roots */
if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
return;
if (root->node) {
/* One for allocate_extent_buffer */
free_extent_buffer(root->node);
}
kfree(root);
}
struct btrfs_block_group *
btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info,
unsigned long length)
{
struct btrfs_block_group *cache;
cache = kzalloc(sizeof(*cache), GFP_KERNEL);
if (!cache)
return NULL;
cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
GFP_KERNEL);
if (!cache->free_space_ctl) {
kfree(cache);
return NULL;
}
cache->start = 0;
cache->length = length;
cache->full_stripe_len = fs_info->sectorsize;
cache->fs_info = fs_info;
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
INIT_LIST_HEAD(&cache->bg_list);
btrfs_init_free_space_ctl(cache);
mutex_init(&cache->free_space_lock);
return cache;
}
void btrfs_free_dummy_block_group(struct btrfs_block_group *cache)
{
if (!cache)
return;
__btrfs_remove_free_space_cache(cache->free_space_ctl);
kfree(cache->free_space_ctl);
kfree(cache);
}
void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
memset(trans, 0, sizeof(*trans));
trans->transid = 1;
trans->type = __TRANS_DUMMY;
trans->fs_info = fs_info;
}
int btrfs_run_sanity_tests(void)
{
int ret, i;
u32 sectorsize, nodesize;
u32 test_sectorsize[] = {
PAGE_SIZE,
};
ret = btrfs_init_test_fs();
if (ret)
return ret;
for (i = 0; i < ARRAY_SIZE(test_sectorsize); i++) {
sectorsize = test_sectorsize[i];
for (nodesize = sectorsize;
nodesize <= BTRFS_MAX_METADATA_BLOCKSIZE;
nodesize <<= 1) {
pr_info("BTRFS: selftest: sectorsize: %u nodesize: %u\n",
sectorsize, nodesize);
ret = btrfs_test_free_space_cache(sectorsize, nodesize);
if (ret)
goto out;
ret = btrfs_test_extent_buffer_operations(sectorsize,
nodesize);
if (ret)
goto out;
ret = btrfs_test_extent_io(sectorsize, nodesize);
if (ret)
goto out;
ret = btrfs_test_inodes(sectorsize, nodesize);
if (ret)
goto out;
ret = btrfs_test_qgroups(sectorsize, nodesize);
if (ret)
goto out;
ret = btrfs_test_free_space_tree(sectorsize, nodesize);
if (ret)
goto out;
}
}
ret = btrfs_test_extent_map();
out:
btrfs_destroy_test_fs();
return ret;
}