linux_dsm_epyc7002/fs/btrfs/extent_io.h
Josef Bacik a48b73eca4 btrfs: fix potential deadlock in the search ioctl
With the conversion of the tree locks to rwsem I got the following
lockdep splat:

  ======================================================
  WARNING: possible circular locking dependency detected
  5.8.0-rc7-00165-g04ec4da5f45f-dirty #922 Not tainted
  ------------------------------------------------------
  compsize/11122 is trying to acquire lock:
  ffff889fabca8768 (&mm->mmap_lock#2){++++}-{3:3}, at: __might_fault+0x3e/0x90

  but task is already holding lock:
  ffff889fe720fe40 (btrfs-fs-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x39/0x180

  which lock already depends on the new lock.

  the existing dependency chain (in reverse order) is:

  -> #2 (btrfs-fs-00){++++}-{3:3}:
	 down_write_nested+0x3b/0x70
	 __btrfs_tree_lock+0x24/0x120
	 btrfs_search_slot+0x756/0x990
	 btrfs_lookup_inode+0x3a/0xb4
	 __btrfs_update_delayed_inode+0x93/0x270
	 btrfs_async_run_delayed_root+0x168/0x230
	 btrfs_work_helper+0xd4/0x570
	 process_one_work+0x2ad/0x5f0
	 worker_thread+0x3a/0x3d0
	 kthread+0x133/0x150
	 ret_from_fork+0x1f/0x30

  -> #1 (&delayed_node->mutex){+.+.}-{3:3}:
	 __mutex_lock+0x9f/0x930
	 btrfs_delayed_update_inode+0x50/0x440
	 btrfs_update_inode+0x8a/0xf0
	 btrfs_dirty_inode+0x5b/0xd0
	 touch_atime+0xa1/0xd0
	 btrfs_file_mmap+0x3f/0x60
	 mmap_region+0x3a4/0x640
	 do_mmap+0x376/0x580
	 vm_mmap_pgoff+0xd5/0x120
	 ksys_mmap_pgoff+0x193/0x230
	 do_syscall_64+0x50/0x90
	 entry_SYSCALL_64_after_hwframe+0x44/0xa9

  -> #0 (&mm->mmap_lock#2){++++}-{3:3}:
	 __lock_acquire+0x1272/0x2310
	 lock_acquire+0x9e/0x360
	 __might_fault+0x68/0x90
	 _copy_to_user+0x1e/0x80
	 copy_to_sk.isra.32+0x121/0x300
	 search_ioctl+0x106/0x200
	 btrfs_ioctl_tree_search_v2+0x7b/0xf0
	 btrfs_ioctl+0x106f/0x30a0
	 ksys_ioctl+0x83/0xc0
	 __x64_sys_ioctl+0x16/0x20
	 do_syscall_64+0x50/0x90
	 entry_SYSCALL_64_after_hwframe+0x44/0xa9

  other info that might help us debug this:

  Chain exists of:
    &mm->mmap_lock#2 --> &delayed_node->mutex --> btrfs-fs-00

   Possible unsafe locking scenario:

	 CPU0                    CPU1
	 ----                    ----
    lock(btrfs-fs-00);
				 lock(&delayed_node->mutex);
				 lock(btrfs-fs-00);
    lock(&mm->mmap_lock#2);

   *** DEADLOCK ***

  1 lock held by compsize/11122:
   #0: ffff889fe720fe40 (btrfs-fs-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x39/0x180

  stack backtrace:
  CPU: 17 PID: 11122 Comm: compsize Kdump: loaded Not tainted 5.8.0-rc7-00165-g04ec4da5f45f-dirty #922
  Hardware name: Quanta Tioga Pass Single Side 01-0030993006/Tioga Pass Single Side, BIOS F08_3A18 12/20/2018
  Call Trace:
   dump_stack+0x78/0xa0
   check_noncircular+0x165/0x180
   __lock_acquire+0x1272/0x2310
   lock_acquire+0x9e/0x360
   ? __might_fault+0x3e/0x90
   ? find_held_lock+0x72/0x90
   __might_fault+0x68/0x90
   ? __might_fault+0x3e/0x90
   _copy_to_user+0x1e/0x80
   copy_to_sk.isra.32+0x121/0x300
   ? btrfs_search_forward+0x2a6/0x360
   search_ioctl+0x106/0x200
   btrfs_ioctl_tree_search_v2+0x7b/0xf0
   btrfs_ioctl+0x106f/0x30a0
   ? __do_sys_newfstat+0x5a/0x70
   ? ksys_ioctl+0x83/0xc0
   ksys_ioctl+0x83/0xc0
   __x64_sys_ioctl+0x16/0x20
   do_syscall_64+0x50/0x90
   entry_SYSCALL_64_after_hwframe+0x44/0xa9

The problem is we're doing a copy_to_user() while holding tree locks,
which can deadlock if we have to do a page fault for the copy_to_user().
This exists even without my locking changes, so it needs to be fixed.
Rework the search ioctl to do the pre-fault and then
copy_to_user_nofault for the copying.

CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-08-27 13:56:27 +02:00

339 lines
11 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_EXTENT_IO_H
#define BTRFS_EXTENT_IO_H
#include <linux/rbtree.h>
#include <linux/refcount.h>
#include <linux/fiemap.h>
#include "ulist.h"
/*
* flags for bio submission. The high bits indicate the compression
* type for this bio
*/
#define EXTENT_BIO_COMPRESSED 1
#define EXTENT_BIO_FLAG_SHIFT 16
enum {
EXTENT_BUFFER_UPTODATE,
EXTENT_BUFFER_DIRTY,
EXTENT_BUFFER_CORRUPT,
/* this got triggered by readahead */
EXTENT_BUFFER_READAHEAD,
EXTENT_BUFFER_TREE_REF,
EXTENT_BUFFER_STALE,
EXTENT_BUFFER_WRITEBACK,
/* read IO error */
EXTENT_BUFFER_READ_ERR,
EXTENT_BUFFER_UNMAPPED,
EXTENT_BUFFER_IN_TREE,
/* write IO error */
EXTENT_BUFFER_WRITE_ERR,
};
/* these are flags for __process_pages_contig */
#define PAGE_UNLOCK (1 << 0)
#define PAGE_CLEAR_DIRTY (1 << 1)
#define PAGE_SET_WRITEBACK (1 << 2)
#define PAGE_END_WRITEBACK (1 << 3)
#define PAGE_SET_PRIVATE2 (1 << 4)
#define PAGE_SET_ERROR (1 << 5)
#define PAGE_LOCK (1 << 6)
/*
* page->private values. Every page that is controlled by the extent
* map has page->private set to one.
*/
#define EXTENT_PAGE_PRIVATE 1
/*
* The extent buffer bitmap operations are done with byte granularity instead of
* word granularity for two reasons:
* 1. The bitmaps must be little-endian on disk.
* 2. Bitmap items are not guaranteed to be aligned to a word and therefore a
* single word in a bitmap may straddle two pages in the extent buffer.
*/
#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE)
#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1)
#define BITMAP_FIRST_BYTE_MASK(start) \
((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK)
#define BITMAP_LAST_BYTE_MASK(nbits) \
(BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
struct btrfs_root;
struct btrfs_inode;
struct btrfs_io_bio;
struct io_failure_record;
struct extent_io_tree;
typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
int mirror_num,
unsigned long bio_flags);
typedef blk_status_t (extent_submit_bio_start_t)(void *private_data,
struct bio *bio, u64 bio_offset);
struct extent_io_ops {
/*
* The following callbacks must be always defined, the function
* pointer will be called unconditionally.
*/
submit_bio_hook_t *submit_bio_hook;
int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset,
struct page *page, u64 start, u64 end,
int mirror);
};
#define INLINE_EXTENT_BUFFER_PAGES 16
#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE)
struct extent_buffer {
u64 start;
unsigned long len;
unsigned long bflags;
struct btrfs_fs_info *fs_info;
spinlock_t refs_lock;
atomic_t refs;
atomic_t io_pages;
int read_mirror;
struct rcu_head rcu_head;
pid_t lock_owner;
int blocking_writers;
atomic_t blocking_readers;
bool lock_nested;
/* >= 0 if eb belongs to a log tree, -1 otherwise */
short log_index;
/* protects write locks */
rwlock_t lock;
/* readers use lock_wq while they wait for the write
* lock holders to unlock
*/
wait_queue_head_t write_lock_wq;
/* writers use read_lock_wq while they wait for readers
* to unlock
*/
wait_queue_head_t read_lock_wq;
struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
#ifdef CONFIG_BTRFS_DEBUG
int spinning_writers;
atomic_t spinning_readers;
atomic_t read_locks;
int write_locks;
struct list_head leak_list;
#endif
};
/*
* Structure to record how many bytes and which ranges are set/cleared
*/
struct extent_changeset {
/* How many bytes are set/cleared in this operation */
unsigned int bytes_changed;
/* Changed ranges */
struct ulist range_changed;
};
static inline void extent_changeset_init(struct extent_changeset *changeset)
{
changeset->bytes_changed = 0;
ulist_init(&changeset->range_changed);
}
static inline struct extent_changeset *extent_changeset_alloc(void)
{
struct extent_changeset *ret;
ret = kmalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return NULL;
extent_changeset_init(ret);
return ret;
}
static inline void extent_changeset_release(struct extent_changeset *changeset)
{
if (!changeset)
return;
changeset->bytes_changed = 0;
ulist_release(&changeset->range_changed);
}
static inline void extent_changeset_free(struct extent_changeset *changeset)
{
if (!changeset)
return;
extent_changeset_release(changeset);
kfree(changeset);
}
static inline void extent_set_compress_type(unsigned long *bio_flags,
int compress_type)
{
*bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
}
static inline int extent_compress_type(unsigned long bio_flags)
{
return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
}
struct extent_map_tree;
typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
struct page *page, size_t pg_offset,
u64 start, u64 len);
int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
int extent_read_full_page(struct page *page, get_extent_t *get_extent,
int mirror_num);
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
int mode);
int extent_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
void extent_readahead(struct readahead_control *rac);
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
void set_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, unsigned long len);
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src);
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
void free_extent_buffer(struct extent_buffer *eb);
void free_extent_buffer_stale(struct extent_buffer *eb);
#define WAIT_NONE 0
#define WAIT_COMPLETE 1
#define WAIT_PAGE_LOCK 2
int read_extent_buffer_pages(struct extent_buffer *eb, int wait,
int mirror_num);
void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
static inline int num_extent_pages(const struct extent_buffer *eb)
{
return (round_up(eb->start + eb->len, PAGE_SIZE) >> PAGE_SHIFT) -
(eb->start >> PAGE_SHIFT);
}
static inline int extent_buffer_uptodate(const struct extent_buffer *eb)
{
return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
}
int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
unsigned long start, unsigned long len);
void read_extent_buffer(const struct extent_buffer *eb, void *dst,
unsigned long start,
unsigned long len);
int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
void __user *dst, unsigned long start,
unsigned long len);
void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *src);
void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
const void *src);
void write_extent_buffer(const struct extent_buffer *eb, const void *src,
unsigned long start, unsigned long len);
void copy_extent_buffer_full(const struct extent_buffer *dst,
const struct extent_buffer *src);
void copy_extent_buffer(const struct extent_buffer *dst,
const struct extent_buffer *src,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len);
void memcpy_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len);
void memmove_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len);
void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
unsigned long len);
int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
unsigned long pos);
void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
unsigned long pos, unsigned long len);
void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
unsigned long start, unsigned long pos,
unsigned long len);
void clear_extent_buffer_dirty(const struct extent_buffer *eb);
bool set_extent_buffer_dirty(struct extent_buffer *eb);
void set_extent_buffer_uptodate(struct extent_buffer *eb);
void clear_extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_under_io(const struct extent_buffer *eb);
void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
unsigned bits_to_clear,
unsigned long page_ops);
struct bio *btrfs_bio_alloc(u64 first_byte);
struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
struct bio *btrfs_bio_clone(struct bio *bio);
struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size);
struct btrfs_fs_info;
struct btrfs_inode;
int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, struct page *page,
unsigned int pg_offset, int mirror_num);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
/*
* When IO fails, either with EIO or csum verification fails, we
* try other mirrors that might have a good copy of the data. This
* io_failure_record is used to record state as we go through all the
* mirrors. If another mirror has good data, the page is set up to date
* and things continue. If a good mirror can't be found, the original
* bio end_io callback is called to indicate things have failed.
*/
struct io_failure_record {
struct page *page;
u64 start;
u64 len;
u64 logical;
unsigned long bio_flags;
int this_mirror;
int failed_mirror;
int in_validation;
};
blk_status_t btrfs_submit_read_repair(struct inode *inode,
struct bio *failed_bio, u64 phy_offset,
struct page *page, unsigned int pgoff,
u64 start, u64 end, int failed_mirror,
submit_bio_hook_t *submit_bio_hook);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
struct page *locked_page, u64 *start,
u64 *end);
#endif
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
#ifdef CONFIG_BTRFS_DEBUG
void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info);
#else
#define btrfs_extent_buffer_leak_debug_check(fs_info) do {} while (0)
#endif
#endif