linux_dsm_epyc7002/include/linux/kernfs.h
Tejun Heo ecca47ce82 kernfs: kernfs_notify() must be useable from non-sleepable contexts
d911d98748 ("kernfs: make kernfs_notify() trigger inotify events
too") added fsnotify triggering to kernfs_notify() which requires a
sleepable context.  There are already existing users of
kernfs_notify() which invoke it from an atomic context and in general
it's silly to require a sleepable context for triggering a
notification.

The following is an invalid context bug triggerd by md invoking
sysfs_notify() from IO completion path.

 BUG: sleeping function called from invalid context at kernel/locking/mutex.c:586
 in_atomic(): 1, irqs_disabled(): 1, pid: 0, name: swapper/1
 2 locks held by swapper/1/0:
  #0:  (&(&vblk->vq_lock)->rlock){-.-...}, at: [<ffffffffa0039042>] virtblk_done+0x42/0xe0 [virtio_blk]
  #1:  (&(&bitmap->counts.lock)->rlock){-.....}, at: [<ffffffff81633718>] bitmap_endwrite+0x68/0x240
 irq event stamp: 33518
 hardirqs last  enabled at (33515): [<ffffffff8102544f>] default_idle+0x1f/0x230
 hardirqs last disabled at (33516): [<ffffffff818122ed>] common_interrupt+0x6d/0x72
 softirqs last  enabled at (33518): [<ffffffff810a1272>] _local_bh_enable+0x22/0x50
 softirqs last disabled at (33517): [<ffffffff810a29e0>] irq_enter+0x60/0x80
 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 3.16.0-0.rc2.git2.1.fc21.x86_64 #1
 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
  0000000000000000 f90db13964f4ee05 ffff88007d403b80 ffffffff81807b4c
  0000000000000000 ffff88007d403ba8 ffffffff810d4f14 0000000000000000
  0000000000441800 ffff880078fa1780 ffff88007d403c38 ffffffff8180caf2
 Call Trace:
  <IRQ>  [<ffffffff81807b4c>] dump_stack+0x4d/0x66
  [<ffffffff810d4f14>] __might_sleep+0x184/0x240
  [<ffffffff8180caf2>] mutex_lock_nested+0x42/0x440
  [<ffffffff812d76a0>] kernfs_notify+0x90/0x150
  [<ffffffff8163377c>] bitmap_endwrite+0xcc/0x240
  [<ffffffffa00de863>] close_write+0x93/0xb0 [raid1]
  [<ffffffffa00df029>] r1_bio_write_done+0x29/0x50 [raid1]
  [<ffffffffa00e0474>] raid1_end_write_request+0xe4/0x260 [raid1]
  [<ffffffff813acb8b>] bio_endio+0x6b/0xa0
  [<ffffffff813b46c4>] blk_update_request+0x94/0x420
  [<ffffffff813bf0ea>] blk_mq_end_io+0x1a/0x70
  [<ffffffffa00392c2>] virtblk_request_done+0x32/0x80 [virtio_blk]
  [<ffffffff813c0648>] __blk_mq_complete_request+0x88/0x120
  [<ffffffff813c070a>] blk_mq_complete_request+0x2a/0x30
  [<ffffffffa0039066>] virtblk_done+0x66/0xe0 [virtio_blk]
  [<ffffffffa002535a>] vring_interrupt+0x3a/0xa0 [virtio_ring]
  [<ffffffff81116177>] handle_irq_event_percpu+0x77/0x340
  [<ffffffff8111647d>] handle_irq_event+0x3d/0x60
  [<ffffffff81119436>] handle_edge_irq+0x66/0x130
  [<ffffffff8101c3e4>] handle_irq+0x84/0x150
  [<ffffffff818146ad>] do_IRQ+0x4d/0xe0
  [<ffffffff818122f2>] common_interrupt+0x72/0x72
  <EOI>  [<ffffffff8105f706>] ? native_safe_halt+0x6/0x10
  [<ffffffff81025454>] default_idle+0x24/0x230
  [<ffffffff81025f9f>] arch_cpu_idle+0xf/0x20
  [<ffffffff810f5adc>] cpu_startup_entry+0x37c/0x7b0
  [<ffffffff8104df1b>] start_secondary+0x25b/0x300

This patch fixes it by punting the notification delivery through a
work item.  This ends up adding an extra pointer to kernfs_elem_attr
enlarging kernfs_node by a pointer, which is not ideal but not a very
big deal either.  If this turns out to be an actual issue, we can move
kernfs_elem_attr->size to kernfs_node->iattr later.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Josh Boyer <jwboyer@fedoraproject.org>
Cc: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2014-07-02 09:32:09 -07:00

466 lines
14 KiB
C

/*
* kernfs.h - pseudo filesystem decoupled from vfs locking
*
* This file is released under the GPLv2.
*/
#ifndef __LINUX_KERNFS_H
#define __LINUX_KERNFS_H
#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/idr.h>
#include <linux/lockdep.h>
#include <linux/rbtree.h>
#include <linux/atomic.h>
#include <linux/wait.h>
struct file;
struct dentry;
struct iattr;
struct seq_file;
struct vm_area_struct;
struct super_block;
struct file_system_type;
struct kernfs_open_node;
struct kernfs_iattrs;
enum kernfs_node_type {
KERNFS_DIR = 0x0001,
KERNFS_FILE = 0x0002,
KERNFS_LINK = 0x0004,
};
#define KERNFS_TYPE_MASK 0x000f
#define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK
enum kernfs_node_flag {
KERNFS_ACTIVATED = 0x0010,
KERNFS_NS = 0x0020,
KERNFS_HAS_SEQ_SHOW = 0x0040,
KERNFS_HAS_MMAP = 0x0080,
KERNFS_LOCKDEP = 0x0100,
KERNFS_STATIC_NAME = 0x0200,
KERNFS_SUICIDAL = 0x0400,
KERNFS_SUICIDED = 0x0800,
};
/* @flags for kernfs_create_root() */
enum kernfs_root_flag {
/*
* kernfs_nodes are created in the deactivated state and invisible.
* They require explicit kernfs_activate() to become visible. This
* can be used to make related nodes become visible atomically
* after all nodes are created successfully.
*/
KERNFS_ROOT_CREATE_DEACTIVATED = 0x0001,
/*
* For regular flies, if the opener has CAP_DAC_OVERRIDE, open(2)
* succeeds regardless of the RW permissions. sysfs had an extra
* layer of enforcement where open(2) fails with -EACCES regardless
* of CAP_DAC_OVERRIDE if the permission doesn't have the
* respective read or write access at all (none of S_IRUGO or
* S_IWUGO) or the respective operation isn't implemented. The
* following flag enables that behavior.
*/
KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK = 0x0002,
};
/* type-specific structures for kernfs_node union members */
struct kernfs_elem_dir {
unsigned long subdirs;
/* children rbtree starts here and goes through kn->rb */
struct rb_root children;
/*
* The kernfs hierarchy this directory belongs to. This fits
* better directly in kernfs_node but is here to save space.
*/
struct kernfs_root *root;
};
struct kernfs_elem_symlink {
struct kernfs_node *target_kn;
};
struct kernfs_elem_attr {
const struct kernfs_ops *ops;
struct kernfs_open_node *open;
loff_t size;
struct kernfs_node *notify_next; /* for kernfs_notify() */
};
/*
* kernfs_node - the building block of kernfs hierarchy. Each and every
* kernfs node is represented by single kernfs_node. Most fields are
* private to kernfs and shouldn't be accessed directly by kernfs users.
*
* As long as s_count reference is held, the kernfs_node itself is
* accessible. Dereferencing elem or any other outer entity requires
* active reference.
*/
struct kernfs_node {
atomic_t count;
atomic_t active;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
/*
* Use kernfs_get_parent() and kernfs_name/path() instead of
* accessing the following two fields directly. If the node is
* never moved to a different parent, it is safe to access the
* parent directly.
*/
struct kernfs_node *parent;
const char *name;
struct rb_node rb;
const void *ns; /* namespace tag */
unsigned int hash; /* ns + name hash */
union {
struct kernfs_elem_dir dir;
struct kernfs_elem_symlink symlink;
struct kernfs_elem_attr attr;
};
void *priv;
unsigned short flags;
umode_t mode;
unsigned int ino;
struct kernfs_iattrs *iattr;
};
/*
* kernfs_syscall_ops may be specified on kernfs_create_root() to support
* syscalls. These optional callbacks are invoked on the matching syscalls
* and can perform any kernfs operations which don't necessarily have to be
* the exact operation requested. An active reference is held for each
* kernfs_node parameter.
*/
struct kernfs_syscall_ops {
int (*remount_fs)(struct kernfs_root *root, int *flags, char *data);
int (*show_options)(struct seq_file *sf, struct kernfs_root *root);
int (*mkdir)(struct kernfs_node *parent, const char *name,
umode_t mode);
int (*rmdir)(struct kernfs_node *kn);
int (*rename)(struct kernfs_node *kn, struct kernfs_node *new_parent,
const char *new_name);
};
struct kernfs_root {
/* published fields */
struct kernfs_node *kn;
unsigned int flags; /* KERNFS_ROOT_* flags */
/* private fields, do not use outside kernfs proper */
struct ida ino_ida;
struct kernfs_syscall_ops *syscall_ops;
/* list of kernfs_super_info of this root, protected by kernfs_mutex */
struct list_head supers;
wait_queue_head_t deactivate_waitq;
};
struct kernfs_open_file {
/* published fields */
struct kernfs_node *kn;
struct file *file;
void *priv;
/* private fields, do not use outside kernfs proper */
struct mutex mutex;
int event;
struct list_head list;
size_t atomic_write_len;
bool mmapped;
const struct vm_operations_struct *vm_ops;
};
struct kernfs_ops {
/*
* Read is handled by either seq_file or raw_read().
*
* If seq_show() is present, seq_file path is active. Other seq
* operations are optional and if not implemented, the behavior is
* equivalent to single_open(). @sf->private points to the
* associated kernfs_open_file.
*
* read() is bounced through kernel buffer and a read larger than
* PAGE_SIZE results in partial operation of PAGE_SIZE.
*/
int (*seq_show)(struct seq_file *sf, void *v);
void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
void (*seq_stop)(struct seq_file *sf, void *v);
ssize_t (*read)(struct kernfs_open_file *of, char *buf, size_t bytes,
loff_t off);
/*
* write() is bounced through kernel buffer. If atomic_write_len
* is not set, a write larger than PAGE_SIZE results in partial
* operations of PAGE_SIZE chunks. If atomic_write_len is set,
* writes upto the specified size are executed atomically but
* larger ones are rejected with -E2BIG.
*/
size_t atomic_write_len;
ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes,
loff_t off);
int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lock_class_key lockdep_key;
#endif
};
#ifdef CONFIG_KERNFS
static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn)
{
return kn->flags & KERNFS_TYPE_MASK;
}
/**
* kernfs_enable_ns - enable namespace under a directory
* @kn: directory of interest, should be empty
*
* This is to be called right after @kn is created to enable namespace
* under it. All children of @kn must have non-NULL namespace tags and
* only the ones which match the super_block's tag will be visible.
*/
static inline void kernfs_enable_ns(struct kernfs_node *kn)
{
WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR);
WARN_ON_ONCE(!RB_EMPTY_ROOT(&kn->dir.children));
kn->flags |= KERNFS_NS;
}
/**
* kernfs_ns_enabled - test whether namespace is enabled
* @kn: the node to test
*
* Test whether namespace filtering is enabled for the children of @ns.
*/
static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
{
return kn->flags & KERNFS_NS;
}
int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen);
char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
size_t buflen);
void pr_cont_kernfs_name(struct kernfs_node *kn);
void pr_cont_kernfs_path(struct kernfs_node *kn);
struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn);
struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
const char *name, const void *ns);
void kernfs_get(struct kernfs_node *kn);
void kernfs_put(struct kernfs_node *kn);
struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry);
struct kernfs_root *kernfs_root_from_sb(struct super_block *sb);
struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
unsigned int flags, void *priv);
void kernfs_destroy_root(struct kernfs_root *root);
struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
const char *name, umode_t mode,
void *priv, const void *ns);
struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
const char *name,
umode_t mode, loff_t size,
const struct kernfs_ops *ops,
void *priv, const void *ns,
bool name_is_static,
struct lock_class_key *key);
struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
const char *name,
struct kernfs_node *target);
void kernfs_activate(struct kernfs_node *kn);
void kernfs_remove(struct kernfs_node *kn);
void kernfs_break_active_protection(struct kernfs_node *kn);
void kernfs_unbreak_active_protection(struct kernfs_node *kn);
bool kernfs_remove_self(struct kernfs_node *kn);
int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
const void *ns);
int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
const char *new_name, const void *new_ns);
int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);
void kernfs_notify(struct kernfs_node *kn);
const void *kernfs_super_ns(struct super_block *sb);
struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
struct kernfs_root *root, unsigned long magic,
bool *new_sb_created, const void *ns);
void kernfs_kill_sb(struct super_block *sb);
void kernfs_init(void);
#else /* CONFIG_KERNFS */
static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn)
{ return 0; } /* whatever */
static inline void kernfs_enable_ns(struct kernfs_node *kn) { }
static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
{ return false; }
static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
{ return -ENOSYS; }
static inline char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
size_t buflen)
{ return NULL; }
static inline void pr_cont_kernfs_name(struct kernfs_node *kn) { }
static inline void pr_cont_kernfs_path(struct kernfs_node *kn) { }
static inline struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
{ return NULL; }
static inline struct kernfs_node *
kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name,
const void *ns)
{ return NULL; }
static inline void kernfs_get(struct kernfs_node *kn) { }
static inline void kernfs_put(struct kernfs_node *kn) { }
static inline struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
{ return NULL; }
static inline struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
{ return NULL; }
static inline struct kernfs_root *
kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags,
void *priv)
{ return ERR_PTR(-ENOSYS); }
static inline void kernfs_destroy_root(struct kernfs_root *root) { }
static inline struct kernfs_node *
kernfs_create_dir_ns(struct kernfs_node *parent, const char *name,
umode_t mode, void *priv, const void *ns)
{ return ERR_PTR(-ENOSYS); }
static inline struct kernfs_node *
__kernfs_create_file(struct kernfs_node *parent, const char *name,
umode_t mode, loff_t size, const struct kernfs_ops *ops,
void *priv, const void *ns, bool name_is_static,
struct lock_class_key *key)
{ return ERR_PTR(-ENOSYS); }
static inline struct kernfs_node *
kernfs_create_link(struct kernfs_node *parent, const char *name,
struct kernfs_node *target)
{ return ERR_PTR(-ENOSYS); }
static inline void kernfs_activate(struct kernfs_node *kn) { }
static inline void kernfs_remove(struct kernfs_node *kn) { }
static inline bool kernfs_remove_self(struct kernfs_node *kn)
{ return false; }
static inline int kernfs_remove_by_name_ns(struct kernfs_node *kn,
const char *name, const void *ns)
{ return -ENOSYS; }
static inline int kernfs_rename_ns(struct kernfs_node *kn,
struct kernfs_node *new_parent,
const char *new_name, const void *new_ns)
{ return -ENOSYS; }
static inline int kernfs_setattr(struct kernfs_node *kn,
const struct iattr *iattr)
{ return -ENOSYS; }
static inline void kernfs_notify(struct kernfs_node *kn) { }
static inline const void *kernfs_super_ns(struct super_block *sb)
{ return NULL; }
static inline struct dentry *
kernfs_mount_ns(struct file_system_type *fs_type, int flags,
struct kernfs_root *root, unsigned long magic,
bool *new_sb_created, const void *ns)
{ return ERR_PTR(-ENOSYS); }
static inline void kernfs_kill_sb(struct super_block *sb) { }
static inline void kernfs_init(void) { }
#endif /* CONFIG_KERNFS */
static inline struct kernfs_node *
kernfs_find_and_get(struct kernfs_node *kn, const char *name)
{
return kernfs_find_and_get_ns(kn, name, NULL);
}
static inline struct kernfs_node *
kernfs_create_dir(struct kernfs_node *parent, const char *name, umode_t mode,
void *priv)
{
return kernfs_create_dir_ns(parent, name, mode, priv, NULL);
}
static inline struct kernfs_node *
kernfs_create_file_ns(struct kernfs_node *parent, const char *name,
umode_t mode, loff_t size, const struct kernfs_ops *ops,
void *priv, const void *ns)
{
struct lock_class_key *key = NULL;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
key = (struct lock_class_key *)&ops->lockdep_key;
#endif
return __kernfs_create_file(parent, name, mode, size, ops, priv, ns,
false, key);
}
static inline struct kernfs_node *
kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode,
loff_t size, const struct kernfs_ops *ops, void *priv)
{
return kernfs_create_file_ns(parent, name, mode, size, ops, priv, NULL);
}
static inline int kernfs_remove_by_name(struct kernfs_node *parent,
const char *name)
{
return kernfs_remove_by_name_ns(parent, name, NULL);
}
static inline int kernfs_rename(struct kernfs_node *kn,
struct kernfs_node *new_parent,
const char *new_name)
{
return kernfs_rename_ns(kn, new_parent, new_name, NULL);
}
static inline struct dentry *
kernfs_mount(struct file_system_type *fs_type, int flags,
struct kernfs_root *root, unsigned long magic,
bool *new_sb_created)
{
return kernfs_mount_ns(fs_type, flags, root,
magic, new_sb_created, NULL);
}
#endif /* __LINUX_KERNFS_H */