fs: Use rename lock and RCU for multi-step operations

The remaining usages for dcache_lock is to allow atomic, multi-step read-side
operations over the directory tree by excluding modifications to the tree.
Also, to walk in the leaf->root direction in the tree where we don't have
a natural d_lock ordering.

This could be accomplished by taking every d_lock, but this would mean a
huge number of locks and actually gets very tricky.

Solve this instead by using the rename seqlock for multi-step read-side
operations, retry in case of a rename so we don't walk up the wrong parent.
Concurrent dentry insertions are not serialised against.  Concurrent deletes
are tricky when walking up the directory: our parent might have been deleted
when dropping locks so also need to check and retry for that.

We can also use the rename lock in cases where livelock is a worry (and it
is introduced in subsequent patch).

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
This commit is contained in:
Nick Piggin 2011-01-07 17:49:37 +11:00
parent 9abca36087
commit 949854d024
5 changed files with 151 additions and 29 deletions

View File

@ -83,10 +83,11 @@ int pohmelfs_construct_path_string(struct pohmelfs_inode *pi, void *data, int le
int pohmelfs_path_length(struct pohmelfs_inode *pi) int pohmelfs_path_length(struct pohmelfs_inode *pi)
{ {
struct dentry *d, *root, *first; struct dentry *d, *root, *first;
int len = 1; /* Root slash */ int len;
unsigned seq;
first = d = d_find_alias(&pi->vfs_inode); first = d_find_alias(&pi->vfs_inode);
if (!d) { if (!first) {
dprintk("%s: ino: %llu, mode: %o.\n", __func__, pi->ino, pi->vfs_inode.i_mode); dprintk("%s: ino: %llu, mode: %o.\n", __func__, pi->ino, pi->vfs_inode.i_mode);
return -ENOENT; return -ENOENT;
} }
@ -95,6 +96,11 @@ int pohmelfs_path_length(struct pohmelfs_inode *pi)
root = dget(current->fs->root.dentry); root = dget(current->fs->root.dentry);
spin_unlock(&current->fs->lock); spin_unlock(&current->fs->lock);
rename_retry:
len = 1; /* Root slash */
d = first;
seq = read_seqbegin(&rename_lock);
rcu_read_lock();
spin_lock(&dcache_lock); spin_lock(&dcache_lock);
if (!IS_ROOT(d) && d_unhashed(d)) if (!IS_ROOT(d) && d_unhashed(d))
@ -105,6 +111,9 @@ int pohmelfs_path_length(struct pohmelfs_inode *pi)
d = d->d_parent; d = d->d_parent;
} }
spin_unlock(&dcache_lock); spin_unlock(&dcache_lock);
rcu_read_unlock();
if (read_seqretry(&rename_lock, seq))
goto rename_retry;
dput(root); dput(root);
dput(first); dput(first);

View File

@ -186,16 +186,25 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
{ {
struct dentry *root = sbi->sb->s_root; struct dentry *root = sbi->sb->s_root;
struct dentry *tmp; struct dentry *tmp;
char *buf = *name; char *buf;
char *p; char *p;
int len = 0; int len;
unsigned seq;
rename_retry:
buf = *name;
len = 0;
seq = read_seqbegin(&rename_lock);
rcu_read_lock();
spin_lock(&dcache_lock); spin_lock(&dcache_lock);
for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent) for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
len += tmp->d_name.len + 1; len += tmp->d_name.len + 1;
if (!len || --len > NAME_MAX) { if (!len || --len > NAME_MAX) {
spin_unlock(&dcache_lock); spin_unlock(&dcache_lock);
rcu_read_unlock();
if (read_seqretry(&rename_lock, seq))
goto rename_retry;
return 0; return 0;
} }
@ -209,6 +218,9 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
strncpy(p, tmp->d_name.name, tmp->d_name.len); strncpy(p, tmp->d_name.name, tmp->d_name.len);
} }
spin_unlock(&dcache_lock); spin_unlock(&dcache_lock);
rcu_read_unlock();
if (read_seqretry(&rename_lock, seq))
goto rename_retry;
return len; return len;
} }

View File

@ -80,6 +80,7 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
__cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock);
__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
EXPORT_SYMBOL(rename_lock);
EXPORT_SYMBOL(dcache_inode_lock); EXPORT_SYMBOL(dcache_inode_lock);
EXPORT_SYMBOL(dcache_lock); EXPORT_SYMBOL(dcache_lock);
@ -243,6 +244,7 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
__releases(dcache_inode_lock) __releases(dcache_inode_lock)
__releases(dcache_lock) __releases(dcache_lock)
{ {
dentry->d_parent = NULL;
list_del(&dentry->d_u.d_child); list_del(&dentry->d_u.d_child);
if (parent) if (parent)
spin_unlock(&parent->d_lock); spin_unlock(&parent->d_lock);
@ -1017,11 +1019,15 @@ void shrink_dcache_for_umount(struct super_block *sb)
* Return true if the parent or its subdirectories contain * Return true if the parent or its subdirectories contain
* a mount point * a mount point
*/ */
int have_submounts(struct dentry *parent) int have_submounts(struct dentry *parent)
{ {
struct dentry *this_parent = parent; struct dentry *this_parent;
struct list_head *next; struct list_head *next;
unsigned seq;
rename_retry:
this_parent = parent;
seq = read_seqbegin(&rename_lock);
spin_lock(&dcache_lock); spin_lock(&dcache_lock);
if (d_mountpoint(parent)) if (d_mountpoint(parent))
@ -1055,17 +1061,37 @@ int have_submounts(struct dentry *parent)
* All done at this level ... ascend and resume the search. * All done at this level ... ascend and resume the search.
*/ */
if (this_parent != parent) { if (this_parent != parent) {
next = this_parent->d_u.d_child.next; struct dentry *tmp;
struct dentry *child;
tmp = this_parent->d_parent;
rcu_read_lock();
spin_unlock(&this_parent->d_lock); spin_unlock(&this_parent->d_lock);
this_parent = this_parent->d_parent; child = this_parent;
this_parent = tmp;
spin_lock(&this_parent->d_lock); spin_lock(&this_parent->d_lock);
/* might go back up the wrong parent if we have had a rename
* or deletion */
if (this_parent != child->d_parent ||
read_seqretry(&rename_lock, seq)) {
spin_unlock(&this_parent->d_lock);
spin_unlock(&dcache_lock);
rcu_read_unlock();
goto rename_retry;
}
rcu_read_unlock();
next = child->d_u.d_child.next;
goto resume; goto resume;
} }
spin_unlock(&this_parent->d_lock); spin_unlock(&this_parent->d_lock);
spin_unlock(&dcache_lock); spin_unlock(&dcache_lock);
if (read_seqretry(&rename_lock, seq))
goto rename_retry;
return 0; /* No mount points found in tree */ return 0; /* No mount points found in tree */
positive: positive:
spin_unlock(&dcache_lock); spin_unlock(&dcache_lock);
if (read_seqretry(&rename_lock, seq))
goto rename_retry;
return 1; return 1;
} }
EXPORT_SYMBOL(have_submounts); EXPORT_SYMBOL(have_submounts);
@ -1086,10 +1112,15 @@ EXPORT_SYMBOL(have_submounts);
*/ */
static int select_parent(struct dentry * parent) static int select_parent(struct dentry * parent)
{ {
struct dentry *this_parent = parent; struct dentry *this_parent;
struct list_head *next; struct list_head *next;
unsigned seq;
int found = 0; int found = 0;
rename_retry:
this_parent = parent;
seq = read_seqbegin(&rename_lock);
spin_lock(&dcache_lock); spin_lock(&dcache_lock);
spin_lock(&this_parent->d_lock); spin_lock(&this_parent->d_lock);
repeat: repeat:
@ -1099,7 +1130,6 @@ static int select_parent(struct dentry * parent)
struct list_head *tmp = next; struct list_head *tmp = next;
struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
next = tmp->next; next = tmp->next;
BUG_ON(this_parent == dentry);
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
@ -1142,17 +1172,32 @@ static int select_parent(struct dentry * parent)
*/ */
if (this_parent != parent) { if (this_parent != parent) {
struct dentry *tmp; struct dentry *tmp;
next = this_parent->d_u.d_child.next; struct dentry *child;
tmp = this_parent->d_parent; tmp = this_parent->d_parent;
rcu_read_lock();
spin_unlock(&this_parent->d_lock); spin_unlock(&this_parent->d_lock);
BUG_ON(tmp == this_parent); child = this_parent;
this_parent = tmp; this_parent = tmp;
spin_lock(&this_parent->d_lock); spin_lock(&this_parent->d_lock);
/* might go back up the wrong parent if we have had a rename
* or deletion */
if (this_parent != child->d_parent ||
read_seqretry(&rename_lock, seq)) {
spin_unlock(&this_parent->d_lock);
spin_unlock(&dcache_lock);
rcu_read_unlock();
goto rename_retry;
}
rcu_read_unlock();
next = child->d_u.d_child.next;
goto resume; goto resume;
} }
out: out:
spin_unlock(&this_parent->d_lock); spin_unlock(&this_parent->d_lock);
spin_unlock(&dcache_lock); spin_unlock(&dcache_lock);
if (read_seqretry(&rename_lock, seq))
goto rename_retry;
return found; return found;
} }
@ -1654,7 +1699,7 @@ EXPORT_SYMBOL(d_add_ci);
struct dentry * d_lookup(struct dentry * parent, struct qstr * name) struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
{ {
struct dentry * dentry = NULL; struct dentry * dentry = NULL;
unsigned long seq; unsigned seq;
do { do {
seq = read_seqbegin(&rename_lock); seq = read_seqbegin(&rename_lock);
@ -2290,7 +2335,7 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
* @buffer: pointer to the end of the buffer * @buffer: pointer to the end of the buffer
* @buflen: pointer to buffer length * @buflen: pointer to buffer length
* *
* Caller holds the dcache_lock. * Caller holds the rename_lock.
* *
* If path is not reachable from the supplied root, then the value of * If path is not reachable from the supplied root, then the value of
* root is changed (without modifying refcounts). * root is changed (without modifying refcounts).
@ -2377,7 +2422,9 @@ char *__d_path(const struct path *path, struct path *root,
prepend(&res, &buflen, "\0", 1); prepend(&res, &buflen, "\0", 1);
spin_lock(&dcache_lock); spin_lock(&dcache_lock);
write_seqlock(&rename_lock);
error = prepend_path(path, root, &res, &buflen); error = prepend_path(path, root, &res, &buflen);
write_sequnlock(&rename_lock);
spin_unlock(&dcache_lock); spin_unlock(&dcache_lock);
if (error) if (error)
@ -2441,10 +2488,12 @@ char *d_path(const struct path *path, char *buf, int buflen)
get_fs_root(current->fs, &root); get_fs_root(current->fs, &root);
spin_lock(&dcache_lock); spin_lock(&dcache_lock);
write_seqlock(&rename_lock);
tmp = root; tmp = root;
error = path_with_deleted(path, &tmp, &res, &buflen); error = path_with_deleted(path, &tmp, &res, &buflen);
if (error) if (error)
res = ERR_PTR(error); res = ERR_PTR(error);
write_sequnlock(&rename_lock);
spin_unlock(&dcache_lock); spin_unlock(&dcache_lock);
path_put(&root); path_put(&root);
return res; return res;
@ -2472,10 +2521,12 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
get_fs_root(current->fs, &root); get_fs_root(current->fs, &root);
spin_lock(&dcache_lock); spin_lock(&dcache_lock);
write_seqlock(&rename_lock);
tmp = root; tmp = root;
error = path_with_deleted(path, &tmp, &res, &buflen); error = path_with_deleted(path, &tmp, &res, &buflen);
if (!error && !path_equal(&tmp, &root)) if (!error && !path_equal(&tmp, &root))
error = prepend_unreachable(&res, &buflen); error = prepend_unreachable(&res, &buflen);
write_sequnlock(&rename_lock);
spin_unlock(&dcache_lock); spin_unlock(&dcache_lock);
path_put(&root); path_put(&root);
if (error) if (error)
@ -2544,7 +2595,9 @@ char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
char *retval; char *retval;
spin_lock(&dcache_lock); spin_lock(&dcache_lock);
write_seqlock(&rename_lock);
retval = __dentry_path(dentry, buf, buflen); retval = __dentry_path(dentry, buf, buflen);
write_sequnlock(&rename_lock);
spin_unlock(&dcache_lock); spin_unlock(&dcache_lock);
return retval; return retval;
@ -2557,6 +2610,7 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
char *retval; char *retval;
spin_lock(&dcache_lock); spin_lock(&dcache_lock);
write_seqlock(&rename_lock);
if (d_unlinked(dentry)) { if (d_unlinked(dentry)) {
p = buf + buflen; p = buf + buflen;
if (prepend(&p, &buflen, "//deleted", 10) != 0) if (prepend(&p, &buflen, "//deleted", 10) != 0)
@ -2564,6 +2618,7 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
buflen++; buflen++;
} }
retval = __dentry_path(dentry, buf, buflen); retval = __dentry_path(dentry, buf, buflen);
write_sequnlock(&rename_lock);
spin_unlock(&dcache_lock); spin_unlock(&dcache_lock);
if (!IS_ERR(retval) && p) if (!IS_ERR(retval) && p)
*p = '/'; /* restore '/' overriden with '\0' */ *p = '/'; /* restore '/' overriden with '\0' */
@ -2604,6 +2659,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
error = -ENOENT; error = -ENOENT;
spin_lock(&dcache_lock); spin_lock(&dcache_lock);
write_seqlock(&rename_lock);
if (!d_unlinked(pwd.dentry)) { if (!d_unlinked(pwd.dentry)) {
unsigned long len; unsigned long len;
struct path tmp = root; struct path tmp = root;
@ -2612,6 +2668,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
prepend(&cwd, &buflen, "\0", 1); prepend(&cwd, &buflen, "\0", 1);
error = prepend_path(&pwd, &tmp, &cwd, &buflen); error = prepend_path(&pwd, &tmp, &cwd, &buflen);
write_sequnlock(&rename_lock);
spin_unlock(&dcache_lock); spin_unlock(&dcache_lock);
if (error) if (error)
@ -2631,8 +2688,10 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
if (copy_to_user(buf, cwd, len)) if (copy_to_user(buf, cwd, len))
error = -EFAULT; error = -EFAULT;
} }
} else } else {
write_sequnlock(&rename_lock);
spin_unlock(&dcache_lock); spin_unlock(&dcache_lock);
}
out: out:
path_put(&pwd); path_put(&pwd);
@ -2660,25 +2719,25 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
{ {
int result; int result;
unsigned long seq; unsigned seq;
if (new_dentry == old_dentry) if (new_dentry == old_dentry)
return 1; return 1;
/*
* Need rcu_readlock to protect against the d_parent trashing
* due to d_move
*/
rcu_read_lock();
do { do {
/* for restarting inner loop in case of seq retry */ /* for restarting inner loop in case of seq retry */
seq = read_seqbegin(&rename_lock); seq = read_seqbegin(&rename_lock);
/*
* Need rcu_readlock to protect against the d_parent trashing
* due to d_move
*/
rcu_read_lock();
if (d_ancestor(old_dentry, new_dentry)) if (d_ancestor(old_dentry, new_dentry))
result = 1; result = 1;
else else
result = 0; result = 0;
rcu_read_unlock();
} while (read_seqretry(&rename_lock, seq)); } while (read_seqretry(&rename_lock, seq));
rcu_read_unlock();
return result; return result;
} }
@ -2710,9 +2769,13 @@ EXPORT_SYMBOL(path_is_under);
void d_genocide(struct dentry *root) void d_genocide(struct dentry *root)
{ {
struct dentry *this_parent = root; struct dentry *this_parent;
struct list_head *next; struct list_head *next;
unsigned seq;
rename_retry:
this_parent = root;
seq = read_seqbegin(&rename_lock);
spin_lock(&dcache_lock); spin_lock(&dcache_lock);
spin_lock(&this_parent->d_lock); spin_lock(&this_parent->d_lock);
repeat: repeat:
@ -2722,6 +2785,7 @@ void d_genocide(struct dentry *root)
struct list_head *tmp = next; struct list_head *tmp = next;
struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
next = tmp->next; next = tmp->next;
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
if (d_unhashed(dentry) || !dentry->d_inode) { if (d_unhashed(dentry) || !dentry->d_inode) {
spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_lock);
@ -2734,19 +2798,43 @@ void d_genocide(struct dentry *root)
spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
goto repeat; goto repeat;
} }
dentry->d_count--; if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
dentry->d_flags |= DCACHE_GENOCIDE;
dentry->d_count--;
}
spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_lock);
} }
if (this_parent != root) { if (this_parent != root) {
next = this_parent->d_u.d_child.next; struct dentry *tmp;
this_parent->d_count--; struct dentry *child;
tmp = this_parent->d_parent;
if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
this_parent->d_flags |= DCACHE_GENOCIDE;
this_parent->d_count--;
}
rcu_read_lock();
spin_unlock(&this_parent->d_lock); spin_unlock(&this_parent->d_lock);
this_parent = this_parent->d_parent; child = this_parent;
this_parent = tmp;
spin_lock(&this_parent->d_lock); spin_lock(&this_parent->d_lock);
/* might go back up the wrong parent if we have had a rename
* or deletion */
if (this_parent != child->d_parent ||
read_seqretry(&rename_lock, seq)) {
spin_unlock(&this_parent->d_lock);
spin_unlock(&dcache_lock);
rcu_read_unlock();
goto rename_retry;
}
rcu_read_unlock();
next = child->d_u.d_child.next;
goto resume; goto resume;
} }
spin_unlock(&this_parent->d_lock); spin_unlock(&this_parent->d_lock);
spin_unlock(&dcache_lock); spin_unlock(&dcache_lock);
if (read_seqretry(&rename_lock, seq))
goto rename_retry;
} }
/** /**

View File

@ -49,11 +49,17 @@ char *nfs_path(const char *base,
const struct dentry *dentry, const struct dentry *dentry,
char *buffer, ssize_t buflen) char *buffer, ssize_t buflen)
{ {
char *end = buffer+buflen; char *end;
int namelen; int namelen;
unsigned seq;
rename_retry:
end = buffer+buflen;
*--end = '\0'; *--end = '\0';
buflen--; buflen--;
seq = read_seqbegin(&rename_lock);
rcu_read_lock();
spin_lock(&dcache_lock); spin_lock(&dcache_lock);
while (!IS_ROOT(dentry) && dentry != droot) { while (!IS_ROOT(dentry) && dentry != droot) {
namelen = dentry->d_name.len; namelen = dentry->d_name.len;
@ -66,6 +72,9 @@ char *nfs_path(const char *base,
dentry = dentry->d_parent; dentry = dentry->d_parent;
} }
spin_unlock(&dcache_lock); spin_unlock(&dcache_lock);
rcu_read_unlock();
if (read_seqretry(&rename_lock, seq))
goto rename_retry;
if (*end != '/') { if (*end != '/') {
if (--buflen < 0) if (--buflen < 0)
goto Elong; goto Elong;
@ -83,6 +92,9 @@ char *nfs_path(const char *base,
return end; return end;
Elong_unlock: Elong_unlock:
spin_unlock(&dcache_lock); spin_unlock(&dcache_lock);
rcu_read_unlock();
if (read_seqretry(&rename_lock, seq))
goto rename_retry;
Elong: Elong:
return ERR_PTR(-ENAMETOOLONG); return ERR_PTR(-ENAMETOOLONG);
} }

View File

@ -180,6 +180,7 @@ struct dentry_operations {
#define DCACHE_FSNOTIFY_PARENT_WATCHED 0x0080 /* Parent inode is watched by some fsnotify listener */ #define DCACHE_FSNOTIFY_PARENT_WATCHED 0x0080 /* Parent inode is watched by some fsnotify listener */
#define DCACHE_CANT_MOUNT 0x0100 #define DCACHE_CANT_MOUNT 0x0100
#define DCACHE_GENOCIDE 0x0200
extern spinlock_t dcache_inode_lock; extern spinlock_t dcache_inode_lock;
extern spinlock_t dcache_lock; extern spinlock_t dcache_lock;