/*
 * Web listing of linux_dsm_epyc7002/fs/minix/inode.c (690 lines, 18 KiB, C).
 */

/*
* linux/fs/minix/inode.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
* Copyright (C) 1996 Gertjan van Wingerde
* Minix V2 fs support.
*
* Modified for 680x0 by Andreas Schwab
* Updated to filesystem version 3 by Daniel Aragones
*/
#include <linux/module.h>
#include "minix.h"
#include <linux/buffer_head.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/highuid.h>
#include <linux/vfs.h>
#include <linux/writeback.h>
/*
 * Forward declarations of the super_operations callbacks that the
 * minix_sops table refers to before they are defined further down.
 */
static int minix_write_inode(struct inode *inode,
struct writeback_control *wbc);
static int minix_statfs(struct dentry *dentry, struct kstatfs *buf);
static int minix_remount (struct super_block * sb, int * flags, char * data);
static void minix_evict_inode(struct inode *inode)
{
mm + fs: store shadow entries in page cache Reclaim will be leaving shadow entries in the page cache radix tree upon evicting the real page. As those pages are found from the LRU, an iput() can lead to the inode being freed concurrently. At this point, reclaim must no longer install shadow pages because the inode freeing code needs to ensure the page tree is really empty. Add an address_space flag, AS_EXITING, that the inode freeing code sets under the tree lock before doing the final truncate. Reclaim will check for this flag before installing shadow pages. Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Reviewed-by: Rik van Riel <riel@redhat.com> Reviewed-by: Minchan Kim <minchan@kernel.org> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Bob Liu <bob.liu@oracle.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: Dave Chinner <david@fromorbit.com> Cc: Greg Thelen <gthelen@google.com> Cc: Hugh Dickins <hughd@google.com> Cc: Jan Kara <jack@suse.cz> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Luigi Semenzato <semenzato@google.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Metin Doslu <metin@citusdata.com> Cc: Michel Lespinasse <walken@google.com> Cc: Ozgun Erdogan <ozgun@citusdata.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Roman Gushchin <klamm@yandex-team.ru> Cc: Ryan Mallon <rmallon@gmail.com> Cc: Tejun Heo <tj@kernel.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-04 04:47:49 +07:00
truncate_inode_pages_final(&inode->i_data);
if (!inode->i_nlink) {
inode->i_size = 0;
minix_truncate(inode);
}
invalidate_inode_buffers(inode);
clear_inode(inode);
if (!inode->i_nlink)
minix_free_inode(inode);
}
/*
 * super_operations.put_super: release everything minix_fill_super() set up.
 *
 * On a read-write mount the superblock buffer is marked dirty; for V1/V2
 * the mount state remembered in the in-memory info is first copied back to
 * the on-disk superblock image.
 */
static void minix_put_super(struct super_block *sb)
{
	struct minix_sb_info *sbi = minix_sb(sb);
	int n;

	if (!(sb->s_flags & MS_RDONLY)) {
		/* s_state is now out from V3 sb */
		if (sbi->s_version != MINIX_V3)
			sbi->s_ms->s_state = sbi->s_mount_state;
		mark_buffer_dirty(sbi->s_sbh);
	}

	/* Drop the references held on the inode and zone bitmap buffers. */
	n = 0;
	while (n < sbi->s_imap_blocks) {
		brelse(sbi->s_imap[n]);
		n++;
	}
	n = 0;
	while (n < sbi->s_zmap_blocks) {
		brelse(sbi->s_zmap[n]);
		n++;
	}
	brelse(sbi->s_sbh);

	/*
	 * s_imap is the start of the single array that also holds s_zmap
	 * (see minix_fill_super()), so one kfree() releases both.
	 */
	kfree(sbi->s_imap);
	sb->s_fs_info = NULL;
	kfree(sbi);
}
/* Slab cache that backs every struct minix_inode_info of this filesystem. */
static struct kmem_cache *minix_inode_cachep;

/*
 * super_operations.alloc_inode: allocate a minix inode from the slab cache.
 *
 * Returns the VFS inode embedded in the new minix_inode_info, or NULL when
 * the allocation fails.
 */
static struct inode *minix_alloc_inode(struct super_block *sb)
{
	struct minix_inode_info *info;

	info = kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL);
	return info ? &info->vfs_inode : NULL;
}
2011-01-07 13:49:49 +07:00
static void minix_i_callback(struct rcu_head *head)
{
2011-01-07 13:49:49 +07:00
struct inode *inode = container_of(head, struct inode, i_rcu);
kmem_cache_free(minix_inode_cachep, minix_i(inode));
}
2011-01-07 13:49:49 +07:00
static void minix_destroy_inode(struct inode *inode)
{
call_rcu(&inode->i_rcu, minix_i_callback);
}
/*
 * Slab constructor for the inode cache: @foo is a struct minix_inode_info
 * whose embedded VFS inode gets its one-time initialisation here.
 */
static void init_once(void *foo)
{
	struct minix_inode_info *info = foo;

	inode_init_once(&info->vfs_inode);
}
/*
 * Create the slab cache for minix inodes.
 *
 * Returns 0 on success or -ENOMEM if the cache could not be created.
 */
static int __init init_inodecache(void)
{
	struct kmem_cache *cache;

	cache = kmem_cache_create("minix_inode_cache",
				  sizeof(struct minix_inode_info), 0,
				  SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
				  init_once);
	minix_inode_cachep = cache;
	return cache ? 0 : -ENOMEM;
}
/* Destroy the inode slab cache created by init_inodecache(). */
static void destroy_inodecache(void)
{
	/*
	 * Inodes are freed from an RCU callback; wait for all pending
	 * callbacks to finish before the cache they free into goes away.
	 */
	rcu_barrier();
	kmem_cache_destroy(minix_inode_cachep);
}
/* Superblock operations installed on every mounted minix filesystem. */
static const struct super_operations minix_sops = {
	.alloc_inode	= minix_alloc_inode,
	.destroy_inode	= minix_destroy_inode,
	.write_inode	= minix_write_inode,
	.evict_inode	= minix_evict_inode,
	.put_super	= minix_put_super,
	.statfs		= minix_statfs,
	.remount_fs	= minix_remount,
};
/*
 * super_operations.remount_fs: handle a change of the read-only flag.
 *
 * Only transitions between read-only and read-write are acted upon; the
 * on-disk state word is maintained for V1/V2 only, because V3 superblocks
 * carry no s_state.  Always returns 0.
 */
static int minix_remount(struct super_block *sb, int *flags, char *data)
{
	struct minix_sb_info *sbi = minix_sb(sb);
	struct minix_super_block *ms = sbi->s_ms;

	/* Read-only state does not change: nothing to do. */
	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
		return 0;

	if (!(*flags & MS_RDONLY)) {
		/* Mount a partition which is read-only, read-write. */
		if (sbi->s_version == MINIX_V3) {
			sbi->s_mount_state = MINIX_VALID_FS;
		} else {
			sbi->s_mount_state = ms->s_state;
			ms->s_state &= ~MINIX_VALID_FS;
		}
		mark_buffer_dirty(sbi->s_sbh);

		if (!(sbi->s_mount_state & MINIX_VALID_FS))
			printk("MINIX-fs warning: remounting unchecked fs, "
			       "running fsck is recommended\n");
		else if ((sbi->s_mount_state & MINIX_ERROR_FS))
			printk("MINIX-fs warning: remounting fs with errors, "
			       "running fsck is recommended\n");
		return 0;
	}

	/* Mounting a rw partition read-only. */
	if ((ms->s_state & MINIX_VALID_FS) ||
	    !(sbi->s_mount_state & MINIX_VALID_FS))
		return 0;

	/* Restore the state that was recorded when the fs went read-write. */
	if (sbi->s_version != MINIX_V3)
		ms->s_state = sbi->s_mount_state;
	mark_buffer_dirty(sbi->s_sbh);
	return 0;
}
static int minix_fill_super(struct super_block *s, void *data, int silent)
{
struct buffer_head *bh;
struct buffer_head **map;
struct minix_super_block *ms;
struct minix3_super_block *m3s = NULL;
unsigned long i, block;
struct inode *root_inode;
struct minix_sb_info *sbi;
int ret = -EINVAL;
sbi = kzalloc(sizeof(struct minix_sb_info), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
s->s_fs_info = sbi;
BUILD_BUG_ON(32 != sizeof (struct minix_inode));
BUILD_BUG_ON(64 != sizeof(struct minix2_inode));
if (!sb_set_blocksize(s, BLOCK_SIZE))
goto out_bad_hblock;
if (!(bh = sb_bread(s, 1)))
goto out_bad_sb;
ms = (struct minix_super_block *) bh->b_data;
sbi->s_ms = ms;
sbi->s_sbh = bh;
sbi->s_mount_state = ms->s_state;
sbi->s_ninodes = ms->s_ninodes;
sbi->s_nzones = ms->s_nzones;
sbi->s_imap_blocks = ms->s_imap_blocks;
sbi->s_zmap_blocks = ms->s_zmap_blocks;
sbi->s_firstdatazone = ms->s_firstdatazone;
sbi->s_log_zone_size = ms->s_log_zone_size;
sbi->s_max_size = ms->s_max_size;
s->s_magic = ms->s_magic;
if (s->s_magic == MINIX_SUPER_MAGIC) {
sbi->s_version = MINIX_V1;
sbi->s_dirsize = 16;
sbi->s_namelen = 14;
s->s_max_links = MINIX_LINK_MAX;
} else if (s->s_magic == MINIX_SUPER_MAGIC2) {
sbi->s_version = MINIX_V1;
sbi->s_dirsize = 32;
sbi->s_namelen = 30;
s->s_max_links = MINIX_LINK_MAX;
} else if (s->s_magic == MINIX2_SUPER_MAGIC) {
sbi->s_version = MINIX_V2;
sbi->s_nzones = ms->s_zones;
sbi->s_dirsize = 16;
sbi->s_namelen = 14;
s->s_max_links = MINIX2_LINK_MAX;
} else if (s->s_magic == MINIX2_SUPER_MAGIC2) {
sbi->s_version = MINIX_V2;
sbi->s_nzones = ms->s_zones;
sbi->s_dirsize = 32;
sbi->s_namelen = 30;
s->s_max_links = MINIX2_LINK_MAX;
} else if ( *(__u16 *)(bh->b_data + 24) == MINIX3_SUPER_MAGIC) {
m3s = (struct minix3_super_block *) bh->b_data;
s->s_magic = m3s->s_magic;
sbi->s_imap_blocks = m3s->s_imap_blocks;
sbi->s_zmap_blocks = m3s->s_zmap_blocks;
sbi->s_firstdatazone = m3s->s_firstdatazone;
sbi->s_log_zone_size = m3s->s_log_zone_size;
sbi->s_max_size = m3s->s_max_size;
sbi->s_ninodes = m3s->s_ninodes;
sbi->s_nzones = m3s->s_zones;
sbi->s_dirsize = 64;
sbi->s_namelen = 60;
sbi->s_version = MINIX_V3;
sbi->s_mount_state = MINIX_VALID_FS;
sb_set_blocksize(s, m3s->s_blocksize);
s->s_max_links = MINIX2_LINK_MAX;
} else
goto out_no_fs;
/*
* Allocate the buffer map to keep the superblock small.
*/
if (sbi->s_imap_blocks == 0 || sbi->s_zmap_blocks == 0)
goto out_illegal_sb;
i = (sbi->s_imap_blocks + sbi->s_zmap_blocks) * sizeof(bh);
map = kzalloc(i, GFP_KERNEL);
if (!map)
goto out_no_map;
sbi->s_imap = &map[0];
sbi->s_zmap = &map[sbi->s_imap_blocks];
block=2;
for (i=0 ; i < sbi->s_imap_blocks ; i++) {
if (!(sbi->s_imap[i]=sb_bread(s, block)))
goto out_no_bitmap;
block++;
}
for (i=0 ; i < sbi->s_zmap_blocks ; i++) {
if (!(sbi->s_zmap[i]=sb_bread(s, block)))
goto out_no_bitmap;
block++;
}
minix_set_bit(0,sbi->s_imap[0]->b_data);
minix_set_bit(0,sbi->s_zmap[0]->b_data);
/* Apparently minix can create filesystems that allocate more blocks for
* the bitmaps than needed. We simply ignore that, but verify it didn't
* create one with not enough blocks and bail out if so.
*/
block = minix_blocks_needed(sbi->s_ninodes, s->s_blocksize);
if (sbi->s_imap_blocks < block) {
printk("MINIX-fs: file system does not have enough "
"imap blocks allocated. Refusing to mount\n");
goto out_no_bitmap;
}
block = minix_blocks_needed(
(sbi->s_nzones - (sbi->s_firstdatazone + 1)),
s->s_blocksize);
if (sbi->s_zmap_blocks < block) {
printk("MINIX-fs: file system does not have enough "
"zmap blocks allocated. Refusing to mount.\n");
goto out_no_bitmap;
}
/* set up enough so that it can read an inode */
s->s_op = &minix_sops;
root_inode = minix_iget(s, MINIX_ROOT_INO);
if (IS_ERR(root_inode)) {
ret = PTR_ERR(root_inode);
goto out_no_root;
}
ret = -ENOMEM;
s->s_root = d_make_root(root_inode);
if (!s->s_root)
goto out_no_root;
if (!(s->s_flags & MS_RDONLY)) {
if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */
ms->s_state &= ~MINIX_VALID_FS;
mark_buffer_dirty(bh);
}
if (!(sbi->s_mount_state & MINIX_VALID_FS))
printk("MINIX-fs: mounting unchecked file system, "
"running fsck is recommended\n");
else if (sbi->s_mount_state & MINIX_ERROR_FS)
printk("MINIX-fs: mounting file system with errors, "
"running fsck is recommended\n");
return 0;
out_no_root:
if (!silent)
printk("MINIX-fs: get root inode failed\n");
goto out_freemap;
out_no_bitmap:
printk("MINIX-fs: bad superblock or unable to read bitmaps\n");
out_freemap:
for (i = 0; i < sbi->s_imap_blocks; i++)
brelse(sbi->s_imap[i]);
for (i = 0; i < sbi->s_zmap_blocks; i++)
brelse(sbi->s_zmap[i]);
kfree(sbi->s_imap);
goto out_release;
out_no_map:
ret = -ENOMEM;
if (!silent)
printk("MINIX-fs: can't allocate map\n");
goto out_release;
out_illegal_sb:
if (!silent)
printk("MINIX-fs: bad superblock\n");
goto out_release;
out_no_fs:
if (!silent)
printk("VFS: Can't find a Minix filesystem V1 | V2 | V3 "
"on device %s.\n", s->s_id);
out_release:
brelse(bh);
goto out;
out_bad_hblock:
printk("MINIX-fs: blocksize too small for device\n");
goto out;
out_bad_sb:
printk("MINIX-fs: unable to read superblock\n");
out:
s->s_fs_info = NULL;
kfree(sbi);
return ret;
}
static int minix_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
struct minix_sb_info *sbi = minix_sb(sb);
u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
buf->f_type = sb->s_magic;
buf->f_bsize = sb->s_blocksize;
buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size;
buf->f_bfree = minix_count_free_blocks(sb);
buf->f_bavail = buf->f_bfree;
buf->f_files = sbi->s_ninodes;
buf->f_ffree = minix_count_free_inodes(sb);
buf->f_namelen = sbi->s_namelen;
buf->f_fsid.val[0] = (u32)id;
buf->f_fsid.val[1] = (u32)(id >> 32);
return 0;
}
/*
 * Block-mapping callback: dispatch to the V1 implementation for V1 inodes
 * and to the V2 implementation for everything else.
 */
static int minix_get_block(struct inode *inode, sector_t block,
			   struct buffer_head *bh_result, int create)
{
	if (INODE_VERSION(inode) != MINIX_V1)
		return V2_minix_get_block(inode, block, bh_result, create);

	return V1_minix_get_block(inode, block, bh_result, create);
}
/*
 * address_space_operations.writepage: hand @page to the generic
 * block_write_full_page() helper with minix_get_block as the block mapper.
 */
static int minix_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, minix_get_block, wbc);
}
/*
 * address_space_operations.readpage: hand @page to the generic
 * block_read_full_page() helper; @file is not used.
 */
static int minix_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, minix_get_block);
}
/*
 * Prepare @len bytes at @pos of @page for writing by calling
 * __block_write_begin() with minix_get_block; returns that helper's result.
 */
int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len)
{
	return __block_write_begin(page, pos, len, minix_get_block);
}
/*
 * Undo the effects of a failed write that reached up to offset @to: when
 * that is beyond the current file size, drop the page cache past i_size
 * and truncate the inode.
 */
static void minix_write_failed(struct address_space *mapping, loff_t to)
{
	struct inode *inode = mapping->host;

	/* The failed write stayed within the file: nothing to trim. */
	if (to <= inode->i_size)
		return;

	truncate_pagecache(inode, inode->i_size);
	minix_truncate(inode);
}
/*
 * address_space_operations.write_begin: delegate to block_write_begin()
 * and clean up through minix_write_failed() when it reports an error.
 *
 * Returns the result of block_write_begin().
 */
static int minix_write_begin(struct file *file, struct address_space *mapping,
			     loff_t pos, unsigned len, unsigned flags,
			     struct page **pagep, void **fsdata)
{
	int err = block_write_begin(mapping, pos, len, flags, pagep,
				    minix_get_block);

	if (unlikely(err))
		minix_write_failed(mapping, pos + len);

	return err;
}
/*
 * address_space_operations.bmap: resolve @block through
 * generic_block_bmap() using minix_get_block.
 */
static sector_t minix_bmap(struct address_space *mapping, sector_t block)
{
	return generic_block_bmap(mapping, block, minix_get_block);
}
/*
 * Address-space operations used for regular files, directories and
 * symlinks (installed by minix_set_inode()).
 */
static const struct address_space_operations minix_aops = {
	.readpage	= minix_readpage,
	.writepage	= minix_writepage,
	.write_begin	= minix_write_begin,
	.write_end	= generic_write_end,
	.bmap		= minix_bmap
};
/*
 * Inode operations for symbolic links: the generic page-based link helpers
 * plus the minix getattr implementation.
 */
static const struct inode_operations minix_symlink_inode_operations = {
	.readlink	= generic_readlink,
	.follow_link	= page_follow_link_light,
	.put_link	= page_put_link,
	.getattr	= minix_getattr,
};
/*
 * Install the operation tables that match the type of @inode.
 *
 * Regular files, directories and symlinks get minix-specific operations
 * and share minix_aops; any other type is set up by init_special_inode()
 * using @rdev.
 */
void minix_set_inode(struct inode *inode, dev_t rdev)
{
	if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) &&
	    !S_ISLNK(inode->i_mode)) {
		init_special_inode(inode, inode->i_mode, rdev);
		return;
	}

	if (S_ISREG(inode->i_mode)) {
		inode->i_op = &minix_file_inode_operations;
		inode->i_fop = &minix_file_operations;
	} else if (S_ISDIR(inode->i_mode)) {
		inode->i_op = &minix_dir_inode_operations;
		inode->i_fop = &minix_dir_operations;
	} else {
		/* Symlinks get no file operations of their own here. */
		inode->i_op = &minix_symlink_inode_operations;
	}
	inode->i_mapping->a_ops = &minix_aops;
}
/*
* The minix V1 function to read an inode.
*/
static struct inode *V1_minix_iget(struct inode *inode)
{
struct buffer_head * bh;
struct minix_inode * raw_inode;
struct minix_inode_info *minix_inode = minix_i(inode);
int i;
raw_inode = minix_V1_raw_inode(inode->i_sb, inode->i_ino, &bh);
if (!raw_inode) {
iget_failed(inode);
return ERR_PTR(-EIO);
}
inode->i_mode = raw_inode->i_mode;
i_uid_write(inode, raw_inode->i_uid);
i_gid_write(inode, raw_inode->i_gid);
set_nlink(inode, raw_inode->i_nlinks);
inode->i_size = raw_inode->i_size;
inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = raw_inode->i_time;
inode->i_mtime.tv_nsec = 0;
inode->i_atime.tv_nsec = 0;
inode->i_ctime.tv_nsec = 0;
inode->i_blocks = 0;
for (i = 0; i < 9; i++)
minix_inode->u.i1_data[i] = raw_inode->i_zone[i];
minix_set_inode(inode, old_decode_dev(raw_inode->i_zone[0]));
brelse(bh);
unlock_new_inode(inode);
return inode;
}
/*
* The minix V2 function to read an inode.
*/
static struct inode *V2_minix_iget(struct inode *inode)
{
struct buffer_head * bh;
struct minix2_inode * raw_inode;
struct minix_inode_info *minix_inode = minix_i(inode);
int i;
raw_inode = minix_V2_raw_inode(inode->i_sb, inode->i_ino, &bh);
if (!raw_inode) {
iget_failed(inode);
return ERR_PTR(-EIO);
}
inode->i_mode = raw_inode->i_mode;
i_uid_write(inode, raw_inode->i_uid);
i_gid_write(inode, raw_inode->i_gid);
set_nlink(inode, raw_inode->i_nlinks);
inode->i_size = raw_inode->i_size;
inode->i_mtime.tv_sec = raw_inode->i_mtime;
inode->i_atime.tv_sec = raw_inode->i_atime;
inode->i_ctime.tv_sec = raw_inode->i_ctime;
inode->i_mtime.tv_nsec = 0;
inode->i_atime.tv_nsec = 0;
inode->i_ctime.tv_nsec = 0;
inode->i_blocks = 0;
for (i = 0; i < 10; i++)
minix_inode->u.i2_data[i] = raw_inode->i_zone[i];
minix_set_inode(inode, old_decode_dev(raw_inode->i_zone[0]));
brelse(bh);
unlock_new_inode(inode);
return inode;
}
/*
 * The global function to read an inode.
 *
 * Looks up inode @ino of @sb in the inode cache.  A cached inode is
 * returned as is; a freshly allocated one is filled in by the V1 or V2
 * reader.  Returns the inode or an ERR_PTR() value (-ENOMEM when no inode
 * could be obtained, otherwise whatever the reader returns).
 */
struct inode *minix_iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode = iget_locked(sb, ino);

	if (!inode)
		return ERR_PTR(-ENOMEM);

	/* Already set up by an earlier lookup. */
	if (!(inode->i_state & I_NEW))
		return inode;

	return INODE_VERSION(inode) == MINIX_V1 ? V1_minix_iget(inode)
						: V2_minix_iget(inode);
}
/*
 * The minix V1 function to synchronize an inode.
 *
 * Copies the in-memory state of @inode into its on-disk V1 slot and marks
 * the containing buffer dirty.  Returns that buffer (the caller releases
 * it) or NULL if the raw inode could not be obtained.
 */
static struct buffer_head *V1_minix_update_inode(struct inode *inode)
{
	struct minix_inode_info *info = minix_i(inode);
	struct minix_inode *raw;
	struct buffer_head *bh;
	int n;

	raw = minix_V1_raw_inode(inode->i_sb, inode->i_ino, &bh);
	if (!raw)
		return NULL;

	raw->i_mode = inode->i_mode;
	raw->i_uid = fs_high2lowuid(i_uid_read(inode));
	raw->i_gid = fs_high2lowgid(i_gid_read(inode));
	raw->i_nlinks = inode->i_nlink;
	raw->i_size = inode->i_size;
	/* V1 has a single timestamp field; it is written from mtime. */
	raw->i_time = inode->i_mtime.tv_sec;

	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
		/* Device nodes keep their device number in the first zone. */
		raw->i_zone[0] = old_encode_dev(inode->i_rdev);
	} else {
		for (n = 0; n < 9; n++)
			raw->i_zone[n] = info->u.i1_data[n];
	}

	mark_buffer_dirty(bh);
	return bh;
}
/*
 * The minix V2 function to synchronize an inode.
 *
 * Copies the in-memory state of @inode into its on-disk V2 slot and marks
 * the containing buffer dirty.  Returns that buffer (the caller releases
 * it) or NULL if the raw inode could not be obtained.
 */
static struct buffer_head *V2_minix_update_inode(struct inode *inode)
{
	struct minix_inode_info *info = minix_i(inode);
	struct minix2_inode *raw;
	struct buffer_head *bh;
	int n;

	raw = minix_V2_raw_inode(inode->i_sb, inode->i_ino, &bh);
	if (!raw)
		return NULL;

	raw->i_mode = inode->i_mode;
	raw->i_uid = fs_high2lowuid(i_uid_read(inode));
	raw->i_gid = fs_high2lowgid(i_gid_read(inode));
	raw->i_nlinks = inode->i_nlink;
	raw->i_size = inode->i_size;
	raw->i_mtime = inode->i_mtime.tv_sec;
	raw->i_atime = inode->i_atime.tv_sec;
	raw->i_ctime = inode->i_ctime.tv_sec;

	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
		/* Device nodes keep their device number in the first zone. */
		raw->i_zone[0] = old_encode_dev(inode->i_rdev);
	} else {
		for (n = 0; n < 10; n++)
			raw->i_zone[n] = info->u.i2_data[n];
	}

	mark_buffer_dirty(bh);
	return bh;
}
/*
 * super_operations.write_inode: copy @inode to its on-disk slot and, for
 * WB_SYNC_ALL writeback, write the buffer out synchronously.
 *
 * Returns 0 on success or -EIO if the raw inode could not be obtained or
 * the synchronous write failed.
 */
static int minix_write_inode(struct inode *inode, struct writeback_control *wbc)
{
	struct buffer_head *bh;
	int err = 0;

	bh = (INODE_VERSION(inode) == MINIX_V1) ? V1_minix_update_inode(inode)
						: V2_minix_update_inode(inode);
	if (!bh)
		return -EIO;

	/* Only data-integrity writeback waits for the buffer to hit disk. */
	if (wbc->sync_mode != WB_SYNC_ALL || !buffer_dirty(bh))
		goto out;

	sync_dirty_buffer(bh);
	if (buffer_req(bh) && !buffer_uptodate(bh)) {
		printk("IO error syncing minix inode [%s:%08lx]\n",
		       inode->i_sb->s_id, inode->i_ino);
		err = -EIO;
	}
out:
	brelse(bh);
	return err;
}
/*
 * inode_operations.getattr: start from the generic attributes, then
 * replace the block count and block size with minix-specific values.
 * The block count is reported in 512-byte units.  Always returns 0.
 */
int minix_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
	struct super_block *sb = dentry->d_sb;
	struct inode *inode = dentry->d_inode;

	generic_fillattr(inode, stat);

	if (INODE_VERSION(inode) == MINIX_V1)
		stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb);
	else
		stat->blocks = (sb->s_blocksize / 512) * V2_minix_blocks(stat->size, sb);
	stat->blksize = sb->s_blocksize;

	return 0;
}
/*
 * The function that is called for file truncation.
 *
 * Only regular files, directories and symlinks are truncated; any other
 * inode type is left untouched.  The work is done by the V1 or V2 helper
 * depending on the inode version.
 */
void minix_truncate(struct inode *inode)
{
	if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) &&
	    !S_ISLNK(inode->i_mode))
		return;

	if (INODE_VERSION(inode) != MINIX_V1) {
		V2_minix_truncate(inode);
		return;
	}
	V1_minix_truncate(inode);
}
/*
 * file_system_type.mount: mount through mount_bdev(), with
 * minix_fill_super() as the superblock fill callback.
 */
static struct dentry *minix_mount(struct file_system_type *fs_type,
				  int flags, const char *dev_name, void *data)
{
	return mount_bdev(fs_type, flags, dev_name, data, minix_fill_super);
}
/* Filesystem type registered with the VFS under the name "minix". */
static struct file_system_type minix_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "minix",
	.mount		= minix_mount,
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,
};
fs: Limit sys_mount to only request filesystem modules. Modify the request_module to prefix the file system type with "fs-" and add aliases to all of the filesystems that can be built as modules to match. A common practice is to build all of the kernel code and leave code that is not commonly needed as modules, with the result that many users are exposed to any bug anywhere in the kernel. Looking for filesystems with a fs- prefix limits the pool of possible modules that can be loaded by mount to just filesystems trivially making things safer with no real cost. Using aliases means user space can control the policy of which filesystem modules are auto-loaded by editing /etc/modprobe.d/*.conf with blacklist and alias directives. Allowing simple, safe, well understood work-arounds to known problematic software. This also addresses a rare but unfortunate problem where the filesystem name is not the same as it's module name and module auto-loading would not work. While writing this patch I saw a handful of such cases. The most significant being autofs that lives in the module autofs4. This is relevant to user namespaces because we can reach the request module in get_fs_type() without having any special permissions, and people get uncomfortable when a user specified string (in this case the filesystem type) goes all of the way to request_module. After having looked at this issue I don't think there is any particular reason to perform any filtering or permission checks beyond making it clear in the module request that we want a filesystem module. The common pattern in the kernel is to call request_module() without regards to the users permissions. In general all a filesystem module does once loaded is call register_filesystem() and go to sleep. Which means there is not much attack surface exposed by loading a filesytem module unless the filesystem is mounted. 
In a user namespace filesystems are not mounted unless .fs_flags = FS_USERNS_MOUNT, which most filesystems do not set today. Acked-by: Serge Hallyn <serge.hallyn@canonical.com> Acked-by: Kees Cook <keescook@chromium.org> Reported-by: Kees Cook <keescook@google.com> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
2013-03-03 10:39:14 +07:00
MODULE_ALIAS_FS("minix");
/*
 * Module init: create the inode cache, then register the filesystem.
 * The cache is destroyed again if registration fails.
 *
 * Returns 0 on success or the error of the step that failed.
 */
static int __init init_minix_fs(void)
{
	int err;

	err = init_inodecache();
	if (err)
		return err;

	err = register_filesystem(&minix_fs_type);
	if (err)
		destroy_inodecache();

	return err;
}
/*
 * Module exit: undo init_minix_fs() in reverse order, unregistering the
 * filesystem before its inode cache is destroyed.
 */
static void __exit exit_minix_fs(void)
{
	unregister_filesystem(&minix_fs_type);
	destroy_inodecache();
}
/* Module entry and exit points, and the module license. */
module_init(init_minix_fs)
module_exit(exit_minix_fs)
MODULE_LICENSE("GPL");