2012-11-29 11:28:09 +07:00
|
|
|
/*
|
2012-11-02 15:10:40 +07:00
|
|
|
* fs/f2fs/inode.c
|
|
|
|
*
|
|
|
|
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
|
|
|
|
* http://www.samsung.com/
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License version 2 as
|
|
|
|
* published by the Free Software Foundation.
|
|
|
|
*/
|
|
|
|
#include <linux/fs.h>
|
|
|
|
#include <linux/f2fs_fs.h>
|
|
|
|
#include <linux/buffer_head.h>
|
2016-09-10 06:59:39 +07:00
|
|
|
#include <linux/backing-dev.h>
|
2012-11-02 15:10:40 +07:00
|
|
|
#include <linux/writeback.h>
|
|
|
|
|
|
|
|
#include "f2fs.h"
|
|
|
|
#include "node.h"
|
2017-06-14 22:00:56 +07:00
|
|
|
#include "segment.h"
|
2012-11-02 15:10:40 +07:00
|
|
|
|
2013-04-19 23:28:40 +07:00
|
|
|
#include <trace/events/f2fs.h>
|
|
|
|
|
2016-10-15 01:51:23 +07:00
|
|
|
void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync)
|
2016-07-01 09:09:37 +07:00
|
|
|
{
|
2018-01-11 09:26:19 +07:00
|
|
|
if (is_inode_flag_set(inode, FI_NEW_INODE))
|
|
|
|
return;
|
|
|
|
|
2016-10-15 01:51:23 +07:00
|
|
|
if (f2fs_inode_dirtied(inode, sync))
|
2016-07-01 09:09:37 +07:00
|
|
|
return;
|
2016-10-15 01:51:23 +07:00
|
|
|
|
2016-07-01 09:09:37 +07:00
|
|
|
mark_inode_dirty_sync(inode);
|
|
|
|
}
|
|
|
|
|
2012-11-02 15:10:40 +07:00
|
|
|
void f2fs_set_inode_flags(struct inode *inode)
|
|
|
|
{
|
|
|
|
unsigned int flags = F2FS_I(inode)->i_flags;
|
2014-04-15 13:19:38 +07:00
|
|
|
unsigned int new_fl = 0;
|
2012-11-02 15:10:40 +07:00
|
|
|
|
|
|
|
if (flags & FS_SYNC_FL)
|
2014-04-15 13:19:38 +07:00
|
|
|
new_fl |= S_SYNC;
|
2012-11-02 15:10:40 +07:00
|
|
|
if (flags & FS_APPEND_FL)
|
2014-04-15 13:19:38 +07:00
|
|
|
new_fl |= S_APPEND;
|
2012-11-02 15:10:40 +07:00
|
|
|
if (flags & FS_IMMUTABLE_FL)
|
2014-04-15 13:19:38 +07:00
|
|
|
new_fl |= S_IMMUTABLE;
|
2012-11-02 15:10:40 +07:00
|
|
|
if (flags & FS_NOATIME_FL)
|
2014-04-15 13:19:38 +07:00
|
|
|
new_fl |= S_NOATIME;
|
2012-11-02 15:10:40 +07:00
|
|
|
if (flags & FS_DIRSYNC_FL)
|
2014-04-15 13:19:38 +07:00
|
|
|
new_fl |= S_DIRSYNC;
|
2017-10-10 02:15:35 +07:00
|
|
|
if (f2fs_encrypted_inode(inode))
|
|
|
|
new_fl |= S_ENCRYPTED;
|
2015-08-24 09:41:32 +07:00
|
|
|
inode_set_flags(inode, new_fl,
|
2017-10-10 02:15:35 +07:00
|
|
|
S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|
|
|
|
|
S_ENCRYPTED);
|
2012-11-02 15:10:40 +07:00
|
|
|
}
|
|
|
|
|
2013-10-08 16:01:51 +07:00
|
|
|
static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
|
|
|
|
{
|
2017-07-18 23:19:06 +07:00
|
|
|
int extra_size = get_extra_isize(inode);
|
|
|
|
|
2013-10-08 16:01:51 +07:00
|
|
|
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
|
|
|
|
S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
|
2017-07-18 23:19:06 +07:00
|
|
|
if (ri->i_addr[extra_size])
|
|
|
|
inode->i_rdev = old_decode_dev(
|
|
|
|
le32_to_cpu(ri->i_addr[extra_size]));
|
2013-10-08 16:01:51 +07:00
|
|
|
else
|
2017-07-18 23:19:06 +07:00
|
|
|
inode->i_rdev = new_decode_dev(
|
|
|
|
le32_to_cpu(ri->i_addr[extra_size + 1]));
|
2013-10-08 16:01:51 +07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-03-18 07:16:35 +07:00
|
|
|
static bool __written_first_block(struct f2fs_inode *ri)
|
|
|
|
{
|
2017-07-18 23:19:06 +07:00
|
|
|
block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]);
|
2015-03-25 02:04:20 +07:00
|
|
|
|
|
|
|
if (addr != NEW_ADDR && addr != NULL_ADDR)
|
2015-03-18 07:16:35 +07:00
|
|
|
return true;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2013-10-08 16:01:51 +07:00
|
|
|
static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
|
|
|
|
{
|
2017-07-18 23:19:06 +07:00
|
|
|
int extra_size = get_extra_isize(inode);
|
|
|
|
|
2013-10-08 16:01:51 +07:00
|
|
|
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
|
|
|
|
if (old_valid_dev(inode->i_rdev)) {
|
2017-07-18 23:19:06 +07:00
|
|
|
ri->i_addr[extra_size] =
|
2014-01-18 03:44:39 +07:00
|
|
|
cpu_to_le32(old_encode_dev(inode->i_rdev));
|
2017-07-18 23:19:06 +07:00
|
|
|
ri->i_addr[extra_size + 1] = 0;
|
2013-10-08 16:01:51 +07:00
|
|
|
} else {
|
2017-07-18 23:19:06 +07:00
|
|
|
ri->i_addr[extra_size] = 0;
|
|
|
|
ri->i_addr[extra_size + 1] =
|
2014-01-18 03:44:39 +07:00
|
|
|
cpu_to_le32(new_encode_dev(inode->i_rdev));
|
2017-07-18 23:19:06 +07:00
|
|
|
ri->i_addr[extra_size + 2] = 0;
|
2013-10-08 16:01:51 +07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-01-06 13:28:43 +07:00
|
|
|
static void __recover_inline_status(struct inode *inode, struct page *ipage)
|
2014-10-24 09:48:09 +07:00
|
|
|
{
|
2017-07-18 23:19:05 +07:00
|
|
|
void *inline_data = inline_data_addr(inode, ipage);
|
2015-01-06 13:28:43 +07:00
|
|
|
__le32 *start = inline_data;
|
2017-07-18 23:19:05 +07:00
|
|
|
__le32 *end = start + MAX_INLINE_DATA(inode) / sizeof(__le32);
|
2014-10-24 09:48:09 +07:00
|
|
|
|
2015-01-06 13:28:43 +07:00
|
|
|
while (start < end) {
|
|
|
|
if (*start++) {
|
2016-01-20 22:43:51 +07:00
|
|
|
f2fs_wait_on_page_writeback(ipage, NODE, true);
|
2014-10-24 09:48:09 +07:00
|
|
|
|
2016-05-21 00:13:22 +07:00
|
|
|
set_inode_flag(inode, FI_DATA_EXIST);
|
|
|
|
set_raw_inline(inode, F2FS_INODE(ipage));
|
2015-01-06 13:28:43 +07:00
|
|
|
set_page_dirty(ipage);
|
|
|
|
return;
|
|
|
|
}
|
2014-10-24 09:48:09 +07:00
|
|
|
}
|
2015-01-06 13:28:43 +07:00
|
|
|
return;
|
2014-10-24 09:48:09 +07:00
|
|
|
}
|
|
|
|
|
2017-07-31 19:19:09 +07:00
|
|
|
static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page)
|
|
|
|
{
|
|
|
|
struct f2fs_inode *ri = &F2FS_NODE(page)->i;
|
|
|
|
int extra_isize = le32_to_cpu(ri->i_extra_isize);
|
|
|
|
|
|
|
|
if (!f2fs_sb_has_inode_chksum(sbi->sb))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (!RAW_IS_INODE(F2FS_NODE(page)) || !(ri->i_inline & F2FS_EXTRA_ATTR))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (!F2FS_FITS_IN_INODE(ri, extra_isize, i_inode_checksum))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct page *page)
|
|
|
|
{
|
|
|
|
struct f2fs_node *node = F2FS_NODE(page);
|
|
|
|
struct f2fs_inode *ri = &node->i;
|
|
|
|
__le32 ino = node->footer.ino;
|
|
|
|
__le32 gen = ri->i_generation;
|
|
|
|
__u32 chksum, chksum_seed;
|
|
|
|
__u32 dummy_cs = 0;
|
|
|
|
unsigned int offset = offsetof(struct f2fs_inode, i_inode_checksum);
|
|
|
|
unsigned int cs_size = sizeof(dummy_cs);
|
|
|
|
|
|
|
|
chksum = f2fs_chksum(sbi, sbi->s_chksum_seed, (__u8 *)&ino,
|
|
|
|
sizeof(ino));
|
|
|
|
chksum_seed = f2fs_chksum(sbi, chksum, (__u8 *)&gen, sizeof(gen));
|
|
|
|
|
|
|
|
chksum = f2fs_chksum(sbi, chksum_seed, (__u8 *)ri, offset);
|
|
|
|
chksum = f2fs_chksum(sbi, chksum, (__u8 *)&dummy_cs, cs_size);
|
|
|
|
offset += cs_size;
|
|
|
|
chksum = f2fs_chksum(sbi, chksum, (__u8 *)ri + offset,
|
|
|
|
F2FS_BLKSIZE - offset);
|
|
|
|
return chksum;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page)
|
|
|
|
{
|
|
|
|
struct f2fs_inode *ri;
|
|
|
|
__u32 provided, calculated;
|
|
|
|
|
2017-09-01 06:54:51 +07:00
|
|
|
if (!f2fs_enable_inode_chksum(sbi, page) ||
|
|
|
|
PageDirty(page) || PageWriteback(page))
|
2017-07-31 19:19:09 +07:00
|
|
|
return true;
|
|
|
|
|
|
|
|
ri = &F2FS_NODE(page)->i;
|
|
|
|
provided = le32_to_cpu(ri->i_inode_checksum);
|
|
|
|
calculated = f2fs_inode_chksum(sbi, page);
|
|
|
|
|
|
|
|
if (provided != calculated)
|
|
|
|
f2fs_msg(sbi->sb, KERN_WARNING,
|
|
|
|
"checksum invalid, ino = %x, %x vs. %x",
|
|
|
|
ino_of_node(page), provided, calculated);
|
|
|
|
|
|
|
|
return provided == calculated;
|
|
|
|
}
|
|
|
|
|
|
|
|
void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page)
|
|
|
|
{
|
|
|
|
struct f2fs_inode *ri = &F2FS_NODE(page)->i;
|
|
|
|
|
|
|
|
if (!f2fs_enable_inode_chksum(sbi, page))
|
|
|
|
return;
|
|
|
|
|
|
|
|
ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page));
|
|
|
|
}
|
|
|
|
|
2012-11-02 15:10:40 +07:00
|
|
|
static int do_read_inode(struct inode *inode)
|
|
|
|
{
|
2014-09-03 05:31:18 +07:00
|
|
|
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
|
2012-11-02 15:10:40 +07:00
|
|
|
struct f2fs_inode_info *fi = F2FS_I(inode);
|
|
|
|
struct page *node_page;
|
|
|
|
struct f2fs_inode *ri;
|
2017-07-25 23:01:41 +07:00
|
|
|
projid_t i_projid;
|
2012-11-02 15:10:40 +07:00
|
|
|
|
|
|
|
/* Check if ino is within scope */
|
2013-03-17 15:27:20 +07:00
|
|
|
if (check_nid_range(sbi, inode->i_ino)) {
|
|
|
|
f2fs_msg(inode->i_sb, KERN_ERR, "bad inode number: %lu",
|
|
|
|
(unsigned long) inode->i_ino);
|
2014-06-12 12:23:41 +07:00
|
|
|
WARN_ON(1);
|
2013-03-17 15:27:20 +07:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
2012-11-02 15:10:40 +07:00
|
|
|
|
|
|
|
node_page = get_node_page(sbi, inode->i_ino);
|
|
|
|
if (IS_ERR(node_page))
|
|
|
|
return PTR_ERR(node_page);
|
|
|
|
|
2013-12-26 14:30:41 +07:00
|
|
|
ri = F2FS_INODE(node_page);
|
2012-11-02 15:10:40 +07:00
|
|
|
|
|
|
|
inode->i_mode = le16_to_cpu(ri->i_mode);
|
|
|
|
i_uid_write(inode, le32_to_cpu(ri->i_uid));
|
|
|
|
i_gid_write(inode, le32_to_cpu(ri->i_gid));
|
|
|
|
set_nlink(inode, le32_to_cpu(ri->i_links));
|
|
|
|
inode->i_size = le64_to_cpu(ri->i_size);
|
f2fs: don't count inode block in in-memory inode.i_blocks
Previously, we count all inode consumed blocks including inode block,
xattr block, index block, data block into i_blocks, for other generic
filesystems, they won't count inode block into i_blocks, so for
userspace applications or quota system, they may detect incorrect block
count according to i_blocks value in inode.
This patch changes to count all blocks into inode.i_blocks excluding
inode block, for on-disk i_blocks, we keep counting inode block for
backward compatibility.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2017-07-06 00:11:31 +07:00
|
|
|
inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks) - 1);
|
2012-11-02 15:10:40 +07:00
|
|
|
|
|
|
|
inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime);
|
|
|
|
inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime);
|
|
|
|
inode->i_mtime.tv_sec = le64_to_cpu(ri->i_mtime);
|
|
|
|
inode->i_atime.tv_nsec = le32_to_cpu(ri->i_atime_nsec);
|
|
|
|
inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec);
|
|
|
|
inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
|
|
|
|
inode->i_generation = le32_to_cpu(ri->i_generation);
|
|
|
|
|
|
|
|
fi->i_current_depth = le32_to_cpu(ri->i_current_depth);
|
|
|
|
fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid);
|
|
|
|
fi->i_flags = le32_to_cpu(ri->i_flags);
|
|
|
|
fi->flags = 0;
|
|
|
|
fi->i_advise = ri->i_advise;
|
f2fs: fix tracking parent inode number
Previously, f2fs didn't track the parent inode number correctly which is stored
in each f2fs_inode. In the case of the following scenario, a bug can be occured.
Let's suppose there are one directory, "/b", and two files, "/a" and "/b/a".
- pino of "/a" is ROOT_INO.
- pino of "/b/a" is DIR_B_INO.
Then,
# sync
: The inode pages of "/a" and "/b/a" contain the parent inode numbers as
ROOT_INO and DIR_B_INO respectively.
# mv /a /b/a
: The parent inode number of "/a" should be changed to DIR_B_INO, but f2fs
didn't do that. Ref. f2fs_set_link().
In order to fix this clearly, I added i_pino in f2fs_inode_info, and whenever
it needs to be changed like in f2fs_add_link() and f2fs_set_link(), it is
updated temporarily in f2fs_inode_info.
And later, f2fs_write_inode() stores the latest information to the inode pages.
For power-off-recovery, f2fs_sync_file() triggers simply f2fs_write_inode().
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-12-10 15:52:48 +07:00
|
|
|
fi->i_pino = le32_to_cpu(ri->i_pino);
|
2014-02-27 16:20:00 +07:00
|
|
|
fi->i_dir_level = ri->i_dir_level;
|
2013-10-08 16:01:51 +07:00
|
|
|
|
2015-12-29 02:39:06 +07:00
|
|
|
if (f2fs_init_extent_tree(inode, &ri->i_ext))
|
|
|
|
set_page_dirty(node_page);
|
2015-02-05 16:46:29 +07:00
|
|
|
|
2016-05-21 00:13:22 +07:00
|
|
|
get_inline_info(inode, ri);
|
2013-10-08 16:01:51 +07:00
|
|
|
|
2017-07-18 23:19:06 +07:00
|
|
|
fi->i_extra_isize = f2fs_has_extra_attr(inode) ?
|
|
|
|
le16_to_cpu(ri->i_extra_isize) : 0;
|
|
|
|
|
f2fs: support flexible inline xattr size
Now, in product, more and more features based on file encryption were
introduced, their demand of xattr space is increasing, however, inline
xattr has fixed-size of 200 bytes, once inline xattr space is full, new
increased xattr data would occupy additional xattr block which may bring
us more space usage and performance regression during persisting.
In order to resolve above issue, it's better to expand inline xattr size
flexibly according to user's requirement.
So this patch introduces new filesystem feature 'flexible inline xattr',
and new mount option 'inline_xattr_size=%u', once mkfs enables the
feature, we can use the option to make f2fs supporting flexible inline
xattr size.
To support this feature, we add extra attribute i_inline_xattr_size in
inode layout, indicating that how many space inline xattr borrows from
block address mapping space in inode layout, by this, we can easily
locate and store flexible-sized inline xattr data in inode.
Inode disk layout:
+----------------------+
| .i_mode |
| ... |
| .i_ext |
+----------------------+
| .i_extra_isize |
| .i_inline_xattr_size |-----------+
| ... | |
+----------------------+ |
| .i_addr | |
| - block address or | |
| - inline data | |
+----------------------+<---+ v
| inline xattr | +---inline xattr range
+----------------------+<---+
| .i_nid |
+----------------------+
| node_footer |
| (nid, ino, offset) |
+----------------------+
Note that, we have to cnosider backward compatibility which reserved
inline_data space, 200 bytes, all the time, reported by Sheng Yong.
Previous inline data or directory always reserved 200 bytes in inode layout,
even if inline_xattr is disabled. In order to keep inline_dentry's structure
for backward compatibility, we get the space back only from inline_data.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Reported-by: Sheng Yong <shengyong1@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2017-09-06 20:59:50 +07:00
|
|
|
if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) {
|
|
|
|
f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode));
|
|
|
|
fi->i_inline_xattr_size = le16_to_cpu(ri->i_inline_xattr_size);
|
|
|
|
} else if (f2fs_has_inline_xattr(inode) ||
|
|
|
|
f2fs_has_inline_dentry(inode)) {
|
|
|
|
fi->i_inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
|
|
|
|
} else {
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Previous inline data or directory always reserved 200 bytes
|
|
|
|
* in inode layout, even if inline_xattr is disabled. In order
|
|
|
|
* to keep inline_dentry's structure for backward compatibility,
|
|
|
|
* we get the space back only from inline_data.
|
|
|
|
*/
|
|
|
|
fi->i_inline_xattr_size = 0;
|
|
|
|
}
|
|
|
|
|
2014-10-24 09:48:09 +07:00
|
|
|
/* check data exist */
|
|
|
|
if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode))
|
2015-01-06 13:28:43 +07:00
|
|
|
__recover_inline_status(inode, node_page);
|
2014-10-24 09:48:09 +07:00
|
|
|
|
2013-10-08 16:01:51 +07:00
|
|
|
/* get rdev by using inline_info */
|
|
|
|
__get_inode_rdev(inode, ri);
|
|
|
|
|
2015-03-18 07:16:35 +07:00
|
|
|
if (__written_first_block(ri))
|
2016-05-21 00:13:22 +07:00
|
|
|
set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
|
2015-03-18 07:16:35 +07:00
|
|
|
|
2016-05-21 10:42:37 +07:00
|
|
|
if (!need_inode_block_update(sbi, inode->i_ino))
|
|
|
|
fi->last_disk_size = inode->i_size;
|
|
|
|
|
2017-07-25 23:01:41 +07:00
|
|
|
if (fi->i_flags & FS_PROJINHERIT_FL)
|
|
|
|
set_inode_flag(inode, FI_PROJ_INHERIT);
|
|
|
|
|
|
|
|
if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi->sb) &&
|
|
|
|
F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid))
|
|
|
|
i_projid = (projid_t)le32_to_cpu(ri->i_projid);
|
|
|
|
else
|
|
|
|
i_projid = F2FS_DEF_PROJID;
|
|
|
|
fi->i_projid = make_kprojid(&init_user_ns, i_projid);
|
|
|
|
|
2018-01-25 13:54:42 +07:00
|
|
|
if (f2fs_has_extra_attr(inode) && f2fs_sb_has_inode_crtime(sbi->sb) &&
|
|
|
|
F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_crtime)) {
|
|
|
|
fi->i_crtime.tv_sec = le64_to_cpu(ri->i_crtime);
|
|
|
|
fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec);
|
|
|
|
}
|
|
|
|
|
2012-11-02 15:10:40 +07:00
|
|
|
f2fs_put_page(node_page, 1);
|
2014-12-06 01:51:50 +07:00
|
|
|
|
2015-07-15 16:28:53 +07:00
|
|
|
stat_inc_inline_xattr(inode);
|
2014-12-06 01:51:50 +07:00
|
|
|
stat_inc_inline_inode(inode);
|
|
|
|
stat_inc_inline_dir(inode);
|
|
|
|
|
2015-01-06 13:28:43 +07:00
|
|
|
return 0;
|
2012-11-02 15:10:40 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
|
|
|
|
{
|
|
|
|
struct f2fs_sb_info *sbi = F2FS_SB(sb);
|
|
|
|
struct inode *inode;
|
2013-04-19 23:28:40 +07:00
|
|
|
int ret = 0;
|
2012-11-02 15:10:40 +07:00
|
|
|
|
|
|
|
inode = iget_locked(sb, ino);
|
|
|
|
if (!inode)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
2013-04-19 23:28:40 +07:00
|
|
|
|
|
|
|
if (!(inode->i_state & I_NEW)) {
|
|
|
|
trace_f2fs_iget(inode);
|
2012-11-02 15:10:40 +07:00
|
|
|
return inode;
|
2013-04-19 23:28:40 +07:00
|
|
|
}
|
2012-11-02 15:10:40 +07:00
|
|
|
if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi))
|
|
|
|
goto make_now;
|
|
|
|
|
|
|
|
ret = do_read_inode(inode);
|
|
|
|
if (ret)
|
|
|
|
goto bad_inode;
|
|
|
|
make_now:
|
|
|
|
if (ino == F2FS_NODE_INO(sbi)) {
|
|
|
|
inode->i_mapping->a_ops = &f2fs_node_aops;
|
|
|
|
mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
|
|
|
|
} else if (ino == F2FS_META_INO(sbi)) {
|
|
|
|
inode->i_mapping->a_ops = &f2fs_meta_aops;
|
|
|
|
mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
|
|
|
|
} else if (S_ISREG(inode->i_mode)) {
|
|
|
|
inode->i_op = &f2fs_file_inode_operations;
|
|
|
|
inode->i_fop = &f2fs_file_operations;
|
|
|
|
inode->i_mapping->a_ops = &f2fs_dblock_aops;
|
|
|
|
} else if (S_ISDIR(inode->i_mode)) {
|
|
|
|
inode->i_op = &f2fs_dir_inode_operations;
|
|
|
|
inode->i_fop = &f2fs_dir_operations;
|
|
|
|
inode->i_mapping->a_ops = &f2fs_dblock_aops;
|
2014-10-18 07:57:29 +07:00
|
|
|
mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO);
|
2012-11-02 15:10:40 +07:00
|
|
|
} else if (S_ISLNK(inode->i_mode)) {
|
2015-04-30 05:10:53 +07:00
|
|
|
if (f2fs_encrypted_inode(inode))
|
|
|
|
inode->i_op = &f2fs_encrypted_symlink_inode_operations;
|
|
|
|
else
|
|
|
|
inode->i_op = &f2fs_symlink_inode_operations;
|
2015-11-17 13:07:57 +07:00
|
|
|
inode_nohighmem(inode);
|
2012-11-02 15:10:40 +07:00
|
|
|
inode->i_mapping->a_ops = &f2fs_dblock_aops;
|
|
|
|
} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
|
|
|
|
S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
|
|
|
|
inode->i_op = &f2fs_special_inode_operations;
|
|
|
|
init_special_inode(inode, inode->i_mode, inode->i_rdev);
|
|
|
|
} else {
|
|
|
|
ret = -EIO;
|
|
|
|
goto bad_inode;
|
|
|
|
}
|
2017-05-17 03:20:16 +07:00
|
|
|
f2fs_set_inode_flags(inode);
|
2012-11-02 15:10:40 +07:00
|
|
|
unlock_new_inode(inode);
|
2013-04-19 23:28:40 +07:00
|
|
|
trace_f2fs_iget(inode);
|
2012-11-02 15:10:40 +07:00
|
|
|
return inode;
|
|
|
|
|
|
|
|
bad_inode:
|
|
|
|
iget_failed(inode);
|
2013-04-19 23:28:40 +07:00
|
|
|
trace_f2fs_iget_exit(inode, ret);
|
2012-11-02 15:10:40 +07:00
|
|
|
return ERR_PTR(ret);
|
|
|
|
}
|
|
|
|
|
2016-09-10 06:59:39 +07:00
|
|
|
struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino)
|
|
|
|
{
|
|
|
|
struct inode *inode;
|
|
|
|
retry:
|
|
|
|
inode = f2fs_iget(sb, ino);
|
|
|
|
if (IS_ERR(inode)) {
|
|
|
|
if (PTR_ERR(inode) == -ENOMEM) {
|
|
|
|
congestion_wait(BLK_RW_ASYNC, HZ/50);
|
|
|
|
goto retry;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return inode;
|
|
|
|
}
|
|
|
|
|
2017-12-05 11:07:47 +07:00
|
|
|
void update_inode(struct inode *inode, struct page *node_page)
|
2012-11-02 15:10:40 +07:00
|
|
|
{
|
|
|
|
struct f2fs_inode *ri;
|
2016-10-11 21:57:05 +07:00
|
|
|
struct extent_tree *et = F2FS_I(inode)->extent_tree;
|
2012-11-02 15:10:40 +07:00
|
|
|
|
2016-01-20 22:43:51 +07:00
|
|
|
f2fs_wait_on_page_writeback(node_page, NODE, true);
|
2017-12-05 11:07:47 +07:00
|
|
|
set_page_dirty(node_page);
|
|
|
|
|
|
|
|
f2fs_inode_synced(inode);
|
2012-11-02 15:10:40 +07:00
|
|
|
|
2013-12-26 14:30:41 +07:00
|
|
|
ri = F2FS_INODE(node_page);
|
2012-11-02 15:10:40 +07:00
|
|
|
|
|
|
|
ri->i_mode = cpu_to_le16(inode->i_mode);
|
|
|
|
ri->i_advise = F2FS_I(inode)->i_advise;
|
|
|
|
ri->i_uid = cpu_to_le32(i_uid_read(inode));
|
|
|
|
ri->i_gid = cpu_to_le32(i_gid_read(inode));
|
|
|
|
ri->i_links = cpu_to_le32(inode->i_nlink);
|
|
|
|
ri->i_size = cpu_to_le64(i_size_read(inode));
|
f2fs: don't count inode block in in-memory inode.i_blocks
Previously, we count all inode consumed blocks including inode block,
xattr block, index block, data block into i_blocks, for other generic
filesystems, they won't count inode block into i_blocks, so for
userspace applications or quota system, they may detect incorrect block
count according to i_blocks value in inode.
This patch changes to count all blocks into inode.i_blocks excluding
inode block, for on-disk i_blocks, we keep counting inode block for
backward compatibility.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2017-07-06 00:11:31 +07:00
|
|
|
ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks) + 1);
|
2015-02-05 16:46:29 +07:00
|
|
|
|
2016-10-11 21:57:05 +07:00
|
|
|
if (et) {
|
|
|
|
read_lock(&et->lock);
|
|
|
|
set_raw_extent(&et->largest, &ri->i_ext);
|
|
|
|
read_unlock(&et->lock);
|
|
|
|
} else {
|
2015-06-20 07:53:26 +07:00
|
|
|
memset(&ri->i_ext, 0, sizeof(ri->i_ext));
|
2016-10-11 21:57:05 +07:00
|
|
|
}
|
2016-05-21 00:13:22 +07:00
|
|
|
set_raw_inline(inode, ri);
|
2012-11-02 15:10:40 +07:00
|
|
|
|
|
|
|
ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
|
|
|
|
ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
|
|
|
|
ri->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
|
|
|
|
ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
|
|
|
|
ri->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
|
|
|
|
ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
|
|
|
|
ri->i_current_depth = cpu_to_le32(F2FS_I(inode)->i_current_depth);
|
|
|
|
ri->i_xattr_nid = cpu_to_le32(F2FS_I(inode)->i_xattr_nid);
|
|
|
|
ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags);
|
f2fs: fix tracking parent inode number
Previously, f2fs didn't track the parent inode number correctly which is stored
in each f2fs_inode. In the case of the following scenario, a bug can be occured.
Let's suppose there are one directory, "/b", and two files, "/a" and "/b/a".
- pino of "/a" is ROOT_INO.
- pino of "/b/a" is DIR_B_INO.
Then,
# sync
: The inode pages of "/a" and "/b/a" contain the parent inode numbers as
ROOT_INO and DIR_B_INO respectively.
# mv /a /b/a
: The parent inode number of "/a" should be changed to DIR_B_INO, but f2fs
didn't do that. Ref. f2fs_set_link().
In order to fix this clearly, I added i_pino in f2fs_inode_info, and whenever
it needs to be changed like in f2fs_add_link() and f2fs_set_link(), it is
updated temporarily in f2fs_inode_info.
And later, f2fs_write_inode() stores the latest information to the inode pages.
For power-off-recovery, f2fs_sync_file() triggers simply f2fs_write_inode().
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-12-10 15:52:48 +07:00
|
|
|
ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino);
|
2012-11-02 15:10:40 +07:00
|
|
|
ri->i_generation = cpu_to_le32(inode->i_generation);
|
2014-02-27 16:20:00 +07:00
|
|
|
ri->i_dir_level = F2FS_I(inode)->i_dir_level;
|
f2fs: save device node number into f2fs_inode
This patch stores inode->i_rdev into on-disk inode structure.
Alun reported that:
aspire tmp # mount -t f2fs /dev/sdb mnt
aspire tmp # mknod mnt/sda1 b 8 1
aspire tmp # mknod mnt/null c 1 3
aspire tmp # mknod mnt/console c 5 1
aspire tmp # ls -l mnt
total 2
crw-r--r-- 1 root root 5, 1 Jan 22 18:44 console
crw-r--r-- 1 root root 1, 3 Jan 22 18:44 null
brw-r--r-- 1 root root 8, 1 Jan 22 18:44 sda1
aspire tmp # umount mnt
aspire tmp # mount -t f2fs /dev/sdb mnt
aspire tmp # ls -l mnt
total 2
crw-r--r-- 1 root root 0, 0 Jan 22 18:44 console
crw-r--r-- 1 root root 0, 0 Jan 22 18:44 null
brw-r--r-- 1 root root 0, 0 Jan 22 18:44 sda1
In this report, f2fs lost the major/minor numbers of device files after umount.
The reason was revealed that f2fs does not store the inode->i_rdev to the
on-disk inode data structure.
So, as the other file systems do, f2fs also stores i_rdev into the i_addr fields
in on-disk inode structure without any on-disk layout changes.
Note that, this bug is limited to device files made by mknod().
Reported-and-Tested-by: Alun Jones <alun.linux@ty-penguin.org.uk>
Signed-off-by: Changman Lee <cm224.lee@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-01-23 07:40:23 +07:00
|
|
|
|
2017-07-25 23:01:41 +07:00
|
|
|
if (f2fs_has_extra_attr(inode)) {
|
2017-07-18 23:19:06 +07:00
|
|
|
ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize);
|
|
|
|
|
f2fs: support flexible inline xattr size
Now, in product, more and more features based on file encryption were
introduced, their demand of xattr space is increasing, however, inline
xattr has fixed-size of 200 bytes, once inline xattr space is full, new
increased xattr data would occupy additional xattr block which may bring
us more space usage and performance regression during persisting.
In order to resolve above issue, it's better to expand inline xattr size
flexibly according to user's requirement.
So this patch introduces new filesystem feature 'flexible inline xattr',
and new mount option 'inline_xattr_size=%u', once mkfs enables the
feature, we can use the option to make f2fs supporting flexible inline
xattr size.
To support this feature, we add extra attribute i_inline_xattr_size in
inode layout, indicating that how many space inline xattr borrows from
block address mapping space in inode layout, by this, we can easily
locate and store flexible-sized inline xattr data in inode.
Inode disk layout:
+----------------------+
| .i_mode |
| ... |
| .i_ext |
+----------------------+
| .i_extra_isize |
| .i_inline_xattr_size |-----------+
| ... | |
+----------------------+ |
| .i_addr | |
| - block address or | |
| - inline data | |
+----------------------+<---+ v
| inline xattr | +---inline xattr range
+----------------------+<---+
| .i_nid |
+----------------------+
| node_footer |
| (nid, ino, offset) |
+----------------------+
Note that, we have to cnosider backward compatibility which reserved
inline_data space, 200 bytes, all the time, reported by Sheng Yong.
Previous inline data or directory always reserved 200 bytes in inode layout,
even if inline_xattr is disabled. In order to keep inline_dentry's structure
for backward compatibility, we get the space back only from inline_data.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Reported-by: Sheng Yong <shengyong1@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2017-09-06 20:59:50 +07:00
|
|
|
if (f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(inode)->sb))
|
|
|
|
ri->i_inline_xattr_size =
|
|
|
|
cpu_to_le16(F2FS_I(inode)->i_inline_xattr_size);
|
|
|
|
|
2017-07-25 23:01:41 +07:00
|
|
|
if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) &&
|
|
|
|
F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize,
|
|
|
|
i_projid)) {
|
|
|
|
projid_t i_projid;
|
|
|
|
|
|
|
|
i_projid = from_kprojid(&init_user_ns,
|
|
|
|
F2FS_I(inode)->i_projid);
|
|
|
|
ri->i_projid = cpu_to_le32(i_projid);
|
|
|
|
}
|
2018-01-25 13:54:42 +07:00
|
|
|
|
|
|
|
if (f2fs_sb_has_inode_crtime(F2FS_I_SB(inode)->sb) &&
|
|
|
|
F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize,
|
|
|
|
i_crtime)) {
|
|
|
|
ri->i_crtime =
|
|
|
|
cpu_to_le64(F2FS_I(inode)->i_crtime.tv_sec);
|
|
|
|
ri->i_crtime_nsec =
|
|
|
|
cpu_to_le32(F2FS_I(inode)->i_crtime.tv_nsec);
|
|
|
|
}
|
2017-07-25 23:01:41 +07:00
|
|
|
}
|
|
|
|
|
2013-10-08 16:01:51 +07:00
|
|
|
__set_inode_rdev(inode, ri);
|
f2fs: fix handling errors got by f2fs_write_inode
Ruslan reported that f2fs hangs with an infinite loop in f2fs_sync_file():
while (sync_node_pages(sbi, inode->i_ino, &wbc) == 0)
f2fs_write_inode(inode, NULL);
The reason was revealed that the cold flag is not set even thought this inode is
a normal file. Therefore, sync_node_pages() skips to write node blocks since it
only writes cold node blocks.
The cold flag is stored to the node_footer in node block, and whenever a new
node page is allocated, it is set according to its file type, file or directory.
But, after sudden-power-off, when recovering the inode page, f2fs doesn't recover
its cold flag.
So, let's assign the cold flag in more right places.
One more thing:
If f2fs_write_inode() returns an error due to whatever situations, there would
be no dirty node pages so that sync_node_pages() returns zero.
(i.e., zero means nothing was written.)
Reported-by: Ruslan N. Marchenko <me@ruff.mobi>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-12-19 13:28:39 +07:00
|
|
|
set_cold_node(inode, node_page);
|
2016-01-08 04:23:12 +07:00
|
|
|
|
2016-01-25 20:57:05 +07:00
|
|
|
/* deleted inode */
|
|
|
|
if (inode->i_nlink == 0)
|
|
|
|
clear_inline_node(node_page);
|
|
|
|
|
2012-11-02 15:10:40 +07:00
|
|
|
}
|
|
|
|
|
2017-12-05 11:07:47 +07:00
|
|
|
void update_inode_page(struct inode *inode)
|
2012-11-02 15:10:40 +07:00
|
|
|
{
|
2014-09-03 05:31:18 +07:00
|
|
|
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
|
2012-11-02 15:10:40 +07:00
|
|
|
struct page *node_page;
|
2014-01-24 07:42:16 +07:00
|
|
|
retry:
|
2012-11-02 15:10:40 +07:00
|
|
|
node_page = get_node_page(sbi, inode->i_ino);
|
2014-01-24 07:42:16 +07:00
|
|
|
if (IS_ERR(node_page)) {
|
|
|
|
int err = PTR_ERR(node_page);
|
|
|
|
if (err == -ENOMEM) {
|
|
|
|
cond_resched();
|
|
|
|
goto retry;
|
|
|
|
} else if (err != -ENOENT) {
|
2016-05-19 04:07:56 +07:00
|
|
|
f2fs_stop_checkpoint(sbi, false);
|
2014-01-24 07:42:16 +07:00
|
|
|
}
|
2017-12-05 11:07:47 +07:00
|
|
|
return;
|
2014-01-24 07:42:16 +07:00
|
|
|
}
|
2017-12-05 11:07:47 +07:00
|
|
|
update_inode(inode, node_page);
|
2012-11-02 15:10:40 +07:00
|
|
|
f2fs_put_page(node_page, 1);
|
|
|
|
}
|
|
|
|
|
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose performance in a multi-threaded environment,
since every type of operation must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of the operation type.
So, let users grab a mutex and perform their jobs in parallel as much as
possible.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an available lock from the array.
- returns the index of the acquired lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 14:21:29 +07:00
|
|
|
int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
|
|
|
|
{
|
2014-09-03 05:31:18 +07:00
|
|
|
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
|
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose the performance under the multi-threading environment,
since every types of operations must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possbile.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an avaiable lock from the array.
- returns the index of the gottern lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 14:21:29 +07:00
|
|
|
|
|
|
|
if (inode->i_ino == F2FS_NODE_INO(sbi) ||
|
|
|
|
inode->i_ino == F2FS_META_INO(sbi))
|
|
|
|
return 0;
|
|
|
|
|
2016-05-21 00:13:22 +07:00
|
|
|
if (!is_inode_flag_set(inode, FI_DIRTY_INODE))
|
2013-06-10 07:17:01 +07:00
|
|
|
return 0;
|
|
|
|
|
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose the performance under the multi-threading environment,
since every types of operations must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possbile.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an avaiable lock from the array.
- returns the index of the gottern lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 14:21:29 +07:00
|
|
|
/*
|
2015-09-13 01:25:30 +07:00
|
|
|
* We need to balance fs here to prevent from producing dirty node pages
|
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose the performance under the multi-threading environment,
since every types of operations must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possbile.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an avaiable lock from the array.
- returns the index of the gottern lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 14:21:29 +07:00
|
|
|
* during the urgent cleaning time when runing out of free sections.
|
|
|
|
*/
|
2017-04-21 03:51:57 +07:00
|
|
|
update_inode_page(inode);
|
|
|
|
if (wbc && wbc->nr_to_write)
|
2016-01-08 05:15:04 +07:00
|
|
|
f2fs_balance_fs(sbi, true);
|
2014-01-24 07:42:16 +07:00
|
|
|
return 0;
|
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose the performance under the multi-threading environment,
since every types of operations must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possbile.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an avaiable lock from the array.
- returns the index of the gottern lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 14:21:29 +07:00
|
|
|
}
|
|
|
|
|
2012-11-29 11:28:09 +07:00
|
|
|
/*
|
2012-11-02 15:10:40 +07:00
|
|
|
* Called at the last iput() if i_nlink is zero
|
|
|
|
*/
|
|
|
|
void f2fs_evict_inode(struct inode *inode)
|
|
|
|
{
|
2014-09-03 05:31:18 +07:00
|
|
|
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
|
2016-05-21 00:13:22 +07:00
|
|
|
nid_t xnid = F2FS_I(inode)->i_xattr_nid;
|
2015-08-24 16:40:45 +07:00
|
|
|
int err = 0;
|
2012-11-02 15:10:40 +07:00
|
|
|
|
2014-10-07 07:39:50 +07:00
|
|
|
/* some remained atomic pages should discarded */
|
2014-12-09 21:08:59 +07:00
|
|
|
if (f2fs_is_atomic_file(inode))
|
2016-02-06 13:38:29 +07:00
|
|
|
drop_inmem_pages(inode);
|
2014-10-07 07:39:50 +07:00
|
|
|
|
2013-04-19 23:28:40 +07:00
|
|
|
trace_f2fs_evict_inode(inode);
|
2014-04-04 04:47:49 +07:00
|
|
|
truncate_inode_pages_final(&inode->i_data);
|
2012-11-02 15:10:40 +07:00
|
|
|
|
|
|
|
if (inode->i_ino == F2FS_NODE_INO(sbi) ||
|
|
|
|
inode->i_ino == F2FS_META_INO(sbi))
|
f2fs: avoid use invalid mapping of node_inode when evict meta inode
Andrey Tsyvarev reported:
"Using memory error detector reveals the following use-after-free error
in 3.15.0:
AddressSanitizer: heap-use-after-free in f2fs_evict_inode
Read of size 8 by thread T22279:
[<ffffffffa02d8702>] f2fs_evict_inode+0x102/0x2e0 [f2fs]
[<ffffffff812359af>] evict+0x15f/0x290
[< inlined >] iput+0x196/0x280 iput_final
[<ffffffff812369a6>] iput+0x196/0x280
[<ffffffffa02dc416>] f2fs_put_super+0xd6/0x170 [f2fs]
[<ffffffff81210095>] generic_shutdown_super+0xc5/0x1b0
[<ffffffff812105fd>] kill_block_super+0x4d/0xb0
[<ffffffff81210a86>] deactivate_locked_super+0x66/0x80
[<ffffffff81211c98>] deactivate_super+0x68/0x80
[<ffffffff8123cc88>] mntput_no_expire+0x198/0x250
[< inlined >] SyS_umount+0xe9/0x1a0 SYSC_umount
[<ffffffff8123f1c9>] SyS_umount+0xe9/0x1a0
[<ffffffff81cc8df9>] system_call_fastpath+0x16/0x1b
Freed by thread T3:
[<ffffffffa02dc337>] f2fs_i_callback+0x27/0x30 [f2fs]
[< inlined >] rcu_process_callbacks+0x2d6/0x930 __rcu_reclaim
[< inlined >] rcu_process_callbacks+0x2d6/0x930 rcu_do_batch
[< inlined >] rcu_process_callbacks+0x2d6/0x930 invoke_rcu_callbacks
[< inlined >] rcu_process_callbacks+0x2d6/0x930 __rcu_process_callbacks
[<ffffffff810fd266>] rcu_process_callbacks+0x2d6/0x930
[<ffffffff8107cce2>] __do_softirq+0x142/0x380
[<ffffffff8107cf50>] run_ksoftirqd+0x30/0x50
[<ffffffff810b2a87>] smpboot_thread_fn+0x197/0x280
[<ffffffff810a8238>] kthread+0x148/0x160
[<ffffffff81cc8d4c>] ret_from_fork+0x7c/0xb0
Allocated by thread T22276:
[<ffffffffa02dc7dd>] f2fs_alloc_inode+0x2d/0x170 [f2fs]
[<ffffffff81235e2a>] iget_locked+0x10a/0x230
[<ffffffffa02d7495>] f2fs_iget+0x35/0xa80 [f2fs]
[<ffffffffa02e2393>] f2fs_fill_super+0xb53/0xff0 [f2fs]
[<ffffffff81211bce>] mount_bdev+0x1de/0x240
[<ffffffffa02dbce0>] f2fs_mount+0x10/0x20 [f2fs]
[<ffffffff81212a85>] mount_fs+0x55/0x220
[<ffffffff8123c026>] vfs_kern_mount+0x66/0x200
[< inlined >] do_mount+0x2b4/0x1120 do_new_mount
[<ffffffff812400d4>] do_mount+0x2b4/0x1120
[< inlined >] SyS_mount+0xb2/0x110 SYSC_mount
[<ffffffff812414a2>] SyS_mount+0xb2/0x110
[<ffffffff81cc8df9>] system_call_fastpath+0x16/0x1b
The buggy address ffff8800587866c8 is located 48 bytes inside
of 680-byte region [ffff880058786698, ffff880058786940)
Memory state around the buggy address:
ffff880058786100: ffffffff ffffffff ffffffff ffffffff
ffff880058786200: ffffffff ffffffff ffffffrr rrrrrrrr
ffff880058786300: rrrrrrrr rrffffff ffffffff ffffffff
ffff880058786400: ffffffff ffffffff ffffffff ffffffff
ffff880058786500: ffffffff ffffffff ffffffff fffffffr
>ffff880058786600: rrrrrrrr rrrrrrrr rrrfffff ffffffff
^
ffff880058786700: ffffffff ffffffff ffffffff ffffffff
ffff880058786800: ffffffff ffffffff ffffffff ffffffff
ffff880058786900: ffffffff rrrrrrrr rrrrrrrr rrrr....
ffff880058786a00: ........ ........ ........ ........
ffff880058786b00: ........ ........ ........ ........
Legend:
f - 8 freed bytes
r - 8 redzone bytes
. - 8 allocated bytes
x=1..7 - x allocated bytes + (8-x) redzone bytes
Investigation shows, that f2fs_evict_inode, when called for
'meta_inode', uses invalidate_mapping_pages() for 'node_inode'.
But 'node_inode' is deleted before 'meta_inode' in f2fs_put_super via
iput().
It seems that in common usage scenario this use-after-free is benign,
because 'node_inode' remains partially valid data even after
kmem_cache_free().
But things may change if, while 'meta_inode' is evicted in one f2fs
filesystem, another (mounted) f2fs filesystem requests inode from cache,
and formely
'node_inode' of the first filesystem is returned."
Nids for both meta_inode and node_inode are reservation, so it's not necessary
for us to invalidate pages which will never be allocated.
To fix this issue, let's skipping needlessly invalidating pages for
{meta,node}_inode in f2fs_evict_inode.
Reported-by: Andrey Tsyvarev <tsyvarev@ispras.ru>
Tested-by: Andrey Tsyvarev <tsyvarev@ispras.ru>
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2014-07-25 11:00:57 +07:00
|
|
|
goto out_clear;
|
2012-11-02 15:10:40 +07:00
|
|
|
|
2014-09-13 05:53:45 +07:00
|
|
|
f2fs_bug_on(sbi, get_dirty_pages(inode));
|
2015-12-16 12:09:20 +07:00
|
|
|
remove_dirty_inode(inode);
|
2012-11-02 15:10:40 +07:00
|
|
|
|
2015-06-20 07:53:26 +07:00
|
|
|
f2fs_destroy_extent_tree(inode);
|
|
|
|
|
2012-11-02 15:10:40 +07:00
|
|
|
if (inode->i_nlink || is_bad_inode(inode))
|
|
|
|
goto no_delete;
|
|
|
|
|
2017-07-08 23:13:07 +07:00
|
|
|
dquot_initialize(inode);
|
|
|
|
|
2016-11-02 19:43:21 +07:00
|
|
|
remove_ino_entry(sbi, inode->i_ino, APPEND_INO);
|
|
|
|
remove_ino_entry(sbi, inode->i_ino, UPDATE_INO);
|
2017-09-29 12:59:38 +07:00
|
|
|
remove_ino_entry(sbi, inode->i_ino, FLUSH_INO);
|
2016-11-02 19:43:21 +07:00
|
|
|
|
2013-01-29 16:30:07 +07:00
|
|
|
sb_start_intwrite(inode->i_sb);
|
2016-05-21 00:13:22 +07:00
|
|
|
set_inode_flag(inode, FI_NO_ALLOC);
|
2012-11-02 15:10:40 +07:00
|
|
|
i_size_write(inode, 0);
|
2016-05-03 23:22:18 +07:00
|
|
|
retry:
|
2012-11-02 15:10:40 +07:00
|
|
|
if (F2FS_HAS_BLOCKS(inode))
|
2016-06-03 03:49:38 +07:00
|
|
|
err = f2fs_truncate(inode);
|
2012-11-02 15:10:40 +07:00
|
|
|
|
2017-03-08 04:32:20 +07:00
|
|
|
#ifdef CONFIG_F2FS_FAULT_INJECTION
|
|
|
|
if (time_to_inject(sbi, FAULT_EVICT_INODE)) {
|
|
|
|
f2fs_show_injection_info(FAULT_EVICT_INODE);
|
|
|
|
err = -EIO;
|
|
|
|
}
|
|
|
|
#endif
|
2015-08-24 16:40:45 +07:00
|
|
|
if (!err) {
|
|
|
|
f2fs_lock_op(sbi);
|
|
|
|
err = remove_inode_page(inode);
|
|
|
|
f2fs_unlock_op(sbi);
|
2016-10-11 21:56:59 +07:00
|
|
|
if (err == -ENOENT)
|
|
|
|
err = 0;
|
2015-08-24 16:40:45 +07:00
|
|
|
}
|
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose the performance under the multi-threading environment,
since every types of operations must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possbile.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an avaiable lock from the array.
- returns the index of the gottern lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 14:21:29 +07:00
|
|
|
|
2016-05-03 23:22:18 +07:00
|
|
|
/* give more chances, if ENOMEM case */
|
|
|
|
if (err == -ENOMEM) {
|
|
|
|
err = 0;
|
|
|
|
goto retry;
|
|
|
|
}
|
|
|
|
|
2016-05-21 01:10:10 +07:00
|
|
|
if (err)
|
|
|
|
update_inode_page(inode);
|
2017-07-08 23:13:07 +07:00
|
|
|
dquot_free_inode(inode);
|
2013-01-29 16:30:07 +07:00
|
|
|
sb_end_intwrite(inode->i_sb);
|
2012-11-02 15:10:40 +07:00
|
|
|
no_delete:
|
2017-07-08 23:13:07 +07:00
|
|
|
dquot_drop(inode);
|
|
|
|
|
2015-07-15 16:28:53 +07:00
|
|
|
stat_dec_inline_xattr(inode);
|
2014-10-14 10:00:16 +07:00
|
|
|
stat_dec_inline_dir(inode);
|
2014-10-15 00:29:50 +07:00
|
|
|
stat_dec_inline_inode(inode);
|
2015-03-19 18:27:51 +07:00
|
|
|
|
2017-10-13 09:12:53 +07:00
|
|
|
if (likely(!is_set_ckpt_flags(sbi, CP_ERROR_FLAG)))
|
2017-09-12 13:04:05 +07:00
|
|
|
f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE));
|
2017-10-13 09:12:53 +07:00
|
|
|
else
|
|
|
|
f2fs_inode_synced(inode);
|
2017-09-12 13:04:05 +07:00
|
|
|
|
2017-03-05 04:56:10 +07:00
|
|
|
/* ino == 0, if f2fs_new_inode() was failed t*/
|
|
|
|
if (inode->i_ino)
|
|
|
|
invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino,
|
|
|
|
inode->i_ino);
|
2014-08-04 08:54:58 +07:00
|
|
|
if (xnid)
|
|
|
|
invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
|
2016-11-02 19:43:21 +07:00
|
|
|
if (inode->i_nlink) {
|
|
|
|
if (is_inode_flag_set(inode, FI_APPEND_WRITE))
|
|
|
|
add_ino_entry(sbi, inode->i_ino, APPEND_INO);
|
|
|
|
if (is_inode_flag_set(inode, FI_UPDATE_WRITE))
|
|
|
|
add_ino_entry(sbi, inode->i_ino, UPDATE_INO);
|
|
|
|
}
|
2016-05-21 00:13:22 +07:00
|
|
|
if (is_inode_flag_set(inode, FI_FREE_NID)) {
|
2016-05-03 02:34:48 +07:00
|
|
|
alloc_nid_failed(sbi, inode->i_ino);
|
2016-05-21 00:13:22 +07:00
|
|
|
clear_inode_flag(inode, FI_FREE_NID);
|
2017-06-02 05:39:27 +07:00
|
|
|
} else {
|
|
|
|
f2fs_bug_on(sbi, err &&
|
|
|
|
!exist_written_data(sbi, inode->i_ino, ORPHAN_INO));
|
2015-06-24 00:36:08 +07:00
|
|
|
}
|
f2fs: avoid use invalid mapping of node_inode when evict meta inode
Andrey Tsyvarev reported:
"Using memory error detector reveals the following use-after-free error
in 3.15.0:
AddressSanitizer: heap-use-after-free in f2fs_evict_inode
Read of size 8 by thread T22279:
[<ffffffffa02d8702>] f2fs_evict_inode+0x102/0x2e0 [f2fs]
[<ffffffff812359af>] evict+0x15f/0x290
[< inlined >] iput+0x196/0x280 iput_final
[<ffffffff812369a6>] iput+0x196/0x280
[<ffffffffa02dc416>] f2fs_put_super+0xd6/0x170 [f2fs]
[<ffffffff81210095>] generic_shutdown_super+0xc5/0x1b0
[<ffffffff812105fd>] kill_block_super+0x4d/0xb0
[<ffffffff81210a86>] deactivate_locked_super+0x66/0x80
[<ffffffff81211c98>] deactivate_super+0x68/0x80
[<ffffffff8123cc88>] mntput_no_expire+0x198/0x250
[< inlined >] SyS_umount+0xe9/0x1a0 SYSC_umount
[<ffffffff8123f1c9>] SyS_umount+0xe9/0x1a0
[<ffffffff81cc8df9>] system_call_fastpath+0x16/0x1b
Freed by thread T3:
[<ffffffffa02dc337>] f2fs_i_callback+0x27/0x30 [f2fs]
[< inlined >] rcu_process_callbacks+0x2d6/0x930 __rcu_reclaim
[< inlined >] rcu_process_callbacks+0x2d6/0x930 rcu_do_batch
[< inlined >] rcu_process_callbacks+0x2d6/0x930 invoke_rcu_callbacks
[< inlined >] rcu_process_callbacks+0x2d6/0x930 __rcu_process_callbacks
[<ffffffff810fd266>] rcu_process_callbacks+0x2d6/0x930
[<ffffffff8107cce2>] __do_softirq+0x142/0x380
[<ffffffff8107cf50>] run_ksoftirqd+0x30/0x50
[<ffffffff810b2a87>] smpboot_thread_fn+0x197/0x280
[<ffffffff810a8238>] kthread+0x148/0x160
[<ffffffff81cc8d4c>] ret_from_fork+0x7c/0xb0
Allocated by thread T22276:
[<ffffffffa02dc7dd>] f2fs_alloc_inode+0x2d/0x170 [f2fs]
[<ffffffff81235e2a>] iget_locked+0x10a/0x230
[<ffffffffa02d7495>] f2fs_iget+0x35/0xa80 [f2fs]
[<ffffffffa02e2393>] f2fs_fill_super+0xb53/0xff0 [f2fs]
[<ffffffff81211bce>] mount_bdev+0x1de/0x240
[<ffffffffa02dbce0>] f2fs_mount+0x10/0x20 [f2fs]
[<ffffffff81212a85>] mount_fs+0x55/0x220
[<ffffffff8123c026>] vfs_kern_mount+0x66/0x200
[< inlined >] do_mount+0x2b4/0x1120 do_new_mount
[<ffffffff812400d4>] do_mount+0x2b4/0x1120
[< inlined >] SyS_mount+0xb2/0x110 SYSC_mount
[<ffffffff812414a2>] SyS_mount+0xb2/0x110
[<ffffffff81cc8df9>] system_call_fastpath+0x16/0x1b
The buggy address ffff8800587866c8 is located 48 bytes inside
of 680-byte region [ffff880058786698, ffff880058786940)
Memory state around the buggy address:
ffff880058786100: ffffffff ffffffff ffffffff ffffffff
ffff880058786200: ffffffff ffffffff ffffffrr rrrrrrrr
ffff880058786300: rrrrrrrr rrffffff ffffffff ffffffff
ffff880058786400: ffffffff ffffffff ffffffff ffffffff
ffff880058786500: ffffffff ffffffff ffffffff fffffffr
>ffff880058786600: rrrrrrrr rrrrrrrr rrrfffff ffffffff
^
ffff880058786700: ffffffff ffffffff ffffffff ffffffff
ffff880058786800: ffffffff ffffffff ffffffff ffffffff
ffff880058786900: ffffffff rrrrrrrr rrrrrrrr rrrr....
ffff880058786a00: ........ ........ ........ ........
ffff880058786b00: ........ ........ ........ ........
Legend:
f - 8 freed bytes
r - 8 redzone bytes
. - 8 allocated bytes
x=1..7 - x allocated bytes + (8-x) redzone bytes
Investigation shows, that f2fs_evict_inode, when called for
'meta_inode', uses invalidate_mapping_pages() for 'node_inode'.
But 'node_inode' is deleted before 'meta_inode' in f2fs_put_super via
iput().
It seems that in common usage scenario this use-after-free is benign,
because 'node_inode' remains partially valid data even after
kmem_cache_free().
But things may change if, while 'meta_inode' is evicted in one f2fs
filesystem, another (mounted) f2fs filesystem requests inode from cache,
and formely
'node_inode' of the first filesystem is returned."
Nids for both meta_inode and node_inode are reservation, so it's not necessary
for us to invalidate pages which will never be allocated.
To fix this issue, let's skipping needlessly invalidating pages for
{meta,node}_inode in f2fs_evict_inode.
Reported-by: Andrey Tsyvarev <tsyvarev@ispras.ru>
Tested-by: Andrey Tsyvarev <tsyvarev@ispras.ru>
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2014-07-25 11:00:57 +07:00
|
|
|
out_clear:
|
2018-01-12 11:30:13 +07:00
|
|
|
fscrypt_put_encryption_info(inode);
|
f2fs: avoid use invalid mapping of node_inode when evict meta inode
Andrey Tsyvarev reported:
"Using memory error detector reveals the following use-after-free error
in 3.15.0:
AddressSanitizer: heap-use-after-free in f2fs_evict_inode
Read of size 8 by thread T22279:
[<ffffffffa02d8702>] f2fs_evict_inode+0x102/0x2e0 [f2fs]
[<ffffffff812359af>] evict+0x15f/0x290
[< inlined >] iput+0x196/0x280 iput_final
[<ffffffff812369a6>] iput+0x196/0x280
[<ffffffffa02dc416>] f2fs_put_super+0xd6/0x170 [f2fs]
[<ffffffff81210095>] generic_shutdown_super+0xc5/0x1b0
[<ffffffff812105fd>] kill_block_super+0x4d/0xb0
[<ffffffff81210a86>] deactivate_locked_super+0x66/0x80
[<ffffffff81211c98>] deactivate_super+0x68/0x80
[<ffffffff8123cc88>] mntput_no_expire+0x198/0x250
[< inlined >] SyS_umount+0xe9/0x1a0 SYSC_umount
[<ffffffff8123f1c9>] SyS_umount+0xe9/0x1a0
[<ffffffff81cc8df9>] system_call_fastpath+0x16/0x1b
Freed by thread T3:
[<ffffffffa02dc337>] f2fs_i_callback+0x27/0x30 [f2fs]
[< inlined >] rcu_process_callbacks+0x2d6/0x930 __rcu_reclaim
[< inlined >] rcu_process_callbacks+0x2d6/0x930 rcu_do_batch
[< inlined >] rcu_process_callbacks+0x2d6/0x930 invoke_rcu_callbacks
[< inlined >] rcu_process_callbacks+0x2d6/0x930 __rcu_process_callbacks
[<ffffffff810fd266>] rcu_process_callbacks+0x2d6/0x930
[<ffffffff8107cce2>] __do_softirq+0x142/0x380
[<ffffffff8107cf50>] run_ksoftirqd+0x30/0x50
[<ffffffff810b2a87>] smpboot_thread_fn+0x197/0x280
[<ffffffff810a8238>] kthread+0x148/0x160
[<ffffffff81cc8d4c>] ret_from_fork+0x7c/0xb0
Allocated by thread T22276:
[<ffffffffa02dc7dd>] f2fs_alloc_inode+0x2d/0x170 [f2fs]
[<ffffffff81235e2a>] iget_locked+0x10a/0x230
[<ffffffffa02d7495>] f2fs_iget+0x35/0xa80 [f2fs]
[<ffffffffa02e2393>] f2fs_fill_super+0xb53/0xff0 [f2fs]
[<ffffffff81211bce>] mount_bdev+0x1de/0x240
[<ffffffffa02dbce0>] f2fs_mount+0x10/0x20 [f2fs]
[<ffffffff81212a85>] mount_fs+0x55/0x220
[<ffffffff8123c026>] vfs_kern_mount+0x66/0x200
[< inlined >] do_mount+0x2b4/0x1120 do_new_mount
[<ffffffff812400d4>] do_mount+0x2b4/0x1120
[< inlined >] SyS_mount+0xb2/0x110 SYSC_mount
[<ffffffff812414a2>] SyS_mount+0xb2/0x110
[<ffffffff81cc8df9>] system_call_fastpath+0x16/0x1b
The buggy address ffff8800587866c8 is located 48 bytes inside
of 680-byte region [ffff880058786698, ffff880058786940)
Memory state around the buggy address:
ffff880058786100: ffffffff ffffffff ffffffff ffffffff
ffff880058786200: ffffffff ffffffff ffffffrr rrrrrrrr
ffff880058786300: rrrrrrrr rrffffff ffffffff ffffffff
ffff880058786400: ffffffff ffffffff ffffffff ffffffff
ffff880058786500: ffffffff ffffffff ffffffff fffffffr
>ffff880058786600: rrrrrrrr rrrrrrrr rrrfffff ffffffff
^
ffff880058786700: ffffffff ffffffff ffffffff ffffffff
ffff880058786800: ffffffff ffffffff ffffffff ffffffff
ffff880058786900: ffffffff rrrrrrrr rrrrrrrr rrrr....
ffff880058786a00: ........ ........ ........ ........
ffff880058786b00: ........ ........ ........ ........
Legend:
f - 8 freed bytes
r - 8 redzone bytes
. - 8 allocated bytes
x=1..7 - x allocated bytes + (8-x) redzone bytes
Investigation shows, that f2fs_evict_inode, when called for
'meta_inode', uses invalidate_mapping_pages() for 'node_inode'.
But 'node_inode' is deleted before 'meta_inode' in f2fs_put_super via
iput().
It seems that in common usage scenario this use-after-free is benign,
because 'node_inode' remains partially valid data even after
kmem_cache_free().
But things may change if, while 'meta_inode' is evicted in one f2fs
filesystem, another (mounted) f2fs filesystem requests inode from cache,
and formely
'node_inode' of the first filesystem is returned."
Nids for both meta_inode and node_inode are reservation, so it's not necessary
for us to invalidate pages which will never be allocated.
To fix this issue, let's skipping needlessly invalidating pages for
{meta,node}_inode in f2fs_evict_inode.
Reported-by: Andrey Tsyvarev <tsyvarev@ispras.ru>
Tested-by: Andrey Tsyvarev <tsyvarev@ispras.ru>
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2014-07-25 11:00:57 +07:00
|
|
|
clear_inode(inode);
|
2012-11-02 15:10:40 +07:00
|
|
|
}
|
2014-09-26 01:55:53 +07:00
|
|
|
|
|
|
|
/* caller should call f2fs_lock_op() */
|
|
|
|
void handle_failed_inode(struct inode *inode)
|
|
|
|
{
|
|
|
|
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
|
2016-05-03 02:34:48 +07:00
|
|
|
struct node_info ni;
|
2014-09-26 01:55:53 +07:00
|
|
|
|
2016-10-11 21:56:59 +07:00
|
|
|
/*
|
|
|
|
* clear nlink of inode in order to release resource of inode
|
|
|
|
* immediately.
|
|
|
|
*/
|
|
|
|
clear_nlink(inode);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* we must call this to avoid inode being remained as dirty, resulting
|
|
|
|
* in a panic when flushing dirty inodes in gdirty_list.
|
|
|
|
*/
|
|
|
|
update_inode_page(inode);
|
2017-04-12 09:01:26 +07:00
|
|
|
f2fs_inode_synced(inode);
|
2016-10-11 21:56:59 +07:00
|
|
|
|
2016-05-03 02:34:48 +07:00
|
|
|
/* don't make bad inode, since it becomes a regular file. */
|
2014-09-26 01:55:53 +07:00
|
|
|
unlock_new_inode(inode);
|
|
|
|
|
2015-08-24 16:40:45 +07:00
|
|
|
/*
|
|
|
|
* Note: we should add inode to orphan list before f2fs_unlock_op()
|
|
|
|
* so we can prevent losing this orphan when encoutering checkpoint
|
|
|
|
* and following suddenly power-off.
|
|
|
|
*/
|
2016-05-03 02:34:48 +07:00
|
|
|
get_node_info(sbi, inode->i_ino, &ni);
|
|
|
|
|
|
|
|
if (ni.blk_addr != NULL_ADDR) {
|
|
|
|
int err = acquire_orphan_inode(sbi);
|
|
|
|
if (err) {
|
|
|
|
set_sbi_flag(sbi, SBI_NEED_FSCK);
|
|
|
|
f2fs_msg(sbi->sb, KERN_WARNING,
|
|
|
|
"Too many orphan inodes, run fsck to fix.");
|
|
|
|
} else {
|
2016-06-14 08:27:02 +07:00
|
|
|
add_orphan_inode(inode);
|
2016-05-03 02:34:48 +07:00
|
|
|
}
|
|
|
|
alloc_nid_done(sbi, inode->i_ino);
|
|
|
|
} else {
|
2016-05-21 00:13:22 +07:00
|
|
|
set_inode_flag(inode, FI_FREE_NID);
|
2015-08-24 16:40:45 +07:00
|
|
|
}
|
2014-09-26 01:55:53 +07:00
|
|
|
|
|
|
|
f2fs_unlock_op(sbi);
|
|
|
|
|
|
|
|
/* iput will drop the inode object */
|
|
|
|
iput(inode);
|
|
|
|
}
|