linux_dsm_epyc7002/fs/f2fs/gc.c

898 lines
22 KiB
C
Raw Normal View History

/*
* fs/f2fs/gc.c
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/f2fs_fs.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include "f2fs.h"
#include "node.h"
#include "segment.h"
#include "gc.h"
#include <trace/events/f2fs.h>
static int gc_thread_func(void *data)
{
struct f2fs_sb_info *sbi = data;
struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
long wait_ms;
wait_ms = gc_th->min_sleep_time;
do {
if (try_to_freeze())
continue;
else
wait_event_interruptible_timeout(*wq,
kthread_should_stop(),
msecs_to_jiffies(wait_ms));
if (kthread_should_stop())
break;
if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
increase_sleep_time(gc_th, &wait_ms);
continue;
}
/*
* [GC triggering condition]
* 0. GC is not conducted currently.
* 1. There are enough dirty segments.
* 2. IO subsystem is idle by checking the # of writeback pages.
* 3. IO subsystem is idle by checking the # of requests in
* bdev's request list.
*
* Note) We have to avoid triggering GCs frequently.
* Because it is possible that some segments can be
* invalidated soon after by user update or deletion.
* So, I'd like to wait some time to collect dirty segments.
*/
if (!mutex_trylock(&sbi->gc_mutex))
continue;
if (!is_idle(sbi)) {
increase_sleep_time(gc_th, &wait_ms);
mutex_unlock(&sbi->gc_mutex);
continue;
}
if (has_enough_invalid_blocks(sbi))
decrease_sleep_time(gc_th, &wait_ms);
else
increase_sleep_time(gc_th, &wait_ms);
stat_inc_bggc_count(sbi);
/* if return value is not zero, no victim was selected */
if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC)))
wait_ms = gc_th->no_gc_sleep_time;
trace_f2fs_background_gc(sbi->sb, wait_ms,
prefree_segments(sbi), free_segments(sbi));
/* balancing f2fs's metadata periodically */
f2fs_balance_fs_bg(sbi);
} while (!kthread_should_stop());
return 0;
}
int start_gc_thread(struct f2fs_sb_info *sbi)
{
struct f2fs_gc_kthread *gc_th;
dev_t dev = sbi->sb->s_bdev->bd_dev;
int err = 0;
gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
if (!gc_th) {
err = -ENOMEM;
goto out;
}
gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME;
gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME;
gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME;
gc_th->gc_idle = 0;
sbi->gc_thread = gc_th;
init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(gc_th->f2fs_gc_task)) {
err = PTR_ERR(gc_th->f2fs_gc_task);
kfree(gc_th);
sbi->gc_thread = NULL;
}
out:
return err;
}
void stop_gc_thread(struct f2fs_sb_info *sbi)
{
struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
if (!gc_th)
return;
kthread_stop(gc_th->f2fs_gc_task);
kfree(gc_th);
sbi->gc_thread = NULL;
}
static int select_gc_type(struct f2fs_gc_kthread *gc_th, int gc_type)
{
int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY;
if (gc_th && gc_th->gc_idle) {
if (gc_th->gc_idle == 1)
gc_mode = GC_CB;
else if (gc_th->gc_idle == 2)
gc_mode = GC_GREEDY;
}
return gc_mode;
}
static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
int type, struct victim_sel_policy *p)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
if (p->alloc_mode == SSR) {
p->gc_mode = GC_GREEDY;
p->dirty_segmap = dirty_i->dirty_segmap[type];
f2fs: optimize gc for better performance This patch improves the gc efficiency by optimizing the victim selection policy. With this optimization, the random re-write performance could increase up to 20%. For f2fs, when disk is in shortage of free spaces, gc will selects dirty segments and moves valid blocks around for making more space available. The gc cost of a segment is determined by the valid blocks in the segment. The less the valid blocks, the higher the efficiency. The ideal victim segment is the one that has the most garbage blocks. Currently, it searches up to 20 dirty segments for a victim segment. The selected victim is not likely the best victim for gc when there are much more dirty segments. Why not searching more dirty segments for a better victim? The cost of searching dirty segments is negligible in comparison to moving blocks. In this patch, it enlarges the MAX_VICTIM_SEARCH to 4096 to make the search more aggressively for a possible better victim. Since it also applies to victim selection for SSR, it will likely improve the SSR efficiency as well. The test case is simple. It creates as many files until the disk full. The size for each file is 32KB. Then it writes as many as 100000 records of 4KB size to random offsets of random files in sync mode. The testing was done on a 2GB partition of a SDHC card. Let's see the test result of f2fs without and with the patch. --------------------------------------- 2GB partition, SDHC create 52023 files of size 32768 bytes random re-write 100000 records of 4KB --------------------------------------- | file creation (s) | rewrite time (s) | gc count | gc garbage blocks | [no patch] 341 4227 1174 174840 [patched] 324 2958 645 106682 It's obvious that, with the patch, f2fs finishes the test in 20+% less time than without the patch. And internally it does much less gc with higher efficiency than before. Since the performance improvement is related to gc, it might not be so obvious for other tests that do not trigger gc as often as this one ( This is because f2fs selects dirty segments for SSR use most of the time when free space is in shortage). The well-known iozone test tool was not used for benchmarking the patch becuase it seems do not have a test case that performs random re-write on a full disk. This patch is the revised version based on the suggestion from Jaegeuk Kim. Signed-off-by: Jin Xu <jinuxstyle@gmail.com> [Jaegeuk Kim: suggested simpler solution] Reviewed-by: Jaegeuk Kim <jaegeuk.kim@samsung.com> Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-09-05 11:45:26 +07:00
p->max_search = dirty_i->nr_dirty[type];
p->ofs_unit = 1;
} else {
p->gc_mode = select_gc_type(sbi->gc_thread, gc_type);
p->dirty_segmap = dirty_i->dirty_segmap[DIRTY];
f2fs: optimize gc for better performance This patch improves the gc efficiency by optimizing the victim selection policy. With this optimization, the random re-write performance could increase up to 20%. For f2fs, when disk is in shortage of free spaces, gc will selects dirty segments and moves valid blocks around for making more space available. The gc cost of a segment is determined by the valid blocks in the segment. The less the valid blocks, the higher the efficiency. The ideal victim segment is the one that has the most garbage blocks. Currently, it searches up to 20 dirty segments for a victim segment. The selected victim is not likely the best victim for gc when there are much more dirty segments. Why not searching more dirty segments for a better victim? The cost of searching dirty segments is negligible in comparison to moving blocks. In this patch, it enlarges the MAX_VICTIM_SEARCH to 4096 to make the search more aggressively for a possible better victim. Since it also applies to victim selection for SSR, it will likely improve the SSR efficiency as well. The test case is simple. It creates as many files until the disk full. The size for each file is 32KB. Then it writes as many as 100000 records of 4KB size to random offsets of random files in sync mode. The testing was done on a 2GB partition of a SDHC card. Let's see the test result of f2fs without and with the patch. --------------------------------------- 2GB partition, SDHC create 52023 files of size 32768 bytes random re-write 100000 records of 4KB --------------------------------------- | file creation (s) | rewrite time (s) | gc count | gc garbage blocks | [no patch] 341 4227 1174 174840 [patched] 324 2958 645 106682 It's obvious that, with the patch, f2fs finishes the test in 20+% less time than without the patch. And internally it does much less gc with higher efficiency than before. Since the performance improvement is related to gc, it might not be so obvious for other tests that do not trigger gc as often as this one ( This is because f2fs selects dirty segments for SSR use most of the time when free space is in shortage). The well-known iozone test tool was not used for benchmarking the patch becuase it seems do not have a test case that performs random re-write on a full disk. This patch is the revised version based on the suggestion from Jaegeuk Kim. Signed-off-by: Jin Xu <jinuxstyle@gmail.com> [Jaegeuk Kim: suggested simpler solution] Reviewed-by: Jaegeuk Kim <jaegeuk.kim@samsung.com> Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-09-05 11:45:26 +07:00
p->max_search = dirty_i->nr_dirty[DIRTY];
p->ofs_unit = sbi->segs_per_sec;
}
f2fs: optimize gc for better performance This patch improves the gc efficiency by optimizing the victim selection policy. With this optimization, the random re-write performance could increase up to 20%. For f2fs, when disk is in shortage of free spaces, gc will selects dirty segments and moves valid blocks around for making more space available. The gc cost of a segment is determined by the valid blocks in the segment. The less the valid blocks, the higher the efficiency. The ideal victim segment is the one that has the most garbage blocks. Currently, it searches up to 20 dirty segments for a victim segment. The selected victim is not likely the best victim for gc when there are much more dirty segments. Why not searching more dirty segments for a better victim? The cost of searching dirty segments is negligible in comparison to moving blocks. In this patch, it enlarges the MAX_VICTIM_SEARCH to 4096 to make the search more aggressively for a possible better victim. Since it also applies to victim selection for SSR, it will likely improve the SSR efficiency as well. The test case is simple. It creates as many files until the disk full. The size for each file is 32KB. Then it writes as many as 100000 records of 4KB size to random offsets of random files in sync mode. The testing was done on a 2GB partition of a SDHC card. Let's see the test result of f2fs without and with the patch. --------------------------------------- 2GB partition, SDHC create 52023 files of size 32768 bytes random re-write 100000 records of 4KB --------------------------------------- | file creation (s) | rewrite time (s) | gc count | gc garbage blocks | [no patch] 341 4227 1174 174840 [patched] 324 2958 645 106682 It's obvious that, with the patch, f2fs finishes the test in 20+% less time than without the patch. And internally it does much less gc with higher efficiency than before. Since the performance improvement is related to gc, it might not be so obvious for other tests that do not trigger gc as often as this one ( This is because f2fs selects dirty segments for SSR use most of the time when free space is in shortage). The well-known iozone test tool was not used for benchmarking the patch becuase it seems do not have a test case that performs random re-write on a full disk. This patch is the revised version based on the suggestion from Jaegeuk Kim. Signed-off-by: Jin Xu <jinuxstyle@gmail.com> [Jaegeuk Kim: suggested simpler solution] Reviewed-by: Jaegeuk Kim <jaegeuk.kim@samsung.com> Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-09-05 11:45:26 +07:00
if (p->max_search > sbi->max_victim_search)
p->max_search = sbi->max_victim_search;
f2fs: optimize gc for better performance This patch improves the gc efficiency by optimizing the victim selection policy. With this optimization, the random re-write performance could increase up to 20%. For f2fs, when disk is in shortage of free spaces, gc will selects dirty segments and moves valid blocks around for making more space available. The gc cost of a segment is determined by the valid blocks in the segment. The less the valid blocks, the higher the efficiency. The ideal victim segment is the one that has the most garbage blocks. Currently, it searches up to 20 dirty segments for a victim segment. The selected victim is not likely the best victim for gc when there are much more dirty segments. Why not searching more dirty segments for a better victim? The cost of searching dirty segments is negligible in comparison to moving blocks. In this patch, it enlarges the MAX_VICTIM_SEARCH to 4096 to make the search more aggressively for a possible better victim. Since it also applies to victim selection for SSR, it will likely improve the SSR efficiency as well. The test case is simple. It creates as many files until the disk full. The size for each file is 32KB. Then it writes as many as 100000 records of 4KB size to random offsets of random files in sync mode. The testing was done on a 2GB partition of a SDHC card. Let's see the test result of f2fs without and with the patch. --------------------------------------- 2GB partition, SDHC create 52023 files of size 32768 bytes random re-write 100000 records of 4KB --------------------------------------- | file creation (s) | rewrite time (s) | gc count | gc garbage blocks | [no patch] 341 4227 1174 174840 [patched] 324 2958 645 106682 It's obvious that, with the patch, f2fs finishes the test in 20+% less time than without the patch. And internally it does much less gc with higher efficiency than before. Since the performance improvement is related to gc, it might not be so obvious for other tests that do not trigger gc as often as this one ( This is because f2fs selects dirty segments for SSR use most of the time when free space is in shortage). The well-known iozone test tool was not used for benchmarking the patch becuase it seems do not have a test case that performs random re-write on a full disk. This patch is the revised version based on the suggestion from Jaegeuk Kim. Signed-off-by: Jin Xu <jinuxstyle@gmail.com> [Jaegeuk Kim: suggested simpler solution] Reviewed-by: Jaegeuk Kim <jaegeuk.kim@samsung.com> Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-09-05 11:45:26 +07:00
p->offset = sbi->last_victim[p->gc_mode];
}
static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
struct victim_sel_policy *p)
{
/* SSR allocates in a segment unit */
if (p->alloc_mode == SSR)
return sbi->blocks_per_seg;
if (p->gc_mode == GC_GREEDY)
return sbi->blocks_per_seg * p->ofs_unit;
else if (p->gc_mode == GC_CB)
return UINT_MAX;
else /* No other gc_mode */
return 0;
}
static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned int secno;
/*
* If the gc_type is FG_GC, we can select victim segments
* selected by background GC before.
* Those segments guarantee they have small valid blocks.
*/
for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) {
if (sec_usage_check(sbi, secno))
continue;
clear_bit(secno, dirty_i->victim_secmap);
return secno * sbi->segs_per_sec;
}
return NULL_SEGNO;
}
static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
{
struct sit_info *sit_i = SIT_I(sbi);
unsigned int secno = GET_SECNO(sbi, segno);
unsigned int start = secno * sbi->segs_per_sec;
unsigned long long mtime = 0;
unsigned int vblocks;
unsigned char age = 0;
unsigned char u;
unsigned int i;
for (i = 0; i < sbi->segs_per_sec; i++)
mtime += get_seg_entry(sbi, start + i)->mtime;
vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
mtime = div_u64(mtime, sbi->segs_per_sec);
vblocks = div_u64(vblocks, sbi->segs_per_sec);
u = (vblocks * 100) >> sbi->log_blocks_per_seg;
/* Handle if the system time has changed by the user */
if (mtime < sit_i->min_mtime)
sit_i->min_mtime = mtime;
if (mtime > sit_i->max_mtime)
sit_i->max_mtime = mtime;
if (sit_i->max_mtime != sit_i->min_mtime)
age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime),
sit_i->max_mtime - sit_i->min_mtime);
return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
}
static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
unsigned int segno, struct victim_sel_policy *p)
{
if (p->alloc_mode == SSR)
return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
/* alloc_mode == LFS */
if (p->gc_mode == GC_GREEDY)
return get_valid_blocks(sbi, segno, sbi->segs_per_sec);
else
return get_cb_cost(sbi, segno);
}
/*
* This function is called from two paths.
* One is garbage collection and the other is SSR segment selection.
* When it is called during GC, it just gets a victim segment
* and it does not remove it from dirty seglist.
* When it is called from SSR segment selection, it finds a segment
* which has minimum valid blocks and removes it from dirty seglist.
*/
static int get_victim_by_default(struct f2fs_sb_info *sbi,
unsigned int *result, int gc_type, int type, char alloc_mode)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct victim_sel_policy p;
unsigned int secno, max_cost;
unsigned int last_segment = MAIN_SEGS(sbi);
int nsearched = 0;
mutex_lock(&dirty_i->seglist_lock);
p.alloc_mode = alloc_mode;
select_policy(sbi, gc_type, type, &p);
p.min_segno = NULL_SEGNO;
p.min_cost = max_cost = get_max_cost(sbi, &p);
if (p.max_search == 0)
goto out;
if (p.alloc_mode == LFS && gc_type == FG_GC) {
p.min_segno = check_bg_victims(sbi);
if (p.min_segno != NULL_SEGNO)
goto got_it;
}
while (1) {
unsigned long cost;
unsigned int segno;
segno = find_next_bit(p.dirty_segmap, last_segment, p.offset);
if (segno >= last_segment) {
if (sbi->last_victim[p.gc_mode]) {
last_segment = sbi->last_victim[p.gc_mode];
sbi->last_victim[p.gc_mode] = 0;
p.offset = 0;
continue;
}
break;
}
p.offset = segno + p.ofs_unit;
if (p.ofs_unit > 1)
p.offset -= segno % p.ofs_unit;
secno = GET_SECNO(sbi, segno);
if (sec_usage_check(sbi, secno))
continue;
if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
continue;
cost = get_gc_cost(sbi, segno, &p);
if (p.min_cost > cost) {
p.min_segno = segno;
p.min_cost = cost;
} else if (unlikely(cost == max_cost)) {
continue;
}
f2fs: optimize gc for better performance This patch improves the gc efficiency by optimizing the victim selection policy. With this optimization, the random re-write performance could increase up to 20%. For f2fs, when disk is in shortage of free spaces, gc will selects dirty segments and moves valid blocks around for making more space available. The gc cost of a segment is determined by the valid blocks in the segment. The less the valid blocks, the higher the efficiency. The ideal victim segment is the one that has the most garbage blocks. Currently, it searches up to 20 dirty segments for a victim segment. The selected victim is not likely the best victim for gc when there are much more dirty segments. Why not searching more dirty segments for a better victim? The cost of searching dirty segments is negligible in comparison to moving blocks. In this patch, it enlarges the MAX_VICTIM_SEARCH to 4096 to make the search more aggressively for a possible better victim. Since it also applies to victim selection for SSR, it will likely improve the SSR efficiency as well. The test case is simple. It creates as many files until the disk full. The size for each file is 32KB. Then it writes as many as 100000 records of 4KB size to random offsets of random files in sync mode. The testing was done on a 2GB partition of a SDHC card. Let's see the test result of f2fs without and with the patch. --------------------------------------- 2GB partition, SDHC create 52023 files of size 32768 bytes random re-write 100000 records of 4KB --------------------------------------- | file creation (s) | rewrite time (s) | gc count | gc garbage blocks | [no patch] 341 4227 1174 174840 [patched] 324 2958 645 106682 It's obvious that, with the patch, f2fs finishes the test in 20+% less time than without the patch. And internally it does much less gc with higher efficiency than before. Since the performance improvement is related to gc, it might not be so obvious for other tests that do not trigger gc as often as this one ( This is because f2fs selects dirty segments for SSR use most of the time when free space is in shortage). The well-known iozone test tool was not used for benchmarking the patch becuase it seems do not have a test case that performs random re-write on a full disk. This patch is the revised version based on the suggestion from Jaegeuk Kim. Signed-off-by: Jin Xu <jinuxstyle@gmail.com> [Jaegeuk Kim: suggested simpler solution] Reviewed-by: Jaegeuk Kim <jaegeuk.kim@samsung.com> Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-09-05 11:45:26 +07:00
if (nsearched++ >= p.max_search) {
sbi->last_victim[p.gc_mode] = segno;
break;
}
}
if (p.min_segno != NULL_SEGNO) {
got_it:
if (p.alloc_mode == LFS) {
secno = GET_SECNO(sbi, p.min_segno);
if (gc_type == FG_GC)
sbi->cur_victim_sec = secno;
else
set_bit(secno, dirty_i->victim_secmap);
}
*result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
trace_f2fs_get_victim(sbi->sb, type, gc_type, &p,
sbi->cur_victim_sec,
prefree_segments(sbi), free_segments(sbi));
}
out:
mutex_unlock(&dirty_i->seglist_lock);
return (p.min_segno == NULL_SEGNO) ? 0 : 1;
}
static const struct victim_selection default_v_ops = {
.get_victim = get_victim_by_default,
};
static struct inode *find_gc_inode(struct gc_inode_list *gc_list, nid_t ino)
{
struct inode_entry *ie;
ie = radix_tree_lookup(&gc_list->iroot, ino);
if (ie)
return ie->inode;
return NULL;
}
static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
{
struct inode_entry *new_ie;
if (inode == find_gc_inode(gc_list, inode->i_ino)) {
iput(inode);
return;
}
new_ie = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
new_ie->inode = inode;
f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie);
list_add_tail(&new_ie->list, &gc_list->ilist);
}
static void put_gc_inode(struct gc_inode_list *gc_list)
{
struct inode_entry *ie, *next_ie;
list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) {
radix_tree_delete(&gc_list->iroot, ie->inode->i_ino);
iput(ie->inode);
list_del(&ie->list);
kmem_cache_free(inode_entry_slab, ie);
}
}
static int check_valid_map(struct f2fs_sb_info *sbi,
unsigned int segno, int offset)
{
struct sit_info *sit_i = SIT_I(sbi);
struct seg_entry *sentry;
int ret;
mutex_lock(&sit_i->sentry_lock);
sentry = get_seg_entry(sbi, segno);
ret = f2fs_test_bit(offset, sentry->cur_valid_map);
mutex_unlock(&sit_i->sentry_lock);
return ret;
}
/*
* This function compares node address got in summary with that in NAT.
* On validity, copy that node with cold status, otherwise (invalid node)
* ignore that.
*/
static int gc_node_segment(struct f2fs_sb_info *sbi,
struct f2fs_summary *sum, unsigned int segno, int gc_type)
{
bool initial = true;
struct f2fs_summary *entry;
block_t start_addr;
int off;
start_addr = START_BLOCK(sbi, segno);
next_step:
entry = sum;
f2fs: give a chance to merge IOs by IO scheduler Previously, background GC submits many 4KB read requests to load victim blocks and/or its (i)node blocks. ... f2fs_gc : f2fs_readpage: ino = 1, page_index = 0xb61, blkaddr = 0x3b964ed f2fs_gc : block_rq_complete: 8,16 R () 499854968 + 8 [0] f2fs_gc : f2fs_readpage: ino = 1, page_index = 0xb6f, blkaddr = 0x3b964ee f2fs_gc : block_rq_complete: 8,16 R () 499854976 + 8 [0] f2fs_gc : f2fs_readpage: ino = 1, page_index = 0xb79, blkaddr = 0x3b964ef f2fs_gc : block_rq_complete: 8,16 R () 499854984 + 8 [0] ... However, by the fact that many IOs are sequential, we can give a chance to merge the IOs by IO scheduler. In order to do that, let's use blk_plug. ... f2fs_gc : f2fs_iget: ino = 143 f2fs_gc : f2fs_readpage: ino = 143, page_index = 0x1c6, blkaddr = 0x2e6ee f2fs_gc : f2fs_iget: ino = 143 f2fs_gc : f2fs_readpage: ino = 143, page_index = 0x1c7, blkaddr = 0x2e6ef <idle> : block_rq_complete: 8,16 R () 1519616 + 8 [0] <idle> : block_rq_complete: 8,16 R () 1519848 + 8 [0] <idle> : block_rq_complete: 8,16 R () 1520432 + 96 [0] <idle> : block_rq_complete: 8,16 R () 1520536 + 104 [0] <idle> : block_rq_complete: 8,16 R () 1521008 + 112 [0] <idle> : block_rq_complete: 8,16 R () 1521440 + 152 [0] <idle> : block_rq_complete: 8,16 R () 1521688 + 144 [0] <idle> : block_rq_complete: 8,16 R () 1522128 + 192 [0] <idle> : block_rq_complete: 8,16 R () 1523256 + 328 [0] ... Note that this issue should be addressed in checkpoint, and some readahead flows too. Reviewed-by: Namjae Jeon <namjae.jeon@samsung.com> Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-04-24 11:19:56 +07:00
for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
nid_t nid = le32_to_cpu(entry->nid);
struct page *node_page;
struct node_info ni;
/* stop BG_GC if there is not enough free sections. */
if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
return 0;
if (check_valid_map(sbi, segno, off) == 0)
continue;
if (initial) {
ra_node_page(sbi, nid);
continue;
}
node_page = get_node_page(sbi, nid);
if (IS_ERR(node_page))
continue;
/* block may become invalid during get_node_page */
if (check_valid_map(sbi, segno, off) == 0) {
f2fs_put_page(node_page, 1);
continue;
}
get_node_info(sbi, nid, &ni);
if (ni.blk_addr != start_addr + off) {
f2fs_put_page(node_page, 1);
continue;
}
/* set page dirty and write it */
if (gc_type == FG_GC) {
f2fs_wait_on_page_writeback(node_page, NODE);
set_page_dirty(node_page);
} else {
if (!PageWriteback(node_page))
set_page_dirty(node_page);
}
f2fs_put_page(node_page, 1);
stat_inc_node_blk_count(sbi, 1, gc_type);
}
f2fs: give a chance to merge IOs by IO scheduler Previously, background GC submits many 4KB read requests to load victim blocks and/or its (i)node blocks. ... f2fs_gc : f2fs_readpage: ino = 1, page_index = 0xb61, blkaddr = 0x3b964ed f2fs_gc : block_rq_complete: 8,16 R () 499854968 + 8 [0] f2fs_gc : f2fs_readpage: ino = 1, page_index = 0xb6f, blkaddr = 0x3b964ee f2fs_gc : block_rq_complete: 8,16 R () 499854976 + 8 [0] f2fs_gc : f2fs_readpage: ino = 1, page_index = 0xb79, blkaddr = 0x3b964ef f2fs_gc : block_rq_complete: 8,16 R () 499854984 + 8 [0] ... However, by the fact that many IOs are sequential, we can give a chance to merge the IOs by IO scheduler. In order to do that, let's use blk_plug. ... f2fs_gc : f2fs_iget: ino = 143 f2fs_gc : f2fs_readpage: ino = 143, page_index = 0x1c6, blkaddr = 0x2e6ee f2fs_gc : f2fs_iget: ino = 143 f2fs_gc : f2fs_readpage: ino = 143, page_index = 0x1c7, blkaddr = 0x2e6ef <idle> : block_rq_complete: 8,16 R () 1519616 + 8 [0] <idle> : block_rq_complete: 8,16 R () 1519848 + 8 [0] <idle> : block_rq_complete: 8,16 R () 1520432 + 96 [0] <idle> : block_rq_complete: 8,16 R () 1520536 + 104 [0] <idle> : block_rq_complete: 8,16 R () 1521008 + 112 [0] <idle> : block_rq_complete: 8,16 R () 1521440 + 152 [0] <idle> : block_rq_complete: 8,16 R () 1521688 + 144 [0] <idle> : block_rq_complete: 8,16 R () 1522128 + 192 [0] <idle> : block_rq_complete: 8,16 R () 1523256 + 328 [0] ... Note that this issue should be addressed in checkpoint, and some readahead flows too. Reviewed-by: Namjae Jeon <namjae.jeon@samsung.com> Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-04-24 11:19:56 +07:00
if (initial) {
initial = false;
goto next_step;
}
if (gc_type == FG_GC) {
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = LONG_MAX,
.for_reclaim = 0,
};
sync_node_pages(sbi, 0, &wbc);
/* return 1 only if FG_GC succefully reclaimed one */
if (get_valid_blocks(sbi, segno, 1) == 0)
return 1;
}
return 0;
}
/*
* Calculate start block index indicating the given node offset.
* Be careful, caller should give this node offset only indicating direct node
* blocks. If any node offsets, which point the other types of node blocks such
* as indirect or double indirect node blocks, are given, it must be a caller's
* bug.
*/
block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi)
{
unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;
unsigned int bidx;
if (node_ofs == 0)
return 0;
if (node_ofs <= 2) {
bidx = node_ofs - 1;
} else if (node_ofs <= indirect_blks) {
int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1);
bidx = node_ofs - 2 - dec;
} else {
int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
bidx = node_ofs - 5 - dec;
}
return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi);
}
static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
struct node_info *dni, block_t blkaddr, unsigned int *nofs)
{
struct page *node_page;
nid_t nid;
unsigned int ofs_in_node;
block_t source_blkaddr;
nid = le32_to_cpu(sum->nid);
ofs_in_node = le16_to_cpu(sum->ofs_in_node);
node_page = get_node_page(sbi, nid);
if (IS_ERR(node_page))
return false;
get_node_info(sbi, nid, dni);
if (sum->version != dni->version) {
f2fs_put_page(node_page, 1);
return false;
}
*nofs = ofs_of_node(node_page);
source_blkaddr = datablock_addr(node_page, ofs_in_node);
f2fs_put_page(node_page, 1);
if (source_blkaddr != blkaddr)
return false;
return true;
}
static void move_encrypted_block(struct inode *inode, block_t bidx)
{
struct f2fs_io_info fio = {
.sbi = F2FS_I_SB(inode),
.type = DATA,
.rw = READ_SYNC,
.encrypted_page = NULL,
};
struct dnode_of_data dn;
struct f2fs_summary sum;
struct node_info ni;
struct page *page;
int err;
/* do not read out */
page = f2fs_grab_cache_page(inode->i_mapping, bidx, false);
if (!page)
return;
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE);
if (err)
goto out;
f2fs crypto: fix racing of accessing encrypted page among different competitors Since we use different page cache (normally inode's page cache for R/W and meta inode's page cache for GC) to cache the same physical block which is belong to an encrypted inode. Writeback of these two page cache should be exclusive, but now we didn't handle writeback state well, so there may be potential racing problem: a) kworker: f2fs_gc: - f2fs_write_data_pages - f2fs_write_data_page - do_write_data_page - write_data_page - f2fs_submit_page_mbio (page#1 in inode's page cache was queued in f2fs bio cache, and be ready to write to new blkaddr) - gc_data_segment - move_encrypted_block - pagecache_get_page (page#2 in meta inode's page cache was cached with the invalid datas of physical block located in new blkaddr) - f2fs_submit_page_mbio (page#1 was submitted, later, page#2 with invalid data will be submitted) b) f2fs_gc: - gc_data_segment - move_encrypted_block - f2fs_submit_page_mbio (page#1 in meta inode's page cache was queued in f2fs bio cache, and be ready to write to new blkaddr) user thread: - f2fs_write_begin - f2fs_submit_page_bio (we submit the request to block layer to update page#2 in inode's page cache with physical block located in new blkaddr, so here we may read gabbage data from new blkaddr since GC hasn't writebacked the page#1 yet) This patch fixes above potential racing problem for encrypted inode. Signed-off-by: Chao Yu <chao2.yu@samsung.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-10-08 12:27:34 +07:00
if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
ClearPageUptodate(page);
goto put_out;
f2fs crypto: fix racing of accessing encrypted page among different competitors Since we use different page cache (normally inode's page cache for R/W and meta inode's page cache for GC) to cache the same physical block which is belong to an encrypted inode. Writeback of these two page cache should be exclusive, but now we didn't handle writeback state well, so there may be potential racing problem: a) kworker: f2fs_gc: - f2fs_write_data_pages - f2fs_write_data_page - do_write_data_page - write_data_page - f2fs_submit_page_mbio (page#1 in inode's page cache was queued in f2fs bio cache, and be ready to write to new blkaddr) - gc_data_segment - move_encrypted_block - pagecache_get_page (page#2 in meta inode's page cache was cached with the invalid datas of physical block located in new blkaddr) - f2fs_submit_page_mbio (page#1 was submitted, later, page#2 with invalid data will be submitted) b) f2fs_gc: - gc_data_segment - move_encrypted_block - f2fs_submit_page_mbio (page#1 in meta inode's page cache was queued in f2fs bio cache, and be ready to write to new blkaddr) user thread: - f2fs_write_begin - f2fs_submit_page_bio (we submit the request to block layer to update page#2 in inode's page cache with physical block located in new blkaddr, so here we may read gabbage data from new blkaddr since GC hasn't writebacked the page#1 yet) This patch fixes above potential racing problem for encrypted inode. Signed-off-by: Chao Yu <chao2.yu@samsung.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-10-08 12:27:34 +07:00
}
/*
* don't cache encrypted data into meta inode until previous dirty
* data were writebacked to avoid racing between GC and flush.
*/
f2fs_wait_on_page_writeback(page, DATA);
get_node_info(fio.sbi, dn.nid, &ni);
set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
/* read page */
fio.page = page;
fio.blk_addr = dn.data_blkaddr;
fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi),
fio.blk_addr,
FGP_LOCK|FGP_CREAT,
GFP_NOFS);
if (!fio.encrypted_page)
goto put_out;
err = f2fs_submit_page_bio(&fio);
if (err)
goto put_page_out;
/* write page */
lock_page(fio.encrypted_page);
if (unlikely(!PageUptodate(fio.encrypted_page)))
goto put_page_out;
if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi)))
goto put_page_out;
f2fs: call set_page_dirty to attach i_wb for cgroup The cgroup attaches inode->i_wb via mark_inode_dirty and when set_page_writeback is called, __inc_wb_stat() updates i_wb's stat. So, we need to explicitly call set_page_dirty->__mark_inode_dirty in prior to any writebacking pages. This patch should resolve the following kernel panic reported by Andreas Reis. https://bugzilla.kernel.org/show_bug.cgi?id=101801 --- Comment #2 from Andreas Reis <andreas.reis@gmail.com> --- BUG: unable to handle kernel NULL pointer dereference at 00000000000000a8 IP: [<ffffffff8149deea>] __percpu_counter_add+0x1a/0x90 PGD 2951ff067 PUD 2df43f067 PMD 0 Oops: 0000 [#1] PREEMPT SMP Modules linked in: CPU: 7 PID: 10356 Comm: gcc Tainted: G W 4.2.0-1-cu #1 Hardware name: Gigabyte Technology Co., Ltd. G1.Sniper M5/G1.Sniper M5, BIOS T01 02/03/2015 task: ffff880295044f80 ti: ffff880295140000 task.ti: ffff880295140000 RIP: 0010:[<ffffffff8149deea>] [<ffffffff8149deea>] __percpu_counter_add+0x1a/0x90 RSP: 0018:ffff880295143ac8 EFLAGS: 00010082 RAX: 0000000000000003 RBX: ffffea000a526d40 RCX: 0000000000000001 RDX: 0000000000000020 RSI: 0000000000000001 RDI: 0000000000000088 RBP: ffff880295143ae8 R08: 0000000000000000 R09: ffff88008f69bb30 R10: 00000000fffffffa R11: 0000000000000000 R12: 0000000000000088 R13: 0000000000000001 R14: ffff88041d099000 R15: ffff880084a205d0 FS: 00007f8549374700(0000) GS:ffff88042f3c0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000a8 CR3: 000000033e1d5000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Stack: 0000000000000000 ffffea000a526d40 ffff880084a20738 ffff880084a20750 ffff880295143b48 ffffffff811cc91e ffff880000000000 0000000000000296 0000000000000000 ffff880417090198 0000000000000000 ffffea000a526d40 Call Trace: [<ffffffff811cc91e>] __test_set_page_writeback+0xde/0x1d0 [<ffffffff813fee87>] do_write_data_page+0xe7/0x3a0 [<ffffffff813faeea>] gc_data_segment+0x5aa/0x640 [<ffffffff813fb0b8>] do_garbage_collect+0x138/0x150 [<ffffffff813fb3fe>] f2fs_gc+0x1be/0x3e0 [<ffffffff81405541>] f2fs_balance_fs+0x81/0x90 [<ffffffff813ee357>] f2fs_unlink+0x47/0x1d0 [<ffffffff81239329>] vfs_unlink+0x109/0x1b0 [<ffffffff8123e3d7>] do_unlinkat+0x287/0x2c0 [<ffffffff8123ebc6>] SyS_unlink+0x16/0x20 [<ffffffff81942e2e>] entry_SYSCALL_64_fastpath+0x12/0x71 Code: 41 5e 5d c3 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 41 55 49 89 f5 41 54 49 89 fc 53 48 83 ec 08 65 ff 05 e6 d9 b6 7e <48> 8b 47 20 48 63 ca 65 8b 18 48 63 db 48 01 f3 48 39 cb 7d 0a RIP [<ffffffff8149deea>] __percpu_counter_add+0x1a/0x90 RSP <ffff880295143ac8> CR2: 00000000000000a8 ---[ end trace 5132449a58ed93a3 ]--- note: gcc[10356] exited with preempt_count 2 Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-07-25 14:29:17 +07:00
set_page_dirty(fio.encrypted_page);
f2fs crypto: fix racing of accessing encrypted page among different competitors Since we use different page cache (normally inode's page cache for R/W and meta inode's page cache for GC) to cache the same physical block which is belong to an encrypted inode. Writeback of these two page cache should be exclusive, but now we didn't handle writeback state well, so there may be potential racing problem: a) kworker: f2fs_gc: - f2fs_write_data_pages - f2fs_write_data_page - do_write_data_page - write_data_page - f2fs_submit_page_mbio (page#1 in inode's page cache was queued in f2fs bio cache, and be ready to write to new blkaddr) - gc_data_segment - move_encrypted_block - pagecache_get_page (page#2 in meta inode's page cache was cached with the invalid datas of physical block located in new blkaddr) - f2fs_submit_page_mbio (page#1 was submitted, later, page#2 with invalid data will be submitted) b) f2fs_gc: - gc_data_segment - move_encrypted_block - f2fs_submit_page_mbio (page#1 in meta inode's page cache was queued in f2fs bio cache, and be ready to write to new blkaddr) user thread: - f2fs_write_begin - f2fs_submit_page_bio (we submit the request to block layer to update page#2 in inode's page cache with physical block located in new blkaddr, so here we may read gabbage data from new blkaddr since GC hasn't writebacked the page#1 yet) This patch fixes above potential racing problem for encrypted inode. Signed-off-by: Chao Yu <chao2.yu@samsung.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-10-08 12:27:34 +07:00
f2fs_wait_on_page_writeback(fio.encrypted_page, DATA);
f2fs: call set_page_dirty to attach i_wb for cgroup The cgroup attaches inode->i_wb via mark_inode_dirty and when set_page_writeback is called, __inc_wb_stat() updates i_wb's stat. So, we need to explicitly call set_page_dirty->__mark_inode_dirty in prior to any writebacking pages. This patch should resolve the following kernel panic reported by Andreas Reis. https://bugzilla.kernel.org/show_bug.cgi?id=101801 --- Comment #2 from Andreas Reis <andreas.reis@gmail.com> --- BUG: unable to handle kernel NULL pointer dereference at 00000000000000a8 IP: [<ffffffff8149deea>] __percpu_counter_add+0x1a/0x90 PGD 2951ff067 PUD 2df43f067 PMD 0 Oops: 0000 [#1] PREEMPT SMP Modules linked in: CPU: 7 PID: 10356 Comm: gcc Tainted: G W 4.2.0-1-cu #1 Hardware name: Gigabyte Technology Co., Ltd. G1.Sniper M5/G1.Sniper M5, BIOS T01 02/03/2015 task: ffff880295044f80 ti: ffff880295140000 task.ti: ffff880295140000 RIP: 0010:[<ffffffff8149deea>] [<ffffffff8149deea>] __percpu_counter_add+0x1a/0x90 RSP: 0018:ffff880295143ac8 EFLAGS: 00010082 RAX: 0000000000000003 RBX: ffffea000a526d40 RCX: 0000000000000001 RDX: 0000000000000020 RSI: 0000000000000001 RDI: 0000000000000088 RBP: ffff880295143ae8 R08: 0000000000000000 R09: ffff88008f69bb30 R10: 00000000fffffffa R11: 0000000000000000 R12: 0000000000000088 R13: 0000000000000001 R14: ffff88041d099000 R15: ffff880084a205d0 FS: 00007f8549374700(0000) GS:ffff88042f3c0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000a8 CR3: 000000033e1d5000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Stack: 0000000000000000 ffffea000a526d40 ffff880084a20738 ffff880084a20750 ffff880295143b48 ffffffff811cc91e ffff880000000000 0000000000000296 0000000000000000 ffff880417090198 0000000000000000 ffffea000a526d40 Call Trace: [<ffffffff811cc91e>] __test_set_page_writeback+0xde/0x1d0 [<ffffffff813fee87>] do_write_data_page+0xe7/0x3a0 [<ffffffff813faeea>] gc_data_segment+0x5aa/0x640 [<ffffffff813fb0b8>] do_garbage_collect+0x138/0x150 [<ffffffff813fb3fe>] f2fs_gc+0x1be/0x3e0 [<ffffffff81405541>] f2fs_balance_fs+0x81/0x90 [<ffffffff813ee357>] f2fs_unlink+0x47/0x1d0 [<ffffffff81239329>] vfs_unlink+0x109/0x1b0 [<ffffffff8123e3d7>] do_unlinkat+0x287/0x2c0 [<ffffffff8123ebc6>] SyS_unlink+0x16/0x20 [<ffffffff81942e2e>] entry_SYSCALL_64_fastpath+0x12/0x71 Code: 41 5e 5d c3 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 41 55 49 89 f5 41 54 49 89 fc 53 48 83 ec 08 65 ff 05 e6 d9 b6 7e <48> 8b 47 20 48 63 ca 65 8b 18 48 63 db 48 01 f3 48 39 cb 7d 0a RIP [<ffffffff8149deea>] __percpu_counter_add+0x1a/0x90 RSP <ffff880295143ac8> CR2: 00000000000000a8 ---[ end trace 5132449a58ed93a3 ]--- note: gcc[10356] exited with preempt_count 2 Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-07-25 14:29:17 +07:00
if (clear_page_dirty_for_io(fio.encrypted_page))
dec_page_count(fio.sbi, F2FS_DIRTY_META);
set_page_writeback(fio.encrypted_page);
/* allocate block address */
f2fs_wait_on_page_writeback(dn.node_page, NODE);
allocate_data_block(fio.sbi, NULL, fio.blk_addr,
&fio.blk_addr, &sum, CURSEG_COLD_DATA);
fio.rw = WRITE_SYNC;
f2fs_submit_page_mbio(&fio);
dn.data_blkaddr = fio.blk_addr;
set_data_blkaddr(&dn);
f2fs_update_extent_cache(&dn);
set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
if (page->index == 0)
set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
put_page_out:
f2fs_put_page(fio.encrypted_page, 1);
put_out:
f2fs_put_dnode(&dn);
out:
f2fs_put_page(page, 1);
}
static void move_data_page(struct inode *inode, block_t bidx, int gc_type)
{
struct page *page;
page = get_lock_data_page(inode, bidx, true);
if (IS_ERR(page))
return;
if (gc_type == BG_GC) {
if (PageWriteback(page))
goto out;
set_page_dirty(page);
set_cold_data(page);
} else {
struct f2fs_io_info fio = {
.sbi = F2FS_I_SB(inode),
.type = DATA,
.rw = WRITE_SYNC,
.page = page,
.encrypted_page = NULL,
};
f2fs: call set_page_dirty to attach i_wb for cgroup The cgroup attaches inode->i_wb via mark_inode_dirty and when set_page_writeback is called, __inc_wb_stat() updates i_wb's stat. So, we need to explicitly call set_page_dirty->__mark_inode_dirty in prior to any writebacking pages. This patch should resolve the following kernel panic reported by Andreas Reis. https://bugzilla.kernel.org/show_bug.cgi?id=101801 --- Comment #2 from Andreas Reis <andreas.reis@gmail.com> --- BUG: unable to handle kernel NULL pointer dereference at 00000000000000a8 IP: [<ffffffff8149deea>] __percpu_counter_add+0x1a/0x90 PGD 2951ff067 PUD 2df43f067 PMD 0 Oops: 0000 [#1] PREEMPT SMP Modules linked in: CPU: 7 PID: 10356 Comm: gcc Tainted: G W 4.2.0-1-cu #1 Hardware name: Gigabyte Technology Co., Ltd. G1.Sniper M5/G1.Sniper M5, BIOS T01 02/03/2015 task: ffff880295044f80 ti: ffff880295140000 task.ti: ffff880295140000 RIP: 0010:[<ffffffff8149deea>] [<ffffffff8149deea>] __percpu_counter_add+0x1a/0x90 RSP: 0018:ffff880295143ac8 EFLAGS: 00010082 RAX: 0000000000000003 RBX: ffffea000a526d40 RCX: 0000000000000001 RDX: 0000000000000020 RSI: 0000000000000001 RDI: 0000000000000088 RBP: ffff880295143ae8 R08: 0000000000000000 R09: ffff88008f69bb30 R10: 00000000fffffffa R11: 0000000000000000 R12: 0000000000000088 R13: 0000000000000001 R14: ffff88041d099000 R15: ffff880084a205d0 FS: 00007f8549374700(0000) GS:ffff88042f3c0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000a8 CR3: 000000033e1d5000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Stack: 0000000000000000 ffffea000a526d40 ffff880084a20738 ffff880084a20750 ffff880295143b48 ffffffff811cc91e ffff880000000000 0000000000000296 0000000000000000 ffff880417090198 0000000000000000 ffffea000a526d40 Call Trace: [<ffffffff811cc91e>] __test_set_page_writeback+0xde/0x1d0 [<ffffffff813fee87>] do_write_data_page+0xe7/0x3a0 [<ffffffff813faeea>] gc_data_segment+0x5aa/0x640 [<ffffffff813fb0b8>] do_garbage_collect+0x138/0x150 [<ffffffff813fb3fe>] f2fs_gc+0x1be/0x3e0 [<ffffffff81405541>] f2fs_balance_fs+0x81/0x90 [<ffffffff813ee357>] f2fs_unlink+0x47/0x1d0 [<ffffffff81239329>] vfs_unlink+0x109/0x1b0 [<ffffffff8123e3d7>] do_unlinkat+0x287/0x2c0 [<ffffffff8123ebc6>] SyS_unlink+0x16/0x20 [<ffffffff81942e2e>] entry_SYSCALL_64_fastpath+0x12/0x71 Code: 41 5e 5d c3 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 41 55 49 89 f5 41 54 49 89 fc 53 48 83 ec 08 65 ff 05 e6 d9 b6 7e <48> 8b 47 20 48 63 ca 65 8b 18 48 63 db 48 01 f3 48 39 cb 7d 0a RIP [<ffffffff8149deea>] __percpu_counter_add+0x1a/0x90 RSP <ffff880295143ac8> CR2: 00000000000000a8 ---[ end trace 5132449a58ed93a3 ]--- note: gcc[10356] exited with preempt_count 2 Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-07-25 14:29:17 +07:00
set_page_dirty(page);
f2fs_wait_on_page_writeback(page, DATA);
if (clear_page_dirty_for_io(page))
inode_dec_dirty_pages(inode);
set_cold_data(page);
do_write_data_page(&fio);
clear_cold_data(page);
}
out:
f2fs_put_page(page, 1);
}
/*
* This function tries to get parent node of victim data block, and identifies
* data block validity. If the block is valid, copy that with cold status and
* modify parent node.
* If the parent node is not valid or the data block address is different,
* the victim data block is ignored.
*/
static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
struct gc_inode_list *gc_list, unsigned int segno, int gc_type)
{
struct super_block *sb = sbi->sb;
struct f2fs_summary *entry;
block_t start_addr;
int off;
int phase = 0;
start_addr = START_BLOCK(sbi, segno);
next_step:
entry = sum;
f2fs: give a chance to merge IOs by IO scheduler Previously, background GC submits many 4KB read requests to load victim blocks and/or its (i)node blocks. ... f2fs_gc : f2fs_readpage: ino = 1, page_index = 0xb61, blkaddr = 0x3b964ed f2fs_gc : block_rq_complete: 8,16 R () 499854968 + 8 [0] f2fs_gc : f2fs_readpage: ino = 1, page_index = 0xb6f, blkaddr = 0x3b964ee f2fs_gc : block_rq_complete: 8,16 R () 499854976 + 8 [0] f2fs_gc : f2fs_readpage: ino = 1, page_index = 0xb79, blkaddr = 0x3b964ef f2fs_gc : block_rq_complete: 8,16 R () 499854984 + 8 [0] ... However, by the fact that many IOs are sequential, we can give a chance to merge the IOs by IO scheduler. In order to do that, let's use blk_plug. ... f2fs_gc : f2fs_iget: ino = 143 f2fs_gc : f2fs_readpage: ino = 143, page_index = 0x1c6, blkaddr = 0x2e6ee f2fs_gc : f2fs_iget: ino = 143 f2fs_gc : f2fs_readpage: ino = 143, page_index = 0x1c7, blkaddr = 0x2e6ef <idle> : block_rq_complete: 8,16 R () 1519616 + 8 [0] <idle> : block_rq_complete: 8,16 R () 1519848 + 8 [0] <idle> : block_rq_complete: 8,16 R () 1520432 + 96 [0] <idle> : block_rq_complete: 8,16 R () 1520536 + 104 [0] <idle> : block_rq_complete: 8,16 R () 1521008 + 112 [0] <idle> : block_rq_complete: 8,16 R () 1521440 + 152 [0] <idle> : block_rq_complete: 8,16 R () 1521688 + 144 [0] <idle> : block_rq_complete: 8,16 R () 1522128 + 192 [0] <idle> : block_rq_complete: 8,16 R () 1523256 + 328 [0] ... Note that this issue should be addressed in checkpoint, and some readahead flows too. Reviewed-by: Namjae Jeon <namjae.jeon@samsung.com> Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-04-24 11:19:56 +07:00
for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
struct page *data_page;
struct inode *inode;
struct node_info dni; /* dnode info for the data */
unsigned int ofs_in_node, nofs;
block_t start_bidx;
/* stop BG_GC if there is not enough free sections. */
if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
return 0;
if (check_valid_map(sbi, segno, off) == 0)
continue;
if (phase == 0) {
ra_node_page(sbi, le32_to_cpu(entry->nid));
continue;
}
/* Get an inode by ino with checking validity */
if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs))
continue;
if (phase == 1) {
ra_node_page(sbi, dni.ino);
continue;
}
ofs_in_node = le16_to_cpu(entry->ofs_in_node);
if (phase == 2) {
f2fs: avoid balanc_fs during evict_inode 1. Background Previously, if f2fs tries to move data blocks of an *evicting* inode during the cleaning process, it stops the process incompletely and then restarts the whole process, since it needs a locked inode to grab victim data pages in its address space. In order to get a locked inode, iget_locked() by f2fs_iget() is normally used, but, it waits if the inode is on freeing. So, here is a deadlock scenario. 1. f2fs_evict_inode() <- inode "A" 2. f2fs_balance_fs() 3. f2fs_gc() 4. gc_data_segment() 5. f2fs_iget() <- inode "A" too! If step #1 and #5 treat a same inode "A", step #5 would fall into deadlock since the inode "A" is on freeing. In order to resolve this, f2fs_iget_nowait() which skips __wait_on_freeing_inode() was introduced in step #5, and stops f2fs_gc() to complete f2fs_evict_inode(). 1. f2fs_evict_inode() <- inode "A" 2. f2fs_balance_fs() 3. f2fs_gc() 4. gc_data_segment() 5. f2fs_iget_nowait() <- inode "A", then stop f2fs_gc() w/ -ENOENT 2. Problem and Solution In the above scenario, however, f2fs cannot finish f2fs_evict_inode() only if: o there are not enough free sections, and o f2fs_gc() tries to move data blocks of the *evicting* inode repeatedly. So, the final solution is to use f2fs_iget() and remove f2fs_balance_fs() in f2fs_evict_inode(). The f2fs_evict_inode() actually truncates all the data and node blocks, which means that it doesn't produce any dirty node pages accordingly. So, we don't need to do f2fs_balance_fs() in practical. Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-01-31 13:36:04 +07:00
inode = f2fs_iget(sb, dni.ino);
if (IS_ERR(inode) || is_bad_inode(inode))
continue;
/* if encrypted inode, let's go phase 3 */
if (f2fs_encrypted_inode(inode) &&
S_ISREG(inode->i_mode)) {
add_gc_inode(gc_list, inode);
continue;
}
start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
data_page = get_read_data_page(inode,
start_bidx + ofs_in_node, READA, true);
if (IS_ERR(data_page)) {
iput(inode);
continue;
}
f2fs_put_page(data_page, 0);
add_gc_inode(gc_list, inode);
continue;
}
/* phase 3 */
inode = find_gc_inode(gc_list, dni.ino);
if (inode) {
start_bidx = start_bidx_of_node(nofs, F2FS_I(inode))
+ ofs_in_node;
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
move_encrypted_block(inode, start_bidx);
else
move_data_page(inode, start_bidx, gc_type);
stat_inc_data_blk_count(sbi, 1, gc_type);
}
}
f2fs: give a chance to merge IOs by IO scheduler Previously, background GC submits many 4KB read requests to load victim blocks and/or its (i)node blocks. ... f2fs_gc : f2fs_readpage: ino = 1, page_index = 0xb61, blkaddr = 0x3b964ed f2fs_gc : block_rq_complete: 8,16 R () 499854968 + 8 [0] f2fs_gc : f2fs_readpage: ino = 1, page_index = 0xb6f, blkaddr = 0x3b964ee f2fs_gc : block_rq_complete: 8,16 R () 499854976 + 8 [0] f2fs_gc : f2fs_readpage: ino = 1, page_index = 0xb79, blkaddr = 0x3b964ef f2fs_gc : block_rq_complete: 8,16 R () 499854984 + 8 [0] ... However, by the fact that many IOs are sequential, we can give a chance to merge the IOs by IO scheduler. In order to do that, let's use blk_plug. ... f2fs_gc : f2fs_iget: ino = 143 f2fs_gc : f2fs_readpage: ino = 143, page_index = 0x1c6, blkaddr = 0x2e6ee f2fs_gc : f2fs_iget: ino = 143 f2fs_gc : f2fs_readpage: ino = 143, page_index = 0x1c7, blkaddr = 0x2e6ef <idle> : block_rq_complete: 8,16 R () 1519616 + 8 [0] <idle> : block_rq_complete: 8,16 R () 1519848 + 8 [0] <idle> : block_rq_complete: 8,16 R () 1520432 + 96 [0] <idle> : block_rq_complete: 8,16 R () 1520536 + 104 [0] <idle> : block_rq_complete: 8,16 R () 1521008 + 112 [0] <idle> : block_rq_complete: 8,16 R () 1521440 + 152 [0] <idle> : block_rq_complete: 8,16 R () 1521688 + 144 [0] <idle> : block_rq_complete: 8,16 R () 1522128 + 192 [0] <idle> : block_rq_complete: 8,16 R () 1523256 + 328 [0] ... Note that this issue should be addressed in checkpoint, and some readahead flows too. Reviewed-by: Namjae Jeon <namjae.jeon@samsung.com> Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-04-24 11:19:56 +07:00
if (++phase < 4)
goto next_step;
if (gc_type == FG_GC) {
f2fs_submit_merged_bio(sbi, DATA, WRITE);
/* return 1 only if FG_GC succefully reclaimed one */
if (get_valid_blocks(sbi, segno, 1) == 0)
return 1;
}
return 0;
}
static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
int gc_type)
{
struct sit_info *sit_i = SIT_I(sbi);
int ret;
mutex_lock(&sit_i->sentry_lock);
ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type,
NO_CHECK_TYPE, LFS);
mutex_unlock(&sit_i->sentry_lock);
return ret;
}
static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
struct gc_inode_list *gc_list, int gc_type)
{
struct page *sum_page;
struct f2fs_summary_block *sum;
f2fs: give a chance to merge IOs by IO scheduler Previously, background GC submits many 4KB read requests to load victim blocks and/or its (i)node blocks. ... f2fs_gc : f2fs_readpage: ino = 1, page_index = 0xb61, blkaddr = 0x3b964ed f2fs_gc : block_rq_complete: 8,16 R () 499854968 + 8 [0] f2fs_gc : f2fs_readpage: ino = 1, page_index = 0xb6f, blkaddr = 0x3b964ee f2fs_gc : block_rq_complete: 8,16 R () 499854976 + 8 [0] f2fs_gc : f2fs_readpage: ino = 1, page_index = 0xb79, blkaddr = 0x3b964ef f2fs_gc : block_rq_complete: 8,16 R () 499854984 + 8 [0] ... However, by the fact that many IOs are sequential, we can give a chance to merge the IOs by IO scheduler. In order to do that, let's use blk_plug. ... f2fs_gc : f2fs_iget: ino = 143 f2fs_gc : f2fs_readpage: ino = 143, page_index = 0x1c6, blkaddr = 0x2e6ee f2fs_gc : f2fs_iget: ino = 143 f2fs_gc : f2fs_readpage: ino = 143, page_index = 0x1c7, blkaddr = 0x2e6ef <idle> : block_rq_complete: 8,16 R () 1519616 + 8 [0] <idle> : block_rq_complete: 8,16 R () 1519848 + 8 [0] <idle> : block_rq_complete: 8,16 R () 1520432 + 96 [0] <idle> : block_rq_complete: 8,16 R () 1520536 + 104 [0] <idle> : block_rq_complete: 8,16 R () 1521008 + 112 [0] <idle> : block_rq_complete: 8,16 R () 1521440 + 152 [0] <idle> : block_rq_complete: 8,16 R () 1521688 + 144 [0] <idle> : block_rq_complete: 8,16 R () 1522128 + 192 [0] <idle> : block_rq_complete: 8,16 R () 1523256 + 328 [0] ... Note that this issue should be addressed in checkpoint, and some readahead flows too. Reviewed-by: Namjae Jeon <namjae.jeon@samsung.com> Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-04-24 11:19:56 +07:00
struct blk_plug plug;
int nfree = 0;
/* read segment summary of victim */
sum_page = get_sum_page(sbi, segno);
f2fs: give a chance to merge IOs by IO scheduler Previously, background GC submits many 4KB read requests to load victim blocks and/or its (i)node blocks. ... f2fs_gc : f2fs_readpage: ino = 1, page_index = 0xb61, blkaddr = 0x3b964ed f2fs_gc : block_rq_complete: 8,16 R () 499854968 + 8 [0] f2fs_gc : f2fs_readpage: ino = 1, page_index = 0xb6f, blkaddr = 0x3b964ee f2fs_gc : block_rq_complete: 8,16 R () 499854976 + 8 [0] f2fs_gc : f2fs_readpage: ino = 1, page_index = 0xb79, blkaddr = 0x3b964ef f2fs_gc : block_rq_complete: 8,16 R () 499854984 + 8 [0] ... However, by the fact that many IOs are sequential, we can give a chance to merge the IOs by IO scheduler. In order to do that, let's use blk_plug. ... f2fs_gc : f2fs_iget: ino = 143 f2fs_gc : f2fs_readpage: ino = 143, page_index = 0x1c6, blkaddr = 0x2e6ee f2fs_gc : f2fs_iget: ino = 143 f2fs_gc : f2fs_readpage: ino = 143, page_index = 0x1c7, blkaddr = 0x2e6ef <idle> : block_rq_complete: 8,16 R () 1519616 + 8 [0] <idle> : block_rq_complete: 8,16 R () 1519848 + 8 [0] <idle> : block_rq_complete: 8,16 R () 1520432 + 96 [0] <idle> : block_rq_complete: 8,16 R () 1520536 + 104 [0] <idle> : block_rq_complete: 8,16 R () 1521008 + 112 [0] <idle> : block_rq_complete: 8,16 R () 1521440 + 152 [0] <idle> : block_rq_complete: 8,16 R () 1521688 + 144 [0] <idle> : block_rq_complete: 8,16 R () 1522128 + 192 [0] <idle> : block_rq_complete: 8,16 R () 1523256 + 328 [0] ... Note that this issue should be addressed in checkpoint, and some readahead flows too. Reviewed-by: Namjae Jeon <namjae.jeon@samsung.com> Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-04-24 11:19:56 +07:00
blk_start_plug(&plug);
sum = page_address(sum_page);
/*
* this is to avoid deadlock:
* - lock_page(sum_page) - f2fs_replace_block
* - check_valid_map() - mutex_lock(sentry_lock)
* - mutex_lock(sentry_lock) - change_curseg()
* - lock_page(sum_page)
*/
unlock_page(sum_page);
switch (GET_SUM_TYPE((&sum->footer))) {
case SUM_TYPE_NODE:
nfree = gc_node_segment(sbi, sum->entries, segno, gc_type);
break;
case SUM_TYPE_DATA:
nfree = gc_data_segment(sbi, sum->entries, gc_list,
segno, gc_type);
break;
}
f2fs: give a chance to merge IOs by IO scheduler Previously, background GC submits many 4KB read requests to load victim blocks and/or its (i)node blocks. ... f2fs_gc : f2fs_readpage: ino = 1, page_index = 0xb61, blkaddr = 0x3b964ed f2fs_gc : block_rq_complete: 8,16 R () 499854968 + 8 [0] f2fs_gc : f2fs_readpage: ino = 1, page_index = 0xb6f, blkaddr = 0x3b964ee f2fs_gc : block_rq_complete: 8,16 R () 499854976 + 8 [0] f2fs_gc : f2fs_readpage: ino = 1, page_index = 0xb79, blkaddr = 0x3b964ef f2fs_gc : block_rq_complete: 8,16 R () 499854984 + 8 [0] ... However, by the fact that many IOs are sequential, we can give a chance to merge the IOs by IO scheduler. In order to do that, let's use blk_plug. ... f2fs_gc : f2fs_iget: ino = 143 f2fs_gc : f2fs_readpage: ino = 143, page_index = 0x1c6, blkaddr = 0x2e6ee f2fs_gc : f2fs_iget: ino = 143 f2fs_gc : f2fs_readpage: ino = 143, page_index = 0x1c7, blkaddr = 0x2e6ef <idle> : block_rq_complete: 8,16 R () 1519616 + 8 [0] <idle> : block_rq_complete: 8,16 R () 1519848 + 8 [0] <idle> : block_rq_complete: 8,16 R () 1520432 + 96 [0] <idle> : block_rq_complete: 8,16 R () 1520536 + 104 [0] <idle> : block_rq_complete: 8,16 R () 1521008 + 112 [0] <idle> : block_rq_complete: 8,16 R () 1521440 + 152 [0] <idle> : block_rq_complete: 8,16 R () 1521688 + 144 [0] <idle> : block_rq_complete: 8,16 R () 1522128 + 192 [0] <idle> : block_rq_complete: 8,16 R () 1523256 + 328 [0] ... Note that this issue should be addressed in checkpoint, and some readahead flows too. Reviewed-by: Namjae Jeon <namjae.jeon@samsung.com> Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-04-24 11:19:56 +07:00
blk_finish_plug(&plug);
stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)), gc_type);
stat_inc_call_count(sbi->stat_info);
f2fs_put_page(sum_page, 0);
return nfree;
}
int f2fs_gc(struct f2fs_sb_info *sbi, bool sync)
{
unsigned int segno, i;
int gc_type = sync ? FG_GC : BG_GC;
int sec_freed = 0;
int ret = -EINVAL;
struct cp_control cpc;
struct gc_inode_list gc_list = {
.ilist = LIST_HEAD_INIT(gc_list.ilist),
.iroot = RADIX_TREE_INIT(GFP_NOFS),
};
cpc.reason = __get_cp_reason(sbi);
gc_more:
segno = NULL_SEGNO;
if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
goto stop;
if (unlikely(f2fs_cp_error(sbi))) {
ret = -EIO;
goto stop;
}
if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) {
gc_type = FG_GC;
/*
* If there is no victim and no prefree segment but still not
* enough free sections, we should flush dent/node blocks and do
* garbage collections.
*/
if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi))
write_checkpoint(sbi, &cpc);
else if (has_not_enough_free_secs(sbi, 0))
write_checkpoint(sbi, &cpc);
}
if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type))
goto stop;
ret = 0;
/* readahead multi ssa blocks those have contiguous address */
if (sbi->segs_per_sec > 1)
ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec,
META_SSA, true);
for (i = 0; i < sbi->segs_per_sec; i++) {
/*
* for FG_GC case, halt gcing left segments once failed one
* of segments in selected section to avoid long latency.
*/
if (!do_garbage_collect(sbi, segno + i, &gc_list, gc_type) &&
gc_type == FG_GC)
break;
}
if (i == sbi->segs_per_sec && gc_type == FG_GC)
sec_freed++;
if (gc_type == FG_GC)
sbi->cur_victim_sec = NULL_SEGNO;
if (!sync) {
if (has_not_enough_free_secs(sbi, sec_freed))
goto gc_more;
if (gc_type == FG_GC)
write_checkpoint(sbi, &cpc);
}
stop:
mutex_unlock(&sbi->gc_mutex);
put_gc_inode(&gc_list);
if (sync)
ret = sec_freed ? 0 : -EAGAIN;
return ret;
}
void build_gc_manager(struct f2fs_sb_info *sbi)
{
DIRTY_I(sbi)->v_ops = &default_v_ops;
}