mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2025-01-22 18:09:40 +07:00
934eed395d
This is a bad one. I wonder whether we were so far protected by no_free_segments(sb) usually being smaller than LOGFS_NO_AREAS. Found by Dan Carpenter <dan.carpenter@oracle.com> using smatch. Signed-off-by: Joern Engel <joern@logfs.org> Signed-off-by: Prasad Joshi <prasadjoshi.linux@gmail.com>
733 lines
20 KiB
C
733 lines
20 KiB
C
/*
|
|
* fs/logfs/gc.c - garbage collection code
|
|
*
|
|
* As should be obvious for Linux kernel code, license is GPLv2
|
|
*
|
|
* Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
|
|
*/
|
|
#include "logfs.h"
|
|
#include <linux/sched.h>
|
|
#include <linux/slab.h>
|
|
|
|
/*
|
|
* Wear leveling needs to kick in when the difference between low erase
|
|
* counts and high erase counts gets too big. A good value for "too big"
|
|
* may be somewhat below 10% of maximum erase count for the device.
|
|
* Why not 397, to pick a nice round number with no specific meaning? :)
|
|
*
|
|
* WL_RATELIMIT is the minimum time between two wear level events. A huge
|
|
* number of segments may fulfil the requirements for wear leveling at the
|
|
* same time. If that happens we don't want to cause a latency from hell,
|
|
* but just gently pick one segment every so often and minimize overhead.
|
|
*/
|
|
#define WL_DELTA 397
|
|
#define WL_RATELIMIT 100
|
|
#define MAX_OBJ_ALIASES 2600
|
|
#define SCAN_RATIO 512 /* number of scanned segments per gc'd segment */
|
|
#define LIST_SIZE 64 /* base size of candidate lists */
|
|
#define SCAN_ROUNDS 128 /* maximum number of complete medium scans */
|
|
#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */
|
|
|
|
static int no_free_segments(struct super_block *sb)
|
|
{
|
|
struct logfs_super *super = logfs_super(sb);
|
|
|
|
return super->s_free_list.count;
|
|
}
|
|
|
|
/* journal has distance -1, top-most ifile layer distance 0 */
|
|
static u8 root_distance(struct super_block *sb, gc_level_t __gc_level)
|
|
{
|
|
struct logfs_super *super = logfs_super(sb);
|
|
u8 gc_level = (__force u8)__gc_level;
|
|
|
|
switch (gc_level) {
|
|
case 0: /* fall through */
|
|
case 1: /* fall through */
|
|
case 2: /* fall through */
|
|
case 3:
|
|
/* file data or indirect blocks */
|
|
return super->s_ifile_levels + super->s_iblock_levels - gc_level;
|
|
case 6: /* fall through */
|
|
case 7: /* fall through */
|
|
case 8: /* fall through */
|
|
case 9:
|
|
/* inode file data or indirect blocks */
|
|
return super->s_ifile_levels - (gc_level - 6);
|
|
default:
|
|
printk(KERN_ERR"LOGFS: segment of unknown level %x found\n",
|
|
gc_level);
|
|
WARN_ON(1);
|
|
return super->s_ifile_levels + super->s_iblock_levels;
|
|
}
|
|
}
|
|
|
|
static int segment_is_reserved(struct super_block *sb, u32 segno)
|
|
{
|
|
struct logfs_super *super = logfs_super(sb);
|
|
struct logfs_area *area;
|
|
void *reserved;
|
|
int i;
|
|
|
|
/* Some segments are reserved. Just pretend they were all valid */
|
|
reserved = btree_lookup32(&super->s_reserved_segments, segno);
|
|
if (reserved)
|
|
return 1;
|
|
|
|
/* Currently open segments */
|
|
for_each_area(i) {
|
|
area = super->s_area[i];
|
|
if (area->a_is_open && area->a_segno == segno)
|
|
return 1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void logfs_mark_segment_bad(struct super_block *sb, u32 segno)
|
|
{
|
|
BUG();
|
|
}
|
|
|
|
/*
|
|
* Returns the bytes consumed by valid objects in this segment. Object headers
|
|
* are counted, the segment header is not.
|
|
*/
|
|
static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec,
|
|
gc_level_t *gc_level)
|
|
{
|
|
struct logfs_segment_entry se;
|
|
u32 ec_level;
|
|
|
|
logfs_get_segment_entry(sb, segno, &se);
|
|
if (se.ec_level == cpu_to_be32(BADSEG) ||
|
|
se.valid == cpu_to_be32(RESERVED))
|
|
return RESERVED;
|
|
|
|
ec_level = be32_to_cpu(se.ec_level);
|
|
*ec = ec_level >> 4;
|
|
*gc_level = GC_LEVEL(ec_level & 0xf);
|
|
return be32_to_cpu(se.valid);
|
|
}
|
|
|
|
static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
|
|
u64 bix, gc_level_t gc_level)
|
|
{
|
|
struct inode *inode;
|
|
int err, cookie;
|
|
|
|
inode = logfs_safe_iget(sb, ino, &cookie);
|
|
err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0);
|
|
BUG_ON(err);
|
|
logfs_safe_iput(inode, cookie);
|
|
}
|
|
|
|
static u32 logfs_gc_segment(struct super_block *sb, u32 segno)
|
|
{
|
|
struct logfs_super *super = logfs_super(sb);
|
|
struct logfs_segment_header sh;
|
|
struct logfs_object_header oh;
|
|
u64 ofs, ino, bix;
|
|
u32 seg_ofs, logical_segno, cleaned = 0;
|
|
int err, len, valid;
|
|
gc_level_t gc_level;
|
|
|
|
LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb);
|
|
|
|
btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS);
|
|
err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
|
|
BUG_ON(err);
|
|
gc_level = GC_LEVEL(sh.level);
|
|
logical_segno = be32_to_cpu(sh.segno);
|
|
if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
|
|
logfs_mark_segment_bad(sb, segno);
|
|
cleaned = -1;
|
|
goto out;
|
|
}
|
|
|
|
for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
|
|
seg_ofs + sizeof(oh) < super->s_segsize; ) {
|
|
ofs = dev_ofs(sb, logical_segno, seg_ofs);
|
|
err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh),
|
|
&oh);
|
|
BUG_ON(err);
|
|
|
|
if (!memchr_inv(&oh, 0xff, sizeof(oh)))
|
|
break;
|
|
|
|
if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
|
|
logfs_mark_segment_bad(sb, segno);
|
|
cleaned = super->s_segsize - 1;
|
|
goto out;
|
|
}
|
|
|
|
ino = be64_to_cpu(oh.ino);
|
|
bix = be64_to_cpu(oh.bix);
|
|
len = sizeof(oh) + be16_to_cpu(oh.len);
|
|
valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level);
|
|
if (valid == 1) {
|
|
logfs_cleanse_block(sb, ofs, ino, bix, gc_level);
|
|
cleaned += len;
|
|
} else if (valid == 2) {
|
|
/* Will be invalid upon journal commit */
|
|
cleaned += len;
|
|
}
|
|
seg_ofs += len;
|
|
}
|
|
out:
|
|
btree_remove32(&super->s_reserved_segments, segno);
|
|
return cleaned;
|
|
}
|
|
|
|
static struct gc_candidate *add_list(struct gc_candidate *cand,
|
|
struct candidate_list *list)
|
|
{
|
|
struct rb_node **p = &list->rb_tree.rb_node;
|
|
struct rb_node *parent = NULL;
|
|
struct gc_candidate *cur;
|
|
int comp;
|
|
|
|
cand->list = list;
|
|
while (*p) {
|
|
parent = *p;
|
|
cur = rb_entry(parent, struct gc_candidate, rb_node);
|
|
|
|
if (list->sort_by_ec)
|
|
comp = cand->erase_count < cur->erase_count;
|
|
else
|
|
comp = cand->valid < cur->valid;
|
|
|
|
if (comp)
|
|
p = &parent->rb_left;
|
|
else
|
|
p = &parent->rb_right;
|
|
}
|
|
rb_link_node(&cand->rb_node, parent, p);
|
|
rb_insert_color(&cand->rb_node, &list->rb_tree);
|
|
|
|
if (list->count <= list->maxcount) {
|
|
list->count++;
|
|
return NULL;
|
|
}
|
|
cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node);
|
|
rb_erase(&cand->rb_node, &list->rb_tree);
|
|
cand->list = NULL;
|
|
return cand;
|
|
}
|
|
|
|
static void remove_from_list(struct gc_candidate *cand)
|
|
{
|
|
struct candidate_list *list = cand->list;
|
|
|
|
rb_erase(&cand->rb_node, &list->rb_tree);
|
|
list->count--;
|
|
}
|
|
|
|
static void free_candidate(struct super_block *sb, struct gc_candidate *cand)
|
|
{
|
|
struct logfs_super *super = logfs_super(sb);
|
|
|
|
btree_remove32(&super->s_cand_tree, cand->segno);
|
|
kfree(cand);
|
|
}
|
|
|
|
u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec)
|
|
{
|
|
struct gc_candidate *cand;
|
|
u32 segno;
|
|
|
|
BUG_ON(list->count == 0);
|
|
|
|
cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
|
|
remove_from_list(cand);
|
|
segno = cand->segno;
|
|
if (ec)
|
|
*ec = cand->erase_count;
|
|
free_candidate(sb, cand);
|
|
return segno;
|
|
}
|
|
|
|
/*
|
|
* We have several lists to manage segments with. The reserve_list is used to
|
|
* deal with bad blocks. We try to keep the best (lowest ec) segments on this
|
|
* list.
|
|
* The free_list contains free segments for normal usage. It usually gets the
|
|
* second pick after the reserve_list. But when the free_list is running short
|
|
* it is more important to keep the free_list full than to keep a reserve.
|
|
*
|
|
* Segments that are not free are put onto a per-level low_list. If we have
|
|
* to run garbage collection, we pick a candidate from there. All segments on
|
|
* those lists should have at least some free space so GC will make progress.
|
|
*
|
|
* And last we have the ec_list, which is used to pick segments for wear
|
|
* leveling.
|
|
*
|
|
* If all appropriate lists are full, we simply free the candidate and forget
|
|
* about that segment for a while. We have better candidates for each purpose.
|
|
*/
|
|
static void __add_candidate(struct super_block *sb, struct gc_candidate *cand)
|
|
{
|
|
struct logfs_super *super = logfs_super(sb);
|
|
u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE;
|
|
|
|
if (cand->valid == 0) {
|
|
/* 100% free segments */
|
|
log_gc_noisy("add reserve segment %x (ec %x) at %llx\n",
|
|
cand->segno, cand->erase_count,
|
|
dev_ofs(sb, cand->segno, 0));
|
|
cand = add_list(cand, &super->s_reserve_list);
|
|
if (cand) {
|
|
log_gc_noisy("add free segment %x (ec %x) at %llx\n",
|
|
cand->segno, cand->erase_count,
|
|
dev_ofs(sb, cand->segno, 0));
|
|
cand = add_list(cand, &super->s_free_list);
|
|
}
|
|
} else {
|
|
/* good candidates for Garbage Collection */
|
|
if (cand->valid < full)
|
|
cand = add_list(cand, &super->s_low_list[cand->dist]);
|
|
/* good candidates for wear leveling,
|
|
* segments that were recently written get ignored */
|
|
if (cand)
|
|
cand = add_list(cand, &super->s_ec_list);
|
|
}
|
|
if (cand)
|
|
free_candidate(sb, cand);
|
|
}
|
|
|
|
static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec,
|
|
u8 dist)
|
|
{
|
|
struct logfs_super *super = logfs_super(sb);
|
|
struct gc_candidate *cand;
|
|
|
|
cand = kmalloc(sizeof(*cand), GFP_NOFS);
|
|
if (!cand)
|
|
return -ENOMEM;
|
|
|
|
cand->segno = segno;
|
|
cand->valid = valid;
|
|
cand->erase_count = ec;
|
|
cand->dist = dist;
|
|
|
|
btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS);
|
|
__add_candidate(sb, cand);
|
|
return 0;
|
|
}
|
|
|
|
static void remove_segment_from_lists(struct super_block *sb, u32 segno)
|
|
{
|
|
struct logfs_super *super = logfs_super(sb);
|
|
struct gc_candidate *cand;
|
|
|
|
cand = btree_lookup32(&super->s_cand_tree, segno);
|
|
if (cand) {
|
|
remove_from_list(cand);
|
|
free_candidate(sb, cand);
|
|
}
|
|
}
|
|
|
|
static void scan_segment(struct super_block *sb, u32 segno)
|
|
{
|
|
u32 valid, ec = 0;
|
|
gc_level_t gc_level = 0;
|
|
u8 dist;
|
|
|
|
if (segment_is_reserved(sb, segno))
|
|
return;
|
|
|
|
remove_segment_from_lists(sb, segno);
|
|
valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
|
|
if (valid == RESERVED)
|
|
return;
|
|
|
|
dist = root_distance(sb, gc_level);
|
|
add_candidate(sb, segno, valid, ec, dist);
|
|
}
|
|
|
|
static struct gc_candidate *first_in_list(struct candidate_list *list)
|
|
{
|
|
if (list->count == 0)
|
|
return NULL;
|
|
return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
|
|
}
|
|
|
|
/*
|
|
* Find the best segment for garbage collection. Main criterion is
|
|
* the segment requiring the least effort to clean. Secondary
|
|
* criterion is to GC on the lowest level available.
|
|
*
|
|
* So we search the least effort segment on the lowest level first,
|
|
* then move up and pick another segment iff is requires significantly
|
|
* less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison.
|
|
*/
|
|
static struct gc_candidate *get_candidate(struct super_block *sb)
|
|
{
|
|
struct logfs_super *super = logfs_super(sb);
|
|
int i, max_dist;
|
|
struct gc_candidate *cand = NULL, *this;
|
|
|
|
max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS - 1);
|
|
|
|
for (i = max_dist; i >= 0; i--) {
|
|
this = first_in_list(&super->s_low_list[i]);
|
|
if (!this)
|
|
continue;
|
|
if (!cand)
|
|
cand = this;
|
|
if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid)
|
|
cand = this;
|
|
}
|
|
return cand;
|
|
}
|
|
|
|
static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
|
|
{
|
|
struct logfs_super *super = logfs_super(sb);
|
|
gc_level_t gc_level;
|
|
u32 cleaned, valid, segno, ec;
|
|
u8 dist;
|
|
|
|
if (!cand) {
|
|
log_gc("GC attempted, but no candidate found\n");
|
|
return 0;
|
|
}
|
|
|
|
segno = cand->segno;
|
|
dist = cand->dist;
|
|
valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
|
|
free_candidate(sb, cand);
|
|
log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n",
|
|
segno, (u64)segno << super->s_segshift,
|
|
dist, no_free_segments(sb), valid,
|
|
super->s_free_bytes);
|
|
cleaned = logfs_gc_segment(sb, segno);
|
|
log_gc("GC segment #%02x complete - now %x valid\n", segno,
|
|
valid - cleaned);
|
|
BUG_ON(cleaned != valid);
|
|
return 1;
|
|
}
|
|
|
|
static int logfs_gc_once(struct super_block *sb)
|
|
{
|
|
struct gc_candidate *cand;
|
|
|
|
cand = get_candidate(sb);
|
|
if (cand)
|
|
remove_from_list(cand);
|
|
return __logfs_gc_once(sb, cand);
|
|
}
|
|
|
|
/* returns 1 if a wrap occurs, 0 otherwise */
|
|
static int logfs_scan_some(struct super_block *sb)
|
|
{
|
|
struct logfs_super *super = logfs_super(sb);
|
|
u32 segno;
|
|
int i, ret = 0;
|
|
|
|
segno = super->s_sweeper;
|
|
for (i = SCAN_RATIO; i > 0; i--) {
|
|
segno++;
|
|
if (segno >= super->s_no_segs) {
|
|
segno = 0;
|
|
ret = 1;
|
|
/* Break out of the loop. We want to read a single
|
|
* block from the segment size on next invocation if
|
|
* SCAN_RATIO is set to match block size
|
|
*/
|
|
break;
|
|
}
|
|
|
|
scan_segment(sb, segno);
|
|
}
|
|
super->s_sweeper = segno;
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* In principle, this function should loop forever, looking for GC candidates
|
|
* and moving data. LogFS is designed in such a way that this loop is
|
|
* guaranteed to terminate.
|
|
*
|
|
* Limiting the loop to some iterations serves purely to catch cases when
|
|
* these guarantees have failed. An actual endless loop is an obvious bug
|
|
* and should be reported as such.
|
|
*/
|
|
static void __logfs_gc_pass(struct super_block *sb, int target)
|
|
{
|
|
struct logfs_super *super = logfs_super(sb);
|
|
struct logfs_block *block;
|
|
int round, progress, last_progress = 0;
|
|
|
|
/*
|
|
* Doing too many changes to the segfile at once would result
|
|
* in a large number of aliases. Write the journal before
|
|
* things get out of hand.
|
|
*/
|
|
if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES)
|
|
logfs_write_anchor(sb);
|
|
|
|
if (no_free_segments(sb) >= target &&
|
|
super->s_no_object_aliases < MAX_OBJ_ALIASES)
|
|
return;
|
|
|
|
log_gc("__logfs_gc_pass(%x)\n", target);
|
|
for (round = 0; round < SCAN_ROUNDS; ) {
|
|
if (no_free_segments(sb) >= target)
|
|
goto write_alias;
|
|
|
|
/* Sync in-memory state with on-medium state in case they
|
|
* diverged */
|
|
logfs_write_anchor(sb);
|
|
round += logfs_scan_some(sb);
|
|
if (no_free_segments(sb) >= target)
|
|
goto write_alias;
|
|
progress = logfs_gc_once(sb);
|
|
if (progress)
|
|
last_progress = round;
|
|
else if (round - last_progress > 2)
|
|
break;
|
|
continue;
|
|
|
|
/*
|
|
* The goto logic is nasty, I just don't know a better way to
|
|
* code it. GC is supposed to ensure two things:
|
|
* 1. Enough free segments are available.
|
|
* 2. The number of aliases is bounded.
|
|
* When 1. is achieved, we take a look at 2. and write back
|
|
* some alias-containing blocks, if necessary. However, after
|
|
* each such write we need to go back to 1., as writes can
|
|
* consume free segments.
|
|
*/
|
|
write_alias:
|
|
if (super->s_no_object_aliases < MAX_OBJ_ALIASES)
|
|
return;
|
|
if (list_empty(&super->s_object_alias)) {
|
|
/* All aliases are still in btree */
|
|
return;
|
|
}
|
|
log_gc("Write back one alias\n");
|
|
block = list_entry(super->s_object_alias.next,
|
|
struct logfs_block, alias_list);
|
|
block->ops->write_block(block);
|
|
/*
|
|
* To round off the nasty goto logic, we reset round here. It
|
|
* is a safety-net for GC not making any progress and limited
|
|
* to something reasonably small. If incremented it for every
|
|
* single alias, the loop could terminate rather quickly.
|
|
*/
|
|
round = 0;
|
|
}
|
|
LOGFS_BUG(sb);
|
|
}
|
|
|
|
static int wl_ratelimit(struct super_block *sb, u64 *next_event)
|
|
{
|
|
struct logfs_super *super = logfs_super(sb);
|
|
|
|
if (*next_event < super->s_gec) {
|
|
*next_event = super->s_gec + WL_RATELIMIT;
|
|
return 0;
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
static void logfs_wl_pass(struct super_block *sb)
|
|
{
|
|
struct logfs_super *super = logfs_super(sb);
|
|
struct gc_candidate *wl_cand, *free_cand;
|
|
|
|
if (wl_ratelimit(sb, &super->s_wl_gec_ostore))
|
|
return;
|
|
|
|
wl_cand = first_in_list(&super->s_ec_list);
|
|
if (!wl_cand)
|
|
return;
|
|
free_cand = first_in_list(&super->s_free_list);
|
|
if (!free_cand)
|
|
return;
|
|
|
|
if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) {
|
|
remove_from_list(wl_cand);
|
|
__logfs_gc_once(sb, wl_cand);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* The journal needs wear leveling as well. But moving the journal is an
|
|
* expensive operation so we try to avoid it as much as possible. And if we
|
|
* have to do it, we move the whole journal, not individual segments.
|
|
*
|
|
* Ratelimiting is not strictly necessary here, it mainly serves to avoid the
|
|
* calculations. First we check whether moving the journal would be a
|
|
* significant improvement. That means that a) the current journal segments
|
|
* have more wear than the future journal segments and b) the current journal
|
|
* segments have more wear than normal ostore segments.
|
|
* Rationale for b) is that we don't have to move the journal if it is aging
|
|
* less than the ostore, even if the reserve segments age even less (they are
|
|
* excluded from wear leveling, after all).
|
|
* Next we check that the superblocks have less wear than the journal. Since
|
|
* moving the journal requires writing the superblocks, we have to protect the
|
|
* superblocks even more than the journal.
|
|
*
|
|
* Also we double the acceptable wear difference, compared to ostore wear
|
|
* leveling. Journal data is read and rewritten rapidly, comparatively. So
|
|
* soft errors have much less time to accumulate and we allow the journal to
|
|
* be a bit worse than the ostore.
|
|
*/
|
|
static void logfs_journal_wl_pass(struct super_block *sb)
|
|
{
|
|
struct logfs_super *super = logfs_super(sb);
|
|
struct gc_candidate *cand;
|
|
u32 min_journal_ec = -1, max_reserve_ec = 0;
|
|
int i;
|
|
|
|
if (wl_ratelimit(sb, &super->s_wl_gec_journal))
|
|
return;
|
|
|
|
if (super->s_reserve_list.count < super->s_no_journal_segs) {
|
|
/* Reserve is not full enough to move complete journal */
|
|
return;
|
|
}
|
|
|
|
journal_for_each(i)
|
|
if (super->s_journal_seg[i])
|
|
min_journal_ec = min(min_journal_ec,
|
|
super->s_journal_ec[i]);
|
|
cand = rb_entry(rb_first(&super->s_free_list.rb_tree),
|
|
struct gc_candidate, rb_node);
|
|
max_reserve_ec = cand->erase_count;
|
|
for (i = 0; i < 2; i++) {
|
|
struct logfs_segment_entry se;
|
|
u32 segno = seg_no(sb, super->s_sb_ofs[i]);
|
|
u32 ec;
|
|
|
|
logfs_get_segment_entry(sb, segno, &se);
|
|
ec = be32_to_cpu(se.ec_level) >> 4;
|
|
max_reserve_ec = max(max_reserve_ec, ec);
|
|
}
|
|
|
|
if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) {
|
|
do_logfs_journal_wl_pass(sb);
|
|
}
|
|
}
|
|
|
|
void logfs_gc_pass(struct super_block *sb)
|
|
{
|
|
struct logfs_super *super = logfs_super(sb);
|
|
|
|
//BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex));
|
|
/* Write journal before free space is getting saturated with dirty
|
|
* objects.
|
|
*/
|
|
if (super->s_dirty_used_bytes + super->s_dirty_free_bytes
|
|
+ LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes)
|
|
logfs_write_anchor(sb);
|
|
__logfs_gc_pass(sb, super->s_total_levels);
|
|
logfs_wl_pass(sb);
|
|
logfs_journal_wl_pass(sb);
|
|
}
|
|
|
|
static int check_area(struct super_block *sb, int i)
|
|
{
|
|
struct logfs_super *super = logfs_super(sb);
|
|
struct logfs_area *area = super->s_area[i];
|
|
gc_level_t gc_level;
|
|
u32 cleaned, valid, ec;
|
|
u32 segno = area->a_segno;
|
|
u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
|
|
|
|
if (!area->a_is_open)
|
|
return 0;
|
|
|
|
if (super->s_devops->can_write_buf(sb, ofs) == 0)
|
|
return 0;
|
|
|
|
printk(KERN_INFO"LogFS: Possibly incomplete write at %llx\n", ofs);
|
|
/*
|
|
* The device cannot write back the write buffer. Most likely the
|
|
* wbuf was already written out and the system crashed at some point
|
|
* before the journal commit happened. In that case we wouldn't have
|
|
* to do anything. But if the crash happened before the wbuf was
|
|
* written out correctly, we must GC this segment. So assume the
|
|
* worst and always do the GC run.
|
|
*/
|
|
area->a_is_open = 0;
|
|
valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
|
|
cleaned = logfs_gc_segment(sb, segno);
|
|
if (cleaned != valid)
|
|
return -EIO;
|
|
return 0;
|
|
}
|
|
|
|
int logfs_check_areas(struct super_block *sb)
|
|
{
|
|
int i, err;
|
|
|
|
for_each_area(i) {
|
|
err = check_area(sb, i);
|
|
if (err)
|
|
return err;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static void logfs_init_candlist(struct candidate_list *list, int maxcount,
|
|
int sort_by_ec)
|
|
{
|
|
list->count = 0;
|
|
list->maxcount = maxcount;
|
|
list->sort_by_ec = sort_by_ec;
|
|
list->rb_tree = RB_ROOT;
|
|
}
|
|
|
|
int logfs_init_gc(struct super_block *sb)
|
|
{
|
|
struct logfs_super *super = logfs_super(sb);
|
|
int i;
|
|
|
|
btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool);
|
|
logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1);
|
|
logfs_init_candlist(&super->s_reserve_list,
|
|
super->s_bad_seg_reserve, 1);
|
|
for_each_area(i)
|
|
logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0);
|
|
logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1);
|
|
return 0;
|
|
}
|
|
|
|
static void logfs_cleanup_list(struct super_block *sb,
|
|
struct candidate_list *list)
|
|
{
|
|
struct gc_candidate *cand;
|
|
|
|
while (list->count) {
|
|
cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate,
|
|
rb_node);
|
|
remove_from_list(cand);
|
|
free_candidate(sb, cand);
|
|
}
|
|
BUG_ON(list->rb_tree.rb_node);
|
|
}
|
|
|
|
void logfs_cleanup_gc(struct super_block *sb)
|
|
{
|
|
struct logfs_super *super = logfs_super(sb);
|
|
int i;
|
|
|
|
if (!super->s_free_list.count)
|
|
return;
|
|
|
|
/*
|
|
* FIXME: The btree may still contain a single empty node. So we
|
|
* call the grim visitor to clean up that mess. Btree code should
|
|
* do it for us, really.
|
|
*/
|
|
btree_grim_visitor32(&super->s_cand_tree, 0, NULL);
|
|
logfs_cleanup_list(sb, &super->s_free_list);
|
|
logfs_cleanup_list(sb, &super->s_reserve_list);
|
|
for_each_area(i)
|
|
logfs_cleanup_list(sb, &super->s_low_list[i]);
|
|
logfs_cleanup_list(sb, &super->s_ec_list);
|
|
}
|