linux_dsm_epyc7002/fs/gfs2/log.c


// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
* Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
*/
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include <linux/gfs2_ondisk.h>
#include <linux/crc32.h>
#include <linux/crc32c.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/list_sort.h>
#include "gfs2.h"
#include "incore.h"
#include "bmap.h"
#include "glock.h"
#include "log.h"
#include "lops.h"
#include "meta_io.h"
#include "util.h"
#include "dir.h"
#include "trace_gfs2.h"
static void gfs2_log_shutdown(struct gfs2_sbd *sdp);
/**
* gfs2_struct2blk - compute the number of log descriptor blocks needed
* @sdp: the filesystem
* @nstruct: the number of structures
*
* Compute the number of log descriptor blocks needed to hold a certain number
* of structures of a certain size.
*
* Returns: the number of blocks needed (minimum is always 1)
*/
unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct)
{
unsigned int blks;
unsigned int first, second;
blks = 1;
first = sdp->sd_ldptrs;
if (nstruct > first) {
second = sdp->sd_inptrs;
blks += DIV_ROUND_UP(nstruct - first, second);
}
return blks;
}
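/*
 * Illustrative example (the figures below are assumed values, not the real
 * block-size dependent sd_ldptrs and sd_inptrs): if first = 500 and
 * second = 508, then nstruct = 1700 entries need
 * 1 + DIV_ROUND_UP(1700 - 500, 508) = 1 + 3 = 4 descriptor blocks.
 */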
/**
* gfs2_remove_from_ail - Remove an entry from the ail lists, updating counters
* @bd: The gfs2_bufdata to remove
*
* The ail lock _must_ be held when calling this function
*
*/
static void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
{
bd->bd_tr = NULL;
list_del_init(&bd->bd_ail_st_list);
list_del_init(&bd->bd_ail_gl_list);
atomic_dec(&bd->bd_gl->gl_ail_count);
brelse(bd->bd_bh);
}
/**
* gfs2_ail1_start_one - Start I/O on a part of the AIL
* @sdp: the filesystem
* @wbc: The writeback control structure
* @tr: The transaction to start I/O on
*
*/
static int gfs2_ail1_start_one(struct gfs2_sbd *sdp,
struct writeback_control *wbc,
struct gfs2_trans *tr)
__releases(&sdp->sd_ail_lock)
__acquires(&sdp->sd_ail_lock)
{
struct gfs2_glock *gl = NULL;
struct address_space *mapping;
struct gfs2_bufdata *bd, *s;
struct buffer_head *bh;
int ret = 0;
list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list, bd_ail_st_list) {
bh = bd->bd_bh;
gfs2_assert(sdp, bd->bd_tr == tr);
if (!buffer_busy(bh)) {
if (buffer_uptodate(bh)) {
list_move(&bd->bd_ail_st_list,
&tr->tr_ail2_list);
continue;
}
if (!cmpxchg(&sdp->sd_log_error, 0, -EIO)) {
gfs2_io_error_bh(sdp, bh);
gfs2_withdraw_delayed(sdp);
}
}
if (gfs2_withdrawn(sdp)) {
gfs2_remove_from_ail(bd);
continue;
}
if (!buffer_dirty(bh))
continue;
if (gl == bd->bd_gl)
continue;
gl = bd->bd_gl;
list_move(&bd->bd_ail_st_list, &tr->tr_ail1_list);
mapping = bh->b_page->mapping;
if (!mapping)
continue;
spin_unlock(&sdp->sd_ail_lock);
ret = generic_writepages(mapping, wbc);
spin_lock(&sdp->sd_ail_lock);
if (ret || wbc->nr_to_write <= 0)
break;
return -EBUSY;
}
return ret;
}
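/**
 * dump_ail_list - print the state of the ail1 lists for debugging
 * @sdp: the filesystem
 *
 * Called when gfs2_ail1_flush has been stuck for an excessively long time;
 * logs every buffer still on the ail1 lists along with its buffer_head
 * state bits.
 */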
static void dump_ail_list(struct gfs2_sbd *sdp)
{
struct gfs2_trans *tr;
struct gfs2_bufdata *bd;
struct buffer_head *bh;
fs_err(sdp, "Error: In gfs2_ail1_flush for ten minutes! t=%d\n",
current->journal_info ? 1 : 0);
list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) {
list_for_each_entry_reverse(bd, &tr->tr_ail1_list,
bd_ail_st_list) {
bh = bd->bd_bh;
fs_err(sdp, "bd %p: blk:0x%llx bh=%p ", bd,
(unsigned long long)bd->bd_blkno, bh);
if (!bh) {
fs_err(sdp, "\n");
continue;
}
fs_err(sdp, "0x%llx up2:%d dirt:%d lkd:%d req:%d "
"map:%d new:%d ar:%d aw:%d delay:%d "
"io err:%d unwritten:%d dfr:%d pin:%d esc:%d\n",
(unsigned long long)bh->b_blocknr,
buffer_uptodate(bh), buffer_dirty(bh),
buffer_locked(bh), buffer_req(bh),
buffer_mapped(bh), buffer_new(bh),
buffer_async_read(bh), buffer_async_write(bh),
buffer_delay(bh), buffer_write_io_error(bh),
buffer_unwritten(bh),
buffer_defer_completion(bh),
buffer_pinned(bh), buffer_escaped(bh));
}
}
}
/**
* gfs2_ail1_flush - start writeback of some ail1 entries
* @sdp: The super block
* @wbc: The writeback control structure
*
* Writes back some ail1 entries, according to the limits in the
* writeback control structure
*/
void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
{
struct list_head *head = &sdp->sd_ail1_list;
struct gfs2_trans *tr;
struct blk_plug plug;
int ret;
unsigned long flush_start = jiffies;
trace_gfs2_ail_flush(sdp, wbc, 1);
blk_start_plug(&plug);
spin_lock(&sdp->sd_ail_lock);
restart:
ret = 0;
if (time_after(jiffies, flush_start + (HZ * 600))) {
dump_ail_list(sdp);
goto out;
}
list_for_each_entry_reverse(tr, head, tr_list) {
if (wbc->nr_to_write <= 0)
break;
ret = gfs2_ail1_start_one(sdp, wbc, tr);
if (ret) {
if (ret == -EBUSY)
goto restart;
break;
}
}
out:
spin_unlock(&sdp->sd_ail_lock);
blk_finish_plug(&plug);
if (ret) {
gfs2_lm(sdp, "gfs2_ail1_start_one (generic_writepages) "
"returned: %d\n", ret);
gfs2_withdraw(sdp);
}
trace_gfs2_ail_flush(sdp, wbc, 0);
}
/**
* gfs2_ail1_start - start writeback of all ail1 entries
* @sdp: The superblock
*/
static void gfs2_ail1_start(struct gfs2_sbd *sdp)
{
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
.nr_to_write = LONG_MAX,
.range_start = 0,
.range_end = LLONG_MAX,
};
return gfs2_ail1_flush(sdp, &wbc);
}
/**
* gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced
* @sdp: the filesystem
* @tr: the transaction
* @max_revokes: If nonzero, issue revokes for the bd items for written buffers
*
*/
static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
int *max_revokes)
{
struct gfs2_bufdata *bd, *s;
struct buffer_head *bh;
list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list,
bd_ail_st_list) {
bh = bd->bd_bh;
gfs2_assert(sdp, bd->bd_tr == tr);
/*
* If another process flagged an io error, e.g. writing to the
* journal, error all other bhs and move them off the ail1 to
* prevent a tight loop when unmount tries to flush ail1,
* regardless of whether they're still busy. If no outside
* errors were found and the buffer is busy, move to the next.
* If the ail buffer is not busy and caught an error, flag it
* for others.
*/
if (!sdp->sd_log_error && buffer_busy(bh))
continue;
if (!buffer_uptodate(bh) &&
!cmpxchg(&sdp->sd_log_error, 0, -EIO)) {
gfs2_io_error_bh(sdp, bh);
gfs2_withdraw_delayed(sdp);
}
/*
* If we have space for revokes and the bd is no longer on any
* buf list, we can just add a revoke for it immediately and
* avoid having to put it on the ail2 list, where it would need
* to be revoked later.
*/
if (*max_revokes && list_empty(&bd->bd_list)) {
gfs2_add_revoke(sdp, bd);
(*max_revokes)--;
continue;
}
list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list);
}
}
/**
* gfs2_ail1_empty - Try to empty the ail1 lists
* @sdp: The superblock
* @max_revokes: If non-zero, add revokes where appropriate
*
* Tries to empty the ail1 lists, starting with the oldest first
*/
static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes)
{
struct gfs2_trans *tr, *s;
int oldest_tr = 1;
int ret;
spin_lock(&sdp->sd_ail_lock);
list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) {
gfs2_ail1_empty_one(sdp, tr, &max_revokes);
if (list_empty(&tr->tr_ail1_list) && oldest_tr)
list_move(&tr->tr_list, &sdp->sd_ail2_list);
else
oldest_tr = 0;
}
ret = list_empty(&sdp->sd_ail1_list);
spin_unlock(&sdp->sd_ail_lock);
if (test_bit(SDF_WITHDRAWING, &sdp->sd_flags)) {
gfs2_lm(sdp, "fatal: I/O error(s)\n");
gfs2_withdraw(sdp);
}
return ret;
}
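/**
 * gfs2_ail1_wait - wait for I/O on an ail1 buffer to complete
 * @sdp: The superblock
 *
 * Finds the first locked (in-flight) buffer on the ail1 lists, drops the
 * ail lock and waits for that single buffer, then returns.
 */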
static void gfs2_ail1_wait(struct gfs2_sbd *sdp)
{
struct gfs2_trans *tr;
struct gfs2_bufdata *bd;
struct buffer_head *bh;
spin_lock(&sdp->sd_ail_lock);
list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) {
list_for_each_entry(bd, &tr->tr_ail1_list, bd_ail_st_list) {
bh = bd->bd_bh;
if (!buffer_locked(bh))
continue;
get_bh(bh);
spin_unlock(&sdp->sd_ail_lock);
wait_on_buffer(bh);
brelse(bh);
return;
}
}
spin_unlock(&sdp->sd_ail_lock);
}
/**
* gfs2_ail_empty_tr - empty one of the ail lists of a transaction
* @sdp: The superblock
* @tr: the transaction
* @head: the ail list to empty (tr_ail1_list or tr_ail2_list)
*/
static void gfs2_ail_empty_tr(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
struct list_head *head)
{
struct gfs2_bufdata *bd;
while (!list_empty(head)) {
bd = list_first_entry(head, struct gfs2_bufdata,
bd_ail_st_list);
gfs2_assert(sdp, bd->bd_tr == tr);
gfs2_remove_from_ail(bd);
}
}
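/**
 * ail2_empty - remove completed transactions from the ail2 list
 * @sdp: The superblock
 * @new_tail: the new journal tail block
 *
 * Frees every ail2 transaction whose first block lies between the old tail
 * and @new_tail, taking journal wrap-around into account.
 */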
static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
{
struct gfs2_trans *tr, *safe;
unsigned int old_tail = sdp->sd_log_tail;
int wrap = (new_tail < old_tail);
int a, b, rm;
spin_lock(&sdp->sd_ail_lock);
list_for_each_entry_safe(tr, safe, &sdp->sd_ail2_list, tr_list) {
a = (old_tail <= tr->tr_first);
b = (tr->tr_first < new_tail);
rm = (wrap) ? (a || b) : (a && b);
if (!rm)
continue;
gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list);
list_del(&tr->tr_list);
gfs2_assert_warn(sdp, list_empty(&tr->tr_ail1_list));
gfs2_assert_warn(sdp, list_empty(&tr->tr_ail2_list));
kfree(tr);
}
spin_unlock(&sdp->sd_ail_lock);
}
/**
* gfs2_log_release - Release a given number of log blocks
* @sdp: The GFS2 superblock
* @blks: The number of blocks
*
*/
void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
{
atomic_add(blks, &sdp->sd_log_blks_free);
trace_gfs2_log_blocks(sdp, blks);
gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
sdp->sd_jdesc->jd_blocks);
up_read(&sdp->sd_log_flush_lock);
}
/**
* gfs2_log_reserve - Make a log reservation
* @sdp: The GFS2 superblock
* @blks: The number of blocks to reserve
*
* Note that we never give out the last few blocks of the journal. That's
* due to the fact that there is a small number of header blocks
* associated with each log flush. The exact number can't be known until
* flush time, so we ensure that we have just enough free blocks at all
* times to avoid running out during a log flush.
*
* We no longer flush the log here, instead we wake up logd to do that
* for us. To avoid the thundering herd and to ensure that we deal fairly
* with queued waiters, we use an exclusive wait. This means that when we
* get woken with enough journal space to get our reservation, we need to
* wake the next waiter on the list.
*
* Returns: errno
*/
int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
{
int ret = 0;
unsigned reserved_blks = 7 * (4096 / sdp->sd_vfs->s_blocksize);
unsigned wanted = blks + reserved_blks;
DEFINE_WAIT(wait);
int did_wait = 0;
unsigned int free_blocks;
if (gfs2_assert_warn(sdp, blks) ||
gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
return -EINVAL;
atomic_add(blks, &sdp->sd_log_blks_needed);
retry:
free_blocks = atomic_read(&sdp->sd_log_blks_free);
if (unlikely(free_blocks <= wanted)) {
do {
prepare_to_wait_exclusive(&sdp->sd_log_waitq, &wait,
TASK_UNINTERRUPTIBLE);
wake_up(&sdp->sd_logd_waitq);
did_wait = 1;
if (atomic_read(&sdp->sd_log_blks_free) <= wanted)
io_schedule();
free_blocks = atomic_read(&sdp->sd_log_blks_free);
} while(free_blocks <= wanted);
finish_wait(&sdp->sd_log_waitq, &wait);
}
atomic_inc(&sdp->sd_reserving_log);
if (atomic_cmpxchg(&sdp->sd_log_blks_free, free_blocks,
free_blocks - blks) != free_blocks) {
if (atomic_dec_and_test(&sdp->sd_reserving_log))
wake_up(&sdp->sd_reserving_log_wait);
goto retry;
}
atomic_sub(blks, &sdp->sd_log_blks_needed);
trace_gfs2_log_blocks(sdp, -blks);
/*
* If we waited, then so might others, wake them up _after_ we get
* our share of the log.
*/
if (unlikely(did_wait))
wake_up(&sdp->sd_log_waitq);
down_read(&sdp->sd_log_flush_lock);
if (unlikely(!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))) {
gfs2_log_release(sdp, blks);
ret = -EROFS;
}
if (atomic_dec_and_test(&sdp->sd_reserving_log))
wake_up(&sdp->sd_reserving_log_wait);
return ret;
}
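/*
 * Minimal usage sketch (hypothetical caller; the real callers live in the
 * transaction code): every successful reservation takes sd_log_flush_lock
 * for read, which gfs2_log_release() drops again.
 *
 *	if (gfs2_log_reserve(sdp, blks) == 0) {
 *		... build and commit the transaction ...
 *		gfs2_log_release(sdp, unused_blks);
 *	}
 */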
/**
* log_distance - Compute distance between two journal blocks
* @sdp: The GFS2 superblock
* @newer: The most recent journal block of the pair
* @older: The older journal block of the pair
*
* Compute the distance (in the journal direction) between two
* blocks in the journal
*
* Returns: the distance in blocks
*/
static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer,
unsigned int older)
{
int dist;
dist = newer - older;
if (dist < 0)
dist += sdp->sd_jdesc->jd_blocks;
return dist;
}
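/*
 * Example: with a journal of jd_blocks = 8192, newer = 10 and older = 8100
 * give 10 - 8100 = -8090, which wraps to -8090 + 8192 = 102 blocks.
 */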
/**
* calc_reserved - Calculate the number of blocks to reserve when
* refunding a transaction's unused buffers.
* @sdp: The GFS2 superblock
*
* This is complex. We need to reserve room for all our currently used
* metadata buffers (e.g. normal file I/O rewriting file time stamps) and
* all our journaled data buffers for journaled files (e.g. files in the
* meta_fs like rindex, or files for which chattr +j was done.)
* If we don't reserve enough space, gfs2_log_refund and gfs2_log_flush
* will count it as free space (sd_log_blks_free) and corruption will follow.
*
* We can have metadata bufs and jdata bufs in the same journal. So each
* type gets its own log header, for which we need to reserve a block.
* In fact, each type has the potential for needing more than one header
* in cases where we have more buffers than will fit on a journal page.
* Metadata journal entries take up half the space of journaled buffer entries.
* Thus, metadata entries have buf_limit (502) and journaled buffers have
* databuf_limit (251) before they cause a wrap around.
*
* Also, we need to reserve blocks for revoke journal entries and one for an
* overall header for the lot.
*
* Returns: the number of blocks reserved
*/
static unsigned int calc_reserved(struct gfs2_sbd *sdp)
{
unsigned int reserved = 0;
unsigned int mbuf;
unsigned int dbuf;
struct gfs2_trans *tr = sdp->sd_log_tr;
if (tr) {
mbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm;
dbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm;
reserved = mbuf + dbuf;
/* Account for header blocks */
reserved += DIV_ROUND_UP(mbuf, buf_limit(sdp));
reserved += DIV_ROUND_UP(dbuf, databuf_limit(sdp));
}
if (sdp->sd_log_committed_revoke > 0)
reserved += gfs2_struct2blk(sdp, sdp->sd_log_committed_revoke);
/* One for the overall header */
if (reserved)
reserved++;
return reserved;
}
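/*
 * Worked example for calc_reserved() (illustrative figures, using the
 * 4K-block limits quoted above): 600 new metadata buffers and 100 new
 * jdata buffers need 600 + 100 blocks, DIV_ROUND_UP(600, 502) = 2 plus
 * DIV_ROUND_UP(100, 251) = 1 header blocks, any revoke blocks from
 * gfs2_struct2blk(), and 1 overall header: 704 blocks plus revokes.
 */
/**
 * current_tail - work out the current journal tail
 * @sdp: The GFS2 superblock
 *
 * Returns: the first block of the oldest ail1 transaction, or the current
 *          log head if the ail1 list is empty
 */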
static unsigned int current_tail(struct gfs2_sbd *sdp)
{
struct gfs2_trans *tr;
unsigned int tail;
spin_lock(&sdp->sd_ail_lock);
if (list_empty(&sdp->sd_ail1_list)) {
tail = sdp->sd_log_head;
} else {
tr = list_last_entry(&sdp->sd_ail1_list, struct gfs2_trans,
tr_list);
tail = tr->tr_first;
}
spin_unlock(&sdp->sd_ail_lock);
return tail;
}
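/**
 * log_pull_tail - advance the journal tail
 * @sdp: The GFS2 superblock
 * @new_tail: the block at which the tail should now sit
 *
 * Empties the ail2 transactions that fall behind the new tail and returns
 * the reclaimed distance to the pool of free log blocks.
 */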
static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
{
unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail);
ail2_empty(sdp, new_tail);
atomic_add(dist, &sdp->sd_log_blks_free);
trace_gfs2_log_blocks(sdp, dist);
gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
sdp->sd_jdesc->jd_blocks);
sdp->sd_log_tail = new_tail;
}
void log_flush_wait(struct gfs2_sbd *sdp)
{
DEFINE_WAIT(wait);
if (atomic_read(&sdp->sd_log_in_flight)) {
do {
prepare_to_wait(&sdp->sd_log_flush_wait, &wait,
TASK_UNINTERRUPTIBLE);
if (atomic_read(&sdp->sd_log_in_flight))
io_schedule();
} while(atomic_read(&sdp->sd_log_in_flight));
finish_wait(&sdp->sd_log_flush_wait, &wait);
}
}
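/*
 * list_sort() comparator: order the ordered-write inodes by ascending
 * on-disk block address (i_no_addr).
 */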
static int ip_cmp(void *priv, struct list_head *a, struct list_head *b)
{
struct gfs2_inode *ipa, *ipb;
ipa = list_entry(a, struct gfs2_inode, i_ordered);
ipb = list_entry(b, struct gfs2_inode, i_ordered);
if (ipa->i_no_addr < ipb->i_no_addr)
return -1;
if (ipa->i_no_addr > ipb->i_no_addr)
return 1;
return 0;
}
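/**
 * gfs2_ordered_write - start writeback of the ordered-data inodes
 * @sdp: The GFS2 superblock
 *
 * Sorts the ordered-write list by block address and starts writeback of
 * each inode's dirty pages; inodes with no pages are dropped from the list,
 * the rest are spliced back once writeback has been started.
 */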
static void gfs2_ordered_write(struct gfs2_sbd *sdp)
{
struct gfs2_inode *ip;
LIST_HEAD(written);
spin_lock(&sdp->sd_ordered_lock);
list_sort(NULL, &sdp->sd_log_ordered, &ip_cmp);
while (!list_empty(&sdp->sd_log_ordered)) {
ip = list_first_entry(&sdp->sd_log_ordered, struct gfs2_inode, i_ordered);
if (ip->i_inode.i_mapping->nrpages == 0) {
test_and_clear_bit(GIF_ORDERED, &ip->i_flags);
list_del(&ip->i_ordered);
continue;
}
list_move(&ip->i_ordered, &written);
spin_unlock(&sdp->sd_ordered_lock);
filemap_fdatawrite(ip->i_inode.i_mapping);
spin_lock(&sdp->sd_ordered_lock);
}
list_splice(&written, &sdp->sd_log_ordered);
spin_unlock(&sdp->sd_ordered_lock);
}
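/**
 * gfs2_ordered_wait - wait for the ordered-data writeback to complete
 * @sdp: The GFS2 superblock
 *
 * Waits for the page writeback started by gfs2_ordered_write(), removing
 * each inode from the ordered-write list as it is processed.
 */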
static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
{
struct gfs2_inode *ip;
spin_lock(&sdp->sd_ordered_lock);
while (!list_empty(&sdp->sd_log_ordered)) {
ip = list_first_entry(&sdp->sd_log_ordered, struct gfs2_inode, i_ordered);
list_del(&ip->i_ordered);
WARN_ON(!test_and_clear_bit(GIF_ORDERED, &ip->i_flags));
if (ip->i_inode.i_mapping->nrpages == 0)
continue;
spin_unlock(&sdp->sd_ordered_lock);
filemap_fdatawait(ip->i_inode.i_mapping);
spin_lock(&sdp->sd_ordered_lock);
}
spin_unlock(&sdp->sd_ordered_lock);
}
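/**
 * gfs2_ordered_del_inode - remove an inode from the ordered-write list
 * @ip: The GFS2 inode
 */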
void gfs2_ordered_del_inode(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
spin_lock(&sdp->sd_ordered_lock);
if (test_and_clear_bit(GIF_ORDERED, &ip->i_flags))
list_del(&ip->i_ordered);
spin_unlock(&sdp->sd_ordered_lock);
}
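/**
 * gfs2_add_revoke - turn a written-back buffer into a pending revoke
 * @sdp: The GFS2 superblock
 * @bd: The bufdata whose buffer has been written back
 *
 * Detaches @bd from its buffer head and the AIL and queues it on
 * sd_log_revokes. The glock's first pending revoke takes an extra glock
 * reference, which gfs2_glock_remove_revoke() puts again.
 */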
void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
{
struct buffer_head *bh = bd->bd_bh;
struct gfs2_glock *gl = bd->bd_gl;
bh->b_private = NULL;
bd->bd_blkno = bh->b_blocknr;
gfs2_remove_from_ail(bd); /* drops ref on bh */
bd->bd_bh = NULL;
sdp->sd_log_num_revoke++;
if (atomic_inc_return(&gl->gl_revokes) == 1)
gfs2_glock_hold(gl);
set_bit(GLF_LFLUSH, &gl->gl_flags);
list_add(&bd->bd_list, &sdp->sd_log_revokes);
}
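/**
 * gfs2_glock_remove_revoke - account for one written revoke on a glock
 * @gl: The glock
 *
 * When the last pending revoke for @gl has been written, clears GLF_LFLUSH
 * and drops the reference taken in gfs2_add_revoke().
 */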
void gfs2_glock_remove_revoke(struct gfs2_glock *gl)
{
if (atomic_dec_return(&gl->gl_revokes) == 0) {
clear_bit(GLF_LFLUSH, &gl->gl_flags);
gfs2_glock_queue_put(gl);
}
}
/**
* gfs2_write_revokes - Add as many revokes to the system transaction as we can
* @sdp: The GFS2 superblock
*
* Our usual strategy is to defer writing revokes as much as we can in the hope
* that we'll eventually overwrite the journal, which will make those revokes
* go away. This changes when we flush the log: at that point, there will
* likely be some left-over space in the last revoke block of that transaction.
* We can fill that space with additional revokes for blocks that have already
* been written back. This will basically come at no cost now, and will save
* us from having to keep track of those blocks on the AIL2 list later.
*/
void gfs2_write_revokes(struct gfs2_sbd *sdp)
{
/* number of revokes we still have room for */
int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64);
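	/*
	 * Work out how many more revoke entries fit in the blocks the next
	 * log flush will write anyway: the first revoke block carries a
	 * gfs2_log_descriptor, each further block a gfs2_meta_header, and
	 * every entry takes sizeof(u64) bytes.
	 */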
gfs2_log_lock(sdp);
while (sdp->sd_log_num_revoke > max_revokes)
max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64);
max_revokes -= sdp->sd_log_num_revoke;
if (!sdp->sd_log_num_revoke) {
atomic_dec(&sdp->sd_log_blks_free);
/* If no blocks have been reserved, we need to also
* reserve a block for the header */
if (!sdp->sd_log_blks_reserved)
atomic_dec(&sdp->sd_log_blks_free);
}
gfs2_ail1_empty(sdp, max_revokes);
gfs2_log_unlock(sdp);
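	/*
	 * If gfs2_ail1_empty() did not queue any revokes after all, give the
	 * block(s) reserved above back to the free pool.
	 */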
if (!sdp->sd_log_num_revoke) {
atomic_inc(&sdp->sd_log_blks_free);
if (!sdp->sd_log_blks_reserved)
atomic_inc(&sdp->sd_log_blks_free);
}
}
/**
* gfs2_write_log_header - Write a journal log header buffer at lblock
* @sdp: The GFS2 superblock
* @jd: journal descriptor of the journal to which we are writing
* @seq: sequence number
* @tail: tail of the log
* @lblock: value for lh_blkno (block number relative to start of journal)
* @flags: log header flags GFS2_LOG_HEAD_*
* @op_flags: flags to pass to the bio
*/
void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
u64 seq, u32 tail, u32 lblock, u32 flags,
int op_flags)
{
struct gfs2_log_header *lh;
u32 hash, crc;
struct page *page;
struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
struct timespec64 tv;
struct super_block *sb = sdp->sd_vfs;
u64 dblock;
if (gfs2_withdrawn(sdp))
goto out;
page = mempool_alloc(gfs2_page_pool, GFP_NOIO);
lh = page_address(page);
clear_page(lh);
lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
lh->lh_header.__pad0 = cpu_to_be64(0);
lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
lh->lh_sequence = cpu_to_be64(seq);
lh->lh_flags = cpu_to_be32(flags);
lh->lh_tail = cpu_to_be32(tail);
lh->lh_blkno = cpu_to_be32(lblock);
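	/* The lh_hash checksum only covers the original (v1) header fields. */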
hash = ~crc32(~0, lh, LH_V1_SIZE);
lh->lh_hash = cpu_to_be32(hash);
ktime_get_coarse_real_ts64(&tv);
lh->lh_nsec = cpu_to_be32(tv.tv_nsec);
lh->lh_sec = cpu_to_be64(tv.tv_sec);
if (!list_empty(&jd->extent_list))
dblock = gfs2_log_bmap(jd, lblock);
else {
int ret = gfs2_lblk_to_dblk(jd->jd_inode, lblock, &dblock);
if (gfs2_assert_withdraw(sdp, ret == 0))
return;
}
lh->lh_addr = cpu_to_be64(dblock);
lh->lh_jinode = cpu_to_be64(GFS2_I(jd->jd_inode)->i_no_addr);
/* We may only write local statfs, quota, etc., when writing to our
own journal. The values are left 0 when recovering a journal
different from our own. */
if (!(flags & GFS2_LOG_HEAD_RECOVERY)) {
lh->lh_statfs_addr =
cpu_to_be64(GFS2_I(sdp->sd_sc_inode)->i_no_addr);
lh->lh_quota_addr =
cpu_to_be64(GFS2_I(sdp->sd_qc_inode)->i_no_addr);
spin_lock(&sdp->sd_statfs_spin);
lh->lh_local_total = cpu_to_be64(l_sc->sc_total);
lh->lh_local_free = cpu_to_be64(l_sc->sc_free);
lh->lh_local_dinodes = cpu_to_be64(l_sc->sc_dinodes);
spin_unlock(&sdp->sd_statfs_spin);
}
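	/*
	 * lh_crc sits directly after the v1 header fields; the crc32c covers
	 * the rest of the block beyond that 4-byte field, so the checksum
	 * never includes itself.
	 */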
BUILD_BUG_ON(offsetof(struct gfs2_log_header, lh_crc) != LH_V1_SIZE);
crc = crc32c(~0, (void *)lh + LH_V1_SIZE + 4,
sb->s_blocksize - LH_V1_SIZE - 4);
lh->lh_crc = cpu_to_be32(crc);
gfs2_log_write(sdp, page, sb->s_blocksize, 0, dblock);
gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE | op_flags);
out:
log_flush_wait(sdp);
}
/**
* log_write_header - Get and initialize a journal header buffer
* @sdp: The GFS2 superblock
* @flags: The log header flags, including log header origin
*/
static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
{
unsigned int tail;
int op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC;
enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state);
gfs2_assert_withdraw(sdp, (state != SFS_FROZEN));
tail = current_tail(sdp);
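	/*
	 * Without barrier support, REQ_PREFLUSH/REQ_FUA cannot be relied on
	 * to order the header behind the data it describes, so wait for the
	 * ordered data and the outstanding log I/O explicitly instead.
	 */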
if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) {
gfs2_ordered_wait(sdp);
log_flush_wait(sdp);
op_flags = REQ_SYNC | REQ_META | REQ_PRIO;
}
sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
gfs2_write_log_header(sdp, sdp->sd_jdesc, sdp->sd_log_sequence++, tail,
sdp->sd_log_flush_head, flags, op_flags);
gfs2_log_incr_head(sdp);
if (sdp->sd_log_tail != tail)
log_pull_tail(sdp, tail);
}
/**
* ail_drain - drain the ail lists after a withdraw
* @sdp: Pointer to GFS2 superblock
*/
static void ail_drain(struct gfs2_sbd *sdp)
{
struct gfs2_trans *tr;
spin_lock(&sdp->sd_ail_lock);
/*
* For transactions on the sd_ail1_list we need to drain both the
* ail1 and ail2 lists. That's because function gfs2_ail1_start_one
* (temporarily) moves items from its tr_ail1 list to tr_ail2 list
* before revokes are sent for that block. Items on the sd_ail2_list
* should have already gotten beyond that point, so no need.
*/
while (!list_empty(&sdp->sd_ail1_list)) {
tr = list_first_entry(&sdp->sd_ail1_list, struct gfs2_trans,
tr_list);
gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail1_list);
gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list);
list_del(&tr->tr_list);
kfree(tr);
}
while (!list_empty(&sdp->sd_ail2_list)) {
tr = list_first_entry(&sdp->sd_ail2_list, struct gfs2_trans,
tr_list);
gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list);
list_del(&tr->tr_list);
kfree(tr);
}
spin_unlock(&sdp->sd_ail_lock);
}
/**
* gfs2_log_flush - flush incore transaction(s)
* @sdp: the filesystem
* @gl: The glock structure to flush. If NULL, flush the whole incore log
* @flags: The log header flags: GFS2_LOG_HEAD_FLUSH_* and debug flags
*
*/
void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
{
struct gfs2_trans *tr = NULL;
enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state);
down_write(&sdp->sd_log_flush_lock);
/*
* Do this check while holding the log_flush_lock to prevent new
* buffers from being added to the ail via gfs2_pin()
*/
if (gfs2_withdrawn(sdp))
goto out;
/* Log might have been flushed while we waited for the flush lock */
if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) {
up_write(&sdp->sd_log_flush_lock);
return;
}
trace_gfs2_log_flush(sdp, 1, flags);
if (flags & GFS2_LOG_HEAD_FLUSH_SHUTDOWN)
clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
sdp->sd_log_flush_head = sdp->sd_log_head;
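	/*
	 * Detach the current incore transaction; once its buffers have been
	 * written below it is placed on the AIL (or freed if it has nothing
	 * left to track).
	 */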
tr = sdp->sd_log_tr;
if (tr) {
sdp->sd_log_tr = NULL;
INIT_LIST_HEAD(&tr->tr_ail1_list);
INIT_LIST_HEAD(&tr->tr_ail2_list);
tr->tr_first = sdp->sd_log_flush_head;
if (unlikely(state == SFS_FROZEN))
if (gfs2_assert_withdraw_delayed(sdp,
!tr->tr_num_buf_new && !tr->tr_num_databuf_new))
goto out;
}
if (unlikely(state == SFS_FROZEN))
if (gfs2_assert_withdraw_delayed(sdp, !sdp->sd_log_num_revoke))
goto out;
if (gfs2_assert_withdraw_delayed(sdp,
sdp->sd_log_num_revoke == sdp->sd_log_committed_revoke))
goto out;
gfs2_ordered_write(sdp);
if (gfs2_withdrawn(sdp))
goto out;
lops_before_commit(sdp, tr);
if (gfs2_withdrawn(sdp))
goto out;
gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE);
if (gfs2_withdrawn(sdp))
goto out;
if (sdp->sd_log_head != sdp->sd_log_flush_head) {
log_flush_wait(sdp);
log_write_header(sdp, flags);
} else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle) {
atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
trace_gfs2_log_blocks(sdp, -1);
log_write_header(sdp, flags);
}
if (gfs2_withdrawn(sdp))
goto out;
lops_after_commit(sdp, tr);
gfs2_log_lock(sdp);
sdp->sd_log_head = sdp->sd_log_flush_head;
sdp->sd_log_blks_reserved = 0;
sdp->sd_log_committed_revoke = 0;
spin_lock(&sdp->sd_ail_lock);
if (tr && !list_empty(&tr->tr_ail1_list)) {
list_add(&tr->tr_list, &sdp->sd_ail1_list);
tr = NULL;
}
spin_unlock(&sdp->sd_ail_lock);
gfs2_log_unlock(sdp);
if (!(flags & GFS2_LOG_HEAD_FLUSH_NORMAL)) {
if (!sdp->sd_log_idle) {
for (;;) {
gfs2_ail1_start(sdp);
gfs2_ail1_wait(sdp);
if (gfs2_ail1_empty(sdp, 0))
break;
}
if (gfs2_withdrawn(sdp))
goto out;
atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
trace_gfs2_log_blocks(sdp, -1);
log_write_header(sdp, flags);
sdp->sd_log_head = sdp->sd_log_flush_head;
}
if (flags & (GFS2_LOG_HEAD_FLUSH_SHUTDOWN |
GFS2_LOG_HEAD_FLUSH_FREEZE))
gfs2_log_shutdown(sdp);
if (flags & GFS2_LOG_HEAD_FLUSH_FREEZE)
atomic_set(&sdp->sd_freeze_state, SFS_FROZEN);
}
out:
if (gfs2_withdrawn(sdp)) {
ail_drain(sdp); /* frees all transactions */
tr = NULL;
}
trace_gfs2_log_flush(sdp, 0, flags);
up_write(&sdp->sd_log_flush_lock);
kfree(tr);
}
/**
* gfs2_merge_trans - Merge a new transaction into a cached transaction
* @old: Original transaction to be expanded
* @new: New transaction to be merged
*/
static void gfs2_merge_trans(struct gfs2_trans *old, struct gfs2_trans *new)
{
WARN_ON_ONCE(!test_bit(TR_ATTACHED, &old->tr_flags));
old->tr_num_buf_new += new->tr_num_buf_new;
old->tr_num_databuf_new += new->tr_num_databuf_new;
old->tr_num_buf_rm += new->tr_num_buf_rm;
old->tr_num_databuf_rm += new->tr_num_databuf_rm;
old->tr_num_revoke += new->tr_num_revoke;
old->tr_num_revoke_rm += new->tr_num_revoke_rm;
list_splice_tail_init(&new->tr_databuf, &old->tr_databuf);
list_splice_tail_init(&new->tr_buf, &old->tr_buf);
}
static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
unsigned int reserved;
unsigned int unused;
unsigned int maxres;
gfs2_log_lock(sdp);
if (sdp->sd_log_tr) {
gfs2_merge_trans(sdp->sd_log_tr, tr);
} else if (tr->tr_num_buf_new || tr->tr_num_databuf_new) {
gfs2_assert_withdraw(sdp, test_bit(TR_ALLOCED, &tr->tr_flags));
sdp->sd_log_tr = tr;
set_bit(TR_ATTACHED, &tr->tr_flags);
}
sdp->sd_log_committed_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
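	/*
	 * Recalculate how many log blocks the attached transaction(s) really
	 * need and return any surplus reservation to the free pool.
	 */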
reserved = calc_reserved(sdp);
maxres = sdp->sd_log_blks_reserved + tr->tr_reserved;
gfs2_assert_withdraw(sdp, maxres >= reserved);
unused = maxres - reserved;
atomic_add(unused, &sdp->sd_log_blks_free);
trace_gfs2_log_blocks(sdp, unused);
gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
sdp->sd_jdesc->jd_blocks);
sdp->sd_log_blks_reserved = reserved;
gfs2_log_unlock(sdp);
}
/**
* gfs2_log_commit - Commit a transaction to the log
* @sdp: the filesystem
* @tr: the transaction
*
* We wake up gfs2_logd if the number of pinned blocks exceeds thresh1
* or the total number of used blocks (pinned blocks plus AIL blocks)
* is greater than thresh2.
*
* At mount time thresh1 is 1/3rd of journal size, thresh2 is 2/3rd of
* journal size.
*/
void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
log_refund(sdp, tr);
if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) ||
((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) >
atomic_read(&sdp->sd_log_thresh2)))
wake_up(&sdp->sd_logd_waitq);
}
/**
* gfs2_log_shutdown - write a shutdown header into a journal
* @sdp: the filesystem
*
*/
static void gfs2_log_shutdown(struct gfs2_sbd *sdp)
{
gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list));
sdp->sd_log_flush_head = sdp->sd_log_head;
log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT | GFS2_LFC_SHUTDOWN);
gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail);
gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list));
sdp->sd_log_head = sdp->sd_log_flush_head;
sdp->sd_log_tail = sdp->sd_log_head;
}
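/*
 * thresh1 is the wake-up threshold for pinned blocks waiting to go into the
 * log; thresh2 is the threshold for total journal usage.  gfs2_logd flushes
 * the log or starts AIL writeback when either is crossed.
 */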
static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp)
{
return (atomic_read(&sdp->sd_log_pinned) +
atomic_read(&sdp->sd_log_blks_needed) >=
atomic_read(&sdp->sd_log_thresh1));
}
static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp)
{
unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free);
if (test_and_clear_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags))
return 1;
return used_blocks + atomic_read(&sdp->sd_log_blks_needed) >=
atomic_read(&sdp->sd_log_thresh2);
}
/**
* gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
* @sdp: Pointer to GFS2 superblock
*
* Also, periodically check to make sure that we're using the most recent
* journal index.
*/
int gfs2_logd(void *data)
{
struct gfs2_sbd *sdp = data;
unsigned long t = 1;
DEFINE_WAIT(wait);
bool did_flush;
while (!kthread_should_stop()) {
/* Check for errors writing to the journal */
if (sdp->sd_log_error) {
gfs2_lm(sdp,
"GFS2: fsid=%s: error %d: "
"withdrawing the file system to "
"prevent further damage.\n",
sdp->sd_fsname, sdp->sd_log_error);
gfs2_withdraw(sdp);
}
did_flush = false;
if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
gfs2_ail1_empty(sdp, 0);
gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
GFS2_LFC_LOGD_JFLUSH_REQD);
did_flush = true;
}
if (gfs2_ail_flush_reqd(sdp)) {
gfs2_ail1_start(sdp);
gfs2_ail1_wait(sdp);
gfs2_ail1_empty(sdp, 0);
gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
GFS2_LFC_LOGD_AIL_FLUSH_REQD);
did_flush = true;
}
if (!gfs2_ail_flush_reqd(sdp) || did_flush)
wake_up(&sdp->sd_log_waitq);
t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
try_to_freeze();
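		/*
		 * Sleep for up to gt_logd_secs, but wake up early if a
		 * journal or AIL flush becomes necessary or the thread is
		 * told to stop.
		 */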
do {
prepare_to_wait(&sdp->sd_logd_waitq, &wait,
TASK_INTERRUPTIBLE);
if (!gfs2_ail_flush_reqd(sdp) &&
!gfs2_jrnl_flush_reqd(sdp) &&
!kthread_should_stop())
t = schedule_timeout(t);
} while (t && !gfs2_ail_flush_reqd(sdp) &&
!gfs2_jrnl_flush_reqd(sdp) &&
!kthread_should_stop());
finish_wait(&sdp->sd_logd_waitq, &wait);
}
return 0;
}