linux_dsm_epyc7002/fs/jbd2/checkpoint.c

679 lines
20 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0+
/*
* linux/fs/jbd2/checkpoint.c
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 1999
*
* Copyright 1999 Red Hat Software --- All Rights Reserved
*
* Checkpoint routines for the generic filesystem journaling code.
* Part of the ext2fs journaling system.
*
* Checkpointing is the process of ensuring that a section of the log is
* committed fully to disk, so that that portion of the log can be
* reused.
*/
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <trace/events/jbd2.h>
/*
* Unlink a buffer from a transaction checkpoint list.
*
* Called with j_list_lock held.
*/
static inline void __buffer_unlink_first(struct journal_head *jh)
{
transaction_t *transaction = jh->b_cp_transaction;
jh->b_cpnext->b_cpprev = jh->b_cpprev;
jh->b_cpprev->b_cpnext = jh->b_cpnext;
if (transaction->t_checkpoint_list == jh) {
transaction->t_checkpoint_list = jh->b_cpnext;
if (transaction->t_checkpoint_list == jh)
transaction->t_checkpoint_list = NULL;
}
}
/*
* Unlink a buffer from a transaction checkpoint(io) list.
*
* Called with j_list_lock held.
*/
static inline void __buffer_unlink(struct journal_head *jh)
{
transaction_t *transaction = jh->b_cp_transaction;
__buffer_unlink_first(jh);
if (transaction->t_checkpoint_io_list == jh) {
transaction->t_checkpoint_io_list = jh->b_cpnext;
if (transaction->t_checkpoint_io_list == jh)
transaction->t_checkpoint_io_list = NULL;
}
}
/*
* Move a buffer from the checkpoint list to the checkpoint io list
*
* Called with j_list_lock held
*/
static inline void __buffer_relink_io(struct journal_head *jh)
{
transaction_t *transaction = jh->b_cp_transaction;
__buffer_unlink_first(jh);
if (!transaction->t_checkpoint_io_list) {
jh->b_cpnext = jh->b_cpprev = jh;
} else {
jh->b_cpnext = transaction->t_checkpoint_io_list;
jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
jh->b_cpprev->b_cpnext = jh;
jh->b_cpnext->b_cpprev = jh;
}
transaction->t_checkpoint_io_list = jh;
}
/*
* Try to release a checkpointed buffer from its transaction.
* Returns 1 if we released it and 2 if we also released the
* whole transaction.
*
* Requires j_list_lock
*/
static int __try_to_free_cp_buf(struct journal_head *jh)
{
int ret = 0;
struct buffer_head *bh = jh2bh(jh);
if (jh->b_transaction == NULL && !buffer_locked(bh) &&
!buffer_dirty(bh) && !buffer_write_io_error(bh)) {
JBUFFER_TRACE(jh, "remove from checkpoint list");
ret = __jbd2_journal_remove_checkpoint(jh) + 1;
}
return ret;
}
/*
* __jbd2_log_wait_for_space: wait until there is space in the journal.
*
* Called under j-state_lock *only*. It will be unlocked if we have to wait
* for a checkpoint to free up some space in the log.
*/
void __jbd2_log_wait_for_space(journal_t *journal)
{
int nblocks, space_left;
/* assert_spin_locked(&journal->j_state_lock); */
nblocks = jbd2_space_needed(journal);
while (jbd2_log_space_left(journal) < nblocks) {
write_unlock(&journal->j_state_lock);
mutex_lock_io(&journal->j_checkpoint_mutex);
/*
* Test again, another process may have checkpointed while we
* were waiting for the checkpoint lock. If there are no
* transactions ready to be checkpointed, try to recover
* journal space by calling cleanup_journal_tail(), and if
* that doesn't work, by waiting for the currently committing
* transaction to complete. If there is absolutely no way
* to make progress, this is either a BUG or corrupted
* filesystem, so abort the journal and leave a stack
* trace for forensic evidence.
*/
write_lock(&journal->j_state_lock);
jbd2: jbd2_log_wait_for_space improve error detetcion If EIO happens after we have dropped j_state_lock, we won't notice that the journal has been aborted. So it is reasonable to move this check after we have grabbed the j_checkpoint_mutex and re-grabbed the j_state_lock. This patch helps to prevent false positive complain after EIO. #DMESG: __jbd2_log_wait_for_space: needed 8448 blocks and only had 8386 space available __jbd2_log_wait_for_space: no way to get more journal space in ram1-8 ------------[ cut here ]------------ WARNING: CPU: 15 PID: 6739 at fs/jbd2/checkpoint.c:168 __jbd2_log_wait_for_space+0x188/0x200() Modules linked in: brd iTCO_wdt lpc_ich mfd_core igb ptp dm_mirror dm_region_hash dm_log dm_mod CPU: 15 PID: 6739 Comm: fsstress Tainted: G W 3.17.0-rc2-00429-g684de57 #139 Hardware name: Intel Corporation W2600CR/W2600CR, BIOS SE5C600.86B.99.99.x028.061320111235 06/13/2011 00000000000000a8 ffff88077aaab878 ffffffff815c1a8c 00000000000000a8 0000000000000000 ffff88077aaab8b8 ffffffff8106ce8c ffff88077aaab898 ffff8807c57e6000 ffff8807c57e6028 0000000000002100 ffff8807c57e62f0 Call Trace: [<ffffffff815c1a8c>] dump_stack+0x51/0x6d [<ffffffff8106ce8c>] warn_slowpath_common+0x8c/0xc0 [<ffffffff8106ceda>] warn_slowpath_null+0x1a/0x20 [<ffffffff812419f8>] __jbd2_log_wait_for_space+0x188/0x200 [<ffffffff8123be9a>] start_this_handle+0x4da/0x7b0 [<ffffffff810990e5>] ? local_clock+0x25/0x30 [<ffffffff810aba87>] ? lockdep_init_map+0xe7/0x180 [<ffffffff8123c5bc>] jbd2__journal_start+0xdc/0x1d0 [<ffffffff811f2414>] ? __ext4_new_inode+0x7f4/0x1330 [<ffffffff81222a38>] __ext4_journal_start_sb+0xf8/0x110 [<ffffffff811f2414>] __ext4_new_inode+0x7f4/0x1330 [<ffffffff810ac359>] ? lock_release_holdtime+0x29/0x190 [<ffffffff812025bb>] ext4_create+0x8b/0x150 [<ffffffff8117fe3b>] vfs_create+0x7b/0xb0 [<ffffffff8118097b>] do_last+0x7db/0xcf0 [<ffffffff8117e31d>] ? inode_permission+0x4d/0x50 [<ffffffff811845d2>] path_openat+0x242/0x590 [<ffffffff81191a76>] ? __alloc_fd+0x36/0x140 [<ffffffff81184a6a>] do_filp_open+0x4a/0xb0 [<ffffffff81191b61>] ? __alloc_fd+0x121/0x140 [<ffffffff81172f20>] do_sys_open+0x170/0x220 [<ffffffff8117300e>] SyS_open+0x1e/0x20 [<ffffffff811715d6>] SyS_creat+0x16/0x20 [<ffffffff815c7e12>] system_call_fastpath+0x16/0x1b ---[ end trace cd71c831f82059db ]--- Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2014-09-17 01:50:50 +07:00
if (journal->j_flags & JBD2_ABORT) {
mutex_unlock(&journal->j_checkpoint_mutex);
return;
}
spin_lock(&journal->j_list_lock);
space_left = jbd2_log_space_left(journal);
if (space_left < nblocks) {
int chkpt = journal->j_checkpoint_transactions != NULL;
tid_t tid = 0;
if (journal->j_committing_transaction)
tid = journal->j_committing_transaction->t_tid;
spin_unlock(&journal->j_list_lock);
write_unlock(&journal->j_state_lock);
if (chkpt) {
jbd2_log_do_checkpoint(journal);
} else if (jbd2_cleanup_journal_tail(journal) == 0) {
/* We were able to recover space; yay! */
;
} else if (tid) {
jbd2: drop checkpoint mutex when waiting in __jbd2_log_wait_for_space() While trying to debug an an issue under extreme I/O loading on preempt-rt kernels, the following backtrace was observed via SysRQ output: rm D ffff8802203afbc0 4600 4878 4748 0x00000000 ffff8802217bfb78 0000000000000082 ffff88021fc2bb80 ffff88021fc2bb80 ffff88021fc2bb80 ffff8802217bffd8 ffff8802217bffd8 ffff8802217bffd8 ffff88021f1d4c80 ffff88021fc2bb80 ffff8802217bfb88 ffff88022437b000 Call Trace: [<ffffffff8172dc34>] schedule+0x24/0x70 [<ffffffff81225b5d>] jbd2_log_wait_commit+0xbd/0x140 [<ffffffff81060390>] ? __init_waitqueue_head+0x50/0x50 [<ffffffff81223635>] jbd2_log_do_checkpoint+0xf5/0x520 [<ffffffff81223b09>] __jbd2_log_wait_for_space+0xa9/0x1f0 [<ffffffff8121dc40>] start_this_handle.isra.10+0x2e0/0x530 [<ffffffff81060390>] ? __init_waitqueue_head+0x50/0x50 [<ffffffff8121e0a3>] jbd2__journal_start+0xc3/0x110 [<ffffffff811de7ce>] ? ext4_rmdir+0x6e/0x230 [<ffffffff8121e0fe>] jbd2_journal_start+0xe/0x10 [<ffffffff811f308b>] ext4_journal_start_sb+0x5b/0x160 [<ffffffff811de7ce>] ext4_rmdir+0x6e/0x230 [<ffffffff811435c5>] vfs_rmdir+0xd5/0x140 [<ffffffff8114370f>] do_rmdir+0xdf/0x120 [<ffffffff8105c6b4>] ? task_work_run+0x44/0x80 [<ffffffff81002889>] ? do_notify_resume+0x89/0x100 [<ffffffff817361ae>] ? int_signal+0x12/0x17 [<ffffffff81145d85>] sys_unlinkat+0x25/0x40 [<ffffffff81735f22>] system_call_fastpath+0x16/0x1b What is interesting here, is that we call log_wait_commit, from within wait_for_space, but we are still holding the checkpoint_mutex as it surrounds mostly the whole of wait_for_space. And then, as we are waiting, journal_commit_transaction can run, and if the JBD2_FLUSHED bit is set, then we will also try to take the same checkpoint_mutex. It seems that we need to drop the checkpoint_mutex while sitting in jbd2_log_wait_commit, if we want to guarantee that progress can be made by jbd2_journal_commit_transaction(). There does not seem to be anything preempt-rt specific about this, other then perhaps increasing the odds of it happening. Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
2013-06-13 09:47:35 +07:00
/*
* jbd2_journal_commit_transaction() may want
* to take the checkpoint_mutex if JBD2_FLUSHED
* is set. So we need to temporarily drop it.
*/
mutex_unlock(&journal->j_checkpoint_mutex);
jbd2_log_wait_commit(journal, tid);
jbd2: drop checkpoint mutex when waiting in __jbd2_log_wait_for_space() While trying to debug an an issue under extreme I/O loading on preempt-rt kernels, the following backtrace was observed via SysRQ output: rm D ffff8802203afbc0 4600 4878 4748 0x00000000 ffff8802217bfb78 0000000000000082 ffff88021fc2bb80 ffff88021fc2bb80 ffff88021fc2bb80 ffff8802217bffd8 ffff8802217bffd8 ffff8802217bffd8 ffff88021f1d4c80 ffff88021fc2bb80 ffff8802217bfb88 ffff88022437b000 Call Trace: [<ffffffff8172dc34>] schedule+0x24/0x70 [<ffffffff81225b5d>] jbd2_log_wait_commit+0xbd/0x140 [<ffffffff81060390>] ? __init_waitqueue_head+0x50/0x50 [<ffffffff81223635>] jbd2_log_do_checkpoint+0xf5/0x520 [<ffffffff81223b09>] __jbd2_log_wait_for_space+0xa9/0x1f0 [<ffffffff8121dc40>] start_this_handle.isra.10+0x2e0/0x530 [<ffffffff81060390>] ? __init_waitqueue_head+0x50/0x50 [<ffffffff8121e0a3>] jbd2__journal_start+0xc3/0x110 [<ffffffff811de7ce>] ? ext4_rmdir+0x6e/0x230 [<ffffffff8121e0fe>] jbd2_journal_start+0xe/0x10 [<ffffffff811f308b>] ext4_journal_start_sb+0x5b/0x160 [<ffffffff811de7ce>] ext4_rmdir+0x6e/0x230 [<ffffffff811435c5>] vfs_rmdir+0xd5/0x140 [<ffffffff8114370f>] do_rmdir+0xdf/0x120 [<ffffffff8105c6b4>] ? task_work_run+0x44/0x80 [<ffffffff81002889>] ? do_notify_resume+0x89/0x100 [<ffffffff817361ae>] ? int_signal+0x12/0x17 [<ffffffff81145d85>] sys_unlinkat+0x25/0x40 [<ffffffff81735f22>] system_call_fastpath+0x16/0x1b What is interesting here, is that we call log_wait_commit, from within wait_for_space, but we are still holding the checkpoint_mutex as it surrounds mostly the whole of wait_for_space. And then, as we are waiting, journal_commit_transaction can run, and if the JBD2_FLUSHED bit is set, then we will also try to take the same checkpoint_mutex. It seems that we need to drop the checkpoint_mutex while sitting in jbd2_log_wait_commit, if we want to guarantee that progress can be made by jbd2_journal_commit_transaction(). There does not seem to be anything preempt-rt specific about this, other then perhaps increasing the odds of it happening. Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
2013-06-13 09:47:35 +07:00
write_lock(&journal->j_state_lock);
continue;
} else {
printk(KERN_ERR "%s: needed %d blocks and "
"only had %d space available\n",
__func__, nblocks, space_left);
printk(KERN_ERR "%s: no way to get more "
"journal space in %s\n", __func__,
journal->j_devname);
WARN_ON(1);
jbd2_journal_abort(journal, 0);
}
write_lock(&journal->j_state_lock);
} else {
spin_unlock(&journal->j_list_lock);
}
mutex_unlock(&journal->j_checkpoint_mutex);
}
}
static void
__flush_batch(journal_t *journal, int *batch_count)
{
int i;
jbd2: use WRITE_SYNC in journal checkpoint In journal checkpoint, we write the buffer and wait for its finish. But in cfq, the async queue has a very low priority, and in our test, if there are too many sync queues and every queue is filled up with requests, the write request will be delayed for quite a long time and all the tasks which are waiting for journal space will end with errors like: INFO: task attr_set:3816 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. attr_set D ffff880028393480 0 3816 1 0x00000000 ffff8802073fbae8 0000000000000086 ffff8802140847c8 ffff8800283934e8 ffff8802073fb9d8 ffffffff8103e456 ffff8802140847b8 ffff8801ed728080 ffff8801db4bc080 ffff8801ed728450 ffff880028393480 0000000000000002 Call Trace: [<ffffffff8103e456>] ? __dequeue_entity+0x33/0x38 [<ffffffff8103caad>] ? need_resched+0x23/0x2d [<ffffffff814006a6>] ? thread_return+0xa2/0xbc [<ffffffffa01f6224>] ? jbd2_journal_dirty_metadata+0x116/0x126 [jbd2] [<ffffffffa01f6224>] ? jbd2_journal_dirty_metadata+0x116/0x126 [jbd2] [<ffffffff81400d31>] __mutex_lock_common+0x14e/0x1a9 [<ffffffffa021dbfb>] ? brelse+0x13/0x15 [ext4] [<ffffffff81400ddb>] __mutex_lock_slowpath+0x19/0x1b [<ffffffff81400b2d>] mutex_lock+0x1b/0x32 [<ffffffffa01f927b>] __jbd2_journal_insert_checkpoint+0xe3/0x20c [jbd2] [<ffffffffa01f547b>] start_this_handle+0x438/0x527 [jbd2] [<ffffffff8106f491>] ? autoremove_wake_function+0x0/0x3e [<ffffffffa01f560b>] jbd2_journal_start+0xa1/0xcc [jbd2] [<ffffffffa02353be>] ext4_journal_start_sb+0x57/0x81 [ext4] [<ffffffffa024a314>] ext4_xattr_set+0x6c/0xe3 [ext4] [<ffffffffa024aaff>] ext4_xattr_user_set+0x42/0x4b [ext4] [<ffffffff81145adb>] generic_setxattr+0x6b/0x76 [<ffffffff81146ac0>] __vfs_setxattr_noperm+0x47/0xc0 [<ffffffff81146bb8>] vfs_setxattr+0x7f/0x9a [<ffffffff81146c88>] setxattr+0xb5/0xe8 [<ffffffff81137467>] ? do_filp_open+0x571/0xa6e [<ffffffff81146d26>] sys_fsetxattr+0x6b/0x91 [<ffffffff81002d32>] system_call_fastpath+0x16/0x1b So this patch tries to use WRITE_SYNC in __flush_batch so that the request will be moved into sync queue and handled by cfq timely. We also use the new plug, sot that all the WRITE_SYNC requests can be given as a whole when we unplug it. Signed-off-by: Tao Ma <boyu.mt@taobao.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Cc: Jan Kara <jack@suse.cz> Reported-by: Robin Dong <sanbai@taobao.com>
2011-06-27 23:36:29 +07:00
struct blk_plug plug;
jbd2: use WRITE_SYNC in journal checkpoint In journal checkpoint, we write the buffer and wait for its finish. But in cfq, the async queue has a very low priority, and in our test, if there are too many sync queues and every queue is filled up with requests, the write request will be delayed for quite a long time and all the tasks which are waiting for journal space will end with errors like: INFO: task attr_set:3816 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. attr_set D ffff880028393480 0 3816 1 0x00000000 ffff8802073fbae8 0000000000000086 ffff8802140847c8 ffff8800283934e8 ffff8802073fb9d8 ffffffff8103e456 ffff8802140847b8 ffff8801ed728080 ffff8801db4bc080 ffff8801ed728450 ffff880028393480 0000000000000002 Call Trace: [<ffffffff8103e456>] ? __dequeue_entity+0x33/0x38 [<ffffffff8103caad>] ? need_resched+0x23/0x2d [<ffffffff814006a6>] ? thread_return+0xa2/0xbc [<ffffffffa01f6224>] ? jbd2_journal_dirty_metadata+0x116/0x126 [jbd2] [<ffffffffa01f6224>] ? jbd2_journal_dirty_metadata+0x116/0x126 [jbd2] [<ffffffff81400d31>] __mutex_lock_common+0x14e/0x1a9 [<ffffffffa021dbfb>] ? brelse+0x13/0x15 [ext4] [<ffffffff81400ddb>] __mutex_lock_slowpath+0x19/0x1b [<ffffffff81400b2d>] mutex_lock+0x1b/0x32 [<ffffffffa01f927b>] __jbd2_journal_insert_checkpoint+0xe3/0x20c [jbd2] [<ffffffffa01f547b>] start_this_handle+0x438/0x527 [jbd2] [<ffffffff8106f491>] ? autoremove_wake_function+0x0/0x3e [<ffffffffa01f560b>] jbd2_journal_start+0xa1/0xcc [jbd2] [<ffffffffa02353be>] ext4_journal_start_sb+0x57/0x81 [ext4] [<ffffffffa024a314>] ext4_xattr_set+0x6c/0xe3 [ext4] [<ffffffffa024aaff>] ext4_xattr_user_set+0x42/0x4b [ext4] [<ffffffff81145adb>] generic_setxattr+0x6b/0x76 [<ffffffff81146ac0>] __vfs_setxattr_noperm+0x47/0xc0 [<ffffffff81146bb8>] vfs_setxattr+0x7f/0x9a [<ffffffff81146c88>] setxattr+0xb5/0xe8 [<ffffffff81137467>] ? do_filp_open+0x571/0xa6e [<ffffffff81146d26>] sys_fsetxattr+0x6b/0x91 [<ffffffff81002d32>] system_call_fastpath+0x16/0x1b So this patch tries to use WRITE_SYNC in __flush_batch so that the request will be moved into sync queue and handled by cfq timely. We also use the new plug, sot that all the WRITE_SYNC requests can be given as a whole when we unplug it. Signed-off-by: Tao Ma <boyu.mt@taobao.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Cc: Jan Kara <jack@suse.cz> Reported-by: Robin Dong <sanbai@taobao.com>
2011-06-27 23:36:29 +07:00
blk_start_plug(&plug);
for (i = 0; i < *batch_count; i++)
write_dirty_buffer(journal->j_chkpt_bhs[i], REQ_SYNC);
jbd2: use WRITE_SYNC in journal checkpoint In journal checkpoint, we write the buffer and wait for its finish. But in cfq, the async queue has a very low priority, and in our test, if there are too many sync queues and every queue is filled up with requests, the write request will be delayed for quite a long time and all the tasks which are waiting for journal space will end with errors like: INFO: task attr_set:3816 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. attr_set D ffff880028393480 0 3816 1 0x00000000 ffff8802073fbae8 0000000000000086 ffff8802140847c8 ffff8800283934e8 ffff8802073fb9d8 ffffffff8103e456 ffff8802140847b8 ffff8801ed728080 ffff8801db4bc080 ffff8801ed728450 ffff880028393480 0000000000000002 Call Trace: [<ffffffff8103e456>] ? __dequeue_entity+0x33/0x38 [<ffffffff8103caad>] ? need_resched+0x23/0x2d [<ffffffff814006a6>] ? thread_return+0xa2/0xbc [<ffffffffa01f6224>] ? jbd2_journal_dirty_metadata+0x116/0x126 [jbd2] [<ffffffffa01f6224>] ? jbd2_journal_dirty_metadata+0x116/0x126 [jbd2] [<ffffffff81400d31>] __mutex_lock_common+0x14e/0x1a9 [<ffffffffa021dbfb>] ? brelse+0x13/0x15 [ext4] [<ffffffff81400ddb>] __mutex_lock_slowpath+0x19/0x1b [<ffffffff81400b2d>] mutex_lock+0x1b/0x32 [<ffffffffa01f927b>] __jbd2_journal_insert_checkpoint+0xe3/0x20c [jbd2] [<ffffffffa01f547b>] start_this_handle+0x438/0x527 [jbd2] [<ffffffff8106f491>] ? autoremove_wake_function+0x0/0x3e [<ffffffffa01f560b>] jbd2_journal_start+0xa1/0xcc [jbd2] [<ffffffffa02353be>] ext4_journal_start_sb+0x57/0x81 [ext4] [<ffffffffa024a314>] ext4_xattr_set+0x6c/0xe3 [ext4] [<ffffffffa024aaff>] ext4_xattr_user_set+0x42/0x4b [ext4] [<ffffffff81145adb>] generic_setxattr+0x6b/0x76 [<ffffffff81146ac0>] __vfs_setxattr_noperm+0x47/0xc0 [<ffffffff81146bb8>] vfs_setxattr+0x7f/0x9a [<ffffffff81146c88>] setxattr+0xb5/0xe8 [<ffffffff81137467>] ? do_filp_open+0x571/0xa6e [<ffffffff81146d26>] sys_fsetxattr+0x6b/0x91 [<ffffffff81002d32>] system_call_fastpath+0x16/0x1b So this patch tries to use WRITE_SYNC in __flush_batch so that the request will be moved into sync queue and handled by cfq timely. We also use the new plug, sot that all the WRITE_SYNC requests can be given as a whole when we unplug it. Signed-off-by: Tao Ma <boyu.mt@taobao.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Cc: Jan Kara <jack@suse.cz> Reported-by: Robin Dong <sanbai@taobao.com>
2011-06-27 23:36:29 +07:00
blk_finish_plug(&plug);
for (i = 0; i < *batch_count; i++) {
struct buffer_head *bh = journal->j_chkpt_bhs[i];
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
}
*batch_count = 0;
}
/*
* Perform an actual checkpoint. We take the first transaction on the
* list of transactions to be checkpointed and send all its buffers
* to disk. We submit larger chunks of data at once.
*
* The journal should be locked before calling this function.
* Called with j_checkpoint_mutex held.
*/
int jbd2_log_do_checkpoint(journal_t *journal)
{
struct journal_head *jh;
struct buffer_head *bh;
transaction_t *transaction;
tid_t this_tid;
int result, batch_count = 0;
jbd_debug(1, "Start checkpoint\n");
/*
* First thing: if there are any transactions in the log which
* don't need checkpointing, just eliminate them from the
* journal straight away.
*/
result = jbd2_cleanup_journal_tail(journal);
trace_jbd2_checkpoint(journal, result);
jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
if (result <= 0)
return result;
/*
* OK, we need to start writing disk blocks. Take one transaction
* and write it.
*/
result = 0;
spin_lock(&journal->j_list_lock);
if (!journal->j_checkpoint_transactions)
goto out;
transaction = journal->j_checkpoint_transactions;
jbd2: jbd2 stats through procfs The patch below updates the jbd stats patch to 2.6.20/jbd2. The initial patch was posted by Alex Tomas in December 2005 (http://marc.info/?l=linux-ext4&m=113538565128617&w=2). It provides statistics via procfs such as transaction lifetime and size. Sometimes, investigating performance problems, i find useful to have stats from jbd about transaction's lifetime, size, etc. here is a patch for review and inclusion probably. for example, stats after creation of 3M files in htree directory: [root@bob ~]# cat /proc/fs/jbd/sda/history R/C tid wait run lock flush log hndls block inlog ctime write drop close R 261 8260 2720 0 0 750 9892 8170 8187 C 259 750 0 4885 1 R 262 20 2200 10 0 770 9836 8170 8187 R 263 30 2200 10 0 3070 9812 8170 8187 R 264 0 5000 10 0 1340 0 0 0 C 261 8240 3212 4957 0 R 265 8260 1470 0 0 4640 9854 8170 8187 R 266 0 5000 10 0 1460 0 0 0 C 262 8210 2989 4868 0 R 267 8230 1490 10 0 4440 9875 8171 8188 R 268 0 5000 10 0 1260 0 0 0 C 263 7710 2937 4908 0 R 269 7730 1470 10 0 3330 9841 8170 8187 R 270 0 5000 10 0 830 0 0 0 C 265 8140 3234 4898 0 C 267 720 0 4849 1 R 271 8630 2740 20 0 740 9819 8170 8187 C 269 800 0 4214 1 R 272 40 2170 10 0 830 9716 8170 8187 R 273 40 2280 0 0 3530 9799 8170 8187 R 274 0 5000 10 0 990 0 0 0 where, R - line for transaction's life from T_RUNNING to T_FINISHED C - line for transaction's checkpointing tid - transaction's id wait - for how long we were waiting for new transaction to start (the longest period journal_start() took in this transaction) run - real transaction's lifetime (from T_RUNNING to T_LOCKED lock - how long we were waiting for all handles to close (time the transaction was in T_LOCKED) flush - how long it took to flush all data (data=ordered) log - how long it took to write the transaction to the log hndls - how many handles got to the transaction block - how many blocks got to the transaction inlog - how many blocks are written to the log (block + descriptors) ctime - how long it took to checkpoint the transaction write - how many blocks have been written during checkpointing drop - how many blocks have been dropped during checkpointing close - how many running transactions have been closed to checkpoint this one all times are in msec. [root@bob ~]# cat /proc/fs/jbd/sda/info 280 transaction, each upto 8192 blocks average: 1633ms waiting for transaction 3616ms running transaction 5ms transaction was being locked 1ms flushing data (in ordered mode) 1799ms logging transaction 11781 handles per transaction 5629 blocks per transaction 5641 logged blocks per transaction Signed-off-by: Johann Lombardi <johann.lombardi@bull.net> Signed-off-by: Mariusz Kozlowski <m.kozlowski@tuxland.pl> Signed-off-by: Mingming Cao <cmm@us.ibm.com> Signed-off-by: Eric Sandeen <sandeen@redhat.com>
2008-01-29 11:58:27 +07:00
if (transaction->t_chp_stats.cs_chp_time == 0)
transaction->t_chp_stats.cs_chp_time = jiffies;
this_tid = transaction->t_tid;
restart:
/*
* If someone cleaned up this transaction while we slept, we're
* done (maybe it's a new transaction, but it fell at the same
* address).
*/
if (journal->j_checkpoint_transactions != transaction ||
transaction->t_tid != this_tid)
goto out;
/* checkpoint all of the transaction's buffers */
while (transaction->t_checkpoint_list) {
jh = transaction->t_checkpoint_list;
bh = jh2bh(jh);
if (buffer_locked(bh)) {
get_bh(bh);
spin_unlock(&journal->j_list_lock);
wait_on_buffer(bh);
/* the journal_head may have gone by now */
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
goto retry;
}
if (jh->b_transaction != NULL) {
transaction_t *t = jh->b_transaction;
tid_t tid = t->t_tid;
transaction->t_chp_stats.cs_forced_to_close++;
spin_unlock(&journal->j_list_lock);
if (unlikely(journal->j_flags & JBD2_UNMOUNT))
/*
* The journal thread is dead; so
* starting and waiting for a commit
* to finish will cause us to wait for
* a _very_ long time.
*/
printk(KERN_ERR
"JBD2: %s: Waiting for Godot: block %llu\n",
journal->j_devname, (unsigned long long) bh->b_blocknr);
if (batch_count)
__flush_batch(journal, &batch_count);
jbd2_log_start_commit(journal, tid);
/*
* jbd2_journal_commit_transaction() may want
* to take the checkpoint_mutex if JBD2_FLUSHED
* is set, jbd2_update_log_tail() called by
* jbd2_journal_commit_transaction() may also take
* checkpoint_mutex. So we need to temporarily
* drop it.
*/
mutex_unlock(&journal->j_checkpoint_mutex);
jbd2_log_wait_commit(journal, tid);
mutex_lock_io(&journal->j_checkpoint_mutex);
spin_lock(&journal->j_list_lock);
goto restart;
}
if (!buffer_dirty(bh)) {
if (unlikely(buffer_write_io_error(bh)) && !result)
result = -EIO;
BUFFER_TRACE(bh, "remove from checkpoint");
if (__jbd2_journal_remove_checkpoint(jh))
/* The transaction was released; we're done */
goto out;
continue;
}
/*
* Important: we are about to write the buffer, and
* possibly block, while still holding the journal
* lock. We cannot afford to let the transaction
* logic start messing around with this buffer before
* we write it to disk, as that would break
* recoverability.
*/
BUFFER_TRACE(bh, "queue");
get_bh(bh);
J_ASSERT_BH(bh, !buffer_jwrite(bh));
journal->j_chkpt_bhs[batch_count++] = bh;
__buffer_relink_io(jh);
transaction->t_chp_stats.cs_written++;
if ((batch_count == JBD2_NR_BATCH) ||
need_resched() ||
spin_needbreak(&journal->j_list_lock))
goto unlock_and_flush;
}
if (batch_count) {
unlock_and_flush:
spin_unlock(&journal->j_list_lock);
retry:
if (batch_count)
__flush_batch(journal, &batch_count);
spin_lock(&journal->j_list_lock);
goto restart;
}
/*
* Now we issued all of the transaction's buffers, let's deal
* with the buffers that are out for I/O.
*/
restart2:
/* Did somebody clean up the transaction in the meanwhile? */
if (journal->j_checkpoint_transactions != transaction ||
transaction->t_tid != this_tid)
goto out;
while (transaction->t_checkpoint_io_list) {
jh = transaction->t_checkpoint_io_list;
bh = jh2bh(jh);
if (buffer_locked(bh)) {
get_bh(bh);
spin_unlock(&journal->j_list_lock);
wait_on_buffer(bh);
/* the journal_head may have gone by now */
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
spin_lock(&journal->j_list_lock);
goto restart2;
}
if (unlikely(buffer_write_io_error(bh)) && !result)
result = -EIO;
/*
* Now in whatever state the buffer currently is, we
* know that it has been written out and so we can
* drop it from the list
*/
if (__jbd2_journal_remove_checkpoint(jh))
break;
}
out:
spin_unlock(&journal->j_list_lock);
if (result < 0)
jbd2_journal_abort(journal, result);
else
result = jbd2_cleanup_journal_tail(journal);
return (result < 0) ? result : 0;
}
/*
* Check the list of checkpoint transactions for the journal to see if
* we have already got rid of any since the last update of the log tail
* in the journal superblock. If so, we can instantly roll the
* superblock forward to remove those transactions from the log.
*
* Return <0 on error, 0 on success, 1 if there was nothing to clean up.
*
* Called with the journal lock held.
*
* This is the only part of the journaling code which really needs to be
* aware of transaction aborts. Checkpointing involves writing to the
* main filesystem area rather than to the journal, so it can proceed
* even in abort state, but we must not update the super block if
* checkpointing may have failed. Otherwise, we would lose some metadata
* buffers which should be written-back to the filesystem.
*/
int jbd2_cleanup_journal_tail(journal_t *journal)
{
tid_t first_tid;
unsigned long blocknr;
if (is_journal_aborted(journal))
return -EIO;
if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr))
return 1;
J_ASSERT(blocknr != 0);
/*
* We need to make sure that any blocks that were recently written out
* --- perhaps by jbd2_log_do_checkpoint() --- are flushed out before
* we drop the transactions from the journal. It's unlikely this will
* be necessary, especially with an appropriately sized journal, but we
* need this to guarantee correctness. Fortunately
* jbd2_cleanup_journal_tail() doesn't get called all that often.
*/
if (journal->j_flags & JBD2_BARRIER)
jbd2: use GFP_NOFS in jbd2_cleanup_journal_tail() jbd2_cleanup_journal_tail() can be invoked by jbd2__journal_start() So allocations should be done with GFP_NOFS [Full stack trace snipped from 3.10-rh7] [<ffffffff815c4bd4>] dump_stack+0x19/0x1b [<ffffffff8105dba1>] warn_slowpath_common+0x61/0x80 [<ffffffff8105dcca>] warn_slowpath_null+0x1a/0x20 [<ffffffff815c2142>] slab_pre_alloc_hook.isra.31.part.32+0x15/0x17 [<ffffffff8119c045>] kmem_cache_alloc+0x55/0x210 [<ffffffff811477f5>] ? mempool_alloc_slab+0x15/0x20 [<ffffffff811477f5>] mempool_alloc_slab+0x15/0x20 [<ffffffff81147939>] mempool_alloc+0x69/0x170 [<ffffffff815cb69e>] ? _raw_spin_unlock_irq+0xe/0x20 [<ffffffff8109160d>] ? finish_task_switch+0x5d/0x150 [<ffffffff811f1a8e>] bio_alloc_bioset+0x1be/0x2e0 [<ffffffff8127ee49>] blkdev_issue_flush+0x99/0x120 [<ffffffffa019a733>] jbd2_cleanup_journal_tail+0x93/0xa0 [jbd2] -->GFP_KERNEL [<ffffffffa019aca1>] jbd2_log_do_checkpoint+0x221/0x4a0 [jbd2] [<ffffffffa019afc7>] __jbd2_log_wait_for_space+0xa7/0x1e0 [jbd2] [<ffffffffa01952d8>] start_this_handle+0x2d8/0x550 [jbd2] [<ffffffff811b02a9>] ? __memcg_kmem_put_cache+0x29/0x30 [<ffffffff8119c120>] ? kmem_cache_alloc+0x130/0x210 [<ffffffffa019573a>] jbd2__journal_start+0xba/0x190 [jbd2] [<ffffffff811532ce>] ? lru_cache_add+0xe/0x10 [<ffffffffa01c9549>] ? ext4_da_write_begin+0xf9/0x330 [ext4] [<ffffffffa01f2c77>] __ext4_journal_start_sb+0x77/0x160 [ext4] [<ffffffffa01c9549>] ext4_da_write_begin+0xf9/0x330 [ext4] [<ffffffff811446ec>] generic_file_buffered_write_iter+0x10c/0x270 [<ffffffff81146918>] __generic_file_write_iter+0x178/0x390 [<ffffffff81146c6b>] __generic_file_aio_write+0x8b/0xb0 [<ffffffff81146ced>] generic_file_aio_write+0x5d/0xc0 [<ffffffffa01bf289>] ext4_file_write+0xa9/0x450 [ext4] [<ffffffff811c31d9>] ? pipe_read+0x379/0x4f0 [<ffffffff811b93f0>] do_sync_write+0x90/0xe0 [<ffffffff811b9b6d>] vfs_write+0xbd/0x1e0 [<ffffffff811ba5b8>] SyS_write+0x58/0xb0 [<ffffffff815d4799>] system_call_fastpath+0x16/0x1b Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> Signed-off-by: Theodore Ts'o <tytso@mit.edu> Cc: stable@vger.kernel.org
2015-06-15 11:18:02 +07:00
blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
return __jbd2_update_log_tail(journal, first_tid, blocknr);
}
/* Checkpoint list management */
/*
* journal_clean_one_cp_list
*
jbd2: Fix oops in jbd2_journal_remove_journal_head() jbd2_journal_remove_journal_head() can oops when trying to access journal_head returned by bh2jh(). This is caused for example by the following race: TASK1 TASK2 jbd2_journal_commit_transaction() ... processing t_forget list __jbd2_journal_refile_buffer(jh); if (!jh->b_transaction) { jbd_unlock_bh_state(bh); jbd2_journal_try_to_free_buffers() jbd2_journal_grab_journal_head(bh) jbd_lock_bh_state(bh) __journal_try_to_free_buffer() jbd2_journal_put_journal_head(jh) jbd2_journal_remove_journal_head(bh); jbd2_journal_put_journal_head() in TASK2 sees that b_jcount == 0 and buffer is not part of any transaction and thus frees journal_head before TASK1 gets to doing so. Note that even buffer_head can be released by try_to_free_buffers() after jbd2_journal_put_journal_head() which adds even larger opportunity for oops (but I didn't see this happen in reality). Fix the problem by making transactions hold their own journal_head reference (in b_jcount). That way we don't have to remove journal_head explicitely via jbd2_journal_remove_journal_head() and instead just remove journal_head when b_jcount drops to zero. The result of this is that [__]jbd2_journal_refile_buffer(), [__]jbd2_journal_unfile_buffer(), and __jdb2_journal_remove_checkpoint() can free journal_head which needs modification of a few callers. Also we have to be careful because once journal_head is removed, buffer_head might be freed as well. So we have to get our own buffer_head reference where it matters. Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
2011-06-14 02:38:22 +07:00
* Find all the written-back checkpoint buffers in the given list and
* release them. If 'destroy' is set, clean all buffers unconditionally.
*
* Called with j_list_lock held.
* Returns 1 if we freed the transaction, 0 otherwise.
*/
static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
{
struct journal_head *last_jh;
struct journal_head *next_jh = jh;
int ret;
if (!jh)
return 0;
last_jh = jh->b_cpprev;
do {
jh = next_jh;
next_jh = jh->b_cpnext;
if (!destroy)
ret = __try_to_free_cp_buf(jh);
else
ret = __jbd2_journal_remove_checkpoint(jh) + 1;
if (!ret)
return 0;
if (ret == 2)
return 1;
/*
* This function only frees up some memory
* if possible so we dont have an obligation
* to finish processing. Bail out if preemption
* requested:
*/
if (need_resched())
return 0;
} while (jh != last_jh);
return 0;
}
/*
* journal_clean_checkpoint_list
*
* Find all the written-back checkpoint buffers in the journal and release them.
* If 'destroy' is set, release all buffers unconditionally.
*
* Called with j_list_lock held.
*/
void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy)
{
transaction_t *transaction, *last_transaction, *next_transaction;
int ret;
transaction = journal->j_checkpoint_transactions;
if (!transaction)
return;
last_transaction = transaction->t_cpprev;
next_transaction = transaction;
do {
transaction = next_transaction;
next_transaction = transaction->t_cpnext;
ret = journal_clean_one_cp_list(transaction->t_checkpoint_list,
destroy);
/*
* This function only frees up some memory if possible so we
* dont have an obligation to finish processing. Bail out if
* preemption requested:
*/
if (need_resched())
return;
if (ret)
continue;
/*
* It is essential that we are as careful as in the case of
* t_checkpoint_list with removing the buffer from the list as
* we can possibly see not yet submitted buffers on io_list
*/
ret = journal_clean_one_cp_list(transaction->
t_checkpoint_io_list, destroy);
if (need_resched())
return;
/*
* Stop scanning if we couldn't free the transaction. This
* avoids pointless scanning of transactions which still
* weren't checkpointed.
*/
if (!ret)
return;
} while (transaction != last_transaction);
}
/*
* Remove buffers from all checkpoint lists as journal is aborted and we just
* need to free memory
*/
void jbd2_journal_destroy_checkpoint(journal_t *journal)
{
/*
* We loop because __jbd2_journal_clean_checkpoint_list() may abort
* early due to a need of rescheduling.
*/
while (1) {
spin_lock(&journal->j_list_lock);
if (!journal->j_checkpoint_transactions) {
spin_unlock(&journal->j_list_lock);
break;
}
__jbd2_journal_clean_checkpoint_list(journal, true);
spin_unlock(&journal->j_list_lock);
cond_resched();
}
}
/*
* journal_remove_checkpoint: called after a buffer has been committed
* to disk (either by being write-back flushed to disk, or being
* committed to the log).
*
* We cannot safely clean a transaction out of the log until all of the
* buffer updates committed in that transaction have safely been stored
* elsewhere on disk. To achieve this, all of the buffers in a
* transaction need to be maintained on the transaction's checkpoint
* lists until they have been rewritten, at which point this function is
* called to remove the buffer from the existing transaction's
* checkpoint lists.
*
* The function returns 1 if it frees the transaction, 0 otherwise.
jbd2: Fix oops in jbd2_journal_remove_journal_head() jbd2_journal_remove_journal_head() can oops when trying to access journal_head returned by bh2jh(). This is caused for example by the following race: TASK1 TASK2 jbd2_journal_commit_transaction() ... processing t_forget list __jbd2_journal_refile_buffer(jh); if (!jh->b_transaction) { jbd_unlock_bh_state(bh); jbd2_journal_try_to_free_buffers() jbd2_journal_grab_journal_head(bh) jbd_lock_bh_state(bh) __journal_try_to_free_buffer() jbd2_journal_put_journal_head(jh) jbd2_journal_remove_journal_head(bh); jbd2_journal_put_journal_head() in TASK2 sees that b_jcount == 0 and buffer is not part of any transaction and thus frees journal_head before TASK1 gets to doing so. Note that even buffer_head can be released by try_to_free_buffers() after jbd2_journal_put_journal_head() which adds even larger opportunity for oops (but I didn't see this happen in reality). Fix the problem by making transactions hold their own journal_head reference (in b_jcount). That way we don't have to remove journal_head explicitely via jbd2_journal_remove_journal_head() and instead just remove journal_head when b_jcount drops to zero. The result of this is that [__]jbd2_journal_refile_buffer(), [__]jbd2_journal_unfile_buffer(), and __jdb2_journal_remove_checkpoint() can free journal_head which needs modification of a few callers. Also we have to be careful because once journal_head is removed, buffer_head might be freed as well. So we have to get our own buffer_head reference where it matters. Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
2011-06-14 02:38:22 +07:00
* The function can free jh and bh.
*
* This function is called with j_list_lock held.
*/
int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
{
struct transaction_chp_stats_s *stats;
transaction_t *transaction;
journal_t *journal;
int ret = 0;
JBUFFER_TRACE(jh, "entry");
if ((transaction = jh->b_cp_transaction) == NULL) {
JBUFFER_TRACE(jh, "not on transaction");
goto out;
}
journal = transaction->t_journal;
jbd2: Fix oops in jbd2_journal_remove_journal_head() jbd2_journal_remove_journal_head() can oops when trying to access journal_head returned by bh2jh(). This is caused for example by the following race: TASK1 TASK2 jbd2_journal_commit_transaction() ... processing t_forget list __jbd2_journal_refile_buffer(jh); if (!jh->b_transaction) { jbd_unlock_bh_state(bh); jbd2_journal_try_to_free_buffers() jbd2_journal_grab_journal_head(bh) jbd_lock_bh_state(bh) __journal_try_to_free_buffer() jbd2_journal_put_journal_head(jh) jbd2_journal_remove_journal_head(bh); jbd2_journal_put_journal_head() in TASK2 sees that b_jcount == 0 and buffer is not part of any transaction and thus frees journal_head before TASK1 gets to doing so. Note that even buffer_head can be released by try_to_free_buffers() after jbd2_journal_put_journal_head() which adds even larger opportunity for oops (but I didn't see this happen in reality). Fix the problem by making transactions hold their own journal_head reference (in b_jcount). That way we don't have to remove journal_head explicitely via jbd2_journal_remove_journal_head() and instead just remove journal_head when b_jcount drops to zero. The result of this is that [__]jbd2_journal_refile_buffer(), [__]jbd2_journal_unfile_buffer(), and __jdb2_journal_remove_checkpoint() can free journal_head which needs modification of a few callers. Also we have to be careful because once journal_head is removed, buffer_head might be freed as well. So we have to get our own buffer_head reference where it matters. Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
2011-06-14 02:38:22 +07:00
JBUFFER_TRACE(jh, "removing from transaction");
__buffer_unlink(jh);
jh->b_cp_transaction = NULL;
jbd2: Fix oops in jbd2_journal_remove_journal_head() jbd2_journal_remove_journal_head() can oops when trying to access journal_head returned by bh2jh(). This is caused for example by the following race: TASK1 TASK2 jbd2_journal_commit_transaction() ... processing t_forget list __jbd2_journal_refile_buffer(jh); if (!jh->b_transaction) { jbd_unlock_bh_state(bh); jbd2_journal_try_to_free_buffers() jbd2_journal_grab_journal_head(bh) jbd_lock_bh_state(bh) __journal_try_to_free_buffer() jbd2_journal_put_journal_head(jh) jbd2_journal_remove_journal_head(bh); jbd2_journal_put_journal_head() in TASK2 sees that b_jcount == 0 and buffer is not part of any transaction and thus frees journal_head before TASK1 gets to doing so. Note that even buffer_head can be released by try_to_free_buffers() after jbd2_journal_put_journal_head() which adds even larger opportunity for oops (but I didn't see this happen in reality). Fix the problem by making transactions hold their own journal_head reference (in b_jcount). That way we don't have to remove journal_head explicitely via jbd2_journal_remove_journal_head() and instead just remove journal_head when b_jcount drops to zero. The result of this is that [__]jbd2_journal_refile_buffer(), [__]jbd2_journal_unfile_buffer(), and __jdb2_journal_remove_checkpoint() can free journal_head which needs modification of a few callers. Also we have to be careful because once journal_head is removed, buffer_head might be freed as well. So we have to get our own buffer_head reference where it matters. Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
2011-06-14 02:38:22 +07:00
jbd2_journal_put_journal_head(jh);
if (transaction->t_checkpoint_list != NULL ||
transaction->t_checkpoint_io_list != NULL)
goto out;
/*
* There is one special case to worry about: if we have just pulled the
* buffer off a running or committing transaction's checkpoing list,
* then even if the checkpoint list is empty, the transaction obviously
* cannot be dropped!
*
* The locking here around t_state is a bit sleazy.
* See the comment at the end of jbd2_journal_commit_transaction().
*/
jbd2: Fix oops in jbd2_journal_remove_journal_head() jbd2_journal_remove_journal_head() can oops when trying to access journal_head returned by bh2jh(). This is caused for example by the following race: TASK1 TASK2 jbd2_journal_commit_transaction() ... processing t_forget list __jbd2_journal_refile_buffer(jh); if (!jh->b_transaction) { jbd_unlock_bh_state(bh); jbd2_journal_try_to_free_buffers() jbd2_journal_grab_journal_head(bh) jbd_lock_bh_state(bh) __journal_try_to_free_buffer() jbd2_journal_put_journal_head(jh) jbd2_journal_remove_journal_head(bh); jbd2_journal_put_journal_head() in TASK2 sees that b_jcount == 0 and buffer is not part of any transaction and thus frees journal_head before TASK1 gets to doing so. Note that even buffer_head can be released by try_to_free_buffers() after jbd2_journal_put_journal_head() which adds even larger opportunity for oops (but I didn't see this happen in reality). Fix the problem by making transactions hold their own journal_head reference (in b_jcount). That way we don't have to remove journal_head explicitely via jbd2_journal_remove_journal_head() and instead just remove journal_head when b_jcount drops to zero. The result of this is that [__]jbd2_journal_refile_buffer(), [__]jbd2_journal_unfile_buffer(), and __jdb2_journal_remove_checkpoint() can free journal_head which needs modification of a few callers. Also we have to be careful because once journal_head is removed, buffer_head might be freed as well. So we have to get our own buffer_head reference where it matters. Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
2011-06-14 02:38:22 +07:00
if (transaction->t_state != T_FINISHED)
goto out;
/* OK, that was the last buffer for the transaction: we can now
safely remove this transaction from the log */
stats = &transaction->t_chp_stats;
if (stats->cs_chp_time)
stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time,
jiffies);
trace_jbd2_checkpoint_stats(journal->j_fs_dev->bd_dev,
transaction->t_tid, stats);
__jbd2_journal_drop_transaction(journal, transaction);
jbd2_journal_free_transaction(transaction);
ret = 1;
out:
return ret;
}
/*
* journal_insert_checkpoint: put a committed buffer onto a checkpoint
* list so that we know when it is safe to clean the transaction out of
* the log.
*
* Called with the journal locked.
* Called with j_list_lock held.
*/
void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
transaction_t *transaction)
{
JBUFFER_TRACE(jh, "entry");
J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
jbd2: Fix oops in jbd2_journal_remove_journal_head() jbd2_journal_remove_journal_head() can oops when trying to access journal_head returned by bh2jh(). This is caused for example by the following race: TASK1 TASK2 jbd2_journal_commit_transaction() ... processing t_forget list __jbd2_journal_refile_buffer(jh); if (!jh->b_transaction) { jbd_unlock_bh_state(bh); jbd2_journal_try_to_free_buffers() jbd2_journal_grab_journal_head(bh) jbd_lock_bh_state(bh) __journal_try_to_free_buffer() jbd2_journal_put_journal_head(jh) jbd2_journal_remove_journal_head(bh); jbd2_journal_put_journal_head() in TASK2 sees that b_jcount == 0 and buffer is not part of any transaction and thus frees journal_head before TASK1 gets to doing so. Note that even buffer_head can be released by try_to_free_buffers() after jbd2_journal_put_journal_head() which adds even larger opportunity for oops (but I didn't see this happen in reality). Fix the problem by making transactions hold their own journal_head reference (in b_jcount). That way we don't have to remove journal_head explicitely via jbd2_journal_remove_journal_head() and instead just remove journal_head when b_jcount drops to zero. The result of this is that [__]jbd2_journal_refile_buffer(), [__]jbd2_journal_unfile_buffer(), and __jdb2_journal_remove_checkpoint() can free journal_head which needs modification of a few callers. Also we have to be careful because once journal_head is removed, buffer_head might be freed as well. So we have to get our own buffer_head reference where it matters. Signed-off-by: Jan Kara <jack@suse.cz> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
2011-06-14 02:38:22 +07:00
/* Get reference for checkpointing transaction */
jbd2_journal_grab_journal_head(jh2bh(jh));
jh->b_cp_transaction = transaction;
if (!transaction->t_checkpoint_list) {
jh->b_cpnext = jh->b_cpprev = jh;
} else {
jh->b_cpnext = transaction->t_checkpoint_list;
jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
jh->b_cpprev->b_cpnext = jh;
jh->b_cpnext->b_cpprev = jh;
}
transaction->t_checkpoint_list = jh;
}
/*
* We've finished with this transaction structure: adios...
*
* The transaction must have no links except for the checkpoint by this
* point.
*
* Called with the journal locked.
* Called with j_list_lock held.
*/
void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction)
{
assert_spin_locked(&journal->j_list_lock);
if (transaction->t_cpnext) {
transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
if (journal->j_checkpoint_transactions == transaction)
journal->j_checkpoint_transactions =
transaction->t_cpnext;
if (journal->j_checkpoint_transactions == transaction)
journal->j_checkpoint_transactions = NULL;
}
J_ASSERT(transaction->t_state == T_FINISHED);
J_ASSERT(transaction->t_buffers == NULL);
J_ASSERT(transaction->t_forget == NULL);
J_ASSERT(transaction->t_shadow_list == NULL);
J_ASSERT(transaction->t_checkpoint_list == NULL);
J_ASSERT(transaction->t_checkpoint_io_list == NULL);
J_ASSERT(atomic_read(&transaction->t_updates) == 0);
J_ASSERT(journal->j_committing_transaction != transaction);
J_ASSERT(journal->j_running_transaction != transaction);
trace_jbd2_drop_transaction(journal, transaction);
jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
}