linux_dsm_epyc7002/fs/ext4/fsync.c
Jan Kara b0857d309f ext4: defer clearing of PageWriteback after extent conversion
Currently PageWriteback bit gets cleared from put_io_page() called
from ext4_end_bio().  This is somewhat inconvenient as extent tree is
not fully updated at that time (unwritten extents are not marked as
written) so we cannot read the data back yet.  This design was
dictated by lock ordering as we cannot start a transaction while
PageWriteback bit is set (we could easily deadlock with
ext4_da_writepages()).  But now that we use transaction reservation
for extent conversion, locking issues are solved and we can move
PageWriteback bit clearing after extent conversion is done.  As a
result we can remove wait for unwritten extent conversion from
ext4_sync_file() because it already implicitely happens through
wait_on_page_writeback().

We implement deferring of PageWriteback clearing by queueing completed
bios to appropriate io_end and processing all the pages when io_end is
going to be freed instead of at the moment ext4_io_end() is called.

Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
2013-06-04 14:23:41 -04:00

176 lines
4.9 KiB
C

/*
* linux/fs/ext4/fsync.c
*
* Copyright (C) 1993 Stephen Tweedie (sct@redhat.com)
* from
* Copyright (C) 1992 Remy Card (card@masi.ibp.fr)
* Laboratoire MASI - Institut Blaise Pascal
* Universite Pierre et Marie Curie (Paris VI)
* from
* linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds
*
* ext4fs fsync primitive
*
* Big-endian to little-endian byte-swapping/bitmaps by
* David S. Miller (davem@caip.rutgers.edu), 1995
*
* Removed unnecessary code duplication for little endian machines
* and excessive __inline__s.
* Andi Kleen, 1997
*
* Major simplications and cleanup - we only need to do the metadata, because
* we can depend on generic_block_fdatasync() to sync the data blocks.
*/
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/jbd2.h>
#include <linux/blkdev.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include <trace/events/ext4.h>
/*
* If we're not journaling and this is a just-created file, we have to
* sync our parent directory (if it was freshly created) since
* otherwise it will only be written by writeback, leaving a huge
* window during which a crash may lose the file. This may apply for
* the parent directory's parent as well, and so on recursively, if
* they are also freshly created.
*/
static int ext4_sync_parent(struct inode *inode)
{
struct dentry *dentry = NULL;
struct inode *next;
int ret = 0;
if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY))
return 0;
inode = igrab(inode);
while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
dentry = d_find_any_alias(inode);
if (!dentry)
break;
next = igrab(dentry->d_parent->d_inode);
dput(dentry);
if (!next)
break;
iput(inode);
inode = next;
ret = sync_mapping_buffers(inode->i_mapping);
if (ret)
break;
ret = sync_inode_metadata(inode, 1);
if (ret)
break;
}
iput(inode);
return ret;
}
/**
* __sync_file - generic_file_fsync without the locking and filemap_write
* @inode: inode to sync
* @datasync: only sync essential metadata if true
*
* This is just generic_file_fsync without the locking. This is needed for
* nojournal mode to make sure this inodes data/metadata makes it to disk
* properly. The i_mutex should be held already.
*/
static int __sync_inode(struct inode *inode, int datasync)
{
int err;
int ret;
ret = sync_mapping_buffers(inode->i_mapping);
if (!(inode->i_state & I_DIRTY))
return ret;
if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
return ret;
err = sync_inode_metadata(inode, 1);
if (ret == 0)
ret = err;
return ret;
}
/*
* akpm: A new design for ext4_sync_file().
*
* This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
* There cannot be a transaction open by this task.
* Another task could have dirtied this inode. Its data can be in any
* state in the journalling system.
*
* What we do is just kick off a commit and wait on it. This will snapshot the
* inode to disk.
*/
int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
struct ext4_inode_info *ei = EXT4_I(inode);
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
int ret, err;
tid_t commit_tid;
bool needs_barrier = false;
J_ASSERT(ext4_journal_current_handle() == NULL);
trace_ext4_sync_file_enter(file, datasync);
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret)
return ret;
mutex_lock(&inode->i_mutex);
if (inode->i_sb->s_flags & MS_RDONLY)
goto out;
if (!journal) {
ret = __sync_inode(inode, datasync);
if (!ret && !hlist_empty(&inode->i_dentry))
ret = ext4_sync_parent(inode);
goto out;
}
/*
* data=writeback,ordered:
* The caller's filemap_fdatawrite()/wait will sync the data.
* Metadata is in the journal, we wait for proper transaction to
* commit here.
*
* data=journal:
* filemap_fdatawrite won't do anything (the buffers are clean).
* ext4_force_commit will write the file data into the journal and
* will wait on that.
* filemap_fdatawait() will encounter a ton of newly-dirtied pages
* (they were dirtied by commit). But that's OK - the blocks are
* safe in-journal, which is all fsync() needs to ensure.
*/
if (ext4_should_journal_data(inode)) {
ret = ext4_force_commit(inode->i_sb);
goto out;
}
commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
if (journal->j_flags & JBD2_BARRIER &&
!jbd2_trans_will_send_data_barrier(journal, commit_tid))
needs_barrier = true;
ret = jbd2_complete_transaction(journal, commit_tid);
if (needs_barrier) {
err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
if (!ret)
ret = err;
}
out:
mutex_unlock(&inode->i_mutex);
trace_ext4_sync_file_exit(inode, ret);
return ret;
}