mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-28 11:18:45 +07:00
5660e13d2f
Most filesystems currently use mapping_set_error and filemap_check_errors for setting and reporting/clearing writeback errors at the mapping level. filemap_check_errors is indirectly called from most of the filemap_fdatawait_* functions and from filemap_write_and_wait*. These functions are called from all sorts of contexts to wait on writeback to finish -- e.g. mostly in fsync, but also in truncate calls, getattr, etc. The non-fsync callers are problematic. We should be reporting writeback errors during fsync, but many places spread over the tree clear out errors before they can be properly reported, or report errors at nonsensical times. If I get -EIO on a stat() call, there is no reason for me to assume that it is because some previous writeback failed. The fact that it also clears out the error such that a subsequent fsync returns 0 is a bug, and a nasty one since that's potentially silent data corruption. This patch adds a small bit of new infrastructure for setting and reporting errors during address_space writeback. While the above was my original impetus for adding this, I think it's also the case that current fsync semantics are just problematic for userland. Most applications that call fsync do so to ensure that the data they wrote has hit the backing store. In the case where there are multiple writers to the file at the same time, this is really hard to determine. The first one to call fsync will see any stored error, and the rest get back 0. The processes with open fds may not be associated with one another in any way. They could even be in different containers, so ensuring coordination between all fsync callers is not really an option. One way to remedy this would be to track what file descriptor was used to dirty the file, but that's rather cumbersome and would likely be slow. However, there is a simpler way to improve the semantics here without incurring too much overhead. 
This set adds an errseq_t to struct address_space, and a corresponding one is added to struct file. Writeback errors are recorded in the mapping's errseq_t, and the one in struct file is used as the "since" value. This changes the semantics of the Linux fsync implementation such that applications can now use it to determine whether there were any writeback errors since fsync(fd) was last called (or since the file was opened in the case of fsync having never been called). Note that those writeback errors may have occurred when writing data that was dirtied via an entirely different fd, but that's the case now with the current mapping_set_error/filemap_check_errors infrastructure. This will at least prevent you from getting a false report of success. The new behavior is still consistent with the POSIX spec, and is more reliable for application developers. This patch just adds some basic infrastructure for doing this, and ensures that the f_wb_err "cursor" is properly set when a file is opened. Later patches will change the existing code to use this new infrastructure for reporting errors at fsync time. Signed-off-by: Jeff Layton <jlayton@redhat.com> Reviewed-by: Jan Kara <jack@suse.cz>
116 lines
2.7 KiB
C
116 lines
2.7 KiB
C
#undef TRACE_SYSTEM
|
|
#define TRACE_SYSTEM filemap
|
|
|
|
#if !defined(_TRACE_FILEMAP_H) || defined(TRACE_HEADER_MULTI_READ)
|
|
#define _TRACE_FILEMAP_H
|
|
|
|
#include <linux/types.h>
|
|
#include <linux/tracepoint.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/memcontrol.h>
|
|
#include <linux/device.h>
|
|
#include <linux/kdev_t.h>
|
|
#include <linux/errseq.h>
|
|
|
|
DECLARE_EVENT_CLASS(mm_filemap_op_page_cache,
|
|
|
|
TP_PROTO(struct page *page),
|
|
|
|
TP_ARGS(page),
|
|
|
|
TP_STRUCT__entry(
|
|
__field(unsigned long, pfn)
|
|
__field(unsigned long, i_ino)
|
|
__field(unsigned long, index)
|
|
__field(dev_t, s_dev)
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->pfn = page_to_pfn(page);
|
|
__entry->i_ino = page->mapping->host->i_ino;
|
|
__entry->index = page->index;
|
|
if (page->mapping->host->i_sb)
|
|
__entry->s_dev = page->mapping->host->i_sb->s_dev;
|
|
else
|
|
__entry->s_dev = page->mapping->host->i_rdev;
|
|
),
|
|
|
|
TP_printk("dev %d:%d ino %lx page=%p pfn=%lu ofs=%lu",
|
|
MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
|
|
__entry->i_ino,
|
|
pfn_to_page(__entry->pfn),
|
|
__entry->pfn,
|
|
__entry->index << PAGE_SHIFT)
|
|
);
|
|
|
|
DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_delete_from_page_cache,
|
|
TP_PROTO(struct page *page),
|
|
TP_ARGS(page)
|
|
);
|
|
|
|
DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_add_to_page_cache,
|
|
TP_PROTO(struct page *page),
|
|
TP_ARGS(page)
|
|
);
|
|
|
|
TRACE_EVENT(filemap_set_wb_err,
|
|
TP_PROTO(struct address_space *mapping, errseq_t eseq),
|
|
|
|
TP_ARGS(mapping, eseq),
|
|
|
|
TP_STRUCT__entry(
|
|
__field(unsigned long, i_ino)
|
|
__field(dev_t, s_dev)
|
|
__field(errseq_t, errseq)
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->i_ino = mapping->host->i_ino;
|
|
__entry->errseq = eseq;
|
|
if (mapping->host->i_sb)
|
|
__entry->s_dev = mapping->host->i_sb->s_dev;
|
|
else
|
|
__entry->s_dev = mapping->host->i_rdev;
|
|
),
|
|
|
|
TP_printk("dev=%d:%d ino=0x%lx errseq=0x%x",
|
|
MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
|
|
__entry->i_ino, __entry->errseq)
|
|
);
|
|
|
|
TRACE_EVENT(file_check_and_advance_wb_err,
|
|
TP_PROTO(struct file *file, errseq_t old),
|
|
|
|
TP_ARGS(file, old),
|
|
|
|
TP_STRUCT__entry(
|
|
__field(struct file *, file);
|
|
__field(unsigned long, i_ino)
|
|
__field(dev_t, s_dev)
|
|
__field(errseq_t, old)
|
|
__field(errseq_t, new)
|
|
),
|
|
|
|
TP_fast_assign(
|
|
__entry->file = file;
|
|
__entry->i_ino = file->f_mapping->host->i_ino;
|
|
if (file->f_mapping->host->i_sb)
|
|
__entry->s_dev =
|
|
file->f_mapping->host->i_sb->s_dev;
|
|
else
|
|
__entry->s_dev =
|
|
file->f_mapping->host->i_rdev;
|
|
__entry->old = old;
|
|
__entry->new = file->f_wb_err;
|
|
),
|
|
|
|
TP_printk("file=%p dev=%d:%d ino=0x%lx old=0x%x new=0x%x",
|
|
__entry->file, MAJOR(__entry->s_dev),
|
|
MINOR(__entry->s_dev), __entry->i_ino, __entry->old,
|
|
__entry->new)
|
|
);
|
|
#endif /* _TRACE_FILEMAP_H */
|
|
|
|
/* This part must be outside protection */
|
|
#include <trace/define_trace.h>
|