linux_dsm_epyc7002/fs
Filipe Manana b84b8390d6 Btrfs: fix file read corruption after extent cloning and fsync
If we partially clone one extent of a file into a lower offset of the
file, fsync the file, power fail and then mount the fs to trigger log
replay, we can get multiple checksum items in the csum tree that overlap
each other and result in checksum lookup failures later. Those failures
can make file data read requests assume a checksum value of 0, but they
will not return an error (-EIO for example) to userspace exactly because
the expected checksum value 0 is a special value that makes the read bio
endio callback return success and set all the bytes of the corresponding
page with the value 0x01 (at fs/btrfs/inode.c:__readpage_endio_check()).
From a userspace perspective this is equivalent to file corruption
because we are not returning what was written to the file.

Details about how this can happen, and why, are included inline in the
following reproducer test case for fstests and the comment added to
tree-log.c.

  seq=`basename $0`
  seqres=$RESULT_DIR/$seq
  echo "QA output created by $seq"
  tmp=/tmp/$$
  status=1	# failure is the default!
  trap "_cleanup; exit \$status" 0 1 2 3 15

  _cleanup()
  {
      _cleanup_flakey
      rm -f $tmp.*
  }

  # get standard environment, filters and checks
  . ./common/rc
  . ./common/filter
  . ./common/dmflakey

  # real QA test starts here
  _need_to_be_root
  _supported_fs btrfs
  _supported_os Linux
  _require_scratch
  _require_dm_flakey
  _require_cloner
  _require_metadata_journaling $SCRATCH_DEV

  rm -f $seqres.full

  _scratch_mkfs >>$seqres.full 2>&1
  _init_flakey
  _mount_flakey

  # Create our test file with a single 100K extent starting at file
  # offset 800K. We fsync the file here to make the fsync log tree gets
  # a single csum item that covers the whole 100K extent, which causes
  # the second fsync, done after the cloning operation below, to not
  # leave in the log tree two csum items covering two sub-ranges
  # ([0, 20K[ and [20K, 100K[)) of our extent.
  $XFS_IO_PROG -f -c "pwrite -S 0xaa 800K 100K"  \
                  -c "fsync"                     \
                   $SCRATCH_MNT/foo | _filter_xfs_io

  # Now clone part of our extent into file offset 400K. This adds a file
  # extent item to our inode's metadata that points to the 100K extent
  # we created before, using a data offset of 20K and a data length of
  # 20K, so that it refers to the sub-range [20K, 40K[ of our original
  # extent.
  $CLONER_PROG -s $((800 * 1024 + 20 * 1024)) -d $((400 * 1024)) \
      -l $((20 * 1024)) $SCRATCH_MNT/foo $SCRATCH_MNT/foo

  # Now fsync our file to make sure the extent cloning is durably
  # persisted. This fsync will not add a second csum item to the log
  # tree containing the checksums for the blocks in the sub-range
  # [20K, 40K[ of our extent, because there was already a csum item in
  # the log tree covering the whole extent, added by the first fsync
  # we did before.
  $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo

  echo "File digest before power failure:"
  md5sum $SCRATCH_MNT/foo | _filter_scratch

  # Silently drop all writes and ummount to simulate a crash/power
  # failure.
  _load_flakey_table $FLAKEY_DROP_WRITES
  _unmount_flakey

  # Allow writes again, mount to trigger log replay and validate file
  # contents.
  # The fsync log replay first processes the file extent item
  # corresponding to the file offset 400K (the one which refers to the
  # [20K, 40K[ sub-range of our 100K extent) and then processes the file
  # extent item for file offset 800K. It used to happen that when
  # processing the later, it erroneously left in the csum tree 2 csum
  # items that overlapped each other, 1 for the sub-range [20K, 40K[ and
  # 1 for the whole range of our extent. This introduced a problem where
  # subsequent lookups for the checksums of blocks within the range
  # [40K, 100K[ of our extent would not find anything because lookups in
  # the csum tree ended up looking only at the smaller csum item, the
  # one covering the subrange [20K, 40K[. This made read requests assume
  # an expected checksum with a value of 0 for those blocks, which caused
  # checksum verification failure when the read operations finished.
  # However those checksum failure did not result in read requests
  # returning an error to user space (like -EIO for e.g.) because the
  # expected checksum value had the special value 0, and in that case
  # btrfs set all bytes of the corresponding pages with the value 0x01
  # and produce the following warning in dmesg/syslog:
  #
  #  "BTRFS warning (device dm-0): csum failed ino 257 off 917504 csum\
  #   1322675045 expected csum 0"
  #
  _load_flakey_table $FLAKEY_ALLOW_WRITES
  _mount_flakey

  echo "File digest after log replay:"
  # Must match the same digest he had after cloning the extent and
  # before the power failure happened.
  md5sum $SCRATCH_MNT/foo | _filter_scratch

  _unmount_flakey

  status=0
  exit

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-08-19 14:27:46 -07:00
..
9p 9p: don't leave a half-initialized inode sitting around 2015-07-12 11:22:05 -04:00
adfs
affs Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs 2015-07-04 19:36:06 -07:00
afs
autofs4
befs Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs 2015-07-04 19:36:06 -07:00
bfs
btrfs Btrfs: fix file read corruption after extent cloning and fsync 2015-08-19 14:27:46 -07:00
cachefiles
ceph Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs 2015-07-04 19:36:06 -07:00
cifs
coda
configfs configfs: fix kernel infoleak through user-controlled format string 2015-07-17 16:39:53 -07:00
cramfs
debugfs Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs 2015-07-04 19:36:06 -07:00
devpts
dlm
ecryptfs ioctl_compat: handle FITRIM 2015-07-09 11:42:21 -07:00
efivarfs
efs
exofs
exportfs
ext2 Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs 2015-07-04 19:36:06 -07:00
ext3
ext4 ioctl_compat: handle FITRIM 2015-07-09 11:42:21 -07:00
f2fs f2fs: call set_page_dirty to attach i_wb for cgroup 2015-07-25 08:54:26 -07:00
fat
freevxfs
fscache
fuse Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs 2015-07-04 19:36:06 -07:00
gfs2
hfs Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs 2015-07-04 19:36:06 -07:00
hfsplus Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs 2015-07-04 19:36:06 -07:00
hostfs
hpfs hpfs: hpfs_error: Remove static buffer, use vsprintf extension %pV instead 2015-07-09 13:35:31 -07:00
hugetlbfs
isofs
jbd
jbd2
jffs2 Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs 2015-07-04 19:36:06 -07:00
jfs A couple trivial fixes and an error path fix 2015-07-16 16:28:28 -07:00
kernfs Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace 2015-07-03 15:20:57 -07:00
lockd
logfs
minix Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs 2015-07-04 19:36:06 -07:00
ncpfs
nfs NFS client bugfixes for Linux 4.2 2015-07-28 09:37:44 -07:00
nfs_common
nfsd
nilfs2 ioctl_compat: handle FITRIM 2015-07-09 11:42:21 -07:00
nls
notify Revert "fsnotify: fix oops in fsnotify_clear_marks_by_group_flags()" 2015-07-21 16:06:53 -07:00
ntfs Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs 2015-07-04 19:36:06 -07:00
ocfs2 ioctl_compat: handle FITRIM 2015-07-09 11:42:21 -07:00
omfs
openpromfs
overlayfs fix a braino in ovl_d_select_inode() 2015-07-12 11:22:05 -04:00
proc Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip 2015-07-18 10:49:57 -07:00
pstore Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace 2015-07-03 15:20:57 -07:00
qnx4
qnx6
quota
ramfs
reiserfs
romfs
squashfs
sysfs Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace 2015-07-03 15:20:57 -07:00
sysv
tracefs Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs 2015-07-04 19:36:06 -07:00
ubifs
udf udf: Don't corrupt unalloc spacetable when writing it 2015-07-09 16:38:57 +02:00
ufs Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs 2015-07-04 19:36:06 -07:00
xfs xfs: remote attributes need to be considered data 2015-07-29 11:48:02 +10:00
aio.c
anon_inodes.c
attr.c
bad_inode.c
binfmt_aout.c
binfmt_elf_fdpic.c
binfmt_elf.c Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs 2015-07-04 19:36:06 -07:00
binfmt_em86.c
binfmt_flat.c
binfmt_misc.c
binfmt_script.c
block_dev.c Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs 2015-07-04 19:36:06 -07:00
buffer.c
char_dev.c
compat_binfmt_elf.c
compat_ioctl.c ioctl_compat: handle FITRIM 2015-07-09 11:42:21 -07:00
compat.c
coredump.c Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs 2015-07-04 19:36:06 -07:00
dax.c xfs: call dax_fault on read page faults for DAX 2015-07-29 11:48:00 +10:00
dcache.c freeing unlinked file indefinitely delayed 2015-07-12 11:27:04 -04:00
dcookies.c
direct-io.c
drop_caches.c
eventfd.c
eventpoll.c
exec.c
fcntl.c
fhandle.c
file_table.c
file.c
filesystems.c
fs_pin.c
fs_struct.c
fs-writeback.c block: export bio_associate_*() and wbc_account_io() 2015-07-23 13:36:44 -06:00
inode.c Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs 2015-07-04 19:36:06 -07:00
internal.h
ioctl.c
Kconfig
Kconfig.binfmt
libfs.c Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs 2015-07-04 19:36:06 -07:00
locks.c locks: inline posix_lock_file_wait and flock_lock_file_wait 2015-07-13 06:29:11 -04:00
Makefile
mbcache.c
mount.h
mpage.c
namei.c link_path_walk(): be careful when failing with ENOTDIR 2015-08-01 20:18:38 -04:00
namespace.c mnt: In detach_mounts detach the appropriate unmounted mount 2015-07-23 11:31:15 -05:00
no-block.c
nsfs.c
open.c
pipe.c
pnode.c
pnode.h mnt: Clarify and correct the disconnect logic in umount_tree 2015-07-22 20:33:27 -05:00
posix_acl.c
proc_namespace.c
read_write.c
readdir.c
select.c
seq_file.c Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs 2015-07-04 19:36:06 -07:00
signalfd.c
splice.c
stack.c
stat.c
statfs.c
super.c
sync.c
timerfd.c
utimes.c
xattr.c