Changes to copy_file_range for 5.3 from Dave and Amir:

- Create a generic copy_file_range handler and make individual
   filesystems responsible for calling it (i.e. no more assuming that
   do_splice_direct will work or is appropriate)
 - Refactor copy_file_range and remap_range parameter checking where they
   are the same
 - Install missing copy_file_range parameter checking(!)
 - Remove suid/sgid and update mtime like any other file write
 - Change the behavior so that a copy range crossing the source file's
   eof will result in a short copy to the source file's eof instead of
   EINVAL
 - Permit filesystems to decide if they want to handle cross-superblock
   copy_file_range in their local handlers.
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEUzaAxoMeQq6m2jMV+H93GTRKtOsFAl0BGvAACgkQ+H93GTRK
 tOu2aw/+KGG7PiXm9ED3ZXUppKVddrZMOgqM7mSfHo6TBgW3pJUJcRIhawK0Wz/P
 stgTsOkurHSl3iT3vQyX4GTZvLoGN/rfsRLPxogJptBUqVv3BOrXsrI53f7V/kbm
 rtjlYsgExji7VBUiMTe5kOWWqxyR7B4nXyvY/8rier57rW/8C1I58B0OrxAmTK0k
 rz1e5BtE1dg91xA7cSdEc38FInz8MW8cvsrEzW9vyYY4IVE0PBuhhA1EvryxTrAZ
 hfthHFfzwxhJkI0mdha8uqNufNWrHLSqiwyjYC7pwAwSQzQPiQz9U17flu+URnfF
 kXaR5LdXbBP3pl46RdthrfuonWsv612cC1Qwfjs8PBG9lG7b9PGJ40MGVTiw7LlQ
 924/03ho0zAnV0E8Qn5O9nPshQNDJhwhzMS39EmMyFKb1D5XGzdMV0gDdIfx6hdO
 HDbw6VQ33S59gvk7v/gxsFB5Bs4PKfamHx/QmwQwpqWM5XExcr0yJ90OTBtAuY4r
 S+9gwG6uED3aPh8HbQ5UgnA8bZmMmi8AkcBvqJ9GgNw5SbZl0oyv9Sj6JNpoOejV
 8y9JkhoZUxqiihnKTw/vtMrj5RCOfifNBjMSwrShfLdLKtK0AZl1mXC0/1Q3VnEQ
 TUcyRHEzrtHgJ9/AK9xIyDNvNYzvHSLZj7maoZZumgQa2FOFrmw=
 =qM44
 -----END PGP SIGNATURE-----

Merge tag 'copy-file-range-fixes-1' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull copy_file_range updates from Darrick Wong:
 "This fixes numerous parameter checking problems and inconsistent
  behaviors in the new(ish) copy_file_range system call.

  Now the system call will actually check its range parameters
  correctly; refuse to copy into files for which the caller does not
  have sufficient privileges; update mtime and strip setuid like file
  writes are supposed to do; and allows copying up to the EOF of the
  source file instead of failing the call like we used to.

  Summary:

   - Create a generic copy_file_range handler and make individual
     filesystems responsible for calling it (i.e. no more assuming that
     do_splice_direct will work or is appropriate)

   - Refactor copy_file_range and remap_range parameter checking where
     they are the same

   - Install missing copy_file_range parameter checking(!)

   - Remove suid/sgid and update mtime like any other file write

   - Change the behavior so that a copy range crossing the source file's
     eof will result in a short copy to the source file's eof instead of
     EINVAL

   - Permit filesystems to decide if they want to handle
     cross-superblock copy_file_range in their local handlers"

* tag 'copy-file-range-fixes-1' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
  fuse: copy_file_range needs to strip setuid bits and update timestamps
  vfs: allow copy_file_range to copy across devices
  xfs: use file_modified() helper
  vfs: introduce file_modified() helper
  vfs: add missing checks to copy_file_range
  vfs: remove redundant checks from generic_remap_checks()
  vfs: introduce generic_file_rw_checks()
  vfs: no fallback for ->copy_file_range
  vfs: introduce generic_copy_file_range()
This commit is contained in:
Linus Torvalds 2019-07-10 20:32:37 -07:00
commit 40f06c7995
9 changed files with 257 additions and 100 deletions

View File

@ -1889,9 +1889,9 @@ static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
return 0;
}
static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
struct file *dst_file, loff_t dst_off,
size_t len, unsigned int flags)
static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
struct file *dst_file, loff_t dst_off,
size_t len, unsigned int flags)
{
struct inode *src_inode = file_inode(src_file);
struct inode *dst_inode = file_inode(dst_file);
@ -1909,6 +1909,8 @@ static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
if (src_inode == dst_inode)
return -EINVAL;
if (src_inode->i_sb != dst_inode->i_sb)
return -EXDEV;
if (ceph_snap(dst_inode) != CEPH_NOSNAP)
return -EROFS;
@ -2100,6 +2102,21 @@ static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
return ret;
}
static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
struct file *dst_file, loff_t dst_off,
size_t len, unsigned int flags)
{
ssize_t ret;
ret = __ceph_copy_file_range(src_file, src_off, dst_file, dst_off,
len, flags);
if (ret == -EOPNOTSUPP || ret == -EXDEV)
ret = generic_copy_file_range(src_file, src_off, dst_file,
dst_off, len, flags);
return ret;
}
const struct file_operations ceph_file_fops = {
.open = ceph_open,
.release = ceph_release,

View File

@ -1149,6 +1149,10 @@ static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off,
rc = cifs_file_copychunk_range(xid, src_file, off, dst_file, destoff,
len, flags);
free_xid(xid);
if (rc == -EOPNOTSUPP || rc == -EXDEV)
rc = generic_copy_file_range(src_file, off, dst_file,
destoff, len, flags);
return rc;
}

View File

@ -3112,9 +3112,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
return err;
}
static ssize_t fuse_copy_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
size_t len, unsigned int flags)
static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
size_t len, unsigned int flags)
{
struct fuse_file *ff_in = file_in->private_data;
struct fuse_file *ff_out = file_out->private_data;
@ -3142,6 +3142,9 @@ static ssize_t fuse_copy_file_range(struct file *file_in, loff_t pos_in,
if (fc->no_copy_file_range)
return -EOPNOTSUPP;
if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
return -EXDEV;
if (fc->writeback_cache) {
inode_lock(inode_in);
err = fuse_writeback_range(inode_in, pos_in, pos_in + len);
@ -3152,6 +3155,10 @@ static ssize_t fuse_copy_file_range(struct file *file_in, loff_t pos_in,
inode_lock(inode_out);
err = file_modified(file_out);
if (err)
goto out;
if (fc->writeback_cache) {
err = fuse_writeback_range(inode_out, pos_out, pos_out + len);
if (err)
@ -3190,10 +3197,26 @@ static ssize_t fuse_copy_file_range(struct file *file_in, loff_t pos_in,
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
inode_unlock(inode_out);
file_accessed(file_in);
return err;
}
static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off,
struct file *dst_file, loff_t dst_off,
size_t len, unsigned int flags)
{
ssize_t ret;
ret = __fuse_copy_file_range(src_file, src_off, dst_file, dst_off,
len, flags);
if (ret == -EOPNOTSUPP || ret == -EXDEV)
ret = generic_copy_file_range(src_file, src_off, dst_file,
dst_off, len, flags);
return ret;
}
static const struct file_operations fuse_file_operations = {
.llseek = fuse_file_llseek,
.read_iter = fuse_file_read_iter,

View File

@ -1899,6 +1899,26 @@ int file_update_time(struct file *file)
}
EXPORT_SYMBOL(file_update_time);
/* Caller must hold the file's inode lock */
int file_modified(struct file *file)
{
int err;
/*
* Clear the security bits if the process is not being run by root.
* This keeps people from modifying setuid and setgid binaries.
*/
err = file_remove_privs(file);
if (err)
return err;
if (unlikely(file->f_mode & FMODE_NOCMTIME))
return 0;
return file_update_time(file);
}
EXPORT_SYMBOL(file_modified);
int inode_needs_sync(struct inode *inode)
{
if (IS_SYNC(inode))

View File

@ -129,10 +129,13 @@ nfs4_file_flush(struct file *file, fl_owner_t id)
}
#ifdef CONFIG_NFS_V4_2
static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
size_t count, unsigned int flags)
static ssize_t __nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
size_t count, unsigned int flags)
{
/* Only offload copy if superblock is the same */
if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
return -EXDEV;
if (!nfs_server_capable(file_inode(file_out), NFS_CAP_COPY))
return -EOPNOTSUPP;
if (file_inode(file_in) == file_inode(file_out))
@ -140,6 +143,20 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
}
static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
size_t count, unsigned int flags)
{
ssize_t ret;
ret = __nfs4_copy_file_range(file_in, pos_in, file_out, pos_out, count,
flags);
if (ret == -EOPNOTSUPP || ret == -EXDEV)
ret = generic_copy_file_range(file_in, pos_in, file_out,
pos_out, count, flags);
return ret;
}
static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
{
loff_t ret;

View File

@ -1565,6 +1565,58 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
}
#endif
/**
* generic_copy_file_range - copy data between two files
* @file_in: file structure to read from
* @pos_in: file offset to read from
* @file_out: file structure to write data to
* @pos_out: file offset to write data to
* @len: amount of data to copy
* @flags: copy flags
*
* This is a generic filesystem helper to copy data from one file to another.
* It has no constraints on the source or destination file owners - the files
* can belong to different superblocks and different filesystem types. Short
* copies are allowed.
*
* This should be called from the @file_out filesystem, as per the
* ->copy_file_range() method.
*
* Returns the number of bytes copied or a negative error indicating the
* failure.
*/
ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
size_t len, unsigned int flags)
{
return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
}
EXPORT_SYMBOL(generic_copy_file_range);
static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
size_t len, unsigned int flags)
{
/*
* Although we now allow filesystems to handle cross sb copy, passing
* a file of the wrong filesystem type to filesystem driver can result
* in an attempt to dereference the wrong type of ->private_data, so
* avoid doing that until we really have a good reason. NFS defines
* several different file_system_type structures, but they all end up
* using the same ->copy_file_range() function pointer.
*/
if (file_out->f_op->copy_file_range &&
file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
return file_out->f_op->copy_file_range(file_in, pos_in,
file_out, pos_out,
len, flags);
return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
flags);
}
/*
* copy_file_range() differs from regular file read and write in that it
* specifically allows return partial success. When it does so is up to
@ -1574,17 +1626,15 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
size_t len, unsigned int flags)
{
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
ssize_t ret;
if (flags != 0)
return -EINVAL;
if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
return -EISDIR;
if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
return -EINVAL;
ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
flags);
if (unlikely(ret))
return ret;
ret = rw_verify_area(READ, file_in, &pos_in, len);
if (unlikely(ret))
@ -1594,15 +1644,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
if (unlikely(ret))
return ret;
if (!(file_in->f_mode & FMODE_READ) ||
!(file_out->f_mode & FMODE_WRITE) ||
(file_out->f_flags & O_APPEND))
return -EBADF;
/* this could be relaxed once a method supports cross-fs copies */
if (inode_in->i_sb != inode_out->i_sb)
return -EXDEV;
if (len == 0)
return 0;
@ -1612,7 +1653,8 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
* Try cloning first, this is supported by more file systems, and
* more efficient if both clone and copy are supported (e.g. NFS).
*/
if (file_in->f_op->remap_file_range) {
if (file_in->f_op->remap_file_range &&
file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
loff_t cloned;
cloned = file_in->f_op->remap_file_range(file_in, pos_in,
@ -1625,16 +1667,9 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
}
}
if (file_out->f_op->copy_file_range) {
ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
pos_out, len, flags);
if (ret != -EOPNOTSUPP)
goto done;
}
ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
flags);
WARN_ON_ONCE(ret == -EOPNOTSUPP);
done:
if (ret > 0) {
fsnotify_access(file_in);
@ -1951,25 +1986,10 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
return ret;
/* If can't alter the file contents, we're done. */
if (!(remap_flags & REMAP_FILE_DEDUP)) {
/* Update the timestamps, since we can alter file contents. */
if (!(file_out->f_mode & FMODE_NOCMTIME)) {
ret = file_update_time(file_out);
if (ret)
return ret;
}
if (!(remap_flags & REMAP_FILE_DEDUP))
ret = file_modified(file_out);
/*
* Clear the security bits if the process is not being run by
* root. This keeps people from modifying setuid and setgid
* binaries.
*/
ret = file_remove_privs(file_out);
if (ret)
return ret;
}
return 0;
return ret;
}
EXPORT_SYMBOL(generic_remap_file_range_prep);
@ -1977,29 +1997,21 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags)
{
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
loff_t ret;
WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);
if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
return -EISDIR;
if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
return -EINVAL;
/*
* FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
* the same mount. Practically, they only need to be on the same file
* system.
*/
if (inode_in->i_sb != inode_out->i_sb)
if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
return -EXDEV;
if (!(file_in->f_mode & FMODE_READ) ||
!(file_out->f_mode & FMODE_WRITE) ||
(file_out->f_flags & O_APPEND))
return -EBADF;
ret = generic_file_rw_checks(file_in, file_out);
if (ret < 0)
return ret;
if (!file_in->f_op->remap_file_range)
return -EOPNOTSUPP;

View File

@ -367,20 +367,7 @@ xfs_file_aio_write_checks(
* lock above. Eventually we should look into a way to avoid
* the pointless lock roundtrip.
*/
if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
error = file_update_time(file);
if (error)
return error;
}
/*
* If we're writing the file then make sure to clear the setuid and
* setgid bits if the process is not being run by root. This keeps
* people from modifying setuid and setgid binaries.
*/
if (!IS_NOSEC(inode))
return file_remove_privs(file);
return 0;
return file_modified(file);
}
static int

View File

@ -1889,6 +1889,9 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
unsigned long, loff_t *, rwf_t);
extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
loff_t, size_t, unsigned int);
extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
size_t len, unsigned int flags);
extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t *count,
@ -2174,6 +2177,8 @@ static inline void file_accessed(struct file *file)
touch_atime(&file->f_path);
}
extern int file_modified(struct file *file);
int sync_inode(struct inode *inode, struct writeback_control *wbc);
int sync_inode_metadata(struct inode *inode, int wait);
@ -3047,6 +3052,10 @@ extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t *count, unsigned int remap_flags);
extern int generic_file_rw_checks(struct file *file_in, struct file *file_out);
extern int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
size_t *count, unsigned int flags);
extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);

View File

@ -2925,24 +2925,11 @@ EXPORT_SYMBOL(read_cache_page_gfp);
* LFS limits. If pos is under the limit it becomes a short access. If it
* exceeds the limit we return -EFBIG.
*/
static int generic_access_check_limits(struct file *file, loff_t pos,
loff_t *count)
{
struct inode *inode = file->f_mapping->host;
loff_t max_size = inode->i_sb->s_maxbytes;
if (!(file->f_flags & O_LARGEFILE))
max_size = MAX_NON_LFS;
if (unlikely(pos >= max_size))
return -EFBIG;
*count = min(*count, max_size - pos);
return 0;
}
static int generic_write_check_limits(struct file *file, loff_t pos,
loff_t *count)
{
struct inode *inode = file->f_mapping->host;
loff_t max_size = inode->i_sb->s_maxbytes;
loff_t limit = rlimit(RLIMIT_FSIZE);
if (limit != RLIM_INFINITY) {
@ -2953,7 +2940,15 @@ static int generic_write_check_limits(struct file *file, loff_t pos,
*count = min(*count, limit - pos);
}
return generic_access_check_limits(file, pos, count);
if (!(file->f_flags & O_LARGEFILE))
max_size = MAX_NON_LFS;
if (unlikely(pos >= max_size))
return -EFBIG;
*count = min(*count, max_size - pos);
return 0;
}
/*
@ -2993,7 +2988,7 @@ EXPORT_SYMBOL(generic_write_checks);
/*
* Performs necessary checks before doing a clone.
*
* Can adjust amount of bytes to clone.
* Can adjust amount of bytes to clone via @req_count argument.
* Returns appropriate error code that caller should return or
* zero in case the clone should be allowed.
*/
@ -3031,10 +3026,6 @@ int generic_remap_checks(struct file *file_in, loff_t pos_in,
return -EINVAL;
count = min(count, size_in - (uint64_t)pos_in);
ret = generic_access_check_limits(file_in, pos_in, &count);
if (ret)
return ret;
ret = generic_write_check_limits(file_out, pos_out, &count);
if (ret)
return ret;
@ -3071,6 +3062,83 @@ int generic_remap_checks(struct file *file_in, loff_t pos_in,
return 0;
}
/*
* Performs common checks before doing a file copy/clone
* from @file_in to @file_out.
*/
int generic_file_rw_checks(struct file *file_in, struct file *file_out)
{
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
/* Don't copy dirs, pipes, sockets... */
if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
return -EISDIR;
if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
return -EINVAL;
if (!(file_in->f_mode & FMODE_READ) ||
!(file_out->f_mode & FMODE_WRITE) ||
(file_out->f_flags & O_APPEND))
return -EBADF;
return 0;
}
/*
* Performs necessary checks before doing a file copy
*
* Can adjust amount of bytes to copy via @req_count argument.
* Returns appropriate error code that caller should return or
* zero in case the copy should be allowed.
*/
int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
size_t *req_count, unsigned int flags)
{
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
uint64_t count = *req_count;
loff_t size_in;
int ret;
ret = generic_file_rw_checks(file_in, file_out);
if (ret)
return ret;
/* Don't touch certain kinds of inodes */
if (IS_IMMUTABLE(inode_out))
return -EPERM;
if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
return -ETXTBSY;
/* Ensure offsets don't wrap. */
if (pos_in + count < pos_in || pos_out + count < pos_out)
return -EOVERFLOW;
/* Shorten the copy to EOF */
size_in = i_size_read(inode_in);
if (pos_in >= size_in)
count = 0;
else
count = min(count, size_in - (uint64_t)pos_in);
ret = generic_write_check_limits(file_out, pos_out, &count);
if (ret)
return ret;
/* Don't allow overlapped copying within the same file. */
if (inode_in == inode_out &&
pos_out + count > pos_in &&
pos_out < pos_in + count)
return -EINVAL;
*req_count = count;
return 0;
}
int pagecache_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)