2019-05-19 19:08:55 +07:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2020-06-12 13:57:37 +07:00
|
|
|
#include <crypto/hash.h>
|
2014-02-06 07:11:33 +07:00
|
|
|
#include <linux/export.h>
|
2016-11-01 20:40:13 +07:00
|
|
|
#include <linux/bvec.h>
|
2020-10-16 10:13:50 +07:00
|
|
|
#include <linux/fault-inject-usercopy.h>
|
2014-02-06 07:11:33 +07:00
|
|
|
#include <linux/uio.h>
|
|
|
|
#include <linux/pagemap.h>
|
2014-03-21 15:58:33 +07:00
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/vmalloc.h>
|
2016-09-23 03:33:12 +07:00
|
|
|
#include <linux/splice.h>
|
2020-09-25 11:51:40 +07:00
|
|
|
#include <linux/compat.h>
|
2014-11-24 13:08:00 +07:00
|
|
|
#include <net/checksum.h>
|
2018-12-04 08:52:09 +07:00
|
|
|
#include <linux/scatterlist.h>
|
2020-01-21 23:05:11 +07:00
|
|
|
#include <linux/instrumented.h>
|
2014-02-06 07:11:33 +07:00
|
|
|
|
2016-09-23 03:33:12 +07:00
|
|
|
/*
 * While this is defined, sanity() is built as a real checking function for
 * pipe-backed iterators; without it, sanity(i) is just the constant true.
 */
#define PIPE_PARANOIA /* for now */
|
|
|
|
|
iov_iter.c: macros for iterating over iov_iter
iterate_all_kinds(iter, size, ident, step_iovec, step_bvec)
iterates through the ranges covered by iter (up to size bytes total),
repeating step_iovec or step_bvec for each of those. ident is
declared in expansion of that thing, either as struct iovec or
struct bvec, and it contains the range we are currently looking
at. step_bvec should be a void expression, step_iovec - a size_t
one, with non-zero meaning "stop here, that many bytes from this
range left". In the end, the amount actually handled is stored
in size.
iov_iter_copy_from_user_atomic() and iov_iter_alignment() converted
to it.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-11-28 01:51:41 +07:00
|
|
|
/*
 * iterate_iovec - walk the iovec segments of an iterator
 * @i:    iterator; only i->iov is read here
 * @n:    lvalue: bytes requested on entry, bytes actually handled on exit
 * @__v:  struct iovec lvalue describing the range handed to STEP
 * @__p:  segment cursor lvalue; left at the last segment visited
 * @skip: lvalue: offset into the first segment on entry, offset reached
 *        inside *@__p on exit
 * @STEP: size_t expression run once per non-empty range; a non-zero value
 *        is the number of bytes of that range left unprocessed and ends
 *        the walk
 *
 * Expands to a bare { } block; the users in this file invoke it without a
 * trailing semicolon.
 */
#define iterate_iovec(i, n, __v, __p, skip, STEP) {	\
	size_t left = 0;				\
	size_t wanted = n;				\
	__p = i->iov;					\
	__v.iov_len = min(n, __p->iov_len - skip);	\
	if (likely(__v.iov_len)) {			\
		__v.iov_base = __p->iov_base + skip;	\
		left = (STEP);				\
		__v.iov_len -= left;			\
		skip += __v.iov_len;			\
		n -= __v.iov_len;			\
	}						\
	while (unlikely(!left && n)) {			\
		__p++;					\
		__v.iov_len = min(n, __p->iov_len);	\
		if (unlikely(!__v.iov_len))		\
			continue;			\
		__v.iov_base = __p->iov_base;		\
		left = (STEP);				\
		__v.iov_len -= left;			\
		skip = __v.iov_len;			\
		n -= __v.iov_len;			\
	}						\
	n = wanted - n;					\
}
|
|
|
|
|
2014-11-28 02:48:42 +07:00
|
|
|
/*
 * iterate_kvec - walk the kvec segments of an iterator
 * @i:    iterator; only i->kvec is read here
 * @n:    lvalue: bytes to walk; restored to its entry value on exit
 * @__v:  struct kvec lvalue describing the range handed to STEP
 * @__p:  segment cursor lvalue; left at the last segment visited
 * @skip: lvalue: offset into the first segment on entry, offset reached
 *        inside *@__p on exit
 * @STEP: expression run once per non-empty range; its value is discarded,
 *        so the walk never stops early
 *
 * Expands to a bare { } block; the users in this file invoke it without a
 * trailing semicolon.
 */
#define iterate_kvec(i, n, __v, __p, skip, STEP) {	\
	size_t wanted = n;				\
	__p = i->kvec;					\
	__v.iov_len = min(n, __p->iov_len - skip);	\
	if (likely(__v.iov_len)) {			\
		__v.iov_base = __p->iov_base + skip;	\
		(void)(STEP);				\
		n -= __v.iov_len;			\
		skip += __v.iov_len;			\
	}						\
	while (unlikely(n)) {				\
		__p++;					\
		__v.iov_len = min(n, __p->iov_len);	\
		if (unlikely(!__v.iov_len))		\
			continue;			\
		__v.iov_base = __p->iov_base;		\
		(void)(STEP);				\
		n -= __v.iov_len;			\
		skip = __v.iov_len;			\
	}						\
	n = wanted;					\
}
|
|
|
|
|
2016-05-30 20:34:32 +07:00
|
|
|
/*
 * iterate_bvec - walk the bio_vec segments of an iterator
 * @i:    iterator; only i->bvec is read here
 * @n:    number of bytes to walk
 * @__v:  struct bio_vec lvalue describing the range handed to STEP
 * @__bi: struct bvec_iter lvalue used as the for_each_bvec() cursor
 * @skip: bytes of the first segment that are already consumed
 * @STEP: expression run once per non-empty range; its value is discarded
 *
 * Expands to a bare { } block; the users in this file invoke it without a
 * trailing semicolon.
 */
#define iterate_bvec(i, n, __v, __bi, skip, STEP) {	\
	struct bvec_iter __start;			\
	__start.bi_idx = 0;				\
	__start.bi_bvec_done = skip;			\
	__start.bi_size = n;				\
	for_each_bvec(__v, i->bvec, __bi, __start) {	\
		if (__v.bv_len) {			\
			(void)(STEP);			\
		}					\
	}						\
}
|
|
|
|
|
2014-11-28 02:48:42 +07:00
|
|
|
/*
 * iterate_all_kinds - walk up to @n bytes of @i without advancing it
 * @i: iterator; its type bits select which walker is used
 * @n: lvalue: byte count; nothing happens when it is zero, and the iovec
 *     walker rewrites it to the amount actually handled
 * @v: name under which the current range is declared for the step
 *     expressions (struct iovec, struct bio_vec or struct kvec)
 * @I: step for iovec ranges (see iterate_iovec)
 * @B: step for bio_vec ranges (see iterate_bvec)
 * @K: step for kvec ranges (see iterate_kvec)
 *
 * The iterator itself is only read; ITER_DISCARD iterators are skipped.
 */
#define iterate_all_kinds(i, n, v, I, B, K) {			\
	if (likely(n)) {					\
		size_t skip = i->iov_offset;			\
		if (unlikely(i->type & ITER_BVEC)) {		\
			struct bvec_iter __bi;			\
			struct bio_vec v;			\
			iterate_bvec(i, n, v, __bi, skip, (B))	\
		} else if (unlikely(i->type & ITER_KVEC)) {	\
			struct kvec v;				\
			const struct kvec *kvec;		\
			iterate_kvec(i, n, v, kvec, skip, (K))	\
		} else if (unlikely(i->type & ITER_DISCARD)) {	\
			/* nothing to walk */			\
		} else {					\
			struct iovec v;				\
			const struct iovec *iov;		\
			iterate_iovec(i, n, v, iov, skip, (I))	\
		}						\
	}							\
}
|
|
|
|
|
2014-11-28 02:48:42 +07:00
|
|
|
/*
 * iterate_and_advance - walk up to @n bytes of @i and consume them
 * @i: iterator; segment pointer, nr_segs, iov_offset and count are updated
 * @n: lvalue: byte count, clamped to i->count first; after an iovec walk
 *     it holds the amount actually handled
 * @v: name under which the current range is declared for the step
 *     expressions
 * @I: step for iovec ranges (see iterate_iovec)
 * @B: step for bio_vec ranges (see iterate_bvec)
 * @K: step for kvec ranges (see iterate_kvec)
 *
 * Nothing is done when the iterator is already empty.  A segment that has
 * been consumed completely is stepped over so that iov_offset ends up 0.
 * For ITER_DISCARD only the offset and count are adjusted.
 */
#define iterate_and_advance(i, n, v, I, B, K) {			\
	if (unlikely(i->count < n))				\
		n = i->count;					\
	if (i->count) {						\
		size_t skip = i->iov_offset;			\
		if (unlikely(i->type & ITER_BVEC)) {		\
			const struct bio_vec *bvec = i->bvec;	\
			struct bvec_iter __bi;			\
			struct bio_vec v;			\
			iterate_bvec(i, n, v, __bi, skip, (B))	\
			i->bvec = __bvec_iter_bvec(i->bvec, __bi);	\
			i->nr_segs -= i->bvec - bvec;		\
			skip = __bi.bi_bvec_done;		\
		} else if (unlikely(i->type & ITER_KVEC)) {	\
			struct kvec v;				\
			const struct kvec *kvec;		\
			iterate_kvec(i, n, v, kvec, skip, (K))	\
			if (skip == kvec->iov_len) {		\
				kvec++;				\
				skip = 0;			\
			}					\
			i->nr_segs -= kvec - i->kvec;		\
			i->kvec = kvec;				\
		} else if (unlikely(i->type & ITER_DISCARD)) {	\
			skip += n;				\
		} else {					\
			struct iovec v;				\
			const struct iovec *iov;		\
			iterate_iovec(i, n, v, iov, skip, (I))	\
			if (skip == iov->iov_len) {		\
				iov++;				\
				skip = 0;			\
			}					\
			i->nr_segs -= iov - i->iov;		\
			i->iov = iov;				\
		}						\
		i->iov_offset = skip;				\
		i->count -= n;					\
	}							\
}
|
|
|
|
|
2017-06-30 09:25:14 +07:00
|
|
|
/*
 * Copy @n bytes from kernel memory @from to user memory @to.
 *
 * Returns the number of bytes that were NOT copied: all of @n when fault
 * injection asks for a failure or when access_ok() rejects the destination,
 * otherwise whatever raw_copy_to_user() reports.
 */
static int copyout(void __user *to, const void *from, size_t n)
{
	if (should_fail_usercopy())
		return n;
	if (!access_ok(to, n))
		return n;

	instrument_copy_to_user(to, from, n);
	return raw_copy_to_user(to, from, n);
}
|
|
|
|
|
|
|
|
/*
 * Copy @n bytes from user memory @from to kernel memory @to.
 *
 * Returns the number of bytes that were NOT copied: all of @n when fault
 * injection asks for a failure or when access_ok() rejects the source,
 * otherwise whatever raw_copy_from_user() reports.
 */
static int copyin(void *to, const void __user *from, size_t n)
{
	if (should_fail_usercopy())
		return n;
	if (!access_ok(from, n))
		return n;

	instrument_copy_from_user(to, from, n);
	return raw_copy_from_user(to, from, n);
}
|
|
|
|
|
2014-04-05 10:12:29 +07:00
|
|
|
static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
|
2014-02-06 07:11:33 +07:00
|
|
|
struct iov_iter *i)
|
|
|
|
{
|
|
|
|
size_t skip, copy, left, wanted;
|
|
|
|
const struct iovec *iov;
|
|
|
|
char __user *buf;
|
|
|
|
void *kaddr, *from;
|
|
|
|
|
|
|
|
if (unlikely(bytes > i->count))
|
|
|
|
bytes = i->count;
|
|
|
|
|
|
|
|
if (unlikely(!bytes))
|
|
|
|
return 0;
|
|
|
|
|
2017-06-30 09:25:14 +07:00
|
|
|
might_fault();
|
2014-02-06 07:11:33 +07:00
|
|
|
wanted = bytes;
|
|
|
|
iov = i->iov;
|
|
|
|
skip = i->iov_offset;
|
|
|
|
buf = iov->iov_base + skip;
|
|
|
|
copy = min(bytes, iov->iov_len - skip);
|
|
|
|
|
2016-07-29 05:48:50 +07:00
|
|
|
if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
|
2014-02-06 07:11:33 +07:00
|
|
|
kaddr = kmap_atomic(page);
|
|
|
|
from = kaddr + offset;
|
|
|
|
|
|
|
|
/* first chunk, usually the only one */
|
2017-06-30 09:25:14 +07:00
|
|
|
left = copyout(buf, from, copy);
|
2014-02-06 07:11:33 +07:00
|
|
|
copy -= left;
|
|
|
|
skip += copy;
|
|
|
|
from += copy;
|
|
|
|
bytes -= copy;
|
|
|
|
|
|
|
|
while (unlikely(!left && bytes)) {
|
|
|
|
iov++;
|
|
|
|
buf = iov->iov_base;
|
|
|
|
copy = min(bytes, iov->iov_len);
|
2017-06-30 09:25:14 +07:00
|
|
|
left = copyout(buf, from, copy);
|
2014-02-06 07:11:33 +07:00
|
|
|
copy -= left;
|
|
|
|
skip = copy;
|
|
|
|
from += copy;
|
|
|
|
bytes -= copy;
|
|
|
|
}
|
|
|
|
if (likely(!bytes)) {
|
|
|
|
kunmap_atomic(kaddr);
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
offset = from - kaddr;
|
|
|
|
buf += copy;
|
|
|
|
kunmap_atomic(kaddr);
|
|
|
|
copy = min(bytes, iov->iov_len - skip);
|
|
|
|
}
|
|
|
|
/* Too bad - revert to non-atomic kmap */
|
2016-07-29 05:48:50 +07:00
|
|
|
|
2014-02-06 07:11:33 +07:00
|
|
|
kaddr = kmap(page);
|
|
|
|
from = kaddr + offset;
|
2017-06-30 09:25:14 +07:00
|
|
|
left = copyout(buf, from, copy);
|
2014-02-06 07:11:33 +07:00
|
|
|
copy -= left;
|
|
|
|
skip += copy;
|
|
|
|
from += copy;
|
|
|
|
bytes -= copy;
|
|
|
|
while (unlikely(!left && bytes)) {
|
|
|
|
iov++;
|
|
|
|
buf = iov->iov_base;
|
|
|
|
copy = min(bytes, iov->iov_len);
|
2017-06-30 09:25:14 +07:00
|
|
|
left = copyout(buf, from, copy);
|
2014-02-06 07:11:33 +07:00
|
|
|
copy -= left;
|
|
|
|
skip = copy;
|
|
|
|
from += copy;
|
|
|
|
bytes -= copy;
|
|
|
|
}
|
|
|
|
kunmap(page);
|
2016-07-29 05:48:50 +07:00
|
|
|
|
2014-02-06 07:11:33 +07:00
|
|
|
done:
|
2014-04-05 06:23:46 +07:00
|
|
|
if (skip == iov->iov_len) {
|
|
|
|
iov++;
|
|
|
|
skip = 0;
|
|
|
|
}
|
2014-02-06 07:11:33 +07:00
|
|
|
i->count -= wanted - bytes;
|
|
|
|
i->nr_segs -= iov - i->iov;
|
|
|
|
i->iov = iov;
|
|
|
|
i->iov_offset = skip;
|
|
|
|
return wanted - bytes;
|
|
|
|
}
|
|
|
|
|
2014-04-05 10:12:29 +07:00
|
|
|
static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes,
|
2014-04-04 02:05:18 +07:00
|
|
|
struct iov_iter *i)
|
|
|
|
{
|
|
|
|
size_t skip, copy, left, wanted;
|
|
|
|
const struct iovec *iov;
|
|
|
|
char __user *buf;
|
|
|
|
void *kaddr, *to;
|
|
|
|
|
|
|
|
if (unlikely(bytes > i->count))
|
|
|
|
bytes = i->count;
|
|
|
|
|
|
|
|
if (unlikely(!bytes))
|
|
|
|
return 0;
|
|
|
|
|
2017-06-30 09:25:14 +07:00
|
|
|
might_fault();
|
2014-04-04 02:05:18 +07:00
|
|
|
wanted = bytes;
|
|
|
|
iov = i->iov;
|
|
|
|
skip = i->iov_offset;
|
|
|
|
buf = iov->iov_base + skip;
|
|
|
|
copy = min(bytes, iov->iov_len - skip);
|
|
|
|
|
2016-07-29 05:48:50 +07:00
|
|
|
if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
|
2014-04-04 02:05:18 +07:00
|
|
|
kaddr = kmap_atomic(page);
|
|
|
|
to = kaddr + offset;
|
|
|
|
|
|
|
|
/* first chunk, usually the only one */
|
2017-06-30 09:25:14 +07:00
|
|
|
left = copyin(to, buf, copy);
|
2014-04-04 02:05:18 +07:00
|
|
|
copy -= left;
|
|
|
|
skip += copy;
|
|
|
|
to += copy;
|
|
|
|
bytes -= copy;
|
|
|
|
|
|
|
|
while (unlikely(!left && bytes)) {
|
|
|
|
iov++;
|
|
|
|
buf = iov->iov_base;
|
|
|
|
copy = min(bytes, iov->iov_len);
|
2017-06-30 09:25:14 +07:00
|
|
|
left = copyin(to, buf, copy);
|
2014-04-04 02:05:18 +07:00
|
|
|
copy -= left;
|
|
|
|
skip = copy;
|
|
|
|
to += copy;
|
|
|
|
bytes -= copy;
|
|
|
|
}
|
|
|
|
if (likely(!bytes)) {
|
|
|
|
kunmap_atomic(kaddr);
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
offset = to - kaddr;
|
|
|
|
buf += copy;
|
|
|
|
kunmap_atomic(kaddr);
|
|
|
|
copy = min(bytes, iov->iov_len - skip);
|
|
|
|
}
|
|
|
|
/* Too bad - revert to non-atomic kmap */
|
2016-07-29 05:48:50 +07:00
|
|
|
|
2014-04-04 02:05:18 +07:00
|
|
|
kaddr = kmap(page);
|
|
|
|
to = kaddr + offset;
|
2017-06-30 09:25:14 +07:00
|
|
|
left = copyin(to, buf, copy);
|
2014-04-04 02:05:18 +07:00
|
|
|
copy -= left;
|
|
|
|
skip += copy;
|
|
|
|
to += copy;
|
|
|
|
bytes -= copy;
|
|
|
|
while (unlikely(!left && bytes)) {
|
|
|
|
iov++;
|
|
|
|
buf = iov->iov_base;
|
|
|
|
copy = min(bytes, iov->iov_len);
|
2017-06-30 09:25:14 +07:00
|
|
|
left = copyin(to, buf, copy);
|
2014-04-04 02:05:18 +07:00
|
|
|
copy -= left;
|
|
|
|
skip = copy;
|
|
|
|
to += copy;
|
|
|
|
bytes -= copy;
|
|
|
|
}
|
|
|
|
kunmap(page);
|
2016-07-29 05:48:50 +07:00
|
|
|
|
2014-04-04 02:05:18 +07:00
|
|
|
done:
|
2014-04-05 06:23:46 +07:00
|
|
|
if (skip == iov->iov_len) {
|
|
|
|
iov++;
|
|
|
|
skip = 0;
|
|
|
|
}
|
2014-04-04 02:05:18 +07:00
|
|
|
i->count -= wanted - bytes;
|
|
|
|
i->nr_segs -= iov - i->iov;
|
|
|
|
i->iov = iov;
|
|
|
|
i->iov_offset = skip;
|
|
|
|
return wanted - bytes;
|
|
|
|
}
|
|
|
|
|
2016-09-23 03:33:12 +07:00
|
|
|
#ifdef PIPE_PARANOIA
/*
 * sanity - verify that a pipe-backed iterator sits at the end of its pipe
 * @i: pipe iterator to check
 *
 * With a non-zero iov_offset the pipe must be non-empty, the iterator must
 * refer to the last occupied slot and the offset must equal the end of that
 * buffer's data.  With a zero offset the iterator must refer to the slot
 * right after the last occupied one.
 *
 * Returns true when the state is consistent.  Otherwise the iterator
 * position and every ring slot are dumped, a warning is raised and false is
 * returned.
 */
static bool sanity(const struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_head = pipe->head;
	unsigned int p_tail = pipe->tail;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int p_occupancy = pipe_occupancy(p_head, p_tail);
	unsigned int i_head = i->head;
	unsigned int idx;

	if (i->iov_offset) {
		struct pipe_buffer *p;

		if (unlikely(p_occupancy == 0))
			goto Bad;	// pipe must be non-empty
		if (unlikely(i_head != p_head - 1))
			goto Bad;	// must be at the last buffer...

		p = &pipe->bufs[i_head & p_mask];
		if (unlikely(p->offset + p->len != i->iov_offset))
			goto Bad;	// ... at the end of segment
	} else {
		if (i_head != p_head)
			goto Bad;	// must be right after the last buffer
	}
	return true;
Bad:
	/*
	 * Head and tail are free-running unsigned counters and the offset is
	 * a size_t, so print them with unsigned conversions; the signed ones
	 * would show large values as negative numbers.
	 */
	printk(KERN_ERR "idx = %u, offset = %zu\n", i_head, i->iov_offset);
	printk(KERN_ERR "head = %u, tail = %u, buffers = %u\n",
			p_head, p_tail, pipe->ring_size);
	for (idx = 0; idx < pipe->ring_size; idx++)
		printk(KERN_ERR "[%p %p %u %u]\n",
			pipe->bufs[idx].ops,
			pipe->bufs[idx].page,
			pipe->bufs[idx].offset,
			pipe->bufs[idx].len);
	WARN_ON(1);
	return false;
}
#else
#define sanity(i) true
#endif
|
|
|
|
|
|
|
|
static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
|
|
|
|
struct iov_iter *i)
|
|
|
|
{
|
|
|
|
struct pipe_inode_info *pipe = i->pipe;
|
|
|
|
struct pipe_buffer *buf;
|
2019-11-15 20:30:32 +07:00
|
|
|
unsigned int p_tail = pipe->tail;
|
|
|
|
unsigned int p_mask = pipe->ring_size - 1;
|
|
|
|
unsigned int i_head = i->head;
|
2016-09-23 03:33:12 +07:00
|
|
|
size_t off;
|
|
|
|
|
|
|
|
if (unlikely(bytes > i->count))
|
|
|
|
bytes = i->count;
|
|
|
|
|
|
|
|
if (unlikely(!bytes))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (!sanity(i))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
off = i->iov_offset;
|
2019-11-15 20:30:32 +07:00
|
|
|
buf = &pipe->bufs[i_head & p_mask];
|
2016-09-23 03:33:12 +07:00
|
|
|
if (off) {
|
|
|
|
if (offset == off && buf->page == page) {
|
|
|
|
/* merge with the last one */
|
|
|
|
buf->len += bytes;
|
|
|
|
i->iov_offset += bytes;
|
|
|
|
goto out;
|
|
|
|
}
|
2019-11-15 20:30:32 +07:00
|
|
|
i_head++;
|
|
|
|
buf = &pipe->bufs[i_head & p_mask];
|
2016-09-23 03:33:12 +07:00
|
|
|
}
|
2019-10-16 22:47:32 +07:00
|
|
|
if (pipe_full(i_head, p_tail, pipe->max_usage))
|
2016-09-23 03:33:12 +07:00
|
|
|
return 0;
|
2019-11-15 20:30:32 +07:00
|
|
|
|
2016-09-23 03:33:12 +07:00
|
|
|
buf->ops = &page_cache_pipe_buf_ops;
|
2019-11-15 20:30:32 +07:00
|
|
|
get_page(page);
|
|
|
|
buf->page = page;
|
2016-09-23 03:33:12 +07:00
|
|
|
buf->offset = offset;
|
|
|
|
buf->len = bytes;
|
2019-11-15 20:30:32 +07:00
|
|
|
|
|
|
|
pipe->head = i_head + 1;
|
2016-09-23 03:33:12 +07:00
|
|
|
i->iov_offset = offset + bytes;
|
2019-11-15 20:30:32 +07:00
|
|
|
i->head = i_head;
|
2016-09-23 03:33:12 +07:00
|
|
|
out:
|
|
|
|
i->count -= bytes;
|
|
|
|
return bytes;
|
|
|
|
}
|
|
|
|
|
2015-03-11 21:43:31 +07:00
|
|
|
/*
|
|
|
|
* Fault in one or more iovecs of the given iov_iter, to a maximum length of
|
|
|
|
* bytes. For each iovec, fault in each page that constitutes the iovec.
|
|
|
|
*
|
|
|
|
* Return 0 on success, or non-zero if the memory could not be accessed (i.e.
|
|
|
|
* because it is an invalid address).
|
|
|
|
*/
|
2016-09-16 06:11:45 +07:00
|
|
|
int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
|
2015-03-11 21:43:31 +07:00
|
|
|
{
|
|
|
|
size_t skip = i->iov_offset;
|
|
|
|
const struct iovec *iov;
|
|
|
|
int err;
|
|
|
|
struct iovec v;
|
|
|
|
|
|
|
|
if (!(i->type & (ITER_BVEC|ITER_KVEC))) {
|
|
|
|
iterate_iovec(i, bytes, v, iov, skip, ({
|
2016-09-18 05:02:44 +07:00
|
|
|
err = fault_in_pages_readable(v.iov_base, v.iov_len);
|
2015-03-11 21:43:31 +07:00
|
|
|
if (unlikely(err))
|
|
|
|
return err;
|
|
|
|
0;}))
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
2016-09-16 06:11:45 +07:00
|
|
|
EXPORT_SYMBOL(iov_iter_fault_in_readable);
|
2015-03-11 21:43:31 +07:00
|
|
|
|
2018-10-20 06:57:56 +07:00
|
|
|
void iov_iter_init(struct iov_iter *i, unsigned int direction,
|
2014-03-06 07:28:09 +07:00
|
|
|
const struct iovec *iov, unsigned long nr_segs,
|
|
|
|
size_t count)
|
|
|
|
{
|
2018-10-20 06:57:56 +07:00
|
|
|
WARN_ON(direction & ~(READ | WRITE));
|
|
|
|
direction &= READ | WRITE;
|
|
|
|
|
2014-03-06 07:28:09 +07:00
|
|
|
/* It will get better. Eventually... */
|
2017-03-21 08:08:07 +07:00
|
|
|
if (uaccess_kernel()) {
|
2018-10-20 06:57:56 +07:00
|
|
|
i->type = ITER_KVEC | direction;
|
2014-11-28 02:48:42 +07:00
|
|
|
i->kvec = (struct kvec *)iov;
|
|
|
|
} else {
|
2018-10-20 06:57:56 +07:00
|
|
|
i->type = ITER_IOVEC | direction;
|
2014-11-28 02:48:42 +07:00
|
|
|
i->iov = iov;
|
|
|
|
}
|
2014-03-06 07:28:09 +07:00
|
|
|
i->nr_segs = nr_segs;
|
|
|
|
i->iov_offset = 0;
|
|
|
|
i->count = count;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(iov_iter_init);
|
2014-03-15 15:05:57 +07:00
|
|
|
|
2014-04-05 10:12:29 +07:00
|
|
|
/*
 * Copy @len bytes out of @page, starting @offset bytes into it, to the
 * kernel buffer @to.  The page is mapped with kmap_atomic() for the
 * duration of the copy.
 */
static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len)
{
	char *mapped = kmap_atomic(page);

	memcpy(to, mapped + offset, len);
	kunmap_atomic(mapped);
}
|
|
|
|
|
2015-12-07 04:49:22 +07:00
|
|
|
/*
 * Copy @len bytes from the kernel buffer @from into @page, starting
 * @offset bytes into it.  The page is mapped with kmap_atomic() for the
 * duration of the copy.
 */
static void memcpy_to_page(struct page *page, size_t offset, const char *from, size_t len)
{
	char *mapped = kmap_atomic(page);

	memcpy(mapped + offset, from, len);
	kunmap_atomic(mapped);
}
|
|
|
|
|
2014-08-01 20:27:22 +07:00
|
|
|
/*
 * Zero @len bytes of @page, starting @offset bytes into it.  The page is
 * mapped with kmap_atomic() while it is being cleared.
 */
static void memzero_page(struct page *page, size_t offset, size_t len)
{
	char *mapped = kmap_atomic(page);

	memset(mapped + offset, 0, len);
	kunmap_atomic(mapped);
}
|
|
|
|
|
2016-09-23 03:33:12 +07:00
|
|
|
static inline bool allocated(struct pipe_buffer *buf)
|
|
|
|
{
|
|
|
|
return buf->ops == &default_pipe_buf_ops;
|
|
|
|
}
|
|
|
|
|
2019-11-15 20:30:32 +07:00
|
|
|
static inline void data_start(const struct iov_iter *i,
|
|
|
|
unsigned int *iter_headp, size_t *offp)
|
2016-09-23 03:33:12 +07:00
|
|
|
{
|
2019-11-15 20:30:32 +07:00
|
|
|
unsigned int p_mask = i->pipe->ring_size - 1;
|
|
|
|
unsigned int iter_head = i->head;
|
2016-09-23 03:33:12 +07:00
|
|
|
size_t off = i->iov_offset;
|
2019-11-15 20:30:32 +07:00
|
|
|
|
|
|
|
if (off && (!allocated(&i->pipe->bufs[iter_head & p_mask]) ||
|
|
|
|
off == PAGE_SIZE)) {
|
|
|
|
iter_head++;
|
2016-09-23 03:33:12 +07:00
|
|
|
off = 0;
|
|
|
|
}
|
2019-11-15 20:30:32 +07:00
|
|
|
*iter_headp = iter_head;
|
2016-09-23 03:33:12 +07:00
|
|
|
*offp = off;
|
|
|
|
}
|
|
|
|
|
|
|
|
static size_t push_pipe(struct iov_iter *i, size_t size,
|
2019-11-15 20:30:32 +07:00
|
|
|
int *iter_headp, size_t *offp)
|
2016-09-23 03:33:12 +07:00
|
|
|
{
|
|
|
|
struct pipe_inode_info *pipe = i->pipe;
|
2019-11-15 20:30:32 +07:00
|
|
|
unsigned int p_tail = pipe->tail;
|
|
|
|
unsigned int p_mask = pipe->ring_size - 1;
|
|
|
|
unsigned int iter_head;
|
2016-09-23 03:33:12 +07:00
|
|
|
size_t off;
|
|
|
|
ssize_t left;
|
|
|
|
|
|
|
|
if (unlikely(size > i->count))
|
|
|
|
size = i->count;
|
|
|
|
if (unlikely(!size))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
left = size;
|
2019-11-15 20:30:32 +07:00
|
|
|
data_start(i, &iter_head, &off);
|
|
|
|
*iter_headp = iter_head;
|
2016-09-23 03:33:12 +07:00
|
|
|
*offp = off;
|
|
|
|
if (off) {
|
|
|
|
left -= PAGE_SIZE - off;
|
|
|
|
if (left <= 0) {
|
2019-11-15 20:30:32 +07:00
|
|
|
pipe->bufs[iter_head & p_mask].len += size;
|
2016-09-23 03:33:12 +07:00
|
|
|
return size;
|
|
|
|
}
|
2019-11-15 20:30:32 +07:00
|
|
|
pipe->bufs[iter_head & p_mask].len = PAGE_SIZE;
|
|
|
|
iter_head++;
|
2016-09-23 03:33:12 +07:00
|
|
|
}
|
2019-10-16 22:47:32 +07:00
|
|
|
while (!pipe_full(iter_head, p_tail, pipe->max_usage)) {
|
2019-11-15 20:30:32 +07:00
|
|
|
struct pipe_buffer *buf = &pipe->bufs[iter_head & p_mask];
|
2016-09-23 03:33:12 +07:00
|
|
|
struct page *page = alloc_page(GFP_USER);
|
|
|
|
if (!page)
|
|
|
|
break;
|
2019-11-15 20:30:32 +07:00
|
|
|
|
|
|
|
buf->ops = &default_pipe_buf_ops;
|
|
|
|
buf->page = page;
|
|
|
|
buf->offset = 0;
|
|
|
|
buf->len = min_t(ssize_t, left, PAGE_SIZE);
|
|
|
|
left -= buf->len;
|
|
|
|
iter_head++;
|
|
|
|
pipe->head = iter_head;
|
|
|
|
|
|
|
|
if (left == 0)
|
2016-09-23 03:33:12 +07:00
|
|
|
return size;
|
|
|
|
}
|
|
|
|
return size - left;
|
|
|
|
}
|
|
|
|
|
|
|
|
static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
|
|
|
|
struct iov_iter *i)
|
|
|
|
{
|
|
|
|
struct pipe_inode_info *pipe = i->pipe;
|
2019-11-15 20:30:32 +07:00
|
|
|
unsigned int p_mask = pipe->ring_size - 1;
|
|
|
|
unsigned int i_head;
|
2016-09-23 03:33:12 +07:00
|
|
|
size_t n, off;
|
|
|
|
|
|
|
|
if (!sanity(i))
|
|
|
|
return 0;
|
|
|
|
|
2019-11-15 20:30:32 +07:00
|
|
|
bytes = n = push_pipe(i, bytes, &i_head, &off);
|
2016-09-23 03:33:12 +07:00
|
|
|
if (unlikely(!n))
|
|
|
|
return 0;
|
2019-11-15 20:30:32 +07:00
|
|
|
do {
|
2016-09-23 03:33:12 +07:00
|
|
|
size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
|
2019-11-15 20:30:32 +07:00
|
|
|
memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk);
|
|
|
|
i->head = i_head;
|
2016-09-23 03:33:12 +07:00
|
|
|
i->iov_offset = off + chunk;
|
|
|
|
n -= chunk;
|
|
|
|
addr += chunk;
|
2019-11-15 20:30:32 +07:00
|
|
|
off = 0;
|
|
|
|
i_head++;
|
|
|
|
} while (n);
|
2016-09-23 03:33:12 +07:00
|
|
|
i->count -= bytes;
|
|
|
|
return bytes;
|
|
|
|
}
|
|
|
|
|
2018-11-28 10:32:59 +07:00
|
|
|
/*
 * Copy @len bytes from @from to @to and fold the checksum of the copied
 * data into @sum as the block located at byte offset @off.  Returns the
 * combined checksum.
 */
static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
			      __wsum sum, size_t off)
{
	return csum_block_add(sum, csum_partial_copy_nocheck(from, to, len),
			      off);
}
|
|
|
|
|
2018-11-26 04:24:16 +07:00
|
|
|
static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
|
|
|
|
__wsum *csum, struct iov_iter *i)
|
|
|
|
{
|
|
|
|
struct pipe_inode_info *pipe = i->pipe;
|
2019-11-15 20:30:32 +07:00
|
|
|
unsigned int p_mask = pipe->ring_size - 1;
|
|
|
|
unsigned int i_head;
|
2018-11-26 04:24:16 +07:00
|
|
|
size_t n, r;
|
|
|
|
size_t off = 0;
|
2018-11-28 10:32:59 +07:00
|
|
|
__wsum sum = *csum;
|
2018-11-26 04:24:16 +07:00
|
|
|
|
|
|
|
if (!sanity(i))
|
|
|
|
return 0;
|
|
|
|
|
2019-11-15 20:30:32 +07:00
|
|
|
bytes = n = push_pipe(i, bytes, &i_head, &r);
|
2018-11-26 04:24:16 +07:00
|
|
|
if (unlikely(!n))
|
|
|
|
return 0;
|
2019-11-15 20:30:32 +07:00
|
|
|
do {
|
2018-11-26 04:24:16 +07:00
|
|
|
size_t chunk = min_t(size_t, n, PAGE_SIZE - r);
|
2019-11-15 20:30:32 +07:00
|
|
|
char *p = kmap_atomic(pipe->bufs[i_head & p_mask].page);
|
2018-11-28 10:32:59 +07:00
|
|
|
sum = csum_and_memcpy(p + r, addr, chunk, sum, off);
|
2018-11-26 04:24:16 +07:00
|
|
|
kunmap_atomic(p);
|
2019-11-15 20:30:32 +07:00
|
|
|
i->head = i_head;
|
2018-11-26 04:24:16 +07:00
|
|
|
i->iov_offset = r + chunk;
|
|
|
|
n -= chunk;
|
|
|
|
off += chunk;
|
|
|
|
addr += chunk;
|
2019-11-15 20:30:32 +07:00
|
|
|
r = 0;
|
|
|
|
i_head++;
|
|
|
|
} while (n);
|
2018-11-26 04:24:16 +07:00
|
|
|
i->count -= bytes;
|
|
|
|
*csum = sum;
|
|
|
|
return bytes;
|
|
|
|
}
|
|
|
|
|
2017-06-30 08:45:10 +07:00
|
|
|
size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
|
2014-04-05 10:12:29 +07:00
|
|
|
{
|
2015-12-07 04:49:22 +07:00
|
|
|
const char *from = addr;
|
2018-10-22 19:07:28 +07:00
|
|
|
if (unlikely(iov_iter_is_pipe(i)))
|
2016-09-23 03:33:12 +07:00
|
|
|
return copy_pipe_to_iter(addr, bytes, i);
|
2017-06-30 09:25:14 +07:00
|
|
|
if (iter_is_iovec(i))
|
|
|
|
might_fault();
|
2014-11-28 02:28:06 +07:00
|
|
|
iterate_and_advance(i, bytes, v,
|
2017-06-30 09:25:14 +07:00
|
|
|
copyout(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len),
|
2014-11-28 02:28:06 +07:00
|
|
|
memcpy_to_page(v.bv_page, v.bv_offset,
|
2014-11-28 02:48:42 +07:00
|
|
|
(from += v.bv_len) - v.bv_len, v.bv_len),
|
|
|
|
memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len)
|
2014-11-28 02:28:06 +07:00
|
|
|
)
|
2014-04-05 10:12:29 +07:00
|
|
|
|
2014-11-28 02:28:06 +07:00
|
|
|
return bytes;
|
2014-08-01 20:27:22 +07:00
|
|
|
}
|
2017-06-30 08:45:10 +07:00
|
|
|
EXPORT_SYMBOL(_copy_to_iter);
|
2014-08-01 20:27:22 +07:00
|
|
|
|
x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user, kernel}()
In reaction to a proposal to introduce a memcpy_mcsafe_fast()
implementation Linus points out that memcpy_mcsafe() is poorly named
relative to communicating the scope of the interface. Specifically what
addresses are valid to pass as source, destination, and what faults /
exceptions are handled.
Of particular concern is that even though x86 might be able to handle
the semantics of copy_mc_to_user() with its common copy_user_generic()
implementation other archs likely need / want an explicit path for this
case:
On Fri, May 1, 2020 at 11:28 AM Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> On Thu, Apr 30, 2020 at 6:21 PM Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > However now I see that copy_user_generic() works for the wrong reason.
> > It works because the exception on the source address due to poison
> > looks no different than a write fault on the user address to the
> > caller, it's still just a short copy. So it makes copy_to_user() work
> > for the wrong reason relative to the name.
>
> Right.
>
> And it won't work that way on other architectures. On x86, we have a
> generic function that can take faults on either side, and we use it
> for both cases (and for the "in_user" case too), but that's an
> artifact of the architecture oddity.
>
> In fact, it's probably wrong even on x86 - because it can hide bugs -
> but writing those things is painful enough that everybody prefers
> having just one function.
Replace a single top-level memcpy_mcsafe() with either
copy_mc_to_user(), or copy_mc_to_kernel().
Introduce an x86 copy_mc_fragile() name as the rename for the
low-level x86 implementation formerly named memcpy_mcsafe(). It is used
as the slow / careful backend that is supplanted by a fast
copy_mc_generic() in a follow-on patch.
One side-effect of this reorganization is that separating copy_mc_64.S
to its own file means that perf no longer needs to track dependencies
for its memcpy_64.S benchmarks.
[ bp: Massage a bit. ]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: <stable@vger.kernel.org>
Link: http://lore.kernel.org/r/CAHk-=wjSqtXAqfUJxFtWNwmguFASTgB0dz1dT3V-78Quiezqbg@mail.gmail.com
Link: https://lkml.kernel.org/r/160195561680.2163339.11574962055305783722.stgit@dwillia2-desk3.amr.corp.intel.com
2020-10-06 10:40:16 +07:00
|
|
|
#ifdef CONFIG_ARCH_HAS_COPY_MC
|
|
|
|
/*
 * Copy @n bytes from the kernel buffer @from to the user buffer @to using
 * copy_mc_to_user().
 *
 * If access_ok() rejects the destination range nothing is copied and @n is
 * returned unchanged; otherwise the result of copy_mc_to_user() is returned
 * (presumably the number of bytes left uncopied -- confirm against the arch
 * implementation).
 *
 * NOTE(review): the return type is int while the count is size_t, so a
 * value above INT_MAX would be truncated -- confirm callers bound @n.
 */
static int copyout_mc(void __user *to, const void *from, size_t n)
{
	/* Destination range not acceptable: report everything as uncopied. */
	if (!access_ok(to, n))
		return n;

	instrument_copy_to_user(to, from, n);
	return copy_mc_to_user((__force void *) to, from, n);
}
|
|
|
|
|
x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user, kernel}()
In reaction to a proposal to introduce a memcpy_mcsafe_fast()
implementation Linus points out that memcpy_mcsafe() is poorly named
relative to communicating the scope of the interface. Specifically what
addresses are valid to pass as source, destination, and what faults /
exceptions are handled.
Of particular concern is that even though x86 might be able to handle
the semantics of copy_mc_to_user() with its common copy_user_generic()
implementation other archs likely need / want an explicit path for this
case:
On Fri, May 1, 2020 at 11:28 AM Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> On Thu, Apr 30, 2020 at 6:21 PM Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > However now I see that copy_user_generic() works for the wrong reason.
> > It works because the exception on the source address due to poison
> > looks no different than a write fault on the user address to the
> > caller, it's still just a short copy. So it makes copy_to_user() work
> > for the wrong reason relative to the name.
>
> Right.
>
> And it won't work that way on other architectures. On x86, we have a
> generic function that can take faults on either side, and we use it
> for both cases (and for the "in_user" case too), but that's an
> artifact of the architecture oddity.
>
> In fact, it's probably wrong even on x86 - because it can hide bugs -
> but writing those things is painful enough that everybody prefers
> having just one function.
Replace a single top-level memcpy_mcsafe() with either
copy_mc_to_user(), or copy_mc_to_kernel().
Introduce an x86 copy_mc_fragile() name as the rename for the
low-level x86 implementation formerly named memcpy_mcsafe(). It is used
as the slow / careful backend that is supplanted by a fast
copy_mc_generic() in a follow-on patch.
One side-effect of this reorganization is that separating copy_mc_64.S
to its own file means that perf no longer needs to track dependencies
for its memcpy_64.S benchmarks.
[ bp: Massage a bit. ]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: <stable@vger.kernel.org>
Link: http://lore.kernel.org/r/CAHk-=wjSqtXAqfUJxFtWNwmguFASTgB0dz1dT3V-78Quiezqbg@mail.gmail.com
Link: https://lkml.kernel.org/r/160195561680.2163339.11574962055305783722.stgit@dwillia2-desk3.amr.corp.intel.com
2020-10-06 10:40:16 +07:00
|
|
|
/*
 * Copy @len bytes from @from into @page, starting @offset bytes into the
 * page, via copy_mc_to_kernel().
 *
 * The page is mapped with kmap_atomic() for the duration of the copy and
 * unmapped again before returning.  The return value is whatever
 * copy_mc_to_kernel() reports; the caller in this file treats it as the
 * number of bytes that were not copied.
 */
static unsigned long copy_mc_to_page(struct page *page, size_t offset,
		const char *from, size_t len)
{
	char *kaddr = kmap_atomic(page);
	unsigned long uncopied = copy_mc_to_kernel(kaddr + offset, from, len);

	kunmap_atomic(kaddr);
	return uncopied;
}
|
|
|
|
|
x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user, kernel}()
In reaction to a proposal to introduce a memcpy_mcsafe_fast()
implementation Linus points out that memcpy_mcsafe() is poorly named
relative to communicating the scope of the interface. Specifically what
addresses are valid to pass as source, destination, and what faults /
exceptions are handled.
Of particular concern is that even though x86 might be able to handle
the semantics of copy_mc_to_user() with its common copy_user_generic()
implementation other archs likely need / want an explicit path for this
case:
On Fri, May 1, 2020 at 11:28 AM Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> On Thu, Apr 30, 2020 at 6:21 PM Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > However now I see that copy_user_generic() works for the wrong reason.
> > It works because the exception on the source address due to poison
> > looks no different than a write fault on the user address to the
> > caller, it's still just a short copy. So it makes copy_to_user() work
> > for the wrong reason relative to the name.
>
> Right.
>
> And it won't work that way on other architectures. On x86, we have a
> generic function that can take faults on either side, and we use it
> for both cases (and for the "in_user" case too), but that's an
> artifact of the architecture oddity.
>
> In fact, it's probably wrong even on x86 - because it can hide bugs -
> but writing those things is painful enough that everybody prefers
> having just one function.
Replace a single top-level memcpy_mcsafe() with either
copy_mc_to_user(), or copy_mc_to_kernel().
Introduce an x86 copy_mc_fragile() name as the rename for the
low-level x86 implementation formerly named memcpy_mcsafe(). It is used
as the slow / careful backend that is supplanted by a fast
copy_mc_generic() in a follow-on patch.
One side-effect of this reorganization is that separating copy_mc_64.S
to its own file means that perf no longer needs to track dependencies
for its memcpy_64.S benchmarks.
[ bp: Massage a bit. ]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: <stable@vger.kernel.org>
Link: http://lore.kernel.org/r/CAHk-=wjSqtXAqfUJxFtWNwmguFASTgB0dz1dT3V-78Quiezqbg@mail.gmail.com
Link: https://lkml.kernel.org/r/160195561680.2163339.11574962055305783722.stgit@dwillia2-desk3.amr.corp.intel.com
2020-10-06 10:40:16 +07:00
|
|
|
/*
 * Copy up to @bytes bytes from @addr into the pipe behind iterator @i,
 * one pipe buffer page at a time, using copy_mc_to_page().
 *
 * push_pipe() decides how many bytes can be accepted and where the first
 * one goes (ring slot and offset within its page).  After every chunk the
 * iterator's head and iov_offset are updated to reflect what has been
 * written so far; the loop stops early as soon as a chunk is only
 * partially copied.  The iterator's count is reduced by the number of
 * bytes copied, and that number is returned (0 if sanity() fails or
 * push_pipe() accepts nothing).
 *
 * NOTE(review): when the copy stops short, nothing here releases or trims
 * the buffers push_pipe() set up beyond the stopping point -- confirm
 * whether that is handled elsewhere.
 */
static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
		struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int ring_mask = pipe->ring_size - 1;
	unsigned int slot;
	size_t remaining, page_off, copied = 0;

	if (!sanity(i))
		return 0;

	remaining = push_pipe(i, bytes, &slot, &page_off);
	if (unlikely(!remaining))
		return 0;

	while (remaining) {
		/* Never cross the end of the current buffer page. */
		size_t chunk = min_t(size_t, remaining, PAGE_SIZE - page_off);
		unsigned long failed;
		size_t done;

		failed = copy_mc_to_page(pipe->bufs[slot & ring_mask].page,
					 page_off, addr, chunk);
		done = chunk - failed;

		/* Record progress even when the chunk was cut short. */
		i->head = slot;
		i->iov_offset = page_off + done;
		copied += done;
		if (failed)
			break;

		/* Chunk fully copied: move on to the start of the next slot. */
		remaining -= chunk;
		addr += chunk;
		page_off = 0;
		slot++;
	}

	i->count -= copied;
	return copied;
}
|
|
|
|
|
2018-07-09 03:46:02 +07:00
|
|
|
/**
|
x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user, kernel}()
In reaction to a proposal to introduce a memcpy_mcsafe_fast()
implementation Linus points out that memcpy_mcsafe() is poorly named
relative to communicating the scope of the interface. Specifically what
addresses are valid to pass as source, destination, and what faults /
exceptions are handled.
Of particular concern is that even though x86 might be able to handle
the semantics of copy_mc_to_user() with its common copy_user_generic()
implementation other archs likely need / want an explicit path for this
case:
On Fri, May 1, 2020 at 11:28 AM Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> On Thu, Apr 30, 2020 at 6:21 PM Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > However now I see that copy_user_generic() works for the wrong reason.
> > It works because the exception on the source address due to poison
> > looks no different than a write fault on the user address to the
> > caller, it's still just a short copy. So it makes copy_to_user() work
> > for the wrong reason relative to the name.
>
> Right.
>
> And it won't work that way on other architectures. On x86, we have a
> generic function that can take faults on either side, and we use it
> for both cases (and for the "in_user" case too), but that's an
> artifact of the architecture oddity.
>
> In fact, it's probably wrong even on x86 - because it can hide bugs -
> but writing those things is painful enough that everybody prefers
> having just one function.
Replace a single top-level memcpy_mcsafe() with either
copy_mc_to_user(), or copy_mc_to_kernel().
Introduce an x86 copy_mc_fragile() name as the rename for the
low-level x86 implementation formerly named memcpy_mcsafe(). It is used
as the slow / careful backend that is supplanted by a fast
copy_mc_generic() in a follow-on patch.
One side-effect of this reorganization is that separating copy_mc_64.S
to its own file means that perf no longer needs to track dependencies
for its memcpy_64.S benchmarks.
[ bp: Massage a bit. ]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: <stable@vger.kernel.org>
Link: http://lore.kernel.org/r/CAHk-=wjSqtXAqfUJxFtWNwmguFASTgB0dz1dT3V-78Quiezqbg@mail.gmail.com
Link: https://lkml.kernel.org/r/160195561680.2163339.11574962055305783722.stgit@dwillia2-desk3.amr.corp.intel.com
2020-10-06 10:40:16 +07:00
|
|
|
* _copy_mc_to_iter - copy to iter with source memory error exception handling
|
2018-07-09 03:46:02 +07:00
|
|
|
* @addr: source kernel address
|
|
|
|
* @bytes: total transfer length
|
|
|
|
* @iter: destination iterator
|
|
|
|
*
|
x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user, kernel}()
In reaction to a proposal to introduce a memcpy_mcsafe_fast()
implementation Linus points out that memcpy_mcsafe() is poorly named
relative to communicating the scope of the interface. Specifically what
addresses are valid to pass as source, destination, and what faults /
exceptions are handled.
Of particular concern is that even though x86 might be able to handle
the semantics of copy_mc_to_user() with its common copy_user_generic()
implementation other archs likely need / want an explicit path for this
case:
On Fri, May 1, 2020 at 11:28 AM Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> On Thu, Apr 30, 2020 at 6:21 PM Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > However now I see that copy_user_generic() works for the wrong reason.
> > It works because the exception on the source address due to poison
> > looks no different than a write fault on the user address to the
> > caller, it's still just a short copy. So it makes copy_to_user() work
> > for the wrong reason relative to the name.
>
> Right.
>
> And it won't work that way on other architectures. On x86, we have a
> generic function that can take faults on either side, and we use it
> for both cases (and for the "in_user" case too), but that's an
> artifact of the architecture oddity.
>
> In fact, it's probably wrong even on x86 - because it can hide bugs -
> but writing those things is painful enough that everybody prefers
> having just one function.
Replace a single top-level memcpy_mcsafe() with either
copy_mc_to_user(), or copy_mc_to_kernel().
Introduce an x86 copy_mc_fragile() name as the rename for the
low-level x86 implementation formerly named memcpy_mcsafe(). It is used
as the slow / careful backend that is supplanted by a fast
copy_mc_generic() in a follow-on patch.
One side-effect of this reorganization is that separating copy_mc_64.S
to its own file means that perf no longer needs to track dependencies
for its memcpy_64.S benchmarks.
[ bp: Massage a bit. ]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: <stable@vger.kernel.org>
Link: http://lore.kernel.org/r/CAHk-=wjSqtXAqfUJxFtWNwmguFASTgB0dz1dT3V-78Quiezqbg@mail.gmail.com
Link: https://lkml.kernel.org/r/160195561680.2163339.11574962055305783722.stgit@dwillia2-desk3.amr.corp.intel.com
2020-10-06 10:40:16 +07:00
|
|
|
* The pmem driver deploys this for the dax operation
|
|
|
|
* (dax_copy_to_iter()) for dax reads (bypass page-cache and the
|
|
|
|
* block-layer). Upon #MC read(2) aborts and returns EIO or the bytes
|
|
|
|
* successfully copied.
|
2018-07-09 03:46:02 +07:00
|
|
|
*
|
x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user, kernel}()
In reaction to a proposal to introduce a memcpy_mcsafe_fast()
implementation Linus points out that memcpy_mcsafe() is poorly named
relative to communicating the scope of the interface. Specifically what
addresses are valid to pass as source, destination, and what faults /
exceptions are handled.
Of particular concern is that even though x86 might be able to handle
the semantics of copy_mc_to_user() with its common copy_user_generic()
implementation other archs likely need / want an explicit path for this
case:
On Fri, May 1, 2020 at 11:28 AM Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> On Thu, Apr 30, 2020 at 6:21 PM Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > However now I see that copy_user_generic() works for the wrong reason.
> > It works because the exception on the source address due to poison
> > looks no different than a write fault on the user address to the
> > caller, it's still just a short copy. So it makes copy_to_user() work
> > for the wrong reason relative to the name.
>
> Right.
>
> And it won't work that way on other architectures. On x86, we have a
> generic function that can take faults on either side, and we use it
> for both cases (and for the "in_user" case too), but that's an
> artifact of the architecture oddity.
>
> In fact, it's probably wrong even on x86 - because it can hide bugs -
> but writing those things is painful enough that everybody prefers
> having just one function.
Replace a single top-level memcpy_mcsafe() with either
copy_mc_to_user(), or copy_mc_to_kernel().
Introduce an x86 copy_mc_fragile() name as the rename for the
low-level x86 implementation formerly named memcpy_mcsafe(). It is used
as the slow / careful backend that is supplanted by a fast
copy_mc_generic() in a follow-on patch.
One side-effect of this reorganization is that separating copy_mc_64.S
to its own file means that perf no longer needs to track dependencies
for its memcpy_64.S benchmarks.
[ bp: Massage a bit. ]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: <stable@vger.kernel.org>
Link: http://lore.kernel.org/r/CAHk-=wjSqtXAqfUJxFtWNwmguFASTgB0dz1dT3V-78Quiezqbg@mail.gmail.com
Link: https://lkml.kernel.org/r/160195561680.2163339.11574962055305783722.stgit@dwillia2-desk3.amr.corp.intel.com
2020-10-06 10:40:16 +07:00
|
|
|
* The main differences between this and a typical _copy_to_iter() are:
|
2018-07-09 03:46:02 +07:00
|
|
|
*
|
|
|
|
* * Typical tail/residue handling after a fault retries the copy
|
|
|
|
* byte-by-byte until the fault happens again. Re-triggering machine
|
|
|
|
* checks is potentially fatal so the implementation uses source
|
|
|
|
* alignment and poison alignment assumptions to avoid re-triggering
|
|
|
|
* hardware exceptions.
|
|
|
|
*
|
|
|
|
* * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
|
|
|
|
* Compare to copy_to_iter() where only ITER_IOVEC attempts might return
|
|
|
|
* a short copy.
|
|
|
|
*/
|
x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user, kernel}()
In reaction to a proposal to introduce a memcpy_mcsafe_fast()
implementation Linus points out that memcpy_mcsafe() is poorly named
relative to communicating the scope of the interface. Specifically what
addresses are valid to pass as source, destination, and what faults /
exceptions are handled.
Of particular concern is that even though x86 might be able to handle
the semantics of copy_mc_to_user() with its common copy_user_generic()
implementation other archs likely need / want an explicit path for this
case:
On Fri, May 1, 2020 at 11:28 AM Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> On Thu, Apr 30, 2020 at 6:21 PM Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > However now I see that copy_user_generic() works for the wrong reason.
> > It works because the exception on the source address due to poison
> > looks no different than a write fault on the user address to the
> > caller, it's still just a short copy. So it makes copy_to_user() work
> > for the wrong reason relative to the name.
>
> Right.
>
> And it won't work that way on other architectures. On x86, we have a
> generic function that can take faults on either side, and we use it
> for both cases (and for the "in_user" case too), but that's an
> artifact of the architecture oddity.
>
> In fact, it's probably wrong even on x86 - because it can hide bugs -
> but writing those things is painful enough that everybody prefers
> having just one function.
Replace a single top-level memcpy_mcsafe() with either
copy_mc_to_user(), or copy_mc_to_kernel().
Introduce an x86 copy_mc_fragile() name as the rename for the
low-level x86 implementation formerly named memcpy_mcsafe(). It is used
as the slow / careful backend that is supplanted by a fast
copy_mc_generic() in a follow-on patch.
One side-effect of this reorganization is that separating copy_mc_64.S
to its own file means that perf no longer needs to track dependencies
for its memcpy_64.S benchmarks.
[ bp: Massage a bit. ]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: <stable@vger.kernel.org>
Link: http://lore.kernel.org/r/CAHk-=wjSqtXAqfUJxFtWNwmguFASTgB0dz1dT3V-78Quiezqbg@mail.gmail.com
Link: https://lkml.kernel.org/r/160195561680.2163339.11574962055305783722.stgit@dwillia2-desk3.amr.corp.intel.com
2020-10-06 10:40:16 +07:00
|
|
|
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
|
2018-05-04 07:06:31 +07:00
|
|
|
{
|
|
|
|
const char *from = addr;
|
|
|
|
unsigned long rem, curr_addr, s_addr = (unsigned long) addr;
|
|
|
|
|
2018-10-22 19:07:28 +07:00
|
|
|
if (unlikely(iov_iter_is_pipe(i)))
|
x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user, kernel}()
In reaction to a proposal to introduce a memcpy_mcsafe_fast()
implementation Linus points out that memcpy_mcsafe() is poorly named
relative to communicating the scope of the interface. Specifically what
addresses are valid to pass as source, destination, and what faults /
exceptions are handled.
Of particular concern is that even though x86 might be able to handle
the semantics of copy_mc_to_user() with its common copy_user_generic()
implementation other archs likely need / want an explicit path for this
case:
On Fri, May 1, 2020 at 11:28 AM Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> On Thu, Apr 30, 2020 at 6:21 PM Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > However now I see that copy_user_generic() works for the wrong reason.
> > It works because the exception on the source address due to poison
> > looks no different than a write fault on the user address to the
> > caller, it's still just a short copy. So it makes copy_to_user() work
> > for the wrong reason relative to the name.
>
> Right.
>
> And it won't work that way on other architectures. On x86, we have a
> generic function that can take faults on either side, and we use it
> for both cases (and for the "in_user" case too), but that's an
> artifact of the architecture oddity.
>
> In fact, it's probably wrong even on x86 - because it can hide bugs -
> but writing those things is painful enough that everybody prefers
> having just one function.
Replace a single top-level memcpy_mcsafe() with either
copy_mc_to_user(), or copy_mc_to_kernel().
Introduce an x86 copy_mc_fragile() name as the rename for the
low-level x86 implementation formerly named memcpy_mcsafe(). It is used
as the slow / careful backend that is supplanted by a fast
copy_mc_generic() in a follow-on patch.
One side-effect of this reorganization is that separating copy_mc_64.S
to its own file means that perf no longer needs to track dependencies
for its memcpy_64.S benchmarks.
[ bp: Massage a bit. ]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: <stable@vger.kernel.org>
Link: http://lore.kernel.org/r/CAHk-=wjSqtXAqfUJxFtWNwmguFASTgB0dz1dT3V-78Quiezqbg@mail.gmail.com
Link: https://lkml.kernel.org/r/160195561680.2163339.11574962055305783722.stgit@dwillia2-desk3.amr.corp.intel.com
2020-10-06 10:40:16 +07:00
|
|
|
return copy_mc_pipe_to_iter(addr, bytes, i);
|
2018-05-04 07:06:31 +07:00
|
|
|
if (iter_is_iovec(i))
|
|
|
|
might_fault();
|
|
|
|
iterate_and_advance(i, bytes, v,
|
x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user, kernel}()
In reaction to a proposal to introduce a memcpy_mcsafe_fast()
implementation Linus points out that memcpy_mcsafe() is poorly named
relative to communicating the scope of the interface. Specifically what
addresses are valid to pass as source, destination, and what faults /
exceptions are handled.
Of particular concern is that even though x86 might be able to handle
the semantics of copy_mc_to_user() with its common copy_user_generic()
implementation other archs likely need / want an explicit path for this
case:
On Fri, May 1, 2020 at 11:28 AM Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> On Thu, Apr 30, 2020 at 6:21 PM Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > However now I see that copy_user_generic() works for the wrong reason.
> > It works because the exception on the source address due to poison
> > looks no different than a write fault on the user address to the
> > caller, it's still just a short copy. So it makes copy_to_user() work
> > for the wrong reason relative to the name.
>
> Right.
>
> And it won't work that way on other architectures. On x86, we have a
> generic function that can take faults on either side, and we use it
> for both cases (and for the "in_user" case too), but that's an
> artifact of the architecture oddity.
>
> In fact, it's probably wrong even on x86 - because it can hide bugs -
> but writing those things is painful enough that everybody prefers
> having just one function.
Replace a single top-level memcpy_mcsafe() with either
copy_mc_to_user(), or copy_mc_to_kernel().
Introduce an x86 copy_mc_fragile() name as the rename for the
low-level x86 implementation formerly named memcpy_mcsafe(). It is used
as the slow / careful backend that is supplanted by a fast
copy_mc_generic() in a follow-on patch.
One side-effect of this reorganization is that separating copy_mc_64.S
to its own file means that perf no longer needs to track dependencies
for its memcpy_64.S benchmarks.
[ bp: Massage a bit. ]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: <stable@vger.kernel.org>
Link: http://lore.kernel.org/r/CAHk-=wjSqtXAqfUJxFtWNwmguFASTgB0dz1dT3V-78Quiezqbg@mail.gmail.com
Link: https://lkml.kernel.org/r/160195561680.2163339.11574962055305783722.stgit@dwillia2-desk3.amr.corp.intel.com
2020-10-06 10:40:16 +07:00
|
|
|
copyout_mc(v.iov_base, (from += v.iov_len) - v.iov_len,
|
|
|
|
v.iov_len),
|
2018-05-04 07:06:31 +07:00
|
|
|
({
|
x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user, kernel}()
In reaction to a proposal to introduce a memcpy_mcsafe_fast()
implementation Linus points out that memcpy_mcsafe() is poorly named
relative to communicating the scope of the interface. Specifically what
addresses are valid to pass as source, destination, and what faults /
exceptions are handled.
Of particular concern is that even though x86 might be able to handle
the semantics of copy_mc_to_user() with its common copy_user_generic()
implementation other archs likely need / want an explicit path for this
case:
On Fri, May 1, 2020 at 11:28 AM Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> On Thu, Apr 30, 2020 at 6:21 PM Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > However now I see that copy_user_generic() works for the wrong reason.
> > It works because the exception on the source address due to poison
> > looks no different than a write fault on the user address to the
> > caller, it's still just a short copy. So it makes copy_to_user() work
> > for the wrong reason relative to the name.
>
> Right.
>
> And it won't work that way on other architectures. On x86, we have a
> generic function that can take faults on either side, and we use it
> for both cases (and for the "in_user" case too), but that's an
> artifact of the architecture oddity.
>
> In fact, it's probably wrong even on x86 - because it can hide bugs -
> but writing those things is painful enough that everybody prefers
> having just one function.
Replace a single top-level memcpy_mcsafe() with either
copy_mc_to_user(), or copy_mc_to_kernel().
Introduce an x86 copy_mc_fragile() name as the rename for the
low-level x86 implementation formerly named memcpy_mcsafe(). It is used
as the slow / careful backend that is supplanted by a fast
copy_mc_generic() in a follow-on patch.
One side-effect of this reorganization is that separating copy_mc_64.S
to its own file means that perf no longer needs to track dependencies
for its memcpy_64.S benchmarks.
[ bp: Massage a bit. ]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: <stable@vger.kernel.org>
Link: http://lore.kernel.org/r/CAHk-=wjSqtXAqfUJxFtWNwmguFASTgB0dz1dT3V-78Quiezqbg@mail.gmail.com
Link: https://lkml.kernel.org/r/160195561680.2163339.11574962055305783722.stgit@dwillia2-desk3.amr.corp.intel.com
2020-10-06 10:40:16 +07:00
|
|
|
rem = copy_mc_to_page(v.bv_page, v.bv_offset,
|
|
|
|
(from += v.bv_len) - v.bv_len, v.bv_len);
|
2018-05-04 07:06:31 +07:00
|
|
|
if (rem) {
|
|
|
|
curr_addr = (unsigned long) from;
|
|
|
|
bytes = curr_addr - s_addr - rem;
|
|
|
|
return bytes;
|
|
|
|
}
|
|
|
|
}),
|
|
|
|
({
|
x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user, kernel}()
In reaction to a proposal to introduce a memcpy_mcsafe_fast()
implementation Linus points out that memcpy_mcsafe() is poorly named
relative to communicating the scope of the interface. Specifically what
addresses are valid to pass as source, destination, and what faults /
exceptions are handled.
Of particular concern is that even though x86 might be able to handle
the semantics of copy_mc_to_user() with its common copy_user_generic()
implementation other archs likely need / want an explicit path for this
case:
On Fri, May 1, 2020 at 11:28 AM Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> On Thu, Apr 30, 2020 at 6:21 PM Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > However now I see that copy_user_generic() works for the wrong reason.
> > It works because the exception on the source address due to poison
> > looks no different than a write fault on the user address to the
> > caller, it's still just a short copy. So it makes copy_to_user() work
> > for the wrong reason relative to the name.
>
> Right.
>
> And it won't work that way on other architectures. On x86, we have a
> generic function that can take faults on either side, and we use it
> for both cases (and for the "in_user" case too), but that's an
> artifact of the architecture oddity.
>
> In fact, it's probably wrong even on x86 - because it can hide bugs -
> but writing those things is painful enough that everybody prefers
> having just one function.
Replace a single top-level memcpy_mcsafe() with either
copy_mc_to_user(), or copy_mc_to_kernel().
Introduce an x86 copy_mc_fragile() name as the rename for the
low-level x86 implementation formerly named memcpy_mcsafe(). It is used
as the slow / careful backend that is supplanted by a fast
copy_mc_generic() in a follow-on patch.
One side-effect of this reorganization is that separating copy_mc_64.S
to its own file means that perf no longer needs to track dependencies
for its memcpy_64.S benchmarks.
[ bp: Massage a bit. ]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: <stable@vger.kernel.org>
Link: http://lore.kernel.org/r/CAHk-=wjSqtXAqfUJxFtWNwmguFASTgB0dz1dT3V-78Quiezqbg@mail.gmail.com
Link: https://lkml.kernel.org/r/160195561680.2163339.11574962055305783722.stgit@dwillia2-desk3.amr.corp.intel.com
2020-10-06 10:40:16 +07:00
|
|
|
rem = copy_mc_to_kernel(v.iov_base, (from += v.iov_len)
|
|
|
|
- v.iov_len, v.iov_len);
|
2018-05-04 07:06:31 +07:00
|
|
|
if (rem) {
|
|
|
|
curr_addr = (unsigned long) from;
|
|
|
|
bytes = curr_addr - s_addr - rem;
|
|
|
|
return bytes;
|
|
|
|
}
|
|
|
|
})
|
|
|
|
)
|
|
|
|
|
|
|
|
return bytes;
|
|
|
|
}
|
x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user, kernel}()
In reaction to a proposal to introduce a memcpy_mcsafe_fast()
implementation Linus points out that memcpy_mcsafe() is poorly named
relative to communicating the scope of the interface. Specifically what
addresses are valid to pass as source, destination, and what faults /
exceptions are handled.
Of particular concern is that even though x86 might be able to handle
the semantics of copy_mc_to_user() with its common copy_user_generic()
implementation other archs likely need / want an explicit path for this
case:
On Fri, May 1, 2020 at 11:28 AM Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> On Thu, Apr 30, 2020 at 6:21 PM Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > However now I see that copy_user_generic() works for the wrong reason.
> > It works because the exception on the source address due to poison
> > looks no different than a write fault on the user address to the
> > caller, it's still just a short copy. So it makes copy_to_user() work
> > for the wrong reason relative to the name.
>
> Right.
>
> And it won't work that way on other architectures. On x86, we have a
> generic function that can take faults on either side, and we use it
> for both cases (and for the "in_user" case too), but that's an
> artifact of the architecture oddity.
>
> In fact, it's probably wrong even on x86 - because it can hide bugs -
> but writing those things is painful enough that everybody prefers
> having just one function.
Replace a single top-level memcpy_mcsafe() with either
copy_mc_to_user(), or copy_mc_to_kernel().
Introduce an x86 copy_mc_fragile() name as the rename for the
low-level x86 implementation formerly named memcpy_mcsafe(). It is used
as the slow / careful backend that is supplanted by a fast
copy_mc_generic() in a follow-on patch.
One side-effect of this reorganization is that separating copy_mc_64.S
to its own file means that perf no longer needs to track dependencies
for its memcpy_64.S benchmarks.
[ bp: Massage a bit. ]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: <stable@vger.kernel.org>
Link: http://lore.kernel.org/r/CAHk-=wjSqtXAqfUJxFtWNwmguFASTgB0dz1dT3V-78Quiezqbg@mail.gmail.com
Link: https://lkml.kernel.org/r/160195561680.2163339.11574962055305783722.stgit@dwillia2-desk3.amr.corp.intel.com
2020-10-06 10:40:16 +07:00
|
|
|
EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
|
|
|
|
#endif /* CONFIG_ARCH_HAS_COPY_MC */
|
2018-05-04 07:06:31 +07:00
|
|
|
|
2017-06-30 08:45:10 +07:00
|
|
|
size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
|
2014-08-01 20:27:22 +07:00
|
|
|
{
|
2014-11-28 02:26:43 +07:00
|
|
|
char *to = addr;
|
2018-10-22 19:07:28 +07:00
|
|
|
if (unlikely(iov_iter_is_pipe(i))) {
|
2016-09-23 03:33:12 +07:00
|
|
|
WARN_ON(1);
|
|
|
|
return 0;
|
|
|
|
}
|
2017-06-30 09:25:14 +07:00
|
|
|
if (iter_is_iovec(i))
|
|
|
|
might_fault();
|
2014-11-28 02:26:43 +07:00
|
|
|
iterate_and_advance(i, bytes, v,
|
2017-06-30 09:25:14 +07:00
|
|
|
copyin((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
|
2014-11-28 02:26:43 +07:00
|
|
|
memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
|
2014-11-28 02:48:42 +07:00
|
|
|
v.bv_offset, v.bv_len),
|
|
|
|
memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
|
2014-11-28 02:26:43 +07:00
|
|
|
)
|
|
|
|
|
|
|
|
return bytes;
|
2014-08-01 20:27:22 +07:00
|
|
|
}
|
2017-06-30 08:45:10 +07:00
|
|
|
EXPORT_SYMBOL(_copy_from_iter);
|
2014-08-01 20:27:22 +07:00
|
|
|
|
2017-06-30 08:45:10 +07:00
|
|
|
bool _copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i)
|
2016-11-02 09:09:04 +07:00
|
|
|
{
|
|
|
|
char *to = addr;
|
2018-10-22 19:07:28 +07:00
|
|
|
if (unlikely(iov_iter_is_pipe(i))) {
|
2016-11-02 09:09:04 +07:00
|
|
|
WARN_ON(1);
|
|
|
|
return false;
|
|
|
|
}
|
2016-12-22 09:55:02 +07:00
|
|
|
if (unlikely(i->count < bytes))
|
2016-11-02 09:09:04 +07:00
|
|
|
return false;
|
|
|
|
|
2017-06-30 09:25:14 +07:00
|
|
|
if (iter_is_iovec(i))
|
|
|
|
might_fault();
|
2016-11-02 09:09:04 +07:00
|
|
|
iterate_all_kinds(i, bytes, v, ({
|
2017-06-30 09:25:14 +07:00
|
|
|
if (copyin((to += v.iov_len) - v.iov_len,
|
2016-11-02 09:09:04 +07:00
|
|
|
v.iov_base, v.iov_len))
|
|
|
|
return false;
|
|
|
|
0;}),
|
|
|
|
memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
|
|
|
|
v.bv_offset, v.bv_len),
|
|
|
|
memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
|
|
|
|
)
|
|
|
|
|
|
|
|
iov_iter_advance(i, bytes);
|
|
|
|
return true;
|
|
|
|
}
|
2017-06-30 08:45:10 +07:00
|
|
|
EXPORT_SYMBOL(_copy_from_iter_full);
|
2016-11-02 09:09:04 +07:00
|
|
|
|
2017-06-30 08:45:10 +07:00
|
|
|
size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
|
2014-11-28 08:27:08 +07:00
|
|
|
{
|
|
|
|
char *to = addr;
|
2018-10-22 19:07:28 +07:00
|
|
|
if (unlikely(iov_iter_is_pipe(i))) {
|
2016-09-23 03:33:12 +07:00
|
|
|
WARN_ON(1);
|
|
|
|
return 0;
|
|
|
|
}
|
2014-11-28 08:27:08 +07:00
|
|
|
iterate_and_advance(i, bytes, v,
|
2017-03-26 05:47:28 +07:00
|
|
|
__copy_from_user_inatomic_nocache((to += v.iov_len) - v.iov_len,
|
2014-11-28 08:27:08 +07:00
|
|
|
v.iov_base, v.iov_len),
|
|
|
|
memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
|
|
|
|
v.bv_offset, v.bv_len),
|
|
|
|
memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
|
|
|
|
)
|
|
|
|
|
|
|
|
return bytes;
|
|
|
|
}
|
2017-06-30 08:45:10 +07:00
|
|
|
EXPORT_SYMBOL(_copy_from_iter_nocache);
|
2014-11-28 08:27:08 +07:00
|
|
|
|
2017-05-30 02:22:50 +07:00
|
|
|
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
|
2018-07-09 03:46:07 +07:00
|
|
|
/**
|
|
|
|
* _copy_from_iter_flushcache - write destination through cpu cache
|
|
|
|
* @addr: destination kernel address
|
|
|
|
* @bytes: total transfer length
|
|
|
|
* @i: source iterator
|
|
|
|
*
|
|
|
|
* The pmem driver arranges for filesystem-dax to use this facility via
|
|
|
|
* dax_copy_from_iter() for ensuring that writes to persistent memory
|
|
|
|
* are flushed through the CPU cache. It is differentiated from
|
|
|
|
* _copy_from_iter_nocache() in that it guarantees all data is flushed for
|
|
|
|
* all iterator types. The _copy_from_iter_nocache() only attempts to
|
|
|
|
* bypass the cache for the ITER_IOVEC case, and on some archs may use
|
|
|
|
* instructions that strand dirty-data in the cache.
|
|
|
|
*/
|
Merge branch 'uaccess-work.iov_iter' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Pull iov_iter hardening from Al Viro:
"This is the iov_iter/uaccess/hardening pile.
For one thing, it trims the inline part of copy_to_user/copy_from_user
to the minimum that *does* need to be inlined - object size checks,
basically. For another, it sanitizes the checks for iov_iter
primitives. There are 4 groups of checks: access_ok(), might_fault(),
object size and KASAN.
- access_ok() had been verified by whoever had set the iov_iter up.
However, that has happened in a function far away, so proving that
there's no path to actual copying bypassing those checks is hard
and proving that iov_iter has not been buggered in the meanwhile is
also not pleasant. So we want those redone in actual
copyin/copyout.
- might_fault() is better off consolidated - we know whether it needs
to be checked as soon as we enter iov_iter primitive and observe
the iov_iter flavour. No need to wait until the copyin/copyout. The
call chains are short enough to make sure we won't miss anything -
in fact, it's more robust that way, since there are cases where we
do e.g. forced fault-in before getting to copyin/copyout. It's not
quite what we need to check (in particular, combination of
iovec-backed and set_fs(KERNEL_DS) is almost certainly a bug, not a
cause to skip checks), but that's for later series. For now let's
keep might_fault().
- KASAN checks belong in copyin/copyout - at the same level where
other iov_iter flavours would've hit them in memcpy().
- object size checks should apply to *all* iov_iter flavours, not
just iovec-backed ones.
There are two groups of primitives - one gets the kernel object
described as pointer + size (copy_to_iter(), etc.) while another gets
it as page + offset + size (copy_page_to_iter(), etc.)
For the first group the checks are best done where we actually have a
chance to find the object size. In other words, those belong in inline
wrappers in uio.h, before calling into iov_iter.c. Same kind as we
have for inlined part of copy_to_user().
For the second group there is no object to look at - offset in page is
just a number, it bears no type information. So we do them in the
common helper called by iov_iter.c primitives of that kind. All it
currently does is checking that we are not trying to access outside of
the compound page; eventually we might want to add some sanity checks
on the page involved.
So the things we need in copyin/copyout part of iov_iter.c do not
quite match anything in uaccess.h (we want no zeroing, we *do* want
access_ok() and KASAN and we want no might_fault() or object size
checks done on that level). OTOH, these needs are simple enough to
provide a couple of helpers (static in iov_iter.c) doing just what we
need..."
* 'uaccess-work.iov_iter' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
iov_iter: saner checks on copyin/copyout
iov_iter: sanity checks for copy to/from page primitives
iov_iter/hardening: move object size checks to inlined part
copy_{to,from}_user(): consolidate object size checks
copy_{from,to}_user(): move kasan checks and might_fault() out-of-line
2017-07-08 10:39:20 +07:00
|
|
|
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
|
2017-05-30 02:22:50 +07:00
|
|
|
{
|
|
|
|
char *to = addr;
|
2018-10-22 19:07:28 +07:00
|
|
|
if (unlikely(iov_iter_is_pipe(i))) {
|
2017-05-30 02:22:50 +07:00
|
|
|
WARN_ON(1);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
iterate_and_advance(i, bytes, v,
|
|
|
|
__copy_from_user_flushcache((to += v.iov_len) - v.iov_len,
|
|
|
|
v.iov_base, v.iov_len),
|
|
|
|
memcpy_page_flushcache((to += v.bv_len) - v.bv_len, v.bv_page,
|
|
|
|
v.bv_offset, v.bv_len),
|
|
|
|
memcpy_flushcache((to += v.iov_len) - v.iov_len, v.iov_base,
|
|
|
|
v.iov_len)
|
|
|
|
)
|
|
|
|
|
|
|
|
return bytes;
|
|
|
|
}
|
Merge branch 'uaccess-work.iov_iter' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Pull iov_iter hardening from Al Viro:
"This is the iov_iter/uaccess/hardening pile.
For one thing, it trims the inline part of copy_to_user/copy_from_user
to the minimum that *does* need to be inlined - object size checks,
basically. For another, it sanitizes the checks for iov_iter
primitives. There are 4 groups of checks: access_ok(), might_fault(),
object size and KASAN.
- access_ok() had been verified by whoever had set the iov_iter up.
However, that has happened in a function far away, so proving that
there's no path to actual copying bypassing those checks is hard
and proving that iov_iter has not been buggered in the meanwhile is
also not pleasant. So we want those redone in actual
copyin/copyout.
- might_fault() is better off consolidated - we know whether it needs
to be checked as soon as we enter iov_iter primitive and observe
the iov_iter flavour. No need to wait until the copyin/copyout. The
call chains are short enough to make sure we won't miss anything -
in fact, it's more robust that way, since there are cases where we
do e.g. forced fault-in before getting to copyin/copyout. It's not
quite what we need to check (in particular, combination of
iovec-backed and set_fs(KERNEL_DS) is almost certainly a bug, not a
cause to skip checks), but that's for later series. For now let's
keep might_fault().
- KASAN checks belong in copyin/copyout - at the same level where
other iov_iter flavours would've hit them in memcpy().
- object size checks should apply to *all* iov_iter flavours, not
just iovec-backed ones.
There are two groups of primitives - one gets the kernel object
described as pointer + size (copy_to_iter(), etc.) while another gets
it as page + offset + size (copy_page_to_iter(), etc.)
For the first group the checks are best done where we actually have a
chance to find the object size. In other words, those belong in inline
wrappers in uio.h, before calling into iov_iter.c. Same kind as we
have for inlined part of copy_to_user().
For the second group there is no object to look at - offset in page is
just a number, it bears no type information. So we do them in the
common helper called by iov_iter.c primitives of that kind. All it
currently does is checking that we are not trying to access outside of
the compound page; eventually we might want to add some sanity checks
on the page involved.
So the things we need in copyin/copyout part of iov_iter.c do not
quite match anything in uaccess.h (we want no zeroing, we *do* want
access_ok() and KASAN and we want no might_fault() or object size
checks done on that level). OTOH, these needs are simple enough to
provide a couple of helpers (static in iov_iter.c) doing just what we
need..."
* 'uaccess-work.iov_iter' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
iov_iter: saner checks on copyin/copyout
iov_iter: sanity checks for copy to/from page primitives
iov_iter/hardening: move object size checks to inlined part
copy_{to,from}_user(): consolidate object size checks
copy_{from,to}_user(): move kasan checks and might_fault() out-of-line
2017-07-08 10:39:20 +07:00
|
|
|
EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
|
2017-05-30 02:22:50 +07:00
|
|
|
#endif
|
|
|
|
|
2017-06-30 08:45:10 +07:00
|
|
|
bool _copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i)
|
2016-11-02 09:09:04 +07:00
|
|
|
{
|
|
|
|
char *to = addr;
|
2018-10-22 19:07:28 +07:00
|
|
|
if (unlikely(iov_iter_is_pipe(i))) {
|
2016-11-02 09:09:04 +07:00
|
|
|
WARN_ON(1);
|
|
|
|
return false;
|
|
|
|
}
|
2016-12-22 09:55:02 +07:00
|
|
|
if (unlikely(i->count < bytes))
|
2016-11-02 09:09:04 +07:00
|
|
|
return false;
|
|
|
|
iterate_all_kinds(i, bytes, v, ({
|
2017-03-26 05:47:28 +07:00
|
|
|
if (__copy_from_user_inatomic_nocache((to += v.iov_len) - v.iov_len,
|
2016-11-02 09:09:04 +07:00
|
|
|
v.iov_base, v.iov_len))
|
|
|
|
return false;
|
|
|
|
0;}),
|
|
|
|
memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
|
|
|
|
v.bv_offset, v.bv_len),
|
|
|
|
memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
|
|
|
|
)
|
|
|
|
|
|
|
|
iov_iter_advance(i, bytes);
|
|
|
|
return true;
|
|
|
|
}
|
2017-06-30 08:45:10 +07:00
|
|
|
EXPORT_SYMBOL(_copy_from_iter_full_nocache);
|
2016-11-02 09:09:04 +07:00
|
|
|
|
2017-06-30 08:52:57 +07:00
|
|
|
static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
|
|
|
|
{
|
2019-02-27 01:42:39 +07:00
|
|
|
struct page *head;
|
|
|
|
size_t v = n + offset;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The general case needs to access the page order in order
|
|
|
|
* to compute the page size.
|
|
|
|
* However, we mostly deal with order-0 pages and thus can
|
|
|
|
* avoid a possible cache line miss for requests that fit all
|
|
|
|
* page orders.
|
|
|
|
*/
|
|
|
|
if (n <= v && v <= PAGE_SIZE)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
head = compound_head(page);
|
|
|
|
v += (page - head) << PAGE_SHIFT;
|
2017-08-30 01:20:32 +07:00
|
|
|
|
2019-09-24 05:34:25 +07:00
|
|
|
if (likely(n <= v && v <= (page_size(head))))
|
2017-06-30 08:52:57 +07:00
|
|
|
return true;
|
|
|
|
WARN_ON(1);
|
|
|
|
return false;
|
|
|
|
}
|
2016-11-02 09:09:04 +07:00
|
|
|
|
2014-04-05 10:12:29 +07:00
|
|
|
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
|
|
|
|
struct iov_iter *i)
|
|
|
|
{
|
2017-06-30 08:52:57 +07:00
|
|
|
if (unlikely(!page_copy_sane(page, offset, bytes)))
|
|
|
|
return 0;
|
2014-11-28 02:22:37 +07:00
|
|
|
if (i->type & (ITER_BVEC|ITER_KVEC)) {
|
|
|
|
void *kaddr = kmap_atomic(page);
|
|
|
|
size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
|
|
|
|
kunmap_atomic(kaddr);
|
|
|
|
return wanted;
|
2018-10-20 06:57:56 +07:00
|
|
|
} else if (unlikely(iov_iter_is_discard(i)))
|
|
|
|
return bytes;
|
|
|
|
else if (likely(!iov_iter_is_pipe(i)))
|
2014-04-05 10:12:29 +07:00
|
|
|
return copy_page_to_iter_iovec(page, offset, bytes, i);
|
2016-09-23 03:33:12 +07:00
|
|
|
else
|
|
|
|
return copy_page_to_iter_pipe(page, offset, bytes, i);
|
2014-04-05 10:12:29 +07:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(copy_page_to_iter);
|
|
|
|
|
|
|
|
size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
|
|
|
|
struct iov_iter *i)
|
|
|
|
{
|
2017-06-30 08:52:57 +07:00
|
|
|
if (unlikely(!page_copy_sane(page, offset, bytes)))
|
|
|
|
return 0;
|
2018-10-20 06:57:56 +07:00
|
|
|
if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
|
2016-09-23 03:33:12 +07:00
|
|
|
WARN_ON(1);
|
|
|
|
return 0;
|
|
|
|
}
|
2014-11-28 02:48:42 +07:00
|
|
|
if (i->type & (ITER_BVEC|ITER_KVEC)) {
|
2014-11-28 02:22:37 +07:00
|
|
|
void *kaddr = kmap_atomic(page);
|
2017-06-30 08:45:10 +07:00
|
|
|
size_t wanted = _copy_from_iter(kaddr + offset, bytes, i);
|
2014-11-28 02:22:37 +07:00
|
|
|
kunmap_atomic(kaddr);
|
|
|
|
return wanted;
|
|
|
|
} else
|
2014-04-05 10:12:29 +07:00
|
|
|
return copy_page_from_iter_iovec(page, offset, bytes, i);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(copy_page_from_iter);
|
|
|
|
|
2016-09-23 03:33:12 +07:00
|
|
|
static size_t pipe_zero(size_t bytes, struct iov_iter *i)
|
|
|
|
{
|
|
|
|
struct pipe_inode_info *pipe = i->pipe;
|
2019-11-15 20:30:32 +07:00
|
|
|
unsigned int p_mask = pipe->ring_size - 1;
|
|
|
|
unsigned int i_head;
|
2016-09-23 03:33:12 +07:00
|
|
|
size_t n, off;
|
|
|
|
|
|
|
|
if (!sanity(i))
|
|
|
|
return 0;
|
|
|
|
|
2019-11-15 20:30:32 +07:00
|
|
|
bytes = n = push_pipe(i, bytes, &i_head, &off);
|
2016-09-23 03:33:12 +07:00
|
|
|
if (unlikely(!n))
|
|
|
|
return 0;
|
|
|
|
|
2019-11-15 20:30:32 +07:00
|
|
|
do {
|
2016-09-23 03:33:12 +07:00
|
|
|
size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
|
2019-11-15 20:30:32 +07:00
|
|
|
memzero_page(pipe->bufs[i_head & p_mask].page, off, chunk);
|
|
|
|
i->head = i_head;
|
2016-09-23 03:33:12 +07:00
|
|
|
i->iov_offset = off + chunk;
|
|
|
|
n -= chunk;
|
2019-11-15 20:30:32 +07:00
|
|
|
off = 0;
|
|
|
|
i_head++;
|
|
|
|
} while (n);
|
2016-09-23 03:33:12 +07:00
|
|
|
i->count -= bytes;
|
|
|
|
return bytes;
|
|
|
|
}
|
|
|
|
|
2014-08-01 20:27:22 +07:00
|
|
|
size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
|
|
|
|
{
|
2018-10-22 19:07:28 +07:00
|
|
|
if (unlikely(iov_iter_is_pipe(i)))
|
2016-09-23 03:33:12 +07:00
|
|
|
return pipe_zero(bytes, i);
|
2014-11-28 02:18:54 +07:00
|
|
|
iterate_and_advance(i, bytes, v,
|
2017-06-30 09:25:14 +07:00
|
|
|
clear_user(v.iov_base, v.iov_len),
|
2014-11-28 02:48:42 +07:00
|
|
|
memzero_page(v.bv_page, v.bv_offset, v.bv_len),
|
|
|
|
memset(v.iov_base, 0, v.iov_len)
|
2014-11-28 02:18:54 +07:00
|
|
|
)
|
|
|
|
|
|
|
|
return bytes;
|
2014-08-01 20:27:22 +07:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(iov_iter_zero);
|
|
|
|
|
2014-04-05 10:12:29 +07:00
|
|
|
/*
 * Copy up to @bytes bytes described by iterator @i into @page, starting at
 * byte @offset of the page, through a kmap_atomic() mapping.
 *
 * Returns 0 if page_copy_sane() rejects the request or if @i is a pipe or
 * discard iterator (the latter also triggers a warning); otherwise returns
 * @bytes as left behind by the iterate_all_kinds() walk.
 */
size_t iov_iter_copy_from_user_atomic(struct page *page,
		struct iov_iter *i, unsigned long offset, size_t bytes)
{
	char *kaddr, *dst;

	/* Validate everything first so the failure paths need no unmap. */
	if (unlikely(!page_copy_sane(page, offset, bytes)))
		return 0;
	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
		WARN_ON(1);
		return 0;
	}

	kaddr = kmap_atomic(page);
	dst = kaddr + offset;
	/* Each step copies one segment and moves dst past what it wrote. */
	iterate_all_kinds(i, bytes, v,
		copyin((dst += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
		memcpy_from_page((dst += v.bv_len) - v.bv_len, v.bv_page,
				 v.bv_offset, v.bv_len),
		memcpy((dst += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
	)
	kunmap_atomic(kaddr);
	return bytes;
}
EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
|
|
|
|
|
2017-01-15 07:33:08 +07:00
|
|
|
static inline void pipe_truncate(struct iov_iter *i)
|
|
|
|
{
|
|
|
|
struct pipe_inode_info *pipe = i->pipe;
|
2019-11-15 20:30:32 +07:00
|
|
|
unsigned int p_tail = pipe->tail;
|
|
|
|
unsigned int p_head = pipe->head;
|
|
|
|
unsigned int p_mask = pipe->ring_size - 1;
|
|
|
|
|
|
|
|
if (!pipe_empty(p_head, p_tail)) {
|
|
|
|
struct pipe_buffer *buf;
|
|
|
|
unsigned int i_head = i->head;
|
2017-01-15 07:33:08 +07:00
|
|
|
size_t off = i->iov_offset;
|
2019-11-15 20:30:32 +07:00
|
|
|
|
2017-01-15 07:33:08 +07:00
|
|
|
if (off) {
|
2019-11-15 20:30:32 +07:00
|
|
|
buf = &pipe->bufs[i_head & p_mask];
|
|
|
|
buf->len = off - buf->offset;
|
|
|
|
i_head++;
|
2017-01-15 07:33:08 +07:00
|
|
|
}
|
2019-11-15 20:30:32 +07:00
|
|
|
while (p_head != i_head) {
|
|
|
|
p_head--;
|
|
|
|
pipe_buf_release(pipe, &pipe->bufs[p_head & p_mask]);
|
2017-01-15 07:33:08 +07:00
|
|
|
}
|
2019-11-15 20:30:32 +07:00
|
|
|
|
|
|
|
pipe->head = p_head;
|
2017-01-15 07:33:08 +07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-09-23 03:33:12 +07:00
|
|
|
/*
 * Move a pipe-backed iterator forward by @size bytes (clamped to the bytes
 * remaining in @i), then release everything in the pipe past the new
 * position via pipe_truncate().
 */
static void pipe_advance(struct iov_iter *i, size_t size)
{
	struct pipe_inode_info *pipe = i->pipe;

	if (unlikely(size > i->count))
		size = i->count;

	if (size) {
		unsigned int mask = pipe->ring_size - 1;
		unsigned int slot = i->head;
		struct pipe_buffer *buf = &pipe->bufs[slot & mask];
		size_t remaining = size;

		/* make it relative to the beginning of buffer */
		if (i->iov_offset)
			remaining += i->iov_offset - buf->offset;

		/* skip whole buffers until the target lands inside one */
		while (remaining > buf->len) {
			remaining -= buf->len;
			slot++;
			buf = &pipe->bufs[slot & mask];
		}

		i->head = slot;
		i->iov_offset = buf->offset + remaining;
	}

	i->count -= size;
	/* ... and discard everything past that point */
	pipe_truncate(i);
}
|
|
|
|
|
2014-04-05 10:12:29 +07:00
|
|
|
/*
 * Advance iterator @i by @size bytes.
 *
 * Pipe iterators are handled by pipe_advance().  Discard iterators carry no
 * segment state, so only the remaining byte count is reduced.  Every other
 * flavour is walked with iterate_and_advance() using no-op steps.
 */
void iov_iter_advance(struct iov_iter *i, size_t size)
{
	if (unlikely(iov_iter_is_pipe(i))) {
		pipe_advance(i, size);
		return;
	}
	if (unlikely(iov_iter_is_discard(i))) {
		/*
		 * Never move past the end: an unclamped subtraction would
		 * wrap i->count around to a huge value.  pipe_advance()
		 * applies the same clamp.
		 */
		if (unlikely(size > i->count))
			size = i->count;
		i->count -= size;
		return;
	}
	iterate_and_advance(i, size, v, 0, 0, 0)
}
EXPORT_SYMBOL(iov_iter_advance);
|
|
|
|
|
2017-02-18 06:42:24 +07:00
|
|
|
/*
 * Pipe flavour of iov_iter_revert(): walk backwards through the ring from
 * the iterator's current slot until @unroll bytes have been given back,
 * store the new position, then drop the buffers that now lie past it.
 */
static void revert_pipe_iter(struct iov_iter *i, size_t unroll)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int mask = pipe->ring_size - 1;
	unsigned int slot = i->head;
	size_t pos = i->iov_offset;

	for (;;) {
		struct pipe_buffer *buf = &pipe->bufs[slot & mask];
		size_t used = pos - buf->offset;

		if (unroll < used) {
			pos -= unroll;
			break;
		}
		unroll -= used;
		/* back at the slot the iterator was created on */
		if (!unroll && slot == i->start_head) {
			pos = 0;
			break;
		}
		slot--;
		buf = &pipe->bufs[slot & mask];
		pos = buf->offset + buf->len;
	}

	i->iov_offset = pos;
	i->head = slot;
	pipe_truncate(i);
}

/*
 * Undo the last @unroll bytes of advancement on iterator @i.
 *
 * A zero @unroll is a no-op; a value above MAX_RW_COUNT triggers a warning
 * and is ignored.  i->count always grows by @unroll.  Discard iterators need
 * nothing further; the other flavours step backwards through their segment
 * array (or pipe ring) until the bytes are accounted for.
 */
void iov_iter_revert(struct iov_iter *i, size_t unroll)
{
	if (!unroll)
		return;
	if (WARN_ON(unroll > MAX_RW_COUNT))
		return;

	i->count += unroll;

	if (unlikely(iov_iter_is_pipe(i))) {
		revert_pipe_iter(i, unroll);
		return;
	}
	if (unlikely(iov_iter_is_discard(i)))
		return;

	/* still inside the current segment? */
	if (unroll <= i->iov_offset) {
		i->iov_offset -= unroll;
		return;
	}
	unroll -= i->iov_offset;

	if (iov_iter_is_bvec(i)) {
		const struct bio_vec *seg = i->bvec;
		size_t seg_len;

		for (;;) {
			seg--;
			seg_len = seg->bv_len;
			i->nr_segs++;
			if (unroll <= seg_len)
				break;
			unroll -= seg_len;
		}
		i->bvec = seg;
		i->iov_offset = seg_len - unroll;
	} else { /* same logic for iovec and kvec */
		const struct iovec *seg = i->iov;
		size_t seg_len;

		for (;;) {
			seg--;
			seg_len = seg->iov_len;
			i->nr_segs++;
			if (unroll <= seg_len)
				break;
			unroll -= seg_len;
		}
		i->iov = seg;
		i->iov_offset = seg_len - unroll;
	}
}
EXPORT_SYMBOL(iov_iter_revert);
|
|
|
|
|
2014-04-05 10:12:29 +07:00
|
|
|
/*
|
|
|
|
* Return the count of just the current iov_iter segment.
|
|
|
|
*/
|
|
|
|
size_t iov_iter_single_seg_count(const struct iov_iter *i)
|
|
|
|
{
|
2018-10-22 19:07:28 +07:00
|
|
|
if (unlikely(iov_iter_is_pipe(i)))
|
2016-09-23 03:33:12 +07:00
|
|
|
return i->count; // it is a silly place, anyway
|
2014-04-05 10:12:29 +07:00
|
|
|
if (i->nr_segs == 1)
|
|
|
|
return i->count;
|
2018-10-20 06:57:56 +07:00
|
|
|
if (unlikely(iov_iter_is_discard(i)))
|
|
|
|
return i->count;
|
2018-10-22 19:07:28 +07:00
|
|
|
else if (iov_iter_is_bvec(i))
|
2014-04-05 10:12:29 +07:00
|
|
|
return min(i->count, i->bvec->bv_len - i->iov_offset);
|
2014-11-13 16:15:23 +07:00
|
|
|
else
|
|
|
|
return min(i->count, i->iov->iov_len - i->iov_offset);
|
2014-04-05 10:12:29 +07:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(iov_iter_single_seg_count);
|
|
|
|
|
2018-10-20 06:57:56 +07:00
|
|
|
void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
|
2015-01-23 13:08:07 +07:00
|
|
|
const struct kvec *kvec, unsigned long nr_segs,
|
2014-11-25 02:46:11 +07:00
|
|
|
size_t count)
|
|
|
|
{
|
2018-10-20 06:57:56 +07:00
|
|
|
WARN_ON(direction & ~(READ | WRITE));
|
|
|
|
i->type = ITER_KVEC | (direction & (READ | WRITE));
|
2015-01-23 13:08:07 +07:00
|
|
|
i->kvec = kvec;
|
2014-11-25 02:46:11 +07:00
|
|
|
i->nr_segs = nr_segs;
|
|
|
|
i->iov_offset = 0;
|
|
|
|
i->count = count;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(iov_iter_kvec);
|
|
|
|
|
2018-10-20 06:57:56 +07:00
|
|
|
void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
|
2015-01-23 13:08:07 +07:00
|
|
|
const struct bio_vec *bvec, unsigned long nr_segs,
|
|
|
|
size_t count)
|
|
|
|
{
|
2018-10-20 06:57:56 +07:00
|
|
|
WARN_ON(direction & ~(READ | WRITE));
|
|
|
|
i->type = ITER_BVEC | (direction & (READ | WRITE));
|
2015-01-23 13:08:07 +07:00
|
|
|
i->bvec = bvec;
|
|
|
|
i->nr_segs = nr_segs;
|
|
|
|
i->iov_offset = 0;
|
|
|
|
i->count = count;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(iov_iter_bvec);
|
|
|
|
|
2018-10-20 06:57:56 +07:00
|
|
|
void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
|
2016-09-23 03:33:12 +07:00
|
|
|
struct pipe_inode_info *pipe,
|
|
|
|
size_t count)
|
|
|
|
{
|
2018-10-20 06:57:56 +07:00
|
|
|
BUG_ON(direction != READ);
|
2019-11-15 20:30:32 +07:00
|
|
|
WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size));
|
2018-10-20 06:57:56 +07:00
|
|
|
i->type = ITER_PIPE | READ;
|
2016-09-23 03:33:12 +07:00
|
|
|
i->pipe = pipe;
|
2019-11-15 20:30:32 +07:00
|
|
|
i->head = pipe->head;
|
2016-09-23 03:33:12 +07:00
|
|
|
i->iov_offset = 0;
|
|
|
|
i->count = count;
|
2019-11-15 20:30:32 +07:00
|
|
|
i->start_head = i->head;
|
2016-09-23 03:33:12 +07:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(iov_iter_pipe);
|
|
|
|
|
2018-10-20 06:57:56 +07:00
|
|
|
/**
|
|
|
|
* iov_iter_discard - Initialise an I/O iterator that discards data
|
|
|
|
* @i: The iterator to initialise.
|
|
|
|
* @direction: The direction of the transfer.
|
|
|
|
* @count: The size of the I/O buffer in bytes.
|
|
|
|
*
|
|
|
|
* Set up an I/O iterator that just discards everything that's written to it.
|
|
|
|
* It's only available as a READ iterator.
|
|
|
|
*/
|
|
|
|
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
|
|
|
|
{
|
|
|
|
BUG_ON(direction != READ);
|
|
|
|
i->type = ITER_DISCARD | READ;
|
|
|
|
i->count = count;
|
|
|
|
i->iov_offset = 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(iov_iter_discard);
|
|
|
|
|
2014-04-05 10:12:29 +07:00
|
|
|
unsigned long iov_iter_alignment(const struct iov_iter *i)
|
|
|
|
{
|
iov_iter.c: macros for iterating over iov_iter
iterate_all_kinds(iter, size, ident, step_iovec, step_bvec)
iterates through the ranges covered by iter (up to size bytes total),
repeating step_iovec or step_bvec for each of those. ident is
declared in expansion of that thing, either as struct iovec or
struct bvec, and it contains the range we are currently looking
at. step_bvec should be a void expression, step_iovec - a size_t
one, with non-zero meaning "stop here, that many bytes from this
range left". In the end, the amount actually handled is stored
in size.
iov_iter_copy_from_user_atomic() and iov_iter_alignment() converted
to it.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-11-28 01:51:41 +07:00
|
|
|
unsigned long res = 0;
|
|
|
|
size_t size = i->count;
|
|
|
|
|
2018-10-22 19:07:28 +07:00
|
|
|
if (unlikely(iov_iter_is_pipe(i))) {
|
2019-12-16 17:54:32 +07:00
|
|
|
unsigned int p_mask = i->pipe->ring_size - 1;
|
|
|
|
|
2019-11-15 20:30:32 +07:00
|
|
|
if (size && i->iov_offset && allocated(&i->pipe->bufs[i->head & p_mask]))
|
2016-09-23 03:33:12 +07:00
|
|
|
return size | i->iov_offset;
|
|
|
|
return size;
|
|
|
|
}
|
iov_iter.c: macros for iterating over iov_iter
iterate_all_kinds(iter, size, ident, step_iovec, step_bvec)
iterates through the ranges covered by iter (up to size bytes total),
repeating step_iovec or step_bvec for each of those. ident is
declared in expansion of that thing, either as struct iovec or
struct bvec, and it contains the range we are currently looking
at. step_bvec should be a void expression, step_iovec - a size_t
one, with non-zero meaning "stop here, that many bytes from this
range left". In the end, the amount actually handled is stored
in size.
iov_iter_copy_from_user_atomic() and iov_iter_alignment() converted
to it.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-11-28 01:51:41 +07:00
|
|
|
iterate_all_kinds(i, size, v,
|
|
|
|
(res |= (unsigned long)v.iov_base | v.iov_len, 0),
|
2014-11-28 02:48:42 +07:00
|
|
|
res |= v.bv_offset | v.bv_len,
|
|
|
|
res |= (unsigned long)v.iov_base | v.iov_len
|
iov_iter.c: macros for iterating over iov_iter
iterate_all_kinds(iter, size, ident, step_iovec, step_bvec)
iterates through the ranges covered by iter (up to size bytes total),
repeating step_iovec or step_bvec for each of those. ident is
declared in expansion of that thing, either as struct iovec or
struct bvec, and it contains the range we are currently looking
at. step_bvec should be a void expression, step_iovec - a size_t
one, with non-zero meaning "stop here, that many bytes from this
range left". In the end, the amount actually handled is stored
in size.
iov_iter_copy_from_user_atomic() and iov_iter_alignment() converted
to it.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-11-28 01:51:41 +07:00
|
|
|
)
|
|
|
|
return res;
|
2014-04-05 10:12:29 +07:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(iov_iter_alignment);
|
|
|
|
|
2016-04-09 06:05:19 +07:00
|
|
|
unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
|
|
|
|
{
|
2016-12-22 09:55:02 +07:00
|
|
|
unsigned long res = 0;
|
2016-04-09 06:05:19 +07:00
|
|
|
size_t size = i->count;
|
|
|
|
|
2018-10-20 06:57:56 +07:00
|
|
|
if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
|
2016-09-23 03:33:12 +07:00
|
|
|
WARN_ON(1);
|
|
|
|
return ~0U;
|
|
|
|
}
|
|
|
|
|
2016-04-09 06:05:19 +07:00
|
|
|
iterate_all_kinds(i, size, v,
|
|
|
|
(res |= (!res ? 0 : (unsigned long)v.iov_base) |
|
|
|
|
(size != v.iov_len ? size : 0), 0),
|
|
|
|
(res |= (!res ? 0 : (unsigned long)v.bv_offset) |
|
|
|
|
(size != v.bv_len ? size : 0)),
|
|
|
|
(res |= (!res ? 0 : (unsigned long)v.iov_base) |
|
|
|
|
(size != v.iov_len ? size : 0))
|
|
|
|
);
|
2016-12-22 09:55:02 +07:00
|
|
|
return res;
|
2016-04-09 06:05:19 +07:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(iov_iter_gap_alignment);
|
|
|
|
|
2018-05-03 01:16:56 +07:00
|
|
|
static inline ssize_t __pipe_get_pages(struct iov_iter *i,
|
2016-09-23 03:33:12 +07:00
|
|
|
size_t maxsize,
|
|
|
|
struct page **pages,
|
2019-11-15 20:30:32 +07:00
|
|
|
int iter_head,
|
2016-09-23 03:33:12 +07:00
|
|
|
size_t *start)
|
|
|
|
{
|
|
|
|
struct pipe_inode_info *pipe = i->pipe;
|
2019-11-15 20:30:32 +07:00
|
|
|
unsigned int p_mask = pipe->ring_size - 1;
|
|
|
|
ssize_t n = push_pipe(i, maxsize, &iter_head, start);
|
2016-09-23 03:33:12 +07:00
|
|
|
if (!n)
|
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
maxsize = n;
|
|
|
|
n += *start;
|
2016-10-12 00:21:14 +07:00
|
|
|
while (n > 0) {
|
2019-11-15 20:30:32 +07:00
|
|
|
get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
|
|
|
|
iter_head++;
|
2016-09-23 03:33:12 +07:00
|
|
|
n -= PAGE_SIZE;
|
|
|
|
}
|
|
|
|
|
|
|
|
return maxsize;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Pipe flavour of iov_iter_get_pages(): grab at most @maxpages pages and
 * @maxsize bytes from the pipe behind @i into the caller-supplied @pages.
 *
 * Returns 0 for a zero @maxsize, -EFAULT if sanity() fails, otherwise the
 * result of __pipe_get_pages().
 */
static ssize_t pipe_get_pages(struct iov_iter *i,
		   struct page **pages, size_t maxsize, unsigned maxpages,
		   size_t *start)
{
	unsigned int iter_head, avail;
	size_t room;

	if (!maxsize)
		return 0;

	if (!sanity(i))
		return -EFAULT;

	data_start(i, &iter_head, start);
	/* Amount of free space: some of this one + all after this one */
	avail = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
	if (avail > maxpages)
		avail = maxpages;
	room = avail * PAGE_SIZE - *start;
	if (room > maxsize)
		room = maxsize;

	return __pipe_get_pages(i, room, pages, iter_head, start);
}
|
|
|
|
|
2014-04-05 10:12:29 +07:00
|
|
|
/*
 * Take references on the pages backing the first segment of @i, storing
 * them in the caller-supplied @pages array (at most @maxpages entries) and
 * the offset into the first page in *@start.  At most @maxsize bytes
 * (clamped to i->count) are covered.
 *
 * Returns the number of bytes covered, 0 if nothing was iterated, or a
 * negative error: -EFAULT for discard and kvec iterators, or whatever
 * get_user_pages_fast() reports for user memory.
 */
ssize_t iov_iter_get_pages(struct iov_iter *i,
		   struct page **pages, size_t maxsize, unsigned maxpages,
		   size_t *start)
{
	if (maxsize > i->count)
		maxsize = i->count;

	if (unlikely(iov_iter_is_pipe(i)))
		return pipe_get_pages(i, pages, maxsize, maxpages, start);
	if (unlikely(iov_iter_is_discard(i)))
		return -EFAULT;

	iterate_all_kinds(i, maxsize, v, ({
		unsigned long uaddr = (unsigned long)v.iov_base;
		size_t span;
		int want;
		int got;

		*start = uaddr & (PAGE_SIZE - 1);
		span = v.iov_len + *start;
		if (span > maxpages * PAGE_SIZE)
			span = maxpages * PAGE_SIZE;
		uaddr &= ~(PAGE_SIZE - 1);
		want = DIV_ROUND_UP(span, PAGE_SIZE);
		got = get_user_pages_fast(uaddr, want,
				iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0,
				pages);
		if (unlikely(got < 0))
			return got;
		/* a short pin covers only whole pages */
		if (got == want)
			return span - *start;
		return got * PAGE_SIZE - *start;
	0;}),({
		/* can't be more than PAGE_SIZE */
		*start = v.bv_offset;
		*pages = v.bv_page;
		get_page(v.bv_page);
		return v.bv_len;
	}),({
		return -EFAULT;
	})
	)
	return 0;
}
EXPORT_SYMBOL(iov_iter_get_pages);
|
|
|
|
|
2014-11-28 02:14:31 +07:00
|
|
|
/* Allocate an array of @n page pointers with kvmalloc_array(); NULL on failure. */
static struct page **get_pages_array(size_t n)
{
	struct page **array;

	array = kvmalloc_array(n, sizeof(*array), GFP_KERNEL);
	return array;
}
|
|
|
|
|
2016-09-23 03:33:12 +07:00
|
|
|
/*
 * Pipe flavour of iov_iter_get_pages_alloc(): size and allocate a page
 * array, fill it through __pipe_get_pages() and hand it back in *@pages.
 *
 * Returns 0 for a zero @maxsize, -EFAULT if sanity() fails, -ENOMEM if the
 * array cannot be allocated, otherwise the result of __pipe_get_pages().
 * The array is freed again unless that result is positive.
 */
static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
		   struct page ***pages, size_t maxsize,
		   size_t *start)
{
	struct page **array;
	unsigned int iter_head, npages;
	size_t room;
	ssize_t got;

	if (!maxsize)
		return 0;

	if (!sanity(i))
		return -EFAULT;

	data_start(i, &iter_head, start);
	/* Amount of free space: some of this one + all after this one */
	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
	room = npages * PAGE_SIZE - *start;
	if (maxsize > room)
		maxsize = room;
	else
		npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);

	array = get_pages_array(npages);
	if (!array)
		return -ENOMEM;

	got = __pipe_get_pages(i, maxsize, array, iter_head, start);
	if (got <= 0) {
		kvfree(array);
		return got;
	}
	*pages = array;
	return got;
}
|
|
|
|
|
2014-04-05 10:12:29 +07:00
|
|
|
/*
 * Like iov_iter_get_pages(), but the page array is allocated here with
 * get_pages_array() and returned through *@pages.
 *
 * Returns the number of bytes covered, 0 if nothing was iterated, or a
 * negative error: -ENOMEM if the array cannot be allocated, -EFAULT for
 * discard and kvec iterators, or whatever get_user_pages_fast() reports.
 * In the user-memory step the array is freed again when pinning fails.
 */
ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
		   struct page ***pages, size_t maxsize,
		   size_t *start)
{
	struct page **p;

	if (maxsize > i->count)
		maxsize = i->count;

	if (unlikely(iov_iter_is_pipe(i)))
		return pipe_get_pages_alloc(i, pages, maxsize, start);
	if (unlikely(iov_iter_is_discard(i)))
		return -EFAULT;

	iterate_all_kinds(i, maxsize, v, ({
		unsigned long uaddr = (unsigned long)v.iov_base;
		size_t span;
		int want;
		int got;

		*start = uaddr & (PAGE_SIZE - 1);
		span = v.iov_len + *start;
		uaddr &= ~(PAGE_SIZE - 1);
		want = DIV_ROUND_UP(span, PAGE_SIZE);
		p = get_pages_array(want);
		if (!p)
			return -ENOMEM;
		got = get_user_pages_fast(uaddr, want,
				iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p);
		if (unlikely(got < 0)) {
			kvfree(p);
			return got;
		}
		*pages = p;
		/* a short pin covers only whole pages */
		if (got == want)
			return span - *start;
		return got * PAGE_SIZE - *start;
	0;}),({
		/* can't be more than PAGE_SIZE */
		*start = v.bv_offset;
		p = get_pages_array(1);
		*pages = p;
		if (!p)
			return -ENOMEM;
		*p = v.bv_page;
		get_page(v.bv_page);
		return v.bv_len;
	}),({
		return -EFAULT;
	})
	)
	return 0;
}
EXPORT_SYMBOL(iov_iter_get_pages_alloc);
|
|
|
|
|
2014-11-24 13:08:00 +07:00
|
|
|
/*
 * Copy up to @bytes bytes from iterator @i to @addr while folding them into
 * the checksum at *@csum, advancing @i past what was consumed.
 *
 * Pipe and discard iterators are rejected with a warning and 0 is returned,
 * leaving *@csum untouched.  Otherwise *@csum is updated and @bytes, as
 * left by iterate_and_advance(), is returned.
 */
size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
			       struct iov_iter *i)
{
	char *dst = addr;
	__wsum sum = *csum;
	__wsum next;
	size_t pos = 0;

	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, v, ({
		next = csum_and_copy_from_user(v.iov_base, dst, v.iov_len);
		dst += v.iov_len;
		if (next) {
			sum = csum_block_add(sum, next, pos);
			pos += v.iov_len;
		}
		/* a zero result from the helper means nothing was copied */
		next ? 0 : v.iov_len;
	}), ({
		char *kaddr = kmap_atomic(v.bv_page);

		sum = csum_and_memcpy(dst, kaddr + v.bv_offset, v.bv_len,
				      sum, pos);
		kunmap_atomic(kaddr);
		dst += v.bv_len;
		pos += v.bv_len;
	}),({
		sum = csum_and_memcpy(dst, v.iov_base, v.iov_len, sum, pos);
		dst += v.iov_len;
		pos += v.iov_len;
	})
	)
	*csum = sum;
	return bytes;
}
EXPORT_SYMBOL(csum_and_copy_from_iter);
|
|
|
|
|
2016-11-02 09:09:04 +07:00
|
|
|
/*
 * All-or-nothing variant of csum_and_copy_from_iter(): copy exactly @bytes
 * bytes from @i to @addr and fold them into *@csum.
 *
 * Returns false, without advancing @i or updating *@csum, if @i is a pipe
 * or discard iterator (with a warning), if it holds fewer than @bytes
 * bytes, or if a user-memory segment fails to copy.  On success *@csum is
 * updated, @i is advanced by @bytes and true is returned.
 */
bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum,
			       struct iov_iter *i)
{
	char *dst = addr;
	__wsum sum = *csum;
	__wsum next;
	size_t pos = 0;

	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
		WARN_ON(1);
		return false;
	}
	if (unlikely(i->count < bytes))
		return false;

	iterate_all_kinds(i, bytes, v, ({
		next = csum_and_copy_from_user(v.iov_base, dst, v.iov_len);
		dst += v.iov_len;
		if (!next)
			return false;
		sum = csum_block_add(sum, next, pos);
		pos += v.iov_len;
		0;
	}), ({
		char *kaddr = kmap_atomic(v.bv_page);

		sum = csum_and_memcpy(dst, kaddr + v.bv_offset, v.bv_len,
				      sum, pos);
		kunmap_atomic(kaddr);
		dst += v.bv_len;
		pos += v.bv_len;
	}),({
		sum = csum_and_memcpy(dst, v.iov_base, v.iov_len, sum, pos);
		dst += v.iov_len;
		pos += v.iov_len;
	})
	)
	*csum = sum;
	iov_iter_advance(i, bytes);
	return true;
}
EXPORT_SYMBOL(csum_and_copy_from_iter_full);
|
|
|
|
|
2018-12-04 08:52:07 +07:00
|
|
|
/*
 * Copy up to @bytes bytes from @addr into iterator @i while folding them
 * into the __wsum that @csump points to, advancing @i past what was
 * written.
 *
 * Pipe iterators are delegated to csum_and_copy_to_pipe_iter().  Discard
 * iterators are rejected with a warning and 0 is returned.  Otherwise the
 * checksum is updated and @bytes, as left by iterate_and_advance(), is
 * returned.
 */
size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump,
			     struct iov_iter *i)
{
	const char *src = addr;
	__wsum *csum = csump;
	__wsum sum, next;
	size_t pos = 0;

	if (unlikely(iov_iter_is_pipe(i)))
		return csum_and_copy_to_pipe_iter(addr, bytes, csum, i);

	sum = *csum;
	if (unlikely(iov_iter_is_discard(i))) {
		WARN_ON(1);	/* for now */
		return 0;
	}
	iterate_and_advance(i, bytes, v, ({
		next = csum_and_copy_to_user(src, v.iov_base, v.iov_len);
		src += v.iov_len;
		if (next) {
			sum = csum_block_add(sum, next, pos);
			pos += v.iov_len;
		}
		/* a zero result from the helper means nothing was copied */
		next ? 0 : v.iov_len;
	}), ({
		char *kaddr = kmap_atomic(v.bv_page);

		sum = csum_and_memcpy(kaddr + v.bv_offset, src, v.bv_len,
				      sum, pos);
		kunmap_atomic(kaddr);
		src += v.bv_len;
		pos += v.bv_len;
	}),({
		sum = csum_and_memcpy(v.iov_base, src, v.iov_len, sum, pos);
		src += v.iov_len;
		pos += v.iov_len;
	})
	)
	*csum = sum;
	return bytes;
}
EXPORT_SYMBOL(csum_and_copy_to_iter);
|
|
|
|
|
2018-12-04 08:52:09 +07:00
|
|
|
/*
 * Copy @bytes bytes from @addr into iterator @i and feed the bytes that
 * were actually copied to the ahash request passed in @hashp.
 *
 * Returns the number of bytes copied.  When CONFIG_CRYPTO_HASH is not set
 * nothing is copied and 0 is returned.  The result of
 * crypto_ahash_update() is not checked.
 */
size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
		struct iov_iter *i)
{
#ifdef CONFIG_CRYPTO_HASH
	struct ahash_request *req = hashp;
	struct scatterlist sg;
	size_t done = copy_to_iter(addr, bytes, i);

	/* hash straight from the source buffer, limited to what was copied */
	sg_init_one(&sg, addr, done);
	ahash_request_set_crypt(req, &sg, NULL, done);
	crypto_ahash_update(req);
	return done;
#else
	return 0;
#endif
}
EXPORT_SYMBOL(hash_and_copy_to_iter);
|
|
|
|
|
2014-04-05 10:12:29 +07:00
|
|
|
int iov_iter_npages(const struct iov_iter *i, int maxpages)
|
|
|
|
{
|
2014-11-28 02:09:46 +07:00
|
|
|
size_t size = i->count;
|
|
|
|
int npages = 0;
|
|
|
|
|
|
|
|
if (!size)
|
|
|
|
return 0;
|
2018-10-20 06:57:56 +07:00
|
|
|
if (unlikely(iov_iter_is_discard(i)))
|
|
|
|
return 0;
|
2014-11-28 02:09:46 +07:00
|
|
|
|
2018-10-22 19:07:28 +07:00
|
|
|
if (unlikely(iov_iter_is_pipe(i))) {
|
2016-09-23 03:33:12 +07:00
|
|
|
struct pipe_inode_info *pipe = i->pipe;
|
2019-11-15 20:30:32 +07:00
|
|
|
unsigned int iter_head;
|
2016-09-23 03:33:12 +07:00
|
|
|
size_t off;
|
|
|
|
|
|
|
|
if (!sanity(i))
|
|
|
|
return 0;
|
|
|
|
|
2019-11-15 20:30:32 +07:00
|
|
|
data_start(i, &iter_head, &off);
|
2016-09-23 03:33:12 +07:00
|
|
|
/* some of this one + all after this one */
|
2019-11-15 20:30:32 +07:00
|
|
|
npages = pipe_space_for_user(iter_head, pipe->tail, pipe);
|
2016-09-23 03:33:12 +07:00
|
|
|
if (npages >= maxpages)
|
|
|
|
return maxpages;
|
|
|
|
} else iterate_all_kinds(i, size, v, ({
|
2014-11-28 02:09:46 +07:00
|
|
|
unsigned long p = (unsigned long)v.iov_base;
|
|
|
|
npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
|
|
|
|
- p / PAGE_SIZE;
|
|
|
|
if (npages >= maxpages)
|
|
|
|
return maxpages;
|
|
|
|
0;}),({
|
|
|
|
npages++;
|
|
|
|
if (npages >= maxpages)
|
|
|
|
return maxpages;
|
2014-11-28 02:48:42 +07:00
|
|
|
}),({
|
|
|
|
unsigned long p = (unsigned long)v.iov_base;
|
|
|
|
npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
|
|
|
|
- p / PAGE_SIZE;
|
|
|
|
if (npages >= maxpages)
|
|
|
|
return maxpages;
|
2014-11-28 02:09:46 +07:00
|
|
|
})
|
|
|
|
)
|
|
|
|
return npages;
|
2014-04-05 10:12:29 +07:00
|
|
|
}
|
2014-03-19 12:16:16 +07:00
|
|
|
EXPORT_SYMBOL(iov_iter_npages);
|
2015-02-01 08:08:47 +07:00
|
|
|
|
|
|
|
/*
 * Copy iterator @old into @new and give @new its own kmemdup()'ed copy of
 * the segment array, allocated with @flags.
 *
 * Returns the duplicated array, which is also stored in @new, or NULL if
 * the allocation fails.  Pipe iterators (with a warning) and discard
 * iterators also return NULL; in those cases @new is a plain structure
 * copy of @old.
 */
const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
{
	*new = *old;

	if (unlikely(iov_iter_is_pipe(new))) {
		WARN_ON(1);
		return NULL;
	}
	if (unlikely(iov_iter_is_discard(new)))
		return NULL;

	if (iov_iter_is_bvec(new)) {
		new->bvec = kmemdup(new->bvec,
				    new->nr_segs * sizeof(struct bio_vec),
				    flags);
		return new->bvec;
	}

	/* iovec and kvec have identical layout */
	new->iov = kmemdup(new->iov,
			   new->nr_segs * sizeof(struct iovec),
			   flags);
	return new->iov;
}
EXPORT_SYMBOL(dup_iter);
|
saner iov_iter initialization primitives
iovec-backed iov_iter instances are assumed to satisfy several properties:
* no more than UIO_MAXIOV elements in iovec array
* total size of all ranges is no more than MAX_RW_COUNT
* all ranges pass access_ok().
The problem is, invariants of data structures should be established in the
primitives creating those data structures, not in the code using those
primitives. And iov_iter_init() violates that principle. For a while we
managed to get away with that, but once the use of iov_iter started to
spread, it didn't take long for shit to hit the fan - missed check in
sys_sendto() had introduced a roothole.
We _do_ have primitives for importing and validating iovecs (both native and
compat ones) and those primitives are almost always followed by shoving the
resulting iovec into iov_iter. Life would be considerably simpler (and safer)
if we combined those primitives with initializing iov_iter.
That gives us two new primitives - import_iovec() and compat_import_iovec().
Calling conventions:
iovec = iov_array;
err = import_iovec(direction, uvec, nr_segs,
ARRAY_SIZE(iov_array), &iovec,
&iter);
imports user vector into kernel space (into iov_array if it fits, allocated
if it doesn't fit or if iovec was NULL), validates it and sets iter up to
refer to it. On success 0 is returned and allocated kernel copy (or NULL
if the array had fit into caller-supplied one) is returned via iovec.
On failure all allocations are undone and -E... is returned. If the total
size of ranges exceeds MAX_RW_COUNT, the excess is silently truncated.
compat_import_iovec() expects uvec to be a pointer to user array of compat_iovec;
otherwise it's identical to import_iovec().
Finally, import_single_range() sets iov_iter backed by single-element iovec
covering a user-supplied range -
err = import_single_range(direction, address, size, iovec, &iter);
does validation and sets iter up. Again, size in excess of MAX_RW_COUNT gets
silently truncated.
Next commits will be switching the things up to use of those and reducing
the amount of iov_iter_init() instances.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-03-22 04:45:43 +07:00
|
|
|
|
2020-09-25 11:51:40 +07:00
|
|
|
static int copy_compat_iovec_from_user(struct iovec *iov,
|
|
|
|
const struct iovec __user *uvec, unsigned long nr_segs)
|
|
|
|
{
|
|
|
|
const struct compat_iovec __user *uiov =
|
|
|
|
(const struct compat_iovec __user *)uvec;
|
|
|
|
int ret = -EFAULT, i;
|
|
|
|
|
|
|
|
if (!user_access_begin(uvec, nr_segs * sizeof(*uvec)))
|
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
for (i = 0; i < nr_segs; i++) {
|
|
|
|
compat_uptr_t buf;
|
|
|
|
compat_ssize_t len;
|
|
|
|
|
|
|
|
unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
|
|
|
|
unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);
|
|
|
|
|
|
|
|
/* check for compat_size_t not fitting in compat_ssize_t .. */
|
|
|
|
if (len < 0) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto uaccess_end;
|
|
|
|
}
|
|
|
|
iov[i].iov_base = compat_ptr(buf);
|
|
|
|
iov[i].iov_len = len;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = 0;
|
|
|
|
uaccess_end:
|
|
|
|
user_access_end();
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int copy_iovec_from_user(struct iovec *iov,
|
|
|
|
const struct iovec __user *uvec, unsigned long nr_segs)
|
2020-09-25 11:51:39 +07:00
|
|
|
{
|
|
|
|
unsigned long seg;
|
|
|
|
|
2020-09-25 11:51:40 +07:00
|
|
|
if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec)))
|
|
|
|
return -EFAULT;
|
|
|
|
for (seg = 0; seg < nr_segs; seg++) {
|
|
|
|
if ((ssize_t)iov[seg].iov_len < 0)
|
|
|
|
return -EINVAL;
|
2020-09-25 11:51:39 +07:00
|
|
|
}
|
|
|
|
|
2020-09-25 11:51:40 +07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Bring a user iovec array of @nr_segs entries into kernel memory.
 *
 * The caller-supplied @fast_iov (holding @fast_segs entries) is used when
 * the array fits; otherwise a new array is obtained from kmalloc_array().
 * @compat selects the compat or the native copy helper.
 *
 * Returns the array that was filled (either @fast_iov or the new
 * allocation), @fast_iov untouched when @nr_segs is zero, or an ERR_PTR():
 * -EINVAL for more than UIO_MAXIOV segments, -ENOMEM on allocation failure,
 * or whatever error the copy helper returned.  A new allocation is freed
 * before an error is returned.
 */
struct iovec *iovec_from_user(const struct iovec __user *uvec,
		unsigned long nr_segs, unsigned long fast_segs,
		struct iovec *fast_iov, bool compat)
{
	struct iovec *vec = fast_iov;
	int err;

	/*
	 * SuS says "The readv() function *may* fail if the iovcnt argument was
	 * less than or equal to 0, or greater than {IOV_MAX}."  Linux has
	 * traditionally returned zero for zero segments, so...
	 */
	if (!nr_segs)
		return vec;
	if (nr_segs > UIO_MAXIOV)
		return ERR_PTR(-EINVAL);

	/* Too large for the caller's array: allocate one. */
	if (nr_segs > fast_segs) {
		vec = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
		if (!vec)
			return ERR_PTR(-ENOMEM);
	}

	err = compat ? copy_compat_iovec_from_user(vec, uvec, nr_segs)
		     : copy_iovec_from_user(vec, uvec, nr_segs);
	if (!err)
		return vec;

	/* Only free what was allocated here, never the caller's array. */
	if (vec != fast_iov)
		kfree(vec);
	return ERR_PTR(err);
}
|
|
|
|
|
|
|
|
/*
 * Import a user iovec array, validate every segment and set up @i over it.
 *
 * On entry *@iovp is the caller-supplied array of @fast_segs entries.  On
 * return *@iovp is NULL if that array was used or if an error occurred, and
 * points to the newly allocated array otherwise.
 *
 * Returns the total number of bytes covered by the iterator, or a negative
 * error code: the error from iovec_from_user(), or -EFAULT if a segment
 * fails access_ok().
 */
ssize_t __import_iovec(int type, const struct iovec __user *uvec,
		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
		 struct iov_iter *i, bool compat)
{
	struct iovec *vec;
	ssize_t total = 0;
	unsigned long n;

	vec = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
	if (IS_ERR(vec)) {
		*iovp = NULL;
		return PTR_ERR(vec);
	}

	/*
	 * According to the Single Unix Specification we should return EINVAL if
	 * an element length is < 0 when cast to ssize_t or if the total length
	 * would overflow the ssize_t return value of the system call.
	 *
	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
	 * overflow case.
	 */
	for (n = 0; n < nr_segs; n++) {
		struct iovec *cur = &vec[n];
		ssize_t len = (ssize_t)cur->iov_len;
		ssize_t room = MAX_RW_COUNT - total;

		if (!access_ok(cur->iov_base, len))
			goto efault;

		/* Trim the segment that crosses the MAX_RW_COUNT budget. */
		if (len > room) {
			len = room;
			cur->iov_len = len;
		}
		total += len;
	}

	iov_iter_init(i, type, vec, nr_segs, total);
	/* Report NULL when the caller's own array was used. */
	*iovp = (vec == *iovp) ? NULL : vec;
	return total;

efault:
	if (vec != *iovp)
		kfree(vec);
	*iovp = NULL;
	return -EFAULT;
}
|
|
|
|
|
2016-10-08 16:18:07 +07:00
|
|
|
/**
|
|
|
|
* import_iovec() - Copy an array of &struct iovec from userspace
|
|
|
|
* into the kernel, check that it is valid, and initialize a new
|
|
|
|
* &struct iov_iter iterator to access it.
|
|
|
|
*
|
|
|
|
* @type: One of %READ or %WRITE.
|
2020-09-25 11:51:40 +07:00
|
|
|
* @uvec: Pointer to the userspace array.
|
2016-10-08 16:18:07 +07:00
|
|
|
* @nr_segs: Number of elements in userspace array.
|
|
|
|
* @fast_segs: Number of elements in @iov.
|
2020-09-25 11:51:40 +07:00
|
|
|
* @iovp: (input and output parameter) Pointer to pointer to (usually small
|
2016-10-08 16:18:07 +07:00
|
|
|
* on-stack) kernel array.
|
|
|
|
* @i: Pointer to iterator that will be initialized on success.
|
|
|
|
*
|
|
|
|
* If the array pointed to by *@iov is large enough to hold all @nr_segs,
|
|
|
|
* then this function places %NULL in *@iov on return. Otherwise, a new
|
|
|
|
* array will be allocated and the result placed in *@iov. This means that
|
|
|
|
* the caller may call kfree() on *@iov regardless of whether the small
|
|
|
|
* on-stack array was used or not (and regardless of whether this function
|
|
|
|
* returns an error or not).
|
|
|
|
*
|
2019-05-15 05:02:22 +07:00
|
|
|
* Return: Negative error code on error, bytes imported on success
|
2016-10-08 16:18:07 +07:00
|
|
|
*/
|
2020-09-25 11:51:40 +07:00
|
|
|
ssize_t import_iovec(int type, const struct iovec __user *uvec,
|
saner iov_iter initialization primitives
iovec-backed iov_iter instances are assumed to satisfy several properties:
* no more than UIO_MAXIOV elements in iovec array
* total size of all ranges is no more than MAX_RW_COUNT
* all ranges pass access_ok().
The problem is, invariants of data structures should be established in the
primitives creating those data structures, not in the code using those
primitives. And iov_iter_init() violates that principle. For a while we
managed to get away with that, but once the use of iov_iter started to
spread, it didn't take long for shit to hit the fan - missed check in
sys_sendto() had introduced a roothole.
We _do_ have primitives for importing and validating iovecs (both native and
compat ones) and those primitives are almost always followed by shoving the
resulting iovec into iov_iter. Life would be considerably simpler (and safer)
if we combined those primitives with initializing iov_iter.
That gives us two new primitives - import_iovec() and compat_import_iovec().
Calling conventions:
iovec = iov_array;
err = import_iovec(direction, uvec, nr_segs,
ARRAY_SIZE(iov_array), &iovec,
&iter);
imports user vector into kernel space (into iov_array if it fits, allocated
if it doesn't fit or if iovec was NULL), validates it and sets iter up to
refer to it. On success 0 is returned and allocated kernel copy (or NULL
if the array had fit into caller-supplied one) is returned via iovec.
On failure all allocations are undone and -E... is returned. If the total
size of ranges exceeds MAX_RW_COUNT, the excess is silently truncated.
compat_import_iovec() expects uvec to be a pointer to user array of compat_iovec;
otherwise it's identical to import_iovec().
Finally, import_single_range() sets iov_iter backed by single-element iovec
covering a user-supplied range -
err = import_single_range(direction, address, size, iovec, &iter);
does validation and sets iter up. Again, size in excess of MAX_RW_COUNT gets
silently truncated.
Next commits will be switching the things up to use of those and reducing
the amount of iov_iter_init() instances.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-03-22 04:45:43 +07:00
|
|
|
unsigned nr_segs, unsigned fast_segs,
|
2020-09-25 11:51:40 +07:00
|
|
|
struct iovec **iovp, struct iov_iter *i)
|
saner iov_iter initialization primitives
iovec-backed iov_iter instances are assumed to satisfy several properties:
* no more than UIO_MAXIOV elements in iovec array
* total size of all ranges is no more than MAX_RW_COUNT
* all ranges pass access_ok().
The problem is, invariants of data structures should be established in the
primitives creating those data structures, not in the code using those
primitives. And iov_iter_init() violates that principle. For a while we
managed to get away with that, but once the use of iov_iter started to
spread, it didn't take long for shit to hit the fan - missed check in
sys_sendto() had introduced a roothole.
We _do_ have primitives for importing and validating iovecs (both native and
compat ones) and those primitives are almost always followed by shoving the
resulting iovec into iov_iter. Life would be considerably simpler (and safer)
if we combined those primitives with initializing iov_iter.
That gives us two new primitives - import_iovec() and compat_import_iovec().
Calling conventions:
iovec = iov_array;
err = import_iovec(direction, uvec, nr_segs,
ARRAY_SIZE(iov_array), &iovec,
&iter);
imports user vector into kernel space (into iov_array if it fits, allocated
if it doesn't fit or if iovec was NULL), validates it and sets iter up to
refer to it. On success 0 is returned and allocated kernel copy (or NULL
if the array had fit into caller-supplied one) is returned via iovec.
On failure all allocations are undone and -E... is returned. If the total
size of ranges exceeds MAX_RW_COUNT, the excess is silently truncated.
compat_import_iovec() expects uvec to be a pointer to user array of compat_iovec;
otherwise it's identical to import_iovec().
Finally, import_single_range() sets iov_iter backed by single-element iovec
covering a user-supplied range -
err = import_single_range(direction, address, size, iovec, &iter);
does validation and sets iter up. Again, size in excess of MAX_RW_COUNT gets
silently truncated.
Next commits will be switching the things up to use of those and reducing
the amount of iov_iter_init() instances.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-03-22 04:45:43 +07:00
|
|
|
{
|
2020-09-25 11:51:41 +07:00
|
|
|
return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
|
|
|
|
in_compat_syscall());
|
saner iov_iter initialization primitives
iovec-backed iov_iter instances are assumed to satisfy several properties:
* no more than UIO_MAXIOV elements in iovec array
* total size of all ranges is no more than MAX_RW_COUNT
* all ranges pass access_ok().
The problem is, invariants of data structures should be established in the
primitives creating those data structures, not in the code using those
primitives. And iov_iter_init() violates that principle. For a while we
managed to get away with that, but once the use of iov_iter started to
spread, it didn't take long for shit to hit the fan - missed check in
sys_sendto() had introduced a roothole.
We _do_ have primitives for importing and validating iovecs (both native and
compat ones) and those primitives are almost always followed by shoving the
resulting iovec into iov_iter. Life would be considerably simpler (and safer)
if we combined those primitives with initializing iov_iter.
That gives us two new primitives - import_iovec() and compat_import_iovec().
Calling conventions:
iovec = iov_array;
err = import_iovec(direction, uvec, nr_segs,
ARRAY_SIZE(iov_array), &iovec,
&iter);
imports user vector into kernel space (into iov_array if it fits, allocated
if it doesn't fit or if iovec was NULL), validates it and sets iter up to
refer to it. On success 0 is returned and allocated kernel copy (or NULL
if the array had fit into caller-supplied one) is returned via iovec.
On failure all allocations are undone and -E... is returned. If the total
size of ranges exceeds MAX_RW_COUNT, the excess is silently truncated.
compat_import_iovec() expects uvec to be a pointer to user array of compat_iovec;
otherwise it's identical to import_iovec().
Finally, import_single_range() sets iov_iter backed by single-element iovec
covering a user-supplied range -
err = import_single_range(direction, address, size, iovec, &iter);
does validation and sets iter up. Again, size in excess of MAX_RW_COUNT gets
silently truncated.
Next commits will be switching the things up to use of those and reducing
the amount of iov_iter_init() instances.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-03-22 04:45:43 +07:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(import_iovec);
|
|
|
|
|
|
|
|
/*
 * Set up @i over a single user range described by @buf and @len, using
 * @iov as the one-element backing iovec.
 *
 * A length above MAX_RW_COUNT is silently clamped to MAX_RW_COUNT.
 * Returns 0 on success or -EFAULT if the (clamped) range fails access_ok();
 * in the failure case neither @iov nor @i is written.
 */
int import_single_range(int rw, void __user *buf, size_t len,
		 struct iovec *iov, struct iov_iter *i)
{
	size_t count = len > MAX_RW_COUNT ? MAX_RW_COUNT : len;

	if (unlikely(!access_ok(buf, count)))
		return -EFAULT;

	*iov = (struct iovec){ .iov_base = buf, .iov_len = count };
	iov_iter_init(i, rw, iov, 1, count);
	return 0;
}
EXPORT_SYMBOL(import_single_range);
|
2017-02-18 13:44:03 +07:00
|
|
|
|
|
|
|
/*
 * Invoke @f on successive ranges of @i, up to @bytes in total, presenting
 * each range to the callback as a struct kvec together with @context.
 *
 * Returns 0 immediately when @bytes is zero; otherwise returns the value
 * left in the local result by the last step executed (it starts out as
 * -EINVAL).
 *
 * NOTE(review): how a non-zero callback result affects iteration is decided
 * by iterate_all_kinds(), which is not visible here - confirm there.
 */
int iov_iter_for_each_range(struct iov_iter *i, size_t bytes,
			    int (*f)(struct kvec *vec, void *context),
			    void *context)
{
	struct kvec range;
	int ret = -EINVAL;

	if (bytes == 0)
		return 0;

	/*
	 * First step is the constant -EINVAL (presumably the user-iovec case,
	 * i.e. unsupported - TODO confirm against iterate_all_kinds()).
	 * Second step handles page-based segments: the page is mapped with
	 * kmap() only for the duration of the callback.
	 * Third step handles segments that are already struct kvec.
	 */
	iterate_all_kinds(i, bytes, v, -EINVAL, ({
		range.iov_base = kmap(v.bv_page) + v.bv_offset;
		range.iov_len = v.bv_len;
		ret = f(&range, context);
		kunmap(v.bv_page);
		ret;}), ({
		range = v;
		ret = f(&range, context);})
	)
	return ret;
}
EXPORT_SYMBOL(iov_iter_for_each_range);
|