mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2025-02-26 06:51:09 +07:00
Merge branch 'tcp-mmap-rework-zerocopy-receive'
Eric Dumazet says: ==================== tcp: mmap: rework zerocopy receive syzbot reported a lockdep issue caused by tcp mmap() support. I implemented Andy Lutomirski nice suggestions to resolve the issue and increase scalability as well. First patch is adding a new getsockopt() operation and changes mmap() behavior. Second patch changes tcp_mmap reference program. v4: tcp mmap() support depends on CONFIG_MMU, as kbuild bot told us. v3: change TCP_ZEROCOPY_RECEIVE to be a getsockopt() option instead of setsockopt(), feedback from Ka-Cheon Poon v2: Added a missing page align of zc->length in tcp_zerocopy_receive() Properly clear zc->recv_skip_hint in case user request was completed. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
commit
5d659b1d7d
@ -122,6 +122,7 @@ enum {
|
||||
#define TCP_MD5SIG_EXT 32 /* TCP MD5 Signature with extensions */
|
||||
#define TCP_FASTOPEN_KEY 33 /* Set the key for Fast Open (cookie) */
|
||||
#define TCP_FASTOPEN_NO_COOKIE 34 /* Enable TFO without a TFO cookie */
|
||||
#define TCP_ZEROCOPY_RECEIVE 35
|
||||
|
||||
struct tcp_repair_opt {
|
||||
__u32 opt_code;
|
||||
@ -276,4 +277,11 @@ struct tcp_diag_md5sig {
|
||||
__u8 tcpm_key[TCP_MD5SIG_MAXKEYLEN];
|
||||
};
|
||||
|
||||
/* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */
|
||||
|
||||
struct tcp_zerocopy_receive {
|
||||
__u64 address; /* in: address of mapping */
|
||||
__u32 length; /* in/out: number of bytes to map/mapped */
|
||||
__u32 recv_skip_hint; /* out: amount of bytes to skip */
|
||||
};
|
||||
#endif /* _UAPI_LINUX_TCP_H */
|
||||
|
@ -994,7 +994,9 @@ const struct proto_ops inet_stream_ops = {
|
||||
.getsockopt = sock_common_getsockopt,
|
||||
.sendmsg = inet_sendmsg,
|
||||
.recvmsg = inet_recvmsg,
|
||||
#ifdef CONFIG_MMU
|
||||
.mmap = tcp_mmap,
|
||||
#endif
|
||||
.sendpage = inet_sendpage,
|
||||
.splice_read = tcp_splice_read,
|
||||
.read_sock = tcp_read_sock,
|
||||
|
196
net/ipv4/tcp.c
196
net/ipv4/tcp.c
@ -1726,118 +1726,113 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
|
||||
}
|
||||
EXPORT_SYMBOL(tcp_set_rcvlowat);
|
||||
|
||||
/* When user wants to mmap X pages, we first need to perform the mapping
|
||||
* before freeing any skbs in receive queue, otherwise user would be unable
|
||||
* to fallback to standard recvmsg(). This happens if some data in the
|
||||
* requested block is not exactly fitting in a page.
|
||||
*
|
||||
* We only support order-0 pages for the moment.
|
||||
* mmap() on TCP is very strict, there is no point
|
||||
* trying to accommodate with pathological layouts.
|
||||
*/
|
||||
#ifdef CONFIG_MMU
|
||||
static const struct vm_operations_struct tcp_vm_ops = {
|
||||
};
|
||||
|
||||
int tcp_mmap(struct file *file, struct socket *sock,
|
||||
struct vm_area_struct *vma)
|
||||
{
|
||||
unsigned long size = vma->vm_end - vma->vm_start;
|
||||
unsigned int nr_pages = size >> PAGE_SHIFT;
|
||||
struct page **pages_array = NULL;
|
||||
u32 seq, len, offset, nr = 0;
|
||||
struct sock *sk = sock->sk;
|
||||
const skb_frag_t *frags;
|
||||
if (vma->vm_flags & (VM_WRITE | VM_EXEC))
|
||||
return -EPERM;
|
||||
vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
|
||||
|
||||
/* Instruct vm_insert_page() to not down_read(mmap_sem) */
|
||||
vma->vm_flags |= VM_MIXEDMAP;
|
||||
|
||||
vma->vm_ops = &tcp_vm_ops;
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(tcp_mmap);
|
||||
|
||||
static int tcp_zerocopy_receive(struct sock *sk,
|
||||
struct tcp_zerocopy_receive *zc)
|
||||
{
|
||||
unsigned long address = (unsigned long)zc->address;
|
||||
const skb_frag_t *frags = NULL;
|
||||
u32 length = 0, seq, offset;
|
||||
struct vm_area_struct *vma;
|
||||
struct sk_buff *skb = NULL;
|
||||
struct tcp_sock *tp;
|
||||
struct sk_buff *skb;
|
||||
int ret;
|
||||
|
||||
if (vma->vm_pgoff || !nr_pages)
|
||||
if (address & (PAGE_SIZE - 1) || address != zc->address)
|
||||
return -EINVAL;
|
||||
|
||||
if (vma->vm_flags & VM_WRITE)
|
||||
return -EPERM;
|
||||
/* TODO: Maybe the following is not needed if pages are COW */
|
||||
vma->vm_flags &= ~VM_MAYWRITE;
|
||||
|
||||
lock_sock(sk);
|
||||
|
||||
ret = -ENOTCONN;
|
||||
if (sk->sk_state == TCP_LISTEN)
|
||||
goto out;
|
||||
return -ENOTCONN;
|
||||
|
||||
sock_rps_record_flow(sk);
|
||||
|
||||
if (tcp_inq(sk) < size) {
|
||||
ret = sock_flag(sk, SOCK_DONE) ? -EIO : -EAGAIN;
|
||||
down_read(¤t->mm->mmap_sem);
|
||||
|
||||
ret = -EINVAL;
|
||||
vma = find_vma(current->mm, address);
|
||||
if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops)
|
||||
goto out;
|
||||
}
|
||||
zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
|
||||
|
||||
tp = tcp_sk(sk);
|
||||
seq = tp->copied_seq;
|
||||
/* Abort if urgent data is in the area */
|
||||
if (unlikely(tp->urg_data)) {
|
||||
u32 urg_offset = tp->urg_seq - seq;
|
||||
zc->length = min_t(u32, zc->length, tcp_inq(sk));
|
||||
zc->length &= ~(PAGE_SIZE - 1);
|
||||
|
||||
ret = -EINVAL;
|
||||
if (urg_offset < size)
|
||||
goto out;
|
||||
}
|
||||
ret = -ENOMEM;
|
||||
pages_array = kvmalloc_array(nr_pages, sizeof(struct page *),
|
||||
GFP_KERNEL);
|
||||
if (!pages_array)
|
||||
goto out;
|
||||
skb = tcp_recv_skb(sk, seq, &offset);
|
||||
ret = -EINVAL;
|
||||
skb_start:
|
||||
/* We do not support anything not in page frags */
|
||||
offset -= skb_headlen(skb);
|
||||
if ((int)offset < 0)
|
||||
goto out;
|
||||
if (skb_has_frag_list(skb))
|
||||
goto out;
|
||||
len = skb->data_len - offset;
|
||||
frags = skb_shinfo(skb)->frags;
|
||||
while (offset) {
|
||||
if (frags->size > offset)
|
||||
goto out;
|
||||
offset -= frags->size;
|
||||
zap_page_range(vma, address, zc->length);
|
||||
|
||||
zc->recv_skip_hint = 0;
|
||||
ret = 0;
|
||||
while (length + PAGE_SIZE <= zc->length) {
|
||||
if (zc->recv_skip_hint < PAGE_SIZE) {
|
||||
if (skb) {
|
||||
skb = skb->next;
|
||||
offset = seq - TCP_SKB_CB(skb)->seq;
|
||||
} else {
|
||||
skb = tcp_recv_skb(sk, seq, &offset);
|
||||
}
|
||||
|
||||
zc->recv_skip_hint = skb->len - offset;
|
||||
offset -= skb_headlen(skb);
|
||||
if ((int)offset < 0 || skb_has_frag_list(skb))
|
||||
break;
|
||||
frags = skb_shinfo(skb)->frags;
|
||||
while (offset) {
|
||||
if (frags->size > offset)
|
||||
goto out;
|
||||
offset -= frags->size;
|
||||
frags++;
|
||||
}
|
||||
}
|
||||
if (frags->size != PAGE_SIZE || frags->page_offset)
|
||||
break;
|
||||
ret = vm_insert_page(vma, address + length,
|
||||
skb_frag_page(frags));
|
||||
if (ret)
|
||||
break;
|
||||
length += PAGE_SIZE;
|
||||
seq += PAGE_SIZE;
|
||||
zc->recv_skip_hint -= PAGE_SIZE;
|
||||
frags++;
|
||||
}
|
||||
while (nr < nr_pages) {
|
||||
if (len) {
|
||||
if (len < PAGE_SIZE)
|
||||
goto out;
|
||||
if (frags->size != PAGE_SIZE || frags->page_offset)
|
||||
goto out;
|
||||
pages_array[nr++] = skb_frag_page(frags);
|
||||
frags++;
|
||||
len -= PAGE_SIZE;
|
||||
seq += PAGE_SIZE;
|
||||
continue;
|
||||
}
|
||||
skb = skb->next;
|
||||
offset = seq - TCP_SKB_CB(skb)->seq;
|
||||
goto skb_start;
|
||||
}
|
||||
/* OK, we have a full set of pages ready to be inserted into vma */
|
||||
for (nr = 0; nr < nr_pages; nr++) {
|
||||
ret = vm_insert_page(vma, vma->vm_start + (nr << PAGE_SHIFT),
|
||||
pages_array[nr]);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
/* operation is complete, we can 'consume' all skbs */
|
||||
tp->copied_seq = seq;
|
||||
tcp_rcv_space_adjust(sk);
|
||||
|
||||
/* Clean up data we have read: This will do ACK frames. */
|
||||
tcp_recv_skb(sk, seq, &offset);
|
||||
tcp_cleanup_rbuf(sk, size);
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
release_sock(sk);
|
||||
kvfree(pages_array);
|
||||
up_read(¤t->mm->mmap_sem);
|
||||
if (length) {
|
||||
tp->copied_seq = seq;
|
||||
tcp_rcv_space_adjust(sk);
|
||||
|
||||
/* Clean up data we have read: This will do ACK frames. */
|
||||
tcp_recv_skb(sk, seq, &offset);
|
||||
tcp_cleanup_rbuf(sk, length);
|
||||
ret = 0;
|
||||
if (length == zc->length)
|
||||
zc->recv_skip_hint = 0;
|
||||
} else {
|
||||
if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
|
||||
ret = -EIO;
|
||||
}
|
||||
zc->length = length;
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(tcp_mmap);
|
||||
#endif
|
||||
|
||||
static void tcp_update_recv_tstamps(struct sk_buff *skb,
|
||||
struct scm_timestamping *tss)
|
||||
@ -3472,6 +3467,25 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
#ifdef CONFIG_MMU
|
||||
case TCP_ZEROCOPY_RECEIVE: {
|
||||
struct tcp_zerocopy_receive zc;
|
||||
int err;
|
||||
|
||||
if (get_user(len, optlen))
|
||||
return -EFAULT;
|
||||
if (len != sizeof(zc))
|
||||
return -EINVAL;
|
||||
if (copy_from_user(&zc, optval, len))
|
||||
return -EFAULT;
|
||||
lock_sock(sk);
|
||||
err = tcp_zerocopy_receive(sk, &zc);
|
||||
release_sock(sk);
|
||||
if (!err && copy_to_user(optval, &zc, len))
|
||||
err = -EFAULT;
|
||||
return err;
|
||||
}
|
||||
#endif
|
||||
default:
|
||||
return -ENOPROTOOPT;
|
||||
}
|
||||
|
@ -578,7 +578,9 @@ const struct proto_ops inet6_stream_ops = {
|
||||
.getsockopt = sock_common_getsockopt, /* ok */
|
||||
.sendmsg = inet_sendmsg, /* ok */
|
||||
.recvmsg = inet_recvmsg, /* ok */
|
||||
#ifdef CONFIG_MMU
|
||||
.mmap = tcp_mmap,
|
||||
#endif
|
||||
.sendpage = inet_sendpage,
|
||||
.sendmsg_locked = tcp_sendmsg_locked,
|
||||
.sendpage_locked = tcp_sendpage_locked,
|
||||
|
@ -76,9 +76,10 @@
|
||||
#include <time.h>
|
||||
#include <sys/time.h>
|
||||
#include <netinet/in.h>
|
||||
#include <netinet/tcp.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <poll.h>
|
||||
#include <linux/tcp.h>
|
||||
#include <assert.h>
|
||||
|
||||
#ifndef MSG_ZEROCOPY
|
||||
#define MSG_ZEROCOPY 0x4000000
|
||||
@ -134,11 +135,12 @@ void hash_zone(void *zone, unsigned int length)
|
||||
void *child_thread(void *arg)
|
||||
{
|
||||
unsigned long total_mmap = 0, total = 0;
|
||||
struct tcp_zerocopy_receive zc;
|
||||
unsigned long delta_usec;
|
||||
int flags = MAP_SHARED;
|
||||
struct timeval t0, t1;
|
||||
char *buffer = NULL;
|
||||
void *oaddr = NULL;
|
||||
void *addr = NULL;
|
||||
double throughput;
|
||||
struct rusage ru;
|
||||
int lu, fd;
|
||||
@ -153,41 +155,46 @@ void *child_thread(void *arg)
|
||||
perror("malloc");
|
||||
goto error;
|
||||
}
|
||||
if (zflg) {
|
||||
addr = mmap(NULL, chunk_size, PROT_READ, flags, fd, 0);
|
||||
if (addr == (void *)-1)
|
||||
zflg = 0;
|
||||
}
|
||||
while (1) {
|
||||
struct pollfd pfd = { .fd = fd, .events = POLLIN, };
|
||||
int sub;
|
||||
|
||||
poll(&pfd, 1, 10000);
|
||||
if (zflg) {
|
||||
void *naddr;
|
||||
socklen_t zc_len = sizeof(zc);
|
||||
int res;
|
||||
|
||||
naddr = mmap(oaddr, chunk_size, PROT_READ, flags, fd, 0);
|
||||
if (naddr == (void *)-1) {
|
||||
if (errno == EAGAIN) {
|
||||
/* That is if SO_RCVLOWAT is buggy */
|
||||
usleep(1000);
|
||||
continue;
|
||||
}
|
||||
if (errno == EINVAL) {
|
||||
flags = MAP_SHARED;
|
||||
oaddr = NULL;
|
||||
goto fallback;
|
||||
}
|
||||
if (errno != EIO)
|
||||
perror("mmap()");
|
||||
zc.address = (__u64)addr;
|
||||
zc.length = chunk_size;
|
||||
zc.recv_skip_hint = 0;
|
||||
res = getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE,
|
||||
&zc, &zc_len);
|
||||
if (res == -1)
|
||||
break;
|
||||
|
||||
if (zc.length) {
|
||||
assert(zc.length <= chunk_size);
|
||||
total_mmap += zc.length;
|
||||
if (xflg)
|
||||
hash_zone(addr, zc.length);
|
||||
total += zc.length;
|
||||
}
|
||||
total_mmap += chunk_size;
|
||||
if (xflg)
|
||||
hash_zone(naddr, chunk_size);
|
||||
total += chunk_size;
|
||||
if (!keepflag) {
|
||||
flags |= MAP_FIXED;
|
||||
oaddr = naddr;
|
||||
if (zc.recv_skip_hint) {
|
||||
assert(zc.recv_skip_hint <= chunk_size);
|
||||
lu = read(fd, buffer, zc.recv_skip_hint);
|
||||
if (lu > 0) {
|
||||
if (xflg)
|
||||
hash_zone(buffer, lu);
|
||||
total += lu;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
fallback:
|
||||
sub = 0;
|
||||
while (sub < chunk_size) {
|
||||
lu = read(fd, buffer + sub, chunk_size - sub);
|
||||
@ -228,6 +235,8 @@ void *child_thread(void *arg)
|
||||
error:
|
||||
free(buffer);
|
||||
close(fd);
|
||||
if (zflg)
|
||||
munmap(addr, chunk_size);
|
||||
pthread_exit(0);
|
||||
}
|
||||
|
||||
@ -371,7 +380,8 @@ int main(int argc, char *argv[])
|
||||
setup_sockaddr(cfg_family, host, &listenaddr);
|
||||
|
||||
if (mss &&
|
||||
setsockopt(fdlisten, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) {
|
||||
setsockopt(fdlisten, IPPROTO_TCP, TCP_MAXSEG,
|
||||
&mss, sizeof(mss)) == -1) {
|
||||
perror("setsockopt TCP_MAXSEG");
|
||||
exit(1);
|
||||
}
|
||||
@ -402,7 +412,7 @@ int main(int argc, char *argv[])
|
||||
setup_sockaddr(cfg_family, host, &addr);
|
||||
|
||||
if (mss &&
|
||||
setsockopt(fd, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) {
|
||||
setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) {
|
||||
perror("setsockopt TCP_MAXSEG");
|
||||
exit(1);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user