mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-26 04:05:18 +07:00
Merge branch 'sockmap_and_ktls'
Daniel Borkmann says: ==================== This work adds a generic sk_msg layer and converts both sockmap and later ktls over to make use of it as a common data structure for application data (similarly to sk_buff for network packets). With that in place the sk_msg framework spans across the ULP layer in the kernel and allows for introspection or filtering of L7 data with the help of BPF programs operating on a common input context. In a second step, we enable the latter for ktls which was previously not possible, meaning, ktls and sk_msg verdict programs were mutually exclusive in the ULP layer which created challenges for the orchestrator when trying to apply TCP based policy, for example. Leveraging the prior consolidation we can finally overcome this limitation. Note, there's no change in behavior when ktls is not used in combination with BPF, and also no change in behavior for standalone sockmap. The kselftest suites for ktls, sockmap and ktls with sockmap combined also run through successfully. For further details please see individual patches. Thanks! v1 -> v2: - Removed leftover comment spotted by Alexei - Improved commit messages, rebase ==================== Signed-off-by: Alexei Starovoitov <ast@kernel.org>
This commit is contained in:
commit
d04fb13c9f
10
MAINTAINERS
10
MAINTAINERS
@ -8188,6 +8188,16 @@ S: Maintained
|
||||
F: net/l3mdev
|
||||
F: include/net/l3mdev.h
|
||||
|
||||
L7 BPF FRAMEWORK
|
||||
M: John Fastabend <john.fastabend@gmail.com>
|
||||
M: Daniel Borkmann <daniel@iogearbox.net>
|
||||
L: netdev@vger.kernel.org
|
||||
S: Maintained
|
||||
F: include/linux/skmsg.h
|
||||
F: net/core/skmsg.c
|
||||
F: net/core/sock_map.c
|
||||
F: net/ipv4/tcp_bpf.c
|
||||
|
||||
LANTIQ / INTEL Ethernet drivers
|
||||
M: Hauke Mehrtens <hauke@hauke-m.de>
|
||||
L: netdev@vger.kernel.org
|
||||
|
@ -737,33 +737,18 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map)
|
||||
}
|
||||
#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
|
||||
|
||||
#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET)
|
||||
struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
|
||||
struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key);
|
||||
int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type);
|
||||
int sockmap_get_from_fd(const union bpf_attr *attr, int type,
|
||||
struct bpf_prog *prog);
|
||||
#if defined(CONFIG_BPF_STREAM_PARSER)
|
||||
int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, u32 which);
|
||||
int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
|
||||
#else
|
||||
static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline struct sock *__sock_hash_lookup_elem(struct bpf_map *map,
|
||||
void *key)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline int sock_map_prog(struct bpf_map *map,
|
||||
struct bpf_prog *prog,
|
||||
u32 type)
|
||||
static inline int sock_map_prog_update(struct bpf_map *map,
|
||||
struct bpf_prog *prog, u32 which)
|
||||
{
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
|
||||
static inline int sockmap_get_from_fd(const union bpf_attr *attr, int type,
|
||||
struct bpf_prog *prog)
|
||||
static inline int sock_map_get_from_fd(const union bpf_attr *attr,
|
||||
struct bpf_prog *prog)
|
||||
{
|
||||
return -EINVAL;
|
||||
}
|
||||
@ -839,6 +824,10 @@ extern const struct bpf_func_proto bpf_get_stack_proto;
|
||||
extern const struct bpf_func_proto bpf_sock_map_update_proto;
|
||||
extern const struct bpf_func_proto bpf_sock_hash_update_proto;
|
||||
extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
|
||||
extern const struct bpf_func_proto bpf_msg_redirect_hash_proto;
|
||||
extern const struct bpf_func_proto bpf_msg_redirect_map_proto;
|
||||
extern const struct bpf_func_proto bpf_sk_redirect_hash_proto;
|
||||
extern const struct bpf_func_proto bpf_sk_redirect_map_proto;
|
||||
|
||||
extern const struct bpf_func_proto bpf_get_local_storage_proto;
|
||||
|
||||
|
@ -57,7 +57,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
|
||||
BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
|
||||
#ifdef CONFIG_NET
|
||||
BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
|
||||
#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET)
|
||||
#if defined(CONFIG_BPF_STREAM_PARSER)
|
||||
BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
|
||||
BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
|
||||
#endif
|
||||
|
@ -520,24 +520,6 @@ struct bpf_skb_data_end {
|
||||
void *data_end;
|
||||
};
|
||||
|
||||
struct sk_msg_buff {
|
||||
void *data;
|
||||
void *data_end;
|
||||
__u32 apply_bytes;
|
||||
__u32 cork_bytes;
|
||||
int sg_copybreak;
|
||||
int sg_start;
|
||||
int sg_curr;
|
||||
int sg_end;
|
||||
struct scatterlist sg_data[MAX_SKB_FRAGS];
|
||||
bool sg_copy[MAX_SKB_FRAGS];
|
||||
__u32 flags;
|
||||
struct sock *sk_redir;
|
||||
struct sock *sk;
|
||||
struct sk_buff *skb;
|
||||
struct list_head list;
|
||||
};
|
||||
|
||||
struct bpf_redirect_info {
|
||||
u32 ifindex;
|
||||
u32 flags;
|
||||
@ -833,9 +815,6 @@ void xdp_do_flush_map(void);
|
||||
|
||||
void bpf_warn_invalid_xdp_action(u32 act);
|
||||
|
||||
struct sock *do_sk_redirect_map(struct sk_buff *skb);
|
||||
struct sock *do_msg_redirect_map(struct sk_msg_buff *md);
|
||||
|
||||
#ifdef CONFIG_INET
|
||||
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
|
||||
struct bpf_prog *prog, struct sk_buff *skb,
|
||||
|
410
include/linux/skmsg.h
Normal file
410
include/linux/skmsg.h
Normal file
@ -0,0 +1,410 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
|
||||
|
||||
#ifndef _LINUX_SKMSG_H
|
||||
#define _LINUX_SKMSG_H
|
||||
|
||||
#include <linux/bpf.h>
|
||||
#include <linux/filter.h>
|
||||
#include <linux/scatterlist.h>
|
||||
#include <linux/skbuff.h>
|
||||
|
||||
#include <net/sock.h>
|
||||
#include <net/tcp.h>
|
||||
#include <net/strparser.h>
|
||||
|
||||
#define MAX_MSG_FRAGS MAX_SKB_FRAGS
|
||||
|
||||
/* Internal action codes. __SK_DROP is pinned to 0 and the remaining
 * values follow in declaration order.
 */
enum __sk_action {
	__SK_DROP	= 0,
	__SK_PASS,
	__SK_REDIRECT,
	__SK_NONE,
};
|
||||
|
||||
struct sk_msg_sg {
|
||||
u32 start;
|
||||
u32 curr;
|
||||
u32 end;
|
||||
u32 size;
|
||||
u32 copybreak;
|
||||
bool copy[MAX_MSG_FRAGS];
|
||||
/* The extra element is used for chaining the front and sections when
|
||||
* the list becomes partitioned (e.g. end < start). The crypto APIs
|
||||
* require the chaining.
|
||||
*/
|
||||
struct scatterlist data[MAX_MSG_FRAGS + 1];
|
||||
};
|
||||
|
||||
struct sk_msg {
|
||||
struct sk_msg_sg sg;
|
||||
void *data;
|
||||
void *data_end;
|
||||
u32 apply_bytes;
|
||||
u32 cork_bytes;
|
||||
u32 flags;
|
||||
struct sk_buff *skb;
|
||||
struct sock *sk_redir;
|
||||
struct sock *sk;
|
||||
struct list_head list;
|
||||
};
|
||||
|
||||
/* BPF programs held by a psock. psock_progs_drop() releases all three
 * through psock_set_prog().
 */
struct sk_psock_progs {
	struct bpf_prog			*msg_parser;
	struct bpf_prog			*skb_parser;
	struct bpf_prog			*skb_verdict;
};
|
||||
|
||||
/* Bit numbers within sk_psock::state, used with the
 * sk_psock_{set,clear,test}_state() helpers.
 */
enum sk_psock_state_bits {
	SK_PSOCK_TX_ENABLED,
};
|
||||
|
||||
struct sk_psock_link {
|
||||
struct list_head list;
|
||||
struct bpf_map *map;
|
||||
void *link_raw;
|
||||
};
|
||||
|
||||
struct sk_psock_parser {
|
||||
struct strparser strp;
|
||||
bool enabled;
|
||||
void (*saved_data_ready)(struct sock *sk);
|
||||
};
|
||||
|
||||
struct sk_psock_work_state {
|
||||
struct sk_buff *skb;
|
||||
u32 len;
|
||||
u32 off;
|
||||
};
|
||||
|
||||
struct sk_psock {
|
||||
struct sock *sk;
|
||||
struct sock *sk_redir;
|
||||
u32 apply_bytes;
|
||||
u32 cork_bytes;
|
||||
u32 eval;
|
||||
struct sk_msg *cork;
|
||||
struct sk_psock_progs progs;
|
||||
struct sk_psock_parser parser;
|
||||
struct sk_buff_head ingress_skb;
|
||||
struct list_head ingress_msg;
|
||||
unsigned long state;
|
||||
struct list_head link;
|
||||
spinlock_t link_lock;
|
||||
refcount_t refcnt;
|
||||
void (*saved_unhash)(struct sock *sk);
|
||||
void (*saved_close)(struct sock *sk, long timeout);
|
||||
void (*saved_write_space)(struct sock *sk);
|
||||
struct proto *sk_proto;
|
||||
struct sk_psock_work_state work_state;
|
||||
struct work_struct work;
|
||||
union {
|
||||
struct rcu_head rcu;
|
||||
struct work_struct gc;
|
||||
};
|
||||
};
|
||||
|
||||
int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
|
||||
int elem_first_coalesce);
|
||||
int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src,
|
||||
u32 off, u32 len);
|
||||
void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len);
|
||||
int sk_msg_free(struct sock *sk, struct sk_msg *msg);
|
||||
int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg);
|
||||
void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes);
|
||||
void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg,
|
||||
u32 bytes);
|
||||
|
||||
void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes);
|
||||
void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes);
|
||||
|
||||
int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
|
||||
struct sk_msg *msg, u32 bytes);
|
||||
int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
|
||||
struct sk_msg *msg, u32 bytes);
|
||||
|
||||
/* Emit a warning when index @i has reached the message's end index while
 * @bytes is still non-zero.
 */
static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes)
{
	WARN_ON(bytes && i == msg->sg.end);
}
|
||||
|
||||
/* Consume @bytes from psock->apply_bytes, saturating at zero. Nothing is
 * done when the counter is already zero.
 */
static inline void sk_msg_apply_bytes(struct sk_psock *psock, u32 bytes)
{
	if (!psock->apply_bytes)
		return;

	if (bytes > psock->apply_bytes)
		psock->apply_bytes = 0;
	else
		psock->apply_bytes -= bytes;
}
|
||||
|
||||
/* Step ring index @var back by one slot, wrapping from 0 to
 * MAX_MSG_FRAGS - 1. @var must be a modifiable lvalue. It is expanded
 * more than once, so it must not have side effects. The argument is
 * parenthesized so that expressions such as *idx are decremented as a
 * whole instead of binding to the postfix operator.
 */
#define sk_msg_iter_var_prev(var)			\
	do {						\
		if ((var) == 0)				\
			(var) = MAX_MSG_FRAGS - 1;	\
		else					\
			(var)--;			\
	} while (0)
|
||||
|
||||
/* Advance ring index @var by one slot, wrapping to 0 once it reaches
 * MAX_MSG_FRAGS. @var must be a modifiable lvalue. It is expanded more
 * than once, so it must not have side effects. The argument is
 * parenthesized so that expressions such as *idx are incremented as a
 * whole instead of binding to the postfix operator.
 */
#define sk_msg_iter_var_next(var)			\
	do {						\
		(var)++;				\
		if ((var) == MAX_MSG_FRAGS)		\
			(var) = 0;			\
	} while (0)
|
||||
|
||||
/* Step msg->sg.<which> (start, curr or end) back by one ring slot. @msg
 * is parenthesized so any pointer expression, e.g. &local_msg, can be
 * passed.
 */
#define sk_msg_iter_prev(msg, which)			\
	sk_msg_iter_var_prev((msg)->sg.which)
|
||||
|
||||
/* Advance msg->sg.<which> (start, curr or end) by one ring slot. @msg is
 * parenthesized so any pointer expression, e.g. &local_msg, can be
 * passed.
 */
#define sk_msg_iter_next(msg, which)			\
	sk_msg_iter_var_next((msg)->sg.which)
|
||||
|
||||
static inline void sk_msg_clear_meta(struct sk_msg *msg)
|
||||
{
|
||||
memset(&msg->sg, 0, offsetofend(struct sk_msg_sg, copy));
|
||||
}
|
||||
|
||||
static inline void sk_msg_init(struct sk_msg *msg)
|
||||
{
|
||||
BUILD_BUG_ON(ARRAY_SIZE(msg->sg.data) - 1 != MAX_MSG_FRAGS);
|
||||
memset(msg, 0, sizeof(*msg));
|
||||
sg_init_marker(msg->sg.data, MAX_MSG_FRAGS);
|
||||
}
|
||||
|
||||
/* Hand the first @size bytes of element @which from @src to @dst: @dst
 * receives a copy of the element with its length set to @size, and @src's
 * element is shrunk and advanced by the same amount.
 */
static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src,
			       int which, u32 size)
{
	struct scatterlist *from = &src->sg.data[which];
	struct scatterlist *to = &dst->sg.data[which];

	*to = *from;
	to->length = size;

	from->length -= size;
	from->offset += size;
}
|
||||
|
||||
static inline void sk_msg_xfer_full(struct sk_msg *dst, struct sk_msg *src)
|
||||
{
|
||||
memcpy(dst, src, sizeof(*src));
|
||||
sk_msg_init(src);
|
||||
}
|
||||
|
||||
static inline u32 sk_msg_elem_used(const struct sk_msg *msg)
|
||||
{
|
||||
return msg->sg.end >= msg->sg.start ?
|
||||
msg->sg.end - msg->sg.start :
|
||||
msg->sg.end + (MAX_MSG_FRAGS - msg->sg.start);
|
||||
}
|
||||
|
||||
static inline bool sk_msg_full(const struct sk_msg *msg)
|
||||
{
|
||||
return (msg->sg.end == msg->sg.start) && msg->sg.size;
|
||||
}
|
||||
|
||||
static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which)
|
||||
{
|
||||
return &msg->sg.data[which];
|
||||
}
|
||||
|
||||
/* Page referenced by scatterlist slot @which of @msg. */
static inline struct page *sk_msg_page(struct sk_msg *msg, int which)
{
	struct scatterlist *sge = sk_msg_elem(msg, which);

	return sg_page(sge);
}
|
||||
|
||||
static inline bool sk_msg_to_ingress(const struct sk_msg *msg)
|
||||
{
|
||||
return msg->flags & BPF_F_INGRESS;
|
||||
}
|
||||
|
||||
static inline void sk_msg_compute_data_pointers(struct sk_msg *msg)
|
||||
{
|
||||
struct scatterlist *sge = sk_msg_elem(msg, msg->sg.start);
|
||||
|
||||
if (msg->sg.copy[msg->sg.start]) {
|
||||
msg->data = NULL;
|
||||
msg->data_end = NULL;
|
||||
} else {
|
||||
msg->data = sg_virt(sge);
|
||||
msg->data_end = msg->data + sge->length;
|
||||
}
|
||||
}
|
||||
|
||||
/* Append @page (@len bytes at @offset) at the current end slot. A page
 * reference is taken, the slot's end marker is cleared, the slot is
 * flagged in sg.copy[], the size grows by @len and sg.end advances.
 */
static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page,
				   u32 len, u32 offset)
{
	struct scatterlist *slot;
	u32 tail = msg->sg.end;

	get_page(page);

	slot = sk_msg_elem(msg, tail);
	sg_set_page(slot, page, len, offset);
	sg_unmark_end(slot);

	msg->sg.copy[tail] = true;
	msg->sg.size += len;
	sk_msg_iter_next(msg, end);
}
|
||||
|
||||
/* Set the copy flag to @copy_state for slot @i and every following slot
 * up to, but not including, sg.end. Slot @i itself is always written,
 * even when @i equals sg.end on entry.
 */
static inline void sk_msg_sg_copy(struct sk_msg *msg, u32 i, bool copy_state)
{
	do {
		msg->sg.copy[i] = copy_state;
		sk_msg_iter_var_next(i);
	} while (i != msg->sg.end);
}
|
||||
|
||||
/* Flag the slots from @start up to sg.end as copy (see sk_msg_sg_copy()). */
static inline void sk_msg_sg_copy_set(struct sk_msg *msg, u32 start)
{
	const bool flagged = true;

	sk_msg_sg_copy(msg, start, flagged);
}
|
||||
|
||||
/* Clear the copy flag on the slots from @start up to sg.end (see
 * sk_msg_sg_copy()).
 */
static inline void sk_msg_sg_copy_clear(struct sk_msg *msg, u32 start)
{
	const bool flagged = false;

	sk_msg_sg_copy(msg, start, flagged);
}
|
||||
|
||||
/* Fetch the psock stored in @sk's user data through an RCU dereference. */
static inline struct sk_psock *sk_psock(const struct sock *sk)
{
	struct sk_psock *psock = rcu_dereference_sk_user_data(sk);

	return psock;
}
|
||||
|
||||
static inline bool sk_has_psock(struct sock *sk)
|
||||
{
|
||||
return sk_psock(sk) != NULL && sk->sk_prot->recvmsg == tcp_bpf_recvmsg;
|
||||
}
|
||||
|
||||
static inline void sk_psock_queue_msg(struct sk_psock *psock,
|
||||
struct sk_msg *msg)
|
||||
{
|
||||
list_add_tail(&msg->list, &psock->ingress_msg);
|
||||
}
|
||||
|
||||
static inline bool sk_psock_queue_empty(const struct sk_psock *psock)
|
||||
{
|
||||
return psock ? list_empty(&psock->ingress_msg) : true;
|
||||
}
|
||||
|
||||
static inline void sk_psock_report_error(struct sk_psock *psock, int err)
|
||||
{
|
||||
struct sock *sk = psock->sk;
|
||||
|
||||
sk->sk_err = err;
|
||||
sk->sk_error_report(sk);
|
||||
}
|
||||
|
||||
struct sk_psock *sk_psock_init(struct sock *sk, int node);
|
||||
|
||||
int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock);
|
||||
void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock);
|
||||
void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock);
|
||||
|
||||
int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
|
||||
struct sk_msg *msg);
|
||||
|
||||
static inline struct sk_psock_link *sk_psock_init_link(void)
|
||||
{
|
||||
return kzalloc(sizeof(struct sk_psock_link),
|
||||
GFP_ATOMIC | __GFP_NOWARN);
|
||||
}
|
||||
|
||||
/* Release a link obtained from sk_psock_init_link(). */
static inline void sk_psock_free_link(struct sk_psock_link *link)
{
	kfree(link);
}
|
||||
|
||||
struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock);
|
||||
#if defined(CONFIG_BPF_STREAM_PARSER)
|
||||
void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link);
|
||||
#else
|
||||
/* No-op stand-in used when CONFIG_BPF_STREAM_PARSER is not defined. */
static inline void sk_psock_unlink(struct sock *sk,
				   struct sk_psock_link *link)
{
}
|
||||
#endif
|
||||
|
||||
void __sk_psock_purge_ingress_msg(struct sk_psock *psock);
|
||||
|
||||
static inline void sk_psock_cork_free(struct sk_psock *psock)
|
||||
{
|
||||
if (psock->cork) {
|
||||
sk_msg_free(psock->sk, psock->cork);
|
||||
kfree(psock->cork);
|
||||
psock->cork = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void sk_psock_update_proto(struct sock *sk,
|
||||
struct sk_psock *psock,
|
||||
struct proto *ops)
|
||||
{
|
||||
psock->saved_unhash = sk->sk_prot->unhash;
|
||||
psock->saved_close = sk->sk_prot->close;
|
||||
psock->saved_write_space = sk->sk_write_space;
|
||||
|
||||
psock->sk_proto = sk->sk_prot;
|
||||
sk->sk_prot = ops;
|
||||
}
|
||||
|
||||
static inline void sk_psock_restore_proto(struct sock *sk,
|
||||
struct sk_psock *psock)
|
||||
{
|
||||
if (psock->sk_proto) {
|
||||
sk->sk_prot = psock->sk_proto;
|
||||
psock->sk_proto = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void sk_psock_set_state(struct sk_psock *psock,
|
||||
enum sk_psock_state_bits bit)
|
||||
{
|
||||
set_bit(bit, &psock->state);
|
||||
}
|
||||
|
||||
static inline void sk_psock_clear_state(struct sk_psock *psock,
|
||||
enum sk_psock_state_bits bit)
|
||||
{
|
||||
clear_bit(bit, &psock->state);
|
||||
}
|
||||
|
||||
static inline bool sk_psock_test_state(const struct sk_psock *psock,
|
||||
enum sk_psock_state_bits bit)
|
||||
{
|
||||
return test_bit(bit, &psock->state);
|
||||
}
|
||||
|
||||
static inline struct sk_psock *sk_psock_get(struct sock *sk)
|
||||
{
|
||||
struct sk_psock *psock;
|
||||
|
||||
rcu_read_lock();
|
||||
psock = sk_psock(sk);
|
||||
if (psock && !refcount_inc_not_zero(&psock->refcnt))
|
||||
psock = NULL;
|
||||
rcu_read_unlock();
|
||||
return psock;
|
||||
}
|
||||
|
||||
void sk_psock_stop(struct sock *sk, struct sk_psock *psock);
|
||||
void sk_psock_destroy(struct rcu_head *rcu);
|
||||
void sk_psock_drop(struct sock *sk, struct sk_psock *psock);
|
||||
|
||||
static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock)
|
||||
{
|
||||
if (refcount_dec_and_test(&psock->refcnt))
|
||||
sk_psock_drop(sk, psock);
|
||||
}
|
||||
|
||||
/* Atomically install @prog in *@pprog (NULL is allowed) and release the
 * program that was stored there before, if any.
 */
static inline void psock_set_prog(struct bpf_prog **pprog,
				  struct bpf_prog *prog)
{
	struct bpf_prog *old = xchg(pprog, prog);

	if (old)
		bpf_prog_put(old);
}
|
||||
|
||||
static inline void psock_progs_drop(struct sk_psock_progs *progs)
|
||||
{
|
||||
psock_set_prog(&progs->msg_parser, NULL);
|
||||
psock_set_prog(&progs->skb_parser, NULL);
|
||||
psock_set_prog(&progs->skb_verdict, NULL);
|
||||
}
|
||||
|
||||
#endif /* _LINUX_SKMSG_H */
|
@ -2214,10 +2214,6 @@ static inline struct page_frag *sk_page_frag(struct sock *sk)
|
||||
|
||||
bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag);
|
||||
|
||||
int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
|
||||
int sg_start, int *sg_curr, unsigned int *sg_size,
|
||||
int first_coalesce);
|
||||
|
||||
/*
|
||||
* Default write policy as shown to user space via poll/select/SIGIO
|
||||
*/
|
||||
|
@ -858,6 +858,21 @@ static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
|
||||
TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
|
||||
}
|
||||
|
||||
static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb)
|
||||
{
|
||||
return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS;
|
||||
}
|
||||
|
||||
static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb)
|
||||
{
|
||||
return TCP_SKB_CB(skb)->bpf.sk_redir;
|
||||
}
|
||||
|
||||
static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb)
|
||||
{
|
||||
TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
|
||||
}
|
||||
|
||||
#if IS_ENABLED(CONFIG_IPV6)
|
||||
/* This is the variant of inet6_iif() that must be used by TCP,
|
||||
* as TCP moves IP6CB into a different location in skb->cb[]
|
||||
@ -2057,7 +2072,6 @@ struct tcp_ulp_ops {
|
||||
int tcp_register_ulp(struct tcp_ulp_ops *type);
|
||||
void tcp_unregister_ulp(struct tcp_ulp_ops *type);
|
||||
int tcp_set_ulp(struct sock *sk, const char *name);
|
||||
int tcp_set_ulp_id(struct sock *sk, const int ulp);
|
||||
void tcp_get_available_ulp(char *buf, size_t len);
|
||||
void tcp_cleanup_ulp(struct sock *sk);
|
||||
|
||||
@ -2065,6 +2079,18 @@ void tcp_cleanup_ulp(struct sock *sk);
|
||||
__MODULE_INFO(alias, alias_userspace, name); \
|
||||
__MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name)
|
||||
|
||||
struct sk_msg;
|
||||
struct sk_psock;
|
||||
|
||||
int tcp_bpf_init(struct sock *sk);
|
||||
void tcp_bpf_reinit(struct sock *sk);
|
||||
int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes,
|
||||
int flags);
|
||||
int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
|
||||
int nonblock, int flags, int *addr_len);
|
||||
int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
|
||||
struct msghdr *msg, int len);
|
||||
|
||||
/* Call BPF_SOCK_OPS program that returns an int. If the return value
|
||||
* is < 0, then the BPF op failed (for example if the loaded BPF
|
||||
* program does not support the chosen operation or there is no BPF
|
||||
|
@ -39,6 +39,8 @@
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/socket.h>
|
||||
#include <linux/tcp.h>
|
||||
#include <linux/skmsg.h>
|
||||
|
||||
#include <net/tcp.h>
|
||||
#include <net/strparser.h>
|
||||
#include <crypto/aead.h>
|
||||
@ -103,15 +105,13 @@ struct tls_rec {
|
||||
int tx_flags;
|
||||
int inplace_crypto;
|
||||
|
||||
/* AAD | sg_plaintext_data | sg_tag */
|
||||
struct scatterlist sg_plaintext_data[MAX_SKB_FRAGS + 1];
|
||||
/* AAD | sg_encrypted_data (data contain overhead for hdr&iv&tag) */
|
||||
struct scatterlist sg_encrypted_data[MAX_SKB_FRAGS + 1];
|
||||
struct sk_msg msg_plaintext;
|
||||
struct sk_msg msg_encrypted;
|
||||
|
||||
unsigned int sg_plaintext_size;
|
||||
unsigned int sg_encrypted_size;
|
||||
int sg_plaintext_num_elem;
|
||||
int sg_encrypted_num_elem;
|
||||
/* AAD | msg_plaintext.sg.data | sg_tag */
|
||||
struct scatterlist sg_aead_in[2];
|
||||
/* AAD | msg_encrypted.sg.data (data contains overhead for hdr & iv & tag) */
|
||||
struct scatterlist sg_aead_out[2];
|
||||
|
||||
char aad_space[TLS_AAD_SPACE_SIZE];
|
||||
struct aead_request aead_req;
|
||||
@ -142,8 +142,7 @@ struct tls_sw_context_rx {
|
||||
|
||||
struct strparser strp;
|
||||
void (*saved_data_ready)(struct sock *sk);
|
||||
unsigned int (*sk_poll)(struct file *file, struct socket *sock,
|
||||
struct poll_table_struct *wait);
|
||||
|
||||
struct sk_buff *recv_pkt;
|
||||
u8 control;
|
||||
bool decrypted;
|
||||
@ -223,8 +222,8 @@ struct tls_context {
|
||||
|
||||
unsigned long flags;
|
||||
bool in_tcp_sendpages;
|
||||
bool pending_open_record_frags;
|
||||
|
||||
u16 pending_open_record_frags;
|
||||
int (*push_pending_record)(struct sock *sk, int flags);
|
||||
|
||||
void (*sk_write_space)(struct sock *sk);
|
||||
@ -272,8 +271,7 @@ void tls_sw_free_resources_rx(struct sock *sk);
|
||||
void tls_sw_release_resources_rx(struct sock *sk);
|
||||
int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
|
||||
int nonblock, int flags, int *addr_len);
|
||||
unsigned int tls_sw_poll(struct file *file, struct socket *sock,
|
||||
struct poll_table_struct *wait);
|
||||
bool tls_sw_stream_read(const struct sock *sk);
|
||||
ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
|
||||
struct pipe_inode_info *pipe,
|
||||
size_t len, unsigned int flags);
|
||||
|
@ -13,11 +13,6 @@ ifeq ($(CONFIG_XDP_SOCKETS),y)
|
||||
obj-$(CONFIG_BPF_SYSCALL) += xskmap.o
|
||||
endif
|
||||
obj-$(CONFIG_BPF_SYSCALL) += offload.o
|
||||
ifeq ($(CONFIG_STREAM_PARSER),y)
|
||||
ifeq ($(CONFIG_INET),y)
|
||||
obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
ifeq ($(CONFIG_PERF_EVENTS),y)
|
||||
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
|
||||
|
@ -1792,8 +1792,6 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
|
||||
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
|
||||
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
|
||||
const struct bpf_func_proto bpf_get_current_comm_proto __weak;
|
||||
const struct bpf_func_proto bpf_sock_map_update_proto __weak;
|
||||
const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
|
||||
const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
|
||||
const struct bpf_func_proto bpf_get_local_storage_proto __weak;
|
||||
|
||||
|
2629
kernel/bpf/sockmap.c
2629
kernel/bpf/sockmap.c
File diff suppressed because it is too large
Load Diff
@ -1664,7 +1664,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
|
||||
switch (ptype) {
|
||||
case BPF_PROG_TYPE_SK_SKB:
|
||||
case BPF_PROG_TYPE_SK_MSG:
|
||||
ret = sockmap_get_from_fd(attr, ptype, prog);
|
||||
ret = sock_map_get_from_fd(attr, prog);
|
||||
break;
|
||||
case BPF_PROG_TYPE_LIRC_MODE2:
|
||||
ret = lirc_prog_attach(attr, prog);
|
||||
@ -1718,10 +1718,10 @@ static int bpf_prog_detach(const union bpf_attr *attr)
|
||||
ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
|
||||
break;
|
||||
case BPF_SK_MSG_VERDICT:
|
||||
return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL);
|
||||
return sock_map_get_from_fd(attr, NULL);
|
||||
case BPF_SK_SKB_STREAM_PARSER:
|
||||
case BPF_SK_SKB_STREAM_VERDICT:
|
||||
return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL);
|
||||
return sock_map_get_from_fd(attr, NULL);
|
||||
case BPF_LIRC_MODE2:
|
||||
return lirc_prog_detach(attr);
|
||||
case BPF_FLOW_DISSECTOR:
|
||||
|
11
net/Kconfig
11
net/Kconfig
@ -300,8 +300,11 @@ config BPF_JIT
|
||||
|
||||
config BPF_STREAM_PARSER
|
||||
bool "enable BPF STREAM_PARSER"
|
||||
depends on INET
|
||||
depends on BPF_SYSCALL
|
||||
depends on CGROUP_BPF
|
||||
select STREAM_PARSER
|
||||
select NET_SOCK_MSG
|
||||
---help---
|
||||
Enabling this allows a stream parser to be used with
|
||||
BPF_MAP_TYPE_SOCKMAP.
|
||||
@ -413,6 +416,14 @@ config GRO_CELLS
|
||||
config SOCK_VALIDATE_XMIT
|
||||
bool
|
||||
|
||||
config NET_SOCK_MSG
|
||||
bool
|
||||
default n
|
||||
help
|
||||
The NET_SOCK_MSG provides a framework for plain sockets (e.g. TCP) or
|
||||
ULPs (upper layer protocols, e.g. TLS) to process L7 application data
|
||||
with the help of BPF programs.
|
||||
|
||||
config NET_DEVLINK
|
||||
tristate "Network physical/parent device Netlink interface"
|
||||
help
|
||||
|
@ -16,6 +16,7 @@ obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
|
||||
obj-y += net-sysfs.o
|
||||
obj-$(CONFIG_PAGE_POOL) += page_pool.o
|
||||
obj-$(CONFIG_PROC_FS) += net-procfs.o
|
||||
obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
|
||||
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
|
||||
obj-$(CONFIG_NETPOLL) += netpoll.o
|
||||
obj-$(CONFIG_FIB_RULES) += fib_rules.o
|
||||
@ -27,6 +28,7 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
|
||||
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
|
||||
obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
|
||||
obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
|
||||
obj-$(CONFIG_BPF_STREAM_PARSER) += sock_map.o
|
||||
obj-$(CONFIG_DST_CACHE) += dst_cache.o
|
||||
obj-$(CONFIG_HWBM) += hwbm.o
|
||||
obj-$(CONFIG_NET_DEVLINK) += devlink.o
|
||||
|
@ -38,6 +38,7 @@
|
||||
#include <net/protocol.h>
|
||||
#include <net/netlink.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/skmsg.h>
|
||||
#include <net/sock.h>
|
||||
#include <net/flow_dissector.h>
|
||||
#include <linux/errno.h>
|
||||
@ -2142,123 +2143,7 @@ static const struct bpf_func_proto bpf_redirect_proto = {
|
||||
.arg2_type = ARG_ANYTHING,
|
||||
};
|
||||
|
||||
BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
|
||||
struct bpf_map *, map, void *, key, u64, flags)
|
||||
{
|
||||
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
|
||||
|
||||
/* If user passes invalid input drop the packet. */
|
||||
if (unlikely(flags & ~(BPF_F_INGRESS)))
|
||||
return SK_DROP;
|
||||
|
||||
tcb->bpf.flags = flags;
|
||||
tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
|
||||
if (!tcb->bpf.sk_redir)
|
||||
return SK_DROP;
|
||||
|
||||
return SK_PASS;
|
||||
}
|
||||
|
||||
static const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
|
||||
.func = bpf_sk_redirect_hash,
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_CONST_MAP_PTR,
|
||||
.arg3_type = ARG_PTR_TO_MAP_KEY,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
};
|
||||
|
||||
BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
|
||||
struct bpf_map *, map, u32, key, u64, flags)
|
||||
{
|
||||
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
|
||||
|
||||
/* If user passes invalid input drop the packet. */
|
||||
if (unlikely(flags & ~(BPF_F_INGRESS)))
|
||||
return SK_DROP;
|
||||
|
||||
tcb->bpf.flags = flags;
|
||||
tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key);
|
||||
if (!tcb->bpf.sk_redir)
|
||||
return SK_DROP;
|
||||
|
||||
return SK_PASS;
|
||||
}
|
||||
|
||||
struct sock *do_sk_redirect_map(struct sk_buff *skb)
|
||||
{
|
||||
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
|
||||
|
||||
return tcb->bpf.sk_redir;
|
||||
}
|
||||
|
||||
static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
|
||||
.func = bpf_sk_redirect_map,
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_CONST_MAP_PTR,
|
||||
.arg3_type = ARG_ANYTHING,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
};
|
||||
|
||||
BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg,
|
||||
struct bpf_map *, map, void *, key, u64, flags)
|
||||
{
|
||||
/* If user passes invalid input drop the packet. */
|
||||
if (unlikely(flags & ~(BPF_F_INGRESS)))
|
||||
return SK_DROP;
|
||||
|
||||
msg->flags = flags;
|
||||
msg->sk_redir = __sock_hash_lookup_elem(map, key);
|
||||
if (!msg->sk_redir)
|
||||
return SK_DROP;
|
||||
|
||||
return SK_PASS;
|
||||
}
|
||||
|
||||
static const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
|
||||
.func = bpf_msg_redirect_hash,
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_CONST_MAP_PTR,
|
||||
.arg3_type = ARG_PTR_TO_MAP_KEY,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
};
|
||||
|
||||
BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
|
||||
struct bpf_map *, map, u32, key, u64, flags)
|
||||
{
|
||||
/* If user passes invalid input drop the packet. */
|
||||
if (unlikely(flags & ~(BPF_F_INGRESS)))
|
||||
return SK_DROP;
|
||||
|
||||
msg->flags = flags;
|
||||
msg->sk_redir = __sock_map_lookup_elem(map, key);
|
||||
if (!msg->sk_redir)
|
||||
return SK_DROP;
|
||||
|
||||
return SK_PASS;
|
||||
}
|
||||
|
||||
struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
|
||||
{
|
||||
return msg->sk_redir;
|
||||
}
|
||||
|
||||
static const struct bpf_func_proto bpf_msg_redirect_map_proto = {
|
||||
.func = bpf_msg_redirect_map,
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_CONST_MAP_PTR,
|
||||
.arg3_type = ARG_ANYTHING,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
};
|
||||
|
||||
BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes)
|
||||
BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
|
||||
{
|
||||
msg->apply_bytes = bytes;
|
||||
return 0;
|
||||
@ -2272,7 +2157,7 @@ static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
|
||||
.arg2_type = ARG_ANYTHING,
|
||||
};
|
||||
|
||||
BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes)
|
||||
BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes)
|
||||
{
|
||||
msg->cork_bytes = bytes;
|
||||
return 0;
|
||||
@ -2286,45 +2171,37 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
|
||||
.arg2_type = ARG_ANYTHING,
|
||||
};
|
||||
|
||||
#define sk_msg_iter_var(var) \
|
||||
do { \
|
||||
var++; \
|
||||
if (var == MAX_SKB_FRAGS) \
|
||||
var = 0; \
|
||||
} while (0)
|
||||
|
||||
BPF_CALL_4(bpf_msg_pull_data,
|
||||
struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
|
||||
BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
|
||||
u32, end, u64, flags)
|
||||
{
|
||||
unsigned int len = 0, offset = 0, copy = 0, poffset = 0;
|
||||
int bytes = end - start, bytes_sg_total;
|
||||
struct scatterlist *sg = msg->sg_data;
|
||||
int first_sg, last_sg, i, shift;
|
||||
unsigned char *p, *to, *from;
|
||||
u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start;
|
||||
u32 first_sge, last_sge, i, shift, bytes_sg_total;
|
||||
struct scatterlist *sge;
|
||||
u8 *raw, *to, *from;
|
||||
struct page *page;
|
||||
|
||||
if (unlikely(flags || end <= start))
|
||||
return -EINVAL;
|
||||
|
||||
/* First find the starting scatterlist element */
|
||||
i = msg->sg_start;
|
||||
i = msg->sg.start;
|
||||
do {
|
||||
len = sg[i].length;
|
||||
len = sk_msg_elem(msg, i)->length;
|
||||
if (start < offset + len)
|
||||
break;
|
||||
offset += len;
|
||||
sk_msg_iter_var(i);
|
||||
} while (i != msg->sg_end);
|
||||
sk_msg_iter_var_next(i);
|
||||
} while (i != msg->sg.end);
|
||||
|
||||
if (unlikely(start >= offset + len))
|
||||
return -EINVAL;
|
||||
|
||||
first_sg = i;
|
||||
first_sge = i;
|
||||
/* The start may point into the sg element so we need to also
|
||||
* account for the headroom.
|
||||
*/
|
||||
bytes_sg_total = start - offset + bytes;
|
||||
if (!msg->sg_copy[i] && bytes_sg_total <= len)
|
||||
if (!msg->sg.copy[i] && bytes_sg_total <= len)
|
||||
goto out;
|
||||
|
||||
/* At this point we need to linearize multiple scatterlist
|
||||
@ -2338,12 +2215,12 @@ BPF_CALL_4(bpf_msg_pull_data,
|
||||
* will copy the entire sg entry.
|
||||
*/
|
||||
do {
|
||||
copy += sg[i].length;
|
||||
sk_msg_iter_var(i);
|
||||
copy += sk_msg_elem(msg, i)->length;
|
||||
sk_msg_iter_var_next(i);
|
||||
if (bytes_sg_total <= copy)
|
||||
break;
|
||||
} while (i != msg->sg_end);
|
||||
last_sg = i;
|
||||
} while (i != msg->sg.end);
|
||||
last_sge = i;
|
||||
|
||||
if (unlikely(bytes_sg_total > copy))
|
||||
return -EINVAL;
|
||||
@ -2352,63 +2229,61 @@ BPF_CALL_4(bpf_msg_pull_data,
|
||||
get_order(copy));
|
||||
if (unlikely(!page))
|
||||
return -ENOMEM;
|
||||
p = page_address(page);
|
||||
|
||||
i = first_sg;
|
||||
raw = page_address(page);
|
||||
i = first_sge;
|
||||
do {
|
||||
from = sg_virt(&sg[i]);
|
||||
len = sg[i].length;
|
||||
to = p + poffset;
|
||||
sge = sk_msg_elem(msg, i);
|
||||
from = sg_virt(sge);
|
||||
len = sge->length;
|
||||
to = raw + poffset;
|
||||
|
||||
memcpy(to, from, len);
|
||||
poffset += len;
|
||||
sg[i].length = 0;
|
||||
put_page(sg_page(&sg[i]));
|
||||
sge->length = 0;
|
||||
put_page(sg_page(sge));
|
||||
|
||||
sk_msg_iter_var(i);
|
||||
} while (i != last_sg);
|
||||
sk_msg_iter_var_next(i);
|
||||
} while (i != last_sge);
|
||||
|
||||
sg[first_sg].length = copy;
|
||||
sg_set_page(&sg[first_sg], page, copy, 0);
|
||||
sg_set_page(&msg->sg.data[first_sge], page, copy, 0);
|
||||
|
||||
/* To repair sg ring we need to shift entries. If we only
|
||||
* had a single entry though we can just replace it and
|
||||
* be done. Otherwise walk the ring and shift the entries.
|
||||
*/
|
||||
WARN_ON_ONCE(last_sg == first_sg);
|
||||
shift = last_sg > first_sg ?
|
||||
last_sg - first_sg - 1 :
|
||||
MAX_SKB_FRAGS - first_sg + last_sg - 1;
|
||||
WARN_ON_ONCE(last_sge == first_sge);
|
||||
shift = last_sge > first_sge ?
|
||||
last_sge - first_sge - 1 :
|
||||
MAX_SKB_FRAGS - first_sge + last_sge - 1;
|
||||
if (!shift)
|
||||
goto out;
|
||||
|
||||
i = first_sg;
|
||||
sk_msg_iter_var(i);
|
||||
i = first_sge;
|
||||
sk_msg_iter_var_next(i);
|
||||
do {
|
||||
int move_from;
|
||||
u32 move_from;
|
||||
|
||||
if (i + shift >= MAX_SKB_FRAGS)
|
||||
move_from = i + shift - MAX_SKB_FRAGS;
|
||||
if (i + shift >= MAX_MSG_FRAGS)
|
||||
move_from = i + shift - MAX_MSG_FRAGS;
|
||||
else
|
||||
move_from = i + shift;
|
||||
|
||||
if (move_from == msg->sg_end)
|
||||
if (move_from == msg->sg.end)
|
||||
break;
|
||||
|
||||
sg[i] = sg[move_from];
|
||||
sg[move_from].length = 0;
|
||||
sg[move_from].page_link = 0;
|
||||
sg[move_from].offset = 0;
|
||||
|
||||
sk_msg_iter_var(i);
|
||||
msg->sg.data[i] = msg->sg.data[move_from];
|
||||
msg->sg.data[move_from].length = 0;
|
||||
msg->sg.data[move_from].page_link = 0;
|
||||
msg->sg.data[move_from].offset = 0;
|
||||
sk_msg_iter_var_next(i);
|
||||
} while (1);
|
||||
msg->sg_end -= shift;
|
||||
if (msg->sg_end < 0)
|
||||
msg->sg_end += MAX_SKB_FRAGS;
|
||||
out:
|
||||
msg->data = sg_virt(&sg[first_sg]) + start - offset;
|
||||
msg->data_end = msg->data + bytes;
|
||||
|
||||
msg->sg.end = msg->sg.end - shift > msg->sg.end ?
|
||||
msg->sg.end - shift + MAX_MSG_FRAGS :
|
||||
msg->sg.end - shift;
|
||||
out:
|
||||
msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
|
||||
msg->data_end = msg->data + bytes;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -5203,6 +5078,9 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
|
||||
}
|
||||
}
|
||||
|
||||
const struct bpf_func_proto bpf_sock_map_update_proto __weak;
|
||||
const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
|
||||
|
||||
static const struct bpf_func_proto *
|
||||
sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
|
||||
{
|
||||
@ -5226,6 +5104,9 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
|
||||
}
|
||||
}
|
||||
|
||||
const struct bpf_func_proto bpf_msg_redirect_map_proto __weak;
|
||||
const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak;
|
||||
|
||||
static const struct bpf_func_proto *
|
||||
sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
|
||||
{
|
||||
@ -5247,6 +5128,9 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
|
||||
}
|
||||
}
|
||||
|
||||
const struct bpf_func_proto bpf_sk_redirect_map_proto __weak;
|
||||
const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak;
|
||||
|
||||
static const struct bpf_func_proto *
|
||||
sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
|
||||
{
|
||||
@ -7001,22 +6885,22 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
|
||||
|
||||
switch (si->off) {
|
||||
case offsetof(struct sk_msg_md, data):
|
||||
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data),
|
||||
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data),
|
||||
si->dst_reg, si->src_reg,
|
||||
offsetof(struct sk_msg_buff, data));
|
||||
offsetof(struct sk_msg, data));
|
||||
break;
|
||||
case offsetof(struct sk_msg_md, data_end):
|
||||
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end),
|
||||
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end),
|
||||
si->dst_reg, si->src_reg,
|
||||
offsetof(struct sk_msg_buff, data_end));
|
||||
offsetof(struct sk_msg, data_end));
|
||||
break;
|
||||
case offsetof(struct sk_msg_md, family):
|
||||
BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
|
||||
|
||||
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
|
||||
struct sk_msg_buff, sk),
|
||||
struct sk_msg, sk),
|
||||
si->dst_reg, si->src_reg,
|
||||
offsetof(struct sk_msg_buff, sk));
|
||||
offsetof(struct sk_msg, sk));
|
||||
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
|
||||
offsetof(struct sock_common, skc_family));
|
||||
break;
|
||||
@ -7025,9 +6909,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
|
||||
BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
|
||||
|
||||
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
|
||||
struct sk_msg_buff, sk),
|
||||
struct sk_msg, sk),
|
||||
si->dst_reg, si->src_reg,
|
||||
offsetof(struct sk_msg_buff, sk));
|
||||
offsetof(struct sk_msg, sk));
|
||||
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
|
||||
offsetof(struct sock_common, skc_daddr));
|
||||
break;
|
||||
@ -7037,9 +6921,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
|
||||
skc_rcv_saddr) != 4);
|
||||
|
||||
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
|
||||
struct sk_msg_buff, sk),
|
||||
struct sk_msg, sk),
|
||||
si->dst_reg, si->src_reg,
|
||||
offsetof(struct sk_msg_buff, sk));
|
||||
offsetof(struct sk_msg, sk));
|
||||
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
|
||||
offsetof(struct sock_common,
|
||||
skc_rcv_saddr));
|
||||
@ -7054,9 +6938,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
|
||||
off = si->off;
|
||||
off -= offsetof(struct sk_msg_md, remote_ip6[0]);
|
||||
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
|
||||
struct sk_msg_buff, sk),
|
||||
struct sk_msg, sk),
|
||||
si->dst_reg, si->src_reg,
|
||||
offsetof(struct sk_msg_buff, sk));
|
||||
offsetof(struct sk_msg, sk));
|
||||
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
|
||||
offsetof(struct sock_common,
|
||||
skc_v6_daddr.s6_addr32[0]) +
|
||||
@ -7075,9 +6959,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
|
||||
off = si->off;
|
||||
off -= offsetof(struct sk_msg_md, local_ip6[0]);
|
||||
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
|
||||
struct sk_msg_buff, sk),
|
||||
struct sk_msg, sk),
|
||||
si->dst_reg, si->src_reg,
|
||||
offsetof(struct sk_msg_buff, sk));
|
||||
offsetof(struct sk_msg, sk));
|
||||
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
|
||||
offsetof(struct sock_common,
|
||||
skc_v6_rcv_saddr.s6_addr32[0]) +
|
||||
@ -7091,9 +6975,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
|
||||
BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
|
||||
|
||||
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
|
||||
struct sk_msg_buff, sk),
|
||||
struct sk_msg, sk),
|
||||
si->dst_reg, si->src_reg,
|
||||
offsetof(struct sk_msg_buff, sk));
|
||||
offsetof(struct sk_msg, sk));
|
||||
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
|
||||
offsetof(struct sock_common, skc_dport));
|
||||
#ifndef __BIG_ENDIAN_BITFIELD
|
||||
@ -7105,9 +6989,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
|
||||
BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
|
||||
|
||||
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
|
||||
struct sk_msg_buff, sk),
|
||||
struct sk_msg, sk),
|
||||
si->dst_reg, si->src_reg,
|
||||
offsetof(struct sk_msg_buff, sk));
|
||||
offsetof(struct sk_msg, sk));
|
||||
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
|
||||
offsetof(struct sock_common, skc_num));
|
||||
break;
|
||||
|
802
net/core/skmsg.c
Normal file
802
net/core/skmsg.c
Normal file
@ -0,0 +1,802 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
|
||||
|
||||
#include <linux/skmsg.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/scatterlist.h>
|
||||
|
||||
#include <net/sock.h>
|
||||
#include <net/tcp.h>
|
||||
|
||||
static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce)
|
||||
{
|
||||
if (msg->sg.end > msg->sg.start &&
|
||||
elem_first_coalesce < msg->sg.end)
|
||||
return true;
|
||||
|
||||
if (msg->sg.end < msg->sg.start &&
|
||||
(elem_first_coalesce > msg->sg.start ||
|
||||
elem_first_coalesce < msg->sg.end))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
|
||||
int elem_first_coalesce)
|
||||
{
|
||||
struct page_frag *pfrag = sk_page_frag(sk);
|
||||
int ret = 0;
|
||||
|
||||
len -= msg->sg.size;
|
||||
while (len > 0) {
|
||||
struct scatterlist *sge;
|
||||
u32 orig_offset;
|
||||
int use, i;
|
||||
|
||||
if (!sk_page_frag_refill(sk, pfrag))
|
||||
return -ENOMEM;
|
||||
|
||||
orig_offset = pfrag->offset;
|
||||
use = min_t(int, len, pfrag->size - orig_offset);
|
||||
if (!sk_wmem_schedule(sk, use))
|
||||
return -ENOMEM;
|
||||
|
||||
i = msg->sg.end;
|
||||
sk_msg_iter_var_prev(i);
|
||||
sge = &msg->sg.data[i];
|
||||
|
||||
if (sk_msg_try_coalesce_ok(msg, elem_first_coalesce) &&
|
||||
sg_page(sge) == pfrag->page &&
|
||||
sge->offset + sge->length == orig_offset) {
|
||||
sge->length += use;
|
||||
} else {
|
||||
if (sk_msg_full(msg)) {
|
||||
ret = -ENOSPC;
|
||||
break;
|
||||
}
|
||||
|
||||
sge = &msg->sg.data[msg->sg.end];
|
||||
sg_unmark_end(sge);
|
||||
sg_set_page(sge, pfrag->page, use, orig_offset);
|
||||
get_page(pfrag->page);
|
||||
sk_msg_iter_next(msg, end);
|
||||
}
|
||||
|
||||
sk_mem_charge(sk, use);
|
||||
msg->sg.size += use;
|
||||
pfrag->offset += use;
|
||||
len -= use;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(sk_msg_alloc);
|
||||
|
||||
/* Make @dst reference @len bytes of @src, starting @off bytes into @src.
 *
 * The first loop skips source elements that lie entirely before @off. The
 * second loop hands each (possibly partial) source element to
 * sk_msg_page_add() and charges the bytes to @sk with sk_mem_charge().
 *
 * Returns 0 on success, or -ENOSPC when @dst has no free element or when
 * @src ends before @off/@len are satisfied. Elements added before a
 * failure are left in @dst.
 */
int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src,
		 u32 off, u32 len)
{
	int i = src->sg.start;
	struct scatterlist *sge = sk_msg_elem(src, i);
	u32 sge_len, sge_off;

	if (sk_msg_full(dst))
		return -ENOSPC;

	while (off) {
		if (sge->length > off)
			break;
		off -= sge->length;
		sk_msg_iter_var_next(i);
		if (i == src->sg.end && off)
			return -ENOSPC;
		sge = sk_msg_elem(src, i);
	}

	while (len) {
		/* Every pass adds one element to @dst, so the entry check
		 * alone is not enough: re-check before each add so a
		 * multi-element range cannot overrun the destination ring.
		 */
		if (sk_msg_full(dst))
			return -ENOSPC;

		sge_len = sge->length - off;
		sge_off = sge->offset + off;
		if (sge_len > len)
			sge_len = len;
		/* Only the first copied element starts mid-way. */
		off = 0;
		len -= sge_len;
		sk_msg_page_add(dst, sg_page(sge), sge_len, sge_off);
		sk_mem_charge(sk, sge_len);
		sk_msg_iter_var_next(i);
		if (i == src->sg.end && len)
			return -ENOSPC;
		sge = sk_msg_elem(src, i);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(sk_msg_clone);
|
||||
|
||||
void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes)
|
||||
{
|
||||
int i = msg->sg.start;
|
||||
|
||||
do {
|
||||
struct scatterlist *sge = sk_msg_elem(msg, i);
|
||||
|
||||
if (bytes < sge->length) {
|
||||
sge->length -= bytes;
|
||||
sge->offset += bytes;
|
||||
sk_mem_uncharge(sk, bytes);
|
||||
break;
|
||||
}
|
||||
|
||||
sk_mem_uncharge(sk, sge->length);
|
||||
bytes -= sge->length;
|
||||
sge->length = 0;
|
||||
sge->offset = 0;
|
||||
sk_msg_iter_var_next(i);
|
||||
} while (bytes && i != msg->sg.end);
|
||||
msg->sg.start = i;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(sk_msg_return_zero);
|
||||
|
||||
void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes)
|
||||
{
|
||||
int i = msg->sg.start;
|
||||
|
||||
do {
|
||||
struct scatterlist *sge = &msg->sg.data[i];
|
||||
int uncharge = (bytes < sge->length) ? bytes : sge->length;
|
||||
|
||||
sk_mem_uncharge(sk, uncharge);
|
||||
bytes -= uncharge;
|
||||
sk_msg_iter_var_next(i);
|
||||
} while (i != msg->sg.end);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(sk_msg_return);
|
||||
|
||||
static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i,
|
||||
bool charge)
|
||||
{
|
||||
struct scatterlist *sge = sk_msg_elem(msg, i);
|
||||
u32 len = sge->length;
|
||||
|
||||
if (charge)
|
||||
sk_mem_uncharge(sk, len);
|
||||
if (!msg->skb)
|
||||
put_page(sg_page(sge));
|
||||
memset(sge, 0, sizeof(*sge));
|
||||
return len;
|
||||
}
|
||||
|
||||
static int __sk_msg_free(struct sock *sk, struct sk_msg *msg, u32 i,
|
||||
bool charge)
|
||||
{
|
||||
struct scatterlist *sge = sk_msg_elem(msg, i);
|
||||
int freed = 0;
|
||||
|
||||
while (msg->sg.size) {
|
||||
msg->sg.size -= sge->length;
|
||||
freed += sk_msg_free_elem(sk, msg, i, charge);
|
||||
sk_msg_iter_var_next(i);
|
||||
sk_msg_check_to_free(msg, i, msg->sg.size);
|
||||
sge = sk_msg_elem(msg, i);
|
||||
}
|
||||
if (msg->skb)
|
||||
consume_skb(msg->skb);
|
||||
sk_msg_init(msg);
|
||||
return freed;
|
||||
}
|
||||
|
||||
int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg)
|
||||
{
|
||||
return __sk_msg_free(sk, msg, msg->sg.start, false);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(sk_msg_free_nocharge);
|
||||
|
||||
int sk_msg_free(struct sock *sk, struct sk_msg *msg)
|
||||
{
|
||||
return __sk_msg_free(sk, msg, msg->sg.start, true);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(sk_msg_free);
|
||||
|
||||
static void __sk_msg_free_partial(struct sock *sk, struct sk_msg *msg,
|
||||
u32 bytes, bool charge)
|
||||
{
|
||||
struct scatterlist *sge;
|
||||
u32 i = msg->sg.start;
|
||||
|
||||
while (bytes) {
|
||||
sge = sk_msg_elem(msg, i);
|
||||
if (!sge->length)
|
||||
break;
|
||||
if (bytes < sge->length) {
|
||||
if (charge)
|
||||
sk_mem_uncharge(sk, bytes);
|
||||
sge->length -= bytes;
|
||||
sge->offset += bytes;
|
||||
msg->sg.size -= bytes;
|
||||
break;
|
||||
}
|
||||
|
||||
msg->sg.size -= sge->length;
|
||||
bytes -= sge->length;
|
||||
sk_msg_free_elem(sk, msg, i, charge);
|
||||
sk_msg_iter_var_next(i);
|
||||
sk_msg_check_to_free(msg, i, bytes);
|
||||
}
|
||||
msg->sg.start = i;
|
||||
}
|
||||
|
||||
/* Release the first @bytes bytes of @msg and uncharge them from @sk. */
void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes)
{
	const bool charge = true;

	__sk_msg_free_partial(sk, msg, bytes, charge);
}
EXPORT_SYMBOL_GPL(sk_msg_free_partial);
|
||||
|
||||
void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg,
|
||||
u32 bytes)
|
||||
{
|
||||
__sk_msg_free_partial(sk, msg, bytes, false);
|
||||
}
|
||||
|
||||
void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len)
|
||||
{
|
||||
int trim = msg->sg.size - len;
|
||||
u32 i = msg->sg.end;
|
||||
|
||||
if (trim <= 0) {
|
||||
WARN_ON(trim < 0);
|
||||
return;
|
||||
}
|
||||
|
||||
sk_msg_iter_var_prev(i);
|
||||
msg->sg.size = len;
|
||||
while (msg->sg.data[i].length &&
|
||||
trim >= msg->sg.data[i].length) {
|
||||
trim -= msg->sg.data[i].length;
|
||||
sk_msg_free_elem(sk, msg, i, true);
|
||||
sk_msg_iter_var_prev(i);
|
||||
if (!trim)
|
||||
goto out;
|
||||
}
|
||||
|
||||
msg->sg.data[i].length -= trim;
|
||||
sk_mem_uncharge(sk, trim);
|
||||
out:
|
||||
/* If we trim data before curr pointer update copybreak and current
|
||||
* so that any future copy operations start at new copy location.
|
||||
* However trimed data that has not yet been used in a copy op
|
||||
* does not require an update.
|
||||
*/
|
||||
if (msg->sg.curr >= i) {
|
||||
msg->sg.curr = i;
|
||||
msg->sg.copybreak = msg->sg.data[i].length;
|
||||
}
|
||||
sk_msg_iter_var_next(i);
|
||||
msg->sg.end = i;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(sk_msg_trim);
|
||||
|
||||
/* Append up to @bytes bytes from @from to @msg by referencing the
 * iterator's pages rather than copying them.
 *
 * Pages are fetched with iov_iter_get_pages() in batches limited by the
 * number of free ring elements; each page becomes one element at
 * msg->sg.end and its bytes are charged to @sk.
 *
 * Returns 0 on success or -EFAULT when no element is free or no pages
 * could be obtained. On error the iterator is rewound by the number of
 * bytes already added; @msg itself is not rolled back.
 */
int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
			      struct sk_msg *msg, u32 bytes)
{
	const int to_max_pages = MAX_MSG_FRAGS;
	int num_elems = sk_msg_elem_used(msg);
	struct page *pages[MAX_MSG_FRAGS];
	ssize_t orig, copied, use, offset;
	int i, maxpages, ret = 0;

	orig = msg->sg.size;
	while (bytes > 0) {
		struct scatterlist *sge;

		maxpages = to_max_pages - num_elems;
		if (maxpages == 0) {
			ret = -EFAULT;
			break;
		}

		copied = iov_iter_get_pages(from, pages, bytes, maxpages,
					    &offset);
		if (copied <= 0) {
			ret = -EFAULT;
			break;
		}

		iov_iter_advance(from, copied);
		bytes -= copied;
		msg->sg.size += copied;

		/* One element per page; only the first page of a batch can
		 * start at a non-zero offset.
		 */
		for (i = 0; copied; i++) {
			sge = &msg->sg.data[msg->sg.end];
			use = min_t(int, copied, PAGE_SIZE - offset);
			sg_set_page(sge, pages[i], use, offset);
			sg_unmark_end(sge);
			sk_mem_charge(sk, use);

			offset = 0;
			copied -= use;
			sk_msg_iter_next(msg, end);
			num_elems++;
		}

		/* When zerocopy is mixed with sk_msg_*copy* operations a
		 * copybreak may be set; clear it and prefer the zerocopy
		 * remainder when possible.
		 */
		msg->sg.copybreak = 0;
		msg->sg.curr = msg->sg.end;
	}

	/* Revert iov_iter updates; msg will need to use 'trim' later if it
	 * also needs to be cleared.
	 */
	if (ret)
		iov_iter_revert(from, msg->sg.size - orig);

	return ret;
}
EXPORT_SYMBOL_GPL(sk_msg_zerocopy_from_iter);
|
||||
|
||||
/* Copy up to @bytes bytes from @from into the buffers already present in
 * @msg, continuing at element msg->sg.curr, offset msg->sg.copybreak.
 *
 * The copy uses copy_from_iter_nocache() when the socket route has
 * NETIF_F_NOCACHE_COPY set and copy_from_iter() otherwise.
 * msg->sg.curr is updated to the element where copying stopped.
 *
 * Returns -EFAULT when a copy comes up short and -ENOSPC when no element
 * had room; otherwise the return value is the size of the last chunk
 * copied (not the total).
 */
int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
			     struct sk_msg *msg, u32 bytes)
{
	int ret = -ENOSPC, idx = msg->sg.curr;
	struct scatterlist *sge;
	u32 chunk, room;
	void *dst;

	do {
		sge = sk_msg_elem(msg, idx);
		/* This is possible if a trim operation shrunk the buffer */
		if (msg->sg.copybreak >= sge->length) {
			msg->sg.copybreak = 0;
			sk_msg_iter_var_next(idx);
			if (idx == msg->sg.end)
				break;
			sge = sk_msg_elem(msg, idx);
		}

		room = sge->length - msg->sg.copybreak;
		chunk = min(room, bytes);
		dst = sg_virt(sge) + msg->sg.copybreak;
		msg->sg.copybreak += chunk;

		if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
			ret = copy_from_iter_nocache(dst, chunk, from);
		else
			ret = copy_from_iter(dst, chunk, from);
		if (ret != chunk) {
			ret = -EFAULT;
			break;
		}

		bytes -= chunk;
		if (!bytes)
			break;

		msg->sg.copybreak = 0;
		sk_msg_iter_var_next(idx);
	} while (idx != msg->sg.end);

	msg->sg.curr = idx;
	return ret;
}
EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
|
||||
|
||||
static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
|
||||
{
|
||||
struct sock *sk = psock->sk;
|
||||
int copied = 0, num_sge;
|
||||
struct sk_msg *msg;
|
||||
|
||||
msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
|
||||
if (unlikely(!msg))
|
||||
return -EAGAIN;
|
||||
if (!sk_rmem_schedule(sk, skb, skb->len)) {
|
||||
kfree(msg);
|
||||
return -EAGAIN;
|
||||
}
|
||||
|
||||
sk_msg_init(msg);
|
||||
num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len);
|
||||
if (unlikely(num_sge < 0)) {
|
||||
kfree(msg);
|
||||
return num_sge;
|
||||
}
|
||||
|
||||
sk_mem_charge(sk, skb->len);
|
||||
copied = skb->len;
|
||||
msg->sg.start = 0;
|
||||
msg->sg.end = num_sge == MAX_MSG_FRAGS ? 0 : num_sge;
|
||||
msg->skb = skb;
|
||||
|
||||
sk_psock_queue_msg(psock, msg);
|
||||
sk->sk_data_ready(sk);
|
||||
return copied;
|
||||
}
|
||||
|
||||
/* Deliver @skb for @psock: queue it on the receive side when @ingress is
 * set, otherwise send @len bytes starting at @off on psock->sk with
 * skb_send_sock_locked(). Returns whatever the chosen helper returns.
 */
static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
			       u32 off, u32 len, bool ingress)
{
	if (!ingress)
		return skb_send_sock_locked(psock->sk, skb, off, len);

	return sk_psock_skb_ingress(psock, skb);
}
|
||||
|
||||
static void sk_psock_backlog(struct work_struct *work)
|
||||
{
|
||||
struct sk_psock *psock = container_of(work, struct sk_psock, work);
|
||||
struct sk_psock_work_state *state = &psock->work_state;
|
||||
struct sk_buff *skb;
|
||||
bool ingress;
|
||||
u32 len, off;
|
||||
int ret;
|
||||
|
||||
/* Lock sock to avoid losing sk_socket during loop. */
|
||||
lock_sock(psock->sk);
|
||||
if (state->skb) {
|
||||
skb = state->skb;
|
||||
len = state->len;
|
||||
off = state->off;
|
||||
state->skb = NULL;
|
||||
goto start;
|
||||
}
|
||||
|
||||
while ((skb = skb_dequeue(&psock->ingress_skb))) {
|
||||
len = skb->len;
|
||||
off = 0;
|
||||
start:
|
||||
ingress = tcp_skb_bpf_ingress(skb);
|
||||
do {
|
||||
ret = -EIO;
|
||||
if (likely(psock->sk->sk_socket))
|
||||
ret = sk_psock_handle_skb(psock, skb, off,
|
||||
len, ingress);
|
||||
if (ret <= 0) {
|
||||
if (ret == -EAGAIN) {
|
||||
state->skb = skb;
|
||||
state->len = len;
|
||||
state->off = off;
|
||||
goto end;
|
||||
}
|
||||
/* Hard errors break pipe and stop xmit. */
|
||||
sk_psock_report_error(psock, ret ? -ret : EPIPE);
|
||||
sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
|
||||
kfree_skb(skb);
|
||||
goto end;
|
||||
}
|
||||
off += ret;
|
||||
len -= ret;
|
||||
} while (len);
|
||||
|
||||
if (!ingress)
|
||||
kfree_skb(skb);
|
||||
}
|
||||
end:
|
||||
release_sock(psock->sk);
|
||||
}
|
||||
|
||||
struct sk_psock *sk_psock_init(struct sock *sk, int node)
|
||||
{
|
||||
struct sk_psock *psock = kzalloc_node(sizeof(*psock),
|
||||
GFP_ATOMIC | __GFP_NOWARN,
|
||||
node);
|
||||
if (!psock)
|
||||
return NULL;
|
||||
|
||||
psock->sk = sk;
|
||||
psock->eval = __SK_NONE;
|
||||
|
||||
INIT_LIST_HEAD(&psock->link);
|
||||
spin_lock_init(&psock->link_lock);
|
||||
|
||||
INIT_WORK(&psock->work, sk_psock_backlog);
|
||||
INIT_LIST_HEAD(&psock->ingress_msg);
|
||||
skb_queue_head_init(&psock->ingress_skb);
|
||||
|
||||
sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
|
||||
refcount_set(&psock->refcnt, 1);
|
||||
|
||||
rcu_assign_sk_user_data(sk, psock);
|
||||
sock_hold(sk);
|
||||
|
||||
return psock;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(sk_psock_init);
|
||||
|
||||
struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock)
|
||||
{
|
||||
struct sk_psock_link *link;
|
||||
|
||||
spin_lock_bh(&psock->link_lock);
|
||||
link = list_first_entry_or_null(&psock->link, struct sk_psock_link,
|
||||
list);
|
||||
if (link)
|
||||
list_del(&link->list);
|
||||
spin_unlock_bh(&psock->link_lock);
|
||||
return link;
|
||||
}
|
||||
|
||||
void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
|
||||
{
|
||||
struct sk_msg *msg, *tmp;
|
||||
|
||||
list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) {
|
||||
list_del(&msg->list);
|
||||
sk_msg_free(psock->sk, msg);
|
||||
kfree(msg);
|
||||
}
|
||||
}
|
||||
|
||||
static void sk_psock_zap_ingress(struct sk_psock *psock)
|
||||
{
|
||||
__skb_queue_purge(&psock->ingress_skb);
|
||||
__sk_psock_purge_ingress_msg(psock);
|
||||
}
|
||||
|
||||
static void sk_psock_link_destroy(struct sk_psock *psock)
|
||||
{
|
||||
struct sk_psock_link *link, *tmp;
|
||||
|
||||
list_for_each_entry_safe(link, tmp, &psock->link, list) {
|
||||
list_del(&link->list);
|
||||
sk_psock_free_link(link);
|
||||
}
|
||||
}
|
||||
|
||||
static void sk_psock_destroy_deferred(struct work_struct *gc)
|
||||
{
|
||||
struct sk_psock *psock = container_of(gc, struct sk_psock, gc);
|
||||
|
||||
/* No sk_callback_lock since already detached. */
|
||||
if (psock->parser.enabled)
|
||||
strp_done(&psock->parser.strp);
|
||||
|
||||
cancel_work_sync(&psock->work);
|
||||
|
||||
psock_progs_drop(&psock->progs);
|
||||
|
||||
sk_psock_link_destroy(psock);
|
||||
sk_psock_cork_free(psock);
|
||||
sk_psock_zap_ingress(psock);
|
||||
|
||||
if (psock->sk_redir)
|
||||
sock_put(psock->sk_redir);
|
||||
sock_put(psock->sk);
|
||||
kfree(psock);
|
||||
}
|
||||
|
||||
void sk_psock_destroy(struct rcu_head *rcu)
|
||||
{
|
||||
struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu);
|
||||
|
||||
INIT_WORK(&psock->gc, sk_psock_destroy_deferred);
|
||||
schedule_work(&psock->gc);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(sk_psock_destroy);
|
||||
|
||||
void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
|
||||
{
|
||||
rcu_assign_sk_user_data(sk, NULL);
|
||||
sk_psock_cork_free(psock);
|
||||
sk_psock_restore_proto(sk, psock);
|
||||
|
||||
write_lock_bh(&sk->sk_callback_lock);
|
||||
if (psock->progs.skb_parser)
|
||||
sk_psock_stop_strp(sk, psock);
|
||||
write_unlock_bh(&sk->sk_callback_lock);
|
||||
sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
|
||||
|
||||
call_rcu_sched(&psock->rcu, sk_psock_destroy);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(sk_psock_drop);
|
||||
|
||||
/* Translate a BPF program verdict into the internal __SK_* value.
 *
 * SK_PASS becomes __SK_REDIRECT when @redir is set and __SK_PASS
 * otherwise; SK_DROP and any other value become __SK_DROP.
 */
static int sk_psock_map_verd(int verdict, bool redir)
{
	if (verdict != SK_PASS)
		return __SK_DROP;

	if (redir)
		return __SK_REDIRECT;

	return __SK_PASS;
}
|
||||
|
||||
int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
|
||||
struct sk_msg *msg)
|
||||
{
|
||||
struct bpf_prog *prog;
|
||||
int ret;
|
||||
|
||||
preempt_disable();
|
||||
rcu_read_lock();
|
||||
prog = READ_ONCE(psock->progs.msg_parser);
|
||||
if (unlikely(!prog)) {
|
||||
ret = __SK_PASS;
|
||||
goto out;
|
||||
}
|
||||
|
||||
sk_msg_compute_data_pointers(msg);
|
||||
msg->sk = sk;
|
||||
ret = BPF_PROG_RUN(prog, msg);
|
||||
ret = sk_psock_map_verd(ret, msg->sk_redir);
|
||||
psock->apply_bytes = msg->apply_bytes;
|
||||
if (ret == __SK_REDIRECT) {
|
||||
if (psock->sk_redir)
|
||||
sock_put(psock->sk_redir);
|
||||
psock->sk_redir = msg->sk_redir;
|
||||
if (!psock->sk_redir) {
|
||||
ret = __SK_DROP;
|
||||
goto out;
|
||||
}
|
||||
sock_hold(psock->sk_redir);
|
||||
}
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
preempt_enable();
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
|
||||
|
||||
static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
|
||||
struct sk_buff *skb)
|
||||
{
|
||||
int ret;
|
||||
|
||||
skb->sk = psock->sk;
|
||||
bpf_compute_data_end_sk_skb(skb);
|
||||
preempt_disable();
|
||||
ret = BPF_PROG_RUN(prog, skb);
|
||||
preempt_enable();
|
||||
/* strparser clones the skb before handing it to a upper layer,
|
||||
* meaning skb_orphan has been called. We NULL sk on the way out
|
||||
* to ensure we don't trigger a BUG_ON() in skb/sk operations
|
||||
* later and because we are not charging the memory of this skb
|
||||
* to any socket yet.
|
||||
*/
|
||||
skb->sk = NULL;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
|
||||
{
|
||||
struct sk_psock_parser *parser;
|
||||
|
||||
parser = container_of(strp, struct sk_psock_parser, strp);
|
||||
return container_of(parser, struct sk_psock, parser);
|
||||
}
|
||||
|
||||
static void sk_psock_verdict_apply(struct sk_psock *psock,
|
||||
struct sk_buff *skb, int verdict)
|
||||
{
|
||||
struct sk_psock *psock_other;
|
||||
struct sock *sk_other;
|
||||
bool ingress;
|
||||
|
||||
switch (verdict) {
|
||||
case __SK_REDIRECT:
|
||||
sk_other = tcp_skb_bpf_redirect_fetch(skb);
|
||||
if (unlikely(!sk_other))
|
||||
goto out_free;
|
||||
psock_other = sk_psock(sk_other);
|
||||
if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
|
||||
!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED))
|
||||
goto out_free;
|
||||
ingress = tcp_skb_bpf_ingress(skb);
|
||||
if ((!ingress && sock_writeable(sk_other)) ||
|
||||
(ingress &&
|
||||
atomic_read(&sk_other->sk_rmem_alloc) <=
|
||||
sk_other->sk_rcvbuf)) {
|
||||
if (!ingress)
|
||||
skb_set_owner_w(skb, sk_other);
|
||||
skb_queue_tail(&psock_other->ingress_skb, skb);
|
||||
schedule_work(&psock_other->work);
|
||||
break;
|
||||
}
|
||||
/* fall-through */
|
||||
case __SK_DROP:
|
||||
/* fall-through */
|
||||
default:
|
||||
out_free:
|
||||
kfree_skb(skb);
|
||||
}
|
||||
}
|
||||
|
||||
static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
|
||||
{
|
||||
struct sk_psock *psock = sk_psock_from_strp(strp);
|
||||
struct bpf_prog *prog;
|
||||
int ret = __SK_DROP;
|
||||
|
||||
rcu_read_lock();
|
||||
prog = READ_ONCE(psock->progs.skb_verdict);
|
||||
if (likely(prog)) {
|
||||
skb_orphan(skb);
|
||||
tcp_skb_bpf_redirect_clear(skb);
|
||||
ret = sk_psock_bpf_run(psock, prog, skb);
|
||||
ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
|
||||
}
|
||||
rcu_read_unlock();
|
||||
sk_psock_verdict_apply(psock, skb, ret);
|
||||
}
|
||||
|
||||
/* strparser read_sock_done callback: passes @err through unchanged. */
static int sk_psock_strp_read_done(struct strparser *strp, int err)
{
	int status = err;

	return status;
}
|
||||
|
||||
static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
|
||||
{
|
||||
struct sk_psock *psock = sk_psock_from_strp(strp);
|
||||
struct bpf_prog *prog;
|
||||
int ret = skb->len;
|
||||
|
||||
rcu_read_lock();
|
||||
prog = READ_ONCE(psock->progs.skb_parser);
|
||||
if (likely(prog))
|
||||
ret = sk_psock_bpf_run(psock, prog, skb);
|
||||
rcu_read_unlock();
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Called with socket lock held. */
|
||||
static void sk_psock_data_ready(struct sock *sk)
|
||||
{
|
||||
struct sk_psock *psock;
|
||||
|
||||
rcu_read_lock();
|
||||
psock = sk_psock(sk);
|
||||
if (likely(psock)) {
|
||||
write_lock_bh(&sk->sk_callback_lock);
|
||||
strp_data_ready(&psock->parser.strp);
|
||||
write_unlock_bh(&sk->sk_callback_lock);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
static void sk_psock_write_space(struct sock *sk)
|
||||
{
|
||||
struct sk_psock *psock;
|
||||
void (*write_space)(struct sock *sk);
|
||||
|
||||
rcu_read_lock();
|
||||
psock = sk_psock(sk);
|
||||
if (likely(psock && sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)))
|
||||
schedule_work(&psock->work);
|
||||
write_space = psock->saved_write_space;
|
||||
rcu_read_unlock();
|
||||
write_space(sk);
|
||||
}
|
||||
|
||||
int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
|
||||
{
|
||||
static const struct strp_callbacks cb = {
|
||||
.rcv_msg = sk_psock_strp_read,
|
||||
.read_sock_done = sk_psock_strp_read_done,
|
||||
.parse_msg = sk_psock_strp_parse,
|
||||
};
|
||||
|
||||
psock->parser.enabled = false;
|
||||
return strp_init(&psock->parser.strp, sk, &cb);
|
||||
}
|
||||
|
||||
void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
|
||||
{
|
||||
struct sk_psock_parser *parser = &psock->parser;
|
||||
|
||||
if (parser->enabled)
|
||||
return;
|
||||
|
||||
parser->saved_data_ready = sk->sk_data_ready;
|
||||
sk->sk_data_ready = sk_psock_data_ready;
|
||||
sk->sk_write_space = sk_psock_write_space;
|
||||
parser->enabled = true;
|
||||
}
|
||||
|
||||
void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
|
||||
{
|
||||
struct sk_psock_parser *parser = &psock->parser;
|
||||
|
||||
if (!parser->enabled)
|
||||
return;
|
||||
|
||||
sk->sk_data_ready = parser->saved_data_ready;
|
||||
parser->saved_data_ready = NULL;
|
||||
strp_stop(&parser->strp);
|
||||
parser->enabled = false;
|
||||
}
|
@ -2238,67 +2238,6 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
|
||||
}
|
||||
EXPORT_SYMBOL(sk_page_frag_refill);
|
||||
|
||||
int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
|
||||
int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
|
||||
int first_coalesce)
|
||||
{
|
||||
int sg_curr = *sg_curr_index, use = 0, rc = 0;
|
||||
unsigned int size = *sg_curr_size;
|
||||
struct page_frag *pfrag;
|
||||
struct scatterlist *sge;
|
||||
|
||||
len -= size;
|
||||
pfrag = sk_page_frag(sk);
|
||||
|
||||
while (len > 0) {
|
||||
unsigned int orig_offset;
|
||||
|
||||
if (!sk_page_frag_refill(sk, pfrag)) {
|
||||
rc = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
use = min_t(int, len, pfrag->size - pfrag->offset);
|
||||
|
||||
if (!sk_wmem_schedule(sk, use)) {
|
||||
rc = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
sk_mem_charge(sk, use);
|
||||
size += use;
|
||||
orig_offset = pfrag->offset;
|
||||
pfrag->offset += use;
|
||||
|
||||
sge = sg + sg_curr - 1;
|
||||
if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page &&
|
||||
sge->offset + sge->length == orig_offset) {
|
||||
sge->length += use;
|
||||
} else {
|
||||
sge = sg + sg_curr;
|
||||
sg_unmark_end(sge);
|
||||
sg_set_page(sge, pfrag->page, use, orig_offset);
|
||||
get_page(pfrag->page);
|
||||
sg_curr++;
|
||||
|
||||
if (sg_curr == MAX_SKB_FRAGS)
|
||||
sg_curr = 0;
|
||||
|
||||
if (sg_curr == sg_start) {
|
||||
rc = -ENOSPC;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
len -= use;
|
||||
}
|
||||
out:
|
||||
*sg_curr_size = size;
|
||||
*sg_curr_index = sg_curr;
|
||||
return rc;
|
||||
}
|
||||
EXPORT_SYMBOL(sk_alloc_sg);
|
||||
|
||||
static void __lock_sock(struct sock *sk)
|
||||
__releases(&sk->sk_lock.slock)
|
||||
__acquires(&sk->sk_lock.slock)
|
||||
|
1002
net/core/sock_map.c
Normal file
1002
net/core/sock_map.c
Normal file
File diff suppressed because it is too large
Load Diff
@ -63,6 +63,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
|
||||
obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
|
||||
obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
|
||||
obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
|
||||
obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o
|
||||
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
|
||||
|
||||
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
|
||||
|
655
net/ipv4/tcp_bpf.c
Normal file
655
net/ipv4/tcp_bpf.c
Normal file
@ -0,0 +1,655 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
|
||||
|
||||
#include <linux/skmsg.h>
|
||||
#include <linux/filter.h>
|
||||
#include <linux/bpf.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/wait.h>
|
||||
|
||||
#include <net/inet_common.h>
|
||||
|
||||
static bool tcp_bpf_stream_read(const struct sock *sk)
|
||||
{
|
||||
struct sk_psock *psock;
|
||||
bool empty = true;
|
||||
|
||||
rcu_read_lock();
|
||||
psock = sk_psock(sk);
|
||||
if (likely(psock))
|
||||
empty = list_empty(&psock->ingress_msg);
|
||||
rcu_read_unlock();
|
||||
return !empty;
|
||||
}
|
||||
|
||||
static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
|
||||
int flags, long timeo, int *err)
|
||||
{
|
||||
DEFINE_WAIT_FUNC(wait, woken_wake_function);
|
||||
int ret;
|
||||
|
||||
add_wait_queue(sk_sleep(sk), &wait);
|
||||
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
|
||||
ret = sk_wait_event(sk, &timeo,
|
||||
!list_empty(&psock->ingress_msg) ||
|
||||
!skb_queue_empty(&sk->sk_receive_queue), &wait);
|
||||
sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
|
||||
remove_wait_queue(sk_sleep(sk), &wait);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
|
||||
struct msghdr *msg, int len)
|
||||
{
|
||||
struct iov_iter *iter = &msg->msg_iter;
|
||||
int i, ret, copied = 0;
|
||||
|
||||
while (copied != len) {
|
||||
struct scatterlist *sge;
|
||||
struct sk_msg *msg_rx;
|
||||
|
||||
msg_rx = list_first_entry_or_null(&psock->ingress_msg,
|
||||
struct sk_msg, list);
|
||||
if (unlikely(!msg_rx))
|
||||
break;
|
||||
|
||||
i = msg_rx->sg.start;
|
||||
do {
|
||||
struct page *page;
|
||||
int copy;
|
||||
|
||||
sge = sk_msg_elem(msg_rx, i);
|
||||
copy = sge->length;
|
||||
page = sg_page(sge);
|
||||
if (copied + copy > len)
|
||||
copy = len - copied;
|
||||
ret = copy_page_to_iter(page, sge->offset, copy, iter);
|
||||
if (ret != copy) {
|
||||
msg_rx->sg.start = i;
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
copied += copy;
|
||||
sge->offset += copy;
|
||||
sge->length -= copy;
|
||||
sk_mem_uncharge(sk, copy);
|
||||
if (!sge->length) {
|
||||
i++;
|
||||
if (i == MAX_SKB_FRAGS)
|
||||
i = 0;
|
||||
if (!msg_rx->skb)
|
||||
put_page(page);
|
||||
}
|
||||
|
||||
if (copied == len)
|
||||
break;
|
||||
} while (i != msg_rx->sg.end);
|
||||
|
||||
msg_rx->sg.start = i;
|
||||
if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
|
||||
list_del(&msg_rx->list);
|
||||
if (msg_rx->skb)
|
||||
consume_skb(msg_rx->skb);
|
||||
kfree(msg_rx);
|
||||
}
|
||||
}
|
||||
|
||||
return copied;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg);
|
||||
|
||||
/* recvmsg hook for sockets with a psock attached.
 *
 * MSG_ERRQUEUE requests go to inet_recv_error(). If the regular receive
 * queue already holds data, or no psock is attached, the call is handed
 * to tcp_recvmsg(). Otherwise data is taken from the psock ingress list,
 * waiting up to the socket's receive timeout when the list is empty.
 *
 * Returns the number of bytes copied, a negative error, or whatever
 * tcp_recvmsg()/inet_recv_error() return on the delegated paths. When the
 * wait ends without any data the result is -EAGAIN rather than 0, because
 * a 0 return from recvmsg on a stream socket means end-of-stream to user
 * space; 0 is still returned once the receive side has been shut down.
 */
int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
		    int nonblock, int flags, int *addr_len)
{
	struct sk_psock *psock;
	int copied, ret;

	if (unlikely(flags & MSG_ERRQUEUE))
		return inet_recv_error(sk, msg, len, addr_len);
	if (!skb_queue_empty(&sk->sk_receive_queue))
		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);

	psock = sk_psock_get(sk);
	if (unlikely(!psock))
		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
	lock_sock(sk);
msg_bytes_ready:
	copied = __tcp_bpf_recvmsg(sk, psock, msg, len);
	if (!copied) {
		int data, err = 0;
		long timeo;

		timeo = sock_rcvtimeo(sk, nonblock);
		data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err);
		if (data) {
			/* Data on the ingress list only: retry the copy. */
			if (skb_queue_empty(&sk->sk_receive_queue))
				goto msg_bytes_ready;
			/* Data on the receive queue: use the regular path. */
			release_sock(sk);
			sk_psock_put(sk, psock);
			return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
		}
		if (err) {
			ret = err;
			goto out;
		}
		/* Nothing arrived: not EOF unless the receive side is shut
		 * down, so tell the caller to try again.
		 */
		if (!(sk->sk_shutdown & RCV_SHUTDOWN))
			copied = -EAGAIN;
	}
	ret = copied;
out:
	release_sock(sk);
	sk_psock_put(sk, psock);
	return ret;
}
|
||||
|
||||
/* Transfer scatterlist elements of @msg onto @sk's psock ingress queue.
 *
 * A non-zero @apply_bytes caps the number of bytes moved; zero means the
 * elements up to msg->sg.end are moved. The data is charged to @sk, queued
 * via sk_psock_queue_msg() and the socket's sk_data_ready() is invoked.
 * Takes and releases the socket lock itself. @flags is not used here.
 *
 * Returns 0 on success (possibly a partial transfer when memory
 * scheduling fails after some bytes were moved), or -ENOMEM when the
 * container allocation fails or nothing could be charged.
 *
 * NOTE(review): on success msg->sg.size is reduced by what is left of
 * @apply_bytes after the loop, not by the number of bytes moved - confirm
 * against sk_msg_xfer()'s accounting.
 */
static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
			   struct sk_msg *msg, u32 apply_bytes, int flags)
{
	bool limited = apply_bytes;
	struct scatterlist *elem;
	u32 chunk, moved = 0;
	struct sk_msg *ingress;
	int idx, err = 0;

	ingress = kzalloc(sizeof(*ingress), __GFP_NOWARN | GFP_KERNEL);
	if (unlikely(!ingress))
		return -ENOMEM;

	lock_sock(sk);
	ingress->sg.start = msg->sg.start;
	idx = msg->sg.start;
	do {
		elem = sk_msg_elem(msg, idx);
		if (limited && apply_bytes < elem->length)
			chunk = apply_bytes;
		else
			chunk = elem->length;
		if (!sk_wmem_schedule(sk, chunk)) {
			/* Only an error if not a single byte was moved. */
			if (!moved)
				err = -ENOMEM;
			break;
		}

		sk_mem_charge(sk, chunk);
		sk_msg_xfer(ingress, msg, idx, chunk);
		moved += chunk;
		/* Source element still holds data: take an extra page ref
		 * for the element now owned by the ingress message.
		 */
		if (elem->length)
			get_page(sk_msg_page(ingress, idx));
		sk_msg_iter_var_next(idx);
		ingress->sg.end = idx;
		if (limited) {
			apply_bytes -= chunk;
			if (!apply_bytes)
				break;
		}
	} while (idx != msg->sg.end);

	if (err) {
		sk_msg_free(sk, ingress);
		kfree(ingress);
	} else {
		msg->sg.start = idx;
		msg->sg.size -= apply_bytes;
		sk_psock_queue_msg(psock, ingress);
		sk->sk_data_ready(sk);
	}

	release_sock(sk);
	return err;
}
|
||||
|
||||
/* Send the scatterlist elements of @msg, starting at msg->sg.start, with
 * do_tcp_sendpages().
 *
 * A non-zero @apply_bytes caps the number of bytes sent; zero sends until
 * msg->sg.start reaches msg->sg.end. Element offset/length and
 * msg->sg.size are advanced as data is accepted; with @uncharge set the
 * sent bytes are uncharged from @sk. Fully sent elements drop their page
 * reference and are re-initialised.
 *
 * Returns 0 when done, otherwise the non-positive value returned by
 * do_tcp_sendpages().
 */
static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes,
			int flags, bool uncharge)
{
	bool limited = apply_bytes;
	struct scatterlist *elem;
	struct page *page;
	int left, sent = 0;
	u32 offset;

	for (;;) {
		elem = sk_msg_elem(msg, msg->sg.start);
		left = (limited && apply_bytes < elem->length) ?
			apply_bytes : elem->length;
		offset = elem->offset;
		page = sg_page(elem);

		tcp_rate_check_app_limited(sk);
retry:
		sent = do_tcp_sendpages(sk, page, offset, left, flags);
		if (sent <= 0)
			return sent;
		if (limited)
			apply_bytes -= sent;
		msg->sg.size -= sent;
		elem->offset += sent;
		elem->length -= sent;
		if (uncharge)
			sk_mem_uncharge(sk, sent);
		/* Short send: push the remainder of this chunk. */
		if (sent != left) {
			left -= sent;
			offset += sent;
			goto retry;
		}
		if (!elem->length) {
			put_page(page);
			sk_msg_iter_next(msg, start);
			sg_init_table(elem, 1);
			if (msg->sg.start == msg->sg.end)
				break;
		}
		if (limited && !apply_bytes)
			break;
	}

	return 0;
}
|
||||
|
||||
static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg,
|
||||
u32 apply_bytes, int flags, bool uncharge)
|
||||
{
|
||||
int ret;
|
||||
|
||||
lock_sock(sk);
|
||||
ret = tcp_bpf_push(sk, msg, apply_bytes, flags, uncharge);
|
||||
release_sock(sk);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
|
||||
u32 bytes, int flags)
|
||||
{
|
||||
bool ingress = sk_msg_to_ingress(msg);
|
||||
struct sk_psock *psock = sk_psock_get(sk);
|
||||
int ret;
|
||||
|
||||
if (unlikely(!psock)) {
|
||||
sk_msg_free(sk, msg);
|
||||
return 0;
|
||||
}
|
||||
ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes, flags) :
|
||||
tcp_bpf_push_locked(sk, msg, bytes, flags, false);
|
||||
sk_psock_put(sk, psock);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
|
||||
|
||||
static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
|
||||
struct sk_msg *msg, int *copied, int flags)
|
||||
{
|
||||
bool cork = false, enospc = msg->sg.start == msg->sg.end;
|
||||
struct sock *sk_redir;
|
||||
u32 tosend;
|
||||
int ret;
|
||||
|
||||
more_data:
|
||||
if (psock->eval == __SK_NONE)
|
||||
psock->eval = sk_psock_msg_verdict(sk, psock, msg);
|
||||
|
||||
if (msg->cork_bytes &&
|
||||
msg->cork_bytes > msg->sg.size && !enospc) {
|
||||
psock->cork_bytes = msg->cork_bytes - msg->sg.size;
|
||||
if (!psock->cork) {
|
||||
psock->cork = kzalloc(sizeof(*psock->cork),
|
||||
GFP_ATOMIC | __GFP_NOWARN);
|
||||
if (!psock->cork)
|
||||
return -ENOMEM;
|
||||
}
|
||||
memcpy(psock->cork, msg, sizeof(*msg));
|
||||
return 0;
|
||||
}
|
||||
|
||||
tosend = msg->sg.size;
|
||||
if (psock->apply_bytes && psock->apply_bytes < tosend)
|
||||
tosend = psock->apply_bytes;
|
||||
|
||||
switch (psock->eval) {
|
||||
case __SK_PASS:
|
||||
ret = tcp_bpf_push(sk, msg, tosend, flags, true);
|
||||
if (unlikely(ret)) {
|
||||
*copied -= sk_msg_free(sk, msg);
|
||||
break;
|
||||
}
|
||||
sk_msg_apply_bytes(psock, tosend);
|
||||
break;
|
||||
case __SK_REDIRECT:
|
||||
sk_redir = psock->sk_redir;
|
||||
sk_msg_apply_bytes(psock, tosend);
|
||||
if (psock->cork) {
|
||||
cork = true;
|
||||
psock->cork = NULL;
|
||||
}
|
||||
sk_msg_return(sk, msg, tosend);
|
||||
release_sock(sk);
|
||||
ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags);
|
||||
lock_sock(sk);
|
||||
if (unlikely(ret < 0)) {
|
||||
int free = sk_msg_free_nocharge(sk, msg);
|
||||
|
||||
if (!cork)
|
||||
*copied -= free;
|
||||
}
|
||||
if (cork) {
|
||||
sk_msg_free(sk, msg);
|
||||
kfree(msg);
|
||||
msg = NULL;
|
||||
ret = 0;
|
||||
}
|
||||
break;
|
||||
case __SK_DROP:
|
||||
default:
|
||||
sk_msg_free_partial(sk, msg, tosend);
|
||||
sk_msg_apply_bytes(psock, tosend);
|
||||
*copied -= tosend;
|
||||
return -EACCES;
|
||||
}
|
||||
|
||||
if (likely(!ret)) {
|
||||
if (!psock->apply_bytes) {
|
||||
psock->eval = __SK_NONE;
|
||||
if (psock->sk_redir) {
|
||||
sock_put(psock->sk_redir);
|
||||
psock->sk_redir = NULL;
|
||||
}
|
||||
}
|
||||
if (msg &&
|
||||
msg->sg.data[msg->sg.start].page_link &&
|
||||
msg->sg.data[msg->sg.start].length)
|
||||
goto more_data;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* sendmsg hook used when a msg program is attached.
 *
 * Without a psock this falls back to tcp_sendmsg(). Otherwise user data
 * is copied into an sk_msg (the pending cork message if there is one,
 * else an on-stack message) and passed to tcp_bpf_send_verdict(), looping
 * until the iterator is drained. Waits for send buffer space or memory
 * as needed, bounded by the socket send timeout.
 *
 * Returns the number of bytes accounted as copied when non-zero,
 * otherwise the (possibly sk_stream_error()-translated) error code.
 */
static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
	struct sk_msg local_msg, *msg_tx = NULL;
	int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS;
	int copied = 0, err = 0;
	struct sk_psock *psock;
	long timeo;

	psock = sk_psock_get(sk);
	if (unlikely(!psock))
		return tcp_sendmsg(sk, msg, size);

	lock_sock(sk);
	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
	while (msg_data_left(msg)) {
		bool enospc = false;
		u32 chunk, old_size;

		if (sk->sk_err) {
			err = -sk->sk_err;
			goto out_err;
		}

		chunk = msg_data_left(msg);
		if (!sk_stream_memory_free(sk))
			goto wait_for_sndbuf;
		if (psock->cork) {
			msg_tx = psock->cork;
		} else {
			msg_tx = &local_msg;
			sk_msg_init(msg_tx);
		}

		old_size = msg_tx->sg.size;
		err = sk_msg_alloc(sk, msg_tx, msg_tx->sg.size + chunk, msg_tx->sg.end - 1);
		if (err) {
			if (err != -ENOSPC)
				goto wait_for_memory;
			/* Ring is full: send what could be allocated. */
			enospc = true;
			chunk = msg_tx->sg.size - old_size;
		}

		err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx,
					       chunk);
		if (err < 0) {
			sk_msg_trim(sk, msg_tx, old_size);
			goto out_err;
		}

		copied += chunk;
		if (psock->cork_bytes) {
			if (size > psock->cork_bytes)
				psock->cork_bytes = 0;
			else
				psock->cork_bytes -= size;
			if (psock->cork_bytes && !enospc)
				goto out_err;
			/* All cork bytes are accounted, rerun the prog. */
			psock->eval = __SK_NONE;
			psock->cork_bytes = 0;
		}

		err = tcp_bpf_send_verdict(sk, psock, msg_tx, &copied, flags);
		if (unlikely(err < 0))
			goto out_err;
		continue;
wait_for_sndbuf:
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
		err = sk_stream_wait_memory(sk, &timeo);
		if (err) {
			/* The cork message stays owned by the psock. */
			if (msg_tx && msg_tx != psock->cork)
				sk_msg_free(sk, msg_tx);
			goto out_err;
		}
	}
out_err:
	if (err < 0)
		err = sk_stream_error(sk, msg->msg_flags, err);
	release_sock(sk);
	sk_psock_put(sk, psock);
	return copied ? copied : err;
}
|
||||
|
||||
static int tcp_bpf_sendpage(struct sock *sk, struct page *page, int offset,
|
||||
size_t size, int flags)
|
||||
{
|
||||
struct sk_msg tmp, *msg = NULL;
|
||||
int err = 0, copied = 0;
|
||||
struct sk_psock *psock;
|
||||
bool enospc = false;
|
||||
|
||||
psock = sk_psock_get(sk);
|
||||
if (unlikely(!psock))
|
||||
return tcp_sendpage(sk, page, offset, size, flags);
|
||||
|
||||
lock_sock(sk);
|
||||
if (psock->cork) {
|
||||
msg = psock->cork;
|
||||
} else {
|
||||
msg = &tmp;
|
||||
sk_msg_init(msg);
|
||||
}
|
||||
|
||||
/* Catch case where ring is full and sendpage is stalled. */
|
||||
if (unlikely(sk_msg_full(msg)))
|
||||
goto out_err;
|
||||
|
||||
sk_msg_page_add(msg, page, size, offset);
|
||||
sk_mem_charge(sk, size);
|
||||
copied = size;
|
||||
if (sk_msg_full(msg))
|
||||
enospc = true;
|
||||
if (psock->cork_bytes) {
|
||||
if (size > psock->cork_bytes)
|
||||
psock->cork_bytes = 0;
|
||||
else
|
||||
psock->cork_bytes -= size;
|
||||
if (psock->cork_bytes && !enospc)
|
||||
goto out_err;
|
||||
/* All cork bytes are accounted, rerun the prog. */
|
||||
psock->eval = __SK_NONE;
|
||||
psock->cork_bytes = 0;
|
||||
}
|
||||
|
||||
err = tcp_bpf_send_verdict(sk, psock, msg, &copied, flags);
|
||||
out_err:
|
||||
release_sock(sk);
|
||||
sk_psock_put(sk, psock);
|
||||
return copied ? copied : err;
|
||||
}
|
||||
|
||||
/* Tear down psock state for @sk: free the cork message, purge queued
 * ingress messages, then pop, unlink and free every map link.
 */
static void tcp_bpf_remove(struct sock *sk, struct sk_psock *psock)
{
	struct sk_psock_link *link;

	sk_psock_cork_free(psock);
	__sk_psock_purge_ingress_msg(psock);

	for (link = sk_psock_link_pop(psock); link;
	     link = sk_psock_link_pop(psock)) {
		sk_psock_unlink(sk, link);
		sk_psock_free_link(link);
	}
}
|
||||
|
||||
static void tcp_bpf_unhash(struct sock *sk)
|
||||
{
|
||||
void (*saved_unhash)(struct sock *sk);
|
||||
struct sk_psock *psock;
|
||||
|
||||
rcu_read_lock();
|
||||
psock = sk_psock(sk);
|
||||
if (unlikely(!psock)) {
|
||||
rcu_read_unlock();
|
||||
if (sk->sk_prot->unhash)
|
||||
sk->sk_prot->unhash(sk);
|
||||
return;
|
||||
}
|
||||
|
||||
saved_unhash = psock->saved_unhash;
|
||||
tcp_bpf_remove(sk, psock);
|
||||
rcu_read_unlock();
|
||||
saved_unhash(sk);
|
||||
}
|
||||
|
||||
static void tcp_bpf_close(struct sock *sk, long timeout)
|
||||
{
|
||||
void (*saved_close)(struct sock *sk, long timeout);
|
||||
struct sk_psock *psock;
|
||||
|
||||
lock_sock(sk);
|
||||
rcu_read_lock();
|
||||
psock = sk_psock(sk);
|
||||
if (unlikely(!psock)) {
|
||||
rcu_read_unlock();
|
||||
release_sock(sk);
|
||||
return sk->sk_prot->close(sk, timeout);
|
||||
}
|
||||
|
||||
saved_close = psock->saved_close;
|
||||
tcp_bpf_remove(sk, psock);
|
||||
rcu_read_unlock();
|
||||
release_sock(sk);
|
||||
saved_close(sk, timeout);
|
||||
}
|
||||
|
||||
enum {
|
||||
TCP_BPF_IPV4,
|
||||
TCP_BPF_IPV6,
|
||||
TCP_BPF_NUM_PROTS,
|
||||
};
|
||||
|
||||
enum {
|
||||
TCP_BPF_BASE,
|
||||
TCP_BPF_TX,
|
||||
TCP_BPF_NUM_CFGS,
|
||||
};
|
||||
|
||||
static struct proto *tcpv6_prot_saved __read_mostly;
|
||||
static DEFINE_SPINLOCK(tcpv6_prot_lock);
|
||||
static struct proto tcp_bpf_prots[TCP_BPF_NUM_PROTS][TCP_BPF_NUM_CFGS];
|
||||
|
||||
static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
|
||||
struct proto *base)
|
||||
{
|
||||
prot[TCP_BPF_BASE] = *base;
|
||||
prot[TCP_BPF_BASE].unhash = tcp_bpf_unhash;
|
||||
prot[TCP_BPF_BASE].close = tcp_bpf_close;
|
||||
prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg;
|
||||
prot[TCP_BPF_BASE].stream_memory_read = tcp_bpf_stream_read;
|
||||
|
||||
prot[TCP_BPF_TX] = prot[TCP_BPF_BASE];
|
||||
prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg;
|
||||
prot[TCP_BPF_TX].sendpage = tcp_bpf_sendpage;
|
||||
}
|
||||
|
||||
static void tcp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops)
|
||||
{
|
||||
if (sk->sk_family == AF_INET6 &&
|
||||
unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) {
|
||||
spin_lock_bh(&tcpv6_prot_lock);
|
||||
if (likely(ops != tcpv6_prot_saved)) {
|
||||
tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops);
|
||||
smp_store_release(&tcpv6_prot_saved, ops);
|
||||
}
|
||||
spin_unlock_bh(&tcpv6_prot_lock);
|
||||
}
|
||||
}
|
||||
|
||||
/* Build the IPv4 proto table from tcp_prot once at core_initcall time. */
static int __init tcp_bpf_v4_build_proto(void)
{
	struct proto *v4_prots = tcp_bpf_prots[TCP_BPF_IPV4];

	tcp_bpf_rebuild_protos(v4_prots, &tcp_prot);
	return 0;
}
core_initcall(tcp_bpf_v4_build_proto);
|
||||
|
||||
static void tcp_bpf_update_sk_prot(struct sock *sk, struct sk_psock *psock)
|
||||
{
|
||||
int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
|
||||
int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
|
||||
|
||||
sk_psock_update_proto(sk, psock, &tcp_bpf_prots[family][config]);
|
||||
}
|
||||
|
||||
static void tcp_bpf_reinit_sk_prot(struct sock *sk, struct sk_psock *psock)
|
||||
{
|
||||
int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
|
||||
int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
|
||||
|
||||
/* Reinit occurs when program types change e.g. TCP_BPF_TX is removed
|
||||
* or added requiring sk_prot hook updates. We keep original saved
|
||||
* hooks in this case.
|
||||
*/
|
||||
sk->sk_prot = &tcp_bpf_prots[family][config];
|
||||
}
|
||||
|
||||
static int tcp_bpf_assert_proto_ops(struct proto *ops)
|
||||
{
|
||||
/* In order to avoid retpoline, we make assumptions when we call
|
||||
* into ops if e.g. a psock is not present. Make sure they are
|
||||
* indeed valid assumptions.
|
||||
*/
|
||||
return ops->recvmsg == tcp_recvmsg &&
|
||||
ops->sendmsg == tcp_sendmsg &&
|
||||
ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP;
|
||||
}
|
||||
|
||||
/* Re-select sk->sk_prot for @sk after its attached programs changed.
 * The caller must own the socket lock.
 *
 * NOTE(review): the psock returned by sk_psock() is used without a NULL
 * check - presumably callers only invoke this on sockets that already
 * have a psock; confirm against the call sites.
 */
void tcp_bpf_reinit(struct sock *sk)
{
	struct sk_psock *psock;

	sock_owned_by_me(sk);

	rcu_read_lock();
	psock = sk_psock(sk);
	tcp_bpf_reinit_sk_prot(sk, psock);
	rcu_read_unlock();
}
|
||||
|
||||
int tcp_bpf_init(struct sock *sk)
|
||||
{
|
||||
struct proto *ops = READ_ONCE(sk->sk_prot);
|
||||
struct sk_psock *psock;
|
||||
|
||||
sock_owned_by_me(sk);
|
||||
|
||||
rcu_read_lock();
|
||||
psock = sk_psock(sk);
|
||||
if (unlikely(!psock || psock->sk_proto ||
|
||||
tcp_bpf_assert_proto_ops(ops))) {
|
||||
rcu_read_unlock();
|
||||
return -EINVAL;
|
||||
}
|
||||
tcp_bpf_check_v6_needs_rebuild(sk, ops);
|
||||
tcp_bpf_update_sk_prot(sk, psock);
|
||||
rcu_read_unlock();
|
||||
return 0;
|
||||
}
|
@ -6,7 +6,7 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include<linux/module.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/list.h>
|
||||
@ -29,18 +29,6 @@ static struct tcp_ulp_ops *tcp_ulp_find(const char *name)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct tcp_ulp_ops *tcp_ulp_find_id(const int ulp)
|
||||
{
|
||||
struct tcp_ulp_ops *e;
|
||||
|
||||
list_for_each_entry_rcu(e, &tcp_ulp_list, list) {
|
||||
if (e->uid == ulp)
|
||||
return e;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
|
||||
{
|
||||
const struct tcp_ulp_ops *ulp = NULL;
|
||||
@ -63,18 +51,6 @@ static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
|
||||
return ulp;
|
||||
}
|
||||
|
||||
static const struct tcp_ulp_ops *__tcp_ulp_lookup(const int uid)
|
||||
{
|
||||
const struct tcp_ulp_ops *ulp;
|
||||
|
||||
rcu_read_lock();
|
||||
ulp = tcp_ulp_find_id(uid);
|
||||
if (!ulp || !try_module_get(ulp->owner))
|
||||
ulp = NULL;
|
||||
rcu_read_unlock();
|
||||
return ulp;
|
||||
}
|
||||
|
||||
/* Attach new upper layer protocol to the list
|
||||
* of available protocols.
|
||||
*/
|
||||
@ -123,6 +99,8 @@ void tcp_cleanup_ulp(struct sock *sk)
|
||||
{
|
||||
struct inet_connection_sock *icsk = inet_csk(sk);
|
||||
|
||||
sock_owned_by_me(sk);
|
||||
|
||||
if (!icsk->icsk_ulp_ops)
|
||||
return;
|
||||
|
||||
@ -133,54 +111,35 @@ void tcp_cleanup_ulp(struct sock *sk)
|
||||
icsk->icsk_ulp_ops = NULL;
|
||||
}
|
||||
|
||||
/* Change upper layer protocol for socket */
|
||||
int tcp_set_ulp(struct sock *sk, const char *name)
|
||||
static int __tcp_set_ulp(struct sock *sk, const struct tcp_ulp_ops *ulp_ops)
|
||||
{
|
||||
struct inet_connection_sock *icsk = inet_csk(sk);
|
||||
const struct tcp_ulp_ops *ulp_ops;
|
||||
int err = 0;
|
||||
int err;
|
||||
|
||||
err = -EEXIST;
|
||||
if (icsk->icsk_ulp_ops)
|
||||
return -EEXIST;
|
||||
goto out_err;
|
||||
|
||||
err = ulp_ops->init(sk);
|
||||
if (err)
|
||||
goto out_err;
|
||||
|
||||
icsk->icsk_ulp_ops = ulp_ops;
|
||||
return 0;
|
||||
out_err:
|
||||
module_put(ulp_ops->owner);
|
||||
return err;
|
||||
}
|
||||
|
||||
int tcp_set_ulp(struct sock *sk, const char *name)
|
||||
{
|
||||
const struct tcp_ulp_ops *ulp_ops;
|
||||
|
||||
sock_owned_by_me(sk);
|
||||
|
||||
ulp_ops = __tcp_ulp_find_autoload(name);
|
||||
if (!ulp_ops)
|
||||
return -ENOENT;
|
||||
|
||||
if (!ulp_ops->user_visible) {
|
||||
module_put(ulp_ops->owner);
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
err = ulp_ops->init(sk);
|
||||
if (err) {
|
||||
module_put(ulp_ops->owner);
|
||||
return err;
|
||||
}
|
||||
|
||||
icsk->icsk_ulp_ops = ulp_ops;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int tcp_set_ulp_id(struct sock *sk, int ulp)
|
||||
{
|
||||
struct inet_connection_sock *icsk = inet_csk(sk);
|
||||
const struct tcp_ulp_ops *ulp_ops;
|
||||
int err;
|
||||
|
||||
if (icsk->icsk_ulp_ops)
|
||||
return -EEXIST;
|
||||
|
||||
ulp_ops = __tcp_ulp_lookup(ulp);
|
||||
if (!ulp_ops)
|
||||
return -ENOENT;
|
||||
|
||||
err = ulp_ops->init(sk);
|
||||
if (err) {
|
||||
module_put(ulp_ops->owner);
|
||||
return err;
|
||||
}
|
||||
|
||||
icsk->icsk_ulp_ops = ulp_ops;
|
||||
return 0;
|
||||
return __tcp_set_ulp(sk, ulp_ops);
|
||||
}
|
||||
|
@ -1,4 +1,2 @@
|
||||
|
||||
config STREAM_PARSER
|
||||
tristate
|
||||
default n
|
||||
def_bool n
|
||||
|
@ -8,6 +8,7 @@ config TLS
|
||||
select CRYPTO_AES
|
||||
select CRYPTO_GCM
|
||||
select STREAM_PARSER
|
||||
select NET_SOCK_MSG
|
||||
default n
|
||||
---help---
|
||||
Enable kernel support for TLS protocol. This allows symmetric
|
||||
|
@ -421,7 +421,7 @@ static int tls_push_data(struct sock *sk,
|
||||
tls_push_record_flags = flags;
|
||||
if (more) {
|
||||
tls_ctx->pending_open_record_frags =
|
||||
record->num_frags;
|
||||
!!record->num_frags;
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -620,12 +620,14 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
|
||||
prot[TLS_SW][TLS_BASE].sendpage = tls_sw_sendpage;
|
||||
|
||||
prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE];
|
||||
prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg;
|
||||
prot[TLS_BASE][TLS_SW].close = tls_sk_proto_close;
|
||||
prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg;
|
||||
prot[TLS_BASE][TLS_SW].stream_memory_read = tls_sw_stream_read;
|
||||
prot[TLS_BASE][TLS_SW].close = tls_sk_proto_close;
|
||||
|
||||
prot[TLS_SW][TLS_SW] = prot[TLS_SW][TLS_BASE];
|
||||
prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg;
|
||||
prot[TLS_SW][TLS_SW].close = tls_sk_proto_close;
|
||||
prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg;
|
||||
prot[TLS_SW][TLS_SW].stream_memory_read = tls_sw_stream_read;
|
||||
prot[TLS_SW][TLS_SW].close = tls_sk_proto_close;
|
||||
|
||||
#ifdef CONFIG_TLS_DEVICE
|
||||
prot[TLS_HW][TLS_BASE] = prot[TLS_BASE][TLS_BASE];
|
||||
@ -724,7 +726,6 @@ static int __init tls_register(void)
|
||||
build_protos(tls_prots[TLSV4], &tcp_prot);
|
||||
|
||||
tls_sw_proto_ops = inet_stream_ops;
|
||||
tls_sw_proto_ops.poll = tls_sw_poll;
|
||||
tls_sw_proto_ops.splice_read = tls_sw_splice_read;
|
||||
|
||||
#ifdef CONFIG_TLS_DEVICE
|
||||
|
912
net/tls/tls_sw.c
912
net/tls/tls_sw.c
File diff suppressed because it is too large
Load Diff
@ -71,6 +71,7 @@ int txmsg_start;
|
||||
int txmsg_end;
|
||||
int txmsg_ingress;
|
||||
int txmsg_skb;
|
||||
int ktls;
|
||||
|
||||
static const struct option long_options[] = {
|
||||
{"help", no_argument, NULL, 'h' },
|
||||
@ -92,6 +93,7 @@ static const struct option long_options[] = {
|
||||
{"txmsg_end", required_argument, NULL, 'e'},
|
||||
{"txmsg_ingress", no_argument, &txmsg_ingress, 1 },
|
||||
{"txmsg_skb", no_argument, &txmsg_skb, 1 },
|
||||
{"ktls", no_argument, &ktls, 1 },
|
||||
{0, 0, NULL, 0 }
|
||||
};
|
||||
|
||||
@ -112,6 +114,76 @@ static void usage(char *argv[])
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
#define TCP_ULP 31
|
||||
#define TLS_TX 1
|
||||
#define TLS_RX 2
|
||||
#include <linux/tls.h>
|
||||
|
||||
char *sock_to_string(int s)
|
||||
{
|
||||
if (s == c1)
|
||||
return "client1";
|
||||
else if (s == c2)
|
||||
return "client2";
|
||||
else if (s == s1)
|
||||
return "server1";
|
||||
else if (s == s2)
|
||||
return "server2";
|
||||
else if (s == p1)
|
||||
return "peer1";
|
||||
else if (s == p2)
|
||||
return "peer2";
|
||||
else
|
||||
return "unknown";
|
||||
}
|
||||
|
||||
static int sockmap_init_ktls(int verbose, int s)
|
||||
{
|
||||
struct tls12_crypto_info_aes_gcm_128 tls_tx = {
|
||||
.info = {
|
||||
.version = TLS_1_2_VERSION,
|
||||
.cipher_type = TLS_CIPHER_AES_GCM_128,
|
||||
},
|
||||
};
|
||||
struct tls12_crypto_info_aes_gcm_128 tls_rx = {
|
||||
.info = {
|
||||
.version = TLS_1_2_VERSION,
|
||||
.cipher_type = TLS_CIPHER_AES_GCM_128,
|
||||
},
|
||||
};
|
||||
int so_buf = 6553500;
|
||||
int err;
|
||||
|
||||
err = setsockopt(s, 6, TCP_ULP, "tls", sizeof("tls"));
|
||||
if (err) {
|
||||
fprintf(stderr, "setsockopt: TCP_ULP(%s) failed with error %i\n", sock_to_string(s), err);
|
||||
return -EINVAL;
|
||||
}
|
||||
err = setsockopt(s, SOL_TLS, TLS_TX, (void *)&tls_tx, sizeof(tls_tx));
|
||||
if (err) {
|
||||
fprintf(stderr, "setsockopt: TLS_TX(%s) failed with error %i\n", sock_to_string(s), err);
|
||||
return -EINVAL;
|
||||
}
|
||||
err = setsockopt(s, SOL_TLS, TLS_RX, (void *)&tls_rx, sizeof(tls_rx));
|
||||
if (err) {
|
||||
fprintf(stderr, "setsockopt: TLS_RX(%s) failed with error %i\n", sock_to_string(s), err);
|
||||
return -EINVAL;
|
||||
}
|
||||
err = setsockopt(s, SOL_SOCKET, SO_SNDBUF, &so_buf, sizeof(so_buf));
|
||||
if (err) {
|
||||
fprintf(stderr, "setsockopt: (%s) failed sndbuf with error %i\n", sock_to_string(s), err);
|
||||
return -EINVAL;
|
||||
}
|
||||
err = setsockopt(s, SOL_SOCKET, SO_RCVBUF, &so_buf, sizeof(so_buf));
|
||||
if (err) {
|
||||
fprintf(stderr, "setsockopt: (%s) failed rcvbuf with error %i\n", sock_to_string(s), err);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (verbose)
|
||||
fprintf(stdout, "socket(%s) kTLS enabled\n", sock_to_string(s));
|
||||
return 0;
|
||||
}
|
||||
static int sockmap_init_sockets(int verbose)
|
||||
{
|
||||
int i, err, one = 1;
|
||||
@ -456,6 +528,21 @@ static int sendmsg_test(struct sockmap_options *opt)
|
||||
else
|
||||
rx_fd = p2;
|
||||
|
||||
if (ktls) {
|
||||
/* Redirecting into non-TLS socket which sends into a TLS
|
||||
* socket is not a valid test. So in this case lets not
|
||||
* enable kTLS but still run the test.
|
||||
*/
|
||||
if (!txmsg_redir || (txmsg_redir && txmsg_ingress)) {
|
||||
err = sockmap_init_ktls(opt->verbose, rx_fd);
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
err = sockmap_init_ktls(opt->verbose, c1);
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
|
||||
rxpid = fork();
|
||||
if (rxpid == 0) {
|
||||
if (opt->drop_expected)
|
||||
@ -907,6 +994,8 @@ static void test_options(char *options)
|
||||
strncat(options, "ingress,", OPTSTRING);
|
||||
if (txmsg_skb)
|
||||
strncat(options, "skb,", OPTSTRING);
|
||||
if (ktls)
|
||||
strncat(options, "ktls,", OPTSTRING);
|
||||
}
|
||||
|
||||
static int __test_exec(int cgrp, int test, struct sockmap_options *opt)
|
||||
|
Loading…
Reference in New Issue
Block a user