Merge branch 'bpf-sockmap-fixes'

John Fastabend says:

====================
When I added the test_sockmap to selftests I mistakenly changed the
test logic a bit. The result of this was on redirect cases we ended up
choosing the wrong sock from the BPF program and ended up sending to a
socket that had no receive handler. The result was the actual receive
handler, running on a different socket, is timing out and closing the
socket. This results in errors (-EPIPE to be specific) on the sending
side. Typically happening if the sender does not complete the send
before the receive side times out. So depending on timing and the size
of the send we may get errors. This exposed some bugs in the sockmap
error path handling.

This series fixes the errors. The primary issue is we did not do proper
memory accounting in these cases which resulted in missing a
sk_mem_uncharge(). This happened in the redirect path and in one case
on the normal send path. See the three patches for the details.

The other take-away from this is we need to fix the test_sockmap and
also add more negative test cases. That will happen in bpf-next.

Finally, I tested this using the existing test_sockmap program, the
older sockmap sample test script, and a few real use cases with
Cilium. All of these seem to be in working correctly.

v2: fix compiler warning, drop iterator variable 'i' that is no longer
    used in patch 3.
====================

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
This commit is contained in:
Alexei Starovoitov 2018-05-02 15:30:45 -07:00
commit b5b6ff7302

View File

@ -326,6 +326,9 @@ static int bpf_tcp_push(struct sock *sk, int apply_bytes,
if (ret > 0) {
if (apply)
apply_bytes -= ret;
sg->offset += ret;
sg->length -= ret;
size -= ret;
offset += ret;
if (uncharge)
@ -333,8 +336,6 @@ static int bpf_tcp_push(struct sock *sk, int apply_bytes,
goto retry;
}
sg->length = size;
sg->offset = offset;
return ret;
}
@ -392,7 +393,8 @@ static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
} while (i != md->sg_end);
}
static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
static void free_bytes_sg(struct sock *sk, int bytes,
struct sk_msg_buff *md, bool charge)
{
struct scatterlist *sg = md->sg_data;
int i = md->sg_start, free;
@ -402,11 +404,13 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
if (bytes < free) {
sg[i].length -= bytes;
sg[i].offset += bytes;
sk_mem_uncharge(sk, bytes);
if (charge)
sk_mem_uncharge(sk, bytes);
break;
}
sk_mem_uncharge(sk, sg[i].length);
if (charge)
sk_mem_uncharge(sk, sg[i].length);
put_page(sg_page(&sg[i]));
bytes -= sg[i].length;
sg[i].length = 0;
@ -417,6 +421,7 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
if (i == MAX_SKB_FRAGS)
i = 0;
}
md->sg_start = i;
}
static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
@ -575,10 +580,10 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
struct sk_msg_buff *md,
int flags)
{
bool ingress = !!(md->flags & BPF_F_INGRESS);
struct smap_psock *psock;
struct scatterlist *sg;
int i, err, free = 0;
bool ingress = !!(md->flags & BPF_F_INGRESS);
int err = 0;
sg = md->sg_data;
@ -606,16 +611,8 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
out_rcu:
rcu_read_unlock();
out:
i = md->sg_start;
while (sg[i].length) {
free += sg[i].length;
put_page(sg_page(&sg[i]));
sg[i].length = 0;
i++;
if (i == MAX_SKB_FRAGS)
i = 0;
}
return free;
free_bytes_sg(NULL, send, md, false);
return err;
}
static inline void bpf_md_init(struct smap_psock *psock)
@ -700,19 +697,26 @@ static int bpf_exec_tx_verdict(struct smap_psock *psock,
err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags);
lock_sock(sk);
if (unlikely(err < 0)) {
free_start_sg(sk, m);
psock->sg_size = 0;
if (!cork)
*copied -= send;
} else {
psock->sg_size -= send;
}
if (cork) {
free_start_sg(sk, m);
psock->sg_size = 0;
kfree(m);
m = NULL;
err = 0;
}
if (unlikely(err))
*copied -= err;
else
psock->sg_size -= send;
break;
case __SK_DROP:
default:
free_bytes_sg(sk, send, m);
free_bytes_sg(sk, send, m, true);
apply_bytes_dec(psock, send);
*copied -= send;
psock->sg_size -= send;