mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-11-25 10:30:54 +07:00
6d0060f600
Per-packet metadata required to write the MPTCP DSS option is written to the skb_ext area. One write to the socket may contain more than one packet of data, which is copied to page fragments and mapped in to MPTCP DSS segments with size determined by the available page fragments and the maximum mapping length allowed by the MPTCP specification. If do_tcp_sendpages() splits a DSS segment in to multiple skbs, that's ok - the later skbs can either have duplicated DSS mapping information or none at all, and the receiver can handle that. The current implementation uses the subflow frag cache and tcp sendpages to avoid excessive code duplication. More work is required to ensure that it works correctly under memory pressure and to support MPTCP-level retransmissions. The MPTCP DSS checksum is not yet implemented. Co-developed-by: Paolo Abeni <pabeni@redhat.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com> Co-developed-by: Peter Krystad <peter.krystad@linux.intel.com> Signed-off-by: Peter Krystad <peter.krystad@linux.intel.com> Co-developed-by: Florian Westphal <fw@strlen.de> Signed-off-by: Florian Westphal <fw@strlen.de> Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com> Signed-off-by: Christoph Paasch <cpaasch@apple.com> Signed-off-by: David S. Miller <davem@davemloft.net>
345 lines
8.4 KiB
C
345 lines
8.4 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/* Multipath TCP
|
|
*
|
|
* Copyright (c) 2017 - 2019, Intel Corporation.
|
|
*/
|
|
|
|
#include <linux/kernel.h>
|
|
#include <net/tcp.h>
|
|
#include <net/mptcp.h>
|
|
#include "protocol.h"
|
|
|
|
void mptcp_parse_option(const unsigned char *ptr, int opsize,
|
|
struct tcp_options_received *opt_rx)
|
|
{
|
|
struct mptcp_options_received *mp_opt = &opt_rx->mptcp;
|
|
u8 subtype = *ptr >> 4;
|
|
u8 version;
|
|
u8 flags;
|
|
|
|
switch (subtype) {
|
|
case MPTCPOPT_MP_CAPABLE:
|
|
if (opsize != TCPOLEN_MPTCP_MPC_SYN &&
|
|
opsize != TCPOLEN_MPTCP_MPC_ACK)
|
|
break;
|
|
|
|
version = *ptr++ & MPTCP_VERSION_MASK;
|
|
if (version != MPTCP_SUPPORTED_VERSION)
|
|
break;
|
|
|
|
flags = *ptr++;
|
|
if (!((flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA1) ||
|
|
(flags & MPTCP_CAP_EXTENSIBILITY))
|
|
break;
|
|
|
|
/* RFC 6824, Section 3.1:
|
|
* "For the Checksum Required bit (labeled "A"), if either
|
|
* host requires the use of checksums, checksums MUST be used.
|
|
* In other words, the only way for checksums not to be used
|
|
* is if both hosts in their SYNs set A=0."
|
|
*
|
|
* Section 3.3.0:
|
|
* "If a checksum is not present when its use has been
|
|
* negotiated, the receiver MUST close the subflow with a RST as
|
|
* it is considered broken."
|
|
*
|
|
* We don't implement DSS checksum - fall back to TCP.
|
|
*/
|
|
if (flags & MPTCP_CAP_CHECKSUM_REQD)
|
|
break;
|
|
|
|
mp_opt->mp_capable = 1;
|
|
mp_opt->sndr_key = get_unaligned_be64(ptr);
|
|
ptr += 8;
|
|
|
|
if (opsize == TCPOLEN_MPTCP_MPC_ACK) {
|
|
mp_opt->rcvr_key = get_unaligned_be64(ptr);
|
|
ptr += 8;
|
|
pr_debug("MP_CAPABLE sndr=%llu, rcvr=%llu",
|
|
mp_opt->sndr_key, mp_opt->rcvr_key);
|
|
} else {
|
|
pr_debug("MP_CAPABLE sndr=%llu", mp_opt->sndr_key);
|
|
}
|
|
break;
|
|
|
|
case MPTCPOPT_DSS:
|
|
pr_debug("DSS");
|
|
mp_opt->dss = 1;
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
void mptcp_get_options(const struct sk_buff *skb,
|
|
struct tcp_options_received *opt_rx)
|
|
{
|
|
const unsigned char *ptr;
|
|
const struct tcphdr *th = tcp_hdr(skb);
|
|
int length = (th->doff * 4) - sizeof(struct tcphdr);
|
|
|
|
ptr = (const unsigned char *)(th + 1);
|
|
|
|
while (length > 0) {
|
|
int opcode = *ptr++;
|
|
int opsize;
|
|
|
|
switch (opcode) {
|
|
case TCPOPT_EOL:
|
|
return;
|
|
case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
|
|
length--;
|
|
continue;
|
|
default:
|
|
opsize = *ptr++;
|
|
if (opsize < 2) /* "silly options" */
|
|
return;
|
|
if (opsize > length)
|
|
return; /* don't parse partial options */
|
|
if (opcode == TCPOPT_MPTCP)
|
|
mptcp_parse_option(ptr, opsize, opt_rx);
|
|
ptr += opsize - 2;
|
|
length -= opsize;
|
|
}
|
|
}
|
|
}
|
|
|
|
bool mptcp_syn_options(struct sock *sk, unsigned int *size,
|
|
struct mptcp_out_options *opts)
|
|
{
|
|
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
|
|
|
|
if (subflow->request_mptcp) {
|
|
pr_debug("local_key=%llu", subflow->local_key);
|
|
opts->suboptions = OPTION_MPTCP_MPC_SYN;
|
|
opts->sndr_key = subflow->local_key;
|
|
*size = TCPOLEN_MPTCP_MPC_SYN;
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
void mptcp_rcv_synsent(struct sock *sk)
|
|
{
|
|
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
|
|
struct tcp_sock *tp = tcp_sk(sk);
|
|
|
|
pr_debug("subflow=%p", subflow);
|
|
if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) {
|
|
subflow->mp_capable = 1;
|
|
subflow->remote_key = tp->rx_opt.mptcp.sndr_key;
|
|
} else {
|
|
tcp_sk(sk)->is_mptcp = 0;
|
|
}
|
|
}
|
|
|
|
static bool mptcp_established_options_mp(struct sock *sk, unsigned int *size,
|
|
unsigned int remaining,
|
|
struct mptcp_out_options *opts)
|
|
{
|
|
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
|
|
|
|
if (!subflow->fourth_ack) {
|
|
opts->suboptions = OPTION_MPTCP_MPC_ACK;
|
|
opts->sndr_key = subflow->local_key;
|
|
opts->rcvr_key = subflow->remote_key;
|
|
*size = TCPOLEN_MPTCP_MPC_ACK;
|
|
subflow->fourth_ack = 1;
|
|
pr_debug("subflow=%p, local_key=%llu, remote_key=%llu",
|
|
subflow, subflow->local_key, subflow->remote_key);
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow,
|
|
struct mptcp_ext *ext)
|
|
{
|
|
ext->data_fin = 1;
|
|
|
|
if (!ext->use_map) {
|
|
/* RFC6824 requires a DSS mapping with specific values
|
|
* if DATA_FIN is set but no data payload is mapped
|
|
*/
|
|
ext->use_map = 1;
|
|
ext->dsn64 = 1;
|
|
ext->data_seq = mptcp_sk(subflow->conn)->write_seq;
|
|
ext->subflow_seq = 0;
|
|
ext->data_len = 1;
|
|
} else {
|
|
/* If there's an existing DSS mapping, DATA_FIN consumes
|
|
* 1 additional byte of mapping space.
|
|
*/
|
|
ext->data_len++;
|
|
}
|
|
}
|
|
|
|
static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
|
|
unsigned int *size,
|
|
unsigned int remaining,
|
|
struct mptcp_out_options *opts)
|
|
{
|
|
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
|
|
unsigned int dss_size = 0;
|
|
struct mptcp_ext *mpext;
|
|
struct mptcp_sock *msk;
|
|
unsigned int ack_size;
|
|
u8 tcp_fin;
|
|
|
|
if (skb) {
|
|
mpext = mptcp_get_ext(skb);
|
|
tcp_fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
|
|
} else {
|
|
mpext = NULL;
|
|
tcp_fin = 0;
|
|
}
|
|
|
|
if (!skb || (mpext && mpext->use_map) || tcp_fin) {
|
|
unsigned int map_size;
|
|
|
|
map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64;
|
|
|
|
remaining -= map_size;
|
|
dss_size = map_size;
|
|
if (mpext)
|
|
opts->ext_copy = *mpext;
|
|
|
|
if (skb && tcp_fin &&
|
|
subflow->conn->sk_state != TCP_ESTABLISHED)
|
|
mptcp_write_data_fin(subflow, &opts->ext_copy);
|
|
}
|
|
|
|
ack_size = TCPOLEN_MPTCP_DSS_ACK64;
|
|
|
|
/* Add kind/length/subtype/flag overhead if mapping is not populated */
|
|
if (dss_size == 0)
|
|
ack_size += TCPOLEN_MPTCP_DSS_BASE;
|
|
|
|
dss_size += ack_size;
|
|
|
|
msk = mptcp_sk(mptcp_subflow_ctx(sk)->conn);
|
|
if (msk) {
|
|
opts->ext_copy.data_ack = msk->ack_seq;
|
|
} else {
|
|
mptcp_crypto_key_sha(mptcp_subflow_ctx(sk)->remote_key,
|
|
NULL, &opts->ext_copy.data_ack);
|
|
opts->ext_copy.data_ack++;
|
|
}
|
|
|
|
opts->ext_copy.ack64 = 1;
|
|
opts->ext_copy.use_ack = 1;
|
|
|
|
*size = ALIGN(dss_size, 4);
|
|
return true;
|
|
}
|
|
|
|
bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
|
|
unsigned int *size, unsigned int remaining,
|
|
struct mptcp_out_options *opts)
|
|
{
|
|
unsigned int opt_size = 0;
|
|
bool ret = false;
|
|
|
|
if (mptcp_established_options_mp(sk, &opt_size, remaining, opts))
|
|
ret = true;
|
|
else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
|
|
opts))
|
|
ret = true;
|
|
|
|
/* we reserved enough space for the above options, and exceeding the
|
|
* TCP option space would be fatal
|
|
*/
|
|
if (WARN_ON_ONCE(opt_size > remaining))
|
|
return false;
|
|
|
|
*size += opt_size;
|
|
remaining -= opt_size;
|
|
|
|
return ret;
|
|
}
|
|
|
|
bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
|
|
struct mptcp_out_options *opts)
|
|
{
|
|
struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
|
|
|
|
if (subflow_req->mp_capable) {
|
|
opts->suboptions = OPTION_MPTCP_MPC_SYNACK;
|
|
opts->sndr_key = subflow_req->local_key;
|
|
*size = TCPOLEN_MPTCP_MPC_SYNACK;
|
|
pr_debug("subflow_req=%p, local_key=%llu",
|
|
subflow_req, subflow_req->local_key);
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
|
|
{
|
|
if ((OPTION_MPTCP_MPC_SYN |
|
|
OPTION_MPTCP_MPC_SYNACK |
|
|
OPTION_MPTCP_MPC_ACK) & opts->suboptions) {
|
|
u8 len;
|
|
|
|
if (OPTION_MPTCP_MPC_SYN & opts->suboptions)
|
|
len = TCPOLEN_MPTCP_MPC_SYN;
|
|
else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions)
|
|
len = TCPOLEN_MPTCP_MPC_SYNACK;
|
|
else
|
|
len = TCPOLEN_MPTCP_MPC_ACK;
|
|
|
|
*ptr++ = htonl((TCPOPT_MPTCP << 24) | (len << 16) |
|
|
(MPTCPOPT_MP_CAPABLE << 12) |
|
|
(MPTCP_SUPPORTED_VERSION << 8) |
|
|
MPTCP_CAP_HMAC_SHA1);
|
|
put_unaligned_be64(opts->sndr_key, ptr);
|
|
ptr += 2;
|
|
if (OPTION_MPTCP_MPC_ACK & opts->suboptions) {
|
|
put_unaligned_be64(opts->rcvr_key, ptr);
|
|
ptr += 2;
|
|
}
|
|
}
|
|
|
|
if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
|
|
struct mptcp_ext *mpext = &opts->ext_copy;
|
|
u8 len = TCPOLEN_MPTCP_DSS_BASE;
|
|
u8 flags = 0;
|
|
|
|
if (mpext->use_ack) {
|
|
len += TCPOLEN_MPTCP_DSS_ACK64;
|
|
flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64;
|
|
}
|
|
|
|
if (mpext->use_map) {
|
|
len += TCPOLEN_MPTCP_DSS_MAP64;
|
|
|
|
/* Use only 64-bit mapping flags for now, add
|
|
* support for optional 32-bit mappings later.
|
|
*/
|
|
flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64;
|
|
if (mpext->data_fin)
|
|
flags |= MPTCP_DSS_DATA_FIN;
|
|
}
|
|
|
|
*ptr++ = htonl((TCPOPT_MPTCP << 24) |
|
|
(len << 16) |
|
|
(MPTCPOPT_DSS << 12) |
|
|
(flags));
|
|
|
|
if (mpext->use_ack) {
|
|
put_unaligned_be64(mpext->data_ack, ptr);
|
|
ptr += 2;
|
|
}
|
|
|
|
if (mpext->use_map) {
|
|
put_unaligned_be64(mpext->data_seq, ptr);
|
|
ptr += 2;
|
|
put_unaligned_be32(mpext->subflow_seq, ptr);
|
|
ptr += 1;
|
|
put_unaligned_be32(mpext->data_len << 16 |
|
|
TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
|
|
}
|
|
}
|
|
}
|