linux_dsm_epyc7002/net/mptcp/options.c
Mat Martineau 6d0060f600 mptcp: Write MPTCP DSS headers to outgoing data packets
Per-packet metadata required to write the MPTCP DSS option is written to
the skb_ext area. One write to the socket may contain more than one
packet of data, which is copied to page fragments and mapped in to MPTCP
DSS segments with size determined by the available page fragments and
the maximum mapping length allowed by the MPTCP specification. If
do_tcp_sendpages() splits a DSS segment in to multiple skbs, that's ok -
the later skbs can either have duplicated DSS mapping information or
none at all, and the receiver can handle that.

The current implementation uses the subflow frag cache and tcp
sendpages to avoid excessive code duplication. More work is required to
ensure that it works correctly under memory pressure and to support
MPTCP-level retransmissions.

The MPTCP DSS checksum is not yet implemented.

Co-developed-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Co-developed-by: Peter Krystad <peter.krystad@linux.intel.com>
Signed-off-by: Peter Krystad <peter.krystad@linux.intel.com>
Co-developed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-24 13:44:07 +01:00

345 lines
8.4 KiB
C

// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
*
* Copyright (c) 2017 - 2019, Intel Corporation.
*/
#include <linux/kernel.h>
#include <net/tcp.h>
#include <net/mptcp.h>
#include "protocol.h"
void mptcp_parse_option(const unsigned char *ptr, int opsize,
struct tcp_options_received *opt_rx)
{
struct mptcp_options_received *mp_opt = &opt_rx->mptcp;
u8 subtype = *ptr >> 4;
u8 version;
u8 flags;
switch (subtype) {
case MPTCPOPT_MP_CAPABLE:
if (opsize != TCPOLEN_MPTCP_MPC_SYN &&
opsize != TCPOLEN_MPTCP_MPC_ACK)
break;
version = *ptr++ & MPTCP_VERSION_MASK;
if (version != MPTCP_SUPPORTED_VERSION)
break;
flags = *ptr++;
if (!((flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA1) ||
(flags & MPTCP_CAP_EXTENSIBILITY))
break;
/* RFC 6824, Section 3.1:
* "For the Checksum Required bit (labeled "A"), if either
* host requires the use of checksums, checksums MUST be used.
* In other words, the only way for checksums not to be used
* is if both hosts in their SYNs set A=0."
*
* Section 3.3.0:
* "If a checksum is not present when its use has been
* negotiated, the receiver MUST close the subflow with a RST as
* it is considered broken."
*
* We don't implement DSS checksum - fall back to TCP.
*/
if (flags & MPTCP_CAP_CHECKSUM_REQD)
break;
mp_opt->mp_capable = 1;
mp_opt->sndr_key = get_unaligned_be64(ptr);
ptr += 8;
if (opsize == TCPOLEN_MPTCP_MPC_ACK) {
mp_opt->rcvr_key = get_unaligned_be64(ptr);
ptr += 8;
pr_debug("MP_CAPABLE sndr=%llu, rcvr=%llu",
mp_opt->sndr_key, mp_opt->rcvr_key);
} else {
pr_debug("MP_CAPABLE sndr=%llu", mp_opt->sndr_key);
}
break;
case MPTCPOPT_DSS:
pr_debug("DSS");
mp_opt->dss = 1;
break;
default:
break;
}
}
void mptcp_get_options(const struct sk_buff *skb,
struct tcp_options_received *opt_rx)
{
const unsigned char *ptr;
const struct tcphdr *th = tcp_hdr(skb);
int length = (th->doff * 4) - sizeof(struct tcphdr);
ptr = (const unsigned char *)(th + 1);
while (length > 0) {
int opcode = *ptr++;
int opsize;
switch (opcode) {
case TCPOPT_EOL:
return;
case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
length--;
continue;
default:
opsize = *ptr++;
if (opsize < 2) /* "silly options" */
return;
if (opsize > length)
return; /* don't parse partial options */
if (opcode == TCPOPT_MPTCP)
mptcp_parse_option(ptr, opsize, opt_rx);
ptr += opsize - 2;
length -= opsize;
}
}
}
bool mptcp_syn_options(struct sock *sk, unsigned int *size,
struct mptcp_out_options *opts)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
if (subflow->request_mptcp) {
pr_debug("local_key=%llu", subflow->local_key);
opts->suboptions = OPTION_MPTCP_MPC_SYN;
opts->sndr_key = subflow->local_key;
*size = TCPOLEN_MPTCP_MPC_SYN;
return true;
}
return false;
}
void mptcp_rcv_synsent(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct tcp_sock *tp = tcp_sk(sk);
pr_debug("subflow=%p", subflow);
if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) {
subflow->mp_capable = 1;
subflow->remote_key = tp->rx_opt.mptcp.sndr_key;
} else {
tcp_sk(sk)->is_mptcp = 0;
}
}
static bool mptcp_established_options_mp(struct sock *sk, unsigned int *size,
unsigned int remaining,
struct mptcp_out_options *opts)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
if (!subflow->fourth_ack) {
opts->suboptions = OPTION_MPTCP_MPC_ACK;
opts->sndr_key = subflow->local_key;
opts->rcvr_key = subflow->remote_key;
*size = TCPOLEN_MPTCP_MPC_ACK;
subflow->fourth_ack = 1;
pr_debug("subflow=%p, local_key=%llu, remote_key=%llu",
subflow, subflow->local_key, subflow->remote_key);
return true;
}
return false;
}
static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow,
struct mptcp_ext *ext)
{
ext->data_fin = 1;
if (!ext->use_map) {
/* RFC6824 requires a DSS mapping with specific values
* if DATA_FIN is set but no data payload is mapped
*/
ext->use_map = 1;
ext->dsn64 = 1;
ext->data_seq = mptcp_sk(subflow->conn)->write_seq;
ext->subflow_seq = 0;
ext->data_len = 1;
} else {
/* If there's an existing DSS mapping, DATA_FIN consumes
* 1 additional byte of mapping space.
*/
ext->data_len++;
}
}
static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
unsigned int *size,
unsigned int remaining,
struct mptcp_out_options *opts)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
unsigned int dss_size = 0;
struct mptcp_ext *mpext;
struct mptcp_sock *msk;
unsigned int ack_size;
u8 tcp_fin;
if (skb) {
mpext = mptcp_get_ext(skb);
tcp_fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
} else {
mpext = NULL;
tcp_fin = 0;
}
if (!skb || (mpext && mpext->use_map) || tcp_fin) {
unsigned int map_size;
map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64;
remaining -= map_size;
dss_size = map_size;
if (mpext)
opts->ext_copy = *mpext;
if (skb && tcp_fin &&
subflow->conn->sk_state != TCP_ESTABLISHED)
mptcp_write_data_fin(subflow, &opts->ext_copy);
}
ack_size = TCPOLEN_MPTCP_DSS_ACK64;
/* Add kind/length/subtype/flag overhead if mapping is not populated */
if (dss_size == 0)
ack_size += TCPOLEN_MPTCP_DSS_BASE;
dss_size += ack_size;
msk = mptcp_sk(mptcp_subflow_ctx(sk)->conn);
if (msk) {
opts->ext_copy.data_ack = msk->ack_seq;
} else {
mptcp_crypto_key_sha(mptcp_subflow_ctx(sk)->remote_key,
NULL, &opts->ext_copy.data_ack);
opts->ext_copy.data_ack++;
}
opts->ext_copy.ack64 = 1;
opts->ext_copy.use_ack = 1;
*size = ALIGN(dss_size, 4);
return true;
}
bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
unsigned int *size, unsigned int remaining,
struct mptcp_out_options *opts)
{
unsigned int opt_size = 0;
bool ret = false;
if (mptcp_established_options_mp(sk, &opt_size, remaining, opts))
ret = true;
else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
opts))
ret = true;
/* we reserved enough space for the above options, and exceeding the
* TCP option space would be fatal
*/
if (WARN_ON_ONCE(opt_size > remaining))
return false;
*size += opt_size;
remaining -= opt_size;
return ret;
}
bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
struct mptcp_out_options *opts)
{
struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
if (subflow_req->mp_capable) {
opts->suboptions = OPTION_MPTCP_MPC_SYNACK;
opts->sndr_key = subflow_req->local_key;
*size = TCPOLEN_MPTCP_MPC_SYNACK;
pr_debug("subflow_req=%p, local_key=%llu",
subflow_req, subflow_req->local_key);
return true;
}
return false;
}
void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
{
if ((OPTION_MPTCP_MPC_SYN |
OPTION_MPTCP_MPC_SYNACK |
OPTION_MPTCP_MPC_ACK) & opts->suboptions) {
u8 len;
if (OPTION_MPTCP_MPC_SYN & opts->suboptions)
len = TCPOLEN_MPTCP_MPC_SYN;
else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions)
len = TCPOLEN_MPTCP_MPC_SYNACK;
else
len = TCPOLEN_MPTCP_MPC_ACK;
*ptr++ = htonl((TCPOPT_MPTCP << 24) | (len << 16) |
(MPTCPOPT_MP_CAPABLE << 12) |
(MPTCP_SUPPORTED_VERSION << 8) |
MPTCP_CAP_HMAC_SHA1);
put_unaligned_be64(opts->sndr_key, ptr);
ptr += 2;
if (OPTION_MPTCP_MPC_ACK & opts->suboptions) {
put_unaligned_be64(opts->rcvr_key, ptr);
ptr += 2;
}
}
if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
struct mptcp_ext *mpext = &opts->ext_copy;
u8 len = TCPOLEN_MPTCP_DSS_BASE;
u8 flags = 0;
if (mpext->use_ack) {
len += TCPOLEN_MPTCP_DSS_ACK64;
flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64;
}
if (mpext->use_map) {
len += TCPOLEN_MPTCP_DSS_MAP64;
/* Use only 64-bit mapping flags for now, add
* support for optional 32-bit mappings later.
*/
flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64;
if (mpext->data_fin)
flags |= MPTCP_DSS_DATA_FIN;
}
*ptr++ = htonl((TCPOPT_MPTCP << 24) |
(len << 16) |
(MPTCPOPT_DSS << 12) |
(flags));
if (mpext->use_ack) {
put_unaligned_be64(mpext->data_ack, ptr);
ptr += 2;
}
if (mpext->use_map) {
put_unaligned_be64(mpext->data_seq, ptr);
ptr += 2;
put_unaligned_be32(mpext->subflow_seq, ptr);
ptr += 1;
put_unaligned_be32(mpext->data_len << 16 |
TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
}
}
}