From 3382576106014bf865d341efab3d94fb28d1fc63 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 26 Apr 2018 17:18:20 +0200 Subject: [PATCH 1/4] net/smc: fix structure size The struct smc_cdc_msg must be defined as packed so the size is 44 bytes. And change the structure size check so sizeof is checked. Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_cdc.c | 2 +- net/smc/smc_cdc.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index b42395d24cba..42ad57365eca 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -82,7 +82,7 @@ static inline void smc_cdc_add_pending_send(struct smc_connection *conn, sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE, "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)"); BUILD_BUG_ON_MSG( - offsetof(struct smc_cdc_msg, reserved) > SMC_WR_TX_SIZE, + sizeof(struct smc_cdc_msg) != SMC_WR_TX_SIZE, "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()"); BUILD_BUG_ON_MSG( sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE, diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index ab240b37ad11..d2012fd22100 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -48,7 +48,7 @@ struct smc_cdc_msg { struct smc_cdc_producer_flags prod_flags; struct smc_cdc_conn_state_flags conn_state_flags; u8 reserved[18]; -} __aligned(8); +} __packed; /* format defined in RFC7609 */ static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn) { From ee9dfbef02d186a90f3a4876b276701966a92d10 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 26 Apr 2018 17:18:21 +0200 Subject: [PATCH 2/4] net/smc: handle sockopts forcing fallback Several TCP sockopts do not work for SMC. 
One example is the TCP_FASTOPEN sockopts, since SMC-connection setup is based on the TCP three-way-handshake. If the SMC socket is still in state SMC_INIT, such sockopts trigger fallback to TCP. Otherwise an error is returned. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 54 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 4 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 4470501374bf..d274be7265ea 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -391,6 +391,9 @@ static int smc_connect_rdma(struct smc_sock *smc) sock_hold(&smc->sk); /* sock put in passive closing */ + if (smc->use_fallback) + goto out_connected; + if (!tcp_sk(smc->clcsock->sk)->syn_smc) { /* peer has not signalled SMC-capability */ smc->use_fallback = true; @@ -790,6 +793,9 @@ static void smc_listen_work(struct work_struct *work) int rc = 0; u8 ibport; + if (new_smc->use_fallback) + goto out_connected; + /* check if peer is smc capable */ if (!tcp_sk(newclcsock->sk)->syn_smc) { new_smc->use_fallback = true; @@ -968,7 +974,7 @@ static void smc_tcp_listen_work(struct work_struct *work) continue; new_smc->listen_smc = lsmc; - new_smc->use_fallback = false; /* assume rdma capability first*/ + new_smc->use_fallback = lsmc->use_fallback; sock_hold(lsk); /* sock_put in smc_listen_work */ INIT_WORK(&new_smc->smc_listen_work, smc_listen_work); smc_copy_sock_settings_to_smc(new_smc); @@ -1004,7 +1010,8 @@ static int smc_listen(struct socket *sock, int backlog) * them to the clc socket -- copy smc socket options to clc socket */ smc_copy_sock_settings_to_clc(smc); - tcp_sk(smc->clcsock->sk)->syn_smc = 1; + if (!smc->use_fallback) + tcp_sk(smc->clcsock->sk)->syn_smc = 1; rc = kernel_listen(smc->clcsock, backlog); if (rc) @@ -1097,6 +1104,16 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) (sk->sk_state != SMC_APPCLOSEWAIT1) && (sk->sk_state != SMC_INIT)) goto out; + + if 
(msg->msg_flags & MSG_FASTOPEN) { + if (sk->sk_state == SMC_INIT) { + smc->use_fallback = true; + } else { + rc = -EINVAL; + goto out; + } + } + if (smc->use_fallback) rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len); else @@ -1274,14 +1291,43 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; struct smc_sock *smc; + int rc; smc = smc_sk(sk); /* generic setsockopts reaching us here always apply to the * CLC socket */ - return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname, - optval, optlen); + rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname, + optval, optlen); + if (smc->clcsock->sk->sk_err) { + sk->sk_err = smc->clcsock->sk->sk_err; + sk->sk_error_report(sk); + } + if (rc) + return rc; + + lock_sock(sk); + switch (optname) { + case TCP_ULP: + case TCP_FASTOPEN: + case TCP_FASTOPEN_CONNECT: + case TCP_FASTOPEN_KEY: + case TCP_FASTOPEN_NO_COOKIE: + /* option not supported by SMC */ + if (sk->sk_state == SMC_INIT) { + smc->use_fallback = true; + } else { + if (!smc->use_fallback) + rc = -EINVAL; + } + break; + default: + break; + } + release_sock(sk); + + return rc; } static int smc_getsockopt(struct socket *sock, int level, int optname, From 01d2f7e2cdd31becffafa0cb82809a5e36558ec0 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 26 Apr 2018 17:18:22 +0200 Subject: [PATCH 3/4] net/smc: sockopts TCP_NODELAY and TCP_CORK Setting sockopt TCP_NODELAY or resetting sockopt TCP_CORK triggers data transfer. For a corked SMC socket RDMA writes are deferred, if there is still sufficient send buffer space available. Signed-off-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/af_smc.c | 20 +++++++++++++++++++- net/smc/smc_tx.c | 24 +++++++++++++++++++++--- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index d274be7265ea..9d8b381281e3 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1291,7 +1291,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; struct smc_sock *smc; - int rc; + int val, rc; smc = smc_sk(sk); @@ -1307,6 +1307,10 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, if (rc) return rc; + if (optlen < sizeof(int)) + return rc; + get_user(val, (int __user *)optval); + lock_sock(sk); switch (optname) { case TCP_ULP: @@ -1322,6 +1326,20 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, rc = -EINVAL; } break; + case TCP_NODELAY: + if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) { + if (val) + mod_delayed_work(system_wq, &smc->conn.tx_work, + 0); + } + break; + case TCP_CORK: + if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) { + if (!val) + mod_delayed_work(system_wq, &smc->conn.tx_work, + 0); + } + break; default: break; } diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 72f004c9c9b1..58dfe0bd9d60 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -19,6 +19,7 @@ #include #include +#include #include "smc.h" #include "smc_wr.h" @@ -26,6 +27,7 @@ #include "smc_tx.h" #define SMC_TX_WORK_DELAY HZ +#define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */ /***************************** sndbuf producer *******************************/ @@ -115,6 +117,13 @@ static int smc_tx_wait_memory(struct smc_sock *smc, int flags) return rc; } +static bool smc_tx_is_corked(struct smc_sock *smc) +{ + struct tcp_sock *tp = tcp_sk(smc->clcsock->sk); + + return (tp->nonagle & TCP_NAGLE_CORK) ? true : false; +} + /* sndbuf producer: main API called by socket layer. * called under sock lock. 
*/ @@ -209,7 +218,16 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) /* since we just produced more new data into sndbuf, * trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ - smc_tx_sndbuf_nonempty(conn); + if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) && + (atomic_read(&conn->sndbuf_space) > + (conn->sndbuf_size >> 1))) + /* for a corked socket defer the RDMA writes if there + * is still sufficient sndbuf_space available + */ + schedule_delayed_work(&conn->tx_work, + SMC_TX_CORK_DELAY); + else + smc_tx_sndbuf_nonempty(conn); } /* while (msg_data_left(msg)) */ return send_done; @@ -409,8 +427,8 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) } rc = 0; if (conn->alert_token_local) /* connection healthy */ - schedule_delayed_work(&conn->tx_work, - SMC_TX_WORK_DELAY); + mod_delayed_work(system_wq, &conn->tx_work, + SMC_TX_WORK_DELAY); } goto out_unlock; } From abb190f194d082cbb7520e692d78d3ddf050e7b1 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 26 Apr 2018 17:18:23 +0200 Subject: [PATCH 4/4] net/smc: handle sockopt TCP_DEFER_ACCEPT If sockopt TCP_DEFER_ACCEPT is set, the accept is delayed till data is available. Signed-off-by: Ursula Braun Signed-off-by: David S. 
Miller --- net/smc/af_smc.c | 26 +++++++++++++++++++++++++- net/smc/smc.h | 4 ++++ net/smc/smc_rx.c | 2 +- net/smc/smc_rx.h | 1 + 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9d8b381281e3..20aa4175b9f8 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1044,6 +1044,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, if (lsmc->sk.sk_state != SMC_LISTEN) { rc = -EINVAL; + release_sock(sk); goto out; } @@ -1071,9 +1072,29 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, if (!rc) rc = sock_error(nsk); + release_sock(sk); + if (rc) + goto out; + + if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) { + /* wait till data arrives on the socket */ + timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept * + MSEC_PER_SEC); + if (smc_sk(nsk)->use_fallback) { + struct sock *clcsk = smc_sk(nsk)->clcsock->sk; + + lock_sock(clcsk); + if (skb_queue_empty(&clcsk->sk_receive_queue)) + sk_wait_data(clcsk, &timeo, NULL); + release_sock(clcsk); + } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) { + lock_sock(nsk); + smc_rx_wait_data(smc_sk(nsk), &timeo); + release_sock(nsk); + } + } out: - release_sock(sk); sock_put(sk); /* sock_hold above */ return rc; } @@ -1340,6 +1361,9 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, 0); } break; + case TCP_DEFER_ACCEPT: + smc->sockopt_defer_accept = val; + break; default: break; } diff --git a/net/smc/smc.h b/net/smc/smc.h index e4829a2f46ba..2405e889b93d 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -180,6 +180,10 @@ struct smc_sock { /* smc sock container */ struct list_head accept_q; /* sockets to be accepted */ spinlock_t accept_q_lock; /* protects accept_q */ bool use_fallback; /* fallback to tcp */ + int sockopt_defer_accept; + /* sockopt TCP_DEFER_ACCEPT + * value + */ u8 wait_close_tx_prepared : 1; /* shutdown wr or close * started, waiting for unsent diff --git a/net/smc/smc_rx.c 
b/net/smc/smc_rx.c index eff4e0d0bb31..af851d8df1f8 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -51,7 +51,7 @@ static void smc_rx_data_ready(struct sock *sk) * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown. * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted). */ -static int smc_rx_wait_data(struct smc_sock *smc, long *timeo) +int smc_rx_wait_data(struct smc_sock *smc, long *timeo) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct smc_connection *conn = &smc->conn; diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h index 3a32b59bf06c..0b75a6b470e6 100644 --- a/net/smc/smc_rx.h +++ b/net/smc/smc_rx.h @@ -20,5 +20,6 @@ void smc_rx_init(struct smc_sock *smc); int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, int flags); +int smc_rx_wait_data(struct smc_sock *smc, long *timeo); #endif /* SMC_RX_H */