Merge branch 'tcp-more-reliable-window-probes'

Eric Dumazet says:

====================
tcp: more reliable window probes

This series addresses a problem caused by small rto_min timers in data centers,
leading to either timer storms or early flow terminations.

We also add two new SNMP counters for proper monitoring:
TCPWinProbe and TCPKeepAlive

v2: added TCPKeepAlive counter, as suggested by Yuchung & Neal
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2015-05-09 16:42:32 -04:00
commit 82ae9c6060
6 changed files with 37 additions and 15 deletions

View File

@ -527,7 +527,7 @@ int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t);
void tcp_send_probe0(struct sock *); void tcp_send_probe0(struct sock *);
void tcp_send_partial(struct sock *); void tcp_send_partial(struct sock *);
int tcp_write_wakeup(struct sock *); int tcp_write_wakeup(struct sock *, int mib);
void tcp_send_fin(struct sock *sk); void tcp_send_fin(struct sock *sk);
void tcp_send_active_reset(struct sock *sk, gfp_t priority); void tcp_send_active_reset(struct sock *sk, gfp_t priority);
int tcp_send_synack(struct sock *); int tcp_send_synack(struct sock *);
@ -1043,14 +1043,31 @@ static inline bool tcp_is_cwnd_limited(const struct sock *sk)
return tp->is_cwnd_limited; return tp->is_cwnd_limited;
} }
/* Something is really bad, we could not queue an additional packet,
* because qdisc is full or receiver sent a 0 window.
* We do not want to add fuel to the fire, or abort too early,
* so make sure the timer we arm now is at least 200ms in the future,
* regardless of current icsk_rto value (as it could be ~2ms)
*/
static inline unsigned long tcp_probe0_base(const struct sock *sk)
{
return max_t(unsigned long, inet_csk(sk)->icsk_rto, TCP_RTO_MIN);
}
/* Variant of inet_csk_rto_backoff() used for zero window probes */
static inline unsigned long tcp_probe0_when(const struct sock *sk,
					    unsigned long max_when)
{
	/* Compute in 64 bits: the exponential backoff shift could
	 * overflow an unsigned long on 32-bit architectures.
	 */
	u64 when = (u64)tcp_probe0_base(sk) << inet_csk(sk)->icsk_backoff;

	if (when > max_when)
		when = max_when;
	return (unsigned long)when;
}
static inline void tcp_check_probe_timer(struct sock *sk) static inline void tcp_check_probe_timer(struct sock *sk)
{ {
const struct tcp_sock *tp = tcp_sk(sk); if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending)
const struct inet_connection_sock *icsk = inet_csk(sk);
if (!tp->packets_out && !icsk->icsk_pending)
inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
icsk->icsk_rto, TCP_RTO_MAX); tcp_probe0_base(sk), TCP_RTO_MAX);
} }
static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq) static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq)

View File

@ -276,6 +276,8 @@ enum
LINUX_MIB_TCPACKSKIPPEDFINWAIT2, /* TCPACKSkippedFinWait2 */ LINUX_MIB_TCPACKSKIPPEDFINWAIT2, /* TCPACKSkippedFinWait2 */
LINUX_MIB_TCPACKSKIPPEDTIMEWAIT, /* TCPACKSkippedTimeWait */ LINUX_MIB_TCPACKSKIPPEDTIMEWAIT, /* TCPACKSkippedTimeWait */
LINUX_MIB_TCPACKSKIPPEDCHALLENGE, /* TCPACKSkippedChallenge */ LINUX_MIB_TCPACKSKIPPEDCHALLENGE, /* TCPACKSkippedChallenge */
LINUX_MIB_TCPWINPROBE, /* TCPWinProbe */
LINUX_MIB_TCPKEEPALIVE, /* TCPKeepAlive */
__LINUX_MIB_MAX __LINUX_MIB_MAX
}; };

View File

@ -298,6 +298,8 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2), SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2),
SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT), SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT),
SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE), SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE),
SNMP_MIB_ITEM("TCPWinProbe", LINUX_MIB_TCPWINPROBE),
SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE),
SNMP_MIB_SENTINEL SNMP_MIB_SENTINEL
}; };

View File

@ -3233,7 +3233,7 @@ static void tcp_ack_probe(struct sock *sk)
* This function is not for random using! * This function is not for random using!
*/ */
} else { } else {
unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
when, TCP_RTO_MAX); when, TCP_RTO_MAX);

View File

@ -3382,7 +3382,7 @@ EXPORT_SYMBOL_GPL(tcp_send_ack);
* one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
* out-of-date with SND.UNA-1 to probe window. * out-of-date with SND.UNA-1 to probe window.
*/ */
static int tcp_xmit_probe_skb(struct sock *sk, int urgent) static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb; struct sk_buff *skb;
@ -3400,6 +3400,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
*/ */
tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
skb_mstamp_get(&skb->skb_mstamp); skb_mstamp_get(&skb->skb_mstamp);
NET_INC_STATS_BH(sock_net(sk), mib);
return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
} }
@ -3407,12 +3408,12 @@ void tcp_send_window_probe(struct sock *sk)
{ {
if (sk->sk_state == TCP_ESTABLISHED) { if (sk->sk_state == TCP_ESTABLISHED) {
tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1; tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
tcp_xmit_probe_skb(sk, 0); tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
} }
} }
/* Initiate keepalive or window probe from timer. */ /* Initiate keepalive or window probe from timer. */
int tcp_write_wakeup(struct sock *sk) int tcp_write_wakeup(struct sock *sk, int mib)
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb; struct sk_buff *skb;
@ -3449,8 +3450,8 @@ int tcp_write_wakeup(struct sock *sk)
return err; return err;
} else { } else {
if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
tcp_xmit_probe_skb(sk, 1); tcp_xmit_probe_skb(sk, 1, mib);
return tcp_xmit_probe_skb(sk, 0); return tcp_xmit_probe_skb(sk, 0, mib);
} }
} }
@ -3464,7 +3465,7 @@ void tcp_send_probe0(struct sock *sk)
unsigned long probe_max; unsigned long probe_max;
int err; int err;
err = tcp_write_wakeup(sk); err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
if (tp->packets_out || !tcp_send_head(sk)) { if (tp->packets_out || !tcp_send_head(sk)) {
/* Cancel probe timer, if it is not required. */ /* Cancel probe timer, if it is not required. */
@ -3490,7 +3491,7 @@ void tcp_send_probe0(struct sock *sk)
probe_max = TCP_RESOURCE_PROBE_INTERVAL; probe_max = TCP_RESOURCE_PROBE_INTERVAL;
} }
inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
inet_csk_rto_backoff(icsk, probe_max), tcp_probe0_when(sk, probe_max),
TCP_RTO_MAX); TCP_RTO_MAX);
} }

View File

@ -616,7 +616,7 @@ static void tcp_keepalive_timer (unsigned long data)
tcp_write_err(sk); tcp_write_err(sk);
goto out; goto out;
} }
if (tcp_write_wakeup(sk) <= 0) { if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) {
icsk->icsk_probes_out++; icsk->icsk_probes_out++;
elapsed = keepalive_intvl_when(tp); elapsed = keepalive_intvl_when(tp);
} else { } else {