mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-15 20:26:48 +07:00
Merge branch 'tcp-xmit-timer-rearming'
Neal Cardwell says:

====================
tcp: fix xmit timer rearming to avoid stalls

This patch series is a bug fix for a TCP loss recovery performance bug
reported independently in recent netdev threads:

 (i)  July 26, 2017: netdev thread "TCP fast retransmit issues"
 (ii) July 26, 2017: netdev thread "[PATCH V2 net-next] TLP: Don't
      reschedule PTO when there's one outstanding TLP retransmission"

Many thanks to Klavs Klavsen and Mao Wenan for the detailed reports,
traces, and packetdrill test cases, which enabled us to root-cause
this issue and verify the fix.

- v1 -> v2:
 - In patch 2/3, changed an unclear comment in the pre-existing code in
   tcp_schedule_loss_probe() to be more clear (thanks to Eric Dumazet
   for suggesting we improve this).
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
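The heart of the series is a single invariant: the TLP timer must never be armed to fire later than the RTO computed from the (re)transmit time of the oldest outstanding packet. A minimal userspace C sketch of that timeout arithmetic follows; all names and numbers here are illustrative assumptions, not kernel code.

#include <stdio.h>
#include <stdint.h>

/* Microseconds from "now" until the RTO should fire, anchored to the
 * transmit time of the oldest outstanding packet (illustrative). */
static int64_t rto_delta_usecs(uint64_t head_xmit_us, uint64_t rto_us,
			       uint64_t now_us)
{
	return (int64_t)(head_xmit_us + rto_us) - (int64_t)now_us;
}

int main(void)
{
	uint64_t now_us = 3000000;	   /* current clock, usec */
	uint64_t head_xmit_us = 2800000;   /* oldest unacked packet sent here */
	uint64_t rto_us = 300000;	   /* 300 ms RTO */
	uint64_t tlp_timeout_us = 250000;  /* 2*RTT-based TLP timeout */

	int64_t delta = rto_delta_usecs(head_xmit_us, rto_us, now_us);

	/* If the RTO formula yields an earlier time, use that time. */
	if (delta > 0 && (uint64_t)delta < tlp_timeout_us)
		tlp_timeout_us = (uint64_t)delta;

	printf("arm TLP timer in %llu us (RTO due in %lld us)\n",
	       (unsigned long long)tlp_timeout_us, (long long)delta);
	return 0;
}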
commit 337f1b07db
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1916,6 +1916,16 @@ extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
 			     u64 xmit_time);
 extern void tcp_rack_reo_timeout(struct sock *sk);
 
+/* At how many usecs into the future should the RTO fire? */
+static inline s64 tcp_rto_delta_us(const struct sock *sk)
+{
+	const struct sk_buff *skb = tcp_write_queue_head(sk);
+	u32 rto = inet_csk(sk)->icsk_rto;
+	u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto);
+
+	return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp;
+}
+
 /*
  * Save and compile IPv4 options, return a pointer to it
  */
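The helper deliberately returns a signed s64: if the head of the write queue was (re)transmitted long enough ago, the RTO deadline is already in the past and the delta is negative, which callers such as tcp_rearm_rto() below must clamp before rescheduling. A standalone sketch of that sign behavior (plain C, illustrative only):

#include <assert.h>
#include <stdint.h>

/* Same arithmetic as tcp_rto_delta_us(), with illustrative names. */
static int64_t rto_delta(uint64_t sent_us, uint64_t rto_us, uint64_t now_us)
{
	return (int64_t)(sent_us + rto_us) - (int64_t)now_us;
}

int main(void)
{
	/* Head sent 500 ms ago with a 300 ms RTO: deadline already passed. */
	assert(rto_delta(0, 300000, 500000) < 0);
	/* Head sent just now: the full RTO remains. */
	assert(rto_delta(500000, 300000, 500000) == 300000);
	return 0;
}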
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -107,6 +107,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 #define FLAG_ORIG_SACK_ACKED	0x200 /* Never retransmitted data are (s)acked */
 #define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
 #define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained D-SACK info */
+#define FLAG_SET_XMIT_TIMER	0x1000 /* Set TLP or RTO timer */
 #define FLAG_SACK_RENEGING	0x2000 /* snd_una advanced to a sacked seq */
 #define FLAG_UPDATE_TS_RECENT	0x4000 /* tcp_replace_ts_recent() */
 #define FLAG_NO_CHALLENGE_ACK	0x8000 /* do not call tcp_send_challenge_ack() */
@@ -3004,10 +3005,7 @@ void tcp_rearm_rto(struct sock *sk)
 		/* Offset the time elapsed after installing regular RTO */
 		if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
 		    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
-			struct sk_buff *skb = tcp_write_queue_head(sk);
-			u64 rto_time_stamp = skb->skb_mstamp +
-					     jiffies_to_usecs(rto);
-			s64 delta_us = rto_time_stamp - tp->tcp_mstamp;
+			s64 delta_us = tcp_rto_delta_us(sk);
 			/* delta_us may not be positive if the socket is locked
 			 * when the retrans timer fires and is rescheduled.
 			 */
@@ -3019,6 +3017,13 @@ void tcp_rearm_rto(struct sock *sk)
 	}
 }
 
+/* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */
+static void tcp_set_xmit_timer(struct sock *sk)
+{
+	if (!tcp_schedule_loss_probe(sk))
+		tcp_rearm_rto(sk);
+}
+
 /* If we get here, the whole TSO packet has not been acked. */
 static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
 {
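tcp_set_xmit_timer() is a plain fallback wrapper: try to arm a loss probe, and fall back to the conventional RTO if TLP is not applicable. A standalone sketch of the same shape, with stub predicates standing in for the real checks (assumed names, not kernel code):

#include <stdbool.h>
#include <stdio.h>

/* Stub: TLP is only armed for SACK-capable connections in Open state,
 * a rough stand-in for the real checks in tcp_schedule_loss_probe(). */
static bool schedule_loss_probe(bool sack_enabled, bool open_state)
{
	return sack_enabled && open_state;
}

/* Same fallback shape as tcp_set_xmit_timer(): prefer TLP, else RTO. */
static void set_xmit_timer(bool sack_enabled, bool open_state)
{
	if (schedule_loss_probe(sack_enabled, open_state))
		printf("armed TLP timer\n");
	else
		printf("fell back to RTO timer\n");
}

int main(void)
{
	set_xmit_timer(true, true);   /* TLP path */
	set_xmit_timer(false, true);  /* RTO fallback */
	return 0;
}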
@@ -3180,7 +3185,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 					ca_rtt_us, sack->rate);
 
 	if (flag & FLAG_ACKED) {
-		tcp_rearm_rto(sk);
+		flag |= FLAG_SET_XMIT_TIMER;  /* set TLP or RTO timer */
 		if (unlikely(icsk->icsk_mtup.probe_size &&
 			     !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
 			tcp_mtup_probe_success(sk);
@@ -3208,7 +3213,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		 * after when the head was last (re)transmitted. Otherwise the
 		 * timeout may continue to extend in loss recovery.
 		 */
-		tcp_rearm_rto(sk);
+		flag |= FLAG_SET_XMIT_TIMER;  /* set TLP or RTO timer */
 	}
 
 	if (icsk->icsk_ca_ops->pkts_acked) {
@@ -3580,9 +3585,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	if (after(ack, tp->snd_nxt))
 		goto invalid_ack;
 
-	if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
-		tcp_rearm_rto(sk);
-
 	if (after(ack, prior_snd_una)) {
 		flag |= FLAG_SND_UNA_ADVANCED;
 		icsk->icsk_retransmits = 0;
@@ -3647,18 +3649,20 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
 				    &sack_state);
 
+	if (tp->tlp_high_seq)
+		tcp_process_tlp_ack(sk, ack, flag);
+	/* If needed, reset TLP/RTO timer; RACK may later override this. */
+	if (flag & FLAG_SET_XMIT_TIMER)
+		tcp_set_xmit_timer(sk);
+
 	if (tcp_ack_is_dubious(sk, flag)) {
 		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
 		tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
 	}
-	if (tp->tlp_high_seq)
-		tcp_process_tlp_ack(sk, ack, flag);
 
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
 		sk_dst_confirm(sk);
 
-	if (icsk->icsk_pending == ICSK_TIME_RETRANS)
-		tcp_schedule_loss_probe(sk);
 	delivered = tp->delivered - delivered;	/* freshly ACKed or SACKed */
 	lost = tp->lost - lost;			/* freshly marked lost */
 	tcp_rate_gen(sk, delivered, lost, sack_state.rate);
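The pattern in this hunk is to record "data was ACKed/SACKed" in the flag word during tcp_clean_rtx_queue() and then arm the timer exactly once afterwards, rather than calling the rearm helper from several sites. A compact sketch of that flag-driven rearming (the FLAG values mirror the diff; everything else is an illustrative assumption):

#include <stdio.h>

#define FLAG_DATA_ACKED		0x04	/* value as in tcp_input.c */
#define FLAG_SET_XMIT_TIMER	0x1000	/* value as in this diff */

/* Stand-in for tcp_clean_rtx_queue(): note what happened in a flag word
 * instead of rearming the timer at each call site. */
static int clean_rtx_queue(int acked_bytes)
{
	int flag = 0;

	if (acked_bytes > 0)
		flag |= FLAG_DATA_ACKED | FLAG_SET_XMIT_TIMER;
	return flag;
}

int main(void)
{
	int flag = clean_rtx_queue(1460);

	/* If needed, reset TLP/RTO timer; RACK may later override this. */
	if (flag & FLAG_SET_XMIT_TIMER)
		printf("rearm TLP/RTO timer once, after queue cleanup\n");
	return 0;
}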
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2377,24 +2377,15 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 timeout, tlp_time_stamp, rto_time_stamp;
 	u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
+	u32 timeout, rto_delta_us;
 
-	/* No consecutive loss probes. */
-	if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) {
-		tcp_rearm_rto(sk);
-		return false;
-	}
 	/* Don't do any loss probe on a Fast Open connection before 3WHS
 	 * finishes.
 	 */
 	if (tp->fastopen_rsk)
 		return false;
 
-	/* TLP is only scheduled when next timer event is RTO. */
-	if (icsk->icsk_pending != ICSK_TIME_RETRANS)
-		return false;
-
 	/* Schedule a loss probe in 2*RTT for SACK capable connections
 	 * in Open state, that are either limited by cwnd or application.
 	 */
@@ -2417,14 +2408,10 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 			  (rtt + (rtt >> 1) + TCP_DELACK_MAX));
 	timeout = max_t(u32, timeout, msecs_to_jiffies(10));
 
-	/* If RTO is shorter, just schedule TLP in its place. */
-	tlp_time_stamp = tcp_jiffies32 + timeout;
-	rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout;
-	if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) {
-		s32 delta = rto_time_stamp - tcp_jiffies32;
-		if (delta > 0)
-			timeout = delta;
-	}
+	/* If the RTO formula yields an earlier time, then use that time. */
+	rto_delta_us = tcp_rto_delta_us(sk);  /* How far in future is RTO? */
+	if (rto_delta_us > 0)
+		timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
 
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
 				  TCP_RTO_MAX);
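To see the stall this replaces: the old jiffies-based code effectively measured the RTO from when the timer was last armed, so a trickle of ACKs could keep sliding the deadline out during loss recovery, while the new code anchors it to the head packet's (re)transmit time. A userspace sketch with made-up numbers contrasts the two deadlines:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t rto_us = 300000;		/* 300 ms RTO */
	const uint64_t head_sent_us = 1000000;	/* head (re)transmit time */
	uint64_t now_us = head_sent_us;

	for (int ack = 1; ack <= 3; ack++) {
		now_us += 90000;  /* an ACK arrives every 90 ms */

		/* Buggy behavior: deadline recomputed from "now", so it
		 * keeps sliding out as long as ACKs trickle in. */
		uint64_t sliding = now_us + rto_us;
		/* Fixed behavior: deadline anchored to the head packet. */
		uint64_t anchored = head_sent_us + rto_us;

		printf("ack %d: sliding RTO at %llu us, anchored RTO at %llu us\n",
		       ack, (unsigned long long)sliding,
		       (unsigned long long)anchored);
	}
	return 0;
}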