linux_dsm_epyc7002/net/ipv4/tcp_hybla.c

195 lines
4.9 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* TCP HYBLA
*
* TCP-HYBLA Congestion control algorithm, based on:
* C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement
* for Heterogeneous Networks",
* International Journal on satellite Communications,
* September 2004
* Daniele Lacamera
* root at danielinux.net
*/
#include <linux/module.h>
#include <net/tcp.h>
/* Tcp Hybla structure. */
struct hybla {
bool hybla_en;
u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */
u32 rho; /* Rho parameter, integer part */
u32 rho2; /* Rho * Rho, integer part */
u32 rho_3ls; /* Rho parameter, <<3 */
u32 rho2_7ls; /* Rho^2, <<7 */
tcp: switch rtt estimations to usec resolution Upcoming congestion controls for TCP require usec resolution for RTT estimations. Millisecond resolution is simply not enough these days. FQ/pacing in DC environments also require this change for finer control and removal of bimodal behavior due to the current hack in tcp_update_pacing_rate() for 'small rtt' TCP_CONG_RTT_STAMP is no longer needed. As Julian Anastasov pointed out, we need to keep user compatibility : tcp_metrics used to export RTT and RTTVAR in msec resolution, so we added RTT_US and RTTVAR_US. An iproute2 patch is needed to use the new attributes if provided by the kernel. In this example ss command displays a srtt of 32 usecs (10Gbit link) lpk51:~# ./ss -i dst lpk52 Netid State Recv-Q Send-Q Local Address:Port Peer Address:Port tcp ESTAB 0 1 10.246.11.51:42959 10.246.11.52:64614 cubic wscale:6,6 rto:201 rtt:0.032/0.001 ato:40 mss:1448 cwnd:10 send 3620.0Mbps pacing_rate 7240.0Mbps unacked:1 rcv_rtt:993 rcv_space:29559 Updated iproute2 ip command displays : lpk51:~# ./ip tcp_metrics | grep 10.246.11.52 10.246.11.52 age 561.914sec cwnd 10 rtt 274us rttvar 213us source 10.246.11.51 Old binary displays : lpk51:~# ip tcp_metrics | grep 10.246.11.52 10.246.11.52 age 561.914sec cwnd 10 rtt 250us rttvar 125us source 10.246.11.51 With help from Julian Anastasov, Stephen Hemminger and Yuchung Cheng Signed-off-by: Eric Dumazet <edumazet@google.com> Acked-by: Neal Cardwell <ncardwell@google.com> Cc: Stephen Hemminger <stephen@networkplumber.org> Cc: Yuchung Cheng <ycheng@google.com> Cc: Larry Brakmo <brakmo@google.com> Cc: Julian Anastasov <ja@ssi.bg> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-27 05:02:48 +07:00
u32 minrtt_us; /* Minimum smoothed round trip time value seen */
};
/* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */
static int rtt0 = 25;
module_param(rtt0, int, 0644);
MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)");
/* This is called to refresh values for hybla parameters */
static inline void hybla_recalc_param (struct sock *sk)
{
struct hybla *ca = inet_csk_ca(sk);
tcp: switch rtt estimations to usec resolution Upcoming congestion controls for TCP require usec resolution for RTT estimations. Millisecond resolution is simply not enough these days. FQ/pacing in DC environments also require this change for finer control and removal of bimodal behavior due to the current hack in tcp_update_pacing_rate() for 'small rtt' TCP_CONG_RTT_STAMP is no longer needed. As Julian Anastasov pointed out, we need to keep user compatibility : tcp_metrics used to export RTT and RTTVAR in msec resolution, so we added RTT_US and RTTVAR_US. An iproute2 patch is needed to use the new attributes if provided by the kernel. In this example ss command displays a srtt of 32 usecs (10Gbit link) lpk51:~# ./ss -i dst lpk52 Netid State Recv-Q Send-Q Local Address:Port Peer Address:Port tcp ESTAB 0 1 10.246.11.51:42959 10.246.11.52:64614 cubic wscale:6,6 rto:201 rtt:0.032/0.001 ato:40 mss:1448 cwnd:10 send 3620.0Mbps pacing_rate 7240.0Mbps unacked:1 rcv_rtt:993 rcv_space:29559 Updated iproute2 ip command displays : lpk51:~# ./ip tcp_metrics | grep 10.246.11.52 10.246.11.52 age 561.914sec cwnd 10 rtt 274us rttvar 213us source 10.246.11.51 Old binary displays : lpk51:~# ip tcp_metrics | grep 10.246.11.52 10.246.11.52 age 561.914sec cwnd 10 rtt 250us rttvar 125us source 10.246.11.51 With help from Julian Anastasov, Stephen Hemminger and Yuchung Cheng Signed-off-by: Eric Dumazet <edumazet@google.com> Acked-by: Neal Cardwell <ncardwell@google.com> Cc: Stephen Hemminger <stephen@networkplumber.org> Cc: Yuchung Cheng <ycheng@google.com> Cc: Larry Brakmo <brakmo@google.com> Cc: Julian Anastasov <ja@ssi.bg> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-27 05:02:48 +07:00
ca->rho_3ls = max_t(u32,
tcp_sk(sk)->srtt_us / (rtt0 * USEC_PER_MSEC),
8U);
ca->rho = ca->rho_3ls >> 3;
ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
ca->rho2 = ca->rho2_7ls >> 7;
}
static void hybla_init(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct hybla *ca = inet_csk_ca(sk);
ca->rho = 0;
ca->rho2 = 0;
ca->rho_3ls = 0;
ca->rho2_7ls = 0;
ca->snd_cwnd_cents = 0;
ca->hybla_en = true;
tp->snd_cwnd = 2;
tp->snd_cwnd_clamp = 65535;
/* 1st Rho measurement based on initial srtt */
hybla_recalc_param(sk);
/* set minimum rtt as this is the 1st ever seen */
tcp: switch rtt estimations to usec resolution Upcoming congestion controls for TCP require usec resolution for RTT estimations. Millisecond resolution is simply not enough these days. FQ/pacing in DC environments also require this change for finer control and removal of bimodal behavior due to the current hack in tcp_update_pacing_rate() for 'small rtt' TCP_CONG_RTT_STAMP is no longer needed. As Julian Anastasov pointed out, we need to keep user compatibility : tcp_metrics used to export RTT and RTTVAR in msec resolution, so we added RTT_US and RTTVAR_US. An iproute2 patch is needed to use the new attributes if provided by the kernel. In this example ss command displays a srtt of 32 usecs (10Gbit link) lpk51:~# ./ss -i dst lpk52 Netid State Recv-Q Send-Q Local Address:Port Peer Address:Port tcp ESTAB 0 1 10.246.11.51:42959 10.246.11.52:64614 cubic wscale:6,6 rto:201 rtt:0.032/0.001 ato:40 mss:1448 cwnd:10 send 3620.0Mbps pacing_rate 7240.0Mbps unacked:1 rcv_rtt:993 rcv_space:29559 Updated iproute2 ip command displays : lpk51:~# ./ip tcp_metrics | grep 10.246.11.52 10.246.11.52 age 561.914sec cwnd 10 rtt 274us rttvar 213us source 10.246.11.51 Old binary displays : lpk51:~# ip tcp_metrics | grep 10.246.11.52 10.246.11.52 age 561.914sec cwnd 10 rtt 250us rttvar 125us source 10.246.11.51 With help from Julian Anastasov, Stephen Hemminger and Yuchung Cheng Signed-off-by: Eric Dumazet <edumazet@google.com> Acked-by: Neal Cardwell <ncardwell@google.com> Cc: Stephen Hemminger <stephen@networkplumber.org> Cc: Yuchung Cheng <ycheng@google.com> Cc: Larry Brakmo <brakmo@google.com> Cc: Julian Anastasov <ja@ssi.bg> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-27 05:02:48 +07:00
ca->minrtt_us = tp->srtt_us;
tp->snd_cwnd = ca->rho;
}
static void hybla_state(struct sock *sk, u8 ca_state)
{
struct hybla *ca = inet_csk_ca(sk);
ca->hybla_en = (ca_state == TCP_CA_Open);
}
static inline u32 hybla_fraction(u32 odds)
{
static const u32 fractions[] = {
128, 139, 152, 165, 181, 197, 215, 234,
};
return (odds < ARRAY_SIZE(fractions)) ? fractions[odds] : 128;
}
/* TCP Hybla main routine.
* This is the algorithm behavior:
* o Recalc Hybla parameters if min_rtt has changed
* o Give cwnd a new value based on the model proposed
* o remember increments <1
*/
static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct hybla *ca = inet_csk_ca(sk);
u32 increment, odd, rho_fractions;
int is_slowstart = 0;
/* Recalculate rho only if this srtt is the lowest */
tcp: switch rtt estimations to usec resolution Upcoming congestion controls for TCP require usec resolution for RTT estimations. Millisecond resolution is simply not enough these days. FQ/pacing in DC environments also require this change for finer control and removal of bimodal behavior due to the current hack in tcp_update_pacing_rate() for 'small rtt' TCP_CONG_RTT_STAMP is no longer needed. As Julian Anastasov pointed out, we need to keep user compatibility : tcp_metrics used to export RTT and RTTVAR in msec resolution, so we added RTT_US and RTTVAR_US. An iproute2 patch is needed to use the new attributes if provided by the kernel. In this example ss command displays a srtt of 32 usecs (10Gbit link) lpk51:~# ./ss -i dst lpk52 Netid State Recv-Q Send-Q Local Address:Port Peer Address:Port tcp ESTAB 0 1 10.246.11.51:42959 10.246.11.52:64614 cubic wscale:6,6 rto:201 rtt:0.032/0.001 ato:40 mss:1448 cwnd:10 send 3620.0Mbps pacing_rate 7240.0Mbps unacked:1 rcv_rtt:993 rcv_space:29559 Updated iproute2 ip command displays : lpk51:~# ./ip tcp_metrics | grep 10.246.11.52 10.246.11.52 age 561.914sec cwnd 10 rtt 274us rttvar 213us source 10.246.11.51 Old binary displays : lpk51:~# ip tcp_metrics | grep 10.246.11.52 10.246.11.52 age 561.914sec cwnd 10 rtt 250us rttvar 125us source 10.246.11.51 With help from Julian Anastasov, Stephen Hemminger and Yuchung Cheng Signed-off-by: Eric Dumazet <edumazet@google.com> Acked-by: Neal Cardwell <ncardwell@google.com> Cc: Stephen Hemminger <stephen@networkplumber.org> Cc: Yuchung Cheng <ycheng@google.com> Cc: Larry Brakmo <brakmo@google.com> Cc: Julian Anastasov <ja@ssi.bg> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-27 05:02:48 +07:00
if (tp->srtt_us < ca->minrtt_us) {
hybla_recalc_param(sk);
tcp: switch rtt estimations to usec resolution Upcoming congestion controls for TCP require usec resolution for RTT estimations. Millisecond resolution is simply not enough these days. FQ/pacing in DC environments also require this change for finer control and removal of bimodal behavior due to the current hack in tcp_update_pacing_rate() for 'small rtt' TCP_CONG_RTT_STAMP is no longer needed. As Julian Anastasov pointed out, we need to keep user compatibility : tcp_metrics used to export RTT and RTTVAR in msec resolution, so we added RTT_US and RTTVAR_US. An iproute2 patch is needed to use the new attributes if provided by the kernel. In this example ss command displays a srtt of 32 usecs (10Gbit link) lpk51:~# ./ss -i dst lpk52 Netid State Recv-Q Send-Q Local Address:Port Peer Address:Port tcp ESTAB 0 1 10.246.11.51:42959 10.246.11.52:64614 cubic wscale:6,6 rto:201 rtt:0.032/0.001 ato:40 mss:1448 cwnd:10 send 3620.0Mbps pacing_rate 7240.0Mbps unacked:1 rcv_rtt:993 rcv_space:29559 Updated iproute2 ip command displays : lpk51:~# ./ip tcp_metrics | grep 10.246.11.52 10.246.11.52 age 561.914sec cwnd 10 rtt 274us rttvar 213us source 10.246.11.51 Old binary displays : lpk51:~# ip tcp_metrics | grep 10.246.11.52 10.246.11.52 age 561.914sec cwnd 10 rtt 250us rttvar 125us source 10.246.11.51 With help from Julian Anastasov, Stephen Hemminger and Yuchung Cheng Signed-off-by: Eric Dumazet <edumazet@google.com> Acked-by: Neal Cardwell <ncardwell@google.com> Cc: Stephen Hemminger <stephen@networkplumber.org> Cc: Yuchung Cheng <ycheng@google.com> Cc: Larry Brakmo <brakmo@google.com> Cc: Julian Anastasov <ja@ssi.bg> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-27 05:02:48 +07:00
ca->minrtt_us = tp->srtt_us;
}
if (!tcp_is_cwnd_limited(sk))
return;
if (!ca->hybla_en) {
tcp_reno_cong_avoid(sk, ack, acked);
return;
}
if (ca->rho == 0)
hybla_recalc_param(sk);
rho_fractions = ca->rho_3ls - (ca->rho << 3);
if (tcp_in_slow_start(tp)) {
/*
* slow start
* INC = 2^RHO - 1
* This is done by splitting the rho parameter
* into 2 parts: an integer part and a fraction part.
* Inrement<<7 is estimated by doing:
* [2^(int+fract)]<<7
* that is equal to:
* (2^int) * [(2^fract) <<7]
* 2^int is straightly computed as 1<<int,
* while we will use hybla_slowstart_fraction_increment() to
* calculate 2^fract in a <<7 value.
*/
is_slowstart = 1;
increment = ((1 << min(ca->rho, 16U)) *
hybla_fraction(rho_fractions)) - 128;
} else {
/*
* congestion avoidance
* INC = RHO^2 / W
* as long as increment is estimated as (rho<<7)/window
* it already is <<7 and we can easily count its fractions.
*/
increment = ca->rho2_7ls / tp->snd_cwnd;
if (increment < 128)
tp->snd_cwnd_cnt++;
}
odd = increment % 128;
tp->snd_cwnd += increment >> 7;
ca->snd_cwnd_cents += odd;
/* check when fractions goes >=128 and increase cwnd by 1. */
while (ca->snd_cwnd_cents >= 128) {
tp->snd_cwnd++;
ca->snd_cwnd_cents -= 128;
tp->snd_cwnd_cnt = 0;
}
/* check when cwnd has not been incremented for a while */
if (increment == 0 && odd == 0 && tp->snd_cwnd_cnt >= tp->snd_cwnd) {
tp->snd_cwnd++;
tp->snd_cwnd_cnt = 0;
}
/* clamp down slowstart cwnd to ssthresh value. */
if (is_slowstart)
tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
}
static struct tcp_congestion_ops tcp_hybla __read_mostly = {
.init = hybla_init,
.ssthresh = tcp_reno_ssthresh,
.undo_cwnd = tcp_reno_undo_cwnd,
.cong_avoid = hybla_cong_avoid,
.set_state = hybla_state,
.owner = THIS_MODULE,
.name = "hybla"
};
static int __init hybla_register(void)
{
BUILD_BUG_ON(sizeof(struct hybla) > ICSK_CA_PRIV_SIZE);
return tcp_register_congestion_control(&tcp_hybla);
}
static void __exit hybla_unregister(void)
{
tcp_unregister_congestion_control(&tcp_hybla);
}
module_init(hybla_register);
module_exit(hybla_unregister);
MODULE_AUTHOR("Daniele Lacamera");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("TCP Hybla");