linux_dsm_epyc7002/samples/bpf/do_hbm_test.sh
brakmo 71634d7f92 bpf: Add support for fq's EDT to HBM
Adds support for fq's Earliest Departure Time to HBM (Host Bandwidth
Manager). Includes a new BPF program supporting EDT, and also updates
corresponding programs.

It will drop packets with an EDT of more than 500us in the future
unless the packet belongs to a flow with less than 2 packets in flight.
This is done so each flow has at least 2 packets in flight, so they
will not starve, and also to help prevent delayed ACK timeouts.

It will also work with ECN enabled traffic, where the packets will be
CE marked if their EDT is more than 50us in the future.

The table below shows some performance numbers. The flows are back to
back RPCS. One server sending to another, either 2 or 4 flows.
One flow is a 10KB RPC, the rest are 1MB RPCs. When there are more
than one flow of a given RPC size, the numbers represent averages.

The rate limit applies to all flows (they are in the same cgroup).
Tests ending with "-edt" ran with the new BPF program supporting EDT.
Tests ending with "-hbt" ran on top HBT qdisc with the specified rate
(i.e. no HBM). The other tests ran with the HBM BPF program included
in the HBM patch-set.

EDT has limited value when using DCTCP, but it helps in many cases when
using Cubic. It usually achieves larger link utilization and lower
99% latencies for the 1MB RPCs.
HBM ends up queueing a lot of packets with its default parameter values,
reducing the goodput of the 10KB RPCs and increasing their latency. Also,
the RTTs seen by the flows are quite large.

                         Aggr              10K  10K  10K   1MB  1MB  1MB
         Limit           rate drops  RTT  rate  P90  P99  rate  P90  P99
Test      rate  Flows    Mbps   %     us  Mbps   us   us  Mbps   ms   ms
--------  ----  -----    ---- -----  ---  ---- ---- ----  ---- ---- ----
cubic       1G    2       904  0.02  108   257  511  539   647 13.4 24.5
cubic-edt   1G    2       982  0.01  156   239  656  967   743 14.0 17.2
dctcp       1G    2       977  0.00  105   324  408  744   653 14.5 15.9
dctcp-edt   1G    2       981  0.01  142   321  417  811   660 15.7 17.0
cubic-htb   1G    2       919  0.00 1825    40 2822 4140   879  9.7  9.9

cubic     200M    2       155  0.30  220    81  532  655    74  283  450
cubic-edt 200M    2       188  0.02  222    87 1035 1095   101   84   85
dctcp     200M    2       188  0.03  111    77  912  939   111   76  325
dctcp-edt 200M    2       188  0.03  217    74 1416 1738   114   76   79
cubic-htb 200M    2       188  0.00 5015     8 14ms 15ms   180   48   50

cubic       1G    4       952  0.03  110   165  516  546   262   38  154
cubic-edt   1G    4       973  0.01  190   111 1034 1314   287   65   79
dctcp       1G    4       951  0.00  103   180  617  905   257   37   38
dctcp-edt   1G    4       967  0.00  163   151  732 1126   272   43   55
cubic-htb   1G    4       914  0.00 3249    13  7ms  8ms   300   29   34

cubic       5G    4      4236  0.00  134   305  490  624  1310   10   17
cubic-edt   5G    4      4865  0.00  156   306  425  759  1520   10   16
dctcp       5G    4      4936  0.00  128   485  221  409  1484    7    9
dctcp-edt   5G    4      4924  0.00  148   390  392  623  1508   11   26

v1 -> v2: Incorporated Andrii's suggestions
v2 -> v3: Incorporated Yonghong's suggestions
v3 -> v4: Removed credit update that is not needed

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2019-07-03 15:03:00 +02:00

443 lines
12 KiB
Bash
Executable File

#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# Copyright (c) 2019 Facebook
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of version 2 of the GNU General Public
# License as published by the Free Software Foundation.
Usage() {
echo "Script for testing HBM (Host Bandwidth Manager) framework."
echo "It creates a cgroup to use for testing and load a BPF program to limit"
echo "egress or ingress bandwidht. It then uses iperf3 or netperf to create"
echo "loads. The output is the goodput in Mbps (unless -D was used)."
echo ""
echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>]"
echo " [-D] [-d=<delay>|--delay=<delay>] [--debug] [-E] [--edt]"
echo " [-f=<#flows>|--flows=<#flows>] [-h] [-i=<id>|--id=<id >]"
echo " [-l] [-N] [--no_cn] [-p=<port>|--port=<port>] [-P]"
echo " [-q=<qdisc>] [-R] [-s=<server>|--server=<server]"
echo " [-S|--stats] -t=<time>|--time=<time>] [-w] [cubic|dctcp]"
echo " Where:"
echo " out egress (default)"
echo " -b or --bpf BPF program filename to load and attach."
echo " Default is hbm_out_kern.o for egress,"
echo " -c or -cc TCP congestion control (cubic or dctcp)"
echo " --debug print BPF trace buffer"
echo " -d or --delay add a delay in ms using netem"
echo " -D In addition to the goodput in Mbps, it also outputs"
echo " other detailed information. This information is"
echo " test dependent (i.e. iperf3 or netperf)."
echo " -E enable ECN (not required for dctcp)"
echo " --edt use fq's Earliest Departure Time (requires fq)"
echo " -f or --flows number of concurrent flows (default=1)"
echo " -i or --id cgroup id (an integer, default is 1)"
echo " -N use netperf instead of iperf3"
echo " --no_cn Do not return CN notifications"
echo " -l do not limit flows using loopback"
echo " -h Help"
echo " -p or --port iperf3 port (default is 5201)"
echo " -P use an iperf3 instance for each flow"
echo " -q use the specified qdisc"
echo " -r or --rate rate in Mbps (default 1s 1Gbps)"
echo " -R Use TCP_RR for netperf. 1st flow has req"
echo " size of 10KB, rest of 1MB. Reply in all"
echo " cases is 1 byte."
echo " More detailed output for each flow can be found"
echo " in the files netperf.<cg>.<flow>, where <cg> is the"
echo " cgroup id as specified with the -i flag, and <flow>"
echo " is the flow id starting at 1 and increasing by 1 for"
echo " flow (as specified by -f)."
echo " -s or --server hostname of netperf server. Used to create netperf"
echo " test traffic between to hosts (default is within host)"
echo " netserver must be running on the host."
echo " -S or --stats whether to update hbm stats (default is yes)."
echo " -t or --time duration of iperf3 in seconds (default=5)"
echo " -w Work conserving flag. cgroup can increase its"
echo " bandwidth beyond the rate limit specified"
echo " while there is available bandwidth. Current"
echo " implementation assumes there is only one NIC"
echo " (eth0), but can be extended to support multiple"
echo " NICs."
echo " cubic or dctcp specify which TCP CC to use"
echo " "
exit
}
#set -x
debug_flag=0
args="$@"
name="$0"
netem=0
cc=x
dir="-o"
dir_name="out"
dur=5
flows=1
id=1
prog=""
port=5201
rate=1000
multi_iperf=0
flow_cnt=1
use_netperf=0
rr=0
ecn=0
details=0
server=""
qdisc=""
flags=""
do_stats=0
function start_hbm () {
rm -f hbm.out
echo "./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog" > hbm.out
echo " " >> hbm.out
./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog >> hbm.out 2>&1 &
echo $!
}
processArgs () {
for i in $args ; do
case $i in
# Support for upcomming ingress rate limiting
#in) # support for upcoming ingress rate limiting
# dir="-i"
# dir_name="in"
# ;;
out)
dir="-o"
dir_name="out"
;;
-b=*|--bpf=*)
prog="${i#*=}"
;;
-c=*|--cc=*)
cc="${i#*=}"
;;
--no_cn)
flags="$flags --no_cn"
;;
--debug)
flags="$flags -d"
debug_flag=1
;;
-d=*|--delay=*)
netem="${i#*=}"
;;
-D)
details=1
;;
-E)
ecn=1
;;
--edt)
flags="$flags --edt"
qdisc="fq"
;;
-f=*|--flows=*)
flows="${i#*=}"
;;
-i=*|--id=*)
id="${i#*=}"
;;
-l)
flags="$flags -l"
;;
-N)
use_netperf=1
;;
-p=*|--port=*)
port="${i#*=}"
;;
-P)
multi_iperf=1
;;
-q=*)
qdisc="${i#*=}"
;;
-r=*|--rate=*)
rate="${i#*=}"
;;
-R)
rr=1
;;
-s=*|--server=*)
server="${i#*=}"
;;
-S|--stats)
flags="$flags -s"
do_stats=1
;;
-t=*|--time=*)
dur="${i#*=}"
;;
-w)
flags="$flags -w"
;;
cubic)
cc=cubic
;;
dctcp)
cc=dctcp
;;
*)
echo "Unknown arg:$i"
Usage
;;
esac
done
}
processArgs
if [ $debug_flag -eq 1 ] ; then
rm -f hbm_out.log
fi
hbm_pid=$(start_hbm)
usleep 100000
host=`hostname`
cg_base_dir=/sys/fs/cgroup
cg_dir="$cg_base_dir/cgroup-test-work-dir/hbm$id"
echo $$ >> $cg_dir/cgroup.procs
ulimit -l unlimited
rm -f ss.out
rm -f hbm.[0-9]*.$dir_name
if [ $ecn -ne 0 ] ; then
sysctl -w -q -n net.ipv4.tcp_ecn=1
fi
if [ $use_netperf -eq 0 ] ; then
cur_cc=`sysctl -n net.ipv4.tcp_congestion_control`
if [ "$cc" != "x" ] ; then
sysctl -w -q -n net.ipv4.tcp_congestion_control=$cc
fi
fi
if [ "$netem" -ne "0" ] ; then
if [ "$qdisc" != "" ] ; then
echo "WARNING: Ignoring -q options because -d option used"
fi
tc qdisc del dev lo root > /dev/null 2>&1
tc qdisc add dev lo root netem delay $netem\ms > /dev/null 2>&1
elif [ "$qdisc" != "" ] ; then
tc qdisc del dev eth0 root > /dev/null 2>&1
tc qdisc add dev eth0 root $qdisc > /dev/null 2>&1
fi
n=0
m=$[$dur * 5]
hn="::1"
if [ $use_netperf -ne 0 ] ; then
if [ "$server" != "" ] ; then
hn=$server
fi
fi
( ping6 -i 0.2 -c $m $hn > ping.out 2>&1 ) &
if [ $use_netperf -ne 0 ] ; then
begNetserverPid=`ps ax | grep netserver | grep --invert-match "grep" | \
awk '{ print $1 }'`
if [ "$begNetserverPid" == "" ] ; then
if [ "$server" == "" ] ; then
( ./netserver > /dev/null 2>&1) &
usleep 100000
fi
fi
flow_cnt=1
if [ "$server" == "" ] ; then
np_server=$host
else
np_server=$server
fi
if [ "$cc" == "x" ] ; then
np_cc=""
else
np_cc="-K $cc,$cc"
fi
replySize=1
while [ $flow_cnt -le $flows ] ; do
if [ $rr -ne 0 ] ; then
reqSize=1M
if [ $flow_cnt -eq 1 ] ; then
reqSize=10K
fi
if [ "$dir" == "-i" ] ; then
replySize=$reqSize
reqSize=1
fi
( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR -- -r $reqSize,$replySize $np_cc -k P50_lATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,REMOTE_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,LOCAL_RECV_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) &
else
if [ "$dir" == "-i" ] ; then
( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR -- -r 1,10M $np_cc -k P50_LATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REMOTE_TRANSPORT_RETRANS,REMOTE_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) &
else
( ./netperf -H $np_server -l $dur -f m -j -t TCP_STREAM -- $np_cc -k P50_lATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) &
fi
fi
flow_cnt=$[flow_cnt+1]
done
# sleep for duration of test (plus some buffer)
n=$[dur+2]
sleep $n
# force graceful termination of netperf
pids=`pgrep netperf`
for p in $pids ; do
kill -SIGALRM $p
done
flow_cnt=1
rate=0
if [ $details -ne 0 ] ; then
echo ""
echo "Details for HBM in cgroup $id"
if [ $do_stats -eq 1 ] ; then
if [ -e hbm.$id.$dir_name ] ; then
cat hbm.$id.$dir_name
fi
fi
fi
while [ $flow_cnt -le $flows ] ; do
if [ "$dir" == "-i" ] ; then
r=`cat netperf.$id.$flow_cnt | grep -o "REMOTE_SEND_THROUGHPUT=[0-9]*" | grep -o "[0-9]*"`
else
r=`cat netperf.$id.$flow_cnt | grep -o "LOCAL_SEND_THROUGHPUT=[0-9]*" | grep -o "[0-9]*"`
fi
echo "rate for flow $flow_cnt: $r"
rate=$[rate+r]
if [ $details -ne 0 ] ; then
echo "-----"
echo "Details for cgroup $id, flow $flow_cnt"
cat netperf.$id.$flow_cnt
fi
flow_cnt=$[flow_cnt+1]
done
if [ $details -ne 0 ] ; then
echo ""
delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"`
echo "PING AVG DELAY:$delay"
echo "AGGREGATE_GOODPUT:$rate"
else
echo $rate
fi
elif [ $multi_iperf -eq 0 ] ; then
(iperf3 -s -p $port -1 > /dev/null 2>&1) &
usleep 100000
iperf3 -c $host -p $port -i 0 -P $flows -f m -t $dur > iperf.$id
rates=`grep receiver iperf.$id | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*"`
rate=`echo $rates | grep -o "[0-9]*$"`
if [ $details -ne 0 ] ; then
echo ""
echo "Details for HBM in cgroup $id"
if [ $do_stats -eq 1 ] ; then
if [ -e hbm.$id.$dir_name ] ; then
cat hbm.$id.$dir_name
fi
fi
delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"`
echo "PING AVG DELAY:$delay"
echo "AGGREGATE_GOODPUT:$rate"
else
echo $rate
fi
else
flow_cnt=1
while [ $flow_cnt -le $flows ] ; do
(iperf3 -s -p $port -1 > /dev/null 2>&1) &
( iperf3 -c $host -p $port -i 0 -P 1 -f m -t $dur | grep receiver | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*" | grep -o "[0-9]*$" > iperf3.$id.$flow_cnt ) &
port=$[port+1]
flow_cnt=$[flow_cnt+1]
done
n=$[dur+1]
sleep $n
flow_cnt=1
rate=0
if [ $details -ne 0 ] ; then
echo ""
echo "Details for HBM in cgroup $id"
if [ $do_stats -eq 1 ] ; then
if [ -e hbm.$id.$dir_name ] ; then
cat hbm.$id.$dir_name
fi
fi
fi
while [ $flow_cnt -le $flows ] ; do
r=`cat iperf3.$id.$flow_cnt`
# echo "rate for flow $flow_cnt: $r"
if [ $details -ne 0 ] ; then
echo "Rate for cgroup $id, flow $flow_cnt LOCAL_SEND_THROUGHPUT=$r"
fi
rate=$[rate+r]
flow_cnt=$[flow_cnt+1]
done
if [ $details -ne 0 ] ; then
delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"`
echo "PING AVG DELAY:$delay"
echo "AGGREGATE_GOODPUT:$rate"
else
echo $rate
fi
fi
if [ $use_netperf -eq 0 ] ; then
sysctl -w -q -n net.ipv4.tcp_congestion_control=$cur_cc
fi
if [ $ecn -ne 0 ] ; then
sysctl -w -q -n net.ipv4.tcp_ecn=0
fi
if [ "$netem" -ne "0" ] ; then
tc qdisc del dev lo root > /dev/null 2>&1
fi
if [ "$qdisc" != "" ] ; then
tc qdisc del dev eth0 root > /dev/null 2>&1
fi
sleep 2
hbmPid=`ps ax | grep "hbm " | grep --invert-match "grep" | awk '{ print $1 }'`
if [ "$hbmPid" == "$hbm_pid" ] ; then
kill $hbm_pid
fi
sleep 1
# Detach any BPF programs that may have lingered
ttx=`bpftool cgroup tree | grep hbm`
v=2
for x in $ttx ; do
if [ "${x:0:36}" == "/sys/fs/cgroup/cgroup-test-work-dir/" ] ; then
cg=$x ; v=0
else
if [ $v -eq 0 ] ; then
id=$x ; v=1
else
if [ $v -eq 1 ] ; then
type=$x ; bpftool cgroup detach $cg $type id $id
v=0
fi
fi
fi
done
if [ $use_netperf -ne 0 ] ; then
if [ "$server" == "" ] ; then
if [ "$begNetserverPid" == "" ] ; then
netserverPid=`ps ax | grep netserver | grep --invert-match "grep" | awk '{ print $1 }'`
if [ "$netserverPid" != "" ] ; then
kill $netserverPid
fi
fi
fi
fi
exit