Merge branch 'xdp-vlan'

Jesper Dangaard Brouer says:

====================
While implementing PoC building blocks for eBPF code XDP+TC that can
manipulate VLANs headers, I discovered a bug in generic-XDP.

The fix should be backported to stable kernels.  Even-though
generic-XDP was introduced in v4.12, I think the bug is not exposed
until v4.14 in the mentined fixes commit.
====================

Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
This commit is contained in:
Alexei Starovoitov 2018-10-09 21:59:09 -07:00
commit a0f99e6796
5 changed files with 509 additions and 2 deletions

View File

@ -4258,6 +4258,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
struct netdev_rx_queue *rxqueue;
void *orig_data, *orig_data_end;
u32 metalen, act = XDP_DROP;
__be16 orig_eth_type;
struct ethhdr *eth;
bool orig_bcast;
int hlen, off;
u32 mac_len;
@ -4298,6 +4301,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
xdp->data_hard_start = skb->data - skb_headroom(skb);
orig_data_end = xdp->data_end;
orig_data = xdp->data;
eth = (struct ethhdr *)xdp->data;
orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
orig_eth_type = eth->h_proto;
rxqueue = netif_get_rxqueue(skb);
xdp->rxq = &rxqueue->xdp_rxq;
@ -4321,6 +4327,14 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
}
/* check if XDP changed eth hdr such SKB needs update */
eth = (struct ethhdr *)xdp->data;
if ((orig_eth_type != eth->h_proto) ||
(orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
__skb_push(skb, ETH_HLEN);
skb->protocol = eth_type_trans(skb, skb->dev);
}
switch (act) {
case XDP_REDIRECT:
case XDP_TX:

View File

@ -36,7 +36,8 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \
get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \
test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o test_sk_lookup_kern.o
test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o \
test_sk_lookup_kern.o test_xdp_vlan.o
# Order correspond to 'make run_tests' order
TEST_PROGS := test_kmod.sh \
@ -49,7 +50,8 @@ TEST_PROGS := test_kmod.sh \
test_lwt_seg6local.sh \
test_lirc_mode2.sh \
test_skb_cgroup_id.sh \
test_flow_dissector.sh
test_flow_dissector.sh \
test_xdp_vlan.sh
# Compile but not part of 'make run_tests'
TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user \

View File

@ -155,6 +155,10 @@ static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx,
(void *) BPF_FUNC_sk_lookup_udp;
static int (*bpf_sk_release)(struct bpf_sock *sk) =
(void *) BPF_FUNC_sk_release;
static int (*bpf_skb_vlan_push)(void *ctx, __be16 vlan_proto, __u16 vlan_tci) =
(void *) BPF_FUNC_skb_vlan_push;
static int (*bpf_skb_vlan_pop)(void *ctx) =
(void *) BPF_FUNC_skb_vlan_pop;
/* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions

View File

@ -0,0 +1,292 @@
/* SPDX-License-Identifier: GPL-2.0
* Copyright(c) 2018 Jesper Dangaard Brouer.
*
* XDP/TC VLAN manipulation example
*
* GOTCHA: Remember to disable NIC hardware offloading of VLANs,
* else the VLAN tags are NOT inlined in the packet payload:
*
* # ethtool -K ixgbe2 rxvlan off
*
* Verify setting:
* # ethtool -k ixgbe2 | grep rx-vlan-offload
* rx-vlan-offload: off
*
*/
#include <stddef.h>
#include <stdbool.h>
#include <string.h>
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/pkt_cls.h>
#include "bpf_helpers.h"
#include "bpf_endian.h"
/* linux/if_vlan.h have not exposed this as UAPI, thus mirror some here
*
* struct vlan_hdr - vlan header
* @h_vlan_TCI: priority and VLAN ID
* @h_vlan_encapsulated_proto: packet type ID or len
*/
struct _vlan_hdr {
__be16 h_vlan_TCI;
__be16 h_vlan_encapsulated_proto;
};
#define VLAN_PRIO_MASK 0xe000 /* Priority Code Point */
#define VLAN_PRIO_SHIFT 13
#define VLAN_CFI_MASK 0x1000 /* Canonical Format Indicator */
#define VLAN_TAG_PRESENT VLAN_CFI_MASK
#define VLAN_VID_MASK 0x0fff /* VLAN Identifier */
#define VLAN_N_VID 4096
struct parse_pkt {
__u16 l3_proto;
__u16 l3_offset;
__u16 vlan_outer;
__u16 vlan_inner;
__u8 vlan_outer_offset;
__u8 vlan_inner_offset;
};
char _license[] SEC("license") = "GPL";
static __always_inline
bool parse_eth_frame(struct ethhdr *eth, void *data_end, struct parse_pkt *pkt)
{
__u16 eth_type;
__u8 offset;
offset = sizeof(*eth);
/* Make sure packet is large enough for parsing eth + 2 VLAN headers */
if ((void *)eth + offset + (2*sizeof(struct _vlan_hdr)) > data_end)
return false;
eth_type = eth->h_proto;
/* Handle outer VLAN tag */
if (eth_type == bpf_htons(ETH_P_8021Q)
|| eth_type == bpf_htons(ETH_P_8021AD)) {
struct _vlan_hdr *vlan_hdr;
vlan_hdr = (void *)eth + offset;
pkt->vlan_outer_offset = offset;
pkt->vlan_outer = bpf_ntohs(vlan_hdr->h_vlan_TCI)
& VLAN_VID_MASK;
eth_type = vlan_hdr->h_vlan_encapsulated_proto;
offset += sizeof(*vlan_hdr);
}
/* Handle inner (double) VLAN tag */
if (eth_type == bpf_htons(ETH_P_8021Q)
|| eth_type == bpf_htons(ETH_P_8021AD)) {
struct _vlan_hdr *vlan_hdr;
vlan_hdr = (void *)eth + offset;
pkt->vlan_inner_offset = offset;
pkt->vlan_inner = bpf_ntohs(vlan_hdr->h_vlan_TCI)
& VLAN_VID_MASK;
eth_type = vlan_hdr->h_vlan_encapsulated_proto;
offset += sizeof(*vlan_hdr);
}
pkt->l3_proto = bpf_ntohs(eth_type); /* Convert to host-byte-order */
pkt->l3_offset = offset;
return true;
}
/* Hint, VLANs are choosen to hit network-byte-order issues */
#define TESTVLAN 4011 /* 0xFAB */
// #define TO_VLAN 4000 /* 0xFA0 (hint 0xOA0 = 160) */
SEC("xdp_drop_vlan_4011")
int xdp_prognum0(struct xdp_md *ctx)
{
void *data_end = (void *)(long)ctx->data_end;
void *data = (void *)(long)ctx->data;
struct parse_pkt pkt = { 0 };
if (!parse_eth_frame(data, data_end, &pkt))
return XDP_ABORTED;
/* Drop specific VLAN ID example */
if (pkt.vlan_outer == TESTVLAN)
return XDP_ABORTED;
/*
* Using XDP_ABORTED makes it possible to record this event,
* via tracepoint xdp:xdp_exception like:
* # perf record -a -e xdp:xdp_exception
* # perf script
*/
return XDP_PASS;
}
/*
Commands to setup VLAN on Linux to test packets gets dropped:
export ROOTDEV=ixgbe2
export VLANID=4011
ip link add link $ROOTDEV name $ROOTDEV.$VLANID type vlan id $VLANID
ip link set dev $ROOTDEV.$VLANID up
ip link set dev $ROOTDEV mtu 1508
ip addr add 100.64.40.11/24 dev $ROOTDEV.$VLANID
Load prog with ip tool:
ip link set $ROOTDEV xdp off
ip link set $ROOTDEV xdp object xdp_vlan01_kern.o section xdp_drop_vlan_4011
*/
/* Changing VLAN to zero, have same practical effect as removing the VLAN. */
#define TO_VLAN 0
SEC("xdp_vlan_change")
int xdp_prognum1(struct xdp_md *ctx)
{
void *data_end = (void *)(long)ctx->data_end;
void *data = (void *)(long)ctx->data;
struct parse_pkt pkt = { 0 };
if (!parse_eth_frame(data, data_end, &pkt))
return XDP_ABORTED;
/* Change specific VLAN ID */
if (pkt.vlan_outer == TESTVLAN) {
struct _vlan_hdr *vlan_hdr = data + pkt.vlan_outer_offset;
/* Modifying VLAN, preserve top 4 bits */
vlan_hdr->h_vlan_TCI =
bpf_htons((bpf_ntohs(vlan_hdr->h_vlan_TCI) & 0xf000)
| TO_VLAN);
}
return XDP_PASS;
}
/*
* Show XDP+TC can cooperate, on creating a VLAN rewriter.
* 1. Create a XDP prog that can "pop"/remove a VLAN header.
* 2. Create a TC-bpf prog that egress can add a VLAN header.
*/
#ifndef ETH_ALEN /* Ethernet MAC address length */
#define ETH_ALEN 6 /* bytes */
#endif
#define VLAN_HDR_SZ 4 /* bytes */
SEC("xdp_vlan_remove_outer")
int xdp_prognum2(struct xdp_md *ctx)
{
void *data_end = (void *)(long)ctx->data_end;
void *data = (void *)(long)ctx->data;
struct parse_pkt pkt = { 0 };
char *dest;
if (!parse_eth_frame(data, data_end, &pkt))
return XDP_ABORTED;
/* Skip packet if no outer VLAN was detected */
if (pkt.vlan_outer_offset == 0)
return XDP_PASS;
/* Moving Ethernet header, dest overlap with src, memmove handle this */
dest = data;
dest+= VLAN_HDR_SZ;
/*
* Notice: Taking over vlan_hdr->h_vlan_encapsulated_proto, by
* only moving two MAC addrs (12 bytes), not overwriting last 2 bytes
*/
__builtin_memmove(dest, data, ETH_ALEN * 2);
/* Note: LLVM built-in memmove inlining require size to be constant */
/* Move start of packet header seen by Linux kernel stack */
bpf_xdp_adjust_head(ctx, VLAN_HDR_SZ);
return XDP_PASS;
}
static __always_inline
void shift_mac_4bytes_16bit(void *data)
{
__u16 *p = data;
p[7] = p[5]; /* delete p[7] was vlan_hdr->h_vlan_TCI */
p[6] = p[4]; /* delete p[6] was ethhdr->h_proto */
p[5] = p[3];
p[4] = p[2];
p[3] = p[1];
p[2] = p[0];
}
static __always_inline
void shift_mac_4bytes_32bit(void *data)
{
__u32 *p = data;
/* Assuming VLAN hdr present. The 4 bytes in p[3] that gets
* overwritten, is ethhdr->h_proto and vlan_hdr->h_vlan_TCI.
* The vlan_hdr->h_vlan_encapsulated_proto take over role as
* ethhdr->h_proto.
*/
p[3] = p[2];
p[2] = p[1];
p[1] = p[0];
}
SEC("xdp_vlan_remove_outer2")
int xdp_prognum3(struct xdp_md *ctx)
{
void *data_end = (void *)(long)ctx->data_end;
void *data = (void *)(long)ctx->data;
struct ethhdr *orig_eth = data;
struct parse_pkt pkt = { 0 };
if (!parse_eth_frame(orig_eth, data_end, &pkt))
return XDP_ABORTED;
/* Skip packet if no outer VLAN was detected */
if (pkt.vlan_outer_offset == 0)
return XDP_PASS;
/* Simply shift down MAC addrs 4 bytes, overwrite h_proto + TCI */
shift_mac_4bytes_32bit(data);
/* Move start of packet header seen by Linux kernel stack */
bpf_xdp_adjust_head(ctx, VLAN_HDR_SZ);
return XDP_PASS;
}
/*=====================================
* BELOW: TC-hook based ebpf programs
* ====================================
* The TC-clsact eBPF programs (currently) need to be attach via TC commands
*/
SEC("tc_vlan_push")
int _tc_progA(struct __sk_buff *ctx)
{
bpf_skb_vlan_push(ctx, bpf_htons(ETH_P_8021Q), TESTVLAN);
return TC_ACT_OK;
}
/*
Commands to setup TC to use above bpf prog:
export ROOTDEV=ixgbe2
export FILE=xdp_vlan01_kern.o
# Re-attach clsact to clear/flush existing role
tc qdisc del dev $ROOTDEV clsact 2> /dev/null ;\
tc qdisc add dev $ROOTDEV clsact
# Attach BPF prog EGRESS
tc filter add dev $ROOTDEV egress \
prio 1 handle 1 bpf da obj $FILE sec tc_vlan_push
tc filter show dev $ROOTDEV egress
*/

View File

@ -0,0 +1,195 @@
#!/bin/bash
TESTNAME=xdp_vlan
usage() {
echo "Testing XDP + TC eBPF VLAN manipulations: $TESTNAME"
echo ""
echo "Usage: $0 [-vfh]"
echo " -v | --verbose : Verbose"
echo " --flush : Flush before starting (e.g. after --interactive)"
echo " --interactive : Keep netns setup running after test-run"
echo ""
}
cleanup()
{
local status=$?
if [ "$status" = "0" ]; then
echo "selftests: $TESTNAME [PASS]";
else
echo "selftests: $TESTNAME [FAILED]";
fi
if [ -n "$INTERACTIVE" ]; then
echo "Namespace setup still active explore with:"
echo " ip netns exec ns1 bash"
echo " ip netns exec ns2 bash"
exit $status
fi
set +e
ip link del veth1 2> /dev/null
ip netns del ns1 2> /dev/null
ip netns del ns2 2> /dev/null
}
# Using external program "getopt" to get --long-options
OPTIONS=$(getopt -o hvfi: \
--long verbose,flush,help,interactive,debug -- "$@")
if (( $? != 0 )); then
usage
echo "selftests: $TESTNAME [FAILED] Error calling getopt, unknown option?"
exit 2
fi
eval set -- "$OPTIONS"
## --- Parse command line arguments / parameters ---
while true; do
case "$1" in
-v | --verbose)
export VERBOSE=yes
shift
;;
-i | --interactive | --debug )
INTERACTIVE=yes
shift
;;
-f | --flush )
cleanup
shift
;;
-- )
shift
break
;;
-h | --help )
usage;
echo "selftests: $TESTNAME [SKIP] usage help info requested"
exit 0
;;
* )
shift
break
;;
esac
done
if [ "$EUID" -ne 0 ]; then
echo "selftests: $TESTNAME [FAILED] need root privileges"
exit 1
fi
ip link set dev lo xdp off 2>/dev/null > /dev/null
if [ $? -ne 0 ];then
echo "selftests: $TESTNAME [SKIP] need ip xdp support"
exit 0
fi
# Interactive mode likely require us to cleanup netns
if [ -n "$INTERACTIVE" ]; then
ip link del veth1 2> /dev/null
ip netns del ns1 2> /dev/null
ip netns del ns2 2> /dev/null
fi
# Exit on failure
set -e
# Some shell-tools dependencies
which ip > /dev/null
which tc > /dev/null
which ethtool > /dev/null
# Make rest of shell verbose, showing comments as doc/info
if [ -n "$VERBOSE" ]; then
set -v
fi
# Create two namespaces
ip netns add ns1
ip netns add ns2
# Run cleanup if failing or on kill
trap cleanup 0 2 3 6 9
# Create veth pair
ip link add veth1 type veth peer name veth2
# Move veth1 and veth2 into the respective namespaces
ip link set veth1 netns ns1
ip link set veth2 netns ns2
# NOTICE: XDP require VLAN header inside packet payload
# - Thus, disable VLAN offloading driver features
# - For veth REMEMBER TX side VLAN-offload
#
# Disable rx-vlan-offload (mostly needed on ns1)
ip netns exec ns1 ethtool -K veth1 rxvlan off
ip netns exec ns2 ethtool -K veth2 rxvlan off
#
# Disable tx-vlan-offload (mostly needed on ns2)
ip netns exec ns2 ethtool -K veth2 txvlan off
ip netns exec ns1 ethtool -K veth1 txvlan off
export IPADDR1=100.64.41.1
export IPADDR2=100.64.41.2
# In ns1/veth1 add IP-addr on plain net_device
ip netns exec ns1 ip addr add ${IPADDR1}/24 dev veth1
ip netns exec ns1 ip link set veth1 up
# In ns2/veth2 create VLAN device
export VLAN=4011
export DEVNS2=veth2
ip netns exec ns2 ip link add link $DEVNS2 name $DEVNS2.$VLAN type vlan id $VLAN
ip netns exec ns2 ip addr add ${IPADDR2}/24 dev $DEVNS2.$VLAN
ip netns exec ns2 ip link set $DEVNS2 up
ip netns exec ns2 ip link set $DEVNS2.$VLAN up
# Bringup lo in netns (to avoids confusing people using --interactive)
ip netns exec ns1 ip link set lo up
ip netns exec ns2 ip link set lo up
# At this point, the hosts cannot reach each-other,
# because ns2 are using VLAN tags on the packets.
ip netns exec ns2 sh -c 'ping -W 1 -c 1 100.64.41.1 || echo "Okay ping fails"'
# Now we can use the test_xdp_vlan.c program to pop/push these VLAN tags
# ----------------------------------------------------------------------
# In ns1: ingress use XDP to remove VLAN tags
export DEVNS1=veth1
export FILE=test_xdp_vlan.o
# First test: Remove VLAN by setting VLAN ID 0, using "xdp_vlan_change"
export XDP_PROG=xdp_vlan_change
ip netns exec ns1 ip link set $DEVNS1 xdp object $FILE section $XDP_PROG
# In ns1: egress use TC to add back VLAN tag 4011
# (del cmd)
# tc qdisc del dev $DEVNS1 clsact 2> /dev/null
#
ip netns exec ns1 tc qdisc add dev $DEVNS1 clsact
ip netns exec ns1 tc filter add dev $DEVNS1 egress \
prio 1 handle 1 bpf da obj $FILE sec tc_vlan_push
# Now the namespaces can reach each-other, test with ping:
ip netns exec ns2 ping -W 2 -c 3 $IPADDR1
ip netns exec ns1 ping -W 2 -c 3 $IPADDR2
# Second test: Replace xdp prog, that fully remove vlan header
#
# Catch kernel bug for generic-XDP, that does didn't allow us to
# remove a VLAN header, because skb->protocol still contain VLAN
# ETH_P_8021Q indication, and this cause overwriting of our changes.
#
export XDP_PROG=xdp_vlan_remove_outer2
ip netns exec ns1 ip link set $DEVNS1 xdp off
ip netns exec ns1 ip link set $DEVNS1 xdp object $FILE section $XDP_PROG
# Now the namespaces should still be able reach each-other, test with ping:
ip netns exec ns2 ping -W 2 -c 3 $IPADDR1
ip netns exec ns1 ping -W 2 -c 3 $IPADDR2