ipv6: sr: add support for SRH encapsulation and injection with lwtunnels

This patch creates a new type of interfaceless lightweight tunnel (SEG6),
enabling the encapsulation and injection of SRH within locally emitted
packets and forwarded packets.

>From a configuration viewpoint, a seg6 tunnel would be configured as follows:

  ip -6 ro ad fc00::1/128 encap seg6 mode encap segs fc42::1,fc42::2,fc42::3 dev eth0

Any packet whose destination address is fc00::1 would thus be encapsulated
within an outer IPv6 header containing the SRH with three segments, and would
actually be routed to the first segment of the list. If `mode inline' was
specified instead of `mode encap', then the SRH would be directly inserted
after the IPv6 header without outer encapsulation.

The inline mode is only available if CONFIG_IPV6_SEG6_INLINE is enabled. This
feature was made configurable because direct header insertion may break
several mechanisms such as PMTUD or IPSec AH.

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David Lebrun 2016-11-08 14:57:41 +01:00 committed by David S. Miller
parent 915d7e5e59
commit 6c8702c60b
9 changed files with 526 additions and 1 deletions

View File

@ -0,0 +1,6 @@
#ifndef _LINUX_SEG6_IPTUNNEL_H
#define _LINUX_SEG6_IPTUNNEL_H
#include <uapi/linux/seg6_iptunnel.h>
#endif

View File

@ -16,6 +16,8 @@
#include <linux/net.h> #include <linux/net.h>
#include <linux/ipv6.h> #include <linux/ipv6.h>
#include <net/lwtunnel.h>
#include <linux/seg6.h>
static inline void update_csum_diff4(struct sk_buff *skb, __be32 from, static inline void update_csum_diff4(struct sk_buff *skb, __be32 from,
__be32 to) __be32 to)
@ -48,5 +50,9 @@ static inline struct seg6_pernet_data *seg6_pernet(struct net *net)
extern int seg6_init(void); extern int seg6_init(void);
extern void seg6_exit(void); extern void seg6_exit(void);
extern int seg6_iptunnel_init(void);
extern void seg6_iptunnel_exit(void);
extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len);
#endif #endif

View File

@ -9,6 +9,7 @@ enum lwtunnel_encap_types {
LWTUNNEL_ENCAP_IP, LWTUNNEL_ENCAP_IP,
LWTUNNEL_ENCAP_ILA, LWTUNNEL_ENCAP_ILA,
LWTUNNEL_ENCAP_IP6, LWTUNNEL_ENCAP_IP6,
LWTUNNEL_ENCAP_SEG6,
__LWTUNNEL_ENCAP_MAX, __LWTUNNEL_ENCAP_MAX,
}; };

View File

@ -0,0 +1,44 @@
/*
* SR-IPv6 implementation
*
* Author:
* David Lebrun <david.lebrun@uclouvain.be>
*
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef _UAPI_LINUX_SEG6_IPTUNNEL_H
#define _UAPI_LINUX_SEG6_IPTUNNEL_H
enum {
SEG6_IPTUNNEL_UNSPEC,
SEG6_IPTUNNEL_SRH,
__SEG6_IPTUNNEL_MAX,
};
#define SEG6_IPTUNNEL_MAX (__SEG6_IPTUNNEL_MAX - 1)
struct seg6_iptunnel_encap {
int mode;
struct ipv6_sr_hdr srh[0];
};
#define SEG6_IPTUN_ENCAP_SIZE(x) ((sizeof(*x)) + (((x)->srh->hdrlen + 1) << 3))
enum {
SEG6_IPTUN_MODE_INLINE,
SEG6_IPTUN_MODE_ENCAP,
};
static inline size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo)
{
int encap = (tuninfo->mode == SEG6_IPTUN_MODE_ENCAP);
return ((tuninfo->srh->hdrlen + 1) << 3) +
(encap * sizeof(struct ipv6hdr));
}
#endif

View File

@ -39,6 +39,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
return "MPLS"; return "MPLS";
case LWTUNNEL_ENCAP_ILA: case LWTUNNEL_ENCAP_ILA:
return "ILA"; return "ILA";
case LWTUNNEL_ENCAP_SEG6:
return "SEG6";
case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP6:
case LWTUNNEL_ENCAP_IP: case LWTUNNEL_ENCAP_IP:
case LWTUNNEL_ENCAP_NONE: case LWTUNNEL_ENCAP_NONE:

View File

@ -289,4 +289,16 @@ config IPV6_PIMSM_V2
Support for IPv6 PIM multicast routing protocol PIM-SMv2. Support for IPv6 PIM multicast routing protocol PIM-SMv2.
If unsure, say N. If unsure, say N.
config IPV6_SEG6_INLINE
bool "IPv6: direct Segment Routing Header insertion "
depends on IPV6
---help---
Support for direct insertion of the Segment Routing Header,
also known as inline mode. Be aware that direct insertion of
extension headers (as opposed to encapsulation) may break
multiple mechanisms such as PMTUD or IPSec AH. Use this feature
only if you know exactly what you are doing.
If unsure, say N.
endif # IPV6 endif # IPV6

View File

@ -9,7 +9,7 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \
raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \ raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \
exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \ exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \
udp_offload.o seg6.o udp_offload.o seg6.o seg6_iptunnel.o
ipv6-offload := ip6_offload.o tcpv6_offload.o exthdrs_offload.o ipv6-offload := ip6_offload.o tcpv6_offload.o exthdrs_offload.o

View File

@ -26,6 +26,43 @@
#include <linux/seg6.h> #include <linux/seg6.h>
#include <linux/seg6_genl.h> #include <linux/seg6_genl.h>
bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len)
{
int trailing;
unsigned int tlv_offset;
if (srh->type != IPV6_SRCRT_TYPE_4)
return false;
if (((srh->hdrlen + 1) << 3) != len)
return false;
if (srh->segments_left != srh->first_segment)
return false;
tlv_offset = sizeof(*srh) + ((srh->first_segment + 1) << 4);
trailing = len - tlv_offset;
if (trailing < 0)
return false;
while (trailing) {
struct sr6_tlv *tlv;
unsigned int tlv_len;
tlv = (struct sr6_tlv *)((unsigned char *)srh + tlv_offset);
tlv_len = sizeof(*tlv) + tlv->len;
trailing -= tlv_len;
if (trailing < 0)
return false;
tlv_offset += tlv_len;
}
return true;
}
static struct genl_family seg6_genl_family; static struct genl_family seg6_genl_family;
static const struct nla_policy seg6_genl_policy[SEG6_ATTR_MAX + 1] = { static const struct nla_policy seg6_genl_policy[SEG6_ATTR_MAX + 1] = {
@ -198,10 +235,16 @@ int __init seg6_init(void)
if (err) if (err)
goto out_unregister_genl; goto out_unregister_genl;
err = seg6_iptunnel_init();
if (err)
goto out_unregister_pernet;
pr_info("Segment Routing with IPv6\n"); pr_info("Segment Routing with IPv6\n");
out: out:
return err; return err;
out_unregister_pernet:
unregister_pernet_subsys(&ip6_segments_ops);
out_unregister_genl: out_unregister_genl:
genl_unregister_family(&seg6_genl_family); genl_unregister_family(&seg6_genl_family);
goto out; goto out;
@ -209,6 +252,7 @@ int __init seg6_init(void)
void seg6_exit(void) void seg6_exit(void)
{ {
seg6_iptunnel_exit();
unregister_pernet_subsys(&ip6_segments_ops); unregister_pernet_subsys(&ip6_segments_ops);
genl_unregister_family(&seg6_genl_family); genl_unregister_family(&seg6_genl_family);
} }

410
net/ipv6/seg6_iptunnel.c Normal file
View File

@ -0,0 +1,410 @@
/*
* SR-IPv6 implementation
*
* Author:
* David Lebrun <david.lebrun@uclouvain.be>
*
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/types.h>
#include <linux/skbuff.h>
#include <linux/net.h>
#include <linux/module.h>
#include <net/ip.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/netns/generic.h>
#include <net/ip6_fib.h>
#include <net/route.h>
#include <net/seg6.h>
#include <linux/seg6.h>
#include <linux/seg6_iptunnel.h>
#include <net/addrconf.h>
#include <net/ip6_route.h>
#ifdef CONFIG_DST_CACHE
#include <net/dst_cache.h>
#endif
struct seg6_lwt {
#ifdef CONFIG_DST_CACHE
struct dst_cache cache;
#endif
struct seg6_iptunnel_encap tuninfo[0];
};
static inline struct seg6_lwt *seg6_lwt_lwtunnel(struct lwtunnel_state *lwt)
{
return (struct seg6_lwt *)lwt->data;
}
static inline struct seg6_iptunnel_encap *
seg6_encap_lwtunnel(struct lwtunnel_state *lwt)
{
return seg6_lwt_lwtunnel(lwt)->tuninfo;
}
static const struct nla_policy seg6_iptunnel_policy[SEG6_IPTUNNEL_MAX + 1] = {
[SEG6_IPTUNNEL_SRH] = { .type = NLA_BINARY },
};
int nla_put_srh(struct sk_buff *skb, int attrtype,
struct seg6_iptunnel_encap *tuninfo)
{
struct seg6_iptunnel_encap *data;
struct nlattr *nla;
int len;
len = SEG6_IPTUN_ENCAP_SIZE(tuninfo);
nla = nla_reserve(skb, attrtype, len);
if (!nla)
return -EMSGSIZE;
data = nla_data(nla);
memcpy(data, tuninfo, len);
return 0;
}
static void set_tun_src(struct net *net, struct net_device *dev,
struct in6_addr *daddr, struct in6_addr *saddr)
{
struct seg6_pernet_data *sdata = seg6_pernet(net);
struct in6_addr *tun_src;
rcu_read_lock();
tun_src = rcu_dereference(sdata->tun_src);
if (!ipv6_addr_any(tun_src)) {
memcpy(saddr, tun_src, sizeof(struct in6_addr));
} else {
ipv6_dev_get_saddr(net, dev, daddr, IPV6_PREFER_SRC_PUBLIC,
saddr);
}
rcu_read_unlock();
}
/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */
static int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
{
struct net *net = dev_net(skb_dst(skb)->dev);
struct ipv6hdr *hdr, *inner_hdr;
struct ipv6_sr_hdr *isrh;
int hdrlen, tot_len, err;
hdrlen = (osrh->hdrlen + 1) << 3;
tot_len = hdrlen + sizeof(*hdr);
err = pskb_expand_head(skb, tot_len, 0, GFP_ATOMIC);
if (unlikely(err))
return err;
inner_hdr = ipv6_hdr(skb);
skb_push(skb, tot_len);
skb_reset_network_header(skb);
skb_mac_header_rebuild(skb);
hdr = ipv6_hdr(skb);
/* inherit tc, flowlabel and hlim
* hlim will be decremented in ip6_forward() afterwards and
* decapsulation will overwrite inner hlim with outer hlim
*/
ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)),
ip6_flowlabel(inner_hdr));
hdr->hop_limit = inner_hdr->hop_limit;
hdr->nexthdr = NEXTHDR_ROUTING;
isrh = (void *)hdr + sizeof(*hdr);
memcpy(isrh, osrh, hdrlen);
isrh->nexthdr = NEXTHDR_IPV6;
hdr->daddr = isrh->segments[isrh->first_segment];
set_tun_src(net, skb->dev, &hdr->daddr, &hdr->saddr);
skb_postpush_rcsum(skb, hdr, tot_len);
return 0;
}
/* insert an SRH within an IPv6 packet, just after the IPv6 header */
#ifdef CONFIG_IPV6_SEG6_INLINE
static int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
{
struct ipv6hdr *hdr, *oldhdr;
struct ipv6_sr_hdr *isrh;
int hdrlen, err;
hdrlen = (osrh->hdrlen + 1) << 3;
err = pskb_expand_head(skb, hdrlen, 0, GFP_ATOMIC);
if (unlikely(err))
return err;
oldhdr = ipv6_hdr(skb);
skb_pull(skb, sizeof(struct ipv6hdr));
skb_postpull_rcsum(skb, skb_network_header(skb),
sizeof(struct ipv6hdr));
skb_push(skb, sizeof(struct ipv6hdr) + hdrlen);
skb_reset_network_header(skb);
skb_mac_header_rebuild(skb);
hdr = ipv6_hdr(skb);
memmove(hdr, oldhdr, sizeof(*hdr));
isrh = (void *)hdr + sizeof(*hdr);
memcpy(isrh, osrh, hdrlen);
isrh->nexthdr = hdr->nexthdr;
hdr->nexthdr = NEXTHDR_ROUTING;
isrh->segments[0] = hdr->daddr;
hdr->daddr = isrh->segments[isrh->first_segment];
skb_postpush_rcsum(skb, hdr, sizeof(struct ipv6hdr) + hdrlen);
return 0;
}
#endif
static int seg6_do_srh(struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
struct seg6_iptunnel_encap *tinfo;
int err = 0;
tinfo = seg6_encap_lwtunnel(dst->lwtstate);
if (likely(!skb->encapsulation)) {
skb_reset_inner_headers(skb);
skb->encapsulation = 1;
}
switch (tinfo->mode) {
#ifdef CONFIG_IPV6_SEG6_INLINE
case SEG6_IPTUN_MODE_INLINE:
err = seg6_do_srh_inline(skb, tinfo->srh);
skb_reset_inner_headers(skb);
break;
#endif
case SEG6_IPTUN_MODE_ENCAP:
err = seg6_do_srh_encap(skb, tinfo->srh);
break;
}
if (err)
return err;
ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
skb_set_transport_header(skb, sizeof(struct ipv6hdr));
skb_set_inner_protocol(skb, skb->protocol);
return 0;
}
int seg6_input(struct sk_buff *skb)
{
int err;
err = seg6_do_srh(skb);
if (unlikely(err)) {
kfree_skb(skb);
return err;
}
skb_dst_drop(skb);
ip6_route_input(skb);
return dst_input(skb);
}
int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *orig_dst = skb_dst(skb);
struct dst_entry *dst = NULL;
struct seg6_lwt *slwt;
int err = -EINVAL;
err = seg6_do_srh(skb);
if (unlikely(err))
goto drop;
slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate);
#ifdef CONFIG_DST_CACHE
dst = dst_cache_get(&slwt->cache);
#endif
if (unlikely(!dst)) {
struct ipv6hdr *hdr = ipv6_hdr(skb);
struct flowi6 fl6;
fl6.daddr = hdr->daddr;
fl6.saddr = hdr->saddr;
fl6.flowlabel = ip6_flowinfo(hdr);
fl6.flowi6_mark = skb->mark;
fl6.flowi6_proto = hdr->nexthdr;
dst = ip6_route_output(net, NULL, &fl6);
if (dst->error) {
err = dst->error;
dst_release(dst);
goto drop;
}
#ifdef CONFIG_DST_CACHE
dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr);
#endif
}
skb_dst_drop(skb);
skb_dst_set(skb, dst);
return dst_output(net, sk, skb);
drop:
kfree_skb(skb);
return err;
}
static int seg6_build_state(struct net_device *dev, struct nlattr *nla,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts)
{
struct nlattr *tb[SEG6_IPTUNNEL_MAX + 1];
struct seg6_iptunnel_encap *tuninfo;
struct lwtunnel_state *newts;
int tuninfo_len, min_size;
struct seg6_lwt *slwt;
int err;
err = nla_parse_nested(tb, SEG6_IPTUNNEL_MAX, nla,
seg6_iptunnel_policy);
if (err < 0)
return err;
if (!tb[SEG6_IPTUNNEL_SRH])
return -EINVAL;
tuninfo = nla_data(tb[SEG6_IPTUNNEL_SRH]);
tuninfo_len = nla_len(tb[SEG6_IPTUNNEL_SRH]);
/* tuninfo must contain at least the iptunnel encap structure,
* the SRH and one segment
*/
min_size = sizeof(*tuninfo) + sizeof(struct ipv6_sr_hdr) +
sizeof(struct in6_addr);
if (tuninfo_len < min_size)
return -EINVAL;
switch (tuninfo->mode) {
#ifdef CONFIG_IPV6_SEG6_INLINE
case SEG6_IPTUN_MODE_INLINE:
break;
#endif
case SEG6_IPTUN_MODE_ENCAP:
break;
default:
return -EINVAL;
}
/* verify that SRH is consistent */
if (!seg6_validate_srh(tuninfo->srh, tuninfo_len - sizeof(*tuninfo)))
return -EINVAL;
newts = lwtunnel_state_alloc(tuninfo_len + sizeof(*slwt));
if (!newts)
return -ENOMEM;
slwt = seg6_lwt_lwtunnel(newts);
#ifdef CONFIG_DST_CACHE
err = dst_cache_init(&slwt->cache, GFP_KERNEL);
if (err) {
kfree(newts);
return err;
}
#endif
memcpy(&slwt->tuninfo, tuninfo, tuninfo_len);
newts->type = LWTUNNEL_ENCAP_SEG6;
newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT |
LWTUNNEL_STATE_INPUT_REDIRECT;
newts->headroom = seg6_lwt_headroom(tuninfo);
*ts = newts;
return 0;
}
#ifdef CONFIG_DST_CACHE
static void seg6_destroy_state(struct lwtunnel_state *lwt)
{
dst_cache_destroy(&seg6_lwt_lwtunnel(lwt)->cache);
}
#endif
static int seg6_fill_encap_info(struct sk_buff *skb,
struct lwtunnel_state *lwtstate)
{
struct seg6_iptunnel_encap *tuninfo = seg6_encap_lwtunnel(lwtstate);
if (nla_put_srh(skb, SEG6_IPTUNNEL_SRH, tuninfo))
return -EMSGSIZE;
return 0;
}
static int seg6_encap_nlsize(struct lwtunnel_state *lwtstate)
{
struct seg6_iptunnel_encap *tuninfo = seg6_encap_lwtunnel(lwtstate);
return nla_total_size(SEG6_IPTUN_ENCAP_SIZE(tuninfo));
}
static int seg6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
struct seg6_iptunnel_encap *a_hdr = seg6_encap_lwtunnel(a);
struct seg6_iptunnel_encap *b_hdr = seg6_encap_lwtunnel(b);
int len = SEG6_IPTUN_ENCAP_SIZE(a_hdr);
if (len != SEG6_IPTUN_ENCAP_SIZE(b_hdr))
return 1;
return memcmp(a_hdr, b_hdr, len);
}
static const struct lwtunnel_encap_ops seg6_iptun_ops = {
.build_state = seg6_build_state,
#ifdef CONFIG_DST_CACHE
.destroy_state = seg6_destroy_state,
#endif
.output = seg6_output,
.input = seg6_input,
.fill_encap = seg6_fill_encap_info,
.get_encap_size = seg6_encap_nlsize,
.cmp_encap = seg6_encap_cmp,
};
int __init seg6_iptunnel_init(void)
{
return lwtunnel_encap_add_ops(&seg6_iptun_ops, LWTUNNEL_ENCAP_SEG6);
}
void seg6_iptunnel_exit(void)
{
lwtunnel_encap_del_ops(&seg6_iptun_ops, LWTUNNEL_ENCAP_SEG6);
}