linux_dsm_epyc7002/include/uapi/linux/ip.h
Andy Gospodarek 0eeb075fad net: ipv4 sysctl option to ignore routes when nexthop link is down
This feature is only enabled with the new per-interface or ipv4 global
sysctls called 'ignore_routes_with_linkdown'.

net.ipv4.conf.all.ignore_routes_with_linkdown = 0
net.ipv4.conf.default.ignore_routes_with_linkdown = 0
net.ipv4.conf.lo.ignore_routes_with_linkdown = 0
...

When the above sysctls are set, will report to userspace that a route is
dead and will no longer resolve to this nexthop when performing a fib
lookup.  This will signal to userspace that the route will not be
selected.  The signalling of a RTNH_F_DEAD is only passed to userspace
if the sysctl is enabled and link is down.  This was done as without it
the netlink listeners would have no idea whether or not a nexthop would
be selected.   The kernel only sets RTNH_F_DEAD internally if the
interface has IFF_UP cleared.

With the new sysctl set, the following behavior can be observed
(interface p8p1 is link-down):

default via 10.0.5.2 dev p9p1
10.0.5.0/24 dev p9p1  proto kernel  scope link  src 10.0.5.15
70.0.0.0/24 dev p7p1  proto kernel  scope link  src 70.0.0.1
80.0.0.0/24 dev p8p1  proto kernel  scope link  src 80.0.0.1 dead linkdown
90.0.0.0/24 via 80.0.0.2 dev p8p1  metric 1 dead linkdown
90.0.0.0/24 via 70.0.0.2 dev p7p1  metric 2
90.0.0.1 via 70.0.0.2 dev p7p1  src 70.0.0.1
    cache
local 80.0.0.1 dev lo  src 80.0.0.1
    cache <local>
80.0.0.2 via 10.0.5.2 dev p9p1  src 10.0.5.15
    cache

While the route does remain in the table (so it can be modified if
needed rather than being wiped away as it would be if IFF_UP was
cleared), the proper next-hop is chosen automatically when the link is
down.  Now interface p8p1 is linked-up:

default via 10.0.5.2 dev p9p1
10.0.5.0/24 dev p9p1  proto kernel  scope link  src 10.0.5.15
70.0.0.0/24 dev p7p1  proto kernel  scope link  src 70.0.0.1
80.0.0.0/24 dev p8p1  proto kernel  scope link  src 80.0.0.1
90.0.0.0/24 via 80.0.0.2 dev p8p1  metric 1
90.0.0.0/24 via 70.0.0.2 dev p7p1  metric 2
192.168.56.0/24 dev p2p1  proto kernel  scope link  src 192.168.56.2
90.0.0.1 via 80.0.0.2 dev p8p1  src 80.0.0.1
    cache
local 80.0.0.1 dev lo  src 80.0.0.1
    cache <local>
80.0.0.2 dev p8p1  src 80.0.0.1
    cache

and the output changes to what one would expect.

If the sysctl is not set, the following output would be expected when
p8p1 is down:

default via 10.0.5.2 dev p9p1
10.0.5.0/24 dev p9p1  proto kernel  scope link  src 10.0.5.15
70.0.0.0/24 dev p7p1  proto kernel  scope link  src 70.0.0.1
80.0.0.0/24 dev p8p1  proto kernel  scope link  src 80.0.0.1 linkdown
90.0.0.0/24 via 80.0.0.2 dev p8p1  metric 1 linkdown
90.0.0.0/24 via 70.0.0.2 dev p7p1  metric 2

Since the dead flag does not appear, there should be no expectation that
the kernel would skip using this route due to link being down.

v2: Split kernel changes into 2 patches, this actually makes a
behavioral change if the sysctl is set.  Also took suggestion from Alex
to simplify code by only checking sysctl during fib lookup and
suggestion from Scott to add a per-interface sysctl.

v3: Code clean-ups to make it more readable and efficient as well as a
reverse path check fix.

v4: Drop binary sysctl

v5: Whitespace fixups from Dave

v6: Style changes from Dave and checkpatch suggestions

v7: One more checkpatch fixup

Signed-off-by: Andy Gospodarek <gospo@cumulusnetworks.com>
Signed-off-by: Dinesh Dutt <ddutt@cumulusnetworks.com>
Acked-by: Scott Feldman <sfeldma@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-06-24 02:15:54 -07:00

174 lines
4.5 KiB
C

/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* Definitions for the IP protocol.
*
* Version: @(#)ip.h 1.0.2 04/28/93
*
* Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef _UAPI_LINUX_IP_H
#define _UAPI_LINUX_IP_H
#include <linux/types.h>
#include <asm/byteorder.h>
#define IPTOS_TOS_MASK 0x1E
#define IPTOS_TOS(tos) ((tos)&IPTOS_TOS_MASK)
#define IPTOS_LOWDELAY 0x10
#define IPTOS_THROUGHPUT 0x08
#define IPTOS_RELIABILITY 0x04
#define IPTOS_MINCOST 0x02
#define IPTOS_PREC_MASK 0xE0
#define IPTOS_PREC(tos) ((tos)&IPTOS_PREC_MASK)
#define IPTOS_PREC_NETCONTROL 0xe0
#define IPTOS_PREC_INTERNETCONTROL 0xc0
#define IPTOS_PREC_CRITIC_ECP 0xa0
#define IPTOS_PREC_FLASHOVERRIDE 0x80
#define IPTOS_PREC_FLASH 0x60
#define IPTOS_PREC_IMMEDIATE 0x40
#define IPTOS_PREC_PRIORITY 0x20
#define IPTOS_PREC_ROUTINE 0x00
/* IP options */
#define IPOPT_COPY 0x80
#define IPOPT_CLASS_MASK 0x60
#define IPOPT_NUMBER_MASK 0x1f
#define IPOPT_COPIED(o) ((o)&IPOPT_COPY)
#define IPOPT_CLASS(o) ((o)&IPOPT_CLASS_MASK)
#define IPOPT_NUMBER(o) ((o)&IPOPT_NUMBER_MASK)
#define IPOPT_CONTROL 0x00
#define IPOPT_RESERVED1 0x20
#define IPOPT_MEASUREMENT 0x40
#define IPOPT_RESERVED2 0x60
#define IPOPT_END (0 |IPOPT_CONTROL)
#define IPOPT_NOOP (1 |IPOPT_CONTROL)
#define IPOPT_SEC (2 |IPOPT_CONTROL|IPOPT_COPY)
#define IPOPT_LSRR (3 |IPOPT_CONTROL|IPOPT_COPY)
#define IPOPT_TIMESTAMP (4 |IPOPT_MEASUREMENT)
#define IPOPT_CIPSO (6 |IPOPT_CONTROL|IPOPT_COPY)
#define IPOPT_RR (7 |IPOPT_CONTROL)
#define IPOPT_SID (8 |IPOPT_CONTROL|IPOPT_COPY)
#define IPOPT_SSRR (9 |IPOPT_CONTROL|IPOPT_COPY)
#define IPOPT_RA (20|IPOPT_CONTROL|IPOPT_COPY)
#define IPVERSION 4
#define MAXTTL 255
#define IPDEFTTL 64
#define IPOPT_OPTVAL 0
#define IPOPT_OLEN 1
#define IPOPT_OFFSET 2
#define IPOPT_MINOFF 4
#define MAX_IPOPTLEN 40
#define IPOPT_NOP IPOPT_NOOP
#define IPOPT_EOL IPOPT_END
#define IPOPT_TS IPOPT_TIMESTAMP
#define IPOPT_TS_TSONLY 0 /* timestamps only */
#define IPOPT_TS_TSANDADDR 1 /* timestamps and addresses */
#define IPOPT_TS_PRESPEC 3 /* specified modules only */
#define IPV4_BEET_PHMAXLEN 8
struct iphdr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u8 ihl:4,
version:4;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u8 version:4,
ihl:4;
#else
#error "Please fix <asm/byteorder.h>"
#endif
__u8 tos;
__be16 tot_len;
__be16 id;
__be16 frag_off;
__u8 ttl;
__u8 protocol;
__sum16 check;
__be32 saddr;
__be32 daddr;
/*The options start here. */
};
struct ip_auth_hdr {
__u8 nexthdr;
__u8 hdrlen; /* This one is measured in 32 bit units! */
__be16 reserved;
__be32 spi;
__be32 seq_no; /* Sequence number */
__u8 auth_data[0]; /* Variable len but >=4. Mind the 64 bit alignment! */
};
struct ip_esp_hdr {
__be32 spi;
__be32 seq_no; /* Sequence number */
__u8 enc_data[0]; /* Variable len but >=8. Mind the 64 bit alignment! */
};
struct ip_comp_hdr {
__u8 nexthdr;
__u8 flags;
__be16 cpi;
};
struct ip_beet_phdr {
__u8 nexthdr;
__u8 hdrlen;
__u8 padlen;
__u8 reserved;
};
/* index values for the variables in ipv4_devconf */
enum
{
IPV4_DEVCONF_FORWARDING=1,
IPV4_DEVCONF_MC_FORWARDING,
IPV4_DEVCONF_PROXY_ARP,
IPV4_DEVCONF_ACCEPT_REDIRECTS,
IPV4_DEVCONF_SECURE_REDIRECTS,
IPV4_DEVCONF_SEND_REDIRECTS,
IPV4_DEVCONF_SHARED_MEDIA,
IPV4_DEVCONF_RP_FILTER,
IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE,
IPV4_DEVCONF_BOOTP_RELAY,
IPV4_DEVCONF_LOG_MARTIANS,
IPV4_DEVCONF_TAG,
IPV4_DEVCONF_ARPFILTER,
IPV4_DEVCONF_MEDIUM_ID,
IPV4_DEVCONF_NOXFRM,
IPV4_DEVCONF_NOPOLICY,
IPV4_DEVCONF_FORCE_IGMP_VERSION,
IPV4_DEVCONF_ARP_ANNOUNCE,
IPV4_DEVCONF_ARP_IGNORE,
IPV4_DEVCONF_PROMOTE_SECONDARIES,
IPV4_DEVCONF_ARP_ACCEPT,
IPV4_DEVCONF_ARP_NOTIFY,
IPV4_DEVCONF_ACCEPT_LOCAL,
IPV4_DEVCONF_SRC_VMARK,
IPV4_DEVCONF_PROXY_ARP_PVLAN,
IPV4_DEVCONF_ROUTE_LOCALNET,
IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL,
IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL,
IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
__IPV4_DEVCONF_MAX
};
#define IPV4_DEVCONF_MAX (__IPV4_DEVCONF_MAX - 1)
#endif /* _UAPI_LINUX_IP_H */