linux_dsm_epyc7002/include/uapi/linux/rtnetlink.h

780 lines
19 KiB
C
Raw Normal View History

License cleanup: add SPDX license identifier to uapi header files with no license Many user space API headers are missing licensing information, which makes it hard for compliance tools to determine the correct license. By default are files without license information under the default license of the kernel, which is GPLV2. Marking them GPLV2 would exclude them from being included in non GPLV2 code, which is obviously not intended. The user space API headers fall under the syscall exception which is in the kernels COPYING file: NOTE! This copyright does *not* cover user programs that use kernel services by normal system calls - this is merely considered normal use of the kernel, and does *not* fall under the heading of "derived work". otherwise syscall usage would not be possible. Update the files which contain no license information with an SPDX license identifier. The chosen identifier is 'GPL-2.0 WITH Linux-syscall-note' which is the officially assigned identifier for the Linux syscall exception. SPDX license identifiers are a legally binding shorthand, which can be used instead of the full boiler plate text. This patch is based on work done by Thomas Gleixner and Kate Stewart and Philippe Ombredanne. See the previous patch in this series for the methodology of how this patch was researched. Reviewed-by: Kate Stewart <kstewart@linuxfoundation.org> Reviewed-by: Philippe Ombredanne <pombredanne@nexb.com> Reviewed-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2017-11-01 21:08:43 +07:00
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _UAPI__LINUX_RTNETLINK_H
#define _UAPI__LINUX_RTNETLINK_H
#include <linux/types.h>
#include <linux/netlink.h>
#include <linux/if_link.h>
#include <linux/if_addr.h>
#include <linux/neighbour.h>
/* rtnetlink families. Values up to 127 are reserved for real address
* families, values above 128 may be used arbitrarily.
*/
#define RTNL_FAMILY_IPMR 128
#define RTNL_FAMILY_IP6MR 129
#define RTNL_FAMILY_MAX 129
/****
* Routing/neighbour discovery messages.
****/
/* Types of messages */
enum {
RTM_BASE = 16,
#define RTM_BASE RTM_BASE
RTM_NEWLINK = 16,
#define RTM_NEWLINK RTM_NEWLINK
RTM_DELLINK,
#define RTM_DELLINK RTM_DELLINK
RTM_GETLINK,
#define RTM_GETLINK RTM_GETLINK
RTM_SETLINK,
#define RTM_SETLINK RTM_SETLINK
RTM_NEWADDR = 20,
#define RTM_NEWADDR RTM_NEWADDR
RTM_DELADDR,
#define RTM_DELADDR RTM_DELADDR
RTM_GETADDR,
#define RTM_GETADDR RTM_GETADDR
RTM_NEWROUTE = 24,
#define RTM_NEWROUTE RTM_NEWROUTE
RTM_DELROUTE,
#define RTM_DELROUTE RTM_DELROUTE
RTM_GETROUTE,
#define RTM_GETROUTE RTM_GETROUTE
RTM_NEWNEIGH = 28,
#define RTM_NEWNEIGH RTM_NEWNEIGH
RTM_DELNEIGH,
#define RTM_DELNEIGH RTM_DELNEIGH
RTM_GETNEIGH,
#define RTM_GETNEIGH RTM_GETNEIGH
RTM_NEWRULE = 32,
#define RTM_NEWRULE RTM_NEWRULE
RTM_DELRULE,
#define RTM_DELRULE RTM_DELRULE
RTM_GETRULE,
#define RTM_GETRULE RTM_GETRULE
RTM_NEWQDISC = 36,
#define RTM_NEWQDISC RTM_NEWQDISC
RTM_DELQDISC,
#define RTM_DELQDISC RTM_DELQDISC
RTM_GETQDISC,
#define RTM_GETQDISC RTM_GETQDISC
RTM_NEWTCLASS = 40,
#define RTM_NEWTCLASS RTM_NEWTCLASS
RTM_DELTCLASS,
#define RTM_DELTCLASS RTM_DELTCLASS
RTM_GETTCLASS,
#define RTM_GETTCLASS RTM_GETTCLASS
RTM_NEWTFILTER = 44,
#define RTM_NEWTFILTER RTM_NEWTFILTER
RTM_DELTFILTER,
#define RTM_DELTFILTER RTM_DELTFILTER
RTM_GETTFILTER,
#define RTM_GETTFILTER RTM_GETTFILTER
RTM_NEWACTION = 48,
#define RTM_NEWACTION RTM_NEWACTION
RTM_DELACTION,
#define RTM_DELACTION RTM_DELACTION
RTM_GETACTION,
#define RTM_GETACTION RTM_GETACTION
RTM_NEWPREFIX = 52,
#define RTM_NEWPREFIX RTM_NEWPREFIX
RTM_GETMULTICAST = 58,
#define RTM_GETMULTICAST RTM_GETMULTICAST
RTM_GETANYCAST = 62,
#define RTM_GETANYCAST RTM_GETANYCAST
RTM_NEWNEIGHTBL = 64,
#define RTM_NEWNEIGHTBL RTM_NEWNEIGHTBL
RTM_GETNEIGHTBL = 66,
#define RTM_GETNEIGHTBL RTM_GETNEIGHTBL
RTM_SETNEIGHTBL,
#define RTM_SETNEIGHTBL RTM_SETNEIGHTBL
RTM_NEWNDUSEROPT = 68,
#define RTM_NEWNDUSEROPT RTM_NEWNDUSEROPT
RTM_NEWADDRLABEL = 72,
#define RTM_NEWADDRLABEL RTM_NEWADDRLABEL
RTM_DELADDRLABEL,
#define RTM_DELADDRLABEL RTM_DELADDRLABEL
RTM_GETADDRLABEL,
#define RTM_GETADDRLABEL RTM_GETADDRLABEL
RTM_GETDCB = 78,
#define RTM_GETDCB RTM_GETDCB
RTM_SETDCB,
#define RTM_SETDCB RTM_SETDCB
RTM_NEWNETCONF = 80,
#define RTM_NEWNETCONF RTM_NEWNETCONF
RTM_DELNETCONF,
#define RTM_DELNETCONF RTM_DELNETCONF
RTM_GETNETCONF = 82,
#define RTM_GETNETCONF RTM_GETNETCONF
RTM_NEWMDB = 84,
#define RTM_NEWMDB RTM_NEWMDB
RTM_DELMDB = 85,
#define RTM_DELMDB RTM_DELMDB
RTM_GETMDB = 86,
#define RTM_GETMDB RTM_GETMDB
RTM_NEWNSID = 88,
#define RTM_NEWNSID RTM_NEWNSID
RTM_DELNSID = 89,
#define RTM_DELNSID RTM_DELNSID
RTM_GETNSID = 90,
#define RTM_GETNSID RTM_GETNSID
rtnetlink: add new RTM_GETSTATS message to dump link stats This patch adds a new RTM_GETSTATS message to query link stats via netlink from the kernel. RTM_NEWLINK also dumps stats today, but RTM_NEWLINK returns a lot more than just stats and is expensive in some cases when frequent polling for stats from userspace is a common operation. RTM_GETSTATS is an attempt to provide a light weight netlink message to explicity query only link stats from the kernel on an interface. The idea is to also keep it extensible so that new kinds of stats can be added to it in the future. This patch adds the following attribute for NETDEV stats: struct nla_policy ifla_stats_policy[IFLA_STATS_MAX + 1] = { [IFLA_STATS_LINK_64] = { .len = sizeof(struct rtnl_link_stats64) }, }; Like any other rtnetlink message, RTM_GETSTATS can be used to get stats of a single interface or all interfaces with NLM_F_DUMP. Future possible new types of stat attributes: link af stats: - IFLA_STATS_LINK_IPV6 (nested. for ipv6 stats) - IFLA_STATS_LINK_MPLS (nested. for mpls/mdev stats) extended stats: - IFLA_STATS_LINK_EXTENDED (nested. extended software netdev stats like bridge, vlan, vxlan etc) - IFLA_STATS_LINK_HW_EXTENDED (nested. extended hardware stats which are available via ethtool today) This patch also declares a filter mask for all stat attributes. User has to provide a mask of stats attributes to query. filter mask can be specified in the new hdr 'struct if_stats_msg' for stats messages. Other important field in the header is the ifindex. This api can also include attributes for global stats (eg tcp) in the future. When global stats are included in a stats msg, the ifindex in the header must be zero. A single stats message cannot contain both global and netdev specific stats. To easily distinguish them, netdev specific stat attributes name are prefixed with IFLA_STATS_LINK_ Without any attributes in the filter_mask, no stats will be returned. This patch has been tested with mofified iproute2 ifstat. Suggested-by: Jamal Hadi Salim <jhs@mojatatu.com> Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-20 22:43:43 +07:00
RTM_NEWSTATS = 92,
#define RTM_NEWSTATS RTM_NEWSTATS
RTM_GETSTATS = 94,
#define RTM_GETSTATS RTM_GETSTATS
RTM_NEWCACHEREPORT = 96,
#define RTM_NEWCACHEREPORT RTM_NEWCACHEREPORT
RTM_NEWCHAIN = 100,
#define RTM_NEWCHAIN RTM_NEWCHAIN
RTM_DELCHAIN,
#define RTM_DELCHAIN RTM_DELCHAIN
RTM_GETCHAIN,
#define RTM_GETCHAIN RTM_GETCHAIN
RTM_NEWNEXTHOP = 104,
#define RTM_NEWNEXTHOP RTM_NEWNEXTHOP
RTM_DELNEXTHOP,
#define RTM_DELNEXTHOP RTM_DELNEXTHOP
RTM_GETNEXTHOP,
#define RTM_GETNEXTHOP RTM_GETNEXTHOP
RTM_NEWLINKPROP = 108,
#define RTM_NEWLINKPROP RTM_NEWLINKPROP
RTM_DELLINKPROP,
#define RTM_DELLINKPROP RTM_DELLINKPROP
RTM_GETLINKPROP,
#define RTM_GETLINKPROP RTM_GETLINKPROP
RTM_NEWVLAN = 112,
#define RTM_NEWNVLAN RTM_NEWVLAN
RTM_DELVLAN,
#define RTM_DELVLAN RTM_DELVLAN
RTM_GETVLAN,
#define RTM_GETVLAN RTM_GETVLAN
__RTM_MAX,
#define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1)
};
#define RTM_NR_MSGTYPES (RTM_MAX + 1 - RTM_BASE)
#define RTM_NR_FAMILIES (RTM_NR_MSGTYPES >> 2)
#define RTM_FAM(cmd) (((cmd) - RTM_BASE) >> 2)
/*
Generic structure for encapsulation of optional route information.
It is reminiscent of sockaddr, but with sa_family replaced
with attribute type.
*/
struct rtattr {
unsigned short rta_len;
unsigned short rta_type;
};
/* Macros to handle rtattributes */
#define RTA_ALIGNTO 4U
#define RTA_ALIGN(len) ( ((len)+RTA_ALIGNTO-1) & ~(RTA_ALIGNTO-1) )
#define RTA_OK(rta,len) ((len) >= (int)sizeof(struct rtattr) && \
(rta)->rta_len >= sizeof(struct rtattr) && \
(rta)->rta_len <= (len))
#define RTA_NEXT(rta,attrlen) ((attrlen) -= RTA_ALIGN((rta)->rta_len), \
(struct rtattr*)(((char*)(rta)) + RTA_ALIGN((rta)->rta_len)))
#define RTA_LENGTH(len) (RTA_ALIGN(sizeof(struct rtattr)) + (len))
#define RTA_SPACE(len) RTA_ALIGN(RTA_LENGTH(len))
#define RTA_DATA(rta) ((void*)(((char*)(rta)) + RTA_LENGTH(0)))
#define RTA_PAYLOAD(rta) ((int)((rta)->rta_len) - RTA_LENGTH(0))
/******************************************************************************
* Definitions used in routing table administration.
****/
struct rtmsg {
unsigned char rtm_family;
unsigned char rtm_dst_len;
unsigned char rtm_src_len;
unsigned char rtm_tos;
unsigned char rtm_table; /* Routing table id */
unsigned char rtm_protocol; /* Routing protocol; see below */
unsigned char rtm_scope; /* See below */
unsigned char rtm_type; /* See below */
unsigned rtm_flags;
};
/* rtm_type */
enum {
RTN_UNSPEC,
RTN_UNICAST, /* Gateway or direct route */
RTN_LOCAL, /* Accept locally */
RTN_BROADCAST, /* Accept locally as broadcast,
send as broadcast */
RTN_ANYCAST, /* Accept locally as broadcast,
but send as unicast */
RTN_MULTICAST, /* Multicast route */
RTN_BLACKHOLE, /* Drop */
RTN_UNREACHABLE, /* Destination is unreachable */
RTN_PROHIBIT, /* Administratively prohibited */
RTN_THROW, /* Not in this table */
RTN_NAT, /* Translate this address */
RTN_XRESOLVE, /* Use external resolver */
__RTN_MAX
};
#define RTN_MAX (__RTN_MAX - 1)
/* rtm_protocol */
#define RTPROT_UNSPEC 0
#define RTPROT_REDIRECT 1 /* Route installed by ICMP redirects;
not used by current IPv4 */
#define RTPROT_KERNEL 2 /* Route installed by kernel */
#define RTPROT_BOOT 3 /* Route installed during boot */
#define RTPROT_STATIC 4 /* Route installed by administrator */
/* Values of protocol >= RTPROT_STATIC are not interpreted by kernel;
they are just passed from user and back as is.
It will be used by hypothetical multiple routing daemons.
Note that protocol values should be standardized in order to
avoid conflicts.
*/
#define RTPROT_GATED 8 /* Apparently, GateD */
#define RTPROT_RA 9 /* RDISC/ND router advertisements */
#define RTPROT_MRT 10 /* Merit MRT */
#define RTPROT_ZEBRA 11 /* Zebra */
#define RTPROT_BIRD 12 /* BIRD */
#define RTPROT_DNROUTED 13 /* DECnet routing daemon */
#define RTPROT_XORP 14 /* XORP */
#define RTPROT_NTK 15 /* Netsukuku */
#define RTPROT_DHCP 16 /* DHCP client */
#define RTPROT_MROUTED 17 /* Multicast daemon */
#define RTPROT_BABEL 42 /* Babel daemon */
#define RTPROT_BGP 186 /* BGP Routes */
#define RTPROT_ISIS 187 /* ISIS Routes */
#define RTPROT_OSPF 188 /* OSPF Routes */
#define RTPROT_RIP 189 /* RIP Routes */
#define RTPROT_EIGRP 192 /* EIGRP Routes */
/* rtm_scope
Really it is not scope, but sort of distance to the destination.
NOWHERE are reserved for not existing destinations, HOST is our
local addresses, LINK are destinations, located on directly attached
link and UNIVERSE is everywhere in the Universe.
Intermediate values are also possible f.e. interior routes
could be assigned a value between UNIVERSE and LINK.
*/
enum rt_scope_t {
RT_SCOPE_UNIVERSE=0,
/* User defined values */
RT_SCOPE_SITE=200,
RT_SCOPE_LINK=253,
RT_SCOPE_HOST=254,
RT_SCOPE_NOWHERE=255
};
/* rtm_flags */
#define RTM_F_NOTIFY 0x100 /* Notify user of route change */
#define RTM_F_CLONED 0x200 /* This route is cloned */
#define RTM_F_EQUALIZE 0x400 /* Multipath equalizer: NI */
#define RTM_F_PREFIX 0x800 /* Prefix addresses */
#define RTM_F_LOOKUP_TABLE 0x1000 /* set rtm_table to FIB lookup result */
#define RTM_F_FIB_MATCH 0x2000 /* return full fib lookup match */
ipv4: Add "offload" and "trap" indications to routes When performing L3 offload, routes and nexthops are usually programmed into two different tables in the underlying device. Therefore, the fact that a nexthop resides in hardware does not necessarily mean that all the associated routes also reside in hardware and vice-versa. While the kernel can signal to user space the presence of a nexthop in hardware (via 'RTNH_F_OFFLOAD'), it does not have a corresponding flag for routes. In addition, the fact that a route resides in hardware does not necessarily mean that the traffic is offloaded. For example, unreachable routes (i.e., 'RTN_UNREACHABLE') are programmed to trap packets to the CPU so that the kernel will be able to generate the appropriate ICMP error packet. This patch adds an "offload" and "trap" indications to IPv4 routes, so that users will have better visibility into the offload process. 'struct fib_alias' is extended with two new fields that indicate if the route resides in hardware or not and if it is offloading traffic from the kernel or trapping packets to it. Note that the new fields are added in the 6 bytes hole and therefore the struct still fits in a single cache line [1]. Capable drivers are expected to invoke fib_alias_hw_flags_set() with the route's key in order to set the flags. The indications are dumped to user space via a new flags (i.e., 'RTM_F_OFFLOAD' and 'RTM_F_TRAP') in the 'rtm_flags' field in the ancillary header. v2: * Make use of 'struct fib_rt_info' in fib_alias_hw_flags_set() [1] struct fib_alias { struct hlist_node fa_list; /* 0 16 */ struct fib_info * fa_info; /* 16 8 */ u8 fa_tos; /* 24 1 */ u8 fa_type; /* 25 1 */ u8 fa_state; /* 26 1 */ u8 fa_slen; /* 27 1 */ u32 tb_id; /* 28 4 */ s16 fa_default; /* 32 2 */ u8 offload:1; /* 34: 0 1 */ u8 trap:1; /* 34: 1 1 */ u8 unused:6; /* 34: 2 1 */ /* XXX 5 bytes hole, try to pack */ struct callback_head rcu __attribute__((__aligned__(8))); /* 40 16 */ /* size: 56, cachelines: 1, members: 12 */ /* sum members: 50, holes: 1, sum holes: 5 */ /* sum bitfield members: 8 bits (1 bytes) */ /* forced alignments: 1, forced holes: 1, sum forced holes: 5 */ /* last cacheline: 56 bytes */ } __attribute__((__aligned__(8))); Signed-off-by: Ido Schimmel <idosch@mellanox.com> Reviewed-by: David Ahern <dsahern@gmail.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-14 18:23:11 +07:00
#define RTM_F_OFFLOAD 0x4000 /* route is offloaded */
#define RTM_F_TRAP 0x8000 /* route is trapping packets */
/* Reserved table identifiers */
enum rt_class_t {
RT_TABLE_UNSPEC=0,
/* User defined values */
RT_TABLE_COMPAT=252,
RT_TABLE_DEFAULT=253,
RT_TABLE_MAIN=254,
RT_TABLE_LOCAL=255,
RT_TABLE_MAX=0xFFFFFFFF
};
/* Routing message attributes */
enum rtattr_type_t {
RTA_UNSPEC,
RTA_DST,
RTA_SRC,
RTA_IIF,
RTA_OIF,
RTA_GATEWAY,
RTA_PRIORITY,
RTA_PREFSRC,
RTA_METRICS,
RTA_MULTIPATH,
RTA_PROTOINFO, /* no longer used */
RTA_FLOW,
RTA_CACHEINFO,
RTA_SESSION, /* no longer used */
RTA_MP_ALGO, /* no longer used */
RTA_TABLE,
RTA_MARK,
RTA_MFC_STATS,
mpls: Netlink commands to add, remove, and dump routes This change adds two new netlink routing attributes: RTA_VIA and RTA_NEWDST. RTA_VIA specifies the specifies the next machine to send a packet to like RTA_GATEWAY. RTA_VIA differs from RTA_GATEWAY in that it includes the address family of the address of the next machine to send a packet to. Currently the MPLS code supports addresses in AF_INET, AF_INET6 and AF_PACKET. For AF_INET and AF_INET6 the destination mac address is acquired from the neighbour table. For AF_PACKET the destination mac_address is specified in the netlink configuration. I think raw destination mac address support with the family AF_PACKET will prove useful. There is MPLS-TP which is defined to operate on machines that do not support internet packets of any flavor. Further seem to be corner cases where it can be useful. At this point I don't care much either way. RTA_NEWDST specifies the destination address to forward the packet with. MPLS typically changes it's destination address at every hop. For a swap operation RTA_NEWDST is specified with a length of one label. For a push operation RTA_NEWDST is specified with two or more labels. For a pop operation RTA_NEWDST is not specified or equivalently an emtpy RTAN_NEWDST is specified. Those new netlink attributes are used to implement handling of rt-netlink RTM_NEWROUTE, RTM_DELROUTE, and RTM_GETROUTE messages, to maintain the MPLS label table. rtm_to_route_config parses a netlink RTM_NEWROUTE or RTM_DELROUTE message, verify no unhandled attributes or unhandled values are present and sets up the data structures for mpls_route_add and mpls_route_del. I did my best to match up with the existing conventions with the caveats that MPLS addresses are all destination-specific-addresses, and so don't properly have a scope. Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-03-04 08:13:56 +07:00
RTA_VIA,
RTA_NEWDST,
RTA_PREF,
RTA_ENCAP_TYPE,
RTA_ENCAP,
RTA_EXPIRES,
RTA_PAD,
RTA_UID,
RTA_TTL_PROPAGATE,
RTA_IP_PROTO,
RTA_SPORT,
RTA_DPORT,
RTA_NH_ID,
__RTA_MAX
};
#define RTA_MAX (__RTA_MAX - 1)
#define RTM_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct rtmsg))))
#define RTM_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct rtmsg))
/* RTM_MULTIPATH --- array of struct rtnexthop.
*
* "struct rtnexthop" describes all necessary nexthop information,
* i.e. parameters of path to a destination via this nexthop.
*
* At the moment it is impossible to set different prefsrc, mtu, window
* and rtt for different paths from multipath.
*/
struct rtnexthop {
unsigned short rtnh_len;
unsigned char rtnh_flags;
unsigned char rtnh_hops;
int rtnh_ifindex;
};
/* rtnh_flags */
#define RTNH_F_DEAD 1 /* Nexthop is dead (used by multipath) */
#define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */
#define RTNH_F_ONLINK 4 /* Gateway is forced on link */
#define RTNH_F_OFFLOAD 8 /* offloaded route */
#define RTNH_F_LINKDOWN 16 /* carrier-down on nexthop */
#define RTNH_F_UNRESOLVED 32 /* The entry is unresolved (ipmr) */
#define RTNH_COMPARE_MASK (RTNH_F_DEAD | RTNH_F_LINKDOWN | RTNH_F_OFFLOAD)
/* Macros to handle hexthops */
#define RTNH_ALIGNTO 4
#define RTNH_ALIGN(len) ( ((len)+RTNH_ALIGNTO-1) & ~(RTNH_ALIGNTO-1) )
#define RTNH_OK(rtnh,len) ((rtnh)->rtnh_len >= sizeof(struct rtnexthop) && \
((int)(rtnh)->rtnh_len) <= (len))
#define RTNH_NEXT(rtnh) ((struct rtnexthop*)(((char*)(rtnh)) + RTNH_ALIGN((rtnh)->rtnh_len)))
#define RTNH_LENGTH(len) (RTNH_ALIGN(sizeof(struct rtnexthop)) + (len))
#define RTNH_SPACE(len) RTNH_ALIGN(RTNH_LENGTH(len))
#define RTNH_DATA(rtnh) ((struct rtattr*)(((char*)(rtnh)) + RTNH_LENGTH(0)))
mpls: Netlink commands to add, remove, and dump routes This change adds two new netlink routing attributes: RTA_VIA and RTA_NEWDST. RTA_VIA specifies the specifies the next machine to send a packet to like RTA_GATEWAY. RTA_VIA differs from RTA_GATEWAY in that it includes the address family of the address of the next machine to send a packet to. Currently the MPLS code supports addresses in AF_INET, AF_INET6 and AF_PACKET. For AF_INET and AF_INET6 the destination mac address is acquired from the neighbour table. For AF_PACKET the destination mac_address is specified in the netlink configuration. I think raw destination mac address support with the family AF_PACKET will prove useful. There is MPLS-TP which is defined to operate on machines that do not support internet packets of any flavor. Further seem to be corner cases where it can be useful. At this point I don't care much either way. RTA_NEWDST specifies the destination address to forward the packet with. MPLS typically changes it's destination address at every hop. For a swap operation RTA_NEWDST is specified with a length of one label. For a push operation RTA_NEWDST is specified with two or more labels. For a pop operation RTA_NEWDST is not specified or equivalently an emtpy RTAN_NEWDST is specified. Those new netlink attributes are used to implement handling of rt-netlink RTM_NEWROUTE, RTM_DELROUTE, and RTM_GETROUTE messages, to maintain the MPLS label table. rtm_to_route_config parses a netlink RTM_NEWROUTE or RTM_DELROUTE message, verify no unhandled attributes or unhandled values are present and sets up the data structures for mpls_route_add and mpls_route_del. I did my best to match up with the existing conventions with the caveats that MPLS addresses are all destination-specific-addresses, and so don't properly have a scope. Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-03-04 08:13:56 +07:00
/* RTA_VIA */
struct rtvia {
__kernel_sa_family_t rtvia_family;
__u8 rtvia_addr[0];
};
/* RTM_CACHEINFO */
struct rta_cacheinfo {
__u32 rta_clntref;
__u32 rta_lastuse;
__s32 rta_expires;
__u32 rta_error;
__u32 rta_used;
#define RTNETLINK_HAVE_PEERINFO 1
__u32 rta_id;
__u32 rta_ts;
__u32 rta_tsage;
};
/* RTM_METRICS --- array of struct rtattr with types of RTAX_* */
enum {
RTAX_UNSPEC,
#define RTAX_UNSPEC RTAX_UNSPEC
RTAX_LOCK,
#define RTAX_LOCK RTAX_LOCK
RTAX_MTU,
#define RTAX_MTU RTAX_MTU
RTAX_WINDOW,
#define RTAX_WINDOW RTAX_WINDOW
RTAX_RTT,
#define RTAX_RTT RTAX_RTT
RTAX_RTTVAR,
#define RTAX_RTTVAR RTAX_RTTVAR
RTAX_SSTHRESH,
#define RTAX_SSTHRESH RTAX_SSTHRESH
RTAX_CWND,
#define RTAX_CWND RTAX_CWND
RTAX_ADVMSS,
#define RTAX_ADVMSS RTAX_ADVMSS
RTAX_REORDERING,
#define RTAX_REORDERING RTAX_REORDERING
RTAX_HOPLIMIT,
#define RTAX_HOPLIMIT RTAX_HOPLIMIT
RTAX_INITCWND,
#define RTAX_INITCWND RTAX_INITCWND
RTAX_FEATURES,
#define RTAX_FEATURES RTAX_FEATURES
RTAX_RTO_MIN,
#define RTAX_RTO_MIN RTAX_RTO_MIN
RTAX_INITRWND,
#define RTAX_INITRWND RTAX_INITRWND
tcp: introduce a per-route knob for quick ack In previous discussions, I tried to find some reasonable heuristics for delayed ACK, however this seems not possible, according to Eric: "ACKS might also be delayed because of bidirectional traffic, and is more controlled by the application response time. TCP stack can not easily estimate it." "ACK can be incredibly useful to recover from losses in a short time. The vast majority of TCP sessions are small lived, and we send one ACK per received segment anyway at beginning or retransmits to let the sender smoothly increase its cwnd, so an auto-tuning facility wont help them that much." and according to David: "ACKs are the only information we have to detect loss. And, for the same reasons that TCP VEGAS is fundamentally broken, we cannot measure the pipe or some other receiver-side-visible piece of information to determine when it's "safe" to stretch ACK. And even if it's "safe", we should not do it so that losses are accurately detected and we don't spuriously retransmit. The only way to know when the bandwidth increases is to "test" it, by sending more and more packets until drops happen. That's why all successful congestion control algorithms must operate on explicited tested pieces of information. Similarly, it's not really possible to universally know if it's safe to stretch ACK or not." It still makes sense to enable or disable quick ack mode like what TCP_QUICK_ACK does. Similar to TCP_QUICK_ACK option, but for people who can't modify the source code and still wants to control TCP delayed ACK behavior. As David suggested, this should belong to per-path scope, since different pathes may want different behaviors. Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Rick Jones <rick.jones2@hp.com> Cc: Stephen Hemminger <stephen@networkplumber.org> Cc: "David S. Miller" <davem@davemloft.net> Cc: Thomas Graf <tgraf@suug.ch> CC: David Laight <David.Laight@ACULAB.COM> Signed-off-by: Cong Wang <amwang@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-06-15 08:39:18 +07:00
RTAX_QUICKACK,
#define RTAX_QUICKACK RTAX_QUICKACK
RTAX_CC_ALGO,
#define RTAX_CC_ALGO RTAX_CC_ALGO
RTAX_FASTOPEN_NO_COOKIE,
#define RTAX_FASTOPEN_NO_COOKIE RTAX_FASTOPEN_NO_COOKIE
__RTAX_MAX
};
#define RTAX_MAX (__RTAX_MAX - 1)
#define RTAX_FEATURE_ECN (1 << 0)
#define RTAX_FEATURE_SACK (1 << 1)
#define RTAX_FEATURE_TIMESTAMP (1 << 2)
#define RTAX_FEATURE_ALLFRAG (1 << 3)
#define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | \
RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG)
struct rta_session {
__u8 proto;
__u8 pad1;
__u16 pad2;
union {
struct {
__u16 sport;
__u16 dport;
} ports;
struct {
__u8 type;
__u8 code;
__u16 ident;
} icmpt;
__u32 spi;
} u;
};
struct rta_mfc_stats {
__u64 mfcs_packets;
__u64 mfcs_bytes;
__u64 mfcs_wrong_if;
};
/****
* General form of address family dependent message.
****/
struct rtgenmsg {
unsigned char rtgen_family;
};
/*****************************************************************
* Link layer specific messages.
****/
/* struct ifinfomsg
* passes link level specific information, not dependent
* on network protocol.
*/
struct ifinfomsg {
unsigned char ifi_family;
unsigned char __ifi_pad;
unsigned short ifi_type; /* ARPHRD_* */
int ifi_index; /* Link index */
unsigned ifi_flags; /* IFF_* flags */
unsigned ifi_change; /* IFF_* change mask */
};
/********************************************************************
* prefix information
****/
struct prefixmsg {
unsigned char prefix_family;
unsigned char prefix_pad1;
unsigned short prefix_pad2;
int prefix_ifindex;
unsigned char prefix_type;
unsigned char prefix_len;
unsigned char prefix_flags;
unsigned char prefix_pad3;
};
enum
{
PREFIX_UNSPEC,
PREFIX_ADDRESS,
PREFIX_CACHEINFO,
__PREFIX_MAX
};
#define PREFIX_MAX (__PREFIX_MAX - 1)
struct prefix_cacheinfo {
__u32 preferred_time;
__u32 valid_time;
};
/*****************************************************************
* Traffic control messages.
****/
struct tcmsg {
unsigned char tcm_family;
unsigned char tcm__pad1;
unsigned short tcm__pad2;
int tcm_ifindex;
__u32 tcm_handle;
__u32 tcm_parent;
/* tcm_block_index is used instead of tcm_parent
* in case tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK
*/
#define tcm_block_index tcm_parent
__u32 tcm_info;
};
/* For manipulation of filters in shared block, tcm_ifindex is set to
* TCM_IFINDEX_MAGIC_BLOCK, and tcm_parent is aliased to tcm_block_index
* which is the block index.
*/
#define TCM_IFINDEX_MAGIC_BLOCK (0xFFFFFFFFU)
enum {
TCA_UNSPEC,
TCA_KIND,
TCA_OPTIONS,
TCA_STATS,
TCA_XSTATS,
TCA_RATE,
TCA_FCNT,
TCA_STATS2,
TCA_STAB,
TCA_PAD,
TCA_DUMP_INVISIBLE,
TCA_CHAIN,
TCA_HW_OFFLOAD,
TCA_INGRESS_BLOCK,
TCA_EGRESS_BLOCK,
__TCA_MAX
};
#define TCA_MAX (__TCA_MAX - 1)
#define TCA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcmsg))))
#define TCA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcmsg))
/********************************************************************
* Neighbor Discovery userland options
****/
struct nduseroptmsg {
unsigned char nduseropt_family;
unsigned char nduseropt_pad1;
unsigned short nduseropt_opts_len; /* Total length of options */
int nduseropt_ifindex;
__u8 nduseropt_icmp_type;
__u8 nduseropt_icmp_code;
unsigned short nduseropt_pad2;
unsigned int nduseropt_pad3;
/* Followed by one or more ND options */
};
enum {
NDUSEROPT_UNSPEC,
NDUSEROPT_SRCADDR,
__NDUSEROPT_MAX
};
#define NDUSEROPT_MAX (__NDUSEROPT_MAX - 1)
#ifndef __KERNEL__
/* RTnetlink multicast groups - backwards compatibility for userspace */
#define RTMGRP_LINK 1
#define RTMGRP_NOTIFY 2
#define RTMGRP_NEIGH 4
#define RTMGRP_TC 8
#define RTMGRP_IPV4_IFADDR 0x10
#define RTMGRP_IPV4_MROUTE 0x20
#define RTMGRP_IPV4_ROUTE 0x40
#define RTMGRP_IPV4_RULE 0x80
#define RTMGRP_IPV6_IFADDR 0x100
#define RTMGRP_IPV6_MROUTE 0x200
#define RTMGRP_IPV6_ROUTE 0x400
#define RTMGRP_IPV6_IFINFO 0x800
#define RTMGRP_DECnet_IFADDR 0x1000
#define RTMGRP_DECnet_ROUTE 0x4000
#define RTMGRP_IPV6_PREFIX 0x20000
#endif
/* RTnetlink multicast groups */
enum rtnetlink_groups {
RTNLGRP_NONE,
#define RTNLGRP_NONE RTNLGRP_NONE
RTNLGRP_LINK,
#define RTNLGRP_LINK RTNLGRP_LINK
RTNLGRP_NOTIFY,
#define RTNLGRP_NOTIFY RTNLGRP_NOTIFY
RTNLGRP_NEIGH,
#define RTNLGRP_NEIGH RTNLGRP_NEIGH
RTNLGRP_TC,
#define RTNLGRP_TC RTNLGRP_TC
RTNLGRP_IPV4_IFADDR,
#define RTNLGRP_IPV4_IFADDR RTNLGRP_IPV4_IFADDR
RTNLGRP_IPV4_MROUTE,
#define RTNLGRP_IPV4_MROUTE RTNLGRP_IPV4_MROUTE
RTNLGRP_IPV4_ROUTE,
#define RTNLGRP_IPV4_ROUTE RTNLGRP_IPV4_ROUTE
RTNLGRP_IPV4_RULE,
#define RTNLGRP_IPV4_RULE RTNLGRP_IPV4_RULE
RTNLGRP_IPV6_IFADDR,
#define RTNLGRP_IPV6_IFADDR RTNLGRP_IPV6_IFADDR
RTNLGRP_IPV6_MROUTE,
#define RTNLGRP_IPV6_MROUTE RTNLGRP_IPV6_MROUTE
RTNLGRP_IPV6_ROUTE,
#define RTNLGRP_IPV6_ROUTE RTNLGRP_IPV6_ROUTE
RTNLGRP_IPV6_IFINFO,
#define RTNLGRP_IPV6_IFINFO RTNLGRP_IPV6_IFINFO
RTNLGRP_DECnet_IFADDR,
#define RTNLGRP_DECnet_IFADDR RTNLGRP_DECnet_IFADDR
RTNLGRP_NOP2,
RTNLGRP_DECnet_ROUTE,
#define RTNLGRP_DECnet_ROUTE RTNLGRP_DECnet_ROUTE
RTNLGRP_DECnet_RULE,
#define RTNLGRP_DECnet_RULE RTNLGRP_DECnet_RULE
RTNLGRP_NOP4,
RTNLGRP_IPV6_PREFIX,
#define RTNLGRP_IPV6_PREFIX RTNLGRP_IPV6_PREFIX
RTNLGRP_IPV6_RULE,
#define RTNLGRP_IPV6_RULE RTNLGRP_IPV6_RULE
RTNLGRP_ND_USEROPT,
#define RTNLGRP_ND_USEROPT RTNLGRP_ND_USEROPT
RTNLGRP_PHONET_IFADDR,
#define RTNLGRP_PHONET_IFADDR RTNLGRP_PHONET_IFADDR
RTNLGRP_PHONET_ROUTE,
#define RTNLGRP_PHONET_ROUTE RTNLGRP_PHONET_ROUTE
RTNLGRP_DCB,
#define RTNLGRP_DCB RTNLGRP_DCB
RTNLGRP_IPV4_NETCONF,
#define RTNLGRP_IPV4_NETCONF RTNLGRP_IPV4_NETCONF
RTNLGRP_IPV6_NETCONF,
#define RTNLGRP_IPV6_NETCONF RTNLGRP_IPV6_NETCONF
RTNLGRP_MDB,
#define RTNLGRP_MDB RTNLGRP_MDB
RTNLGRP_MPLS_ROUTE,
#define RTNLGRP_MPLS_ROUTE RTNLGRP_MPLS_ROUTE
RTNLGRP_NSID,
#define RTNLGRP_NSID RTNLGRP_NSID
RTNLGRP_MPLS_NETCONF,
#define RTNLGRP_MPLS_NETCONF RTNLGRP_MPLS_NETCONF
RTNLGRP_IPV4_MROUTE_R,
#define RTNLGRP_IPV4_MROUTE_R RTNLGRP_IPV4_MROUTE_R
RTNLGRP_IPV6_MROUTE_R,
#define RTNLGRP_IPV6_MROUTE_R RTNLGRP_IPV6_MROUTE_R
RTNLGRP_NEXTHOP,
#define RTNLGRP_NEXTHOP RTNLGRP_NEXTHOP
RTNLGRP_BRVLAN,
#define RTNLGRP_BRVLAN RTNLGRP_BRVLAN
__RTNLGRP_MAX
};
#define RTNLGRP_MAX (__RTNLGRP_MAX - 1)
/* TC action piece */
struct tcamsg {
unsigned char tca_family;
unsigned char tca__pad1;
unsigned short tca__pad2;
};
net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch When you dump hundreds of thousands of actions, getting only 32 per dump batch even when the socket buffer and memory allocations allow is inefficient. With this change, the user will get as many as possibly fitting within the given constraints available to the kernel. The top level action TLV space is extended. An attribute TCA_ROOT_FLAGS is used to carry flags; flag TCA_FLAG_LARGE_DUMP_ON is set by the user indicating the user is capable of processing these large dumps. Older user space which doesnt set this flag doesnt get the large (than 32) batches. The kernel uses the TCA_ROOT_COUNT attribute to tell the user how many actions are put in a single batch. As such user space app knows how long to iterate (independent of the type of action being dumped) instead of hardcoded maximum of 32 thus maintaining backward compat. Some results dumping 1.5M actions below: first an unpatched tc which doesnt understand these features... prompt$ time -p tc actions ls action gact | grep index | wc -l 1500000 real 1388.43 user 2.07 sys 1386.79 Now lets see a patched tc which sets the correct flags when requesting a dump: prompt$ time -p updatedtc actions ls action gact | grep index | wc -l 1500000 real 178.13 user 2.02 sys 176.96 That is about 8x performance improvement for tc app which sets its receive buffer to about 32K. Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-07-31 00:24:51 +07:00
enum {
TCA_ROOT_UNSPEC,
TCA_ROOT_TAB,
#define TCA_ACT_TAB TCA_ROOT_TAB
#define TCAA_MAX TCA_ROOT_TAB
TCA_ROOT_FLAGS,
TCA_ROOT_COUNT,
net sched actions: add time filter for action dumping This patch adds support for filtering based on time since last used. When we are dumping a large number of actions it is useful to have the option of filtering based on when the action was last used to reduce the amount of data crossing to user space. With this patch the user space app sets the TCA_ROOT_TIME_DELTA attribute with the value in milliseconds with "time of interest since now". The kernel converts this to jiffies and does the filtering comparison matching entries that have seen activity since then and returns them to user space. Old kernels and old tc continue to work in legacy mode since they dont specify this attribute. Some example (we have 400 actions bound to 400 filters); at installation time. Using updated when tc setting the time of interest to 120 seconds earlier (we see 400 actions): prompt$ hackedtc actions ls action gact since 120000| grep index | wc -l 400 go get some coffee and wait for > 120 seconds and try again: prompt$ hackedtc actions ls action gact since 120000 | grep index | wc -l 0 Lets see a filter bound to one of these actions: .... filter pref 10 u32 filter pref 10 u32 fh 800: ht divisor 1 filter pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:10 (rule hit 2 success 1) match 7f000002/ffffffff at 12 (success 1 ) action order 1: gact action pass random type none pass val 0 index 23 ref 2 bind 1 installed 1145 sec used 802 sec Action statistics: Sent 84 bytes 1 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 .... that coffee took long, no? It was good. Now lets ping -c 1 127.0.0.2, then run the actions again: prompt$ hackedtc actions ls action gact since 120 | grep index | wc -l 1 More details please: prompt$ hackedtc -s actions ls action gact since 120000 action order 0: gact action pass random type none pass val 0 index 23 ref 2 bind 1 installed 1270 sec used 30 sec Action statistics: Sent 168 bytes 2 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 And the filter? filter pref 10 u32 filter pref 10 u32 fh 800: ht divisor 1 filter pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:10 (rule hit 4 success 2) match 7f000002/ffffffff at 12 (success 2 ) action order 1: gact action pass random type none pass val 0 index 23 ref 2 bind 1 installed 1324 sec used 84 sec Action statistics: Sent 168 bytes 2 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-07-31 00:24:52 +07:00
TCA_ROOT_TIME_DELTA, /* in msecs */
net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch When you dump hundreds of thousands of actions, getting only 32 per dump batch even when the socket buffer and memory allocations allow is inefficient. With this change, the user will get as many as possibly fitting within the given constraints available to the kernel. The top level action TLV space is extended. An attribute TCA_ROOT_FLAGS is used to carry flags; flag TCA_FLAG_LARGE_DUMP_ON is set by the user indicating the user is capable of processing these large dumps. Older user space which doesnt set this flag doesnt get the large (than 32) batches. The kernel uses the TCA_ROOT_COUNT attribute to tell the user how many actions are put in a single batch. As such user space app knows how long to iterate (independent of the type of action being dumped) instead of hardcoded maximum of 32 thus maintaining backward compat. Some results dumping 1.5M actions below: first an unpatched tc which doesnt understand these features... prompt$ time -p tc actions ls action gact | grep index | wc -l 1500000 real 1388.43 user 2.07 sys 1386.79 Now lets see a patched tc which sets the correct flags when requesting a dump: prompt$ time -p updatedtc actions ls action gact | grep index | wc -l 1500000 real 178.13 user 2.02 sys 176.96 That is about 8x performance improvement for tc app which sets its receive buffer to about 32K. Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-07-31 00:24:51 +07:00
__TCA_ROOT_MAX,
#define TCA_ROOT_MAX (__TCA_ROOT_MAX - 1)
};
#define TA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg))))
#define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg))
net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch When you dump hundreds of thousands of actions, getting only 32 per dump batch even when the socket buffer and memory allocations allow is inefficient. With this change, the user will get as many as possibly fitting within the given constraints available to the kernel. The top level action TLV space is extended. An attribute TCA_ROOT_FLAGS is used to carry flags; flag TCA_FLAG_LARGE_DUMP_ON is set by the user indicating the user is capable of processing these large dumps. Older user space which doesnt set this flag doesnt get the large (than 32) batches. The kernel uses the TCA_ROOT_COUNT attribute to tell the user how many actions are put in a single batch. As such user space app knows how long to iterate (independent of the type of action being dumped) instead of hardcoded maximum of 32 thus maintaining backward compat. Some results dumping 1.5M actions below: first an unpatched tc which doesnt understand these features... prompt$ time -p tc actions ls action gact | grep index | wc -l 1500000 real 1388.43 user 2.07 sys 1386.79 Now lets see a patched tc which sets the correct flags when requesting a dump: prompt$ time -p updatedtc actions ls action gact | grep index | wc -l 1500000 real 178.13 user 2.02 sys 176.96 That is about 8x performance improvement for tc app which sets its receive buffer to about 32K. Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-07-31 00:24:51 +07:00
/* tcamsg flags stored in attribute TCA_ROOT_FLAGS
*
* TCA_FLAG_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO
* actions in a dump. All dump responses will contain the number of actions
* being dumped stored in for user app's consumption in TCA_ROOT_COUNT
*
*/
#define TCA_FLAG_LARGE_DUMP_ON (1 << 0)
/* New extended info filters for IFLA_EXT_MASK */
#define RTEXT_FILTER_VF (1 << 0)
#define RTEXT_FILTER_BRVLAN (1 << 1)
#define RTEXT_FILTER_BRVLAN_COMPRESSED (1 << 2)
#define RTEXT_FILTER_SKIP_STATS (1 << 3)
/* End of information exported to user level */
#endif /* _UAPI__LINUX_RTNETLINK_H */