mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-17 17:46:43 +07:00
1f211a1b92
This work adds a generalization of the ingress qdisc as a qdisc holding only classifiers. The clsact qdisc works on ingress, but also on egress. In both cases, it's execution happens without taking the qdisc lock, and the main difference for the egress part compared to prior version of [1] is that this can be applied with _any_ underlying real egress qdisc (also classless ones). Besides solving the use-case of [1], that is, allowing for more programmability on assigning skb->priority for the mqprio case that is supported by most popular 10G+ NICs, it also opens up a lot more flexibility for other tc applications. The main work on classification can already be done at clsact egress time if the use-case allows and state stored for later retrieval f.e. again in skb->priority with major/minors (which is checked by most classful qdiscs before consulting tc_classify()) and/or in other skb fields like skb->tc_index for some light-weight post-processing to get to the eventual classid in case of a classful qdisc. Another use case is that the clsact egress part allows to have a central egress counterpart to the ingress classifiers, so that classifiers can easily share state (e.g. in cls_bpf via eBPF maps) for ingress and egress. Currently, default setups like mq + pfifo_fast would require for this to use, for example, prio qdisc instead (to get a tc_classify() run) and to duplicate the egress classifier for each queue. With clsact, it allows for leaving the setup as is, it can additionally assign skb->priority to put the skb in one of pfifo_fast's bands and it can share state with maps. Moreover, we can access the skb's dst entry (f.e. to retrieve tclassid) w/o the need to perform a skb_dst_force() to hold on to it any longer. In lwt case, we can also use this facility to setup dst metadata via cls_bpf (bpf_skb_set_tunnel_key()) without needing a real egress qdisc just for that (case of IFF_NO_QUEUE devices, for example). The realization can be done without any changes to the scheduler core framework. All it takes is that we have two a-priori defined minors/child classes, where we can mux between ingress and egress classifier list (dev->ingress_cl_list and dev->egress_cl_list, latter stored close to dev->_tx to avoid extra cacheline miss for moderate loads). The egress part is a bit similar modelled to handle_ing() and patched to a noop in case the functionality is not used. Both handlers are now called sch_handle_ingress() and sch_handle_egress(), code sharing among the two doesn't seem practical as there are various minor differences in both paths, so that making them conditional in a single handler would rather slow things down. Full compatibility to ingress qdisc is provided as well. Since both piggyback on TC_H_CLSACT, only one of them (ingress/clsact) can exist per netdevice, and thus ingress qdisc specific behaviour can be retained for user space. This means, either a user does 'tc qdisc add dev foo ingress' and configures ingress qdisc as usual, or the 'tc qdisc add dev foo clsact' alternative, where both, ingress and egress classifier can be configured as in the below example. ingress qdisc supports attaching classifier to any minor number whereas clsact has two fixed minors for muxing between the lists, therefore to not break user space setups, they are better done as two separate qdiscs. I decided to extend the sch_ingress module with clsact functionality so that commonly used code can be reused, the module is being aliased with sch_clsact so that it can be auto-loaded properly. Alternative would have been to add a flag when initializing ingress to alter its behaviour plus aliasing to a different name (as it's more than just ingress). However, the first would end up, based on the flag, choosing the new/old behaviour by calling different function implementations to handle each anyway, the latter would require to register ingress qdisc once again under different alias. So, this really begs to provide a minimal, cleaner approach to have Qdisc_ops and Qdisc_class_ops by its own that share callbacks used by both. Example, adding qdisc: # tc qdisc add dev foo clsact # tc qdisc show dev foo qdisc mq 0: root qdisc pfifo_fast 0: parent :1 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 qdisc pfifo_fast 0: parent :2 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 qdisc pfifo_fast 0: parent :3 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 qdisc pfifo_fast 0: parent :4 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 qdisc clsact ffff: parent ffff:fff1 Adding filters (deleting, etc works analogous by specifying ingress/egress): # tc filter add dev foo ingress bpf da obj bar.o sec ingress # tc filter add dev foo egress bpf da obj bar.o sec egress # tc filter show dev foo ingress filter protocol all pref 49152 bpf filter protocol all pref 49152 bpf handle 0x1 bar.o:[ingress] direct-action # tc filter show dev foo egress filter protocol all pref 49152 bpf filter protocol all pref 49152 bpf handle 0x1 bar.o:[egress] direct-action A 'tc filter show dev foo' or 'tc filter show dev foo parent ffff:' will show an empty list for clsact. Either using the parent names (ingress/egress) or specifying the full major/minor will then show the related filter lists. Prior work on a mqprio prequeue() facility [1] was done mainly by John Fastabend. [1] http://patchwork.ozlabs.org/patch/512949/ Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: John Fastabend <john.r.fastabend@intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
128 lines
4.0 KiB
C
128 lines
4.0 KiB
C
#ifndef __LINUX_RTNETLINK_H
|
|
#define __LINUX_RTNETLINK_H
|
|
|
|
|
|
#include <linux/mutex.h>
|
|
#include <linux/netdevice.h>
|
|
#include <linux/wait.h>
|
|
#include <uapi/linux/rtnetlink.h>
|
|
|
|
extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo);
|
|
extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid);
|
|
extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid,
|
|
u32 group, struct nlmsghdr *nlh, gfp_t flags);
|
|
extern void rtnl_set_sk_err(struct net *net, u32 group, int error);
|
|
extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics);
|
|
extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst,
|
|
u32 id, long expires, u32 error);
|
|
|
|
void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags);
|
|
struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
|
|
unsigned change, gfp_t flags);
|
|
void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev,
|
|
gfp_t flags);
|
|
|
|
|
|
/* RTNL is used as a global lock for all changes to network configuration */
|
|
extern void rtnl_lock(void);
|
|
extern void rtnl_unlock(void);
|
|
extern int rtnl_trylock(void);
|
|
extern int rtnl_is_locked(void);
|
|
|
|
extern wait_queue_head_t netdev_unregistering_wq;
|
|
extern struct mutex net_mutex;
|
|
|
|
#ifdef CONFIG_PROVE_LOCKING
|
|
extern bool lockdep_rtnl_is_held(void);
|
|
#else
|
|
static inline bool lockdep_rtnl_is_held(void)
|
|
{
|
|
return true;
|
|
}
|
|
#endif /* #ifdef CONFIG_PROVE_LOCKING */
|
|
|
|
/**
|
|
* rcu_dereference_rtnl - rcu_dereference with debug checking
|
|
* @p: The pointer to read, prior to dereferencing
|
|
*
|
|
* Do an rcu_dereference(p), but check caller either holds rcu_read_lock()
|
|
* or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference()
|
|
*/
|
|
#define rcu_dereference_rtnl(p) \
|
|
rcu_dereference_check(p, lockdep_rtnl_is_held())
|
|
|
|
/**
|
|
* rcu_dereference_bh_rtnl - rcu_dereference_bh with debug checking
|
|
* @p: The pointer to read, prior to dereference
|
|
*
|
|
* Do an rcu_dereference_bh(p), but check caller either holds rcu_read_lock_bh()
|
|
* or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference_bh()
|
|
*/
|
|
#define rcu_dereference_bh_rtnl(p) \
|
|
rcu_dereference_bh_check(p, lockdep_rtnl_is_held())
|
|
|
|
/**
|
|
* rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL
|
|
* @p: The pointer to read, prior to dereferencing
|
|
*
|
|
* Return the value of the specified RCU-protected pointer, but omit
|
|
* both the smp_read_barrier_depends() and the ACCESS_ONCE(), because
|
|
* caller holds RTNL.
|
|
*/
|
|
#define rtnl_dereference(p) \
|
|
rcu_dereference_protected(p, lockdep_rtnl_is_held())
|
|
|
|
static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
|
|
{
|
|
return rtnl_dereference(dev->ingress_queue);
|
|
}
|
|
|
|
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev);
|
|
|
|
#ifdef CONFIG_NET_INGRESS
|
|
void net_inc_ingress_queue(void);
|
|
void net_dec_ingress_queue(void);
|
|
#endif
|
|
|
|
#ifdef CONFIG_NET_EGRESS
|
|
void net_inc_egress_queue(void);
|
|
void net_dec_egress_queue(void);
|
|
#endif
|
|
|
|
extern void rtnetlink_init(void);
|
|
extern void __rtnl_unlock(void);
|
|
|
|
#define ASSERT_RTNL() do { \
|
|
if (unlikely(!rtnl_is_locked())) { \
|
|
printk(KERN_ERR "RTNL: assertion failed at %s (%d)\n", \
|
|
__FILE__, __LINE__); \
|
|
dump_stack(); \
|
|
} \
|
|
} while(0)
|
|
|
|
extern int ndo_dflt_fdb_dump(struct sk_buff *skb,
|
|
struct netlink_callback *cb,
|
|
struct net_device *dev,
|
|
struct net_device *filter_dev,
|
|
int idx);
|
|
extern int ndo_dflt_fdb_add(struct ndmsg *ndm,
|
|
struct nlattr *tb[],
|
|
struct net_device *dev,
|
|
const unsigned char *addr,
|
|
u16 vid,
|
|
u16 flags);
|
|
extern int ndo_dflt_fdb_del(struct ndmsg *ndm,
|
|
struct nlattr *tb[],
|
|
struct net_device *dev,
|
|
const unsigned char *addr,
|
|
u16 vid);
|
|
|
|
extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
|
|
struct net_device *dev, u16 mode,
|
|
u32 flags, u32 mask, int nlflags,
|
|
u32 filter_mask,
|
|
int (*vlan_fill)(struct sk_buff *skb,
|
|
struct net_device *dev,
|
|
u32 filter_mask));
|
|
#endif /* __LINUX_RTNETLINK_H */
|