linux_dsm_epyc7002/include/linux/ipv6.h
Eric Dumazet 81c3d5470e [INET]: speedup inet (tcp/dccp) lookups
Arnaldo and I agreed it could be applied now, because I have other
pending patches depending on this one (Thank you Arnaldo)

(The other important patch moves skc_refcnt in a separate cache line,
so that the SMP/NUMA performance doesnt suffer from cache line ping pongs)

1) First some performance data :
--------------------------------

tcp_v4_rcv() wastes a *lot* of time in __inet_lookup_established()

The most time critical code is :

sk_for_each(sk, node, &head->chain) {
     if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif))
         goto hit; /* You sunk my battleship! */
}

The sk_for_each() does use prefetch() hints but only the begining of
"struct sock" is prefetched.

As INET_MATCH first comparison uses inet_sk(__sk)->daddr, wich is far
away from the begining of "struct sock", it has to bring into CPU
cache cold cache line. Each iteration has to use at least 2 cache
lines.

This can be problematic if some chains are very long.

2) The goal
-----------

The idea I had is to change things so that INET_MATCH() may return
FALSE in 99% of cases only using the data already in the CPU cache,
using one cache line per iteration.

3) Description of the patch
---------------------------

Adds a new 'unsigned int skc_hash' field in 'struct sock_common',
filling a 32 bits hole on 64 bits platform.

struct sock_common {
	unsigned short		skc_family;
	volatile unsigned char	skc_state;
	unsigned char		skc_reuse;
	int			skc_bound_dev_if;
	struct hlist_node	skc_node;
	struct hlist_node	skc_bind_node;
	atomic_t		skc_refcnt;
+	unsigned int		skc_hash;
	struct proto		*skc_prot;
};

Store in this 32 bits field the full hash, not masked by (ehash_size -
1) Using this full hash as the first comparison done in INET_MATCH
permits us immediatly skip the element without touching a second cache
line in case of a miss.

Suppress the sk_hashent/tw_hashent fields since skc_hash (aliased to
sk_hash and tw_hash) already contains the slot number if we mask with
(ehash_size - 1)

File include/net/inet_hashtables.h

64 bits platforms :
#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
     (((__sk)->sk_hash == (__hash))
     ((*((__u64 *)&(inet_sk(__sk)->daddr)))== (__cookie))   &&  \
     ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports))   &&  \
     (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))

32bits platforms:
#define TCP_IPV4_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
     (((__sk)->sk_hash == (__hash))                 &&  \
     (inet_sk(__sk)->daddr          == (__saddr))   &&  \
     (inet_sk(__sk)->rcv_saddr      == (__daddr))   &&  \
     (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))


- Adds a prefetch(head->chain.first) in 
__inet_lookup_established()/__tcp_v4_check_established() and 
__inet6_lookup_established()/__tcp_v6_check_established() and 
__dccp_v4_check_established() to bring into cache the first element of the 
list, before the {read|write}_lock(&head->lock);

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-10-03 14:13:38 -07:00

386 lines
8.3 KiB
C

#ifndef _IPV6_H
#define _IPV6_H
#include <linux/config.h>
#include <linux/in6.h>
#include <asm/byteorder.h>
/* The latest drafts declared increase in minimal mtu up to 1280. */
#define IPV6_MIN_MTU 1280
/*
* Advanced API
* source interface/address selection, source routing, etc...
* *under construction*
*/
struct in6_pktinfo {
struct in6_addr ipi6_addr;
int ipi6_ifindex;
};
struct in6_ifreq {
struct in6_addr ifr6_addr;
__u32 ifr6_prefixlen;
int ifr6_ifindex;
};
#define IPV6_SRCRT_STRICT 0x01 /* this hop must be a neighbor */
#define IPV6_SRCRT_TYPE_0 0 /* IPv6 type 0 Routing Header */
/*
* routing header
*/
struct ipv6_rt_hdr {
__u8 nexthdr;
__u8 hdrlen;
__u8 type;
__u8 segments_left;
/*
* type specific data
* variable length field
*/
};
struct ipv6_opt_hdr {
__u8 nexthdr;
__u8 hdrlen;
/*
* TLV encoded option data follows.
*/
};
#define ipv6_destopt_hdr ipv6_opt_hdr
#define ipv6_hopopt_hdr ipv6_opt_hdr
#ifdef __KERNEL__
#define ipv6_optlen(p) (((p)->hdrlen+1) << 3)
#endif
/*
* routing header type 0 (used in cmsghdr struct)
*/
struct rt0_hdr {
struct ipv6_rt_hdr rt_hdr;
__u32 reserved;
struct in6_addr addr[0];
#define rt0_type rt_hdr.type
};
struct ipv6_auth_hdr {
__u8 nexthdr;
__u8 hdrlen; /* This one is measured in 32 bit units! */
__u16 reserved;
__u32 spi;
__u32 seq_no; /* Sequence number */
__u8 auth_data[0]; /* Length variable but >=4. Mind the 64 bit alignment! */
};
struct ipv6_esp_hdr {
__u32 spi;
__u32 seq_no; /* Sequence number */
__u8 enc_data[0]; /* Length variable but >=8. Mind the 64 bit alignment! */
};
struct ipv6_comp_hdr {
__u8 nexthdr;
__u8 flags;
__u16 cpi;
};
/*
* IPv6 fixed header
*
* BEWARE, it is incorrect. The first 4 bits of flow_lbl
* are glued to priority now, forming "class".
*/
struct ipv6hdr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u8 priority:4,
version:4;
#elif defined(__BIG_ENDIAN_BITFIELD)
__u8 version:4,
priority:4;
#else
#error "Please fix <asm/byteorder.h>"
#endif
__u8 flow_lbl[3];
__u16 payload_len;
__u8 nexthdr;
__u8 hop_limit;
struct in6_addr saddr;
struct in6_addr daddr;
};
/*
* This structure contains configuration options per IPv6 link.
*/
struct ipv6_devconf {
__s32 forwarding;
__s32 hop_limit;
__s32 mtu6;
__s32 accept_ra;
__s32 accept_redirects;
__s32 autoconf;
__s32 dad_transmits;
__s32 rtr_solicits;
__s32 rtr_solicit_interval;
__s32 rtr_solicit_delay;
__s32 force_mld_version;
#ifdef CONFIG_IPV6_PRIVACY
__s32 use_tempaddr;
__s32 temp_valid_lft;
__s32 temp_prefered_lft;
__s32 regen_max_retry;
__s32 max_desync_factor;
#endif
__s32 max_addresses;
void *sysctl;
};
/* index values for the variables in ipv6_devconf */
enum {
DEVCONF_FORWARDING = 0,
DEVCONF_HOPLIMIT,
DEVCONF_MTU6,
DEVCONF_ACCEPT_RA,
DEVCONF_ACCEPT_REDIRECTS,
DEVCONF_AUTOCONF,
DEVCONF_DAD_TRANSMITS,
DEVCONF_RTR_SOLICITS,
DEVCONF_RTR_SOLICIT_INTERVAL,
DEVCONF_RTR_SOLICIT_DELAY,
DEVCONF_USE_TEMPADDR,
DEVCONF_TEMP_VALID_LFT,
DEVCONF_TEMP_PREFERED_LFT,
DEVCONF_REGEN_MAX_RETRY,
DEVCONF_MAX_DESYNC_FACTOR,
DEVCONF_MAX_ADDRESSES,
DEVCONF_FORCE_MLD_VERSION,
DEVCONF_MAX
};
#ifdef __KERNEL__
#include <linux/in6.h> /* struct sockaddr_in6 */
#include <linux/icmpv6.h>
#include <net/if_inet6.h> /* struct ipv6_mc_socklist */
#include <linux/tcp.h>
#include <linux/udp.h>
/*
This structure contains results of exthdrs parsing
as offsets from skb->nh.
*/
struct inet6_skb_parm {
int iif;
__u16 ra;
__u16 hop;
__u16 dst0;
__u16 srcrt;
__u16 dst1;
__u16 lastopt;
};
#define IP6CB(skb) ((struct inet6_skb_parm*)((skb)->cb))
static inline int inet6_iif(const struct sk_buff *skb)
{
return IP6CB(skb)->iif;
}
struct tcp6_request_sock {
struct tcp_request_sock req;
struct in6_addr loc_addr;
struct in6_addr rmt_addr;
struct sk_buff *pktopts;
int iif;
};
static inline struct tcp6_request_sock *tcp6_rsk(const struct request_sock *sk)
{
return (struct tcp6_request_sock *)sk;
}
/**
* struct ipv6_pinfo - ipv6 private area
*
* In the struct sock hierarchy (tcp6_sock, upd6_sock, etc)
* this _must_ be the last member, so that inet6_sk_generic
* is able to calculate its offset from the base struct sock
* by using the struct proto->slab_obj_size member. -acme
*/
struct ipv6_pinfo {
struct in6_addr saddr;
struct in6_addr rcv_saddr;
struct in6_addr daddr;
struct in6_addr *daddr_cache;
__u32 flow_label;
__u32 frag_size;
__s16 hop_limit;
__s16 mcast_hops;
int mcast_oif;
/* pktoption flags */
union {
struct {
__u16 srcrt:2,
osrcrt:2,
rxinfo:1,
rxoinfo:1,
rxhlim:1,
rxohlim:1,
hopopts:1,
ohopopts:1,
dstopts:1,
odstopts:1,
rxflow:1,
rxtclass:1;
} bits;
__u16 all;
} rxopt;
/* sockopt flags */
__u8 mc_loop:1,
recverr:1,
sndflow:1,
pmtudisc:2,
ipv6only:1;
__u8 tclass;
__u32 dst_cookie;
struct ipv6_mc_socklist *ipv6_mc_list;
struct ipv6_ac_socklist *ipv6_ac_list;
struct ipv6_fl_socklist *ipv6_fl_list;
struct ipv6_txoptions *opt;
struct sk_buff *pktoptions;
struct {
struct ipv6_txoptions *opt;
struct rt6_info *rt;
int hop_limit;
int tclass;
} cork;
};
/* WARNING: don't change the layout of the members in {raw,udp,tcp}6_sock! */
struct raw6_sock {
/* inet_sock has to be the first member of raw6_sock */
struct inet_sock inet;
__u32 checksum; /* perform checksum */
__u32 offset; /* checksum offset */
struct icmp6_filter filter;
/* ipv6_pinfo has to be the last member of raw6_sock, see inet6_sk_generic */
struct ipv6_pinfo inet6;
};
struct udp6_sock {
struct udp_sock udp;
/* ipv6_pinfo has to be the last member of udp6_sock, see inet6_sk_generic */
struct ipv6_pinfo inet6;
};
struct tcp6_sock {
struct tcp_sock tcp;
/* ipv6_pinfo has to be the last member of tcp6_sock, see inet6_sk_generic */
struct ipv6_pinfo inet6;
};
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
{
return inet_sk(__sk)->pinet6;
}
static inline struct raw6_sock *raw6_sk(const struct sock *sk)
{
return (struct raw6_sock *)sk;
}
static inline void inet_sk_copy_descendant(struct sock *sk_to,
const struct sock *sk_from)
{
int ancestor_size = sizeof(struct inet_sock);
if (sk_from->sk_family == PF_INET6)
ancestor_size += sizeof(struct ipv6_pinfo);
__inet_sk_copy_descendant(sk_to, sk_from, ancestor_size);
}
#define __ipv6_only_sock(sk) (inet6_sk(sk)->ipv6only)
#define ipv6_only_sock(sk) ((sk)->sk_family == PF_INET6 && __ipv6_only_sock(sk))
#include <linux/tcp.h>
struct tcp6_timewait_sock {
struct tcp_timewait_sock tw_v6_sk;
struct in6_addr tw_v6_daddr;
struct in6_addr tw_v6_rcv_saddr;
};
static inline struct tcp6_timewait_sock *tcp6_twsk(const struct sock *sk)
{
return (struct tcp6_timewait_sock *)sk;
}
static inline struct in6_addr *__tcp_v6_rcv_saddr(const struct sock *sk)
{
return likely(sk->sk_state != TCP_TIME_WAIT) ?
&inet6_sk(sk)->rcv_saddr : &tcp6_twsk(sk)->tw_v6_rcv_saddr;
}
static inline struct in6_addr *tcp_v6_rcv_saddr(const struct sock *sk)
{
return sk->sk_family == AF_INET6 ? __tcp_v6_rcv_saddr(sk) : NULL;
}
static inline int inet_v6_ipv6only(const struct sock *sk)
{
return likely(sk->sk_state != TCP_TIME_WAIT) ?
ipv6_only_sock(sk) : inet_twsk(sk)->tw_ipv6only;
}
#else
#define __ipv6_only_sock(sk) 0
#define ipv6_only_sock(sk) 0
static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
{
return NULL;
}
static inline struct raw6_sock *raw6_sk(const struct sock *sk)
{
return NULL;
}
#define __tcp_v6_rcv_saddr(__sk) NULL
#define tcp_v6_rcv_saddr(__sk) NULL
#define tcp_twsk_ipv6only(__sk) 0
#define inet_v6_ipv6only(__sk) 0
#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
#define INET6_MATCH(__sk, __hash, __saddr, __daddr, __ports, __dif)\
(((__sk)->sk_hash == (__hash)) && \
((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \
((__sk)->sk_family == AF_INET6) && \
ipv6_addr_equal(&inet6_sk(__sk)->daddr, (__saddr)) && \
ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr)) && \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
#endif /* __KERNEL__ */
#endif /* _IPV6_H */