2005-08-12 19:26:18 +07:00
|
|
|
/*
|
|
|
|
* INET An implementation of the TCP/IP protocol suite for the LINUX
|
|
|
|
* operating system. INET is implemented using the BSD Socket
|
|
|
|
* interface as the means of communication with the user level.
|
|
|
|
*
|
|
|
|
* Generic INET6 transport hashtables
|
|
|
|
*
|
2005-12-14 14:25:44 +07:00
|
|
|
* Authors: Lotsa people, from code originally in tcp, generalised here
|
2014-08-25 03:53:10 +07:00
|
|
|
* by Arnaldo Carvalho de Melo <acme@mandriva.com>
|
2005-08-12 19:26:18 +07:00
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public License
|
|
|
|
* as published by the Free Software Foundation; either version
|
|
|
|
* 2 of the License, or (at your option) any later version.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/module.h>
|
2005-12-14 14:25:44 +07:00
|
|
|
#include <linux/random.h>
|
2005-08-12 19:26:18 +07:00
|
|
|
|
2016-02-10 23:50:40 +07:00
|
|
|
#include <net/addrconf.h>
|
2005-08-12 19:26:18 +07:00
|
|
|
#include <net/inet_connection_sock.h>
|
|
|
|
#include <net/inet_hashtables.h>
|
|
|
|
#include <net/inet6_hashtables.h>
|
2011-08-04 10:50:44 +07:00
|
|
|
#include <net/secure_seq.h>
|
2005-12-14 14:25:44 +07:00
|
|
|
#include <net/ip.h>
|
2016-02-10 23:50:40 +07:00
|
|
|
#include <net/sock_reuseport.h>
|
2005-08-12 19:26:18 +07:00
|
|
|
|
2015-03-19 04:05:35 +07:00
|
|
|
u32 inet6_ehashfn(const struct net *net,
|
|
|
|
const struct in6_addr *laddr, const u16 lport,
|
|
|
|
const struct in6_addr *faddr, const __be16 fport)
|
2013-10-20 02:48:52 +07:00
|
|
|
{
|
2013-10-20 02:48:57 +07:00
|
|
|
static u32 inet6_ehash_secret __read_mostly;
|
|
|
|
static u32 ipv6_hash_secret __read_mostly;
|
|
|
|
|
|
|
|
u32 lhash, fhash;
|
|
|
|
|
|
|
|
net_get_random_once(&inet6_ehash_secret, sizeof(inet6_ehash_secret));
|
|
|
|
net_get_random_once(&ipv6_hash_secret, sizeof(ipv6_hash_secret));
|
|
|
|
|
|
|
|
lhash = (__force u32)laddr->s6_addr32[3];
|
|
|
|
fhash = __ipv6_addr_jhash(faddr, ipv6_hash_secret);
|
|
|
|
|
2013-10-20 02:48:52 +07:00
|
|
|
return __inet6_ehashfn(lhash, lport, fhash, fport,
|
2013-10-20 02:48:57 +07:00
|
|
|
inet6_ehash_secret + net_hash_mix(net));
|
2013-10-20 02:48:52 +07:00
|
|
|
}
|
|
|
|
|
2006-04-10 12:48:59 +07:00
|
|
|
/*
|
|
|
|
* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
|
|
|
|
* we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
|
|
|
|
*
|
|
|
|
* The sockhash lock must be held as a reader here.
|
|
|
|
*/
|
2008-01-31 20:07:21 +07:00
|
|
|
struct sock *__inet6_lookup_established(struct net *net,
|
|
|
|
struct inet_hashinfo *hashinfo,
|
2006-04-10 12:48:59 +07:00
|
|
|
const struct in6_addr *saddr,
|
2006-11-08 15:20:00 +07:00
|
|
|
const __be16 sport,
|
2006-04-10 12:48:59 +07:00
|
|
|
const struct in6_addr *daddr,
|
|
|
|
const u16 hnum,
|
|
|
|
const int dif)
|
|
|
|
{
|
|
|
|
struct sock *sk;
|
2008-11-17 10:40:17 +07:00
|
|
|
const struct hlist_nulls_node *node;
|
2006-09-28 08:43:07 +07:00
|
|
|
const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
|
2006-04-10 12:48:59 +07:00
|
|
|
/* Optimize here for direct hit, only listening connections can
|
|
|
|
* have wildcards anyways.
|
|
|
|
*/
|
2008-06-17 07:13:48 +07:00
|
|
|
unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport);
|
2009-10-09 07:16:19 +07:00
|
|
|
unsigned int slot = hash & hashinfo->ehash_mask;
|
2008-11-17 10:40:17 +07:00
|
|
|
struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
|
2006-04-10 12:48:59 +07:00
|
|
|
|
2008-11-17 10:40:17 +07:00
|
|
|
|
|
|
|
begin:
|
|
|
|
sk_nulls_for_each_rcu(sk, node, &head->chain) {
|
2012-11-30 16:49:27 +07:00
|
|
|
if (sk->sk_hash != hash)
|
|
|
|
continue;
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-04 05:42:29 +07:00
|
|
|
if (!INET6_MATCH(sk, net, saddr, daddr, ports, dif))
|
|
|
|
continue;
|
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 14:22:02 +07:00
|
|
|
if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
|
|
|
|
goto out;
|
|
|
|
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-04 05:42:29 +07:00
|
|
|
if (unlikely(!INET6_MATCH(sk, net, saddr, daddr, ports, dif))) {
|
|
|
|
sock_gen_put(sk);
|
|
|
|
goto begin;
|
2008-11-17 10:40:17 +07:00
|
|
|
}
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-04 05:42:29 +07:00
|
|
|
goto found;
|
2006-04-10 12:48:59 +07:00
|
|
|
}
|
2008-11-17 10:40:17 +07:00
|
|
|
if (get_nulls_value(node) != slot)
|
|
|
|
goto begin;
|
|
|
|
out:
|
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 14:22:02 +07:00
|
|
|
sk = NULL;
|
|
|
|
found:
|
2006-04-10 12:48:59 +07:00
|
|
|
return sk;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(__inet6_lookup_established);
|
|
|
|
|
2011-01-17 06:09:38 +07:00
|
|
|
static inline int compute_score(struct sock *sk, struct net *net,
|
2008-11-24 08:22:55 +07:00
|
|
|
const unsigned short hnum,
|
|
|
|
const struct in6_addr *daddr,
|
|
|
|
const int dif)
|
|
|
|
{
|
|
|
|
int score = -1;
|
|
|
|
|
2009-10-15 13:30:45 +07:00
|
|
|
if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum &&
|
2008-11-24 08:22:55 +07:00
|
|
|
sk->sk_family == PF_INET6) {
|
|
|
|
|
|
|
|
score = 1;
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-04 05:42:29 +07:00
|
|
|
if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
|
|
|
|
if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
|
2008-11-24 08:22:55 +07:00
|
|
|
return -1;
|
|
|
|
score++;
|
|
|
|
}
|
|
|
|
if (sk->sk_bound_dev_if) {
|
|
|
|
if (sk->sk_bound_dev_if != dif)
|
|
|
|
return -1;
|
|
|
|
score++;
|
|
|
|
}
|
2015-10-09 09:33:21 +07:00
|
|
|
if (sk->sk_incoming_cpu == raw_smp_processor_id())
|
|
|
|
score++;
|
2008-11-24 08:22:55 +07:00
|
|
|
}
|
|
|
|
return score;
|
|
|
|
}
|
|
|
|
|
2016-04-01 22:52:17 +07:00
|
|
|
/* called with rcu_read_lock() */
|
2008-01-31 20:07:21 +07:00
|
|
|
struct sock *inet6_lookup_listener(struct net *net,
|
2016-02-10 23:50:38 +07:00
|
|
|
struct inet_hashinfo *hashinfo,
|
|
|
|
struct sk_buff *skb, int doff,
|
|
|
|
const struct in6_addr *saddr,
|
2013-01-22 16:50:39 +07:00
|
|
|
const __be16 sport, const struct in6_addr *daddr,
|
2008-01-31 20:07:21 +07:00
|
|
|
const unsigned short hnum, const int dif)
|
2005-08-12 19:26:18 +07:00
|
|
|
{
|
2008-11-24 08:22:55 +07:00
|
|
|
unsigned int hash = inet_lhashfn(net, hnum);
|
|
|
|
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
|
2016-04-01 22:52:17 +07:00
|
|
|
int score, hiscore = 0, matches = 0, reuseport = 0;
|
|
|
|
struct sock *sk, *result = NULL;
|
|
|
|
u32 phash = 0;
|
2008-11-24 08:22:55 +07:00
|
|
|
|
2016-04-01 22:52:17 +07:00
|
|
|
sk_for_each(sk, &ilb->head) {
|
2008-11-24 08:22:55 +07:00
|
|
|
score = compute_score(sk, net, hnum, daddr, dif);
|
|
|
|
if (score > hiscore) {
|
2016-04-09 22:01:13 +07:00
|
|
|
reuseport = sk->sk_reuseport;
|
2013-01-22 16:50:39 +07:00
|
|
|
if (reuseport) {
|
|
|
|
phash = inet6_ehashfn(net, daddr, hnum,
|
|
|
|
saddr, sport);
|
2016-04-01 22:52:17 +07:00
|
|
|
result = reuseport_select_sock(sk, phash,
|
|
|
|
skb, doff);
|
|
|
|
if (result)
|
|
|
|
return result;
|
2013-01-22 16:50:39 +07:00
|
|
|
matches = 1;
|
|
|
|
}
|
2016-04-01 22:52:17 +07:00
|
|
|
result = sk;
|
2016-04-09 22:01:13 +07:00
|
|
|
hiscore = score;
|
2013-01-22 16:50:39 +07:00
|
|
|
} else if (score == hiscore && reuseport) {
|
|
|
|
matches++;
|
2014-08-24 01:58:54 +07:00
|
|
|
if (reciprocal_scale(phash, matches) == 0)
|
2013-01-22 16:50:39 +07:00
|
|
|
result = sk;
|
|
|
|
phash = next_pseudo_random32(phash);
|
2005-08-12 19:26:18 +07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(inet6_lookup_listener);
|
|
|
|
|
2008-01-31 20:07:21 +07:00
|
|
|
struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
|
2016-02-10 23:50:38 +07:00
|
|
|
struct sk_buff *skb, int doff,
|
2006-11-08 15:20:00 +07:00
|
|
|
const struct in6_addr *saddr, const __be16 sport,
|
|
|
|
const struct in6_addr *daddr, const __be16 dport,
|
2005-08-12 19:26:18 +07:00
|
|
|
const int dif)
|
|
|
|
{
|
|
|
|
struct sock *sk;
|
2016-04-01 22:52:17 +07:00
|
|
|
bool refcounted;
|
2005-08-12 19:26:18 +07:00
|
|
|
|
2016-02-10 23:50:38 +07:00
|
|
|
sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr,
|
2016-04-01 22:52:17 +07:00
|
|
|
ntohs(dport), dif, &refcounted);
|
|
|
|
if (sk && !refcounted && !atomic_inc_not_zero(&sk->sk_refcnt))
|
|
|
|
sk = NULL;
|
2005-08-12 19:26:18 +07:00
|
|
|
return sk;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(inet6_lookup);
|
2005-12-14 14:25:44 +07:00
|
|
|
|
|
|
|
static int __inet6_check_established(struct inet_timewait_death_row *death_row,
|
|
|
|
struct sock *sk, const __u16 lport,
|
|
|
|
struct inet_timewait_sock **twp)
|
|
|
|
{
|
|
|
|
struct inet_hashinfo *hinfo = death_row->hashinfo;
|
2006-03-14 05:26:12 +07:00
|
|
|
struct inet_sock *inet = inet_sk(sk);
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-04 05:42:29 +07:00
|
|
|
const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr;
|
|
|
|
const struct in6_addr *saddr = &sk->sk_v6_daddr;
|
2005-12-14 14:25:44 +07:00
|
|
|
const int dif = sk->sk_bound_dev_if;
|
2009-10-15 13:30:45 +07:00
|
|
|
const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
|
2008-06-17 07:13:48 +07:00
|
|
|
struct net *net = sock_net(sk);
|
|
|
|
const unsigned int hash = inet6_ehashfn(net, daddr, lport, saddr,
|
2009-10-15 13:30:45 +07:00
|
|
|
inet->inet_dport);
|
2005-12-14 14:25:44 +07:00
|
|
|
struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
|
2008-11-21 11:39:09 +07:00
|
|
|
spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
|
2005-12-14 14:25:44 +07:00
|
|
|
struct sock *sk2;
|
2008-11-17 10:40:17 +07:00
|
|
|
const struct hlist_nulls_node *node;
|
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 14:22:02 +07:00
|
|
|
struct inet_timewait_sock *tw = NULL;
|
2005-12-14 14:25:44 +07:00
|
|
|
|
2008-11-21 11:39:09 +07:00
|
|
|
spin_lock(lock);
|
2005-12-14 14:25:44 +07:00
|
|
|
|
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 14:22:02 +07:00
|
|
|
sk_nulls_for_each(sk2, node, &head->chain) {
|
2012-11-30 16:49:27 +07:00
|
|
|
if (sk2->sk_hash != hash)
|
|
|
|
continue;
|
2005-12-14 14:25:44 +07:00
|
|
|
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-04 05:42:29 +07:00
|
|
|
if (likely(INET6_MATCH(sk2, net, saddr, daddr, ports, dif))) {
|
|
|
|
if (sk2->sk_state == TCP_TIME_WAIT) {
|
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 14:22:02 +07:00
|
|
|
tw = inet_twsk(sk2);
|
|
|
|
if (twsk_unique(sk, sk2, twp))
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-04 05:42:29 +07:00
|
|
|
break;
|
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 14:22:02 +07:00
|
|
|
}
|
2005-12-14 14:25:44 +07:00
|
|
|
goto not_unique;
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-04 05:42:29 +07:00
|
|
|
}
|
2005-12-14 14:25:44 +07:00
|
|
|
}
|
|
|
|
|
2006-03-14 05:26:12 +07:00
|
|
|
/* Must record num and sport now. Otherwise we will see
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-04 05:42:29 +07:00
|
|
|
* in hash table socket with a funny identity.
|
|
|
|
*/
|
2009-10-15 13:30:45 +07:00
|
|
|
inet->inet_num = lport;
|
|
|
|
inet->inet_sport = htons(lport);
|
2009-12-03 05:31:19 +07:00
|
|
|
sk->sk_hash = hash;
|
2008-07-26 11:43:18 +07:00
|
|
|
WARN_ON(!sk_unhashed(sk));
|
2008-11-17 10:40:17 +07:00
|
|
|
__sk_nulls_add_node_rcu(sk, &head->chain);
|
2009-12-03 05:31:19 +07:00
|
|
|
if (tw) {
|
2015-07-09 04:28:29 +07:00
|
|
|
sk_nulls_del_node_init_rcu((struct sock *)tw);
|
2016-04-28 06:44:39 +07:00
|
|
|
__NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
|
2009-12-03 05:31:19 +07:00
|
|
|
}
|
2008-11-21 11:39:09 +07:00
|
|
|
spin_unlock(lock);
|
2008-04-01 09:41:46 +07:00
|
|
|
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
|
2005-12-14 14:25:44 +07:00
|
|
|
|
2009-12-03 05:31:19 +07:00
|
|
|
if (twp) {
|
2005-12-14 14:25:44 +07:00
|
|
|
*twp = tw;
|
2009-12-03 05:31:19 +07:00
|
|
|
} else if (tw) {
|
2005-12-14 14:25:44 +07:00
|
|
|
/* Silly. Should hash-dance instead... */
|
2015-07-09 04:28:30 +07:00
|
|
|
inet_twsk_deschedule_put(tw);
|
2005-12-14 14:25:44 +07:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
not_unique:
|
2008-11-21 11:39:09 +07:00
|
|
|
spin_unlock(lock);
|
2005-12-14 14:25:44 +07:00
|
|
|
return -EADDRNOTAVAIL;
|
|
|
|
}
|
|
|
|
|
2015-05-28 00:46:02 +07:00
|
|
|
static u32 inet6_sk_port_offset(const struct sock *sk)
|
2005-12-14 14:25:44 +07:00
|
|
|
{
|
|
|
|
const struct inet_sock *inet = inet_sk(sk);
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-04 05:42:29 +07:00
|
|
|
|
|
|
|
return secure_ipv6_port_ephemeral(sk->sk_v6_rcv_saddr.s6_addr32,
|
|
|
|
sk->sk_v6_daddr.s6_addr32,
|
2009-10-15 13:30:45 +07:00
|
|
|
inet->inet_dport);
|
2005-12-14 14:25:44 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
int inet6_hash_connect(struct inet_timewait_death_row *death_row,
|
|
|
|
struct sock *sk)
|
|
|
|
{
|
2015-05-28 00:46:02 +07:00
|
|
|
u32 port_offset = 0;
|
|
|
|
|
|
|
|
if (!inet_sk(sk)->inet_num)
|
|
|
|
port_offset = inet6_sk_port_offset(sk);
|
|
|
|
return __inet_hash_connect(death_row, sk, port_offset,
|
2015-03-19 04:05:37 +07:00
|
|
|
__inet6_check_established);
|
2005-12-14 14:25:44 +07:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(inet6_hash_connect);
|
2016-02-10 23:50:36 +07:00
|
|
|
|
|
|
|
int inet6_hash(struct sock *sk)
|
|
|
|
{
|
|
|
|
if (sk->sk_state != TCP_CLOSE) {
|
|
|
|
local_bh_disable();
|
2016-02-10 23:50:40 +07:00
|
|
|
__inet_hash(sk, NULL, ipv6_rcv_saddr_equal);
|
2016-02-10 23:50:36 +07:00
|
|
|
local_bh_enable();
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(inet6_hash);
|
|
|
|
|
|
|
|
/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
|
|
|
|
* only, and any IPv4 addresses if not IPv6 only
|
|
|
|
* match_wildcard == false: addresses must be exactly the same, i.e.
|
|
|
|
* IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
|
|
|
|
* and 0.0.0.0 equals to 0.0.0.0 only
|
|
|
|
*/
|
|
|
|
int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
|
|
|
|
bool match_wildcard)
|
|
|
|
{
|
|
|
|
const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
|
|
|
|
int sk2_ipv6only = inet_v6_ipv6only(sk2);
|
|
|
|
int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
|
|
|
|
int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
|
|
|
|
|
|
|
|
/* if both are mapped, treat as IPv4 */
|
|
|
|
if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
|
|
|
|
if (!sk2_ipv6only) {
|
|
|
|
if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr)
|
|
|
|
return 1;
|
|
|
|
if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr)
|
|
|
|
return match_wildcard;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
|
|
|
|
!(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
|
|
|
|
!(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED))
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
if (sk2_rcv_saddr6 &&
|
|
|
|
ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6))
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(ipv6_rcv_saddr_equal);
|