2005-12-27 11:43:12 +07:00
|
|
|
/*
|
|
|
|
* INET An implementation of the TCP/IP protocol suite for the LINUX
|
|
|
|
* operating system. INET is implemented using the BSD Socket
|
|
|
|
* interface as the means of communication with the user level.
|
|
|
|
*
|
|
|
|
* Definitions for inet_sock
|
|
|
|
*
|
|
|
|
* Authors: Many, reorganised here by
|
|
|
|
* Arnaldo Carvalho de Melo <acme@mandriva.com>
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public License
|
|
|
|
* as published by the Free Software Foundation; either version
|
|
|
|
* 2 of the License, or (at your option) any later version.
|
|
|
|
*/
|
|
|
|
#ifndef _INET_SOCK_H
|
|
|
|
#define _INET_SOCK_H
|
|
|
|
|
2015-01-06 04:56:15 +07:00
|
|
|
#include <linux/bitops.h>
|
2008-09-09 11:43:12 +07:00
|
|
|
#include <linux/kmemcheck.h>
|
2005-12-27 11:43:12 +07:00
|
|
|
#include <linux/string.h>
|
|
|
|
#include <linux/types.h>
|
2007-03-24 01:40:27 +07:00
|
|
|
#include <linux/jhash.h>
|
rfs: Receive Flow Steering
This patch implements receive flow steering (RFS). RFS steers
received packets for layer 3 and 4 processing to the CPU where
the application for the corresponding flow is running. RFS is an
extension of Receive Packet Steering (RPS).
The basic idea of RFS is that when an application calls recvmsg
(or sendmsg) the application's running CPU is stored in a hash
table that is indexed by the connection's rxhash which is stored in
the socket structure. The rxhash is passed in skb's received on
the connection from netif_receive_skb. For each received packet,
the associated rxhash is used to look up the CPU in the hash table,
if a valid CPU is set then the packet is steered to that CPU using
the RPS mechanisms.
The convolution of the simple approach is that it would potentially
allow OOO packets. If threads are thrashing around CPUs or multiple
threads are trying to read from the same sockets, a quickly changing
CPU value in the hash table could cause rampant OOO packets--
we consider this a non-starter.
To avoid OOO packets, this solution implements two types of hash
tables: rps_sock_flow_table and rps_dev_flow_table.
rps_sock_table is a global hash table. Each entry is just a CPU
number and it is populated in recvmsg and sendmsg as described above.
This table contains the "desired" CPUs for flows.
rps_dev_flow_table is specific to each device queue. Each entry
contains a CPU and a tail queue counter. The CPU is the "current"
CPU for a matching flow. The tail queue counter holds the value
of a tail queue counter for the associated CPU's backlog queue at
the time of last enqueue for a flow matching the entry.
Each backlog queue has a queue head counter which is incremented
on dequeue, and so a queue tail counter is computed as queue head
count + queue length. When a packet is enqueued on a backlog queue,
the current value of the queue tail counter is saved in the hash
entry of the rps_dev_flow_table.
And now the trick: when selecting the CPU for RPS (get_rps_cpu)
the rps_sock_flow table and the rps_dev_flow table for the RX queue
are consulted. When the desired CPU for the flow (found in the
rps_sock_flow table) does not match the current CPU (found in the
rps_dev_flow table), the current CPU is changed to the desired CPU
if one of the following is true:
- The current CPU is unset (equal to RPS_NO_CPU)
- Current CPU is offline
- The current CPU's queue head counter >= queue tail counter in the
rps_dev_flow table. This checks if the queue tail has advanced
beyond the last packet that was enqueued using this table entry.
This guarantees that all packets queued using this entry have been
dequeued, thus preserving in order delivery.
Making each queue have its own rps_dev_flow table has two advantages:
1) the tail queue counters will be written on each receive, so
keeping the table local to interrupting CPU is good for locality. 2)
this allows lockless access to the table-- the CPU number and queue
tail counter need to be accessed together under mutual exclusion
from netif_receive_skb, we assume that this is only called from
device napi_poll which is non-reentrant.
This patch implements RFS for TCP and connected UDP sockets.
It should be usable for other flow oriented protocols.
There are two configuration parameters for RFS. The
"rps_flow_entries" kernel init parameter sets the number of
entries in the rps_sock_flow_table, the per rxqueue sysfs entry
"rps_flow_cnt" contains the number of entries in the rps_dev_flow
table for the rxqueue. Both are rounded to power of two.
The obvious benefit of RFS (over just RPS) is that it achieves
CPU locality between the receive processing for a flow and the
applications processing; this can result in increased performance
(higher pps, lower latency).
The benefits of RFS are dependent on cache hierarchy, application
load, and other factors. On simple benchmarks, we don't necessarily
see improvement and sometimes see degradation. However, for more
complex benchmarks and for applications where cache pressure is
much higher this technique seems to perform very well.
Below are some benchmark results which show the potential benefit of
this patch. The netperf test has 500 instances of netperf TCP_RR
test with 1 byte req. and resp. The RPC test is a request/response
test similar in structure to netperf RR test with 100 threads on
each host, but does more work in userspace than netperf.
e1000e on 8 core Intel
No RFS or RPS 104K tps at 30% CPU
No RFS (best RPS config): 290K tps at 63% CPU
RFS 303K tps at 61% CPU
RPC test tps CPU% 50/90/99% usec latency Latency StdDev
No RFS/RPS 103K 48% 757/900/3185 4472.35
RPS only: 174K 73% 415/993/2468 491.66
RFS 223K 73% 379/651/1382 315.61
Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-04-17 06:01:27 +07:00
|
|
|
#include <linux/netdevice.h>
|
2005-12-27 11:43:12 +07:00
|
|
|
|
|
|
|
#include <net/flow.h>
|
|
|
|
#include <net/sock.h>
|
|
|
|
#include <net/request_sock.h>
|
2008-06-17 07:14:11 +07:00
|
|
|
#include <net/netns/hash.h>
|
2015-03-13 06:44:05 +07:00
|
|
|
#include <net/tcp_states.h>
|
2005-12-27 11:43:12 +07:00
|
|
|
|
|
|
|
/** struct ip_options - IP Options
|
|
|
|
*
|
|
|
|
* @faddr - Saved first hop address
|
2011-11-23 06:33:10 +07:00
|
|
|
* @nexthop - Saved nexthop address in LSRR and SSRR
|
2005-12-27 11:43:12 +07:00
|
|
|
* @is_strictroute - Strict source route
|
|
|
|
* @srr_is_hit - Packet destination addr was our one
|
|
|
|
* @is_changed - IP checksum more not valid
|
|
|
|
* @rr_needaddr - Need to record addr of outgoing dev
|
|
|
|
* @ts_needtime - Need to record timestamp
|
|
|
|
* @ts_needaddr - Need to record addr of outgoing dev
|
|
|
|
*/
|
|
|
|
struct ip_options {
|
2006-09-28 08:28:07 +07:00
|
|
|
__be32 faddr;
|
2011-11-23 06:33:10 +07:00
|
|
|
__be32 nexthop;
|
2005-12-27 11:43:12 +07:00
|
|
|
unsigned char optlen;
|
|
|
|
unsigned char srr;
|
|
|
|
unsigned char rr;
|
|
|
|
unsigned char ts;
|
2008-03-23 06:35:29 +07:00
|
|
|
unsigned char is_strictroute:1,
|
2005-12-27 11:43:12 +07:00
|
|
|
srr_is_hit:1,
|
|
|
|
is_changed:1,
|
|
|
|
rr_needaddr:1,
|
|
|
|
ts_needtime:1,
|
|
|
|
ts_needaddr:1;
|
|
|
|
unsigned char router_alert;
|
2006-08-04 06:46:20 +07:00
|
|
|
unsigned char cipso;
|
2005-12-27 11:43:12 +07:00
|
|
|
unsigned char __pad2;
|
|
|
|
unsigned char __data[0];
|
|
|
|
};
|
|
|
|
|
2011-04-21 16:45:37 +07:00
|
|
|
struct ip_options_rcu {
|
|
|
|
struct rcu_head rcu;
|
|
|
|
struct ip_options opt;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct ip_options_data {
|
|
|
|
struct ip_options_rcu opt;
|
|
|
|
char data[40];
|
|
|
|
};
|
2005-12-27 11:43:12 +07:00
|
|
|
|
|
|
|
struct inet_request_sock {
|
|
|
|
struct request_sock req;
|
2013-10-10 05:21:29 +07:00
|
|
|
#define ir_loc_addr req.__req_common.skc_rcv_saddr
|
|
|
|
#define ir_rmt_addr req.__req_common.skc_daddr
|
2013-10-10 14:04:37 +07:00
|
|
|
#define ir_num req.__req_common.skc_num
|
2013-10-10 05:21:29 +07:00
|
|
|
#define ir_rmt_port req.__req_common.skc_dport
|
|
|
|
#define ir_v6_rmt_addr req.__req_common.skc_v6_daddr
|
|
|
|
#define ir_v6_loc_addr req.__req_common.skc_v6_rcv_saddr
|
|
|
|
#define ir_iif req.__req_common.skc_bound_dev_if
|
2015-03-12 08:53:14 +07:00
|
|
|
#define ir_cookie req.__req_common.skc_cookie
|
|
|
|
#define ireq_net req.__req_common.skc_net
|
2015-03-13 06:44:05 +07:00
|
|
|
#define ireq_state req.__req_common.skc_state
|
2015-03-13 06:44:10 +07:00
|
|
|
#define ireq_family req.__req_common.skc_family
|
2013-10-10 05:21:29 +07:00
|
|
|
|
2008-09-09 11:43:12 +07:00
|
|
|
kmemcheck_bitfield_begin(flags);
|
|
|
|
u16 snd_wscale : 4,
|
|
|
|
rcv_wscale : 4,
|
2005-12-27 11:43:12 +07:00
|
|
|
tstamp_ok : 1,
|
|
|
|
sack_ok : 1,
|
|
|
|
wscale_ok : 1,
|
|
|
|
ecn_ok : 1,
|
2008-10-01 21:41:00 +07:00
|
|
|
acked : 1,
|
|
|
|
no_srccheck: 1;
|
2008-09-09 11:43:12 +07:00
|
|
|
kmemcheck_bitfield_end(flags);
|
2015-03-17 11:06:18 +07:00
|
|
|
u32 ir_mark;
|
2014-06-25 21:09:52 +07:00
|
|
|
union {
|
|
|
|
struct ip_options_rcu *opt;
|
|
|
|
struct sk_buff *pktopts;
|
|
|
|
};
|
2005-12-27 11:43:12 +07:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Reinterpret a request_sock pointer as the inet_request_sock containing it.
 * The cast relies on @req being the first member of struct inet_request_sock;
 * it also drops the const qualifier of @sk.
 */
static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
{
	return (struct inet_request_sock *)sk;
}
|
|
|
|
|
2015-03-17 11:06:18 +07:00
|
|
|
static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb)
|
net: support marking accepting TCP sockets
When using mark-based routing, sockets returned from accept()
may need to be marked differently depending on the incoming
connection request.
This is the case, for example, if different socket marks identify
different networks: a listening socket may want to accept
connections from all networks, but each connection should be
marked with the network that the request came in on, so that
subsequent packets are sent on the correct network.
This patch adds a sysctl to mark TCP sockets based on the fwmark
of the incoming SYN packet. If enabled, and an unmarked socket
receives a SYN, then the SYN packet's fwmark is written to the
connection's inet_request_sock, and later written back to the
accepted socket when the connection is established. If the
socket already has a nonzero mark, then the behaviour is the same
as it is today, i.e., the listening socket's fwmark is used.
Black-box tested using user-mode linux:
- IPv4/IPv6 SYN+ACK, FIN, etc. packets are routed based on the
mark of the incoming SYN packet.
- The socket returned by accept() is marked with the mark of the
incoming SYN packet.
- Tested with syncookies=1 and syncookies=2.
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-05-14 00:17:35 +07:00
|
|
|
{
|
2015-03-17 11:06:18 +07:00
|
|
|
if (!sk->sk_mark && sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept)
|
net: support marking accepting TCP sockets
When using mark-based routing, sockets returned from accept()
may need to be marked differently depending on the incoming
connection request.
This is the case, for example, if different socket marks identify
different networks: a listening socket may want to accept
connections from all networks, but each connection should be
marked with the network that the request came in on, so that
subsequent packets are sent on the correct network.
This patch adds a sysctl to mark TCP sockets based on the fwmark
of the incoming SYN packet. If enabled, and an unmarked socket
receives a SYN, then the SYN packet's fwmark is written to the
connection's inet_request_sock, and later written back to the
accepted socket when the connection is established. If the
socket already has a nonzero mark, then the behaviour is the same
as it is today, i.e., the listening socket's fwmark is used.
Black-box tested using user-mode linux:
- IPv4/IPv6 SYN+ACK, FIN, etc. packets are routed based on the
mark of the incoming SYN packet.
- The socket returned by accept() is marked with the mark of the
incoming SYN packet.
- Tested with syncookies=1 and syncookies=2.
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-05-14 00:17:35 +07:00
|
|
|
return skb->mark;
|
2015-03-17 11:06:18 +07:00
|
|
|
|
|
|
|
return sk->sk_mark;
|
net: support marking accepting TCP sockets
When using mark-based routing, sockets returned from accept()
may need to be marked differently depending on the incoming
connection request.
This is the case, for example, if different socket marks identify
different networks: a listening socket may want to accept
connections from all networks, but each connection should be
marked with the network that the request came in on, so that
subsequent packets are sent on the correct network.
This patch adds a sysctl to mark TCP sockets based on the fwmark
of the incoming SYN packet. If enabled, and an unmarked socket
receives a SYN, then the SYN packet's fwmark is written to the
connection's inet_request_sock, and later written back to the
accepted socket when the connection is established. If the
socket already has a nonzero mark, then the behaviour is the same
as it is today, i.e., the listening socket's fwmark is used.
Black-box tested using user-mode linux:
- IPv4/IPv6 SYN+ACK, FIN, etc. packets are routed based on the
mark of the incoming SYN packet.
- The socket returned by accept() is marked with the mark of the
incoming SYN packet.
- Tested with syncookies=1 and syncookies=2.
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-05-14 00:17:35 +07:00
|
|
|
}
|
|
|
|
|
2011-03-01 09:36:47 +07:00
|
|
|
struct inet_cork {
|
|
|
|
unsigned int flags;
|
2011-05-07 05:02:07 +07:00
|
|
|
__be32 addr;
|
2011-03-01 09:36:47 +07:00
|
|
|
struct ip_options *opt;
|
2011-05-07 05:02:07 +07:00
|
|
|
unsigned int fragsize;
|
2011-03-01 09:36:47 +07:00
|
|
|
int length; /* Total length of all frames */
|
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-24 06:04:42 +07:00
|
|
|
struct dst_entry *dst;
|
2011-03-01 09:36:47 +07:00
|
|
|
u8 tx_flags;
|
2013-09-24 20:43:09 +07:00
|
|
|
__u8 ttl;
|
|
|
|
__s16 tos;
|
|
|
|
char priority;
|
2011-03-01 09:36:47 +07:00
|
|
|
};
|
|
|
|
|
2011-05-07 05:02:07 +07:00
|
|
|
struct inet_cork_full {
|
|
|
|
struct inet_cork base;
|
|
|
|
struct flowi fl;
|
|
|
|
};
|
|
|
|
|
2005-12-27 11:43:12 +07:00
|
|
|
/* Only used through pointers in this header, so forward declarations suffice. */
struct ip_mc_socklist;
struct ipv6_pinfo;
struct rtable;
|
|
|
|
|
|
|
|
/** struct inet_sock - representation of INET sockets
|
|
|
|
*
|
|
|
|
* @sk - ancestor class
|
|
|
|
* @pinet6 - pointer to IPv6 control block
|
2009-10-15 13:30:45 +07:00
|
|
|
* @inet_daddr - Foreign IPv4 addr
|
|
|
|
* @inet_rcv_saddr - Bound local IPv4 addr
|
|
|
|
* @inet_dport - Destination port
|
|
|
|
* @inet_num - Local port
|
|
|
|
* @inet_saddr - Sending source
|
2005-12-27 11:43:12 +07:00
|
|
|
* @uc_ttl - Unicast TTL
|
2009-10-15 13:30:45 +07:00
|
|
|
* @inet_sport - Source port
|
|
|
|
* @inet_id - ID counter for DF pkts
|
2005-12-27 11:43:12 +07:00
|
|
|
* @tos - TOS
|
|
|
|
* @mc_ttl - Multicasting TTL
|
|
|
|
* @is_icsk - is this an inet_connection_sock?
|
2012-02-08 16:11:07 +07:00
|
|
|
* @uc_index - Unicast outgoing device index
|
2005-12-27 11:43:12 +07:00
|
|
|
* @mc_index - Multicast device index
|
|
|
|
* @mc_list - Group array
|
|
|
|
* @cork - info to build ip hdr on each ip frag while socket is corked
|
|
|
|
*/
|
|
|
|
struct inet_sock {
|
|
|
|
/* sk and pinet6 has to be the first two members of inet_sock */
|
|
|
|
struct sock sk;
|
2011-12-10 16:48:31 +07:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2005-12-27 11:43:12 +07:00
|
|
|
struct ipv6_pinfo *pinet6;
|
|
|
|
#endif
|
|
|
|
/* Socket demultiplex comparisons on incoming packets. */
|
net: optimize INET input path further
Followup of commit b178bb3dfc30 (net: reorder struct sock fields)
Optimize INET input path a bit further, by :
1) moving sk_refcnt close to sk_lock.
This reduces number of dirtied cache lines by one on 64bit arches (and
64 bytes cache line size).
2) moving inet_daddr & inet_rcv_saddr at the beginning of sk
(same cache line than hash / family / bound_dev_if / nulls_node)
This reduces number of accessed cache lines in lookups by one, and dont
increase size of inet and timewait socks.
inet and tw sockets now share same place-holder for these fields.
Before patch :
offsetof(struct sock, sk_refcnt) = 0x10
offsetof(struct sock, sk_lock) = 0x40
offsetof(struct sock, sk_receive_queue) = 0x60
offsetof(struct inet_sock, inet_daddr) = 0x270
offsetof(struct inet_sock, inet_rcv_saddr) = 0x274
After patch :
offsetof(struct sock, sk_refcnt) = 0x44
offsetof(struct sock, sk_lock) = 0x48
offsetof(struct sock, sk_receive_queue) = 0x68
offsetof(struct inet_sock, inet_daddr) = 0x0
offsetof(struct inet_sock, inet_rcv_saddr) = 0x4
compute_score() (udp or tcp) now use a single cache line per ignored
item, instead of two.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-12-01 02:04:07 +07:00
|
|
|
#define inet_daddr sk.__sk_common.skc_daddr
|
|
|
|
#define inet_rcv_saddr sk.__sk_common.skc_rcv_saddr
|
2012-11-30 16:49:27 +07:00
|
|
|
#define inet_dport sk.__sk_common.skc_dport
|
|
|
|
#define inet_num sk.__sk_common.skc_num
|
net: optimize INET input path further
Followup of commit b178bb3dfc30 (net: reorder struct sock fields)
Optimize INET input path a bit further, by :
1) moving sk_refcnt close to sk_lock.
This reduces number of dirtied cache lines by one on 64bit arches (and
64 bytes cache line size).
2) moving inet_daddr & inet_rcv_saddr at the beginning of sk
(same cache line than hash / family / bound_dev_if / nulls_node)
This reduces number of accessed cache lines in lookups by one, and dont
increase size of inet and timewait socks.
inet and tw sockets now share same place-holder for these fields.
Before patch :
offsetof(struct sock, sk_refcnt) = 0x10
offsetof(struct sock, sk_lock) = 0x40
offsetof(struct sock, sk_receive_queue) = 0x60
offsetof(struct inet_sock, inet_daddr) = 0x270
offsetof(struct inet_sock, inet_rcv_saddr) = 0x274
After patch :
offsetof(struct sock, sk_refcnt) = 0x44
offsetof(struct sock, sk_lock) = 0x48
offsetof(struct sock, sk_receive_queue) = 0x68
offsetof(struct inet_sock, inet_daddr) = 0x0
offsetof(struct inet_sock, inet_rcv_saddr) = 0x4
compute_score() (udp or tcp) now use a single cache line per ignored
item, instead of two.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-12-01 02:04:07 +07:00
|
|
|
|
2009-10-15 13:30:45 +07:00
|
|
|
__be32 inet_saddr;
|
2005-12-27 11:43:12 +07:00
|
|
|
__s16 uc_ttl;
|
|
|
|
__u16 cmsg_flags;
|
2009-10-15 13:30:45 +07:00
|
|
|
__be16 inet_sport;
|
|
|
|
__u16 inet_id;
|
2010-01-12 07:28:01 +07:00
|
|
|
|
2011-04-21 16:45:37 +07:00
|
|
|
struct ip_options_rcu __rcu *inet_opt;
|
2012-11-30 16:49:27 +07:00
|
|
|
int rx_dst_ifindex;
|
2005-12-27 11:43:12 +07:00
|
|
|
__u8 tos;
|
2010-01-12 07:28:01 +07:00
|
|
|
__u8 min_ttl;
|
2005-12-27 11:43:12 +07:00
|
|
|
__u8 mc_ttl;
|
|
|
|
__u8 pmtudisc;
|
|
|
|
__u8 recverr:1,
|
|
|
|
is_icsk:1,
|
|
|
|
freebind:1,
|
|
|
|
hdrincl:1,
|
2008-10-01 21:30:02 +07:00
|
|
|
mc_loop:1,
|
2009-05-28 14:00:46 +07:00
|
|
|
transparent:1,
|
2010-06-15 08:07:31 +07:00
|
|
|
mc_all:1,
|
|
|
|
nodefrag:1;
|
inet: add IP_BIND_ADDRESS_NO_PORT to overcome bind(0) limitations
When an application needs to force a source IP on an active TCP socket
it has to use bind(IP, port=x).
As most applications do not want to deal with already used ports, x is
often set to 0, meaning the kernel is in charge to find an available
port.
But kernel does not know yet if this socket is going to be a listener or
be connected.
It has very limited choices (no full knowledge of final 4-tuple for a
connect())
With limited ephemeral port range (about 32K ports), it is very easy to
fill the space.
This patch adds a new SOL_IP socket option, asking kernel to ignore
the 0 port provided by application in bind(IP, port=0) and only
remember the given IP address.
The port will be automatically chosen at connect() time, in a way
that allows sharing a source port as long as the 4-tuples are unique.
This new feature is available for both IPv4 and IPv6 (Thanks Neal)
Tested:
Wrote a test program and checked its behavior on IPv4 and IPv6.
strace(1) shows sequences of bind(IP=127.0.0.2, port=0) followed by
connect().
Also getsockname() show that the port is still 0 right after bind()
but properly allocated after connect().
socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 5
setsockopt(5, SOL_IP, IP_BIND_ADDRESS_NO_PORT, [1], 4) = 0
bind(5, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.2")}, 16) = 0
getsockname(5, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.2")}, [16]) = 0
connect(5, {sa_family=AF_INET, sin_port=htons(53174), sin_addr=inet_addr("127.0.0.3")}, 16) = 0
getsockname(5, {sa_family=AF_INET, sin_port=htons(38050), sin_addr=inet_addr("127.0.0.2")}, [16]) = 0
IPv6 test :
socket(PF_INET6, SOCK_STREAM, IPPROTO_IP) = 7
setsockopt(7, SOL_IP, IP_BIND_ADDRESS_NO_PORT, [1], 4) = 0
bind(7, {sa_family=AF_INET6, sin6_port=htons(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = 0
getsockname(7, {sa_family=AF_INET6, sin6_port=htons(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, [28]) = 0
connect(7, {sa_family=AF_INET6, sin6_port=htons(57300), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = 0
getsockname(7, {sa_family=AF_INET6, sin6_port=htons(60964), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, [28]) = 0
I was able to bind()/connect() a million concurrent IPv4 sockets,
instead of ~32000 before patch.
lpaa23:~# ulimit -n 1000010
lpaa23:~# ./bind --connect --num-flows=1000000 &
1000000 sockets
lpaa23:~# grep TCP /proc/net/sockstat
TCP: inuse 2000063 orphan 0 tw 47 alloc 2000157 mem 66
Check that a given source port is indeed used by many different
connections :
lpaa23:~# ss -t src :40000 | head -10
State Recv-Q Send-Q Local Address:Port Peer Address:Port
ESTAB 0 0 127.0.0.2:40000 127.0.202.33:44983
ESTAB 0 0 127.0.0.2:40000 127.2.27.240:44983
ESTAB 0 0 127.0.0.2:40000 127.2.98.5:44983
ESTAB 0 0 127.0.0.2:40000 127.0.124.196:44983
ESTAB 0 0 127.0.0.2:40000 127.2.139.38:44983
ESTAB 0 0 127.0.0.2:40000 127.1.59.80:44983
ESTAB 0 0 127.0.0.2:40000 127.3.6.228:44983
ESTAB 0 0 127.0.0.2:40000 127.0.38.53:44983
ESTAB 0 0 127.0.0.2:40000 127.1.197.10:44983
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-06-07 11:17:57 +07:00
|
|
|
__u8 bind_address_no_port:1;
|
2012-02-09 16:35:49 +07:00
|
|
|
__u8 rcv_tos;
|
2015-01-06 04:56:14 +07:00
|
|
|
__u8 convert_csum;
|
2012-02-08 16:11:07 +07:00
|
|
|
int uc_index;
|
2005-12-27 11:43:12 +07:00
|
|
|
int mc_index;
|
2006-09-27 11:27:35 +07:00
|
|
|
__be32 mc_addr;
|
2010-11-12 12:46:50 +07:00
|
|
|
struct ip_mc_socklist __rcu *mc_list;
|
2011-05-07 05:02:07 +07:00
|
|
|
struct inet_cork_full cork;
|
2005-12-27 11:43:12 +07:00
|
|
|
};
|
|
|
|
|
|
|
|
#define IPCORK_OPT	1	/* ip-options has been held in ipcork.opt */
#define IPCORK_ALLFRAG	2	/* always fragment (for ipv6 for now) */
|
|
|
|
|
2015-01-06 04:56:15 +07:00
|
|
|
/*
 * cmsg flags for inet: one distinct bit each, bits 0 through 7.
 * NOTE(review): presumably stored in inet_sock::cmsg_flags; confirm
 * against the users of these flags.
 */
#define IP_CMSG_PKTINFO		BIT(0)
#define IP_CMSG_TTL		BIT(1)
#define IP_CMSG_TOS		BIT(2)
#define IP_CMSG_RECVOPTS	BIT(3)
#define IP_CMSG_RETOPTS		BIT(4)
#define IP_CMSG_PASSSEC		BIT(5)
#define IP_CMSG_ORIGDSTADDR	BIT(6)
#define IP_CMSG_CHECKSUM	BIT(7)
|
2015-01-06 04:56:15 +07:00
|
|
|
|
2015-11-09 01:54:07 +07:00
|
|
|
/* SYNACK messages might be attached to request sockets.
|
|
|
|
* Some places want to reach the listener in this case.
|
|
|
|
*/
|
|
|
|
static inline struct sock *skb_to_full_sk(const struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
struct sock *sk = skb->sk;
|
|
|
|
|
|
|
|
if (sk && sk->sk_state == TCP_NEW_SYN_RECV)
|
|
|
|
sk = inet_reqsk(sk)->rsk_listener;
|
|
|
|
return sk;
|
|
|
|
}
|
|
|
|
|
2005-12-27 11:43:12 +07:00
|
|
|
/*
 * Reinterpret a struct sock pointer as the inet_sock that embeds it.
 * The cast relies on @sk being the first member of struct inet_sock;
 * it also drops the const qualifier.
 */
static inline struct inet_sock *inet_sk(const struct sock *sk)
{
	return (struct inet_sock *)sk;
}
|
|
|
|
|
|
|
|
static inline void __inet_sk_copy_descendant(struct sock *sk_to,
|
|
|
|
const struct sock *sk_from,
|
|
|
|
const int ancestor_size)
|
|
|
|
{
|
|
|
|
memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1,
|
|
|
|
sk_from->sk_prot->obj_size - ancestor_size);
|
|
|
|
}
|
2011-12-10 16:48:31 +07:00
|
|
|
#if !(IS_ENABLED(CONFIG_IPV6))
/*
 * Variant built only when CONFIG_IPV6 is disabled: copies everything past
 * the inet_sock, using sizeof(struct inet_sock) as the ancestor size.
 */
static inline void inet_sk_copy_descendant(struct sock *sk_to,
					   const struct sock *sk_from)
{
	__inet_sk_copy_descendant(sk_to, sk_from, sizeof(struct inet_sock));
}
#endif
|
|
|
|
|
2013-09-22 00:22:41 +07:00
|
|
|
/* Prototype only; the definition is not in this header. NOTE(review): return-value convention not visible here - confirm at the definition. */
int inet_sk_rebuild_header(struct sock *sk);
|
2005-12-27 11:43:12 +07:00
|
|
|
|
2013-10-20 02:48:51 +07:00
|
|
|
static inline unsigned int __inet_ehashfn(const __be32 laddr,
|
|
|
|
const __u16 lport,
|
|
|
|
const __be32 faddr,
|
|
|
|
const __be16 fport,
|
|
|
|
u32 initval)
|
2005-12-27 11:43:12 +07:00
|
|
|
{
|
2008-03-05 05:28:41 +07:00
|
|
|
return jhash_3words((__force __u32) laddr,
|
|
|
|
(__force __u32) faddr,
|
2007-03-24 01:40:27 +07:00
|
|
|
((__u32) lport) << 16 | (__force __u32)fport,
|
2013-10-20 02:48:51 +07:00
|
|
|
initval);
|
2005-12-27 11:43:12 +07:00
|
|
|
}
|
|
|
|
|
2015-03-18 08:32:27 +07:00
|
|
|
/* Prototype only; the definition is not in this header. */
struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
				      struct sock *sk_listener,
				      bool attach_listener);
|
2008-06-11 02:39:35 +07:00
|
|
|
|
2008-10-01 21:41:00 +07:00
|
|
|
static inline __u8 inet_sk_flowi_flags(const struct sock *sk)
|
|
|
|
{
|
2011-01-28 13:01:53 +07:00
|
|
|
__u8 flags = 0;
|
|
|
|
|
2011-08-07 16:16:09 +07:00
|
|
|
if (inet_sk(sk)->transparent || inet_sk(sk)->hdrincl)
|
2011-01-28 13:01:53 +07:00
|
|
|
flags |= FLOWI_FLAG_ANYSRC;
|
|
|
|
return flags;
|
2008-10-01 21:41:00 +07:00
|
|
|
}
|
|
|
|
|
2015-01-06 04:56:14 +07:00
|
|
|
static inline void inet_inc_convert_csum(struct sock *sk)
|
|
|
|
{
|
|
|
|
inet_sk(sk)->convert_csum++;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void inet_dec_convert_csum(struct sock *sk)
|
|
|
|
{
|
|
|
|
if (inet_sk(sk)->convert_csum > 0)
|
|
|
|
inet_sk(sk)->convert_csum--;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool inet_get_convert_csum(struct sock *sk)
|
|
|
|
{
|
|
|
|
return !!inet_sk(sk)->convert_csum;
|
|
|
|
}
|
|
|
|
|
2005-12-27 11:43:12 +07:00
|
|
|
#endif /* _INET_SOCK_H */
|