inet: add IP_BIND_ADDRESS_NO_PORT to overcome bind(0) limitations

When an application needs to force a source IP on an active TCP socket
it has to use bind(IP, port=x).

As most applications do not want to deal with already used ports, x is
often set to 0, meaning the kernel is in charge to find an available
port.
But kernel does not know yet if this socket is going to be a listener or
be connected.
It has very limited choices (no full knowledge of final 4-tuple for a
connect())

With limited ephemeral port range (about 32K ports), it is very easy to
fill the space.

This patch adds a new SOL_IP socket option, asking kernel to ignore
the 0 port provided by application in bind(IP, port=0) and only
remember the given IP address.

The port will be automatically chosen at connect() time, in a way
that allows sharing a source port as long as the 4-tuples are unique.

This new feature is available for both IPv4 and IPv6 (Thanks Neal)

Tested:

Wrote a test program and checked its behavior on IPv4 and IPv6.

strace(1) shows sequences of bind(IP=127.0.0.2, port=0) followed by
connect().
Also getsockname() show that the port is still 0 right after bind()
but properly allocated after connect().

socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 5
setsockopt(5, SOL_IP, IP_BIND_ADDRESS_NO_PORT, [1], 4) = 0
bind(5, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.2")}, 16) = 0
getsockname(5, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.2")}, [16]) = 0
connect(5, {sa_family=AF_INET, sin_port=htons(53174), sin_addr=inet_addr("127.0.0.3")}, 16) = 0
getsockname(5, {sa_family=AF_INET, sin_port=htons(38050), sin_addr=inet_addr("127.0.0.2")}, [16]) = 0

IPv6 test :

socket(PF_INET6, SOCK_STREAM, IPPROTO_IP) = 7
setsockopt(7, SOL_IP, IP_BIND_ADDRESS_NO_PORT, [1], 4) = 0
bind(7, {sa_family=AF_INET6, sin6_port=htons(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = 0
getsockname(7, {sa_family=AF_INET6, sin6_port=htons(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, [28]) = 0
connect(7, {sa_family=AF_INET6, sin6_port=htons(57300), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = 0
getsockname(7, {sa_family=AF_INET6, sin6_port=htons(60964), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, [28]) = 0

I was able to bind()/connect() a million concurrent IPv4 sockets,
instead of ~32000 before patch.

lpaa23:~# ulimit -n 1000010
lpaa23:~# ./bind --connect --num-flows=1000000 &
1000000 sockets

lpaa23:~# grep TCP /proc/net/sockstat
TCP: inuse 2000063 orphan 0 tw 47 alloc 2000157 mem 66

Check that a given source port is indeed used by many different
connections :

lpaa23:~# ss -t src :40000 | head -10
State      Recv-Q Send-Q   Local Address:Port          Peer Address:Port
ESTAB      0      0           127.0.0.2:40000         127.0.202.33:44983
ESTAB      0      0           127.0.0.2:40000         127.2.27.240:44983
ESTAB      0      0           127.0.0.2:40000           127.2.98.5:44983
ESTAB      0      0           127.0.0.2:40000        127.0.124.196:44983
ESTAB      0      0           127.0.0.2:40000         127.2.139.38:44983
ESTAB      0      0           127.0.0.2:40000          127.1.59.80:44983
ESTAB      0      0           127.0.0.2:40000          127.3.6.228:44983
ESTAB      0      0           127.0.0.2:40000          127.0.38.53:44983
ESTAB      0      0           127.0.0.2:40000         127.1.197.10:44983

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Eric Dumazet 2015-06-06 21:17:57 -07:00 committed by David S. Miller
parent 513d1a1d1c
commit 90c337da15
5 changed files with 13 additions and 2 deletions

View File

@ -187,6 +187,7 @@ struct inet_sock {
transparent:1, transparent:1,
mc_all:1, mc_all:1,
nodefrag:1; nodefrag:1;
__u8 bind_address_no_port:1;
__u8 rcv_tos; __u8 rcv_tos;
__u8 convert_csum; __u8 convert_csum;
int uc_index; int uc_index;

View File

@ -112,6 +112,7 @@ struct in_addr {
#define IP_MINTTL 21 #define IP_MINTTL 21
#define IP_NODEFRAG 22 #define IP_NODEFRAG 22
#define IP_CHECKSUM 23 #define IP_CHECKSUM 23
#define IP_BIND_ADDRESS_NO_PORT 24
/* IP_MTU_DISCOVER values */ /* IP_MTU_DISCOVER values */
#define IP_PMTUDISC_DONT 0 /* Never send DF frames */ #define IP_PMTUDISC_DONT 0 /* Never send DF frames */

View File

@ -488,7 +488,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
inet->inet_saddr = 0; /* Use device */ inet->inet_saddr = 0; /* Use device */
/* Make sure we are allowed to bind here. */ /* Make sure we are allowed to bind here. */
if (sk->sk_prot->get_port(sk, snum)) { if ((snum || !inet->bind_address_no_port) &&
sk->sk_prot->get_port(sk, snum)) {
inet->inet_saddr = inet->inet_rcv_saddr = 0; inet->inet_saddr = inet->inet_rcv_saddr = 0;
err = -EADDRINUSE; err = -EADDRINUSE;
goto out_release_sock; goto out_release_sock;

View File

@ -582,6 +582,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
case IP_TRANSPARENT: case IP_TRANSPARENT:
case IP_MINTTL: case IP_MINTTL:
case IP_NODEFRAG: case IP_NODEFRAG:
case IP_BIND_ADDRESS_NO_PORT:
case IP_UNICAST_IF: case IP_UNICAST_IF:
case IP_MULTICAST_TTL: case IP_MULTICAST_TTL:
case IP_MULTICAST_ALL: case IP_MULTICAST_ALL:
@ -732,6 +733,9 @@ static int do_ip_setsockopt(struct sock *sk, int level,
} }
inet->nodefrag = val ? 1 : 0; inet->nodefrag = val ? 1 : 0;
break; break;
case IP_BIND_ADDRESS_NO_PORT:
inet->bind_address_no_port = val ? 1 : 0;
break;
case IP_MTU_DISCOVER: case IP_MTU_DISCOVER:
if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT) if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
goto e_inval; goto e_inval;
@ -1324,6 +1328,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_NODEFRAG: case IP_NODEFRAG:
val = inet->nodefrag; val = inet->nodefrag;
break; break;
case IP_BIND_ADDRESS_NO_PORT:
val = inet->bind_address_no_port;
break;
case IP_MTU_DISCOVER: case IP_MTU_DISCOVER:
val = inet->pmtudisc; val = inet->pmtudisc;
break; break;

View File

@ -362,7 +362,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
np->saddr = addr->sin6_addr; np->saddr = addr->sin6_addr;
/* Make sure we are allowed to bind here. */ /* Make sure we are allowed to bind here. */
if (sk->sk_prot->get_port(sk, snum)) { if ((snum || !inet->bind_address_no_port) &&
sk->sk_prot->get_port(sk, snum)) {
inet_reset_saddr(sk); inet_reset_saddr(sk);
err = -EADDRINUSE; err = -EADDRINUSE;
goto out; goto out;