linux_dsm_epyc7002/net/ipv6/anycast.c

543 lines
11 KiB
C
Raw Normal View History

/*
* Anycast support for IPv6
* Linux INET6 implementation
*
* Authors:
* David L Stevens (dlstevens@us.ibm.com)
*
* based heavily on net/ipv6/mcast.c
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/capability.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/random.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/route.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 15:04:11 +07:00
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/protocol.h>
#include <net/if_inet6.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/ip6_route.h>
#include <net/checksum.h>
static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr);
/*
* socket join an anycast group
*/
int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct net_device *dev = NULL;
struct inet6_dev *idev;
struct ipv6_ac_socklist *pac;
struct net *net = sock_net(sk);
int ishost = !net->ipv6.devconf_all->forwarding;
int err = 0;
ASSERT_RTNL();
net: Allow userns root to control ipv6 Allow an unpriviled user who has created a user namespace, and then created a network namespace to effectively use the new network namespace, by reducing capable(CAP_NET_ADMIN) and capable(CAP_NET_RAW) calls to be ns_capable(net->user_ns, CAP_NET_ADMIN), or capable(net->user_ns, CAP_NET_RAW) calls. Settings that merely control a single network device are allowed. Either the network device is a logical network device where restrictions make no difference or the network device is hardware NIC that has been explicity moved from the initial network namespace. In general policy and network stack state changes are allowed while resource control is left unchanged. Allow the SIOCSIFADDR ioctl to add ipv6 addresses. Allow the SIOCDIFADDR ioctl to delete ipv6 addresses. Allow the SIOCADDRT ioctl to add ipv6 routes. Allow the SIOCDELRT ioctl to delete ipv6 routes. Allow creation of ipv6 raw sockets. Allow setting the IPV6_JOIN_ANYCAST socket option. Allow setting the IPV6_FL_A_RENEW parameter of the IPV6_FLOWLABEL_MGR socket option. Allow setting the IPV6_TRANSPARENT socket option. Allow setting the IPV6_HOPOPTS socket option. Allow setting the IPV6_RTHDRDSTOPTS socket option. Allow setting the IPV6_DSTOPTS socket option. Allow setting the IPV6_IPSEC_POLICY socket option. Allow setting the IPV6_XFRM_POLICY socket option. Allow sending packets with the IPV6_2292HOPOPTS control message. Allow sending packets with the IPV6_2292DSTOPTS control message. Allow sending packets with the IPV6_RTHDRDSTOPTS control message. Allow setting the multicast routing socket options on non multicast routing sockets. Allow the SIOCADDTUNNEL, SIOCCHGTUNNEL, and SIOCDELTUNNEL ioctls for setting up, changing and deleting tunnels over ipv6. Allow the SIOCADDTUNNEL, SIOCCHGTUNNEL, SIOCDELTUNNEL ioctls for setting up, changing and deleting ipv6 over ipv4 tunnels. Allow the SIOCADDPRL, SIOCDELPRL, SIOCCHGPRL ioctls for adding, deleting, and changing the potential router list for ISATAP tunnels. Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2012-11-16 10:03:06 +07:00
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
if (ipv6_addr_is_multicast(addr))
return -EINVAL;
net/ipv6: Change address check to always take a device argument ipv6_chk_addr_and_flags determines if an address is a local address and optionally if it is an address on a specific device. For example, it is called by ip6_route_info_create to determine if a given gateway address is a local address. The address check currently does not consider L3 domains and as a result does not allow a route to be added in one VRF if the nexthop points to an address in a second VRF. e.g., $ ip route add 2001:db8:1::/64 vrf r2 via 2001:db8:102::23 Error: Invalid gateway address. where 2001:db8:102::23 is an address on an interface in vrf r1. ipv6_chk_addr_and_flags needs to allow callers to always pass in a device with a separate argument to not limit the address to the specific device. The device is used used to determine the L3 domain of interest. To that end add an argument to skip the device check and update callers to always pass a device where possible and use the new argument to mean any address in the domain. Update a handful of users of ipv6_chk_addr with a NULL dev argument. This patch handles the change to these callers without adding the domain check. ip6_validate_gw needs to handle 2 cases - one where the device is given as part of the nexthop spec and the other where the device is resolved. There is at least 1 VRF case where deferring the check to only after the route lookup has resolved the device fails with an unintuitive error "RTNETLINK answers: No route to host" as opposed to the preferred "Error: Gateway can not be a local address." The 'no route to host' error is because of the fallback to a full lookup. The check is done twice to avoid this error. Signed-off-by: David Ahern <dsahern@gmail.com> Reviewed-by: Ido Schimmel <idosch@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-03-13 22:29:37 +07:00
if (ifindex)
dev = __dev_get_by_index(net, ifindex);
if (ipv6_chk_addr_and_flags(net, addr, dev, true, 0, IFA_F_TENTATIVE))
return -EINVAL;
pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL);
if (!pac)
return -ENOMEM;
pac->acl_next = NULL;
pac->acl_addr = *addr;
if (ifindex == 0) {
struct rt6_info *rt;
rt = rt6_lookup(net, addr, NULL, 0, NULL, 0);
if (rt) {
dev = rt->dst.dev;
ip6_rt_put(rt);
} else if (ishost) {
err = -EADDRNOTAVAIL;
goto error;
} else {
/* router, no matching interface: just pick one */
dev = __dev_get_by_flags(net, IFF_UP,
IFF_UP | IFF_LOOPBACK);
}
net/ipv6: Change address check to always take a device argument ipv6_chk_addr_and_flags determines if an address is a local address and optionally if it is an address on a specific device. For example, it is called by ip6_route_info_create to determine if a given gateway address is a local address. The address check currently does not consider L3 domains and as a result does not allow a route to be added in one VRF if the nexthop points to an address in a second VRF. e.g., $ ip route add 2001:db8:1::/64 vrf r2 via 2001:db8:102::23 Error: Invalid gateway address. where 2001:db8:102::23 is an address on an interface in vrf r1. ipv6_chk_addr_and_flags needs to allow callers to always pass in a device with a separate argument to not limit the address to the specific device. The device is used used to determine the L3 domain of interest. To that end add an argument to skip the device check and update callers to always pass a device where possible and use the new argument to mean any address in the domain. Update a handful of users of ipv6_chk_addr with a NULL dev argument. This patch handles the change to these callers without adding the domain check. ip6_validate_gw needs to handle 2 cases - one where the device is given as part of the nexthop spec and the other where the device is resolved. There is at least 1 VRF case where deferring the check to only after the route lookup has resolved the device fails with an unintuitive error "RTNETLINK answers: No route to host" as opposed to the preferred "Error: Gateway can not be a local address." The 'no route to host' error is because of the fallback to a full lookup. The check is done twice to avoid this error. Signed-off-by: David Ahern <dsahern@gmail.com> Reviewed-by: Ido Schimmel <idosch@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-03-13 22:29:37 +07:00
}
if (!dev) {
err = -ENODEV;
goto error;
}
idev = __in6_dev_get(dev);
if (!idev) {
if (ifindex)
err = -ENODEV;
else
err = -EADDRNOTAVAIL;
goto error;
}
/* reset ishost, now that we have a specific device */
ishost = !idev->cnf.forwarding;
pac->acl_ifindex = dev->ifindex;
/* XXX
* For hosts, allow link-local or matching prefix anycasts.
* This obviates the need for propagating anycast routes while
* still allowing some non-router anycast participation.
*/
if (!ipv6_chk_prefix(addr, dev)) {
if (ishost)
err = -EADDRNOTAVAIL;
if (err)
goto error;
}
err = __ipv6_dev_ac_inc(idev, addr);
if (!err) {
pac->acl_next = np->ipv6_ac_list;
np->ipv6_ac_list = pac;
pac = NULL;
}
error:
if (pac)
sock_kfree_s(sk, pac, sizeof(*pac));
return err;
}
/*
* socket leave an anycast group
*/
int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct net_device *dev;
struct ipv6_ac_socklist *pac, *prev_pac;
struct net *net = sock_net(sk);
ASSERT_RTNL();
prev_pac = NULL;
for (pac = np->ipv6_ac_list; pac; pac = pac->acl_next) {
if ((ifindex == 0 || pac->acl_ifindex == ifindex) &&
ipv6_addr_equal(&pac->acl_addr, addr))
break;
prev_pac = pac;
}
if (!pac)
return -ENOENT;
if (prev_pac)
prev_pac->acl_next = pac->acl_next;
else
np->ipv6_ac_list = pac->acl_next;
dev = __dev_get_by_index(net, pac->acl_ifindex);
if (dev)
ipv6_dev_ac_dec(dev, &pac->acl_addr);
sock_kfree_s(sk, pac, sizeof(*pac));
return 0;
}
void ipv6_sock_ac_close(struct sock *sk)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct net_device *dev = NULL;
struct ipv6_ac_socklist *pac;
struct net *net = sock_net(sk);
int prev_index;
if (!np->ipv6_ac_list)
return;
rtnl_lock();
pac = np->ipv6_ac_list;
np->ipv6_ac_list = NULL;
prev_index = 0;
while (pac) {
struct ipv6_ac_socklist *next = pac->acl_next;
if (pac->acl_ifindex != prev_index) {
dev = __dev_get_by_index(net, pac->acl_ifindex);
prev_index = pac->acl_ifindex;
}
if (dev)
ipv6_dev_ac_dec(dev, &pac->acl_addr);
sock_kfree_s(sk, pac, sizeof(*pac));
pac = next;
}
rtnl_unlock();
}
static void aca_get(struct ifacaddr6 *aca)
{
refcount_inc(&aca->aca_refcnt);
}
static void aca_put(struct ifacaddr6 *ac)
{
if (refcount_dec_and_test(&ac->aca_refcnt)) {
fib6_info_release(ac->aca_rt);
kfree(ac);
}
}
static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i,
const struct in6_addr *addr)
{
struct ifacaddr6 *aca;
aca = kzalloc(sizeof(*aca), GFP_ATOMIC);
if (!aca)
return NULL;
aca->aca_addr = *addr;
fib6_info_hold(f6i);
aca->aca_rt = f6i;
aca->aca_users = 1;
/* aca_tstamp should be updated upon changes */
aca->aca_cstamp = aca->aca_tstamp = jiffies;
refcount_set(&aca->aca_refcnt, 1);
return aca;
}
/*
* device anycast group inc (add if not found)
*/
int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
{
struct ifacaddr6 *aca;
struct fib6_info *f6i;
struct net *net;
int err;
ASSERT_RTNL();
write_lock_bh(&idev->lock);
if (idev->dead) {
err = -ENODEV;
goto out;
}
for (aca = idev->ac_list; aca; aca = aca->aca_next) {
if (ipv6_addr_equal(&aca->aca_addr, addr)) {
aca->aca_users++;
err = 0;
goto out;
}
}
net = dev_net(idev->dev);
f6i = addrconf_f6i_alloc(net, idev, addr, true, GFP_ATOMIC);
if (IS_ERR(f6i)) {
err = PTR_ERR(f6i);
goto out;
}
aca = aca_alloc(f6i, addr);
if (!aca) {
fib6_info_release(f6i);
err = -ENOMEM;
goto out;
}
aca->aca_next = idev->ac_list;
idev->ac_list = aca;
/* Hold this for addrconf_join_solict() below before we unlock,
* it is already exposed via idev->ac_list.
*/
aca_get(aca);
write_unlock_bh(&idev->lock);
ip6_ins_rt(net, f6i);
addrconf_join_solict(idev->dev, &aca->aca_addr);
aca_put(aca);
return 0;
out:
write_unlock_bh(&idev->lock);
return err;
}
/*
* device anycast group decrement
*/
int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr)
{
struct ifacaddr6 *aca, *prev_aca;
ASSERT_RTNL();
write_lock_bh(&idev->lock);
prev_aca = NULL;
for (aca = idev->ac_list; aca; aca = aca->aca_next) {
if (ipv6_addr_equal(&aca->aca_addr, addr))
break;
prev_aca = aca;
}
if (!aca) {
write_unlock_bh(&idev->lock);
return -ENOENT;
}
if (--aca->aca_users > 0) {
write_unlock_bh(&idev->lock);
return 0;
}
if (prev_aca)
prev_aca->aca_next = aca->aca_next;
else
idev->ac_list = aca->aca_next;
write_unlock_bh(&idev->lock);
addrconf_leave_solict(idev, &aca->aca_addr);
ip6_del_rt(dev_net(idev->dev), aca->aca_rt);
aca_put(aca);
return 0;
}
/* called with rtnl_lock() */
static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr)
{
struct inet6_dev *idev = __in6_dev_get(dev);
if (!idev)
return -ENODEV;
return __ipv6_dev_ac_dec(idev, addr);
}
void ipv6_ac_destroy_dev(struct inet6_dev *idev)
{
struct ifacaddr6 *aca;
write_lock_bh(&idev->lock);
while ((aca = idev->ac_list) != NULL) {
idev->ac_list = aca->aca_next;
write_unlock_bh(&idev->lock);
addrconf_leave_solict(idev, &aca->aca_addr);
ip6_del_rt(dev_net(idev->dev), aca->aca_rt);
aca_put(aca);
write_lock_bh(&idev->lock);
}
write_unlock_bh(&idev->lock);
}
/*
* check if the interface has this anycast address
* called with rcu_read_lock()
*/
static bool ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *addr)
{
struct inet6_dev *idev;
struct ifacaddr6 *aca;
idev = __in6_dev_get(dev);
if (idev) {
read_lock_bh(&idev->lock);
for (aca = idev->ac_list; aca; aca = aca->aca_next)
if (ipv6_addr_equal(&aca->aca_addr, addr))
break;
read_unlock_bh(&idev->lock);
return aca != NULL;
}
return false;
}
/*
* check if given interface (or any, if dev==0) has this anycast address
*/
bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev,
const struct in6_addr *addr)
{
bool found = false;
rcu_read_lock();
if (dev)
found = ipv6_chk_acast_dev(dev, addr);
else
for_each_netdev_rcu(net, dev)
if (ipv6_chk_acast_dev(dev, addr)) {
found = true;
break;
}
rcu_read_unlock();
return found;
}
/* check if this anycast address is link-local on given interface or
* is global
*/
bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev,
const struct in6_addr *addr)
{
return ipv6_chk_acast_addr(net,
(ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL ?
dev : NULL),
addr);
}
#ifdef CONFIG_PROC_FS
struct ac6_iter_state {
struct seq_net_private p;
struct net_device *dev;
struct inet6_dev *idev;
};
#define ac6_seq_private(seq) ((struct ac6_iter_state *)(seq)->private)
static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq)
{
struct ifacaddr6 *im = NULL;
struct ac6_iter_state *state = ac6_seq_private(seq);
struct net *net = seq_file_net(seq);
state->idev = NULL;
for_each_netdev_rcu(net, state->dev) {
struct inet6_dev *idev;
idev = __in6_dev_get(state->dev);
if (!idev)
continue;
read_lock_bh(&idev->lock);
im = idev->ac_list;
if (im) {
state->idev = idev;
break;
}
read_unlock_bh(&idev->lock);
}
return im;
}
static struct ifacaddr6 *ac6_get_next(struct seq_file *seq, struct ifacaddr6 *im)
{
struct ac6_iter_state *state = ac6_seq_private(seq);
im = im->aca_next;
while (!im) {
if (likely(state->idev != NULL))
read_unlock_bh(&state->idev->lock);
state->dev = next_net_device_rcu(state->dev);
if (!state->dev) {
state->idev = NULL;
break;
}
state->idev = __in6_dev_get(state->dev);
if (!state->idev)
continue;
read_lock_bh(&state->idev->lock);
im = state->idev->ac_list;
}
return im;
}
static struct ifacaddr6 *ac6_get_idx(struct seq_file *seq, loff_t pos)
{
struct ifacaddr6 *im = ac6_get_first(seq);
if (im)
while (pos && (im = ac6_get_next(seq, im)) != NULL)
--pos;
return pos ? NULL : im;
}
static void *ac6_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(RCU)
{
rcu_read_lock();
return ac6_get_idx(seq, *pos);
}
static void *ac6_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct ifacaddr6 *im = ac6_get_next(seq, v);
++*pos;
return im;
}
static void ac6_seq_stop(struct seq_file *seq, void *v)
__releases(RCU)
{
struct ac6_iter_state *state = ac6_seq_private(seq);
if (likely(state->idev != NULL)) {
read_unlock_bh(&state->idev->lock);
state->idev = NULL;
}
rcu_read_unlock();
}
static int ac6_seq_show(struct seq_file *seq, void *v)
{
struct ifacaddr6 *im = (struct ifacaddr6 *)v;
struct ac6_iter_state *state = ac6_seq_private(seq);
seq_printf(seq, "%-4d %-15s %pi6 %5d\n",
state->dev->ifindex, state->dev->name,
&im->aca_addr, im->aca_users);
return 0;
}
static const struct seq_operations ac6_seq_ops = {
.start = ac6_seq_start,
.next = ac6_seq_next,
.stop = ac6_seq_stop,
.show = ac6_seq_show,
};
int __net_init ac6_proc_init(struct net *net)
{
if (!proc_create_net("anycast6", 0444, net->proc_net, &ac6_seq_ops,
sizeof(struct ac6_iter_state)))
return -ENOMEM;
return 0;
}
void ac6_proc_exit(struct net *net)
{
remove_proc_entry("anycast6", net->proc_net);
}
#endif