ipv6: hook up exception table to store dst cache

This commit uses the exception hash table implementation to store dst
caches created by PMTU discovery and IP redirect in the hash table under
the rt6_info, and no longer inserts these routes into the fib6
tree.
As a result, the fib6 tree contains only statically configured routes and
can now be protected by RCU instead of a rwlock.
With this change, in the route lookup related functions, after finding
the rt6_info with the longest prefix, we also need to search the
exception table before backtracking.
In the route delete function, if the route being deleted is not a dst
cache, deleting it also needs to flush the whole hash table
under it. If it is a dst cache, then only the cached dst is deleted from
the hash table.

Note: in fib6_walk_continue(), w->root now always points to a root node,
because fib6_prune_clones() has been removed from the code. So we add a
WARN_ON_ONCE() check to make sure w->root always points to a root node,
and we also remove the update of w->root in fib6_repair_tree().
This is a prerequisite for a later patch: w->root then does not need to
be RCU-protected when the rwlock is replaced with RCU.
Also, we remove all prune-related variables as they are no longer used.

Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Wei Wang 2017-10-06 12:06:03 -07:00 committed by David S. Miller
parent 38fbeeeecc
commit 2b760fcf5c
4 changed files with 72 additions and 133 deletions

View File

@ -280,7 +280,6 @@ struct fib6_walker {
struct fib6_node *root, *node;
struct rt6_info *leaf;
enum fib6_walk_state state;
bool prune;
unsigned int skip;
unsigned int count;
int (*func)(struct fib6_walker *);

View File

@ -2326,7 +2326,6 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
if (!fn)
goto out;
noflags |= RTF_CACHE;
for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
if (rt->dst.dev->ifindex != dev->ifindex)
continue;

View File

@ -54,7 +54,6 @@ struct fib6_cleaner {
#define FWS_INIT FWS_L
#endif
static void fib6_prune_clones(struct net *net, struct fib6_node *fn);
static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn);
static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn);
static int fib6_walk(struct net *net, struct fib6_walker *w);
@ -1101,6 +1100,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt)))
return -EINVAL;
if (WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE))
return -EINVAL;
if (info->nlh) {
if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
@ -1192,11 +1193,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
#endif
err = fib6_add_rt2node(fn, rt, info, mxc);
if (!err) {
if (!err)
fib6_start_gc(info->nl_net, rt);
if (!(rt->rt6i_flags & RTF_CACHE))
fib6_prune_clones(info->nl_net, pn);
}
out:
if (err) {
@ -1511,19 +1509,12 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
read_lock(&net->ipv6.fib6_walker_lock);
FOR_WALKERS(net, w) {
if (!child) {
if (w->root == fn) {
w->root = w->node = NULL;
RT6_TRACE("W %p adjusted by delroot 1\n", w);
} else if (w->node == fn) {
if (w->node == fn) {
RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate);
w->node = pn;
w->state = nstate;
}
} else {
if (w->root == fn) {
w->root = child;
RT6_TRACE("W %p adjusted by delroot 2\n", w);
}
if (w->node == fn) {
w->node = child;
if (children&2) {
@ -1557,12 +1548,17 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
RT6_TRACE("fib6_del_route\n");
WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE);
/* Unlink it */
*rtp = rt->dst.rt6_next;
rt->rt6i_node = NULL;
net->ipv6.rt6_stats->fib_rt_entries--;
net->ipv6.rt6_stats->fib_discarded_routes++;
/* Flush all cached dst in exception table */
rt6_flush_exceptions(rt);
/* Reset round-robin state, if necessary */
if (fn->rr_ptr == rt)
fn->rr_ptr = NULL;
@ -1625,18 +1621,9 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info)
WARN_ON(!(fn->fn_flags & RTN_RTINFO));
if (!(rt->rt6i_flags & RTF_CACHE)) {
struct fib6_node *pn = fn;
#ifdef CONFIG_IPV6_SUBTREES
/* clones of this route might be in another subtree */
if (rt->rt6i_src.plen) {
while (!(pn->fn_flags & RTN_ROOT))
pn = pn->parent;
pn = pn->parent;
}
#endif
fib6_prune_clones(info->nl_net, pn);
}
/* remove cached dst from exception table */
if (rt->rt6i_flags & RTF_CACHE)
return rt6_remove_exception_rt(rt);
/*
* Walk the leaf entries looking for ourself
@ -1679,16 +1666,14 @@ static int fib6_walk_continue(struct fib6_walker *w)
{
struct fib6_node *fn, *pn;
/* w->root should always be table->tb6_root */
WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT));
for (;;) {
fn = w->node;
if (!fn)
return 0;
if (w->prune && fn != w->root &&
fn->fn_flags & RTN_RTINFO && w->state < FWS_C) {
w->state = FWS_C;
w->leaf = fn->leaf;
}
switch (w->state) {
#ifdef CONFIG_IPV6_SUBTREES
case FWS_S:
@ -1820,20 +1805,16 @@ static int fib6_clean_node(struct fib6_walker *w)
* func is called on each route.
* It may return -1 -> delete this route.
* 0 -> continue walking
*
* prune==1 -> only immediate children of node (certainly,
* ignoring pure split nodes) will be scanned.
*/
static void fib6_clean_tree(struct net *net, struct fib6_node *root,
int (*func)(struct rt6_info *, void *arg),
bool prune, int sernum, void *arg)
int sernum, void *arg)
{
struct fib6_cleaner c;
c.w.root = root;
c.w.func = fib6_clean_node;
c.w.prune = prune;
c.w.count = 0;
c.w.skip = 0;
c.func = func;
@ -1858,7 +1839,7 @@ static void __fib6_clean_all(struct net *net,
hlist_for_each_entry_rcu(table, head, tb6_hlist) {
write_lock_bh(&table->tb6_lock);
fib6_clean_tree(net, &table->tb6_root,
func, false, sernum, arg);
func, sernum, arg);
write_unlock_bh(&table->tb6_lock);
}
}
@ -1871,22 +1852,6 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *),
__fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg);
}
static int fib6_prune_clone(struct rt6_info *rt, void *arg)
{
if (rt->rt6i_flags & RTF_CACHE) {
RT6_TRACE("pruning clone %p\n", rt);
return -1;
}
return 0;
}
static void fib6_prune_clones(struct net *net, struct fib6_node *fn)
{
fib6_clean_tree(net, fn, fib6_prune_clone, true,
FIB6_NO_SERNUM_CHANGE, NULL);
}
static void fib6_flush_trees(struct net *net)
{
int new_sernum = fib6_new_sernum(net);
@ -1914,32 +1879,6 @@ static int fib6_age(struct rt6_info *rt, void *arg)
return -1;
}
gc_args->more++;
/* The following part will soon be removed when the exception
* table is hooked up to store all cached routes.
*/
} else if (rt->rt6i_flags & RTF_CACHE) {
if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout))
rt->dst.obsolete = DST_OBSOLETE_KILL;
if (atomic_read(&rt->dst.__refcnt) == 1 &&
rt->dst.obsolete == DST_OBSOLETE_KILL) {
RT6_TRACE("aging clone %p\n", rt);
return -1;
} else if (rt->rt6i_flags & RTF_GATEWAY) {
struct neighbour *neigh;
__u8 neigh_flags = 0;
neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
if (neigh) {
neigh_flags = neigh->flags;
neigh_release(neigh);
}
if (!(neigh_flags & NTF_ROUTER)) {
RT6_TRACE("purging route %p via non-router but gateway\n",
rt);
return -1;
}
}
gc_args->more++;
}
/* Also age clones in the exception table.

View File

@ -878,8 +878,8 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
struct fib6_table *table,
struct flowi6 *fl6, int flags)
{
struct rt6_info *rt, *rt_cache;
struct fib6_node *fn;
struct rt6_info *rt;
read_lock_bh(&table->tb6_lock);
fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
@ -893,6 +893,11 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
if (fn)
goto restart;
}
/* Search through exception table */
rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
if (rt_cache)
rt = rt_cache;
dst_use(&rt->dst, jiffies);
read_unlock_bh(&table->tb6_lock);
@ -1592,7 +1597,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
int oif, struct flowi6 *fl6, int flags)
{
struct fib6_node *fn, *saved_fn;
struct rt6_info *rt;
struct rt6_info *rt, *rt_cache;
int strict = 0;
strict |= flags & RT6_LOOKUP_F_IFACE;
@ -1624,6 +1629,10 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
}
}
/*Search through exception table */
rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
if (rt_cache)
rt = rt_cache;
if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
dst_use(&rt->dst, jiffies);
@ -1988,23 +1997,17 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
if (!rt6_cache_allowed_for_pmtu(rt6)) {
rt6_do_update_pmtu(rt6, mtu);
/* update rt6_ex->stamp for cache */
if (rt6->rt6i_flags & RTF_CACHE)
rt6_update_exception_stamp_rt(rt6);
} else if (daddr) {
struct rt6_info *nrt6;
nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
if (nrt6) {
rt6_do_update_pmtu(nrt6, mtu);
/* ip6_ins_rt(nrt6) will bump the
* rt6->rt6i_node->fn_sernum
* which will fail the next rt6_check() and
* invalidate the sk->sk_dst_cache.
*/
ip6_ins_rt(nrt6);
/* Release the reference taken in
* ip6_rt_cache_alloc()
*/
dst_release(&nrt6->dst);
if (rt6_insert_exception(nrt6, rt6))
dst_release_immediate(&nrt6->dst);
}
}
}
@ -2068,7 +2071,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
int flags)
{
struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
struct rt6_info *rt;
struct rt6_info *rt, *rt_cache;
struct fib6_node *fn;
/* Get the "current" route for this destination and
@ -2093,8 +2096,23 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
continue;
if (fl6->flowi6_oif != rt->dst.dev->ifindex)
continue;
if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
/* rt_cache's gateway might be different from its 'parent'
* in the case of an ip redirect.
* So we keep searching in the exception table if the gateway
* is different.
*/
if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
rt_cache = rt6_find_cached_rt(rt,
&fl6->daddr,
&fl6->saddr);
if (rt_cache &&
ipv6_addr_equal(&rdfl->gateway,
&rt_cache->rt6i_gateway)) {
rt = rt_cache;
break;
}
continue;
}
break;
}
@ -2785,9 +2803,9 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
static int ip6_route_del(struct fib6_config *cfg,
struct netlink_ext_ack *extack)
{
struct rt6_info *rt, *rt_cache;
struct fib6_table *table;
struct fib6_node *fn;
struct rt6_info *rt;
int err = -ESRCH;
table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
@ -2801,13 +2819,17 @@ static int ip6_route_del(struct fib6_config *cfg,
fn = fib6_locate(&table->tb6_root,
&cfg->fc_dst, cfg->fc_dst_len,
&cfg->fc_src, cfg->fc_src_len,
true);
!(cfg->fc_flags & RTF_CACHE));
if (fn) {
for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
if ((rt->rt6i_flags & RTF_CACHE) &&
!(cfg->fc_flags & RTF_CACHE))
continue;
if (cfg->fc_flags & RTF_CACHE) {
rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
&cfg->fc_src);
if (!rt_cache)
continue;
rt = rt_cache;
}
if (cfg->fc_ifindex &&
(!rt->dst.dev ||
rt->dst.dev->ifindex != cfg->fc_ifindex))
@ -2933,8 +2955,14 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
nrt->rt6i_protocol = RTPROT_REDIRECT;
nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
if (ip6_ins_rt(nrt))
goto out_release;
/* No need to remove rt from the exception table if rt is
* a cached route because rt6_insert_exception() will
* take care of it
*/
if (rt6_insert_exception(nrt, rt)) {
dst_release_immediate(&nrt->dst);
goto out;
}
netevent.old = &rt->dst;
netevent.new = &nrt->dst;
@ -2942,17 +2970,6 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
netevent.neigh = neigh;
call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
if (rt->rt6i_flags & RTF_CACHE) {
rt = (struct rt6_info *) dst_clone(&rt->dst);
ip6_del_rt(rt);
}
out_release:
/* Release the reference taken in
* ip6_rt_cache_alloc()
*/
dst_release(&nrt->dst);
out:
neigh_release(neigh);
}
@ -3344,12 +3361,8 @@ static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
{
struct in6_addr *gateway = (struct in6_addr *)arg;
/* RTF_CACHE_GATEWAY case will be removed once the exception
* table is hooked up to store all cached routes.
*/
if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
return -1;
}
@ -3438,20 +3451,9 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
dst_metric_raw(&rt->dst, RTAX_MTU) &&
!dst_metric_locked(&rt->dst, RTAX_MTU)) {
spin_lock_bh(&rt6_exception_lock);
/* This case will be removed once the exception table
* is hooked up.
*/
if (rt->rt6i_flags & RTF_CACHE) {
/* For RTF_CACHE with rt6i_pmtu == 0
* (i.e. a redirected route),
* the metrics of its rt->dst.from has already
* been updated.
*/
if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
rt->rt6i_pmtu = arg->mtu;
} else if (dst_mtu(&rt->dst) >= arg->mtu ||
(dst_mtu(&rt->dst) < arg->mtu &&
dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
if (dst_mtu(&rt->dst) >= arg->mtu ||
(dst_mtu(&rt->dst) < arg->mtu &&
dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
}
rt6_exceptions_update_pmtu(rt, arg->mtu);