2019-05-27 13:55:01 +07:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
2005-04-17 05:20:36 +07:00
|
|
|
/*
|
2007-02-09 21:24:49 +07:00
|
|
|
* Linux INET6 implementation
|
2005-04-17 05:20:36 +07:00
|
|
|
* Forwarding Information Database
|
|
|
|
*
|
|
|
|
* Authors:
|
2007-02-09 21:24:49 +07:00
|
|
|
* Pedro Roque <roque@di.fc.ul.pt>
|
2005-04-17 05:20:36 +07:00
|
|
|
*
|
2014-03-28 11:07:02 +07:00
|
|
|
* Changes:
|
|
|
|
* Yuji SEKIYA @USAGI: Support default route on router node;
|
|
|
|
* remove ip6_null_entry from the top of
|
|
|
|
* routing table.
|
|
|
|
* Ville Nuorvala: Fixed routing subtrees.
|
2005-04-17 05:20:36 +07:00
|
|
|
*/
|
2012-05-15 21:11:53 +07:00
|
|
|
|
|
|
|
#define pr_fmt(fmt) "IPv6: " fmt
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/net.h>
|
|
|
|
#include <linux/route.h>
|
|
|
|
#include <linux/netdevice.h>
|
|
|
|
#include <linux/in6.h>
|
|
|
|
#include <linux/init.h>
|
2006-08-05 13:20:06 +07:00
|
|
|
#include <linux/list.h>
|
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the followings.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and try to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
wildly available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build test were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable easily on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 15:04:11 +07:00
|
|
|
#include <linux/slab.h>
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2018-10-05 10:07:52 +07:00
|
|
|
#include <net/ip.h>
|
2005-04-17 05:20:36 +07:00
|
|
|
#include <net/ipv6.h>
|
|
|
|
#include <net/ndisc.h>
|
|
|
|
#include <net/addrconf.h>
|
2015-07-21 15:43:48 +07:00
|
|
|
#include <net/lwtunnel.h>
|
2017-08-03 18:28:17 +07:00
|
|
|
#include <net/fib_notifier.h>
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
#include <net/ip6_fib.h>
|
|
|
|
#include <net/ip6_route.h>
|
|
|
|
|
2014-03-28 11:07:04 +07:00
|
|
|
static struct kmem_cache *fib6_node_kmem __read_mostly;
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2014-10-07 00:58:34 +07:00
|
|
|
struct fib6_cleaner {
|
|
|
|
struct fib6_walker w;
|
2008-03-04 14:31:57 +07:00
|
|
|
struct net *net;
|
2018-04-18 07:33:26 +07:00
|
|
|
int (*func)(struct fib6_info *, void *arg);
|
2014-10-07 00:58:38 +07:00
|
|
|
int sernum;
|
2005-04-17 05:20:36 +07:00
|
|
|
void *arg;
|
2018-10-12 10:17:21 +07:00
|
|
|
bool skip_notify;
|
2005-04-17 05:20:36 +07:00
|
|
|
};
|
|
|
|
|
|
|
|
#ifdef CONFIG_IPV6_SUBTREES
|
|
|
|
#define FWS_INIT FWS_S
|
|
|
|
#else
|
|
|
|
#define FWS_INIT FWS_L
|
|
|
|
#endif
|
|
|
|
|
2018-04-18 07:33:26 +07:00
|
|
|
static struct fib6_info *fib6_find_prefix(struct net *net,
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
struct fib6_table *table,
|
|
|
|
struct fib6_node *fn);
|
|
|
|
static struct fib6_node *fib6_repair_tree(struct net *net,
|
|
|
|
struct fib6_table *table,
|
|
|
|
struct fib6_node *fn);
|
2016-03-08 20:44:35 +07:00
|
|
|
static int fib6_walk(struct net *net, struct fib6_walker *w);
|
2014-10-07 00:58:34 +07:00
|
|
|
static int fib6_walk_continue(struct fib6_walker *w);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* A routing update causes an increase of the serial number on the
|
|
|
|
* affected subtree. This allows for cached routes to be asynchronously
|
|
|
|
* tested when modifications are made to the destination cache as a
|
|
|
|
* result of redirects, path MTU changes, etc.
|
|
|
|
*/
|
|
|
|
|
treewide: setup_timer() -> timer_setup() (2 field)
This converts all remaining setup_timer() calls that use a nested field
to reach a struct timer_list. Coccinelle does not have an easy way to
match multiple fields, so a new script is needed to change the matches of
"&_E->_timer" into "&_E->_field1._timer" in all the rules.
spatch --very-quiet --all-includes --include-headers \
-I ./arch/x86/include -I ./arch/x86/include/generated \
-I ./include -I ./arch/x86/include/uapi \
-I ./arch/x86/include/generated/uapi -I ./include/uapi \
-I ./include/generated/uapi --include ./include/linux/kconfig.h \
--dir . \
--cocci-file ~/src/data/timer_setup-2fields.cocci
@fix_address_of depends@
expression e;
@@
setup_timer(
-&(e)
+&e
, ...)
// Update any raw setup_timer() usages that have a NULL callback, but
// would otherwise match change_timer_function_usage, since the latter
// will update all function assignments done in the face of a NULL
// function initialization in setup_timer().
@change_timer_function_usage_NULL@
expression _E;
identifier _field1;
identifier _timer;
type _cast_data;
@@
(
-setup_timer(&_E->_field1._timer, NULL, _E);
+timer_setup(&_E->_field1._timer, NULL, 0);
|
-setup_timer(&_E->_field1._timer, NULL, (_cast_data)_E);
+timer_setup(&_E->_field1._timer, NULL, 0);
|
-setup_timer(&_E._field1._timer, NULL, &_E);
+timer_setup(&_E._field1._timer, NULL, 0);
|
-setup_timer(&_E._field1._timer, NULL, (_cast_data)&_E);
+timer_setup(&_E._field1._timer, NULL, 0);
)
@change_timer_function_usage@
expression _E;
identifier _field1;
identifier _timer;
struct timer_list _stl;
identifier _callback;
type _cast_func, _cast_data;
@@
(
-setup_timer(&_E->_field1._timer, _callback, _E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, &_callback, _E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, _callback, (_cast_data)_E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, &_callback, (_cast_data)_E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, (_cast_func)_callback, _E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, (_cast_func)&_callback, _E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, _callback, (_cast_data)_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, _callback, (_cast_data)&_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, &_callback, (_cast_data)_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, &_callback, (_cast_data)&_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, (_cast_func)_callback, (_cast_data)&_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, (_cast_func)&_callback, (_cast_data)&_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
_E->_field1._timer@_stl.function = _callback;
|
_E->_field1._timer@_stl.function = &_callback;
|
_E->_field1._timer@_stl.function = (_cast_func)_callback;
|
_E->_field1._timer@_stl.function = (_cast_func)&_callback;
|
_E._field1._timer@_stl.function = _callback;
|
_E._field1._timer@_stl.function = &_callback;
|
_E._field1._timer@_stl.function = (_cast_func)_callback;
|
_E._field1._timer@_stl.function = (_cast_func)&_callback;
)
// callback(unsigned long arg)
@change_callback_handle_cast
depends on change_timer_function_usage@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
identifier _handle;
@@
void _callback(
-_origtype _origarg
+struct timer_list *t
)
{
(
... when != _origarg
_handletype *_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _field1._timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle =
-(void *)_origarg;
+from_timer(_handle, t, _field1._timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle;
... when != _handle
_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _field1._timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle;
... when != _handle
_handle =
-(void *)_origarg;
+from_timer(_handle, t, _field1._timer);
... when != _origarg
)
}
// callback(unsigned long arg) without existing variable
@change_callback_handle_cast_no_arg
depends on change_timer_function_usage &&
!change_callback_handle_cast@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
@@
void _callback(
-_origtype _origarg
+struct timer_list *t
)
{
+ _handletype *_origarg = from_timer(_origarg, t, _field1._timer);
+
... when != _origarg
- (_handletype *)_origarg
+ _origarg
... when != _origarg
}
// Avoid already converted callbacks.
@match_callback_converted
depends on change_timer_function_usage &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier t;
@@
void _callback(struct timer_list *t)
{ ... }
// callback(struct something *handle)
@change_callback_handle_arg
depends on change_timer_function_usage &&
!match_callback_converted &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
@@
void _callback(
-_handletype *_handle
+struct timer_list *t
)
{
+ _handletype *_handle = from_timer(_handle, t, _field1._timer);
...
}
// If change_callback_handle_arg ran on an empty function, remove
// the added handler.
@unchange_callback_handle_arg
depends on change_timer_function_usage &&
change_callback_handle_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
identifier t;
@@
void _callback(struct timer_list *t)
{
- _handletype *_handle = from_timer(_handle, t, _field1._timer);
}
// We only want to refactor the setup_timer() data argument if we've found
// the matching callback. This undoes changes in change_timer_function_usage.
@unchange_timer_function_usage
depends on change_timer_function_usage &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg &&
!change_callback_handle_arg@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type change_timer_function_usage._cast_data;
@@
(
-timer_setup(&_E->_field1._timer, _callback, 0);
+setup_timer(&_E->_field1._timer, _callback, (_cast_data)_E);
|
-timer_setup(&_E._field1._timer, _callback, 0);
+setup_timer(&_E._field1._timer, _callback, (_cast_data)&_E);
)
// If we fixed a callback from a .function assignment, fix the
// assignment cast now.
@change_timer_function_assignment
depends on change_timer_function_usage &&
(change_callback_handle_cast ||
change_callback_handle_cast_no_arg ||
change_callback_handle_arg)@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_func;
typedef TIMER_FUNC_TYPE;
@@
(
_E->_field1._timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_field1._timer.function =
-&_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_field1._timer.function =
-(_cast_func)_callback;
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_field1._timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._field1._timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._field1._timer.function =
-&_callback;
+(TIMER_FUNC_TYPE)_callback
;
|
_E._field1._timer.function =
-(_cast_func)_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._field1._timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
;
)
// Sometimes timer functions are called directly. Replace matched args.
@change_timer_function_calls
depends on change_timer_function_usage &&
(change_callback_handle_cast ||
change_callback_handle_cast_no_arg ||
change_callback_handle_arg)@
expression _E;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_data;
@@
_callback(
(
-(_cast_data)_E
+&_E->_field1._timer
|
-(_cast_data)&_E
+&_E._field1._timer
|
-_E
+&_E->_field1._timer
)
)
// If a timer has been configured without a data argument, it can be
// converted without regard to the callback argument, since it is unused.
@match_timer_function_unused_data@
expression _E;
identifier _field1;
identifier _timer;
identifier _callback;
@@
(
-setup_timer(&_E->_field1._timer, _callback, 0);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, _callback, 0L);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, _callback, 0UL);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, _callback, 0);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, _callback, 0L);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, _callback, 0UL);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_field1._timer, _callback, 0);
+timer_setup(&_field1._timer, _callback, 0);
|
-setup_timer(&_field1._timer, _callback, 0L);
+timer_setup(&_field1._timer, _callback, 0);
|
-setup_timer(&_field1._timer, _callback, 0UL);
+timer_setup(&_field1._timer, _callback, 0);
|
-setup_timer(_field1._timer, _callback, 0);
+timer_setup(_field1._timer, _callback, 0);
|
-setup_timer(_field1._timer, _callback, 0L);
+timer_setup(_field1._timer, _callback, 0);
|
-setup_timer(_field1._timer, _callback, 0UL);
+timer_setup(_field1._timer, _callback, 0);
)
@change_callback_unused_data
depends on match_timer_function_unused_data@
identifier match_timer_function_unused_data._callback;
type _origtype;
identifier _origarg;
@@
void _callback(
-_origtype _origarg
+struct timer_list *unused
)
{
... when != _origarg
}
Signed-off-by: Kees Cook <keescook@chromium.org>
2017-10-18 10:21:24 +07:00
|
|
|
static void fib6_gc_timer_cb(struct timer_list *t);
|
2008-03-04 14:28:58 +07:00
|
|
|
|
2016-03-08 20:44:35 +07:00
|
|
|
#define FOR_WALKERS(net, w) \
|
|
|
|
list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh)
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2016-03-08 20:44:35 +07:00
|
|
|
static void fib6_walker_link(struct net *net, struct fib6_walker *w)
|
2006-08-15 13:49:16 +07:00
|
|
|
{
|
2016-03-08 20:44:35 +07:00
|
|
|
write_lock_bh(&net->ipv6.fib6_walker_lock);
|
|
|
|
list_add(&w->lh, &net->ipv6.fib6_walkers);
|
|
|
|
write_unlock_bh(&net->ipv6.fib6_walker_lock);
|
2006-08-15 13:49:16 +07:00
|
|
|
}
|
|
|
|
|
2016-03-08 20:44:35 +07:00
|
|
|
static void fib6_walker_unlink(struct net *net, struct fib6_walker *w)
|
2006-08-15 13:49:16 +07:00
|
|
|
{
|
2016-03-08 20:44:35 +07:00
|
|
|
write_lock_bh(&net->ipv6.fib6_walker_lock);
|
2010-02-18 15:13:30 +07:00
|
|
|
list_del(&w->lh);
|
2016-03-08 20:44:35 +07:00
|
|
|
write_unlock_bh(&net->ipv6.fib6_walker_lock);
|
2006-08-15 13:49:16 +07:00
|
|
|
}
|
2014-10-07 00:58:34 +07:00
|
|
|
|
2014-10-07 00:58:37 +07:00
|
|
|
static int fib6_new_sernum(struct net *net)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
2014-10-07 00:58:35 +07:00
|
|
|
int new, old;
|
|
|
|
|
|
|
|
do {
|
2014-10-07 00:58:37 +07:00
|
|
|
old = atomic_read(&net->ipv6.fib6_sernum);
|
2014-10-07 00:58:35 +07:00
|
|
|
new = old < INT_MAX ? old + 1 : 1;
|
2014-10-07 00:58:37 +07:00
|
|
|
} while (atomic_cmpxchg(&net->ipv6.fib6_sernum,
|
|
|
|
old, new) != old);
|
2014-10-07 00:58:35 +07:00
|
|
|
return new;
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
|
2014-10-07 00:58:38 +07:00
|
|
|
enum {
|
|
|
|
FIB6_NO_SERNUM_CHANGE = 0,
|
|
|
|
};
|
|
|
|
|
2018-04-19 05:38:59 +07:00
|
|
|
void fib6_update_sernum(struct net *net, struct fib6_info *f6i)
|
2017-10-07 02:05:56 +07:00
|
|
|
{
|
|
|
|
struct fib6_node *fn;
|
|
|
|
|
2018-04-19 05:38:59 +07:00
|
|
|
fn = rcu_dereference_protected(f6i->fib6_node,
|
|
|
|
lockdep_is_held(&f6i->fib6_table->tb6_lock));
|
2017-10-07 02:05:56 +07:00
|
|
|
if (fn)
|
|
|
|
fn->fn_sernum = fib6_new_sernum(net);
|
|
|
|
}
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
/*
|
|
|
|
* Auxiliary address test functions for the radix tree.
|
|
|
|
*
|
2007-02-09 21:24:49 +07:00
|
|
|
* These assume a 32bit processor (although it will work on
|
2005-04-17 05:20:36 +07:00
|
|
|
* 64bit processors)
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* test bit
|
|
|
|
*/
|
2010-03-27 08:24:16 +07:00
|
|
|
#if defined(__LITTLE_ENDIAN)
|
|
|
|
# define BITOP_BE32_SWIZZLE (0x1F & ~7)
|
|
|
|
#else
|
|
|
|
# define BITOP_BE32_SWIZZLE 0
|
|
|
|
#endif
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2014-10-07 00:58:34 +07:00
|
|
|
static __be32 addr_bit_set(const void *token, int fn_bit)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
2011-04-22 11:53:02 +07:00
|
|
|
const __be32 *addr = token;
|
2010-03-27 08:24:16 +07:00
|
|
|
/*
|
|
|
|
* Here,
|
2014-03-28 11:07:02 +07:00
|
|
|
* 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)
|
2010-03-27 08:24:16 +07:00
|
|
|
* is optimized version of
|
|
|
|
* htonl(1 << ((~fn_bit)&0x1F))
|
|
|
|
* See include/asm-generic/bitops/le.h.
|
|
|
|
*/
|
2010-04-21 09:06:52 +07:00
|
|
|
return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) &
|
|
|
|
addr[fn_bit >> 5];
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
|
2019-05-23 10:27:59 +07:00
|
|
|
struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh)
|
2018-04-18 07:33:24 +07:00
|
|
|
{
|
2018-04-18 07:33:26 +07:00
|
|
|
struct fib6_info *f6i;
|
2019-05-23 10:27:59 +07:00
|
|
|
size_t sz = sizeof(*f6i);
|
2018-04-18 07:33:24 +07:00
|
|
|
|
2019-05-23 10:27:59 +07:00
|
|
|
if (with_fib6_nh)
|
|
|
|
sz += sizeof(struct fib6_nh);
|
|
|
|
|
|
|
|
f6i = kzalloc(sz, gfp_flags);
|
2018-04-18 07:33:24 +07:00
|
|
|
if (!f6i)
|
|
|
|
return NULL;
|
|
|
|
|
2019-06-04 10:19:52 +07:00
|
|
|
/* fib6_siblings is a union with nh_list, so this initializes both */
|
2018-04-19 05:38:59 +07:00
|
|
|
INIT_LIST_HEAD(&f6i->fib6_siblings);
|
2019-04-23 08:35:03 +07:00
|
|
|
refcount_set(&f6i->fib6_ref, 1);
|
2018-04-18 07:33:24 +07:00
|
|
|
|
|
|
|
return f6i;
|
|
|
|
}
|
|
|
|
|
2018-06-18 19:24:31 +07:00
|
|
|
void fib6_info_destroy_rcu(struct rcu_head *head)
|
2018-04-18 07:33:24 +07:00
|
|
|
{
|
2018-06-18 19:24:31 +07:00
|
|
|
struct fib6_info *f6i = container_of(head, struct fib6_info, rcu);
|
2018-04-18 07:33:24 +07:00
|
|
|
|
2018-04-19 05:38:59 +07:00
|
|
|
WARN_ON(f6i->fib6_node);
|
2018-04-18 07:33:24 +07:00
|
|
|
|
2019-06-04 10:19:52 +07:00
|
|
|
if (f6i->nh)
|
|
|
|
nexthop_put(f6i->nh);
|
|
|
|
else
|
|
|
|
fib6_nh_release(f6i->fib6_nh);
|
|
|
|
|
2018-10-05 10:07:52 +07:00
|
|
|
ip_fib_metrics_put(f6i->fib6_metrics);
|
2018-04-18 07:33:24 +07:00
|
|
|
kfree(f6i);
|
|
|
|
}
|
2018-06-18 19:24:31 +07:00
|
|
|
EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu);
|
2018-04-18 07:33:24 +07:00
|
|
|
|
2017-10-07 02:06:11 +07:00
|
|
|
static struct fib6_node *node_alloc(struct net *net)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
|
|
|
struct fib6_node *fn;
|
|
|
|
|
2007-02-10 16:45:03 +07:00
|
|
|
fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC);
|
2017-10-07 02:06:11 +07:00
|
|
|
if (fn)
|
|
|
|
net->ipv6.rt6_stats->fib_nodes++;
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
return fn;
|
|
|
|
}
|
|
|
|
|
2017-10-07 02:06:11 +07:00
|
|
|
static void node_free_immediate(struct net *net, struct fib6_node *fn)
|
2017-08-21 23:47:10 +07:00
|
|
|
{
|
|
|
|
kmem_cache_free(fib6_node_kmem, fn);
|
2017-10-07 02:06:11 +07:00
|
|
|
net->ipv6.rt6_stats->fib_nodes--;
|
2017-08-21 23:47:10 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static void node_free_rcu(struct rcu_head *head)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
2017-08-21 23:47:10 +07:00
|
|
|
struct fib6_node *fn = container_of(head, struct fib6_node, rcu);
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
kmem_cache_free(fib6_node_kmem, fn);
|
|
|
|
}
|
|
|
|
|
2017-10-07 02:06:11 +07:00
|
|
|
static void node_free(struct net *net, struct fib6_node *fn)
|
2017-08-21 23:47:10 +07:00
|
|
|
{
|
|
|
|
call_rcu(&fn->rcu, node_free_rcu);
|
2017-10-07 02:06:11 +07:00
|
|
|
net->ipv6.rt6_stats->fib_nodes--;
|
2017-08-21 23:47:10 +07:00
|
|
|
}
|
|
|
|
|
2017-09-08 15:26:19 +07:00
|
|
|
static void fib6_free_table(struct fib6_table *table)
|
|
|
|
{
|
|
|
|
inetpeer_invalidate_tree(&table->tb6_peers);
|
|
|
|
kfree(table);
|
|
|
|
}
|
|
|
|
|
2008-03-04 14:25:27 +07:00
|
|
|
static void fib6_link_table(struct net *net, struct fib6_table *tb)
|
2006-08-11 13:11:17 +07:00
|
|
|
{
|
|
|
|
unsigned int h;
|
|
|
|
|
2006-10-22 10:20:54 +07:00
|
|
|
/*
|
|
|
|
* Initialize table lock at a single place to give lockdep a key,
|
|
|
|
* tables aren't visible prior to being linked to the list.
|
|
|
|
*/
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
spin_lock_init(&tb->tb6_lock);
|
2009-07-31 08:52:15 +07:00
|
|
|
h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1);
|
2006-08-11 13:11:17 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* No protection necessary, this is the only list mutatation
|
|
|
|
* operation, tables never disappear once they exist.
|
|
|
|
*/
|
2008-03-04 14:25:27 +07:00
|
|
|
hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]);
|
2006-08-11 13:11:17 +07:00
|
|
|
}
|
2006-08-05 13:20:06 +07:00
|
|
|
|
2006-08-11 13:11:17 +07:00
|
|
|
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
|
2008-03-04 14:24:31 +07:00
|
|
|
|
2008-03-05 04:48:30 +07:00
|
|
|
static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
|
2006-08-05 13:20:06 +07:00
|
|
|
{
|
|
|
|
struct fib6_table *table;
|
|
|
|
|
|
|
|
table = kzalloc(sizeof(*table), GFP_ATOMIC);
|
2011-12-04 05:50:45 +07:00
|
|
|
if (table) {
|
2006-08-05 13:20:06 +07:00
|
|
|
table->tb6_id = id;
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
rcu_assign_pointer(table->tb6_root.leaf,
|
2018-04-18 07:33:18 +07:00
|
|
|
net->ipv6.fib6_null_entry);
|
2006-08-05 13:20:06 +07:00
|
|
|
table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
|
2012-06-11 14:01:52 +07:00
|
|
|
inet_peer_base_init(&table->tb6_peers);
|
2006-08-05 13:20:06 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
return table;
|
|
|
|
}
|
|
|
|
|
2008-03-04 14:25:27 +07:00
|
|
|
struct fib6_table *fib6_new_table(struct net *net, u32 id)
|
2006-08-05 13:20:06 +07:00
|
|
|
{
|
|
|
|
struct fib6_table *tb;
|
|
|
|
|
|
|
|
if (id == 0)
|
|
|
|
id = RT6_TABLE_MAIN;
|
2008-03-04 14:25:27 +07:00
|
|
|
tb = fib6_get_table(net, id);
|
2006-08-05 13:20:06 +07:00
|
|
|
if (tb)
|
|
|
|
return tb;
|
|
|
|
|
2008-03-05 04:48:30 +07:00
|
|
|
tb = fib6_alloc_table(net, id);
|
2011-12-04 05:50:45 +07:00
|
|
|
if (tb)
|
2008-03-04 14:25:27 +07:00
|
|
|
fib6_link_table(net, tb);
|
2006-08-05 13:20:06 +07:00
|
|
|
|
|
|
|
return tb;
|
|
|
|
}
|
2016-05-05 11:46:12 +07:00
|
|
|
EXPORT_SYMBOL_GPL(fib6_new_table);
|
2006-08-05 13:20:06 +07:00
|
|
|
|
2008-03-04 14:25:27 +07:00
|
|
|
struct fib6_table *fib6_get_table(struct net *net, u32 id)
|
2006-08-05 13:20:06 +07:00
|
|
|
{
|
|
|
|
struct fib6_table *tb;
|
2008-03-04 14:25:27 +07:00
|
|
|
struct hlist_head *head;
|
2006-08-05 13:20:06 +07:00
|
|
|
unsigned int h;
|
|
|
|
|
|
|
|
if (id == 0)
|
|
|
|
id = RT6_TABLE_MAIN;
|
2009-07-31 08:52:15 +07:00
|
|
|
h = id & (FIB6_TABLE_HASHSZ - 1);
|
2006-08-05 13:20:06 +07:00
|
|
|
rcu_read_lock();
|
2008-03-04 14:25:27 +07:00
|
|
|
head = &net->ipv6.fib_table_hash[h];
|
hlist: drop the node parameter from iterators
I'm not sure why, but the hlist for each entry iterators were conceived
list_for_each_entry(pos, head, member)
The hlist ones were greedy and wanted an extra parameter:
hlist_for_each_entry(tpos, pos, head, member)
Why did they need an extra pos parameter? I'm not quite sure. Not only
they don't really need it, it also prevents the iterator from looking
exactly like the list iterator, which is unfortunate.
Besides the semantic patch, there was some manual work required:
- Fix up the actual hlist iterators in linux/list.h
- Fix up the declaration of other iterators based on the hlist ones.
- A very small amount of places were using the 'node' parameter, this
was modified to use 'obj->member' instead.
- Coccinelle didn't handle the hlist_for_each_entry_safe iterator
properly, so those had to be fixed up manually.
The semantic patch which is mostly the work of Peter Senna Tschudin is here:
@@
iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host;
type T;
expression a,c,d,e;
identifier b;
statement S;
@@
-T b;
<+... when != b
(
hlist_for_each_entry(a,
- b,
c, d) S
|
hlist_for_each_entry_continue(a,
- b,
c) S
|
hlist_for_each_entry_from(a,
- b,
c) S
|
hlist_for_each_entry_rcu(a,
- b,
c, d) S
|
hlist_for_each_entry_rcu_bh(a,
- b,
c, d) S
|
hlist_for_each_entry_continue_rcu_bh(a,
- b,
c) S
|
for_each_busy_worker(a, c,
- b,
d) S
|
ax25_uid_for_each(a,
- b,
c) S
|
ax25_for_each(a,
- b,
c) S
|
inet_bind_bucket_for_each(a,
- b,
c) S
|
sctp_for_each_hentry(a,
- b,
c) S
|
sk_for_each(a,
- b,
c) S
|
sk_for_each_rcu(a,
- b,
c) S
|
sk_for_each_from
-(a, b)
+(a)
S
+ sk_for_each_from(a) S
|
sk_for_each_safe(a,
- b,
c, d) S
|
sk_for_each_bound(a,
- b,
c) S
|
hlist_for_each_entry_safe(a,
- b,
c, d, e) S
|
hlist_for_each_entry_continue_rcu(a,
- b,
c) S
|
nr_neigh_for_each(a,
- b,
c) S
|
nr_neigh_for_each_safe(a,
- b,
c, d) S
|
nr_node_for_each(a,
- b,
c) S
|
nr_node_for_each_safe(a,
- b,
c, d) S
|
- for_each_gfn_sp(a, c, d, b) S
+ for_each_gfn_sp(a, c, d) S
|
- for_each_gfn_indirect_valid_sp(a, c, d, b) S
+ for_each_gfn_indirect_valid_sp(a, c, d) S
|
for_each_host(a,
- b,
c) S
|
for_each_host_safe(a,
- b,
c, d) S
|
for_each_mesh_entry(a,
- b,
c, d) S
)
...+>
[akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c]
[akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c]
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix warnings]
[akpm@linux-foudnation.org: redo intrusive kvm changes]
Tested-by: Peter Senna Tschudin <peter.senna@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-28 08:06:00 +07:00
|
|
|
hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
|
2006-08-05 13:20:06 +07:00
|
|
|
if (tb->tb6_id == id) {
|
|
|
|
rcu_read_unlock();
|
|
|
|
return tb;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
2015-10-13 01:47:08 +07:00
|
|
|
EXPORT_SYMBOL_GPL(fib6_get_table);
|
2006-08-05 13:20:06 +07:00
|
|
|
|
2010-01-17 10:35:32 +07:00
|
|
|
static void __net_init fib6_tables_init(struct net *net)
|
2006-08-05 13:20:06 +07:00
|
|
|
{
|
2008-03-04 14:25:27 +07:00
|
|
|
fib6_link_table(net, net->ipv6.fib6_main_tbl);
|
|
|
|
fib6_link_table(net, net->ipv6.fib6_local_tbl);
|
2006-08-05 13:20:06 +07:00
|
|
|
}
|
|
|
|
#else
|
|
|
|
|
2008-03-04 14:25:27 +07:00
|
|
|
struct fib6_table *fib6_new_table(struct net *net, u32 id)
|
2006-08-05 13:20:06 +07:00
|
|
|
{
|
2008-03-04 14:25:27 +07:00
|
|
|
return fib6_get_table(net, id);
|
2006-08-05 13:20:06 +07:00
|
|
|
}
|
|
|
|
|
2008-03-04 14:25:27 +07:00
|
|
|
struct fib6_table *fib6_get_table(struct net *net, u32 id)
|
2006-08-05 13:20:06 +07:00
|
|
|
{
|
2008-03-04 14:25:27 +07:00
|
|
|
return net->ipv6.fib6_main_tbl;
|
2006-08-05 13:20:06 +07:00
|
|
|
}
|
|
|
|
|
2011-03-13 04:22:43 +07:00
|
|
|
struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
|
2018-03-02 23:32:17 +07:00
|
|
|
const struct sk_buff *skb,
|
2008-03-04 14:25:27 +07:00
|
|
|
int flags, pol_lookup_t lookup)
|
2006-08-05 13:20:06 +07:00
|
|
|
{
|
2015-10-23 14:36:53 +07:00
|
|
|
struct rt6_info *rt;
|
|
|
|
|
2018-03-02 23:32:17 +07:00
|
|
|
rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
|
2017-06-20 17:29:25 +07:00
|
|
|
if (rt->dst.error == -EAGAIN) {
|
2019-06-21 07:36:39 +07:00
|
|
|
ip6_rt_put_flags(rt, flags);
|
2015-10-23 14:36:53 +07:00
|
|
|
rt = net->ipv6.ip6_null_entry;
|
2019-09-20 00:12:36 +07:00
|
|
|
if (!(flags & RT6_LOOKUP_F_DST_NOREF))
|
2019-06-21 07:36:39 +07:00
|
|
|
dst_hold(&rt->dst);
|
2015-10-23 14:36:53 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
return &rt->dst;
|
2006-08-05 13:20:06 +07:00
|
|
|
}
|
|
|
|
|
2018-05-10 10:34:23 +07:00
|
|
|
/* called with rcu lock held; no reference taken on fib6_info */
|
2019-04-17 04:36:10 +07:00
|
|
|
int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
|
|
|
|
struct fib6_result *res, int flags)
|
2018-05-10 10:34:23 +07:00
|
|
|
{
|
2019-04-17 04:36:10 +07:00
|
|
|
return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6,
|
|
|
|
res, flags);
|
2018-05-10 10:34:23 +07:00
|
|
|
}
|
|
|
|
|
2010-01-17 10:35:32 +07:00
|
|
|
static void __net_init fib6_tables_init(struct net *net)
|
2006-08-05 13:20:06 +07:00
|
|
|
{
|
2008-03-04 14:25:27 +07:00
|
|
|
fib6_link_table(net, net->ipv6.fib6_main_tbl);
|
2006-08-05 13:20:06 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
2017-08-03 18:28:19 +07:00
|
|
|
unsigned int fib6_tables_seq_read(struct net *net)
|
|
|
|
{
|
|
|
|
unsigned int h, fib_seq = 0;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
|
|
|
|
struct hlist_head *head = &net->ipv6.fib_table_hash[h];
|
|
|
|
struct fib6_table *tb;
|
|
|
|
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
hlist_for_each_entry_rcu(tb, head, tb6_hlist)
|
2017-08-03 18:28:19 +07:00
|
|
|
fib_seq += tb->fib_seq;
|
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
return fib_seq;
|
|
|
|
}
|
|
|
|
|
2019-10-03 16:49:27 +07:00
|
|
|
static int call_fib6_entry_notifier(struct notifier_block *nb,
|
2017-08-03 18:28:19 +07:00
|
|
|
enum fib_event_type event_type,
|
2018-04-18 07:33:26 +07:00
|
|
|
struct fib6_info *rt)
|
2017-08-03 18:28:19 +07:00
|
|
|
{
|
|
|
|
struct fib6_entry_notifier_info info = {
|
|
|
|
.rt = rt,
|
|
|
|
};
|
|
|
|
|
2019-10-03 16:49:27 +07:00
|
|
|
return call_fib6_notifier(nb, event_type, &info.info);
|
2017-08-03 18:28:19 +07:00
|
|
|
}
|
|
|
|
|
2019-05-23 02:04:41 +07:00
|
|
|
int call_fib6_entry_notifiers(struct net *net,
|
|
|
|
enum fib_event_type event_type,
|
|
|
|
struct fib6_info *rt,
|
|
|
|
struct netlink_ext_ack *extack)
|
2017-08-03 18:28:17 +07:00
|
|
|
{
|
|
|
|
struct fib6_entry_notifier_info info = {
|
2017-10-28 07:37:13 +07:00
|
|
|
.info.extack = extack,
|
2017-08-03 18:28:17 +07:00
|
|
|
.rt = rt,
|
|
|
|
};
|
|
|
|
|
2018-04-19 05:38:59 +07:00
|
|
|
rt->fib6_table->fib_seq++;
|
2017-08-03 18:28:17 +07:00
|
|
|
return call_fib6_notifiers(net, event_type, &info.info);
|
|
|
|
}
|
|
|
|
|
2019-06-18 22:12:45 +07:00
|
|
|
int call_fib6_multipath_entry_notifiers(struct net *net,
|
|
|
|
enum fib_event_type event_type,
|
|
|
|
struct fib6_info *rt,
|
|
|
|
unsigned int nsiblings,
|
|
|
|
struct netlink_ext_ack *extack)
|
|
|
|
{
|
|
|
|
struct fib6_entry_notifier_info info = {
|
|
|
|
.info.extack = extack,
|
|
|
|
.rt = rt,
|
|
|
|
.nsiblings = nsiblings,
|
|
|
|
};
|
|
|
|
|
|
|
|
rt->fib6_table->fib_seq++;
|
|
|
|
return call_fib6_notifiers(net, event_type, &info.info);
|
|
|
|
}
|
|
|
|
|
2017-08-03 18:28:19 +07:00
|
|
|
struct fib6_dump_arg {
|
|
|
|
struct net *net;
|
|
|
|
struct notifier_block *nb;
|
|
|
|
};
|
|
|
|
|
2019-10-03 16:49:28 +07:00
|
|
|
static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
|
2017-08-03 18:28:19 +07:00
|
|
|
{
|
2018-04-18 07:33:18 +07:00
|
|
|
if (rt == arg->net->ipv6.fib6_null_entry)
|
2019-10-03 16:49:28 +07:00
|
|
|
return 0;
|
|
|
|
return call_fib6_entry_notifier(arg->nb, FIB_EVENT_ENTRY_ADD, rt);
|
2017-08-03 18:28:19 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static int fib6_node_dump(struct fib6_walker *w)
|
|
|
|
{
|
2018-04-18 07:33:26 +07:00
|
|
|
struct fib6_info *rt;
|
2019-10-03 16:49:28 +07:00
|
|
|
int err = 0;
|
2017-08-03 18:28:19 +07:00
|
|
|
|
2019-10-03 16:49:28 +07:00
|
|
|
for_each_fib6_walker_rt(w) {
|
|
|
|
err = fib6_rt_dump(rt, w->args);
|
|
|
|
if (err)
|
|
|
|
break;
|
|
|
|
}
|
2017-08-03 18:28:19 +07:00
|
|
|
w->leaf = NULL;
|
2019-10-03 16:49:28 +07:00
|
|
|
return err;
|
2017-08-03 18:28:19 +07:00
|
|
|
}
|
|
|
|
|
2019-10-03 16:49:28 +07:00
|
|
|
static int fib6_table_dump(struct net *net, struct fib6_table *tb,
|
|
|
|
struct fib6_walker *w)
|
2017-08-03 18:28:19 +07:00
|
|
|
{
|
2019-10-03 16:49:28 +07:00
|
|
|
int err;
|
|
|
|
|
2017-08-03 18:28:19 +07:00
|
|
|
w->root = &tb->tb6_root;
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
spin_lock_bh(&tb->tb6_lock);
|
2019-10-03 16:49:28 +07:00
|
|
|
err = fib6_walk(net, w);
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
spin_unlock_bh(&tb->tb6_lock);
|
2019-10-03 16:49:28 +07:00
|
|
|
return err;
|
2017-08-03 18:28:19 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Called with rcu_read_lock() */
|
|
|
|
int fib6_tables_dump(struct net *net, struct notifier_block *nb)
|
|
|
|
{
|
|
|
|
struct fib6_dump_arg arg;
|
|
|
|
struct fib6_walker *w;
|
|
|
|
unsigned int h;
|
2019-10-03 16:49:28 +07:00
|
|
|
int err = 0;
|
2017-08-03 18:28:19 +07:00
|
|
|
|
|
|
|
w = kzalloc(sizeof(*w), GFP_ATOMIC);
|
|
|
|
if (!w)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
w->func = fib6_node_dump;
|
|
|
|
arg.net = net;
|
|
|
|
arg.nb = nb;
|
|
|
|
w->args = &arg;
|
|
|
|
|
|
|
|
for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
|
|
|
|
struct hlist_head *head = &net->ipv6.fib_table_hash[h];
|
|
|
|
struct fib6_table *tb;
|
|
|
|
|
2019-10-03 16:49:28 +07:00
|
|
|
hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
|
|
|
|
err = fib6_table_dump(net, tb, w);
|
|
|
|
if (err < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
2017-08-03 18:28:19 +07:00
|
|
|
}
|
|
|
|
|
2019-10-03 16:49:28 +07:00
|
|
|
out:
|
2017-08-03 18:28:19 +07:00
|
|
|
kfree(w);
|
|
|
|
|
2019-10-03 16:49:28 +07:00
|
|
|
return err;
|
2017-08-03 18:28:19 +07:00
|
|
|
}
|
|
|
|
|
2014-10-07 00:58:34 +07:00
|
|
|
static int fib6_dump_node(struct fib6_walker *w)
|
2006-08-11 13:11:17 +07:00
|
|
|
{
|
|
|
|
int res;
|
2018-04-18 07:33:26 +07:00
|
|
|
struct fib6_info *rt;
|
2006-08-11 13:11:17 +07:00
|
|
|
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
for_each_fib6_walker_rt(w) {
|
ipv6: Dump route exceptions if requested
Since commit 2b760fcf5cfb ("ipv6: hook up exception table to store dst
cache"), route exceptions reside in a separate hash table, and won't be
found by walking the FIB, so they won't be dumped to userspace on a
RTM_GETROUTE message.
This causes 'ip -6 route list cache' and 'ip -6 route flush cache' to
have no function anymore:
# ip -6 route get fc00:3::1
fc00:3::1 via fc00:1::2 dev veth_A-R1 src fc00:1::1 metric 1024 expires 539sec mtu 1400 pref medium
# ip -6 route get fc00:4::1
fc00:4::1 via fc00:2::2 dev veth_A-R2 src fc00:2::1 metric 1024 expires 536sec mtu 1500 pref medium
# ip -6 route list cache
# ip -6 route flush cache
# ip -6 route get fc00:3::1
fc00:3::1 via fc00:1::2 dev veth_A-R1 src fc00:1::1 metric 1024 expires 520sec mtu 1400 pref medium
# ip -6 route get fc00:4::1
fc00:4::1 via fc00:2::2 dev veth_A-R2 src fc00:2::1 metric 1024 expires 519sec mtu 1500 pref medium
because iproute2 lists cached routes using RTM_GETROUTE, and flushes them
by listing all the routes, and deleting them with RTM_DELROUTE one by one.
If cached routes are requested using the RTM_F_CLONED flag together with
strict checking, or if no strict checking is requested (and hence we can't
consistently apply filters), look up exceptions in the hash table
associated with the current fib6_info in rt6_dump_route(), and, if present
and not expired, add them to the dump.
We might be unable to dump all the entries for a given node in a single
message, so keep track of how many entries were handled for the current
node in fib6_walker, and skip that amount in case we start from the same
partially dumped node.
When a partial dump restarts, as the starting node might change when
'sernum' changes, we have no guarantee that we need to skip the same
amount of in-node entries. Therefore, we need two counters, and we need to
zero the in-node counter if the node from which the dump is resumed
differs.
Note that, with the current version of iproute2, this only fixes the
'ip -6 route list cache': on a flush command, iproute2 doesn't pass
RTM_F_CLONED and, due to this inconsistency, 'ip -6 route flush cache' is
still unable to fetch the routes to be flushed. This will be addressed in
a patch for iproute2.
To flush cached routes, a procfs entry could be introduced instead: that's
how it works for IPv4. We already have a rt6_flush_exception() function
ready to be wired to it. However, this would not solve the issue for
listing.
Versions of iproute2 and kernel tested:
iproute2
kernel 4.14.0 4.15.0 4.19.0 5.0.0 5.1.0 5.1.0, patched
3.18 list + + + + + +
flush + + + + + +
4.4 list + + + + + +
flush + + + + + +
4.9 list + + + + + +
flush + + + + + +
4.14 list + + + + + +
flush + + + + + +
4.15 list
flush
4.19 list
flush
5.0 list
flush
5.1 list
flush
with list + + + + + +
fix flush + + + +
v7:
- Explain usage of "skip" counters in commit message (suggested by
David Ahern)
v6:
- Rebase onto net-next, use recently introduced nexthop walker
- Make rt6_nh_dump_exceptions() a separate function (suggested by David
Ahern)
v5:
- Use dump_routes and dump_exceptions from filter, ignore NLM_F_MATCH,
update test results (flushing works with iproute2 < 5.0.0 now)
v4:
- Split NLM_F_MATCH and strict check handling in separate patches
- Filter routes using RTM_F_CLONED: if it's not set, only return
non-cached routes, and if it's set, only return cached routes:
change requested by David Ahern and Martin Lau. This implies that
iproute2 needs a separate patch to be able to flush IPv6 cached
routes. This is not ideal because we can't fix the breakage caused
by 2b760fcf5cfb entirely in kernel. However, two years have passed
since then, and this makes it more tolerable
v3:
- More descriptive comment about expired exceptions in rt6_dump_route()
- Swap return values of rt6_dump_route() (suggested by Martin Lau)
- Don't zero skip_in_node in case we don't dump anything in a given pass
(also suggested by Martin Lau)
- Remove check on RTM_F_CLONED altogether: in the current UAPI semantic,
it's just a flag to indicate the route was cloned, not to filter on
routes
v2: Add tracking of number of entries to be skipped in current node after
a partial dump. As we restart from the same node, if not all the
exceptions for a given node fit in a single message, the dump will
not terminate, as suggested by Martin Lau. This is a concrete
possibility, setting up a big number of exceptions for the same route
actually causes the issue, suggested by David Ahern.
Reported-by: Jianlin Shi <jishi@redhat.com>
Fixes: 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-21 22:45:27 +07:00
|
|
|
res = rt6_dump_route(rt, w->args, w->skip_in_node);
|
2019-06-21 22:45:26 +07:00
|
|
|
if (res >= 0) {
|
2006-08-11 13:11:17 +07:00
|
|
|
/* Frame is full, suspend walking */
|
|
|
|
w->leaf = rt;
|
ipv6: Dump route exceptions if requested
Since commit 2b760fcf5cfb ("ipv6: hook up exception table to store dst
cache"), route exceptions reside in a separate hash table, and won't be
found by walking the FIB, so they won't be dumped to userspace on a
RTM_GETROUTE message.
This causes 'ip -6 route list cache' and 'ip -6 route flush cache' to
have no function anymore:
# ip -6 route get fc00:3::1
fc00:3::1 via fc00:1::2 dev veth_A-R1 src fc00:1::1 metric 1024 expires 539sec mtu 1400 pref medium
# ip -6 route get fc00:4::1
fc00:4::1 via fc00:2::2 dev veth_A-R2 src fc00:2::1 metric 1024 expires 536sec mtu 1500 pref medium
# ip -6 route list cache
# ip -6 route flush cache
# ip -6 route get fc00:3::1
fc00:3::1 via fc00:1::2 dev veth_A-R1 src fc00:1::1 metric 1024 expires 520sec mtu 1400 pref medium
# ip -6 route get fc00:4::1
fc00:4::1 via fc00:2::2 dev veth_A-R2 src fc00:2::1 metric 1024 expires 519sec mtu 1500 pref medium
because iproute2 lists cached routes using RTM_GETROUTE, and flushes them
by listing all the routes, and deleting them with RTM_DELROUTE one by one.
If cached routes are requested using the RTM_F_CLONED flag together with
strict checking, or if no strict checking is requested (and hence we can't
consistently apply filters), look up exceptions in the hash table
associated with the current fib6_info in rt6_dump_route(), and, if present
and not expired, add them to the dump.
We might be unable to dump all the entries for a given node in a single
message, so keep track of how many entries were handled for the current
node in fib6_walker, and skip that amount in case we start from the same
partially dumped node.
When a partial dump restarts, as the starting node might change when
'sernum' changes, we have no guarantee that we need to skip the same
amount of in-node entries. Therefore, we need two counters, and we need to
zero the in-node counter if the node from which the dump is resumed
differs.
Note that, with the current version of iproute2, this only fixes the
'ip -6 route list cache': on a flush command, iproute2 doesn't pass
RTM_F_CLONED and, due to this inconsistency, 'ip -6 route flush cache' is
still unable to fetch the routes to be flushed. This will be addressed in
a patch for iproute2.
To flush cached routes, a procfs entry could be introduced instead: that's
how it works for IPv4. We already have a rt6_flush_exception() function
ready to be wired to it. However, this would not solve the issue for
listing.
Versions of iproute2 and kernel tested:
iproute2
kernel 4.14.0 4.15.0 4.19.0 5.0.0 5.1.0 5.1.0, patched
3.18 list + + + + + +
flush + + + + + +
4.4 list + + + + + +
flush + + + + + +
4.9 list + + + + + +
flush + + + + + +
4.14 list + + + + + +
flush + + + + + +
4.15 list
flush
4.19 list
flush
5.0 list
flush
5.1 list
flush
with list + + + + + +
fix flush + + + +
v7:
- Explain usage of "skip" counters in commit message (suggested by
David Ahern)
v6:
- Rebase onto net-next, use recently introduced nexthop walker
- Make rt6_nh_dump_exceptions() a separate function (suggested by David
Ahern)
v5:
- Use dump_routes and dump_exceptions from filter, ignore NLM_F_MATCH,
update test results (flushing works with iproute2 < 5.0.0 now)
v4:
- Split NLM_F_MATCH and strict check handling in separate patches
- Filter routes using RTM_F_CLONED: if it's not set, only return
non-cached routes, and if it's set, only return cached routes:
change requested by David Ahern and Martin Lau. This implies that
iproute2 needs a separate patch to be able to flush IPv6 cached
routes. This is not ideal because we can't fix the breakage caused
by 2b760fcf5cfb entirely in kernel. However, two years have passed
since then, and this makes it more tolerable
v3:
- More descriptive comment about expired exceptions in rt6_dump_route()
- Swap return values of rt6_dump_route() (suggested by Martin Lau)
- Don't zero skip_in_node in case we don't dump anything in a given pass
(also suggested by Martin Lau)
- Remove check on RTM_F_CLONED altogether: in the current UAPI semantic,
it's just a flag to indicate the route was cloned, not to filter on
routes
v2: Add tracking of number of entries to be skipped in current node after
a partial dump. As we restart from the same node, if not all the
exceptions for a given node fit in a single message, the dump will
not terminate, as suggested by Martin Lau. This is a concrete
possibility, setting up a big number of exceptions for the same route
actually causes the issue, suggested by David Ahern.
Reported-by: Jianlin Shi <jishi@redhat.com>
Fixes: 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-21 22:45:27 +07:00
|
|
|
|
|
|
|
/* We'll restart from this node, so if some routes were
|
|
|
|
* already dumped, skip them next time.
|
|
|
|
*/
|
|
|
|
w->skip_in_node += res;
|
|
|
|
|
2006-08-11 13:11:17 +07:00
|
|
|
return 1;
|
|
|
|
}
|
ipv6: Dump route exceptions if requested
Since commit 2b760fcf5cfb ("ipv6: hook up exception table to store dst
cache"), route exceptions reside in a separate hash table, and won't be
found by walking the FIB, so they won't be dumped to userspace on a
RTM_GETROUTE message.
This causes 'ip -6 route list cache' and 'ip -6 route flush cache' to
have no function anymore:
# ip -6 route get fc00:3::1
fc00:3::1 via fc00:1::2 dev veth_A-R1 src fc00:1::1 metric 1024 expires 539sec mtu 1400 pref medium
# ip -6 route get fc00:4::1
fc00:4::1 via fc00:2::2 dev veth_A-R2 src fc00:2::1 metric 1024 expires 536sec mtu 1500 pref medium
# ip -6 route list cache
# ip -6 route flush cache
# ip -6 route get fc00:3::1
fc00:3::1 via fc00:1::2 dev veth_A-R1 src fc00:1::1 metric 1024 expires 520sec mtu 1400 pref medium
# ip -6 route get fc00:4::1
fc00:4::1 via fc00:2::2 dev veth_A-R2 src fc00:2::1 metric 1024 expires 519sec mtu 1500 pref medium
because iproute2 lists cached routes using RTM_GETROUTE, and flushes them
by listing all the routes, and deleting them with RTM_DELROUTE one by one.
If cached routes are requested using the RTM_F_CLONED flag together with
strict checking, or if no strict checking is requested (and hence we can't
consistently apply filters), look up exceptions in the hash table
associated with the current fib6_info in rt6_dump_route(), and, if present
and not expired, add them to the dump.
We might be unable to dump all the entries for a given node in a single
message, so keep track of how many entries were handled for the current
node in fib6_walker, and skip that amount in case we start from the same
partially dumped node.
When a partial dump restarts, as the starting node might change when
'sernum' changes, we have no guarantee that we need to skip the same
amount of in-node entries. Therefore, we need two counters, and we need to
zero the in-node counter if the node from which the dump is resumed
differs.
Note that, with the current version of iproute2, this only fixes the
'ip -6 route list cache': on a flush command, iproute2 doesn't pass
RTM_F_CLONED and, due to this inconsistency, 'ip -6 route flush cache' is
still unable to fetch the routes to be flushed. This will be addressed in
a patch for iproute2.
To flush cached routes, a procfs entry could be introduced instead: that's
how it works for IPv4. We already have a rt6_flush_exception() function
ready to be wired to it. However, this would not solve the issue for
listing.
Versions of iproute2 and kernel tested:
iproute2
kernel 4.14.0 4.15.0 4.19.0 5.0.0 5.1.0 5.1.0, patched
3.18 list + + + + + +
flush + + + + + +
4.4 list + + + + + +
flush + + + + + +
4.9 list + + + + + +
flush + + + + + +
4.14 list + + + + + +
flush + + + + + +
4.15 list
flush
4.19 list
flush
5.0 list
flush
5.1 list
flush
with list + + + + + +
fix flush + + + +
v7:
- Explain usage of "skip" counters in commit message (suggested by
David Ahern)
v6:
- Rebase onto net-next, use recently introduced nexthop walker
- Make rt6_nh_dump_exceptions() a separate function (suggested by David
Ahern)
v5:
- Use dump_routes and dump_exceptions from filter, ignore NLM_F_MATCH,
update test results (flushing works with iproute2 < 5.0.0 now)
v4:
- Split NLM_F_MATCH and strict check handling in separate patches
- Filter routes using RTM_F_CLONED: if it's not set, only return
non-cached routes, and if it's set, only return cached routes:
change requested by David Ahern and Martin Lau. This implies that
iproute2 needs a separate patch to be able to flush IPv6 cached
routes. This is not ideal because we can't fix the breakage caused
by 2b760fcf5cfb entirely in kernel. However, two years have passed
since then, and this makes it more tolerable
v3:
- More descriptive comment about expired exceptions in rt6_dump_route()
- Swap return values of rt6_dump_route() (suggested by Martin Lau)
- Don't zero skip_in_node in case we don't dump anything in a given pass
(also suggested by Martin Lau)
- Remove check on RTM_F_CLONED altogether: in the current UAPI semantic,
it's just a flag to indicate the route was cloned, not to filter on
routes
v2: Add tracking of number of entries to be skipped in current node after
a partial dump. As we restart from the same node, if not all the
exceptions for a given node fit in a single message, the dump will
not terminate, as suggested by Martin Lau. This is a concrete
possibility, setting up a big number of exceptions for the same route
actually causes the issue, suggested by David Ahern.
Reported-by: Jianlin Shi <jishi@redhat.com>
Fixes: 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-21 22:45:27 +07:00
|
|
|
w->skip_in_node = 0;
|
2017-02-03 03:37:09 +07:00
|
|
|
|
|
|
|
/* Multipath routes are dumped in one route with the
|
|
|
|
* RTA_MULTIPATH attribute. Jump 'rt' to point to the
|
|
|
|
* last sibling of this route (no need to dump the
|
|
|
|
* sibling routes again)
|
|
|
|
*/
|
2018-04-19 05:38:59 +07:00
|
|
|
if (rt->fib6_nsiblings)
|
|
|
|
rt = list_last_entry(&rt->fib6_siblings,
|
2018-04-18 07:33:26 +07:00
|
|
|
struct fib6_info,
|
2018-04-19 05:38:59 +07:00
|
|
|
fib6_siblings);
|
2006-08-11 13:11:17 +07:00
|
|
|
}
|
|
|
|
w->leaf = NULL;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void fib6_dump_end(struct netlink_callback *cb)
|
|
|
|
{
|
2016-03-08 20:44:35 +07:00
|
|
|
struct net *net = sock_net(cb->skb->sk);
|
2014-10-07 00:58:34 +07:00
|
|
|
struct fib6_walker *w = (void *)cb->args[2];
|
2006-08-11 13:11:17 +07:00
|
|
|
|
|
|
|
if (w) {
|
2009-01-14 13:17:51 +07:00
|
|
|
if (cb->args[4]) {
|
|
|
|
cb->args[4] = 0;
|
2016-03-08 20:44:35 +07:00
|
|
|
fib6_walker_unlink(net, w);
|
2009-01-14 13:17:51 +07:00
|
|
|
}
|
2006-08-11 13:11:17 +07:00
|
|
|
cb->args[2] = 0;
|
|
|
|
kfree(w);
|
|
|
|
}
|
2014-03-28 11:07:04 +07:00
|
|
|
cb->done = (void *)cb->args[3];
|
2006-08-11 13:11:17 +07:00
|
|
|
cb->args[1] = 3;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int fib6_dump_done(struct netlink_callback *cb)
|
|
|
|
{
|
|
|
|
fib6_dump_end(cb);
|
|
|
|
return cb->done ? cb->done(cb) : 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
|
|
|
|
struct netlink_callback *cb)
|
|
|
|
{
|
2016-03-08 20:44:35 +07:00
|
|
|
struct net *net = sock_net(skb->sk);
|
2014-10-07 00:58:34 +07:00
|
|
|
struct fib6_walker *w;
|
2006-08-11 13:11:17 +07:00
|
|
|
int res;
|
|
|
|
|
|
|
|
w = (void *)cb->args[2];
|
|
|
|
w->root = &table->tb6_root;
|
|
|
|
|
|
|
|
if (cb->args[4] == 0) {
|
2010-02-08 12:19:03 +07:00
|
|
|
w->count = 0;
|
|
|
|
w->skip = 0;
|
ipv6: Dump route exceptions if requested
Since commit 2b760fcf5cfb ("ipv6: hook up exception table to store dst
cache"), route exceptions reside in a separate hash table, and won't be
found by walking the FIB, so they won't be dumped to userspace on a
RTM_GETROUTE message.
This causes 'ip -6 route list cache' and 'ip -6 route flush cache' to
have no function anymore:
# ip -6 route get fc00:3::1
fc00:3::1 via fc00:1::2 dev veth_A-R1 src fc00:1::1 metric 1024 expires 539sec mtu 1400 pref medium
# ip -6 route get fc00:4::1
fc00:4::1 via fc00:2::2 dev veth_A-R2 src fc00:2::1 metric 1024 expires 536sec mtu 1500 pref medium
# ip -6 route list cache
# ip -6 route flush cache
# ip -6 route get fc00:3::1
fc00:3::1 via fc00:1::2 dev veth_A-R1 src fc00:1::1 metric 1024 expires 520sec mtu 1400 pref medium
# ip -6 route get fc00:4::1
fc00:4::1 via fc00:2::2 dev veth_A-R2 src fc00:2::1 metric 1024 expires 519sec mtu 1500 pref medium
because iproute2 lists cached routes using RTM_GETROUTE, and flushes them
by listing all the routes, and deleting them with RTM_DELROUTE one by one.
If cached routes are requested using the RTM_F_CLONED flag together with
strict checking, or if no strict checking is requested (and hence we can't
consistently apply filters), look up exceptions in the hash table
associated with the current fib6_info in rt6_dump_route(), and, if present
and not expired, add them to the dump.
We might be unable to dump all the entries for a given node in a single
message, so keep track of how many entries were handled for the current
node in fib6_walker, and skip that amount in case we start from the same
partially dumped node.
When a partial dump restarts, as the starting node might change when
'sernum' changes, we have no guarantee that we need to skip the same
amount of in-node entries. Therefore, we need two counters, and we need to
zero the in-node counter if the node from which the dump is resumed
differs.
Note that, with the current version of iproute2, this only fixes the
'ip -6 route list cache': on a flush command, iproute2 doesn't pass
RTM_F_CLONED and, due to this inconsistency, 'ip -6 route flush cache' is
still unable to fetch the routes to be flushed. This will be addressed in
a patch for iproute2.
To flush cached routes, a procfs entry could be introduced instead: that's
how it works for IPv4. We already have a rt6_flush_exception() function
ready to be wired to it. However, this would not solve the issue for
listing.
Versions of iproute2 and kernel tested:
iproute2
kernel 4.14.0 4.15.0 4.19.0 5.0.0 5.1.0 5.1.0, patched
3.18 list + + + + + +
flush + + + + + +
4.4 list + + + + + +
flush + + + + + +
4.9 list + + + + + +
flush + + + + + +
4.14 list + + + + + +
flush + + + + + +
4.15 list
flush
4.19 list
flush
5.0 list
flush
5.1 list
flush
with list + + + + + +
fix flush + + + +
v7:
- Explain usage of "skip" counters in commit message (suggested by
David Ahern)
v6:
- Rebase onto net-next, use recently introduced nexthop walker
- Make rt6_nh_dump_exceptions() a separate function (suggested by David
Ahern)
v5:
- Use dump_routes and dump_exceptions from filter, ignore NLM_F_MATCH,
update test results (flushing works with iproute2 < 5.0.0 now)
v4:
- Split NLM_F_MATCH and strict check handling in separate patches
- Filter routes using RTM_F_CLONED: if it's not set, only return
non-cached routes, and if it's set, only return cached routes:
change requested by David Ahern and Martin Lau. This implies that
iproute2 needs a separate patch to be able to flush IPv6 cached
routes. This is not ideal because we can't fix the breakage caused
by 2b760fcf5cfb entirely in kernel. However, two years have passed
since then, and this makes it more tolerable
v3:
- More descriptive comment about expired exceptions in rt6_dump_route()
- Swap return values of rt6_dump_route() (suggested by Martin Lau)
- Don't zero skip_in_node in case we don't dump anything in a given pass
(also suggested by Martin Lau)
- Remove check on RTM_F_CLONED altogether: in the current UAPI semantic,
it's just a flag to indicate the route was cloned, not to filter on
routes
v2: Add tracking of number of entries to be skipped in current node after
a partial dump. As we restart from the same node, if not all the
exceptions for a given node fit in a single message, the dump will
not terminate, as suggested by Martin Lau. This is a concrete
possibility, setting up a big number of exceptions for the same route
actually causes the issue, suggested by David Ahern.
Reported-by: Jianlin Shi <jishi@redhat.com>
Fixes: 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-21 22:45:27 +07:00
|
|
|
w->skip_in_node = 0;
|
2010-02-08 12:19:03 +07:00
|
|
|
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
spin_lock_bh(&table->tb6_lock);
|
2016-03-08 20:44:35 +07:00
|
|
|
res = fib6_walk(net, w);
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
spin_unlock_bh(&table->tb6_lock);
|
2010-02-08 12:19:03 +07:00
|
|
|
if (res > 0) {
|
2006-08-11 13:11:17 +07:00
|
|
|
cb->args[4] = 1;
|
2010-02-08 12:19:03 +07:00
|
|
|
cb->args[5] = w->root->fn_sernum;
|
|
|
|
}
|
2006-08-11 13:11:17 +07:00
|
|
|
} else {
|
2010-02-08 12:19:03 +07:00
|
|
|
if (cb->args[5] != w->root->fn_sernum) {
|
|
|
|
/* Begin at the root if the tree changed */
|
|
|
|
cb->args[5] = w->root->fn_sernum;
|
|
|
|
w->state = FWS_INIT;
|
|
|
|
w->node = w->root;
|
|
|
|
w->skip = w->count;
|
ipv6: Dump route exceptions if requested
Since commit 2b760fcf5cfb ("ipv6: hook up exception table to store dst
cache"), route exceptions reside in a separate hash table, and won't be
found by walking the FIB, so they won't be dumped to userspace on a
RTM_GETROUTE message.
This causes 'ip -6 route list cache' and 'ip -6 route flush cache' to
have no function anymore:
# ip -6 route get fc00:3::1
fc00:3::1 via fc00:1::2 dev veth_A-R1 src fc00:1::1 metric 1024 expires 539sec mtu 1400 pref medium
# ip -6 route get fc00:4::1
fc00:4::1 via fc00:2::2 dev veth_A-R2 src fc00:2::1 metric 1024 expires 536sec mtu 1500 pref medium
# ip -6 route list cache
# ip -6 route flush cache
# ip -6 route get fc00:3::1
fc00:3::1 via fc00:1::2 dev veth_A-R1 src fc00:1::1 metric 1024 expires 520sec mtu 1400 pref medium
# ip -6 route get fc00:4::1
fc00:4::1 via fc00:2::2 dev veth_A-R2 src fc00:2::1 metric 1024 expires 519sec mtu 1500 pref medium
because iproute2 lists cached routes using RTM_GETROUTE, and flushes them
by listing all the routes, and deleting them with RTM_DELROUTE one by one.
If cached routes are requested using the RTM_F_CLONED flag together with
strict checking, or if no strict checking is requested (and hence we can't
consistently apply filters), look up exceptions in the hash table
associated with the current fib6_info in rt6_dump_route(), and, if present
and not expired, add them to the dump.
We might be unable to dump all the entries for a given node in a single
message, so keep track of how many entries were handled for the current
node in fib6_walker, and skip that amount in case we start from the same
partially dumped node.
When a partial dump restarts, as the starting node might change when
'sernum' changes, we have no guarantee that we need to skip the same
amount of in-node entries. Therefore, we need two counters, and we need to
zero the in-node counter if the node from which the dump is resumed
differs.
Note that, with the current version of iproute2, this only fixes the
'ip -6 route list cache': on a flush command, iproute2 doesn't pass
RTM_F_CLONED and, due to this inconsistency, 'ip -6 route flush cache' is
still unable to fetch the routes to be flushed. This will be addressed in
a patch for iproute2.
To flush cached routes, a procfs entry could be introduced instead: that's
how it works for IPv4. We already have a rt6_flush_exception() function
ready to be wired to it. However, this would not solve the issue for
listing.
Versions of iproute2 and kernel tested:
iproute2
kernel 4.14.0 4.15.0 4.19.0 5.0.0 5.1.0 5.1.0, patched
3.18 list + + + + + +
flush + + + + + +
4.4 list + + + + + +
flush + + + + + +
4.9 list + + + + + +
flush + + + + + +
4.14 list + + + + + +
flush + + + + + +
4.15 list
flush
4.19 list
flush
5.0 list
flush
5.1 list
flush
with list + + + + + +
fix flush + + + +
v7:
- Explain usage of "skip" counters in commit message (suggested by
David Ahern)
v6:
- Rebase onto net-next, use recently introduced nexthop walker
- Make rt6_nh_dump_exceptions() a separate function (suggested by David
Ahern)
v5:
- Use dump_routes and dump_exceptions from filter, ignore NLM_F_MATCH,
update test results (flushing works with iproute2 < 5.0.0 now)
v4:
- Split NLM_F_MATCH and strict check handling in separate patches
- Filter routes using RTM_F_CLONED: if it's not set, only return
non-cached routes, and if it's set, only return cached routes:
change requested by David Ahern and Martin Lau. This implies that
iproute2 needs a separate patch to be able to flush IPv6 cached
routes. This is not ideal because we can't fix the breakage caused
by 2b760fcf5cfb entirely in kernel. However, two years have passed
since then, and this makes it more tolerable
v3:
- More descriptive comment about expired exceptions in rt6_dump_route()
- Swap return values of rt6_dump_route() (suggested by Martin Lau)
- Don't zero skip_in_node in case we don't dump anything in a given pass
(also suggested by Martin Lau)
- Remove check on RTM_F_CLONED altogether: in the current UAPI semantic,
it's just a flag to indicate the route was cloned, not to filter on
routes
v2: Add tracking of number of entries to be skipped in current node after
a partial dump. As we restart from the same node, if not all the
exceptions for a given node fit in a single message, the dump will
not terminate, as suggested by Martin Lau. This is a concrete
possibility, setting up a big number of exceptions for the same route
actually causes the issue, suggested by David Ahern.
Reported-by: Jianlin Shi <jishi@redhat.com>
Fixes: 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-21 22:45:27 +07:00
|
|
|
w->skip_in_node = 0;
|
2010-02-08 12:19:03 +07:00
|
|
|
} else
|
|
|
|
w->skip = 0;
|
|
|
|
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
spin_lock_bh(&table->tb6_lock);
|
2006-08-11 13:11:17 +07:00
|
|
|
res = fib6_walk_continue(w);
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
spin_unlock_bh(&table->tb6_lock);
|
2009-01-14 13:17:51 +07:00
|
|
|
if (res <= 0) {
|
2016-03-08 20:44:35 +07:00
|
|
|
fib6_walker_unlink(net, w);
|
2009-01-14 13:17:51 +07:00
|
|
|
cb->args[4] = 0;
|
2006-08-11 13:11:17 +07:00
|
|
|
}
|
|
|
|
}
|
2009-01-14 13:17:51 +07:00
|
|
|
|
2006-08-11 13:11:17 +07:00
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
2007-03-23 01:58:32 +07:00
|
|
|
static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
|
2006-08-11 13:11:17 +07:00
|
|
|
{
|
fib_frontend, ip6_fib: Select routes or exceptions dump from RTM_F_CLONED
The following patches add back the ability to dump IPv4 and IPv6 exception
routes, and we need to allow selection of regular routes or exceptions.
Use RTM_F_CLONED as filter to decide whether to dump routes or exceptions:
iproute2 passes it in dump requests (except for IPv6 cache flush requests,
this will be fixed in iproute2) and this used to work as long as
exceptions were stored directly in the FIB, for both IPv4 and IPv6.
Caveat: if strict checking is not requested (that is, if the dump request
doesn't go through ip_valid_fib_dump_req()), we can't filter on protocol,
tables or route types.
In this case, filtering on RTM_F_CLONED would be inconsistent: we would
fix 'ip route list cache' by returning exception routes and at the same
time introduce another bug in case another selector is present, e.g. on
'ip route list cache table main' we would return all exception routes,
without filtering on tables.
Keep this consistent by applying no filters at all, and dumping both
routes and exceptions, if strict checking is not requested. iproute2
currently filters results anyway, and no unwanted results will be
presented to the user. The kernel will just dump more data than needed.
v7: No changes
v6: Rebase onto net-next, no changes
v5: New patch: add dump_routes and dump_exceptions flags in filter and
simply clear the unwanted one if strict checking is enabled, don't
ignore NLM_F_MATCH and don't set filter_set if NLM_F_MATCH is set.
Skip filtering altogether if no strict checking is requested:
selecting routes or exceptions only would be inconsistent with the
fact we can't filter on tables.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-21 22:45:20 +07:00
|
|
|
struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true,
|
|
|
|
.filter.dump_routes = true };
|
2018-10-08 10:16:35 +07:00
|
|
|
const struct nlmsghdr *nlh = cb->nlh;
|
2008-03-26 00:26:21 +07:00
|
|
|
struct net *net = sock_net(skb->sk);
|
2006-08-11 13:11:17 +07:00
|
|
|
unsigned int h, s_h;
|
|
|
|
unsigned int e = 0, s_e;
|
2014-10-07 00:58:34 +07:00
|
|
|
struct fib6_walker *w;
|
2006-08-11 13:11:17 +07:00
|
|
|
struct fib6_table *tb;
|
2008-03-04 14:25:27 +07:00
|
|
|
struct hlist_head *head;
|
2006-08-11 13:11:17 +07:00
|
|
|
int res = 0;
|
|
|
|
|
2018-10-08 10:16:35 +07:00
|
|
|
if (cb->strict_check) {
|
2018-10-16 08:56:42 +07:00
|
|
|
int err;
|
2018-10-08 10:16:35 +07:00
|
|
|
|
2018-10-16 08:56:48 +07:00
|
|
|
err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb);
|
2018-10-08 10:16:35 +07:00
|
|
|
if (err < 0)
|
|
|
|
return err;
|
2018-10-16 08:56:44 +07:00
|
|
|
} else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
|
|
|
|
struct rtmsg *rtm = nlmsg_data(nlh);
|
2018-10-08 10:16:35 +07:00
|
|
|
|
2019-06-21 22:45:24 +07:00
|
|
|
if (rtm->rtm_flags & RTM_F_PREFIX)
|
|
|
|
arg.filter.flags = RTM_F_PREFIX;
|
2018-10-16 08:56:44 +07:00
|
|
|
}
|
2006-08-11 13:11:17 +07:00
|
|
|
|
|
|
|
w = (void *)cb->args[2];
|
2011-12-04 05:50:45 +07:00
|
|
|
if (!w) {
|
2006-08-11 13:11:17 +07:00
|
|
|
/* New dump:
|
|
|
|
*
|
|
|
|
* 1. hook callback destructor.
|
|
|
|
*/
|
|
|
|
cb->args[3] = (long)cb->done;
|
|
|
|
cb->done = fib6_dump_done;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* 2. allocate and initialize walker.
|
|
|
|
*/
|
|
|
|
w = kzalloc(sizeof(*w), GFP_ATOMIC);
|
2011-12-04 05:50:45 +07:00
|
|
|
if (!w)
|
2006-08-11 13:11:17 +07:00
|
|
|
return -ENOMEM;
|
|
|
|
w->func = fib6_dump_node;
|
|
|
|
cb->args[2] = (long)w;
|
|
|
|
}
|
|
|
|
|
|
|
|
arg.skb = skb;
|
|
|
|
arg.cb = cb;
|
2008-08-15 05:33:21 +07:00
|
|
|
arg.net = net;
|
2006-08-11 13:11:17 +07:00
|
|
|
w->args = &arg;
|
|
|
|
|
2018-10-16 08:56:44 +07:00
|
|
|
if (arg.filter.table_id) {
|
|
|
|
tb = fib6_get_table(net, arg.filter.table_id);
|
|
|
|
if (!tb) {
|
2018-10-25 02:59:01 +07:00
|
|
|
if (arg.filter.dump_all_families)
|
2018-11-02 23:11:05 +07:00
|
|
|
goto out;
|
2018-10-25 02:59:01 +07:00
|
|
|
|
2018-10-16 08:56:44 +07:00
|
|
|
NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist");
|
|
|
|
return -ENOENT;
|
|
|
|
}
|
|
|
|
|
2019-01-03 09:26:13 +07:00
|
|
|
if (!cb->args[0]) {
|
|
|
|
res = fib6_dump_table(tb, skb, cb);
|
|
|
|
if (!res)
|
|
|
|
cb->args[0] = 1;
|
|
|
|
}
|
2018-10-16 08:56:44 +07:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
s_h = cb->args[0];
|
|
|
|
s_e = cb->args[1];
|
|
|
|
|
2011-04-28 05:56:07 +07:00
|
|
|
rcu_read_lock();
|
2009-07-31 08:52:15 +07:00
|
|
|
for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) {
|
2006-08-11 13:11:17 +07:00
|
|
|
e = 0;
|
2008-03-04 14:25:27 +07:00
|
|
|
head = &net->ipv6.fib_table_hash[h];
|
hlist: drop the node parameter from iterators
I'm not sure why, but the hlist for each entry iterators were conceived
list_for_each_entry(pos, head, member)
The hlist ones were greedy and wanted an extra parameter:
hlist_for_each_entry(tpos, pos, head, member)
Why did they need an extra pos parameter? I'm not quite sure. Not only
they don't really need it, it also prevents the iterator from looking
exactly like the list iterator, which is unfortunate.
Besides the semantic patch, there was some manual work required:
- Fix up the actual hlist iterators in linux/list.h
- Fix up the declaration of other iterators based on the hlist ones.
- A very small amount of places were using the 'node' parameter, this
was modified to use 'obj->member' instead.
- Coccinelle didn't handle the hlist_for_each_entry_safe iterator
properly, so those had to be fixed up manually.
The semantic patch which is mostly the work of Peter Senna Tschudin is here:
@@
iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host;
type T;
expression a,c,d,e;
identifier b;
statement S;
@@
-T b;
<+... when != b
(
hlist_for_each_entry(a,
- b,
c, d) S
|
hlist_for_each_entry_continue(a,
- b,
c) S
|
hlist_for_each_entry_from(a,
- b,
c) S
|
hlist_for_each_entry_rcu(a,
- b,
c, d) S
|
hlist_for_each_entry_rcu_bh(a,
- b,
c, d) S
|
hlist_for_each_entry_continue_rcu_bh(a,
- b,
c) S
|
for_each_busy_worker(a, c,
- b,
d) S
|
ax25_uid_for_each(a,
- b,
c) S
|
ax25_for_each(a,
- b,
c) S
|
inet_bind_bucket_for_each(a,
- b,
c) S
|
sctp_for_each_hentry(a,
- b,
c) S
|
sk_for_each(a,
- b,
c) S
|
sk_for_each_rcu(a,
- b,
c) S
|
sk_for_each_from
-(a, b)
+(a)
S
+ sk_for_each_from(a) S
|
sk_for_each_safe(a,
- b,
c, d) S
|
sk_for_each_bound(a,
- b,
c) S
|
hlist_for_each_entry_safe(a,
- b,
c, d, e) S
|
hlist_for_each_entry_continue_rcu(a,
- b,
c) S
|
nr_neigh_for_each(a,
- b,
c) S
|
nr_neigh_for_each_safe(a,
- b,
c, d) S
|
nr_node_for_each(a,
- b,
c) S
|
nr_node_for_each_safe(a,
- b,
c, d) S
|
- for_each_gfn_sp(a, c, d, b) S
+ for_each_gfn_sp(a, c, d) S
|
- for_each_gfn_indirect_valid_sp(a, c, d, b) S
+ for_each_gfn_indirect_valid_sp(a, c, d) S
|
for_each_host(a,
- b,
c) S
|
for_each_host_safe(a,
- b,
c, d) S
|
for_each_mesh_entry(a,
- b,
c, d) S
)
...+>
[akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c]
[akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c]
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix warnings]
[akpm@linux-foudnation.org: redo intrusive kvm changes]
Tested-by: Peter Senna Tschudin <peter.senna@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-28 08:06:00 +07:00
|
|
|
hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
|
2006-08-11 13:11:17 +07:00
|
|
|
if (e < s_e)
|
|
|
|
goto next;
|
|
|
|
res = fib6_dump_table(tb, skb, cb);
|
|
|
|
if (res != 0)
|
2018-10-16 08:56:44 +07:00
|
|
|
goto out_unlock;
|
2006-08-11 13:11:17 +07:00
|
|
|
next:
|
|
|
|
e++;
|
|
|
|
}
|
|
|
|
}
|
2018-10-16 08:56:44 +07:00
|
|
|
out_unlock:
|
2011-04-28 05:56:07 +07:00
|
|
|
rcu_read_unlock();
|
2006-08-11 13:11:17 +07:00
|
|
|
cb->args[1] = e;
|
|
|
|
cb->args[0] = h;
|
2018-10-16 08:56:44 +07:00
|
|
|
out:
|
2006-08-11 13:11:17 +07:00
|
|
|
res = res < 0 ? res : skb->len;
|
|
|
|
if (res <= 0)
|
|
|
|
fib6_dump_end(cb);
|
|
|
|
return res;
|
|
|
|
}
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2018-04-18 07:33:26 +07:00
|
|
|
void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val)
|
2018-04-18 07:33:16 +07:00
|
|
|
{
|
|
|
|
if (!f6i)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (f6i->fib6_metrics == &dst_default_metrics) {
|
|
|
|
struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC);
|
|
|
|
|
|
|
|
if (!p)
|
|
|
|
return;
|
|
|
|
|
|
|
|
refcount_set(&p->refcnt, 1);
|
|
|
|
f6i->fib6_metrics = p;
|
|
|
|
}
|
|
|
|
|
|
|
|
f6i->fib6_metrics->metrics[metric - 1] = val;
|
|
|
|
}
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
/*
|
|
|
|
* Routing Table
|
|
|
|
*
|
|
|
|
* return the appropriate node for a routing tree "add" operation
|
|
|
|
* by either creating and inserting or by returning an existing
|
|
|
|
* node.
|
|
|
|
*/
|
|
|
|
|
2017-10-07 02:06:11 +07:00
|
|
|
static struct fib6_node *fib6_add_1(struct net *net,
|
|
|
|
struct fib6_table *table,
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
struct fib6_node *root,
|
|
|
|
struct in6_addr *addr, int plen,
|
|
|
|
int offset, int allow_create,
|
|
|
|
int replace_required,
|
|
|
|
struct netlink_ext_ack *extack)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
|
|
|
struct fib6_node *fn, *in, *ln;
|
|
|
|
struct fib6_node *pn = NULL;
|
|
|
|
struct rt6key *key;
|
|
|
|
int bit;
|
2007-02-09 21:24:49 +07:00
|
|
|
__be32 dir = 0;
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
RT6_TRACE("fib6_add_1\n");
|
|
|
|
|
|
|
|
/* insert node in tree */
|
|
|
|
|
|
|
|
fn = root;
|
|
|
|
|
|
|
|
do {
|
2018-04-18 07:33:26 +07:00
|
|
|
struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
lockdep_is_held(&table->tb6_lock));
|
|
|
|
key = (struct rt6key *)((u8 *)leaf + offset);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Prefix match
|
|
|
|
*/
|
|
|
|
if (plen < fn->fn_bit ||
|
2011-11-14 07:15:14 +07:00
|
|
|
!ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) {
|
2011-11-17 04:18:02 +07:00
|
|
|
if (!allow_create) {
|
|
|
|
if (replace_required) {
|
2017-05-21 23:12:05 +07:00
|
|
|
NL_SET_ERR_MSG(extack,
|
|
|
|
"Can not replace route - no match found");
|
2012-05-15 21:11:53 +07:00
|
|
|
pr_warn("Can't replace route, no match found\n");
|
2011-11-17 04:18:02 +07:00
|
|
|
return ERR_PTR(-ENOENT);
|
|
|
|
}
|
2012-05-15 21:11:53 +07:00
|
|
|
pr_warn("NLM_F_CREATE should be set when creating new route\n");
|
2011-11-17 04:18:02 +07:00
|
|
|
}
|
2005-04-17 05:20:36 +07:00
|
|
|
goto insert_above;
|
2011-11-14 07:15:14 +07:00
|
|
|
}
|
2007-02-09 21:24:49 +07:00
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
/*
|
|
|
|
* Exact match ?
|
|
|
|
*/
|
2007-02-09 21:24:49 +07:00
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
if (plen == fn->fn_bit) {
|
|
|
|
/* clean up an intermediate node */
|
2011-12-04 05:50:45 +07:00
|
|
|
if (!(fn->fn_flags & RTN_RTINFO)) {
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
RCU_INIT_POINTER(fn->leaf, NULL);
|
2018-04-18 07:33:25 +07:00
|
|
|
fib6_info_release(leaf);
|
ipv6: remove null_entry before adding default route
In the current code, when creating a new fib6 table, tb6_root.leaf gets
initialized to net->ipv6.ip6_null_entry.
If a default route is being added with rt->rt6i_metric = 0xffffffff,
fib6_add() will add this route after net->ipv6.ip6_null_entry. As
null_entry is shared, it could cause problem.
In order to fix it, set fn->leaf to NULL before calling
fib6_add_rt2node() when trying to add the first default route.
And reset fn->leaf to null_entry when adding fails or when deleting the
last default route.
syzkaller reported the following issue which is fixed by this commit:
WARNING: suspicious RCU usage
4.15.0-rc5+ #171 Not tainted
-----------------------------
net/ipv6/ip6_fib.c:1702 suspicious rcu_dereference_protected() usage!
other info that might help us debug this:
rcu_scheduler_active = 2, debug_locks = 1
4 locks held by swapper/0/0:
#0: ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] lockdep_copy_map include/linux/lockdep.h:178 [inline]
#0: ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] call_timer_fn+0x1c6/0x820 kernel/time/timer.c:1310
#1: (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] spin_lock_bh include/linux/spinlock.h:315 [inline]
#1: (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] fib6_run_gc+0x9d/0x3c0 net/ipv6/ip6_fib.c:2007
#2: (rcu_read_lock){....}, at: [<0000000091db762d>] __fib6_clean_all+0x0/0x3a0 net/ipv6/ip6_fib.c:1560
#3: (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] spin_lock_bh include/linux/spinlock.h:315 [inline]
#3: (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] __fib6_clean_all+0x1d0/0x3a0 net/ipv6/ip6_fib.c:1948
stack backtrace:
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.15.0-rc5+ #171
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
<IRQ>
__dump_stack lib/dump_stack.c:17 [inline]
dump_stack+0x194/0x257 lib/dump_stack.c:53
lockdep_rcu_suspicious+0x123/0x170 kernel/locking/lockdep.c:4585
fib6_del+0xcaa/0x11b0 net/ipv6/ip6_fib.c:1701
fib6_clean_node+0x3aa/0x4f0 net/ipv6/ip6_fib.c:1892
fib6_walk_continue+0x46c/0x8a0 net/ipv6/ip6_fib.c:1815
fib6_walk+0x91/0xf0 net/ipv6/ip6_fib.c:1863
fib6_clean_tree+0x1e6/0x340 net/ipv6/ip6_fib.c:1933
__fib6_clean_all+0x1f4/0x3a0 net/ipv6/ip6_fib.c:1949
fib6_clean_all net/ipv6/ip6_fib.c:1960 [inline]
fib6_run_gc+0x16b/0x3c0 net/ipv6/ip6_fib.c:2016
fib6_gc_timer_cb+0x20/0x30 net/ipv6/ip6_fib.c:2033
call_timer_fn+0x228/0x820 kernel/time/timer.c:1320
expire_timers kernel/time/timer.c:1357 [inline]
__run_timers+0x7ee/0xb70 kernel/time/timer.c:1660
run_timer_softirq+0x4c/0xb0 kernel/time/timer.c:1686
__do_softirq+0x2d7/0xb85 kernel/softirq.c:285
invoke_softirq kernel/softirq.c:365 [inline]
irq_exit+0x1cc/0x200 kernel/softirq.c:405
exiting_irq arch/x86/include/asm/apic.h:540 [inline]
smp_apic_timer_interrupt+0x16b/0x700 arch/x86/kernel/apic/apic.c:1052
apic_timer_interrupt+0xa9/0xb0 arch/x86/entry/entry_64.S:904
</IRQ>
Reported-by: syzbot <syzkaller@googlegroups.com>
Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table")
Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-01-09 01:34:00 +07:00
|
|
|
/* remove null_entry in the root node */
|
|
|
|
} else if (fn->fn_flags & RTN_TL_ROOT &&
|
|
|
|
rcu_access_pointer(fn->leaf) ==
|
2018-04-18 07:33:18 +07:00
|
|
|
net->ipv6.fib6_null_entry) {
|
ipv6: remove null_entry before adding default route
In the current code, when creating a new fib6 table, tb6_root.leaf gets
initialized to net->ipv6.ip6_null_entry.
If a default route is being added with rt->rt6i_metric = 0xffffffff,
fib6_add() will add this route after net->ipv6.ip6_null_entry. As
null_entry is shared, it could cause problem.
In order to fix it, set fn->leaf to NULL before calling
fib6_add_rt2node() when trying to add the first default route.
And reset fn->leaf to null_entry when adding fails or when deleting the
last default route.
syzkaller reported the following issue which is fixed by this commit:
WARNING: suspicious RCU usage
4.15.0-rc5+ #171 Not tainted
-----------------------------
net/ipv6/ip6_fib.c:1702 suspicious rcu_dereference_protected() usage!
other info that might help us debug this:
rcu_scheduler_active = 2, debug_locks = 1
4 locks held by swapper/0/0:
#0: ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] lockdep_copy_map include/linux/lockdep.h:178 [inline]
#0: ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] call_timer_fn+0x1c6/0x820 kernel/time/timer.c:1310
#1: (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] spin_lock_bh include/linux/spinlock.h:315 [inline]
#1: (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] fib6_run_gc+0x9d/0x3c0 net/ipv6/ip6_fib.c:2007
#2: (rcu_read_lock){....}, at: [<0000000091db762d>] __fib6_clean_all+0x0/0x3a0 net/ipv6/ip6_fib.c:1560
#3: (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] spin_lock_bh include/linux/spinlock.h:315 [inline]
#3: (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] __fib6_clean_all+0x1d0/0x3a0 net/ipv6/ip6_fib.c:1948
stack backtrace:
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.15.0-rc5+ #171
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
<IRQ>
__dump_stack lib/dump_stack.c:17 [inline]
dump_stack+0x194/0x257 lib/dump_stack.c:53
lockdep_rcu_suspicious+0x123/0x170 kernel/locking/lockdep.c:4585
fib6_del+0xcaa/0x11b0 net/ipv6/ip6_fib.c:1701
fib6_clean_node+0x3aa/0x4f0 net/ipv6/ip6_fib.c:1892
fib6_walk_continue+0x46c/0x8a0 net/ipv6/ip6_fib.c:1815
fib6_walk+0x91/0xf0 net/ipv6/ip6_fib.c:1863
fib6_clean_tree+0x1e6/0x340 net/ipv6/ip6_fib.c:1933
__fib6_clean_all+0x1f4/0x3a0 net/ipv6/ip6_fib.c:1949
fib6_clean_all net/ipv6/ip6_fib.c:1960 [inline]
fib6_run_gc+0x16b/0x3c0 net/ipv6/ip6_fib.c:2016
fib6_gc_timer_cb+0x20/0x30 net/ipv6/ip6_fib.c:2033
call_timer_fn+0x228/0x820 kernel/time/timer.c:1320
expire_timers kernel/time/timer.c:1357 [inline]
__run_timers+0x7ee/0xb70 kernel/time/timer.c:1660
run_timer_softirq+0x4c/0xb0 kernel/time/timer.c:1686
__do_softirq+0x2d7/0xb85 kernel/softirq.c:285
invoke_softirq kernel/softirq.c:365 [inline]
irq_exit+0x1cc/0x200 kernel/softirq.c:405
exiting_irq arch/x86/include/asm/apic.h:540 [inline]
smp_apic_timer_interrupt+0x16b/0x700 arch/x86/kernel/apic/apic.c:1052
apic_timer_interrupt+0xa9/0xb0 arch/x86/entry/entry_64.S:904
</IRQ>
Reported-by: syzbot <syzkaller@googlegroups.com>
Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table")
Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-01-09 01:34:00 +07:00
|
|
|
RCU_INIT_POINTER(fn->leaf, NULL);
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
2007-02-09 21:24:49 +07:00
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
return fn;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We have more bits to go
|
|
|
|
*/
|
2007-02-09 21:24:49 +07:00
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
/* Try to walk down on tree. */
|
|
|
|
dir = addr_bit_set(addr, fn->fn_bit);
|
|
|
|
pn = fn;
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
fn = dir ?
|
|
|
|
rcu_dereference_protected(fn->right,
|
|
|
|
lockdep_is_held(&table->tb6_lock)) :
|
|
|
|
rcu_dereference_protected(fn->left,
|
|
|
|
lockdep_is_held(&table->tb6_lock));
|
2005-04-17 05:20:36 +07:00
|
|
|
} while (fn);
|
|
|
|
|
2011-11-17 04:18:02 +07:00
|
|
|
if (!allow_create) {
|
2011-11-14 07:15:14 +07:00
|
|
|
/* We should not create new node because
|
|
|
|
* NLM_F_REPLACE was specified without NLM_F_CREATE
|
|
|
|
* I assume it is safe to require NLM_F_CREATE when
|
|
|
|
* REPLACE flag is used! Later we may want to remove the
|
|
|
|
* check for replace_required, because according
|
|
|
|
* to netlink specification, NLM_F_CREATE
|
|
|
|
* MUST be specified if new route is created.
|
|
|
|
* That would keep IPv6 consistent with IPv4
|
|
|
|
*/
|
2011-11-17 04:18:02 +07:00
|
|
|
if (replace_required) {
|
2017-05-21 23:12:05 +07:00
|
|
|
NL_SET_ERR_MSG(extack,
|
|
|
|
"Can not replace route - no match found");
|
2012-05-15 21:11:53 +07:00
|
|
|
pr_warn("Can't replace route, no match found\n");
|
2011-11-17 04:18:02 +07:00
|
|
|
return ERR_PTR(-ENOENT);
|
|
|
|
}
|
2012-05-15 21:11:53 +07:00
|
|
|
pr_warn("NLM_F_CREATE should be set when creating new route\n");
|
2011-11-14 07:15:14 +07:00
|
|
|
}
|
2005-04-17 05:20:36 +07:00
|
|
|
/*
|
|
|
|
* We walked to the bottom of tree.
|
|
|
|
* Create new leaf node without children.
|
|
|
|
*/
|
|
|
|
|
2017-10-07 02:06:11 +07:00
|
|
|
ln = node_alloc(net);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2011-12-04 05:50:45 +07:00
|
|
|
if (!ln)
|
2012-09-25 22:17:07 +07:00
|
|
|
return ERR_PTR(-ENOMEM);
|
2005-04-17 05:20:36 +07:00
|
|
|
ln->fn_bit = plen;
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
RCU_INIT_POINTER(ln->parent, pn);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
if (dir)
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
rcu_assign_pointer(pn->right, ln);
|
2005-04-17 05:20:36 +07:00
|
|
|
else
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
rcu_assign_pointer(pn->left, ln);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
return ln;
|
|
|
|
|
|
|
|
|
|
|
|
insert_above:
|
|
|
|
/*
|
2007-02-09 21:24:49 +07:00
|
|
|
* split since we don't have a common prefix anymore or
|
2005-04-17 05:20:36 +07:00
|
|
|
* we have a less significant route.
|
|
|
|
* we've to insert an intermediate node on the list
|
|
|
|
* this new node will point to the one we need to create
|
|
|
|
* and the current
|
|
|
|
*/
|
|
|
|
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
pn = rcu_dereference_protected(fn->parent,
|
|
|
|
lockdep_is_held(&table->tb6_lock));
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
/* find 1st bit in difference between the 2 addrs.
|
|
|
|
|
2005-11-09 00:37:56 +07:00
|
|
|
See comment in __ipv6_addr_diff: bit may be an invalid value,
|
2005-04-17 05:20:36 +07:00
|
|
|
but if it is >= plen, the value is ignored in any case.
|
|
|
|
*/
|
2007-02-09 21:24:49 +07:00
|
|
|
|
2013-07-22 13:21:09 +07:00
|
|
|
bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr));
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2007-02-09 21:24:49 +07:00
|
|
|
/*
|
|
|
|
* (intermediate)[in]
|
2005-04-17 05:20:36 +07:00
|
|
|
* / \
|
|
|
|
* (new leaf node)[ln] (old node)[fn]
|
|
|
|
*/
|
|
|
|
if (plen > bit) {
|
2017-10-07 02:06:11 +07:00
|
|
|
in = node_alloc(net);
|
|
|
|
ln = node_alloc(net);
|
2007-02-09 21:24:49 +07:00
|
|
|
|
2011-12-04 05:50:45 +07:00
|
|
|
if (!in || !ln) {
|
2005-04-17 05:20:36 +07:00
|
|
|
if (in)
|
2017-10-07 02:06:11 +07:00
|
|
|
node_free_immediate(net, in);
|
2005-04-17 05:20:36 +07:00
|
|
|
if (ln)
|
2017-10-07 02:06:11 +07:00
|
|
|
node_free_immediate(net, ln);
|
2012-09-25 22:17:07 +07:00
|
|
|
return ERR_PTR(-ENOMEM);
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
|
2007-02-09 21:24:49 +07:00
|
|
|
/*
|
|
|
|
* new intermediate node.
|
2005-04-17 05:20:36 +07:00
|
|
|
* RTN_RTINFO will
|
|
|
|
* be off since that an address that chooses one of
|
|
|
|
* the branches would not match less specific routes
|
|
|
|
* in the other branch
|
|
|
|
*/
|
|
|
|
|
|
|
|
in->fn_bit = bit;
|
|
|
|
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
RCU_INIT_POINTER(in->parent, pn);
|
2005-04-17 05:20:36 +07:00
|
|
|
in->leaf = fn->leaf;
|
2019-04-23 08:35:02 +07:00
|
|
|
fib6_info_hold(rcu_dereference_protected(in->leaf,
|
|
|
|
lockdep_is_held(&table->tb6_lock)));
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
/* update parent pointer */
|
|
|
|
if (dir)
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
rcu_assign_pointer(pn->right, in);
|
2005-04-17 05:20:36 +07:00
|
|
|
else
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
rcu_assign_pointer(pn->left, in);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
ln->fn_bit = plen;
|
|
|
|
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
RCU_INIT_POINTER(ln->parent, in);
|
|
|
|
rcu_assign_pointer(fn->parent, in);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
if (addr_bit_set(addr, bit)) {
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
rcu_assign_pointer(in->right, ln);
|
|
|
|
rcu_assign_pointer(in->left, fn);
|
2005-04-17 05:20:36 +07:00
|
|
|
} else {
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
rcu_assign_pointer(in->left, ln);
|
|
|
|
rcu_assign_pointer(in->right, fn);
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
} else { /* plen <= bit */
|
|
|
|
|
2007-02-09 21:24:49 +07:00
|
|
|
/*
|
2005-04-17 05:20:36 +07:00
|
|
|
* (new leaf node)[ln]
|
|
|
|
* / \
|
|
|
|
* (old node)[fn] NULL
|
|
|
|
*/
|
|
|
|
|
2017-10-07 02:06:11 +07:00
|
|
|
ln = node_alloc(net);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2011-12-04 05:50:45 +07:00
|
|
|
if (!ln)
|
2012-09-25 22:17:07 +07:00
|
|
|
return ERR_PTR(-ENOMEM);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
ln->fn_bit = plen;
|
|
|
|
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
RCU_INIT_POINTER(ln->parent, pn);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
if (addr_bit_set(&key->addr, plen))
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
RCU_INIT_POINTER(ln->right, fn);
|
2005-04-17 05:20:36 +07:00
|
|
|
else
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
RCU_INIT_POINTER(ln->left, fn);
|
|
|
|
|
|
|
|
rcu_assign_pointer(fn->parent, ln);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
if (dir)
|
|
|
|
rcu_assign_pointer(pn->right, ln);
|
|
|
|
else
|
|
|
|
rcu_assign_pointer(pn->left, ln);
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
return ln;
|
|
|
|
}
|
|
|
|
|
2019-05-23 10:27:56 +07:00
|
|
|
static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh,
|
|
|
|
const struct fib6_info *match,
|
|
|
|
const struct fib6_table *table)
|
2015-01-06 05:57:44 +07:00
|
|
|
{
|
2018-04-21 05:38:01 +07:00
|
|
|
int cpu;
|
2015-01-06 05:57:44 +07:00
|
|
|
|
2019-05-23 10:27:55 +07:00
|
|
|
if (!fib6_nh->rt6i_pcpu)
|
|
|
|
return;
|
|
|
|
|
2018-04-21 05:38:01 +07:00
|
|
|
/* release the reference to this fib entry from
|
|
|
|
* all of its cached pcpu routes
|
|
|
|
*/
|
|
|
|
for_each_possible_cpu(cpu) {
|
|
|
|
struct rt6_info **ppcpu_rt;
|
|
|
|
struct rt6_info *pcpu_rt;
|
2015-01-06 05:57:44 +07:00
|
|
|
|
2019-05-23 10:27:55 +07:00
|
|
|
ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
|
2018-04-21 05:38:01 +07:00
|
|
|
pcpu_rt = *ppcpu_rt;
|
2019-05-23 10:27:56 +07:00
|
|
|
|
|
|
|
/* only dropping the 'from' reference if the cached route
|
|
|
|
* is using 'match'. The cached pcpu_rt->from only changes
|
|
|
|
* from a fib6_info to NULL (ip6_dst_destroy); it can never
|
|
|
|
* change from one fib6_info reference to another
|
|
|
|
*/
|
|
|
|
if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) {
|
2018-04-21 05:38:02 +07:00
|
|
|
struct fib6_info *from;
|
2015-01-06 05:57:44 +07:00
|
|
|
|
2019-04-29 02:22:25 +07:00
|
|
|
from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
|
2018-04-21 05:38:02 +07:00
|
|
|
fib6_info_release(from);
|
2018-04-21 05:38:01 +07:00
|
|
|
}
|
2014-03-27 19:04:08 +07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-06-09 04:53:23 +07:00
|
|
|
struct fib6_nh_pcpu_arg {
|
|
|
|
struct fib6_info *from;
|
|
|
|
const struct fib6_table *table;
|
|
|
|
};
|
|
|
|
|
|
|
|
static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg)
|
|
|
|
{
|
|
|
|
struct fib6_nh_pcpu_arg *arg = _arg;
|
|
|
|
|
|
|
|
__fib6_drop_pcpu_from(nh, arg->from, arg->table);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-05-23 10:27:56 +07:00
|
|
|
static void fib6_drop_pcpu_from(struct fib6_info *f6i,
|
|
|
|
const struct fib6_table *table)
|
|
|
|
{
|
|
|
|
/* Make sure rt6_make_pcpu_route() wont add other percpu routes
|
|
|
|
* while we are cleaning them here.
|
|
|
|
*/
|
|
|
|
f6i->fib6_destroying = 1;
|
|
|
|
mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */
|
|
|
|
|
2019-06-09 04:53:23 +07:00
|
|
|
if (f6i->nh) {
|
|
|
|
struct fib6_nh_pcpu_arg arg = {
|
|
|
|
.from = f6i,
|
|
|
|
.table = table
|
|
|
|
};
|
|
|
|
|
|
|
|
nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from,
|
|
|
|
&arg);
|
|
|
|
} else {
|
|
|
|
struct fib6_nh *fib6_nh;
|
|
|
|
|
|
|
|
fib6_nh = f6i->fib6_nh;
|
|
|
|
__fib6_drop_pcpu_from(fib6_nh, f6i, table);
|
|
|
|
}
|
2019-05-23 10:27:56 +07:00
|
|
|
}
|
|
|
|
|
2018-04-18 07:33:26 +07:00
|
|
|
static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
|
2015-01-26 21:11:17 +07:00
|
|
|
struct net *net)
|
|
|
|
{
|
2018-04-19 05:38:59 +07:00
|
|
|
struct fib6_table *table = rt->fib6_table;
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
|
2019-05-23 10:27:55 +07:00
|
|
|
fib6_drop_pcpu_from(rt, table);
|
2019-05-16 09:39:52 +07:00
|
|
|
|
2019-06-04 10:19:52 +07:00
|
|
|
if (rt->nh && !list_empty(&rt->nh_list))
|
|
|
|
list_del_init(&rt->nh_list);
|
|
|
|
|
2019-04-23 08:35:03 +07:00
|
|
|
if (refcount_read(&rt->fib6_ref) != 1) {
|
2015-01-26 21:11:17 +07:00
|
|
|
/* This route is used as dummy address holder in some split
|
|
|
|
* nodes. It is not leaked, but it still holds other resources,
|
|
|
|
* which must be released in time. So, scan ascendant nodes
|
|
|
|
* and replace dummy references to this route with references
|
|
|
|
* to still alive ones.
|
|
|
|
*/
|
|
|
|
while (fn) {
|
2018-04-18 07:33:26 +07:00
|
|
|
struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
lockdep_is_held(&table->tb6_lock));
|
2018-04-18 07:33:26 +07:00
|
|
|
struct fib6_info *new_leaf;
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
|
|
|
|
new_leaf = fib6_find_prefix(net, table, fn);
|
2019-04-23 08:35:02 +07:00
|
|
|
fib6_info_hold(new_leaf);
|
2018-04-18 07:33:25 +07:00
|
|
|
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
rcu_assign_pointer(fn->leaf, new_leaf);
|
2018-04-18 07:33:25 +07:00
|
|
|
fib6_info_release(rt);
|
2015-01-26 21:11:17 +07:00
|
|
|
}
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
fn = rcu_dereference_protected(fn->parent,
|
|
|
|
lockdep_is_held(&table->tb6_lock));
|
2015-01-26 21:11:17 +07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
/*
|
|
|
|
* Insert routing information in a node.
|
|
|
|
*/
|
|
|
|
|
2018-04-18 07:33:26 +07:00
|
|
|
static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
|
2018-04-18 07:33:16 +07:00
|
|
|
struct nl_info *info,
|
2017-10-28 07:37:13 +07:00
|
|
|
struct netlink_ext_ack *extack)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
2018-04-18 07:33:26 +07:00
|
|
|
struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
|
2018-04-19 05:38:59 +07:00
|
|
|
lockdep_is_held(&rt->fib6_table->tb6_lock));
|
2018-07-04 04:36:21 +07:00
|
|
|
struct fib6_info *iter = NULL;
|
2018-04-18 07:33:26 +07:00
|
|
|
struct fib6_info __rcu **ins;
|
2018-07-04 04:36:21 +07:00
|
|
|
struct fib6_info __rcu **fallback_ins = NULL;
|
2011-12-04 05:50:45 +07:00
|
|
|
int replace = (info->nlh &&
|
|
|
|
(info->nlh->nlmsg_flags & NLM_F_REPLACE));
|
|
|
|
int add = (!info->nlh ||
|
|
|
|
(info->nlh->nlmsg_flags & NLM_F_CREATE));
|
2011-11-14 07:15:14 +07:00
|
|
|
int found = 0;
|
2018-07-04 04:36:21 +07:00
|
|
|
bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
|
2016-09-07 22:21:40 +07:00
|
|
|
u16 nlflags = NLM_F_EXCL;
|
2014-03-27 19:04:08 +07:00
|
|
|
int err;
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2018-07-04 04:36:21 +07:00
|
|
|
if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
|
2017-02-01 07:51:37 +07:00
|
|
|
nlflags |= NLM_F_APPEND;
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
ins = &fn->leaf;
|
|
|
|
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
for (iter = leaf; iter;
|
2018-05-05 03:54:24 +07:00
|
|
|
iter = rcu_dereference_protected(iter->fib6_next,
|
2018-04-19 05:38:59 +07:00
|
|
|
lockdep_is_held(&rt->fib6_table->tb6_lock))) {
|
2005-04-17 05:20:36 +07:00
|
|
|
/*
|
|
|
|
* Search for duplicates
|
|
|
|
*/
|
|
|
|
|
2018-04-19 05:38:59 +07:00
|
|
|
if (iter->fib6_metric == rt->fib6_metric) {
|
2005-04-17 05:20:36 +07:00
|
|
|
/*
|
|
|
|
* Same priority level
|
|
|
|
*/
|
2011-12-04 05:50:45 +07:00
|
|
|
if (info->nlh &&
|
|
|
|
(info->nlh->nlmsg_flags & NLM_F_EXCL))
|
2011-11-14 07:15:14 +07:00
|
|
|
return -EEXIST;
|
2016-09-07 22:21:40 +07:00
|
|
|
|
|
|
|
nlflags &= ~NLM_F_EXCL;
|
2011-11-14 07:15:14 +07:00
|
|
|
if (replace) {
|
2018-07-04 04:36:21 +07:00
|
|
|
if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
|
|
|
|
found++;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (rt_can_ecmp)
|
|
|
|
fallback_ins = fallback_ins ?: ins;
|
|
|
|
goto next_iter;
|
2011-11-14 07:15:14 +07:00
|
|
|
}
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2017-07-06 03:41:46 +07:00
|
|
|
if (rt6_duplicate_nexthop(iter, rt)) {
|
2018-04-19 05:38:59 +07:00
|
|
|
if (rt->fib6_nsiblings)
|
|
|
|
rt->fib6_nsiblings = 0;
|
|
|
|
if (!(iter->fib6_flags & RTF_EXPIRES))
|
2005-04-17 05:20:36 +07:00
|
|
|
return -EEXIST;
|
2018-04-19 05:38:59 +07:00
|
|
|
if (!(rt->fib6_flags & RTF_EXPIRES))
|
2018-04-18 07:33:17 +07:00
|
|
|
fib6_clean_expires(iter);
|
2012-04-06 07:13:10 +07:00
|
|
|
else
|
2018-04-18 07:33:17 +07:00
|
|
|
fib6_set_expires(iter, rt->expires);
|
2018-08-31 04:15:43 +07:00
|
|
|
|
|
|
|
if (rt->fib6_pmtu)
|
|
|
|
fib6_metric_set(iter, RTAX_MTU,
|
|
|
|
rt->fib6_pmtu);
|
2005-04-17 05:20:36 +07:00
|
|
|
return -EEXIST;
|
|
|
|
}
|
2018-07-04 04:36:21 +07:00
|
|
|
/* If we have the same destination and the same metric,
|
|
|
|
* but not the same gateway, then the route we try to
|
|
|
|
* add is sibling to this route, increment our counter
|
|
|
|
* of siblings, and later we will add our route to the
|
|
|
|
* list.
|
|
|
|
* Only static routes (which don't have flag
|
|
|
|
* RTF_EXPIRES) are used for ECMPv6.
|
|
|
|
*
|
|
|
|
* To avoid long list, we only had siblings if the
|
|
|
|
* route have a gateway.
|
|
|
|
*/
|
|
|
|
if (rt_can_ecmp &&
|
|
|
|
rt6_qualify_for_ecmp(iter))
|
|
|
|
rt->fib6_nsiblings++;
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
|
2018-04-19 05:38:59 +07:00
|
|
|
if (iter->fib6_metric > rt->fib6_metric)
|
2005-04-17 05:20:36 +07:00
|
|
|
break;
|
|
|
|
|
2018-07-04 04:36:21 +07:00
|
|
|
next_iter:
|
2018-05-05 03:54:24 +07:00
|
|
|
ins = &iter->fib6_next;
|
2015-05-19 01:54:00 +07:00
|
|
|
}
|
|
|
|
|
2018-07-04 04:36:21 +07:00
|
|
|
if (fallback_ins && !found) {
|
|
|
|
/* No ECMP-able route found, replace first non-ECMP one */
|
|
|
|
ins = fallback_ins;
|
|
|
|
iter = rcu_dereference_protected(*ins,
|
|
|
|
lockdep_is_held(&rt->fib6_table->tb6_lock));
|
|
|
|
found++;
|
|
|
|
}
|
|
|
|
|
[IPV6]: Fix routing round-robin locking.
As per RFC2461, section 6.3.6, item #2, when no routers on the
matching list are known to be reachable or probably reachable we
do round robin on those available routes so that we make sure
to probe as many of them as possible to detect when one becomes
reachable faster.
Each routing table has a rwlock protecting the tree and the linked
list of routes at each leaf. The round robin code executes during
lookup and thus with the rwlock taken as a reader. A small local
spinlock tries to provide protection but this does not work at all
for two reasons:
1) The round-robin list manipulation, as coded, goes like this (with
read lock held):
walk routes finding head and tail
spin_lock();
rotate list using head and tail
spin_unlock();
While one thread is rotating the list, another thread can
end up with stale values of head and tail and then proceed
to corrupt the list when it gets the lock. This ends up causing
the OOPS in fib6_add() later onthat many people have been hitting.
2) All the other code paths that run with the rwlock held as
a reader do not expect the list to change on them, they
expect it to remain completely fixed while they hold the
lock in that way.
So, simply stated, it is impossible to implement this correctly using
a manipulation of the list without violating the rwlock locking
semantics.
Reimplement using a per-fib6_node round-robin pointer. This way we
don't need to manipulate the list at all, and since the round-robin
pointer can only ever point to real existing entries we don't need
to perform any locking on the changing of the round-robin pointer
itself. We only need to reset the round-robin pointer to NULL when
the entry it is pointing to is removed.
The idea is from Thomas Graf and it is very similar to how this
was implemented before the advanced router selection code when in.
Signed-off-by: David S. Miller <davem@davemloft.net>
2007-03-25 10:36:25 +07:00
|
|
|
/* Reset round-robin state, if necessary */
|
|
|
|
if (ins == &fn->leaf)
|
|
|
|
fn->rr_ptr = NULL;
|
|
|
|
|
2012-10-22 10:42:09 +07:00
|
|
|
/* Link this route to others same route. */
|
2018-07-04 04:36:21 +07:00
|
|
|
if (rt->fib6_nsiblings) {
|
|
|
|
unsigned int fib6_nsiblings;
|
2018-04-18 07:33:26 +07:00
|
|
|
struct fib6_info *sibling, *temp_sibling;
|
2012-10-22 10:42:09 +07:00
|
|
|
|
2018-07-04 04:36:21 +07:00
|
|
|
/* Find the first route that have the same metric */
|
|
|
|
sibling = leaf;
|
|
|
|
while (sibling) {
|
|
|
|
if (sibling->fib6_metric == rt->fib6_metric &&
|
|
|
|
rt6_qualify_for_ecmp(sibling)) {
|
|
|
|
list_add_tail(&rt->fib6_siblings,
|
|
|
|
&sibling->fib6_siblings);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
sibling = rcu_dereference_protected(sibling->fib6_next,
|
|
|
|
lockdep_is_held(&rt->fib6_table->tb6_lock));
|
2012-10-22 10:42:09 +07:00
|
|
|
}
|
|
|
|
/* For each sibling in the list, increment the counter of
|
|
|
|
* siblings. BUG() if counters does not match, list of siblings
|
|
|
|
* is broken!
|
|
|
|
*/
|
2018-07-04 04:36:21 +07:00
|
|
|
fib6_nsiblings = 0;
|
2012-10-22 10:42:09 +07:00
|
|
|
list_for_each_entry_safe(sibling, temp_sibling,
|
2018-07-04 04:36:21 +07:00
|
|
|
&rt->fib6_siblings, fib6_siblings) {
|
2018-04-19 05:38:59 +07:00
|
|
|
sibling->fib6_nsiblings++;
|
2018-07-04 04:36:21 +07:00
|
|
|
BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
|
|
|
|
fib6_nsiblings++;
|
2012-10-22 10:42:09 +07:00
|
|
|
}
|
2018-07-04 04:36:21 +07:00
|
|
|
BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
|
|
|
|
rt6_multipath_rebalance(temp_sibling);
|
2012-10-22 10:42:09 +07:00
|
|
|
}
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
/*
|
|
|
|
* insert node
|
|
|
|
*/
|
2011-11-14 07:15:14 +07:00
|
|
|
if (!replace) {
|
|
|
|
if (!add)
|
2012-05-15 21:11:53 +07:00
|
|
|
pr_warn("NLM_F_CREATE should be set when creating new route\n");
|
2011-11-14 07:15:14 +07:00
|
|
|
|
|
|
|
add:
|
2016-09-07 22:21:40 +07:00
|
|
|
nlflags |= NLM_F_CREATE;
|
2015-01-06 05:57:44 +07:00
|
|
|
|
2019-06-18 22:12:57 +07:00
|
|
|
if (!info->skip_notify_kernel) {
|
|
|
|
err = call_fib6_entry_notifiers(info->nl_net,
|
|
|
|
FIB_EVENT_ENTRY_ADD,
|
|
|
|
rt, extack);
|
2019-07-18 03:39:33 +07:00
|
|
|
if (err) {
|
|
|
|
struct fib6_info *sibling, *next_sibling;
|
|
|
|
|
|
|
|
/* If the route has siblings, then it first
|
|
|
|
* needs to be unlinked from them.
|
|
|
|
*/
|
|
|
|
if (!rt->fib6_nsiblings)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
list_for_each_entry_safe(sibling, next_sibling,
|
|
|
|
&rt->fib6_siblings,
|
|
|
|
fib6_siblings)
|
|
|
|
sibling->fib6_nsiblings--;
|
|
|
|
rt->fib6_nsiblings = 0;
|
|
|
|
list_del_init(&rt->fib6_siblings);
|
|
|
|
rt6_multipath_rebalance(next_sibling);
|
2019-06-18 22:12:57 +07:00
|
|
|
return err;
|
2019-07-18 03:39:33 +07:00
|
|
|
}
|
2019-06-18 22:12:57 +07:00
|
|
|
}
|
2018-03-28 08:21:59 +07:00
|
|
|
|
2018-05-05 03:54:24 +07:00
|
|
|
rcu_assign_pointer(rt->fib6_next, iter);
|
2019-04-23 08:35:02 +07:00
|
|
|
fib6_info_hold(rt);
|
2018-04-19 05:38:59 +07:00
|
|
|
rcu_assign_pointer(rt->fib6_node, fn);
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
rcu_assign_pointer(*ins, rt);
|
2017-02-03 03:37:10 +07:00
|
|
|
if (!info->skip_notify)
|
|
|
|
inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
|
2011-11-14 07:15:14 +07:00
|
|
|
info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
|
|
|
|
|
2011-12-04 05:50:45 +07:00
|
|
|
if (!(fn->fn_flags & RTN_RTINFO)) {
|
2011-11-14 07:15:14 +07:00
|
|
|
info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
|
|
|
|
fn->fn_flags |= RTN_RTINFO;
|
|
|
|
}
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2011-11-14 07:15:14 +07:00
|
|
|
} else {
|
2018-07-04 04:36:21 +07:00
|
|
|
int nsiblings;
|
2015-05-19 01:54:00 +07:00
|
|
|
|
2011-11-14 07:15:14 +07:00
|
|
|
if (!found) {
|
|
|
|
if (add)
|
|
|
|
goto add;
|
2012-05-15 21:11:53 +07:00
|
|
|
pr_warn("NLM_F_REPLACE set, but no existing node found!\n");
|
2011-11-14 07:15:14 +07:00
|
|
|
return -ENOENT;
|
|
|
|
}
|
2015-01-06 05:57:44 +07:00
|
|
|
|
2019-06-18 22:12:57 +07:00
|
|
|
if (!info->skip_notify_kernel) {
|
|
|
|
err = call_fib6_entry_notifiers(info->nl_net,
|
|
|
|
FIB_EVENT_ENTRY_REPLACE,
|
|
|
|
rt, extack);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
}
|
2018-03-28 08:21:59 +07:00
|
|
|
|
2019-04-23 08:35:02 +07:00
|
|
|
fib6_info_hold(rt);
|
2018-04-19 05:38:59 +07:00
|
|
|
rcu_assign_pointer(rt->fib6_node, fn);
|
2018-07-04 04:36:21 +07:00
|
|
|
rt->fib6_next = iter->fib6_next;
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
rcu_assign_pointer(*ins, rt);
|
2017-02-03 03:37:10 +07:00
|
|
|
if (!info->skip_notify)
|
|
|
|
inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
|
2011-12-04 05:50:45 +07:00
|
|
|
if (!(fn->fn_flags & RTN_RTINFO)) {
|
2011-11-14 07:15:14 +07:00
|
|
|
info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
|
|
|
|
fn->fn_flags |= RTN_RTINFO;
|
|
|
|
}
|
2018-07-04 04:36:21 +07:00
|
|
|
nsiblings = iter->fib6_nsiblings;
|
|
|
|
iter->fib6_node = NULL;
|
|
|
|
fib6_purge_rt(iter, fn, info->nl_net);
|
|
|
|
if (rcu_access_pointer(fn->rr_ptr) == iter)
|
|
|
|
fn->rr_ptr = NULL;
|
|
|
|
fib6_info_release(iter);
|
2015-05-19 01:54:00 +07:00
|
|
|
|
2018-07-04 04:36:21 +07:00
|
|
|
if (nsiblings) {
|
2015-05-19 01:54:00 +07:00
|
|
|
/* Replacing an ECMP route, remove all siblings */
|
2018-07-04 04:36:21 +07:00
|
|
|
ins = &rt->fib6_next;
|
|
|
|
iter = rcu_dereference_protected(*ins,
|
|
|
|
lockdep_is_held(&rt->fib6_table->tb6_lock));
|
|
|
|
while (iter) {
|
|
|
|
if (iter->fib6_metric > rt->fib6_metric)
|
|
|
|
break;
|
|
|
|
if (rt6_qualify_for_ecmp(iter)) {
|
|
|
|
*ins = iter->fib6_next;
|
|
|
|
iter->fib6_node = NULL;
|
|
|
|
fib6_purge_rt(iter, fn, info->nl_net);
|
|
|
|
if (rcu_access_pointer(fn->rr_ptr) == iter)
|
|
|
|
fn->rr_ptr = NULL;
|
|
|
|
fib6_info_release(iter);
|
|
|
|
nsiblings--;
|
|
|
|
info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
|
|
|
|
} else {
|
|
|
|
ins = &iter->fib6_next;
|
|
|
|
}
|
|
|
|
iter = rcu_dereference_protected(*ins,
|
|
|
|
lockdep_is_held(&rt->fib6_table->tb6_lock));
|
2015-05-19 01:54:00 +07:00
|
|
|
}
|
2018-07-04 04:36:21 +07:00
|
|
|
WARN_ON(nsiblings != 0);
|
2015-05-19 01:54:00 +07:00
|
|
|
}
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-04-18 07:33:26 +07:00
|
|
|
static void fib6_start_gc(struct net *net, struct fib6_info *rt)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
2008-07-23 04:33:45 +07:00
|
|
|
if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
|
2018-04-19 05:38:59 +07:00
|
|
|
(rt->fib6_flags & RTF_EXPIRES))
|
2008-07-23 04:33:45 +07:00
|
|
|
mod_timer(&net->ipv6.ip6_fib_timer,
|
2008-07-22 03:21:35 +07:00
|
|
|
jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
|
2008-03-04 14:31:11 +07:00
|
|
|
void fib6_force_start_gc(struct net *net)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
2008-07-23 04:33:45 +07:00
|
|
|
if (!timer_pending(&net->ipv6.ip6_fib_timer))
|
|
|
|
mod_timer(&net->ipv6.ip6_fib_timer,
|
2008-07-22 03:21:35 +07:00
|
|
|
jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
|
2018-04-18 07:33:26 +07:00
|
|
|
static void __fib6_update_sernum_upto_root(struct fib6_info *rt,
|
2018-01-07 17:45:13 +07:00
|
|
|
int sernum)
|
2017-10-07 02:06:07 +07:00
|
|
|
{
|
2018-04-19 05:38:59 +07:00
|
|
|
struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
|
|
|
|
lockdep_is_held(&rt->fib6_table->tb6_lock));
|
2017-10-07 02:06:07 +07:00
|
|
|
|
|
|
|
/* paired with smp_rmb() in rt6_get_cookie_safe() */
|
|
|
|
smp_wmb();
|
|
|
|
while (fn) {
|
|
|
|
fn->fn_sernum = sernum;
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
fn = rcu_dereference_protected(fn->parent,
|
2018-04-19 05:38:59 +07:00
|
|
|
lockdep_is_held(&rt->fib6_table->tb6_lock));
|
2017-10-07 02:06:07 +07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-04-18 07:33:26 +07:00
|
|
|
void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt)
|
2018-01-07 17:45:13 +07:00
|
|
|
{
|
|
|
|
__fib6_update_sernum_upto_root(rt, fib6_new_sernum(net));
|
|
|
|
}
|
|
|
|
|
2019-05-23 02:04:40 +07:00
|
|
|
/* allow ipv4 to update sernum via ipv6_stub */
|
|
|
|
void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i)
|
|
|
|
{
|
|
|
|
spin_lock_bh(&f6i->fib6_table->tb6_lock);
|
|
|
|
fib6_update_sernum_upto_root(net, f6i);
|
|
|
|
spin_unlock_bh(&f6i->fib6_table->tb6_lock);
|
|
|
|
}
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
/*
|
|
|
|
* Add routing information to the routing tree.
|
|
|
|
* <destination addr>/<source addr>
|
|
|
|
* with source addr info in sub-trees
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
* Need to own table->tb6_lock
|
2005-04-17 05:20:36 +07:00
|
|
|
*/
|
|
|
|
|
2018-04-18 07:33:26 +07:00
|
|
|
int fib6_add(struct fib6_node *root, struct fib6_info *rt,
|
2018-04-18 07:33:16 +07:00
|
|
|
struct nl_info *info, struct netlink_ext_ack *extack)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
2018-04-19 05:38:59 +07:00
|
|
|
struct fib6_table *table = rt->fib6_table;
|
2006-08-24 07:20:34 +07:00
|
|
|
struct fib6_node *fn, *pn = NULL;
|
2005-04-17 05:20:36 +07:00
|
|
|
int err = -ENOMEM;
|
2011-11-14 07:15:14 +07:00
|
|
|
int allow_create = 1;
|
|
|
|
int replace_required = 0;
|
2014-10-07 00:58:37 +07:00
|
|
|
int sernum = fib6_new_sernum(info->nl_net);
|
2011-12-04 05:50:45 +07:00
|
|
|
|
|
|
|
if (info->nlh) {
|
|
|
|
if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
|
2011-11-14 07:15:14 +07:00
|
|
|
allow_create = 0;
|
2011-12-04 05:50:45 +07:00
|
|
|
if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
|
2011-11-14 07:15:14 +07:00
|
|
|
replace_required = 1;
|
|
|
|
}
|
|
|
|
if (!allow_create && !replace_required)
|
2012-05-15 21:11:53 +07:00
|
|
|
pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2017-10-07 02:06:11 +07:00
|
|
|
fn = fib6_add_1(info->nl_net, table, root,
|
2018-04-19 05:38:59 +07:00
|
|
|
&rt->fib6_dst.addr, rt->fib6_dst.plen,
|
|
|
|
offsetof(struct fib6_info, fib6_dst), allow_create,
|
2017-10-07 02:06:07 +07:00
|
|
|
replace_required, extack);
|
2011-11-14 07:15:14 +07:00
|
|
|
if (IS_ERR(fn)) {
|
|
|
|
err = PTR_ERR(fn);
|
net: fib: fib6_add: fix potential NULL pointer dereference
When the kernel is compiled with CONFIG_IPV6_SUBTREES, and we return
with an error in fn = fib6_add_1(), then error codes are encoded into
the return pointer e.g. ERR_PTR(-ENOENT). In such an error case, we
write the error code into err and jump to out, hence enter the if(err)
condition. Now, if CONFIG_IPV6_SUBTREES is enabled, we check for:
if (pn != fn && pn->leaf == rt)
...
if (pn != fn && !pn->leaf && !(pn->fn_flags & RTN_RTINFO))
...
Since pn is NULL and fn is f.e. ERR_PTR(-ENOENT), then pn != fn
evaluates to true and causes a NULL-pointer dereference on further
checks on pn. Fix it, by setting both NULL in error case, so that
pn != fn already evaluates to false and no further dereference
takes place.
This was first correctly implemented in 4a287eba2 ("IPv6 routing,
NLM_F_* flag support: REPLACE and EXCL flags support, warn about
missing CREATE flag"), but the bug got later on introduced by
188c517a0 ("ipv6: return errno pointers consistently for fib6_add_1()").
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Lin Ming <mlin@ss.pku.edu.cn>
Cc: Matti Vaittinen <matti.vaittinen@nsn.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Matti Vaittinen <matti.vaittinen@nsn.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-09-07 20:13:20 +07:00
|
|
|
fn = NULL;
|
2005-04-17 05:20:36 +07:00
|
|
|
goto out;
|
2012-09-25 22:17:07 +07:00
|
|
|
}
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2006-08-24 07:20:34 +07:00
|
|
|
pn = fn;
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
#ifdef CONFIG_IPV6_SUBTREES
|
2018-04-19 05:38:59 +07:00
|
|
|
if (rt->fib6_src.plen) {
|
2005-04-17 05:20:36 +07:00
|
|
|
struct fib6_node *sn;
|
|
|
|
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
if (!rcu_access_pointer(fn->subtree)) {
|
2005-04-17 05:20:36 +07:00
|
|
|
struct fib6_node *sfn;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create subtree.
|
|
|
|
*
|
|
|
|
* fn[main tree]
|
|
|
|
* |
|
|
|
|
* sfn[subtree root]
|
|
|
|
* \
|
|
|
|
* sn[new leaf node]
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Create subtree root node */
|
2017-10-07 02:06:11 +07:00
|
|
|
sfn = node_alloc(info->nl_net);
|
2011-12-04 05:50:45 +07:00
|
|
|
if (!sfn)
|
2017-08-19 07:14:49 +07:00
|
|
|
goto failure;
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2019-04-23 08:35:02 +07:00
|
|
|
fib6_info_hold(info->nl_net->ipv6.fib6_null_entry);
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
rcu_assign_pointer(sfn->leaf,
|
2018-04-18 07:33:18 +07:00
|
|
|
info->nl_net->ipv6.fib6_null_entry);
|
2005-04-17 05:20:36 +07:00
|
|
|
sfn->fn_flags = RTN_ROOT;
|
|
|
|
|
|
|
|
/* Now add the first leaf node to new subtree */
|
|
|
|
|
2017-10-07 02:06:11 +07:00
|
|
|
sn = fib6_add_1(info->nl_net, table, sfn,
|
2018-04-19 05:38:59 +07:00
|
|
|
&rt->fib6_src.addr, rt->fib6_src.plen,
|
|
|
|
offsetof(struct fib6_info, fib6_src),
|
2017-10-07 02:06:07 +07:00
|
|
|
allow_create, replace_required, extack);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2012-09-21 01:29:56 +07:00
|
|
|
if (IS_ERR(sn)) {
|
2005-04-17 05:20:36 +07:00
|
|
|
/* If it is failed, discard just allocated
|
2017-08-19 07:14:49 +07:00
|
|
|
root, and then (in failure) stale node
|
2005-04-17 05:20:36 +07:00
|
|
|
in main tree.
|
|
|
|
*/
|
2017-10-07 02:06:11 +07:00
|
|
|
node_free_immediate(info->nl_net, sfn);
|
2012-09-25 22:17:07 +07:00
|
|
|
err = PTR_ERR(sn);
|
2017-08-19 07:14:49 +07:00
|
|
|
goto failure;
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Now link new subtree to main tree */
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
rcu_assign_pointer(sfn->parent, fn);
|
|
|
|
rcu_assign_pointer(fn->subtree, sfn);
|
2005-04-17 05:20:36 +07:00
|
|
|
} else {
|
2017-10-07 02:06:11 +07:00
|
|
|
sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn),
|
2018-04-19 05:38:59 +07:00
|
|
|
&rt->fib6_src.addr, rt->fib6_src.plen,
|
|
|
|
offsetof(struct fib6_info, fib6_src),
|
2017-10-07 02:06:07 +07:00
|
|
|
allow_create, replace_required, extack);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2011-11-14 07:15:14 +07:00
|
|
|
if (IS_ERR(sn)) {
|
|
|
|
err = PTR_ERR(sn);
|
2017-08-19 07:14:49 +07:00
|
|
|
goto failure;
|
2012-09-25 22:17:07 +07:00
|
|
|
}
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
if (!rcu_access_pointer(fn->leaf)) {
|
2018-01-19 01:40:03 +07:00
|
|
|
if (fn->fn_flags & RTN_TL_ROOT) {
|
|
|
|
/* put back null_entry for root node */
|
|
|
|
rcu_assign_pointer(fn->leaf,
|
2018-04-18 07:33:18 +07:00
|
|
|
info->nl_net->ipv6.fib6_null_entry);
|
2018-01-19 01:40:03 +07:00
|
|
|
} else {
|
2019-04-23 08:35:02 +07:00
|
|
|
fib6_info_hold(rt);
|
2018-01-19 01:40:03 +07:00
|
|
|
rcu_assign_pointer(fn->leaf, rt);
|
|
|
|
}
|
2006-08-24 07:20:34 +07:00
|
|
|
}
|
2005-04-17 05:20:36 +07:00
|
|
|
fn = sn;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2018-04-18 07:33:16 +07:00
|
|
|
err = fib6_add_rt2node(fn, rt, info, extack);
|
2017-10-07 02:06:07 +07:00
|
|
|
if (!err) {
|
2019-06-04 10:19:52 +07:00
|
|
|
if (rt->nh)
|
|
|
|
list_add(&rt->nh_list, &rt->nh->f6i_list);
|
2018-01-07 17:45:13 +07:00
|
|
|
__fib6_update_sernum_upto_root(rt, sernum);
|
2008-03-04 14:31:11 +07:00
|
|
|
fib6_start_gc(info->nl_net, rt);
|
2017-10-07 02:06:07 +07:00
|
|
|
}
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
out:
|
2006-08-24 07:20:34 +07:00
|
|
|
if (err) {
|
|
|
|
#ifdef CONFIG_IPV6_SUBTREES
|
|
|
|
/*
|
|
|
|
* If fib6_add_1 has cleared the old leaf pointer in the
|
|
|
|
* super-tree leaf node we have to find a new one for it.
|
|
|
|
*/
|
2018-01-04 05:11:59 +07:00
|
|
|
if (pn != fn) {
|
2018-04-18 07:33:26 +07:00
|
|
|
struct fib6_info *pn_leaf =
|
2018-01-04 05:11:59 +07:00
|
|
|
rcu_dereference_protected(pn->leaf,
|
|
|
|
lockdep_is_held(&table->tb6_lock));
|
|
|
|
if (pn_leaf == rt) {
|
|
|
|
pn_leaf = NULL;
|
|
|
|
RCU_INIT_POINTER(pn->leaf, NULL);
|
2018-04-18 07:33:25 +07:00
|
|
|
fib6_info_release(rt);
|
2006-08-24 07:20:34 +07:00
|
|
|
}
|
2018-01-04 05:11:59 +07:00
|
|
|
if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
|
|
|
|
pn_leaf = fib6_find_prefix(info->nl_net, table,
|
|
|
|
pn);
|
|
|
|
#if RT6_DEBUG >= 2
|
|
|
|
if (!pn_leaf) {
|
|
|
|
WARN_ON(!pn_leaf);
|
|
|
|
pn_leaf =
|
2018-04-18 07:33:18 +07:00
|
|
|
info->nl_net->ipv6.fib6_null_entry;
|
2018-01-04 05:11:59 +07:00
|
|
|
}
|
2006-08-24 07:20:34 +07:00
|
|
|
#endif
|
2018-04-18 07:33:25 +07:00
|
|
|
fib6_info_hold(pn_leaf);
|
2018-01-04 05:11:59 +07:00
|
|
|
rcu_assign_pointer(pn->leaf, pn_leaf);
|
|
|
|
}
|
2006-08-24 07:20:34 +07:00
|
|
|
}
|
|
|
|
#endif
|
2017-08-19 07:14:49 +07:00
|
|
|
goto failure;
|
2006-08-24 07:20:34 +07:00
|
|
|
}
|
2005-04-17 05:20:36 +07:00
|
|
|
return err;
|
|
|
|
|
2017-08-19 07:14:49 +07:00
|
|
|
failure:
|
ipv6: remove null_entry before adding default route
In the current code, when creating a new fib6 table, tb6_root.leaf gets
initialized to net->ipv6.ip6_null_entry.
If a default route is being added with rt->rt6i_metric = 0xffffffff,
fib6_add() will add this route after net->ipv6.ip6_null_entry. As
null_entry is shared, it could cause problem.
In order to fix it, set fn->leaf to NULL before calling
fib6_add_rt2node() when trying to add the first default route.
And reset fn->leaf to null_entry when adding fails or when deleting the
last default route.
syzkaller reported the following issue which is fixed by this commit:
WARNING: suspicious RCU usage
4.15.0-rc5+ #171 Not tainted
-----------------------------
net/ipv6/ip6_fib.c:1702 suspicious rcu_dereference_protected() usage!
other info that might help us debug this:
rcu_scheduler_active = 2, debug_locks = 1
4 locks held by swapper/0/0:
#0: ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] lockdep_copy_map include/linux/lockdep.h:178 [inline]
#0: ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] call_timer_fn+0x1c6/0x820 kernel/time/timer.c:1310
#1: (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] spin_lock_bh include/linux/spinlock.h:315 [inline]
#1: (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] fib6_run_gc+0x9d/0x3c0 net/ipv6/ip6_fib.c:2007
#2: (rcu_read_lock){....}, at: [<0000000091db762d>] __fib6_clean_all+0x0/0x3a0 net/ipv6/ip6_fib.c:1560
#3: (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] spin_lock_bh include/linux/spinlock.h:315 [inline]
#3: (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] __fib6_clean_all+0x1d0/0x3a0 net/ipv6/ip6_fib.c:1948
stack backtrace:
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.15.0-rc5+ #171
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
<IRQ>
__dump_stack lib/dump_stack.c:17 [inline]
dump_stack+0x194/0x257 lib/dump_stack.c:53
lockdep_rcu_suspicious+0x123/0x170 kernel/locking/lockdep.c:4585
fib6_del+0xcaa/0x11b0 net/ipv6/ip6_fib.c:1701
fib6_clean_node+0x3aa/0x4f0 net/ipv6/ip6_fib.c:1892
fib6_walk_continue+0x46c/0x8a0 net/ipv6/ip6_fib.c:1815
fib6_walk+0x91/0xf0 net/ipv6/ip6_fib.c:1863
fib6_clean_tree+0x1e6/0x340 net/ipv6/ip6_fib.c:1933
__fib6_clean_all+0x1f4/0x3a0 net/ipv6/ip6_fib.c:1949
fib6_clean_all net/ipv6/ip6_fib.c:1960 [inline]
fib6_run_gc+0x16b/0x3c0 net/ipv6/ip6_fib.c:2016
fib6_gc_timer_cb+0x20/0x30 net/ipv6/ip6_fib.c:2033
call_timer_fn+0x228/0x820 kernel/time/timer.c:1320
expire_timers kernel/time/timer.c:1357 [inline]
__run_timers+0x7ee/0xb70 kernel/time/timer.c:1660
run_timer_softirq+0x4c/0xb0 kernel/time/timer.c:1686
__do_softirq+0x2d7/0xb85 kernel/softirq.c:285
invoke_softirq kernel/softirq.c:365 [inline]
irq_exit+0x1cc/0x200 kernel/softirq.c:405
exiting_irq arch/x86/include/asm/apic.h:540 [inline]
smp_apic_timer_interrupt+0x16b/0x700 arch/x86/kernel/apic/apic.c:1052
apic_timer_interrupt+0xa9/0xb0 arch/x86/entry/entry_64.S:904
</IRQ>
Reported-by: syzbot <syzkaller@googlegroups.com>
Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table")
Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-01-09 01:34:00 +07:00
|
|
|
/* fn->leaf could be NULL and fib6_repair_tree() needs to be called if:
|
|
|
|
* 1. fn is an intermediate node and we failed to add the new
|
|
|
|
* route to it in both subtree creation failure and fib6_add_rt2node()
|
|
|
|
* failure case.
|
|
|
|
* 2. fn is the root node in the table and we fail to add the first
|
|
|
|
* default route to it.
|
2005-04-17 05:20:36 +07:00
|
|
|
*/
|
ipv6: remove null_entry before adding default route
In the current code, when creating a new fib6 table, tb6_root.leaf gets
initialized to net->ipv6.ip6_null_entry.
If a default route is being added with rt->rt6i_metric = 0xffffffff,
fib6_add() will add this route after net->ipv6.ip6_null_entry. As
null_entry is shared, it could cause problem.
In order to fix it, set fn->leaf to NULL before calling
fib6_add_rt2node() when trying to add the first default route.
And reset fn->leaf to null_entry when adding fails or when deleting the
last default route.
syzkaller reported the following issue which is fixed by this commit:
WARNING: suspicious RCU usage
4.15.0-rc5+ #171 Not tainted
-----------------------------
net/ipv6/ip6_fib.c:1702 suspicious rcu_dereference_protected() usage!
other info that might help us debug this:
rcu_scheduler_active = 2, debug_locks = 1
4 locks held by swapper/0/0:
#0: ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] lockdep_copy_map include/linux/lockdep.h:178 [inline]
#0: ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] call_timer_fn+0x1c6/0x820 kernel/time/timer.c:1310
#1: (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] spin_lock_bh include/linux/spinlock.h:315 [inline]
#1: (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] fib6_run_gc+0x9d/0x3c0 net/ipv6/ip6_fib.c:2007
#2: (rcu_read_lock){....}, at: [<0000000091db762d>] __fib6_clean_all+0x0/0x3a0 net/ipv6/ip6_fib.c:1560
#3: (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] spin_lock_bh include/linux/spinlock.h:315 [inline]
#3: (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] __fib6_clean_all+0x1d0/0x3a0 net/ipv6/ip6_fib.c:1948
stack backtrace:
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.15.0-rc5+ #171
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
<IRQ>
__dump_stack lib/dump_stack.c:17 [inline]
dump_stack+0x194/0x257 lib/dump_stack.c:53
lockdep_rcu_suspicious+0x123/0x170 kernel/locking/lockdep.c:4585
fib6_del+0xcaa/0x11b0 net/ipv6/ip6_fib.c:1701
fib6_clean_node+0x3aa/0x4f0 net/ipv6/ip6_fib.c:1892
fib6_walk_continue+0x46c/0x8a0 net/ipv6/ip6_fib.c:1815
fib6_walk+0x91/0xf0 net/ipv6/ip6_fib.c:1863
fib6_clean_tree+0x1e6/0x340 net/ipv6/ip6_fib.c:1933
__fib6_clean_all+0x1f4/0x3a0 net/ipv6/ip6_fib.c:1949
fib6_clean_all net/ipv6/ip6_fib.c:1960 [inline]
fib6_run_gc+0x16b/0x3c0 net/ipv6/ip6_fib.c:2016
fib6_gc_timer_cb+0x20/0x30 net/ipv6/ip6_fib.c:2033
call_timer_fn+0x228/0x820 kernel/time/timer.c:1320
expire_timers kernel/time/timer.c:1357 [inline]
__run_timers+0x7ee/0xb70 kernel/time/timer.c:1660
run_timer_softirq+0x4c/0xb0 kernel/time/timer.c:1686
__do_softirq+0x2d7/0xb85 kernel/softirq.c:285
invoke_softirq kernel/softirq.c:365 [inline]
irq_exit+0x1cc/0x200 kernel/softirq.c:405
exiting_irq arch/x86/include/asm/apic.h:540 [inline]
smp_apic_timer_interrupt+0x16b/0x700 arch/x86/kernel/apic/apic.c:1052
apic_timer_interrupt+0xa9/0xb0 arch/x86/entry/entry_64.S:904
</IRQ>
Reported-by: syzbot <syzkaller@googlegroups.com>
Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table")
Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-01-09 01:34:00 +07:00
|
|
|
if (fn &&
|
|
|
|
(!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) ||
|
|
|
|
(fn->fn_flags & RTN_TL_ROOT &&
|
|
|
|
!rcu_access_pointer(fn->leaf))))
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
fib6_repair_tree(info->nl_net, table, fn);
|
2005-04-17 05:20:36 +07:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Routing tree lookup
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
struct lookup_args {
|
2018-04-18 07:33:26 +07:00
|
|
|
int offset; /* key offset on fib6_info */
|
2011-04-22 11:53:02 +07:00
|
|
|
const struct in6_addr *addr; /* search key */
|
2005-04-17 05:20:36 +07:00
|
|
|
};
|
|
|
|
|
2018-05-10 10:34:19 +07:00
|
|
|
static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root,
|
|
|
|
struct lookup_args *args)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
|
|
|
struct fib6_node *fn;
|
2006-11-15 11:56:00 +07:00
|
|
|
__be32 dir;
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2006-08-24 07:21:29 +07:00
|
|
|
if (unlikely(args->offset == 0))
|
|
|
|
return NULL;
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
/*
|
|
|
|
* Descend on a tree
|
|
|
|
*/
|
|
|
|
|
|
|
|
fn = root;
|
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
struct fib6_node *next;
|
|
|
|
|
|
|
|
dir = addr_bit_set(args->addr, fn->fn_bit);
|
|
|
|
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
next = dir ? rcu_dereference(fn->right) :
|
|
|
|
rcu_dereference(fn->left);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
if (next) {
|
|
|
|
fn = next;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2011-12-04 05:50:45 +07:00
|
|
|
while (fn) {
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
struct fib6_node *subtree = FIB6_SUBTREE(fn);
|
|
|
|
|
|
|
|
if (subtree || fn->fn_flags & RTN_RTINFO) {
|
2018-04-18 07:33:26 +07:00
|
|
|
struct fib6_info *leaf = rcu_dereference(fn->leaf);
|
2005-04-17 05:20:36 +07:00
|
|
|
struct rt6key *key;
|
|
|
|
|
2017-10-07 02:06:08 +07:00
|
|
|
if (!leaf)
|
|
|
|
goto backtrack;
|
|
|
|
|
|
|
|
key = (struct rt6key *) ((u8 *)leaf + args->offset);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2006-08-24 07:21:12 +07:00
|
|
|
if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) {
|
|
|
|
#ifdef CONFIG_IPV6_SUBTREES
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
if (subtree) {
|
2013-08-07 07:34:31 +07:00
|
|
|
struct fib6_node *sfn;
|
2018-05-10 10:34:19 +07:00
|
|
|
sfn = fib6_node_lookup_1(subtree,
|
|
|
|
args + 1);
|
2013-08-07 07:34:31 +07:00
|
|
|
if (!sfn)
|
|
|
|
goto backtrack;
|
|
|
|
fn = sfn;
|
|
|
|
}
|
2006-08-24 07:21:12 +07:00
|
|
|
#endif
|
2013-08-07 07:34:31 +07:00
|
|
|
if (fn->fn_flags & RTN_RTINFO)
|
2006-08-24 07:21:12 +07:00
|
|
|
return fn;
|
|
|
|
}
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
2013-08-07 07:34:31 +07:00
|
|
|
backtrack:
|
2006-08-24 07:21:12 +07:00
|
|
|
if (fn->fn_flags & RTN_ROOT)
|
|
|
|
break;
|
|
|
|
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
fn = rcu_dereference(fn->parent);
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
/* called with rcu_read_lock() held
|
|
|
|
*/
|
2018-05-10 10:34:19 +07:00
|
|
|
struct fib6_node *fib6_node_lookup(struct fib6_node *root,
|
|
|
|
const struct in6_addr *daddr,
|
|
|
|
const struct in6_addr *saddr)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
|
|
|
struct fib6_node *fn;
|
2006-08-24 07:21:29 +07:00
|
|
|
struct lookup_args args[] = {
|
|
|
|
{
|
2018-04-19 05:38:59 +07:00
|
|
|
.offset = offsetof(struct fib6_info, fib6_dst),
|
2006-08-24 07:21:29 +07:00
|
|
|
.addr = daddr,
|
|
|
|
},
|
2005-04-17 05:20:36 +07:00
|
|
|
#ifdef CONFIG_IPV6_SUBTREES
|
2006-08-24 07:21:29 +07:00
|
|
|
{
|
2018-04-19 05:38:59 +07:00
|
|
|
.offset = offsetof(struct fib6_info, fib6_src),
|
2006-08-24 07:21:29 +07:00
|
|
|
.addr = saddr,
|
|
|
|
},
|
2005-04-17 05:20:36 +07:00
|
|
|
#endif
|
2006-08-24 07:21:29 +07:00
|
|
|
{
|
|
|
|
.offset = 0, /* sentinel */
|
|
|
|
}
|
|
|
|
};
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2018-05-10 10:34:19 +07:00
|
|
|
fn = fib6_node_lookup_1(root, daddr ? args : args + 1);
|
2011-12-04 05:50:45 +07:00
|
|
|
if (!fn || fn->fn_flags & RTN_TL_ROOT)
|
2005-04-17 05:20:36 +07:00
|
|
|
fn = root;
|
|
|
|
|
|
|
|
return fn;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get node with specified destination prefix (and source prefix,
|
|
|
|
* if subtrees are used)
|
2017-10-07 02:06:02 +07:00
|
|
|
* exact_match == true means we try to find fn with exact match of
|
|
|
|
* the passed in prefix addr
|
|
|
|
* exact_match == false means we try to find fn with longest prefix
|
|
|
|
* match of the passed in prefix addr. This is useful for finding fn
|
|
|
|
* for cached route as it will be stored in the exception table under
|
|
|
|
* the node with longest prefix length.
|
2005-04-17 05:20:36 +07:00
|
|
|
*/
|
|
|
|
|
|
|
|
|
2014-03-28 11:07:04 +07:00
|
|
|
static struct fib6_node *fib6_locate_1(struct fib6_node *root,
|
|
|
|
const struct in6_addr *addr,
|
2017-10-07 02:06:02 +07:00
|
|
|
int plen, int offset,
|
|
|
|
bool exact_match)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
2017-10-07 02:06:02 +07:00
|
|
|
struct fib6_node *fn, *prev = NULL;
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
for (fn = root; fn ; ) {
|
2018-04-18 07:33:26 +07:00
|
|
|
struct fib6_info *leaf = rcu_dereference(fn->leaf);
|
2017-10-07 02:06:08 +07:00
|
|
|
struct rt6key *key;
|
|
|
|
|
|
|
|
/* This node is being deleted */
|
|
|
|
if (!leaf) {
|
|
|
|
if (plen <= fn->fn_bit)
|
|
|
|
goto out;
|
|
|
|
else
|
|
|
|
goto next;
|
|
|
|
}
|
|
|
|
|
|
|
|
key = (struct rt6key *)((u8 *)leaf + offset);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Prefix match
|
|
|
|
*/
|
|
|
|
if (plen < fn->fn_bit ||
|
|
|
|
!ipv6_prefix_equal(&key->addr, addr, fn->fn_bit))
|
2017-10-07 02:06:02 +07:00
|
|
|
goto out;
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
if (plen == fn->fn_bit)
|
|
|
|
return fn;
|
|
|
|
|
2019-06-21 22:45:28 +07:00
|
|
|
if (fn->fn_flags & RTN_RTINFO)
|
|
|
|
prev = fn;
|
2017-10-07 02:06:02 +07:00
|
|
|
|
2017-10-07 02:06:08 +07:00
|
|
|
next:
|
2005-04-17 05:20:36 +07:00
|
|
|
/*
|
|
|
|
* We have more bits to go
|
|
|
|
*/
|
|
|
|
if (addr_bit_set(addr, fn->fn_bit))
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
fn = rcu_dereference(fn->right);
|
2005-04-17 05:20:36 +07:00
|
|
|
else
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
fn = rcu_dereference(fn->left);
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
2017-10-07 02:06:02 +07:00
|
|
|
out:
|
|
|
|
if (exact_match)
|
|
|
|
return NULL;
|
|
|
|
else
|
|
|
|
return prev;
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
|
2014-03-28 11:07:04 +07:00
|
|
|
struct fib6_node *fib6_locate(struct fib6_node *root,
|
|
|
|
const struct in6_addr *daddr, int dst_len,
|
2017-10-07 02:06:02 +07:00
|
|
|
const struct in6_addr *saddr, int src_len,
|
|
|
|
bool exact_match)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
|
|
|
struct fib6_node *fn;
|
|
|
|
|
|
|
|
fn = fib6_locate_1(root, daddr, dst_len,
|
2018-04-19 05:38:59 +07:00
|
|
|
offsetof(struct fib6_info, fib6_dst),
|
2017-10-07 02:06:02 +07:00
|
|
|
exact_match);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
#ifdef CONFIG_IPV6_SUBTREES
|
|
|
|
if (src_len) {
|
2008-07-26 11:43:18 +07:00
|
|
|
WARN_ON(saddr == NULL);
|
2017-10-14 05:01:08 +07:00
|
|
|
if (fn) {
|
|
|
|
struct fib6_node *subtree = FIB6_SUBTREE(fn);
|
|
|
|
|
|
|
|
if (subtree) {
|
|
|
|
fn = fib6_locate_1(subtree, saddr, src_len,
|
2018-04-19 05:38:59 +07:00
|
|
|
offsetof(struct fib6_info, fib6_src),
|
2017-10-07 02:06:02 +07:00
|
|
|
exact_match);
|
2017-10-14 05:01:08 +07:00
|
|
|
}
|
|
|
|
}
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2011-12-04 05:50:45 +07:00
|
|
|
if (fn && fn->fn_flags & RTN_RTINFO)
|
2005-04-17 05:20:36 +07:00
|
|
|
return fn;
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Deletion
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
2018-04-18 07:33:26 +07:00
|
|
|
static struct fib6_info *fib6_find_prefix(struct net *net,
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
struct fib6_table *table,
|
|
|
|
struct fib6_node *fn)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
struct fib6_node *child_left, *child_right;
|
|
|
|
|
2011-12-04 05:50:45 +07:00
|
|
|
if (fn->fn_flags & RTN_ROOT)
|
2018-04-18 07:33:18 +07:00
|
|
|
return net->ipv6.fib6_null_entry;
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2011-12-04 05:50:45 +07:00
|
|
|
while (fn) {
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
child_left = rcu_dereference_protected(fn->left,
|
|
|
|
lockdep_is_held(&table->tb6_lock));
|
|
|
|
child_right = rcu_dereference_protected(fn->right,
|
|
|
|
lockdep_is_held(&table->tb6_lock));
|
|
|
|
if (child_left)
|
|
|
|
return rcu_dereference_protected(child_left->leaf,
|
|
|
|
lockdep_is_held(&table->tb6_lock));
|
|
|
|
if (child_right)
|
|
|
|
return rcu_dereference_protected(child_right->leaf,
|
|
|
|
lockdep_is_held(&table->tb6_lock));
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2006-08-24 07:22:24 +07:00
|
|
|
fn = FIB6_SUBTREE(fn);
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Called to trim the tree of intermediate nodes when possible. "fn"
|
|
|
|
* is the node we want to try and remove.
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
* Need to own table->tb6_lock
|
2005-04-17 05:20:36 +07:00
|
|
|
*/
|
|
|
|
|
2008-03-05 04:48:30 +07:00
|
|
|
static struct fib6_node *fib6_repair_tree(struct net *net,
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
struct fib6_table *table,
|
|
|
|
struct fib6_node *fn)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
|
|
|
int children;
|
|
|
|
int nstate;
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
struct fib6_node *child;
|
2014-10-07 00:58:34 +07:00
|
|
|
struct fib6_walker *w;
|
2005-04-17 05:20:36 +07:00
|
|
|
int iter = 0;
|
|
|
|
|
ipv6: remove null_entry before adding default route
In the current code, when creating a new fib6 table, tb6_root.leaf gets
initialized to net->ipv6.ip6_null_entry.
If a default route is being added with rt->rt6i_metric = 0xffffffff,
fib6_add() will add this route after net->ipv6.ip6_null_entry. As
null_entry is shared, it could cause problem.
In order to fix it, set fn->leaf to NULL before calling
fib6_add_rt2node() when trying to add the first default route.
And reset fn->leaf to null_entry when adding fails or when deleting the
last default route.
syzkaller reported the following issue which is fixed by this commit:
WARNING: suspicious RCU usage
4.15.0-rc5+ #171 Not tainted
-----------------------------
net/ipv6/ip6_fib.c:1702 suspicious rcu_dereference_protected() usage!
other info that might help us debug this:
rcu_scheduler_active = 2, debug_locks = 1
4 locks held by swapper/0/0:
#0: ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] lockdep_copy_map include/linux/lockdep.h:178 [inline]
#0: ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] call_timer_fn+0x1c6/0x820 kernel/time/timer.c:1310
#1: (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] spin_lock_bh include/linux/spinlock.h:315 [inline]
#1: (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] fib6_run_gc+0x9d/0x3c0 net/ipv6/ip6_fib.c:2007
#2: (rcu_read_lock){....}, at: [<0000000091db762d>] __fib6_clean_all+0x0/0x3a0 net/ipv6/ip6_fib.c:1560
#3: (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] spin_lock_bh include/linux/spinlock.h:315 [inline]
#3: (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] __fib6_clean_all+0x1d0/0x3a0 net/ipv6/ip6_fib.c:1948
stack backtrace:
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.15.0-rc5+ #171
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
<IRQ>
__dump_stack lib/dump_stack.c:17 [inline]
dump_stack+0x194/0x257 lib/dump_stack.c:53
lockdep_rcu_suspicious+0x123/0x170 kernel/locking/lockdep.c:4585
fib6_del+0xcaa/0x11b0 net/ipv6/ip6_fib.c:1701
fib6_clean_node+0x3aa/0x4f0 net/ipv6/ip6_fib.c:1892
fib6_walk_continue+0x46c/0x8a0 net/ipv6/ip6_fib.c:1815
fib6_walk+0x91/0xf0 net/ipv6/ip6_fib.c:1863
fib6_clean_tree+0x1e6/0x340 net/ipv6/ip6_fib.c:1933
__fib6_clean_all+0x1f4/0x3a0 net/ipv6/ip6_fib.c:1949
fib6_clean_all net/ipv6/ip6_fib.c:1960 [inline]
fib6_run_gc+0x16b/0x3c0 net/ipv6/ip6_fib.c:2016
fib6_gc_timer_cb+0x20/0x30 net/ipv6/ip6_fib.c:2033
call_timer_fn+0x228/0x820 kernel/time/timer.c:1320
expire_timers kernel/time/timer.c:1357 [inline]
__run_timers+0x7ee/0xb70 kernel/time/timer.c:1660
run_timer_softirq+0x4c/0xb0 kernel/time/timer.c:1686
__do_softirq+0x2d7/0xb85 kernel/softirq.c:285
invoke_softirq kernel/softirq.c:365 [inline]
irq_exit+0x1cc/0x200 kernel/softirq.c:405
exiting_irq arch/x86/include/asm/apic.h:540 [inline]
smp_apic_timer_interrupt+0x16b/0x700 arch/x86/kernel/apic/apic.c:1052
apic_timer_interrupt+0xa9/0xb0 arch/x86/entry/entry_64.S:904
</IRQ>
Reported-by: syzbot <syzkaller@googlegroups.com>
Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table")
Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-01-09 01:34:00 +07:00
|
|
|
/* Set fn->leaf to null_entry for root node. */
|
|
|
|
if (fn->fn_flags & RTN_TL_ROOT) {
|
2018-04-18 07:33:18 +07:00
|
|
|
rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry);
|
ipv6: remove null_entry before adding default route
In the current code, when creating a new fib6 table, tb6_root.leaf gets
initialized to net->ipv6.ip6_null_entry.
If a default route is being added with rt->rt6i_metric = 0xffffffff,
fib6_add() will add this route after net->ipv6.ip6_null_entry. As
null_entry is shared, it could cause problem.
In order to fix it, set fn->leaf to NULL before calling
fib6_add_rt2node() when trying to add the first default route.
And reset fn->leaf to null_entry when adding fails or when deleting the
last default route.
syzkaller reported the following issue which is fixed by this commit:
WARNING: suspicious RCU usage
4.15.0-rc5+ #171 Not tainted
-----------------------------
net/ipv6/ip6_fib.c:1702 suspicious rcu_dereference_protected() usage!
other info that might help us debug this:
rcu_scheduler_active = 2, debug_locks = 1
4 locks held by swapper/0/0:
#0: ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] lockdep_copy_map include/linux/lockdep.h:178 [inline]
#0: ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] call_timer_fn+0x1c6/0x820 kernel/time/timer.c:1310
#1: (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] spin_lock_bh include/linux/spinlock.h:315 [inline]
#1: (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] fib6_run_gc+0x9d/0x3c0 net/ipv6/ip6_fib.c:2007
#2: (rcu_read_lock){....}, at: [<0000000091db762d>] __fib6_clean_all+0x0/0x3a0 net/ipv6/ip6_fib.c:1560
#3: (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] spin_lock_bh include/linux/spinlock.h:315 [inline]
#3: (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] __fib6_clean_all+0x1d0/0x3a0 net/ipv6/ip6_fib.c:1948
stack backtrace:
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.15.0-rc5+ #171
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
<IRQ>
__dump_stack lib/dump_stack.c:17 [inline]
dump_stack+0x194/0x257 lib/dump_stack.c:53
lockdep_rcu_suspicious+0x123/0x170 kernel/locking/lockdep.c:4585
fib6_del+0xcaa/0x11b0 net/ipv6/ip6_fib.c:1701
fib6_clean_node+0x3aa/0x4f0 net/ipv6/ip6_fib.c:1892
fib6_walk_continue+0x46c/0x8a0 net/ipv6/ip6_fib.c:1815
fib6_walk+0x91/0xf0 net/ipv6/ip6_fib.c:1863
fib6_clean_tree+0x1e6/0x340 net/ipv6/ip6_fib.c:1933
__fib6_clean_all+0x1f4/0x3a0 net/ipv6/ip6_fib.c:1949
fib6_clean_all net/ipv6/ip6_fib.c:1960 [inline]
fib6_run_gc+0x16b/0x3c0 net/ipv6/ip6_fib.c:2016
fib6_gc_timer_cb+0x20/0x30 net/ipv6/ip6_fib.c:2033
call_timer_fn+0x228/0x820 kernel/time/timer.c:1320
expire_timers kernel/time/timer.c:1357 [inline]
__run_timers+0x7ee/0xb70 kernel/time/timer.c:1660
run_timer_softirq+0x4c/0xb0 kernel/time/timer.c:1686
__do_softirq+0x2d7/0xb85 kernel/softirq.c:285
invoke_softirq kernel/softirq.c:365 [inline]
irq_exit+0x1cc/0x200 kernel/softirq.c:405
exiting_irq arch/x86/include/asm/apic.h:540 [inline]
smp_apic_timer_interrupt+0x16b/0x700 arch/x86/kernel/apic/apic.c:1052
apic_timer_interrupt+0xa9/0xb0 arch/x86/entry/entry_64.S:904
</IRQ>
Reported-by: syzbot <syzkaller@googlegroups.com>
Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table")
Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-01-09 01:34:00 +07:00
|
|
|
return fn;
|
|
|
|
}
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
for (;;) {
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
struct fib6_node *fn_r = rcu_dereference_protected(fn->right,
|
|
|
|
lockdep_is_held(&table->tb6_lock));
|
|
|
|
struct fib6_node *fn_l = rcu_dereference_protected(fn->left,
|
|
|
|
lockdep_is_held(&table->tb6_lock));
|
|
|
|
struct fib6_node *pn = rcu_dereference_protected(fn->parent,
|
|
|
|
lockdep_is_held(&table->tb6_lock));
|
|
|
|
struct fib6_node *pn_r = rcu_dereference_protected(pn->right,
|
|
|
|
lockdep_is_held(&table->tb6_lock));
|
|
|
|
struct fib6_node *pn_l = rcu_dereference_protected(pn->left,
|
|
|
|
lockdep_is_held(&table->tb6_lock));
|
2018-04-18 07:33:26 +07:00
|
|
|
struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf,
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
lockdep_is_held(&table->tb6_lock));
|
2018-04-18 07:33:26 +07:00
|
|
|
struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
lockdep_is_held(&table->tb6_lock));
|
2018-04-18 07:33:26 +07:00
|
|
|
struct fib6_info *new_fn_leaf;
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
|
|
|
|
iter++;
|
|
|
|
|
2008-07-26 11:43:18 +07:00
|
|
|
WARN_ON(fn->fn_flags & RTN_RTINFO);
|
|
|
|
WARN_ON(fn->fn_flags & RTN_TL_ROOT);
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
WARN_ON(fn_leaf);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
children = 0;
|
|
|
|
child = NULL;
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
if (fn_r)
|
|
|
|
child = fn_r, children |= 1;
|
|
|
|
if (fn_l)
|
|
|
|
child = fn_l, children |= 2;
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2006-08-24 07:22:24 +07:00
|
|
|
if (children == 3 || FIB6_SUBTREE(fn)
|
2005-04-17 05:20:36 +07:00
|
|
|
#ifdef CONFIG_IPV6_SUBTREES
|
|
|
|
/* Subtree root (i.e. fn) may have one child */
|
2011-12-04 05:50:45 +07:00
|
|
|
|| (children && fn->fn_flags & RTN_ROOT)
|
2005-04-17 05:20:36 +07:00
|
|
|
#endif
|
|
|
|
) {
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
new_fn_leaf = fib6_find_prefix(net, table, fn);
|
2005-04-17 05:20:36 +07:00
|
|
|
#if RT6_DEBUG >= 2
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
if (!new_fn_leaf) {
|
|
|
|
WARN_ON(!new_fn_leaf);
|
2018-04-18 07:33:18 +07:00
|
|
|
new_fn_leaf = net->ipv6.fib6_null_entry;
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
#endif
|
2018-04-18 07:33:25 +07:00
|
|
|
fib6_info_hold(new_fn_leaf);
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
rcu_assign_pointer(fn->leaf, new_fn_leaf);
|
|
|
|
return pn;
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_IPV6_SUBTREES
|
2006-08-24 07:22:24 +07:00
|
|
|
if (FIB6_SUBTREE(pn) == fn) {
|
2008-07-26 11:43:18 +07:00
|
|
|
WARN_ON(!(fn->fn_flags & RTN_ROOT));
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
RCU_INIT_POINTER(pn->subtree, NULL);
|
2005-04-17 05:20:36 +07:00
|
|
|
nstate = FWS_L;
|
|
|
|
} else {
|
2008-07-26 11:43:18 +07:00
|
|
|
WARN_ON(fn->fn_flags & RTN_ROOT);
|
2005-04-17 05:20:36 +07:00
|
|
|
#endif
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
if (pn_r == fn)
|
|
|
|
rcu_assign_pointer(pn->right, child);
|
|
|
|
else if (pn_l == fn)
|
|
|
|
rcu_assign_pointer(pn->left, child);
|
2005-04-17 05:20:36 +07:00
|
|
|
#if RT6_DEBUG >= 2
|
2008-07-26 11:43:18 +07:00
|
|
|
else
|
|
|
|
WARN_ON(1);
|
2005-04-17 05:20:36 +07:00
|
|
|
#endif
|
|
|
|
if (child)
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
rcu_assign_pointer(child->parent, pn);
|
2005-04-17 05:20:36 +07:00
|
|
|
nstate = FWS_R;
|
|
|
|
#ifdef CONFIG_IPV6_SUBTREES
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2016-03-08 20:44:35 +07:00
|
|
|
read_lock(&net->ipv6.fib6_walker_lock);
|
|
|
|
FOR_WALKERS(net, w) {
|
2011-12-04 05:50:45 +07:00
|
|
|
if (!child) {
|
2017-10-07 02:06:03 +07:00
|
|
|
if (w->node == fn) {
|
2005-04-17 05:20:36 +07:00
|
|
|
RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate);
|
|
|
|
w->node = pn;
|
|
|
|
w->state = nstate;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (w->node == fn) {
|
|
|
|
w->node = child;
|
|
|
|
if (children&2) {
|
|
|
|
RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
|
2014-03-28 11:07:02 +07:00
|
|
|
w->state = w->state >= FWS_R ? FWS_U : FWS_INIT;
|
2005-04-17 05:20:36 +07:00
|
|
|
} else {
|
|
|
|
RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
|
2014-03-28 11:07:02 +07:00
|
|
|
w->state = w->state >= FWS_C ? FWS_U : FWS_INIT;
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2016-03-08 20:44:35 +07:00
|
|
|
read_unlock(&net->ipv6.fib6_walker_lock);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2017-10-07 02:06:11 +07:00
|
|
|
node_free(net, fn);
|
2011-12-04 05:50:45 +07:00
|
|
|
if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn))
|
2005-04-17 05:20:36 +07:00
|
|
|
return pn;
|
|
|
|
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
RCU_INIT_POINTER(pn->leaf, NULL);
|
2018-04-18 07:33:25 +07:00
|
|
|
fib6_info_release(pn_leaf);
|
2005-04-17 05:20:36 +07:00
|
|
|
fn = pn;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
|
2018-04-18 07:33:26 +07:00
|
|
|
struct fib6_info __rcu **rtp, struct nl_info *info)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
2014-10-07 00:58:34 +07:00
|
|
|
struct fib6_walker *w;
|
2018-04-18 07:33:26 +07:00
|
|
|
struct fib6_info *rt = rcu_dereference_protected(*rtp,
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
lockdep_is_held(&table->tb6_lock));
|
2008-03-04 14:34:17 +07:00
|
|
|
struct net *net = info->nl_net;
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
RT6_TRACE("fib6_del_route\n");
|
|
|
|
|
|
|
|
/* Unlink it */
|
2018-05-05 03:54:24 +07:00
|
|
|
*rtp = rt->fib6_next;
|
2018-04-19 05:38:59 +07:00
|
|
|
rt->fib6_node = NULL;
|
2008-03-04 14:34:17 +07:00
|
|
|
net->ipv6.rt6_stats->fib_rt_entries--;
|
|
|
|
net->ipv6.rt6_stats->fib_discarded_routes++;
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2017-10-07 02:06:03 +07:00
|
|
|
/* Flush all cached dst in exception table */
|
|
|
|
rt6_flush_exceptions(rt);
|
|
|
|
|
[IPV6]: Fix routing round-robin locking.
As per RFC2461, section 6.3.6, item #2, when no routers on the
matching list are known to be reachable or probably reachable we
do round robin on those available routes so that we make sure
to probe as many of them as possible to detect when one becomes
reachable faster.
Each routing table has a rwlock protecting the tree and the linked
list of routes at each leaf. The round robin code executes during
lookup and thus with the rwlock taken as a reader. A small local
spinlock tries to provide protection but this does not work at all
for two reasons:
1) The round-robin list manipulation, as coded, goes like this (with
read lock held):
walk routes finding head and tail
spin_lock();
rotate list using head and tail
spin_unlock();
While one thread is rotating the list, another thread can
end up with stale values of head and tail and then proceed
to corrupt the list when it gets the lock. This ends up causing
the OOPS in fib6_add() later onthat many people have been hitting.
2) All the other code paths that run with the rwlock held as
a reader do not expect the list to change on them, they
expect it to remain completely fixed while they hold the
lock in that way.
So, simply stated, it is impossible to implement this correctly using
a manipulation of the list without violating the rwlock locking
semantics.
Reimplement using a per-fib6_node round-robin pointer. This way we
don't need to manipulate the list at all, and since the round-robin
pointer can only ever point to real existing entries we don't need
to perform any locking on the changing of the round-robin pointer
itself. We only need to reset the round-robin pointer to NULL when
the entry it is pointing to is removed.
The idea is from Thomas Graf and it is very similar to how this
was implemented before the advanced router selection code when in.
Signed-off-by: David S. Miller <davem@davemloft.net>
2007-03-25 10:36:25 +07:00
|
|
|
/* Reset round-robin state, if necessary */
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
if (rcu_access_pointer(fn->rr_ptr) == rt)
|
[IPV6]: Fix routing round-robin locking.
As per RFC2461, section 6.3.6, item #2, when no routers on the
matching list are known to be reachable or probably reachable we
do round robin on those available routes so that we make sure
to probe as many of them as possible to detect when one becomes
reachable faster.
Each routing table has a rwlock protecting the tree and the linked
list of routes at each leaf. The round robin code executes during
lookup and thus with the rwlock taken as a reader. A small local
spinlock tries to provide protection but this does not work at all
for two reasons:
1) The round-robin list manipulation, as coded, goes like this (with
read lock held):
walk routes finding head and tail
spin_lock();
rotate list using head and tail
spin_unlock();
While one thread is rotating the list, another thread can
end up with stale values of head and tail and then proceed
to corrupt the list when it gets the lock. This ends up causing
the OOPS in fib6_add() later onthat many people have been hitting.
2) All the other code paths that run with the rwlock held as
a reader do not expect the list to change on them, they
expect it to remain completely fixed while they hold the
lock in that way.
So, simply stated, it is impossible to implement this correctly using
a manipulation of the list without violating the rwlock locking
semantics.
Reimplement using a per-fib6_node round-robin pointer. This way we
don't need to manipulate the list at all, and since the round-robin
pointer can only ever point to real existing entries we don't need
to perform any locking on the changing of the round-robin pointer
itself. We only need to reset the round-robin pointer to NULL when
the entry it is pointing to is removed.
The idea is from Thomas Graf and it is very similar to how this
was implemented before the advanced router selection code when in.
Signed-off-by: David S. Miller <davem@davemloft.net>
2007-03-25 10:36:25 +07:00
|
|
|
fn->rr_ptr = NULL;
|
|
|
|
|
2012-10-22 10:42:09 +07:00
|
|
|
/* Remove this entry from other siblings */
|
2018-04-19 05:38:59 +07:00
|
|
|
if (rt->fib6_nsiblings) {
|
2018-04-18 07:33:26 +07:00
|
|
|
struct fib6_info *sibling, *next_sibling;
|
2012-10-22 10:42:09 +07:00
|
|
|
|
|
|
|
list_for_each_entry_safe(sibling, next_sibling,
|
2018-04-19 05:38:59 +07:00
|
|
|
&rt->fib6_siblings, fib6_siblings)
|
|
|
|
sibling->fib6_nsiblings--;
|
|
|
|
rt->fib6_nsiblings = 0;
|
|
|
|
list_del_init(&rt->fib6_siblings);
|
2018-01-09 21:40:25 +07:00
|
|
|
rt6_multipath_rebalance(next_sibling);
|
2012-10-22 10:42:09 +07:00
|
|
|
}
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
/* Adjust walkers */
|
2016-03-08 20:44:35 +07:00
|
|
|
read_lock(&net->ipv6.fib6_walker_lock);
|
|
|
|
FOR_WALKERS(net, w) {
|
2005-04-17 05:20:36 +07:00
|
|
|
if (w->state == FWS_C && w->leaf == rt) {
|
|
|
|
RT6_TRACE("walker %p adjusted by delroute\n", w);
|
2018-05-05 03:54:24 +07:00
|
|
|
w->leaf = rcu_dereference_protected(rt->fib6_next,
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
lockdep_is_held(&table->tb6_lock));
|
2011-12-04 05:50:45 +07:00
|
|
|
if (!w->leaf)
|
2005-04-17 05:20:36 +07:00
|
|
|
w->state = FWS_U;
|
|
|
|
}
|
|
|
|
}
|
2016-03-08 20:44:35 +07:00
|
|
|
read_unlock(&net->ipv6.fib6_walker_lock);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
ipv6: remove null_entry before adding default route
In the current code, when creating a new fib6 table, tb6_root.leaf gets
initialized to net->ipv6.ip6_null_entry.
If a default route is being added with rt->rt6i_metric = 0xffffffff,
fib6_add() will add this route after net->ipv6.ip6_null_entry. As
null_entry is shared, it could cause problem.
In order to fix it, set fn->leaf to NULL before calling
fib6_add_rt2node() when trying to add the first default route.
And reset fn->leaf to null_entry when adding fails or when deleting the
last default route.
syzkaller reported the following issue which is fixed by this commit:
WARNING: suspicious RCU usage
4.15.0-rc5+ #171 Not tainted
-----------------------------
net/ipv6/ip6_fib.c:1702 suspicious rcu_dereference_protected() usage!
other info that might help us debug this:
rcu_scheduler_active = 2, debug_locks = 1
4 locks held by swapper/0/0:
#0: ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] lockdep_copy_map include/linux/lockdep.h:178 [inline]
#0: ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] call_timer_fn+0x1c6/0x820 kernel/time/timer.c:1310
#1: (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] spin_lock_bh include/linux/spinlock.h:315 [inline]
#1: (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] fib6_run_gc+0x9d/0x3c0 net/ipv6/ip6_fib.c:2007
#2: (rcu_read_lock){....}, at: [<0000000091db762d>] __fib6_clean_all+0x0/0x3a0 net/ipv6/ip6_fib.c:1560
#3: (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] spin_lock_bh include/linux/spinlock.h:315 [inline]
#3: (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] __fib6_clean_all+0x1d0/0x3a0 net/ipv6/ip6_fib.c:1948
stack backtrace:
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.15.0-rc5+ #171
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
<IRQ>
__dump_stack lib/dump_stack.c:17 [inline]
dump_stack+0x194/0x257 lib/dump_stack.c:53
lockdep_rcu_suspicious+0x123/0x170 kernel/locking/lockdep.c:4585
fib6_del+0xcaa/0x11b0 net/ipv6/ip6_fib.c:1701
fib6_clean_node+0x3aa/0x4f0 net/ipv6/ip6_fib.c:1892
fib6_walk_continue+0x46c/0x8a0 net/ipv6/ip6_fib.c:1815
fib6_walk+0x91/0xf0 net/ipv6/ip6_fib.c:1863
fib6_clean_tree+0x1e6/0x340 net/ipv6/ip6_fib.c:1933
__fib6_clean_all+0x1f4/0x3a0 net/ipv6/ip6_fib.c:1949
fib6_clean_all net/ipv6/ip6_fib.c:1960 [inline]
fib6_run_gc+0x16b/0x3c0 net/ipv6/ip6_fib.c:2016
fib6_gc_timer_cb+0x20/0x30 net/ipv6/ip6_fib.c:2033
call_timer_fn+0x228/0x820 kernel/time/timer.c:1320
expire_timers kernel/time/timer.c:1357 [inline]
__run_timers+0x7ee/0xb70 kernel/time/timer.c:1660
run_timer_softirq+0x4c/0xb0 kernel/time/timer.c:1686
__do_softirq+0x2d7/0xb85 kernel/softirq.c:285
invoke_softirq kernel/softirq.c:365 [inline]
irq_exit+0x1cc/0x200 kernel/softirq.c:405
exiting_irq arch/x86/include/asm/apic.h:540 [inline]
smp_apic_timer_interrupt+0x16b/0x700 arch/x86/kernel/apic/apic.c:1052
apic_timer_interrupt+0xa9/0xb0 arch/x86/entry/entry_64.S:904
</IRQ>
Reported-by: syzbot <syzkaller@googlegroups.com>
Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table")
Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-01-09 01:34:00 +07:00
|
|
|
/* If it was last route, call fib6_repair_tree() to:
|
|
|
|
* 1. For root node, put back null_entry as how the table was created.
|
|
|
|
* 2. For other nodes, expunge its radix tree node.
|
|
|
|
*/
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
if (!rcu_access_pointer(fn->leaf)) {
|
ipv6: remove null_entry before adding default route
In the current code, when creating a new fib6 table, tb6_root.leaf gets
initialized to net->ipv6.ip6_null_entry.
If a default route is being added with rt->rt6i_metric = 0xffffffff,
fib6_add() will add this route after net->ipv6.ip6_null_entry. As
null_entry is shared, it could cause problem.
In order to fix it, set fn->leaf to NULL before calling
fib6_add_rt2node() when trying to add the first default route.
And reset fn->leaf to null_entry when adding fails or when deleting the
last default route.
syzkaller reported the following issue which is fixed by this commit:
WARNING: suspicious RCU usage
4.15.0-rc5+ #171 Not tainted
-----------------------------
net/ipv6/ip6_fib.c:1702 suspicious rcu_dereference_protected() usage!
other info that might help us debug this:
rcu_scheduler_active = 2, debug_locks = 1
4 locks held by swapper/0/0:
#0: ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] lockdep_copy_map include/linux/lockdep.h:178 [inline]
#0: ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] call_timer_fn+0x1c6/0x820 kernel/time/timer.c:1310
#1: (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] spin_lock_bh include/linux/spinlock.h:315 [inline]
#1: (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] fib6_run_gc+0x9d/0x3c0 net/ipv6/ip6_fib.c:2007
#2: (rcu_read_lock){....}, at: [<0000000091db762d>] __fib6_clean_all+0x0/0x3a0 net/ipv6/ip6_fib.c:1560
#3: (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] spin_lock_bh include/linux/spinlock.h:315 [inline]
#3: (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] __fib6_clean_all+0x1d0/0x3a0 net/ipv6/ip6_fib.c:1948
stack backtrace:
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.15.0-rc5+ #171
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
<IRQ>
__dump_stack lib/dump_stack.c:17 [inline]
dump_stack+0x194/0x257 lib/dump_stack.c:53
lockdep_rcu_suspicious+0x123/0x170 kernel/locking/lockdep.c:4585
fib6_del+0xcaa/0x11b0 net/ipv6/ip6_fib.c:1701
fib6_clean_node+0x3aa/0x4f0 net/ipv6/ip6_fib.c:1892
fib6_walk_continue+0x46c/0x8a0 net/ipv6/ip6_fib.c:1815
fib6_walk+0x91/0xf0 net/ipv6/ip6_fib.c:1863
fib6_clean_tree+0x1e6/0x340 net/ipv6/ip6_fib.c:1933
__fib6_clean_all+0x1f4/0x3a0 net/ipv6/ip6_fib.c:1949
fib6_clean_all net/ipv6/ip6_fib.c:1960 [inline]
fib6_run_gc+0x16b/0x3c0 net/ipv6/ip6_fib.c:2016
fib6_gc_timer_cb+0x20/0x30 net/ipv6/ip6_fib.c:2033
call_timer_fn+0x228/0x820 kernel/time/timer.c:1320
expire_timers kernel/time/timer.c:1357 [inline]
__run_timers+0x7ee/0xb70 kernel/time/timer.c:1660
run_timer_softirq+0x4c/0xb0 kernel/time/timer.c:1686
__do_softirq+0x2d7/0xb85 kernel/softirq.c:285
invoke_softirq kernel/softirq.c:365 [inline]
irq_exit+0x1cc/0x200 kernel/softirq.c:405
exiting_irq arch/x86/include/asm/apic.h:540 [inline]
smp_apic_timer_interrupt+0x16b/0x700 arch/x86/kernel/apic/apic.c:1052
apic_timer_interrupt+0xa9/0xb0 arch/x86/entry/entry_64.S:904
</IRQ>
Reported-by: syzbot <syzkaller@googlegroups.com>
Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table")
Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-01-09 01:34:00 +07:00
|
|
|
if (!(fn->fn_flags & RTN_TL_ROOT)) {
|
|
|
|
fn->fn_flags &= ~RTN_RTINFO;
|
|
|
|
net->ipv6.rt6_stats->fib_route_nodes--;
|
|
|
|
}
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
fn = fib6_repair_tree(net, table, fn);
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
|
2015-01-26 21:11:17 +07:00
|
|
|
fib6_purge_rt(rt, fn, net);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2019-06-18 22:12:57 +07:00
|
|
|
if (!info->skip_notify_kernel)
|
|
|
|
call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL);
|
2017-02-03 03:37:11 +07:00
|
|
|
if (!info->skip_notify)
|
|
|
|
inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
|
2019-06-18 22:12:57 +07:00
|
|
|
|
2018-04-18 07:33:25 +07:00
|
|
|
fib6_info_release(rt);
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
/* Need to own table->tb6_lock */
|
2018-04-18 07:33:26 +07:00
|
|
|
int fib6_del(struct fib6_info *rt, struct nl_info *info)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
2018-04-19 05:38:59 +07:00
|
|
|
struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
|
|
|
|
lockdep_is_held(&rt->fib6_table->tb6_lock));
|
|
|
|
struct fib6_table *table = rt->fib6_table;
|
2008-03-05 04:48:30 +07:00
|
|
|
struct net *net = info->nl_net;
|
2018-04-18 07:33:26 +07:00
|
|
|
struct fib6_info __rcu **rtp;
|
|
|
|
struct fib6_info __rcu **rtp_next;
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2018-04-18 07:33:18 +07:00
|
|
|
if (!fn || rt == net->ipv6.fib6_null_entry)
|
2005-04-17 05:20:36 +07:00
|
|
|
return -ENOENT;
|
|
|
|
|
2008-07-26 11:43:18 +07:00
|
|
|
WARN_ON(!(fn->fn_flags & RTN_RTINFO));
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Walk the leaf entries looking for ourself
|
|
|
|
*/
|
|
|
|
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
for (rtp = &fn->leaf; *rtp; rtp = rtp_next) {
|
2018-04-18 07:33:26 +07:00
|
|
|
struct fib6_info *cur = rcu_dereference_protected(*rtp,
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
lockdep_is_held(&table->tb6_lock));
|
|
|
|
if (rt == cur) {
|
|
|
|
fib6_del_route(table, fn, rtp, info);
|
2005-04-17 05:20:36 +07:00
|
|
|
return 0;
|
|
|
|
}
|
2018-05-05 03:54:24 +07:00
|
|
|
rtp_next = &cur->fib6_next;
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
return -ENOENT;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Tree traversal function.
|
|
|
|
*
|
|
|
|
* Certainly, it is not interrupt safe.
|
|
|
|
* However, it is internally reenterable wrt itself and fib6_add/fib6_del.
|
|
|
|
* It means, that we can modify tree during walking
|
|
|
|
* and use this function for garbage collection, clone pruning,
|
2007-02-09 21:24:49 +07:00
|
|
|
* cleaning tree when a device goes down etc. etc.
|
2005-04-17 05:20:36 +07:00
|
|
|
*
|
|
|
|
* It guarantees that every node will be traversed,
|
|
|
|
* and that it will be traversed only once.
|
|
|
|
*
|
|
|
|
* Callback function w->func may return:
|
|
|
|
* 0 -> continue walking.
|
|
|
|
* positive value -> walking is suspended (used by tree dumps,
|
|
|
|
* and probably by gc, if it will be split to several slices)
|
|
|
|
* negative value -> terminate walking.
|
|
|
|
*
|
|
|
|
* The function itself returns:
|
|
|
|
* 0 -> walk is complete.
|
|
|
|
* >0 -> walk is incomplete (i.e. suspended)
|
|
|
|
* <0 -> walk is terminated by an error.
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
*
|
|
|
|
* This function is called with tb6_lock held.
|
2005-04-17 05:20:36 +07:00
|
|
|
*/
|
|
|
|
|
2014-10-07 00:58:34 +07:00
|
|
|
static int fib6_walk_continue(struct fib6_walker *w)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
struct fib6_node *fn, *pn, *left, *right;
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2017-10-07 02:06:03 +07:00
|
|
|
/* w->root should always be table->tb6_root */
|
|
|
|
WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT));
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
for (;;) {
|
|
|
|
fn = w->node;
|
2011-12-04 05:50:45 +07:00
|
|
|
if (!fn)
|
2005-04-17 05:20:36 +07:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
switch (w->state) {
|
|
|
|
#ifdef CONFIG_IPV6_SUBTREES
|
|
|
|
case FWS_S:
|
2006-08-24 07:22:24 +07:00
|
|
|
if (FIB6_SUBTREE(fn)) {
|
|
|
|
w->node = FIB6_SUBTREE(fn);
|
2005-04-17 05:20:36 +07:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
w->state = FWS_L;
|
2007-02-09 21:24:49 +07:00
|
|
|
#endif
|
2017-10-17 04:36:52 +07:00
|
|
|
/* fall through */
|
2005-04-17 05:20:36 +07:00
|
|
|
case FWS_L:
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
left = rcu_dereference_protected(fn->left, 1);
|
|
|
|
if (left) {
|
|
|
|
w->node = left;
|
2005-04-17 05:20:36 +07:00
|
|
|
w->state = FWS_INIT;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
w->state = FWS_R;
|
2017-10-17 04:36:52 +07:00
|
|
|
/* fall through */
|
2005-04-17 05:20:36 +07:00
|
|
|
case FWS_R:
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
right = rcu_dereference_protected(fn->right, 1);
|
|
|
|
if (right) {
|
|
|
|
w->node = right;
|
2005-04-17 05:20:36 +07:00
|
|
|
w->state = FWS_INIT;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
w->state = FWS_C;
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
w->leaf = rcu_dereference_protected(fn->leaf, 1);
|
2017-10-17 04:36:52 +07:00
|
|
|
/* fall through */
|
2005-04-17 05:20:36 +07:00
|
|
|
case FWS_C:
|
2011-12-04 05:50:45 +07:00
|
|
|
if (w->leaf && fn->fn_flags & RTN_RTINFO) {
|
2010-02-08 12:19:03 +07:00
|
|
|
int err;
|
|
|
|
|
2012-06-26 05:37:19 +07:00
|
|
|
if (w->skip) {
|
|
|
|
w->skip--;
|
2014-04-24 20:48:53 +07:00
|
|
|
goto skip;
|
2010-02-08 12:19:03 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
err = w->func(w);
|
2005-04-17 05:20:36 +07:00
|
|
|
if (err)
|
|
|
|
return err;
|
2010-02-08 12:19:03 +07:00
|
|
|
|
|
|
|
w->count++;
|
2005-04-17 05:20:36 +07:00
|
|
|
continue;
|
|
|
|
}
|
2014-04-24 20:48:53 +07:00
|
|
|
skip:
|
2005-04-17 05:20:36 +07:00
|
|
|
w->state = FWS_U;
|
2017-10-17 04:36:52 +07:00
|
|
|
/* fall through */
|
2005-04-17 05:20:36 +07:00
|
|
|
case FWS_U:
|
|
|
|
if (fn == w->root)
|
|
|
|
return 0;
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
pn = rcu_dereference_protected(fn->parent, 1);
|
|
|
|
left = rcu_dereference_protected(pn->left, 1);
|
|
|
|
right = rcu_dereference_protected(pn->right, 1);
|
2005-04-17 05:20:36 +07:00
|
|
|
w->node = pn;
|
|
|
|
#ifdef CONFIG_IPV6_SUBTREES
|
2006-08-24 07:22:24 +07:00
|
|
|
if (FIB6_SUBTREE(pn) == fn) {
|
2008-07-26 11:43:18 +07:00
|
|
|
WARN_ON(!(fn->fn_flags & RTN_ROOT));
|
2005-04-17 05:20:36 +07:00
|
|
|
w->state = FWS_L;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
#endif
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
if (left == fn) {
|
2005-04-17 05:20:36 +07:00
|
|
|
w->state = FWS_R;
|
|
|
|
continue;
|
|
|
|
}
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
if (right == fn) {
|
2005-04-17 05:20:36 +07:00
|
|
|
w->state = FWS_C;
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
w->leaf = rcu_dereference_protected(w->node->leaf, 1);
|
2005-04-17 05:20:36 +07:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
#if RT6_DEBUG >= 2
|
2008-07-26 11:43:18 +07:00
|
|
|
WARN_ON(1);
|
2005-04-17 05:20:36 +07:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-03-08 20:44:35 +07:00
|
|
|
static int fib6_walk(struct net *net, struct fib6_walker *w)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
|
|
|
int res;
|
|
|
|
|
|
|
|
w->state = FWS_INIT;
|
|
|
|
w->node = w->root;
|
|
|
|
|
2016-03-08 20:44:35 +07:00
|
|
|
fib6_walker_link(net, w);
|
2005-04-17 05:20:36 +07:00
|
|
|
res = fib6_walk_continue(w);
|
|
|
|
if (res <= 0)
|
2016-03-08 20:44:35 +07:00
|
|
|
fib6_walker_unlink(net, w);
|
2005-04-17 05:20:36 +07:00
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
2014-10-07 00:58:34 +07:00
|
|
|
static int fib6_clean_node(struct fib6_walker *w)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
|
|
|
int res;
|
2018-04-18 07:33:26 +07:00
|
|
|
struct fib6_info *rt;
|
2014-10-07 00:58:34 +07:00
|
|
|
struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
|
2008-03-04 14:31:57 +07:00
|
|
|
struct nl_info info = {
|
|
|
|
.nl_net = c->net,
|
2018-10-12 10:17:21 +07:00
|
|
|
.skip_notify = c->skip_notify,
|
2008-03-04 14:31:57 +07:00
|
|
|
};
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2014-10-07 00:58:38 +07:00
|
|
|
if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
|
|
|
|
w->node->fn_sernum != c->sernum)
|
|
|
|
w->node->fn_sernum = c->sernum;
|
|
|
|
|
|
|
|
if (!c->func) {
|
|
|
|
WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE);
|
|
|
|
w->leaf = NULL;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
for_each_fib6_walker_rt(w) {
|
2005-04-17 05:20:36 +07:00
|
|
|
res = c->func(rt, c->arg);
|
2018-01-07 17:45:12 +07:00
|
|
|
if (res == -1) {
|
2005-04-17 05:20:36 +07:00
|
|
|
w->leaf = rt;
|
2007-12-14 00:45:12 +07:00
|
|
|
res = fib6_del(rt, &info);
|
2005-04-17 05:20:36 +07:00
|
|
|
if (res) {
|
|
|
|
#if RT6_DEBUG >= 2
|
2012-05-15 21:11:54 +07:00
|
|
|
pr_debug("%s: del failed: rt=%p@%p err=%d\n",
|
2017-08-26 05:03:10 +07:00
|
|
|
__func__, rt,
|
2018-04-19 05:38:59 +07:00
|
|
|
rcu_access_pointer(rt->fib6_node),
|
2017-08-26 05:03:10 +07:00
|
|
|
res);
|
2005-04-17 05:20:36 +07:00
|
|
|
#endif
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
return 0;
|
2018-01-07 17:45:12 +07:00
|
|
|
} else if (res == -2) {
|
2018-04-19 05:38:59 +07:00
|
|
|
if (WARN_ON(!rt->fib6_nsiblings))
|
2018-01-07 17:45:12 +07:00
|
|
|
continue;
|
2018-04-19 05:38:59 +07:00
|
|
|
rt = list_last_entry(&rt->fib6_siblings,
|
|
|
|
struct fib6_info, fib6_siblings);
|
2018-01-07 17:45:12 +07:00
|
|
|
continue;
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
2008-07-26 11:43:18 +07:00
|
|
|
WARN_ON(res != 0);
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
w->leaf = rt;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Convenient frontend to tree walker.
|
2007-02-09 21:24:49 +07:00
|
|
|
*
|
2005-04-17 05:20:36 +07:00
|
|
|
* func is called on each route.
|
2018-01-07 17:45:12 +07:00
|
|
|
* It may return -2 -> skip multipath route.
|
|
|
|
* -1 -> delete this route.
|
2005-04-17 05:20:36 +07:00
|
|
|
* 0 -> continue walking
|
|
|
|
*/
|
|
|
|
|
2008-03-04 14:31:57 +07:00
|
|
|
static void fib6_clean_tree(struct net *net, struct fib6_node *root,
|
2018-04-18 07:33:26 +07:00
|
|
|
int (*func)(struct fib6_info *, void *arg),
|
2018-10-12 10:17:21 +07:00
|
|
|
int sernum, void *arg, bool skip_notify)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
2014-10-07 00:58:34 +07:00
|
|
|
struct fib6_cleaner c;
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
c.w.root = root;
|
|
|
|
c.w.func = fib6_clean_node;
|
2010-02-08 12:19:03 +07:00
|
|
|
c.w.count = 0;
|
|
|
|
c.w.skip = 0;
|
ipv6: Dump route exceptions if requested
Since commit 2b760fcf5cfb ("ipv6: hook up exception table to store dst
cache"), route exceptions reside in a separate hash table, and won't be
found by walking the FIB, so they won't be dumped to userspace on a
RTM_GETROUTE message.
This causes 'ip -6 route list cache' and 'ip -6 route flush cache' to
have no function anymore:
# ip -6 route get fc00:3::1
fc00:3::1 via fc00:1::2 dev veth_A-R1 src fc00:1::1 metric 1024 expires 539sec mtu 1400 pref medium
# ip -6 route get fc00:4::1
fc00:4::1 via fc00:2::2 dev veth_A-R2 src fc00:2::1 metric 1024 expires 536sec mtu 1500 pref medium
# ip -6 route list cache
# ip -6 route flush cache
# ip -6 route get fc00:3::1
fc00:3::1 via fc00:1::2 dev veth_A-R1 src fc00:1::1 metric 1024 expires 520sec mtu 1400 pref medium
# ip -6 route get fc00:4::1
fc00:4::1 via fc00:2::2 dev veth_A-R2 src fc00:2::1 metric 1024 expires 519sec mtu 1500 pref medium
because iproute2 lists cached routes using RTM_GETROUTE, and flushes them
by listing all the routes, and deleting them with RTM_DELROUTE one by one.
If cached routes are requested using the RTM_F_CLONED flag together with
strict checking, or if no strict checking is requested (and hence we can't
consistently apply filters), look up exceptions in the hash table
associated with the current fib6_info in rt6_dump_route(), and, if present
and not expired, add them to the dump.
We might be unable to dump all the entries for a given node in a single
message, so keep track of how many entries were handled for the current
node in fib6_walker, and skip that amount in case we start from the same
partially dumped node.
When a partial dump restarts, as the starting node might change when
'sernum' changes, we have no guarantee that we need to skip the same
amount of in-node entries. Therefore, we need two counters, and we need to
zero the in-node counter if the node from which the dump is resumed
differs.
Note that, with the current version of iproute2, this only fixes the
'ip -6 route list cache': on a flush command, iproute2 doesn't pass
RTM_F_CLONED and, due to this inconsistency, 'ip -6 route flush cache' is
still unable to fetch the routes to be flushed. This will be addressed in
a patch for iproute2.
To flush cached routes, a procfs entry could be introduced instead: that's
how it works for IPv4. We already have a rt6_flush_exception() function
ready to be wired to it. However, this would not solve the issue for
listing.
Versions of iproute2 and kernel tested:
iproute2
kernel 4.14.0 4.15.0 4.19.0 5.0.0 5.1.0 5.1.0, patched
3.18 list + + + + + +
flush + + + + + +
4.4 list + + + + + +
flush + + + + + +
4.9 list + + + + + +
flush + + + + + +
4.14 list + + + + + +
flush + + + + + +
4.15 list
flush
4.19 list
flush
5.0 list
flush
5.1 list
flush
with list + + + + + +
fix flush + + + +
v7:
- Explain usage of "skip" counters in commit message (suggested by
David Ahern)
v6:
- Rebase onto net-next, use recently introduced nexthop walker
- Make rt6_nh_dump_exceptions() a separate function (suggested by David
Ahern)
v5:
- Use dump_routes and dump_exceptions from filter, ignore NLM_F_MATCH,
update test results (flushing works with iproute2 < 5.0.0 now)
v4:
- Split NLM_F_MATCH and strict check handling in separate patches
- Filter routes using RTM_F_CLONED: if it's not set, only return
non-cached routes, and if it's set, only return cached routes:
change requested by David Ahern and Martin Lau. This implies that
iproute2 needs a separate patch to be able to flush IPv6 cached
routes. This is not ideal because we can't fix the breakage caused
by 2b760fcf5cfb entirely in kernel. However, two years have passed
since then, and this makes it more tolerable
v3:
- More descriptive comment about expired exceptions in rt6_dump_route()
- Swap return values of rt6_dump_route() (suggested by Martin Lau)
- Don't zero skip_in_node in case we don't dump anything in a given pass
(also suggested by Martin Lau)
- Remove check on RTM_F_CLONED altogether: in the current UAPI semantic,
it's just a flag to indicate the route was cloned, not to filter on
routes
v2: Add tracking of number of entries to be skipped in current node after
a partial dump. As we restart from the same node, if not all the
exceptions for a given node fit in a single message, the dump will
not terminate, as suggested by Martin Lau. This is a concrete
possibility, setting up a big number of exceptions for the same route
actually causes the issue, suggested by David Ahern.
Reported-by: Jianlin Shi <jishi@redhat.com>
Fixes: 2b760fcf5cfb ("ipv6: hook up exception table to store dst cache")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-21 22:45:27 +07:00
|
|
|
c.w.skip_in_node = 0;
|
2005-04-17 05:20:36 +07:00
|
|
|
c.func = func;
|
2014-10-07 00:58:38 +07:00
|
|
|
c.sernum = sernum;
|
2005-04-17 05:20:36 +07:00
|
|
|
c.arg = arg;
|
2008-03-04 14:31:57 +07:00
|
|
|
c.net = net;
|
2018-10-12 10:17:21 +07:00
|
|
|
c.skip_notify = skip_notify;
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2016-03-08 20:44:35 +07:00
|
|
|
fib6_walk(net, &c.w);
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
|
2014-10-07 00:58:38 +07:00
|
|
|
static void __fib6_clean_all(struct net *net,
|
2018-04-18 07:33:26 +07:00
|
|
|
int (*func)(struct fib6_info *, void *),
|
2018-10-12 10:17:21 +07:00
|
|
|
int sernum, void *arg, bool skip_notify)
|
2006-08-05 13:20:06 +07:00
|
|
|
{
|
|
|
|
struct fib6_table *table;
|
2008-03-04 14:25:27 +07:00
|
|
|
struct hlist_head *head;
|
2006-08-11 13:11:17 +07:00
|
|
|
unsigned int h;
|
2006-08-05 13:20:06 +07:00
|
|
|
|
2006-08-11 13:11:17 +07:00
|
|
|
rcu_read_lock();
|
2009-07-31 08:52:15 +07:00
|
|
|
for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
|
2008-03-04 14:27:06 +07:00
|
|
|
head = &net->ipv6.fib_table_hash[h];
|
hlist: drop the node parameter from iterators
I'm not sure why, but the hlist for each entry iterators were conceived
list_for_each_entry(pos, head, member)
The hlist ones were greedy and wanted an extra parameter:
hlist_for_each_entry(tpos, pos, head, member)
Why did they need an extra pos parameter? I'm not quite sure. Not only
they don't really need it, it also prevents the iterator from looking
exactly like the list iterator, which is unfortunate.
Besides the semantic patch, there was some manual work required:
- Fix up the actual hlist iterators in linux/list.h
- Fix up the declaration of other iterators based on the hlist ones.
- A very small amount of places were using the 'node' parameter, this
was modified to use 'obj->member' instead.
- Coccinelle didn't handle the hlist_for_each_entry_safe iterator
properly, so those had to be fixed up manually.
The semantic patch which is mostly the work of Peter Senna Tschudin is here:
@@
iterator name hlist_for_each_entry, hlist_for_each_entry_continue, hlist_for_each_entry_from, hlist_for_each_entry_rcu, hlist_for_each_entry_rcu_bh, hlist_for_each_entry_continue_rcu_bh, for_each_busy_worker, ax25_uid_for_each, ax25_for_each, inet_bind_bucket_for_each, sctp_for_each_hentry, sk_for_each, sk_for_each_rcu, sk_for_each_from, sk_for_each_safe, sk_for_each_bound, hlist_for_each_entry_safe, hlist_for_each_entry_continue_rcu, nr_neigh_for_each, nr_neigh_for_each_safe, nr_node_for_each, nr_node_for_each_safe, for_each_gfn_indirect_valid_sp, for_each_gfn_sp, for_each_host;
type T;
expression a,c,d,e;
identifier b;
statement S;
@@
-T b;
<+... when != b
(
hlist_for_each_entry(a,
- b,
c, d) S
|
hlist_for_each_entry_continue(a,
- b,
c) S
|
hlist_for_each_entry_from(a,
- b,
c) S
|
hlist_for_each_entry_rcu(a,
- b,
c, d) S
|
hlist_for_each_entry_rcu_bh(a,
- b,
c, d) S
|
hlist_for_each_entry_continue_rcu_bh(a,
- b,
c) S
|
for_each_busy_worker(a, c,
- b,
d) S
|
ax25_uid_for_each(a,
- b,
c) S
|
ax25_for_each(a,
- b,
c) S
|
inet_bind_bucket_for_each(a,
- b,
c) S
|
sctp_for_each_hentry(a,
- b,
c) S
|
sk_for_each(a,
- b,
c) S
|
sk_for_each_rcu(a,
- b,
c) S
|
sk_for_each_from
-(a, b)
+(a)
S
+ sk_for_each_from(a) S
|
sk_for_each_safe(a,
- b,
c, d) S
|
sk_for_each_bound(a,
- b,
c) S
|
hlist_for_each_entry_safe(a,
- b,
c, d, e) S
|
hlist_for_each_entry_continue_rcu(a,
- b,
c) S
|
nr_neigh_for_each(a,
- b,
c) S
|
nr_neigh_for_each_safe(a,
- b,
c, d) S
|
nr_node_for_each(a,
- b,
c) S
|
nr_node_for_each_safe(a,
- b,
c, d) S
|
- for_each_gfn_sp(a, c, d, b) S
+ for_each_gfn_sp(a, c, d) S
|
- for_each_gfn_indirect_valid_sp(a, c, d, b) S
+ for_each_gfn_indirect_valid_sp(a, c, d) S
|
for_each_host(a,
- b,
c) S
|
for_each_host_safe(a,
- b,
c, d) S
|
for_each_mesh_entry(a,
- b,
c, d) S
)
...+>
[akpm@linux-foundation.org: drop bogus change from net/ipv4/raw.c]
[akpm@linux-foundation.org: drop bogus hunk from net/ipv6/raw.c]
[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: fix warnings]
[akpm@linux-foudnation.org: redo intrusive kvm changes]
Tested-by: Peter Senna Tschudin <peter.senna@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-28 08:06:00 +07:00
|
|
|
hlist_for_each_entry_rcu(table, head, tb6_hlist) {
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
spin_lock_bh(&table->tb6_lock);
|
2008-03-04 14:31:57 +07:00
|
|
|
fib6_clean_tree(net, &table->tb6_root,
|
2018-10-12 10:17:21 +07:00
|
|
|
func, sernum, arg, skip_notify);
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
spin_unlock_bh(&table->tb6_lock);
|
2006-08-05 13:20:06 +07:00
|
|
|
}
|
|
|
|
}
|
2006-08-11 13:11:17 +07:00
|
|
|
rcu_read_unlock();
|
2006-08-05 13:20:06 +07:00
|
|
|
}
|
|
|
|
|
2018-04-18 07:33:26 +07:00
|
|
|
void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *),
|
2014-10-07 00:58:38 +07:00
|
|
|
void *arg)
|
|
|
|
{
|
2018-10-12 10:17:21 +07:00
|
|
|
__fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false);
|
|
|
|
}
|
|
|
|
|
|
|
|
void fib6_clean_all_skip_notify(struct net *net,
|
|
|
|
int (*func)(struct fib6_info *, void *),
|
|
|
|
void *arg)
|
|
|
|
{
|
|
|
|
__fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true);
|
2014-10-07 00:58:38 +07:00
|
|
|
}
|
|
|
|
|
2014-09-28 05:46:06 +07:00
|
|
|
static void fib6_flush_trees(struct net *net)
|
|
|
|
{
|
2014-10-07 00:58:37 +07:00
|
|
|
int new_sernum = fib6_new_sernum(net);
|
2014-09-28 05:46:06 +07:00
|
|
|
|
2018-10-12 10:17:21 +07:00
|
|
|
__fib6_clean_all(net, NULL, new_sernum, NULL, false);
|
2014-09-28 05:46:06 +07:00
|
|
|
}
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
/*
|
|
|
|
* Garbage collection
|
|
|
|
*/
|
|
|
|
|
2018-04-18 07:33:26 +07:00
|
|
|
static int fib6_age(struct fib6_info *rt, void *arg)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
2016-03-08 20:44:25 +07:00
|
|
|
struct fib6_gc_args *gc_args = arg;
|
2005-04-17 05:20:36 +07:00
|
|
|
unsigned long now = jiffies;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* check addrconf expiration here.
|
|
|
|
* Routes are expired even if they are in use.
|
|
|
|
*/
|
|
|
|
|
2018-04-19 05:38:59 +07:00
|
|
|
if (rt->fib6_flags & RTF_EXPIRES && rt->expires) {
|
2018-04-18 07:33:17 +07:00
|
|
|
if (time_after(now, rt->expires)) {
|
2005-04-17 05:20:36 +07:00
|
|
|
RT6_TRACE("expiring %p\n", rt);
|
|
|
|
return -1;
|
|
|
|
}
|
2016-03-08 20:44:25 +07:00
|
|
|
gc_args->more++;
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
|
2017-10-07 02:06:01 +07:00
|
|
|
/* Also age clones in the exception table.
|
|
|
|
* Note, that clones are aged out
|
|
|
|
* only if they are not in use now.
|
|
|
|
*/
|
|
|
|
rt6_age_exceptions(rt, gc_args, now);
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-08-01 15:04:14 +07:00
|
|
|
void fib6_run_gc(unsigned long expires, struct net *net, bool force)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
2016-03-08 20:44:25 +07:00
|
|
|
struct fib6_gc_args gc_args;
|
2013-08-01 15:04:24 +07:00
|
|
|
unsigned long now;
|
|
|
|
|
2013-08-01 15:04:14 +07:00
|
|
|
if (force) {
|
2016-03-08 20:44:45 +07:00
|
|
|
spin_lock_bh(&net->ipv6.fib6_gc_lock);
|
|
|
|
} else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) {
|
2013-08-01 15:04:14 +07:00
|
|
|
mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ);
|
|
|
|
return;
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
2013-08-01 15:04:14 +07:00
|
|
|
gc_args.timeout = expires ? (int)expires :
|
|
|
|
net->ipv6.sysctl.ip6_rt_gc_interval;
|
2017-06-18 00:42:37 +07:00
|
|
|
gc_args.more = 0;
|
2008-03-04 14:27:06 +07:00
|
|
|
|
2016-03-08 20:44:25 +07:00
|
|
|
fib6_clean_all(net, fib6_age, &gc_args);
|
2013-08-01 15:04:24 +07:00
|
|
|
now = jiffies;
|
|
|
|
net->ipv6.ip6_rt_last_gc = now;
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
if (gc_args.more)
|
2008-07-23 04:34:09 +07:00
|
|
|
mod_timer(&net->ipv6.ip6_fib_timer,
|
2013-08-01 15:04:24 +07:00
|
|
|
round_jiffies(now
|
2008-07-23 04:34:09 +07:00
|
|
|
+ net->ipv6.sysctl.ip6_rt_gc_interval));
|
2008-07-23 04:33:45 +07:00
|
|
|
else
|
|
|
|
del_timer(&net->ipv6.ip6_fib_timer);
|
2016-03-08 20:44:45 +07:00
|
|
|
spin_unlock_bh(&net->ipv6.fib6_gc_lock);
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
|
treewide: setup_timer() -> timer_setup() (2 field)
This converts all remaining setup_timer() calls that use a nested field
to reach a struct timer_list. Coccinelle does not have an easy way to
match multiple fields, so a new script is needed to change the matches of
"&_E->_timer" into "&_E->_field1._timer" in all the rules.
spatch --very-quiet --all-includes --include-headers \
-I ./arch/x86/include -I ./arch/x86/include/generated \
-I ./include -I ./arch/x86/include/uapi \
-I ./arch/x86/include/generated/uapi -I ./include/uapi \
-I ./include/generated/uapi --include ./include/linux/kconfig.h \
--dir . \
--cocci-file ~/src/data/timer_setup-2fields.cocci
@fix_address_of depends@
expression e;
@@
setup_timer(
-&(e)
+&e
, ...)
// Update any raw setup_timer() usages that have a NULL callback, but
// would otherwise match change_timer_function_usage, since the latter
// will update all function assignments done in the face of a NULL
// function initialization in setup_timer().
@change_timer_function_usage_NULL@
expression _E;
identifier _field1;
identifier _timer;
type _cast_data;
@@
(
-setup_timer(&_E->_field1._timer, NULL, _E);
+timer_setup(&_E->_field1._timer, NULL, 0);
|
-setup_timer(&_E->_field1._timer, NULL, (_cast_data)_E);
+timer_setup(&_E->_field1._timer, NULL, 0);
|
-setup_timer(&_E._field1._timer, NULL, &_E);
+timer_setup(&_E._field1._timer, NULL, 0);
|
-setup_timer(&_E._field1._timer, NULL, (_cast_data)&_E);
+timer_setup(&_E._field1._timer, NULL, 0);
)
@change_timer_function_usage@
expression _E;
identifier _field1;
identifier _timer;
struct timer_list _stl;
identifier _callback;
type _cast_func, _cast_data;
@@
(
-setup_timer(&_E->_field1._timer, _callback, _E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, &_callback, _E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, _callback, (_cast_data)_E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, &_callback, (_cast_data)_E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, (_cast_func)_callback, _E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, (_cast_func)&_callback, _E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, _callback, (_cast_data)_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, _callback, (_cast_data)&_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, &_callback, (_cast_data)_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, &_callback, (_cast_data)&_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, (_cast_func)_callback, (_cast_data)&_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, (_cast_func)&_callback, (_cast_data)&_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
_E->_field1._timer@_stl.function = _callback;
|
_E->_field1._timer@_stl.function = &_callback;
|
_E->_field1._timer@_stl.function = (_cast_func)_callback;
|
_E->_field1._timer@_stl.function = (_cast_func)&_callback;
|
_E._field1._timer@_stl.function = _callback;
|
_E._field1._timer@_stl.function = &_callback;
|
_E._field1._timer@_stl.function = (_cast_func)_callback;
|
_E._field1._timer@_stl.function = (_cast_func)&_callback;
)
// callback(unsigned long arg)
@change_callback_handle_cast
depends on change_timer_function_usage@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
identifier _handle;
@@
void _callback(
-_origtype _origarg
+struct timer_list *t
)
{
(
... when != _origarg
_handletype *_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _field1._timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle =
-(void *)_origarg;
+from_timer(_handle, t, _field1._timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle;
... when != _handle
_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _field1._timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle;
... when != _handle
_handle =
-(void *)_origarg;
+from_timer(_handle, t, _field1._timer);
... when != _origarg
)
}
// callback(unsigned long arg) without existing variable
@change_callback_handle_cast_no_arg
depends on change_timer_function_usage &&
!change_callback_handle_cast@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
@@
void _callback(
-_origtype _origarg
+struct timer_list *t
)
{
+ _handletype *_origarg = from_timer(_origarg, t, _field1._timer);
+
... when != _origarg
- (_handletype *)_origarg
+ _origarg
... when != _origarg
}
// Avoid already converted callbacks.
@match_callback_converted
depends on change_timer_function_usage &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier t;
@@
void _callback(struct timer_list *t)
{ ... }
// callback(struct something *handle)
@change_callback_handle_arg
depends on change_timer_function_usage &&
!match_callback_converted &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
@@
void _callback(
-_handletype *_handle
+struct timer_list *t
)
{
+ _handletype *_handle = from_timer(_handle, t, _field1._timer);
...
}
// If change_callback_handle_arg ran on an empty function, remove
// the added handler.
@unchange_callback_handle_arg
depends on change_timer_function_usage &&
change_callback_handle_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
identifier t;
@@
void _callback(struct timer_list *t)
{
- _handletype *_handle = from_timer(_handle, t, _field1._timer);
}
// We only want to refactor the setup_timer() data argument if we've found
// the matching callback. This undoes changes in change_timer_function_usage.
@unchange_timer_function_usage
depends on change_timer_function_usage &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg &&
!change_callback_handle_arg@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type change_timer_function_usage._cast_data;
@@
(
-timer_setup(&_E->_field1._timer, _callback, 0);
+setup_timer(&_E->_field1._timer, _callback, (_cast_data)_E);
|
-timer_setup(&_E._field1._timer, _callback, 0);
+setup_timer(&_E._field1._timer, _callback, (_cast_data)&_E);
)
// If we fixed a callback from a .function assignment, fix the
// assignment cast now.
@change_timer_function_assignment
depends on change_timer_function_usage &&
(change_callback_handle_cast ||
change_callback_handle_cast_no_arg ||
change_callback_handle_arg)@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_func;
typedef TIMER_FUNC_TYPE;
@@
(
_E->_field1._timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_field1._timer.function =
-&_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_field1._timer.function =
-(_cast_func)_callback;
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_field1._timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._field1._timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._field1._timer.function =
-&_callback;
+(TIMER_FUNC_TYPE)_callback
;
|
_E._field1._timer.function =
-(_cast_func)_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._field1._timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
;
)
// Sometimes timer functions are called directly. Replace matched args.
@change_timer_function_calls
depends on change_timer_function_usage &&
(change_callback_handle_cast ||
change_callback_handle_cast_no_arg ||
change_callback_handle_arg)@
expression _E;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_data;
@@
_callback(
(
-(_cast_data)_E
+&_E->_field1._timer
|
-(_cast_data)&_E
+&_E._field1._timer
|
-_E
+&_E->_field1._timer
)
)
// If a timer has been configured without a data argument, it can be
// converted without regard to the callback argument, since it is unused.
@match_timer_function_unused_data@
expression _E;
identifier _field1;
identifier _timer;
identifier _callback;
@@
(
-setup_timer(&_E->_field1._timer, _callback, 0);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, _callback, 0L);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, _callback, 0UL);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, _callback, 0);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, _callback, 0L);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, _callback, 0UL);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_field1._timer, _callback, 0);
+timer_setup(&_field1._timer, _callback, 0);
|
-setup_timer(&_field1._timer, _callback, 0L);
+timer_setup(&_field1._timer, _callback, 0);
|
-setup_timer(&_field1._timer, _callback, 0UL);
+timer_setup(&_field1._timer, _callback, 0);
|
-setup_timer(_field1._timer, _callback, 0);
+timer_setup(_field1._timer, _callback, 0);
|
-setup_timer(_field1._timer, _callback, 0L);
+timer_setup(_field1._timer, _callback, 0);
|
-setup_timer(_field1._timer, _callback, 0UL);
+timer_setup(_field1._timer, _callback, 0);
)
@change_callback_unused_data
depends on match_timer_function_unused_data@
identifier match_timer_function_unused_data._callback;
type _origtype;
identifier _origarg;
@@
void _callback(
-_origtype _origarg
+struct timer_list *unused
)
{
... when != _origarg
}
Signed-off-by: Kees Cook <keescook@chromium.org>
2017-10-18 10:21:24 +07:00
|
|
|
static void fib6_gc_timer_cb(struct timer_list *t)
|
2008-03-04 14:28:58 +07:00
|
|
|
{
|
treewide: setup_timer() -> timer_setup() (2 field)
This converts all remaining setup_timer() calls that use a nested field
to reach a struct timer_list. Coccinelle does not have an easy way to
match multiple fields, so a new script is needed to change the matches of
"&_E->_timer" into "&_E->_field1._timer" in all the rules.
spatch --very-quiet --all-includes --include-headers \
-I ./arch/x86/include -I ./arch/x86/include/generated \
-I ./include -I ./arch/x86/include/uapi \
-I ./arch/x86/include/generated/uapi -I ./include/uapi \
-I ./include/generated/uapi --include ./include/linux/kconfig.h \
--dir . \
--cocci-file ~/src/data/timer_setup-2fields.cocci
@fix_address_of depends@
expression e;
@@
setup_timer(
-&(e)
+&e
, ...)
// Update any raw setup_timer() usages that have a NULL callback, but
// would otherwise match change_timer_function_usage, since the latter
// will update all function assignments done in the face of a NULL
// function initialization in setup_timer().
@change_timer_function_usage_NULL@
expression _E;
identifier _field1;
identifier _timer;
type _cast_data;
@@
(
-setup_timer(&_E->_field1._timer, NULL, _E);
+timer_setup(&_E->_field1._timer, NULL, 0);
|
-setup_timer(&_E->_field1._timer, NULL, (_cast_data)_E);
+timer_setup(&_E->_field1._timer, NULL, 0);
|
-setup_timer(&_E._field1._timer, NULL, &_E);
+timer_setup(&_E._field1._timer, NULL, 0);
|
-setup_timer(&_E._field1._timer, NULL, (_cast_data)&_E);
+timer_setup(&_E._field1._timer, NULL, 0);
)
@change_timer_function_usage@
expression _E;
identifier _field1;
identifier _timer;
struct timer_list _stl;
identifier _callback;
type _cast_func, _cast_data;
@@
(
-setup_timer(&_E->_field1._timer, _callback, _E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, &_callback, _E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, _callback, (_cast_data)_E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, &_callback, (_cast_data)_E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, (_cast_func)_callback, _E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, (_cast_func)&_callback, _E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, _callback, (_cast_data)_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, _callback, (_cast_data)&_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, &_callback, (_cast_data)_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, &_callback, (_cast_data)&_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, (_cast_func)_callback, (_cast_data)&_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, (_cast_func)&_callback, (_cast_data)&_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
_E->_field1._timer@_stl.function = _callback;
|
_E->_field1._timer@_stl.function = &_callback;
|
_E->_field1._timer@_stl.function = (_cast_func)_callback;
|
_E->_field1._timer@_stl.function = (_cast_func)&_callback;
|
_E._field1._timer@_stl.function = _callback;
|
_E._field1._timer@_stl.function = &_callback;
|
_E._field1._timer@_stl.function = (_cast_func)_callback;
|
_E._field1._timer@_stl.function = (_cast_func)&_callback;
)
// callback(unsigned long arg)
@change_callback_handle_cast
depends on change_timer_function_usage@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
identifier _handle;
@@
void _callback(
-_origtype _origarg
+struct timer_list *t
)
{
(
... when != _origarg
_handletype *_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _field1._timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle =
-(void *)_origarg;
+from_timer(_handle, t, _field1._timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle;
... when != _handle
_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _field1._timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle;
... when != _handle
_handle =
-(void *)_origarg;
+from_timer(_handle, t, _field1._timer);
... when != _origarg
)
}
// callback(unsigned long arg) without existing variable
@change_callback_handle_cast_no_arg
depends on change_timer_function_usage &&
!change_callback_handle_cast@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
@@
void _callback(
-_origtype _origarg
+struct timer_list *t
)
{
+ _handletype *_origarg = from_timer(_origarg, t, _field1._timer);
+
... when != _origarg
- (_handletype *)_origarg
+ _origarg
... when != _origarg
}
// Avoid already converted callbacks.
@match_callback_converted
depends on change_timer_function_usage &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier t;
@@
void _callback(struct timer_list *t)
{ ... }
// callback(struct something *handle)
@change_callback_handle_arg
depends on change_timer_function_usage &&
!match_callback_converted &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
@@
void _callback(
-_handletype *_handle
+struct timer_list *t
)
{
+ _handletype *_handle = from_timer(_handle, t, _field1._timer);
...
}
// If change_callback_handle_arg ran on an empty function, remove
// the added handler.
@unchange_callback_handle_arg
depends on change_timer_function_usage &&
change_callback_handle_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
identifier t;
@@
void _callback(struct timer_list *t)
{
- _handletype *_handle = from_timer(_handle, t, _field1._timer);
}
// We only want to refactor the setup_timer() data argument if we've found
// the matching callback. This undoes changes in change_timer_function_usage.
@unchange_timer_function_usage
depends on change_timer_function_usage &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg &&
!change_callback_handle_arg@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type change_timer_function_usage._cast_data;
@@
(
-timer_setup(&_E->_field1._timer, _callback, 0);
+setup_timer(&_E->_field1._timer, _callback, (_cast_data)_E);
|
-timer_setup(&_E._field1._timer, _callback, 0);
+setup_timer(&_E._field1._timer, _callback, (_cast_data)&_E);
)
// If we fixed a callback from a .function assignment, fix the
// assignment cast now.
@change_timer_function_assignment
depends on change_timer_function_usage &&
(change_callback_handle_cast ||
change_callback_handle_cast_no_arg ||
change_callback_handle_arg)@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_func;
typedef TIMER_FUNC_TYPE;
@@
(
_E->_field1._timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_field1._timer.function =
-&_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_field1._timer.function =
-(_cast_func)_callback;
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_field1._timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._field1._timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._field1._timer.function =
-&_callback;
+(TIMER_FUNC_TYPE)_callback
;
|
_E._field1._timer.function =
-(_cast_func)_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._field1._timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
;
)
// Sometimes timer functions are called directly. Replace matched args.
@change_timer_function_calls
depends on change_timer_function_usage &&
(change_callback_handle_cast ||
change_callback_handle_cast_no_arg ||
change_callback_handle_arg)@
expression _E;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_data;
@@
_callback(
(
-(_cast_data)_E
+&_E->_field1._timer
|
-(_cast_data)&_E
+&_E._field1._timer
|
-_E
+&_E->_field1._timer
)
)
// If a timer has been configured without a data argument, it can be
// converted without regard to the callback argument, since it is unused.
@match_timer_function_unused_data@
expression _E;
identifier _field1;
identifier _timer;
identifier _callback;
@@
(
-setup_timer(&_E->_field1._timer, _callback, 0);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, _callback, 0L);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, _callback, 0UL);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, _callback, 0);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, _callback, 0L);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, _callback, 0UL);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_field1._timer, _callback, 0);
+timer_setup(&_field1._timer, _callback, 0);
|
-setup_timer(&_field1._timer, _callback, 0L);
+timer_setup(&_field1._timer, _callback, 0);
|
-setup_timer(&_field1._timer, _callback, 0UL);
+timer_setup(&_field1._timer, _callback, 0);
|
-setup_timer(_field1._timer, _callback, 0);
+timer_setup(_field1._timer, _callback, 0);
|
-setup_timer(_field1._timer, _callback, 0L);
+timer_setup(_field1._timer, _callback, 0);
|
-setup_timer(_field1._timer, _callback, 0UL);
+timer_setup(_field1._timer, _callback, 0);
)
@change_callback_unused_data
depends on match_timer_function_unused_data@
identifier match_timer_function_unused_data._callback;
type _origtype;
identifier _origarg;
@@
void _callback(
-_origtype _origarg
+struct timer_list *unused
)
{
... when != _origarg
}
Signed-off-by: Kees Cook <keescook@chromium.org>
2017-10-18 10:21:24 +07:00
|
|
|
struct net *arg = from_timer(arg, t, ipv6.ip6_fib_timer);
|
|
|
|
|
|
|
|
fib6_run_gc(0, arg, true);
|
2008-03-04 14:28:58 +07:00
|
|
|
}
|
|
|
|
|
2010-01-17 10:35:32 +07:00
|
|
|
static int __net_init fib6_net_init(struct net *net)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
2010-10-13 15:22:03 +07:00
|
|
|
size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
|
2017-08-03 18:28:16 +07:00
|
|
|
int err;
|
|
|
|
|
|
|
|
err = fib6_notifier_init(net);
|
|
|
|
if (err)
|
|
|
|
return err;
|
2010-10-13 15:22:03 +07:00
|
|
|
|
2016-03-08 20:44:45 +07:00
|
|
|
spin_lock_init(&net->ipv6.fib6_gc_lock);
|
2016-03-08 20:44:35 +07:00
|
|
|
rwlock_init(&net->ipv6.fib6_walker_lock);
|
|
|
|
INIT_LIST_HEAD(&net->ipv6.fib6_walkers);
|
treewide: setup_timer() -> timer_setup() (2 field)
This converts all remaining setup_timer() calls that use a nested field
to reach a struct timer_list. Coccinelle does not have an easy way to
match multiple fields, so a new script is needed to change the matches of
"&_E->_timer" into "&_E->_field1._timer" in all the rules.
spatch --very-quiet --all-includes --include-headers \
-I ./arch/x86/include -I ./arch/x86/include/generated \
-I ./include -I ./arch/x86/include/uapi \
-I ./arch/x86/include/generated/uapi -I ./include/uapi \
-I ./include/generated/uapi --include ./include/linux/kconfig.h \
--dir . \
--cocci-file ~/src/data/timer_setup-2fields.cocci
@fix_address_of depends@
expression e;
@@
setup_timer(
-&(e)
+&e
, ...)
// Update any raw setup_timer() usages that have a NULL callback, but
// would otherwise match change_timer_function_usage, since the latter
// will update all function assignments done in the face of a NULL
// function initialization in setup_timer().
@change_timer_function_usage_NULL@
expression _E;
identifier _field1;
identifier _timer;
type _cast_data;
@@
(
-setup_timer(&_E->_field1._timer, NULL, _E);
+timer_setup(&_E->_field1._timer, NULL, 0);
|
-setup_timer(&_E->_field1._timer, NULL, (_cast_data)_E);
+timer_setup(&_E->_field1._timer, NULL, 0);
|
-setup_timer(&_E._field1._timer, NULL, &_E);
+timer_setup(&_E._field1._timer, NULL, 0);
|
-setup_timer(&_E._field1._timer, NULL, (_cast_data)&_E);
+timer_setup(&_E._field1._timer, NULL, 0);
)
@change_timer_function_usage@
expression _E;
identifier _field1;
identifier _timer;
struct timer_list _stl;
identifier _callback;
type _cast_func, _cast_data;
@@
(
-setup_timer(&_E->_field1._timer, _callback, _E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, &_callback, _E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, _callback, (_cast_data)_E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, &_callback, (_cast_data)_E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, (_cast_func)_callback, _E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, (_cast_func)&_callback, _E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, _callback, (_cast_data)_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, _callback, (_cast_data)&_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, &_callback, (_cast_data)_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, &_callback, (_cast_data)&_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, (_cast_func)_callback, (_cast_data)&_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, (_cast_func)&_callback, (_cast_data)&_E);
+timer_setup(&_E._field1._timer, _callback, 0);
|
_E->_field1._timer@_stl.function = _callback;
|
_E->_field1._timer@_stl.function = &_callback;
|
_E->_field1._timer@_stl.function = (_cast_func)_callback;
|
_E->_field1._timer@_stl.function = (_cast_func)&_callback;
|
_E._field1._timer@_stl.function = _callback;
|
_E._field1._timer@_stl.function = &_callback;
|
_E._field1._timer@_stl.function = (_cast_func)_callback;
|
_E._field1._timer@_stl.function = (_cast_func)&_callback;
)
// callback(unsigned long arg)
@change_callback_handle_cast
depends on change_timer_function_usage@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
identifier _handle;
@@
void _callback(
-_origtype _origarg
+struct timer_list *t
)
{
(
... when != _origarg
_handletype *_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _field1._timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle =
-(void *)_origarg;
+from_timer(_handle, t, _field1._timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle;
... when != _handle
_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _field1._timer);
... when != _origarg
|
... when != _origarg
_handletype *_handle;
... when != _handle
_handle =
-(void *)_origarg;
+from_timer(_handle, t, _field1._timer);
... when != _origarg
)
}
// callback(unsigned long arg) without existing variable
@change_callback_handle_cast_no_arg
depends on change_timer_function_usage &&
!change_callback_handle_cast@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
@@
void _callback(
-_origtype _origarg
+struct timer_list *t
)
{
+ _handletype *_origarg = from_timer(_origarg, t, _field1._timer);
+
... when != _origarg
- (_handletype *)_origarg
+ _origarg
... when != _origarg
}
// Avoid already converted callbacks.
@match_callback_converted
depends on change_timer_function_usage &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier t;
@@
void _callback(struct timer_list *t)
{ ... }
// callback(struct something *handle)
@change_callback_handle_arg
depends on change_timer_function_usage &&
!match_callback_converted &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
@@
void _callback(
-_handletype *_handle
+struct timer_list *t
)
{
+ _handletype *_handle = from_timer(_handle, t, _field1._timer);
...
}
// If change_callback_handle_arg ran on an empty function, remove
// the added handler.
@unchange_callback_handle_arg
depends on change_timer_function_usage &&
change_callback_handle_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
identifier t;
@@
void _callback(struct timer_list *t)
{
- _handletype *_handle = from_timer(_handle, t, _field1._timer);
}
// We only want to refactor the setup_timer() data argument if we've found
// the matching callback. This undoes changes in change_timer_function_usage.
@unchange_timer_function_usage
depends on change_timer_function_usage &&
!change_callback_handle_cast &&
!change_callback_handle_cast_no_arg &&
!change_callback_handle_arg@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type change_timer_function_usage._cast_data;
@@
(
-timer_setup(&_E->_field1._timer, _callback, 0);
+setup_timer(&_E->_field1._timer, _callback, (_cast_data)_E);
|
-timer_setup(&_E._field1._timer, _callback, 0);
+setup_timer(&_E._field1._timer, _callback, (_cast_data)&_E);
)
// If we fixed a callback from a .function assignment, fix the
// assignment cast now.
@change_timer_function_assignment
depends on change_timer_function_usage &&
(change_callback_handle_cast ||
change_callback_handle_cast_no_arg ||
change_callback_handle_arg)@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_func;
typedef TIMER_FUNC_TYPE;
@@
(
_E->_field1._timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_field1._timer.function =
-&_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_field1._timer.function =
-(_cast_func)_callback;
+(TIMER_FUNC_TYPE)_callback
;
|
_E->_field1._timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._field1._timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._field1._timer.function =
-&_callback;
+(TIMER_FUNC_TYPE)_callback
;
|
_E._field1._timer.function =
-(_cast_func)_callback
+(TIMER_FUNC_TYPE)_callback
;
|
_E._field1._timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
;
)
// Sometimes timer functions are called directly. Replace matched args.
@change_timer_function_calls
depends on change_timer_function_usage &&
(change_callback_handle_cast ||
change_callback_handle_cast_no_arg ||
change_callback_handle_arg)@
expression _E;
identifier change_timer_function_usage._field1;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_data;
@@
_callback(
(
-(_cast_data)_E
+&_E->_field1._timer
|
-(_cast_data)&_E
+&_E._field1._timer
|
-_E
+&_E->_field1._timer
)
)
// If a timer has been configured without a data argument, it can be
// converted without regard to the callback argument, since it is unused.
@match_timer_function_unused_data@
expression _E;
identifier _field1;
identifier _timer;
identifier _callback;
@@
(
-setup_timer(&_E->_field1._timer, _callback, 0);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, _callback, 0L);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E->_field1._timer, _callback, 0UL);
+timer_setup(&_E->_field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, _callback, 0);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, _callback, 0L);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_E._field1._timer, _callback, 0UL);
+timer_setup(&_E._field1._timer, _callback, 0);
|
-setup_timer(&_field1._timer, _callback, 0);
+timer_setup(&_field1._timer, _callback, 0);
|
-setup_timer(&_field1._timer, _callback, 0L);
+timer_setup(&_field1._timer, _callback, 0);
|
-setup_timer(&_field1._timer, _callback, 0UL);
+timer_setup(&_field1._timer, _callback, 0);
|
-setup_timer(_field1._timer, _callback, 0);
+timer_setup(_field1._timer, _callback, 0);
|
-setup_timer(_field1._timer, _callback, 0L);
+timer_setup(_field1._timer, _callback, 0);
|
-setup_timer(_field1._timer, _callback, 0UL);
+timer_setup(_field1._timer, _callback, 0);
)
@change_callback_unused_data
depends on match_timer_function_unused_data@
identifier match_timer_function_unused_data._callback;
type _origtype;
identifier _origarg;
@@
void _callback(
-_origtype _origarg
+struct timer_list *unused
)
{
... when != _origarg
}
Signed-off-by: Kees Cook <keescook@chromium.org>
2017-10-18 10:21:24 +07:00
|
|
|
timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0);
|
2008-03-04 14:31:11 +07:00
|
|
|
|
2008-03-04 14:34:17 +07:00
|
|
|
net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
|
|
|
|
if (!net->ipv6.rt6_stats)
|
|
|
|
goto out_timer;
|
|
|
|
|
2010-10-13 15:22:03 +07:00
|
|
|
/* Avoid false sharing : Use at least a full cache line */
|
|
|
|
size = max_t(size_t, size, L1_CACHE_BYTES);
|
|
|
|
|
|
|
|
net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL);
|
2008-03-04 14:25:27 +07:00
|
|
|
if (!net->ipv6.fib_table_hash)
|
2008-03-04 14:34:17 +07:00
|
|
|
goto out_rt6_stats;
|
2008-03-04 14:24:31 +07:00
|
|
|
|
2008-03-04 14:25:27 +07:00
|
|
|
net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl),
|
|
|
|
GFP_KERNEL);
|
|
|
|
if (!net->ipv6.fib6_main_tbl)
|
2008-03-04 14:24:31 +07:00
|
|
|
goto out_fib_table_hash;
|
|
|
|
|
2008-03-04 14:25:27 +07:00
|
|
|
net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf,
|
2018-04-18 07:33:18 +07:00
|
|
|
net->ipv6.fib6_null_entry);
|
2008-03-04 14:25:27 +07:00
|
|
|
net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
|
|
|
|
RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
|
2012-06-11 14:01:52 +07:00
|
|
|
inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);
|
2008-03-04 14:24:31 +07:00
|
|
|
|
|
|
|
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
|
2008-03-04 14:25:27 +07:00
|
|
|
net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl),
|
|
|
|
GFP_KERNEL);
|
|
|
|
if (!net->ipv6.fib6_local_tbl)
|
2008-03-04 14:24:31 +07:00
|
|
|
goto out_fib6_main_tbl;
|
2008-03-04 14:25:27 +07:00
|
|
|
net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf,
|
2018-04-18 07:33:18 +07:00
|
|
|
net->ipv6.fib6_null_entry);
|
2008-03-04 14:25:27 +07:00
|
|
|
net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
|
|
|
|
RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
|
2012-06-11 14:01:52 +07:00
|
|
|
inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
|
2008-03-04 14:24:31 +07:00
|
|
|
#endif
|
2008-03-04 14:25:27 +07:00
|
|
|
fib6_tables_init(net);
|
2007-12-07 15:45:16 +07:00
|
|
|
|
2008-07-23 04:33:45 +07:00
|
|
|
return 0;
|
2007-12-07 15:40:34 +07:00
|
|
|
|
2008-03-04 14:24:31 +07:00
|
|
|
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
|
|
|
|
out_fib6_main_tbl:
|
2008-03-04 14:25:27 +07:00
|
|
|
kfree(net->ipv6.fib6_main_tbl);
|
2008-03-04 14:24:31 +07:00
|
|
|
#endif
|
|
|
|
out_fib_table_hash:
|
2008-03-04 14:25:27 +07:00
|
|
|
kfree(net->ipv6.fib_table_hash);
|
2008-03-04 14:34:17 +07:00
|
|
|
out_rt6_stats:
|
|
|
|
kfree(net->ipv6.rt6_stats);
|
2008-03-04 14:31:11 +07:00
|
|
|
out_timer:
|
2017-08-03 18:28:16 +07:00
|
|
|
fib6_notifier_exit(net);
|
2008-07-23 04:33:45 +07:00
|
|
|
return -ENOMEM;
|
2014-03-28 11:07:02 +07:00
|
|
|
}
|
2008-03-04 14:25:27 +07:00
|
|
|
|
|
|
|
static void fib6_net_exit(struct net *net)
|
|
|
|
{
|
2017-09-08 15:26:19 +07:00
|
|
|
unsigned int i;
|
|
|
|
|
2008-07-23 04:33:45 +07:00
|
|
|
del_timer_sync(&net->ipv6.ip6_fib_timer);
|
|
|
|
|
2017-09-09 05:48:47 +07:00
|
|
|
for (i = 0; i < FIB6_TABLE_HASHSZ; i++) {
|
2017-09-08 15:26:19 +07:00
|
|
|
struct hlist_head *head = &net->ipv6.fib_table_hash[i];
|
|
|
|
struct hlist_node *tmp;
|
|
|
|
struct fib6_table *tb;
|
|
|
|
|
|
|
|
hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) {
|
|
|
|
hlist_del(&tb->tb6_hlist);
|
|
|
|
fib6_free_table(tb);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-03-04 14:25:27 +07:00
|
|
|
kfree(net->ipv6.fib_table_hash);
|
2008-03-04 14:34:17 +07:00
|
|
|
kfree(net->ipv6.rt6_stats);
|
2017-08-03 18:28:16 +07:00
|
|
|
fib6_notifier_exit(net);
|
2008-03-04 14:25:27 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static struct pernet_operations fib6_net_ops = {
|
|
|
|
.init = fib6_net_init,
|
|
|
|
.exit = fib6_net_exit,
|
|
|
|
};
|
|
|
|
|
|
|
|
int __init fib6_init(void)
|
|
|
|
{
|
|
|
|
int ret = -ENOMEM;
|
2008-03-04 14:31:11 +07:00
|
|
|
|
2008-03-04 14:25:27 +07:00
|
|
|
fib6_node_kmem = kmem_cache_create("fib6_nodes",
|
|
|
|
sizeof(struct fib6_node),
|
|
|
|
0, SLAB_HWCACHE_ALIGN,
|
|
|
|
NULL);
|
|
|
|
if (!fib6_node_kmem)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = register_pernet_subsys(&fib6_net_ops);
|
|
|
|
if (ret)
|
2008-03-04 14:34:17 +07:00
|
|
|
goto out_kmem_cache_create;
|
2012-06-16 15:12:19 +07:00
|
|
|
|
2017-12-03 03:44:08 +07:00
|
|
|
ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL,
|
|
|
|
inet6_dump_fib, 0);
|
2012-06-16 15:12:19 +07:00
|
|
|
if (ret)
|
|
|
|
goto out_unregister_subsys;
|
2014-09-28 05:46:06 +07:00
|
|
|
|
|
|
|
__fib6_flush_trees = fib6_flush_trees;
|
2008-03-04 14:25:27 +07:00
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
|
2012-06-16 15:12:19 +07:00
|
|
|
out_unregister_subsys:
|
|
|
|
unregister_pernet_subsys(&fib6_net_ops);
|
2007-12-07 15:40:34 +07:00
|
|
|
out_kmem_cache_create:
|
|
|
|
kmem_cache_destroy(fib6_node_kmem);
|
|
|
|
goto out;
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
void fib6_gc_cleanup(void)
|
|
|
|
{
|
2008-03-04 14:25:27 +07:00
|
|
|
unregister_pernet_subsys(&fib6_net_ops);
|
2005-04-17 05:20:36 +07:00
|
|
|
kmem_cache_destroy(fib6_node_kmem);
|
|
|
|
}
|
2013-09-21 21:55:59 +07:00
|
|
|
|
|
|
|
#ifdef CONFIG_PROC_FS
|
|
|
|
static int ipv6_route_seq_show(struct seq_file *seq, void *v)
|
|
|
|
{
|
2018-04-18 07:33:26 +07:00
|
|
|
struct fib6_info *rt = v;
|
2013-09-21 21:55:59 +07:00
|
|
|
struct ipv6_route_iter *iter = seq->private;
|
2019-06-04 10:19:52 +07:00
|
|
|
struct fib6_nh *fib6_nh = rt->fib6_nh;
|
2019-03-28 10:53:52 +07:00
|
|
|
unsigned int flags = rt->fib6_flags;
|
2018-04-18 07:33:14 +07:00
|
|
|
const struct net_device *dev;
|
2013-09-21 21:55:59 +07:00
|
|
|
|
2019-06-04 10:19:52 +07:00
|
|
|
if (rt->nh)
|
|
|
|
fib6_nh = nexthop_fib6_nh(rt->nh);
|
|
|
|
|
2018-04-19 05:38:59 +07:00
|
|
|
seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
|
2013-09-21 21:55:59 +07:00
|
|
|
|
|
|
|
#ifdef CONFIG_IPV6_SUBTREES
|
2018-04-19 05:38:59 +07:00
|
|
|
seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen);
|
2013-09-21 21:55:59 +07:00
|
|
|
#else
|
|
|
|
seq_puts(seq, "00000000000000000000000000000000 00 ");
|
|
|
|
#endif
|
2019-06-04 10:19:52 +07:00
|
|
|
if (fib6_nh->fib_nh_gw_family) {
|
2019-03-28 10:53:52 +07:00
|
|
|
flags |= RTF_GATEWAY;
|
2019-06-04 10:19:52 +07:00
|
|
|
seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6);
|
2019-03-28 10:53:52 +07:00
|
|
|
} else {
|
2013-09-21 21:55:59 +07:00
|
|
|
seq_puts(seq, "00000000000000000000000000000000");
|
2019-03-28 10:53:52 +07:00
|
|
|
}
|
2013-09-21 21:55:59 +07:00
|
|
|
|
2019-06-04 10:19:52 +07:00
|
|
|
dev = fib6_nh->fib_nh_dev;
|
2013-09-21 21:55:59 +07:00
|
|
|
seq_printf(seq, " %08x %08x %08x %08x %8s\n",
|
2019-04-23 08:35:03 +07:00
|
|
|
rt->fib6_metric, refcount_read(&rt->fib6_ref), 0,
|
2019-03-28 10:53:52 +07:00
|
|
|
flags, dev ? dev->name : "");
|
2013-09-21 21:55:59 +07:00
|
|
|
iter->w.leaf = NULL;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2014-10-07 00:58:34 +07:00
|
|
|
static int ipv6_route_yield(struct fib6_walker *w)
|
2013-09-21 21:55:59 +07:00
|
|
|
{
|
|
|
|
struct ipv6_route_iter *iter = w->args;
|
|
|
|
|
|
|
|
if (!iter->skip)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
do {
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
iter->w.leaf = rcu_dereference_protected(
|
2018-05-05 03:54:24 +07:00
|
|
|
iter->w.leaf->fib6_next,
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
lockdep_is_held(&iter->tbl->tb6_lock));
|
2013-09-21 21:55:59 +07:00
|
|
|
iter->skip--;
|
|
|
|
if (!iter->skip && iter->w.leaf)
|
|
|
|
return 1;
|
|
|
|
} while (iter->w.leaf);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-03-08 20:44:35 +07:00
|
|
|
static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter,
|
|
|
|
struct net *net)
|
2013-09-21 21:55:59 +07:00
|
|
|
{
|
|
|
|
memset(&iter->w, 0, sizeof(iter->w));
|
|
|
|
iter->w.func = ipv6_route_yield;
|
|
|
|
iter->w.root = &iter->tbl->tb6_root;
|
|
|
|
iter->w.state = FWS_INIT;
|
|
|
|
iter->w.node = iter->w.root;
|
|
|
|
iter->w.args = iter;
|
2013-09-21 21:56:10 +07:00
|
|
|
iter->sernum = iter->w.root->fn_sernum;
|
2013-09-21 21:55:59 +07:00
|
|
|
INIT_LIST_HEAD(&iter->w.lh);
|
2016-03-08 20:44:35 +07:00
|
|
|
fib6_walker_link(net, &iter->w);
|
2013-09-21 21:55:59 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl,
|
|
|
|
struct net *net)
|
|
|
|
{
|
|
|
|
unsigned int h;
|
|
|
|
struct hlist_node *node;
|
|
|
|
|
|
|
|
if (tbl) {
|
|
|
|
h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1;
|
|
|
|
node = rcu_dereference_bh(hlist_next_rcu(&tbl->tb6_hlist));
|
|
|
|
} else {
|
|
|
|
h = 0;
|
|
|
|
node = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
while (!node && h < FIB6_TABLE_HASHSZ) {
|
|
|
|
node = rcu_dereference_bh(
|
|
|
|
hlist_first_rcu(&net->ipv6.fib_table_hash[h++]));
|
|
|
|
}
|
|
|
|
return hlist_entry_safe(node, struct fib6_table, tb6_hlist);
|
|
|
|
}
|
|
|
|
|
2013-09-21 21:56:10 +07:00
|
|
|
static void ipv6_route_check_sernum(struct ipv6_route_iter *iter)
|
|
|
|
{
|
|
|
|
if (iter->sernum != iter->w.root->fn_sernum) {
|
|
|
|
iter->sernum = iter->w.root->fn_sernum;
|
|
|
|
iter->w.state = FWS_INIT;
|
|
|
|
iter->w.node = iter->w.root;
|
|
|
|
WARN_ON(iter->w.skip);
|
|
|
|
iter->w.skip = iter->w.count;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-09-21 21:55:59 +07:00
|
|
|
static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
|
|
|
|
{
|
|
|
|
int r;
|
2018-04-18 07:33:26 +07:00
|
|
|
struct fib6_info *n;
|
2013-09-21 21:55:59 +07:00
|
|
|
struct net *net = seq_file_net(seq);
|
|
|
|
struct ipv6_route_iter *iter = seq->private;
|
|
|
|
|
|
|
|
if (!v)
|
|
|
|
goto iter_table;
|
|
|
|
|
2018-05-05 03:54:24 +07:00
|
|
|
n = rcu_dereference_bh(((struct fib6_info *)v)->fib6_next);
|
2013-09-21 21:55:59 +07:00
|
|
|
if (n) {
|
|
|
|
++*pos;
|
|
|
|
return n;
|
|
|
|
}
|
|
|
|
|
|
|
|
iter_table:
|
2013-09-21 21:56:10 +07:00
|
|
|
ipv6_route_check_sernum(iter);
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
spin_lock_bh(&iter->tbl->tb6_lock);
|
2013-09-21 21:55:59 +07:00
|
|
|
r = fib6_walk_continue(&iter->w);
|
ipv6: replace rwlock with rcu and spinlock in fib6_table
With all the preparation work before, we are now ready to replace rwlock
with rcu and spinlock in fib6_table.
That means now all fib6_node in fib6_table are protected by rcu. And
when freeing fib6_node, call_rcu() is used to wait for the rcu grace
period before releasing the memory.
When accessing fib6_node, corresponding rcu APIs need to be used.
And all previous sessions protected by the write lock will now be
protected by the spin lock per table.
All previous sessions protected by read lock will now be protected by
rcu_read_lock().
A couple of things to note here:
1. As part of the work of replacing rwlock with rcu, the linked list of
fn->leaf now has to be rcu protected as well. So both fn->leaf and
rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are
used when manipulating them.
2. For fn->rr_ptr, first of all, it also needs to be rcu protected now
and is tagged with __rcu and rcu APIs are used in corresponding places.
Secondly, fn->rr_ptr is changed in rt6_select() which is a reader
thread. This makes the issue a bit complicated. We think a valid
solution for it is to let rt6_select() grab the tb6_lock if it decides
to change it. As it is not in the normal operation and only happens when
there is no valid neighbor cache for the route, we think the performance
impact should be low.
3. fib6_walk_continue() has to be called with tb6_lock held even in the
route dumping related functions, e.g. inet6_dump_fib(),
fib6_tables_dump() and ipv6_route_seq_ops. It is because
fib6_walk_continue() makes modifications to the walker structure, and so
are fib6_repair_tree() and fib6_del_route(). In order to do proper
syncing between them, we need to let fib6_walk_continue() hold the lock.
We may be able to do further improvement on the way we do the tree walk
to get rid of the need for holding the spin lock. But not for now.
4. When fib6_del_route() removes a route from the tree, we no longer
mark rt->dst.rt6_next to NULL to make simultaneous reader be able to
further traverse the list with rcu. However, rt->dst.rt6_next is only
valid within this same rcu period. No one should access it later.
5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be
performed before we publish this route (either by linking it to fn->leaf
or insert it in the list pointed by fn->leaf) just to be safe because as
soon as we publish the route, some read thread will be able to access it.
Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-07 02:06:10 +07:00
|
|
|
spin_unlock_bh(&iter->tbl->tb6_lock);
|
2013-09-21 21:55:59 +07:00
|
|
|
if (r > 0) {
|
|
|
|
if (v)
|
|
|
|
++*pos;
|
|
|
|
return iter->w.leaf;
|
|
|
|
} else if (r < 0) {
|
2016-03-08 20:44:35 +07:00
|
|
|
fib6_walker_unlink(net, &iter->w);
|
2013-09-21 21:55:59 +07:00
|
|
|
return NULL;
|
|
|
|
}
|
2016-03-08 20:44:35 +07:00
|
|
|
fib6_walker_unlink(net, &iter->w);
|
2013-09-21 21:55:59 +07:00
|
|
|
|
|
|
|
iter->tbl = ipv6_route_seq_next_table(iter->tbl, net);
|
|
|
|
if (!iter->tbl)
|
|
|
|
return NULL;
|
|
|
|
|
2016-03-08 20:44:35 +07:00
|
|
|
ipv6_route_seq_setup_walk(iter, net);
|
2013-09-21 21:55:59 +07:00
|
|
|
goto iter_table;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos)
|
|
|
|
__acquires(RCU_BH)
|
|
|
|
{
|
|
|
|
struct net *net = seq_file_net(seq);
|
|
|
|
struct ipv6_route_iter *iter = seq->private;
|
|
|
|
|
|
|
|
rcu_read_lock_bh();
|
|
|
|
iter->tbl = ipv6_route_seq_next_table(NULL, net);
|
|
|
|
iter->skip = *pos;
|
|
|
|
|
|
|
|
if (iter->tbl) {
|
2016-03-08 20:44:35 +07:00
|
|
|
ipv6_route_seq_setup_walk(iter, net);
|
2013-09-21 21:55:59 +07:00
|
|
|
return ipv6_route_seq_next(seq, NULL, pos);
|
|
|
|
} else {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
|
|
|
|
{
|
2014-10-07 00:58:34 +07:00
|
|
|
struct fib6_walker *w = &iter->w;
|
2013-09-21 21:55:59 +07:00
|
|
|
return w->node && !(w->state == FWS_U && w->node == w->root);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
|
|
|
|
__releases(RCU_BH)
|
|
|
|
{
|
2016-03-08 20:44:35 +07:00
|
|
|
struct net *net = seq_file_net(seq);
|
2013-09-21 21:55:59 +07:00
|
|
|
struct ipv6_route_iter *iter = seq->private;
|
|
|
|
|
|
|
|
if (ipv6_route_iter_active(iter))
|
2016-03-08 20:44:35 +07:00
|
|
|
fib6_walker_unlink(net, &iter->w);
|
2013-09-21 21:55:59 +07:00
|
|
|
|
|
|
|
rcu_read_unlock_bh();
|
|
|
|
}
|
|
|
|
|
2018-04-11 00:42:55 +07:00
|
|
|
const struct seq_operations ipv6_route_seq_ops = {
|
2013-09-21 21:55:59 +07:00
|
|
|
.start = ipv6_route_seq_start,
|
|
|
|
.next = ipv6_route_seq_next,
|
|
|
|
.stop = ipv6_route_seq_stop,
|
|
|
|
.show = ipv6_route_seq_show
|
|
|
|
};
|
|
|
|
#endif /* CONFIG_PROC_FS */
|