Merge branch 'for-mingo' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu into core/rcu
Pull RCU updates from Paul E. McKenney:

- Documentation updates.
- Miscellaneous fixes.
- Parallelize SRCU callback handling (plus overlapping patches).

Signed-off-by: Ingo Molnar <mingo@kernel.org>
commit 58d30c36d4
@ -17,7 +17,7 @@ rcu_dereference.txt
|
||||
rcubarrier.txt
|
||||
- RCU and Unloadable Modules
|
||||
rculist_nulls.txt
|
||||
- RCU list primitives for use with SLAB_DESTROY_BY_RCU
|
||||
- RCU list primitives for use with SLAB_TYPESAFE_BY_RCU
|
||||
rcuref.txt
|
||||
- Reference-count design for elements of lists/arrays protected by RCU
|
||||
rcu.txt
|
||||
|
@ -19,6 +19,8 @@ to each other.
|
||||
The <tt>rcu_state</tt> Structure</a>
|
||||
<li> <a href="#The rcu_node Structure">
|
||||
The <tt>rcu_node</tt> Structure</a>
|
||||
<li> <a href="#The rcu_segcblist Structure">
|
||||
The <tt>rcu_segcblist</tt> Structure</a>
|
||||
<li> <a href="#The rcu_data Structure">
|
||||
The <tt>rcu_data</tt> Structure</a>
|
||||
<li> <a href="#The rcu_dynticks Structure">
|
||||
@ -841,6 +843,134 @@ for lockdep lock-class names.
|
||||
Finally, lines 64-66 produce an error if the maximum number of
|
||||
CPUs is too large for the specified fanout.
|
||||
|
||||
<h3><a name="The rcu_segcblist Structure">
|
||||
The <tt>rcu_segcblist</tt> Structure</a></h3>
|
||||
|
||||
The <tt>rcu_segcblist</tt> structure maintains a segmented list of
|
||||
callbacks as follows:
|
||||
|
||||
<pre>
|
||||
1 #define RCU_DONE_TAIL 0
|
||||
2 #define RCU_WAIT_TAIL 1
|
||||
3 #define RCU_NEXT_READY_TAIL 2
|
||||
4 #define RCU_NEXT_TAIL 3
|
||||
5 #define RCU_CBLIST_NSEGS 4
|
||||
6
|
||||
7 struct rcu_segcblist {
|
||||
8 struct rcu_head *head;
|
||||
9 struct rcu_head **tails[RCU_CBLIST_NSEGS];
|
||||
10 unsigned long gp_seq[RCU_CBLIST_NSEGS];
|
||||
11 long len;
|
||||
12 long len_lazy;
|
||||
13 };
|
||||
</pre>
|
||||
|
||||
<p>
|
||||
The segments are as follows:
|
||||
|
||||
<ol>
|
||||
<li> <tt>RCU_DONE_TAIL</tt>: Callbacks whose grace periods have elapsed.
|
||||
These callbacks are ready to be invoked.
|
||||
<li> <tt>RCU_WAIT_TAIL</tt>: Callbacks that are waiting for the
|
||||
current grace period.
|
||||
Note that different CPUs can have different ideas about which
|
||||
grace period is current, hence the <tt>->gp_seq</tt> field.
|
||||
<li> <tt>RCU_NEXT_READY_TAIL</tt>: Callbacks waiting for the next
|
||||
grace period to start.
|
||||
<li> <tt>RCU_NEXT_TAIL</tt>: Callbacks that have not yet been
|
||||
associated with a grace period.
|
||||
</ol>
|
||||
|
||||
<p>
|
||||
The <tt>->head</tt> pointer references the first callback or
|
||||
is <tt>NULL</tt> if the list contains no callbacks (which is
|
||||
<i>not</i> the same as being empty).
|
||||
Each element of the <tt>->tails[]</tt> array references the
|
||||
<tt>->next</tt> pointer of the last callback in the corresponding
|
||||
segment of the list, or the list's <tt>->head</tt> pointer if
|
||||
that segment and all previous segments are empty.
|
||||
If the corresponding segment is empty but some previous segment is
|
||||
not empty, then the array element is identical to its predecessor.
|
||||
Older callbacks are closer to the head of the list, and new callbacks
|
||||
are added at the tail.
|
||||
This relationship between the <tt>->head</tt> pointer, the
|
||||
<tt>->tails[]</tt> array, and the callbacks is shown in this
|
||||
diagram:
|
||||
|
||||
</p><p><img src="nxtlist.svg" alt="nxtlist.svg" width="40%">
|
||||
|
||||
</p><p>In this figure, the <tt>->head</tt> pointer references the
|
||||
first
|
||||
RCU callback in the list.
|
||||
The <tt>->tails[RCU_DONE_TAIL]</tt> array element references
|
||||
the <tt>->head</tt> pointer itself, indicating that none
|
||||
of the callbacks is ready to invoke.
|
||||
The <tt>->tails[RCU_WAIT_TAIL]</tt> array element references callback
|
||||
CB 2's <tt>->next</tt> pointer, which indicates that
|
||||
CB 1 and CB 2 are both waiting on the current grace period,
|
||||
give or take possible disagreements about exactly which grace period
|
||||
is the current one.
|
||||
The <tt>->tails[RCU_NEXT_READY_TAIL]</tt> array element
|
||||
references the same RCU callback that <tt>->tails[RCU_WAIT_TAIL]</tt>
|
||||
does, which indicates that there are no callbacks waiting on the next
|
||||
RCU grace period.
|
||||
The <tt>->tails[RCU_NEXT_TAIL]</tt> array element references
|
||||
CB 4's <tt>->next</tt> pointer, indicating that all the
|
||||
remaining RCU callbacks have not yet been assigned to an RCU grace
|
||||
period.
|
||||
Note that the <tt>->tails[RCU_NEXT_TAIL]</tt> array element
|
||||
always references the last RCU callback's <tt>->next</tt> pointer
|
||||
unless the callback list is empty, in which case it references
|
||||
the <tt>->head</tt> pointer.
|
||||
|
||||
<p>
|
||||
There is one additional important special case for the
|
||||
<tt>->tails[RCU_NEXT_TAIL]</tt> array element: It can be <tt>NULL</tt>
|
||||
when this list is <i>disabled</i>.
|
||||
Lists are disabled when the corresponding CPU is offline or when
|
||||
the corresponding CPU's callbacks are offloaded to a kthread,
|
||||
both of which are described elsewhere.
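<p>
For example, the invariants described above could be checked by code along
the following lines.
This is an illustrative sketch only; the helper names are made up and are
not part of the kernel's <tt>rcu_segcblist</tt> API.

<pre>
/* Sketch only: not the kernel's actual rcu_segcblist helpers. */
static inline bool segcblist_disabled(struct rcu_segcblist *rsclp)
{
	return rsclp->tails[RCU_NEXT_TAIL] == NULL;
}

static inline bool segcblist_segment_empty(struct rcu_segcblist *rsclp, int seg)
{
	if (seg == RCU_DONE_TAIL)
		return rsclp->tails[RCU_DONE_TAIL] == &rsclp->head;
	/* An empty segment shares its predecessor's tail pointer. */
	return rsclp->tails[seg] == rsclp->tails[seg - 1];
}
</pre>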
|
||||
|
||||
</p><p>CPUs advance their callbacks from the
|
||||
<tt>RCU_NEXT_TAIL</tt> to the <tt>RCU_NEXT_READY_TAIL</tt> to the
|
||||
<tt>RCU_WAIT_TAIL</tt> to the <tt>RCU_DONE_TAIL</tt> list segments
|
||||
as grace periods advance.
|
||||
|
||||
</p><p>The <tt>->gp_seq[]</tt> array records grace-period
|
||||
numbers corresponding to the list segments.
|
||||
This is what allows different CPUs to have different ideas as to
|
||||
which is the current grace period while still avoiding premature
|
||||
invocation of their callbacks.
|
||||
In particular, this allows CPUs that go idle for extended periods
|
||||
to determine which of their callbacks are ready to be invoked after
|
||||
reawakening.
|
||||
|
||||
</p><p>The <tt>->len</tt> counter contains the number of
|
||||
callbacks in <tt>->head</tt>, and the
|
||||
<tt>->len_lazy</tt> contains the number of those callbacks that
|
||||
are known to only free memory, and whose invocation can therefore
|
||||
be safely deferred.
|
||||
|
||||
<p><b>Important note</b>: It is the <tt>->len</tt> field that
|
||||
determines whether or not there are callbacks associated with
|
||||
this <tt>rcu_segcblist</tt> structure, <i>not</i> the <tt>->head</tt>
|
||||
pointer.
|
||||
The reason for this is that all the ready-to-invoke callbacks
|
||||
(that is, those in the <tt>RCU_DONE_TAIL</tt> segment) are extracted
|
||||
all at once at callback-invocation time.
|
||||
If callback invocation must be postponed, for example, because a
|
||||
high-priority process just woke up on this CPU, then the remaining
|
||||
callbacks are placed back on the <tt>RCU_DONE_TAIL</tt> segment.
|
||||
Either way, the <tt>->len</tt> and <tt>->len_lazy</tt> counts
|
||||
are adjusted after the corresponding callbacks have been invoked, and so
|
||||
again it is the <tt>->len</tt> count that accurately reflects whether
|
||||
or not there are callbacks associated with this <tt>rcu_segcblist</tt>
|
||||
structure.
|
||||
Of course, off-CPU sampling of the <tt>->len</tt> count requires
|
||||
the use of appropriate synchronization, for example, memory barriers.
|
||||
This synchronization can be a bit subtle, particularly in the case
|
||||
of <tt>rcu_barrier()</tt>.
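<p>
A minimal sketch of the resulting emptiness check follows; this is
illustrative only and is not the kernel's actual helper:

<pre>
static inline bool segcblist_has_cbs(struct rcu_segcblist *rsclp)
{
	/*
	 * Ready-to-invoke callbacks are removed from ->head in bulk
	 * before being invoked, and ->len is decremented only after
	 * they have been invoked, so ->len rather than ->head is the
	 * authoritative indication.
	 */
	return READ_ONCE(rsclp->len) != 0;	/* Not: rsclp->head != NULL. */
}
</pre>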
|
||||
|
||||
<h3><a name="The rcu_data Structure">
|
||||
The <tt>rcu_data</tt> Structure</a></h3>
|
||||
|
||||
@ -983,62 +1113,18 @@ choice.
|
||||
as follows:
|
||||
|
||||
<pre>
|
||||
1 struct rcu_head *nxtlist;
|
||||
2 struct rcu_head **nxttail[RCU_NEXT_SIZE];
|
||||
3 unsigned long nxtcompleted[RCU_NEXT_SIZE];
|
||||
4 long qlen_lazy;
|
||||
5 long qlen;
|
||||
6 long qlen_last_fqs_check;
|
||||
1 struct rcu_segcblist cblist;
|
||||
2 long qlen_last_fqs_check;
|
||||
3 unsigned long n_cbs_invoked;
|
||||
4 unsigned long n_nocbs_invoked;
|
||||
5 unsigned long n_cbs_orphaned;
|
||||
6 unsigned long n_cbs_adopted;
|
||||
7 unsigned long n_force_qs_snap;
|
||||
8 unsigned long n_cbs_invoked;
|
||||
9 unsigned long n_cbs_orphaned;
|
||||
10 unsigned long n_cbs_adopted;
|
||||
11 long blimit;
|
||||
8 long blimit;
|
||||
</pre>
|
||||
|
||||
<p>The <tt>->nxtlist</tt> pointer and the
|
||||
<tt>->nxttail[]</tt> array form a four-segment list with
|
||||
older callbacks near the head and newer ones near the tail.
|
||||
Each segment contains callbacks with the corresponding relationship
|
||||
to the current grace period.
|
||||
The pointer out of the end of each of the four segments is referenced
|
||||
by the element of the <tt>->nxttail[]</tt> array indexed by
|
||||
<tt>RCU_DONE_TAIL</tt> (for callbacks handled by a prior grace period),
|
||||
<tt>RCU_WAIT_TAIL</tt> (for callbacks waiting on the current grace period),
|
||||
<tt>RCU_NEXT_READY_TAIL</tt> (for callbacks that will wait on the next
|
||||
grace period), and
|
||||
<tt>RCU_NEXT_TAIL</tt> (for callbacks that are not yet associated
|
||||
with a specific grace period)
|
||||
respectively, as shown in the following figure.
|
||||
|
||||
</p><p><img src="nxtlist.svg" alt="nxtlist.svg" width="40%">
|
||||
|
||||
</p><p>In this figure, the <tt>->nxtlist</tt> pointer references the
|
||||
first
|
||||
RCU callback in the list.
|
||||
The <tt>->nxttail[RCU_DONE_TAIL]</tt> array element references
|
||||
the <tt>->nxtlist</tt> pointer itself, indicating that none
|
||||
of the callbacks is ready to invoke.
|
||||
The <tt>->nxttail[RCU_WAIT_TAIL]</tt> array element references callback
|
||||
CB 2's <tt>->next</tt> pointer, which indicates that
|
||||
CB 1 and CB 2 are both waiting on the current grace period.
|
||||
The <tt>->nxttail[RCU_NEXT_READY_TAIL]</tt> array element
|
||||
references the same RCU callback that <tt>->nxttail[RCU_WAIT_TAIL]</tt>
|
||||
does, which indicates that there are no callbacks waiting on the next
|
||||
RCU grace period.
|
||||
The <tt>->nxttail[RCU_NEXT_TAIL]</tt> array element references
|
||||
CB 4's <tt>->next</tt> pointer, indicating that all the
|
||||
remaining RCU callbacks have not yet been assigned to an RCU grace
|
||||
period.
|
||||
Note that the <tt>->nxttail[RCU_NEXT_TAIL]</tt> array element
|
||||
always references the last RCU callback's <tt>->next</tt> pointer
|
||||
unless the callback list is empty, in which case it references
|
||||
the <tt>->nxtlist</tt> pointer.
|
||||
|
||||
</p><p>CPUs advance their callbacks from the
|
||||
<tt>RCU_NEXT_TAIL</tt> to the <tt>RCU_NEXT_READY_TAIL</tt> to the
|
||||
<tt>RCU_WAIT_TAIL</tt> to the <tt>RCU_DONE_TAIL</tt> list segments
|
||||
as grace periods advance.
|
||||
<p>The <tt>->cblist</tt> structure is the segmented callback list
|
||||
described earlier.
|
||||
The CPU advances the callbacks in its <tt>rcu_data</tt> structure
|
||||
whenever it notices that another RCU grace period has completed.
|
||||
The CPU detects the completion of an RCU grace period by noticing
|
||||
@ -1049,16 +1135,7 @@ Recall that each <tt>rcu_node</tt> structure's
|
||||
<tt>->completed</tt> field is updated at the end of each
|
||||
grace period.
|
||||
|
||||
</p><p>The <tt>->nxtcompleted[]</tt> array records grace-period
|
||||
numbers corresponding to the list segments.
|
||||
This allows CPUs that go idle for extended periods to determine
|
||||
which of their callbacks are ready to be invoked after reawakening.
|
||||
|
||||
</p><p>The <tt>->qlen</tt> counter contains the number of
|
||||
callbacks in <tt>->nxtlist</tt>, and the
|
||||
<tt>->qlen_lazy</tt> contains the number of those callbacks that
|
||||
are known to only free memory, and whose invocation can therefore
|
||||
be safely deferred.
|
||||
<p>
|
||||
The <tt>->qlen_last_fqs_check</tt> and
|
||||
<tt>->n_force_qs_snap</tt> coordinate the forcing of quiescent
|
||||
states from <tt>call_rcu()</tt> and friends when callback
|
||||
@ -1069,6 +1146,10 @@ lists grow excessively long.
|
||||
fields count the number of callbacks invoked,
|
||||
sent to other CPUs when this CPU goes offline,
|
||||
and received from other CPUs when those other CPUs go offline.
|
||||
The <tt>->n_nocbs_invoked</tt> is used when the CPU's callbacks
|
||||
are offloaded to a kthread.
|
||||
|
||||
<p>
|
||||
Finally, the <tt>->blimit</tt> counter is the maximum number of
|
||||
RCU callbacks that may be invoked at a given time.
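<p>
The effect of <tt>->blimit</tt> can be illustrated by the following sketch
of a batch-limited invocation loop.
This is not the kernel's actual code, which must also maintain the counts,
handle lazy callbacks, and requeue any leftover work:

<pre>
static void invoke_ready_cbs(struct rcu_head *list, long blimit)
{
	struct rcu_head *next;
	long count = 0;

	while (list != NULL && count < blimit) {
		next = list->next;
		list->func(list);	/* Typically frees the enclosing structure. */
		list = next;
		count++;
	}
	/* Any remaining callbacks would be requeued for later invocation. */
}
</pre>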
|
||||
|
||||
@ -1104,6 +1185,9 @@ Its fields are as follows:
|
||||
1 int dynticks_nesting;
|
||||
2 int dynticks_nmi_nesting;
|
||||
3 atomic_t dynticks;
|
||||
4 bool rcu_need_heavy_qs;
|
||||
5 unsigned long rcu_qs_ctr;
|
||||
6 bool rcu_urgent_qs;
|
||||
</pre>
|
||||
|
||||
<p>The <tt>->dynticks_nesting</tt> field counts the
|
||||
@ -1117,11 +1201,32 @@ NMIs are counted by the <tt>->dynticks_nmi_nesting</tt>
|
||||
field, except that NMIs that interrupt non-dyntick-idle execution
|
||||
are not counted.
|
||||
|
||||
</p><p>Finally, the <tt>->dynticks</tt> field counts the corresponding
|
||||
</p><p>The <tt>->dynticks</tt> field counts the corresponding
|
||||
CPU's transitions to and from dyntick-idle mode, so that this counter
|
||||
has an even value when the CPU is in dyntick-idle mode and an odd
|
||||
value otherwise.
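<p>
The even/odd convention can be illustrated as follows.
This sketch is not the kernel's actual code (which also needs careful
memory ordering), but shows how a grace-period detector can make use of
such a counter:

<pre>
static void example_idle_enter(atomic_t *dynticks)
{
	atomic_inc(dynticks);	/* Odd -> even: CPU is now dyntick-idle. */
}

static void example_idle_exit(atomic_t *dynticks)
{
	atomic_inc(dynticks);	/* Even -> odd: CPU is no longer idle. */
}

static bool example_cpu_in_qs(atomic_t *dynticks, int snapshot)
{
	int cur = atomic_read(dynticks);

	/* Even: idle right now.  Changed: passed through idle since snapshot. */
	return !(cur & 0x1) || cur != snapshot;
}
</pre>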
|
||||
|
||||
</p><p>The <tt>->rcu_need_heavy_qs</tt> field is used
|
||||
to record the fact that the RCU core code would really like to
|
||||
see a quiescent state from the corresponding CPU, so much so that
|
||||
it is willing to call for heavy-weight dyntick-counter operations.
|
||||
This flag is checked by RCU's context-switch and <tt>cond_resched()</tt>
|
||||
code, which provide a momentary idle sojourn in response.
|
||||
|
||||
</p><p>The <tt>->rcu_qs_ctr</tt> field is used to record
|
||||
quiescent states from <tt>cond_resched()</tt>.
|
||||
Because <tt>cond_resched()</tt> can execute quite frequently, this
|
||||
must be quite lightweight, as in a non-atomic increment of this
|
||||
per-CPU field.
|
||||
|
||||
</p><p>Finally, the <tt>->rcu_urgent_qs</tt> field is used to record
|
||||
the fact that the RCU core code would really like to see a quiescent
|
||||
state from the corresponding CPU, with the various other fields indicating
|
||||
just how badly RCU wants this quiescent state.
|
||||
This flag is checked by RCU's context-switch and <tt>cond_resched()</tt>
|
||||
code, which, if nothing else, non-atomically increment <tt>->rcu_qs_ctr</tt>
|
||||
in response.
|
||||
|
||||
<table>
|
||||
<tr><th> </th></tr>
|
||||
<tr><th align="left">Quick Quiz:</th></tr>
|
||||
|
@ -19,7 +19,7 @@
|
||||
id="svg2"
|
||||
version="1.1"
|
||||
inkscape:version="0.48.4 r9939"
|
||||
sodipodi:docname="nxtlist.fig">
|
||||
sodipodi:docname="segcblist.svg">
|
||||
<metadata
|
||||
id="metadata94">
|
||||
<rdf:RDF>
|
||||
@ -28,7 +28,7 @@
|
||||
<dc:format>image/svg+xml</dc:format>
|
||||
<dc:type
|
||||
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
||||
<dc:title></dc:title>
|
||||
<dc:title />
|
||||
</cc:Work>
|
||||
</rdf:RDF>
|
||||
</metadata>
|
||||
@ -241,61 +241,51 @@
|
||||
xml:space="preserve"
|
||||
x="225"
|
||||
y="675"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="324"
|
||||
text-anchor="start"
|
||||
id="text64">nxtlist</text>
|
||||
id="text64"
|
||||
style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">->head</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="225"
|
||||
y="1800"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="324"
|
||||
text-anchor="start"
|
||||
id="text66">nxttail[RCU_DONE_TAIL]</text>
|
||||
id="text66"
|
||||
style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">->tails[RCU_DONE_TAIL]</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="225"
|
||||
y="2925"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="324"
|
||||
text-anchor="start"
|
||||
id="text68">nxttail[RCU_WAIT_TAIL]</text>
|
||||
id="text68"
|
||||
style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">->tails[RCU_WAIT_TAIL]</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="225"
|
||||
y="4050"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="324"
|
||||
text-anchor="start"
|
||||
id="text70">nxttail[RCU_NEXT_READY_TAIL]</text>
|
||||
id="text70"
|
||||
style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">->tails[RCU_NEXT_READY_TAIL]</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
x="225"
|
||||
y="5175"
|
||||
fill="#000000"
|
||||
font-family="Courier"
|
||||
font-style="normal"
|
||||
font-weight="bold"
|
||||
font-size="324"
|
||||
text-anchor="start"
|
||||
id="text72">nxttail[RCU_NEXT_TAIL]</text>
|
||||
id="text72"
|
||||
style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">->tails[RCU_NEXT_TAIL]</text>
|
||||
<!-- Text -->
|
||||
<text
|
||||
xml:space="preserve"
|
||||
|
@ -284,6 +284,7 @@ Expedited Grace Period Refinements</a></h2>
|
||||
Funnel locking and wait/wakeup</a>.
|
||||
<li> <a href="#Use of Workqueues">Use of Workqueues</a>.
|
||||
<li> <a href="#Stall Warnings">Stall warnings</a>.
|
||||
<li> <a href="#Mid-Boot Operation">Mid-boot operation</a>.
|
||||
</ol>
|
||||
|
||||
<h3><a name="Idle-CPU Checks">Idle-CPU Checks</a></h3>
|
||||
@ -524,7 +525,7 @@ their grace periods and carrying out their wakeups.
|
||||
In earlier implementations, the task requesting the expedited
|
||||
grace period also drove it to completion.
|
||||
This straightforward approach had the disadvantage of needing to
|
||||
account for signals sent to user tasks,
|
||||
account for POSIX signals sent to user tasks,
|
||||
so more recent implementations use the Linux kernel's
|
||||
<a href="https://www.kernel.org/doc/Documentation/workqueue.txt">workqueues</a>.
|
||||
|
||||
@ -533,8 +534,8 @@ The requesting task still does counter snapshotting and funnel-lock
|
||||
processing, but the task reaching the top of the funnel lock
|
||||
does a <tt>schedule_work()</tt> (from <tt>_synchronize_rcu_expedited()</tt>)
|
||||
so that a workqueue kthread does the actual grace-period processing.
|
||||
Because workqueue kthreads do not accept signals, grace-period-wait
|
||||
processing need not allow for signals.
|
||||
Because workqueue kthreads do not accept POSIX signals, grace-period-wait
|
||||
processing need not allow for POSIX signals.
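<p>
The handoff described above follows the usual workqueue pattern, sketched
below.
The function names are illustrative rather than the actual identifiers in
RCU's expedited grace-period code:

<pre>
static void expedited_gp_workfn(struct work_struct *work)
{
	/* Drive the expedited grace period to completion, then wake waiters. */
}
static DECLARE_WORK(expedited_gp_work, expedited_gp_workfn);

static void start_expedited_gp(void)
{
	/* Workqueue kthreads do not accept POSIX signals. */
	schedule_work(&expedited_gp_work);
}
</pre>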
|
||||
|
||||
In addition, this approach allows wakeups for the previous expedited
|
||||
grace period to be overlapped with processing for the next expedited
|
||||
@ -586,6 +587,46 @@ blocking the current grace period are printed.
|
||||
Each stall warning results in another pass through the loop, but the
|
||||
second and subsequent passes use longer stall times.
|
||||
|
||||
<h3><a name="Mid-Boot Operation">Mid-boot operation</a></h3>
|
||||
|
||||
<p>
|
||||
The use of workqueues has the advantage that the expedited
|
||||
grace-period code need not worry about POSIX signals.
|
||||
Unfortunately, it has the
|
||||
corresponding disadvantage that workqueues cannot be used until
|
||||
they are initialized, which does not happen until some time after
|
||||
the scheduler spawns the first task.
|
||||
Given that there are parts of the kernel that really do want to
|
||||
execute grace periods during this mid-boot “dead zone”,
|
||||
expedited grace periods must do something else during this time.
|
||||
|
||||
<p>
|
||||
What they do is to fall back to the old practice of requiring that the
|
||||
requesting task drive the expedited grace period, as was the case
|
||||
before the use of workqueues.
|
||||
However, the requesting task is only required to drive the grace period
|
||||
during the mid-boot dead zone.
|
||||
Before mid-boot, a synchronous grace period is a no-op.
|
||||
Some time after mid-boot, workqueues are used.
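<p>
In other words, the synchronous expedited primitives contain a decision
along the following lines.
This is a sketch only, and all of the predicate and helper names are
hypothetical:

<pre>
void synchronize_rcu_expedited_sketch(void)
{
	if (early_boot_single_cpu())		/* Hypothetical predicate. */
		return;				/* Grace period is a no-op. */
	if (in_mid_boot_dead_zone())		/* Hypothetical predicate. */
		drive_expedited_gp_in_this_task();
	else
		queue_expedited_gp_work_and_wait();	/* Normal runtime path. */
}
</pre>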
|
||||
|
||||
<p>
|
||||
Non-expedited non-SRCU synchronous grace periods must also operate
|
||||
normally during mid-boot.
|
||||
This is handled by causing non-expedited grace periods to take the
|
||||
expedited code path during mid-boot.
|
||||
|
||||
<p>
|
||||
The current code assumes that there are no POSIX signals during
|
||||
the mid-boot dead zone.
|
||||
However, if an overwhelming need for POSIX signals somehow arises,
|
||||
appropriate adjustments can be made to the expedited stall-warning code.
|
||||
One such adjustment would reinstate the pre-workqueue stall-warning
|
||||
checks, but only during the mid-boot dead zone.
|
||||
|
||||
<p>
|
||||
With this refinement, synchronous grace periods can now be used from
|
||||
task context pretty much any time during the life of the kernel.
|
||||
|
||||
<h3><a name="Summary">
|
||||
Summary</a></h3>
|
||||
|
||||
|
@ -659,8 +659,9 @@ systems with more than one CPU:
|
||||
In other words, a given instance of <tt>synchronize_rcu()</tt>
|
||||
can avoid waiting on a given RCU read-side critical section only
|
||||
if it can prove that <tt>synchronize_rcu()</tt> started first.
|
||||
</font>
|
||||
|
||||
<p>
|
||||
<p><font color="ffffff">
|
||||
A related question is “When <tt>rcu_read_lock()</tt>
|
||||
doesn't generate any code, why does it matter how it relates
|
||||
to a grace period?”
|
||||
@ -675,8 +676,9 @@ systems with more than one CPU:
|
||||
within the critical section, in which case none of the accesses
|
||||
within the critical section may observe the effects of any
|
||||
access following the grace period.
|
||||
</font>
|
||||
|
||||
<p>
|
||||
<p><font color="ffffff">
|
||||
As of late 2016, mathematical models of RCU take this
|
||||
viewpoint, for example, see slides 62 and 63
|
||||
of the
|
||||
@ -1616,8 +1618,8 @@ CPUs should at least make reasonable forward progress.
|
||||
In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt>
|
||||
is permitted to impose modest degradation of real-time latency
|
||||
on non-idle online CPUs.
|
||||
That said, it will likely be necessary to take further steps to reduce this
|
||||
degradation, hopefully to roughly that of a scheduling-clock interrupt.
|
||||
Here, “modest” means roughly the same latency
|
||||
degradation as a scheduling-clock interrupt.
|
||||
|
||||
<p>
|
||||
There are a number of situations where even
|
||||
@ -1913,12 +1915,9 @@ This requirement is another factor driving batching of grace periods,
|
||||
but it is also the driving force behind the checks for large numbers
|
||||
of queued RCU callbacks in the <tt>call_rcu()</tt> code path.
|
||||
Finally, high update rates should not delay RCU read-side critical
|
||||
sections, although some read-side delays can occur when using
|
||||
sections, although some small read-side delays can occur when using
|
||||
<tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use
|
||||
of <tt>try_stop_cpus()</tt>.
|
||||
(In the future, <tt>synchronize_rcu_expedited()</tt> will be
|
||||
converted to use lighter-weight inter-processor interrupts (IPIs),
|
||||
but this will still disturb readers, though to a much smaller degree.)
|
||||
of <tt>smp_call_function_single()</tt>.
|
||||
|
||||
<p>
|
||||
Although all three of these corner cases were understood in the early
|
||||
@ -2154,7 +2153,8 @@ as will <tt>rcu_assign_pointer()</tt>.
|
||||
<p>
|
||||
Although <tt>call_rcu()</tt> may be invoked at any
|
||||
time during boot, callbacks are not guaranteed to be invoked until after
|
||||
the scheduler is fully up and running.
|
||||
all of RCU's kthreads have been spawned, which occurs at
|
||||
<tt>early_initcall()</tt> time.
|
||||
This delay in callback invocation is due to the fact that RCU does not
|
||||
invoke callbacks until it is fully initialized, and this full initialization
|
||||
cannot occur until after the scheduler has initialized itself to the
|
||||
@ -2167,8 +2167,10 @@ on what operations those callbacks could invoke.
|
||||
Perhaps surprisingly, <tt>synchronize_rcu()</tt>,
|
||||
<a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a>
|
||||
(<a href="#Bottom-Half Flavor">discussed below</a>),
|
||||
and
|
||||
<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a>
|
||||
<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a>,
|
||||
<tt>synchronize_rcu_expedited()</tt>,
|
||||
<tt>synchronize_rcu_bh_expedited()</tt>, and
|
||||
<tt>synchronize_sched_expedited()</tt>
|
||||
will all operate normally
|
||||
during very early boot, the reason being that there is only one CPU
|
||||
and preemption is disabled.
|
||||
@ -2178,45 +2180,59 @@ state and thus a grace period, so the early-boot implementation can
|
||||
be a no-op.
|
||||
|
||||
<p>
|
||||
Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt>
|
||||
continue to operate normally through the remainder of boot, courtesy
|
||||
of the fact that preemption is disabled across their RCU read-side
|
||||
critical sections and also courtesy of the fact that there is still
|
||||
only one CPU.
|
||||
However, once the scheduler starts initializing, preemption is enabled.
|
||||
There is still only a single CPU, but the fact that preemption is enabled
|
||||
means that the no-op implementation of <tt>synchronize_rcu()</tt> no
|
||||
longer works in <tt>CONFIG_PREEMPT=y</tt> kernels.
|
||||
Therefore, as soon as the scheduler starts initializing, the early-boot
|
||||
fastpath is disabled.
|
||||
This means that <tt>synchronize_rcu()</tt> switches to its runtime
|
||||
mode of operation where it posts callbacks, which in turn means that
|
||||
any call to <tt>synchronize_rcu()</tt> will block until the corresponding
|
||||
callback is invoked.
|
||||
Unfortunately, the callback cannot be invoked until RCU's runtime
|
||||
grace-period machinery is up and running, which cannot happen until
|
||||
the scheduler has initialized itself sufficiently to allow RCU's
|
||||
kthreads to be spawned.
|
||||
Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler
|
||||
initialization can result in deadlock.
|
||||
However, once the scheduler has spawned its first kthread, this early
|
||||
boot trick fails for <tt>synchronize_rcu()</tt> (as well as for
|
||||
<tt>synchronize_rcu_expedited()</tt>) in <tt>CONFIG_PREEMPT=y</tt>
|
||||
kernels.
|
||||
The reason is that an RCU read-side critical section might be preempted,
|
||||
which means that a subsequent <tt>synchronize_rcu()</tt> really does have
|
||||
to wait for something, as opposed to simply returning immediately.
|
||||
Unfortunately, <tt>synchronize_rcu()</tt> can't do this until all of
|
||||
its kthreads are spawned, which doesn't happen until some time during
|
||||
<tt>early_initcalls()</tt> time.
|
||||
But this is no excuse: RCU is nevertheless required to correctly handle
|
||||
synchronous grace periods during this time period.
|
||||
Once all of its kthreads are up and running, RCU starts running
|
||||
normally.
|
||||
|
||||
<table>
|
||||
<tr><th> </th></tr>
|
||||
<tr><th align="left">Quick Quiz:</th></tr>
|
||||
<tr><td>
|
||||
So what happens with <tt>synchronize_rcu()</tt> during
|
||||
scheduler initialization for <tt>CONFIG_PREEMPT=n</tt>
|
||||
kernels?
|
||||
How can RCU possibly handle grace periods before all of its
|
||||
kthreads have been spawned???
|
||||
</td></tr>
|
||||
<tr><th align="left">Answer:</th></tr>
|
||||
<tr><td bgcolor="#ffffff"><font color="ffffff">
|
||||
In <tt>CONFIG_PREEMPT=n</tt> kernels, <tt>synchronize_rcu()</tt>
|
||||
maps directly to <tt>synchronize_sched()</tt>.
|
||||
Therefore, <tt>synchronize_rcu()</tt> works normally throughout
|
||||
boot in <tt>CONFIG_PREEMPT=n</tt> kernels.
|
||||
However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels,
|
||||
so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt>
|
||||
during scheduler initialization.
|
||||
Very carefully!
|
||||
</font>
|
||||
|
||||
<p><font color="ffffff">
|
||||
During the “dead zone” between the time that the
|
||||
scheduler spawns the first task and the time that all of RCU's
|
||||
kthreads have been spawned, all synchronous grace periods are
|
||||
handled by the expedited grace-period mechanism.
|
||||
At runtime, this expedited mechanism relies on workqueues, but
|
||||
during the dead zone the requesting task itself drives the
|
||||
desired expedited grace period.
|
||||
Because dead-zone execution takes place within task context,
|
||||
everything works.
|
||||
Once the dead zone ends, expedited grace periods go back to
|
||||
using workqueues, as is required to avoid problems that would
|
||||
otherwise occur when a user task received a POSIX signal while
|
||||
driving an expedited grace period.
|
||||
</font>
|
||||
|
||||
<p><font color="ffffff">
|
||||
And yes, this does mean that it is unhelpful to send POSIX
|
||||
signals to random tasks between the time that the scheduler
|
||||
spawns its first kthread and the time that RCU's kthreads
|
||||
have all been spawned.
|
||||
If there ever turns out to be a good reason for sending POSIX
|
||||
signals during that time, appropriate adjustments will be made.
|
||||
(If it turns out that POSIX signals are sent during this time for
|
||||
no good reason, other adjustments will be made, appropriate
|
||||
or otherwise.)
|
||||
</font></td></tr>
|
||||
<tr><td> </td></tr>
|
||||
</table>
|
||||
@ -2295,12 +2311,61 @@ situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU.
|
||||
The need for <tt>rcu_barrier()</tt> for module unloading became
|
||||
apparent later.
|
||||
|
||||
<p>
|
||||
<b>Important note</b>: The <tt>rcu_barrier()</tt> function is not,
|
||||
repeat, <i>not</i>, obligated to wait for a grace period.
|
||||
It is instead only required to wait for RCU callbacks that have
|
||||
already been posted.
|
||||
Therefore, if there are no RCU callbacks posted anywhere in the system,
|
||||
<tt>rcu_barrier()</tt> is within its rights to return immediately.
|
||||
Even if there are callbacks posted, <tt>rcu_barrier()</tt> does not
|
||||
necessarily need to wait for a grace period.
|
||||
|
||||
<table>
|
||||
<tr><th> </th></tr>
|
||||
<tr><th align="left">Quick Quiz:</th></tr>
|
||||
<tr><td>
|
||||
Wait a minute!
|
||||
Each RCU callback must wait for a grace period to complete,
|
||||
and <tt>rcu_barrier()</tt> must wait for each pre-existing
|
||||
callback to be invoked.
|
||||
Doesn't <tt>rcu_barrier()</tt> therefore need to wait for
|
||||
a full grace period if there is even one callback posted anywhere
|
||||
in the system?
|
||||
</td></tr>
|
||||
<tr><th align="left">Answer:</th></tr>
|
||||
<tr><td bgcolor="#ffffff"><font color="ffffff">
|
||||
Absolutely not!!!
|
||||
</font>
|
||||
|
||||
<p><font color="ffffff">
|
||||
Yes, each RCU callback must wait for a grace period to complete,
|
||||
but it might well be partly (or even completely) finished waiting
|
||||
by the time <tt>rcu_barrier()</tt> is invoked.
|
||||
In that case, <tt>rcu_barrier()</tt> need only wait for the
|
||||
remaining portion of the grace period to elapse.
|
||||
So even if there are quite a few callbacks posted,
|
||||
<tt>rcu_barrier()</tt> might well return quite quickly.
|
||||
</font>
|
||||
|
||||
<p><font color="ffffff">
|
||||
So if you need to wait for a grace period as well as for all
|
||||
pre-existing callbacks, you will need to invoke both
|
||||
<tt>synchronize_rcu()</tt> and <tt>rcu_barrier()</tt>.
|
||||
If latency is a concern, you can always use workqueues
|
||||
to invoke them concurrently.
|
||||
</font></td></tr>
|
||||
<tr><td> </td></tr>
|
||||
</table>
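<p>
The concurrent-invocation suggestion from the Quick Quiz answer can be
implemented along the following lines.
This is a sketch that assumes process context where sleeping is permitted;
error handling is omitted:

<pre>
static void sync_rcu_workfn(struct work_struct *work)
{
	synchronize_rcu();
}
static DECLARE_WORK(sync_work, sync_rcu_workfn);

static void rcu_barrier_workfn(struct work_struct *work)
{
	rcu_barrier();
}
static DECLARE_WORK(barrier_work, rcu_barrier_workfn);

static void wait_for_gp_and_prior_callbacks(void)
{
	schedule_work(&sync_work);	/* Wait for a full grace period... */
	schedule_work(&barrier_work);	/* ...and for all prior callbacks... */
	flush_work(&sync_work);		/* ...with the two waits overlapped. */
	flush_work(&barrier_work);
}
</pre>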
|
||||
|
||||
<h3><a name="Hotplug CPU">Hotplug CPU</a></h3>
|
||||
|
||||
<p>
|
||||
The Linux kernel supports CPU hotplug, which means that CPUs
|
||||
can come and go.
|
||||
It is of course illegal to use any RCU API member from an offline CPU.
|
||||
It is of course illegal to use any RCU API member from an offline CPU,
|
||||
with the exception of <a href="#Sleepable RCU">SRCU</a> read-side
|
||||
critical sections.
|
||||
This requirement was present from day one in DYNIX/ptx, but
|
||||
on the other hand, the Linux kernel's CPU-hotplug implementation
|
||||
is “interesting.”
|
||||
@ -2310,19 +2375,18 @@ The Linux-kernel CPU-hotplug implementation has notifiers that
|
||||
are used to allow the various kernel subsystems (including RCU)
|
||||
to respond appropriately to a given CPU-hotplug operation.
|
||||
Most RCU operations may be invoked from CPU-hotplug notifiers,
|
||||
including even normal synchronous grace-period operations
|
||||
such as <tt>synchronize_rcu()</tt>.
|
||||
However, expedited grace-period operations such as
|
||||
<tt>synchronize_rcu_expedited()</tt> are not supported,
|
||||
due to the fact that current implementations block CPU-hotplug
|
||||
operations, which could result in deadlock.
|
||||
including even synchronous grace-period operations such as
|
||||
<tt>synchronize_rcu()</tt> and <tt>synchronize_rcu_expedited()</tt>.
|
||||
|
||||
<p>
|
||||
In addition, all-callback-wait operations such as
|
||||
However, all-callback-wait operations such as
|
||||
<tt>rcu_barrier()</tt> are also not supported, due to the
|
||||
fact that there are phases of CPU-hotplug operations where
|
||||
the outgoing CPU's callbacks will not be invoked until after
|
||||
the CPU-hotplug operation ends, which could also result in deadlock.
|
||||
Furthermore, <tt>rcu_barrier()</tt> blocks CPU-hotplug operations
|
||||
during its execution, which results in another type of deadlock
|
||||
when invoked from a CPU-hotplug notifier.
|
||||
|
||||
<h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3>
|
||||
|
||||
@ -2863,6 +2927,27 @@ It also motivates the <tt>smp_mb__after_srcu_read_unlock()</tt>
|
||||
API, which, in combination with <tt>srcu_read_unlock()</tt>,
|
||||
guarantees a full memory barrier.
|
||||
|
||||
<p>
|
||||
Also unlike other RCU flavors, SRCU's callbacks-wait function
|
||||
<tt>srcu_barrier()</tt> may be invoked from CPU-hotplug notifiers,
|
||||
though this is not necessarily a good idea.
|
||||
The reason that this is possible is that SRCU is insensitive
|
||||
to whether or not a CPU is online, which means that <tt>srcu_barrier()</tt>
|
||||
need not exclude CPU-hotplug operations.
|
||||
|
||||
<p>
|
||||
As of v4.12, SRCU's callbacks are maintained per-CPU, eliminating
|
||||
a locking bottleneck present in prior kernel versions.
|
||||
Although this will allow users to put much heavier stress on
|
||||
<tt>call_srcu()</tt>, it is important to note that SRCU does not
|
||||
yet take any special steps to deal with callback flooding.
|
||||
So if you are posting (say) 10,000 SRCU callbacks per second per CPU,
|
||||
you are probably totally OK, but if you intend to post (say) 1,000,000
|
||||
SRCU callbacks per second per CPU, please run some tests first.
|
||||
SRCU just might need a few adjustments to deal with that sort of load.
|
||||
Of course, your mileage may vary based on the speed of your CPUs and
|
||||
the size of your memory.
|
||||
|
||||
<p>
|
||||
The
|
||||
<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a>
|
||||
@ -3021,8 +3106,8 @@ to do some redesign to avoid this scalability problem.
|
||||
|
||||
<p>
|
||||
RCU disables CPU hotplug in a few places, perhaps most notably in the
|
||||
expedited grace-period and <tt>rcu_barrier()</tt> operations.
|
||||
If there is a strong reason to use expedited grace periods in CPU-hotplug
|
||||
<tt>rcu_barrier()</tt> operations.
|
||||
If there is a strong reason to use <tt>rcu_barrier()</tt> in CPU-hotplug
|
||||
notifiers, it will be necessary to avoid disabling CPU hotplug.
|
||||
This would introduce some complexity, so there had better be a <i>very</i>
|
||||
good reason.
|
||||
@ -3096,9 +3181,5 @@ Andy Lutomirski for their help in rendering
|
||||
this article human readable, and to Michelle Rankin for her support
|
||||
of this effort.
|
||||
Other contributions are acknowledged in the Linux kernel's git archive.
|
||||
The cartoon is copyright (c) 2013 by Melissa Broussard,
|
||||
and is provided
|
||||
under the terms of the Creative Commons Attribution-Share Alike 3.0
|
||||
United States license.
|
||||
|
||||
</body></html>
|
||||
|
@ -138,6 +138,15 @@ o Be very careful about comparing pointers obtained from
|
||||
This sort of comparison occurs frequently when scanning
|
||||
RCU-protected circular linked lists.
|
||||
|
||||
Note that if checks for being within an RCU read-side
|
||||
critical section are not required and the pointer is never
|
||||
dereferenced, rcu_access_pointer() should be used in place
|
||||
of rcu_dereference(). The rcu_access_pointer() primitive
|
||||
does not require an enclosing read-side critical section,
|
||||
and also omits the smp_read_barrier_depends() included in
|
||||
rcu_dereference(), which in turn should provide a small
|
||||
performance gain in some CPUs (e.g., the DEC Alpha).
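	For example (an illustrative sketch, not taken from the
	kernel source, with gp, p, and do_something_with() assumed
	for the example), a presence check that never dereferences
	the pointer can use rcu_access_pointer(), whereas an actual
	dereference requires rcu_dereference() within a read-side
	critical section:

		/* No dereference, so no read-side critical section. */
		if (!rcu_access_pointer(gp))
			return false;

		/* Dereference, so rcu_read_lock() protection is required. */
		rcu_read_lock();
		p = rcu_dereference(gp);
		if (p)
			do_something_with(p->a);
		rcu_read_unlock();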
|
||||
|
||||
o The comparison is against a pointer that references memory
|
||||
that was initialized "a long time ago." The reason
|
||||
this is safe is that even if misordering occurs, the
|
||||
|
@ -1,5 +1,5 @@
|
||||
Using hlist_nulls to protect read-mostly linked lists and
|
||||
objects using SLAB_DESTROY_BY_RCU allocations.
|
||||
objects using SLAB_TYPESAFE_BY_RCU allocations.
|
||||
|
||||
Please read the basics in Documentation/RCU/listRCU.txt
|
||||
|
||||
@ -7,7 +7,7 @@ Using special markers (called 'nulls') is a convenient way
|
||||
to solve the following problem:
|
||||
|
||||
A typical RCU linked list managing objects which are
|
||||
allocated with SLAB_DESTROY_BY_RCU kmem_cache can
|
||||
allocated with SLAB_TYPESAFE_BY_RCU kmem_cache can
|
||||
use following algos :
|
||||
|
||||
1) Lookup algo
|
||||
@ -96,7 +96,7 @@ unlock_chain(); // typically a spin_unlock()
|
||||
3) Remove algo
|
||||
--------------
|
||||
Nothing special here, we can use a standard RCU hlist deletion.
|
||||
But thanks to SLAB_DESTROY_BY_RCU, beware a deleted object can be reused
|
||||
But thanks to SLAB_TYPESAFE_BY_RCU, beware a deleted object can be reused
|
||||
very very fast (before the end of the RCU grace period)
|
||||
|
||||
if (put_last_reference_on(obj)) {
|
||||
|
@ -1,9 +1,102 @@
|
||||
Using RCU's CPU Stall Detector
|
||||
|
||||
The rcu_cpu_stall_suppress module parameter enables RCU's CPU stall
|
||||
detector, which detects conditions that unduly delay RCU grace periods.
|
||||
This module parameter enables CPU stall detection by default, but
|
||||
may be overridden via boot-time parameter or at runtime via sysfs.
|
||||
This document first discusses what sorts of issues RCU's CPU stall
|
||||
detector can locate, and then discusses kernel parameters and Kconfig
|
||||
options that can be used to fine-tune the detector's operation. Finally,
|
||||
this document explains the stall detector's "splat" format.
|
||||
|
||||
|
||||
What Causes RCU CPU Stall Warnings?
|
||||
|
||||
So your kernel printed an RCU CPU stall warning. The next question is
|
||||
"What caused it?" The following problems can result in RCU CPU stall
|
||||
warnings:
|
||||
|
||||
o A CPU looping in an RCU read-side critical section.
|
||||
|
||||
o A CPU looping with interrupts disabled.
|
||||
|
||||
o A CPU looping with preemption disabled. This condition can
|
||||
result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh
|
||||
stalls.
|
||||
|
||||
o A CPU looping with bottom halves disabled. This condition can
|
||||
result in RCU-sched and RCU-bh stalls.
|
||||
|
||||
o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the
|
||||
kernel without invoking schedule(). Note that cond_resched()
|
||||
does not necessarily prevent RCU CPU stall warnings. Therefore,
|
||||
if the looping in the kernel is really expected and desirable
|
||||
behavior, you might need to replace some of the cond_resched()
|
||||
calls with calls to cond_resched_rcu_qs().
|
||||
|
||||
o Booting Linux using a console connection that is too slow to
|
||||
keep up with the boot-time console-message rate. For example,
|
||||
a 115Kbaud serial console can be -way- too slow to keep up
|
||||
with boot-time message rates, and will frequently result in
|
||||
RCU CPU stall warning messages. Especially if you have added
|
||||
debug printk()s.
|
||||
|
||||
o Anything that prevents RCU's grace-period kthreads from running.
|
||||
This can result in the "All QSes seen" console-log message.
|
||||
This message will include information on when the kthread last
|
||||
ran and how often it should be expected to run.
|
||||
|
||||
o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
|
||||
happen to preempt a low-priority task in the middle of an RCU
|
||||
read-side critical section. This is especially damaging if
|
||||
that low-priority task is not permitted to run on any other CPU,
|
||||
in which case the next RCU grace period can never complete, which
|
||||
will eventually cause the system to run out of memory and hang.
|
||||
While the system is in the process of running itself out of
|
||||
memory, you might see stall-warning messages.
|
||||
|
||||
o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
|
||||
is running at a higher priority than the RCU softirq threads.
|
||||
This will prevent RCU callbacks from ever being invoked,
|
||||
and in a CONFIG_PREEMPT_RCU kernel will further prevent
|
||||
RCU grace periods from ever completing. Either way, the
|
||||
system will eventually run out of memory and hang. In the
|
||||
CONFIG_PREEMPT_RCU case, you might see stall-warning
|
||||
messages.
|
||||
|
||||
o A hardware or software issue shuts off the scheduler-clock
|
||||
interrupt on a CPU that is not in dyntick-idle mode. This
|
||||
problem really has happened, and seems to be most likely to
|
||||
result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels.
|
||||
|
||||
o A bug in the RCU implementation.
|
||||
|
||||
o A hardware failure. This is quite unlikely, but has occurred
|
||||
at least once in real life. A CPU failed in a running system,
|
||||
becoming unresponsive, but not causing an immediate crash.
|
||||
This resulted in a series of RCU CPU stall warnings, eventually
|
||||
leading to the realization that the CPU had failed.
|
||||
|
||||
The RCU, RCU-sched, RCU-bh, and RCU-tasks implementations have CPU stall
|
||||
warnings. Note that SRCU does -not- have CPU stall warnings. Please note
|
||||
that RCU only detects CPU stalls when there is a grace period in progress.
|
||||
No grace period, no CPU stall warnings.
|
||||
|
||||
To diagnose the cause of the stall, inspect the stack traces.
|
||||
The offending function will usually be near the top of the stack.
|
||||
If you have a series of stall warnings from a single extended stall,
|
||||
comparing the stack traces can often help determine where the stall
|
||||
is occurring, which will usually be in the function nearest the top of
|
||||
that portion of the stack which remains the same from trace to trace.
|
||||
If you can reliably trigger the stall, ftrace can be quite helpful.
|
||||
|
||||
RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE
|
||||
and with RCU's event tracing. For information on RCU's event tracing,
|
||||
see include/trace/events/rcu.h.
|
||||
|
||||
|
||||
Fine-Tuning the RCU CPU Stall Detector
|
||||
|
||||
The rcupdate.rcu_cpu_stall_suppress module parameter disables RCU's
|
||||
CPU stall detector, which detects conditions that unduly delay RCU grace
|
||||
periods. This module parameter enables CPU stall detection by default,
|
||||
but may be overridden via boot-time parameter or at runtime via sysfs.
|
||||
The stall detector's idea of what constitutes "unduly delayed" is
|
||||
controlled by a set of kernel configuration variables and cpp macros:
|
||||
|
||||
@ -56,6 +149,9 @@ rcupdate.rcu_task_stall_timeout
|
||||
And continues with the output of sched_show_task() for each
|
||||
task stalling the current RCU-tasks grace period.
|
||||
|
||||
|
||||
Interpreting RCU's CPU Stall-Detector "Splats"
|
||||
|
||||
For non-RCU-tasks flavors of RCU, when a CPU detects that it is stalling,
|
||||
it will print a message similar to the following:
|
||||
|
||||
@ -178,89 +274,3 @@ grace period is in flight.
|
||||
|
||||
It is entirely possible to see stall warnings from normal and from
|
||||
expedited grace periods at about the same time from the same run.
|
||||
|
||||
|
||||
What Causes RCU CPU Stall Warnings?
|
||||
|
||||
So your kernel printed an RCU CPU stall warning. The next question is
|
||||
"What caused it?" The following problems can result in RCU CPU stall
|
||||
warnings:
|
||||
|
||||
o A CPU looping in an RCU read-side critical section.
|
||||
|
||||
o A CPU looping with interrupts disabled. This condition can
|
||||
result in RCU-sched and RCU-bh stalls.
|
||||
|
||||
o A CPU looping with preemption disabled. This condition can
|
||||
result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh
|
||||
stalls.
|
||||
|
||||
o A CPU looping with bottom halves disabled. This condition can
|
||||
result in RCU-sched and RCU-bh stalls.
|
||||
|
||||
o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the
|
||||
kernel without invoking schedule(). Note that cond_resched()
|
||||
does not necessarily prevent RCU CPU stall warnings. Therefore,
|
||||
if the looping in the kernel is really expected and desirable
|
||||
behavior, you might need to replace some of the cond_resched()
|
||||
calls with calls to cond_resched_rcu_qs().
|
||||
|
||||
o Booting Linux using a console connection that is too slow to
|
||||
keep up with the boot-time console-message rate. For example,
|
||||
a 115Kbaud serial console can be -way- too slow to keep up
|
||||
with boot-time message rates, and will frequently result in
|
||||
RCU CPU stall warning messages. Especially if you have added
|
||||
debug printk()s.
|
||||
|
||||
o Anything that prevents RCU's grace-period kthreads from running.
|
||||
This can result in the "All QSes seen" console-log message.
|
||||
This message will include information on when the kthread last
|
||||
ran and how often it should be expected to run.
|
||||
|
||||
o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
|
||||
happen to preempt a low-priority task in the middle of an RCU
|
||||
read-side critical section. This is especially damaging if
|
||||
that low-priority task is not permitted to run on any other CPU,
|
||||
in which case the next RCU grace period can never complete, which
|
||||
will eventually cause the system to run out of memory and hang.
|
||||
While the system is in the process of running itself out of
|
||||
memory, you might see stall-warning messages.
|
||||
|
||||
o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
|
||||
is running at a higher priority than the RCU softirq threads.
|
||||
This will prevent RCU callbacks from ever being invoked,
|
||||
and in a CONFIG_PREEMPT_RCU kernel will further prevent
|
||||
RCU grace periods from ever completing. Either way, the
|
||||
system will eventually run out of memory and hang. In the
|
||||
CONFIG_PREEMPT_RCU case, you might see stall-warning
|
||||
messages.
|
||||
|
||||
o A hardware or software issue shuts off the scheduler-clock
|
||||
interrupt on a CPU that is not in dyntick-idle mode. This
|
||||
problem really has happened, and seems to be most likely to
|
||||
result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels.
|
||||
|
||||
o A bug in the RCU implementation.
|
||||
|
||||
o A hardware failure. This is quite unlikely, but has occurred
|
||||
at least once in real life. A CPU failed in a running system,
|
||||
becoming unresponsive, but not causing an immediate crash.
|
||||
This resulted in a series of RCU CPU stall warnings, eventually
|
||||
leading to the realization that the CPU had failed.
|
||||
|
||||
The RCU, RCU-sched, RCU-bh, and RCU-tasks implementations have CPU stall
|
||||
warnings. Note that SRCU does -not- have CPU stall warnings. Please note
|
||||
that RCU only detects CPU stalls when there is a grace period in progress.
|
||||
No grace period, no CPU stall warnings.
|
||||
|
||||
To diagnose the cause of the stall, inspect the stack traces.
|
||||
The offending function will usually be near the top of the stack.
|
||||
If you have a series of stall warnings from a single extended stall,
|
||||
comparing the stack traces can often help determine where the stall
|
||||
is occurring, which will usually be in the function nearest the top of
|
||||
that portion of the stack which remains the same from trace to trace.
|
||||
If you can reliably trigger the stall, ftrace can be quite helpful.
|
||||
|
||||
RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE
|
||||
and with RCU's event tracing. For information on RCU's event tracing,
|
||||
see include/trace/events/rcu.h.
|
||||
|
@ -562,7 +562,9 @@ This section presents a "toy" RCU implementation that is based on
|
||||
familiar locking primitives. Its overhead makes it a non-starter for
|
||||
real-life use, as does its lack of scalability. It is also unsuitable
|
||||
for realtime use, since it allows scheduling latency to "bleed" from
|
||||
one read-side critical section to another.
|
||||
one read-side critical section to another. It also assumes recursive
|
||||
reader-writer locks: If you try this with non-recursive locks, and
|
||||
you allow nested rcu_read_lock() calls, you can deadlock.
|
||||
|
||||
However, it is probably the easiest implementation to relate to, so is
|
||||
a good starting point.
|
||||
@ -587,20 +589,21 @@ It is extremely simple:
|
||||
write_unlock(&rcu_gp_mutex);
|
||||
}
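For reference, the complete toy implementation whose tail appears in the
hunk above is, in essence, as follows (a reconstruction for convenience;
consult the full file for the authoritative version):

	static DEFINE_RWLOCK(rcu_gp_mutex);

	void rcu_read_lock(void)
	{
		read_lock(&rcu_gp_mutex);
	}

	void rcu_read_unlock(void)
	{
		read_unlock(&rcu_gp_mutex);
	}

	void synchronize_rcu(void)
	{
		write_lock(&rcu_gp_mutex);
		write_unlock(&rcu_gp_mutex);
	}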
|
||||
|
||||
[You can ignore rcu_assign_pointer() and rcu_dereference() without
|
||||
missing much. But here they are anyway. And whatever you do, don't
|
||||
forget about them when submitting patches making use of RCU!]
|
||||
[You can ignore rcu_assign_pointer() and rcu_dereference() without missing
|
||||
much. But here are simplified versions anyway. And whatever you do,
|
||||
don't forget about them when submitting patches making use of RCU!]
|
||||
|
||||
#define rcu_assign_pointer(p, v) ({ \
|
||||
smp_wmb(); \
|
||||
(p) = (v); \
|
||||
})
|
||||
#define rcu_assign_pointer(p, v) \
|
||||
({ \
|
||||
smp_store_release(&(p), (v)); \
|
||||
})
|
||||
|
||||
#define rcu_dereference(p) ({ \
|
||||
typeof(p) _________p1 = p; \
|
||||
smp_read_barrier_depends(); \
|
||||
(_________p1); \
|
||||
})
|
||||
#define rcu_dereference(p) \
|
||||
({ \
|
||||
typeof(p) _________p1 = p; \
|
||||
smp_read_barrier_depends(); \
|
||||
(_________p1); \
|
||||
})
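A minimal publish/subscribe use of these two primitives looks like the
following sketch, in which gp, struct foo, and the omitted freeing of any
old structure are assumptions made for the example:

	struct foo {
		int a;
	};
	static struct foo *gp;

	/* Updater: initialize the new structure, then publish it. */
	void update_foo(int a)
	{
		struct foo *new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL);

		if (!new_fp)
			return;
		new_fp->a = a;
		rcu_assign_pointer(gp, new_fp);
		/* Freeing any old structure (after a grace period) omitted. */
	}

	/* Reader: subscribe within a read-side critical section. */
	int read_foo(void)
	{
		struct foo *fp;
		int a = -1;

		rcu_read_lock();
		fp = rcu_dereference(gp);
		if (fp)
			a = fp->a;
		rcu_read_unlock();
		return a;
	}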
|
||||
|
||||
|
||||
The rcu_read_lock() and rcu_read_unlock() primitive read-acquire
|
||||
@ -925,7 +928,8 @@ d. Do you need RCU grace periods to complete even in the face
|
||||
|
||||
e. Is your workload too update-intensive for normal use of
|
||||
RCU, but inappropriate for other synchronization mechanisms?
|
||||
If so, consider SLAB_DESTROY_BY_RCU. But please be careful!
|
||||
If so, consider SLAB_TYPESAFE_BY_RCU (which was originally
|
||||
named SLAB_DESTROY_BY_RCU). But please be careful!
|
||||
|
||||
f. Do you need read-side critical sections that are respected
|
||||
even though they are in the middle of the idle loop, during
|
||||
|
@ -768,7 +768,7 @@ equal to zero, in which case the compiler is within its rights to
|
||||
transform the above code into the following:
|
||||
|
||||
q = READ_ONCE(a);
|
||||
WRITE_ONCE(b, 1);
|
||||
WRITE_ONCE(b, 2);
|
||||
do_something_else();
|
||||
|
||||
Given this transformation, the CPU is not required to respect the ordering
|
||||
|
@ -320,6 +320,9 @@ config HAVE_CMPXCHG_LOCAL
|
||||
config HAVE_CMPXCHG_DOUBLE
|
||||
bool
|
||||
|
||||
config ARCH_WEAK_RELEASE_ACQUIRE
|
||||
bool
|
||||
|
||||
config ARCH_WANT_IPC_PARSE_VERSION
|
||||
bool
|
||||
|
||||
|
@ -99,6 +99,7 @@ config PPC
|
||||
select ARCH_USE_BUILTIN_BSWAP
|
||||
select ARCH_USE_CMPXCHG_LOCKREF if PPC64
|
||||
select ARCH_WANT_IPC_PARSE_VERSION
|
||||
select ARCH_WEAK_RELEASE_ACQUIRE
|
||||
select BINFMT_ELF
|
||||
select BUILDTIME_EXTABLE_SORT
|
||||
select CLONE_BACKWARDS
|
||||
|
@ -4665,7 +4665,7 @@ i915_gem_load_init(struct drm_i915_private *dev_priv)
|
||||
dev_priv->requests = KMEM_CACHE(drm_i915_gem_request,
|
||||
SLAB_HWCACHE_ALIGN |
|
||||
SLAB_RECLAIM_ACCOUNT |
|
||||
SLAB_DESTROY_BY_RCU);
|
||||
SLAB_TYPESAFE_BY_RCU);
|
||||
if (!dev_priv->requests)
|
||||
goto err_vmas;
|
||||
|
||||
|
@ -493,7 +493,7 @@ static inline struct drm_i915_gem_request *
|
||||
__i915_gem_active_get_rcu(const struct i915_gem_active *active)
|
||||
{
|
||||
/* Performing a lockless retrieval of the active request is super
|
||||
* tricky. SLAB_DESTROY_BY_RCU merely guarantees that the backing
|
||||
* tricky. SLAB_TYPESAFE_BY_RCU merely guarantees that the backing
|
||||
* slab of request objects will not be freed whilst we hold the
|
||||
* RCU read lock. It does not guarantee that the request itself
|
||||
* will not be freed and then *reused*. Viz,
|
||||
|
@ -1071,7 +1071,7 @@ int ldlm_init(void)
|
||||
ldlm_lock_slab = kmem_cache_create("ldlm_locks",
|
||||
sizeof(struct ldlm_lock), 0,
|
||||
SLAB_HWCACHE_ALIGN |
|
||||
SLAB_DESTROY_BY_RCU, NULL);
|
||||
SLAB_TYPESAFE_BY_RCU, NULL);
|
||||
if (!ldlm_lock_slab) {
|
||||
kmem_cache_destroy(ldlm_resource_slab);
|
||||
return -ENOMEM;
|
||||
|
@ -2340,7 +2340,7 @@ static int jbd2_journal_init_journal_head_cache(void)
|
||||
jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
|
||||
sizeof(struct journal_head),
|
||||
0, /* offset */
|
||||
SLAB_TEMPORARY | SLAB_DESTROY_BY_RCU,
|
||||
SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU,
|
||||
NULL); /* ctor */
|
||||
retval = 0;
|
||||
if (!jbd2_journal_head_cache) {
|
||||
|
@ -38,7 +38,7 @@ void signalfd_cleanup(struct sighand_struct *sighand)
|
||||
/*
|
||||
* The lockless check can race with remove_wait_queue() in progress,
|
||||
* but in this case its caller should run under rcu_read_lock() and
|
||||
* sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return.
|
||||
* sighand_cachep is SLAB_TYPESAFE_BY_RCU, we can safely return.
|
||||
*/
|
||||
if (likely(!waitqueue_active(wqh)))
|
||||
return;
|
||||
|
@ -229,7 +229,7 @@ static inline struct dma_fence *dma_fence_get_rcu(struct dma_fence *fence)
|
||||
*
|
||||
* Function returns NULL if no refcount could be obtained, or the fence.
|
||||
* This function handles acquiring a reference to a fence that may be
|
||||
* reallocated within the RCU grace period (such as with SLAB_DESTROY_BY_RCU),
|
||||
* reallocated within the RCU grace period (such as with SLAB_TYPESAFE_BY_RCU),
|
||||
* so long as the caller is using RCU on the pointer to the fence.
|
||||
*
|
||||
* An alternative mechanism is to employ a seqlock to protect a bunch of
|
||||
@ -257,7 +257,7 @@ dma_fence_get_rcu_safe(struct dma_fence * __rcu *fencep)
|
||||
* have successfully acquire a reference to it. If it no
|
||||
* longer matches, we are holding a reference to some other
|
||||
* reallocated pointer. This is possible if the allocator
|
||||
* is using a freelist like SLAB_DESTROY_BY_RCU where the
|
||||
* is using a freelist like SLAB_TYPESAFE_BY_RCU where the
|
||||
* fence remains valid for the RCU grace period, but it
|
||||
* may be reallocated. When using such allocators, we are
|
||||
* responsible for ensuring the reference we get is to
|
||||
|
@ -375,8 +375,6 @@ struct kvm {
|
||||
struct mutex slots_lock;
|
||||
struct mm_struct *mm; /* userspace tied to this vm */
|
||||
struct kvm_memslots *memslots[KVM_ADDRESS_SPACE_NUM];
|
||||
struct srcu_struct srcu;
|
||||
struct srcu_struct irq_srcu;
|
||||
struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
|
||||
|
||||
/*
|
||||
@ -429,6 +427,8 @@ struct kvm {
|
||||
struct list_head devices;
|
||||
struct dentry *debugfs_dentry;
|
||||
struct kvm_stat_data **debugfs_stat_data;
|
||||
struct srcu_struct srcu;
|
||||
struct srcu_struct irq_srcu;
|
||||
};
|
||||
|
||||
#define kvm_err(fmt, ...) \
|
||||
|
include/linux/rcu_node_tree.h (new file, 99 lines)
@ -0,0 +1,99 @@
|
||||
/*
|
||||
* RCU node combining tree definitions. These are used to compute
|
||||
* global attributes while avoiding common-case global contention. A key
|
||||
* property that these computations rely on is a tournament-style approach
|
||||
* where only one of the tasks contending a lower level in the tree need
|
||||
* advance to the next higher level. If properly configured, this allows
|
||||
* unlimited scalability while maintaining a constant level of contention
|
||||
* on the root node.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, you can access it online at
|
||||
* http://www.gnu.org/licenses/gpl-2.0.html.
|
||||
*
|
||||
* Copyright IBM Corporation, 2017
|
||||
*
|
||||
* Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
|
||||
*/
|
||||
|
||||
#ifndef __LINUX_RCU_NODE_TREE_H
|
||||
#define __LINUX_RCU_NODE_TREE_H
|
||||
|
||||
/*
|
||||
* Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
|
||||
* CONFIG_RCU_FANOUT_LEAF.
|
||||
* In theory, it should be possible to add more levels straightforwardly.
|
||||
* In practice, this did work well going from three levels to four.
|
||||
* Of course, your mileage may vary.
|
||||
*/
|
||||
|
||||
#ifdef CONFIG_RCU_FANOUT
|
||||
#define RCU_FANOUT CONFIG_RCU_FANOUT
|
||||
#else /* #ifdef CONFIG_RCU_FANOUT */
|
||||
# ifdef CONFIG_64BIT
|
||||
# define RCU_FANOUT 64
|
||||
# else
|
||||
# define RCU_FANOUT 32
|
||||
# endif
|
||||
#endif /* #else #ifdef CONFIG_RCU_FANOUT */
|
||||
|
||||
#ifdef CONFIG_RCU_FANOUT_LEAF
|
||||
#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF
|
||||
#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */
|
||||
#define RCU_FANOUT_LEAF 16
|
||||
#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */
|
||||
|
||||
#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
|
||||
#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT)
|
||||
#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT)
|
||||
#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT)
|
||||
|
||||
#if NR_CPUS <= RCU_FANOUT_1
|
||||
# define RCU_NUM_LVLS 1
|
||||
# define NUM_RCU_LVL_0 1
|
||||
# define NUM_RCU_NODES NUM_RCU_LVL_0
|
||||
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 }
|
||||
# define RCU_NODE_NAME_INIT { "rcu_node_0" }
|
||||
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
|
||||
#elif NR_CPUS <= RCU_FANOUT_2
|
||||
# define RCU_NUM_LVLS 2
|
||||
# define NUM_RCU_LVL_0 1
|
||||
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
|
||||
# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1)
|
||||
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
|
||||
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
|
||||
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
|
||||
#elif NR_CPUS <= RCU_FANOUT_3
|
||||
# define RCU_NUM_LVLS 3
|
||||
# define NUM_RCU_LVL_0 1
|
||||
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
|
||||
# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
|
||||
# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2)
|
||||
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
|
||||
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
|
||||
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
|
||||
#elif NR_CPUS <= RCU_FANOUT_4
|
||||
# define RCU_NUM_LVLS 4
|
||||
# define NUM_RCU_LVL_0 1
|
||||
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
|
||||
# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
|
||||
# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
|
||||
# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
|
||||
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
|
||||
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
|
||||
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
|
||||
#else
|
||||
# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
|
||||
#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
|
||||
|
||||
#endif /* __LINUX_RCU_NODE_TREE_H */
|
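To make the sizing concrete, the user-space sketch below (editorial, not part of the patch) plugs a hypothetical NR_CPUS of 4096 with RCU_FANOUT=64 and RCU_FANOUT_LEAF=16 into the same arithmetic: the result is a three-level tree with 1 + 4 + 256 = 261 rcu_node structures.

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	const long nr_cpus = 4096, fanout = 64, leaf = 16;
	const long f1 = leaf, f2 = f1 * fanout, f3 = f2 * fanout;

	if (nr_cpus <= f1)
		printf("1 level, 1 node\n");
	else if (nr_cpus <= f2)
		printf("2 levels, %ld nodes\n", 1 + DIV_ROUND_UP(nr_cpus, f1));
	else if (nr_cpus <= f3)
		printf("3 levels, %ld nodes\n",
		       1 + DIV_ROUND_UP(nr_cpus, f2) + DIV_ROUND_UP(nr_cpus, f1));
	else
		printf("needs 4 levels\n");
	return 0;
}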
712
include/linux/rcu_segcblist.h
Normal file
@ -0,0 +1,712 @@
|
||||
/*
|
||||
* RCU segmented callback lists
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, you can access it online at
|
||||
* http://www.gnu.org/licenses/gpl-2.0.html.
|
||||
*
|
||||
* Copyright IBM Corporation, 2017
|
||||
*
|
||||
* Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
|
||||
*/
|
||||
|
||||
#ifndef __KERNEL_RCU_SEGCBLIST_H
|
||||
#define __KERNEL_RCU_SEGCBLIST_H
|
||||
|
||||
/* Simple unsegmented callback lists. */
|
||||
struct rcu_cblist {
|
||||
struct rcu_head *head;
|
||||
struct rcu_head **tail;
|
||||
long len;
|
||||
long len_lazy;
|
||||
};
|
||||
|
||||
#define RCU_CBLIST_INITIALIZER(n) { .head = NULL, .tail = &n.head }
|
||||
|
||||
/* Initialize simple callback list. */
|
||||
static inline void rcu_cblist_init(struct rcu_cblist *rclp)
|
||||
{
|
||||
rclp->head = NULL;
|
||||
rclp->tail = &rclp->head;
|
||||
rclp->len = 0;
|
||||
rclp->len_lazy = 0;
|
||||
}
|
||||
|
||||
/* Is simple callback list empty? */
|
||||
static inline bool rcu_cblist_empty(struct rcu_cblist *rclp)
|
||||
{
|
||||
return !rclp->head;
|
||||
}
|
||||
|
||||
/* Return number of callbacks in simple callback list. */
|
||||
static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)
|
||||
{
|
||||
return rclp->len;
|
||||
}
|
||||
|
||||
/* Return number of lazy callbacks in simple callback list. */
|
||||
static inline long rcu_cblist_n_lazy_cbs(struct rcu_cblist *rclp)
|
||||
{
|
||||
return rclp->len_lazy;
|
||||
}
|
||||
|
||||
/*
|
||||
* Debug function to actually count the number of callbacks.
|
||||
* If the number exceeds the limit specified, return -1.
|
||||
*/
|
||||
static inline long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim)
|
||||
{
|
||||
int cnt = 0;
|
||||
struct rcu_head **rhpp = &rclp->head;
|
||||
|
||||
for (;;) {
|
||||
if (!*rhpp)
|
||||
return cnt;
|
||||
if (++cnt > lim)
|
||||
return -1;
|
||||
rhpp = &(*rhpp)->next;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Dequeue the oldest rcu_head structure from the specified callback
|
||||
* list. This function assumes that the callback is non-lazy, but
|
||||
* the caller can later invoke rcu_cblist_dequeued_lazy() if it
|
||||
* finds otherwise (and if it cares about laziness). This allows
|
||||
* different users to have different ways of determining laziness.
|
||||
*/
|
||||
static inline struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp)
|
||||
{
|
||||
struct rcu_head *rhp;
|
||||
|
||||
rhp = rclp->head;
|
||||
if (!rhp)
|
||||
return NULL;
|
||||
rclp->len--;
|
||||
rclp->head = rhp->next;
|
||||
if (!rclp->head)
|
||||
rclp->tail = &rclp->head;
|
||||
return rhp;
|
||||
}
|
||||
|
||||
/*
|
||||
* Account for the fact that a previously dequeued callback turned out
|
||||
* to be marked as lazy.
|
||||
*/
|
||||
static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp)
|
||||
{
|
||||
rclp->len_lazy--;
|
||||
}
|
||||
|
||||
/*
|
||||
* Interim function to return rcu_cblist head pointer. Longer term, the
|
||||
* rcu_cblist will be used more pervasively, removing the need for this
|
||||
* function.
|
||||
*/
|
||||
static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp)
|
||||
{
|
||||
return rclp->head;
|
||||
}
|
||||
|
||||
/*
|
||||
* Interim function to return rcu_cblist head pointer. Longer term, the
|
||||
* rcu_cblist will be used more pervasively, removing the need for this
|
||||
* function.
|
||||
*/
|
||||
static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp)
|
||||
{
|
||||
WARN_ON_ONCE(rcu_cblist_empty(rclp));
|
||||
return rclp->tail;
|
||||
}
|
||||
|
||||
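/*
 * Editorial sketch (not part of this header): rcu_cblist has no enqueue
 * helper here, but a hypothetical one shows why ->tail points at ->head
 * when the list is empty -- the same two stores work in every case.
 */
static inline void rcu_cblist_example_enqueue(struct rcu_cblist *rclp,
					      struct rcu_head *rhp, bool lazy)
{
	rhp->next = NULL;
	*rclp->tail = rhp;	 /* Link after the last callback (or at ->head). */
	rclp->tail = &rhp->next; /* The new callback is now the last one. */
	rclp->len++;
	if (lazy)
		rclp->len_lazy++;
}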
/* Complicated segmented callback lists. ;-) */
|
||||
|
||||
/*
|
||||
* Index values for segments in rcu_segcblist structure.
|
||||
*
|
||||
* The segments are as follows:
|
||||
*
|
||||
* [head, *tails[RCU_DONE_TAIL]):
|
||||
* Callbacks whose grace period has elapsed, and thus can be invoked.
|
||||
* [*tails[RCU_DONE_TAIL], *tails[RCU_WAIT_TAIL]):
|
||||
* Callbacks waiting for the current GP from the current CPU's viewpoint.
|
||||
* [*tails[RCU_WAIT_TAIL], *tails[RCU_NEXT_READY_TAIL]):
|
||||
* Callbacks that arrived before the next GP started, again from
|
||||
* the current CPU's viewpoint. These can be handled by the next GP.
|
||||
* [*tails[RCU_NEXT_READY_TAIL], *tails[RCU_NEXT_TAIL]):
|
||||
* Callbacks that might have arrived after the next GP started.
|
||||
* There is some uncertainty as to when a given GP starts and
|
||||
* ends, but a CPU knows the exact times if it is the one starting
|
||||
* or ending the GP. Other CPUs know that the previous GP ends
|
||||
* before the next one starts.
|
||||
*
|
||||
* Note that RCU_WAIT_TAIL cannot be empty unless RCU_NEXT_READY_TAIL is also
|
||||
* empty.
|
||||
*
|
||||
* The ->gp_seq[] array contains the grace-period number at which the
|
||||
* corresponding segment of callbacks will be ready to invoke. A given
|
||||
* element of this array is meaningful only when the corresponding segment
|
||||
* is non-empty, and it is never valid for RCU_DONE_TAIL (whose callbacks
|
||||
* are already ready to invoke) or for RCU_NEXT_TAIL (whose callbacks have
|
||||
* not yet been assigned a grace-period number).
|
||||
*/
|
||||
#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
|
||||
#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
|
||||
#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */
|
||||
#define RCU_NEXT_TAIL 3
|
||||
#define RCU_CBLIST_NSEGS 4
|
||||
|
||||
struct rcu_segcblist {
|
||||
struct rcu_head *head;
|
||||
struct rcu_head **tails[RCU_CBLIST_NSEGS];
|
||||
unsigned long gp_seq[RCU_CBLIST_NSEGS];
|
||||
long len;
|
||||
long len_lazy;
|
||||
};
|
||||
|
||||
#define RCU_SEGCBLIST_INITIALIZER(n) \
|
||||
{ \
|
||||
.head = NULL, \
|
||||
.tails[RCU_DONE_TAIL] = &n.head, \
|
||||
.tails[RCU_WAIT_TAIL] = &n.head, \
|
||||
.tails[RCU_NEXT_READY_TAIL] = &n.head, \
|
||||
.tails[RCU_NEXT_TAIL] = &n.head, \
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize an rcu_segcblist structure.
|
||||
*/
|
||||
static inline void rcu_segcblist_init(struct rcu_segcblist *rsclp)
|
||||
{
|
||||
int i;
|
||||
|
||||
BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq));
|
||||
BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq));
|
||||
rsclp->head = NULL;
|
||||
for (i = 0; i < RCU_CBLIST_NSEGS; i++)
|
||||
rsclp->tails[i] = &rsclp->head;
|
||||
rsclp->len = 0;
|
||||
rsclp->len_lazy = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Is the specified rcu_segcblist structure empty?
|
||||
*
|
||||
* But careful! The fact that the ->head field is NULL does not
|
||||
* necessarily imply that there are no callbacks associated with
|
||||
* this structure. When callbacks are being invoked, they are
|
||||
* removed as a group. If callback invocation must be preempted,
|
||||
* the remaining callbacks will be added back to the list. Either
|
||||
* way, the counts are updated later.
|
||||
*
|
||||
* So it is often the case that rcu_segcblist_n_cbs() should be used
|
||||
* instead.
|
||||
*/
|
||||
static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp)
|
||||
{
|
||||
return !rsclp->head;
|
||||
}
|
||||
|
||||
/* Return number of callbacks in segmented callback list. */
|
||||
static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
|
||||
{
|
||||
return READ_ONCE(rsclp->len);
|
||||
}
|
||||
|
||||
/* Return number of lazy callbacks in segmented callback list. */
|
||||
static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp)
|
||||
{
|
||||
return rsclp->len_lazy;
|
||||
}
|
||||
|
||||
/* Return number of non-lazy callbacks in segmented callback list. */
|
||||
static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp)
|
||||
{
|
||||
return rsclp->len - rsclp->len_lazy;
|
||||
}
|
||||
|
||||
/*
|
||||
* Is the specified rcu_segcblist enabled, for example, not corresponding
|
||||
* to an offline or callback-offloaded CPU?
|
||||
*/
|
||||
static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
|
||||
{
|
||||
return !!rsclp->tails[RCU_NEXT_TAIL];
|
||||
}
|
||||
|
||||
/*
|
||||
* Disable the specified rcu_segcblist structure, so that callbacks can
|
||||
* no longer be posted to it. This structure must be empty.
|
||||
*/
|
||||
static inline void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
|
||||
{
|
||||
WARN_ON_ONCE(!rcu_segcblist_empty(rsclp));
|
||||
WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp));
|
||||
WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp));
|
||||
rsclp->tails[RCU_NEXT_TAIL] = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Is the specified segment of the specified rcu_segcblist structure
|
||||
* empty of callbacks?
|
||||
*/
|
||||
static inline bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg)
|
||||
{
|
||||
if (seg == RCU_DONE_TAIL)
|
||||
return &rsclp->head == rsclp->tails[RCU_DONE_TAIL];
|
||||
return rsclp->tails[seg - 1] == rsclp->tails[seg];
|
||||
}
|
||||
|
||||
/*
|
||||
* Are all segments following the specified segment of the specified
|
||||
* rcu_segcblist structure empty of callbacks? (The specified
|
||||
* segment might well contain callbacks.)
|
||||
*/
|
||||
static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg)
|
||||
{
|
||||
return !*rsclp->tails[seg];
|
||||
}
|
||||
|
||||
/*
|
||||
* Does the specified rcu_segcblist structure contain callbacks that
|
||||
* are ready to be invoked?
|
||||
*/
|
||||
static inline bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp)
|
||||
{
|
||||
return rcu_segcblist_is_enabled(rsclp) &&
|
||||
&rsclp->head != rsclp->tails[RCU_DONE_TAIL];
|
||||
}
|
||||
|
||||
/*
|
||||
* Does the specified rcu_segcblist structure contain callbacks that
|
||||
* are still pending, that is, not yet ready to be invoked?
|
||||
*/
|
||||
static inline bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp)
|
||||
{
|
||||
return rcu_segcblist_is_enabled(rsclp) &&
|
||||
!rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Dequeue and return the first ready-to-invoke callback. If there
|
||||
* are no ready-to-invoke callbacks, return NULL. Disables interrupts
|
||||
* to avoid interference. Does not protect from interference from other
|
||||
* CPUs or tasks.
|
||||
*/
|
||||
static inline struct rcu_head *
|
||||
rcu_segcblist_dequeue(struct rcu_segcblist *rsclp)
|
||||
{
|
||||
unsigned long flags;
|
||||
int i;
|
||||
struct rcu_head *rhp;
|
||||
|
||||
local_irq_save(flags);
|
||||
if (!rcu_segcblist_ready_cbs(rsclp)) {
|
||||
local_irq_restore(flags);
|
||||
return NULL;
|
||||
}
|
||||
rhp = rsclp->head;
|
||||
BUG_ON(!rhp);
|
||||
rsclp->head = rhp->next;
|
||||
for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) {
|
||||
if (rsclp->tails[i] != &rhp->next)
|
||||
break;
|
||||
rsclp->tails[i] = &rsclp->head;
|
||||
}
|
||||
smp_mb(); /* Dequeue before decrement for rcu_barrier(). */
|
||||
WRITE_ONCE(rsclp->len, rsclp->len - 1);
|
||||
local_irq_restore(flags);
|
||||
return rhp;
|
||||
}
|
||||
|
||||
/*
|
||||
* Account for the fact that a previously dequeued callback turned out
|
||||
* to be marked as lazy.
|
||||
*/
|
||||
static inline void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
local_irq_save(flags);
|
||||
rsclp->len_lazy--;
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return a pointer to the first callback in the specified rcu_segcblist
|
||||
* structure. This is useful for diagnostics.
|
||||
*/
|
||||
static inline struct rcu_head *
|
||||
rcu_segcblist_first_cb(struct rcu_segcblist *rsclp)
|
||||
{
|
||||
if (rcu_segcblist_is_enabled(rsclp))
|
||||
return rsclp->head;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return a pointer to the first pending callback in the specified
|
||||
* rcu_segcblist structure. This is useful just after posting a given
|
||||
* callback -- if that callback is the first pending callback, then
|
||||
* you cannot rely on someone else having already started up the required
|
||||
* grace period.
|
||||
*/
|
||||
static inline struct rcu_head *
|
||||
rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
|
||||
{
|
||||
if (rcu_segcblist_is_enabled(rsclp))
|
||||
return *rsclp->tails[RCU_DONE_TAIL];
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Does the specified rcu_segcblist structure contain callbacks that
|
||||
* have not yet been processed beyond having been posted, that is,
|
||||
* does it contain callbacks in its last segment?
|
||||
*/
|
||||
static inline bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp)
|
||||
{
|
||||
return rcu_segcblist_is_enabled(rsclp) &&
|
||||
!rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Enqueue the specified callback onto the specified rcu_segcblist
|
||||
* structure, updating accounting as needed. Note that the ->len
|
||||
* field may be accessed locklessly, hence the WRITE_ONCE().
|
||||
* The ->len field is used by rcu_barrier() and friends to determine
|
||||
* if it must post a callback on this structure, and it is OK
|
||||
* for rcu_barrier() to sometimes post callbacks needlessly, but
|
||||
* absolutely not OK for it to ever miss posting a callback.
|
||||
*/
|
||||
static inline void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
|
||||
struct rcu_head *rhp, bool lazy)
|
||||
{
|
||||
WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */
|
||||
if (lazy)
|
||||
rsclp->len_lazy++;
|
||||
smp_mb(); /* Ensure counts are updated before callback is enqueued. */
|
||||
rhp->next = NULL;
|
||||
*rsclp->tails[RCU_NEXT_TAIL] = rhp;
|
||||
rsclp->tails[RCU_NEXT_TAIL] = &rhp->next;
|
||||
}
|
||||
|
||||
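/*
 * Editorial sketch (not part of this header): on a freshly initialized
 * list every ->tails[] element points at ->head, so the first enqueue
 * lands in the RCU_NEXT_TAIL segment and leaves the DONE, WAIT, and
 * NEXT_READY segments empty until a grace period is assigned.
 */
static inline void rcu_segcblist_example_first_post(struct rcu_segcblist *rsclp,
						    struct rcu_head *rhp)
{
	rcu_segcblist_init(rsclp);
	rcu_segcblist_enqueue(rsclp, rhp, false);
	WARN_ON_ONCE(rcu_segcblist_ready_cbs(rsclp));	/* Nothing is done yet. */
	WARN_ON_ONCE(!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL));
	WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp) != 1);
}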
/*
|
||||
* Entrain the specified callback onto the specified rcu_segcblist at
|
||||
* the end of the last non-empty segment. If the entire rcu_segcblist
|
||||
* is empty, make no change, but return false.
|
||||
*
|
||||
* This is intended for use by rcu_barrier()-like primitives, -not-
|
||||
* for normal grace-period use. IMPORTANT: The callback you enqueue
|
||||
* will wait for all prior callbacks, NOT necessarily for a grace
|
||||
* period. You have been warned.
|
||||
*/
|
||||
static inline bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
|
||||
struct rcu_head *rhp, bool lazy)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (rcu_segcblist_n_cbs(rsclp) == 0)
|
||||
return false;
|
||||
WRITE_ONCE(rsclp->len, rsclp->len + 1);
|
||||
if (lazy)
|
||||
rsclp->len_lazy++;
|
||||
smp_mb(); /* Ensure counts are updated before callback is entrained. */
|
||||
rhp->next = NULL;
|
||||
for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--)
|
||||
if (rsclp->tails[i] != rsclp->tails[i - 1])
|
||||
break;
|
||||
*rsclp->tails[i] = rhp;
|
||||
for (; i <= RCU_NEXT_TAIL; i++)
|
||||
rsclp->tails[i] = &rhp->next;
|
||||
return true;
|
||||
}
|
||||
|
||||
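/*
 * Editorial sketch (not part of this header): an rcu_barrier()-style
 * flush would entrain its callback behind everything already queued,
 * falling back to invoking it directly when the list is empty, since
 * an empty list has nothing to wait for.
 */
static inline void rcu_segcblist_example_flush(struct rcu_segcblist *rsclp,
					       struct rcu_head *rhp,
					       rcu_callback_t func)
{
	rhp->func = func;
	if (!rcu_segcblist_entrain(rsclp, rhp, false))
		func(rhp);	/* Empty list: no prior callbacks to wait for. */
}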
/*
|
||||
* Extract only the counts from the specified rcu_segcblist structure,
|
||||
* and place them in the specified rcu_cblist structure. This function
|
||||
* supports both callback orphaning and invocation, hence the separation
|
||||
* of counts and callbacks. (Callbacks ready for invocation must be
|
||||
* orphaned and adopted separately from pending callbacks, but counts
|
||||
* apply to all callbacks. Locking must be used to make sure that
|
||||
* both orphaned-callbacks lists are consistent.)
|
||||
*/
|
||||
static inline void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
|
||||
struct rcu_cblist *rclp)
|
||||
{
|
||||
rclp->len_lazy += rsclp->len_lazy;
|
||||
rclp->len += rsclp->len;
|
||||
rsclp->len_lazy = 0;
|
||||
WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */
|
||||
}
|
||||
|
||||
/*
|
||||
* Extract only those callbacks ready to be invoked from the specified
|
||||
* rcu_segcblist structure and place them in the specified rcu_cblist
|
||||
* structure.
|
||||
*/
|
||||
static inline void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
|
||||
struct rcu_cblist *rclp)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (!rcu_segcblist_ready_cbs(rsclp))
|
||||
return; /* Nothing to do. */
|
||||
*rclp->tail = rsclp->head;
|
||||
rsclp->head = *rsclp->tails[RCU_DONE_TAIL];
|
||||
*rsclp->tails[RCU_DONE_TAIL] = NULL;
|
||||
rclp->tail = rsclp->tails[RCU_DONE_TAIL];
|
||||
for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--)
|
||||
if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL])
|
||||
rsclp->tails[i] = &rsclp->head;
|
||||
}
|
||||
|
||||
/*
|
||||
* Extract only those callbacks still pending (not yet ready to be
|
||||
* invoked) from the specified rcu_segcblist structure and place them in
|
||||
* the specified rcu_cblist structure. Note that this loses information
|
||||
* about any callbacks that might have been partway done waiting for
|
||||
* their grace period. Too bad! They will have to start over.
|
||||
*/
|
||||
static inline void
|
||||
rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
|
||||
struct rcu_cblist *rclp)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (!rcu_segcblist_pend_cbs(rsclp))
|
||||
return; /* Nothing to do. */
|
||||
*rclp->tail = *rsclp->tails[RCU_DONE_TAIL];
|
||||
rclp->tail = rsclp->tails[RCU_NEXT_TAIL];
|
||||
*rsclp->tails[RCU_DONE_TAIL] = NULL;
|
||||
for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++)
|
||||
rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL];
|
||||
}
|
||||
|
||||
/*
|
||||
* Move the entire contents of the specified rcu_segcblist structure,
|
||||
* counts, callbacks, and all, to the specified rcu_cblist structure.
|
||||
* @@@ Why do we need this??? Moving early-boot CBs to NOCB lists?
|
||||
* @@@ Memory barrier needed? (Not if only used at boot time...)
|
||||
*/
|
||||
static inline void rcu_segcblist_extract_all(struct rcu_segcblist *rsclp,
|
||||
struct rcu_cblist *rclp)
|
||||
{
|
||||
rcu_segcblist_extract_done_cbs(rsclp, rclp);
|
||||
rcu_segcblist_extract_pend_cbs(rsclp, rclp);
|
||||
rcu_segcblist_extract_count(rsclp, rclp);
|
||||
}
|
||||
|
||||
/*
|
||||
* Insert counts from the specified rcu_cblist structure in the
|
||||
* specified rcu_segcblist structure.
|
||||
*/
|
||||
static inline void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
|
||||
struct rcu_cblist *rclp)
|
||||
{
|
||||
rsclp->len_lazy += rclp->len_lazy;
|
||||
/* ->len sampled locklessly. */
|
||||
WRITE_ONCE(rsclp->len, rsclp->len + rclp->len);
|
||||
rclp->len_lazy = 0;
|
||||
rclp->len = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Move callbacks from the specified rcu_cblist to the beginning of the
|
||||
* done-callbacks segment of the specified rcu_segcblist.
|
||||
*/
|
||||
static inline void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
|
||||
struct rcu_cblist *rclp)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (!rclp->head)
|
||||
return; /* No callbacks to move. */
|
||||
*rclp->tail = rsclp->head;
|
||||
rsclp->head = rclp->head;
|
||||
for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
|
||||
if (&rsclp->head == rsclp->tails[i])
|
||||
rsclp->tails[i] = rclp->tail;
|
||||
else
|
||||
break;
|
||||
rclp->head = NULL;
|
||||
rclp->tail = &rclp->head;
|
||||
}
|
||||
|
||||
/*
|
||||
* Move callbacks from the specified rcu_cblist to the end of the
|
||||
* new-callbacks segment of the specified rcu_segcblist.
|
||||
*/
|
||||
static inline void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
|
||||
struct rcu_cblist *rclp)
|
||||
{
|
||||
if (!rclp->head)
|
||||
return; /* Nothing to do. */
|
||||
*rsclp->tails[RCU_NEXT_TAIL] = rclp->head;
|
||||
rsclp->tails[RCU_NEXT_TAIL] = rclp->tail;
|
||||
rclp->head = NULL;
|
||||
rclp->tail = &rclp->head;
|
||||
}
|
||||
|
||||
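/*
 * Editorial sketch (not part of this header): the extract/insert helpers
 * above combine into a full callback migration, as when orphaning
 * callbacks from an outgoing CPU. The caller is assumed to hold whatever
 * lock serializes access to both lists.
 */
static inline void rcu_segcblist_example_migrate(struct rcu_segcblist *src,
						 struct rcu_segcblist *dst)
{
	struct rcu_cblist donecbs, pendcbs;

	rcu_cblist_init(&donecbs);
	rcu_cblist_init(&pendcbs);
	rcu_segcblist_extract_count(src, &donecbs);	/* Counts travel with donecbs. */
	rcu_segcblist_extract_done_cbs(src, &donecbs);
	rcu_segcblist_extract_pend_cbs(src, &pendcbs);
	rcu_segcblist_insert_count(dst, &donecbs);
	rcu_segcblist_insert_done_cbs(dst, &donecbs);
	rcu_segcblist_insert_pend_cbs(dst, &pendcbs);
}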
/*
|
||||
* Advance the callbacks in the specified rcu_segcblist structure based
|
||||
* on the current value passed in for the grace-period counter.
|
||||
*/
|
||||
static inline void rcu_segcblist_advance(struct rcu_segcblist *rsclp,
|
||||
unsigned long seq)
|
||||
{
|
||||
int i, j;
|
||||
|
||||
WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
|
||||
if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Find all callbacks whose ->gp_seq numbers indicate that they
|
||||
* are ready to invoke, and put them into the RCU_DONE_TAIL segment.
|
||||
*/
|
||||
for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
|
||||
if (ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
|
||||
break;
|
||||
rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i];
|
||||
}
|
||||
|
||||
/* If no callbacks moved, nothing more need be done. */
|
||||
if (i == RCU_WAIT_TAIL)
|
||||
return;
|
||||
|
||||
/* Clean up tail pointers that might have been misordered above. */
|
||||
for (j = RCU_WAIT_TAIL; j < i; j++)
|
||||
rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL];
|
||||
|
||||
/*
|
||||
* Callbacks moved, so clean up the misordered ->tails[] pointers
|
||||
* that now point into the middle of the list of ready-to-invoke
|
||||
* callbacks. The overall effect is to copy down the later pointers
|
||||
* into the gap that was created by the now-ready segments.
|
||||
*/
|
||||
for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
|
||||
if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL])
|
||||
break; /* No more callbacks. */
|
||||
rsclp->tails[j] = rsclp->tails[i];
|
||||
rsclp->gp_seq[j] = rsclp->gp_seq[i];
|
||||
}
|
||||
}
|
||||
|
||||
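/*
 * Editorial sketch (not part of this header): a grace-period-end handler
 * would first advance callbacks past the just-completed sequence number
 * and then invoke whatever became ready.
 */
static inline void rcu_segcblist_example_gp_end(struct rcu_segcblist *rsclp,
						unsigned long completed_seq)
{
	struct rcu_head *rhp;

	rcu_segcblist_advance(rsclp, completed_seq);
	while ((rhp = rcu_segcblist_dequeue(rsclp)) != NULL)
		rhp->func(rhp);		/* Invoke each now-ready callback. */
}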
/*
|
||||
* "Accelerate" callbacks based on more-accurate grace-period information.
|
||||
* The reason for this is that RCU does not synchronize the beginnings and
|
||||
* ends of grace periods, and that callbacks are posted locally. This in
|
||||
* turn means that the callbacks must be labelled conservatively early
|
||||
* on, as getting exact information would degrade both performance and
|
||||
* scalability. When more accurate grace-period information becomes
|
||||
* available, previously posted callbacks can be "accelerated", marking
|
||||
* them to complete at the end of the earlier grace period.
|
||||
*
|
||||
* This function operates on an rcu_segcblist structure, and also the
|
||||
* grace-period sequence number seq at which new callbacks would become
|
||||
* ready to invoke. Returns true if there are callbacks that won't be
|
||||
* ready to invoke until seq, false otherwise.
|
||||
*/
|
||||
static inline bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp,
|
||||
unsigned long seq)
|
||||
{
|
||||
int i;
|
||||
|
||||
WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
|
||||
if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
|
||||
return false;
|
||||
|
||||
/*
|
||||
* Find the segment preceding the oldest segment of callbacks
|
||||
* whose ->gp_seq[] completion is at or after that passed in via
|
||||
* "seq", skipping any empty segments. This oldest segment, along
|
||||
* with any later segments, can be merged in with any newly arrived
|
||||
* callbacks in the RCU_NEXT_TAIL segment, and assigned "seq"
|
||||
* as their ->gp_seq[] grace-period completion sequence number.
|
||||
*/
|
||||
for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--)
|
||||
if (rsclp->tails[i] != rsclp->tails[i - 1] &&
|
||||
ULONG_CMP_LT(rsclp->gp_seq[i], seq))
|
||||
break;
|
||||
|
||||
/*
|
||||
* If all the segments contain callbacks that correspond to
|
||||
* earlier grace-period sequence numbers than "seq", leave.
|
||||
* Assuming that the rcu_segcblist structure has enough
|
||||
* segments in its arrays, this can only happen if some of
|
||||
* the non-done segments contain callbacks that really are
|
||||
* ready to invoke. This situation will get straightened
|
||||
* out by the next call to rcu_segcblist_advance().
|
||||
*
|
||||
* Also advance to the oldest segment of callbacks whose
|
||||
* ->gp_seq[] completion is at or after that passed in via "seq",
|
||||
* skipping any empty segments.
|
||||
*/
|
||||
if (++i >= RCU_NEXT_TAIL)
|
||||
return false;
|
||||
|
||||
/*
|
||||
* Merge all later callbacks, including newly arrived callbacks,
|
||||
* into the segment located by the for-loop above. Assign "seq"
|
||||
* as the ->gp_seq[] value in order to correctly handle the case
|
||||
* where there were no pending callbacks in the rcu_segcblist
|
||||
* structure other than in the RCU_NEXT_TAIL segment.
|
||||
*/
|
||||
for (; i < RCU_NEXT_TAIL; i++) {
|
||||
rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL];
|
||||
rsclp->gp_seq[i] = seq;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
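/*
 * Editorial sketch (not part of this header): after enqueuing, a caller
 * would tag new callbacks with the grace period that will satisfy them,
 * and request a new grace period only when acceleration reports that one
 * is actually needed.
 */
static inline bool rcu_segcblist_example_post(struct rcu_segcblist *rsclp,
					      struct rcu_head *rhp,
					      unsigned long next_gp_seq)
{
	rcu_segcblist_enqueue(rsclp, rhp, false);
	/* True: a grace period ending at next_gp_seq must be started. */
	return rcu_segcblist_accelerate(rsclp, next_gp_seq);
}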
/*
|
||||
* Scan the specified rcu_segcblist structure for callbacks that need
|
||||
* a grace period later than the one specified by "seq". We don't look
|
||||
* at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't
|
||||
* have a grace-period sequence number.
|
||||
*/
|
||||
static inline bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
|
||||
unsigned long seq)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
|
||||
if (rsclp->tails[i - 1] != rsclp->tails[i] &&
|
||||
ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Interim function to return rcu_segcblist head pointer. Longer term, the
|
||||
* rcu_segcblist will be used more pervasively, removing the need for this
|
||||
* function.
|
||||
*/
|
||||
static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp)
|
||||
{
|
||||
return rsclp->head;
|
||||
}
|
||||
|
||||
/*
|
||||
* Interim function to return rcu_segcblist head pointer. Longer term, the
|
||||
* rcu_segcblist will be used more pervasively, removing the need for this
|
||||
* function.
|
||||
*/
|
||||
static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp)
|
||||
{
|
||||
WARN_ON_ONCE(rcu_segcblist_empty(rsclp));
|
||||
return rsclp->tails[RCU_NEXT_TAIL];
|
||||
}
|
||||
|
||||
#endif /* __KERNEL_RCU_SEGCBLIST_H */
|
@ -509,7 +509,8 @@ static inline void hlist_add_tail_rcu(struct hlist_node *n,
|
||||
{
|
||||
struct hlist_node *i, *last = NULL;
|
||||
|
||||
for (i = hlist_first_rcu(h); i; i = hlist_next_rcu(i))
|
||||
/* Note: write side code, so rcu accessors are not needed. */
|
||||
for (i = h->first; i; i = i->next)
|
||||
last = i;
|
||||
|
||||
if (last) {
|
||||
|
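The change above only touches the writer-side walk used to find the tail; callers are unchanged: additions happen under the writer's lock, lookups under rcu_read_lock(). A hedged sketch with a hypothetical item list:

struct item {
	int key;
	struct hlist_node node;
};

static HLIST_HEAD(items);
static DEFINE_SPINLOCK(items_lock);

static void item_add(struct item *it)
{
	spin_lock(&items_lock);		/* Serializes writers; readers take no lock. */
	hlist_add_tail_rcu(&it->node, &items);
	spin_unlock(&items_lock);
}

static bool item_present(int key)
{
	struct item *it;
	bool found = false;

	rcu_read_lock();
	hlist_for_each_entry_rcu(it, &items, node)
		if (it->key == key) {
			found = true;
			break;
		}
	rcu_read_unlock();
	return found;
}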
@ -363,15 +363,20 @@ static inline void rcu_init_nohz(void)
|
||||
#ifdef CONFIG_TASKS_RCU
|
||||
#define TASKS_RCU(x) x
|
||||
extern struct srcu_struct tasks_rcu_exit_srcu;
|
||||
#define rcu_note_voluntary_context_switch(t) \
|
||||
#define rcu_note_voluntary_context_switch_lite(t) \
|
||||
do { \
|
||||
rcu_all_qs(); \
|
||||
if (READ_ONCE((t)->rcu_tasks_holdout)) \
|
||||
WRITE_ONCE((t)->rcu_tasks_holdout, false); \
|
||||
} while (0)
|
||||
#define rcu_note_voluntary_context_switch(t) \
|
||||
do { \
|
||||
rcu_all_qs(); \
|
||||
rcu_note_voluntary_context_switch_lite(t); \
|
||||
} while (0)
|
||||
#else /* #ifdef CONFIG_TASKS_RCU */
|
||||
#define TASKS_RCU(x) do { } while (0)
|
||||
#define rcu_note_voluntary_context_switch(t) rcu_all_qs()
|
||||
#define rcu_note_voluntary_context_switch_lite(t) do { } while (0)
|
||||
#define rcu_note_voluntary_context_switch(t) rcu_all_qs()
|
||||
#endif /* #else #ifdef CONFIG_TASKS_RCU */
|
||||
|
||||
/**
|
||||
@ -1127,11 +1132,11 @@ do { \
|
||||
* if the UNLOCK and LOCK are executed by the same CPU or if the
|
||||
* UNLOCK and LOCK operate on the same lock variable.
|
||||
*/
|
||||
#ifdef CONFIG_PPC
|
||||
#ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE
|
||||
#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */
|
||||
#else /* #ifdef CONFIG_PPC */
|
||||
#else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
|
||||
#define smp_mb__after_unlock_lock() do { } while (0)
|
||||
#endif /* #else #ifdef CONFIG_PPC */
|
||||
#endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
|
||||
|
||||
|
||||
#endif /* __LINUX_RCUPDATE_H */
|
||||
|
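The renamed Kconfig symbol generalizes what was a PowerPC-only override: on architectures declaring CONFIG_ARCH_WEAK_RELEASE_ACQUIRE, an UNLOCK on one CPU followed by a LOCK on another is not by itself a full barrier, so smp_mb__after_unlock_lock() is placed immediately after the LOCK to upgrade the pair. A hedged sketch of the idiom (lock and variables hypothetical):

static DEFINE_SPINLOCK(state_lock);
static int state_a, state_b;

static void state_update(void)
{
	spin_lock(&state_lock);
	smp_mb__after_unlock_lock();	/* The previous holder's UNLOCK plus this  */
					/* LOCK now order like a full smp_mb().    */
	state_a = 1;
	state_b = 1;
	spin_unlock(&state_lock);
}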
@ -33,6 +33,11 @@ static inline int rcu_dynticks_snap(struct rcu_dynticks *rdtp)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline bool rcu_eqs_special_set(int cpu)
|
||||
{
|
||||
return false; /* Never flag non-existent other CPUs! */
|
||||
}
|
||||
|
||||
static inline unsigned long get_state_synchronize_rcu(void)
|
||||
{
|
||||
return 0;
|
||||
@ -87,10 +92,11 @@ static inline void kfree_call_rcu(struct rcu_head *head,
|
||||
call_rcu(head, func);
|
||||
}
|
||||
|
||||
static inline void rcu_note_context_switch(void)
|
||||
{
|
||||
rcu_sched_qs();
|
||||
}
|
||||
#define rcu_note_context_switch(preempt) \
|
||||
do { \
|
||||
rcu_sched_qs(); \
|
||||
rcu_note_voluntary_context_switch_lite(current); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
* Take advantage of the fact that there is only one CPU, which
|
||||
@ -212,14 +218,14 @@ static inline void exit_rcu(void)
|
||||
{
|
||||
}
|
||||
|
||||
#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
||||
#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU)
|
||||
extern int rcu_scheduler_active __read_mostly;
|
||||
void rcu_scheduler_starting(void);
|
||||
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
|
||||
#else /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
|
||||
static inline void rcu_scheduler_starting(void)
|
||||
{
|
||||
}
|
||||
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
|
||||
#endif /* #else #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
|
||||
|
||||
#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
|
||||
|
||||
@ -237,6 +243,10 @@ static inline bool rcu_is_watching(void)
|
||||
|
||||
#endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
|
||||
|
||||
static inline void rcu_request_urgent_qs_task(struct task_struct *t)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void rcu_all_qs(void)
|
||||
{
|
||||
barrier(); /* Avoid RCU read-side critical sections leaking across. */
|
||||
|
@ -30,7 +30,7 @@
|
||||
#ifndef __LINUX_RCUTREE_H
|
||||
#define __LINUX_RCUTREE_H
|
||||
|
||||
void rcu_note_context_switch(void);
|
||||
void rcu_note_context_switch(bool preempt);
|
||||
int rcu_needs_cpu(u64 basem, u64 *nextevt);
|
||||
void rcu_cpu_stall_reset(void);
|
||||
|
||||
@ -41,7 +41,7 @@ void rcu_cpu_stall_reset(void);
|
||||
*/
|
||||
static inline void rcu_virt_note_context_switch(int cpu)
|
||||
{
|
||||
rcu_note_context_switch();
|
||||
rcu_note_context_switch(false);
|
||||
}
|
||||
|
||||
void synchronize_rcu_bh(void);
|
||||
@ -108,6 +108,7 @@ void rcu_scheduler_starting(void);
|
||||
extern int rcu_scheduler_active __read_mostly;
|
||||
|
||||
bool rcu_is_watching(void);
|
||||
void rcu_request_urgent_qs_task(struct task_struct *t);
|
||||
|
||||
void rcu_all_qs(void);
|
||||
|
||||
|
@ -28,7 +28,7 @@
|
||||
#define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */
|
||||
#define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */
|
||||
/*
|
||||
* SLAB_DESTROY_BY_RCU - **WARNING** READ THIS!
|
||||
* SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS!
|
||||
*
|
||||
* This delays freeing the SLAB page by a grace period, it does _NOT_
|
||||
* delay object freeing. This means that if you do kmem_cache_free()
|
||||
@ -61,8 +61,10 @@
|
||||
*
|
||||
* rcu_read_lock before reading the address, then rcu_read_unlock after
|
||||
* taking the spinlock within the structure expected at that address.
|
||||
*
|
||||
* Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU.
|
||||
*/
|
||||
#define SLAB_DESTROY_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */
|
||||
#define SLAB_TYPESAFE_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */
|
||||
#define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */
|
||||
#define SLAB_TRACE 0x00200000UL /* Trace allocations and frees */
|
||||
|
||||
|
@ -22,7 +22,7 @@
|
||||
* Lai Jiangshan <laijs@cn.fujitsu.com>
|
||||
*
|
||||
* For detailed explanation of Read-Copy Update mechanism see -
|
||||
* Documentation/RCU/ *.txt
|
||||
* Documentation/RCU/ *.txt
|
||||
*
|
||||
*/
|
||||
|
||||
@ -32,35 +32,9 @@
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/rcu_segcblist.h>
|
||||
|
||||
struct srcu_array {
|
||||
unsigned long lock_count[2];
|
||||
unsigned long unlock_count[2];
|
||||
};
|
||||
|
||||
struct rcu_batch {
|
||||
struct rcu_head *head, **tail;
|
||||
};
|
||||
|
||||
#define RCU_BATCH_INIT(name) { NULL, &(name.head) }
|
||||
|
||||
struct srcu_struct {
|
||||
unsigned long completed;
|
||||
struct srcu_array __percpu *per_cpu_ref;
|
||||
spinlock_t queue_lock; /* protect ->batch_queue, ->running */
|
||||
bool running;
|
||||
/* callbacks just queued */
|
||||
struct rcu_batch batch_queue;
|
||||
/* callbacks try to do the first check_zero */
|
||||
struct rcu_batch batch_check0;
|
||||
/* callbacks done with the first check_zero and the flip */
|
||||
struct rcu_batch batch_check1;
|
||||
struct rcu_batch batch_done;
|
||||
struct delayed_work work;
|
||||
#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
||||
struct lockdep_map dep_map;
|
||||
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
|
||||
};
|
||||
struct srcu_struct;
|
||||
|
||||
#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
||||
|
||||
@ -82,46 +56,15 @@ int init_srcu_struct(struct srcu_struct *sp);
|
||||
#define __SRCU_DEP_MAP_INIT(srcu_name)
|
||||
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
|
||||
|
||||
void process_srcu(struct work_struct *work);
|
||||
|
||||
#define __SRCU_STRUCT_INIT(name) \
|
||||
{ \
|
||||
.completed = -300, \
|
||||
.per_cpu_ref = &name##_srcu_array, \
|
||||
.queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \
|
||||
.running = false, \
|
||||
.batch_queue = RCU_BATCH_INIT(name.batch_queue), \
|
||||
.batch_check0 = RCU_BATCH_INIT(name.batch_check0), \
|
||||
.batch_check1 = RCU_BATCH_INIT(name.batch_check1), \
|
||||
.batch_done = RCU_BATCH_INIT(name.batch_done), \
|
||||
.work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\
|
||||
__SRCU_DEP_MAP_INIT(name) \
|
||||
}
|
||||
|
||||
/*
|
||||
* Define and initialize a srcu struct at build time.
|
||||
* Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it.
|
||||
*
|
||||
* Note that although DEFINE_STATIC_SRCU() hides the name from other
|
||||
* files, the per-CPU variable rules nevertheless require that the
|
||||
* chosen name be globally unique. These rules also prohibit use of
|
||||
* DEFINE_STATIC_SRCU() within a function. If these rules are too
|
||||
* restrictive, declare the srcu_struct manually. For example, in
|
||||
* each file:
|
||||
*
|
||||
* static struct srcu_struct my_srcu;
|
||||
*
|
||||
* Then, before the first use of each my_srcu, manually initialize it:
|
||||
*
|
||||
* init_srcu_struct(&my_srcu);
|
||||
*
|
||||
* See include/linux/percpu-defs.h for the rules on per-CPU variables.
|
||||
*/
|
||||
#define __DEFINE_SRCU(name, is_static) \
|
||||
static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\
|
||||
is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
|
||||
#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */)
|
||||
#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static)
|
||||
#ifdef CONFIG_TINY_SRCU
|
||||
#include <linux/srcutiny.h>
|
||||
#elif defined(CONFIG_TREE_SRCU)
|
||||
#include <linux/srcutree.h>
|
||||
#elif defined(CONFIG_CLASSIC_SRCU)
|
||||
#include <linux/srcuclassic.h>
|
||||
#else
|
||||
#error "Unknown SRCU implementation specified to kernel configuration"
|
||||
#endif
|
||||
|
||||
/**
|
||||
* call_srcu() - Queue a callback for invocation after an SRCU grace period
|
||||
@ -147,9 +90,6 @@ void cleanup_srcu_struct(struct srcu_struct *sp);
|
||||
int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp);
|
||||
void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
|
||||
void synchronize_srcu(struct srcu_struct *sp);
|
||||
void synchronize_srcu_expedited(struct srcu_struct *sp);
|
||||
unsigned long srcu_batches_completed(struct srcu_struct *sp);
|
||||
void srcu_barrier(struct srcu_struct *sp);
|
||||
|
||||
#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
||||
|
||||
|
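Whichever of the three implementations the configuration selects, the declarations above provide the same reader and updater API. A minimal usage sketch, with my_srcu, my_data, and the update-side locking all hypothetical:

struct my_data {
	int val;
};

static struct my_data __rcu *my_ptr;
static DEFINE_MUTEX(my_lock);
DEFINE_STATIC_SRCU(my_srcu);

static int my_read(void)
{
	struct my_data *p;
	int idx, val = 0;

	idx = srcu_read_lock(&my_srcu);		/* Readers may block in here. */
	p = srcu_dereference(my_ptr, &my_srcu);
	if (p)
		val = p->val;
	srcu_read_unlock(&my_srcu, idx);
	return val;
}

static void my_update(struct my_data *newp)
{
	struct my_data *oldp;

	mutex_lock(&my_lock);
	oldp = rcu_dereference_protected(my_ptr, lockdep_is_held(&my_lock));
	rcu_assign_pointer(my_ptr, newp);
	mutex_unlock(&my_lock);
	synchronize_srcu(&my_srcu);		/* All pre-existing readers are done. */
	kfree(oldp);
}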
101
include/linux/srcuclassic.h
Normal file
@ -0,0 +1,101 @@
|
||||
/*
|
||||
* Sleepable Read-Copy Update mechanism for mutual exclusion,
|
||||
* classic v4.11 variant.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, you can access it online at
|
||||
* http://www.gnu.org/licenses/gpl-2.0.html.
|
||||
*
|
||||
* Copyright (C) IBM Corporation, 2017
|
||||
*
|
||||
* Author: Paul McKenney <paulmck@us.ibm.com>
|
||||
*/
|
||||
|
||||
#ifndef _LINUX_SRCU_CLASSIC_H
|
||||
#define _LINUX_SRCU_CLASSIC_H
|
||||
|
||||
struct srcu_array {
|
||||
unsigned long lock_count[2];
|
||||
unsigned long unlock_count[2];
|
||||
};
|
||||
|
||||
struct rcu_batch {
|
||||
struct rcu_head *head, **tail;
|
||||
};
|
||||
|
||||
#define RCU_BATCH_INIT(name) { NULL, &(name.head) }
|
||||
|
||||
struct srcu_struct {
|
||||
unsigned long completed;
|
||||
struct srcu_array __percpu *per_cpu_ref;
|
||||
spinlock_t queue_lock; /* protect ->batch_queue, ->running */
|
||||
bool running;
|
||||
/* callbacks just queued */
|
||||
struct rcu_batch batch_queue;
|
||||
/* callbacks try to do the first check_zero */
|
||||
struct rcu_batch batch_check0;
|
||||
/* callbacks done with the first check_zero and the flip */
|
||||
struct rcu_batch batch_check1;
|
||||
struct rcu_batch batch_done;
|
||||
struct delayed_work work;
|
||||
#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
||||
struct lockdep_map dep_map;
|
||||
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
|
||||
};
|
||||
|
||||
void process_srcu(struct work_struct *work);
|
||||
|
||||
#define __SRCU_STRUCT_INIT(name) \
|
||||
{ \
|
||||
.completed = -300, \
|
||||
.per_cpu_ref = &name##_srcu_array, \
|
||||
.queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \
|
||||
.running = false, \
|
||||
.batch_queue = RCU_BATCH_INIT(name.batch_queue), \
|
||||
.batch_check0 = RCU_BATCH_INIT(name.batch_check0), \
|
||||
.batch_check1 = RCU_BATCH_INIT(name.batch_check1), \
|
||||
.batch_done = RCU_BATCH_INIT(name.batch_done), \
|
||||
.work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\
|
||||
__SRCU_DEP_MAP_INIT(name) \
|
||||
}
|
||||
|
||||
/*
|
||||
* Define and initialize a srcu struct at build time.
|
||||
* Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it.
|
||||
*
|
||||
* Note that although DEFINE_STATIC_SRCU() hides the name from other
|
||||
* files, the per-CPU variable rules nevertheless require that the
|
||||
* chosen name be globally unique. These rules also prohibit use of
|
||||
* DEFINE_STATIC_SRCU() within a function. If these rules are too
|
||||
* restrictive, declare the srcu_struct manually. For example, in
|
||||
* each file:
|
||||
*
|
||||
* static struct srcu_struct my_srcu;
|
||||
*
|
||||
* Then, before the first use of each my_srcu, manually initialize it:
|
||||
*
|
||||
* init_srcu_struct(&my_srcu);
|
||||
*
|
||||
* See include/linux/percpu-defs.h for the rules on per-CPU variables.
|
||||
*/
|
||||
#define __DEFINE_SRCU(name, is_static) \
|
||||
static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\
|
||||
is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
|
||||
#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */)
|
||||
#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static)
|
||||
|
||||
void synchronize_srcu_expedited(struct srcu_struct *sp);
|
||||
void srcu_barrier(struct srcu_struct *sp);
|
||||
unsigned long srcu_batches_completed(struct srcu_struct *sp);
|
||||
|
||||
#endif
|
81
include/linux/srcutiny.h
Normal file
@ -0,0 +1,81 @@
|
||||
/*
|
||||
* Sleepable Read-Copy Update mechanism for mutual exclusion,
|
||||
* tiny variant.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, you can access it online at
|
||||
* http://www.gnu.org/licenses/gpl-2.0.html.
|
||||
*
|
||||
* Copyright (C) IBM Corporation, 2017
|
||||
*
|
||||
* Author: Paul McKenney <paulmck@us.ibm.com>
|
||||
*/
|
||||
|
||||
#ifndef _LINUX_SRCU_TINY_H
|
||||
#define _LINUX_SRCU_TINY_H
|
||||
|
||||
#include <linux/swait.h>
|
||||
|
||||
struct srcu_struct {
|
||||
int srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */
|
||||
struct swait_queue_head srcu_wq;
|
||||
/* Last srcu_read_unlock() wakes GP. */
|
||||
unsigned long srcu_gp_seq; /* GP seq # for callback tagging. */
|
||||
struct rcu_segcblist srcu_cblist;
|
||||
/* Pending SRCU callbacks. */
|
||||
int srcu_idx; /* Current reader array element. */
|
||||
bool srcu_gp_running; /* GP workqueue running? */
|
||||
bool srcu_gp_waiting; /* GP waiting for readers? */
|
||||
struct work_struct srcu_work; /* For driving grace periods. */
|
||||
#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
||||
struct lockdep_map dep_map;
|
||||
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
|
||||
};
|
||||
|
||||
void srcu_drive_gp(struct work_struct *wp);
|
||||
|
||||
#define __SRCU_STRUCT_INIT(name) \
|
||||
{ \
|
||||
.srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \
|
||||
.srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist), \
|
||||
.srcu_work = __WORK_INITIALIZER(name.srcu_work, srcu_drive_gp), \
|
||||
__SRCU_DEP_MAP_INIT(name) \
|
||||
}
|
||||
|
||||
/*
|
||||
* This odd _STATIC_ arrangement is needed for API compatibility with
|
||||
* Tree SRCU, which needs some per-CPU data.
|
||||
*/
|
||||
#define DEFINE_SRCU(name) \
|
||||
struct srcu_struct name = __SRCU_STRUCT_INIT(name)
|
||||
#define DEFINE_STATIC_SRCU(name) \
|
||||
static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
|
||||
|
||||
void synchronize_srcu(struct srcu_struct *sp);
|
||||
|
||||
static inline void synchronize_srcu_expedited(struct srcu_struct *sp)
|
||||
{
|
||||
synchronize_srcu(sp);
|
||||
}
|
||||
|
||||
static inline void srcu_barrier(struct srcu_struct *sp)
|
||||
{
|
||||
synchronize_srcu(sp);
|
||||
}
|
||||
|
||||
static inline unsigned long srcu_batches_completed(struct srcu_struct *sp)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
139
include/linux/srcutree.h
Normal file
@ -0,0 +1,139 @@
|
||||
/*
|
||||
* Sleepable Read-Copy Update mechanism for mutual exclusion,
|
||||
* tree variant.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, you can access it online at
|
||||
* http://www.gnu.org/licenses/gpl-2.0.html.
|
||||
*
|
||||
* Copyright (C) IBM Corporation, 2017
|
||||
*
|
||||
* Author: Paul McKenney <paulmck@us.ibm.com>
|
||||
*/
|
||||
|
||||
#ifndef _LINUX_SRCU_TREE_H
|
||||
#define _LINUX_SRCU_TREE_H
|
||||
|
||||
#include <linux/rcu_node_tree.h>
|
||||
#include <linux/completion.h>
|
||||
|
||||
struct srcu_node;
|
||||
struct srcu_struct;
|
||||
|
||||
/*
|
||||
* Per-CPU structure feeding into leaf srcu_node, similar in function
|
||||
* to rcu_node.
|
||||
*/
|
||||
struct srcu_data {
|
||||
/* Read-side state. */
|
||||
unsigned long srcu_lock_count[2]; /* Locks per CPU. */
|
||||
unsigned long srcu_unlock_count[2]; /* Unlocks per CPU. */
|
||||
|
||||
/* Update-side state. */
|
||||
spinlock_t lock ____cacheline_internodealigned_in_smp;
|
||||
struct rcu_segcblist srcu_cblist; /* List of callbacks.*/
|
||||
unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */
|
||||
bool srcu_cblist_invoking; /* Invoking these CBs? */
|
||||
struct delayed_work work; /* Context for CB invoking. */
|
||||
struct rcu_head srcu_barrier_head; /* For srcu_barrier() use. */
|
||||
struct srcu_node *mynode; /* Leaf srcu_node. */
|
||||
int cpu;
|
||||
struct srcu_struct *sp;
|
||||
};
|
||||
|
||||
/*
|
||||
* Node in SRCU combining tree, similar in function to rcu_data.
|
||||
*/
|
||||
struct srcu_node {
|
||||
spinlock_t lock;
|
||||
unsigned long srcu_have_cbs[4]; /* GP seq for children */
|
||||
/* having CBs, but only */
|
||||
/* if greater than ->srcu_gp_seq. */
|
||||
struct srcu_node *srcu_parent; /* Next up in tree. */
|
||||
int grplo; /* Least CPU for node. */
|
||||
int grphi; /* Biggest CPU for node. */
|
||||
};
|
||||
|
||||
/*
|
||||
* Per-SRCU-domain structure, similar in function to rcu_state.
|
||||
*/
|
||||
struct srcu_struct {
|
||||
struct srcu_node node[NUM_RCU_NODES]; /* Combining tree. */
|
||||
struct srcu_node *level[RCU_NUM_LVLS + 1];
|
||||
/* First node at each level. */
|
||||
struct mutex srcu_cb_mutex; /* Serialize CB preparation. */
|
||||
spinlock_t gp_lock; /* protect ->srcu_cblist */
|
||||
struct mutex srcu_gp_mutex; /* Serialize GP work. */
|
||||
unsigned int srcu_idx; /* Current rdr array element. */
|
||||
unsigned long srcu_gp_seq; /* Grace-period seq #. */
|
||||
unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */
|
||||
atomic_t srcu_exp_cnt; /* # ongoing expedited GPs. */
|
||||
struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */
|
||||
unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */
|
||||
struct mutex srcu_barrier_mutex; /* Serialize barrier ops. */
|
||||
struct completion srcu_barrier_completion;
|
||||
/* Awaken barrier rq at end. */
|
||||
atomic_t srcu_barrier_cpu_cnt; /* # CPUs not yet posting a */
|
||||
/* callback for the barrier */
|
||||
/* operation. */
|
||||
struct delayed_work work;
|
||||
#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
||||
struct lockdep_map dep_map;
|
||||
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
|
||||
};
|
||||
|
||||
/* Values for state variable (bottom bits of ->srcu_gp_seq). */
|
||||
#define SRCU_STATE_IDLE 0
|
||||
#define SRCU_STATE_SCAN1 1
|
||||
#define SRCU_STATE_SCAN2 2
|
||||
|
||||
void process_srcu(struct work_struct *work);
|
||||
|
||||
#define __SRCU_STRUCT_INIT(name) \
|
||||
{ \
|
||||
.sda = &name##_srcu_data, \
|
||||
.gp_lock = __SPIN_LOCK_UNLOCKED(name.gp_lock), \
|
||||
.srcu_gp_seq_needed = 0 - 1, \
|
||||
__SRCU_DEP_MAP_INIT(name) \
|
||||
}
|
||||
|
||||
/*
|
||||
* Define and initialize a srcu struct at build time.
|
||||
* Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it.
|
||||
*
|
||||
* Note that although DEFINE_STATIC_SRCU() hides the name from other
|
||||
* files, the per-CPU variable rules nevertheless require that the
|
||||
* chosen name be globally unique. These rules also prohibit use of
|
||||
* DEFINE_STATIC_SRCU() within a function. If these rules are too
|
||||
* restrictive, declare the srcu_struct manually. For example, in
|
||||
* each file:
|
||||
*
|
||||
* static struct srcu_struct my_srcu;
|
||||
*
|
||||
* Then, before the first use of each my_srcu, manually initialize it:
|
||||
*
|
||||
* init_srcu_struct(&my_srcu);
|
||||
*
|
||||
* See include/linux/percpu-defs.h for the rules on per-CPU variables.
|
||||
*/
|
||||
#define __DEFINE_SRCU(name, is_static) \
|
||||
static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);\
|
||||
is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
|
||||
#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */)
|
||||
#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static)
|
||||
|
||||
void synchronize_srcu_expedited(struct srcu_struct *sp);
|
||||
void srcu_barrier(struct srcu_struct *sp);
|
||||
unsigned long srcu_batches_completed(struct srcu_struct *sp);
|
||||
|
||||
#endif
|
@ -209,7 +209,7 @@ struct ustat {
|
||||
* naturally due to ABI requirements, but some architectures (like CRIS) have
|
||||
* weird ABI and we need to ask it explicitly.
|
||||
*
|
||||
* The alignment is required to guarantee that bits 0 and 1 of @next will be
|
||||
* The alignment is required to guarantee that bit 0 of @next will be
|
||||
* clear under normal conditions -- as long as we use call_rcu(),
|
||||
* call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue callback.
|
||||
*
|
||||
|
@ -995,7 +995,7 @@ struct smc_hashinfo;
|
||||
struct module;
|
||||
|
||||
/*
|
||||
* caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes
|
||||
* caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes
|
||||
* un-modified. Special care is taken when initializing object to zero.
|
||||
*/
|
||||
static inline void sk_prot_clear_nulls(struct sock *sk, int size)
|
||||
|
39
init/Kconfig
@ -526,6 +526,35 @@ config SRCU
|
||||
permits arbitrary sleeping or blocking within RCU read-side critical
|
||||
sections.
|
||||
|
||||
config CLASSIC_SRCU
|
||||
bool "Use v4.11 classic SRCU implementation"
|
||||
default n
|
||||
depends on RCU_EXPERT && SRCU
|
||||
help
|
||||
This option selects the traditional well-tested classic SRCU
|
||||
implementation from v4.11, as might be desired for enterprise
|
||||
Linux distributions. Without this option, the shiny new
|
||||
Tiny SRCU and Tree SRCU implementations are used instead.
|
||||
At some point, it is hoped that Tiny SRCU and Tree SRCU
|
||||
will accumulate enough test time and confidence to allow
|
||||
Classic SRCU to be dropped entirely.
|
||||
|
||||
Say Y if you need a rock-solid SRCU.
|
||||
|
||||
Say N if you would like to help test Tree SRCU.
|
||||
|
||||
config TINY_SRCU
|
||||
bool
|
||||
default y if TINY_RCU && !CLASSIC_SRCU
|
||||
help
|
||||
This option selects the single-CPU non-preemptible version of SRCU.
|
||||
|
||||
config TREE_SRCU
|
||||
bool
|
||||
default y if !TINY_RCU && !CLASSIC_SRCU
|
||||
help
|
||||
This option selects the full-fledged version of SRCU.
|
||||
|
||||
config TASKS_RCU
|
||||
bool
|
||||
default n
|
||||
@ -612,11 +641,17 @@ config RCU_FANOUT_LEAF
|
||||
initialization. These systems tend to run CPU-bound, and thus
|
||||
are not helped by synchronized interrupts, and thus tend to
|
||||
skew them, which reduces lock contention enough that large
|
||||
leaf-level fanouts work well.
|
||||
leaf-level fanouts work well. That said, setting leaf-level
|
||||
fanout to a large number will likely cause problematic
|
||||
lock contention on the leaf-level rcu_node structures unless
|
||||
you boot with the skew_tick kernel parameter.
|
||||
|
||||
Select a specific number if testing RCU itself.
|
||||
|
||||
Select the maximum permissible value for large systems.
|
||||
Select the maximum permissible value for large systems, but
|
||||
please understand that you may also need to set the skew_tick
|
||||
kernel boot parameter to avoid contention on the rcu_node
|
||||
structure's locks.
|
||||
|
||||
Take the default if unsure.
|
||||
|
||||
|
@ -1313,7 +1313,7 @@ void __cleanup_sighand(struct sighand_struct *sighand)
|
||||
if (atomic_dec_and_test(&sighand->count)) {
|
||||
signalfd_cleanup(sighand);
|
||||
/*
|
||||
* sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it
|
||||
* sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
|
||||
* without an RCU grace period, see __lock_task_sighand().
|
||||
*/
|
||||
kmem_cache_free(sighand_cachep, sighand);
|
||||
@ -2144,7 +2144,7 @@ void __init proc_caches_init(void)
|
||||
{
|
||||
sighand_cachep = kmem_cache_create("sighand_cache",
|
||||
sizeof(struct sighand_struct), 0,
|
||||
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
|
||||
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
|
||||
SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor);
|
||||
signal_cachep = kmem_cache_create("signal_cache",
|
||||
sizeof(struct signal_struct), 0,
|
||||
|
@ -1144,10 +1144,10 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
|
||||
return 0;
|
||||
|
||||
printk("\n");
|
||||
printk("======================================================\n");
|
||||
printk("[ INFO: possible circular locking dependency detected ]\n");
|
||||
pr_warn("======================================================\n");
|
||||
pr_warn("WARNING: possible circular locking dependency detected\n");
|
||||
print_kernel_ident();
|
||||
printk("-------------------------------------------------------\n");
|
||||
pr_warn("------------------------------------------------------\n");
|
||||
printk("%s/%d is trying to acquire lock:\n",
|
||||
curr->comm, task_pid_nr(curr));
|
||||
print_lock(check_src);
|
||||
@ -1482,11 +1482,11 @@ print_bad_irq_dependency(struct task_struct *curr,
|
||||
return 0;
|
||||
|
||||
printk("\n");
|
||||
printk("======================================================\n");
|
||||
printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
|
||||
pr_warn("=====================================================\n");
|
||||
pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n",
|
||||
irqclass, irqclass);
|
||||
print_kernel_ident();
|
||||
printk("------------------------------------------------------\n");
|
||||
pr_warn("-----------------------------------------------------\n");
|
||||
printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
|
||||
curr->comm, task_pid_nr(curr),
|
||||
curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
|
||||
@ -1711,10 +1711,10 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
|
||||
return 0;
|
||||
|
||||
printk("\n");
|
||||
printk("=============================================\n");
|
||||
printk("[ INFO: possible recursive locking detected ]\n");
|
||||
pr_warn("============================================\n");
|
||||
pr_warn("WARNING: possible recursive locking detected\n");
|
||||
print_kernel_ident();
|
||||
printk("---------------------------------------------\n");
|
||||
pr_warn("--------------------------------------------\n");
|
||||
printk("%s/%d is trying to acquire lock:\n",
|
||||
curr->comm, task_pid_nr(curr));
|
||||
print_lock(next);
|
||||
@ -2061,10 +2061,10 @@ static void print_collision(struct task_struct *curr,
|
||||
struct lock_chain *chain)
|
||||
{
|
||||
printk("\n");
|
||||
printk("======================\n");
|
||||
printk("[chain_key collision ]\n");
|
||||
pr_warn("============================\n");
|
||||
pr_warn("WARNING: chain_key collision\n");
|
||||
print_kernel_ident();
|
||||
printk("----------------------\n");
|
||||
pr_warn("----------------------------\n");
|
||||
printk("%s/%d: ", current->comm, task_pid_nr(current));
|
||||
printk("Hash chain already cached but the contents don't match!\n");
|
||||
|
||||
@ -2360,10 +2360,10 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
|
||||
return 0;
|
||||
|
||||
printk("\n");
|
||||
printk("=================================\n");
|
||||
printk("[ INFO: inconsistent lock state ]\n");
|
||||
pr_warn("================================\n");
|
||||
pr_warn("WARNING: inconsistent lock state\n");
|
||||
print_kernel_ident();
|
||||
printk("---------------------------------\n");
|
||||
pr_warn("--------------------------------\n");
|
||||
|
||||
printk("inconsistent {%s} -> {%s} usage.\n",
|
||||
usage_str[prev_bit], usage_str[new_bit]);
|
||||
@ -2425,10 +2425,10 @@ print_irq_inversion_bug(struct task_struct *curr,
|
||||
return 0;
|
||||
|
||||
printk("\n");
|
||||
printk("=========================================================\n");
|
||||
printk("[ INFO: possible irq lock inversion dependency detected ]\n");
|
||||
pr_warn("========================================================\n");
|
||||
pr_warn("WARNING: possible irq lock inversion dependency detected\n");
|
||||
print_kernel_ident();
|
||||
printk("---------------------------------------------------------\n");
|
||||
pr_warn("--------------------------------------------------------\n");
|
||||
printk("%s/%d just changed the state of lock:\n",
|
||||
curr->comm, task_pid_nr(curr));
|
||||
print_lock(this);
|
||||
@ -3170,10 +3170,10 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
|
||||
return 0;
|
||||
|
||||
printk("\n");
|
||||
printk("==================================\n");
|
||||
printk("[ BUG: Nested lock was not taken ]\n");
|
||||
pr_warn("==================================\n");
|
||||
pr_warn("WARNING: Nested lock was not taken\n");
|
||||
print_kernel_ident();
|
||||
printk("----------------------------------\n");
|
||||
pr_warn("----------------------------------\n");
|
||||
|
||||
printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
|
||||
print_lock(hlock);
|
||||
@ -3383,10 +3383,10 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
|
||||
return 0;
|
||||
|
||||
printk("\n");
|
||||
printk("=====================================\n");
|
||||
printk("[ BUG: bad unlock balance detected! ]\n");
|
||||
pr_warn("=====================================\n");
|
||||
pr_warn("WARNING: bad unlock balance detected!\n");
|
||||
print_kernel_ident();
|
||||
printk("-------------------------------------\n");
|
||||
pr_warn("-------------------------------------\n");
|
||||
printk("%s/%d is trying to release lock (",
|
||||
curr->comm, task_pid_nr(curr));
|
||||
print_lockdep_cache(lock);
|
||||
@ -3880,10 +3880,10 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
|
||||
return 0;
|
||||
|
||||
printk("\n");
|
||||
printk("=================================\n");
|
||||
printk("[ BUG: bad contention detected! ]\n");
|
||||
pr_warn("=================================\n");
|
||||
pr_warn("WARNING: bad contention detected!\n");
|
||||
print_kernel_ident();
|
||||
printk("---------------------------------\n");
|
||||
pr_warn("---------------------------------\n");
|
||||
printk("%s/%d is trying to contend lock (",
|
||||
curr->comm, task_pid_nr(curr));
|
||||
print_lockdep_cache(lock);
|
||||
@ -4244,10 +4244,10 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
|
||||
return;
|
||||
|
||||
printk("\n");
|
||||
printk("=========================\n");
|
||||
printk("[ BUG: held lock freed! ]\n");
|
||||
pr_warn("=========================\n");
|
||||
pr_warn("WARNING: held lock freed!\n");
|
||||
print_kernel_ident();
|
||||
printk("-------------------------\n");
|
||||
pr_warn("-------------------------\n");
|
||||
printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
|
||||
curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
|
||||
print_lock(hlock);
|
||||
@ -4302,11 +4302,11 @@ static void print_held_locks_bug(void)
|
||||
return;
|
||||
|
||||
printk("\n");
|
||||
printk("=====================================\n");
|
||||
printk("[ BUG: %s/%d still has locks held! ]\n",
|
||||
pr_warn("====================================\n");
|
||||
pr_warn("WARNING: %s/%d still has locks held!\n",
|
||||
current->comm, task_pid_nr(current));
|
||||
print_kernel_ident();
|
||||
printk("-------------------------------------\n");
|
||||
pr_warn("------------------------------------\n");
|
||||
lockdep_print_held_locks(current);
|
||||
printk("\nstack backtrace:\n");
|
||||
dump_stack();
|
||||
@ -4371,7 +4371,7 @@ void debug_show_all_locks(void)
|
||||
} while_each_thread(g, p);
|
||||
|
||||
printk("\n");
|
||||
printk("=============================================\n\n");
|
||||
pr_warn("=============================================\n\n");
|
||||
|
||||
if (unlock)
|
||||
read_unlock(&tasklist_lock);
|
||||
@ -4401,10 +4401,10 @@ asmlinkage __visible void lockdep_sys_exit(void)
|
||||
if (!debug_locks_off())
|
||||
return;
|
||||
printk("\n");
|
||||
printk("================================================\n");
|
||||
printk("[ BUG: lock held when returning to user space! ]\n");
|
||||
pr_warn("================================================\n");
|
||||
pr_warn("WARNING: lock held when returning to user space!\n");
|
||||
print_kernel_ident();
|
||||
printk("------------------------------------------------\n");
|
||||
pr_warn("------------------------------------------------\n");
|
||||
printk("%s/%d is leaving the kernel with locks still held!\n",
|
||||
curr->comm, curr->pid);
|
||||
lockdep_print_held_locks(curr);
|
||||
@ -4421,13 +4421,13 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
|
||||
#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
|
||||
/* Note: the following can be executed concurrently, so be careful. */
|
||||
printk("\n");
|
||||
pr_err("===============================\n");
|
||||
pr_err("[ ERR: suspicious RCU usage. ]\n");
|
||||
pr_warn("=============================\n");
|
||||
pr_warn("WARNING: suspicious RCU usage\n");
|
||||
print_kernel_ident();
|
||||
pr_err("-------------------------------\n");
|
||||
pr_err("%s:%d %s!\n", file, line, s);
|
||||
pr_err("\nother info that might help us debug this:\n\n");
|
||||
pr_err("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
|
||||
pr_warn("-----------------------------\n");
|
||||
printk("%s:%d %s!\n", file, line, s);
|
||||
printk("\nother info that might help us debug this:\n\n");
|
||||
printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
|
||||
!rcu_lockdep_current_cpu_online()
|
||||
? "RCU used illegally from offline CPU!\n"
|
||||
: !rcu_is_watching()
|
||||
|
@ -102,10 +102,11 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
|
||||
return;
|
||||
}
|
||||
|
||||
printk("\n============================================\n");
|
||||
printk( "[ BUG: circular locking deadlock detected! ]\n");
|
||||
printk("%s\n", print_tainted());
|
||||
printk( "--------------------------------------------\n");
|
||||
pr_warn("\n");
|
||||
pr_warn("============================================\n");
|
||||
pr_warn("WARNING: circular locking deadlock detected!\n");
|
||||
pr_warn("%s\n", print_tainted());
|
||||
pr_warn("--------------------------------------------\n");
|
||||
printk("%s/%d is deadlocking current task %s/%d\n\n",
|
||||
task->comm, task_pid_nr(task),
|
||||
current->comm, task_pid_nr(current));
|
||||
|
@ -3,7 +3,9 @@
|
||||
KCOV_INSTRUMENT := n
|
||||
|
||||
obj-y += update.o sync.o
|
||||
obj-$(CONFIG_SRCU) += srcu.o
|
||||
obj-$(CONFIG_CLASSIC_SRCU) += srcu.o
|
||||
obj-$(CONFIG_TREE_SRCU) += srcutree.o
|
||||
obj-$(CONFIG_TINY_SRCU) += srcutiny.o
|
||||
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
|
||||
obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
|
||||
obj-$(CONFIG_TREE_RCU) += tree.o
|
||||
|
153
kernel/rcu/rcu.h
@ -56,6 +56,83 @@
|
||||
#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \
|
||||
DYNTICK_TASK_FLAG)
|
||||
|
||||
|
||||
/*
|
||||
* Grace-period counter management.
|
||||
*/
|
||||
|
||||
#define RCU_SEQ_CTR_SHIFT 2
|
||||
#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1)
|
||||
|
||||
/*
|
||||
* Return the counter portion of a sequence number previously returned
|
||||
* by rcu_seq_snap() or rcu_seq_current().
|
||||
*/
|
||||
static inline unsigned long rcu_seq_ctr(unsigned long s)
|
||||
{
|
||||
return s >> RCU_SEQ_CTR_SHIFT;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the state portion of a sequence number previously returned
|
||||
* by rcu_seq_snap() or rcu_seq_current().
|
||||
*/
|
||||
static inline int rcu_seq_state(unsigned long s)
|
||||
{
|
||||
return s & RCU_SEQ_STATE_MASK;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the state portion of the pointed-to sequence number.
|
||||
* The caller is responsible for preventing conflicting updates.
|
||||
*/
|
||||
static inline void rcu_seq_set_state(unsigned long *sp, int newstate)
|
||||
{
|
||||
WARN_ON_ONCE(newstate & ~RCU_SEQ_STATE_MASK);
|
||||
WRITE_ONCE(*sp, (*sp & ~RCU_SEQ_STATE_MASK) + newstate);
|
||||
}
|
||||
|
||||
/* Adjust sequence number for start of update-side operation. */
|
||||
static inline void rcu_seq_start(unsigned long *sp)
|
||||
{
|
||||
WRITE_ONCE(*sp, *sp + 1);
|
||||
smp_mb(); /* Ensure update-side operation after counter increment. */
|
||||
WARN_ON_ONCE(rcu_seq_state(*sp) != 1);
|
||||
}
|
||||
|
||||
/* Adjust sequence number for end of update-side operation. */
|
||||
static inline void rcu_seq_end(unsigned long *sp)
|
||||
{
|
||||
smp_mb(); /* Ensure update-side operation before counter increment. */
|
||||
WARN_ON_ONCE(!rcu_seq_state(*sp));
|
||||
WRITE_ONCE(*sp, (*sp | RCU_SEQ_STATE_MASK) + 1);
|
||||
}
|
||||
|
||||
/* Take a snapshot of the update side's sequence number. */
|
||||
static inline unsigned long rcu_seq_snap(unsigned long *sp)
|
||||
{
|
||||
unsigned long s;
|
||||
|
||||
s = (READ_ONCE(*sp) + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK;
|
||||
smp_mb(); /* Above access must not bleed into critical section. */
|
||||
return s;
|
||||
}
|
||||
|
||||
/* Return the current value of the update side's sequence number, no ordering. */
|
||||
static inline unsigned long rcu_seq_current(unsigned long *sp)
|
||||
{
|
||||
return READ_ONCE(*sp);
|
||||
}
|
||||
|
||||
/*
|
||||
* Given a snapshot from rcu_seq_snap(), determine whether or not a
|
||||
* full update-side operation has occurred.
|
||||
*/
|
||||
static inline bool rcu_seq_done(unsigned long *sp, unsigned long s)
|
||||
{
|
||||
return ULONG_CMP_GE(READ_ONCE(*sp), s);
|
||||
}
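As a worked example of the counter layout above, a small user-space sketch (illustration only, mirroring the helpers rather than reusing them): with RCU_SEQ_CTR_SHIFT of 2, the low two bits carry grace-period state, and a snapshot taken while a grace period is running is not "done" until a second full grace period completes.

#include <stdio.h>

#define RCU_SEQ_CTR_SHIFT  2
#define RCU_SEQ_STATE_MASK ((1UL << RCU_SEQ_CTR_SHIFT) - 1)

int main(void)
{
	unsigned long seq = 0;

	seq++;                          /* rcu_seq_start(): state becomes 1 */

	/* rcu_seq_snap(): first value at which the running GP plus one more
	 * full GP have completed: (1 + 2*3 + 1) & ~3 == 8, i.e. ctr == 2. */
	unsigned long snap = (seq + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK;

	seq = (seq | RCU_SEQ_STATE_MASK) + 1;   /* rcu_seq_end(): 4 -> ctr 1, state 0 */

	printf("ctr=%lu state=%lu snap=%lu done=%d\n",
	       seq >> RCU_SEQ_CTR_SHIFT, seq & RCU_SEQ_STATE_MASK,
	       snap, seq >= snap);      /* done==0: one more full GP still needed */
	return 0;
}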
|
||||
|
||||
/*
|
||||
* debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
|
||||
* by call_rcu() and rcu callback execution, and are therefore not part of the
|
||||
@ -109,12 +186,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
|
||||
|
||||
rcu_lock_acquire(&rcu_callback_map);
|
||||
if (__is_kfree_rcu_offset(offset)) {
|
||||
RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
|
||||
RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);)
|
||||
kfree((void *)head - offset);
|
||||
rcu_lock_release(&rcu_callback_map);
|
||||
return true;
|
||||
} else {
|
||||
RCU_TRACE(trace_rcu_invoke_callback(rn, head));
|
||||
RCU_TRACE(trace_rcu_invoke_callback(rn, head);)
|
||||
head->func(head);
|
||||
rcu_lock_release(&rcu_callback_map);
|
||||
return false;
|
||||
@ -144,4 +221,76 @@ void rcu_test_sync_prims(void);
|
||||
*/
|
||||
extern void resched_cpu(int cpu);
|
||||
|
||||
#if defined(SRCU) || !defined(TINY_RCU)
|
||||
|
||||
#include <linux/rcu_node_tree.h>
|
||||
|
||||
extern int rcu_num_lvls;
|
||||
extern int num_rcu_lvl[];
|
||||
extern int rcu_num_nodes;
|
||||
static bool rcu_fanout_exact;
|
||||
static int rcu_fanout_leaf;
|
||||
|
||||
/*
|
||||
* Compute the per-level fanout, either using the exact fanout specified
|
||||
* or balancing the tree, depending on the rcu_fanout_exact boot parameter.
|
||||
*/
|
||||
static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (rcu_fanout_exact) {
|
||||
levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
|
||||
for (i = rcu_num_lvls - 2; i >= 0; i--)
|
||||
levelspread[i] = RCU_FANOUT;
|
||||
} else {
|
||||
int ccur;
|
||||
int cprv;
|
||||
|
||||
cprv = nr_cpu_ids;
|
||||
for (i = rcu_num_lvls - 1; i >= 0; i--) {
|
||||
ccur = levelcnt[i];
|
||||
levelspread[i] = (cprv + ccur - 1) / ccur;
|
||||
cprv = ccur;
|
||||
}
|
||||
}
|
||||
}
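As a worked example (user-space sketch, illustration only) of the balanced branch above, assume a hypothetical 96-CPU machine whose tree has one root node over six leaf nodes:

#include <stdio.h>

int main(void)
{
	int nr_cpu_ids = 96;                 /* hypothetical CPU count */
	int levelcnt[2] = { 1, 6 };          /* nodes per level, root first */
	int levelspread[2];
	int cprv = nr_cpu_ids;

	for (int i = 1; i >= 0; i--) {
		int ccur = levelcnt[i];
		levelspread[i] = (cprv + ccur - 1) / ccur;  /* ceiling divide */
		cprv = ccur;
	}
	printf("root fanout %d, leaf fanout %d\n",
	       levelspread[0], levelspread[1]);           /* prints 6 and 16 */
	return 0;
}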
|
||||
|
||||
/*
|
||||
* Do a full breadth-first scan of the rcu_node structures for the
|
||||
* specified rcu_state structure.
|
||||
*/
|
||||
#define rcu_for_each_node_breadth_first(rsp, rnp) \
|
||||
for ((rnp) = &(rsp)->node[0]; \
|
||||
(rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
|
||||
|
||||
/*
|
||||
* Do a breadth-first scan of the non-leaf rcu_node structures for the
|
||||
* specified rcu_state structure. Note that if there is a singleton
|
||||
* rcu_node tree with but one rcu_node structure, this loop is a no-op.
|
||||
*/
|
||||
#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
|
||||
for ((rnp) = &(rsp)->node[0]; \
|
||||
(rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
|
||||
|
||||
/*
|
||||
* Scan the leaves of the rcu_node hierarchy for the specified rcu_state
|
||||
* structure. Note that if there is a singleton rcu_node tree with but
|
||||
* one rcu_node structure, this loop -will- visit the rcu_node structure.
|
||||
* It is still a leaf node, even if it is also the root node.
|
||||
*/
|
||||
#define rcu_for_each_leaf_node(rsp, rnp) \
|
||||
for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
|
||||
(rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
|
||||
|
||||
/*
|
||||
* Iterate over all possible CPUs in a leaf RCU node.
|
||||
*/
|
||||
#define for_each_leaf_node_possible_cpu(rnp, cpu) \
|
||||
for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \
|
||||
cpu <= rnp->grphi; \
|
||||
cpu = cpumask_next((cpu), cpu_possible_mask))
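For instance, a short kernel-context sketch (hypothetical helper, assumed to live next to this header) combining the leaf iterator with the per-leaf CPU iterator defined above:

/* Count every possible CPU covered by the leaf rcu_node structures. */
static void count_leaf_cpus(struct rcu_state *rsp)
{
	struct rcu_node *rnp;
	int cpu;
	int n = 0;

	rcu_for_each_leaf_node(rsp, rnp)
		for_each_leaf_node_possible_cpu(rnp, cpu)
			n++;
	pr_info("leaf rcu_node structures cover %d possible CPUs\n", n);
}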
|
||||
|
||||
#endif /* #if defined(SRCU) || !defined(TINY_RCU) */
|
||||
|
||||
#endif /* __LINUX_RCU_H */
|
||||
|
@ -559,19 +559,34 @@ static void srcu_torture_barrier(void)
|
||||
|
||||
static void srcu_torture_stats(void)
|
||||
{
|
||||
int cpu;
|
||||
int idx = srcu_ctlp->completed & 0x1;
|
||||
int __maybe_unused cpu;
|
||||
int idx;
|
||||
|
||||
pr_alert("%s%s per-CPU(idx=%d):",
|
||||
#if defined(CONFIG_TREE_SRCU) || defined(CONFIG_CLASSIC_SRCU)
|
||||
#ifdef CONFIG_TREE_SRCU
|
||||
idx = srcu_ctlp->srcu_idx & 0x1;
|
||||
#else /* #ifdef CONFIG_TREE_SRCU */
|
||||
idx = srcu_ctlp->completed & 0x1;
|
||||
#endif /* #else #ifdef CONFIG_TREE_SRCU */
|
||||
pr_alert("%s%s Tree SRCU per-CPU(idx=%d):",
|
||||
torture_type, TORTURE_FLAG, idx);
|
||||
for_each_possible_cpu(cpu) {
|
||||
unsigned long l0, l1;
|
||||
unsigned long u0, u1;
|
||||
long c0, c1;
|
||||
struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
|
||||
#ifdef CONFIG_TREE_SRCU
|
||||
struct srcu_data *counts;
|
||||
|
||||
counts = per_cpu_ptr(srcu_ctlp->sda, cpu);
|
||||
u0 = counts->srcu_unlock_count[!idx];
|
||||
u1 = counts->srcu_unlock_count[idx];
|
||||
#else /* #ifdef CONFIG_TREE_SRCU */
|
||||
struct srcu_array *counts;
|
||||
|
||||
counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
|
||||
u0 = counts->unlock_count[!idx];
|
||||
u1 = counts->unlock_count[idx];
|
||||
#endif /* #else #ifdef CONFIG_TREE_SRCU */
|
||||
|
||||
/*
|
||||
* Make sure that a lock is always counted if the corresponding
|
||||
@ -579,14 +594,26 @@ static void srcu_torture_stats(void)
|
||||
*/
|
||||
smp_rmb();
|
||||
|
||||
#ifdef CONFIG_TREE_SRCU
|
||||
l0 = counts->srcu_lock_count[!idx];
|
||||
l1 = counts->srcu_lock_count[idx];
|
||||
#else /* #ifdef CONFIG_TREE_SRCU */
|
||||
l0 = counts->lock_count[!idx];
|
||||
l1 = counts->lock_count[idx];
|
||||
#endif /* #else #ifdef CONFIG_TREE_SRCU */
|
||||
|
||||
c0 = l0 - u0;
|
||||
c1 = l1 - u1;
|
||||
pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
|
||||
}
|
||||
pr_cont("\n");
|
||||
#elif defined(CONFIG_TINY_SRCU)
|
||||
idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1;
|
||||
pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%d,%d)\n",
|
||||
torture_type, TORTURE_FLAG, idx,
|
||||
READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]),
|
||||
READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx]));
|
||||
#endif
|
||||
}
|
||||
|
||||
static void srcu_torture_synchronize_expedited(void)
|
||||
|
@ -22,7 +22,7 @@
|
||||
* Lai Jiangshan <laijs@cn.fujitsu.com>
|
||||
*
|
||||
* For detailed explanation of Read-Copy Update mechanism see -
|
||||
* Documentation/RCU/ *.txt
|
||||
* Documentation/RCU/ *.txt
|
||||
*
|
||||
*/
|
||||
|
||||
@ -243,8 +243,14 @@ static bool srcu_readers_active(struct srcu_struct *sp)
|
||||
* cleanup_srcu_struct - deconstruct a sleep-RCU structure
|
||||
* @sp: structure to clean up.
|
||||
*
|
||||
* Must invoke this after you are finished using a given srcu_struct that
|
||||
* was initialized via init_srcu_struct(), else you leak memory.
|
||||
* Must invoke this only after you are finished using a given srcu_struct
|
||||
* that was initialized via init_srcu_struct(). This code does some
|
||||
* probabilistic checking, spotting late uses of srcu_read_lock(),
|
||||
* synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu().
|
||||
* If any such late uses are detected, the per-CPU memory associated with
|
||||
* the srcu_struct is simply leaked and WARN_ON() is invoked. If the
|
||||
* caller frees the srcu_struct itself, a use-after-free crash will likely
|
||||
* ensue, but at least there will be a warning printed.
|
||||
*/
|
||||
void cleanup_srcu_struct(struct srcu_struct *sp)
|
||||
{
|
||||
|
215
kernel/rcu/srcutiny.c
Normal file
@ -0,0 +1,215 @@
|
||||
/*
|
||||
* Sleepable Read-Copy Update mechanism for mutual exclusion,
|
||||
* tiny version for non-preemptible single-CPU use.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, you can access it online at
|
||||
* http://www.gnu.org/licenses/gpl-2.0.html.
|
||||
*
|
||||
* Copyright (C) IBM Corporation, 2017
|
||||
*
|
||||
* Author: Paul McKenney <paulmck@us.ibm.com>
|
||||
*/
|
||||
|
||||
#include <linux/export.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/preempt.h>
|
||||
#include <linux/rcupdate_wait.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/srcu.h>
|
||||
|
||||
#include <linux/rcu_node_tree.h>
|
||||
#include "rcu.h"
|
||||
|
||||
static int init_srcu_struct_fields(struct srcu_struct *sp)
|
||||
{
|
||||
sp->srcu_lock_nesting[0] = 0;
|
||||
sp->srcu_lock_nesting[1] = 0;
|
||||
init_swait_queue_head(&sp->srcu_wq);
|
||||
sp->srcu_gp_seq = 0;
|
||||
rcu_segcblist_init(&sp->srcu_cblist);
|
||||
sp->srcu_gp_running = false;
|
||||
sp->srcu_gp_waiting = false;
|
||||
sp->srcu_idx = 0;
|
||||
INIT_WORK(&sp->srcu_work, srcu_drive_gp);
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
||||
|
||||
int __init_srcu_struct(struct srcu_struct *sp, const char *name,
|
||||
struct lock_class_key *key)
|
||||
{
|
||||
/* Don't re-initialize a lock while it is held. */
|
||||
debug_check_no_locks_freed((void *)sp, sizeof(*sp));
|
||||
lockdep_init_map(&sp->dep_map, name, key, 0);
|
||||
return init_srcu_struct_fields(sp);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__init_srcu_struct);
|
||||
|
||||
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
|
||||
|
||||
/*
|
||||
* init_srcu_struct - initialize a sleep-RCU structure
|
||||
* @sp: structure to initialize.
|
||||
*
|
||||
* Must invoke this on a given srcu_struct before passing that srcu_struct
|
||||
* to any other function. Each srcu_struct represents a separate domain
|
||||
* of SRCU protection.
|
||||
*/
|
||||
int init_srcu_struct(struct srcu_struct *sp)
|
||||
{
|
||||
return init_srcu_struct_fields(sp);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(init_srcu_struct);
|
||||
|
||||
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
|
||||
|
||||
/*
|
||||
* cleanup_srcu_struct - deconstruct a sleep-RCU structure
|
||||
* @sp: structure to clean up.
|
||||
*
|
||||
* Must invoke this after you are finished using a given srcu_struct that
|
||||
* was initialized via init_srcu_struct(), else you leak memory.
|
||||
*/
|
||||
void cleanup_srcu_struct(struct srcu_struct *sp)
|
||||
{
|
||||
WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]);
|
||||
flush_work(&sp->srcu_work);
|
||||
WARN_ON(rcu_seq_state(sp->srcu_gp_seq));
|
||||
WARN_ON(sp->srcu_gp_running);
|
||||
WARN_ON(sp->srcu_gp_waiting);
|
||||
WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist));
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
|
||||
|
||||
/*
|
||||
* Counts the new reader in the appropriate per-CPU element of the
|
||||
* srcu_struct. Must be called from process context.
|
||||
* Returns an index that must be passed to the matching srcu_read_unlock().
|
||||
*/
|
||||
int __srcu_read_lock(struct srcu_struct *sp)
|
||||
{
|
||||
int idx;
|
||||
|
||||
idx = READ_ONCE(sp->srcu_idx);
|
||||
WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1);
|
||||
return idx;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__srcu_read_lock);
|
||||
|
||||
/*
|
||||
* Removes the count for the old reader from the appropriate element of
|
||||
* the srcu_struct. Must be called from process context.
|
||||
*/
|
||||
void __srcu_read_unlock(struct srcu_struct *sp, int idx)
|
||||
{
|
||||
int newval = sp->srcu_lock_nesting[idx] - 1;
|
||||
|
||||
WRITE_ONCE(sp->srcu_lock_nesting[idx], newval);
|
||||
if (!newval && READ_ONCE(sp->srcu_gp_waiting))
|
||||
swake_up(&sp->srcu_wq);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
|
||||
|
||||
/*
|
||||
* Workqueue handler to drive one grace period and invoke any callbacks
|
||||
* that become ready as a result. Single-CPU and !PREEMPT operation
|
||||
* means that we get away with murder on synchronization. ;-)
|
||||
*/
|
||||
void srcu_drive_gp(struct work_struct *wp)
|
||||
{
|
||||
int idx;
|
||||
struct rcu_cblist ready_cbs;
|
||||
struct srcu_struct *sp;
|
||||
struct rcu_head *rhp;
|
||||
|
||||
sp = container_of(wp, struct srcu_struct, srcu_work);
|
||||
if (sp->srcu_gp_running || rcu_segcblist_empty(&sp->srcu_cblist))
|
||||
return; /* Already running or nothing to do. */
|
||||
|
||||
/* Tag recently arrived callbacks and wait for readers. */
|
||||
WRITE_ONCE(sp->srcu_gp_running, true);
|
||||
rcu_segcblist_accelerate(&sp->srcu_cblist,
|
||||
rcu_seq_snap(&sp->srcu_gp_seq));
|
||||
rcu_seq_start(&sp->srcu_gp_seq);
|
||||
idx = sp->srcu_idx;
|
||||
WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx);
|
||||
WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
|
||||
swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx]));
|
||||
WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
|
||||
rcu_seq_end(&sp->srcu_gp_seq);
|
||||
|
||||
/* Update callback list based on GP, and invoke ready callbacks. */
|
||||
rcu_segcblist_advance(&sp->srcu_cblist,
|
||||
rcu_seq_current(&sp->srcu_gp_seq));
|
||||
if (rcu_segcblist_ready_cbs(&sp->srcu_cblist)) {
|
||||
rcu_cblist_init(&ready_cbs);
|
||||
local_irq_disable();
|
||||
rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs);
|
||||
local_irq_enable();
|
||||
rhp = rcu_cblist_dequeue(&ready_cbs);
|
||||
for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
|
||||
local_bh_disable();
|
||||
rhp->func(rhp);
|
||||
local_bh_enable();
|
||||
}
|
||||
local_irq_disable();
|
||||
rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs);
|
||||
local_irq_enable();
|
||||
}
|
||||
WRITE_ONCE(sp->srcu_gp_running, false);
|
||||
|
||||
/*
|
||||
* If more callbacks, reschedule ourselves. This can race with
|
||||
* a call_srcu() at interrupt level, but the ->srcu_gp_running
|
||||
* checks will straighten that out.
|
||||
*/
|
||||
if (!rcu_segcblist_empty(&sp->srcu_cblist))
|
||||
schedule_work(&sp->srcu_work);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(srcu_drive_gp);
|
||||
|
||||
/*
|
||||
* Enqueue an SRCU callback on the specified srcu_struct structure,
|
||||
* initiating grace-period processing if it is not already running.
|
||||
*/
|
||||
void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
|
||||
rcu_callback_t func)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
head->func = func;
|
||||
local_irq_save(flags);
|
||||
rcu_segcblist_enqueue(&sp->srcu_cblist, head, false);
|
||||
local_irq_restore(flags);
|
||||
if (!READ_ONCE(sp->srcu_gp_running))
|
||||
schedule_work(&sp->srcu_work);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(call_srcu);
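For reference, a hedged sketch of the usual way call_srcu() is consumed (kernel context; struct foo and the helper names are hypothetical): defer freeing an object until all current SRCU readers are done.

#include <linux/slab.h>
#include <linux/srcu.h>

struct foo {
	struct rcu_head rh;
	/* ... payload ... */
};

static void foo_free_cb(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct foo, rh));   /* runs after a grace period */
}

static void foo_release(struct srcu_struct *sp, struct foo *p)
{
	call_srcu(sp, &p->rh, foo_free_cb);
}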
|
||||
|
||||
/*
|
||||
* synchronize_srcu - wait for prior SRCU read-side critical-section completion
|
||||
*/
|
||||
void synchronize_srcu(struct srcu_struct *sp)
|
||||
{
|
||||
struct rcu_synchronize rs;
|
||||
|
||||
init_rcu_head_on_stack(&rs.head);
|
||||
init_completion(&rs.completion);
|
||||
call_srcu(sp, &rs.head, wakeme_after_rcu);
|
||||
wait_for_completion(&rs.completion);
|
||||
destroy_rcu_head_on_stack(&rs.head);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(synchronize_srcu);
|
996
kernel/rcu/srcutree.c
Normal file
@ -0,0 +1,996 @@
|
||||
/*
|
||||
* Sleepable Read-Copy Update mechanism for mutual exclusion.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, you can access it online at
|
||||
* http://www.gnu.org/licenses/gpl-2.0.html.
|
||||
*
|
||||
* Copyright (C) IBM Corporation, 2006
|
||||
* Copyright (C) Fujitsu, 2012
|
||||
*
|
||||
* Author: Paul McKenney <paulmck@us.ibm.com>
|
||||
* Lai Jiangshan <laijs@cn.fujitsu.com>
|
||||
*
|
||||
* For detailed explanation of Read-Copy Update mechanism see -
|
||||
* Documentation/RCU/ *.txt
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/export.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/percpu.h>
|
||||
#include <linux/preempt.h>
|
||||
#include <linux/rcupdate_wait.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/smp.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/srcu.h>
|
||||
|
||||
#include "rcu.h"
|
||||
|
||||
static void srcu_invoke_callbacks(struct work_struct *work);
|
||||
static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
|
||||
|
||||
/*
|
||||
* Initialize SRCU combining tree. Note that statically allocated
|
||||
* srcu_struct structures might already have srcu_read_lock() and
|
||||
* srcu_read_unlock() running against them. So if the is_static parameter
|
||||
* is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
|
||||
*/
|
||||
static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
|
||||
{
|
||||
int cpu;
|
||||
int i;
|
||||
int level = 0;
|
||||
int levelspread[RCU_NUM_LVLS];
|
||||
struct srcu_data *sdp;
|
||||
struct srcu_node *snp;
|
||||
struct srcu_node *snp_first;
|
||||
|
||||
/* Work out the overall tree geometry. */
|
||||
sp->level[0] = &sp->node[0];
|
||||
for (i = 1; i < rcu_num_lvls; i++)
|
||||
sp->level[i] = sp->level[i - 1] + num_rcu_lvl[i - 1];
|
||||
rcu_init_levelspread(levelspread, num_rcu_lvl);
|
||||
|
||||
/* Each pass through this loop initializes one srcu_node structure. */
|
||||
rcu_for_each_node_breadth_first(sp, snp) {
|
||||
spin_lock_init(&snp->lock);
|
||||
for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++)
|
||||
snp->srcu_have_cbs[i] = 0;
|
||||
snp->grplo = -1;
|
||||
snp->grphi = -1;
|
||||
if (snp == &sp->node[0]) {
|
||||
/* Root node, special case. */
|
||||
snp->srcu_parent = NULL;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Non-root node. */
|
||||
if (snp == sp->level[level + 1])
|
||||
level++;
|
||||
snp->srcu_parent = sp->level[level - 1] +
|
||||
(snp - sp->level[level]) /
|
||||
levelspread[level - 1];
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize the per-CPU srcu_data array, which feeds into the
|
||||
* leaves of the srcu_node tree.
|
||||
*/
|
||||
WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
|
||||
ARRAY_SIZE(sdp->srcu_unlock_count));
|
||||
level = rcu_num_lvls - 1;
|
||||
snp_first = sp->level[level];
|
||||
for_each_possible_cpu(cpu) {
|
||||
sdp = per_cpu_ptr(sp->sda, cpu);
|
||||
spin_lock_init(&sdp->lock);
|
||||
rcu_segcblist_init(&sdp->srcu_cblist);
|
||||
sdp->srcu_cblist_invoking = false;
|
||||
sdp->srcu_gp_seq_needed = sp->srcu_gp_seq;
|
||||
sdp->mynode = &snp_first[cpu / levelspread[level]];
|
||||
for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) {
|
||||
if (snp->grplo < 0)
|
||||
snp->grplo = cpu;
|
||||
snp->grphi = cpu;
|
||||
}
|
||||
sdp->cpu = cpu;
|
||||
INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks);
|
||||
sdp->sp = sp;
|
||||
if (is_static)
|
||||
continue;
|
||||
|
||||
/* Dynamically allocated, better be no srcu_read_locks()! */
|
||||
for (i = 0; i < ARRAY_SIZE(sdp->srcu_lock_count); i++) {
|
||||
sdp->srcu_lock_count[i] = 0;
|
||||
sdp->srcu_unlock_count[i] = 0;
|
||||
}
|
||||
}
|
||||
}
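A quick user-space sketch (illustration only) of where a given CPU lands in that geometry, reusing the hypothetical 96-CPU tree from the earlier levelspread sketch (leaf fanout 16, root fanout 6):

#include <stdio.h>

int main(void)
{
	int levelspread[2] = { 6, 16 };     /* root, leaf */
	int cpu = 37;                       /* hypothetical CPU number */
	int leaf = cpu / levelspread[1];    /* index into the leaf level */
	int parent = leaf / levelspread[0]; /* index into the root level */

	printf("cpu %d -> leaf node %d -> parent node %d\n", cpu, leaf, parent);
	return 0;
}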
|
||||
|
||||
/*
|
||||
* Initialize non-compile-time initialized fields, including the
|
||||
* associated srcu_node and srcu_data structures. The is_static
|
||||
* parameter is passed through to init_srcu_struct_nodes(), and
|
||||
* also tells us that ->sda has already been wired up to srcu_data.
|
||||
*/
|
||||
static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static)
|
||||
{
|
||||
mutex_init(&sp->srcu_cb_mutex);
|
||||
mutex_init(&sp->srcu_gp_mutex);
|
||||
sp->srcu_idx = 0;
|
||||
sp->srcu_gp_seq = 0;
|
||||
atomic_set(&sp->srcu_exp_cnt, 0);
|
||||
sp->srcu_barrier_seq = 0;
|
||||
mutex_init(&sp->srcu_barrier_mutex);
|
||||
atomic_set(&sp->srcu_barrier_cpu_cnt, 0);
|
||||
INIT_DELAYED_WORK(&sp->work, process_srcu);
|
||||
if (!is_static)
|
||||
sp->sda = alloc_percpu(struct srcu_data);
|
||||
init_srcu_struct_nodes(sp, is_static);
|
||||
smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */
|
||||
return sp->sda ? 0 : -ENOMEM;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
||||
|
||||
int __init_srcu_struct(struct srcu_struct *sp, const char *name,
|
||||
struct lock_class_key *key)
|
||||
{
|
||||
/* Don't re-initialize a lock while it is held. */
|
||||
debug_check_no_locks_freed((void *)sp, sizeof(*sp));
|
||||
lockdep_init_map(&sp->dep_map, name, key, 0);
|
||||
spin_lock_init(&sp->gp_lock);
|
||||
return init_srcu_struct_fields(sp, false);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__init_srcu_struct);
|
||||
|
||||
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
|
||||
|
||||
/**
|
||||
* init_srcu_struct - initialize a sleep-RCU structure
|
||||
* @sp: structure to initialize.
|
||||
*
|
||||
* Must invoke this on a given srcu_struct before passing that srcu_struct
|
||||
* to any other function. Each srcu_struct represents a separate domain
|
||||
* of SRCU protection.
|
||||
*/
|
||||
int init_srcu_struct(struct srcu_struct *sp)
|
||||
{
|
||||
spin_lock_init(&sp->gp_lock);
|
||||
return init_srcu_struct_fields(sp, false);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(init_srcu_struct);
|
||||
|
||||
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
|
||||
|
||||
/*
|
||||
* First-use initialization of statically allocated srcu_struct
|
||||
* structure. Wiring up the combining tree is more than can be
|
||||
* done with compile-time initialization, so this check is added
|
||||
* to each update-side SRCU primitive. Use ->gp_lock, which -is-
|
||||
* compile-time initialized, to resolve races involving multiple
|
||||
* CPUs trying to garner first-use privileges.
|
||||
*/
|
||||
static void check_init_srcu_struct(struct srcu_struct *sp)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
WARN_ON_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INIT);
|
||||
/* The smp_load_acquire() pairs with the smp_store_release(). */
|
||||
if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/
|
||||
return; /* Already initialized. */
|
||||
spin_lock_irqsave(&sp->gp_lock, flags);
|
||||
if (!rcu_seq_state(sp->srcu_gp_seq_needed)) {
|
||||
spin_unlock_irqrestore(&sp->gp_lock, flags);
|
||||
return;
|
||||
}
|
||||
init_srcu_struct_fields(sp, true);
|
||||
spin_unlock_irqrestore(&sp->gp_lock, flags);
|
||||
}
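The same check-then-lock-then-recheck shape, as a standalone user-space sketch (illustration only; C11 atomics and a pthread mutex stand in for smp_load_acquire()/smp_store_release() and ->gp_lock):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool initialized;        /* plays the role of the init-done state */
static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;  /* like ->gp_lock */

static void check_init(void)
{
	if (atomic_load_explicit(&initialized, memory_order_acquire))
		return;                /* fast path: already initialized */
	pthread_mutex_lock(&init_lock);
	if (!atomic_load_explicit(&initialized, memory_order_relaxed)) {
		printf("one-time initialization runs exactly once\n");
		atomic_store_explicit(&initialized, true, memory_order_release);
	}
	pthread_mutex_unlock(&init_lock);
}

int main(void)
{
	check_init();
	check_init();   /* second call takes only the fast path */
	return 0;
}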
|
||||
|
||||
/*
|
||||
* Returns approximate total of the readers' ->srcu_lock_count[] values
|
||||
* for the rank of per-CPU counters specified by idx.
|
||||
*/
|
||||
static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
|
||||
{
|
||||
int cpu;
|
||||
unsigned long sum = 0;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
|
||||
|
||||
sum += READ_ONCE(cpuc->srcu_lock_count[idx]);
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns approximate total of the readers' ->srcu_unlock_count[] values
|
||||
* for the rank of per-CPU counters specified by idx.
|
||||
*/
|
||||
static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
|
||||
{
|
||||
int cpu;
|
||||
unsigned long sum = 0;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
|
||||
|
||||
sum += READ_ONCE(cpuc->srcu_unlock_count[idx]);
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return true if the number of pre-existing readers is determined to
|
||||
* be zero.
|
||||
*/
|
||||
static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
|
||||
{
|
||||
unsigned long unlocks;
|
||||
|
||||
unlocks = srcu_readers_unlock_idx(sp, idx);
|
||||
|
||||
/*
|
||||
* Make sure that a lock is always counted if the corresponding
|
||||
* unlock is counted. Needs to be a smp_mb() as the read side may
|
||||
* contain a read from a variable that is written to before the
|
||||
* synchronize_srcu() in the write side. In this case smp_mb()s
|
||||
* A and B act like the store buffering pattern.
|
||||
*
|
||||
* This smp_mb() also pairs with smp_mb() C to prevent accesses
|
||||
* after the synchronize_srcu() from being executed before the
|
||||
* grace period ends.
|
||||
*/
|
||||
smp_mb(); /* A */
|
||||
|
||||
/*
|
||||
* If the locks are the same as the unlocks, then there must have
|
||||
* been no readers on this index at some time in between. This does
|
||||
* not mean that there are no more readers, as one could have read
|
||||
* the current index but not have incremented the lock counter yet.
|
||||
*
|
||||
* Possible bug: There is no guarantee that there haven't been
|
||||
* ULONG_MAX increments of ->srcu_lock_count[] since the unlocks were
|
||||
* counted, meaning that this could return true even if there are
|
||||
* still active readers. Since there are no memory barriers around
|
||||
* srcu_flip(), the CPU is not required to increment ->srcu_idx
|
||||
* before running srcu_readers_unlock_idx(), which means that there
|
||||
* could be an arbitrarily large number of critical sections that
|
||||
* execute after srcu_readers_unlock_idx() but use the old value
|
||||
* of ->srcu_idx.
|
||||
*/
|
||||
return srcu_readers_lock_idx(sp, idx) == unlocks;
|
||||
}
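To make the per-CPU counting concrete, a small user-space sketch (illustration only) of the comparison described above: sum the unlock counters and the lock counters for one index; while a reader is still inside its critical section the sums differ, so the index has not drained.

#include <stdio.h>

#define NCPU 4

static unsigned long lock_cnt[NCPU][2];
static unsigned long unlock_cnt[NCPU][2];

static unsigned long sum(unsigned long cnt[NCPU][2], int idx)
{
	unsigned long s = 0;
	int cpu;

	for (cpu = 0; cpu < NCPU; cpu++)
		s += cnt[cpu][idx];
	return s;
}

int main(void)
{
	int idx = 0;

	lock_cnt[0][idx]++;     /* reader A enters on "CPU" 0 ...   */
	unlock_cnt[3][idx]++;   /* ... and exits on "CPU" 3         */
	lock_cnt[1][idx]++;     /* reader B enters and stays inside */

	printf("index %d drained: %s\n", idx,
	       sum(lock_cnt, idx) == sum(unlock_cnt, idx) ? "yes" : "no");
	return 0;
}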
|
||||
|
||||
/**
|
||||
* srcu_readers_active - returns true if there are readers, and false
|
||||
* otherwise
|
||||
* @sp: which srcu_struct to count active readers (holding srcu_read_lock).
|
||||
*
|
||||
* Note that this is not an atomic primitive, and can therefore suffer
|
||||
* severe errors when invoked on an active srcu_struct. That said, it
|
||||
* can be useful as an error check at cleanup time.
|
||||
*/
|
||||
static bool srcu_readers_active(struct srcu_struct *sp)
|
||||
{
|
||||
int cpu;
|
||||
unsigned long sum = 0;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
|
||||
|
||||
sum += READ_ONCE(cpuc->srcu_lock_count[0]);
|
||||
sum += READ_ONCE(cpuc->srcu_lock_count[1]);
|
||||
sum -= READ_ONCE(cpuc->srcu_unlock_count[0]);
|
||||
sum -= READ_ONCE(cpuc->srcu_unlock_count[1]);
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
#define SRCU_INTERVAL 1
|
||||
|
||||
/**
|
||||
* cleanup_srcu_struct - deconstruct a sleep-RCU structure
|
||||
* @sp: structure to clean up.
|
||||
*
|
||||
* Must invoke this after you are finished using a given srcu_struct that
|
||||
* was initialized via init_srcu_struct(), else you leak memory.
|
||||
*/
|
||||
void cleanup_srcu_struct(struct srcu_struct *sp)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
WARN_ON_ONCE(atomic_read(&sp->srcu_exp_cnt));
|
||||
if (WARN_ON(srcu_readers_active(sp)))
|
||||
return; /* Leakage unless caller handles error. */
|
||||
flush_delayed_work(&sp->work);
|
||||
for_each_possible_cpu(cpu)
|
||||
flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
|
||||
if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
|
||||
WARN_ON(srcu_readers_active(sp))) {
|
||||
pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
|
||||
return; /* Caller forgot to stop doing call_srcu()? */
|
||||
}
|
||||
free_percpu(sp->sda);
|
||||
sp->sda = NULL;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
|
||||
|
||||
/*
|
||||
* Counts the new reader in the appropriate per-CPU element of the
|
||||
* srcu_struct. Must be called from process context.
|
||||
* Returns an index that must be passed to the matching srcu_read_unlock().
|
||||
*/
|
||||
int __srcu_read_lock(struct srcu_struct *sp)
|
||||
{
|
||||
int idx;
|
||||
|
||||
idx = READ_ONCE(sp->srcu_idx) & 0x1;
|
||||
__this_cpu_inc(sp->sda->srcu_lock_count[idx]);
|
||||
smp_mb(); /* B */ /* Avoid leaking the critical section. */
|
||||
return idx;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__srcu_read_lock);
|
||||
|
||||
/*
|
||||
* Removes the count for the old reader from the appropriate per-CPU
|
||||
* element of the srcu_struct. Note that this may well be a different
|
||||
* CPU from the one whose counter was incremented by the corresponding srcu_read_lock().
|
||||
* Must be called from process context.
|
||||
*/
|
||||
void __srcu_read_unlock(struct srcu_struct *sp, int idx)
|
||||
{
|
||||
smp_mb(); /* C */ /* Avoid leaking the critical section. */
|
||||
this_cpu_inc(sp->sda->srcu_unlock_count[idx]);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
|
||||
|
||||
/*
|
||||
* We use an adaptive strategy for synchronize_srcu() and especially for
|
||||
* synchronize_srcu_expedited(). We spin for a fixed time period
|
||||
* (defined below) to allow SRCU readers to exit their read-side critical
|
||||
* sections. If there are still some readers after a few microseconds,
|
||||
* we repeatedly block for 1-millisecond time periods.
|
||||
*/
|
||||
#define SRCU_RETRY_CHECK_DELAY 5
|
||||
|
||||
/*
|
||||
* Start an SRCU grace period.
|
||||
*/
|
||||
static void srcu_gp_start(struct srcu_struct *sp)
|
||||
{
|
||||
struct srcu_data *sdp = this_cpu_ptr(sp->sda);
|
||||
int state;
|
||||
|
||||
RCU_LOCKDEP_WARN(!lockdep_is_held(&sp->gp_lock),
|
||||
"Invoked srcu_gp_start() without ->gp_lock!");
|
||||
WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
|
||||
rcu_segcblist_advance(&sdp->srcu_cblist,
|
||||
rcu_seq_current(&sp->srcu_gp_seq));
|
||||
(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
|
||||
rcu_seq_snap(&sp->srcu_gp_seq));
|
||||
rcu_seq_start(&sp->srcu_gp_seq);
|
||||
state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
|
||||
WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Track online CPUs to guide callback workqueue placement.
|
||||
*/
|
||||
DEFINE_PER_CPU(bool, srcu_online);
|
||||
|
||||
void srcu_online_cpu(unsigned int cpu)
|
||||
{
|
||||
WRITE_ONCE(per_cpu(srcu_online, cpu), true);
|
||||
}
|
||||
|
||||
void srcu_offline_cpu(unsigned int cpu)
|
||||
{
|
||||
WRITE_ONCE(per_cpu(srcu_online, cpu), false);
|
||||
}
|
||||
|
||||
/*
|
||||
* Place the workqueue handler on the specified CPU if online, otherwise
|
||||
* just run it wherever. This is useful for placing workqueue handlers
|
||||
* that are to invoke the specified CPU's callbacks.
|
||||
*/
|
||||
static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
|
||||
struct delayed_work *dwork,
|
||||
unsigned long delay)
|
||||
{
|
||||
bool ret;
|
||||
|
||||
preempt_disable();
|
||||
if (READ_ONCE(per_cpu(srcu_online, cpu)))
|
||||
ret = queue_delayed_work_on(cpu, wq, dwork, delay);
|
||||
else
|
||||
ret = queue_delayed_work(wq, dwork, delay);
|
||||
preempt_enable();
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Schedule callback invocation for the specified srcu_data structure,
|
||||
* if possible, on the corresponding CPU.
|
||||
*/
|
||||
static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay)
|
||||
{
|
||||
srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq,
|
||||
&sdp->work, delay);
|
||||
}
|
||||
|
||||
/*
|
||||
* Schedule callback invocation for all srcu_data structures associated
|
||||
* with the specified srcu_node structure, if possible, on the corresponding
|
||||
* CPUs.
|
||||
*/
|
||||
static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
for (cpu = snp->grplo; cpu <= snp->grphi; cpu++)
|
||||
srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu),
|
||||
atomic_read(&sp->srcu_exp_cnt) ? 0 : SRCU_INTERVAL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Note the end of an SRCU grace period. Initiates callback invocation
|
||||
* and starts a new grace period if needed.
|
||||
*
|
||||
* The ->srcu_cb_mutex acquisition does not protect any data, but
|
||||
* instead prevents more than one grace period from starting while we
|
||||
* are initiating callback invocation. This allows the ->srcu_have_cbs[]
|
||||
* array to have a finite number of elements.
|
||||
*/
|
||||
static void srcu_gp_end(struct srcu_struct *sp)
|
||||
{
|
||||
bool cbs;
|
||||
unsigned long gpseq;
|
||||
int idx;
|
||||
int idxnext;
|
||||
struct srcu_node *snp;
|
||||
|
||||
/* Prevent more than one additional grace period. */
|
||||
mutex_lock(&sp->srcu_cb_mutex);
|
||||
|
||||
/* End the current grace period. */
|
||||
spin_lock_irq(&sp->gp_lock);
|
||||
idx = rcu_seq_state(sp->srcu_gp_seq);
|
||||
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
|
||||
rcu_seq_end(&sp->srcu_gp_seq);
|
||||
gpseq = rcu_seq_current(&sp->srcu_gp_seq);
|
||||
spin_unlock_irq(&sp->gp_lock);
|
||||
mutex_unlock(&sp->srcu_gp_mutex);
|
||||
/* A new grace period can start at this point. But only one. */
|
||||
|
||||
/* Initiate callback invocation as needed. */
|
||||
idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
|
||||
idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs);
|
||||
rcu_for_each_node_breadth_first(sp, snp) {
|
||||
spin_lock_irq(&snp->lock);
|
||||
cbs = false;
|
||||
if (snp >= sp->level[rcu_num_lvls - 1])
|
||||
cbs = snp->srcu_have_cbs[idx] == gpseq;
|
||||
snp->srcu_have_cbs[idx] = gpseq;
|
||||
rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
|
||||
spin_unlock_irq(&snp->lock);
|
||||
if (cbs) {
|
||||
smp_mb(); /* GP end before CB invocation. */
|
||||
srcu_schedule_cbs_snp(sp, snp);
|
||||
}
|
||||
}
|
||||
|
||||
/* Callback initiation done, allow grace periods after next. */
|
||||
mutex_unlock(&sp->srcu_cb_mutex);
|
||||
|
||||
/* Start a new grace period if needed. */
|
||||
spin_lock_irq(&sp->gp_lock);
|
||||
gpseq = rcu_seq_current(&sp->srcu_gp_seq);
|
||||
if (!rcu_seq_state(gpseq) &&
|
||||
ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
|
||||
srcu_gp_start(sp);
|
||||
spin_unlock_irq(&sp->gp_lock);
|
||||
/* Throttle expedited grace periods: Should be rare! */
|
||||
srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) &&
|
||||
rcu_seq_ctr(gpseq) & 0xf
|
||||
? 0
|
||||
: SRCU_INTERVAL);
|
||||
} else {
|
||||
spin_unlock_irq(&sp->gp_lock);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Funnel-locking scheme to scalably mediate many concurrent grace-period
|
||||
* requests. The winner has to do the work of actually starting grace
|
||||
* period s. Losers must either ensure that their desired grace-period
|
||||
* number is recorded on at least their leaf srcu_node structure, or they
|
||||
* must take steps to invoke their own callbacks.
|
||||
*/
|
||||
static void srcu_funnel_gp_start(struct srcu_struct *sp,
|
||||
struct srcu_data *sdp,
|
||||
unsigned long s)
|
||||
{
|
||||
unsigned long flags;
|
||||
int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs);
|
||||
struct srcu_node *snp = sdp->mynode;
|
||||
unsigned long snp_seq;
|
||||
|
||||
/* Each pass through the loop does one level of the srcu_node tree. */
|
||||
for (; snp != NULL; snp = snp->srcu_parent) {
|
||||
if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode)
|
||||
return; /* GP already done and CBs recorded. */
|
||||
spin_lock_irqsave(&snp->lock, flags);
|
||||
if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
|
||||
snp_seq = snp->srcu_have_cbs[idx];
|
||||
spin_unlock_irqrestore(&snp->lock, flags);
|
||||
if (snp == sdp->mynode && snp_seq != s) {
|
||||
smp_mb(); /* CBs after GP! */
|
||||
srcu_schedule_cbs_sdp(sdp, 0);
|
||||
}
|
||||
return;
|
||||
}
|
||||
snp->srcu_have_cbs[idx] = s;
|
||||
spin_unlock_irqrestore(&snp->lock, flags);
|
||||
}
|
||||
|
||||
/* Top of tree, must ensure the grace period will be started. */
|
||||
spin_lock_irqsave(&sp->gp_lock, flags);
|
||||
if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) {
|
||||
/*
|
||||
* Record need for grace period s. Pair with load
|
||||
* acquire setting up for initialization.
|
||||
*/
|
||||
smp_store_release(&sp->srcu_gp_seq_needed, s); /*^^^*/
|
||||
}
|
||||
|
||||
/* If grace period not already done and none in progress, start it. */
|
||||
if (!rcu_seq_done(&sp->srcu_gp_seq, s) &&
|
||||
rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) {
|
||||
WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
|
||||
srcu_gp_start(sp);
|
||||
queue_delayed_work(system_power_efficient_wq, &sp->work,
|
||||
atomic_read(&sp->srcu_exp_cnt)
|
||||
? 0
|
||||
: SRCU_INTERVAL);
|
||||
}
|
||||
spin_unlock_irqrestore(&sp->gp_lock, flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* Wait until all readers counted by array index idx complete, but
|
||||
* loop an additional time if there is an expedited grace period pending.
|
||||
* The caller must ensure that ->srcu_idx is not changed while checking.
|
||||
*/
|
||||
static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
|
||||
{
|
||||
for (;;) {
|
||||
if (srcu_readers_active_idx_check(sp, idx))
|
||||
return true;
|
||||
if (--trycount + !!atomic_read(&sp->srcu_exp_cnt) <= 0)
|
||||
return false;
|
||||
udelay(SRCU_RETRY_CHECK_DELAY);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Increment the ->srcu_idx counter so that future SRCU readers will
|
||||
* use the other rank of the ->srcu_(un)lock_count[] arrays. This allows
|
||||
* us to wait for pre-existing readers in a starvation-free manner.
|
||||
*/
|
||||
static void srcu_flip(struct srcu_struct *sp)
|
||||
{
|
||||
WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1);
|
||||
|
||||
/*
|
||||
* Ensure that if the updater misses an __srcu_read_unlock()
|
||||
* increment, that task's next __srcu_read_lock() will see the
|
||||
* above counter update. Note that both this memory barrier
|
||||
* and the one in srcu_readers_active_idx_check() provide the
|
||||
* guarantee for __srcu_read_lock().
|
||||
*/
|
||||
smp_mb(); /* D */ /* Pairs with C. */
|
||||
}
|
||||
|
||||
/*
|
||||
* Enqueue an SRCU callback on the srcu_data structure associated with
|
||||
* the current CPU and the specified srcu_struct structure, initiating
|
||||
* grace-period processing if it is not already running.
|
||||
*
|
||||
* Note that all CPUs must agree that the grace period extended beyond
|
||||
* all pre-existing SRCU read-side critical sections. On systems with
|
||||
* more than one CPU, this means that when "func()" is invoked, each CPU
|
||||
* is guaranteed to have executed a full memory barrier since the end of
|
||||
* its last corresponding SRCU read-side critical section whose beginning
|
||||
* preceded the call to call_srcu(). It also means that each CPU executing
|
||||
* an SRCU read-side critical section that continues beyond the start of
|
||||
* "func()" must have executed a memory barrier after the call_rcu()
|
||||
* but before the beginning of that SRCU read-side critical section.
|
||||
* Note that these guarantees include CPUs that are offline, idle, or
|
||||
* executing in user mode, as well as CPUs that are executing in the kernel.
|
||||
*
|
||||
* Furthermore, if CPU A invoked call_srcu() and CPU B invoked the
|
||||
* resulting SRCU callback function "func()", then both CPU A and CPU
|
||||
* B are guaranteed to execute a full memory barrier during the time
|
||||
* interval between the call to call_srcu() and the invocation of "func()".
|
||||
* This guarantee applies even if CPU A and CPU B are the same CPU (but
|
||||
* again only if the system has more than one CPU).
|
||||
*
|
||||
* Of course, these guarantees apply only for invocations of call_srcu(),
|
||||
* srcu_read_lock(), and srcu_read_unlock() that are all passed the same
|
||||
* srcu_struct structure.
|
||||
*/
|
||||
void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
|
||||
rcu_callback_t func)
|
||||
{
|
||||
unsigned long flags;
|
||||
bool needgp = false;
|
||||
unsigned long s;
|
||||
struct srcu_data *sdp;
|
||||
|
||||
check_init_srcu_struct(sp);
|
||||
rhp->func = func;
|
||||
local_irq_save(flags);
|
||||
sdp = this_cpu_ptr(sp->sda);
|
||||
spin_lock(&sdp->lock);
|
||||
rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
|
||||
rcu_segcblist_advance(&sdp->srcu_cblist,
|
||||
rcu_seq_current(&sp->srcu_gp_seq));
|
||||
s = rcu_seq_snap(&sp->srcu_gp_seq);
|
||||
(void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
|
||||
if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
|
||||
sdp->srcu_gp_seq_needed = s;
|
||||
needgp = true;
|
||||
}
|
||||
spin_unlock_irqrestore(&sdp->lock, flags);
|
||||
if (needgp)
|
||||
srcu_funnel_gp_start(sp, sdp, s);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(call_srcu);
|
||||
|
||||
/*
|
||||
* Helper function for synchronize_srcu() and synchronize_srcu_expedited().
|
||||
*/
|
||||
static void __synchronize_srcu(struct srcu_struct *sp)
|
||||
{
|
||||
struct rcu_synchronize rcu;
|
||||
|
||||
RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) ||
|
||||
lock_is_held(&rcu_bh_lock_map) ||
|
||||
lock_is_held(&rcu_lock_map) ||
|
||||
lock_is_held(&rcu_sched_lock_map),
|
||||
"Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
|
||||
|
||||
if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
|
||||
return;
|
||||
might_sleep();
|
||||
check_init_srcu_struct(sp);
|
||||
init_completion(&rcu.completion);
|
||||
init_rcu_head_on_stack(&rcu.head);
|
||||
call_srcu(sp, &rcu.head, wakeme_after_rcu);
|
||||
wait_for_completion(&rcu.completion);
|
||||
destroy_rcu_head_on_stack(&rcu.head);
|
||||
}
|

/**
 * synchronize_srcu_expedited - Brute-force SRCU grace period
 * @sp: srcu_struct with which to synchronize.
 *
 * Wait for an SRCU grace period to elapse, but be more aggressive about
 * spinning rather than blocking when waiting.
 *
 * Note that synchronize_srcu_expedited() has the same deadlock and
 * memory-ordering properties as does synchronize_srcu().
 */
void synchronize_srcu_expedited(struct srcu_struct *sp)
{
        bool do_norm = rcu_gp_is_normal();

        check_init_srcu_struct(sp);
        if (!do_norm) {
                atomic_inc(&sp->srcu_exp_cnt);
                smp_mb__after_atomic(); /* increment before GP. */
        }
        __synchronize_srcu(sp);
        if (!do_norm) {
                smp_mb__before_atomic(); /* GP before decrement. */
                WARN_ON_ONCE(atomic_dec_return(&sp->srcu_exp_cnt) < 0);
        }
}
EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);

/**
 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
 * @sp: srcu_struct with which to synchronize.
 *
 * Wait for the count to drain to zero of both indexes. To avoid the
 * possible starvation of synchronize_srcu(), it waits for the count of
 * the index=((->srcu_idx & 1) ^ 1) to drain to zero at first,
 * and then flip the srcu_idx and wait for the count of the other index.
 *
 * Can block; must be called from process context.
 *
 * Note that it is illegal to call synchronize_srcu() from the corresponding
 * SRCU read-side critical section; doing so will result in deadlock.
 * However, it is perfectly legal to call synchronize_srcu() on one
 * srcu_struct from some other srcu_struct's read-side critical section,
 * as long as the resulting graph of srcu_structs is acyclic.
 *
 * There are memory-ordering constraints implied by synchronize_srcu().
 * On systems with more than one CPU, when synchronize_srcu() returns,
 * each CPU is guaranteed to have executed a full memory barrier since
 * the end of its last corresponding SRCU-sched read-side critical section
 * whose beginning preceded the call to synchronize_srcu(). In addition,
 * each CPU having an SRCU read-side critical section that extends beyond
 * the return from synchronize_srcu() is guaranteed to have executed a
 * full memory barrier after the beginning of synchronize_srcu() and before
 * the beginning of that SRCU read-side critical section. Note that these
 * guarantees include CPUs that are offline, idle, or executing in user mode,
 * as well as CPUs that are executing in the kernel.
 *
 * Furthermore, if CPU A invoked synchronize_srcu(), which returned
 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
 * to have executed a full memory barrier during the execution of
 * synchronize_srcu(). This guarantee applies even if CPU A and CPU B
 * are the same CPU, but again only if the system has more than one CPU.
 *
 * Of course, these memory-ordering guarantees apply only when
 * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
 * passed the same srcu_struct structure.
 */
void synchronize_srcu(struct srcu_struct *sp)
{
        if (rcu_gp_is_expedited())
                synchronize_srcu_expedited(sp);
        else
                __synchronize_srcu(sp);
}
EXPORT_SYMBOL_GPL(synchronize_srcu);
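
A sketch of the reader/updater pairing that the comment above describes. All names here (my_srcu, struct my_config, cur_cfg, my_read_value(), my_install_config()) are hypothetical, and the sketch assumes the caller serializes updates; it is an illustration, not code from this patch.

#include <linux/srcu.h>
#include <linux/slab.h>

DEFINE_SRCU(my_srcu);

struct my_config {
        int value;
};

static struct my_config __rcu *cur_cfg;

static int my_read_value(void)
{
        struct my_config *p;
        int idx, val;

        idx = srcu_read_lock(&my_srcu);
        p = srcu_dereference(cur_cfg, &my_srcu);
        val = p ? p->value : -1;        /* tolerate a not-yet-installed config */
        srcu_read_unlock(&my_srcu, idx);
        return val;
}

static void my_install_config(struct my_config *newp)
{
        /* Assumes updates are serialized by the caller. */
        struct my_config *old = rcu_dereference_protected(cur_cfg, 1);

        rcu_assign_pointer(cur_cfg, newp);
        synchronize_srcu(&my_srcu);     /* wait out pre-existing readers */
        kfree(old);
}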

/*
 * Callback function for srcu_barrier() use.
 */
static void srcu_barrier_cb(struct rcu_head *rhp)
{
        struct srcu_data *sdp;
        struct srcu_struct *sp;

        sdp = container_of(rhp, struct srcu_data, srcu_barrier_head);
        sp = sdp->sp;
        if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt))
                complete(&sp->srcu_barrier_completion);
}

/**
 * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
 * @sp: srcu_struct on which to wait for in-flight callbacks.
 */
void srcu_barrier(struct srcu_struct *sp)
{
        int cpu;
        struct srcu_data *sdp;
        unsigned long s = rcu_seq_snap(&sp->srcu_barrier_seq);

        check_init_srcu_struct(sp);
        mutex_lock(&sp->srcu_barrier_mutex);
        if (rcu_seq_done(&sp->srcu_barrier_seq, s)) {
                smp_mb(); /* Force ordering following return. */
                mutex_unlock(&sp->srcu_barrier_mutex);
                return; /* Someone else did our work for us. */
        }
        rcu_seq_start(&sp->srcu_barrier_seq);
        init_completion(&sp->srcu_barrier_completion);

        /* Initial count prevents reaching zero until all CBs are posted. */
        atomic_set(&sp->srcu_barrier_cpu_cnt, 1);

        /*
         * Each pass through this loop enqueues a callback, but only
         * on CPUs already having callbacks enqueued. Note that if
         * a CPU already has callbacks enqueue, it must have already
         * registered the need for a future grace period, so all we
         * need do is enqueue a callback that will use the same
         * grace period as the last callback already in the queue.
         */
        for_each_possible_cpu(cpu) {
                sdp = per_cpu_ptr(sp->sda, cpu);
                spin_lock_irq(&sdp->lock);
                atomic_inc(&sp->srcu_barrier_cpu_cnt);
                sdp->srcu_barrier_head.func = srcu_barrier_cb;
                if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
                                           &sdp->srcu_barrier_head, 0))
                        atomic_dec(&sp->srcu_barrier_cpu_cnt);
                spin_unlock_irq(&sdp->lock);
        }

        /* Remove the initial count, at which point reaching zero can happen. */
        if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt))
                complete(&sp->srcu_barrier_completion);
        wait_for_completion(&sp->srcu_barrier_completion);

        rcu_seq_end(&sp->srcu_barrier_seq);
        mutex_unlock(&sp->srcu_barrier_mutex);
}
EXPORT_SYMBOL_GPL(srcu_barrier);
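
A short teardown sketch showing the ordering that srcu_barrier() above exists for, assuming my_srcu was set up with init_srcu_struct() and that callers have already been prevented from posting new callbacks; the names are hypothetical.

static struct srcu_struct my_srcu;      /* init_srcu_struct(&my_srcu) at setup */

static void my_subsystem_exit(void)
{
        /* New call_srcu(&my_srcu, ...) invocations must already be impossible. */
        srcu_barrier(&my_srcu);         /* wait for every queued callback to run */
        cleanup_srcu_struct(&my_srcu);  /* safe only once no callbacks remain */
}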

/**
 * srcu_batches_completed - return batches completed.
 * @sp: srcu_struct on which to report batch completion.
 *
 * Report the number of batches, correlated with, but not necessarily
 * precisely the same as, the number of grace periods that have elapsed.
 */
unsigned long srcu_batches_completed(struct srcu_struct *sp)
{
        return sp->srcu_idx;
}
EXPORT_SYMBOL_GPL(srcu_batches_completed);

/*
 * Core SRCU state machine. Push state bits of ->srcu_gp_seq
 * to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has
 * completed in that state.
 */
static void srcu_advance_state(struct srcu_struct *sp)
{
        int idx;

        mutex_lock(&sp->srcu_gp_mutex);

        /*
         * Because readers might be delayed for an extended period after
         * fetching ->srcu_idx for their index, at any point in time there
         * might well be readers using both idx=0 and idx=1. We therefore
         * need to wait for readers to clear from both index values before
         * invoking a callback.
         *
         * The load-acquire ensures that we see the accesses performed
         * by the prior grace period.
         */
        idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */
        if (idx == SRCU_STATE_IDLE) {
                spin_lock_irq(&sp->gp_lock);
                if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
                        WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq));
                        spin_unlock_irq(&sp->gp_lock);
                        mutex_unlock(&sp->srcu_gp_mutex);
                        return;
                }
                idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
                if (idx == SRCU_STATE_IDLE)
                        srcu_gp_start(sp);
                spin_unlock_irq(&sp->gp_lock);
                if (idx != SRCU_STATE_IDLE) {
                        mutex_unlock(&sp->srcu_gp_mutex);
                        return; /* Someone else started the grace period. */
                }
        }

        if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
                idx = 1 ^ (sp->srcu_idx & 1);
                if (!try_check_zero(sp, idx, 1)) {
                        mutex_unlock(&sp->srcu_gp_mutex);
                        return; /* readers present, retry later. */
                }
                srcu_flip(sp);
                rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2);
        }

        if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) {

                /*
                 * SRCU read-side critical sections are normally short,
                 * so check at least twice in quick succession after a flip.
                 */
                idx = 1 ^ (sp->srcu_idx & 1);
                if (!try_check_zero(sp, idx, 2)) {
                        mutex_unlock(&sp->srcu_gp_mutex);
                        return; /* readers present, retry later. */
                }
                srcu_gp_end(sp); /* Releases ->srcu_gp_mutex. */
        }
}

/*
 * Invoke a limited number of SRCU callbacks that have passed through
 * their grace period. If there are more to do, SRCU will reschedule
 * the workqueue. Note that needed memory barriers have been executed
 * in this task's context by srcu_readers_active_idx_check().
 */
static void srcu_invoke_callbacks(struct work_struct *work)
{
        bool more;
        struct rcu_cblist ready_cbs;
        struct rcu_head *rhp;
        struct srcu_data *sdp;
        struct srcu_struct *sp;

        sdp = container_of(work, struct srcu_data, work.work);
        sp = sdp->sp;
        rcu_cblist_init(&ready_cbs);
        spin_lock_irq(&sdp->lock);
        smp_mb(); /* Old grace periods before callback invocation! */
        rcu_segcblist_advance(&sdp->srcu_cblist,
                              rcu_seq_current(&sp->srcu_gp_seq));
        if (sdp->srcu_cblist_invoking ||
            !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
                spin_unlock_irq(&sdp->lock);
                return; /* Someone else on the job or nothing to do. */
        }

        /* We are on the job! Extract and invoke ready callbacks. */
        sdp->srcu_cblist_invoking = true;
        rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
        spin_unlock_irq(&sdp->lock);
        rhp = rcu_cblist_dequeue(&ready_cbs);
        for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
                local_bh_disable();
                rhp->func(rhp);
                local_bh_enable();
        }

        /*
         * Update counts, accelerate new callbacks, and if needed,
         * schedule another round of callback invocation.
         */
        spin_lock_irq(&sdp->lock);
        rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
        (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
                                       rcu_seq_snap(&sp->srcu_gp_seq));
        sdp->srcu_cblist_invoking = false;
        more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
        spin_unlock_irq(&sdp->lock);
        if (more)
                srcu_schedule_cbs_sdp(sdp, 0);
}

/*
 * Finished one round of SRCU grace period. Start another if there are
 * more SRCU callbacks queued, otherwise put SRCU into not-running state.
 */
static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
{
        bool pushgp = true;

        spin_lock_irq(&sp->gp_lock);
        if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
                if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) {
                        /* All requests fulfilled, time to go idle. */
                        pushgp = false;
                }
        } else if (!rcu_seq_state(sp->srcu_gp_seq)) {
                /* Outstanding request and no GP. Start one. */
                srcu_gp_start(sp);
        }
        spin_unlock_irq(&sp->gp_lock);

        if (pushgp)
                queue_delayed_work(system_power_efficient_wq, &sp->work, delay);
}

/*
 * This is the work-queue function that handles SRCU grace periods.
 */
void process_srcu(struct work_struct *work)
{
        struct srcu_struct *sp;

        sp = container_of(work, struct srcu_struct, work.work);

        srcu_advance_state(sp);
        srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) ? 0 : SRCU_INTERVAL);
}
EXPORT_SYMBOL_GPL(process_srcu);
@ -79,7 +79,7 @@ EXPORT_SYMBOL(__rcu_is_watching);
|
||||
*/
|
||||
static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
|
||||
{
|
||||
RCU_TRACE(reset_cpu_stall_ticks(rcp));
|
||||
RCU_TRACE(reset_cpu_stall_ticks(rcp);)
|
||||
if (rcp->donetail != rcp->curtail) {
|
||||
rcp->donetail = rcp->curtail;
|
||||
return 1;
|
||||
@ -125,7 +125,7 @@ void rcu_bh_qs(void)
|
||||
*/
|
||||
void rcu_check_callbacks(int user)
|
||||
{
|
||||
RCU_TRACE(check_cpu_stalls());
|
||||
RCU_TRACE(check_cpu_stalls();)
|
||||
if (user)
|
||||
rcu_sched_qs();
|
||||
else if (!in_softirq())
|
||||
@ -143,7 +143,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
|
||||
const char *rn = NULL;
|
||||
struct rcu_head *next, *list;
|
||||
unsigned long flags;
|
||||
RCU_TRACE(int cb_count = 0);
|
||||
RCU_TRACE(int cb_count = 0;)
|
||||
|
||||
/* Move the ready-to-invoke callbacks to a local list. */
|
||||
local_irq_save(flags);
|
||||
@ -152,7 +152,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
|
||||
local_irq_restore(flags);
|
||||
return;
|
||||
}
|
||||
RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1));
|
||||
RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1);)
|
||||
list = rcp->rcucblist;
|
||||
rcp->rcucblist = *rcp->donetail;
|
||||
*rcp->donetail = NULL;
|
||||
@ -162,7 +162,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
|
||||
local_irq_restore(flags);
|
||||
|
||||
/* Invoke the callbacks on the local list. */
|
||||
RCU_TRACE(rn = rcp->name);
|
||||
RCU_TRACE(rn = rcp->name;)
|
||||
while (list) {
|
||||
next = list->next;
|
||||
prefetch(next);
|
||||
@ -171,9 +171,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
|
||||
__rcu_reclaim(rn, list);
|
||||
local_bh_enable();
|
||||
list = next;
|
||||
RCU_TRACE(cb_count++);
|
||||
RCU_TRACE(cb_count++;)
|
||||
}
|
||||
RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
|
||||
RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);)
|
||||
RCU_TRACE(trace_rcu_batch_end(rcp->name,
|
||||
cb_count, 0, need_resched(),
|
||||
is_idle_task(current),
|
||||
@ -221,7 +221,7 @@ static void __call_rcu(struct rcu_head *head,
|
||||
local_irq_save(flags);
|
||||
*rcp->curtail = head;
|
||||
rcp->curtail = &head->next;
|
||||
RCU_TRACE(rcp->qlen++);
|
||||
RCU_TRACE(rcp->qlen++;)
|
||||
local_irq_restore(flags);
|
||||
|
||||
if (unlikely(is_idle_task(current))) {
|
||||
@ -254,8 +254,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
|
||||
void __init rcu_init(void)
|
||||
{
|
||||
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
|
||||
RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk));
|
||||
RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk));
|
||||
RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk);)
|
||||
RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk);)
|
||||
|
||||
rcu_early_boot_tests();
|
||||
}
|
||||
|
@ -52,7 +52,7 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
|
||||
RCU_TRACE(.name = "rcu_bh")
|
||||
};
|
||||
|
||||
#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
||||
#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU)
|
||||
#include <linux/kernel_stat.h>
|
||||
|
||||
int rcu_scheduler_active __read_mostly;
|
||||
@ -65,15 +65,16 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
|
||||
* to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage.
|
||||
* The reason for this is that Tiny RCU does not need kthreads, so does
|
||||
* not have to care about the fact that the scheduler is half-initialized
|
||||
* at a certain phase of the boot process.
|
||||
* at a certain phase of the boot process. Unless SRCU is in the mix.
|
||||
*/
|
||||
void __init rcu_scheduler_starting(void)
|
||||
{
|
||||
WARN_ON(nr_context_switches() > 0);
|
||||
rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
|
||||
rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU)
|
||||
? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING;
|
||||
}
|
||||
|
||||
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
|
||||
#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
|
||||
|
||||
#ifdef CONFIG_RCU_TRACE
|
||||
|
||||
@ -162,8 +163,8 @@ static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
|
||||
|
||||
static void check_cpu_stalls(void)
|
||||
{
|
||||
RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
|
||||
RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
|
||||
RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk);)
|
||||
RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk);)
|
||||
}
|
||||
|
||||
#endif /* #ifdef CONFIG_RCU_TRACE */
|
||||
|
File diff suppressed because it is too large
@ -30,80 +30,8 @@
|
||||
#include <linux/seqlock.h>
|
||||
#include <linux/swait.h>
|
||||
#include <linux/stop_machine.h>
|
||||
|
||||
/*
|
||||
* Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
|
||||
* CONFIG_RCU_FANOUT_LEAF.
|
||||
* In theory, it should be possible to add more levels straightforwardly.
|
||||
* In practice, this did work well going from three levels to four.
|
||||
* Of course, your mileage may vary.
|
||||
*/
|
||||
|
||||
#ifdef CONFIG_RCU_FANOUT
|
||||
#define RCU_FANOUT CONFIG_RCU_FANOUT
|
||||
#else /* #ifdef CONFIG_RCU_FANOUT */
|
||||
# ifdef CONFIG_64BIT
|
||||
# define RCU_FANOUT 64
|
||||
# else
|
||||
# define RCU_FANOUT 32
|
||||
# endif
|
||||
#endif /* #else #ifdef CONFIG_RCU_FANOUT */
|
||||
|
||||
#ifdef CONFIG_RCU_FANOUT_LEAF
|
||||
#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF
|
||||
#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */
|
||||
# ifdef CONFIG_64BIT
|
||||
# define RCU_FANOUT_LEAF 64
|
||||
# else
|
||||
# define RCU_FANOUT_LEAF 32
|
||||
# endif
|
||||
#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */
|
||||
|
||||
#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
|
||||
#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT)
|
||||
#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT)
|
||||
#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT)
|
||||
|
||||
#if NR_CPUS <= RCU_FANOUT_1
|
||||
# define RCU_NUM_LVLS 1
|
||||
# define NUM_RCU_LVL_0 1
|
||||
# define NUM_RCU_NODES NUM_RCU_LVL_0
|
||||
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 }
|
||||
# define RCU_NODE_NAME_INIT { "rcu_node_0" }
|
||||
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
|
||||
#elif NR_CPUS <= RCU_FANOUT_2
|
||||
# define RCU_NUM_LVLS 2
|
||||
# define NUM_RCU_LVL_0 1
|
||||
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
|
||||
# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1)
|
||||
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
|
||||
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
|
||||
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
|
||||
#elif NR_CPUS <= RCU_FANOUT_3
|
||||
# define RCU_NUM_LVLS 3
|
||||
# define NUM_RCU_LVL_0 1
|
||||
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
|
||||
# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
|
||||
# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2)
|
||||
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
|
||||
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
|
||||
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
|
||||
#elif NR_CPUS <= RCU_FANOUT_4
|
||||
# define RCU_NUM_LVLS 4
|
||||
# define NUM_RCU_LVL_0 1
|
||||
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
|
||||
# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
|
||||
# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
|
||||
# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
|
||||
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
|
||||
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
|
||||
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
|
||||
#else
|
||||
# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
|
||||
#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
|
||||
|
||||
extern int rcu_num_lvls;
|
||||
extern int rcu_num_nodes;
|
||||
#include <linux/rcu_segcblist.h>
|
||||
#include <linux/rcu_node_tree.h>
|
||||
|
||||
/*
|
||||
* Dynticks per-CPU state.
|
||||
@ -113,6 +41,9 @@ struct rcu_dynticks {
|
||||
/* Process level is worth LLONG_MAX/2. */
|
||||
int dynticks_nmi_nesting; /* Track NMI nesting level. */
|
||||
atomic_t dynticks; /* Even value for idle, else odd. */
|
||||
bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */
|
||||
unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */
|
||||
bool rcu_urgent_qs; /* GP old need light quiescent state. */
|
||||
#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
|
||||
long long dynticks_idle_nesting;
|
||||
/* irq/process nesting level from idle. */
|
||||
@ -261,41 +192,6 @@ struct rcu_node {
|
||||
*/
|
||||
#define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo))
|
||||
|
||||
/*
|
||||
* Do a full breadth-first scan of the rcu_node structures for the
|
||||
* specified rcu_state structure.
|
||||
*/
|
||||
#define rcu_for_each_node_breadth_first(rsp, rnp) \
|
||||
for ((rnp) = &(rsp)->node[0]; \
|
||||
(rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
|
||||
|
||||
/*
|
||||
* Do a breadth-first scan of the non-leaf rcu_node structures for the
|
||||
* specified rcu_state structure. Note that if there is a singleton
|
||||
* rcu_node tree with but one rcu_node structure, this loop is a no-op.
|
||||
*/
|
||||
#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
|
||||
for ((rnp) = &(rsp)->node[0]; \
|
||||
(rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
|
||||
|
||||
/*
|
||||
* Scan the leaves of the rcu_node hierarchy for the specified rcu_state
|
||||
* structure. Note that if there is a singleton rcu_node tree with but
|
||||
* one rcu_node structure, this loop -will- visit the rcu_node structure.
|
||||
* It is still a leaf node, even if it is also the root node.
|
||||
*/
|
||||
#define rcu_for_each_leaf_node(rsp, rnp) \
|
||||
for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
|
||||
(rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
|
||||
|
||||
/*
|
||||
* Iterate over all possible CPUs in a leaf RCU node.
|
||||
*/
|
||||
#define for_each_leaf_node_possible_cpu(rnp, cpu) \
|
||||
for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \
|
||||
cpu <= rnp->grphi; \
|
||||
cpu = cpumask_next((cpu), cpu_possible_mask))
|
||||
|
||||
/*
|
||||
* Union to allow "aggregate OR" operation on the need for a quiescent
|
||||
* state by the normal and expedited grace periods.
|
||||
@ -336,34 +232,9 @@ struct rcu_data {
|
||||
/* period it is aware of. */
|
||||
|
||||
/* 2) batch handling */
|
||||
/*
|
||||
* If nxtlist is not NULL, it is partitioned as follows.
|
||||
* Any of the partitions might be empty, in which case the
|
||||
* pointer to that partition will be equal to the pointer for
|
||||
* the following partition. When the list is empty, all of
|
||||
* the nxttail elements point to the ->nxtlist pointer itself,
|
||||
* which in that case is NULL.
|
||||
*
|
||||
* [nxtlist, *nxttail[RCU_DONE_TAIL]):
|
||||
* Entries that batch # <= ->completed
|
||||
* The grace period for these entries has completed, and
|
||||
* the other grace-period-completed entries may be moved
|
||||
* here temporarily in rcu_process_callbacks().
|
||||
* [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
|
||||
* Entries that batch # <= ->completed - 1: waiting for current GP
|
||||
* [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
|
||||
* Entries known to have arrived before current GP ended
|
||||
* [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]):
|
||||
* Entries that might have arrived after current GP ended
|
||||
* Note that the value of *nxttail[RCU_NEXT_TAIL] will
|
||||
* always be NULL, as this is the end of the list.
|
||||
*/
|
||||
struct rcu_head *nxtlist;
|
||||
struct rcu_head **nxttail[RCU_NEXT_SIZE];
|
||||
unsigned long nxtcompleted[RCU_NEXT_SIZE];
|
||||
/* grace periods for sublists. */
|
||||
long qlen_lazy; /* # of lazy queued callbacks */
|
||||
long qlen; /* # of queued callbacks, incl lazy */
|
||||
struct rcu_segcblist cblist; /* Segmented callback list, with */
|
||||
/* different callbacks waiting for */
|
||||
/* different grace periods. */
|
||||
long qlen_last_fqs_check;
|
||||
/* qlen at last check for QS forcing */
|
||||
unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
|
||||
@ -482,7 +353,6 @@ struct rcu_state {
|
||||
struct rcu_node *level[RCU_NUM_LVLS + 1];
|
||||
/* Hierarchy levels (+1 to */
|
||||
/* shut bogus gcc warning) */
|
||||
u8 flavor_mask; /* bit in flavor mask. */
|
||||
struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
|
||||
call_rcu_func_t call; /* call_rcu() flavor. */
|
||||
int ncpus; /* # CPUs seen so far. */
|
||||
@ -502,14 +372,11 @@ struct rcu_state {
|
||||
|
||||
raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp;
|
||||
/* Protect following fields. */
|
||||
struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */
|
||||
struct rcu_cblist orphan_pend; /* Orphaned callbacks that */
|
||||
/* need a grace period. */
|
||||
struct rcu_head **orphan_nxttail; /* Tail of above. */
|
||||
struct rcu_head *orphan_donelist; /* Orphaned callbacks that */
|
||||
struct rcu_cblist orphan_done; /* Orphaned callbacks that */
|
||||
/* are ready to invoke. */
|
||||
struct rcu_head **orphan_donetail; /* Tail of above. */
|
||||
long qlen_lazy; /* Number of lazy callbacks. */
|
||||
long qlen; /* Total number of callbacks. */
|
||||
/* (Contains counts.) */
|
||||
/* End of fields guarded by orphan_lock. */
|
||||
|
||||
struct mutex barrier_mutex; /* Guards barrier fields. */
|
||||
@ -596,6 +463,7 @@ extern struct rcu_state rcu_preempt_state;
|
||||
#endif /* #ifdef CONFIG_PREEMPT_RCU */
|
||||
|
||||
int rcu_dynticks_snap(struct rcu_dynticks *rdtp);
|
||||
bool rcu_eqs_special_set(int cpu);
|
||||
|
||||
#ifdef CONFIG_RCU_BOOST
|
||||
DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
|
||||
@ -673,6 +541,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
|
||||
static void rcu_dynticks_task_enter(void);
|
||||
static void rcu_dynticks_task_exit(void);
|
||||
|
||||
#ifdef CONFIG_SRCU
|
||||
void srcu_online_cpu(unsigned int cpu);
|
||||
void srcu_offline_cpu(unsigned int cpu);
|
||||
#else /* #ifdef CONFIG_SRCU */
|
||||
void srcu_online_cpu(unsigned int cpu) { }
|
||||
void srcu_offline_cpu(unsigned int cpu) { }
|
||||
#endif /* #else #ifdef CONFIG_SRCU */
|
||||
|
||||
#endif /* #ifndef RCU_TREE_NONCORE */
|
||||
|
||||
#ifdef CONFIG_RCU_TRACE
|
||||
|
@ -292,7 +292,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
|
||||
trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
|
||||
rnp->grplo, rnp->grphi,
|
||||
TPS("wait"));
|
||||
wait_event(rnp->exp_wq[(s >> 1) & 0x3],
|
||||
wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
|
||||
sync_exp_work_done(rsp,
|
||||
&rdp->exp_workdone2, s));
|
||||
return true;
|
||||
@ -331,6 +331,8 @@ static void sync_sched_exp_handler(void *data)
|
||||
return;
|
||||
}
|
||||
__this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
|
||||
/* Store .exp before .rcu_urgent_qs. */
|
||||
smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
|
||||
resched_cpu(smp_processor_id());
|
||||
}
|
||||
|
||||
@ -531,7 +533,8 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
|
||||
rnp->exp_seq_rq = s;
|
||||
spin_unlock(&rnp->exp_lock);
|
||||
}
|
||||
wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]);
|
||||
smp_mb(); /* All above changes before wakeup. */
|
||||
wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rsp->expedited_sequence) & 0x3]);
|
||||
}
|
||||
trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
|
||||
mutex_unlock(&rsp->exp_wake_mutex);
|
||||
@ -609,9 +612,9 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
|
||||
/* Wait for expedited grace period to complete. */
|
||||
rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
|
||||
rnp = rcu_get_root(rsp);
|
||||
wait_event(rnp->exp_wq[(s >> 1) & 0x3],
|
||||
sync_exp_work_done(rsp,
|
||||
&rdp->exp_workdone0, s));
|
||||
wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
|
||||
sync_exp_work_done(rsp, &rdp->exp_workdone0, s));
|
||||
smp_mb(); /* Workqueue actions happen before return. */
|
||||
|
||||
/* Let the next expedited grace period start. */
|
||||
mutex_unlock(&rsp->exp_mutex);
|
||||
@ -735,15 +738,3 @@ void synchronize_rcu_expedited(void)
|
||||
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
|
||||
|
||||
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
|
||||
|
||||
/*
|
||||
* Switch to run-time mode once Tree RCU has fully initialized.
|
||||
*/
|
||||
static int __init rcu_exp_runtime_mode(void)
|
||||
{
|
||||
rcu_test_sync_prims();
|
||||
rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
|
||||
rcu_test_sync_prims();
|
||||
return 0;
|
||||
}
|
||||
core_initcall(rcu_exp_runtime_mode);
|
||||
|
@ -1350,10 +1350,10 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
|
||||
*/
|
||||
if ((rdp->completed != rnp->completed ||
|
||||
unlikely(READ_ONCE(rdp->gpwrap))) &&
|
||||
rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
|
||||
rcu_segcblist_pend_cbs(&rdp->cblist))
|
||||
note_gp_changes(rsp, rdp);
|
||||
|
||||
if (cpu_has_callbacks_ready_to_invoke(rdp))
|
||||
if (rcu_segcblist_ready_cbs(&rdp->cblist))
|
||||
cbs_ready = true;
|
||||
}
|
||||
return cbs_ready;
|
||||
@ -1461,7 +1461,7 @@ static void rcu_prepare_for_idle(void)
|
||||
rdtp->last_accelerate = jiffies;
|
||||
for_each_rcu_flavor(rsp) {
|
||||
rdp = this_cpu_ptr(rsp->rda);
|
||||
if (!*rdp->nxttail[RCU_DONE_TAIL])
|
||||
if (rcu_segcblist_pend_cbs(&rdp->cblist))
|
||||
continue;
|
||||
rnp = rdp->mynode;
|
||||
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
|
||||
@ -1529,7 +1529,7 @@ static void rcu_oom_notify_cpu(void *unused)
|
||||
|
||||
for_each_rcu_flavor(rsp) {
|
||||
rdp = raw_cpu_ptr(rsp->rda);
|
||||
if (rdp->qlen_lazy != 0) {
|
||||
if (rcu_segcblist_n_lazy_cbs(&rdp->cblist)) {
|
||||
atomic_inc(&oom_callback_count);
|
||||
rsp->call(&rdp->oom_head, rcu_oom_callback);
|
||||
}
|
||||
@ -1709,7 +1709,7 @@ __setup("rcu_nocbs=", rcu_nocb_setup);
|
||||
|
||||
static int __init parse_rcu_nocb_poll(char *arg)
|
||||
{
|
||||
rcu_nocb_poll = 1;
|
||||
rcu_nocb_poll = true;
|
||||
return 0;
|
||||
}
|
||||
early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
|
||||
@ -1860,7 +1860,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
|
||||
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
|
||||
TPS("WakeEmpty"));
|
||||
} else {
|
||||
rdp->nocb_defer_wakeup = RCU_NOGP_WAKE;
|
||||
WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE);
|
||||
/* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
|
||||
smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
|
||||
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
|
||||
TPS("WakeEmptyIsDeferred"));
|
||||
}
|
||||
@ -1872,7 +1874,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
|
||||
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
|
||||
TPS("WakeOvf"));
|
||||
} else {
|
||||
rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE;
|
||||
WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_FORCE);
|
||||
/* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
|
||||
smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
|
||||
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
|
||||
TPS("WakeOvfIsDeferred"));
|
||||
}
|
||||
@ -1930,30 +1934,26 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
|
||||
struct rcu_data *rdp,
|
||||
unsigned long flags)
|
||||
{
|
||||
long ql = rsp->qlen;
|
||||
long qll = rsp->qlen_lazy;
|
||||
long ql = rcu_cblist_n_cbs(&rsp->orphan_done);
|
||||
long qll = rcu_cblist_n_lazy_cbs(&rsp->orphan_done);
|
||||
|
||||
/* If this is not a no-CBs CPU, tell the caller to do it the old way. */
|
||||
if (!rcu_is_nocb_cpu(smp_processor_id()))
|
||||
return false;
|
||||
rsp->qlen = 0;
|
||||
rsp->qlen_lazy = 0;
|
||||
|
||||
/* First, enqueue the donelist, if any. This preserves CB ordering. */
|
||||
if (rsp->orphan_donelist != NULL) {
|
||||
__call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist,
|
||||
rsp->orphan_donetail, ql, qll, flags);
|
||||
ql = qll = 0;
|
||||
rsp->orphan_donelist = NULL;
|
||||
rsp->orphan_donetail = &rsp->orphan_donelist;
|
||||
if (!rcu_cblist_empty(&rsp->orphan_done)) {
|
||||
__call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done),
|
||||
rcu_cblist_tail(&rsp->orphan_done),
|
||||
ql, qll, flags);
|
||||
}
|
||||
if (rsp->orphan_nxtlist != NULL) {
|
||||
__call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist,
|
||||
rsp->orphan_nxttail, ql, qll, flags);
|
||||
ql = qll = 0;
|
||||
rsp->orphan_nxtlist = NULL;
|
||||
rsp->orphan_nxttail = &rsp->orphan_nxtlist;
|
||||
if (!rcu_cblist_empty(&rsp->orphan_pend)) {
|
||||
__call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend),
|
||||
rcu_cblist_tail(&rsp->orphan_pend),
|
||||
ql, qll, flags);
|
||||
}
|
||||
rcu_cblist_init(&rsp->orphan_done);
|
||||
rcu_cblist_init(&rsp->orphan_pend);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -2395,16 +2395,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
|
||||
return false;
|
||||
|
||||
/* If there are early-boot callbacks, move them to nocb lists. */
|
||||
if (rdp->nxtlist) {
|
||||
rdp->nocb_head = rdp->nxtlist;
|
||||
rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL];
|
||||
atomic_long_set(&rdp->nocb_q_count, rdp->qlen);
|
||||
atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy);
|
||||
rdp->nxtlist = NULL;
|
||||
rdp->qlen = 0;
|
||||
rdp->qlen_lazy = 0;
|
||||
if (!rcu_segcblist_empty(&rdp->cblist)) {
|
||||
rdp->nocb_head = rcu_segcblist_head(&rdp->cblist);
|
||||
rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist);
|
||||
atomic_long_set(&rdp->nocb_q_count,
|
||||
rcu_segcblist_n_cbs(&rdp->cblist));
|
||||
atomic_long_set(&rdp->nocb_q_count_lazy,
|
||||
rcu_segcblist_n_lazy_cbs(&rdp->cblist));
|
||||
rcu_segcblist_init(&rdp->cblist);
|
||||
}
|
||||
rdp->nxttail[RCU_NEXT_TAIL] = NULL;
|
||||
rcu_segcblist_disable(&rdp->cblist);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -41,11 +41,11 @@
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/debugfs.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/prefetch.h>
|
||||
|
||||
#define RCU_TREE_NONCORE
|
||||
#include "tree.h"
|
||||
|
||||
DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
|
||||
#include "rcu.h"
|
||||
|
||||
static int r_open(struct inode *inode, struct file *file,
|
||||
const struct seq_operations *op)
|
||||
@ -121,7 +121,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
|
||||
cpu_is_offline(rdp->cpu) ? '!' : ' ',
|
||||
ulong2long(rdp->completed), ulong2long(rdp->gpnum),
|
||||
rdp->cpu_no_qs.b.norm,
|
||||
rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu),
|
||||
rdp->rcu_qs_ctr_snap == per_cpu(rdp->dynticks->rcu_qs_ctr, rdp->cpu),
|
||||
rdp->core_needs_qs);
|
||||
seq_printf(m, " dt=%d/%llx/%d df=%lu",
|
||||
rcu_dynticks_snap(rdp->dynticks),
|
||||
@ -130,17 +130,15 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
|
||||
rdp->dynticks_fqs);
|
||||
seq_printf(m, " of=%lu", rdp->offline_fqs);
|
||||
rcu_nocb_q_lengths(rdp, &ql, &qll);
|
||||
qll += rdp->qlen_lazy;
|
||||
ql += rdp->qlen;
|
||||
qll += rcu_segcblist_n_lazy_cbs(&rdp->cblist);
|
||||
ql += rcu_segcblist_n_cbs(&rdp->cblist);
|
||||
seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
|
||||
qll, ql,
|
||||
".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
|
||||
rdp->nxttail[RCU_NEXT_TAIL]],
|
||||
".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
|
||||
rdp->nxttail[RCU_NEXT_READY_TAIL]],
|
||||
".W"[rdp->nxttail[RCU_DONE_TAIL] !=
|
||||
rdp->nxttail[RCU_WAIT_TAIL]],
|
||||
".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
|
||||
".N"[!rcu_segcblist_segempty(&rdp->cblist, RCU_NEXT_TAIL)],
|
||||
".R"[!rcu_segcblist_segempty(&rdp->cblist,
|
||||
RCU_NEXT_READY_TAIL)],
|
||||
".W"[!rcu_segcblist_segempty(&rdp->cblist, RCU_WAIT_TAIL)],
|
||||
".D"[!rcu_segcblist_segempty(&rdp->cblist, RCU_DONE_TAIL)]);
|
||||
#ifdef CONFIG_RCU_BOOST
|
||||
seq_printf(m, " kt=%d/%c ktl=%x",
|
||||
per_cpu(rcu_cpu_has_work, rdp->cpu),
|
||||
@ -278,7 +276,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
|
||||
seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
|
||||
rsp->n_force_qs, rsp->n_force_qs_ngp,
|
||||
rsp->n_force_qs - rsp->n_force_qs_ngp,
|
||||
READ_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
|
||||
READ_ONCE(rsp->n_force_qs_lh),
|
||||
rcu_cblist_n_lazy_cbs(&rsp->orphan_done),
|
||||
rcu_cblist_n_cbs(&rsp->orphan_done));
|
||||
for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
|
||||
if (rnp->level != level) {
|
||||
seq_puts(m, "\n");
|
||||
|
@ -124,7 +124,7 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held);
|
||||
* non-expedited counterparts? Intended for use within RCU. Note
|
||||
* that if the user specifies both rcu_expedited and rcu_normal, then
|
||||
* rcu_normal wins. (Except during the time period during boot from
|
||||
* when the first task is spawned until the rcu_exp_runtime_mode()
|
||||
* when the first task is spawned until the rcu_set_runtime_mode()
|
||||
* core_initcall() is invoked, at which point everything is expedited.)
|
||||
*/
|
||||
bool rcu_gp_is_normal(void)
|
||||
@ -190,6 +190,39 @@ void rcu_end_inkernel_boot(void)
|
||||
|
||||
#endif /* #ifndef CONFIG_TINY_RCU */
|
||||
|
||||
/*
|
||||
* Test each non-SRCU synchronous grace-period wait API. This is
|
||||
* useful just after a change in mode for these primitives, and
|
||||
* during early boot.
|
||||
*/
|
||||
void rcu_test_sync_prims(void)
|
||||
{
|
||||
if (!IS_ENABLED(CONFIG_PROVE_RCU))
|
||||
return;
|
||||
synchronize_rcu();
|
||||
synchronize_rcu_bh();
|
||||
synchronize_sched();
|
||||
synchronize_rcu_expedited();
|
||||
synchronize_rcu_bh_expedited();
|
||||
synchronize_sched_expedited();
|
||||
}
|
||||
|
||||
#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU)
|
||||
|
||||
/*
|
||||
* Switch to run-time mode once RCU has fully initialized.
|
||||
*/
|
||||
static int __init rcu_set_runtime_mode(void)
|
||||
{
|
||||
rcu_test_sync_prims();
|
||||
rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
|
||||
rcu_test_sync_prims();
|
||||
return 0;
|
||||
}
|
||||
core_initcall(rcu_set_runtime_mode);
|
||||
|
||||
#endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */
|
||||
|
||||
#ifdef CONFIG_PREEMPT_RCU
|
||||
|
||||
/*
|
||||
@ -632,6 +665,7 @@ static void check_holdout_task(struct task_struct *t,
|
||||
put_task_struct(t);
|
||||
return;
|
||||
}
|
||||
rcu_request_urgent_qs_task(t);
|
||||
if (!needreport)
|
||||
return;
|
||||
if (*firstreport) {
|
||||
@ -817,23 +851,6 @@ static void rcu_spawn_tasks_kthread(void)
|
||||
|
||||
#endif /* #ifdef CONFIG_TASKS_RCU */
|
||||
|
||||
/*
|
||||
* Test each non-SRCU synchronous grace-period wait API. This is
|
||||
* useful just after a change in mode for these primitives, and
|
||||
* during early boot.
|
||||
*/
|
||||
void rcu_test_sync_prims(void)
|
||||
{
|
||||
if (!IS_ENABLED(CONFIG_PROVE_RCU))
|
||||
return;
|
||||
synchronize_rcu();
|
||||
synchronize_rcu_bh();
|
||||
synchronize_sched();
|
||||
synchronize_rcu_expedited();
|
||||
synchronize_rcu_bh_expedited();
|
||||
synchronize_sched_expedited();
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PROVE_RCU
|
||||
|
||||
/*
|
||||
|
@ -3378,7 +3378,7 @@ static void __sched notrace __schedule(bool preempt)
|
||||
hrtick_clear(rq);
|
||||
|
||||
local_irq_disable();
|
||||
rcu_note_context_switch();
|
||||
rcu_note_context_switch(preempt);
|
||||
|
||||
/*
|
||||
* Make sure that signal_pending_state()->signal_pending() below
|
||||
|
@ -1237,7 +1237,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
|
||||
}
|
||||
/*
|
||||
* This sighand can be already freed and even reused, but
|
||||
* we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which
|
||||
* we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which
|
||||
* initializes ->siglock: this slab can't go away, it has
|
||||
* the same object type, ->siglock can't be reinitialized.
|
||||
*
|
||||
|
@ -413,7 +413,7 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size,
|
||||
*size += sizeof(struct kasan_alloc_meta);
|
||||
|
||||
/* Add free meta. */
|
||||
if (cache->flags & SLAB_DESTROY_BY_RCU || cache->ctor ||
|
||||
if (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor ||
|
||||
cache->object_size < sizeof(struct kasan_free_meta)) {
|
||||
cache->kasan_info.free_meta_offset = *size;
|
||||
*size += sizeof(struct kasan_free_meta);
|
||||
@ -561,7 +561,7 @@ static void kasan_poison_slab_free(struct kmem_cache *cache, void *object)
|
||||
unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
|
||||
|
||||
/* RCU slabs could be legally used after free within the RCU period */
|
||||
if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
|
||||
if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU))
|
||||
return;
|
||||
|
||||
kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
|
||||
@ -572,7 +572,7 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object)
|
||||
s8 shadow_byte;
|
||||
|
||||
/* RCU slabs could be legally used after free within the RCU period */
|
||||
if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
|
||||
if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU))
|
||||
return false;
|
||||
|
||||
shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object));
|
||||
|
@ -95,7 +95,7 @@ void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
|
||||
void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size)
|
||||
{
|
||||
/* TODO: RCU freeing is unsupported for now; hide false positives. */
|
||||
if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU))
|
||||
if (!s->ctor && !(s->flags & SLAB_TYPESAFE_BY_RCU))
|
||||
kmemcheck_mark_freed(object, size);
|
||||
}
|
||||
|
||||
|
@ -21,7 +21,7 @@
|
||||
#include <linux/slab.h>
|
||||
|
||||
/* global SRCU for all MMs */
|
||||
static struct srcu_struct srcu;
|
||||
DEFINE_STATIC_SRCU(srcu);
|
||||
|
||||
/*
|
||||
* This function allows mmu_notifier::release callback to delay a call to
|
||||
@ -252,12 +252,6 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
|
||||
|
||||
BUG_ON(atomic_read(&mm->mm_users) <= 0);
|
||||
|
||||
/*
|
||||
* Verify that mmu_notifier_init() already run and the global srcu is
|
||||
* initialized.
|
||||
*/
|
||||
BUG_ON(!srcu.per_cpu_ref);
|
||||
|
||||
ret = -ENOMEM;
|
||||
mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
|
||||
if (unlikely(!mmu_notifier_mm))
|
||||
@ -406,9 +400,3 @@ void mmu_notifier_unregister_no_release(struct mmu_notifier *mn,
|
||||
mmdrop(mm);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release);
|
||||
|
||||
static int __init mmu_notifier_init(void)
|
||||
{
|
||||
return init_srcu_struct(&srcu);
|
||||
}
|
||||
subsys_initcall(mmu_notifier_init);
|
||||
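
The mmu_notifier changes above switch from a run-time-initialized srcu_struct to DEFINE_STATIC_SRCU(), which is what allows the init_srcu_struct()-based initcall to be dropped. A minimal sketch of the two initialization styles, with hypothetical names:

#include <linux/srcu.h>

/* Compile-time initialization: usable from early boot, no init call needed. */
DEFINE_STATIC_SRCU(my_static_srcu);

/* Run-time initialization: must be paired with cleanup_srcu_struct(). */
static struct srcu_struct my_dynamic_srcu;

static int __init my_init(void)
{
        return init_srcu_struct(&my_dynamic_srcu);
}

static void my_exit(void)
{
        cleanup_srcu_struct(&my_dynamic_srcu);
}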
|
@ -430,7 +430,7 @@ static void anon_vma_ctor(void *data)
|
||||
void __init anon_vma_init(void)
|
||||
{
|
||||
anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
|
||||
0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
|
||||
0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
|
||||
anon_vma_ctor);
|
||||
anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
|
||||
SLAB_PANIC|SLAB_ACCOUNT);
|
||||
@ -481,7 +481,7 @@ struct anon_vma *page_get_anon_vma(struct page *page)
|
||||
* If this page is still mapped, then its anon_vma cannot have been
|
||||
* freed. But if it has been unmapped, we have no security against the
|
||||
* anon_vma structure being freed and reused (for another anon_vma:
|
||||
* SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero()
|
||||
* SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
|
||||
* above cannot corrupt).
|
||||
*/
|
||||
if (!page_mapped(page)) {
|
||||
|
@ -1728,7 +1728,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page)
|
||||
|
||||
freelist = page->freelist;
|
||||
slab_destroy_debugcheck(cachep, page);
|
||||
if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
|
||||
if (unlikely(cachep->flags & SLAB_TYPESAFE_BY_RCU))
|
||||
call_rcu(&page->rcu_head, kmem_rcu_free);
|
||||
else
|
||||
kmem_freepages(cachep, page);
|
||||
@ -1924,7 +1924,7 @@ static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
|
||||
|
||||
cachep->num = 0;
|
||||
|
||||
if (cachep->ctor || flags & SLAB_DESTROY_BY_RCU)
|
||||
if (cachep->ctor || flags & SLAB_TYPESAFE_BY_RCU)
|
||||
return false;
|
||||
|
||||
left = calculate_slab_order(cachep, size,
|
||||
@ -2030,7 +2030,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
|
||||
if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
|
||||
2 * sizeof(unsigned long long)))
|
||||
flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
|
||||
if (!(flags & SLAB_DESTROY_BY_RCU))
|
||||
if (!(flags & SLAB_TYPESAFE_BY_RCU))
|
||||
flags |= SLAB_POISON;
|
||||
#endif
|
||||
#endif
|
||||
|
@ -126,7 +126,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
|
||||
|
||||
/* Legal flag mask for kmem_cache_create(), for various configurations */
|
||||
#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \
|
||||
SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS )
|
||||
SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS )
|
||||
|
||||
#if defined(CONFIG_DEBUG_SLAB)
|
||||
#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
|
||||
@ -415,7 +415,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
|
||||
* back there or track user information then we can
|
||||
* only use the space before that information.
|
||||
*/
|
||||
if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
|
||||
if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER))
|
||||
return s->inuse;
|
||||
/*
|
||||
* Else we can use all the padding etc for the allocation
|
||||
|
@ -39,7 +39,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
|
||||
* Set of flags that will prevent slab merging
|
||||
*/
|
||||
#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
|
||||
SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
|
||||
SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
|
||||
SLAB_FAILSLAB | SLAB_KASAN)
|
||||
|
||||
#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
|
||||
@ -500,7 +500,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
|
||||
struct kmem_cache *s, *s2;
|
||||
|
||||
/*
|
||||
* On destruction, SLAB_DESTROY_BY_RCU kmem_caches are put on the
|
||||
* On destruction, SLAB_TYPESAFE_BY_RCU kmem_caches are put on the
|
||||
* @slab_caches_to_rcu_destroy list. The slab pages are freed
|
||||
* through RCU and and the associated kmem_cache are dereferenced
|
||||
* while freeing the pages, so the kmem_caches should be freed only
|
||||
@ -537,7 +537,7 @@ static int shutdown_cache(struct kmem_cache *s)
|
||||
memcg_unlink_cache(s);
|
||||
list_del(&s->list);
|
||||
|
||||
if (s->flags & SLAB_DESTROY_BY_RCU) {
|
||||
if (s->flags & SLAB_TYPESAFE_BY_RCU) {
|
||||
list_add_tail(&s->list, &slab_caches_to_rcu_destroy);
|
||||
schedule_work(&slab_caches_to_rcu_destroy_work);
|
||||
} else {
|
||||
|
@ -126,7 +126,7 @@ static inline void clear_slob_page_free(struct page *sp)
|
||||
|
||||
/*
|
||||
* struct slob_rcu is inserted at the tail of allocated slob blocks, which
|
||||
* were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free
|
||||
* were created with a SLAB_TYPESAFE_BY_RCU slab. slob_rcu is used to free
|
||||
* the block using call_rcu.
|
||||
*/
|
||||
struct slob_rcu {
|
||||
@ -524,7 +524,7 @@ EXPORT_SYMBOL(ksize);
|
||||
|
||||
int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
|
||||
{
|
||||
if (flags & SLAB_DESTROY_BY_RCU) {
|
||||
if (flags & SLAB_TYPESAFE_BY_RCU) {
|
||||
/* leave room for rcu footer at the end of object */
|
||||
c->size += sizeof(struct slob_rcu);
|
||||
}
|
||||
@ -598,7 +598,7 @@ static void kmem_rcu_free(struct rcu_head *head)
|
||||
void kmem_cache_free(struct kmem_cache *c, void *b)
|
||||
{
|
||||
kmemleak_free_recursive(b, c->flags);
|
||||
if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) {
|
||||
if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) {
|
||||
struct slob_rcu *slob_rcu;
|
||||
slob_rcu = b + (c->size - sizeof(struct slob_rcu));
|
||||
slob_rcu->size = c->size;
|
||||
|
mm/slub.c
@ -1687,7 +1687,7 @@ static void rcu_free_slab(struct rcu_head *h)
|
||||
|
||||
static void free_slab(struct kmem_cache *s, struct page *page)
|
||||
{
|
||||
if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
|
||||
if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) {
|
||||
struct rcu_head *head;
|
||||
|
||||
if (need_reserve_slab_rcu) {
|
||||
@ -2963,7 +2963,7 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
|
||||
* slab_free_freelist_hook() could have put the items into quarantine.
|
||||
* If so, no need to free them.
|
||||
*/
|
||||
if (s->flags & SLAB_KASAN && !(s->flags & SLAB_DESTROY_BY_RCU))
|
||||
if (s->flags & SLAB_KASAN && !(s->flags & SLAB_TYPESAFE_BY_RCU))
|
||||
return;
|
||||
do_slab_free(s, page, head, tail, cnt, addr);
|
||||
}
|
||||
@ -3433,7 +3433,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
|
||||
* the slab may touch the object after free or before allocation
|
||||
* then we should never poison the object itself.
|
||||
*/
|
||||
if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) &&
|
||||
if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) &&
|
||||
!s->ctor)
|
||||
s->flags |= __OBJECT_POISON;
|
||||
else
|
||||
@ -3455,7 +3455,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
|
||||
*/
|
||||
s->inuse = size;
|
||||
|
||||
if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) ||
|
||||
if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
|
||||
s->ctor)) {
|
||||
/*
|
||||
* Relocate free pointer after the object if it is not
|
||||
@ -3537,7 +3537,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
|
||||
s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
|
||||
s->reserved = 0;
|
||||
|
||||
if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU))
|
||||
if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU))
|
||||
s->reserved = sizeof(struct rcu_head);
|
||||
|
||||
if (!calculate_sizes(s, -1))
|
||||
@ -5042,7 +5042,7 @@ SLAB_ATTR_RO(cache_dma);
|
||||
|
||||
static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
|
||||
{
|
||||
return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
|
||||
return sprintf(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU));
|
||||
}
|
||||
SLAB_ATTR_RO(destroy_by_rcu);
|
||||
|
||||
|
@ -951,7 +951,7 @@ static struct proto dccp_v4_prot = {
|
||||
.orphan_count = &dccp_orphan_count,
|
||||
.max_header = MAX_DCCP_HEADER,
|
||||
.obj_size = sizeof(struct dccp_sock),
|
||||
.slab_flags = SLAB_DESTROY_BY_RCU,
|
||||
.slab_flags = SLAB_TYPESAFE_BY_RCU,
|
||||
.rsk_prot = &dccp_request_sock_ops,
|
||||
.twsk_prot = &dccp_timewait_sock_ops,
|
||||
.h.hashinfo = &dccp_hashinfo,
|
||||
|
@ -1014,7 +1014,7 @@ static struct proto dccp_v6_prot = {
|
||||
.orphan_count = &dccp_orphan_count,
|
||||
.max_header = MAX_DCCP_HEADER,
|
||||
.obj_size = sizeof(struct dccp6_sock),
|
||||
.slab_flags = SLAB_DESTROY_BY_RCU,
|
||||
.slab_flags = SLAB_TYPESAFE_BY_RCU,
|
||||
.rsk_prot = &dccp6_request_sock_ops,
|
||||
.twsk_prot = &dccp6_timewait_sock_ops,
|
||||
.h.hashinfo = &dccp_hashinfo,
|
||||
|
@ -2402,7 +2402,7 @@ struct proto tcp_prot = {
|
||||
.sysctl_rmem = sysctl_tcp_rmem,
|
||||
.max_header = MAX_TCP_HEADER,
|
||||
.obj_size = sizeof(struct tcp_sock),
|
||||
.slab_flags = SLAB_DESTROY_BY_RCU,
|
||||
.slab_flags = SLAB_TYPESAFE_BY_RCU,
|
||||
.twsk_prot = &tcp_timewait_sock_ops,
|
||||
.rsk_prot = &tcp_request_sock_ops,
|
||||
.h.hashinfo = &tcp_hashinfo,
|
||||
|
@ -1921,7 +1921,7 @@ struct proto tcpv6_prot = {
|
||||
.sysctl_rmem = sysctl_tcp_rmem,
|
||||
.max_header = MAX_TCP_HEADER,
|
||||
.obj_size = sizeof(struct tcp6_sock),
|
||||
.slab_flags = SLAB_DESTROY_BY_RCU,
|
||||
.slab_flags = SLAB_TYPESAFE_BY_RCU,
|
||||
.twsk_prot = &tcp6_timewait_sock_ops,
|
||||
.rsk_prot = &tcp6_request_sock_ops,
|
||||
.h.hashinfo = &tcp_hashinfo,
|
||||
|
@ -142,7 +142,7 @@ static struct proto llc_proto = {
|
||||
.name = "LLC",
|
||||
.owner = THIS_MODULE,
|
||||
.obj_size = sizeof(struct llc_sock),
|
||||
.slab_flags = SLAB_DESTROY_BY_RCU,
|
||||
.slab_flags = SLAB_TYPESAFE_BY_RCU,
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -506,7 +506,7 @@ static struct sock *__llc_lookup_established(struct llc_sap *sap,
|
||||
again:
|
||||
sk_nulls_for_each_rcu(rc, node, laddr_hb) {
|
||||
if (llc_estab_match(sap, daddr, laddr, rc)) {
|
||||
/* Extra checks required by SLAB_DESTROY_BY_RCU */
|
||||
/* Extra checks required by SLAB_TYPESAFE_BY_RCU */
|
||||
if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt)))
|
||||
goto again;
|
||||
if (unlikely(llc_sk(rc)->sap != sap ||
|
||||
@ -565,7 +565,7 @@ static struct sock *__llc_lookup_listener(struct llc_sap *sap,
|
||||
again:
|
||||
sk_nulls_for_each_rcu(rc, node, laddr_hb) {
|
||||
if (llc_listener_match(sap, laddr, rc)) {
|
||||
/* Extra checks required by SLAB_DESTROY_BY_RCU */
|
||||
/* Extra checks required by SLAB_TYPESAFE_BY_RCU */
|
||||
if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt)))
|
||||
goto again;
|
||||
if (unlikely(llc_sk(rc)->sap != sap ||
|
||||
|
@ -328,7 +328,7 @@ static struct sock *llc_lookup_dgram(struct llc_sap *sap,
|
||||
again:
|
||||
sk_nulls_for_each_rcu(rc, node, laddr_hb) {
|
||||
if (llc_dgram_match(sap, laddr, rc)) {
|
||||
/* Extra checks required by SLAB_DESTROY_BY_RCU */
|
||||
/* Extra checks required by SLAB_TYPESAFE_BY_RCU */
|
||||
if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt)))
|
||||
goto again;
|
||||
if (unlikely(llc_sk(rc)->sap != sap ||
|
||||
|
@ -918,7 +918,7 @@ static unsigned int early_drop_list(struct net *net,
|
||||
continue;
|
||||
|
||||
/* kill only if still in same netns -- might have moved due to
|
||||
* SLAB_DESTROY_BY_RCU rules.
|
||||
* SLAB_TYPESAFE_BY_RCU rules.
|
||||
*
|
||||
* We steal the timer reference. If that fails timer has
|
||||
* already fired or someone else deleted it. Just drop ref
|
||||
@ -1073,7 +1073,7 @@ __nf_conntrack_alloc(struct net *net,
|
||||
|
||||
/*
|
||||
* Do not use kmem_cache_zalloc(), as this cache uses
|
||||
* SLAB_DESTROY_BY_RCU.
|
||||
* SLAB_TYPESAFE_BY_RCU.
|
||||
*/
|
||||
ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
|
||||
if (ct == NULL)
|
||||
@ -1118,7 +1118,7 @@ void nf_conntrack_free(struct nf_conn *ct)
|
||||
struct net *net = nf_ct_net(ct);
|
||||
|
||||
/* A freed object has refcnt == 0, that's
|
||||
* the golden rule for SLAB_DESTROY_BY_RCU
|
||||
* the golden rule for SLAB_TYPESAFE_BY_RCU
|
||||
*/
|
||||
NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0);
|
||||
|
||||
@ -1882,7 +1882,7 @@ int nf_conntrack_init_start(void)
|
||||
nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
|
||||
sizeof(struct nf_conn),
|
||||
NFCT_INFOMASK + 1,
|
||||
SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
|
||||
SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
|
||||
if (!nf_conntrack_cachep)
|
||||
goto err_cachep;
|
||||
|
||||
|
@ -101,7 +101,7 @@ struct proto smc_proto = {
|
||||
.unhash = smc_unhash_sk,
|
||||
.obj_size = sizeof(struct smc_sock),
|
||||
.h.smc_hash = &smc_v4_hashinfo,
|
||||
.slab_flags = SLAB_DESTROY_BY_RCU,
|
||||
.slab_flags = SLAB_TYPESAFE_BY_RCU,
|
||||
};
|
||||
EXPORT_SYMBOL_GPL(smc_proto);
|
||||
|
||||
|
@ -170,7 +170,7 @@ qemu_append="`identify_qemu_append "$QEMU"`"
|
||||
# Pull in Kconfig-fragment boot parameters
|
||||
boot_args="`configfrag_boot_params "$boot_args" "$config_template"`"
|
||||
# Generate kernel-version-specific boot parameters
|
||||
boot_args="`per_version_boot_params "$boot_args" $builddir/.config $seconds`"
|
||||
boot_args="`per_version_boot_params "$boot_args" $resdir/.config $seconds`"
|
||||
|
||||
if test -n "$TORTURE_BUILDONLY"
|
||||
then
|
||||
|