rcu/tree: Maintain separate array for vmalloc ptrs

To do so, we use an array of kvfree_rcu_bulk_data structures.
It consists of two elements:
 - index number 0 corresponds to slab pointers.
 - index number 1 corresponds to vmalloc pointers.

Keeping vmalloc pointers separated from slab pointers makes
it possible to invoke the right freeing API for the right
kind of pointer.

It also prepares us for future headless support for vmalloc
and SLAB objects. Such objects cannot be queued on a linked
list and are instead placed directly into an array.
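
For illustration only, here is a minimal standalone sketch of the dispatch idea (it is not the patch itself): bulk_sketch, channel_of() and drain_channel() are made-up names, while is_vmalloc_addr(), kfree_bulk() and vfree() are the existing kernel APIs this patch builds on.

/*
 * Sketch of the two-channel idea: pointers are binned by
 * is_vmalloc_addr(), so each bulk block can later be drained
 * with the freeing API that matches its contents.
 */
#include <linux/mm.h>       /* is_vmalloc_addr() */
#include <linux/slab.h>     /* kfree_bulk() */
#include <linux/vmalloc.h>  /* vfree() */

#define SKETCH_N_CHANNELS 2 /* 0: slab pointers, 1: vmalloc pointers */

struct bulk_sketch {
        unsigned long nr_records;
        void *records[8];   /* the real block is sized to one page */
};

/* Made-up helper: pick the channel the way the patch does. */
static int channel_of(void *ptr)
{
        return !!is_vmalloc_addr(ptr);
}

/* Made-up helper: drain one channel with the matching API. */
static void drain_channel(struct bulk_sketch *b, int idx)
{
        unsigned long j;

        if (idx == 0) {         /* kmalloc()'ed objects */
                kfree_bulk(b->nr_records, b->records);
        } else {                /* vmalloc()'ed objects */
                for (j = 0; j < b->nr_records; j++)
                        vfree(b->records[j]);
        }
        b->nr_records = 0;
}

Binning by is_vmalloc_addr() at insertion time is what lets the drain side keep the fast kfree_bulk() path for slab objects while vmalloc objects are released one by one with vfree().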

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Co-developed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Authored by Uladzislau Rezki (Sony) on 2020-05-25 23:47:53 +02:00; committed by Paul E. McKenney.
parent 53c72b590b
commit 5f3c8d6204

@@ -57,6 +57,8 @@
 #include <linux/slab.h>
 #include <linux/sched/isolation.h>
 #include <linux/sched/clock.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
 #include "../time/tick-internal.h"
 
 #include "tree.h"
@@ -2966,46 +2968,47 @@ EXPORT_SYMBOL_GPL(call_rcu);
 /* Maximum number of jiffies to wait before draining a batch. */
 #define KFREE_DRAIN_JIFFIES (HZ / 50)
 #define KFREE_N_BATCHES 2
+#define FREE_N_CHANNELS 2
 
 /**
- * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers
+ * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
  * @nr_records: Number of active pointers in the array
- * @records: Array of the kfree_rcu() pointers
  * @next: Next bulk object in the block chain
+ * @records: Array of the kvfree_rcu() pointers
  */
-struct kfree_rcu_bulk_data {
+struct kvfree_rcu_bulk_data {
        unsigned long nr_records;
-       struct kfree_rcu_bulk_data *next;
+       struct kvfree_rcu_bulk_data *next;
        void *records[];
 };
 
 /*
  * This macro defines how many entries the "records" array
  * will contain. It is based on the fact that the size of
- * kfree_rcu_bulk_data structure becomes exactly one page.
+ * kvfree_rcu_bulk_data structure becomes exactly one page.
  */
-#define KFREE_BULK_MAX_ENTR \
-       ((PAGE_SIZE - sizeof(struct kfree_rcu_bulk_data)) / sizeof(void *))
+#define KVFREE_BULK_MAX_ENTR \
+       ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
 
 /**
  * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
  * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
  * @head_free: List of kfree_rcu() objects waiting for a grace period
- * @bhead_free: Bulk-List of kfree_rcu() objects waiting for a grace period
+ * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
  * @krcp: Pointer to @kfree_rcu_cpu structure
  */
 struct kfree_rcu_cpu_work {
        struct rcu_work rcu_work;
        struct rcu_head *head_free;
-       struct kfree_rcu_bulk_data *bhead_free;
+       struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS];
        struct kfree_rcu_cpu *krcp;
 };
 
 /**
  * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
  * @head: List of kfree_rcu() objects not yet waiting for a grace period
- * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period
+ * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
  * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
  * @lock: Synchronize access to this structure
  * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
@@ -3020,7 +3023,7 @@ struct kfree_rcu_cpu_work {
  */
 struct kfree_rcu_cpu {
        struct rcu_head *head;
-       struct kfree_rcu_bulk_data *bhead;
+       struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS];
        struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
        raw_spinlock_t lock;
        struct delayed_work monitor_work;
@@ -3044,7 +3047,7 @@ static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
 };
 
 static __always_inline void
-debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead)
+debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
 {
 #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
        int i;
@@ -3073,20 +3076,20 @@ krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
        local_irq_restore(flags);
 }
 
-static inline struct kfree_rcu_bulk_data *
+static inline struct kvfree_rcu_bulk_data *
 get_cached_bnode(struct kfree_rcu_cpu *krcp)
 {
        if (!krcp->nr_bkv_objs)
                return NULL;
 
        krcp->nr_bkv_objs--;
-       return (struct kfree_rcu_bulk_data *)
+       return (struct kvfree_rcu_bulk_data *)
                llist_del_first(&krcp->bkvcache);
 }
 
 static inline bool
 put_cached_bnode(struct kfree_rcu_cpu *krcp,
-       struct kfree_rcu_bulk_data *bnode)
+       struct kvfree_rcu_bulk_data *bnode)
 {
        // Check the limit.
        if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
@@ -3105,43 +3108,63 @@ put_cached_bnode(struct kfree_rcu_cpu *krcp,
 static void kfree_rcu_work(struct work_struct *work)
 {
        unsigned long flags;
+       struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext;
        struct rcu_head *head, *next;
-       struct kfree_rcu_bulk_data *bhead, *bnext;
        struct kfree_rcu_cpu *krcp;
        struct kfree_rcu_cpu_work *krwp;
+       int i, j;
 
        krwp = container_of(to_rcu_work(work),
                            struct kfree_rcu_cpu_work, rcu_work);
        krcp = krwp->krcp;
        raw_spin_lock_irqsave(&krcp->lock, flags);
+       // Channels 1 and 2.
+       for (i = 0; i < FREE_N_CHANNELS; i++) {
+               bkvhead[i] = krwp->bkvhead_free[i];
+               krwp->bkvhead_free[i] = NULL;
+       }
+
+       // Channel 3.
        head = krwp->head_free;
        krwp->head_free = NULL;
-       bhead = krwp->bhead_free;
-       krwp->bhead_free = NULL;
        raw_spin_unlock_irqrestore(&krcp->lock, flags);
 
-       /* "bhead" is now private, so traverse locklessly. */
-       for (; bhead; bhead = bnext) {
-               bnext = bhead->next;
-
-               debug_rcu_bhead_unqueue(bhead);
-
-               rcu_lock_acquire(&rcu_callback_map);
-               trace_rcu_invoke_kfree_bulk_callback(rcu_state.name,
-                       bhead->nr_records, bhead->records);
-
-               kfree_bulk(bhead->nr_records, bhead->records);
-               rcu_lock_release(&rcu_callback_map);
-
-               krcp = krc_this_cpu_lock(&flags);
-               if (put_cached_bnode(krcp, bhead))
-                       bhead = NULL;
-               krc_this_cpu_unlock(krcp, flags);
-
-               if (bhead)
-                       free_page((unsigned long) bhead);
-
-               cond_resched_tasks_rcu_qs();
-       }
+       // Handle two first channels.
+       for (i = 0; i < FREE_N_CHANNELS; i++) {
+               for (; bkvhead[i]; bkvhead[i] = bnext) {
+                       bnext = bkvhead[i]->next;
+                       debug_rcu_bhead_unqueue(bkvhead[i]);
+
+                       rcu_lock_acquire(&rcu_callback_map);
+                       if (i == 0) { // kmalloc() / kfree().
+                               trace_rcu_invoke_kfree_bulk_callback(
+                                       rcu_state.name, bkvhead[i]->nr_records,
+                                       bkvhead[i]->records);
+
+                               kfree_bulk(bkvhead[i]->nr_records,
+                                       bkvhead[i]->records);
+                       } else { // vmalloc() / vfree().
+                               for (j = 0; j < bkvhead[i]->nr_records; j++) {
+                                       trace_rcu_invoke_kfree_callback(
+                                               rcu_state.name,
+                                               bkvhead[i]->records[j], 0);
+
+                                       vfree(bkvhead[i]->records[j]);
+                               }
+                       }
+                       rcu_lock_release(&rcu_callback_map);
+
+                       krcp = krc_this_cpu_lock(&flags);
+                       if (put_cached_bnode(krcp, bkvhead[i]))
+                               bkvhead[i] = NULL;
+                       krc_this_cpu_unlock(krcp, flags);
+
+                       if (bkvhead[i])
+                               free_page((unsigned long) bkvhead[i]);
+
+                       cond_resched_tasks_rcu_qs();
+               }
+       }
 
        /*
@@ -3159,7 +3182,7 @@ static void kfree_rcu_work(struct work_struct *work)
                trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
 
                if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset)))
-                       kfree(ptr);
+                       kvfree(ptr);
 
                rcu_lock_release(&rcu_callback_map);
                cond_resched_tasks_rcu_qs();
@@ -3176,7 +3199,7 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
 {
        struct kfree_rcu_cpu_work *krwp;
        bool repeat = false;
-       int i;
+       int i, j;
 
        lockdep_assert_held(&krcp->lock);
@@ -3184,21 +3207,25 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
                krwp = &(krcp->krw_arr[i]);
 
                /*
-                * Try to detach bhead or head and attach it over any
+                * Try to detach bkvhead or head and attach it over any
                 * available corresponding free channel. It can be that
                 * a previous RCU batch is in progress, it means that
                 * immediately to queue another one is not possible so
                 * return false to tell caller to retry.
                 */
-               if ((krcp->bhead && !krwp->bhead_free) ||
+               if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
+                       (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
                                (krcp->head && !krwp->head_free)) {
-                       /* Channel 1. */
-                       if (!krwp->bhead_free) {
-                               krwp->bhead_free = krcp->bhead;
-                               krcp->bhead = NULL;
+                       // Channel 1 corresponds to SLAB ptrs.
+                       // Channel 2 corresponds to vmalloc ptrs.
+                       for (j = 0; j < FREE_N_CHANNELS; j++) {
+                               if (!krwp->bkvhead_free[j]) {
+                                       krwp->bkvhead_free[j] = krcp->bkvhead[j];
+                                       krcp->bkvhead[j] = NULL;
+                               }
                        }
 
-                       /* Channel 2. */
+                       // Channel 3 corresponds to emergency path.
                        if (!krwp->head_free) {
                                krwp->head_free = krcp->head;
                                krcp->head = NULL;
@@ -3207,16 +3234,17 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
                        WRITE_ONCE(krcp->count, 0);
 
                        /*
-                        * One work is per one batch, so there are two "free channels",
-                        * "bhead_free" and "head_free" the batch can handle. It can be
-                        * that the work is in the pending state when two channels have
-                        * been detached following each other, one by one.
+                        * One work is per one batch, so there are three
+                        * "free channels", the batch can handle. It can
+                        * be that the work is in the pending state when
+                        * channels have been detached following by each
+                        * other.
                         */
                        queue_rcu_work(system_wq, &krwp->rcu_work);
                }
 
-               /* Repeat if any "free" corresponding channel is still busy. */
-               if (krcp->bhead || krcp->head)
+               // Repeat if any "free" corresponding channel is still busy.
+               if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head)
                        repeat = true;
        }
@@ -3258,23 +3286,22 @@ static void kfree_rcu_monitor(struct work_struct *work)
 }
 
 static inline bool
-kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
-       struct rcu_head *head, rcu_callback_t func)
+kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
 {
-       struct kfree_rcu_bulk_data *bnode;
+       struct kvfree_rcu_bulk_data *bnode;
+       int idx;
 
        if (unlikely(!krcp->initialized))
                return false;
 
        lockdep_assert_held(&krcp->lock);
+       idx = !!is_vmalloc_addr(ptr);
 
        /* Check if a new block is required. */
-       if (!krcp->bhead ||
-                       krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) {
+       if (!krcp->bkvhead[idx] ||
+                       krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
                bnode = get_cached_bnode(krcp);
                if (!bnode) {
-                       WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE);
-
                        /*
                         * To keep this path working on raw non-preemptible
                         * sections, prevent the optional entry into the
@@ -3287,7 +3314,7 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
                        if (IS_ENABLED(CONFIG_PREEMPT_RT))
                                return false;
 
-                       bnode = (struct kfree_rcu_bulk_data *)
+                       bnode = (struct kvfree_rcu_bulk_data *)
                                __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
                }
@@ -3297,30 +3324,30 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
                /* Initialize the new block. */
                bnode->nr_records = 0;
-               bnode->next = krcp->bhead;
+               bnode->next = krcp->bkvhead[idx];
 
                /* Attach it to the head. */
-               krcp->bhead = bnode;
+               krcp->bkvhead[idx] = bnode;
        }
 
        /* Finally insert. */
-       krcp->bhead->records[krcp->bhead->nr_records++] =
-               (void *) head - (unsigned long) func;
+       krcp->bkvhead[idx]->records
+               [krcp->bkvhead[idx]->nr_records++] = ptr;
 
        return true;
 }
 
 /*
- * Queue a request for lazy invocation of kfree_bulk()/kfree() after a grace
- * period. Please note there are two paths are maintained, one is the main one
- * that uses kfree_bulk() interface and second one is emergency one, that is
- * used only when the main path can not be maintained temporary, due to memory
- * pressure.
+ * Queue a request for lazy invocation of appropriate free routine after a
+ * grace period. Please note there are three paths are maintained, two are the
+ * main ones that use array of pointers interface and third one is emergency
+ * one, that is used only when the main path can not be maintained temporary,
+ * due to memory pressure.
  *
  * Each kfree_call_rcu() request is added to a batch. The batch will be drained
  * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
 * be free'd in workqueue context. This allows us to: batch requests together to
- * reduce the number of grace periods during heavy kfree_rcu() load.
+ * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
  */
 void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
@@ -3343,7 +3370,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
         * Under high memory pressure GFP_NOWAIT can fail,
         * in that case the emergency path is maintained.
         */
-       if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func))) {
+       if (unlikely(!kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr))) {
                head->func = func;
                head->next = krcp->head;
                krcp->head = head;
@@ -4324,7 +4351,7 @@ static void __init kfree_rcu_batch_init(void)
        for_each_possible_cpu(cpu) {
                struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
-               struct kfree_rcu_bulk_data *bnode;
+               struct kvfree_rcu_bulk_data *bnode;
 
                for (i = 0; i < KFREE_N_BATCHES; i++) {
                        INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
@@ -4332,7 +4359,7 @@ static void __init kfree_rcu_batch_init(void)
                }
 
                for (i = 0; i < rcu_min_cached_objs; i++) {
-                       bnode = (struct kfree_rcu_bulk_data *)
+                       bnode = (struct kvfree_rcu_bulk_data *)
                                __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 
                        if (bnode)
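
For context, a caller-side sketch of how objects reach this machinery; struct foo and foo_free_deferred() are made-up names, and kfree_rcu() is the existing interface whose backend the channels above serve.

/*
 * Caller-side sketch (struct foo and foo_free_deferred() are made-up):
 * an object freed with kfree_rcu() ends up as a plain pointer queued
 * on the per-CPU channels shown above and is released in bulk once a
 * grace period has elapsed.
 */
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
        int data;
        struct rcu_head rh;     /* still needed for the emergency (channel 3) path */
};

static void foo_free_deferred(struct foo *fp)
{
        /* Semantically call_rcu() + kfree(), but batched per CPU. */
        kfree_rcu(fp, rh);
}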