linux_dsm_epyc7002/arch/x86/mm/tlb.c
Andy Lutomirski 675357362a Revert "x86/mm: Stop calling leave_mm() in idle code"
This reverts commit 43858b4f25.

The reason I removed the leave_mm() calls in question is because the
heuristic wasn't needed after that patch.  With the original version
of my PCID series, we never flushed a "lazy cpu" (i.e. a CPU running
kernel thread) due a flush on the loaded mm.

Unfortunately, that caused architectural issues, so now I've
reinstated these flushes on non-PCID systems in:

    commit b956575bed ("x86/mm: Flush more aggressively in lazy TLB mode").

That, in turn, gives us a power management and occasionally
performance regression as compared to old kernels: a process that
goes into a deep idle state on a given CPU and gets its mm flushed
due to activity on a different CPU will wake the idle CPU.

Reinstate the old ugly heuristic: if a CPU goes into ACPI C3 or an
intel_idle state that is likely to cause a TLB flush gets its mm
switched to init_mm before going idle.

FWIW, this heuristic is lousy.  Whether we should change CR3 before
idle isn't a good hint except insofar as the performance hit is a bit
lower if the TLB is getting flushed by the idle code anyway.  What we
really want to know is whether we anticipate being idle long enough
that the mm is likely to be flushed before we wake up.  This is more a
matter of the expected latency than the idle state that gets chosen.
This heuristic also completely fails on systems that don't know
whether the TLB will be flushed (e.g. AMD systems?).  OTOH it may be a
bit obsolete anyway -- PCID systems don't presently benefit from this
heuristic at all.

We also shouldn't do this callback from innermost bit of the idle code
due to the RCU nastiness it causes.  All the information need is
available before rcu_idle_enter() needs to happen.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 43858b4f25 "x86/mm: Stop calling leave_mm() in idle code"
Link: http://lkml.kernel.org/r/c513bbd4e653747213e05bc7062de000bf0202a5.1509793738.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-11-04 15:01:50 +01:00

642 lines
19 KiB
C

#include <linux/init.h>
#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/cpu.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
#include <linux/debugfs.h>
/*
* TLB flushing, formerly SMP-only
* c/o Linus Torvalds.
*
* These mean you can really definitely utterly forget about
* writing to user space from interrupts. (Its not allowed anyway).
*
* Optimizations Manfred Spraul <manfred@colorfullife.com>
*
* More scalable flush, from Andi Kleen
*
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
*/
atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
u16 *new_asid, bool *need_flush)
{
u16 asid;
if (!static_cpu_has(X86_FEATURE_PCID)) {
*new_asid = 0;
*need_flush = true;
return;
}
for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
next->context.ctx_id)
continue;
*new_asid = asid;
*need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
next_tlb_gen);
return;
}
/*
* We don't currently own an ASID slot on this CPU.
* Allocate a slot.
*/
*new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
if (*new_asid >= TLB_NR_DYN_ASIDS) {
*new_asid = 0;
this_cpu_write(cpu_tlbstate.next_asid, 1);
}
*need_flush = true;
}
void leave_mm(int cpu)
{
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
/*
* It's plausible that we're in lazy TLB mode while our mm is init_mm.
* If so, our callers still expect us to flush the TLB, but there
* aren't any user TLB entries in init_mm to worry about.
*
* This needs to happen before any other sanity checks due to
* intel_idle's shenanigans.
*/
if (loaded_mm == &init_mm)
return;
/* Warn if we're not lazy. */
WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));
switch_mm(NULL, &init_mm, NULL);
}
EXPORT_SYMBOL_GPL(leave_mm);
void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
unsigned long flags;
local_irq_save(flags);
switch_mm_irqs_off(prev, next, tsk);
local_irq_restore(flags);
}
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
/*
* NB: The scheduler will call us with prev == next when switching
* from lazy TLB mode to normal mode if active_mm isn't changing.
* When this happens, we don't assume that CR3 (and hence
* cpu_tlbstate.loaded_mm) matches next.
*
* NB: leave_mm() calls us with prev == NULL and tsk == NULL.
*/
/* We don't want flush_tlb_func_* to run concurrently with us. */
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
WARN_ON_ONCE(!irqs_disabled());
/*
* Verify that CR3 is what we think it is. This will catch
* hypothetical buggy code that directly switches to swapper_pg_dir
* without going through leave_mm() / switch_mm_irqs_off() or that
* does something like write_cr3(read_cr3_pa()).
*
* Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
* isn't free.
*/
#ifdef CONFIG_DEBUG_VM
if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) {
/*
* If we were to BUG here, we'd be very likely to kill
* the system so hard that we don't see the call trace.
* Try to recover instead by ignoring the error and doing
* a global flush to minimize the chance of corruption.
*
* (This is far from being a fully correct recovery.
* Architecturally, the CPU could prefetch something
* back into an incorrect ASID slot and leave it there
* to cause trouble down the road. It's better than
* nothing, though.)
*/
__flush_tlb_all();
}
#endif
this_cpu_write(cpu_tlbstate.is_lazy, false);
if (real_prev == next) {
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
next->context.ctx_id);
/*
* We don't currently support having a real mm loaded without
* our cpu set in mm_cpumask(). We have all the bookkeeping
* in place to figure out whether we would need to flush
* if our cpu were cleared in mm_cpumask(), but we don't
* currently use it.
*/
if (WARN_ON_ONCE(real_prev != &init_mm &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
return;
} else {
u16 new_asid;
bool need_flush;
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
/*
* If our current stack is in vmalloc space and isn't
* mapped in the new pgd, we'll double-fault. Forcibly
* map it.
*/
unsigned int index = pgd_index(current_stack_pointer);
pgd_t *pgd = next->pgd + index;
if (unlikely(pgd_none(*pgd)))
set_pgd(pgd, init_mm.pgd[index]);
}
/* Stop remote flushes for the previous mm */
VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
real_prev != &init_mm);
cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
/*
* Start remote flushes and then read tlb_gen.
*/
cpumask_set_cpu(cpu, mm_cpumask(next));
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
write_cr3(build_cr3(next, new_asid));
/*
* NB: This gets called via leave_mm() in the idle path
* where RCU functions differently. Tracing normally
* uses RCU, so we need to use the _rcuidle variant.
*
* (There is no good reason for this. The idle code should
* be rearranged to call this before rcu_idle_enter().)
*/
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
write_cr3(build_cr3_noflush(next, new_asid));
/* See above wrt _rcuidle. */
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
this_cpu_write(cpu_tlbstate.loaded_mm, next);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
}
load_mm_cr4(next);
switch_ldt(real_prev, next);
}
/*
* Please ignore the name of this function. It should be called
* switch_to_kernel_thread().
*
* enter_lazy_tlb() is a hint from the scheduler that we are entering a
* kernel thread or other context without an mm. Acceptable implementations
* include doing nothing whatsoever, switching to init_mm, or various clever
* lazy tricks to try to minimize TLB flushes.
*
* The scheduler reserves the right to call enter_lazy_tlb() several times
* in a row. It will notify us that we're going back to a real mm by
* calling switch_mm_irqs_off().
*/
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;
if (tlb_defer_switch_to_init_mm()) {
/*
* There's a significant optimization that may be possible
* here. We have accurate enough TLB flush tracking that we
* don't need to maintain coherence of TLB per se when we're
* lazy. We do, however, need to maintain coherence of
* paging-structure caches. We could, in principle, leave our
* old mm loaded and only switch to init_mm when
* tlb_remove_page() happens.
*/
this_cpu_write(cpu_tlbstate.is_lazy, true);
} else {
switch_mm(NULL, &init_mm, NULL);
}
}
/*
* Call this when reinitializing a CPU. It fixes the following potential
* problems:
*
* - The ASID changed from what cpu_tlbstate thinks it is (most likely
* because the CPU was taken down and came back up with CR3's PCID
* bits clear. CPU hotplug can do this.
*
* - The TLB contains junk in slots corresponding to inactive ASIDs.
*
* - The CPU went so far out to lunch that it may have missed a TLB
* flush.
*/
void initialize_tlbstate_and_flush(void)
{
int i;
struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
unsigned long cr3 = __read_cr3();
/* Assert that CR3 already references the right mm. */
WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
/*
* Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
* doesn't work like other CR4 bits because it can only be set from
* long mode.)
*/
WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
!(cr4_read_shadow() & X86_CR4_PCIDE));
/* Force ASID 0 and force a TLB flush. */
write_cr3(build_cr3(mm, 0));
/* Reinitialize tlbstate. */
this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
this_cpu_write(cpu_tlbstate.next_asid, 1);
this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
}
/*
* flush_tlb_func_common()'s memory ordering requirement is that any
* TLB fills that happen after we flush the TLB are ordered after we
* read active_mm's tlb_gen. We don't need any explicit barriers
* because all x86 flush operations are serializing and the
* atomic64_read operation won't be reordered by the compiler.
*/
static void flush_tlb_func_common(const struct flush_tlb_info *f,
bool local, enum tlb_flush_reason reason)
{
/*
* We have three different tlb_gen values in here. They are:
*
* - mm_tlb_gen: the latest generation.
* - local_tlb_gen: the generation that this CPU has already caught
* up to.
* - f->new_tlb_gen: the generation that the requester of the flush
* wants us to catch up to.
*/
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
/* This code cannot presently handle being reentered. */
VM_WARN_ON(!irqs_disabled());
if (unlikely(loaded_mm == &init_mm))
return;
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
loaded_mm->context.ctx_id);
if (this_cpu_read(cpu_tlbstate.is_lazy)) {
/*
* We're in lazy mode. We need to at least flush our
* paging-structure cache to avoid speculatively reading
* garbage into our TLB. Since switching to init_mm is barely
* slower than a minimal flush, just switch to init_mm.
*/
switch_mm_irqs_off(NULL, &init_mm, NULL);
return;
}
if (unlikely(local_tlb_gen == mm_tlb_gen)) {
/*
* There's nothing to do: we're already up to date. This can
* happen if two concurrent flushes happen -- the first flush to
* be handled can catch us all the way up, leaving no work for
* the second flush.
*/
trace_tlb_flush(reason, 0);
return;
}
WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);
/*
* If we get to this point, we know that our TLB is out of date.
* This does not strictly imply that we need to flush (it's
* possible that f->new_tlb_gen <= local_tlb_gen), but we're
* going to need to flush in the very near future, so we might
* as well get it over with.
*
* The only question is whether to do a full or partial flush.
*
* We do a partial flush if requested and two extra conditions
* are met:
*
* 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that
* we've always done all needed flushes to catch up to
* local_tlb_gen. If, for example, local_tlb_gen == 2 and
* f->new_tlb_gen == 3, then we know that the flush needed to bring
* us up to date for tlb_gen 3 is the partial flush we're
* processing.
*
* As an example of why this check is needed, suppose that there
* are two concurrent flushes. The first is a full flush that
* changes context.tlb_gen from 1 to 2. The second is a partial
* flush that changes context.tlb_gen from 2 to 3. If they get
* processed on this CPU in reverse order, we'll see
* local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
* If we were to use __flush_tlb_single() and set local_tlb_gen to
* 3, we'd be break the invariant: we'd update local_tlb_gen above
* 1 without the full flush that's needed for tlb_gen 2.
*
* 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation.
* Partial TLB flushes are not all that much cheaper than full TLB
* flushes, so it seems unlikely that it would be a performance win
* to do a partial flush if that won't bring our TLB fully up to
* date. By doing a full flush instead, we can increase
* local_tlb_gen all the way to mm_tlb_gen and we can probably
* avoid another flush in the very near future.
*/
if (f->end != TLB_FLUSH_ALL &&
f->new_tlb_gen == local_tlb_gen + 1 &&
f->new_tlb_gen == mm_tlb_gen) {
/* Partial flush */
unsigned long addr;
unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
addr = f->start;
while (addr < f->end) {
__flush_tlb_single(addr);
addr += PAGE_SIZE;
}
if (local)
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
trace_tlb_flush(reason, nr_pages);
} else {
/* Full flush. */
local_flush_tlb();
if (local)
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
trace_tlb_flush(reason, TLB_FLUSH_ALL);
}
/* Both paths above update our state to mm_tlb_gen. */
this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
}
static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
{
const struct flush_tlb_info *f = info;
flush_tlb_func_common(f, true, reason);
}
static void flush_tlb_func_remote(void *info)
{
const struct flush_tlb_info *f = info;
inc_irq_stat(irq_tlb_count);
if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
return;
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
}
void native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
if (info->end == TLB_FLUSH_ALL)
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
else
trace_tlb_flush(TLB_REMOTE_SEND_IPI,
(info->end - info->start) >> PAGE_SHIFT);
if (is_uv_system()) {
/*
* This whole special case is confused. UV has a "Broadcast
* Assist Unit", which seems to be a fancy way to send IPIs.
* Back when x86 used an explicit TLB flush IPI, UV was
* optimized to use its own mechanism. These days, x86 uses
* smp_call_function_many(), but UV still uses a manual IPI,
* and that IPI's action is out of date -- it does a manual
* flush instead of calling flush_tlb_func_remote(). This
* means that the percpu tlb_gen variables won't be updated
* and we'll do pointless flushes on future context switches.
*
* Rather than hooking native_flush_tlb_others() here, I think
* that UV should be updated so that smp_call_function_many(),
* etc, are optimal on UV.
*/
unsigned int cpu;
cpu = smp_processor_id();
cpumask = uv_flush_tlb_others(cpumask, info);
if (cpumask)
smp_call_function_many(cpumask, flush_tlb_func_remote,
(void *)info, 1);
return;
}
smp_call_function_many(cpumask, flush_tlb_func_remote,
(void *)info, 1);
}
/*
* See Documentation/x86/tlb.txt for details. We choose 33
* because it is large enough to cover the vast majority (at
* least 95%) of allocations, and is small enough that we are
* confident it will not cause too much overhead. Each single
* flush is about 100 ns, so this caps the maximum overhead at
* _about_ 3,000 ns.
*
* This is in units of pages.
*/
static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag)
{
int cpu;
struct flush_tlb_info info = {
.mm = mm,
};
cpu = get_cpu();
/* This is also a barrier that synchronizes with switch_mm(). */
info.new_tlb_gen = inc_mm_tlb_gen(mm);
/* Should we flush just the requested range? */
if ((end != TLB_FLUSH_ALL) &&
!(vmflag & VM_HUGETLB) &&
((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
info.start = start;
info.end = end;
} else {
info.start = 0UL;
info.end = TLB_FLUSH_ALL;
}
if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
VM_WARN_ON(irqs_disabled());
local_irq_disable();
flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
local_irq_enable();
}
if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), &info);
put_cpu();
}
static void do_flush_tlb_all(void *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
__flush_tlb_all();
}
void flush_tlb_all(void)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
on_each_cpu(do_flush_tlb_all, NULL, 1);
}
static void do_kernel_range_flush(void *info)
{
struct flush_tlb_info *f = info;
unsigned long addr;
/* flush range by one by one 'invlpg' */
for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
__flush_tlb_single(addr);
}
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
/* Balance as user space task's flush, a bit conservative */
if (end == TLB_FLUSH_ALL ||
(end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
on_each_cpu(do_flush_tlb_all, NULL, 1);
} else {
struct flush_tlb_info info;
info.start = start;
info.end = end;
on_each_cpu(do_kernel_range_flush, &info, 1);
}
}
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
struct flush_tlb_info info = {
.mm = NULL,
.start = 0UL,
.end = TLB_FLUSH_ALL,
};
int cpu = get_cpu();
if (cpumask_test_cpu(cpu, &batch->cpumask)) {
VM_WARN_ON(irqs_disabled());
local_irq_disable();
flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
local_irq_enable();
}
if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
flush_tlb_others(&batch->cpumask, &info);
cpumask_clear(&batch->cpumask);
put_cpu();
}
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
char buf[32];
unsigned int len;
len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling);
return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}
static ssize_t tlbflush_write_file(struct file *file,
const char __user *user_buf, size_t count, loff_t *ppos)
{
char buf[32];
ssize_t len;
int ceiling;
len = min(count, sizeof(buf) - 1);
if (copy_from_user(buf, user_buf, len))
return -EFAULT;
buf[len] = '\0';
if (kstrtoint(buf, 0, &ceiling))
return -EINVAL;
if (ceiling < 0)
return -EINVAL;
tlb_single_page_flush_ceiling = ceiling;
return count;
}
static const struct file_operations fops_tlbflush = {
.read = tlbflush_read_file,
.write = tlbflush_write_file,
.llseek = default_llseek,
};
static int __init create_tlb_single_page_flush_ceiling(void)
{
debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
arch_debugfs_dir, NULL, &fops_tlbflush);
return 0;
}
late_initcall(create_tlb_single_page_flush_ceiling);