mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-25 23:05:23 +07:00
71b3c126e6
When switch_mm() activates a new PGD, it also sets a bit that tells other CPUs that the PGD is in use so that TLB flush IPIs will be sent. In order for that to work correctly, the bit needs to be visible prior to loading the PGD and therefore starting to fill the local TLB. Document all the barriers that make this work correctly and add a couple that were missing. Signed-off-by: Andy Lutomirski <luto@kernel.org> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Andy Lutomirski <luto@amacapital.net> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Cc: stable@vger.kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
278 lines
7.5 KiB
C
278 lines
7.5 KiB
C
#ifndef _ASM_X86_MMU_CONTEXT_H
|
|
#define _ASM_X86_MMU_CONTEXT_H
|
|
|
|
#include <asm/desc.h>
|
|
#include <linux/atomic.h>
|
|
#include <linux/mm_types.h>
|
|
|
|
#include <trace/events/tlb.h>
|
|
|
|
#include <asm/pgalloc.h>
|
|
#include <asm/tlbflush.h>
|
|
#include <asm/paravirt.h>
|
|
#include <asm/mpx.h>
|
|
#ifndef CONFIG_PARAVIRT
|
|
static inline void paravirt_activate_mm(struct mm_struct *prev,
|
|
struct mm_struct *next)
|
|
{
|
|
}
|
|
#endif /* !CONFIG_PARAVIRT */
|
|
|
|
#ifdef CONFIG_PERF_EVENTS
|
|
extern struct static_key rdpmc_always_available;
|
|
|
|
static inline void load_mm_cr4(struct mm_struct *mm)
|
|
{
|
|
if (static_key_false(&rdpmc_always_available) ||
|
|
atomic_read(&mm->context.perf_rdpmc_allowed))
|
|
cr4_set_bits(X86_CR4_PCE);
|
|
else
|
|
cr4_clear_bits(X86_CR4_PCE);
|
|
}
|
|
#else
|
|
static inline void load_mm_cr4(struct mm_struct *mm) {}
|
|
#endif
|
|
|
|
#ifdef CONFIG_MODIFY_LDT_SYSCALL
|
|
/*
|
|
* ldt_structs can be allocated, used, and freed, but they are never
|
|
* modified while live.
|
|
*/
|
|
struct ldt_struct {
|
|
/*
|
|
* Xen requires page-aligned LDTs with special permissions. This is
|
|
* needed to prevent us from installing evil descriptors such as
|
|
* call gates. On native, we could merge the ldt_struct and LDT
|
|
* allocations, but it's not worth trying to optimize.
|
|
*/
|
|
struct desc_struct *entries;
|
|
int size;
|
|
};
|
|
|
|
/*
|
|
* Used for LDT copy/destruction.
|
|
*/
|
|
int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
|
|
void destroy_context(struct mm_struct *mm);
|
|
#else /* CONFIG_MODIFY_LDT_SYSCALL */
|
|
static inline int init_new_context(struct task_struct *tsk,
|
|
struct mm_struct *mm)
|
|
{
|
|
return 0;
|
|
}
|
|
static inline void destroy_context(struct mm_struct *mm) {}
|
|
#endif
|
|
|
|
static inline void load_mm_ldt(struct mm_struct *mm)
|
|
{
|
|
#ifdef CONFIG_MODIFY_LDT_SYSCALL
|
|
struct ldt_struct *ldt;
|
|
|
|
/* lockless_dereference synchronizes with smp_store_release */
|
|
ldt = lockless_dereference(mm->context.ldt);
|
|
|
|
/*
|
|
* Any change to mm->context.ldt is followed by an IPI to all
|
|
* CPUs with the mm active. The LDT will not be freed until
|
|
* after the IPI is handled by all such CPUs. This means that,
|
|
* if the ldt_struct changes before we return, the values we see
|
|
* will be safe, and the new values will be loaded before we run
|
|
* any user code.
|
|
*
|
|
* NB: don't try to convert this to use RCU without extreme care.
|
|
* We would still need IRQs off, because we don't want to change
|
|
* the local LDT after an IPI loaded a newer value than the one
|
|
* that we can see.
|
|
*/
|
|
|
|
if (unlikely(ldt))
|
|
set_ldt(ldt->entries, ldt->size);
|
|
else
|
|
clear_LDT();
|
|
#else
|
|
clear_LDT();
|
|
#endif
|
|
|
|
DEBUG_LOCKS_WARN_ON(preemptible());
|
|
}
|
|
|
|
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
|
|
{
|
|
#ifdef CONFIG_SMP
|
|
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
|
|
this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
|
|
#endif
|
|
}
|
|
|
|
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
|
|
struct task_struct *tsk)
|
|
{
|
|
unsigned cpu = smp_processor_id();
|
|
|
|
if (likely(prev != next)) {
|
|
#ifdef CONFIG_SMP
|
|
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
|
|
this_cpu_write(cpu_tlbstate.active_mm, next);
|
|
#endif
|
|
cpumask_set_cpu(cpu, mm_cpumask(next));
|
|
|
|
/*
|
|
* Re-load page tables.
|
|
*
|
|
* This logic has an ordering constraint:
|
|
*
|
|
* CPU 0: Write to a PTE for 'next'
|
|
* CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
|
|
* CPU 1: set bit 1 in next's mm_cpumask
|
|
* CPU 1: load from the PTE that CPU 0 writes (implicit)
|
|
*
|
|
* We need to prevent an outcome in which CPU 1 observes
|
|
* the new PTE value and CPU 0 observes bit 1 clear in
|
|
* mm_cpumask. (If that occurs, then the IPI will never
|
|
* be sent, and CPU 0's TLB will contain a stale entry.)
|
|
*
|
|
* The bad outcome can occur if either CPU's load is
|
|
* reordered before that CPU's store, so both CPUs much
|
|
* execute full barriers to prevent this from happening.
|
|
*
|
|
* Thus, switch_mm needs a full barrier between the
|
|
* store to mm_cpumask and any operation that could load
|
|
* from next->pgd. This barrier synchronizes with
|
|
* remote TLB flushers. Fortunately, load_cr3 is
|
|
* serializing and thus acts as a full barrier.
|
|
*
|
|
*/
|
|
load_cr3(next->pgd);
|
|
|
|
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
|
|
|
|
/* Stop flush ipis for the previous mm */
|
|
cpumask_clear_cpu(cpu, mm_cpumask(prev));
|
|
|
|
/* Load per-mm CR4 state */
|
|
load_mm_cr4(next);
|
|
|
|
#ifdef CONFIG_MODIFY_LDT_SYSCALL
|
|
/*
|
|
* Load the LDT, if the LDT is different.
|
|
*
|
|
* It's possible that prev->context.ldt doesn't match
|
|
* the LDT register. This can happen if leave_mm(prev)
|
|
* was called and then modify_ldt changed
|
|
* prev->context.ldt but suppressed an IPI to this CPU.
|
|
* In this case, prev->context.ldt != NULL, because we
|
|
* never set context.ldt to NULL while the mm still
|
|
* exists. That means that next->context.ldt !=
|
|
* prev->context.ldt, because mms never share an LDT.
|
|
*/
|
|
if (unlikely(prev->context.ldt != next->context.ldt))
|
|
load_mm_ldt(next);
|
|
#endif
|
|
}
|
|
#ifdef CONFIG_SMP
|
|
else {
|
|
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
|
|
BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
|
|
|
|
if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
|
|
/*
|
|
* On established mms, the mm_cpumask is only changed
|
|
* from irq context, from ptep_clear_flush() while in
|
|
* lazy tlb mode, and here. Irqs are blocked during
|
|
* schedule, protecting us from simultaneous changes.
|
|
*/
|
|
cpumask_set_cpu(cpu, mm_cpumask(next));
|
|
|
|
/*
|
|
* We were in lazy tlb mode and leave_mm disabled
|
|
* tlb flush IPI delivery. We must reload CR3
|
|
* to make sure to use no freed page tables.
|
|
*
|
|
* As above, this is a barrier that forces
|
|
* TLB repopulation to be ordered after the
|
|
* store to mm_cpumask.
|
|
*/
|
|
load_cr3(next->pgd);
|
|
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
|
|
load_mm_cr4(next);
|
|
load_mm_ldt(next);
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
|
|
#define activate_mm(prev, next) \
|
|
do { \
|
|
paravirt_activate_mm((prev), (next)); \
|
|
switch_mm((prev), (next), NULL); \
|
|
} while (0);
|
|
|
|
#ifdef CONFIG_X86_32
|
|
#define deactivate_mm(tsk, mm) \
|
|
do { \
|
|
lazy_load_gs(0); \
|
|
} while (0)
|
|
#else
|
|
#define deactivate_mm(tsk, mm) \
|
|
do { \
|
|
load_gs_index(0); \
|
|
loadsegment(fs, 0); \
|
|
} while (0)
|
|
#endif
|
|
|
|
static inline void arch_dup_mmap(struct mm_struct *oldmm,
|
|
struct mm_struct *mm)
|
|
{
|
|
paravirt_arch_dup_mmap(oldmm, mm);
|
|
}
|
|
|
|
static inline void arch_exit_mmap(struct mm_struct *mm)
|
|
{
|
|
paravirt_arch_exit_mmap(mm);
|
|
}
|
|
|
|
#ifdef CONFIG_X86_64
|
|
static inline bool is_64bit_mm(struct mm_struct *mm)
|
|
{
|
|
return !config_enabled(CONFIG_IA32_EMULATION) ||
|
|
!(mm->context.ia32_compat == TIF_IA32);
|
|
}
|
|
#else
|
|
static inline bool is_64bit_mm(struct mm_struct *mm)
|
|
{
|
|
return false;
|
|
}
|
|
#endif
|
|
|
|
static inline void arch_bprm_mm_init(struct mm_struct *mm,
|
|
struct vm_area_struct *vma)
|
|
{
|
|
mpx_mm_init(mm);
|
|
}
|
|
|
|
static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
|
|
unsigned long start, unsigned long end)
|
|
{
|
|
/*
|
|
* mpx_notify_unmap() goes and reads a rarely-hot
|
|
* cacheline in the mm_struct. That can be expensive
|
|
* enough to be seen in profiles.
|
|
*
|
|
* The mpx_notify_unmap() call and its contents have been
|
|
* observed to affect munmap() performance on hardware
|
|
* where MPX is not present.
|
|
*
|
|
* The unlikely() optimizes for the fast case: no MPX
|
|
* in the CPU, or no MPX use in the process. Even if
|
|
* we get this wrong (in the unlikely event that MPX
|
|
* is widely enabled on some system) the overhead of
|
|
* MPX itself (reading bounds tables) is expected to
|
|
* overwhelm the overhead of getting this unlikely()
|
|
* consistently wrong.
|
|
*/
|
|
if (unlikely(cpu_feature_enabled(X86_FEATURE_MPX)))
|
|
mpx_notify_unmap(mm, vma, start, end);
|
|
}
|
|
|
|
#endif /* _ASM_X86_MMU_CONTEXT_H */
|