From f39681ed0f48498b80455095376f11535feea332 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 29 Jun 2017 08:53:15 -0700 Subject: [PATCH 001/149] x86/mm: Give each mm TLB flush generation a unique ID This adds two new variables to mmu_context_t: ctx_id and tlb_gen. ctx_id uniquely identifies the mm_struct and will never be reused. For a given mm_struct (and hence ctx_id), tlb_gen is a monotonic count of the number of times that a TLB flush has been requested. The pair (ctx_id, tlb_gen) can be used as an identifier for TLB flush actions and will be used in subsequent patches to reliably determine whether all needed TLB flushes have occurred on a given CPU. This patch is split out for ease of review. By itself, it has no real effect other than creating and updating the new variables. Signed-off-by: Andy Lutomirski Reviewed-by: Nadav Amit Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dave Hansen Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/413a91c24dab3ed0caa5f4e4d017d87b0857f920.1498751203.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu.h | 25 +++++++++++++++++++++++-- arch/x86/include/asm/mmu_context.h | 6 ++++++ arch/x86/include/asm/tlbflush.h | 18 ++++++++++++++++++ arch/x86/mm/tlb.c | 6 ++++-- 4 files changed, 51 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 79b647a7ebd0..bb8c597c2248 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -3,12 +3,28 @@ #include #include +#include /* - * The x86 doesn't have a mmu context, but - * we put the segment information here. + * x86 has arch-specific MMU state beyond what lives in mm_struct. */ typedef struct { + /* + * ctx_id uniquely identifies this mm_struct. A ctx_id will never + * be reused, and zero is not a valid ctx_id. + */ + u64 ctx_id; + + /* + * Any code that needs to do any sort of TLB flushing for this + * mm will first make its changes to the page tables, then + * increment tlb_gen, then flush. This lets the low-level + * flushing code keep track of what needs flushing. + * + * This is not used on Xen PV. + */ + atomic64_t tlb_gen; + #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; #endif @@ -37,6 +53,11 @@ typedef struct { #endif } mm_context_t; +#define INIT_MM_CONTEXT(mm) \ + .context = { \ + .ctx_id = 1, \ + } + void leave_mm(int cpu); #endif /* _ASM_X86_MMU_H */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index ecfcb6643c9b..ae19b9d11259 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -12,6 +12,9 @@ #include #include #include + +extern atomic64_t last_mm_ctx_id; + #ifndef CONFIG_PARAVIRT static inline void paravirt_activate_mm(struct mm_struct *prev, struct mm_struct *next) @@ -132,6 +135,9 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { + mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); + atomic64_set(&mm->context.tlb_gen, 0); + #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { /* pkey 0 is the default and always allocated */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 50ea3482e1d1..ad2135385699 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -57,6 +57,23 @@ static inline void invpcid_flush_all_nonglobals(void) __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); } +static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) +{ + u64 new_tlb_gen; + + /* + * Bump the generation count. This also serves as a full barrier + * that synchronizes with switch_mm(): callers are required to order + * their read of mm_cpumask after their writes to the paging + * structures. + */ + smp_mb__before_atomic(); + new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen); + smp_mb__after_atomic(); + + return new_tlb_gen; +} + #ifdef CONFIG_PARAVIRT #include #else @@ -262,6 +279,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, struct mm_struct *mm) { + inc_mm_tlb_gen(mm); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); } diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 014d07a80053..14f4f8f66aa8 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -28,6 +28,8 @@ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi */ +atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); + void leave_mm(int cpu) { struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); @@ -250,8 +252,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, cpu = get_cpu(); - /* Synchronize with switch_mm. */ - smp_mb(); + /* This is also a barrier that synchronizes with switch_mm(). */ + inc_mm_tlb_gen(mm); /* Should we flush just the requested range? */ if ((end != TLB_FLUSH_ALL) && From b0579ade7cd82391360e959cc844e50a160e8a96 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 29 Jun 2017 08:53:16 -0700 Subject: [PATCH 002/149] x86/mm: Track the TLB's tlb_gen and update the flushing algorithm There are two kernel features that would benefit from tracking how up-to-date each CPU's TLB is in the case where IPIs aren't keeping it up to date in real time: - Lazy mm switching currently works by switching to init_mm when it would otherwise flush. This is wasteful: there isn't fundamentally any need to update CR3 at all when going lazy or when returning from lazy mode, nor is there any need to receive flush IPIs at all. Instead, we should just stop trying to keep the TLB coherent when we go lazy and, when unlazying, check whether we missed any flushes. - PCID will let us keep recent user contexts alive in the TLB. If we start doing this, we need a way to decide whether those contexts are up to date. On some paravirt systems, remote TLBs can be flushed without IPIs. This won't update the target CPUs' tlb_gens, which may cause unnecessary local flushes later on. We can address this if it becomes a problem by carefully updating the target CPU's tlb_gen directly. By itself, this patch is a very minor optimization that avoids unnecessary flushes when multiple TLB flushes targetting the same CPU race. The complexity in this patch would not be worth it on its own, but it will enable improved lazy TLB tracking and PCID. Signed-off-by: Andy Lutomirski Reviewed-by: Nadav Amit Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dave Hansen Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1210fb244bc9cbe7677f7f0b72db4d359675f24b.1498751203.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 43 +++++++++++++- arch/x86/mm/tlb.c | 102 +++++++++++++++++++++++++++++--- 2 files changed, 135 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index ad2135385699..d7df54cc7e4d 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -82,6 +82,11 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) #endif +struct tlb_context { + u64 ctx_id; + u64 tlb_gen; +}; + struct tlb_state { /* * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts @@ -97,6 +102,21 @@ struct tlb_state { * disabling interrupts when modifying either one. */ unsigned long cr4; + + /* + * This is a list of all contexts that might exist in the TLB. + * Since we don't yet use PCID, there is only one context. + * + * For each context, ctx_id indicates which mm the TLB's user + * entries came from. As an invariant, the TLB will never + * contain entries that are out-of-date as when that mm reached + * the tlb_gen in the list. + * + * To be clear, this means that it's legal for the TLB code to + * flush the TLB without updating tlb_gen. This can happen + * (for now, at least) due to paravirt remote flushes. + */ + struct tlb_context ctxs[1]; }; DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); @@ -248,9 +268,26 @@ static inline void __flush_tlb_one(unsigned long addr) * and page-granular flushes are available only on i486 and up. */ struct flush_tlb_info { - struct mm_struct *mm; - unsigned long start; - unsigned long end; + /* + * We support several kinds of flushes. + * + * - Fully flush a single mm. .mm will be set, .end will be + * TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to + * which the IPI sender is trying to catch us up. + * + * - Partially flush a single mm. .mm will be set, .start and + * .end will indicate the range, and .new_tlb_gen will be set + * such that the changes between generation .new_tlb_gen-1 and + * .new_tlb_gen are entirely contained in the indicated range. + * + * - Fully flush all mms whose tlb_gens have been updated. .mm + * will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen + * will be zero. + */ + struct mm_struct *mm; + unsigned long start; + unsigned long end; + u64 new_tlb_gen; }; #define local_flush_tlb() __flush_tlb() diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 14f4f8f66aa8..4e5a5ddb9e4d 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -105,6 +105,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, } this_cpu_write(cpu_tlbstate.loaded_mm, next); + this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, atomic64_read(&next->context.tlb_gen)); WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); cpumask_set_cpu(cpu, mm_cpumask(next)); @@ -155,25 +157,102 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, switch_ldt(real_prev, next); } +/* + * flush_tlb_func_common()'s memory ordering requirement is that any + * TLB fills that happen after we flush the TLB are ordered after we + * read active_mm's tlb_gen. We don't need any explicit barriers + * because all x86 flush operations are serializing and the + * atomic64_read operation won't be reordered by the compiler. + */ static void flush_tlb_func_common(const struct flush_tlb_info *f, bool local, enum tlb_flush_reason reason) { + /* + * We have three different tlb_gen values in here. They are: + * + * - mm_tlb_gen: the latest generation. + * - local_tlb_gen: the generation that this CPU has already caught + * up to. + * - f->new_tlb_gen: the generation that the requester of the flush + * wants us to catch up to. + */ + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); + u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen); + /* This code cannot presently handle being reentered. */ VM_WARN_ON(!irqs_disabled()); + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) != + loaded_mm->context.ctx_id); + if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) { + /* + * leave_mm() is adequate to handle any type of flush, and + * we would prefer not to receive further IPIs. leave_mm() + * clears this CPU's bit in mm_cpumask(). + */ leave_mm(smp_processor_id()); return; } - if (f->end == TLB_FLUSH_ALL) { - local_flush_tlb(); - if (local) - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - trace_tlb_flush(reason, TLB_FLUSH_ALL); - } else { + if (unlikely(local_tlb_gen == mm_tlb_gen)) { + /* + * There's nothing to do: we're already up to date. This can + * happen if two concurrent flushes happen -- the first flush to + * be handled can catch us all the way up, leaving no work for + * the second flush. + */ + return; + } + + WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen); + WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen); + + /* + * If we get to this point, we know that our TLB is out of date. + * This does not strictly imply that we need to flush (it's + * possible that f->new_tlb_gen <= local_tlb_gen), but we're + * going to need to flush in the very near future, so we might + * as well get it over with. + * + * The only question is whether to do a full or partial flush. + * + * We do a partial flush if requested and two extra conditions + * are met: + * + * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that + * we've always done all needed flushes to catch up to + * local_tlb_gen. If, for example, local_tlb_gen == 2 and + * f->new_tlb_gen == 3, then we know that the flush needed to bring + * us up to date for tlb_gen 3 is the partial flush we're + * processing. + * + * As an example of why this check is needed, suppose that there + * are two concurrent flushes. The first is a full flush that + * changes context.tlb_gen from 1 to 2. The second is a partial + * flush that changes context.tlb_gen from 2 to 3. If they get + * processed on this CPU in reverse order, we'll see + * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL. + * If we were to use __flush_tlb_single() and set local_tlb_gen to + * 3, we'd be break the invariant: we'd update local_tlb_gen above + * 1 without the full flush that's needed for tlb_gen 2. + * + * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation. + * Partial TLB flushes are not all that much cheaper than full TLB + * flushes, so it seems unlikely that it would be a performance win + * to do a partial flush if that won't bring our TLB fully up to + * date. By doing a full flush instead, we can increase + * local_tlb_gen all the way to mm_tlb_gen and we can probably + * avoid another flush in the very near future. + */ + if (f->end != TLB_FLUSH_ALL && + f->new_tlb_gen == local_tlb_gen + 1 && + f->new_tlb_gen == mm_tlb_gen) { + /* Partial flush */ unsigned long addr; unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; + addr = f->start; while (addr < f->end) { __flush_tlb_single(addr); @@ -182,7 +261,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, if (local) count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); trace_tlb_flush(reason, nr_pages); + } else { + /* Full flush. */ + local_flush_tlb(); + if (local) + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + trace_tlb_flush(reason, TLB_FLUSH_ALL); } + + /* Both paths above update our state to mm_tlb_gen. */ + this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, mm_tlb_gen); } static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) @@ -253,7 +341,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, cpu = get_cpu(); /* This is also a barrier that synchronizes with switch_mm(). */ - inc_mm_tlb_gen(mm); + info.new_tlb_gen = inc_mm_tlb_gen(mm); /* Should we flush just the requested range? */ if ((end != TLB_FLUSH_ALL) && From 94b1b03b519b81c494900cb112aa00ed205cc2d9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 29 Jun 2017 08:53:17 -0700 Subject: [PATCH 003/149] x86/mm: Rework lazy TLB mode and TLB freshness tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit x86's lazy TLB mode used to be fairly weak -- it would switch to init_mm the first time it tried to flush a lazy TLB. This meant an unnecessary CR3 write and, if the flush was remote, an unnecessary IPI. Rewrite it entirely. When we enter lazy mode, we simply remove the CPU from mm_cpumask. This means that we need a way to figure out whether we've missed a flush when we switch back out of lazy mode. I use the tlb_gen machinery to track whether a context is up to date. Note to reviewers: this patch, my itself, looks a bit odd. I'm using an array of length 1 containing (ctx_id, tlb_gen) rather than just storing tlb_gen, and making it at array isn't necessary yet. I'm doing this because the next few patches add PCID support, and, with PCID, we need ctx_id, and the array will end up with a length greater than 1. Making it an array now means that there will be less churn and therefore less stress on your eyeballs. NB: This is dubious but, AFAICT, still correct on Xen and UV. xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this patch changes the way that mm_cpumask() works. This should be okay, since Xen *also* iterates all online CPUs to find all the CPUs it needs to twiddle. The UV tlbflush code is rather dated and should be changed. Here are some benchmark results, done on a Skylake laptop at 2.3 GHz (turbo off, intel_pstate requesting max performance) under KVM with the guest using idle=poll (to avoid artifacts when bouncing between CPUs). I haven't done any real statistics here -- I just ran them in a loop and picked the fastest results that didn't look like outliers. Unpatched means commit a4eb8b993554, so all the bookkeeping overhead is gone. MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU. This is intended to be a nearly worst-case test. patched: 13.4µs unpatched: 21.6µs Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores), nrounds = 100, 256M data patched: 1.1 seconds or so unpatched: 1.9 seconds or so The sleepup on Vitaly's test appearss to be because it spends a lot of time blocked on mmap_sem, and this patch avoids sending IPIs to blocked CPUs. Signed-off-by: Andy Lutomirski Reviewed-by: Nadav Amit Reviewed-by: Thomas Gleixner Cc: Andrew Banman Cc: Andrew Morton Cc: Arjan van de Ven Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Dave Hansen Cc: Dimitri Sivanich Cc: Juergen Gross Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Travis Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context.h | 6 +- arch/x86/include/asm/tlbflush.h | 4 - arch/x86/mm/init.c | 1 - arch/x86/mm/tlb.c | 207 +++++++++++++++++------------ arch/x86/xen/mmu_pv.c | 5 +- 5 files changed, 129 insertions(+), 94 deletions(-) diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index ae19b9d11259..85f6b5575aad 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -128,8 +128,10 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) - this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); + int cpu = smp_processor_id(); + + if (cpumask_test_cpu(cpu, mm_cpumask(mm))) + cpumask_clear_cpu(cpu, mm_cpumask(mm)); } static inline int init_new_context(struct task_struct *tsk, diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index d7df54cc7e4d..06e997a36d49 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -95,7 +95,6 @@ struct tlb_state { * mode even if we've already switched back to swapper_pg_dir. */ struct mm_struct *loaded_mm; - int state; /* * Access to this CR4 shadow and to H/W CR4 is protected by @@ -310,9 +309,6 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) void native_flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info); -#define TLBSTATE_OK 1 -#define TLBSTATE_LAZY 2 - static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, struct mm_struct *mm) { diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 673541eb3b3f..4d353efb2838 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -812,7 +812,6 @@ void __init zone_sizes_init(void) DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { .loaded_mm = &init_mm, - .state = 0, .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ }; EXPORT_SYMBOL_GPL(cpu_tlbstate); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 4e5a5ddb9e4d..0982c997d36f 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -45,8 +45,8 @@ void leave_mm(int cpu) if (loaded_mm == &init_mm) return; - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) - BUG(); + /* Warn if we're not lazy. */ + WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))); switch_mm(NULL, &init_mm, NULL); } @@ -65,94 +65,117 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { - unsigned cpu = smp_processor_id(); struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); + unsigned cpu = smp_processor_id(); + u64 next_tlb_gen; /* - * NB: The scheduler will call us with prev == next when - * switching from lazy TLB mode to normal mode if active_mm - * isn't changing. When this happens, there is no guarantee - * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next. + * NB: The scheduler will call us with prev == next when switching + * from lazy TLB mode to normal mode if active_mm isn't changing. + * When this happens, we don't assume that CR3 (and hence + * cpu_tlbstate.loaded_mm) matches next. * * NB: leave_mm() calls us with prev == NULL and tsk == NULL. */ - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + /* We don't want flush_tlb_func_* to run concurrently with us. */ + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) + WARN_ON_ONCE(!irqs_disabled()); + + /* + * Verify that CR3 is what we think it is. This will catch + * hypothetical buggy code that directly switches to swapper_pg_dir + * without going through leave_mm() / switch_mm_irqs_off(). + */ + VM_BUG_ON(read_cr3_pa() != __pa(real_prev->pgd)); if (real_prev == next) { + VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) != + next->context.ctx_id); + + if (cpumask_test_cpu(cpu, mm_cpumask(next))) { + /* + * There's nothing to do: we weren't lazy, and we + * aren't changing our mm. We don't need to flush + * anything, nor do we need to update CR3, CR4, or + * LDTR. + */ + return; + } + + /* Resume remote flushes and then read tlb_gen. */ + cpumask_set_cpu(cpu, mm_cpumask(next)); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + + if (this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen) < next_tlb_gen) { + /* + * Ideally, we'd have a flush_tlb() variant that + * takes the known CR3 value as input. This would + * be faster on Xen PV and on hypothetical CPUs + * on which INVPCID is fast. + */ + this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, + next_tlb_gen); + write_cr3(__pa(next->pgd)); + + /* + * This gets called via leave_mm() in the idle path + * where RCU functions differently. Tracing normally + * uses RCU, so we have to call the tracepoint + * specially here. + */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, + TLB_FLUSH_ALL); + } + /* - * There's nothing to do: we always keep the per-mm control - * regs in sync with cpu_tlbstate.loaded_mm. Just - * sanity-check mm_cpumask. + * We just exited lazy mode, which means that CR4 and/or LDTR + * may be stale. (Changes to the required CR4 and LDTR states + * are not reflected in tlb_gen.) */ - if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next)))) - cpumask_set_cpu(cpu, mm_cpumask(next)); - return; + } else { + VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) == + next->context.ctx_id); + + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + /* + * If our current stack is in vmalloc space and isn't + * mapped in the new pgd, we'll double-fault. Forcibly + * map it. + */ + unsigned int index = pgd_index(current_stack_pointer()); + pgd_t *pgd = next->pgd + index; + + if (unlikely(pgd_none(*pgd))) + set_pgd(pgd, init_mm.pgd[index]); + } + + /* Stop remote flushes for the previous mm */ + if (cpumask_test_cpu(cpu, mm_cpumask(real_prev))) + cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); + + VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); + + /* + * Start remote flushes and then read tlb_gen. + */ + cpumask_set_cpu(cpu, mm_cpumask(next)); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + + this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen); + this_cpu_write(cpu_tlbstate.loaded_mm, next); + write_cr3(__pa(next->pgd)); + + /* + * This gets called via leave_mm() in the idle path where RCU + * functions differently. Tracing normally uses RCU, so we + * have to call the tracepoint specially here. + */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, + TLB_FLUSH_ALL); } - if (IS_ENABLED(CONFIG_VMAP_STACK)) { - /* - * If our current stack is in vmalloc space and isn't - * mapped in the new pgd, we'll double-fault. Forcibly - * map it. - */ - unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); - - pgd_t *pgd = next->pgd + stack_pgd_index; - - if (unlikely(pgd_none(*pgd))) - set_pgd(pgd, init_mm.pgd[stack_pgd_index]); - } - - this_cpu_write(cpu_tlbstate.loaded_mm, next); - this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id); - this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, atomic64_read(&next->context.tlb_gen)); - - WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); - cpumask_set_cpu(cpu, mm_cpumask(next)); - - /* - * Re-load page tables. - * - * This logic has an ordering constraint: - * - * CPU 0: Write to a PTE for 'next' - * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. - * CPU 1: set bit 1 in next's mm_cpumask - * CPU 1: load from the PTE that CPU 0 writes (implicit) - * - * We need to prevent an outcome in which CPU 1 observes - * the new PTE value and CPU 0 observes bit 1 clear in - * mm_cpumask. (If that occurs, then the IPI will never - * be sent, and CPU 0's TLB will contain a stale entry.) - * - * The bad outcome can occur if either CPU's load is - * reordered before that CPU's store, so both CPUs must - * execute full barriers to prevent this from happening. - * - * Thus, switch_mm needs a full barrier between the - * store to mm_cpumask and any operation that could load - * from next->pgd. TLB fills are special and can happen - * due to instruction fetches or for no reason at all, - * and neither LOCK nor MFENCE orders them. - * Fortunately, load_cr3() is serializing and gives the - * ordering guarantee we need. - */ - load_cr3(next->pgd); - - /* - * This gets called via leave_mm() in the idle path where RCU - * functions differently. Tracing normally uses RCU, so we have to - * call the tracepoint specially here. - */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - - /* Stop flush ipis for the previous mm */ - WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && - real_prev != &init_mm); - cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); - - /* Load per-mm CR4 and LDTR state */ load_mm_cr4(next); switch_ldt(real_prev, next); } @@ -186,13 +209,13 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) != loaded_mm->context.ctx_id); - if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) { + if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) { /* - * leave_mm() is adequate to handle any type of flush, and - * we would prefer not to receive further IPIs. leave_mm() - * clears this CPU's bit in mm_cpumask(). + * We're in lazy mode -- don't flush. We can get here on + * remote flushes due to races and on local flushes if a + * kernel thread coincidentally flushes the mm it's lazily + * still using. */ - leave_mm(smp_processor_id()); return; } @@ -203,6 +226,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, * be handled can catch us all the way up, leaving no work for * the second flush. */ + trace_tlb_flush(reason, 0); return; } @@ -304,6 +328,21 @@ void native_flush_tlb_others(const struct cpumask *cpumask, (info->end - info->start) >> PAGE_SHIFT); if (is_uv_system()) { + /* + * This whole special case is confused. UV has a "Broadcast + * Assist Unit", which seems to be a fancy way to send IPIs. + * Back when x86 used an explicit TLB flush IPI, UV was + * optimized to use its own mechanism. These days, x86 uses + * smp_call_function_many(), but UV still uses a manual IPI, + * and that IPI's action is out of date -- it does a manual + * flush instead of calling flush_tlb_func_remote(). This + * means that the percpu tlb_gen variables won't be updated + * and we'll do pointless flushes on future context switches. + * + * Rather than hooking native_flush_tlb_others() here, I think + * that UV should be updated so that smp_call_function_many(), + * etc, are optimal on UV. + */ unsigned int cpu; cpu = smp_processor_id(); @@ -363,6 +402,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), &info); + put_cpu(); } @@ -371,8 +411,6 @@ static void do_flush_tlb_all(void *info) { count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); __flush_tlb_all(); - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) - leave_mm(smp_processor_id()); } void flush_tlb_all(void) @@ -425,6 +463,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) flush_tlb_others(&batch->cpumask, &info); + cpumask_clear(&batch->cpumask); put_cpu(); diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 1d7a7213a310..0472790ec20b 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1005,14 +1005,12 @@ static void xen_drop_mm_ref(struct mm_struct *mm) /* Get the "official" set of cpus referring to our pagetable. */ if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { for_each_online_cpu(cpu) { - if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) - && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) + if (per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) continue; smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1); } return; } - cpumask_copy(mask, mm_cpumask(mm)); /* * It's possible that a vcpu may have a stale reference to our @@ -1021,6 +1019,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm) * look at its actual current cr3 value, and force it to flush * if needed. */ + cpumask_clear(mask); for_each_online_cpu(cpu) { if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) cpumask_set_cpu(cpu, mask); From 43858b4f25cf0adc5c2ca9cf5ce5fdf2532941e5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 29 Jun 2017 08:53:18 -0700 Subject: [PATCH 004/149] x86/mm: Stop calling leave_mm() in idle code Now that lazy TLB suppresses all flush IPIs (as opposed to all but the first), there's no need to leave_mm() when going idle. This means we can get rid of the rcuidle hack in switch_mm_irqs_off() and we can unexport leave_mm(). This also removes acpi_unlazy_tlb() from the x86 and ia64 headers, since it has no callers any more. Signed-off-by: Andy Lutomirski Reviewed-by: Nadav Amit Reviewed-by: Borislav Petkov Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dave Hansen Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/03c699cfd6021e467be650d6b73deaccfe4b4bd7.1498751203.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/acpi.h | 2 -- arch/x86/include/asm/acpi.h | 2 -- arch/x86/mm/tlb.c | 20 +++----------------- drivers/acpi/processor_idle.c | 2 -- drivers/idle/intel_idle.c | 9 ++++----- 5 files changed, 7 insertions(+), 28 deletions(-) diff --git a/arch/ia64/include/asm/acpi.h b/arch/ia64/include/asm/acpi.h index a3d0211970e9..c86a947f5368 100644 --- a/arch/ia64/include/asm/acpi.h +++ b/arch/ia64/include/asm/acpi.h @@ -112,8 +112,6 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf) buf[2] |= ACPI_PDC_EST_CAPABILITY_SMP; } -#define acpi_unlazy_tlb(x) - #ifdef CONFIG_ACPI_NUMA extern cpumask_t early_cpu_possible_map; #define for_each_possible_early_cpu(cpu) \ diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 2efc768e4362..562286fa151f 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -150,8 +150,6 @@ static inline void disable_acpi(void) { } extern int x86_acpi_numa_init(void); #endif /* CONFIG_ACPI_NUMA */ -#define acpi_unlazy_tlb(x) leave_mm(x) - #ifdef CONFIG_ACPI_APEI static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) { diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 0982c997d36f..2c1b8881e9d3 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -50,7 +50,6 @@ void leave_mm(int cpu) switch_mm(NULL, &init_mm, NULL); } -EXPORT_SYMBOL_GPL(leave_mm); void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) @@ -117,15 +116,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen); write_cr3(__pa(next->pgd)); - - /* - * This gets called via leave_mm() in the idle path - * where RCU functions differently. Tracing normally - * uses RCU, so we have to call the tracepoint - * specially here. - */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, - TLB_FLUSH_ALL); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, + TLB_FLUSH_ALL); } /* @@ -167,13 +159,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.loaded_mm, next); write_cr3(__pa(next->pgd)); - /* - * This gets called via leave_mm() in the idle path where RCU - * functions differently. Tracing normally uses RCU, so we - * have to call the tracepoint specially here. - */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, - TLB_FLUSH_ALL); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } load_mm_cr4(next); diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 5c8aa9cf62d7..fe3d2a40f311 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -708,8 +708,6 @@ static DEFINE_RAW_SPINLOCK(c3_lock); static void acpi_idle_enter_bm(struct acpi_processor *pr, struct acpi_processor_cx *cx, bool timer_bc) { - acpi_unlazy_tlb(smp_processor_id()); - /* * Must be done before busmaster disable as we might need to * access HPET ! diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 216d7ec88c0c..2ae43f59091d 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -912,16 +912,15 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state = &drv->states[index]; unsigned long eax = flg2MWAIT(state->flags); unsigned int cstate; - int cpu = smp_processor_id(); cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1; /* - * leave_mm() to avoid costly and often unnecessary wakeups - * for flushing the user TLB's associated with the active mm. + * NB: if CPUIDLE_FLAG_TLB_FLUSHED is set, this idle transition + * will probably flush the TLB. It's not guaranteed to flush + * the TLB, though, so it's not clear that we can do anything + * useful with this knowledge. */ - if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED) - leave_mm(cpu); if (!(lapic_timer_reliable_states & (1 << (cstate)))) tick_broadcast_enter(); From cba4671af7550e008f7a7835f06df0763825bf3e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 29 Jun 2017 08:53:19 -0700 Subject: [PATCH 005/149] x86/mm: Disable PCID on 32-bit kernels 32-bit kernels on new hardware will see PCID in CPUID, but PCID can only be used in 64-bit mode. Rather than making all PCID code conditional, just disable the feature on 32-bit builds. Signed-off-by: Andy Lutomirski Reviewed-by: Nadav Amit Reviewed-by: Borislav Petkov Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dave Hansen Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/2e391769192a4d31b808410c383c6bf0734bc6ea.1498751203.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/disabled-features.h | 4 +++- arch/x86/kernel/cpu/bugs.c | 8 ++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 5dff775af7cd..c10c9128f54e 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -21,11 +21,13 @@ # define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) # define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31)) # define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31)) +# define DISABLE_PCID 0 #else # define DISABLE_VME 0 # define DISABLE_K6_MTRR 0 # define DISABLE_CYRIX_ARR 0 # define DISABLE_CENTAUR_MCR 0 +# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31)) #endif /* CONFIG_X86_64 */ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS @@ -49,7 +51,7 @@ #define DISABLED_MASK1 0 #define DISABLED_MASK2 0 #define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR) -#define DISABLED_MASK4 0 +#define DISABLED_MASK4 (DISABLE_PCID) #define DISABLED_MASK5 0 #define DISABLED_MASK6 0 #define DISABLED_MASK7 0 diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 0af86d9242da..db684880d74a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -21,6 +21,14 @@ void __init check_bugs(void) { +#ifdef CONFIG_X86_32 + /* + * Regardless of whether PCID is enumerated, the SDM says + * that it can't be enabled in 32-bit mode. + */ + setup_clear_cpu_cap(X86_FEATURE_PCID); +#endif + identify_boot_cpu(); if (!IS_ENABLED(CONFIG_SMP)) { From 0790c9aad84901ca1bdc14746175549c8b5da215 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 29 Jun 2017 08:53:20 -0700 Subject: [PATCH 006/149] x86/mm: Add the 'nopcid' boot option to turn off PCID The parameter is only present on x86_64 systems to save a few bytes, as PCID is always disabled on x86_32. Signed-off-by: Andy Lutomirski Reviewed-by: Nadav Amit Reviewed-by: Borislav Petkov Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dave Hansen Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/8bbb2e65bcd249a5f18bfb8128b4689f08ac2b60.1498751203.git.luto@kernel.org Signed-off-by: Ingo Molnar --- .../admin-guide/kernel-parameters.txt | 2 ++ arch/x86/kernel/cpu/common.c | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9b0b3dea6326..7037a0f86d03 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2657,6 +2657,8 @@ nopat [X86] Disable PAT (page attribute table extension of pagetables) support. + nopcid [X86-64] Disable the PCID cpu feature. + norandmaps Don't use address space randomization. Equivalent to echo 0 > /proc/sys/kernel/randomize_va_space diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c8b39870f33e..904485e7b230 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -168,6 +168,24 @@ static int __init x86_mpx_setup(char *s) } __setup("nompx", x86_mpx_setup); +#ifdef CONFIG_X86_64 +static int __init x86_pcid_setup(char *s) +{ + /* require an exact match without trailing characters */ + if (strlen(s)) + return 0; + + /* do not emit a message if the feature is not present */ + if (!boot_cpu_has(X86_FEATURE_PCID)) + return 1; + + setup_clear_cpu_cap(X86_FEATURE_PCID); + pr_info("nopcid: PCID feature disabled\n"); + return 1; +} +__setup("nopcid", x86_pcid_setup); +#endif + static int __init x86_noinvpcid_setup(char *s) { /* noinvpcid doesn't accept parameters */ From 660da7c9228f685b2ebe664f9fd69aaddcc420b5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 29 Jun 2017 08:53:21 -0700 Subject: [PATCH 007/149] x86/mm: Enable CR4.PCIDE on supported systems We can use PCID if the CPU has PCID and PGE and we're not on Xen. By itself, this has no effect. A followup patch will start using PCID. Signed-off-by: Andy Lutomirski Reviewed-by: Nadav Amit Reviewed-by: Boris Ostrovsky Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dave Hansen Cc: Juergen Gross Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/6327ecd907b32f79d5aa0d466f04503bbec5df88.1498751203.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 8 ++++++++ arch/x86/kernel/cpu/common.c | 22 ++++++++++++++++++++++ arch/x86/xen/enlighten_pv.c | 6 ++++++ 3 files changed, 36 insertions(+) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 06e997a36d49..6397275008db 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -243,6 +243,14 @@ static inline void __flush_tlb_all(void) __flush_tlb_global(); else __flush_tlb(); + + /* + * Note: if we somehow had PCID but not PGE, then this wouldn't work -- + * we'd end up flushing kernel translations for the current ASID but + * we might fail to flush kernel translations for other cached ASIDs. + * + * To avoid this issue, we force PCID off if PGE is off. + */ } static inline void __flush_tlb_one(unsigned long addr) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 904485e7b230..b95cd94ca97b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -329,6 +329,25 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) } } +static void setup_pcid(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_PCID)) { + if (cpu_has(c, X86_FEATURE_PGE)) { + cr4_set_bits(X86_CR4_PCIDE); + } else { + /* + * flush_tlb_all(), as currently implemented, won't + * work if PCID is on but PGE is not. Since that + * combination doesn't exist on real hardware, there's + * no reason to try to fully support it, but it's + * polite to avoid corrupting data if we're on + * an improperly configured VM. + */ + clear_cpu_cap(c, X86_FEATURE_PCID); + } + } +} + /* * Protection Keys are not available in 32-bit mode. */ @@ -1143,6 +1162,9 @@ static void identify_cpu(struct cpuinfo_x86 *c) setup_smep(c); setup_smap(c); + /* Set up PCID */ + setup_pcid(c); + /* * The vendor-specific functions might have changed features. * Now we do "generic changes." diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index f33eef4ebd12..a136aac543c3 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -295,6 +295,12 @@ static void __init xen_init_capabilities(void) setup_clear_cpu_cap(X86_FEATURE_ACC); setup_clear_cpu_cap(X86_FEATURE_X2APIC); + /* + * Xen PV would need some work to support PCID: CR3 handling as well + * as xen_flush_tlb_others() would need updating. + */ + setup_clear_cpu_cap(X86_FEATURE_PCID); + if (!xen_initial_domain()) setup_clear_cpu_cap(X86_FEATURE_ACPI); From 29178c1473fd4ad9c523b41bb18a047749c66d11 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 8 Jun 2017 18:10:28 +0200 Subject: [PATCH 008/149] ARC: defconfig: Cleanup from old Kconfig options Remove old, dead Kconfig option INET_LRO. It is gone since commit 7bbf3cae65b6 ("ipv4: Remove inet_lro library"). Signed-off-by: Krzysztof Kozlowski Signed-off-by: Vineet Gupta --- arch/arc/configs/nps_defconfig | 1 - arch/arc/configs/tb10x_defconfig | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/arc/configs/nps_defconfig b/arch/arc/configs/nps_defconfig index ede625c76216..7c9c706ae7f6 100644 --- a/arch/arc/configs/nps_defconfig +++ b/arch/arc/configs/nps_defconfig @@ -39,7 +39,6 @@ CONFIG_IP_PNP=y # CONFIG_INET_XFRM_MODE_TRANSPORT is not set # CONFIG_INET_XFRM_MODE_TUNNEL is not set # CONFIG_INET_XFRM_MODE_BEET is not set -# CONFIG_INET_LRO is not set # CONFIG_INET_DIAG is not set # CONFIG_IPV6 is not set # CONFIG_WIRELESS is not set diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig index 4c5118384eb5..f30182549395 100644 --- a/arch/arc/configs/tb10x_defconfig +++ b/arch/arc/configs/tb10x_defconfig @@ -38,7 +38,6 @@ CONFIG_IP_MULTICAST=y # CONFIG_INET_XFRM_MODE_TRANSPORT is not set # CONFIG_INET_XFRM_MODE_TUNNEL is not set # CONFIG_INET_XFRM_MODE_BEET is not set -# CONFIG_INET_LRO is not set # CONFIG_INET_DIAG is not set # CONFIG_IPV6 is not set # CONFIG_WIRELESS is not set From f62995c92a29e4d9331382b8b2461eef3b9c7c6b Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Sun, 9 Jul 2017 20:37:39 +0800 Subject: [PATCH 009/149] x86/boot/KASLR: Wrap e820 entries walking code into new function process_e820_entries() The original function process_e820_entry() only takes care of each e820 entry passed. And move the E820_TYPE_RAM checking logic into process_e820_entries(). And remove the redundent local variable 'addr' definition in find_random_phys_addr(). Signed-off-by: Baoquan He Acked-by: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: fanc.fnst@cn.fujitsu.com Cc: izumi.taku@jp.fujitsu.com Cc: matt@codeblueprint.co.uk Cc: thgarnie@google.com Link: http://lkml.kernel.org/r/1499603862-11516-2-git-send-email-bhe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/kaslr.c | 38 ++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 91f27ab970ef..1485f48aeda1 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -488,10 +488,6 @@ static void process_e820_entry(struct boot_e820_entry *entry, unsigned long start_orig, end; struct boot_e820_entry cur_entry; - /* Skip non-RAM entries. */ - if (entry->type != E820_TYPE_RAM) - return; - /* On 32-bit, ignore entries entirely above our maximum. */ if (IS_ENABLED(CONFIG_X86_32) && entry->addr >= KERNEL_IMAGE_SIZE) return; @@ -562,12 +558,29 @@ static void process_e820_entry(struct boot_e820_entry *entry, } } +static void process_e820_entries(unsigned long minimum, + unsigned long image_size) +{ + int i; + struct boot_e820_entry *entry; + + /* Verify potential e820 positions, appending to slots list. */ + for (i = 0; i < boot_params->e820_entries; i++) { + entry = &boot_params->e820_table[i]; + /* Skip non-RAM entries. */ + if (entry->type != E820_TYPE_RAM) + continue; + process_e820_entry(entry, minimum, image_size); + if (slot_area_index == MAX_SLOT_AREA) { + debug_putstr("Aborted e820 scan (slot_areas full)!\n"); + break; + } + } +} + static unsigned long find_random_phys_addr(unsigned long minimum, unsigned long image_size) { - int i; - unsigned long addr; - /* Check if we had too many memmaps. */ if (memmap_too_large) { debug_putstr("Aborted e820 scan (more than 4 memmap= args)!\n"); @@ -577,16 +590,7 @@ static unsigned long find_random_phys_addr(unsigned long minimum, /* Make sure minimum is aligned. */ minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); - /* Verify potential e820 positions, appending to slots list. */ - for (i = 0; i < boot_params->e820_entries; i++) { - process_e820_entry(&boot_params->e820_table[i], minimum, - image_size); - if (slot_area_index == MAX_SLOT_AREA) { - debug_putstr("Aborted e820 scan (slot_areas full)!\n"); - break; - } - } - + process_e820_entries(minimum, image_size); return slots_fetch_random(); } From 87891b01b54210763117f0a67b022cd94de6cd13 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Sun, 9 Jul 2017 20:37:40 +0800 Subject: [PATCH 010/149] x86/boot/KASLR: Switch to pass struct mem_vector to process_e820_entry() This makes process_e820_entry() be able to process any kind of memory region. Signed-off-by: Baoquan He Acked-by: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: fanc.fnst@cn.fujitsu.com Cc: izumi.taku@jp.fujitsu.com Cc: matt@codeblueprint.co.uk Cc: thgarnie@google.com Link: http://lkml.kernel.org/r/1499603862-11516-3-git-send-email-bhe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/kaslr.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 1485f48aeda1..36ff9f729c43 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -479,31 +479,31 @@ static unsigned long slots_fetch_random(void) return 0; } -static void process_e820_entry(struct boot_e820_entry *entry, +static void process_e820_entry(struct mem_vector *entry, unsigned long minimum, unsigned long image_size) { struct mem_vector region, overlap; struct slot_area slot_area; unsigned long start_orig, end; - struct boot_e820_entry cur_entry; + struct mem_vector cur_entry; /* On 32-bit, ignore entries entirely above our maximum. */ - if (IS_ENABLED(CONFIG_X86_32) && entry->addr >= KERNEL_IMAGE_SIZE) + if (IS_ENABLED(CONFIG_X86_32) && entry->start >= KERNEL_IMAGE_SIZE) return; /* Ignore entries entirely below our minimum. */ - if (entry->addr + entry->size < minimum) + if (entry->start + entry->size < minimum) return; /* Ignore entries above memory limit */ - end = min(entry->size + entry->addr, mem_limit); - if (entry->addr >= end) + end = min(entry->size + entry->start, mem_limit); + if (entry->start >= end) return; - cur_entry.addr = entry->addr; - cur_entry.size = end - entry->addr; + cur_entry.start = entry->start; + cur_entry.size = end - entry->start; - region.start = cur_entry.addr; + region.start = cur_entry.start; region.size = cur_entry.size; /* Give up if slot area array is full. */ @@ -518,7 +518,7 @@ static void process_e820_entry(struct boot_e820_entry *entry, region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); /* Did we raise the address above this e820 region? */ - if (region.start > cur_entry.addr + cur_entry.size) + if (region.start > cur_entry.start + cur_entry.size) return; /* Reduce size by any delta from the original address. */ @@ -562,6 +562,7 @@ static void process_e820_entries(unsigned long minimum, unsigned long image_size) { int i; + struct mem_vector region; struct boot_e820_entry *entry; /* Verify potential e820 positions, appending to slots list. */ @@ -570,7 +571,9 @@ static void process_e820_entries(unsigned long minimum, /* Skip non-RAM entries. */ if (entry->type != E820_TYPE_RAM) continue; - process_e820_entry(entry, minimum, image_size); + region.start = entry->addr; + region.size = entry->size; + process_e820_entry(®ion, minimum, image_size); if (slot_area_index == MAX_SLOT_AREA) { debug_putstr("Aborted e820 scan (slot_areas full)!\n"); break; From 27aac20574110abfd594175a668dc58b23b2b14a Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Sun, 9 Jul 2017 20:37:41 +0800 Subject: [PATCH 011/149] x86/boot/KASLR: Rename process_e820_entry() into process_mem_region() Now process_e820_entry() is not limited to e820 entry processing, rename it to process_mem_region(). And adjust the code comment accordingly. Signed-off-by: Baoquan He Acked-by: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: fanc.fnst@cn.fujitsu.com Cc: izumi.taku@jp.fujitsu.com Cc: matt@codeblueprint.co.uk Cc: thgarnie@google.com Link: http://lkml.kernel.org/r/1499603862-11516-4-git-send-email-bhe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/kaslr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 36ff9f729c43..99c7194f7ea6 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -479,7 +479,7 @@ static unsigned long slots_fetch_random(void) return 0; } -static void process_e820_entry(struct mem_vector *entry, +static void process_mem_region(struct mem_vector *entry, unsigned long minimum, unsigned long image_size) { @@ -517,7 +517,7 @@ static void process_e820_entry(struct mem_vector *entry, /* Potentially raise address to meet alignment needs. */ region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); - /* Did we raise the address above this e820 region? */ + /* Did we raise the address above the passed in memory entry? */ if (region.start > cur_entry.start + cur_entry.size) return; @@ -573,7 +573,7 @@ static void process_e820_entries(unsigned long minimum, continue; region.start = entry->addr; region.size = entry->size; - process_e820_entry(®ion, minimum, image_size); + process_mem_region(®ion, minimum, image_size); if (slot_area_index == MAX_SLOT_AREA) { debug_putstr("Aborted e820 scan (slot_areas full)!\n"); break; From c262f3b9a3246da87c66ce398cd7e30d8f1529ea Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:09:58 -0500 Subject: [PATCH 012/149] x86/cpu/AMD: Document AMD Secure Memory Encryption (SME) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create a Documentation entry to describe the AMD Secure Memory Encryption (SME) feature and add documentation for the mem_encrypt= kernel parameter. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/ca0a0c13b055fd804cfc92cbaca8acd68057eed0.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- .../admin-guide/kernel-parameters.txt | 11 +++ Documentation/x86/amd-memory-encryption.txt | 68 +++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 Documentation/x86/amd-memory-encryption.txt diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f701430f4894..372cc66bba23 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2233,6 +2233,17 @@ memory contents and reserves bad memory regions that are detected. + mem_encrypt= [X86-64] AMD Secure Memory Encryption (SME) control + Valid arguments: on, off + Default (depends on kernel configuration option): + on (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y) + off (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=n) + mem_encrypt=on: Activate SME + mem_encrypt=off: Do not activate SME + + Refer to Documentation/x86/amd-memory-encryption.txt + for details on when memory encryption can be activated. + mem_sleep_default= [SUSPEND] Default system suspend mode: s2idle - Suspend-To-Idle shallow - Power-On Suspend or equivalent (if supported) diff --git a/Documentation/x86/amd-memory-encryption.txt b/Documentation/x86/amd-memory-encryption.txt new file mode 100644 index 000000000000..f512ab718541 --- /dev/null +++ b/Documentation/x86/amd-memory-encryption.txt @@ -0,0 +1,68 @@ +Secure Memory Encryption (SME) is a feature found on AMD processors. + +SME provides the ability to mark individual pages of memory as encrypted using +the standard x86 page tables. A page that is marked encrypted will be +automatically decrypted when read from DRAM and encrypted when written to +DRAM. SME can therefore be used to protect the contents of DRAM from physical +attacks on the system. + +A page is encrypted when a page table entry has the encryption bit set (see +below on how to determine its position). The encryption bit can also be +specified in the cr3 register, allowing the PGD table to be encrypted. Each +successive level of page tables can also be encrypted by setting the encryption +bit in the page table entry that points to the next table. This allows the full +page table hierarchy to be encrypted. Note, this means that just because the +encryption bit is set in cr3, doesn't imply the full hierarchy is encyrpted. +Each page table entry in the hierarchy needs to have the encryption bit set to +achieve that. So, theoretically, you could have the encryption bit set in cr3 +so that the PGD is encrypted, but not set the encryption bit in the PGD entry +for a PUD which results in the PUD pointed to by that entry to not be +encrypted. + +Support for SME can be determined through the CPUID instruction. The CPUID +function 0x8000001f reports information related to SME: + + 0x8000001f[eax]: + Bit[0] indicates support for SME + 0x8000001f[ebx]: + Bits[5:0] pagetable bit number used to activate memory + encryption + Bits[11:6] reduction in physical address space, in bits, when + memory encryption is enabled (this only affects + system physical addresses, not guest physical + addresses) + +If support for SME is present, MSR 0xc00100010 (MSR_K8_SYSCFG) can be used to +determine if SME is enabled and/or to enable memory encryption: + + 0xc0010010: + Bit[23] 0 = memory encryption features are disabled + 1 = memory encryption features are enabled + +Linux relies on BIOS to set this bit if BIOS has determined that the reduction +in the physical address space as a result of enabling memory encryption (see +CPUID information above) will not conflict with the address space resource +requirements for the system. If this bit is not set upon Linux startup then +Linux itself will not set it and memory encryption will not be possible. + +The state of SME in the Linux kernel can be documented as follows: + - Supported: + The CPU supports SME (determined through CPUID instruction). + + - Enabled: + Supported and bit 23 of MSR_K8_SYSCFG is set. + + - Active: + Supported, Enabled and the Linux kernel is actively applying + the encryption bit to page table entries (the SME mask in the + kernel is non-zero). + +SME can also be enabled and activated in the BIOS. If SME is enabled and +activated in the BIOS, then all memory accesses will be encrypted and it will +not be necessary to activate the Linux memory encryption support. If the BIOS +merely enables SME (sets bit 23 of the MSR_K8_SYSCFG), then Linux can activate +memory encryption by default (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y) or +by supplying mem_encrypt=on on the kernel command line. However, if BIOS does +not enable SME, then Linux will not be able to activate memory encryption, even +if configured to do so by default or the mem_encrypt=on command line parameter +is specified. From aac7b79eea6118dee3da9b99dcd564471672806d Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:09:59 -0500 Subject: [PATCH 013/149] x86/mm/pat: Set write-protect cache mode for full PAT support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For processors that support PAT, set the write-protect cache mode (_PAGE_CACHE_MODE_WP) entry to the actual write-protect value (x05). Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Acked-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/ade53b63d4dbffbfc3cb08fb62024647059c8688.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 45979502f64b..88990ab87961 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -293,7 +293,7 @@ void init_cache_modes(void) * pat_init - Initialize PAT MSR and PAT table * * This function initializes PAT MSR and PAT table with an OS-defined value - * to enable additional cache attributes, WC and WT. + * to enable additional cache attributes, WC, WT and WP. * * This function must be called on all CPUs using the specific sequence of * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this @@ -352,7 +352,7 @@ void pat_init(void) * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS * 011 3 UC : _PAGE_CACHE_MODE_UC * 100 4 WB : Reserved - * 101 5 WC : Reserved + * 101 5 WP : _PAGE_CACHE_MODE_WP * 110 6 UC-: Reserved * 111 7 WT : _PAGE_CACHE_MODE_WT * @@ -360,7 +360,7 @@ void pat_init(void) * corresponding types in the presence of PAT errata. */ pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | - PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT); + PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT); } if (!boot_cpu_done) { From f7750a79568788473c5e8092ee58a52248f34329 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:00 -0500 Subject: [PATCH 014/149] x86, mpparse, x86/acpi, x86/PCI, x86/dmi, SFI: Use memremap() for RAM mappings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ioremap() function is intended for mapping MMIO. For RAM, the memremap() function should be used. Convert calls from ioremap() to memremap() when re-mapping RAM. This will be used later by SME to control how the encryption mask is applied to memory mappings, with certain memory locations being mapped decrypted vs encrypted. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/b13fccb9abbd547a7eef7b1fdfc223431b211c88.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/dmi.h | 8 ++++---- arch/x86/kernel/acpi/boot.c | 6 +++--- arch/x86/kernel/kdebugfs.c | 34 +++++++++++----------------------- arch/x86/kernel/ksysfs.c | 28 ++++++++++++++-------------- arch/x86/kernel/mpparse.c | 10 +++++----- arch/x86/pci/common.c | 4 ++-- drivers/firmware/dmi-sysfs.c | 5 +++-- drivers/firmware/pcdp.c | 4 ++-- drivers/sfi/sfi_core.c | 22 +++++++++++----------- 9 files changed, 55 insertions(+), 66 deletions(-) diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h index 3c69fed215c5..a8e15b04565b 100644 --- a/arch/x86/include/asm/dmi.h +++ b/arch/x86/include/asm/dmi.h @@ -13,9 +13,9 @@ static __always_inline __init void *dmi_alloc(unsigned len) } /* Use early IO mappings for DMI because it's initialized early */ -#define dmi_early_remap early_ioremap -#define dmi_early_unmap early_iounmap -#define dmi_remap ioremap_cache -#define dmi_unmap iounmap +#define dmi_early_remap early_memremap +#define dmi_early_unmap early_memunmap +#define dmi_remap(_x, _l) memremap(_x, _l, MEMREMAP_WB) +#define dmi_unmap(_x) memunmap(_x) #endif /* _ASM_X86_DMI_H */ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 6bb680671088..850160a76864 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -115,7 +115,7 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = { #define ACPI_INVALID_GSI INT_MIN /* - * This is just a simple wrapper around early_ioremap(), + * This is just a simple wrapper around early_memremap(), * with sanity checks for phys == 0 and size == 0. */ char *__init __acpi_map_table(unsigned long phys, unsigned long size) @@ -124,7 +124,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) if (!phys || !size) return NULL; - return early_ioremap(phys, size); + return early_memremap(phys, size); } void __init __acpi_unmap_table(char *map, unsigned long size) @@ -132,7 +132,7 @@ void __init __acpi_unmap_table(char *map, unsigned long size) if (!map || !size) return; - early_iounmap(map, size); + early_memunmap(map, size); } #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index 38b64587b31b..fd6f8fbbe6f2 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -33,7 +33,6 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf, struct setup_data_node *node = file->private_data; unsigned long remain; loff_t pos = *ppos; - struct page *pg; void *p; u64 pa; @@ -47,18 +46,13 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf, count = node->len - pos; pa = node->paddr + sizeof(struct setup_data) + pos; - pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT); - if (PageHighMem(pg)) { - p = ioremap_cache(pa, count); - if (!p) - return -ENXIO; - } else - p = __va(pa); + p = memremap(pa, count, MEMREMAP_WB); + if (!p) + return -ENOMEM; remain = copy_to_user(user_buf, p, count); - if (PageHighMem(pg)) - iounmap(p); + memunmap(p); if (remain) return -EFAULT; @@ -109,7 +103,6 @@ static int __init create_setup_data_nodes(struct dentry *parent) struct setup_data *data; int error; struct dentry *d; - struct page *pg; u64 pa_data; int no = 0; @@ -126,16 +119,12 @@ static int __init create_setup_data_nodes(struct dentry *parent) goto err_dir; } - pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT); - if (PageHighMem(pg)) { - data = ioremap_cache(pa_data, sizeof(*data)); - if (!data) { - kfree(node); - error = -ENXIO; - goto err_dir; - } - } else - data = __va(pa_data); + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); + if (!data) { + kfree(node); + error = -ENOMEM; + goto err_dir; + } node->paddr = pa_data; node->type = data->type; @@ -143,8 +132,7 @@ static int __init create_setup_data_nodes(struct dentry *parent) error = create_setup_data_node(d, no, node); pa_data = data->next; - if (PageHighMem(pg)) - iounmap(data); + memunmap(data); if (error) goto err_dir; no++; diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index 4afc67f5facc..ee51db9a968a 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -16,8 +16,8 @@ #include #include #include +#include -#include #include static ssize_t version_show(struct kobject *kobj, @@ -79,12 +79,12 @@ static int get_setup_data_paddr(int nr, u64 *paddr) *paddr = pa_data; return 0; } - data = ioremap_cache(pa_data, sizeof(*data)); + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; pa_data = data->next; - iounmap(data); + memunmap(data); i++; } return -EINVAL; @@ -97,17 +97,17 @@ static int __init get_setup_data_size(int nr, size_t *size) u64 pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = ioremap_cache(pa_data, sizeof(*data)); + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; if (nr == i) { *size = data->len; - iounmap(data); + memunmap(data); return 0; } pa_data = data->next; - iounmap(data); + memunmap(data); i++; } return -EINVAL; @@ -127,12 +127,12 @@ static ssize_t type_show(struct kobject *kobj, ret = get_setup_data_paddr(nr, &paddr); if (ret) return ret; - data = ioremap_cache(paddr, sizeof(*data)); + data = memremap(paddr, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; ret = sprintf(buf, "0x%x\n", data->type); - iounmap(data); + memunmap(data); return ret; } @@ -154,7 +154,7 @@ static ssize_t setup_data_data_read(struct file *fp, ret = get_setup_data_paddr(nr, &paddr); if (ret) return ret; - data = ioremap_cache(paddr, sizeof(*data)); + data = memremap(paddr, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; @@ -170,15 +170,15 @@ static ssize_t setup_data_data_read(struct file *fp, goto out; ret = count; - p = ioremap_cache(paddr + sizeof(*data), data->len); + p = memremap(paddr + sizeof(*data), data->len, MEMREMAP_WB); if (!p) { ret = -ENOMEM; goto out; } memcpy(buf, p + off, count); - iounmap(p); + memunmap(p); out: - iounmap(data); + memunmap(data); return ret; } @@ -250,13 +250,13 @@ static int __init get_setup_data_total_num(u64 pa_data, int *nr) *nr = 0; while (pa_data) { *nr += 1; - data = ioremap_cache(pa_data, sizeof(*data)); + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); if (!data) { ret = -ENOMEM; goto out; } pa_data = data->next; - iounmap(data); + memunmap(data); } out: diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 0d904d759ff1..fd37f39066da 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -436,9 +436,9 @@ static unsigned long __init get_mpc_size(unsigned long physptr) struct mpc_table *mpc; unsigned long size; - mpc = early_ioremap(physptr, PAGE_SIZE); + mpc = early_memremap(physptr, PAGE_SIZE); size = mpc->length; - early_iounmap(mpc, PAGE_SIZE); + early_memunmap(mpc, PAGE_SIZE); apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); return size; @@ -450,7 +450,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) unsigned long size; size = get_mpc_size(mpf->physptr); - mpc = early_ioremap(mpf->physptr, size); + mpc = early_memremap(mpf->physptr, size); /* * Read the physical hardware table. Anything here will * override the defaults. @@ -461,10 +461,10 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) #endif pr_err("BIOS bug, MP table errors detected!...\n"); pr_cont("... disabling SMP support. (tell your hw vendor)\n"); - early_iounmap(mpc, size); + early_memunmap(mpc, size); return -1; } - early_iounmap(mpc, size); + early_memunmap(mpc, size); if (early) return -1; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index dbe2132b0ed4..7a5350d08cef 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -674,7 +674,7 @@ int pcibios_add_device(struct pci_dev *dev) pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = ioremap(pa_data, sizeof(*rom)); + data = memremap(pa_data, sizeof(*rom), MEMREMAP_WB); if (!data) return -ENOMEM; @@ -693,7 +693,7 @@ int pcibios_add_device(struct pci_dev *dev) } } pa_data = data->next; - iounmap(data); + memunmap(data); } set_dma_domain_ops(dev); set_dev_domain_options(dev); diff --git a/drivers/firmware/dmi-sysfs.c b/drivers/firmware/dmi-sysfs.c index ef76e5eecf0b..d5de6ee8466d 100644 --- a/drivers/firmware/dmi-sysfs.c +++ b/drivers/firmware/dmi-sysfs.c @@ -25,6 +25,7 @@ #include #include #include +#include #define MAX_ENTRY_TYPE 255 /* Most of these aren't used, but we consider the top entry type is only 8 bits */ @@ -380,7 +381,7 @@ static ssize_t dmi_sel_raw_read_phys32(struct dmi_sysfs_entry *entry, u8 __iomem *mapped; ssize_t wrote = 0; - mapped = ioremap(sel->access_method_address, sel->area_length); + mapped = dmi_remap(sel->access_method_address, sel->area_length); if (!mapped) return -EIO; @@ -390,7 +391,7 @@ static ssize_t dmi_sel_raw_read_phys32(struct dmi_sysfs_entry *entry, wrote++; } - iounmap(mapped); + dmi_unmap(mapped); return wrote; } diff --git a/drivers/firmware/pcdp.c b/drivers/firmware/pcdp.c index 75273a251603..e83d6aec0c13 100644 --- a/drivers/firmware/pcdp.c +++ b/drivers/firmware/pcdp.c @@ -95,7 +95,7 @@ efi_setup_pcdp_console(char *cmdline) if (efi.hcdp == EFI_INVALID_TABLE_ADDR) return -ENODEV; - pcdp = early_ioremap(efi.hcdp, 4096); + pcdp = early_memremap(efi.hcdp, 4096); printk(KERN_INFO "PCDP: v%d at 0x%lx\n", pcdp->rev, efi.hcdp); if (strstr(cmdline, "console=hcdp")) { @@ -131,6 +131,6 @@ efi_setup_pcdp_console(char *cmdline) } out: - early_iounmap(pcdp, 4096); + early_memunmap(pcdp, 4096); return rc; } diff --git a/drivers/sfi/sfi_core.c b/drivers/sfi/sfi_core.c index 296db7a69c27..d5ce53491efb 100644 --- a/drivers/sfi/sfi_core.c +++ b/drivers/sfi/sfi_core.c @@ -86,13 +86,13 @@ static struct sfi_table_simple *syst_va __read_mostly; /* * FW creates and saves the SFI tables in memory. When these tables get * used, they may need to be mapped to virtual address space, and the mapping - * can happen before or after the ioremap() is ready, so a flag is needed + * can happen before or after the memremap() is ready, so a flag is needed * to indicating this */ -static u32 sfi_use_ioremap __read_mostly; +static u32 sfi_use_memremap __read_mostly; /* - * sfi_un/map_memory calls early_ioremap/iounmap which is a __init function + * sfi_un/map_memory calls early_memremap/memunmap which is a __init function * and introduces section mismatch. So use __ref to make it calm. */ static void __iomem * __ref sfi_map_memory(u64 phys, u32 size) @@ -100,10 +100,10 @@ static void __iomem * __ref sfi_map_memory(u64 phys, u32 size) if (!phys || !size) return NULL; - if (sfi_use_ioremap) - return ioremap_cache(phys, size); + if (sfi_use_memremap) + return memremap(phys, size, MEMREMAP_WB); else - return early_ioremap(phys, size); + return early_memremap(phys, size); } static void __ref sfi_unmap_memory(void __iomem *virt, u32 size) @@ -111,10 +111,10 @@ static void __ref sfi_unmap_memory(void __iomem *virt, u32 size) if (!virt || !size) return; - if (sfi_use_ioremap) - iounmap(virt); + if (sfi_use_memremap) + memunmap(virt); else - early_iounmap(virt, size); + early_memunmap(virt, size); } static void sfi_print_table_header(unsigned long long pa, @@ -507,8 +507,8 @@ void __init sfi_init_late(void) length = syst_va->header.len; sfi_unmap_memory(syst_va, sizeof(struct sfi_table_simple)); - /* Use ioremap now after it is ready */ - sfi_use_ioremap = 1; + /* Use memremap now after it is ready */ + sfi_use_memremap = 1; syst_va = sfi_map_memory(syst_pa, length); sfi_acpi_init(); From 872cbefd2d9c52bd0b1e2c7942c4369e98a5a5ae Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:01 -0500 Subject: [PATCH 015/149] x86/cpu/AMD: Add the Secure Memory Encryption CPU feature MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the CPU features to include identifying and reporting on the Secure Memory Encryption (SME) feature. SME is identified by CPUID 0x8000001f, but requires BIOS support to enable it (set bit 23 of MSR_K8_SYSCFG). Only show the SME feature as available if reported by CPUID, enabled by BIOS and not configured as CONFIG_X86_32=y. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/85c17ff450721abccddc95e611ae8df3f4d9718b.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/kernel/cpu/amd.c | 19 +++++++++++++++++++ arch/x86/kernel/cpu/scattered.c | 1 + 4 files changed, 23 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index ca3c48c0872f..14f0f2913364 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -196,6 +196,7 @@ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ +#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 5573c75f8e4c..17f5c12e1afd 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -356,6 +356,8 @@ #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d #define MSR_K8_SYSCFG 0xc0010010 +#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT 23 +#define MSR_K8_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT) #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index bb5abe8f5fd4..5ccc7b2e63bb 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -611,6 +611,25 @@ static void early_init_amd(struct cpuinfo_x86 *c) */ if (cpu_has_amd_erratum(c, amd_erratum_400)) set_cpu_bug(c, X86_BUG_AMD_E400); + + /* + * BIOS support is required for SME. If BIOS has not enabled SME + * then don't advertise the feature (set in scattered.c). Also, + * since the SME support requires long mode, don't advertise the + * feature under CONFIG_X86_32. + */ + if (cpu_has(c, X86_FEATURE_SME)) { + if (IS_ENABLED(CONFIG_X86_32)) { + clear_cpu_cap(c, X86_FEATURE_SME); + } else { + u64 msr; + + /* Check if SME is enabled */ + rdmsrl(MSR_K8_SYSCFG, msr); + if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + clear_cpu_cap(c, X86_FEATURE_SME); + } + } } static void init_amd_k8(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 23c23508c012..05459ad3db46 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -31,6 +31,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, + { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 }, { 0, 0, 0, 0, 0 } }; From 9af9b94068fb1ea3206a700fc222075966fbef14 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:02 -0500 Subject: [PATCH 016/149] x86/cpu/AMD: Handle SME reduction in physical address size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When System Memory Encryption (SME) is enabled, the physical address space is reduced. Adjust the x86_phys_bits value to reflect this reduction. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/593c037a3cad85ba92f3d061ffa7462e9ce3531d.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 5ccc7b2e63bb..4d87950fde30 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -613,21 +613,23 @@ static void early_init_amd(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_AMD_E400); /* - * BIOS support is required for SME. If BIOS has not enabled SME - * then don't advertise the feature (set in scattered.c). Also, - * since the SME support requires long mode, don't advertise the - * feature under CONFIG_X86_32. + * BIOS support is required for SME. If BIOS has enabled SME then + * adjust x86_phys_bits by the SME physical address space reduction + * value. If BIOS has not enabled SME then don't advertise the + * feature (set in scattered.c). Also, since the SME support requires + * long mode, don't advertise the feature under CONFIG_X86_32. */ if (cpu_has(c, X86_FEATURE_SME)) { - if (IS_ENABLED(CONFIG_X86_32)) { - clear_cpu_cap(c, X86_FEATURE_SME); - } else { - u64 msr; + u64 msr; - /* Check if SME is enabled */ - rdmsrl(MSR_K8_SYSCFG, msr); - if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + /* Check if SME is enabled */ + rdmsrl(MSR_K8_SYSCFG, msr); + if (msr & MSR_K8_SYSCFG_MEM_ENCRYPT) { + c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f; + if (IS_ENABLED(CONFIG_X86_32)) clear_cpu_cap(c, X86_FEATURE_SME); + } else { + clear_cpu_cap(c, X86_FEATURE_SME); } } } From 7744ccdbc16f0ac4adae21b3678af93775b3a386 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:03 -0500 Subject: [PATCH 017/149] x86/mm: Add Secure Memory Encryption (SME) support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for Secure Memory Encryption (SME). This initial support provides a Kconfig entry to build the SME support into the kernel and defines the memory encryption mask that will be used in subsequent patches to mark pages as encrypted. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/a6c34d16caaed3bc3e2d6f0987554275bd291554.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 25 +++++++++++++++++++++ arch/x86/include/asm/mem_encrypt.h | 30 +++++++++++++++++++++++++ arch/x86/mm/Makefile | 1 + arch/x86/mm/mem_encrypt.c | 21 ++++++++++++++++++ include/linux/mem_encrypt.h | 35 ++++++++++++++++++++++++++++++ 5 files changed, 112 insertions(+) create mode 100644 arch/x86/include/asm/mem_encrypt.h create mode 100644 arch/x86/mm/mem_encrypt.c create mode 100644 include/linux/mem_encrypt.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 781521b7cf9e..ba7b93d08d00 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1415,6 +1415,31 @@ config X86_DIRECT_GBPAGES supports them), so don't confuse the user by printing that we have them enabled. +config ARCH_HAS_MEM_ENCRYPT + def_bool y + +config AMD_MEM_ENCRYPT + bool "AMD Secure Memory Encryption (SME) support" + depends on X86_64 && CPU_SUP_AMD + ---help--- + Say yes to enable support for the encryption of system memory. + This requires an AMD processor that supports Secure Memory + Encryption (SME). + +config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT + bool "Activate AMD Secure Memory Encryption (SME) by default" + default y + depends on AMD_MEM_ENCRYPT + ---help--- + Say yes to have system memory encrypted by default if running on + an AMD processor that supports Secure Memory Encryption (SME). + + If set to Y, then the encryption of system memory can be + deactivated with the mem_encrypt=off command line option. + + If set to N, then the encryption of system memory can be + activated with the mem_encrypt=on command line option. + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h new file mode 100644 index 000000000000..a1057961ac46 --- /dev/null +++ b/arch/x86/include/asm/mem_encrypt.h @@ -0,0 +1,30 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __X86_MEM_ENCRYPT_H__ +#define __X86_MEM_ENCRYPT_H__ + +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_AMD_MEM_ENCRYPT + +extern unsigned long sme_me_mask; + +#else /* !CONFIG_AMD_MEM_ENCRYPT */ + +#define sme_me_mask 0UL + +#endif /* CONFIG_AMD_MEM_ENCRYPT */ + +#endif /* __ASSEMBLY__ */ + +#endif /* __X86_MEM_ENCRYPT_H__ */ diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 0fbdcb64f9f8..a94a7b663d5f 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -39,3 +39,4 @@ obj-$(CONFIG_X86_INTEL_MPX) += mpx.o obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c new file mode 100644 index 000000000000..b99d469c73e7 --- /dev/null +++ b/arch/x86/mm/mem_encrypt.c @@ -0,0 +1,21 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include + +/* + * Since SME related variables are set early in the boot process they must + * reside in the .data section so as not to be zeroed out when the .bss + * section is later cleared. + */ +unsigned long sme_me_mask __section(.data) = 0; +EXPORT_SYMBOL_GPL(sme_me_mask); diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h new file mode 100644 index 000000000000..59769f7287e4 --- /dev/null +++ b/include/linux/mem_encrypt.h @@ -0,0 +1,35 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __MEM_ENCRYPT_H__ +#define __MEM_ENCRYPT_H__ + +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT + +#include + +#else /* !CONFIG_ARCH_HAS_MEM_ENCRYPT */ + +#define sme_me_mask 0UL + +#endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */ + +static inline bool sme_active(void) +{ + return !!sme_me_mask; +} + +#endif /* __ASSEMBLY__ */ + +#endif /* __MEM_ENCRYPT_H__ */ From 33c2b803edd13487518a2c7d5002d84d7e9c878f Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:04 -0500 Subject: [PATCH 018/149] x86/mm: Remove phys_to_virt() usage in ioremap() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently there is a check if the address being mapped is in the ISA range (is_ISA_range()), and if it is, then phys_to_virt() is used to perform the mapping. When SME is active, the default is to add pagetable mappings with the encryption bit set unless specifically overridden. The resulting pagetable mapping from phys_to_virt() will result in a mapping that has the encryption bit set. With SME, the use of ioremap() is intended to generate pagetable mappings that do not have the encryption bit set through the use of the PAGE_KERNEL_IO protection value. Rather than special case the SME scenario, remove the ISA range check and usage of phys_to_virt() and have ISA range mappings continue through the remaining ioremap() path. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/88ada7b09c6568c61cd696351eb59fb15a82ce1a.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 4c1b5fd0c7ad..66ddf5e8ffc8 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -105,12 +105,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, return NULL; } - /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (is_ISA_range(phys_addr, last_addr)) - return (__force void __iomem *)phys_to_virt(phys_addr); - /* * Don't allow anybody to remap normal RAM that we're using.. */ @@ -340,13 +334,17 @@ void iounmap(volatile void __iomem *addr) return; /* - * __ioremap special-cases the PCI/ISA range by not instantiating a - * vm_area and by simply returning an address into the kernel mapping - * of ISA space. So handle that here. + * The PCI/ISA range special-casing was removed from __ioremap() + * so this check, in theory, can be removed. However, there are + * cases where iounmap() is called for addresses not obtained via + * ioremap() (vga16fb for example). Add a warning so that these + * cases can be caught and fixed. */ if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) && - (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) + (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) { + WARN(1, "iounmap() called for ISA range not obtained using ioremap()\n"); return; + } addr = (volatile void __iomem *) (PAGE_MASK & (unsigned long __force)addr); From 5868f3651fa0dff96a57f94d49247d3ef320ebe2 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:05 -0500 Subject: [PATCH 019/149] x86/mm: Add support to enable SME in early boot processing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support to the early boot code to use Secure Memory Encryption (SME). Since the kernel has been loaded into memory in a decrypted state, encrypt the kernel in place and update the early pagetables with the memory encryption mask so that new pagetable entries will use memory encryption. The routines to set the encryption mask and perform the encryption are stub routines for now with functionality to be added in a later patch. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/e52ad781f085224bf835b3caff9aa3aee6febccb.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mem_encrypt.h | 8 +++++ arch/x86/kernel/head64.c | 53 +++++++++++++++++++++++------- arch/x86/kernel/head_64.S | 20 +++++++++-- arch/x86/mm/mem_encrypt.c | 9 +++++ include/linux/mem_encrypt.h | 5 +++ 5 files changed, 82 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index a1057961ac46..475e34f53793 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -15,14 +15,22 @@ #ifndef __ASSEMBLY__ +#include + #ifdef CONFIG_AMD_MEM_ENCRYPT extern unsigned long sme_me_mask; +void __init sme_encrypt_kernel(void); +void __init sme_enable(void); + #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define sme_me_mask 0UL +static inline void __init sme_encrypt_kernel(void) { } +static inline void __init sme_enable(void) { } + #endif /* CONFIG_AMD_MEM_ENCRYPT */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 46c3c73e7f43..1f0ddcc9675c 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -45,9 +46,10 @@ static void __head *fixup_pointer(void *ptr, unsigned long physaddr) return ptr - (void *)_text + (void *)physaddr; } -void __head __startup_64(unsigned long physaddr) +unsigned long __head __startup_64(unsigned long physaddr) { unsigned long load_delta, *p; + unsigned long pgtable_flags; pgdval_t *pgd; p4dval_t *p4d; pudval_t *pud; @@ -68,6 +70,12 @@ void __head __startup_64(unsigned long physaddr) if (load_delta & ~PMD_PAGE_MASK) for (;;); + /* Activate Secure Memory Encryption (SME) if supported and enabled */ + sme_enable(); + + /* Include the SME encryption mask in the fixup value */ + load_delta += sme_get_me_mask(); + /* Fixup the physical addresses in the page table */ pgd = fixup_pointer(&early_top_pgt, physaddr); @@ -94,28 +102,30 @@ void __head __startup_64(unsigned long physaddr) pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); + pgtable_flags = _KERNPG_TABLE + sme_get_me_mask(); if (IS_ENABLED(CONFIG_X86_5LEVEL)) { p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; - pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE; - pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE; + pgd[i + 0] = (pgdval_t)p4d + pgtable_flags; + pgd[i + 1] = (pgdval_t)p4d + pgtable_flags; i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D; - p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; - p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; + p4d[i + 0] = (pgdval_t)pud + pgtable_flags; + p4d[i + 1] = (pgdval_t)pud + pgtable_flags; } else { i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; - pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; - pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; + pgd[i + 0] = (pgdval_t)pud + pgtable_flags; + pgd[i + 1] = (pgdval_t)pud + pgtable_flags; } i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD; - pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE; - pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE; + pud[i + 0] = (pudval_t)pmd + pgtable_flags; + pud[i + 1] = (pudval_t)pmd + pgtable_flags; pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; + pmd_entry += sme_get_me_mask(); pmd_entry += physaddr; for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) { @@ -136,9 +146,30 @@ void __head __startup_64(unsigned long physaddr) pmd[i] += load_delta; } - /* Fixup phys_base */ + /* + * Fixup phys_base - remove the memory encryption mask to obtain + * the true physical address. + */ p = fixup_pointer(&phys_base, physaddr); - *p += load_delta; + *p += load_delta - sme_get_me_mask(); + + /* Encrypt the kernel (if SME is active) */ + sme_encrypt_kernel(); + + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); +} + +unsigned long __startup_secondary_64(void) +{ + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); } /* Wipe all early page tables except for the kernel symbol map */ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 6225550883df..ec5d5e90c8f1 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -73,12 +73,19 @@ startup_64: /* Sanitize CPU configuration */ call verify_cpu + /* + * Perform pagetable fixups. Additionally, if SME is active, encrypt + * the kernel and retrieve the modifier (SME encryption mask if SME + * is active) to be added to the initial pgdir entry that will be + * programmed into CR3. + */ leaq _text(%rip), %rdi pushq %rsi call __startup_64 popq %rsi - movq $(early_top_pgt - __START_KERNEL_map), %rax + /* Form the CR3 value being sure to include the CR3 modifier */ + addq $(early_top_pgt - __START_KERNEL_map), %rax jmp 1f ENTRY(secondary_startup_64) /* @@ -98,7 +105,16 @@ ENTRY(secondary_startup_64) /* Sanitize CPU configuration */ call verify_cpu - movq $(init_top_pgt - __START_KERNEL_map), %rax + /* + * Retrieve the modifier (SME encryption mask if SME is active) to be + * added to the initial pgdir entry that will be programmed into CR3. + */ + pushq %rsi + call __startup_secondary_64 + popq %rsi + + /* Form the CR3 value being sure to include the CR3 modifier */ + addq $(init_top_pgt - __START_KERNEL_map), %rax 1: /* Enable PAE mode, PGE and LA57 */ diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index b99d469c73e7..3ac6f99b095c 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -11,6 +11,7 @@ */ #include +#include /* * Since SME related variables are set early in the boot process they must @@ -19,3 +20,11 @@ */ unsigned long sme_me_mask __section(.data) = 0; EXPORT_SYMBOL_GPL(sme_me_mask); + +void __init sme_encrypt_kernel(void) +{ +} + +void __init sme_enable(void) +{ +} diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h index 59769f7287e4..570f4fcff13f 100644 --- a/include/linux/mem_encrypt.h +++ b/include/linux/mem_encrypt.h @@ -30,6 +30,11 @@ static inline bool sme_active(void) return !!sme_me_mask; } +static inline unsigned long sme_get_me_mask(void) +{ + return sme_me_mask; +} + #endif /* __ASSEMBLY__ */ #endif /* __MEM_ENCRYPT_H__ */ From fd7e315988b784509ba3f1b42f539bd0b1fca9bb Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:06 -0500 Subject: [PATCH 020/149] x86/mm: Simplify p[g4um]d_page() macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create a pgd_pfn() macro similar to the p[4um]d_pfn() macros and then use the p[g4um]d_pfn() macros in the p[g4um]d_page() macros instead of duplicating the code. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/e61eb533a6d0aac941db2723d8aa63ef6b882dee.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 77037b6f1caa..b64ea527edfb 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -195,6 +195,11 @@ static inline unsigned long p4d_pfn(p4d_t p4d) return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT; } +static inline unsigned long pgd_pfn(pgd_t pgd) +{ + return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT; +} + static inline int p4d_large(p4d_t p4d) { /* No 512 GiB pages yet */ @@ -704,8 +709,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pmd_page(pmd) \ - pfn_to_page((pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT) +#define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) /* * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] @@ -773,8 +777,7 @@ static inline unsigned long pud_page_vaddr(pud_t pud) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pud_page(pud) \ - pfn_to_page((pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT) +#define pud_page(pud) pfn_to_page(pud_pfn(pud)) /* Find an entry in the second-level page table.. */ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) @@ -824,8 +827,7 @@ static inline unsigned long p4d_page_vaddr(p4d_t p4d) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define p4d_page(p4d) \ - pfn_to_page((p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT) +#define p4d_page(p4d) pfn_to_page(p4d_pfn(p4d)) /* Find an entry in the third-level page table.. */ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) @@ -859,7 +861,7 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) +#define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd)) /* to find an entry in a page-table-directory. */ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) From 21729f81ce8ae76a6995681d40e16f7ce8075db4 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:07 -0500 Subject: [PATCH 021/149] x86/mm: Provide general kernel support for memory encryption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changes to the existing page table macros will allow the SME support to be enabled in a simple fashion with minimal changes to files that use these macros. Since the memory encryption mask will now be part of the regular pagetable macros, we introduce two new macros (_PAGE_TABLE_NOENC and _KERNPG_TABLE_NOENC) to allow for early pagetable creation/initialization without the encryption mask before SME becomes active. Two new pgprot() macros are defined to allow setting or clearing the page encryption mask. The FIXMAP_PAGE_NOCACHE define is introduced for use with MMIO. SME does not support encryption for MMIO areas so this define removes the encryption mask from the page attribute. Two new macros are introduced (__sme_pa() / __sme_pa_nodebug()) to allow creating a physical address with the encryption mask. These are used when working with the cr3 register so that the PGD can be encrypted. The current __va() macro is updated so that the virtual address is generated based off of the physical address without the encryption mask thus allowing the same virtual address to be generated regardless of whether encryption is enabled for that physical location or not. Also, an early initialization function is added for SME. If SME is active, this function: - Updates the early_pmd_flags so that early page faults create mappings with the encryption mask. - Updates the __supported_pte_mask to include the encryption mask. - Updates the protection_map entries to include the encryption mask so that user-space allocations will automatically have the encryption mask applied. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/b36e952c4c39767ae7f0a41cf5345adf27438480.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/pagetable.c | 7 +++++ arch/x86/include/asm/fixmap.h | 7 +++++ arch/x86/include/asm/mem_encrypt.h | 13 +++++++++ arch/x86/include/asm/page_types.h | 3 +- arch/x86/include/asm/pgtable.h | 9 ++++++ arch/x86/include/asm/pgtable_types.h | 43 ++++++++++++++++++---------- arch/x86/include/asm/processor.h | 3 +- arch/x86/kernel/espfix_64.c | 2 +- arch/x86/kernel/head64.c | 11 +++++-- arch/x86/kernel/head_64.S | 20 ++++++------- arch/x86/mm/kasan_init_64.c | 4 +-- arch/x86/mm/mem_encrypt.c | 17 +++++++++++ arch/x86/mm/pageattr.c | 3 ++ arch/x86/mm/tlb.c | 4 +-- include/asm-generic/pgtable.h | 12 ++++++++ include/linux/mem_encrypt.h | 8 ++++++ 16 files changed, 132 insertions(+), 34 deletions(-) diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c index 28029be47fbb..f1aa43854bed 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/pagetable.c @@ -15,6 +15,13 @@ #define __pa(x) ((unsigned long)(x)) #define __va(x) ((void *)((unsigned long)(x))) +/* + * The pgtable.h and mm/ident_map.c includes make use of the SME related + * information which is not used in the compressed image support. Un-define + * the SME support to avoid any compile and link errors. + */ +#undef CONFIG_AMD_MEM_ENCRYPT + #include "misc.h" /* These actually do the work of building the kernel identity maps. */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index b65155cc3760..d9ff226cb489 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -157,6 +157,13 @@ static inline void __set_fixmap(enum fixed_addresses idx, } #endif +/* + * FIXMAP_PAGE_NOCACHE is used for MMIO. Memory encryption is not + * supported for MMIO addresses, so make sure that the memory encryption + * mask is not part of the page attributes. + */ +#define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_IO_NOCACHE + #include #define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags) diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 475e34f53793..dbae7a5a347d 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -21,6 +21,8 @@ extern unsigned long sme_me_mask; +void __init sme_early_init(void); + void __init sme_encrypt_kernel(void); void __init sme_enable(void); @@ -28,11 +30,22 @@ void __init sme_enable(void); #define sme_me_mask 0UL +static inline void __init sme_early_init(void) { } + static inline void __init sme_encrypt_kernel(void) { } static inline void __init sme_enable(void) { } #endif /* CONFIG_AMD_MEM_ENCRYPT */ +/* + * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when + * writing to or comparing values from the cr3 register. Having the + * encryption mask set in cr3 enables the PGD entry to be encrypted and + * avoid special case handling of PGD allocations. + */ +#define __sme_pa(x) (__pa(x) | sme_me_mask) +#define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask) + #endif /* __ASSEMBLY__ */ #endif /* __X86_MEM_ENCRYPT_H__ */ diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 7bd0099384ca..b98ed9d14630 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -3,6 +3,7 @@ #include #include +#include /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 @@ -15,7 +16,7 @@ #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) -#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1)) +#define __PHYSICAL_MASK ((phys_addr_t)(__sme_clr((1ULL << __PHYSICAL_MASK_SHIFT) - 1))) #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) /* Cast *PAGE_MASK to a signed type so that it is sign-extended if diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index b64ea527edfb..c6452cb12c0b 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_PGTABLE_H #define _ASM_X86_PGTABLE_H +#include #include #include @@ -13,6 +14,12 @@ cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \ : (prot)) +/* + * Macros to add or remove encryption attribute + */ +#define pgprot_encrypted(prot) __pgprot(__sme_set(pgprot_val(prot))) +#define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot))) + #ifndef __ASSEMBLY__ #include @@ -38,6 +45,8 @@ extern struct list_head pgd_list; extern struct mm_struct *pgd_page_get_mm(struct page *page); +extern pmdval_t early_pmd_flags; + #ifdef CONFIG_PARAVIRT #include #else /* !CONFIG_PARAVIRT */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index bf9638e1ee42..de32ca32928a 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -2,6 +2,8 @@ #define _ASM_X86_PGTABLE_DEFS_H #include +#include + #include #define FIRST_USER_ADDRESS 0UL @@ -121,10 +123,10 @@ #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ - _PAGE_ACCESSED | _PAGE_DIRTY) -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ - _PAGE_DIRTY) +#define _PAGE_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\ + _PAGE_ACCESSED | _PAGE_DIRTY) +#define _KERNPG_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | \ + _PAGE_ACCESSED | _PAGE_DIRTY) /* * Set of bits not changed in pte_modify. The pte's @@ -191,18 +193,29 @@ enum page_cache_mode { #define __PAGE_KERNEL_IO (__PAGE_KERNEL) #define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL __pgprot(__PAGE_KERNEL) -#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) -#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) -#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) -#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) -#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) -#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) -#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) +#ifndef __ASSEMBLY__ -#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) -#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) +#define _PAGE_ENC (_AT(pteval_t, sme_me_mask)) + +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_ENC) +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ + _PAGE_DIRTY | _PAGE_ENC) + +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL | _PAGE_ENC) +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC) +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC) +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX | _PAGE_ENC) +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC) +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC) +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC) +#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL | _PAGE_ENC) +#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC) + +#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) +#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) + +#endif /* __ASSEMBLY__ */ /* xwr */ #define __P000 PAGE_NONE diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6a79547e8ee0..a68f70c3debc 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -29,6 +29,7 @@ struct vm86; #include #include #include +#include /* * We handle most unaligned accesses in hardware. On the other hand @@ -241,7 +242,7 @@ static inline unsigned long read_cr3_pa(void) static inline void load_cr3(pgd_t *pgdir) { - write_cr3(__pa(pgdir)); + write_cr3(__sme_pa(pgdir)); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 6b91e2eb8d3f..9c4e7ba6870c 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -195,7 +195,7 @@ void init_espfix_ap(int cpu) pte_p = pte_offset_kernel(&pmd, addr); stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0)); - pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); + pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask)); for (n = 0; n < ESPFIX_PTE_CLONES; n++) set_pte(&pte_p[n*PTE_STRIDE], pte); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 1f0ddcc9675c..5cd0b72a0283 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -102,7 +102,7 @@ unsigned long __head __startup_64(unsigned long physaddr) pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); - pgtable_flags = _KERNPG_TABLE + sme_get_me_mask(); + pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); if (IS_ENABLED(CONFIG_X86_5LEVEL)) { p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); @@ -177,7 +177,7 @@ static void __init reset_early_page_tables(void) { memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); next_early_pgt = 0; - write_cr3(__pa_nodebug(early_top_pgt)); + write_cr3(__sme_pa_nodebug(early_top_pgt)); } /* Create a new PMD entry */ @@ -310,6 +310,13 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) clear_page(init_top_pgt); + /* + * SME support may update early_pmd_flags to include the memory + * encryption mask, so it needs to be called before anything + * that may generate a page fault. + */ + sme_early_init(); + kasan_early_init(); for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ec5d5e90c8f1..513cbb012ecc 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -351,9 +351,9 @@ GLOBAL(name) NEXT_PAGE(early_top_pgt) .fill 511,8,0 #ifdef CONFIG_X86_5LEVEL - .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #else - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #endif NEXT_PAGE(early_dynamic_pgts) @@ -366,15 +366,15 @@ NEXT_PAGE(init_top_pgt) .fill 512,8,0 #else NEXT_PAGE(init_top_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC NEXT_PAGE(level3_ident_pgt) - .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .fill 511, 8, 0 NEXT_PAGE(level2_ident_pgt) /* Since I easily can, map the first 1G. @@ -386,14 +386,14 @@ NEXT_PAGE(level2_ident_pgt) #ifdef CONFIG_X86_5LEVEL NEXT_PAGE(level4_kernel_pgt) .fill 511,8,0 - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #endif NEXT_PAGE(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ - .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE - .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC + .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC NEXT_PAGE(level2_kernel_pgt) /* @@ -411,7 +411,7 @@ NEXT_PAGE(level2_kernel_pgt) NEXT_PAGE(level2_fixmap_pgt) .fill 506,8,0 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ .fill 5,8,0 diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 02c9d7553409..39d4daf5e289 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -87,7 +87,7 @@ static struct notifier_block kasan_die_notifier = { void __init kasan_early_init(void) { int i; - pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL; + pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL | _PAGE_ENC; pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE; pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE; p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE; @@ -153,7 +153,7 @@ void __init kasan_init(void) */ memset(kasan_zero_page, 0, PAGE_SIZE); for (i = 0; i < PTRS_PER_PTE; i++) { - pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO); + pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO | _PAGE_ENC); set_pte(&kasan_zero_pte[i], pte); } /* Flush TLBs again to be sure that write protection applied. */ diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 3ac6f99b095c..f973d3dc3802 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -12,6 +12,7 @@ #include #include +#include /* * Since SME related variables are set early in the boot process they must @@ -21,6 +22,22 @@ unsigned long sme_me_mask __section(.data) = 0; EXPORT_SYMBOL_GPL(sme_me_mask); +void __init sme_early_init(void) +{ + unsigned int i; + + if (!sme_me_mask) + return; + + early_pmd_flags = __sme_set(early_pmd_flags); + + __supported_pte_mask = __sme_set(__supported_pte_mask); + + /* Update the protection map with memory encryption mask */ + for (i = 0; i < ARRAY_SIZE(protection_map); i++) + protection_map[i] = pgprot_encrypted(protection_map[i]); +} + void __init sme_encrypt_kernel(void) { } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 757b0bcdf712..7e2d6c0a64c4 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -2020,6 +2020,9 @@ int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, if (!(page_flags & _PAGE_RW)) cpa.mask_clr = __pgprot(_PAGE_RW); + if (!(page_flags & _PAGE_ENC)) + cpa.mask_clr = pgprot_encrypted(cpa.mask_clr); + cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); retval = __change_page_attr_set_clr(&cpa, 0); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 2c1b8881e9d3..593d2f76a54c 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -115,7 +115,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, */ this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen); - write_cr3(__pa(next->pgd)); + write_cr3(__sme_pa(next->pgd)); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } @@ -157,7 +157,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen); this_cpu_write(cpu_tlbstate.loaded_mm, next); - write_cr3(__pa(next->pgd)); + write_cr3(__sme_pa(next->pgd)); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 7dfa767dc680..4d7bb98f4134 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -582,6 +582,18 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm, #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */ #endif /* CONFIG_MMU */ +/* + * No-op macros that just return the current protection value. Defined here + * because these macros can be used used even if CONFIG_MMU is not defined. + */ +#ifndef pgprot_encrypted +#define pgprot_encrypted(prot) (prot) +#endif + +#ifndef pgprot_decrypted +#define pgprot_decrypted(prot) (prot) +#endif + /* * A facility to provide lazy MMU batching. This allows PTE updates and * page invalidations to be delayed until a call to leave lazy MMU mode diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h index 570f4fcff13f..1255f09f5e42 100644 --- a/include/linux/mem_encrypt.h +++ b/include/linux/mem_encrypt.h @@ -35,6 +35,14 @@ static inline unsigned long sme_get_me_mask(void) return sme_me_mask; } +/* + * The __sme_set() and __sme_clr() macros are useful for adding or removing + * the encryption mask from a value (e.g. when dealing with pagetable + * entries). + */ +#define __sme_set(x) ((unsigned long)(x) | sme_me_mask) +#define __sme_clr(x) ((unsigned long)(x) & ~sme_me_mask) + #endif /* __ASSEMBLY__ */ #endif /* __MEM_ENCRYPT_H__ */ From eef9c4abe77f55b1600f59d8ac5f1d953e2f5384 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:08 -0500 Subject: [PATCH 022/149] x86/mm: Add SME support for read_cr3_pa() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CR3 register entry can contain the SME encryption mask that indicates the PGD is encrypted. The encryption mask should not be used when creating a virtual address from the CR3 register, so remove the SME encryption mask in the read_cr3_pa() function. During early boot SME will need to use a native version of read_cr3_pa(), so create native_read_cr3_pa(). Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/767b085c384a46f67f451f8589903a462c7ff68a.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor-flags.h | 5 +++-- arch/x86/include/asm/processor.h | 5 +++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 79aa2f98398d..f5d3e50af98c 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -2,6 +2,7 @@ #define _ASM_X86_PROCESSOR_FLAGS_H #include +#include #ifdef CONFIG_VM86 #define X86_VM_MASK X86_EFLAGS_VM @@ -32,8 +33,8 @@ * CR3_ADDR_MASK is the mask used by read_cr3_pa(). */ #ifdef CONFIG_X86_64 -/* Mask off the address space ID bits. */ -#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull +/* Mask off the address space ID and SME encryption bits. */ +#define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull) #define CR3_PCID_MASK 0xFFFull #else /* diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index a68f70c3debc..973709d2938f 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -240,6 +240,11 @@ static inline unsigned long read_cr3_pa(void) return __read_cr3() & CR3_ADDR_MASK; } +static inline unsigned long native_read_cr3_pa(void) +{ + return __native_read_cr3() & CR3_ADDR_MASK; +} + static inline void load_cr3(pgd_t *pgdir) { write_cr3(__sme_pa(pgdir)); From f88a68facd9a15b94f8c195d9d2c0b30c76c595a Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:09 -0500 Subject: [PATCH 023/149] x86/mm: Extend early_memremap() support with additional attrs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add early_memremap() support to be able to specify encrypted and decrypted mappings with and without write-protection. The use of write-protection is necessary when encrypting data "in place". The write-protect attribute is considered cacheable for loads, but not stores. This implies that the hardware will never give the core a dirty line with this memtype. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/479b5832c30fae3efa7932e48f81794e86397229.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 +++ arch/x86/include/asm/fixmap.h | 13 ++++++++ arch/x86/include/asm/pgtable_types.h | 8 +++++ arch/x86/mm/ioremap.c | 44 ++++++++++++++++++++++++++++ include/asm-generic/early_ioremap.h | 2 ++ mm/early_ioremap.c | 10 +++++++ 6 files changed, 81 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ba7b93d08d00..8328bcb9ce8b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1440,6 +1440,10 @@ config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT If set to N, then the encryption of system memory can be activated with the mem_encrypt=on command line option. +config ARCH_USE_MEMREMAP_PROT + def_bool y + depends on AMD_MEM_ENCRYPT + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index d9ff226cb489..dcd9fb55e679 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -164,6 +164,19 @@ static inline void __set_fixmap(enum fixed_addresses idx, */ #define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_IO_NOCACHE +/* + * Early memremap routines used for in-place encryption. The mappings created + * by these routines are intended to be used as temporary mappings. + */ +void __init *early_memremap_encrypted(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_decrypted(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, + unsigned long size); + #include #define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index de32ca32928a..32095af0fefb 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -161,6 +161,7 @@ enum page_cache_mode { #define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT) #define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) +#define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP)) #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) #define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ @@ -189,6 +190,7 @@ enum page_cache_mode { #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) +#define __PAGE_KERNEL_WP (__PAGE_KERNEL | _PAGE_CACHE_WP) #define __PAGE_KERNEL_IO (__PAGE_KERNEL) #define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) @@ -202,6 +204,12 @@ enum page_cache_mode { #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ _PAGE_DIRTY | _PAGE_ENC) +#define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _PAGE_ENC) +#define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _PAGE_ENC) + +#define __PAGE_KERNEL_NOENC (__PAGE_KERNEL) +#define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP) + #define PAGE_KERNEL __pgprot(__PAGE_KERNEL | _PAGE_ENC) #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC) #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 66ddf5e8ffc8..570201bbf442 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -417,6 +417,50 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); } +#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT +/* Remap memory with encryption */ +void __init *early_memremap_encrypted(resource_size_t phys_addr, + unsigned long size) +{ + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC); +} + +/* + * Remap memory with encryption and write-protected - cannot be called + * before pat_init() is called + */ +void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, + unsigned long size) +{ + /* Be sure the write-protect PAT entry is set for write-protect */ + if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + return NULL; + + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP); +} + +/* Remap memory without encryption */ +void __init *early_memremap_decrypted(resource_size_t phys_addr, + unsigned long size) +{ + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC); +} + +/* + * Remap memory without encryption and write-protected - cannot be called + * before pat_init() is called + */ +void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, + unsigned long size) +{ + /* Be sure the write-protect PAT entry is set for write-protect */ + if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + return NULL; + + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP); +} +#endif /* CONFIG_ARCH_USE_MEMREMAP_PROT */ + static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) diff --git a/include/asm-generic/early_ioremap.h b/include/asm-generic/early_ioremap.h index 734ad4db388c..2edef8d7fa6b 100644 --- a/include/asm-generic/early_ioremap.h +++ b/include/asm-generic/early_ioremap.h @@ -13,6 +13,8 @@ extern void *early_memremap(resource_size_t phys_addr, unsigned long size); extern void *early_memremap_ro(resource_size_t phys_addr, unsigned long size); +extern void *early_memremap_prot(resource_size_t phys_addr, + unsigned long size, unsigned long prot_val); extern void early_iounmap(void __iomem *addr, unsigned long size); extern void early_memunmap(void *addr, unsigned long size); diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index 6d5717bd7197..d7d30da754ba 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -226,6 +226,16 @@ early_memremap_ro(resource_size_t phys_addr, unsigned long size) } #endif +#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT +void __init * +early_memremap_prot(resource_size_t phys_addr, unsigned long size, + unsigned long prot_val) +{ + return (__force void *)__early_ioremap(phys_addr, size, + __pgprot(prot_val)); +} +#endif + #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size) From 7f8b7e7f4ccbbd1fb8badddfabd28c955aea87b4 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:10 -0500 Subject: [PATCH 024/149] x86/mm: Add support for early encryption/decryption of memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support to be able to either encrypt or decrypt data in place during the early stages of booting the kernel. This does not change the memory encryption attribute - it is used for ensuring that data present in either an encrypted or decrypted memory area is in the proper state (for example the initrd will have been loaded by the boot loader and will not be encrypted, but the memory that it resides in is marked as encrypted). Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/f9968e9432cd6c4b57ef245729be04ff18852225.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mem_encrypt.h | 10 ++++ arch/x86/mm/mem_encrypt.c | 76 ++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index dbae7a5a347d..8baa35ba2316 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -21,6 +21,11 @@ extern unsigned long sme_me_mask; +void __init sme_early_encrypt(resource_size_t paddr, + unsigned long size); +void __init sme_early_decrypt(resource_size_t paddr, + unsigned long size); + void __init sme_early_init(void); void __init sme_encrypt_kernel(void); @@ -30,6 +35,11 @@ void __init sme_enable(void); #define sme_me_mask 0UL +static inline void __init sme_early_encrypt(resource_size_t paddr, + unsigned long size) { } +static inline void __init sme_early_decrypt(resource_size_t paddr, + unsigned long size) { } + static inline void __init sme_early_init(void) { } static inline void __init sme_encrypt_kernel(void) { } diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index f973d3dc3802..54bb73c3dd9d 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -14,6 +14,9 @@ #include #include +#include +#include + /* * Since SME related variables are set early in the boot process they must * reside in the .data section so as not to be zeroed out when the .bss @@ -22,6 +25,79 @@ unsigned long sme_me_mask __section(.data) = 0; EXPORT_SYMBOL_GPL(sme_me_mask); +/* Buffer used for early in-place encryption by BSP, no locking needed */ +static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE); + +/* + * This routine does not change the underlying encryption setting of the + * page(s) that map this memory. It assumes that eventually the memory is + * meant to be accessed as either encrypted or decrypted but the contents + * are currently not in the desired state. + * + * This routine follows the steps outlined in the AMD64 Architecture + * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place. + */ +static void __init __sme_early_enc_dec(resource_size_t paddr, + unsigned long size, bool enc) +{ + void *src, *dst; + size_t len; + + if (!sme_me_mask) + return; + + local_flush_tlb(); + wbinvd(); + + /* + * There are limited number of early mapping slots, so map (at most) + * one page at time. + */ + while (size) { + len = min_t(size_t, sizeof(sme_early_buffer), size); + + /* + * Create mappings for the current and desired format of + * the memory. Use a write-protected mapping for the source. + */ + src = enc ? early_memremap_decrypted_wp(paddr, len) : + early_memremap_encrypted_wp(paddr, len); + + dst = enc ? early_memremap_encrypted(paddr, len) : + early_memremap_decrypted(paddr, len); + + /* + * If a mapping can't be obtained to perform the operation, + * then eventual access of that area in the desired mode + * will cause a crash. + */ + BUG_ON(!src || !dst); + + /* + * Use a temporary buffer, of cache-line multiple size, to + * avoid data corruption as documented in the APM. + */ + memcpy(sme_early_buffer, src, len); + memcpy(dst, sme_early_buffer, len); + + early_memunmap(dst, len); + early_memunmap(src, len); + + paddr += len; + size -= len; + } +} + +void __init sme_early_encrypt(resource_size_t paddr, unsigned long size) +{ + __sme_early_enc_dec(paddr, size, true); +} + +void __init sme_early_decrypt(resource_size_t paddr, unsigned long size) +{ + __sme_early_enc_dec(paddr, size, false); +} + void __init sme_early_init(void) { unsigned int i; From b9d05200bc12444c7778a49c9694d8382ed06aa8 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:11 -0500 Subject: [PATCH 025/149] x86/mm: Insure that boot memory areas are mapped properly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The boot data and command line data are present in memory in a decrypted state and are copied early in the boot process. The early page fault support will map these areas as encrypted, so before attempting to copy them, add decrypted mappings so the data is accessed properly when copied. For the initrd, encrypt this data in place. Since the future mapping of the initrd area will be mapped as encrypted the data will be accessed properly. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/bb0d430b41efefd45ee515aaf0979dcfda8b6a44.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mem_encrypt.h | 6 +++ arch/x86/include/asm/pgtable.h | 3 ++ arch/x86/kernel/head64.c | 30 ++++++++++++-- arch/x86/kernel/setup.c | 9 +++++ arch/x86/mm/kasan_init_64.c | 2 +- arch/x86/mm/mem_encrypt.c | 63 ++++++++++++++++++++++++++++++ 6 files changed, 108 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 8baa35ba2316..ab1fe77c2f73 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -26,6 +26,9 @@ void __init sme_early_encrypt(resource_size_t paddr, void __init sme_early_decrypt(resource_size_t paddr, unsigned long size); +void __init sme_map_bootdata(char *real_mode_data); +void __init sme_unmap_bootdata(char *real_mode_data); + void __init sme_early_init(void); void __init sme_encrypt_kernel(void); @@ -40,6 +43,9 @@ static inline void __init sme_early_encrypt(resource_size_t paddr, static inline void __init sme_early_decrypt(resource_size_t paddr, unsigned long size) { } +static inline void __init sme_map_bootdata(char *real_mode_data) { } +static inline void __init sme_unmap_bootdata(char *real_mode_data) { } + static inline void __init sme_early_init(void) { } static inline void __init sme_encrypt_kernel(void) { } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index c6452cb12c0b..bbeae4a2bd01 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -23,6 +23,9 @@ #ifndef __ASSEMBLY__ #include +extern pgd_t early_top_pgt[PTRS_PER_PGD]; +int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); + void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); void ptdump_walk_pgd_level_checkwx(void); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 5cd0b72a0283..0cdb53bf4c4b 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -34,7 +34,6 @@ /* * Manage page tables very early on. */ -extern pgd_t early_top_pgt[PTRS_PER_PGD]; extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; static unsigned int __initdata next_early_pgt; pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); @@ -181,13 +180,13 @@ static void __init reset_early_page_tables(void) } /* Create a new PMD entry */ -int __init early_make_pgtable(unsigned long address) +int __init __early_make_pgtable(unsigned long address, pmdval_t pmd) { unsigned long physaddr = address - __PAGE_OFFSET; pgdval_t pgd, *pgd_p; p4dval_t p4d, *p4d_p; pudval_t pud, *pud_p; - pmdval_t pmd, *pmd_p; + pmdval_t *pmd_p; /* Invalid address or early pgt is done ? */ if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt)) @@ -246,12 +245,21 @@ int __init early_make_pgtable(unsigned long address) memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } - pmd = (physaddr & PMD_MASK) + early_pmd_flags; pmd_p[pmd_index(address)] = pmd; return 0; } +int __init early_make_pgtable(unsigned long address) +{ + unsigned long physaddr = address - __PAGE_OFFSET; + pmdval_t pmd; + + pmd = (physaddr & PMD_MASK) + early_pmd_flags; + + return __early_make_pgtable(address, pmd); +} + /* Don't add a printk in there. printk relies on the PDA which is not initialized yet. */ static void __init clear_bss(void) @@ -274,6 +282,12 @@ static void __init copy_bootdata(char *real_mode_data) char * command_line; unsigned long cmd_line_ptr; + /* + * If SME is active, this will create decrypted mappings of the + * boot data in advance of the copy operations. + */ + sme_map_bootdata(real_mode_data); + memcpy(&boot_params, real_mode_data, sizeof boot_params); sanitize_boot_params(&boot_params); cmd_line_ptr = get_cmd_line_ptr(); @@ -281,6 +295,14 @@ static void __init copy_bootdata(char *real_mode_data) command_line = __va(cmd_line_ptr); memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); } + + /* + * The old boot data is no longer needed and won't be reserved, + * freeing up that memory for use by the system. If SME is active, + * we need to remove the mappings that were created so that the + * memory doesn't remain mapped as decrypted. + */ + sme_unmap_bootdata(real_mode_data); } asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3486d0498800..0bfe0c1628f6 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -69,6 +69,7 @@ #include #include #include +#include #include #include