From f39681ed0f48498b80455095376f11535feea332 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 29 Jun 2017 08:53:15 -0700
Subject: [PATCH 001/149] x86/mm: Give each mm TLB flush generation a unique ID

This adds two new variables to mmu_context_t: ctx_id and tlb_gen.
ctx_id uniquely identifies the mm_struct and will never be reused.
For a given mm_struct (and hence ctx_id), tlb_gen is a monotonic
count of the number of times that a TLB flush has been requested.
The pair (ctx_id, tlb_gen) can be used as an identifier for TLB
flush actions and will be used in subsequent patches to reliably
determine whether all needed TLB flushes have occurred on a given
CPU.

This patch is split out for ease of review.  By itself, it has no
real effect other than creating and updating the new variables.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/413a91c24dab3ed0caa5f4e4d017d87b0857f920.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mmu.h         | 25 +++++++++++++++++++++++--
 arch/x86/include/asm/mmu_context.h |  6 ++++++
 arch/x86/include/asm/tlbflush.h    | 18 ++++++++++++++++++
 arch/x86/mm/tlb.c                  |  6 ++++--
 4 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 79b647a7ebd0..bb8c597c2248 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -3,12 +3,28 @@
 
 #include <linux/spinlock.h>
 #include <linux/mutex.h>
+#include <linux/atomic.h>
 
 /*
- * The x86 doesn't have a mmu context, but
- * we put the segment information here.
+ * x86 has arch-specific MMU state beyond what lives in mm_struct.
  */
 typedef struct {
+	/*
+	 * ctx_id uniquely identifies this mm_struct.  A ctx_id will never
+	 * be reused, and zero is not a valid ctx_id.
+	 */
+	u64 ctx_id;
+
+	/*
+	 * Any code that needs to do any sort of TLB flushing for this
+	 * mm will first make its changes to the page tables, then
+	 * increment tlb_gen, then flush.  This lets the low-level
+	 * flushing code keep track of what needs flushing.
+	 *
+	 * This is not used on Xen PV.
+	 */
+	atomic64_t tlb_gen;
+
 #ifdef CONFIG_MODIFY_LDT_SYSCALL
 	struct ldt_struct *ldt;
 #endif
@@ -37,6 +53,11 @@ typedef struct {
 #endif
 } mm_context_t;
 
+#define INIT_MM_CONTEXT(mm)						\
+	.context = {							\
+		.ctx_id = 1,						\
+	}
+
 void leave_mm(int cpu);
 
 #endif /* _ASM_X86_MMU_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index ecfcb6643c9b..ae19b9d11259 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -12,6 +12,9 @@
 #include <asm/tlbflush.h>
 #include <asm/paravirt.h>
 #include <asm/mpx.h>
+
+extern atomic64_t last_mm_ctx_id;
+
 #ifndef CONFIG_PARAVIRT
 static inline void paravirt_activate_mm(struct mm_struct *prev,
 					struct mm_struct *next)
@@ -132,6 +135,9 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 static inline int init_new_context(struct task_struct *tsk,
 				   struct mm_struct *mm)
 {
+	mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
+	atomic64_set(&mm->context.tlb_gen, 0);
+
 	#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
 	if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
 		/* pkey 0 is the default and always allocated */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 50ea3482e1d1..ad2135385699 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -57,6 +57,23 @@ static inline void invpcid_flush_all_nonglobals(void)
 	__invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
 }
 
+static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
+{
+	u64 new_tlb_gen;
+
+	/*
+	 * Bump the generation count.  This also serves as a full barrier
+	 * that synchronizes with switch_mm(): callers are required to order
+	 * their read of mm_cpumask after their writes to the paging
+	 * structures.
+	 */
+	smp_mb__before_atomic();
+	new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen);
+	smp_mb__after_atomic();
+
+	return new_tlb_gen;
+}
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
@@ -262,6 +279,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
 					struct mm_struct *mm)
 {
+	inc_mm_tlb_gen(mm);
 	cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
 }
 
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 014d07a80053..14f4f8f66aa8 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -28,6 +28,8 @@
  *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
  */
 
+atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
+
 void leave_mm(int cpu)
 {
 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -250,8 +252,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 
 	cpu = get_cpu();
 
-	/* Synchronize with switch_mm. */
-	smp_mb();
+	/* This is also a barrier that synchronizes with switch_mm(). */
+	inc_mm_tlb_gen(mm);
 
 	/* Should we flush just the requested range? */
 	if ((end != TLB_FLUSH_ALL) &&

From b0579ade7cd82391360e959cc844e50a160e8a96 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 29 Jun 2017 08:53:16 -0700
Subject: [PATCH 002/149] x86/mm: Track the TLB's tlb_gen and update the
 flushing algorithm

There are two kernel features that would benefit from tracking
how up-to-date each CPU's TLB is in the case where IPIs aren't keeping
it up to date in real time:

 - Lazy mm switching currently works by switching to init_mm when
   it would otherwise flush.  This is wasteful: there isn't fundamentally
   any need to update CR3 at all when going lazy or when returning from
   lazy mode, nor is there any need to receive flush IPIs at all.  Instead,
   we should just stop trying to keep the TLB coherent when we go lazy and,
   when unlazying, check whether we missed any flushes.

 - PCID will let us keep recent user contexts alive in the TLB.  If we
   start doing this, we need a way to decide whether those contexts are
   up to date.

On some paravirt systems, remote TLBs can be flushed without IPIs.
This won't update the target CPUs' tlb_gens, which may cause
unnecessary local flushes later on.  We can address this if it becomes
a problem by carefully updating the target CPU's tlb_gen directly.

By itself, this patch is a very minor optimization that avoids
unnecessary flushes when multiple TLB flushes targetting the same CPU
race.  The complexity in this patch would not be worth it on its own,
but it will enable improved lazy TLB tracking and PCID.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/1210fb244bc9cbe7677f7f0b72db4d359675f24b.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/tlbflush.h |  43 +++++++++++++-
 arch/x86/mm/tlb.c               | 102 +++++++++++++++++++++++++++++---
 2 files changed, 135 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index ad2135385699..d7df54cc7e4d 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -82,6 +82,11 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
 #endif
 
+struct tlb_context {
+	u64 ctx_id;
+	u64 tlb_gen;
+};
+
 struct tlb_state {
 	/*
 	 * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
@@ -97,6 +102,21 @@ struct tlb_state {
 	 * disabling interrupts when modifying either one.
 	 */
 	unsigned long cr4;
+
+	/*
+	 * This is a list of all contexts that might exist in the TLB.
+	 * Since we don't yet use PCID, there is only one context.
+	 *
+	 * For each context, ctx_id indicates which mm the TLB's user
+	 * entries came from.  As an invariant, the TLB will never
+	 * contain entries that are out-of-date as when that mm reached
+	 * the tlb_gen in the list.
+	 *
+	 * To be clear, this means that it's legal for the TLB code to
+	 * flush the TLB without updating tlb_gen.  This can happen
+	 * (for now, at least) due to paravirt remote flushes.
+	 */
+	struct tlb_context ctxs[1];
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
 
@@ -248,9 +268,26 @@ static inline void __flush_tlb_one(unsigned long addr)
  * and page-granular flushes are available only on i486 and up.
  */
 struct flush_tlb_info {
-	struct mm_struct *mm;
-	unsigned long start;
-	unsigned long end;
+	/*
+	 * We support several kinds of flushes.
+	 *
+	 * - Fully flush a single mm.  .mm will be set, .end will be
+	 *   TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to
+	 *   which the IPI sender is trying to catch us up.
+	 *
+	 * - Partially flush a single mm.  .mm will be set, .start and
+	 *   .end will indicate the range, and .new_tlb_gen will be set
+	 *   such that the changes between generation .new_tlb_gen-1 and
+	 *   .new_tlb_gen are entirely contained in the indicated range.
+	 *
+	 * - Fully flush all mms whose tlb_gens have been updated.  .mm
+	 *   will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen
+	 *   will be zero.
+	 */
+	struct mm_struct	*mm;
+	unsigned long		start;
+	unsigned long		end;
+	u64			new_tlb_gen;
 };
 
 #define local_flush_tlb() __flush_tlb()
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 14f4f8f66aa8..4e5a5ddb9e4d 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -105,6 +105,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	}
 
 	this_cpu_write(cpu_tlbstate.loaded_mm, next);
+	this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id);
+	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, atomic64_read(&next->context.tlb_gen));
 
 	WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
 	cpumask_set_cpu(cpu, mm_cpumask(next));
@@ -155,25 +157,102 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	switch_ldt(real_prev, next);
 }
 
+/*
+ * flush_tlb_func_common()'s memory ordering requirement is that any
+ * TLB fills that happen after we flush the TLB are ordered after we
+ * read active_mm's tlb_gen.  We don't need any explicit barriers
+ * because all x86 flush operations are serializing and the
+ * atomic64_read operation won't be reordered by the compiler.
+ */
 static void flush_tlb_func_common(const struct flush_tlb_info *f,
 				  bool local, enum tlb_flush_reason reason)
 {
+	/*
+	 * We have three different tlb_gen values in here.  They are:
+	 *
+	 * - mm_tlb_gen:     the latest generation.
+	 * - local_tlb_gen:  the generation that this CPU has already caught
+	 *                   up to.
+	 * - f->new_tlb_gen: the generation that the requester of the flush
+	 *                   wants us to catch up to.
+	 */
+	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+	u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
+	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen);
+
 	/* This code cannot presently handle being reentered. */
 	VM_WARN_ON(!irqs_disabled());
 
+	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) !=
+		   loaded_mm->context.ctx_id);
+
 	if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
+		/*
+		 * leave_mm() is adequate to handle any type of flush, and
+		 * we would prefer not to receive further IPIs.  leave_mm()
+		 * clears this CPU's bit in mm_cpumask().
+		 */
 		leave_mm(smp_processor_id());
 		return;
 	}
 
-	if (f->end == TLB_FLUSH_ALL) {
-		local_flush_tlb();
-		if (local)
-			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
-		trace_tlb_flush(reason, TLB_FLUSH_ALL);
-	} else {
+	if (unlikely(local_tlb_gen == mm_tlb_gen)) {
+		/*
+		 * There's nothing to do: we're already up to date.  This can
+		 * happen if two concurrent flushes happen -- the first flush to
+		 * be handled can catch us all the way up, leaving no work for
+		 * the second flush.
+		 */
+		return;
+	}
+
+	WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
+	WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);
+
+	/*
+	 * If we get to this point, we know that our TLB is out of date.
+	 * This does not strictly imply that we need to flush (it's
+	 * possible that f->new_tlb_gen <= local_tlb_gen), but we're
+	 * going to need to flush in the very near future, so we might
+	 * as well get it over with.
+	 *
+	 * The only question is whether to do a full or partial flush.
+	 *
+	 * We do a partial flush if requested and two extra conditions
+	 * are met:
+	 *
+	 * 1. f->new_tlb_gen == local_tlb_gen + 1.  We have an invariant that
+	 *    we've always done all needed flushes to catch up to
+	 *    local_tlb_gen.  If, for example, local_tlb_gen == 2 and
+	 *    f->new_tlb_gen == 3, then we know that the flush needed to bring
+	 *    us up to date for tlb_gen 3 is the partial flush we're
+	 *    processing.
+	 *
+	 *    As an example of why this check is needed, suppose that there
+	 *    are two concurrent flushes.  The first is a full flush that
+	 *    changes context.tlb_gen from 1 to 2.  The second is a partial
+	 *    flush that changes context.tlb_gen from 2 to 3.  If they get
+	 *    processed on this CPU in reverse order, we'll see
+	 *     local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
+	 *    If we were to use __flush_tlb_single() and set local_tlb_gen to
+	 *    3, we'd be break the invariant: we'd update local_tlb_gen above
+	 *    1 without the full flush that's needed for tlb_gen 2.
+	 *
+	 * 2. f->new_tlb_gen == mm_tlb_gen.  This is purely an optimiation.
+	 *    Partial TLB flushes are not all that much cheaper than full TLB
+	 *    flushes, so it seems unlikely that it would be a performance win
+	 *    to do a partial flush if that won't bring our TLB fully up to
+	 *    date.  By doing a full flush instead, we can increase
+	 *    local_tlb_gen all the way to mm_tlb_gen and we can probably
+	 *    avoid another flush in the very near future.
+	 */
+	if (f->end != TLB_FLUSH_ALL &&
+	    f->new_tlb_gen == local_tlb_gen + 1 &&
+	    f->new_tlb_gen == mm_tlb_gen) {
+		/* Partial flush */
 		unsigned long addr;
 		unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
+
 		addr = f->start;
 		while (addr < f->end) {
 			__flush_tlb_single(addr);
@@ -182,7 +261,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 		if (local)
 			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
 		trace_tlb_flush(reason, nr_pages);
+	} else {
+		/* Full flush. */
+		local_flush_tlb();
+		if (local)
+			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+		trace_tlb_flush(reason, TLB_FLUSH_ALL);
 	}
+
+	/* Both paths above update our state to mm_tlb_gen. */
+	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, mm_tlb_gen);
 }
 
 static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
@@ -253,7 +341,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 	cpu = get_cpu();
 
 	/* This is also a barrier that synchronizes with switch_mm(). */
-	inc_mm_tlb_gen(mm);
+	info.new_tlb_gen = inc_mm_tlb_gen(mm);
 
 	/* Should we flush just the requested range? */
 	if ((end != TLB_FLUSH_ALL) &&

From 94b1b03b519b81c494900cb112aa00ed205cc2d9 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 29 Jun 2017 08:53:17 -0700
Subject: [PATCH 003/149] x86/mm: Rework lazy TLB mode and TLB freshness
 tracking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

x86's lazy TLB mode used to be fairly weak -- it would switch to
init_mm the first time it tried to flush a lazy TLB.  This meant an
unnecessary CR3 write and, if the flush was remote, an unnecessary
IPI.

Rewrite it entirely.  When we enter lazy mode, we simply remove the
CPU from mm_cpumask.  This means that we need a way to figure out
whether we've missed a flush when we switch back out of lazy mode.
I use the tlb_gen machinery to track whether a context is up to
date.

Note to reviewers: this patch, my itself, looks a bit odd.  I'm
using an array of length 1 containing (ctx_id, tlb_gen) rather than
just storing tlb_gen, and making it at array isn't necessary yet.
I'm doing this because the next few patches add PCID support, and,
with PCID, we need ctx_id, and the array will end up with a length
greater than 1.  Making it an array now means that there will be
less churn and therefore less stress on your eyeballs.

NB: This is dubious but, AFAICT, still correct on Xen and UV.
xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
patch changes the way that mm_cpumask() works.  This should be okay,
since Xen *also* iterates all online CPUs to find all the CPUs it
needs to twiddle.

The UV tlbflush code is rather dated and should be changed.

Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs).  I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers.  Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.

MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity.  In
an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
This is intended to be a nearly worst-case test.

  patched:         13.4µs
  unpatched:       21.6µs

Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
nrounds = 100, 256M data

  patched:         1.1 seconds or so
  unpatched:       1.9 seconds or so

The sleepup on Vitaly's test appearss to be because it spends a lot
of time blocked on mmap_sem, and this patch avoids sending IPIs to
blocked CPUs.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mmu_context.h |   6 +-
 arch/x86/include/asm/tlbflush.h    |   4 -
 arch/x86/mm/init.c                 |   1 -
 arch/x86/mm/tlb.c                  | 207 +++++++++++++++++------------
 arch/x86/xen/mmu_pv.c              |   5 +-
 5 files changed, 129 insertions(+), 94 deletions(-)

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index ae19b9d11259..85f6b5575aad 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -128,8 +128,10 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
 
 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
-	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
-		this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
+	int cpu = smp_processor_id();
+
+	if (cpumask_test_cpu(cpu, mm_cpumask(mm)))
+		cpumask_clear_cpu(cpu, mm_cpumask(mm));
 }
 
 static inline int init_new_context(struct task_struct *tsk,
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index d7df54cc7e4d..06e997a36d49 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -95,7 +95,6 @@ struct tlb_state {
 	 * mode even if we've already switched back to swapper_pg_dir.
 	 */
 	struct mm_struct *loaded_mm;
-	int state;
 
 	/*
 	 * Access to this CR4 shadow and to H/W CR4 is protected by
@@ -310,9 +309,6 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
 void native_flush_tlb_others(const struct cpumask *cpumask,
 			     const struct flush_tlb_info *info);
 
-#define TLBSTATE_OK	1
-#define TLBSTATE_LAZY	2
-
 static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
 					struct mm_struct *mm)
 {
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 673541eb3b3f..4d353efb2838 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -812,7 +812,6 @@ void __init zone_sizes_init(void)
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
 	.loaded_mm = &init_mm,
-	.state = 0,
 	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */
 };
 EXPORT_SYMBOL_GPL(cpu_tlbstate);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 4e5a5ddb9e4d..0982c997d36f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -45,8 +45,8 @@ void leave_mm(int cpu)
 	if (loaded_mm == &init_mm)
 		return;
 
-	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
-		BUG();
+	/* Warn if we're not lazy. */
+	WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));
 
 	switch_mm(NULL, &init_mm, NULL);
 }
@@ -65,94 +65,117 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			struct task_struct *tsk)
 {
-	unsigned cpu = smp_processor_id();
 	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
+	unsigned cpu = smp_processor_id();
+	u64 next_tlb_gen;
 
 	/*
-	 * NB: The scheduler will call us with prev == next when
-	 * switching from lazy TLB mode to normal mode if active_mm
-	 * isn't changing.  When this happens, there is no guarantee
-	 * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next.
+	 * NB: The scheduler will call us with prev == next when switching
+	 * from lazy TLB mode to normal mode if active_mm isn't changing.
+	 * When this happens, we don't assume that CR3 (and hence
+	 * cpu_tlbstate.loaded_mm) matches next.
 	 *
 	 * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
 	 */
 
-	this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+	/* We don't want flush_tlb_func_* to run concurrently with us. */
+	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
+		WARN_ON_ONCE(!irqs_disabled());
+
+	/*
+	 * Verify that CR3 is what we think it is.  This will catch
+	 * hypothetical buggy code that directly switches to swapper_pg_dir
+	 * without going through leave_mm() / switch_mm_irqs_off().
+	 */
+	VM_BUG_ON(read_cr3_pa() != __pa(real_prev->pgd));
 
 	if (real_prev == next) {
+		VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) !=
+			  next->context.ctx_id);
+
+		if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
+			/*
+			 * There's nothing to do: we weren't lazy, and we
+			 * aren't changing our mm.  We don't need to flush
+			 * anything, nor do we need to update CR3, CR4, or
+			 * LDTR.
+			 */
+			return;
+		}
+
+		/* Resume remote flushes and then read tlb_gen. */
+		cpumask_set_cpu(cpu, mm_cpumask(next));
+		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+
+		if (this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen) < next_tlb_gen) {
+			/*
+			 * Ideally, we'd have a flush_tlb() variant that
+			 * takes the known CR3 value as input.  This would
+			 * be faster on Xen PV and on hypothetical CPUs
+			 * on which INVPCID is fast.
+			 */
+			this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen,
+				       next_tlb_gen);
+			write_cr3(__pa(next->pgd));
+
+			/*
+			 * This gets called via leave_mm() in the idle path
+			 * where RCU functions differently.  Tracing normally
+			 * uses RCU, so we have to call the tracepoint
+			 * specially here.
+			 */
+			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH,
+						TLB_FLUSH_ALL);
+		}
+
 		/*
-		 * There's nothing to do: we always keep the per-mm control
-		 * regs in sync with cpu_tlbstate.loaded_mm.  Just
-		 * sanity-check mm_cpumask.
+		 * We just exited lazy mode, which means that CR4 and/or LDTR
+		 * may be stale.  (Changes to the required CR4 and LDTR states
+		 * are not reflected in tlb_gen.)
 		 */
-		if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next))))
-			cpumask_set_cpu(cpu, mm_cpumask(next));
-		return;
+	} else {
+		VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) ==
+			  next->context.ctx_id);
+
+		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+			/*
+			 * If our current stack is in vmalloc space and isn't
+			 * mapped in the new pgd, we'll double-fault.  Forcibly
+			 * map it.
+			 */
+			unsigned int index = pgd_index(current_stack_pointer());
+			pgd_t *pgd = next->pgd + index;
+
+			if (unlikely(pgd_none(*pgd)))
+				set_pgd(pgd, init_mm.pgd[index]);
+		}
+
+		/* Stop remote flushes for the previous mm */
+		if (cpumask_test_cpu(cpu, mm_cpumask(real_prev)))
+			cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+
+		VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
+
+		/*
+		 * Start remote flushes and then read tlb_gen.
+		 */
+		cpumask_set_cpu(cpu, mm_cpumask(next));
+		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+
+		this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id);
+		this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen);
+		this_cpu_write(cpu_tlbstate.loaded_mm, next);
+		write_cr3(__pa(next->pgd));
+
+		/*
+		 * This gets called via leave_mm() in the idle path where RCU
+		 * functions differently.  Tracing normally uses RCU, so we
+		 * have to call the tracepoint specially here.
+		 */
+		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH,
+					TLB_FLUSH_ALL);
 	}
 
-	if (IS_ENABLED(CONFIG_VMAP_STACK)) {
-		/*
-		 * If our current stack is in vmalloc space and isn't
-		 * mapped in the new pgd, we'll double-fault.  Forcibly
-		 * map it.
-		 */
-		unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
-
-		pgd_t *pgd = next->pgd + stack_pgd_index;
-
-		if (unlikely(pgd_none(*pgd)))
-			set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
-	}
-
-	this_cpu_write(cpu_tlbstate.loaded_mm, next);
-	this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id);
-	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, atomic64_read(&next->context.tlb_gen));
-
-	WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
-	cpumask_set_cpu(cpu, mm_cpumask(next));
-
-	/*
-	 * Re-load page tables.
-	 *
-	 * This logic has an ordering constraint:
-	 *
-	 *  CPU 0: Write to a PTE for 'next'
-	 *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
-	 *  CPU 1: set bit 1 in next's mm_cpumask
-	 *  CPU 1: load from the PTE that CPU 0 writes (implicit)
-	 *
-	 * We need to prevent an outcome in which CPU 1 observes
-	 * the new PTE value and CPU 0 observes bit 1 clear in
-	 * mm_cpumask.  (If that occurs, then the IPI will never
-	 * be sent, and CPU 0's TLB will contain a stale entry.)
-	 *
-	 * The bad outcome can occur if either CPU's load is
-	 * reordered before that CPU's store, so both CPUs must
-	 * execute full barriers to prevent this from happening.
-	 *
-	 * Thus, switch_mm needs a full barrier between the
-	 * store to mm_cpumask and any operation that could load
-	 * from next->pgd.  TLB fills are special and can happen
-	 * due to instruction fetches or for no reason at all,
-	 * and neither LOCK nor MFENCE orders them.
-	 * Fortunately, load_cr3() is serializing and gives the
-	 * ordering guarantee we need.
-	 */
-	load_cr3(next->pgd);
-
-	/*
-	 * This gets called via leave_mm() in the idle path where RCU
-	 * functions differently.  Tracing normally uses RCU, so we have to
-	 * call the tracepoint specially here.
-	 */
-	trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
-
-	/* Stop flush ipis for the previous mm */
-	WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
-		     real_prev != &init_mm);
-	cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
-
-	/* Load per-mm CR4 and LDTR state */
 	load_mm_cr4(next);
 	switch_ldt(real_prev, next);
 }
@@ -186,13 +209,13 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) !=
 		   loaded_mm->context.ctx_id);
 
-	if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
+	if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
 		/*
-		 * leave_mm() is adequate to handle any type of flush, and
-		 * we would prefer not to receive further IPIs.  leave_mm()
-		 * clears this CPU's bit in mm_cpumask().
+		 * We're in lazy mode -- don't flush.  We can get here on
+		 * remote flushes due to races and on local flushes if a
+		 * kernel thread coincidentally flushes the mm it's lazily
+		 * still using.
 		 */
-		leave_mm(smp_processor_id());
 		return;
 	}
 
@@ -203,6 +226,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 		 * be handled can catch us all the way up, leaving no work for
 		 * the second flush.
 		 */
+		trace_tlb_flush(reason, 0);
 		return;
 	}
 
@@ -304,6 +328,21 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 				(info->end - info->start) >> PAGE_SHIFT);
 
 	if (is_uv_system()) {
+		/*
+		 * This whole special case is confused.  UV has a "Broadcast
+		 * Assist Unit", which seems to be a fancy way to send IPIs.
+		 * Back when x86 used an explicit TLB flush IPI, UV was
+		 * optimized to use its own mechanism.  These days, x86 uses
+		 * smp_call_function_many(), but UV still uses a manual IPI,
+		 * and that IPI's action is out of date -- it does a manual
+		 * flush instead of calling flush_tlb_func_remote().  This
+		 * means that the percpu tlb_gen variables won't be updated
+		 * and we'll do pointless flushes on future context switches.
+		 *
+		 * Rather than hooking native_flush_tlb_others() here, I think
+		 * that UV should be updated so that smp_call_function_many(),
+		 * etc, are optimal on UV.
+		 */
 		unsigned int cpu;
 
 		cpu = smp_processor_id();
@@ -363,6 +402,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 
 	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
 		flush_tlb_others(mm_cpumask(mm), &info);
+
 	put_cpu();
 }
 
@@ -371,8 +411,6 @@ static void do_flush_tlb_all(void *info)
 {
 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
 	__flush_tlb_all();
-	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
-		leave_mm(smp_processor_id());
 }
 
 void flush_tlb_all(void)
@@ -425,6 +463,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 
 	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
 		flush_tlb_others(&batch->cpumask, &info);
+
 	cpumask_clear(&batch->cpumask);
 
 	put_cpu();
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 1d7a7213a310..0472790ec20b 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1005,14 +1005,12 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
 	/* Get the "official" set of cpus referring to our pagetable. */
 	if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
 		for_each_online_cpu(cpu) {
-			if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
-			    && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
+			if (per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
 				continue;
 			smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1);
 		}
 		return;
 	}
-	cpumask_copy(mask, mm_cpumask(mm));
 
 	/*
 	 * It's possible that a vcpu may have a stale reference to our
@@ -1021,6 +1019,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
 	 * look at its actual current cr3 value, and force it to flush
 	 * if needed.
 	 */
+	cpumask_clear(mask);
 	for_each_online_cpu(cpu) {
 		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
 			cpumask_set_cpu(cpu, mask);

From 43858b4f25cf0adc5c2ca9cf5ce5fdf2532941e5 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 29 Jun 2017 08:53:18 -0700
Subject: [PATCH 004/149] x86/mm: Stop calling leave_mm() in idle code

Now that lazy TLB suppresses all flush IPIs (as opposed to all but
the first), there's no need to leave_mm() when going idle.

This means we can get rid of the rcuidle hack in
switch_mm_irqs_off() and we can unexport leave_mm().

This also removes acpi_unlazy_tlb() from the x86 and ia64 headers,
since it has no callers any more.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/03c699cfd6021e467be650d6b73deaccfe4b4bd7.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/ia64/include/asm/acpi.h  |  2 --
 arch/x86/include/asm/acpi.h   |  2 --
 arch/x86/mm/tlb.c             | 20 +++-----------------
 drivers/acpi/processor_idle.c |  2 --
 drivers/idle/intel_idle.c     |  9 ++++-----
 5 files changed, 7 insertions(+), 28 deletions(-)

diff --git a/arch/ia64/include/asm/acpi.h b/arch/ia64/include/asm/acpi.h
index a3d0211970e9..c86a947f5368 100644
--- a/arch/ia64/include/asm/acpi.h
+++ b/arch/ia64/include/asm/acpi.h
@@ -112,8 +112,6 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf)
 	buf[2] |= ACPI_PDC_EST_CAPABILITY_SMP;
 }
 
-#define acpi_unlazy_tlb(x)
-
 #ifdef CONFIG_ACPI_NUMA
 extern cpumask_t early_cpu_possible_map;
 #define for_each_possible_early_cpu(cpu)  \
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 2efc768e4362..562286fa151f 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -150,8 +150,6 @@ static inline void disable_acpi(void) { }
 extern int x86_acpi_numa_init(void);
 #endif /* CONFIG_ACPI_NUMA */
 
-#define acpi_unlazy_tlb(x)	leave_mm(x)
-
 #ifdef CONFIG_ACPI_APEI
 static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr)
 {
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 0982c997d36f..2c1b8881e9d3 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -50,7 +50,6 @@ void leave_mm(int cpu)
 
 	switch_mm(NULL, &init_mm, NULL);
 }
-EXPORT_SYMBOL_GPL(leave_mm);
 
 void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 	       struct task_struct *tsk)
@@ -117,15 +116,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen,
 				       next_tlb_gen);
 			write_cr3(__pa(next->pgd));
-
-			/*
-			 * This gets called via leave_mm() in the idle path
-			 * where RCU functions differently.  Tracing normally
-			 * uses RCU, so we have to call the tracepoint
-			 * specially here.
-			 */
-			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH,
-						TLB_FLUSH_ALL);
+			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
+					TLB_FLUSH_ALL);
 		}
 
 		/*
@@ -167,13 +159,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		this_cpu_write(cpu_tlbstate.loaded_mm, next);
 		write_cr3(__pa(next->pgd));
 
-		/*
-		 * This gets called via leave_mm() in the idle path where RCU
-		 * functions differently.  Tracing normally uses RCU, so we
-		 * have to call the tracepoint specially here.
-		 */
-		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH,
-					TLB_FLUSH_ALL);
+		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 	}
 
 	load_mm_cr4(next);
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 5c8aa9cf62d7..fe3d2a40f311 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -708,8 +708,6 @@ static DEFINE_RAW_SPINLOCK(c3_lock);
 static void acpi_idle_enter_bm(struct acpi_processor *pr,
 			       struct acpi_processor_cx *cx, bool timer_bc)
 {
-	acpi_unlazy_tlb(smp_processor_id());
-
 	/*
 	 * Must be done before busmaster disable as we might need to
 	 * access HPET !
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 216d7ec88c0c..2ae43f59091d 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -912,16 +912,15 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev,
 	struct cpuidle_state *state = &drv->states[index];
 	unsigned long eax = flg2MWAIT(state->flags);
 	unsigned int cstate;
-	int cpu = smp_processor_id();
 
 	cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1;
 
 	/*
-	 * leave_mm() to avoid costly and often unnecessary wakeups
-	 * for flushing the user TLB's associated with the active mm.
+	 * NB: if CPUIDLE_FLAG_TLB_FLUSHED is set, this idle transition
+	 * will probably flush the TLB.  It's not guaranteed to flush
+	 * the TLB, though, so it's not clear that we can do anything
+	 * useful with this knowledge.
 	 */
-	if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED)
-		leave_mm(cpu);
 
 	if (!(lapic_timer_reliable_states & (1 << (cstate))))
 		tick_broadcast_enter();

From cba4671af7550e008f7a7835f06df0763825bf3e Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 29 Jun 2017 08:53:19 -0700
Subject: [PATCH 005/149] x86/mm: Disable PCID on 32-bit kernels

32-bit kernels on new hardware will see PCID in CPUID, but PCID can
only be used in 64-bit mode.  Rather than making all PCID code
conditional, just disable the feature on 32-bit builds.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/2e391769192a4d31b808410c383c6bf0734bc6ea.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/disabled-features.h | 4 +++-
 arch/x86/kernel/cpu/bugs.c               | 8 ++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 5dff775af7cd..c10c9128f54e 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -21,11 +21,13 @@
 # define DISABLE_K6_MTRR	(1<<(X86_FEATURE_K6_MTRR & 31))
 # define DISABLE_CYRIX_ARR	(1<<(X86_FEATURE_CYRIX_ARR & 31))
 # define DISABLE_CENTAUR_MCR	(1<<(X86_FEATURE_CENTAUR_MCR & 31))
+# define DISABLE_PCID		0
 #else
 # define DISABLE_VME		0
 # define DISABLE_K6_MTRR	0
 # define DISABLE_CYRIX_ARR	0
 # define DISABLE_CENTAUR_MCR	0
+# define DISABLE_PCID		(1<<(X86_FEATURE_PCID & 31))
 #endif /* CONFIG_X86_64 */
 
 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
@@ -49,7 +51,7 @@
 #define DISABLED_MASK1	0
 #define DISABLED_MASK2	0
 #define DISABLED_MASK3	(DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR)
-#define DISABLED_MASK4	0
+#define DISABLED_MASK4	(DISABLE_PCID)
 #define DISABLED_MASK5	0
 #define DISABLED_MASK6	0
 #define DISABLED_MASK7	0
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 0af86d9242da..db684880d74a 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -21,6 +21,14 @@
 
 void __init check_bugs(void)
 {
+#ifdef CONFIG_X86_32
+	/*
+	 * Regardless of whether PCID is enumerated, the SDM says
+	 * that it can't be enabled in 32-bit mode.
+	 */
+	setup_clear_cpu_cap(X86_FEATURE_PCID);
+#endif
+
 	identify_boot_cpu();
 
 	if (!IS_ENABLED(CONFIG_SMP)) {

From 0790c9aad84901ca1bdc14746175549c8b5da215 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 29 Jun 2017 08:53:20 -0700
Subject: [PATCH 006/149] x86/mm: Add the 'nopcid' boot option to turn off PCID

The parameter is only present on x86_64 systems to save a few bytes,
as PCID is always disabled on x86_32.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/8bbb2e65bcd249a5f18bfb8128b4689f08ac2b60.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 .../admin-guide/kernel-parameters.txt          |  2 ++
 arch/x86/kernel/cpu/common.c                   | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 9b0b3dea6326..7037a0f86d03 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2657,6 +2657,8 @@
 	nopat		[X86] Disable PAT (page attribute table extension of
 			pagetables) support.
 
+	nopcid		[X86-64] Disable the PCID cpu feature.
+
 	norandmaps	Don't use address space randomization.  Equivalent to
 			echo 0 > /proc/sys/kernel/randomize_va_space
 
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c8b39870f33e..904485e7b230 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -168,6 +168,24 @@ static int __init x86_mpx_setup(char *s)
 }
 __setup("nompx", x86_mpx_setup);
 
+#ifdef CONFIG_X86_64
+static int __init x86_pcid_setup(char *s)
+{
+	/* require an exact match without trailing characters */
+	if (strlen(s))
+		return 0;
+
+	/* do not emit a message if the feature is not present */
+	if (!boot_cpu_has(X86_FEATURE_PCID))
+		return 1;
+
+	setup_clear_cpu_cap(X86_FEATURE_PCID);
+	pr_info("nopcid: PCID feature disabled\n");
+	return 1;
+}
+__setup("nopcid", x86_pcid_setup);
+#endif
+
 static int __init x86_noinvpcid_setup(char *s)
 {
 	/* noinvpcid doesn't accept parameters */

From 660da7c9228f685b2ebe664f9fd69aaddcc420b5 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 29 Jun 2017 08:53:21 -0700
Subject: [PATCH 007/149] x86/mm: Enable CR4.PCIDE on supported systems

We can use PCID if the CPU has PCID and PGE and we're not on Xen.

By itself, this has no effect. A followup patch will start using PCID.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/6327ecd907b32f79d5aa0d466f04503bbec5df88.1498751203.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/tlbflush.h |  8 ++++++++
 arch/x86/kernel/cpu/common.c    | 22 ++++++++++++++++++++++
 arch/x86/xen/enlighten_pv.c     |  6 ++++++
 3 files changed, 36 insertions(+)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 06e997a36d49..6397275008db 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -243,6 +243,14 @@ static inline void __flush_tlb_all(void)
 		__flush_tlb_global();
 	else
 		__flush_tlb();
+
+	/*
+	 * Note: if we somehow had PCID but not PGE, then this wouldn't work --
+	 * we'd end up flushing kernel translations for the current ASID but
+	 * we might fail to flush kernel translations for other cached ASIDs.
+	 *
+	 * To avoid this issue, we force PCID off if PGE is off.
+	 */
 }
 
 static inline void __flush_tlb_one(unsigned long addr)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 904485e7b230..b95cd94ca97b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -329,6 +329,25 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
 	}
 }
 
+static void setup_pcid(struct cpuinfo_x86 *c)
+{
+	if (cpu_has(c, X86_FEATURE_PCID)) {
+		if (cpu_has(c, X86_FEATURE_PGE)) {
+			cr4_set_bits(X86_CR4_PCIDE);
+		} else {
+			/*
+			 * flush_tlb_all(), as currently implemented, won't
+			 * work if PCID is on but PGE is not.  Since that
+			 * combination doesn't exist on real hardware, there's
+			 * no reason to try to fully support it, but it's
+			 * polite to avoid corrupting data if we're on
+			 * an improperly configured VM.
+			 */
+			clear_cpu_cap(c, X86_FEATURE_PCID);
+		}
+	}
+}
+
 /*
  * Protection Keys are not available in 32-bit mode.
  */
@@ -1143,6 +1162,9 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 	setup_smep(c);
 	setup_smap(c);
 
+	/* Set up PCID */
+	setup_pcid(c);
+
 	/*
 	 * The vendor-specific functions might have changed features.
 	 * Now we do "generic changes."
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index f33eef4ebd12..a136aac543c3 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -295,6 +295,12 @@ static void __init xen_init_capabilities(void)
 	setup_clear_cpu_cap(X86_FEATURE_ACC);
 	setup_clear_cpu_cap(X86_FEATURE_X2APIC);
 
+	/*
+	 * Xen PV would need some work to support PCID: CR3 handling as well
+	 * as xen_flush_tlb_others() would need updating.
+	 */
+	setup_clear_cpu_cap(X86_FEATURE_PCID);
+
 	if (!xen_initial_domain())
 		setup_clear_cpu_cap(X86_FEATURE_ACPI);
 

From 29178c1473fd4ad9c523b41bb18a047749c66d11 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Thu, 8 Jun 2017 18:10:28 +0200
Subject: [PATCH 008/149] ARC: defconfig: Cleanup from old Kconfig options

Remove old, dead Kconfig option INET_LRO. It is gone since
commit 7bbf3cae65b6 ("ipv4: Remove inet_lro library").

Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/configs/nps_defconfig   | 1 -
 arch/arc/configs/tb10x_defconfig | 1 -
 2 files changed, 2 deletions(-)

diff --git a/arch/arc/configs/nps_defconfig b/arch/arc/configs/nps_defconfig
index ede625c76216..7c9c706ae7f6 100644
--- a/arch/arc/configs/nps_defconfig
+++ b/arch/arc/configs/nps_defconfig
@@ -39,7 +39,6 @@ CONFIG_IP_PNP=y
 # CONFIG_INET_XFRM_MODE_TRANSPORT is not set
 # CONFIG_INET_XFRM_MODE_TUNNEL is not set
 # CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
 # CONFIG_INET_DIAG is not set
 # CONFIG_IPV6 is not set
 # CONFIG_WIRELESS is not set
diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig
index 4c5118384eb5..f30182549395 100644
--- a/arch/arc/configs/tb10x_defconfig
+++ b/arch/arc/configs/tb10x_defconfig
@@ -38,7 +38,6 @@ CONFIG_IP_MULTICAST=y
 # CONFIG_INET_XFRM_MODE_TRANSPORT is not set
 # CONFIG_INET_XFRM_MODE_TUNNEL is not set
 # CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
 # CONFIG_INET_DIAG is not set
 # CONFIG_IPV6 is not set
 # CONFIG_WIRELESS is not set

From f62995c92a29e4d9331382b8b2461eef3b9c7c6b Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Sun, 9 Jul 2017 20:37:39 +0800
Subject: [PATCH 009/149] x86/boot/KASLR: Wrap e820 entries walking code into
 new function process_e820_entries()

The original function process_e820_entry() only takes care of each
e820 entry passed.

And move the E820_TYPE_RAM checking logic into process_e820_entries().

And remove the redundent local variable 'addr' definition in
find_random_phys_addr().

Signed-off-by: Baoquan He <bhe@redhat.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: fanc.fnst@cn.fujitsu.com
Cc: izumi.taku@jp.fujitsu.com
Cc: matt@codeblueprint.co.uk
Cc: thgarnie@google.com
Link: http://lkml.kernel.org/r/1499603862-11516-2-git-send-email-bhe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/boot/compressed/kaslr.c | 38 ++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 91f27ab970ef..1485f48aeda1 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -488,10 +488,6 @@ static void process_e820_entry(struct boot_e820_entry *entry,
 	unsigned long start_orig, end;
 	struct boot_e820_entry cur_entry;
 
-	/* Skip non-RAM entries. */
-	if (entry->type != E820_TYPE_RAM)
-		return;
-
 	/* On 32-bit, ignore entries entirely above our maximum. */
 	if (IS_ENABLED(CONFIG_X86_32) && entry->addr >= KERNEL_IMAGE_SIZE)
 		return;
@@ -562,12 +558,29 @@ static void process_e820_entry(struct boot_e820_entry *entry,
 	}
 }
 
+static void process_e820_entries(unsigned long minimum,
+				 unsigned long image_size)
+{
+	int i;
+	struct boot_e820_entry *entry;
+
+	/* Verify potential e820 positions, appending to slots list. */
+	for (i = 0; i < boot_params->e820_entries; i++) {
+		entry = &boot_params->e820_table[i];
+		/* Skip non-RAM entries. */
+		if (entry->type != E820_TYPE_RAM)
+			continue;
+		process_e820_entry(entry, minimum, image_size);
+		if (slot_area_index == MAX_SLOT_AREA) {
+			debug_putstr("Aborted e820 scan (slot_areas full)!\n");
+			break;
+		}
+	}
+}
+
 static unsigned long find_random_phys_addr(unsigned long minimum,
 					   unsigned long image_size)
 {
-	int i;
-	unsigned long addr;
-
 	/* Check if we had too many memmaps. */
 	if (memmap_too_large) {
 		debug_putstr("Aborted e820 scan (more than 4 memmap= args)!\n");
@@ -577,16 +590,7 @@ static unsigned long find_random_phys_addr(unsigned long minimum,
 	/* Make sure minimum is aligned. */
 	minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
 
-	/* Verify potential e820 positions, appending to slots list. */
-	for (i = 0; i < boot_params->e820_entries; i++) {
-		process_e820_entry(&boot_params->e820_table[i], minimum,
-				   image_size);
-		if (slot_area_index == MAX_SLOT_AREA) {
-			debug_putstr("Aborted e820 scan (slot_areas full)!\n");
-			break;
-		}
-	}
-
+	process_e820_entries(minimum, image_size);
 	return slots_fetch_random();
 }
 

From 87891b01b54210763117f0a67b022cd94de6cd13 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Sun, 9 Jul 2017 20:37:40 +0800
Subject: [PATCH 010/149] x86/boot/KASLR: Switch to pass struct mem_vector to
 process_e820_entry()

This makes process_e820_entry() be able to process any kind of memory
region.

Signed-off-by: Baoquan He <bhe@redhat.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: fanc.fnst@cn.fujitsu.com
Cc: izumi.taku@jp.fujitsu.com
Cc: matt@codeblueprint.co.uk
Cc: thgarnie@google.com
Link: http://lkml.kernel.org/r/1499603862-11516-3-git-send-email-bhe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/boot/compressed/kaslr.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 1485f48aeda1..36ff9f729c43 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -479,31 +479,31 @@ static unsigned long slots_fetch_random(void)
 	return 0;
 }
 
-static void process_e820_entry(struct boot_e820_entry *entry,
+static void process_e820_entry(struct mem_vector *entry,
 			       unsigned long minimum,
 			       unsigned long image_size)
 {
 	struct mem_vector region, overlap;
 	struct slot_area slot_area;
 	unsigned long start_orig, end;
-	struct boot_e820_entry cur_entry;
+	struct mem_vector cur_entry;
 
 	/* On 32-bit, ignore entries entirely above our maximum. */
-	if (IS_ENABLED(CONFIG_X86_32) && entry->addr >= KERNEL_IMAGE_SIZE)
+	if (IS_ENABLED(CONFIG_X86_32) && entry->start >= KERNEL_IMAGE_SIZE)
 		return;
 
 	/* Ignore entries entirely below our minimum. */
-	if (entry->addr + entry->size < minimum)
+	if (entry->start + entry->size < minimum)
 		return;
 
 	/* Ignore entries above memory limit */
-	end = min(entry->size + entry->addr, mem_limit);
-	if (entry->addr >= end)
+	end = min(entry->size + entry->start, mem_limit);
+	if (entry->start >= end)
 		return;
-	cur_entry.addr = entry->addr;
-	cur_entry.size = end - entry->addr;
+	cur_entry.start = entry->start;
+	cur_entry.size = end - entry->start;
 
-	region.start = cur_entry.addr;
+	region.start = cur_entry.start;
 	region.size = cur_entry.size;
 
 	/* Give up if slot area array is full. */
@@ -518,7 +518,7 @@ static void process_e820_entry(struct boot_e820_entry *entry,
 		region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
 
 		/* Did we raise the address above this e820 region? */
-		if (region.start > cur_entry.addr + cur_entry.size)
+		if (region.start > cur_entry.start + cur_entry.size)
 			return;
 
 		/* Reduce size by any delta from the original address. */
@@ -562,6 +562,7 @@ static void process_e820_entries(unsigned long minimum,
 				 unsigned long image_size)
 {
 	int i;
+	struct mem_vector region;
 	struct boot_e820_entry *entry;
 
 	/* Verify potential e820 positions, appending to slots list. */
@@ -570,7 +571,9 @@ static void process_e820_entries(unsigned long minimum,
 		/* Skip non-RAM entries. */
 		if (entry->type != E820_TYPE_RAM)
 			continue;
-		process_e820_entry(entry, minimum, image_size);
+		region.start = entry->addr;
+		region.size = entry->size;
+		process_e820_entry(&region, minimum, image_size);
 		if (slot_area_index == MAX_SLOT_AREA) {
 			debug_putstr("Aborted e820 scan (slot_areas full)!\n");
 			break;

From 27aac20574110abfd594175a668dc58b23b2b14a Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Sun, 9 Jul 2017 20:37:41 +0800
Subject: [PATCH 011/149] x86/boot/KASLR: Rename process_e820_entry() into
 process_mem_region()

Now process_e820_entry() is not limited to e820 entry processing, rename
it to process_mem_region(). And adjust the code comment accordingly.

Signed-off-by: Baoquan He <bhe@redhat.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: fanc.fnst@cn.fujitsu.com
Cc: izumi.taku@jp.fujitsu.com
Cc: matt@codeblueprint.co.uk
Cc: thgarnie@google.com
Link: http://lkml.kernel.org/r/1499603862-11516-4-git-send-email-bhe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/boot/compressed/kaslr.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 36ff9f729c43..99c7194f7ea6 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -479,7 +479,7 @@ static unsigned long slots_fetch_random(void)
 	return 0;
 }
 
-static void process_e820_entry(struct mem_vector *entry,
+static void process_mem_region(struct mem_vector *entry,
 			       unsigned long minimum,
 			       unsigned long image_size)
 {
@@ -517,7 +517,7 @@ static void process_e820_entry(struct mem_vector *entry,
 		/* Potentially raise address to meet alignment needs. */
 		region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
 
-		/* Did we raise the address above this e820 region? */
+		/* Did we raise the address above the passed in memory entry? */
 		if (region.start > cur_entry.start + cur_entry.size)
 			return;
 
@@ -573,7 +573,7 @@ static void process_e820_entries(unsigned long minimum,
 			continue;
 		region.start = entry->addr;
 		region.size = entry->size;
-		process_e820_entry(&region, minimum, image_size);
+		process_mem_region(&region, minimum, image_size);
 		if (slot_area_index == MAX_SLOT_AREA) {
 			debug_putstr("Aborted e820 scan (slot_areas full)!\n");
 			break;

From c262f3b9a3246da87c66ce398cd7e30d8f1529ea Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:09:58 -0500
Subject: [PATCH 012/149] x86/cpu/AMD: Document AMD Secure Memory Encryption
 (SME)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Create a Documentation entry to describe the AMD Secure Memory
Encryption (SME) feature and add documentation for the mem_encrypt=
kernel parameter.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ca0a0c13b055fd804cfc92cbaca8acd68057eed0.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 .../admin-guide/kernel-parameters.txt         | 11 +++
 Documentation/x86/amd-memory-encryption.txt   | 68 +++++++++++++++++++
 2 files changed, 79 insertions(+)
 create mode 100644 Documentation/x86/amd-memory-encryption.txt

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index f701430f4894..372cc66bba23 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2233,6 +2233,17 @@
 			memory contents and reserves bad memory
 			regions that are detected.
 
+	mem_encrypt=	[X86-64] AMD Secure Memory Encryption (SME) control
+			Valid arguments: on, off
+			Default (depends on kernel configuration option):
+			  on  (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y)
+			  off (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=n)
+			mem_encrypt=on:		Activate SME
+			mem_encrypt=off:	Do not activate SME
+
+			Refer to Documentation/x86/amd-memory-encryption.txt
+			for details on when memory encryption can be activated.
+
 	mem_sleep_default=	[SUSPEND] Default system suspend mode:
 			s2idle  - Suspend-To-Idle
 			shallow - Power-On Suspend or equivalent (if supported)
diff --git a/Documentation/x86/amd-memory-encryption.txt b/Documentation/x86/amd-memory-encryption.txt
new file mode 100644
index 000000000000..f512ab718541
--- /dev/null
+++ b/Documentation/x86/amd-memory-encryption.txt
@@ -0,0 +1,68 @@
+Secure Memory Encryption (SME) is a feature found on AMD processors.
+
+SME provides the ability to mark individual pages of memory as encrypted using
+the standard x86 page tables.  A page that is marked encrypted will be
+automatically decrypted when read from DRAM and encrypted when written to
+DRAM.  SME can therefore be used to protect the contents of DRAM from physical
+attacks on the system.
+
+A page is encrypted when a page table entry has the encryption bit set (see
+below on how to determine its position).  The encryption bit can also be
+specified in the cr3 register, allowing the PGD table to be encrypted. Each
+successive level of page tables can also be encrypted by setting the encryption
+bit in the page table entry that points to the next table. This allows the full
+page table hierarchy to be encrypted. Note, this means that just because the
+encryption bit is set in cr3, doesn't imply the full hierarchy is encyrpted.
+Each page table entry in the hierarchy needs to have the encryption bit set to
+achieve that. So, theoretically, you could have the encryption bit set in cr3
+so that the PGD is encrypted, but not set the encryption bit in the PGD entry
+for a PUD which results in the PUD pointed to by that entry to not be
+encrypted.
+
+Support for SME can be determined through the CPUID instruction. The CPUID
+function 0x8000001f reports information related to SME:
+
+	0x8000001f[eax]:
+		Bit[0] indicates support for SME
+	0x8000001f[ebx]:
+		Bits[5:0]  pagetable bit number used to activate memory
+			   encryption
+		Bits[11:6] reduction in physical address space, in bits, when
+			   memory encryption is enabled (this only affects
+			   system physical addresses, not guest physical
+			   addresses)
+
+If support for SME is present, MSR 0xc00100010 (MSR_K8_SYSCFG) can be used to
+determine if SME is enabled and/or to enable memory encryption:
+
+	0xc0010010:
+		Bit[23]   0 = memory encryption features are disabled
+			  1 = memory encryption features are enabled
+
+Linux relies on BIOS to set this bit if BIOS has determined that the reduction
+in the physical address space as a result of enabling memory encryption (see
+CPUID information above) will not conflict with the address space resource
+requirements for the system.  If this bit is not set upon Linux startup then
+Linux itself will not set it and memory encryption will not be possible.
+
+The state of SME in the Linux kernel can be documented as follows:
+	- Supported:
+	  The CPU supports SME (determined through CPUID instruction).
+
+	- Enabled:
+	  Supported and bit 23 of MSR_K8_SYSCFG is set.
+
+	- Active:
+	  Supported, Enabled and the Linux kernel is actively applying
+	  the encryption bit to page table entries (the SME mask in the
+	  kernel is non-zero).
+
+SME can also be enabled and activated in the BIOS. If SME is enabled and
+activated in the BIOS, then all memory accesses will be encrypted and it will
+not be necessary to activate the Linux memory encryption support.  If the BIOS
+merely enables SME (sets bit 23 of the MSR_K8_SYSCFG), then Linux can activate
+memory encryption by default (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y) or
+by supplying mem_encrypt=on on the kernel command line.  However, if BIOS does
+not enable SME, then Linux will not be able to activate memory encryption, even
+if configured to do so by default or the mem_encrypt=on command line parameter
+is specified.

From aac7b79eea6118dee3da9b99dcd564471672806d Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:09:59 -0500
Subject: [PATCH 013/149] x86/mm/pat: Set write-protect cache mode for full PAT
 support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For processors that support PAT, set the write-protect cache mode
(_PAGE_CACHE_MODE_WP) entry to the actual write-protect value (x05).

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Borislav Petkov <bp@suse.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/ade53b63d4dbffbfc3cb08fb62024647059c8688.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/pat.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 45979502f64b..88990ab87961 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -293,7 +293,7 @@ void init_cache_modes(void)
  * pat_init - Initialize PAT MSR and PAT table
  *
  * This function initializes PAT MSR and PAT table with an OS-defined value
- * to enable additional cache attributes, WC and WT.
+ * to enable additional cache attributes, WC, WT and WP.
  *
  * This function must be called on all CPUs using the specific sequence of
  * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this
@@ -352,7 +352,7 @@ void pat_init(void)
 		 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
 		 *      011    3    UC : _PAGE_CACHE_MODE_UC
 		 *      100    4    WB : Reserved
-		 *      101    5    WC : Reserved
+		 *      101    5    WP : _PAGE_CACHE_MODE_WP
 		 *      110    6    UC-: Reserved
 		 *      111    7    WT : _PAGE_CACHE_MODE_WT
 		 *
@@ -360,7 +360,7 @@ void pat_init(void)
 		 * corresponding types in the presence of PAT errata.
 		 */
 		pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
-		      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT);
+		      PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT);
 	}
 
 	if (!boot_cpu_done) {

From f7750a79568788473c5e8092ee58a52248f34329 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:00 -0500
Subject: [PATCH 014/149] x86, mpparse, x86/acpi, x86/PCI, x86/dmi, SFI: Use
 memremap() for RAM mappings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ioremap() function is intended for mapping MMIO. For RAM, the
memremap() function should be used. Convert calls from ioremap() to
memremap() when re-mapping RAM.

This will be used later by SME to control how the encryption mask is
applied to memory mappings, with certain memory locations being mapped
decrypted vs encrypted.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/b13fccb9abbd547a7eef7b1fdfc223431b211c88.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/dmi.h   |  8 ++++----
 arch/x86/kernel/acpi/boot.c  |  6 +++---
 arch/x86/kernel/kdebugfs.c   | 34 +++++++++++-----------------------
 arch/x86/kernel/ksysfs.c     | 28 ++++++++++++++--------------
 arch/x86/kernel/mpparse.c    | 10 +++++-----
 arch/x86/pci/common.c        |  4 ++--
 drivers/firmware/dmi-sysfs.c |  5 +++--
 drivers/firmware/pcdp.c      |  4 ++--
 drivers/sfi/sfi_core.c       | 22 +++++++++++-----------
 9 files changed, 55 insertions(+), 66 deletions(-)

diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
index 3c69fed215c5..a8e15b04565b 100644
--- a/arch/x86/include/asm/dmi.h
+++ b/arch/x86/include/asm/dmi.h
@@ -13,9 +13,9 @@ static __always_inline __init void *dmi_alloc(unsigned len)
 }
 
 /* Use early IO mappings for DMI because it's initialized early */
-#define dmi_early_remap		early_ioremap
-#define dmi_early_unmap		early_iounmap
-#define dmi_remap		ioremap_cache
-#define dmi_unmap		iounmap
+#define dmi_early_remap		early_memremap
+#define dmi_early_unmap		early_memunmap
+#define dmi_remap(_x, _l)	memremap(_x, _l, MEMREMAP_WB)
+#define dmi_unmap(_x)		memunmap(_x)
 
 #endif /* _ASM_X86_DMI_H */
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 6bb680671088..850160a76864 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -115,7 +115,7 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
 #define	ACPI_INVALID_GSI		INT_MIN
 
 /*
- * This is just a simple wrapper around early_ioremap(),
+ * This is just a simple wrapper around early_memremap(),
  * with sanity checks for phys == 0 and size == 0.
  */
 char *__init __acpi_map_table(unsigned long phys, unsigned long size)
@@ -124,7 +124,7 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
 	if (!phys || !size)
 		return NULL;
 
-	return early_ioremap(phys, size);
+	return early_memremap(phys, size);
 }
 
 void __init __acpi_unmap_table(char *map, unsigned long size)
@@ -132,7 +132,7 @@ void __init __acpi_unmap_table(char *map, unsigned long size)
 	if (!map || !size)
 		return;
 
-	early_iounmap(map, size);
+	early_memunmap(map, size);
 }
 
 #ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index 38b64587b31b..fd6f8fbbe6f2 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -33,7 +33,6 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
 	struct setup_data_node *node = file->private_data;
 	unsigned long remain;
 	loff_t pos = *ppos;
-	struct page *pg;
 	void *p;
 	u64 pa;
 
@@ -47,18 +46,13 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
 		count = node->len - pos;
 
 	pa = node->paddr + sizeof(struct setup_data) + pos;
-	pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT);
-	if (PageHighMem(pg)) {
-		p = ioremap_cache(pa, count);
-		if (!p)
-			return -ENXIO;
-	} else
-		p = __va(pa);
+	p = memremap(pa, count, MEMREMAP_WB);
+	if (!p)
+		return -ENOMEM;
 
 	remain = copy_to_user(user_buf, p, count);
 
-	if (PageHighMem(pg))
-		iounmap(p);
+	memunmap(p);
 
 	if (remain)
 		return -EFAULT;
@@ -109,7 +103,6 @@ static int __init create_setup_data_nodes(struct dentry *parent)
 	struct setup_data *data;
 	int error;
 	struct dentry *d;
-	struct page *pg;
 	u64 pa_data;
 	int no = 0;
 
@@ -126,16 +119,12 @@ static int __init create_setup_data_nodes(struct dentry *parent)
 			goto err_dir;
 		}
 
-		pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT);
-		if (PageHighMem(pg)) {
-			data = ioremap_cache(pa_data, sizeof(*data));
-			if (!data) {
-				kfree(node);
-				error = -ENXIO;
-				goto err_dir;
-			}
-		} else
-			data = __va(pa_data);
+		data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
+		if (!data) {
+			kfree(node);
+			error = -ENOMEM;
+			goto err_dir;
+		}
 
 		node->paddr = pa_data;
 		node->type = data->type;
@@ -143,8 +132,7 @@ static int __init create_setup_data_nodes(struct dentry *parent)
 		error = create_setup_data_node(d, no, node);
 		pa_data = data->next;
 
-		if (PageHighMem(pg))
-			iounmap(data);
+		memunmap(data);
 		if (error)
 			goto err_dir;
 		no++;
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
index 4afc67f5facc..ee51db9a968a 100644
--- a/arch/x86/kernel/ksysfs.c
+++ b/arch/x86/kernel/ksysfs.c
@@ -16,8 +16,8 @@
 #include <linux/stat.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
+#include <linux/io.h>
 
-#include <asm/io.h>
 #include <asm/setup.h>
 
 static ssize_t version_show(struct kobject *kobj,
@@ -79,12 +79,12 @@ static int get_setup_data_paddr(int nr, u64 *paddr)
 			*paddr = pa_data;
 			return 0;
 		}
-		data = ioremap_cache(pa_data, sizeof(*data));
+		data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
 		if (!data)
 			return -ENOMEM;
 
 		pa_data = data->next;
-		iounmap(data);
+		memunmap(data);
 		i++;
 	}
 	return -EINVAL;
@@ -97,17 +97,17 @@ static int __init get_setup_data_size(int nr, size_t *size)
 	u64 pa_data = boot_params.hdr.setup_data;
 
 	while (pa_data) {
-		data = ioremap_cache(pa_data, sizeof(*data));
+		data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
 		if (!data)
 			return -ENOMEM;
 		if (nr == i) {
 			*size = data->len;
-			iounmap(data);
+			memunmap(data);
 			return 0;
 		}
 
 		pa_data = data->next;
-		iounmap(data);
+		memunmap(data);
 		i++;
 	}
 	return -EINVAL;
@@ -127,12 +127,12 @@ static ssize_t type_show(struct kobject *kobj,
 	ret = get_setup_data_paddr(nr, &paddr);
 	if (ret)
 		return ret;
-	data = ioremap_cache(paddr, sizeof(*data));
+	data = memremap(paddr, sizeof(*data), MEMREMAP_WB);
 	if (!data)
 		return -ENOMEM;
 
 	ret = sprintf(buf, "0x%x\n", data->type);
-	iounmap(data);
+	memunmap(data);
 	return ret;
 }
 
@@ -154,7 +154,7 @@ static ssize_t setup_data_data_read(struct file *fp,
 	ret = get_setup_data_paddr(nr, &paddr);
 	if (ret)
 		return ret;
-	data = ioremap_cache(paddr, sizeof(*data));
+	data = memremap(paddr, sizeof(*data), MEMREMAP_WB);
 	if (!data)
 		return -ENOMEM;
 
@@ -170,15 +170,15 @@ static ssize_t setup_data_data_read(struct file *fp,
 		goto out;
 
 	ret = count;
-	p = ioremap_cache(paddr + sizeof(*data), data->len);
+	p = memremap(paddr + sizeof(*data), data->len, MEMREMAP_WB);
 	if (!p) {
 		ret = -ENOMEM;
 		goto out;
 	}
 	memcpy(buf, p + off, count);
-	iounmap(p);
+	memunmap(p);
 out:
-	iounmap(data);
+	memunmap(data);
 	return ret;
 }
 
@@ -250,13 +250,13 @@ static int __init get_setup_data_total_num(u64 pa_data, int *nr)
 	*nr = 0;
 	while (pa_data) {
 		*nr += 1;
-		data = ioremap_cache(pa_data, sizeof(*data));
+		data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
 		if (!data) {
 			ret = -ENOMEM;
 			goto out;
 		}
 		pa_data = data->next;
-		iounmap(data);
+		memunmap(data);
 	}
 
 out:
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 0d904d759ff1..fd37f39066da 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -436,9 +436,9 @@ static unsigned long __init get_mpc_size(unsigned long physptr)
 	struct mpc_table *mpc;
 	unsigned long size;
 
-	mpc = early_ioremap(physptr, PAGE_SIZE);
+	mpc = early_memremap(physptr, PAGE_SIZE);
 	size = mpc->length;
-	early_iounmap(mpc, PAGE_SIZE);
+	early_memunmap(mpc, PAGE_SIZE);
 	apic_printk(APIC_VERBOSE, "  mpc: %lx-%lx\n", physptr, physptr + size);
 
 	return size;
@@ -450,7 +450,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
 	unsigned long size;
 
 	size = get_mpc_size(mpf->physptr);
-	mpc = early_ioremap(mpf->physptr, size);
+	mpc = early_memremap(mpf->physptr, size);
 	/*
 	 * Read the physical hardware table.  Anything here will
 	 * override the defaults.
@@ -461,10 +461,10 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
 #endif
 		pr_err("BIOS bug, MP table errors detected!...\n");
 		pr_cont("... disabling SMP support. (tell your hw vendor)\n");
-		early_iounmap(mpc, size);
+		early_memunmap(mpc, size);
 		return -1;
 	}
-	early_iounmap(mpc, size);
+	early_memunmap(mpc, size);
 
 	if (early)
 		return -1;
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index dbe2132b0ed4..7a5350d08cef 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -674,7 +674,7 @@ int pcibios_add_device(struct pci_dev *dev)
 
 	pa_data = boot_params.hdr.setup_data;
 	while (pa_data) {
-		data = ioremap(pa_data, sizeof(*rom));
+		data = memremap(pa_data, sizeof(*rom), MEMREMAP_WB);
 		if (!data)
 			return -ENOMEM;
 
@@ -693,7 +693,7 @@ int pcibios_add_device(struct pci_dev *dev)
 			}
 		}
 		pa_data = data->next;
-		iounmap(data);
+		memunmap(data);
 	}
 	set_dma_domain_ops(dev);
 	set_dev_domain_options(dev);
diff --git a/drivers/firmware/dmi-sysfs.c b/drivers/firmware/dmi-sysfs.c
index ef76e5eecf0b..d5de6ee8466d 100644
--- a/drivers/firmware/dmi-sysfs.c
+++ b/drivers/firmware/dmi-sysfs.c
@@ -25,6 +25,7 @@
 #include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/io.h>
+#include <asm/dmi.h>
 
 #define MAX_ENTRY_TYPE 255 /* Most of these aren't used, but we consider
 			      the top entry type is only 8 bits */
@@ -380,7 +381,7 @@ static ssize_t dmi_sel_raw_read_phys32(struct dmi_sysfs_entry *entry,
 	u8 __iomem *mapped;
 	ssize_t wrote = 0;
 
-	mapped = ioremap(sel->access_method_address, sel->area_length);
+	mapped = dmi_remap(sel->access_method_address, sel->area_length);
 	if (!mapped)
 		return -EIO;
 
@@ -390,7 +391,7 @@ static ssize_t dmi_sel_raw_read_phys32(struct dmi_sysfs_entry *entry,
 		wrote++;
 	}
 
-	iounmap(mapped);
+	dmi_unmap(mapped);
 	return wrote;
 }
 
diff --git a/drivers/firmware/pcdp.c b/drivers/firmware/pcdp.c
index 75273a251603..e83d6aec0c13 100644
--- a/drivers/firmware/pcdp.c
+++ b/drivers/firmware/pcdp.c
@@ -95,7 +95,7 @@ efi_setup_pcdp_console(char *cmdline)
 	if (efi.hcdp == EFI_INVALID_TABLE_ADDR)
 		return -ENODEV;
 
-	pcdp = early_ioremap(efi.hcdp, 4096);
+	pcdp = early_memremap(efi.hcdp, 4096);
 	printk(KERN_INFO "PCDP: v%d at 0x%lx\n", pcdp->rev, efi.hcdp);
 
 	if (strstr(cmdline, "console=hcdp")) {
@@ -131,6 +131,6 @@ efi_setup_pcdp_console(char *cmdline)
 	}
 
 out:
-	early_iounmap(pcdp, 4096);
+	early_memunmap(pcdp, 4096);
 	return rc;
 }
diff --git a/drivers/sfi/sfi_core.c b/drivers/sfi/sfi_core.c
index 296db7a69c27..d5ce53491efb 100644
--- a/drivers/sfi/sfi_core.c
+++ b/drivers/sfi/sfi_core.c
@@ -86,13 +86,13 @@ static struct sfi_table_simple *syst_va __read_mostly;
 /*
  * FW creates and saves the SFI tables in memory. When these tables get
  * used, they may need to be mapped to virtual address space, and the mapping
- * can happen before or after the ioremap() is ready, so a flag is needed
+ * can happen before or after the memremap() is ready, so a flag is needed
  * to indicating this
  */
-static u32 sfi_use_ioremap __read_mostly;
+static u32 sfi_use_memremap __read_mostly;
 
 /*
- * sfi_un/map_memory calls early_ioremap/iounmap which is a __init function
+ * sfi_un/map_memory calls early_memremap/memunmap which is a __init function
  * and introduces section mismatch. So use __ref to make it calm.
  */
 static void __iomem * __ref sfi_map_memory(u64 phys, u32 size)
@@ -100,10 +100,10 @@ static void __iomem * __ref sfi_map_memory(u64 phys, u32 size)
 	if (!phys || !size)
 		return NULL;
 
-	if (sfi_use_ioremap)
-		return ioremap_cache(phys, size);
+	if (sfi_use_memremap)
+		return memremap(phys, size, MEMREMAP_WB);
 	else
-		return early_ioremap(phys, size);
+		return early_memremap(phys, size);
 }
 
 static void __ref sfi_unmap_memory(void __iomem *virt, u32 size)
@@ -111,10 +111,10 @@ static void __ref sfi_unmap_memory(void __iomem *virt, u32 size)
 	if (!virt || !size)
 		return;
 
-	if (sfi_use_ioremap)
-		iounmap(virt);
+	if (sfi_use_memremap)
+		memunmap(virt);
 	else
-		early_iounmap(virt, size);
+		early_memunmap(virt, size);
 }
 
 static void sfi_print_table_header(unsigned long long pa,
@@ -507,8 +507,8 @@ void __init sfi_init_late(void)
 	length = syst_va->header.len;
 	sfi_unmap_memory(syst_va, sizeof(struct sfi_table_simple));
 
-	/* Use ioremap now after it is ready */
-	sfi_use_ioremap = 1;
+	/* Use memremap now after it is ready */
+	sfi_use_memremap = 1;
 	syst_va = sfi_map_memory(syst_pa, length);
 
 	sfi_acpi_init();

From 872cbefd2d9c52bd0b1e2c7942c4369e98a5a5ae Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:01 -0500
Subject: [PATCH 015/149] x86/cpu/AMD: Add the Secure Memory Encryption CPU
 feature
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Update the CPU features to include identifying and reporting on the
Secure Memory Encryption (SME) feature.  SME is identified by CPUID
0x8000001f, but requires BIOS support to enable it (set bit 23 of
MSR_K8_SYSCFG).  Only show the SME feature as available if reported by
CPUID, enabled by BIOS and not configured as CONFIG_X86_32=y.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/85c17ff450721abccddc95e611ae8df3f4d9718b.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/cpufeatures.h |  1 +
 arch/x86/include/asm/msr-index.h   |  2 ++
 arch/x86/kernel/cpu/amd.c          | 19 +++++++++++++++++++
 arch/x86/kernel/cpu/scattered.c    |  1 +
 4 files changed, 23 insertions(+)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index ca3c48c0872f..14f0f2913364 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -196,6 +196,7 @@
 
 #define X86_FEATURE_HW_PSTATE	( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
+#define X86_FEATURE_SME		( 7*32+10) /* AMD Secure Memory Encryption */
 
 #define X86_FEATURE_INTEL_PPIN	( 7*32+14) /* Intel Processor Inventory Number */
 #define X86_FEATURE_INTEL_PT	( 7*32+15) /* Intel Processor Trace */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 5573c75f8e4c..17f5c12e1afd 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -356,6 +356,8 @@
 #define MSR_K8_TOP_MEM1			0xc001001a
 #define MSR_K8_TOP_MEM2			0xc001001d
 #define MSR_K8_SYSCFG			0xc0010010
+#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT	23
+#define MSR_K8_SYSCFG_MEM_ENCRYPT	BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT)
 #define MSR_K8_INT_PENDING_MSG		0xc0010055
 /* C1E active bits in int pending message */
 #define K8_INTP_C1E_ACTIVE_MASK		0x18000000
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index bb5abe8f5fd4..5ccc7b2e63bb 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -611,6 +611,25 @@ static void early_init_amd(struct cpuinfo_x86 *c)
 	 */
 	if (cpu_has_amd_erratum(c, amd_erratum_400))
 		set_cpu_bug(c, X86_BUG_AMD_E400);
+
+	/*
+	 * BIOS support is required for SME. If BIOS has not enabled SME
+	 * then don't advertise the feature (set in scattered.c). Also,
+	 * since the SME support requires long mode, don't advertise the
+	 * feature under CONFIG_X86_32.
+	 */
+	if (cpu_has(c, X86_FEATURE_SME)) {
+		if (IS_ENABLED(CONFIG_X86_32)) {
+			clear_cpu_cap(c, X86_FEATURE_SME);
+		} else {
+			u64 msr;
+
+			/* Check if SME is enabled */
+			rdmsrl(MSR_K8_SYSCFG, msr);
+			if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+				clear_cpu_cap(c, X86_FEATURE_SME);
+		}
+	}
 }
 
 static void init_amd_k8(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 23c23508c012..05459ad3db46 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -31,6 +31,7 @@ static const struct cpuid_bit cpuid_bits[] = {
 	{ X86_FEATURE_HW_PSTATE,	CPUID_EDX,  7, 0x80000007, 0 },
 	{ X86_FEATURE_CPB,		CPUID_EDX,  9, 0x80000007, 0 },
 	{ X86_FEATURE_PROC_FEEDBACK,    CPUID_EDX, 11, 0x80000007, 0 },
+	{ X86_FEATURE_SME,		CPUID_EAX,  0, 0x8000001f, 0 },
 	{ 0, 0, 0, 0, 0 }
 };
 

From 9af9b94068fb1ea3206a700fc222075966fbef14 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:02 -0500
Subject: [PATCH 016/149] x86/cpu/AMD: Handle SME reduction in physical address
 size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When System Memory Encryption (SME) is enabled, the physical address
space is reduced. Adjust the x86_phys_bits value to reflect this
reduction.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/593c037a3cad85ba92f3d061ffa7462e9ce3531d.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/amd.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 5ccc7b2e63bb..4d87950fde30 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -613,21 +613,23 @@ static void early_init_amd(struct cpuinfo_x86 *c)
 		set_cpu_bug(c, X86_BUG_AMD_E400);
 
 	/*
-	 * BIOS support is required for SME. If BIOS has not enabled SME
-	 * then don't advertise the feature (set in scattered.c). Also,
-	 * since the SME support requires long mode, don't advertise the
-	 * feature under CONFIG_X86_32.
+	 * BIOS support is required for SME. If BIOS has enabled SME then
+	 * adjust x86_phys_bits by the SME physical address space reduction
+	 * value. If BIOS has not enabled SME then don't advertise the
+	 * feature (set in scattered.c). Also, since the SME support requires
+	 * long mode, don't advertise the feature under CONFIG_X86_32.
 	 */
 	if (cpu_has(c, X86_FEATURE_SME)) {
-		if (IS_ENABLED(CONFIG_X86_32)) {
-			clear_cpu_cap(c, X86_FEATURE_SME);
-		} else {
-			u64 msr;
+		u64 msr;
 
-			/* Check if SME is enabled */
-			rdmsrl(MSR_K8_SYSCFG, msr);
-			if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+		/* Check if SME is enabled */
+		rdmsrl(MSR_K8_SYSCFG, msr);
+		if (msr & MSR_K8_SYSCFG_MEM_ENCRYPT) {
+			c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f;
+			if (IS_ENABLED(CONFIG_X86_32))
 				clear_cpu_cap(c, X86_FEATURE_SME);
+		} else {
+			clear_cpu_cap(c, X86_FEATURE_SME);
 		}
 	}
 }

From 7744ccdbc16f0ac4adae21b3678af93775b3a386 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:03 -0500
Subject: [PATCH 017/149] x86/mm: Add Secure Memory Encryption (SME) support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for Secure Memory Encryption (SME). This initial support
provides a Kconfig entry to build the SME support into the kernel and
defines the memory encryption mask that will be used in subsequent
patches to mark pages as encrypted.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/a6c34d16caaed3bc3e2d6f0987554275bd291554.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig                   | 25 +++++++++++++++++++++
 arch/x86/include/asm/mem_encrypt.h | 30 +++++++++++++++++++++++++
 arch/x86/mm/Makefile               |  1 +
 arch/x86/mm/mem_encrypt.c          | 21 ++++++++++++++++++
 include/linux/mem_encrypt.h        | 35 ++++++++++++++++++++++++++++++
 5 files changed, 112 insertions(+)
 create mode 100644 arch/x86/include/asm/mem_encrypt.h
 create mode 100644 arch/x86/mm/mem_encrypt.c
 create mode 100644 include/linux/mem_encrypt.h

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 781521b7cf9e..ba7b93d08d00 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1415,6 +1415,31 @@ config X86_DIRECT_GBPAGES
 	  supports them), so don't confuse the user by printing
 	  that we have them enabled.
 
+config ARCH_HAS_MEM_ENCRYPT
+	def_bool y
+
+config AMD_MEM_ENCRYPT
+	bool "AMD Secure Memory Encryption (SME) support"
+	depends on X86_64 && CPU_SUP_AMD
+	---help---
+	  Say yes to enable support for the encryption of system memory.
+	  This requires an AMD processor that supports Secure Memory
+	  Encryption (SME).
+
+config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT
+	bool "Activate AMD Secure Memory Encryption (SME) by default"
+	default y
+	depends on AMD_MEM_ENCRYPT
+	---help---
+	  Say yes to have system memory encrypted by default if running on
+	  an AMD processor that supports Secure Memory Encryption (SME).
+
+	  If set to Y, then the encryption of system memory can be
+	  deactivated with the mem_encrypt=off command line option.
+
+	  If set to N, then the encryption of system memory can be
+	  activated with the mem_encrypt=on command line option.
+
 # Common NUMA Features
 config NUMA
 	bool "Numa Memory Allocation and Scheduler Support"
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
new file mode 100644
index 000000000000..a1057961ac46
--- /dev/null
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -0,0 +1,30 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __X86_MEM_ENCRYPT_H__
+#define __X86_MEM_ENCRYPT_H__
+
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+
+extern unsigned long sme_me_mask;
+
+#else	/* !CONFIG_AMD_MEM_ENCRYPT */
+
+#define sme_me_mask	0UL
+
+#endif	/* CONFIG_AMD_MEM_ENCRYPT */
+
+#endif	/* __ASSEMBLY__ */
+
+#endif	/* __X86_MEM_ENCRYPT_H__ */
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 0fbdcb64f9f8..a94a7b663d5f 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -39,3 +39,4 @@ obj-$(CONFIG_X86_INTEL_MPX)	+= mpx.o
 obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
 obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
 
+obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt.o
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
new file mode 100644
index 000000000000..b99d469c73e7
--- /dev/null
+++ b/arch/x86/mm/mem_encrypt.c
@@ -0,0 +1,21 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+
+/*
+ * Since SME related variables are set early in the boot process they must
+ * reside in the .data section so as not to be zeroed out when the .bss
+ * section is later cleared.
+ */
+unsigned long sme_me_mask __section(.data) = 0;
+EXPORT_SYMBOL_GPL(sme_me_mask);
diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h
new file mode 100644
index 000000000000..59769f7287e4
--- /dev/null
+++ b/include/linux/mem_encrypt.h
@@ -0,0 +1,35 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __MEM_ENCRYPT_H__
+#define __MEM_ENCRYPT_H__
+
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT
+
+#include <asm/mem_encrypt.h>
+
+#else	/* !CONFIG_ARCH_HAS_MEM_ENCRYPT */
+
+#define sme_me_mask	0UL
+
+#endif	/* CONFIG_ARCH_HAS_MEM_ENCRYPT */
+
+static inline bool sme_active(void)
+{
+	return !!sme_me_mask;
+}
+
+#endif	/* __ASSEMBLY__ */
+
+#endif	/* __MEM_ENCRYPT_H__ */

From 33c2b803edd13487518a2c7d5002d84d7e9c878f Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:04 -0500
Subject: [PATCH 018/149] x86/mm: Remove phys_to_virt() usage in ioremap()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently there is a check if the address being mapped is in the ISA
range (is_ISA_range()), and if it is, then phys_to_virt() is used to
perform the mapping. When SME is active, the default is to add pagetable
mappings with the encryption bit set unless specifically overridden. The
resulting pagetable mapping from phys_to_virt() will result in a mapping
that has the encryption bit set. With SME, the use of ioremap() is
intended to generate pagetable mappings that do not have the encryption
bit set through the use of the PAGE_KERNEL_IO protection value.

Rather than special case the SME scenario, remove the ISA range check and
usage of phys_to_virt() and have ISA range mappings continue through the
remaining ioremap() path.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/88ada7b09c6568c61cd696351eb59fb15a82ce1a.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/ioremap.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 4c1b5fd0c7ad..66ddf5e8ffc8 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -105,12 +105,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 		return NULL;
 	}
 
-	/*
-	 * Don't remap the low PCI/ISA area, it's always mapped..
-	 */
-	if (is_ISA_range(phys_addr, last_addr))
-		return (__force void __iomem *)phys_to_virt(phys_addr);
-
 	/*
 	 * Don't allow anybody to remap normal RAM that we're using..
 	 */
@@ -340,13 +334,17 @@ void iounmap(volatile void __iomem *addr)
 		return;
 
 	/*
-	 * __ioremap special-cases the PCI/ISA range by not instantiating a
-	 * vm_area and by simply returning an address into the kernel mapping
-	 * of ISA space.   So handle that here.
+	 * The PCI/ISA range special-casing was removed from __ioremap()
+	 * so this check, in theory, can be removed. However, there are
+	 * cases where iounmap() is called for addresses not obtained via
+	 * ioremap() (vga16fb for example). Add a warning so that these
+	 * cases can be caught and fixed.
 	 */
 	if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) &&
-	    (void __force *)addr < phys_to_virt(ISA_END_ADDRESS))
+	    (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) {
+		WARN(1, "iounmap() called for ISA range not obtained using ioremap()\n");
 		return;
+	}
 
 	addr = (volatile void __iomem *)
 		(PAGE_MASK & (unsigned long __force)addr);

From 5868f3651fa0dff96a57f94d49247d3ef320ebe2 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:05 -0500
Subject: [PATCH 019/149] x86/mm: Add support to enable SME in early boot
 processing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support to the early boot code to use Secure Memory Encryption (SME).
Since the kernel has been loaded into memory in a decrypted state, encrypt
the kernel in place and update the early pagetables with the memory
encryption mask so that new pagetable entries will use memory encryption.

The routines to set the encryption mask and perform the encryption are
stub routines for now with functionality to be added in a later patch.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/e52ad781f085224bf835b3caff9aa3aee6febccb.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mem_encrypt.h |  8 +++++
 arch/x86/kernel/head64.c           | 53 +++++++++++++++++++++++-------
 arch/x86/kernel/head_64.S          | 20 +++++++++--
 arch/x86/mm/mem_encrypt.c          |  9 +++++
 include/linux/mem_encrypt.h        |  5 +++
 5 files changed, 82 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index a1057961ac46..475e34f53793 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -15,14 +15,22 @@
 
 #ifndef __ASSEMBLY__
 
+#include <linux/init.h>
+
 #ifdef CONFIG_AMD_MEM_ENCRYPT
 
 extern unsigned long sme_me_mask;
 
+void __init sme_encrypt_kernel(void);
+void __init sme_enable(void);
+
 #else	/* !CONFIG_AMD_MEM_ENCRYPT */
 
 #define sme_me_mask	0UL
 
+static inline void __init sme_encrypt_kernel(void) { }
+static inline void __init sme_enable(void) { }
+
 #endif	/* CONFIG_AMD_MEM_ENCRYPT */
 
 #endif	/* __ASSEMBLY__ */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 46c3c73e7f43..1f0ddcc9675c 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -14,6 +14,7 @@
 #include <linux/start_kernel.h>
 #include <linux/io.h>
 #include <linux/memblock.h>
+#include <linux/mem_encrypt.h>
 
 #include <asm/processor.h>
 #include <asm/proto.h>
@@ -45,9 +46,10 @@ static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
 	return ptr - (void *)_text + (void *)physaddr;
 }
 
-void __head __startup_64(unsigned long physaddr)
+unsigned long __head __startup_64(unsigned long physaddr)
 {
 	unsigned long load_delta, *p;
+	unsigned long pgtable_flags;
 	pgdval_t *pgd;
 	p4dval_t *p4d;
 	pudval_t *pud;
@@ -68,6 +70,12 @@ void __head __startup_64(unsigned long physaddr)
 	if (load_delta & ~PMD_PAGE_MASK)
 		for (;;);
 
+	/* Activate Secure Memory Encryption (SME) if supported and enabled */
+	sme_enable();
+
+	/* Include the SME encryption mask in the fixup value */
+	load_delta += sme_get_me_mask();
+
 	/* Fixup the physical addresses in the page table */
 
 	pgd = fixup_pointer(&early_top_pgt, physaddr);
@@ -94,28 +102,30 @@ void __head __startup_64(unsigned long physaddr)
 
 	pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
 	pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
+	pgtable_flags = _KERNPG_TABLE + sme_get_me_mask();
 
 	if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
 		p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
 
 		i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
-		pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE;
-		pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE;
+		pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
+		pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;
 
 		i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D;
-		p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
-		p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
+		p4d[i + 0] = (pgdval_t)pud + pgtable_flags;
+		p4d[i + 1] = (pgdval_t)pud + pgtable_flags;
 	} else {
 		i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
-		pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
-		pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
+		pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
+		pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
 	}
 
 	i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD;
-	pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE;
-	pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE;
+	pud[i + 0] = (pudval_t)pmd + pgtable_flags;
+	pud[i + 1] = (pudval_t)pmd + pgtable_flags;
 
 	pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
+	pmd_entry += sme_get_me_mask();
 	pmd_entry +=  physaddr;
 
 	for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
@@ -136,9 +146,30 @@ void __head __startup_64(unsigned long physaddr)
 			pmd[i] += load_delta;
 	}
 
-	/* Fixup phys_base */
+	/*
+	 * Fixup phys_base - remove the memory encryption mask to obtain
+	 * the true physical address.
+	 */
 	p = fixup_pointer(&phys_base, physaddr);
-	*p += load_delta;
+	*p += load_delta - sme_get_me_mask();
+
+	/* Encrypt the kernel (if SME is active) */
+	sme_encrypt_kernel();
+
+	/*
+	 * Return the SME encryption mask (if SME is active) to be used as a
+	 * modifier for the initial pgdir entry programmed into CR3.
+	 */
+	return sme_get_me_mask();
+}
+
+unsigned long __startup_secondary_64(void)
+{
+	/*
+	 * Return the SME encryption mask (if SME is active) to be used as a
+	 * modifier for the initial pgdir entry programmed into CR3.
+	 */
+	return sme_get_me_mask();
 }
 
 /* Wipe all early page tables except for the kernel symbol map */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 6225550883df..ec5d5e90c8f1 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -73,12 +73,19 @@ startup_64:
 	/* Sanitize CPU configuration */
 	call verify_cpu
 
+	/*
+	 * Perform pagetable fixups. Additionally, if SME is active, encrypt
+	 * the kernel and retrieve the modifier (SME encryption mask if SME
+	 * is active) to be added to the initial pgdir entry that will be
+	 * programmed into CR3.
+	 */
 	leaq	_text(%rip), %rdi
 	pushq	%rsi
 	call	__startup_64
 	popq	%rsi
 
-	movq	$(early_top_pgt - __START_KERNEL_map), %rax
+	/* Form the CR3 value being sure to include the CR3 modifier */
+	addq	$(early_top_pgt - __START_KERNEL_map), %rax
 	jmp 1f
 ENTRY(secondary_startup_64)
 	/*
@@ -98,7 +105,16 @@ ENTRY(secondary_startup_64)
 	/* Sanitize CPU configuration */
 	call verify_cpu
 
-	movq	$(init_top_pgt - __START_KERNEL_map), %rax
+	/*
+	 * Retrieve the modifier (SME encryption mask if SME is active) to be
+	 * added to the initial pgdir entry that will be programmed into CR3.
+	 */
+	pushq	%rsi
+	call	__startup_secondary_64
+	popq	%rsi
+
+	/* Form the CR3 value being sure to include the CR3 modifier */
+	addq	$(init_top_pgt - __START_KERNEL_map), %rax
 1:
 
 	/* Enable PAE mode, PGE and LA57 */
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index b99d469c73e7..3ac6f99b095c 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -11,6 +11,7 @@
  */
 
 #include <linux/linkage.h>
+#include <linux/init.h>
 
 /*
  * Since SME related variables are set early in the boot process they must
@@ -19,3 +20,11 @@
  */
 unsigned long sme_me_mask __section(.data) = 0;
 EXPORT_SYMBOL_GPL(sme_me_mask);
+
+void __init sme_encrypt_kernel(void)
+{
+}
+
+void __init sme_enable(void)
+{
+}
diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h
index 59769f7287e4..570f4fcff13f 100644
--- a/include/linux/mem_encrypt.h
+++ b/include/linux/mem_encrypt.h
@@ -30,6 +30,11 @@ static inline bool sme_active(void)
 	return !!sme_me_mask;
 }
 
+static inline unsigned long sme_get_me_mask(void)
+{
+	return sme_me_mask;
+}
+
 #endif	/* __ASSEMBLY__ */
 
 #endif	/* __MEM_ENCRYPT_H__ */

From fd7e315988b784509ba3f1b42f539bd0b1fca9bb Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:06 -0500
Subject: [PATCH 020/149] x86/mm: Simplify p[g4um]d_page() macros
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Create a pgd_pfn() macro similar to the p[4um]d_pfn() macros and then
use the p[g4um]d_pfn() macros in the p[g4um]d_page() macros instead of
duplicating the code.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/e61eb533a6d0aac941db2723d8aa63ef6b882dee.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/pgtable.h | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 77037b6f1caa..b64ea527edfb 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -195,6 +195,11 @@ static inline unsigned long p4d_pfn(p4d_t p4d)
 	return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT;
 }
 
+static inline unsigned long pgd_pfn(pgd_t pgd)
+{
+	return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT;
+}
+
 static inline int p4d_large(p4d_t p4d)
 {
 	/* No 512 GiB pages yet */
@@ -704,8 +709,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
  * Currently stuck as a macro due to indirect forward reference to
  * linux/mmzone.h's __section_mem_map_addr() definition:
  */
-#define pmd_page(pmd)		\
-	pfn_to_page((pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT)
+#define pmd_page(pmd)	pfn_to_page(pmd_pfn(pmd))
 
 /*
  * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
@@ -773,8 +777,7 @@ static inline unsigned long pud_page_vaddr(pud_t pud)
  * Currently stuck as a macro due to indirect forward reference to
  * linux/mmzone.h's __section_mem_map_addr() definition:
  */
-#define pud_page(pud)		\
-	pfn_to_page((pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT)
+#define pud_page(pud)	pfn_to_page(pud_pfn(pud))
 
 /* Find an entry in the second-level page table.. */
 static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
@@ -824,8 +827,7 @@ static inline unsigned long p4d_page_vaddr(p4d_t p4d)
  * Currently stuck as a macro due to indirect forward reference to
  * linux/mmzone.h's __section_mem_map_addr() definition:
  */
-#define p4d_page(p4d)		\
-	pfn_to_page((p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT)
+#define p4d_page(p4d)	pfn_to_page(p4d_pfn(p4d))
 
 /* Find an entry in the third-level page table.. */
 static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
@@ -859,7 +861,7 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd)
  * Currently stuck as a macro due to indirect forward reference to
  * linux/mmzone.h's __section_mem_map_addr() definition:
  */
-#define pgd_page(pgd)		pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT)
+#define pgd_page(pgd)	pfn_to_page(pgd_pfn(pgd))
 
 /* to find an entry in a page-table-directory. */
 static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)

From 21729f81ce8ae76a6995681d40e16f7ce8075db4 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:07 -0500
Subject: [PATCH 021/149] x86/mm: Provide general kernel support for memory
 encryption
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Changes to the existing page table macros will allow the SME support to
be enabled in a simple fashion with minimal changes to files that use these
macros.  Since the memory encryption mask will now be part of the regular
pagetable macros, we introduce two new macros (_PAGE_TABLE_NOENC and
_KERNPG_TABLE_NOENC) to allow for early pagetable creation/initialization
without the encryption mask before SME becomes active.  Two new pgprot()
macros are defined to allow setting or clearing the page encryption mask.

The FIXMAP_PAGE_NOCACHE define is introduced for use with MMIO.  SME does
not support encryption for MMIO areas so this define removes the encryption
mask from the page attribute.

Two new macros are introduced (__sme_pa() / __sme_pa_nodebug()) to allow
creating a physical address with the encryption mask.  These are used when
working with the cr3 register so that the PGD can be encrypted. The current
__va() macro is updated so that the virtual address is generated based off
of the physical address without the encryption mask thus allowing the same
virtual address to be generated regardless of whether encryption is enabled
for that physical location or not.

Also, an early initialization function is added for SME.  If SME is active,
this function:

 - Updates the early_pmd_flags so that early page faults create mappings
   with the encryption mask.

 - Updates the __supported_pte_mask to include the encryption mask.

 - Updates the protection_map entries to include the encryption mask so
   that user-space allocations will automatically have the encryption mask
   applied.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/b36e952c4c39767ae7f0a41cf5345adf27438480.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/boot/compressed/pagetable.c |  7 +++++
 arch/x86/include/asm/fixmap.h        |  7 +++++
 arch/x86/include/asm/mem_encrypt.h   | 13 +++++++++
 arch/x86/include/asm/page_types.h    |  3 +-
 arch/x86/include/asm/pgtable.h       |  9 ++++++
 arch/x86/include/asm/pgtable_types.h | 43 ++++++++++++++++++----------
 arch/x86/include/asm/processor.h     |  3 +-
 arch/x86/kernel/espfix_64.c          |  2 +-
 arch/x86/kernel/head64.c             | 11 +++++--
 arch/x86/kernel/head_64.S            | 20 ++++++-------
 arch/x86/mm/kasan_init_64.c          |  4 +--
 arch/x86/mm/mem_encrypt.c            | 17 +++++++++++
 arch/x86/mm/pageattr.c               |  3 ++
 arch/x86/mm/tlb.c                    |  4 +--
 include/asm-generic/pgtable.h        | 12 ++++++++
 include/linux/mem_encrypt.h          |  8 ++++++
 16 files changed, 132 insertions(+), 34 deletions(-)

diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
index 28029be47fbb..f1aa43854bed 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -15,6 +15,13 @@
 #define __pa(x)  ((unsigned long)(x))
 #define __va(x)  ((void *)((unsigned long)(x)))
 
+/*
+ * The pgtable.h and mm/ident_map.c includes make use of the SME related
+ * information which is not used in the compressed image support. Un-define
+ * the SME support to avoid any compile and link errors.
+ */
+#undef CONFIG_AMD_MEM_ENCRYPT
+
 #include "misc.h"
 
 /* These actually do the work of building the kernel identity maps. */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index b65155cc3760..d9ff226cb489 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -157,6 +157,13 @@ static inline void __set_fixmap(enum fixed_addresses idx,
 }
 #endif
 
+/*
+ * FIXMAP_PAGE_NOCACHE is used for MMIO. Memory encryption is not
+ * supported for MMIO addresses, so make sure that the memory encryption
+ * mask is not part of the page attributes.
+ */
+#define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_IO_NOCACHE
+
 #include <asm-generic/fixmap.h>
 
 #define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags)
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 475e34f53793..dbae7a5a347d 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -21,6 +21,8 @@
 
 extern unsigned long sme_me_mask;
 
+void __init sme_early_init(void);
+
 void __init sme_encrypt_kernel(void);
 void __init sme_enable(void);
 
@@ -28,11 +30,22 @@ void __init sme_enable(void);
 
 #define sme_me_mask	0UL
 
+static inline void __init sme_early_init(void) { }
+
 static inline void __init sme_encrypt_kernel(void) { }
 static inline void __init sme_enable(void) { }
 
 #endif	/* CONFIG_AMD_MEM_ENCRYPT */
 
+/*
+ * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when
+ * writing to or comparing values from the cr3 register.  Having the
+ * encryption mask set in cr3 enables the PGD entry to be encrypted and
+ * avoid special case handling of PGD allocations.
+ */
+#define __sme_pa(x)		(__pa(x) | sme_me_mask)
+#define __sme_pa_nodebug(x)	(__pa_nodebug(x) | sme_me_mask)
+
 #endif	/* __ASSEMBLY__ */
 
 #endif	/* __X86_MEM_ENCRYPT_H__ */
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 7bd0099384ca..b98ed9d14630 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -3,6 +3,7 @@
 
 #include <linux/const.h>
 #include <linux/types.h>
+#include <linux/mem_encrypt.h>
 
 /* PAGE_SHIFT determines the page size */
 #define PAGE_SHIFT		12
@@ -15,7 +16,7 @@
 #define PUD_PAGE_SIZE		(_AC(1, UL) << PUD_SHIFT)
 #define PUD_PAGE_MASK		(~(PUD_PAGE_SIZE-1))
 
-#define __PHYSICAL_MASK		((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1))
+#define __PHYSICAL_MASK		((phys_addr_t)(__sme_clr((1ULL << __PHYSICAL_MASK_SHIFT) - 1)))
 #define __VIRTUAL_MASK		((1UL << __VIRTUAL_MASK_SHIFT) - 1)
 
 /* Cast *PAGE_MASK to a signed type so that it is sign-extended if
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index b64ea527edfb..c6452cb12c0b 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1,6 +1,7 @@
 #ifndef _ASM_X86_PGTABLE_H
 #define _ASM_X86_PGTABLE_H
 
+#include <linux/mem_encrypt.h>
 #include <asm/page.h>
 #include <asm/pgtable_types.h>
 
@@ -13,6 +14,12 @@
 		     cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS)))	\
 	 : (prot))
 
+/*
+ * Macros to add or remove encryption attribute
+ */
+#define pgprot_encrypted(prot)	__pgprot(__sme_set(pgprot_val(prot)))
+#define pgprot_decrypted(prot)	__pgprot(__sme_clr(pgprot_val(prot)))
+
 #ifndef __ASSEMBLY__
 #include <asm/x86_init.h>
 
@@ -38,6 +45,8 @@ extern struct list_head pgd_list;
 
 extern struct mm_struct *pgd_page_get_mm(struct page *page);
 
+extern pmdval_t early_pmd_flags;
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else  /* !CONFIG_PARAVIRT */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index bf9638e1ee42..de32ca32928a 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -2,6 +2,8 @@
 #define _ASM_X86_PGTABLE_DEFS_H
 
 #include <linux/const.h>
+#include <linux/mem_encrypt.h>
+
 #include <asm/page_types.h>
 
 #define FIRST_USER_ADDRESS	0UL
@@ -121,10 +123,10 @@
 
 #define _PAGE_PROTNONE	(_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
 
-#define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |	\
-			 _PAGE_ACCESSED | _PAGE_DIRTY)
-#define _KERNPG_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED |	\
-			 _PAGE_DIRTY)
+#define _PAGE_TABLE_NOENC	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\
+				 _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _KERNPG_TABLE_NOENC	(_PAGE_PRESENT | _PAGE_RW |		\
+				 _PAGE_ACCESSED | _PAGE_DIRTY)
 
 /*
  * Set of bits not changed in pte_modify.  The pte's
@@ -191,18 +193,29 @@ enum page_cache_mode {
 #define __PAGE_KERNEL_IO		(__PAGE_KERNEL)
 #define __PAGE_KERNEL_IO_NOCACHE	(__PAGE_KERNEL_NOCACHE)
 
-#define PAGE_KERNEL			__pgprot(__PAGE_KERNEL)
-#define PAGE_KERNEL_RO			__pgprot(__PAGE_KERNEL_RO)
-#define PAGE_KERNEL_EXEC		__pgprot(__PAGE_KERNEL_EXEC)
-#define PAGE_KERNEL_RX			__pgprot(__PAGE_KERNEL_RX)
-#define PAGE_KERNEL_NOCACHE		__pgprot(__PAGE_KERNEL_NOCACHE)
-#define PAGE_KERNEL_LARGE		__pgprot(__PAGE_KERNEL_LARGE)
-#define PAGE_KERNEL_LARGE_EXEC		__pgprot(__PAGE_KERNEL_LARGE_EXEC)
-#define PAGE_KERNEL_VSYSCALL		__pgprot(__PAGE_KERNEL_VSYSCALL)
-#define PAGE_KERNEL_VVAR		__pgprot(__PAGE_KERNEL_VVAR)
+#ifndef __ASSEMBLY__
 
-#define PAGE_KERNEL_IO			__pgprot(__PAGE_KERNEL_IO)
-#define PAGE_KERNEL_IO_NOCACHE		__pgprot(__PAGE_KERNEL_IO_NOCACHE)
+#define _PAGE_ENC	(_AT(pteval_t, sme_me_mask))
+
+#define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |	\
+			 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_ENC)
+#define _KERNPG_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED |	\
+			 _PAGE_DIRTY | _PAGE_ENC)
+
+#define PAGE_KERNEL		__pgprot(__PAGE_KERNEL | _PAGE_ENC)
+#define PAGE_KERNEL_RO		__pgprot(__PAGE_KERNEL_RO | _PAGE_ENC)
+#define PAGE_KERNEL_EXEC	__pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC)
+#define PAGE_KERNEL_RX		__pgprot(__PAGE_KERNEL_RX | _PAGE_ENC)
+#define PAGE_KERNEL_NOCACHE	__pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC)
+#define PAGE_KERNEL_LARGE	__pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC)
+#define PAGE_KERNEL_LARGE_EXEC	__pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC)
+#define PAGE_KERNEL_VSYSCALL	__pgprot(__PAGE_KERNEL_VSYSCALL | _PAGE_ENC)
+#define PAGE_KERNEL_VVAR	__pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC)
+
+#define PAGE_KERNEL_IO		__pgprot(__PAGE_KERNEL_IO)
+#define PAGE_KERNEL_IO_NOCACHE	__pgprot(__PAGE_KERNEL_IO_NOCACHE)
+
+#endif	/* __ASSEMBLY__ */
 
 /*         xwr */
 #define __P000	PAGE_NONE
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 6a79547e8ee0..a68f70c3debc 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -29,6 +29,7 @@ struct vm86;
 #include <linux/math64.h>
 #include <linux/err.h>
 #include <linux/irqflags.h>
+#include <linux/mem_encrypt.h>
 
 /*
  * We handle most unaligned accesses in hardware.  On the other hand
@@ -241,7 +242,7 @@ static inline unsigned long read_cr3_pa(void)
 
 static inline void load_cr3(pgd_t *pgdir)
 {
-	write_cr3(__pa(pgdir));
+	write_cr3(__sme_pa(pgdir));
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 6b91e2eb8d3f..9c4e7ba6870c 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -195,7 +195,7 @@ void init_espfix_ap(int cpu)
 
 	pte_p = pte_offset_kernel(&pmd, addr);
 	stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0));
-	pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
+	pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask));
 	for (n = 0; n < ESPFIX_PTE_CLONES; n++)
 		set_pte(&pte_p[n*PTE_STRIDE], pte);
 
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 1f0ddcc9675c..5cd0b72a0283 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -102,7 +102,7 @@ unsigned long __head __startup_64(unsigned long physaddr)
 
 	pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
 	pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
-	pgtable_flags = _KERNPG_TABLE + sme_get_me_mask();
+	pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
 
 	if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
 		p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
@@ -177,7 +177,7 @@ static void __init reset_early_page_tables(void)
 {
 	memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
 	next_early_pgt = 0;
-	write_cr3(__pa_nodebug(early_top_pgt));
+	write_cr3(__sme_pa_nodebug(early_top_pgt));
 }
 
 /* Create a new PMD entry */
@@ -310,6 +310,13 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
 
 	clear_page(init_top_pgt);
 
+	/*
+	 * SME support may update early_pmd_flags to include the memory
+	 * encryption mask, so it needs to be called before anything
+	 * that may generate a page fault.
+	 */
+	sme_early_init();
+
 	kasan_early_init();
 
 	for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index ec5d5e90c8f1..513cbb012ecc 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -351,9 +351,9 @@ GLOBAL(name)
 NEXT_PAGE(early_top_pgt)
 	.fill	511,8,0
 #ifdef CONFIG_X86_5LEVEL
-	.quad	level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+	.quad	level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 #else
-	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 #endif
 
 NEXT_PAGE(early_dynamic_pgts)
@@ -366,15 +366,15 @@ NEXT_PAGE(init_top_pgt)
 	.fill	512,8,0
 #else
 NEXT_PAGE(init_top_pgt)
-	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
 	.org    init_top_pgt + PGD_PAGE_OFFSET*8, 0
-	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
 	.org    init_top_pgt + PGD_START_KERNEL*8, 0
 	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
-	.quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+	.quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 
 NEXT_PAGE(level3_ident_pgt)
-	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
+	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
 	.fill	511, 8, 0
 NEXT_PAGE(level2_ident_pgt)
 	/* Since I easily can, map the first 1G.
@@ -386,14 +386,14 @@ NEXT_PAGE(level2_ident_pgt)
 #ifdef CONFIG_X86_5LEVEL
 NEXT_PAGE(level4_kernel_pgt)
 	.fill	511,8,0
-	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 #endif
 
 NEXT_PAGE(level3_kernel_pgt)
 	.fill	L3_START_KERNEL,8,0
 	/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
-	.quad	level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
-	.quad	level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+	.quad	level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+	.quad	level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 
 NEXT_PAGE(level2_kernel_pgt)
 	/*
@@ -411,7 +411,7 @@ NEXT_PAGE(level2_kernel_pgt)
 
 NEXT_PAGE(level2_fixmap_pgt)
 	.fill	506,8,0
-	.quad	level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+	.quad	level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 	/* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
 	.fill	5,8,0
 
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 02c9d7553409..39d4daf5e289 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -87,7 +87,7 @@ static struct notifier_block kasan_die_notifier = {
 void __init kasan_early_init(void)
 {
 	int i;
-	pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL;
+	pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL | _PAGE_ENC;
 	pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE;
 	pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE;
 	p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE;
@@ -153,7 +153,7 @@ void __init kasan_init(void)
 	 */
 	memset(kasan_zero_page, 0, PAGE_SIZE);
 	for (i = 0; i < PTRS_PER_PTE; i++) {
-		pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO);
+		pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO | _PAGE_ENC);
 		set_pte(&kasan_zero_pte[i], pte);
 	}
 	/* Flush TLBs again to be sure that write protection applied. */
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 3ac6f99b095c..f973d3dc3802 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -12,6 +12,7 @@
 
 #include <linux/linkage.h>
 #include <linux/init.h>
+#include <linux/mm.h>
 
 /*
  * Since SME related variables are set early in the boot process they must
@@ -21,6 +22,22 @@
 unsigned long sme_me_mask __section(.data) = 0;
 EXPORT_SYMBOL_GPL(sme_me_mask);
 
+void __init sme_early_init(void)
+{
+	unsigned int i;
+
+	if (!sme_me_mask)
+		return;
+
+	early_pmd_flags = __sme_set(early_pmd_flags);
+
+	__supported_pte_mask = __sme_set(__supported_pte_mask);
+
+	/* Update the protection map with memory encryption mask */
+	for (i = 0; i < ARRAY_SIZE(protection_map); i++)
+		protection_map[i] = pgprot_encrypted(protection_map[i]);
+}
+
 void __init sme_encrypt_kernel(void)
 {
 }
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 757b0bcdf712..7e2d6c0a64c4 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -2020,6 +2020,9 @@ int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
 	if (!(page_flags & _PAGE_RW))
 		cpa.mask_clr = __pgprot(_PAGE_RW);
 
+	if (!(page_flags & _PAGE_ENC))
+		cpa.mask_clr = pgprot_encrypted(cpa.mask_clr);
+
 	cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
 
 	retval = __change_page_attr_set_clr(&cpa, 0);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 2c1b8881e9d3..593d2f76a54c 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -115,7 +115,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			 */
 			this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen,
 				       next_tlb_gen);
-			write_cr3(__pa(next->pgd));
+			write_cr3(__sme_pa(next->pgd));
 			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
 					TLB_FLUSH_ALL);
 		}
@@ -157,7 +157,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id);
 		this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen);
 		this_cpu_write(cpu_tlbstate.loaded_mm, next);
-		write_cr3(__pa(next->pgd));
+		write_cr3(__sme_pa(next->pgd));
 
 		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 	}
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 7dfa767dc680..4d7bb98f4134 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -582,6 +582,18 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm,
 #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
 #endif /* CONFIG_MMU */
 
+/*
+ * No-op macros that just return the current protection value. Defined here
+ * because these macros can be used used even if CONFIG_MMU is not defined.
+ */
+#ifndef pgprot_encrypted
+#define pgprot_encrypted(prot)	(prot)
+#endif
+
+#ifndef pgprot_decrypted
+#define pgprot_decrypted(prot)	(prot)
+#endif
+
 /*
  * A facility to provide lazy MMU batching.  This allows PTE updates and
  * page invalidations to be delayed until a call to leave lazy MMU mode
diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h
index 570f4fcff13f..1255f09f5e42 100644
--- a/include/linux/mem_encrypt.h
+++ b/include/linux/mem_encrypt.h
@@ -35,6 +35,14 @@ static inline unsigned long sme_get_me_mask(void)
 	return sme_me_mask;
 }
 
+/*
+ * The __sme_set() and __sme_clr() macros are useful for adding or removing
+ * the encryption mask from a value (e.g. when dealing with pagetable
+ * entries).
+ */
+#define __sme_set(x)		((unsigned long)(x) | sme_me_mask)
+#define __sme_clr(x)		((unsigned long)(x) & ~sme_me_mask)
+
 #endif	/* __ASSEMBLY__ */
 
 #endif	/* __MEM_ENCRYPT_H__ */

From eef9c4abe77f55b1600f59d8ac5f1d953e2f5384 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:08 -0500
Subject: [PATCH 022/149] x86/mm: Add SME support for read_cr3_pa()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The CR3 register entry can contain the SME encryption mask that indicates
the PGD is encrypted.  The encryption mask should not be used when
creating a virtual address from the CR3 register, so remove the SME
encryption mask in the read_cr3_pa() function.

During early boot SME will need to use a native version of read_cr3_pa(),
so create native_read_cr3_pa().

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/767b085c384a46f67f451f8589903a462c7ff68a.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/processor-flags.h | 5 +++--
 arch/x86/include/asm/processor.h       | 5 +++++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 79aa2f98398d..f5d3e50af98c 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -2,6 +2,7 @@
 #define _ASM_X86_PROCESSOR_FLAGS_H
 
 #include <uapi/asm/processor-flags.h>
+#include <linux/mem_encrypt.h>
 
 #ifdef CONFIG_VM86
 #define X86_VM_MASK	X86_EFLAGS_VM
@@ -32,8 +33,8 @@
  * CR3_ADDR_MASK is the mask used by read_cr3_pa().
  */
 #ifdef CONFIG_X86_64
-/* Mask off the address space ID bits. */
-#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
+/* Mask off the address space ID and SME encryption bits. */
+#define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull)
 #define CR3_PCID_MASK 0xFFFull
 #else
 /*
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a68f70c3debc..973709d2938f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -240,6 +240,11 @@ static inline unsigned long read_cr3_pa(void)
 	return __read_cr3() & CR3_ADDR_MASK;
 }
 
+static inline unsigned long native_read_cr3_pa(void)
+{
+	return __native_read_cr3() & CR3_ADDR_MASK;
+}
+
 static inline void load_cr3(pgd_t *pgdir)
 {
 	write_cr3(__sme_pa(pgdir));

From f88a68facd9a15b94f8c195d9d2c0b30c76c595a Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:09 -0500
Subject: [PATCH 023/149] x86/mm: Extend early_memremap() support with
 additional attrs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add early_memremap() support to be able to specify encrypted and
decrypted mappings with and without write-protection. The use of
write-protection is necessary when encrypting data "in place". The
write-protect attribute is considered cacheable for loads, but not
stores. This implies that the hardware will never give the core a
dirty line with this memtype.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/479b5832c30fae3efa7932e48f81794e86397229.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig                     |  4 +++
 arch/x86/include/asm/fixmap.h        | 13 ++++++++
 arch/x86/include/asm/pgtable_types.h |  8 +++++
 arch/x86/mm/ioremap.c                | 44 ++++++++++++++++++++++++++++
 include/asm-generic/early_ioremap.h  |  2 ++
 mm/early_ioremap.c                   | 10 +++++++
 6 files changed, 81 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ba7b93d08d00..8328bcb9ce8b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1440,6 +1440,10 @@ config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT
 	  If set to N, then the encryption of system memory can be
 	  activated with the mem_encrypt=on command line option.
 
+config ARCH_USE_MEMREMAP_PROT
+	def_bool y
+	depends on AMD_MEM_ENCRYPT
+
 # Common NUMA Features
 config NUMA
 	bool "Numa Memory Allocation and Scheduler Support"
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index d9ff226cb489..dcd9fb55e679 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -164,6 +164,19 @@ static inline void __set_fixmap(enum fixed_addresses idx,
  */
 #define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_IO_NOCACHE
 
+/*
+ * Early memremap routines used for in-place encryption. The mappings created
+ * by these routines are intended to be used as temporary mappings.
+ */
+void __init *early_memremap_encrypted(resource_size_t phys_addr,
+				      unsigned long size);
+void __init *early_memremap_encrypted_wp(resource_size_t phys_addr,
+					 unsigned long size);
+void __init *early_memremap_decrypted(resource_size_t phys_addr,
+				      unsigned long size);
+void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
+					 unsigned long size);
+
 #include <asm-generic/fixmap.h>
 
 #define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index de32ca32928a..32095af0fefb 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -161,6 +161,7 @@ enum page_cache_mode {
 
 #define _PAGE_CACHE_MASK	(_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)
 #define _PAGE_NOCACHE		(cachemode2protval(_PAGE_CACHE_MODE_UC))
+#define _PAGE_CACHE_WP		(cachemode2protval(_PAGE_CACHE_MODE_WP))
 
 #define PAGE_NONE	__pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
 #define PAGE_SHARED	__pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
@@ -189,6 +190,7 @@ enum page_cache_mode {
 #define __PAGE_KERNEL_VVAR		(__PAGE_KERNEL_RO | _PAGE_USER)
 #define __PAGE_KERNEL_LARGE		(__PAGE_KERNEL | _PAGE_PSE)
 #define __PAGE_KERNEL_LARGE_EXEC	(__PAGE_KERNEL_EXEC | _PAGE_PSE)
+#define __PAGE_KERNEL_WP		(__PAGE_KERNEL | _PAGE_CACHE_WP)
 
 #define __PAGE_KERNEL_IO		(__PAGE_KERNEL)
 #define __PAGE_KERNEL_IO_NOCACHE	(__PAGE_KERNEL_NOCACHE)
@@ -202,6 +204,12 @@ enum page_cache_mode {
 #define _KERNPG_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED |	\
 			 _PAGE_DIRTY | _PAGE_ENC)
 
+#define __PAGE_KERNEL_ENC	(__PAGE_KERNEL | _PAGE_ENC)
+#define __PAGE_KERNEL_ENC_WP	(__PAGE_KERNEL_WP | _PAGE_ENC)
+
+#define __PAGE_KERNEL_NOENC	(__PAGE_KERNEL)
+#define __PAGE_KERNEL_NOENC_WP	(__PAGE_KERNEL_WP)
+
 #define PAGE_KERNEL		__pgprot(__PAGE_KERNEL | _PAGE_ENC)
 #define PAGE_KERNEL_RO		__pgprot(__PAGE_KERNEL_RO | _PAGE_ENC)
 #define PAGE_KERNEL_EXEC	__pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC)
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 66ddf5e8ffc8..570201bbf442 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -417,6 +417,50 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
 	iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
 }
 
+#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT
+/* Remap memory with encryption */
+void __init *early_memremap_encrypted(resource_size_t phys_addr,
+				      unsigned long size)
+{
+	return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC);
+}
+
+/*
+ * Remap memory with encryption and write-protected - cannot be called
+ * before pat_init() is called
+ */
+void __init *early_memremap_encrypted_wp(resource_size_t phys_addr,
+					 unsigned long size)
+{
+	/* Be sure the write-protect PAT entry is set for write-protect */
+	if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
+		return NULL;
+
+	return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP);
+}
+
+/* Remap memory without encryption */
+void __init *early_memremap_decrypted(resource_size_t phys_addr,
+				      unsigned long size)
+{
+	return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC);
+}
+
+/*
+ * Remap memory without encryption and write-protected - cannot be called
+ * before pat_init() is called
+ */
+void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
+					 unsigned long size)
+{
+	/* Be sure the write-protect PAT entry is set for write-protect */
+	if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
+		return NULL;
+
+	return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP);
+}
+#endif	/* CONFIG_ARCH_USE_MEMREMAP_PROT */
+
 static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
 
 static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
diff --git a/include/asm-generic/early_ioremap.h b/include/asm-generic/early_ioremap.h
index 734ad4db388c..2edef8d7fa6b 100644
--- a/include/asm-generic/early_ioremap.h
+++ b/include/asm-generic/early_ioremap.h
@@ -13,6 +13,8 @@ extern void *early_memremap(resource_size_t phys_addr,
 			    unsigned long size);
 extern void *early_memremap_ro(resource_size_t phys_addr,
 			       unsigned long size);
+extern void *early_memremap_prot(resource_size_t phys_addr,
+				 unsigned long size, unsigned long prot_val);
 extern void early_iounmap(void __iomem *addr, unsigned long size);
 extern void early_memunmap(void *addr, unsigned long size);
 
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index 6d5717bd7197..d7d30da754ba 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -226,6 +226,16 @@ early_memremap_ro(resource_size_t phys_addr, unsigned long size)
 }
 #endif
 
+#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT
+void __init *
+early_memremap_prot(resource_size_t phys_addr, unsigned long size,
+		    unsigned long prot_val)
+{
+	return (__force void *)__early_ioremap(phys_addr, size,
+					       __pgprot(prot_val));
+}
+#endif
+
 #define MAX_MAP_CHUNK	(NR_FIX_BTMAPS << PAGE_SHIFT)
 
 void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)

From 7f8b7e7f4ccbbd1fb8badddfabd28c955aea87b4 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:10 -0500
Subject: [PATCH 024/149] x86/mm: Add support for early encryption/decryption
 of memory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support to be able to either encrypt or decrypt data in place during
the early stages of booting the kernel. This does not change the memory
encryption attribute - it is used for ensuring that data present in either
an encrypted or decrypted memory area is in the proper state (for example
the initrd will have been loaded by the boot loader and will not be
encrypted, but the memory that it resides in is marked as encrypted).

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/f9968e9432cd6c4b57ef245729be04ff18852225.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mem_encrypt.h | 10 ++++
 arch/x86/mm/mem_encrypt.c          | 76 ++++++++++++++++++++++++++++++
 2 files changed, 86 insertions(+)

diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index dbae7a5a347d..8baa35ba2316 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -21,6 +21,11 @@
 
 extern unsigned long sme_me_mask;
 
+void __init sme_early_encrypt(resource_size_t paddr,
+			      unsigned long size);
+void __init sme_early_decrypt(resource_size_t paddr,
+			      unsigned long size);
+
 void __init sme_early_init(void);
 
 void __init sme_encrypt_kernel(void);
@@ -30,6 +35,11 @@ void __init sme_enable(void);
 
 #define sme_me_mask	0UL
 
+static inline void __init sme_early_encrypt(resource_size_t paddr,
+					    unsigned long size) { }
+static inline void __init sme_early_decrypt(resource_size_t paddr,
+					    unsigned long size) { }
+
 static inline void __init sme_early_init(void) { }
 
 static inline void __init sme_encrypt_kernel(void) { }
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index f973d3dc3802..54bb73c3dd9d 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -14,6 +14,9 @@
 #include <linux/init.h>
 #include <linux/mm.h>
 
+#include <asm/tlbflush.h>
+#include <asm/fixmap.h>
+
 /*
  * Since SME related variables are set early in the boot process they must
  * reside in the .data section so as not to be zeroed out when the .bss
@@ -22,6 +25,79 @@
 unsigned long sme_me_mask __section(.data) = 0;
 EXPORT_SYMBOL_GPL(sme_me_mask);
 
+/* Buffer used for early in-place encryption by BSP, no locking needed */
+static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE);
+
+/*
+ * This routine does not change the underlying encryption setting of the
+ * page(s) that map this memory. It assumes that eventually the memory is
+ * meant to be accessed as either encrypted or decrypted but the contents
+ * are currently not in the desired state.
+ *
+ * This routine follows the steps outlined in the AMD64 Architecture
+ * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place.
+ */
+static void __init __sme_early_enc_dec(resource_size_t paddr,
+				       unsigned long size, bool enc)
+{
+	void *src, *dst;
+	size_t len;
+
+	if (!sme_me_mask)
+		return;
+
+	local_flush_tlb();
+	wbinvd();
+
+	/*
+	 * There are limited number of early mapping slots, so map (at most)
+	 * one page at time.
+	 */
+	while (size) {
+		len = min_t(size_t, sizeof(sme_early_buffer), size);
+
+		/*
+		 * Create mappings for the current and desired format of
+		 * the memory. Use a write-protected mapping for the source.
+		 */
+		src = enc ? early_memremap_decrypted_wp(paddr, len) :
+			    early_memremap_encrypted_wp(paddr, len);
+
+		dst = enc ? early_memremap_encrypted(paddr, len) :
+			    early_memremap_decrypted(paddr, len);
+
+		/*
+		 * If a mapping can't be obtained to perform the operation,
+		 * then eventual access of that area in the desired mode
+		 * will cause a crash.
+		 */
+		BUG_ON(!src || !dst);
+
+		/*
+		 * Use a temporary buffer, of cache-line multiple size, to
+		 * avoid data corruption as documented in the APM.
+		 */
+		memcpy(sme_early_buffer, src, len);
+		memcpy(dst, sme_early_buffer, len);
+
+		early_memunmap(dst, len);
+		early_memunmap(src, len);
+
+		paddr += len;
+		size -= len;
+	}
+}
+
+void __init sme_early_encrypt(resource_size_t paddr, unsigned long size)
+{
+	__sme_early_enc_dec(paddr, size, true);
+}
+
+void __init sme_early_decrypt(resource_size_t paddr, unsigned long size)
+{
+	__sme_early_enc_dec(paddr, size, false);
+}
+
 void __init sme_early_init(void)
 {
 	unsigned int i;

From b9d05200bc12444c7778a49c9694d8382ed06aa8 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:11 -0500
Subject: [PATCH 025/149] x86/mm: Insure that boot memory areas are mapped
 properly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The boot data and command line data are present in memory in a decrypted
state and are copied early in the boot process.  The early page fault
support will map these areas as encrypted, so before attempting to copy
them, add decrypted mappings so the data is accessed properly when copied.

For the initrd, encrypt this data in place. Since the future mapping of
the initrd area will be mapped as encrypted the data will be accessed
properly.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/bb0d430b41efefd45ee515aaf0979dcfda8b6a44.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mem_encrypt.h |  6 +++
 arch/x86/include/asm/pgtable.h     |  3 ++
 arch/x86/kernel/head64.c           | 30 ++++++++++++--
 arch/x86/kernel/setup.c            |  9 +++++
 arch/x86/mm/kasan_init_64.c        |  2 +-
 arch/x86/mm/mem_encrypt.c          | 63 ++++++++++++++++++++++++++++++
 6 files changed, 108 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 8baa35ba2316..ab1fe77c2f73 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -26,6 +26,9 @@ void __init sme_early_encrypt(resource_size_t paddr,
 void __init sme_early_decrypt(resource_size_t paddr,
 			      unsigned long size);
 
+void __init sme_map_bootdata(char *real_mode_data);
+void __init sme_unmap_bootdata(char *real_mode_data);
+
 void __init sme_early_init(void);
 
 void __init sme_encrypt_kernel(void);
@@ -40,6 +43,9 @@ static inline void __init sme_early_encrypt(resource_size_t paddr,
 static inline void __init sme_early_decrypt(resource_size_t paddr,
 					    unsigned long size) { }
 
+static inline void __init sme_map_bootdata(char *real_mode_data) { }
+static inline void __init sme_unmap_bootdata(char *real_mode_data) { }
+
 static inline void __init sme_early_init(void) { }
 
 static inline void __init sme_encrypt_kernel(void) { }
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index c6452cb12c0b..bbeae4a2bd01 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -23,6 +23,9 @@
 #ifndef __ASSEMBLY__
 #include <asm/x86_init.h>
 
+extern pgd_t early_top_pgt[PTRS_PER_PGD];
+int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
+
 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
 void ptdump_walk_pgd_level_checkwx(void);
 
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 5cd0b72a0283..0cdb53bf4c4b 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -34,7 +34,6 @@
 /*
  * Manage page tables very early on.
  */
-extern pgd_t early_top_pgt[PTRS_PER_PGD];
 extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
 static unsigned int __initdata next_early_pgt;
 pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
@@ -181,13 +180,13 @@ static void __init reset_early_page_tables(void)
 }
 
 /* Create a new PMD entry */
-int __init early_make_pgtable(unsigned long address)
+int __init __early_make_pgtable(unsigned long address, pmdval_t pmd)
 {
 	unsigned long physaddr = address - __PAGE_OFFSET;
 	pgdval_t pgd, *pgd_p;
 	p4dval_t p4d, *p4d_p;
 	pudval_t pud, *pud_p;
-	pmdval_t pmd, *pmd_p;
+	pmdval_t *pmd_p;
 
 	/* Invalid address or early pgt is done ?  */
 	if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
@@ -246,12 +245,21 @@ int __init early_make_pgtable(unsigned long address)
 		memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
 		*pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
 	}
-	pmd = (physaddr & PMD_MASK) + early_pmd_flags;
 	pmd_p[pmd_index(address)] = pmd;
 
 	return 0;
 }
 
+int __init early_make_pgtable(unsigned long address)
+{
+	unsigned long physaddr = address - __PAGE_OFFSET;
+	pmdval_t pmd;
+
+	pmd = (physaddr & PMD_MASK) + early_pmd_flags;
+
+	return __early_make_pgtable(address, pmd);
+}
+
 /* Don't add a printk in there. printk relies on the PDA which is not initialized 
    yet. */
 static void __init clear_bss(void)
@@ -274,6 +282,12 @@ static void __init copy_bootdata(char *real_mode_data)
 	char * command_line;
 	unsigned long cmd_line_ptr;
 
+	/*
+	 * If SME is active, this will create decrypted mappings of the
+	 * boot data in advance of the copy operations.
+	 */
+	sme_map_bootdata(real_mode_data);
+
 	memcpy(&boot_params, real_mode_data, sizeof boot_params);
 	sanitize_boot_params(&boot_params);
 	cmd_line_ptr = get_cmd_line_ptr();
@@ -281,6 +295,14 @@ static void __init copy_bootdata(char *real_mode_data)
 		command_line = __va(cmd_line_ptr);
 		memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
 	}
+
+	/*
+	 * The old boot data is no longer needed and won't be reserved,
+	 * freeing up that memory for use by the system. If SME is active,
+	 * we need to remove the mappings that were created so that the
+	 * memory doesn't remain mapped as decrypted.
+	 */
+	sme_unmap_bootdata(real_mode_data);
 }
 
 asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 3486d0498800..0bfe0c1628f6 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -69,6 +69,7 @@
 #include <linux/crash_dump.h>
 #include <linux/tboot.h>
 #include <linux/jiffies.h>
+#include <linux/mem_encrypt.h>
 
 #include <linux/usb/xhci-dbgp.h>
 #include <video/edid.h>
@@ -374,6 +375,14 @@ static void __init reserve_initrd(void)
 	    !ramdisk_image || !ramdisk_size)
 		return;		/* No initrd provided by bootloader */
 
+	/*
+	 * If SME is active, this memory will be marked encrypted by the
+	 * kernel when it is accessed (including relocation). However, the
+	 * ramdisk image was loaded decrypted by the bootloader, so make
+	 * sure that it is encrypted before accessing it.
+	 */
+	sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image);
+
 	initrd_start = 0;
 
 	mapped_size = memblock_mem_size(max_pfn_mapped);
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 39d4daf5e289..bc84b73684b7 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -11,8 +11,8 @@
 #include <asm/e820/types.h>
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
+#include <asm/pgtable.h>
 
-extern pgd_t early_top_pgt[PTRS_PER_PGD];
 extern struct range pfn_mapped[E820_MAX_ENTRIES];
 
 static int __init map_range(struct range *range)
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 54bb73c3dd9d..0843d023da4a 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -16,6 +16,8 @@
 
 #include <asm/tlbflush.h>
 #include <asm/fixmap.h>
+#include <asm/setup.h>
+#include <asm/bootparam.h>
 
 /*
  * Since SME related variables are set early in the boot process they must
@@ -98,6 +100,67 @@ void __init sme_early_decrypt(resource_size_t paddr, unsigned long size)
 	__sme_early_enc_dec(paddr, size, false);
 }
 
+static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size,
+					     bool map)
+{
+	unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET;
+	pmdval_t pmd_flags, pmd;
+
+	/* Use early_pmd_flags but remove the encryption mask */
+	pmd_flags = __sme_clr(early_pmd_flags);
+
+	do {
+		pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0;
+		__early_make_pgtable((unsigned long)vaddr, pmd);
+
+		vaddr += PMD_SIZE;
+		paddr += PMD_SIZE;
+		size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE;
+	} while (size);
+
+	__native_flush_tlb();
+}
+
+void __init sme_unmap_bootdata(char *real_mode_data)
+{
+	struct boot_params *boot_data;
+	unsigned long cmdline_paddr;
+
+	if (!sme_active())
+		return;
+
+	/* Get the command line address before unmapping the real_mode_data */
+	boot_data = (struct boot_params *)real_mode_data;
+	cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
+
+	__sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false);
+
+	if (!cmdline_paddr)
+		return;
+
+	__sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false);
+}
+
+void __init sme_map_bootdata(char *real_mode_data)
+{
+	struct boot_params *boot_data;
+	unsigned long cmdline_paddr;
+
+	if (!sme_active())
+		return;
+
+	__sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true);
+
+	/* Get the command line address after mapping the real_mode_data */
+	boot_data = (struct boot_params *)real_mode_data;
+	cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
+
+	if (!cmdline_paddr)
+		return;
+
+	__sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
+}
+
 void __init sme_early_init(void)
 {
 	unsigned int i;

From d68baa3fa6e4d703fd0c7954ee5c739789e7242f Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:12 -0500
Subject: [PATCH 026/149] x86/boot/e820: Add support to determine the E820 type
 of an address
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a function that will return the E820 type associated with an address
range.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/b797aaa588803bf33263d5dd8c32377668fa931a.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/e820/api.h |  2 ++
 arch/x86/kernel/e820.c          | 26 +++++++++++++++++++++++---
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
index a504adc661a4..cd266d830e49 100644
--- a/arch/x86/include/asm/e820/api.h
+++ b/arch/x86/include/asm/e820/api.h
@@ -39,6 +39,8 @@ extern void e820__setup_pci_gap(void);
 extern void e820__reallocate_tables(void);
 extern void e820__register_nosave_regions(unsigned long limit_pfn);
 
+extern int  e820__get_entry_type(u64 start, u64 end);
+
 /*
  * Returns true iff the specified range [start,end) is completely contained inside
  * the ISA region.
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 532da61d605c..71c11ad5643e 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -96,7 +96,8 @@ EXPORT_SYMBOL_GPL(e820__mapped_any);
  * Note: this function only works correctly once the E820 table is sorted and
  * not-overlapping (at least for the range specified), which is the case normally.
  */
-bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
+static struct e820_entry *__e820__mapped_all(u64 start, u64 end,
+					     enum e820_type type)
 {
 	int i;
 
@@ -122,9 +123,28 @@ bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
 		 * coverage of the desired range exists:
 		 */
 		if (start >= end)
-			return 1;
+			return entry;
 	}
-	return 0;
+
+	return NULL;
+}
+
+/*
+ * This function checks if the entire range <start,end> is mapped with type.
+ */
+bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type)
+{
+	return __e820__mapped_all(start, end, type);
+}
+
+/*
+ * This function returns the type associated with the range <start,end>.
+ */
+int e820__get_entry_type(u64 start, u64 end)
+{
+	struct e820_entry *entry = __e820__mapped_all(start, end, 0);
+
+	return entry ? entry->type : -EINVAL;
 }
 
 /*

From a19d66c56af1c52b8b463bf94d21116ae8c1aa5a Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:13 -0500
Subject: [PATCH 027/149] efi: Add an EFI table address match function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a function that will determine if a supplied physical address matches
the address of an EFI table.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Matt Fleming <matt@codeblueprint.co.uk>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/e1e06441d80f44776df391e0e4cb485b345b7518.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/firmware/efi/efi.c | 33 +++++++++++++++++++++++++++++++++
 include/linux/efi.h        |  7 +++++++
 2 files changed, 40 insertions(+)

diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 045d6d311bde..69d4d130e055 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -55,6 +55,25 @@ struct efi __read_mostly efi = {
 };
 EXPORT_SYMBOL(efi);
 
+static unsigned long *efi_tables[] = {
+	&efi.mps,
+	&efi.acpi,
+	&efi.acpi20,
+	&efi.smbios,
+	&efi.smbios3,
+	&efi.sal_systab,
+	&efi.boot_info,
+	&efi.hcdp,
+	&efi.uga,
+	&efi.uv_systab,
+	&efi.fw_vendor,
+	&efi.runtime,
+	&efi.config_table,
+	&efi.esrt,
+	&efi.properties_table,
+	&efi.mem_attr_table,
+};
+
 static bool disable_runtime;
 static int __init setup_noefi(char *arg)
 {
@@ -855,6 +874,20 @@ int efi_status_to_err(efi_status_t status)
 	return err;
 }
 
+bool efi_is_table_address(unsigned long phys_addr)
+{
+	unsigned int i;
+
+	if (phys_addr == EFI_INVALID_TABLE_ADDR)
+		return false;
+
+	for (i = 0; i < ARRAY_SIZE(efi_tables); i++)
+		if (*(efi_tables[i]) == phys_addr)
+			return true;
+
+	return false;
+}
+
 #ifdef CONFIG_KEXEC
 static int update_efi_random_seed(struct notifier_block *nb,
 				  unsigned long code, void *unused)
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 8269bcb8ccf7..8e24f099bd3f 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1091,6 +1091,8 @@ static inline bool efi_enabled(int feature)
 	return test_bit(feature, &efi.flags) != 0;
 }
 extern void efi_reboot(enum reboot_mode reboot_mode, const char *__unused);
+
+extern bool efi_is_table_address(unsigned long phys_addr);
 #else
 static inline bool efi_enabled(int feature)
 {
@@ -1104,6 +1106,11 @@ efi_capsule_pending(int *reset_type)
 {
 	return false;
 }
+
+static inline bool efi_is_table_address(unsigned long phys_addr)
+{
+	return false;
+}
 #endif
 
 extern int efi_status_to_err(efi_status_t status);

From f99afd08a45fbbd9ce35a7624ffd1d850a1906c0 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:14 -0500
Subject: [PATCH 028/149] efi: Update efi_mem_type() to return an error rather
 than 0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The efi_mem_type() function currently returns a 0, which maps to
EFI_RESERVED_TYPE, if the function is unable to find a memmap entry for
the supplied physical address. Returning EFI_RESERVED_TYPE implies that
a memmap entry exists, when it doesn't.  Instead of returning 0, change
the function to return a negative error value when no memmap entry is
found.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Matt Fleming <matt@codeblueprint.co.uk>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/7fbf40a9dc414d5da849e1ddcd7f7c1285e4e181.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/ia64/kernel/efi.c      | 4 ++--
 arch/x86/platform/efi/efi.c | 6 +++---
 include/linux/efi.h         | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index 121295637d0d..81416000c5e0 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -757,14 +757,14 @@ efi_memmap_intersects (unsigned long phys_addr, unsigned long size)
 	return 0;
 }
 
-u32
+int
 efi_mem_type (unsigned long phys_addr)
 {
 	efi_memory_desc_t *md = efi_memory_descriptor(phys_addr);
 
 	if (md)
 		return md->type;
-	return 0;
+	return -EINVAL;
 }
 
 u64
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index f084d8718ac4..6217b23e85f6 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -1035,12 +1035,12 @@ void __init efi_enter_virtual_mode(void)
 /*
  * Convenience functions to obtain memory types and attributes
  */
-u32 efi_mem_type(unsigned long phys_addr)
+int efi_mem_type(unsigned long phys_addr)
 {
 	efi_memory_desc_t *md;
 
 	if (!efi_enabled(EFI_MEMMAP))
-		return 0;
+		return -ENOTSUPP;
 
 	for_each_efi_memory_desc(md) {
 		if ((md->phys_addr <= phys_addr) &&
@@ -1048,7 +1048,7 @@ u32 efi_mem_type(unsigned long phys_addr)
 				  (md->num_pages << EFI_PAGE_SHIFT))))
 			return md->type;
 	}
-	return 0;
+	return -EINVAL;
 }
 
 static int __init arch_parse_efi_cmdline(char *str)
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 8e24f099bd3f..4e47f78430be 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -985,7 +985,7 @@ static inline void efi_esrt_init(void) { }
 extern int efi_config_parse_tables(void *config_tables, int count, int sz,
 				   efi_config_table_type_t *arch_tables);
 extern u64 efi_get_iobase (void);
-extern u32 efi_mem_type (unsigned long phys_addr);
+extern int efi_mem_type(unsigned long phys_addr);
 extern u64 efi_mem_attributes (unsigned long phys_addr);
 extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size);
 extern int __init efi_uart_console_only (void);

From 38eecccdf488e38ee93690cfe9ec1914b73f512f Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:15 -0500
Subject: [PATCH 029/149] x86/efi: Update EFI pagetable creation to work with
 SME
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When SME is active, pagetable entries created for EFI need to have the
encryption mask set as necessary.

When the new pagetable pages are allocated they are mapped encrypted. So,
update the efi_pgt value that will be used in CR3 to include the encryption
mask so that the PGD table can be read successfully. The pagetable mapping
as well as the kernel are also added to the pagetable mapping as encrypted.
All other EFI mappings are mapped decrypted (tables, etc.).

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Matt Fleming <matt@codeblueprint.co.uk>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/9a8f4c502db4a84b09e2f0a1555bb75aa8b69785.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/platform/efi/efi_64.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 9bf72f5bfedb..12e83888e5b9 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -327,7 +327,7 @@ virt_to_phys_or_null_size(void *va, unsigned long size)
 
 int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
 {
-	unsigned long pfn, text;
+	unsigned long pfn, text, pf;
 	struct page *page;
 	unsigned npages;
 	pgd_t *pgd;
@@ -335,7 +335,12 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
 	if (efi_enabled(EFI_OLD_MEMMAP))
 		return 0;
 
-	efi_scratch.efi_pgt = (pgd_t *)__pa(efi_pgd);
+	/*
+	 * Since the PGD is encrypted, set the encryption mask so that when
+	 * this value is loaded into cr3 the PGD will be decrypted during
+	 * the pagetable walk.
+	 */
+	efi_scratch.efi_pgt = (pgd_t *)__sme_pa(efi_pgd);
 	pgd = efi_pgd;
 
 	/*
@@ -345,7 +350,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
 	 * phys_efi_set_virtual_address_map().
 	 */
 	pfn = pa_memmap >> PAGE_SHIFT;
-	if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, _PAGE_NX | _PAGE_RW)) {
+	pf = _PAGE_NX | _PAGE_RW | _PAGE_ENC;
+	if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, pf)) {
 		pr_err("Error ident-mapping new memmap (0x%lx)!\n", pa_memmap);
 		return 1;
 	}
@@ -388,7 +394,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
 	text = __pa(_text);
 	pfn = text >> PAGE_SHIFT;
 
-	if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, _PAGE_RW)) {
+	pf = _PAGE_RW | _PAGE_ENC;
+	if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, pf)) {
 		pr_err("Failed to map kernel text 1:1\n");
 		return 1;
 	}

From 8f716c9b5febf6ed0f5fedb7c9407cd0c25b2796 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:16 -0500
Subject: [PATCH 030/149] x86/mm: Add support to access boot related data in
 the clear
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Boot data (such as EFI related data) is not encrypted when the system is
booted because UEFI/BIOS does not run with SME active. In order to access
this data properly it needs to be mapped decrypted.

Update early_memremap() to provide an arch specific routine to modify the
pagetable protection attributes before they are applied to the new
mapping. This is used to remove the encryption mask for boot related data.

Update memremap() to provide an arch specific routine to determine if RAM
remapping is allowed.  RAM remapping will cause an encrypted mapping to be
generated. By preventing RAM remapping, ioremap_cache() will be used
instead, which will provide a decrypted mapping of the boot related data.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Matt Fleming <matt@codeblueprint.co.uk>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/81fb6b4117a5df6b9f2eda342f81bbef4b23d2e5.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/io.h |   5 ++
 arch/x86/mm/ioremap.c     | 180 ++++++++++++++++++++++++++++++++++++++
 include/linux/io.h        |   2 +
 kernel/memremap.c         |  20 ++++-
 mm/early_ioremap.c        |  18 +++-
 5 files changed, 218 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 7afb0e2f07f4..09c5557b1454 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -381,4 +381,9 @@ extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size)
 #define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc
 #endif
 
+extern bool arch_memremap_can_ram_remap(resource_size_t offset,
+					unsigned long size,
+					unsigned long flags);
+#define arch_memremap_can_ram_remap arch_memremap_can_ram_remap
+
 #endif /* _ASM_X86_IO_H */
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 570201bbf442..8986b2868944 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -13,6 +13,8 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/mmiotrace.h>
+#include <linux/mem_encrypt.h>
+#include <linux/efi.h>
 
 #include <asm/set_memory.h>
 #include <asm/e820/api.h>
@@ -21,6 +23,7 @@
 #include <asm/tlbflush.h>
 #include <asm/pgalloc.h>
 #include <asm/pat.h>
+#include <asm/setup.h>
 
 #include "physaddr.h"
 
@@ -417,6 +420,183 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
 	iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
 }
 
+/*
+ * Examine the physical address to determine if it is an area of memory
+ * that should be mapped decrypted.  If the memory is not part of the
+ * kernel usable area it was accessed and created decrypted, so these
+ * areas should be mapped decrypted.
+ */
+static bool memremap_should_map_decrypted(resource_size_t phys_addr,
+					  unsigned long size)
+{
+	/* Check if the address is outside kernel usable area */
+	switch (e820__get_entry_type(phys_addr, phys_addr + size - 1)) {
+	case E820_TYPE_RESERVED:
+	case E820_TYPE_ACPI:
+	case E820_TYPE_NVS:
+	case E820_TYPE_UNUSABLE:
+		return true;
+	default:
+		break;
+	}
+
+	return false;
+}
+
+/*
+ * Examine the physical address to determine if it is EFI data. Check
+ * it against the boot params structure and EFI tables and memory types.
+ */
+static bool memremap_is_efi_data(resource_size_t phys_addr,
+				 unsigned long size)
+{
+	u64 paddr;
+
+	/* Check if the address is part of EFI boot/runtime data */
+	if (!efi_enabled(EFI_BOOT))
+		return false;
+
+	paddr = boot_params.efi_info.efi_memmap_hi;
+	paddr <<= 32;
+	paddr |= boot_params.efi_info.efi_memmap;
+	if (phys_addr == paddr)
+		return true;
+
+	paddr = boot_params.efi_info.efi_systab_hi;
+	paddr <<= 32;
+	paddr |= boot_params.efi_info.efi_systab;
+	if (phys_addr == paddr)
+		return true;
+
+	if (efi_is_table_address(phys_addr))
+		return true;
+
+	switch (efi_mem_type(phys_addr)) {
+	case EFI_BOOT_SERVICES_DATA:
+	case EFI_RUNTIME_SERVICES_DATA:
+		return true;
+	default:
+		break;
+	}
+
+	return false;
+}
+
+/*
+ * Examine the physical address to determine if it is boot data by checking
+ * it against the boot params setup_data chain.
+ */
+static bool memremap_is_setup_data(resource_size_t phys_addr,
+				   unsigned long size)
+{
+	struct setup_data *data;
+	u64 paddr, paddr_next;
+
+	paddr = boot_params.hdr.setup_data;
+	while (paddr) {
+		unsigned int len;
+
+		if (phys_addr == paddr)
+			return true;
+
+		data = memremap(paddr, sizeof(*data),
+				MEMREMAP_WB | MEMREMAP_DEC);
+
+		paddr_next = data->next;
+		len = data->len;
+
+		memunmap(data);
+
+		if ((phys_addr > paddr) && (phys_addr < (paddr + len)))
+			return true;
+
+		paddr = paddr_next;
+	}
+
+	return false;
+}
+
+/*
+ * Examine the physical address to determine if it is boot data by checking
+ * it against the boot params setup_data chain (early boot version).
+ */
+static bool __init early_memremap_is_setup_data(resource_size_t phys_addr,
+						unsigned long size)
+{
+	struct setup_data *data;
+	u64 paddr, paddr_next;
+
+	paddr = boot_params.hdr.setup_data;
+	while (paddr) {
+		unsigned int len;
+
+		if (phys_addr == paddr)
+			return true;
+
+		data = early_memremap_decrypted(paddr, sizeof(*data));
+
+		paddr_next = data->next;
+		len = data->len;
+
+		early_memunmap(data, sizeof(*data));
+
+		if ((phys_addr > paddr) && (phys_addr < (paddr + len)))
+			return true;
+
+		paddr = paddr_next;
+	}
+
+	return false;
+}
+
+/*
+ * Architecture function to determine if RAM remap is allowed. By default, a
+ * RAM remap will map the data as encrypted. Determine if a RAM remap should
+ * not be done so that the data will be mapped decrypted.
+ */
+bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size,
+				 unsigned long flags)
+{
+	if (!sme_active())
+		return true;
+
+	if (flags & MEMREMAP_ENC)
+		return true;
+
+	if (flags & MEMREMAP_DEC)
+		return false;
+
+	if (memremap_is_setup_data(phys_addr, size) ||
+	    memremap_is_efi_data(phys_addr, size) ||
+	    memremap_should_map_decrypted(phys_addr, size))
+		return false;
+
+	return true;
+}
+
+/*
+ * Architecture override of __weak function to adjust the protection attributes
+ * used when remapping memory. By default, early_memremap() will map the data
+ * as encrypted. Determine if an encrypted mapping should not be done and set
+ * the appropriate protection attributes.
+ */
+pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
+					     unsigned long size,
+					     pgprot_t prot)
+{
+	if (!sme_active())
+		return prot;
+
+	if (early_memremap_is_setup_data(phys_addr, size) ||
+	    memremap_is_efi_data(phys_addr, size) ||
+	    memremap_should_map_decrypted(phys_addr, size))
+		prot = pgprot_decrypted(prot);
+	else
+		prot = pgprot_encrypted(prot);
+
+	return prot;
+}
+
 #ifdef CONFIG_ARCH_USE_MEMREMAP_PROT
 /* Remap memory with encryption */
 void __init *early_memremap_encrypted(resource_size_t phys_addr,
diff --git a/include/linux/io.h b/include/linux/io.h
index 2195d9ea4aaa..32e30e8fb9db 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -157,6 +157,8 @@ enum {
 	MEMREMAP_WB = 1 << 0,
 	MEMREMAP_WT = 1 << 1,
 	MEMREMAP_WC = 1 << 2,
+	MEMREMAP_ENC = 1 << 3,
+	MEMREMAP_DEC = 1 << 4,
 };
 
 void *memremap(resource_size_t offset, size_t size, unsigned long flags);
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 124bed776532..9afdc434fb49 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -34,13 +34,24 @@ static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
 }
 #endif
 
-static void *try_ram_remap(resource_size_t offset, size_t size)
+#ifndef arch_memremap_can_ram_remap
+static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size,
+					unsigned long flags)
+{
+	return true;
+}
+#endif
+
+static void *try_ram_remap(resource_size_t offset, size_t size,
+			   unsigned long flags)
 {
 	unsigned long pfn = PHYS_PFN(offset);
 
 	/* In the simple case just return the existing linear address */
-	if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)))
+	if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) &&
+	    arch_memremap_can_ram_remap(offset, size, flags))
 		return __va(offset);
+
 	return NULL; /* fallback to arch_memremap_wb */
 }
 
@@ -48,7 +59,8 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
  * memremap() - remap an iomem_resource as cacheable memory
  * @offset: iomem resource start address
  * @size: size of remap
- * @flags: any of MEMREMAP_WB, MEMREMAP_WT and MEMREMAP_WC
+ * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC,
+ *		  MEMREMAP_ENC, MEMREMAP_DEC
  *
  * memremap() is "ioremap" for cases where it is known that the resource
  * being mapped does not have i/o side effects and the __iomem
@@ -95,7 +107,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
 		 * the requested range is potentially in System RAM.
 		 */
 		if (is_ram == REGION_INTERSECTS)
-			addr = try_ram_remap(offset, size);
+			addr = try_ram_remap(offset, size, flags);
 		if (!addr)
 			addr = arch_memremap_wb(offset, size);
 	}
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index d7d30da754ba..b1dd4a948fc0 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -30,6 +30,13 @@ early_param("early_ioremap_debug", early_ioremap_debug_setup);
 
 static int after_paging_init __initdata;
 
+pgprot_t __init __weak early_memremap_pgprot_adjust(resource_size_t phys_addr,
+						    unsigned long size,
+						    pgprot_t prot)
+{
+	return prot;
+}
+
 void __init __weak early_ioremap_shutdown(void)
 {
 }
@@ -215,14 +222,19 @@ early_ioremap(resource_size_t phys_addr, unsigned long size)
 void __init *
 early_memremap(resource_size_t phys_addr, unsigned long size)
 {
-	return (__force void *)__early_ioremap(phys_addr, size,
-					       FIXMAP_PAGE_NORMAL);
+	pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size,
+						     FIXMAP_PAGE_NORMAL);
+
+	return (__force void *)__early_ioremap(phys_addr, size, prot);
 }
 #ifdef FIXMAP_PAGE_RO
 void __init *
 early_memremap_ro(resource_size_t phys_addr, unsigned long size)
 {
-	return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO);
+	pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size,
+						     FIXMAP_PAGE_RO);
+
+	return (__force void *)__early_ioremap(phys_addr, size, prot);
 }
 #endif
 

From 5997efb967565e858259401af394e8449629c1f0 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:17 -0500
Subject: [PATCH 031/149] x86/boot: Use memremap() to map the MPF and MPC data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The SMP MP-table is built by UEFI and placed in memory in a decrypted
state. These tables are accessed using a mix of early_memremap(),
early_memunmap(), phys_to_virt() and virt_to_phys(). Change all accesses
to use early_memremap()/early_memunmap(). This allows for proper setting
of the encryption mask so that the data can be successfully accessed when
SME is active.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/d9464b0d7c861021ed8f494e4a40d6cd10f1eddd.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/mpparse.c | 98 ++++++++++++++++++++++++++++-----------
 1 file changed, 70 insertions(+), 28 deletions(-)

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index fd37f39066da..5cbb3177ed17 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -429,7 +429,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
 	}
 }
 
-static struct mpf_intel *mpf_found;
+static unsigned long mpf_base;
 
 static unsigned long __init get_mpc_size(unsigned long physptr)
 {
@@ -451,6 +451,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
 
 	size = get_mpc_size(mpf->physptr);
 	mpc = early_memremap(mpf->physptr, size);
+
 	/*
 	 * Read the physical hardware table.  Anything here will
 	 * override the defaults.
@@ -497,12 +498,12 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
  */
 void __init default_get_smp_config(unsigned int early)
 {
-	struct mpf_intel *mpf = mpf_found;
+	struct mpf_intel *mpf;
 
 	if (!smp_found_config)
 		return;
 
-	if (!mpf)
+	if (!mpf_base)
 		return;
 
 	if (acpi_lapic && early)
@@ -515,6 +516,12 @@ void __init default_get_smp_config(unsigned int early)
 	if (acpi_lapic && acpi_ioapic)
 		return;
 
+	mpf = early_memremap(mpf_base, sizeof(*mpf));
+	if (!mpf) {
+		pr_err("MPTABLE: error mapping MP table\n");
+		return;
+	}
+
 	pr_info("Intel MultiProcessor Specification v1.%d\n",
 		mpf->specification);
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
@@ -529,7 +536,7 @@ void __init default_get_smp_config(unsigned int early)
 	/*
 	 * Now see if we need to read further.
 	 */
-	if (mpf->feature1 != 0) {
+	if (mpf->feature1) {
 		if (early) {
 			/*
 			 * local APIC has default address
@@ -542,8 +549,10 @@ void __init default_get_smp_config(unsigned int early)
 		construct_default_ISA_mptable(mpf->feature1);
 
 	} else if (mpf->physptr) {
-		if (check_physptr(mpf, early))
+		if (check_physptr(mpf, early)) {
+			early_memunmap(mpf, sizeof(*mpf));
 			return;
+		}
 	} else
 		BUG();
 
@@ -552,6 +561,8 @@ void __init default_get_smp_config(unsigned int early)
 	/*
 	 * Only use the first configuration found.
 	 */
+
+	early_memunmap(mpf, sizeof(*mpf));
 }
 
 static void __init smp_reserve_memory(struct mpf_intel *mpf)
@@ -561,15 +572,16 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf)
 
 static int __init smp_scan_config(unsigned long base, unsigned long length)
 {
-	unsigned int *bp = phys_to_virt(base);
+	unsigned int *bp;
 	struct mpf_intel *mpf;
-	unsigned long mem;
+	int ret = 0;
 
 	apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n",
 		    base, base + length - 1);
 	BUILD_BUG_ON(sizeof(*mpf) != 16);
 
 	while (length > 0) {
+		bp = early_memremap(base, length);
 		mpf = (struct mpf_intel *)bp;
 		if ((*bp == SMP_MAGIC_IDENT) &&
 		    (mpf->length == 1) &&
@@ -579,24 +591,26 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
 #ifdef CONFIG_X86_LOCAL_APIC
 			smp_found_config = 1;
 #endif
-			mpf_found = mpf;
+			mpf_base = base;
 
-			pr_info("found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n",
-				(unsigned long long) virt_to_phys(mpf),
-				(unsigned long long) virt_to_phys(mpf) +
-				sizeof(*mpf) - 1, mpf);
+			pr_info("found SMP MP-table at [mem %#010lx-%#010lx] mapped at [%p]\n",
+				base, base + sizeof(*mpf) - 1, mpf);
 
-			mem = virt_to_phys(mpf);
-			memblock_reserve(mem, sizeof(*mpf));
+			memblock_reserve(base, sizeof(*mpf));
 			if (mpf->physptr)
 				smp_reserve_memory(mpf);
 
-			return 1;
+			ret = 1;
 		}
-		bp += 4;
+		early_memunmap(bp, length);
+
+		if (ret)
+			break;
+
+		base += 16;
 		length -= 16;
 	}
-	return 0;
+	return ret;
 }
 
 void __init default_find_smp_config(void)
@@ -838,29 +852,40 @@ static int __init update_mp_table(void)
 	char oem[10];
 	struct mpf_intel *mpf;
 	struct mpc_table *mpc, *mpc_new;
+	unsigned long size;
 
 	if (!enable_update_mptable)
 		return 0;
 
-	mpf = mpf_found;
-	if (!mpf)
+	if (!mpf_base)
 		return 0;
 
+	mpf = early_memremap(mpf_base, sizeof(*mpf));
+	if (!mpf) {
+		pr_err("MPTABLE: mpf early_memremap() failed\n");
+		return 0;
+	}
+
 	/*
 	 * Now see if we need to go further.
 	 */
-	if (mpf->feature1 != 0)
-		return 0;
+	if (mpf->feature1)
+		goto do_unmap_mpf;
 
 	if (!mpf->physptr)
-		return 0;
+		goto do_unmap_mpf;
 
-	mpc = phys_to_virt(mpf->physptr);
+	size = get_mpc_size(mpf->physptr);
+	mpc = early_memremap(mpf->physptr, size);
+	if (!mpc) {
+		pr_err("MPTABLE: mpc early_memremap() failed\n");
+		goto do_unmap_mpf;
+	}
 
 	if (!smp_check_mpc(mpc, oem, str))
-		return 0;
+		goto do_unmap_mpc;
 
-	pr_info("mpf: %llx\n", (u64)virt_to_phys(mpf));
+	pr_info("mpf: %llx\n", (u64)mpf_base);
 	pr_info("physptr: %x\n", mpf->physptr);
 
 	if (mpc_new_phys && mpc->length > mpc_new_length) {
@@ -878,21 +903,32 @@ static int __init update_mp_table(void)
 		new = mpf_checksum((unsigned char *)mpc, mpc->length);
 		if (old == new) {
 			pr_info("mpc is readonly, please try alloc_mptable instead\n");
-			return 0;
+			goto do_unmap_mpc;
 		}
 		pr_info("use in-position replacing\n");
 	} else {
+		mpc_new = early_memremap(mpc_new_phys, mpc_new_length);
+		if (!mpc_new) {
+			pr_err("MPTABLE: new mpc early_memremap() failed\n");
+			goto do_unmap_mpc;
+		}
 		mpf->physptr = mpc_new_phys;
-		mpc_new = phys_to_virt(mpc_new_phys);
 		memcpy(mpc_new, mpc, mpc->length);
+		early_memunmap(mpc, size);
 		mpc = mpc_new;
+		size = mpc_new_length;
 		/* check if we can modify that */
 		if (mpc_new_phys - mpf->physptr) {
 			struct mpf_intel *mpf_new;
 			/* steal 16 bytes from [0, 1k) */
+			mpf_new = early_memremap(0x400 - 16, sizeof(*mpf_new));
+			if (!mpf_new) {
+				pr_err("MPTABLE: new mpf early_memremap() failed\n");
+				goto do_unmap_mpc;
+			}
 			pr_info("mpf new: %x\n", 0x400 - 16);
-			mpf_new = phys_to_virt(0x400 - 16);
 			memcpy(mpf_new, mpf, 16);
+			early_memunmap(mpf, sizeof(*mpf));
 			mpf = mpf_new;
 			mpf->physptr = mpc_new_phys;
 		}
@@ -909,6 +945,12 @@ static int __init update_mp_table(void)
 	 */
 	replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
 
+do_unmap_mpc:
+	early_memunmap(mpc, size);
+
+do_unmap_mpf:
+	early_memunmap(mpf, sizeof(*mpf));
+
 	return 0;
 }
 

From 1de328628cd06b5efff9195b57bdc1a64680814d Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:18 -0500
Subject: [PATCH 032/149] x86/mm: Add support to access persistent memory in
 the clear
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Persistent memory is expected to persist across reboots. The encryption
key used by SME will change across reboots which will result in corrupted
persistent memory.  Persistent memory is handed out by block devices
through memory remapping functions, so be sure not to map this memory as
encrypted.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/7d829302d8fdc85f3d9505fc3eb8ec0c3a3e1cbf.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/ioremap.c | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 8986b2868944..704fc081c104 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -424,17 +424,46 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
  * Examine the physical address to determine if it is an area of memory
  * that should be mapped decrypted.  If the memory is not part of the
  * kernel usable area it was accessed and created decrypted, so these
- * areas should be mapped decrypted.
+ * areas should be mapped decrypted. And since the encryption key can
+ * change across reboots, persistent memory should also be mapped
+ * decrypted.
  */
 static bool memremap_should_map_decrypted(resource_size_t phys_addr,
 					  unsigned long size)
 {
+	int is_pmem;
+
+	/*
+	 * Check if the address is part of a persistent memory region.
+	 * This check covers areas added by E820, EFI and ACPI.
+	 */
+	is_pmem = region_intersects(phys_addr, size, IORESOURCE_MEM,
+				    IORES_DESC_PERSISTENT_MEMORY);
+	if (is_pmem != REGION_DISJOINT)
+		return true;
+
+	/*
+	 * Check if the non-volatile attribute is set for an EFI
+	 * reserved area.
+	 */
+	if (efi_enabled(EFI_BOOT)) {
+		switch (efi_mem_type(phys_addr)) {
+		case EFI_RESERVED_TYPE:
+			if (efi_mem_attributes(phys_addr) & EFI_MEMORY_NV)
+				return true;
+			break;
+		default:
+			break;
+		}
+	}
+
 	/* Check if the address is outside kernel usable area */
 	switch (e820__get_entry_type(phys_addr, phys_addr + size - 1)) {
 	case E820_TYPE_RESERVED:
 	case E820_TYPE_ACPI:
 	case E820_TYPE_NVS:
 	case E820_TYPE_UNUSABLE:
+	case E820_TYPE_PRAM:
 		return true;
 	default:
 		break;

From 77bd2342d4304bda7896c953d424d15deb314ca3 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:19 -0500
Subject: [PATCH 033/149] x86/mm: Add support for changing the memory
 encryption attribute
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for changing the memory encryption attribute for one or more
memory pages. This will be useful when we have to change the AP trampoline
area to not be encrypted. Or when we need to change the SWIOTLB area to
not be encrypted in support of devices that can't support the encryption
mask range.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/924ae0d1f6d4c90c5a0e366c291b90a2d86aa79e.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/set_memory.h |  3 ++
 arch/x86/mm/pageattr.c            | 62 +++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+)

diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index eaec6c364e42..cd71273ec49d 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -11,6 +11,7 @@
  * Executability : eXeutable, NoteXecutable
  * Read/Write    : ReadOnly, ReadWrite
  * Presence      : NotPresent
+ * Encryption    : Encrypted, Decrypted
  *
  * Within a category, the attributes are mutually exclusive.
  *
@@ -42,6 +43,8 @@ int set_memory_wt(unsigned long addr, int numpages);
 int set_memory_wb(unsigned long addr, int numpages);
 int set_memory_np(unsigned long addr, int numpages);
 int set_memory_4k(unsigned long addr, int numpages);
+int set_memory_encrypted(unsigned long addr, int numpages);
+int set_memory_decrypted(unsigned long addr, int numpages);
 
 int set_memory_array_uc(unsigned long *addr, int addrinarray);
 int set_memory_array_wc(unsigned long *addr, int addrinarray);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 7e2d6c0a64c4..9c8ea129ff5c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1775,6 +1775,68 @@ int set_memory_4k(unsigned long addr, int numpages)
 					__pgprot(0), 1, 0, NULL);
 }
 
+static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
+{
+	struct cpa_data cpa;
+	unsigned long start;
+	int ret;
+
+	/* Nothing to do if the SME is not active */
+	if (!sme_active())
+		return 0;
+
+	/* Should not be working on unaligned addresses */
+	if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr))
+		addr &= PAGE_MASK;
+
+	start = addr;
+
+	memset(&cpa, 0, sizeof(cpa));
+	cpa.vaddr = &addr;
+	cpa.numpages = numpages;
+	cpa.mask_set = enc ? __pgprot(_PAGE_ENC) : __pgprot(0);
+	cpa.mask_clr = enc ? __pgprot(0) : __pgprot(_PAGE_ENC);
+	cpa.pgd = init_mm.pgd;
+
+	/* Must avoid aliasing mappings in the highmem code */
+	kmap_flush_unused();
+	vm_unmap_aliases();
+
+	/*
+	 * Before changing the encryption attribute, we need to flush caches.
+	 */
+	if (static_cpu_has(X86_FEATURE_CLFLUSH))
+		cpa_flush_range(start, numpages, 1);
+	else
+		cpa_flush_all(1);
+
+	ret = __change_page_attr_set_clr(&cpa, 1);
+
+	/*
+	 * After changing the encryption attribute, we need to flush TLBs
+	 * again in case any speculative TLB caching occurred (but no need
+	 * to flush caches again).  We could just use cpa_flush_all(), but
+	 * in case TLB flushing gets optimized in the cpa_flush_range()
+	 * path use the same logic as above.
+	 */
+	if (static_cpu_has(X86_FEATURE_CLFLUSH))
+		cpa_flush_range(start, numpages, 0);
+	else
+		cpa_flush_all(0);
+
+	return ret;
+}
+
+int set_memory_encrypted(unsigned long addr, int numpages)
+{
+	return __set_memory_enc_dec(addr, numpages, true);
+}
+
+int set_memory_decrypted(unsigned long addr, int numpages)
+{
+	return __set_memory_enc_dec(addr, numpages, false);
+}
+
 int set_pages_uc(struct page *page, int numpages)
 {
 	unsigned long addr = (unsigned long)page_address(page);

From 163ea3c83aeeb3908a51162c79cb3a7c374d92b4 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:20 -0500
Subject: [PATCH 034/149] x86/realmode: Decrypt trampoline area if memory
 encryption is active
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When Secure Memory Encryption is enabled, the trampoline area must not
be encrypted. A CPU running in real mode will not be able to decrypt
memory that has been encrypted because it will not be able to use addresses
with the memory encryption mask.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/c70ffd2614fa77e80df31c9169ca98a9b16ff97c.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/realmode/init.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index cd4be19c36dc..d6ddc7e60884 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -1,6 +1,7 @@
 #include <linux/io.h>
 #include <linux/slab.h>
 #include <linux/memblock.h>
+#include <linux/mem_encrypt.h>
 
 #include <asm/set_memory.h>
 #include <asm/pgtable.h>
@@ -59,6 +60,13 @@ static void __init setup_real_mode(void)
 
 	base = (unsigned char *)real_mode_header;
 
+	/*
+	 * If SME is active, the trampoline area will need to be in
+	 * decrypted memory in order to bring up other processors
+	 * successfully.
+	 */
+	set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT);
+
 	memcpy(base, real_mode_blob, size);
 
 	phys_base = __pa(base);

From c7753208a94c73d5beb1e4bd843081d6dc7d4678 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:21 -0500
Subject: [PATCH 035/149] x86, swiotlb: Add memory encryption support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since DMA addresses will effectively look like 48-bit addresses when the
memory encryption mask is set, SWIOTLB is needed if the DMA mask of the
device performing the DMA does not support 48-bits. SWIOTLB will be
initialized to create decrypted bounce buffers for use by these devices.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/aa2d29b78ae7d508db8881e46a3215231b9327a7.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/dma-mapping.h |  5 +--
 arch/x86/include/asm/mem_encrypt.h |  5 +++
 arch/x86/kernel/pci-dma.c          | 11 +++---
 arch/x86/kernel/pci-nommu.c        |  2 +-
 arch/x86/kernel/pci-swiotlb.c      | 15 +++++++--
 arch/x86/mm/mem_encrypt.c          | 22 ++++++++++++
 include/linux/swiotlb.h            |  1 +
 init/main.c                        | 10 ++++++
 lib/swiotlb.c                      | 54 +++++++++++++++++++++++++-----
 9 files changed, 108 insertions(+), 17 deletions(-)

diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 398c79889f5c..1387dafdba2d 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -12,6 +12,7 @@
 #include <asm/io.h>
 #include <asm/swiotlb.h>
 #include <linux/dma-contiguous.h>
+#include <linux/mem_encrypt.h>
 
 #ifdef CONFIG_ISA
 # define ISA_DMA_BIT_MASK DMA_BIT_MASK(24)
@@ -57,12 +58,12 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
 
 static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
 {
-	return paddr;
+	return __sme_set(paddr);
 }
 
 static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
 {
-	return daddr;
+	return __sme_clr(daddr);
 }
 #endif /* CONFIG_X86_DMA_REMAP */
 
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index ab1fe77c2f73..70e55f6b9adf 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -34,6 +34,11 @@ void __init sme_early_init(void);
 void __init sme_encrypt_kernel(void);
 void __init sme_enable(void);
 
+/* Architecture __weak replacement functions */
+void __init mem_encrypt_init(void);
+
+void swiotlb_set_mem_attributes(void *vaddr, unsigned long size);
+
 #else	/* !CONFIG_AMD_MEM_ENCRYPT */
 
 #define sme_me_mask	0UL
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 5e16d3f29594..0accc2404b92 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -93,9 +93,12 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size,
 	if (gfpflags_allow_blocking(flag)) {
 		page = dma_alloc_from_contiguous(dev, count, get_order(size),
 						 flag);
-		if (page && page_to_phys(page) + size > dma_mask) {
-			dma_release_from_contiguous(dev, page, count);
-			page = NULL;
+		if (page) {
+			addr = phys_to_dma(dev, page_to_phys(page));
+			if (addr + size > dma_mask) {
+				dma_release_from_contiguous(dev, page, count);
+				page = NULL;
+			}
 		}
 	}
 	/* fallback */
@@ -104,7 +107,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size,
 	if (!page)
 		return NULL;
 
-	addr = page_to_phys(page);
+	addr = phys_to_dma(dev, page_to_phys(page));
 	if (addr + size > dma_mask) {
 		__free_pages(page, get_order(size));
 
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index a6d404087fe3..4fc3cb60ea11 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -32,7 +32,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
 				 enum dma_data_direction dir,
 				 unsigned long attrs)
 {
-	dma_addr_t bus = page_to_phys(page) + offset;
+	dma_addr_t bus = phys_to_dma(dev, page_to_phys(page)) + offset;
 	WARN_ON(size == 0);
 	if (!check_addr("map_single", dev, bus, size))
 		return NOMMU_MAPPING_ERROR;
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 1e23577e17cf..677077510e30 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -6,12 +6,14 @@
 #include <linux/swiotlb.h>
 #include <linux/bootmem.h>
 #include <linux/dma-mapping.h>
+#include <linux/mem_encrypt.h>
 
 #include <asm/iommu.h>
 #include <asm/swiotlb.h>
 #include <asm/dma.h>
 #include <asm/xen/swiotlb-xen.h>
 #include <asm/iommu_table.h>
+
 int swiotlb __read_mostly;
 
 void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
@@ -79,8 +81,8 @@ IOMMU_INIT_FINISH(pci_swiotlb_detect_override,
 		  pci_swiotlb_late_init);
 
 /*
- * if 4GB or more detected (and iommu=off not set) return 1
- * and set swiotlb to 1.
+ * If 4GB or more detected (and iommu=off not set) or if SME is active
+ * then set swiotlb to 1 and return 1.
  */
 int __init pci_swiotlb_detect_4gb(void)
 {
@@ -89,6 +91,15 @@ int __init pci_swiotlb_detect_4gb(void)
 	if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN)
 		swiotlb = 1;
 #endif
+
+	/*
+	 * If SME is active then swiotlb will be set to 1 so that bounce
+	 * buffers are allocated and used for devices that do not support
+	 * the addressing range required for the encryption mask.
+	 */
+	if (sme_active())
+		swiotlb = 1;
+
 	return swiotlb;
 }
 IOMMU_INIT(pci_swiotlb_detect_4gb,
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 0843d023da4a..a7400ec8538b 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -13,11 +13,14 @@
 #include <linux/linkage.h>
 #include <linux/init.h>
 #include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/swiotlb.h>
 
 #include <asm/tlbflush.h>
 #include <asm/fixmap.h>
 #include <asm/setup.h>
 #include <asm/bootparam.h>
+#include <asm/set_memory.h>
 
 /*
  * Since SME related variables are set early in the boot process they must
@@ -177,6 +180,25 @@ void __init sme_early_init(void)
 		protection_map[i] = pgprot_encrypted(protection_map[i]);
 }
 
+/* Architecture __weak replacement functions */
+void __init mem_encrypt_init(void)
+{
+	if (!sme_me_mask)
+		return;
+
+	/* Call into SWIOTLB to update the SWIOTLB DMA buffers */
+	swiotlb_update_mem_attributes();
+}
+
+void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
+{
+	WARN(PAGE_ALIGN(size) != size,
+	     "size is not page-aligned (%#lx)\n", size);
+
+	/* Make the SWIOTLB buffer area decrypted */
+	set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT);
+}
+
 void __init sme_encrypt_kernel(void)
 {
 }
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 4ee479f2f355..15e7160751a8 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -35,6 +35,7 @@ int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose);
 extern unsigned long swiotlb_nr_tbl(void);
 unsigned long swiotlb_size_or_default(void);
 extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs);
+extern void __init swiotlb_update_mem_attributes(void);
 
 /*
  * Enumeration for sync targets
diff --git a/init/main.c b/init/main.c
index 052481fbe363..9789ab7fe85e 100644
--- a/init/main.c
+++ b/init/main.c
@@ -488,6 +488,8 @@ void __init __weak thread_stack_cache_init(void)
 }
 #endif
 
+void __init __weak mem_encrypt_init(void) { }
+
 /*
  * Set up kernel memory allocators
  */
@@ -641,6 +643,14 @@ asmlinkage __visible void __init start_kernel(void)
 	 */
 	locking_selftest();
 
+	/*
+	 * This needs to be called before any devices perform DMA
+	 * operations that might use the SWIOTLB bounce buffers. It will
+	 * mark the bounce buffers as decrypted so that their usage will
+	 * not cause "plain-text" data to be decrypted when accessed.
+	 */
+	mem_encrypt_init();
+
 #ifdef CONFIG_BLK_DEV_INITRD
 	if (initrd_start && !initrd_below_start_ok &&
 	    page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) {
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index a8d74a733a38..04ac91acf193 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -30,6 +30,7 @@
 #include <linux/highmem.h>
 #include <linux/gfp.h>
 #include <linux/scatterlist.h>
+#include <linux/mem_encrypt.h>
 
 #include <asm/io.h>
 #include <asm/dma.h>
@@ -155,6 +156,15 @@ unsigned long swiotlb_size_or_default(void)
 	return size ? size : (IO_TLB_DEFAULT_SIZE);
 }
 
+void __weak swiotlb_set_mem_attributes(void *vaddr, unsigned long size) { }
+
+/* For swiotlb, clear memory encryption mask from dma addresses */
+static dma_addr_t swiotlb_phys_to_dma(struct device *hwdev,
+				      phys_addr_t address)
+{
+	return __sme_clr(phys_to_dma(hwdev, address));
+}
+
 /* Note that this doesn't work with highmem page */
 static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
 				      volatile void *address)
@@ -183,6 +193,31 @@ void swiotlb_print_info(void)
 	       bytes >> 20, vstart, vend - 1);
 }
 
+/*
+ * Early SWIOTLB allocation may be too early to allow an architecture to
+ * perform the desired operations.  This function allows the architecture to
+ * call SWIOTLB when the operations are possible.  It needs to be called
+ * before the SWIOTLB memory is used.
+ */
+void __init swiotlb_update_mem_attributes(void)
+{
+	void *vaddr;
+	unsigned long bytes;
+
+	if (no_iotlb_memory || late_alloc)
+		return;
+
+	vaddr = phys_to_virt(io_tlb_start);
+	bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT);
+	swiotlb_set_mem_attributes(vaddr, bytes);
+	memset(vaddr, 0, bytes);
+
+	vaddr = phys_to_virt(io_tlb_overflow_buffer);
+	bytes = PAGE_ALIGN(io_tlb_overflow);
+	swiotlb_set_mem_attributes(vaddr, bytes);
+	memset(vaddr, 0, bytes);
+}
+
 int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
 {
 	void *v_overflow_buffer;
@@ -320,6 +355,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
 	io_tlb_start = virt_to_phys(tlb);
 	io_tlb_end = io_tlb_start + bytes;
 
+	swiotlb_set_mem_attributes(tlb, bytes);
 	memset(tlb, 0, bytes);
 
 	/*
@@ -330,6 +366,8 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
 	if (!v_overflow_buffer)
 		goto cleanup2;
 
+	swiotlb_set_mem_attributes(v_overflow_buffer, io_tlb_overflow);
+	memset(v_overflow_buffer, 0, io_tlb_overflow);
 	io_tlb_overflow_buffer = virt_to_phys(v_overflow_buffer);
 
 	/*
@@ -581,7 +619,7 @@ map_single(struct device *hwdev, phys_addr_t phys, size_t size,
 		return SWIOTLB_MAP_ERROR;
 	}
 
-	start_dma_addr = phys_to_dma(hwdev, io_tlb_start);
+	start_dma_addr = swiotlb_phys_to_dma(hwdev, io_tlb_start);
 	return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size,
 				      dir, attrs);
 }
@@ -702,7 +740,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 			goto err_warn;
 
 		ret = phys_to_virt(paddr);
-		dev_addr = phys_to_dma(hwdev, paddr);
+		dev_addr = swiotlb_phys_to_dma(hwdev, paddr);
 
 		/* Confirm address can be DMA'd by device */
 		if (dev_addr + size - 1 > dma_mask) {
@@ -812,10 +850,10 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
 	map = map_single(dev, phys, size, dir, attrs);
 	if (map == SWIOTLB_MAP_ERROR) {
 		swiotlb_full(dev, size, dir, 1);
-		return phys_to_dma(dev, io_tlb_overflow_buffer);
+		return swiotlb_phys_to_dma(dev, io_tlb_overflow_buffer);
 	}
 
-	dev_addr = phys_to_dma(dev, map);
+	dev_addr = swiotlb_phys_to_dma(dev, map);
 
 	/* Ensure that the address returned is DMA'ble */
 	if (dma_capable(dev, dev_addr, size))
@@ -824,7 +862,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
 	attrs |= DMA_ATTR_SKIP_CPU_SYNC;
 	swiotlb_tbl_unmap_single(dev, map, size, dir, attrs);
 
-	return phys_to_dma(dev, io_tlb_overflow_buffer);
+	return swiotlb_phys_to_dma(dev, io_tlb_overflow_buffer);
 }
 EXPORT_SYMBOL_GPL(swiotlb_map_page);
 
@@ -958,7 +996,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
 				sg_dma_len(sgl) = 0;
 				return 0;
 			}
-			sg->dma_address = phys_to_dma(hwdev, map);
+			sg->dma_address = swiotlb_phys_to_dma(hwdev, map);
 		} else
 			sg->dma_address = dev_addr;
 		sg_dma_len(sg) = sg->length;
@@ -1026,7 +1064,7 @@ EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
 int
 swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
 {
-	return (dma_addr == phys_to_dma(hwdev, io_tlb_overflow_buffer));
+	return (dma_addr == swiotlb_phys_to_dma(hwdev, io_tlb_overflow_buffer));
 }
 EXPORT_SYMBOL(swiotlb_dma_mapping_error);
 
@@ -1039,6 +1077,6 @@ EXPORT_SYMBOL(swiotlb_dma_mapping_error);
 int
 swiotlb_dma_supported(struct device *hwdev, u64 mask)
 {
-	return phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
+	return swiotlb_phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
 }
 EXPORT_SYMBOL(swiotlb_dma_supported);

From 648babb7078c6310d2af5b8aa01f086030916968 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:22 -0500
Subject: [PATCH 036/149] swiotlb: Add warnings for use of bounce buffers with
 SME
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add warnings to let the user know when bounce buffers are being used for
DMA when SME is active.  Since the bounce buffers are not in encrypted
memory, these notifications are to allow the user to determine some
appropriate action - if necessary.  Actions can range from utilizing an
IOMMU, replacing the device with another device that can support 64-bit
DMA, ignoring the message if the device isn't used much, etc.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/d112564053c3f2e86ca634a8d4fa4abc0eb53a6a.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/dma-mapping.h | 13 +++++++++++++
 lib/swiotlb.c               |  3 +++
 2 files changed, 16 insertions(+)

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 843ab866e0f4..fce2369ecf82 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -10,6 +10,7 @@
 #include <linux/scatterlist.h>
 #include <linux/kmemcheck.h>
 #include <linux/bug.h>
+#include <linux/mem_encrypt.h>
 
 /**
  * List of possible attributes associated with a DMA mapping. The semantics
@@ -548,6 +549,12 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 	return 0;
 }
 
+static inline void dma_check_mask(struct device *dev, u64 mask)
+{
+	if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1)))
+		dev_warn(dev, "SME is active, device will require DMA bounce buffers\n");
+}
+
 static inline int dma_supported(struct device *dev, u64 mask)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -564,6 +571,9 @@ static inline int dma_set_mask(struct device *dev, u64 mask)
 {
 	if (!dev->dma_mask || !dma_supported(dev, mask))
 		return -EIO;
+
+	dma_check_mask(dev, mask);
+
 	*dev->dma_mask = mask;
 	return 0;
 }
@@ -583,6 +593,9 @@ static inline int dma_set_coherent_mask(struct device *dev, u64 mask)
 {
 	if (!dma_supported(dev, mask))
 		return -EIO;
+
+	dma_check_mask(dev, mask);
+
 	dev->coherent_dma_mask = mask;
 	return 0;
 }
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 04ac91acf193..8c6c83ef57a4 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -507,6 +507,9 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
 	if (no_iotlb_memory)
 		panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
 
+	if (sme_active())
+		pr_warn_once("SME is active and system is using DMA bounce buffers\n");
+
 	mask = dma_get_seg_boundary(hwdev);
 
 	tbl_dma_addr &= mask;

From f655e6e6b992a2fb0d0334db2620607b98df39e7 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:23 -0500
Subject: [PATCH 037/149] x86/cpu/AMD: Make the microcode level available
 earlier in the boot
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move the setting of the cpuinfo_x86.microcode field from amd_init() to
early_amd_init() so that it is available earlier in the boot process. This
avoids having to read MSR_AMD64_PATCH_LEVEL directly during early boot.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/7b7525fa12593dac5f4b01fcc25c95f97e93862f.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/amd.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 4d87950fde30..f22fd4ea8858 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -547,8 +547,12 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
 
 static void early_init_amd(struct cpuinfo_x86 *c)
 {
+	u32 dummy;
+
 	early_init_amd_mc(c);
 
+	rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
+
 	/*
 	 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
 	 * with P/T states and does not stop in deep C-states
@@ -750,8 +754,6 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
 
 static void init_amd(struct cpuinfo_x86 *c)
 {
-	u32 dummy;
-
 	early_init_amd(c);
 
 	/*
@@ -813,8 +815,6 @@ static void init_amd(struct cpuinfo_x86 *c)
 	if (c->x86 > 0x11)
 		set_cpu_cap(c, X86_FEATURE_ARAT);
 
-	rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
-
 	/* 3DNow or LM implies PREFETCHW */
 	if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
 		if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))

From 2543a786aa25258451f3418b87a038c7ddaa2e85 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:24 -0500
Subject: [PATCH 038/149] iommu/amd: Allow the AMD IOMMU to work with memory
 encryption
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The IOMMU is programmed with physical addresses for the various tables
and buffers that are used to communicate between the device and the
driver. When the driver allocates this memory it is encrypted. In order
for the IOMMU to access the memory as encrypted the encryption mask needs
to be included in these physical addresses during configuration.

The PTE entries created by the IOMMU should also include the encryption
mask so that when the device behind the IOMMU performs a DMA, the DMA
will be performed to encrypted memory.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Acked-by: Joerg Roedel <jroedel@suse.de>
Cc: <iommu@lists.linux-foundation.org>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/3053631ea25ba8b1601c351cb7c541c496f6d9bc.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/iommu/amd_iommu.c       | 30 +++++++++++++++--------------
 drivers/iommu/amd_iommu_init.c  | 34 +++++++++++++++++++++++++++------
 drivers/iommu/amd_iommu_proto.h | 10 ++++++++++
 drivers/iommu/amd_iommu_types.h |  2 +-
 4 files changed, 55 insertions(+), 21 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 688e77576e5a..abc6ca65a47b 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -575,7 +575,7 @@ static void dump_dte_entry(u16 devid)
 
 static void dump_command(unsigned long phys_addr)
 {
-	struct iommu_cmd *cmd = phys_to_virt(phys_addr);
+	struct iommu_cmd *cmd = iommu_phys_to_virt(phys_addr);
 	int i;
 
 	for (i = 0; i < 4; ++i)
@@ -919,11 +919,13 @@ static void copy_cmd_to_buffer(struct amd_iommu *iommu,
 
 static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
 {
+	u64 paddr = iommu_virt_to_phys((void *)address);
+
 	WARN_ON(address & 0x7ULL);
 
 	memset(cmd, 0, sizeof(*cmd));
-	cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK;
-	cmd->data[1] = upper_32_bits(__pa(address));
+	cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK;
+	cmd->data[1] = upper_32_bits(paddr);
 	cmd->data[2] = 1;
 	CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
 }
@@ -1383,7 +1385,7 @@ static bool increase_address_space(struct protection_domain *domain,
 		return false;
 
 	*pte             = PM_LEVEL_PDE(domain->mode,
-					virt_to_phys(domain->pt_root));
+					iommu_virt_to_phys(domain->pt_root));
 	domain->pt_root  = pte;
 	domain->mode    += 1;
 	domain->updated  = true;
@@ -1420,7 +1422,7 @@ static u64 *alloc_pte(struct protection_domain *domain,
 			if (!page)
 				return NULL;
 
-			__npte = PM_LEVEL_PDE(level, virt_to_phys(page));
+			__npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));
 
 			/* pte could have been changed somewhere. */
 			if (cmpxchg64(pte, __pte, __npte) != __pte) {
@@ -1536,10 +1538,10 @@ static int iommu_map_page(struct protection_domain *dom,
 			return -EBUSY;
 
 	if (count > 1) {
-		__pte = PAGE_SIZE_PTE(phys_addr, page_size);
+		__pte = PAGE_SIZE_PTE(__sme_set(phys_addr), page_size);
 		__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
 	} else
-		__pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;
+		__pte = __sme_set(phys_addr) | IOMMU_PTE_P | IOMMU_PTE_FC;
 
 	if (prot & IOMMU_PROT_IR)
 		__pte |= IOMMU_PTE_IR;
@@ -1755,7 +1757,7 @@ static void free_gcr3_tbl_level1(u64 *tbl)
 		if (!(tbl[i] & GCR3_VALID))
 			continue;
 
-		ptr = __va(tbl[i] & PAGE_MASK);
+		ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
 
 		free_page((unsigned long)ptr);
 	}
@@ -1770,7 +1772,7 @@ static void free_gcr3_tbl_level2(u64 *tbl)
 		if (!(tbl[i] & GCR3_VALID))
 			continue;
 
-		ptr = __va(tbl[i] & PAGE_MASK);
+		ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
 
 		free_gcr3_tbl_level1(ptr);
 	}
@@ -2049,7 +2051,7 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
 	u64 flags = 0;
 
 	if (domain->mode != PAGE_MODE_NONE)
-		pte_root = virt_to_phys(domain->pt_root);
+		pte_root = iommu_virt_to_phys(domain->pt_root);
 
 	pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
 		    << DEV_ENTRY_MODE_SHIFT;
@@ -2061,7 +2063,7 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
 		flags |= DTE_FLAG_IOTLB;
 
 	if (domain->flags & PD_IOMMUV2_MASK) {
-		u64 gcr3 = __pa(domain->gcr3_tbl);
+		u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl);
 		u64 glx  = domain->glx;
 		u64 tmp;
 
@@ -3606,10 +3608,10 @@ static u64 *__get_gcr3_pte(u64 *root, int level, int pasid, bool alloc)
 			if (root == NULL)
 				return NULL;
 
-			*pte = __pa(root) | GCR3_VALID;
+			*pte = iommu_virt_to_phys(root) | GCR3_VALID;
 		}
 
-		root = __va(*pte & PAGE_MASK);
+		root = iommu_phys_to_virt(*pte & PAGE_MASK);
 
 		level -= 1;
 	}
@@ -3788,7 +3790,7 @@ static void set_dte_irq_entry(u16 devid, struct irq_remap_table *table)
 
 	dte	= amd_iommu_dev_table[devid].data[2];
 	dte	&= ~DTE_IRQ_PHYS_ADDR_MASK;
-	dte	|= virt_to_phys(table->table);
+	dte	|= iommu_virt_to_phys(table->table);
 	dte	|= DTE_IRQ_REMAP_INTCTL;
 	dte	|= DTE_IRQ_TABLE_LEN;
 	dte	|= DTE_IRQ_REMAP_ENABLE;
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 5cc597b383c7..42ff9a86c102 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -30,6 +30,7 @@
 #include <linux/iommu.h>
 #include <linux/kmemleak.h>
 #include <linux/crash_dump.h>
+#include <linux/mem_encrypt.h>
 #include <asm/pci-direct.h>
 #include <asm/iommu.h>
 #include <asm/gart.h>
@@ -348,7 +349,7 @@ static void iommu_set_device_table(struct amd_iommu *iommu)
 
 	BUG_ON(iommu->mmio_base == NULL);
 
-	entry = virt_to_phys(amd_iommu_dev_table);
+	entry = iommu_virt_to_phys(amd_iommu_dev_table);
 	entry |= (dev_table_size >> 12) - 1;
 	memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
 			&entry, sizeof(entry));
@@ -606,7 +607,7 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu)
 
 	BUG_ON(iommu->cmd_buf == NULL);
 
-	entry = (u64)virt_to_phys(iommu->cmd_buf);
+	entry = iommu_virt_to_phys(iommu->cmd_buf);
 	entry |= MMIO_CMD_SIZE_512;
 
 	memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
@@ -635,7 +636,7 @@ static void iommu_enable_event_buffer(struct amd_iommu *iommu)
 
 	BUG_ON(iommu->evt_buf == NULL);
 
-	entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
+	entry = iommu_virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
 
 	memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
 		    &entry, sizeof(entry));
@@ -668,7 +669,7 @@ static void iommu_enable_ppr_log(struct amd_iommu *iommu)
 	if (iommu->ppr_log == NULL)
 		return;
 
-	entry = (u64)virt_to_phys(iommu->ppr_log) | PPR_LOG_SIZE_512;
+	entry = iommu_virt_to_phys(iommu->ppr_log) | PPR_LOG_SIZE_512;
 
 	memcpy_toio(iommu->mmio_base + MMIO_PPR_LOG_OFFSET,
 		    &entry, sizeof(entry));
@@ -748,10 +749,10 @@ static int iommu_init_ga_log(struct amd_iommu *iommu)
 	if (!iommu->ga_log_tail)
 		goto err_out;
 
-	entry = (u64)virt_to_phys(iommu->ga_log) | GA_LOG_SIZE_512;
+	entry = iommu_virt_to_phys(iommu->ga_log) | GA_LOG_SIZE_512;
 	memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_BASE_OFFSET,
 		    &entry, sizeof(entry));
-	entry = ((u64)virt_to_phys(iommu->ga_log) & 0xFFFFFFFFFFFFFULL) & ~7ULL;
+	entry = (iommu_virt_to_phys(iommu->ga_log) & 0xFFFFFFFFFFFFFULL) & ~7ULL;
 	memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_TAIL_OFFSET,
 		    &entry, sizeof(entry));
 	writel(0x00, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
@@ -2564,6 +2565,24 @@ static int __init amd_iommu_init(void)
 	return ret;
 }
 
+static bool amd_iommu_sme_check(void)
+{
+	if (!sme_active() || (boot_cpu_data.x86 != 0x17))
+		return true;
+
+	/* For Fam17h, a specific level of support is required */
+	if (boot_cpu_data.microcode >= 0x08001205)
+		return true;
+
+	if ((boot_cpu_data.microcode >= 0x08001126) &&
+	    (boot_cpu_data.microcode <= 0x080011ff))
+		return true;
+
+	pr_notice("AMD-Vi: IOMMU not currently supported when SME is active\n");
+
+	return false;
+}
+
 /****************************************************************************
  *
  * Early detect code. This code runs at IOMMU detection time in the DMA
@@ -2578,6 +2597,9 @@ int __init amd_iommu_detect(void)
 	if (no_iommu || (iommu_detected && !gart_iommu_aperture))
 		return -ENODEV;
 
+	if (!amd_iommu_sme_check())
+		return -ENODEV;
+
 	ret = iommu_go_to_state(IOMMU_IVRS_DETECTED);
 	if (ret)
 		return ret;
diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h
index 466260f8a1df..3f12fb2338ea 100644
--- a/drivers/iommu/amd_iommu_proto.h
+++ b/drivers/iommu/amd_iommu_proto.h
@@ -87,4 +87,14 @@ static inline bool iommu_feature(struct amd_iommu *iommu, u64 f)
 	return !!(iommu->features & f);
 }
 
+static inline u64 iommu_virt_to_phys(void *vaddr)
+{
+	return (u64)__sme_set(virt_to_phys(vaddr));
+}
+
+static inline void *iommu_phys_to_virt(unsigned long paddr)
+{
+	return phys_to_virt(__sme_clr(paddr));
+}
+
 #endif /* _ASM_X86_AMD_IOMMU_PROTO_H  */
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index 294a409e283b..8591f43c467c 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -344,7 +344,7 @@
 
 #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
 #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
-#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
+#define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK))
 #define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
 
 #define IOMMU_PROT_MASK 0x03

From 46d010e04a637ca5bbdd0ff72554d9c06f2961c9 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:25 -0500
Subject: [PATCH 039/149] x86/boot/realmode: Check for memory encryption on the
 APs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support to check if memory encryption is active in the kernel and that
it has been enabled on the AP. If memory encryption is active in the kernel
but has not been enabled on the AP, then set the memory encryption bit (bit
23) of MSR_K8_SYSCFG to enable memory encryption on that AP and allow the
AP to continue start up.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/37e29b99c395910f56ca9f8ecf7b0439b28827c8.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/realmode.h      | 12 ++++++++++++
 arch/x86/realmode/init.c             |  4 ++++
 arch/x86/realmode/rm/trampoline_64.S | 24 ++++++++++++++++++++++++
 3 files changed, 40 insertions(+)

diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 230e1903acf0..90d91520c13a 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -1,6 +1,15 @@
 #ifndef _ARCH_X86_REALMODE_H
 #define _ARCH_X86_REALMODE_H
 
+/*
+ * Flag bit definitions for use with the flags field of the trampoline header
+ * in the CONFIG_X86_64 variant.
+ */
+#define TH_FLAGS_SME_ACTIVE_BIT		0
+#define TH_FLAGS_SME_ACTIVE		BIT(TH_FLAGS_SME_ACTIVE_BIT)
+
+#ifndef __ASSEMBLY__
+
 #include <linux/types.h>
 #include <asm/io.h>
 
@@ -38,6 +47,7 @@ struct trampoline_header {
 	u64 start;
 	u64 efer;
 	u32 cr4;
+	u32 flags;
 #endif
 };
 
@@ -69,4 +79,6 @@ static inline size_t real_mode_size_needed(void)
 void set_real_mode_mem(phys_addr_t mem, size_t size);
 void reserve_real_mode(void);
 
+#endif /* __ASSEMBLY__ */
+
 #endif /* _ARCH_X86_REALMODE_H */
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index d6ddc7e60884..1f71980fc5e0 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -108,6 +108,10 @@ static void __init setup_real_mode(void)
 	trampoline_cr4_features = &trampoline_header->cr4;
 	*trampoline_cr4_features = mmu_cr4_features;
 
+	trampoline_header->flags = 0;
+	if (sme_active())
+		trampoline_header->flags |= TH_FLAGS_SME_ACTIVE;
+
 	trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
 	trampoline_pgd[0] = trampoline_pgd_entry.pgd;
 	trampoline_pgd[511] = init_top_pgt[511].pgd;
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index dac7b20d2f9d..614fd7064d0a 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -30,6 +30,7 @@
 #include <asm/msr.h>
 #include <asm/segment.h>
 #include <asm/processor-flags.h>
+#include <asm/realmode.h>
 #include "realmode.h"
 
 	.text
@@ -92,6 +93,28 @@ ENTRY(startup_32)
 	movl	%edx, %fs
 	movl	%edx, %gs
 
+	/*
+	 * Check for memory encryption support. This is a safety net in
+	 * case BIOS hasn't done the necessary step of setting the bit in
+	 * the MSR for this AP. If SME is active and we've gotten this far
+	 * then it is safe for us to set the MSR bit and continue. If we
+	 * don't we'll eventually crash trying to execute encrypted
+	 * instructions.
+	 */
+	bt	$TH_FLAGS_SME_ACTIVE_BIT, pa_tr_flags
+	jnc	.Ldone
+	movl	$MSR_K8_SYSCFG, %ecx
+	rdmsr
+	bts	$MSR_K8_SYSCFG_MEM_ENCRYPT_BIT, %eax
+	jc	.Ldone
+
+	/*
+	 * Memory encryption is enabled but the SME enable bit for this
+	 * CPU has has not been set.  It is safe to set it, so do so.
+	 */
+	wrmsr
+.Ldone:
+
 	movl	pa_tr_cr4, %eax
 	movl	%eax, %cr4		# Enable PAE mode
 
@@ -147,6 +170,7 @@ GLOBAL(trampoline_header)
 	tr_start:		.space	8
 	GLOBAL(tr_efer)		.space	8
 	GLOBAL(tr_cr4)		.space	4
+	GLOBAL(tr_flags)	.space	4
 END(trampoline_header)
 
 #include "trampoline_common.S"

From 95cf9264d5f36c291c1c50c00349f83348e6f9c7 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:26 -0500
Subject: [PATCH 040/149] x86, drm, fbdev: Do not specify encrypted memory for
 video mappings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since video memory needs to be accessed decrypted, be sure that the
memory encryption mask is not set for the video ranges.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/a19436f30424402e01f63a09b32ab103272acced.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/vga.h       | 14 +++++++++++++-
 arch/x86/mm/pageattr.c           |  2 ++
 drivers/gpu/drm/drm_gem.c        |  2 ++
 drivers/gpu/drm/drm_vm.c         |  4 ++++
 drivers/gpu/drm/ttm/ttm_bo_vm.c  |  7 +++++--
 drivers/gpu/drm/udl/udl_fb.c     |  4 ++++
 drivers/video/fbdev/core/fbmem.c | 12 ++++++++++++
 7 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/vga.h b/arch/x86/include/asm/vga.h
index c4b9dc2f67c5..9f42beefc67a 100644
--- a/arch/x86/include/asm/vga.h
+++ b/arch/x86/include/asm/vga.h
@@ -7,12 +7,24 @@
 #ifndef _ASM_X86_VGA_H
 #define _ASM_X86_VGA_H
 
+#include <asm/set_memory.h>
+
 /*
  *	On the PC, we can just recalculate addresses and then
  *	access the videoram directly without any black magic.
+ *	To support memory encryption however, we need to access
+ *	the videoram as decrypted memory.
  */
 
-#define VGA_MAP_MEM(x, s) (unsigned long)phys_to_virt(x)
+#define VGA_MAP_MEM(x, s)					\
+({								\
+	unsigned long start = (unsigned long)phys_to_virt(x);	\
+								\
+	if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))			\
+		set_memory_decrypted(start, (s) >> PAGE_SHIFT);	\
+								\
+	start;							\
+})
 
 #define vga_readb(x) (*(x))
 #define vga_writeb(x, y) (*(y) = (x))
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 9c8ea129ff5c..dfb7d657cf43 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1831,11 +1831,13 @@ int set_memory_encrypted(unsigned long addr, int numpages)
 {
 	return __set_memory_enc_dec(addr, numpages, true);
 }
+EXPORT_SYMBOL_GPL(set_memory_encrypted);
 
 int set_memory_decrypted(unsigned long addr, int numpages)
 {
 	return __set_memory_enc_dec(addr, numpages, false);
 }
+EXPORT_SYMBOL_GPL(set_memory_decrypted);
 
 int set_pages_uc(struct page *page, int numpages)
 {
diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
index 8dc11064253d..7a61a07ac4de 100644
--- a/drivers/gpu/drm/drm_gem.c
+++ b/drivers/gpu/drm/drm_gem.c
@@ -36,6 +36,7 @@
 #include <linux/pagemap.h>
 #include <linux/shmem_fs.h>
 #include <linux/dma-buf.h>
+#include <linux/mem_encrypt.h>
 #include <drm/drmP.h>
 #include <drm/drm_vma_manager.h>
 #include <drm/drm_gem.h>
@@ -928,6 +929,7 @@ int drm_gem_mmap_obj(struct drm_gem_object *obj, unsigned long obj_size,
 	vma->vm_ops = dev->driver->gem_vm_ops;
 	vma->vm_private_data = obj;
 	vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
+	vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
 
 	/* Take a ref for this mapping of the object, so that the fault
 	 * handler can dereference the mmap offset's pointer to the object.
diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c
index 1170b3209a12..ed4bcbfd6086 100644
--- a/drivers/gpu/drm/drm_vm.c
+++ b/drivers/gpu/drm/drm_vm.c
@@ -40,6 +40,7 @@
 #include <linux/efi.h>
 #include <linux/slab.h>
 #endif
+#include <linux/mem_encrypt.h>
 #include <asm/pgtable.h>
 #include "drm_internal.h"
 #include "drm_legacy.h"
@@ -58,6 +59,9 @@ static pgprot_t drm_io_prot(struct drm_local_map *map,
 {
 	pgprot_t tmp = vm_get_page_prot(vma->vm_flags);
 
+	/* We don't want graphics memory to be mapped encrypted */
+	tmp = pgprot_decrypted(tmp);
+
 #if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__)
 	if (map->type == _DRM_REGISTERS && !(map->flags & _DRM_WRITE_COMBINING))
 		tmp = pgprot_noncached(tmp);
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index b442d12f2f7d..84fb009d4eb0 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -39,6 +39,7 @@
 #include <linux/rbtree.h>
 #include <linux/module.h>
 #include <linux/uaccess.h>
+#include <linux/mem_encrypt.h>
 
 #define TTM_BO_VM_NUM_PREFAULT 16
 
@@ -230,9 +231,11 @@ static int ttm_bo_vm_fault(struct vm_fault *vmf)
 	 * first page.
 	 */
 	for (i = 0; i < TTM_BO_VM_NUM_PREFAULT; ++i) {
-		if (bo->mem.bus.is_iomem)
+		if (bo->mem.bus.is_iomem) {
+			/* Iomem should not be marked encrypted */
+			cvma.vm_page_prot = pgprot_decrypted(cvma.vm_page_prot);
 			pfn = bdev->driver->io_mem_pfn(bo, page_offset);
-		else {
+		} else {
 			page = ttm->pages[page_offset];
 			if (unlikely(!page && i == 0)) {
 				retval = VM_FAULT_OOM;
diff --git a/drivers/gpu/drm/udl/udl_fb.c b/drivers/gpu/drm/udl/udl_fb.c
index 4a6500362564..92e1690e28de 100644
--- a/drivers/gpu/drm/udl/udl_fb.c
+++ b/drivers/gpu/drm/udl/udl_fb.c
@@ -14,6 +14,7 @@
 #include <linux/slab.h>
 #include <linux/fb.h>
 #include <linux/dma-buf.h>
+#include <linux/mem_encrypt.h>
 
 #include <drm/drmP.h>
 #include <drm/drm_crtc.h>
@@ -169,6 +170,9 @@ static int udl_fb_mmap(struct fb_info *info, struct vm_area_struct *vma)
 	pr_notice("mmap() framebuffer addr:%lu size:%lu\n",
 		  pos, size);
 
+	/* We don't want the framebuffer to be mapped encrypted */
+	vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
+
 	while (size > 0) {
 		page = vmalloc_to_pfn((void *)pos);
 		if (remap_pfn_range(vma, start, page, PAGE_SIZE, PAGE_SHARED))
diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c
index 7a42238db446..25e862c487f6 100644
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -32,6 +32,7 @@
 #include <linux/device.h>
 #include <linux/efi.h>
 #include <linux/fb.h>
+#include <linux/mem_encrypt.h>
 
 #include <asm/fb.h>
 
@@ -1396,6 +1397,12 @@ fb_mmap(struct file *file, struct vm_area_struct * vma)
 	mutex_lock(&info->mm_lock);
 	if (fb->fb_mmap) {
 		int res;
+
+		/*
+		 * The framebuffer needs to be accessed decrypted, be sure
+		 * SME protection is removed ahead of the call
+		 */
+		vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
 		res = fb->fb_mmap(info, vma);
 		mutex_unlock(&info->mm_lock);
 		return res;
@@ -1421,6 +1428,11 @@ fb_mmap(struct file *file, struct vm_area_struct * vma)
 	mutex_unlock(&info->mm_lock);
 
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+	/*
+	 * The framebuffer needs to be accessed decrypted, be sure
+	 * SME protection is removed
+	 */
+	vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
 	fb_pgprotect(file, vma, start);
 
 	return vm_iomap_memory(vma, start, len);

From d0ec49d4de90806755e17289bd48464a1a515823 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:27 -0500
Subject: [PATCH 041/149] kvm/x86/svm: Support Secure Memory Encryption within
 KVM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Update the KVM support to work with SME. The VMCB has a number of fields
where physical addresses are used and these addresses must contain the
memory encryption mask in order to properly access the encrypted memory.
Also, use the memory encryption mask when creating and using the nested
page tables.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/89146eccfa50334409801ff20acd52a90fb5efcf.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/mmu.c              | 11 +++++++----
 arch/x86/kvm/mmu.h              |  2 +-
 arch/x86/kvm/svm.c              | 35 +++++++++++++++++----------------
 arch/x86/kvm/vmx.c              |  2 +-
 arch/x86/kvm/x86.c              |  3 ++-
 6 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 87ac4fba6d8e..7cbaab523f22 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1078,7 +1078,7 @@ void kvm_mmu_init_vm(struct kvm *kvm);
 void kvm_mmu_uninit_vm(struct kvm *kvm);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
-		u64 acc_track_mask);
+		u64 acc_track_mask, u64 me_mask);
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9b1dd114956a..ccb70b8d16cc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -108,7 +108,7 @@ module_param(dbg, bool, 0644);
 	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
 
 
-#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
+#define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
 #define PT64_DIR_BASE_ADDR_MASK \
 	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
 #define PT64_LVL_ADDR_MASK(level) \
@@ -126,7 +126,7 @@ module_param(dbg, bool, 0644);
 					    * PT32_LEVEL_BITS))) - 1))
 
 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
-			| shadow_x_mask | shadow_nx_mask)
+			| shadow_x_mask | shadow_nx_mask | shadow_me_mask)
 
 #define ACC_EXEC_MASK    1
 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
@@ -186,6 +186,7 @@ static u64 __read_mostly shadow_dirty_mask;
 static u64 __read_mostly shadow_mmio_mask;
 static u64 __read_mostly shadow_mmio_value;
 static u64 __read_mostly shadow_present_mask;
+static u64 __read_mostly shadow_me_mask;
 
 /*
  * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
@@ -349,7 +350,7 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
  */
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
-		u64 acc_track_mask)
+		u64 acc_track_mask, u64 me_mask)
 {
 	BUG_ON(!dirty_mask != !accessed_mask);
 	BUG_ON(!accessed_mask && !acc_track_mask);
@@ -362,6 +363,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 	shadow_x_mask = x_mask;
 	shadow_present_mask = p_mask;
 	shadow_acc_track_mask = acc_track_mask;
+	shadow_me_mask = me_mask;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
@@ -2433,7 +2435,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
 	BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
 
 	spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
-	       shadow_user_mask | shadow_x_mask;
+	       shadow_user_mask | shadow_x_mask | shadow_me_mask;
 
 	if (sp_ad_disabled(sp))
 		spte |= shadow_acc_track_value;
@@ -2745,6 +2747,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		pte_access &= ~ACC_WRITE_MASK;
 
 	spte |= (u64)pfn << PAGE_SHIFT;
+	spte |= shadow_me_mask;
 
 	if (pte_access & ACC_WRITE_MASK) {
 
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index d7d248a000dd..3cc725590ab9 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -48,7 +48,7 @@
 
 static inline u64 rsvd_bits(int s, int e)
 {
-	return ((1ULL << (e - s + 1)) - 1) << s;
+	return __sme_clr(((1ULL << (e - s + 1)) - 1) << s);
 }
 
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 4d8141e533c3..6af04dd5426c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1167,9 +1167,9 @@ static void avic_init_vmcb(struct vcpu_svm *svm)
 {
 	struct vmcb *vmcb = svm->vmcb;
 	struct kvm_arch *vm_data = &svm->vcpu.kvm->arch;
-	phys_addr_t bpa = page_to_phys(svm->avic_backing_page);
-	phys_addr_t lpa = page_to_phys(vm_data->avic_logical_id_table_page);
-	phys_addr_t ppa = page_to_phys(vm_data->avic_physical_id_table_page);
+	phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
+	phys_addr_t lpa = __sme_set(page_to_phys(vm_data->avic_logical_id_table_page));
+	phys_addr_t ppa = __sme_set(page_to_phys(vm_data->avic_physical_id_table_page));
 
 	vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
 	vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
@@ -1232,8 +1232,8 @@ static void init_vmcb(struct vcpu_svm *svm)
 		set_intercept(svm, INTERCEPT_MWAIT);
 	}
 
-	control->iopm_base_pa = iopm_base;
-	control->msrpm_base_pa = __pa(svm->msrpm);
+	control->iopm_base_pa = __sme_set(iopm_base);
+	control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
 	control->int_ctl = V_INTR_MASKING_MASK;
 
 	init_seg(&save->es);
@@ -1377,9 +1377,9 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
 		return -EINVAL;
 
 	new_entry = READ_ONCE(*entry);
-	new_entry = (page_to_phys(svm->avic_backing_page) &
-		     AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
-		     AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
+	new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
+			      AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
+			      AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
 	WRITE_ONCE(*entry, new_entry);
 
 	svm->avic_physical_id_cache = entry;
@@ -1647,7 +1647,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 
 	svm->vmcb = page_address(page);
 	clear_page(svm->vmcb);
-	svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
+	svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
 	svm->asid_generation = 0;
 	init_vmcb(svm);
 
@@ -1675,7 +1675,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	__free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
+	__free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
 	__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
 	__free_page(virt_to_page(svm->nested.hsave));
 	__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
@@ -2335,7 +2335,7 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
 	u64 pdpte;
 	int ret;
 
-	ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
+	ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte,
 				       offset_in_page(cr3) + index * 8, 8);
 	if (ret)
 		return 0;
@@ -2347,7 +2347,7 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	svm->vmcb->control.nested_cr3 = root;
+	svm->vmcb->control.nested_cr3 = __sme_set(root);
 	mark_dirty(svm->vmcb, VMCB_NPT);
 	svm_flush_tlb(vcpu);
 }
@@ -2868,7 +2868,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 		svm->nested.msrpm[p] = svm->msrpm[p] | value;
 	}
 
-	svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
+	svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
 
 	return true;
 }
@@ -4501,7 +4501,7 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
 	pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
 		 irq.vector);
 	*svm = to_svm(vcpu);
-	vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page);
+	vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
 	vcpu_info->vector = irq.vector;
 
 	return 0;
@@ -4552,7 +4552,8 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
 			struct amd_iommu_pi_data pi;
 
 			/* Try to enable guest_mode in IRTE */
-			pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK;
+			pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
+					    AVIC_HPA_MASK);
 			pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id,
 						     svm->vcpu.vcpu_id);
 			pi.is_guest_mode = true;
@@ -5001,7 +5002,7 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	svm->vmcb->save.cr3 = root;
+	svm->vmcb->save.cr3 = __sme_set(root);
 	mark_dirty(svm->vmcb, VMCB_CR);
 	svm_flush_tlb(vcpu);
 }
@@ -5010,7 +5011,7 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	svm->vmcb->control.nested_cr3 = root;
+	svm->vmcb->control.nested_cr3 = __sme_set(root);
 	mark_dirty(svm->vmcb, VMCB_NPT);
 
 	/* Also sync guest cr3 here in case we live migrate */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 84e62acf2dd8..ffd469ecad57 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6492,7 +6492,7 @@ void vmx_enable_tdp(void)
 		enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
 		0ull, VMX_EPT_EXECUTABLE_MASK,
 		cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
-		VMX_EPT_RWX_MASK);
+		VMX_EPT_RWX_MASK, 0ull);
 
 	ept_set_mmio_spte_mask();
 	kvm_enable_tdp();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5b8f07889f6a..88be1aabc8d7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -54,6 +54,7 @@
 #include <linux/kvm_irqfd.h>
 #include <linux/irqbypass.h>
 #include <linux/sched/stat.h>
+#include <linux/mem_encrypt.h>
 
 #include <trace/events/kvm.h>
 
@@ -6113,7 +6114,7 @@ int kvm_arch_init(void *opaque)
 
 	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
 			PT_DIRTY_MASK, PT64_NX_MASK, 0,
-			PT_PRESENT_MASK, 0);
+			PT_PRESENT_MASK, 0, sme_me_mask);
 	kvm_timer_init();
 
 	perf_register_guest_info_callbacks(&kvm_guest_cbs);

From bba4ed011a52d494aa7ef5e08cf226709bbf3f60 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:28 -0500
Subject: [PATCH 042/149] x86/mm, kexec: Allow kexec to be used with SME
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Provide support so that kexec can be used to boot a kernel when SME is
enabled.

Support is needed to allocate pages for kexec without encryption.  This
is needed in order to be able to reboot in the kernel in the same manner
as originally booted.

Additionally, when shutting down all of the CPUs we need to be sure to
flush the caches and then halt. This is needed when booting from a state
where SME was not active into a state where SME is active (or vice-versa).
Without these steps, it is possible for cache lines to exist for the same
physical location but tagged both with and without the encryption bit. This
can cause random memory corruption when caches are flushed depending on
which cacheline is written last.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: <kexec@lists.infradead.org>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/b95ff075db3e7cd545313f2fb609a49619a09625.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/init.h          |  1 +
 arch/x86/include/asm/kexec.h         |  8 ++++++++
 arch/x86/include/asm/pgtable_types.h |  1 +
 arch/x86/kernel/machine_kexec_64.c   | 22 +++++++++++++++++++++-
 arch/x86/kernel/process.c            | 17 +++++++++++++++--
 arch/x86/mm/ident_map.c              | 12 ++++++++----
 include/linux/kexec.h                |  8 ++++++++
 kernel/kexec_core.c                  | 12 +++++++++++-
 8 files changed, 73 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 474eb8c66fee..05c4aa00cc86 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -7,6 +7,7 @@ struct x86_mapping_info {
 	unsigned long page_flag;	 /* page flag for PMD or PUD entry */
 	unsigned long offset;		 /* ident mapping offset */
 	bool direct_gbpages;		 /* PUD level 1GB page support */
+	unsigned long kernpg_flag;	 /* kernel pagetable flag override */
 };
 
 int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 70ef205489f0..e8183acf931f 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -207,6 +207,14 @@ struct kexec_entry64_regs {
 	uint64_t r15;
 	uint64_t rip;
 };
+
+extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages,
+				       gfp_t gfp);
+#define arch_kexec_post_alloc_pages arch_kexec_post_alloc_pages
+
+extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages);
+#define arch_kexec_pre_free_pages arch_kexec_pre_free_pages
+
 #endif
 
 typedef void crash_vmclear_fn(void);
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 32095af0fefb..830992fc5a06 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -213,6 +213,7 @@ enum page_cache_mode {
 #define PAGE_KERNEL		__pgprot(__PAGE_KERNEL | _PAGE_ENC)
 #define PAGE_KERNEL_RO		__pgprot(__PAGE_KERNEL_RO | _PAGE_ENC)
 #define PAGE_KERNEL_EXEC	__pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC)
+#define PAGE_KERNEL_EXEC_NOENC	__pgprot(__PAGE_KERNEL_EXEC)
 #define PAGE_KERNEL_RX		__pgprot(__PAGE_KERNEL_RX | _PAGE_ENC)
 #define PAGE_KERNEL_NOCACHE	__pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC)
 #define PAGE_KERNEL_LARGE	__pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index cb0a30473c23..9cf8daacc046 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -87,7 +87,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
 		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
 	}
 	pte = pte_offset_kernel(pmd, vaddr);
-	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
+	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC));
 	return 0;
 err:
 	free_transition_pgtable(image);
@@ -115,6 +115,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
 		.alloc_pgt_page	= alloc_pgt_page,
 		.context	= image,
 		.page_flag	= __PAGE_KERNEL_LARGE_EXEC,
+		.kernpg_flag	= _KERNPG_TABLE_NOENC,
 	};
 	unsigned long mstart, mend;
 	pgd_t *level4p;
@@ -602,3 +603,22 @@ void arch_kexec_unprotect_crashkres(void)
 {
 	kexec_mark_crashkres(false);
 }
+
+int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
+{
+	/*
+	 * If SME is active we need to be sure that kexec pages are
+	 * not encrypted because when we boot to the new kernel the
+	 * pages won't be accessed encrypted (initially).
+	 */
+	return set_memory_decrypted((unsigned long)vaddr, pages);
+}
+
+void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
+{
+	/*
+	 * If SME is active we need to reset the pages back to being
+	 * an encrypted mapping before freeing them.
+	 */
+	set_memory_encrypted((unsigned long)vaddr, pages);
+}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3ca198080ea9..bd6b85fac666 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -355,6 +355,7 @@ bool xen_set_default_idle(void)
 	return ret;
 }
 #endif
+
 void stop_this_cpu(void *dummy)
 {
 	local_irq_disable();
@@ -365,8 +366,20 @@ void stop_this_cpu(void *dummy)
 	disable_local_APIC();
 	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
 
-	for (;;)
-		halt();
+	for (;;) {
+		/*
+		 * Use wbinvd followed by hlt to stop the processor. This
+		 * provides support for kexec on a processor that supports
+		 * SME. With kexec, going from SME inactive to SME active
+		 * requires clearing cache entries so that addresses without
+		 * the encryption bit set don't corrupt the same physical
+		 * address that has the encryption bit set when caches are
+		 * flushed. To achieve this a wbinvd is performed followed by
+		 * a hlt. Even if the processor is not in the kexec/SME
+		 * scenario this only adds a wbinvd to a halting processor.
+		 */
+		asm volatile("wbinvd; hlt" : : : "memory");
+	}
 }
 
 /*
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index adab1595f4bd..31cea988fa36 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -51,7 +51,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
 		if (!pmd)
 			return -ENOMEM;
 		ident_pmd_init(info, pmd, addr, next);
-		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+		set_pud(pud, __pud(__pa(pmd) | info->kernpg_flag));
 	}
 
 	return 0;
@@ -79,7 +79,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
 		if (!pud)
 			return -ENOMEM;
 		ident_pud_init(info, pud, addr, next);
-		set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
+		set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag));
 	}
 
 	return 0;
@@ -93,6 +93,10 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
 	unsigned long next;
 	int result;
 
+	/* Set the default pagetable flags if not supplied */
+	if (!info->kernpg_flag)
+		info->kernpg_flag = _KERNPG_TABLE;
+
 	for (; addr < end; addr = next) {
 		pgd_t *pgd = pgd_page + pgd_index(addr);
 		p4d_t *p4d;
@@ -116,14 +120,14 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
 		if (result)
 			return result;
 		if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
-			set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
+			set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag));
 		} else {
 			/*
 			 * With p4d folded, pgd is equal to p4d.
 			 * The pgd entry has to point to the pud page table in this case.
 			 */
 			pud_t *pud = pud_offset(p4d, 0);
-			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+			set_pgd(pgd, __pgd(__pa(pud) | info->kernpg_flag));
 		}
 	}
 
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index dd056fab9e35..2b7590f5483a 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -327,6 +327,14 @@ static inline void *boot_phys_to_virt(unsigned long entry)
 	return phys_to_virt(boot_phys_to_phys(entry));
 }
 
+#ifndef arch_kexec_post_alloc_pages
+static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) { return 0; }
+#endif
+
+#ifndef arch_kexec_pre_free_pages
+static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { }
+#endif
+
 #else /* !CONFIG_KEXEC_CORE */
 struct pt_regs;
 struct task_struct;
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 1ae7c41c33c1..20fef1a38602 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -301,7 +301,7 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
 {
 	struct page *pages;
 
-	pages = alloc_pages(gfp_mask, order);
+	pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order);
 	if (pages) {
 		unsigned int count, i;
 
@@ -310,6 +310,13 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
 		count = 1 << order;
 		for (i = 0; i < count; i++)
 			SetPageReserved(pages + i);
+
+		arch_kexec_post_alloc_pages(page_address(pages), count,
+					    gfp_mask);
+
+		if (gfp_mask & __GFP_ZERO)
+			for (i = 0; i < count; i++)
+				clear_highpage(pages + i);
 	}
 
 	return pages;
@@ -321,6 +328,9 @@ static void kimage_free_pages(struct page *page)
 
 	order = page_private(page);
 	count = 1 << order;
+
+	arch_kexec_pre_free_pages(page_address(page), count);
+
 	for (i = 0; i < count; i++)
 		ClearPageReserved(page + i);
 	__free_pages(page, order);

From f2f931c6819467af5260a21c59fb787ce2863f92 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:29 -0500
Subject: [PATCH 043/149] xen/x86: Remove SME feature in PV guests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Xen does not currently support SME for PV guests. Clear the SME CPU
capability in order to avoid any ambiguity.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Juergen Gross <jgross@suse.com>
Cc: <xen-devel@lists.xen.org>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/3b605622a9fae5e588e5a13967120a18ec18071b.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/xen/enlighten_pv.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 290bc5ac9852..df1921751aa5 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -263,6 +263,7 @@ static void __init xen_init_capabilities(void)
 	setup_clear_cpu_cap(X86_FEATURE_MTRR);
 	setup_clear_cpu_cap(X86_FEATURE_ACC);
 	setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+	setup_clear_cpu_cap(X86_FEATURE_SME);
 
 	/*
 	 * Xen PV would need some work to support PCID: CR3 handling as well

From 8458bf94b0399cd1bca6c437366bcafb29c230c5 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:30 -0500
Subject: [PATCH 044/149] x86/mm: Use proper encryption attributes with
 /dev/mem
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When accessing memory using /dev/mem (or /dev/kmem) use the proper
encryption attributes when mapping the memory.

To insure the proper attributes are applied when reading or writing
/dev/mem, update the xlate_dev_mem_ptr() function to use memremap()
which will essentially perform the same steps of applying __va for
RAM or using ioremap() if not RAM.

To insure the proper attributes are applied when mmapping /dev/mem,
update the phys_mem_access_prot() to call phys_mem_access_encrypted(),
a new function which will check if the memory should be mapped encrypted
or not. If it is not to be mapped encrypted then the VMA protection
value is updated to remove the encryption bit.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/c917f403ab9f61cbfd455ad6425ed8429a5e7b54.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/io.h |  3 +++
 arch/x86/mm/ioremap.c     | 18 +++++++++---------
 arch/x86/mm/pat.c         |  3 +++
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 09c5557b1454..e080a39b2108 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -386,4 +386,7 @@ extern bool arch_memremap_can_ram_remap(resource_size_t offset,
 					unsigned long flags);
 #define arch_memremap_can_ram_remap arch_memremap_can_ram_remap
 
+extern bool phys_mem_access_encrypted(unsigned long phys_addr,
+				      unsigned long size);
+
 #endif /* _ASM_X86_IO_H */
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 704fc081c104..34f0e1847dd6 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -400,12 +400,10 @@ void *xlate_dev_mem_ptr(phys_addr_t phys)
 	unsigned long offset = phys & ~PAGE_MASK;
 	void *vaddr;
 
-	/* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
-	if (page_is_ram(start >> PAGE_SHIFT))
-		return __va(phys);
+	/* memremap() maps if RAM, otherwise falls back to ioremap() */
+	vaddr = memremap(start, PAGE_SIZE, MEMREMAP_WB);
 
-	vaddr = ioremap_cache(start, PAGE_SIZE);
-	/* Only add the offset on success and return NULL if the ioremap() failed: */
+	/* Only add the offset on success and return NULL if memremap() failed */
 	if (vaddr)
 		vaddr += offset;
 
@@ -414,10 +412,7 @@ void *xlate_dev_mem_ptr(phys_addr_t phys)
 
 void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
 {
-	if (page_is_ram(phys >> PAGE_SHIFT))
-		return;
-
-	iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
+	memunmap((void *)((unsigned long)addr & PAGE_MASK));
 }
 
 /*
@@ -626,6 +621,11 @@ pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
 	return prot;
 }
 
+bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size)
+{
+	return arch_memremap_can_ram_remap(phys_addr, size, 0);
+}
+
 #ifdef CONFIG_ARCH_USE_MEMREMAP_PROT
 /* Remap memory with encryption */
 void __init *early_memremap_encrypted(resource_size_t phys_addr,
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 88990ab87961..fe7d57a8fb60 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -744,6 +744,9 @@ EXPORT_SYMBOL(arch_io_free_memtype_wc);
 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 				unsigned long size, pgprot_t vma_prot)
 {
+	if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size))
+		vma_prot = pgprot_decrypted(vma_prot);
+
 	return vma_prot;
 }
 

From db516997a985b461f021d594e78155bbc7fc3e7e Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:31 -0500
Subject: [PATCH 045/149] x86/mm: Create native_make_p4d() for PGTABLE_LEVELS
 <= 4
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, native_make_p4d() is only defined when CONFIG_PGTABLE_LEVELS
is greater than 4. Create a macro that will allow for defining and using
native_make_p4d() when CONFIG_PGTABLES_LEVELS is not greater than 4.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/b645e14f9e73731023694494860ceab73feff777.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/pgtable_types.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 830992fc5a06..6c55973ebab8 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -309,6 +309,11 @@ static inline p4dval_t native_p4d_val(p4d_t p4d)
 #else
 #include <asm-generic/pgtable-nop4d.h>
 
+static inline p4d_t native_make_p4d(pudval_t val)
+{
+	return (p4d_t) { .pgd = native_make_pgd((pgdval_t)val) };
+}
+
 static inline p4dval_t native_p4d_val(p4d_t p4d)
 {
 	return native_pgd_val(p4d.pgd);

From 6ebcb060713f614c92216482eed501b31cee74ec Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:32 -0500
Subject: [PATCH 046/149] x86/mm: Add support to encrypt the kernel in-place
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the support to encrypt the kernel in-place. This is done by creating
new page mappings for the kernel - a decrypted write-protected mapping
and an encrypted mapping. The kernel is encrypted by copying it through
a temporary buffer.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/c039bf9412ef95e1e6bf4fdf8facab95e00c717b.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mem_encrypt.h |   6 +
 arch/x86/mm/Makefile               |   1 +
 arch/x86/mm/mem_encrypt.c          | 310 +++++++++++++++++++++++++++++
 arch/x86/mm/mem_encrypt_boot.S     | 149 ++++++++++++++
 4 files changed, 466 insertions(+)
 create mode 100644 arch/x86/mm/mem_encrypt_boot.S

diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 70e55f6b9adf..7122c365c2a3 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -21,6 +21,12 @@
 
 extern unsigned long sme_me_mask;
 
+void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
+			 unsigned long decrypted_kernel_vaddr,
+			 unsigned long kernel_len,
+			 unsigned long encryption_wa,
+			 unsigned long encryption_pgd);
+
 void __init sme_early_encrypt(resource_size_t paddr,
 			      unsigned long size);
 void __init sme_early_decrypt(resource_size_t paddr,
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index a94a7b663d5f..72bf8c01c6e3 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -40,3 +40,4 @@ obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
 obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
 
 obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt.o
+obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_boot.o
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index a7400ec8538b..e5d543938b0f 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -21,6 +21,8 @@
 #include <asm/setup.h>
 #include <asm/bootparam.h>
 #include <asm/set_memory.h>
+#include <asm/cacheflush.h>
+#include <asm/sections.h>
 
 /*
  * Since SME related variables are set early in the boot process they must
@@ -199,8 +201,316 @@ void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
 	set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT);
 }
 
+static void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start,
+				 unsigned long end)
+{
+	unsigned long pgd_start, pgd_end, pgd_size;
+	pgd_t *pgd_p;
+
+	pgd_start = start & PGDIR_MASK;
+	pgd_end = end & PGDIR_MASK;
+
+	pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1);
+	pgd_size *= sizeof(pgd_t);
+
+	pgd_p = pgd_base + pgd_index(start);
+
+	memset(pgd_p, 0, pgd_size);
+}
+
+#define PGD_FLAGS	_KERNPG_TABLE_NOENC
+#define P4D_FLAGS	_KERNPG_TABLE_NOENC
+#define PUD_FLAGS	_KERNPG_TABLE_NOENC
+#define PMD_FLAGS	(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
+
+static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area,
+				     unsigned long vaddr, pmdval_t pmd_val)
+{
+	pgd_t *pgd_p;
+	p4d_t *p4d_p;
+	pud_t *pud_p;
+	pmd_t *pmd_p;
+
+	pgd_p = pgd_base + pgd_index(vaddr);
+	if (native_pgd_val(*pgd_p)) {
+		if (IS_ENABLED(CONFIG_X86_5LEVEL))
+			p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
+		else
+			pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK);
+	} else {
+		pgd_t pgd;
+
+		if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+			p4d_p = pgtable_area;
+			memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
+			pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D;
+
+			pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS);
+		} else {
+			pud_p = pgtable_area;
+			memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
+			pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
+
+			pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS);
+		}
+		native_set_pgd(pgd_p, pgd);
+	}
+
+	if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+		p4d_p += p4d_index(vaddr);
+		if (native_p4d_val(*p4d_p)) {
+			pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK);
+		} else {
+			p4d_t p4d;
+
+			pud_p = pgtable_area;
+			memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
+			pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD;
+
+			p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS);
+			native_set_p4d(p4d_p, p4d);
+		}
+	}
+
+	pud_p += pud_index(vaddr);
+	if (native_pud_val(*pud_p)) {
+		if (native_pud_val(*pud_p) & _PAGE_PSE)
+			goto out;
+
+		pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK);
+	} else {
+		pud_t pud;
+
+		pmd_p = pgtable_area;
+		memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD);
+		pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD;
+
+		pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS);
+		native_set_pud(pud_p, pud);
+	}
+
+	pmd_p += pmd_index(vaddr);
+	if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE))
+		native_set_pmd(pmd_p, native_make_pmd(pmd_val));
+
+out:
+	return pgtable_area;
+}
+
+static unsigned long __init sme_pgtable_calc(unsigned long len)
+{
+	unsigned long p4d_size, pud_size, pmd_size;
+	unsigned long total;
+
+	/*
+	 * Perform a relatively simplistic calculation of the pagetable
+	 * entries that are needed. That mappings will be covered by 2MB
+	 * PMD entries so we can conservatively calculate the required
+	 * number of P4D, PUD and PMD structures needed to perform the
+	 * mappings. Incrementing the count for each covers the case where
+	 * the addresses cross entries.
+	 */
+	if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+		p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
+		p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
+		pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1;
+		pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
+	} else {
+		p4d_size = 0;
+		pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1;
+		pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
+	}
+	pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1;
+	pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
+
+	total = p4d_size + pud_size + pmd_size;
+
+	/*
+	 * Now calculate the added pagetable structures needed to populate
+	 * the new pagetables.
+	 */
+	if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+		p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
+		p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D;
+		pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE;
+		pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
+	} else {
+		p4d_size = 0;
+		pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE;
+		pud_size *= sizeof(pud_t) * PTRS_PER_PUD;
+	}
+	pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE;
+	pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD;
+
+	total += p4d_size + pud_size + pmd_size;
+
+	return total;
+}
+
 void __init sme_encrypt_kernel(void)
 {
+	unsigned long workarea_start, workarea_end, workarea_len;
+	unsigned long execute_start, execute_end, execute_len;
+	unsigned long kernel_start, kernel_end, kernel_len;
+	unsigned long pgtable_area_len;
+	unsigned long paddr, pmd_flags;
+	unsigned long decrypted_base;
+	void *pgtable_area;
+	pgd_t *pgd;
+
+	if (!sme_active())
+		return;
+
+	/*
+	 * Prepare for encrypting the kernel by building new pagetables with
+	 * the necessary attributes needed to encrypt the kernel in place.
+	 *
+	 *   One range of virtual addresses will map the memory occupied
+	 *   by the kernel as encrypted.
+	 *
+	 *   Another range of virtual addresses will map the memory occupied
+	 *   by the kernel as decrypted and write-protected.
+	 *
+	 *     The use of write-protect attribute will prevent any of the
+	 *     memory from being cached.
+	 */
+
+	/* Physical addresses gives us the identity mapped virtual addresses */
+	kernel_start = __pa_symbol(_text);
+	kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
+	kernel_len = kernel_end - kernel_start;
+
+	/* Set the encryption workarea to be immediately after the kernel */
+	workarea_start = kernel_end;
+
+	/*
+	 * Calculate required number of workarea bytes needed:
+	 *   executable encryption area size:
+	 *     stack page (PAGE_SIZE)
+	 *     encryption routine page (PAGE_SIZE)
+	 *     intermediate copy buffer (PMD_PAGE_SIZE)
+	 *   pagetable structures for the encryption of the kernel
+	 *   pagetable structures for workarea (in case not currently mapped)
+	 */
+	execute_start = workarea_start;
+	execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE;
+	execute_len = execute_end - execute_start;
+
+	/*
+	 * One PGD for both encrypted and decrypted mappings and a set of
+	 * PUDs and PMDs for each of the encrypted and decrypted mappings.
+	 */
+	pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD;
+	pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2;
+
+	/* PUDs and PMDs needed in the current pagetables for the workarea */
+	pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len);
+
+	/*
+	 * The total workarea includes the executable encryption area and
+	 * the pagetable area.
+	 */
+	workarea_len = execute_len + pgtable_area_len;
+	workarea_end = workarea_start + workarea_len;
+
+	/*
+	 * Set the address to the start of where newly created pagetable
+	 * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable
+	 * structures are created when the workarea is added to the current
+	 * pagetables and when the new encrypted and decrypted kernel
+	 * mappings are populated.
+	 */
+	pgtable_area = (void *)execute_end;
+
+	/*
+	 * Make sure the current pagetable structure has entries for
+	 * addressing the workarea.
+	 */
+	pgd = (pgd_t *)native_read_cr3_pa();
+	paddr = workarea_start;
+	while (paddr < workarea_end) {
+		pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+						paddr,
+						paddr + PMD_FLAGS);
+
+		paddr += PMD_PAGE_SIZE;
+	}
+
+	/* Flush the TLB - no globals so cr3 is enough */
+	native_write_cr3(__native_read_cr3());
+
+	/*
+	 * A new pagetable structure is being built to allow for the kernel
+	 * to be encrypted. It starts with an empty PGD that will then be
+	 * populated with new PUDs and PMDs as the encrypted and decrypted
+	 * kernel mappings are created.
+	 */
+	pgd = pgtable_area;
+	memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD);
+	pgtable_area += sizeof(*pgd) * PTRS_PER_PGD;
+
+	/* Add encrypted kernel (identity) mappings */
+	pmd_flags = PMD_FLAGS | _PAGE_ENC;
+	paddr = kernel_start;
+	while (paddr < kernel_end) {
+		pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+						paddr,
+						paddr + pmd_flags);
+
+		paddr += PMD_PAGE_SIZE;
+	}
+
+	/*
+	 * A different PGD index/entry must be used to get different
+	 * pagetable entries for the decrypted mapping. Choose the next
+	 * PGD index and convert it to a virtual address to be used as
+	 * the base of the mapping.
+	 */
+	decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1);
+	decrypted_base <<= PGDIR_SHIFT;
+
+	/* Add decrypted, write-protected kernel (non-identity) mappings */
+	pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT);
+	paddr = kernel_start;
+	while (paddr < kernel_end) {
+		pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+						paddr + decrypted_base,
+						paddr + pmd_flags);
+
+		paddr += PMD_PAGE_SIZE;
+	}
+
+	/* Add decrypted workarea mappings to both kernel mappings */
+	paddr = workarea_start;
+	while (paddr < workarea_end) {
+		pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+						paddr,
+						paddr + PMD_FLAGS);
+
+		pgtable_area = sme_populate_pgd(pgd, pgtable_area,
+						paddr + decrypted_base,
+						paddr + PMD_FLAGS);
+
+		paddr += PMD_PAGE_SIZE;
+	}
+
+	/* Perform the encryption */
+	sme_encrypt_execute(kernel_start, kernel_start + decrypted_base,
+			    kernel_len, workarea_start, (unsigned long)pgd);
+
+	/*
+	 * At this point we are running encrypted.  Remove the mappings for
+	 * the decrypted areas - all that is needed for this is to remove
+	 * the PGD entry/entries.
+	 */
+	sme_clear_pgd(pgd, kernel_start + decrypted_base,
+		      kernel_end + decrypted_base);
+
+	sme_clear_pgd(pgd, workarea_start + decrypted_base,
+		      workarea_end + decrypted_base);
+
+	/* Flush the TLB - no globals so cr3 is enough */
+	native_write_cr3(__native_read_cr3());
 }
 
 void __init sme_enable(void)
diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
new file mode 100644
index 000000000000..b327e0472448
--- /dev/null
+++ b/arch/x86/mm/mem_encrypt_boot.S
@@ -0,0 +1,149 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/pgtable.h>
+#include <asm/page.h>
+#include <asm/processor-flags.h>
+#include <asm/msr-index.h>
+#include <asm/frame.h>
+
+	.text
+	.code64
+ENTRY(sme_encrypt_execute)
+
+	/*
+	 * Entry parameters:
+	 *   RDI - virtual address for the encrypted kernel mapping
+	 *   RSI - virtual address for the decrypted kernel mapping
+	 *   RDX - length of kernel
+	 *   RCX - virtual address of the encryption workarea, including:
+	 *     - stack page (PAGE_SIZE)
+	 *     - encryption routine page (PAGE_SIZE)
+	 *     - intermediate copy buffer (PMD_PAGE_SIZE)
+	 *    R8 - physcial address of the pagetables to use for encryption
+	 */
+
+	FRAME_BEGIN			/* RBP now has original stack pointer */
+
+	/* Set up a one page stack in the non-encrypted memory area */
+	movq	%rcx, %rax		/* Workarea stack page */
+	leaq	PAGE_SIZE(%rax), %rsp	/* Set new stack pointer */
+	addq	$PAGE_SIZE, %rax	/* Workarea encryption routine */
+
+	push	%r12
+	movq	%rdi, %r10		/* Encrypted kernel */
+	movq	%rsi, %r11		/* Decrypted kernel */
+	movq	%rdx, %r12		/* Kernel length */
+
+	/* Copy encryption routine into the workarea */
+	movq	%rax, %rdi				/* Workarea encryption routine */
+	leaq	__enc_copy(%rip), %rsi			/* Encryption routine */
+	movq	$(.L__enc_copy_end - __enc_copy), %rcx	/* Encryption routine length */
+	rep	movsb
+
+	/* Setup registers for call */
+	movq	%r10, %rdi		/* Encrypted kernel */
+	movq	%r11, %rsi		/* Decrypted kernel */
+	movq	%r8, %rdx		/* Pagetables used for encryption */
+	movq	%r12, %rcx		/* Kernel length */
+	movq	%rax, %r8		/* Workarea encryption routine */
+	addq	$PAGE_SIZE, %r8		/* Workarea intermediate copy buffer */
+
+	call	*%rax			/* Call the encryption routine */
+
+	pop	%r12
+
+	movq	%rbp, %rsp		/* Restore original stack pointer */
+	FRAME_END
+
+	ret
+ENDPROC(sme_encrypt_execute)
+
+ENTRY(__enc_copy)
+/*
+ * Routine used to encrypt kernel.
+ *   This routine must be run outside of the kernel proper since
+ *   the kernel will be encrypted during the process. So this
+ *   routine is defined here and then copied to an area outside
+ *   of the kernel where it will remain and run decrypted
+ *   during execution.
+ *
+ *   On entry the registers must be:
+ *     RDI - virtual address for the encrypted kernel mapping
+ *     RSI - virtual address for the decrypted kernel mapping
+ *     RDX - address of the pagetables to use for encryption
+ *     RCX - length of kernel
+ *      R8 - intermediate copy buffer
+ *
+ *     RAX - points to this routine
+ *
+ * The kernel will be encrypted by copying from the non-encrypted
+ * kernel space to an intermediate buffer and then copying from the
+ * intermediate buffer back to the encrypted kernel space. The physical
+ * addresses of the two kernel space mappings are the same which
+ * results in the kernel being encrypted "in place".
+ */
+	/* Enable the new page tables */
+	mov	%rdx, %cr3
+
+	/* Flush any global TLBs */
+	mov	%cr4, %rdx
+	andq	$~X86_CR4_PGE, %rdx
+	mov	%rdx, %cr4
+	orq	$X86_CR4_PGE, %rdx
+	mov	%rdx, %cr4
+
+	/* Set the PAT register PA5 entry to write-protect */
+	push	%rcx
+	movl	$MSR_IA32_CR_PAT, %ecx
+	rdmsr
+	push	%rdx			/* Save original PAT value */
+	andl	$0xffff00ff, %edx	/* Clear PA5 */
+	orl	$0x00000500, %edx	/* Set PA5 to WP */
+	wrmsr
+	pop	%rdx			/* RDX contains original PAT value */
+	pop	%rcx
+
+	movq	%rcx, %r9		/* Save kernel length */
+	movq	%rdi, %r10		/* Save encrypted kernel address */
+	movq	%rsi, %r11		/* Save decrypted kernel address */
+
+	wbinvd				/* Invalidate any cache entries */
+
+	/* Copy/encrypt 2MB at a time */
+1:
+	movq	%r11, %rsi		/* Source - decrypted kernel */
+	movq	%r8, %rdi		/* Dest   - intermediate copy buffer */
+	movq	$PMD_PAGE_SIZE, %rcx	/* 2MB length */
+	rep	movsb
+
+	movq	%r8, %rsi		/* Source - intermediate copy buffer */
+	movq	%r10, %rdi		/* Dest   - encrypted kernel */
+	movq	$PMD_PAGE_SIZE, %rcx	/* 2MB length */
+	rep	movsb
+
+	addq	$PMD_PAGE_SIZE, %r11
+	addq	$PMD_PAGE_SIZE, %r10
+	subq	$PMD_PAGE_SIZE, %r9	/* Kernel length decrement */
+	jnz	1b			/* Kernel length not zero? */
+
+	/* Restore PAT register */
+	push	%rdx			/* Save original PAT value */
+	movl	$MSR_IA32_CR_PAT, %ecx
+	rdmsr
+	pop	%rdx			/* Restore original PAT value */
+	wrmsr
+
+	ret
+.L__enc_copy_end:
+ENDPROC(__enc_copy)

From e505371dd83963caae1a37ead9524e8d997341be Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:33 -0500
Subject: [PATCH 047/149] x86/boot: Add early cmdline parsing for options with
 arguments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a cmdline_find_option() function to look for cmdline options that
take arguments. The argument is returned in a supplied buffer and the
argument length (regardless of whether it fits in the supplied buffer)
is returned, with -1 indicating not found.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/36b5f97492a9745dce27682305f990fc20e5cf8a.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/cmdline.h |   2 +
 arch/x86/lib/cmdline.c         | 105 +++++++++++++++++++++++++++++++++
 2 files changed, 107 insertions(+)

diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
index e01f7f7ccb0c..84ae170bc3d0 100644
--- a/arch/x86/include/asm/cmdline.h
+++ b/arch/x86/include/asm/cmdline.h
@@ -2,5 +2,7 @@
 #define _ASM_X86_CMDLINE_H
 
 int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
+int cmdline_find_option(const char *cmdline_ptr, const char *option,
+			char *buffer, int bufsize);
 
 #endif /* _ASM_X86_CMDLINE_H */
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
index 5cc78bf57232..3261abb21ef4 100644
--- a/arch/x86/lib/cmdline.c
+++ b/arch/x86/lib/cmdline.c
@@ -104,7 +104,112 @@ __cmdline_find_option_bool(const char *cmdline, int max_cmdline_size,
 	return 0;	/* Buffer overrun */
 }
 
+/*
+ * Find a non-boolean option (i.e. option=argument). In accordance with
+ * standard Linux practice, if this option is repeated, this returns the
+ * last instance on the command line.
+ *
+ * @cmdline: the cmdline string
+ * @max_cmdline_size: the maximum size of cmdline
+ * @option: option string to look for
+ * @buffer: memory buffer to return the option argument
+ * @bufsize: size of the supplied memory buffer
+ *
+ * Returns the length of the argument (regardless of if it was
+ * truncated to fit in the buffer), or -1 on not found.
+ */
+static int
+__cmdline_find_option(const char *cmdline, int max_cmdline_size,
+		      const char *option, char *buffer, int bufsize)
+{
+	char c;
+	int pos = 0, len = -1;
+	const char *opptr = NULL;
+	char *bufptr = buffer;
+	enum {
+		st_wordstart = 0,	/* Start of word/after whitespace */
+		st_wordcmp,	/* Comparing this word */
+		st_wordskip,	/* Miscompare, skip */
+		st_bufcpy,	/* Copying this to buffer */
+	} state = st_wordstart;
+
+	if (!cmdline)
+		return -1;      /* No command line */
+
+	/*
+	 * This 'pos' check ensures we do not overrun
+	 * a non-NULL-terminated 'cmdline'
+	 */
+	while (pos++ < max_cmdline_size) {
+		c = *(char *)cmdline++;
+		if (!c)
+			break;
+
+		switch (state) {
+		case st_wordstart:
+			if (myisspace(c))
+				break;
+
+			state = st_wordcmp;
+			opptr = option;
+			/* fall through */
+
+		case st_wordcmp:
+			if ((c == '=') && !*opptr) {
+				/*
+				 * We matched all the way to the end of the
+				 * option we were looking for, prepare to
+				 * copy the argument.
+				 */
+				len = 0;
+				bufptr = buffer;
+				state = st_bufcpy;
+				break;
+			} else if (c == *opptr++) {
+				/*
+				 * We are currently matching, so continue
+				 * to the next character on the cmdline.
+				 */
+				break;
+			}
+			state = st_wordskip;
+			/* fall through */
+
+		case st_wordskip:
+			if (myisspace(c))
+				state = st_wordstart;
+			break;
+
+		case st_bufcpy:
+			if (myisspace(c)) {
+				state = st_wordstart;
+			} else {
+				/*
+				 * Increment len, but don't overrun the
+				 * supplied buffer and leave room for the
+				 * NULL terminator.
+				 */
+				if (++len < bufsize)
+					*bufptr++ = c;
+			}
+			break;
+		}
+	}
+
+	if (bufsize)
+		*bufptr = '\0';
+
+	return len;
+}
+
 int cmdline_find_option_bool(const char *cmdline, const char *option)
 {
 	return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
 }
+
+int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
+			int bufsize)
+{
+	return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
+				     buffer, bufsize);
+}

From 7375ae3a0b79ea072f4c672039f08f5db633b9e1 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:34 -0500
Subject: [PATCH 048/149] compiler-gcc.h: Introduce __nostackprotector function
 attribute
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Create a new function attribute, __nostackprotector, that can used to turn off
stack protection on a per function basis.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/0576fd5c74440ad0250f16ac6609ecf587812456.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/compiler-gcc.h | 2 ++
 include/linux/compiler.h     | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index cd4bbe8242bd..c28cedde973f 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -166,6 +166,8 @@
 
 #if GCC_VERSION >= 40100
 # define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
+
+#define __nostackprotector	__attribute__((__optimize__("no-stack-protector")))
 #endif
 
 #if GCC_VERSION >= 40300
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 219f82f3ec1a..3f8c88e29a46 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -470,6 +470,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 #define __visible
 #endif
 
+#ifndef __nostackprotector
+# define __nostackprotector
+#endif
+
 /*
  * Assume alignment of return value.
  */

From aca20d5462149333ba8b24a4a352be5b7a00dfd2 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 17 Jul 2017 16:10:35 -0500
Subject: [PATCH 049/149] x86/mm: Add support to make use of Secure Memory
 Encryption
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support to check if SME has been enabled and if memory encryption
should be activated (checking of command line option based on the
configuration of the default state).  If memory encryption is to be
activated, then the encryption mask is set and the kernel is encrypted
"in place."

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Toshimitsu Kani <toshi.kani@hpe.com>
Cc: kasan-dev@googlegroups.com
Cc: kvm@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-efi@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/5f0da2fd4cce63f556117549e2c89c170072209f.1500319216.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mem_encrypt.h |  6 ++-
 arch/x86/kernel/head64.c           |  5 +-
 arch/x86/mm/mem_encrypt.c          | 77 +++++++++++++++++++++++++++++-
 3 files changed, 83 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 7122c365c2a3..8e618fcf1f7c 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -17,6 +17,8 @@
 
 #include <linux/init.h>
 
+#include <asm/bootparam.h>
+
 #ifdef CONFIG_AMD_MEM_ENCRYPT
 
 extern unsigned long sme_me_mask;
@@ -38,7 +40,7 @@ void __init sme_unmap_bootdata(char *real_mode_data);
 void __init sme_early_init(void);
 
 void __init sme_encrypt_kernel(void);
-void __init sme_enable(void);
+void __init sme_enable(struct boot_params *bp);
 
 /* Architecture __weak replacement functions */
 void __init mem_encrypt_init(void);
@@ -60,7 +62,7 @@ static inline void __init sme_unmap_bootdata(char *real_mode_data) { }
 static inline void __init sme_early_init(void) { }
 
 static inline void __init sme_encrypt_kernel(void) { }
-static inline void __init sme_enable(void) { }
+static inline void __init sme_enable(struct boot_params *bp) { }
 
 #endif	/* CONFIG_AMD_MEM_ENCRYPT */
 
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 0cdb53bf4c4b..925b2928f377 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -45,7 +45,8 @@ static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
 	return ptr - (void *)_text + (void *)physaddr;
 }
 
-unsigned long __head __startup_64(unsigned long physaddr)
+unsigned long __head __startup_64(unsigned long physaddr,
+				  struct boot_params *bp)
 {
 	unsigned long load_delta, *p;
 	unsigned long pgtable_flags;
@@ -70,7 +71,7 @@ unsigned long __head __startup_64(unsigned long physaddr)
 		for (;;);
 
 	/* Activate Secure Memory Encryption (SME) if supported and enabled */
-	sme_enable();
+	sme_enable(bp);
 
 	/* Include the SME encryption mask in the fixup value */
 	load_delta += sme_get_me_mask();
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index e5d543938b0f..0fbd09269757 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -15,6 +15,7 @@
 #include <linux/mm.h>
 #include <linux/dma-mapping.h>
 #include <linux/swiotlb.h>
+#include <linux/mem_encrypt.h>
 
 #include <asm/tlbflush.h>
 #include <asm/fixmap.h>
@@ -23,6 +24,13 @@
 #include <asm/set_memory.h>
 #include <asm/cacheflush.h>
 #include <asm/sections.h>
+#include <asm/processor-flags.h>
+#include <asm/msr.h>
+#include <asm/cmdline.h>
+
+static char sme_cmdline_arg[] __initdata = "mem_encrypt";
+static char sme_cmdline_on[]  __initdata = "on";
+static char sme_cmdline_off[] __initdata = "off";
 
 /*
  * Since SME related variables are set early in the boot process they must
@@ -190,6 +198,8 @@ void __init mem_encrypt_init(void)
 
 	/* Call into SWIOTLB to update the SWIOTLB DMA buffers */
 	swiotlb_update_mem_attributes();
+
+	pr_info("AMD Secure Memory Encryption (SME) active\n");
 }
 
 void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
@@ -513,6 +523,71 @@ void __init sme_encrypt_kernel(void)
 	native_write_cr3(__native_read_cr3());
 }
 
-void __init sme_enable(void)
+void __init __nostackprotector sme_enable(struct boot_params *bp)
 {
+	const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
+	unsigned int eax, ebx, ecx, edx;
+	bool active_by_default;
+	unsigned long me_mask;
+	char buffer[16];
+	u64 msr;
+
+	/* Check for the SME support leaf */
+	eax = 0x80000000;
+	ecx = 0;
+	native_cpuid(&eax, &ebx, &ecx, &edx);
+	if (eax < 0x8000001f)
+		return;
+
+	/*
+	 * Check for the SME feature:
+	 *   CPUID Fn8000_001F[EAX] - Bit 0
+	 *     Secure Memory Encryption support
+	 *   CPUID Fn8000_001F[EBX] - Bits 5:0
+	 *     Pagetable bit position used to indicate encryption
+	 */
+	eax = 0x8000001f;
+	ecx = 0;
+	native_cpuid(&eax, &ebx, &ecx, &edx);
+	if (!(eax & 1))
+		return;
+
+	me_mask = 1UL << (ebx & 0x3f);
+
+	/* Check if SME is enabled */
+	msr = __rdmsr(MSR_K8_SYSCFG);
+	if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+		return;
+
+	/*
+	 * Fixups have not been applied to phys_base yet and we're running
+	 * identity mapped, so we must obtain the address to the SME command
+	 * line argument data using rip-relative addressing.
+	 */
+	asm ("lea sme_cmdline_arg(%%rip), %0"
+	     : "=r" (cmdline_arg)
+	     : "p" (sme_cmdline_arg));
+	asm ("lea sme_cmdline_on(%%rip), %0"
+	     : "=r" (cmdline_on)
+	     : "p" (sme_cmdline_on));
+	asm ("lea sme_cmdline_off(%%rip), %0"
+	     : "=r" (cmdline_off)
+	     : "p" (sme_cmdline_off));
+
+	if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT))
+		active_by_default = true;
+	else
+		active_by_default = false;
+
+	cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
+				     ((u64)bp->ext_cmd_line_ptr << 32));
+
+	cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer));
+
+	if (!strncmp(buffer, cmdline_on, sizeof(buffer)))
+		sme_me_mask = me_mask;
+	else if (!strncmp(buffer, cmdline_off, sizeof(buffer)))
+		sme_me_mask = 0;
+	else
+		sme_me_mask = active_by_default ? me_mask : 0;
 }

From 693bf0aa01b7f9a1b24b2b932d555c6667f17a1a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 20 Jul 2017 11:23:07 +0200
Subject: [PATCH 050/149] x86/boot: Fix memremap() related build failure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The following commit:

  5997efb96756 ("x86/boot: Use memremap() to map the MPF and MPC data")

causes new build failures on certain randconfigs:

  drivers/sfi/sfi_core.c: In function ‘sfi_map_memory’:
  drivers/sfi/sfi_core.c:104:10: error: implicit declaration of function ‘memremap’ [-Werror=implicit-function-declaration]
  drivers/sfi/sfi_core.c:104:31: error: ‘MEMREMAP_WB’ undeclared (first use in this function)

This is a case of parasitic header dependencies: the sfi_core.c file indirectly
includes <linux/io.h> for typical - but not all configs.

Including <linux/io.h> explicitly solves the build failure.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: arnd@arndb.de
Cc: aryabinin@virtuozzo.com
Cc: bp@alien8.de
Cc: bp@suse.de
Cc: brijesh.singh@amd.com
Cc: corbet@lwn.net
Cc: dvyukov@google.com
Cc: dyoung@redhat.com
Cc: glider@google.com
Cc: konrad.wilk@oracle.com
Cc: linux-tip-commits@vger.kernel.org
Cc: luto@kernel.org
Cc: lwoodman@redhat.com
Cc: matt@codeblueprint.co.uk
Cc: mst@redhat.com
Cc: pbonzini@redhat.com
Cc: riel@redhat.com
Cc: rkrcmar@redhat.com
Cc: thomas.lendacky@amd.com
Cc: toshi.kani@hpe.com
Link: http://lkml.kernel.org/r/20170720092307.6xslahuaclmsiffe@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/sfi/sfi_core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/sfi/sfi_core.c b/drivers/sfi/sfi_core.c
index d5ce53491efb..153b3f3cc795 100644
--- a/drivers/sfi/sfi_core.c
+++ b/drivers/sfi/sfi_core.c
@@ -68,6 +68,7 @@
 #include <linux/init.h>
 #include <linux/sfi.h>
 #include <linux/slab.h>
+#include <linux/io.h>
 
 #include "sfi_core.h"
 

From 3a366f791d83c690f3042e37f410f38de04fb4c7 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Mon, 17 Jul 2017 01:59:47 +0300
Subject: [PATCH 051/149] x86/mm/dump_pagetables: Generalize address
 normalization

Modify normalize_addr to handle different sizes of virtual address
space.

It's preparation for enabling 5-level paging.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20170716225954.74185-2-kirill.shutemov@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/dump_pagetables.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 0470826d2bdc..a824d575bb84 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -188,11 +188,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
  */
 static unsigned long normalize_addr(unsigned long u)
 {
-#ifdef CONFIG_X86_64
-	return (signed long)(u << 16) >> 16;
-#else
-	return u;
-#endif
+	int shift;
+	if (!IS_ENABLED(CONFIG_X86_64))
+		return u;
+
+	shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
+	return (signed long)(u << shift) >> shift;
 }
 
 /*

From 45dcd20913632fe69641c24b16d89e13e0e107ff Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Mon, 17 Jul 2017 01:59:48 +0300
Subject: [PATCH 052/149] x86/mm/dump_pagetables: Fix printout of p4d level

Modify printk_prot() and callers to print out additional page table
level correctly.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20170716225954.74185-3-kirill.shutemov@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/dump_pagetables.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index a824d575bb84..b371ab68f2d4 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -138,7 +138,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
 {
 	pgprotval_t pr = pgprot_val(prot);
 	static const char * const level_name[] =
-		{ "cr3", "pgd", "pud", "pmd", "pte" };
+		{ "cr3", "pgd", "p4d", "pud", "pmd", "pte" };
 
 	if (!pgprot_val(prot)) {
 		/* Not present */
@@ -162,12 +162,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
 			pt_dump_cont_printf(m, dmsg, "    ");
 
 		/* Bit 7 has a different meaning on level 3 vs 4 */
-		if (level <= 3 && pr & _PAGE_PSE)
+		if (level <= 4 && pr & _PAGE_PSE)
 			pt_dump_cont_printf(m, dmsg, "PSE ");
 		else
 			pt_dump_cont_printf(m, dmsg, "    ");
-		if ((level == 4 && pr & _PAGE_PAT) ||
-		    ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE))
+		if ((level == 5 && pr & _PAGE_PAT) ||
+		    ((level == 4 || level == 3) && pr & _PAGE_PAT_LARGE))
 			pt_dump_cont_printf(m, dmsg, "PAT ");
 		else
 			pt_dump_cont_printf(m, dmsg, "    ");
@@ -298,7 +298,7 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
 	for (i = 0; i < PTRS_PER_PTE; i++) {
 		prot = pte_flags(*start);
 		st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
-		note_page(m, st, __pgprot(prot), 4);
+		note_page(m, st, __pgprot(prot), 5);
 		start++;
 	}
 }
@@ -317,13 +317,13 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
 		if (!pmd_none(*start)) {
 			if (pmd_large(*start) || !pmd_present(*start)) {
 				prot = pmd_flags(*start);
-				note_page(m, st, __pgprot(prot), 3);
+				note_page(m, st, __pgprot(prot), 4);
 			} else {
 				walk_pte_level(m, st, *start,
 					       P + i * PMD_LEVEL_MULT);
 			}
 		} else
-			note_page(m, st, __pgprot(0), 3);
+			note_page(m, st, __pgprot(0), 4);
 		start++;
 	}
 }
@@ -362,13 +362,13 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
 		    !pud_already_checked(prev_pud, start, st->check_wx)) {
 			if (pud_large(*start) || !pud_present(*start)) {
 				prot = pud_flags(*start);
-				note_page(m, st, __pgprot(prot), 2);
+				note_page(m, st, __pgprot(prot), 3);
 			} else {
 				walk_pmd_level(m, st, *start,
 					       P + i * PUD_LEVEL_MULT);
 			}
 		} else
-			note_page(m, st, __pgprot(0), 2);
+			note_page(m, st, __pgprot(0), 3);
 
 		prev_pud = start;
 		start++;

From daaa42f04178ac7b02e4d43cdfc7fac48430152e Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Mon, 17 Jul 2017 01:59:49 +0300
Subject: [PATCH 053/149] x86/xen: Redefine XEN_ELFNOTE_INIT_P2M using PUD_SIZE
 * PTRS_PER_PUD

XEN_ELFNOTE_INIT_P2M has to be 512GB for both 4- and 5-level paging.
(PUD_SIZE * PTRS_PER_PUD) would do this.

Unfortunately, we cannot use P4D_SIZE, which would fit here. With
current headers structure it cannot be used in assembly, if p4d
level is folded.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20170716225954.74185-4-kirill.shutemov@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/xen/xen-head.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 72a8e6adebe6..a7525e95d53f 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -58,7 +58,7 @@ ENTRY(hypercall_page)
 #else
 	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      _ASM_PTR __START_KERNEL_map)
 	/* Map the p2m table to a 512GB-aligned user address. */
-	ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M,       .quad PGDIR_SIZE)
+	ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M,       .quad (PUD_SIZE * PTRS_PER_PUD))
 #endif
 #ifdef CONFIG_XEN_PV
 	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          _ASM_PTR startup_xen)

From e8f01a8dad7cdf9ddaa792451733a45ab5d385af Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Mon, 17 Jul 2017 01:59:50 +0300
Subject: [PATCH 054/149] x86/mm: Rename tasksize_32bit/64bit to
 task_size_32bit/64bit()

Rename these helpers to be consistent with spelling of TASK_SIZE and
related constants.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20170716225954.74185-5-kirill.shutemov@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/elf.h   |  4 ++--
 arch/x86/kernel/sys_x86_64.c |  2 +-
 arch/x86/mm/hugetlbpage.c    |  2 +-
 arch/x86/mm/mmap.c           | 10 +++++-----
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 1c18d83d3f09..c7090ef1388e 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -304,8 +304,8 @@ static inline int mmap_is_ia32(void)
 		test_thread_flag(TIF_ADDR32));
 }
 
-extern unsigned long tasksize_32bit(void);
-extern unsigned long tasksize_64bit(void);
+extern unsigned long task_size_32bit(void);
+extern unsigned long task_size_64bit(void);
 extern unsigned long get_mmap_base(int is_legacy);
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 213ddf3e937d..89bd0d6460e1 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -120,7 +120,7 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
 	}
 
 	*begin	= get_mmap_base(1);
-	*end	= in_compat_syscall() ? tasksize_32bit() : tasksize_64bit();
+	*end	= in_compat_syscall() ? task_size_32bit() : task_size_64bit();
 }
 
 unsigned long
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 2824607df108..868f02cf58ce 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -86,7 +86,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
 	info.length = len;
 	info.low_limit = get_mmap_base(1);
 	info.high_limit = in_compat_syscall() ?
-		tasksize_32bit() : tasksize_64bit();
+		task_size_32bit() : task_size_64bit();
 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
 	info.align_offset = 0;
 	return vm_unmapped_area(&info);
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 229d04a83f85..d2586913c8d0 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -37,12 +37,12 @@ struct va_alignment __read_mostly va_align = {
 	.flags = -1,
 };
 
-unsigned long tasksize_32bit(void)
+unsigned long task_size_32bit(void)
 {
 	return IA32_PAGE_OFFSET;
 }
 
-unsigned long tasksize_64bit(void)
+unsigned long task_size_64bit(void)
 {
 	return TASK_SIZE_MAX;
 }
@@ -52,7 +52,7 @@ static unsigned long stack_maxrandom_size(unsigned long task_size)
 	unsigned long max = 0;
 	if ((current->flags & PF_RANDOMIZE) &&
 		!(current->personality & ADDR_NO_RANDOMIZE)) {
-		max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit());
+		max = (-1UL) & __STACK_RND_MASK(task_size == task_size_32bit());
 		max <<= PAGE_SHIFT;
 	}
 
@@ -142,7 +142,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
 
 	arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
-			arch_rnd(mmap64_rnd_bits), tasksize_64bit());
+			arch_rnd(mmap64_rnd_bits), task_size_64bit());
 
 #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
 	/*
@@ -152,7 +152,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	 * mmap_base, the compat syscall uses mmap_compat_base.
 	 */
 	arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base,
-			arch_rnd(mmap32_rnd_bits), tasksize_32bit());
+			arch_rnd(mmap32_rnd_bits), task_size_32bit());
 #endif
 }
 

From 44b04912fa72489d403738f39e1c782614b7ae7c Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Mon, 17 Jul 2017 01:59:51 +0300
Subject: [PATCH 055/149] x86/mpx: Do not allow MPX if we have mappings above
 47-bit

MPX (without MAWA extension) cannot handle addresses above 47 bits, so we
need to make sure that MPX cannot be enabled if we already have a VMA above
the boundary and forbid creating such VMAs once MPX is enabled.

The patch implements mpx_unmapped_area_check() which is called from all
variants of get_unmapped_area() to check if the requested address fits
mpx.

On enabling MPX, we check if we already have any vma above 47-bit
boundary and forbit the enabling if we do.

As long as DEFAULT_MAP_WINDOW is equal to TASK_SIZE_MAX, the change is
nop. It will change when we allow userspace to have mappings above
47-bits.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20170716225954.74185-6-kirill.shutemov@linux.intel.com
[ Readability edits. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mpx.h       |  9 +++++++++
 arch/x86/include/asm/processor.h |  3 +++
 arch/x86/kernel/sys_x86_64.c     |  9 +++++++++
 arch/x86/mm/hugetlbpage.c        |  6 ++++++
 arch/x86/mm/mpx.c                | 33 +++++++++++++++++++++++++++++++-
 5 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h
index a0d662be4c5b..7d7404756bb4 100644
--- a/arch/x86/include/asm/mpx.h
+++ b/arch/x86/include/asm/mpx.h
@@ -73,6 +73,9 @@ static inline void mpx_mm_init(struct mm_struct *mm)
 }
 void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
 		      unsigned long start, unsigned long end);
+
+unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len,
+		unsigned long flags);
 #else
 static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
 {
@@ -94,6 +97,12 @@ static inline void mpx_notify_unmap(struct mm_struct *mm,
 				    unsigned long start, unsigned long end)
 {
 }
+
+static inline unsigned long mpx_unmapped_area_check(unsigned long addr,
+		unsigned long len, unsigned long flags)
+{
+	return addr;
+}
 #endif /* CONFIG_X86_INTEL_MPX */
 
 #endif /* _ASM_X86_MPX_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 973709d2938f..c8d59fcc5665 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -809,6 +809,7 @@ static inline void spin_lock_prefetch(const void *x)
 #define IA32_PAGE_OFFSET	PAGE_OFFSET
 #define TASK_SIZE		PAGE_OFFSET
 #define TASK_SIZE_MAX		TASK_SIZE
+#define DEFAULT_MAP_WINDOW	TASK_SIZE
 #define STACK_TOP		TASK_SIZE
 #define STACK_TOP_MAX		STACK_TOP
 
@@ -850,6 +851,8 @@ static inline void spin_lock_prefetch(const void *x)
  */
 #define TASK_SIZE_MAX	((1UL << 47) - PAGE_SIZE)
 
+#define DEFAULT_MAP_WINDOW	TASK_SIZE_MAX
+
 /* This decides where the kernel will search for a free chunk of vm
  * space during mmap's.
  */
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 89bd0d6460e1..f840e895d871 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -21,6 +21,7 @@
 #include <asm/compat.h>
 #include <asm/ia32.h>
 #include <asm/syscalls.h>
+#include <asm/mpx.h>
 
 /*
  * Align a virtual address to avoid aliasing in the I$ on AMD F15h.
@@ -132,6 +133,10 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	struct vm_unmapped_area_info info;
 	unsigned long begin, end;
 
+	addr = mpx_unmapped_area_check(addr, len, flags);
+	if (IS_ERR_VALUE(addr))
+		return addr;
+
 	if (flags & MAP_FIXED)
 		return addr;
 
@@ -171,6 +176,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	unsigned long addr = addr0;
 	struct vm_unmapped_area_info info;
 
+	addr = mpx_unmapped_area_check(addr, len, flags);
+	if (IS_ERR_VALUE(addr))
+		return addr;
+
 	/* requested length too big for entire address space */
 	if (len > TASK_SIZE)
 		return -ENOMEM;
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 868f02cf58ce..3cf89ad00f87 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -18,6 +18,7 @@
 #include <asm/tlbflush.h>
 #include <asm/pgalloc.h>
 #include <asm/elf.h>
+#include <asm/mpx.h>
 
 #if 0	/* This is just for testing */
 struct page *
@@ -135,6 +136,11 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 
 	if (len & ~huge_page_mask(h))
 		return -EINVAL;
+
+	addr = mpx_unmapped_area_check(addr, len, flags);
+	if (IS_ERR_VALUE(addr))
+		return addr;
+
 	if (len > TASK_SIZE)
 		return -ENOMEM;
 
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index 1c34b767c84c..9ceaa955d2ba 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -355,10 +355,19 @@ int mpx_enable_management(void)
 	 */
 	bd_base = mpx_get_bounds_dir();
 	down_write(&mm->mmap_sem);
+
+	/* MPX doesn't support addresses above 47 bits yet. */
+	if (find_vma(mm, DEFAULT_MAP_WINDOW)) {
+		pr_warn_once("%s (%d): MPX cannot handle addresses "
+				"above 47-bits. Disabling.",
+				current->comm, current->pid);
+		ret = -ENXIO;
+		goto out;
+	}
 	mm->context.bd_addr = bd_base;
 	if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR)
 		ret = -ENXIO;
-
+out:
 	up_write(&mm->mmap_sem);
 	return ret;
 }
@@ -1030,3 +1039,25 @@ void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (ret)
 		force_sig(SIGSEGV, current);
 }
+
+/* MPX cannot handle addresses above 47 bits yet. */
+unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len,
+		unsigned long flags)
+{
+	if (!kernel_managing_mpx_tables(current->mm))
+		return addr;
+	if (addr + len <= DEFAULT_MAP_WINDOW)
+		return addr;
+	if (flags & MAP_FIXED)
+		return -ENOMEM;
+
+	/*
+	 * Requested len is larger than the whole area we're allowed to map in.
+	 * Resetting hinting address wouldn't do much good -- fail early.
+	 */
+	if (len > DEFAULT_MAP_WINDOW)
+		return -ENOMEM;
+
+	/* Look for unmap area within DEFAULT_MAP_WINDOW */
+	return 0;
+}

From b569bab78d8df157a6f91070af827753e4d1787c Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Mon, 17 Jul 2017 01:59:52 +0300
Subject: [PATCH 056/149] x86/mm: Prepare to expose larger address space to
 userspace

On x86, 5-level paging enables 56-bit userspace virtual address space.
Not all user space is ready to handle wide addresses. It's known that
at least some JIT compilers use higher bits in pointers to encode their
information. It collides with valid pointers with 5-level paging and
leads to crashes.

To mitigate this, we are not going to allocate virtual address space
above 47-bit by default.

But userspace can ask for allocation from full address space by
specifying hint address (with or without MAP_FIXED) above 47-bits.

If hint address set above 47-bit, but MAP_FIXED is not specified, we try
to look for unmapped area by specified address. If it's already
occupied, we look for unmapped area in *full* address space, rather than
from 47-bit window.

A high hint address would only affect the allocation in question, but not
any future mmap()s.

Specifying high hint address on older kernel or on machine without 5-level
paging support is safe. The hint will be ignored and kernel will fall back
to allocation from 47-bit address space.

This approach helps to easily make application's memory allocator aware
about large address space without manually tracking allocated virtual
address space.

The patch puts all machinery in place, but not yet allows userspace to have
mappings above 47-bit -- TASK_SIZE_MAX has to be raised to get the effect.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20170716225954.74185-7-kirill.shutemov@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/elf.h       |  2 +-
 arch/x86/include/asm/processor.h |  7 +++++--
 arch/x86/kernel/sys_x86_64.c     | 21 +++++++++++++++++----
 arch/x86/mm/hugetlbpage.c        | 21 +++++++++++++++++----
 arch/x86/mm/mmap.c               |  6 +++---
 5 files changed, 43 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index c7090ef1388e..f87f4d73c700 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -305,7 +305,7 @@ static inline int mmap_is_ia32(void)
 }
 
 extern unsigned long task_size_32bit(void);
-extern unsigned long task_size_64bit(void);
+extern unsigned long task_size_64bit(int full_addr_space);
 extern unsigned long get_mmap_base(int is_legacy);
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c8d59fcc5665..58fd40a3bf52 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -808,6 +808,7 @@ static inline void spin_lock_prefetch(const void *x)
  */
 #define IA32_PAGE_OFFSET	PAGE_OFFSET
 #define TASK_SIZE		PAGE_OFFSET
+#define TASK_SIZE_LOW		TASK_SIZE
 #define TASK_SIZE_MAX		TASK_SIZE
 #define DEFAULT_MAP_WINDOW	TASK_SIZE
 #define STACK_TOP		TASK_SIZE
@@ -859,12 +860,14 @@ static inline void spin_lock_prefetch(const void *x)
 #define IA32_PAGE_OFFSET	((current->personality & ADDR_LIMIT_3GB) ? \
 					0xc0000000 : 0xFFFFe000)
 
+#define TASK_SIZE_LOW		(test_thread_flag(TIF_ADDR32) ? \
+					IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW)
 #define TASK_SIZE		(test_thread_flag(TIF_ADDR32) ? \
 					IA32_PAGE_OFFSET : TASK_SIZE_MAX)
 #define TASK_SIZE_OF(child)	((test_tsk_thread_flag(child, TIF_ADDR32)) ? \
 					IA32_PAGE_OFFSET : TASK_SIZE_MAX)
 
-#define STACK_TOP		TASK_SIZE
+#define STACK_TOP		TASK_SIZE_LOW
 #define STACK_TOP_MAX		TASK_SIZE_MAX
 
 #define INIT_THREAD  {						\
@@ -885,7 +888,7 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
  * space during mmap's.
  */
 #define __TASK_UNMAPPED_BASE(task_size)	(PAGE_ALIGN(task_size / 3))
-#define TASK_UNMAPPED_BASE		__TASK_UNMAPPED_BASE(TASK_SIZE)
+#define TASK_UNMAPPED_BASE		__TASK_UNMAPPED_BASE(TASK_SIZE_LOW)
 
 #define KSTK_EIP(task)		(task_pt_regs(task)->ip)
 
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index f840e895d871..73e4d28112f8 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -101,8 +101,8 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
 	return error;
 }
 
-static void find_start_end(unsigned long flags, unsigned long *begin,
-			   unsigned long *end)
+static void find_start_end(unsigned long addr, unsigned long flags,
+		unsigned long *begin, unsigned long *end)
 {
 	if (!in_compat_syscall() && (flags & MAP_32BIT)) {
 		/* This is usually used needed to map code in small
@@ -121,7 +121,10 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
 	}
 
 	*begin	= get_mmap_base(1);
-	*end	= in_compat_syscall() ? task_size_32bit() : task_size_64bit();
+	if (in_compat_syscall())
+		*end = task_size_32bit();
+	else
+		*end = task_size_64bit(addr > DEFAULT_MAP_WINDOW);
 }
 
 unsigned long
@@ -140,7 +143,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	if (flags & MAP_FIXED)
 		return addr;
 
-	find_start_end(flags, &begin, &end);
+	find_start_end(addr, flags, &begin, &end);
 
 	if (len > end)
 		return -ENOMEM;
@@ -204,6 +207,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	info.length = len;
 	info.low_limit = PAGE_SIZE;
 	info.high_limit = get_mmap_base(0);
+
+	/*
+	 * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
+	 * in the full address space.
+	 *
+	 * !in_compat_syscall() check to avoid high addresses for x32.
+	 */
+	if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall())
+		info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
+
 	info.align_mask = 0;
 	info.align_offset = pgoff << PAGE_SHIFT;
 	if (filp) {
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 3cf89ad00f87..6d06cf33e3de 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -86,25 +86,38 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
 	info.flags = 0;
 	info.length = len;
 	info.low_limit = get_mmap_base(1);
+
+	/*
+	 * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
+	 * in the full address space.
+	 */
 	info.high_limit = in_compat_syscall() ?
-		task_size_32bit() : task_size_64bit();
+		task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW);
+
 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
 	info.align_offset = 0;
 	return vm_unmapped_area(&info);
 }
 
 static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
-		unsigned long addr0, unsigned long len,
+		unsigned long addr, unsigned long len,
 		unsigned long pgoff, unsigned long flags)
 {
 	struct hstate *h = hstate_file(file);
 	struct vm_unmapped_area_info info;
-	unsigned long addr;
 
 	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
 	info.length = len;
 	info.low_limit = PAGE_SIZE;
 	info.high_limit = get_mmap_base(0);
+
+	/*
+	 * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
+	 * in the full address space.
+	 */
+	if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall())
+		info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
+
 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
 	info.align_offset = 0;
 	addr = vm_unmapped_area(&info);
@@ -119,7 +132,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
 		VM_BUG_ON(addr != -ENOMEM);
 		info.flags = 0;
 		info.low_limit = TASK_UNMAPPED_BASE;
-		info.high_limit = TASK_SIZE;
+		info.high_limit = TASK_SIZE_LOW;
 		addr = vm_unmapped_area(&info);
 	}
 
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index d2586913c8d0..c15a50a70b24 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -42,9 +42,9 @@ unsigned long task_size_32bit(void)
 	return IA32_PAGE_OFFSET;
 }
 
-unsigned long task_size_64bit(void)
+unsigned long task_size_64bit(int full_addr_space)
 {
-	return TASK_SIZE_MAX;
+	return full_addr_space ? TASK_SIZE_MAX : DEFAULT_MAP_WINDOW;
 }
 
 static unsigned long stack_maxrandom_size(unsigned long task_size)
@@ -142,7 +142,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
 
 	arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
-			arch_rnd(mmap64_rnd_bits), task_size_64bit());
+			arch_rnd(mmap64_rnd_bits), task_size_64bit(0));
 
 #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
 	/*

From ee00f4a32a76ef631394f31d5b6028d50462b357 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Mon, 17 Jul 2017 01:59:53 +0300
Subject: [PATCH 057/149] x86/mm: Allow userspace have mappings above 47-bit

All bits and pieces are now in place and we can allow userspace to have VMAs
above 47 bits.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20170716225954.74185-8-kirill.shutemov@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/processor.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 58fd40a3bf52..ed26e14cfc2f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -850,9 +850,9 @@ static inline void spin_lock_prefetch(const void *x)
  * particular problem by preventing anything from being mapped
  * at the maximum canonical address.
  */
-#define TASK_SIZE_MAX	((1UL << 47) - PAGE_SIZE)
+#define TASK_SIZE_MAX	((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
 
-#define DEFAULT_MAP_WINDOW	TASK_SIZE_MAX
+#define DEFAULT_MAP_WINDOW	((1UL << 47) - PAGE_SIZE)
 
 /* This decides where the kernel will search for a free chunk of vm
  * space during mmap's.

From 77ef56e4f0fbb350d93289aa025c7d605af012d4 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Mon, 17 Jul 2017 01:59:54 +0300
Subject: [PATCH 058/149] x86: Enable 5-level paging support via
 CONFIG_X86_5LEVEL=y

Most of things are in place and we can enable support for 5-level paging.

The patch makes XEN_PV and XEN_PVH dependent on !X86_5LEVEL. Both are
not ready to work with 5-level paging.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20170716225954.74185-9-kirill.shutemov@linux.intel.com
[ Minor readability edits. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/x86/x86_64/5level-paging.txt | 64 ++++++++++++++++++++++
 arch/x86/Kconfig                           | 19 +++++++
 arch/x86/xen/Kconfig                       |  5 ++
 3 files changed, 88 insertions(+)
 create mode 100644 Documentation/x86/x86_64/5level-paging.txt

diff --git a/Documentation/x86/x86_64/5level-paging.txt b/Documentation/x86/x86_64/5level-paging.txt
new file mode 100644
index 000000000000..087251a0d99c
--- /dev/null
+++ b/Documentation/x86/x86_64/5level-paging.txt
@@ -0,0 +1,64 @@
+== Overview ==
+
+Original x86-64 was limited by 4-level paing to 256 TiB of virtual address
+space and 64 TiB of physical address space. We are already bumping into
+this limit: some vendors offers servers with 64 TiB of memory today.
+
+To overcome the limitation upcoming hardware will introduce support for
+5-level paging. It is a straight-forward extension of the current page
+table structure adding one more layer of translation.
+
+It bumps the limits to 128 PiB of virtual address space and 4 PiB of
+physical address space. This "ought to be enough for anybody" ©.
+
+QEMU 2.9 and later support 5-level paging.
+
+Virtual memory layout for 5-level paging is described in
+Documentation/x86/x86_64/mm.txt
+
+== Enabling 5-level paging ==
+
+CONFIG_X86_5LEVEL=y enables the feature.
+
+So far, a kernel compiled with the option enabled will be able to boot
+only on machines that supports the feature -- see for 'la57' flag in
+/proc/cpuinfo.
+
+The plan is to implement boot-time switching between 4- and 5-level paging
+in the future.
+
+== User-space and large virtual address space ==
+
+On x86, 5-level paging enables 56-bit userspace virtual address space.
+Not all user space is ready to handle wide addresses. It's known that
+at least some JIT compilers use higher bits in pointers to encode their
+information. It collides with valid pointers with 5-level paging and
+leads to crashes.
+
+To mitigate this, we are not going to allocate virtual address space
+above 47-bit by default.
+
+But userspace can ask for allocation from full address space by
+specifying hint address (with or without MAP_FIXED) above 47-bits.
+
+If hint address set above 47-bit, but MAP_FIXED is not specified, we try
+to look for unmapped area by specified address. If it's already
+occupied, we look for unmapped area in *full* address space, rather than
+from 47-bit window.
+
+A high hint address would only affect the allocation in question, but not
+any future mmap()s.
+
+Specifying high hint address on older kernel or on machine without 5-level
+paging support is safe. The hint will be ignored and kernel will fall back
+to allocation from 47-bit address space.
+
+This approach helps to easily make application's memory allocator aware
+about large address space without manually tracking allocated virtual
+address space.
+
+One important case we need to handle here is interaction with MPX.
+MPX (without MAWA extension) cannot handle addresses above 47-bit, so we
+need to make sure that MPX cannot be enabled we already have VMA above
+the boundary and forbid creating such VMAs once MPX is enabled.
+
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8328bcb9ce8b..ff637dedfafa 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -326,6 +326,7 @@ config FIX_EARLYCON_MEM
 
 config PGTABLE_LEVELS
 	int
+	default 5 if X86_5LEVEL
 	default 4 if X86_64
 	default 3 if X86_PAE
 	default 2
@@ -1398,6 +1399,24 @@ config X86_PAE
 	  has the cost of more pagetable lookup overhead, and also
 	  consumes more pagetable space per process.
 
+config X86_5LEVEL
+	bool "Enable 5-level page tables support"
+	depends on X86_64
+	---help---
+	  5-level paging enables access to larger address space:
+	  upto 128 PiB of virtual address space and 4 PiB of
+	  physical address space.
+
+	  It will be supported by future Intel CPUs.
+
+	  Note: a kernel with this option enabled can only be booted
+	  on machines that support the feature.
+
+	  See Documentation/x86/x86_64/5level-paging.txt for more
+	  information.
+
+	  Say N if unsure.
+
 config ARCH_PHYS_ADDR_T_64BIT
 	def_bool y
 	depends on X86_64 || X86_PAE
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 027987638e98..1ecd419811a2 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -17,6 +17,9 @@ config XEN_PV
 	bool "Xen PV guest support"
 	default y
 	depends on XEN
+	# XEN_PV is not ready to work with 5-level paging.
+	# Changes to hypervisor are also required.
+	depends on !X86_5LEVEL
 	select XEN_HAVE_PVMMU
 	select XEN_HAVE_VPMU
 	help
@@ -75,4 +78,6 @@ config XEN_DEBUG_FS
 config XEN_PVH
 	bool "Support for running as a PVH guest"
 	depends on XEN && XEN_PVHVM && ACPI
+	# Pre-built page tables are not ready to handle 5-level paging.
+	depends on !X86_5LEVEL
 	def_bool n

From 10af6235e0d327d42e1bad974385197817923dc1 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 24 Jul 2017 21:41:38 -0700
Subject: [PATCH 059/149] x86/mm: Implement PCID based optimization: try to
 preserve old TLB entries using PCID
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PCID is a "process context ID" -- it's what other architectures call
an address space ID.  Every non-global TLB entry is tagged with a
PCID, only TLB entries that match the currently selected PCID are
used, and we can switch PGDs without flushing the TLB.  x86's
PCID is 12 bits.

This is an unorthodox approach to using PCID.  x86's PCID is far too
short to uniquely identify a process, and we can't even really
uniquely identify a running process because there are monster
systems with over 4096 CPUs.  To make matters worse, past attempts
to use all 12 PCID bits have resulted in slowdowns instead of
speedups.

This patch uses PCID differently.  We use a PCID to identify a
recently-used mm on a per-cpu basis.  An mm has no fixed PCID
binding at all; instead, we give it a fresh PCID each time it's
loaded except in cases where we want to preserve the TLB, in which
case we reuse a recent value.

Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs).  I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers.  Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.

ping-pong between two mms on the same CPU using eventfd:

  patched:         1.22µs
  patched, nopcid: 1.33µs
  unpatched:       1.34µs

Same ping-pong, but now touch 512 pages (all zero-page to minimize
cache misses) each iteration.  dTLB misses are measured by
dtlb_load_misses.miss_causes_a_walk:

  patched:         1.8µs  11M  dTLB misses
  patched, nopcid: 6.2µs, 207M dTLB misses
  unpatched:       6.1µs, 190M dTLB misses

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Nadav Amit <nadav.amit@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/9ee75f17a81770feed616358e6860d98a2a5b1e7.1500957502.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mmu_context.h     |  3 +
 arch/x86/include/asm/processor-flags.h |  2 +
 arch/x86/include/asm/tlbflush.h        | 18 +++++-
 arch/x86/mm/init.c                     |  1 +
 arch/x86/mm/tlb.c                      | 80 ++++++++++++++++++++------
 5 files changed, 86 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 85f6b5575aad..14b3cdccf4f9 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -300,6 +300,9 @@ static inline unsigned long __get_current_cr3_fast(void)
 {
 	unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
 
+	if (static_cpu_has(X86_FEATURE_PCID))
+		cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
 	/* For now, be very restrictive about when this can be called. */
 	VM_WARN_ON(in_nmi() || !in_atomic());
 
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index f5d3e50af98c..8a6d89fc9a79 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -36,6 +36,7 @@
 /* Mask off the address space ID and SME encryption bits. */
 #define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull)
 #define CR3_PCID_MASK 0xFFFull
+#define CR3_NOFLUSH (1UL << 63)
 #else
 /*
  * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
@@ -43,6 +44,7 @@
  */
 #define CR3_ADDR_MASK 0xFFFFFFFFull
 #define CR3_PCID_MASK 0ull
+#define CR3_NOFLUSH 0
 #endif
 
 #endif /* _ASM_X86_PROCESSOR_FLAGS_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6397275008db..d23e61dc0640 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -82,6 +82,12 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
 #endif
 
+/*
+ * 6 because 6 should be plenty and struct tlb_state will fit in
+ * two cache lines.
+ */
+#define TLB_NR_DYN_ASIDS 6
+
 struct tlb_context {
 	u64 ctx_id;
 	u64 tlb_gen;
@@ -95,6 +101,8 @@ struct tlb_state {
 	 * mode even if we've already switched back to swapper_pg_dir.
 	 */
 	struct mm_struct *loaded_mm;
+	u16 loaded_mm_asid;
+	u16 next_asid;
 
 	/*
 	 * Access to this CR4 shadow and to H/W CR4 is protected by
@@ -104,7 +112,8 @@ struct tlb_state {
 
 	/*
 	 * This is a list of all contexts that might exist in the TLB.
-	 * Since we don't yet use PCID, there is only one context.
+	 * There is one per ASID that we use, and the ASID (what the
+	 * CPU calls PCID) is the index into ctxts.
 	 *
 	 * For each context, ctx_id indicates which mm the TLB's user
 	 * entries came from.  As an invariant, the TLB will never
@@ -114,8 +123,13 @@ struct tlb_state {
 	 * To be clear, this means that it's legal for the TLB code to
 	 * flush the TLB without updating tlb_gen.  This can happen
 	 * (for now, at least) due to paravirt remote flushes.
+	 *
+	 * NB: context 0 is a bit special, since it's also used by
+	 * various bits of init code.  This is fine -- code that
+	 * isn't aware of PCID will end up harmlessly flushing
+	 * context 0.
 	 */
-	struct tlb_context ctxs[1];
+	struct tlb_context ctxs[TLB_NR_DYN_ASIDS];
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
 
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 4d353efb2838..65ae17d45c4a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -812,6 +812,7 @@ void __init zone_sizes_init(void)
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
 	.loaded_mm = &init_mm,
+	.next_asid = 1,
 	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */
 };
 EXPORT_SYMBOL_GPL(cpu_tlbstate);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 593d2f76a54c..ce104b962a17 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -30,6 +30,40 @@
 
 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
 
+static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
+			    u16 *new_asid, bool *need_flush)
+{
+	u16 asid;
+
+	if (!static_cpu_has(X86_FEATURE_PCID)) {
+		*new_asid = 0;
+		*need_flush = true;
+		return;
+	}
+
+	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
+		if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
+		    next->context.ctx_id)
+			continue;
+
+		*new_asid = asid;
+		*need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
+			       next_tlb_gen);
+		return;
+	}
+
+	/*
+	 * We don't currently own an ASID slot on this CPU.
+	 * Allocate a slot.
+	 */
+	*new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
+	if (*new_asid >= TLB_NR_DYN_ASIDS) {
+		*new_asid = 0;
+		this_cpu_write(cpu_tlbstate.next_asid, 1);
+	}
+	*need_flush = true;
+}
+
 void leave_mm(int cpu)
 {
 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -65,6 +99,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			struct task_struct *tsk)
 {
 	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
+	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
 	unsigned cpu = smp_processor_id();
 	u64 next_tlb_gen;
 
@@ -84,12 +119,13 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	/*
 	 * Verify that CR3 is what we think it is.  This will catch
 	 * hypothetical buggy code that directly switches to swapper_pg_dir
-	 * without going through leave_mm() / switch_mm_irqs_off().
+	 * without going through leave_mm() / switch_mm_irqs_off() or that
+	 * does something like write_cr3(read_cr3_pa()).
 	 */
-	VM_BUG_ON(read_cr3_pa() != __pa(real_prev->pgd));
+	VM_BUG_ON(__read_cr3() != (__sme_pa(real_prev->pgd) | prev_asid));
 
 	if (real_prev == next) {
-		VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) !=
+		VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
 			  next->context.ctx_id);
 
 		if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
@@ -106,16 +142,17 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		cpumask_set_cpu(cpu, mm_cpumask(next));
 		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
 
-		if (this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen) < next_tlb_gen) {
+		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
+		    next_tlb_gen) {
 			/*
 			 * Ideally, we'd have a flush_tlb() variant that
 			 * takes the known CR3 value as input.  This would
 			 * be faster on Xen PV and on hypothetical CPUs
 			 * on which INVPCID is fast.
 			 */
-			this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen,
+			this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
 				       next_tlb_gen);
-			write_cr3(__sme_pa(next->pgd));
+			write_cr3(__sme_pa(next->pgd) | prev_asid);
 			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
 					TLB_FLUSH_ALL);
 		}
@@ -126,8 +163,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		 * are not reflected in tlb_gen.)
 		 */
 	} else {
-		VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) ==
-			  next->context.ctx_id);
+		u16 new_asid;
+		bool need_flush;
 
 		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
 			/*
@@ -154,12 +191,22 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		cpumask_set_cpu(cpu, mm_cpumask(next));
 		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
 
-		this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id);
-		this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen);
-		this_cpu_write(cpu_tlbstate.loaded_mm, next);
-		write_cr3(__sme_pa(next->pgd));
+		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
 
-		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+		if (need_flush) {
+			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+			write_cr3(__sme_pa(next->pgd) | new_asid);
+			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
+					TLB_FLUSH_ALL);
+		} else {
+			/* The new ASID is already up to date. */
+			write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH);
+			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
+		}
+
+		this_cpu_write(cpu_tlbstate.loaded_mm, next);
+		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
 	}
 
 	load_mm_cr4(next);
@@ -186,13 +233,14 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 	 *                   wants us to catch up to.
 	 */
 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
 	u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
-	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen);
+	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
 
 	/* This code cannot presently handle being reentered. */
 	VM_WARN_ON(!irqs_disabled());
 
-	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) !=
+	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
 		   loaded_mm->context.ctx_id);
 
 	if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
@@ -280,7 +328,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 	}
 
 	/* Both paths above update our state to mm_tlb_gen. */
-	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, mm_tlb_gen);
+	this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
 }
 
 static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)

From 04b67022fb6d5b13025591f61a487a6ef7f4f05c Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Mon, 24 Jul 2017 18:25:58 +0300
Subject: [PATCH 060/149] x86/mm/dump_pagetables: Speed up page tables dump for
 CONFIG_KASAN=y

KASAN fills kernel page tables with repeated values to map several
TBs of the virtual memory to the single kasan_zero_page:
  kasan_zero_p4d ->
    kasan_zero_pud ->
        kasan_zero_pmd->
            kasan_zero_pte->
                kasan_zero_page

Walking the whole KASAN shadow range takes a lot of time, especially
with 5-level page tables. Since we already know that all kasan page tables
eventually point to the kasan_zero_page we could call note_page()
right and avoid walking lower levels of the page tables.
This will not affect the output of the kernel_page_tables file,
but let us avoid spending time in page table walkers:

Before:

  $ time cat /sys/kernel/debug/kernel_page_tables > /dev/null

  real    0m55.855s
  user    0m0.000s
  sys     0m55.840s

After:

  $ time cat /sys/kernel/debug/kernel_page_tables > /dev/null

  real    0m0.054s
  user    0m0.000s
  sys     0m0.054s

Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170724152558.24689-1-aryabinin@virtuozzo.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/dump_pagetables.c | 64 ++++++++++++++++++++++-------------
 1 file changed, 41 insertions(+), 23 deletions(-)

diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index b371ab68f2d4..5e3ac6fe6c9e 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -13,12 +13,12 @@
  */
 
 #include <linux/debugfs.h>
+#include <linux/kasan.h>
 #include <linux/mm.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/seq_file.h>
 
-#include <asm/kasan.h>
 #include <asm/pgtable.h>
 
 /*
@@ -302,23 +302,53 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
 		start++;
 	}
 }
+#ifdef CONFIG_KASAN
+
+/*
+ * This is an optimization for KASAN=y case. Since all kasan page tables
+ * eventually point to the kasan_zero_page we could call note_page()
+ * right away without walking through lower level page tables. This saves
+ * us dozens of seconds (minutes for 5-level config) while checking for
+ * W+X mapping or reading kernel_page_tables debugfs file.
+ */
+static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
+				void *pt)
+{
+	if (__pa(pt) == __pa(kasan_zero_pmd) ||
+#ifdef CONFIG_X86_5LEVEL
+	    __pa(pt) == __pa(kasan_zero_p4d) ||
+#endif
+	    __pa(pt) == __pa(kasan_zero_pud)) {
+		pgprotval_t prot = pte_flags(kasan_zero_pte[0]);
+		note_page(m, st, __pgprot(prot), 5);
+		return true;
+	}
+	return false;
+}
+#else
+static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
+				void *pt)
+{
+	return false;
+}
+#endif
 
 #if PTRS_PER_PMD > 1
 
 static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P)
 {
 	int i;
-	pmd_t *start;
+	pmd_t *start, *pmd_start;
 	pgprotval_t prot;
 
-	start = (pmd_t *)pud_page_vaddr(addr);
+	pmd_start = start = (pmd_t *)pud_page_vaddr(addr);
 	for (i = 0; i < PTRS_PER_PMD; i++) {
 		st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
 		if (!pmd_none(*start)) {
 			if (pmd_large(*start) || !pmd_present(*start)) {
 				prot = pmd_flags(*start);
 				note_page(m, st, __pgprot(prot), 4);
-			} else {
+			} else if (!kasan_page_table(m, st, pmd_start)) {
 				walk_pte_level(m, st, *start,
 					       P + i * PMD_LEVEL_MULT);
 			}
@@ -336,34 +366,22 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
 
 #if PTRS_PER_PUD > 1
 
-/*
- * This is an optimization for CONFIG_DEBUG_WX=y + CONFIG_KASAN=y
- * KASAN fills page tables with the same values. Since there is no
- * point in checking page table more than once we just skip repeated
- * entries. This saves us dozens of seconds during boot.
- */
-static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx)
-{
-	return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud));
-}
-
 static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P)
 {
 	int i;
-	pud_t *start;
+	pud_t *start, *pud_start;
 	pgprotval_t prot;
 	pud_t *prev_pud = NULL;
 
-	start = (pud_t *)p4d_page_vaddr(addr);
+	pud_start = start = (pud_t *)p4d_page_vaddr(addr);
 
 	for (i = 0; i < PTRS_PER_PUD; i++) {
 		st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
-		if (!pud_none(*start) &&
-		    !pud_already_checked(prev_pud, start, st->check_wx)) {
+		if (!pud_none(*start)) {
 			if (pud_large(*start) || !pud_present(*start)) {
 				prot = pud_flags(*start);
 				note_page(m, st, __pgprot(prot), 3);
-			} else {
+			} else if (!kasan_page_table(m, st, pud_start)) {
 				walk_pmd_level(m, st, *start,
 					       P + i * PUD_LEVEL_MULT);
 			}
@@ -386,10 +404,10 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
 static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
 {
 	int i;
-	p4d_t *start;
+	p4d_t *start, *p4d_start;
 	pgprotval_t prot;
 
-	start = (p4d_t *)pgd_page_vaddr(addr);
+	p4d_start = start = (p4d_t *)pgd_page_vaddr(addr);
 
 	for (i = 0; i < PTRS_PER_P4D; i++) {
 		st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
@@ -397,7 +415,7 @@ static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
 			if (p4d_large(*start) || !p4d_present(*start)) {
 				prot = p4d_flags(*start);
 				note_page(m, st, __pgprot(prot), 2);
-			} else {
+			} else if (!kasan_page_table(m, st, p4d_start)) {
 				walk_pud_level(m, st, *start,
 					       P + i * P4D_LEVEL_MULT);
 			}

From f90e2d9a5235bdcfcf1979f5e4b40684e778d18b Mon Sep 17 00:00:00 2001
From: Wang Kai <morgan.wang@huawei.com>
Date: Mon, 24 Jul 2017 21:03:46 +0800
Subject: [PATCH 061/149] x86/mm/pkeys: Fix typo in
 Documentation/x86/protection-keys.txt

Replace PKEY_DENY_WRITE with PKEY_DISABLE_WRITE,
to match the source code.

Signed-off-by: Wang Kai <morgan.wang@huawei.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: corbet@lwn.net
Cc: dave.hansen@intel.com
Cc: dave.hansen@linux.intel.com
Cc: linux-doc@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/x86/protection-keys.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Documentation/x86/protection-keys.txt b/Documentation/x86/protection-keys.txt
index b64304540821..fa46dcb347bc 100644
--- a/Documentation/x86/protection-keys.txt
+++ b/Documentation/x86/protection-keys.txt
@@ -34,7 +34,7 @@ with a key.  In this example WRPKRU is wrapped by a C function
 called pkey_set().
 
 	int real_prot = PROT_READ|PROT_WRITE;
-	pkey = pkey_alloc(0, PKEY_DENY_WRITE);
+	pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);
 	ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
 	ret = pkey_mprotect(ptr, PAGE_SIZE, real_prot, pkey);
 	... application runs here
@@ -42,9 +42,9 @@ called pkey_set().
 Now, if the application needs to update the data at 'ptr', it can
 gain access, do the update, then remove its write access:
 
-	pkey_set(pkey, 0); // clear PKEY_DENY_WRITE
+	pkey_set(pkey, 0); // clear PKEY_DISABLE_WRITE
 	*ptr = foo; // assign something
-	pkey_set(pkey, PKEY_DENY_WRITE); // set PKEY_DENY_WRITE again
+	pkey_set(pkey, PKEY_DISABLE_WRITE); // set PKEY_DISABLE_WRITE again
 
 Now when it frees the memory, it will also free the pkey since it
 is no longer in use:

From bc9b934b2fbbd51008a1b52c0cd1b457e6440736 Mon Sep 17 00:00:00 2001
From: Alexander Dahl <ada@thorsis.com>
Date: Tue, 25 Jul 2017 14:00:22 +0200
Subject: [PATCH 062/149] memory: atmel-ebi: Fix smc timing return value
 evaluation

Setting optional EBI/SMC properties through device tree always fails due
to wrong evaluation of the return value of
atmel_ebi_xslate_smc_timings().

If you put some of those properties in your dts file, but not
'atmel,smc-tdf-ns' the local variable 'required' in
atmel_ebi_xslate_smc_timings() stays on 'false' after the first 'if'
block. This leads to setting 'ret' to -EINVAL in the first run of the
following 'for' loop which is then the return value of this function.

However if you set 'atmel,smc-tdf-ns' in the dts file and everything in
atmel_ebi_xslate_smc_timings() works well, it returns the content of
'required' which is 'true' then.

So the function atmel_ebi_xslate_smc_timings() always returns non-zero
which lets its call in atmel_ebi_xslate_smc_config() always fail and
thus returning -EINVAL, so the EBI configuration for this node fails.

Judging from the following code evaluating the local 'required' variable
in atmel_ebi_xslate_smc_config() and the call of caps->xlate_config in
atmel_ebi_dev_setup() it's probably right to only let the call fail if a
negative error code is returned.

Signed-off-by: Alexander Dahl <ada@thorsis.com>
Acked-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
---
 drivers/memory/atmel-ebi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/memory/atmel-ebi.c b/drivers/memory/atmel-ebi.c
index 99e644cda4d1..1cf34d24d4e0 100644
--- a/drivers/memory/atmel-ebi.c
+++ b/drivers/memory/atmel-ebi.c
@@ -263,7 +263,7 @@ static int atmel_ebi_xslate_smc_config(struct atmel_ebi_dev *ebid,
 	}
 
 	ret = atmel_ebi_xslate_smc_timings(ebid, np, &conf->smcconf);
-	if (ret)
+	if (ret < 0)
 		return -EINVAL;
 
 	if ((ret > 0 && !required) || (!ret && required)) {

From 1f6b53901f5bffbbb0464c732e83bb2a3a784ff7 Mon Sep 17 00:00:00 2001
From: Alexander Dahl <ada@thorsis.com>
Date: Tue, 25 Jul 2017 14:00:23 +0200
Subject: [PATCH 063/149] memory: atmel-ebi: Allow t_DF timings of zero ns

As reported in [1] and in [2] it's not possible to set the device tree
property 'atmel,smc-tdf-ns' to zero, although the SoC allows a setting
of 0ns for the t_DF time.

Allow this setting by doing the same thing as in the atmel nand
controller driver by setting ncycles to ATMEL_SMC_MODE_TDF_MIN if zero
is set in the dts.

[1] http://lists.infradead.org/pipermail/linux-arm-kernel/2017-March/490966.html
[2] http://lists.infradead.org/pipermail/linux-arm-kernel/2017-July/520652.html

Suggested-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Signed-off-by: Alexander Dahl <ada@thorsis.com>
Acked-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
---
 drivers/memory/atmel-ebi.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/memory/atmel-ebi.c b/drivers/memory/atmel-ebi.c
index 1cf34d24d4e0..f8a01aea422e 100644
--- a/drivers/memory/atmel-ebi.c
+++ b/drivers/memory/atmel-ebi.c
@@ -120,12 +120,14 @@ static int atmel_ebi_xslate_smc_timings(struct atmel_ebi_dev *ebid,
 	if (!ret) {
 		required = true;
 		ncycles = DIV_ROUND_UP(val, clk_period_ns);
-		if (ncycles > ATMEL_SMC_MODE_TDF_MAX ||
-		    ncycles < ATMEL_SMC_MODE_TDF_MIN) {
+		if (ncycles > ATMEL_SMC_MODE_TDF_MAX) {
 			ret = -EINVAL;
 			goto out;
 		}
 
+		if (ncycles < ATMEL_SMC_MODE_TDF_MIN)
+			ncycles = ATMEL_SMC_MODE_TDF_MIN;
+
 		smcconf->mode |= ATMEL_SMC_MODE_TDF(ncycles);
 	}
 

From 3fb3b3c4b68e33bb4acf42361b6a0db96115de35 Mon Sep 17 00:00:00 2001
From: Alexander Dahl <ada@thorsis.com>
Date: Tue, 25 Jul 2017 14:00:24 +0200
Subject: [PATCH 064/149] memory: atmel-ebi: Fix smc cycle xlate converter

The converter function for translating ns timings in register values was
initialized with a wrong function pointer. This resulted in wrong
register values also for the setup and pulse registers when configuring
the EBI interface trough dts.

Includes a small fix in a comment of the smc driver, which was probably
just a copy'n'paste mistake.

Signed-off-by: Alexander Dahl <ada@thorsis.com>
Acked-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Acked-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
---
 drivers/memory/atmel-ebi.c | 2 +-
 drivers/mfd/atmel-smc.c    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/memory/atmel-ebi.c b/drivers/memory/atmel-ebi.c
index f8a01aea422e..ebf69ff48ae2 100644
--- a/drivers/memory/atmel-ebi.c
+++ b/drivers/memory/atmel-ebi.c
@@ -72,7 +72,7 @@ struct atmel_smc_timing_xlate {
 	{ .name = nm, .converter = atmel_smc_cs_conf_set_pulse, .shift = pos}
 
 #define ATMEL_SMC_CYCLE_XLATE(nm, pos)	\
-	{ .name = nm, .converter = atmel_smc_cs_conf_set_setup, .shift = pos}
+	{ .name = nm, .converter = atmel_smc_cs_conf_set_cycle, .shift = pos}
 
 static void at91sam9_ebi_get_config(struct atmel_ebi_dev *ebid,
 				    struct atmel_ebi_dev_config *conf)
diff --git a/drivers/mfd/atmel-smc.c b/drivers/mfd/atmel-smc.c
index 954cf0f66a31..20cc0ea470fa 100644
--- a/drivers/mfd/atmel-smc.c
+++ b/drivers/mfd/atmel-smc.c
@@ -206,7 +206,7 @@ EXPORT_SYMBOL_GPL(atmel_smc_cs_conf_set_pulse);
  *	     parameter
  *
  * This function encodes the @ncycles value as described in the datasheet
- * (section "SMC Pulse Register"), and then stores the result in the
+ * (section "SMC Cycle Register"), and then stores the result in the
  * @conf->setup field at @shift position.
  *
  * Returns -EINVAL if @shift is invalid, -ERANGE if @ncycles does not fit in

From 4e237903f95db585b976e7311de2bfdaaf0f6e31 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Fri, 28 Jul 2017 11:01:16 -0500
Subject: [PATCH 065/149] x86/mm, kexec: Fix memory corruption with SME on
 successive kexecs

After issuing successive kexecs it was found that the SHA hash failed
verification when booting the kexec'd kernel.  When SME is enabled, the
change from using pages that were marked encrypted to now being marked as
not encrypted (through new identify mapped page tables) results in memory
corruption if there are any cache entries for the previously encrypted
pages. This is because separate cache entries can exist for the same
physical location but tagged both with and without the encryption bit.

To prevent this, issue a wbinvd if SME is active before copying the pages
from the source location to the destination location to clear any possible
cache entry conflicts.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Cc: <kexec@lists.infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/e7fb8610af3a93e8f8ae6f214cd9249adc0df2b4.1501186516.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/kexec.h         |  3 ++-
 arch/x86/kernel/machine_kexec_64.c   |  3 ++-
 arch/x86/kernel/relocate_kernel_64.S | 14 ++++++++++++++
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index e8183acf931f..942c1f444da8 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -147,7 +147,8 @@ unsigned long
 relocate_kernel(unsigned long indirection_page,
 		unsigned long page_list,
 		unsigned long start_address,
-		unsigned int preserve_context);
+		unsigned int preserve_context,
+		unsigned int sme_active);
 #endif
 
 #define ARCH_HAS_KIMAGE_ARCH
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 9cf8daacc046..1f790cf9d38f 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -335,7 +335,8 @@ void machine_kexec(struct kimage *image)
 	image->start = relocate_kernel((unsigned long)image->head,
 				       (unsigned long)page_list,
 				       image->start,
-				       image->preserve_context);
+				       image->preserve_context,
+				       sme_active());
 
 #ifdef CONFIG_KEXEC_JUMP
 	if (image->preserve_context)
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 98111b38ebfd..307d3bac5f04 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -47,6 +47,7 @@ relocate_kernel:
 	 * %rsi page_list
 	 * %rdx start address
 	 * %rcx preserve_context
+	 * %r8  sme_active
 	 */
 
 	/* Save the CPU context, used for jumping back */
@@ -71,6 +72,9 @@ relocate_kernel:
 	pushq $0
 	popfq
 
+	/* Save SME active flag */
+	movq	%r8, %r12
+
 	/*
 	 * get physical address of control page now
 	 * this is impossible after page table switch
@@ -132,6 +136,16 @@ identity_mapped:
 	/* Flush the TLB (needed?) */
 	movq	%r9, %cr3
 
+	/*
+	 * If SME is active, there could be old encrypted cache line
+	 * entries that will conflict with the now unencrypted memory
+	 * used by kexec. Flush the caches before copying the kernel.
+	 */
+	testq	%r12, %r12
+	jz 1f
+	wbinvd
+1:
+
 	movq	%rcx, %r11
 	call	swap_pages
 

From 57bd1905b228f2a14d7506b0302f69f425131e57 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Fri, 28 Jul 2017 11:01:17 -0500
Subject: [PATCH 066/149] acpi, x86/mm: Remove encryption mask from ACPI page
 protection type

The arch_apei_get_mem_attributes() function is used to set the page
protection type for ACPI physical addresses. When SME is active, the
associated protection type cannot have the encryption mask set since the
ACPI tables live in un-encrypted memory - the kernel will see corrupted
data.

To fix this, create a new protection type, PAGE_KERNEL_NOENC, that is a
'no encryption' version of PAGE_KERNEL, and return that from
arch_apei_get_mem_attributes().

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/e1cb9395b2f061cd96f1e59c3cbbe5ff5d4ec26e.1501186516.git.thomas.lendacky@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/acpi.h          | 11 ++++++-----
 arch/x86/include/asm/pgtable_types.h |  1 +
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 562286fa151f..72d867f6b518 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -160,12 +160,13 @@ static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr)
 	 * you call efi_mem_attributes() during boot and at runtime,
 	 * you could theoretically see different attributes.
 	 *
-	 * Since we are yet to see any x86 platforms that require
-	 * anything other than PAGE_KERNEL (some arm64 platforms
-	 * require the equivalent of PAGE_KERNEL_NOCACHE), return that
-	 * until we know differently.
+	 * We are yet to see any x86 platforms that require anything
+	 * other than PAGE_KERNEL (some ARM64 platforms require the
+	 * equivalent of PAGE_KERNEL_NOCACHE). Additionally, if SME
+	 * is active, the ACPI information will not be encrypted,
+	 * so return PAGE_KERNEL_NOENC until we know differently.
 	 */
-	 return PAGE_KERNEL;
+	return PAGE_KERNEL_NOENC;
 }
 #endif
 
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 6c55973ebab8..399261ce904c 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -211,6 +211,7 @@ enum page_cache_mode {
 #define __PAGE_KERNEL_NOENC_WP	(__PAGE_KERNEL_WP)
 
 #define PAGE_KERNEL		__pgprot(__PAGE_KERNEL | _PAGE_ENC)
+#define PAGE_KERNEL_NOENC	__pgprot(__PAGE_KERNEL)
 #define PAGE_KERNEL_RO		__pgprot(__PAGE_KERNEL_RO | _PAGE_ENC)
 #define PAGE_KERNEL_EXEC	__pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC)
 #define PAGE_KERNEL_EXEC_NOENC	__pgprot(__PAGE_KERNEL_EXEC)

From 4bb0f0e73c8c30917d169c4a0f1ac083690c545b Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 1 Aug 2017 12:01:52 -0400
Subject: [PATCH 067/149] tracing: Call clear_boot_tracer() at lateinit_sync

The clear_boot_tracer function is used to reset the default_bootup_tracer
string to prevent it from being accessed after boot, as it originally points
to init data. But since clear_boot_tracer() is called via the
init_lateinit() call, it races with the initcall for registering the hwlat
tracer. If someone adds "ftrace=hwlat" to the kernel command line, depending
on how the linker sets up the text, the saved command line may be cleared,
and the hwlat tracer never is initialized.

Simply have the clear_boot_tracer() be called by initcall_lateinit_sync() as
that's for tasks to be called after lateinit.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=196551

Cc: stable@vger.kernel.org
Fixes: e7c15cd8a ("tracing: Added hardware latency tracer")
Reported-by: Zamir SUN <sztsian@gmail.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 42b9355033d4..784fb43b2abe 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8407,4 +8407,4 @@ __init static int clear_boot_tracer(void)
 }
 
 fs_initcall(tracer_init_tracefs);
-late_initcall(clear_boot_tracer);
+late_initcall_sync(clear_boot_tracer);

From 147d88e0b5eb90191bc5c12ca0a3c410b75a13d2 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 1 Aug 2017 14:02:01 +0300
Subject: [PATCH 068/149] tracing: Missing error code in tracer_alloc_buffers()

If ring_buffer_alloc() or one of the next couple function calls fail
then we should return -ENOMEM but the current code returns success.

Link: http://lkml.kernel.org/r/20170801110201.ajdkct7vwzixahvx@mwanda

Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: stable@vger.kernel.org
Fixes: b32614c03413 ('tracing/rb: Convert to hotplug state machine')
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 784fb43b2abe..d815fc317e9d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8293,6 +8293,7 @@ __init static int tracer_alloc_buffers(void)
 	if (ret < 0)
 		goto out_free_cpumask;
 	/* Used for event triggers */
+	ret = -ENOMEM;
 	temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE);
 	if (!temp_buffer)
 		goto out_rm_hp_state;

From a7e52ad7ed82e21273eccff93d1477a7b313aabb Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Wed, 2 Aug 2017 14:20:54 -0400
Subject: [PATCH 069/149] ring-buffer: Have ring_buffer_alloc_read_page()
 return error on offline CPU

Chunyu Hu reported:
  "per_cpu trace directories and files are created for all possible cpus,
   but only the cpus which have ever been on-lined have their own per cpu
   ring buffer (allocated by cpuhp threads). While trace_buffers_open, the
   open handler for trace file 'trace_pipe_raw' is always trying to access
   field of ring_buffer_per_cpu, and would panic with the NULL pointer.

   Align the behavior of trace_pipe_raw with trace_pipe, that returns -NODEV
   when openning it if that cpu does not have trace ring buffer.

   Reproduce:
   cat /sys/kernel/debug/tracing/per_cpu/cpu31/trace_pipe_raw
   (cpu31 is never on-lined, this is a 16 cores x86_64 box)

   Tested with:
   1) boot with maxcpus=14, read trace_pipe_raw of cpu15.
      Got -NODEV.
   2) oneline cpu15, read trace_pipe_raw of cpu15.
      Get the raw trace data.

   Call trace:
   [ 5760.950995] RIP: 0010:ring_buffer_alloc_read_page+0x32/0xe0
   [ 5760.961678]  tracing_buffers_read+0x1f6/0x230
   [ 5760.962695]  __vfs_read+0x37/0x160
   [ 5760.963498]  ? __vfs_read+0x5/0x160
   [ 5760.964339]  ? security_file_permission+0x9d/0xc0
   [ 5760.965451]  ? __vfs_read+0x5/0x160
   [ 5760.966280]  vfs_read+0x8c/0x130
   [ 5760.967070]  SyS_read+0x55/0xc0
   [ 5760.967779]  do_syscall_64+0x67/0x150
   [ 5760.968687]  entry_SYSCALL64_slow_path+0x25/0x25"

This was introduced by the addition of the feature to reuse reader pages
instead of re-allocating them. The problem is that the allocation of a
reader page (which is per cpu) does not check if the cpu is online and set
up for the ring buffer.

Link: http://lkml.kernel.org/r/1500880866-1177-1-git-send-email-chuhu@redhat.com

Cc: stable@vger.kernel.org
Fixes: 73a757e63114 ("ring-buffer: Return reader page back into existing ring buffer")
Reported-by: Chunyu Hu <chuhu@redhat.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c           | 14 +++++++++-----
 kernel/trace/ring_buffer_benchmark.c |  2 +-
 kernel/trace/trace.c                 | 16 +++++++++++-----
 3 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 529cc50d7243..81279c6602ff 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4386,15 +4386,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
  * the page that was allocated, with the read page of the buffer.
  *
  * Returns:
- *  The page allocated, or NULL on error.
+ *  The page allocated, or ERR_PTR
  */
 void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
 {
-	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+	struct ring_buffer_per_cpu *cpu_buffer;
 	struct buffer_data_page *bpage = NULL;
 	unsigned long flags;
 	struct page *page;
 
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		return ERR_PTR(-ENODEV);
+
+	cpu_buffer = buffer->buffers[cpu];
 	local_irq_save(flags);
 	arch_spin_lock(&cpu_buffer->lock);
 
@@ -4412,7 +4416,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
 	page = alloc_pages_node(cpu_to_node(cpu),
 				GFP_KERNEL | __GFP_NORETRY, 0);
 	if (!page)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	bpage = page_address(page);
 
@@ -4467,8 +4471,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
  *
  * for example:
  *	rpage = ring_buffer_alloc_read_page(buffer, cpu);
- *	if (!rpage)
- *		return error;
+ *	if (IS_ERR(rpage))
+ *		return PTR_ERR(rpage);
  *	ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
  *	if (ret >= 0)
  *		process_page(rpage, ret);
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 9fbcaf567886..68ee79afe31c 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -113,7 +113,7 @@ static enum event_status read_page(int cpu)
 	int i;
 
 	bpage = ring_buffer_alloc_read_page(buffer, cpu);
-	if (!bpage)
+	if (IS_ERR(bpage))
 		return EVENT_DROPPED;
 
 	ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d815fc317e9d..44004d8aa3b3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6598,7 +6598,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 {
 	struct ftrace_buffer_info *info = filp->private_data;
 	struct trace_iterator *iter = &info->iter;
-	ssize_t ret;
+	ssize_t ret = 0;
 	ssize_t size;
 
 	if (!count)
@@ -6612,10 +6612,15 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 	if (!info->spare) {
 		info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
 							  iter->cpu_file);
-		info->spare_cpu = iter->cpu_file;
+		if (IS_ERR(info->spare)) {
+			ret = PTR_ERR(info->spare);
+			info->spare = NULL;
+		} else {
+			info->spare_cpu = iter->cpu_file;
+		}
 	}
 	if (!info->spare)
-		return -ENOMEM;
+		return ret;
 
 	/* Do we have previous read data to read? */
 	if (info->read < PAGE_SIZE)
@@ -6790,8 +6795,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		ref->ref = 1;
 		ref->buffer = iter->trace_buffer->buffer;
 		ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
-		if (!ref->page) {
-			ret = -ENOMEM;
+		if (IS_ERR(ref->page)) {
+			ret = PTR_ERR(ref->page);
+			ref->page = NULL;
 			kfree(ref);
 			break;
 		}

From f862b31514bad66e48d9d4ff6036ee051cf36a6f Mon Sep 17 00:00:00 2001
From: Eugeniy Paltsev
 <Eugeniy.Paltsev-HKixBCOQz3hWk0Htik3J/w@public.gmane.org>
Date: Mon, 26 Jun 2017 14:47:25 +0300
Subject: [PATCH 070/149] ARC: [plat-axs10x]: prepare dts files for enabling
 PAE40 on axs103

Enable 64bit adressing, where it needed, to make possible
enabling PAE40 on axs103.

This patch doesn't affect on any functionality.

Signed-off-by: Eugeniy Paltsev <Eugeniy.Paltsev-HKixBCOQz3hWk0Htik3J/w@public.gmane.org>
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/boot/dts/axc001.dtsi     | 20 +++++++++-----------
 arch/arc/boot/dts/axc003.dtsi     | 21 ++++++++++-----------
 arch/arc/boot/dts/axc003_idu.dtsi | 21 ++++++++++-----------
 arch/arc/boot/dts/axs10x_mb.dtsi  |  2 +-
 4 files changed, 30 insertions(+), 34 deletions(-)

diff --git a/arch/arc/boot/dts/axc001.dtsi b/arch/arc/boot/dts/axc001.dtsi
index 53ce226f77a5..a380ffa1a458 100644
--- a/arch/arc/boot/dts/axc001.dtsi
+++ b/arch/arc/boot/dts/axc001.dtsi
@@ -15,15 +15,15 @@
 
 / {
 	compatible = "snps,arc";
-	#address-cells = <1>;
-	#size-cells = <1>;
+	#address-cells = <2>;
+	#size-cells = <2>;
 
 	cpu_card {
 		compatible = "simple-bus";
 		#address-cells = <1>;
 		#size-cells = <1>;
 
-		ranges = <0x00000000 0xf0000000 0x10000000>;
+		ranges = <0x00000000 0x0 0xf0000000 0x10000000>;
 
 		core_clk: core_clk {
 			#clock-cells = <0>;
@@ -91,23 +91,21 @@ arcpct0: pct {
 	mb_intc: dw-apb-ictl@0xe0012000 {
 		#interrupt-cells = <1>;
 		compatible = "snps,dw-apb-ictl";
-		reg = < 0xe0012000 0x200 >;
+		reg = < 0x0 0xe0012000 0x0 0x200 >;
 		interrupt-controller;
 		interrupt-parent = <&core_intc>;
 		interrupts = < 7 >;
 	};
 
 	memory {
-		#address-cells = <1>;
-		#size-cells = <1>;
-		ranges = <0x00000000 0x80000000 0x20000000>;
 		device_type = "memory";
-		reg = <0x80000000 0x1b000000>;	/* (512 - 32) MiB */
+		/* CONFIG_KERNEL_RAM_BASE_ADDRESS needs to match low mem start */
+		reg = <0x0 0x80000000 0x0 0x1b000000>;	/* (512 - 32) MiB */
 	};
 
 	reserved-memory {
-		#address-cells = <1>;
-		#size-cells = <1>;
+		#address-cells = <2>;
+		#size-cells = <2>;
 		ranges;
 		/*
 		 * We just move frame buffer area to the very end of
@@ -118,7 +116,7 @@ reserved-memory {
 		 */
 		frame_buffer: frame_buffer@9e000000 {
 			compatible = "shared-dma-pool";
-			reg = <0x9e000000 0x2000000>;
+			reg = <0x0 0x9e000000 0x0 0x2000000>;
 			no-map;
 		};
 	};
diff --git a/arch/arc/boot/dts/axc003.dtsi b/arch/arc/boot/dts/axc003.dtsi
index 14df46f141bf..cc9239ef8d08 100644
--- a/arch/arc/boot/dts/axc003.dtsi
+++ b/arch/arc/boot/dts/axc003.dtsi
@@ -14,15 +14,15 @@
 
 / {
 	compatible = "snps,arc";
-	#address-cells = <1>;
-	#size-cells = <1>;
+	#address-cells = <2>;
+	#size-cells = <2>;
 
 	cpu_card {
 		compatible = "simple-bus";
 		#address-cells = <1>;
 		#size-cells = <1>;
 
-		ranges = <0x00000000 0xf0000000 0x10000000>;
+		ranges = <0x00000000 0x0 0xf0000000 0x10000000>;
 
 		core_clk: core_clk {
 			#clock-cells = <0>;
@@ -94,30 +94,29 @@ arcpct0: pct {
 	mb_intc: dw-apb-ictl@0xe0012000 {
 		#interrupt-cells = <1>;
 		compatible = "snps,dw-apb-ictl";
-		reg = < 0xe0012000 0x200 >;
+		reg = < 0x0 0xe0012000 0x0 0x200 >;
 		interrupt-controller;
 		interrupt-parent = <&core_intc>;
 		interrupts = < 24 >;
 	};
 
 	memory {
-		#address-cells = <1>;
-		#size-cells = <1>;
-		ranges = <0x00000000 0x80000000 0x40000000>;
 		device_type = "memory";
-		reg = <0x80000000 0x20000000>;	/* 512MiB */
+		/* CONFIG_KERNEL_RAM_BASE_ADDRESS needs to match low mem start */
+		reg = <0x0 0x80000000 0x0 0x20000000	/* 512 MiB low mem */
+		       0x1 0xc0000000 0x0 0x40000000>;	/* 1 GiB highmem */
 	};
 
 	reserved-memory {
-		#address-cells = <1>;
-		#size-cells = <1>;
+		#address-cells = <2>;
+		#size-cells = <2>;
 		ranges;
 		/*
 		 * Move frame buffer out of IOC aperture (0x8z-0xAz).
 		 */
 		frame_buffer: frame_buffer@be000000 {
 			compatible = "shared-dma-pool";
-			reg = <0xbe000000 0x2000000>;
+			reg = <0x0 0xbe000000 0x0 0x2000000>;
 			no-map;
 		};
 	};
diff --git a/arch/arc/boot/dts/axc003_idu.dtsi b/arch/arc/boot/dts/axc003_idu.dtsi
index 695f9fa1996b..4ebb2170abec 100644
--- a/arch/arc/boot/dts/axc003_idu.dtsi
+++ b/arch/arc/boot/dts/axc003_idu.dtsi
@@ -14,15 +14,15 @@
 
 / {
 	compatible = "snps,arc";
-	#address-cells = <1>;
-	#size-cells = <1>;
+	#address-cells = <2>;
+	#size-cells = <2>;
 
 	cpu_card {
 		compatible = "simple-bus";
 		#address-cells = <1>;
 		#size-cells = <1>;
 
-		ranges = <0x00000000 0xf0000000 0x10000000>;
+		ranges = <0x00000000 0x0 0xf0000000 0x10000000>;
 
 		core_clk: core_clk {
 			#clock-cells = <0>;
@@ -100,30 +100,29 @@ arcpct0: pct {
 	mb_intc: dw-apb-ictl@0xe0012000 {
 		#interrupt-cells = <1>;
 		compatible = "snps,dw-apb-ictl";
-		reg = < 0xe0012000 0x200 >;
+		reg = < 0x0 0xe0012000 0x0 0x200 >;
 		interrupt-controller;
 		interrupt-parent = <&idu_intc>;
 		interrupts = <0>;
 	};
 
 	memory {
-		#address-cells = <1>;
-		#size-cells = <1>;
-		ranges = <0x00000000 0x80000000 0x40000000>;
 		device_type = "memory";
-		reg = <0x80000000 0x20000000>;	/* 512MiB */
+		/* CONFIG_KERNEL_RAM_BASE_ADDRESS needs to match low mem start */
+		reg = <0x0 0x80000000 0x0 0x20000000	/* 512 MiB low mem */
+		       0x1 0xc0000000 0x0 0x40000000>;	/* 1 GiB highmem */
 	};
 
 	reserved-memory {
-		#address-cells = <1>;
-		#size-cells = <1>;
+		#address-cells = <2>;
+		#size-cells = <2>;
 		ranges;
 		/*
 		 * Move frame buffer out of IOC aperture (0x8z-0xAz).
 		 */
 		frame_buffer: frame_buffer@be000000 {
 			compatible = "shared-dma-pool";
-			reg = <0xbe000000 0x2000000>;
+			reg = <0x0 0xbe000000 0x0 0x2000000>;
 			no-map;
 		};
 	};
diff --git a/arch/arc/boot/dts/axs10x_mb.dtsi b/arch/arc/boot/dts/axs10x_mb.dtsi
index 41cfb29b62c1..0ff7e07edcd4 100644
--- a/arch/arc/boot/dts/axs10x_mb.dtsi
+++ b/arch/arc/boot/dts/axs10x_mb.dtsi
@@ -13,7 +13,7 @@ axs10x_mb {
 		compatible = "simple-bus";
 		#address-cells = <1>;
 		#size-cells = <1>;
-		ranges = <0x00000000 0xe0000000 0x10000000>;
+		ranges = <0x00000000 0x0 0xe0000000 0x10000000>;
 		interrupt-parent = <&mb_intc>;
 
 		i2sclk: i2sclk@100a0 {

From 33460f86ad2c982f3172a10b17948ccaf923f07f Mon Sep 17 00:00:00 2001
From: Vineet Gupta <vgupta@synopsys.com>
Date: Fri, 28 Jul 2017 16:53:50 +0530
Subject: [PATCH 071/149] ARC: [plat-sim] Include this platform unconditionally

Essentially remove CONFIG_ARC_PLAT_SIM

There is no need for any platform specific code, just the board DTS
match strings which we can include unconditionally

Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/Kconfig                           |  1 -
 arch/arc/Makefile                          |  2 +-
 arch/arc/configs/haps_hs_defconfig         |  1 -
 arch/arc/configs/haps_hs_smp_defconfig     |  1 -
 arch/arc/configs/nsim_700_defconfig        |  1 -
 arch/arc/configs/nsim_hs_defconfig         |  1 -
 arch/arc/configs/nsim_hs_smp_defconfig     |  1 -
 arch/arc/configs/nsimosci_defconfig        |  1 -
 arch/arc/configs/nsimosci_hs_defconfig     |  1 -
 arch/arc/configs/nsimosci_hs_smp_defconfig |  1 -
 arch/arc/plat-sim/Kconfig                  | 13 -------------
 arch/arc/plat-sim/platform.c               |  5 ++++-
 12 files changed, 5 insertions(+), 24 deletions(-)
 delete mode 100644 arch/arc/plat-sim/Kconfig

diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index a5459698f0ee..7db85ab00c52 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -96,7 +96,6 @@ menu "ARC Architecture Configuration"
 
 menu "ARC Platform/SoC/Board"
 
-source "arch/arc/plat-sim/Kconfig"
 source "arch/arc/plat-tb10x/Kconfig"
 source "arch/arc/plat-axs10x/Kconfig"
 #New platform adds here
diff --git a/arch/arc/Makefile b/arch/arc/Makefile
index 44ef35d33956..3a61cfcc38c0 100644
--- a/arch/arc/Makefile
+++ b/arch/arc/Makefile
@@ -107,7 +107,7 @@ core-y		+= arch/arc/
 # w/o this dtb won't embed into kernel binary
 core-y		+= arch/arc/boot/dts/
 
-core-$(CONFIG_ARC_PLAT_SIM)	+= arch/arc/plat-sim/
+core-y				+= arch/arc/plat-sim/
 core-$(CONFIG_ARC_PLAT_TB10X)	+= arch/arc/plat-tb10x/
 core-$(CONFIG_ARC_PLAT_AXS10X)	+= arch/arc/plat-axs10x/
 core-$(CONFIG_ARC_PLAT_EZNPS)	+= arch/arc/plat-eznps/
diff --git a/arch/arc/configs/haps_hs_defconfig b/arch/arc/configs/haps_hs_defconfig
index 57b3e599322f..db04ea4dd2d9 100644
--- a/arch/arc/configs/haps_hs_defconfig
+++ b/arch/arc/configs/haps_hs_defconfig
@@ -21,7 +21,6 @@ CONFIG_MODULES=y
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
-CONFIG_ARC_PLAT_SIM=y
 CONFIG_ISA_ARCV2=y
 CONFIG_ARC_BUILTIN_DTB_NAME="haps_hs"
 CONFIG_PREEMPT=y
diff --git a/arch/arc/configs/haps_hs_smp_defconfig b/arch/arc/configs/haps_hs_smp_defconfig
index f85985adebb2..821a2e562f3f 100644
--- a/arch/arc/configs/haps_hs_smp_defconfig
+++ b/arch/arc/configs/haps_hs_smp_defconfig
@@ -23,7 +23,6 @@ CONFIG_MODULES=y
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
-CONFIG_ARC_PLAT_SIM=y
 CONFIG_ISA_ARCV2=y
 CONFIG_SMP=y
 CONFIG_ARC_BUILTIN_DTB_NAME="haps_hs_idu"
diff --git a/arch/arc/configs/nsim_700_defconfig b/arch/arc/configs/nsim_700_defconfig
index b0066a749d4c..6dff83a238b8 100644
--- a/arch/arc/configs/nsim_700_defconfig
+++ b/arch/arc/configs/nsim_700_defconfig
@@ -23,7 +23,6 @@ CONFIG_MODULES=y
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
-CONFIG_ARC_PLAT_SIM=y
 CONFIG_ARC_BUILTIN_DTB_NAME="nsim_700"
 CONFIG_PREEMPT=y
 # CONFIG_COMPACTION is not set
diff --git a/arch/arc/configs/nsim_hs_defconfig b/arch/arc/configs/nsim_hs_defconfig
index ebe9ebb92933..31ee51b987e7 100644
--- a/arch/arc/configs/nsim_hs_defconfig
+++ b/arch/arc/configs/nsim_hs_defconfig
@@ -26,7 +26,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
-CONFIG_ARC_PLAT_SIM=y
 CONFIG_ISA_ARCV2=y
 CONFIG_ARC_BUILTIN_DTB_NAME="nsim_hs"
 CONFIG_PREEMPT=y
diff --git a/arch/arc/configs/nsim_hs_smp_defconfig b/arch/arc/configs/nsim_hs_smp_defconfig
index 4bde43278be6..8d3b1f67cae4 100644
--- a/arch/arc/configs/nsim_hs_smp_defconfig
+++ b/arch/arc/configs/nsim_hs_smp_defconfig
@@ -24,7 +24,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
-CONFIG_ARC_PLAT_SIM=y
 CONFIG_ISA_ARCV2=y
 CONFIG_SMP=y
 CONFIG_ARC_BUILTIN_DTB_NAME="nsim_hs_idu"
diff --git a/arch/arc/configs/nsimosci_defconfig b/arch/arc/configs/nsimosci_defconfig
index f6fb3d26557e..6168ce2ac2ef 100644
--- a/arch/arc/configs/nsimosci_defconfig
+++ b/arch/arc/configs/nsimosci_defconfig
@@ -23,7 +23,6 @@ CONFIG_MODULES=y
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
-CONFIG_ARC_PLAT_SIM=y
 CONFIG_ARC_BUILTIN_DTB_NAME="nsimosci"
 # CONFIG_COMPACTION is not set
 CONFIG_NET=y
diff --git a/arch/arc/configs/nsimosci_hs_defconfig b/arch/arc/configs/nsimosci_hs_defconfig
index b9f0fe00044b..a70bdeb2b3fd 100644
--- a/arch/arc/configs/nsimosci_hs_defconfig
+++ b/arch/arc/configs/nsimosci_hs_defconfig
@@ -23,7 +23,6 @@ CONFIG_MODULES=y
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
-CONFIG_ARC_PLAT_SIM=y
 CONFIG_ISA_ARCV2=y
 CONFIG_ARC_BUILTIN_DTB_NAME="nsimosci_hs"
 # CONFIG_COMPACTION is not set
diff --git a/arch/arc/configs/nsimosci_hs_smp_defconfig b/arch/arc/configs/nsimosci_hs_smp_defconfig
index 155add7761ed..ef96406c446e 100644
--- a/arch/arc/configs/nsimosci_hs_smp_defconfig
+++ b/arch/arc/configs/nsimosci_hs_smp_defconfig
@@ -18,7 +18,6 @@ CONFIG_MODULES=y
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_IOSCHED_DEADLINE is not set
 # CONFIG_IOSCHED_CFQ is not set
-CONFIG_ARC_PLAT_SIM=y
 CONFIG_ISA_ARCV2=y
 CONFIG_SMP=y
 # CONFIG_ARC_TIMERS_64BIT is not set
diff --git a/arch/arc/plat-sim/Kconfig b/arch/arc/plat-sim/Kconfig
deleted file mode 100644
index ac6af96a82f3..000000000000
--- a/arch/arc/plat-sim/Kconfig
+++ /dev/null
@@ -1,13 +0,0 @@
-#
-# Copyright (C) 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-#
-
-menuconfig ARC_PLAT_SIM
-	bool "ARC nSIM based simulation virtual platforms"
-	help
-	  Support for nSIM based ARC simulation platforms
-	  This includes the standalone nSIM (uart only) vs. System C OSCI VP
diff --git a/arch/arc/plat-sim/platform.c b/arch/arc/plat-sim/platform.c
index aea87389e44b..5cda56b1a2ea 100644
--- a/arch/arc/plat-sim/platform.c
+++ b/arch/arc/plat-sim/platform.c
@@ -20,11 +20,14 @@
  */
 
 static const char *simulation_compat[] __initconst = {
+#ifdef CONFIG_ISA_ARCOMPACT
 	"snps,nsim",
-	"snps,nsim_hs",
 	"snps,nsimosci",
+#else
+	"snps,nsim_hs",
 	"snps,nsimosci_hs",
 	"snps,zebu_hs",
+#endif
 	NULL,
 };
 

From b37174d95b0251611a80ef60abf03752e9d66d67 Mon Sep 17 00:00:00 2001
From: Alexey Brodkin <abrodkin@synopsys.com>
Date: Fri, 7 Jul 2017 12:25:14 +0300
Subject: [PATCH 072/149] ARCv2: SLC: Make sure busy bit is set properly for
 region ops

c70c473396cb "ARCv2: SLC: Make sure busy bit is set properly on SLC flushing"
fixes problem for entire SLC operation where the problem was initially
caught. But given a nature of the issue it is perfectly possible for
busy bit to be read incorrectly even when region operation was started.

So extending initial fix for regional operation as well.

Signed-off-by: Alexey Brodkin <abrodkin@synopsys.com>
Cc: stable@vger.kernel.org   #4.10
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/mm/cache.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c
index a867575a758b..bebc24cb7912 100644
--- a/arch/arc/mm/cache.c
+++ b/arch/arc/mm/cache.c
@@ -697,6 +697,9 @@ noinline void slc_op(phys_addr_t paddr, unsigned long sz, const int op)
 	write_aux_reg(ARC_REG_SLC_RGN_END, (paddr + sz + l2_line_sz - 1));
 	write_aux_reg(ARC_REG_SLC_RGN_START, paddr);
 
+	/* Make sure "busy" bit reports correct stataus, see STAR 9001165532 */
+	read_aux_reg(ARC_REG_SLC_CTRL);
+
 	while (read_aux_reg(ARC_REG_SLC_CTRL) & SLC_CTRL_BUSY);
 
 	spin_unlock_irqrestore(&lock, flags);

From 2e332fec2f2c996f8d5447b0946ca43bb0ae4b42 Mon Sep 17 00:00:00 2001
From: Vineet Gupta <vgupta@synopsys.com>
Date: Tue, 18 Jul 2017 12:14:09 -0700
Subject: [PATCH 073/149] ARC: dma: implement dma_unmap_page and sg variant

Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/mm/dma.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c
index 2a07e6ecafbd..1d0326d874e7 100644
--- a/arch/arc/mm/dma.c
+++ b/arch/arc/mm/dma.c
@@ -153,6 +153,19 @@ static void _dma_cache_sync(phys_addr_t paddr, size_t size,
 	}
 }
 
+/*
+ * arc_dma_map_page - map a portion of a page for streaming DMA
+ *
+ * Ensure that any data held in the cache is appropriately discarded
+ * or written back.
+ *
+ * The device owns this memory once this call has completed.  The CPU
+ * can regain ownership by calling dma_unmap_page().
+ *
+ * Note: while it takes struct page as arg, caller can "abuse" it to pass
+ * a region larger than PAGE_SIZE, provided it is physically contiguous
+ * and this still works correctly
+ */
 static dma_addr_t arc_dma_map_page(struct device *dev, struct page *page,
 		unsigned long offset, size_t size, enum dma_data_direction dir,
 		unsigned long attrs)
@@ -165,6 +178,24 @@ static dma_addr_t arc_dma_map_page(struct device *dev, struct page *page,
 	return plat_phys_to_dma(dev, paddr);
 }
 
+/*
+ * arc_dma_unmap_page - unmap a buffer previously mapped through dma_map_page()
+ *
+ * After this call, reads by the CPU to the buffer are guaranteed to see
+ * whatever the device wrote there.
+ *
+ * Note: historically this routine was not implemented for ARC
+ */
+static void arc_dma_unmap_page(struct device *dev, dma_addr_t handle,
+			       size_t size, enum dma_data_direction dir,
+			       unsigned long attrs)
+{
+	phys_addr_t paddr = plat_dma_to_phys(dev, handle);
+
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		_dma_cache_sync(paddr, size, dir);
+}
+
 static int arc_dma_map_sg(struct device *dev, struct scatterlist *sg,
 	   int nents, enum dma_data_direction dir, unsigned long attrs)
 {
@@ -178,6 +209,18 @@ static int arc_dma_map_sg(struct device *dev, struct scatterlist *sg,
 	return nents;
 }
 
+static void arc_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
+			     int nents, enum dma_data_direction dir,
+			     unsigned long attrs)
+{
+	struct scatterlist *s;
+	int i;
+
+	for_each_sg(sg, s, nents, i)
+		arc_dma_unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir,
+				   attrs);
+}
+
 static void arc_dma_sync_single_for_cpu(struct device *dev,
 		dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
 {
@@ -223,7 +266,9 @@ const struct dma_map_ops arc_dma_ops = {
 	.free			= arc_dma_free,
 	.mmap			= arc_dma_mmap,
 	.map_page		= arc_dma_map_page,
+	.unmap_page		= arc_dma_unmap_page,
 	.map_sg			= arc_dma_map_sg,
+	.unmap_sg		= arc_dma_unmap_sg,
 	.sync_single_for_device	= arc_dma_sync_single_for_device,
 	.sync_single_for_cpu	= arc_dma_sync_single_for_cpu,
 	.sync_sg_for_cpu	= arc_dma_sync_sg_for_cpu,

From 7d79cee2c6540ea64dd917a14e2fd63d4ac3d3c0 Mon Sep 17 00:00:00 2001
From: Alexey Brodkin <Alexey.Brodkin@synopsys.com>
Date: Tue, 1 Aug 2017 12:58:47 +0300
Subject: [PATCH 074/149] ARCv2: PAE40: Explicitly set MSB counterpart of SLC
 region ops addresses

It is necessary to explicitly set both SLC_AUX_RGN_START1 and SLC_AUX_RGN_END1
which hold MSB bits of the physical address correspondingly of region start
and end otherwise SLC region operation is executed in unpredictable manner

Without this patch, SLC flushes on HSDK (IOC disabled) were taking
seconds.

Cc: stable@vger.kernel.org   #4.4+
Reported-by: Vladimir Kondratiev <vladimir.kondratiev@intel.com>
Signed-off-by: Alexey Brodkin <abrodkin@synopsys.com>
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
[vgupta: PAR40 regs only written if PAE40 exist]
---
 arch/arc/include/asm/cache.h |  2 ++
 arch/arc/mm/cache.c          | 13 +++++++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/arc/include/asm/cache.h b/arch/arc/include/asm/cache.h
index 19ebddffb279..02fd1cece6ef 100644
--- a/arch/arc/include/asm/cache.h
+++ b/arch/arc/include/asm/cache.h
@@ -96,7 +96,9 @@ extern unsigned long perip_base, perip_end;
 #define ARC_REG_SLC_FLUSH	0x904
 #define ARC_REG_SLC_INVALIDATE	0x905
 #define ARC_REG_SLC_RGN_START	0x914
+#define ARC_REG_SLC_RGN_START1	0x915
 #define ARC_REG_SLC_RGN_END	0x916
+#define ARC_REG_SLC_RGN_END1	0x917
 
 /* Bit val in SLC_CONTROL */
 #define SLC_CTRL_DIS		0x001
diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c
index bebc24cb7912..874913b3e826 100644
--- a/arch/arc/mm/cache.c
+++ b/arch/arc/mm/cache.c
@@ -665,6 +665,7 @@ noinline void slc_op(phys_addr_t paddr, unsigned long sz, const int op)
 	static DEFINE_SPINLOCK(lock);
 	unsigned long flags;
 	unsigned int ctrl;
+	phys_addr_t end;
 
 	spin_lock_irqsave(&lock, flags);
 
@@ -694,8 +695,16 @@ noinline void slc_op(phys_addr_t paddr, unsigned long sz, const int op)
 	 * END needs to be setup before START (latter triggers the operation)
 	 * END can't be same as START, so add (l2_line_sz - 1) to sz
 	 */
-	write_aux_reg(ARC_REG_SLC_RGN_END, (paddr + sz + l2_line_sz - 1));
-	write_aux_reg(ARC_REG_SLC_RGN_START, paddr);
+	end = paddr + sz + l2_line_sz - 1;
+	if (is_pae40_enabled())
+		write_aux_reg(ARC_REG_SLC_RGN_END1, upper_32_bits(end));
+
+	write_aux_reg(ARC_REG_SLC_RGN_END, lower_32_bits(end));
+
+	if (is_pae40_enabled())
+		write_aux_reg(ARC_REG_SLC_RGN_START1, upper_32_bits(paddr));
+
+	write_aux_reg(ARC_REG_SLC_RGN_START, lower_32_bits(paddr));
 
 	/* Make sure "busy" bit reports correct stataus, see STAR 9001165532 */
 	read_aux_reg(ARC_REG_SLC_CTRL);

From b5ddb6d54729d814356937572d6c9b599f10c29f Mon Sep 17 00:00:00 2001
From: Vineet Gupta <vgupta@synopsys.com>
Date: Thu, 3 Aug 2017 17:45:44 +0530
Subject: [PATCH 075/149] ARCv2: PAE40: set MSB even if !CONFIG_ARC_HAS_PAE40
 but PAE exists in SoC

PAE40 confiuration in hardware extends some of the address registers
for TLB/cache ops to 2 words.

So far kernel was NOT setting the higher word if feature was not enabled
in software which is wrong. Those need to be set to 0 in such case.

Normally this would be done in the cache flush / tlb ops, however since
these registers only exist conditionally, this would have to be
conditional to a flag being set on boot which is expensive/ugly -
specially for the more common case of PAE exists but not in use.
Optimize that by zero'ing them once at boot - nobody will write to
them afterwards

Cc: stable@vger.kernel.org   #4.4+
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/include/asm/mmu.h |  2 ++
 arch/arc/mm/cache.c        | 34 ++++++++++++++++++++++++++++------
 arch/arc/mm/tlb.c          | 12 +++++++++++-
 3 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/arch/arc/include/asm/mmu.h b/arch/arc/include/asm/mmu.h
index db7319e9b506..efb79fafff1d 100644
--- a/arch/arc/include/asm/mmu.h
+++ b/arch/arc/include/asm/mmu.h
@@ -94,6 +94,8 @@ static inline int is_pae40_enabled(void)
 	return IS_ENABLED(CONFIG_ARC_HAS_PAE40);
 }
 
+extern int pae40_exist_but_not_enab(void);
+
 #endif	/* !__ASSEMBLY__ */
 
 #endif
diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c
index 874913b3e826..7db283b46ebd 100644
--- a/arch/arc/mm/cache.c
+++ b/arch/arc/mm/cache.c
@@ -1123,6 +1123,13 @@ noinline void __init arc_ioc_setup(void)
 	__dc_enable();
 }
 
+/*
+ * Cache related boot time checks/setups only needed on master CPU:
+ *  - Geometry checks (kernel build and hardware agree: e.g. L1_CACHE_BYTES)
+ *    Assume SMP only, so all cores will have same cache config. A check on
+ *    one core suffices for all
+ *  - IOC setup / dma callbacks only need to be done once
+ */
 void __init arc_cache_init_master(void)
 {
 	unsigned int __maybe_unused cpu = smp_processor_id();
@@ -1202,12 +1209,27 @@ void __ref arc_cache_init(void)
 
 	printk(arc_cache_mumbojumbo(0, str, sizeof(str)));
 
-	/*
-	 * Only master CPU needs to execute rest of function:
-	 *  - Assume SMP so all cores will have same cache config so
-	 *    any geomtry checks will be same for all
-	 *  - IOC setup / dma callbacks only need to be setup once
-	 */
 	if (!cpu)
 		arc_cache_init_master();
+
+	/*
+	 * In PAE regime, TLB and cache maintenance ops take wider addresses
+	 * And even if PAE is not enabled in kernel, the upper 32-bits still need
+	 * to be zeroed to keep the ops sane.
+	 * As an optimization for more common !PAE enabled case, zero them out
+	 * once at init, rather than checking/setting to 0 for every runtime op
+	 */
+	if (is_isa_arcv2() && pae40_exist_but_not_enab()) {
+
+		if (IS_ENABLED(CONFIG_ARC_HAS_ICACHE))
+			write_aux_reg(ARC_REG_IC_PTAG_HI, 0);
+
+		if (IS_ENABLED(CONFIG_ARC_HAS_DCACHE))
+			write_aux_reg(ARC_REG_DC_PTAG_HI, 0);
+
+		if (l2_line_sz) {
+			write_aux_reg(ARC_REG_SLC_RGN_END1, 0);
+			write_aux_reg(ARC_REG_SLC_RGN_START1, 0);
+		}
+	}
 }
diff --git a/arch/arc/mm/tlb.c b/arch/arc/mm/tlb.c
index d0126fdfe2d8..b181f3ee38aa 100644
--- a/arch/arc/mm/tlb.c
+++ b/arch/arc/mm/tlb.c
@@ -104,6 +104,8 @@
 /* A copy of the ASID from the PID reg is kept in asid_cache */
 DEFINE_PER_CPU(unsigned int, asid_cache) = MM_CTXT_FIRST_CYCLE;
 
+static int __read_mostly pae_exists;
+
 /*
  * Utility Routine to erase a J-TLB entry
  * Caller needs to setup Index Reg (manually or via getIndex)
@@ -784,7 +786,7 @@ void read_decode_mmu_bcr(void)
 		mmu->u_dtlb = mmu4->u_dtlb * 4;
 		mmu->u_itlb = mmu4->u_itlb * 4;
 		mmu->sasid = mmu4->sasid;
-		mmu->pae = mmu4->pae;
+		pae_exists = mmu->pae = mmu4->pae;
 	}
 }
 
@@ -809,6 +811,11 @@ char *arc_mmu_mumbojumbo(int cpu_id, char *buf, int len)
 	return buf;
 }
 
+int pae40_exist_but_not_enab(void)
+{
+	return pae_exists && !is_pae40_enabled();
+}
+
 void arc_mmu_init(void)
 {
 	char str[256];
@@ -859,6 +866,9 @@ void arc_mmu_init(void)
 	/* swapper_pg_dir is the pgd for the kernel, used by vmalloc */
 	write_aux_reg(ARC_REG_SCRATCH_DATA0, swapper_pg_dir);
 #endif
+
+	if (pae40_exist_but_not_enab())
+		write_aux_reg(ARC_REG_TLBPD1HI, 0);
 }
 
 /*

From cb87481ee89dbd6609e227afbf64900fb4e5c930 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 26 Jul 2017 22:46:27 +1000
Subject: [PATCH 076/149] kbuild: linker script do not match C names unless
 LD_DEAD_CODE_DATA_ELIMINATION is configured

The .data and .bss sections were modified in the generic linker script to
pull in sections named .data.<C identifier>, which are generated by gcc with
-ffunction-sections and -fdata-sections options.

The problem with this pattern is it can also match section names that Linux
defines explicitly, e.g., .data.unlikely. This can cause Linux sections to
get moved into the wrong place.

The way to avoid this is to use ".." separators for explicit section names
(the dot character is valid in a section name but not a C identifier).
However currently there are sections which don't follow this rule, so for
now just disable the wild card by default.

Example: http://marc.info/?l=linux-arm-kernel&m=150106824024221&w=2

Cc: <stable@vger.kernel.org> # 4.9
Fixes: b67067f1176df ("kbuild: allow archs to select link dead code/data elimination")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 include/asm-generic/vmlinux.lds.h | 38 +++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 12 deletions(-)

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index da0be9a8d1de..9623d78f8494 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -59,6 +59,22 @@
 /* Align . to a 8 byte boundary equals to maximum function alignment. */
 #define ALIGN_FUNCTION()  . = ALIGN(8)
 
+/*
+ * LD_DEAD_CODE_DATA_ELIMINATION option enables -fdata-sections, which
+ * generates .data.identifier sections, which need to be pulled in with
+ * .data. We don't want to pull in .data..other sections, which Linux
+ * has defined. Same for text and bss.
+ */
+#ifdef CONFIG_LD_DEAD_CODE_DATA_ELIMINATION
+#define TEXT_MAIN .text .text.[0-9a-zA-Z_]*
+#define DATA_MAIN .data .data.[0-9a-zA-Z_]*
+#define BSS_MAIN .bss .bss.[0-9a-zA-Z_]*
+#else
+#define TEXT_MAIN .text
+#define DATA_MAIN .data
+#define BSS_MAIN .bss
+#endif
+
 /*
  * Align to a 32 byte boundary equal to the
  * alignment gcc 4.5 uses for a struct
@@ -198,12 +214,9 @@
 
 /*
  * .data section
- * LD_DEAD_CODE_DATA_ELIMINATION option enables -fdata-sections generates
- * .data.identifier which needs to be pulled in with .data, but don't want to
- * pull in .data..stuff which has its own requirements. Same for bss.
  */
 #define DATA_DATA							\
-	*(.data .data.[0-9a-zA-Z_]*)					\
+	*(DATA_MAIN)							\
 	*(.ref.data)							\
 	*(.data..shared_aligned) /* percpu related */			\
 	MEM_KEEP(init.data)						\
@@ -434,16 +447,17 @@
 		VMLINUX_SYMBOL(__security_initcall_end) = .;		\
 	}
 
-/* .text section. Map to function alignment to avoid address changes
+/*
+ * .text section. Map to function alignment to avoid address changes
  * during second ld run in second ld pass when generating System.map
- * LD_DEAD_CODE_DATA_ELIMINATION option enables -ffunction-sections generates
- * .text.identifier which needs to be pulled in with .text , but some
- * architectures define .text.foo which is not intended to be pulled in here.
- * Those enabling LD_DEAD_CODE_DATA_ELIMINATION must ensure they don't have
- * conflicting section names, and must pull in .text.[0-9a-zA-Z_]* */
+ *
+ * TEXT_MAIN here will match .text.fixup and .text.unlikely if dead
+ * code elimination is enabled, so these sections should be converted
+ * to use ".." first.
+ */
 #define TEXT_TEXT							\
 		ALIGN_FUNCTION();					\
-		*(.text.hot .text .text.fixup .text.unlikely)		\
+		*(.text.hot TEXT_MAIN .text.fixup .text.unlikely)	\
 		*(.ref.text)						\
 	MEM_KEEP(init.text)						\
 	MEM_KEEP(exit.text)						\
@@ -613,7 +627,7 @@
 		BSS_FIRST_SECTIONS					\
 		*(.bss..page_aligned)					\
 		*(.dynbss)						\
-		*(.bss .bss.[0-9a-zA-Z_]*)				\
+		*(BSS_MAIN)						\
 		*(COMMON)						\
 	}
 

From 312a3d0918bb7d65862fbbd3e2f2f4630e4d6f56 Mon Sep 17 00:00:00 2001
From: Cao jin <caoj.fnst@cn.fujitsu.com>
Date: Wed, 2 Aug 2017 10:31:06 +0800
Subject: [PATCH 077/149] kbuild: trivial cleanups on the comments

This is a bunch of trivial fixes and cleanups.

Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 Makefile               | 10 +++++-----
 scripts/Kbuild.include |  7 +++----
 scripts/Makefile.build |  8 ++++----
 scripts/basic/Makefile |  2 +-
 4 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/Makefile b/Makefile
index 6eba23bcb5ad..473f1aa76f6d 100644
--- a/Makefile
+++ b/Makefile
@@ -442,7 +442,7 @@ export RCS_TAR_IGNORE := --exclude SCCS --exclude BitKeeper --exclude .svn \
 # ===========================================================================
 # Rules shared between *config targets and build targets
 
-# Basic helpers built in scripts/
+# Basic helpers built in scripts/basic/
 PHONY += scripts_basic
 scripts_basic:
 	$(Q)$(MAKE) $(build)=scripts/basic
@@ -505,7 +505,7 @@ ifeq ($(KBUILD_EXTMOD),)
                 endif
         endif
 endif
-# install and module_install need also be processed one by one
+# install and modules_install need also be processed one by one
 ifneq ($(filter install,$(MAKECMDGOALS)),)
         ifneq ($(filter modules_install,$(MAKECMDGOALS)),)
 	        mixed-targets := 1
@@ -964,7 +964,7 @@ export KBUILD_VMLINUX_MAIN := $(core-y) $(libs-y2) $(drivers-y) $(net-y) $(virt-
 export KBUILD_VMLINUX_LIBS := $(libs-y1)
 export KBUILD_LDS          := arch/$(SRCARCH)/kernel/vmlinux.lds
 export LDFLAGS_vmlinux
-# used by scripts/pacmage/Makefile
+# used by scripts/package/Makefile
 export KBUILD_ALLDIRS := $(sort $(filter-out arch/%,$(vmlinux-alldirs)) arch Documentation include samples scripts tools)
 
 vmlinux-deps := $(KBUILD_LDS) $(KBUILD_VMLINUX_INIT) $(KBUILD_VMLINUX_MAIN) $(KBUILD_VMLINUX_LIBS)
@@ -992,8 +992,8 @@ include/generated/autoksyms.h: FORCE
 ARCH_POSTLINK := $(wildcard $(srctree)/arch/$(SRCARCH)/Makefile.postlink)
 
 # Final link of vmlinux with optional arch pass after final link
-    cmd_link-vmlinux =                                                 \
-	$(CONFIG_SHELL) $< $(LD) $(LDFLAGS) $(LDFLAGS_vmlinux) ;       \
+cmd_link-vmlinux =                                                 \
+	$(CONFIG_SHELL) $< $(LD) $(LDFLAGS) $(LDFLAGS_vmlinux) ;    \
 	$(if $(ARCH_POSTLINK), $(MAKE) -f $(ARCH_POSTLINK) $@, true)
 
 vmlinux: scripts/link-vmlinux.sh vmlinux_prereq $(vmlinux-deps) FORCE
diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include
index dd8e2dde0b34..9ffd3dda3889 100644
--- a/scripts/Kbuild.include
+++ b/scripts/Kbuild.include
@@ -85,8 +85,8 @@ TMPOUT := $(if $(KBUILD_EXTMOD),$(firstword $(KBUILD_EXTMOD))/)
 
 # try-run
 # Usage: option = $(call try-run, $(CC)...-o "$$TMP",option-ok,otherwise)
-# Exit code chooses option. "$$TMP" is can be used as temporary file and
-# is automatically cleaned up.
+# Exit code chooses option. "$$TMP" serves as a temporary file and is
+# automatically cleaned up.
 try-run = $(shell set -e;		\
 	TMP="$(TMPOUT).$$$$.tmp";	\
 	TMPO="$(TMPOUT).$$$$.o";	\
@@ -261,7 +261,6 @@ make-cmd = $(call escsq,$(subst \#,\\\#,$(subst $$,$$$$,$(cmd_$(1)))))
 any-prereq = $(filter-out $(PHONY),$?) $(filter-out $(PHONY) $(wildcard $^),$^)
 
 # Execute command if command has changed or prerequisite(s) are updated.
-#
 if_changed = $(if $(strip $(any-prereq) $(arg-check)),                       \
 	@set -e;                                                             \
 	$(echo-cmd) $(cmd_$(1));                                             \
@@ -315,7 +314,7 @@ if_changed_rule = $(if $(strip $(any-prereq) $(arg-check) ),                 \
 	$(rule_$(1)), @:)
 
 ###
-# why - tell why a a target got build
+# why - tell why a target got built
 #       enabled by make V=2
 #       Output (listed in the order they are checked):
 #          (1) - due to target is PHONY
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index 4a9a2cec0a1b..f6152c70f7f4 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -229,8 +229,8 @@ ifeq ("$(origin RECORDMCOUNT_WARN)", "command line")
 endif
 # Due to recursion, we must skip empty.o.
 # The empty.o file is created in the make process in order to determine
-#  the target endianness and word size. It is made before all other C
-#  files, including recordmcount.
+# the target endianness and word size. It is made before all other C
+# files, including recordmcount.
 sub_cmd_record_mcount =					\
 	if [ $(@) != "scripts/mod/empty.o" ]; then	\
 		$(objtree)/scripts/recordmcount $(RECORDMCOUNT_FLAGS) "$(@)";	\
@@ -245,13 +245,13 @@ sub_cmd_record_mcount = set -e ; perl $(srctree)/scripts/recordmcount.pl "$(ARCH
 	"$(LD)" "$(NM)" "$(RM)" "$(MV)" \
 	"$(if $(part-of-module),1,0)" "$(@)";
 recordmcount_source := $(srctree)/scripts/recordmcount.pl
-endif
+endif # BUILD_C_RECORDMCOUNT
 cmd_record_mcount =						\
 	if [ "$(findstring $(CC_FLAGS_FTRACE),$(_c_flags))" =	\
 	     "$(CC_FLAGS_FTRACE)" ]; then			\
 		$(sub_cmd_record_mcount)			\
 	fi;
-endif
+endif # CONFIG_FTRACE_MCOUNT_RECORD
 
 ifdef CONFIG_STACK_VALIDATION
 ifneq ($(SKIP_STACK_VALIDATION),1)
diff --git a/scripts/basic/Makefile b/scripts/basic/Makefile
index ec10d9345bc2..0372b33febe5 100644
--- a/scripts/basic/Makefile
+++ b/scripts/basic/Makefile
@@ -1,5 +1,5 @@
 ###
-# Makefile.basic lists the most basic programs used during the build process.
+# This Makefile lists the most basic programs used during the build process.
 # The programs listed herein are what are needed to do the basic stuff,
 # such as fix file dependencies.
 # This initial step is needed to avoid files to be recompiled

From 4e433fc4d1a93310d31e7671faca8660a2890908 Mon Sep 17 00:00:00 2001
From: Cao jin <caoj.fnst@cn.fujitsu.com>
Date: Tue, 8 Aug 2017 21:20:50 +0800
Subject: [PATCH 078/149] fixdep: trivial: typo fix and correction

Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 scripts/basic/fixdep.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/basic/fixdep.c b/scripts/basic/fixdep.c
index fff818b92acb..bbf62cb1f819 100644
--- a/scripts/basic/fixdep.c
+++ b/scripts/basic/fixdep.c
@@ -25,7 +25,7 @@
  *
  * So we play the same trick that "mkdep" played before. We replace
  * the dependency on autoconf.h by a dependency on every config
- * option which is mentioned in any of the listed prequisites.
+ * option which is mentioned in any of the listed prerequisites.
  *
  * kconfig populates a tree in include/config/ with an empty file
  * for each config symbol and when the configuration is updated
@@ -34,7 +34,7 @@
  * the config symbols are rebuilt.
  *
  * So if the user changes his CONFIG_HIS_DRIVER option, only the objects
- * which depend on "include/linux/config/his/driver.h" will be rebuilt,
+ * which depend on "include/config/his/driver.h" will be rebuilt,
  * so most likely only his driver ;-)
  *
  * The idea above dates, by the way, back to Michael E Chastain, AFAIK.
@@ -75,7 +75,7 @@
  * and then basically copies the .<target>.d file to stdout, in the
  * process filtering out the dependency on autoconf.h and adding
  * dependencies on include/config/my/option.h for every
- * CONFIG_MY_OPTION encountered in any of the prequisites.
+ * CONFIG_MY_OPTION encountered in any of the prerequisites.
  *
  * It will also filter out all the dependencies on *.ver. We need
  * to make sure that the generated version checksum are globally up

From 9157259d16a8ee8116a98d32f29b797689327e8d Mon Sep 17 00:00:00 2001
From: Zi Yan <zi.yan@cs.rutgers.edu>
Date: Thu, 3 Aug 2017 09:17:21 -0400
Subject: [PATCH 079/149] mm: add pmd_t initializer __pmd() to work around a
 GCC bug.

THP migration is added but only supports x86_64 at the moment. For all
other architectures, swp_entry_to_pmd() only returns a zero pmd_t.

Due to a GCC zero initializer bug #53119, the standard (pmd_t){0}
initializer is not accepted by all GCC versions. __pmd() is a feasible
workaround. In addition, sparc32's pmd_t is an array instead of a single
value, so we need (pmd_t){ {0}, } instead of (pmd_t){0}. Thus,
a different __pmd() definition is needed in sparc32.

Signed-off-by: Zi Yan <zi.yan@cs.rutgers.edu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/include/asm/page_32.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/sparc/include/asm/page_32.h b/arch/sparc/include/asm/page_32.h
index 0efd0583a8c9..6249214148c2 100644
--- a/arch/sparc/include/asm/page_32.h
+++ b/arch/sparc/include/asm/page_32.h
@@ -68,6 +68,7 @@ typedef struct { unsigned long iopgprot; } iopgprot_t;
 #define iopgprot_val(x)	((x).iopgprot)
 
 #define __pte(x)	((pte_t) { (x) } )
+#define __pmd(x)	((pmd_t) { { (x) }, })
 #define __iopte(x)	((iopte_t) { (x) } )
 #define __pgd(x)	((pgd_t) { (x) } )
 #define __ctxd(x)	((ctxd_t) { (x) } )
@@ -95,6 +96,7 @@ typedef unsigned long iopgprot_t;
 #define iopgprot_val(x)	(x)
 
 #define __pte(x)	(x)
+#define __pmd(x)	((pmd_t) { { (x) }, })
 #define __iopte(x)	(x)
 #define __pgd(x)	(x)
 #define __ctxd(x)	(x)

From a8ec3ee861b6e4e6b82a98777c65510ae63766c1 Mon Sep 17 00:00:00 2001
From: Alexey Brodkin <Alexey.Brodkin@synopsys.com>
Date: Thu, 10 Aug 2017 18:07:36 +0300
Subject: [PATCH 080/149] arc: Mask individual IRQ lines during core INTC init

ARC cores on reset have all interrupt lines of built-in INTC enabled.
Which means once we globally enable interrupts (very early on boot)
faulty hardware blocks may trigger an interrupt that Linux kernel
cannot handle yet as corresponding handler is not yet installed.

In that case system falls in "interrupt storm" and basically never
does anything useful except entering and exiting generic IRQ handling
code.

One real example of that kind of problematic hardware is DW GMAC which
also has interrupts enabled on reset and if Ethernet PHY informs GMAC
about link state, GMAC immediately reports that upstream to ARC core
and here we are.

Now with that change we mask all individual IRQ lines making entire
system more fool-proof.

[This patch was motivated by Adaptrum platform support]

Signed-off-by: Alexey Brodkin <abrodkin@synopsys.com>
Cc: Eugeniy Paltsev <paltsev@synopsys.com>
Tested-by: Alexandru Gagniuc <alex.g@adaptrum.com>
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
---
 arch/arc/kernel/intc-arcv2.c   |  3 +++
 arch/arc/kernel/intc-compact.c | 14 +++++++++++++-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/arch/arc/kernel/intc-arcv2.c b/arch/arc/kernel/intc-arcv2.c
index f928795fd07a..cf90714a676d 100644
--- a/arch/arc/kernel/intc-arcv2.c
+++ b/arch/arc/kernel/intc-arcv2.c
@@ -75,10 +75,13 @@ void arc_init_IRQ(void)
 	 * Set a default priority for all available interrupts to prevent
 	 * switching of register banks if Fast IRQ and multiple register banks
 	 * are supported by CPU.
+	 * Also disable all IRQ lines so faulty external hardware won't
+	 * trigger interrupt that kernel is not ready to handle.
 	 */
 	for (i = NR_EXCEPTIONS; i < irq_bcr.irqs + NR_EXCEPTIONS; i++) {
 		write_aux_reg(AUX_IRQ_SELECT, i);
 		write_aux_reg(AUX_IRQ_PRIORITY, ARCV2_IRQ_DEF_PRIO);
+		write_aux_reg(AUX_IRQ_ENABLE, 0);
 	}
 
 	/* setup status32, don't enable intr yet as kernel doesn't want */
diff --git a/arch/arc/kernel/intc-compact.c b/arch/arc/kernel/intc-compact.c
index 7e608c6b0a01..cef388025adf 100644
--- a/arch/arc/kernel/intc-compact.c
+++ b/arch/arc/kernel/intc-compact.c
@@ -27,7 +27,7 @@
  */
 void arc_init_IRQ(void)
 {
-	int level_mask = 0;
+	int level_mask = 0, i;
 
        /* Is timer high priority Interrupt (Level2 in ARCompact jargon) */
 	level_mask |= IS_ENABLED(CONFIG_ARC_COMPACT_IRQ_LEVELS) << TIMER0_IRQ;
@@ -40,6 +40,18 @@ void arc_init_IRQ(void)
 
 	if (level_mask)
 		pr_info("Level-2 interrupts bitset %x\n", level_mask);
+
+	/*
+	 * Disable all IRQ lines so faulty external hardware won't
+	 * trigger interrupt that kernel is not ready to handle.
+	 */
+	for (i = TIMER0_IRQ; i < NR_CPU_IRQS; i++) {
+		unsigned int ienb;
+
+		ienb = read_aux_reg(AUX_IENABLE);
+		ienb &= ~(1 << i);
+		write_aux_reg(AUX_IENABLE, ienb);
+	}
 }
 
 /*

From b80b32b6d5e79798b85cd4644206aaa069059390 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 14 Aug 2017 08:29:18 -0400
Subject: [PATCH 081/149] ext4: fix clang build regression

Arnd Bergmann <arnd@arndb.de>

As Stefan pointed out, I misremembered what clang can do specifically,
and it turns out that the variable-length array at the end of the
structure did not work (a flexible array would have worked here
but not solved the problem):

fs/ext4/mballoc.c:2303:17: error: fields must have a constant size:
'variable length array in structure' extension will never be supported
                ext4_grpblk_t counters[blocksize_bits + 2];

This reverts part of my previous patch, using a fixed-size array
again, but keeping the check for the array overflow.

Fixes: 2df2c3402fc8 ("ext4: fix warning about stack corruption")
Reported-by: Stefan Agner <stefan@agner.ch>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 5a1052627a81..701085620cd8 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2300,7 +2300,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
 					     EXT4_MAX_BLOCK_LOG_SIZE);
 	struct sg {
 		struct ext4_group_info info;
-		ext4_grpblk_t counters[blocksize_bits + 2];
+		ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
 	} sg;
 
 	group--;
@@ -2309,6 +2309,9 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
 			      " 2^0   2^1   2^2   2^3   2^4   2^5   2^6  "
 			      " 2^7   2^8   2^9   2^10  2^11  2^12  2^13  ]\n");
 
+	i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
+		sizeof(struct ext4_group_info);
+
 	grinfo = ext4_get_group_info(sb, group);
 	/* Load the group info in memory only if not already loaded. */
 	if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
@@ -2320,7 +2323,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
 		buddy_loaded = 1;
 	}
 
-	memcpy(&sg, ext4_get_group_info(sb, group), sizeof(sg));
+	memcpy(&sg, ext4_get_group_info(sb, group), i);
 
 	if (buddy_loaded)
 		ext4_mb_unload_buddy(&e4b);

From 32aaf194201e98db4235b7b71ac62a22e2ac355f Mon Sep 17 00:00:00 2001
From: Tahsin Erdogan <tahsin@google.com>
Date: Mon, 14 Aug 2017 08:30:06 -0400
Subject: [PATCH 082/149] ext4: add missing xattr hash update

When updating an extended attribute, if the padded value sizes are the
same, a shortcut is taken to avoid the bulk of the work. This was fine
until the xattr hash update was moved inside ext4_xattr_set_entry().
With that change, the hash update got missed in the shortcut case.

Thanks to ZhangYi (yizhang089@gmail.com) for root causing the problem.

Fixes: daf8328172df ("ext4: eliminate xattr entry e_hash recalculation for removes")

Reported-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Tahsin Erdogan <tahsin@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/xattr.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 82a5af9f6668..3dd970168448 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1543,7 +1543,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 			/* Clear padding bytes. */
 			memset(val + i->value_len, 0, new_size - i->value_len);
 		}
-		return 0;
+		goto update_hash;
 	}
 
 	/* Compute min_offs and last. */
@@ -1707,6 +1707,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 		here->e_value_size = cpu_to_le32(i->value_len);
 	}
 
+update_hash:
 	if (i->value) {
 		__le32 hash = 0;
 
@@ -1725,7 +1726,8 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 						     here->e_name_len,
 						     &crc32c_hash, 1);
 		} else if (is_block) {
-			__le32 *value = s->base + min_offs - new_size;
+			__le32 *value = s->base + le16_to_cpu(
+							here->e_value_offs);
 
 			hash = ext4_xattr_hash_entry(here->e_name,
 						     here->e_name_len, value,

From 90b05b0598c698252d5f7dcdce335c5a546e9047 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Wed, 2 Aug 2017 13:47:42 +0900
Subject: [PATCH 083/149] gpio: reject invalid gpio before getting gpio_desc

Check user-given gpio number and reject it before
calling gpio_to_desc() because gpio_to_desc() is
for kernel driver and it expects given gpio number
is valid (means 0 to 511).
If given number is invalid, gpio_to_desc() calls
WARN() and dump registers and stack for debug.
This means user can easily kick WARN() just by
writing invalid gpio number (e.g. 512) to
/sys/class/gpio/export.

Fixes: 0e9a5edf5d01 ("gpio: fix deferred probe detection for legacy API")
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib-sysfs.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/gpio/gpiolib-sysfs.c b/drivers/gpio/gpiolib-sysfs.c
index 16fe9742597b..fc80add5fedb 100644
--- a/drivers/gpio/gpiolib-sysfs.c
+++ b/drivers/gpio/gpiolib-sysfs.c
@@ -2,6 +2,7 @@
 #include <linux/mutex.h>
 #include <linux/device.h>
 #include <linux/sysfs.h>
+#include <linux/gpio.h>
 #include <linux/gpio/consumer.h>
 #include <linux/gpio/driver.h>
 #include <linux/interrupt.h>
@@ -432,6 +433,11 @@ static struct attribute *gpiochip_attrs[] = {
 };
 ATTRIBUTE_GROUPS(gpiochip);
 
+static struct gpio_desc *gpio_to_valid_desc(int gpio)
+{
+	return gpio_is_valid(gpio) ? gpio_to_desc(gpio) : NULL;
+}
+
 /*
  * /sys/class/gpio/export ... write-only
  *	integer N ... number of GPIO to export (full access)
@@ -450,7 +456,7 @@ static ssize_t export_store(struct class *class,
 	if (status < 0)
 		goto done;
 
-	desc = gpio_to_desc(gpio);
+	desc = gpio_to_valid_desc(gpio);
 	/* reject invalid GPIOs */
 	if (!desc) {
 		pr_warn("%s: invalid GPIO %ld\n", __func__, gpio);
@@ -493,7 +499,7 @@ static ssize_t unexport_store(struct class *class,
 	if (status < 0)
 		goto done;
 
-	desc = gpio_to_desc(gpio);
+	desc = gpio_to_valid_desc(gpio);
 	/* reject bogus commands (gpio_unexport ignores them) */
 	if (!desc) {
 		pr_warn("%s: invalid GPIO %ld\n", __func__, gpio);

From 3f13b6a24f192da3096389b82202f16eff2c11ee Mon Sep 17 00:00:00 2001
From: Gregory CLEMENT <gregory.clement@free-electrons.com>
Date: Wed, 12 Jul 2017 13:22:29 +0200
Subject: [PATCH 084/149] gpio: mvebu: Fix cause computation in irq handler

When switching to regmap, the way to compute the irq cause was
reorganized. However while doing it, a typo was introduced: a 'xor'
replaced a 'and'.

This lead to wrong behavior in the interrupt handler ans one of the
symptom was wrong irq handler called on the Armada 388 GP:
"->handle_irq():  c016303c,
handle_bad_irq+0x0/0x278
->irq_data.chip(): c0b0ec0c,
0xc0b0ec0c
->action():   (null)
   IRQ_NOPROBE set
 IRQ_NOREQUEST set
unexpected IRQ trap at vector 00
irq 0, desc: ee804800, depth: 1, count: 0, unhandled: 0"

Fixes: 2233bf7a92e7 ("gpio: mvebu: switch to regmap for register access")
Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpio-mvebu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpio/gpio-mvebu.c b/drivers/gpio/gpio-mvebu.c
index e338c3743562..45c65f805fd6 100644
--- a/drivers/gpio/gpio-mvebu.c
+++ b/drivers/gpio/gpio-mvebu.c
@@ -557,7 +557,7 @@ static void mvebu_gpio_irq_handler(struct irq_desc *desc)
 	edge_cause = mvebu_gpio_read_edge_cause(mvchip);
 	edge_mask  = mvebu_gpio_read_edge_mask(mvchip);
 
-	cause = (data_in ^ level_mask) | (edge_cause & edge_mask);
+	cause = (data_in & level_mask) | (edge_cause & edge_mask);
 
 	for (i = 0; i < mvchip->chip.ngpio; i++) {
 		int irq;

From 79db795833bf5c3e798bcd7a5aeeee3fb0505927 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 15 Aug 2017 21:32:19 -0700
Subject: [PATCH 085/149] sparc64: Don't clibber fixed registers in __multi4.

%g4 and %g5 are fixed registers used by the kernel for the thread
pointer and the per-cpu offset.  Use %o4 and %g7 instead.

Diagnosis by Anthony Yznaga.

Fixes: 1b4af13ff2cc ("sparc64: Add __multi3 for gcc 7.x and later.")
Reported-by: Anatoly Pugachev <matorola@gmail.com>
Tested-by: Anatoly Pugachev <matorola@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/lib/multi3.S | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/arch/sparc/lib/multi3.S b/arch/sparc/lib/multi3.S
index d6b6c97fe3c7..703127aaf4a5 100644
--- a/arch/sparc/lib/multi3.S
+++ b/arch/sparc/lib/multi3.S
@@ -5,26 +5,26 @@
 	.align	4
 ENTRY(__multi3) /* %o0 = u, %o1 = v */
 	mov	%o1, %g1
-	srl	%o3, 0, %g4
-	mulx	%g4, %g1, %o1
+	srl	%o3, 0, %o4
+	mulx	%o4, %g1, %o1
 	srlx	%g1, 0x20, %g3
-	mulx	%g3, %g4, %g5
-	sllx	%g5, 0x20, %o5
-	srl	%g1, 0, %g4
+	mulx	%g3, %o4, %g7
+	sllx	%g7, 0x20, %o5
+	srl	%g1, 0, %o4
 	sub	%o1, %o5, %o5
 	srlx	%o5, 0x20, %o5
-	addcc	%g5, %o5, %g5
+	addcc	%g7, %o5, %g7
 	srlx	%o3, 0x20, %o5
-	mulx	%g4, %o5, %g4
+	mulx	%o4, %o5, %o4
 	mulx	%g3, %o5, %o5
 	sethi	%hi(0x80000000), %g3
-	addcc	%g5, %g4, %g5
-	srlx	%g5, 0x20, %g5
+	addcc	%g7, %o4, %g7
+	srlx	%g7, 0x20, %g7
 	add	%g3, %g3, %g3
 	movcc	%xcc, %g0, %g3
-	addcc	%o5, %g5, %o5
-	sllx	%g4, 0x20, %g4
-	add	%o1, %g4, %o1
+	addcc	%o5, %g7, %o5
+	sllx	%o4, 0x20, %o4
+	add	%o1, %o4, %o1
 	add	%o5, %g3, %g2
 	mulx	%g1, %o2, %g1
 	add	%g1, %g2, %g1

From c7b725be84985532161bcb4fbecd056326998a77 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 15 Aug 2017 18:38:42 -0700
Subject: [PATCH 086/149] net: igmp: Use ingress interface rather than vrf
 device

Anuradha reported that statically added groups for interfaces enslaved
to a VRF device were not persisting. The problem is that igmp queries
and reports need to use the data in the in_dev for the real ingress
device rather than the VRF device. Update igmp_rcv accordingly.

Fixes: e58e41596811 ("net: Enable support for VRF with ipv4 multicast")
Reported-by: Anuradha Karuppiah <anuradhak@cumulusnetworks.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/igmp.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 498706b072fb..caf2f1101d02 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1007,10 +1007,18 @@ int igmp_rcv(struct sk_buff *skb)
 {
 	/* This basically follows the spec line by line -- see RFC1112 */
 	struct igmphdr *ih;
-	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+	struct net_device *dev = skb->dev;
+	struct in_device *in_dev;
 	int len = skb->len;
 	bool dropped = true;
 
+	if (netif_is_l3_master(dev)) {
+		dev = dev_get_by_index_rcu(dev_net(dev), IPCB(skb)->iif);
+		if (!dev)
+			goto drop;
+	}
+
+	in_dev = __in_dev_get_rcu(dev);
 	if (!in_dev)
 		goto drop;
 

From 6170a506899aee3dd4934c928426505e47b1b466 Mon Sep 17 00:00:00 2001
From: Tushar Dave <tushar.n.dave@oracle.com>
Date: Wed, 16 Aug 2017 11:09:10 -0700
Subject: [PATCH 087/149] sparc64: remove unnecessary log message

There is no need to log message if ATU hvapi couldn't get register.
Unlike PCI hvapi, ATU hvapi registration failure is not hard error.
Even if ATU hvapi registration fails (on system with ATU or without
ATU) system continues with legacy IOMMU. So only log message when
ATU hvapi successfully get registered.

Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/kernel/pci_sun4v.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c
index f10e2f712394..9ebebf1fd93d 100644
--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -1266,8 +1266,6 @@ static int pci_sun4v_probe(struct platform_device *op)
 			 * ATU group, but ATU hcalls won't be available.
 			 */
 			hv_atu = false;
-			pr_err(PFX "Could not register hvapi ATU err=%d\n",
-			       err);
 		} else {
 			pr_info(PFX "Registered hvapi ATU major[%lu] minor[%lu]\n",
 				vatu_major, vatu_minor);

From 494bea39f3201776cdfddc232705f54a0bd210c4 Mon Sep 17 00:00:00 2001
From: Liping Zhang <zlpnobody@gmail.com>
Date: Wed, 16 Aug 2017 13:30:07 +0800
Subject: [PATCH 088/149] openvswitch: fix skb_panic due to the incorrect
 actions attrlen

For sw_flow_actions, the actions_len only represents the kernel part's
size, and when we dump the actions to the userspace, we will do the
convertions, so it's true size may become bigger than the actions_len.

But unfortunately, for OVS_PACKET_ATTR_ACTIONS, we use the actions_len
to alloc the skbuff, so the user_skb's size may become insufficient and
oops will happen like this:
  skbuff: skb_over_panic: text:ffffffff8148fabf len:1749 put:157 head:
  ffff881300f39000 data:ffff881300f39000 tail:0x6d5 end:0x6c0 dev:<NULL>
  ------------[ cut here ]------------
  kernel BUG at net/core/skbuff.c:129!
  [...]
  Call Trace:
   <IRQ>
   [<ffffffff8148be82>] skb_put+0x43/0x44
   [<ffffffff8148fabf>] skb_zerocopy+0x6c/0x1f4
   [<ffffffffa0290d36>] queue_userspace_packet+0x3a3/0x448 [openvswitch]
   [<ffffffffa0292023>] ovs_dp_upcall+0x30/0x5c [openvswitch]
   [<ffffffffa028d435>] output_userspace+0x132/0x158 [openvswitch]
   [<ffffffffa01e6890>] ? ip6_rcv_finish+0x74/0x77 [ipv6]
   [<ffffffffa028e277>] do_execute_actions+0xcc1/0xdc8 [openvswitch]
   [<ffffffffa028e3f2>] ovs_execute_actions+0x74/0x106 [openvswitch]
   [<ffffffffa0292130>] ovs_dp_process_packet+0xe1/0xfd [openvswitch]
   [<ffffffffa0292b77>] ? key_extract+0x63c/0x8d5 [openvswitch]
   [<ffffffffa029848b>] ovs_vport_receive+0xa1/0xc3 [openvswitch]
  [...]

Also we can find that the actions_len is much little than the orig_len:
  crash> struct sw_flow_actions 0xffff8812f539d000
  struct sw_flow_actions {
    rcu = {
      next = 0xffff8812f5398800,
      func = 0xffffe3b00035db32
    },
    orig_len = 1384,
    actions_len = 592,
    actions = 0xffff8812f539d01c
  }

So as a quick fix, use the orig_len instead of the actions_len to alloc
the user_skb.

Last, this oops happened on our system running a relative old kernel, but
the same risk still exists on the mainline, since we use the wrong
actions_len from the beginning.

Fixes: ccea74457bbd ("openvswitch: include datapath actions with sampled-packet upcall to userspace")
Cc: Neil McKee <neil.mckee@inmon.com>
Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/actions.c  | 1 +
 net/openvswitch/datapath.c | 7 ++++---
 net/openvswitch/datapath.h | 2 ++
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index e4610676299b..a54a556fcdb5 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1337,6 +1337,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
 		goto out;
 	}
 
+	OVS_CB(skb)->acts_origlen = acts->orig_len;
 	err = do_execute_actions(dp, skb, key,
 				 acts->actions, acts->actions_len);
 
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 45fe8c8a884d..6b44fe405282 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -381,7 +381,7 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
 }
 
 static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
-			      unsigned int hdrlen)
+			      unsigned int hdrlen, int actions_attrlen)
 {
 	size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
 		+ nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
@@ -398,7 +398,7 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
 
 	/* OVS_PACKET_ATTR_ACTIONS */
 	if (upcall_info->actions_len)
-		size += nla_total_size(upcall_info->actions_len);
+		size += nla_total_size(actions_attrlen);
 
 	/* OVS_PACKET_ATTR_MRU */
 	if (upcall_info->mru)
@@ -465,7 +465,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
 	else
 		hlen = skb->len;
 
-	len = upcall_msg_size(upcall_info, hlen - cutlen);
+	len = upcall_msg_size(upcall_info, hlen - cutlen,
+			      OVS_CB(skb)->acts_origlen);
 	user_skb = genlmsg_new(len, GFP_ATOMIC);
 	if (!user_skb) {
 		err = -ENOMEM;
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 5d8dcd88815f..480600649d0b 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -99,11 +99,13 @@ struct datapath {
  * when a packet is received by OVS.
  * @mru: The maximum received fragement size; 0 if the packet is not
  * fragmented.
+ * @acts_origlen: The netlink size of the flow actions applied to this skb.
  * @cutlen: The number of bytes from the packet end to be removed.
  */
 struct ovs_skb_cb {
 	struct vport		*input_vport;
 	u16			mru;
+	u16			acts_origlen;
 	u32			cutlen;
 };
 #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)

From 120e9dabaf551c6dc03d3a10a1f026376cb1811c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 16 Aug 2017 07:03:15 -0700
Subject: [PATCH 089/149] dccp: defer ccid_hc_tx_delete() at dismantle time

syszkaller team reported another problem in DCCP [1]

Problem here is that the structure holding RTO timer
(ccid2_hc_tx_rto_expire() handler) is freed too soon.

We can not use del_timer_sync() to cancel the timer
since this timer wants to grab socket lock (that would risk a dead lock)

Solution is to defer the freeing of memory when all references to
the socket were released. Socket timers do own a reference, so this
should fix the issue.

[1]

==================================================================
BUG: KASAN: use-after-free in ccid2_hc_tx_rto_expire+0x51c/0x5c0 net/dccp/ccids/ccid2.c:144
Read of size 4 at addr ffff8801d2660540 by task kworker/u4:7/3365

CPU: 1 PID: 3365 Comm: kworker/u4:7 Not tainted 4.13.0-rc4+ #3
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Workqueue: events_unbound call_usermodehelper_exec_work
Call Trace:
 <IRQ>
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:52
 print_address_description+0x73/0x250 mm/kasan/report.c:252
 kasan_report_error mm/kasan/report.c:351 [inline]
 kasan_report+0x24e/0x340 mm/kasan/report.c:409
 __asan_report_load4_noabort+0x14/0x20 mm/kasan/report.c:429
 ccid2_hc_tx_rto_expire+0x51c/0x5c0 net/dccp/ccids/ccid2.c:144
 call_timer_fn+0x233/0x830 kernel/time/timer.c:1268
 expire_timers kernel/time/timer.c:1307 [inline]
 __run_timers+0x7fd/0xb90 kernel/time/timer.c:1601
 run_timer_softirq+0x21/0x80 kernel/time/timer.c:1614
 __do_softirq+0x2f5/0xba3 kernel/softirq.c:284
 invoke_softirq kernel/softirq.c:364 [inline]
 irq_exit+0x1cc/0x200 kernel/softirq.c:405
 exiting_irq arch/x86/include/asm/apic.h:638 [inline]
 smp_apic_timer_interrupt+0x76/0xa0 arch/x86/kernel/apic/apic.c:1044
 apic_timer_interrupt+0x93/0xa0 arch/x86/entry/entry_64.S:702
RIP: 0010:arch_local_irq_enable arch/x86/include/asm/paravirt.h:824 [inline]
RIP: 0010:__raw_write_unlock_irq include/linux/rwlock_api_smp.h:267 [inline]
RIP: 0010:_raw_write_unlock_irq+0x56/0x70 kernel/locking/spinlock.c:343
RSP: 0018:ffff8801cd50eaa8 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff10
RAX: dffffc0000000000 RBX: ffffffff85a090c0 RCX: 0000000000000006
RDX: 1ffffffff0b595f3 RSI: 1ffff1003962f989 RDI: ffffffff85acaf98
RBP: ffff8801cd50eab0 R08: 0000000000000001 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801cc96ea60
R13: dffffc0000000000 R14: ffff8801cc96e4c0 R15: ffff8801cc96e4c0
 </IRQ>
 release_task+0xe9e/0x1a40 kernel/exit.c:220
 wait_task_zombie kernel/exit.c:1162 [inline]
 wait_consider_task+0x29b8/0x33c0 kernel/exit.c:1389
 do_wait_thread kernel/exit.c:1452 [inline]
 do_wait+0x441/0xa90 kernel/exit.c:1523
 kernel_wait4+0x1f5/0x370 kernel/exit.c:1665
 SYSC_wait4+0x134/0x140 kernel/exit.c:1677
 SyS_wait4+0x2c/0x40 kernel/exit.c:1673
 call_usermodehelper_exec_sync kernel/kmod.c:286 [inline]
 call_usermodehelper_exec_work+0x1a0/0x2c0 kernel/kmod.c:323
 process_one_work+0xbf3/0x1bc0 kernel/workqueue.c:2097
 worker_thread+0x223/0x1860 kernel/workqueue.c:2231
 kthread+0x35e/0x430 kernel/kthread.c:231
 ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:425

Allocated by task 21267:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551
 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:489
 kmem_cache_alloc+0x127/0x750 mm/slab.c:3561
 ccid_new+0x20e/0x390 net/dccp/ccid.c:151
 dccp_hdlr_ccid+0x27/0x140 net/dccp/feat.c:44
 __dccp_feat_activate+0x142/0x2a0 net/dccp/feat.c:344
 dccp_feat_activate_values+0x34e/0xa90 net/dccp/feat.c:1538
 dccp_rcv_request_sent_state_process net/dccp/input.c:472 [inline]
 dccp_rcv_state_process+0xed1/0x1620 net/dccp/input.c:677
 dccp_v4_do_rcv+0xeb/0x160 net/dccp/ipv4.c:679
 sk_backlog_rcv include/net/sock.h:911 [inline]
 __release_sock+0x124/0x360 net/core/sock.c:2269
 release_sock+0xa4/0x2a0 net/core/sock.c:2784
 inet_wait_for_connect net/ipv4/af_inet.c:557 [inline]
 __inet_stream_connect+0x671/0xf00 net/ipv4/af_inet.c:643
 inet_stream_connect+0x58/0xa0 net/ipv4/af_inet.c:682
 SYSC_connect+0x204/0x470 net/socket.c:1642
 SyS_connect+0x24/0x30 net/socket.c:1623
 entry_SYSCALL_64_fastpath+0x1f/0xbe

Freed by task 3049:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524
 __cache_free mm/slab.c:3503 [inline]
 kmem_cache_free+0x77/0x280 mm/slab.c:3763
 ccid_hc_tx_delete+0xc5/0x100 net/dccp/ccid.c:190
 dccp_destroy_sock+0x1d1/0x2b0 net/dccp/proto.c:225
 inet_csk_destroy_sock+0x166/0x3f0 net/ipv4/inet_connection_sock.c:833
 dccp_done+0xb7/0xd0 net/dccp/proto.c:145
 dccp_time_wait+0x13d/0x300 net/dccp/minisocks.c:72
 dccp_rcv_reset+0x1d1/0x5b0 net/dccp/input.c:160
 dccp_rcv_state_process+0x8fc/0x1620 net/dccp/input.c:663
 dccp_v4_do_rcv+0xeb/0x160 net/dccp/ipv4.c:679
 sk_backlog_rcv include/net/sock.h:911 [inline]
 __sk_receive_skb+0x33e/0xc00 net/core/sock.c:521
 dccp_v4_rcv+0xef1/0x1c00 net/dccp/ipv4.c:871
 ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216
 NF_HOOK include/linux/netfilter.h:248 [inline]
 ip_local_deliver+0x1ce/0x6d0 net/ipv4/ip_input.c:257
 dst_input include/net/dst.h:477 [inline]
 ip_rcv_finish+0x8db/0x19c0 net/ipv4/ip_input.c:397
 NF_HOOK include/linux/netfilter.h:248 [inline]
 ip_rcv+0xc3f/0x17d0 net/ipv4/ip_input.c:488
 __netif_receive_skb_core+0x19af/0x33d0 net/core/dev.c:4417
 __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4455
 process_backlog+0x203/0x740 net/core/dev.c:5130
 napi_poll net/core/dev.c:5527 [inline]
 net_rx_action+0x792/0x1910 net/core/dev.c:5593
 __do_softirq+0x2f5/0xba3 kernel/softirq.c:284

The buggy address belongs to the object at ffff8801d2660100
 which belongs to the cache ccid2_hc_tx_sock of size 1240
The buggy address is located 1088 bytes inside of
 1240-byte region [ffff8801d2660100, ffff8801d26605d8)
The buggy address belongs to the page:
page:ffffea0007499800 count:1 mapcount:0 mapping:ffff8801d2660100 index:0x0 compound_mapcount: 0
flags: 0x200000000008100(slab|head)
raw: 0200000000008100 ffff8801d2660100 0000000000000000 0000000100000005
raw: ffffea00075271a0 ffffea0007538820 ffff8801d3aef9c0 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff8801d2660400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff8801d2660480: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>ffff8801d2660500: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                                           ^
 ffff8801d2660580: fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc fc
 ffff8801d2660600: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
==================================================================

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/proto.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 86bc40ba6ba5..b68168fcc06a 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -24,6 +24,7 @@
 #include <net/checksum.h>
 
 #include <net/inet_sock.h>
+#include <net/inet_common.h>
 #include <net/sock.h>
 #include <net/xfrm.h>
 
@@ -170,6 +171,15 @@ const char *dccp_packet_name(const int type)
 
 EXPORT_SYMBOL_GPL(dccp_packet_name);
 
+static void dccp_sk_destruct(struct sock *sk)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
+	dp->dccps_hc_tx_ccid = NULL;
+	inet_sock_destruct(sk);
+}
+
 int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 {
 	struct dccp_sock *dp = dccp_sk(sk);
@@ -179,6 +189,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
 	icsk->icsk_syn_retries	= sysctl_dccp_request_retries;
 	sk->sk_state		= DCCP_CLOSED;
 	sk->sk_write_space	= dccp_write_space;
+	sk->sk_destruct		= dccp_sk_destruct;
 	icsk->icsk_sync_mss	= dccp_sync_mss;
 	dp->dccps_mss_cache	= 536;
 	dp->dccps_rate_last	= jiffies;
@@ -219,8 +230,7 @@ void dccp_destroy_sock(struct sock *sk)
 		dp->dccps_hc_rx_ackvec = NULL;
 	}
 	ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
-	ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
-	dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
+	dp->dccps_hc_rx_ccid = NULL;
 
 	/* clean up feature negotiation state */
 	dccp_feat_list_purge(&dp->dccps_featneg);

From 81fbfe8adaf38d4f5a98c19bebfd41c5d6acaee8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 16 Aug 2017 10:36:47 -0700
Subject: [PATCH 090/149] ptr_ring: use kmalloc_array()

As found by syzkaller, malicious users can set whatever tx_queue_len
on a tun device and eventually crash the kernel.

Lets remove the ALIGN(XXX, SMP_CACHE_BYTES) thing since a small
ring buffer is not fast anyway.

Fixes: 2e0ab8ca83c1 ("ptr_ring: array based FIFO for pointers")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ptr_ring.h  | 9 +++++----
 include/linux/skb_array.h | 3 ++-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index d8c97ec8a8e6..37b4bb2545b3 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -436,9 +436,9 @@ static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r,
 	__PTR_RING_PEEK_CALL_v; \
 })
 
-static inline void **__ptr_ring_init_queue_alloc(int size, gfp_t gfp)
+static inline void **__ptr_ring_init_queue_alloc(unsigned int size, gfp_t gfp)
 {
-	return kzalloc(ALIGN(size * sizeof(void *), SMP_CACHE_BYTES), gfp);
+	return kcalloc(size, sizeof(void *), gfp);
 }
 
 static inline void __ptr_ring_set_size(struct ptr_ring *r, int size)
@@ -582,7 +582,8 @@ static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
  * In particular if you consume ring in interrupt or BH context, you must
  * disable interrupts/BH when doing so.
  */
-static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, int nrings,
+static inline int ptr_ring_resize_multiple(struct ptr_ring **rings,
+					   unsigned int nrings,
 					   int size,
 					   gfp_t gfp, void (*destroy)(void *))
 {
@@ -590,7 +591,7 @@ static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, int nrings,
 	void ***queues;
 	int i;
 
-	queues = kmalloc(nrings * sizeof *queues, gfp);
+	queues = kmalloc_array(nrings, sizeof(*queues), gfp);
 	if (!queues)
 		goto noqueues;
 
diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h
index 35226cd4efb0..8621ffdeecbf 100644
--- a/include/linux/skb_array.h
+++ b/include/linux/skb_array.h
@@ -193,7 +193,8 @@ static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp)
 }
 
 static inline int skb_array_resize_multiple(struct skb_array **rings,
-					    int nrings, int size, gfp_t gfp)
+					    int nrings, unsigned int size,
+					    gfp_t gfp)
 {
 	BUILD_BUG_ON(offsetof(struct skb_array, ring));
 	return ptr_ring_resize_multiple((struct ptr_ring **)rings,

From c780a049f9bf442314335372c9abc4548bfe3e44 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 16 Aug 2017 11:09:12 -0700
Subject: [PATCH 091/149] ipv4: better IP_MAX_MTU enforcement

While working on yet another syzkaller report, I found
that our IP_MAX_MTU enforcements were not properly done.

gcc seems to reload dev->mtu for min(dev->mtu, IP_MAX_MTU), and
final result can be bigger than IP_MAX_MTU :/

This is a problem because device mtu can be changed on other cpus or
threads.

While this patch does not fix the issue I am working on, it is
probably worth addressing it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h | 4 ++--
 net/ipv4/route.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index 821cedcc8e73..0cf7f5a65fe6 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -352,7 +352,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
 	    !forwarding)
 		return dst_mtu(dst);
 
-	return min(dst->dev->mtu, IP_MAX_MTU);
+	return min(READ_ONCE(dst->dev->mtu), IP_MAX_MTU);
 }
 
 static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
@@ -364,7 +364,7 @@ static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
 		return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding);
 	}
 
-	return min(skb_dst(skb)->dev->mtu, IP_MAX_MTU);
+	return min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU);
 }
 
 u32 ip_idents_reserve(u32 hash, int segs);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 7effa62beed3..fe877a4a72b1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1267,7 +1267,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 	if (mtu)
 		return mtu;
 
-	mtu = dst->dev->mtu;
+	mtu = READ_ONCE(dst->dev->mtu);
 
 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
 		if (rt->rt_uses_gateway && mtu > 576)

From 61f0c3c7a0fd37f252b0f23c237973ddf4c21ea7 Mon Sep 17 00:00:00 2001
From: weiping zhang <zhangweiping@didichuxing.com>
Date: Tue, 8 Aug 2017 01:26:57 +0800
Subject: [PATCH 092/149] scsi: megaraid_sas: fix error handle in
 megasas_probe_one

megasas_mgmt_info.max_index has increased by 1 before megasas_io_attach,
if megasas_io_attach return error, then goto fail_io_attach,
megasas_mgmt_info.instance has a wrong index here. So first reduce
max_index and then set that instance to NULL.

Signed-off-by: weiping zhang <zhangweiping@didichuxing.com>
Acked-by: Sumit Saxena <sumit.saxena@broadcom.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/megaraid/megaraid_sas_base.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c
index 316c3df0c3fd..71c4746341ea 100644
--- a/drivers/scsi/megaraid/megaraid_sas_base.c
+++ b/drivers/scsi/megaraid/megaraid_sas_base.c
@@ -6228,8 +6228,8 @@ static int megasas_probe_one(struct pci_dev *pdev,
 fail_start_aen:
 fail_io_attach:
 	megasas_mgmt_info.count--;
-	megasas_mgmt_info.instance[megasas_mgmt_info.max_index] = NULL;
 	megasas_mgmt_info.max_index--;
+	megasas_mgmt_info.instance[megasas_mgmt_info.max_index] = NULL;
 
 	instance->instancet->disable_intr(instance);
 	megasas_destroy_irqs(instance);

From 82f0fd06d4a84ff32cc2cf9503764219316ec728 Mon Sep 17 00:00:00 2001
From: Varun Prakash <varun@chelsio.com>
Date: Mon, 31 Jul 2017 22:02:55 +0530
Subject: [PATCH 093/149] scsi: csiostor: fail probe if fw does not support
 FCoE

Fail probe if FCoE capability is not enabled in the firmware.

Signed-off-by: Varun Prakash <varun@chelsio.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/csiostor/csio_hw.c   |  4 +++-
 drivers/scsi/csiostor/csio_init.c | 12 ++++++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/csiostor/csio_hw.c b/drivers/scsi/csiostor/csio_hw.c
index 2029ad225121..5be0086142ca 100644
--- a/drivers/scsi/csiostor/csio_hw.c
+++ b/drivers/scsi/csiostor/csio_hw.c
@@ -3845,8 +3845,10 @@ csio_hw_start(struct csio_hw *hw)
 
 	if (csio_is_hw_ready(hw))
 		return 0;
-	else
+	else if (csio_match_state(hw, csio_hws_uninit))
 		return -EINVAL;
+	else
+		return -ENODEV;
 }
 
 int
diff --git a/drivers/scsi/csiostor/csio_init.c b/drivers/scsi/csiostor/csio_init.c
index ea0c31086cc6..dcd074169aa9 100644
--- a/drivers/scsi/csiostor/csio_init.c
+++ b/drivers/scsi/csiostor/csio_init.c
@@ -969,10 +969,14 @@ static int csio_probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	pci_set_drvdata(pdev, hw);
 
-	if (csio_hw_start(hw) != 0) {
-		dev_err(&pdev->dev,
-			"Failed to start FW, continuing in debug mode.\n");
-		return 0;
+	rv = csio_hw_start(hw);
+	if (rv) {
+		if (rv == -EINVAL) {
+			dev_err(&pdev->dev,
+				"Failed to start FW, continuing in debug mode.\n");
+			return 0;
+		}
+		goto err_lnode_exit;
 	}
 
 	sprintf(hw->fwrev_str, "%u.%u.%u.%u\n",

From c80267324938a5517fd31fa4bbd2d63c564401f9 Mon Sep 17 00:00:00 2001
From: Raghava Aditya Renukunta <RaghavaAditya.Renukunta@microsemi.com>
Date: Fri, 4 Aug 2017 03:51:41 -0700
Subject: [PATCH 094/149] scsi: aacraid: Fix out of bounds in aac_get_name_resp

We terminate the aac_get_name_resp on a byte that is outside the bounds
of the structure. Extend the return response by one byte to remove the
out of bounds reference.

Fixes: b836439faf04 ("aacraid: 4KB sector support")
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David Carroll <david.carroll@microsemi.com>
Signed-off-by: Raghava Aditya Renukunta <RaghavaAditya.Renukunta@microsemi.com>
Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/aacraid/aachba.c  | 9 +++++++--
 drivers/scsi/aacraid/aacraid.h | 2 +-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/scsi/aacraid/aachba.c b/drivers/scsi/aacraid/aachba.c
index 4591113c49de..a1a2c71e1626 100644
--- a/drivers/scsi/aacraid/aachba.c
+++ b/drivers/scsi/aacraid/aachba.c
@@ -549,7 +549,9 @@ static void get_container_name_callback(void *context, struct fib * fibptr)
 	if ((le32_to_cpu(get_name_reply->status) == CT_OK)
 	 && (get_name_reply->data[0] != '\0')) {
 		char *sp = get_name_reply->data;
-		sp[sizeof(((struct aac_get_name_resp *)NULL)->data)] = '\0';
+		int data_size = FIELD_SIZEOF(struct aac_get_name_resp, data);
+
+		sp[data_size - 1] = '\0';
 		while (*sp == ' ')
 			++sp;
 		if (*sp) {
@@ -579,12 +581,15 @@ static void get_container_name_callback(void *context, struct fib * fibptr)
 static int aac_get_container_name(struct scsi_cmnd * scsicmd)
 {
 	int status;
+	int data_size;
 	struct aac_get_name *dinfo;
 	struct fib * cmd_fibcontext;
 	struct aac_dev * dev;
 
 	dev = (struct aac_dev *)scsicmd->device->host->hostdata;
 
+	data_size = FIELD_SIZEOF(struct aac_get_name_resp, data);
+
 	cmd_fibcontext = aac_fib_alloc_tag(dev, scsicmd);
 
 	aac_fib_init(cmd_fibcontext);
@@ -593,7 +598,7 @@ static int aac_get_container_name(struct scsi_cmnd * scsicmd)
 	dinfo->command = cpu_to_le32(VM_ContainerConfig);
 	dinfo->type = cpu_to_le32(CT_READ_NAME);
 	dinfo->cid = cpu_to_le32(scmd_id(scsicmd));
-	dinfo->count = cpu_to_le32(sizeof(((struct aac_get_name_resp *)NULL)->data));
+	dinfo->count = cpu_to_le32(data_size - 1);
 
 	status = aac_fib_send(ContainerCommand,
 		  cmd_fibcontext,
diff --git a/drivers/scsi/aacraid/aacraid.h b/drivers/scsi/aacraid/aacraid.h
index d31a9bc2ba69..ee2667e20e42 100644
--- a/drivers/scsi/aacraid/aacraid.h
+++ b/drivers/scsi/aacraid/aacraid.h
@@ -2274,7 +2274,7 @@ struct aac_get_name_resp {
 	__le32		parm3;
 	__le32		parm4;
 	__le32		parm5;
-	u8		data[16];
+	u8		data[17];
 };
 
 #define CT_CID_TO_32BITS_UID 165

From 70e42fd02c46e2aa9ab07b766d418637e3a51de7 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Wed, 9 Aug 2017 12:00:22 +0900
Subject: [PATCH 095/149] scsi: sd_zbc: Write unlock zone from sd_uninit_cmnd()

Releasing a zone write lock only when the write commnand that acquired
the lock completes can cause deadlocks due to potential command
reordering if the lock owning request is requeued and not executed. This
problem exists only with the scsi-mq path as, unlike the legacy path,
requests are moved out of the dispatch queue before being prepared and
so before locking a zone for a write command.

Since sd_uninit_cmnd() is now always called when a request is requeued,
call sd_zbc_write_unlock_zone() from that function for write requests
that acquired a zone lock instead of from sd_done(). Acquisition of a zone
lock by a write command is indicated using the new command
flag SCMD_ZONE_WRITE_LOCK.

Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <Bart.VanAssche@wdc.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/sd.c        | 3 +++
 drivers/scsi/sd_zbc.c    | 9 +++++----
 include/scsi/scsi_cmnd.h | 1 +
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index bea36adeee17..e2647f2d4430 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1277,6 +1277,9 @@ static void sd_uninit_command(struct scsi_cmnd *SCpnt)
 {
 	struct request *rq = SCpnt->request;
 
+	if (SCpnt->flags & SCMD_ZONE_WRITE_LOCK)
+		sd_zbc_write_unlock_zone(SCpnt);
+
 	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
 		__free_page(rq->special_vec.bv_page);
 
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 96855df9f49d..8aa54779aac1 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -294,6 +294,9 @@ int sd_zbc_write_lock_zone(struct scsi_cmnd *cmd)
 	    test_and_set_bit(zno, sdkp->zones_wlock))
 		return BLKPREP_DEFER;
 
+	WARN_ON_ONCE(cmd->flags & SCMD_ZONE_WRITE_LOCK);
+	cmd->flags |= SCMD_ZONE_WRITE_LOCK;
+
 	return BLKPREP_OK;
 }
 
@@ -302,9 +305,10 @@ void sd_zbc_write_unlock_zone(struct scsi_cmnd *cmd)
 	struct request *rq = cmd->request;
 	struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
 
-	if (sdkp->zones_wlock) {
+	if (sdkp->zones_wlock && cmd->flags & SCMD_ZONE_WRITE_LOCK) {
 		unsigned int zno = sd_zbc_zone_no(sdkp, blk_rq_pos(rq));
 		WARN_ON_ONCE(!test_bit(zno, sdkp->zones_wlock));
+		cmd->flags &= ~SCMD_ZONE_WRITE_LOCK;
 		clear_bit_unlock(zno, sdkp->zones_wlock);
 		smp_mb__after_atomic();
 	}
@@ -335,9 +339,6 @@ void sd_zbc_complete(struct scsi_cmnd *cmd,
 	case REQ_OP_WRITE_ZEROES:
 	case REQ_OP_WRITE_SAME:
 
-		/* Unlock the zone */
-		sd_zbc_write_unlock_zone(cmd);
-
 		if (result &&
 		    sshdr->sense_key == ILLEGAL_REQUEST &&
 		    sshdr->asc == 0x21)
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index a1266d318c85..6af198d8120b 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -57,6 +57,7 @@ struct scsi_pointer {
 /* for scmd->flags */
 #define SCMD_TAGGED		(1 << 0)
 #define SCMD_UNCHECKED_ISA_DMA	(1 << 1)
+#define SCMD_ZONE_WRITE_LOCK	(1 << 2)
 
 struct scsi_cmnd {
 	struct scsi_request req;

From cbe7dfa26eee4819db7b5846181d56fd0cece0ea Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 13 Aug 2017 19:44:22 +0200
Subject: [PATCH 096/149] Revert "scsi: default to scsi-mq"

Defaulting to scsi-mq in 4.13-rc has shown various regressions
on setups that we didn't previously consider.  Fixes for them are
in progress, but too invasive to make it in this cycle.  So for
now revert the commit that defaults to blk-mq for SCSI.  For 4.14
we'll plan to try again with these fixes.

This reverts commit 5c279bd9e40624f4ab6e688671026d6005b066fa.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/Kconfig | 11 +++++++++++
 drivers/scsi/scsi.c  |  4 ++++
 2 files changed, 15 insertions(+)

diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index f4538d7a3016..d145e0d90227 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -47,6 +47,17 @@ config SCSI_NETLINK
 	default	n
 	depends on NET
 
+config SCSI_MQ_DEFAULT
+	bool "SCSI: use blk-mq I/O path by default"
+	depends on SCSI
+	---help---
+	  This option enables the new blk-mq based I/O path for SCSI
+	  devices by default.  With the option the scsi_mod.use_blk_mq
+	  module/boot option defaults to Y, without it to N, but it can
+	  still be overridden either way.
+
+	  If unsure say N.
+
 config SCSI_PROC_FS
 	bool "legacy /proc/scsi/ support"
 	depends on SCSI && PROC_FS
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 3d38c6d463b8..1bf274e3b2b6 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -800,7 +800,11 @@ MODULE_LICENSE("GPL");
 module_param(scsi_logging_level, int, S_IRUGO|S_IWUSR);
 MODULE_PARM_DESC(scsi_logging_level, "a bit mask of logging levels");
 
+#ifdef CONFIG_SCSI_MQ_DEFAULT
 bool scsi_use_blk_mq = true;
+#else
+bool scsi_use_blk_mq = false;
+#endif
 module_param_named(use_blk_mq, scsi_use_blk_mq, bool, S_IWUSR | S_IRUGO);
 
 static int __init init_scsi(void)

From 71eb2ac58aa52f7948deda4d9a75754b6898e5c9 Mon Sep 17 00:00:00 2001
From: Varun Prakash <varun@chelsio.com>
Date: Wed, 16 Aug 2017 19:40:32 +0530
Subject: [PATCH 097/149] scsi: cxgb4i: call neigh_event_send() to update MAC
 address

If nud_state is not valid then call neigh_event_send() to update MAC
address.

Signed-off-by: Varun Prakash <varun@chelsio.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/cxgbi/cxgb4i/cxgb4i.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
index a69a9ac836f5..1d02cf9fe06c 100644
--- a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
+++ b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
@@ -1635,6 +1635,9 @@ static int init_act_open(struct cxgbi_sock *csk)
 		goto rel_resource;
 	}
 
+	if (!(n->nud_state & NUD_VALID))
+		neigh_event_send(n, NULL);
+
 	csk->atid = cxgb4_alloc_atid(lldi->tids, csk);
 	if (csk->atid < 0) {
 		pr_err("%s, NO atid available.\n", ndev->name);

From ce0fa3e56ad20f04d8252353dcd24e924abdafca Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Wed, 16 Aug 2017 10:18:03 -0700
Subject: [PATCH 098/149] x86/mm, mm/hwpoison: Clear PRESENT bit for kernel 1:1
 mappings of poison pages

Speculative processor accesses may reference any memory that has a
valid page table entry.  While a speculative access won't generate
a machine check, it will log the error in a machine check bank. That
could cause escalation of a subsequent error since the overflow bit
will be then set in the machine check bank status register.

Code has to be double-plus-tricky to avoid mentioning the 1:1 virtual
address of the page we want to map out otherwise we may trigger the
very problem we are trying to avoid.  We use a non-canonical address
that passes through the usual Linux table walking code to get to the
same "pte".

Thanks to Dave Hansen for reviewing several iterations of this.

Also see:

  http://marc.info/?l=linux-mm&m=149860136413338&w=2

Signed-off-by: Tony Luck <tony.luck@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Elliott, Robert (Persistent Memory) <elliott@hpe.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/20170816171803.28342-1-tony.luck@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/page_64.h   |  4 +++
 arch/x86/kernel/cpu/mcheck/mce.c | 43 ++++++++++++++++++++++++++++++++
 include/linux/mm_inline.h        |  6 +++++
 mm/memory-failure.c              |  2 ++
 4 files changed, 55 insertions(+)

diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index b4a0d43248cf..b50df06ad251 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -51,6 +51,10 @@ static inline void clear_page(void *page)
 
 void copy_page(void *to, void *from);
 
+#ifdef CONFIG_X86_MCE
+#define arch_unmap_kpfn arch_unmap_kpfn
+#endif
+
 #endif	/* !__ASSEMBLY__ */
 
 #ifdef CONFIG_X86_VSYSCALL_EMULATION
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 6dde0497efc7..3b413065c613 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -51,6 +51,7 @@
 #include <asm/mce.h>
 #include <asm/msr.h>
 #include <asm/reboot.h>
+#include <asm/set_memory.h>
 
 #include "mce-internal.h"
 
@@ -1051,6 +1052,48 @@ static int do_memory_failure(struct mce *m)
 	return ret;
 }
 
+#if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE)
+
+void arch_unmap_kpfn(unsigned long pfn)
+{
+	unsigned long decoy_addr;
+
+	/*
+	 * Unmap this page from the kernel 1:1 mappings to make sure
+	 * we don't log more errors because of speculative access to
+	 * the page.
+	 * We would like to just call:
+	 *	set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
+	 * but doing that would radically increase the odds of a
+	 * speculative access to the posion page because we'd have
+	 * the virtual address of the kernel 1:1 mapping sitting
+	 * around in registers.
+	 * Instead we get tricky.  We create a non-canonical address
+	 * that looks just like the one we want, but has bit 63 flipped.
+	 * This relies on set_memory_np() not checking whether we passed
+	 * a legal address.
+	 */
+
+/*
+ * Build time check to see if we have a spare virtual bit. Don't want
+ * to leave this until run time because most developers don't have a
+ * system that can exercise this code path. This will only become a
+ * problem if/when we move beyond 5-level page tables.
+ *
+ * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD)
+ */
+#if PGDIR_SHIFT + 9 < 63
+	decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
+#else
+#error "no unused virtual bit available"
+#endif
+
+	if (set_memory_np(decoy_addr, 1))
+		pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
+
+}
+#endif
+
 /*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index e030a68ead7e..25438b2b6f22 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -126,4 +126,10 @@ static __always_inline enum lru_list page_lru(struct page *page)
 
 #define lru_to_page(head) (list_entry((head)->prev, struct page, lru))
 
+#ifdef arch_unmap_kpfn
+extern void arch_unmap_kpfn(unsigned long pfn);
+#else
+static __always_inline void arch_unmap_kpfn(unsigned long pfn) { }
+#endif
+
 #endif
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 1cd3b3569af8..88366626c0b7 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1146,6 +1146,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 		return 0;
 	}
 
+	arch_unmap_kpfn(pfn);
+
 	orig_head = hpage = compound_head(p);
 	num_poisoned_pages_inc();
 

From 014cd0a368dc6351c65d51e4ee34f8573a4a1543 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 17 Aug 2017 20:30:39 +1000
Subject: [PATCH 099/149] bpf: Update sysctl documentation to list all
 supported architectures

The sysctl documentation states that the JIT is only available on
x86_64, which is no longer correct.

Update the list, and break it out to indicate which architectures
support the cBPF JIT (via HAVE_CBPF_JIT) or the eBPF JIT
(HAVE_EBPF_JIT).

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/sysctl/net.txt | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index 14db18c970b1..b9c3c6078010 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -36,8 +36,23 @@ bpf_jit_enable
 --------------
 
 This enables Berkeley Packet Filter Just in Time compiler.
-Currently supported on x86_64 architecture, bpf_jit provides a framework
-to speed packet filtering, the one used by tcpdump/libpcap for example.
+
+There are two flavors of JIT, the new eBPF JIT supported on:
+  - x86_64
+  - arm64
+  - ppc64
+  - sparc64
+  - mips64
+
+And the older cBPF JIT supported on:
+  - arm
+  - mips
+  - ppc
+  - sparc
+
+The BPF JIT provides a framework to speed packet filtering, the one used by
+tcpdump/libpcap for example.
+
 Values :
 	0 - disable the JIT (default value)
 	1 - enable the JIT

From 98529b9272e06a7767034fb8a32e43cdecda240a Mon Sep 17 00:00:00 2001
From: Lv Zheng <lv.zheng@intel.com>
Date: Wed, 16 Aug 2017 15:29:49 +0800
Subject: [PATCH 100/149] ACPI: EC: Fix regression related to wrong ECDT
 initialization order

Commit 2a5708409e4e (ACPI / EC: Fix a gap that ECDT EC cannot handle
EC events) introduced acpi_ec_ecdt_start(), but that function is
invoked before acpi_ec_query_init(), which is too early.  This causes
the kernel to crash if an EC event occurs after boot, when ec_query_wq
is not valid:

 BUG: unable to handle kernel NULL pointer dereference at 0000000000000102
 ...
 Workqueue: events acpi_ec_event_handler
 task: ffff9f539790dac0 task.stack: ffffb437c0e10000
 RIP: 0010:__queue_work+0x32/0x430

Normally, the DSDT EC should always be valid, so acpi_ec_ecdt_start()
is actually a no-op in the majority of cases.  However, commit
c712bb58d827 (ACPI / EC: Add support to skip boot stage DSDT probe)
caused the probing of the DSDT EC as the "boot EC" to be skipped when
the ECDT EC is valid and uncovered the bug.

Fix this issue by invoking acpi_ec_ecdt_start() after acpi_ec_query_init()
in acpi_ec_init().

Link: https://jira01.devtools.intel.com/browse/LCK-4348
Fixes: 2a5708409e4e (ACPI / EC: Fix a gap that ECDT EC cannot handle EC events)
Fixes: c712bb58d827 (ACPI / EC: Add support to skip boot stage DSDT probe)
Reported-by: Wang Wendy <wendy.wang@intel.com>
Tested-by: Feng Chenzhou <chenzhoux.feng@intel.com>
Signed-off-by: Lv Zheng <lv.zheng@intel.com>
[ rjw: Changelog ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/ec.c       | 17 +++++++----------
 drivers/acpi/internal.h |  1 -
 drivers/acpi/scan.c     |  1 -
 3 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c
index 62068a5e814f..ae3d6d152633 100644
--- a/drivers/acpi/ec.c
+++ b/drivers/acpi/ec.c
@@ -1741,7 +1741,7 @@ int __init acpi_ec_dsdt_probe(void)
  * functioning ECDT EC first in order to handle the events.
  * https://bugzilla.kernel.org/show_bug.cgi?id=115021
  */
-int __init acpi_ec_ecdt_start(void)
+static int __init acpi_ec_ecdt_start(void)
 {
 	acpi_handle handle;
 
@@ -2003,20 +2003,17 @@ static inline void acpi_ec_query_exit(void)
 int __init acpi_ec_init(void)
 {
 	int result;
+	int ecdt_fail, dsdt_fail;
 
 	/* register workqueue for _Qxx evaluations */
 	result = acpi_ec_query_init();
 	if (result)
-		goto err_exit;
-	/* Now register the driver for the EC */
-	result = acpi_bus_register_driver(&acpi_ec_driver);
-	if (result)
-		goto err_exit;
+		return result;
 
-err_exit:
-	if (result)
-		acpi_ec_query_exit();
-	return result;
+	/* Drivers must be started after acpi_ec_query_init() */
+	ecdt_fail = acpi_ec_ecdt_start();
+	dsdt_fail = acpi_bus_register_driver(&acpi_ec_driver);
+	return ecdt_fail && dsdt_fail ? -ENODEV : 0;
 }
 
 /* EC driver currently not unloadable */
diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h
index 58dd7ab3c653..3f5af4d7a739 100644
--- a/drivers/acpi/internal.h
+++ b/drivers/acpi/internal.h
@@ -185,7 +185,6 @@ typedef int (*acpi_ec_query_func) (void *data);
 int acpi_ec_init(void);
 int acpi_ec_ecdt_probe(void);
 int acpi_ec_dsdt_probe(void);
-int acpi_ec_ecdt_start(void);
 void acpi_ec_block_transactions(void);
 void acpi_ec_unblock_transactions(void);
 int acpi_ec_add_query_handler(struct acpi_ec *ec, u8 query_bit,
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 33897298f03e..70fd5502c284 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -2084,7 +2084,6 @@ int __init acpi_scan_init(void)
 
 	acpi_gpe_apply_masked_gpes();
 	acpi_update_all_gpes();
-	acpi_ec_ecdt_start();
 
 	acpi_scan_initialized = true;
 

From acc8b31665b4cc17b35c4fa445427f7e2f6dc86b Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Fri, 18 Aug 2017 10:10:43 +0200
Subject: [PATCH 101/149] net: sched: fix p_filter_chain check in
 tcf_chain_flush

The dereference before check is wrong and leads to an oops when
p_filter_chain is NULL. The check needs to be done on the pointer to
prevent NULL dereference.

Fixes: f93e1cdcf42c ("net/sched: fix filter flushing")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 39da0c5801c9..9fd44c221347 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -205,7 +205,7 @@ static void tcf_chain_flush(struct tcf_chain *chain)
 {
 	struct tcf_proto *tp;
 
-	if (*chain->p_filter_chain)
+	if (chain->p_filter_chain)
 		RCU_INIT_POINTER(*chain->p_filter_chain, NULL);
 	while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) {
 		RCU_INIT_POINTER(chain->filter_chain, tp->next);

From eac2c68d663effb077210218788952b5a0c1f60e Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Fri, 18 Aug 2017 12:11:50 +0100
Subject: [PATCH 102/149] nfp: fix infinite loop on umapping cleanup

The while loop that performs the dma page unmapping never decrements
index counter f and hence loops forever. Fix this with a pre-decrement
on f.

Detected by CoverityScan, CID#1357309 ("Infinite loop")

Fixes: 4c3523623dc0 ("net: add driver for Netronome NFP4000/NFP6000 NIC VFs")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 4631ca8b8eb2..9f77ce038a4a 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -908,8 +908,7 @@ static int nfp_net_tx(struct sk_buff *skb, struct net_device *netdev)
 	return NETDEV_TX_OK;
 
 err_unmap:
-	--f;
-	while (f >= 0) {
+	while (--f >= 0) {
 		frag = &skb_shinfo(skb)->frags[f];
 		dma_unmap_page(dp->dev, tx_ring->txbufs[wr_idx].dma_addr,
 			       skb_frag_size(frag), DMA_TO_DEVICE);

From a120d9ab65354727559b9db75ded8071b7ef19e2 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Fri, 18 Aug 2017 14:12:06 +0100
Subject: [PATCH 103/149] netxen: fix incorrect loop counter decrement

The loop counter k is currently being decremented from zero which
is incorrect. Fix this by incrementing k instead

Detected by CoverityScan, CID#401847 ("Infinite loop")

Fixes: 83f18a557c6d ("netxen_nic: fw dump support")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c
index 66ff15d08bad..0a66389c06c2 100644
--- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c
+++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_hw.c
@@ -2311,7 +2311,7 @@ netxen_md_rdqueue(struct netxen_adapter *adapter,
 				 loop_cnt++) {
 		NX_WR_DUMP_REG(select_addr, adapter->ahw.pci_base0, queue_id);
 		read_addr = queueEntry->read_addr;
-		for (k = 0; k < read_cnt; k--) {
+		for (k = 0; k < read_cnt; k++) {
 			NX_RD_DUMP_REG(read_addr, adapter->ahw.pci_base0,
 							&read_value);
 			*data_buff++ = read_value;

From 2110ba58303f0c2a03360c5f81fbe67ed312e7b9 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 18 Aug 2017 17:11:06 +0200
Subject: [PATCH 104/149] bpf, doc: improve sysctl knob description

Current context speaking of tcpdump filters is out of date these
days, so lets improve the sysctl description for the BPF knobs
a bit.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/sysctl/net.txt | 37 ++++++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index b9c3c6078010..d7c2b88b92ae 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -35,23 +35,32 @@ Table : Subdirectories in /proc/sys/net
 bpf_jit_enable
 --------------
 
-This enables Berkeley Packet Filter Just in Time compiler.
-
-There are two flavors of JIT, the new eBPF JIT supported on:
+This enables the BPF Just in Time (JIT) compiler. BPF is a flexible
+and efficient infrastructure allowing to execute bytecode at various
+hook points. It is used in a number of Linux kernel subsystems such
+as networking (e.g. XDP, tc), tracing (e.g. kprobes, uprobes, tracepoints)
+and security (e.g. seccomp). LLVM has a BPF back end that can compile
+restricted C into a sequence of BPF instructions. After program load
+through bpf(2) and passing a verifier in the kernel, a JIT will then
+translate these BPF proglets into native CPU instructions. There are
+two flavors of JITs, the newer eBPF JIT currently supported on:
   - x86_64
   - arm64
   - ppc64
   - sparc64
   - mips64
 
-And the older cBPF JIT supported on:
+And the older cBPF JIT supported on the following archs:
   - arm
   - mips
   - ppc
   - sparc
 
-The BPF JIT provides a framework to speed packet filtering, the one used by
-tcpdump/libpcap for example.
+eBPF JITs are a superset of cBPF JITs, meaning the kernel will
+migrate cBPF instructions into eBPF instructions and then JIT
+compile them transparently. Older cBPF JITs can only translate
+tcpdump filters, seccomp rules, etc, but not mentioned eBPF
+programs loaded through bpf(2).
 
 Values :
 	0 - disable the JIT (default value)
@@ -61,9 +70,9 @@ Values :
 bpf_jit_harden
 --------------
 
-This enables hardening for the Berkeley Packet Filter Just in Time compiler.
-Supported are eBPF JIT backends. Enabling hardening trades off performance,
-but can mitigate JIT spraying.
+This enables hardening for the BPF JIT compiler. Supported are eBPF
+JIT backends. Enabling hardening trades off performance, but can
+mitigate JIT spraying.
 Values :
 	0 - disable JIT hardening (default value)
 	1 - enable JIT hardening for unprivileged users only
@@ -72,11 +81,11 @@ Values :
 bpf_jit_kallsyms
 ----------------
 
-When Berkeley Packet Filter Just in Time compiler is enabled, then compiled
-images are unknown addresses to the kernel, meaning they neither show up in
-traces nor in /proc/kallsyms. This enables export of these addresses, which
-can be used for debugging/tracing. If bpf_jit_harden is enabled, this feature
-is disabled.
+When BPF JIT compiler is enabled, then compiled images are unknown
+addresses to the kernel, meaning they neither show up in traces nor
+in /proc/kallsyms. This enables export of these addresses, which can
+be used for debugging/tracing. If bpf_jit_harden is enabled, this
+feature is disabled.
 Values :
 	0 - disable JIT kallsyms export (default value)
 	1 - enable JIT kallsyms export for privileged users only

From a0917e0bc6efc05834c0c1eafebd579a9c75e6e9 Mon Sep 17 00:00:00 2001
From: Matthew Dawson <matthew@mjdsystems.ca>
Date: Fri, 18 Aug 2017 15:04:54 -0400
Subject: [PATCH 105/149] datagram: When peeking datagrams with offset < 0
 don't skip empty skbs

Due to commit e6afc8ace6dd5cef5e812f26c72579da8806f5ac ("udp: remove
headers from UDP packets before queueing"), when udp packets are being
peeked the requested extra offset is always 0 as there is no need to skip
the udp header.  However, when the offset is 0 and the next skb is
of length 0, it is only returned once.  The behaviour can be seen with
the following python script:

from socket import *;
f=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0);
g=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0);
f.bind(('::', 0));
addr=('::1', f.getsockname()[1]);
g.sendto(b'', addr)
g.sendto(b'b', addr)
print(f.recvfrom(10, MSG_PEEK));
print(f.recvfrom(10, MSG_PEEK));

Where the expected output should be the empty string twice.

Instead, make sk_peek_offset return negative values, and pass those values
to __skb_try_recv_datagram/__skb_try_recv_from_queue.  If the passed offset
to __skb_try_recv_from_queue is negative, the checked skb is never skipped.
__skb_try_recv_from_queue will then ensure the offset is reset back to 0
if a peek is requested without an offset, unless no packets are found.

Also simplify the if condition in __skb_try_recv_from_queue.  If _off is
greater then 0, and off is greater then or equal to skb->len, then
(_off || skb->len) must always be true assuming skb->len >= 0 is always
true.

Also remove a redundant check around a call to sk_peek_offset in af_unix.c,
as it double checked if MSG_PEEK was set in the flags.

V2:
 - Moved the negative fixup into __skb_try_recv_from_queue, and remove now
redundant checks
 - Fix peeking in udp{,v6}_recvmsg to report the right value when the
offset is 0

V3:
 - Marked new branch in __skb_try_recv_from_queue as unlikely.

Signed-off-by: Matthew Dawson <matthew@mjdsystems.ca>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h  |  4 +---
 net/core/datagram.c | 12 +++++++++---
 net/ipv4/udp.c      |  3 ++-
 net/ipv6/udp.c      |  3 ++-
 net/unix/af_unix.c  |  5 +----
 5 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 7c0632c7e870..aeeec62992ca 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -507,9 +507,7 @@ int sk_set_peek_off(struct sock *sk, int val);
 static inline int sk_peek_offset(struct sock *sk, int flags)
 {
 	if (unlikely(flags & MSG_PEEK)) {
-		s32 off = READ_ONCE(sk->sk_peek_off);
-		if (off >= 0)
-			return off;
+		return READ_ONCE(sk->sk_peek_off);
 	}
 
 	return 0;
diff --git a/net/core/datagram.c b/net/core/datagram.c
index ee5647bd91b3..a21ca8dee5ea 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -169,14 +169,20 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
 					  int *peeked, int *off, int *err,
 					  struct sk_buff **last)
 {
+	bool peek_at_off = false;
 	struct sk_buff *skb;
-	int _off = *off;
+	int _off = 0;
+
+	if (unlikely(flags & MSG_PEEK && *off >= 0)) {
+		peek_at_off = true;
+		_off = *off;
+	}
 
 	*last = queue->prev;
 	skb_queue_walk(queue, skb) {
 		if (flags & MSG_PEEK) {
-			if (_off >= skb->len && (skb->len || _off ||
-						 skb->peeked)) {
+			if (peek_at_off && _off >= skb->len &&
+			    (_off || skb->peeked)) {
 				_off -= skb->len;
 				continue;
 			}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index a7c804f73990..cd1d044a7fa5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1574,7 +1574,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
 		return ip_recv_error(sk, msg, len, addr_len);
 
 try_again:
-	peeking = off = sk_peek_offset(sk, flags);
+	peeking = flags & MSG_PEEK;
+	off = sk_peek_offset(sk, flags);
 	skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
 	if (!skb)
 		return err;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 578142b7ca3e..20039c8501eb 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -362,7 +362,8 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
 
 try_again:
-	peeking = off = sk_peek_offset(sk, flags);
+	peeking = flags & MSG_PEEK;
+	off = sk_peek_offset(sk, flags);
 	skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
 	if (!skb)
 		return err;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 7b52a380d710..be8982b4f8c0 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2304,10 +2304,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
 	 */
 	mutex_lock(&u->iolock);
 
-	if (flags & MSG_PEEK)
-		skip = sk_peek_offset(sk, flags);
-	else
-		skip = 0;
+	skip = max(sk_peek_offset(sk, flags), 0);
 
 	do {
 		int chunk;

From ff244c6b29b176f3f448bc75e55df297225e1b3a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 18 Aug 2017 13:39:56 -0700
Subject: [PATCH 106/149] tun: handle register_netdevice() failures properly

syzkaller reported a double free [1], caused by the fact
that tun driver was not updated properly when priv_destructor
was added.

When/if register_netdevice() fails, priv_destructor() must have been
called already.

[1]
BUG: KASAN: double-free or invalid-free in selinux_tun_dev_free_security+0x15/0x20 security/selinux/hooks.c:5023

CPU: 0 PID: 2919 Comm: syzkaller227220 Not tainted 4.13.0-rc4+ #23
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:52
 print_address_description+0x7f/0x260 mm/kasan/report.c:252
 kasan_report_double_free+0x55/0x80 mm/kasan/report.c:333
 kasan_slab_free+0xa0/0xc0 mm/kasan/kasan.c:514
 __cache_free mm/slab.c:3503 [inline]
 kfree+0xd3/0x260 mm/slab.c:3820
 selinux_tun_dev_free_security+0x15/0x20 security/selinux/hooks.c:5023
 security_tun_dev_free_security+0x48/0x80 security/security.c:1512
 tun_set_iff drivers/net/tun.c:1884 [inline]
 __tun_chr_ioctl+0x2ce6/0x3d50 drivers/net/tun.c:2064
 tun_chr_ioctl+0x2a/0x40 drivers/net/tun.c:2309
 vfs_ioctl fs/ioctl.c:45 [inline]
 do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685
 SYSC_ioctl fs/ioctl.c:700 [inline]
 SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691
 entry_SYSCALL_64_fastpath+0x1f/0xbe
RIP: 0033:0x443ff9
RSP: 002b:00007ffc34271f68 EFLAGS: 00000217 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 00000000004002e0 RCX: 0000000000443ff9
RDX: 0000000020533000 RSI: 00000000400454ca RDI: 0000000000000003
RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000217 R12: 0000000000401ce0
R13: 0000000000401d70 R14: 0000000000000000 R15: 0000000000000000

Allocated by task 2919:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_kmalloc+0xaa/0xd0 mm/kasan/kasan.c:551
 kmem_cache_alloc_trace+0x101/0x6f0 mm/slab.c:3627
 kmalloc include/linux/slab.h:493 [inline]
 kzalloc include/linux/slab.h:666 [inline]
 selinux_tun_dev_alloc_security+0x49/0x170 security/selinux/hooks.c:5012
 security_tun_dev_alloc_security+0x6d/0xa0 security/security.c:1506
 tun_set_iff drivers/net/tun.c:1839 [inline]
 __tun_chr_ioctl+0x1730/0x3d50 drivers/net/tun.c:2064
 tun_chr_ioctl+0x2a/0x40 drivers/net/tun.c:2309
 vfs_ioctl fs/ioctl.c:45 [inline]
 do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685
 SYSC_ioctl fs/ioctl.c:700 [inline]
 SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691
 entry_SYSCALL_64_fastpath+0x1f/0xbe

Freed by task 2919:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_slab_free+0x6e/0xc0 mm/kasan/kasan.c:524
 __cache_free mm/slab.c:3503 [inline]
 kfree+0xd3/0x260 mm/slab.c:3820
 selinux_tun_dev_free_security+0x15/0x20 security/selinux/hooks.c:5023
 security_tun_dev_free_security+0x48/0x80 security/security.c:1512
 tun_free_netdev+0x13b/0x1b0 drivers/net/tun.c:1563
 register_netdevice+0x8d0/0xee0 net/core/dev.c:7605
 tun_set_iff drivers/net/tun.c:1859 [inline]
 __tun_chr_ioctl+0x1caf/0x3d50 drivers/net/tun.c:2064
 tun_chr_ioctl+0x2a/0x40 drivers/net/tun.c:2309
 vfs_ioctl fs/ioctl.c:45 [inline]
 do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685
 SYSC_ioctl fs/ioctl.c:700 [inline]
 SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691
 entry_SYSCALL_64_fastpath+0x1f/0xbe

The buggy address belongs to the object at ffff8801d2843b40
 which belongs to the cache kmalloc-32 of size 32
The buggy address is located 0 bytes inside of
 32-byte region [ffff8801d2843b40, ffff8801d2843b60)
The buggy address belongs to the page:
page:ffffea000660cea8 count:1 mapcount:0 mapping:ffff8801d2843000 index:0xffff8801d2843fc1
flags: 0x200000000000100(slab)
raw: 0200000000000100 ffff8801d2843000 ffff8801d2843fc1 000000010000003f
raw: ffffea0006626a40 ffffea00066141a0 ffff8801dbc00100
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff8801d2843a00: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc
 ffff8801d2843a80: 00 00 00 fc fc fc fc fc fb fb fb fb fc fc fc fc
>ffff8801d2843b00: 00 00 00 00 fc fc fc fc fb fb fb fb fc fc fc fc
                                           ^
 ffff8801d2843b80: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc
 ffff8801d2843c00: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc

==================================================================

Fixes: cf124db566e6 ("net: Fix inconsistent teardown and release of private netdev state.")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 32ad87345f57..0a2c0a42283f 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1879,6 +1879,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 
 err_detach:
 	tun_detach_all(dev);
+	/* register_netdevice() already called tun_free_netdev() */
+	goto err_free_dev;
+
 err_free_flow:
 	tun_flow_uninit(tun);
 	security_tun_dev_free_security(tun->security);

From 5bfd37b4de5c98e86b12bd13be5aa46c7484a125 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 16 Aug 2017 09:41:54 -0700
Subject: [PATCH 107/149] tipc: fix use-after-free

syszkaller reported use-after-free in tipc [1]

When msg->rep skb is freed, set the pointer to NULL,
so that caller does not free it again.

[1]

==================================================================
BUG: KASAN: use-after-free in skb_push+0xd4/0xe0 net/core/skbuff.c:1466
Read of size 8 at addr ffff8801c6e71e90 by task syz-executor5/4115

CPU: 1 PID: 4115 Comm: syz-executor5 Not tainted 4.13.0-rc4+ #32
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:52
 print_address_description+0x73/0x250 mm/kasan/report.c:252
 kasan_report_error mm/kasan/report.c:351 [inline]
 kasan_report+0x24e/0x340 mm/kasan/report.c:409
 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:430
 skb_push+0xd4/0xe0 net/core/skbuff.c:1466
 tipc_nl_compat_recv+0x833/0x18f0 net/tipc/netlink_compat.c:1209
 genl_family_rcv_msg+0x7b7/0xfb0 net/netlink/genetlink.c:598
 genl_rcv_msg+0xb2/0x140 net/netlink/genetlink.c:623
 netlink_rcv_skb+0x216/0x440 net/netlink/af_netlink.c:2397
 genl_rcv+0x28/0x40 net/netlink/genetlink.c:634
 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline]
 netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1291
 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1854
 sock_sendmsg_nosec net/socket.c:633 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:643
 sock_write_iter+0x31a/0x5d0 net/socket.c:898
 call_write_iter include/linux/fs.h:1743 [inline]
 new_sync_write fs/read_write.c:457 [inline]
 __vfs_write+0x684/0x970 fs/read_write.c:470
 vfs_write+0x189/0x510 fs/read_write.c:518
 SYSC_write fs/read_write.c:565 [inline]
 SyS_write+0xef/0x220 fs/read_write.c:557
 entry_SYSCALL_64_fastpath+0x1f/0xbe
RIP: 0033:0x4512e9
RSP: 002b:00007f3bc8184c08 EFLAGS: 00000216 ORIG_RAX: 0000000000000001
RAX: ffffffffffffffda RBX: 0000000000718000 RCX: 00000000004512e9
RDX: 0000000000000020 RSI: 0000000020fdb000 RDI: 0000000000000006
RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000216 R12: 00000000004b5e76
R13: 00007f3bc8184b48 R14: 00000000004b5e86 R15: 0000000000000000

Allocated by task 4115:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551
 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:489
 kmem_cache_alloc_node+0x13d/0x750 mm/slab.c:3651
 __alloc_skb+0xf1/0x740 net/core/skbuff.c:219
 alloc_skb include/linux/skbuff.h:903 [inline]
 tipc_tlv_alloc+0x26/0xb0 net/tipc/netlink_compat.c:148
 tipc_nl_compat_dumpit+0xf2/0x3c0 net/tipc/netlink_compat.c:248
 tipc_nl_compat_handle net/tipc/netlink_compat.c:1130 [inline]
 tipc_nl_compat_recv+0x756/0x18f0 net/tipc/netlink_compat.c:1199
 genl_family_rcv_msg+0x7b7/0xfb0 net/netlink/genetlink.c:598
 genl_rcv_msg+0xb2/0x140 net/netlink/genetlink.c:623
 netlink_rcv_skb+0x216/0x440 net/netlink/af_netlink.c:2397
 genl_rcv+0x28/0x40 net/netlink/genetlink.c:634
 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline]
 netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1291
 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1854
 sock_sendmsg_nosec net/socket.c:633 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:643
 sock_write_iter+0x31a/0x5d0 net/socket.c:898
 call_write_iter include/linux/fs.h:1743 [inline]
 new_sync_write fs/read_write.c:457 [inline]
 __vfs_write+0x684/0x970 fs/read_write.c:470
 vfs_write+0x189/0x510 fs/read_write.c:518
 SYSC_write fs/read_write.c:565 [inline]
 SyS_write+0xef/0x220 fs/read_write.c:557
 entry_SYSCALL_64_fastpath+0x1f/0xbe

Freed by task 4115:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524
 __cache_free mm/slab.c:3503 [inline]
 kmem_cache_free+0x77/0x280 mm/slab.c:3763
 kfree_skbmem+0x1a1/0x1d0 net/core/skbuff.c:622
 __kfree_skb net/core/skbuff.c:682 [inline]
 kfree_skb+0x165/0x4c0 net/core/skbuff.c:699
 tipc_nl_compat_dumpit+0x36a/0x3c0 net/tipc/netlink_compat.c:260
 tipc_nl_compat_handle net/tipc/netlink_compat.c:1130 [inline]
 tipc_nl_compat_recv+0x756/0x18f0 net/tipc/netlink_compat.c:1199
 genl_family_rcv_msg+0x7b7/0xfb0 net/netlink/genetlink.c:598
 genl_rcv_msg+0xb2/0x140 net/netlink/genetlink.c:623
 netlink_rcv_skb+0x216/0x440 net/netlink/af_netlink.c:2397
 genl_rcv+0x28/0x40 net/netlink/genetlink.c:634
 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline]
 netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1291
 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1854
 sock_sendmsg_nosec net/socket.c:633 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:643
 sock_write_iter+0x31a/0x5d0 net/socket.c:898
 call_write_iter include/linux/fs.h:1743 [inline]
 new_sync_write fs/read_write.c:457 [inline]
 __vfs_write+0x684/0x970 fs/read_write.c:470
 vfs_write+0x189/0x510 fs/read_write.c:518
 SYSC_write fs/read_write.c:565 [inline]
 SyS_write+0xef/0x220 fs/read_write.c:557
 entry_SYSCALL_64_fastpath+0x1f/0xbe

The buggy address belongs to the object at ffff8801c6e71dc0
 which belongs to the cache skbuff_head_cache of size 224
The buggy address is located 208 bytes inside of
 224-byte region [ffff8801c6e71dc0, ffff8801c6e71ea0)
The buggy address belongs to the page:
page:ffffea00071b9c40 count:1 mapcount:0 mapping:ffff8801c6e71000 index:0x0
flags: 0x200000000000100(slab)
raw: 0200000000000100 ffff8801c6e71000 0000000000000000 000000010000000c
raw: ffffea0007224a20 ffff8801d98caf48 ffff8801d9e79040 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff8801c6e71d80: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb
 ffff8801c6e71e00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>ffff8801c6e71e80: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc
                         ^
 ffff8801c6e71f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
 ffff8801c6e71f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
==================================================================

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Dmitry Vyukov  <dvyukov@google.com>
Cc: Jon Maloy <jon.maloy@ericsson.com>
Cc: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/netlink_compat.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index 9bfe886ab330..750949dfc1d7 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -258,13 +258,15 @@ static int tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd,
 	arg = nlmsg_new(0, GFP_KERNEL);
 	if (!arg) {
 		kfree_skb(msg->rep);
+		msg->rep = NULL;
 		return -ENOMEM;
 	}
 
 	err = __tipc_nl_compat_dumpit(cmd, msg, arg);
-	if (err)
+	if (err) {
 		kfree_skb(msg->rep);
-
+		msg->rep = NULL;
+	}
 	kfree_skb(arg);
 
 	return err;

From 15339e441ec46fbc3bf3486bb1ae4845b0f1bb8d Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Wed, 16 Aug 2017 20:16:40 +0200
Subject: [PATCH 108/149] sctp: fully initialize the IPv6 address in
 sctp_v6_to_addr()

KMSAN reported use of uninitialized sctp_addr->v4.sin_addr.s_addr and
sctp_addr->v6.sin6_scope_id in sctp_v6_cmp_addr() (see below).
Make sure all fields of an IPv6 address are initialized, which
guarantees that the IPv4 fields are also initialized.

==================================================================
 BUG: KMSAN: use of uninitialized memory in sctp_v6_cmp_addr+0x8d4/0x9f0
 net/sctp/ipv6.c:517
 CPU: 2 PID: 31056 Comm: syz-executor1 Not tainted 4.11.0-rc5+ #2944
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs
 01/01/2011
 Call Trace:
  dump_stack+0x172/0x1c0 lib/dump_stack.c:42
  is_logbuf_locked mm/kmsan/kmsan.c:59 [inline]
  kmsan_report+0x12a/0x180 mm/kmsan/kmsan.c:938
  native_save_fl arch/x86/include/asm/irqflags.h:18 [inline]
  arch_local_save_flags arch/x86/include/asm/irqflags.h:72 [inline]
  arch_local_irq_save arch/x86/include/asm/irqflags.h:113 [inline]
  __msan_warning_32+0x61/0xb0 mm/kmsan/kmsan_instr.c:467
  sctp_v6_cmp_addr+0x8d4/0x9f0 net/sctp/ipv6.c:517
  sctp_v6_get_dst+0x8c7/0x1630 net/sctp/ipv6.c:290
  sctp_transport_route+0x101/0x570 net/sctp/transport.c:292
  sctp_assoc_add_peer+0x66d/0x16f0 net/sctp/associola.c:651
  sctp_sendmsg+0x35a5/0x4f90 net/sctp/socket.c:1871
  inet_sendmsg+0x498/0x670 net/ipv4/af_inet.c:762
  sock_sendmsg_nosec net/socket.c:633 [inline]
  sock_sendmsg net/socket.c:643 [inline]
  SYSC_sendto+0x608/0x710 net/socket.c:1696
  SyS_sendto+0x8a/0xb0 net/socket.c:1664
  entry_SYSCALL_64_fastpath+0x13/0x94
 RIP: 0033:0x44b479
 RSP: 002b:00007f6213f21c08 EFLAGS: 00000286 ORIG_RAX: 000000000000002c
 RAX: ffffffffffffffda RBX: 0000000020000000 RCX: 000000000044b479
 RDX: 0000000000000041 RSI: 0000000020edd000 RDI: 0000000000000006
 RBP: 00000000007080a8 R08: 0000000020b85fe4 R09: 000000000000001c
 R10: 0000000000040005 R11: 0000000000000286 R12: 00000000ffffffff
 R13: 0000000000003760 R14: 00000000006e5820 R15: 0000000000ff8000
 origin description: ----dst_saddr@sctp_v6_get_dst
 local variable created at:
  sk_fullsock include/net/sock.h:2321 [inline]
  inet6_sk include/linux/ipv6.h:309 [inline]
  sctp_v6_get_dst+0x91/0x1630 net/sctp/ipv6.c:241
  sctp_transport_route+0x101/0x570 net/sctp/transport.c:292
==================================================================
 BUG: KMSAN: use of uninitialized memory in sctp_v6_cmp_addr+0x8d4/0x9f0
 net/sctp/ipv6.c:517
 CPU: 2 PID: 31056 Comm: syz-executor1 Not tainted 4.11.0-rc5+ #2944
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs
 01/01/2011
 Call Trace:
  dump_stack+0x172/0x1c0 lib/dump_stack.c:42
  is_logbuf_locked mm/kmsan/kmsan.c:59 [inline]
  kmsan_report+0x12a/0x180 mm/kmsan/kmsan.c:938
  native_save_fl arch/x86/include/asm/irqflags.h:18 [inline]
  arch_local_save_flags arch/x86/include/asm/irqflags.h:72 [inline]
  arch_local_irq_save arch/x86/include/asm/irqflags.h:113 [inline]
  __msan_warning_32+0x61/0xb0 mm/kmsan/kmsan_instr.c:467
  sctp_v6_cmp_addr+0x8d4/0x9f0 net/sctp/ipv6.c:517
  sctp_v6_get_dst+0x8c7/0x1630 net/sctp/ipv6.c:290
  sctp_transport_route+0x101/0x570 net/sctp/transport.c:292
  sctp_assoc_add_peer+0x66d/0x16f0 net/sctp/associola.c:651
  sctp_sendmsg+0x35a5/0x4f90 net/sctp/socket.c:1871
  inet_sendmsg+0x498/0x670 net/ipv4/af_inet.c:762
  sock_sendmsg_nosec net/socket.c:633 [inline]
  sock_sendmsg net/socket.c:643 [inline]
  SYSC_sendto+0x608/0x710 net/socket.c:1696
  SyS_sendto+0x8a/0xb0 net/socket.c:1664
  entry_SYSCALL_64_fastpath+0x13/0x94
 RIP: 0033:0x44b479
 RSP: 002b:00007f6213f21c08 EFLAGS: 00000286 ORIG_RAX: 000000000000002c
 RAX: ffffffffffffffda RBX: 0000000020000000 RCX: 000000000044b479
 RDX: 0000000000000041 RSI: 0000000020edd000 RDI: 0000000000000006
 RBP: 00000000007080a8 R08: 0000000020b85fe4 R09: 000000000000001c
 R10: 0000000000040005 R11: 0000000000000286 R12: 00000000ffffffff
 R13: 0000000000003760 R14: 00000000006e5820 R15: 0000000000ff8000
 origin description: ----dst_saddr@sctp_v6_get_dst
 local variable created at:
  sk_fullsock include/net/sock.h:2321 [inline]
  inet6_sk include/linux/ipv6.h:309 [inline]
  sctp_v6_get_dst+0x91/0x1630 net/sctp/ipv6.c:241
  sctp_transport_route+0x101/0x570 net/sctp/transport.c:292
==================================================================

Signed-off-by: Alexander Potapenko <glider@google.com>
Reviewed-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/ipv6.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 2a186b201ad2..a4b6ffb61495 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -512,7 +512,9 @@ static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr,
 {
 	addr->sa.sa_family = AF_INET6;
 	addr->v6.sin6_port = port;
+	addr->v6.sin6_flowinfo = 0;
 	addr->v6.sin6_addr = *saddr;
+	addr->v6.sin6_scope_id = 0;
 }
 
 /* Compare addresses exactly.

From 383143f31d7d3525a1dbff733d52fff917f82f15 Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Wed, 16 Aug 2017 11:18:09 -0700
Subject: [PATCH 109/149] ipv6: reset fn->rr_ptr when replacing route

syzcaller reported the following use-after-free issue in rt6_select():
BUG: KASAN: use-after-free in rt6_select net/ipv6/route.c:755 [inline] at addr ffff8800bc6994e8
BUG: KASAN: use-after-free in ip6_pol_route.isra.46+0x1429/0x1470 net/ipv6/route.c:1084 at addr ffff8800bc6994e8
Read of size 4 by task syz-executor1/439628
CPU: 0 PID: 439628 Comm: syz-executor1 Not tainted 4.3.5+ #8
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
 0000000000000000 ffff88018fe435b0 ffffffff81ca384d ffff8801d3588c00
 ffff8800bc699380 ffff8800bc699500 dffffc0000000000 ffff8801d40a47c0
 ffff88018fe435d8 ffffffff81735751 ffff88018fe43660 ffff8800bc699380
Call Trace:
 [<ffffffff81ca384d>] __dump_stack lib/dump_stack.c:15 [inline]
 [<ffffffff81ca384d>] dump_stack+0xc1/0x124 lib/dump_stack.c:51
sctp: [Deprecated]: syz-executor0 (pid 439615) Use of struct sctp_assoc_value in delayed_ack socket option.
Use struct sctp_sack_info instead
 [<ffffffff81735751>] kasan_object_err+0x21/0x70 mm/kasan/report.c:158
 [<ffffffff817359c4>] print_address_description mm/kasan/report.c:196 [inline]
 [<ffffffff817359c4>] kasan_report_error+0x1b4/0x4a0 mm/kasan/report.c:285
 [<ffffffff81735d93>] kasan_report mm/kasan/report.c:305 [inline]
 [<ffffffff81735d93>] __asan_report_load4_noabort+0x43/0x50 mm/kasan/report.c:325
 [<ffffffff82a28e39>] rt6_select net/ipv6/route.c:755 [inline]
 [<ffffffff82a28e39>] ip6_pol_route.isra.46+0x1429/0x1470 net/ipv6/route.c:1084
 [<ffffffff82a28fb1>] ip6_pol_route_output+0x81/0xb0 net/ipv6/route.c:1203
 [<ffffffff82ab0a50>] fib6_rule_action+0x1f0/0x680 net/ipv6/fib6_rules.c:95
 [<ffffffff8265cbb6>] fib_rules_lookup+0x2a6/0x7a0 net/core/fib_rules.c:223
 [<ffffffff82ab1430>] fib6_rule_lookup+0xd0/0x250 net/ipv6/fib6_rules.c:41
 [<ffffffff82a22006>] ip6_route_output+0x1d6/0x2c0 net/ipv6/route.c:1224
 [<ffffffff829e83d2>] ip6_dst_lookup_tail+0x4d2/0x890 net/ipv6/ip6_output.c:943
 [<ffffffff829e889a>] ip6_dst_lookup_flow+0x9a/0x250 net/ipv6/ip6_output.c:1079
 [<ffffffff82a9f7d8>] ip6_datagram_dst_update+0x538/0xd40 net/ipv6/datagram.c:91
 [<ffffffff82aa0978>] __ip6_datagram_connect net/ipv6/datagram.c:251 [inline]
 [<ffffffff82aa0978>] ip6_datagram_connect+0x518/0xe50 net/ipv6/datagram.c:272
 [<ffffffff82aa1313>] ip6_datagram_connect_v6_only+0x63/0x90 net/ipv6/datagram.c:284
 [<ffffffff8292f790>] inet_dgram_connect+0x170/0x1f0 net/ipv4/af_inet.c:564
 [<ffffffff82565547>] SYSC_connect+0x1a7/0x2f0 net/socket.c:1582
 [<ffffffff8256a649>] SyS_connect+0x29/0x30 net/socket.c:1563
 [<ffffffff82c72032>] entry_SYSCALL_64_fastpath+0x12/0x17
Object at ffff8800bc699380, in cache ip6_dst_cache size: 384

The root cause of it is that in fib6_add_rt2node(), when it replaces an
existing route with the new one, it does not update fn->rr_ptr.
This commit resets fn->rr_ptr to NULL when it points to a route which is
replaced in fib6_add_rt2node().

Fixes: 27596472473a ("ipv6: fix ECMP route replacement")
Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index ebb299cf72b7..e6f878067c68 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -914,6 +914,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 		}
 		nsiblings = iter->rt6i_nsiblings;
 		fib6_purge_rt(iter, fn, info->nl_net);
+		if (fn->rr_ptr == iter)
+			fn->rr_ptr = NULL;
 		rt6_release(iter);
 
 		if (nsiblings) {
@@ -926,6 +928,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 				if (rt6_qualify_for_ecmp(iter)) {
 					*ins = iter->dst.rt6_next;
 					fib6_purge_rt(iter, fn, info->nl_net);
+					if (fn->rr_ptr == iter)
+						fn->rr_ptr = NULL;
 					rt6_release(iter);
 					nsiblings--;
 				} else {

From bc3aae2bbac46dd894c89db5d5e98f7f0ef9e205 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Wed, 16 Aug 2017 12:38:52 -0700
Subject: [PATCH 110/149] net: check and errout if res->fi is NULL when
 RTM_F_FIB_MATCH is set

Syzkaller hit 'general protection fault in fib_dump_info' bug on
commit 4.13-rc5..

Guilty file: net/ipv4/fib_semantics.c

kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] SMP KASAN
Modules linked in:
CPU: 0 PID: 2808 Comm: syz-executor0 Not tainted 4.13.0-rc5 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
Ubuntu-1.8.2-1ubuntu1 04/01/2014
task: ffff880078562700 task.stack: ffff880078110000
RIP: 0010:fib_dump_info+0x388/0x1170 net/ipv4/fib_semantics.c:1314
RSP: 0018:ffff880078117010 EFLAGS: 00010206
RAX: dffffc0000000000 RBX: 00000000000000fe RCX: 0000000000000002
RDX: 0000000000000006 RSI: ffff880078117084 RDI: 0000000000000030
RBP: ffff880078117268 R08: 000000000000000c R09: ffff8800780d80c8
R10: 0000000058d629b4 R11: 0000000067fce681 R12: 0000000000000000
R13: ffff8800784bd540 R14: ffff8800780d80b5 R15: ffff8800780d80a4
FS:  00000000022fa940(0000) GS:ffff88007fc00000(0000)
knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000004387d0 CR3: 0000000079135000 CR4: 00000000000006f0
Call Trace:
  inet_rtm_getroute+0xc89/0x1f50 net/ipv4/route.c:2766
  rtnetlink_rcv_msg+0x288/0x680 net/core/rtnetlink.c:4217
  netlink_rcv_skb+0x340/0x470 net/netlink/af_netlink.c:2397
  rtnetlink_rcv+0x28/0x30 net/core/rtnetlink.c:4223
  netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline]
  netlink_unicast+0x4c4/0x6e0 net/netlink/af_netlink.c:1291
  netlink_sendmsg+0x8c4/0xca0 net/netlink/af_netlink.c:1854
  sock_sendmsg_nosec net/socket.c:633 [inline]
  sock_sendmsg+0xca/0x110 net/socket.c:643
  ___sys_sendmsg+0x779/0x8d0 net/socket.c:2035
  __sys_sendmsg+0xd1/0x170 net/socket.c:2069
  SYSC_sendmsg net/socket.c:2080 [inline]
  SyS_sendmsg+0x2d/0x50 net/socket.c:2076
  entry_SYSCALL_64_fastpath+0x1a/0xa5
  RIP: 0033:0x4512e9
  RSP: 002b:00007ffc75584cc8 EFLAGS: 00000216 ORIG_RAX:
  000000000000002e
  RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00000000004512e9
  RDX: 0000000000000000 RSI: 0000000020f2cfc8 RDI: 0000000000000003
  RBP: 000000000000000e R08: 0000000000000000 R09: 0000000000000000
  R10: 0000000000000000 R11: 0000000000000216 R12: fffffffffffffffe
  R13: 0000000000718000 R14: 0000000020c44ff0 R15: 0000000000000000
  Code: 00 0f b6 8d ec fd ff ff 48 8b 85 f0 fd ff ff 88 48 17 48 8b 45
  28 48 8d 78 30 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03
  <0f>
  b6 04 02 84 c0 74 08 3c 03 0f 8e cb 0c 00 00 48 8b 45 28 44
  RIP: fib_dump_info+0x388/0x1170 net/ipv4/fib_semantics.c:1314 RSP:
  ffff880078117010
---[ end trace 254a7af28348f88b ]---

This patch adds a res->fi NULL check.

example run:
$ip route get 0.0.0.0 iif virt1-0
broadcast 0.0.0.0 dev lo
    cache <local,brd> iif virt1-0

$ip route get 0.0.0.0 iif virt1-0 fibmatch
RTNETLINK answers: No route to host

Reported-by: idaifish <idaifish@gmail.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Fixes: b61798130f1b ("net: ipv4: RTM_GETROUTE: return matched fib result when requested")
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index fe877a4a72b1..2331de20ca50 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2763,14 +2763,21 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
 		table_id = rt->rt_table_id;
 
-	if (rtm->rtm_flags & RTM_F_FIB_MATCH)
+	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
+		if (!res.fi) {
+			err = fib_props[res.type].error;
+			if (!err)
+				err = -EHOSTUNREACH;
+			goto errout_free;
+		}
 		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
 				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
 				    rt->rt_type, res.prefix, res.prefixlen,
 				    fl4.flowi4_tos, res.fi, 0);
-	else
+	} else {
 		err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
 				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
+	}
 	if (err < 0)
 		goto errout_free;
 

From cdbeb633ca71a02b7b63bfeb94994bf4e1a0b894 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Wed, 16 Aug 2017 17:53:36 -0400
Subject: [PATCH 111/149] tcp: when rearming RTO, if RTO time is in past then
 fire RTO ASAP

In some situations tcp_send_loss_probe() can realize that it's unable
to send a loss probe (TLP), and falls back to calling tcp_rearm_rto()
to schedule an RTO timer. In such cases, sometimes tcp_rearm_rto()
realizes that the RTO was eligible to fire immediately or at some
point in the past (delta_us <= 0). Previously in such cases
tcp_rearm_rto() was scheduling such "overdue" RTOs to happen at now +
icsk_rto, which caused needless delays of hundreds of milliseconds
(and non-linear behavior that made reproducible testing
difficult). This commit changes the logic to schedule "overdue" RTOs
ASAP, rather than at now + icsk_rto.

Fixes: 6ba8a3b19e76 ("tcp: Tail loss probe (TLP)")
Suggested-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 53de1424c13c..bab7f0493098 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3009,8 +3009,7 @@ void tcp_rearm_rto(struct sock *sk)
 			/* delta_us may not be positive if the socket is locked
 			 * when the retrans timer fires and is rescheduled.
 			 */
-			if (delta_us > 0)
-				rto = usecs_to_jiffies(delta_us);
+			rto = usecs_to_jiffies(max_t(int, delta_us, 1));
 		}
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
 					  TCP_RTO_MAX);

From b6f6d56c91f5261c55edef2df300698c4486b669 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Thu, 17 Aug 2017 13:06:14 +0200
Subject: [PATCH 112/149] PCI: Allow PCI express root ports to find themselves

If the pci_find_pcie_root_port() function is called on a root port
itself, return the root port rather than NULL.

This effectively reverts commit 0e405232871d6 ("PCI: fix oops when
try to find Root Port for a PCI device") which added an extra check
that would now be redundant.

Fixes: a99b646afa8a ("PCI: Disable PCIe Relaxed Ordering if unsupported")
Fixes: c56d4450eb68 ("PCI: Turn off Request Attributes to avoid Chelsio T5 Completion erratum")
Signed-off-by: Thierry Reding <treding@nvidia.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Tested-by: Shawn Lin <shawn.lin@rock-chips.com>
Tested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/pci/pci.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index da5570cf5c6a..fdf65a6c13f6 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -514,7 +514,7 @@ EXPORT_SYMBOL(pci_find_resource);
  */
 struct pci_dev *pci_find_pcie_root_port(struct pci_dev *dev)
 {
-	struct pci_dev *bridge, *highest_pcie_bridge = NULL;
+	struct pci_dev *bridge, *highest_pcie_bridge = dev;
 
 	bridge = pci_upstream_bridge(dev);
 	while (bridge && pci_is_pcie(bridge)) {
@@ -522,11 +522,10 @@ struct pci_dev *pci_find_pcie_root_port(struct pci_dev *dev)
 		bridge = pci_upstream_bridge(bridge);
 	}
 
-	if (highest_pcie_bridge &&
-	    pci_pcie_type(highest_pcie_bridge) == PCI_EXP_TYPE_ROOT_PORT)
-		return highest_pcie_bridge;
+	if (pci_pcie_type(highest_pcie_bridge) != PCI_EXP_TYPE_ROOT_PORT)
+		return NULL;
 
-	return NULL;
+	return highest_pcie_bridge;
 }
 EXPORT_SYMBOL(pci_find_pcie_root_port);
 

From ca3d89a3ebe79367bd41b6b8ba37664478ae2dba Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Thu, 17 Aug 2017 18:29:52 +0300
Subject: [PATCH 113/149] net/mlx4_core: Enable 4K UAR if SRIOV module
 parameter is not enabled

enable_4k_uar module parameter was added in patch cited below to
address the backward compatibility issue in SRIOV when the VM has
system's PAGE_SIZE uar implementation and the Hypervisor has 4k uar
implementation.

The above compatibility issue does not exist in the non SRIOV case.
In this patch, we always enable 4k uar implementation if SRIOV
is not enabled on mlx4's supported cards.

Fixes: 76e39ccf9c36 ("net/mlx4_core: Fix backward compatibility on VFs")
Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Reviewed-by: Daniel Jurgens <danielj@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 09b9bc17bce9..5fe5cdc51357 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -432,7 +432,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		/* Virtual PCI function needs to determine UAR page size from
 		 * firmware. Only master PCI function can set the uar page size
 		 */
-		if (enable_4k_uar)
+		if (enable_4k_uar || !dev->persist->num_vfs)
 			dev->uar_page_shift = DEFAULT_UAR_PAGE_SHIFT;
 		else
 			dev->uar_page_shift = PAGE_SHIFT;
@@ -2277,7 +2277,7 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
 
 		dev->caps.max_fmr_maps = (1 << (32 - ilog2(dev->caps.num_mpts))) - 1;
 
-		if (enable_4k_uar) {
+		if (enable_4k_uar || !dev->persist->num_vfs) {
 			init_hca.log_uar_sz = ilog2(dev->caps.num_uars) +
 						    PAGE_SHIFT - DEFAULT_UAR_PAGE_SHIFT;
 			init_hca.uar_page_sz = DEFAULT_UAR_PAGE_SHIFT - 12;

From b024d949a3c24255a7ef1a470420eb478949aa4c Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Thu, 17 Aug 2017 23:14:58 +0100
Subject: [PATCH 114/149] irda: do not leak initialized list.dev to userspace

list.dev has not been initialized and so the copy_to_user is copying
data from the stack back to user space which is a potential
information leak. Fix this ensuring all of list is initialized to
zero.

Detected by CoverityScan, CID#1357894 ("Uninitialized scalar variable")

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/irda/af_irda.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 2e6990f8b80b..23fa7c8b09a5 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -2213,7 +2213,7 @@ static int irda_getsockopt(struct socket *sock, int level, int optname,
 {
 	struct sock *sk = sock->sk;
 	struct irda_sock *self = irda_sk(sk);
-	struct irda_device_list list;
+	struct irda_device_list list = { 0 };
 	struct irda_device_info *discoveries;
 	struct irda_ias_set *	ias_opt;	/* IAS get/query params */
 	struct ias_object *	ias_obj;	/* Object in IAS */

From 9a19bad70cf16b0cdf3576efda7deb490e7aa529 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 18 Aug 2017 00:19:42 +0100
Subject: [PATCH 115/149] rxrpc: Fix oops when discarding a preallocated
 service call

rxrpc_service_prealloc_one() doesn't set the socket pointer on any new call
it preallocates, but does add it to the rxrpc net namespace call list.
This, however, causes rxrpc_put_call() to oops when the call is discarded
when the socket is closed.  rxrpc_put_call() needs the socket to be able to
reach the namespace so that it can use a lock held therein.

Fix this by setting a call's socket pointer immediately before discarding
it.

This can be triggered by unloading the kafs module, resulting in an oops
like the following:

BUG: unable to handle kernel NULL pointer dereference at 0000000000000030
IP: rxrpc_put_call+0x1e2/0x32d
PGD 0
P4D 0
Oops: 0000 [#1] SMP
Modules linked in: kafs(E-)
CPU: 3 PID: 3037 Comm: rmmod Tainted: G            E   4.12.0-fscache+ #213
Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014
task: ffff8803fc92e2c0 task.stack: ffff8803fef74000
RIP: 0010:rxrpc_put_call+0x1e2/0x32d
RSP: 0018:ffff8803fef77e08 EFLAGS: 00010282
RAX: 0000000000000000 RBX: ffff8803fab99ac0 RCX: 000000000000000f
RDX: ffffffff81c50a40 RSI: 000000000000000c RDI: ffff8803fc92ea88
RBP: ffff8803fef77e30 R08: ffff8803fc87b941 R09: ffffffff82946d20
R10: ffff8803fef77d10 R11: 00000000000076fc R12: 0000000000000005
R13: ffff8803fab99c20 R14: 0000000000000001 R15: ffffffff816c6aee
FS:  00007f915a059700(0000) GS:ffff88041fb80000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000030 CR3: 00000003fef39000 CR4: 00000000001406e0
Call Trace:
 rxrpc_discard_prealloc+0x325/0x341
 rxrpc_listen+0xf9/0x146
 kernel_listen+0xb/0xd
 afs_close_socket+0x3e/0x173 [kafs]
 afs_exit+0x1f/0x57 [kafs]
 SyS_delete_module+0x10f/0x19a
 do_syscall_64+0x8a/0x149
 entry_SYSCALL64_slow_path+0x25/0x25

Fixes: 2baec2c3f854 ("rxrpc: Support network namespacing")
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rxrpc/call_accept.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index dd30d74824b0..ec3383f97d4c 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -223,6 +223,7 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx)
 	tail = b->call_backlog_tail;
 	while (CIRC_CNT(head, tail, size) > 0) {
 		struct rxrpc_call *call = b->call_backlog[tail];
+		call->socket = rx;
 		if (rx->discard_new_call) {
 			_debug("discard %lx", call->user_call_ID);
 			rx->discard_new_call(call, call->user_call_ID);

From 4f8a881acc9d1adaf1e552349a0b1df28933a04c Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 18 Aug 2017 11:01:36 +0800
Subject: [PATCH 116/149] net: sched: fix NULL pointer dereference when action
 calls some targets

As we know in some target's checkentry it may dereference par.entryinfo
to check entry stuff inside. But when sched action calls xt_check_target,
par.entryinfo is set with NULL. It would cause kernel panic when calling
some targets.

It can be reproduce with:
  # tc qd add dev eth1 ingress handle ffff:
  # tc filter add dev eth1 parent ffff: u32 match u32 0 0 action xt \
    -j ECN --ecn-tcp-remove

It could also crash kernel when using target CLUSTERIP or TPROXY.

By now there's no proper value for par.entryinfo in ipt_init_target,
but it can not be set with NULL. This patch is to void all these
panics by setting it with an ipt_entry obj with all members = 0.

Note that this issue has been there since the very beginning.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_ipt.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index d516ba8178b8..541707802a23 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -41,6 +41,7 @@ static int ipt_init_target(struct net *net, struct xt_entry_target *t,
 {
 	struct xt_tgchk_param par;
 	struct xt_target *target;
+	struct ipt_entry e = {};
 	int ret = 0;
 
 	target = xt_request_find_target(AF_INET, t->u.user.name,
@@ -52,6 +53,7 @@ static int ipt_init_target(struct net *net, struct xt_entry_target *t,
 	memset(&par, 0, sizeof(par));
 	par.net       = net;
 	par.table     = table;
+	par.entryinfo = &e;
 	par.target    = target;
 	par.targinfo  = t->data;
 	par.hook_mask = hook;

From 8c97023cf0518f172b8cb7a9fffc28b89401abbf Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 26 Jul 2017 15:36:23 +0200
Subject: [PATCH 117/149] Kbuild: use -fshort-wchar globally

Commit 971a69db7dc0 ("Xen: don't warn about 2-byte wchar_t in efi")
added the --no-wchar-size-warning to the Makefile to avoid this
harmless warning:

arm-linux-gnueabi-ld: warning: drivers/xen/efi.o uses 2-byte wchar_t yet the output is to use 4-byte wchar_t; use of wchar_t values across objects may fail

Changing kbuild to use thin archives instead of recursive linking
unfortunately brings the same warning back during the final link.

The kernel does not use wchar_t string literals at this point, and
xen does not use wchar_t at all (only efi_char16_t), so the flag
has no effect, but as pointed out by Jan Beulich, adding a wchar_t
string literal would be bad here.

Since wchar_t is always defined as u16, independent of the toolchain
default, always passing -fshort-wchar is correct and lets us
remove the Xen specific hack along with fixing the warning.

Link: https://patchwork.kernel.org/patch/9275217/
Fixes: 971a69db7dc0 ("Xen: don't warn about 2-byte wchar_t in efi")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 Makefile             | 2 +-
 drivers/xen/Makefile | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index 473f1aa76f6d..4dac1107913b 100644
--- a/Makefile
+++ b/Makefile
@@ -396,7 +396,7 @@ LINUXINCLUDE    := \
 KBUILD_CPPFLAGS := -D__KERNEL__
 
 KBUILD_CFLAGS   := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
-		   -fno-strict-aliasing -fno-common \
+		   -fno-strict-aliasing -fno-common -fshort-wchar \
 		   -Werror-implicit-function-declaration \
 		   -Wno-format-security \
 		   -std=gnu89 $(call cc-option,-fno-PIE)
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 8feab810aed9..7f188b8d0c67 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -7,9 +7,6 @@ obj-y	+= xenbus/
 nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_features.o			:= $(nostackp)
 
-CFLAGS_efi.o				+= -fshort-wchar
-LDFLAGS					+= $(call ld-option, --no-wchar-size-warning)
-
 dom0-$(CONFIG_ARM64) += arm-device.o
 dom0-$(CONFIG_PCI) += pci.o
 dom0-$(CONFIG_USB_SUPPORT) += dbgp.o

From 801d2e9f1cfb24c15ec821288b6bc2e3107f6fcd Mon Sep 17 00:00:00 2001
From: Shuah Khan <shuahkh@osg.samsung.com>
Date: Fri, 18 Aug 2017 15:54:41 -0600
Subject: [PATCH 118/149] Makefile: add kselftest-clean to PHONY target list

kselftest-clean isn't in the PHONY target list. Add it.

Signed-off-by: Shuah Khan <shuahkh@osg.samsung.com>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Makefile b/Makefile
index 4dac1107913b..c913d80ea6e8 100644
--- a/Makefile
+++ b/Makefile
@@ -1184,6 +1184,7 @@ PHONY += kselftest
 kselftest:
 	$(Q)$(MAKE) -C tools/testing/selftests run_tests
 
+PHONY += kselftest-clean
 kselftest-clean:
 	$(Q)$(MAKE) -C tools/testing/selftests clean
 

From 2bfbe7881ee03d12bbedb05f804d139c88df30a0 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Sat, 19 Aug 2017 22:30:02 +0100
Subject: [PATCH 119/149] kbuild: Do not use hyphen in exported variable name

This definition in Makefile.dtbinst:

    export dtbinst-root ?= $(obj)

should define and export dtbinst-root when handling the root dts
directory, and do nothing in the subdirectories.  However some shells,
including dash, will not pass through environment variables whose name
includes a hyphen.  Usually GNU make does not use a shell to recurse,
but if e.g. $(srctree) contains '~' it will use a shell here.

Rename the variable to dtbinst_root.

References: https://bugs.debian.org/833561
Fixes: 323a028d39cdi ("dts, kbuild: Implement support for dtb vendor subdirs")
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 scripts/Makefile.dtbinst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/Makefile.dtbinst b/scripts/Makefile.dtbinst
index 34614a48b717..993fb85982df 100644
--- a/scripts/Makefile.dtbinst
+++ b/scripts/Makefile.dtbinst
@@ -14,7 +14,7 @@ src := $(obj)
 PHONY := __dtbs_install
 __dtbs_install:
 
-export dtbinst-root ?= $(obj)
+export dtbinst_root ?= $(obj)
 
 include include/config/auto.conf
 include scripts/Kbuild.include
@@ -27,7 +27,7 @@ dtbinst-dirs	:= $(dts-dirs)
 quiet_cmd_dtb_install =	INSTALL $<
       cmd_dtb_install =	mkdir -p $(2); cp $< $(2)
 
-install-dir = $(patsubst $(dtbinst-root)%,$(INSTALL_DTBS_PATH)%,$(obj))
+install-dir = $(patsubst $(dtbinst_root)%,$(INSTALL_DTBS_PATH)%,$(obj))
 
 $(dtbinst-files): %.dtb: $(obj)/%.dtb
 	$(call cmd,dtb_install,$(install-dir))

From d4dd2d75a26ef07cadc2949efeea9fabc2a5c299 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 21 Aug 2017 00:26:03 +0200
Subject: [PATCH 120/149] bpf, doc: also add s390x as arch to sysctl
 description

Looks like this was accidentally missed, so still add s390x
as supported eBPF JIT arch to bpf_jit_enable.

Fixes: 014cd0a368dc ("bpf: Update sysctl documentation to list all supported architectures")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/sysctl/net.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index d7c2b88b92ae..28596e03220b 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -49,6 +49,7 @@ two flavors of JITs, the newer eBPF JIT currently supported on:
   - ppc64
   - sparc64
   - mips64
+  - s390x
 
 And the older cBPF JIT supported on the following archs:
   - arm

From 5a78449810b06c3bc5fcd002d52e1a64f9bb397e Mon Sep 17 00:00:00 2001
From: Chris Packham <chris.packham@alliedtelesis.co.nz>
Date: Mon, 21 Aug 2017 08:52:54 +1200
Subject: [PATCH 121/149] switchdev: documentation: minor typo fixes

Two typos in switchdev.txt

Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/switchdev.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/switchdev.txt b/Documentation/networking/switchdev.txt
index 3e7b946dea27..5e40e1f68873 100644
--- a/Documentation/networking/switchdev.txt
+++ b/Documentation/networking/switchdev.txt
@@ -228,7 +228,7 @@ Learning on the device port should be enabled, as well as learning_sync:
 	bridge link set dev DEV learning on self
 	bridge link set dev DEV learning_sync on self
 
-Learning_sync attribute enables syncing of the learned/forgotton FDB entry to
+Learning_sync attribute enables syncing of the learned/forgotten FDB entry to
 the bridge's FDB.  It's possible, but not optimal, to enable learning on the
 device port and on the bridge port, and disable learning_sync.
 
@@ -245,7 +245,7 @@ the responsibility of the port driver/device to age out these entries.  If the
 port device supports ageing, when the FDB entry expires, it will notify the
 driver which in turn will notify the bridge with SWITCHDEV_FDB_DEL.  If the
 device does not support ageing, the driver can simulate ageing using a
-garbage collection timer to monitor FBD entries.  Expired entries will be
+garbage collection timer to monitor FDB entries.  Expired entries will be
 notified to the bridge using SWITCHDEV_FDB_DEL.  See rocker driver for
 example of driver running ageing timer.
 

From 49bf4b36fdee4db3c8bc0507a8413a93a8c711cf Mon Sep 17 00:00:00 2001
From: Eric Leblond <eric@regit.org>
Date: Sun, 20 Aug 2017 21:48:14 +0200
Subject: [PATCH 122/149] tools lib bpf: improve warning

Signed-off-by: Eric Leblond <eric@regit.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/lib/bpf/libbpf.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 1a2c07eb7795..8c67a90dbd82 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -879,7 +879,8 @@ bpf_object__create_maps(struct bpf_object *obj)
 			size_t j;
 			int err = *pfd;
 
-			pr_warning("failed to create map: %s\n",
+			pr_warning("failed to create map (name: '%s'): %s\n",
+				   obj->maps[i].name,
 				   strerror(errno));
 			for (j = 0; j < i; j++)
 				zclose(obj->maps[j].fd);

From 68a66d149a8c78ec6720f268597302883e48e9fa Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Sat, 19 Aug 2017 15:37:07 +0300
Subject: [PATCH 123/149] net_sched: fix order of queue length updates in
 qdisc_replace()

This important to call qdisc_tree_reduce_backlog() after changing queue
length. Parent qdisc should deactivate class in ->qlen_notify() called from
qdisc_tree_reduce_backlog() but this happens only if qdisc->q.qlen in zero.

Missed class deactivations leads to crashes/warnings at picking packets
from empty qdisc and corrupting state at reactivating this class in future.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Fixes: 86a7996cc8a0 ("net_sched: introduce qdisc_replace() helper")
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 1c123e2b2415..67f815e5d525 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -806,8 +806,11 @@ static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new,
 	old = *pold;
 	*pold = new;
 	if (old != NULL) {
-		qdisc_tree_reduce_backlog(old, old->q.qlen, old->qstats.backlog);
+		unsigned int qlen = old->q.qlen;
+		unsigned int backlog = old->qstats.backlog;
+
 		qdisc_reset(old);
+		qdisc_tree_reduce_backlog(old, qlen, backlog);
 	}
 	sch_tree_unlock(sch);
 

From 348a4002729ccab8b888b38cbc099efa2f2a2036 Mon Sep 17 00:00:00 2001
From: Wei Wang <weiwan@google.com>
Date: Fri, 18 Aug 2017 17:14:49 -0700
Subject: [PATCH 124/149] ipv6: repair fib6 tree in failure case

In fib6_add(), it is possible that fib6_add_1() picks an intermediate
node and sets the node's fn->leaf to NULL in order to add this new
route. However, if fib6_add_rt2node() fails to add the new
route for some reason, fn->leaf will be left as NULL and could
potentially cause crash when fn->leaf is accessed in fib6_locate().
This patch makes sure fib6_repair_tree() is called to properly repair
fn->leaf in the above failure case.

Here is the syzkaller reported general protection fault in fib6_locate:
kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] SMP KASAN
Modules linked in:
CPU: 0 PID: 40937 Comm: syz-executor3 Not tainted
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
task: ffff8801d7d64100 ti: ffff8801d01a0000 task.ti: ffff8801d01a0000
RIP: 0010:[<ffffffff82a3e0e1>]  [<ffffffff82a3e0e1>] __ipv6_prefix_equal64_half include/net/ipv6.h:475 [inline]
RIP: 0010:[<ffffffff82a3e0e1>]  [<ffffffff82a3e0e1>] ipv6_prefix_equal include/net/ipv6.h:492 [inline]
RIP: 0010:[<ffffffff82a3e0e1>]  [<ffffffff82a3e0e1>] fib6_locate_1 net/ipv6/ip6_fib.c:1210 [inline]
RIP: 0010:[<ffffffff82a3e0e1>]  [<ffffffff82a3e0e1>] fib6_locate+0x281/0x3c0 net/ipv6/ip6_fib.c:1233
RSP: 0018:ffff8801d01a36a8  EFLAGS: 00010202
RAX: 0000000000000020 RBX: ffff8801bc790e00 RCX: ffffc90002983000
RDX: 0000000000001219 RSI: ffff8801d01a37a0 RDI: 0000000000000100
RBP: ffff8801d01a36f0 R08: 00000000000000ff R09: 0000000000000000
R10: 0000000000000003 R11: 0000000000000000 R12: 0000000000000001
R13: dffffc0000000000 R14: ffff8801d01a37a0 R15: 0000000000000000
FS:  00007f6afd68c700(0000) GS:ffff8801db400000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00000000004c6340 CR3: 00000000ba41f000 CR4: 00000000001426f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Stack:
 ffff8801d01a37a8 ffff8801d01a3780 ffffed003a0346f5 0000000c82a23ea0
 ffff8800b7bd7700 ffff8801d01a3780 ffff8800b6a1c940 ffffffff82a23ea0
 ffff8801d01a3920 ffff8801d01a3748 ffffffff82a223d6 ffff8801d7d64988
Call Trace:
 [<ffffffff82a223d6>] ip6_route_del+0x106/0x570 net/ipv6/route.c:2109
 [<ffffffff82a23f9d>] inet6_rtm_delroute+0xfd/0x100 net/ipv6/route.c:3075
 [<ffffffff82621359>] rtnetlink_rcv_msg+0x549/0x7a0 net/core/rtnetlink.c:3450
 [<ffffffff8274c1d1>] netlink_rcv_skb+0x141/0x370 net/netlink/af_netlink.c:2281
 [<ffffffff82613ddf>] rtnetlink_rcv+0x2f/0x40 net/core/rtnetlink.c:3456
 [<ffffffff8274ad38>] netlink_unicast_kernel net/netlink/af_netlink.c:1206 [inline]
 [<ffffffff8274ad38>] netlink_unicast+0x518/0x750 net/netlink/af_netlink.c:1232
 [<ffffffff8274b83e>] netlink_sendmsg+0x8ce/0xc30 net/netlink/af_netlink.c:1778
 [<ffffffff82564aff>] sock_sendmsg_nosec net/socket.c:609 [inline]
 [<ffffffff82564aff>] sock_sendmsg+0xcf/0x110 net/socket.c:619
 [<ffffffff82564d62>] sock_write_iter+0x222/0x3a0 net/socket.c:834
 [<ffffffff8178523d>] new_sync_write+0x1dd/0x2b0 fs/read_write.c:478
 [<ffffffff817853f4>] __vfs_write+0xe4/0x110 fs/read_write.c:491
 [<ffffffff81786c38>] vfs_write+0x178/0x4b0 fs/read_write.c:538
 [<ffffffff817892a9>] SYSC_write fs/read_write.c:585 [inline]
 [<ffffffff817892a9>] SyS_write+0xd9/0x1b0 fs/read_write.c:577
 [<ffffffff82c71e32>] entry_SYSCALL_64_fastpath+0x12/0x17

Note: there is no "Fixes" tag as this seems to be a bug introduced
very early.

Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index e6f878067c68..5cc0ea038198 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1018,7 +1018,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 			/* Create subtree root node */
 			sfn = node_alloc();
 			if (!sfn)
-				goto st_failure;
+				goto failure;
 
 			sfn->leaf = info->nl_net->ipv6.ip6_null_entry;
 			atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref);
@@ -1035,12 +1035,12 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 
 			if (IS_ERR(sn)) {
 				/* If it is failed, discard just allocated
-				   root, and then (in st_failure) stale node
+				   root, and then (in failure) stale node
 				   in main tree.
 				 */
 				node_free(sfn);
 				err = PTR_ERR(sn);
-				goto st_failure;
+				goto failure;
 			}
 
 			/* Now link new subtree to main tree */
@@ -1055,7 +1055,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 
 			if (IS_ERR(sn)) {
 				err = PTR_ERR(sn);
-				goto st_failure;
+				goto failure;
 			}
 		}
 
@@ -1096,18 +1096,17 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 			atomic_inc(&pn->leaf->rt6i_ref);
 		}
 #endif
-		/* Always release dst as dst->__refcnt is guaranteed
-		 * to be taken before entering this function
-		 */
-		dst_release_immediate(&rt->dst);
+		goto failure;
 	}
 	return err;
 
-#ifdef CONFIG_IPV6_SUBTREES
-	/* Subtree creation failed, probably main tree node
-	   is orphan. If it is, shoot it.
+failure:
+	/* fn->leaf could be NULL if fn is an intermediate node and we
+	 * failed to add the new route to it in both subtree creation
+	 * failure and fib6_add_rt2node() failure case.
+	 * In both cases, fib6_repair_tree() should be called to fix
+	 * fn->leaf.
 	 */
-st_failure:
 	if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))
 		fib6_repair_tree(info->nl_net, fn);
 	/* Always release dst as dst->__refcnt is guaranteed
@@ -1115,7 +1114,6 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 	 */
 	dst_release_immediate(&rt->dst);
 	return err;
-#endif
 }
 
 /*

From 4459398b6d9e3055ced5de9820364b3bdd79ac25 Mon Sep 17 00:00:00 2001
From: Keerthy <j-keerthy@ti.com>
Date: Mon, 31 Jul 2017 10:01:36 +0530
Subject: [PATCH 125/149] soc: ti: knav: Add a NULL pointer check for kdev in
 knav_pool_create

knav_pool_create is an exported function. In the event of a call
before knav_queue_probe, we encounter a NULL pointer dereference
in the following line. Hence return -EPROBE_DEFER to the caller till
the kdev pointer is non-NULL.

Signed-off-by: Keerthy <j-keerthy@ti.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 drivers/soc/ti/knav_qmss_queue.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/soc/ti/knav_qmss_queue.c b/drivers/soc/ti/knav_qmss_queue.c
index 279e7c5551dd..39225de9d7f1 100644
--- a/drivers/soc/ti/knav_qmss_queue.c
+++ b/drivers/soc/ti/knav_qmss_queue.c
@@ -745,6 +745,9 @@ void *knav_pool_create(const char *name,
 	bool slot_found;
 	int ret;
 
+	if (!kdev)
+		return ERR_PTR(-EPROBE_DEFER);
+
 	if (!kdev->dev)
 		return ERR_PTR(-ENODEV);
 

From 03619844d81d7459874e88034f0f36d959f0e2df Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 25 Jul 2017 18:44:04 +0200
Subject: [PATCH 126/149] rtc: ds1307: fix regmap config

Current max_register setting breaks reading nvram on certain chips and
also reading the standard registers on RX8130 where register map starts
at 0x10.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Fixes: 11e5890b5342 "rtc: ds1307: convert driver to regmap"
Signed-off-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
---
 drivers/rtc/rtc-ds1307.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c
index 4fac49e55d47..4b43aa62fbc7 100644
--- a/drivers/rtc/rtc-ds1307.c
+++ b/drivers/rtc/rtc-ds1307.c
@@ -1301,7 +1301,6 @@ static void ds1307_clks_register(struct ds1307 *ds1307)
 static const struct regmap_config regmap_config = {
 	.reg_bits = 8,
 	.val_bits = 8,
-	.max_register = 0x12,
 };
 
 static int ds1307_probe(struct i2c_client *client,

From d6c8103b0265d8db30e20e948a4f06382bbdaea7 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Mon, 21 Aug 2017 10:06:51 +0200
Subject: [PATCH 127/149] x86/CPU: Align CR3 defines

Align them vertically for better readability and use BIT_ULL() macro.

No functionality change.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Link: http://lkml.kernel.org/r/20170821080651.4527-1-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/processor-flags.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 8a6d89fc9a79..dc723b64acf0 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -34,17 +34,17 @@
  */
 #ifdef CONFIG_X86_64
 /* Mask off the address space ID and SME encryption bits. */
-#define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull)
-#define CR3_PCID_MASK 0xFFFull
-#define CR3_NOFLUSH (1UL << 63)
+#define CR3_ADDR_MASK	__sme_clr(0x7FFFFFFFFFFFF000ull)
+#define CR3_PCID_MASK	0xFFFull
+#define CR3_NOFLUSH	BIT_ULL(63)
 #else
 /*
  * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
  * a tiny bit of code size by setting all the bits.
  */
-#define CR3_ADDR_MASK 0xFFFFFFFFull
-#define CR3_PCID_MASK 0ull
-#define CR3_NOFLUSH 0
+#define CR3_ADDR_MASK	0xFFFFFFFFull
+#define CR3_PCID_MASK	0ull
+#define CR3_NOFLUSH	0
 #endif
 
 #endif /* _ASM_X86_PROCESSOR_FLAGS_H */

From dd1c1f2f2028a7b851f701fc6a8ebe39dcb95e7c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 21 Aug 2017 17:35:02 +0200
Subject: [PATCH 128/149] pids: make task_tgid_nr_ns() safe

This was reported many times, and this was even mentioned in commit
52ee2dfdd4f5 ("pids: refactor vnr/nr_ns helpers to make them safe") but
somehow nobody bothered to fix the obvious problem: task_tgid_nr_ns() is
not safe because task->group_leader points to nowhere after the exiting
task passes exit_notify(), rcu_read_lock() can not help.

We really need to change __unhash_process() to nullify group_leader,
parent, and real_parent, but this needs some cleanups.  Until then we
can turn task_tgid_nr_ns() into another user of __task_pid_nr_ns() and
fix the problem.

Reported-by: Troy Kensinger <tkensinger@google.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pid.h   |  4 +++-
 include/linux/sched.h | 51 +++++++++++++++++++++++--------------------
 kernel/pid.c          | 11 ++++------
 3 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 4d179316e431..719582744a2e 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -8,7 +8,9 @@ enum pid_type
 	PIDTYPE_PID,
 	PIDTYPE_PGID,
 	PIDTYPE_SID,
-	PIDTYPE_MAX
+	PIDTYPE_MAX,
+	/* only valid to __task_pid_nr_ns() */
+	__PIDTYPE_TGID
 };
 
 /*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8337e2db0bb2..c05ac5f5aa03 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1163,13 +1163,6 @@ static inline pid_t task_tgid_nr(struct task_struct *tsk)
 	return tsk->tgid;
 }
 
-extern pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
-
-static inline pid_t task_tgid_vnr(struct task_struct *tsk)
-{
-	return pid_vnr(task_tgid(tsk));
-}
-
 /**
  * pid_alive - check that a task structure is not stale
  * @p: Task structure to be checked.
@@ -1185,23 +1178,6 @@ static inline int pid_alive(const struct task_struct *p)
 	return p->pids[PIDTYPE_PID].pid != NULL;
 }
 
-static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
-{
-	pid_t pid = 0;
-
-	rcu_read_lock();
-	if (pid_alive(tsk))
-		pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns);
-	rcu_read_unlock();
-
-	return pid;
-}
-
-static inline pid_t task_ppid_nr(const struct task_struct *tsk)
-{
-	return task_ppid_nr_ns(tsk, &init_pid_ns);
-}
-
 static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
 {
 	return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
@@ -1223,6 +1199,33 @@ static inline pid_t task_session_vnr(struct task_struct *tsk)
 	return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
 }
 
+static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+{
+	return __task_pid_nr_ns(tsk, __PIDTYPE_TGID, ns);
+}
+
+static inline pid_t task_tgid_vnr(struct task_struct *tsk)
+{
+	return __task_pid_nr_ns(tsk, __PIDTYPE_TGID, NULL);
+}
+
+static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
+{
+	pid_t pid = 0;
+
+	rcu_read_lock();
+	if (pid_alive(tsk))
+		pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns);
+	rcu_read_unlock();
+
+	return pid;
+}
+
+static inline pid_t task_ppid_nr(const struct task_struct *tsk)
+{
+	return task_ppid_nr_ns(tsk, &init_pid_ns);
+}
+
 /* Obsolete, do not use: */
 static inline pid_t task_pgrp_nr(struct task_struct *tsk)
 {
diff --git a/kernel/pid.c b/kernel/pid.c
index c69c30d827e5..020dedbdf066 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -527,8 +527,11 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
 	if (!ns)
 		ns = task_active_pid_ns(current);
 	if (likely(pid_alive(task))) {
-		if (type != PIDTYPE_PID)
+		if (type != PIDTYPE_PID) {
+			if (type == __PIDTYPE_TGID)
+				type = PIDTYPE_PID;
 			task = task->group_leader;
+		}
 		nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
 	}
 	rcu_read_unlock();
@@ -537,12 +540,6 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
 }
 EXPORT_SYMBOL(__task_pid_nr_ns);
 
-pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
-	return pid_nr_ns(task_tgid(tsk), ns);
-}
-EXPORT_SYMBOL(task_tgid_nr_ns);
-
 struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
 {
 	return ns_of_pid(task_pid(tsk));

From 2dc77533f1e495788d73ffa4bee4323b2646d2bb Mon Sep 17 00:00:00 2001
From: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Date: Sun, 13 Aug 2017 23:14:58 +0200
Subject: [PATCH 129/149] sparc: kernel/pcic: silence gcc 7.x warning in
 pcibios_fixup_bus()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When building the kernel for Sparc using gcc 7.x, the build fails
with:

arch/sparc/kernel/pcic.c: In function ‘pcibios_fixup_bus’:
arch/sparc/kernel/pcic.c:647:8: error: ‘cmd’ may be used uninitialized in this function [-Werror=maybe-uninitialized]
    cmd |= PCI_COMMAND_IO;
        ^~

The simplified code looks like this:

unsigned int cmd;
[...]
pcic_read_config(dev->bus, dev->devfn, PCI_COMMAND, 2, &cmd);
[...]
cmd |= PCI_COMMAND_IO;

I.e, the code assumes that pcic_read_config() will always initialize
cmd. But it's not the case. Looking at pcic_read_config(), if
bus->number is != 0 or if the size is not one of 1, 2 or 4, *val will
not be initialized.

As a simple fix, we initialize cmd to zero at the beginning of
pcibios_fixup_bus.

Signed-off-by: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc/kernel/pcic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sparc/kernel/pcic.c b/arch/sparc/kernel/pcic.c
index a38787b84322..732af9a9f6dd 100644
--- a/arch/sparc/kernel/pcic.c
+++ b/arch/sparc/kernel/pcic.c
@@ -602,7 +602,7 @@ void pcibios_fixup_bus(struct pci_bus *bus)
 {
 	struct pci_dev *dev;
 	int i, has_io, has_mem;
-	unsigned int cmd;
+	unsigned int cmd = 0;
 	struct linux_pcic *pcic;
 	/* struct linux_pbm_info* pbm = &pcic->pbm; */
 	int node;

From 0f0fc5c090055983163ed8e7b3a73980e128048c Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Tue, 15 Aug 2017 09:30:05 +0100
Subject: [PATCH 130/149] Revert "mfd: da9061: Fix to remove BBAT_CONT register
 from chip model"

This patch was applied to the MFD twice, causing unwanted behavour.

This reverts commit b77eb79acca3203883e8d8dbc7f2b842def1bff8.

Fixes: b77eb79acca3 ("mfd: da9061: Fix to remove BBAT_CONT register from chip model")
Reported-by: Steve Twiss <stwiss.opensource@diasemi.com>
Reviewed-by: Steve Twiss <stwiss.opensource@diasemi.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/da9062-core.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/mfd/da9062-core.c b/drivers/mfd/da9062-core.c
index fbe0f245ce8e..fe1811523e4a 100644
--- a/drivers/mfd/da9062-core.c
+++ b/drivers/mfd/da9062-core.c
@@ -644,6 +644,9 @@ static const struct regmap_range da9062_aa_readable_ranges[] = {
 	}, {
 		.range_min = DA9062AA_VLDO1_B,
 		.range_max = DA9062AA_VLDO4_B,
+	}, {
+		.range_min = DA9062AA_BBAT_CONT,
+		.range_max = DA9062AA_BBAT_CONT,
 	}, {
 		.range_min = DA9062AA_INTERFACE,
 		.range_max = DA9062AA_CONFIG_E,
@@ -720,6 +723,9 @@ static const struct regmap_range da9062_aa_writeable_ranges[] = {
 	}, {
 		.range_min = DA9062AA_VLDO1_B,
 		.range_max = DA9062AA_VLDO4_B,
+	}, {
+		.range_min = DA9062AA_BBAT_CONT,
+		.range_max = DA9062AA_BBAT_CONT,
 	}, {
 		.range_min = DA9062AA_GP_ID_0,
 		.range_max = DA9062AA_GP_ID_19,

From 096622104e14d8a1db4860bd557717067a0515d2 Mon Sep 17 00:00:00 2001
From: Dave Martin <Dave.Martin@arm.com>
Date: Fri, 18 Aug 2017 16:57:01 +0100
Subject: [PATCH 131/149] arm64: fpsimd: Prevent registers leaking across exec

There are some tricky dependencies between the different stages of
flushing the FPSIMD register state during exec, and these can race
with context switch in ways that can cause the old task's regs to
leak across.  In particular, a context switch during the memset() can
cause some of the task's old FPSIMD registers to reappear.

Disabling preemption for this small window would be no big deal for
performance: preemption is already disabled for similar scenarios
like updating the FPSIMD registers in sigreturn.

So, instead of rearranging things in ways that might swap existing
subtle bugs for new ones, this patch just disables preemption
around the FPSIMD state flushing so that races of this type can't
occur here.  This brings fpsimd_flush_thread() into line with other
code paths.

Cc: stable@vger.kernel.org
Fixes: 674c242c9323 ("arm64: flush FP/SIMD state correctly after execve()")
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/kernel/fpsimd.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 06da8ea16bbe..c7b4995868e1 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -161,9 +161,11 @@ void fpsimd_flush_thread(void)
 {
 	if (!system_supports_fpsimd())
 		return;
+	preempt_disable();
 	memset(&current->thread.fpsimd_state, 0, sizeof(struct fpsimd_state));
 	fpsimd_flush_task_state(current);
 	set_thread_flag(TIF_FOREIGN_FPSTATE);
+	preempt_enable();
 }
 
 /*

From 289d07a2dc6c6b6f3e4b8a62669320d99dbe6c3d Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Tue, 11 Jul 2017 15:19:22 +0100
Subject: [PATCH 132/149] arm64: mm: abort uaccess retries upon fatal signal

When there's a fatal signal pending, arm64's do_page_fault()
implementation returns 0. The intent is that we'll return to the
faulting userspace instruction, delivering the signal on the way.

However, if we take a fatal signal during fixing up a uaccess, this
results in a return to the faulting kernel instruction, which will be
instantly retried, resulting in the same fault being taken forever. As
the task never reaches userspace, the signal is not delivered, and the
task is left unkillable. While the task is stuck in this state, it can
inhibit the forward progress of the system.

To avoid this, we must ensure that when a fatal signal is pending, we
apply any necessary fixup for a faulting kernel instruction. Thus we
will return to an error path, and it is up to that code to make forward
progress towards delivering the fatal signal.

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Laura Abbott <labbott@redhat.com>
Cc: stable@vger.kernel.org
Reviewed-by: Steve Capper <steve.capper@arm.com>
Tested-by: Steve Capper <steve.capper@arm.com>
Reviewed-by: James Morse <james.morse@arm.com>
Tested-by: James Morse <james.morse@arm.com>
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/mm/fault.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 2509e4fe6992..1f22a41565a3 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -435,8 +435,11 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 		 * the mmap_sem because it would already be released
 		 * in __lock_page_or_retry in mm/filemap.c.
 		 */
-		if (fatal_signal_pending(current))
+		if (fatal_signal_pending(current)) {
+			if (!user_mode(regs))
+				goto no_context;
 			return 0;
+		}
 
 		/*
 		 * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of

From 4a23e56ad6549d0b8c0fac6d9eb752884379c391 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Fri, 18 Aug 2017 18:42:30 +0100
Subject: [PATCH 133/149] arm64: kaslr: ignore modulo offset when validating
 virtual displacement

In the KASLR setup routine, we ensure that the early virtual mapping
of the kernel image does not cover more than a single table entry at
the level above the swapper block level, so that the assembler routines
involved in setting up this mapping can remain simple.

In this calculation we add the proposed KASLR offset to the values of
the _text and _end markers, and reject it if they would end up falling
in different swapper table sized windows.

However, when taking the addresses of _text and _end, the modulo offset
(the physical displacement modulo 2 MB) is already accounted for, and
so adding it again results in incorrect results. So disregard the modulo
offset from the calculation.

Fixes: 08cdac619c81 ("arm64: relocatable: deal with physically misaligned ...")
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Tested-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/kernel/head.S  |  1 -
 arch/arm64/kernel/kaslr.c | 12 +++++++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 973df7de7bf8..adb0910b88f5 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -354,7 +354,6 @@ __primary_switched:
 	tst	x23, ~(MIN_KIMG_ALIGN - 1)	// already running randomized?
 	b.ne	0f
 	mov	x0, x21				// pass FDT address in x0
-	mov	x1, x23				// pass modulo offset in x1
 	bl	kaslr_early_init		// parse FDT for KASLR options
 	cbz	x0, 0f				// KASLR disabled? just proceed
 	orr	x23, x23, x0			// record KASLR offset
diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c
index a9710efb8c01..1d95c204186b 100644
--- a/arch/arm64/kernel/kaslr.c
+++ b/arch/arm64/kernel/kaslr.c
@@ -75,7 +75,7 @@ extern void *__init __fixmap_remap_fdt(phys_addr_t dt_phys, int *size,
  * containing function pointers) to be reinitialized, and zero-initialized
  * .bss variables will be reset to 0.
  */
-u64 __init kaslr_early_init(u64 dt_phys, u64 modulo_offset)
+u64 __init kaslr_early_init(u64 dt_phys)
 {
 	void *fdt;
 	u64 seed, offset, mask, module_range;
@@ -133,9 +133,15 @@ u64 __init kaslr_early_init(u64 dt_phys, u64 modulo_offset)
 	 * boundary (for 4KB/16KB/64KB granule kernels, respectively). If this
 	 * happens, increase the KASLR offset by the size of the kernel image
 	 * rounded up by SWAPPER_BLOCK_SIZE.
+	 *
+	 * NOTE: The references to _text and _end below will already take the
+	 *       modulo offset (the physical displacement modulo 2 MB) into
+	 *       account, given that the physical placement is controlled by
+	 *       the loader, and will not change as a result of the virtual
+	 *       mapping we choose.
 	 */
-	if ((((u64)_text + offset + modulo_offset) >> SWAPPER_TABLE_SHIFT) !=
-	    (((u64)_end + offset + modulo_offset) >> SWAPPER_TABLE_SHIFT)) {
+	if ((((u64)_text + offset) >> SWAPPER_TABLE_SHIFT) !=
+	    (((u64)_end + offset) >> SWAPPER_TABLE_SHIFT)) {
 		u64 kimg_sz = _end - _text;
 		offset = (offset + round_up(kimg_sz, SWAPPER_BLOCK_SIZE))
 				& mask;

From a067d94d37ed590fd17684d18c3edf52110d305a Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Tue, 22 Aug 2017 15:39:00 +0100
Subject: [PATCH 134/149] arm64: kaslr: Adjust the offset to avoid Image across
 alignment boundary

With 16KB pages and a kernel Image larger than 16MB, the current
kaslr_early_init() logic for avoiding mappings across swapper table
boundaries fails since increasing the offset by kimg_sz just moves the
problem to the next boundary.

This patch rounds the offset down to (1 << SWAPPER_TABLE_SHIFT) if the
Image crosses a PMD_SIZE boundary.

Fixes: afd0e5a87670 ("arm64: kaslr: Fix up the kernel image alignment")
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Neeraj Upadhyay <neeraju@codeaurora.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/kernel/kaslr.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c
index 1d95c204186b..47080c49cc7e 100644
--- a/arch/arm64/kernel/kaslr.c
+++ b/arch/arm64/kernel/kaslr.c
@@ -131,8 +131,7 @@ u64 __init kaslr_early_init(u64 dt_phys)
 	/*
 	 * The kernel Image should not extend across a 1GB/32MB/512MB alignment
 	 * boundary (for 4KB/16KB/64KB granule kernels, respectively). If this
-	 * happens, increase the KASLR offset by the size of the kernel image
-	 * rounded up by SWAPPER_BLOCK_SIZE.
+	 * happens, round down the KASLR offset by (1 << SWAPPER_TABLE_SHIFT).
 	 *
 	 * NOTE: The references to _text and _end below will already take the
 	 *       modulo offset (the physical displacement modulo 2 MB) into
@@ -141,11 +140,8 @@ u64 __init kaslr_early_init(u64 dt_phys)
 	 *       mapping we choose.
 	 */
 	if ((((u64)_text + offset) >> SWAPPER_TABLE_SHIFT) !=
-	    (((u64)_end + offset) >> SWAPPER_TABLE_SHIFT)) {
-		u64 kimg_sz = _end - _text;
-		offset = (offset + round_up(kimg_sz, SWAPPER_BLOCK_SIZE))
-				& mask;
-	}
+	    (((u64)_end + offset) >> SWAPPER_TABLE_SHIFT))
+		offset = round_down(offset, 1 << SWAPPER_TABLE_SHIFT);
 
 	if (IS_ENABLED(CONFIG_KASAN))
 		/*

From 65159c051c45f269cf40a14f9404248f2d524920 Mon Sep 17 00:00:00 2001
From: Bharat Potnuri <bharat@chelsio.com>
Date: Tue, 1 Aug 2017 10:58:35 +0530
Subject: [PATCH 135/149] RDMA/uverbs: Initialize cq_context appropriately

Initializing cq_context with ev_queue in create_cq(), leads to NULL pointer
dereference in ib_uverbs_comp_handler(), if application doesnot use completion
channel. This patch fixes the cq_context initialization.

Fixes: 1e7710f3f65 ("IB/core: Change completion channel to use the reworked")
Cc: stable@vger.kernel.org # 4.12
Signed-off-by: Potnuri Bharat Teja <bharat@chelsio.com>
Reviewed-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
(cherry picked from commit 699a2d5b1b880b4e4e1c7d55fa25659322cf5b51)
---
 drivers/infiniband/core/uverbs_cmd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index c551d2b275fd..55822ae71955 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -1015,7 +1015,7 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
 	cq->uobject       = &obj->uobject;
 	cq->comp_handler  = ib_uverbs_comp_handler;
 	cq->event_handler = ib_uverbs_cq_event_handler;
-	cq->cq_context    = &ev_file->ev_queue;
+	cq->cq_context    = ev_file ? &ev_file->ev_queue : NULL;
 	atomic_set(&cq->usecnt, 0);
 
 	obj->uobject.object = cq;

From 9b40eebcd339c47921ff8b04c77af7c762b74216 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 22 Aug 2017 22:44:13 +0200
Subject: [PATCH 136/149] ACPICA: Fix acpi_evaluate_object_typed()

Commit 2d2a954375a0 (ACPICA: Update two error messages to emit
control method name) causes acpi_evaluate_object_typed() to fail
if its pathname argument is NULL, but some callers of that function
in the kernel, particularly acpi_nondev_subnode_data_ok(), pass
NULL as pathname to it and expect it to work.

For this reason, make acpi_evaluate_object_typed() check if its
pathname argument is NULL and fall back to using the pathname of
its handle argument if that is the case.

Reported-by: Sakari Ailus <sakari.ailus@intel.com>
Tested-by: Yang, Hyungwoo <hyungwoo.yang@intel.com>
Fixes: 2d2a954375a0 (ACPICA: Update two error messages to emit control method name)
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpica/nsxfeval.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/acpi/acpica/nsxfeval.c b/drivers/acpi/acpica/nsxfeval.c
index 538c61677c10..783f4c838aee 100644
--- a/drivers/acpi/acpica/nsxfeval.c
+++ b/drivers/acpi/acpica/nsxfeval.c
@@ -100,9 +100,13 @@ acpi_evaluate_object_typed(acpi_handle handle,
 		free_buffer_on_error = TRUE;
 	}
 
-	status = acpi_get_handle(handle, pathname, &target_handle);
-	if (ACPI_FAILURE(status)) {
-		return_ACPI_STATUS(status);
+	if (pathname) {
+		status = acpi_get_handle(handle, pathname, &target_handle);
+		if (ACPI_FAILURE(status)) {
+			return_ACPI_STATUS(status);
+		}
+	} else {
+		target_handle = handle;
 	}
 
 	full_pathname = acpi_ns_get_external_pathname(target_handle);

From b5212f57da145e53df790a7e211d94daac768bf8 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Tue, 22 Aug 2017 23:39:58 +0300
Subject: [PATCH 137/149] ACPI: device property: Fix node lookup in
 acpi_graph_get_child_prop_value()

acpi_graph_get_child_prop_value() is intended to find a child node with a
certain property value pair. The check

	if (!fwnode_property_read_u32(fwnode, prop_name, &nr))
		continue;

is faulty: fwnode_property_read_u32() returns zero on success, not on
failure, leading to comparing values only if the searched property was not
found.

Moreover, the check is made against the parent device node instead of
the child one as it should be.

Fixes: 79389a83bc38 (ACPI / property: Add support for remote endpoints)
Reported-by: Hyungwoo Yang <hyungwoo.yang@intel.com>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Cc: 4.12+ <stable@vger.kernel.org> # 4.12+
[ rjw: Changelog ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/property.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c
index 917c789f953d..476a52c60cf3 100644
--- a/drivers/acpi/property.c
+++ b/drivers/acpi/property.c
@@ -1047,7 +1047,7 @@ static struct fwnode_handle *acpi_graph_get_child_prop_value(
 	fwnode_for_each_child_node(fwnode, child) {
 		u32 nr;
 
-		if (!fwnode_property_read_u32(fwnode, prop_name, &nr))
+		if (fwnode_property_read_u32(child, prop_name, &nr))
 			continue;
 
 		if (val == nr)

From dbeb0c8e84805b78f3979b53a4a12751403e4520 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 23 Aug 2017 16:46:15 +0200
Subject: [PATCH 138/149] ARM: at91: don't select CONFIG_ARM_CPU_SUSPEND for
 old platforms

My previous patch fixed a link error for all at91 platforms when
CONFIG_ARM_CPU_SUSPEND was not set, however this caused another
problem on a configuration that enabled CONFIG_ARCH_AT91 but none
of the individual SoCs, and that also enabled CPU_ARM720 as
the only CPU:

warning: (ARCH_AT91 && SOC_IMX23 && SOC_IMX28 && ARCH_PXA && MACH_MVEBU_V7 && SOC_IMX6 && ARCH_OMAP3 && ARCH_OMAP4 && SOC_OMAP5 && SOC_AM33XX && SOC_DRA7XX && ARCH_EXYNOS3 && ARCH_EXYNOS4 && EXYNOS5420_MCPM && EXYNOS_CPU_SUSPEND && ARCH_VEXPRESS_TC2_PM && ARM_BIG_LITTLE_CPUIDLE && ARM_HIGHBANK_CPUIDLE && QCOM_PM) selects ARM_CPU_SUSPEND which has unmet direct dependencies (ARCH_SUSPEND_POSSIBLE)
arch/arm/kernel/sleep.o: In function `cpu_resume':
(.text+0xf0): undefined reference to `cpu_arm720_suspend_size'
arch/arm/kernel/suspend.o: In function `__cpu_suspend_save':
suspend.c:(.text+0x134): undefined reference to `cpu_arm720_do_suspend'

This improves the hack some more by only selecting ARM_CPU_SUSPEND
for the part that requires it, and changing pm.c to drop the
contents of unused init functions so we no longer refer to
cpu_resume on at91 platforms that don't need it.

Fixes: cc7a938f5f30 ("ARM: at91: select CONFIG_ARM_CPU_SUSPEND")
Acked-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm/mach-at91/Kconfig |  2 +-
 arch/arm/mach-at91/pm.c    | 12 ++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/arch/arm/mach-at91/Kconfig b/arch/arm/mach-at91/Kconfig
index d735e5fc4772..195da38cb9a2 100644
--- a/arch/arm/mach-at91/Kconfig
+++ b/arch/arm/mach-at91/Kconfig
@@ -1,7 +1,7 @@
 menuconfig ARCH_AT91
 	bool "Atmel SoCs"
 	depends on ARCH_MULTI_V4T || ARCH_MULTI_V5 || ARCH_MULTI_V7 || ARM_SINGLE_ARMV7M
-	select ARM_CPU_SUSPEND if PM
+	select ARM_CPU_SUSPEND if PM && ARCH_MULTI_V7
 	select COMMON_CLK_AT91
 	select GPIOLIB
 	select PINCTRL
diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c
index 667fddac3856..5036f996e694 100644
--- a/arch/arm/mach-at91/pm.c
+++ b/arch/arm/mach-at91/pm.c
@@ -608,6 +608,9 @@ static void __init at91_pm_init(void (*pm_idle)(void))
 
 void __init at91rm9200_pm_init(void)
 {
+	if (!IS_ENABLED(CONFIG_SOC_AT91RM9200))
+		return;
+
 	at91_dt_ramc();
 
 	/*
@@ -620,18 +623,27 @@ void __init at91rm9200_pm_init(void)
 
 void __init at91sam9_pm_init(void)
 {
+	if (!IS_ENABLED(CONFIG_SOC_AT91SAM9))
+		return;
+
 	at91_dt_ramc();
 	at91_pm_init(at91sam9_idle);
 }
 
 void __init sama5_pm_init(void)
 {
+	if (!IS_ENABLED(CONFIG_SOC_SAMA5))
+		return;
+
 	at91_dt_ramc();
 	at91_pm_init(NULL);
 }
 
 void __init sama5d2_pm_init(void)
 {
+	if (!IS_ENABLED(CONFIG_SOC_SAMA5D2))
+		return;
+
 	at91_pm_backup_init();
 	sama5_pm_init();
 }

From 93a4c8355e0e448d83f31801b4c72f66e4360975 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Wed, 23 Aug 2017 18:24:50 +0200
Subject: [PATCH 139/149] ARM: dts: exynos: add needs-hpd for Odroid-XU3/4

CEC support was added for Exynos5 in 4.13, but for the Odroids we need to set
'needs-hpd' as well since CEC is disabled when there is no HDMI hotplug signal,
just as for the exynos4 Odroid-U3.

This is due to the level-shifter that is disabled when there is no HPD, thus
blocking the CEC signal as well. Same close-but-no-cigar board design as the
Odroid-U3.

Tested with my Odroid XU4.

Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi b/arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi
index f92f95741207..a183b56283f8 100644
--- a/arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi
+++ b/arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi
@@ -266,6 +266,7 @@ &hdmi {
 
 &hdmicec {
 	status = "okay";
+	needs-hpd;
 };
 
 &hsi2c_4 {

From 143c97cc652949893c8056c679012f0aeccb80e5 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 23 Aug 2017 18:16:11 -0700
Subject: [PATCH 140/149] Revert "pty: fix the cached path of the pty slave
 file descriptor in the master"

This reverts commit c8c03f1858331e85d397bacccd34ef409aae993c.

It turns out that while fixing the ptmx file descriptor to have the
correct 'struct path' to the associated slave pty is a really good
thing, it breaks some user space tools for a very annoying reason.

The problem is that /dev/ptmx and its associated slave pty (/dev/pts/X)
are on different mounts.  That was what caused us to have the wrong path
in the first place (we would mix up the vfsmount of the 'ptmx' node,
with the dentry of the pty slave node), but it also means that now while
we use the right vfsmount, having the pty master open also keeps the pts
mount busy.

And it turn sout that that makes 'pbuilder' very unhappy, as noted by
Stefan Lippers-Hollmann:

 "This patch introduces a regression for me when using pbuilder
  0.228.7[2] (a helper to build Debian packages in a chroot and to
  create and update its chroots) when trying to umount /dev/ptmx (inside
  the chroot) on Debian/ unstable (full log and pbuilder configuration
  file[3] attached).

  [...]
  Setting up build-essential (12.3) ...
  Processing triggers for libc-bin (2.24-15) ...
  I: unmounting dev/ptmx filesystem
  W: Could not unmount dev/ptmx: umount: /var/cache/pbuilder/build/1340/dev/ptmx: target is busy
          (In some cases useful info about processes that
           use the device is found by lsof(8) or fuser(1).)"

apparently pbuilder tries to unmount the /dev/pts filesystem while still
holding at least one master node open, which is arguably not very nice,
but we don't break user space even when fixing other bugs.

So this commit has to be reverted.

I'll try to figure out a way to avoid caching the path to the slave pty
in the master pty.  The only thing that actually wants that slave pty
path is the "TIOCGPTPEER" ioctl, and I think we could just recreate the
path at that time.

Reported-by: Stefan Lippers-Hollmann <s.l-h@gmx.de>
Cc: Eric W Biederman <ebiederm@xmission.com>
Cc: Christian Brauner <christian.brauner@canonical.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/tty/pty.c         | 7 ++-----
 fs/devpts/inode.c         | 4 +---
 include/linux/devpts_fs.h | 2 +-
 3 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index 1fc80ea87c13..284749fb0f6b 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -793,7 +793,6 @@ static int ptmx_open(struct inode *inode, struct file *filp)
 	struct tty_struct *tty;
 	struct path *pts_path;
 	struct dentry *dentry;
-	struct vfsmount *mnt;
 	int retval;
 	int index;
 
@@ -806,7 +805,7 @@ static int ptmx_open(struct inode *inode, struct file *filp)
 	if (retval)
 		return retval;
 
-	fsi = devpts_acquire(filp, &mnt);
+	fsi = devpts_acquire(filp);
 	if (IS_ERR(fsi)) {
 		retval = PTR_ERR(fsi);
 		goto out_free_file;
@@ -850,7 +849,7 @@ static int ptmx_open(struct inode *inode, struct file *filp)
 	pts_path = kmalloc(sizeof(struct path), GFP_KERNEL);
 	if (!pts_path)
 		goto err_release;
-	pts_path->mnt = mnt;
+	pts_path->mnt = filp->f_path.mnt;
 	pts_path->dentry = dentry;
 	path_get(pts_path);
 	tty->link->driver_data = pts_path;
@@ -867,7 +866,6 @@ static int ptmx_open(struct inode *inode, struct file *filp)
 	path_put(pts_path);
 	kfree(pts_path);
 err_release:
-	mntput(mnt);
 	tty_unlock(tty);
 	// This will also put-ref the fsi
 	tty_release(inode, filp);
@@ -876,7 +874,6 @@ static int ptmx_open(struct inode *inode, struct file *filp)
 	devpts_kill_index(fsi, index);
 out_put_fsi:
 	devpts_release(fsi);
-	mntput(mnt);
 out_free_file:
 	tty_free_file(filp);
 	return retval;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 44dfbca9306f..108df2e3602c 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -133,7 +133,7 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb)
 	return sb->s_fs_info;
 }
 
-struct pts_fs_info *devpts_acquire(struct file *filp, struct vfsmount **ptsmnt)
+struct pts_fs_info *devpts_acquire(struct file *filp)
 {
 	struct pts_fs_info *result;
 	struct path path;
@@ -142,7 +142,6 @@ struct pts_fs_info *devpts_acquire(struct file *filp, struct vfsmount **ptsmnt)
 
 	path = filp->f_path;
 	path_get(&path);
-	*ptsmnt = NULL;
 
 	/* Has the devpts filesystem already been found? */
 	sb = path.mnt->mnt_sb;
@@ -166,7 +165,6 @@ struct pts_fs_info *devpts_acquire(struct file *filp, struct vfsmount **ptsmnt)
 	 * pty code needs to hold extra references in case of last /dev/tty close
 	 */
 	atomic_inc(&sb->s_active);
-	*ptsmnt = mntget(path.mnt);
 	result = DEVPTS_SB(sb);
 
 out:
diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h
index 7883e901f65c..277ab9af9ac2 100644
--- a/include/linux/devpts_fs.h
+++ b/include/linux/devpts_fs.h
@@ -19,7 +19,7 @@
 
 struct pts_fs_info;
 
-struct pts_fs_info *devpts_acquire(struct file *, struct vfsmount **ptsmnt);
+struct pts_fs_info *devpts_acquire(struct file *);
 void devpts_release(struct pts_fs_info *);
 
 int devpts_new_index(struct pts_fs_info *);

From a8f0f9e49956a74718874b800251455680085600 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 17 Aug 2017 16:37:25 -0400
Subject: [PATCH 141/149] ftrace: Check for null ret_stack on profile function
 graph entry function

There's a small race when function graph shutsdown and the calling of the
registered function graph entry callback. The callback must not reference
the task's ret_stack without first checking that it is not NULL. Note, when
a ret_stack is allocated for a task, it stays allocated until the task exits.
The problem here, is that function_graph is shutdown, and a new task was
created, which doesn't have its ret_stack allocated. But since some of the
functions are still being traced, the callbacks can still be called.

The normal function_graph code handles this, but starting with commit
8861dd303c ("ftrace: Access ret_stack->subtime only in the function
profiler") the profiler code references the ret_stack on function entry, but
doesn't check if it is NULL first.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=196611

Cc: stable@vger.kernel.org
Fixes: 8861dd303c ("ftrace: Access ret_stack->subtime only in the function profiler")
Reported-by: lilydjwg@gmail.com
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 02004ae91860..96cea88fa00f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -889,6 +889,10 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace)
 
 	function_profile_call(trace->func, 0, NULL, NULL);
 
+	/* If function graph is shutting down, ret_stack can be NULL */
+	if (!current->ret_stack)
+		return 0;
+
 	if (index >= 0 && index < FTRACE_RETFUNC_DEPTH)
 		current->ret_stack[index].subtime = 0;
 

From 475bb3c69ab05df2a6ecef6acc2393703d134180 Mon Sep 17 00:00:00 2001
From: Chunyu Hu <chuhu@redhat.com>
Date: Mon, 14 Aug 2017 18:18:17 +0800
Subject: [PATCH 142/149] tracing: Fix kmemleak in tracing_map_array_free()

kmemleak reported the below leak when I was doing clear of the hist
trigger. With this patch, the kmeamleak is gone.

unreferenced object 0xffff94322b63d760 (size 32):
  comm "bash", pid 1522, jiffies 4403687962 (age 2442.311s)
  hex dump (first 32 bytes):
    00 01 00 00 04 00 00 00 08 00 00 00 ff 00 00 00  ................
    10 00 00 00 00 00 00 00 80 a8 7a f2 31 94 ff ff  ..........z.1...
  backtrace:
    [<ffffffff9e96c27a>] kmemleak_alloc+0x4a/0xa0
    [<ffffffff9e424cba>] kmem_cache_alloc_trace+0xca/0x1d0
    [<ffffffff9e377736>] tracing_map_array_alloc+0x26/0x140
    [<ffffffff9e261be0>] kretprobe_trampoline+0x0/0x50
    [<ffffffff9e38b935>] create_hist_data+0x535/0x750
    [<ffffffff9e38bd47>] event_hist_trigger_func+0x1f7/0x420
    [<ffffffff9e38893d>] event_trigger_write+0xfd/0x1a0
    [<ffffffff9e44dfc7>] __vfs_write+0x37/0x170
    [<ffffffff9e44f552>] vfs_write+0xb2/0x1b0
    [<ffffffff9e450b85>] SyS_write+0x55/0xc0
    [<ffffffff9e203857>] do_syscall_64+0x67/0x150
    [<ffffffff9e977ce7>] return_from_SYSCALL_64+0x0/0x6a
    [<ffffffffffffffff>] 0xffffffffffffffff
unreferenced object 0xffff9431f27aa880 (size 128):
  comm "bash", pid 1522, jiffies 4403687962 (age 2442.311s)
  hex dump (first 32 bytes):
    00 00 8c 2a 32 94 ff ff 00 f0 8b 2a 32 94 ff ff  ...*2......*2...
    00 e0 8b 2a 32 94 ff ff 00 d0 8b 2a 32 94 ff ff  ...*2......*2...
  backtrace:
    [<ffffffff9e96c27a>] kmemleak_alloc+0x4a/0xa0
    [<ffffffff9e425348>] __kmalloc+0xe8/0x220
    [<ffffffff9e3777c1>] tracing_map_array_alloc+0xb1/0x140
    [<ffffffff9e261be0>] kretprobe_trampoline+0x0/0x50
    [<ffffffff9e38b935>] create_hist_data+0x535/0x750
    [<ffffffff9e38bd47>] event_hist_trigger_func+0x1f7/0x420
    [<ffffffff9e38893d>] event_trigger_write+0xfd/0x1a0
    [<ffffffff9e44dfc7>] __vfs_write+0x37/0x170
    [<ffffffff9e44f552>] vfs_write+0xb2/0x1b0
    [<ffffffff9e450b85>] SyS_write+0x55/0xc0
    [<ffffffff9e203857>] do_syscall_64+0x67/0x150
    [<ffffffff9e977ce7>] return_from_SYSCALL_64+0x0/0x6a
    [<ffffffffffffffff>] 0xffffffffffffffff

Link: http://lkml.kernel.org/r/1502705898-27571-1-git-send-email-chuhu@redhat.com

Cc: stable@vger.kernel.org
Fixes: 08d43a5fa063 ("tracing: Add lock-free tracing_map")
Signed-off-by: Chunyu Hu <chuhu@redhat.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/tracing_map.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 0a689bbb78ef..305039b122fa 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -221,16 +221,19 @@ void tracing_map_array_free(struct tracing_map_array *a)
 	if (!a)
 		return;
 
-	if (!a->pages) {
-		kfree(a);
-		return;
-	}
+	if (!a->pages)
+		goto free;
 
 	for (i = 0; i < a->n_pages; i++) {
 		if (!a->pages[i])
 			break;
 		free_page((unsigned long)a->pages[i]);
 	}
+
+	kfree(a->pages);
+
+ free:
+	kfree(a);
 }
 
 struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts,

From 8b0db1a5bdfcee0dbfa89607672598ae203c9045 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Wed, 23 Aug 2017 12:46:27 -0400
Subject: [PATCH 143/149] tracing: Fix freeing of filter in create_filter()
 when set_str is false

Performing the following task with kmemleak enabled:

 # cd /sys/kernel/tracing/events/irq/irq_handler_entry/
 # echo 'enable_event:kmem:kmalloc:3 if irq >' > trigger
 # echo 'enable_event:kmem:kmalloc:3 if irq > 31' > trigger
 # echo scan > /sys/kernel/debug/kmemleak
 # cat /sys/kernel/debug/kmemleak
unreferenced object 0xffff8800b9290308 (size 32):
  comm "bash", pid 1114, jiffies 4294848451 (age 141.139s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  backtrace:
    [<ffffffff81cef5aa>] kmemleak_alloc+0x4a/0xa0
    [<ffffffff81357938>] kmem_cache_alloc_trace+0x158/0x290
    [<ffffffff81261c09>] create_filter_start.constprop.28+0x99/0x940
    [<ffffffff812639c9>] create_filter+0xa9/0x160
    [<ffffffff81263bdc>] create_event_filter+0xc/0x10
    [<ffffffff812655e5>] set_trigger_filter+0xe5/0x210
    [<ffffffff812660c4>] event_enable_trigger_func+0x324/0x490
    [<ffffffff812652e2>] event_trigger_write+0x1a2/0x260
    [<ffffffff8138cf87>] __vfs_write+0xd7/0x380
    [<ffffffff8138f421>] vfs_write+0x101/0x260
    [<ffffffff8139187b>] SyS_write+0xab/0x130
    [<ffffffff81cfd501>] entry_SYSCALL_64_fastpath+0x1f/0xbe
    [<ffffffffffffffff>] 0xffffffffffffffff

The function create_filter() is passed a 'filterp' pointer that gets
allocated, and if "set_str" is true, it is up to the caller to free it, even
on error. The problem is that the pointer is not freed by create_filter()
when set_str is false. This is a bug, and it is not up to the caller to free
the filter on error if it doesn't care about the string.

Link: http://lkml.kernel.org/r/1502705898-27571-2-git-send-email-chuhu@redhat.com

Cc: stable@vger.kernel.org
Fixes: 38b78eb85 ("tracing: Factorize filter creation")
Reported-by: Chunyu Hu <chuhu@redhat.com>
Tested-by: Chunyu Hu <chuhu@redhat.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_filter.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 59a411ff60c7..181e139a8057 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1959,6 +1959,10 @@ static int create_filter(struct trace_event_call *call,
 		if (err && set_str)
 			append_filter_err(ps, filter);
 	}
+	if (err && !set_str) {
+		free_event_filter(filter);
+		filter = NULL;
+	}
 	create_filter_finish(ps);
 
 	*filterp = filter;

From 64236e315955cc59e2b612e6a0e59579395530ae Mon Sep 17 00:00:00 2001
From: Cao jin <caoj.fnst@cn.fujitsu.com>
Date: Tue, 22 Aug 2017 21:09:53 +0800
Subject: [PATCH 144/149] kbuild: update comments of Makefile.asm-generic

Signed-off-by: Cao jin <caoj.fnst@cn.fujitsu.com>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 scripts/Makefile.asm-generic | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/Makefile.asm-generic b/scripts/Makefile.asm-generic
index 95f7d8090152..a6c8c1780855 100644
--- a/scripts/Makefile.asm-generic
+++ b/scripts/Makefile.asm-generic
@@ -1,9 +1,9 @@
 # include/asm-generic contains a lot of files that are used
 # verbatim by several architectures.
 #
-# This Makefile reads the file arch/$(SRCARCH)/include/asm/Kbuild
+# This Makefile reads the file arch/$(SRCARCH)/include/$(src)/Kbuild
 # and for each file listed in this file with generic-y creates
-# a small wrapper file in $(obj) (arch/$(SRCARCH)/include/generated/asm)
+# a small wrapper file in $(obj) (arch/$(SRCARCH)/include/generated/$(src))
 
 kbuild-file := $(srctree)/arch/$(SRCARCH)/include/$(src)/Kbuild
 -include $(kbuild-file)

From 58efbc9f5463308c43d812fd0748a4dee44c8a16 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Tue, 22 Aug 2017 23:45:59 -0700
Subject: [PATCH 145/149] Btrfs: fix blk_status_t/errno confusion

This fixes several instances of blk_status_t and bare errno ints being
mixed up, some of which are real bugs.

In the normal case, 0 matches BLK_STS_OK, so we don't observe any
effects of the missing conversion, but in case of errors or passes
through the repair/retry paths, the errors get mixed up.

The changes were identified using 'sparse', we don't have reports of the
buggy behaviour.

Fixes: 4e4cbee93d56 ("block: switch bios to blk_status_t")
Signed-off-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c |  4 +--
 fs/btrfs/inode.c   | 70 ++++++++++++++++++++++++----------------------
 fs/btrfs/raid56.c  | 34 +++++++++++-----------
 fs/btrfs/volumes.c | 10 +++----
 fs/btrfs/volumes.h |  6 ++--
 5 files changed, 64 insertions(+), 60 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 080e2ebb8aa0..f45b61fe9a9a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3516,7 +3516,7 @@ static blk_status_t wait_dev_flush(struct btrfs_device *device)
 	struct bio *bio = device->flush_bio;
 
 	if (!device->flush_bio_sent)
-		return 0;
+		return BLK_STS_OK;
 
 	device->flush_bio_sent = 0;
 	wait_for_completion_io(&device->flush_wait);
@@ -3563,7 +3563,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 			continue;
 
 		write_dev_flush(dev);
-		dev->last_flush_error = 0;
+		dev->last_flush_error = BLK_STS_OK;
 	}
 
 	/* wait for all the barriers */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 95c212037095..24bcd5cd9cf2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7924,11 +7924,12 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 	return ret;
 }
 
-static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
-					int mirror_num)
+static inline blk_status_t submit_dio_repair_bio(struct inode *inode,
+						 struct bio *bio,
+						 int mirror_num)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-	int ret;
+	blk_status_t ret;
 
 	BUG_ON(bio_op(bio) == REQ_OP_WRITE);
 
@@ -7980,10 +7981,10 @@ static int btrfs_check_dio_repairable(struct inode *inode,
 	return 1;
 }
 
-static int dio_read_error(struct inode *inode, struct bio *failed_bio,
-			struct page *page, unsigned int pgoff,
-			u64 start, u64 end, int failed_mirror,
-			bio_end_io_t *repair_endio, void *repair_arg)
+static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio,
+				   struct page *page, unsigned int pgoff,
+				   u64 start, u64 end, int failed_mirror,
+				   bio_end_io_t *repair_endio, void *repair_arg)
 {
 	struct io_failure_record *failrec;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -7993,18 +7994,19 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
 	int read_mode = 0;
 	int segs;
 	int ret;
+	blk_status_t status;
 
 	BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
 
 	ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
 	if (ret)
-		return ret;
+		return errno_to_blk_status(ret);
 
 	ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
 					 failed_mirror);
 	if (!ret) {
 		free_io_failure(failure_tree, io_tree, failrec);
-		return -EIO;
+		return BLK_STS_IOERR;
 	}
 
 	segs = bio_segments(failed_bio);
@@ -8022,13 +8024,13 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
 		    "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n",
 		    read_mode, failrec->this_mirror, failrec->in_validation);
 
-	ret = submit_dio_repair_bio(inode, bio, failrec->this_mirror);
-	if (ret) {
+	status = submit_dio_repair_bio(inode, bio, failrec->this_mirror);
+	if (status) {
 		free_io_failure(failure_tree, io_tree, failrec);
 		bio_put(bio);
 	}
 
-	return ret;
+	return status;
 }
 
 struct btrfs_retry_complete {
@@ -8065,8 +8067,8 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
 	bio_put(bio);
 }
 
-static int __btrfs_correct_data_nocsum(struct inode *inode,
-				       struct btrfs_io_bio *io_bio)
+static blk_status_t __btrfs_correct_data_nocsum(struct inode *inode,
+						struct btrfs_io_bio *io_bio)
 {
 	struct btrfs_fs_info *fs_info;
 	struct bio_vec bvec;
@@ -8076,8 +8078,8 @@ static int __btrfs_correct_data_nocsum(struct inode *inode,
 	unsigned int pgoff;
 	u32 sectorsize;
 	int nr_sectors;
-	int ret;
-	int err = 0;
+	blk_status_t ret;
+	blk_status_t err = BLK_STS_OK;
 
 	fs_info = BTRFS_I(inode)->root->fs_info;
 	sectorsize = fs_info->sectorsize;
@@ -8183,11 +8185,12 @@ static blk_status_t __btrfs_subio_endio_read(struct inode *inode,
 	int csum_pos;
 	bool uptodate = (err == 0);
 	int ret;
+	blk_status_t status;
 
 	fs_info = BTRFS_I(inode)->root->fs_info;
 	sectorsize = fs_info->sectorsize;
 
-	err = 0;
+	err = BLK_STS_OK;
 	start = io_bio->logical;
 	done.inode = inode;
 	io_bio->bio.bi_iter = io_bio->iter;
@@ -8209,12 +8212,12 @@ static blk_status_t __btrfs_subio_endio_read(struct inode *inode,
 		done.start = start;
 		init_completion(&done.done);
 
-		ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
-				pgoff, start, start + sectorsize - 1,
-				io_bio->mirror_num,
-				btrfs_retry_endio, &done);
-		if (ret) {
-			err = errno_to_blk_status(ret);
+		status = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
+					pgoff, start, start + sectorsize - 1,
+					io_bio->mirror_num, btrfs_retry_endio,
+					&done);
+		if (status) {
+			err = status;
 			goto next;
 		}
 
@@ -8250,7 +8253,7 @@ static blk_status_t btrfs_subio_endio_read(struct inode *inode,
 		if (unlikely(err))
 			return __btrfs_correct_data_nocsum(inode, io_bio);
 		else
-			return 0;
+			return BLK_STS_OK;
 	} else {
 		return __btrfs_subio_endio_read(inode, io_bio, err);
 	}
@@ -8423,9 +8426,9 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
 	return 0;
 }
 
-static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
-					 u64 file_offset, int skip_sum,
-					 int async_submit)
+static inline blk_status_t
+__btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset,
+		       int skip_sum, int async_submit)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_dio_private *dip = bio->bi_private;
@@ -8488,6 +8491,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip,
 	int clone_offset = 0;
 	int clone_len;
 	int ret;
+	blk_status_t status;
 
 	map_length = orig_bio->bi_iter.bi_size;
 	submit_len = map_length;
@@ -8537,9 +8541,9 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip,
 		 */
 		atomic_inc(&dip->pending_bios);
 
-		ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum,
-					     async_submit);
-		if (ret) {
+		status = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum,
+						async_submit);
+		if (status) {
 			bio_put(bio);
 			atomic_dec(&dip->pending_bios);
 			goto out_err;
@@ -8557,9 +8561,9 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip,
 	} while (submit_len > 0);
 
 submit:
-	ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum,
-				     async_submit);
-	if (!ret)
+	status = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum,
+					async_submit);
+	if (!status)
 		return 0;
 
 	bio_put(bio);
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 208638384cd2..2cf6ba40f7c4 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -905,7 +905,7 @@ static void raid_write_end_io(struct bio *bio)
 	if (!atomic_dec_and_test(&rbio->stripes_pending))
 		return;
 
-	err = 0;
+	err = BLK_STS_OK;
 
 	/* OK, we have read all the stripes we need to. */
 	max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
@@ -1324,7 +1324,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 	return;
 
 cleanup:
-	rbio_orig_end_io(rbio, -EIO);
+	rbio_orig_end_io(rbio, BLK_STS_IOERR);
 }
 
 /*
@@ -1475,7 +1475,7 @@ static void raid_rmw_end_io(struct bio *bio)
 
 cleanup:
 
-	rbio_orig_end_io(rbio, -EIO);
+	rbio_orig_end_io(rbio, BLK_STS_IOERR);
 }
 
 static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
@@ -1579,7 +1579,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
 	return 0;
 
 cleanup:
-	rbio_orig_end_io(rbio, -EIO);
+	rbio_orig_end_io(rbio, BLK_STS_IOERR);
 	return -EIO;
 
 finish:
@@ -1795,12 +1795,12 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
 	void **pointers;
 	int faila = -1, failb = -1;
 	struct page *page;
-	int err;
+	blk_status_t err;
 	int i;
 
 	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
 	if (!pointers) {
-		err = -ENOMEM;
+		err = BLK_STS_RESOURCE;
 		goto cleanup_io;
 	}
 
@@ -1856,7 +1856,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
 					 * a bad data or Q stripe.
 					 * TODO, we should redo the xor here.
 					 */
-					err = -EIO;
+					err = BLK_STS_IOERR;
 					goto cleanup;
 				}
 				/*
@@ -1882,7 +1882,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
 			if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
 				if (rbio->bbio->raid_map[faila] ==
 				    RAID5_P_STRIPE) {
-					err = -EIO;
+					err = BLK_STS_IOERR;
 					goto cleanup;
 				}
 				/*
@@ -1954,13 +1954,13 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
 		}
 	}
 
-	err = 0;
+	err = BLK_STS_OK;
 cleanup:
 	kfree(pointers);
 
 cleanup_io:
 	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
-		if (err == 0)
+		if (err == BLK_STS_OK)
 			cache_rbio_pages(rbio);
 		else
 			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@@ -1968,7 +1968,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
 		rbio_orig_end_io(rbio, err);
 	} else if (rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
 		rbio_orig_end_io(rbio, err);
-	} else if (err == 0) {
+	} else if (err == BLK_STS_OK) {
 		rbio->faila = -1;
 		rbio->failb = -1;
 
@@ -2005,7 +2005,7 @@ static void raid_recover_end_io(struct bio *bio)
 		return;
 
 	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
-		rbio_orig_end_io(rbio, -EIO);
+		rbio_orig_end_io(rbio, BLK_STS_IOERR);
 	else
 		__raid_recover_end_io(rbio);
 }
@@ -2104,7 +2104,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
 cleanup:
 	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
 	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
-		rbio_orig_end_io(rbio, -EIO);
+		rbio_orig_end_io(rbio, BLK_STS_IOERR);
 	return -EIO;
 }
 
@@ -2431,7 +2431,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
 	nr_data = bio_list_size(&bio_list);
 	if (!nr_data) {
 		/* Every parity is right */
-		rbio_orig_end_io(rbio, 0);
+		rbio_orig_end_io(rbio, BLK_STS_OK);
 		return;
 	}
 
@@ -2451,7 +2451,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
 	return;
 
 cleanup:
-	rbio_orig_end_io(rbio, -EIO);
+	rbio_orig_end_io(rbio, BLK_STS_IOERR);
 }
 
 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
@@ -2519,7 +2519,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
 	return;
 
 cleanup:
-	rbio_orig_end_io(rbio, -EIO);
+	rbio_orig_end_io(rbio, BLK_STS_IOERR);
 }
 
 /*
@@ -2633,7 +2633,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
 	return;
 
 cleanup:
-	rbio_orig_end_io(rbio, -EIO);
+	rbio_orig_end_io(rbio, BLK_STS_IOERR);
 	return;
 
 finish:
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e8b9a269fdde..bd679bc7a1a9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6212,8 +6212,8 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
 	}
 }
 
-int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
-		  int mirror_num, int async_submit)
+blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
+			   int mirror_num, int async_submit)
 {
 	struct btrfs_device *dev;
 	struct bio *first_bio = bio;
@@ -6233,7 +6233,7 @@ int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 				&map_length, &bbio, mirror_num, 1);
 	if (ret) {
 		btrfs_bio_counter_dec(fs_info);
-		return ret;
+		return errno_to_blk_status(ret);
 	}
 
 	total_devs = bbio->num_stripes;
@@ -6256,7 +6256,7 @@ int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 		}
 
 		btrfs_bio_counter_dec(fs_info);
-		return ret;
+		return errno_to_blk_status(ret);
 	}
 
 	if (map_length < length) {
@@ -6283,7 +6283,7 @@ int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 				  dev_nr, async_submit);
 	}
 	btrfs_bio_counter_dec(fs_info);
-	return 0;
+	return BLK_STS_OK;
 }
 
 struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6f45fd60d15a..93277fc60930 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -74,7 +74,7 @@ struct btrfs_device {
 	int missing;
 	int can_discard;
 	int is_tgtdev_for_dev_replace;
-	int last_flush_error;
+	blk_status_t last_flush_error;
 	int flush_bio_sent;
 
 #ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED
@@ -416,8 +416,8 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		      struct btrfs_fs_info *fs_info, u64 type);
 void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
-int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
-		  int mirror_num, int async_submit);
+blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
+			   int mirror_num, int async_submit);
 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		       fmode_t flags, void *holder);
 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,

From 498ca3c82a7b11e152a46c253f6b2087c929ce00 Mon Sep 17 00:00:00 2001
From: Noa Osherovich <noaos@mellanox.com>
Date: Wed, 23 Aug 2017 08:35:40 +0300
Subject: [PATCH 146/149] IB/core: Avoid accessing non-allocated memory when
 inferring port type

Commit 44c58487d51a ("IB/core: Define 'ib' and 'roce' rdma_ah_attr types")
introduced the concept of type in ah_attr:
 * During ib_register_device, each port is checked for its type which
   is stored in ib_device's port_immutable array.
 * During uverbs' modify_qp, the type is inferred using the port number
   in ib_uverbs_qp_dest struct (address vector) by accessing the
   relevant port_immutable array and the type is passed on to
   providers.

IB spec (version 1.3) enforces a valid port value only in Reset to
Init. During Init to RTR, the address vector must be valid but port
number is not mentioned as a field in the address vector, so its
value is not validated, which leads to accesses to a non-allocated
memory when inferring the port type.

Save the real port number in ib_qp during modify to Init (when the
comp_mask indicates that the port number is valid) and use this value
to infer the port type.

Avoid copying the address vector fields if the matching bit is not set
in the attr_mask. Address vector can't be modified before the port, so
no valid flow is affected.

Fixes: 44c58487d51a ('IB/core: Define 'ib' and 'roce' rdma_ah_attr types')
Signed-off-by: Noa Osherovich <noaos@mellanox.com>
Reviewed-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/core/uverbs_cmd.c | 11 +++++++----
 drivers/infiniband/core/verbs.c      |  7 ++++++-
 include/rdma/ib_verbs.h              |  1 +
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 55822ae71955..739bd69ef1d4 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -1522,6 +1522,7 @@ static int create_qp(struct ib_uverbs_file *file,
 		qp->qp_type	  = attr.qp_type;
 		atomic_set(&qp->usecnt, 0);
 		atomic_inc(&pd->usecnt);
+		qp->port = 0;
 		if (attr.send_cq)
 			atomic_inc(&attr.send_cq->usecnt);
 		if (attr.recv_cq)
@@ -1962,8 +1963,9 @@ static int modify_qp(struct ib_uverbs_file *file,
 	attr->alt_timeout	  = cmd->base.alt_timeout;
 	attr->rate_limit	  = cmd->rate_limit;
 
-	attr->ah_attr.type = rdma_ah_find_type(qp->device,
-					       cmd->base.dest.port_num);
+	if (cmd->base.attr_mask & IB_QP_AV)
+		attr->ah_attr.type = rdma_ah_find_type(qp->device,
+						       cmd->base.dest.port_num);
 	if (cmd->base.dest.is_global) {
 		rdma_ah_set_grh(&attr->ah_attr, NULL,
 				cmd->base.dest.flow_label,
@@ -1981,8 +1983,9 @@ static int modify_qp(struct ib_uverbs_file *file,
 	rdma_ah_set_port_num(&attr->ah_attr,
 			     cmd->base.dest.port_num);
 
-	attr->alt_ah_attr.type = rdma_ah_find_type(qp->device,
-						   cmd->base.dest.port_num);
+	if (cmd->base.attr_mask & IB_QP_ALT_PATH)
+		attr->alt_ah_attr.type =
+			rdma_ah_find_type(qp->device, cmd->base.dest.port_num);
 	if (cmd->base.alt_dest.is_global) {
 		rdma_ah_set_grh(&attr->alt_ah_attr, NULL,
 				cmd->base.alt_dest.flow_label,
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 7f8fe443df46..b456e3ca1876 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -838,6 +838,7 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
 	spin_lock_init(&qp->mr_lock);
 	INIT_LIST_HEAD(&qp->rdma_mrs);
 	INIT_LIST_HEAD(&qp->sig_mrs);
+	qp->port = 0;
 
 	if (qp_init_attr->qp_type == IB_QPT_XRC_TGT)
 		return ib_create_xrc_qp(qp, qp_init_attr);
@@ -1297,7 +1298,11 @@ int ib_modify_qp_with_udata(struct ib_qp *qp, struct ib_qp_attr *attr,
 		if (ret)
 			return ret;
 	}
-	return ib_security_modify_qp(qp, attr, attr_mask, udata);
+	ret = ib_security_modify_qp(qp, attr, attr_mask, udata);
+	if (!ret && (attr_mask & IB_QP_PORT))
+		qp->port = attr->port_num;
+
+	return ret;
 }
 EXPORT_SYMBOL(ib_modify_qp_with_udata);
 
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index b5732432bb29..88c32aba32f7 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1683,6 +1683,7 @@ struct ib_qp {
 	enum ib_qp_type		qp_type;
 	struct ib_rwq_ind_table *rwq_ind_tbl;
 	struct ib_qp_security  *qp_sec;
+	u8			port;
 };
 
 struct ib_mr {

From 1d31e9c09f41f9ffa53eaf8457b3e77a1d527f1e Mon Sep 17 00:00:00 2001
From: Majd Dibbiny <majd@mellanox.com>
Date: Wed, 23 Aug 2017 08:35:41 +0300
Subject: [PATCH 147/149] IB/mlx5: Fix Raw Packet QP event handler assignment

In case we have SQ and RQ for Raw Packet QP, the SQ's event handler
wasn't assigned.

Fixing this by assigning event handler for each WQ after creation.

[ 1877.145243] Call Trace:
[ 1877.148644] <IRQ>
[ 1877.150580] [<ffffffffa07987c5>] ? mlx5_rsc_event+0x105/0x210 [mlx5_core]
[ 1877.159581] [<ffffffffa0795bd7>] ? mlx5_cq_event+0x57/0xd0 [mlx5_core]
[ 1877.167137] [<ffffffffa079208e>] mlx5_eq_int+0x53e/0x6c0 [mlx5_core]
[ 1877.174526] [<ffffffff8101a679>] ? sched_clock+0x9/0x10
[ 1877.180753] [<ffffffff810f717e>] handle_irq_event_percpu+0x3e/0x1e0
[ 1877.188014] [<ffffffff810f735d>] handle_irq_event+0x3d/0x60
[ 1877.194567] [<ffffffff810f9fe7>] handle_edge_irq+0x77/0x130
[ 1877.201129] [<ffffffff81014c3f>] handle_irq+0xbf/0x150
[ 1877.207244] [<ffffffff815ed78a>] ? atomic_notifier_call_chain+0x1a/0x20
[ 1877.214829] [<ffffffff815f434f>] do_IRQ+0x4f/0xf0
[ 1877.220498] [<ffffffff815e94ad>] common_interrupt+0x6d/0x6d
[ 1877.227025] <EOI>
[ 1877.228967] [<ffffffff814834e2>] ? cpuidle_enter_state+0x52/0xc0
[ 1877.236990] [<ffffffff81483615>] cpuidle_idle_call+0xc5/0x200
[ 1877.243676] [<ffffffff8101bc7e>] arch_cpu_idle+0xe/0x30
[ 1877.249831] [<ffffffff810b4725>] cpu_startup_entry+0xf5/0x290
[ 1877.256513] [<ffffffff815cfee1>] start_secondary+0x265/0x27b
[ 1877.263111] Code: Bad RIP value.
[ 1877.267296] RIP [< (null)>] (null)
[ 1877.273264] RSP <ffff88046fd63df8>
[ 1877.277531] CR2: 0000000000000000

Fixes: 19098df2da78 ("IB/mlx5: Refactor mlx5_ib_qp to accommodate other QP types")
Signed-off-by: Majd Dibbiny <majd@mellanox.com>
Reviewed-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/mlx5/qp.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 0889ff367c86..f58f8f5f3ebe 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -1238,6 +1238,7 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
 			goto err_destroy_tis;
 
 		sq->base.container_mibqp = qp;
+		sq->base.mqp.event = mlx5_ib_qp_event;
 	}
 
 	if (qp->rq.wqe_cnt) {

From ec2558796d25e6024071b6bcb8e11392538d57bf Mon Sep 17 00:00:00 2001
From: Majd Dibbiny <majd@mellanox.com>
Date: Wed, 23 Aug 2017 08:35:42 +0300
Subject: [PATCH 148/149] IB/mlx5: Always return success for RoCE modify port

CM layer calls ib_modify_port() regardless of the link layer.

For the Ethernet ports, qkey violation and Port capabilities
are meaningless. Therefore, always return success for ib_modify_port
calls on the Ethernet ports.

Cc: Selvin Xavier <selvin.xavier@broadcom.com>
Signed-off-by: Majd Dibbiny <majd@mellanox.com>
Reviewed-by: Moni Shoua <monis@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 drivers/infiniband/hw/mlx5/main.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index a7f2e60085c4..f7fcde1ff0aa 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1085,6 +1085,12 @@ static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
 	bool is_ib = (mlx5_ib_port_link_layer(ibdev, port) ==
 		      IB_LINK_LAYER_INFINIBAND);
 
+	/* CM layer calls ib_modify_port() regardless of the link layer. For
+	 * Ethernet ports, qkey violation and Port capabilities are meaningless.
+	 */
+	if (!is_ib)
+		return 0;
+
 	if (MLX5_CAP_GEN(dev->mdev, ib_virt) && is_ib) {
 		change_mask = props->clr_port_cap_mask | props->set_port_cap_mask;
 		value = ~props->clr_port_cap_mask | props->set_port_cap_mask;

From 311fc65c9fb9c966bca8e6f3ff8132ce57344ab9 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 24 Aug 2017 15:13:29 -0500
Subject: [PATCH 149/149] pty: Repair TIOCGPTPEER

The implementation of TIOCGPTPEER has two issues.

When /dev/ptmx (as opposed to /dev/pts/ptmx) is opened the wrong
vfsmount is passed to dentry_open.  Which results in the kernel displaying
the wrong pathname for the peer.

The second is simply by caching the vfsmount and dentry of the peer it leaves
them open, in a way they were not previously Which because of the inreased
reference counts can cause unnecessary behaviour differences resulting in
regressions.

To fix these move the ioctl into tty_io.c at a generic level allowing
the ioctl to have access to the struct file on which the ioctl is
being called.  This allows the path of the slave to be derived when
opening the slave through TIOCGPTPEER instead of requiring the path to
the slave be cached.  Thus removing the need for caching the path.

A new function devpts_ptmx_path is factored out of devpts_acquire and
used to implement a function devpts_mntget.   The new function devpts_mntget
takes a filp to perform the lookup on and fsi so that it can confirm
that the superblock that is found by devpts_ptmx_path is the proper superblock.

v2: Lots of fixes to make the code actually work
v3: Suggestions by Linus
    - Removed the unnecessary initialization of filp in ptm_open_peer
    - Simplified devpts_ptmx_path as gotos are no longer required

[ This is the fix for the issue that was reverted in commit
  143c97cc6529, but this time without breaking 'pbuilder' due to
  increased reference counts   - Linus ]

Fixes: 54ebbfb16034 ("tty: add TIOCGPTPEER ioctl")
Reported-by: Christian Brauner <christian.brauner@canonical.com>
Reported-and-tested-by: Stefan Lippers-Hollmann <s.l-h@gmx.de>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/tty/pty.c         | 64 ++++++++++++++++----------------------
 drivers/tty/tty_io.c      |  3 ++
 fs/devpts/inode.c         | 65 +++++++++++++++++++++++++++++----------
 include/linux/devpts_fs.h | 10 ++++++
 4 files changed, 89 insertions(+), 53 deletions(-)

diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index 284749fb0f6b..a6d5164c33a9 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -69,13 +69,8 @@ static void pty_close(struct tty_struct *tty, struct file *filp)
 #ifdef CONFIG_UNIX98_PTYS
 		if (tty->driver == ptm_driver) {
 			mutex_lock(&devpts_mutex);
-			if (tty->link->driver_data) {
-				struct path *path = tty->link->driver_data;
-
-				devpts_pty_kill(path->dentry);
-				path_put(path);
-				kfree(path);
-			}
+			if (tty->link->driver_data)
+				devpts_pty_kill(tty->link->driver_data);
 			mutex_unlock(&devpts_mutex);
 		}
 #endif
@@ -607,25 +602,24 @@ static inline void legacy_pty_init(void) { }
 static struct cdev ptmx_cdev;
 
 /**
- *	pty_open_peer - open the peer of a pty
- *	@tty: the peer of the pty being opened
+ *	ptm_open_peer - open the peer of a pty
+ *	@master: the open struct file of the ptmx device node
+ *	@tty: the master of the pty being opened
+ *	@flags: the flags for open
  *
- *	Open the cached dentry in tty->link, providing a safe way for userspace
- *	to get the slave end of a pty (where they have the master fd and cannot
- *	access or trust the mount namespace /dev/pts was mounted inside).
+ *	Provide a race free way for userspace to open the slave end of a pty
+ *	(where they have the master fd and cannot access or trust the mount
+ *	namespace /dev/pts was mounted inside).
  */
-static struct file *pty_open_peer(struct tty_struct *tty, int flags)
-{
-	if (tty->driver->subtype != PTY_TYPE_MASTER)
-		return ERR_PTR(-EIO);
-	return dentry_open(tty->link->driver_data, flags, current_cred());
-}
-
-static int pty_get_peer(struct tty_struct *tty, int flags)
+int ptm_open_peer(struct file *master, struct tty_struct *tty, int flags)
 {
 	int fd = -1;
-	struct file *filp = NULL;
+	struct file *filp;
 	int retval = -EINVAL;
+	struct path path;
+
+	if (tty->driver != ptm_driver)
+		return -EIO;
 
 	fd = get_unused_fd_flags(0);
 	if (fd < 0) {
@@ -633,7 +627,16 @@ static int pty_get_peer(struct tty_struct *tty, int flags)
 		goto err;
 	}
 
-	filp = pty_open_peer(tty, flags);
+	/* Compute the slave's path */
+	path.mnt = devpts_mntget(master, tty->driver_data);
+	if (IS_ERR(path.mnt)) {
+		retval = PTR_ERR(path.mnt);
+		goto err_put;
+	}
+	path.dentry = tty->link->driver_data;
+
+	filp = dentry_open(&path, flags, current_cred());
+	mntput(path.mnt);
 	if (IS_ERR(filp)) {
 		retval = PTR_ERR(filp);
 		goto err_put;
@@ -662,8 +665,6 @@ static int pty_unix98_ioctl(struct tty_struct *tty,
 		return pty_get_pktmode(tty, (int __user *)arg);
 	case TIOCGPTN: /* Get PT Number */
 		return put_user(tty->index, (unsigned int __user *)arg);
-	case TIOCGPTPEER: /* Open the other end */
-		return pty_get_peer(tty, (int) arg);
 	case TIOCSIG:    /* Send signal to other side of pty */
 		return pty_signal(tty, (int) arg);
 	}
@@ -791,7 +792,6 @@ static int ptmx_open(struct inode *inode, struct file *filp)
 {
 	struct pts_fs_info *fsi;
 	struct tty_struct *tty;
-	struct path *pts_path;
 	struct dentry *dentry;
 	int retval;
 	int index;
@@ -845,26 +845,16 @@ static int ptmx_open(struct inode *inode, struct file *filp)
 		retval = PTR_ERR(dentry);
 		goto err_release;
 	}
-	/* We need to cache a fake path for TIOCGPTPEER. */
-	pts_path = kmalloc(sizeof(struct path), GFP_KERNEL);
-	if (!pts_path)
-		goto err_release;
-	pts_path->mnt = filp->f_path.mnt;
-	pts_path->dentry = dentry;
-	path_get(pts_path);
-	tty->link->driver_data = pts_path;
+	tty->link->driver_data = dentry;
 
 	retval = ptm_driver->ops->open(tty, filp);
 	if (retval)
-		goto err_path_put;
+		goto err_release;
 
 	tty_debug_hangup(tty, "opening (count=%d)\n", tty->count);
 
 	tty_unlock(tty);
 	return 0;
-err_path_put:
-	path_put(pts_path);
-	kfree(pts_path);
 err_release:
 	tty_unlock(tty);
 	// This will also put-ref the fsi
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 974b13d24401..10c4038c0e8d 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -2518,6 +2518,9 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case TIOCSSERIAL:
 		tty_warn_deprecated_flags(p);
 		break;
+	case TIOCGPTPEER:
+		/* Special because the struct file is needed */
+		return ptm_open_peer(file, tty, (int)arg);
 	default:
 		retval = tty_jobctrl_ioctl(tty, real_tty, file, cmd, arg);
 		if (retval != -ENOIOCTLCMD)
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 108df2e3602c..7eae33ffa3fc 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -133,6 +133,50 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb)
 	return sb->s_fs_info;
 }
 
+static int devpts_ptmx_path(struct path *path)
+{
+	struct super_block *sb;
+	int err;
+
+	/* Has the devpts filesystem already been found? */
+	if (path->mnt->mnt_sb->s_magic == DEVPTS_SUPER_MAGIC)
+		return 0;
+
+	/* Is a devpts filesystem at "pts" in the same directory? */
+	err = path_pts(path);
+	if (err)
+		return err;
+
+	/* Is the path the root of a devpts filesystem? */
+	sb = path->mnt->mnt_sb;
+	if ((sb->s_magic != DEVPTS_SUPER_MAGIC) ||
+	    (path->mnt->mnt_root != sb->s_root))
+		return -ENODEV;
+
+	return 0;
+}
+
+struct vfsmount *devpts_mntget(struct file *filp, struct pts_fs_info *fsi)
+{
+	struct path path;
+	int err;
+
+	path = filp->f_path;
+	path_get(&path);
+
+	err = devpts_ptmx_path(&path);
+	dput(path.dentry);
+	if (err) {
+		mntput(path.mnt);
+		path.mnt = ERR_PTR(err);
+	}
+	if (DEVPTS_SB(path.mnt->mnt_sb) != fsi) {
+		mntput(path.mnt);
+		path.mnt = ERR_PTR(-ENODEV);
+	}
+	return path.mnt;
+}
+
 struct pts_fs_info *devpts_acquire(struct file *filp)
 {
 	struct pts_fs_info *result;
@@ -143,27 +187,16 @@ struct pts_fs_info *devpts_acquire(struct file *filp)
 	path = filp->f_path;
 	path_get(&path);
 
-	/* Has the devpts filesystem already been found? */
-	sb = path.mnt->mnt_sb;
-	if (sb->s_magic != DEVPTS_SUPER_MAGIC) {
-		/* Is a devpts filesystem at "pts" in the same directory? */
-		err = path_pts(&path);
-		if (err) {
-			result = ERR_PTR(err);
-			goto out;
-		}
-
-		/* Is the path the root of a devpts filesystem? */
-		result = ERR_PTR(-ENODEV);
-		sb = path.mnt->mnt_sb;
-		if ((sb->s_magic != DEVPTS_SUPER_MAGIC) ||
-		    (path.mnt->mnt_root != sb->s_root))
-			goto out;
+	err = devpts_ptmx_path(&path);
+	if (err) {
+		result = ERR_PTR(err);
+		goto out;
 	}
 
 	/*
 	 * pty code needs to hold extra references in case of last /dev/tty close
 	 */
+	sb = path.mnt->mnt_sb;
 	atomic_inc(&sb->s_active);
 	result = DEVPTS_SB(sb);
 
diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h
index 277ab9af9ac2..100cb4343763 100644
--- a/include/linux/devpts_fs.h
+++ b/include/linux/devpts_fs.h
@@ -19,6 +19,7 @@
 
 struct pts_fs_info;
 
+struct vfsmount *devpts_mntget(struct file *, struct pts_fs_info *);
 struct pts_fs_info *devpts_acquire(struct file *);
 void devpts_release(struct pts_fs_info *);
 
@@ -32,6 +33,15 @@ void *devpts_get_priv(struct dentry *);
 /* unlink */
 void devpts_pty_kill(struct dentry *);
 
+/* in pty.c */
+int ptm_open_peer(struct file *master, struct tty_struct *tty, int flags);
+
+#else
+static inline int
+ptm_open_peer(struct file *master, struct tty_struct *tty, int flags)
+{
+	return -EIO;
+}
 #endif