2005-11-19 16:17:32 +07:00
|
|
|
#ifndef __ASM_POWERPC_MMU_CONTEXT_H
|
|
|
|
#define __ASM_POWERPC_MMU_CONTEXT_H
|
2005-12-17 04:43:46 +07:00
|
|
|
#ifdef __KERNEL__
|
2005-11-19 16:17:32 +07:00
|
|
|
|
2008-12-19 02:13:24 +07:00
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/sched.h>
|
|
|
|
#include <linux/spinlock.h>
|
2007-07-03 15:22:05 +07:00
|
|
|
#include <asm/mmu.h>
|
|
|
|
#include <asm/cputable.h>
|
2008-12-19 02:13:24 +07:00
|
|
|
#include <asm/cputhreads.h>
|
2007-07-03 15:22:05 +07:00
|
|
|
|
|
|
|
/*
|
2008-12-19 02:13:24 +07:00
|
|
|
* Most of the context management is out of line
|
2007-07-03 15:22:05 +07:00
|
|
|
*/
|
2005-04-17 05:20:36 +07:00
|
|
|
extern int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
|
|
|
|
extern void destroy_context(struct mm_struct *mm);
|
2015-06-05 13:35:24 +07:00
|
|
|
#ifdef CONFIG_SPAPR_TCE_IOMMU
|
|
|
|
struct mm_iommu_table_group_mem_t;
|
|
|
|
|
2016-09-06 13:27:31 +07:00
|
|
|
extern int isolate_lru_page(struct page *page); /* from internal.h */
|
2016-11-30 13:52:00 +07:00
|
|
|
extern bool mm_iommu_preregistered(struct mm_struct *mm);
|
|
|
|
extern long mm_iommu_get(struct mm_struct *mm,
|
|
|
|
unsigned long ua, unsigned long entries,
|
2015-06-05 13:35:24 +07:00
|
|
|
struct mm_iommu_table_group_mem_t **pmem);
|
2016-11-30 13:52:00 +07:00
|
|
|
extern long mm_iommu_put(struct mm_struct *mm,
|
|
|
|
struct mm_iommu_table_group_mem_t *mem);
|
2016-11-30 13:51:59 +07:00
|
|
|
extern void mm_iommu_init(struct mm_struct *mm);
|
|
|
|
extern void mm_iommu_cleanup(struct mm_struct *mm);
|
2016-11-30 13:52:00 +07:00
|
|
|
extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
|
|
|
|
unsigned long ua, unsigned long size);
|
2017-03-22 11:21:47 +07:00
|
|
|
extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(
|
|
|
|
struct mm_struct *mm, unsigned long ua, unsigned long size);
|
2016-11-30 13:52:00 +07:00
|
|
|
extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
|
|
|
|
unsigned long ua, unsigned long entries);
|
2015-06-05 13:35:24 +07:00
|
|
|
extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
|
|
|
|
unsigned long ua, unsigned long *hpa);
|
2017-03-22 11:21:47 +07:00
|
|
|
extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
|
|
|
|
unsigned long ua, unsigned long *hpa);
|
2015-06-05 13:35:24 +07:00
|
|
|
extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
|
|
|
|
extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
|
|
|
|
#endif
|
2005-04-17 05:20:36 +07:00
|
|
|
extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm);
|
2008-12-19 02:13:24 +07:00
|
|
|
extern void set_context(unsigned long id, pgd_t *pgd);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2009-07-24 06:15:26 +07:00
|
|
|
#ifdef CONFIG_PPC_BOOK3S_64
|
2016-04-29 20:26:02 +07:00
|
|
|
extern void radix__switch_mmu_context(struct mm_struct *prev,
|
powerpc/mm/radix: Workaround prefetch issue with KVM
There's a somewhat architectural issue with Radix MMU and KVM.
When coming out of a guest with AIL (Alternate Interrupt Location, ie,
MMU enabled), we start executing hypervisor code with the PID register
still containing whatever the guest has been using.
The problem is that the CPU can (and will) then start prefetching or
speculatively load from whatever host context has that same PID (if
any), thus bringing translations for that context into the TLB, which
Linux doesn't know about.
This can cause stale translations and subsequent crashes.
Fixing this in a way that is neither racy nor a huge performance
impact is difficult. We could just make the host invalidations always
use broadcast forms but that would hurt single threaded programs for
example.
We chose to fix it instead by partitioning the PID space between guest
and host. This is possible because today Linux only use 19 out of the
20 bits of PID space, so existing guests will work if we make the host
use the top half of the 20 bits space.
We additionally add support for a property to indicate to Linux the
size of the PID register which will be useful if we eventually have
processors with a larger PID space available.
There is still an issue with malicious guests purposefully setting the
PID register to a value in the hosts PID range. Hopefully future HW
can prevent that, but in the meantime, we handle it with a pair of
kludges:
- On the way out of a guest, before we clear the current VCPU in the
PACA, we check the PID and if it's outside of the permitted range
we flush the TLB for that PID.
- When context switching, if the mm is "new" on that CPU (the
corresponding bit was set for the first time in the mm cpumask), we
check if any sibling thread is in KVM (has a non-NULL VCPU pointer
in the PACA). If that is the case, we also flush the PID for that
CPU (core).
This second part is needed to handle the case where a process is
migrated (or starts a new pthread) on a sibling thread of the CPU
coming out of KVM, as there's a window where stale translations can
exist before we detect it and flush them out.
A future optimization could be added by keeping track of whether the
PID has ever been used and avoid doing that for completely fresh PIDs.
We could similarily mark PIDs that have been the subject of a global
invalidation as "fresh". But for now this will do.
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
[mpe: Rework the asm to build with CONFIG_PPC_RADIX_MMU=n, drop
unneeded include of kvm_book3s_asm.h]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-07-24 11:26:06 +07:00
|
|
|
struct mm_struct *next);
|
2016-04-29 20:26:01 +07:00
|
|
|
/*
 * Book3S 64-bit hardware context switch.
 *
 * Dispatches to radix__switch_mmu_context(prev, next) when radix_enabled()
 * is true, otherwise to switch_slb(tsk, next).
 */
static inline void switch_mmu_context(struct mm_struct *prev,
				      struct mm_struct *next,
				      struct task_struct *tsk)
{
	if (radix_enabled()) {
		/* The radix path does not use tsk. */
		radix__switch_mmu_context(prev, next);
		return;
	}

	switch_slb(tsk, next);
}
|
|
|
|
|
2017-03-29 18:00:46 +07:00
|
|
|
extern int hash__alloc_context_id(void);
|
2017-03-22 10:37:00 +07:00
|
|
|
extern void hash__reserve_context_id(int id);
|
2009-11-02 19:02:30 +07:00
|
|
|
extern void __destroy_context(int context_id);
|
2009-07-24 06:15:26 +07:00
|
|
|
/*
 * Book3S 64-bit: mmu_context_init() is a no-op stub; the #else branch
 * declares an out-of-line version for the other configurations.
 */
static inline void mmu_context_init(void)
{
}
|
|
|
|
#else
|
2016-04-29 20:26:01 +07:00
|
|
|
extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next,
|
|
|
|
struct task_struct *tsk);
|
2010-04-16 05:11:36 +07:00
|
|
|
extern unsigned long __init_new_context(void);
|
|
|
|
extern void __destroy_context(unsigned long context_id);
|
2009-07-24 06:15:26 +07:00
|
|
|
extern void mmu_context_init(void);
|
|
|
|
#endif
|
|
|
|
|
powerpc/mm/radix: Workaround prefetch issue with KVM
There's a somewhat architectural issue with Radix MMU and KVM.
When coming out of a guest with AIL (Alternate Interrupt Location, ie,
MMU enabled), we start executing hypervisor code with the PID register
still containing whatever the guest has been using.
The problem is that the CPU can (and will) then start prefetching or
speculatively load from whatever host context has that same PID (if
any), thus bringing translations for that context into the TLB, which
Linux doesn't know about.
This can cause stale translations and subsequent crashes.
Fixing this in a way that is neither racy nor a huge performance
impact is difficult. We could just make the host invalidations always
use broadcast forms but that would hurt single threaded programs for
example.
We chose to fix it instead by partitioning the PID space between guest
and host. This is possible because today Linux only use 19 out of the
20 bits of PID space, so existing guests will work if we make the host
use the top half of the 20 bits space.
We additionally add support for a property to indicate to Linux the
size of the PID register which will be useful if we eventually have
processors with a larger PID space available.
There is still an issue with malicious guests purposefully setting the
PID register to a value in the hosts PID range. Hopefully future HW
can prevent that, but in the meantime, we handle it with a pair of
kludges:
- On the way out of a guest, before we clear the current VCPU in the
PACA, we check the PID and if it's outside of the permitted range
we flush the TLB for that PID.
- When context switching, if the mm is "new" on that CPU (the
corresponding bit was set for the first time in the mm cpumask), we
check if any sibling thread is in KVM (has a non-NULL VCPU pointer
in the PACA). If that is the case, we also flush the PID for that
CPU (core).
This second part is needed to handle the case where a process is
migrated (or starts a new pthread) on a sibling thread of the CPU
coming out of KVM, as there's a window where stale translations can
exist before we detect it and flush them out.
A future optimization could be added by keeping track of whether the
PID has ever been used and avoid doing that for completely fresh PIDs.
We could similarily mark PIDs that have been the subject of a global
invalidation as "fresh". But for now this will do.
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
[mpe: Rework the asm to build with CONFIG_PPC_RADIX_MMU=n, drop
unneeded include of kvm_book3s_asm.h]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-07-24 11:26:06 +07:00
|
|
|
/*
 * radix_kvm_prefetch_workaround() is invoked by switch_mm_irqs_off() when
 * an mm is used on a CPU for the first time.  The real implementation is
 * out of line and only exists when both CONFIG_KVM_BOOK3S_HV_POSSIBLE and
 * CONFIG_PPC_RADIX_MMU are set; otherwise it compiles to nothing.
 */
#if defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) && defined(CONFIG_PPC_RADIX_MMU)
extern void radix_kvm_prefetch_workaround(struct mm_struct *mm);
#else
static inline void radix_kvm_prefetch_workaround(struct mm_struct *mm) { }
#endif
|
|
|
|
|
2011-05-03 03:43:04 +07:00
|
|
|
extern void switch_cop(struct mm_struct *next);
|
|
|
|
extern int use_cop(unsigned long acop, struct mm_struct *mm);
|
|
|
|
extern void drop_cop(unsigned long acop, struct mm_struct *mm);
|
|
|
|
|
2017-07-24 11:27:58 +07:00
|
|
|
/*
 * switch_mm_pgdir() records the new mm's PGD wherever the sub-architecture
 * tracks it: in the thread struct on 32-bit, in the PACA on 64-bit Book3E,
 * and nowhere (empty stub) on everything else.
 */
#if defined(CONFIG_PPC32)
static inline void switch_mm_pgdir(struct task_struct *tsk,
				   struct mm_struct *mm)
{
	/* 32-bit keeps track of the current PGDIR in the thread struct */
	tsk->thread.pgdir = mm->pgd;
}
#elif defined(CONFIG_PPC_BOOK3E_64)
static inline void switch_mm_pgdir(struct task_struct *tsk,
				   struct mm_struct *mm)
{
	/* 64-bit Book3E keeps track of current PGD in the PACA */
	get_paca()->pgd = mm->pgd;
}
#else
static inline void switch_mm_pgdir(struct task_struct *tsk,
				   struct mm_struct *mm) { }
#endif
|
|
|
|
|
2017-07-24 11:28:02 +07:00
|
|
|
/*
 * inc_mm_active_cpus() atomically increments mm->context.active_cpus on
 * Book3S 64-bit; on other configurations it is an empty stub.
 */
#ifdef CONFIG_PPC_BOOK3S_64
static inline void inc_mm_active_cpus(struct mm_struct *mm)
{
	atomic_inc(&mm->context.active_cpus);
}
#else
static inline void inc_mm_active_cpus(struct mm_struct *mm) { }
#endif
|
2017-07-24 11:27:58 +07:00
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
/*
|
|
|
|
* switch_mm is the entry point called from the architecture independent
|
2013-06-04 14:40:24 +07:00
|
|
|
* code in kernel/sched/core.c
|
2005-04-17 05:20:36 +07:00
|
|
|
*/
|
powerpc/mm: Ensure IRQs are off in switch_mm()
powerpc expects IRQs to already be (soft) disabled when switch_mm() is
called, as made clear in the commit message of 9c1e105238c4 ("powerpc: Allow
perf_counters to access user memory at interrupt time").
Aside from any race conditions that might exist between switch_mm() and an IRQ,
there is also an unconditional hard_irq_disable() in switch_slb(). If that isn't
followed at some point by an IRQ enable then interrupts will remain disabled
until we return to userspace.
It is true that when switch_mm() is called from the scheduler IRQs are off, but
not when it's called by use_mm(). Looking closer we see that last year in commit
f98db6013c55 ("sched/core: Add switch_mm_irqs_off() and use it in the scheduler")
this was made more explicit by the addition of switch_mm_irqs_off() which is now
called by the scheduler, vs switch_mm() which is used by use_mm().
Arguably it is a bug in use_mm() to call switch_mm() in a different context than
it expects, but fixing that will take time.
This was discovered recently when vhost started throwing warnings such as:
BUG: sleeping function called from invalid context at kernel/mutex.c:578
in_atomic(): 0, irqs_disabled(): 1, pid: 10768, name: vhost-10760
no locks held by vhost-10760/10768.
irq event stamp: 10
hardirqs last enabled at (9): _raw_spin_unlock_irq+0x40/0x80
hardirqs last disabled at (10): switch_slb+0x2e4/0x490
softirqs last enabled at (0): copy_process+0x5e8/0x1260
softirqs last disabled at (0): (null)
Call Trace:
show_stack+0x88/0x390 (unreliable)
dump_stack+0x30/0x44
__might_sleep+0x1c4/0x2d0
mutex_lock_nested+0x74/0x5c0
cgroup_attach_task_all+0x5c/0x180
vhost_attach_cgroups_work+0x58/0x80 [vhost]
vhost_worker+0x24c/0x3d0 [vhost]
kthread+0xec/0x100
ret_from_kernel_thread+0x5c/0xd4
Prior to commit 04b96e5528ca ("vhost: lockless enqueuing") (Aug 2016) the
vhost_worker() would do a spin_unlock_irq() not long after calling use_mm(),
which had the effect of reenabling IRQs. Since that commit removed the locking
in vhost_worker() the body of the vhost_worker() loop now runs with interrupts
off causing the warnings.
This patch addresses the problem by making the powerpc code mirror the x86 code,
ie. we disable interrupts in switch_mm(), and optimise the scheduler case by
defining switch_mm_irqs_off().
Cc: stable@vger.kernel.org # v4.7+
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[mpe: Flesh out/rewrite change log, add stable]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-04-19 13:38:26 +07:00
|
|
|
/*
 * Switch the MMU context from prev to next on behalf of tsk.
 *
 * As the name indicates, the caller is expected to have interrupts
 * disabled; switch_mm() wraps this function in
 * local_irq_save()/local_irq_restore() for callers that have not.
 *
 * Steps, in order:
 *  - if this CPU is not yet in next's cpumask, set it, bump the active-CPU
 *    count, issue a full barrier and remember that next is new on this CPU;
 *  - record the PGD via switch_mm_pgdir();
 *  - if prev == next, stop here;
 *  - stop AltiVec streams (CONFIG_ALTIVEC and CPU_FTR_ALTIVEC only);
 *  - apply radix_kvm_prefetch_workaround() if next is new on this CPU;
 *  - perform the hardware switch via switch_mmu_context().
 */
static inline void switch_mm_irqs_off(struct mm_struct *prev,
				      struct mm_struct *next,
				      struct task_struct *tsk)
{
	bool new_on_cpu = false;

	/* Mark this context as having been used on the new CPU */
	if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next))) {
		cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
		inc_mm_active_cpus(next);

		/*
		 * This full barrier orders the store to the cpumask above vs
		 * a subsequent operation which allows this CPU to begin loading
		 * translations for next.
		 *
		 * When using the radix MMU that operation is the load of the
		 * MMU context id, which is then moved to SPRN_PID.
		 *
		 * For the hash MMU it is either the first load from slb_cache
		 * in switch_slb(), and/or the store of paca->mm_ctx_id in
		 * copy_mm_to_paca().
		 *
		 * On the read side the barrier is in pte_xchg(), which orders
		 * the store to the PTE vs the load of mm_cpumask.
		 */
		smp_mb();

		new_on_cpu = true;
	}

	/* Some subarchs need to track the PGD elsewhere */
	switch_mm_pgdir(tsk, next);

	/* Nothing else to do if we aren't actually switching */
	if (prev == next)
		return;

	/*
	 * We must stop all altivec streams before changing the HW
	 * context
	 */
#ifdef CONFIG_ALTIVEC
	if (cpu_has_feature(CPU_FTR_ALTIVEC))
		asm volatile ("dssall");
#endif /* CONFIG_ALTIVEC */

	/* Only needed the first time this mm is seen on this CPU */
	if (new_on_cpu)
		radix_kvm_prefetch_workaround(next);

	/*
	 * The actual HW switching method differs between the various
	 * sub architectures. Out of line for now
	 */
	switch_mmu_context(prev, next, tsk);
}
|
|
|
|
|
powerpc/mm: Ensure IRQs are off in switch_mm()
powerpc expects IRQs to already be (soft) disabled when switch_mm() is
called, as made clear in the commit message of 9c1e105238c4 ("powerpc: Allow
perf_counters to access user memory at interrupt time").
Aside from any race conditions that might exist between switch_mm() and an IRQ,
there is also an unconditional hard_irq_disable() in switch_slb(). If that isn't
followed at some point by an IRQ enable then interrupts will remain disabled
until we return to userspace.
It is true that when switch_mm() is called from the scheduler IRQs are off, but
not when it's called by use_mm(). Looking closer we see that last year in commit
f98db6013c55 ("sched/core: Add switch_mm_irqs_off() and use it in the scheduler")
this was made more explicit by the addition of switch_mm_irqs_off() which is now
called by the scheduler, vs switch_mm() which is used by use_mm().
Arguably it is a bug in use_mm() to call switch_mm() in a different context than
it expects, but fixing that will take time.
This was discovered recently when vhost started throwing warnings such as:
BUG: sleeping function called from invalid context at kernel/mutex.c:578
in_atomic(): 0, irqs_disabled(): 1, pid: 10768, name: vhost-10760
no locks held by vhost-10760/10768.
irq event stamp: 10
hardirqs last enabled at (9): _raw_spin_unlock_irq+0x40/0x80
hardirqs last disabled at (10): switch_slb+0x2e4/0x490
softirqs last enabled at (0): copy_process+0x5e8/0x1260
softirqs last disabled at (0): (null)
Call Trace:
show_stack+0x88/0x390 (unreliable)
dump_stack+0x30/0x44
__might_sleep+0x1c4/0x2d0
mutex_lock_nested+0x74/0x5c0
cgroup_attach_task_all+0x5c/0x180
vhost_attach_cgroups_work+0x58/0x80 [vhost]
vhost_worker+0x24c/0x3d0 [vhost]
kthread+0xec/0x100
ret_from_kernel_thread+0x5c/0xd4
Prior to commit 04b96e5528ca ("vhost: lockless enqueuing") (Aug 2016) the
vhost_worker() would do a spin_unlock_irq() not long after calling use_mm(),
which had the effect of reenabling IRQs. Since that commit removed the locking
in vhost_worker() the body of the vhost_worker() loop now runs with interrupts
off causing the warnings.
This patch addresses the problem by making the powerpc code mirror the x86 code,
ie. we disable interrupts in switch_mm(), and optimise the scheduler case by
defining switch_mm_irqs_off().
Cc: stable@vger.kernel.org # v4.7+
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
[mpe: Flesh out/rewrite change log, add stable]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2017-04-19 13:38:26 +07:00
|
|
|
/*
 * switch_mm() - switch MMU context from a caller that may have interrupts
 * enabled.
 *
 * Saves and disables local interrupts, performs the switch through
 * switch_mm_irqs_off(), then restores the saved interrupt state.
 */
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
			     struct task_struct *tsk)
{
	unsigned long flags;

	local_irq_save(flags);
	switch_mm_irqs_off(prev, next, tsk);
	local_irq_restore(flags);
}
|
|
|
|
#define switch_mm_irqs_off switch_mm_irqs_off
|
|
|
|
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
#define deactivate_mm(tsk,mm) do { } while (0)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* After we have set current->mm to a new value, this activates
|
|
|
|
* the context for the new mm so we see the new mappings.
|
|
|
|
*/
|
|
|
|
static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
|
|
|
|
{
|
|
|
|
switch_mm(prev, next, current);
|
|
|
|
}
|
|
|
|
|
2008-12-19 02:13:24 +07:00
|
|
|
/*
 * enter_lazy_tlb() does nothing except on 64-bit Book3E, where it clears
 * the PACA's record of the current PGD.  Neither argument is used.
 */
static inline void enter_lazy_tlb(struct mm_struct *mm,
				  struct task_struct *tsk)
{
	/* 64-bit Book3E keeps track of current PGD in the PACA */
#ifdef CONFIG_PPC_BOOK3E_64
	get_paca()->pgd = NULL;
#endif
}
|
|
|
|
|
2015-06-25 06:56:22 +07:00
|
|
|
/* arch_dup_mmap() is an empty hook here: neither argument is used. */
static inline void arch_dup_mmap(struct mm_struct *oldmm,
				 struct mm_struct *mm)
{
}
|
|
|
|
|
|
|
|
/* arch_exit_mmap() is an empty hook here: mm is not used. */
static inline void arch_exit_mmap(struct mm_struct *mm)
{
}
|
|
|
|
|
|
|
|
static inline void arch_unmap(struct mm_struct *mm,
|
|
|
|
struct vm_area_struct *vma,
|
|
|
|
unsigned long start, unsigned long end)
|
|
|
|
{
|
|
|
|
if (start <= mm->context.vdso_base && mm->context.vdso_base < end)
|
|
|
|
mm->context.vdso_base = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* arch_bprm_mm_init() is an empty hook here: neither argument is used. */
static inline void arch_bprm_mm_init(struct mm_struct *mm,
				     struct vm_area_struct *vma)
{
}
|
|
|
|
|
2016-02-13 04:02:21 +07:00
|
|
|
/*
 * arch_vma_access_permitted() - architecture hook for extra VMA access
 * checks exposed to generic mm code (introduced for protection keys, which
 * x86 backs with checks against the VMA's protection key).
 *
 * This implementation performs no check: it ignores all arguments and
 * always returns true.
 */
static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
		bool write, bool execute, bool foreign)
{
	/* by default, allow everything */
	return true;
}
|
2005-12-17 04:43:46 +07:00
|
|
|
#endif /* __KERNEL__ */
|
2005-11-19 16:17:32 +07:00
|
|
|
#endif /* __ASM_POWERPC_MMU_CONTEXT_H */
|