mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-28 11:18:45 +07:00
ada329e6b5
As a result of a KVM_SET_USER_MEMORY_REGION ioctl, KVM flushes the dcache for the memslot being changed to ensure a consistent view of memory between the host and the guest: the host runs with caches enabled, and it is possible for the data written by the hypervisor to still be in the caches, but the guest is running with stage 1 disabled, meaning data accesses are to Device-nGnRnE memory, bypassing the caches entirely. Flushing the dcache is not necessary when KVM enables FWB, because it forces the guest to use cacheable memory accesses. The current behaviour does not change, as the dcache flush helpers execute the cache operation only if FWB is not enabled, but walking the stage 2 table is avoided. Signed-off-by: Alexandru Elisei <alexandru.elisei@arm.com> Signed-off-by: Marc Zyngier <maz@kernel.org> Link: https://lore.kernel.org/r/20200915170442.131635-1-alexandru.elisei@arm.com
1446 lines
40 KiB
C
1446 lines
40 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* Copyright (C) 2012 - Virtual Open Systems and Columbia University
|
|
* Author: Christoffer Dall <c.dall@virtualopensystems.com>
|
|
*/
|
|
|
|
#include <linux/mman.h>
|
|
#include <linux/kvm_host.h>
|
|
#include <linux/io.h>
|
|
#include <linux/hugetlb.h>
|
|
#include <linux/sched/signal.h>
|
|
#include <trace/events/kvm.h>
|
|
#include <asm/pgalloc.h>
|
|
#include <asm/cacheflush.h>
|
|
#include <asm/kvm_arm.h>
|
|
#include <asm/kvm_mmu.h>
|
|
#include <asm/kvm_pgtable.h>
|
|
#include <asm/kvm_ras.h>
|
|
#include <asm/kvm_asm.h>
|
|
#include <asm/kvm_emulate.h>
|
|
#include <asm/virt.h>
|
|
|
|
#include "trace.h"
|
|
|
|
static struct kvm_pgtable *hyp_pgtable;
|
|
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
|
|
|
|
static unsigned long hyp_idmap_start;
|
|
static unsigned long hyp_idmap_end;
|
|
static phys_addr_t hyp_idmap_vector;
|
|
|
|
static unsigned long io_map_base;
|
|
|
|
|
|
/*
|
|
* Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
|
|
* we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
|
|
* CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
|
|
* long will also starve other vCPUs. We have to also make sure that the page
|
|
* tables are not freed while we released the lock.
|
|
*/
|
|
static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
|
|
phys_addr_t end,
|
|
int (*fn)(struct kvm_pgtable *, u64, u64),
|
|
bool resched)
|
|
{
|
|
int ret;
|
|
u64 next;
|
|
|
|
do {
|
|
struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
|
|
if (!pgt)
|
|
return -EINVAL;
|
|
|
|
next = stage2_pgd_addr_end(kvm, addr, end);
|
|
ret = fn(pgt, addr, next - addr);
|
|
if (ret)
|
|
break;
|
|
|
|
if (resched && next != end)
|
|
cond_resched_lock(&kvm->mmu_lock);
|
|
} while (addr = next, addr != end);
|
|
|
|
return ret;
|
|
}
|
|
|
|
/* Range walk that is allowed to drop kvm->mmu_lock between chunks. */
#define stage2_apply_range_resched(kvm, addr, end, fn)			\
	stage2_apply_range(kvm, addr, end, fn, true)
|
|
|
|
static bool memslot_is_logging(struct kvm_memory_slot *memslot)
|
|
{
|
|
return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
|
|
}
|
|
|
|
/**
|
|
* kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
|
|
* @kvm: pointer to kvm structure.
|
|
*
|
|
* Interface to HYP function to flush all VM TLB entries
|
|
*/
|
|
void kvm_flush_remote_tlbs(struct kvm *kvm)
|
|
{
|
|
kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
|
|
}
|
|
|
|
static bool kvm_is_device_pfn(unsigned long pfn)
|
|
{
|
|
return !pfn_valid(pfn);
|
|
}
|
|
|
|
/*
|
|
* Unmapping vs dcache management:
|
|
*
|
|
* If a guest maps certain memory pages as uncached, all writes will
|
|
* bypass the data cache and go directly to RAM. However, the CPUs
|
|
* can still speculate reads (not writes) and fill cache lines with
|
|
* data.
|
|
*
|
|
* Those cache lines will be *clean* cache lines though, so a
|
|
* clean+invalidate operation is equivalent to an invalidate
|
|
* operation, because no cache lines are marked dirty.
|
|
*
|
|
* Those clean cache lines could be filled prior to an uncached write
|
|
* by the guest, and the cache coherent IO subsystem would therefore
|
|
* end up writing old data to disk.
|
|
*
|
|
* This is why right after unmapping a page/section and invalidating
|
|
* the corresponding TLBs, we flush to make sure the IO subsystem will
|
|
* never hit in the cache.
|
|
*
|
|
* This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
|
|
* we then fully enforce cacheability of RAM, no matter what the guest
|
|
* does.
|
|
*/
|
|
/**
|
|
* unmap_stage2_range -- Clear stage2 page table entries to unmap a range
|
|
* @kvm: The VM pointer
|
|
* @start: The intermediate physical base address of the range to unmap
|
|
* @size: The size of the area to unmap
|
|
*
|
|
* Clear a range of stage-2 mappings, lowering the various ref-counts. Must
|
|
* be called while holding mmu_lock (unless for freeing the stage2 pgd before
|
|
* destroying the VM), otherwise another faulting VCPU may come in and mess
|
|
* with things behind our backs.
|
|
*/
|
|
static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
|
|
bool may_block)
|
|
{
|
|
struct kvm *kvm = mmu->kvm;
|
|
phys_addr_t end = start + size;
|
|
|
|
assert_spin_locked(&kvm->mmu_lock);
|
|
WARN_ON(size & ~PAGE_MASK);
|
|
WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
|
|
may_block));
|
|
}
|
|
|
|
/* Unmap a stage-2 range, letting the walk reschedule between chunks. */
static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
{
	const bool may_block = true;

	__unmap_stage2_range(mmu, start, size, may_block);
}
|
|
|
|
static void stage2_flush_memslot(struct kvm *kvm,
|
|
struct kvm_memory_slot *memslot)
|
|
{
|
|
phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
|
|
phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
|
|
|
|
stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush);
|
|
}
|
|
|
|
/**
|
|
* stage2_flush_vm - Invalidate cache for pages mapped in stage 2
|
|
* @kvm: The struct kvm pointer
|
|
*
|
|
* Go through the stage 2 page tables and invalidate any cache lines
|
|
* backing memory already mapped to the VM.
|
|
*/
|
|
static void stage2_flush_vm(struct kvm *kvm)
|
|
{
|
|
struct kvm_memslots *slots;
|
|
struct kvm_memory_slot *memslot;
|
|
int idx;
|
|
|
|
idx = srcu_read_lock(&kvm->srcu);
|
|
spin_lock(&kvm->mmu_lock);
|
|
|
|
slots = kvm_memslots(kvm);
|
|
kvm_for_each_memslot(memslot, slots)
|
|
stage2_flush_memslot(kvm, memslot);
|
|
|
|
spin_unlock(&kvm->mmu_lock);
|
|
srcu_read_unlock(&kvm->srcu, idx);
|
|
}
|
|
|
|
/**
|
|
* free_hyp_pgds - free Hyp-mode page tables
|
|
*/
|
|
void free_hyp_pgds(void)
|
|
{
|
|
mutex_lock(&kvm_hyp_pgd_mutex);
|
|
if (hyp_pgtable) {
|
|
kvm_pgtable_hyp_destroy(hyp_pgtable);
|
|
kfree(hyp_pgtable);
|
|
}
|
|
mutex_unlock(&kvm_hyp_pgd_mutex);
|
|
}
|
|
|
|
static int __create_hyp_mappings(unsigned long start, unsigned long size,
|
|
unsigned long phys, enum kvm_pgtable_prot prot)
|
|
{
|
|
int err;
|
|
|
|
mutex_lock(&kvm_hyp_pgd_mutex);
|
|
err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
|
|
mutex_unlock(&kvm_hyp_pgd_mutex);
|
|
|
|
return err;
|
|
}
|
|
|
|
/*
 * Translate a kernel virtual address to a physical address. vmalloc
 * addresses are resolved through their backing page; any other address must
 * satisfy virt_addr_valid() (BUG otherwise) and is converted with __pa().
 */
static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
{
	if (is_vmalloc_addr(kaddr))
		return page_to_phys(vmalloc_to_page(kaddr)) +
		       offset_in_page(kaddr);

	BUG_ON(!virt_addr_valid(kaddr));
	return __pa(kaddr);
}
|
|
|
|
/**
|
|
* create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
|
|
* @from: The virtual kernel start address of the range
|
|
* @to: The virtual kernel end address of the range (exclusive)
|
|
* @prot: The protection to be applied to this range
|
|
*
|
|
* The same virtual address as the kernel virtual address is also used
|
|
* in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
|
|
* physical pages.
|
|
*/
|
|
int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
|
|
{
|
|
phys_addr_t phys_addr;
|
|
unsigned long virt_addr;
|
|
unsigned long start = kern_hyp_va((unsigned long)from);
|
|
unsigned long end = kern_hyp_va((unsigned long)to);
|
|
|
|
if (is_kernel_in_hyp_mode())
|
|
return 0;
|
|
|
|
start = start & PAGE_MASK;
|
|
end = PAGE_ALIGN(end);
|
|
|
|
for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
|
|
int err;
|
|
|
|
phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
|
|
err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
|
|
prot);
|
|
if (err)
|
|
return err;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Allocate a private hyp VA range just below io_map_base and map
 * [phys_addr, phys_addr + size) there with protection @prot.
 *
 * On success, *@haddr receives the hyp VA corresponding to @phys_addr
 * (including its offset within the page) and 0 is returned. Returns -ENOMEM
 * when the VA space is exhausted, or the error from __create_hyp_mappings();
 * *@haddr is left untouched on failure.
 */
static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
					unsigned long *haddr,
					enum kvm_pgtable_prot prot)
{
	unsigned long page_off = offset_in_page(phys_addr);
	unsigned long base;
	bool overflow;
	int ret;

	/* The allocated size is always a multiple of PAGE_SIZE. */
	size = PAGE_ALIGN(size + page_off);

	mutex_lock(&kvm_hyp_pgd_mutex);

	/*
	 * This assumes that we have enough space below the idmap page to
	 * allocate our VAs. If BIT(VA_BITS - 1) got flipped by carving out
	 * the new area, we've overflowed the idmap/IO address range. A
	 * potential alternative would be to detect that overflow and switch
	 * to an allocation above the idmap.
	 */
	base = io_map_base - size;
	overflow = (base ^ io_map_base) & BIT(VA_BITS - 1);
	if (!overflow)
		io_map_base = base;

	mutex_unlock(&kvm_hyp_pgd_mutex);

	if (overflow)
		return -ENOMEM;

	ret = __create_hyp_mappings(base, size, phys_addr, prot);
	if (ret)
		return ret;

	*haddr = base + page_off;
	return 0;
}
|
|
|
|
/**
|
|
* create_hyp_io_mappings - Map IO into both kernel and HYP
|
|
* @phys_addr: The physical start address which gets mapped
|
|
* @size: Size of the region being mapped
|
|
* @kaddr: Kernel VA for this mapping
|
|
* @haddr: HYP VA for this mapping
|
|
*/
|
|
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
|
|
void __iomem **kaddr,
|
|
void __iomem **haddr)
|
|
{
|
|
unsigned long addr;
|
|
int ret;
|
|
|
|
*kaddr = ioremap(phys_addr, size);
|
|
if (!*kaddr)
|
|
return -ENOMEM;
|
|
|
|
if (is_kernel_in_hyp_mode()) {
|
|
*haddr = *kaddr;
|
|
return 0;
|
|
}
|
|
|
|
ret = __create_hyp_private_mapping(phys_addr, size,
|
|
&addr, PAGE_HYP_DEVICE);
|
|
if (ret) {
|
|
iounmap(*kaddr);
|
|
*kaddr = NULL;
|
|
*haddr = NULL;
|
|
return ret;
|
|
}
|
|
|
|
*haddr = (void __iomem *)addr;
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* create_hyp_exec_mappings - Map an executable range into HYP
|
|
* @phys_addr: The physical start address which gets mapped
|
|
* @size: Size of the region being mapped
|
|
* @haddr: HYP VA for this mapping
|
|
*/
|
|
int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
|
|
void **haddr)
|
|
{
|
|
unsigned long addr;
|
|
int ret;
|
|
|
|
BUG_ON(is_kernel_in_hyp_mode());
|
|
|
|
ret = __create_hyp_private_mapping(phys_addr, size,
|
|
&addr, PAGE_HYP_EXEC);
|
|
if (ret) {
|
|
*haddr = NULL;
|
|
return ret;
|
|
}
|
|
|
|
*haddr = (void *)addr;
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* kvm_init_stage2_mmu - Initialise a S2 MMU strucrure
|
|
* @kvm: The pointer to the KVM structure
|
|
* @mmu: The pointer to the s2 MMU structure
|
|
*
|
|
* Allocates only the stage-2 HW PGD level table(s).
|
|
* Note we don't need locking here as this is only called when the VM is
|
|
* created, which can only be done once.
|
|
*/
|
|
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
|
|
{
|
|
int cpu, err;
|
|
struct kvm_pgtable *pgt;
|
|
|
|
if (mmu->pgt != NULL) {
|
|
kvm_err("kvm_arch already initialized?\n");
|
|
return -EINVAL;
|
|
}
|
|
|
|
pgt = kzalloc(sizeof(*pgt), GFP_KERNEL);
|
|
if (!pgt)
|
|
return -ENOMEM;
|
|
|
|
err = kvm_pgtable_stage2_init(pgt, kvm);
|
|
if (err)
|
|
goto out_free_pgtable;
|
|
|
|
mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
|
|
if (!mmu->last_vcpu_ran) {
|
|
err = -ENOMEM;
|
|
goto out_destroy_pgtable;
|
|
}
|
|
|
|
for_each_possible_cpu(cpu)
|
|
*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
|
|
|
|
mmu->kvm = kvm;
|
|
mmu->pgt = pgt;
|
|
mmu->pgd_phys = __pa(pgt->pgd);
|
|
mmu->vmid.vmid_gen = 0;
|
|
return 0;
|
|
|
|
out_destroy_pgtable:
|
|
kvm_pgtable_stage2_destroy(pgt);
|
|
out_free_pgtable:
|
|
kfree(pgt);
|
|
return err;
|
|
}
|
|
|
|
static void stage2_unmap_memslot(struct kvm *kvm,
|
|
struct kvm_memory_slot *memslot)
|
|
{
|
|
hva_t hva = memslot->userspace_addr;
|
|
phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
|
|
phys_addr_t size = PAGE_SIZE * memslot->npages;
|
|
hva_t reg_end = hva + size;
|
|
|
|
/*
|
|
* A memory region could potentially cover multiple VMAs, and any holes
|
|
* between them, so iterate over all of them to find out if we should
|
|
* unmap any of them.
|
|
*
|
|
* +--------------------------------------------+
|
|
* +---------------+----------------+ +----------------+
|
|
* | : VMA 1 | VMA 2 | | VMA 3 : |
|
|
* +---------------+----------------+ +----------------+
|
|
* | memory region |
|
|
* +--------------------------------------------+
|
|
*/
|
|
do {
|
|
struct vm_area_struct *vma = find_vma(current->mm, hva);
|
|
hva_t vm_start, vm_end;
|
|
|
|
if (!vma || vma->vm_start >= reg_end)
|
|
break;
|
|
|
|
/*
|
|
* Take the intersection of this VMA with the memory region
|
|
*/
|
|
vm_start = max(hva, vma->vm_start);
|
|
vm_end = min(reg_end, vma->vm_end);
|
|
|
|
if (!(vma->vm_flags & VM_PFNMAP)) {
|
|
gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
|
|
unmap_stage2_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
|
|
}
|
|
hva = vm_end;
|
|
} while (hva < reg_end);
|
|
}
|
|
|
|
/**
|
|
* stage2_unmap_vm - Unmap Stage-2 RAM mappings
|
|
* @kvm: The struct kvm pointer
|
|
*
|
|
* Go through the memregions and unmap any regular RAM
|
|
* backing memory already mapped to the VM.
|
|
*/
|
|
void stage2_unmap_vm(struct kvm *kvm)
|
|
{
|
|
struct kvm_memslots *slots;
|
|
struct kvm_memory_slot *memslot;
|
|
int idx;
|
|
|
|
idx = srcu_read_lock(&kvm->srcu);
|
|
mmap_read_lock(current->mm);
|
|
spin_lock(&kvm->mmu_lock);
|
|
|
|
slots = kvm_memslots(kvm);
|
|
kvm_for_each_memslot(memslot, slots)
|
|
stage2_unmap_memslot(kvm, memslot);
|
|
|
|
spin_unlock(&kvm->mmu_lock);
|
|
mmap_read_unlock(current->mm);
|
|
srcu_read_unlock(&kvm->srcu, idx);
|
|
}
|
|
|
|
void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
|
|
{
|
|
struct kvm *kvm = mmu->kvm;
|
|
struct kvm_pgtable *pgt = NULL;
|
|
|
|
spin_lock(&kvm->mmu_lock);
|
|
pgt = mmu->pgt;
|
|
if (pgt) {
|
|
mmu->pgd_phys = 0;
|
|
mmu->pgt = NULL;
|
|
free_percpu(mmu->last_vcpu_ran);
|
|
}
|
|
spin_unlock(&kvm->mmu_lock);
|
|
|
|
if (pgt) {
|
|
kvm_pgtable_stage2_destroy(pgt);
|
|
kfree(pgt);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* kvm_phys_addr_ioremap - map a device range to guest IPA
|
|
*
|
|
* @kvm: The KVM pointer
|
|
* @guest_ipa: The IPA at which to insert the mapping
|
|
* @pa: The physical address of the device
|
|
* @size: The size of the mapping
|
|
*/
|
|
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
|
|
phys_addr_t pa, unsigned long size, bool writable)
|
|
{
|
|
phys_addr_t addr;
|
|
int ret = 0;
|
|
struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
|
|
struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
|
|
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
|
|
KVM_PGTABLE_PROT_R |
|
|
(writable ? KVM_PGTABLE_PROT_W : 0);
|
|
|
|
size += offset_in_page(guest_ipa);
|
|
guest_ipa &= PAGE_MASK;
|
|
|
|
for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
|
|
ret = kvm_mmu_topup_memory_cache(&cache,
|
|
kvm_mmu_cache_min_pages(kvm));
|
|
if (ret)
|
|
break;
|
|
|
|
spin_lock(&kvm->mmu_lock);
|
|
ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
|
|
&cache);
|
|
spin_unlock(&kvm->mmu_lock);
|
|
if (ret)
|
|
break;
|
|
|
|
pa += PAGE_SIZE;
|
|
}
|
|
|
|
kvm_mmu_free_memory_cache(&cache);
|
|
return ret;
|
|
}
|
|
|
|
/**
|
|
* stage2_wp_range() - write protect stage2 memory region range
|
|
* @kvm: The KVM pointer
|
|
* @addr: Start address of range
|
|
* @end: End address of range
|
|
*/
|
|
static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
|
|
{
|
|
struct kvm *kvm = mmu->kvm;
|
|
stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
|
|
}
|
|
|
|
/**
|
|
* kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
|
|
* @kvm: The KVM pointer
|
|
* @slot: The memory slot to write protect
|
|
*
|
|
* Called to start logging dirty pages after memory region
|
|
* KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
|
|
* all present PUD, PMD and PTEs are write protected in the memory region.
|
|
* Afterwards read of dirty page log can be called.
|
|
*
|
|
* Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
|
|
* serializing operations for VM memory regions.
|
|
*/
|
|
void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
|
|
{
|
|
struct kvm_memslots *slots = kvm_memslots(kvm);
|
|
struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
|
|
phys_addr_t start, end;
|
|
|
|
if (WARN_ON_ONCE(!memslot))
|
|
return;
|
|
|
|
start = memslot->base_gfn << PAGE_SHIFT;
|
|
end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
|
|
|
|
spin_lock(&kvm->mmu_lock);
|
|
stage2_wp_range(&kvm->arch.mmu, start, end);
|
|
spin_unlock(&kvm->mmu_lock);
|
|
kvm_flush_remote_tlbs(kvm);
|
|
}
|
|
|
|
/**
|
|
* kvm_mmu_write_protect_pt_masked() - write protect dirty pages
|
|
* @kvm: The KVM pointer
|
|
* @slot: The memory slot associated with mask
|
|
* @gfn_offset: The gfn offset in memory slot
|
|
* @mask: The mask of dirty pages at offset 'gfn_offset' in this memory
|
|
* slot to be write protected
|
|
*
|
|
* Walks bits set in mask write protects the associated pte's. Caller must
|
|
* acquire kvm_mmu_lock.
|
|
*/
|
|
static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
|
|
struct kvm_memory_slot *slot,
|
|
gfn_t gfn_offset, unsigned long mask)
|
|
{
|
|
phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
|
|
phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
|
|
phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
|
|
|
|
stage2_wp_range(&kvm->arch.mmu, start, end);
|
|
}
|
|
|
|
/*
|
|
* kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
|
|
* dirty pages.
|
|
*
|
|
* It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
|
|
* enable dirty logging for them.
|
|
*/
|
|
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
|
|
struct kvm_memory_slot *slot,
|
|
gfn_t gfn_offset, unsigned long mask)
|
|
{
|
|
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
|
|
}
|
|
|
|
/* Forward to __clean_dcache_guest_page() for @size bytes starting at @pfn. */
static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
{
	__clean_dcache_guest_page(pfn, size);
}
|
|
|
|
/* Forward to __invalidate_icache_guest_page() for @size bytes starting at @pfn. */
static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
{
	__invalidate_icache_guest_page(pfn, size);
}
|
|
|
|
static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
|
|
{
|
|
send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
|
|
}
|
|
|
|
static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
|
|
unsigned long hva,
|
|
unsigned long map_size)
|
|
{
|
|
gpa_t gpa_start;
|
|
hva_t uaddr_start, uaddr_end;
|
|
size_t size;
|
|
|
|
/* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
|
|
if (map_size == PAGE_SIZE)
|
|
return true;
|
|
|
|
size = memslot->npages * PAGE_SIZE;
|
|
|
|
gpa_start = memslot->base_gfn << PAGE_SHIFT;
|
|
|
|
uaddr_start = memslot->userspace_addr;
|
|
uaddr_end = uaddr_start + size;
|
|
|
|
/*
|
|
* Pages belonging to memslots that don't have the same alignment
|
|
* within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
|
|
* PMD/PUD entries, because we'll end up mapping the wrong pages.
|
|
*
|
|
* Consider a layout like the following:
|
|
*
|
|
* memslot->userspace_addr:
|
|
* +-----+--------------------+--------------------+---+
|
|
* |abcde|fgh Stage-1 block | Stage-1 block tv|xyz|
|
|
* +-----+--------------------+--------------------+---+
|
|
*
|
|
* memslot->base_gfn << PAGE_SHIFT:
|
|
* +---+--------------------+--------------------+-----+
|
|
* |abc|def Stage-2 block | Stage-2 block |tvxyz|
|
|
* +---+--------------------+--------------------+-----+
|
|
*
|
|
* If we create those stage-2 blocks, we'll end up with this incorrect
|
|
* mapping:
|
|
* d -> f
|
|
* e -> g
|
|
* f -> h
|
|
*/
|
|
if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
|
|
return false;
|
|
|
|
/*
|
|
* Next, let's make sure we're not trying to map anything not covered
|
|
* by the memslot. This means we have to prohibit block size mappings
|
|
* for the beginning and end of a non-block aligned and non-block sized
|
|
* memory slot (illustrated by the head and tail parts of the
|
|
* userspace view above containing pages 'abcde' and 'xyz',
|
|
* respectively).
|
|
*
|
|
* Note that it doesn't matter if we do the check using the
|
|
* userspace_addr or the base_gfn, as both are equally aligned (per
|
|
* the check above) and equally sized.
|
|
*/
|
|
return (hva & ~(map_size - 1)) >= uaddr_start &&
|
|
(hva & ~(map_size - 1)) + map_size <= uaddr_end;
|
|
}
|
|
|
|
/*
|
|
* Check if the given hva is backed by a transparent huge page (THP) and
|
|
* whether it can be mapped using block mapping in stage2. If so, adjust
|
|
* the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
|
|
* supported. This will need to be updated to support other THP sizes.
|
|
*
|
|
* Returns the size of the mapping.
|
|
*/
|
|
static unsigned long
|
|
transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
|
|
unsigned long hva, kvm_pfn_t *pfnp,
|
|
phys_addr_t *ipap)
|
|
{
|
|
kvm_pfn_t pfn = *pfnp;
|
|
|
|
/*
|
|
* Make sure the adjustment is done only for THP pages. Also make
|
|
* sure that the HVA and IPA are sufficiently aligned and that the
|
|
* block map is contained within the memslot.
|
|
*/
|
|
if (kvm_is_transparent_hugepage(pfn) &&
|
|
fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
|
|
/*
|
|
* The address we faulted on is backed by a transparent huge
|
|
* page. However, because we map the compound huge page and
|
|
* not the individual tail page, we need to transfer the
|
|
* refcount to the head page. We have to be careful that the
|
|
* THP doesn't start to split while we are adjusting the
|
|
* refcounts.
|
|
*
|
|
* We are sure this doesn't happen, because mmu_notifier_retry
|
|
* was successful and we are holding the mmu_lock, so if this
|
|
* THP is trying to split, it will be blocked in the mmu
|
|
* notifier before touching any of the pages, specifically
|
|
* before being able to call __split_huge_page_refcount().
|
|
*
|
|
* We can therefore safely transfer the refcount from PG_tail
|
|
* to PG_head and switch the pfn from a tail page to the head
|
|
* page accordingly.
|
|
*/
|
|
*ipap &= PMD_MASK;
|
|
kvm_release_pfn_clean(pfn);
|
|
pfn &= ~(PTRS_PER_PMD - 1);
|
|
kvm_get_pfn(pfn);
|
|
*pfnp = pfn;
|
|
|
|
return PMD_SIZE;
|
|
}
|
|
|
|
/* Use page mapping if we cannot use block mapping. */
|
|
return PAGE_SIZE;
|
|
}
|
|
|
|
/*
 * user_mem_abort - resolve a stage-2 fault on memory covered by a memslot
 * @vcpu:	  The faulting VCPU
 * @fault_ipa:	  The faulting intermediate physical address
 * @memslot:	  The memslot covering the fault
 * @hva:	  The host virtual address matching @fault_ipa
 * @fault_status: The fault status code of the abort
 *
 * Works out the mapping size from the backing VMA (forced down to a page
 * when dirty logging is active, for VM_PFNMAP VMAs, or when the memslot
 * layout rules out a block), pins the backing page and then, under
 * kvm->mmu_lock, either relaxes the permissions of the existing stage-2
 * entry or installs a new mapping.
 *
 * Returns 0 when the fault was handled, when a hwpoison signal was sent
 * instead, or when an MMU notifier invalidation raced with us and the fault
 * just has to be retried. Returns -EFAULT for an unexpected read permission
 * fault, a missing VMA or an error pfn, -ENOEXEC for an instruction fetch
 * from device memory, or the error from the memcache top-up or the
 * page-table update.
 */
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
			  struct kvm_memory_slot *memslot, unsigned long hva,
			  unsigned long fault_status)
{
	/*
	 * Must start out as 0: the memcache top-up below is skipped for most
	 * permission faults, and the mmu_notifier_retry() path jumps straight
	 * to out_unlock, so 'ret' would otherwise be returned uninitialised.
	 */
	int ret = 0;
	bool write_fault, writable, force_pte = false;
	bool exec_fault;
	bool device = false;
	unsigned long mmu_seq;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
	struct vm_area_struct *vma;
	short vma_shift;
	gfn_t gfn;
	kvm_pfn_t pfn;
	bool logging_active = memslot_is_logging(memslot);
	unsigned long vma_pagesize;
	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
	struct kvm_pgtable *pgt;

	write_fault = kvm_is_write_fault(vcpu);
	exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
	VM_BUG_ON(write_fault && exec_fault);

	if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
		kvm_err("Unexpected L2 read permission error\n");
		return -EFAULT;
	}

	/* Let's check if we will get back a huge page backed by hugetlbfs */
	mmap_read_lock(current->mm);
	vma = find_vma_intersection(current->mm, hva, hva + 1);
	if (unlikely(!vma)) {
		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
		mmap_read_unlock(current->mm);
		return -EFAULT;
	}

	if (is_vm_hugetlb_page(vma))
		vma_shift = huge_page_shift(hstate_vma(vma));
	else
		vma_shift = PAGE_SHIFT;

	/* Dirty logging and PFNMAP VMAs are always mapped page by page. */
	if (logging_active ||
	    (vma->vm_flags & VM_PFNMAP)) {
		force_pte = true;
		vma_shift = PAGE_SHIFT;
	}

	/* Fall back PUD -> PMD -> page when the memslot cannot take a block. */
	if (vma_shift == PUD_SHIFT &&
	    !fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
		vma_shift = PMD_SHIFT;

	if (vma_shift == PMD_SHIFT &&
	    !fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
		force_pte = true;
		vma_shift = PAGE_SHIFT;
	}

	vma_pagesize = 1UL << vma_shift;
	if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
		fault_ipa &= ~(vma_pagesize - 1);

	gfn = fault_ipa >> PAGE_SHIFT;
	mmap_read_unlock(current->mm);

	/*
	 * Permission faults just need to update the existing leaf entry,
	 * and so normally don't require allocations from the memcache. The
	 * only exception to this is when dirty logging is enabled at runtime
	 * and a write fault needs to collapse a block entry into a table.
	 */
	if (fault_status != FSC_PERM || (logging_active && write_fault)) {
		ret = kvm_mmu_topup_memory_cache(memcache,
						 kvm_mmu_cache_min_pages(kvm));
		if (ret)
			return ret;
	}

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	/*
	 * Ensure the read of mmu_notifier_seq happens before we call
	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
	 * the page we just got a reference to gets unmapped before we have a
	 * chance to grab the mmu_lock, which ensure that if the page gets
	 * unmapped afterwards, the call to kvm_unmap_hva will take it away
	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
	 */
	smp_rmb();

	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
	if (pfn == KVM_PFN_ERR_HWPOISON) {
		kvm_send_hwpoison_signal(hva, vma_shift);
		return 0;
	}
	if (is_error_noslot_pfn(pfn))
		return -EFAULT;

	if (kvm_is_device_pfn(pfn)) {
		device = true;
	} else if (logging_active && !write_fault) {
		/*
		 * Only actually map the page as writable if this was a write
		 * fault.
		 */
		writable = false;
	}

	if (exec_fault && device)
		return -ENOEXEC;

	spin_lock(&kvm->mmu_lock);
	pgt = vcpu->arch.hw_mmu->pgt;
	/* Raced with an invalidation: bail out with ret == 0. */
	if (mmu_notifier_retry(kvm, mmu_seq))
		goto out_unlock;

	/*
	 * If we are not forced to use page mapping, check if we are
	 * backed by a THP and thus use block mapping if possible.
	 */
	if (vma_pagesize == PAGE_SIZE && !force_pte)
		vma_pagesize = transparent_hugepage_adjust(memslot, hva,
							   &pfn, &fault_ipa);
	if (writable) {
		prot |= KVM_PGTABLE_PROT_W;
		kvm_set_pfn_dirty(pfn);
		mark_page_dirty(kvm, gfn);
	}

	if (fault_status != FSC_PERM && !device)
		clean_dcache_guest_page(pfn, vma_pagesize);

	if (exec_fault) {
		prot |= KVM_PGTABLE_PROT_X;
		invalidate_icache_guest_page(pfn, vma_pagesize);
	}

	if (device)
		prot |= KVM_PGTABLE_PROT_DEVICE;
	else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
		prot |= KVM_PGTABLE_PROT_X;

	if (fault_status == FSC_PERM && !(logging_active && writable)) {
		ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
	} else {
		ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
					     __pfn_to_phys(pfn), prot,
					     memcache);
	}

out_unlock:
	spin_unlock(&kvm->mmu_lock);
	kvm_set_pfn_accessed(pfn);
	kvm_release_pfn_clean(pfn);
	return ret;
}
|
|
|
|
/* Resolve the access fault by making the page young again. */
|
|
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
|
|
{
|
|
pte_t pte;
|
|
kvm_pte_t kpte;
|
|
struct kvm_s2_mmu *mmu;
|
|
|
|
trace_kvm_access_fault(fault_ipa);
|
|
|
|
spin_lock(&vcpu->kvm->mmu_lock);
|
|
mmu = vcpu->arch.hw_mmu;
|
|
kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
|
|
spin_unlock(&vcpu->kvm->mmu_lock);
|
|
|
|
pte = __pte(kpte);
|
|
if (pte_valid(pte))
|
|
kvm_set_pfn_accessed(pte_pfn(pte));
|
|
}
|
|
|
|
/**
|
|
* kvm_handle_guest_abort - handles all 2nd stage aborts
|
|
* @vcpu: the VCPU pointer
|
|
*
|
|
* Any abort that gets to the host is almost guaranteed to be caused by a
|
|
* missing second stage translation table entry, which can mean that either the
|
|
* guest simply needs more memory and we must allocate an appropriate page or it
|
|
* can mean that the guest tried to access I/O memory, which is emulated by user
|
|
* space. The distinction is based on the IPA causing the fault and whether this
|
|
* memory region has been registered as standard RAM by user space.
|
|
*/
|
|
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
|
|
{
|
|
unsigned long fault_status;
|
|
phys_addr_t fault_ipa;
|
|
struct kvm_memory_slot *memslot;
|
|
unsigned long hva;
|
|
bool is_iabt, write_fault, writable;
|
|
gfn_t gfn;
|
|
int ret, idx;
|
|
|
|
fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
|
|
|
|
fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
|
|
is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
|
|
|
|
/* Synchronous External Abort? */
|
|
if (kvm_vcpu_abt_issea(vcpu)) {
|
|
/*
|
|
* For RAS the host kernel may handle this abort.
|
|
* There is no need to pass the error into the guest.
|
|
*/
|
|
if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
|
|
kvm_inject_vabt(vcpu);
|
|
|
|
return 1;
|
|
}
|
|
|
|
trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
|
|
kvm_vcpu_get_hfar(vcpu), fault_ipa);
|
|
|
|
/* Check the stage-2 fault is trans. fault or write fault */
|
|
if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
|
|
fault_status != FSC_ACCESS) {
|
|
kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
|
|
kvm_vcpu_trap_get_class(vcpu),
|
|
(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
|
|
(unsigned long)kvm_vcpu_get_esr(vcpu));
|
|
return -EFAULT;
|
|
}
|
|
|
|
idx = srcu_read_lock(&vcpu->kvm->srcu);
|
|
|
|
gfn = fault_ipa >> PAGE_SHIFT;
|
|
memslot = gfn_to_memslot(vcpu->kvm, gfn);
|
|
hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
|
|
write_fault = kvm_is_write_fault(vcpu);
|
|
if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
|
|
/*
|
|
* The guest has put either its instructions or its page-tables
|
|
* somewhere it shouldn't have. Userspace won't be able to do
|
|
* anything about this (there's no syndrome for a start), so
|
|
* re-inject the abort back into the guest.
|
|
*/
|
|
if (is_iabt) {
|
|
ret = -ENOEXEC;
|
|
goto out;
|
|
}
|
|
|
|
if (kvm_vcpu_dabt_iss1tw(vcpu)) {
|
|
kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
|
|
ret = 1;
|
|
goto out_unlock;
|
|
}
|
|
|
|
/*
|
|
* Check for a cache maintenance operation. Since we
|
|
* ended-up here, we know it is outside of any memory
|
|
* slot. But we can't find out if that is for a device,
|
|
* or if the guest is just being stupid. The only thing
|
|
* we know for sure is that this range cannot be cached.
|
|
*
|
|
* So let's assume that the guest is just being
|
|
* cautious, and skip the instruction.
|
|
*/
|
|
if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
|
|
kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
|
|
ret = 1;
|
|
goto out_unlock;
|
|
}
|
|
|
|
/*
|
|
* The IPA is reported as [MAX:12], so we need to
|
|
* complement it with the bottom 12 bits from the
|
|
* faulting VA. This is always 12 bits, irrespective
|
|
* of the page size.
|
|
*/
|
|
fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
|
|
ret = io_mem_abort(vcpu, fault_ipa);
|
|
goto out_unlock;
|
|
}
|
|
|
|
/* Userspace should not be able to register out-of-bounds IPAs */
|
|
VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
|
|
|
|
if (fault_status == FSC_ACCESS) {
|
|
handle_access_fault(vcpu, fault_ipa);
|
|
ret = 1;
|
|
goto out_unlock;
|
|
}
|
|
|
|
ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
|
|
if (ret == 0)
|
|
ret = 1;
|
|
out:
|
|
if (ret == -ENOEXEC) {
|
|
kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
|
|
ret = 1;
|
|
}
|
|
out_unlock:
|
|
srcu_read_unlock(&vcpu->kvm->srcu, idx);
|
|
return ret;
|
|
}
|
|
|
|
static int handle_hva_to_gpa(struct kvm *kvm,
|
|
unsigned long start,
|
|
unsigned long end,
|
|
int (*handler)(struct kvm *kvm,
|
|
gpa_t gpa, u64 size,
|
|
void *data),
|
|
void *data)
|
|
{
|
|
struct kvm_memslots *slots;
|
|
struct kvm_memory_slot *memslot;
|
|
int ret = 0;
|
|
|
|
slots = kvm_memslots(kvm);
|
|
|
|
/* we only care about the pages that the guest sees */
|
|
kvm_for_each_memslot(memslot, slots) {
|
|
unsigned long hva_start, hva_end;
|
|
gfn_t gpa;
|
|
|
|
hva_start = max(start, memslot->userspace_addr);
|
|
hva_end = min(end, memslot->userspace_addr +
|
|
(memslot->npages << PAGE_SHIFT));
|
|
if (hva_start >= hva_end)
|
|
continue;
|
|
|
|
gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
|
|
ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
/*
 * handle_hva_to_gpa() callback: unmap [gpa, gpa + size) from stage 2.
 *
 * @data points to the unsigned flags word passed to kvm_unmap_hva_range();
 * its MMU_NOTIFIER_RANGE_BLOCKABLE bit is forwarded as the may_block
 * argument of __unmap_stage2_range(). Always returns 0.
 */
static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
	const unsigned int *flagsp = data;
	bool blockable = *flagsp & MMU_NOTIFIER_RANGE_BLOCKABLE;

	__unmap_stage2_range(&kvm->arch.mmu, gpa, size, blockable);

	return 0;
}
|
|
|
|
int kvm_unmap_hva_range(struct kvm *kvm,
|
|
unsigned long start, unsigned long end, unsigned flags)
|
|
{
|
|
if (!kvm->arch.mmu.pgt)
|
|
return 0;
|
|
|
|
trace_kvm_unmap_hva_range(start, end);
|
|
handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags);
|
|
return 0;
|
|
}
|
|
|
|
/*
 * handle_hva_to_gpa() callback: install a read-only stage-2 mapping of
 * one page at @gpa, pointing at the pfn that @data refers to.
 *
 * Warns if @size is not exactly one page. Always returns 0.
 */
static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
	const kvm_pfn_t *pfnp = data;

	WARN_ON(size != PAGE_SIZE);

	/*
	 * No memcache is passed (NULL): the MMU notifiers have already
	 * unmapped any huge PMD before ->change_pte() (and thus
	 * kvm_set_spte_hva()) runs, so this path never has to break up a
	 * huge PMD.
	 */
	kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE,
			       __pfn_to_phys(*pfnp), KVM_PGTABLE_PROT_R,
			       NULL);

	return 0;
}
|
|
|
|
/*
 * Point the stage-2 mapping of the single page at @hva to the page frame
 * described by @pte.
 *
 * Nothing is done when the VM has no stage-2 page table. Always returns 0.
 */
int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
	kvm_pfn_t new_pfn;

	if (!kvm->arch.mmu.pgt)
		return 0;

	trace_kvm_set_spte_hva(hva);

	new_pfn = pte_pfn(pte);

	/*
	 * A page has been moved around, probably through CoW: handle it
	 * like a translation fault and clean the cache to the PoC before
	 * the new mapping is installed.
	 */
	clean_dcache_guest_page(new_pfn, PAGE_SIZE);
	handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
			  &kvm_set_spte_handler, &new_pfn);

	return 0;
}
|
|
|
|
/*
 * handle_hva_to_gpa() callback: mark the stage-2 entry for @gpa old.
 *
 * Warns if @size is not a page, PMD or PUD size. Returns non-zero when the
 * entry returned by kvm_pgtable_stage2_mkold() is valid and young.
 */
static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
	pte_t prev;

	WARN_ON(!(size == PAGE_SIZE || size == PMD_SIZE || size == PUD_SIZE));

	prev = __pte(kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa));

	return pte_valid(prev) && pte_young(prev);
}
|
|
|
|
/*
 * handle_hva_to_gpa() callback: report whether the stage-2 entry for @gpa
 * is young, as determined by kvm_pgtable_stage2_is_young().
 *
 * Warns if @size is not a page, PMD or PUD size.
 */
static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
	bool young;

	WARN_ON(!(size == PAGE_SIZE || size == PMD_SIZE || size == PUD_SIZE));

	young = kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa);

	return young;
}
|
|
|
|
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
|
|
{
|
|
if (!kvm->arch.mmu.pgt)
|
|
return 0;
|
|
trace_kvm_age_hva(start, end);
|
|
return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
|
|
}
|
|
|
|
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
|
|
{
|
|
if (!kvm->arch.mmu.pgt)
|
|
return 0;
|
|
trace_kvm_test_age_hva(hva);
|
|
return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
|
|
kvm_test_age_hva_handler, NULL);
|
|
}
|
|
|
|
phys_addr_t kvm_mmu_get_httbr(void)
|
|
{
|
|
return __pa(hyp_pgtable->pgd);
|
|
}
|
|
|
|
phys_addr_t kvm_get_idmap_vector(void)
|
|
{
|
|
return hyp_idmap_vector;
|
|
}
|
|
|
|
static int kvm_map_idmap_text(void)
|
|
{
|
|
unsigned long size = hyp_idmap_end - hyp_idmap_start;
|
|
int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
|
|
PAGE_HYP_EXEC);
|
|
if (err)
|
|
kvm_err("Failed to idmap %lx-%lx\n",
|
|
hyp_idmap_start, hyp_idmap_end);
|
|
|
|
return err;
|
|
}
|
|
|
|
int kvm_mmu_init(void)
|
|
{
|
|
int err;
|
|
u32 hyp_va_bits;
|
|
|
|
hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
|
|
hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
|
|
hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
|
|
hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
|
|
hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);
|
|
|
|
/*
|
|
* We rely on the linker script to ensure at build time that the HYP
|
|
* init code does not cross a page boundary.
|
|
*/
|
|
BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
|
|
|
|
hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
|
|
kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits);
|
|
kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
|
|
kvm_debug("HYP VA range: %lx:%lx\n",
|
|
kern_hyp_va(PAGE_OFFSET),
|
|
kern_hyp_va((unsigned long)high_memory - 1));
|
|
|
|
if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
|
|
hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) &&
|
|
hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
|
|
/*
|
|
* The idmap page is intersecting with the VA space,
|
|
* it is not safe to continue further.
|
|
*/
|
|
kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
|
|
err = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
|
|
if (!hyp_pgtable) {
|
|
kvm_err("Hyp mode page-table not allocated\n");
|
|
err = -ENOMEM;
|
|
goto out;
|
|
}
|
|
|
|
err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits);
|
|
if (err)
|
|
goto out_free_pgtable;
|
|
|
|
err = kvm_map_idmap_text();
|
|
if (err)
|
|
goto out_destroy_pgtable;
|
|
|
|
io_map_base = hyp_idmap_start;
|
|
return 0;
|
|
|
|
out_destroy_pgtable:
|
|
kvm_pgtable_hyp_destroy(hyp_pgtable);
|
|
out_free_pgtable:
|
|
kfree(hyp_pgtable);
|
|
hyp_pgtable = NULL;
|
|
out:
|
|
return err;
|
|
}
|
|
|
|
void kvm_arch_commit_memory_region(struct kvm *kvm,
|
|
const struct kvm_userspace_memory_region *mem,
|
|
struct kvm_memory_slot *old,
|
|
const struct kvm_memory_slot *new,
|
|
enum kvm_mr_change change)
|
|
{
|
|
/*
|
|
* At this point memslot has been committed and there is an
|
|
* allocated dirty_bitmap[], dirty pages will be tracked while the
|
|
* memory slot is write protected.
|
|
*/
|
|
if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
|
|
/*
|
|
* If we're with initial-all-set, we don't need to write
|
|
* protect any pages because they're all reported as dirty.
|
|
* Huge pages and normal pages will be write protect gradually.
|
|
*/
|
|
if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
|
|
kvm_mmu_wp_memory_region(kvm, mem->slot);
|
|
}
|
|
}
|
|
}
|
|
|
|
int kvm_arch_prepare_memory_region(struct kvm *kvm,
|
|
struct kvm_memory_slot *memslot,
|
|
const struct kvm_userspace_memory_region *mem,
|
|
enum kvm_mr_change change)
|
|
{
|
|
hva_t hva = mem->userspace_addr;
|
|
hva_t reg_end = hva + mem->memory_size;
|
|
bool writable = !(mem->flags & KVM_MEM_READONLY);
|
|
int ret = 0;
|
|
|
|
if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
|
|
change != KVM_MR_FLAGS_ONLY)
|
|
return 0;
|
|
|
|
/*
|
|
* Prevent userspace from creating a memory region outside of the IPA
|
|
* space addressable by the KVM guest IPA space.
|
|
*/
|
|
if (memslot->base_gfn + memslot->npages >=
|
|
(kvm_phys_size(kvm) >> PAGE_SHIFT))
|
|
return -EFAULT;
|
|
|
|
mmap_read_lock(current->mm);
|
|
/*
|
|
* A memory region could potentially cover multiple VMAs, and any holes
|
|
* between them, so iterate over all of them to find out if we can map
|
|
* any of them right now.
|
|
*
|
|
* +--------------------------------------------+
|
|
* +---------------+----------------+ +----------------+
|
|
* | : VMA 1 | VMA 2 | | VMA 3 : |
|
|
* +---------------+----------------+ +----------------+
|
|
* | memory region |
|
|
* +--------------------------------------------+
|
|
*/
|
|
do {
|
|
struct vm_area_struct *vma = find_vma(current->mm, hva);
|
|
hva_t vm_start, vm_end;
|
|
|
|
if (!vma || vma->vm_start >= reg_end)
|
|
break;
|
|
|
|
/*
|
|
* Take the intersection of this VMA with the memory region
|
|
*/
|
|
vm_start = max(hva, vma->vm_start);
|
|
vm_end = min(reg_end, vma->vm_end);
|
|
|
|
if (vma->vm_flags & VM_PFNMAP) {
|
|
gpa_t gpa = mem->guest_phys_addr +
|
|
(vm_start - mem->userspace_addr);
|
|
phys_addr_t pa;
|
|
|
|
pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
|
|
pa += vm_start - vma->vm_start;
|
|
|
|
/* IO region dirty page logging not allowed */
|
|
if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
|
|
ret = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
|
|
vm_end - vm_start,
|
|
writable);
|
|
if (ret)
|
|
break;
|
|
}
|
|
hva = vm_end;
|
|
} while (hva < reg_end);
|
|
|
|
if (change == KVM_MR_FLAGS_ONLY)
|
|
goto out;
|
|
|
|
spin_lock(&kvm->mmu_lock);
|
|
if (ret)
|
|
unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size);
|
|
else if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
|
|
stage2_flush_memslot(kvm, memslot);
|
|
spin_unlock(&kvm->mmu_lock);
|
|
out:
|
|
mmap_read_unlock(current->mm);
|
|
return ret;
|
|
}
|
|
|
|
/* Arch hook for freeing a memslot; intentionally a no-op here. */
void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	/* Nothing to release. */
}
|
|
|
|
/* Arch hook for memslot generation updates; intentionally a no-op here. */
void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
	/* Nothing to do. */
}
|
|
|
|
void kvm_arch_flush_shadow_all(struct kvm *kvm)
|
|
{
|
|
kvm_free_stage2_pgd(&kvm->arch.mmu);
|
|
}
|
|
|
|
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
|
|
struct kvm_memory_slot *slot)
|
|
{
|
|
gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
|
|
phys_addr_t size = slot->npages << PAGE_SHIFT;
|
|
|
|
spin_lock(&kvm->mmu_lock);
|
|
unmap_stage2_range(&kvm->arch.mmu, gpa, size);
|
|
spin_unlock(&kvm->mmu_lock);
|
|
}
|
|
|
|
/*
|
|
* See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
|
|
*
|
|
* Main problems:
|
|
* - S/W ops are local to a CPU (not broadcast)
|
|
* - We have line migration behind our back (speculation)
|
|
* - System caches don't support S/W at all (damn!)
|
|
*
|
|
* In the face of the above, the best we can do is to try and convert
|
|
* S/W ops to VA ops. Because the guest is not allowed to infer the
|
|
* S/W to PA mapping, it can only use S/W to nuke the whole cache,
|
|
* which is a rather good thing for us.
|
|
*
|
|
* Also, it is only used when turning caches on/off ("The expected
|
|
* usage of the cache maintenance instructions that operate by set/way
|
|
* is associated with the cache maintenance instructions associated
|
|
* with the powerdown and powerup of caches, if this is required by
|
|
* the implementation.").
|
|
*
|
|
* We use the following policy:
|
|
*
|
|
* - If we trap a S/W operation, we enable VM trapping to detect
|
|
* caches being turned on/off, and do a full clean.
|
|
*
|
|
* - We flush the caches on both caches being turned on and off.
|
|
*
|
|
* - Once the caches are enabled, we stop trapping VM ops.
|
|
*/
|
|
void kvm_set_way_flush(struct kvm_vcpu *vcpu)
|
|
{
|
|
unsigned long hcr = *vcpu_hcr(vcpu);
|
|
|
|
/*
|
|
* If this is the first time we do a S/W operation
|
|
* (i.e. HCR_TVM not set) flush the whole memory, and set the
|
|
* VM trapping.
|
|
*
|
|
* Otherwise, rely on the VM trapping to wait for the MMU +
|
|
* Caches to be turned off. At that point, we'll be able to
|
|
* clean the caches again.
|
|
*/
|
|
if (!(hcr & HCR_TVM)) {
|
|
trace_kvm_set_way_flush(*vcpu_pc(vcpu),
|
|
vcpu_has_cache_enabled(vcpu));
|
|
stage2_flush_vm(vcpu->kvm);
|
|
*vcpu_hcr(vcpu) = hcr | HCR_TVM;
|
|
}
|
|
}
|
|
|
|
/*
 * React to a possible change of the vcpu's cache-enable state.
 *
 * @was_enabled is the state before the change; the current state is read
 * with vcpu_has_cache_enabled(). A transition in either direction flushes
 * the whole VM, and HCR_TVM trapping is dropped once caches are on.
 */
void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
{
	bool enabled = vcpu_has_cache_enabled(vcpu);

	/*
	 * Turning the MMU+caches on requires an invalidate, turning them
	 * off requires a clean. Clean + invalidate covers both cases.
	 */
	if (was_enabled != enabled)
		stage2_flush_vm(vcpu->kvm);

	/* Caches are now on, stop trapping VM ops (until a S/W op) */
	if (enabled)
		*vcpu_hcr(vcpu) &= ~HCR_TVM;

	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, enabled);
}
|