From c43203cab1e2e193c43f8295f01dfb2a0721d9e5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 1 Jun 2016 22:26:00 +0200 Subject: [PATCH 001/302] KVM: x86: avoid simultaneous queueing of both IRQ and SMI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the processor exits to KVM while delivering an interrupt, the hypervisor then requeues the interrupt for the next vmentry. Trying to enter SMM in this same window causes to enter non-root mode in emulated SMM (i.e. with IF=0) and with a request to inject an IRQ (i.e. with a valid VM-entry interrupt info field). This is invalid guest state (SDM 26.3.1.4 "Check on Guest RIP and RFLAGS") and the processor fails vmentry. The fix is to defer the injection from KVM_REQ_SMI to KVM_REQ_EVENT, like we already do for e.g. NMIs. This patch doesn't change the name of the process_smi function so that it can be applied to stable releases. The next patch will modify the names so that process_nmi and process_smi handle respectively KVM_REQ_NMI and KVM_REQ_SMI. This is especially common with Windows, probably due to the self-IPI trick that it uses to deliver deferred procedure calls (DPCs). Reported-by: Laszlo Ersek Reported-by: Michał Zegan Fixes: 64d6067057d9658acb8675afcfba549abdb7fc16 Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 45 +++++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 902d9da12392..5a26f8c066fa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -91,6 +91,7 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); +static void process_smi(struct kvm_vcpu *vcpu); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); struct kvm_x86_ops *kvm_x86_ops __read_mostly; @@ -5302,13 +5303,8 @@ static void kvm_smm_changed(struct kvm_vcpu *vcpu) /* This is a good place to trace that we are exiting SMM. */ trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false); - if (unlikely(vcpu->arch.smi_pending)) { - kvm_make_request(KVM_REQ_SMI, vcpu); - vcpu->arch.smi_pending = 0; - } else { - /* Process a latched INIT, if any. */ - kvm_make_request(KVM_REQ_EVENT, vcpu); - } + /* Process a latched INIT or SMI, if any. */ + kvm_make_request(KVM_REQ_EVENT, vcpu); } kvm_mmu_reset_context(vcpu); @@ -6108,7 +6104,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) } /* try to inject new event if pending */ - if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) { + if (vcpu->arch.smi_pending && !is_smm(vcpu)) { + vcpu->arch.smi_pending = false; + process_smi(vcpu); + } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) { --vcpu->arch.nmi_pending; vcpu->arch.nmi_injected = true; kvm_x86_ops->set_nmi(vcpu); @@ -6318,11 +6317,6 @@ static void process_smi(struct kvm_vcpu *vcpu) char buf[512]; u32 cr0; - if (is_smm(vcpu)) { - vcpu->arch.smi_pending = true; - return; - } - trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true); vcpu->arch.hflags |= HF_SMM_MASK; memset(buf, 0, 512); @@ -6385,6 +6379,12 @@ static void process_smi(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); } +static void process_smi_request(struct kvm_vcpu *vcpu) +{ + vcpu->arch.smi_pending = true; + kvm_make_request(KVM_REQ_EVENT, vcpu); +} + void kvm_make_scan_ioapic_request(struct kvm *kvm) { kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); @@ -6506,7 +6506,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) record_steal_time(vcpu); if (kvm_check_request(KVM_REQ_SMI, vcpu)) - process_smi(vcpu); + process_smi_request(vcpu); if (kvm_check_request(KVM_REQ_NMI, vcpu)) process_nmi(vcpu); if (kvm_check_request(KVM_REQ_PMU, vcpu)) @@ -6579,8 +6579,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (inject_pending_event(vcpu, req_int_win) != 0) req_immediate_exit = true; - /* enable NMI/IRQ window open exits if needed */ else { + /* Enable NMI/IRQ window open exits if needed. + * + * SMIs have two cases: 1) they can be nested, and + * then there is nothing to do here because RSM will + * cause a vmexit anyway; 2) or the SMI can be pending + * because inject_pending_event has completed the + * injection of an IRQ or NMI from the previous vmexit, + * and then we request an immediate exit to inject the SMI. + */ + if (vcpu->arch.smi_pending && !is_smm(vcpu)) + req_immediate_exit = true; if (vcpu->arch.nmi_pending) kvm_x86_ops->enable_nmi_window(vcpu); if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) @@ -6631,8 +6641,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_load_guest_xcr0(vcpu); - if (req_immediate_exit) + if (req_immediate_exit) { + kvm_make_request(KVM_REQ_EVENT, vcpu); smp_send_reschedule(vcpu->cpu); + } trace_kvm_entry(vcpu->vcpu_id); wait_lapic_expire(vcpu); @@ -7433,6 +7445,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { vcpu->arch.hflags = 0; + vcpu->arch.smi_pending = 0; atomic_set(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = 0; vcpu->arch.nmi_injected = false; From ee2cd4b7555e3a629f399c3ef228ceb42067e7af Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 1 Jun 2016 22:26:01 +0200 Subject: [PATCH 002/302] KVM: x86: rename process_smi to enter_smm, process_smi_request to process_smi MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the function names more similar between KVM_REQ_NMI and KVM_REQ_SMI. Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5a26f8c066fa..1785415ebff3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -91,7 +91,7 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); -static void process_smi(struct kvm_vcpu *vcpu); +static void enter_smm(struct kvm_vcpu *vcpu); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); struct kvm_x86_ops *kvm_x86_ops __read_mostly; @@ -6106,7 +6106,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) /* try to inject new event if pending */ if (vcpu->arch.smi_pending && !is_smm(vcpu)) { vcpu->arch.smi_pending = false; - process_smi(vcpu); + enter_smm(vcpu); } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) { --vcpu->arch.nmi_pending; vcpu->arch.nmi_injected = true; @@ -6130,6 +6130,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) kvm_x86_ops->set_irq(vcpu); } } + return 0; } @@ -6153,7 +6154,7 @@ static void process_nmi(struct kvm_vcpu *vcpu) #define put_smstate(type, buf, offset, val) \ *(type *)((buf) + (offset) - 0x7e00) = val -static u32 process_smi_get_segment_flags(struct kvm_segment *seg) +static u32 enter_smm_get_segment_flags(struct kvm_segment *seg) { u32 flags = 0; flags |= seg->g << 23; @@ -6167,7 +6168,7 @@ static u32 process_smi_get_segment_flags(struct kvm_segment *seg) return flags; } -static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n) +static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n) { struct kvm_segment seg; int offset; @@ -6182,11 +6183,11 @@ static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n) put_smstate(u32, buf, offset + 8, seg.base); put_smstate(u32, buf, offset + 4, seg.limit); - put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg)); + put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg)); } #ifdef CONFIG_X86_64 -static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) +static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) { struct kvm_segment seg; int offset; @@ -6195,7 +6196,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) kvm_get_segment(vcpu, &seg, n); offset = 0x7e00 + n * 16; - flags = process_smi_get_segment_flags(&seg) >> 8; + flags = enter_smm_get_segment_flags(&seg) >> 8; put_smstate(u16, buf, offset, seg.selector); put_smstate(u16, buf, offset + 2, flags); put_smstate(u32, buf, offset + 4, seg.limit); @@ -6203,7 +6204,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) } #endif -static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf) +static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf) { struct desc_ptr dt; struct kvm_segment seg; @@ -6227,13 +6228,13 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7fc4, seg.selector); put_smstate(u32, buf, 0x7f64, seg.base); put_smstate(u32, buf, 0x7f60, seg.limit); - put_smstate(u32, buf, 0x7f5c, process_smi_get_segment_flags(&seg)); + put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg)); kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); put_smstate(u32, buf, 0x7fc0, seg.selector); put_smstate(u32, buf, 0x7f80, seg.base); put_smstate(u32, buf, 0x7f7c, seg.limit); - put_smstate(u32, buf, 0x7f78, process_smi_get_segment_flags(&seg)); + put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg)); kvm_x86_ops->get_gdt(vcpu, &dt); put_smstate(u32, buf, 0x7f74, dt.address); @@ -6244,7 +6245,7 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7f54, dt.size); for (i = 0; i < 6; i++) - process_smi_save_seg_32(vcpu, buf, i); + enter_smm_save_seg_32(vcpu, buf, i); put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu)); @@ -6253,7 +6254,7 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase); } -static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf) +static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) { #ifdef CONFIG_X86_64 struct desc_ptr dt; @@ -6285,7 +6286,7 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf) kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); put_smstate(u16, buf, 0x7e90, seg.selector); - put_smstate(u16, buf, 0x7e92, process_smi_get_segment_flags(&seg) >> 8); + put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8); put_smstate(u32, buf, 0x7e94, seg.limit); put_smstate(u64, buf, 0x7e98, seg.base); @@ -6295,7 +6296,7 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf) kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); put_smstate(u16, buf, 0x7e70, seg.selector); - put_smstate(u16, buf, 0x7e72, process_smi_get_segment_flags(&seg) >> 8); + put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8); put_smstate(u32, buf, 0x7e74, seg.limit); put_smstate(u64, buf, 0x7e78, seg.base); @@ -6304,13 +6305,13 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf) put_smstate(u64, buf, 0x7e68, dt.address); for (i = 0; i < 6; i++) - process_smi_save_seg_64(vcpu, buf, i); + enter_smm_save_seg_64(vcpu, buf, i); #else WARN_ON_ONCE(1); #endif } -static void process_smi(struct kvm_vcpu *vcpu) +static void enter_smm(struct kvm_vcpu *vcpu) { struct kvm_segment cs, ds; struct desc_ptr dt; @@ -6321,9 +6322,9 @@ static void process_smi(struct kvm_vcpu *vcpu) vcpu->arch.hflags |= HF_SMM_MASK; memset(buf, 0, 512); if (guest_cpuid_has_longmode(vcpu)) - process_smi_save_state_64(vcpu, buf); + enter_smm_save_state_64(vcpu, buf); else - process_smi_save_state_32(vcpu, buf); + enter_smm_save_state_32(vcpu, buf); kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); @@ -6379,7 +6380,7 @@ static void process_smi(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); } -static void process_smi_request(struct kvm_vcpu *vcpu) +static void process_smi(struct kvm_vcpu *vcpu) { vcpu->arch.smi_pending = true; kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -6506,7 +6507,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) record_steal_time(vcpu); if (kvm_check_request(KVM_REQ_SMI, vcpu)) - process_smi_request(vcpu); + process_smi(vcpu); if (kvm_check_request(KVM_REQ_NMI, vcpu)) process_nmi(vcpu); if (kvm_check_request(KVM_REQ_PMU, vcpu)) From 250715a6171a076748be8ab88b274e72f0cfb435 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 1 Jun 2016 14:09:24 +0200 Subject: [PATCH 003/302] KVM: x86: protect KVM_CREATE_PIT/KVM_CREATE_PIT2 with kvm->lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The syzkaller folks reported a NULL pointer dereference that seems to be cause by a race between KVM_CREATE_IRQCHIP and KVM_CREATE_PIT2. The former takes kvm->lock (except when registering the devices, which needs kvm->slots_lock); the latter takes kvm->slots_lock only. Change KVM_CREATE_PIT2 to follow the same model as KVM_CREATE_IRQCHIP. Testcase: #include #include #include #include #include #include #include #include #include long r[23]; void* thr1(void* arg) { struct kvm_pit_config pitcfg = { .flags = 4 }; switch ((long)arg) { case 0: r[2] = open("/dev/kvm", O_RDONLY|O_ASYNC); break; case 1: r[3] = ioctl(r[2], KVM_CREATE_VM, 0); break; case 2: r[4] = ioctl(r[3], KVM_CREATE_IRQCHIP, 0); break; case 3: r[22] = ioctl(r[3], KVM_CREATE_PIT2, &pitcfg); break; } return 0; } int main(int argc, char **argv) { long i; pthread_t th[4]; memset(r, -1, sizeof(r)); for (i = 0; i < 4; i++) { pthread_create(&th[i], 0, thr, (void*)i); if (argc > 1 && rand()%2) usleep(rand()%1000); } usleep(20000); return 0; } Reported-by: Dmitry Vyukov Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/i8254.c | 4 +++- arch/x86/kvm/x86.c | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index a4bf5b45d65a..5fb6c620180e 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -645,7 +645,6 @@ static const struct kvm_io_device_ops speaker_dev_ops = { .write = speaker_ioport_write, }; -/* Caller must hold slots_lock */ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) { struct kvm_pit *pit; @@ -690,6 +689,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) kvm_pit_set_reinject(pit, true); + mutex_lock(&kvm->slots_lock); kvm_iodevice_init(&pit->dev, &pit_dev_ops); ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS, KVM_PIT_MEM_LENGTH, &pit->dev); @@ -704,12 +704,14 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) if (ret < 0) goto fail_register_speaker; } + mutex_unlock(&kvm->slots_lock); return pit; fail_register_speaker: kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); fail_register_pit: + mutex_unlock(&kvm->slots_lock); kvm_pit_set_reinject(pit, false); kthread_stop(pit->worker_task); fail_kthread: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1785415ebff3..9d6a30593655 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3879,7 +3879,7 @@ long kvm_arch_vm_ioctl(struct file *filp, sizeof(struct kvm_pit_config))) goto out; create_pit: - mutex_lock(&kvm->slots_lock); + mutex_lock(&kvm->lock); r = -EEXIST; if (kvm->arch.vpit) goto create_pit_unlock; @@ -3888,7 +3888,7 @@ long kvm_arch_vm_ioctl(struct file *filp, if (kvm->arch.vpit) r = 0; create_pit_unlock: - mutex_unlock(&kvm->slots_lock); + mutex_unlock(&kvm->lock); break; case KVM_GET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ From dca4d728773a2f48e999c8617524bbf8dee4807f Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Tue, 31 May 2016 13:21:14 +0800 Subject: [PATCH 004/302] kvm/x86: remove unnecessary header file inclusion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit arch/x86/kvm/iommu.c includes and , which both are unnecessary, in fact incorrect to be here as they are intel specific. Building kvm on x86 passed after removing above inclusion. Signed-off-by: Kai Huang Signed-off-by: Radim Krčmář --- arch/x86/kvm/iommu.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c index 3069281904d3..4f2010c5feba 100644 --- a/arch/x86/kvm/iommu.c +++ b/arch/x86/kvm/iommu.c @@ -28,9 +28,7 @@ #include #include #include -#include #include -#include #include "assigned-dev.h" static bool allow_unsafe_assigned_interrupts; From e65f30e0cb29694c4241bd9c96ea9413938fcec5 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Thu, 4 Feb 2016 10:24:52 +0100 Subject: [PATCH 005/302] s390: hypfs: Move diag implementation and data definitions Diag 204 data and function definitions currently live in the hypfs files. As KVM will be a consumer of this data, we need to make it publicly available and move it to the appropriate diag.{c,h} files. __attribute__ ((packed)) occurences were replaced with __packed for all moved structs. Signed-off-by: Janosch Frank Reviewed-by: David Hildenbrand Acked-by: Michael Holzheu Signed-off-by: Christian Borntraeger --- arch/s390/hypfs/hypfs_diag.c | 361 +++++++++++------------------------ arch/s390/include/asm/diag.h | 127 ++++++++++++ arch/s390/kernel/diag.c | 22 +++ 3 files changed, 256 insertions(+), 254 deletions(-) diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c index 045035796ca7..1e28414d7275 100644 --- a/arch/s390/hypfs/hypfs_diag.c +++ b/arch/s390/hypfs/hypfs_diag.c @@ -19,29 +19,10 @@ #include #include "hypfs.h" -#define LPAR_NAME_LEN 8 /* lpar name len in diag 204 data */ -#define CPU_NAME_LEN 16 /* type name len of cpus in diag224 name table */ #define TMP_SIZE 64 /* size of temporary buffers */ #define DBFS_D204_HDR_VERSION 0 -/* diag 204 subcodes */ -enum diag204_sc { - SUBC_STIB4 = 4, - SUBC_RSI = 5, - SUBC_STIB6 = 6, - SUBC_STIB7 = 7 -}; - -/* The two available diag 204 data formats */ -enum diag204_format { - INFO_SIMPLE = 0, - INFO_EXT = 0x00010000 -}; - -/* bit is set in flags, when physical cpu info is included in diag 204 data */ -#define LPAR_PHYS_FLG 0x80 - static char *diag224_cpu_names; /* diag 224 name table */ static enum diag204_sc diag204_store_sc; /* used subcode for store */ static enum diag204_format diag204_info_type; /* used diag 204 data format */ @@ -53,7 +34,7 @@ static int diag204_buf_pages; /* number of pages for diag204 data */ static struct dentry *dbfs_d204_file; /* - * DIAG 204 data structures and member access functions. + * DIAG 204 member access functions. * * Since we have two different diag 204 data formats for old and new s390 * machines, we do not access the structs directly, but use getter functions for @@ -62,302 +43,173 @@ static struct dentry *dbfs_d204_file; /* Time information block */ -struct info_blk_hdr { - __u8 npar; - __u8 flags; - __u16 tslice; - __u16 phys_cpus; - __u16 this_part; - __u64 curtod; -} __attribute__ ((packed)); - -struct x_info_blk_hdr { - __u8 npar; - __u8 flags; - __u16 tslice; - __u16 phys_cpus; - __u16 this_part; - __u64 curtod1; - __u64 curtod2; - char reserved[40]; -} __attribute__ ((packed)); - static inline int info_blk_hdr__size(enum diag204_format type) { - if (type == INFO_SIMPLE) - return sizeof(struct info_blk_hdr); - else /* INFO_EXT */ - return sizeof(struct x_info_blk_hdr); + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_info_blk_hdr); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_info_blk_hdr); } static inline __u8 info_blk_hdr__npar(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct info_blk_hdr *)hdr)->npar; - else /* INFO_EXT */ - return ((struct x_info_blk_hdr *)hdr)->npar; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_info_blk_hdr *)hdr)->npar; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_info_blk_hdr *)hdr)->npar; } static inline __u8 info_blk_hdr__flags(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct info_blk_hdr *)hdr)->flags; - else /* INFO_EXT */ - return ((struct x_info_blk_hdr *)hdr)->flags; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_info_blk_hdr *)hdr)->flags; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_info_blk_hdr *)hdr)->flags; } static inline __u16 info_blk_hdr__pcpus(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct info_blk_hdr *)hdr)->phys_cpus; - else /* INFO_EXT */ - return ((struct x_info_blk_hdr *)hdr)->phys_cpus; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_info_blk_hdr *)hdr)->phys_cpus; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_info_blk_hdr *)hdr)->phys_cpus; } /* Partition header */ -struct part_hdr { - __u8 pn; - __u8 cpus; - char reserved[6]; - char part_name[LPAR_NAME_LEN]; -} __attribute__ ((packed)); - -struct x_part_hdr { - __u8 pn; - __u8 cpus; - __u8 rcpus; - __u8 pflag; - __u32 mlu; - char part_name[LPAR_NAME_LEN]; - char lpc_name[8]; - char os_name[8]; - __u64 online_cs; - __u64 online_es; - __u8 upid; - char reserved1[3]; - __u32 group_mlu; - char group_name[8]; - char reserved2[32]; -} __attribute__ ((packed)); - static inline int part_hdr__size(enum diag204_format type) { - if (type == INFO_SIMPLE) - return sizeof(struct part_hdr); - else /* INFO_EXT */ - return sizeof(struct x_part_hdr); + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_part_hdr); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_part_hdr); } static inline __u8 part_hdr__rcpus(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct part_hdr *)hdr)->cpus; - else /* INFO_EXT */ - return ((struct x_part_hdr *)hdr)->rcpus; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_part_hdr *)hdr)->cpus; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_part_hdr *)hdr)->rcpus; } static inline void part_hdr__part_name(enum diag204_format type, void *hdr, char *name) { - if (type == INFO_SIMPLE) - memcpy(name, ((struct part_hdr *)hdr)->part_name, - LPAR_NAME_LEN); - else /* INFO_EXT */ - memcpy(name, ((struct x_part_hdr *)hdr)->part_name, - LPAR_NAME_LEN); - EBCASC(name, LPAR_NAME_LEN); - name[LPAR_NAME_LEN] = 0; + if (type == DIAG204_INFO_SIMPLE) + memcpy(name, ((struct diag204_part_hdr *)hdr)->part_name, + DIAG204_LPAR_NAME_LEN); + else /* DIAG204_INFO_EXT */ + memcpy(name, ((struct diag204_x_part_hdr *)hdr)->part_name, + DIAG204_LPAR_NAME_LEN); + EBCASC(name, DIAG204_LPAR_NAME_LEN); + name[DIAG204_LPAR_NAME_LEN] = 0; strim(name); } -struct cpu_info { - __u16 cpu_addr; - char reserved1[2]; - __u8 ctidx; - __u8 cflag; - __u16 weight; - __u64 acc_time; - __u64 lp_time; -} __attribute__ ((packed)); - -struct x_cpu_info { - __u16 cpu_addr; - char reserved1[2]; - __u8 ctidx; - __u8 cflag; - __u16 weight; - __u64 acc_time; - __u64 lp_time; - __u16 min_weight; - __u16 cur_weight; - __u16 max_weight; - char reseved2[2]; - __u64 online_time; - __u64 wait_time; - __u32 pma_weight; - __u32 polar_weight; - char reserved3[40]; -} __attribute__ ((packed)); - /* CPU info block */ static inline int cpu_info__size(enum diag204_format type) { - if (type == INFO_SIMPLE) - return sizeof(struct cpu_info); - else /* INFO_EXT */ - return sizeof(struct x_cpu_info); + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_cpu_info); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_cpu_info); } static inline __u8 cpu_info__ctidx(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct cpu_info *)hdr)->ctidx; - else /* INFO_EXT */ - return ((struct x_cpu_info *)hdr)->ctidx; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->ctidx; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->ctidx; } static inline __u16 cpu_info__cpu_addr(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct cpu_info *)hdr)->cpu_addr; - else /* INFO_EXT */ - return ((struct x_cpu_info *)hdr)->cpu_addr; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->cpu_addr; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->cpu_addr; } static inline __u64 cpu_info__acc_time(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct cpu_info *)hdr)->acc_time; - else /* INFO_EXT */ - return ((struct x_cpu_info *)hdr)->acc_time; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->acc_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->acc_time; } static inline __u64 cpu_info__lp_time(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct cpu_info *)hdr)->lp_time; - else /* INFO_EXT */ - return ((struct x_cpu_info *)hdr)->lp_time; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_cpu_info *)hdr)->lp_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->lp_time; } static inline __u64 cpu_info__online_time(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) + if (type == DIAG204_INFO_SIMPLE) return 0; /* online_time not available in simple info */ - else /* INFO_EXT */ - return ((struct x_cpu_info *)hdr)->online_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_cpu_info *)hdr)->online_time; } /* Physical header */ -struct phys_hdr { - char reserved1[1]; - __u8 cpus; - char reserved2[6]; - char mgm_name[8]; -} __attribute__ ((packed)); - -struct x_phys_hdr { - char reserved1[1]; - __u8 cpus; - char reserved2[6]; - char mgm_name[8]; - char reserved3[80]; -} __attribute__ ((packed)); - static inline int phys_hdr__size(enum diag204_format type) { - if (type == INFO_SIMPLE) - return sizeof(struct phys_hdr); - else /* INFO_EXT */ - return sizeof(struct x_phys_hdr); + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_phys_hdr); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_phys_hdr); } static inline __u8 phys_hdr__cpus(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct phys_hdr *)hdr)->cpus; - else /* INFO_EXT */ - return ((struct x_phys_hdr *)hdr)->cpus; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_hdr *)hdr)->cpus; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_hdr *)hdr)->cpus; } /* Physical CPU info block */ -struct phys_cpu { - __u16 cpu_addr; - char reserved1[2]; - __u8 ctidx; - char reserved2[3]; - __u64 mgm_time; - char reserved3[8]; -} __attribute__ ((packed)); - -struct x_phys_cpu { - __u16 cpu_addr; - char reserved1[2]; - __u8 ctidx; - char reserved2[3]; - __u64 mgm_time; - char reserved3[80]; -} __attribute__ ((packed)); - static inline int phys_cpu__size(enum diag204_format type) { - if (type == INFO_SIMPLE) - return sizeof(struct phys_cpu); - else /* INFO_EXT */ - return sizeof(struct x_phys_cpu); + if (type == DIAG204_INFO_SIMPLE) + return sizeof(struct diag204_phys_cpu); + else /* DIAG204_INFO_EXT */ + return sizeof(struct diag204_x_phys_cpu); } static inline __u16 phys_cpu__cpu_addr(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct phys_cpu *)hdr)->cpu_addr; - else /* INFO_EXT */ - return ((struct x_phys_cpu *)hdr)->cpu_addr; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_cpu *)hdr)->cpu_addr; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_cpu *)hdr)->cpu_addr; } static inline __u64 phys_cpu__mgm_time(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct phys_cpu *)hdr)->mgm_time; - else /* INFO_EXT */ - return ((struct x_phys_cpu *)hdr)->mgm_time; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_cpu *)hdr)->mgm_time; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_cpu *)hdr)->mgm_time; } static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr) { - if (type == INFO_SIMPLE) - return ((struct phys_cpu *)hdr)->ctidx; - else /* INFO_EXT */ - return ((struct x_phys_cpu *)hdr)->ctidx; + if (type == DIAG204_INFO_SIMPLE) + return ((struct diag204_phys_cpu *)hdr)->ctidx; + else /* DIAG204_INFO_EXT */ + return ((struct diag204_x_phys_cpu *)hdr)->ctidx; } /* Diagnose 204 functions */ - -static inline int __diag204(unsigned long subcode, unsigned long size, void *addr) -{ - register unsigned long _subcode asm("0") = subcode; - register unsigned long _size asm("1") = size; - - asm volatile( - " diag %2,%0,0x204\n" - "0:\n" - EX_TABLE(0b,0b) - : "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory"); - if (_subcode) - return -1; - return _size; -} - -static int diag204(unsigned long subcode, unsigned long size, void *addr) -{ - diag_stat_inc(DIAG_STAT_X204); - return __diag204(subcode, size, addr); -} - /* * For the old diag subcode 4 with simple data format we have to use real * memory. If we use subcode 6 or 7 with extended data format, we can (and @@ -409,12 +261,12 @@ static void *diag204_get_buffer(enum diag204_format fmt, int *pages) *pages = diag204_buf_pages; return diag204_buf; } - if (fmt == INFO_SIMPLE) { + if (fmt == DIAG204_INFO_SIMPLE) { *pages = 1; return diag204_alloc_rbuf(); - } else {/* INFO_EXT */ - *pages = diag204((unsigned long)SUBC_RSI | - (unsigned long)INFO_EXT, 0, NULL); + } else {/* DIAG204_INFO_EXT */ + *pages = diag204((unsigned long)DIAG204_SUBC_RSI | + (unsigned long)DIAG204_INFO_EXT, 0, NULL); if (*pages <= 0) return ERR_PTR(-ENOSYS); else @@ -441,18 +293,18 @@ static int diag204_probe(void) void *buf; int pages, rc; - buf = diag204_get_buffer(INFO_EXT, &pages); + buf = diag204_get_buffer(DIAG204_INFO_EXT, &pages); if (!IS_ERR(buf)) { - if (diag204((unsigned long)SUBC_STIB7 | - (unsigned long)INFO_EXT, pages, buf) >= 0) { - diag204_store_sc = SUBC_STIB7; - diag204_info_type = INFO_EXT; + if (diag204((unsigned long)DIAG204_SUBC_STIB7 | + (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) { + diag204_store_sc = DIAG204_SUBC_STIB7; + diag204_info_type = DIAG204_INFO_EXT; goto out; } - if (diag204((unsigned long)SUBC_STIB6 | - (unsigned long)INFO_EXT, pages, buf) >= 0) { - diag204_store_sc = SUBC_STIB6; - diag204_info_type = INFO_EXT; + if (diag204((unsigned long)DIAG204_SUBC_STIB6 | + (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) { + diag204_store_sc = DIAG204_SUBC_STIB6; + diag204_info_type = DIAG204_INFO_EXT; goto out; } diag204_free_buffer(); @@ -460,15 +312,15 @@ static int diag204_probe(void) /* subcodes 6 and 7 failed, now try subcode 4 */ - buf = diag204_get_buffer(INFO_SIMPLE, &pages); + buf = diag204_get_buffer(DIAG204_INFO_SIMPLE, &pages); if (IS_ERR(buf)) { rc = PTR_ERR(buf); goto fail_alloc; } - if (diag204((unsigned long)SUBC_STIB4 | - (unsigned long)INFO_SIMPLE, pages, buf) >= 0) { - diag204_store_sc = SUBC_STIB4; - diag204_info_type = INFO_SIMPLE; + if (diag204((unsigned long)DIAG204_SUBC_STIB4 | + (unsigned long)DIAG204_INFO_SIMPLE, pages, buf) >= 0) { + diag204_store_sc = DIAG204_SUBC_STIB4; + diag204_info_type = DIAG204_INFO_SIMPLE; goto out; } else { rc = -ENOSYS; @@ -543,9 +395,9 @@ static void diag224_delete_name_table(void) static int diag224_idx2name(int index, char *name) { - memcpy(name, diag224_cpu_names + ((index + 1) * CPU_NAME_LEN), - CPU_NAME_LEN); - name[CPU_NAME_LEN] = 0; + memcpy(name, diag224_cpu_names + ((index + 1) * DIAG204_CPU_NAME_LEN), + DIAG204_CPU_NAME_LEN); + name[DIAG204_CPU_NAME_LEN] = 0; strim(name); return 0; } @@ -601,7 +453,7 @@ __init int hypfs_diag_init(void) pr_err("The hardware system does not support hypfs\n"); return -ENODATA; } - if (diag204_info_type == INFO_EXT) { + if (diag204_info_type == DIAG204_INFO_EXT) { rc = hypfs_dbfs_create_file(&dbfs_file_d204); if (rc) return rc; @@ -649,7 +501,7 @@ static int hypfs_create_cpu_files(struct dentry *cpus_dir, void *cpu_info) cpu_info__lp_time(diag204_info_type, cpu_info)); if (IS_ERR(rc)) return PTR_ERR(rc); - if (diag204_info_type == INFO_EXT) { + if (diag204_info_type == DIAG204_INFO_EXT) { rc = hypfs_create_u64(cpu_dir, "onlinetime", cpu_info__online_time(diag204_info_type, cpu_info)); @@ -665,12 +517,12 @@ static void *hypfs_create_lpar_files(struct dentry *systems_dir, void *part_hdr) { struct dentry *cpus_dir; struct dentry *lpar_dir; - char lpar_name[LPAR_NAME_LEN + 1]; + char lpar_name[DIAG204_LPAR_NAME_LEN + 1]; void *cpu_info; int i; part_hdr__part_name(diag204_info_type, part_hdr, lpar_name); - lpar_name[LPAR_NAME_LEN] = 0; + lpar_name[DIAG204_LPAR_NAME_LEN] = 0; lpar_dir = hypfs_mkdir(systems_dir, lpar_name); if (IS_ERR(lpar_dir)) return lpar_dir; @@ -753,7 +605,8 @@ int hypfs_diag_create_files(struct dentry *root) goto err_out; } } - if (info_blk_hdr__flags(diag204_info_type, time_hdr) & LPAR_PHYS_FLG) { + if (info_blk_hdr__flags(diag204_info_type, time_hdr) & + DIAG204_LPAR_PHYS_FLG) { ptr = hypfs_create_phys_files(root, part_hdr); if (IS_ERR(ptr)) { rc = PTR_ERR(ptr); diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h index 5fac921c1c42..f72744f14e31 100644 --- a/arch/s390/include/asm/diag.h +++ b/arch/s390/include/asm/diag.h @@ -78,4 +78,131 @@ struct diag210 { extern int diag210(struct diag210 *addr); +/* bit is set in flags, when physical cpu info is included in diag 204 data */ +#define DIAG204_LPAR_PHYS_FLG 0x80 +#define DIAG204_LPAR_NAME_LEN 8 /* lpar name len in diag 204 data */ +#define DIAG204_CPU_NAME_LEN 16 /* type name len of cpus in diag224 name table */ + +/* diag 204 subcodes */ +enum diag204_sc { + DIAG204_SUBC_STIB4 = 4, + DIAG204_SUBC_RSI = 5, + DIAG204_SUBC_STIB6 = 6, + DIAG204_SUBC_STIB7 = 7 +}; + +/* The two available diag 204 data formats */ +enum diag204_format { + DIAG204_INFO_SIMPLE = 0, + DIAG204_INFO_EXT = 0x00010000 +}; + +struct diag204_info_blk_hdr { + __u8 npar; + __u8 flags; + __u16 tslice; + __u16 phys_cpus; + __u16 this_part; + __u64 curtod; +} __packed; + +struct diag204_x_info_blk_hdr { + __u8 npar; + __u8 flags; + __u16 tslice; + __u16 phys_cpus; + __u16 this_part; + __u64 curtod1; + __u64 curtod2; + char reserved[40]; +} __packed; + +struct diag204_part_hdr { + __u8 pn; + __u8 cpus; + char reserved[6]; + char part_name[DIAG204_LPAR_NAME_LEN]; +} __packed; + +struct diag204_x_part_hdr { + __u8 pn; + __u8 cpus; + __u8 rcpus; + __u8 pflag; + __u32 mlu; + char part_name[DIAG204_LPAR_NAME_LEN]; + char lpc_name[8]; + char os_name[8]; + __u64 online_cs; + __u64 online_es; + __u8 upid; + char reserved1[3]; + __u32 group_mlu; + char group_name[8]; + char reserved2[32]; +} __packed; + +struct diag204_cpu_info { + __u16 cpu_addr; + char reserved1[2]; + __u8 ctidx; + __u8 cflag; + __u16 weight; + __u64 acc_time; + __u64 lp_time; +} __packed; + +struct diag204_x_cpu_info { + __u16 cpu_addr; + char reserved1[2]; + __u8 ctidx; + __u8 cflag; + __u16 weight; + __u64 acc_time; + __u64 lp_time; + __u16 min_weight; + __u16 cur_weight; + __u16 max_weight; + char reseved2[2]; + __u64 online_time; + __u64 wait_time; + __u32 pma_weight; + __u32 polar_weight; + char reserved3[40]; +} __packed; + +struct diag204_phys_hdr { + char reserved1[1]; + __u8 cpus; + char reserved2[6]; + char mgm_name[8]; +} __packed; + +struct diag204_x_phys_hdr { + char reserved1[1]; + __u8 cpus; + char reserved2[6]; + char mgm_name[8]; + char reserved3[80]; +} __packed; + +struct diag204_phys_cpu { + __u16 cpu_addr; + char reserved1[2]; + __u8 ctidx; + char reserved2[3]; + __u64 mgm_time; + char reserved3[8]; +} __packed; + +struct diag204_x_phys_cpu { + __u16 cpu_addr; + char reserved1[2]; + __u8 ctidx; + char reserved2[3]; + __u64 mgm_time; + char reserved3[80]; +} __packed; + +int diag204(unsigned long subcode, unsigned long size, void *addr); #endif /* _ASM_S390_DIAG_H */ diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c index 48b37b8357e6..f4ce4a248811 100644 --- a/arch/s390/kernel/diag.c +++ b/arch/s390/kernel/diag.c @@ -162,6 +162,28 @@ int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode) } EXPORT_SYMBOL(diag14); +static inline int __diag204(unsigned long subcode, unsigned long size, void *addr) +{ + register unsigned long _subcode asm("0") = subcode; + register unsigned long _size asm("1") = size; + + asm volatile( + " diag %2,%0,0x204\n" + "0:\n" + EX_TABLE(0b,0b) + : "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory"); + if (_subcode) + return -1; + return _size; +} + +int diag204(unsigned long subcode, unsigned long size, void *addr) +{ + diag_stat_inc(DIAG_STAT_X204); + return __diag204(subcode, size, addr); +} +EXPORT_SYMBOL(diag204); + /* * Diagnose 210: Get information about a virtual device */ From e435dc31398e63b992639cf62024d959219db191 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Mon, 8 Feb 2016 13:36:22 +0100 Subject: [PATCH 006/302] s390: Make cpc_name accessible sclp_ocf.c is the only way to get the cpc name, as it registers the sole event handler for the ocf event. By creating a new global function that copies that name, we make it accessible to the world which longs to retrieve it. Additionally we now also store the cpc name as EBCDIC, so we don't have to convert it to and from ASCII if it is requested in native encoding. Signed-off-by: Janosch Frank Reviewed-by: David Hildenbrand Acked-by: Heiko Carstens Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/sclp.h | 1 + drivers/s390/char/sclp_ocf.c | 23 +++++++++++++++-------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index e4f6f73afe2f..49736a0d4e0e 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -101,5 +101,6 @@ int memcpy_hsa_kernel(void *dest, unsigned long src, size_t count); int memcpy_hsa_user(void __user *dest, unsigned long src, size_t count); void sclp_early_detect(void); void _sclp_print_early(const char *); +void sclp_ocf_cpc_name_copy(char *dst); #endif /* _ASM_S390_SCLP_H */ diff --git a/drivers/s390/char/sclp_ocf.c b/drivers/s390/char/sclp_ocf.c index 2553db0fdb52..f59b71776bbd 100644 --- a/drivers/s390/char/sclp_ocf.c +++ b/drivers/s390/char/sclp_ocf.c @@ -26,7 +26,7 @@ #define OCF_LENGTH_CPC_NAME 8UL static char hmc_network[OCF_LENGTH_HMC_NETWORK + 1]; -static char cpc_name[OCF_LENGTH_CPC_NAME + 1]; +static char cpc_name[OCF_LENGTH_CPC_NAME]; /* in EBCDIC */ static DEFINE_SPINLOCK(sclp_ocf_lock); static struct work_struct sclp_ocf_change_work; @@ -72,9 +72,8 @@ static void sclp_ocf_handler(struct evbuf_header *evbuf) } if (cpc) { size = min(OCF_LENGTH_CPC_NAME, (size_t) cpc->length); + memset(cpc_name, 0, OCF_LENGTH_CPC_NAME); memcpy(cpc_name, cpc + 1, size); - EBCASC(cpc_name, size); - cpc_name[size] = 0; } spin_unlock(&sclp_ocf_lock); schedule_work(&sclp_ocf_change_work); @@ -85,15 +84,23 @@ static struct sclp_register sclp_ocf_event = { .receiver_fn = sclp_ocf_handler, }; +void sclp_ocf_cpc_name_copy(char *dst) +{ + spin_lock_irq(&sclp_ocf_lock); + memcpy(dst, cpc_name, OCF_LENGTH_CPC_NAME); + spin_unlock_irq(&sclp_ocf_lock); +} +EXPORT_SYMBOL(sclp_ocf_cpc_name_copy); + static ssize_t cpc_name_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { - int rc; + char name[OCF_LENGTH_CPC_NAME + 1]; - spin_lock_irq(&sclp_ocf_lock); - rc = snprintf(page, PAGE_SIZE, "%s\n", cpc_name); - spin_unlock_irq(&sclp_ocf_lock); - return rc; + sclp_ocf_cpc_name_copy(name); + name[OCF_LENGTH_CPC_NAME] = 0; + EBCASC(name, OCF_LENGTH_CPC_NAME); + return snprintf(page, PAGE_SIZE, "%s\n", name); } static struct kobj_attribute cpc_name_attr = From 022bd2d11cc51f62e873a09bcae8016b10950194 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 12 Feb 2016 12:52:49 +0100 Subject: [PATCH 007/302] s390: Make diag224 public Diag204's cpu structures only contain the cpu type by means of an index in the diag224 name table. Hence, to be able to use diag204 in any meaningful way, we also need a usable diag224 interface. Signed-off-by: Janosch Frank Reviewed-by: Christian Borntraeger Reviewed-by: David Hildenbrand Acked-by: Heiko Carstens Signed-off-by: Christian Borntraeger --- arch/s390/hypfs/hypfs_diag.c | 14 -------------- arch/s390/include/asm/diag.h | 1 + arch/s390/kernel/diag.c | 15 +++++++++++++++ 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c index 1e28414d7275..28f03ca60100 100644 --- a/arch/s390/hypfs/hypfs_diag.c +++ b/arch/s390/hypfs/hypfs_diag.c @@ -360,20 +360,6 @@ static void *diag204_store(void) /* Diagnose 224 functions */ -static int diag224(void *ptr) -{ - int rc = -EOPNOTSUPP; - - diag_stat_inc(DIAG_STAT_X224); - asm volatile( - " diag %1,%2,0x224\n" - "0: lhi %0,0x0\n" - "1:\n" - EX_TABLE(0b,1b) - : "+d" (rc) :"d" (0), "d" (ptr) : "memory"); - return rc; -} - static int diag224_get_name_table(void) { /* memory must be below 2GB */ diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h index f72744f14e31..197e303a76e9 100644 --- a/arch/s390/include/asm/diag.h +++ b/arch/s390/include/asm/diag.h @@ -205,4 +205,5 @@ struct diag204_x_phys_cpu { } __packed; int diag204(unsigned long subcode, unsigned long size, void *addr); +int diag224(void *ptr); #endif /* _ASM_S390_DIAG_H */ diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c index f4ce4a248811..a44faf4a0454 100644 --- a/arch/s390/kernel/diag.c +++ b/arch/s390/kernel/diag.c @@ -218,3 +218,18 @@ int diag210(struct diag210 *addr) return ccode; } EXPORT_SYMBOL(diag210); + +int diag224(void *ptr) +{ + int rc = -EOPNOTSUPP; + + diag_stat_inc(DIAG_STAT_X224); + asm volatile( + " diag %1,%2,0x224\n" + "0: lhi %0,0x0\n" + "1:\n" + EX_TABLE(0b,1b) + : "+d" (rc) :"d" (0), "d" (ptr) : "memory"); + return rc; +} +EXPORT_SYMBOL(diag224); From a011eeb2a3d6cd778eb63bea0bf149ebbe658ab5 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Mon, 9 May 2016 14:14:01 +0200 Subject: [PATCH 008/302] KVM: s390: Add operation exception interception handler This commit introduces code that handles operation exception interceptions. With this handler we can emulate instructions by using illegal opcodes. Signed-off-by: Janosch Frank Reviewed-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 1 + arch/s390/kvm/intercept.c | 11 +++++++++++ arch/s390/kvm/kvm-s390.c | 1 + arch/s390/kvm/trace.h | 21 +++++++++++++++++++++ 4 files changed, 34 insertions(+) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 37b9017c6a96..093ea14109e2 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -255,6 +255,7 @@ struct kvm_vcpu_stat { u32 instruction_stctg; u32 exit_program_interruption; u32 exit_instr_and_program; + u32 exit_operation_exception; u32 deliver_external_call; u32 deliver_emergency_signal; u32 deliver_service_signal; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 2e6b54e4d3f9..09c13db1416f 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -349,6 +349,15 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu) return -EOPNOTSUPP; } +static int handle_operexc(struct kvm_vcpu *vcpu) +{ + vcpu->stat.exit_operation_exception++; + trace_kvm_s390_handle_operexc(vcpu, vcpu->arch.sie_block->ipa, + vcpu->arch.sie_block->ipb); + + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); +} + int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) { if (kvm_is_ucontrol(vcpu->kvm)) @@ -370,6 +379,8 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) return handle_validity(vcpu); case 0x28: return handle_stop(vcpu); + case 0x2c: + return handle_operexc(vcpu); case 0x38: return handle_partial_execution(vcpu); default: diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 6d8ec3ac9dd8..f0addece729e 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -63,6 +63,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "exit_instruction", VCPU_STAT(exit_instruction) }, { "exit_program_interruption", VCPU_STAT(exit_program_interruption) }, { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) }, + { "exit_operation_exception", VCPU_STAT(exit_operation_exception) }, { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h index 916834d7a73a..90d26a6aa52c 100644 --- a/arch/s390/kvm/trace.h +++ b/arch/s390/kvm/trace.h @@ -412,6 +412,27 @@ TRACE_EVENT(kvm_s390_handle_stsi, __entry->addr) ); +TRACE_EVENT(kvm_s390_handle_operexc, + TP_PROTO(VCPU_PROTO_COMMON, __u16 ipa, __u32 ipb), + TP_ARGS(VCPU_ARGS_COMMON, ipa, ipb), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(__u64, instruction) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->instruction = ((__u64)ipa << 48) | + ((__u64)ipb << 16); + ), + + VCPU_TP_PRINTK("operation exception on instruction %016llx (%s)", + __entry->instruction, + __print_symbolic(icpt_insn_decoder(__entry->instruction), + icpt_insn_codes)) + ); + #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ From a2d57b35c0226102b1f2ffdc2f719fcc30c99bf5 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Mon, 23 May 2016 15:09:19 +0200 Subject: [PATCH 009/302] KVM: s390: Extend diag 204 fields The new store hypervisor information instruction, which we are going to introduce, needs previously unused fields in diag 204 structures. Signed-off-by: Janosch Frank Acked-by: Heiko Carstens Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/diag.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h index 197e303a76e9..f4000cdb6921 100644 --- a/arch/s390/include/asm/diag.h +++ b/arch/s390/include/asm/diag.h @@ -97,6 +97,11 @@ enum diag204_format { DIAG204_INFO_EXT = 0x00010000 }; +enum diag204_cpu_flags { + DIAG204_CPU_ONLINE = 0x20, + DIAG204_CPU_CAPPED = 0x40, +}; + struct diag204_info_blk_hdr { __u8 npar; __u8 flags; @@ -136,10 +141,13 @@ struct diag204_x_part_hdr { __u64 online_cs; __u64 online_es; __u8 upid; - char reserved1[3]; + __u8 reserved:3; + __u8 mtid:5; + char reserved1[2]; __u32 group_mlu; char group_name[8]; - char reserved2[32]; + char hardware_group_name[8]; + char reserved2[24]; } __packed; struct diag204_cpu_info { @@ -168,7 +176,9 @@ struct diag204_x_cpu_info { __u64 wait_time; __u32 pma_weight; __u32 polar_weight; - char reserved3[40]; + __u32 cpu_type_cap; + __u32 group_cpu_type_cap; + char reserved3[32]; } __packed; struct diag204_phys_hdr { @@ -199,7 +209,8 @@ struct diag204_x_phys_cpu { __u16 cpu_addr; char reserved1[2]; __u8 ctidx; - char reserved2[3]; + char reserved2[1]; + __u16 weight; __u64 mgm_time; char reserved3[80]; } __packed; From 95ca2cb57985b07f5b136405f80a5106f5b06641 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Mon, 23 May 2016 15:11:58 +0200 Subject: [PATCH 010/302] KVM: s390: Add sthyi emulation Store Hypervisor Information is an emulated z/VM instruction that provides a guest with basic information about the layers it is running on. This includes information about the cpu configuration of both the machine and the lpar, as well as their names, machine model and machine type. This information enables an application to determine the maximum capacity of CPs and IFLs available to software. The instruction is available whenever the facility bit 74 is set, otherwise executing it results in an operation exception. It is important to check the validity flags in the sections before using data from any structure member. It is not guaranteed that all members will be valid on all machines / machine configurations. Signed-off-by: Janosch Frank Reviewed-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/diag.h | 10 + arch/s390/include/asm/kvm_host.h | 2 + arch/s390/include/uapi/asm/sie.h | 1 + arch/s390/kvm/Makefile | 2 +- arch/s390/kvm/intercept.c | 4 + arch/s390/kvm/kvm-s390.c | 6 + arch/s390/kvm/kvm-s390.h | 3 + arch/s390/kvm/sthyi.c | 460 +++++++++++++++++++++++++++++++ arch/s390/kvm/trace.h | 20 ++ 9 files changed, 507 insertions(+), 1 deletion(-) create mode 100644 arch/s390/kvm/sthyi.c diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h index f4000cdb6921..82211998ccf7 100644 --- a/arch/s390/include/asm/diag.h +++ b/arch/s390/include/asm/diag.h @@ -215,6 +215,16 @@ struct diag204_x_phys_cpu { char reserved3[80]; } __packed; +struct diag204_x_part_block { + struct diag204_x_part_hdr hdr; + struct diag204_x_cpu_info cpus[]; +} __packed; + +struct diag204_x_phys_block { + struct diag204_x_phys_hdr hdr; + struct diag204_x_phys_cpu cpus[]; +} __packed; + int diag204(unsigned long subcode, unsigned long size, void *addr); int diag224(void *ptr); #endif /* _ASM_S390_DIAG_H */ diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 093ea14109e2..7233b1c49964 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -154,6 +154,7 @@ struct kvm_s390_sie_block { #define LCTL_CR14 0x0002 __u16 lctl; /* 0x0044 */ __s16 icpua; /* 0x0046 */ +#define ICTL_OPEREXC 0x80000000 #define ICTL_PINT 0x20000000 #define ICTL_LPSW 0x00400000 #define ICTL_STCTL 0x00040000 @@ -279,6 +280,7 @@ struct kvm_vcpu_stat { u32 instruction_stfl; u32 instruction_tprot; u32 instruction_essa; + u32 instruction_sthyi; u32 instruction_sigp_sense; u32 instruction_sigp_sense_running; u32 instruction_sigp_external_call; diff --git a/arch/s390/include/uapi/asm/sie.h b/arch/s390/include/uapi/asm/sie.h index 8fb5d4a6dd25..3ac634368939 100644 --- a/arch/s390/include/uapi/asm/sie.h +++ b/arch/s390/include/uapi/asm/sie.h @@ -140,6 +140,7 @@ exit_code_ipa0(0xB2, 0x4c, "TAR"), \ exit_code_ipa0(0xB2, 0x50, "CSP"), \ exit_code_ipa0(0xB2, 0x54, "MVPG"), \ + exit_code_ipa0(0xB2, 0x56, "STHYI"), \ exit_code_ipa0(0xB2, 0x58, "BSG"), \ exit_code_ipa0(0xB2, 0x5a, "BSA"), \ exit_code_ipa0(0xB2, 0x5f, "CHSC"), \ diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index d42fa38c2429..82e73e2b953d 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -12,6 +12,6 @@ common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqch ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o -kvm-objs += diag.o gaccess.o guestdbg.o +kvm-objs += diag.o gaccess.o guestdbg.o sthyi.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 09c13db1416f..9359f65c8634 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -355,6 +355,10 @@ static int handle_operexc(struct kvm_vcpu *vcpu) trace_kvm_s390_handle_operexc(vcpu, vcpu->arch.sie_block->ipa, vcpu->arch.sie_block->ipb); + if (vcpu->arch.sie_block->ipa == 0xb256 && + test_kvm_facility(vcpu->kvm, 74)) + return handle_sthyi(vcpu); + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index f0addece729e..1c10254119b3 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -94,6 +94,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "instruction_stsi", VCPU_STAT(instruction_stsi) }, { "instruction_stfl", VCPU_STAT(instruction_stfl) }, { "instruction_tprot", VCPU_STAT(instruction_tprot) }, + { "instruction_sthyi", VCPU_STAT(instruction_sthyi) }, { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) }, { "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) }, { "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) }, @@ -1189,6 +1190,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) memcpy(kvm->arch.model.fac_list, kvm->arch.model.fac_mask, S390_ARCH_FAC_LIST_SIZE_BYTE); + set_kvm_facility(kvm->arch.model.fac_mask, 74); + set_kvm_facility(kvm->arch.model.fac_list, 74); + kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid(); kvm->arch.model.ibc = sclp.ibc & 0x0fff; @@ -1679,6 +1683,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) } vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb; vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; + if (test_kvm_facility(vcpu->kvm, 74)) + vcpu->arch.sie_block->ictl |= ICTL_OPEREXC; if (vcpu->kvm->arch.use_cmma) { rc = kvm_s390_vcpu_setup_cmma(vcpu); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 8621ab00ec8e..c5ec4d31e5e3 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -250,6 +250,9 @@ int kvm_s390_handle_eb(struct kvm_vcpu *vcpu); int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu); +/* implemented in sthyi.c */ +int handle_sthyi(struct kvm_vcpu *vcpu); + /* implemented in kvm-s390.c */ void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod); long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable); diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kvm/sthyi.c new file mode 100644 index 000000000000..894d5626f18d --- /dev/null +++ b/arch/s390/kvm/sthyi.c @@ -0,0 +1,460 @@ +/* + * store hypervisor information instruction emulation functions. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (version 2 only) + * as published by the Free Software Foundation. + * + * Copyright IBM Corp. 2016 + * Author(s): Janosch Frank + */ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "kvm-s390.h" +#include "gaccess.h" +#include "trace.h" + +#define DED_WEIGHT 0xffff +/* + * CP and IFL as EBCDIC strings, SP/0x40 determines the end of string + * as they are justified with spaces. + */ +#define CP 0xc3d7404040404040UL +#define IFL 0xc9c6d34040404040UL + +enum hdr_flags { + HDR_NOT_LPAR = 0x10, + HDR_STACK_INCM = 0x20, + HDR_STSI_UNAV = 0x40, + HDR_PERF_UNAV = 0x80, +}; + +enum mac_validity { + MAC_NAME_VLD = 0x20, + MAC_ID_VLD = 0x40, + MAC_CNT_VLD = 0x80, +}; + +enum par_flag { + PAR_MT_EN = 0x80, +}; + +enum par_validity { + PAR_GRP_VLD = 0x08, + PAR_ID_VLD = 0x10, + PAR_ABS_VLD = 0x20, + PAR_WGHT_VLD = 0x40, + PAR_PCNT_VLD = 0x80, +}; + +struct hdr_sctn { + u8 infhflg1; + u8 infhflg2; /* reserved */ + u8 infhval1; /* reserved */ + u8 infhval2; /* reserved */ + u8 reserved[3]; + u8 infhygct; + u16 infhtotl; + u16 infhdln; + u16 infmoff; + u16 infmlen; + u16 infpoff; + u16 infplen; + u16 infhoff1; + u16 infhlen1; + u16 infgoff1; + u16 infglen1; + u16 infhoff2; + u16 infhlen2; + u16 infgoff2; + u16 infglen2; + u16 infhoff3; + u16 infhlen3; + u16 infgoff3; + u16 infglen3; + u8 reserved2[4]; +} __packed; + +struct mac_sctn { + u8 infmflg1; /* reserved */ + u8 infmflg2; /* reserved */ + u8 infmval1; + u8 infmval2; /* reserved */ + u16 infmscps; + u16 infmdcps; + u16 infmsifl; + u16 infmdifl; + char infmname[8]; + char infmtype[4]; + char infmmanu[16]; + char infmseq[16]; + char infmpman[4]; + u8 reserved[4]; +} __packed; + +struct par_sctn { + u8 infpflg1; + u8 infpflg2; /* reserved */ + u8 infpval1; + u8 infpval2; /* reserved */ + u16 infppnum; + u16 infpscps; + u16 infpdcps; + u16 infpsifl; + u16 infpdifl; + u16 reserved; + char infppnam[8]; + u32 infpwbcp; + u32 infpabcp; + u32 infpwbif; + u32 infpabif; + char infplgnm[8]; + u32 infplgcp; + u32 infplgif; +} __packed; + +struct sthyi_sctns { + struct hdr_sctn hdr; + struct mac_sctn mac; + struct par_sctn par; +} __packed; + +struct cpu_inf { + u64 lpar_cap; + u64 lpar_grp_cap; + u64 lpar_weight; + u64 all_weight; + int cpu_num_ded; + int cpu_num_shd; +}; + +struct lpar_cpu_inf { + struct cpu_inf cp; + struct cpu_inf ifl; +}; + +static inline u64 cpu_id(u8 ctidx, void *diag224_buf) +{ + return *((u64 *)(diag224_buf + (ctidx + 1) * DIAG204_CPU_NAME_LEN)); +} + +/* + * Scales the cpu capping from the lpar range to the one expected in + * sthyi data. + * + * diag204 reports a cap in hundredths of processor units. + * z/VM's range for one core is 0 - 0x10000. + */ +static u32 scale_cap(u32 in) +{ + return (0x10000 * in) / 100; +} + +static void fill_hdr(struct sthyi_sctns *sctns) +{ + sctns->hdr.infhdln = sizeof(sctns->hdr); + sctns->hdr.infmoff = sizeof(sctns->hdr); + sctns->hdr.infmlen = sizeof(sctns->mac); + sctns->hdr.infplen = sizeof(sctns->par); + sctns->hdr.infpoff = sctns->hdr.infhdln + sctns->hdr.infmlen; + sctns->hdr.infhtotl = sctns->hdr.infpoff + sctns->hdr.infplen; +} + +static void fill_stsi_mac(struct sthyi_sctns *sctns, + struct sysinfo_1_1_1 *sysinfo) +{ + if (stsi(sysinfo, 1, 1, 1)) + return; + + sclp_ocf_cpc_name_copy(sctns->mac.infmname); + + memcpy(sctns->mac.infmtype, sysinfo->type, sizeof(sctns->mac.infmtype)); + memcpy(sctns->mac.infmmanu, sysinfo->manufacturer, sizeof(sctns->mac.infmmanu)); + memcpy(sctns->mac.infmpman, sysinfo->plant, sizeof(sctns->mac.infmpman)); + memcpy(sctns->mac.infmseq, sysinfo->sequence, sizeof(sctns->mac.infmseq)); + + sctns->mac.infmval1 |= MAC_ID_VLD | MAC_NAME_VLD; +} + +static void fill_stsi_par(struct sthyi_sctns *sctns, + struct sysinfo_2_2_2 *sysinfo) +{ + if (stsi(sysinfo, 2, 2, 2)) + return; + + sctns->par.infppnum = sysinfo->lpar_number; + memcpy(sctns->par.infppnam, sysinfo->name, sizeof(sctns->par.infppnam)); + + sctns->par.infpval1 |= PAR_ID_VLD; +} + +static void fill_stsi(struct sthyi_sctns *sctns) +{ + void *sysinfo; + + /* Errors are handled through the validity bits in the response. */ + sysinfo = (void *)__get_free_page(GFP_KERNEL); + if (!sysinfo) + return; + + fill_stsi_mac(sctns, sysinfo); + fill_stsi_par(sctns, sysinfo); + + free_pages((unsigned long)sysinfo, 0); +} + +static void fill_diag_mac(struct sthyi_sctns *sctns, + struct diag204_x_phys_block *block, + void *diag224_buf) +{ + int i; + + for (i = 0; i < block->hdr.cpus; i++) { + switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) { + case CP: + if (block->cpus[i].weight == DED_WEIGHT) + sctns->mac.infmdcps++; + else + sctns->mac.infmscps++; + break; + case IFL: + if (block->cpus[i].weight == DED_WEIGHT) + sctns->mac.infmdifl++; + else + sctns->mac.infmsifl++; + break; + } + } + sctns->mac.infmval1 |= MAC_CNT_VLD; +} + +/* Returns a pointer to the the next partition block. */ +static struct diag204_x_part_block *lpar_cpu_inf(struct lpar_cpu_inf *part_inf, + bool this_lpar, + void *diag224_buf, + struct diag204_x_part_block *block) +{ + int i, capped = 0, weight_cp = 0, weight_ifl = 0; + struct cpu_inf *cpu_inf; + + for (i = 0; i < block->hdr.rcpus; i++) { + if (!(block->cpus[i].cflag & DIAG204_CPU_ONLINE)) + continue; + + switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) { + case CP: + cpu_inf = &part_inf->cp; + if (block->cpus[i].cur_weight < DED_WEIGHT) + weight_cp |= block->cpus[i].cur_weight; + break; + case IFL: + cpu_inf = &part_inf->ifl; + if (block->cpus[i].cur_weight < DED_WEIGHT) + weight_ifl |= block->cpus[i].cur_weight; + break; + default: + continue; + } + + if (!this_lpar) + continue; + + capped |= block->cpus[i].cflag & DIAG204_CPU_CAPPED; + cpu_inf->lpar_cap |= block->cpus[i].cpu_type_cap; + cpu_inf->lpar_grp_cap |= block->cpus[i].group_cpu_type_cap; + + if (block->cpus[i].weight == DED_WEIGHT) + cpu_inf->cpu_num_ded += 1; + else + cpu_inf->cpu_num_shd += 1; + } + + if (this_lpar && capped) { + part_inf->cp.lpar_weight = weight_cp; + part_inf->ifl.lpar_weight = weight_ifl; + } + part_inf->cp.all_weight += weight_cp; + part_inf->ifl.all_weight += weight_ifl; + return (struct diag204_x_part_block *)&block->cpus[i]; +} + +static void fill_diag(struct sthyi_sctns *sctns) +{ + int i, r, pages; + bool this_lpar; + void *diag204_buf; + void *diag224_buf = NULL; + struct diag204_x_info_blk_hdr *ti_hdr; + struct diag204_x_part_block *part_block; + struct diag204_x_phys_block *phys_block; + struct lpar_cpu_inf lpar_inf = {}; + + /* Errors are handled through the validity bits in the response. */ + pages = diag204((unsigned long)DIAG204_SUBC_RSI | + (unsigned long)DIAG204_INFO_EXT, 0, NULL); + if (pages <= 0) + return; + + diag204_buf = vmalloc(PAGE_SIZE * pages); + if (!diag204_buf) + return; + + r = diag204((unsigned long)DIAG204_SUBC_STIB7 | + (unsigned long)DIAG204_INFO_EXT, pages, diag204_buf); + if (r < 0) + goto out; + + diag224_buf = kmalloc(PAGE_SIZE, GFP_KERNEL | GFP_DMA); + if (!diag224_buf || diag224(diag224_buf)) + goto out; + + ti_hdr = diag204_buf; + part_block = diag204_buf + sizeof(*ti_hdr); + + for (i = 0; i < ti_hdr->npar; i++) { + /* + * For the calling lpar we also need to get the cpu + * caps and weights. The time information block header + * specifies the offset to the partition block of the + * caller lpar, so we know when we process its data. + */ + this_lpar = (void *)part_block - diag204_buf == ti_hdr->this_part; + part_block = lpar_cpu_inf(&lpar_inf, this_lpar, diag224_buf, + part_block); + } + + phys_block = (struct diag204_x_phys_block *)part_block; + part_block = diag204_buf + ti_hdr->this_part; + if (part_block->hdr.mtid) + sctns->par.infpflg1 = PAR_MT_EN; + + sctns->par.infpval1 |= PAR_GRP_VLD; + sctns->par.infplgcp = scale_cap(lpar_inf.cp.lpar_grp_cap); + sctns->par.infplgif = scale_cap(lpar_inf.ifl.lpar_grp_cap); + memcpy(sctns->par.infplgnm, part_block->hdr.hardware_group_name, + sizeof(sctns->par.infplgnm)); + + sctns->par.infpscps = lpar_inf.cp.cpu_num_shd; + sctns->par.infpdcps = lpar_inf.cp.cpu_num_ded; + sctns->par.infpsifl = lpar_inf.ifl.cpu_num_shd; + sctns->par.infpdifl = lpar_inf.ifl.cpu_num_ded; + sctns->par.infpval1 |= PAR_PCNT_VLD; + + sctns->par.infpabcp = scale_cap(lpar_inf.cp.lpar_cap); + sctns->par.infpabif = scale_cap(lpar_inf.ifl.lpar_cap); + sctns->par.infpval1 |= PAR_ABS_VLD; + + /* + * Everything below needs global performance data to be + * meaningful. + */ + if (!(ti_hdr->flags & DIAG204_LPAR_PHYS_FLG)) { + sctns->hdr.infhflg1 |= HDR_PERF_UNAV; + goto out; + } + + fill_diag_mac(sctns, phys_block, diag224_buf); + + if (lpar_inf.cp.lpar_weight) { + sctns->par.infpwbcp = sctns->mac.infmscps * 0x10000 * + lpar_inf.cp.lpar_weight / lpar_inf.cp.all_weight; + } + + if (lpar_inf.ifl.lpar_weight) { + sctns->par.infpwbif = sctns->mac.infmsifl * 0x10000 * + lpar_inf.ifl.lpar_weight / lpar_inf.ifl.all_weight; + } + sctns->par.infpval1 |= PAR_WGHT_VLD; + +out: + kfree(diag224_buf); + vfree(diag204_buf); +} + +static int sthyi(u64 vaddr) +{ + register u64 code asm("0") = 0; + register u64 addr asm("2") = vaddr; + int cc; + + asm volatile( + ".insn rre,0xB2560000,%[code],%[addr]\n" + "ipm %[cc]\n" + "srl %[cc],28\n" + : [cc] "=d" (cc) + : [code] "d" (code), [addr] "a" (addr) + : "memory", "cc"); + return cc; +} + +int handle_sthyi(struct kvm_vcpu *vcpu) +{ + int reg1, reg2, r = 0; + u64 code, addr, cc = 0; + struct sthyi_sctns *sctns = NULL; + + kvm_s390_get_regs_rre(vcpu, ®1, ®2); + code = vcpu->run->s.regs.gprs[reg1]; + addr = vcpu->run->s.regs.gprs[reg2]; + + vcpu->stat.instruction_sthyi++; + VCPU_EVENT(vcpu, 3, "STHYI: fc: %llu addr: 0x%016llx", code, addr); + trace_kvm_s390_handle_sthyi(vcpu, code, addr); + + if (reg1 == reg2 || reg1 & 1 || reg2 & 1 || addr & ~PAGE_MASK) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + if (code & 0xffff) { + cc = 3; + goto out; + } + + /* + * If the page has not yet been faulted in, we want to do that + * now and not after all the expensive calculations. + */ + r = write_guest(vcpu, addr, reg2, &cc, 1); + if (r) + return kvm_s390_inject_prog_cond(vcpu, r); + + sctns = (void *)get_zeroed_page(GFP_KERNEL); + if (!sctns) + return -ENOMEM; + + /* + * If we are a guest, we don't want to emulate an emulated + * instruction. We ask the hypervisor to provide the data. + */ + if (test_facility(74)) { + cc = sthyi((u64)sctns); + goto out; + } + + fill_hdr(sctns); + fill_stsi(sctns); + fill_diag(sctns); + +out: + if (!cc) { + r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE); + if (r) { + free_page((unsigned long)sctns); + return kvm_s390_inject_prog_cond(vcpu, r); + } + } + + free_page((unsigned long)sctns); + vcpu->run->s.regs.gprs[reg2 + 1] = cc ? 4 : 0; + kvm_s390_set_psw_cc(vcpu, cc); + return r; +} diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h index 90d26a6aa52c..a429ef9b0d30 100644 --- a/arch/s390/kvm/trace.h +++ b/arch/s390/kvm/trace.h @@ -433,6 +433,26 @@ TRACE_EVENT(kvm_s390_handle_operexc, icpt_insn_codes)) ); +TRACE_EVENT(kvm_s390_handle_sthyi, + TP_PROTO(VCPU_PROTO_COMMON, u64 code, u64 addr), + TP_ARGS(VCPU_ARGS_COMMON, code, addr), + + TP_STRUCT__entry( + VCPU_FIELD_COMMON + __field(u64, code) + __field(u64, addr) + ), + + TP_fast_assign( + VCPU_ASSIGN_COMMON + __entry->code = code; + __entry->addr = addr; + ), + + VCPU_TP_PRINTK("STHYI fc: %llu addr: %016llx", + __entry->code, __entry->addr) + ); + #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ From 7d0a5e62411a9223512c6af2e4c08a2d7c00fa2e Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Tue, 10 May 2016 15:03:42 +0200 Subject: [PATCH 011/302] KVM: s390: Limit sthyi execution Store hypervisor information is a valid instruction not only in supervisor state but also in problem state, i.e. the guest's userspace. Its execution is not only computational and memory intensive, but also has to get hold of the ipte lock to write to the guest's memory. This lock is not intended to be held often and long, especially not from the untrusted guest userspace. Therefore we apply rate limiting of sthyi executions per VM. Signed-off-by: Janosch Frank Acked-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 1 + arch/s390/kvm/kvm-s390.c | 2 ++ arch/s390/kvm/sthyi.c | 11 +++++++++++ 3 files changed, 14 insertions(+) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 7233b1c49964..bcc20dc91ea8 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -652,6 +652,7 @@ struct kvm_arch{ wait_queue_head_t ipte_wq; int ipte_lock_count; struct mutex ipte_mutex; + struct ratelimit_state sthyi_limit; spinlock_t start_stop_lock; struct sie_page2 *sie_page2; struct kvm_s390_cpu_model model; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 1c10254119b3..44297ff53b44 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1151,6 +1151,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) rc = -ENOMEM; + ratelimit_state_init(&kvm->arch.sthyi_limit, 5 * HZ, 500); + kvm->arch.use_esca = 0; /* start with basic SCA */ rwlock_init(&kvm->arch.sca_lock); kvm->arch.sca = (struct bsca_block *) get_zeroed_page(GFP_KERNEL); diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kvm/sthyi.c index 894d5626f18d..bd98b7d25200 100644 --- a/arch/s390/kvm/sthyi.c +++ b/arch/s390/kvm/sthyi.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -403,6 +404,16 @@ int handle_sthyi(struct kvm_vcpu *vcpu) u64 code, addr, cc = 0; struct sthyi_sctns *sctns = NULL; + /* + * STHYI requires extensive locking in the higher hypervisors + * and is very computational/memory expensive. Therefore we + * ratelimit the executions per VM. + */ + if (!__ratelimit(&vcpu->kvm->arch.sthyi_limit)) { + kvm_s390_retry_instr(vcpu); + return 0; + } + kvm_s390_get_regs_rre(vcpu, ®1, ®2); code = vcpu->run->s.regs.gprs[reg1]; addr = vcpu->run->s.regs.gprs[reg2]; From c1778e515712dae0575657fe6c9511ffcb28a7e0 Mon Sep 17 00:00:00 2001 From: Alexander Yarygin Date: Fri, 6 May 2016 15:47:19 +0300 Subject: [PATCH 012/302] KVM: s390: Add mnemonic print to kvm_s390_intercept_prog We have a table of mnemonic names for intercepted program interruptions, let's print readable name of the interruption in the kvm_s390_intercept_prog trace event. Signed-off-by: Alexander Yarygin Acked-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/trace.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h index a429ef9b0d30..1c4586b367a4 100644 --- a/arch/s390/kvm/trace.h +++ b/arch/s390/kvm/trace.h @@ -185,8 +185,10 @@ TRACE_EVENT(kvm_s390_intercept_prog, __entry->code = code; ), - VCPU_TP_PRINTK("intercepted program interruption %04x", - __entry->code) + VCPU_TP_PRINTK("intercepted program interruption %04x (%s)", + __entry->code, + __print_symbolic(__entry->code, + icpt_prog_codes)) ); /* From 15c9705f0c8af2d19dede9866aec364746b269ef Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 19 Mar 2015 17:36:43 +0100 Subject: [PATCH 013/302] KVM: s390: interface to query and configure cpu features For now, we only have an interface to query and configure facilities indicated via STFL(E). However, we also have features indicated via SCLP, that have to be indicated to the guest by user space and usually require KVM support. This patch allows user space to query and configure available cpu features for the guest. Please note that disabling a feature doesn't necessarily mean that it is completely disabled (e.g. ESOP is mostly handled by the SIE). We will try our best to disable it. Most features (e.g. SCLP) can't directly be forwarded, as most of them need in addition to hardware support, support in KVM. As we later on want to turn these features in KVM explicitly on/off (to simulate different behavior), we have to filter all features provided by the hardware and make them configurable. Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- Documentation/virtual/kvm/devices/vm.txt | 27 ++++++++++ arch/s390/include/asm/kvm_host.h | 2 + arch/s390/include/uapi/asm/kvm.h | 8 +++ arch/s390/kvm/kvm-s390.c | 63 ++++++++++++++++++++++++ arch/s390/kvm/kvm-s390.h | 6 +++ 5 files changed, 106 insertions(+) diff --git a/Documentation/virtual/kvm/devices/vm.txt b/Documentation/virtual/kvm/devices/vm.txt index a9ea8774a45f..0ed6808b9965 100644 --- a/Documentation/virtual/kvm/devices/vm.txt +++ b/Documentation/virtual/kvm/devices/vm.txt @@ -85,6 +85,33 @@ Returns: -EBUSY in case 1 or more vcpus are already activated (only in write -ENOMEM if not enough memory is available to process the ioctl 0 in case of success +2.3. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_FEAT (r/o) + +Allows user space to retrieve available cpu features. A feature is available if +provided by the hardware and supported by kvm. In theory, cpu features could +even be completely emulated by kvm. + +struct kvm_s390_vm_cpu_feat { + __u64 feat[16]; # Bitmap (1 = feature available), MSB 0 bit numbering +}; + +Parameters: address of a buffer to load the feature list from. +Returns: -EFAULT if the given address is not accessible from kernel space. + 0 in case of success. + +2.4. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_FEAT (r/w) + +Allows user space to retrieve or change enabled cpu features for all VCPUs of a +VM. Features that are not available cannot be enabled. + +See 2.3. for a description of the parameter struct. + +Parameters: address of a buffer to store/load the feature list from. +Returns: -EFAULT if the given address is not accessible from kernel space. + -EINVAL if a cpu feature that is not available is to be enabled. + -EBUSY if at least one VCPU has already been defined. + 0 in case of success. + 3. GROUP: KVM_S390_VM_TOD Architectures: s390 diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index bcc20dc91ea8..b2a83a0ce42c 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -658,6 +658,8 @@ struct kvm_arch{ struct kvm_s390_cpu_model model; struct kvm_s390_crypto crypto; u64 epoch; + /* subset of available cpu features enabled by user space */ + DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); }; #define KVM_HVA_ERR_BAD (-1UL) diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 3b8e99ef9d58..a8559f265e26 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -93,6 +93,14 @@ struct kvm_s390_vm_cpu_machine { __u64 fac_list[256]; }; +#define KVM_S390_VM_CPU_PROCESSOR_FEAT 2 +#define KVM_S390_VM_CPU_MACHINE_FEAT 3 + +#define KVM_S390_VM_CPU_FEAT_NR_BITS 1024 +struct kvm_s390_vm_cpu_feat { + __u64 feat[16]; +}; + /* kvm attributes for crypto */ #define KVM_S390_VM_CRYPTO_ENABLE_AES_KW 0 #define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW 1 diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 44297ff53b44..6960468f28ad 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -132,6 +133,9 @@ unsigned long kvm_s390_fac_list_mask_size(void) return ARRAY_SIZE(kvm_s390_fac_list_mask); } +/* available cpu features supported by kvm */ +static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); + static struct gmap_notifier gmap_notifier; debug_info_t *kvm_s390_dbf; @@ -677,6 +681,29 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr) return ret; } +static int kvm_s390_set_processor_feat(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_feat data; + int ret = -EBUSY; + + if (copy_from_user(&data, (void __user *)attr->addr, sizeof(data))) + return -EFAULT; + if (!bitmap_subset((unsigned long *) data.feat, + kvm_s390_available_cpu_feat, + KVM_S390_VM_CPU_FEAT_NR_BITS)) + return -EINVAL; + + mutex_lock(&kvm->lock); + if (!atomic_read(&kvm->online_vcpus)) { + bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat, + KVM_S390_VM_CPU_FEAT_NR_BITS); + ret = 0; + } + mutex_unlock(&kvm->lock); + return ret; +} + static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) { int ret = -ENXIO; @@ -685,6 +712,9 @@ static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_PROCESSOR: ret = kvm_s390_set_processor(kvm, attr); break; + case KVM_S390_VM_CPU_PROCESSOR_FEAT: + ret = kvm_s390_set_processor_feat(kvm, attr); + break; } return ret; } @@ -733,6 +763,31 @@ static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr) return ret; } +static int kvm_s390_get_processor_feat(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_feat data; + + bitmap_copy((unsigned long *) data.feat, kvm->arch.cpu_feat, + KVM_S390_VM_CPU_FEAT_NR_BITS); + if (copy_to_user((void __user *)attr->addr, &data, sizeof(data))) + return -EFAULT; + return 0; +} + +static int kvm_s390_get_machine_feat(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_feat data; + + bitmap_copy((unsigned long *) data.feat, + kvm_s390_available_cpu_feat, + KVM_S390_VM_CPU_FEAT_NR_BITS); + if (copy_to_user((void __user *)attr->addr, &data, sizeof(data))) + return -EFAULT; + return 0; +} + static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) { int ret = -ENXIO; @@ -744,6 +799,12 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_MACHINE: ret = kvm_s390_get_machine(kvm, attr); break; + case KVM_S390_VM_CPU_PROCESSOR_FEAT: + ret = kvm_s390_get_processor_feat(kvm, attr); + break; + case KVM_S390_VM_CPU_MACHINE_FEAT: + ret = kvm_s390_get_machine_feat(kvm, attr); + break; } return ret; } @@ -827,6 +888,8 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) switch (attr->attr) { case KVM_S390_VM_CPU_PROCESSOR: case KVM_S390_VM_CPU_MACHINE: + case KVM_S390_VM_CPU_PROCESSOR_FEAT: + case KVM_S390_VM_CPU_MACHINE_FEAT: ret = 0; break; default: diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index c5ec4d31e5e3..52aa47e112d8 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -175,6 +175,12 @@ static inline int set_kvm_facility(u64 *fac_list, unsigned long nr) return 0; } +static inline int test_kvm_cpu_feat(struct kvm *kvm, unsigned long nr) +{ + WARN_ON_ONCE(nr >= KVM_S390_VM_CPU_FEAT_NR_BITS); + return test_bit_inv(nr, kvm->arch.cpu_feat); +} + /* are cpu states controlled by user space */ static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm) { From 22be5a133169e855097936438417ab1b672ad43f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 21 Jan 2016 13:22:54 +0100 Subject: [PATCH 014/302] KVM: s390: forward ESOP if available ESOP guarantees that during a protection exception, bit 61 of real location 168-175 will only be set to 1 if it was because of ALCP or DATP. If the exception is due to LAP or KCP, the bit will always be set to 0. The old SOP definition allowed bit 61 to be unpredictable in case of LAP or KCP in some conditions. So ESOP replaces this unpredictability by a guarantee. Therefore, we can directly forward ESOP if it is available on our machine. We don't have to do anything when ESOP is disabled - the guest will simply expect unpredictable values. Our guest access functions are already handling ESOP properly. Please note that future functionality in KVM will require knowledge about ESOP being enabled for a guest or not. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/uapi/asm/kvm.h | 1 + arch/s390/kvm/kvm-s390.c | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index a8559f265e26..789c4e27e294 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -97,6 +97,7 @@ struct kvm_s390_vm_cpu_machine { #define KVM_S390_VM_CPU_MACHINE_FEAT 3 #define KVM_S390_VM_CPU_FEAT_NR_BITS 1024 +#define KVM_S390_VM_CPU_FEAT_ESOP 0 struct kvm_s390_vm_cpu_feat { __u64 feat[16]; }; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 6960468f28ad..2b5c14da3227 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -193,6 +193,17 @@ void kvm_arch_hardware_unsetup(void) &kvm_clock_notifier); } +static void allow_cpu_feat(unsigned long nr) +{ + set_bit_inv(nr, kvm_s390_available_cpu_feat); +} + +static void kvm_s390_cpu_feat_init(void) +{ + if (MACHINE_HAS_ESOP) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP); +} + int kvm_arch_init(void *opaque) { kvm_s390_dbf = debug_register("kvm-trace", 32, 1, 7 * sizeof(long)); @@ -204,6 +215,8 @@ int kvm_arch_init(void *opaque) return -ENOMEM; } + kvm_s390_cpu_feat_init(); + /* Register floating interrupt controller interface. */ return kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC); } From 6167375b558196fdedd38e9867f7bb30ff4dda50 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 31 May 2016 19:44:10 +0200 Subject: [PATCH 015/302] KVM: s390: gaccess: store guest address on ALC prot exceptions Let's pass the effective guest address to get_vcpu_asce(), so we can properly set the guest address in case we inject an ALC protection exception. Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/gaccess.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 66938d283b77..c0da9e9d4490 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -477,7 +477,7 @@ enum { }; static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce, - ar_t ar, enum gacc_mode mode) + unsigned long ga, ar_t ar, enum gacc_mode mode) { int rc; struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw); @@ -519,6 +519,7 @@ static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce, vcpu->arch.pgm.exc_access_id = ar; break; case PGM_PROTECTION: + tec_bits->addr = ga >> PAGE_SHIFT; tec_bits->b60 = 1; tec_bits->b61 = 1; break; @@ -783,7 +784,8 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, if (!len) return 0; - rc = get_vcpu_asce(vcpu, &asce, ar, mode); + ga = kvm_s390_logical_to_effective(vcpu, ga); + rc = get_vcpu_asce(vcpu, &asce, ga, ar, mode); if (rc) return rc; nr_pages = (((ga & ~PAGE_MASK) + len - 1) >> PAGE_SHIFT) + 1; @@ -854,7 +856,7 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, gva = kvm_s390_logical_to_effective(vcpu, gva); tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code; - rc = get_vcpu_asce(vcpu, &asce, ar, mode); + rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode); tec->addr = gva >> PAGE_SHIFT; if (rc) return rc; From d03193de30e6d99770930c6fbf14f0d5dd5cb2f0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 31 May 2016 19:56:46 +0200 Subject: [PATCH 016/302] KVM: s390: gaccess: function for preparing translation exceptions Let's provide a function trans_exc() that can be used for handling preparation of translation exceptions on a central basis. We will use that function to replace existing code in gaccess. Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/gaccess.c | 62 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index c0da9e9d4490..b6ccb26bc3c1 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -476,6 +476,68 @@ enum { FSI_FETCH = 2 /* Exception was due to fetch operation */ }; +enum prot_type { + PROT_TYPE_LA = 0, + PROT_TYPE_KEYC = 1, + PROT_TYPE_ALC = 2, + PROT_TYPE_DAT = 3, +}; + +static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, + ar_t ar, enum gacc_mode mode, enum prot_type prot) +{ + struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm; + struct trans_exc_code_bits *tec; + + memset(pgm, 0, sizeof(*pgm)); + pgm->code = code; + tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code; + + switch (code) { + case PGM_ASCE_TYPE: + case PGM_PAGE_TRANSLATION: + case PGM_REGION_FIRST_TRANS: + case PGM_REGION_SECOND_TRANS: + case PGM_REGION_THIRD_TRANS: + case PGM_SEGMENT_TRANSLATION: + /* + * op_access_id only applies to MOVE_PAGE -> set bit 61 + * exc_access_id has to be set to 0 for some instructions. Both + * cases have to be handled by the caller. We can always store + * exc_access_id, as it is undefined for non-ar cases. + */ + tec->addr = gva >> PAGE_SHIFT; + tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH; + tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as; + /* FALL THROUGH */ + case PGM_ALEN_TRANSLATION: + case PGM_ALE_SEQUENCE: + case PGM_ASTE_VALIDITY: + case PGM_ASTE_SEQUENCE: + case PGM_EXTENDED_AUTHORITY: + pgm->exc_access_id = ar; + break; + case PGM_PROTECTION: + switch (prot) { + case PROT_TYPE_ALC: + tec->b60 = 1; + /* FALL THROUGH */ + case PROT_TYPE_DAT: + tec->b61 = 1; + tec->addr = gva >> PAGE_SHIFT; + tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH; + tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as; + /* exc_access_id is undefined for most cases */ + pgm->exc_access_id = ar; + break; + default: /* LA and KEYC set b61 to 0, other params undefined */ + break; + } + break; + } + return code; +} + static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce, unsigned long ga, ar_t ar, enum gacc_mode mode) { From 3e3c67f6a327852375247c98b0d153c44e460216 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 31 May 2016 20:00:33 +0200 Subject: [PATCH 017/302] KVM: s390: gaccess: convert kvm_s390_check_low_addr_prot_real() Let's use our new function for preparing translation exceptions. Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/gaccess.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index b6ccb26bc3c1..61dc45ef50b9 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -979,20 +979,9 @@ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, */ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra) { - struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm; - psw_t *psw = &vcpu->arch.sie_block->gpsw; - struct trans_exc_code_bits *tec_bits; union ctlreg0 ctlreg0 = {.val = vcpu->arch.sie_block->gcr[0]}; if (!ctlreg0.lap || !is_low_address(gra)) return 0; - - memset(pgm, 0, sizeof(*pgm)); - tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code; - tec_bits->fsi = FSI_STORE; - tec_bits->as = psw_bits(*psw).as; - tec_bits->addr = gra >> PAGE_SHIFT; - pgm->code = PGM_PROTECTION; - - return pgm->code; + return trans_exc(vcpu, PGM_PROTECTION, gra, 0, GACC_STORE, PROT_TYPE_LA); } From fbcb7d5157718645cc198c6be6b435ab326c1892 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 31 May 2016 20:06:55 +0200 Subject: [PATCH 018/302] KVM: s390: gaccess: convert guest_translate_address() Let's use our new function for preparing translation exceptions. Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/gaccess.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 61dc45ef50b9..ae9f9e8e063c 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -910,37 +910,28 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, unsigned long *gpa, enum gacc_mode mode) { - struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm; psw_t *psw = &vcpu->arch.sie_block->gpsw; - struct trans_exc_code_bits *tec; union asce asce; int rc; gva = kvm_s390_logical_to_effective(vcpu, gva); - tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code; rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode); - tec->addr = gva >> PAGE_SHIFT; if (rc) return rc; if (is_low_address(gva) && low_address_protection_enabled(vcpu, asce)) { - if (mode == GACC_STORE) { - rc = pgm->code = PGM_PROTECTION; - return rc; - } + if (mode == GACC_STORE) + return trans_exc(vcpu, PGM_PROTECTION, gva, 0, + mode, PROT_TYPE_LA); } if (psw_bits(*psw).t && !asce.r) { /* Use DAT? */ rc = guest_translate(vcpu, gva, gpa, asce, mode); - if (rc > 0) { - if (rc == PGM_PROTECTION) - tec->b61 = 1; - pgm->code = rc; - } + if (rc > 0) + return trans_exc(vcpu, rc, gva, 0, mode, PROT_TYPE_DAT); } else { - rc = 0; *gpa = kvm_s390_real_to_abs(vcpu, gva); if (kvm_is_error_gpa(vcpu->kvm, *gpa)) - rc = pgm->code = PGM_ADDRESSING; + return trans_exc(vcpu, rc, gva, PGM_ADDRESSING, mode, 0); } return rc; From cde0dcfb5df1dbcd90a8e73130a6b7091bdb493a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 31 May 2016 20:13:35 +0200 Subject: [PATCH 019/302] KVM: s390: gaccess: convert guest_page_range() Let's use our new function for preparing translation exceptions. As we will need the correct ar, let's pass that to guest_page_range(). This will also make sure that the guest address is stored in the tec for applicable excptions. Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/gaccess.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index ae9f9e8e063c..ec6c91e85dbe 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -792,40 +792,31 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu, return 1; } -static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, +static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, unsigned long *pages, unsigned long nr_pages, const union asce asce, enum gacc_mode mode) { - struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm; psw_t *psw = &vcpu->arch.sie_block->gpsw; - struct trans_exc_code_bits *tec_bits; - int lap_enabled, rc; + int lap_enabled, rc = 0; - tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code; lap_enabled = low_address_protection_enabled(vcpu, asce); while (nr_pages) { ga = kvm_s390_logical_to_effective(vcpu, ga); - tec_bits->addr = ga >> PAGE_SHIFT; - if (mode == GACC_STORE && lap_enabled && is_low_address(ga)) { - pgm->code = PGM_PROTECTION; - return pgm->code; - } + if (mode == GACC_STORE && lap_enabled && is_low_address(ga)) + return trans_exc(vcpu, PGM_PROTECTION, ga, ar, mode, + PROT_TYPE_LA); ga &= PAGE_MASK; if (psw_bits(*psw).t) { rc = guest_translate(vcpu, ga, pages, asce, mode); if (rc < 0) return rc; - if (rc == PGM_PROTECTION) - tec_bits->b61 = 1; - if (rc) - pgm->code = rc; } else { *pages = kvm_s390_real_to_abs(vcpu, ga); if (kvm_is_error_gpa(vcpu->kvm, *pages)) - pgm->code = PGM_ADDRESSING; + rc = PGM_ADDRESSING; } - if (pgm->code) - return pgm->code; + if (rc) + return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_DAT); ga += PAGE_SIZE; pages++; nr_pages--; @@ -859,7 +850,7 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, need_ipte_lock = psw_bits(*psw).t && !asce.r; if (need_ipte_lock) ipte_lock(vcpu); - rc = guest_page_range(vcpu, ga, pages, nr_pages, asce, mode); + rc = guest_page_range(vcpu, ga, ar, pages, nr_pages, asce, mode); for (idx = 0; idx < nr_pages && !rc; idx++) { gpa = *(pages + idx) + (ga & ~PAGE_MASK); _len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len); From bcfa01d787278476f3e79530d03df9b3f52e6e59 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 31 May 2016 20:21:03 +0200 Subject: [PATCH 020/302] KVM: s390: gaccess: convert get_vcpu_asce() Let's use our new function for preparing translation exceptions. Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/gaccess.c | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index ec6c91e85dbe..8e245e764c21 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -543,13 +543,6 @@ static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce, { int rc; struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw); - struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm; - struct trans_exc_code_bits *tec_bits; - - memset(pgm, 0, sizeof(*pgm)); - tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code; - tec_bits->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH; - tec_bits->as = psw.as; if (!psw.t) { asce->val = 0; @@ -572,22 +565,8 @@ static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce, return 0; case PSW_AS_ACCREG: rc = ar_translation(vcpu, asce, ar, mode); - switch (rc) { - case PGM_ALEN_TRANSLATION: - case PGM_ALE_SEQUENCE: - case PGM_ASTE_VALIDITY: - case PGM_ASTE_SEQUENCE: - case PGM_EXTENDED_AUTHORITY: - vcpu->arch.pgm.exc_access_id = ar; - break; - case PGM_PROTECTION: - tec_bits->addr = ga >> PAGE_SHIFT; - tec_bits->b60 = 1; - tec_bits->b61 = 1; - break; - } if (rc > 0) - pgm->code = rc; + return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_ALC); return rc; } return 0; From 1afd43e0fbba4a92effc22977e3a7e64213ee860 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 18 May 2016 15:59:06 +0200 Subject: [PATCH 021/302] s390/crypto: allow to query all known cpacf functions KVM will have to query these functions, let's add at least the query capabilities. PCKMO has RRE format, as bit 16-31 are ignored, we can still use the existing function. As PCKMO won't touch the cc, let's force it to 0 upfront. Signed-off-by: David Hildenbrand Acked-by: Ingo Tuchscherer Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/cpacf.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h index 1a82cf26ee11..d28621de8e0b 100644 --- a/arch/s390/include/asm/cpacf.h +++ b/arch/s390/include/asm/cpacf.h @@ -20,6 +20,9 @@ #define CPACF_KMC 0xb92f /* MSA */ #define CPACF_KIMD 0xb93e /* MSA */ #define CPACF_KLMD 0xb93f /* MSA */ +#define CPACF_PCKMO 0xb928 /* MSA3 */ +#define CPACF_KMF 0xb92a /* MSA4 */ +#define CPACF_KMO 0xb92b /* MSA4 */ #define CPACF_PCC 0xb92c /* MSA4 */ #define CPACF_KMCTR 0xb92d /* MSA4 */ #define CPACF_PPNO 0xb93c /* MSA5 */ @@ -136,6 +139,7 @@ static inline void __cpacf_query(unsigned int opcode, unsigned char *status) register unsigned long r1 asm("1") = (unsigned long) status; asm volatile( + " spm 0\n" /* pckmo doesn't change the cc */ /* Parameter registers are ignored, but may not be 0 */ "0: .insn rrf,%[opc] << 16,2,2,2,0\n" " brc 1,0b\n" /* handle partial completion */ @@ -157,6 +161,12 @@ static inline int cpacf_query(unsigned int opcode, unsigned int func) if (!test_facility(17)) /* check for MSA */ return 0; break; + case CPACF_PCKMO: + if (!test_facility(76)) /* check for MSA3 */ + return 0; + break; + case CPACF_KMF: + case CPACF_KMO: case CPACF_PCC: case CPACF_KMCTR: if (!test_facility(77)) /* check for MSA4 */ From 0a763c780b7cb830c250d00ead975778ab948f40 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 18 May 2016 16:03:47 +0200 Subject: [PATCH 022/302] KVM: s390: interface to query and configure cpu subfunctions We have certain instructions that indicate available subfunctions via a query subfunction (crypto functions and ptff), or via a test bit function (plo). By exposing these "subfunction blocks" to user space, we allow user space to 1) query available subfunctions and make sure subfunctions won't get lost during migration - e.g. properly indicate them via a CPU model 2) change the subfunctions to be reported to the guest (even adding unavailable ones) This mechanism works just like the way we indicate the stfl(e) list to user space. This way, user space could even emulate some subfunctions in QEMU in the future. If this is ever applicable, we have to make sure later on, that unsupported subfunctions result in an intercept to QEMU. Please note that support to indicate them to the guest is still missing and requires hardware support. Usually, the IBC takes already care of these subfunctions for migration safety. QEMU should make sure to always set these bits properly according to the machine generation to be emulated. Available subfunctions are only valid in combination with STFLE bits retrieved via KVM_S390_VM_CPU_MACHINE and enabled via KVM_S390_VM_CPU_PROCESSOR. If the applicable bits are available, the indicated subfunctions are guaranteed to be correct. Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- Documentation/virtual/kvm/devices/vm.txt | 57 +++++++++++++++ arch/s390/include/uapi/asm/kvm.h | 20 ++++++ arch/s390/kvm/kvm-s390.c | 89 ++++++++++++++++++++++++ 3 files changed, 166 insertions(+) diff --git a/Documentation/virtual/kvm/devices/vm.txt b/Documentation/virtual/kvm/devices/vm.txt index 0ed6808b9965..8a458f42ded2 100644 --- a/Documentation/virtual/kvm/devices/vm.txt +++ b/Documentation/virtual/kvm/devices/vm.txt @@ -112,6 +112,63 @@ Returns: -EFAULT if the given address is not accessible from kernel space. -EBUSY if at least one VCPU has already been defined. 0 in case of success. +2.5. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_SUBFUNC (r/o) + +Allows user space to retrieve available cpu subfunctions without any filtering +done by a set IBC. These subfunctions are indicated to the guest VCPU via +query or "test bit" subfunctions and used e.g. by cpacf functions, plo and ptff. + +A subfunction block is only valid if KVM_S390_VM_CPU_MACHINE contains the +STFL(E) bit introducing the affected instruction. If the affected instruction +indicates subfunctions via a "query subfunction", the response block is +contained in the returned struct. If the affected instruction +indicates subfunctions via a "test bit" mechanism, the subfunction codes are +contained in the returned struct in MSB 0 bit numbering. + +struct kvm_s390_vm_cpu_subfunc { + u8 plo[32]; # always valid (ESA/390 feature) + u8 ptff[16]; # valid with TOD-clock steering + u8 kmac[16]; # valid with Message-Security-Assist + u8 kmc[16]; # valid with Message-Security-Assist + u8 km[16]; # valid with Message-Security-Assist + u8 kimd[16]; # valid with Message-Security-Assist + u8 klmd[16]; # valid with Message-Security-Assist + u8 pckmo[16]; # valid with Message-Security-Assist-Extension 3 + u8 kmctr[16]; # valid with Message-Security-Assist-Extension 4 + u8 kmf[16]; # valid with Message-Security-Assist-Extension 4 + u8 kmo[16]; # valid with Message-Security-Assist-Extension 4 + u8 pcc[16]; # valid with Message-Security-Assist-Extension 4 + u8 ppno[16]; # valid with Message-Security-Assist-Extension 5 + u8 reserved[1824]; # reserved for future instructions +}; + +Parameters: address of a buffer to load the subfunction blocks from. +Returns: -EFAULT if the given address is not accessible from kernel space. + 0 in case of success. + +2.6. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_SUBFUNC (r/w) + +Allows user space to retrieve or change cpu subfunctions to be indicated for +all VCPUs of a VM. This attribute will only be available if kernel and +hardware support are in place. + +The kernel uses the configured subfunction blocks for indication to +the guest. A subfunction block will only be used if the associated STFL(E) bit +has not been disabled by user space (so the instruction to be queried is +actually available for the guest). + +As long as no data has been written, a read will fail. The IBC will be used +to determine available subfunctions in this case, this will guarantee backward +compatibility. + +See 2.5. for a description of the parameter struct. + +Parameters: address of a buffer to store/load the subfunction blocks from. +Returns: -EFAULT if the given address is not accessible from kernel space. + -EINVAL when reading, if there was no write yet. + -EBUSY if at least one VCPU has already been defined. + 0 in case of success. + 3. GROUP: KVM_S390_VM_TOD Architectures: s390 diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 789c4e27e294..f0818d70d73d 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -102,6 +102,26 @@ struct kvm_s390_vm_cpu_feat { __u64 feat[16]; }; +#define KVM_S390_VM_CPU_PROCESSOR_SUBFUNC 4 +#define KVM_S390_VM_CPU_MACHINE_SUBFUNC 5 +/* for "test bit" instructions MSB 0 bit ordering, for "query" raw blocks */ +struct kvm_s390_vm_cpu_subfunc { + __u8 plo[32]; /* always */ + __u8 ptff[16]; /* with TOD-clock steering */ + __u8 kmac[16]; /* with MSA */ + __u8 kmc[16]; /* with MSA */ + __u8 km[16]; /* with MSA */ + __u8 kimd[16]; /* with MSA */ + __u8 klmd[16]; /* with MSA */ + __u8 pckmo[16]; /* with MSA3 */ + __u8 kmctr[16]; /* with MSA4 */ + __u8 kmf[16]; /* with MSA4 */ + __u8 kmo[16]; /* with MSA4 */ + __u8 pcc[16]; /* with MSA4 */ + __u8 ppno[16]; /* with MSA5 */ + __u8 reserved[1824]; +}; + /* kvm attributes for crypto */ #define KVM_S390_VM_CRYPTO_ENABLE_AES_KW 0 #define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW 1 diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 2b5c14da3227..f746a35e3950 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -36,6 +36,8 @@ #include #include #include +#include +#include #include "kvm-s390.h" #include "gaccess.h" @@ -135,6 +137,8 @@ unsigned long kvm_s390_fac_list_mask_size(void) /* available cpu features supported by kvm */ static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); +/* available subfunctions indicated via query / "test bit" */ +static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc; static struct gmap_notifier gmap_notifier; debug_info_t *kvm_s390_dbf; @@ -198,8 +202,52 @@ static void allow_cpu_feat(unsigned long nr) set_bit_inv(nr, kvm_s390_available_cpu_feat); } +static inline int plo_test_bit(unsigned char nr) +{ + register unsigned long r0 asm("0") = (unsigned long) nr | 0x100; + int cc = 3; /* subfunction not available */ + + asm volatile( + /* Parameter registers are ignored for "test bit" */ + " plo 0,0,0,0(0)\n" + " ipm %0\n" + " srl %0,28\n" + : "=d" (cc) + : "d" (r0) + : "cc"); + return cc == 0; +} + static void kvm_s390_cpu_feat_init(void) { + int i; + + for (i = 0; i < 256; ++i) { + if (plo_test_bit(i)) + kvm_s390_available_subfunc.plo[i >> 3] |= 0x80 >> (i & 7); + } + + if (test_facility(28)) /* TOD-clock steering */ + etr_ptff(kvm_s390_available_subfunc.ptff, ETR_PTFF_QAF); + + if (test_facility(17)) { /* MSA */ + __cpacf_query(CPACF_KMAC, kvm_s390_available_subfunc.kmac); + __cpacf_query(CPACF_KMC, kvm_s390_available_subfunc.kmc); + __cpacf_query(CPACF_KM, kvm_s390_available_subfunc.km); + __cpacf_query(CPACF_KIMD, kvm_s390_available_subfunc.kimd); + __cpacf_query(CPACF_KLMD, kvm_s390_available_subfunc.klmd); + } + if (test_facility(76)) /* MSA3 */ + __cpacf_query(CPACF_PCKMO, kvm_s390_available_subfunc.pckmo); + if (test_facility(77)) { /* MSA4 */ + __cpacf_query(CPACF_KMCTR, kvm_s390_available_subfunc.kmctr); + __cpacf_query(CPACF_KMF, kvm_s390_available_subfunc.kmf); + __cpacf_query(CPACF_KMO, kvm_s390_available_subfunc.kmo); + __cpacf_query(CPACF_PCC, kvm_s390_available_subfunc.pcc); + } + if (test_facility(57)) /* MSA5 */ + __cpacf_query(CPACF_PPNO, kvm_s390_available_subfunc.ppno); + if (MACHINE_HAS_ESOP) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP); } @@ -717,6 +765,16 @@ static int kvm_s390_set_processor_feat(struct kvm *kvm, return ret; } +static int kvm_s390_set_processor_subfunc(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + /* + * Once supported by kernel + hw, we have to store the subfunctions + * in kvm->arch and remember that user space configured them. + */ + return -ENXIO; +} + static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) { int ret = -ENXIO; @@ -728,6 +786,9 @@ static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_PROCESSOR_FEAT: ret = kvm_s390_set_processor_feat(kvm, attr); break; + case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: + ret = kvm_s390_set_processor_subfunc(kvm, attr); + break; } return ret; } @@ -801,6 +862,25 @@ static int kvm_s390_get_machine_feat(struct kvm *kvm, return 0; } +static int kvm_s390_get_processor_subfunc(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + /* + * Once we can actually configure subfunctions (kernel + hw support), + * we have to check if they were already set by user space, if so copy + * them from kvm->arch. + */ + return -ENXIO; +} + +static int kvm_s390_get_machine_subfunc(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + if (copy_to_user((void __user *)attr->addr, &kvm_s390_available_subfunc, + sizeof(struct kvm_s390_vm_cpu_subfunc))) + return -EFAULT; + return 0; +} static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) { int ret = -ENXIO; @@ -818,6 +898,12 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_MACHINE_FEAT: ret = kvm_s390_get_machine_feat(kvm, attr); break; + case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: + ret = kvm_s390_get_processor_subfunc(kvm, attr); + break; + case KVM_S390_VM_CPU_MACHINE_SUBFUNC: + ret = kvm_s390_get_machine_subfunc(kvm, attr); + break; } return ret; } @@ -903,8 +989,11 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_MACHINE: case KVM_S390_VM_CPU_PROCESSOR_FEAT: case KVM_S390_VM_CPU_MACHINE_FEAT: + case KVM_S390_VM_CPU_MACHINE_SUBFUNC: ret = 0; break; + /* configuring subfunctions is not supported yet */ + case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: default: ret = -ENXIO; break; From 4013ade3fb2fefa021827d675d8bc1d51f4aef93 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Nov 2015 12:49:43 +0100 Subject: [PATCH 023/302] s390/sclp: detect 64-bit-SCAO facility Let's correctly detect that facility, so we can correctly handle its abscence later on. Reviewed-by: Christian Borntraeger Acked-by: Martin Schwidefsky Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/sclp.h | 1 + drivers/s390/char/sclp_early.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index 49736a0d4e0e..521400086e65 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -59,6 +59,7 @@ struct sclp_info { unsigned char has_hvs : 1; unsigned char has_esca : 1; unsigned char has_sief2 : 1; + unsigned char has_64bscao : 1; unsigned int ibc; unsigned int mtid; unsigned int mtid_cp; diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index 0ac520dd1b21..211eb86ae62d 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -114,6 +114,7 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb) sclp.facilities = sccb->facilities; sclp.has_sprp = !!(sccb->fac84 & 0x02); sclp.has_core_type = !!(sccb->fac84 & 0x01); + sclp.has_64bscao = !!(sccb->fac116 & 0x80); sclp.has_esca = !!(sccb->fac116 & 0x08); sclp.has_hvs = !!(sccb->fac119 & 0x80); if (sccb->fac85 & 0x02) From 76a6dd7241ae03c47f44a9605dcd525f31b2124a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Nov 2015 13:33:49 +0100 Subject: [PATCH 024/302] KVM: s390: handle missing 64-bit-SCAO facility Without that facility, we may only use scaol. So fallback to DMA allocation in that case, so we won't overwrite random memory via the SIE. Also disallow ESCA, so we don't have to handle that allocation case. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index f746a35e3950..efb902cdd1d2 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -317,8 +317,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_NR_VCPUS: case KVM_CAP_MAX_VCPUS: - r = sclp.has_esca ? KVM_S390_ESCA_CPU_SLOTS - : KVM_S390_BSCA_CPU_SLOTS; + r = KVM_S390_BSCA_CPU_SLOTS; + if (sclp.has_esca && sclp.has_64bscao) + r = KVM_S390_ESCA_CPU_SLOTS; break; case KVM_CAP_NR_MEMSLOTS: r = KVM_USER_MEM_SLOTS; @@ -1295,6 +1296,7 @@ static void sca_dispose(struct kvm *kvm) int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { + gfp_t alloc_flags = GFP_KERNEL; int i, rc; char debug_name[16]; static unsigned long sca_offset; @@ -1319,8 +1321,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) ratelimit_state_init(&kvm->arch.sthyi_limit, 5 * HZ, 500); kvm->arch.use_esca = 0; /* start with basic SCA */ + if (!sclp.has_64bscao) + alloc_flags |= GFP_DMA; rwlock_init(&kvm->arch.sca_lock); - kvm->arch.sca = (struct bsca_block *) get_zeroed_page(GFP_KERNEL); + kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags); if (!kvm->arch.sca) goto out_err; spin_lock(&kvm_lock); @@ -1567,7 +1571,7 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id) if (id < KVM_S390_BSCA_CPU_SLOTS) return true; - if (!sclp.has_esca) + if (!sclp.has_esca || !sclp.has_64bscao) return false; mutex_lock(&kvm->lock); From b9e28897e6e9f82585ecf6ea45942866ece7d167 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Nov 2015 12:51:52 +0100 Subject: [PATCH 025/302] s390/sclp: detect guest-PER enhancement Let's detect that facility, so we can correctly handle its abscence. Reviewed-by: Christian Borntraeger Acked-by: Martin Schwidefsky Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/sclp.h | 4 +++- drivers/s390/char/sclp_early.c | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index 521400086e65..076f6318b6fa 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -33,7 +33,8 @@ struct sclp_core_entry { u8 : 4; u8 sief2 : 1; u8 : 3; - u8 : 3; + u8 : 2; + u8 gpere : 1; u8 siif : 1; u8 sigpif : 1; u8 : 3; @@ -60,6 +61,7 @@ struct sclp_info { unsigned char has_esca : 1; unsigned char has_sief2 : 1; unsigned char has_64bscao : 1; + unsigned char has_gpere : 1; unsigned int ibc; unsigned int mtid; unsigned int mtid_cp; diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index 211eb86ae62d..a05f2d07ea02 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -146,6 +146,7 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb) sclp.has_siif = cpue->siif; sclp.has_sigpif = cpue->sigpif; sclp.has_sief2 = cpue->sief2; + sclp.has_gpere = cpue->gpere; break; } From 89b5b4de33902a57cb9c8f2d06de4ffbc921de15 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Nov 2015 13:47:13 +0100 Subject: [PATCH 026/302] KVM: s390: guestdbg: signal missing hardware support Without guest-PER enhancement, we can't provide any debugging support. Therefore act like kernel support is missing. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index efb902cdd1d2..e477c8e5b5c1 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2179,6 +2179,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, if (dbg->control & ~VALID_GUESTDBG_FLAGS) return -EINVAL; + if (!sclp.has_gpere) + return -EINVAL; if (dbg->control & KVM_GUESTDBG_ENABLE) { vcpu->guest_debug = dbg->control; From 09be9cb92bb9e799bdbfd3834595bd6b4703b40b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Nov 2015 12:55:35 +0100 Subject: [PATCH 027/302] s390/sclp: detect cmma Let's detect the Collaborative-memory-management-interpretation facility, aka CMM assist, so we can correctly enable cmma later. Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/sclp.h | 1 + drivers/s390/char/sclp_early.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index 076f6318b6fa..fa40ac8056f5 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -62,6 +62,7 @@ struct sclp_info { unsigned char has_sief2 : 1; unsigned char has_64bscao : 1; unsigned char has_gpere : 1; + unsigned char has_cmma : 1; unsigned int ibc; unsigned int mtid; unsigned int mtid_cp; diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index a05f2d07ea02..366e1a46e96d 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -115,6 +115,7 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb) sclp.has_sprp = !!(sccb->fac84 & 0x02); sclp.has_core_type = !!(sccb->fac84 & 0x01); sclp.has_64bscao = !!(sccb->fac116 & 0x80); + sclp.has_cmma = !!(sccb->fac116 & 0x40); sclp.has_esca = !!(sccb->fac116 & 0x08); sclp.has_hvs = !!(sccb->fac119 & 0x80); if (sccb->fac85 & 0x02) From c24cc9c8a6ca798427d3ff46b55df8403361df3e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Nov 2015 13:53:04 +0100 Subject: [PATCH 028/302] KVM: s390: enable CMMA if the interpration is available Now that we can detect if collaborative-memory-management interpretation is available, replace the heuristic by a real hardware detection. Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index e477c8e5b5c1..005e664f6360 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -485,9 +485,8 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att unsigned int idx; switch (attr->attr) { case KVM_S390_VM_MEM_ENABLE_CMMA: - /* enable CMMA only for z10 and later (EDAT_1) */ ret = -EINVAL; - if (!MACHINE_IS_LPAR || !MACHINE_HAS_EDAT1) + if (!sclp.has_cmma) break; ret = -EBUSY; From f9cbd9b02539330ddd349df583fcfc2db8a23b90 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 3 Mar 2016 09:48:47 +0100 Subject: [PATCH 029/302] KVM: s390: provide CMMA attributes only if available Let's not provide the device attribute for cmma enabling and clearing if the hardware doesn't support it. This also helps getting rid of the undocumented return value "-EINVAL" in case CMMA is not available when trying to enable it. Also properly document the meaning of -EINVAL for CMMA clearing. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- Documentation/virtual/kvm/api.txt | 2 ++ Documentation/virtual/kvm/devices/vm.txt | 3 ++- arch/s390/kvm/kvm-s390.c | 7 ++++++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index a4482cce4bae..4aac3e51bf9f 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2520,6 +2520,7 @@ Parameters: struct kvm_device_attr Returns: 0 on success, -1 on error Errors: ENXIO: The group or attribute is unknown/unsupported for this device + or hardware support is missing. EPERM: The attribute cannot (currently) be accessed this way (e.g. read-only attribute, or attribute that only makes sense when the device is in a different state) @@ -2547,6 +2548,7 @@ Parameters: struct kvm_device_attr Returns: 0 on success, -1 on error Errors: ENXIO: The group or attribute is unknown/unsupported for this device + or hardware support is missing. Tests whether a device supports a particular attribute. A successful return indicates the attribute is implemented. It does not necessarily diff --git a/Documentation/virtual/kvm/devices/vm.txt b/Documentation/virtual/kvm/devices/vm.txt index 8a458f42ded2..b6cda49f2ba4 100644 --- a/Documentation/virtual/kvm/devices/vm.txt +++ b/Documentation/virtual/kvm/devices/vm.txt @@ -20,7 +20,8 @@ Enables Collaborative Memory Management Assist (CMMA) for the virtual machine. 1.2. ATTRIBUTE: KVM_S390_VM_MEM_CLR_CMMA Parameters: none -Returns: 0 +Returns: -EINVAL if CMMA was not enabled + 0 otherwise Clear the CMMA status for all guest pages, so any pages the guest marked as unused are again used any may not be reclaimed by the host. diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 005e664f6360..f695c6e08337 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -485,7 +485,7 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att unsigned int idx; switch (attr->attr) { case KVM_S390_VM_MEM_ENABLE_CMMA: - ret = -EINVAL; + ret = -ENXIO; if (!sclp.has_cmma) break; @@ -499,6 +499,9 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att mutex_unlock(&kvm->lock); break; case KVM_S390_VM_MEM_CLR_CMMA: + ret = -ENXIO; + if (!sclp.has_cmma) + break; ret = -EINVAL; if (!kvm->arch.use_cmma) break; @@ -964,6 +967,8 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) switch (attr->attr) { case KVM_S390_VM_MEM_ENABLE_CMMA: case KVM_S390_VM_MEM_CLR_CMMA: + ret = sclp.has_cmma ? 0 : -ENXIO; + break; case KVM_S390_VM_MEM_LIMIT_SIZE: ret = 0; break; From 5236c751da5e6ccfda4e5d53690a37dfb456997b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Nov 2015 12:53:46 +0100 Subject: [PATCH 030/302] s390/sclp: detect guest-storage-limit-suppression Let's detect that facility. Reviewed-by: Christian Borntraeger Acked-by: Martin Schwidefsky Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/sclp.h | 1 + drivers/s390/char/sclp_early.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index fa40ac8056f5..e1450dd9d932 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -63,6 +63,7 @@ struct sclp_info { unsigned char has_64bscao : 1; unsigned char has_gpere : 1; unsigned char has_cmma : 1; + unsigned char has_gsls : 1; unsigned int ibc; unsigned int mtid; unsigned int mtid_cp; diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index 366e1a46e96d..99fce6b784bf 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -114,6 +114,7 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb) sclp.facilities = sccb->facilities; sclp.has_sprp = !!(sccb->fac84 & 0x02); sclp.has_core_type = !!(sccb->fac84 & 0x01); + sclp.has_gsls = !!(sccb->fac85 & 0x80); sclp.has_64bscao = !!(sccb->fac116 & 0x80); sclp.has_cmma = !!(sccb->fac116 & 0x40); sclp.has_esca = !!(sccb->fac116 & 0x08); From efed110446226c725268a1f980806d799990a979 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 16 Apr 2015 12:32:41 +0200 Subject: [PATCH 031/302] KVM: s390: handle missing guest-storage-limit-suppression If guest-storage-limit-suppression is not available, we would for now have a valid guest address space with size 0. So let's simply set the origin to 0 and the limit to hamax. Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 4 +++- arch/s390/kvm/kvm-s390.c | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index b2a83a0ce42c..9eed5c18a61c 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -186,7 +186,9 @@ struct kvm_s390_sie_block { __u32 scaol; /* 0x0064 */ __u8 reserved68[4]; /* 0x0068 */ __u32 todpr; /* 0x006c */ - __u8 reserved70[32]; /* 0x0070 */ + __u8 reserved70[16]; /* 0x0070 */ + __u64 mso; /* 0x0080 */ + __u64 msl; /* 0x0088 */ psw_t gpsw; /* 0x0090 */ __u64 gg14; /* 0x00a0 */ __u64 gg15; /* 0x00a8 */ diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index f695c6e08337..2a239554eb89 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1897,6 +1897,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, vcpu->arch.sie_block = &sie_page->sie_block; vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb; + /* the real guest size will always be smaller than msl */ + vcpu->arch.sie_block->mso = 0; + vcpu->arch.sie_block->msl = sclp.hamax; + vcpu->arch.sie_block->icpua = id; spin_lock_init(&vcpu->arch.local_int.lock); vcpu->arch.local_int.float_int = &kvm->arch.float_int; From 72cd82b9e9d075713367ad840c2a9b52b4cd447d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Nov 2015 12:59:03 +0100 Subject: [PATCH 032/302] s390/sclp: detect intervention bypass facility Let's detect if we have the intervention bypass facility installed. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/sclp.h | 7 ++++++- drivers/s390/char/sclp_early.c | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index e1450dd9d932..ef1f427ad4d1 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -38,7 +38,11 @@ struct sclp_core_entry { u8 siif : 1; u8 sigpif : 1; u8 : 3; - u8 reserved2[10]; + u8 reserved2[3]; + u8 : 2; + u8 ib : 1; + u8 : 5; + u8 reserved3[6]; u8 type; u8 reserved1; } __attribute__((packed)); @@ -64,6 +68,7 @@ struct sclp_info { unsigned char has_gpere : 1; unsigned char has_cmma : 1; unsigned char has_gsls : 1; + unsigned char has_ib : 1; unsigned int ibc; unsigned int mtid; unsigned int mtid_cp; diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index 99fce6b784bf..2240b615131e 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -149,6 +149,7 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb) sclp.has_sigpif = cpue->sigpif; sclp.has_sief2 = cpue->sief2; sclp.has_gpere = cpue->gpere; + sclp.has_ib = cpue->ib; break; } From 11ad65b79e8c27cdafe404e33938da270a55858a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 4 Apr 2016 15:46:26 +0200 Subject: [PATCH 033/302] KVM: s390: enable ib only if available Let's enable intervention bypass only if the facility is acutally available. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 2a239554eb89..340fb405bc23 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1845,7 +1845,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) if (test_kvm_facility(vcpu->kvm, 8)) vcpu->arch.sie_block->ecb2 |= 0x08; - vcpu->arch.sie_block->eca = 0xC1002000U; + vcpu->arch.sie_block->eca = 0x81002000U; + if (sclp.has_ib) + vcpu->arch.sie_block->eca |= 0x40000000U; if (sclp.has_siif) vcpu->arch.sie_block->eca |= 1; if (sclp.has_sigpif) From 4a5c3e08271216891ce1b5315cec3dcadbd01cd4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Nov 2015 13:00:23 +0100 Subject: [PATCH 034/302] s390/sclp: detect conditional-external-interception facility Let's detect if we have that facility. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/sclp.h | 4 +++- drivers/s390/char/sclp_early.c | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index ef1f427ad4d1..c91ad198a59c 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -41,7 +41,8 @@ struct sclp_core_entry { u8 reserved2[3]; u8 : 2; u8 ib : 1; - u8 : 5; + u8 cei : 1; + u8 : 4; u8 reserved3[6]; u8 type; u8 reserved1; @@ -69,6 +70,7 @@ struct sclp_info { unsigned char has_cmma : 1; unsigned char has_gsls : 1; unsigned char has_ib : 1; + unsigned char has_cei : 1; unsigned int ibc; unsigned int mtid; unsigned int mtid_cp; diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index 2240b615131e..4b330fbd4f08 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -150,6 +150,7 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb) sclp.has_sief2 = cpue->sief2; sclp.has_gpere = cpue->gpere; sclp.has_ib = cpue->ib; + sclp.has_cei = cpue->cei; break; } From 48ee7d3a7f8f3ca90dfc5e1103e68c0044051acc Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 4 Apr 2016 15:49:34 +0200 Subject: [PATCH 035/302] KVM: s390: enable cei only if available Let's only enable conditional-external-interruption if the facility is actually available. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 340fb405bc23..1a239a6748fe 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1845,7 +1845,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) if (test_kvm_facility(vcpu->kvm, 8)) vcpu->arch.sie_block->ecb2 |= 0x08; - vcpu->arch.sie_block->eca = 0x81002000U; + vcpu->arch.sie_block->eca = 0x1002000U; + if (sclp.has_cei) + vcpu->arch.sie_block->eca |= 0x80000000U; if (sclp.has_ib) vcpu->arch.sie_block->eca |= 0x40000000U; if (sclp.has_siif) From a0eb55e6318f1bcfe93b01f0944622f14a6b2977 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Nov 2015 13:02:25 +0100 Subject: [PATCH 036/302] s390/sclp: detect PFMF interpretation facility Let's detect that facility. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/sclp.h | 1 + drivers/s390/char/sclp_early.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index c91ad198a59c..bdb7f22d9ad4 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -71,6 +71,7 @@ struct sclp_info { unsigned char has_gsls : 1; unsigned char has_ib : 1; unsigned char has_cei : 1; + unsigned char has_pfmfi : 1; unsigned int ibc; unsigned int mtid; unsigned int mtid_cp; diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index 4b330fbd4f08..500cbfd83541 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -46,7 +46,8 @@ struct read_info_sccb { u64 rnmax2; /* 104-111 */ u8 _pad_112[116 - 112]; /* 112-115 */ u8 fac116; /* 116 */ - u8 _pad_117[119 - 117]; /* 117-118 */ + u8 fac117; /* 117 */ + u8 _pad_118; /* 118 */ u8 fac119; /* 119 */ u16 hcpua; /* 120-121 */ u8 _pad_122[124 - 122]; /* 122-123 */ @@ -118,6 +119,7 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb) sclp.has_64bscao = !!(sccb->fac116 & 0x80); sclp.has_cmma = !!(sccb->fac116 & 0x40); sclp.has_esca = !!(sccb->fac116 & 0x08); + sclp.has_pfmfi = !!(sccb->fac117 & 0x40); sclp.has_hvs = !!(sccb->fac119 & 0x80); if (sccb->fac85 & 0x02) S390_lowcore.machine_flags |= MACHINE_FLAG_ESOP; From 873b425e4c2fd0ba6617d67a45fbf119b65575b4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 4 Apr 2016 15:53:47 +0200 Subject: [PATCH 037/302] KVM: s390: enable PFMFI only if available Let's enable interpretation of PFMFI only if the facility is actually available. Emulation code still works in case the guest is offered EDAT-1. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 1a239a6748fe..d987eb8af059 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1843,7 +1843,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) if (test_kvm_facility(vcpu->kvm, 50) && test_kvm_facility(vcpu->kvm, 73)) vcpu->arch.sie_block->ecb |= 0x10; - if (test_kvm_facility(vcpu->kvm, 8)) + if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi) vcpu->arch.sie_block->ecb2 |= 0x08; vcpu->arch.sie_block->eca = 0x1002000U; if (sclp.has_cei) From 9c375490fc812ebdf3259ea2566c271d00544fc2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Nov 2015 13:02:52 +0100 Subject: [PATCH 038/302] s390/sclp: detect interlock-and-broadcast-suppression facility Let's detect that facility. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/sclp.h | 1 + drivers/s390/char/sclp_early.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index bdb7f22d9ad4..99a0150d07b9 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -72,6 +72,7 @@ struct sclp_info { unsigned char has_ib : 1; unsigned char has_cei : 1; unsigned char has_pfmfi : 1; + unsigned char has_ibs : 1; unsigned int ibc; unsigned int mtid; unsigned int mtid_cp; diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index 500cbfd83541..d5b873c92ffc 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -120,6 +120,7 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb) sclp.has_cmma = !!(sccb->fac116 & 0x40); sclp.has_esca = !!(sccb->fac116 & 0x08); sclp.has_pfmfi = !!(sccb->fac117 & 0x40); + sclp.has_ibs = !!(sccb->fac117 & 0x20); sclp.has_hvs = !!(sccb->fac119 & 0x80); if (sccb->fac85 & 0x02) S390_lowcore.machine_flags |= MACHINE_FLAG_ESOP; From 09a400e78eaf02d8ab8e836edf864e1025c8e2d7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 4 Apr 2016 15:57:08 +0200 Subject: [PATCH 039/302] KVM: s390: enable ibs only if available Let's enable interlock-and-broadcast suppression only if the facility is actually available. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d987eb8af059..ad93b40bfdc0 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2789,6 +2789,8 @@ static void __disable_ibs_on_all_vcpus(struct kvm *kvm) static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu) { + if (!sclp.has_ibs) + return; kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu); kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu); } From bdab09f3d81c3fac6314012ca0eff1206ea067ab Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 12 Apr 2016 11:07:49 +0200 Subject: [PATCH 040/302] KVM: s390: enable host-protection-interruption only with ESOP host-protection-interruption control was introduced with ESOP. So let's enable it only if we have ESOP and add an explanatory comment why we can live without it. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ad93b40bfdc0..4e764faed524 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1837,7 +1837,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) kvm_s390_vcpu_setup_model(vcpu); - vcpu->arch.sie_block->ecb = 0x02; + /* pgste_set_pte has special handling for !MACHINE_HAS_ESOP */ + if (MACHINE_HAS_ESOP) + vcpu->arch.sie_block->ecb |= 0x02; if (test_kvm_facility(vcpu->kvm, 9)) vcpu->arch.sie_block->ecb |= 0x04; if (test_kvm_facility(vcpu->kvm, 50) && test_kvm_facility(vcpu->kvm, 73)) From f597d24eee2dd9486edaac7a1821f35bc4d349c2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 22 Apr 2016 16:26:49 +0200 Subject: [PATCH 041/302] KVM: s390: turn on tx even without ctx Constrained transactional execution is an addon of transactional execution. Let's enable the assist also if only TX is enabled for the guest. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 4e764faed524..9d0e4d0487f4 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1842,7 +1842,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->ecb |= 0x02; if (test_kvm_facility(vcpu->kvm, 9)) vcpu->arch.sie_block->ecb |= 0x04; - if (test_kvm_facility(vcpu->kvm, 50) && test_kvm_facility(vcpu->kvm, 73)) + if (test_kvm_facility(vcpu->kvm, 73)) vcpu->arch.sie_block->ecb |= 0x10; if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi) From 1bb78d161feae5b613c80eb822059eec60d2a538 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 7 Jun 2016 09:57:08 +0200 Subject: [PATCH 042/302] KVM: s390: provide logging for diagnose 0x500 We might need to debug some virtio things, so better have diagnose 500 logged. Signed-off-by: Christian Borntraeger Acked-by: Cornelia Huck --- arch/s390/kvm/diag.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 1ea4095b67d7..ce865bd4f81d 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -212,6 +212,11 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu) (vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY)) return -EOPNOTSUPP; + VCPU_EVENT(vcpu, 4, "diag 0x500 schid 0x%8.8x queue 0x%x cookie 0x%llx", + (u32) vcpu->run->s.regs.gprs[2], + (u32) vcpu->run->s.regs.gprs[3], + vcpu->run->s.regs.gprs[4]); + /* * The layout is as follows: * - gpr 2 contains the subchannel id (passed as addr) From dcc98ea6146e4da27eee2f3e9983500e9618cc23 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 7 Jun 2016 09:37:17 +0200 Subject: [PATCH 043/302] KVM: s390: fixup I/O interrupt traces We currently have two issues with the I/O interrupt injection logging: 1. All QEMU versions up to 2.6 have a wrong encoding of device numbers etc for the I/O interrupt type, so the inject VM_EVENT will have wrong data. Let's fix this by using the interrupt parameters and not the interrupt type number. 2. We only log in kvm_s390_inject_vm, but not when coming from kvm_s390_reinject_io_int or from flic. Let's move the logging to the common __inject_io function. We also enhance the logging for delivery to match the data. Signed-off-by: Christian Borntraeger Acked-by: Cornelia Huck --- arch/s390/kvm/interrupt.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 5a80af740d3e..d72c4a877622 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -28,9 +28,6 @@ #include "gaccess.h" #include "trace-s390.h" -#define IOINT_SCHID_MASK 0x0000ffff -#define IOINT_SSID_MASK 0x00030000 -#define IOINT_CSSID_MASK 0x03fc0000 #define PFAULT_INIT 0x0600 #define PFAULT_DONE 0x0680 #define VIRTIO_PARAM 0x0d00 @@ -821,7 +818,14 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu, struct kvm_s390_interrupt_info, list); if (inti) { - VCPU_EVENT(vcpu, 4, "deliver: I/O 0x%llx", inti->type); + if (inti->type & KVM_S390_INT_IO_AI_MASK) + VCPU_EVENT(vcpu, 4, "%s", "deliver: I/O (AI)"); + else + VCPU_EVENT(vcpu, 4, "deliver: I/O %x ss %x schid %04x", + inti->io.subchannel_id >> 8, + inti->io.subchannel_id >> 1 & 0x3, + inti->io.subchannel_nr); + vcpu->stat.deliver_io_int++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, @@ -1415,6 +1419,13 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) } fi->counters[FIRQ_CNTR_IO] += 1; + if (inti->type & KVM_S390_INT_IO_AI_MASK) + VM_EVENT(kvm, 4, "%s", "inject: I/O (AI)"); + else + VM_EVENT(kvm, 4, "inject: I/O %x ss %x schid %04x", + inti->io.subchannel_id >> 8, + inti->io.subchannel_id >> 1 & 0x3, + inti->io.subchannel_nr); isc = int_word_to_isc(inti->io.io_int_word); list = &fi->lists[FIRQ_LIST_IO_ISC_0 + isc]; list_add_tail(&inti->list, list); @@ -1531,13 +1542,6 @@ int kvm_s390_inject_vm(struct kvm *kvm, inti->mchk.mcic = s390int->parm64; break; case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: - if (inti->type & KVM_S390_INT_IO_AI_MASK) - VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)"); - else - VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x", - s390int->type & IOINT_CSSID_MASK, - s390int->type & IOINT_SSID_MASK, - s390int->type & IOINT_SCHID_MASK); inti->io.subchannel_id = s390int->parm >> 16; inti->io.subchannel_nr = s390int->parm & 0x0000ffffu; inti->io.io_int_parm = s390int->parm64 >> 32; From c427c42cd612719e8fb8b5891cc9761e7770024e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 10 May 2016 13:51:54 +0200 Subject: [PATCH 044/302] s390/mm: don't drop errors in get_guest_storage_key Commit 1e133ab296f3 ("s390/mm: split arch/s390/mm/pgtable.c") changed the return value of get_guest_storage_key to an unsigned char, resulting in -EFAULT getting interpreted as a valid storage key. Cc: stable@vger.kernel.org # 4.6+ Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/pgtable.h | 2 +- arch/s390/mm/pgtable.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 18d2beb89340..42b968a85863 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -893,7 +893,7 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep); bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address); int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, unsigned char key, bool nq); -unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr); +unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr); /* * Certain architectures need to do special things when PTEs diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 4324b87f9398..2a23ca96f9c2 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -543,7 +543,7 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(set_guest_storage_key); -unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr) +unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr) { unsigned char key; spinlock_t *ptl; From d3ed1ceeace311af9973d17a07a114bfaf0ca1b1 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 8 Mar 2016 11:53:35 +0100 Subject: [PATCH 045/302] s390/mm: set and get guest storage key mmap locking Move the mmap semaphore locking out of set_guest_storage_key and get_guest_storage_key. This makes the two functions more like the other ptep_xxx operations and allows to avoid repeated semaphore operations if multiple keys are read or written. Reviewed-by: David Hildenbrand Reviewed-by: Christian Borntraeger Signed-off-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 26 ++++++++++++++++---------- arch/s390/kvm/priv.c | 7 +++++-- arch/s390/mm/pgtable.c | 15 +++------------ 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 9d0e4d0487f4..d0156d7969e0 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1050,26 +1050,30 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) if (!keys) return -ENOMEM; + down_read(¤t->mm->mmap_sem); for (i = 0; i < args->count; i++) { hva = gfn_to_hva(kvm, args->start_gfn + i); if (kvm_is_error_hva(hva)) { r = -EFAULT; - goto out; + break; } curkey = get_guest_storage_key(current->mm, hva); if (IS_ERR_VALUE(curkey)) { r = curkey; - goto out; + break; } keys[i] = curkey; } + up_read(¤t->mm->mmap_sem); + + if (!r) { + r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys, + sizeof(uint8_t) * args->count); + if (r) + r = -EFAULT; + } - r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys, - sizeof(uint8_t) * args->count); - if (r) - r = -EFAULT; -out: kvfree(keys); return r; } @@ -1106,24 +1110,26 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) if (r) goto out; + down_read(¤t->mm->mmap_sem); for (i = 0; i < args->count; i++) { hva = gfn_to_hva(kvm, args->start_gfn + i); if (kvm_is_error_hva(hva)) { r = -EFAULT; - goto out; + break; } /* Lowest order bit is reserved */ if (keys[i] & 0x01) { r = -EINVAL; - goto out; + break; } r = set_guest_storage_key(current->mm, hva, (unsigned long)keys[i], 0); if (r) - goto out; + break; } + up_read(¤t->mm->mmap_sem); out: kvfree(keys); return r; diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 95916fa7c670..c6deed782c61 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -728,9 +728,12 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) if (rc) return rc; - if (set_guest_storage_key(current->mm, useraddr, + down_read(¤t->mm->mmap_sem); + rc = set_guest_storage_key(current->mm, useraddr, vcpu->run->s.regs.gprs[reg1] & PFMF_KEY, - vcpu->run->s.regs.gprs[reg1] & PFMF_NQ)) + vcpu->run->s.regs.gprs[reg1] & PFMF_NQ); + up_read(¤t->mm->mmap_sem); + if (rc) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); } diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 2a23ca96f9c2..7612a7c3a3a8 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -506,12 +506,9 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, pgste_t old, new; pte_t *ptep; - down_read(&mm->mmap_sem); ptep = get_locked_pte(mm, addr, &ptl); - if (unlikely(!ptep)) { - up_read(&mm->mmap_sem); + if (unlikely(!ptep)) return -EFAULT; - } new = old = pgste_get_lock(ptep); pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT | @@ -538,7 +535,6 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, pgste_set_unlock(ptep, new); pte_unmap_unlock(ptep, ptl); - up_read(&mm->mmap_sem); return 0; } EXPORT_SYMBOL(set_guest_storage_key); @@ -550,14 +546,11 @@ unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr) pgste_t pgste; pte_t *ptep; - down_read(&mm->mmap_sem); ptep = get_locked_pte(mm, addr, &ptl); - if (unlikely(!ptep)) { - up_read(&mm->mmap_sem); + if (unlikely(!ptep)) return -EFAULT; - } - pgste = pgste_get_lock(ptep); + pgste = pgste_get_lock(ptep); if (pte_val(*ptep) & _PAGE_INVALID) { key = (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56; key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56; @@ -572,10 +565,8 @@ unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr) if (pgste_val(pgste) & PGSTE_GC_BIT) key |= _PAGE_CHANGED; } - pgste_set_unlock(ptep, pgste); pte_unmap_unlock(ptep, ptl); - up_read(&mm->mmap_sem); return key; } EXPORT_SYMBOL(get_guest_storage_key); From 8d6037a7b4f21708451d4aec14828f9ebe77b37a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2016 11:15:32 +0200 Subject: [PATCH 046/302] s390/mm: simplify get_guest_storage_key We can safe a few LOC and make that function easier to understand by rewriting existing code. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/mm/pgtable.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 7612a7c3a3a8..4c8d572d59cc 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -551,20 +551,11 @@ unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr) return -EFAULT; pgste = pgste_get_lock(ptep); - if (pte_val(*ptep) & _PAGE_INVALID) { - key = (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56; - key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56; - key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48; - key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48; - } else { + key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; + if (!(pte_val(*ptep) & _PAGE_INVALID)) key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK); - - /* Reflect guest's logical view, not physical */ - if (pgste_val(pgste) & PGSTE_GR_BIT) - key |= _PAGE_REFERENCED; - if (pgste_val(pgste) & PGSTE_GC_BIT) - key |= _PAGE_CHANGED; - } + /* Reflect guest's logical view, not physical */ + key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; pgste_set_unlock(ptep, pgste); pte_unmap_unlock(ptep, ptl); return key; From 154c8c19c35b6da94a623cb793458e203572083d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2016 11:22:34 +0200 Subject: [PATCH 047/302] s390/mm: return key via pointer in get_guest_storage_key Let's just split returning the key and reporting errors. This makes calling code easier and avoids bugs as happened already. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/pgtable.h | 3 ++- arch/s390/kvm/kvm-s390.c | 8 ++------ arch/s390/mm/pgtable.c | 12 ++++++------ 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 42b968a85863..91f0e7b79821 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -893,7 +893,8 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep); bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address); int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, unsigned char key, bool nq); -unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr); +int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char *key); /* * Certain architectures need to do special things when PTEs diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d0156d7969e0..ad166c6698e0 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1029,7 +1029,6 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) { uint8_t *keys; uint64_t hva; - unsigned long curkey; int i, r = 0; if (args->flags != 0) @@ -1058,12 +1057,9 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) break; } - curkey = get_guest_storage_key(current->mm, hva); - if (IS_ERR_VALUE(curkey)) { - r = curkey; + r = get_guest_storage_key(current->mm, hva, &keys[i]); + if (r) break; - } - keys[i] = curkey; } up_read(¤t->mm->mmap_sem); diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 4c8d572d59cc..3e35298758d6 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -539,9 +539,9 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(set_guest_storage_key); -unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr) +int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char *key) { - unsigned char key; spinlock_t *ptl; pgste_t pgste; pte_t *ptep; @@ -551,14 +551,14 @@ unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr) return -EFAULT; pgste = pgste_get_lock(ptep); - key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; + *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; if (!(pte_val(*ptep) & _PAGE_INVALID)) - key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK); + *key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK); /* Reflect guest's logical view, not physical */ - key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; + *key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48; pgste_set_unlock(ptep, pgste); pte_unmap_unlock(ptep, ptl); - return key; + return 0; } EXPORT_SYMBOL(get_guest_storage_key); #endif From fe69eabf8deb85ae8b2958830ea3b2911e332820 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2016 13:08:07 +0200 Subject: [PATCH 048/302] KVM: s390: storage keys fit into a char No need to convert the storage key into an unsigned long, the target function expects a char as argument. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ad166c6698e0..49c60393a15c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1120,8 +1120,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) break; } - r = set_guest_storage_key(current->mm, hva, - (unsigned long)keys[i], 0); + r = set_guest_storage_key(current->mm, hva, keys[i], 0); if (r) break; } From 6164a2e90a5b6c5c32ccfe7a1baff80d603d702d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 13 Apr 2016 10:09:47 +0200 Subject: [PATCH 049/302] KVM: s390: pfmf: fix end address calculation The current calculation is wrong if absolute != real address. Let's just calculate the start address for 4k frames upfront. Otherwise, the calculated end address will be wrong, resulting in wrong memory location/storage keys getting touched. To keep low-address protection working (using the effective address), we have to move the check. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/priv.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index c6deed782c61..bfba98302ca0 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -682,8 +682,15 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; start = kvm_s390_logical_to_effective(vcpu, start); + if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) { + if (kvm_s390_check_low_addr_prot_real(vcpu, start)) + return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); + } + switch (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) { case 0x00000000: + /* only 4k frames specify a real address */ + start = kvm_s390_real_to_abs(vcpu, start); end = (start + (1UL << 12)) & ~((1UL << 12) - 1); break; case 0x00001000: @@ -701,20 +708,11 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); } - if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) { - if (kvm_s390_check_low_addr_prot_real(vcpu, start)) - return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); - } - while (start < end) { - unsigned long useraddr, abs_addr; + unsigned long useraddr; /* Translate guest address to host address */ - if ((vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) == 0) - abs_addr = kvm_s390_real_to_abs(vcpu, start); - else - abs_addr = start; - useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(abs_addr)); + useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start)); if (kvm_is_error_hva(useraddr)) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); From 9a68f0af8cd907452fa6c33343d38cdacff96294 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 13 Apr 2016 12:09:58 +0200 Subject: [PATCH 050/302] KVM: s390: pfmf: MR and MC are ignored without CSSKE These two bits are simply ignored when the conditional-SSKE facility is not installed. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/priv.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index bfba98302ca0..5c926b74d7ca 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -675,10 +675,6 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) !test_kvm_facility(vcpu->kvm, 14)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - /* No support for conditional-SSKE */ - if (vcpu->run->s.regs.gprs[reg1] & (PFMF_MR | PFMF_MC)) - return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; start = kvm_s390_logical_to_effective(vcpu, start); From 2c26d1d23abd9a67d056c95a0823132a71edc477 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 13 Apr 2016 15:47:21 +0200 Subject: [PATCH 051/302] KVM: s390: pfmf: take care of amode when setting reg2 Depending on the addressing mode, we must not overwrite bit 0-31 of the register. In addition, 24 bit and 31 bit have to set certain bits to 0, which is guaranteed by converting the end address to an effective address. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/priv.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 5c926b74d7ca..71fa603034d0 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -733,8 +733,15 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) start += PAGE_SIZE; } - if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) - vcpu->run->s.regs.gprs[reg2] = end; + if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) { + if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_64BIT) { + vcpu->run->s.regs.gprs[reg2] = end; + } else { + vcpu->run->s.regs.gprs[reg2] &= ~0xffffffffUL; + end = kvm_s390_logical_to_effective(vcpu, end); + vcpu->run->s.regs.gprs[reg2] |= end; + } + } return 0; } From 1824c723ac90f9870ebafae4b3b3e5f4b82ffeef Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 10 May 2016 09:43:11 +0200 Subject: [PATCH 052/302] KVM: s390: pfmf: support conditional-sske facility We already indicate that facility but don't implement it in our pfmf interception handler. Let's add a new storage key handling function for conditionally setting the guest storage key. As we will reuse this function later on, let's directly implement returning the old key via parameter and indicating if any change happened via rc. Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/pgtable.h | 3 +++ arch/s390/kvm/priv.c | 18 ++++++++++++++---- arch/s390/mm/pgtable.c | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 4 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 91f0e7b79821..2f6702e27db9 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -893,6 +893,9 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep); bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address); int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, unsigned char key, bool nq); +int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char key, unsigned char *oldkey, + bool nq, bool mr, bool mc); int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, unsigned char *key); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 71fa603034d0..752a1ac1aab6 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -654,8 +654,10 @@ static int handle_epsw(struct kvm_vcpu *vcpu) static int handle_pfmf(struct kvm_vcpu *vcpu) { + bool mr = false, mc = false, nq; int reg1, reg2; unsigned long start, end; + unsigned char key; vcpu->stat.instruction_pfmf++; @@ -675,6 +677,15 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) !test_kvm_facility(vcpu->kvm, 14)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + /* Only provide conditional-SSKE support if enabled for the guest */ + if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK && + test_kvm_facility(vcpu->kvm, 10)) { + mr = vcpu->run->s.regs.gprs[reg1] & PFMF_MR; + mc = vcpu->run->s.regs.gprs[reg1] & PFMF_MC; + } + + nq = vcpu->run->s.regs.gprs[reg1] & PFMF_NQ; + key = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY; start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; start = kvm_s390_logical_to_effective(vcpu, start); @@ -723,11 +734,10 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) if (rc) return rc; down_read(¤t->mm->mmap_sem); - rc = set_guest_storage_key(current->mm, useraddr, - vcpu->run->s.regs.gprs[reg1] & PFMF_KEY, - vcpu->run->s.regs.gprs[reg1] & PFMF_NQ); + rc = cond_set_guest_storage_key(current->mm, useraddr, + key, NULL, nq, mr, mc); up_read(¤t->mm->mmap_sem); - if (rc) + if (rc < 0) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); } diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 3e35298758d6..e791e8b27fd2 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -539,6 +539,39 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(set_guest_storage_key); +/** + * Conditionally set a guest storage key (handling csske). + * oldkey will be updated when either mr or mc is set and a pointer is given. + * + * Returns 0 if a guests storage key update wasn't necessary, 1 if the guest + * storage key was updated and -EFAULT on access errors. + */ +int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, + unsigned char key, unsigned char *oldkey, + bool nq, bool mr, bool mc) +{ + unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT; + int rc; + + /* we can drop the pgste lock between getting and setting the key */ + if (mr | mc) { + rc = get_guest_storage_key(current->mm, addr, &tmp); + if (rc) + return rc; + if (oldkey) + *oldkey = tmp; + if (!mr) + mask |= _PAGE_REFERENCED; + if (!mc) + mask |= _PAGE_CHANGED; + if (!((tmp ^ key) & mask)) + return 0; + } + rc = set_guest_storage_key(current->mm, addr, key, nq); + return rc < 0 ? rc : 1; +} +EXPORT_SYMBOL(cond_set_guest_storage_key); + int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, unsigned char *key) { From 695be0e7a24a8875c347437566f2c44ba673580b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 12 May 2016 14:07:05 +0200 Subject: [PATCH 053/302] KVM: s390: pfmf: handle address overflows In theory, end could always end up being < start, if overflowing to 0. Although very unlikely for now, let's just fix it. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/priv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 752a1ac1aab6..b8327b8fdb8f 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -715,7 +715,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); } - while (start < end) { + while (start != end) { unsigned long useraddr; /* Translate guest address to host address */ From 238614515287c9400727e4cd7aa958649dcbf05f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Nov 2015 12:56:43 +0100 Subject: [PATCH 054/302] s390/sclp: detect storage-key facility Let's correctly detect that facility. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/sclp.h | 4 +++- drivers/s390/char/sclp_early.c | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index 99a0150d07b9..2ad9c204b1a2 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -32,7 +32,8 @@ struct sclp_core_entry { u8 reserved0; u8 : 4; u8 sief2 : 1; - u8 : 3; + u8 skey : 1; + u8 : 2; u8 : 2; u8 gpere : 1; u8 siif : 1; @@ -73,6 +74,7 @@ struct sclp_info { unsigned char has_cei : 1; unsigned char has_pfmfi : 1; unsigned char has_ibs : 1; + unsigned char has_skey : 1; unsigned int ibc; unsigned int mtid; unsigned int mtid_cp; diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index d5b873c92ffc..c71df0c7dedc 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -154,6 +154,7 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb) sclp.has_gpere = cpue->gpere; sclp.has_ib = cpue->ib; sclp.has_cei = cpue->cei; + sclp.has_skey = cpue->skey; break; } From 11ddcd41bce5c2394b0390584236afdd13656998 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 10 May 2016 09:40:09 +0200 Subject: [PATCH 055/302] KVM: s390: trace and count all skey intercepts Let's trace and count all skey handling operations, even if lazy skey handling was already activated. Also, don't enable lazy skey handling if anything went wrong while enabling skey handling for the SIE. Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/priv.c | 13 ++++++++----- arch/s390/kvm/trace.h | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index b8327b8fdb8f..6745c2a602c3 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -152,24 +152,27 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu) static int __skey_check_enable(struct kvm_vcpu *vcpu) { int rc = 0; + + trace_kvm_s390_skey_related_inst(vcpu); if (!(vcpu->arch.sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE))) return rc; rc = s390_enable_skey(); - VCPU_EVENT(vcpu, 3, "%s", "enabling storage keys for guest"); - trace_kvm_s390_skey_related_inst(vcpu); - vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE); + VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc); + if (!rc) + vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE); return rc; } static int handle_skey(struct kvm_vcpu *vcpu) { - int rc = __skey_check_enable(vcpu); + int rc; + vcpu->stat.instruction_storage_key++; + rc = __skey_check_enable(vcpu); if (rc) return rc; - vcpu->stat.instruction_storage_key++; if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h index 1c4586b367a4..4fc9d4e5be89 100644 --- a/arch/s390/kvm/trace.h +++ b/arch/s390/kvm/trace.h @@ -41,7 +41,7 @@ TRACE_EVENT(kvm_s390_skey_related_inst, TP_fast_assign( VCPU_ASSIGN_COMMON ), - VCPU_TP_PRINTK("%s", "first instruction related to skeys on vcpu") + VCPU_TP_PRINTK("%s", "storage key related instruction") ); TRACE_EVENT(kvm_s390_major_guest_pfault, From a7e19ab55ffdd82f1a8d12694b9a0c0beeef534c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 10 May 2016 09:50:21 +0200 Subject: [PATCH 056/302] KVM: s390: handle missing storage-key facility Without the storage-key facility, SIE won't interpret SSKE, ISKE and RRBE for us. So let's add proper interception handlers that will be called if lazy sske cannot be enabled. Reviewed-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/page.h | 7 +- arch/s390/include/asm/pgtable.h | 1 + arch/s390/kvm/priv.c | 150 ++++++++++++++++++++++++++++++-- arch/s390/mm/pgtable.c | 37 ++++++++ 4 files changed, 184 insertions(+), 11 deletions(-) diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 53eacbd4f09b..f874e7d51c19 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -109,13 +109,14 @@ static inline unsigned char page_get_storage_key(unsigned long addr) static inline int page_reset_referenced(unsigned long addr) { - unsigned int ipm; + int cc; asm volatile( " rrbe 0,%1\n" " ipm %0\n" - : "=d" (ipm) : "a" (addr) : "cc"); - return !!(ipm & 0x20000000); + " srl %0,28\n" + : "=d" (cc) : "a" (addr) : "cc"); + return cc; } /* Bits int the storage key */ diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 2f6702e27db9..9951e7e59756 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -896,6 +896,7 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, unsigned char key, unsigned char *oldkey, bool nq, bool mr, bool mc); +int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr); int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, unsigned char *key); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 6745c2a602c3..3db3be139992 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "gaccess.h" #include "kvm-s390.h" #include "trace.h" @@ -164,8 +165,7 @@ static int __skey_check_enable(struct kvm_vcpu *vcpu) return rc; } - -static int handle_skey(struct kvm_vcpu *vcpu) +static int try_handle_skey(struct kvm_vcpu *vcpu) { int rc; @@ -173,12 +173,146 @@ static int handle_skey(struct kvm_vcpu *vcpu) rc = __skey_check_enable(vcpu); if (rc) return rc; - + if (sclp.has_skey) { + /* with storage-key facility, SIE interprets it for us */ + kvm_s390_retry_instr(vcpu); + VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation"); + return -EAGAIN; + } if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + return 0; +} - kvm_s390_retry_instr(vcpu); - VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation"); +static int handle_iske(struct kvm_vcpu *vcpu) +{ + unsigned long addr; + unsigned char key; + int reg1, reg2; + int rc; + + rc = try_handle_skey(vcpu); + if (rc) + return rc != -EAGAIN ? rc : 0; + + kvm_s390_get_regs_rre(vcpu, ®1, ®2); + + addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; + addr = kvm_s390_logical_to_effective(vcpu, addr); + addr = kvm_s390_real_to_abs(vcpu, addr); + addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr)); + if (kvm_is_error_hva(addr)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + down_read(¤t->mm->mmap_sem); + rc = get_guest_storage_key(current->mm, addr, &key); + up_read(¤t->mm->mmap_sem); + if (rc) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + vcpu->run->s.regs.gprs[reg1] &= ~0xff; + vcpu->run->s.regs.gprs[reg1] |= key; + return 0; +} + +static int handle_rrbe(struct kvm_vcpu *vcpu) +{ + unsigned long addr; + int reg1, reg2; + int rc; + + rc = try_handle_skey(vcpu); + if (rc) + return rc != -EAGAIN ? rc : 0; + + kvm_s390_get_regs_rre(vcpu, ®1, ®2); + + addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; + addr = kvm_s390_logical_to_effective(vcpu, addr); + addr = kvm_s390_real_to_abs(vcpu, addr); + addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr)); + if (kvm_is_error_hva(addr)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + down_read(¤t->mm->mmap_sem); + rc = reset_guest_reference_bit(current->mm, addr); + up_read(¤t->mm->mmap_sem); + if (rc < 0) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + kvm_s390_set_psw_cc(vcpu, rc); + return 0; +} + +#define SSKE_NQ 0x8 +#define SSKE_MR 0x4 +#define SSKE_MC 0x2 +#define SSKE_MB 0x1 +static int handle_sske(struct kvm_vcpu *vcpu) +{ + unsigned char m3 = vcpu->arch.sie_block->ipb >> 28; + unsigned long start, end; + unsigned char key, oldkey; + int reg1, reg2; + int rc; + + rc = try_handle_skey(vcpu); + if (rc) + return rc != -EAGAIN ? rc : 0; + + if (!test_kvm_facility(vcpu->kvm, 8)) + m3 &= ~SSKE_MB; + if (!test_kvm_facility(vcpu->kvm, 10)) + m3 &= ~(SSKE_MC | SSKE_MR); + if (!test_kvm_facility(vcpu->kvm, 14)) + m3 &= ~SSKE_NQ; + + kvm_s390_get_regs_rre(vcpu, ®1, ®2); + + key = vcpu->run->s.regs.gprs[reg1] & 0xfe; + start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; + start = kvm_s390_logical_to_effective(vcpu, start); + if (m3 & SSKE_MB) { + /* start already designates an absolute address */ + end = (start + (1UL << 20)) & ~((1UL << 20) - 1); + } else { + start = kvm_s390_real_to_abs(vcpu, start); + end = start + PAGE_SIZE; + } + + while (start != end) { + unsigned long addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start)); + + if (kvm_is_error_hva(addr)) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + down_read(¤t->mm->mmap_sem); + rc = cond_set_guest_storage_key(current->mm, addr, key, &oldkey, + m3 & SSKE_NQ, m3 & SSKE_MR, + m3 & SSKE_MC); + up_read(¤t->mm->mmap_sem); + if (rc < 0) + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + start += PAGE_SIZE; + }; + + if (m3 & (SSKE_MC | SSKE_MR)) { + if (m3 & SSKE_MB) { + /* skey in reg1 is unpredictable */ + kvm_s390_set_psw_cc(vcpu, 3); + } else { + kvm_s390_set_psw_cc(vcpu, rc); + vcpu->run->s.regs.gprs[reg1] &= ~0xff00UL; + vcpu->run->s.regs.gprs[reg1] |= (u64) oldkey << 8; + } + } + if (m3 & SSKE_MB) { + if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_64BIT) + vcpu->run->s.regs.gprs[reg2] &= ~PAGE_MASK; + else + vcpu->run->s.regs.gprs[reg2] &= ~0xfffff000UL; + end = kvm_s390_logical_to_effective(vcpu, end); + vcpu->run->s.regs.gprs[reg2] |= end; + } return 0; } @@ -586,9 +720,9 @@ static const intercept_handler_t b2_handlers[256] = { [0x11] = handle_store_prefix, [0x12] = handle_store_cpu_address, [0x21] = handle_ipte_interlock, - [0x29] = handle_skey, - [0x2a] = handle_skey, - [0x2b] = handle_skey, + [0x29] = handle_iske, + [0x2a] = handle_rrbe, + [0x2b] = handle_sske, [0x2c] = handle_test_block, [0x30] = handle_io_inst, [0x31] = handle_io_inst, diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index e791e8b27fd2..fa286d0c0f2d 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -572,6 +572,43 @@ int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(cond_set_guest_storage_key); +/** + * Reset a guest reference bit (rrbe), returning the reference and changed bit. + * + * Returns < 0 in case of error, otherwise the cc to be reported to the guest. + */ +int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) +{ + spinlock_t *ptl; + pgste_t old, new; + pte_t *ptep; + int cc = 0; + + ptep = get_locked_pte(mm, addr, &ptl); + if (unlikely(!ptep)) + return -EFAULT; + + new = old = pgste_get_lock(ptep); + /* Reset guest reference bit only */ + pgste_val(new) &= ~PGSTE_GR_BIT; + + if (!(pte_val(*ptep) & _PAGE_INVALID)) { + cc = page_reset_referenced(pte_val(*ptep) & PAGE_MASK); + /* Merge real referenced bit into host-set */ + pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT; + } + /* Reflect guest's logical view, not physical */ + cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49; + /* Changing the guest storage key is considered a change of the page */ + if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT) + pgste_val(new) |= PGSTE_UC_BIT; + + pgste_set_unlock(ptep, new); + pte_unmap_unlock(ptep, ptl); + return 0; +} +EXPORT_SYMBOL(reset_guest_reference_bit); + int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, unsigned char *key) { From d40dd9e8da02a9905dea2329c0a8404ab8436622 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 9 Jun 2016 14:19:04 +0100 Subject: [PATCH 057/302] MIPS: KVM: Drop unused guest_inst from kvm_vcpu_arch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MIPS kvm_vcpu_arch::guest_inst isn't used, so drop it from the struct and drop its asm-offsets definition. Signed-off-by: James Hogan Cc: Ralf Baechle Cc: Paolo Bonzini Cc: Radim Krčmář Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 1 - arch/mips/kernel/asm-offsets.c | 2 -- 2 files changed, 3 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 36a391d289aa..b310bb348443 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -347,7 +347,6 @@ struct kvm_vcpu_arch { unsigned long host_cp0_cause; unsigned long host_cp0_epc; unsigned long host_cp0_entryhi; - uint32_t guest_inst; /* GPRS */ unsigned long gprs[32]; diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index 1ea973b2abb1..4d96a9033f46 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -366,8 +366,6 @@ void output_kvm_defines(void) OFFSET(VCPU_HOST_EPC, kvm_vcpu_arch, host_cp0_epc); OFFSET(VCPU_HOST_ENTRYHI, kvm_vcpu_arch, host_cp0_entryhi); - OFFSET(VCPU_GUEST_INST, kvm_vcpu_arch, guest_inst); - OFFSET(VCPU_R0, kvm_vcpu_arch, gprs[0]); OFFSET(VCPU_R1, kvm_vcpu_arch, gprs[1]); OFFSET(VCPU_R2, kvm_vcpu_arch, gprs[2]); From e4e94c0fc8d66975f0822c52d04b366c6250dc64 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 9 Jun 2016 14:19:05 +0100 Subject: [PATCH 058/302] MIPS: KVM: Drop unused host_cp0_entryhi MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The host EntryHi in the KVM VCPU context is virtually unused. It gets stored on exceptions, but only ever used in a kvm_debug() when a TLB miss occurs. Drop it entirely, removing that information from the kvm_debug output. Signed-off-by: James Hogan Cc: Ralf Baechle Cc: Paolo Bonzini Cc: Radim Krčmář Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 1 - arch/mips/kernel/asm-offsets.c | 1 - arch/mips/kvm/emulate.c | 5 ++--- arch/mips/kvm/locore.S | 3 --- 4 files changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index b310bb348443..cbcedd7a684b 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -346,7 +346,6 @@ struct kvm_vcpu_arch { unsigned long host_cp0_badvaddr; unsigned long host_cp0_cause; unsigned long host_cp0_epc; - unsigned long host_cp0_entryhi; /* GPRS */ unsigned long gprs[32]; diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index 4d96a9033f46..420808899c70 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -364,7 +364,6 @@ void output_kvm_defines(void) OFFSET(VCPU_HOST_CP0_BADVADDR, kvm_vcpu_arch, host_cp0_badvaddr); OFFSET(VCPU_HOST_CP0_CAUSE, kvm_vcpu_arch, host_cp0_cause); OFFSET(VCPU_HOST_EPC, kvm_vcpu_arch, host_cp0_epc); - OFFSET(VCPU_HOST_ENTRYHI, kvm_vcpu_arch, host_cp0_entryhi); OFFSET(VCPU_R0, kvm_vcpu_arch, gprs[0]); OFFSET(VCPU_R1, kvm_vcpu_arch, gprs[1]); diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 645c8a1982a7..2836668d63fc 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1634,7 +1634,6 @@ enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc, (cop0) & KVM_ENTRYHI_ASID)); if (index < 0) { - vcpu->arch.host_cp0_entryhi = (va & VPN2_MASK); vcpu->arch.host_cp0_badvaddr = va; vcpu->arch.pc = curr_pc; er = kvm_mips_emulate_tlbmiss_ld(cause, NULL, run, @@ -2576,8 +2575,8 @@ enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause, unsigned long va = vcpu->arch.host_cp0_badvaddr; int index; - kvm_debug("kvm_mips_handle_tlbmiss: badvaddr: %#lx, entryhi: %#lx\n", - vcpu->arch.host_cp0_badvaddr, vcpu->arch.host_cp0_entryhi); + kvm_debug("kvm_mips_handle_tlbmiss: badvaddr: %#lx\n", + vcpu->arch.host_cp0_badvaddr); /* * KVM would not have got the exception if this entry was valid in the diff --git a/arch/mips/kvm/locore.S b/arch/mips/kvm/locore.S index 828fcfc1cd7f..5ad2d507b125 100644 --- a/arch/mips/kvm/locore.S +++ b/arch/mips/kvm/locore.S @@ -308,9 +308,6 @@ NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra) mfc0 k0, CP0_CAUSE LONG_S k0, VCPU_HOST_CP0_CAUSE(k1) - mfc0 k0, CP0_ENTRYHI - LONG_S k0, VCPU_HOST_ENTRYHI(k1) - /* Now restore the host state just enough to run the handlers */ /* Switch EBASE to the one used by Linux */ From 2193c713799d90e616d8a2d814c15b69dfaa24e1 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 9 Jun 2016 14:19:06 +0100 Subject: [PATCH 059/302] MIPS: KVM: Drop unused kvm_mips_sync_icache() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function kvm_mips_sync_icache() is unused, so lets remove it. Signed-off-by: James Hogan Cc: Ralf Baechle Cc: Paolo Bonzini Cc: Radim Krčmář Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/emulate.c | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 2836668d63fc..6c2adcfd7faf 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1529,32 +1529,6 @@ enum emulation_result kvm_mips_emulate_load(uint32_t inst, uint32_t cause, return er; } -int kvm_mips_sync_icache(unsigned long va, struct kvm_vcpu *vcpu) -{ - unsigned long offset = (va & ~PAGE_MASK); - struct kvm *kvm = vcpu->kvm; - unsigned long pa; - gfn_t gfn; - kvm_pfn_t pfn; - - gfn = va >> PAGE_SHIFT; - - if (gfn >= kvm->arch.guest_pmap_npages) { - kvm_err("%s: Invalid gfn: %#llx\n", __func__, gfn); - kvm_mips_dump_host_tlbs(); - kvm_arch_vcpu_dump_regs(vcpu); - return -1; - } - pfn = kvm->arch.guest_pmap[gfn]; - pa = (pfn << PAGE_SHIFT) | offset; - - kvm_debug("%s: va: %#lx, unmapped: %#x\n", __func__, va, - CKSEG0ADDR(pa)); - - local_flush_icache_range(CKSEG0ADDR(pa), 32); - return 0; -} - enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc, uint32_t cause, struct kvm_run *run, From bdb7ed8608f8f1944414abaffdecf3c997dfc41e Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 9 Jun 2016 14:19:07 +0100 Subject: [PATCH 060/302] MIPS: KVM: Convert headers to kernel sized types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert the MIPS kvm_host.h structs, function declaration prototypes and associated definition prototypes to use standard kernel sized types (e.g. u32) instead of inttypes.h style ones (e.g. uint32_t). Signed-off-by: James Hogan Cc: Ralf Baechle Cc: Paolo Bonzini Cc: Radim Krčmář Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 97 ++++++++++++++++---------------- arch/mips/kvm/dyntrans.c | 8 +-- arch/mips/kvm/emulate.c | 67 +++++++++++----------- arch/mips/kvm/interrupt.c | 10 ++-- arch/mips/kvm/interrupt.h | 10 ++-- arch/mips/kvm/tlb.c | 8 +-- 6 files changed, 98 insertions(+), 102 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index cbcedd7a684b..9250b59acd18 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -368,11 +368,11 @@ struct kvm_vcpu_arch { struct hrtimer comparecount_timer; /* Count timer control KVM register */ - uint32_t count_ctl; + u32 count_ctl; /* Count bias from the raw time */ - uint32_t count_bias; + u32 count_bias; /* Frequency of timer in Hz */ - uint32_t count_hz; + u32 count_hz; /* Dynamic nanosecond bias (multiple of count_period) to avoid overflow */ s64 count_dyn_bias; /* Resume time */ @@ -395,8 +395,8 @@ struct kvm_vcpu_arch { struct kvm_mips_tlb guest_tlb[KVM_MIPS_GUEST_TLB_SIZE]; /* Cached guest kernel/user ASIDs */ - uint32_t guest_user_asid[NR_CPUS]; - uint32_t guest_kernel_asid[NR_CPUS]; + u32 guest_user_asid[NR_CPUS]; + u32 guest_kernel_asid[NR_CPUS]; struct mm_struct guest_kernel_mm, guest_user_mm; int last_sched_cpu; @@ -587,9 +587,9 @@ struct kvm_mips_callbacks { void (*dequeue_io_int)(struct kvm_vcpu *vcpu, struct kvm_mips_interrupt *irq); int (*irq_deliver)(struct kvm_vcpu *vcpu, unsigned int priority, - uint32_t cause); + u32 cause); int (*irq_clear)(struct kvm_vcpu *vcpu, unsigned int priority, - uint32_t cause); + u32 cause); int (*get_one_reg)(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, s64 *v); int (*set_one_reg)(struct kvm_vcpu *vcpu, @@ -620,11 +620,11 @@ void kvm_drop_fpu(struct kvm_vcpu *vcpu); void kvm_lose_fpu(struct kvm_vcpu *vcpu); /* TLB handling */ -uint32_t kvm_get_kernel_asid(struct kvm_vcpu *vcpu); +u32 kvm_get_kernel_asid(struct kvm_vcpu *vcpu); -uint32_t kvm_get_user_asid(struct kvm_vcpu *vcpu); +u32 kvm_get_user_asid(struct kvm_vcpu *vcpu); -uint32_t kvm_get_commpage_asid (struct kvm_vcpu *vcpu); +u32 kvm_get_commpage_asid (struct kvm_vcpu *vcpu); extern int kvm_mips_handle_kseg0_tlb_fault(unsigned long badbaddr, struct kvm_vcpu *vcpu); @@ -638,12 +638,12 @@ extern int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, unsigned long *hpa1); extern enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); @@ -665,90 +665,90 @@ extern void kvm_mips_vcpu_load(struct kvm_vcpu *vcpu, int cpu); extern void kvm_mips_vcpu_put(struct kvm_vcpu *vcpu); /* Emulation */ -uint32_t kvm_get_inst(uint32_t *opc, struct kvm_vcpu *vcpu); -enum emulation_result update_pc(struct kvm_vcpu *vcpu, uint32_t cause); +u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu); +enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause); extern enum emulation_result kvm_mips_emulate_inst(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_syscall(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_handle_ri(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu, struct kvm_run *run); -uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu); -void kvm_mips_write_count(struct kvm_vcpu *vcpu, uint32_t count); -void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack); +u32 kvm_mips_read_count(struct kvm_vcpu *vcpu); +void kvm_mips_write_count(struct kvm_vcpu *vcpu, u32 count); +void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack); void kvm_mips_init_count(struct kvm_vcpu *vcpu); int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl); int kvm_mips_set_count_resume(struct kvm_vcpu *vcpu, s64 count_resume); @@ -758,26 +758,26 @@ void kvm_mips_count_disable_cause(struct kvm_vcpu *vcpu); enum hrtimer_restart kvm_mips_count_timeout(struct kvm_vcpu *vcpu); enum emulation_result kvm_mips_check_privilege(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -enum emulation_result kvm_mips_emulate_cache(uint32_t inst, - uint32_t *opc, - uint32_t cause, +enum emulation_result kvm_mips_emulate_cache(u32 inst, + u32 *opc, + u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu); -enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, - uint32_t *opc, - uint32_t cause, +enum emulation_result kvm_mips_emulate_CP0(u32 inst, + u32 *opc, + u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu); -enum emulation_result kvm_mips_emulate_store(uint32_t inst, - uint32_t cause, +enum emulation_result kvm_mips_emulate_store(u32 inst, + u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu); -enum emulation_result kvm_mips_emulate_load(uint32_t inst, - uint32_t cause, +enum emulation_result kvm_mips_emulate_load(u32 inst, + u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu); @@ -787,14 +787,11 @@ unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu); unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu); /* Dynamic binary translation */ -extern int kvm_mips_trans_cache_index(uint32_t inst, uint32_t *opc, +extern int kvm_mips_trans_cache_index(u32 inst, u32 *opc, struct kvm_vcpu *vcpu); -extern int kvm_mips_trans_cache_va(uint32_t inst, uint32_t *opc, - struct kvm_vcpu *vcpu); -extern int kvm_mips_trans_mfc0(uint32_t inst, uint32_t *opc, - struct kvm_vcpu *vcpu); -extern int kvm_mips_trans_mtc0(uint32_t inst, uint32_t *opc, - struct kvm_vcpu *vcpu); +extern int kvm_mips_trans_cache_va(u32 inst, u32 *opc, struct kvm_vcpu *vcpu); +extern int kvm_mips_trans_mfc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu); +extern int kvm_mips_trans_mtc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu); /* Misc */ extern void kvm_mips_dump_stats(struct kvm_vcpu *vcpu); diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c index f1527a465c1b..e79502a88534 100644 --- a/arch/mips/kvm/dyntrans.c +++ b/arch/mips/kvm/dyntrans.c @@ -28,7 +28,7 @@ #define CLEAR_TEMPLATE 0x00000020 #define SW_TEMPLATE 0xac000000 -int kvm_mips_trans_cache_index(uint32_t inst, uint32_t *opc, +int kvm_mips_trans_cache_index(u32 inst, u32 *opc, struct kvm_vcpu *vcpu) { int result = 0; @@ -49,7 +49,7 @@ int kvm_mips_trans_cache_index(uint32_t inst, uint32_t *opc, * Address based CACHE instructions are transformed into synci(s). A little * heavy for just D-cache invalidates, but avoids an expensive trap */ -int kvm_mips_trans_cache_va(uint32_t inst, uint32_t *opc, +int kvm_mips_trans_cache_va(u32 inst, u32 *opc, struct kvm_vcpu *vcpu) { int result = 0; @@ -70,7 +70,7 @@ int kvm_mips_trans_cache_va(uint32_t inst, uint32_t *opc, return result; } -int kvm_mips_trans_mfc0(uint32_t inst, uint32_t *opc, struct kvm_vcpu *vcpu) +int kvm_mips_trans_mfc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu) { int32_t rt, rd, sel; uint32_t mfc0_inst; @@ -110,7 +110,7 @@ int kvm_mips_trans_mfc0(uint32_t inst, uint32_t *opc, struct kvm_vcpu *vcpu) return 0; } -int kvm_mips_trans_mtc0(uint32_t inst, uint32_t *opc, struct kvm_vcpu *vcpu) +int kvm_mips_trans_mtc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu) { int32_t rt, rd, sel; uint32_t mtc0_inst = SW_TEMPLATE; diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 6c2adcfd7faf..c59c51908476 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -198,7 +198,7 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, return nextpc; } -enum emulation_result update_pc(struct kvm_vcpu *vcpu, uint32_t cause) +enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause) { unsigned long branch_pc; enum emulation_result er = EMULATE_DONE; @@ -243,7 +243,7 @@ static inline int kvm_mips_count_disabled(struct kvm_vcpu *vcpu) * * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running). */ -static uint32_t kvm_mips_ktime_to_count(struct kvm_vcpu *vcpu, ktime_t now) +static u32 kvm_mips_ktime_to_count(struct kvm_vcpu *vcpu, ktime_t now) { s64 now_ns, periods; u64 delta; @@ -300,7 +300,7 @@ static inline ktime_t kvm_mips_count_time(struct kvm_vcpu *vcpu) * * Returns: The current value of the guest CP0_Count register. */ -static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now) +static u32 kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now) { struct mips_coproc *cop0 = vcpu->arch.cop0; ktime_t expires, threshold; @@ -360,7 +360,7 @@ static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now) * * Returns: The current guest CP0_Count value. */ -uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu) +u32 kvm_mips_read_count(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -387,8 +387,7 @@ uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu) * * Returns: The ktime at the point of freeze. */ -static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu, - uint32_t *count) +static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu, u32 *count) { ktime_t now; @@ -419,7 +418,7 @@ static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu, * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running). */ static void kvm_mips_resume_hrtimer(struct kvm_vcpu *vcpu, - ktime_t now, uint32_t count) + ktime_t now, u32 count) { struct mips_coproc *cop0 = vcpu->arch.cop0; uint32_t compare; @@ -444,7 +443,7 @@ static void kvm_mips_resume_hrtimer(struct kvm_vcpu *vcpu, * * Sets the CP0_Count value and updates the timer accordingly. */ -void kvm_mips_write_count(struct kvm_vcpu *vcpu, uint32_t count) +void kvm_mips_write_count(struct kvm_vcpu *vcpu, u32 count) { struct mips_coproc *cop0 = vcpu->arch.cop0; ktime_t now; @@ -538,7 +537,7 @@ int kvm_mips_set_count_hz(struct kvm_vcpu *vcpu, s64 count_hz) * If @ack, atomically acknowledge any pending timer interrupt, otherwise ensure * any pending timer interrupt is preserved. */ -void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack) +void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack) { struct mips_coproc *cop0 = vcpu->arch.cop0; int dc; @@ -973,8 +972,8 @@ unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu) return mask; } -enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, - uint32_t cause, struct kvm_run *run, +enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause, + struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -1312,7 +1311,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc, return er; } -enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause, +enum emulation_result kvm_mips_emulate_store(u32 inst, u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -1424,7 +1423,7 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause, return er; } -enum emulation_result kvm_mips_emulate_load(uint32_t inst, uint32_t cause, +enum emulation_result kvm_mips_emulate_load(u32 inst, u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -1529,8 +1528,8 @@ enum emulation_result kvm_mips_emulate_load(uint32_t inst, uint32_t cause, return er; } -enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc, - uint32_t cause, +enum emulation_result kvm_mips_emulate_cache(u32 inst, u32 *opc, + u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -1687,7 +1686,7 @@ enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc, return er; } -enum emulation_result kvm_mips_emulate_inst(unsigned long cause, uint32_t *opc, +enum emulation_result kvm_mips_emulate_inst(unsigned long cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -1735,7 +1734,7 @@ enum emulation_result kvm_mips_emulate_inst(unsigned long cause, uint32_t *opc, } enum emulation_result kvm_mips_emulate_syscall(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -1770,7 +1769,7 @@ enum emulation_result kvm_mips_emulate_syscall(unsigned long cause, } enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -1816,7 +1815,7 @@ enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause, } enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -1862,7 +1861,7 @@ enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause, } enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -1906,7 +1905,7 @@ enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause, } enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -1950,7 +1949,7 @@ enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause, } /* TLBMOD: store into address matching TLB with Dirty bit off */ -enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause, uint32_t *opc, +enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -1979,7 +1978,7 @@ enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause, uint32_t *opc, } enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -2022,7 +2021,7 @@ enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause, } enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -2051,7 +2050,7 @@ enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause, } enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -2086,7 +2085,7 @@ enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause, } enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -2121,7 +2120,7 @@ enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause, } enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -2156,7 +2155,7 @@ enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause, } enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -2191,7 +2190,7 @@ enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause, } enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -2226,7 +2225,7 @@ enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause, } enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -2275,7 +2274,7 @@ enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause, #define SYNC 0x0000000f #define RDHWR 0x0000003b -enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc, +enum emulation_result kvm_mips_handle_ri(unsigned long cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -2406,7 +2405,7 @@ enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu, } static enum emulation_result kvm_mips_emulate_exc(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -2444,7 +2443,7 @@ static enum emulation_result kvm_mips_emulate_exc(unsigned long cause, } enum emulation_result kvm_mips_check_privilege(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -2540,7 +2539,7 @@ enum emulation_result kvm_mips_check_privilege(unsigned long cause, * case we inject the TLB from the Guest TLB into the shadow host TLB */ enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause, - uint32_t *opc, + u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { diff --git a/arch/mips/kvm/interrupt.c b/arch/mips/kvm/interrupt.c index 95f790663b0c..49ce83237fc3 100644 --- a/arch/mips/kvm/interrupt.c +++ b/arch/mips/kvm/interrupt.c @@ -22,12 +22,12 @@ #include "interrupt.h" -void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, uint32_t priority) +void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, unsigned int priority) { set_bit(priority, &vcpu->arch.pending_exceptions); } -void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, uint32_t priority) +void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, unsigned int priority) { clear_bit(priority, &vcpu->arch.pending_exceptions); } @@ -114,7 +114,7 @@ void kvm_mips_dequeue_io_int_cb(struct kvm_vcpu *vcpu, /* Deliver the interrupt of the corresponding priority, if possible. */ int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority, - uint32_t cause) + u32 cause) { int allowed = 0; uint32_t exccode; @@ -196,12 +196,12 @@ int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority, } int kvm_mips_irq_clear_cb(struct kvm_vcpu *vcpu, unsigned int priority, - uint32_t cause) + u32 cause) { return 1; } -void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, uint32_t cause) +void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, u32 cause) { unsigned long *pending = &vcpu->arch.pending_exceptions; unsigned long *pending_clr = &vcpu->arch.pending_exceptions_clr; diff --git a/arch/mips/kvm/interrupt.h b/arch/mips/kvm/interrupt.h index 2143884709e4..d661c100b219 100644 --- a/arch/mips/kvm/interrupt.h +++ b/arch/mips/kvm/interrupt.h @@ -37,8 +37,8 @@ extern char mips32_GuestException[], mips32_GuestExceptionEnd[]; #define KVM_MIPS_IRQ_DELIVER_ALL_AT_ONCE (0) #define KVM_MIPS_IRQ_CLEAR_ALL_AT_ONCE (0) -void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, uint32_t priority); -void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, uint32_t priority); +void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, unsigned int priority); +void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, unsigned int priority); int kvm_mips_pending_timer(struct kvm_vcpu *vcpu); void kvm_mips_queue_timer_int_cb(struct kvm_vcpu *vcpu); @@ -48,7 +48,7 @@ void kvm_mips_queue_io_int_cb(struct kvm_vcpu *vcpu, void kvm_mips_dequeue_io_int_cb(struct kvm_vcpu *vcpu, struct kvm_mips_interrupt *irq); int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority, - uint32_t cause); + u32 cause); int kvm_mips_irq_clear_cb(struct kvm_vcpu *vcpu, unsigned int priority, - uint32_t cause); -void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, uint32_t cause); + u32 cause); +void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, u32 cause); diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index ed021ae7867a..7ea346e150a8 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -47,7 +47,7 @@ EXPORT_SYMBOL_GPL(kvm_mips_release_pfn_clean); bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn); EXPORT_SYMBOL_GPL(kvm_mips_is_error_pfn); -uint32_t kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) +u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) { int cpu = smp_processor_id(); @@ -55,7 +55,7 @@ uint32_t kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) cpu_asid_mask(&cpu_data[cpu]); } -uint32_t kvm_mips_get_user_asid(struct kvm_vcpu *vcpu) +u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu) { int cpu = smp_processor_id(); @@ -63,7 +63,7 @@ uint32_t kvm_mips_get_user_asid(struct kvm_vcpu *vcpu) cpu_asid_mask(&cpu_data[cpu]); } -inline uint32_t kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu) +inline u32 kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu) { return vcpu->kvm->arch.commpage_tlb; } @@ -751,7 +751,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_arch_vcpu_put); -uint32_t kvm_get_inst(uint32_t *opc, struct kvm_vcpu *vcpu) +u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; unsigned long paddr, flags, vpn2, asid; From 8cffd197485122632103a12fdada911242e7c01e Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 9 Jun 2016 14:19:08 +0100 Subject: [PATCH 061/302] MIPS: KVM: Convert code to kernel sized types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert the MIPS KVM C code to use standard kernel sized types (e.g. u32) instead of inttypes.h style ones (e.g. uint32_t) or other types as appropriate. Signed-off-by: James Hogan Cc: Ralf Baechle Cc: Paolo Bonzini Cc: Radim Krčmář Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/dyntrans.c | 24 ++++----- arch/mips/kvm/emulate.c | 104 +++++++++++++++++++------------------- arch/mips/kvm/interrupt.c | 2 +- arch/mips/kvm/mips.c | 8 +-- arch/mips/kvm/tlb.c | 28 +++++----- arch/mips/kvm/trap_emul.c | 30 +++++------ 6 files changed, 98 insertions(+), 98 deletions(-) diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c index e79502a88534..d4a86fb239cd 100644 --- a/arch/mips/kvm/dyntrans.c +++ b/arch/mips/kvm/dyntrans.c @@ -33,13 +33,13 @@ int kvm_mips_trans_cache_index(u32 inst, u32 *opc, { int result = 0; unsigned long kseg0_opc; - uint32_t synci_inst = 0x0; + u32 synci_inst = 0x0; /* Replace the CACHE instruction, with a NOP */ kseg0_opc = CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa (vcpu, (unsigned long) opc)); - memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(uint32_t)); + memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(u32)); local_flush_icache_range(kseg0_opc, kseg0_opc + 32); return result; @@ -54,7 +54,7 @@ int kvm_mips_trans_cache_va(u32 inst, u32 *opc, { int result = 0; unsigned long kseg0_opc; - uint32_t synci_inst = SYNCI_TEMPLATE, base, offset; + u32 synci_inst = SYNCI_TEMPLATE, base, offset; base = (inst >> 21) & 0x1f; offset = inst & 0xffff; @@ -64,7 +64,7 @@ int kvm_mips_trans_cache_va(u32 inst, u32 *opc, kseg0_opc = CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa (vcpu, (unsigned long) opc)); - memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(uint32_t)); + memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(u32)); local_flush_icache_range(kseg0_opc, kseg0_opc + 32); return result; @@ -72,8 +72,8 @@ int kvm_mips_trans_cache_va(u32 inst, u32 *opc, int kvm_mips_trans_mfc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu) { - int32_t rt, rd, sel; - uint32_t mfc0_inst; + u32 rt, rd, sel; + u32 mfc0_inst; unsigned long kseg0_opc, flags; rt = (inst >> 16) & 0x1f; @@ -94,11 +94,11 @@ int kvm_mips_trans_mfc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu) kseg0_opc = CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa (vcpu, (unsigned long) opc)); - memcpy((void *)kseg0_opc, (void *)&mfc0_inst, sizeof(uint32_t)); + memcpy((void *)kseg0_opc, (void *)&mfc0_inst, sizeof(u32)); local_flush_icache_range(kseg0_opc, kseg0_opc + 32); } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) { local_irq_save(flags); - memcpy((void *)opc, (void *)&mfc0_inst, sizeof(uint32_t)); + memcpy((void *)opc, (void *)&mfc0_inst, sizeof(u32)); local_flush_icache_range((unsigned long)opc, (unsigned long)opc + 32); local_irq_restore(flags); @@ -112,8 +112,8 @@ int kvm_mips_trans_mfc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu) int kvm_mips_trans_mtc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu) { - int32_t rt, rd, sel; - uint32_t mtc0_inst = SW_TEMPLATE; + u32 rt, rd, sel; + u32 mtc0_inst = SW_TEMPLATE; unsigned long kseg0_opc, flags; rt = (inst >> 16) & 0x1f; @@ -127,11 +127,11 @@ int kvm_mips_trans_mtc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu) kseg0_opc = CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa (vcpu, (unsigned long) opc)); - memcpy((void *)kseg0_opc, (void *)&mtc0_inst, sizeof(uint32_t)); + memcpy((void *)kseg0_opc, (void *)&mtc0_inst, sizeof(u32)); local_flush_icache_range(kseg0_opc, kseg0_opc + 32); } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) { local_irq_save(flags); - memcpy((void *)opc, (void *)&mtc0_inst, sizeof(uint32_t)); + memcpy((void *)opc, (void *)&mtc0_inst, sizeof(u32)); local_flush_icache_range((unsigned long)opc, (unsigned long)opc + 32); local_irq_restore(flags); diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index c59c51908476..8f4f3242a655 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -52,7 +52,7 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, goto unaligned; /* Read the instruction */ - insn.word = kvm_get_inst((uint32_t *) epc, vcpu); + insn.word = kvm_get_inst((u32 *) epc, vcpu); if (insn.word == KVM_INVALID_INST) return KVM_INVALID_INST; @@ -304,7 +304,7 @@ static u32 kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now) { struct mips_coproc *cop0 = vcpu->arch.cop0; ktime_t expires, threshold; - uint32_t count, compare; + u32 count, compare; int running; /* Calculate the biased and scaled guest CP0_Count */ @@ -315,7 +315,7 @@ static u32 kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now) * Find whether CP0_Count has reached the closest timer interrupt. If * not, we shouldn't inject it. */ - if ((int32_t)(count - compare) < 0) + if ((s32)(count - compare) < 0) return count; /* @@ -421,13 +421,13 @@ static void kvm_mips_resume_hrtimer(struct kvm_vcpu *vcpu, ktime_t now, u32 count) { struct mips_coproc *cop0 = vcpu->arch.cop0; - uint32_t compare; + u32 compare; u64 delta; ktime_t expire; /* Calculate timeout (wrap 0 to 2^32) */ compare = kvm_read_c0_guest_compare(cop0); - delta = (u64)(uint32_t)(compare - count - 1) + 1; + delta = (u64)(u32)(compare - count - 1) + 1; delta = div_u64(delta * NSEC_PER_SEC, vcpu->arch.count_hz); expire = ktime_add_ns(now, delta); @@ -543,7 +543,7 @@ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack) int dc; u32 old_compare = kvm_read_c0_guest_compare(cop0); ktime_t now; - uint32_t count; + u32 count; /* if unchanged, must just be an ack */ if (old_compare == compare) { @@ -584,7 +584,7 @@ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack) static ktime_t kvm_mips_count_disable(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; - uint32_t count; + u32 count; ktime_t now; /* Stop hrtimer */ @@ -631,7 +631,7 @@ void kvm_mips_count_disable_cause(struct kvm_vcpu *vcpu) void kvm_mips_count_enable_cause(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; - uint32_t count; + u32 count; kvm_clear_c0_guest_cause(cop0, CAUSEF_DC); @@ -660,7 +660,7 @@ int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl) s64 changed = count_ctl ^ vcpu->arch.count_ctl; s64 delta; ktime_t expire, now; - uint32_t count, compare; + u32 count, compare; /* Only allow defined bits to be changed */ if (changed & ~(s64)(KVM_REG_MIPS_COUNT_CTL_DC)) @@ -686,7 +686,7 @@ int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl) */ count = kvm_read_c0_guest_count(cop0); compare = kvm_read_c0_guest_compare(cop0); - delta = (u64)(uint32_t)(compare - count - 1) + 1; + delta = (u64)(u32)(compare - count - 1) + 1; delta = div_u64(delta * NSEC_PER_SEC, vcpu->arch.count_hz); expire = ktime_add_ns(vcpu->arch.count_resume, delta); @@ -800,9 +800,9 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu) enum emulation_result kvm_mips_emul_tlbr(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; - uint32_t pc = vcpu->arch.pc; + unsigned long pc = vcpu->arch.pc; - kvm_err("[%#x] COP0_TLBR [%ld]\n", pc, kvm_read_c0_guest_index(cop0)); + kvm_err("[%#lx] COP0_TLBR [%ld]\n", pc, kvm_read_c0_guest_index(cop0)); return EMULATE_FAIL; } @@ -812,11 +812,11 @@ enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu) struct mips_coproc *cop0 = vcpu->arch.cop0; int index = kvm_read_c0_guest_index(cop0); struct kvm_mips_tlb *tlb = NULL; - uint32_t pc = vcpu->arch.pc; + unsigned long pc = vcpu->arch.pc; if (index < 0 || index >= KVM_MIPS_GUEST_TLB_SIZE) { kvm_debug("%s: illegal index: %d\n", __func__, index); - kvm_debug("[%#x] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n", + kvm_debug("[%#lx] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n", pc, index, kvm_read_c0_guest_entryhi(cop0), kvm_read_c0_guest_entrylo0(cop0), kvm_read_c0_guest_entrylo1(cop0), @@ -836,7 +836,7 @@ enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu) tlb->tlb_lo0 = kvm_read_c0_guest_entrylo0(cop0); tlb->tlb_lo1 = kvm_read_c0_guest_entrylo1(cop0); - kvm_debug("[%#x] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n", + kvm_debug("[%#lx] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n", pc, index, kvm_read_c0_guest_entryhi(cop0), kvm_read_c0_guest_entrylo0(cop0), kvm_read_c0_guest_entrylo1(cop0), @@ -850,7 +850,7 @@ enum emulation_result kvm_mips_emul_tlbwr(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; struct kvm_mips_tlb *tlb = NULL; - uint32_t pc = vcpu->arch.pc; + unsigned long pc = vcpu->arch.pc; int index; get_random_bytes(&index, sizeof(index)); @@ -869,7 +869,7 @@ enum emulation_result kvm_mips_emul_tlbwr(struct kvm_vcpu *vcpu) tlb->tlb_lo0 = kvm_read_c0_guest_entrylo0(cop0); tlb->tlb_lo1 = kvm_read_c0_guest_entrylo1(cop0); - kvm_debug("[%#x] COP0_TLBWR[%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx)\n", + kvm_debug("[%#lx] COP0_TLBWR[%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx)\n", pc, index, kvm_read_c0_guest_entryhi(cop0), kvm_read_c0_guest_entrylo0(cop0), kvm_read_c0_guest_entrylo1(cop0)); @@ -881,14 +881,14 @@ enum emulation_result kvm_mips_emul_tlbp(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; long entryhi = kvm_read_c0_guest_entryhi(cop0); - uint32_t pc = vcpu->arch.pc; + unsigned long pc = vcpu->arch.pc; int index = -1; index = kvm_mips_guest_tlb_lookup(vcpu, entryhi); kvm_write_c0_guest_index(cop0, index); - kvm_debug("[%#x] COP0_TLBP (entryhi: %#lx), index: %d\n", pc, entryhi, + kvm_debug("[%#lx] COP0_TLBP (entryhi: %#lx), index: %d\n", pc, entryhi, index); return EMULATE_DONE; @@ -978,8 +978,8 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause, { struct mips_coproc *cop0 = vcpu->arch.cop0; enum emulation_result er = EMULATE_DONE; - int32_t rt, rd, copz, sel, co_bit, op; - uint32_t pc = vcpu->arch.pc; + u32 rt, rd, copz, sel, co_bit, op; + unsigned long pc = vcpu->arch.pc; unsigned long curr_pc; /* @@ -1047,7 +1047,7 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause, } kvm_debug - ("[%#x] MFCz[%d][%d], vcpu->arch.gprs[%d]: %#lx\n", + ("[%#lx] MFCz[%d][%d], vcpu->arch.gprs[%d]: %#lx\n", pc, rd, sel, rt, vcpu->arch.gprs[rt]); break; @@ -1077,7 +1077,7 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause, kvm_err("MTCz, cop0->reg[EBASE]: %#lx\n", kvm_read_c0_guest_ebase(cop0)); } else if (rd == MIPS_CP0_TLB_HI && sel == 0) { - uint32_t nasid = + u32 nasid = vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID; if ((KSEGX(vcpu->arch.gprs[rt]) != CKSEG0) && ((kvm_read_c0_guest_entryhi(cop0) & @@ -1099,7 +1099,7 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause, kvm_mips_write_count(vcpu, vcpu->arch.gprs[rt]); goto done; } else if ((rd == MIPS_CP0_COMPARE) && (sel == 0)) { - kvm_debug("[%#x] MTCz, COMPARE %#lx <- %#lx\n", + kvm_debug("[%#lx] MTCz, COMPARE %#lx <- %#lx\n", pc, kvm_read_c0_guest_compare(cop0), vcpu->arch.gprs[rt]); @@ -1218,7 +1218,7 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause, kvm_write_c0_guest_config5(cop0, val); } else if ((rd == MIPS_CP0_CAUSE) && (sel == 0)) { - uint32_t old_cause, new_cause; + u32 old_cause, new_cause; old_cause = kvm_read_c0_guest_cause(cop0); new_cause = vcpu->arch.gprs[rt]; @@ -1239,7 +1239,7 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause, #endif } - kvm_debug("[%#x] MTCz, cop0->reg[%d][%d]: %#lx\n", pc, + kvm_debug("[%#lx] MTCz, cop0->reg[%d][%d]: %#lx\n", pc, rd, sel, cop0->reg[rd][sel]); break; @@ -1271,9 +1271,8 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause, case wrpgpr_op: { - uint32_t css = - cop0->reg[MIPS_CP0_STATUS][2] & 0xf; - uint32_t pss = + u32 css = cop0->reg[MIPS_CP0_STATUS][2] & 0xf; + u32 pss = (cop0->reg[MIPS_CP0_STATUS][2] >> 6) & 0xf; /* * We don't support any shadow register sets, so @@ -1316,8 +1315,9 @@ enum emulation_result kvm_mips_emulate_store(u32 inst, u32 cause, struct kvm_vcpu *vcpu) { enum emulation_result er = EMULATE_DO_MMIO; - int32_t op, base, rt, offset; - uint32_t bytes; + u32 op, base, rt; + s16 offset; + u32 bytes; void *data = run->mmio.data; unsigned long curr_pc; @@ -1332,7 +1332,7 @@ enum emulation_result kvm_mips_emulate_store(u32 inst, u32 cause, rt = (inst >> 16) & 0x1f; base = (inst >> 21) & 0x1f; - offset = inst & 0xffff; + offset = (s16)inst; op = (inst >> 26) & 0x3f; switch (op) { @@ -1356,7 +1356,7 @@ enum emulation_result kvm_mips_emulate_store(u32 inst, u32 cause, *(u8 *) data = vcpu->arch.gprs[rt]; kvm_debug("OP_SB: eaddr: %#lx, gpr: %#lx, data: %#x\n", vcpu->arch.host_cp0_badvaddr, vcpu->arch.gprs[rt], - *(uint8_t *) data); + *(u8 *) data); break; @@ -1378,11 +1378,11 @@ enum emulation_result kvm_mips_emulate_store(u32 inst, u32 cause, run->mmio.is_write = 1; vcpu->mmio_needed = 1; vcpu->mmio_is_write = 1; - *(uint32_t *) data = vcpu->arch.gprs[rt]; + *(u32 *) data = vcpu->arch.gprs[rt]; kvm_debug("[%#lx] OP_SW: eaddr: %#lx, gpr: %#lx, data: %#x\n", vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr, - vcpu->arch.gprs[rt], *(uint32_t *) data); + vcpu->arch.gprs[rt], *(u32 *) data); break; case sh_op: @@ -1403,11 +1403,11 @@ enum emulation_result kvm_mips_emulate_store(u32 inst, u32 cause, run->mmio.is_write = 1; vcpu->mmio_needed = 1; vcpu->mmio_is_write = 1; - *(uint16_t *) data = vcpu->arch.gprs[rt]; + *(u16 *) data = vcpu->arch.gprs[rt]; kvm_debug("[%#lx] OP_SH: eaddr: %#lx, gpr: %#lx, data: %#x\n", vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr, - vcpu->arch.gprs[rt], *(uint32_t *) data); + vcpu->arch.gprs[rt], *(u32 *) data); break; default: @@ -1428,12 +1428,13 @@ enum emulation_result kvm_mips_emulate_load(u32 inst, u32 cause, struct kvm_vcpu *vcpu) { enum emulation_result er = EMULATE_DO_MMIO; - int32_t op, base, rt, offset; - uint32_t bytes; + u32 op, base, rt; + s16 offset; + u32 bytes; rt = (inst >> 16) & 0x1f; base = (inst >> 21) & 0x1f; - offset = inst & 0xffff; + offset = (s16)inst; op = (inst >> 26) & 0x3f; vcpu->arch.pending_load_cause = cause; @@ -1535,7 +1536,8 @@ enum emulation_result kvm_mips_emulate_cache(u32 inst, u32 *opc, { struct mips_coproc *cop0 = vcpu->arch.cop0; enum emulation_result er = EMULATE_DONE; - int32_t offset, cache, op_inst, op, base; + u32 cache, op_inst, op, base; + s16 offset; struct kvm_vcpu_arch *arch = &vcpu->arch; unsigned long va; unsigned long curr_pc; @@ -1551,7 +1553,7 @@ enum emulation_result kvm_mips_emulate_cache(u32 inst, u32 *opc, base = (inst >> 21) & 0x1f; op_inst = (inst >> 16) & 0x1f; - offset = (int16_t)inst; + offset = (s16)inst; cache = op_inst & CacheOp_Cache; op = op_inst & CacheOp_Op; @@ -1691,7 +1693,7 @@ enum emulation_result kvm_mips_emulate_inst(unsigned long cause, u32 *opc, struct kvm_vcpu *vcpu) { enum emulation_result er = EMULATE_DONE; - uint32_t inst; + u32 inst; /* Fetch the instruction. */ if (cause & CAUSEF_BD) @@ -2282,7 +2284,7 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, u32 *opc, struct kvm_vcpu_arch *arch = &vcpu->arch; enum emulation_result er = EMULATE_DONE; unsigned long curr_pc; - uint32_t inst; + u32 inst; /* * Update PC and hold onto current PC in case there is @@ -2377,19 +2379,19 @@ enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu, switch (run->mmio.len) { case 4: - *gpr = *(int32_t *) run->mmio.data; + *gpr = *(s32 *) run->mmio.data; break; case 2: if (vcpu->mmio_needed == 2) - *gpr = *(int16_t *) run->mmio.data; + *gpr = *(s16 *) run->mmio.data; else - *gpr = *(uint16_t *)run->mmio.data; + *gpr = *(u16 *)run->mmio.data; break; case 1: if (vcpu->mmio_needed == 2) - *gpr = *(int8_t *) run->mmio.data; + *gpr = *(s8 *) run->mmio.data; else *gpr = *(u8 *) run->mmio.data; break; @@ -2409,7 +2411,7 @@ static enum emulation_result kvm_mips_emulate_exc(unsigned long cause, struct kvm_run *run, struct kvm_vcpu *vcpu) { - uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f; + u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f; struct mips_coproc *cop0 = vcpu->arch.cop0; struct kvm_vcpu_arch *arch = &vcpu->arch; enum emulation_result er = EMULATE_DONE; @@ -2448,7 +2450,7 @@ enum emulation_result kvm_mips_check_privilege(unsigned long cause, struct kvm_vcpu *vcpu) { enum emulation_result er = EMULATE_DONE; - uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f; + u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; int usermode = !KVM_GUEST_KERNEL_MODE(vcpu); @@ -2544,7 +2546,7 @@ enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause, struct kvm_vcpu *vcpu) { enum emulation_result er = EMULATE_DONE; - uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f; + u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f; unsigned long va = vcpu->arch.host_cp0_badvaddr; int index; diff --git a/arch/mips/kvm/interrupt.c b/arch/mips/kvm/interrupt.c index 49ce83237fc3..ad28dac6b7e9 100644 --- a/arch/mips/kvm/interrupt.c +++ b/arch/mips/kvm/interrupt.c @@ -117,7 +117,7 @@ int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority, u32 cause) { int allowed = 0; - uint32_t exccode; + u32 exccode; struct kvm_vcpu_arch *arch = &vcpu->arch; struct mips_coproc *cop0 = vcpu->arch.cop0; diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 44da5259f390..a2b1b9205b94 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1222,7 +1222,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) static void kvm_mips_set_c0_status(void) { - uint32_t status = read_c0_status(); + u32 status = read_c0_status(); if (cpu_has_dsp) status |= (ST0_MX); @@ -1236,9 +1236,9 @@ static void kvm_mips_set_c0_status(void) */ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) { - uint32_t cause = vcpu->arch.host_cp0_cause; - uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f; - uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc; + u32 cause = vcpu->arch.host_cp0_cause; + u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f; + u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 7ea346e150a8..c4e11e138042 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -32,8 +32,6 @@ #define KVM_GUEST_PC_TLB 0 #define KVM_GUEST_SP_TLB 1 -#define PRIx64 "llx" - atomic_t kvm_mips_instance; EXPORT_SYMBOL_GPL(kvm_mips_instance); @@ -102,13 +100,13 @@ void kvm_mips_dump_host_tlbs(void) kvm_info("TLB%c%3d Hi 0x%08lx ", (tlb.tlb_lo0 | tlb.tlb_lo1) & MIPS3_PG_V ? ' ' : '*', i, tlb.tlb_hi); - kvm_info("Lo0=0x%09" PRIx64 " %c%c attr %lx ", - (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo0), + kvm_info("Lo0=0x%09llx %c%c attr %lx ", + (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo0), (tlb.tlb_lo0 & MIPS3_PG_D) ? 'D' : ' ', (tlb.tlb_lo0 & MIPS3_PG_G) ? 'G' : ' ', (tlb.tlb_lo0 >> 3) & 7); - kvm_info("Lo1=0x%09" PRIx64 " %c%c attr %lx sz=%lx\n", - (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo1), + kvm_info("Lo1=0x%09llx %c%c attr %lx sz=%lx\n", + (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo1), (tlb.tlb_lo1 & MIPS3_PG_D) ? 'D' : ' ', (tlb.tlb_lo1 & MIPS3_PG_G) ? 'G' : ' ', (tlb.tlb_lo1 >> 3) & 7, tlb.tlb_mask); @@ -134,13 +132,13 @@ void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu) kvm_info("TLB%c%3d Hi 0x%08lx ", (tlb.tlb_lo0 | tlb.tlb_lo1) & MIPS3_PG_V ? ' ' : '*', i, tlb.tlb_hi); - kvm_info("Lo0=0x%09" PRIx64 " %c%c attr %lx ", - (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo0), + kvm_info("Lo0=0x%09llx %c%c attr %lx ", + (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo0), (tlb.tlb_lo0 & MIPS3_PG_D) ? 'D' : ' ', (tlb.tlb_lo0 & MIPS3_PG_G) ? 'G' : ' ', (tlb.tlb_lo0 >> 3) & 7); - kvm_info("Lo1=0x%09" PRIx64 " %c%c attr %lx sz=%lx\n", - (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo1), + kvm_info("Lo1=0x%09llx %c%c attr %lx sz=%lx\n", + (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo1), (tlb.tlb_lo1 & MIPS3_PG_D) ? 'D' : ' ', (tlb.tlb_lo1 & MIPS3_PG_G) ? 'G' : ' ', (tlb.tlb_lo1 >> 3) & 7, tlb.tlb_mask); @@ -160,7 +158,7 @@ static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn) pfn = kvm_mips_gfn_to_pfn(kvm, gfn); if (kvm_mips_is_error_pfn(pfn)) { - kvm_err("Couldn't get pfn for gfn %#" PRIx64 "!\n", gfn); + kvm_err("Couldn't get pfn for gfn %#llx!\n", gfn); err = -EFAULT; goto out; } @@ -176,7 +174,7 @@ unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu, unsigned long gva) { gfn_t gfn; - uint32_t offset = gva & ~PAGE_MASK; + unsigned long offset = gva & ~PAGE_MASK; struct kvm *kvm = vcpu->kvm; if (KVM_GUEST_KSEGX(gva) != KVM_GUEST_KSEG0) { @@ -726,7 +724,7 @@ EXPORT_SYMBOL_GPL(kvm_arch_vcpu_load); void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { unsigned long flags; - uint32_t cpu; + int cpu; local_irq_save(flags); @@ -755,7 +753,7 @@ u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; unsigned long paddr, flags, vpn2, asid; - uint32_t inst; + u32 inst; int index; if (KVM_GUEST_KSEGX((unsigned long) opc) < KVM_GUEST_KSEG0 || @@ -787,7 +785,7 @@ u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu) paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu, (unsigned long) opc); - inst = *(uint32_t *) CKSEG0ADDR(paddr); + inst = *(u32 *) CKSEG0ADDR(paddr); } else { kvm_err("%s: illegal address: %p\n", __func__, opc); return KVM_INVALID_INST; diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 6ba0fafcecbc..4aa5d77b0d6a 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -21,7 +21,7 @@ static gpa_t kvm_trap_emul_gva_to_gpa_cb(gva_t gva) { gpa_t gpa; - uint32_t kseg = KSEGX(gva); + gva_t kseg = KSEGX(gva); if ((kseg == CKSEG0) || (kseg == CKSEG1)) gpa = CPHYSADDR(gva); @@ -40,7 +40,7 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc; + u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -87,7 +87,7 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc; + u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; unsigned long cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; @@ -131,7 +131,7 @@ static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc; + u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; unsigned long cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; @@ -178,7 +178,7 @@ static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc; + u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; unsigned long cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; @@ -232,7 +232,7 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc; + u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; unsigned long cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; @@ -262,7 +262,7 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc; + u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; unsigned long cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; @@ -292,7 +292,7 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_syscall(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc; + u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -310,7 +310,7 @@ static int kvm_trap_emul_handle_syscall(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_res_inst(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc; + u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -328,7 +328,7 @@ static int kvm_trap_emul_handle_res_inst(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_break(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc; + u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -346,7 +346,7 @@ static int kvm_trap_emul_handle_break(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_trap(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *)vcpu->arch.pc; + u32 __user *opc = (u32 __user *)vcpu->arch.pc; unsigned long cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -364,7 +364,7 @@ static int kvm_trap_emul_handle_trap(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_msa_fpe(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *)vcpu->arch.pc; + u32 __user *opc = (u32 __user *)vcpu->arch.pc; unsigned long cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -382,7 +382,7 @@ static int kvm_trap_emul_handle_msa_fpe(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_fpe(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *)vcpu->arch.pc; + u32 __user *opc = (u32 __user *)vcpu->arch.pc; unsigned long cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -407,7 +407,7 @@ static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; struct kvm_run *run = vcpu->run; - uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc; + u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -457,7 +457,7 @@ static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu) static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; - uint32_t config1; + u32 config1; int vcpu_id = vcpu->vcpu_id; /* From 31cf7498545c36cc992887bd6af17a496f26f681 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 9 Jun 2016 14:19:09 +0100 Subject: [PATCH 062/302] MIPS: KVM: Make various Cause variables 32-bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CP0 Cause register is passed around in KVM quite a bit, often as an unsigned long, even though it is always 32-bits long. Resize it to u32 throughout MIPS KVM. Signed-off-by: James Hogan Cc: Ralf Baechle Cc: Paolo Bonzini Cc: Radim Krčmář Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 40 +++++++++++++++--------------- arch/mips/kvm/emulate.c | 38 ++++++++++++++--------------- arch/mips/kvm/locore.S | 2 +- arch/mips/kvm/trap_emul.c | 42 ++++++++++++++++---------------- 4 files changed, 61 insertions(+), 61 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 9250b59acd18..dceb49422e3b 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -344,8 +344,8 @@ struct kvm_vcpu_arch { /* Host CP0 registers used when handling exits from guest */ unsigned long host_cp0_badvaddr; - unsigned long host_cp0_cause; unsigned long host_cp0_epc; + u32 host_cp0_cause; /* GPRS */ unsigned long gprs[32]; @@ -386,7 +386,7 @@ struct kvm_vcpu_arch { /* Bitmask of pending exceptions to be cleared */ unsigned long pending_exceptions_clr; - unsigned long pending_load_cause; + u32 pending_load_cause; /* Save/Restore the entryhi register when are are preempted/scheduled back in */ unsigned long preempt_entryhi; @@ -637,12 +637,12 @@ extern int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, unsigned long *hpa0, unsigned long *hpa1); -extern enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause, +extern enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause, +extern enum emulation_result kvm_mips_handle_tlbmod(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); @@ -668,77 +668,77 @@ extern void kvm_mips_vcpu_put(struct kvm_vcpu *vcpu); u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu); enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause); -extern enum emulation_result kvm_mips_emulate_inst(unsigned long cause, +extern enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_syscall(unsigned long cause, +extern enum emulation_result kvm_mips_emulate_syscall(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause, +extern enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause, +extern enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause, +extern enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause, +extern enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause, +extern enum emulation_result kvm_mips_emulate_tlbmod(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause, +extern enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_handle_ri(unsigned long cause, +extern enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause, +extern enum emulation_result kvm_mips_emulate_ri_exc(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause, +extern enum emulation_result kvm_mips_emulate_bp_exc(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause, +extern enum emulation_result kvm_mips_emulate_trap_exc(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause, +extern enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause, +extern enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause, +extern enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); @@ -757,7 +757,7 @@ void kvm_mips_count_enable_cause(struct kvm_vcpu *vcpu); void kvm_mips_count_disable_cause(struct kvm_vcpu *vcpu); enum hrtimer_restart kvm_mips_count_timeout(struct kvm_vcpu *vcpu); -enum emulation_result kvm_mips_check_privilege(unsigned long cause, +enum emulation_result kvm_mips_check_privilege(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 8f4f3242a655..3baab5ec3d3b 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1688,7 +1688,7 @@ enum emulation_result kvm_mips_emulate_cache(u32 inst, u32 *opc, return er; } -enum emulation_result kvm_mips_emulate_inst(unsigned long cause, u32 *opc, +enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -1735,7 +1735,7 @@ enum emulation_result kvm_mips_emulate_inst(unsigned long cause, u32 *opc, return er; } -enum emulation_result kvm_mips_emulate_syscall(unsigned long cause, +enum emulation_result kvm_mips_emulate_syscall(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) @@ -1770,7 +1770,7 @@ enum emulation_result kvm_mips_emulate_syscall(unsigned long cause, return er; } -enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause, +enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) @@ -1816,7 +1816,7 @@ enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause, return EMULATE_DONE; } -enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause, +enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) @@ -1862,7 +1862,7 @@ enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause, return EMULATE_DONE; } -enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause, +enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) @@ -1906,7 +1906,7 @@ enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause, return EMULATE_DONE; } -enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause, +enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) @@ -1951,7 +1951,7 @@ enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause, } /* TLBMOD: store into address matching TLB with Dirty bit off */ -enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause, u32 *opc, +enum emulation_result kvm_mips_handle_tlbmod(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -1979,7 +1979,7 @@ enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause, u32 *opc, return er; } -enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause, +enum emulation_result kvm_mips_emulate_tlbmod(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) @@ -2022,7 +2022,7 @@ enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause, return EMULATE_DONE; } -enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause, +enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) @@ -2051,7 +2051,7 @@ enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause, return EMULATE_DONE; } -enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause, +enum emulation_result kvm_mips_emulate_ri_exc(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) @@ -2086,7 +2086,7 @@ enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause, return er; } -enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause, +enum emulation_result kvm_mips_emulate_bp_exc(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) @@ -2121,7 +2121,7 @@ enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause, return er; } -enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause, +enum emulation_result kvm_mips_emulate_trap_exc(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) @@ -2156,7 +2156,7 @@ enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause, return er; } -enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause, +enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) @@ -2191,7 +2191,7 @@ enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause, return er; } -enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause, +enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) @@ -2226,7 +2226,7 @@ enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause, return er; } -enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause, +enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) @@ -2276,7 +2276,7 @@ enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause, #define SYNC 0x0000000f #define RDHWR 0x0000003b -enum emulation_result kvm_mips_handle_ri(unsigned long cause, u32 *opc, +enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -2406,7 +2406,7 @@ enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu, return er; } -static enum emulation_result kvm_mips_emulate_exc(unsigned long cause, +static enum emulation_result kvm_mips_emulate_exc(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) @@ -2444,7 +2444,7 @@ static enum emulation_result kvm_mips_emulate_exc(unsigned long cause, return er; } -enum emulation_result kvm_mips_check_privilege(unsigned long cause, +enum emulation_result kvm_mips_check_privilege(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) @@ -2540,7 +2540,7 @@ enum emulation_result kvm_mips_check_privilege(unsigned long cause, * (2) TLB entry is present in the Guest TLB but not in the shadow, in this * case we inject the TLB from the Guest TLB into the shadow host TLB */ -enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause, +enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) diff --git a/arch/mips/kvm/locore.S b/arch/mips/kvm/locore.S index 5ad2d507b125..43c8ef847efa 100644 --- a/arch/mips/kvm/locore.S +++ b/arch/mips/kvm/locore.S @@ -306,7 +306,7 @@ NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra) LONG_S k0, VCPU_HOST_CP0_BADVADDR(k1) mfc0 k0, CP0_CAUSE - LONG_S k0, VCPU_HOST_CP0_CAUSE(k1) + sw k0, VCPU_HOST_CP0_CAUSE(k1) /* Now restore the host state just enough to run the handlers */ diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 4aa5d77b0d6a..ecf0068bc95e 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -41,7 +41,7 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu) struct mips_coproc *cop0 = vcpu->arch.cop0; struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; - unsigned long cause = vcpu->arch.host_cp0_cause; + u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -89,13 +89,13 @@ static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu) struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; - unsigned long cause = vcpu->arch.host_cp0_cause; + u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0 || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) { - kvm_debug("USER/KSEG23 ADDR TLB MOD fault: cause %#lx, PC: %p, BadVaddr: %#lx\n", + kvm_debug("USER/KSEG23 ADDR TLB MOD fault: cause %#x, PC: %p, BadVaddr: %#lx\n", cause, opc, badvaddr); er = kvm_mips_handle_tlbmod(cause, opc, run, vcpu); @@ -111,14 +111,14 @@ static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu) * when we are not using HIGHMEM. Need to address this in a * HIGHMEM kernel */ - kvm_err("TLB MOD fault not handled, cause %#lx, PC: %p, BadVaddr: %#lx\n", + kvm_err("TLB MOD fault not handled, cause %#x, PC: %p, BadVaddr: %#lx\n", cause, opc, badvaddr); kvm_mips_dump_host_tlbs(); kvm_arch_vcpu_dump_regs(vcpu); run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; } else { - kvm_err("Illegal TLB Mod fault address , cause %#lx, PC: %p, BadVaddr: %#lx\n", + kvm_err("Illegal TLB Mod fault address , cause %#x, PC: %p, BadVaddr: %#lx\n", cause, opc, badvaddr); kvm_mips_dump_host_tlbs(); kvm_arch_vcpu_dump_regs(vcpu); @@ -133,7 +133,7 @@ static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu) struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; - unsigned long cause = vcpu->arch.host_cp0_cause; + u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -145,7 +145,7 @@ static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu) } } else if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0 || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) { - kvm_debug("USER ADDR TLB LD fault: cause %#lx, PC: %p, BadVaddr: %#lx\n", + kvm_debug("USER ADDR TLB LD fault: cause %#x, PC: %p, BadVaddr: %#lx\n", cause, opc, badvaddr); er = kvm_mips_handle_tlbmiss(cause, opc, run, vcpu); if (er == EMULATE_DONE) @@ -165,7 +165,7 @@ static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu) ret = RESUME_HOST; } } else { - kvm_err("Illegal TLB LD fault address , cause %#lx, PC: %p, BadVaddr: %#lx\n", + kvm_err("Illegal TLB LD fault address , cause %#x, PC: %p, BadVaddr: %#lx\n", cause, opc, badvaddr); kvm_mips_dump_host_tlbs(); kvm_arch_vcpu_dump_regs(vcpu); @@ -180,7 +180,7 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu) struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; - unsigned long cause = vcpu->arch.host_cp0_cause; + u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -219,7 +219,7 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu) ret = RESUME_HOST; } } else { - kvm_err("Illegal TLB ST fault address , cause %#lx, PC: %p, BadVaddr: %#lx\n", + kvm_err("Illegal TLB ST fault address , cause %#x, PC: %p, BadVaddr: %#lx\n", cause, opc, badvaddr); kvm_mips_dump_host_tlbs(); kvm_arch_vcpu_dump_regs(vcpu); @@ -234,7 +234,7 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu) struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; - unsigned long cause = vcpu->arch.host_cp0_cause; + u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -251,7 +251,7 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu) ret = RESUME_HOST; } } else { - kvm_err("Address Error (STORE): cause %#lx, PC: %p, BadVaddr: %#lx\n", + kvm_err("Address Error (STORE): cause %#x, PC: %p, BadVaddr: %#lx\n", cause, opc, badvaddr); run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; @@ -264,7 +264,7 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu) struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; - unsigned long cause = vcpu->arch.host_cp0_cause; + u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -280,7 +280,7 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu) ret = RESUME_HOST; } } else { - kvm_err("Address Error (LOAD): cause %#lx, PC: %p, BadVaddr: %#lx\n", + kvm_err("Address Error (LOAD): cause %#x, PC: %p, BadVaddr: %#lx\n", cause, opc, badvaddr); run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; @@ -293,7 +293,7 @@ static int kvm_trap_emul_handle_syscall(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; - unsigned long cause = vcpu->arch.host_cp0_cause; + u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -311,7 +311,7 @@ static int kvm_trap_emul_handle_res_inst(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; - unsigned long cause = vcpu->arch.host_cp0_cause; + u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -329,7 +329,7 @@ static int kvm_trap_emul_handle_break(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; - unsigned long cause = vcpu->arch.host_cp0_cause; + u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -347,7 +347,7 @@ static int kvm_trap_emul_handle_trap(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *)vcpu->arch.pc; - unsigned long cause = vcpu->arch.host_cp0_cause; + u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -365,7 +365,7 @@ static int kvm_trap_emul_handle_msa_fpe(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *)vcpu->arch.pc; - unsigned long cause = vcpu->arch.host_cp0_cause; + u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -383,7 +383,7 @@ static int kvm_trap_emul_handle_fpe(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *)vcpu->arch.pc; - unsigned long cause = vcpu->arch.host_cp0_cause; + u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; @@ -408,7 +408,7 @@ static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu) struct mips_coproc *cop0 = vcpu->arch.cop0; struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; - unsigned long cause = vcpu->arch.host_cp0_cause; + u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; From 403015b323a297475919e1a8ccc1ceb0fcb85f5f Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 9 Jun 2016 14:19:10 +0100 Subject: [PATCH 063/302] MIPS: KVM: Move non-TLB handling code out of tlb.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Various functions in tlb.c perform higher level MMU handling, but don't strictly need to be statically built into the kernel as they don't directly manipulate TLB entries. Move these functions out into a separate mmu.c which will be built into the KVM kernel module. This allows them to directly reference KVM functions in the KVM kernel module in future. Module exports of these functions have been removed, since they aren't needed outside of KVM. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 4 + arch/mips/kvm/Makefile | 1 + arch/mips/kvm/mmu.c | 380 +++++++++++++++++++++++++++++++ arch/mips/kvm/tlb.c | 364 +---------------------------- 4 files changed, 389 insertions(+), 360 deletions(-) create mode 100644 arch/mips/kvm/mmu.c diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index dceb49422e3b..f64be7987a32 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -649,6 +649,10 @@ extern enum emulation_result kvm_mips_handle_tlbmod(u32 cause, extern void kvm_mips_dump_host_tlbs(void); extern void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu); +extern int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi, + unsigned long entrylo0, + unsigned long entrylo1, + int flush_dcache_mask); extern void kvm_mips_flush_host_tlb(int skip_kseg0); extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi); diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile index 637ebbebd549..0aabe40fcac9 100644 --- a/arch/mips/kvm/Makefile +++ b/arch/mips/kvm/Makefile @@ -10,6 +10,7 @@ common-objs-$(CONFIG_CPU_HAS_MSA) += msa.o kvm-objs := $(common-objs-y) mips.o emulate.o locore.o \ interrupt.o stats.o commpage.o \ dyntrans.o trap_emul.o fpu.o +kvm-objs += mmu.o obj-$(CONFIG_KVM) += kvm.o obj-y += callback.o tlb.o diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c new file mode 100644 index 000000000000..d18cadfd6e01 --- /dev/null +++ b/arch/mips/kvm/mmu.c @@ -0,0 +1,380 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * KVM/MIPS MMU handling in the KVM module. + * + * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved. + * Authors: Sanjay Lal + */ + +#include +#include + +static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) +{ + int cpu = smp_processor_id(); + + return vcpu->arch.guest_kernel_asid[cpu] & + cpu_asid_mask(&cpu_data[cpu]); +} + +static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu) +{ + int cpu = smp_processor_id(); + + return vcpu->arch.guest_user_asid[cpu] & + cpu_asid_mask(&cpu_data[cpu]); +} + +static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn) +{ + int srcu_idx, err = 0; + kvm_pfn_t pfn; + + if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE) + return 0; + + srcu_idx = srcu_read_lock(&kvm->srcu); + pfn = kvm_mips_gfn_to_pfn(kvm, gfn); + + if (kvm_mips_is_error_pfn(pfn)) { + kvm_err("Couldn't get pfn for gfn %#llx!\n", gfn); + err = -EFAULT; + goto out; + } + + kvm->arch.guest_pmap[gfn] = pfn; +out: + srcu_read_unlock(&kvm->srcu, srcu_idx); + return err; +} + +/* Translate guest KSEG0 addresses to Host PA */ +unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu, + unsigned long gva) +{ + gfn_t gfn; + unsigned long offset = gva & ~PAGE_MASK; + struct kvm *kvm = vcpu->kvm; + + if (KVM_GUEST_KSEGX(gva) != KVM_GUEST_KSEG0) { + kvm_err("%s/%p: Invalid gva: %#lx\n", __func__, + __builtin_return_address(0), gva); + return KVM_INVALID_PAGE; + } + + gfn = (KVM_GUEST_CPHYSADDR(gva) >> PAGE_SHIFT); + + if (gfn >= kvm->arch.guest_pmap_npages) { + kvm_err("%s: Invalid gfn: %#llx, GVA: %#lx\n", __func__, gfn, + gva); + return KVM_INVALID_PAGE; + } + + if (kvm_mips_map_page(vcpu->kvm, gfn) < 0) + return KVM_INVALID_ADDR; + + return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset; +} + +/* XXXKYMA: Must be called with interrupts disabled */ +int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, + struct kvm_vcpu *vcpu) +{ + gfn_t gfn; + kvm_pfn_t pfn0, pfn1; + unsigned long vaddr = 0; + unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0; + int even; + struct kvm *kvm = vcpu->kvm; + const int flush_dcache_mask = 0; + int ret; + + if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) { + kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr); + kvm_mips_dump_host_tlbs(); + return -1; + } + + gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT); + if (gfn >= kvm->arch.guest_pmap_npages) { + kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__, + gfn, badvaddr); + kvm_mips_dump_host_tlbs(); + return -1; + } + even = !(gfn & 0x1); + vaddr = badvaddr & (PAGE_MASK << 1); + + if (kvm_mips_map_page(vcpu->kvm, gfn) < 0) + return -1; + + if (kvm_mips_map_page(vcpu->kvm, gfn ^ 0x1) < 0) + return -1; + + if (even) { + pfn0 = kvm->arch.guest_pmap[gfn]; + pfn1 = kvm->arch.guest_pmap[gfn ^ 0x1]; + } else { + pfn0 = kvm->arch.guest_pmap[gfn ^ 0x1]; + pfn1 = kvm->arch.guest_pmap[gfn]; + } + + entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) | + (1 << 2) | (0x1 << 1); + entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) | + (1 << 2) | (0x1 << 1); + + preempt_disable(); + entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu)); + ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1, + flush_dcache_mask); + preempt_enable(); + + return ret; +} + +int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, + struct kvm_mips_tlb *tlb, + unsigned long *hpa0, + unsigned long *hpa1) +{ + unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0; + struct kvm *kvm = vcpu->kvm; + kvm_pfn_t pfn0, pfn1; + int ret; + + if ((tlb->tlb_hi & VPN2_MASK) == 0) { + pfn0 = 0; + pfn1 = 0; + } else { + if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo0) + >> PAGE_SHIFT) < 0) + return -1; + + if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo1) + >> PAGE_SHIFT) < 0) + return -1; + + pfn0 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo0) + >> PAGE_SHIFT]; + pfn1 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo1) + >> PAGE_SHIFT]; + } + + if (hpa0) + *hpa0 = pfn0 << PAGE_SHIFT; + + if (hpa1) + *hpa1 = pfn1 << PAGE_SHIFT; + + /* Get attributes from the Guest TLB */ + entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) | + (tlb->tlb_lo0 & MIPS3_PG_D) | (tlb->tlb_lo0 & MIPS3_PG_V); + entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) | + (tlb->tlb_lo1 & MIPS3_PG_D) | (tlb->tlb_lo1 & MIPS3_PG_V); + + kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc, + tlb->tlb_lo0, tlb->tlb_lo1); + + preempt_disable(); + entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ? + kvm_mips_get_kernel_asid(vcpu) : + kvm_mips_get_user_asid(vcpu)); + ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1, + tlb->tlb_mask); + preempt_enable(); + + return ret; +} + +void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, + struct kvm_vcpu *vcpu) +{ + unsigned long asid = asid_cache(cpu); + + asid += cpu_asid_inc(); + if (!(asid & cpu_asid_mask(&cpu_data[cpu]))) { + if (cpu_has_vtag_icache) + flush_icache_all(); + + kvm_local_flush_tlb_all(); /* start new asid cycle */ + + if (!asid) /* fix version if needed */ + asid = asid_first_version(cpu); + } + + cpu_context(cpu, mm) = asid_cache(cpu) = asid; +} + +/** + * kvm_mips_migrate_count() - Migrate timer. + * @vcpu: Virtual CPU. + * + * Migrate CP0_Count hrtimer to the current CPU by cancelling and restarting it + * if it was running prior to being cancelled. + * + * Must be called when the VCPU is migrated to a different CPU to ensure that + * timer expiry during guest execution interrupts the guest and causes the + * interrupt to be delivered in a timely manner. + */ +static void kvm_mips_migrate_count(struct kvm_vcpu *vcpu) +{ + if (hrtimer_cancel(&vcpu->arch.comparecount_timer)) + hrtimer_restart(&vcpu->arch.comparecount_timer); +} + +/* Restore ASID once we are scheduled back after preemption */ +void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]); + unsigned long flags; + int newasid = 0; + + kvm_debug("%s: vcpu %p, cpu: %d\n", __func__, vcpu, cpu); + + /* Allocate new kernel and user ASIDs if needed */ + + local_irq_save(flags); + + if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) & + asid_version_mask(cpu)) { + kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu); + vcpu->arch.guest_kernel_asid[cpu] = + vcpu->arch.guest_kernel_mm.context.asid[cpu]; + kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu); + vcpu->arch.guest_user_asid[cpu] = + vcpu->arch.guest_user_mm.context.asid[cpu]; + newasid++; + + kvm_debug("[%d]: cpu_context: %#lx\n", cpu, + cpu_context(cpu, current->mm)); + kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n", + cpu, vcpu->arch.guest_kernel_asid[cpu]); + kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu, + vcpu->arch.guest_user_asid[cpu]); + } + + if (vcpu->arch.last_sched_cpu != cpu) { + kvm_debug("[%d->%d]KVM VCPU[%d] switch\n", + vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id); + /* + * Migrate the timer interrupt to the current CPU so that it + * always interrupts the guest and synchronously triggers a + * guest timer interrupt. + */ + kvm_mips_migrate_count(vcpu); + } + + if (!newasid) { + /* + * If we preempted while the guest was executing, then reload + * the pre-empted ASID + */ + if (current->flags & PF_VCPU) { + write_c0_entryhi(vcpu->arch. + preempt_entryhi & asid_mask); + ehb(); + } + } else { + /* New ASIDs were allocated for the VM */ + + /* + * Were we in guest context? If so then the pre-empted ASID is + * no longer valid, we need to set it to what it should be based + * on the mode of the Guest (Kernel/User) + */ + if (current->flags & PF_VCPU) { + if (KVM_GUEST_KERNEL_MODE(vcpu)) + write_c0_entryhi(vcpu->arch. + guest_kernel_asid[cpu] & + asid_mask); + else + write_c0_entryhi(vcpu->arch. + guest_user_asid[cpu] & + asid_mask); + ehb(); + } + } + + /* restore guest state to registers */ + kvm_mips_callbacks->vcpu_set_regs(vcpu); + + local_irq_restore(flags); + +} + +/* ASID can change if another task is scheduled during preemption */ +void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + int cpu; + + local_irq_save(flags); + + cpu = smp_processor_id(); + + vcpu->arch.preempt_entryhi = read_c0_entryhi(); + vcpu->arch.last_sched_cpu = cpu; + + /* save guest state in registers */ + kvm_mips_callbacks->vcpu_get_regs(vcpu); + + if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) & + asid_version_mask(cpu))) { + kvm_debug("%s: Dropping MMU Context: %#lx\n", __func__, + cpu_context(cpu, current->mm)); + drop_mmu_context(current->mm, cpu); + } + write_c0_entryhi(cpu_asid(cpu, current->mm)); + ehb(); + + local_irq_restore(flags); +} + +u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + unsigned long paddr, flags, vpn2, asid; + u32 inst; + int index; + + if (KVM_GUEST_KSEGX((unsigned long) opc) < KVM_GUEST_KSEG0 || + KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) { + local_irq_save(flags); + index = kvm_mips_host_tlb_lookup(vcpu, (unsigned long) opc); + if (index >= 0) { + inst = *(opc); + } else { + vpn2 = (unsigned long) opc & VPN2_MASK; + asid = kvm_read_c0_guest_entryhi(cop0) & + KVM_ENTRYHI_ASID; + index = kvm_mips_guest_tlb_lookup(vcpu, vpn2 | asid); + if (index < 0) { + kvm_err("%s: get_user_failed for %p, vcpu: %p, ASID: %#lx\n", + __func__, opc, vcpu, read_c0_entryhi()); + kvm_mips_dump_host_tlbs(); + local_irq_restore(flags); + return KVM_INVALID_INST; + } + kvm_mips_handle_mapped_seg_tlb_fault(vcpu, + &vcpu->arch. + guest_tlb[index], + NULL, NULL); + inst = *(opc); + } + local_irq_restore(flags); + } else if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) { + paddr = + kvm_mips_translate_guest_kseg0_to_hpa(vcpu, + (unsigned long) opc); + inst = *(u32 *) CKSEG0ADDR(paddr); + } else { + kvm_err("%s: illegal address: %p\n", __func__, opc); + return KVM_INVALID_INST; + } + + return inst; +} diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index c4e11e138042..373817c3166b 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include @@ -45,7 +45,7 @@ EXPORT_SYMBOL_GPL(kvm_mips_release_pfn_clean); bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn); EXPORT_SYMBOL_GPL(kvm_mips_is_error_pfn); -u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) +static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) { int cpu = smp_processor_id(); @@ -53,7 +53,7 @@ u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) cpu_asid_mask(&cpu_data[cpu]); } -u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu) +static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu) { int cpu = smp_processor_id(); @@ -146,58 +146,6 @@ void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_mips_dump_guest_tlbs); -static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn) -{ - int srcu_idx, err = 0; - kvm_pfn_t pfn; - - if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE) - return 0; - - srcu_idx = srcu_read_lock(&kvm->srcu); - pfn = kvm_mips_gfn_to_pfn(kvm, gfn); - - if (kvm_mips_is_error_pfn(pfn)) { - kvm_err("Couldn't get pfn for gfn %#llx!\n", gfn); - err = -EFAULT; - goto out; - } - - kvm->arch.guest_pmap[gfn] = pfn; -out: - srcu_read_unlock(&kvm->srcu, srcu_idx); - return err; -} - -/* Translate guest KSEG0 addresses to Host PA */ -unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu, - unsigned long gva) -{ - gfn_t gfn; - unsigned long offset = gva & ~PAGE_MASK; - struct kvm *kvm = vcpu->kvm; - - if (KVM_GUEST_KSEGX(gva) != KVM_GUEST_KSEG0) { - kvm_err("%s/%p: Invalid gva: %#lx\n", __func__, - __builtin_return_address(0), gva); - return KVM_INVALID_PAGE; - } - - gfn = (KVM_GUEST_CPHYSADDR(gva) >> PAGE_SHIFT); - - if (gfn >= kvm->arch.guest_pmap_npages) { - kvm_err("%s: Invalid gfn: %#llx, GVA: %#lx\n", __func__, gfn, - gva); - return KVM_INVALID_PAGE; - } - - if (kvm_mips_map_page(vcpu->kvm, gfn) < 0) - return KVM_INVALID_ADDR; - - return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset; -} -EXPORT_SYMBOL_GPL(kvm_mips_translate_guest_kseg0_to_hpa); - /* XXXKYMA: Must be called with interrupts disabled */ /* set flush_dcache_mask == 0 if no dcache flush required */ int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi, @@ -261,64 +209,7 @@ int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi, local_irq_restore(flags); return 0; } - -/* XXXKYMA: Must be called with interrupts disabled */ -int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, - struct kvm_vcpu *vcpu) -{ - gfn_t gfn; - kvm_pfn_t pfn0, pfn1; - unsigned long vaddr = 0; - unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0; - int even; - struct kvm *kvm = vcpu->kvm; - const int flush_dcache_mask = 0; - int ret; - - if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) { - kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr); - kvm_mips_dump_host_tlbs(); - return -1; - } - - gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT); - if (gfn >= kvm->arch.guest_pmap_npages) { - kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__, - gfn, badvaddr); - kvm_mips_dump_host_tlbs(); - return -1; - } - even = !(gfn & 0x1); - vaddr = badvaddr & (PAGE_MASK << 1); - - if (kvm_mips_map_page(vcpu->kvm, gfn) < 0) - return -1; - - if (kvm_mips_map_page(vcpu->kvm, gfn ^ 0x1) < 0) - return -1; - - if (even) { - pfn0 = kvm->arch.guest_pmap[gfn]; - pfn1 = kvm->arch.guest_pmap[gfn ^ 0x1]; - } else { - pfn0 = kvm->arch.guest_pmap[gfn ^ 0x1]; - pfn1 = kvm->arch.guest_pmap[gfn]; - } - - entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) | - (1 << 2) | (0x1 << 1); - entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) | - (1 << 2) | (0x1 << 1); - - preempt_disable(); - entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu)); - ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1, - flush_dcache_mask); - preempt_enable(); - - return ret; -} -EXPORT_SYMBOL_GPL(kvm_mips_handle_kseg0_tlb_fault); +EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_write); int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, struct kvm_vcpu *vcpu) @@ -363,61 +254,6 @@ int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, } EXPORT_SYMBOL_GPL(kvm_mips_handle_commpage_tlb_fault); -int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, - struct kvm_mips_tlb *tlb, - unsigned long *hpa0, - unsigned long *hpa1) -{ - unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0; - struct kvm *kvm = vcpu->kvm; - kvm_pfn_t pfn0, pfn1; - int ret; - - if ((tlb->tlb_hi & VPN2_MASK) == 0) { - pfn0 = 0; - pfn1 = 0; - } else { - if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo0) - >> PAGE_SHIFT) < 0) - return -1; - - if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo1) - >> PAGE_SHIFT) < 0) - return -1; - - pfn0 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo0) - >> PAGE_SHIFT]; - pfn1 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo1) - >> PAGE_SHIFT]; - } - - if (hpa0) - *hpa0 = pfn0 << PAGE_SHIFT; - - if (hpa1) - *hpa1 = pfn1 << PAGE_SHIFT; - - /* Get attributes from the Guest TLB */ - entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) | - (tlb->tlb_lo0 & MIPS3_PG_D) | (tlb->tlb_lo0 & MIPS3_PG_V); - entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) | - (tlb->tlb_lo1 & MIPS3_PG_D) | (tlb->tlb_lo1 & MIPS3_PG_V); - - kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc, - tlb->tlb_lo0, tlb->tlb_lo1); - - preempt_disable(); - entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ? - kvm_mips_get_kernel_asid(vcpu) : - kvm_mips_get_user_asid(vcpu)); - ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1, - tlb->tlb_mask); - preempt_enable(); - - return ret; -} -EXPORT_SYMBOL_GPL(kvm_mips_handle_mapped_seg_tlb_fault); - int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi) { int i; @@ -574,25 +410,6 @@ void kvm_mips_flush_host_tlb(int skip_kseg0) } EXPORT_SYMBOL_GPL(kvm_mips_flush_host_tlb); -void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, - struct kvm_vcpu *vcpu) -{ - unsigned long asid = asid_cache(cpu); - - asid += cpu_asid_inc(); - if (!(asid & cpu_asid_mask(&cpu_data[cpu]))) { - if (cpu_has_vtag_icache) - flush_icache_all(); - - kvm_local_flush_tlb_all(); /* start new asid cycle */ - - if (!asid) /* fix version if needed */ - asid = asid_first_version(cpu); - } - - cpu_context(cpu, mm) = asid_cache(cpu) = asid; -} - void kvm_local_flush_tlb_all(void) { unsigned long flags; @@ -621,176 +438,3 @@ void kvm_local_flush_tlb_all(void) local_irq_restore(flags); } EXPORT_SYMBOL_GPL(kvm_local_flush_tlb_all); - -/** - * kvm_mips_migrate_count() - Migrate timer. - * @vcpu: Virtual CPU. - * - * Migrate CP0_Count hrtimer to the current CPU by cancelling and restarting it - * if it was running prior to being cancelled. - * - * Must be called when the VCPU is migrated to a different CPU to ensure that - * timer expiry during guest execution interrupts the guest and causes the - * interrupt to be delivered in a timely manner. - */ -static void kvm_mips_migrate_count(struct kvm_vcpu *vcpu) -{ - if (hrtimer_cancel(&vcpu->arch.comparecount_timer)) - hrtimer_restart(&vcpu->arch.comparecount_timer); -} - -/* Restore ASID once we are scheduled back after preemption */ -void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) -{ - unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]); - unsigned long flags; - int newasid = 0; - - kvm_debug("%s: vcpu %p, cpu: %d\n", __func__, vcpu, cpu); - - /* Allocate new kernel and user ASIDs if needed */ - - local_irq_save(flags); - - if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) & - asid_version_mask(cpu)) { - kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu); - vcpu->arch.guest_kernel_asid[cpu] = - vcpu->arch.guest_kernel_mm.context.asid[cpu]; - kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu); - vcpu->arch.guest_user_asid[cpu] = - vcpu->arch.guest_user_mm.context.asid[cpu]; - newasid++; - - kvm_debug("[%d]: cpu_context: %#lx\n", cpu, - cpu_context(cpu, current->mm)); - kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n", - cpu, vcpu->arch.guest_kernel_asid[cpu]); - kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu, - vcpu->arch.guest_user_asid[cpu]); - } - - if (vcpu->arch.last_sched_cpu != cpu) { - kvm_debug("[%d->%d]KVM VCPU[%d] switch\n", - vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id); - /* - * Migrate the timer interrupt to the current CPU so that it - * always interrupts the guest and synchronously triggers a - * guest timer interrupt. - */ - kvm_mips_migrate_count(vcpu); - } - - if (!newasid) { - /* - * If we preempted while the guest was executing, then reload - * the pre-empted ASID - */ - if (current->flags & PF_VCPU) { - write_c0_entryhi(vcpu->arch. - preempt_entryhi & asid_mask); - ehb(); - } - } else { - /* New ASIDs were allocated for the VM */ - - /* - * Were we in guest context? If so then the pre-empted ASID is - * no longer valid, we need to set it to what it should be based - * on the mode of the Guest (Kernel/User) - */ - if (current->flags & PF_VCPU) { - if (KVM_GUEST_KERNEL_MODE(vcpu)) - write_c0_entryhi(vcpu->arch. - guest_kernel_asid[cpu] & - asid_mask); - else - write_c0_entryhi(vcpu->arch. - guest_user_asid[cpu] & - asid_mask); - ehb(); - } - } - - /* restore guest state to registers */ - kvm_mips_callbacks->vcpu_set_regs(vcpu); - - local_irq_restore(flags); - -} -EXPORT_SYMBOL_GPL(kvm_arch_vcpu_load); - -/* ASID can change if another task is scheduled during preemption */ -void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) -{ - unsigned long flags; - int cpu; - - local_irq_save(flags); - - cpu = smp_processor_id(); - - vcpu->arch.preempt_entryhi = read_c0_entryhi(); - vcpu->arch.last_sched_cpu = cpu; - - /* save guest state in registers */ - kvm_mips_callbacks->vcpu_get_regs(vcpu); - - if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) & - asid_version_mask(cpu))) { - kvm_debug("%s: Dropping MMU Context: %#lx\n", __func__, - cpu_context(cpu, current->mm)); - drop_mmu_context(current->mm, cpu); - } - write_c0_entryhi(cpu_asid(cpu, current->mm)); - ehb(); - - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(kvm_arch_vcpu_put); - -u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu) -{ - struct mips_coproc *cop0 = vcpu->arch.cop0; - unsigned long paddr, flags, vpn2, asid; - u32 inst; - int index; - - if (KVM_GUEST_KSEGX((unsigned long) opc) < KVM_GUEST_KSEG0 || - KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) { - local_irq_save(flags); - index = kvm_mips_host_tlb_lookup(vcpu, (unsigned long) opc); - if (index >= 0) { - inst = *(opc); - } else { - vpn2 = (unsigned long) opc & VPN2_MASK; - asid = kvm_read_c0_guest_entryhi(cop0) & - KVM_ENTRYHI_ASID; - index = kvm_mips_guest_tlb_lookup(vcpu, vpn2 | asid); - if (index < 0) { - kvm_err("%s: get_user_failed for %p, vcpu: %p, ASID: %#lx\n", - __func__, opc, vcpu, read_c0_entryhi()); - kvm_mips_dump_host_tlbs(); - local_irq_restore(flags); - return KVM_INVALID_INST; - } - kvm_mips_handle_mapped_seg_tlb_fault(vcpu, - &vcpu->arch. - guest_tlb[index], - NULL, NULL); - inst = *(opc); - } - local_irq_restore(flags); - } else if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) { - paddr = - kvm_mips_translate_guest_kseg0_to_hpa(vcpu, - (unsigned long) opc); - inst = *(u32 *) CKSEG0ADDR(paddr); - } else { - kvm_err("%s: illegal address: %p\n", __func__, opc); - return KVM_INVALID_INST; - } - - return inst; -} -EXPORT_SYMBOL_GPL(kvm_get_inst); From 9befad23ed3e2e178741cb84ac09c0ff45610537 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 9 Jun 2016 14:19:11 +0100 Subject: [PATCH 064/302] MIPS: KVM: Don't indirect KVM functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Several KVM module functions are indirected so that they can be accessed from tlb.c which is statically built into the kernel. This is no longer necessary as the relevant bits of code have moved into mmu.c which is part of the KVM module, so drop the indirections. Note: is_error_pfn() is defined inline in kvm_host.h, so didn't actually require the KVM module to be loaded for it to work anyway. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 3 --- arch/mips/kvm/mips.c | 18 +----------------- arch/mips/kvm/mmu.c | 4 ++-- arch/mips/kvm/tlb.c | 10 ---------- 4 files changed, 3 insertions(+), 32 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index f64be7987a32..c8f9671c2779 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -93,9 +93,6 @@ #define KVM_INVALID_ADDR 0xdeadbeef extern atomic_t kvm_mips_instance; -extern kvm_pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn); -extern void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn); -extern bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn); struct kvm_vm_stat { u32 remote_tlb_flush; diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index a2b1b9205b94..c1ab6110ca1d 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -147,7 +147,7 @@ void kvm_mips_free_vcpus(struct kvm *kvm) /* Put the pages we reserved for the guest pmap */ for (i = 0; i < kvm->arch.guest_pmap_npages; i++) { if (kvm->arch.guest_pmap[i] != KVM_INVALID_PAGE) - kvm_mips_release_pfn_clean(kvm->arch.guest_pmap[i]); + kvm_release_pfn_clean(kvm->arch.guest_pmap[i]); } kfree(kvm->arch.guest_pmap); @@ -1645,18 +1645,6 @@ static int __init kvm_mips_init(void) register_die_notifier(&kvm_mips_csr_die_notifier); - /* - * On MIPS, kernel modules are executed from "mapped space", which - * requires TLBs. The TLB handling code is statically linked with - * the rest of the kernel (tlb.c) to avoid the possibility of - * double faulting. The issue is that the TLB code references - * routines that are part of the the KVM module, which are only - * available once the module is loaded. - */ - kvm_mips_gfn_to_pfn = gfn_to_pfn; - kvm_mips_release_pfn_clean = kvm_release_pfn_clean; - kvm_mips_is_error_pfn = is_error_pfn; - return 0; } @@ -1664,10 +1652,6 @@ static void __exit kvm_mips_exit(void) { kvm_exit(); - kvm_mips_gfn_to_pfn = NULL; - kvm_mips_release_pfn_clean = NULL; - kvm_mips_is_error_pfn = NULL; - unregister_die_notifier(&kvm_mips_csr_die_notifier); } diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index d18cadfd6e01..d5ada83ec55c 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -37,9 +37,9 @@ static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn) return 0; srcu_idx = srcu_read_lock(&kvm->srcu); - pfn = kvm_mips_gfn_to_pfn(kvm, gfn); + pfn = gfn_to_pfn(kvm, gfn); - if (kvm_mips_is_error_pfn(pfn)) { + if (is_error_pfn(pfn)) { kvm_err("Couldn't get pfn for gfn %#llx!\n", gfn); err = -EFAULT; goto out; diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 373817c3166b..37d77ad8431e 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -35,16 +35,6 @@ atomic_t kvm_mips_instance; EXPORT_SYMBOL_GPL(kvm_mips_instance); -/* These function pointers are initialized once the KVM module is loaded */ -kvm_pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn); -EXPORT_SYMBOL_GPL(kvm_mips_gfn_to_pfn); - -void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn); -EXPORT_SYMBOL_GPL(kvm_mips_release_pfn_clean); - -bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn); -EXPORT_SYMBOL_GPL(kvm_mips_is_error_pfn); - static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) { int cpu = smp_processor_id(); From 021df206354cf1e1d341b66dee19ac250c9dc37d Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 9 Jun 2016 14:19:12 +0100 Subject: [PATCH 065/302] MIPS: KVM: Simplify even/odd TLB handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When handling TLB faults in the guest KSeg0 region, a pair of physical addresses are read from the guest physical address map. However that process is rather convoluted with an if/then/else statement. Simplify it to just clear the lowest bit for the even entry and set the lowest bit for the odd entry. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mmu.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index d5ada83ec55c..9924c1d253a7 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -87,7 +87,6 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, kvm_pfn_t pfn0, pfn1; unsigned long vaddr = 0; unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0; - int even; struct kvm *kvm = vcpu->kvm; const int flush_dcache_mask = 0; int ret; @@ -105,7 +104,6 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, kvm_mips_dump_host_tlbs(); return -1; } - even = !(gfn & 0x1); vaddr = badvaddr & (PAGE_MASK << 1); if (kvm_mips_map_page(vcpu->kvm, gfn) < 0) @@ -114,13 +112,8 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, if (kvm_mips_map_page(vcpu->kvm, gfn ^ 0x1) < 0) return -1; - if (even) { - pfn0 = kvm->arch.guest_pmap[gfn]; - pfn1 = kvm->arch.guest_pmap[gfn ^ 0x1]; - } else { - pfn0 = kvm->arch.guest_pmap[gfn ^ 0x1]; - pfn1 = kvm->arch.guest_pmap[gfn]; - } + pfn0 = kvm->arch.guest_pmap[gfn & ~0x1]; + pfn1 = kvm->arch.guest_pmap[gfn | 0x1]; entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) | (1 << 2) | (0x1 << 1); From 26ee17ff71d3def831bfa4f6851ed1ba789e24f6 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 9 Jun 2016 14:19:13 +0100 Subject: [PATCH 066/302] MIPS: KVM: Drop unused hpa0/hpa1 args from function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function kvm_mips_handle_mapped_seg_tlb_fault() has two completely unused pointer arguments, hpa0 and hpa1, for which all users always pass NULL. Drop these two arguments and update the callers. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 4 +--- arch/mips/kvm/emulate.c | 7 ++----- arch/mips/kvm/mmu.c | 13 ++----------- 3 files changed, 5 insertions(+), 19 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index c8f9671c2779..f68293b4a598 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -630,9 +630,7 @@ extern int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, struct kvm_vcpu *vcpu); extern int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, - struct kvm_mips_tlb *tlb, - unsigned long *hpa0, - unsigned long *hpa1); + struct kvm_mips_tlb *tlb); extern enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, u32 *opc, diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 3baab5ec3d3b..fb77fb469776 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1633,9 +1633,7 @@ enum emulation_result kvm_mips_emulate_cache(u32 inst, u32 *opc, * We fault an entry from the guest tlb to the * shadow host TLB */ - kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, - NULL, - NULL); + kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb); } } } else { @@ -2599,8 +2597,7 @@ enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, * OK we have a Guest TLB entry, now inject it into the * shadow host TLB */ - kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, NULL, - NULL); + kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb); } } diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 9924c1d253a7..4d42b47b500b 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -130,9 +130,7 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, } int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, - struct kvm_mips_tlb *tlb, - unsigned long *hpa0, - unsigned long *hpa1) + struct kvm_mips_tlb *tlb) { unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0; struct kvm *kvm = vcpu->kvm; @@ -157,12 +155,6 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, >> PAGE_SHIFT]; } - if (hpa0) - *hpa0 = pfn0 << PAGE_SHIFT; - - if (hpa1) - *hpa1 = pfn1 << PAGE_SHIFT; - /* Get attributes from the Guest TLB */ entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) | (tlb->tlb_lo0 & MIPS3_PG_D) | (tlb->tlb_lo0 & MIPS3_PG_V); @@ -354,8 +346,7 @@ u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu) } kvm_mips_handle_mapped_seg_tlb_fault(vcpu, &vcpu->arch. - guest_tlb[index], - NULL, NULL); + guest_tlb[index]); inst = *(opc); } local_irq_restore(flags); From 878edf014e29de38c49153aba20273fbc9ae31af Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 9 Jun 2016 14:19:14 +0100 Subject: [PATCH 067/302] MIPS: KVM: Restore host EBase from ebase variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The host kernel's exception vector base address is currently saved in the VCPU structure at creation time, and restored on a guest exit. However it doesn't change and can already be easily accessed from the 'ebase' variable (arch/mips/kernel/traps.c), so drop the host_ebase member of kvm_vcpu_arch, export the 'ebase' variable to modules and load from there instead. This does result in a single extra instruction (lui) on the guest exit path, but simplifies the code a bit and removes the redundant storage of the host exception base address. Credit for the idea goes to Cavium's VZ KVM implementation. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 2 +- arch/mips/kernel/asm-offsets.c | 1 - arch/mips/kernel/traps.c | 1 + arch/mips/kvm/locore.S | 2 +- arch/mips/kvm/mips.c | 3 --- 5 files changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index f68293b4a598..24a8e557db88 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -334,7 +334,7 @@ struct kvm_mips_tlb { #define KVM_MIPS_GUEST_TLB_SIZE 64 struct kvm_vcpu_arch { - void *host_ebase, *guest_ebase; + void *guest_ebase; int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu); unsigned long host_stack; unsigned long host_gp; diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index 420808899c70..a1263d188a5a 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -355,7 +355,6 @@ void output_kvm_defines(void) OFFSET(VCPU_RUN, kvm_vcpu, run); OFFSET(VCPU_HOST_ARCH, kvm_vcpu, arch); - OFFSET(VCPU_HOST_EBASE, kvm_vcpu_arch, host_ebase); OFFSET(VCPU_GUEST_EBASE, kvm_vcpu_arch, guest_ebase); OFFSET(VCPU_HOST_STACK, kvm_vcpu_arch, host_stack); diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 4a1712b5abdf..66e5820bfdae 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -1859,6 +1859,7 @@ void __noreturn nmi_exception_handler(struct pt_regs *regs) #define VECTORSPACING 0x100 /* for EI/VI mode */ unsigned long ebase; +EXPORT_SYMBOL_GPL(ebase); unsigned long exception_handlers[32]; unsigned long vi_handlers[64]; diff --git a/arch/mips/kvm/locore.S b/arch/mips/kvm/locore.S index 43c8ef847efa..f87bec546366 100644 --- a/arch/mips/kvm/locore.S +++ b/arch/mips/kvm/locore.S @@ -319,7 +319,7 @@ NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra) mtc0 k0, CP0_STATUS ehb - LONG_L k0, VCPU_HOST_EBASE(k1) + LONG_L k0, ebase mtc0 k0,CP0_EBASE /* diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index c1ab6110ca1d..6e753761b5d6 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -273,9 +273,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) else size = 0x4000; - /* Save Linux EBASE */ - vcpu->arch.host_ebase = (void *)read_c0_ebase(); - gebase = kzalloc(ALIGN(size, PAGE_SIZE), GFP_KERNEL); if (!gebase) { From 138f7ad916760a7c263678ce06545a0cfc98bf97 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 9 Jun 2016 14:19:15 +0100 Subject: [PATCH 068/302] MIPS: KVM: Clean up TLB management hazards MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM's host TLB handling routines were using tlbw hazard barrier macros around tlb_read(). Now that hazard barrier macros exist for tlbr, update this case to use them. Also fix various other unnecessary hazard barriers in this code. Signed-off-by: James Hogan Cc: Ralf Baechle Cc: Paolo Bonzini Cc: Radim Krčmář Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/tlb.c | 27 +++++---------------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 37d77ad8431e..d3000680df1f 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -195,7 +195,6 @@ int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi, /* Restore old ASID */ write_c0_entryhi(old_entryhi); mtc0_tlbw_hazard(); - tlbw_use_hazard(); local_irq_restore(flags); return 0; } @@ -219,15 +218,11 @@ int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, old_entryhi = read_c0_entryhi(); vaddr = badvaddr & (PAGE_MASK << 1); write_c0_entryhi(vaddr | kvm_mips_get_kernel_asid(vcpu)); - mtc0_tlbw_hazard(); write_c0_entrylo0(entrylo0); - mtc0_tlbw_hazard(); write_c0_entrylo1(entrylo1); - mtc0_tlbw_hazard(); write_c0_index(kvm_mips_get_commpage_asid(vcpu)); mtc0_tlbw_hazard(); tlb_write_indexed(); - mtc0_tlbw_hazard(); tlbw_use_hazard(); kvm_debug("@ %#lx idx: %2d [entryhi(R): %#lx] entrylo0 (R): 0x%08lx, entrylo1(R): 0x%08lx\n", @@ -237,7 +232,6 @@ int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, /* Restore old ASID */ write_c0_entryhi(old_entryhi); mtc0_tlbw_hazard(); - tlbw_use_hazard(); local_irq_restore(flags); return 0; @@ -291,7 +285,6 @@ int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr) /* Restore old ASID */ write_c0_entryhi(old_entryhi); mtc0_tlbw_hazard(); - tlbw_use_hazard(); local_irq_restore(flags); @@ -322,21 +315,16 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va) if (idx > 0) { write_c0_entryhi(UNIQUE_ENTRYHI(idx)); - mtc0_tlbw_hazard(); - write_c0_entrylo0(0); - mtc0_tlbw_hazard(); - write_c0_entrylo1(0); mtc0_tlbw_hazard(); tlb_write_indexed(); - mtc0_tlbw_hazard(); + tlbw_use_hazard(); } write_c0_entryhi(old_entryhi); mtc0_tlbw_hazard(); - tlbw_use_hazard(); local_irq_restore(flags); @@ -364,11 +352,11 @@ void kvm_mips_flush_host_tlb(int skip_kseg0) /* Blast 'em all away. */ for (entry = 0; entry < maxentry; entry++) { write_c0_index(entry); - mtc0_tlbw_hazard(); if (skip_kseg0) { + mtc0_tlbr_hazard(); tlb_read(); - tlbw_use_hazard(); + tlb_read_hazard(); entryhi = read_c0_entryhi(); @@ -379,22 +367,17 @@ void kvm_mips_flush_host_tlb(int skip_kseg0) /* Make sure all entries differ. */ write_c0_entryhi(UNIQUE_ENTRYHI(entry)); - mtc0_tlbw_hazard(); write_c0_entrylo0(0); - mtc0_tlbw_hazard(); write_c0_entrylo1(0); mtc0_tlbw_hazard(); tlb_write_indexed(); - mtc0_tlbw_hazard(); + tlbw_use_hazard(); } - tlbw_use_hazard(); - write_c0_entryhi(old_entryhi); write_c0_pagemask(old_pagemask); mtc0_tlbw_hazard(); - tlbw_use_hazard(); local_irq_restore(flags); } @@ -419,9 +402,9 @@ void kvm_local_flush_tlb_all(void) write_c0_index(entry); mtc0_tlbw_hazard(); tlb_write_indexed(); + tlbw_use_hazard(); entry++; } - tlbw_use_hazard(); write_c0_entryhi(old_ctx); mtc0_tlbw_hazard(); From e922a4cb71e745e53e64446d792c4603df43643a Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 9 Jun 2016 14:19:16 +0100 Subject: [PATCH 069/302] MIPS: KVM: Use dump_tlb_all() for kvm_mips_dump_host_tlbs() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM implements its own routine for dumping the host TLB entries, but we already have dump_tlb_all() which does something very similar (although it only prints out TLB entries which match the current ASID or are global). Make KVM use dump_tlb_all() along with dump_tlb_regs() to avoid the duplication and inevitable bitrot, allowing TLB dumping enhancements (e.g. for VZ and GuestIDs) to be made in a single place. Signed-off-by: James Hogan Cc: Ralf Baechle Cc: Paolo Bonzini Cc: Radim Krčmář Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/tlb.c | 42 ++++-------------------------------------- 1 file changed, 4 insertions(+), 38 deletions(-) diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index d3000680df1f..c0b8e3fc895e 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -24,6 +24,7 @@ #include #include #include +#include #undef CONFIG_MIPS_MT #include @@ -60,50 +61,15 @@ inline u32 kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu) void kvm_mips_dump_host_tlbs(void) { - unsigned long old_entryhi; - unsigned long old_pagemask; - struct kvm_mips_tlb tlb; unsigned long flags; - int i; local_irq_save(flags); - old_entryhi = read_c0_entryhi(); - old_pagemask = read_c0_pagemask(); - kvm_info("HOST TLBs:\n"); - kvm_info("ASID: %#lx\n", read_c0_entryhi() & - cpu_asid_mask(¤t_cpu_data)); + dump_tlb_regs(); + pr_info("\n"); + dump_tlb_all(); - for (i = 0; i < current_cpu_data.tlbsize; i++) { - write_c0_index(i); - mtc0_tlbw_hazard(); - - tlb_read(); - tlbw_use_hazard(); - - tlb.tlb_hi = read_c0_entryhi(); - tlb.tlb_lo0 = read_c0_entrylo0(); - tlb.tlb_lo1 = read_c0_entrylo1(); - tlb.tlb_mask = read_c0_pagemask(); - - kvm_info("TLB%c%3d Hi 0x%08lx ", - (tlb.tlb_lo0 | tlb.tlb_lo1) & MIPS3_PG_V ? ' ' : '*', - i, tlb.tlb_hi); - kvm_info("Lo0=0x%09llx %c%c attr %lx ", - (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo0), - (tlb.tlb_lo0 & MIPS3_PG_D) ? 'D' : ' ', - (tlb.tlb_lo0 & MIPS3_PG_G) ? 'G' : ' ', - (tlb.tlb_lo0 >> 3) & 7); - kvm_info("Lo1=0x%09llx %c%c attr %lx sz=%lx\n", - (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo1), - (tlb.tlb_lo1 & MIPS3_PG_D) ? 'D' : ' ', - (tlb.tlb_lo1 & MIPS3_PG_G) ? 'G' : ' ', - (tlb.tlb_lo1 >> 3) & 7, tlb.tlb_mask); - } - write_c0_entryhi(old_entryhi); - write_c0_pagemask(old_pagemask); - mtc0_tlbw_hazard(); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(kvm_mips_dump_host_tlbs); From 9fbfb06a4065772571aa58d2583868268fc8be53 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 9 Jun 2016 14:19:17 +0100 Subject: [PATCH 070/302] MIPS: KVM: Arrayify struct kvm_mips_tlb::tlb_lo* MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The values of the EntryLo0 and EntryLo1 registers for a TLB entry are stored in separate members of struct kvm_mips_tlb called tlb_lo0 and tlb_lo1 respectively. To allow future code which needs to manipulate arbitrary EntryLo data in the TLB entry to be simpler and less conditional, replace these members with an array of two elements. Signed-off-by: James Hogan Cc: Ralf Baechle Cc: Paolo Bonzini Cc: Radim Krčmář Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 11 +++++------ arch/mips/kvm/emulate.c | 10 +++++----- arch/mips/kvm/mmu.c | 20 +++++++++++--------- arch/mips/kvm/tlb.c | 21 +++++++++++---------- 4 files changed, 32 insertions(+), 30 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 24a8e557db88..2d15da111ba8 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -310,13 +310,13 @@ enum emulation_result { #define VPN2_MASK 0xffffe000 #define KVM_ENTRYHI_ASID MIPS_ENTRYHI_ASID -#define TLB_IS_GLOBAL(x) (((x).tlb_lo0 & MIPS3_PG_G) && \ - ((x).tlb_lo1 & MIPS3_PG_G)) +#define TLB_IS_GLOBAL(x) (((x).tlb_lo[0] & MIPS3_PG_G) && \ + ((x).tlb_lo[1] & MIPS3_PG_G)) #define TLB_VPN2(x) ((x).tlb_hi & VPN2_MASK) #define TLB_ASID(x) ((x).tlb_hi & KVM_ENTRYHI_ASID) #define TLB_IS_VALID(x, va) (((va) & (1 << PAGE_SHIFT)) \ - ? ((x).tlb_lo1 & MIPS3_PG_V) \ - : ((x).tlb_lo0 & MIPS3_PG_V)) + ? ((x).tlb_lo[1] & MIPS3_PG_V) \ + : ((x).tlb_lo[0] & MIPS3_PG_V)) #define TLB_HI_VPN2_HIT(x, y) ((TLB_VPN2(x) & ~(x).tlb_mask) == \ ((y) & VPN2_MASK & ~(x).tlb_mask)) #define TLB_HI_ASID_HIT(x, y) (TLB_IS_GLOBAL(x) || \ @@ -325,8 +325,7 @@ enum emulation_result { struct kvm_mips_tlb { long tlb_mask; long tlb_hi; - long tlb_lo0; - long tlb_lo1; + long tlb_lo[2]; }; #define KVM_MIPS_FPU_FPU 0x1 diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index fb77fb469776..5b89c0803405 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -833,8 +833,8 @@ enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu) tlb->tlb_mask = kvm_read_c0_guest_pagemask(cop0); tlb->tlb_hi = kvm_read_c0_guest_entryhi(cop0); - tlb->tlb_lo0 = kvm_read_c0_guest_entrylo0(cop0); - tlb->tlb_lo1 = kvm_read_c0_guest_entrylo1(cop0); + tlb->tlb_lo[0] = kvm_read_c0_guest_entrylo0(cop0); + tlb->tlb_lo[1] = kvm_read_c0_guest_entrylo1(cop0); kvm_debug("[%#lx] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n", pc, index, kvm_read_c0_guest_entryhi(cop0), @@ -866,8 +866,8 @@ enum emulation_result kvm_mips_emul_tlbwr(struct kvm_vcpu *vcpu) tlb->tlb_mask = kvm_read_c0_guest_pagemask(cop0); tlb->tlb_hi = kvm_read_c0_guest_entryhi(cop0); - tlb->tlb_lo0 = kvm_read_c0_guest_entrylo0(cop0); - tlb->tlb_lo1 = kvm_read_c0_guest_entrylo1(cop0); + tlb->tlb_lo[0] = kvm_read_c0_guest_entrylo0(cop0); + tlb->tlb_lo[1] = kvm_read_c0_guest_entrylo1(cop0); kvm_debug("[%#lx] COP0_TLBWR[%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx)\n", pc, index, kvm_read_c0_guest_entryhi(cop0), @@ -2592,7 +2592,7 @@ enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, } } else { kvm_debug("Injecting hi: %#lx, lo0: %#lx, lo1: %#lx into shadow host TLB\n", - tlb->tlb_hi, tlb->tlb_lo0, tlb->tlb_lo1); + tlb->tlb_hi, tlb->tlb_lo[0], tlb->tlb_lo[1]); /* * OK we have a Guest TLB entry, now inject it into the * shadow host TLB diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 4d42b47b500b..14996aa5e7c4 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -141,28 +141,30 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, pfn0 = 0; pfn1 = 0; } else { - if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo0) + if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo[0]) >> PAGE_SHIFT) < 0) return -1; - if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo1) + if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo[1]) >> PAGE_SHIFT) < 0) return -1; - pfn0 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo0) - >> PAGE_SHIFT]; - pfn1 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo1) - >> PAGE_SHIFT]; + pfn0 = kvm->arch.guest_pmap[ + mips3_tlbpfn_to_paddr(tlb->tlb_lo[0]) >> PAGE_SHIFT]; + pfn1 = kvm->arch.guest_pmap[ + mips3_tlbpfn_to_paddr(tlb->tlb_lo[1]) >> PAGE_SHIFT]; } /* Get attributes from the Guest TLB */ entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) | - (tlb->tlb_lo0 & MIPS3_PG_D) | (tlb->tlb_lo0 & MIPS3_PG_V); + (tlb->tlb_lo[0] & MIPS3_PG_D) | + (tlb->tlb_lo[0] & MIPS3_PG_V); entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) | - (tlb->tlb_lo1 & MIPS3_PG_D) | (tlb->tlb_lo1 & MIPS3_PG_V); + (tlb->tlb_lo[1] & MIPS3_PG_D) | + (tlb->tlb_lo[1] & MIPS3_PG_V); kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc, - tlb->tlb_lo0, tlb->tlb_lo1); + tlb->tlb_lo[0], tlb->tlb_lo[1]); preempt_disable(); entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ? diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index c0b8e3fc895e..4825d0dbb65e 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -86,18 +86,19 @@ void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu) for (i = 0; i < KVM_MIPS_GUEST_TLB_SIZE; i++) { tlb = vcpu->arch.guest_tlb[i]; kvm_info("TLB%c%3d Hi 0x%08lx ", - (tlb.tlb_lo0 | tlb.tlb_lo1) & MIPS3_PG_V ? ' ' : '*', + (tlb.tlb_lo[0] | tlb.tlb_lo[1]) & MIPS3_PG_V + ? ' ' : '*', i, tlb.tlb_hi); kvm_info("Lo0=0x%09llx %c%c attr %lx ", - (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo0), - (tlb.tlb_lo0 & MIPS3_PG_D) ? 'D' : ' ', - (tlb.tlb_lo0 & MIPS3_PG_G) ? 'G' : ' ', - (tlb.tlb_lo0 >> 3) & 7); + (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo[0]), + (tlb.tlb_lo[0] & MIPS3_PG_D) ? 'D' : ' ', + (tlb.tlb_lo[0] & MIPS3_PG_G) ? 'G' : ' ', + (tlb.tlb_lo[0] >> 3) & 7); kvm_info("Lo1=0x%09llx %c%c attr %lx sz=%lx\n", - (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo1), - (tlb.tlb_lo1 & MIPS3_PG_D) ? 'D' : ' ', - (tlb.tlb_lo1 & MIPS3_PG_G) ? 'G' : ' ', - (tlb.tlb_lo1 >> 3) & 7, tlb.tlb_mask); + (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo[1]), + (tlb.tlb_lo[1] & MIPS3_PG_D) ? 'D' : ' ', + (tlb.tlb_lo[1] & MIPS3_PG_G) ? 'G' : ' ', + (tlb.tlb_lo[1] >> 3) & 7, tlb.tlb_mask); } } EXPORT_SYMBOL_GPL(kvm_mips_dump_guest_tlbs); @@ -219,7 +220,7 @@ int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi) } kvm_debug("%s: entryhi: %#lx, index: %d lo0: %#lx, lo1: %#lx\n", - __func__, entryhi, index, tlb[i].tlb_lo0, tlb[i].tlb_lo1); + __func__, entryhi, index, tlb[i].tlb_lo[0], tlb[i].tlb_lo[1]); return index; } From 19d194c62b25cafaf64a5fe74305b3e9b84d63d8 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 9 Jun 2016 14:19:18 +0100 Subject: [PATCH 071/302] MIPS: KVM: Simplify TLB_* macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify some of the TLB_ macros making use of the arrayification of tlb_lo. Basically we index the array by the bit of the virtual address which determines whether the even or odd entry is used, instead of having a conditional. Signed-off-by: James Hogan Cc: Ralf Baechle Cc: Paolo Bonzini Cc: Radim Krčmář Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 2d15da111ba8..83a3212b956d 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -310,13 +310,11 @@ enum emulation_result { #define VPN2_MASK 0xffffe000 #define KVM_ENTRYHI_ASID MIPS_ENTRYHI_ASID -#define TLB_IS_GLOBAL(x) (((x).tlb_lo[0] & MIPS3_PG_G) && \ - ((x).tlb_lo[1] & MIPS3_PG_G)) +#define TLB_IS_GLOBAL(x) ((x).tlb_lo[0] & (x).tlb_lo[1] & MIPS3_PG_G) #define TLB_VPN2(x) ((x).tlb_hi & VPN2_MASK) #define TLB_ASID(x) ((x).tlb_hi & KVM_ENTRYHI_ASID) -#define TLB_IS_VALID(x, va) (((va) & (1 << PAGE_SHIFT)) \ - ? ((x).tlb_lo[1] & MIPS3_PG_V) \ - : ((x).tlb_lo[0] & MIPS3_PG_V)) +#define TLB_LO_IDX(x, va) (((va) >> PAGE_SHIFT) & 1) +#define TLB_IS_VALID(x, va) ((x).tlb_lo[TLB_LO_IDX(x, va)] & MIPS3_PG_V) #define TLB_HI_VPN2_HIT(x, y) ((TLB_VPN2(x) & ~(x).tlb_mask) == \ ((y) & VPN2_MASK & ~(x).tlb_mask)) #define TLB_HI_ASID_HIT(x, y) (TLB_IS_GLOBAL(x) || \ From e6207bbea16c60942cdc1492af4feed5aed77389 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 9 Jun 2016 14:19:19 +0100 Subject: [PATCH 072/302] MIPS: KVM: Use MIPS_ENTRYLO_* defs from mipsregs.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert KVM to use the MIPS_ENTRYLO_* definitions from rather than custom definitions in kvm_host.h Signed-off-by: James Hogan Cc: Ralf Baechle Cc: Paolo Bonzini Cc: Radim Krčmář Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 11 ++++------- arch/mips/kvm/mmu.c | 22 ++++++++++++---------- arch/mips/kvm/tlb.c | 23 ++++++++++++----------- 3 files changed, 28 insertions(+), 28 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 83a3212b956d..d0432b5f2343 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -19,6 +19,8 @@ #include #include +#include + /* MIPS KVM register ids */ #define MIPS_CP0_32(_R, _S) \ (KVM_REG_MIPS_CP0 | KVM_REG_SIZE_U32 | (8 * (_R) + (_S))) @@ -295,11 +297,6 @@ enum emulation_result { EMULATE_PRIV_FAIL, }; -#define MIPS3_PG_G 0x00000001 /* Global; ignore ASID if in lo0 & lo1 */ -#define MIPS3_PG_V 0x00000002 /* Valid */ -#define MIPS3_PG_NV 0x00000000 -#define MIPS3_PG_D 0x00000004 /* Dirty */ - #define mips3_paddr_to_tlbpfn(x) \ (((unsigned long)(x) >> MIPS3_PG_SHIFT) & MIPS3_PG_FRAME) #define mips3_tlbpfn_to_paddr(x) \ @@ -310,11 +307,11 @@ enum emulation_result { #define VPN2_MASK 0xffffe000 #define KVM_ENTRYHI_ASID MIPS_ENTRYHI_ASID -#define TLB_IS_GLOBAL(x) ((x).tlb_lo[0] & (x).tlb_lo[1] & MIPS3_PG_G) +#define TLB_IS_GLOBAL(x) ((x).tlb_lo[0] & (x).tlb_lo[1] & ENTRYLO_G) #define TLB_VPN2(x) ((x).tlb_hi & VPN2_MASK) #define TLB_ASID(x) ((x).tlb_hi & KVM_ENTRYHI_ASID) #define TLB_LO_IDX(x, va) (((va) >> PAGE_SHIFT) & 1) -#define TLB_IS_VALID(x, va) ((x).tlb_lo[TLB_LO_IDX(x, va)] & MIPS3_PG_V) +#define TLB_IS_VALID(x, va) ((x).tlb_lo[TLB_LO_IDX(x, va)] & ENTRYLO_V) #define TLB_HI_VPN2_HIT(x, y) ((TLB_VPN2(x) & ~(x).tlb_mask) == \ ((y) & VPN2_MASK & ~(x).tlb_mask)) #define TLB_HI_ASID_HIT(x, y) (TLB_IS_GLOBAL(x) || \ diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 14996aa5e7c4..ad3125fa9c61 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -115,10 +115,10 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, pfn0 = kvm->arch.guest_pmap[gfn & ~0x1]; pfn1 = kvm->arch.guest_pmap[gfn | 0x1]; - entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) | - (1 << 2) | (0x1 << 1); - entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) | - (1 << 2) | (0x1 << 1); + entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | + (0x3 << ENTRYLO_C_SHIFT) | ENTRYLO_D | ENTRYLO_V; + entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | + (0x3 << ENTRYLO_C_SHIFT) | ENTRYLO_D | ENTRYLO_V; preempt_disable(); entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu)); @@ -156,12 +156,14 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, } /* Get attributes from the Guest TLB */ - entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) | - (tlb->tlb_lo[0] & MIPS3_PG_D) | - (tlb->tlb_lo[0] & MIPS3_PG_V); - entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) | - (tlb->tlb_lo[1] & MIPS3_PG_D) | - (tlb->tlb_lo[1] & MIPS3_PG_V); + entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | + (0x3 << ENTRYLO_C_SHIFT) | + (tlb->tlb_lo[0] & ENTRYLO_D) | + (tlb->tlb_lo[0] & ENTRYLO_V); + entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | + (0x3 << ENTRYLO_C_SHIFT) | + (tlb->tlb_lo[1] & ENTRYLO_D) | + (tlb->tlb_lo[1] & ENTRYLO_V); kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc, tlb->tlb_lo[0], tlb->tlb_lo[1]); diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 4825d0dbb65e..8012e686d4ae 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -86,19 +86,20 @@ void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu) for (i = 0; i < KVM_MIPS_GUEST_TLB_SIZE; i++) { tlb = vcpu->arch.guest_tlb[i]; kvm_info("TLB%c%3d Hi 0x%08lx ", - (tlb.tlb_lo[0] | tlb.tlb_lo[1]) & MIPS3_PG_V + (tlb.tlb_lo[0] | tlb.tlb_lo[1]) & ENTRYLO_V ? ' ' : '*', i, tlb.tlb_hi); kvm_info("Lo0=0x%09llx %c%c attr %lx ", (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo[0]), - (tlb.tlb_lo[0] & MIPS3_PG_D) ? 'D' : ' ', - (tlb.tlb_lo[0] & MIPS3_PG_G) ? 'G' : ' ', - (tlb.tlb_lo[0] >> 3) & 7); + (tlb.tlb_lo[0] & ENTRYLO_D) ? 'D' : ' ', + (tlb.tlb_lo[0] & ENTRYLO_G) ? 'G' : ' ', + (tlb.tlb_lo[0] & ENTRYLO_C) >> ENTRYLO_C_SHIFT); kvm_info("Lo1=0x%09llx %c%c attr %lx sz=%lx\n", (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo[1]), - (tlb.tlb_lo[1] & MIPS3_PG_D) ? 'D' : ' ', - (tlb.tlb_lo[1] & MIPS3_PG_G) ? 'G' : ' ', - (tlb.tlb_lo[1] >> 3) & 7, tlb.tlb_mask); + (tlb.tlb_lo[1] & ENTRYLO_D) ? 'D' : ' ', + (tlb.tlb_lo[1] & ENTRYLO_G) ? 'G' : ' ', + (tlb.tlb_lo[1] & ENTRYLO_C) >> ENTRYLO_C_SHIFT, + tlb.tlb_mask); } } EXPORT_SYMBOL_GPL(kvm_mips_dump_guest_tlbs); @@ -146,12 +147,12 @@ int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi, /* Flush D-cache */ if (flush_dcache_mask) { - if (entrylo0 & MIPS3_PG_V) { + if (entrylo0 & ENTRYLO_V) { ++vcpu->stat.flush_dcache_exits; flush_data_cache_page((entryhi & VPN2_MASK) & ~flush_dcache_mask); } - if (entrylo1 & MIPS3_PG_V) { + if (entrylo1 & ENTRYLO_V) { ++vcpu->stat.flush_dcache_exits; flush_data_cache_page(((entryhi & VPN2_MASK) & ~flush_dcache_mask) | @@ -176,8 +177,8 @@ int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, pfn0 = CPHYSADDR(vcpu->arch.kseg0_commpage) >> PAGE_SHIFT; pfn1 = 0; - entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) | - (1 << 2) | (0x1 << 1); + entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | + (0x3 << ENTRYLO_C_SHIFT) | ENTRYLO_D | ENTRYLO_V; entrylo1 = 0; local_irq_save(flags); From 3b08aec549a0314b8c3788bdc2a21096d53225e1 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 9 Jun 2016 14:19:20 +0100 Subject: [PATCH 073/302] MIPS: KVM: Combine handle_tlb_ld/st_miss MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The handle_tlb_ld/st_miss handlers are logically equivalent and textually almost identical, so combine their implementations into a single kvm_trap_emul_handle_tlb_miss(). Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/trap_emul.c | 71 +++++++++++---------------------------- 1 file changed, 19 insertions(+), 52 deletions(-) diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index ecf0068bc95e..09b97fa9dabb 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -128,7 +128,7 @@ static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu) return ret; } -static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu) +static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store) { struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; @@ -145,55 +145,8 @@ static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu) } } else if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0 || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) { - kvm_debug("USER ADDR TLB LD fault: cause %#x, PC: %p, BadVaddr: %#lx\n", - cause, opc, badvaddr); - er = kvm_mips_handle_tlbmiss(cause, opc, run, vcpu); - if (er == EMULATE_DONE) - ret = RESUME_GUEST; - else { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; - } - } else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) { - /* - * All KSEG0 faults are handled by KVM, as the guest kernel does - * not expect to ever get them - */ - if (kvm_mips_handle_kseg0_tlb_fault - (vcpu->arch.host_cp0_badvaddr, vcpu) < 0) { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; - } - } else { - kvm_err("Illegal TLB LD fault address , cause %#x, PC: %p, BadVaddr: %#lx\n", - cause, opc, badvaddr); - kvm_mips_dump_host_tlbs(); - kvm_arch_vcpu_dump_regs(vcpu); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; - } - return ret; -} - -static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu) -{ - struct kvm_run *run = vcpu->run; - u32 __user *opc = (u32 __user *) vcpu->arch.pc; - unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; - u32 cause = vcpu->arch.host_cp0_cause; - enum emulation_result er = EMULATE_DONE; - int ret = RESUME_GUEST; - - if (((badvaddr & PAGE_MASK) == KVM_GUEST_COMMPAGE_ADDR) - && KVM_GUEST_KERNEL_MODE(vcpu)) { - if (kvm_mips_handle_commpage_tlb_fault(badvaddr, vcpu) < 0) { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; - } - } else if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0 - || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) { - kvm_debug("USER ADDR TLB ST fault: PC: %#lx, BadVaddr: %#lx\n", - vcpu->arch.pc, badvaddr); + kvm_debug("USER ADDR TLB %s fault: cause %#x, PC: %p, BadVaddr: %#lx\n", + store ? "ST" : "LD", cause, opc, badvaddr); /* * User Address (UA) fault, this could happen if @@ -213,14 +166,18 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu) ret = RESUME_HOST; } } else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) { + /* + * All KSEG0 faults are handled by KVM, as the guest kernel does + * not expect to ever get them + */ if (kvm_mips_handle_kseg0_tlb_fault (vcpu->arch.host_cp0_badvaddr, vcpu) < 0) { run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; } } else { - kvm_err("Illegal TLB ST fault address , cause %#x, PC: %p, BadVaddr: %#lx\n", - cause, opc, badvaddr); + kvm_err("Illegal TLB %s fault address , cause %#x, PC: %p, BadVaddr: %#lx\n", + store ? "ST" : "LD", cause, opc, badvaddr); kvm_mips_dump_host_tlbs(); kvm_arch_vcpu_dump_regs(vcpu); run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -229,6 +186,16 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu) return ret; } +static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu) +{ + return kvm_trap_emul_handle_tlb_miss(vcpu, true); +} + +static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu) +{ + return kvm_trap_emul_handle_tlb_miss(vcpu, false); +} + static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; From 35fec26242bd3ff5a770789185852d27b44ffaec Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 9 Jun 2016 14:19:21 +0100 Subject: [PATCH 074/302] MIPS: KVM: Use va in kvm_get_inst() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Like other functions, make use of a local unsigned long va, for the virtual address of the PC. This reduces the amount of verbose casting of the opc pointer to an unsigned long. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mmu.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index ad3125fa9c61..208f70409ccb 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -327,17 +327,18 @@ u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; unsigned long paddr, flags, vpn2, asid; + unsigned long va = (unsigned long)opc; u32 inst; int index; - if (KVM_GUEST_KSEGX((unsigned long) opc) < KVM_GUEST_KSEG0 || - KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) { + if (KVM_GUEST_KSEGX(va) < KVM_GUEST_KSEG0 || + KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG23) { local_irq_save(flags); - index = kvm_mips_host_tlb_lookup(vcpu, (unsigned long) opc); + index = kvm_mips_host_tlb_lookup(vcpu, va); if (index >= 0) { inst = *(opc); } else { - vpn2 = (unsigned long) opc & VPN2_MASK; + vpn2 = va & VPN2_MASK; asid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID; index = kvm_mips_guest_tlb_lookup(vcpu, vpn2 | asid); @@ -354,10 +355,8 @@ u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu) inst = *(opc); } local_irq_restore(flags); - } else if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) { - paddr = - kvm_mips_translate_guest_kseg0_to_hpa(vcpu, - (unsigned long) opc); + } else if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) { + paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu, va); inst = *(u32 *) CKSEG0ADDR(paddr); } else { kvm_err("%s: illegal address: %p\n", __func__, opc); From f943176a7205a064da05f81fc94dccc4c7379010 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Jun 2016 09:40:10 +0100 Subject: [PATCH 075/302] MIPS: KVM: Generalise fpu_inuse for other state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename fpu_inuse and the related definitions to aux_inuse so it can be used for lazy context management of other auxiliary processor state too, such as VZ guest timer, watchpoints and performance counters. Signed-off-by: James Hogan Cc: Ralf Baechle Cc: Paolo Bonzini Cc: Radim Krčmář Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 8 +++---- arch/mips/kvm/emulate.c | 8 +++---- arch/mips/kvm/mips.c | 38 ++++++++++++++++---------------- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index d0432b5f2343..e6273850bab6 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -323,8 +323,8 @@ struct kvm_mips_tlb { long tlb_lo[2]; }; -#define KVM_MIPS_FPU_FPU 0x1 -#define KVM_MIPS_FPU_MSA 0x2 +#define KVM_MIPS_AUX_FPU 0x1 +#define KVM_MIPS_AUX_MSA 0x2 #define KVM_MIPS_GUEST_TLB_SIZE 64 struct kvm_vcpu_arch { @@ -346,8 +346,8 @@ struct kvm_vcpu_arch { /* FPU State */ struct mips_fpu_struct fpu; - /* Which FPU state is loaded (KVM_MIPS_FPU_*) */ - unsigned int fpu_inuse; + /* Which auxiliary state is loaded (KVM_MIPS_AUX_*) */ + unsigned int aux_inuse; /* COP0 State */ struct mips_coproc *cop0; diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 5b89c0803405..8647bd97b934 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1154,7 +1154,7 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause, * it first. */ if (change & ST0_CU1 && !(val & ST0_FR) && - vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA) + vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) kvm_lose_fpu(vcpu); /* @@ -1165,7 +1165,7 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause, * the near future. */ if (change & ST0_CU1 && - vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) + vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) change_c0_status(ST0_CU1, val); preempt_enable(); @@ -1200,7 +1200,7 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause, * context is already loaded. */ if (change & MIPS_CONF5_FRE && - vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) + vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) change_c0_config5(MIPS_CONF5_FRE, val); /* @@ -1210,7 +1210,7 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause, * quickly enabled again in the near future. */ if (change & MIPS_CONF5_MSAEN && - vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA) + vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) change_c0_config5(MIPS_CONF5_MSAEN, val); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 6e753761b5d6..9093262ff3ce 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1447,7 +1447,7 @@ void kvm_own_fpu(struct kvm_vcpu *vcpu) * not to clobber the status register directly via the commpage. */ if (cpu_has_msa && sr & ST0_CU1 && !(sr & ST0_FR) && - vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA) + vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) kvm_lose_fpu(vcpu); /* @@ -1462,9 +1462,9 @@ void kvm_own_fpu(struct kvm_vcpu *vcpu) enable_fpu_hazard(); /* If guest FPU state not active, restore it now */ - if (!(vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU)) { + if (!(vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)) { __kvm_restore_fpu(&vcpu->arch); - vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_FPU; + vcpu->arch.aux_inuse |= KVM_MIPS_AUX_FPU; } preempt_enable(); @@ -1491,8 +1491,8 @@ void kvm_own_msa(struct kvm_vcpu *vcpu) * interacts with MSA state, so play it safe and save it first. */ if (!(sr & ST0_FR) && - (vcpu->arch.fpu_inuse & (KVM_MIPS_FPU_FPU | - KVM_MIPS_FPU_MSA)) == KVM_MIPS_FPU_FPU) + (vcpu->arch.aux_inuse & (KVM_MIPS_AUX_FPU | + KVM_MIPS_AUX_MSA)) == KVM_MIPS_AUX_FPU) kvm_lose_fpu(vcpu); change_c0_status(ST0_CU1 | ST0_FR, sr); @@ -1506,20 +1506,20 @@ void kvm_own_msa(struct kvm_vcpu *vcpu) set_c0_config5(MIPS_CONF5_MSAEN); enable_fpu_hazard(); - switch (vcpu->arch.fpu_inuse & (KVM_MIPS_FPU_FPU | KVM_MIPS_FPU_MSA)) { - case KVM_MIPS_FPU_FPU: + switch (vcpu->arch.aux_inuse & (KVM_MIPS_AUX_FPU | KVM_MIPS_AUX_MSA)) { + case KVM_MIPS_AUX_FPU: /* * Guest FPU state already loaded, only restore upper MSA state */ __kvm_restore_msa_upper(&vcpu->arch); - vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_MSA; + vcpu->arch.aux_inuse |= KVM_MIPS_AUX_MSA; break; case 0: /* Neither FPU or MSA already active, restore full MSA state */ __kvm_restore_msa(&vcpu->arch); - vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_MSA; + vcpu->arch.aux_inuse |= KVM_MIPS_AUX_MSA; if (kvm_mips_guest_has_fpu(&vcpu->arch)) - vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_FPU; + vcpu->arch.aux_inuse |= KVM_MIPS_AUX_FPU; break; default: break; @@ -1533,13 +1533,13 @@ void kvm_own_msa(struct kvm_vcpu *vcpu) void kvm_drop_fpu(struct kvm_vcpu *vcpu) { preempt_disable(); - if (cpu_has_msa && vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA) { + if (cpu_has_msa && vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) { disable_msa(); - vcpu->arch.fpu_inuse &= ~KVM_MIPS_FPU_MSA; + vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_MSA; } - if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) { + if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) { clear_c0_status(ST0_CU1 | ST0_FR); - vcpu->arch.fpu_inuse &= ~KVM_MIPS_FPU_FPU; + vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_FPU; } preempt_enable(); } @@ -1555,7 +1555,7 @@ void kvm_lose_fpu(struct kvm_vcpu *vcpu) */ preempt_disable(); - if (cpu_has_msa && vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA) { + if (cpu_has_msa && vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) { set_c0_config5(MIPS_CONF5_MSAEN); enable_fpu_hazard(); @@ -1563,17 +1563,17 @@ void kvm_lose_fpu(struct kvm_vcpu *vcpu) /* Disable MSA & FPU */ disable_msa(); - if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) { + if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) { clear_c0_status(ST0_CU1 | ST0_FR); disable_fpu_hazard(); } - vcpu->arch.fpu_inuse &= ~(KVM_MIPS_FPU_FPU | KVM_MIPS_FPU_MSA); - } else if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) { + vcpu->arch.aux_inuse &= ~(KVM_MIPS_AUX_FPU | KVM_MIPS_AUX_MSA); + } else if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) { set_c0_status(ST0_CU1); enable_fpu_hazard(); __kvm_save_fpu(&vcpu->arch); - vcpu->arch.fpu_inuse &= ~KVM_MIPS_FPU_FPU; + vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_FPU; /* Disable FPU */ clear_c0_status(ST0_CU1 | ST0_FR); From 04ebebf45a6ec61a4405040ea47c4320be5ed229 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Jun 2016 09:40:11 +0100 Subject: [PATCH 076/302] MIPS: KVM: Add kvm_aux trace event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a MIPS specific trace event for auxiliary context operations (notably FPU and MSA). Unfortunately the generic kvm_fpu trace event isn't flexible enough to handle the range of interesting things that can happen with FPU and MSA context. The type of state being operated on is traced: - FPU: Just the FPU registers. - MSA: Just the upper half of the MSA vector registers (low half already loaded with FPU state). - FPU & MSA: Full MSA vector state (includes FPU state). As is the type of operation: - Restore: State was enabled and restored. - Save: State was saved and disabled. - Enable: State was enabled (already loaded). - Disable: State was disabled (kept loaded). - Discard: State was discarded and disabled. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: Steven Rostedt Cc: Ingo Molnar Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org [Fix remaining occurrence of "fpu_msa", change to "aux". - Paolo] Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 11 +++++++++++ arch/mips/kvm/trace.h | 46 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 9093262ff3ce..c0e8f8640f2b 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1465,6 +1465,9 @@ void kvm_own_fpu(struct kvm_vcpu *vcpu) if (!(vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)) { __kvm_restore_fpu(&vcpu->arch); vcpu->arch.aux_inuse |= KVM_MIPS_AUX_FPU; + trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE, KVM_TRACE_AUX_FPU); + } else { + trace_kvm_aux(vcpu, KVM_TRACE_AUX_ENABLE, KVM_TRACE_AUX_FPU); } preempt_enable(); @@ -1513,6 +1516,7 @@ void kvm_own_msa(struct kvm_vcpu *vcpu) */ __kvm_restore_msa_upper(&vcpu->arch); vcpu->arch.aux_inuse |= KVM_MIPS_AUX_MSA; + trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE, KVM_TRACE_AUX_MSA); break; case 0: /* Neither FPU or MSA already active, restore full MSA state */ @@ -1520,8 +1524,11 @@ void kvm_own_msa(struct kvm_vcpu *vcpu) vcpu->arch.aux_inuse |= KVM_MIPS_AUX_MSA; if (kvm_mips_guest_has_fpu(&vcpu->arch)) vcpu->arch.aux_inuse |= KVM_MIPS_AUX_FPU; + trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE, + KVM_TRACE_AUX_FPU_MSA); break; default: + trace_kvm_aux(vcpu, KVM_TRACE_AUX_ENABLE, KVM_TRACE_AUX_MSA); break; } @@ -1535,10 +1542,12 @@ void kvm_drop_fpu(struct kvm_vcpu *vcpu) preempt_disable(); if (cpu_has_msa && vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) { disable_msa(); + trace_kvm_aux(vcpu, KVM_TRACE_AUX_DISCARD, KVM_TRACE_AUX_MSA); vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_MSA; } if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) { clear_c0_status(ST0_CU1 | ST0_FR); + trace_kvm_aux(vcpu, KVM_TRACE_AUX_DISCARD, KVM_TRACE_AUX_FPU); vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_FPU; } preempt_enable(); @@ -1560,6 +1569,7 @@ void kvm_lose_fpu(struct kvm_vcpu *vcpu) enable_fpu_hazard(); __kvm_save_msa(&vcpu->arch); + trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_FPU_MSA); /* Disable MSA & FPU */ disable_msa(); @@ -1574,6 +1584,7 @@ void kvm_lose_fpu(struct kvm_vcpu *vcpu) __kvm_save_fpu(&vcpu->arch); vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_FPU; + trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_FPU); /* Disable FPU */ clear_c0_status(ST0_CU1 | ST0_FR); diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h index bd6437f67dc0..f3ada591ca25 100644 --- a/arch/mips/kvm/trace.h +++ b/arch/mips/kvm/trace.h @@ -38,6 +38,52 @@ TRACE_EVENT(kvm_exit, __entry->pc) ); +#define KVM_TRACE_AUX_RESTORE 0 +#define KVM_TRACE_AUX_SAVE 1 +#define KVM_TRACE_AUX_ENABLE 2 +#define KVM_TRACE_AUX_DISABLE 3 +#define KVM_TRACE_AUX_DISCARD 4 + +#define KVM_TRACE_AUX_FPU 1 +#define KVM_TRACE_AUX_MSA 2 +#define KVM_TRACE_AUX_FPU_MSA 3 + +#define kvm_trace_symbol_aux_op \ + { KVM_TRACE_AUX_RESTORE, "restore" }, \ + { KVM_TRACE_AUX_SAVE, "save" }, \ + { KVM_TRACE_AUX_ENABLE, "enable" }, \ + { KVM_TRACE_AUX_DISABLE, "disable" }, \ + { KVM_TRACE_AUX_DISCARD, "discard" } + +#define kvm_trace_symbol_aux_state \ + { KVM_TRACE_AUX_FPU, "FPU" }, \ + { KVM_TRACE_AUX_MSA, "MSA" }, \ + { KVM_TRACE_AUX_FPU_MSA, "FPU & MSA" } + +TRACE_EVENT(kvm_aux, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int op, + unsigned int state), + TP_ARGS(vcpu, op, state), + TP_STRUCT__entry( + __field(unsigned long, pc) + __field(u8, op) + __field(u8, state) + ), + + TP_fast_assign( + __entry->pc = vcpu->arch.pc; + __entry->op = op; + __entry->state = state; + ), + + TP_printk("%s %s PC: 0x%08lx", + __print_symbolic(__entry->op, + kvm_trace_symbol_aux_op), + __print_symbolic(__entry->state, + kvm_trace_symbol_aux_state), + __entry->pc) +); + #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ From 1e09e86ac13747903501004082bf1c5b7c6262b2 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Jun 2016 09:40:12 +0100 Subject: [PATCH 077/302] MIPS: KVM: Clean up kvm_exit trace event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clean up the MIPS kvm_exit trace event so that the exit reasons are specified in a trace friendly way (via __print_symbolic), and so that the exit reasons that derive straight from Cause.ExcCode values map directly, allowing a single trace_kvm_exit() call to replace a bunch of individual ones. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: Steven Rostedt Cc: Ingo Molnar Cc: kvm@vger.kernel.org Cc: linux-mips@linux-mips.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 22 ---------------- arch/mips/kvm/emulate.c | 4 +-- arch/mips/kvm/mips.c | 17 ++---------- arch/mips/kvm/stats.c | 21 --------------- arch/mips/kvm/trace.h | 44 +++++++++++++++++++++++++++++--- 5 files changed, 45 insertions(+), 63 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index e6273850bab6..b8cb74270746 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -125,28 +125,6 @@ struct kvm_vcpu_stat { u32 halt_wakeup; }; -enum kvm_mips_exit_types { - WAIT_EXITS, - CACHE_EXITS, - SIGNAL_EXITS, - INT_EXITS, - COP_UNUSABLE_EXITS, - TLBMOD_EXITS, - TLBMISS_LD_EXITS, - TLBMISS_ST_EXITS, - ADDRERR_ST_EXITS, - ADDRERR_LD_EXITS, - SYSCALL_EXITS, - RESVD_INST_EXITS, - BREAK_INST_EXITS, - TRAP_INST_EXITS, - MSA_FPE_EXITS, - FPE_EXITS, - MSA_DISABLED_EXITS, - FLUSH_DCACHE_EXITS, - MAX_KVM_MIPS_EXIT_TYPES -}; - struct kvm_arch_memory_slot { }; diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 8647bd97b934..fce08bda9ebc 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -775,7 +775,7 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu) vcpu->arch.pending_exceptions); ++vcpu->stat.wait_exits; - trace_kvm_exit(vcpu, WAIT_EXITS); + trace_kvm_exit(vcpu, KVM_TRACE_EXIT_WAIT); if (!vcpu->arch.pending_exceptions) { vcpu->arch.wait = 1; kvm_vcpu_block(vcpu); @@ -1718,7 +1718,7 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, case cache_op: ++vcpu->stat.cache_exits; - trace_kvm_exit(vcpu, CACHE_EXITS); + trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE); er = kvm_mips_emulate_cache(inst, opc, cause, run, vcpu); break; diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index c0e8f8640f2b..e9e40b9dd9be 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1257,6 +1257,7 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) kvm_debug("kvm_mips_handle_exit: cause: %#x, PC: %p, kvm_run: %p, kvm_vcpu: %p\n", cause, opc, run, vcpu); + trace_kvm_exit(vcpu, exccode); /* * Do a privilege check, if in UM most of these exit conditions end up @@ -1276,7 +1277,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) kvm_debug("[%d]EXCCODE_INT @ %p\n", vcpu->vcpu_id, opc); ++vcpu->stat.int_exits; - trace_kvm_exit(vcpu, INT_EXITS); if (need_resched()) cond_resched(); @@ -1288,7 +1288,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) kvm_debug("EXCCODE_CPU: @ PC: %p\n", opc); ++vcpu->stat.cop_unusable_exits; - trace_kvm_exit(vcpu, COP_UNUSABLE_EXITS); ret = kvm_mips_callbacks->handle_cop_unusable(vcpu); /* XXXKYMA: Might need to return to user space */ if (run->exit_reason == KVM_EXIT_IRQ_WINDOW_OPEN) @@ -1297,7 +1296,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) case EXCCODE_MOD: ++vcpu->stat.tlbmod_exits; - trace_kvm_exit(vcpu, TLBMOD_EXITS); ret = kvm_mips_callbacks->handle_tlb_mod(vcpu); break; @@ -1307,7 +1305,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) badvaddr); ++vcpu->stat.tlbmiss_st_exits; - trace_kvm_exit(vcpu, TLBMISS_ST_EXITS); ret = kvm_mips_callbacks->handle_tlb_st_miss(vcpu); break; @@ -1316,61 +1313,51 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) cause, opc, badvaddr); ++vcpu->stat.tlbmiss_ld_exits; - trace_kvm_exit(vcpu, TLBMISS_LD_EXITS); ret = kvm_mips_callbacks->handle_tlb_ld_miss(vcpu); break; case EXCCODE_ADES: ++vcpu->stat.addrerr_st_exits; - trace_kvm_exit(vcpu, ADDRERR_ST_EXITS); ret = kvm_mips_callbacks->handle_addr_err_st(vcpu); break; case EXCCODE_ADEL: ++vcpu->stat.addrerr_ld_exits; - trace_kvm_exit(vcpu, ADDRERR_LD_EXITS); ret = kvm_mips_callbacks->handle_addr_err_ld(vcpu); break; case EXCCODE_SYS: ++vcpu->stat.syscall_exits; - trace_kvm_exit(vcpu, SYSCALL_EXITS); ret = kvm_mips_callbacks->handle_syscall(vcpu); break; case EXCCODE_RI: ++vcpu->stat.resvd_inst_exits; - trace_kvm_exit(vcpu, RESVD_INST_EXITS); ret = kvm_mips_callbacks->handle_res_inst(vcpu); break; case EXCCODE_BP: ++vcpu->stat.break_inst_exits; - trace_kvm_exit(vcpu, BREAK_INST_EXITS); ret = kvm_mips_callbacks->handle_break(vcpu); break; case EXCCODE_TR: ++vcpu->stat.trap_inst_exits; - trace_kvm_exit(vcpu, TRAP_INST_EXITS); ret = kvm_mips_callbacks->handle_trap(vcpu); break; case EXCCODE_MSAFPE: ++vcpu->stat.msa_fpe_exits; - trace_kvm_exit(vcpu, MSA_FPE_EXITS); ret = kvm_mips_callbacks->handle_msa_fpe(vcpu); break; case EXCCODE_FPE: ++vcpu->stat.fpe_exits; - trace_kvm_exit(vcpu, FPE_EXITS); ret = kvm_mips_callbacks->handle_fpe(vcpu); break; case EXCCODE_MSADIS: ++vcpu->stat.msa_disabled_exits; - trace_kvm_exit(vcpu, MSA_DISABLED_EXITS); ret = kvm_mips_callbacks->handle_msa_disabled(vcpu); break; @@ -1397,7 +1384,7 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) run->exit_reason = KVM_EXIT_INTR; ret = (-EINTR << 2) | RESUME_HOST; ++vcpu->stat.signal_exits; - trace_kvm_exit(vcpu, SIGNAL_EXITS); + trace_kvm_exit(vcpu, KVM_TRACE_EXIT_SIGNAL); } } diff --git a/arch/mips/kvm/stats.c b/arch/mips/kvm/stats.c index 888bb67070ac..53f851a61554 100644 --- a/arch/mips/kvm/stats.c +++ b/arch/mips/kvm/stats.c @@ -11,27 +11,6 @@ #include -char *kvm_mips_exit_types_str[MAX_KVM_MIPS_EXIT_TYPES] = { - "WAIT", - "CACHE", - "Signal", - "Interrupt", - "COP0/1 Unusable", - "TLB Mod", - "TLB Miss (LD)", - "TLB Miss (ST)", - "Address Err (ST)", - "Address Error (LD)", - "System Call", - "Reserved Inst", - "Break Inst", - "Trap Inst", - "MSA FPE", - "FPE", - "MSA Disabled", - "D-Cache Flushes", -}; - char *kvm_cop0_str[N_MIPS_COPROC_REGS] = { "Index", "Random", diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h index f3ada591ca25..ffa6ee8f2025 100644 --- a/arch/mips/kvm/trace.h +++ b/arch/mips/kvm/trace.h @@ -17,8 +17,45 @@ #define TRACE_INCLUDE_PATH . #define TRACE_INCLUDE_FILE trace -/* Tracepoints for VM eists */ -extern char *kvm_mips_exit_types_str[MAX_KVM_MIPS_EXIT_TYPES]; +/* The first 32 exit reasons correspond to Cause.ExcCode */ +#define KVM_TRACE_EXIT_INT 0 +#define KVM_TRACE_EXIT_TLBMOD 1 +#define KVM_TRACE_EXIT_TLBMISS_LD 2 +#define KVM_TRACE_EXIT_TLBMISS_ST 3 +#define KVM_TRACE_EXIT_ADDRERR_LD 4 +#define KVM_TRACE_EXIT_ADDRERR_ST 5 +#define KVM_TRACE_EXIT_SYSCALL 8 +#define KVM_TRACE_EXIT_BREAK_INST 9 +#define KVM_TRACE_EXIT_RESVD_INST 10 +#define KVM_TRACE_EXIT_COP_UNUSABLE 11 +#define KVM_TRACE_EXIT_TRAP_INST 13 +#define KVM_TRACE_EXIT_MSA_FPE 14 +#define KVM_TRACE_EXIT_FPE 15 +#define KVM_TRACE_EXIT_MSA_DISABLED 21 +/* Further exit reasons */ +#define KVM_TRACE_EXIT_WAIT 32 +#define KVM_TRACE_EXIT_CACHE 33 +#define KVM_TRACE_EXIT_SIGNAL 34 + +/* Tracepoints for VM exits */ +#define kvm_trace_symbol_exit_types \ + { KVM_TRACE_EXIT_INT, "Interrupt" }, \ + { KVM_TRACE_EXIT_TLBMOD, "TLB Mod" }, \ + { KVM_TRACE_EXIT_TLBMISS_LD, "TLB Miss (LD)" }, \ + { KVM_TRACE_EXIT_TLBMISS_ST, "TLB Miss (ST)" }, \ + { KVM_TRACE_EXIT_ADDRERR_LD, "Address Error (LD)" }, \ + { KVM_TRACE_EXIT_ADDRERR_ST, "Address Err (ST)" }, \ + { KVM_TRACE_EXIT_SYSCALL, "System Call" }, \ + { KVM_TRACE_EXIT_BREAK_INST, "Break Inst" }, \ + { KVM_TRACE_EXIT_RESVD_INST, "Reserved Inst" }, \ + { KVM_TRACE_EXIT_COP_UNUSABLE, "COP0/1 Unusable" }, \ + { KVM_TRACE_EXIT_TRAP_INST, "Trap Inst" }, \ + { KVM_TRACE_EXIT_MSA_FPE, "MSA FPE" }, \ + { KVM_TRACE_EXIT_FPE, "FPE" }, \ + { KVM_TRACE_EXIT_MSA_DISABLED, "MSA Disabled" }, \ + { KVM_TRACE_EXIT_WAIT, "WAIT" }, \ + { KVM_TRACE_EXIT_CACHE, "CACHE" }, \ + { KVM_TRACE_EXIT_SIGNAL, "Signal" } TRACE_EVENT(kvm_exit, TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason), @@ -34,7 +71,8 @@ TRACE_EVENT(kvm_exit, ), TP_printk("[%s]PC: 0x%08lx", - kvm_mips_exit_types_str[__entry->reason], + __print_symbolic(__entry->reason, + kvm_trace_symbol_exit_types), __entry->pc) ); From 9887d1c75ba2f210b403d536ab025d4b2b36fb57 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Jun 2016 09:40:13 +0100 Subject: [PATCH 078/302] MIPS: KVM: Add kvm_asid_change trace event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a trace event for guest ASID changes, replacing the existing kvm_debug call. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: Steven Rostedt Cc: Ingo Molnar Cc: kvm@vger.kernel.org Cc: linux-mips@linux-mips.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/emulate.c | 7 +++---- arch/mips/kvm/trace.h | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index fce08bda9ebc..ee0e61d2b6fb 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1082,11 +1082,10 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause, if ((KSEGX(vcpu->arch.gprs[rt]) != CKSEG0) && ((kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID) != nasid)) { - kvm_debug("MTCz, change ASID from %#lx to %#lx\n", + trace_kvm_asid_change(vcpu, kvm_read_c0_guest_entryhi(cop0) - & KVM_ENTRYHI_ASID, - vcpu->arch.gprs[rt] - & KVM_ENTRYHI_ASID); + & KVM_ENTRYHI_ASID, + nasid); /* Blow away the shadow host TLBs */ kvm_mips_flush_host_tlb(1); diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h index ffa6ee8f2025..7daf7474d6a6 100644 --- a/arch/mips/kvm/trace.h +++ b/arch/mips/kvm/trace.h @@ -122,6 +122,28 @@ TRACE_EVENT(kvm_aux, __entry->pc) ); +TRACE_EVENT(kvm_asid_change, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int old_asid, + unsigned int new_asid), + TP_ARGS(vcpu, old_asid, new_asid), + TP_STRUCT__entry( + __field(unsigned long, pc) + __field(u8, old_asid) + __field(u8, new_asid) + ), + + TP_fast_assign( + __entry->pc = vcpu->arch.pc; + __entry->old_asid = old_asid; + __entry->new_asid = new_asid; + ), + + TP_printk("PC: 0x%08lx old: 0x%02x new: 0x%02x", + __entry->pc, + __entry->old_asid, + __entry->new_asid) +); + #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ From 93258604ab6d3f2bdc6cb02f61961af56712f144 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Jun 2016 09:40:14 +0100 Subject: [PATCH 079/302] MIPS: KVM: Add guest mode switch trace events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a few trace events for entering and coming out of guest mode, as well as re-entering it from a guest exit exception. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: Steven Rostedt Cc: Ingo Molnar Cc: kvm@vger.kernel.org Cc: linux-mips@linux-mips.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 4 ++++ arch/mips/kvm/trace.h | 48 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index e9e40b9dd9be..b5ad2ba1847a 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -410,7 +410,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) /* Disable hardware page table walking while in guest */ htw_stop(); + trace_kvm_enter(vcpu); r = vcpu->arch.vcpu_run(run, vcpu); + trace_kvm_out(vcpu); /* Re-enable HTW before enabling interrupts */ htw_start(); @@ -1389,6 +1391,8 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) } if (ret == RESUME_GUEST) { + trace_kvm_reenter(vcpu); + /* * If FPU / MSA are enabled (i.e. the guest's FPU / MSA context * is live), restore FCR31 / MSACSR. diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h index 7daf7474d6a6..aec1c43f2b44 100644 --- a/arch/mips/kvm/trace.h +++ b/arch/mips/kvm/trace.h @@ -17,6 +17,54 @@ #define TRACE_INCLUDE_PATH . #define TRACE_INCLUDE_FILE trace +/* + * Tracepoints for VM enters + */ +TRACE_EVENT(kvm_enter, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu), + TP_STRUCT__entry( + __field(unsigned long, pc) + ), + + TP_fast_assign( + __entry->pc = vcpu->arch.pc; + ), + + TP_printk("PC: 0x%08lx", + __entry->pc) +); + +TRACE_EVENT(kvm_reenter, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu), + TP_STRUCT__entry( + __field(unsigned long, pc) + ), + + TP_fast_assign( + __entry->pc = vcpu->arch.pc; + ), + + TP_printk("PC: 0x%08lx", + __entry->pc) +); + +TRACE_EVENT(kvm_out, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu), + TP_STRUCT__entry( + __field(unsigned long, pc) + ), + + TP_fast_assign( + __entry->pc = vcpu->arch.pc; + ), + + TP_printk("PC: 0x%08lx", + __entry->pc) +); + /* The first 32 exit reasons correspond to Cause.ExcCode */ #define KVM_TRACE_EXIT_INT 0 #define KVM_TRACE_EXIT_TLBMOD 1 From 6398da1391ba9285aeb4fa3f4470f008bf730220 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Jun 2016 09:40:15 +0100 Subject: [PATCH 080/302] MIPS: KVM: Trace guest register access emulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trace emulation of guest access to various registers via MFC0/MTC0/DMFC0/DMTC0 instructions (coprocessor 0) and the RDHWR instruction (hardware registers exposed to userland), replacing some existing kvm_debug calls. Trace events are much more practical for this kind of debug output. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: Steven Rostedt Cc: Ingo Molnar Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/emulate.c | 31 +++++++++------ arch/mips/kvm/trace.h | 88 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 12 deletions(-) diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index ee0e61d2b6fb..2004e35288d0 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -979,7 +979,6 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause, struct mips_coproc *cop0 = vcpu->arch.cop0; enum emulation_result er = EMULATE_DONE; u32 rt, rd, copz, sel, co_bit, op; - unsigned long pc = vcpu->arch.pc; unsigned long curr_pc; /* @@ -1046,20 +1045,27 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause, #endif } - kvm_debug - ("[%#lx] MFCz[%d][%d], vcpu->arch.gprs[%d]: %#lx\n", - pc, rd, sel, rt, vcpu->arch.gprs[rt]); - + trace_kvm_hwr(vcpu, KVM_TRACE_MFC0, + KVM_TRACE_COP0(rd, sel), + vcpu->arch.gprs[rt]); break; case dmfc_op: vcpu->arch.gprs[rt] = cop0->reg[rd][sel]; + + trace_kvm_hwr(vcpu, KVM_TRACE_DMFC0, + KVM_TRACE_COP0(rd, sel), + vcpu->arch.gprs[rt]); break; case mtc_op: #ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS cop0->stat[rd][sel]++; #endif + trace_kvm_hwr(vcpu, KVM_TRACE_MTC0, + KVM_TRACE_COP0(rd, sel), + vcpu->arch.gprs[rt]); + if ((rd == MIPS_CP0_TLB_INDEX) && (vcpu->arch.gprs[rt] >= KVM_MIPS_GUEST_TLB_SIZE)) { @@ -1098,10 +1104,6 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause, kvm_mips_write_count(vcpu, vcpu->arch.gprs[rt]); goto done; } else if ((rd == MIPS_CP0_COMPARE) && (sel == 0)) { - kvm_debug("[%#lx] MTCz, COMPARE %#lx <- %#lx\n", - pc, kvm_read_c0_guest_compare(cop0), - vcpu->arch.gprs[rt]); - /* If we are writing to COMPARE */ /* Clear pending timer interrupt, if any */ kvm_mips_write_compare(vcpu, @@ -1237,14 +1239,14 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause, kvm_mips_trans_mtc0(inst, opc, vcpu); #endif } - - kvm_debug("[%#lx] MTCz, cop0->reg[%d][%d]: %#lx\n", pc, - rd, sel, cop0->reg[rd][sel]); break; case dmtc_op: kvm_err("!!!!!!![%#lx]dmtc_op: rt: %d, rd: %d, sel: %d!!!!!!\n", vcpu->arch.pc, rt, rd, sel); + trace_kvm_hwr(vcpu, KVM_TRACE_DMTC0, + KVM_TRACE_COP0(rd, sel), + vcpu->arch.gprs[rt]); er = EMULATE_FAIL; break; @@ -2307,6 +2309,8 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, int usermode = !KVM_GUEST_KERNEL_MODE(vcpu); int rd = (inst & RD) >> 11; int rt = (inst & RT) >> 16; + int sel = (inst >> 6) & 0x7; + /* If usermode, check RDHWR rd is allowed by guest HWREna */ if (usermode && !(kvm_read_c0_guest_hwrena(cop0) & BIT(rd))) { kvm_debug("RDHWR %#x disallowed by HWREna @ %p\n", @@ -2342,6 +2346,9 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, kvm_debug("RDHWR %#x not supported @ %p\n", rd, opc); goto emulate_ri; } + + trace_kvm_hwr(vcpu, KVM_TRACE_RDHWR, KVM_TRACE_HWR(rd, sel), + vcpu->arch.gprs[rt]); } else { kvm_debug("Emulate RI not supported @ %p: %#x\n", opc, inst); goto emulate_ri; diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h index aec1c43f2b44..5d712ecb0734 100644 --- a/arch/mips/kvm/trace.h +++ b/arch/mips/kvm/trace.h @@ -124,6 +124,94 @@ TRACE_EVENT(kvm_exit, __entry->pc) ); +#define KVM_TRACE_MFC0 0 +#define KVM_TRACE_MTC0 1 +#define KVM_TRACE_DMFC0 2 +#define KVM_TRACE_DMTC0 3 +#define KVM_TRACE_RDHWR 4 + +#define KVM_TRACE_HWR_COP0 0 +#define KVM_TRACE_HWR_HWR 1 + +#define KVM_TRACE_COP0(REG, SEL) ((KVM_TRACE_HWR_COP0 << 8) | \ + ((REG) << 3) | (SEL)) +#define KVM_TRACE_HWR(REG, SEL) ((KVM_TRACE_HWR_HWR << 8) | \ + ((REG) << 3) | (SEL)) + +#define kvm_trace_symbol_hwr_ops \ + { KVM_TRACE_MFC0, "MFC0" }, \ + { KVM_TRACE_MTC0, "MTC0" }, \ + { KVM_TRACE_DMFC0, "DMFC0" }, \ + { KVM_TRACE_DMTC0, "DMTC0" }, \ + { KVM_TRACE_RDHWR, "RDHWR" } + +#define kvm_trace_symbol_hwr_cop \ + { KVM_TRACE_HWR_COP0, "COP0" }, \ + { KVM_TRACE_HWR_HWR, "HWR" } + +#define kvm_trace_symbol_hwr_regs \ + { KVM_TRACE_COP0( 0, 0), "Index" }, \ + { KVM_TRACE_COP0( 2, 0), "EntryLo0" }, \ + { KVM_TRACE_COP0( 3, 0), "EntryLo1" }, \ + { KVM_TRACE_COP0( 4, 0), "Context" }, \ + { KVM_TRACE_COP0( 4, 2), "UserLocal" }, \ + { KVM_TRACE_COP0( 5, 0), "PageMask" }, \ + { KVM_TRACE_COP0( 6, 0), "Wired" }, \ + { KVM_TRACE_COP0( 7, 0), "HWREna" }, \ + { KVM_TRACE_COP0( 8, 0), "BadVAddr" }, \ + { KVM_TRACE_COP0( 9, 0), "Count" }, \ + { KVM_TRACE_COP0(10, 0), "EntryHi" }, \ + { KVM_TRACE_COP0(11, 0), "Compare" }, \ + { KVM_TRACE_COP0(12, 0), "Status" }, \ + { KVM_TRACE_COP0(12, 1), "IntCtl" }, \ + { KVM_TRACE_COP0(12, 2), "SRSCtl" }, \ + { KVM_TRACE_COP0(13, 0), "Cause" }, \ + { KVM_TRACE_COP0(14, 0), "EPC" }, \ + { KVM_TRACE_COP0(15, 0), "PRId" }, \ + { KVM_TRACE_COP0(15, 1), "EBase" }, \ + { KVM_TRACE_COP0(16, 0), "Config" }, \ + { KVM_TRACE_COP0(16, 1), "Config1" }, \ + { KVM_TRACE_COP0(16, 2), "Config2" }, \ + { KVM_TRACE_COP0(16, 3), "Config3" }, \ + { KVM_TRACE_COP0(16, 4), "Config4" }, \ + { KVM_TRACE_COP0(16, 5), "Config5" }, \ + { KVM_TRACE_COP0(16, 7), "Config7" }, \ + { KVM_TRACE_COP0(26, 0), "ECC" }, \ + { KVM_TRACE_COP0(30, 0), "ErrorEPC" }, \ + { KVM_TRACE_HWR( 0, 0), "CPUNum" }, \ + { KVM_TRACE_HWR( 1, 0), "SYNCI_Step" }, \ + { KVM_TRACE_HWR( 2, 0), "CC" }, \ + { KVM_TRACE_HWR( 3, 0), "CCRes" }, \ + { KVM_TRACE_HWR(29, 0), "ULR" } + +TRACE_EVENT(kvm_hwr, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int op, unsigned int reg, + unsigned long val), + TP_ARGS(vcpu, op, reg, val), + TP_STRUCT__entry( + __field(unsigned long, val) + __field(u16, reg) + __field(u8, op) + ), + + TP_fast_assign( + __entry->val = val; + __entry->reg = reg; + __entry->op = op; + ), + + TP_printk("%s %s (%s:%u:%u) 0x%08lx", + __print_symbolic(__entry->op, + kvm_trace_symbol_hwr_ops), + __print_symbolic(__entry->reg, + kvm_trace_symbol_hwr_regs), + __print_symbolic(__entry->reg >> 8, + kvm_trace_symbol_hwr_cop), + (__entry->reg >> 3) & 0x1f, + __entry->reg & 0x7, + __entry->val) +); + #define KVM_TRACE_AUX_RESTORE 0 #define KVM_TRACE_AUX_SAVE 1 #define KVM_TRACE_AUX_ENABLE 2 From eafc4ed206c562d205de9d2acf40d57616faaf03 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Jun 2016 09:40:16 +0100 Subject: [PATCH 081/302] MIPS: KVM: Dump guest tlbs if kvm_get_inst() fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If kvm_get_inst() fails to find a guest TLB mapping for the guest PC then dump the guest TLB entries. The contents of the guest TLB is likely to be more interesting than the host TLB entries. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: kvm@vger.kernel.org Cc: linux-mips@linux-mips.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 208f70409ccb..2f494ec5c939 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -346,6 +346,7 @@ u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu) kvm_err("%s: get_user_failed for %p, vcpu: %p, ASID: %#lx\n", __func__, opc, vcpu, read_c0_entryhi()); kvm_mips_dump_host_tlbs(); + kvm_mips_dump_guest_tlbs(vcpu); local_irq_restore(flags); return KVM_INVALID_INST; } From d86c1ebe8e3d8a13aea9ce8437405d0ea3765698 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Jun 2016 09:40:17 +0100 Subject: [PATCH 082/302] MIPS: KVM: Print unknown load/store encodings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When trying to emulate an unrecognised load or store instruction, print the encoding to aid debug. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/emulate.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 2004e35288d0..ff4072c2b25e 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1412,7 +1412,8 @@ enum emulation_result kvm_mips_emulate_store(u32 inst, u32 cause, break; default: - kvm_err("Store not yet supported"); + kvm_err("Store not yet supported (inst=0x%08x)\n", + inst); er = EMULATE_FAIL; break; } @@ -1522,7 +1523,8 @@ enum emulation_result kvm_mips_emulate_load(u32 inst, u32 cause, break; default: - kvm_err("Load not yet supported"); + kvm_err("Load not yet supported (inst=0x%08x)\n", + inst); er = EMULATE_FAIL; break; } From 6a727b0b3f9305b2c9f107decee3d8b9122de9e1 Mon Sep 17 00:00:00 2001 From: Andrea Gelmini Date: Sat, 21 May 2016 13:48:35 +0200 Subject: [PATCH 083/302] KVM: ARM: Fix typos Signed-off-by: Andrea Gelmini Signed-off-by: Paolo Bonzini --- arch/arm/kvm/arm.c | 2 +- arch/arm/kvm/emulate.c | 2 +- arch/arm/kvm/guest.c | 2 +- arch/arm/kvm/reset.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 893941ec98dc..f20ca84537f5 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -376,7 +376,7 @@ void force_vm_exit(const cpumask_t *mask) /** * need_new_vmid_gen - check that the VMID is still valid - * @kvm: The VM's VMID to checkt + * @kvm: The VM's VMID to check * * return true if there is a new generation of VMIDs being used * diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c index a494def3f195..af93e3ffc9f3 100644 --- a/arch/arm/kvm/emulate.c +++ b/arch/arm/kvm/emulate.c @@ -210,7 +210,7 @@ bool kvm_condition_valid(struct kvm_vcpu *vcpu) * @vcpu: The VCPU pointer * * When exceptions occur while instructions are executed in Thumb IF-THEN - * blocks, the ITSTATE field of the CPSR is not advanved (updated), so we have + * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have * to do this little bit of work manually. The fields map like this: * * IT[7:0] -> CPSR[26:25],CPSR[15:10] diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c index 9093ed0f8b2a..9aca92074f85 100644 --- a/arch/arm/kvm/guest.c +++ b/arch/arm/kvm/guest.c @@ -182,7 +182,7 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu) /** * kvm_arm_copy_reg_indices - get indices of all registers. * - * We do core registers right here, then we apppend coproc regs. + * We do core registers right here, then we append coproc regs. */ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c index 0048b5a62a50..4b5e802e57d1 100644 --- a/arch/arm/kvm/reset.c +++ b/arch/arm/kvm/reset.c @@ -52,7 +52,7 @@ static const struct kvm_irq_level cortexa_vtimer_irq = { * @vcpu: The VCPU pointer * * This function finds the right table above and sets the registers on the - * virtual CPU struct to their architectually defined reset values. + * virtual CPU struct to their architecturally defined reset values. */ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) { From edce2292c1e026c6a2da6899c114d930bc1f518b Mon Sep 17 00:00:00 2001 From: Andrea Gelmini Date: Sat, 21 May 2016 13:53:14 +0200 Subject: [PATCH 084/302] KVM: ARM64: Fix typos Signed-off-by: Andrea Gelmini Signed-off-by: Paolo Bonzini --- arch/arm64/include/asm/kvm_arm.h | 2 +- arch/arm64/kvm/guest.c | 2 +- arch/arm64/kvm/reset.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 2cdb6b551ac6..4b5c977af465 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -178,7 +178,7 @@ /* Hyp System Trap Register */ #define HSTR_EL2_T(x) (1 << x) -/* Hyp Coproccessor Trap Register Shifts */ +/* Hyp Coprocessor Trap Register Shifts */ #define CPTR_EL2_TFP_SHIFT 10 /* Hyp Coprocessor Trap Register */ diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 32fad75bb9ff..3f9e15722473 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -211,7 +211,7 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu) /** * kvm_arm_copy_reg_indices - get indices of all registers. * - * We do core registers right here, then we apppend system regs. + * We do core registers right here, then we append system regs. */ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) { diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index b1ad730e1567..7be24f2b18db 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -98,7 +98,7 @@ int kvm_arch_dev_ioctl_check_extension(long ext) * @vcpu: The VCPU pointer * * This function finds the right table above and sets the registers on - * the virtual CPU struct to their architectually defined reset + * the virtual CPU struct to their architecturally defined reset * values. */ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) From 960cb306e63d4efde7753c0a2f2cef523a41e8ec Mon Sep 17 00:00:00 2001 From: Andrea Gelmini Date: Sat, 21 May 2016 14:08:55 +0200 Subject: [PATCH 085/302] KVM: S390: Fix typo Signed-off-by: Andrea Gelmini Signed-off-by: Paolo Bonzini --- arch/s390/kvm/guestdbg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index e8c6843b9600..1e0849e20965 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -465,7 +465,7 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu) guest_perc &= ~PER_EVENT_IFETCH; /* All other PER events will be given to the guest */ - /* TODO: Check alterated address/address space */ + /* TODO: Check altered address/address space */ vcpu->arch.sie_block->perc = guest_perc >> 24; From bb3541f175a977198d128f3a4e13534e019754a3 Mon Sep 17 00:00:00 2001 From: Andrea Gelmini Date: Sat, 21 May 2016 14:14:44 +0200 Subject: [PATCH 086/302] KVM: x86: Fix typos Signed-off-by: Andrea Gelmini Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/locking.txt | 4 ++-- arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/pmu_intel.c | 2 +- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt index 19f94a6b9bb0..f2491a8c68b4 100644 --- a/Documentation/virtual/kvm/locking.txt +++ b/Documentation/virtual/kvm/locking.txt @@ -89,7 +89,7 @@ In mmu_spte_clear_track_bits(): old_spte = *spte; /* 'if' condition is satisfied. */ - if (old_spte.Accssed == 1 && + if (old_spte.Accessed == 1 && old_spte.W == 0) spte = 0ull; on fast page fault path: @@ -102,7 +102,7 @@ In mmu_spte_clear_track_bits(): old_spte = xchg(spte, 0ull) - if (old_spte.Accssed == 1) + if (old_spte.Accessed == 1) kvm_set_pfn_accessed(spte.pfn); if (old_spte.Dirty == 1) kvm_set_pfn_dirty(spte.pfn); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index def97b3a392b..837bf23c5b06 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -523,7 +523,7 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) } /* Rules for using mmu_spte_update: - * Update the state bits, it means the mapped pfn is not changged. + * Update the state bits, it means the mapped pfn is not changed. * * Whenever we overwrite a writable spte with a read-only one we * should flush remote TLBs. Otherwise rmap_write_protect diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c index ab38af4f4947..9d4a8504a95a 100644 --- a/arch/x86/kvm/pmu_intel.c +++ b/arch/x86/kvm/pmu_intel.c @@ -93,7 +93,7 @@ static unsigned intel_find_fixed_event(int idx) return intel_arch_events[fixed_pmc_events[idx]].event_type; } -/* check if a PMC is enabled by comparising it with globl_ctrl bits. */ +/* check if a PMC is enabled by comparing it with globl_ctrl bits. */ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1163e8173e5a..5ff292778110 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1572,7 +1572,7 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { /* - * Any change of EFLAGS.VM is accompained by a reload of SS + * Any change of EFLAGS.VM is accompanied by a reload of SS * (caused by either a task switch or an inter-privilege IRET), * so we do not need to update the CPL here. */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index fb93010beaa4..57ec6a4b4958 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3364,7 +3364,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) /* * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL - * but due to arrata below it can't be used. Workaround is to use + * but due to errata below it can't be used. Workaround is to use * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL. * * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32] diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9d6a30593655..bf227212aebb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8418,7 +8418,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, /* * When producer of consumer is unregistered, we change back to * remapped mode, so we can re-use the current implementation - * when the irq is masked/disabed or the consumer side (KVM + * when the irq is masked/disabled or the consumer side (KVM * int this case doesn't want to receive the interrupts. */ ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0); From 66ffc50c480e7ab6ad5642f47276435a8873c31a Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 15 Jun 2016 19:29:45 +0100 Subject: [PATCH 087/302] MIPS: KVM: Fix translation of MFC0 ErrCtl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MIPS KVM dynamic translation is meant to translate "MFC0 rt, ErrCtl" instructions into "ADD rt, zero, zero" to zero the destination register, however the rt register number was copied into rt of the ADD instruction encoding, which is the 2nd source operand. This results in "ADD zero, zero, rt" which is a no-op, so only the first execution of each such MFC0 from ErrCtl will actually read 0. Fix the shift to put the rt from the MFC0 encoding into the rd field of the ADD. Fixes: 50c8308538dc ("KVM/MIPS32: Binary patching of select privileged instructions.") Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/dyntrans.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c index d4a86fb239cd..79b134c91333 100644 --- a/arch/mips/kvm/dyntrans.c +++ b/arch/mips/kvm/dyntrans.c @@ -82,7 +82,7 @@ int kvm_mips_trans_mfc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu) if ((rd == MIPS_CP0_ERRCTL) && (sel == 0)) { mfc0_inst = CLEAR_TEMPLATE; - mfc0_inst |= ((rt & 0x1f) << 16); + mfc0_inst |= ((rt & 0x1f) << 11); } else { mfc0_inst = LW_TEMPLATE; mfc0_inst |= ((rt & 0x1f) << 16); From d5cd26bcfc881f5443d510e3acd40b30d7b7d0df Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 15 Jun 2016 19:29:46 +0100 Subject: [PATCH 088/302] MIPS: KVM: Factor writing of translated guest instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The code in kvm_mips_dyntrans.c to write a translated guest instruction to guest memory depending on the segment is duplicated between each of the functions. Additionally the cache op translation functions assume the instruction is in the KSEG0/1 segment rather than KSEG2/3, which is generally true but isn't guaranteed. Factor that code into a new kvm_mips_trans_replace() which handles both KSEG0/1 and KSEG2/3. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/dyntrans.c | 92 +++++++++++++++------------------------- 1 file changed, 34 insertions(+), 58 deletions(-) diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c index 79b134c91333..eb6e0d17a668 100644 --- a/arch/mips/kvm/dyntrans.c +++ b/arch/mips/kvm/dyntrans.c @@ -28,21 +28,41 @@ #define CLEAR_TEMPLATE 0x00000020 #define SW_TEMPLATE 0xac000000 +/** + * kvm_mips_trans_replace() - Replace trapping instruction in guest memory. + * @vcpu: Virtual CPU. + * @opc: PC of instruction to replace. + * @replace: Instruction to write + */ +static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc, u32 replace) +{ + unsigned long kseg0_opc, flags; + + if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) { + kseg0_opc = + CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa + (vcpu, (unsigned long) opc)); + memcpy((void *)kseg0_opc, (void *)&replace, sizeof(u32)); + local_flush_icache_range(kseg0_opc, kseg0_opc + 32); + } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) { + local_irq_save(flags); + memcpy((void *)opc, (void *)&replace, sizeof(u32)); + local_flush_icache_range((unsigned long)opc, + (unsigned long)opc + 32); + local_irq_restore(flags); + } else { + kvm_err("%s: Invalid address: %p\n", __func__, opc); + return -EFAULT; + } + + return 0; +} + int kvm_mips_trans_cache_index(u32 inst, u32 *opc, struct kvm_vcpu *vcpu) { - int result = 0; - unsigned long kseg0_opc; - u32 synci_inst = 0x0; - /* Replace the CACHE instruction, with a NOP */ - kseg0_opc = - CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa - (vcpu, (unsigned long) opc)); - memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(u32)); - local_flush_icache_range(kseg0_opc, kseg0_opc + 32); - - return result; + return kvm_mips_trans_replace(vcpu, opc, 0x00000000); } /* @@ -52,8 +72,6 @@ int kvm_mips_trans_cache_index(u32 inst, u32 *opc, int kvm_mips_trans_cache_va(u32 inst, u32 *opc, struct kvm_vcpu *vcpu) { - int result = 0; - unsigned long kseg0_opc; u32 synci_inst = SYNCI_TEMPLATE, base, offset; base = (inst >> 21) & 0x1f; @@ -61,20 +79,13 @@ int kvm_mips_trans_cache_va(u32 inst, u32 *opc, synci_inst |= (base << 21); synci_inst |= offset; - kseg0_opc = - CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa - (vcpu, (unsigned long) opc)); - memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(u32)); - local_flush_icache_range(kseg0_opc, kseg0_opc + 32); - - return result; + return kvm_mips_trans_replace(vcpu, opc, synci_inst); } int kvm_mips_trans_mfc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu) { u32 rt, rd, sel; u32 mfc0_inst; - unsigned long kseg0_opc, flags; rt = (inst >> 16) & 0x1f; rd = (inst >> 11) & 0x1f; @@ -90,31 +101,13 @@ int kvm_mips_trans_mfc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu) cop0.reg[rd][sel]); } - if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) { - kseg0_opc = - CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa - (vcpu, (unsigned long) opc)); - memcpy((void *)kseg0_opc, (void *)&mfc0_inst, sizeof(u32)); - local_flush_icache_range(kseg0_opc, kseg0_opc + 32); - } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) { - local_irq_save(flags); - memcpy((void *)opc, (void *)&mfc0_inst, sizeof(u32)); - local_flush_icache_range((unsigned long)opc, - (unsigned long)opc + 32); - local_irq_restore(flags); - } else { - kvm_err("%s: Invalid address: %p\n", __func__, opc); - return -EFAULT; - } - - return 0; + return kvm_mips_trans_replace(vcpu, opc, mfc0_inst); } int kvm_mips_trans_mtc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu) { u32 rt, rd, sel; u32 mtc0_inst = SW_TEMPLATE; - unsigned long kseg0_opc, flags; rt = (inst >> 16) & 0x1f; rd = (inst >> 11) & 0x1f; @@ -123,22 +116,5 @@ int kvm_mips_trans_mtc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu) mtc0_inst |= ((rt & 0x1f) << 16); mtc0_inst |= offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]); - if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) { - kseg0_opc = - CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa - (vcpu, (unsigned long) opc)); - memcpy((void *)kseg0_opc, (void *)&mtc0_inst, sizeof(u32)); - local_flush_icache_range(kseg0_opc, kseg0_opc + 32); - } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) { - local_irq_save(flags); - memcpy((void *)opc, (void *)&mtc0_inst, sizeof(u32)); - local_flush_icache_range((unsigned long)opc, - (unsigned long)opc + 32); - local_irq_restore(flags); - } else { - kvm_err("%s: Invalid address: %p\n", __func__, opc); - return -EFAULT; - } - - return 0; + return kvm_mips_trans_replace(vcpu, opc, mtc0_inst); } From 258f3a2ea93ff7e322006c716cedc4fa3d861453 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 15 Jun 2016 19:29:47 +0100 Subject: [PATCH 089/302] MIPS: KVM: Convert emulation to use asm/inst.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert various MIPS KVM guest instruction emulation functions to decode instructions (and encode translations) using the union mips_instruction and related enumerations in asm/inst.h rather than #defines and hardcoded values. Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: Paolo Bonzini Cc: Radim Krčmář Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 22 +++--- arch/mips/include/uapi/asm/inst.h | 35 +++++++++- arch/mips/kvm/dyntrans.c | 74 ++++++++++---------- arch/mips/kvm/emulate.c | 109 ++++++++++++------------------ 4 files changed, 126 insertions(+), 114 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index b8cb74270746..1e002136f514 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -19,6 +19,7 @@ #include #include +#include #include /* MIPS KVM register ids */ @@ -733,21 +734,21 @@ enum emulation_result kvm_mips_check_privilege(u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu); -enum emulation_result kvm_mips_emulate_cache(u32 inst, +enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, u32 *opc, u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu); -enum emulation_result kvm_mips_emulate_CP0(u32 inst, +enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, u32 *opc, u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu); -enum emulation_result kvm_mips_emulate_store(u32 inst, +enum emulation_result kvm_mips_emulate_store(union mips_instruction inst, u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu); -enum emulation_result kvm_mips_emulate_load(u32 inst, +enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu); @@ -758,11 +759,14 @@ unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu); unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu); /* Dynamic binary translation */ -extern int kvm_mips_trans_cache_index(u32 inst, u32 *opc, - struct kvm_vcpu *vcpu); -extern int kvm_mips_trans_cache_va(u32 inst, u32 *opc, struct kvm_vcpu *vcpu); -extern int kvm_mips_trans_mfc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu); -extern int kvm_mips_trans_mtc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu); +extern int kvm_mips_trans_cache_index(union mips_instruction inst, + u32 *opc, struct kvm_vcpu *vcpu); +extern int kvm_mips_trans_cache_va(union mips_instruction inst, u32 *opc, + struct kvm_vcpu *vcpu); +extern int kvm_mips_trans_mfc0(union mips_instruction inst, u32 *opc, + struct kvm_vcpu *vcpu); +extern int kvm_mips_trans_mtc0(union mips_instruction inst, u32 *opc, + struct kvm_vcpu *vcpu); /* Misc */ extern void kvm_mips_dump_stats(struct kvm_vcpu *vcpu); diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h index 8051f9aa1379..a1ebf973725c 100644 --- a/arch/mips/include/uapi/asm/inst.h +++ b/arch/mips/include/uapi/asm/inst.h @@ -103,7 +103,7 @@ enum rt_op { bltzal_op, bgezal_op, bltzall_op, bgezall_op, rt_op_0x14, rt_op_0x15, rt_op_0x16, rt_op_0x17, rt_op_0x18, rt_op_0x19, rt_op_0x1a, rt_op_0x1b, - bposge32_op, rt_op_0x1d, rt_op_0x1e, rt_op_0x1f + bposge32_op, rt_op_0x1d, rt_op_0x1e, synci_op }; /* @@ -586,6 +586,36 @@ struct r_format { /* Register format */ ;)))))) }; +struct c0r_format { /* C0 register format */ + __BITFIELD_FIELD(unsigned int opcode : 6, + __BITFIELD_FIELD(unsigned int rs : 5, + __BITFIELD_FIELD(unsigned int rt : 5, + __BITFIELD_FIELD(unsigned int rd : 5, + __BITFIELD_FIELD(unsigned int z: 8, + __BITFIELD_FIELD(unsigned int sel : 3, + ;)))))) +}; + +struct mfmc0_format { /* MFMC0 register format */ + __BITFIELD_FIELD(unsigned int opcode : 6, + __BITFIELD_FIELD(unsigned int rs : 5, + __BITFIELD_FIELD(unsigned int rt : 5, + __BITFIELD_FIELD(unsigned int rd : 5, + __BITFIELD_FIELD(unsigned int re : 5, + __BITFIELD_FIELD(unsigned int sc : 1, + __BITFIELD_FIELD(unsigned int : 2, + __BITFIELD_FIELD(unsigned int sel : 3, + ;)))))))) +}; + +struct co_format { /* C0 CO format */ + __BITFIELD_FIELD(unsigned int opcode : 6, + __BITFIELD_FIELD(unsigned int co : 1, + __BITFIELD_FIELD(unsigned int code : 19, + __BITFIELD_FIELD(unsigned int func : 6, + ;)))) +}; + struct p_format { /* Performance counter format (R10000) */ __BITFIELD_FIELD(unsigned int opcode : 6, __BITFIELD_FIELD(unsigned int rs : 5, @@ -937,6 +967,9 @@ union mips_instruction { struct u_format u_format; struct c_format c_format; struct r_format r_format; + struct c0r_format c0r_format; + struct mfmc0_format mfmc0_format; + struct co_format co_format; struct p_format p_format; struct f_format f_format; struct ma_format ma_format; diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c index eb6e0d17a668..a3031dae8d1b 100644 --- a/arch/mips/kvm/dyntrans.c +++ b/arch/mips/kvm/dyntrans.c @@ -20,21 +20,14 @@ #include "commpage.h" -#define SYNCI_TEMPLATE 0x041f0000 -#define SYNCI_BASE(x) (((x) >> 21) & 0x1f) -#define SYNCI_OFFSET ((x) & 0xffff) - -#define LW_TEMPLATE 0x8c000000 -#define CLEAR_TEMPLATE 0x00000020 -#define SW_TEMPLATE 0xac000000 - /** * kvm_mips_trans_replace() - Replace trapping instruction in guest memory. * @vcpu: Virtual CPU. * @opc: PC of instruction to replace. * @replace: Instruction to write */ -static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc, u32 replace) +static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc, + union mips_instruction replace) { unsigned long kseg0_opc, flags; @@ -58,63 +51,68 @@ static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc, u32 replace) return 0; } -int kvm_mips_trans_cache_index(u32 inst, u32 *opc, +int kvm_mips_trans_cache_index(union mips_instruction inst, u32 *opc, struct kvm_vcpu *vcpu) { + union mips_instruction nop_inst = { 0 }; + /* Replace the CACHE instruction, with a NOP */ - return kvm_mips_trans_replace(vcpu, opc, 0x00000000); + return kvm_mips_trans_replace(vcpu, opc, nop_inst); } /* * Address based CACHE instructions are transformed into synci(s). A little * heavy for just D-cache invalidates, but avoids an expensive trap */ -int kvm_mips_trans_cache_va(u32 inst, u32 *opc, +int kvm_mips_trans_cache_va(union mips_instruction inst, u32 *opc, struct kvm_vcpu *vcpu) { - u32 synci_inst = SYNCI_TEMPLATE, base, offset; + union mips_instruction synci_inst = { 0 }; - base = (inst >> 21) & 0x1f; - offset = inst & 0xffff; - synci_inst |= (base << 21); - synci_inst |= offset; + synci_inst.i_format.opcode = bcond_op; + synci_inst.i_format.rs = inst.i_format.rs; + synci_inst.i_format.rt = synci_op; + synci_inst.i_format.simmediate = inst.i_format.simmediate; return kvm_mips_trans_replace(vcpu, opc, synci_inst); } -int kvm_mips_trans_mfc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu) +int kvm_mips_trans_mfc0(union mips_instruction inst, u32 *opc, + struct kvm_vcpu *vcpu) { - u32 rt, rd, sel; - u32 mfc0_inst; + union mips_instruction mfc0_inst = { 0 }; + u32 rd, sel; - rt = (inst >> 16) & 0x1f; - rd = (inst >> 11) & 0x1f; - sel = inst & 0x7; + rd = inst.c0r_format.rd; + sel = inst.c0r_format.sel; - if ((rd == MIPS_CP0_ERRCTL) && (sel == 0)) { - mfc0_inst = CLEAR_TEMPLATE; - mfc0_inst |= ((rt & 0x1f) << 11); + if (rd == MIPS_CP0_ERRCTL && sel == 0) { + mfc0_inst.r_format.opcode = spec_op; + mfc0_inst.r_format.rd = inst.c0r_format.rt; + mfc0_inst.r_format.func = add_op; } else { - mfc0_inst = LW_TEMPLATE; - mfc0_inst |= ((rt & 0x1f) << 16); - mfc0_inst |= offsetof(struct kvm_mips_commpage, - cop0.reg[rd][sel]); + mfc0_inst.i_format.opcode = lw_op; + mfc0_inst.i_format.rt = inst.c0r_format.rt; + mfc0_inst.i_format.simmediate = + offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]); } return kvm_mips_trans_replace(vcpu, opc, mfc0_inst); } -int kvm_mips_trans_mtc0(u32 inst, u32 *opc, struct kvm_vcpu *vcpu) +int kvm_mips_trans_mtc0(union mips_instruction inst, u32 *opc, + struct kvm_vcpu *vcpu) { - u32 rt, rd, sel; - u32 mtc0_inst = SW_TEMPLATE; + union mips_instruction mtc0_inst = { 0 }; + u32 rd, sel; - rt = (inst >> 16) & 0x1f; - rd = (inst >> 11) & 0x1f; - sel = inst & 0x7; + rd = inst.c0r_format.rd; + sel = inst.c0r_format.sel; - mtc0_inst |= ((rt & 0x1f) << 16); - mtc0_inst |= offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]); + mtc0_inst.i_format.opcode = sw_op; + mtc0_inst.i_format.rt = inst.c0r_format.rt; + mtc0_inst.i_format.simmediate = + offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]); return kvm_mips_trans_replace(vcpu, opc, mtc0_inst); } diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index ff4072c2b25e..80bb6212a067 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -972,13 +972,14 @@ unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu) return mask; } -enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause, +enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, + u32 *opc, u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; enum emulation_result er = EMULATE_DONE; - u32 rt, rd, copz, sel, co_bit, op; + u32 rt, rd, sel; unsigned long curr_pc; /* @@ -990,16 +991,8 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause, if (er == EMULATE_FAIL) return er; - copz = (inst >> 21) & 0x1f; - rt = (inst >> 16) & 0x1f; - rd = (inst >> 11) & 0x1f; - sel = inst & 0x7; - co_bit = (inst >> 25) & 1; - - if (co_bit) { - op = (inst) & 0xff; - - switch (op) { + if (inst.co_format.co) { + switch (inst.co_format.func) { case tlbr_op: /* Read indexed TLB entry */ er = kvm_mips_emul_tlbr(vcpu); break; @@ -1018,13 +1011,16 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause, case eret_op: er = kvm_mips_emul_eret(vcpu); goto dont_update_pc; - break; case wait_op: er = kvm_mips_emul_wait(vcpu); break; } } else { - switch (copz) { + rt = inst.c0r_format.rt; + rd = inst.c0r_format.rd; + sel = inst.c0r_format.sel; + + switch (inst.c0r_format.rs) { case mfc_op: #ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS cop0->stat[rd][sel]++; @@ -1258,7 +1254,7 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause, vcpu->arch.gprs[rt] = kvm_read_c0_guest_status(cop0); /* EI */ - if (inst & 0x20) { + if (inst.mfmc0_format.sc) { kvm_debug("[%#lx] mfmc0_op: EI\n", vcpu->arch.pc); kvm_set_c0_guest_status(cop0, ST0_IE); @@ -1290,7 +1286,7 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause, break; default: kvm_err("[%#lx]MachEmulateCP0: unsupported COP0, copz: 0x%x\n", - vcpu->arch.pc, copz); + vcpu->arch.pc, inst.c0r_format.rs); er = EMULATE_FAIL; break; } @@ -1311,13 +1307,13 @@ enum emulation_result kvm_mips_emulate_CP0(u32 inst, u32 *opc, u32 cause, return er; } -enum emulation_result kvm_mips_emulate_store(u32 inst, u32 cause, +enum emulation_result kvm_mips_emulate_store(union mips_instruction inst, + u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu) { enum emulation_result er = EMULATE_DO_MMIO; - u32 op, base, rt; - s16 offset; + u32 rt; u32 bytes; void *data = run->mmio.data; unsigned long curr_pc; @@ -1331,12 +1327,9 @@ enum emulation_result kvm_mips_emulate_store(u32 inst, u32 cause, if (er == EMULATE_FAIL) return er; - rt = (inst >> 16) & 0x1f; - base = (inst >> 21) & 0x1f; - offset = (s16)inst; - op = (inst >> 26) & 0x3f; + rt = inst.i_format.rt; - switch (op) { + switch (inst.i_format.opcode) { case sb_op: bytes = 1; if (bytes > sizeof(run->mmio.data)) { @@ -1413,7 +1406,7 @@ enum emulation_result kvm_mips_emulate_store(u32 inst, u32 cause, default: kvm_err("Store not yet supported (inst=0x%08x)\n", - inst); + inst.word); er = EMULATE_FAIL; break; } @@ -1425,19 +1418,16 @@ enum emulation_result kvm_mips_emulate_store(u32 inst, u32 cause, return er; } -enum emulation_result kvm_mips_emulate_load(u32 inst, u32 cause, - struct kvm_run *run, +enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, + u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu) { enum emulation_result er = EMULATE_DO_MMIO; - u32 op, base, rt; - s16 offset; + u32 op, rt; u32 bytes; - rt = (inst >> 16) & 0x1f; - base = (inst >> 21) & 0x1f; - offset = (s16)inst; - op = (inst >> 26) & 0x3f; + rt = inst.i_format.rt; + op = inst.i_format.opcode; vcpu->arch.pending_load_cause = cause; vcpu->arch.io_gpr = rt; @@ -1524,7 +1514,7 @@ enum emulation_result kvm_mips_emulate_load(u32 inst, u32 cause, default: kvm_err("Load not yet supported (inst=0x%08x)\n", - inst); + inst.word); er = EMULATE_FAIL; break; } @@ -1532,8 +1522,8 @@ enum emulation_result kvm_mips_emulate_load(u32 inst, u32 cause, return er; } -enum emulation_result kvm_mips_emulate_cache(u32 inst, u32 *opc, - u32 cause, +enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, + u32 *opc, u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu) { @@ -1554,9 +1544,9 @@ enum emulation_result kvm_mips_emulate_cache(u32 inst, u32 *opc, if (er == EMULATE_FAIL) return er; - base = (inst >> 21) & 0x1f; - op_inst = (inst >> 16) & 0x1f; - offset = (s16)inst; + base = inst.i_format.rs; + op_inst = inst.i_format.rt; + offset = inst.i_format.simmediate; cache = op_inst & CacheOp_Cache; op = op_inst & CacheOp_Op; @@ -1693,16 +1683,16 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) { + union mips_instruction inst; enum emulation_result er = EMULATE_DONE; - u32 inst; /* Fetch the instruction. */ if (cause & CAUSEF_BD) opc += 1; - inst = kvm_get_inst(opc, vcpu); + inst.word = kvm_get_inst(opc, vcpu); - switch (((union mips_instruction)inst).r_format.opcode) { + switch (inst.r_format.opcode) { case cop0_op: er = kvm_mips_emulate_CP0(inst, opc, cause, run, vcpu); break; @@ -1727,7 +1717,7 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, default: kvm_err("Instruction emulation not supported (%p/%#x)\n", opc, - inst); + inst.word); kvm_arch_vcpu_dump_regs(vcpu); er = EMULATE_FAIL; break; @@ -2262,21 +2252,6 @@ enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause, return er; } -/* ll/sc, rdhwr, sync emulation */ - -#define OPCODE 0xfc000000 -#define BASE 0x03e00000 -#define RT 0x001f0000 -#define OFFSET 0x0000ffff -#define LL 0xc0000000 -#define SC 0xe0000000 -#define SPEC0 0x00000000 -#define SPEC3 0x7c000000 -#define RD 0x0000f800 -#define FUNC 0x0000003f -#define SYNC 0x0000000f -#define RDHWR 0x0000003b - enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu) @@ -2285,7 +2260,7 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, struct kvm_vcpu_arch *arch = &vcpu->arch; enum emulation_result er = EMULATE_DONE; unsigned long curr_pc; - u32 inst; + union mips_instruction inst; /* * Update PC and hold onto current PC in case there is @@ -2300,18 +2275,19 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, if (cause & CAUSEF_BD) opc += 1; - inst = kvm_get_inst(opc, vcpu); + inst.word = kvm_get_inst(opc, vcpu); - if (inst == KVM_INVALID_INST) { + if (inst.word == KVM_INVALID_INST) { kvm_err("%s: Cannot get inst @ %p\n", __func__, opc); return EMULATE_FAIL; } - if ((inst & OPCODE) == SPEC3 && (inst & FUNC) == RDHWR) { + if (inst.r_format.opcode == spec3_op && + inst.r_format.func == rdhwr_op) { int usermode = !KVM_GUEST_KERNEL_MODE(vcpu); - int rd = (inst & RD) >> 11; - int rt = (inst & RT) >> 16; - int sel = (inst >> 6) & 0x7; + int rd = inst.r_format.rd; + int rt = inst.r_format.rt; + int sel = inst.r_format.re & 0x7; /* If usermode, check RDHWR rd is allowed by guest HWREna */ if (usermode && !(kvm_read_c0_guest_hwrena(cop0) & BIT(rd))) { @@ -2352,7 +2328,8 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, trace_kvm_hwr(vcpu, KVM_TRACE_RDHWR, KVM_TRACE_HWR(rd, sel), vcpu->arch.gprs[rt]); } else { - kvm_debug("Emulate RI not supported @ %p: %#x\n", opc, inst); + kvm_debug("Emulate RI not supported @ %p: %#x\n", + opc, inst.word); goto emulate_ri; } From cc68d22f9727d02c1d981d27c11389fd9413e419 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 15 Jun 2016 19:29:48 +0100 Subject: [PATCH 090/302] MIPS: KVM: Pass all unknown registers to callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pass all unrecognised register IDs through to the set_one_reg() and get_one_reg() callbacks, not just select ones. This allows implementation specific registers to be more easily added without having to modify arch/mips/kvm/mips.c. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index b5ad2ba1847a..fe82f3354c23 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -688,16 +688,11 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu, v = (long)kvm_read_c0_guest_errorepc(cop0); break; /* registers to be handled specially */ - case KVM_REG_MIPS_CP0_COUNT: - case KVM_REG_MIPS_COUNT_CTL: - case KVM_REG_MIPS_COUNT_RESUME: - case KVM_REG_MIPS_COUNT_HZ: + default: ret = kvm_mips_callbacks->get_one_reg(vcpu, reg, &v); if (ret) return ret; break; - default: - return -EINVAL; } if ((reg->id & KVM_REG_SIZE_MASK) == KVM_REG_SIZE_U64) { u64 __user *uaddr64 = (u64 __user *)(long)reg->addr; @@ -859,21 +854,8 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu, kvm_write_c0_guest_errorepc(cop0, v); break; /* registers to be handled specially */ - case KVM_REG_MIPS_CP0_COUNT: - case KVM_REG_MIPS_CP0_COMPARE: - case KVM_REG_MIPS_CP0_CAUSE: - case KVM_REG_MIPS_CP0_CONFIG: - case KVM_REG_MIPS_CP0_CONFIG1: - case KVM_REG_MIPS_CP0_CONFIG2: - case KVM_REG_MIPS_CP0_CONFIG3: - case KVM_REG_MIPS_CP0_CONFIG4: - case KVM_REG_MIPS_CP0_CONFIG5: - case KVM_REG_MIPS_COUNT_CTL: - case KVM_REG_MIPS_COUNT_RESUME: - case KVM_REG_MIPS_COUNT_HZ: - return kvm_mips_callbacks->set_one_reg(vcpu, reg, v); default: - return -EINVAL; + return kvm_mips_callbacks->set_one_reg(vcpu, reg, v); } return 0; } From f5c43bd4218c0d7bd65b010fd080cd6edeaeb4c8 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 15 Jun 2016 19:29:49 +0100 Subject: [PATCH 091/302] MIPS: KVM: Make KVM_GET_REG_LIST dynamic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the implementation of KVM_GET_REG_LIST more dynamic so that only the subset of registers actually available can be exposed to user mode. This is important for VZ where some of the guest register state may not be possible to prevent the guest from accessing, therefore the user process may need to be aware of the state even if it doesn't understand what the state is for. This also allows different MIPS KVM implementations to provide different registers to one another, by way of new num_regs(vcpu) and copy_reg_indices(vcpu, indices) callback functions, currently just stubbed for trap & emulate. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 2 ++ arch/mips/kvm/mips.c | 29 ++++++++++++++++++++++------- arch/mips/kvm/trap_emul.c | 13 +++++++++++++ 3 files changed, 37 insertions(+), 7 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 1e002136f514..38f0491fcb2f 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -560,6 +560,8 @@ struct kvm_mips_callbacks { u32 cause); int (*irq_clear)(struct kvm_vcpu *vcpu, unsigned int priority, u32 cause); + unsigned long (*num_regs)(struct kvm_vcpu *vcpu); + int (*copy_reg_indices)(struct kvm_vcpu *vcpu, u64 __user *indices); int (*get_one_reg)(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, s64 *v); int (*set_one_reg)(struct kvm_vcpu *vcpu, diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index fe82f3354c23..2c4709a09b78 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -538,6 +538,26 @@ static u64 kvm_mips_get_one_regs[] = { KVM_REG_MIPS_COUNT_HZ, }; +static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu) +{ + unsigned long ret; + + ret = ARRAY_SIZE(kvm_mips_get_one_regs); + ret += kvm_mips_callbacks->num_regs(vcpu); + + return ret; +} + +static int kvm_mips_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices) +{ + if (copy_to_user(indices, kvm_mips_get_one_regs, + sizeof(kvm_mips_get_one_regs))) + return -EFAULT; + indices += ARRAY_SIZE(kvm_mips_get_one_regs); + + return kvm_mips_callbacks->copy_reg_indices(vcpu, indices); +} + static int kvm_mips_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { @@ -908,23 +928,18 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, } case KVM_GET_REG_LIST: { struct kvm_reg_list __user *user_list = argp; - u64 __user *reg_dest; struct kvm_reg_list reg_list; unsigned n; if (copy_from_user(®_list, user_list, sizeof(reg_list))) return -EFAULT; n = reg_list.n; - reg_list.n = ARRAY_SIZE(kvm_mips_get_one_regs); + reg_list.n = kvm_mips_num_regs(vcpu); if (copy_to_user(user_list, ®_list, sizeof(reg_list))) return -EFAULT; if (n < reg_list.n) return -E2BIG; - reg_dest = user_list->reg; - if (copy_to_user(reg_dest, kvm_mips_get_one_regs, - sizeof(kvm_mips_get_one_regs))) - return -EFAULT; - return 0; + return kvm_mips_copy_reg_indices(vcpu, user_list->reg); } case KVM_NMI: /* Treat the NMI as a CPU reset */ diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 09b97fa9dabb..b64ca1a222f7 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -478,6 +478,17 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) return 0; } +static unsigned long kvm_trap_emul_num_regs(struct kvm_vcpu *vcpu) +{ + return 0; +} + +static int kvm_trap_emul_copy_reg_indices(struct kvm_vcpu *vcpu, + u64 __user *indices) +{ + return 0; +} + static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, s64 *v) @@ -627,6 +638,8 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = { .dequeue_io_int = kvm_mips_dequeue_io_int_cb, .irq_deliver = kvm_mips_irq_deliver_cb, .irq_clear = kvm_mips_irq_clear_cb, + .num_regs = kvm_trap_emul_num_regs, + .copy_reg_indices = kvm_trap_emul_copy_reg_indices, .get_one_reg = kvm_trap_emul_get_one_reg, .set_one_reg = kvm_trap_emul_set_one_reg, .vcpu_get_regs = kvm_trap_emul_vcpu_get_regs, From 19451e51012fa49070252b1b8157460d36618cee Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 15 Jun 2016 19:29:50 +0100 Subject: [PATCH 092/302] MIPS: KVM: Use raw_cpu_has_fpu in kvm_mips_guest_can_have_fpu() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We need to use kvm_mips_guest_can_have_fpu() when deciding which registers to list with KVM_GET_REG_LIST, however it causes warnings with preemption since it uses cpu_has_fpu. KVM is only really supported on CPUs which have symmetric FPUs, so switch to raw_cpu_has_fpu to avoid the warning. Signed-off-by: James Hogan Cc: Ralf Baechle Cc: Paolo Bonzini Cc: Radim Krčmář Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 38f0491fcb2f..f12eb01a3195 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -510,7 +510,7 @@ static inline void _kvm_atomic_change_c0_guest_reg(unsigned long *reg, static inline bool kvm_mips_guest_can_have_fpu(struct kvm_vcpu_arch *vcpu) { - return (!__builtin_constant_p(cpu_has_fpu) || cpu_has_fpu) && + return (!__builtin_constant_p(raw_cpu_has_fpu) || raw_cpu_has_fpu) && vcpu->fpu_enabled; } From e57759306c44ba6105c04eafc3b22efc55bb7ad2 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 15 Jun 2016 19:29:51 +0100 Subject: [PATCH 093/302] MIPS: KVM: List FPU/MSA registers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make KVM_GET_REG_LIST list FPU & MSA registers. Specifically we list all 32 vector registers when MSA can be enabled, 32 single-precision FP registers when FPU can be enabled, and either 16 or 32 double-precision FP registers when FPU can be enabled depending on whether FR mode is supported (which provides 32 doubles instead of 16 even doubles). Note, these registers may still be inaccessible depending on the current FP mode of the guest. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 58 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 2c4709a09b78..622b9feba927 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -538,11 +538,29 @@ static u64 kvm_mips_get_one_regs[] = { KVM_REG_MIPS_COUNT_HZ, }; +static u64 kvm_mips_get_one_regs_fpu[] = { + KVM_REG_MIPS_FCR_IR, + KVM_REG_MIPS_FCR_CSR, +}; + +static u64 kvm_mips_get_one_regs_msa[] = { + KVM_REG_MIPS_MSA_IR, + KVM_REG_MIPS_MSA_CSR, +}; + static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu) { unsigned long ret; ret = ARRAY_SIZE(kvm_mips_get_one_regs); + if (kvm_mips_guest_can_have_fpu(&vcpu->arch)) { + ret += ARRAY_SIZE(kvm_mips_get_one_regs_fpu) + 48; + /* odd doubles */ + if (boot_cpu_data.fpu_id & MIPS_FPIR_F64) + ret += 16; + } + if (kvm_mips_guest_can_have_msa(&vcpu->arch)) + ret += ARRAY_SIZE(kvm_mips_get_one_regs_msa) + 32; ret += kvm_mips_callbacks->num_regs(vcpu); return ret; @@ -550,11 +568,51 @@ static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu) static int kvm_mips_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices) { + u64 index; + unsigned int i; + if (copy_to_user(indices, kvm_mips_get_one_regs, sizeof(kvm_mips_get_one_regs))) return -EFAULT; indices += ARRAY_SIZE(kvm_mips_get_one_regs); + if (kvm_mips_guest_can_have_fpu(&vcpu->arch)) { + if (copy_to_user(indices, kvm_mips_get_one_regs_fpu, + sizeof(kvm_mips_get_one_regs_fpu))) + return -EFAULT; + indices += ARRAY_SIZE(kvm_mips_get_one_regs_fpu); + + for (i = 0; i < 32; ++i) { + index = KVM_REG_MIPS_FPR_32(i); + if (copy_to_user(indices, &index, sizeof(index))) + return -EFAULT; + ++indices; + + /* skip odd doubles if no F64 */ + if (i & 1 && !(boot_cpu_data.fpu_id & MIPS_FPIR_F64)) + continue; + + index = KVM_REG_MIPS_FPR_64(i); + if (copy_to_user(indices, &index, sizeof(index))) + return -EFAULT; + ++indices; + } + } + + if (kvm_mips_guest_can_have_msa(&vcpu->arch)) { + if (copy_to_user(indices, kvm_mips_get_one_regs_msa, + sizeof(kvm_mips_get_one_regs_msa))) + return -EFAULT; + indices += ARRAY_SIZE(kvm_mips_get_one_regs_msa); + + for (i = 0; i < 32; ++i) { + index = KVM_REG_MIPS_VEC_128(i); + if (copy_to_user(indices, &index, sizeof(index))) + return -EFAULT; + ++indices; + } + } + return kvm_mips_callbacks->copy_reg_indices(vcpu, indices); } From aff565aab961d3cab3299a7008af6cdef88b79a0 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 15 Jun 2016 19:29:52 +0100 Subject: [PATCH 094/302] MIPS: Clean up RDHWR handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No preprocessor definitions are used in the handling of the registers accessible with the RDHWR instruction, nor the corresponding bits in the CP0 HWREna register. Add definitions for both the register numbers (MIPS_HWR_*) and HWREna bits (MIPS_HWRENA_*) in asm/mipsregs.h and make use of them in the initialisation of HWREna and emulation of the RDHWR instruction. Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: David Daney Cc: Paolo Bonzini Cc: Radim Krčmář Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- .../cpu-feature-overrides.h | 2 +- arch/mips/include/asm/mipsregs.h | 20 ++++++++++++++++++- arch/mips/kernel/traps.c | 17 +++++++++------- arch/mips/kvm/emulate.c | 10 +++++----- 4 files changed, 35 insertions(+), 14 deletions(-) diff --git a/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h b/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h index d68e685cde60..bd8b9bbe1771 100644 --- a/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h +++ b/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h @@ -55,7 +55,7 @@ #define cpu_has_mipsmt 0 #define cpu_has_vint 0 #define cpu_has_veic 0 -#define cpu_hwrena_impl_bits 0xc0000000 +#define cpu_hwrena_impl_bits (MIPS_HWRENA_IMPL1 | MIPS_HWRENA_IMPL2) #define cpu_has_wsbh 1 #define cpu_has_rixi (cpu_data[0].cputype != CPU_CAVIUM_OCTEON) diff --git a/arch/mips/include/asm/mipsregs.h b/arch/mips/include/asm/mipsregs.h index e1ca65c62f6a..8b1b37d50d15 100644 --- a/arch/mips/include/asm/mipsregs.h +++ b/arch/mips/include/asm/mipsregs.h @@ -53,7 +53,7 @@ #define CP0_SEGCTL2 $5, 4 #define CP0_WIRED $6 #define CP0_INFO $7 -#define CP0_HWRENA $7, 0 +#define CP0_HWRENA $7 #define CP0_BADVADDR $8 #define CP0_BADINSTR $8, 1 #define CP0_COUNT $9 @@ -853,6 +853,24 @@ #define MIPS_CDMMBASE_ADDR_SHIFT 11 #define MIPS_CDMMBASE_ADDR_START 15 +/* RDHWR register numbers */ +#define MIPS_HWR_CPUNUM 0 /* CPU number */ +#define MIPS_HWR_SYNCISTEP 1 /* SYNCI step size */ +#define MIPS_HWR_CC 2 /* Cycle counter */ +#define MIPS_HWR_CCRES 3 /* Cycle counter resolution */ +#define MIPS_HWR_ULR 29 /* UserLocal */ +#define MIPS_HWR_IMPL1 30 /* Implementation dependent */ +#define MIPS_HWR_IMPL2 31 /* Implementation dependent */ + +/* Bits in HWREna register */ +#define MIPS_HWRENA_CPUNUM (_ULCAST_(1) << MIPS_HWR_CPUNUM) +#define MIPS_HWRENA_SYNCISTEP (_ULCAST_(1) << MIPS_HWR_SYNCISTEP) +#define MIPS_HWRENA_CC (_ULCAST_(1) << MIPS_HWR_CC) +#define MIPS_HWRENA_CCRES (_ULCAST_(1) << MIPS_HWR_CCRES) +#define MIPS_HWRENA_ULR (_ULCAST_(1) << MIPS_HWR_ULR) +#define MIPS_HWRENA_IMPL1 (_ULCAST_(1) << MIPS_HWR_IMPL1) +#define MIPS_HWRENA_IMPL2 (_ULCAST_(1) << MIPS_HWR_IMPL2) + /* * Bitfields in the TX39 family CP0 Configuration Register 3 */ diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 66e5820bfdae..7176a6057e26 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -619,17 +619,17 @@ static int simulate_rdhwr(struct pt_regs *regs, int rd, int rt) perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, 0); switch (rd) { - case 0: /* CPU number */ + case MIPS_HWR_CPUNUM: /* CPU number */ regs->regs[rt] = smp_processor_id(); return 0; - case 1: /* SYNCI length */ + case MIPS_HWR_SYNCISTEP: /* SYNCI length */ regs->regs[rt] = min(current_cpu_data.dcache.linesz, current_cpu_data.icache.linesz); return 0; - case 2: /* Read count register */ + case MIPS_HWR_CC: /* Read count register */ regs->regs[rt] = read_c0_count(); return 0; - case 3: /* Count register resolution */ + case MIPS_HWR_CCRES: /* Count register resolution */ switch (current_cpu_type()) { case CPU_20KC: case CPU_25KF: @@ -639,7 +639,7 @@ static int simulate_rdhwr(struct pt_regs *regs, int rd, int rt) regs->regs[rt] = 2; } return 0; - case 29: + case MIPS_HWR_ULR: /* Read UserLocal register */ regs->regs[rt] = ti->tp_value; return 0; default: @@ -2070,10 +2070,13 @@ static void configure_hwrena(void) unsigned int hwrena = cpu_hwrena_impl_bits; if (cpu_has_mips_r2_r6) - hwrena |= 0x0000000f; + hwrena |= MIPS_HWRENA_CPUNUM | + MIPS_HWRENA_SYNCISTEP | + MIPS_HWRENA_CC | + MIPS_HWRENA_CCRES; if (!noulri && cpu_has_userlocal) - hwrena |= (1 << 29); + hwrena |= MIPS_HWRENA_ULR; if (hwrena) write_c0_hwrena(hwrena); diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 80bb6212a067..892f36f56d32 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -2296,17 +2296,17 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, goto emulate_ri; } switch (rd) { - case 0: /* CPU number */ + case MIPS_HWR_CPUNUM: /* CPU number */ arch->gprs[rt] = 0; break; - case 1: /* SYNCI length */ + case MIPS_HWR_SYNCISTEP: /* SYNCI length */ arch->gprs[rt] = min(current_cpu_data.dcache.linesz, current_cpu_data.icache.linesz); break; - case 2: /* Read count register */ + case MIPS_HWR_CC: /* Read count register */ arch->gprs[rt] = kvm_mips_read_count(vcpu); break; - case 3: /* Count register resolution */ + case MIPS_HWR_CCRES: /* Count register resolution */ switch (current_cpu_data.cputype) { case CPU_20KC: case CPU_25KF: @@ -2316,7 +2316,7 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, arch->gprs[rt] = 2; } break; - case 29: + case MIPS_HWR_ULR: /* Read UserLocal register */ arch->gprs[rt] = kvm_read_c0_guest_userlocal(cop0); break; From b937ff628fa76b242a74cb9087df972d5f1cecbb Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 15 Jun 2016 19:29:53 +0100 Subject: [PATCH 095/302] MIPS: KVM: Don't hardcode restored HWREna MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM modifies CP0_HWREna during guest execution so it can trap and emulate RDHWR instructions, however it always restores the hardcoded value 0x2000000F. This assumes the presence of the UserLocal register, and the absence of any implementation dependent or future HW registers. Fix by exporting the value that traps.c write into CP0_HWREna, and loading from there instead of hard coding. Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: Paolo Bonzini Cc: Radim Krčmář Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/setup.h | 1 + arch/mips/kernel/traps.c | 5 ++++- arch/mips/kvm/locore.S | 4 ++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/mips/include/asm/setup.h b/arch/mips/include/asm/setup.h index d7bfdeba9e84..4f5279a8308d 100644 --- a/arch/mips/include/asm/setup.h +++ b/arch/mips/include/asm/setup.h @@ -21,6 +21,7 @@ extern void *set_vi_handler(int n, vi_handler_t addr); extern void *set_except_vector(int n, void *addr); extern unsigned long ebase; +extern unsigned int hwrena; extern void per_cpu_trap_init(bool); extern void cpu_cache_init(void); diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 7176a6057e26..6fb4704bd156 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -2064,10 +2064,13 @@ static void configure_status(void) status_set); } +unsigned int hwrena; +EXPORT_SYMBOL_GPL(hwrena); + /* configure HWRENA register */ static void configure_hwrena(void) { - unsigned int hwrena = cpu_hwrena_impl_bits; + hwrena = cpu_hwrena_impl_bits; if (cpu_has_mips_r2_r6) hwrena |= MIPS_HWRENA_CPUNUM | diff --git a/arch/mips/kvm/locore.S b/arch/mips/kvm/locore.S index f87bec546366..698286c0f732 100644 --- a/arch/mips/kvm/locore.S +++ b/arch/mips/kvm/locore.S @@ -381,7 +381,7 @@ NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra) mtc0 k0, CP0_DDATA_LO /* Restore RDHWR access */ - PTR_LI k0, 0x2000000F + INT_L k0, hwrena mtc0 k0, CP0_HWRENA /* Jump to handler */ @@ -553,7 +553,7 @@ __kvm_mips_return_to_host: mtlo k0 /* Restore RDHWR access */ - PTR_LI k0, 0x2000000F + INT_L k0, hwrena mtc0 k0, CP0_HWRENA /* Restore RA, which is the address we will return to */ From cef061d086b1de75445cba63af5306f98fb52f4b Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 15 Jun 2016 19:29:54 +0100 Subject: [PATCH 096/302] MIPS: KVM: Allow ULRI to restrict UserLocal register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ULRI bit in Config3 specifies whether the UserLocal register is implemented, but it is assumed to always be set. Now that the Config registers can be modified by userland, allow Config3.ULRI to be cleared and check ULRI before allowing the corresponding bit to be set in HWREna. In fact any HWREna bits corresponding to unimplemented RDHWR registers should read as zero and be ignored on write, so we actually prevent other unimplemented bits being set too. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/emulate.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 892f36f56d32..84f435bf74bd 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -921,8 +921,8 @@ unsigned int kvm_mips_config1_wrmask(struct kvm_vcpu *vcpu) */ unsigned int kvm_mips_config3_wrmask(struct kvm_vcpu *vcpu) { - /* Config4 is optional */ - unsigned int mask = MIPS_CONF_M; + /* Config4 and ULRI are optional */ + unsigned int mask = MIPS_CONF_M | MIPS_CONF3_ULRI; /* Permit MSA to be present if MSA is supported */ if (kvm_mips_guest_can_have_msa(&vcpu->arch)) @@ -1229,6 +1229,16 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, else kvm_mips_count_enable_cause(vcpu); } + } else if ((rd == MIPS_CP0_HWRENA) && (sel == 0)) { + u32 mask = MIPS_HWRENA_CPUNUM | + MIPS_HWRENA_SYNCISTEP | + MIPS_HWRENA_CC | + MIPS_HWRENA_CCRES; + + if (kvm_read_c0_guest_config3(cop0) & + MIPS_CONF3_ULRI) + mask |= MIPS_HWRENA_ULR; + cop0->reg[rd][sel] = vcpu->arch.gprs[rt] & mask; } else { cop0->reg[rd][sel] = vcpu->arch.gprs[rt]; #ifdef CONFIG_KVM_MIPS_DYN_TRANS From cf1fb0f29d83bcc1d8af912e56f2295397372908 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 15 Jun 2016 19:29:55 +0100 Subject: [PATCH 097/302] MIPS: KVM: Emulate RDHWR CPUNum register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Actually provide the VCPU number when emulating the RDHWR CPUNum register, so that it will match the CPUNum field of CP0_EBase register, rather than always returning 0. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 84f435bf74bd..4ca5450febbb 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -2307,7 +2307,7 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, } switch (rd) { case MIPS_HWR_CPUNUM: /* CPU number */ - arch->gprs[rt] = 0; + arch->gprs[rt] = vcpu->vcpu_id; break; case MIPS_HWR_SYNCISTEP: /* SYNCI length */ arch->gprs[rt] = min(current_cpu_data.dcache.linesz, From 05108709526716e1d40210fe3b9d7acd1cb694ea Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 15 Jun 2016 19:29:56 +0100 Subject: [PATCH 098/302] MIPS: KVM: Add KScratch registers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow up to 6 KVM guest KScratch registers to be enabled and accessed via the KVM guest register API and from the guest itself (the fallback reading and writing of commpage registers is sufficient for KScratch registers to work as expected). User mode can expose the registers by setting the appropriate bits of the guest Config4.KScrExist field. KScratch registers that aren't usable won't be writeable via the KVM Ioctl API. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 6 +++ arch/mips/include/asm/kvm_host.h | 19 +++++++++ arch/mips/kvm/emulate.c | 7 ++- arch/mips/kvm/mips.c | 71 +++++++++++++++++++++++++++++++ arch/mips/kvm/trace.h | 6 +++ arch/mips/kvm/trap_emul.c | 2 + 6 files changed, 110 insertions(+), 1 deletion(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 4aac3e51bf9f..09efa9eb3926 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2032,6 +2032,12 @@ registers, find a list below: MIPS | KVM_REG_MIPS_CP0_CONFIG5 | 32 MIPS | KVM_REG_MIPS_CP0_CONFIG7 | 32 MIPS | KVM_REG_MIPS_CP0_ERROREPC | 64 + MIPS | KVM_REG_MIPS_CP0_KSCRATCH1 | 64 + MIPS | KVM_REG_MIPS_CP0_KSCRATCH2 | 64 + MIPS | KVM_REG_MIPS_CP0_KSCRATCH3 | 64 + MIPS | KVM_REG_MIPS_CP0_KSCRATCH4 | 64 + MIPS | KVM_REG_MIPS_CP0_KSCRATCH5 | 64 + MIPS | KVM_REG_MIPS_CP0_KSCRATCH6 | 64 MIPS | KVM_REG_MIPS_COUNT_CTL | 64 MIPS | KVM_REG_MIPS_COUNT_RESUME | 64 MIPS | KVM_REG_MIPS_COUNT_HZ | 64 diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index f12eb01a3195..5e9da2a31fde 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -56,6 +56,12 @@ #define KVM_REG_MIPS_CP0_CONFIG7 MIPS_CP0_32(16, 7) #define KVM_REG_MIPS_CP0_XCONTEXT MIPS_CP0_64(20, 0) #define KVM_REG_MIPS_CP0_ERROREPC MIPS_CP0_64(30, 0) +#define KVM_REG_MIPS_CP0_KSCRATCH1 MIPS_CP0_64(31, 2) +#define KVM_REG_MIPS_CP0_KSCRATCH2 MIPS_CP0_64(31, 3) +#define KVM_REG_MIPS_CP0_KSCRATCH3 MIPS_CP0_64(31, 4) +#define KVM_REG_MIPS_CP0_KSCRATCH4 MIPS_CP0_64(31, 5) +#define KVM_REG_MIPS_CP0_KSCRATCH5 MIPS_CP0_64(31, 6) +#define KVM_REG_MIPS_CP0_KSCRATCH6 MIPS_CP0_64(31, 7) #define KVM_MAX_VCPUS 1 @@ -376,6 +382,7 @@ struct kvm_vcpu_arch { u8 fpu_enabled; u8 msa_enabled; + u8 kscratch_enabled; }; @@ -429,6 +436,18 @@ struct kvm_vcpu_arch { #define kvm_write_c0_guest_config7(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][7] = (val)) #define kvm_read_c0_guest_errorepc(cop0) (cop0->reg[MIPS_CP0_ERROR_PC][0]) #define kvm_write_c0_guest_errorepc(cop0, val) (cop0->reg[MIPS_CP0_ERROR_PC][0] = (val)) +#define kvm_read_c0_guest_kscratch1(cop0) (cop0->reg[MIPS_CP0_DESAVE][2]) +#define kvm_read_c0_guest_kscratch2(cop0) (cop0->reg[MIPS_CP0_DESAVE][3]) +#define kvm_read_c0_guest_kscratch3(cop0) (cop0->reg[MIPS_CP0_DESAVE][4]) +#define kvm_read_c0_guest_kscratch4(cop0) (cop0->reg[MIPS_CP0_DESAVE][5]) +#define kvm_read_c0_guest_kscratch5(cop0) (cop0->reg[MIPS_CP0_DESAVE][6]) +#define kvm_read_c0_guest_kscratch6(cop0) (cop0->reg[MIPS_CP0_DESAVE][7]) +#define kvm_write_c0_guest_kscratch1(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][2] = (val)) +#define kvm_write_c0_guest_kscratch2(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][3] = (val)) +#define kvm_write_c0_guest_kscratch3(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][4] = (val)) +#define kvm_write_c0_guest_kscratch4(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][5] = (val)) +#define kvm_write_c0_guest_kscratch5(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][6] = (val)) +#define kvm_write_c0_guest_kscratch6(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][7] = (val)) /* * Some of the guest registers may be modified asynchronously (e.g. from a diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 4ca5450febbb..5f0354c80c8e 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -941,7 +941,12 @@ unsigned int kvm_mips_config3_wrmask(struct kvm_vcpu *vcpu) unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu) { /* Config5 is optional */ - return MIPS_CONF_M; + unsigned int mask = MIPS_CONF_M; + + /* KScrExist */ + mask |= (unsigned int)vcpu->arch.kscratch_enabled << 16; + + return mask; } /** diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 622b9feba927..5a2b9034a05c 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -9,6 +9,7 @@ * Authors: Sanjay Lal */ +#include #include #include #include @@ -548,6 +549,15 @@ static u64 kvm_mips_get_one_regs_msa[] = { KVM_REG_MIPS_MSA_CSR, }; +static u64 kvm_mips_get_one_regs_kscratch[] = { + KVM_REG_MIPS_CP0_KSCRATCH1, + KVM_REG_MIPS_CP0_KSCRATCH2, + KVM_REG_MIPS_CP0_KSCRATCH3, + KVM_REG_MIPS_CP0_KSCRATCH4, + KVM_REG_MIPS_CP0_KSCRATCH5, + KVM_REG_MIPS_CP0_KSCRATCH6, +}; + static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu) { unsigned long ret; @@ -561,6 +571,7 @@ static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu) } if (kvm_mips_guest_can_have_msa(&vcpu->arch)) ret += ARRAY_SIZE(kvm_mips_get_one_regs_msa) + 32; + ret += __arch_hweight8(vcpu->arch.kscratch_enabled); ret += kvm_mips_callbacks->num_regs(vcpu); return ret; @@ -613,6 +624,16 @@ static int kvm_mips_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices) } } + for (i = 0; i < 6; ++i) { + if (!(vcpu->arch.kscratch_enabled & BIT(i + 2))) + continue; + + if (copy_to_user(indices, &kvm_mips_get_one_regs_kscratch[i], + sizeof(kvm_mips_get_one_regs_kscratch[i]))) + return -EFAULT; + ++indices; + } + return kvm_mips_callbacks->copy_reg_indices(vcpu, indices); } @@ -765,6 +786,31 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_CP0_ERROREPC: v = (long)kvm_read_c0_guest_errorepc(cop0); break; + case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6: + idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2; + if (!(vcpu->arch.kscratch_enabled & BIT(idx))) + return -EINVAL; + switch (idx) { + case 2: + v = (long)kvm_read_c0_guest_kscratch1(cop0); + break; + case 3: + v = (long)kvm_read_c0_guest_kscratch2(cop0); + break; + case 4: + v = (long)kvm_read_c0_guest_kscratch3(cop0); + break; + case 5: + v = (long)kvm_read_c0_guest_kscratch4(cop0); + break; + case 6: + v = (long)kvm_read_c0_guest_kscratch5(cop0); + break; + case 7: + v = (long)kvm_read_c0_guest_kscratch6(cop0); + break; + } + break; /* registers to be handled specially */ default: ret = kvm_mips_callbacks->get_one_reg(vcpu, reg, &v); @@ -931,6 +977,31 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_CP0_ERROREPC: kvm_write_c0_guest_errorepc(cop0, v); break; + case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6: + idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2; + if (!(vcpu->arch.kscratch_enabled & BIT(idx))) + return -EINVAL; + switch (idx) { + case 2: + kvm_write_c0_guest_kscratch1(cop0, v); + break; + case 3: + kvm_write_c0_guest_kscratch2(cop0, v); + break; + case 4: + kvm_write_c0_guest_kscratch3(cop0, v); + break; + case 5: + kvm_write_c0_guest_kscratch4(cop0, v); + break; + case 6: + kvm_write_c0_guest_kscratch5(cop0, v); + break; + case 7: + kvm_write_c0_guest_kscratch6(cop0, v); + break; + } + break; /* registers to be handled specially */ default: return kvm_mips_callbacks->set_one_reg(vcpu, reg, v); diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h index 5d712ecb0734..a38bdab68574 100644 --- a/arch/mips/kvm/trace.h +++ b/arch/mips/kvm/trace.h @@ -178,6 +178,12 @@ TRACE_EVENT(kvm_exit, { KVM_TRACE_COP0(16, 7), "Config7" }, \ { KVM_TRACE_COP0(26, 0), "ECC" }, \ { KVM_TRACE_COP0(30, 0), "ErrorEPC" }, \ + { KVM_TRACE_COP0(31, 2), "KScratch1" }, \ + { KVM_TRACE_COP0(31, 3), "KScratch2" }, \ + { KVM_TRACE_COP0(31, 4), "KScratch3" }, \ + { KVM_TRACE_COP0(31, 5), "KScratch4" }, \ + { KVM_TRACE_COP0(31, 6), "KScratch5" }, \ + { KVM_TRACE_COP0(31, 7), "KScratch6" }, \ { KVM_TRACE_HWR( 0, 0), "CPUNum" }, \ { KVM_TRACE_HWR( 1, 0), "SYNCI_Step" }, \ { KVM_TRACE_HWR( 2, 0), "CC" }, \ diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index b64ca1a222f7..eb191c4612bb 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -418,6 +418,8 @@ static int kvm_trap_emul_vm_init(struct kvm *kvm) static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu) { + vcpu->arch.kscratch_enabled = 0xfc; + return 0; } From 42aa12e74e91f790d239bfb852260d07573ce83f Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 15 Jun 2016 19:29:57 +0100 Subject: [PATCH 099/302] MIPS: KVM: Move commpage so 0x0 is unmapped MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The comm page which is mapped into the guest kernel address space at 0x0 has the unfortunate side effect of allowing guest kernel NULL pointer dereferences to succeed. The only constraint on this address is that it must be within 32KiB of 0x0, so that single lw/sw instructions (which have 16-bit signed offset fields) can be used to access it, using the zero register as a base. So lets move the comm page as high as possible within that constraint so that 0x0 can be left unmapped, at least for page sizes < 32KiB. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 10 ++++++++-- arch/mips/kvm/commpage.c | 2 +- arch/mips/kvm/dyntrans.c | 4 ++-- arch/mips/kvm/tlb.c | 18 +++++++++--------- 4 files changed, 20 insertions(+), 14 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 5e9da2a31fde..6c43c782bdfa 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -74,8 +74,14 @@ -/* Special address that contains the comm page, used for reducing # of traps */ -#define KVM_GUEST_COMMPAGE_ADDR 0x0 +/* + * Special address that contains the comm page, used for reducing # of traps + * This needs to be within 32Kb of 0x0 (so the zero register can be used), but + * preferably not at 0x0 so that most kernel NULL pointer dereferences can be + * caught. + */ +#define KVM_GUEST_COMMPAGE_ADDR ((PAGE_SIZE > 0x8000) ? 0 : \ + (0x8000 - PAGE_SIZE)) #define KVM_GUEST_KERNEL_MODE(vcpu) ((kvm_read_c0_guest_status(vcpu->arch.cop0) & (ST0_EXL | ST0_ERL)) || \ ((kvm_read_c0_guest_status(vcpu->arch.cop0) & KSU_USER) == 0)) diff --git a/arch/mips/kvm/commpage.c b/arch/mips/kvm/commpage.c index 2d6e976d1add..a36b77e1705c 100644 --- a/arch/mips/kvm/commpage.c +++ b/arch/mips/kvm/commpage.c @@ -4,7 +4,7 @@ * for more details. * * commpage, currently used for Virtual COP0 registers. - * Mapped into the guest kernel @ 0x0. + * Mapped into the guest kernel @ KVM_GUEST_COMMPAGE_ADDR. * * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved. * Authors: Sanjay Lal diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c index a3031dae8d1b..8a1833b9eb38 100644 --- a/arch/mips/kvm/dyntrans.c +++ b/arch/mips/kvm/dyntrans.c @@ -93,7 +93,7 @@ int kvm_mips_trans_mfc0(union mips_instruction inst, u32 *opc, } else { mfc0_inst.i_format.opcode = lw_op; mfc0_inst.i_format.rt = inst.c0r_format.rt; - mfc0_inst.i_format.simmediate = + mfc0_inst.i_format.simmediate = KVM_GUEST_COMMPAGE_ADDR | offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]); } @@ -111,7 +111,7 @@ int kvm_mips_trans_mtc0(union mips_instruction inst, u32 *opc, mtc0_inst.i_format.opcode = sw_op; mtc0_inst.i_format.rt = inst.c0r_format.rt; - mtc0_inst.i_format.simmediate = + mtc0_inst.i_format.simmediate = KVM_GUEST_COMMPAGE_ADDR | offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]); return kvm_mips_trans_replace(vcpu, opc, mtc0_inst); diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 8012e686d4ae..385fbd34e77d 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -171,23 +171,23 @@ EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_write); int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, struct kvm_vcpu *vcpu) { - kvm_pfn_t pfn0, pfn1; + kvm_pfn_t pfn; unsigned long flags, old_entryhi = 0, vaddr = 0; - unsigned long entrylo0 = 0, entrylo1 = 0; + unsigned long entrylo[2] = { 0, 0 }; + unsigned int pair_idx; - pfn0 = CPHYSADDR(vcpu->arch.kseg0_commpage) >> PAGE_SHIFT; - pfn1 = 0; - entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | - (0x3 << ENTRYLO_C_SHIFT) | ENTRYLO_D | ENTRYLO_V; - entrylo1 = 0; + pfn = CPHYSADDR(vcpu->arch.kseg0_commpage) >> PAGE_SHIFT; + pair_idx = (badvaddr >> PAGE_SHIFT) & 1; + entrylo[pair_idx] = mips3_paddr_to_tlbpfn(pfn << PAGE_SHIFT) | + (0x3 << ENTRYLO_C_SHIFT) | ENTRYLO_D | ENTRYLO_V; local_irq_save(flags); old_entryhi = read_c0_entryhi(); vaddr = badvaddr & (PAGE_MASK << 1); write_c0_entryhi(vaddr | kvm_mips_get_kernel_asid(vcpu)); - write_c0_entrylo0(entrylo0); - write_c0_entrylo1(entrylo1); + write_c0_entrylo0(entrylo[0]); + write_c0_entrylo1(entrylo[1]); write_c0_index(kvm_mips_get_commpage_asid(vcpu)); mtc0_tlbw_hazard(); tlb_write_indexed(); From 7414d2f65006ac8609196092f2869e0942599b72 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 15 Jun 2016 19:29:58 +0100 Subject: [PATCH 100/302] MIPS: KVM: Use host CCA for TLB mappings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM TLB mappings for the guest were being created with a cache coherency attribute (CCA) of 3, which is cached incoherent. Create them instead with the default host CCA, which should be the correct one for coherency on SMP systems. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mmu.c | 18 ++++++++++-------- arch/mips/kvm/tlb.c | 3 ++- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 2f494ec5c939..ecead748de04 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -116,9 +116,11 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, pfn1 = kvm->arch.guest_pmap[gfn | 0x1]; entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | - (0x3 << ENTRYLO_C_SHIFT) | ENTRYLO_D | ENTRYLO_V; + ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | + ENTRYLO_D | ENTRYLO_V; entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | - (0x3 << ENTRYLO_C_SHIFT) | ENTRYLO_D | ENTRYLO_V; + ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | + ENTRYLO_D | ENTRYLO_V; preempt_disable(); entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu)); @@ -157,13 +159,13 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, /* Get attributes from the Guest TLB */ entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | - (0x3 << ENTRYLO_C_SHIFT) | - (tlb->tlb_lo[0] & ENTRYLO_D) | - (tlb->tlb_lo[0] & ENTRYLO_V); + ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | + (tlb->tlb_lo[0] & ENTRYLO_D) | + (tlb->tlb_lo[0] & ENTRYLO_V); entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | - (0x3 << ENTRYLO_C_SHIFT) | - (tlb->tlb_lo[1] & ENTRYLO_D) | - (tlb->tlb_lo[1] & ENTRYLO_V); + ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | + (tlb->tlb_lo[1] & ENTRYLO_D) | + (tlb->tlb_lo[1] & ENTRYLO_V); kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc, tlb->tlb_lo[0], tlb->tlb_lo[1]); diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 385fbd34e77d..9699352293e4 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -179,7 +179,8 @@ int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, pfn = CPHYSADDR(vcpu->arch.kseg0_commpage) >> PAGE_SHIFT; pair_idx = (badvaddr >> PAGE_SHIFT) & 1; entrylo[pair_idx] = mips3_paddr_to_tlbpfn(pfn << PAGE_SHIFT) | - (0x3 << ENTRYLO_C_SHIFT) | ENTRYLO_D | ENTRYLO_V; + ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | + ENTRYLO_D | ENTRYLO_V; local_irq_save(flags); From 4b34bca0e4c7091a06d774342faf8c9a4836af22 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 15 Jun 2016 19:29:59 +0100 Subject: [PATCH 101/302] MIPS: Add define for Config.VI (virtual icache) bit The Config.VI bit specifies that the instruction cache is virtually tagged, which is checked in c-r4k.c's probe_pcache(). Add a proper definition for it in mipsregs.h and make use of it. Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: linux-mips@linux-mips.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/mipsregs.h | 1 + arch/mips/mm/c-r4k.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/mips/include/asm/mipsregs.h b/arch/mips/include/asm/mipsregs.h index 8b1b37d50d15..def9d8d13f6e 100644 --- a/arch/mips/include/asm/mipsregs.h +++ b/arch/mips/include/asm/mipsregs.h @@ -533,6 +533,7 @@ #define TX49_CONF_CWFON (_ULCAST_(1) << 27) /* Bits specific to the MIPS32/64 PRA. */ +#define MIPS_CONF_VI (_ULCAST_(1) << 3) #define MIPS_CONF_MT (_ULCAST_(7) << 7) #define MIPS_CONF_MT_TLB (_ULCAST_(1) << 7) #define MIPS_CONF_MT_FTLB (_ULCAST_(4) << 7) diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c index ef7f925dd1b0..7a9c345e87e5 100644 --- a/arch/mips/mm/c-r4k.c +++ b/arch/mips/mm/c-r4k.c @@ -1206,7 +1206,7 @@ static void probe_pcache(void) c->icache.linesz; c->icache.waybit = __ffs(icache_size/c->icache.ways); - if (config & 0x8) /* VI bit */ + if (config & MIPS_CONF_VI) c->icache.flags |= MIPS_CACHE_VTAG; /* From e342925f1777f73befda61b48845b0bc88a33181 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 15 Jun 2016 19:30:00 +0100 Subject: [PATCH 102/302] MIPS: KVM: Report more accurate CP0_Config fields to guest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Initialise the guest's CP0_Config register with a few more bits of information from the host. The BE bit should be set on big endian machines, the VI bit should be set on machines with a virtually tagged instruction cache, and the reported architecture revision should match that of the host (since we won't support emulating pre-r6 instruction encodings on r6 or vice versa). Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/trap_emul.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index eb191c4612bb..1dc003ddca91 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -426,7 +426,7 @@ static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu) static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; - u32 config1; + u32 config, config1; int vcpu_id = vcpu->vcpu_id; /* @@ -434,10 +434,20 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) * guest will come up as expected, for now we simulate a MIPS 24kc */ kvm_write_c0_guest_prid(cop0, 0x00019300); - /* Have config1, Cacheable, noncoherent, write-back, write allocate */ - kvm_write_c0_guest_config(cop0, MIPS_CONF_M | (0x3 << CP0C0_K0) | - (0x1 << CP0C0_AR) | - (MMU_TYPE_R4000 << CP0C0_MT)); + /* + * Have config1, Cacheable, noncoherent, write-back, write allocate. + * Endianness, arch revision & virtually tagged icache should match + * host. + */ + config = read_c0_config() & MIPS_CONF_AR; + config |= MIPS_CONF_M | (0x3 << CP0C0_K0) | + (MMU_TYPE_R4000 << CP0C0_MT); +#ifdef CONFIG_CPU_BIG_ENDIAN + config |= CONF_BE; +#endif + if (cpu_has_vtag_icache) + config |= MIPS_CONF_VI; + kvm_write_c0_guest_config(cop0, config); /* Read the cache characteristics from the host Config1 Register */ config1 = (read_c0_config1() & ~0x7f); From 4e10b764e2cba8d8eb5e22d9d8061806ec86805c Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 15 Jun 2016 19:30:01 +0100 Subject: [PATCH 103/302] MIPS: KVM: Use mipsregs.h defs for config registers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert MIPS KVM guest register state initialisation to use the standard register field definitions for Config registers, and drop the custom definitions in kvm_host.h which it was using before. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 67 -------------------------------- arch/mips/kvm/trap_emul.c | 8 ++-- 2 files changed, 3 insertions(+), 72 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 6c43c782bdfa..b0773c6d622f 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -205,73 +205,6 @@ struct mips_coproc { #define MIPS_CP0_CONFIG4_SEL 4 #define MIPS_CP0_CONFIG5_SEL 5 -/* Config0 register bits */ -#define CP0C0_M 31 -#define CP0C0_K23 28 -#define CP0C0_KU 25 -#define CP0C0_MDU 20 -#define CP0C0_MM 17 -#define CP0C0_BM 16 -#define CP0C0_BE 15 -#define CP0C0_AT 13 -#define CP0C0_AR 10 -#define CP0C0_MT 7 -#define CP0C0_VI 3 -#define CP0C0_K0 0 - -/* Config1 register bits */ -#define CP0C1_M 31 -#define CP0C1_MMU 25 -#define CP0C1_IS 22 -#define CP0C1_IL 19 -#define CP0C1_IA 16 -#define CP0C1_DS 13 -#define CP0C1_DL 10 -#define CP0C1_DA 7 -#define CP0C1_C2 6 -#define CP0C1_MD 5 -#define CP0C1_PC 4 -#define CP0C1_WR 3 -#define CP0C1_CA 2 -#define CP0C1_EP 1 -#define CP0C1_FP 0 - -/* Config2 Register bits */ -#define CP0C2_M 31 -#define CP0C2_TU 28 -#define CP0C2_TS 24 -#define CP0C2_TL 20 -#define CP0C2_TA 16 -#define CP0C2_SU 12 -#define CP0C2_SS 8 -#define CP0C2_SL 4 -#define CP0C2_SA 0 - -/* Config3 Register bits */ -#define CP0C3_M 31 -#define CP0C3_ISA_ON_EXC 16 -#define CP0C3_ULRI 13 -#define CP0C3_DSPP 10 -#define CP0C3_LPA 7 -#define CP0C3_VEIC 6 -#define CP0C3_VInt 5 -#define CP0C3_SP 4 -#define CP0C3_MT 2 -#define CP0C3_SM 1 -#define CP0C3_TL 0 - -/* MMU types, the first four entries have the same layout as the - CP0C0_MT field. */ -enum mips_mmu_types { - MMU_TYPE_NONE, - MMU_TYPE_R4000, - MMU_TYPE_RESERVED, - MMU_TYPE_FMT, - MMU_TYPE_R3000, - MMU_TYPE_R6000, - MMU_TYPE_R8000 -}; - /* Resume Flags */ #define RESUME_FLAG_DR (1<<0) /* Reload guest nonvolatile state? */ #define RESUME_FLAG_HOST (1<<1) /* Resume host? */ diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 1dc003ddca91..00e8dc3d36cb 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -440,8 +440,7 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) * host. */ config = read_c0_config() & MIPS_CONF_AR; - config |= MIPS_CONF_M | (0x3 << CP0C0_K0) | - (MMU_TYPE_R4000 << CP0C0_MT); + config |= MIPS_CONF_M | CONF_CM_CACHABLE_NONCOHERENT | MIPS_CONF_MT_TLB; #ifdef CONFIG_CPU_BIG_ENDIAN config |= CONF_BE; #endif @@ -457,9 +456,8 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) config1 |= ((KVM_MIPS_GUEST_TLB_SIZE - 1) << 25); /* We unset some bits that we aren't emulating */ - config1 &= - ~((1 << CP0C1_C2) | (1 << CP0C1_MD) | (1 << CP0C1_PC) | - (1 << CP0C1_WR) | (1 << CP0C1_CA)); + config1 &= ~(MIPS_CONF1_C2 | MIPS_CONF1_MD | MIPS_CONF1_PC | + MIPS_CONF1_WR | MIPS_CONF1_CA); kvm_write_c0_guest_config1(cop0, config1); /* Have config3, no tertiary/secondary caches implemented */ From 682a8108872f78560c891cf30c7d08aa01dac943 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 9 May 2016 11:53:06 +0200 Subject: [PATCH 104/302] x86/kvm/svm: Simplify cpu_has_svm() Use already cached CPUID information instead of querying CPUID again. No functionality change. Signed-off-by: Borislav Petkov Cc: Joerg Roedel Cc: kvm@vger.kernel.org Cc: x86@kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/svm.h | 1 - arch/x86/include/asm/virtext.h | 8 ++------ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index d0fe23ec7e98..14824fc78f7e 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -193,7 +193,6 @@ struct __attribute__ ((__packed__)) vmcb { struct vmcb_save_area save; }; -#define SVM_CPUID_FEATURE_SHIFT 2 #define SVM_CPUID_FUNC 0x8000000a #define SVM_VM_CR_SVM_DISABLE 4 diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index cce9ee68e335..0116b2ee9e64 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -83,23 +83,19 @@ static inline void cpu_emergency_vmxoff(void) */ static inline int cpu_has_svm(const char **msg) { - uint32_t eax, ebx, ecx, edx; - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { if (msg) *msg = "not amd"; return 0; } - cpuid(0x80000000, &eax, &ebx, &ecx, &edx); - if (eax < SVM_CPUID_FUNC) { + if (boot_cpu_data.extended_cpuid_level < SVM_CPUID_FUNC) { if (msg) *msg = "can't execute cpuid_8000000a"; return 0; } - cpuid(0x80000001, &eax, &ebx, &ecx, &edx); - if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) { + if (!boot_cpu_has(X86_FEATURE_SVM)) { if (msg) *msg = "svm not available"; return 0; From 6c7caebc26c5f0b618f0ef6b851e9f5f27c3812f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Jun 2016 14:48:25 +0200 Subject: [PATCH 105/302] KVM: introduce kvm->created_vcpus The race between creating the irqchip and the first VCPU is currently fixed by checking the presence of an irqchip before updating kvm->online_vcpus, and undoing the whole VCPU creation if someone created the irqchip in the meanwhile. Instead, introduce a new field in struct kvm that will count VCPUs under a mutex, without the atomic access and memory ordering that we need elsewhere to protect the vcpus array. This also plugs the race and is more easily applicable in all similar circumstances. Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 8 ++++++++ virt/kvm/kvm_main.c | 23 +++++++++++++++++------ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1c9c973a7dd9..63c6ab30bc81 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -371,7 +371,15 @@ struct kvm { struct srcu_struct srcu; struct srcu_struct irq_srcu; struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + + /* + * created_vcpus is protected by kvm->lock, and is incremented + * at the beginning of KVM_CREATE_VCPU. online_vcpus is only + * incremented after storing the kvm_vcpu pointer in vcpus, + * and is accessed atomically. + */ atomic_t online_vcpus; + int created_vcpus; int last_boosted_vcpu; struct list_head vm_list; struct mutex lock; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 02e98f3131bd..15b757ae64e1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2346,9 +2346,20 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) if (id >= KVM_MAX_VCPU_ID) return -EINVAL; + mutex_lock(&kvm->lock); + if (kvm->created_vcpus == KVM_MAX_VCPUS) { + mutex_unlock(&kvm->lock); + return -EINVAL; + } + + kvm->created_vcpus++; + mutex_unlock(&kvm->lock); + vcpu = kvm_arch_vcpu_create(kvm, id); - if (IS_ERR(vcpu)) - return PTR_ERR(vcpu); + if (IS_ERR(vcpu)) { + r = PTR_ERR(vcpu); + goto vcpu_decrement; + } preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); @@ -2361,10 +2372,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) r = -EINVAL; goto unlock_vcpu_destroy; } - if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { - r = -EINVAL; - goto unlock_vcpu_destroy; - } if (kvm_get_vcpu_by_id(kvm, id)) { r = -EEXIST; goto unlock_vcpu_destroy; @@ -2397,6 +2404,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) mutex_unlock(&kvm->lock); vcpu_destroy: kvm_arch_vcpu_destroy(vcpu); +vcpu_decrement: + mutex_lock(&kvm->lock); + kvm->created_vcpus--; + mutex_unlock(&kvm->lock); return r; } From 557abc40d121358883d2da8bc8bf976d6e8ec332 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Jun 2016 14:50:04 +0200 Subject: [PATCH 106/302] KVM: remove kvm_vcpu_compatible The new created_vcpus field makes it possible to avoid the race between irqchip and VCPU creation in a much nicer way; just check under kvm->lock whether a VCPU has already been created. We can then remove KVM_APIC_ARCHITECTURE too, because at this point the symbol is only governing the default definition of kvm_vcpu_compatible. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Kconfig | 1 - arch/x86/kvm/x86.c | 11 +++-------- include/linux/kvm_host.h | 6 ------ virt/kvm/Kconfig | 3 --- virt/kvm/kvm_main.c | 4 ---- 5 files changed, 3 insertions(+), 22 deletions(-) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 639a6e34500c..ab8e32f7b9a8 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -32,7 +32,6 @@ config KVM select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_EVENTFD - select KVM_APIC_ARCHITECTURE select KVM_ASYNC_PF select USER_RETURN_NOTIFIER select KVM_MMIO diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bf227212aebb..ab2f45a50bb5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3774,7 +3774,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, r = -EEXIST; if (irqchip_in_kernel(kvm)) goto split_irqchip_unlock; - if (atomic_read(&kvm->online_vcpus)) + if (kvm->created_vcpus) goto split_irqchip_unlock; r = kvm_setup_empty_irq_routing(kvm); if (r) @@ -3839,7 +3839,7 @@ long kvm_arch_vm_ioctl(struct file *filp, if (kvm->arch.vpic) goto create_irqchip_unlock; r = -EINVAL; - if (atomic_read(&kvm->online_vcpus)) + if (kvm->created_vcpus) goto create_irqchip_unlock; r = -ENOMEM; vpic = kvm_create_pic(kvm); @@ -3995,7 +3995,7 @@ long kvm_arch_vm_ioctl(struct file *filp, case KVM_SET_BOOT_CPU_ID: r = 0; mutex_lock(&kvm->lock); - if (atomic_read(&kvm->online_vcpus) != 0) + if (kvm->created_vcpus) r = -EBUSY; else kvm->arch.bsp_vcpu_id = arg; @@ -7639,11 +7639,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0; } -bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) -{ - return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu); -} - struct static_key kvm_no_apic_vcpu __read_mostly; EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 63c6ab30bc81..0640ee92b978 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1105,12 +1105,6 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) #endif /* CONFIG_HAVE_KVM_EVENTFD */ -#ifdef CONFIG_KVM_APIC_ARCHITECTURE -bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu); -#else -static inline bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return true; } -#endif - static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) { /* diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index e5d6108f5e85..b0cc1a34db27 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -16,9 +16,6 @@ config HAVE_KVM_EVENTFD bool select EVENTFD -config KVM_APIC_ARCHITECTURE - bool - config KVM_MMIO bool diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 15b757ae64e1..ef54b4c31792 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2368,10 +2368,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) goto vcpu_destroy; mutex_lock(&kvm->lock); - if (!kvm_vcpu_compatible(vcpu)) { - r = -EINVAL; - goto unlock_vcpu_destroy; - } if (kvm_get_vcpu_by_id(kvm, id)) { r = -EEXIST; goto unlock_vcpu_destroy; From a03825bbd0c39feeba605912cdbc28e79e4e01e1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Jun 2016 14:50:04 +0200 Subject: [PATCH 107/302] KVM: s390: use kvm->created_vcpus The new created_vcpus field avoids possible races between enabling capabilities and creating VCPUs. Acked-by: Christian Borntraeger Signed-off-by: Paolo Bonzini --- arch/s390/kvm/kvm-s390.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 49c60393a15c..0dcf9b8fc12c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -422,7 +422,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) break; case KVM_CAP_S390_VECTOR_REGISTERS: mutex_lock(&kvm->lock); - if (atomic_read(&kvm->online_vcpus)) { + if (kvm->created_vcpus) { r = -EBUSY; } else if (MACHINE_HAS_VX) { set_kvm_facility(kvm->arch.model.fac_mask, 129); @@ -437,7 +437,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) case KVM_CAP_S390_RI: r = -EINVAL; mutex_lock(&kvm->lock); - if (atomic_read(&kvm->online_vcpus)) { + if (kvm->created_vcpus) { r = -EBUSY; } else if (test_facility(64)) { set_kvm_facility(kvm->arch.model.fac_mask, 64); @@ -492,7 +492,7 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att ret = -EBUSY; VM_EVENT(kvm, 3, "%s", "ENABLE: CMMA support"); mutex_lock(&kvm->lock); - if (atomic_read(&kvm->online_vcpus) == 0) { + if (!kvm->created_vcpus) { kvm->arch.use_cmma = 1; ret = 0; } @@ -536,7 +536,7 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att ret = -EBUSY; mutex_lock(&kvm->lock); - if (atomic_read(&kvm->online_vcpus) == 0) { + if (!kvm->created_vcpus) { /* gmap_alloc will round the limit up */ struct gmap *new = gmap_alloc(current->mm, new_limit); @@ -713,7 +713,7 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr) int ret = 0; mutex_lock(&kvm->lock); - if (atomic_read(&kvm->online_vcpus)) { + if (kvm->created_vcpus) { ret = -EBUSY; goto out; } From 53f9eedff713bab262b64682ad1abb1e8116d041 Mon Sep 17 00:00:00 2001 From: Yunhong Jiang Date: Mon, 13 Jun 2016 14:20:00 -0700 Subject: [PATCH 108/302] kvm: lapic: separate start_sw_tscdeadline from start_apic_timer The function to start the tsc deadline timer virtualization will be used also by the pre_block hook when we use the preemption timer; change it to a separate function. No logic changes. Signed-off-by: Yunhong Jiang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 57 ++++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index bbb5b283ff63..f1cf8a5ede11 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1313,6 +1313,36 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) __delay(tsc_deadline - guest_tsc); } +static void start_sw_tscdeadline(struct kvm_lapic *apic) +{ + u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; + u64 ns = 0; + ktime_t expire; + struct kvm_vcpu *vcpu = apic->vcpu; + unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; + unsigned long flags; + ktime_t now; + + if (unlikely(!tscdeadline || !this_tsc_khz)) + return; + + local_irq_save(flags); + + now = apic->lapic_timer.timer.base->get_time(); + guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); + if (likely(tscdeadline > guest_tsc)) { + ns = (tscdeadline - guest_tsc) * 1000000ULL; + do_div(ns, this_tsc_khz); + expire = ktime_add_ns(now, ns); + expire = ktime_sub_ns(expire, lapic_timer_advance_ns); + hrtimer_start(&apic->lapic_timer.timer, + expire, HRTIMER_MODE_ABS_PINNED); + } else + apic_timer_expired(apic); + + local_irq_restore(flags); +} + static void start_apic_timer(struct kvm_lapic *apic) { ktime_t now; @@ -1359,32 +1389,7 @@ static void start_apic_timer(struct kvm_lapic *apic) ktime_to_ns(ktime_add_ns(now, apic->lapic_timer.period))); } else if (apic_lvtt_tscdeadline(apic)) { - /* lapic timer in tsc deadline mode */ - u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; - u64 ns = 0; - ktime_t expire; - struct kvm_vcpu *vcpu = apic->vcpu; - unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; - unsigned long flags; - - if (unlikely(!tscdeadline || !this_tsc_khz)) - return; - - local_irq_save(flags); - - now = apic->lapic_timer.timer.base->get_time(); - guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); - if (likely(tscdeadline > guest_tsc)) { - ns = (tscdeadline - guest_tsc) * 1000000ULL; - do_div(ns, this_tsc_khz); - expire = ktime_add_ns(now, ns); - expire = ktime_sub_ns(expire, lapic_timer_advance_ns); - hrtimer_start(&apic->lapic_timer.timer, - expire, HRTIMER_MODE_ABS_PINNED); - } else - apic_timer_expired(apic); - - local_irq_restore(flags); + start_sw_tscdeadline(apic); } } From ce7a058a2117f0bca2f42f2870a97bfa9aa8e099 Mon Sep 17 00:00:00 2001 From: Yunhong Jiang Date: Mon, 13 Jun 2016 14:20:01 -0700 Subject: [PATCH 109/302] KVM: x86: support using the vmx preemption timer for tsc deadline timer The VMX preemption timer can be used to virtualize the TSC deadline timer. The VMX preemption timer is armed when the vCPU is running, and a VMExit will happen if the virtual TSC deadline timer expires. When the vCPU thread is blocked because of HLT, KVM will switch to use an hrtimer, and then go back to the VMX preemption timer when the vCPU thread is unblocked. This solution avoids the complex OS's hrtimer system, and the host timer interrupt handling cost, replacing them with a little math (for guest->host TSC and host TSC->preemption timer conversion) and a cheaper VMexit. This benefits latency for isolated pCPUs. [A word about performance... Yunhong reported a 30% reduction in average latency from cyclictest. I made a similar test with tscdeadline_latency from kvm-unit-tests, and measured - ~20 clock cycles loss (out of ~3200, so less than 1% but still statistically significant) in the worst case where the test halts just after programming the TSC deadline timer - ~800 clock cycles gain (25% reduction in latency) in the best case where the test busy waits. I removed the VMX bits from Yunhong's patch, to concentrate them in the next patch - Paolo] Signed-off-by: Yunhong Jiang Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++ arch/x86/kvm/lapic.c | 73 ++++++++++++++++++++++++++++++++- arch/x86/kvm/lapic.h | 5 +++ arch/x86/kvm/trace.h | 15 +++++++ arch/x86/kvm/x86.c | 5 +++ 5 files changed, 100 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e0fbe7e70dc1..e055f3787dc9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1005,6 +1005,9 @@ struct kvm_x86_ops { int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); + + int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc); + void (*cancel_hv_timer)(struct kvm_vcpu *vcpu); }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index f1cf8a5ede11..fdc05ae08bac 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1343,6 +1343,68 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) local_irq_restore(flags); } +bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.apic->lapic_timer.hv_timer_in_use; +} +EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); + +void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + WARN_ON(!apic->lapic_timer.hv_timer_in_use); + WARN_ON(swait_active(&vcpu->wq)); + kvm_x86_ops->cancel_hv_timer(vcpu); + apic->lapic_timer.hv_timer_in_use = false; + apic_timer_expired(apic); +} +EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); + +void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + WARN_ON(apic->lapic_timer.hv_timer_in_use); + + if (apic_lvtt_tscdeadline(apic) && + !atomic_read(&apic->lapic_timer.pending)) { + u64 tscdeadline = apic->lapic_timer.tscdeadline; + + if (!kvm_x86_ops->set_hv_timer(vcpu, tscdeadline)) { + apic->lapic_timer.hv_timer_in_use = true; + hrtimer_cancel(&apic->lapic_timer.timer); + + /* In case the sw timer triggered in the window */ + if (atomic_read(&apic->lapic_timer.pending)) { + apic->lapic_timer.hv_timer_in_use = false; + kvm_x86_ops->cancel_hv_timer(apic->vcpu); + } + } + trace_kvm_hv_timer_state(vcpu->vcpu_id, + apic->lapic_timer.hv_timer_in_use); + } +} +EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); + +void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + /* Possibly the TSC deadline timer is not enabled yet */ + if (!apic->lapic_timer.hv_timer_in_use) + return; + + kvm_x86_ops->cancel_hv_timer(vcpu); + apic->lapic_timer.hv_timer_in_use = false; + + if (atomic_read(&apic->lapic_timer.pending)) + return; + + start_sw_tscdeadline(apic); +} +EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); + static void start_apic_timer(struct kvm_lapic *apic) { ktime_t now; @@ -1389,7 +1451,16 @@ static void start_apic_timer(struct kvm_lapic *apic) ktime_to_ns(ktime_add_ns(now, apic->lapic_timer.period))); } else if (apic_lvtt_tscdeadline(apic)) { - start_sw_tscdeadline(apic); + /* lapic timer in tsc deadline mode */ + u64 tscdeadline = apic->lapic_timer.tscdeadline; + + if (kvm_x86_ops->set_hv_timer && + !kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { + apic->lapic_timer.hv_timer_in_use = true; + trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, + apic->lapic_timer.hv_timer_in_use); + } else + start_sw_tscdeadline(apic); } } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 891c6da7d4aa..336ba51bb16e 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -20,6 +20,7 @@ struct kvm_timer { u64 tscdeadline; u64 expired_tscdeadline; atomic_t pending; /* accumulated triggered timers */ + bool hv_timer_in_use; }; struct kvm_lapic { @@ -212,4 +213,8 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); int kvm_vector_to_index(u32 vector, u32 dest_vcpus, const unsigned long *bitmap, u32 bitmap_size); +void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu); +void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu); +void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu); +bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 8de925031b5c..0a6cc6754ec5 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1348,6 +1348,21 @@ TRACE_EVENT(kvm_avic_unaccelerated_access, __entry->vec) ); +TRACE_EVENT(kvm_hv_timer_state, + TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use), + TP_ARGS(vcpu_id, hv_timer_in_use), + TP_STRUCT__entry( + __field(unsigned int, vcpu_id) + __field(unsigned int, hv_timer_in_use) + ), + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->hv_timer_in_use = hv_timer_in_use; + ), + TP_printk("vcpu_id %x hv_timer %x\n", + __entry->vcpu_id, + __entry->hv_timer_in_use) +); #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ab2f45a50bb5..1f4b2926a5a3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2740,6 +2740,11 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) rdtsc() - vcpu->arch.last_host_tsc; if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); + + if (kvm_lapic_hv_timer_in_use(vcpu) && + kvm_x86_ops->set_hv_timer(vcpu, + kvm_get_lapic_tscdeadline_msr(vcpu))) + kvm_lapic_switch_to_sw_timer(vcpu); if (check_tsc_unstable()) { u64 offset = kvm_compute_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); From bc22512bb24c480fae8ae96b233378ef96007590 Mon Sep 17 00:00:00 2001 From: Yunhong Jiang Date: Mon, 13 Jun 2016 14:19:58 -0700 Subject: [PATCH 110/302] kvm: vmx: rename vmx_pre/post_block to pi_pre/post_block Prepare to switch from preemption timer to hrtimer in the vmx_pre/post_block. Current functions are only for posted interrupt, rename them accordingly. Signed-off-by: Yunhong Jiang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 57ec6a4b4958..29e78fdd8ab8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10706,7 +10706,7 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, * this case, return 1, otherwise, return 0. * */ -static int vmx_pre_block(struct kvm_vcpu *vcpu) +static int pi_pre_block(struct kvm_vcpu *vcpu) { unsigned long flags; unsigned int dest; @@ -10772,7 +10772,15 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu) return 0; } -static void vmx_post_block(struct kvm_vcpu *vcpu) +static int vmx_pre_block(struct kvm_vcpu *vcpu) +{ + if (pi_pre_block(vcpu)) + return 1; + + return 0; +} + +static void pi_post_block(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); struct pi_desc old, new; @@ -10813,6 +10821,11 @@ static void vmx_post_block(struct kvm_vcpu *vcpu) } } +static void vmx_post_block(struct kvm_vcpu *vcpu) +{ + pi_post_block(vcpu); +} + /* * vmx_update_pi_irte - set IRTE for Posted-Interrupts * From 64672c95ea4c2f7096e519e826076867e8ef0938 Mon Sep 17 00:00:00 2001 From: Yunhong Jiang Date: Mon, 13 Jun 2016 14:19:59 -0700 Subject: [PATCH 111/302] kvm: vmx: hook preemption timer support Hook the VMX preemption timer to the "hv timer" functionality added by the previous patch. This includes: checking if the feature is supported, if the feature is broken on the CPU, the hooks to setup/clean the VMX preemption timer, arming the timer on vmentry and handling the vmexit. A module parameter states if the VMX preemption timer should be utilized. Signed-off-by: Yunhong Jiang [Move hv_deadline_tsc to struct vcpu_vmx, use -1 as the "unset" value. Put all VMX bits here. Enable it by default #yolo. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 + arch/x86/kvm/vmx.c | 180 +++++++++++++++++++++++++++++++- arch/x86/kvm/x86.c | 3 +- 3 files changed, 183 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e055f3787dc9..360c5171ea1a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1079,6 +1079,8 @@ extern u32 kvm_max_guest_tsc_khz; extern u8 kvm_tsc_scaling_ratio_frac_bits; /* maximum allowed value of TSC scaling ratio */ extern u64 kvm_max_tsc_scaling_ratio; +/* 1ull << kvm_tsc_scaling_ratio_frac_bits */ +extern u64 kvm_default_tsc_scaling_ratio; enum emulation_result { EMULATE_DONE, /* no further processing */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 29e78fdd8ab8..e185649fb8b7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -110,6 +110,13 @@ module_param_named(pml, enable_pml, bool, S_IRUGO); #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL +/* Guest_tsc -> host_tsc conversion requires 64-bit division. */ +static int __read_mostly cpu_preemption_timer_multi; +static bool __read_mostly enable_preemption_timer = 1; +#ifdef CONFIG_X86_64 +module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); +#endif + #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) #define KVM_VM_CR0_ALWAYS_ON \ @@ -597,6 +604,9 @@ struct vcpu_vmx { #define PML_ENTITY_NUM 512 struct page *pml_pg; + /* apic deadline value in host tsc */ + u64 hv_deadline_tsc; + u64 current_tsc_ratio; bool guest_pkru_valid; @@ -1056,6 +1066,61 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; } +/* + * Comment's format: document - errata name - stepping - processor name. + * Refer from + * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp + */ +static u32 vmx_preemption_cpu_tfms[] = { +/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */ +0x000206E6, +/* 323056.pdf - AAX65 - C2 - Xeon L3406 */ +/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */ +/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */ +0x00020652, +/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */ +0x00020655, +/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */ +/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */ +/* + * 320767.pdf - AAP86 - B1 - + * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile + */ +0x000106E5, +/* 321333.pdf - AAM126 - C0 - Xeon 3500 */ +0x000106A0, +/* 321333.pdf - AAM126 - C1 - Xeon 3500 */ +0x000106A1, +/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */ +0x000106A4, + /* 321333.pdf - AAM126 - D0 - Xeon 3500 */ + /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ + /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ +0x000106A5, +}; + +static inline bool cpu_has_broken_vmx_preemption_timer(void) +{ + u32 eax = cpuid_eax(0x00000001), i; + + /* Clear the reserved bits */ + eax &= ~(0x3U << 14 | 0xfU << 28); + for (i = 0; i < sizeof(vmx_preemption_cpu_tfms)/sizeof(u32); i++) + if (eax == vmx_preemption_cpu_tfms[i]) + return true; + + return false; +} + +static inline bool cpu_has_vmx_preemption_timer(void) +{ + if (cpu_has_broken_vmx_preemption_timer()) + return false; + + return vmcs_config.pin_based_exec_ctrl & + PIN_BASED_VMX_PREEMPTION_TIMER; +} + static inline bool cpu_has_vmx_posted_intr(void) { return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && @@ -3308,7 +3373,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) return -EIO; min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR; + opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR | + PIN_BASED_VMX_PREEMPTION_TIMER; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, &_pin_based_exec_control) < 0) return -EIO; @@ -4781,6 +4847,8 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) if (!kvm_vcpu_apicv_active(&vmx->vcpu)) pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; + /* Enable the preemption timer dynamically */ + pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; return pin_based_exec_ctrl; } @@ -4899,6 +4967,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) /* Control */ vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); + vmx->hv_deadline_tsc = -1; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); @@ -6389,6 +6458,17 @@ static __init int hardware_setup(void) kvm_x86_ops->enable_log_dirty_pt_masked = NULL; } + if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) { + u64 vmx_msr; + + rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); + cpu_preemption_timer_multi = + vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; + } else { + kvm_x86_ops->set_hv_timer = NULL; + kvm_x86_ops->cancel_hv_timer = NULL; + } + kvm_set_posted_intr_wakeup_handler(wakeup_handler); return alloc_kvm_area(); @@ -7564,6 +7644,12 @@ static int handle_pcommit(struct kvm_vcpu *vcpu) return 1; } +static int handle_preemption_timer(struct kvm_vcpu *vcpu) +{ + kvm_lapic_expired_hv_timer(vcpu); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -7615,6 +7701,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_XRSTORS] = handle_xrstors, [EXIT_REASON_PML_FULL] = handle_pml_full, [EXIT_REASON_PCOMMIT] = handle_pcommit, + [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, }; static const int kvm_vmx_max_exit_handlers = @@ -8623,6 +8710,26 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host); } +void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 tscl; + u32 delta_tsc; + + if (vmx->hv_deadline_tsc == -1) + return; + + tscl = rdtsc(); + if (vmx->hv_deadline_tsc > tscl) + /* sure to be 32 bit only because checked on set_hv_timer */ + delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> + cpu_preemption_timer_multi); + else + delta_tsc = 0; + + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); +} + static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -8672,6 +8779,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) atomic_switch_perf_msrs(vmx); debugctlmsr = get_debugctlmsr(); + vmx_arm_hv_timer(vcpu); + vmx->__launched = vmx->loaded_vmcs->launched; asm( /* Store host registers */ @@ -10662,6 +10771,64 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, return X86EMUL_CONTINUE; } +#ifdef CONFIG_X86_64 +/* (a << shift) / divisor, return 1 if overflow otherwise 0 */ +static inline int u64_shl_div_u64(u64 a, unsigned int shift, + u64 divisor, u64 *result) +{ + u64 low = a << shift, high = a >> (64 - shift); + + /* To avoid the overflow on divq */ + if (high >= divisor) + return 1; + + /* Low hold the result, high hold rem which is discarded */ + asm("divq %2\n\t" : "=a" (low), "=d" (high) : + "rm" (divisor), "0" (low), "1" (high)); + *result = low; + + return 0; +} + +static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 tscl = rdtsc(), delta_tsc; + + delta_tsc = guest_deadline_tsc - kvm_read_l1_tsc(vcpu, tscl); + + /* Convert to host delta tsc if tsc scaling is enabled */ + if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && + u64_shl_div_u64(delta_tsc, + kvm_tsc_scaling_ratio_frac_bits, + vcpu->arch.tsc_scaling_ratio, + &delta_tsc)) + return -ERANGE; + + /* + * If the delta tsc can't fit in the 32 bit after the multi shift, + * we can't use the preemption timer. + * It's possible that it fits on later vmentries, but checking + * on every vmentry is costly so we just use an hrtimer. + */ + if (delta_tsc >> (cpu_preemption_timer_multi + 32)) + return -ERANGE; + + vmx->hv_deadline_tsc = tscl + delta_tsc; + vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + return 0; +} + +static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + vmx->hv_deadline_tsc = -1; + vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); +} +#endif + static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) { if (ple_gap) @@ -10777,6 +10944,9 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu) if (pi_pre_block(vcpu)) return 1; + if (kvm_lapic_hv_timer_in_use(vcpu)) + kvm_lapic_switch_to_sw_timer(vcpu); + return 0; } @@ -10823,6 +10993,9 @@ static void pi_post_block(struct kvm_vcpu *vcpu) static void vmx_post_block(struct kvm_vcpu *vcpu) { + if (kvm_x86_ops->set_hv_timer) + kvm_lapic_switch_to_hv_timer(vcpu); + pi_post_block(vcpu); } @@ -11038,6 +11211,11 @@ static struct kvm_x86_ops vmx_x86_ops = { .pmu_ops = &intel_pmu_ops, .update_pi_irte = vmx_update_pi_irte, + +#ifdef CONFIG_X86_64 + .set_hv_timer = vmx_set_hv_timer, + .cancel_hv_timer = vmx_cancel_hv_timer, +#endif }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1f4b2926a5a3..299219630c94 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -114,7 +114,8 @@ u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits; EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits); u64 __read_mostly kvm_max_tsc_scaling_ratio; EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio); -static u64 __read_mostly kvm_default_tsc_scaling_ratio; +u64 __read_mostly kvm_default_tsc_scaling_ratio; +EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio); /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; From 708e75a3ee750dce1072134e630d66c4e6eaf63c Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Wed, 18 May 2016 21:01:20 +0200 Subject: [PATCH 112/302] KVM: PPC: Book3S PR: Fix illegal opcode emulation If kvmppc_handle_exit_pr() calls kvmppc_emulate_instruction() to emulate one instruction (in the BOOK3S_INTERRUPT_H_EMUL_ASSIST case), it calls kvmppc_core_queue_program() afterwards if kvmppc_emulate_instruction() returned EMULATE_FAIL, so the guest gets an program interrupt for the illegal opcode. However, the kvmppc_emulate_instruction() also tried to inject a program exception for this already, so the program interrupt gets injected twice and the return address in srr0 gets destroyed. All other callers of kvmppc_emulate_instruction() are also injecting a program interrupt, and since the callers have the right knowledge about the srr1 flags that should be used, it is the function kvmppc_emulate_instruction() that should _not_ inject program interrupts, so remove the kvmppc_core_queue_program() here. This fixes the issue discovered by Laurent Vivier with kvm-unit-tests where the logs are filled with these messages when the test tries to execute an illegal instruction: Couldn't emulate instruction 0x00000000 (op 0 xop 0) kvmppc_handle_exit_pr: emulation at 700 failed (00000000) Signed-off-by: Thomas Huth Reviewed-by: Alexander Graf Tested-by: Laurent Vivier Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/emulate.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index 5cc2e7af3a7b..b379146de55b 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -302,7 +302,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) advance = 0; printk(KERN_ERR "Couldn't emulate instruction 0x%08x " "(op %d xop %d)\n", inst, get_op(inst), get_xop(inst)); - kvmppc_core_queue_program(vcpu, 0); } } From b69890d18fa33a53cec6ae5c93555ee0c24fe0a9 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 19 May 2016 11:33:31 +0200 Subject: [PATCH 113/302] KVM: PPC: Book3S PR: Fix contents of SRR1 when injecting a program exception vcpu->arch.shadow_srr1 only contains usable values for injecting a program exception into the guest if we entered the function kvmppc_handle_exit_pr() with exit_nr == BOOK3S_INTERRUPT_PROGRAM. In other cases, the shadow_srr1 bits are zero. Since we want to pass an illegal-instruction program check to the guest, set "flags" to SRR1_PROGILL for these other cases. Signed-off-by: Thomas Huth Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_pr.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 8e4f64f0b774..a910fef86bba 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -1049,7 +1049,17 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, int emul; program_interrupt: - flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; + /* + * shadow_srr1 only contains valid flags if we came here via + * a program exception. The other exceptions (emulation assist, + * FP unavailable, etc.) do not provide flags in SRR1, so use + * an illegal-instruction exception when injecting a program + * interrupt into the guest. + */ + if (exit_nr == BOOK3S_INTERRUPT_PROGRAM) + flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; + else + flags = SRR1_PROGILL; emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); if (emul != EMULATE_DONE) { From 6dd06d15a86e8fca21ed4fb568bed2b3da7a7907 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Sun, 15 May 2016 09:44:13 +0530 Subject: [PATCH 114/302] powerpc/powernv: Remove the usage of PACAR1 from opal wrappers OPAL_CALL wrapper code sticks the r1 (stack pointer) into PACAR1 purely for debugging purpose only. The power7_wakeup* functions relies on stack pointer saved in PACAR1. Any opal call made using opal wrapper (directly or in-directly) before we fall through power7_wakeup*, then it ends up replacing r1 in PACAR1(r13) leading to kernel panic. So far we don't see any issues because we have never made any opal calls using OPAL wrapper before power7_wakeup*. But the subsequent HMI patch would need to invoke C calls during cpu wakeup/idle path that in-directly makes opal call using opal wrapper. This patch facilitates the subsequent HMI patch by removing usage of PACAR1 from opal call wrapper. Signed-off-by: Mahesh Salgaonkar Acked-by: Michael Ellerman Signed-off-by: Paul Mackerras --- arch/powerpc/platforms/powernv/opal-wrappers.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index e45b88a5d7e0..df6ad949e35f 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -64,7 +64,6 @@ END_FTR_SECTION(0, 1); \ OPAL_BRANCH(opal_tracepoint_entry) \ mfcr r12; \ stw r12,8(r1); \ - std r1,PACAR1(r13); \ li r11,0; \ mfmsr r12; \ ori r11,r11,MSR_EE; \ @@ -127,7 +126,6 @@ opal_tracepoint_entry: mfcr r12 std r11,16(r1) stw r12,8(r1) - std r1,PACAR1(r13) li r11,0 mfmsr r12 ori r11,r11,MSR_EE From fd7bacbca47a86a6f418440d8a5d7b7edbb2f8f9 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Sun, 15 May 2016 09:44:26 +0530 Subject: [PATCH 115/302] KVM: PPC: Book3S HV: Fix TB corruption in guest exit path on HMI interrupt When a guest is assigned to a core it converts the host Timebase (TB) into guest TB by adding guest timebase offset before entering into guest. During guest exit it restores the guest TB to host TB. This means under certain conditions (Guest migration) host TB and guest TB can differ. When we get an HMI for TB related issues the opal HMI handler would try fixing errors and restore the correct host TB value. With no guest running, we don't have any issues. But with guest running on the core we run into TB corruption issues. If we get an HMI while in the guest, the current HMI handler invokes opal hmi handler before forcing guest to exit. The guest exit path subtracts the guest TB offset from the current TB value which may have already been restored with host value by opal hmi handler. This leads to incorrect host and guest TB values. With split-core, things become more complex. With split-core, TB also gets split and each subcore gets its own TB register. When a hmi handler fixes a TB error and restores the TB value, it affects all the TB values of sibling subcores on the same core. On TB errors all the thread in the core gets HMI. With existing code, the individual threads call opal hmi handle independently which can easily throw TB out of sync if we have guest running on subcores. Hence we will need to co-ordinate with all the threads before making opal hmi handler call followed by TB resync. This patch introduces a sibling subcore state structure (shared by all threads in the core) in paca which holds information about whether sibling subcores are in Guest mode or host mode. An array in_guest[] of size MAX_SUBCORE_PER_CORE=4 is used to maintain the state of each subcore. The subcore id is used as index into in_guest[] array. Only primary thread entering/exiting the guest is responsible to set/unset its designated array element. On TB error, we get HMI interrupt on every thread on the core. Upon HMI, this patch will now force guest to vacate the core/subcore. Primary thread from each subcore will then turn off its respective bit from the above bitmap during the guest exit path just after the guest->host partition switch is complete. All other threads that have just exited the guest OR were already in host will wait until all other subcores clears their respective bit. Once all the subcores turn off their respective bit, all threads will will make call to opal hmi handler. It is not necessary that opal hmi handler would resync the TB value for every HMI interrupts. It would do so only for the HMI caused due to TB errors. For rest, it would not touch TB value. Hence to make things simpler, primary thread would call TB resync explicitly once for each core immediately after opal hmi handler instead of subtracting guest offset from TB. TB resync call will restore the TB with host value. Thus we can be sure about the TB state. One of the primary threads exiting the guest will take up the responsibility of calling TB resync. It will use one of the top bits (bit 63) from subcore state flags bitmap to make the decision. The first primary thread (among the subcores) that is able to set the bit will have to call the TB resync. Rest all other threads will wait until TB resync is complete. Once TB resync is complete all threads will then proceed. Signed-off-by: Mahesh Salgaonkar Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/hmi.h | 45 ++++++ arch/powerpc/include/asm/paca.h | 6 + arch/powerpc/kernel/Makefile | 2 +- arch/powerpc/kernel/exceptions-64s.S | 4 +- arch/powerpc/kernel/hmi.c | 56 ++++++++ arch/powerpc/kernel/idle_power7.S | 5 +- arch/powerpc/kernel/traps.c | 5 + arch/powerpc/kvm/book3s_hv.c | 37 +++++ arch/powerpc/kvm/book3s_hv_ras.c | 176 ++++++++++++++++++++++++ arch/powerpc/kvm/book3s_hv_rmhandlers.S | 65 ++++++++- 10 files changed, 396 insertions(+), 5 deletions(-) create mode 100644 arch/powerpc/include/asm/hmi.h create mode 100644 arch/powerpc/kernel/hmi.c diff --git a/arch/powerpc/include/asm/hmi.h b/arch/powerpc/include/asm/hmi.h new file mode 100644 index 000000000000..88b4901ac4ee --- /dev/null +++ b/arch/powerpc/include/asm/hmi.h @@ -0,0 +1,45 @@ +/* + * Hypervisor Maintenance Interrupt header file. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. + * + * Copyright 2015 IBM Corporation + * Author: Mahesh Salgaonkar + */ + +#ifndef __ASM_PPC64_HMI_H__ +#define __ASM_PPC64_HMI_H__ + +#ifdef CONFIG_PPC_BOOK3S_64 + +#define CORE_TB_RESYNC_REQ_BIT 63 +#define MAX_SUBCORE_PER_CORE 4 + +/* + * sibling_subcore_state structure is used to co-ordinate all threads + * during HMI to avoid TB corruption. This structure is allocated once + * per each core and shared by all threads on that core. + */ +struct sibling_subcore_state { + unsigned long flags; + u8 in_guest[MAX_SUBCORE_PER_CORE]; +}; + +extern void wait_for_subcore_guest_exit(void); +extern void wait_for_tb_resync(void); +#else +static inline void wait_for_subcore_guest_exit(void) { } +static inline void wait_for_tb_resync(void) { } +#endif +#endif /* __ASM_PPC64_HMI_H__ */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 546540b91095..4b17bd058e01 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -25,6 +25,7 @@ #ifdef CONFIG_KVM_BOOK3S_64_HANDLER #include #endif +#include register struct paca_struct *local_paca asm("r13"); @@ -181,6 +182,11 @@ struct paca_struct { */ u16 in_mce; u8 hmi_event_available; /* HMI event is available */ + /* + * Bitmap for sibling subcore status. See kvm/book3s_hv_ras.c for + * more details + */ + struct sibling_subcore_state *sibling_subcore_state; #endif /* Stuff for accurate time accounting */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 2da380fcc34c..6972a23433d3 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -41,7 +41,7 @@ obj-$(CONFIG_VDSO32) += vdso32/ obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o -obj-$(CONFIG_PPC_BOOK3S_64) += mce.o mce_power.o +obj-$(CONFIG_PPC_BOOK3S_64) += mce.o mce_power.o hmi.o obj64-$(CONFIG_RELOCATABLE) += reloc_64.o obj-$(CONFIG_PPC_BOOK3E_64) += exceptions-64e.o idle_book3e.o obj-$(CONFIG_PPC64) += vdso64/ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 4c9440629128..0eba47e074b9 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -680,6 +680,8 @@ _GLOBAL(__replay_interrupt) BEGIN_FTR_SECTION cmpwi r3,0xe80 beq h_doorbell_common + cmpwi r3,0xe60 + beq hmi_exception_common FTR_SECTION_ELSE cmpwi r3,0xa00 beq doorbell_super_common @@ -1172,7 +1174,7 @@ fwnmi_data_area: .globl hmi_exception_early hmi_exception_early: - EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0xe60) + EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST, 0xe62) mr r10,r1 /* Save r1 */ ld r1,PACAEMERGSP(r13) /* Use emergency stack */ subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ diff --git a/arch/powerpc/kernel/hmi.c b/arch/powerpc/kernel/hmi.c new file mode 100644 index 000000000000..e3f738eb1cac --- /dev/null +++ b/arch/powerpc/kernel/hmi.c @@ -0,0 +1,56 @@ +/* + * Hypervisor Maintenance Interrupt (HMI) handling. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. + * + * Copyright 2015 IBM Corporation + * Author: Mahesh Salgaonkar + */ + +#undef DEBUG + +#include +#include +#include +#include + +void wait_for_subcore_guest_exit(void) +{ + int i; + + /* + * NULL bitmap pointer indicates that KVM module hasn't + * been loaded yet and hence no guests are running. + * If no KVM is in use, no need to co-ordinate among threads + * as all of them will always be in host and no one is going + * to modify TB other than the opal hmi handler. + * Hence, just return from here. + */ + if (!local_paca->sibling_subcore_state) + return; + + for (i = 0; i < MAX_SUBCORE_PER_CORE; i++) + while (local_paca->sibling_subcore_state->in_guest[i]) + cpu_relax(); +} + +void wait_for_tb_resync(void) +{ + if (!local_paca->sibling_subcore_state) + return; + + while (test_bit(CORE_TB_RESYNC_REQ_BIT, + &local_paca->sibling_subcore_state->flags)) + cpu_relax(); +} diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S index 470ceebd2d23..bb5112908fb2 100644 --- a/arch/powerpc/kernel/idle_power7.S +++ b/arch/powerpc/kernel/idle_power7.S @@ -270,8 +270,9 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \ ld r2,PACATOC(r13); \ ld r1,PACAR1(r13); \ std r3,ORIG_GPR3(r1); /* Save original r3 */ \ - li r0,OPAL_HANDLE_HMI; /* Pass opal token argument*/ \ - bl opal_call_realmode; \ + li r3,0; /* NULL argument */ \ + bl hmi_exception_realmode; \ + nop; \ ld r3,ORIG_GPR3(r1); /* Restore original r3 */ \ 20: nop; diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 9229ba63c370..9ec95daccad9 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC) @@ -307,9 +308,13 @@ long hmi_exception_realmode(struct pt_regs *regs) { __this_cpu_inc(irq_stat.hmi_exceptions); + wait_for_subcore_guest_exit(); + if (ppc_md.hmi_exception_early) ppc_md.hmi_exception_early(regs); + wait_for_tb_resync(); + return 0; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index e20beae5ca7a..bc27c4d6383c 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -3401,6 +3402,38 @@ static struct kvmppc_ops kvm_ops_hv = { .hcall_implemented = kvmppc_hcall_impl_hv, }; +static int kvm_init_subcore_bitmap(void) +{ + int i, j; + int nr_cores = cpu_nr_cores(); + struct sibling_subcore_state *sibling_subcore_state; + + for (i = 0; i < nr_cores; i++) { + int first_cpu = i * threads_per_core; + int node = cpu_to_node(first_cpu); + + /* Ignore if it is already allocated. */ + if (paca[first_cpu].sibling_subcore_state) + continue; + + sibling_subcore_state = + kmalloc_node(sizeof(struct sibling_subcore_state), + GFP_KERNEL, node); + if (!sibling_subcore_state) + return -ENOMEM; + + memset(sibling_subcore_state, 0, + sizeof(struct sibling_subcore_state)); + + for (j = 0; j < threads_per_core; j++) { + int cpu = first_cpu + j; + + paca[cpu].sibling_subcore_state = sibling_subcore_state; + } + } + return 0; +} + static int kvmppc_book3s_init_hv(void) { int r; @@ -3411,6 +3444,10 @@ static int kvmppc_book3s_init_hv(void) if (r < 0) return -ENODEV; + r = kvm_init_subcore_bitmap(); + if (r) + return r; + kvm_ops_hv.owner = THIS_MODULE; kvmppc_hv_ops = &kvm_ops_hv; diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c index 93b5f5c9b445..0fa70a9618d7 100644 --- a/arch/powerpc/kvm/book3s_hv_ras.c +++ b/arch/powerpc/kvm/book3s_hv_ras.c @@ -13,6 +13,9 @@ #include #include #include +#include +#include +#include /* SRR1 bits for machine check on POWER7 */ #define SRR1_MC_LDSTERR (1ul << (63-42)) @@ -140,3 +143,176 @@ long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu) { return kvmppc_realmode_mc_power7(vcpu); } + +/* Check if dynamic split is in force and return subcore size accordingly. */ +static inline int kvmppc_cur_subcore_size(void) +{ + if (local_paca->kvm_hstate.kvm_split_mode) + return local_paca->kvm_hstate.kvm_split_mode->subcore_size; + + return threads_per_subcore; +} + +void kvmppc_subcore_enter_guest(void) +{ + int thread_id, subcore_id; + + thread_id = cpu_thread_in_core(local_paca->paca_index); + subcore_id = thread_id / kvmppc_cur_subcore_size(); + + local_paca->sibling_subcore_state->in_guest[subcore_id] = 1; +} + +void kvmppc_subcore_exit_guest(void) +{ + int thread_id, subcore_id; + + thread_id = cpu_thread_in_core(local_paca->paca_index); + subcore_id = thread_id / kvmppc_cur_subcore_size(); + + local_paca->sibling_subcore_state->in_guest[subcore_id] = 0; +} + +static bool kvmppc_tb_resync_required(void) +{ + if (test_and_set_bit(CORE_TB_RESYNC_REQ_BIT, + &local_paca->sibling_subcore_state->flags)) + return false; + + return true; +} + +static void kvmppc_tb_resync_done(void) +{ + clear_bit(CORE_TB_RESYNC_REQ_BIT, + &local_paca->sibling_subcore_state->flags); +} + +/* + * kvmppc_realmode_hmi_handler() is called only by primary thread during + * guest exit path. + * + * There are multiple reasons why HMI could occur, one of them is + * Timebase (TB) error. If this HMI is due to TB error, then TB would + * have been in stopped state. The opal hmi handler Will fix it and + * restore the TB value with host timebase value. For HMI caused due + * to non-TB errors, opal hmi handler will not touch/restore TB register + * and hence there won't be any change in TB value. + * + * Since we are not sure about the cause of this HMI, we can't be sure + * about the content of TB register whether it holds guest or host timebase + * value. Hence the idea is to resync the TB on every HMI, so that we + * know about the exact state of the TB value. Resync TB call will + * restore TB to host timebase. + * + * Things to consider: + * - On TB error, HMI interrupt is reported on all the threads of the core + * that has encountered TB error irrespective of split-core mode. + * - The very first thread on the core that get chance to fix TB error + * would rsync the TB with local chipTOD value. + * - The resync TB is a core level action i.e. it will sync all the TBs + * in that core independent of split-core mode. This means if we trigger + * TB sync from a thread from one subcore, it would affect TB values of + * sibling subcores of the same core. + * + * All threads need to co-ordinate before making opal hmi handler. + * All threads will use sibling_subcore_state->in_guest[] (shared by all + * threads in the core) in paca which holds information about whether + * sibling subcores are in Guest mode or host mode. The in_guest[] array + * is of size MAX_SUBCORE_PER_CORE=4, indexed using subcore id to set/unset + * subcore status. Only primary threads from each subcore is responsible + * to set/unset its designated array element while entering/exiting the + * guset. + * + * After invoking opal hmi handler call, one of the thread (of entire core) + * will need to resync the TB. Bit 63 from subcore state bitmap flags + * (sibling_subcore_state->flags) will be used to co-ordinate between + * primary threads to decide who takes up the responsibility. + * + * This is what we do: + * - Primary thread from each subcore tries to set resync required bit[63] + * of paca->sibling_subcore_state->flags. + * - The first primary thread that is able to set the flag takes the + * responsibility of TB resync. (Let us call it as thread leader) + * - All other threads which are in host will call + * wait_for_subcore_guest_exit() and wait for in_guest[0-3] from + * paca->sibling_subcore_state to get cleared. + * - All the primary thread will clear its subcore status from subcore + * state in_guest[] array respectively. + * - Once all primary threads clear in_guest[0-3], all of them will invoke + * opal hmi handler. + * - Now all threads will wait for TB resync to complete by invoking + * wait_for_tb_resync() except the thread leader. + * - Thread leader will do a TB resync by invoking opal_resync_timebase() + * call and the it will clear the resync required bit. + * - All other threads will now come out of resync wait loop and proceed + * with individual execution. + * - On return of this function, primary thread will signal all + * secondary threads to proceed. + * - All secondary threads will eventually call opal hmi handler on + * their exit path. + */ + +long kvmppc_realmode_hmi_handler(void) +{ + int ptid = local_paca->kvm_hstate.ptid; + bool resync_req; + + /* This is only called on primary thread. */ + BUG_ON(ptid != 0); + __this_cpu_inc(irq_stat.hmi_exceptions); + + /* + * By now primary thread has already completed guest->host + * partition switch but haven't signaled secondaries yet. + * All the secondary threads on this subcore is waiting + * for primary thread to signal them to go ahead. + * + * For threads from subcore which isn't in guest, they all will + * wait until all other subcores on this core exit the guest. + * + * Now set the resync required bit. If you are the first to + * set this bit then kvmppc_tb_resync_required() function will + * return true. For rest all other subcores + * kvmppc_tb_resync_required() will return false. + * + * If resync_req == true, then this thread is responsible to + * initiate TB resync after hmi handler has completed. + * All other threads on this core will wait until this thread + * clears the resync required bit flag. + */ + resync_req = kvmppc_tb_resync_required(); + + /* Reset the subcore status to indicate it has exited guest */ + kvmppc_subcore_exit_guest(); + + /* + * Wait for other subcores on this core to exit the guest. + * All the primary threads and threads from subcore that are + * not in guest will wait here until all subcores are out + * of guest context. + */ + wait_for_subcore_guest_exit(); + + /* + * At this point we are sure that primary threads from each + * subcore on this core have completed guest->host partition + * switch. Now it is safe to call HMI handler. + */ + if (ppc_md.hmi_exception_early) + ppc_md.hmi_exception_early(NULL); + + /* + * Check if this thread is responsible to resync TB. + * All other threads will wait until this thread completes the + * TB resync. + */ + if (resync_req) { + opal_resync_timebase(); + /* Reset TB resync req bit */ + kvmppc_tb_resync_done(); + } else { + wait_for_tb_resync(); + } + return 0; +} diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index e571ad277398..0d246fca157a 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -29,6 +29,7 @@ #include #include #include +#include #define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM) @@ -373,6 +374,18 @@ kvm_secondary_got_guest: lwsync std r0, HSTATE_KVM_VCORE(r13) + /* + * All secondaries exiting guest will fall through this path. + * Before proceeding, just check for HMI interrupt and + * invoke opal hmi handler. By now we are sure that the + * primary thread on this core/subcore has already made partition + * switch/TB resync and we are good to call opal hmi handler. + */ + cmpwi r12, BOOK3S_INTERRUPT_HMI + bne kvm_no_guest + + li r3,0 /* NULL argument */ + bl hmi_exception_realmode /* * At this point we have finished executing in the guest. * We need to wait for hwthread_req to become zero, since @@ -427,6 +440,22 @@ kvm_no_guest: * whole-core mode, so we need to nap. */ kvm_unsplit_nap: + /* + * When secondaries are napping in kvm_unsplit_nap() with + * hwthread_req = 1, HMI goes ignored even though subcores are + * already exited the guest. Hence HMI keeps waking up secondaries + * from nap in a loop and secondaries always go back to nap since + * no vcore is assigned to them. This makes impossible for primary + * thread to get hold of secondary threads resulting into a soft + * lockup in KVM path. + * + * Let us check if HMI is pending and handle it before we go to nap. + */ + cmpwi r12, BOOK3S_INTERRUPT_HMI + bne 55f + li r3, 0 /* NULL argument */ + bl hmi_exception_realmode +55: /* * Ensure that secondary doesn't nap when it has * its vcore pointer set. @@ -601,6 +630,11 @@ BEGIN_FTR_SECTION mtspr SPRN_DPDES, r8 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + /* Mark the subcore state as inside guest */ + bl kvmppc_subcore_enter_guest + nop + ld r5, HSTATE_KVM_VCORE(r13) + ld r4, HSTATE_KVM_VCPU(r13) li r0,1 stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */ @@ -1683,6 +1717,23 @@ BEGIN_FTR_SECTION mtspr SPRN_DPDES, r8 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + /* If HMI, call kvmppc_realmode_hmi_handler() */ + cmpwi r12, BOOK3S_INTERRUPT_HMI + bne 27f + bl kvmppc_realmode_hmi_handler + nop + li r12, BOOK3S_INTERRUPT_HMI + /* + * At this point kvmppc_realmode_hmi_handler would have resync-ed + * the TB. Hence it is not required to subtract guest timebase + * offset from timebase. So, skip it. + * + * Also, do not call kvmppc_subcore_exit_guest() because it has + * been invoked as part of kvmppc_realmode_hmi_handler(). + */ + b 30f + +27: /* Subtract timebase offset from timebase */ ld r8,VCORE_TB_OFFSET(r5) cmpdi r8,0 @@ -1698,8 +1749,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) addis r8,r8,0x100 /* if so, increment upper 40 bits */ mtspr SPRN_TBU40,r8 +17: bl kvmppc_subcore_exit_guest + nop +30: ld r5,HSTATE_KVM_VCORE(r13) + ld r4,VCORE_KVM(r5) /* pointer to struct kvm */ + /* Reset PCR */ -17: ld r0, VCORE_PCR(r5) + ld r0, VCORE_PCR(r5) cmpdi r0, 0 beq 18f li r0, 0 @@ -2461,6 +2517,8 @@ BEGIN_FTR_SECTION cmpwi r6, 3 /* hypervisor doorbell? */ beq 3f END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + cmpwi r6, 0xa /* Hypervisor maintenance ? */ + beq 4f li r3, 1 /* anything else, return 1 */ 0: blr @@ -2482,6 +2540,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) li r3, -1 blr + /* Woken up due to Hypervisor maintenance interrupt */ +4: li r12, BOOK3S_INTERRUPT_HMI + li r3, 1 + blr + /* * Determine what sort of external interrupt is pending (if any). * Returns: From 414d3b07496604a4372466a6b474ca24291a143c Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 8 Mar 2016 11:52:54 +0100 Subject: [PATCH 116/302] s390/kvm: page table invalidation notifier Pass an address range to the page table invalidation notifier for KVM. This allows to notify changes that affect a larger virtual memory area, e.g. for 1MB pages. Reviewed-by: David Hildenbrand Signed-off-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/gmap.h | 3 ++- arch/s390/kvm/kvm-s390.c | 18 +++++++++++++----- arch/s390/mm/gmap.c | 19 ++++++++++++++++--- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index d054c1b07a3c..bc0eadf9ed8e 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -39,7 +39,8 @@ struct gmap { */ struct gmap_notifier { struct list_head list; - void (*notifier_call)(struct gmap *gmap, unsigned long gaddr); + void (*notifier_call)(struct gmap *gmap, unsigned long start, + unsigned long end); }; struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 0dcf9b8fc12c..67f1b6b4c060 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -150,7 +150,8 @@ int kvm_arch_hardware_enable(void) return 0; } -static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address); +static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, + unsigned long end); /* * This callback is executed during stop_machine(). All CPUs are therefore @@ -1976,16 +1977,23 @@ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu) kvm_s390_vcpu_request(vcpu); } -static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address) +static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, + unsigned long end) { - int i; struct kvm *kvm = gmap->private; struct kvm_vcpu *vcpu; + unsigned long prefix; + int i; + if (start >= 1UL << 31) + /* We are only interested in prefix pages */ + return; kvm_for_each_vcpu(i, vcpu, kvm) { /* match against both prefix pages */ - if (kvm_s390_get_prefix(vcpu) == (address & ~0x1000UL)) { - VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address); + prefix = kvm_s390_get_prefix(vcpu); + if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) { + VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx", + start, end); kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu); } } diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index cace818d86eb..b5820bf47ec6 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -572,6 +572,21 @@ void gmap_unregister_ipte_notifier(struct gmap_notifier *nb) } EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier); +/** + * gmap_call_notifier - call all registered invalidation callbacks + * @gmap: pointer to guest mapping meta data structure + * @start: start virtual address in the guest address space + * @end: end virtual address in the guest address space + */ +static void gmap_call_notifier(struct gmap *gmap, unsigned long start, + unsigned long end) +{ + struct gmap_notifier *nb; + + list_for_each_entry(nb, &gmap_notifier_list, list) + nb->notifier_call(gmap, start, end); +} + /** * gmap_ipte_notify - mark a range of ptes for invalidation notification * @gmap: pointer to guest mapping meta data structure @@ -643,7 +658,6 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte) { unsigned long offset, gaddr; unsigned long *table; - struct gmap_notifier *nb; struct gmap *gmap; offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); @@ -655,8 +669,7 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte) if (!table) continue; gaddr = __gmap_segment_gaddr(table) + offset; - list_for_each_entry(nb, &gmap_notifier_list, list) - nb->notifier_call(gmap, gaddr); + gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1); } spin_unlock(&gmap_notifier_lock); } From 8ecb1a59d6c6674bc98e4eee0c2482490748e21a Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 8 Mar 2016 11:54:14 +0100 Subject: [PATCH 117/302] s390/mm: use RCU for gmap notifier list and the per-mm gmap list The gmap notifier list and the gmap list in the mm_struct change rarely. Use RCU to optimize the reader of these lists. Reviewed-by: David Hildenbrand Signed-off-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/gmap.h | 1 + arch/s390/include/asm/mmu.h | 11 +++++--- arch/s390/include/asm/mmu_context.h | 3 ++- arch/s390/mm/gmap.c | 39 +++++++++++++++++------------ arch/s390/mm/pgalloc.c | 16 ++++++------ 5 files changed, 41 insertions(+), 29 deletions(-) diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index bc0eadf9ed8e..2cf49624af99 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -39,6 +39,7 @@ struct gmap { */ struct gmap_notifier { struct list_head list; + struct rcu_head rcu; void (*notifier_call)(struct gmap *gmap, unsigned long start, unsigned long end); }; diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index 081b2ad99d73..b941528cc49e 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -8,8 +8,9 @@ typedef struct { cpumask_t cpu_attach_mask; atomic_t attach_count; unsigned int flush_mm; - spinlock_t list_lock; + spinlock_t pgtable_lock; struct list_head pgtable_list; + spinlock_t gmap_lock; struct list_head gmap_list; unsigned long asce; unsigned long asce_limit; @@ -22,9 +23,11 @@ typedef struct { unsigned int use_skey:1; } mm_context_t; -#define INIT_MM_CONTEXT(name) \ - .context.list_lock = __SPIN_LOCK_UNLOCKED(name.context.list_lock), \ - .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \ +#define INIT_MM_CONTEXT(name) \ + .context.pgtable_lock = \ + __SPIN_LOCK_UNLOCKED(name.context.pgtable_lock), \ + .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \ + .context.gmap_lock = __SPIN_LOCK_UNLOCKED(name.context.gmap_lock), \ .context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list), static inline int tprot(unsigned long addr) diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index c837b79b455d..3ce3854b7a41 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -15,8 +15,9 @@ static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { - spin_lock_init(&mm->context.list_lock); + spin_lock_init(&mm->context.pgtable_lock); INIT_LIST_HEAD(&mm->context.pgtable_list); + spin_lock_init(&mm->context.gmap_lock); INIT_LIST_HEAD(&mm->context.gmap_list); cpumask_clear(&mm->context.cpu_attach_mask); atomic_set(&mm->context.attach_count, 0); diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index b5820bf47ec6..8b56423a8297 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -70,9 +70,9 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit) gmap->asce = atype | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | __pa(table); gmap->asce_end = limit; - down_write(&mm->mmap_sem); - list_add(&gmap->list, &mm->context.gmap_list); - up_write(&mm->mmap_sem); + spin_lock(&mm->context.gmap_lock); + list_add_rcu(&gmap->list, &mm->context.gmap_list); + spin_unlock(&mm->context.gmap_lock); return gmap; out_free: @@ -128,14 +128,16 @@ void gmap_free(struct gmap *gmap) else __tlb_flush_global(); + spin_lock(&gmap->mm->context.gmap_lock); + list_del_rcu(&gmap->list); + spin_unlock(&gmap->mm->context.gmap_lock); + synchronize_rcu(); + /* Free all segment & region tables. */ list_for_each_entry_safe(page, next, &gmap->crst_list, lru) __free_pages(page, 2); gmap_radix_tree_free(&gmap->guest_to_host); gmap_radix_tree_free(&gmap->host_to_guest); - down_write(&gmap->mm->mmap_sem); - list_del(&gmap->list); - up_write(&gmap->mm->mmap_sem); kfree(gmap); } EXPORT_SYMBOL_GPL(gmap_free); @@ -369,11 +371,13 @@ void gmap_unlink(struct mm_struct *mm, unsigned long *table, struct gmap *gmap; int flush; - list_for_each_entry(gmap, &mm->context.gmap_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { flush = __gmap_unlink_by_vmaddr(gmap, vmaddr); if (flush) gmap_flush_tlb(gmap); } + rcu_read_unlock(); } /** @@ -555,7 +559,7 @@ static DEFINE_SPINLOCK(gmap_notifier_lock); void gmap_register_ipte_notifier(struct gmap_notifier *nb) { spin_lock(&gmap_notifier_lock); - list_add(&nb->list, &gmap_notifier_list); + list_add_rcu(&nb->list, &gmap_notifier_list); spin_unlock(&gmap_notifier_lock); } EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier); @@ -567,8 +571,9 @@ EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier); void gmap_unregister_ipte_notifier(struct gmap_notifier *nb) { spin_lock(&gmap_notifier_lock); - list_del_init(&nb->list); + list_del_rcu(&nb->list); spin_unlock(&gmap_notifier_lock); + synchronize_rcu(); } EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier); @@ -662,16 +667,18 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte) offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); offset = offset * (4096 / sizeof(pte_t)); - spin_lock(&gmap_notifier_lock); - list_for_each_entry(gmap, &mm->context.gmap_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + spin_lock(&gmap->guest_table_lock); table = radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); - if (!table) - continue; - gaddr = __gmap_segment_gaddr(table) + offset; - gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1); + if (table) + gaddr = __gmap_segment_gaddr(table) + offset; + spin_unlock(&gmap->guest_table_lock); + if (table) + gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1); } - spin_unlock(&gmap_notifier_lock); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ptep_notify); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index e8b5962ac12a..7be1f94f70a8 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -149,7 +149,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) /* Try to get a fragment of a 4K page as a 2K page table */ if (!mm_alloc_pgste(mm)) { table = NULL; - spin_lock_bh(&mm->context.list_lock); + spin_lock_bh(&mm->context.pgtable_lock); if (!list_empty(&mm->context.pgtable_list)) { page = list_first_entry(&mm->context.pgtable_list, struct page, lru); @@ -164,7 +164,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) list_del(&page->lru); } } - spin_unlock_bh(&mm->context.list_lock); + spin_unlock_bh(&mm->context.pgtable_lock); if (table) return table; } @@ -187,9 +187,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm) /* Return the first 2K fragment of the page */ atomic_set(&page->_mapcount, 1); clear_table(table, _PAGE_INVALID, PAGE_SIZE); - spin_lock_bh(&mm->context.list_lock); + spin_lock_bh(&mm->context.pgtable_lock); list_add(&page->lru, &mm->context.pgtable_list); - spin_unlock_bh(&mm->context.list_lock); + spin_unlock_bh(&mm->context.pgtable_lock); } return table; } @@ -203,13 +203,13 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) if (!mm_alloc_pgste(mm)) { /* Free 2K page table fragment of a 4K page */ bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.list_lock); + spin_lock_bh(&mm->context.pgtable_lock); mask = atomic_xor_bits(&page->_mapcount, 1U << bit); if (mask & 3) list_add(&page->lru, &mm->context.pgtable_list); else list_del(&page->lru); - spin_unlock_bh(&mm->context.list_lock); + spin_unlock_bh(&mm->context.pgtable_lock); if (mask != 0) return; } @@ -235,13 +235,13 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, return; } bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.list_lock); + spin_lock_bh(&mm->context.pgtable_lock); mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit); if (mask & 3) list_add_tail(&page->lru, &mm->context.pgtable_list); else list_del(&page->lru); - spin_unlock_bh(&mm->context.list_lock); + spin_unlock_bh(&mm->context.pgtable_lock); table = (unsigned long *) (__pa(table) | (1U << bit)); tlb_remove_table(tlb, table); } From b2d73b2a0ad1c758cb0c1acb01a911744b845942 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 8 Mar 2016 11:54:42 +0100 Subject: [PATCH 118/302] s390/mm: extended gmap pte notifier The current gmap pte notifier forces a pte into to a read-write state. If the pte is invalidated the gmap notifier is called to inform KVM that the mapping will go away. Extend this approach to allow read-write, read-only and no-access as possible target states and call the pte notifier for any change to the pte. This mechanism is used to temporarily set specific access rights for a pte without doing the heavy work of a true mprotect call. Reviewed-by: David Hildenbrand Signed-off-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/gmap.h | 9 +- arch/s390/include/asm/pgtable.h | 2 + arch/s390/kvm/kvm-s390.c | 13 +-- arch/s390/mm/gmap.c | 170 ++++++++++++++++++++++++-------- arch/s390/mm/pgtable.c | 54 +++++++++- 5 files changed, 193 insertions(+), 55 deletions(-) diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 2cf49624af99..6897a0919446 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -59,8 +59,11 @@ void gmap_discard(struct gmap *, unsigned long from, unsigned long to); void __gmap_zap(struct gmap *, unsigned long gaddr); void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr); -void gmap_register_ipte_notifier(struct gmap_notifier *); -void gmap_unregister_ipte_notifier(struct gmap_notifier *); -int gmap_ipte_notify(struct gmap *, unsigned long start, unsigned long len); +void gmap_register_pte_notifier(struct gmap_notifier *); +void gmap_unregister_pte_notifier(struct gmap_notifier *); +void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *); + +int gmap_mprotect_notify(struct gmap *, unsigned long start, + unsigned long len, int prot); #endif /* _ASM_S390_GMAP_H */ diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 9951e7e59756..35dde6afffcf 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -886,6 +886,8 @@ void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t entry); void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void ptep_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr, + pte_t *ptep, int prot); void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, pte_t *ptep , int reset); void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 67f1b6b4c060..b6e7f66f0f01 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -185,7 +186,7 @@ static struct notifier_block kvm_clock_notifier = { int kvm_arch_hardware_setup(void) { gmap_notifier.notifier_call = kvm_gmap_notifier; - gmap_register_ipte_notifier(&gmap_notifier); + gmap_register_pte_notifier(&gmap_notifier); atomic_notifier_chain_register(&s390_epoch_delta_notifier, &kvm_clock_notifier); return 0; @@ -193,7 +194,7 @@ int kvm_arch_hardware_setup(void) void kvm_arch_hardware_unsetup(void) { - gmap_unregister_ipte_notifier(&gmap_notifier); + gmap_unregister_pte_notifier(&gmap_notifier); atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, &kvm_clock_notifier); } @@ -2272,16 +2273,16 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) return 0; /* * We use MMU_RELOAD just to re-arm the ipte notifier for the - * guest prefix page. gmap_ipte_notify will wait on the ptl lock. + * guest prefix page. gmap_mprotect_notify will wait on the ptl lock. * This ensures that the ipte instruction for this request has * already finished. We might race against a second unmapper that * wants to set the blocking bit. Lets just retry the request loop. */ if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) { int rc; - rc = gmap_ipte_notify(vcpu->arch.gmap, - kvm_s390_get_prefix(vcpu), - PAGE_SIZE * 2); + rc = gmap_mprotect_notify(vcpu->arch.gmap, + kvm_s390_get_prefix(vcpu), + PAGE_SIZE * 2, PROT_WRITE); if (rc) return rc; goto retry; diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 8b56423a8297..480c076afceb 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -553,29 +553,29 @@ static LIST_HEAD(gmap_notifier_list); static DEFINE_SPINLOCK(gmap_notifier_lock); /** - * gmap_register_ipte_notifier - register a pte invalidation callback + * gmap_register_pte_notifier - register a pte invalidation callback * @nb: pointer to the gmap notifier block */ -void gmap_register_ipte_notifier(struct gmap_notifier *nb) +void gmap_register_pte_notifier(struct gmap_notifier *nb) { spin_lock(&gmap_notifier_lock); list_add_rcu(&nb->list, &gmap_notifier_list); spin_unlock(&gmap_notifier_lock); } -EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier); +EXPORT_SYMBOL_GPL(gmap_register_pte_notifier); /** - * gmap_unregister_ipte_notifier - remove a pte invalidation callback + * gmap_unregister_pte_notifier - remove a pte invalidation callback * @nb: pointer to the gmap notifier block */ -void gmap_unregister_ipte_notifier(struct gmap_notifier *nb) +void gmap_unregister_pte_notifier(struct gmap_notifier *nb) { spin_lock(&gmap_notifier_lock); list_del_rcu(&nb->list); spin_unlock(&gmap_notifier_lock); synchronize_rcu(); } -EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier); +EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier); /** * gmap_call_notifier - call all registered invalidation callbacks @@ -593,62 +593,150 @@ static void gmap_call_notifier(struct gmap *gmap, unsigned long start, } /** - * gmap_ipte_notify - mark a range of ptes for invalidation notification + * gmap_table_walk - walk the gmap page tables + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * + * Returns a table pointer for the given guest address. + */ +static inline unsigned long *gmap_table_walk(struct gmap *gmap, + unsigned long gaddr) +{ + unsigned long *table; + + table = gmap->table; + switch (gmap->asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_REGION1: + table += (gaddr >> 53) & 0x7ff; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + /* Fallthrough */ + case _ASCE_TYPE_REGION2: + table += (gaddr >> 42) & 0x7ff; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + /* Fallthrough */ + case _ASCE_TYPE_REGION3: + table += (gaddr >> 31) & 0x7ff; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + /* Fallthrough */ + case _ASCE_TYPE_SEGMENT: + table += (gaddr >> 20) & 0x7ff; + } + return table; +} + +/** + * gmap_pte_op_walk - walk the gmap page table, get the page table lock + * and return the pte pointer + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @ptl: pointer to the spinlock pointer + * + * Returns a pointer to the locked pte for a guest address, or NULL + */ +static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr, + spinlock_t **ptl) +{ + unsigned long *table; + + /* Walk the gmap page table, lock and get pte pointer */ + table = gmap_table_walk(gmap, gaddr); + if (!table || *table & _SEGMENT_ENTRY_INVALID) + return NULL; + return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl); +} + +/** + * gmap_pte_op_fixup - force a page in and connect the gmap page table + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @vmaddr: address in the host process address space + * + * Returns 0 if the caller can retry __gmap_translate (might fail again), + * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing + * up or connecting the gmap page table. + */ +static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, + unsigned long vmaddr) +{ + struct mm_struct *mm = gmap->mm; + bool unlocked = false; + + if (fixup_user_fault(current, mm, vmaddr, FAULT_FLAG_WRITE, &unlocked)) + return -EFAULT; + if (unlocked) + /* lost mmap_sem, caller has to retry __gmap_translate */ + return 0; + /* Connect the page tables */ + return __gmap_link(gmap, gaddr, vmaddr); +} + +/** + * gmap_pte_op_end - release the page table lock + * @ptl: pointer to the spinlock pointer + */ +static void gmap_pte_op_end(spinlock_t *ptl) +{ + spin_unlock(ptl); +} + +/** + * gmap_mprotect_notify - change access rights for a range of ptes and + * call the notifier if any pte changes again * @gmap: pointer to guest mapping meta data structure * @gaddr: virtual address in the guest address space * @len: size of area + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE * - * Returns 0 if for each page in the given range a gmap mapping exists and - * the invalidation notification could be set. If the gmap mapping is missing - * for one or more pages -EFAULT is returned. If no memory could be allocated - * -ENOMEM is returned. This function establishes missing page table entries. + * Returns 0 if for each page in the given range a gmap mapping exists, + * the new access rights could be set and the notifier could be armed. + * If the gmap mapping is missing for one or more pages -EFAULT is + * returned. If no memory could be allocated -ENOMEM is returned. + * This function establishes missing page table entries. */ -int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len) +int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr, + unsigned long len, int prot) { - unsigned long addr; + unsigned long vmaddr; spinlock_t *ptl; pte_t *ptep; - bool unlocked; int rc = 0; if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK)) return -EINVAL; + if (!MACHINE_HAS_ESOP && prot == PROT_READ) + return -EINVAL; down_read(&gmap->mm->mmap_sem); while (len) { - unlocked = false; - /* Convert gmap address and connect the page tables */ - addr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(addr)) { - rc = addr; - break; + rc = -EAGAIN; + ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); + if (ptep) { + rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot); + gmap_pte_op_end(ptl); } - /* Get the page mapped */ - if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE, - &unlocked)) { - rc = -EFAULT; - break; - } - /* While trying to map mmap_sem got unlocked. Let us retry */ - if (unlocked) + if (rc) { + vmaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(vmaddr)) { + rc = vmaddr; + break; + } + rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr); + if (rc) + break; continue; - rc = __gmap_link(gmap, gaddr, addr); - if (rc) - break; - /* Walk the process page table, lock and get pte pointer */ - ptep = get_locked_pte(gmap->mm, addr, &ptl); - VM_BUG_ON(!ptep); - /* Set notification bit in the pgste of the pte */ - if ((pte_val(*ptep) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) { - ptep_set_notify(gmap->mm, addr, ptep); - gaddr += PAGE_SIZE; - len -= PAGE_SIZE; } - pte_unmap_unlock(ptep, ptl); + gaddr += PAGE_SIZE; + len -= PAGE_SIZE; } up_read(&gmap->mm->mmap_sem); return rc; } -EXPORT_SYMBOL_GPL(gmap_ipte_notify); +EXPORT_SYMBOL_GPL(gmap_mprotect_notify); /** * ptep_notify - call all invalidation callbacks for a specific pte. diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index fa286d0c0f2d..ab65fb11e058 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -179,9 +179,9 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) return pgste; } -static inline pgste_t pgste_ipte_notify(struct mm_struct *mm, - unsigned long addr, - pte_t *ptep, pgste_t pgste) +static inline pgste_t pgste_pte_notify(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, pgste_t pgste) { #ifdef CONFIG_PGSTE if (pgste_val(pgste) & PGSTE_IN_BIT) { @@ -199,7 +199,7 @@ static inline pgste_t ptep_xchg_start(struct mm_struct *mm, if (mm_has_pgste(mm)) { pgste = pgste_get_lock(ptep); - pgste = pgste_ipte_notify(mm, addr, ptep, pgste); + pgste = pgste_pte_notify(mm, addr, ptep, pgste); } return pgste; } @@ -414,6 +414,50 @@ void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep) pgste_set_unlock(ptep, pgste); } +/** + * ptep_force_prot - change access rights of a locked pte + * @mm: pointer to the process mm_struct + * @addr: virtual address in the guest address space + * @ptep: pointer to the page table entry + * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE + * + * Returns 0 if the access rights were changed and -EAGAIN if the current + * and requested access rights are incompatible. + */ +int ptep_force_prot(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, int prot) +{ + pte_t entry; + pgste_t pgste; + int pte_i, pte_p; + + pgste = pgste_get_lock(ptep); + entry = *ptep; + /* Check pte entry after all locks have been acquired */ + pte_i = pte_val(entry) & _PAGE_INVALID; + pte_p = pte_val(entry) & _PAGE_PROTECT; + if ((pte_i && (prot != PROT_NONE)) || + (pte_p && (prot & PROT_WRITE))) { + pgste_set_unlock(ptep, pgste); + return -EAGAIN; + } + /* Change access rights and set the pgste notification bit */ + if (prot == PROT_NONE && !pte_i) { + ptep_flush_direct(mm, addr, ptep); + pgste = pgste_update_all(entry, pgste, mm); + pte_val(entry) |= _PAGE_INVALID; + } + if (prot == PROT_READ && !pte_p) { + ptep_flush_direct(mm, addr, ptep); + pte_val(entry) &= ~_PAGE_INVALID; + pte_val(entry) |= _PAGE_PROTECT; + } + pgste_val(pgste) |= PGSTE_IN_BIT; + pgste = pgste_set_pte(ptep, pgste, entry); + pgste_set_unlock(ptep, pgste); + return 0; +} + static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) { if (!non_swap_entry(entry)) @@ -483,7 +527,7 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) pgste_val(pgste) &= ~PGSTE_UC_BIT; pte = *ptep; if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { - pgste = pgste_ipte_notify(mm, addr, ptep, pgste); + pgste = pgste_pte_notify(mm, addr, ptep, pgste); __ptep_ipte(addr, ptep); if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) pte_val(pte) |= _PAGE_PROTECT; From 6ea427bbbd4078297bb1dbd6c5cb83f3f48aac46 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 8 Mar 2016 11:55:04 +0100 Subject: [PATCH 119/302] s390/mm: add reference counter to gmap structure Let's use a reference counter mechanism to control the lifetime of gmap structures. This will be needed for further changes related to gmap shadows. Reviewed-by: David Hildenbrand Signed-off-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/gmap.h | 9 +++- arch/s390/kvm/kvm-s390.c | 16 +++---- arch/s390/mm/gmap.c | 90 ++++++++++++++++++++++++++++-------- 3 files changed, 85 insertions(+), 30 deletions(-) diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 6897a0919446..e69853ce55da 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -15,6 +15,7 @@ * @guest_to_host: radix tree with guest to host address translation * @host_to_guest: radix tree with pointer to segment table entries * @guest_table_lock: spinlock to protect all entries in the guest page table + * @ref_count: reference counter for the gmap structure * @table: pointer to the page directory * @asce: address space control element for gmap page table * @pfault_enabled: defines if pfaults are applicable for the guest @@ -26,6 +27,7 @@ struct gmap { struct radix_tree_root guest_to_host; struct radix_tree_root host_to_guest; spinlock_t guest_table_lock; + atomic_t ref_count; unsigned long *table; unsigned long asce; unsigned long asce_end; @@ -44,8 +46,11 @@ struct gmap_notifier { unsigned long end); }; -struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit); -void gmap_free(struct gmap *gmap); +struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit); +void gmap_remove(struct gmap *gmap); +struct gmap *gmap_get(struct gmap *gmap); +void gmap_put(struct gmap *gmap); + void gmap_enable(struct gmap *gmap); void gmap_disable(struct gmap *gmap); int gmap_map_segment(struct gmap *gmap, unsigned long from, diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index b6e7f66f0f01..9dd52980605c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -532,20 +532,20 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att if (!new_limit) return -EINVAL; - /* gmap_alloc takes last usable address */ + /* gmap_create takes last usable address */ if (new_limit != KVM_S390_NO_MEM_LIMIT) new_limit -= 1; ret = -EBUSY; mutex_lock(&kvm->lock); if (!kvm->created_vcpus) { - /* gmap_alloc will round the limit up */ - struct gmap *new = gmap_alloc(current->mm, new_limit); + /* gmap_create will round the limit up */ + struct gmap *new = gmap_create(current->mm, new_limit); if (!new) { ret = -ENOMEM; } else { - gmap_free(kvm->arch.gmap); + gmap_remove(kvm->arch.gmap); new->private = kvm; kvm->arch.gmap = new; ret = 0; @@ -1394,7 +1394,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) else kvm->arch.mem_limit = min_t(unsigned long, TASK_MAX_SIZE, sclp.hamax + 1); - kvm->arch.gmap = gmap_alloc(current->mm, kvm->arch.mem_limit - 1); + kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1); if (!kvm->arch.gmap) goto out_err; kvm->arch.gmap->private = kvm; @@ -1427,7 +1427,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) sca_del_vcpu(vcpu); if (kvm_is_ucontrol(vcpu->kvm)) - gmap_free(vcpu->arch.gmap); + gmap_remove(vcpu->arch.gmap); if (vcpu->kvm->arch.use_cmma) kvm_s390_vcpu_unsetup_cmma(vcpu); @@ -1460,7 +1460,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) debug_unregister(kvm->arch.dbf); free_page((unsigned long)kvm->arch.sie_page2); if (!kvm_is_ucontrol(kvm)) - gmap_free(kvm->arch.gmap); + gmap_remove(kvm->arch.gmap); kvm_s390_destroy_adapters(kvm); kvm_s390_clear_float_irqs(kvm); KVM_EVENT(3, "vm 0x%pK destroyed", kvm); @@ -1469,7 +1469,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) /* Section: vcpu related */ static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu) { - vcpu->arch.gmap = gmap_alloc(current->mm, -1UL); + vcpu->arch.gmap = gmap_create(current->mm, -1UL); if (!vcpu->arch.gmap) return -ENOMEM; vcpu->arch.gmap->private = vcpu->kvm; diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 480c076afceb..fe25f1915800 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -21,13 +21,13 @@ #include /** - * gmap_alloc - allocate a guest address space + * gmap_alloc - allocate and initialize a guest address space * @mm: pointer to the parent mm_struct * @limit: maximum address of the gmap address space * * Returns a guest address space structure. */ -struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit) +static struct gmap *gmap_alloc(unsigned long limit) { struct gmap *gmap; struct page *page; @@ -58,7 +58,7 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit) INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL); INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC); spin_lock_init(&gmap->guest_table_lock); - gmap->mm = mm; + atomic_set(&gmap->ref_count, 1); page = alloc_pages(GFP_KERNEL, 2); if (!page) goto out_free; @@ -70,9 +70,6 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit) gmap->asce = atype | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | __pa(table); gmap->asce_end = limit; - spin_lock(&mm->context.gmap_lock); - list_add_rcu(&gmap->list, &mm->context.gmap_list); - spin_unlock(&mm->context.gmap_lock); return gmap; out_free: @@ -80,7 +77,28 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit) out: return NULL; } -EXPORT_SYMBOL_GPL(gmap_alloc); + +/** + * gmap_create - create a guest address space + * @mm: pointer to the parent mm_struct + * @limit: maximum size of the gmap address space + * + * Returns a guest address space structure. + */ +struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit) +{ + struct gmap *gmap; + + gmap = gmap_alloc(limit); + if (!gmap) + return NULL; + gmap->mm = mm; + spin_lock(&mm->context.gmap_lock); + list_add_rcu(&gmap->list, &mm->context.gmap_list); + spin_unlock(&mm->context.gmap_lock); + return gmap; +} +EXPORT_SYMBOL_GPL(gmap_create); static void gmap_flush_tlb(struct gmap *gmap) { @@ -118,21 +136,10 @@ static void gmap_radix_tree_free(struct radix_tree_root *root) * gmap_free - free a guest address space * @gmap: pointer to the guest address space structure */ -void gmap_free(struct gmap *gmap) +static void gmap_free(struct gmap *gmap) { struct page *page, *next; - /* Flush tlb. */ - if (MACHINE_HAS_IDTE) - __tlb_flush_asce(gmap->mm, gmap->asce); - else - __tlb_flush_global(); - - spin_lock(&gmap->mm->context.gmap_lock); - list_del_rcu(&gmap->list); - spin_unlock(&gmap->mm->context.gmap_lock); - synchronize_rcu(); - /* Free all segment & region tables. */ list_for_each_entry_safe(page, next, &gmap->crst_list, lru) __free_pages(page, 2); @@ -140,7 +147,50 @@ void gmap_free(struct gmap *gmap) gmap_radix_tree_free(&gmap->host_to_guest); kfree(gmap); } -EXPORT_SYMBOL_GPL(gmap_free); + +/** + * gmap_get - increase reference counter for guest address space + * @gmap: pointer to the guest address space structure + * + * Returns the gmap pointer + */ +struct gmap *gmap_get(struct gmap *gmap) +{ + atomic_inc(&gmap->ref_count); + return gmap; +} +EXPORT_SYMBOL_GPL(gmap_get); + +/** + * gmap_put - decrease reference counter for guest address space + * @gmap: pointer to the guest address space structure + * + * If the reference counter reaches zero the guest address space is freed. + */ +void gmap_put(struct gmap *gmap) +{ + if (atomic_dec_return(&gmap->ref_count) == 0) + gmap_free(gmap); +} +EXPORT_SYMBOL_GPL(gmap_put); + +/** + * gmap_remove - remove a guest address space but do not free it yet + * @gmap: pointer to the guest address space structure + */ +void gmap_remove(struct gmap *gmap) +{ + /* Flush tlb. */ + gmap_flush_tlb(gmap); + /* Remove gmap from the pre-mm list */ + spin_lock(&gmap->mm->context.gmap_lock); + list_del_rcu(&gmap->list); + spin_unlock(&gmap->mm->context.gmap_lock); + synchronize_rcu(); + /* Put reference */ + gmap_put(gmap); +} +EXPORT_SYMBOL_GPL(gmap_remove); /** * gmap_enable - switch primary space to the guest address space From 4be130a08420d6918d80c1067f8078f425eb98df Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 8 Mar 2016 12:12:18 +0100 Subject: [PATCH 120/302] s390/mm: add shadow gmap support For a nested KVM guest the outer KVM host needs to create shadow page tables for the nested guest. This patch adds the basic support to the guest address space (gmap) code. For each guest address space the inner KVM host creates, the first outer KVM host needs to create shadow page tables. The address space is identified by the ASCE loaded into the control register 1 at the time the inner SIE instruction for the second nested KVM guest is executed. The outer KVM host creates the shadow tables starting with the table identified by the ASCE on a on-demand basis. The outer KVM host will get repeated faults for all the shadow tables needed to run the second KVM guest. While a shadow page table for the second KVM guest is active the access to the origin region, segment and page tables needs to be restricted for the first KVM guest. For region and segment and page tables the first KVM guest may read the memory, but write attempt has to lead to an unshadow. This is done using the page invalid and read-only bits in the page table of the first KVM guest. If the first guest re-accesses one of the origin pages of a shadow, it gets a fault and the affected parts of the shadow page table hierarchy needs to be removed again. PGSTE tables don't have to be shadowed, as all interpretation assist can't deal with the invalid bits in the shadow pte being set differently than the original ones provided by the first KVM guest. Many bug fixes and improvements by David Hildenbrand. Reviewed-by: David Hildenbrand Signed-off-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/gmap.h | 52 +- arch/s390/include/asm/pgalloc.h | 2 + arch/s390/include/asm/pgtable.h | 10 +- arch/s390/include/asm/processor.h | 1 + arch/s390/mm/fault.c | 1 + arch/s390/mm/gmap.c | 1168 ++++++++++++++++++++++++++++- arch/s390/mm/pgalloc.c | 23 + arch/s390/mm/pgtable.c | 57 +- 8 files changed, 1271 insertions(+), 43 deletions(-) diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index e69853ce55da..58e65ee5b2d2 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -10,6 +10,7 @@ /** * struct gmap_struct - guest address space + * @list: list head for the mm->context gmap list * @crst_list: list of all crst tables used in the guest address space * @mm: pointer to the parent mm_struct * @guest_to_host: radix tree with guest to host address translation @@ -19,6 +20,13 @@ * @table: pointer to the page directory * @asce: address space control element for gmap page table * @pfault_enabled: defines if pfaults are applicable for the guest + * @host_to_rmap: radix tree with gmap_rmap lists + * @children: list of shadow gmap structures + * @pt_list: list of all page tables used in the shadow guest address space + * @shadow_lock: spinlock to protect the shadow gmap list + * @parent: pointer to the parent gmap for shadow guest address spaces + * @orig_asce: ASCE for which the shadow page table has been created + * @removed: flag to indicate if a shadow guest address space has been removed */ struct gmap { struct list_head list; @@ -33,8 +41,32 @@ struct gmap { unsigned long asce_end; void *private; bool pfault_enabled; + /* Additional data for shadow guest address spaces */ + struct radix_tree_root host_to_rmap; + struct list_head children; + struct list_head pt_list; + spinlock_t shadow_lock; + struct gmap *parent; + unsigned long orig_asce; + bool removed; }; +/** + * struct gmap_rmap - reverse mapping for shadow page table entries + * @next: pointer to next rmap in the list + * @raddr: virtual rmap address in the shadow guest address space + */ +struct gmap_rmap { + struct gmap_rmap *next; + unsigned long raddr; +}; + +#define gmap_for_each_rmap(pos, head) \ + for (pos = (head); pos; pos = pos->next) + +#define gmap_for_each_rmap_safe(pos, n, head) \ + for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n) + /** * struct gmap_notifier - notify function block for page invalidation * @notifier_call: address of callback function @@ -46,6 +78,11 @@ struct gmap_notifier { unsigned long end); }; +static inline int gmap_is_shadow(struct gmap *gmap) +{ + return !!gmap->parent; +} + struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit); void gmap_remove(struct gmap *gmap); struct gmap *gmap_get(struct gmap *gmap); @@ -64,9 +101,22 @@ void gmap_discard(struct gmap *, unsigned long from, unsigned long to); void __gmap_zap(struct gmap *, unsigned long gaddr); void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr); +int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val); + +struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce); +int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t); +int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t); +int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt); +int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt); +int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, + unsigned long *pgt, int *dat_protection); +int gmap_shadow_page(struct gmap *sg, unsigned long saddr, + unsigned long paddr, int write); + void gmap_register_pte_notifier(struct gmap_notifier *); void gmap_unregister_pte_notifier(struct gmap_notifier *); -void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *); +void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *, + unsigned long bits); int gmap_mprotect_notify(struct gmap *, unsigned long start, unsigned long len, int prot); diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index da34cb6b1f3b..f4eb9843eed4 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -19,8 +19,10 @@ unsigned long *crst_table_alloc(struct mm_struct *); void crst_table_free(struct mm_struct *, unsigned long *); unsigned long *page_table_alloc(struct mm_struct *); +struct page *page_table_alloc_pgste(struct mm_struct *mm); void page_table_free(struct mm_struct *, unsigned long *); void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long); +void page_table_free_pgste(struct page *page); extern int page_table_allocate_pgste; static inline void clear_table(unsigned long *s, unsigned long val, size_t n) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 35dde6afffcf..a6e7fc8f5b49 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -256,6 +256,7 @@ static inline int is_module_addr(void *addr) /* Bits in the region table entry */ #define _REGION_ENTRY_ORIGIN ~0xfffUL/* region/segment table origin */ #define _REGION_ENTRY_PROTECT 0x200 /* region protection bit */ +#define _REGION_ENTRY_OFFSET 0xc0 /* region table offset */ #define _REGION_ENTRY_INVALID 0x20 /* invalid region table entry */ #define _REGION_ENTRY_TYPE_MASK 0x0c /* region/segment table type mask */ #define _REGION_ENTRY_TYPE_R1 0x0c /* region first table type */ @@ -327,6 +328,7 @@ static inline int is_module_addr(void *addr) #define PGSTE_GC_BIT 0x0002000000000000UL #define PGSTE_UC_BIT 0x0000800000000000UL /* user dirty (migration) */ #define PGSTE_IN_BIT 0x0000400000000000UL /* IPTE notify bit */ +#define PGSTE_VSIE_BIT 0x0000200000000000UL /* ref'd in a shadow table */ /* Guest Page State used for virtualization */ #define _PGSTE_GPS_ZERO 0x0000000080000000UL @@ -885,12 +887,16 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t entry); void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -void ptep_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +void ptep_notify(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long bits); int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr, - pte_t *ptep, int prot); + pte_t *ptep, int prot, unsigned long bit); void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, pte_t *ptep , int reset); void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, + pte_t *sptep, pte_t *tptep, int write); +void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep); bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address); int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 9d4d311d7e52..94c80b6d031d 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -109,6 +109,7 @@ struct thread_struct { unsigned long ksp; /* kernel stack pointer */ mm_segment_t mm_segment; unsigned long gmap_addr; /* address of last gmap fault. */ + unsigned int gmap_write_flag; /* gmap fault write indication */ unsigned int gmap_pfault; /* signal of a pending guest pfault */ struct per_regs per_user; /* User specified PER registers */ struct per_event per_event; /* Cause of the last PER trap */ diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 19288c1b36d3..b84416c11c43 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -418,6 +418,7 @@ static inline int do_exception(struct pt_regs *regs, int access) (struct gmap *) S390_lowcore.gmap : NULL; if (gmap) { current->thread.gmap_addr = address; + current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE); address = __gmap_translate(gmap, address); if (address == -EFAULT) { fault = VM_FAULT_BADMAP; diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index fe25f1915800..6695a09a3885 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -55,9 +55,13 @@ static struct gmap *gmap_alloc(unsigned long limit) if (!gmap) goto out; INIT_LIST_HEAD(&gmap->crst_list); + INIT_LIST_HEAD(&gmap->children); + INIT_LIST_HEAD(&gmap->pt_list); INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL); INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC); + INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC); spin_lock_init(&gmap->guest_table_lock); + spin_lock_init(&gmap->shadow_lock); atomic_set(&gmap->ref_count, 1); page = alloc_pages(GFP_KERNEL, 2); if (!page) @@ -132,9 +136,38 @@ static void gmap_radix_tree_free(struct radix_tree_root *root) } while (nr > 0); } +static void gmap_rmap_radix_tree_free(struct radix_tree_root *root) +{ + struct gmap_rmap *rmap, *rnext, *head; + struct radix_tree_iter iter; + unsigned long indices[16]; + unsigned long index; + void **slot; + int i, nr; + + /* A radix tree is freed by deleting all of its entries */ + index = 0; + do { + nr = 0; + radix_tree_for_each_slot(slot, root, &iter, index) { + indices[nr] = iter.index; + if (++nr == 16) + break; + } + for (i = 0; i < nr; i++) { + index = indices[i]; + head = radix_tree_delete(root, index); + gmap_for_each_rmap_safe(rmap, rnext, head) + kfree(rmap); + } + } while (nr > 0); +} + /** * gmap_free - free a guest address space * @gmap: pointer to the guest address space structure + * + * No locks required. There are no references to this gmap anymore. */ static void gmap_free(struct gmap *gmap) { @@ -145,6 +178,17 @@ static void gmap_free(struct gmap *gmap) __free_pages(page, 2); gmap_radix_tree_free(&gmap->guest_to_host); gmap_radix_tree_free(&gmap->host_to_guest); + + /* Free additional data for a shadow gmap */ + if (gmap_is_shadow(gmap)) { + /* Free all page tables. */ + list_for_each_entry_safe(page, next, &gmap->pt_list, lru) + page_table_free_pgste(page); + gmap_rmap_radix_tree_free(&gmap->host_to_rmap); + /* Release reference to the parent */ + gmap_put(gmap->parent); + } + kfree(gmap); } @@ -180,8 +224,20 @@ EXPORT_SYMBOL_GPL(gmap_put); */ void gmap_remove(struct gmap *gmap) { + struct gmap *sg, *next; + /* Flush tlb. */ gmap_flush_tlb(gmap); + /* Remove all shadow gmaps linked to this gmap */ + if (!list_empty(&gmap->children)) { + spin_lock(&gmap->shadow_lock); + list_for_each_entry_safe(sg, next, &gmap->children, list) { + gmap_flush_tlb(sg); + list_del(&sg->list); + gmap_put(sg); + } + spin_unlock(&gmap->shadow_lock); + } /* Remove gmap from the pre-mm list */ spin_lock(&gmap->mm->context.gmap_lock); list_del_rcu(&gmap->list); @@ -227,7 +283,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, return -ENOMEM; new = (unsigned long *) page_to_phys(page); crst_table_init(new, init); - spin_lock(&gmap->mm->page_table_lock); + spin_lock(&gmap->guest_table_lock); if (*table & _REGION_ENTRY_INVALID) { list_add(&page->lru, &gmap->crst_list); *table = (unsigned long) new | _REGION_ENTRY_LENGTH | @@ -235,7 +291,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, page->index = gaddr; page = NULL; } - spin_unlock(&gmap->mm->page_table_lock); + spin_unlock(&gmap->guest_table_lock); if (page) __free_pages(page, 2); return 0; @@ -271,6 +327,7 @@ static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr) unsigned long *entry; int flush = 0; + BUG_ON(gmap_is_shadow(gmap)); spin_lock(&gmap->guest_table_lock); entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); if (entry) { @@ -310,6 +367,7 @@ int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) unsigned long off; int flush; + BUG_ON(gmap_is_shadow(gmap)); if ((to | len) & (PMD_SIZE - 1)) return -EINVAL; if (len == 0 || to + len < to) @@ -341,6 +399,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from, unsigned long off; int flush; + BUG_ON(gmap_is_shadow(gmap)); if ((from | to | len) & (PMD_SIZE - 1)) return -EINVAL; if (len == 0 || from + len < from || to + len < to || @@ -378,6 +437,8 @@ EXPORT_SYMBOL_GPL(gmap_map_segment); * This function does not establish potentially missing page table entries. * The mmap_sem of the mm that belongs to the address space must be held * when this function gets called. + * + * Note: Can also be called for shadow gmaps. */ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) { @@ -385,6 +446,7 @@ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr) vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT); + /* Note: guest_to_host is empty for a shadow gmap */ return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT; } EXPORT_SYMBOL_GPL(__gmap_translate); @@ -451,6 +513,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) pmd_t *pmd; int rc; + BUG_ON(gmap_is_shadow(gmap)); /* Create higher level tables in the gmap page table */ table = gmap->table; if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) { @@ -646,36 +709,65 @@ static void gmap_call_notifier(struct gmap *gmap, unsigned long start, * gmap_table_walk - walk the gmap page tables * @gmap: pointer to guest mapping meta data structure * @gaddr: virtual address in the guest address space + * @level: page table level to stop at * - * Returns a table pointer for the given guest address. + * Returns a table entry pointer for the given guest address and @level + * @level=0 : returns a pointer to a page table table entry (or NULL) + * @level=1 : returns a pointer to a segment table entry (or NULL) + * @level=2 : returns a pointer to a region-3 table entry (or NULL) + * @level=3 : returns a pointer to a region-2 table entry (or NULL) + * @level=4 : returns a pointer to a region-1 table entry (or NULL) + * + * Returns NULL if the gmap page tables could not be walked to the + * requested level. + * + * Note: Can also be called for shadow gmaps. */ static inline unsigned long *gmap_table_walk(struct gmap *gmap, - unsigned long gaddr) + unsigned long gaddr, int level) { unsigned long *table; + if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4)) + return NULL; + if (gmap_is_shadow(gmap) && gmap->removed) + return NULL; + if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11))) + return NULL; table = gmap->table; switch (gmap->asce & _ASCE_TYPE_MASK) { case _ASCE_TYPE_REGION1: table += (gaddr >> 53) & 0x7ff; + if (level == 4) + break; if (*table & _REGION_ENTRY_INVALID) return NULL; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* Fallthrough */ case _ASCE_TYPE_REGION2: table += (gaddr >> 42) & 0x7ff; + if (level == 3) + break; if (*table & _REGION_ENTRY_INVALID) return NULL; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* Fallthrough */ case _ASCE_TYPE_REGION3: table += (gaddr >> 31) & 0x7ff; + if (level == 2) + break; if (*table & _REGION_ENTRY_INVALID) return NULL; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* Fallthrough */ case _ASCE_TYPE_SEGMENT: table += (gaddr >> 20) & 0x7ff; + if (level == 1) + break; + if (*table & _REGION_ENTRY_INVALID) + return NULL; + table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); + table += (gaddr >> 12) & 0xff; } return table; } @@ -688,16 +780,27 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, * @ptl: pointer to the spinlock pointer * * Returns a pointer to the locked pte for a guest address, or NULL + * + * Note: Can also be called for shadow gmaps. */ static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr, spinlock_t **ptl) { unsigned long *table; + if (gmap_is_shadow(gmap)) + spin_lock(&gmap->guest_table_lock); /* Walk the gmap page table, lock and get pte pointer */ - table = gmap_table_walk(gmap, gaddr); - if (!table || *table & _SEGMENT_ENTRY_INVALID) + table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */ + if (!table || *table & _SEGMENT_ENTRY_INVALID) { + if (gmap_is_shadow(gmap)) + spin_unlock(&gmap->guest_table_lock); return NULL; + } + if (gmap_is_shadow(gmap)) { + *ptl = &gmap->guest_table_lock; + return pte_offset_map((pmd_t *) table, gaddr); + } return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl); } @@ -717,6 +820,7 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, struct mm_struct *mm = gmap->mm; bool unlocked = false; + BUG_ON(gmap_is_shadow(gmap)); if (fixup_user_fault(current, mm, vmaddr, FAULT_FLAG_WRITE, &unlocked)) return -EFAULT; if (unlocked) @@ -735,6 +839,51 @@ static void gmap_pte_op_end(spinlock_t *ptl) spin_unlock(ptl); } +/* + * gmap_protect_range - remove access rights to memory and set pgste bits + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @len: size of area + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bits: pgste notification bits to set + * + * Returns 0 if successfully protected, -ENOMEM if out of memory and + * -EFAULT if gaddr is invalid (or mapping for shadows is missing). + * + * Called with sg->mm->mmap_sem in read. + * + * Note: Can also be called for shadow gmaps. + */ +static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, + unsigned long len, int prot, unsigned long bits) +{ + unsigned long vmaddr; + spinlock_t *ptl; + pte_t *ptep; + int rc; + + while (len) { + rc = -EAGAIN; + ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); + if (ptep) { + rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits); + gmap_pte_op_end(ptl); + } + if (rc) { + vmaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(vmaddr)) + return vmaddr; + rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr); + if (rc) + return rc; + continue; + } + gaddr += PAGE_SIZE; + len -= PAGE_SIZE; + } + return 0; +} + /** * gmap_mprotect_notify - change access rights for a range of ptes and * call the notifier if any pte changes again @@ -752,61 +901,1012 @@ static void gmap_pte_op_end(spinlock_t *ptl) int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len, int prot) { - unsigned long vmaddr; - spinlock_t *ptl; - pte_t *ptep; - int rc = 0; + int rc; - if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK)) + if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap)) return -EINVAL; if (!MACHINE_HAS_ESOP && prot == PROT_READ) return -EINVAL; down_read(&gmap->mm->mmap_sem); - while (len) { - rc = -EAGAIN; - ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); - if (ptep) { - rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot); - gmap_pte_op_end(ptl); - } - if (rc) { - vmaddr = __gmap_translate(gmap, gaddr); - if (IS_ERR_VALUE(vmaddr)) { - rc = vmaddr; - break; - } - rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr); - if (rc) - break; - continue; - } - gaddr += PAGE_SIZE; - len -= PAGE_SIZE; - } + rc = gmap_protect_range(gmap, gaddr, len, prot, PGSTE_IN_BIT); up_read(&gmap->mm->mmap_sem); return rc; } EXPORT_SYMBOL_GPL(gmap_mprotect_notify); +/** + * gmap_read_table - get an unsigned long value from a guest page table using + * absolute addressing, without marking the page referenced. + * @gmap: pointer to guest mapping meta data structure + * @gaddr: virtual address in the guest address space + * @val: pointer to the unsigned long value to return + * + * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT + * if reading using the virtual address failed. + * + * Called with gmap->mm->mmap_sem in read. + */ +int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) +{ + unsigned long address, vmaddr; + spinlock_t *ptl; + pte_t *ptep, pte; + int rc; + + while (1) { + rc = -EAGAIN; + ptep = gmap_pte_op_walk(gmap, gaddr, &ptl); + if (ptep) { + pte = *ptep; + if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) { + address = pte_val(pte) & PAGE_MASK; + address += gaddr & ~PAGE_MASK; + *val = *(unsigned long *) address; + pte_val(*ptep) |= _PAGE_YOUNG; + /* Do *NOT* clear the _PAGE_INVALID bit! */ + rc = 0; + } + gmap_pte_op_end(ptl); + } + if (!rc) + break; + vmaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(vmaddr)) { + rc = vmaddr; + break; + } + rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr); + if (rc) + break; + } + return rc; +} +EXPORT_SYMBOL_GPL(gmap_read_table); + +/** + * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree + * @sg: pointer to the shadow guest address space structure + * @vmaddr: vm address associated with the rmap + * @rmap: pointer to the rmap structure + * + * Called with the sg->guest_table_lock + */ +static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, + struct gmap_rmap *rmap) +{ + void **slot; + + BUG_ON(!gmap_is_shadow(sg)); + slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); + if (slot) { + rmap->next = radix_tree_deref_slot_protected(slot, + &sg->guest_table_lock); + radix_tree_replace_slot(slot, rmap); + } else { + rmap->next = NULL; + radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT, + rmap); + } +} + +/** + * gmap_protect_rmap - modify access rights to memory and create an rmap + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow gmap + * @paddr: address in the parent guest address space + * @len: length of the memory area to protect + * @prot: indicates access rights: none, read-only or read-write + * + * Returns 0 if successfully protected and the rmap was created, -ENOMEM + * if out of memory and -EFAULT if paddr is invalid. + */ +static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, + unsigned long paddr, unsigned long len, int prot) +{ + struct gmap *parent; + struct gmap_rmap *rmap; + unsigned long vmaddr; + spinlock_t *ptl; + pte_t *ptep; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + parent = sg->parent; + while (len) { + vmaddr = __gmap_translate(parent, paddr); + if (IS_ERR_VALUE(vmaddr)) + return vmaddr; + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL); + if (!rmap) + return -ENOMEM; + rmap->raddr = raddr; + rc = radix_tree_preload(GFP_KERNEL); + if (rc) { + kfree(rmap); + return rc; + } + rc = -EAGAIN; + ptep = gmap_pte_op_walk(parent, paddr, &ptl); + if (ptep) { + spin_lock(&sg->guest_table_lock); + rc = ptep_force_prot(parent->mm, paddr, ptep, prot, + PGSTE_VSIE_BIT); + if (!rc) + gmap_insert_rmap(sg, vmaddr, rmap); + spin_unlock(&sg->guest_table_lock); + gmap_pte_op_end(ptl); + } + radix_tree_preload_end(); + if (rc) { + kfree(rmap); + rc = gmap_pte_op_fixup(parent, paddr, vmaddr); + if (rc) + return rc; + continue; + } + paddr += PAGE_SIZE; + len -= PAGE_SIZE; + } + return 0; +} + +#define _SHADOW_RMAP_MASK 0x7 +#define _SHADOW_RMAP_REGION1 0x5 +#define _SHADOW_RMAP_REGION2 0x4 +#define _SHADOW_RMAP_REGION3 0x3 +#define _SHADOW_RMAP_SEGMENT 0x2 +#define _SHADOW_RMAP_PGTABLE 0x1 + +/** + * gmap_idte_one - invalidate a single region or segment table entry + * @asce: region or segment table *origin* + table-type bits + * @vaddr: virtual address to identify the table entry to flush + * + * The invalid bit of a single region or segment table entry is set + * and the associated TLB entries depending on the entry are flushed. + * The table-type of the @asce identifies the portion of the @vaddr + * that is used as the invalidation index. + */ +static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr) +{ + asm volatile( + " .insn rrf,0xb98e0000,%0,%1,0,0" + : : "a" (asce), "a" (vaddr) : "cc", "memory"); +} + +/** + * gmap_unshadow_page - remove a page from a shadow page table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr) +{ + unsigned long *table; + + BUG_ON(!gmap_is_shadow(sg)); + table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */ + if (!table || *table & _PAGE_INVALID) + return; + gmap_call_notifier(sg, raddr, raddr + (1UL << 12) - 1); + ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table); +} + +/** + * __gmap_unshadow_pgt - remove all entries from a shadow page table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @pgt: pointer to the start of a shadow page table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, + unsigned long *pgt) +{ + int i; + + BUG_ON(!gmap_is_shadow(sg)); + for (i = 0; i < 256; i++, raddr += 1UL << 12) + pgt[i] = _PAGE_INVALID; +} + +/** + * gmap_unshadow_pgt - remove a shadow page table from a segment entry + * @sg: pointer to the shadow guest address space structure + * @raddr: address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) +{ + unsigned long sto, *ste, *pgt; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */ + if (!ste || *ste & _SEGMENT_ENTRY_INVALID) + return; + gmap_call_notifier(sg, raddr, raddr + (1UL << 20) - 1); + sto = (unsigned long) (ste - ((raddr >> 20) & 0x7ff)); + gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); + pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN); + *ste = _SEGMENT_ENTRY_EMPTY; + __gmap_unshadow_pgt(sg, raddr, pgt); + /* Free page table */ + page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); + list_del(&page->lru); + page_table_free_pgste(page); +} + +/** + * __gmap_unshadow_sgt - remove all entries from a shadow segment table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @sgt: pointer to the start of a shadow segment table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, + unsigned long *sgt) +{ + unsigned long asce, *pgt; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + asce = (unsigned long) sgt | _ASCE_TYPE_SEGMENT; + for (i = 0; i < 2048; i++, raddr += 1UL << 20) { + if (sgt[i] & _SEGMENT_ENTRY_INVALID) + continue; + pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN); + sgt[i] = _SEGMENT_ENTRY_EMPTY; + __gmap_unshadow_pgt(sg, raddr, pgt); + /* Free page table */ + page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); + list_del(&page->lru); + page_table_free_pgste(page); + } +} + +/** + * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the shadow->guest_table_lock + */ +static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) +{ + unsigned long r3o, *r3e, *sgt; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */ + if (!r3e || *r3e & _REGION_ENTRY_INVALID) + return; + gmap_call_notifier(sg, raddr, raddr + (1UL << 31) - 1); + r3o = (unsigned long) (r3e - ((raddr >> 31) & 0x7ff)); + gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr); + sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN); + *r3e = _REGION3_ENTRY_EMPTY; + __gmap_unshadow_sgt(sg, raddr, sgt); + /* Free segment table */ + page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); +} + +/** + * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table + * @sg: pointer to the shadow guest address space structure + * @raddr: address in the shadow guest address space + * @r3t: pointer to the start of a shadow region-3 table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, + unsigned long *r3t) +{ + unsigned long asce, *sgt; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + asce = (unsigned long) r3t | _ASCE_TYPE_REGION3; + for (i = 0; i < 2048; i++, raddr += 1UL << 31) { + if (r3t[i] & _REGION_ENTRY_INVALID) + continue; + sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN); + r3t[i] = _REGION3_ENTRY_EMPTY; + __gmap_unshadow_sgt(sg, raddr, sgt); + /* Free segment table */ + page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); + } +} + +/** + * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) +{ + unsigned long r2o, *r2e, *r3t; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */ + if (!r2e || *r2e & _REGION_ENTRY_INVALID) + return; + gmap_call_notifier(sg, raddr, raddr + (1UL << 42) - 1); + r2o = (unsigned long) (r2e - ((raddr >> 42) & 0x7ff)); + gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr); + r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN); + *r2e = _REGION2_ENTRY_EMPTY; + __gmap_unshadow_r3t(sg, raddr, r3t); + /* Free region 3 table */ + page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); +} + +/** + * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @r2t: pointer to the start of a shadow region-2 table + * + * Called with the sg->guest_table_lock + */ +static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, + unsigned long *r2t) +{ + unsigned long asce, *r3t; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + asce = (unsigned long) r2t | _ASCE_TYPE_REGION2; + for (i = 0; i < 2048; i++, raddr += 1UL << 42) { + if (r2t[i] & _REGION_ENTRY_INVALID) + continue; + r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN); + r2t[i] = _REGION2_ENTRY_EMPTY; + __gmap_unshadow_r3t(sg, raddr, r3t); + /* Free region 3 table */ + page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); + } +} + +/** + * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * + * Called with the sg->guest_table_lock + */ +static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) +{ + unsigned long r1o, *r1e, *r2t; + struct page *page; + + BUG_ON(!gmap_is_shadow(sg)); + r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ + if (!r1e || *r1e & _REGION_ENTRY_INVALID) + return; + gmap_call_notifier(sg, raddr, raddr + (1UL << 53) - 1); + r1o = (unsigned long) (r1e - ((raddr >> 53) & 0x7ff)); + gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr); + r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN); + *r1e = _REGION1_ENTRY_EMPTY; + __gmap_unshadow_r2t(sg, raddr, r2t); + /* Free region 2 table */ + page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); +} + +/** + * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table + * @sg: pointer to the shadow guest address space structure + * @raddr: rmap address in the shadow guest address space + * @r1t: pointer to the start of a shadow region-1 table + * + * Called with the shadow->guest_table_lock + */ +static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, + unsigned long *r1t) +{ + unsigned long asce, *r2t; + struct page *page; + int i; + + BUG_ON(!gmap_is_shadow(sg)); + asce = (unsigned long) r1t | _ASCE_TYPE_REGION1; + for (i = 0; i < 2048; i++, raddr += 1UL << 53) { + if (r1t[i] & _REGION_ENTRY_INVALID) + continue; + r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN); + __gmap_unshadow_r2t(sg, raddr, r2t); + /* Clear entry and flush translation r1t -> r2t */ + gmap_idte_one(asce, raddr); + r1t[i] = _REGION1_ENTRY_EMPTY; + /* Free region 2 table */ + page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); + list_del(&page->lru); + __free_pages(page, 2); + } +} + +/** + * gmap_unshadow - remove a shadow page table completely + * @sg: pointer to the shadow guest address space structure + * + * Called with sg->guest_table_lock + */ +static void gmap_unshadow(struct gmap *sg) +{ + unsigned long *table; + + BUG_ON(!gmap_is_shadow(sg)); + if (sg->removed) + return; + sg->removed = 1; + gmap_call_notifier(sg, 0, -1UL); + table = (unsigned long *)(sg->asce & _ASCE_ORIGIN); + switch (sg->asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_REGION1: + __gmap_unshadow_r1t(sg, 0, table); + break; + case _ASCE_TYPE_REGION2: + __gmap_unshadow_r2t(sg, 0, table); + break; + case _ASCE_TYPE_REGION3: + __gmap_unshadow_r3t(sg, 0, table); + break; + case _ASCE_TYPE_SEGMENT: + __gmap_unshadow_sgt(sg, 0, table); + break; + } +} + +/** + * gmap_find_shadow - find a specific asce in the list of shadow tables + * @parent: pointer to the parent gmap + * @asce: ASCE for which the shadow table is created + * + * Returns the pointer to a gmap if a shadow table with the given asce is + * already available, otherwise NULL + */ +static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce) +{ + struct gmap *sg; + + list_for_each_entry(sg, &parent->children, list) { + if (sg->orig_asce != asce || sg->removed) + continue; + atomic_inc(&sg->ref_count); + return sg; + } + return NULL; +} + +/** + * gmap_shadow - create/find a shadow guest address space + * @parent: pointer to the parent gmap + * @asce: ASCE for which the shadow table is created + * + * The pages of the top level page table referred by the asce parameter + * will be set to read-only and marked in the PGSTEs of the kvm process. + * The shadow table will be removed automatically on any change to the + * PTE mapping for the source table. + * + * Returns a guest address space structure, NULL if out of memory or if + * anything goes wrong while protecting the top level pages. + */ +struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce) +{ + struct gmap *sg, *new; + unsigned long limit; + int rc; + + BUG_ON(gmap_is_shadow(parent)); + spin_lock(&parent->shadow_lock); + sg = gmap_find_shadow(parent, asce); + spin_unlock(&parent->shadow_lock); + if (sg) + return sg; + /* Create a new shadow gmap */ + limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); + new = gmap_alloc(limit); + if (!new) + return NULL; + new->mm = parent->mm; + new->parent = gmap_get(parent); + new->orig_asce = asce; + down_read(&parent->mm->mmap_sem); + rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, + ((asce & _ASCE_TABLE_LENGTH) + 1) * 4096, + PROT_READ, PGSTE_VSIE_BIT); + up_read(&parent->mm->mmap_sem); + if (rc) { + atomic_set(&new->ref_count, 2); + spin_lock(&parent->shadow_lock); + /* Recheck if another CPU created the same shadow */ + sg = gmap_find_shadow(parent, asce); + if (!sg) { + list_add(&new->list, &parent->children); + sg = new; + new = NULL; + } + spin_unlock(&parent->shadow_lock); + } + if (new) + gmap_free(new); + return sg; +} +EXPORT_SYMBOL_GPL(gmap_shadow); + +/** + * gmap_shadow_r2t - create an empty shadow region 2 table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @r2t: parent gmap address of the region 2 table to get shadowed + * + * The r2t parameter specifies the address of the source table. The + * four pages of the source table are made read-only in the parent gmap + * address space. A write to the source table area @r2t will automatically + * remove the shadow r2 table and all of its decendents. + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t) +{ + unsigned long raddr, origin, offset, len; + unsigned long *s_r2t, *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + /* Allocate a shadow region second table */ + page = alloc_pages(GFP_KERNEL, 2); + if (!page) + return -ENOMEM; + page->index = r2t & _REGION_ENTRY_ORIGIN; + s_r2t = (unsigned long *) page_to_phys(page); + /* Install shadow region second table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } + crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY); + *table = (unsigned long) s_r2t | + _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R1; + list_add(&page->lru, &sg->crst_list); + spin_unlock(&sg->guest_table_lock); + /* Make r2t read-only in parent gmap page table */ + raddr = (saddr & 0xffe0000000000000UL) | _SHADOW_RMAP_REGION1; + origin = r2t & _REGION_ENTRY_ORIGIN; + offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * 4096; + len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); + if (rc) { + spin_lock(&sg->guest_table_lock); + gmap_unshadow_r2t(sg, raddr); + spin_unlock(&sg->guest_table_lock); + } + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, 2); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_r2t); + +/** + * gmap_shadow_r3t - create a shadow region 3 table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @r3t: parent gmap address of the region 3 table to get shadowed + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t) +{ + unsigned long raddr, origin, offset, len; + unsigned long *s_r3t, *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + /* Allocate a shadow region second table */ + page = alloc_pages(GFP_KERNEL, 2); + if (!page) + return -ENOMEM; + page->index = r3t & _REGION_ENTRY_ORIGIN; + s_r3t = (unsigned long *) page_to_phys(page); + /* Install shadow region second table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } + crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY); + *table = (unsigned long) s_r3t | + _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R2; + list_add(&page->lru, &sg->crst_list); + spin_unlock(&sg->guest_table_lock); + /* Make r3t read-only in parent gmap page table */ + raddr = (saddr & 0xfffffc0000000000UL) | _SHADOW_RMAP_REGION2; + origin = r3t & _REGION_ENTRY_ORIGIN; + offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * 4096; + len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); + if (rc) { + spin_lock(&sg->guest_table_lock); + gmap_unshadow_r3t(sg, raddr); + spin_unlock(&sg->guest_table_lock); + } + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, 2); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_r3t); + +/** + * gmap_shadow_sgt - create a shadow segment table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @sgt: parent gmap address of the segment table to get shadowed + * + * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt) +{ + unsigned long raddr, origin, offset, len; + unsigned long *s_sgt, *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + /* Allocate a shadow segment table */ + page = alloc_pages(GFP_KERNEL, 2); + if (!page) + return -ENOMEM; + page->index = sgt & _REGION_ENTRY_ORIGIN; + s_sgt = (unsigned long *) page_to_phys(page); + /* Install shadow region second table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _REGION_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } + crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY); + *table = (unsigned long) s_sgt | + _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R3; + list_add(&page->lru, &sg->crst_list); + spin_unlock(&sg->guest_table_lock); + /* Make sgt read-only in parent gmap page table */ + raddr = (saddr & 0xffffffff80000000UL) | _SHADOW_RMAP_REGION3; + origin = sgt & _REGION_ENTRY_ORIGIN; + offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * 4096; + len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); + if (rc) { + spin_lock(&sg->guest_table_lock); + gmap_unshadow_sgt(sg, raddr); + spin_unlock(&sg->guest_table_lock); + } + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + __free_pages(page, 2); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_sgt); + +/** + * gmap_shadow_lookup_pgtable - find a shadow page table + * @sg: pointer to the shadow guest address space structure + * @saddr: the address in the shadow aguest address space + * @pgt: parent gmap address of the page table to get shadowed + * @dat_protection: if the pgtable is marked as protected by dat + * + * Returns 0 if the shadow page table was found and -EAGAIN if the page + * table was not found. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, + unsigned long *pgt, int *dat_protection) +{ + unsigned long *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ + if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { + /* Shadow page tables are full pages (pte+pgste) */ + page = pfn_to_page(*table >> PAGE_SHIFT); + *pgt = page->index; + *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); + rc = 0; + } else { + rc = -EAGAIN; + } + spin_unlock(&sg->guest_table_lock); + return rc; + +} +EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup); + +/** + * gmap_shadow_pgt - instantiate a shadow page table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @pgt: parent gmap address of the page table to get shadowed + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory, + * -EFAULT if an address in the parent gmap could not be resolved and + * + * Called with gmap->mm->mmap_sem in read + */ +int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt) +{ + unsigned long raddr, origin; + unsigned long *s_pgt, *table; + struct page *page; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + /* Allocate a shadow page table */ + page = page_table_alloc_pgste(sg->mm); + if (!page) + return -ENOMEM; + page->index = pgt & _SEGMENT_ENTRY_ORIGIN; + s_pgt = (unsigned long *) page_to_phys(page); + /* Install shadow page table */ + spin_lock(&sg->guest_table_lock); + table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ + if (!table) { + rc = -EAGAIN; /* Race with unshadow */ + goto out_free; + } + if (!(*table & _SEGMENT_ENTRY_INVALID)) { + rc = 0; /* Already established */ + goto out_free; + } + *table = (unsigned long) s_pgt | _SEGMENT_ENTRY | + (pgt & _SEGMENT_ENTRY_PROTECT); + list_add(&page->lru, &sg->pt_list); + spin_unlock(&sg->guest_table_lock); + /* Make pgt read-only in parent gmap page table (not the pgste) */ + raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT; + origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK; + rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ); + if (rc) { + spin_lock(&sg->guest_table_lock); + gmap_unshadow_pgt(sg, raddr); + spin_unlock(&sg->guest_table_lock); + } + return rc; +out_free: + spin_unlock(&sg->guest_table_lock); + page_table_free_pgste(page); + return rc; + +} +EXPORT_SYMBOL_GPL(gmap_shadow_pgt); + +/** + * gmap_shadow_page - create a shadow page mapping + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @paddr: parent gmap address to get mapped at @saddr + * @write: =1 map r/w, =0 map r/o + * + * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the + * shadow table structure is incomplete, -ENOMEM if out of memory and + * -EFAULT if an address in the parent gmap could not be resolved. + * + * Called with sg->mm->mmap_sem in read. + */ +int gmap_shadow_page(struct gmap *sg, unsigned long saddr, + unsigned long paddr, int write) +{ + struct gmap *parent; + struct gmap_rmap *rmap; + unsigned long vmaddr; + spinlock_t *ptl; + pte_t *sptep, *tptep; + int rc; + + BUG_ON(!gmap_is_shadow(sg)); + parent = sg->parent; + + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL); + if (!rmap) + return -ENOMEM; + rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE; + + while (1) { + vmaddr = __gmap_translate(parent, paddr); + if (IS_ERR_VALUE(vmaddr)) { + rc = vmaddr; + break; + } + rc = radix_tree_preload(GFP_KERNEL); + if (rc) + break; + rc = -EAGAIN; + sptep = gmap_pte_op_walk(parent, paddr, &ptl); + if (sptep) { + spin_lock(&sg->guest_table_lock); + /* Get page table pointer */ + tptep = (pte_t *) gmap_table_walk(sg, saddr, 0); + if (!tptep) { + spin_unlock(&sg->guest_table_lock); + gmap_pte_op_end(ptl); + radix_tree_preload_end(); + break; + } + rc = ptep_shadow_pte(sg->mm, saddr, + sptep, tptep, write); + if (rc > 0) { + /* Success and a new mapping */ + gmap_insert_rmap(sg, vmaddr, rmap); + rmap = NULL; + rc = 0; + } + gmap_pte_op_end(ptl); + spin_unlock(&sg->guest_table_lock); + } + radix_tree_preload_end(); + if (!rc) + break; + rc = gmap_pte_op_fixup(parent, paddr, vmaddr); + if (rc) + break; + } + kfree(rmap); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_shadow_page); + +/** + * gmap_shadow_notify - handle notifications for shadow gmap + * + * Called with sg->parent->shadow_lock. + */ +static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, + unsigned long offset, pte_t *pte) +{ + struct gmap_rmap *rmap, *rnext, *head; + unsigned long gaddr, start, end, bits, raddr; + unsigned long *table; + + BUG_ON(!gmap_is_shadow(sg)); + spin_lock(&sg->parent->guest_table_lock); + table = radix_tree_lookup(&sg->parent->host_to_guest, + vmaddr >> PMD_SHIFT); + gaddr = table ? __gmap_segment_gaddr(table) + offset : 0; + spin_unlock(&sg->parent->guest_table_lock); + if (!table) + return; + + spin_lock(&sg->guest_table_lock); + if (sg->removed) { + spin_unlock(&sg->guest_table_lock); + return; + } + /* Check for top level table */ + start = sg->orig_asce & _ASCE_ORIGIN; + end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * 4096; + if (gaddr >= start && gaddr < end) { + /* The complete shadow table has to go */ + gmap_unshadow(sg); + spin_unlock(&sg->guest_table_lock); + list_del(&sg->list); + gmap_put(sg); + return; + } + /* Remove the page table tree from on specific entry */ + head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> 12); + gmap_for_each_rmap_safe(rmap, rnext, head) { + bits = rmap->raddr & _SHADOW_RMAP_MASK; + raddr = rmap->raddr ^ bits; + switch (bits) { + case _SHADOW_RMAP_REGION1: + gmap_unshadow_r2t(sg, raddr); + break; + case _SHADOW_RMAP_REGION2: + gmap_unshadow_r3t(sg, raddr); + break; + case _SHADOW_RMAP_REGION3: + gmap_unshadow_sgt(sg, raddr); + break; + case _SHADOW_RMAP_SEGMENT: + gmap_unshadow_pgt(sg, raddr); + break; + case _SHADOW_RMAP_PGTABLE: + gmap_unshadow_page(sg, raddr); + break; + } + kfree(rmap); + } + spin_unlock(&sg->guest_table_lock); +} + /** * ptep_notify - call all invalidation callbacks for a specific pte. * @mm: pointer to the process mm_struct * @addr: virtual address in the process address space * @pte: pointer to the page table entry + * @bits: bits from the pgste that caused the notify call * * This function is assumed to be called with the page table lock held * for the pte to notify. */ -void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte) +void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, + pte_t *pte, unsigned long bits) { unsigned long offset, gaddr; unsigned long *table; - struct gmap *gmap; + struct gmap *gmap, *sg, *next; offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); offset = offset * (4096 / sizeof(pte_t)); rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { + if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) { + spin_lock(&gmap->shadow_lock); + list_for_each_entry_safe(sg, next, + &gmap->children, list) + gmap_shadow_notify(sg, vmaddr, offset, pte); + spin_unlock(&gmap->shadow_lock); + } + if (!(bits & PGSTE_IN_BIT)) + continue; spin_lock(&gmap->guest_table_lock); table = radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 7be1f94f70a8..9c57a295a045 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -137,6 +137,29 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) return new; } +#ifdef CONFIG_PGSTE + +struct page *page_table_alloc_pgste(struct mm_struct *mm) +{ + struct page *page; + unsigned long *table; + + page = alloc_page(GFP_KERNEL|__GFP_REPEAT); + if (page) { + table = (unsigned long *) page_to_phys(page); + clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); + clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2); + } + return page; +} + +void page_table_free_pgste(struct page *page) +{ + __free_page(page); +} + +#endif /* CONFIG_PGSTE */ + /* * page table entry allocation/free routines. */ diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index ab65fb11e058..5b02583fbf4c 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -184,9 +184,12 @@ static inline pgste_t pgste_pte_notify(struct mm_struct *mm, pte_t *ptep, pgste_t pgste) { #ifdef CONFIG_PGSTE - if (pgste_val(pgste) & PGSTE_IN_BIT) { - pgste_val(pgste) &= ~PGSTE_IN_BIT; - ptep_notify(mm, addr, ptep); + unsigned long bits; + + bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT); + if (bits) { + pgste_val(pgste) ^= bits; + ptep_notify(mm, addr, ptep, bits); } #endif return pgste; @@ -420,12 +423,13 @@ void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep) * @addr: virtual address in the guest address space * @ptep: pointer to the page table entry * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE + * @bit: pgste bit to set (e.g. for notification) * * Returns 0 if the access rights were changed and -EAGAIN if the current * and requested access rights are incompatible. */ int ptep_force_prot(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, int prot) + pte_t *ptep, int prot, unsigned long bit) { pte_t entry; pgste_t pgste; @@ -441,7 +445,7 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr, pgste_set_unlock(ptep, pgste); return -EAGAIN; } - /* Change access rights and set the pgste notification bit */ + /* Change access rights and set pgste bit */ if (prot == PROT_NONE && !pte_i) { ptep_flush_direct(mm, addr, ptep); pgste = pgste_update_all(entry, pgste, mm); @@ -452,12 +456,53 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr, pte_val(entry) &= ~_PAGE_INVALID; pte_val(entry) |= _PAGE_PROTECT; } - pgste_val(pgste) |= PGSTE_IN_BIT; + pgste_val(pgste) |= bit; pgste = pgste_set_pte(ptep, pgste, entry); pgste_set_unlock(ptep, pgste); return 0; } +int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, + pte_t *sptep, pte_t *tptep, int write) +{ + pgste_t spgste, tpgste; + pte_t spte, tpte; + int rc = -EAGAIN; + + spgste = pgste_get_lock(sptep); + spte = *sptep; + if (!(pte_val(spte) & _PAGE_INVALID) && + !(pte_val(spte) & _PAGE_PROTECT)) { + rc = 0; + if (!(pte_val(*tptep) & _PAGE_INVALID)) + /* Update existing mapping */ + ptep_flush_direct(mm, saddr, tptep); + else + rc = 1; + pgste_val(spgste) |= PGSTE_VSIE_BIT; + tpgste = pgste_get_lock(tptep); + pte_val(tpte) = (pte_val(spte) & PAGE_MASK) | + (write ? 0 : _PAGE_PROTECT); + /* don't touch the storage key - it belongs to parent pgste */ + tpgste = pgste_set_pte(tptep, tpgste, tpte); + pgste_set_unlock(tptep, tpgste); + } + pgste_set_unlock(sptep, spgste); + return rc; +} + +void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep) +{ + pgste_t pgste; + + pgste = pgste_get_lock(ptep); + /* notifier is called by the caller */ + ptep_flush_direct(mm, saddr, ptep); + /* don't touch the storage key - it belongs to parent pgste */ + pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID)); + pgste_set_unlock(ptep, pgste); +} + static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) { if (!non_swap_entry(entry)) From aa17aa57cfb95b169f25fe98caae49e477590af3 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 8 Mar 2016 12:16:35 +0100 Subject: [PATCH 121/302] s390/mm: add kvm shadow fault function This patch introduces function kvm_s390_shadow_fault() used to resolve a fault on a shadow gmap. This function will do validity checking and build up the shadow page table hierarchy in order to fault in the requested page into the shadow page table structure. If an exception occurs while shadowing, guest 2 has to be notified about it using either an exception or a program interrupt intercept. If concurrent unshadowing occurres, this function will simply return with -EAGAIN and the caller has to retry. Reviewed-by: David Hildenbrand Signed-off-by: Martin Schwidefsky Signed-off-by: Christian Borntraeger --- arch/s390/kvm/gaccess.c | 168 ++++++++++++++++++++++++++++++++++++++++ arch/s390/kvm/gaccess.h | 2 + 2 files changed, 170 insertions(+) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 8e245e764c21..ba4985262bce 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "kvm-s390.h" #include "gaccess.h" #include @@ -946,3 +947,170 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra) return 0; return trans_exc(vcpu, PGM_PROTECTION, gra, 0, GACC_STORE, PROT_TYPE_LA); } + +/** + * kvm_s390_shadow_tables - walk the guest page table and create shadow tables + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @pgt: pointer to the page table address result + */ +static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, + unsigned long *pgt, int *dat_protection) +{ + struct gmap *parent; + union asce asce; + union vaddress vaddr; + unsigned long ptr; + int rc; + + parent = sg->parent; + vaddr.addr = saddr; + asce.val = sg->orig_asce; + ptr = asce.origin * 4096; + switch (asce.dt) { + case ASCE_TYPE_REGION1: + if (vaddr.rfx01 > asce.tl) + return PGM_REGION_FIRST_TRANS; + break; + case ASCE_TYPE_REGION2: + if (vaddr.rfx) + return PGM_ASCE_TYPE; + if (vaddr.rsx01 > asce.tl) + return PGM_REGION_SECOND_TRANS; + break; + case ASCE_TYPE_REGION3: + if (vaddr.rfx || vaddr.rsx) + return PGM_ASCE_TYPE; + if (vaddr.rtx01 > asce.tl) + return PGM_REGION_THIRD_TRANS; + break; + case ASCE_TYPE_SEGMENT: + if (vaddr.rfx || vaddr.rsx || vaddr.rtx) + return PGM_ASCE_TYPE; + if (vaddr.sx01 > asce.tl) + return PGM_SEGMENT_TRANSLATION; + break; + } + + switch (asce.dt) { + case ASCE_TYPE_REGION1: { + union region1_table_entry rfte; + + rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val); + if (rc) + return rc; + if (rfte.i) + return PGM_REGION_FIRST_TRANS; + if (rfte.tt != TABLE_TYPE_REGION1) + return PGM_TRANSLATION_SPEC; + if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl) + return PGM_REGION_SECOND_TRANS; + rc = gmap_shadow_r2t(sg, saddr, rfte.val); + if (rc) + return rc; + ptr = rfte.rto * 4096; + /* fallthrough */ + } + case ASCE_TYPE_REGION2: { + union region2_table_entry rste; + + rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val); + if (rc) + return rc; + if (rste.i) + return PGM_REGION_SECOND_TRANS; + if (rste.tt != TABLE_TYPE_REGION2) + return PGM_TRANSLATION_SPEC; + if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl) + return PGM_REGION_THIRD_TRANS; + rc = gmap_shadow_r3t(sg, saddr, rste.val); + if (rc) + return rc; + ptr = rste.rto * 4096; + /* fallthrough */ + } + case ASCE_TYPE_REGION3: { + union region3_table_entry rtte; + + rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val); + if (rc) + return rc; + if (rtte.i) + return PGM_REGION_THIRD_TRANS; + if (rtte.tt != TABLE_TYPE_REGION3) + return PGM_TRANSLATION_SPEC; + if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl) + return PGM_SEGMENT_TRANSLATION; + rc = gmap_shadow_sgt(sg, saddr, rtte.val); + if (rc) + return rc; + ptr = rtte.fc0.sto * 4096; + /* fallthrough */ + } + case ASCE_TYPE_SEGMENT: { + union segment_table_entry ste; + + rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val); + if (rc) + return rc; + if (ste.i) + return PGM_SEGMENT_TRANSLATION; + if (ste.tt != TABLE_TYPE_SEGMENT) + return PGM_TRANSLATION_SPEC; + if (ste.cs && asce.p) + return PGM_TRANSLATION_SPEC; + *dat_protection = ste.fc0.p; + rc = gmap_shadow_pgt(sg, saddr, ste.val); + if (rc) + return rc; + ptr = ste.fc0.pto * 2048; + } + } + /* Return the parent address of the page table */ + *pgt = ptr; + return 0; +} + +/** + * kvm_s390_shadow_fault - handle fault on a shadow page table + * @sg: pointer to the shadow guest address space structure + * @saddr: faulting address in the shadow gmap + * @write: =1 map r/w, =0 map r/o + * + * Returns: - 0 if the shadow fault was successfully resolved + * - > 0 (pgm exception code) on exceptions while faulting + * - -EAGAIN if the caller can retry immediately + * - -EFAULT when accessing invalid guest addresses + * - -ENOMEM if out of memory + */ +int kvm_s390_shadow_fault(struct gmap *sg, unsigned long saddr, int write) +{ + union vaddress vaddr; + union page_table_entry pte; + unsigned long pgt; + int dat_protection; + int rc; + + rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection); + if (rc) { + rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection); + if (rc) + return rc; + } + + vaddr.addr = saddr; + rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val); + if (rc) + return rc; + if (pte.i) + return PGM_PAGE_TRANSLATION; + if (pte.z || pte.co) + return PGM_TRANSLATION_SPEC; + dat_protection |= pte.p; + if (write && dat_protection) + return PGM_PROTECTION; + rc = gmap_shadow_page(sg, saddr, pte.pfra * 4096, write); + if (rc) + return rc; + return 0; +} diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index df0a79dd8159..e5ec4734d42d 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -361,4 +361,6 @@ void ipte_unlock(struct kvm_vcpu *vcpu); int ipte_lock_held(struct kvm_vcpu *vcpu); int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra); +int kvm_s390_shadow_fault(struct gmap *shadow, unsigned long saddr, int write); + #endif /* __KVM_S390_GACCESS_H */ From eea3678d4334925bf838e6f4bc88760811a84cd6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 15 Apr 2016 12:45:45 +0200 Subject: [PATCH 122/302] s390/mm: flush tlb of shadows in all situations For now, the tlb of shadow gmap is only flushed when the parent is removed, not when it is removed upfront. Therefore other shadow gmaps can reuse the tables without the tlb getting flushed. Fix this by simply flushing the tlb 1. Before the shadow tables are removed (analogouos to other unshadow functions) 2. When the gmap is freed and therefore the top level pages are freed. Acked-by: Martin Schwidefsky Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/mm/gmap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 6695a09a3885..b02d0d0cc641 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -173,6 +173,9 @@ static void gmap_free(struct gmap *gmap) { struct page *page, *next; + /* Flush tlb of all gmaps (if not already done for shadows) */ + if (!(gmap_is_shadow(gmap) && gmap->removed)) + gmap_flush_tlb(gmap); /* Free all segment & region tables. */ list_for_each_entry_safe(page, next, &gmap->crst_list, lru) __free_pages(page, 2); @@ -226,13 +229,10 @@ void gmap_remove(struct gmap *gmap) { struct gmap *sg, *next; - /* Flush tlb. */ - gmap_flush_tlb(gmap); /* Remove all shadow gmaps linked to this gmap */ if (!list_empty(&gmap->children)) { spin_lock(&gmap->shadow_lock); list_for_each_entry_safe(sg, next, &gmap->children, list) { - gmap_flush_tlb(sg); list_del(&sg->list); gmap_put(sg); } @@ -1360,6 +1360,7 @@ static void gmap_unshadow(struct gmap *sg) return; sg->removed = 1; gmap_call_notifier(sg, 0, -1UL); + gmap_flush_tlb(sg); table = (unsigned long *)(sg->asce & _ASCE_ORIGIN); switch (sg->asce & _ASCE_TYPE_MASK) { case _ASCE_TYPE_REGION1: From a9d23e71d7716e394a772686bfd994f4e181b235 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 8 Mar 2016 12:21:41 +0100 Subject: [PATCH 123/302] s390/mm: shadow pages with real guest requested protection We really want to avoid manually handling protection for nested virtualization. By shadowing pages with the protection the guest asked us for, the SIE can handle most protection-related actions for us (e.g. special handling for MVPG) and we can directly forward protection exceptions to the guest. PTEs will now always be shadowed with the correct _PAGE_PROTECT flag. Unshadowing will take care of any guest changes to the parent PTE and any host changes to the host PTE. If the host PTE doesn't have the fitting access rights or is not available, we have to fix it up. Acked-by: Martin Schwidefsky Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/gmap.h | 3 +-- arch/s390/include/asm/pgtable.h | 2 +- arch/s390/kvm/gaccess.c | 2 +- arch/s390/mm/gmap.c | 12 +++++------- arch/s390/mm/pgtable.c | 16 +++++++--------- 5 files changed, 15 insertions(+), 20 deletions(-) diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 58e65ee5b2d2..4a47055f58d7 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -110,8 +110,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt); int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt); int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, unsigned long *pgt, int *dat_protection); -int gmap_shadow_page(struct gmap *sg, unsigned long saddr, - unsigned long paddr, int write); +int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte); void gmap_register_pte_notifier(struct gmap_notifier *); void gmap_unregister_pte_notifier(struct gmap_notifier *); diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index a6e7fc8f5b49..c7ebba483f09 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -895,7 +895,7 @@ void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, pte_t *ptep , int reset); void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep); int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, - pte_t *sptep, pte_t *tptep, int write); + pte_t *sptep, pte_t *tptep, pte_t pte); void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep); bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address); diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index ba4985262bce..c5f79c1205cf 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -1109,7 +1109,7 @@ int kvm_s390_shadow_fault(struct gmap *sg, unsigned long saddr, int write) dat_protection |= pte.p; if (write && dat_protection) return PGM_PROTECTION; - rc = gmap_shadow_page(sg, saddr, pte.pfra * 4096, write); + rc = gmap_shadow_page(sg, saddr, __pte(pte.val)); if (rc) return rc; return 0; diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index b02d0d0cc641..a57a87bfeb27 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -1743,8 +1743,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt); * gmap_shadow_page - create a shadow page mapping * @sg: pointer to the shadow guest address space structure * @saddr: faulting address in the shadow gmap - * @paddr: parent gmap address to get mapped at @saddr - * @write: =1 map r/w, =0 map r/o + * @pte: pte in parent gmap address space to get shadowed * * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the * shadow table structure is incomplete, -ENOMEM if out of memory and @@ -1752,12 +1751,11 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt); * * Called with sg->mm->mmap_sem in read. */ -int gmap_shadow_page(struct gmap *sg, unsigned long saddr, - unsigned long paddr, int write) +int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) { struct gmap *parent; struct gmap_rmap *rmap; - unsigned long vmaddr; + unsigned long vmaddr, paddr; spinlock_t *ptl; pte_t *sptep, *tptep; int rc; @@ -1771,6 +1769,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE; while (1) { + paddr = pte_val(pte) & PAGE_MASK; vmaddr = __gmap_translate(parent, paddr); if (IS_ERR_VALUE(vmaddr)) { rc = vmaddr; @@ -1791,8 +1790,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, radix_tree_preload_end(); break; } - rc = ptep_shadow_pte(sg->mm, saddr, - sptep, tptep, write); + rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte); if (rc > 0) { /* Success and a new mapping */ gmap_insert_rmap(sg, vmaddr, rmap); diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 5b02583fbf4c..293130b5aee7 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -463,29 +463,27 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr, } int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, - pte_t *sptep, pte_t *tptep, int write) + pte_t *sptep, pte_t *tptep, pte_t pte) { pgste_t spgste, tpgste; pte_t spte, tpte; int rc = -EAGAIN; + if (!(pte_val(*tptep) & _PAGE_INVALID)) + return 0; /* already shadowed */ spgste = pgste_get_lock(sptep); spte = *sptep; if (!(pte_val(spte) & _PAGE_INVALID) && - !(pte_val(spte) & _PAGE_PROTECT)) { - rc = 0; - if (!(pte_val(*tptep) & _PAGE_INVALID)) - /* Update existing mapping */ - ptep_flush_direct(mm, saddr, tptep); - else - rc = 1; + !((pte_val(spte) & _PAGE_PROTECT) && + !(pte_val(pte) & _PAGE_PROTECT))) { pgste_val(spgste) |= PGSTE_VSIE_BIT; tpgste = pgste_get_lock(tptep); pte_val(tpte) = (pte_val(spte) & PAGE_MASK) | - (write ? 0 : _PAGE_PROTECT); + (pte_val(pte) & _PAGE_PROTECT); /* don't touch the storage key - it belongs to parent pgste */ tpgste = pgste_set_pte(tptep, tpgste, tpte); pgste_set_unlock(tptep, tpgste); + rc = 1; } pgste_set_unlock(sptep, spgste); return rc; From 998f637cc4b9ef3fa32b196294a3136ee05271a2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 8 Mar 2016 12:23:38 +0100 Subject: [PATCH 124/302] s390/mm: avoid races on region/segment/page table shadowing We have to unlock sg->guest_table_lock in order to call gmap_protect_rmap(). If we sleep just before that call, another VCPU might pick up that shadowed page table (while it is not protected yet) and use it. In order to avoid these races, we have to introduce a third state - "origin set but still invalid" for an entry. This way, we can avoid another thread already using the entry before the table is fully protected. As soon as everything is set up, we can clear the invalid bit - if we had no race with the unshadowing code. Suggested-by: Martin Schwidefsky Acked-by: Martin Schwidefsky Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/mm/gmap.c | 97 ++++++++++++++++++++++++++++++++------------- 1 file changed, 70 insertions(+), 27 deletions(-) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index a57a87bfeb27..a396e58b5a43 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -1125,7 +1125,7 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) BUG_ON(!gmap_is_shadow(sg)); ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */ - if (!ste || *ste & _SEGMENT_ENTRY_INVALID) + if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) return; gmap_call_notifier(sg, raddr, raddr + (1UL << 20) - 1); sto = (unsigned long) (ste - ((raddr >> 20) & 0x7ff)); @@ -1157,7 +1157,7 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, BUG_ON(!gmap_is_shadow(sg)); asce = (unsigned long) sgt | _ASCE_TYPE_SEGMENT; for (i = 0; i < 2048; i++, raddr += 1UL << 20) { - if (sgt[i] & _SEGMENT_ENTRY_INVALID) + if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) continue; pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN); sgt[i] = _SEGMENT_ENTRY_EMPTY; @@ -1183,7 +1183,7 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) BUG_ON(!gmap_is_shadow(sg)); r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */ - if (!r3e || *r3e & _REGION_ENTRY_INVALID) + if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN)) return; gmap_call_notifier(sg, raddr, raddr + (1UL << 31) - 1); r3o = (unsigned long) (r3e - ((raddr >> 31) & 0x7ff)); @@ -1215,7 +1215,7 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, BUG_ON(!gmap_is_shadow(sg)); asce = (unsigned long) r3t | _ASCE_TYPE_REGION3; for (i = 0; i < 2048; i++, raddr += 1UL << 31) { - if (r3t[i] & _REGION_ENTRY_INVALID) + if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) continue; sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN); r3t[i] = _REGION3_ENTRY_EMPTY; @@ -1241,7 +1241,7 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) BUG_ON(!gmap_is_shadow(sg)); r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */ - if (!r2e || *r2e & _REGION_ENTRY_INVALID) + if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN)) return; gmap_call_notifier(sg, raddr, raddr + (1UL << 42) - 1); r2o = (unsigned long) (r2e - ((raddr >> 42) & 0x7ff)); @@ -1273,7 +1273,7 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, BUG_ON(!gmap_is_shadow(sg)); asce = (unsigned long) r2t | _ASCE_TYPE_REGION2; for (i = 0; i < 2048; i++, raddr += 1UL << 42) { - if (r2t[i] & _REGION_ENTRY_INVALID) + if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) continue; r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN); r2t[i] = _REGION2_ENTRY_EMPTY; @@ -1299,7 +1299,7 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) BUG_ON(!gmap_is_shadow(sg)); r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ - if (!r1e || *r1e & _REGION_ENTRY_INVALID) + if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN)) return; gmap_call_notifier(sg, raddr, raddr + (1UL << 53) - 1); r1o = (unsigned long) (r1e - ((raddr >> 53) & 0x7ff)); @@ -1331,7 +1331,7 @@ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, BUG_ON(!gmap_is_shadow(sg)); asce = (unsigned long) r1t | _ASCE_TYPE_REGION1; for (i = 0; i < 2048; i++, raddr += 1UL << 53) { - if (r1t[i] & _REGION_ENTRY_INVALID) + if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) continue; r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN); __gmap_unshadow_r2t(sg, raddr, r2t); @@ -1496,10 +1496,14 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t) if (!(*table & _REGION_ENTRY_INVALID)) { rc = 0; /* Already established */ goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; } crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY); - *table = (unsigned long) s_r2t | - _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R1; + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; list_add(&page->lru, &sg->crst_list); spin_unlock(&sg->guest_table_lock); /* Make r2t read-only in parent gmap page table */ @@ -1508,11 +1512,18 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t) offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * 4096; len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); - if (rc) { - spin_lock(&sg->guest_table_lock); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 4); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != + (unsigned long) s_r2t) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { gmap_unshadow_r2t(sg, raddr); - spin_unlock(&sg->guest_table_lock); } + spin_unlock(&sg->guest_table_lock); return rc; out_free: spin_unlock(&sg->guest_table_lock); @@ -1557,10 +1568,13 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t) if (!(*table & _REGION_ENTRY_INVALID)) { rc = 0; /* Already established */ goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ } crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY); - *table = (unsigned long) s_r3t | - _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R2; + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; list_add(&page->lru, &sg->crst_list); spin_unlock(&sg->guest_table_lock); /* Make r3t read-only in parent gmap page table */ @@ -1569,11 +1583,18 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t) offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * 4096; len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); - if (rc) { - spin_lock(&sg->guest_table_lock); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 3); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != + (unsigned long) s_r3t) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { gmap_unshadow_r3t(sg, raddr); - spin_unlock(&sg->guest_table_lock); } + spin_unlock(&sg->guest_table_lock); return rc; out_free: spin_unlock(&sg->guest_table_lock); @@ -1618,10 +1639,14 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt) if (!(*table & _REGION_ENTRY_INVALID)) { rc = 0; /* Already established */ goto out_free; + } else if (*table & _REGION_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; } crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY); - *table = (unsigned long) s_sgt | - _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R3; + /* mark as invalid as long as the parent table is not protected */ + *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH | + _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; list_add(&page->lru, &sg->crst_list); spin_unlock(&sg->guest_table_lock); /* Make sgt read-only in parent gmap page table */ @@ -1630,11 +1655,18 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt) offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * 4096; len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); - if (rc) { - spin_lock(&sg->guest_table_lock); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 2); + if (!table || (*table & _REGION_ENTRY_ORIGIN) != + (unsigned long) s_sgt) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_REGION_ENTRY_INVALID; + } else { gmap_unshadow_sgt(sg, raddr); - spin_unlock(&sg->guest_table_lock); } + spin_unlock(&sg->guest_table_lock); return rc; out_free: spin_unlock(&sg->guest_table_lock); @@ -1716,20 +1748,31 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt) if (!(*table & _SEGMENT_ENTRY_INVALID)) { rc = 0; /* Already established */ goto out_free; + } else if (*table & _SEGMENT_ENTRY_ORIGIN) { + rc = -EAGAIN; /* Race with shadow */ + goto out_free; } + /* mark as invalid as long as the parent table is not protected */ *table = (unsigned long) s_pgt | _SEGMENT_ENTRY | - (pgt & _SEGMENT_ENTRY_PROTECT); + (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID; list_add(&page->lru, &sg->pt_list); spin_unlock(&sg->guest_table_lock); /* Make pgt read-only in parent gmap page table (not the pgste) */ raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT; origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK; rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ); - if (rc) { - spin_lock(&sg->guest_table_lock); + spin_lock(&sg->guest_table_lock); + if (!rc) { + table = gmap_table_walk(sg, saddr, 1); + if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != + (unsigned long) s_pgt) + rc = -EAGAIN; /* Race with unshadow */ + else + *table &= ~_SEGMENT_ENTRY_INVALID; + } else { gmap_unshadow_pgt(sg, raddr); - spin_unlock(&sg->guest_table_lock); } + spin_unlock(&sg->guest_table_lock); return rc; out_free: spin_unlock(&sg->guest_table_lock); From 0f7f84891516dc1ff7500fae12143710d2d9d11f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 8 Mar 2016 12:30:46 +0100 Subject: [PATCH 125/302] s390/mm: fix races on gmap_shadow creation Before any thread is allowed to use a gmap_shadow, it has to be fully initialized. However, for invalidation to work properly, we have to register the new gmap_shadow before we protect the parent gmap table. Because locking is tricky, and we have to avoid duplicate gmaps, let's introduce an initialized field, that signalizes other threads if that gmap_shadow can already be used or if they have to retry. Let's properly return errors using ERR_PTR() instead of simply returning NULL, so a caller can properly react on the error. Acked-by: Martin Schwidefsky Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/gmap.h | 2 ++ arch/s390/mm/gmap.c | 45 ++++++++++++++++++++++-------------- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 4a47055f58d7..54a2487efce4 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -27,6 +27,7 @@ * @parent: pointer to the parent gmap for shadow guest address spaces * @orig_asce: ASCE for which the shadow page table has been created * @removed: flag to indicate if a shadow guest address space has been removed + * @initialized: flag to indicate if a shadow guest address space can be used */ struct gmap { struct list_head list; @@ -49,6 +50,7 @@ struct gmap { struct gmap *parent; unsigned long orig_asce; bool removed; + bool initialized; }; /** diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index a396e58b5a43..a7dfb337e133 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -1384,7 +1384,8 @@ static void gmap_unshadow(struct gmap *sg) * @asce: ASCE for which the shadow table is created * * Returns the pointer to a gmap if a shadow table with the given asce is - * already available, otherwise NULL + * already available, ERR_PTR(-EAGAIN) if another one is just being created, + * otherwise NULL */ static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce) { @@ -1393,6 +1394,8 @@ static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce) list_for_each_entry(sg, &parent->children, list) { if (sg->orig_asce != asce || sg->removed) continue; + if (!sg->initialized) + return ERR_PTR(-EAGAIN); atomic_inc(&sg->ref_count); return sg; } @@ -1409,8 +1412,9 @@ static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce) * The shadow table will be removed automatically on any change to the * PTE mapping for the source table. * - * Returns a guest address space structure, NULL if out of memory or if - * anything goes wrong while protecting the top level pages. + * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory, + * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the + * parent gmap table could not be protected. */ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce) { @@ -1428,30 +1432,37 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce) limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); new = gmap_alloc(limit); if (!new) - return NULL; + return ERR_PTR(-ENOMEM); new->mm = parent->mm; new->parent = gmap_get(parent); new->orig_asce = asce; + new->initialized = false; + spin_lock(&parent->shadow_lock); + /* Recheck if another CPU created the same shadow */ + sg = gmap_find_shadow(parent, asce); + if (sg) { + spin_unlock(&parent->shadow_lock); + gmap_free(new); + return sg; + } + atomic_set(&new->ref_count, 2); + list_add(&new->list, &parent->children); + spin_unlock(&parent->shadow_lock); + /* protect after insertion, so it will get properly invalidated */ down_read(&parent->mm->mmap_sem); rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, ((asce & _ASCE_TABLE_LENGTH) + 1) * 4096, PROT_READ, PGSTE_VSIE_BIT); up_read(&parent->mm->mmap_sem); + spin_lock(&parent->shadow_lock); + new->initialized = true; if (rc) { - atomic_set(&new->ref_count, 2); - spin_lock(&parent->shadow_lock); - /* Recheck if another CPU created the same shadow */ - sg = gmap_find_shadow(parent, asce); - if (!sg) { - list_add(&new->list, &parent->children); - sg = new; - new = NULL; - } - spin_unlock(&parent->shadow_lock); - } - if (new) + list_del(&new->list); gmap_free(new); - return sg; + new = ERR_PTR(rc); + } + spin_unlock(&parent->shadow_lock); + return new; } EXPORT_SYMBOL_GPL(gmap_shadow); From e52f8b6112353e9e8eac64f082bfbc65e64bb2dd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 2 Feb 2016 12:26:00 +0100 Subject: [PATCH 126/302] s390/mm: take the mmap_sem in kvm_s390_shadow_fault() Instead of doing it in the caller, let's just take the mmap_sem in kvm_s390_shadow_fault(). By taking it as read, we allow parallel faulting on shadow page tables, gmap shadow code is prepared for that. Acked-by: Martin Schwidefsky Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/gaccess.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index c5f79c1205cf..5b5eee2d51cd 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -1091,26 +1091,24 @@ int kvm_s390_shadow_fault(struct gmap *sg, unsigned long saddr, int write) int dat_protection; int rc; + down_read(&sg->mm->mmap_sem); + rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection); - if (rc) { + if (rc) rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection); - if (rc) - return rc; - } vaddr.addr = saddr; - rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val); - if (rc) - return rc; - if (pte.i) - return PGM_PAGE_TRANSLATION; - if (pte.z || pte.co) - return PGM_TRANSLATION_SPEC; + if (!rc) + rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val); + if (!rc && pte.i) + rc = PGM_PAGE_TRANSLATION; + if (!rc && (pte.z || pte.co)) + rc = PGM_TRANSLATION_SPEC; dat_protection |= pte.p; - if (write && dat_protection) - return PGM_PROTECTION; - rc = gmap_shadow_page(sg, saddr, __pte(pte.val)); - if (rc) - return rc; - return 0; + if (!rc && write && dat_protection) + rc = PGM_PROTECTION; + if (!rc) + rc = gmap_shadow_page(sg, saddr, __pte(pte.val)); + up_read(&sg->mm->mmap_sem); + return rc; } From 7a6741576b268820c8bd2b66288e6ff3bc57d4a7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 27 Jan 2016 17:18:41 +0100 Subject: [PATCH 127/302] s390/mm: protection exceptions are corrrectly shadowed As gmap shadows contains correct protection permissions, protection exceptons can directly be forwarded to guest 3. If we would encounter a protection exception while faulting, the next guest 3 run will automatically handle that for us. Keep the dat_protection logic in place, as it will be helpful later. Acked-by: Martin Schwidefsky Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/gaccess.c | 6 +----- arch/s390/kvm/gaccess.h | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 5b5eee2d51cd..b2783dd71854 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -1075,7 +1075,6 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, * kvm_s390_shadow_fault - handle fault on a shadow page table * @sg: pointer to the shadow guest address space structure * @saddr: faulting address in the shadow gmap - * @write: =1 map r/w, =0 map r/o * * Returns: - 0 if the shadow fault was successfully resolved * - > 0 (pgm exception code) on exceptions while faulting @@ -1083,7 +1082,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, * - -EFAULT when accessing invalid guest addresses * - -ENOMEM if out of memory */ -int kvm_s390_shadow_fault(struct gmap *sg, unsigned long saddr, int write) +int kvm_s390_shadow_fault(struct gmap *sg, unsigned long saddr) { union vaddress vaddr; union page_table_entry pte; @@ -1104,9 +1103,6 @@ int kvm_s390_shadow_fault(struct gmap *sg, unsigned long saddr, int write) rc = PGM_PAGE_TRANSLATION; if (!rc && (pte.z || pte.co)) rc = PGM_TRANSLATION_SPEC; - dat_protection |= pte.p; - if (!rc && write && dat_protection) - rc = PGM_PROTECTION; if (!rc) rc = gmap_shadow_page(sg, saddr, __pte(pte.val)); up_read(&sg->mm->mmap_sem); diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index e5ec4734d42d..0d044d09dbd8 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -361,6 +361,6 @@ void ipte_unlock(struct kvm_vcpu *vcpu); int ipte_lock_held(struct kvm_vcpu *vcpu); int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra); -int kvm_s390_shadow_fault(struct gmap *shadow, unsigned long saddr, int write); +int kvm_s390_shadow_fault(struct gmap *shadow, unsigned long saddr); #endif /* __KVM_S390_GACCESS_H */ From f4debb40903978bbddfb9e877ca4d2f27e26567f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 27 Jan 2016 17:24:03 +0100 Subject: [PATCH 128/302] s390/mm: take ipte_lock during shadow faults Let's take the ipte_lock while working on guest 2 provided page table, just like the other gaccess functions. Acked-by: Martin Schwidefsky Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/gaccess.c | 11 ++++++++++- arch/s390/kvm/gaccess.h | 3 ++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index b2783dd71854..e70f916c1079 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -1073,6 +1073,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, /** * kvm_s390_shadow_fault - handle fault on a shadow page table + * @vcpu: virtual cpu * @sg: pointer to the shadow guest address space structure * @saddr: faulting address in the shadow gmap * @@ -1082,7 +1083,8 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, * - -EFAULT when accessing invalid guest addresses * - -ENOMEM if out of memory */ -int kvm_s390_shadow_fault(struct gmap *sg, unsigned long saddr) +int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, + unsigned long saddr) { union vaddress vaddr; union page_table_entry pte; @@ -1091,6 +1093,12 @@ int kvm_s390_shadow_fault(struct gmap *sg, unsigned long saddr) int rc; down_read(&sg->mm->mmap_sem); + /* + * We don't want any guest-2 tables to change - so the parent + * tables/pointers we read stay valid - unshadowing is however + * always possible - only guest_table_lock protects us. + */ + ipte_lock(vcpu); rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection); if (rc) @@ -1105,6 +1113,7 @@ int kvm_s390_shadow_fault(struct gmap *sg, unsigned long saddr) rc = PGM_TRANSLATION_SPEC; if (!rc) rc = gmap_shadow_page(sg, saddr, __pte(pte.val)); + ipte_unlock(vcpu); up_read(&sg->mm->mmap_sem); return rc; } diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 0d044d09dbd8..8756569ad938 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -361,6 +361,7 @@ void ipte_unlock(struct kvm_vcpu *vcpu); int ipte_lock_held(struct kvm_vcpu *vcpu); int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra); -int kvm_s390_shadow_fault(struct gmap *shadow, unsigned long saddr); +int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow, + unsigned long saddr); #endif /* __KVM_S390_GACCESS_H */ From 00fc062d5364174b94e3b5780c22e95c0fb4b60a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 18 Apr 2016 17:19:59 +0200 Subject: [PATCH 129/302] s390/mm: push ste protection down to shadow pte If a guest ste is read-only, it doesn't make sense to force the ptes in as writable in the host. If the source page is read-only in the host, it won't have to be made writable. Please note that if the source page is not available, it will still be faulted in writable. This can be changed internally later on. If ste protection is removed, underlying shadow tables are also removed, therefore this change does not affect the guest. Acked-by: Martin Schwidefsky Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/gaccess.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index e70f916c1079..a85bc6c6a098 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -1111,6 +1111,7 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, rc = PGM_PAGE_TRANSLATION; if (!rc && (pte.z || pte.co)) rc = PGM_TRANSLATION_SPEC; + pte.p |= dat_protection; if (!rc) rc = gmap_shadow_page(sg, saddr, __pte(pte.val)); ipte_unlock(vcpu); From 5b062bd4940f81e0bd26b0d75f56d7abebf0309f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 8 Mar 2016 12:17:40 +0100 Subject: [PATCH 130/302] s390/mm: prepare for EDAT1/EDAT2 support in gmap shadow In preparation for EDAT1/EDAT2 support for gmap shadows, we have to store the requested edat level in the gmap shadow. The edat level used during shadow translation is a property of the gmap shadow. Depending on that level, the gmap shadow will look differently for the same guest tables. We have to store it internally in order to support it later. Acked-by: Martin Schwidefsky Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/gmap.h | 5 ++++- arch/s390/mm/gmap.c | 16 +++++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 54a2487efce4..2ab397c8ca09 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -26,6 +26,7 @@ * @shadow_lock: spinlock to protect the shadow gmap list * @parent: pointer to the parent gmap for shadow guest address spaces * @orig_asce: ASCE for which the shadow page table has been created + * @edat_level: edat level to be used for the shadow translation * @removed: flag to indicate if a shadow guest address space has been removed * @initialized: flag to indicate if a shadow guest address space can be used */ @@ -49,6 +50,7 @@ struct gmap { spinlock_t shadow_lock; struct gmap *parent; unsigned long orig_asce; + int edat_level; bool removed; bool initialized; }; @@ -105,7 +107,8 @@ void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr) int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val); -struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce); +struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, + int edat_level); int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t); int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t); int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt); diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index a7dfb337e133..f0b2a531c599 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -1382,17 +1382,20 @@ static void gmap_unshadow(struct gmap *sg) * gmap_find_shadow - find a specific asce in the list of shadow tables * @parent: pointer to the parent gmap * @asce: ASCE for which the shadow table is created + * @edat_level: edat level to be used for the shadow translation * * Returns the pointer to a gmap if a shadow table with the given asce is * already available, ERR_PTR(-EAGAIN) if another one is just being created, * otherwise NULL */ -static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce) +static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, + int edat_level) { struct gmap *sg; list_for_each_entry(sg, &parent->children, list) { - if (sg->orig_asce != asce || sg->removed) + if (sg->orig_asce != asce || sg->edat_level != edat_level || + sg->removed) continue; if (!sg->initialized) return ERR_PTR(-EAGAIN); @@ -1406,6 +1409,7 @@ static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce) * gmap_shadow - create/find a shadow guest address space * @parent: pointer to the parent gmap * @asce: ASCE for which the shadow table is created + * @edat_level: edat level to be used for the shadow translation * * The pages of the top level page table referred by the asce parameter * will be set to read-only and marked in the PGSTEs of the kvm process. @@ -1416,7 +1420,8 @@ static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce) * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the * parent gmap table could not be protected. */ -struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce) +struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, + int edat_level) { struct gmap *sg, *new; unsigned long limit; @@ -1424,7 +1429,7 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce) BUG_ON(gmap_is_shadow(parent)); spin_lock(&parent->shadow_lock); - sg = gmap_find_shadow(parent, asce); + sg = gmap_find_shadow(parent, asce, edat_level); spin_unlock(&parent->shadow_lock); if (sg) return sg; @@ -1436,10 +1441,11 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce) new->mm = parent->mm; new->parent = gmap_get(parent); new->orig_asce = asce; + new->edat_level = edat_level; new->initialized = false; spin_lock(&parent->shadow_lock); /* Recheck if another CPU created the same shadow */ - sg = gmap_find_shadow(parent, asce); + sg = gmap_find_shadow(parent, asce, edat_level); if (sg) { spin_unlock(&parent->shadow_lock); gmap_free(new); From fd8d4e3ab6993e194287a59c4d3a6a43da86b8dc Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 18 Apr 2016 13:24:52 +0200 Subject: [PATCH 131/302] s390/mm: support EDAT1 for gmap shadows If the guest is enabled for EDAT1, we can easily create shadows for guest2 -> guest3 provided tables that make use of EDAT1. If guest2 references a 1MB page, this memory looks consecutive for guest2, but it might not be so for us. Therefore we have to create fake page tables. We can easily add that to our existing infrastructure. The invalidation mechanism will make sure that fake page tables are removed when the parent table (sgt table entry) is changed. As EDAT1 also introduced protection on all page table levels, we have to also shadow these correctly. We don't have to care about: - ACCF-Validity Control in STE - Access-Control Bits in STE - Fetch-Protection Bit in STE - Common-Segment Bit in STE As all bits might be dropped and there is no guaranteed that they are active ("unpredictable whether the CPU uses these bits", "may be used"). Without using EDAT1 in the shadow ourselfes (STE-format control == 0), simply shadowing these bits would not be enough. They would be ignored. Please note that we are using the "fake" flag to make this look consistent with further changes (EDAT2, real-space designation support) and don't let the shadow functions handle fc=1 stes. In the future, with huge pages in the host, gmap_shadow_pgt() could simply try to map a huge host page if "fake" is set to one and indicate via return value that no lower fake tables / shadow ptes are required. Acked-by: Martin Schwidefsky Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/gmap.h | 5 +++-- arch/s390/kvm/gaccess.c | 34 +++++++++++++++++++++++++++------- arch/s390/mm/gmap.c | 29 +++++++++++++++++++++++++---- 3 files changed, 55 insertions(+), 13 deletions(-) diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 2ab397c8ca09..c8ba5a197b1d 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -112,9 +112,10 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t); int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t); int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt); -int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt); +int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, + int fake); int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, - unsigned long *pgt, int *dat_protection); + unsigned long *pgt, int *dat_protection, int *fake); int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte); void gmap_register_pte_notifier(struct gmap_notifier *); diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index a85bc6c6a098..af1fc6fa7b74 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -953,9 +953,11 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra) * @sg: pointer to the shadow guest address space structure * @saddr: faulting address in the shadow gmap * @pgt: pointer to the page table address result + * @fake: pgt references contiguous guest memory block, not a pgtable */ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, - unsigned long *pgt, int *dat_protection) + unsigned long *pgt, int *dat_protection, + int *fake) { struct gmap *parent; union asce asce; @@ -963,6 +965,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, unsigned long ptr; int rc; + *fake = 0; parent = sg->parent; vaddr.addr = saddr; asce.val = sg->orig_asce; @@ -1060,10 +1063,20 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, if (ste.cs && asce.p) return PGM_TRANSLATION_SPEC; *dat_protection = ste.fc0.p; - rc = gmap_shadow_pgt(sg, saddr, ste.val); + if (ste.fc && sg->edat_level >= 1) { + bool prot = ste.fc1.p; + + *fake = 1; + ptr = ste.fc1.sfaa << 20UL; + ste.val = ptr; + ste.fc0.p = prot; + goto shadow_pgt; + } + ptr = ste.fc0.pto << 11UL; +shadow_pgt: + rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake); if (rc) return rc; - ptr = ste.fc0.pto * 2048; } } /* Return the parent address of the page table */ @@ -1089,7 +1102,7 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, union vaddress vaddr; union page_table_entry pte; unsigned long pgt; - int dat_protection; + int dat_protection, fake; int rc; down_read(&sg->mm->mmap_sem); @@ -1100,17 +1113,24 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, */ ipte_lock(vcpu); - rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection); + rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake); if (rc) - rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection); + rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection, + &fake); vaddr.addr = saddr; + if (fake) { + /* offset in 1MB guest memory block */ + pte.val = pgt + ((unsigned long) vaddr.px << 12UL); + goto shadow_page; + } if (!rc) rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val); if (!rc && pte.i) rc = PGM_PAGE_TRANSLATION; - if (!rc && (pte.z || pte.co)) + if (!rc && (pte.z || (pte.co && sg->edat_level < 1))) rc = PGM_TRANSLATION_SPEC; +shadow_page: pte.p |= dat_protection; if (!rc) rc = gmap_shadow_page(sg, saddr, __pte(pte.val)); diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index f0b2a531c599..de7ad7bd4a48 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -20,6 +20,8 @@ #include #include +#define GMAP_SHADOW_FAKE_TABLE 1ULL + /** * gmap_alloc - allocate and initialize a guest address space * @mm: pointer to the parent mm_struct @@ -1521,6 +1523,8 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t) /* mark as invalid as long as the parent table is not protected */ *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= (r2t & _REGION_ENTRY_PROTECT); list_add(&page->lru, &sg->crst_list); spin_unlock(&sg->guest_table_lock); /* Make r2t read-only in parent gmap page table */ @@ -1592,6 +1596,8 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t) /* mark as invalid as long as the parent table is not protected */ *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= (r3t & _REGION_ENTRY_PROTECT); list_add(&page->lru, &sg->crst_list); spin_unlock(&sg->guest_table_lock); /* Make r3t read-only in parent gmap page table */ @@ -1664,6 +1670,8 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt) /* mark as invalid as long as the parent table is not protected */ *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; + if (sg->edat_level >= 1) + *table |= sgt & _REGION_ENTRY_PROTECT; list_add(&page->lru, &sg->crst_list); spin_unlock(&sg->guest_table_lock); /* Make sgt read-only in parent gmap page table */ @@ -1698,6 +1706,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_sgt); * @saddr: the address in the shadow aguest address space * @pgt: parent gmap address of the page table to get shadowed * @dat_protection: if the pgtable is marked as protected by dat + * @fake: pgt references contiguous guest memory block, not a pgtable * * Returns 0 if the shadow page table was found and -EAGAIN if the page * table was not found. @@ -1705,7 +1714,8 @@ EXPORT_SYMBOL_GPL(gmap_shadow_sgt); * Called with sg->mm->mmap_sem in read. */ int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, - unsigned long *pgt, int *dat_protection) + unsigned long *pgt, int *dat_protection, + int *fake) { unsigned long *table; struct page *page; @@ -1717,8 +1727,9 @@ int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { /* Shadow page tables are full pages (pte+pgste) */ page = pfn_to_page(*table >> PAGE_SHIFT); - *pgt = page->index; + *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE; *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); + *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE); rc = 0; } else { rc = -EAGAIN; @@ -1734,6 +1745,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup); * @sg: pointer to the shadow guest address space structure * @saddr: faulting address in the shadow gmap * @pgt: parent gmap address of the page table to get shadowed + * @fake: pgt references contiguous guest memory block, not a pgtable * * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the * shadow table structure is incomplete, -ENOMEM if out of memory, @@ -1741,19 +1753,22 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup); * * Called with gmap->mm->mmap_sem in read */ -int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt) +int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, + int fake) { unsigned long raddr, origin; unsigned long *s_pgt, *table; struct page *page; int rc; - BUG_ON(!gmap_is_shadow(sg)); + BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE)); /* Allocate a shadow page table */ page = page_table_alloc_pgste(sg->mm); if (!page) return -ENOMEM; page->index = pgt & _SEGMENT_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; s_pgt = (unsigned long *) page_to_phys(page); /* Install shadow page table */ spin_lock(&sg->guest_table_lock); @@ -1773,6 +1788,12 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt) *table = (unsigned long) s_pgt | _SEGMENT_ENTRY | (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID; list_add(&page->lru, &sg->pt_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_SEGMENT_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } spin_unlock(&sg->guest_table_lock); /* Make pgt read-only in parent gmap page table (not the pgste) */ raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT; From 18b89809881834cecd2977e6048a30c4c8f140fe Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 18 Apr 2016 13:42:05 +0200 Subject: [PATCH 132/302] s390/mm: support EDAT2 for gmap shadows If the guest is enabled for EDAT2, we can easily create shadows for guest2 -> guest3 provided tables that make use of EDAT2. If guest2 references a 2GB page, this memory looks consecutive for guest2, but it does not have to be so for us. Therefore we have to create fake segment and page tables. This works just like EDAT1 support, so page tables are removed when the parent table (r3t table entry) is changed. We don't hve to care about: - ACCF-Validity Control in RTTE - Access-Control Bits in RTTE - Fetch-Protection Bit in RTTE - Common-Region Bit in RTTE Just like for EDAT1, all bits might be dropped and there is no guaranteed that they are active. Acked-by: Martin Schwidefsky Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/gmap.h | 3 ++- arch/s390/kvm/gaccess.c | 22 ++++++++++++++++++++-- arch/s390/mm/gmap.c | 14 ++++++++++++-- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index c8ba5a197b1d..2e4c3b222a96 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -111,7 +111,8 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level); int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t); int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t); -int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt); +int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, + int fake); int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, int fake); int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index af1fc6fa7b74..fab03ecb5bd5 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -1042,17 +1042,35 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, return PGM_REGION_THIRD_TRANS; if (rtte.tt != TABLE_TYPE_REGION3) return PGM_TRANSLATION_SPEC; + if (rtte.cr && asce.p && sg->edat_level >= 2) + return PGM_TRANSLATION_SPEC; + if (rtte.fc && sg->edat_level >= 2) { + bool prot = rtte.fc1.p; + + *fake = 1; + ptr = rtte.fc1.rfaa << 31UL; + rtte.val = ptr; + rtte.fc0.p = prot; + goto shadow_sgt; + } if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl) return PGM_SEGMENT_TRANSLATION; - rc = gmap_shadow_sgt(sg, saddr, rtte.val); + ptr = rtte.fc0.sto << 12UL; +shadow_sgt: + rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake); if (rc) return rc; - ptr = rtte.fc0.sto * 4096; /* fallthrough */ } case ASCE_TYPE_SEGMENT: { union segment_table_entry ste; + if (*fake) { + /* offset in 2G guest memory block */ + ptr = ptr + ((unsigned long) vaddr.sx << 20UL); + ste.val = ptr; + goto shadow_pgt; + } rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val); if (rc) return rc; diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index de7ad7bd4a48..c96bf30245c0 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -1631,6 +1631,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r3t); * @sg: pointer to the shadow guest address space structure * @saddr: faulting address in the shadow gmap * @sgt: parent gmap address of the segment table to get shadowed + * @fake: sgt references contiguous guest memory block, not a sgt * * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the * shadow table structure is incomplete, -ENOMEM if out of memory and @@ -1638,19 +1639,22 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r3t); * * Called with sg->mm->mmap_sem in read. */ -int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt) +int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, + int fake) { unsigned long raddr, origin, offset, len; unsigned long *s_sgt, *table; struct page *page; int rc; - BUG_ON(!gmap_is_shadow(sg)); + BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE)); /* Allocate a shadow segment table */ page = alloc_pages(GFP_KERNEL, 2); if (!page) return -ENOMEM; page->index = sgt & _REGION_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; s_sgt = (unsigned long *) page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); @@ -1673,6 +1677,12 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt) if (sg->edat_level >= 1) *table |= sgt & _REGION_ENTRY_PROTECT; list_add(&page->lru, &sg->crst_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } spin_unlock(&sg->guest_table_lock); /* Make sgt read-only in parent gmap page table */ raddr = (saddr & 0xffffffff80000000UL) | _SHADOW_RMAP_REGION3; From 1c65781b56ce812ce9729bf414201921c9408678 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 18 Apr 2016 17:46:21 +0200 Subject: [PATCH 133/302] s390/mm: push rte protection down to shadow pte Just like we already do with ste protection, let's take rte protection into account. This way, the host pte doesn't have to be mapped writable. Acked-by: Martin Schwidefsky Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/gaccess.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index fab03ecb5bd5..f6d556dfafcd 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -966,6 +966,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, int rc; *fake = 0; + *dat_protection = 0; parent = sg->parent; vaddr.addr = saddr; asce.val = sg->orig_asce; @@ -1008,6 +1009,8 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, return PGM_TRANSLATION_SPEC; if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl) return PGM_REGION_SECOND_TRANS; + if (sg->edat_level >= 1) + *dat_protection |= rfte.p; rc = gmap_shadow_r2t(sg, saddr, rfte.val); if (rc) return rc; @@ -1026,6 +1029,9 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, return PGM_TRANSLATION_SPEC; if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl) return PGM_REGION_THIRD_TRANS; + if (sg->edat_level >= 1) + *dat_protection |= rste.p; + rste.p |= *dat_protection; rc = gmap_shadow_r3t(sg, saddr, rste.val); if (rc) return rc; @@ -1045,18 +1051,19 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, if (rtte.cr && asce.p && sg->edat_level >= 2) return PGM_TRANSLATION_SPEC; if (rtte.fc && sg->edat_level >= 2) { - bool prot = rtte.fc1.p; - + *dat_protection |= rtte.fc0.p; *fake = 1; ptr = rtte.fc1.rfaa << 31UL; rtte.val = ptr; - rtte.fc0.p = prot; goto shadow_sgt; } if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl) return PGM_SEGMENT_TRANSLATION; + if (sg->edat_level >= 1) + *dat_protection |= rtte.fc0.p; ptr = rtte.fc0.sto << 12UL; shadow_sgt: + rtte.fc0.p |= *dat_protection; rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake); if (rc) return rc; @@ -1080,18 +1087,16 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, return PGM_TRANSLATION_SPEC; if (ste.cs && asce.p) return PGM_TRANSLATION_SPEC; - *dat_protection = ste.fc0.p; + *dat_protection |= ste.fc0.p; if (ste.fc && sg->edat_level >= 1) { - bool prot = ste.fc1.p; - *fake = 1; ptr = ste.fc1.sfaa << 20UL; ste.val = ptr; - ste.fc0.p = prot; goto shadow_pgt; } ptr = ste.fc0.pto << 11UL; shadow_pgt: + ste.fc0.p |= *dat_protection; rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake); if (rc) return rc; From 3218f7094b6b583f4f01bffcf84572c6beacdcc2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 18 Apr 2016 16:22:24 +0200 Subject: [PATCH 134/302] s390/mm: support real-space for gmap shadows We can easily support real-space designation just like EDAT1 and EDAT2. So guest2 can provide for guest3 an asce with the real-space control being set. We simply have to allocate the biggest page table possible and fake all levels. There is no protection to consider. If we exceed guest memory, vsie code will inject an addressing exception (via program intercept). In the future, we could limit the fake table level to the gmap page table. As the top level page table can never go away, such gmap shadows will never get unshadowed, we'll have to come up with another way to limit the number of kept gmap shadows. Acked-by: Martin Schwidefsky Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/gmap.h | 6 ++++-- arch/s390/kvm/gaccess.c | 34 +++++++++++++++++++++++++++++----- arch/s390/mm/gmap.c | 35 ++++++++++++++++++++++++++++++++--- 3 files changed, 65 insertions(+), 10 deletions(-) diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 2e4c3b222a96..752cf47a81ab 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -109,8 +109,10 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val); struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level); -int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t); -int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t); +int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, + int fake); +int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, + int fake); int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, int fake); int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index f6d556dfafcd..54200208bf24 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -971,9 +971,13 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, vaddr.addr = saddr; asce.val = sg->orig_asce; ptr = asce.origin * 4096; + if (asce.r) { + *fake = 1; + asce.dt = ASCE_TYPE_REGION1; + } switch (asce.dt) { case ASCE_TYPE_REGION1: - if (vaddr.rfx01 > asce.tl) + if (vaddr.rfx01 > asce.tl && !asce.r) return PGM_REGION_FIRST_TRANS; break; case ASCE_TYPE_REGION2: @@ -1000,6 +1004,12 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, case ASCE_TYPE_REGION1: { union region1_table_entry rfte; + if (*fake) { + /* offset in 16EB guest memory block */ + ptr = ptr + ((unsigned long) vaddr.rsx << 53UL); + rfte.val = ptr; + goto shadow_r2t; + } rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val); if (rc) return rc; @@ -1011,15 +1021,22 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, return PGM_REGION_SECOND_TRANS; if (sg->edat_level >= 1) *dat_protection |= rfte.p; - rc = gmap_shadow_r2t(sg, saddr, rfte.val); + ptr = rfte.rto << 12UL; +shadow_r2t: + rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake); if (rc) return rc; - ptr = rfte.rto * 4096; /* fallthrough */ } case ASCE_TYPE_REGION2: { union region2_table_entry rste; + if (*fake) { + /* offset in 8PB guest memory block */ + ptr = ptr + ((unsigned long) vaddr.rtx << 42UL); + rste.val = ptr; + goto shadow_r3t; + } rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val); if (rc) return rc; @@ -1031,16 +1048,23 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, return PGM_REGION_THIRD_TRANS; if (sg->edat_level >= 1) *dat_protection |= rste.p; + ptr = rste.rto << 12UL; +shadow_r3t: rste.p |= *dat_protection; - rc = gmap_shadow_r3t(sg, saddr, rste.val); + rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake); if (rc) return rc; - ptr = rste.rto * 4096; /* fallthrough */ } case ASCE_TYPE_REGION3: { union region3_table_entry rtte; + if (*fake) { + /* offset in 4TB guest memory block */ + ptr = ptr + ((unsigned long) vaddr.sx << 31UL); + rtte.val = ptr; + goto shadow_sgt; + } rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val); if (rc) return rc; diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index c96bf30245c0..c07d64f5cdb5 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -1437,6 +1437,8 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, return sg; /* Create a new shadow gmap */ limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); + if (asce & _ASCE_REAL_SPACE) + limit = -1UL; new = gmap_alloc(limit); if (!new) return ERR_PTR(-ENOMEM); @@ -1455,6 +1457,12 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, } atomic_set(&new->ref_count, 2); list_add(&new->list, &parent->children); + if (asce & _ASCE_REAL_SPACE) { + /* nothing to protect, return right away */ + new->initialized = true; + spin_unlock(&parent->shadow_lock); + return new; + } spin_unlock(&parent->shadow_lock); /* protect after insertion, so it will get properly invalidated */ down_read(&parent->mm->mmap_sem); @@ -1479,6 +1487,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow); * @sg: pointer to the shadow guest address space structure * @saddr: faulting address in the shadow gmap * @r2t: parent gmap address of the region 2 table to get shadowed + * @fake: r2t references contiguous guest memory block, not a r2t * * The r2t parameter specifies the address of the source table. The * four pages of the source table are made read-only in the parent gmap @@ -1491,7 +1500,8 @@ EXPORT_SYMBOL_GPL(gmap_shadow); * * Called with sg->mm->mmap_sem in read. */ -int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t) +int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, + int fake) { unsigned long raddr, origin, offset, len; unsigned long *s_r2t, *table; @@ -1504,6 +1514,8 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t) if (!page) return -ENOMEM; page->index = r2t & _REGION_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; s_r2t = (unsigned long *) page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); @@ -1526,6 +1538,12 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t) if (sg->edat_level >= 1) *table |= (r2t & _REGION_ENTRY_PROTECT); list_add(&page->lru, &sg->crst_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } spin_unlock(&sg->guest_table_lock); /* Make r2t read-only in parent gmap page table */ raddr = (saddr & 0xffe0000000000000UL) | _SHADOW_RMAP_REGION1; @@ -1558,6 +1576,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r2t); * @sg: pointer to the shadow guest address space structure * @saddr: faulting address in the shadow gmap * @r3t: parent gmap address of the region 3 table to get shadowed + * @fake: r3t references contiguous guest memory block, not a r3t * * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the * shadow table structure is incomplete, -ENOMEM if out of memory and @@ -1565,7 +1584,8 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r2t); * * Called with sg->mm->mmap_sem in read. */ -int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t) +int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, + int fake) { unsigned long raddr, origin, offset, len; unsigned long *s_r3t, *table; @@ -1578,6 +1598,8 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t) if (!page) return -ENOMEM; page->index = r3t & _REGION_ENTRY_ORIGIN; + if (fake) + page->index |= GMAP_SHADOW_FAKE_TABLE; s_r3t = (unsigned long *) page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); @@ -1599,6 +1621,12 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t) if (sg->edat_level >= 1) *table |= (r3t & _REGION_ENTRY_PROTECT); list_add(&page->lru, &sg->crst_list); + if (fake) { + /* nothing to protect for fake tables */ + *table &= ~_REGION_ENTRY_INVALID; + spin_unlock(&sg->guest_table_lock); + return 0; + } spin_unlock(&sg->guest_table_lock); /* Make r3t read-only in parent gmap page table */ raddr = (saddr & 0xfffffc0000000000UL) | _SHADOW_RMAP_REGION2; @@ -1932,7 +1960,8 @@ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, /* Check for top level table */ start = sg->orig_asce & _ASCE_ORIGIN; end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * 4096; - if (gaddr >= start && gaddr < end) { + if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start && + gaddr < end) { /* The complete shadow table has to go */ gmap_unshadow(sg); spin_unlock(&sg->guest_table_lock); From 717c05554afa69a36398a57dac64b95972f138d5 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 2 May 2016 12:10:17 +0200 Subject: [PATCH 135/302] s390/mm: limit number of real-space gmap shadows We have no known user of real-space designation and only support it to be architecture compliant. Gmap shadows with real-space designation are never unshadowed automatically, as there is nothing to protect for the top level table. So let's simply limit the number of such shadows to one by removing existing ones on creation of another one. Acked-by: Martin Schwidefsky Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/mm/gmap.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index c07d64f5cdb5..4a1434bc2f0e 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -1455,6 +1455,19 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, gmap_free(new); return sg; } + if (asce & _ASCE_REAL_SPACE) { + /* only allow one real-space gmap shadow */ + list_for_each_entry(sg, &parent->children, list) { + if (sg->orig_asce & _ASCE_REAL_SPACE) { + spin_lock(&sg->guest_table_lock); + gmap_unshadow(sg); + spin_unlock(&sg->guest_table_lock); + list_del(&sg->list); + gmap_put(sg); + break; + } + } + } atomic_set(&new->ref_count, 2); list_add(&new->list, &parent->children); if (asce & _ASCE_REAL_SPACE) { From 4a49443924731823da2e9b3ae9311b74a34e7ed8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 8 Mar 2016 12:31:52 +0100 Subject: [PATCH 136/302] s390/mm: remember the int code for the last gmap fault For nested virtualization, we want to know if we are handling a protection exception, because these can directly be forwarded to the guest without additional checks. Acked-by: Martin Schwidefsky Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/processor.h | 1 + arch/s390/mm/fault.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 94c80b6d031d..8c2922f540f9 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -110,6 +110,7 @@ struct thread_struct { mm_segment_t mm_segment; unsigned long gmap_addr; /* address of last gmap fault. */ unsigned int gmap_write_flag; /* gmap fault write indication */ + unsigned int gmap_int_code; /* int code of last gmap fault */ unsigned int gmap_pfault; /* signal of a pending guest pfault */ struct per_regs per_user; /* User specified PER registers */ struct per_event per_event; /* Cause of the last PER trap */ diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index b84416c11c43..730e0d3aa840 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -419,6 +419,7 @@ static inline int do_exception(struct pt_regs *regs, int access) if (gmap) { current->thread.gmap_addr = address; current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE); + current->thread.gmap_int_code = regs->int_code & 0xffff; address = __gmap_translate(gmap, address); if (address == -EFAULT) { fault = VM_FAULT_BADMAP; From 5b6c963bcef5c3a857e3f8ba84aa9380069fc95f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 27 May 2016 18:57:33 +0200 Subject: [PATCH 137/302] s390/mm: allow to check if a gmap shadow is valid It will be very helpful to have a mechanism to check without any locks if a given gmap shadow is still valid and matches the given properties. Acked-by: Martin Schwidefsky Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/gmap.h | 1 + arch/s390/mm/gmap.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 752cf47a81ab..c67fb854705e 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -109,6 +109,7 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val); struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level); +int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level); int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, int fake); int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 4a1434bc2f0e..d00e4abb559e 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -1407,6 +1407,26 @@ static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, return NULL; } +/** + * gmap_shadow_valid - check if a shadow guest address space matches the + * given properties and is still valid + * @sg: pointer to the shadow guest address space structure + * @asce: ASCE for which the shadow table is requested + * @edat_level: edat level to be used for the shadow translation + * + * Returns 1 if the gmap shadow is still valid and matches the given + * properties, the caller can continue using it. Returns 0 otherwise, the + * caller has to request a new shadow gmap in this case. + * + */ +int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) +{ + if (sg->removed) + return 0; + return sg->orig_asce == asce && sg->edat_level == edat_level; +} +EXPORT_SYMBOL_GPL(gmap_shadow_valid); + /** * gmap_shadow - create/find a shadow guest address space * @parent: pointer to the parent gmap From 01f719176f28016da1b588f6560a4eef18a98a93 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 13 Jun 2016 10:49:04 +0200 Subject: [PATCH 138/302] s390/mm: don't fault everything in read-write in gmap_pte_op_fixup() Let's not fault in everything in read-write but limit it to read-only where possible. When restricting access rights, we already have the required protection level in our hands. When reading from guest 2 storage (gmap_read_table), it is obviously PROT_READ. When shadowing a pte, the required protection level is given via the guest 2 provided pte. Based on an initial patch by Martin Schwidefsky. Acked-by: Martin Schwidefsky Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/mm/gmap.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index d00e4abb559e..738d75495e56 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -811,19 +811,22 @@ static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr, * @gmap: pointer to guest mapping meta data structure * @gaddr: virtual address in the guest address space * @vmaddr: address in the host process address space + * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE * * Returns 0 if the caller can retry __gmap_translate (might fail again), * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing * up or connecting the gmap page table. */ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, - unsigned long vmaddr) + unsigned long vmaddr, int prot) { struct mm_struct *mm = gmap->mm; + unsigned int fault_flags; bool unlocked = false; BUG_ON(gmap_is_shadow(gmap)); - if (fixup_user_fault(current, mm, vmaddr, FAULT_FLAG_WRITE, &unlocked)) + fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0; + if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked)) return -EFAULT; if (unlocked) /* lost mmap_sem, caller has to retry __gmap_translate */ @@ -875,7 +878,7 @@ static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, vmaddr = __gmap_translate(gmap, gaddr); if (IS_ERR_VALUE(vmaddr)) return vmaddr; - rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr); + rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot); if (rc) return rc; continue; @@ -957,7 +960,7 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) rc = vmaddr; break; } - rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr); + rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ); if (rc) break; } @@ -1041,7 +1044,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, radix_tree_preload_end(); if (rc) { kfree(rmap); - rc = gmap_pte_op_fixup(parent, paddr, vmaddr); + rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot); if (rc) return rc; continue; @@ -1910,10 +1913,12 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) unsigned long vmaddr, paddr; spinlock_t *ptl; pte_t *sptep, *tptep; + int prot; int rc; BUG_ON(!gmap_is_shadow(sg)); parent = sg->parent; + prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE; rmap = kzalloc(sizeof(*rmap), GFP_KERNEL); if (!rmap) @@ -1955,7 +1960,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) radix_tree_preload_end(); if (!rc) break; - rc = gmap_pte_op_fixup(parent, paddr, vmaddr); + rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot); if (rc) break; } From 65d0b0d4bcc67b596d8e7286c3bebf24c59ade6a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 27 Apr 2015 16:29:34 +0200 Subject: [PATCH 139/302] KVM: s390: fast path for shadow gmaps in gmap notifier The default kvm gmap notifier doesn't have to handle shadow gmaps. So let's just directly exit in case we get notified about one. Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 9dd52980605c..45a8316ba1eb 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1986,6 +1986,8 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, unsigned long prefix; int i; + if (gmap_is_shadow(gmap)) + return; if (start >= 1UL << 31) /* We are only interested in prefix pages */ return; From 37d9df98b71afdf3baf41ee5451b6206c13328c6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 11 Mar 2015 16:47:33 +0100 Subject: [PATCH 140/302] KVM: s390: backup the currently enabled gmap when scheduled out Nested virtualization will have to enable own gmaps. Current code would enable the wrong gmap whenever scheduled out and back in, therefore resulting in the wrong gmap being enabled. This patch reenables the last enabled gmap, therefore avoiding having to touch vcpu->arch.gmap when enabling a different gmap. Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/gmap.h | 1 + arch/s390/include/asm/kvm_host.h | 2 ++ arch/s390/kvm/kvm-s390.c | 8 +++++--- arch/s390/mm/gmap.c | 11 +++++++++++ 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index c67fb854705e..741ddba0bf11 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -94,6 +94,7 @@ void gmap_put(struct gmap *gmap); void gmap_enable(struct gmap *gmap); void gmap_disable(struct gmap *gmap); +struct gmap *gmap_get_enabled(void); int gmap_map_segment(struct gmap *gmap, unsigned long from, unsigned long to, unsigned long len); int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len); diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 9eed5c18a61c..96bef30e2e33 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -551,6 +551,8 @@ struct kvm_vcpu_arch { struct hrtimer ckc_timer; struct kvm_s390_pgm_info pgm; struct gmap *gmap; + /* backup location for the currently enabled gmap when scheduled out */ + struct gmap *enabled_gmap; struct kvm_guestdbg_info_arch guestdbg; unsigned long pfault_token; unsigned long pfault_select; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 45a8316ba1eb..a890f7d20711 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1719,7 +1719,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) save_access_regs(vcpu->arch.host_acrs); restore_access_regs(vcpu->run->s.regs.acrs); - gmap_enable(vcpu->arch.gmap); + gmap_enable(vcpu->arch.enabled_gmap); atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu)) __start_cpu_timer_accounting(vcpu); @@ -1732,7 +1732,8 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu)) __stop_cpu_timer_accounting(vcpu); atomic_andnot(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); - gmap_disable(vcpu->arch.gmap); + vcpu->arch.enabled_gmap = gmap_get_enabled(); + gmap_disable(vcpu->arch.enabled_gmap); /* Save guest register state */ save_fpu_regs(); @@ -1781,7 +1782,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) vcpu->arch.gmap = vcpu->kvm->arch.gmap; sca_add_vcpu(vcpu); } - + /* make vcpu_load load the right gmap on the first trigger */ + vcpu->arch.enabled_gmap = vcpu->arch.gmap; } static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 738d75495e56..af0ae6d7ac59 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -270,6 +270,17 @@ void gmap_disable(struct gmap *gmap) } EXPORT_SYMBOL_GPL(gmap_disable); +/** + * gmap_get_enabled - get a pointer to the currently enabled gmap + * + * Returns a pointer to the currently enabled gmap. 0 if none is enabled. + */ +struct gmap *gmap_get_enabled(void) +{ + return (struct gmap *) S390_lowcore.gmap; +} +EXPORT_SYMBOL_GPL(gmap_get_enabled); + /* * gmap_alloc_table is assumed to be called with mmap_sem held */ From 3d84683bd737e397ae200e881a3230e469c59ad6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 16 Nov 2015 10:45:03 +0100 Subject: [PATCH 141/302] s390: introduce page_to_virt() and pfn_to_virt() Acked-by: Heiko Carstens Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/page.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index f874e7d51c19..b5edff32e547 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -147,6 +147,8 @@ static inline int devmem_is_allowed(unsigned long pfn) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#define pfn_to_virt(pfn) __va((pfn) << PAGE_SHIFT) +#define page_to_virt(page) pfn_to_virt(page_to_pfn(page)) #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) From df9b2b4a4aa49f874f8507680a533369e4b9c378 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 20 Jun 2016 12:09:41 +0200 Subject: [PATCH 142/302] mm/page_ref: introduce page_ref_inc_return Let's introduce that helper. Signed-off-by: David Hildenbrand Acked-by: Andrew Morton Signed-off-by: Christian Borntraeger --- include/linux/page_ref.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index 8b5e0a9f2431..610e13271918 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -124,6 +124,15 @@ static inline int page_ref_sub_and_test(struct page *page, int nr) return ret; } +static inline int page_ref_inc_return(struct page *page) +{ + int ret = atomic_inc_return(&page->_refcount); + + if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return)) + __page_ref_mod_and_return(page, 1, ret); + return ret; +} + static inline int page_ref_dec_and_test(struct page *page) { int ret = atomic_dec_and_test(&page->_refcount); From a3508fbe9dc6dd3bece0c7bf889cc085a011738c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 8 Jul 2015 13:19:48 +0200 Subject: [PATCH 143/302] KVM: s390: vsie: initial support for nested virtualization This patch adds basic support for nested virtualization on s390x, called VSIE (virtual SIE) and allows it to be used by the guest if the necessary facilities are supported by the hardware and enabled for the guest. In order to make this work, we have to shadow the sie control block provided by guest 2. In order to gain some performance, we have to reuse the same shadow blocks as good as possible. For now, we allow as many shadow blocks as we have VCPUs (that way, every VCPU can run the VSIE concurrently). We have to watch out for the prefix getting unmapped out of our shadow gmap and properly get the VCPU out of VSIE in that case, to fault the prefix pages back in. We use the PROG_REQUEST bit for that purpose. This patch is based on an initial prototype by Tobias Elpelt. Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 15 +- arch/s390/include/uapi/asm/kvm.h | 1 + arch/s390/kvm/Makefile | 2 +- arch/s390/kvm/kvm-s390.c | 15 + arch/s390/kvm/kvm-s390.h | 7 + arch/s390/kvm/priv.c | 1 + arch/s390/kvm/vsie.c | 755 +++++++++++++++++++++++++++++++ 7 files changed, 794 insertions(+), 2 deletions(-) create mode 100644 arch/s390/kvm/vsie.c diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 96bef30e2e33..255609c86901 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -145,7 +145,7 @@ struct kvm_s390_sie_block { __u64 cputm; /* 0x0028 */ __u64 ckc; /* 0x0030 */ __u64 epoch; /* 0x0038 */ - __u8 reserved40[4]; /* 0x0040 */ + __u32 svcc; /* 0x0040 */ #define LCTL_CR0 0x8000 #define LCTL_CR6 0x0200 #define LCTL_CR9 0x0040 @@ -167,6 +167,9 @@ struct kvm_s390_sie_block { #define ICPT_INST 0x04 #define ICPT_PROGI 0x08 #define ICPT_INSTPROGI 0x0C +#define ICPT_EXTINT 0x14 +#define ICPT_VALIDITY 0x20 +#define ICPT_STOP 0x28 #define ICPT_OPEREXC 0x2C #define ICPT_PARTEXEC 0x38 #define ICPT_IOINST 0x40 @@ -281,6 +284,7 @@ struct kvm_vcpu_stat { u32 instruction_stsi; u32 instruction_stfl; u32 instruction_tprot; + u32 instruction_sie; u32 instruction_essa; u32 instruction_sthyi; u32 instruction_sigp_sense; @@ -637,6 +641,14 @@ struct sie_page2 { u8 reserved900[0x1000 - 0x900]; /* 0x0900 */ } __packed; +struct kvm_s390_vsie { + struct mutex mutex; + struct radix_tree_root addr_to_page; + int page_count; + int next; + struct page *pages[KVM_MAX_VCPUS]; +}; + struct kvm_arch{ void *sca; int use_esca; @@ -661,6 +673,7 @@ struct kvm_arch{ struct sie_page2 *sie_page2; struct kvm_s390_cpu_model model; struct kvm_s390_crypto crypto; + struct kvm_s390_vsie vsie; u64 epoch; /* subset of available cpu features enabled by user space */ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index f0818d70d73d..62423b1931c0 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -98,6 +98,7 @@ struct kvm_s390_vm_cpu_machine { #define KVM_S390_VM_CPU_FEAT_NR_BITS 1024 #define KVM_S390_VM_CPU_FEAT_ESOP 0 +#define KVM_S390_VM_CPU_FEAT_SIEF2 1 struct kvm_s390_vm_cpu_feat { __u64 feat[16]; }; diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 82e73e2b953d..09a9e6dfc09f 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -12,6 +12,6 @@ common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqch ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o -kvm-objs += diag.o gaccess.o guestdbg.o sthyi.o +kvm-objs += diag.o gaccess.o guestdbg.o sthyi.o vsie.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index a890f7d20711..3fb124226e97 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -99,6 +99,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "instruction_stfl", VCPU_STAT(instruction_stfl) }, { "instruction_tprot", VCPU_STAT(instruction_tprot) }, { "instruction_sthyi", VCPU_STAT(instruction_sthyi) }, + { "instruction_sie", VCPU_STAT(instruction_sie) }, { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) }, { "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) }, { "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) }, @@ -142,6 +143,7 @@ static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS) static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc; static struct gmap_notifier gmap_notifier; +static struct gmap_notifier vsie_gmap_notifier; debug_info_t *kvm_s390_dbf; /* Section: not file related */ @@ -187,6 +189,8 @@ int kvm_arch_hardware_setup(void) { gmap_notifier.notifier_call = kvm_gmap_notifier; gmap_register_pte_notifier(&gmap_notifier); + vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier; + gmap_register_pte_notifier(&vsie_gmap_notifier); atomic_notifier_chain_register(&s390_epoch_delta_notifier, &kvm_clock_notifier); return 0; @@ -195,6 +199,7 @@ int kvm_arch_hardware_setup(void) void kvm_arch_hardware_unsetup(void) { gmap_unregister_pte_notifier(&gmap_notifier); + gmap_unregister_pte_notifier(&vsie_gmap_notifier); atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, &kvm_clock_notifier); } @@ -252,6 +257,14 @@ static void kvm_s390_cpu_feat_init(void) if (MACHINE_HAS_ESOP) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP); + /* + * We need SIE support, ESOP (PROT_READ protection for gmap_shadow), + * 64bit SCAO (SCA passthrough) and IDTE (for gmap_shadow unshadowing). + */ + if (!sclp.has_sief2 || !MACHINE_HAS_ESOP || !sclp.has_64bscao || + !test_facility(3)) + return; + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2); } int kvm_arch_init(void *opaque) @@ -1406,6 +1419,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.epoch = 0; spin_lock_init(&kvm->arch.start_stop_lock); + kvm_s390_vsie_init(kvm); KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid); return 0; @@ -1463,6 +1477,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) gmap_remove(kvm->arch.gmap); kvm_s390_destroy_adapters(kvm); kvm_s390_clear_float_irqs(kvm); + kvm_s390_vsie_destroy(kvm); KVM_EVENT(3, "vm 0x%pK destroyed", kvm); } diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 52aa47e112d8..b137fbaac91c 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -252,6 +252,13 @@ int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu); int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu); int kvm_s390_handle_eb(struct kvm_vcpu *vcpu); +/* implemented in vsie.c */ +int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu); +void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, + unsigned long end); +void kvm_s390_vsie_init(struct kvm *kvm); +void kvm_s390_vsie_destroy(struct kvm *kvm); + /* implemented in sigp.c */ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 3db3be139992..c77ad2dc334f 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -719,6 +719,7 @@ static const intercept_handler_t b2_handlers[256] = { [0x10] = handle_set_prefix, [0x11] = handle_store_prefix, [0x12] = handle_store_cpu_address, + [0x14] = kvm_s390_handle_vsie, [0x21] = handle_ipte_interlock, [0x29] = handle_iske, [0x2a] = handle_rrbe, diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c new file mode 100644 index 000000000000..747d4f900155 --- /dev/null +++ b/arch/s390/kvm/vsie.c @@ -0,0 +1,755 @@ +/* + * kvm nested virtualization support for s390x + * + * Copyright IBM Corp. 2016 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (version 2 only) + * as published by the Free Software Foundation. + * + * Author(s): David Hildenbrand + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "kvm-s390.h" +#include "gaccess.h" + +struct vsie_page { + struct kvm_s390_sie_block scb_s; /* 0x0000 */ + /* the pinned originial scb */ + struct kvm_s390_sie_block *scb_o; /* 0x0200 */ + /* the shadow gmap in use by the vsie_page */ + struct gmap *gmap; /* 0x0208 */ + __u8 reserved[0x1000 - 0x0210]; /* 0x0210 */ +} __packed; + +/* trigger a validity icpt for the given scb */ +static int set_validity_icpt(struct kvm_s390_sie_block *scb, + __u16 reason_code) +{ + scb->ipa = 0x1000; + scb->ipb = ((__u32) reason_code) << 16; + scb->icptcode = ICPT_VALIDITY; + return 1; +} + +/* mark the prefix as unmapped, this will block the VSIE */ +static void prefix_unmapped(struct vsie_page *vsie_page) +{ + atomic_or(PROG_REQUEST, &vsie_page->scb_s.prog20); +} + +/* mark the prefix as unmapped and wait until the VSIE has been left */ +static void prefix_unmapped_sync(struct vsie_page *vsie_page) +{ + prefix_unmapped(vsie_page); + if (vsie_page->scb_s.prog0c & PROG_IN_SIE) + atomic_or(CPUSTAT_STOP_INT, &vsie_page->scb_s.cpuflags); + while (vsie_page->scb_s.prog0c & PROG_IN_SIE) + cpu_relax(); +} + +/* mark the prefix as mapped, this will allow the VSIE to run */ +static void prefix_mapped(struct vsie_page *vsie_page) +{ + atomic_andnot(PROG_REQUEST, &vsie_page->scb_s.prog20); +} + + +/* copy the updated intervention request bits into the shadow scb */ +static void update_intervention_requests(struct vsie_page *vsie_page) +{ + const int bits = CPUSTAT_STOP_INT | CPUSTAT_IO_INT | CPUSTAT_EXT_INT; + int cpuflags; + + cpuflags = atomic_read(&vsie_page->scb_o->cpuflags); + atomic_andnot(bits, &vsie_page->scb_s.cpuflags); + atomic_or(cpuflags & bits, &vsie_page->scb_s.cpuflags); +} + +/* shadow (filter and validate) the cpuflags */ +static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + int newflags, cpuflags = atomic_read(&scb_o->cpuflags); + + /* we don't allow ESA/390 guests */ + if (!(cpuflags & CPUSTAT_ZARCH)) + return set_validity_icpt(scb_s, 0x0001U); + + if (cpuflags & (CPUSTAT_RRF | CPUSTAT_MCDS)) + return set_validity_icpt(scb_s, 0x0001U); + else if (cpuflags & (CPUSTAT_SLSV | CPUSTAT_SLSR)) + return set_validity_icpt(scb_s, 0x0007U); + + /* intervention requests will be set later */ + newflags = CPUSTAT_ZARCH; + + atomic_set(&scb_s->cpuflags, newflags); + return 0; +} + +/* unshadow the scb, copying parameters back to the real scb */ +static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + + /* interception */ + scb_o->icptcode = scb_s->icptcode; + scb_o->icptstatus = scb_s->icptstatus; + scb_o->ipa = scb_s->ipa; + scb_o->ipb = scb_s->ipb; + scb_o->gbea = scb_s->gbea; + + /* timer */ + scb_o->cputm = scb_s->cputm; + scb_o->ckc = scb_s->ckc; + scb_o->todpr = scb_s->todpr; + + /* guest state */ + scb_o->gpsw = scb_s->gpsw; + scb_o->gg14 = scb_s->gg14; + scb_o->gg15 = scb_s->gg15; + memcpy(scb_o->gcr, scb_s->gcr, 128); + scb_o->pp = scb_s->pp; + + /* interrupt intercept */ + switch (scb_s->icptcode) { + case ICPT_PROGI: + case ICPT_INSTPROGI: + case ICPT_EXTINT: + memcpy((void *)((u64)scb_o + 0xc0), + (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0); + break; + case ICPT_PARTEXEC: + /* MVPG only */ + memcpy((void *)((u64)scb_o + 0xc0), + (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0); + break; + } + + if (scb_s->ihcpu != 0xffffU) + scb_o->ihcpu = scb_s->ihcpu; +} + +/* + * Setup the shadow scb by copying and checking the relevant parts of the g2 + * provided scb. + * + * Returns: - 0 if the scb has been shadowed + * - > 0 if control has to be given to guest 2 + */ +static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + int rc; + + /* make sure we don't have any leftovers when reusing the scb */ + scb_s->icptcode = 0; + scb_s->eca = 0; + scb_s->ecb = 0; + scb_s->ecb2 = 0; + scb_s->ecb3 = 0; + scb_s->ecd = 0; + + rc = prepare_cpuflags(vcpu, vsie_page); + if (rc) + goto out; + + /* timer */ + scb_s->cputm = scb_o->cputm; + scb_s->ckc = scb_o->ckc; + scb_s->todpr = scb_o->todpr; + scb_s->epoch = scb_o->epoch; + + /* guest state */ + scb_s->gpsw = scb_o->gpsw; + scb_s->gg14 = scb_o->gg14; + scb_s->gg15 = scb_o->gg15; + memcpy(scb_s->gcr, scb_o->gcr, 128); + scb_s->pp = scb_o->pp; + + /* interception / execution handling */ + scb_s->gbea = scb_o->gbea; + scb_s->lctl = scb_o->lctl; + scb_s->svcc = scb_o->svcc; + scb_s->ictl = scb_o->ictl; + /* + * SKEY handling functions can't deal with false setting of PTE invalid + * bits. Therefore we cannot provide interpretation and would later + * have to provide own emulation handlers. + */ + scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; + scb_s->icpua = scb_o->icpua; + + /* SIE will do mso/msl validity and exception checks for us */ + scb_s->msl = scb_o->msl & 0xfffffffffff00000UL; + scb_s->mso = scb_o->mso & 0xfffffffffff00000UL; + scb_s->prefix = scb_o->prefix; + + /* We have to definetly flush the tlb if this scb never ran */ + if (scb_s->ihcpu != 0xffffU) + scb_s->ihcpu = scb_o->ihcpu; + + /* MVPG and Protection Exception Interpretation are always available */ + scb_s->eca |= scb_o->eca & 0x01002000U; + +out: + if (rc) + unshadow_scb(vcpu, vsie_page); + return rc; +} + +void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, + unsigned long end) +{ + struct kvm *kvm = gmap->private; + struct vsie_page *cur; + unsigned long prefix; + struct page *page; + int i; + + if (!gmap_is_shadow(gmap)) + return; + if (start >= 1UL << 31) + /* We are only interested in prefix pages */ + return; + + /* + * Only new shadow blocks are added to the list during runtime, + * therefore we can safely reference them all the time. + */ + for (i = 0; i < kvm->arch.vsie.page_count; i++) { + page = READ_ONCE(kvm->arch.vsie.pages[i]); + if (!page) + continue; + cur = page_to_virt(page); + if (READ_ONCE(cur->gmap) != gmap) + continue; + prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT; + /* with mso/msl, the prefix lies at an offset */ + prefix += cur->scb_s.mso; + if (prefix <= end && start <= prefix + PAGE_SIZE - 1) + prefix_unmapped_sync(cur); + } +} + +/* + * Map the first prefix page. + * + * The prefix will be protected, a gmap notifier will inform about unmaps. + * The shadow scb must not be executed until the prefix is remapped, this is + * guaranteed by properly handling PROG_REQUEST. + * + * Returns: - 0 on if successfully mapped or already mapped + * - > 0 if control has to be given to guest 2 + * - -EAGAIN if the caller can retry immediately + * - -ENOMEM if out of memory + */ +static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT; + int rc; + + /* mark it as mapped so we can catch any concurrent unmappers */ + prefix_mapped(vsie_page); + + /* with mso/msl, the prefix lies at offset *mso* */ + prefix += scb_s->mso; + + rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix); + /* + * We don't have to mprotect, we will be called for all unshadows. + * SIE will detect if protection applies and trigger a validity. + */ + if (rc) + prefix_unmapped(vsie_page); + if (rc > 0 || rc == -EFAULT) + rc = set_validity_icpt(scb_s, 0x0037U); + return rc; +} + +/* + * Pin the guest page given by gpa and set hpa to the pinned host address. + * Will always be pinned writable. + * + * Returns: - 0 on success + * - -EINVAL if the gpa is not valid guest storage + * - -ENOMEM if out of memory + */ +static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa) +{ + struct page *page; + hva_t hva; + int rc; + + hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); + if (kvm_is_error_hva(hva)) + return -EINVAL; + rc = get_user_pages_fast(hva, 1, 1, &page); + if (rc < 0) + return rc; + else if (rc != 1) + return -ENOMEM; + *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK); + return 0; +} + +/* Unpins a page previously pinned via pin_guest_page, marking it as dirty. */ +static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa) +{ + struct page *page; + + page = virt_to_page(hpa); + set_page_dirty_lock(page); + put_page(page); + /* mark the page always as dirty for migration */ + mark_page_dirty(kvm, gpa_to_gfn(gpa)); +} + +/* unpin all blocks previously pinned by pin_blocks(), marking them dirty */ +static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + hpa_t hpa; + gpa_t gpa; + + hpa = (u64) scb_s->scaoh << 32 | scb_s->scaol; + if (hpa) { + gpa = scb_o->scaol & ~0xfUL; + unpin_guest_page(vcpu->kvm, gpa, hpa); + scb_s->scaol = 0; + scb_s->scaoh = 0; + } +} + +/* + * Instead of shadowing some blocks, we can simply forward them because the + * addresses in the scb are 64 bit long. + * + * This works as long as the data lies in one page. If blocks ever exceed one + * page, we have to fall back to shadowing. + * + * As we reuse the sca, the vcpu pointers contained in it are invalid. We must + * therefore not enable any facilities that access these pointers (e.g. SIGPIF). + * + * Returns: - 0 if all blocks were pinned. + * - > 0 if control has to be given to guest 2 + * - -ENOMEM if out of memory + */ +static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + hpa_t hpa; + gpa_t gpa; + int rc = 0; + + gpa = scb_o->scaol & ~0xfUL; + if (gpa) { + if (!(gpa & ~0x1fffUL)) + rc = set_validity_icpt(scb_s, 0x0038U); + else if ((gpa & ~0x1fffUL) == kvm_s390_get_prefix(vcpu)) + rc = set_validity_icpt(scb_s, 0x0011U); + else if ((gpa & PAGE_MASK) != + ((gpa + sizeof(struct bsca_block) - 1) & PAGE_MASK)) + rc = set_validity_icpt(scb_s, 0x003bU); + if (!rc) { + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) + rc = set_validity_icpt(scb_s, 0x0034U); + } + if (rc) + goto unpin; + scb_s->scaoh = (u32)((u64)hpa >> 32); + scb_s->scaol = (u32)(u64)hpa; + } + return 0; +unpin: + unpin_blocks(vcpu, vsie_page); + return rc; +} + +/* unpin the scb provided by guest 2, marking it as dirty */ +static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, + gpa_t gpa) +{ + hpa_t hpa = (hpa_t) vsie_page->scb_o; + + if (hpa) + unpin_guest_page(vcpu->kvm, gpa, hpa); + vsie_page->scb_o = NULL; +} + +/* + * Pin the scb at gpa provided by guest 2 at vsie_page->scb_o. + * + * Returns: - 0 if the scb was pinned. + * - > 0 if control has to be given to guest 2 + * - -ENOMEM if out of memory + */ +static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, + gpa_t gpa) +{ + hpa_t hpa; + int rc; + + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) { + rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + if (!rc) + rc = 1; + } + if (!rc) + vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa; + return rc; +} + +/* + * Inject a fault into guest 2. + * + * Returns: - > 0 if control has to be given to guest 2 + * < 0 if an error occurred during injection. + */ +static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr, + bool write_flag) +{ + struct kvm_s390_pgm_info pgm = { + .code = code, + .trans_exc_code = + /* 0-51: virtual address */ + (vaddr & 0xfffffffffffff000UL) | + /* 52-53: store / fetch */ + (((unsigned int) !write_flag) + 1) << 10, + /* 62-63: asce id (alway primary == 0) */ + .exc_access_id = 0, /* always primary */ + .op_access_id = 0, /* not MVPG */ + }; + int rc; + + if (code == PGM_PROTECTION) + pgm.trans_exc_code |= 0x4UL; + + rc = kvm_s390_inject_prog_irq(vcpu, &pgm); + return rc ? rc : 1; +} + +/* + * Handle a fault during vsie execution on a gmap shadow. + * + * Returns: - 0 if the fault was resolved + * - > 0 if control has to be given to guest 2 + * - < 0 if an error occurred + */ +static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + int rc; + + if (current->thread.gmap_int_code == PGM_PROTECTION) + /* we can directly forward all protection exceptions */ + return inject_fault(vcpu, PGM_PROTECTION, + current->thread.gmap_addr, 1); + + rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, + current->thread.gmap_addr); + if (rc > 0) { + rc = inject_fault(vcpu, rc, + current->thread.gmap_addr, + current->thread.gmap_write_flag); + } + return rc; +} + +static inline void clear_vsie_icpt(struct vsie_page *vsie_page) +{ + vsie_page->scb_s.icptcode = 0; +} + +/* + * Run the vsie on a shadow scb and a shadow gmap, without any further + * sanity checks, handling SIE faults. + * + * Returns: - 0 everything went fine + * - > 0 if control has to be given to guest 2 + * - < 0 if an error occurred + */ +static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + int rc; + + if (need_resched()) + schedule(); + if (test_cpu_flag(CIF_MCCK_PENDING)) + s390_handle_mcck(); + + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + local_irq_disable(); + kvm_guest_enter(); + local_irq_enable(); + + rc = sie64a(scb_s, vcpu->run->s.regs.gprs); + + local_irq_disable(); + kvm_guest_exit(); + local_irq_enable(); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + + if (rc > 0) + rc = 0; /* we could still have an icpt */ + else if (rc == -EFAULT) + return handle_fault(vcpu, vsie_page); + + switch (scb_s->icptcode) { + case ICPT_STOP: + /* stop not requested by g2 - must have been a kick */ + if (!(atomic_read(&scb_o->cpuflags) & CPUSTAT_STOP_INT)) + clear_vsie_icpt(vsie_page); + break; + case ICPT_VALIDITY: + if ((scb_s->ipa & 0xf000) != 0xf000) + scb_s->ipa += 0x1000; + break; + } + return rc; +} + +static void release_gmap_shadow(struct vsie_page *vsie_page) +{ + if (vsie_page->gmap) + gmap_put(vsie_page->gmap); + WRITE_ONCE(vsie_page->gmap, NULL); +} + +static int acquire_gmap_shadow(struct kvm_vcpu *vcpu, + struct vsie_page *vsie_page) +{ + unsigned long asce; + union ctlreg0 cr0; + struct gmap *gmap; + int edat; + + asce = vcpu->arch.sie_block->gcr[1]; + cr0.val = vcpu->arch.sie_block->gcr[0]; + edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8); + edat += edat && test_kvm_facility(vcpu->kvm, 78); + + gmap = gmap_shadow(vcpu->arch.gmap, asce, edat); + if (IS_ERR(gmap)) + return PTR_ERR(gmap); + gmap->private = vcpu->kvm; + WRITE_ONCE(vsie_page->gmap, gmap); + return 0; +} + +/* + * Run the vsie on a shadowed scb, managing the gmap shadow, handling + * prefix pages and faults. + * + * Returns: - 0 if no errors occurred + * - > 0 if control has to be given to guest 2 + * - -ENOMEM if out of memory + */ +static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + int rc = 0; + + while (1) { + rc = acquire_gmap_shadow(vcpu, vsie_page); + if (!rc) + rc = map_prefix(vcpu, vsie_page); + if (!rc) { + gmap_enable(vsie_page->gmap); + update_intervention_requests(vsie_page); + rc = do_vsie_run(vcpu, vsie_page); + gmap_enable(vcpu->arch.gmap); + } + release_gmap_shadow(vsie_page); + + if (rc == -EAGAIN) + rc = 0; + if (rc || scb_s->icptcode || signal_pending(current) || + kvm_s390_vcpu_has_irq(vcpu, 0)) + break; + }; + + if (rc == -EFAULT) { + /* + * Addressing exceptions are always presentes as intercepts. + * As addressing exceptions are suppressing and our guest 3 PSW + * points at the responsible instruction, we have to + * forward the PSW and set the ilc. If we can't read guest 3 + * instruction, we can use an arbitrary ilc. Let's always use + * ilen = 4 for now, so we can avoid reading in guest 3 virtual + * memory. (we could also fake the shadow so the hardware + * handles it). + */ + scb_s->icptcode = ICPT_PROGI; + scb_s->iprcc = PGM_ADDRESSING; + scb_s->pgmilc = 4; + scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4); + } + return rc; +} + +/* + * Get or create a vsie page for a scb address. + * + * Returns: - address of a vsie page (cached or new one) + * - NULL if the same scb address is already used by another VCPU + * - ERR_PTR(-ENOMEM) if out of memory + */ +static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) +{ + struct vsie_page *vsie_page; + struct page *page; + int nr_vcpus; + + rcu_read_lock(); + page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9); + rcu_read_unlock(); + if (page) { + if (page_ref_inc_return(page) == 2) + return page_to_virt(page); + page_ref_dec(page); + } + + /* + * We want at least #online_vcpus shadows, so every VCPU can execute + * the VSIE in parallel. + */ + nr_vcpus = atomic_read(&kvm->online_vcpus); + + mutex_lock(&kvm->arch.vsie.mutex); + if (kvm->arch.vsie.page_count < nr_vcpus) { + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) { + mutex_unlock(&kvm->arch.vsie.mutex); + return ERR_PTR(-ENOMEM); + } + page_ref_inc(page); + kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = page; + kvm->arch.vsie.page_count++; + } else { + /* reuse an existing entry that belongs to nobody */ + while (true) { + page = kvm->arch.vsie.pages[kvm->arch.vsie.next]; + if (page_ref_inc_return(page) == 2) + break; + page_ref_dec(page); + kvm->arch.vsie.next++; + kvm->arch.vsie.next %= nr_vcpus; + } + radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9); + } + page->index = addr; + /* double use of the same address */ + if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) { + page_ref_dec(page); + mutex_unlock(&kvm->arch.vsie.mutex); + return NULL; + } + mutex_unlock(&kvm->arch.vsie.mutex); + + vsie_page = page_to_virt(page); + memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block)); + vsie_page->scb_s.ihcpu = 0xffffU; + return vsie_page; +} + +/* put a vsie page acquired via get_vsie_page */ +static void put_vsie_page(struct kvm *kvm, struct vsie_page *vsie_page) +{ + struct page *page = pfn_to_page(__pa(vsie_page) >> PAGE_SHIFT); + + page_ref_dec(page); +} + +int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu) +{ + struct vsie_page *vsie_page; + unsigned long scb_addr; + int rc; + + vcpu->stat.instruction_sie++; + if (!test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIEF2)) + return -EOPNOTSUPP; + if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) + return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); + + BUILD_BUG_ON(sizeof(struct vsie_page) != 4096); + scb_addr = kvm_s390_get_base_disp_s(vcpu, NULL); + + /* 512 byte alignment */ + if (unlikely(scb_addr & 0x1ffUL)) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0)) + return 0; + + vsie_page = get_vsie_page(vcpu->kvm, scb_addr); + if (IS_ERR(vsie_page)) + return PTR_ERR(vsie_page); + else if (!vsie_page) + /* double use of sie control block - simply do nothing */ + return 0; + + rc = pin_scb(vcpu, vsie_page, scb_addr); + if (rc) + goto out_put; + rc = shadow_scb(vcpu, vsie_page); + if (rc) + goto out_unpin_scb; + rc = pin_blocks(vcpu, vsie_page); + if (rc) + goto out_unshadow; + rc = vsie_run(vcpu, vsie_page); + unpin_blocks(vcpu, vsie_page); +out_unshadow: + unshadow_scb(vcpu, vsie_page); +out_unpin_scb: + unpin_scb(vcpu, vsie_page, scb_addr); +out_put: + put_vsie_page(vcpu->kvm, vsie_page); + + return rc < 0 ? rc : 0; +} + +/* Init the vsie data structures. To be called when a vm is initialized. */ +void kvm_s390_vsie_init(struct kvm *kvm) +{ + mutex_init(&kvm->arch.vsie.mutex); + INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL); +} + +/* Destroy the vsie data structures. To be called when a vm is destroyed. */ +void kvm_s390_vsie_destroy(struct kvm *kvm) +{ + struct page *page; + int i; + + mutex_lock(&kvm->arch.vsie.mutex); + for (i = 0; i < kvm->arch.vsie.page_count; i++) { + page = kvm->arch.vsie.pages[i]; + kvm->arch.vsie.pages[i] = NULL; + /* free the radix tree entry */ + radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9); + __free_page(page); + } + kvm->arch.vsie.page_count = 0; + mutex_unlock(&kvm->arch.vsie.mutex); +} From 06d68a6c85d95515533663ff002d06753fd772aa Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 22 Apr 2016 13:50:09 +0200 Subject: [PATCH 144/302] KVM: s390: vsie: optimize gmap prefix mapping In order to not always map the prefix, we have to take care of certain aspects that implicitly unmap the prefix: - Changes to the prefix address - Changes to MSO, because the HVA of the prefix is changed - Changes of the gmap shadow (e.g. unshadowed, asce or edat changes) By properly handling these cases, we can stop remapping the prefix when there is no reason to do so. This also allows us now to not acquire any gmap shadow locks when rerunning the vsie and still having a valid gmap shadow. Please note, to detect changing gmap shadows, we have to keep the reference of the gmap shadow. The address of a gmap shadow does otherwise not reliably indicate if the gmap shadow has changed (the memory chunk could get reused). Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/vsie.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 747d4f900155..2839efcfc5ff 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -62,6 +62,11 @@ static void prefix_mapped(struct vsie_page *vsie_page) atomic_andnot(PROG_REQUEST, &vsie_page->scb_s.prog20); } +/* test if the prefix is mapped into the gmap shadow */ +static int prefix_is_mapped(struct vsie_page *vsie_page) +{ + return !(atomic_read(&vsie_page->scb_s.prog20) & PROG_REQUEST); +} /* copy the updated intervention request bits into the shadow scb */ static void update_intervention_requests(struct vsie_page *vsie_page) @@ -152,6 +157,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + unsigned long new_mso; int rc; /* make sure we don't have any leftovers when reusing the scb */ @@ -192,9 +198,13 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; scb_s->icpua = scb_o->icpua; + new_mso = scb_o->mso & 0xfffffffffff00000UL; + /* if the hva of the prefix changes, we have to remap the prefix */ + if (scb_s->mso != new_mso || scb_s->prefix != scb_o->prefix) + prefix_unmapped(vsie_page); /* SIE will do mso/msl validity and exception checks for us */ scb_s->msl = scb_o->msl & 0xfffffffffff00000UL; - scb_s->mso = scb_o->mso & 0xfffffffffff00000UL; + scb_s->mso = new_mso; scb_s->prefix = scb_o->prefix; /* We have to definetly flush the tlb if this scb never ran */ @@ -262,6 +272,9 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT; int rc; + if (prefix_is_mapped(vsie_page)) + return 0; + /* mark it as mapped so we can catch any concurrent unmappers */ prefix_mapped(vsie_page); @@ -532,6 +545,7 @@ static void release_gmap_shadow(struct vsie_page *vsie_page) if (vsie_page->gmap) gmap_put(vsie_page->gmap); WRITE_ONCE(vsie_page->gmap, NULL); + prefix_unmapped(vsie_page); } static int acquire_gmap_shadow(struct kvm_vcpu *vcpu, @@ -547,6 +561,16 @@ static int acquire_gmap_shadow(struct kvm_vcpu *vcpu, edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8); edat += edat && test_kvm_facility(vcpu->kvm, 78); + /* + * ASCE or EDAT could have changed since last icpt, or the gmap + * we're holding has been unshadowed. If the gmap is still valid, + * we can safely reuse it. + */ + if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) + return 0; + + /* release the old shadow - if any, and mark the prefix as unmapped */ + release_gmap_shadow(vsie_page); gmap = gmap_shadow(vcpu->arch.gmap, asce, edat); if (IS_ERR(gmap)) return PTR_ERR(gmap); @@ -578,7 +602,6 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) rc = do_vsie_run(vcpu, vsie_page); gmap_enable(vcpu->arch.gmap); } - release_gmap_shadow(vsie_page); if (rc == -EAGAIN) rc = 0; @@ -667,6 +690,7 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) vsie_page = page_to_virt(page); memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block)); + release_gmap_shadow(vsie_page); vsie_page->scb_s.ihcpu = 0xffffU; return vsie_page; } @@ -739,6 +763,7 @@ void kvm_s390_vsie_init(struct kvm *kvm) /* Destroy the vsie data structures. To be called when a vm is destroyed. */ void kvm_s390_vsie_destroy(struct kvm *kvm) { + struct vsie_page *vsie_page; struct page *page; int i; @@ -746,6 +771,8 @@ void kvm_s390_vsie_destroy(struct kvm *kvm) for (i = 0; i < kvm->arch.vsie.page_count; i++) { page = kvm->arch.vsie.pages[i]; kvm->arch.vsie.pages[i] = NULL; + vsie_page = page_to_virt(page); + release_gmap_shadow(vsie_page); /* free the radix tree entry */ radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9); __free_page(page); From 3573602b20b061030c34b04f206b781857f155df Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 19 Feb 2016 10:11:24 +0100 Subject: [PATCH 145/302] KVM: s390: vsie: support setting the ibc As soon as we forward an ibc to guest 2 (indicated via kvm->arch.model.ibc), he can also use it for guest 3. Let's properly round the ibc up/down, so we avoid any potential validity icpts from the underlying SIE, if it doesn't simply round the values. Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/vsie.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 2839efcfc5ff..1165baf78535 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -102,6 +102,26 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) return 0; } +/* shadow (round up/down) the ibc to avoid validity icpt */ +static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + __u64 min_ibc = (sclp.ibc >> 16) & 0x0fffU; + + scb_s->ibc = 0; + /* ibc installed in g2 and requested for g3 */ + if (vcpu->kvm->arch.model.ibc && (scb_o->ibc & 0x0fffU)) { + scb_s->ibc = scb_o->ibc & 0x0fffU; + /* takte care of the minimum ibc level of the machine */ + if (scb_s->ibc < min_ibc) + scb_s->ibc = min_ibc; + /* take care of the maximum ibc level set for the guest */ + if (scb_s->ibc > vcpu->kvm->arch.model.ibc) + scb_s->ibc = vcpu->kvm->arch.model.ibc; + } +} + /* unshadow the scb, copying parameters back to the real scb */ static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { @@ -214,6 +234,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* MVPG and Protection Exception Interpretation are always available */ scb_s->eca |= scb_o->eca & 0x01002000U; + prepare_ibc(vcpu, vsie_page); out: if (rc) unshadow_scb(vcpu, vsie_page); From 535ef81c6e7910c0205f58a69ed6c765f8ba7f18 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 12 Feb 2016 12:24:20 +0100 Subject: [PATCH 146/302] KVM: s390: vsie: support edat1 / edat2 If guest 2 is allowed to use edat 1 / edat 2, it can also set it up for guest 3, so let's properly check and forward the edat cpuflags. Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/vsie.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 1165baf78535..7c9835b0a33f 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -97,6 +97,13 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* intervention requests will be set later */ newflags = CPUSTAT_ZARCH; + if (cpuflags & CPUSTAT_GED && test_kvm_facility(vcpu->kvm, 8)) + newflags |= CPUSTAT_GED; + if (cpuflags & CPUSTAT_GED2 && test_kvm_facility(vcpu->kvm, 78)) { + if (cpuflags & CPUSTAT_GED) + return set_validity_icpt(scb_s, 0x0001U); + newflags |= CPUSTAT_GED2; + } atomic_set(&scb_s->cpuflags, newflags); return 0; From 4ceafa9027b0c2671ab731c7d95896a5b3c2dc0b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 27 Nov 2015 12:34:28 +0100 Subject: [PATCH 147/302] KVM: s390: vsie: support host-protection-interruption Introduced with ESOP, therefore available for the guest if it is allowed to use ESOP. Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/vsie.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 7c9835b0a33f..aaed63ce29b2 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -240,6 +240,9 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* MVPG and Protection Exception Interpretation are always available */ scb_s->eca |= scb_o->eca & 0x01002000U; + /* Host-protection-interruption introduced with ESOP */ + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP)) + scb_s->ecb |= scb_o->ecb & 0x02U; prepare_ibc(vcpu, vsie_page); out: From 66b630d5b7f2d3afb5e8eddad3e8326091375f1a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 26 Nov 2015 14:11:19 +0100 Subject: [PATCH 148/302] KVM: s390: vsie: support STFLE interpretation Issuing STFLE is extremely rare. Instead of copying 2k on every VSIE call, let's do this lazily, when a guest 3 tries to execute STFLE. We can setup the block and retry. Unfortunately, we can't directly forward that facility list, as we only have a 31 bit address for the facility list designation. So let's use a DMA allocation for our vsie_page instead for now. Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/vsie.c | 49 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index aaed63ce29b2..cd4bbfa72881 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "kvm-s390.h" #include "gaccess.h" @@ -27,7 +28,8 @@ struct vsie_page { struct kvm_s390_sie_block *scb_o; /* 0x0200 */ /* the shadow gmap in use by the vsie_page */ struct gmap *gmap; /* 0x0208 */ - __u8 reserved[0x1000 - 0x0210]; /* 0x0210 */ + __u8 reserved[0x0800 - 0x0210]; /* 0x0210 */ + __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */ } __packed; /* trigger a validity icpt for the given scb */ @@ -194,6 +196,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->ecb2 = 0; scb_s->ecb3 = 0; scb_s->ecd = 0; + scb_s->fac = 0; rc = prepare_cpuflags(vcpu, vsie_page); if (rc) @@ -521,6 +524,44 @@ static inline void clear_vsie_icpt(struct vsie_page *vsie_page) vsie_page->scb_s.icptcode = 0; } +/* rewind the psw and clear the vsie icpt, so we can retry execution */ +static void retry_vsie_icpt(struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + int ilen = insn_length(scb_s->ipa >> 8); + + /* take care of EXECUTE instructions */ + if (scb_s->icptstatus & 1) { + ilen = (scb_s->icptstatus >> 4) & 0x6; + if (!ilen) + ilen = 4; + } + scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, ilen); + clear_vsie_icpt(vsie_page); +} + +/* + * Try to shadow + enable the guest 2 provided facility list. + * Retry instruction execution if enabled for and provided by guest 2. + * + * Returns: - 0 if handled (retry or guest 2 icpt) + * - > 0 if control has to be given to guest 2 + */ +static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + __u32 fac = vsie_page->scb_o->fac & 0x7ffffff8U; + + if (fac && test_kvm_facility(vcpu->kvm, 7)) { + retry_vsie_icpt(vsie_page); + if (read_guest_real(vcpu, fac, &vsie_page->fac, + sizeof(vsie_page->fac))) + return set_validity_icpt(scb_s, 0x1090U); + scb_s->fac = (__u32)(__u64) &vsie_page->fac; + } + return 0; +} + /* * Run the vsie on a shadow scb and a shadow gmap, without any further * sanity checks, handling SIE faults. @@ -558,6 +599,10 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) return handle_fault(vcpu, vsie_page); switch (scb_s->icptcode) { + case ICPT_INST: + if (scb_s->ipa == 0xb2b0) + rc = handle_stfle(vcpu, vsie_page); + break; case ICPT_STOP: /* stop not requested by g2 - must have been a kick */ if (!(atomic_read(&scb_o->cpuflags) & CPUSTAT_STOP_INT)) @@ -690,7 +735,7 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) mutex_lock(&kvm->arch.vsie.mutex); if (kvm->arch.vsie.page_count < nr_vcpus) { - page = alloc_page(GFP_KERNEL | __GFP_ZERO); + page = alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA); if (!page) { mutex_unlock(&kvm->arch.vsie.mutex); return ERR_PTR(-ENOMEM); From bbeaa58b32ab627b68748543b3dcb98b9a28d570 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 26 Nov 2015 13:11:42 +0100 Subject: [PATCH 149/302] KVM: s390: vsie: support aes dea wrapping keys As soon as message-security-assist extension 3 is enabled for guest 2, we have to allow key wrapping for guest 3. Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/vsie.c | 56 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index cd4bbfa72881..6b26b0be63c1 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -28,7 +28,8 @@ struct vsie_page { struct kvm_s390_sie_block *scb_o; /* 0x0200 */ /* the shadow gmap in use by the vsie_page */ struct gmap *gmap; /* 0x0208 */ - __u8 reserved[0x0800 - 0x0210]; /* 0x0210 */ + __u8 reserved[0x0700 - 0x0210]; /* 0x0210 */ + struct kvm_s390_crypto_cb crycb; /* 0x0700 */ __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */ } __packed; @@ -111,6 +112,58 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) return 0; } +/* + * Create a shadow copy of the crycb block and setup key wrapping, if + * requested for guest 3 and enabled for guest 2. + * + * We only accept format-1 (no AP in g2), but convert it into format-2 + * There is nothing to do for format-0. + * + * Returns: - 0 if shadowed or nothing to do + * - > 0 if control has to be given to guest 2 + */ +static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + u32 crycb_addr = scb_o->crycbd & 0x7ffffff8U; + unsigned long *b1, *b2; + u8 ecb3_flags; + + scb_s->crycbd = 0; + if (!(scb_o->crycbd & vcpu->arch.sie_block->crycbd & CRYCB_FORMAT1)) + return 0; + /* format-1 is supported with message-security-assist extension 3 */ + if (!test_kvm_facility(vcpu->kvm, 76)) + return 0; + /* we may only allow it if enabled for guest 2 */ + ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 & + (ECB3_AES | ECB3_DEA); + if (!ecb3_flags) + return 0; + + if ((crycb_addr & PAGE_MASK) != ((crycb_addr + 128) & PAGE_MASK)) + return set_validity_icpt(scb_s, 0x003CU); + else if (!crycb_addr) + return set_validity_icpt(scb_s, 0x0039U); + + /* copy only the wrapping keys */ + if (read_guest_real(vcpu, crycb_addr + 72, &vsie_page->crycb, 56)) + return set_validity_icpt(scb_s, 0x0035U); + + scb_s->ecb3 |= ecb3_flags; + scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT1 | + CRYCB_FORMAT2; + + /* xor both blocks in one run */ + b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask; + b2 = (unsigned long *) + vcpu->kvm->arch.crypto.crycb->dea_wrapping_key_mask; + /* as 56%8 == 0, bitmap_xor won't overwrite any data */ + bitmap_xor(b1, b1, b2, BITS_PER_BYTE * 56); + return 0; +} + /* shadow (round up/down) the ibc to avoid validity icpt */ static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { @@ -248,6 +301,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->ecb |= scb_o->ecb & 0x02U; prepare_ibc(vcpu, vsie_page); + rc = shadow_crycb(vcpu, vsie_page); out: if (rc) unshadow_scb(vcpu, vsie_page); From 166ecb3d3cfecb62c31fdeab9949d70e84cd75cd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 25 Nov 2015 11:13:32 +0100 Subject: [PATCH 150/302] KVM: s390: vsie: support transactional execution As soon as guest 2 is allowed to use transactional execution (indicated via STFLE), he can also enable it for guest 3. Active transactional execution requires also the second prefix page to be mapped. If that page cannot be mapped, a validity icpt has to be presented to the guest. We have to take care of tx being toggled on/off, otherwise we might get wrong prefix validity icpt. Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/vsie.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 6b26b0be63c1..4e2c71cced48 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -239,6 +239,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + bool had_tx = scb_s->ecb & 0x10U; unsigned long new_mso; int rc; @@ -299,6 +300,13 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* Host-protection-interruption introduced with ESOP */ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP)) scb_s->ecb |= scb_o->ecb & 0x02U; + /* transactional execution */ + if (test_kvm_facility(vcpu->kvm, 73)) { + /* remap the prefix is tx is toggled on */ + if ((scb_o->ecb & 0x10U) && !had_tx) + prefix_unmapped(vsie_page); + scb_s->ecb |= scb_o->ecb & 0x10U; + } prepare_ibc(vcpu, vsie_page); rc = shadow_crycb(vcpu, vsie_page); @@ -337,13 +345,13 @@ void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT; /* with mso/msl, the prefix lies at an offset */ prefix += cur->scb_s.mso; - if (prefix <= end && start <= prefix + PAGE_SIZE - 1) + if (prefix <= end && start <= prefix + 2 * PAGE_SIZE - 1) prefix_unmapped_sync(cur); } } /* - * Map the first prefix page. + * Map the first prefix page and if tx is enabled also the second prefix page. * * The prefix will be protected, a gmap notifier will inform about unmaps. * The shadow scb must not be executed until the prefix is remapped, this is @@ -370,6 +378,9 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) prefix += scb_s->mso; rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix); + if (!rc && (scb_s->ecb & 0x10U)) + rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, + prefix + PAGE_SIZE); /* * We don't have to mprotect, we will be called for all unshadows. * SIE will detect if protection applies and trigger a validity. @@ -434,6 +445,13 @@ static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->scaol = 0; scb_s->scaoh = 0; } + + hpa = scb_s->itdba; + if (hpa) { + gpa = scb_o->itdba & ~0xffUL; + unpin_guest_page(vcpu->kvm, gpa, hpa); + scb_s->itdba = 0; + } } /* @@ -477,6 +495,21 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->scaoh = (u32)((u64)hpa >> 32); scb_s->scaol = (u32)(u64)hpa; } + + gpa = scb_o->itdba & ~0xffUL; + if (gpa && (scb_s->ecb & 0x10U)) { + if (!(gpa & ~0x1fffU)) { + rc = set_validity_icpt(scb_s, 0x0080U); + goto unpin; + } + /* 256 bytes cannot cross page boundaries */ + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) + rc = set_validity_icpt(scb_s, 0x0080U); + if (rc) + goto unpin; + scb_s->itdba = hpa; + } return 0; unpin: unpin_blocks(vcpu, vsie_page); From c9bc1eabe5ee49f1be68550cc0bd907b55d9da8d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 25 Nov 2015 11:08:32 +0100 Subject: [PATCH 151/302] KVM: s390: vsie: support vectory facility (SIMD) As soon as guest 2 is allowed to use the vector facility (indicated via STFLE), it can also enable it for guest 3. We have to take care of the sattellite block that might be used when not relying on lazy vector copying (not the case for KVM). Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 2 +- arch/s390/kvm/vsie.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 255609c86901..190ad63291fb 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -229,7 +229,7 @@ struct kvm_s390_sie_block { __u8 reserved1e6[2]; /* 0x01e6 */ __u64 itdba; /* 0x01e8 */ __u64 riccbd; /* 0x01f0 */ - __u8 reserved1f8[8]; /* 0x01f8 */ + __u64 gvrd; /* 0x01f8 */ } __attribute__((packed)); struct kvm_s390_itdb { diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 4e2c71cced48..6d9f4058ce15 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -307,6 +307,11 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) prefix_unmapped(vsie_page); scb_s->ecb |= scb_o->ecb & 0x10U; } + /* SIMD */ + if (test_kvm_facility(vcpu->kvm, 129)) { + scb_s->eca |= scb_o->eca & 0x00020000U; + scb_s->ecd |= scb_o->ecd & 0x20000000U; + } prepare_ibc(vcpu, vsie_page); rc = shadow_crycb(vcpu, vsie_page); @@ -452,6 +457,13 @@ static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) unpin_guest_page(vcpu->kvm, gpa, hpa); scb_s->itdba = 0; } + + hpa = scb_s->gvrd; + if (hpa) { + gpa = scb_o->gvrd & ~0x1ffUL; + unpin_guest_page(vcpu->kvm, gpa, hpa); + scb_s->gvrd = 0; + } } /* @@ -510,6 +522,25 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) goto unpin; scb_s->itdba = hpa; } + + gpa = scb_o->gvrd & ~0x1ffUL; + if (gpa && (scb_s->eca & 0x00020000U) && + !(scb_s->ecd & 0x20000000U)) { + if (!(gpa & ~0x1fffUL)) { + rc = set_validity_icpt(scb_s, 0x1310U); + goto unpin; + } + /* + * 512 bytes vector registers cannot cross page boundaries + * if this block gets bigger, we have to shadow it. + */ + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) + rc = set_validity_icpt(scb_s, 0x1310U); + if (rc) + goto unpin; + scb_s->gvrd = hpa; + } return 0; unpin: unpin_blocks(vcpu, vsie_page); From 588438cba015ff3d14504b7598308dd3ebe06a99 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 26 Jan 2016 12:51:06 +0100 Subject: [PATCH 152/302] KVM: s390: vsie: support run-time-instrumentation As soon as guest 2 is allowed to use run-time-instrumentation (indicated via via STFLE), it can also enable it for guest 3. Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/vsie.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 6d9f4058ce15..ebc988ffd3e5 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -312,6 +312,9 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->eca |= scb_o->eca & 0x00020000U; scb_s->ecd |= scb_o->ecd & 0x20000000U; } + /* Run-time-Instrumentation */ + if (test_kvm_facility(vcpu->kvm, 64)) + scb_s->ecb3 |= scb_o->ecb3 & 0x01U; prepare_ibc(vcpu, vsie_page); rc = shadow_crycb(vcpu, vsie_page); @@ -464,6 +467,13 @@ static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) unpin_guest_page(vcpu->kvm, gpa, hpa); scb_s->gvrd = 0; } + + hpa = scb_s->riccbd; + if (hpa) { + gpa = scb_o->riccbd & ~0x3fUL; + unpin_guest_page(vcpu->kvm, gpa, hpa); + scb_s->riccbd = 0; + } } /* @@ -541,6 +551,22 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) goto unpin; scb_s->gvrd = hpa; } + + gpa = scb_o->riccbd & ~0x3fUL; + if (gpa && (scb_s->ecb3 & 0x01U)) { + if (!(gpa & ~0x1fffUL)) { + rc = set_validity_icpt(scb_s, 0x0043U); + goto unpin; + } + /* 64 bytes cannot cross page boundaries */ + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) + rc = set_validity_icpt(scb_s, 0x0043U); + /* Validity 0x0044 will be checked by SIE */ + if (rc) + goto unpin; + scb_s->gvrd = hpa; + } return 0; unpin: unpin_blocks(vcpu, vsie_page); From 19c439b564b05939b83876a687bd48389d0aebb5 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 25 Nov 2015 11:02:26 +0100 Subject: [PATCH 153/302] KVM: s390: vsie: support 64-bit-SCAO Let's provide the 64-bit-SCAO facility to guest 2, so he can set up a SCA for guest 3 that has a 64 bit address. Please note that we already require the 64 bit SCAO for our vsie implementation, in order to forward the SCA directly (by pinning the page). Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/uapi/asm/kvm.h | 1 + arch/s390/kvm/kvm-s390.c | 2 ++ arch/s390/kvm/vsie.c | 4 ++++ 3 files changed, 7 insertions(+) diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 62423b1931c0..c5e4537e96d9 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -99,6 +99,7 @@ struct kvm_s390_vm_cpu_machine { #define KVM_S390_VM_CPU_FEAT_NR_BITS 1024 #define KVM_S390_VM_CPU_FEAT_ESOP 0 #define KVM_S390_VM_CPU_FEAT_SIEF2 1 +#define KVM_S390_VM_CPU_FEAT_64BSCAO 2 struct kvm_s390_vm_cpu_feat { __u64 feat[16]; }; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 3fb124226e97..e0c5a57bf58b 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -265,6 +265,8 @@ static void kvm_s390_cpu_feat_init(void) !test_facility(3)) return; allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2); + if (sclp.has_64bscao) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_64BSCAO); } int kvm_arch_init(void *opaque) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index ebc988ffd3e5..44e66c329026 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -449,6 +449,8 @@ static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) hpa = (u64) scb_s->scaoh << 32 | scb_s->scaol; if (hpa) { gpa = scb_o->scaol & ~0xfUL; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO)) + gpa |= (u64) scb_o->scaoh << 32; unpin_guest_page(vcpu->kvm, gpa, hpa); scb_s->scaol = 0; scb_s->scaoh = 0; @@ -499,6 +501,8 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) int rc = 0; gpa = scb_o->scaol & ~0xfUL; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO)) + gpa |= (u64) scb_o->scaoh << 32; if (gpa) { if (!(gpa & ~0x1fffUL)) rc = set_validity_icpt(scb_s, 0x0038U); From 0615a326e066b580cf26d16a092ea54997dd6cbb Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 25 Nov 2015 09:59:49 +0100 Subject: [PATCH 154/302] KVM: s390: vsie: support shared IPTE-interlock facility As we forward the whole SCA provided by guest 2, we can directly forward SIIF if available. Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/uapi/asm/kvm.h | 1 + arch/s390/kvm/kvm-s390.c | 2 ++ arch/s390/kvm/vsie.c | 2 ++ 3 files changed, 5 insertions(+) diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index c5e4537e96d9..1d2e820f763d 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -100,6 +100,7 @@ struct kvm_s390_vm_cpu_machine { #define KVM_S390_VM_CPU_FEAT_ESOP 0 #define KVM_S390_VM_CPU_FEAT_SIEF2 1 #define KVM_S390_VM_CPU_FEAT_64BSCAO 2 +#define KVM_S390_VM_CPU_FEAT_SIIF 3 struct kvm_s390_vm_cpu_feat { __u64 feat[16]; }; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index e0c5a57bf58b..d735612f9081 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -267,6 +267,8 @@ static void kvm_s390_cpu_feat_init(void) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2); if (sclp.has_64bscao) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_64BSCAO); + if (sclp.has_siif) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIIF); } int kvm_arch_init(void *opaque) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 44e66c329026..1615ed37f7da 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -315,6 +315,8 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* Run-time-Instrumentation */ if (test_kvm_facility(vcpu->kvm, 64)) scb_s->ecb3 |= scb_o->ecb3 & 0x01U; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF)) + scb_s->eca |= scb_o->eca & 0x00000001U; prepare_ibc(vcpu, vsie_page); rc = shadow_crycb(vcpu, vsie_page); From 77d18f6d47fbeaaceb15df9ab928757d5bb96ec6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Nov 2015 16:32:35 +0100 Subject: [PATCH 155/302] KVM: s390: vsie: support guest-PER-enhancement We can easily forward the guest-PER-enhancement facility to guest 2 if available. Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/uapi/asm/kvm.h | 1 + arch/s390/kvm/kvm-s390.c | 2 ++ arch/s390/kvm/vsie.c | 2 ++ 3 files changed, 5 insertions(+) diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 1d2e820f763d..98526ac114bf 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -101,6 +101,7 @@ struct kvm_s390_vm_cpu_machine { #define KVM_S390_VM_CPU_FEAT_SIEF2 1 #define KVM_S390_VM_CPU_FEAT_64BSCAO 2 #define KVM_S390_VM_CPU_FEAT_SIIF 3 +#define KVM_S390_VM_CPU_FEAT_GPERE 4 struct kvm_s390_vm_cpu_feat { __u64 feat[16]; }; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d735612f9081..175752877c0d 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -269,6 +269,8 @@ static void kvm_s390_cpu_feat_init(void) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_64BSCAO); if (sclp.has_siif) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIIF); + if (sclp.has_gpere) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GPERE); } int kvm_arch_init(void *opaque) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 1615ed37f7da..b8792ef01030 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -107,6 +107,8 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) return set_validity_icpt(scb_s, 0x0001U); newflags |= CPUSTAT_GED2; } + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GPERE)) + newflags |= cpuflags & CPUSTAT_P; atomic_set(&scb_s->cpuflags, newflags); return 0; From a1b7b9b286c0157748922526ecb353e550209833 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Nov 2015 16:41:33 +0100 Subject: [PATCH 156/302] KVM: s390: vsie: support guest-storage-limit-suppression We can easily forward guest-storage-limit-suppression if available. One thing to care about is keeping the prefix properly mapped when gsls in toggled on/off or the mso changes in between. Therefore we better remap the prefix on any mso changes just like we already do with the prefix. Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/uapi/asm/kvm.h | 1 + arch/s390/kvm/kvm-s390.c | 2 ++ arch/s390/kvm/vsie.c | 7 +++++-- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 98526ac114bf..9ed07479714f 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -102,6 +102,7 @@ struct kvm_s390_vm_cpu_machine { #define KVM_S390_VM_CPU_FEAT_64BSCAO 2 #define KVM_S390_VM_CPU_FEAT_SIIF 3 #define KVM_S390_VM_CPU_FEAT_GPERE 4 +#define KVM_S390_VM_CPU_FEAT_GSLS 5 struct kvm_s390_vm_cpu_feat { __u64 feat[16]; }; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 175752877c0d..ce9813afd502 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -271,6 +271,8 @@ static void kvm_s390_cpu_feat_init(void) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIIF); if (sclp.has_gpere) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GPERE); + if (sclp.has_gsls) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GSLS); } int kvm_arch_init(void *opaque) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index b8792ef01030..ea65bf2f0201 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -109,6 +109,8 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) } if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GPERE)) newflags |= cpuflags & CPUSTAT_P; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GSLS)) + newflags |= cpuflags & CPUSTAT_SM; atomic_set(&scb_s->cpuflags, newflags); return 0; @@ -242,7 +244,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; bool had_tx = scb_s->ecb & 0x10U; - unsigned long new_mso; + unsigned long new_mso = 0; int rc; /* make sure we don't have any leftovers when reusing the scb */ @@ -284,7 +286,8 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; scb_s->icpua = scb_o->icpua; - new_mso = scb_o->mso & 0xfffffffffff00000UL; + if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM)) + new_mso = scb_o->mso & 0xfffffffffff00000UL; /* if the hva of the prefix changes, we have to remap the prefix */ if (scb_s->mso != new_mso || scb_s->prefix != scb_o->prefix) prefix_unmapped(vsie_page); From 5630a8e82b1ee4d13daa500c045603c5b4801fd9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Nov 2015 16:53:51 +0100 Subject: [PATCH 157/302] KVM: s390: vsie: support intervention-bypass We can easily enable intervention bypass for guest 2, so it can use it for guest 3. Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/uapi/asm/kvm.h | 1 + arch/s390/kvm/kvm-s390.c | 2 ++ arch/s390/kvm/vsie.c | 2 ++ 3 files changed, 5 insertions(+) diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 9ed07479714f..347f4f61b656 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -103,6 +103,7 @@ struct kvm_s390_vm_cpu_machine { #define KVM_S390_VM_CPU_FEAT_SIIF 3 #define KVM_S390_VM_CPU_FEAT_GPERE 4 #define KVM_S390_VM_CPU_FEAT_GSLS 5 +#define KVM_S390_VM_CPU_FEAT_IB 6 struct kvm_s390_vm_cpu_feat { __u64 feat[16]; }; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ce9813afd502..5ec598ca7660 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -273,6 +273,8 @@ static void kvm_s390_cpu_feat_init(void) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GPERE); if (sclp.has_gsls) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GSLS); + if (sclp.has_ib) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IB); } int kvm_arch_init(void *opaque) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index ea65bf2f0201..d29bd592fb3d 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -322,6 +322,8 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->ecb3 |= scb_o->ecb3 & 0x01U; if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF)) scb_s->eca |= scb_o->eca & 0x00000001U; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB)) + scb_s->eca |= scb_o->eca & 0x40000000U; prepare_ibc(vcpu, vsie_page); rc = shadow_crycb(vcpu, vsie_page); From 13ee3f678b1117d7511a2c5e10549f7c37f4cadf Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Nov 2015 16:54:37 +0100 Subject: [PATCH 158/302] KVM: s390: vsie: support conditional-external-interception We can easily enable cei for guest 2, so he can use it for guest 3. Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/uapi/asm/kvm.h | 1 + arch/s390/kvm/kvm-s390.c | 2 ++ arch/s390/kvm/vsie.c | 2 ++ 3 files changed, 5 insertions(+) diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 347f4f61b656..7630dd70ed57 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -104,6 +104,7 @@ struct kvm_s390_vm_cpu_machine { #define KVM_S390_VM_CPU_FEAT_GPERE 4 #define KVM_S390_VM_CPU_FEAT_GSLS 5 #define KVM_S390_VM_CPU_FEAT_IB 6 +#define KVM_S390_VM_CPU_FEAT_CEI 7 struct kvm_s390_vm_cpu_feat { __u64 feat[16]; }; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 5ec598ca7660..1c1188ba1042 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -275,6 +275,8 @@ static void kvm_s390_cpu_feat_init(void) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GSLS); if (sclp.has_ib) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IB); + if (sclp.has_cei) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI); } int kvm_arch_init(void *opaque) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index d29bd592fb3d..f3a4a0bad4a7 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -324,6 +324,8 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->eca |= scb_o->eca & 0x00000001U; if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB)) scb_s->eca |= scb_o->eca & 0x40000000U; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI)) + scb_s->eca |= scb_o->eca & 0x80000000U; prepare_ibc(vcpu, vsie_page); rc = shadow_crycb(vcpu, vsie_page); From 7fd7f39daa3da822122124730437c4f37e4d82de Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 Nov 2015 16:56:23 +0100 Subject: [PATCH 159/302] KVM: s390: vsie: support IBS interpretation We can easily enable ibs for guest 2, so he can use it for guest 3. Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/uapi/asm/kvm.h | 1 + arch/s390/kvm/kvm-s390.c | 2 ++ arch/s390/kvm/vsie.c | 2 ++ 3 files changed, 5 insertions(+) diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 7630dd70ed57..c128567d1cd3 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -105,6 +105,7 @@ struct kvm_s390_vm_cpu_machine { #define KVM_S390_VM_CPU_FEAT_GSLS 5 #define KVM_S390_VM_CPU_FEAT_IB 6 #define KVM_S390_VM_CPU_FEAT_CEI 7 +#define KVM_S390_VM_CPU_FEAT_IBS 8 struct kvm_s390_vm_cpu_feat { __u64 feat[16]; }; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 1c1188ba1042..8ba7a98a50cf 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -277,6 +277,8 @@ static void kvm_s390_cpu_feat_init(void) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IB); if (sclp.has_cei) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI); + if (sclp.has_ibs) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IBS); } int kvm_arch_init(void *opaque) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index f3a4a0bad4a7..3ececbbd6bb0 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -111,6 +111,8 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) newflags |= cpuflags & CPUSTAT_P; if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GSLS)) newflags |= cpuflags & CPUSTAT_SM; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IBS)) + newflags |= cpuflags & CPUSTAT_IBS; atomic_set(&scb_s->cpuflags, newflags); return 0; From 1b7029bec18718eca8cfc5c1c0917444f019be1e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 8 Jul 2015 13:25:31 +0200 Subject: [PATCH 160/302] KVM: s390: vsie: try to refault after a reported fault to g2 We can avoid one unneeded SIE entry after we reported a fault to g2. Theoretically, g2 resolves the fault and we can create the shadow mapping directly, instead of failing again when entering the SIE. Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/vsie.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 3ececbbd6bb0..7482488d21d0 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -28,7 +28,9 @@ struct vsie_page { struct kvm_s390_sie_block *scb_o; /* 0x0200 */ /* the shadow gmap in use by the vsie_page */ struct gmap *gmap; /* 0x0208 */ - __u8 reserved[0x0700 - 0x0210]; /* 0x0210 */ + /* address of the last reported fault to guest2 */ + unsigned long fault_addr; /* 0x0210 */ + __u8 reserved[0x0700 - 0x0218]; /* 0x0218 */ struct kvm_s390_crypto_cb crycb; /* 0x0700 */ __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */ } __packed; @@ -676,10 +678,27 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) rc = inject_fault(vcpu, rc, current->thread.gmap_addr, current->thread.gmap_write_flag); + if (rc >= 0) + vsie_page->fault_addr = current->thread.gmap_addr; } return rc; } +/* + * Retry the previous fault that required guest 2 intervention. This avoids + * one superfluous SIE re-entry and direct exit. + * + * Will ignore any errors. The next SIE fault will do proper fault handling. + */ +static void handle_last_fault(struct kvm_vcpu *vcpu, + struct vsie_page *vsie_page) +{ + if (vsie_page->fault_addr) + kvm_s390_shadow_fault(vcpu, vsie_page->gmap, + vsie_page->fault_addr); + vsie_page->fault_addr = 0; +} + static inline void clear_vsie_icpt(struct vsie_page *vsie_page) { vsie_page->scb_s.icptcode = 0; @@ -737,6 +756,8 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; int rc; + handle_last_fault(vcpu, vsie_page); + if (need_resched()) schedule(); if (test_cpu_flag(CIF_MCCK_PENDING)) @@ -928,6 +949,7 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) vsie_page = page_to_virt(page); memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block)); release_gmap_shadow(vsie_page); + vsie_page->fault_addr = 0; vsie_page->scb_s.ihcpu = 0xffffU; return vsie_page; } From adbf16985c387851fd3454ca34893705dbde7f98 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 27 May 2016 22:03:52 +0200 Subject: [PATCH 161/302] KVM: s390: vsie: speed up VCPU irq delivery when handling vsie Whenever we want to wake up a VCPU (e.g. when injecting an IRQ), we have to kick it out of vsie, so the request will be handled faster. Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 2 ++ arch/s390/kvm/interrupt.c | 5 +++++ arch/s390/kvm/kvm-s390.h | 1 + arch/s390/kvm/vsie.c | 35 ++++++++++++++++++++++++++++++++ 4 files changed, 43 insertions(+) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 190ad63291fb..946fc86202fd 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -549,6 +549,8 @@ struct kvm_guestdbg_info_arch { struct kvm_vcpu_arch { struct kvm_s390_sie_block *sie_block; + /* if vsie is active, currently executed shadow sie control block */ + struct kvm_s390_sie_block *vsie_block; unsigned int host_acrs[NUM_ACRS]; struct fpu host_fpregs; struct kvm_s390_local_interrupt local_int; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index d72c4a877622..ca19627779db 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -995,6 +995,11 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu) swake_up(&vcpu->wq); vcpu->stat.halt_wakeup++; } + /* + * The VCPU might not be sleeping but is executing the VSIE. Let's + * kick it, so it leaves the SIE to process the request. + */ + kvm_s390_vsie_kick(vcpu); } enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer) diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index b137fbaac91c..ffbbdd285385 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -254,6 +254,7 @@ int kvm_s390_handle_eb(struct kvm_vcpu *vcpu); /* implemented in vsie.c */ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu); +void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu); void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, unsigned long end); void kvm_s390_vsie_init(struct kvm *kvm); diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 7482488d21d0..c8c8763e7822 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -837,6 +837,23 @@ static int acquire_gmap_shadow(struct kvm_vcpu *vcpu, return 0; } +/* + * Register the shadow scb at the VCPU, e.g. for kicking out of vsie. + */ +static void register_shadow_scb(struct kvm_vcpu *vcpu, + struct vsie_page *vsie_page) +{ + WRITE_ONCE(vcpu->arch.vsie_block, &vsie_page->scb_s); +} + +/* + * Unregister a shadow scb from a VCPU. + */ +static void unregister_shadow_scb(struct kvm_vcpu *vcpu) +{ + WRITE_ONCE(vcpu->arch.vsie_block, NULL); +} + /* * Run the vsie on a shadowed scb, managing the gmap shadow, handling * prefix pages and faults. @@ -860,6 +877,7 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) rc = do_vsie_run(vcpu, vsie_page); gmap_enable(vcpu->arch.gmap); } + atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20); if (rc == -EAGAIN) rc = 0; @@ -1000,7 +1018,9 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu) rc = pin_blocks(vcpu, vsie_page); if (rc) goto out_unshadow; + register_shadow_scb(vcpu, vsie_page); rc = vsie_run(vcpu, vsie_page); + unregister_shadow_scb(vcpu); unpin_blocks(vcpu, vsie_page); out_unshadow: unshadow_scb(vcpu, vsie_page); @@ -1039,3 +1059,18 @@ void kvm_s390_vsie_destroy(struct kvm *kvm) kvm->arch.vsie.page_count = 0; mutex_unlock(&kvm->arch.vsie.mutex); } + +void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_sie_block *scb = READ_ONCE(vcpu->arch.vsie_block); + + /* + * Even if the VCPU lets go of the shadow sie block reference, it is + * still valid in the cache. So we can safely kick it. + */ + if (scb) { + atomic_or(PROG_BLOCK_SIE, &scb->prog20); + if (scb->prog0c & PROG_IN_SIE) + atomic_or(CPUSTAT_STOP_INT, &scb->cpuflags); + } +} From 94a15de8fb2667791d66c49610676ea2add90034 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Feb 2016 10:15:43 +0100 Subject: [PATCH 162/302] KVM: s390: don't use CPUSTAT_WAIT to detect if a VCPU is idle As we want to make use of CPUSTAT_WAIT also when a VCPU is not idle but to force interception of external calls, let's check in the bitmap instead. Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.h | 2 +- arch/s390/kvm/sigp.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index ffbbdd285385..031f451bb2cf 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -56,7 +56,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu) static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) { - return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_WAIT; + return test_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask); } static inline int kvm_is_ucontrol(struct kvm *kvm) diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 28ea0cab1f1b..1a252f537081 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -77,18 +77,18 @@ static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu, const u64 psw_int_mask = PSW_MASK_IO | PSW_MASK_EXT; u16 p_asn, s_asn; psw_t *psw; - u32 flags; + bool idle; - flags = atomic_read(&dst_vcpu->arch.sie_block->cpuflags); + idle = is_vcpu_idle(vcpu); psw = &dst_vcpu->arch.sie_block->gpsw; p_asn = dst_vcpu->arch.sie_block->gcr[4] & 0xffff; /* Primary ASN */ s_asn = dst_vcpu->arch.sie_block->gcr[3] & 0xffff; /* Secondary ASN */ /* Inject the emergency signal? */ - if (!(flags & CPUSTAT_STOPPED) + if (!is_vcpu_stopped(vcpu) || (psw->mask & psw_int_mask) != psw_int_mask - || ((flags & CPUSTAT_WAIT) && psw->addr != 0) - || (!(flags & CPUSTAT_WAIT) && (asn == p_asn || asn == s_asn))) { + || (idle && psw->addr != 0) + || (!idle && (asn == p_asn || asn == s_asn))) { return __inject_sigp_emergency(vcpu, dst_vcpu); } else { *reg &= 0xffffffff00000000UL; From b917ae573f5b3f7fee8cfb0d42d74bd8641f6401 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 7 Jul 2015 20:39:35 +0200 Subject: [PATCH 163/302] KVM: s390: vsie: speed up VCPU external calls Whenever a SIGP external call is injected via the SIGP external call interpretation facility, the VCPU is not kicked. When a VCPU is currently in the VSIE, the external call might not be processed immediately. Therefore we have to provoke partial execution exceptions, which leads to a kick of the VCPU and therefore also kick out of VSIE. This is done by simulating the WAIT state. This bit has no other side effects. Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/vsie.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index c8c8763e7822..90781ba52803 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -844,6 +844,11 @@ static void register_shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { WRITE_ONCE(vcpu->arch.vsie_block, &vsie_page->scb_s); + /* + * External calls have to lead to a kick of the vcpu and + * therefore the vsie -> Simulate Wait state. + */ + atomic_or(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags); } /* @@ -851,6 +856,7 @@ static void register_shadow_scb(struct kvm_vcpu *vcpu, */ static void unregister_shadow_scb(struct kvm_vcpu *vcpu) { + atomic_andnot(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags); WRITE_ONCE(vcpu->arch.vsie_block, NULL); } From 91473b487dd58af6384c5c3db13de50defa2c106 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 29 Oct 2015 10:30:36 +0100 Subject: [PATCH 164/302] KVM: s390: vsie: correctly set and handle guest TOD Guest 2 sets up the epoch of guest 3 from his point of view. Therefore, we have to add the guest 2 epoch to the guest 3 epoch. We also have to take care of guest 2 epoch changes on STP syncs. This will work just fine by also updating the guest 3 epoch when a vsie_block has been set for a VCPU. Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 2 ++ arch/s390/kvm/vsie.c | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 8ba7a98a50cf..6fdf1f7647d7 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -176,6 +176,8 @@ static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val, vcpu->arch.sie_block->epoch -= *delta; if (vcpu->arch.cputm_enabled) vcpu->arch.cputm_start += *delta; + if (vcpu->arch.vsie_block) + vcpu->arch.vsie_block->epoch -= *delta; } } return NOTIFY_OK; diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 90781ba52803..6895e7b3be12 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -843,12 +843,21 @@ static int acquire_gmap_shadow(struct kvm_vcpu *vcpu, static void register_shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + WRITE_ONCE(vcpu->arch.vsie_block, &vsie_page->scb_s); /* * External calls have to lead to a kick of the vcpu and * therefore the vsie -> Simulate Wait state. */ atomic_or(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags); + /* + * We have to adjust the g3 epoch by the g2 epoch. The epoch will + * automatically be adjusted on tod clock changes via kvm_sync_clock. + */ + preempt_disable(); + scb_s->epoch += vcpu->kvm->arch.epoch; + preempt_enable(); } /* From 5d3876a8bf4607b72cbe754278d19c68990b57a8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 13 Apr 2016 17:06:50 +0200 Subject: [PATCH 165/302] KVM: s390: vsie: add indication for future features We have certain SIE features that we cannot support for now. Let's add these features, so user space can directly prepare to enable them, so we don't have to update yet another component. In addition, add a comment block, telling why it is for now not possible to forward/enable these features. Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/include/uapi/asm/kvm.h | 4 ++++ arch/s390/kvm/kvm-s390.c | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index c128567d1cd3..a2ffec4139ad 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -106,6 +106,10 @@ struct kvm_s390_vm_cpu_machine { #define KVM_S390_VM_CPU_FEAT_IB 6 #define KVM_S390_VM_CPU_FEAT_CEI 7 #define KVM_S390_VM_CPU_FEAT_IBS 8 +#define KVM_S390_VM_CPU_FEAT_SKEY 9 +#define KVM_S390_VM_CPU_FEAT_CMMA 10 +#define KVM_S390_VM_CPU_FEAT_PFMFI 11 +#define KVM_S390_VM_CPU_FEAT_SIGPIF 12 struct kvm_s390_vm_cpu_feat { __u64 feat[16]; }; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 6fdf1f7647d7..31cf22f7d846 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -281,6 +281,24 @@ static void kvm_s390_cpu_feat_init(void) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI); if (sclp.has_ibs) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IBS); + /* + * KVM_S390_VM_CPU_FEAT_SKEY: Wrong shadow of PTE.I bits will make + * all skey handling functions read/set the skey from the PGSTE + * instead of the real storage key. + * + * KVM_S390_VM_CPU_FEAT_CMMA: Wrong shadow of PTE.I bits will make + * pages being detected as preserved although they are resident. + * + * KVM_S390_VM_CPU_FEAT_PFMFI: Wrong shadow of PTE.I bits will + * have the same effect as for KVM_S390_VM_CPU_FEAT_SKEY. + * + * For KVM_S390_VM_CPU_FEAT_SKEY, KVM_S390_VM_CPU_FEAT_CMMA and + * KVM_S390_VM_CPU_FEAT_PFMFI, all PTE.I and PGSTE bits have to be + * correctly shadowed. We can do that for the PGSTE but not for PTE.I. + * + * KVM_S390_VM_CPU_FEAT_SIGPIF: Wrong SCB addresses in the SCA. We + * cannot easily shadow the SCA because of the ipte lock. + */ } int kvm_arch_init(void *opaque) From a411edf1320ed8fa3b4560902ac4e033c4a72bcf Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 2 Feb 2016 15:41:22 +0100 Subject: [PATCH 166/302] KVM: s390: vsie: add module parameter "nested" Let's be careful first and allow nested virtualization only if enabled by the system administrator. In addition, user space still has to explicitly enable it via SCLP features for it to work. Acked-by: Christian Borntraeger Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 31cf22f7d846..03eeeb0ded24 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -125,6 +125,11 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; +/* allow nested virtualization in KVM (if enabled by user space) */ +static int nested; +module_param(nested, int, S_IRUGO); +MODULE_PARM_DESC(nested, "Nested virtualization support"); + /* upper facilities limit for kvm */ unsigned long kvm_s390_fac_list_mask[16] = { 0xffe6000000000000UL, @@ -264,7 +269,7 @@ static void kvm_s390_cpu_feat_init(void) * 64bit SCAO (SCA passthrough) and IDTE (for gmap_shadow unshadowing). */ if (!sclp.has_sief2 || !MACHINE_HAS_ESOP || !sclp.has_64bscao || - !test_facility(3)) + !test_facility(3) || !nested) return; allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2); if (sclp.has_64bscao) From 3b84080b9512bcacad3805f345fb8f092c8d3a7d Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Wed, 22 Jun 2016 14:59:54 +0800 Subject: [PATCH 167/302] KVM: VMX: move msr_ia32_feature_control to vcpu_vmx msr_ia32_feature_control will be used for LMCE and not depend only on nested anymore, so move it from struct nested_vmx to struct vcpu_vmx. Signed-off-by: Haozhong Zhang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e185649fb8b7..ad66978281c1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -428,7 +428,6 @@ struct nested_vmx { struct pi_desc *pi_desc; bool pi_pending; u16 posted_intr_nv; - u64 msr_ia32_feature_control; struct hrtimer preemption_timer; bool preemption_timer_expired; @@ -612,6 +611,8 @@ struct vcpu_vmx { bool guest_pkru_valid; u32 guest_pkru; u32 host_pkru; + + u64 msr_ia32_feature_control; }; enum segment_cache_field { @@ -2970,9 +2971,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; case MSR_IA32_FEATURE_CONTROL: - if (!nested_vmx_allowed(vcpu)) - return 1; - msr_info->data = to_vmx(vcpu)->nested.msr_ia32_feature_control; + msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control; break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!nested_vmx_allowed(vcpu)) @@ -3064,10 +3063,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_FEATURE_CONTROL: if (!nested_vmx_allowed(vcpu) || - (to_vmx(vcpu)->nested.msr_ia32_feature_control & + (to_vmx(vcpu)->msr_ia32_feature_control & FEATURE_CONTROL_LOCKED && !msr_info->host_initiated)) return 1; - vmx->nested.msr_ia32_feature_control = data; + vmx->msr_ia32_feature_control = data; if (msr_info->host_initiated && data == 0) vmx_leave_nested(vcpu); break; @@ -6939,7 +6938,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } - if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES) + if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) != VMXON_NEEDED_FEATURES) { kvm_inject_gp(vcpu, 0); return 1; From 37e4c997dadf713d5b9cb88a801eb38d61a2aefc Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Wed, 22 Jun 2016 14:59:55 +0800 Subject: [PATCH 168/302] KVM: VMX: validate individual bits of guest MSR_IA32_FEATURE_CONTROL KVM currently does not check the value written to guest MSR_IA32_FEATURE_CONTROL, though bits corresponding to disabled features may be set. This patch makes KVM to validate individual bits written to guest MSR_IA32_FEATURE_CONTROL according to enabled features. Signed-off-by: Haozhong Zhang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ad66978281c1..0a3ccb073bb4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -612,7 +612,13 @@ struct vcpu_vmx { u32 guest_pkru; u32 host_pkru; + /* + * Only bits masked by msr_ia32_feature_control_valid_bits can be set in + * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included + * in msr_ia32_feature_control_valid_bits. + */ u64 msr_ia32_feature_control; + u64 msr_ia32_feature_control_valid_bits; }; enum segment_cache_field { @@ -2929,6 +2935,14 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) return 0; } +static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, + uint64_t val) +{ + uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits; + + return !(val & ~valid_bits); +} + /* * Reads an msr value (of 'msr_index') into 'pdata'. * Returns 0 on success, non-0 otherwise. @@ -3062,7 +3076,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) ret = kvm_set_msr_common(vcpu, msr_info); break; case MSR_IA32_FEATURE_CONTROL: - if (!nested_vmx_allowed(vcpu) || + if (!vmx_feature_control_msr_valid(vcpu, data) || (to_vmx(vcpu)->msr_ia32_feature_control & FEATURE_CONTROL_LOCKED && !msr_info->host_initiated)) return 1; @@ -9055,6 +9069,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vmcs; } + vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED; + return &vmx->vcpu; free_vmcs: @@ -9202,6 +9218,13 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) vmx->nested.nested_vmx_secondary_ctls_high &= ~SECONDARY_EXEC_PCOMMIT; } + + if (nested_vmx_allowed(vcpu)) + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= + FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + else + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= + ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) From c45dcc71b794b5a346a43ad83bdcfac2138f0a2c Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Wed, 22 Jun 2016 14:59:56 +0800 Subject: [PATCH 169/302] KVM: VMX: enable guest access to LMCE related MSRs On Intel platforms, this patch adds LMCE to KVM MCE supported capabilities and handles guest access to LMCE related MSRs. Signed-off-by: Ashok Raj [Haozhong: macro KVM_MCE_CAP_SUPPORTED => variable kvm_mce_cap_supported Only enable LMCE on Intel platform Check MSR_IA32_FEATURE_CONTROL when handling guest access to MSR_IA32_MCG_EXT_CTL] Signed-off-by: Haozhong Zhang Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 5 +++++ arch/x86/kvm/vmx.c | 29 +++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 15 +++++++++------ 3 files changed, 43 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 360c5171ea1a..7a628fb6a2c2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -598,6 +598,7 @@ struct kvm_vcpu_arch { u64 mcg_cap; u64 mcg_status; u64 mcg_ctl; + u64 mcg_ext_ctl; u64 *mce_banks; /* Cache MMIO info */ @@ -1008,6 +1009,8 @@ struct kvm_x86_ops { int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc); void (*cancel_hv_timer)(struct kvm_vcpu *vcpu); + + void (*setup_mce)(struct kvm_vcpu *vcpu); }; struct kvm_arch_async_pf { @@ -1082,6 +1085,8 @@ extern u64 kvm_max_tsc_scaling_ratio; /* 1ull << kvm_tsc_scaling_ratio_frac_bits */ extern u64 kvm_default_tsc_scaling_ratio; +extern u64 kvm_mce_cap_supported; + enum emulation_result { EMULATE_DONE, /* no further processing */ EMULATE_USER_EXIT, /* kvm_run ready for userspace exit */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0a3ccb073bb4..943609f06c90 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2984,6 +2984,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; + case MSR_IA32_MCG_EXT_CTL: + if (!msr_info->host_initiated && + !(to_vmx(vcpu)->msr_ia32_feature_control & + FEATURE_CONTROL_LMCE)) + return 1; + msr_info->data = vcpu->arch.mcg_ext_ctl; + break; case MSR_IA32_FEATURE_CONTROL: msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control; break; @@ -3075,6 +3082,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC_ADJUST: ret = kvm_set_msr_common(vcpu, msr_info); break; + case MSR_IA32_MCG_EXT_CTL: + if ((!msr_info->host_initiated && + !(to_vmx(vcpu)->msr_ia32_feature_control & + FEATURE_CONTROL_LMCE)) || + (data & ~MCG_EXT_CTL_LMCE_EN)) + return 1; + vcpu->arch.mcg_ext_ctl = data; + break; case MSR_IA32_FEATURE_CONTROL: if (!vmx_feature_control_msr_valid(vcpu, data) || (to_vmx(vcpu)->msr_ia32_feature_control & @@ -6484,6 +6499,8 @@ static __init int hardware_setup(void) kvm_set_posted_intr_wakeup_handler(wakeup_handler); + kvm_mce_cap_supported |= MCG_LMCE_P; + return alloc_kvm_area(); out8: @@ -11109,6 +11126,16 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, return ret; } +static void vmx_setup_mce(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.mcg_cap & MCG_LMCE_P) + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= + FEATURE_CONTROL_LMCE; + else + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= + ~FEATURE_CONTROL_LMCE; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -11238,6 +11265,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_hv_timer = vmx_set_hv_timer, .cancel_hv_timer = vmx_cancel_hv_timer, #endif + + .setup_mce = vmx_setup_mce, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 299219630c94..0a42fc729ff3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -70,7 +70,8 @@ #define MAX_IO_MSRS 256 #define KVM_MAX_MCE_BANKS 32 -#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) +u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; +EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); #define emul_to_vcpu(ctxt) \ container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) @@ -984,6 +985,7 @@ static u32 emulated_msrs[] = { MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, MSR_IA32_MCG_CTL, + MSR_IA32_MCG_EXT_CTL, MSR_IA32_SMBASE, }; @@ -2685,11 +2687,9 @@ long kvm_arch_dev_ioctl(struct file *filp, break; } case KVM_X86_GET_MCE_CAP_SUPPORTED: { - u64 mce_cap; - - mce_cap = KVM_MCE_CAP_SUPPORTED; r = -EFAULT; - if (copy_to_user(argp, &mce_cap, sizeof mce_cap)) + if (copy_to_user(argp, &kvm_mce_cap_supported, + sizeof(kvm_mce_cap_supported))) goto out; r = 0; break; @@ -2872,7 +2872,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, r = -EINVAL; if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) goto out; - if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) + if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000)) goto out; r = 0; vcpu->arch.mcg_cap = mcg_cap; @@ -2882,6 +2882,9 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, /* Init IA32_MCi_CTL to all 1s */ for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; + + if (kvm_x86_ops->setup_mce) + kvm_x86_ops->setup_mce(vcpu); out: return r; } From 87aeb54f1b9891cf08b84b3f0c34f220a4977c4f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Jun 2016 17:48:56 +0200 Subject: [PATCH 170/302] kvm: x86: use getboottime64 KVM reads the current boottime value as a struct timespec in order to calculate the guest wallclock time, resulting in an overflow in 2038 on 32-bit systems. The data then gets passed as an unsigned 32-bit number to the guest, and that in turn overflows in 2106. We cannot do much about the second overflow, which affects both 32-bit and 64-bit hosts, but we can ensure that they both behave the same way and don't overflow until 2106, by using getboottime64() to read a timespec64 value. Signed-off-by: Arnd Bergmann Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0a42fc729ff3..9e50e2ad6d08 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1165,7 +1165,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) int version; int r; struct pvclock_wall_clock wc; - struct timespec boot; + struct timespec64 boot; if (!wall_clock) return; @@ -1188,13 +1188,13 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) * wall clock specified here. guest system time equals host * system time for us, thus we must fill in host boot time here. */ - getboottime(&boot); + getboottime64(&boot); if (kvm->arch.kvmclock_offset) { - struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset); - boot = timespec_sub(boot, ts); + struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset); + boot = timespec64_sub(boot, ts); } - wc.sec = boot.tv_sec; + wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */ wc.nsec = boot.tv_nsec; wc.version = version; From fb6cec1492d6a693d33224487061e92d0275c6e2 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 17 Jun 2016 18:19:40 +0100 Subject: [PATCH 171/302] MIPS: KVM: Combine entry trace events into class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Combine the kvm_enter, kvm_reenter and kvm_out trace events into a single kvm_transition event class to reduce duplication and bloat. Suggested-by: Steven Rostedt Fixes: 93258604ab6d ("MIPS: KVM: Add guest mode switch trace events") Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Ralf Baechle Cc: Steven Rostedt Cc: Ingo Molnar Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/trace.h | 58 +++++++++++++++---------------------------- 1 file changed, 20 insertions(+), 38 deletions(-) diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h index a38bdab68574..c858cf168078 100644 --- a/arch/mips/kvm/trace.h +++ b/arch/mips/kvm/trace.h @@ -20,50 +20,32 @@ /* * Tracepoints for VM enters */ -TRACE_EVENT(kvm_enter, - TP_PROTO(struct kvm_vcpu *vcpu), - TP_ARGS(vcpu), - TP_STRUCT__entry( - __field(unsigned long, pc) - ), +DECLARE_EVENT_CLASS(kvm_transition, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu), + TP_STRUCT__entry( + __field(unsigned long, pc) + ), - TP_fast_assign( - __entry->pc = vcpu->arch.pc; - ), + TP_fast_assign( + __entry->pc = vcpu->arch.pc; + ), - TP_printk("PC: 0x%08lx", - __entry->pc) + TP_printk("PC: 0x%08lx", + __entry->pc) ); -TRACE_EVENT(kvm_reenter, - TP_PROTO(struct kvm_vcpu *vcpu), - TP_ARGS(vcpu), - TP_STRUCT__entry( - __field(unsigned long, pc) - ), +DEFINE_EVENT(kvm_transition, kvm_enter, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu)); - TP_fast_assign( - __entry->pc = vcpu->arch.pc; - ), +DEFINE_EVENT(kvm_transition, kvm_reenter, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu)); - TP_printk("PC: 0x%08lx", - __entry->pc) -); - -TRACE_EVENT(kvm_out, - TP_PROTO(struct kvm_vcpu *vcpu), - TP_ARGS(vcpu), - TP_STRUCT__entry( - __field(unsigned long, pc) - ), - - TP_fast_assign( - __entry->pc = vcpu->arch.pc; - ), - - TP_printk("PC: 0x%08lx", - __entry->pc) -); +DEFINE_EVENT(kvm_transition, kvm_out, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu)); /* The first 32 exit reasons correspond to Cause.ExcCode */ #define KVM_TRACE_EXIT_INT 0 From ebaac1736245e78109cd47d453a86a18dcfc94b8 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 15 Jun 2016 15:09:28 +0200 Subject: [PATCH 172/302] context_tracking: move rcu_virt_note_context_switch out of kvm_host.h Make kvm_guest_{enter,exit} and __kvm_guest_{enter,exit} trivial wrappers around the code in context_tracking.h. Name the context_tracking.h functions consistently with what those for kernel<->user switch. Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Thomas Gleixner Reviewed-by: Rik van Riel Signed-off-by: Paolo Bonzini --- include/linux/context_tracking.h | 38 ++++++++++++++++++++++++++++---- include/linux/kvm_host.h | 25 ++++----------------- 2 files changed, 38 insertions(+), 25 deletions(-) diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index d259274238db..ff4a32d24d56 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -84,7 +84,8 @@ static inline void context_tracking_init(void) { } #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -static inline void guest_enter(void) +/* must be called with irqs disabled */ +static inline void guest_enter_irqoff(void) { if (vtime_accounting_cpu_enabled()) vtime_guest_enter(current); @@ -93,9 +94,19 @@ static inline void guest_enter(void) if (context_tracking_is_enabled()) __context_tracking_enter(CONTEXT_GUEST); + + /* KVM does not hold any references to rcu protected data when it + * switches CPU into a guest mode. In fact switching to a guest mode + * is very similar to exiting to userspace from rcu point of view. In + * addition CPU may stay in a guest mode for quite a long time (up to + * one time slice). Lets treat guest mode as quiescent state, just like + * we do with user-mode execution. + */ + if (!context_tracking_cpu_is_enabled()) + rcu_virt_note_context_switch(smp_processor_id()); } -static inline void guest_exit(void) +static inline void guest_exit_irqoff(void) { if (context_tracking_is_enabled()) __context_tracking_exit(CONTEXT_GUEST); @@ -107,7 +118,7 @@ static inline void guest_exit(void) } #else -static inline void guest_enter(void) +static inline void guest_enter_irqoff(void) { /* * This is running in ioctl context so its safe @@ -116,9 +127,10 @@ static inline void guest_enter(void) */ vtime_account_system(current); current->flags |= PF_VCPU; + rcu_virt_note_context_switch(smp_processor_id()); } -static inline void guest_exit(void) +static inline void guest_exit_irqoff(void) { /* Flush the guest cputime we spent on the guest */ vtime_account_system(current); @@ -126,4 +138,22 @@ static inline void guest_exit(void) } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ +static inline void guest_enter(void) +{ + unsigned long flags; + + local_irq_save(flags); + guest_enter_irqoff(); + local_irq_restore(flags); +} + +static inline void guest_exit(void) +{ + unsigned long flags; + + local_irq_save(flags); + guest_exit_irqoff(); + local_irq_restore(flags); +} + #endif diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0640ee92b978..ffff40522688 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -878,40 +878,23 @@ static inline void kvm_iommu_unmap_pages(struct kvm *kvm, /* must be called with irqs disabled */ static inline void __kvm_guest_enter(void) { - guest_enter(); - /* KVM does not hold any references to rcu protected data when it - * switches CPU into a guest mode. In fact switching to a guest mode - * is very similar to exiting to userspace from rcu point of view. In - * addition CPU may stay in a guest mode for quite a long time (up to - * one time slice). Lets treat guest mode as quiescent state, just like - * we do with user-mode execution. - */ - if (!context_tracking_cpu_is_enabled()) - rcu_virt_note_context_switch(smp_processor_id()); + guest_enter_irqoff(); } /* must be called with irqs disabled */ static inline void __kvm_guest_exit(void) { - guest_exit(); + guest_exit_irqoff(); } static inline void kvm_guest_enter(void) { - unsigned long flags; - - local_irq_save(flags); - __kvm_guest_enter(); - local_irq_restore(flags); + guest_enter(); } static inline void kvm_guest_exit(void) { - unsigned long flags; - - local_irq_save(flags); - __kvm_guest_exit(); - local_irq_restore(flags); + guest_exit(); } /* From c8dddecdeb2ae95a6535855ce8a26b7197471b16 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 13 Jun 2016 15:00:45 +0100 Subject: [PATCH 173/302] arm/arm64: KVM: Add a protection parameter to create_hyp_mappings Currently, create_hyp_mappings applies a "one size fits all" page protection (PAGE_HYP). As we're heading towards separate protections for different sections, let's make this protection a parameter, and let the callers pass their prefered protection (PAGE_HYP for everyone for the time being). Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_mmu.h | 2 +- arch/arm/kvm/arm.c | 13 +++++++------ arch/arm/kvm/mmu.c | 5 +++-- arch/arm64/include/asm/kvm_mmu.h | 2 +- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index f9a65061130b..6cb4d4d5c48c 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -49,7 +49,7 @@ #include #include -int create_hyp_mappings(void *from, void *to); +int create_hyp_mappings(void *from, void *to, pgprot_t prot); int create_hyp_io_mappings(void *from, void *to, phys_addr_t); void free_boot_hyp_pgd(void); void free_hyp_pgds(void); diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index f20ca84537f5..45dd6df70cdf 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -122,7 +122,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (ret) goto out_fail_alloc; - ret = create_hyp_mappings(kvm, kvm + 1); + ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP); if (ret) goto out_free_stage2_pgd; @@ -239,7 +239,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) if (err) goto free_vcpu; - err = create_hyp_mappings(vcpu, vcpu + 1); + err = create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP); if (err) goto vcpu_uninit; @@ -1293,14 +1293,14 @@ static int init_hyp_mode(void) * Map the Hyp-code called directly from the host */ err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start), - kvm_ksym_ref(__hyp_text_end)); + kvm_ksym_ref(__hyp_text_end), PAGE_HYP); if (err) { kvm_err("Cannot map world-switch code\n"); goto out_err; } err = create_hyp_mappings(kvm_ksym_ref(__start_rodata), - kvm_ksym_ref(__end_rodata)); + kvm_ksym_ref(__end_rodata), PAGE_HYP); if (err) { kvm_err("Cannot map rodata section\n"); goto out_err; @@ -1311,7 +1311,8 @@ static int init_hyp_mode(void) */ for_each_possible_cpu(cpu) { char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu); - err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE); + err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE, + PAGE_HYP); if (err) { kvm_err("Cannot map hyp stack\n"); @@ -1323,7 +1324,7 @@ static int init_hyp_mode(void) kvm_cpu_context_t *cpu_ctxt; cpu_ctxt = per_cpu_ptr(kvm_host_cpu_state, cpu); - err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1); + err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1, PAGE_HYP); if (err) { kvm_err("Cannot map host CPU state: %d\n", err); diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 45c43aecb8f2..49cb5ccf6c23 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -679,12 +679,13 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr) * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode * @from: The virtual kernel start address of the range * @to: The virtual kernel end address of the range (exclusive) + * @prot: The protection to be applied to this range * * The same virtual address as the kernel virtual address is also used * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying * physical pages. */ -int create_hyp_mappings(void *from, void *to) +int create_hyp_mappings(void *from, void *to, pgprot_t prot) { phys_addr_t phys_addr; unsigned long virt_addr; @@ -704,7 +705,7 @@ int create_hyp_mappings(void *from, void *to) err = __create_hyp_mappings(hyp_pgd, virt_addr, virt_addr + PAGE_SIZE, __phys_to_pfn(phys_addr), - PAGE_HYP); + prot); if (err) return err; } diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index f05ac27d033e..fdfbddbe9fba 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -81,7 +81,7 @@ alternative_endif #include -int create_hyp_mappings(void *from, void *to); +int create_hyp_mappings(void *from, void *to, pgprot_t prot); int create_hyp_io_mappings(void *from, void *to, phys_addr_t); void free_boot_hyp_pgd(void); void free_hyp_pgds(void); From 1166f3fe6a86798e4fcd24cefb6b06da3fd0420f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 13 Jun 2016 15:00:46 +0100 Subject: [PATCH 174/302] arm64: Add PTE_HYP_XN page table flag EL2 page tables can be configured to deny code from being executed, which is done by setting bit 54 in the page descriptor. It is the same bit as PTE_UXN, but the "USER" reference felt odd in the hypervisor code. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/include/asm/pgtable-hwdef.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 2813748e2f24..c3ae239db3ee 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -164,6 +164,7 @@ #define PTE_CONT (_AT(pteval_t, 1) << 52) /* Contiguous range */ #define PTE_PXN (_AT(pteval_t, 1) << 53) /* Privileged XN */ #define PTE_UXN (_AT(pteval_t, 1) << 54) /* User XN */ +#define PTE_HYP_XN (_AT(pteval_t, 1) << 54) /* HYP XN */ /* * AttrIndx[2:0] encoding (mapping attributes defined in the MAIR* registers). From 74a6b8885f7026e33f8a4776b7ac17c76b8e5a52 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 13 Jun 2016 15:00:47 +0100 Subject: [PATCH 175/302] arm/arm64: KVM: Enforce HYP read-only mapping of the kernel's rodata section In order to be able to use C code in HYP, we're now mapping the kernel's rodata in HYP. It works absolutely fine, except that we're mapping it RWX, which is not what it should be. Add a new HYP_PAGE_RO protection, and pass it as the protection flags when mapping the rodata section. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/pgtable.h | 1 + arch/arm/kvm/arm.c | 2 +- arch/arm64/include/asm/pgtable-prot.h | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index 348caabb7625..f3320870a90a 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -98,6 +98,7 @@ extern pgprot_t pgprot_s2_device; #define PAGE_KERNEL _MOD_PROT(pgprot_kernel, L_PTE_XN) #define PAGE_KERNEL_EXEC pgprot_kernel #define PAGE_HYP _MOD_PROT(pgprot_kernel, L_PTE_HYP) +#define PAGE_HYP_RO _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY | L_PTE_XN) #define PAGE_HYP_DEVICE _MOD_PROT(pgprot_hyp_device, L_PTE_HYP) #define PAGE_S2 _MOD_PROT(pgprot_s2, L_PTE_S2_RDONLY) #define PAGE_S2_DEVICE _MOD_PROT(pgprot_s2_device, L_PTE_S2_RDONLY) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 45dd6df70cdf..b30897679e53 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -1300,7 +1300,7 @@ static int init_hyp_mode(void) } err = create_hyp_mappings(kvm_ksym_ref(__start_rodata), - kvm_ksym_ref(__end_rodata), PAGE_HYP); + kvm_ksym_ref(__end_rodata), PAGE_HYP_RO); if (err) { kvm_err("Cannot map rodata section\n"); goto out_err; diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 29fcb33ab401..88db58cac587 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -56,6 +56,7 @@ #define PAGE_KERNEL_EXEC_CONT __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_CONT) #define PAGE_HYP __pgprot(_PAGE_DEFAULT | PTE_HYP) +#define PAGE_HYP_RO __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN) #define PAGE_HYP_DEVICE __pgprot(PROT_DEVICE_nGnRE | PTE_HYP) #define PAGE_S2 __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY) From 5900270550cbb8a272bfc248b69531cd44dcf0d5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 13 Jun 2016 15:00:48 +0100 Subject: [PATCH 176/302] arm/arm64: KVM: Map the HYP text as read-only There should be no reason for mapping the HYP text read/write. As such, let's have a new set of flags (PAGE_HYP_EXEC) that allows execution, but makes the page as read-only, and update the two call sites that deal with mapping code. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/pgtable.h | 1 + arch/arm/kvm/arm.c | 2 +- arch/arm/kvm/mmu.c | 6 +++--- arch/arm64/include/asm/pgtable-prot.h | 1 + 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index f3320870a90a..7487bf9f97dc 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -98,6 +98,7 @@ extern pgprot_t pgprot_s2_device; #define PAGE_KERNEL _MOD_PROT(pgprot_kernel, L_PTE_XN) #define PAGE_KERNEL_EXEC pgprot_kernel #define PAGE_HYP _MOD_PROT(pgprot_kernel, L_PTE_HYP) +#define PAGE_HYP_EXEC _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY) #define PAGE_HYP_RO _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY | L_PTE_XN) #define PAGE_HYP_DEVICE _MOD_PROT(pgprot_hyp_device, L_PTE_HYP) #define PAGE_S2 _MOD_PROT(pgprot_s2, L_PTE_S2_RDONLY) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index b30897679e53..c74483fc39f2 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -1293,7 +1293,7 @@ static int init_hyp_mode(void) * Map the Hyp-code called directly from the host */ err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start), - kvm_ksym_ref(__hyp_text_end), PAGE_HYP); + kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC); if (err) { kvm_err("Cannot map world-switch code\n"); goto out_err; diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 49cb5ccf6c23..679608fa1666 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1733,7 +1733,7 @@ int kvm_mmu_init(void) err = __create_hyp_mappings(boot_hyp_pgd, hyp_idmap_start, hyp_idmap_end, __phys_to_pfn(hyp_idmap_start), - PAGE_HYP); + PAGE_HYP_EXEC); if (err) { kvm_err("Failed to idmap %lx-%lx\n", @@ -1756,7 +1756,7 @@ int kvm_mmu_init(void) err = __create_hyp_mappings(boot_hyp_pgd, TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE, __phys_to_pfn(hyp_idmap_start), - PAGE_HYP); + PAGE_HYP_EXEC); if (err) { kvm_err("Failed to map trampoline @%lx into boot HYP pgd\n", TRAMPOLINE_VA); @@ -1767,7 +1767,7 @@ int kvm_mmu_init(void) err = __create_hyp_mappings(hyp_pgd, TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE, __phys_to_pfn(hyp_idmap_start), - PAGE_HYP); + PAGE_HYP_EXEC); if (err) { kvm_err("Failed to map trampoline @%lx into runtime HYP pgd\n", TRAMPOLINE_VA); diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 88db58cac587..380204847c20 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -56,6 +56,7 @@ #define PAGE_KERNEL_EXEC_CONT __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_CONT) #define PAGE_HYP __pgprot(_PAGE_DEFAULT | PTE_HYP) +#define PAGE_HYP_EXEC __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY) #define PAGE_HYP_RO __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN) #define PAGE_HYP_DEVICE __pgprot(PROT_DEVICE_nGnRE | PTE_HYP) From 0996353f8ec6c6dba4a1f916bf6d9ace6f7d2b49 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 13 Jun 2016 15:00:49 +0100 Subject: [PATCH 177/302] arm/arm64: KVM: Make default HYP mappings non-excutable Structures that can be generally written to don't have any requirement to be executable (quite the opposite). This includes the kvm and vcpu structures, as well as the stacks. Let's change the default to incorporate the XN flag. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/pgtable.h | 2 +- arch/arm64/include/asm/pgtable-prot.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index 7487bf9f97dc..e0d76ba24b30 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -97,7 +97,7 @@ extern pgprot_t pgprot_s2_device; #define PAGE_READONLY_EXEC _MOD_PROT(pgprot_user, L_PTE_USER | L_PTE_RDONLY) #define PAGE_KERNEL _MOD_PROT(pgprot_kernel, L_PTE_XN) #define PAGE_KERNEL_EXEC pgprot_kernel -#define PAGE_HYP _MOD_PROT(pgprot_kernel, L_PTE_HYP) +#define PAGE_HYP _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_XN) #define PAGE_HYP_EXEC _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY) #define PAGE_HYP_RO _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY | L_PTE_XN) #define PAGE_HYP_DEVICE _MOD_PROT(pgprot_hyp_device, L_PTE_HYP) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 380204847c20..39f5252673f7 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -55,7 +55,7 @@ #define PAGE_KERNEL_EXEC __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE) #define PAGE_KERNEL_EXEC_CONT __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_CONT) -#define PAGE_HYP __pgprot(_PAGE_DEFAULT | PTE_HYP) +#define PAGE_HYP __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN) #define PAGE_HYP_EXEC __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY) #define PAGE_HYP_RO __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN) #define PAGE_HYP_DEVICE __pgprot(PROT_DEVICE_nGnRE | PTE_HYP) From 6edaa5307f3f51e4e56dc4c63f68a69d88c6ddf5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 15 Jun 2016 15:18:26 +0200 Subject: [PATCH 178/302] KVM: remove kvm_guest_enter/exit wrappers Use the functions from context_tracking.h directly. Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Thomas Gleixner Reviewed-by: Rik van Riel Signed-off-by: Paolo Bonzini --- arch/arm/kvm/arm.c | 8 ++++---- arch/mips/kvm/mips.c | 4 ++-- arch/powerpc/kvm/book3s_hv.c | 4 ++-- arch/powerpc/kvm/book3s_pr.c | 4 ++-- arch/powerpc/kvm/booke.c | 4 ++-- arch/powerpc/kvm/powerpc.c | 2 +- arch/s390/kvm/kvm-s390.c | 4 ++-- arch/s390/kvm/vsie.c | 4 ++-- arch/x86/kvm/x86.c | 4 ++-- include/linux/kvm_host.h | 22 ---------------------- 10 files changed, 19 insertions(+), 41 deletions(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index f20ca84537f5..9ac4970882fe 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -615,7 +615,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) * Enter the guest */ trace_kvm_entry(*vcpu_pc(vcpu)); - __kvm_guest_enter(); + guest_enter_irqoff(); vcpu->mode = IN_GUEST_MODE; ret = kvm_call_hyp(__kvm_vcpu_run, vcpu); @@ -641,14 +641,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) local_irq_enable(); /* - * We do local_irq_enable() before calling kvm_guest_exit() so + * We do local_irq_enable() before calling guest_exit() so * that if a timer interrupt hits while running the guest we * account that tick as being spent in the guest. We enable - * preemption after calling kvm_guest_exit() so that if we get + * preemption after calling guest_exit() so that if we get * preempted we make sure ticks after that is not counted as * guest time. */ - kvm_guest_exit(); + guest_exit(); trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); /* diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 5a2b9034a05c..5f1163653b50 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -406,7 +406,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) kvm_mips_deliver_interrupts(vcpu, kvm_read_c0_guest_cause(vcpu->arch.cop0)); - __kvm_guest_enter(); + guest_enter_irqoff(); /* Disable hardware page table walking while in guest */ htw_stop(); @@ -418,7 +418,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) /* Re-enable HTW before enabling interrupts */ htw_start(); - __kvm_guest_exit(); + guest_exit_irqoff(); local_irq_enable(); if (vcpu->sigset_active) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index e20beae5ca7a..6b2859c12ae8 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2522,7 +2522,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) spin_unlock(&pvc->lock); - kvm_guest_enter(); + guest_enter(); srcu_idx = srcu_read_lock(&vc->kvm->srcu); @@ -2570,7 +2570,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) /* make sure updates to secondary vcpu structs are visible now */ smp_mb(); - kvm_guest_exit(); + guest_exit(); for (sub = 0; sub < core_info.n_subcores; ++sub) list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub], diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 8e4f64f0b774..6a66c5ff0827 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -914,7 +914,7 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, /* We get here with MSR.EE=1 */ trace_kvm_exit(exit_nr, vcpu); - kvm_guest_exit(); + guest_exit(); switch (exit_nr) { case BOOK3S_INTERRUPT_INST_STORAGE: @@ -1531,7 +1531,7 @@ static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) kvmppc_clear_debug(vcpu); - /* No need for kvm_guest_exit. It's done in handle_exit. + /* No need for guest_exit. It's done in handle_exit. We also get here with interrupts enabled. */ /* Make sure we save the guest FPU/Altivec/VSX state */ diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 4afae695899a..02b4672f7347 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -776,7 +776,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) ret = __kvmppc_vcpu_run(kvm_run, vcpu); - /* No need for kvm_guest_exit. It's done in handle_exit. + /* No need for guest_exit. It's done in handle_exit. We also get here with interrupts enabled. */ /* Switch back to user space debug context */ @@ -1012,7 +1012,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, } trace_kvm_exit(exit_nr, vcpu); - __kvm_guest_exit(); + guest_exit_irqoff(); local_irq_enable(); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 02416fea7653..1ac036e45ed4 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -119,7 +119,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) continue; } - __kvm_guest_enter(); + guest_enter_irqoff(); return 1; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 03eeeb0ded24..d42428c11794 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2623,14 +2623,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) * guest_enter and guest_exit should be no uaccess. */ local_irq_disable(); - __kvm_guest_enter(); + guest_enter_irqoff(); __disable_cpu_timer_accounting(vcpu); local_irq_enable(); exit_reason = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs); local_irq_disable(); __enable_cpu_timer_accounting(vcpu); - __kvm_guest_exit(); + guest_exit_irqoff(); local_irq_enable(); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 6895e7b3be12..c106488b4137 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -765,13 +765,13 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); local_irq_disable(); - kvm_guest_enter(); + guest_enter_irqoff(); local_irq_enable(); rc = sie64a(scb_s, vcpu->run->s.regs.gprs); local_irq_disable(); - kvm_guest_exit(); + guest_exit_irqoff(); local_irq_enable(); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9e50e2ad6d08..618463abeec5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6658,7 +6658,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) trace_kvm_entry(vcpu->vcpu_id); wait_lapic_expire(vcpu); - __kvm_guest_enter(); + guest_enter_irqoff(); if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); @@ -6717,7 +6717,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) */ barrier(); - kvm_guest_exit(); + guest_exit(); preempt_enable(); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ffff40522688..66b2f6159aad 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -875,28 +875,6 @@ static inline void kvm_iommu_unmap_pages(struct kvm *kvm, } #endif -/* must be called with irqs disabled */ -static inline void __kvm_guest_enter(void) -{ - guest_enter_irqoff(); -} - -/* must be called with irqs disabled */ -static inline void __kvm_guest_exit(void) -{ - guest_exit_irqoff(); -} - -static inline void kvm_guest_enter(void) -{ - guest_enter(); -} - -static inline void kvm_guest_exit(void) -{ - guest_exit(); -} - /* * search_memslots() and __gfn_to_memslot() are here because they are * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c. From 91fa0f8e9e2937fd9360f326ad60d51908347afd Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 15 Jun 2016 20:55:08 +0200 Subject: [PATCH 179/302] KVM: x86: always use "acknowledge interrupt on exit" This is necessary to simplify handle_external_intr in the next patch. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 943609f06c90..1b413a520ef3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3390,12 +3390,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmx_capability.ept, vmx_capability.vpid); } - min = VM_EXIT_SAVE_DEBUG_CONTROLS; + min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | - VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS; + VM_EXIT_CLEAR_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; @@ -3408,8 +3408,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) return -EIO; if (!(_cpu_based_2nd_exec_control & - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) || - !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; min = VM_ENTRY_LOAD_DEBUG_CONTROLS; From f2485b3e0c6c0aa3a9546babc2fad3739e964ebb Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 15 Jun 2016 15:23:11 +0200 Subject: [PATCH 180/302] KVM: x86: use guest_exit_irqoff This gains a few clock cycles per vmexit. On Intel there is no need anymore to enable the interrupts in vmx_handle_external_intr, since we are using the "acknowledge interrupt on exit" feature. AMD needs to do that, and must be careful to avoid the interrupt shadow. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 6 ++++++ arch/x86/kvm/vmx.c | 4 +--- arch/x86/kvm/x86.c | 11 ++--------- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5ff292778110..5bfdbbf1ce79 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4935,6 +4935,12 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, static void svm_handle_external_intr(struct kvm_vcpu *vcpu) { local_irq_enable(); + /* + * We must have an instruction with interrupts enabled, so + * the timer interrupt isn't delayed by the interrupt shadow. + */ + asm("nop"); + local_irq_disable(); } static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1b413a520ef3..c1d655c10fd2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8574,7 +8574,6 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) "push %[sp]\n\t" #endif "pushf\n\t" - "orl $0x200, (%%" _ASM_SP ")\n\t" __ASM_SIZE(push) " $%c[cs]\n\t" "call *%[entry]\n\t" : @@ -8587,8 +8586,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) [ss]"i"(__KERNEL_DS), [cs]"i"(__KERNEL_CS) ); - } else - local_irq_enable(); + } } static bool vmx_has_high_real_mode_segbase(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 618463abeec5..0cc6cf834cdd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6709,16 +6709,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) ++vcpu->stat.exits; - /* - * We must have an instruction between local_irq_enable() and - * kvm_guest_exit(), so the timer interrupt isn't delayed by - * the interrupt shadow. The stat.exits increment will do nicely. - * But we need to prevent reordering, hence this barrier(): - */ - barrier(); - - guest_exit(); + guest_exit_irqoff(); + local_irq_enable(); preempt_enable(); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); From 9175d2e97b08e86293e68246020a5c29f88aa674 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 27 Jun 2016 15:08:01 +0200 Subject: [PATCH 181/302] KVM: vmx: fix underflow in TSC deadline calculation If the TSC deadline timer is programmed really close to the deadline or even in the past, the computation in vmx_set_hv_timer can underflow and cause delta_tsc to be set to a huge value. This generally results in vmx_set_hv_timer returning -ERANGE, but we can fix it by limiting delta_tsc to be positive or zero. Reported-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c1d655c10fd2..85e2f0a882ca 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10829,9 +10829,9 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift, static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 tscl = rdtsc(), delta_tsc; - - delta_tsc = guest_deadline_tsc - kvm_read_l1_tsc(vcpu, tscl); + u64 tscl = rdtsc(); + u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl); + u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; /* Convert to host delta tsc if tsc scaling is enabled */ if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && From bd97ad0e7ed6a8870cc691fdfd108dc952fe45eb Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 30 Jun 2016 08:52:49 +0800 Subject: [PATCH 182/302] KVM: x86: introduce cancel_hv_tscdeadline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce cancel_hv_tscdeadline() to encapsulate preemption timer cancel stuff. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Yunhong Jiang Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index fdc05ae08bac..9c20ac14caf5 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1349,14 +1349,19 @@ bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); +static void cancel_hv_tscdeadline(struct kvm_lapic *apic) +{ + kvm_x86_ops->cancel_hv_timer(apic->vcpu); + apic->lapic_timer.hv_timer_in_use = false; +} + void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; WARN_ON(!apic->lapic_timer.hv_timer_in_use); WARN_ON(swait_active(&vcpu->wq)); - kvm_x86_ops->cancel_hv_timer(vcpu); - apic->lapic_timer.hv_timer_in_use = false; + cancel_hv_tscdeadline(apic); apic_timer_expired(apic); } EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); @@ -1376,10 +1381,8 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) hrtimer_cancel(&apic->lapic_timer.timer); /* In case the sw timer triggered in the window */ - if (atomic_read(&apic->lapic_timer.pending)) { - apic->lapic_timer.hv_timer_in_use = false; - kvm_x86_ops->cancel_hv_timer(apic->vcpu); - } + if (atomic_read(&apic->lapic_timer.pending)) + cancel_hv_tscdeadline(apic); } trace_kvm_hv_timer_state(vcpu->vcpu_id, apic->lapic_timer.hv_timer_in_use); @@ -1395,8 +1398,7 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) if (!apic->lapic_timer.hv_timer_in_use) return; - kvm_x86_ops->cancel_hv_timer(vcpu); - apic->lapic_timer.hv_timer_in_use = false; + cancel_hv_tscdeadline(apic); if (atomic_read(&apic->lapic_timer.pending)) return; From 196f20ca52e8c7281932663c348fa54b82d03914 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 28 Jun 2016 14:54:19 +0800 Subject: [PATCH 183/302] KVM: vmx: fix missed cancellation of TSC deadline timer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit INFO: rcu_sched detected stalls on CPUs/tasks: 1-...: (11800 GPs behind) idle=45d/140000000000000/0 softirq=0/0 fqs=21663 (detected by 0, t=65016 jiffies, g=11500, c=11499, q=719) Task dump for CPU 1: qemu-system-x86 R running task 0 3529 3525 0x00080808 ffff8802021791a0 ffff880212895040 0000000000000001 00007f1c2c00db40 ffff8801dd20fcd3 ffffc90002b98000 ffff8801dd20fc88 ffff8801dd20fcf8 0000000000000286 ffff8801dd2ac538 ffff8801dd20fcc0 ffffffffc06949c9 Call Trace: ? kvm_write_guest_cached+0xb9/0x160 [kvm] ? __delay+0xf/0x20 ? wait_lapic_expire+0x14a/0x200 [kvm] ? kvm_arch_vcpu_ioctl_run+0xcbe/0x1b00 [kvm] ? kvm_arch_vcpu_ioctl_run+0xe34/0x1b00 [kvm] ? kvm_vcpu_ioctl+0x2d3/0x7c0 [kvm] ? __fget+0x5/0x210 ? do_vfs_ioctl+0x96/0x6a0 ? __fget_light+0x2a/0x90 ? SyS_ioctl+0x79/0x90 ? do_syscall_64+0x7c/0x1e0 ? entry_SYSCALL64_slow_path+0x25/0x25 This can be reproduced readily by running a full dynticks guest(since hrtimer in guest is heavily used) w/ lapic_timer_advance disabled. If fail to program hardware preemption timer, we will fallback to hrtimer based method, however, a previous programmed preemption timer miss to cancel in this scenario which results in one hardware preemption timer and one hrtimer emulated tsc deadline timer run simultaneously. So sometimes the target guest deadline tsc is earlier than guest tsc, which leads to the computation in vmx_set_hv_timer can underflow and cause delta_tsc to be set a huge value, then host soft lockup as above. This patch fix it by cancelling the previous programmed preemption timer if there is once we failed to program the new preemption timer and fallback to hrtimer based method. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Yunhong Jiang Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 48 ++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9c20ac14caf5..22a6474af220 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1366,27 +1366,35 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); +static bool start_hv_tscdeadline(struct kvm_lapic *apic) +{ + u64 tscdeadline = apic->lapic_timer.tscdeadline; + + if (atomic_read(&apic->lapic_timer.pending) || + kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { + if (apic->lapic_timer.hv_timer_in_use) + cancel_hv_tscdeadline(apic); + } else { + apic->lapic_timer.hv_timer_in_use = true; + hrtimer_cancel(&apic->lapic_timer.timer); + + /* In case the sw timer triggered in the window */ + if (atomic_read(&apic->lapic_timer.pending)) + cancel_hv_tscdeadline(apic); + } + trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, + apic->lapic_timer.hv_timer_in_use); + return apic->lapic_timer.hv_timer_in_use; +} + void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; WARN_ON(apic->lapic_timer.hv_timer_in_use); - if (apic_lvtt_tscdeadline(apic) && - !atomic_read(&apic->lapic_timer.pending)) { - u64 tscdeadline = apic->lapic_timer.tscdeadline; - - if (!kvm_x86_ops->set_hv_timer(vcpu, tscdeadline)) { - apic->lapic_timer.hv_timer_in_use = true; - hrtimer_cancel(&apic->lapic_timer.timer); - - /* In case the sw timer triggered in the window */ - if (atomic_read(&apic->lapic_timer.pending)) - cancel_hv_tscdeadline(apic); - } - trace_kvm_hv_timer_state(vcpu->vcpu_id, - apic->lapic_timer.hv_timer_in_use); - } + if (apic_lvtt_tscdeadline(apic)) + start_hv_tscdeadline(apic); } EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); @@ -1453,15 +1461,7 @@ static void start_apic_timer(struct kvm_lapic *apic) ktime_to_ns(ktime_add_ns(now, apic->lapic_timer.period))); } else if (apic_lvtt_tscdeadline(apic)) { - /* lapic timer in tsc deadline mode */ - u64 tscdeadline = apic->lapic_timer.tscdeadline; - - if (kvm_x86_ops->set_hv_timer && - !kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { - apic->lapic_timer.hv_timer_in_use = true; - trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, - apic->lapic_timer.hv_timer_in_use); - } else + if (!(kvm_x86_ops->set_hv_timer && start_hv_tscdeadline(apic))) start_sw_tscdeadline(apic); } } From 50926d82fa271fa76d5717b546a66f7b5703ff05 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 28 May 2016 11:27:11 +0100 Subject: [PATCH 184/302] KVM: arm/arm64: The GIC is dead, long live the GIC I don't think any single piece of the KVM/ARM code ever generated as much hatred as the GIC emulation. It was written by someone who had zero experience in modeling hardware (me), was riddled with design flaws, should have been scrapped and rewritten from scratch long before having a remote chance of reaching mainline, and yet we supported it for a good three years. No need to mention the names of those who suffered, the git log is singing their praises. Thankfully, we now have a much more maintainable implementation, and we can safely put the grumpy old GIC to rest. Fellow hackers, please raise your glass in memory of the GIC: The GIC is dead, long live the GIC! Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/kvm/Kconfig | 7 - arch/arm/kvm/Makefile | 6 - arch/arm64/kvm/Kconfig | 7 - arch/arm64/kvm/Makefile | 8 - include/kvm/arm_vgic.h | 389 ++---- include/kvm/vgic/vgic.h | 246 ---- virt/kvm/arm/hyp/vgic-v2-sr.c | 15 +- virt/kvm/arm/vgic-v2-emul.c | 856 ------------ virt/kvm/arm/vgic-v2.c | 274 ---- virt/kvm/arm/vgic-v3-emul.c | 1074 --------------- virt/kvm/arm/vgic-v3.c | 279 ---- virt/kvm/arm/vgic.c | 2440 --------------------------------- virt/kvm/arm/vgic.h | 140 -- 13 files changed, 134 insertions(+), 5607 deletions(-) delete mode 100644 include/kvm/vgic/vgic.h delete mode 100644 virt/kvm/arm/vgic-v2-emul.c delete mode 100644 virt/kvm/arm/vgic-v2.c delete mode 100644 virt/kvm/arm/vgic-v3-emul.c delete mode 100644 virt/kvm/arm/vgic-v3.c delete mode 100644 virt/kvm/arm/vgic.c delete mode 100644 virt/kvm/arm/vgic.h diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 02abfff68ee5..95a000515e43 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -46,13 +46,6 @@ config KVM_ARM_HOST ---help--- Provides host support for ARM processors. -config KVM_NEW_VGIC - bool "New VGIC implementation" - depends on KVM - default y - ---help--- - uses the new VGIC implementation - source drivers/vhost/Kconfig endif # VIRTUALIZATION diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index a596b58f6d37..5e28df80dca7 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -22,7 +22,6 @@ obj-y += kvm-arm.o init.o interrupts.o obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o -ifeq ($(CONFIG_KVM_NEW_VGIC),y) obj-y += $(KVM)/arm/vgic/vgic.o obj-y += $(KVM)/arm/vgic/vgic-init.o obj-y += $(KVM)/arm/vgic/vgic-irqfd.o @@ -30,9 +29,4 @@ obj-y += $(KVM)/arm/vgic/vgic-v2.o obj-y += $(KVM)/arm/vgic/vgic-mmio.o obj-y += $(KVM)/arm/vgic/vgic-mmio-v2.o obj-y += $(KVM)/arm/vgic/vgic-kvm-device.o -else -obj-y += $(KVM)/arm/vgic.o -obj-y += $(KVM)/arm/vgic-v2.o -obj-y += $(KVM)/arm/vgic-v2-emul.o -endif obj-y += $(KVM)/arm/arch_timer.o diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index c4f26ef91e77..aa2e34e99582 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -54,13 +54,6 @@ config KVM_ARM_PMU Adds support for a virtual Performance Monitoring Unit (PMU) in virtual machines. -config KVM_NEW_VGIC - bool "New VGIC implementation" - depends on KVM - default y - ---help--- - uses the new VGIC implementation - source drivers/vhost/Kconfig endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index a7a958ca29d5..f00b2cdd0d33 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -20,7 +20,6 @@ kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o -ifeq ($(CONFIG_KVM_NEW_VGIC),y) kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-init.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-irqfd.o @@ -30,12 +29,5 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v2.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v3.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-kvm-device.o -else -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2-emul.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3.o -kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3-emul.o -endif kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o kvm-$(CONFIG_KVM_ARM_PMU) += $(KVM)/arm/pmu.o diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index da0a524802cb..12640378db98 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -1,6 +1,5 @@ /* - * Copyright (C) 2012 ARM Ltd. - * Author: Marc Zyngier + * Copyright (C) 2015, 2016 ARM Ltd. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -12,16 +11,10 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * along with this program. If not, see . */ - -#ifndef __ASM_ARM_KVM_VGIC_H -#define __ASM_ARM_KVM_VGIC_H - -#ifdef CONFIG_KVM_NEW_VGIC -#include -#else +#ifndef __KVM_ARM_VGIC_H +#define __KVM_ARM_VGIC_H #include #include @@ -29,248 +22,130 @@ #include #include #include -#include -#define VGIC_NR_IRQS_LEGACY 256 +#define VGIC_V3_MAX_CPUS 255 +#define VGIC_V2_MAX_CPUS 8 +#define VGIC_NR_IRQS_LEGACY 256 #define VGIC_NR_SGIS 16 #define VGIC_NR_PPIS 16 #define VGIC_NR_PRIVATE_IRQS (VGIC_NR_SGIS + VGIC_NR_PPIS) - -#define VGIC_V2_MAX_LRS (1 << 6) -#define VGIC_V3_MAX_LRS 16 -#define VGIC_MAX_IRQS 1024 -#define VGIC_V2_MAX_CPUS 8 -#define VGIC_V3_MAX_CPUS 255 - -#if (VGIC_NR_IRQS_LEGACY & 31) -#error "VGIC_NR_IRQS must be a multiple of 32" -#endif - -#if (VGIC_NR_IRQS_LEGACY > VGIC_MAX_IRQS) -#error "VGIC_NR_IRQS must be <= 1024" -#endif - -/* - * The GIC distributor registers describing interrupts have two parts: - * - 32 per-CPU interrupts (SGI + PPI) - * - a bunch of shared interrupts (SPI) - */ -struct vgic_bitmap { - /* - * - One UL per VCPU for private interrupts (assumes UL is at - * least 32 bits) - * - As many UL as necessary for shared interrupts. - * - * The private interrupts are accessed via the "private" - * field, one UL per vcpu (the state for vcpu n is in - * private[n]). The shared interrupts are accessed via the - * "shared" pointer (IRQn state is at bit n-32 in the bitmap). - */ - unsigned long *private; - unsigned long *shared; -}; - -struct vgic_bytemap { - /* - * - 8 u32 per VCPU for private interrupts - * - As many u32 as necessary for shared interrupts. - * - * The private interrupts are accessed via the "private" - * field, (the state for vcpu n is in private[n*8] to - * private[n*8 + 7]). The shared interrupts are accessed via - * the "shared" pointer (IRQn state is at byte (n-32)%4 of the - * shared[(n-32)/4] word). - */ - u32 *private; - u32 *shared; -}; - -struct kvm_vcpu; +#define VGIC_MAX_PRIVATE (VGIC_NR_PRIVATE_IRQS - 1) +#define VGIC_MAX_SPI 1019 +#define VGIC_MAX_RESERVED 1023 +#define VGIC_MIN_LPI 8192 enum vgic_type { VGIC_V2, /* Good ol' GICv2 */ VGIC_V3, /* New fancy GICv3 */ }; -#define LR_STATE_PENDING (1 << 0) -#define LR_STATE_ACTIVE (1 << 1) -#define LR_STATE_MASK (3 << 0) -#define LR_EOI_INT (1 << 2) -#define LR_HW (1 << 3) +/* same for all guests, as depending only on the _host's_ GIC model */ +struct vgic_global { + /* type of the host GIC */ + enum vgic_type type; -struct vgic_lr { - unsigned irq:10; - union { - unsigned hwirq:10; - unsigned source:3; - }; - unsigned state:4; -}; - -struct vgic_vmcr { - u32 ctlr; - u32 abpr; - u32 bpr; - u32 pmr; -}; - -struct vgic_ops { - struct vgic_lr (*get_lr)(const struct kvm_vcpu *, int); - void (*set_lr)(struct kvm_vcpu *, int, struct vgic_lr); - u64 (*get_elrsr)(const struct kvm_vcpu *vcpu); - u64 (*get_eisr)(const struct kvm_vcpu *vcpu); - void (*clear_eisr)(struct kvm_vcpu *vcpu); - u32 (*get_interrupt_status)(const struct kvm_vcpu *vcpu); - void (*enable_underflow)(struct kvm_vcpu *vcpu); - void (*disable_underflow)(struct kvm_vcpu *vcpu); - void (*get_vmcr)(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); - void (*set_vmcr)(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); - void (*enable)(struct kvm_vcpu *vcpu); -}; - -struct vgic_params { - /* vgic type */ - enum vgic_type type; /* Physical address of vgic virtual cpu interface */ - phys_addr_t vcpu_base; - /* Number of list registers */ - u32 nr_lr; - /* Interrupt number */ - unsigned int maint_irq; - /* Virtual control interface base address */ - void __iomem *vctrl_base; - int max_gic_vcpus; + phys_addr_t vcpu_base; + + /* virtual control interface mapping */ + void __iomem *vctrl_base; + + /* Number of implemented list registers */ + int nr_lr; + + /* Maintenance IRQ number */ + unsigned int maint_irq; + + /* maximum number of VCPUs allowed (GICv2 limits us to 8) */ + int max_gic_vcpus; + /* Only needed for the legacy KVM_CREATE_IRQCHIP */ - bool can_emulate_gicv2; + bool can_emulate_gicv2; }; -struct vgic_vm_ops { - bool (*queue_sgi)(struct kvm_vcpu *, int irq); - void (*add_sgi_source)(struct kvm_vcpu *, int irq, int source); - int (*init_model)(struct kvm *); - int (*map_resources)(struct kvm *, const struct vgic_params *); +extern struct vgic_global kvm_vgic_global_state; + +#define VGIC_V2_MAX_LRS (1 << 6) +#define VGIC_V3_MAX_LRS 16 +#define VGIC_V3_LR_INDEX(lr) (VGIC_V3_MAX_LRS - 1 - lr) + +enum vgic_irq_config { + VGIC_CONFIG_EDGE = 0, + VGIC_CONFIG_LEVEL }; +struct vgic_irq { + spinlock_t irq_lock; /* Protects the content of the struct */ + struct list_head ap_list; + + struct kvm_vcpu *vcpu; /* SGIs and PPIs: The VCPU + * SPIs and LPIs: The VCPU whose ap_list + * this is queued on. + */ + + struct kvm_vcpu *target_vcpu; /* The VCPU that this interrupt should + * be sent to, as a result of the + * targets reg (v2) or the + * affinity reg (v3). + */ + + u32 intid; /* Guest visible INTID */ + bool pending; + bool line_level; /* Level only */ + bool soft_pending; /* Level only */ + bool active; /* not used for LPIs */ + bool enabled; + bool hw; /* Tied to HW IRQ */ + u32 hwintid; /* HW INTID number */ + union { + u8 targets; /* GICv2 target VCPUs mask */ + u32 mpidr; /* GICv3 target VCPU */ + }; + u8 source; /* GICv2 SGIs only */ + u8 priority; + enum vgic_irq_config config; /* Level or edge */ +}; + +struct vgic_register_region; + struct vgic_io_device { - gpa_t addr; - int len; - const struct vgic_io_range *reg_ranges; + gpa_t base_addr; struct kvm_vcpu *redist_vcpu; + const struct vgic_register_region *regions; + int nr_regions; struct kvm_io_device dev; }; -struct irq_phys_map { - u32 virt_irq; - u32 phys_irq; -}; - -struct irq_phys_map_entry { - struct list_head entry; - struct rcu_head rcu; - struct irq_phys_map map; -}; - struct vgic_dist { - spinlock_t lock; bool in_kernel; bool ready; + bool initialized; /* vGIC model the kernel emulates for the guest (GICv2 or GICv3) */ u32 vgic_model; - int nr_cpus; - int nr_irqs; + int nr_spis; + /* TODO: Consider moving to global state */ /* Virtual control interface mapping */ void __iomem *vctrl_base; - /* Distributor and vcpu interface mapping in the guest */ - phys_addr_t vgic_dist_base; - /* GICv2 and GICv3 use different mapped register blocks */ + /* base addresses in guest physical address space: */ + gpa_t vgic_dist_base; /* distributor */ union { - phys_addr_t vgic_cpu_base; - phys_addr_t vgic_redist_base; + /* either a GICv2 CPU interface */ + gpa_t vgic_cpu_base; + /* or a number of GICv3 redistributor regions */ + gpa_t vgic_redist_base; }; - /* Distributor enabled */ - u32 enabled; + /* distributor enabled */ + bool enabled; - /* Interrupt enabled (one bit per IRQ) */ - struct vgic_bitmap irq_enabled; + struct vgic_irq *spis; - /* Level-triggered interrupt external input is asserted */ - struct vgic_bitmap irq_level; - - /* - * Interrupt state is pending on the distributor - */ - struct vgic_bitmap irq_pending; - - /* - * Tracks writes to GICD_ISPENDRn and GICD_ICPENDRn for level-triggered - * interrupts. Essentially holds the state of the flip-flop in - * Figure 4-10 on page 4-101 in ARM IHI 0048B.b. - * Once set, it is only cleared for level-triggered interrupts on - * guest ACKs (when we queue it) or writes to GICD_ICPENDRn. - */ - struct vgic_bitmap irq_soft_pend; - - /* Level-triggered interrupt queued on VCPU interface */ - struct vgic_bitmap irq_queued; - - /* Interrupt was active when unqueue from VCPU interface */ - struct vgic_bitmap irq_active; - - /* Interrupt priority. Not used yet. */ - struct vgic_bytemap irq_priority; - - /* Level/edge triggered */ - struct vgic_bitmap irq_cfg; - - /* - * Source CPU per SGI and target CPU: - * - * Each byte represent a SGI observable on a VCPU, each bit of - * this byte indicating if the corresponding VCPU has - * generated this interrupt. This is a GICv2 feature only. - * - * For VCPUn (n < 8), irq_sgi_sources[n*16] to [n*16 + 15] are - * the SGIs observable on VCPUn. - */ - u8 *irq_sgi_sources; - - /* - * Target CPU for each SPI: - * - * Array of available SPI, each byte indicating the target - * VCPU for SPI. IRQn (n >=32) is at irq_spi_cpu[n-32]. - */ - u8 *irq_spi_cpu; - - /* - * Reverse lookup of irq_spi_cpu for faster compute pending: - * - * Array of bitmaps, one per VCPU, describing if IRQn is - * routed to a particular VCPU. - */ - struct vgic_bitmap *irq_spi_target; - - /* Target MPIDR for each IRQ (needed for GICv3 IROUTERn) only */ - u32 *irq_spi_mpidr; - - /* Bitmap indicating which CPU has something pending */ - unsigned long *irq_pending_on_cpu; - - /* Bitmap indicating which CPU has active IRQs */ - unsigned long *irq_active_on_cpu; - - struct vgic_vm_ops vm_ops; struct vgic_io_device dist_iodev; struct vgic_io_device *redist_iodevs; - - /* Virtual irq to hwirq mapping */ - spinlock_t irq_phys_map_lock; - struct list_head irq_phys_map_list; }; struct vgic_v2_cpu_if { @@ -298,78 +173,74 @@ struct vgic_v3_cpu_if { }; struct vgic_cpu { - /* Pending/active/both interrupts on this VCPU */ - DECLARE_BITMAP(pending_percpu, VGIC_NR_PRIVATE_IRQS); - DECLARE_BITMAP(active_percpu, VGIC_NR_PRIVATE_IRQS); - DECLARE_BITMAP(pend_act_percpu, VGIC_NR_PRIVATE_IRQS); - - /* Pending/active/both shared interrupts, dynamically sized */ - unsigned long *pending_shared; - unsigned long *active_shared; - unsigned long *pend_act_shared; - /* CPU vif control registers for world switch */ union { struct vgic_v2_cpu_if vgic_v2; struct vgic_v3_cpu_if vgic_v3; }; - /* Protected by the distributor's irq_phys_map_lock */ - struct list_head irq_phys_map_list; + unsigned int used_lrs; + struct vgic_irq private_irqs[VGIC_NR_PRIVATE_IRQS]; - u64 live_lrs; + spinlock_t ap_list_lock; /* Protects the ap_list */ + + /* + * List of IRQs that this VCPU should consider because they are either + * Active or Pending (hence the name; AP list), or because they recently + * were one of the two and need to be migrated off this list to another + * VCPU. + */ + struct list_head ap_list_head; + + u64 live_lrs; }; -#define LR_EMPTY 0xff - -#define INT_STATUS_EOI (1 << 0) -#define INT_STATUS_UNDERFLOW (1 << 1) - -struct kvm; -struct kvm_vcpu; - int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write); -int kvm_vgic_hyp_init(void); -int kvm_vgic_map_resources(struct kvm *kvm); -int kvm_vgic_get_max_vcpus(void); void kvm_vgic_early_init(struct kvm *kvm); int kvm_vgic_create(struct kvm *kvm, u32 type); void kvm_vgic_destroy(struct kvm *kvm); void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu); void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu); -void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu); -void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu); -int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num, +int kvm_vgic_map_resources(struct kvm *kvm); +int kvm_vgic_hyp_init(void); + +int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid, bool level); -int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, - unsigned int virt_irq, bool level); -void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg); -int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); -int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int phys_irq); +int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid, + bool level); +int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq); int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq); bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq); +int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); + #define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) -#define vgic_initialized(k) (!!((k)->arch.vgic.nr_cpus)) +#define vgic_initialized(k) ((k)->arch.vgic.initialized) #define vgic_ready(k) ((k)->arch.vgic.ready) #define vgic_valid_spi(k, i) (((i) >= VGIC_NR_PRIVATE_IRQS) && \ - ((i) < (k)->arch.vgic.nr_irqs)) + ((i) < (k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) + +bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu); +void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu); +void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu); -int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info, - const struct vgic_ops **ops, - const struct vgic_params **params); #ifdef CONFIG_KVM_ARM_VGIC_V3 -int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info, - const struct vgic_ops **ops, - const struct vgic_params **params); +void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg); #else -static inline int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info, - const struct vgic_ops **ops, - const struct vgic_params **params) +static inline void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg) { - return -ENODEV; } #endif -#endif /* old VGIC include */ -#endif +/** + * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW + * + * The host's GIC naturally limits the maximum amount of VCPUs a guest + * can use. + */ +static inline int kvm_vgic_get_max_vcpus(void) +{ + return kvm_vgic_global_state.max_gic_vcpus; +} + +#endif /* __KVM_ARM_VGIC_H */ diff --git a/include/kvm/vgic/vgic.h b/include/kvm/vgic/vgic.h deleted file mode 100644 index 3fbd175265ae..000000000000 --- a/include/kvm/vgic/vgic.h +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Copyright (C) 2015, 2016 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ -#ifndef __ASM_ARM_KVM_VGIC_VGIC_H -#define __ASM_ARM_KVM_VGIC_VGIC_H - -#include -#include -#include -#include -#include -#include - -#define VGIC_V3_MAX_CPUS 255 -#define VGIC_V2_MAX_CPUS 8 -#define VGIC_NR_IRQS_LEGACY 256 -#define VGIC_NR_SGIS 16 -#define VGIC_NR_PPIS 16 -#define VGIC_NR_PRIVATE_IRQS (VGIC_NR_SGIS + VGIC_NR_PPIS) -#define VGIC_MAX_PRIVATE (VGIC_NR_PRIVATE_IRQS - 1) -#define VGIC_MAX_SPI 1019 -#define VGIC_MAX_RESERVED 1023 -#define VGIC_MIN_LPI 8192 - -enum vgic_type { - VGIC_V2, /* Good ol' GICv2 */ - VGIC_V3, /* New fancy GICv3 */ -}; - -/* same for all guests, as depending only on the _host's_ GIC model */ -struct vgic_global { - /* type of the host GIC */ - enum vgic_type type; - - /* Physical address of vgic virtual cpu interface */ - phys_addr_t vcpu_base; - - /* virtual control interface mapping */ - void __iomem *vctrl_base; - - /* Number of implemented list registers */ - int nr_lr; - - /* Maintenance IRQ number */ - unsigned int maint_irq; - - /* maximum number of VCPUs allowed (GICv2 limits us to 8) */ - int max_gic_vcpus; - - /* Only needed for the legacy KVM_CREATE_IRQCHIP */ - bool can_emulate_gicv2; -}; - -extern struct vgic_global kvm_vgic_global_state; - -#define VGIC_V2_MAX_LRS (1 << 6) -#define VGIC_V3_MAX_LRS 16 -#define VGIC_V3_LR_INDEX(lr) (VGIC_V3_MAX_LRS - 1 - lr) - -enum vgic_irq_config { - VGIC_CONFIG_EDGE = 0, - VGIC_CONFIG_LEVEL -}; - -struct vgic_irq { - spinlock_t irq_lock; /* Protects the content of the struct */ - struct list_head ap_list; - - struct kvm_vcpu *vcpu; /* SGIs and PPIs: The VCPU - * SPIs and LPIs: The VCPU whose ap_list - * this is queued on. - */ - - struct kvm_vcpu *target_vcpu; /* The VCPU that this interrupt should - * be sent to, as a result of the - * targets reg (v2) or the - * affinity reg (v3). - */ - - u32 intid; /* Guest visible INTID */ - bool pending; - bool line_level; /* Level only */ - bool soft_pending; /* Level only */ - bool active; /* not used for LPIs */ - bool enabled; - bool hw; /* Tied to HW IRQ */ - u32 hwintid; /* HW INTID number */ - union { - u8 targets; /* GICv2 target VCPUs mask */ - u32 mpidr; /* GICv3 target VCPU */ - }; - u8 source; /* GICv2 SGIs only */ - u8 priority; - enum vgic_irq_config config; /* Level or edge */ -}; - -struct vgic_register_region; - -struct vgic_io_device { - gpa_t base_addr; - struct kvm_vcpu *redist_vcpu; - const struct vgic_register_region *regions; - int nr_regions; - struct kvm_io_device dev; -}; - -struct vgic_dist { - bool in_kernel; - bool ready; - bool initialized; - - /* vGIC model the kernel emulates for the guest (GICv2 or GICv3) */ - u32 vgic_model; - - int nr_spis; - - /* TODO: Consider moving to global state */ - /* Virtual control interface mapping */ - void __iomem *vctrl_base; - - /* base addresses in guest physical address space: */ - gpa_t vgic_dist_base; /* distributor */ - union { - /* either a GICv2 CPU interface */ - gpa_t vgic_cpu_base; - /* or a number of GICv3 redistributor regions */ - gpa_t vgic_redist_base; - }; - - /* distributor enabled */ - bool enabled; - - struct vgic_irq *spis; - - struct vgic_io_device dist_iodev; - struct vgic_io_device *redist_iodevs; -}; - -struct vgic_v2_cpu_if { - u32 vgic_hcr; - u32 vgic_vmcr; - u32 vgic_misr; /* Saved only */ - u64 vgic_eisr; /* Saved only */ - u64 vgic_elrsr; /* Saved only */ - u32 vgic_apr; - u32 vgic_lr[VGIC_V2_MAX_LRS]; -}; - -struct vgic_v3_cpu_if { -#ifdef CONFIG_KVM_ARM_VGIC_V3 - u32 vgic_hcr; - u32 vgic_vmcr; - u32 vgic_sre; /* Restored only, change ignored */ - u32 vgic_misr; /* Saved only */ - u32 vgic_eisr; /* Saved only */ - u32 vgic_elrsr; /* Saved only */ - u32 vgic_ap0r[4]; - u32 vgic_ap1r[4]; - u64 vgic_lr[VGIC_V3_MAX_LRS]; -#endif -}; - -struct vgic_cpu { - /* CPU vif control registers for world switch */ - union { - struct vgic_v2_cpu_if vgic_v2; - struct vgic_v3_cpu_if vgic_v3; - }; - - unsigned int used_lrs; - struct vgic_irq private_irqs[VGIC_NR_PRIVATE_IRQS]; - - spinlock_t ap_list_lock; /* Protects the ap_list */ - - /* - * List of IRQs that this VCPU should consider because they are either - * Active or Pending (hence the name; AP list), or because they recently - * were one of the two and need to be migrated off this list to another - * VCPU. - */ - struct list_head ap_list_head; - - u64 live_lrs; -}; - -int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write); -void kvm_vgic_early_init(struct kvm *kvm); -int kvm_vgic_create(struct kvm *kvm, u32 type); -void kvm_vgic_destroy(struct kvm *kvm); -void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu); -void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu); -int kvm_vgic_map_resources(struct kvm *kvm); -int kvm_vgic_hyp_init(void); - -int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid, - bool level); -int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid, - bool level); -int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq); -int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq); -bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq); - -int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); - -#define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) -#define vgic_initialized(k) ((k)->arch.vgic.initialized) -#define vgic_ready(k) ((k)->arch.vgic.ready) -#define vgic_valid_spi(k, i) (((i) >= VGIC_NR_PRIVATE_IRQS) && \ - ((i) < (k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) - -bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu); -void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu); -void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu); - -#ifdef CONFIG_KVM_ARM_VGIC_V3 -void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg); -#else -static inline void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg) -{ -} -#endif - -/** - * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW - * - * The host's GIC naturally limits the maximum amount of VCPUs a guest - * can use. - */ -static inline int kvm_vgic_get_max_vcpus(void) -{ - return kvm_vgic_global_state.max_gic_vcpus; -} - -#endif /* __ASM_ARM_KVM_VGIC_VGIC_H */ diff --git a/virt/kvm/arm/hyp/vgic-v2-sr.c b/virt/kvm/arm/hyp/vgic-v2-sr.c index 3a3a699b7489..7cffd9338c49 100644 --- a/virt/kvm/arm/hyp/vgic-v2-sr.c +++ b/virt/kvm/arm/hyp/vgic-v2-sr.c @@ -21,18 +21,11 @@ #include -#ifdef CONFIG_KVM_NEW_VGIC -extern struct vgic_global kvm_vgic_global_state; -#define vgic_v2_params kvm_vgic_global_state -#else -extern struct vgic_params vgic_v2_params; -#endif - static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, void __iomem *base) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr; + int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr; u32 eisr0, eisr1; int i; bool expect_mi; @@ -74,7 +67,7 @@ static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr; + int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr; u32 elrsr0, elrsr1; elrsr0 = readl_relaxed(base + GICH_ELRSR0); @@ -93,7 +86,7 @@ static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base) static void __hyp_text save_lrs(struct kvm_vcpu *vcpu, void __iomem *base) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr; + int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr; int i; for (i = 0; i < nr_lr; i++) { @@ -147,7 +140,7 @@ void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu) struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; struct vgic_dist *vgic = &kvm->arch.vgic; void __iomem *base = kern_hyp_va(vgic->vctrl_base); - int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr; + int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr; int i; u64 live_lrs = 0; diff --git a/virt/kvm/arm/vgic-v2-emul.c b/virt/kvm/arm/vgic-v2-emul.c deleted file mode 100644 index 1b0bee095427..000000000000 --- a/virt/kvm/arm/vgic-v2-emul.c +++ /dev/null @@ -1,856 +0,0 @@ -/* - * Contains GICv2 specific emulation code, was in vgic.c before. - * - * Copyright (C) 2012 ARM Ltd. - * Author: Marc Zyngier - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include "vgic.h" - -#define GICC_ARCH_VERSION_V2 0x2 - -static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg); -static u8 *vgic_get_sgi_sources(struct vgic_dist *dist, int vcpu_id, int sgi) -{ - return dist->irq_sgi_sources + vcpu_id * VGIC_NR_SGIS + sgi; -} - -static bool handle_mmio_misc(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, phys_addr_t offset) -{ - u32 reg; - u32 word_offset = offset & 3; - - switch (offset & ~3) { - case 0: /* GICD_CTLR */ - reg = vcpu->kvm->arch.vgic.enabled; - vgic_reg_access(mmio, ®, word_offset, - ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); - if (mmio->is_write) { - vcpu->kvm->arch.vgic.enabled = reg & 1; - vgic_update_state(vcpu->kvm); - return true; - } - break; - - case 4: /* GICD_TYPER */ - reg = (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5; - reg |= (vcpu->kvm->arch.vgic.nr_irqs >> 5) - 1; - vgic_reg_access(mmio, ®, word_offset, - ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); - break; - - case 8: /* GICD_IIDR */ - reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); - vgic_reg_access(mmio, ®, word_offset, - ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); - break; - } - - return false; -} - -static bool handle_mmio_set_enable_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - return vgic_handle_enable_reg(vcpu->kvm, mmio, offset, - vcpu->vcpu_id, ACCESS_WRITE_SETBIT); -} - -static bool handle_mmio_clear_enable_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - return vgic_handle_enable_reg(vcpu->kvm, mmio, offset, - vcpu->vcpu_id, ACCESS_WRITE_CLEARBIT); -} - -static bool handle_mmio_set_pending_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset, - vcpu->vcpu_id); -} - -static bool handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset, - vcpu->vcpu_id); -} - -static bool handle_mmio_set_active_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset, - vcpu->vcpu_id); -} - -static bool handle_mmio_clear_active_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset, - vcpu->vcpu_id); -} - -static bool handle_mmio_priority_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - u32 *reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority, - vcpu->vcpu_id, offset); - vgic_reg_access(mmio, reg, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); - return false; -} - -#define GICD_ITARGETSR_SIZE 32 -#define GICD_CPUTARGETS_BITS 8 -#define GICD_IRQS_PER_ITARGETSR (GICD_ITARGETSR_SIZE / GICD_CPUTARGETS_BITS) -static u32 vgic_get_target_reg(struct kvm *kvm, int irq) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - int i; - u32 val = 0; - - irq -= VGIC_NR_PRIVATE_IRQS; - - for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++) - val |= 1 << (dist->irq_spi_cpu[irq + i] + i * 8); - - return val; -} - -static void vgic_set_target_reg(struct kvm *kvm, u32 val, int irq) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct kvm_vcpu *vcpu; - int i, c; - unsigned long *bmap; - u32 target; - - irq -= VGIC_NR_PRIVATE_IRQS; - - /* - * Pick the LSB in each byte. This ensures we target exactly - * one vcpu per IRQ. If the byte is null, assume we target - * CPU0. - */ - for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++) { - int shift = i * GICD_CPUTARGETS_BITS; - - target = ffs((val >> shift) & 0xffU); - target = target ? (target - 1) : 0; - dist->irq_spi_cpu[irq + i] = target; - kvm_for_each_vcpu(c, vcpu, kvm) { - bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[c]); - if (c == target) - set_bit(irq + i, bmap); - else - clear_bit(irq + i, bmap); - } - } -} - -static bool handle_mmio_target_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - u32 reg; - - /* We treat the banked interrupts targets as read-only */ - if (offset < 32) { - u32 roreg; - - roreg = 1 << vcpu->vcpu_id; - roreg |= roreg << 8; - roreg |= roreg << 16; - - vgic_reg_access(mmio, &roreg, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); - return false; - } - - reg = vgic_get_target_reg(vcpu->kvm, offset & ~3U); - vgic_reg_access(mmio, ®, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); - if (mmio->is_write) { - vgic_set_target_reg(vcpu->kvm, reg, offset & ~3U); - vgic_update_state(vcpu->kvm); - return true; - } - - return false; -} - -static bool handle_mmio_cfg_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, phys_addr_t offset) -{ - u32 *reg; - - reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg, - vcpu->vcpu_id, offset >> 1); - - return vgic_handle_cfg_reg(reg, mmio, offset); -} - -static bool handle_mmio_sgi_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, phys_addr_t offset) -{ - u32 reg; - - vgic_reg_access(mmio, ®, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_VALUE); - if (mmio->is_write) { - vgic_dispatch_sgi(vcpu, reg); - vgic_update_state(vcpu->kvm); - return true; - } - - return false; -} - -/* Handle reads of GICD_CPENDSGIRn and GICD_SPENDSGIRn */ -static bool read_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - int sgi; - int min_sgi = (offset & ~0x3); - int max_sgi = min_sgi + 3; - int vcpu_id = vcpu->vcpu_id; - u32 reg = 0; - - /* Copy source SGIs from distributor side */ - for (sgi = min_sgi; sgi <= max_sgi; sgi++) { - u8 sources = *vgic_get_sgi_sources(dist, vcpu_id, sgi); - - reg |= ((u32)sources) << (8 * (sgi - min_sgi)); - } - - mmio_data_write(mmio, ~0, reg); - return false; -} - -static bool write_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset, bool set) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - int sgi; - int min_sgi = (offset & ~0x3); - int max_sgi = min_sgi + 3; - int vcpu_id = vcpu->vcpu_id; - u32 reg; - bool updated = false; - - reg = mmio_data_read(mmio, ~0); - - /* Clear pending SGIs on the distributor */ - for (sgi = min_sgi; sgi <= max_sgi; sgi++) { - u8 mask = reg >> (8 * (sgi - min_sgi)); - u8 *src = vgic_get_sgi_sources(dist, vcpu_id, sgi); - - if (set) { - if ((*src & mask) != mask) - updated = true; - *src |= mask; - } else { - if (*src & mask) - updated = true; - *src &= ~mask; - } - } - - if (updated) - vgic_update_state(vcpu->kvm); - - return updated; -} - -static bool handle_mmio_sgi_set(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - if (!mmio->is_write) - return read_set_clear_sgi_pend_reg(vcpu, mmio, offset); - else - return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, true); -} - -static bool handle_mmio_sgi_clear(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - if (!mmio->is_write) - return read_set_clear_sgi_pend_reg(vcpu, mmio, offset); - else - return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, false); -} - -static const struct vgic_io_range vgic_dist_ranges[] = { - { - .base = GIC_DIST_SOFTINT, - .len = 4, - .handle_mmio = handle_mmio_sgi_reg, - }, - { - .base = GIC_DIST_CTRL, - .len = 12, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_misc, - }, - { - .base = GIC_DIST_IGROUP, - .len = VGIC_MAX_IRQS / 8, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_raz_wi, - }, - { - .base = GIC_DIST_ENABLE_SET, - .len = VGIC_MAX_IRQS / 8, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_set_enable_reg, - }, - { - .base = GIC_DIST_ENABLE_CLEAR, - .len = VGIC_MAX_IRQS / 8, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_clear_enable_reg, - }, - { - .base = GIC_DIST_PENDING_SET, - .len = VGIC_MAX_IRQS / 8, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_set_pending_reg, - }, - { - .base = GIC_DIST_PENDING_CLEAR, - .len = VGIC_MAX_IRQS / 8, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_clear_pending_reg, - }, - { - .base = GIC_DIST_ACTIVE_SET, - .len = VGIC_MAX_IRQS / 8, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_set_active_reg, - }, - { - .base = GIC_DIST_ACTIVE_CLEAR, - .len = VGIC_MAX_IRQS / 8, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_clear_active_reg, - }, - { - .base = GIC_DIST_PRI, - .len = VGIC_MAX_IRQS, - .bits_per_irq = 8, - .handle_mmio = handle_mmio_priority_reg, - }, - { - .base = GIC_DIST_TARGET, - .len = VGIC_MAX_IRQS, - .bits_per_irq = 8, - .handle_mmio = handle_mmio_target_reg, - }, - { - .base = GIC_DIST_CONFIG, - .len = VGIC_MAX_IRQS / 4, - .bits_per_irq = 2, - .handle_mmio = handle_mmio_cfg_reg, - }, - { - .base = GIC_DIST_SGI_PENDING_CLEAR, - .len = VGIC_NR_SGIS, - .handle_mmio = handle_mmio_sgi_clear, - }, - { - .base = GIC_DIST_SGI_PENDING_SET, - .len = VGIC_NR_SGIS, - .handle_mmio = handle_mmio_sgi_set, - }, - {} -}; - -static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg) -{ - struct kvm *kvm = vcpu->kvm; - struct vgic_dist *dist = &kvm->arch.vgic; - int nrcpus = atomic_read(&kvm->online_vcpus); - u8 target_cpus; - int sgi, mode, c, vcpu_id; - - vcpu_id = vcpu->vcpu_id; - - sgi = reg & 0xf; - target_cpus = (reg >> 16) & 0xff; - mode = (reg >> 24) & 3; - - switch (mode) { - case 0: - if (!target_cpus) - return; - break; - - case 1: - target_cpus = ((1 << nrcpus) - 1) & ~(1 << vcpu_id) & 0xff; - break; - - case 2: - target_cpus = 1 << vcpu_id; - break; - } - - kvm_for_each_vcpu(c, vcpu, kvm) { - if (target_cpus & 1) { - /* Flag the SGI as pending */ - vgic_dist_irq_set_pending(vcpu, sgi); - *vgic_get_sgi_sources(dist, c, sgi) |= 1 << vcpu_id; - kvm_debug("SGI%d from CPU%d to CPU%d\n", - sgi, vcpu_id, c); - } - - target_cpus >>= 1; - } -} - -static bool vgic_v2_queue_sgi(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - unsigned long sources; - int vcpu_id = vcpu->vcpu_id; - int c; - - sources = *vgic_get_sgi_sources(dist, vcpu_id, irq); - - for_each_set_bit(c, &sources, dist->nr_cpus) { - if (vgic_queue_irq(vcpu, c, irq)) - clear_bit(c, &sources); - } - - *vgic_get_sgi_sources(dist, vcpu_id, irq) = sources; - - /* - * If the sources bitmap has been cleared it means that we - * could queue all the SGIs onto link registers (see the - * clear_bit above), and therefore we are done with them in - * our emulated gic and can get rid of them. - */ - if (!sources) { - vgic_dist_irq_clear_pending(vcpu, irq); - vgic_cpu_irq_clear(vcpu, irq); - return true; - } - - return false; -} - -/** - * kvm_vgic_map_resources - Configure global VGIC state before running any VCPUs - * @kvm: pointer to the kvm struct - * - * Map the virtual CPU interface into the VM before running any VCPUs. We - * can't do this at creation time, because user space must first set the - * virtual CPU interface address in the guest physical address space. - */ -static int vgic_v2_map_resources(struct kvm *kvm, - const struct vgic_params *params) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - int ret = 0; - - if (!irqchip_in_kernel(kvm)) - return 0; - - mutex_lock(&kvm->lock); - - if (vgic_ready(kvm)) - goto out; - - if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) || - IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) { - kvm_err("Need to set vgic cpu and dist addresses first\n"); - ret = -ENXIO; - goto out; - } - - vgic_register_kvm_io_dev(kvm, dist->vgic_dist_base, - KVM_VGIC_V2_DIST_SIZE, - vgic_dist_ranges, -1, &dist->dist_iodev); - - /* - * Initialize the vgic if this hasn't already been done on demand by - * accessing the vgic state from userspace. - */ - ret = vgic_init(kvm); - if (ret) { - kvm_err("Unable to allocate maps\n"); - goto out_unregister; - } - - ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base, - params->vcpu_base, KVM_VGIC_V2_CPU_SIZE, - true); - if (ret) { - kvm_err("Unable to remap VGIC CPU to VCPU\n"); - goto out_unregister; - } - - dist->ready = true; - goto out; - -out_unregister: - kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dist->dist_iodev.dev); - -out: - if (ret) - kvm_vgic_destroy(kvm); - mutex_unlock(&kvm->lock); - return ret; -} - -static void vgic_v2_add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - *vgic_get_sgi_sources(dist, vcpu->vcpu_id, irq) |= 1 << source; -} - -static int vgic_v2_init_model(struct kvm *kvm) -{ - int i; - - for (i = VGIC_NR_PRIVATE_IRQS; i < kvm->arch.vgic.nr_irqs; i += 4) - vgic_set_target_reg(kvm, 0, i); - - return 0; -} - -void vgic_v2_init_emulation(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - - dist->vm_ops.queue_sgi = vgic_v2_queue_sgi; - dist->vm_ops.add_sgi_source = vgic_v2_add_sgi_source; - dist->vm_ops.init_model = vgic_v2_init_model; - dist->vm_ops.map_resources = vgic_v2_map_resources; - - kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS; -} - -static bool handle_cpu_mmio_misc(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, phys_addr_t offset) -{ - bool updated = false; - struct vgic_vmcr vmcr; - u32 *vmcr_field; - u32 reg; - - vgic_get_vmcr(vcpu, &vmcr); - - switch (offset & ~0x3) { - case GIC_CPU_CTRL: - vmcr_field = &vmcr.ctlr; - break; - case GIC_CPU_PRIMASK: - vmcr_field = &vmcr.pmr; - break; - case GIC_CPU_BINPOINT: - vmcr_field = &vmcr.bpr; - break; - case GIC_CPU_ALIAS_BINPOINT: - vmcr_field = &vmcr.abpr; - break; - default: - BUG(); - } - - if (!mmio->is_write) { - reg = *vmcr_field; - mmio_data_write(mmio, ~0, reg); - } else { - reg = mmio_data_read(mmio, ~0); - if (reg != *vmcr_field) { - *vmcr_field = reg; - vgic_set_vmcr(vcpu, &vmcr); - updated = true; - } - } - return updated; -} - -static bool handle_mmio_abpr(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, phys_addr_t offset) -{ - return handle_cpu_mmio_misc(vcpu, mmio, GIC_CPU_ALIAS_BINPOINT); -} - -static bool handle_cpu_mmio_ident(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - u32 reg; - - if (mmio->is_write) - return false; - - /* GICC_IIDR */ - reg = (PRODUCT_ID_KVM << 20) | - (GICC_ARCH_VERSION_V2 << 16) | - (IMPLEMENTER_ARM << 0); - mmio_data_write(mmio, ~0, reg); - return false; -} - -/* - * CPU Interface Register accesses - these are not accessed by the VM, but by - * user space for saving and restoring VGIC state. - */ -static const struct vgic_io_range vgic_cpu_ranges[] = { - { - .base = GIC_CPU_CTRL, - .len = 12, - .handle_mmio = handle_cpu_mmio_misc, - }, - { - .base = GIC_CPU_ALIAS_BINPOINT, - .len = 4, - .handle_mmio = handle_mmio_abpr, - }, - { - .base = GIC_CPU_ACTIVEPRIO, - .len = 16, - .handle_mmio = handle_mmio_raz_wi, - }, - { - .base = GIC_CPU_IDENT, - .len = 4, - .handle_mmio = handle_cpu_mmio_ident, - }, -}; - -static int vgic_attr_regs_access(struct kvm_device *dev, - struct kvm_device_attr *attr, - u32 *reg, bool is_write) -{ - const struct vgic_io_range *r = NULL, *ranges; - phys_addr_t offset; - int ret, cpuid, c; - struct kvm_vcpu *vcpu, *tmp_vcpu; - struct vgic_dist *vgic; - struct kvm_exit_mmio mmio; - u32 data; - - offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; - cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >> - KVM_DEV_ARM_VGIC_CPUID_SHIFT; - - mutex_lock(&dev->kvm->lock); - - ret = vgic_init(dev->kvm); - if (ret) - goto out; - - if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) { - ret = -EINVAL; - goto out; - } - - vcpu = kvm_get_vcpu(dev->kvm, cpuid); - vgic = &dev->kvm->arch.vgic; - - mmio.len = 4; - mmio.is_write = is_write; - mmio.data = &data; - if (is_write) - mmio_data_write(&mmio, ~0, *reg); - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - mmio.phys_addr = vgic->vgic_dist_base + offset; - ranges = vgic_dist_ranges; - break; - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: - mmio.phys_addr = vgic->vgic_cpu_base + offset; - ranges = vgic_cpu_ranges; - break; - default: - BUG(); - } - r = vgic_find_range(ranges, 4, offset); - - if (unlikely(!r || !r->handle_mmio)) { - ret = -ENXIO; - goto out; - } - - - spin_lock(&vgic->lock); - - /* - * Ensure that no other VCPU is running by checking the vcpu->cpu - * field. If no other VPCUs are running we can safely access the VGIC - * state, because even if another VPU is run after this point, that - * VCPU will not touch the vgic state, because it will block on - * getting the vgic->lock in kvm_vgic_sync_hwstate(). - */ - kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm) { - if (unlikely(tmp_vcpu->cpu != -1)) { - ret = -EBUSY; - goto out_vgic_unlock; - } - } - - /* - * Move all pending IRQs from the LRs on all VCPUs so the pending - * state can be properly represented in the register state accessible - * through this API. - */ - kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm) - vgic_unqueue_irqs(tmp_vcpu); - - offset -= r->base; - r->handle_mmio(vcpu, &mmio, offset); - - if (!is_write) - *reg = mmio_data_read(&mmio, ~0); - - ret = 0; -out_vgic_unlock: - spin_unlock(&vgic->lock); -out: - mutex_unlock(&dev->kvm->lock); - return ret; -} - -static int vgic_v2_create(struct kvm_device *dev, u32 type) -{ - return kvm_vgic_create(dev->kvm, type); -} - -static void vgic_v2_destroy(struct kvm_device *dev) -{ - kfree(dev); -} - -static int vgic_v2_set_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - int ret; - - ret = vgic_set_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 reg; - - if (get_user(reg, uaddr)) - return -EFAULT; - - return vgic_attr_regs_access(dev, attr, ®, true); - } - - } - - return -ENXIO; -} - -static int vgic_v2_get_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - int ret; - - ret = vgic_get_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 reg = 0; - - ret = vgic_attr_regs_access(dev, attr, ®, false); - if (ret) - return ret; - return put_user(reg, uaddr); - } - - } - - return -ENXIO; -} - -static int vgic_v2_has_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - phys_addr_t offset; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: - switch (attr->attr) { - case KVM_VGIC_V2_ADDR_TYPE_DIST: - case KVM_VGIC_V2_ADDR_TYPE_CPU: - return 0; - } - break; - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; - return vgic_has_attr_regs(vgic_dist_ranges, offset); - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: - offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; - return vgic_has_attr_regs(vgic_cpu_ranges, offset); - case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: - return 0; - case KVM_DEV_ARM_VGIC_GRP_CTRL: - switch (attr->attr) { - case KVM_DEV_ARM_VGIC_CTRL_INIT: - return 0; - } - } - return -ENXIO; -} - -struct kvm_device_ops kvm_arm_vgic_v2_ops = { - .name = "kvm-arm-vgic-v2", - .create = vgic_v2_create, - .destroy = vgic_v2_destroy, - .set_attr = vgic_v2_set_attr, - .get_attr = vgic_v2_get_attr, - .has_attr = vgic_v2_has_attr, -}; diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c deleted file mode 100644 index 334cd7a89106..000000000000 --- a/virt/kvm/arm/vgic-v2.c +++ /dev/null @@ -1,274 +0,0 @@ -/* - * Copyright (C) 2012,2013 ARM Limited, All Rights Reserved. - * Author: Marc Zyngier - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -static struct vgic_lr vgic_v2_get_lr(const struct kvm_vcpu *vcpu, int lr) -{ - struct vgic_lr lr_desc; - u32 val = vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr]; - - lr_desc.irq = val & GICH_LR_VIRTUALID; - if (lr_desc.irq <= 15) - lr_desc.source = (val >> GICH_LR_PHYSID_CPUID_SHIFT) & 0x7; - else - lr_desc.source = 0; - lr_desc.state = 0; - - if (val & GICH_LR_PENDING_BIT) - lr_desc.state |= LR_STATE_PENDING; - if (val & GICH_LR_ACTIVE_BIT) - lr_desc.state |= LR_STATE_ACTIVE; - if (val & GICH_LR_EOI) - lr_desc.state |= LR_EOI_INT; - if (val & GICH_LR_HW) { - lr_desc.state |= LR_HW; - lr_desc.hwirq = (val & GICH_LR_PHYSID_CPUID) >> GICH_LR_PHYSID_CPUID_SHIFT; - } - - return lr_desc; -} - -static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr, - struct vgic_lr lr_desc) -{ - u32 lr_val; - - lr_val = lr_desc.irq; - - if (lr_desc.state & LR_STATE_PENDING) - lr_val |= GICH_LR_PENDING_BIT; - if (lr_desc.state & LR_STATE_ACTIVE) - lr_val |= GICH_LR_ACTIVE_BIT; - if (lr_desc.state & LR_EOI_INT) - lr_val |= GICH_LR_EOI; - - if (lr_desc.state & LR_HW) { - lr_val |= GICH_LR_HW; - lr_val |= (u32)lr_desc.hwirq << GICH_LR_PHYSID_CPUID_SHIFT; - } - - if (lr_desc.irq < VGIC_NR_SGIS) - lr_val |= (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT); - - vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val; - - if (!(lr_desc.state & LR_STATE_MASK)) - vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr |= (1ULL << lr); - else - vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr &= ~(1ULL << lr); -} - -static u64 vgic_v2_get_elrsr(const struct kvm_vcpu *vcpu) -{ - return vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr; -} - -static u64 vgic_v2_get_eisr(const struct kvm_vcpu *vcpu) -{ - return vcpu->arch.vgic_cpu.vgic_v2.vgic_eisr; -} - -static void vgic_v2_clear_eisr(struct kvm_vcpu *vcpu) -{ - vcpu->arch.vgic_cpu.vgic_v2.vgic_eisr = 0; -} - -static u32 vgic_v2_get_interrupt_status(const struct kvm_vcpu *vcpu) -{ - u32 misr = vcpu->arch.vgic_cpu.vgic_v2.vgic_misr; - u32 ret = 0; - - if (misr & GICH_MISR_EOI) - ret |= INT_STATUS_EOI; - if (misr & GICH_MISR_U) - ret |= INT_STATUS_UNDERFLOW; - - return ret; -} - -static void vgic_v2_enable_underflow(struct kvm_vcpu *vcpu) -{ - vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr |= GICH_HCR_UIE; -} - -static void vgic_v2_disable_underflow(struct kvm_vcpu *vcpu) -{ - vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr &= ~GICH_HCR_UIE; -} - -static void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) -{ - u32 vmcr = vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr; - - vmcrp->ctlr = (vmcr & GICH_VMCR_CTRL_MASK) >> GICH_VMCR_CTRL_SHIFT; - vmcrp->abpr = (vmcr & GICH_VMCR_ALIAS_BINPOINT_MASK) >> GICH_VMCR_ALIAS_BINPOINT_SHIFT; - vmcrp->bpr = (vmcr & GICH_VMCR_BINPOINT_MASK) >> GICH_VMCR_BINPOINT_SHIFT; - vmcrp->pmr = (vmcr & GICH_VMCR_PRIMASK_MASK) >> GICH_VMCR_PRIMASK_SHIFT; -} - -static void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) -{ - u32 vmcr; - - vmcr = (vmcrp->ctlr << GICH_VMCR_CTRL_SHIFT) & GICH_VMCR_CTRL_MASK; - vmcr |= (vmcrp->abpr << GICH_VMCR_ALIAS_BINPOINT_SHIFT) & GICH_VMCR_ALIAS_BINPOINT_MASK; - vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) & GICH_VMCR_BINPOINT_MASK; - vmcr |= (vmcrp->pmr << GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK; - - vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = vmcr; -} - -static void vgic_v2_enable(struct kvm_vcpu *vcpu) -{ - /* - * By forcing VMCR to zero, the GIC will restore the binary - * points to their reset values. Anything else resets to zero - * anyway. - */ - vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0; - vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr = ~0; - - /* Get the show on the road... */ - vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN; -} - -static const struct vgic_ops vgic_v2_ops = { - .get_lr = vgic_v2_get_lr, - .set_lr = vgic_v2_set_lr, - .get_elrsr = vgic_v2_get_elrsr, - .get_eisr = vgic_v2_get_eisr, - .clear_eisr = vgic_v2_clear_eisr, - .get_interrupt_status = vgic_v2_get_interrupt_status, - .enable_underflow = vgic_v2_enable_underflow, - .disable_underflow = vgic_v2_disable_underflow, - .get_vmcr = vgic_v2_get_vmcr, - .set_vmcr = vgic_v2_set_vmcr, - .enable = vgic_v2_enable, -}; - -struct vgic_params __section(.hyp.text) vgic_v2_params; - -static void vgic_cpu_init_lrs(void *params) -{ - struct vgic_params *vgic = params; - int i; - - for (i = 0; i < vgic->nr_lr; i++) - writel_relaxed(0, vgic->vctrl_base + GICH_LR0 + (i * 4)); -} - -/** - * vgic_v2_probe - probe for a GICv2 compatible interrupt controller - * @gic_kvm_info: pointer to the GIC description - * @ops: address of a pointer to the GICv2 operations - * @params: address of a pointer to HW-specific parameters - * - * Returns 0 if a GICv2 has been found, with the low level operations - * in *ops and the HW parameters in *params. Returns an error code - * otherwise. - */ -int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info, - const struct vgic_ops **ops, - const struct vgic_params **params) -{ - int ret; - struct vgic_params *vgic = &vgic_v2_params; - const struct resource *vctrl_res = &gic_kvm_info->vctrl; - const struct resource *vcpu_res = &gic_kvm_info->vcpu; - - memset(vgic, 0, sizeof(*vgic)); - - if (!gic_kvm_info->maint_irq) { - kvm_err("error getting vgic maintenance irq\n"); - ret = -ENXIO; - goto out; - } - vgic->maint_irq = gic_kvm_info->maint_irq; - - if (!gic_kvm_info->vctrl.start) { - kvm_err("GICH not present in the firmware table\n"); - ret = -ENXIO; - goto out; - } - - vgic->vctrl_base = ioremap(gic_kvm_info->vctrl.start, - resource_size(&gic_kvm_info->vctrl)); - if (!vgic->vctrl_base) { - kvm_err("Cannot ioremap GICH\n"); - ret = -ENOMEM; - goto out; - } - - vgic->nr_lr = readl_relaxed(vgic->vctrl_base + GICH_VTR); - vgic->nr_lr = (vgic->nr_lr & 0x3f) + 1; - - ret = create_hyp_io_mappings(vgic->vctrl_base, - vgic->vctrl_base + resource_size(vctrl_res), - vctrl_res->start); - if (ret) { - kvm_err("Cannot map VCTRL into hyp\n"); - goto out_unmap; - } - - if (!PAGE_ALIGNED(vcpu_res->start)) { - kvm_err("GICV physical address 0x%llx not page aligned\n", - (unsigned long long)vcpu_res->start); - ret = -ENXIO; - goto out_unmap; - } - - if (!PAGE_ALIGNED(resource_size(vcpu_res))) { - kvm_err("GICV size 0x%llx not a multiple of page size 0x%lx\n", - (unsigned long long)resource_size(vcpu_res), - PAGE_SIZE); - ret = -ENXIO; - goto out_unmap; - } - - vgic->can_emulate_gicv2 = true; - kvm_register_device_ops(&kvm_arm_vgic_v2_ops, KVM_DEV_TYPE_ARM_VGIC_V2); - - vgic->vcpu_base = vcpu_res->start; - - kvm_info("GICH base=0x%llx, GICV base=0x%llx, IRQ=%d\n", - gic_kvm_info->vctrl.start, vgic->vcpu_base, vgic->maint_irq); - - vgic->type = VGIC_V2; - vgic->max_gic_vcpus = VGIC_V2_MAX_CPUS; - - on_each_cpu(vgic_cpu_init_lrs, vgic, 1); - - *ops = &vgic_v2_ops; - *params = vgic; - goto out; - -out_unmap: - iounmap(vgic->vctrl_base); -out: - return ret; -} diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c deleted file mode 100644 index e661e7fb9d91..000000000000 --- a/virt/kvm/arm/vgic-v3-emul.c +++ /dev/null @@ -1,1074 +0,0 @@ -/* - * GICv3 distributor and redistributor emulation - * - * GICv3 emulation is currently only supported on a GICv3 host (because - * we rely on the hardware's CPU interface virtualization support), but - * supports both hardware with or without the optional GICv2 backwards - * compatibility features. - * - * Limitations of the emulation: - * (RAZ/WI: read as zero, write ignore, RAO/WI: read as one, write ignore) - * - We do not support LPIs (yet). TYPER.LPIS is reported as 0 and is RAZ/WI. - * - We do not support the message based interrupts (MBIs) triggered by - * writes to the GICD_{SET,CLR}SPI_* registers. TYPER.MBIS is reported as 0. - * - We do not support the (optional) backwards compatibility feature. - * GICD_CTLR.ARE resets to 1 and is RAO/WI. If the _host_ GIC supports - * the compatiblity feature, you can use a GICv2 in the guest, though. - * - We only support a single security state. GICD_CTLR.DS is 1 and is RAO/WI. - * - Priorities are not emulated (same as the GICv2 emulation). Linux - * as a guest is fine with this, because it does not use priorities. - * - We only support Group1 interrupts. Again Linux uses only those. - * - * Copyright (C) 2014 ARM Ltd. - * Author: Andre Przywara - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#include -#include -#include -#include - -#include -#include - -#include -#include -#include - -#include "vgic.h" - -static bool handle_mmio_rao_wi(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, phys_addr_t offset) -{ - u32 reg = 0xffffffff; - - vgic_reg_access(mmio, ®, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); - - return false; -} - -static bool handle_mmio_ctlr(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, phys_addr_t offset) -{ - u32 reg = 0; - - /* - * Force ARE and DS to 1, the guest cannot change this. - * For the time being we only support Group1 interrupts. - */ - if (vcpu->kvm->arch.vgic.enabled) - reg = GICD_CTLR_ENABLE_SS_G1; - reg |= GICD_CTLR_ARE_NS | GICD_CTLR_DS; - - vgic_reg_access(mmio, ®, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); - if (mmio->is_write) { - vcpu->kvm->arch.vgic.enabled = !!(reg & GICD_CTLR_ENABLE_SS_G1); - vgic_update_state(vcpu->kvm); - return true; - } - return false; -} - -/* - * As this implementation does not provide compatibility - * with GICv2 (ARE==1), we report zero CPUs in bits [5..7]. - * Also LPIs and MBIs are not supported, so we set the respective bits to 0. - * Also we report at most 2**10=1024 interrupt IDs (to match 1024 SPIs). - */ -#define INTERRUPT_ID_BITS 10 -static bool handle_mmio_typer(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, phys_addr_t offset) -{ - u32 reg; - - reg = (min(vcpu->kvm->arch.vgic.nr_irqs, 1024) >> 5) - 1; - - reg |= (INTERRUPT_ID_BITS - 1) << 19; - - vgic_reg_access(mmio, ®, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); - - return false; -} - -static bool handle_mmio_iidr(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, phys_addr_t offset) -{ - u32 reg; - - reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); - vgic_reg_access(mmio, ®, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); - - return false; -} - -static bool handle_mmio_set_enable_reg_dist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8)) - return vgic_handle_enable_reg(vcpu->kvm, mmio, offset, - vcpu->vcpu_id, - ACCESS_WRITE_SETBIT); - - vgic_reg_access(mmio, NULL, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); - return false; -} - -static bool handle_mmio_clear_enable_reg_dist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8)) - return vgic_handle_enable_reg(vcpu->kvm, mmio, offset, - vcpu->vcpu_id, - ACCESS_WRITE_CLEARBIT); - - vgic_reg_access(mmio, NULL, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); - return false; -} - -static bool handle_mmio_set_pending_reg_dist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8)) - return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset, - vcpu->vcpu_id); - - vgic_reg_access(mmio, NULL, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); - return false; -} - -static bool handle_mmio_clear_pending_reg_dist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8)) - return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset, - vcpu->vcpu_id); - - vgic_reg_access(mmio, NULL, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); - return false; -} - -static bool handle_mmio_set_active_reg_dist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8)) - return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset, - vcpu->vcpu_id); - - vgic_reg_access(mmio, NULL, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); - return false; -} - -static bool handle_mmio_clear_active_reg_dist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8)) - return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset, - vcpu->vcpu_id); - - vgic_reg_access(mmio, NULL, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); - return false; -} - -static bool handle_mmio_priority_reg_dist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - u32 *reg; - - if (unlikely(offset < VGIC_NR_PRIVATE_IRQS)) { - vgic_reg_access(mmio, NULL, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); - return false; - } - - reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority, - vcpu->vcpu_id, offset); - vgic_reg_access(mmio, reg, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); - return false; -} - -static bool handle_mmio_cfg_reg_dist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - u32 *reg; - - if (unlikely(offset < VGIC_NR_PRIVATE_IRQS / 4)) { - vgic_reg_access(mmio, NULL, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); - return false; - } - - reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg, - vcpu->vcpu_id, offset >> 1); - - return vgic_handle_cfg_reg(reg, mmio, offset); -} - -/* - * We use a compressed version of the MPIDR (all 32 bits in one 32-bit word) - * when we store the target MPIDR written by the guest. - */ -static u32 compress_mpidr(unsigned long mpidr) -{ - u32 ret; - - ret = MPIDR_AFFINITY_LEVEL(mpidr, 0); - ret |= MPIDR_AFFINITY_LEVEL(mpidr, 1) << 8; - ret |= MPIDR_AFFINITY_LEVEL(mpidr, 2) << 16; - ret |= MPIDR_AFFINITY_LEVEL(mpidr, 3) << 24; - - return ret; -} - -static unsigned long uncompress_mpidr(u32 value) -{ - unsigned long mpidr; - - mpidr = ((value >> 0) & 0xFF) << MPIDR_LEVEL_SHIFT(0); - mpidr |= ((value >> 8) & 0xFF) << MPIDR_LEVEL_SHIFT(1); - mpidr |= ((value >> 16) & 0xFF) << MPIDR_LEVEL_SHIFT(2); - mpidr |= (u64)((value >> 24) & 0xFF) << MPIDR_LEVEL_SHIFT(3); - - return mpidr; -} - -/* - * Lookup the given MPIDR value to get the vcpu_id (if there is one) - * and store that in the irq_spi_cpu[] array. - * This limits the number of VCPUs to 255 for now, extending the data - * type (or storing kvm_vcpu pointers) should lift the limit. - * Store the original MPIDR value in an extra array to support read-as-written. - * Unallocated MPIDRs are translated to a special value and caught - * before any array accesses. - */ -static bool handle_mmio_route_reg(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - struct kvm *kvm = vcpu->kvm; - struct vgic_dist *dist = &kvm->arch.vgic; - int spi; - u32 reg; - int vcpu_id; - unsigned long *bmap, mpidr; - - /* - * The upper 32 bits of each 64 bit register are zero, - * as we don't support Aff3. - */ - if ((offset & 4)) { - vgic_reg_access(mmio, NULL, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); - return false; - } - - /* This region only covers SPIs, so no handling of private IRQs here. */ - spi = offset / 8; - - /* get the stored MPIDR for this IRQ */ - mpidr = uncompress_mpidr(dist->irq_spi_mpidr[spi]); - reg = mpidr; - - vgic_reg_access(mmio, ®, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); - - if (!mmio->is_write) - return false; - - /* - * Now clear the currently assigned vCPU from the map, making room - * for the new one to be written below - */ - vcpu = kvm_mpidr_to_vcpu(kvm, mpidr); - if (likely(vcpu)) { - vcpu_id = vcpu->vcpu_id; - bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]); - __clear_bit(spi, bmap); - } - - dist->irq_spi_mpidr[spi] = compress_mpidr(reg); - vcpu = kvm_mpidr_to_vcpu(kvm, reg & MPIDR_HWID_BITMASK); - - /* - * The spec says that non-existent MPIDR values should not be - * forwarded to any existent (v)CPU, but should be able to become - * pending anyway. We simply keep the irq_spi_target[] array empty, so - * the interrupt will never be injected. - * irq_spi_cpu[irq] gets a magic value in this case. - */ - if (likely(vcpu)) { - vcpu_id = vcpu->vcpu_id; - dist->irq_spi_cpu[spi] = vcpu_id; - bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]); - __set_bit(spi, bmap); - } else { - dist->irq_spi_cpu[spi] = VCPU_NOT_ALLOCATED; - } - - vgic_update_state(kvm); - - return true; -} - -/* - * We should be careful about promising too much when a guest reads - * this register. Don't claim to be like any hardware implementation, - * but just report the GIC as version 3 - which is what a Linux guest - * would check. - */ -static bool handle_mmio_idregs(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - u32 reg = 0; - - switch (offset + GICD_IDREGS) { - case GICD_PIDR2: - reg = 0x3b; - break; - } - - vgic_reg_access(mmio, ®, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); - - return false; -} - -static const struct vgic_io_range vgic_v3_dist_ranges[] = { - { - .base = GICD_CTLR, - .len = 0x04, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_ctlr, - }, - { - .base = GICD_TYPER, - .len = 0x04, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_typer, - }, - { - .base = GICD_IIDR, - .len = 0x04, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_iidr, - }, - { - /* this register is optional, it is RAZ/WI if not implemented */ - .base = GICD_STATUSR, - .len = 0x04, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_raz_wi, - }, - { - /* this write only register is WI when TYPER.MBIS=0 */ - .base = GICD_SETSPI_NSR, - .len = 0x04, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_raz_wi, - }, - { - /* this write only register is WI when TYPER.MBIS=0 */ - .base = GICD_CLRSPI_NSR, - .len = 0x04, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_raz_wi, - }, - { - /* this is RAZ/WI when DS=1 */ - .base = GICD_SETSPI_SR, - .len = 0x04, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_raz_wi, - }, - { - /* this is RAZ/WI when DS=1 */ - .base = GICD_CLRSPI_SR, - .len = 0x04, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_raz_wi, - }, - { - .base = GICD_IGROUPR, - .len = 0x80, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_rao_wi, - }, - { - .base = GICD_ISENABLER, - .len = 0x80, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_set_enable_reg_dist, - }, - { - .base = GICD_ICENABLER, - .len = 0x80, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_clear_enable_reg_dist, - }, - { - .base = GICD_ISPENDR, - .len = 0x80, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_set_pending_reg_dist, - }, - { - .base = GICD_ICPENDR, - .len = 0x80, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_clear_pending_reg_dist, - }, - { - .base = GICD_ISACTIVER, - .len = 0x80, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_set_active_reg_dist, - }, - { - .base = GICD_ICACTIVER, - .len = 0x80, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_clear_active_reg_dist, - }, - { - .base = GICD_IPRIORITYR, - .len = 0x400, - .bits_per_irq = 8, - .handle_mmio = handle_mmio_priority_reg_dist, - }, - { - /* TARGETSRn is RES0 when ARE=1 */ - .base = GICD_ITARGETSR, - .len = 0x400, - .bits_per_irq = 8, - .handle_mmio = handle_mmio_raz_wi, - }, - { - .base = GICD_ICFGR, - .len = 0x100, - .bits_per_irq = 2, - .handle_mmio = handle_mmio_cfg_reg_dist, - }, - { - /* this is RAZ/WI when DS=1 */ - .base = GICD_IGRPMODR, - .len = 0x80, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_raz_wi, - }, - { - /* this is RAZ/WI when DS=1 */ - .base = GICD_NSACR, - .len = 0x100, - .bits_per_irq = 2, - .handle_mmio = handle_mmio_raz_wi, - }, - { - /* this is RAZ/WI when ARE=1 */ - .base = GICD_SGIR, - .len = 0x04, - .handle_mmio = handle_mmio_raz_wi, - }, - { - /* this is RAZ/WI when ARE=1 */ - .base = GICD_CPENDSGIR, - .len = 0x10, - .handle_mmio = handle_mmio_raz_wi, - }, - { - /* this is RAZ/WI when ARE=1 */ - .base = GICD_SPENDSGIR, - .len = 0x10, - .handle_mmio = handle_mmio_raz_wi, - }, - { - .base = GICD_IROUTER + 0x100, - .len = 0x1ee0, - .bits_per_irq = 64, - .handle_mmio = handle_mmio_route_reg, - }, - { - .base = GICD_IDREGS, - .len = 0x30, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_idregs, - }, - {}, -}; - -static bool handle_mmio_ctlr_redist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - /* since we don't support LPIs, this register is zero for now */ - vgic_reg_access(mmio, NULL, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); - return false; -} - -static bool handle_mmio_typer_redist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - u32 reg; - u64 mpidr; - struct kvm_vcpu *redist_vcpu = mmio->private; - int target_vcpu_id = redist_vcpu->vcpu_id; - - /* the upper 32 bits contain the affinity value */ - if ((offset & ~3) == 4) { - mpidr = kvm_vcpu_get_mpidr_aff(redist_vcpu); - reg = compress_mpidr(mpidr); - - vgic_reg_access(mmio, ®, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); - return false; - } - - reg = redist_vcpu->vcpu_id << 8; - if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1) - reg |= GICR_TYPER_LAST; - vgic_reg_access(mmio, ®, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED); - return false; -} - -static bool handle_mmio_set_enable_reg_redist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - struct kvm_vcpu *redist_vcpu = mmio->private; - - return vgic_handle_enable_reg(vcpu->kvm, mmio, offset, - redist_vcpu->vcpu_id, - ACCESS_WRITE_SETBIT); -} - -static bool handle_mmio_clear_enable_reg_redist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - struct kvm_vcpu *redist_vcpu = mmio->private; - - return vgic_handle_enable_reg(vcpu->kvm, mmio, offset, - redist_vcpu->vcpu_id, - ACCESS_WRITE_CLEARBIT); -} - -static bool handle_mmio_set_active_reg_redist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - struct kvm_vcpu *redist_vcpu = mmio->private; - - return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset, - redist_vcpu->vcpu_id); -} - -static bool handle_mmio_clear_active_reg_redist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - struct kvm_vcpu *redist_vcpu = mmio->private; - - return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset, - redist_vcpu->vcpu_id); -} - -static bool handle_mmio_set_pending_reg_redist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - struct kvm_vcpu *redist_vcpu = mmio->private; - - return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset, - redist_vcpu->vcpu_id); -} - -static bool handle_mmio_clear_pending_reg_redist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - struct kvm_vcpu *redist_vcpu = mmio->private; - - return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset, - redist_vcpu->vcpu_id); -} - -static bool handle_mmio_priority_reg_redist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - struct kvm_vcpu *redist_vcpu = mmio->private; - u32 *reg; - - reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority, - redist_vcpu->vcpu_id, offset); - vgic_reg_access(mmio, reg, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); - return false; -} - -static bool handle_mmio_cfg_reg_redist(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - struct kvm_vcpu *redist_vcpu = mmio->private; - - u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg, - redist_vcpu->vcpu_id, offset >> 1); - - return vgic_handle_cfg_reg(reg, mmio, offset); -} - -#define SGI_base(x) ((x) + SZ_64K) - -static const struct vgic_io_range vgic_redist_ranges[] = { - { - .base = GICR_CTLR, - .len = 0x04, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_ctlr_redist, - }, - { - .base = GICR_TYPER, - .len = 0x08, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_typer_redist, - }, - { - .base = GICR_IIDR, - .len = 0x04, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_iidr, - }, - { - .base = GICR_WAKER, - .len = 0x04, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_raz_wi, - }, - { - .base = GICR_IDREGS, - .len = 0x30, - .bits_per_irq = 0, - .handle_mmio = handle_mmio_idregs, - }, - { - .base = SGI_base(GICR_IGROUPR0), - .len = 0x04, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_rao_wi, - }, - { - .base = SGI_base(GICR_ISENABLER0), - .len = 0x04, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_set_enable_reg_redist, - }, - { - .base = SGI_base(GICR_ICENABLER0), - .len = 0x04, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_clear_enable_reg_redist, - }, - { - .base = SGI_base(GICR_ISPENDR0), - .len = 0x04, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_set_pending_reg_redist, - }, - { - .base = SGI_base(GICR_ICPENDR0), - .len = 0x04, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_clear_pending_reg_redist, - }, - { - .base = SGI_base(GICR_ISACTIVER0), - .len = 0x04, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_set_active_reg_redist, - }, - { - .base = SGI_base(GICR_ICACTIVER0), - .len = 0x04, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_clear_active_reg_redist, - }, - { - .base = SGI_base(GICR_IPRIORITYR0), - .len = 0x20, - .bits_per_irq = 8, - .handle_mmio = handle_mmio_priority_reg_redist, - }, - { - .base = SGI_base(GICR_ICFGR0), - .len = 0x08, - .bits_per_irq = 2, - .handle_mmio = handle_mmio_cfg_reg_redist, - }, - { - .base = SGI_base(GICR_IGRPMODR0), - .len = 0x04, - .bits_per_irq = 1, - .handle_mmio = handle_mmio_raz_wi, - }, - { - .base = SGI_base(GICR_NSACR), - .len = 0x04, - .handle_mmio = handle_mmio_raz_wi, - }, - {}, -}; - -static bool vgic_v3_queue_sgi(struct kvm_vcpu *vcpu, int irq) -{ - if (vgic_queue_irq(vcpu, 0, irq)) { - vgic_dist_irq_clear_pending(vcpu, irq); - vgic_cpu_irq_clear(vcpu, irq); - return true; - } - - return false; -} - -static int vgic_v3_map_resources(struct kvm *kvm, - const struct vgic_params *params) -{ - int ret = 0; - struct vgic_dist *dist = &kvm->arch.vgic; - gpa_t rdbase = dist->vgic_redist_base; - struct vgic_io_device *iodevs = NULL; - int i; - - if (!irqchip_in_kernel(kvm)) - return 0; - - mutex_lock(&kvm->lock); - - if (vgic_ready(kvm)) - goto out; - - if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) || - IS_VGIC_ADDR_UNDEF(dist->vgic_redist_base)) { - kvm_err("Need to set vgic distributor addresses first\n"); - ret = -ENXIO; - goto out; - } - - /* - * For a VGICv3 we require the userland to explicitly initialize - * the VGIC before we need to use it. - */ - if (!vgic_initialized(kvm)) { - ret = -EBUSY; - goto out; - } - - ret = vgic_register_kvm_io_dev(kvm, dist->vgic_dist_base, - GIC_V3_DIST_SIZE, vgic_v3_dist_ranges, - -1, &dist->dist_iodev); - if (ret) - goto out; - - iodevs = kcalloc(dist->nr_cpus, sizeof(iodevs[0]), GFP_KERNEL); - if (!iodevs) { - ret = -ENOMEM; - goto out_unregister; - } - - for (i = 0; i < dist->nr_cpus; i++) { - ret = vgic_register_kvm_io_dev(kvm, rdbase, - SZ_128K, vgic_redist_ranges, - i, &iodevs[i]); - if (ret) - goto out_unregister; - rdbase += GIC_V3_REDIST_SIZE; - } - - dist->redist_iodevs = iodevs; - dist->ready = true; - goto out; - -out_unregister: - kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dist->dist_iodev.dev); - if (iodevs) { - for (i = 0; i < dist->nr_cpus; i++) { - if (iodevs[i].dev.ops) - kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, - &iodevs[i].dev); - } - } - -out: - if (ret) - kvm_vgic_destroy(kvm); - mutex_unlock(&kvm->lock); - return ret; -} - -static int vgic_v3_init_model(struct kvm *kvm) -{ - int i; - u32 mpidr; - struct vgic_dist *dist = &kvm->arch.vgic; - int nr_spis = dist->nr_irqs - VGIC_NR_PRIVATE_IRQS; - - dist->irq_spi_mpidr = kcalloc(nr_spis, sizeof(dist->irq_spi_mpidr[0]), - GFP_KERNEL); - - if (!dist->irq_spi_mpidr) - return -ENOMEM; - - /* Initialize the target VCPUs for each IRQ to VCPU 0 */ - mpidr = compress_mpidr(kvm_vcpu_get_mpidr_aff(kvm_get_vcpu(kvm, 0))); - for (i = VGIC_NR_PRIVATE_IRQS; i < dist->nr_irqs; i++) { - dist->irq_spi_cpu[i - VGIC_NR_PRIVATE_IRQS] = 0; - dist->irq_spi_mpidr[i - VGIC_NR_PRIVATE_IRQS] = mpidr; - vgic_bitmap_set_irq_val(dist->irq_spi_target, 0, i, 1); - } - - return 0; -} - -/* GICv3 does not keep track of SGI sources anymore. */ -static void vgic_v3_add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source) -{ -} - -void vgic_v3_init_emulation(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - - dist->vm_ops.queue_sgi = vgic_v3_queue_sgi; - dist->vm_ops.add_sgi_source = vgic_v3_add_sgi_source; - dist->vm_ops.init_model = vgic_v3_init_model; - dist->vm_ops.map_resources = vgic_v3_map_resources; - - kvm->arch.max_vcpus = KVM_MAX_VCPUS; -} - -/* - * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI - * generation register ICC_SGI1R_EL1) with a given VCPU. - * If the VCPU's MPIDR matches, return the level0 affinity, otherwise - * return -1. - */ -static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu) -{ - unsigned long affinity; - int level0; - - /* - * Split the current VCPU's MPIDR into affinity level 0 and the - * rest as this is what we have to compare against. - */ - affinity = kvm_vcpu_get_mpidr_aff(vcpu); - level0 = MPIDR_AFFINITY_LEVEL(affinity, 0); - affinity &= ~MPIDR_LEVEL_MASK; - - /* bail out if the upper three levels don't match */ - if (sgi_aff != affinity) - return -1; - - /* Is this VCPU's bit set in the mask ? */ - if (!(sgi_cpu_mask & BIT(level0))) - return -1; - - return level0; -} - -#define SGI_AFFINITY_LEVEL(reg, level) \ - ((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \ - >> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level)) - -/** - * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs - * @vcpu: The VCPU requesting a SGI - * @reg: The value written into the ICC_SGI1R_EL1 register by that VCPU - * - * With GICv3 (and ARE=1) CPUs trigger SGIs by writing to a system register. - * This will trap in sys_regs.c and call this function. - * This ICC_SGI1R_EL1 register contains the upper three affinity levels of the - * target processors as well as a bitmask of 16 Aff0 CPUs. - * If the interrupt routing mode bit is not set, we iterate over all VCPUs to - * check for matching ones. If this bit is set, we signal all, but not the - * calling VCPU. - */ -void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg) -{ - struct kvm *kvm = vcpu->kvm; - struct kvm_vcpu *c_vcpu; - struct vgic_dist *dist = &kvm->arch.vgic; - u16 target_cpus; - u64 mpidr; - int sgi, c; - int vcpu_id = vcpu->vcpu_id; - bool broadcast; - int updated = 0; - - sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT; - broadcast = reg & BIT(ICC_SGI1R_IRQ_ROUTING_MODE_BIT); - target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT; - mpidr = SGI_AFFINITY_LEVEL(reg, 3); - mpidr |= SGI_AFFINITY_LEVEL(reg, 2); - mpidr |= SGI_AFFINITY_LEVEL(reg, 1); - - /* - * We take the dist lock here, because we come from the sysregs - * code path and not from the MMIO one (which already takes the lock). - */ - spin_lock(&dist->lock); - - /* - * We iterate over all VCPUs to find the MPIDRs matching the request. - * If we have handled one CPU, we clear it's bit to detect early - * if we are already finished. This avoids iterating through all - * VCPUs when most of the times we just signal a single VCPU. - */ - kvm_for_each_vcpu(c, c_vcpu, kvm) { - - /* Exit early if we have dealt with all requested CPUs */ - if (!broadcast && target_cpus == 0) - break; - - /* Don't signal the calling VCPU */ - if (broadcast && c == vcpu_id) - continue; - - if (!broadcast) { - int level0; - - level0 = match_mpidr(mpidr, target_cpus, c_vcpu); - if (level0 == -1) - continue; - - /* remove this matching VCPU from the mask */ - target_cpus &= ~BIT(level0); - } - - /* Flag the SGI as pending */ - vgic_dist_irq_set_pending(c_vcpu, sgi); - updated = 1; - kvm_debug("SGI%d from CPU%d to CPU%d\n", sgi, vcpu_id, c); - } - if (updated) - vgic_update_state(vcpu->kvm); - spin_unlock(&dist->lock); - if (updated) - vgic_kick_vcpus(vcpu->kvm); -} - -static int vgic_v3_create(struct kvm_device *dev, u32 type) -{ - return kvm_vgic_create(dev->kvm, type); -} - -static void vgic_v3_destroy(struct kvm_device *dev) -{ - kfree(dev); -} - -static int vgic_v3_set_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - int ret; - - ret = vgic_set_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: - return -ENXIO; - } - - return -ENXIO; -} - -static int vgic_v3_get_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - int ret; - - ret = vgic_get_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: - return -ENXIO; - } - - return -ENXIO; -} - -static int vgic_v3_has_attr(struct kvm_device *dev, - struct kvm_device_attr *attr) -{ - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: - switch (attr->attr) { - case KVM_VGIC_V2_ADDR_TYPE_DIST: - case KVM_VGIC_V2_ADDR_TYPE_CPU: - return -ENXIO; - case KVM_VGIC_V3_ADDR_TYPE_DIST: - case KVM_VGIC_V3_ADDR_TYPE_REDIST: - return 0; - } - break; - case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: - return -ENXIO; - case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: - return 0; - case KVM_DEV_ARM_VGIC_GRP_CTRL: - switch (attr->attr) { - case KVM_DEV_ARM_VGIC_CTRL_INIT: - return 0; - } - } - return -ENXIO; -} - -struct kvm_device_ops kvm_arm_vgic_v3_ops = { - .name = "kvm-arm-vgic-v3", - .create = vgic_v3_create, - .destroy = vgic_v3_destroy, - .set_attr = vgic_v3_set_attr, - .get_attr = vgic_v3_get_attr, - .has_attr = vgic_v3_has_attr, -}; diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c deleted file mode 100644 index 75b02fa86436..000000000000 --- a/virt/kvm/arm/vgic-v3.c +++ /dev/null @@ -1,279 +0,0 @@ -/* - * Copyright (C) 2013 ARM Limited, All Rights Reserved. - * Author: Marc Zyngier - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include - -static u32 ich_vtr_el2; - -static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr) -{ - struct vgic_lr lr_desc; - u64 val = vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr]; - - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) - lr_desc.irq = val & ICH_LR_VIRTUAL_ID_MASK; - else - lr_desc.irq = val & GICH_LR_VIRTUALID; - - lr_desc.source = 0; - if (lr_desc.irq <= 15 && - vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) - lr_desc.source = (val >> GICH_LR_PHYSID_CPUID_SHIFT) & 0x7; - - lr_desc.state = 0; - - if (val & ICH_LR_PENDING_BIT) - lr_desc.state |= LR_STATE_PENDING; - if (val & ICH_LR_ACTIVE_BIT) - lr_desc.state |= LR_STATE_ACTIVE; - if (val & ICH_LR_EOI) - lr_desc.state |= LR_EOI_INT; - if (val & ICH_LR_HW) { - lr_desc.state |= LR_HW; - lr_desc.hwirq = (val >> ICH_LR_PHYS_ID_SHIFT) & GENMASK(9, 0); - } - - return lr_desc; -} - -static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr, - struct vgic_lr lr_desc) -{ - u64 lr_val; - - lr_val = lr_desc.irq; - - /* - * Currently all guest IRQs are Group1, as Group0 would result - * in a FIQ in the guest, which it wouldn't expect. - * Eventually we want to make this configurable, so we may revisit - * this in the future. - */ - switch (vcpu->kvm->arch.vgic.vgic_model) { - case KVM_DEV_TYPE_ARM_VGIC_V3: - lr_val |= ICH_LR_GROUP; - break; - case KVM_DEV_TYPE_ARM_VGIC_V2: - if (lr_desc.irq < VGIC_NR_SGIS) - lr_val |= (u32)lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT; - break; - default: - BUG(); - } - - if (lr_desc.state & LR_STATE_PENDING) - lr_val |= ICH_LR_PENDING_BIT; - if (lr_desc.state & LR_STATE_ACTIVE) - lr_val |= ICH_LR_ACTIVE_BIT; - if (lr_desc.state & LR_EOI_INT) - lr_val |= ICH_LR_EOI; - if (lr_desc.state & LR_HW) { - lr_val |= ICH_LR_HW; - lr_val |= ((u64)lr_desc.hwirq) << ICH_LR_PHYS_ID_SHIFT; - } - - vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = lr_val; - - if (!(lr_desc.state & LR_STATE_MASK)) - vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr |= (1U << lr); - else - vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr &= ~(1U << lr); -} - -static u64 vgic_v3_get_elrsr(const struct kvm_vcpu *vcpu) -{ - return vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr; -} - -static u64 vgic_v3_get_eisr(const struct kvm_vcpu *vcpu) -{ - return vcpu->arch.vgic_cpu.vgic_v3.vgic_eisr; -} - -static void vgic_v3_clear_eisr(struct kvm_vcpu *vcpu) -{ - vcpu->arch.vgic_cpu.vgic_v3.vgic_eisr = 0; -} - -static u32 vgic_v3_get_interrupt_status(const struct kvm_vcpu *vcpu) -{ - u32 misr = vcpu->arch.vgic_cpu.vgic_v3.vgic_misr; - u32 ret = 0; - - if (misr & ICH_MISR_EOI) - ret |= INT_STATUS_EOI; - if (misr & ICH_MISR_U) - ret |= INT_STATUS_UNDERFLOW; - - return ret; -} - -static void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) -{ - u32 vmcr = vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr; - - vmcrp->ctlr = (vmcr & ICH_VMCR_CTLR_MASK) >> ICH_VMCR_CTLR_SHIFT; - vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT; - vmcrp->bpr = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT; - vmcrp->pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; -} - -static void vgic_v3_enable_underflow(struct kvm_vcpu *vcpu) -{ - vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr |= ICH_HCR_UIE; -} - -static void vgic_v3_disable_underflow(struct kvm_vcpu *vcpu) -{ - vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr &= ~ICH_HCR_UIE; -} - -static void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) -{ - u32 vmcr; - - vmcr = (vmcrp->ctlr << ICH_VMCR_CTLR_SHIFT) & ICH_VMCR_CTLR_MASK; - vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK; - vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK; - vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK; - - vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr = vmcr; -} - -static void vgic_v3_enable(struct kvm_vcpu *vcpu) -{ - struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; - - /* - * By forcing VMCR to zero, the GIC will restore the binary - * points to their reset values. Anything else resets to zero - * anyway. - */ - vgic_v3->vgic_vmcr = 0; - vgic_v3->vgic_elrsr = ~0; - - /* - * If we are emulating a GICv3, we do it in an non-GICv2-compatible - * way, so we force SRE to 1 to demonstrate this to the guest. - * This goes with the spec allowing the value to be RAO/WI. - */ - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) - vgic_v3->vgic_sre = ICC_SRE_EL1_SRE; - else - vgic_v3->vgic_sre = 0; - - /* Get the show on the road... */ - vgic_v3->vgic_hcr = ICH_HCR_EN; -} - -static const struct vgic_ops vgic_v3_ops = { - .get_lr = vgic_v3_get_lr, - .set_lr = vgic_v3_set_lr, - .get_elrsr = vgic_v3_get_elrsr, - .get_eisr = vgic_v3_get_eisr, - .clear_eisr = vgic_v3_clear_eisr, - .get_interrupt_status = vgic_v3_get_interrupt_status, - .enable_underflow = vgic_v3_enable_underflow, - .disable_underflow = vgic_v3_disable_underflow, - .get_vmcr = vgic_v3_get_vmcr, - .set_vmcr = vgic_v3_set_vmcr, - .enable = vgic_v3_enable, -}; - -static struct vgic_params vgic_v3_params; - -static void vgic_cpu_init_lrs(void *params) -{ - kvm_call_hyp(__vgic_v3_init_lrs); -} - -/** - * vgic_v3_probe - probe for a GICv3 compatible interrupt controller - * @gic_kvm_info: pointer to the GIC description - * @ops: address of a pointer to the GICv3 operations - * @params: address of a pointer to HW-specific parameters - * - * Returns 0 if a GICv3 has been found, with the low level operations - * in *ops and the HW parameters in *params. Returns an error code - * otherwise. - */ -int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info, - const struct vgic_ops **ops, - const struct vgic_params **params) -{ - int ret = 0; - struct vgic_params *vgic = &vgic_v3_params; - const struct resource *vcpu_res = &gic_kvm_info->vcpu; - - vgic->maint_irq = gic_kvm_info->maint_irq; - - ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2); - - /* - * The ListRegs field is 5 bits, but there is a architectural - * maximum of 16 list registers. Just ignore bit 4... - */ - vgic->nr_lr = (ich_vtr_el2 & 0xf) + 1; - vgic->can_emulate_gicv2 = false; - - if (!vcpu_res->start) { - kvm_info("GICv3: no GICV resource entry\n"); - vgic->vcpu_base = 0; - } else if (!PAGE_ALIGNED(vcpu_res->start)) { - pr_warn("GICV physical address 0x%llx not page aligned\n", - (unsigned long long)vcpu_res->start); - vgic->vcpu_base = 0; - } else if (!PAGE_ALIGNED(resource_size(vcpu_res))) { - pr_warn("GICV size 0x%llx not a multiple of page size 0x%lx\n", - (unsigned long long)resource_size(vcpu_res), - PAGE_SIZE); - } else { - vgic->vcpu_base = vcpu_res->start; - vgic->can_emulate_gicv2 = true; - kvm_register_device_ops(&kvm_arm_vgic_v2_ops, - KVM_DEV_TYPE_ARM_VGIC_V2); - } - if (vgic->vcpu_base == 0) - kvm_info("disabling GICv2 emulation\n"); - kvm_register_device_ops(&kvm_arm_vgic_v3_ops, KVM_DEV_TYPE_ARM_VGIC_V3); - - vgic->vctrl_base = NULL; - vgic->type = VGIC_V3; - vgic->max_gic_vcpus = VGIC_V3_MAX_CPUS; - - kvm_info("GICV base=0x%llx, IRQ=%d\n", - vgic->vcpu_base, vgic->maint_irq); - - on_each_cpu(vgic_cpu_init_lrs, vgic, 1); - - *ops = &vgic_v3_ops; - *params = vgic; - - return ret; -} diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c deleted file mode 100644 index c3bfbb981e73..000000000000 --- a/virt/kvm/arm/vgic.c +++ /dev/null @@ -1,2440 +0,0 @@ -/* - * Copyright (C) 2012 ARM Ltd. - * Author: Marc Zyngier - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#define CREATE_TRACE_POINTS -#include "trace.h" - -/* - * How the whole thing works (courtesy of Christoffer Dall): - * - * - At any time, the dist->irq_pending_on_cpu is the oracle that knows if - * something is pending on the CPU interface. - * - Interrupts that are pending on the distributor are stored on the - * vgic.irq_pending vgic bitmap (this bitmap is updated by both user land - * ioctls and guest mmio ops, and other in-kernel peripherals such as the - * arch. timers). - * - Every time the bitmap changes, the irq_pending_on_cpu oracle is - * recalculated - * - To calculate the oracle, we need info for each cpu from - * compute_pending_for_cpu, which considers: - * - PPI: dist->irq_pending & dist->irq_enable - * - SPI: dist->irq_pending & dist->irq_enable & dist->irq_spi_target - * - irq_spi_target is a 'formatted' version of the GICD_ITARGETSRn - * registers, stored on each vcpu. We only keep one bit of - * information per interrupt, making sure that only one vcpu can - * accept the interrupt. - * - If any of the above state changes, we must recalculate the oracle. - * - The same is true when injecting an interrupt, except that we only - * consider a single interrupt at a time. The irq_spi_cpu array - * contains the target CPU for each SPI. - * - * The handling of level interrupts adds some extra complexity. We - * need to track when the interrupt has been EOIed, so we can sample - * the 'line' again. This is achieved as such: - * - * - When a level interrupt is moved onto a vcpu, the corresponding - * bit in irq_queued is set. As long as this bit is set, the line - * will be ignored for further interrupts. The interrupt is injected - * into the vcpu with the GICH_LR_EOI bit set (generate a - * maintenance interrupt on EOI). - * - When the interrupt is EOIed, the maintenance interrupt fires, - * and clears the corresponding bit in irq_queued. This allows the - * interrupt line to be sampled again. - * - Note that level-triggered interrupts can also be set to pending from - * writes to GICD_ISPENDRn and lowering the external input line does not - * cause the interrupt to become inactive in such a situation. - * Conversely, writes to GICD_ICPENDRn do not cause the interrupt to become - * inactive as long as the external input line is held high. - * - * - * Initialization rules: there are multiple stages to the vgic - * initialization, both for the distributor and the CPU interfaces. - * - * Distributor: - * - * - kvm_vgic_early_init(): initialization of static data that doesn't - * depend on any sizing information or emulation type. No allocation - * is allowed there. - * - * - vgic_init(): allocation and initialization of the generic data - * structures that depend on sizing information (number of CPUs, - * number of interrupts). Also initializes the vcpu specific data - * structures. Can be executed lazily for GICv2. - * [to be renamed to kvm_vgic_init??] - * - * CPU Interface: - * - * - kvm_vgic_cpu_early_init(): initialization of static data that - * doesn't depend on any sizing information or emulation type. No - * allocation is allowed there. - */ - -#include "vgic.h" - -static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu); -static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu); -static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr); -static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc); -static u64 vgic_get_elrsr(struct kvm_vcpu *vcpu); -static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu, - int virt_irq); -static int compute_pending_for_cpu(struct kvm_vcpu *vcpu); - -static const struct vgic_ops *vgic_ops; -static const struct vgic_params *vgic; - -static void add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source) -{ - vcpu->kvm->arch.vgic.vm_ops.add_sgi_source(vcpu, irq, source); -} - -static bool queue_sgi(struct kvm_vcpu *vcpu, int irq) -{ - return vcpu->kvm->arch.vgic.vm_ops.queue_sgi(vcpu, irq); -} - -int kvm_vgic_map_resources(struct kvm *kvm) -{ - return kvm->arch.vgic.vm_ops.map_resources(kvm, vgic); -} - -/* - * struct vgic_bitmap contains a bitmap made of unsigned longs, but - * extracts u32s out of them. - * - * This does not work on 64-bit BE systems, because the bitmap access - * will store two consecutive 32-bit words with the higher-addressed - * register's bits at the lower index and the lower-addressed register's - * bits at the higher index. - * - * Therefore, swizzle the register index when accessing the 32-bit word - * registers to access the right register's value. - */ -#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 64 -#define REG_OFFSET_SWIZZLE 1 -#else -#define REG_OFFSET_SWIZZLE 0 -#endif - -static int vgic_init_bitmap(struct vgic_bitmap *b, int nr_cpus, int nr_irqs) -{ - int nr_longs; - - nr_longs = nr_cpus + BITS_TO_LONGS(nr_irqs - VGIC_NR_PRIVATE_IRQS); - - b->private = kzalloc(sizeof(unsigned long) * nr_longs, GFP_KERNEL); - if (!b->private) - return -ENOMEM; - - b->shared = b->private + nr_cpus; - - return 0; -} - -static void vgic_free_bitmap(struct vgic_bitmap *b) -{ - kfree(b->private); - b->private = NULL; - b->shared = NULL; -} - -/* - * Call this function to convert a u64 value to an unsigned long * bitmask - * in a way that works on both 32-bit and 64-bit LE and BE platforms. - * - * Warning: Calling this function may modify *val. - */ -static unsigned long *u64_to_bitmask(u64 *val) -{ -#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 32 - *val = (*val >> 32) | (*val << 32); -#endif - return (unsigned long *)val; -} - -u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, int cpuid, u32 offset) -{ - offset >>= 2; - if (!offset) - return (u32 *)(x->private + cpuid) + REG_OFFSET_SWIZZLE; - else - return (u32 *)(x->shared) + ((offset - 1) ^ REG_OFFSET_SWIZZLE); -} - -static int vgic_bitmap_get_irq_val(struct vgic_bitmap *x, - int cpuid, int irq) -{ - if (irq < VGIC_NR_PRIVATE_IRQS) - return test_bit(irq, x->private + cpuid); - - return test_bit(irq - VGIC_NR_PRIVATE_IRQS, x->shared); -} - -void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid, - int irq, int val) -{ - unsigned long *reg; - - if (irq < VGIC_NR_PRIVATE_IRQS) { - reg = x->private + cpuid; - } else { - reg = x->shared; - irq -= VGIC_NR_PRIVATE_IRQS; - } - - if (val) - set_bit(irq, reg); - else - clear_bit(irq, reg); -} - -static unsigned long *vgic_bitmap_get_cpu_map(struct vgic_bitmap *x, int cpuid) -{ - return x->private + cpuid; -} - -unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x) -{ - return x->shared; -} - -static int vgic_init_bytemap(struct vgic_bytemap *x, int nr_cpus, int nr_irqs) -{ - int size; - - size = nr_cpus * VGIC_NR_PRIVATE_IRQS; - size += nr_irqs - VGIC_NR_PRIVATE_IRQS; - - x->private = kzalloc(size, GFP_KERNEL); - if (!x->private) - return -ENOMEM; - - x->shared = x->private + nr_cpus * VGIC_NR_PRIVATE_IRQS / sizeof(u32); - return 0; -} - -static void vgic_free_bytemap(struct vgic_bytemap *b) -{ - kfree(b->private); - b->private = NULL; - b->shared = NULL; -} - -u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset) -{ - u32 *reg; - - if (offset < VGIC_NR_PRIVATE_IRQS) { - reg = x->private; - offset += cpuid * VGIC_NR_PRIVATE_IRQS; - } else { - reg = x->shared; - offset -= VGIC_NR_PRIVATE_IRQS; - } - - return reg + (offset / sizeof(u32)); -} - -#define VGIC_CFG_LEVEL 0 -#define VGIC_CFG_EDGE 1 - -static bool vgic_irq_is_edge(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - int irq_val; - - irq_val = vgic_bitmap_get_irq_val(&dist->irq_cfg, vcpu->vcpu_id, irq); - return irq_val == VGIC_CFG_EDGE; -} - -static int vgic_irq_is_enabled(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - return vgic_bitmap_get_irq_val(&dist->irq_enabled, vcpu->vcpu_id, irq); -} - -static int vgic_irq_is_queued(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - return vgic_bitmap_get_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq); -} - -static int vgic_irq_is_active(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - return vgic_bitmap_get_irq_val(&dist->irq_active, vcpu->vcpu_id, irq); -} - -static void vgic_irq_set_queued(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - vgic_bitmap_set_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq, 1); -} - -static void vgic_irq_clear_queued(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - vgic_bitmap_set_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq, 0); -} - -static void vgic_irq_set_active(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 1); -} - -static void vgic_irq_clear_active(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 0); -} - -static int vgic_dist_irq_get_level(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - return vgic_bitmap_get_irq_val(&dist->irq_level, vcpu->vcpu_id, irq); -} - -static void vgic_dist_irq_set_level(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - vgic_bitmap_set_irq_val(&dist->irq_level, vcpu->vcpu_id, irq, 1); -} - -static void vgic_dist_irq_clear_level(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - vgic_bitmap_set_irq_val(&dist->irq_level, vcpu->vcpu_id, irq, 0); -} - -static int vgic_dist_irq_soft_pend(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - return vgic_bitmap_get_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq); -} - -static void vgic_dist_irq_clear_soft_pend(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - vgic_bitmap_set_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq, 0); - if (!vgic_dist_irq_get_level(vcpu, irq)) { - vgic_dist_irq_clear_pending(vcpu, irq); - if (!compute_pending_for_cpu(vcpu)) - clear_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu); - } -} - -static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - return vgic_bitmap_get_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq); -} - -void vgic_dist_irq_set_pending(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - vgic_bitmap_set_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq, 1); -} - -void vgic_dist_irq_clear_pending(struct kvm_vcpu *vcpu, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - vgic_bitmap_set_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq, 0); -} - -static void vgic_cpu_irq_set(struct kvm_vcpu *vcpu, int irq) -{ - if (irq < VGIC_NR_PRIVATE_IRQS) - set_bit(irq, vcpu->arch.vgic_cpu.pending_percpu); - else - set_bit(irq - VGIC_NR_PRIVATE_IRQS, - vcpu->arch.vgic_cpu.pending_shared); -} - -void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq) -{ - if (irq < VGIC_NR_PRIVATE_IRQS) - clear_bit(irq, vcpu->arch.vgic_cpu.pending_percpu); - else - clear_bit(irq - VGIC_NR_PRIVATE_IRQS, - vcpu->arch.vgic_cpu.pending_shared); -} - -static bool vgic_can_sample_irq(struct kvm_vcpu *vcpu, int irq) -{ - return !vgic_irq_is_queued(vcpu, irq); -} - -/** - * vgic_reg_access - access vgic register - * @mmio: pointer to the data describing the mmio access - * @reg: pointer to the virtual backing of vgic distributor data - * @offset: least significant 2 bits used for word offset - * @mode: ACCESS_ mode (see defines above) - * - * Helper to make vgic register access easier using one of the access - * modes defined for vgic register access - * (read,raz,write-ignored,setbit,clearbit,write) - */ -void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg, - phys_addr_t offset, int mode) -{ - int word_offset = (offset & 3) * 8; - u32 mask = (1UL << (mmio->len * 8)) - 1; - u32 regval; - - /* - * Any alignment fault should have been delivered to the guest - * directly (ARM ARM B3.12.7 "Prioritization of aborts"). - */ - - if (reg) { - regval = *reg; - } else { - BUG_ON(mode != (ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED)); - regval = 0; - } - - if (mmio->is_write) { - u32 data = mmio_data_read(mmio, mask) << word_offset; - switch (ACCESS_WRITE_MASK(mode)) { - case ACCESS_WRITE_IGNORED: - return; - - case ACCESS_WRITE_SETBIT: - regval |= data; - break; - - case ACCESS_WRITE_CLEARBIT: - regval &= ~data; - break; - - case ACCESS_WRITE_VALUE: - regval = (regval & ~(mask << word_offset)) | data; - break; - } - *reg = regval; - } else { - switch (ACCESS_READ_MASK(mode)) { - case ACCESS_READ_RAZ: - regval = 0; - /* fall through */ - - case ACCESS_READ_VALUE: - mmio_data_write(mmio, mask, regval >> word_offset); - } - } -} - -bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - vgic_reg_access(mmio, NULL, offset, - ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED); - return false; -} - -bool vgic_handle_enable_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio, - phys_addr_t offset, int vcpu_id, int access) -{ - u32 *reg; - int mode = ACCESS_READ_VALUE | access; - struct kvm_vcpu *target_vcpu = kvm_get_vcpu(kvm, vcpu_id); - - reg = vgic_bitmap_get_reg(&kvm->arch.vgic.irq_enabled, vcpu_id, offset); - vgic_reg_access(mmio, reg, offset, mode); - if (mmio->is_write) { - if (access & ACCESS_WRITE_CLEARBIT) { - if (offset < 4) /* Force SGI enabled */ - *reg |= 0xffff; - vgic_retire_disabled_irqs(target_vcpu); - } - vgic_update_state(kvm); - return true; - } - - return false; -} - -bool vgic_handle_set_pending_reg(struct kvm *kvm, - struct kvm_exit_mmio *mmio, - phys_addr_t offset, int vcpu_id) -{ - u32 *reg, orig; - u32 level_mask; - int mode = ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT; - struct vgic_dist *dist = &kvm->arch.vgic; - - reg = vgic_bitmap_get_reg(&dist->irq_cfg, vcpu_id, offset); - level_mask = (~(*reg)); - - /* Mark both level and edge triggered irqs as pending */ - reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset); - orig = *reg; - vgic_reg_access(mmio, reg, offset, mode); - - if (mmio->is_write) { - /* Set the soft-pending flag only for level-triggered irqs */ - reg = vgic_bitmap_get_reg(&dist->irq_soft_pend, - vcpu_id, offset); - vgic_reg_access(mmio, reg, offset, mode); - *reg &= level_mask; - - /* Ignore writes to SGIs */ - if (offset < 2) { - *reg &= ~0xffff; - *reg |= orig & 0xffff; - } - - vgic_update_state(kvm); - return true; - } - - return false; -} - -bool vgic_handle_clear_pending_reg(struct kvm *kvm, - struct kvm_exit_mmio *mmio, - phys_addr_t offset, int vcpu_id) -{ - u32 *level_active; - u32 *reg, orig; - int mode = ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT; - struct vgic_dist *dist = &kvm->arch.vgic; - - reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset); - orig = *reg; - vgic_reg_access(mmio, reg, offset, mode); - if (mmio->is_write) { - /* Re-set level triggered level-active interrupts */ - level_active = vgic_bitmap_get_reg(&dist->irq_level, - vcpu_id, offset); - reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset); - *reg |= *level_active; - - /* Ignore writes to SGIs */ - if (offset < 2) { - *reg &= ~0xffff; - *reg |= orig & 0xffff; - } - - /* Clear soft-pending flags */ - reg = vgic_bitmap_get_reg(&dist->irq_soft_pend, - vcpu_id, offset); - vgic_reg_access(mmio, reg, offset, mode); - - vgic_update_state(kvm); - return true; - } - return false; -} - -bool vgic_handle_set_active_reg(struct kvm *kvm, - struct kvm_exit_mmio *mmio, - phys_addr_t offset, int vcpu_id) -{ - u32 *reg; - struct vgic_dist *dist = &kvm->arch.vgic; - - reg = vgic_bitmap_get_reg(&dist->irq_active, vcpu_id, offset); - vgic_reg_access(mmio, reg, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT); - - if (mmio->is_write) { - vgic_update_state(kvm); - return true; - } - - return false; -} - -bool vgic_handle_clear_active_reg(struct kvm *kvm, - struct kvm_exit_mmio *mmio, - phys_addr_t offset, int vcpu_id) -{ - u32 *reg; - struct vgic_dist *dist = &kvm->arch.vgic; - - reg = vgic_bitmap_get_reg(&dist->irq_active, vcpu_id, offset); - vgic_reg_access(mmio, reg, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT); - - if (mmio->is_write) { - vgic_update_state(kvm); - return true; - } - - return false; -} - -static u32 vgic_cfg_expand(u16 val) -{ - u32 res = 0; - int i; - - /* - * Turn a 16bit value like abcd...mnop into a 32bit word - * a0b0c0d0...m0n0o0p0, which is what the HW cfg register is. - */ - for (i = 0; i < 16; i++) - res |= ((val >> i) & VGIC_CFG_EDGE) << (2 * i + 1); - - return res; -} - -static u16 vgic_cfg_compress(u32 val) -{ - u16 res = 0; - int i; - - /* - * Turn a 32bit word a0b0c0d0...m0n0o0p0 into 16bit value like - * abcd...mnop which is what we really care about. - */ - for (i = 0; i < 16; i++) - res |= ((val >> (i * 2 + 1)) & VGIC_CFG_EDGE) << i; - - return res; -} - -/* - * The distributor uses 2 bits per IRQ for the CFG register, but the - * LSB is always 0. As such, we only keep the upper bit, and use the - * two above functions to compress/expand the bits - */ -bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio, - phys_addr_t offset) -{ - u32 val; - - if (offset & 4) - val = *reg >> 16; - else - val = *reg & 0xffff; - - val = vgic_cfg_expand(val); - vgic_reg_access(mmio, &val, offset, - ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); - if (mmio->is_write) { - /* Ignore writes to read-only SGI and PPI bits */ - if (offset < 8) - return false; - - val = vgic_cfg_compress(val); - if (offset & 4) { - *reg &= 0xffff; - *reg |= val << 16; - } else { - *reg &= 0xffff << 16; - *reg |= val; - } - } - - return false; -} - -/** - * vgic_unqueue_irqs - move pending/active IRQs from LRs to the distributor - * @vgic_cpu: Pointer to the vgic_cpu struct holding the LRs - * - * Move any IRQs that have already been assigned to LRs back to the - * emulated distributor state so that the complete emulated state can be read - * from the main emulation structures without investigating the LRs. - */ -void vgic_unqueue_irqs(struct kvm_vcpu *vcpu) -{ - u64 elrsr = vgic_get_elrsr(vcpu); - unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr); - int i; - - for_each_clear_bit(i, elrsr_ptr, vgic->nr_lr) { - struct vgic_lr lr = vgic_get_lr(vcpu, i); - - /* - * There are three options for the state bits: - * - * 01: pending - * 10: active - * 11: pending and active - */ - BUG_ON(!(lr.state & LR_STATE_MASK)); - - /* Reestablish SGI source for pending and active IRQs */ - if (lr.irq < VGIC_NR_SGIS) - add_sgi_source(vcpu, lr.irq, lr.source); - - /* - * If the LR holds an active (10) or a pending and active (11) - * interrupt then move the active state to the - * distributor tracking bit. - */ - if (lr.state & LR_STATE_ACTIVE) - vgic_irq_set_active(vcpu, lr.irq); - - /* - * Reestablish the pending state on the distributor and the - * CPU interface and mark the LR as free for other use. - */ - vgic_retire_lr(i, vcpu); - - /* Finally update the VGIC state. */ - vgic_update_state(vcpu->kvm); - } -} - -const -struct vgic_io_range *vgic_find_range(const struct vgic_io_range *ranges, - int len, gpa_t offset) -{ - while (ranges->len) { - if (offset >= ranges->base && - (offset + len) <= (ranges->base + ranges->len)) - return ranges; - ranges++; - } - - return NULL; -} - -static bool vgic_validate_access(const struct vgic_dist *dist, - const struct vgic_io_range *range, - unsigned long offset) -{ - int irq; - - if (!range->bits_per_irq) - return true; /* Not an irq-based access */ - - irq = offset * 8 / range->bits_per_irq; - if (irq >= dist->nr_irqs) - return false; - - return true; -} - -/* - * Call the respective handler function for the given range. - * We split up any 64 bit accesses into two consecutive 32 bit - * handler calls and merge the result afterwards. - * We do this in a little endian fashion regardless of the host's - * or guest's endianness, because the GIC is always LE and the rest of - * the code (vgic_reg_access) also puts it in a LE fashion already. - * At this point we have already identified the handle function, so - * range points to that one entry and offset is relative to this. - */ -static bool call_range_handler(struct kvm_vcpu *vcpu, - struct kvm_exit_mmio *mmio, - unsigned long offset, - const struct vgic_io_range *range) -{ - struct kvm_exit_mmio mmio32; - bool ret; - - if (likely(mmio->len <= 4)) - return range->handle_mmio(vcpu, mmio, offset); - - /* - * Any access bigger than 4 bytes (that we currently handle in KVM) - * is actually 8 bytes long, caused by a 64-bit access - */ - - mmio32.len = 4; - mmio32.is_write = mmio->is_write; - mmio32.private = mmio->private; - - mmio32.phys_addr = mmio->phys_addr + 4; - mmio32.data = &((u32 *)mmio->data)[1]; - ret = range->handle_mmio(vcpu, &mmio32, offset + 4); - - mmio32.phys_addr = mmio->phys_addr; - mmio32.data = &((u32 *)mmio->data)[0]; - ret |= range->handle_mmio(vcpu, &mmio32, offset); - - return ret; -} - -/** - * vgic_handle_mmio_access - handle an in-kernel MMIO access - * This is called by the read/write KVM IO device wrappers below. - * @vcpu: pointer to the vcpu performing the access - * @this: pointer to the KVM IO device in charge - * @addr: guest physical address of the access - * @len: size of the access - * @val: pointer to the data region - * @is_write: read or write access - * - * returns true if the MMIO access could be performed - */ -static int vgic_handle_mmio_access(struct kvm_vcpu *vcpu, - struct kvm_io_device *this, gpa_t addr, - int len, void *val, bool is_write) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - struct vgic_io_device *iodev = container_of(this, - struct vgic_io_device, dev); - const struct vgic_io_range *range; - struct kvm_exit_mmio mmio; - bool updated_state; - gpa_t offset; - - offset = addr - iodev->addr; - range = vgic_find_range(iodev->reg_ranges, len, offset); - if (unlikely(!range || !range->handle_mmio)) { - pr_warn("Unhandled access %d %08llx %d\n", is_write, addr, len); - return -ENXIO; - } - - mmio.phys_addr = addr; - mmio.len = len; - mmio.is_write = is_write; - mmio.data = val; - mmio.private = iodev->redist_vcpu; - - spin_lock(&dist->lock); - offset -= range->base; - if (vgic_validate_access(dist, range, offset)) { - updated_state = call_range_handler(vcpu, &mmio, offset, range); - } else { - if (!is_write) - memset(val, 0, len); - updated_state = false; - } - spin_unlock(&dist->lock); - - if (updated_state) - vgic_kick_vcpus(vcpu->kvm); - - return 0; -} - -static int vgic_handle_mmio_read(struct kvm_vcpu *vcpu, - struct kvm_io_device *this, - gpa_t addr, int len, void *val) -{ - return vgic_handle_mmio_access(vcpu, this, addr, len, val, false); -} - -static int vgic_handle_mmio_write(struct kvm_vcpu *vcpu, - struct kvm_io_device *this, - gpa_t addr, int len, const void *val) -{ - return vgic_handle_mmio_access(vcpu, this, addr, len, (void *)val, - true); -} - -static struct kvm_io_device_ops vgic_io_ops = { - .read = vgic_handle_mmio_read, - .write = vgic_handle_mmio_write, -}; - -/** - * vgic_register_kvm_io_dev - register VGIC register frame on the KVM I/O bus - * @kvm: The VM structure pointer - * @base: The (guest) base address for the register frame - * @len: Length of the register frame window - * @ranges: Describing the handler functions for each register - * @redist_vcpu_id: The VCPU ID to pass on to the handlers on call - * @iodev: Points to memory to be passed on to the handler - * - * @iodev stores the parameters of this function to be usable by the handler - * respectively the dispatcher function (since the KVM I/O bus framework lacks - * an opaque parameter). Initialization is done in this function, but the - * reference should be valid and unique for the whole VGIC lifetime. - * If the register frame is not mapped for a specific VCPU, pass -1 to - * @redist_vcpu_id. - */ -int vgic_register_kvm_io_dev(struct kvm *kvm, gpa_t base, int len, - const struct vgic_io_range *ranges, - int redist_vcpu_id, - struct vgic_io_device *iodev) -{ - struct kvm_vcpu *vcpu = NULL; - int ret; - - if (redist_vcpu_id >= 0) - vcpu = kvm_get_vcpu(kvm, redist_vcpu_id); - - iodev->addr = base; - iodev->len = len; - iodev->reg_ranges = ranges; - iodev->redist_vcpu = vcpu; - - kvm_iodevice_init(&iodev->dev, &vgic_io_ops); - - mutex_lock(&kvm->slots_lock); - - ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, base, len, - &iodev->dev); - mutex_unlock(&kvm->slots_lock); - - /* Mark the iodev as invalid if registration fails. */ - if (ret) - iodev->dev.ops = NULL; - - return ret; -} - -static int vgic_nr_shared_irqs(struct vgic_dist *dist) -{ - return dist->nr_irqs - VGIC_NR_PRIVATE_IRQS; -} - -static int compute_active_for_cpu(struct kvm_vcpu *vcpu) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - unsigned long *active, *enabled, *act_percpu, *act_shared; - unsigned long active_private, active_shared; - int nr_shared = vgic_nr_shared_irqs(dist); - int vcpu_id; - - vcpu_id = vcpu->vcpu_id; - act_percpu = vcpu->arch.vgic_cpu.active_percpu; - act_shared = vcpu->arch.vgic_cpu.active_shared; - - active = vgic_bitmap_get_cpu_map(&dist->irq_active, vcpu_id); - enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id); - bitmap_and(act_percpu, active, enabled, VGIC_NR_PRIVATE_IRQS); - - active = vgic_bitmap_get_shared_map(&dist->irq_active); - enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled); - bitmap_and(act_shared, active, enabled, nr_shared); - bitmap_and(act_shared, act_shared, - vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]), - nr_shared); - - active_private = find_first_bit(act_percpu, VGIC_NR_PRIVATE_IRQS); - active_shared = find_first_bit(act_shared, nr_shared); - - return (active_private < VGIC_NR_PRIVATE_IRQS || - active_shared < nr_shared); -} - -static int compute_pending_for_cpu(struct kvm_vcpu *vcpu) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - unsigned long *pending, *enabled, *pend_percpu, *pend_shared; - unsigned long pending_private, pending_shared; - int nr_shared = vgic_nr_shared_irqs(dist); - int vcpu_id; - - vcpu_id = vcpu->vcpu_id; - pend_percpu = vcpu->arch.vgic_cpu.pending_percpu; - pend_shared = vcpu->arch.vgic_cpu.pending_shared; - - if (!dist->enabled) { - bitmap_zero(pend_percpu, VGIC_NR_PRIVATE_IRQS); - bitmap_zero(pend_shared, nr_shared); - return 0; - } - - pending = vgic_bitmap_get_cpu_map(&dist->irq_pending, vcpu_id); - enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id); - bitmap_and(pend_percpu, pending, enabled, VGIC_NR_PRIVATE_IRQS); - - pending = vgic_bitmap_get_shared_map(&dist->irq_pending); - enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled); - bitmap_and(pend_shared, pending, enabled, nr_shared); - bitmap_and(pend_shared, pend_shared, - vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]), - nr_shared); - - pending_private = find_first_bit(pend_percpu, VGIC_NR_PRIVATE_IRQS); - pending_shared = find_first_bit(pend_shared, nr_shared); - return (pending_private < VGIC_NR_PRIVATE_IRQS || - pending_shared < vgic_nr_shared_irqs(dist)); -} - -/* - * Update the interrupt state and determine which CPUs have pending - * or active interrupts. Must be called with distributor lock held. - */ -void vgic_update_state(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct kvm_vcpu *vcpu; - int c; - - kvm_for_each_vcpu(c, vcpu, kvm) { - if (compute_pending_for_cpu(vcpu)) - set_bit(c, dist->irq_pending_on_cpu); - - if (compute_active_for_cpu(vcpu)) - set_bit(c, dist->irq_active_on_cpu); - else - clear_bit(c, dist->irq_active_on_cpu); - } -} - -static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr) -{ - return vgic_ops->get_lr(vcpu, lr); -} - -static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, - struct vgic_lr vlr) -{ - vgic_ops->set_lr(vcpu, lr, vlr); -} - -static inline u64 vgic_get_elrsr(struct kvm_vcpu *vcpu) -{ - return vgic_ops->get_elrsr(vcpu); -} - -static inline u64 vgic_get_eisr(struct kvm_vcpu *vcpu) -{ - return vgic_ops->get_eisr(vcpu); -} - -static inline void vgic_clear_eisr(struct kvm_vcpu *vcpu) -{ - vgic_ops->clear_eisr(vcpu); -} - -static inline u32 vgic_get_interrupt_status(struct kvm_vcpu *vcpu) -{ - return vgic_ops->get_interrupt_status(vcpu); -} - -static inline void vgic_enable_underflow(struct kvm_vcpu *vcpu) -{ - vgic_ops->enable_underflow(vcpu); -} - -static inline void vgic_disable_underflow(struct kvm_vcpu *vcpu) -{ - vgic_ops->disable_underflow(vcpu); -} - -void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) -{ - vgic_ops->get_vmcr(vcpu, vmcr); -} - -void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr) -{ - vgic_ops->set_vmcr(vcpu, vmcr); -} - -static inline void vgic_enable(struct kvm_vcpu *vcpu) -{ - vgic_ops->enable(vcpu); -} - -static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu) -{ - struct vgic_lr vlr = vgic_get_lr(vcpu, lr_nr); - - vgic_irq_clear_queued(vcpu, vlr.irq); - - /* - * We must transfer the pending state back to the distributor before - * retiring the LR, otherwise we may loose edge-triggered interrupts. - */ - if (vlr.state & LR_STATE_PENDING) { - vgic_dist_irq_set_pending(vcpu, vlr.irq); - vlr.hwirq = 0; - } - - vlr.state = 0; - vgic_set_lr(vcpu, lr_nr, vlr); -} - -static bool dist_active_irq(struct kvm_vcpu *vcpu) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - return test_bit(vcpu->vcpu_id, dist->irq_active_on_cpu); -} - -bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq) -{ - int i; - - for (i = 0; i < vgic->nr_lr; i++) { - struct vgic_lr vlr = vgic_get_lr(vcpu, i); - - if (vlr.irq == virt_irq && vlr.state & LR_STATE_ACTIVE) - return true; - } - - return vgic_irq_is_active(vcpu, virt_irq); -} - -/* - * An interrupt may have been disabled after being made pending on the - * CPU interface (the classic case is a timer running while we're - * rebooting the guest - the interrupt would kick as soon as the CPU - * interface gets enabled, with deadly consequences). - * - * The solution is to examine already active LRs, and check the - * interrupt is still enabled. If not, just retire it. - */ -static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu) -{ - u64 elrsr = vgic_get_elrsr(vcpu); - unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr); - int lr; - - for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) { - struct vgic_lr vlr = vgic_get_lr(vcpu, lr); - - if (!vgic_irq_is_enabled(vcpu, vlr.irq)) - vgic_retire_lr(lr, vcpu); - } -} - -static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq, - int lr_nr, struct vgic_lr vlr) -{ - if (vgic_irq_is_active(vcpu, irq)) { - vlr.state |= LR_STATE_ACTIVE; - kvm_debug("Set active, clear distributor: 0x%x\n", vlr.state); - vgic_irq_clear_active(vcpu, irq); - vgic_update_state(vcpu->kvm); - } else { - WARN_ON(!vgic_dist_irq_is_pending(vcpu, irq)); - vlr.state |= LR_STATE_PENDING; - kvm_debug("Set pending: 0x%x\n", vlr.state); - } - - if (!vgic_irq_is_edge(vcpu, irq)) - vlr.state |= LR_EOI_INT; - - if (vlr.irq >= VGIC_NR_SGIS) { - struct irq_phys_map *map; - map = vgic_irq_map_search(vcpu, irq); - - if (map) { - vlr.hwirq = map->phys_irq; - vlr.state |= LR_HW; - vlr.state &= ~LR_EOI_INT; - - /* - * Make sure we're not going to sample this - * again, as a HW-backed interrupt cannot be - * in the PENDING_ACTIVE stage. - */ - vgic_irq_set_queued(vcpu, irq); - } - } - - vgic_set_lr(vcpu, lr_nr, vlr); -} - -/* - * Queue an interrupt to a CPU virtual interface. Return true on success, - * or false if it wasn't possible to queue it. - * sgi_source must be zero for any non-SGI interrupts. - */ -bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - u64 elrsr = vgic_get_elrsr(vcpu); - unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr); - struct vgic_lr vlr; - int lr; - - /* Sanitize the input... */ - BUG_ON(sgi_source_id & ~7); - BUG_ON(sgi_source_id && irq >= VGIC_NR_SGIS); - BUG_ON(irq >= dist->nr_irqs); - - kvm_debug("Queue IRQ%d\n", irq); - - /* Do we have an active interrupt for the same CPUID? */ - for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) { - vlr = vgic_get_lr(vcpu, lr); - if (vlr.irq == irq && vlr.source == sgi_source_id) { - kvm_debug("LR%d piggyback for IRQ%d\n", lr, vlr.irq); - vgic_queue_irq_to_lr(vcpu, irq, lr, vlr); - return true; - } - } - - /* Try to use another LR for this interrupt */ - lr = find_first_bit(elrsr_ptr, vgic->nr_lr); - if (lr >= vgic->nr_lr) - return false; - - kvm_debug("LR%d allocated for IRQ%d %x\n", lr, irq, sgi_source_id); - - vlr.irq = irq; - vlr.source = sgi_source_id; - vlr.state = 0; - vgic_queue_irq_to_lr(vcpu, irq, lr, vlr); - - return true; -} - -static bool vgic_queue_hwirq(struct kvm_vcpu *vcpu, int irq) -{ - if (!vgic_can_sample_irq(vcpu, irq)) - return true; /* level interrupt, already queued */ - - if (vgic_queue_irq(vcpu, 0, irq)) { - if (vgic_irq_is_edge(vcpu, irq)) { - vgic_dist_irq_clear_pending(vcpu, irq); - vgic_cpu_irq_clear(vcpu, irq); - } else { - vgic_irq_set_queued(vcpu, irq); - } - - return true; - } - - return false; -} - -/* - * Fill the list registers with pending interrupts before running the - * guest. - */ -static void __kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - unsigned long *pa_percpu, *pa_shared; - int i, vcpu_id; - int overflow = 0; - int nr_shared = vgic_nr_shared_irqs(dist); - - vcpu_id = vcpu->vcpu_id; - - pa_percpu = vcpu->arch.vgic_cpu.pend_act_percpu; - pa_shared = vcpu->arch.vgic_cpu.pend_act_shared; - - bitmap_or(pa_percpu, vgic_cpu->pending_percpu, vgic_cpu->active_percpu, - VGIC_NR_PRIVATE_IRQS); - bitmap_or(pa_shared, vgic_cpu->pending_shared, vgic_cpu->active_shared, - nr_shared); - /* - * We may not have any pending interrupt, or the interrupts - * may have been serviced from another vcpu. In all cases, - * move along. - */ - if (!kvm_vgic_vcpu_pending_irq(vcpu) && !dist_active_irq(vcpu)) - goto epilog; - - /* SGIs */ - for_each_set_bit(i, pa_percpu, VGIC_NR_SGIS) { - if (!queue_sgi(vcpu, i)) - overflow = 1; - } - - /* PPIs */ - for_each_set_bit_from(i, pa_percpu, VGIC_NR_PRIVATE_IRQS) { - if (!vgic_queue_hwirq(vcpu, i)) - overflow = 1; - } - - /* SPIs */ - for_each_set_bit(i, pa_shared, nr_shared) { - if (!vgic_queue_hwirq(vcpu, i + VGIC_NR_PRIVATE_IRQS)) - overflow = 1; - } - - - - -epilog: - if (overflow) { - vgic_enable_underflow(vcpu); - } else { - vgic_disable_underflow(vcpu); - /* - * We're about to run this VCPU, and we've consumed - * everything the distributor had in store for - * us. Claim we don't have anything pending. We'll - * adjust that if needed while exiting. - */ - clear_bit(vcpu_id, dist->irq_pending_on_cpu); - } -} - -static int process_queued_irq(struct kvm_vcpu *vcpu, - int lr, struct vgic_lr vlr) -{ - int pending = 0; - - /* - * If the IRQ was EOIed (called from vgic_process_maintenance) or it - * went from active to non-active (called from vgic_sync_hwirq) it was - * also ACKed and we we therefore assume we can clear the soft pending - * state (should it had been set) for this interrupt. - * - * Note: if the IRQ soft pending state was set after the IRQ was - * acked, it actually shouldn't be cleared, but we have no way of - * knowing that unless we start trapping ACKs when the soft-pending - * state is set. - */ - vgic_dist_irq_clear_soft_pend(vcpu, vlr.irq); - - /* - * Tell the gic to start sampling this interrupt again. - */ - vgic_irq_clear_queued(vcpu, vlr.irq); - - /* Any additional pending interrupt? */ - if (vgic_irq_is_edge(vcpu, vlr.irq)) { - BUG_ON(!(vlr.state & LR_HW)); - pending = vgic_dist_irq_is_pending(vcpu, vlr.irq); - } else { - if (vgic_dist_irq_get_level(vcpu, vlr.irq)) { - vgic_cpu_irq_set(vcpu, vlr.irq); - pending = 1; - } else { - vgic_dist_irq_clear_pending(vcpu, vlr.irq); - vgic_cpu_irq_clear(vcpu, vlr.irq); - } - } - - /* - * Despite being EOIed, the LR may not have - * been marked as empty. - */ - vlr.state = 0; - vlr.hwirq = 0; - vgic_set_lr(vcpu, lr, vlr); - - return pending; -} - -static bool vgic_process_maintenance(struct kvm_vcpu *vcpu) -{ - u32 status = vgic_get_interrupt_status(vcpu); - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - struct kvm *kvm = vcpu->kvm; - int level_pending = 0; - - kvm_debug("STATUS = %08x\n", status); - - if (status & INT_STATUS_EOI) { - /* - * Some level interrupts have been EOIed. Clear their - * active bit. - */ - u64 eisr = vgic_get_eisr(vcpu); - unsigned long *eisr_ptr = u64_to_bitmask(&eisr); - int lr; - - for_each_set_bit(lr, eisr_ptr, vgic->nr_lr) { - struct vgic_lr vlr = vgic_get_lr(vcpu, lr); - - WARN_ON(vgic_irq_is_edge(vcpu, vlr.irq)); - WARN_ON(vlr.state & LR_STATE_MASK); - - - /* - * kvm_notify_acked_irq calls kvm_set_irq() - * to reset the IRQ level, which grabs the dist->lock - * so we call this before taking the dist->lock. - */ - kvm_notify_acked_irq(kvm, 0, - vlr.irq - VGIC_NR_PRIVATE_IRQS); - - spin_lock(&dist->lock); - level_pending |= process_queued_irq(vcpu, lr, vlr); - spin_unlock(&dist->lock); - } - } - - if (status & INT_STATUS_UNDERFLOW) - vgic_disable_underflow(vcpu); - - /* - * In the next iterations of the vcpu loop, if we sync the vgic state - * after flushing it, but before entering the guest (this happens for - * pending signals and vmid rollovers), then make sure we don't pick - * up any old maintenance interrupts here. - */ - vgic_clear_eisr(vcpu); - - return level_pending; -} - -/* - * Save the physical active state, and reset it to inactive. - * - * Return true if there's a pending forwarded interrupt to queue. - */ -static bool vgic_sync_hwirq(struct kvm_vcpu *vcpu, int lr, struct vgic_lr vlr) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - bool level_pending; - - if (!(vlr.state & LR_HW)) - return false; - - if (vlr.state & LR_STATE_ACTIVE) - return false; - - spin_lock(&dist->lock); - level_pending = process_queued_irq(vcpu, lr, vlr); - spin_unlock(&dist->lock); - return level_pending; -} - -/* Sync back the VGIC state after a guest run */ -static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - u64 elrsr; - unsigned long *elrsr_ptr; - int lr, pending; - bool level_pending; - - level_pending = vgic_process_maintenance(vcpu); - - /* Deal with HW interrupts, and clear mappings for empty LRs */ - for (lr = 0; lr < vgic->nr_lr; lr++) { - struct vgic_lr vlr = vgic_get_lr(vcpu, lr); - - level_pending |= vgic_sync_hwirq(vcpu, lr, vlr); - BUG_ON(vlr.irq >= dist->nr_irqs); - } - - /* Check if we still have something up our sleeve... */ - elrsr = vgic_get_elrsr(vcpu); - elrsr_ptr = u64_to_bitmask(&elrsr); - pending = find_first_zero_bit(elrsr_ptr, vgic->nr_lr); - if (level_pending || pending < vgic->nr_lr) - set_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu); -} - -void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - if (!irqchip_in_kernel(vcpu->kvm)) - return; - - spin_lock(&dist->lock); - __kvm_vgic_flush_hwstate(vcpu); - spin_unlock(&dist->lock); -} - -void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) -{ - if (!irqchip_in_kernel(vcpu->kvm)) - return; - - __kvm_vgic_sync_hwstate(vcpu); -} - -int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - - if (!irqchip_in_kernel(vcpu->kvm)) - return 0; - - return test_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu); -} - -void vgic_kick_vcpus(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - int c; - - /* - * We've injected an interrupt, time to find out who deserves - * a good kick... - */ - kvm_for_each_vcpu(c, vcpu, kvm) { - if (kvm_vgic_vcpu_pending_irq(vcpu)) - kvm_vcpu_kick(vcpu); - } -} - -static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level) -{ - int edge_triggered = vgic_irq_is_edge(vcpu, irq); - - /* - * Only inject an interrupt if: - * - edge triggered and we have a rising edge - * - level triggered and we change level - */ - if (edge_triggered) { - int state = vgic_dist_irq_is_pending(vcpu, irq); - return level > state; - } else { - int state = vgic_dist_irq_get_level(vcpu, irq); - return level != state; - } -} - -static int vgic_update_irq_pending(struct kvm *kvm, int cpuid, - unsigned int irq_num, bool level) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct kvm_vcpu *vcpu; - int edge_triggered, level_triggered; - int enabled; - bool ret = true, can_inject = true; - - trace_vgic_update_irq_pending(cpuid, irq_num, level); - - if (irq_num >= min(kvm->arch.vgic.nr_irqs, 1020)) - return -EINVAL; - - spin_lock(&dist->lock); - - vcpu = kvm_get_vcpu(kvm, cpuid); - edge_triggered = vgic_irq_is_edge(vcpu, irq_num); - level_triggered = !edge_triggered; - - if (!vgic_validate_injection(vcpu, irq_num, level)) { - ret = false; - goto out; - } - - if (irq_num >= VGIC_NR_PRIVATE_IRQS) { - cpuid = dist->irq_spi_cpu[irq_num - VGIC_NR_PRIVATE_IRQS]; - if (cpuid == VCPU_NOT_ALLOCATED) { - /* Pretend we use CPU0, and prevent injection */ - cpuid = 0; - can_inject = false; - } - vcpu = kvm_get_vcpu(kvm, cpuid); - } - - kvm_debug("Inject IRQ%d level %d CPU%d\n", irq_num, level, cpuid); - - if (level) { - if (level_triggered) - vgic_dist_irq_set_level(vcpu, irq_num); - vgic_dist_irq_set_pending(vcpu, irq_num); - } else { - if (level_triggered) { - vgic_dist_irq_clear_level(vcpu, irq_num); - if (!vgic_dist_irq_soft_pend(vcpu, irq_num)) { - vgic_dist_irq_clear_pending(vcpu, irq_num); - vgic_cpu_irq_clear(vcpu, irq_num); - if (!compute_pending_for_cpu(vcpu)) - clear_bit(cpuid, dist->irq_pending_on_cpu); - } - } - - ret = false; - goto out; - } - - enabled = vgic_irq_is_enabled(vcpu, irq_num); - - if (!enabled || !can_inject) { - ret = false; - goto out; - } - - if (!vgic_can_sample_irq(vcpu, irq_num)) { - /* - * Level interrupt in progress, will be picked up - * when EOId. - */ - ret = false; - goto out; - } - - if (level) { - vgic_cpu_irq_set(vcpu, irq_num); - set_bit(cpuid, dist->irq_pending_on_cpu); - } - -out: - spin_unlock(&dist->lock); - - if (ret) { - /* kick the specified vcpu */ - kvm_vcpu_kick(kvm_get_vcpu(kvm, cpuid)); - } - - return 0; -} - -static int vgic_lazy_init(struct kvm *kvm) -{ - int ret = 0; - - if (unlikely(!vgic_initialized(kvm))) { - /* - * We only provide the automatic initialization of the VGIC - * for the legacy case of a GICv2. Any other type must - * be explicitly initialized once setup with the respective - * KVM device call. - */ - if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2) - return -EBUSY; - - mutex_lock(&kvm->lock); - ret = vgic_init(kvm); - mutex_unlock(&kvm->lock); - } - - return ret; -} - -/** - * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic - * @kvm: The VM structure pointer - * @cpuid: The CPU for PPIs - * @irq_num: The IRQ number that is assigned to the device. This IRQ - * must not be mapped to a HW interrupt. - * @level: Edge-triggered: true: to trigger the interrupt - * false: to ignore the call - * Level-sensitive true: raise the input signal - * false: lower the input signal - * - * The GIC is not concerned with devices being active-LOW or active-HIGH for - * level-sensitive interrupts. You can think of the level parameter as 1 - * being HIGH and 0 being LOW and all devices being active-HIGH. - */ -int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num, - bool level) -{ - struct irq_phys_map *map; - int ret; - - ret = vgic_lazy_init(kvm); - if (ret) - return ret; - - map = vgic_irq_map_search(kvm_get_vcpu(kvm, cpuid), irq_num); - if (map) - return -EINVAL; - - return vgic_update_irq_pending(kvm, cpuid, irq_num, level); -} - -/** - * kvm_vgic_inject_mapped_irq - Inject a physically mapped IRQ to the vgic - * @kvm: The VM structure pointer - * @cpuid: The CPU for PPIs - * @virt_irq: The virtual IRQ to be injected - * @level: Edge-triggered: true: to trigger the interrupt - * false: to ignore the call - * Level-sensitive true: raise the input signal - * false: lower the input signal - * - * The GIC is not concerned with devices being active-LOW or active-HIGH for - * level-sensitive interrupts. You can think of the level parameter as 1 - * being HIGH and 0 being LOW and all devices being active-HIGH. - */ -int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, - unsigned int virt_irq, bool level) -{ - int ret; - - ret = vgic_lazy_init(kvm); - if (ret) - return ret; - - return vgic_update_irq_pending(kvm, cpuid, virt_irq, level); -} - -static irqreturn_t vgic_maintenance_handler(int irq, void *data) -{ - /* - * We cannot rely on the vgic maintenance interrupt to be - * delivered synchronously. This means we can only use it to - * exit the VM, and we perform the handling of EOIed - * interrupts on the exit path (see vgic_process_maintenance). - */ - return IRQ_HANDLED; -} - -static struct list_head *vgic_get_irq_phys_map_list(struct kvm_vcpu *vcpu, - int virt_irq) -{ - if (virt_irq < VGIC_NR_PRIVATE_IRQS) - return &vcpu->arch.vgic_cpu.irq_phys_map_list; - else - return &vcpu->kvm->arch.vgic.irq_phys_map_list; -} - -/** - * kvm_vgic_map_phys_irq - map a virtual IRQ to a physical IRQ - * @vcpu: The VCPU pointer - * @virt_irq: The virtual IRQ number for the guest - * @phys_irq: The hardware IRQ number of the host - * - * Establish a mapping between a guest visible irq (@virt_irq) and a - * hardware irq (@phys_irq). On injection, @virt_irq will be associated with - * the physical interrupt represented by @phys_irq. This mapping can be - * established multiple times as long as the parameters are the same. - * - * Returns 0 on success or an error value otherwise. - */ -int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int phys_irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - struct list_head *root = vgic_get_irq_phys_map_list(vcpu, virt_irq); - struct irq_phys_map *map; - struct irq_phys_map_entry *entry; - int ret = 0; - - /* Create a new mapping */ - entry = kzalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - return -ENOMEM; - - spin_lock(&dist->irq_phys_map_lock); - - /* Try to match an existing mapping */ - map = vgic_irq_map_search(vcpu, virt_irq); - if (map) { - /* Make sure this mapping matches */ - if (map->phys_irq != phys_irq) - ret = -EINVAL; - - /* Found an existing, valid mapping */ - goto out; - } - - map = &entry->map; - map->virt_irq = virt_irq; - map->phys_irq = phys_irq; - - list_add_tail_rcu(&entry->entry, root); - -out: - spin_unlock(&dist->irq_phys_map_lock); - /* If we've found a hit in the existing list, free the useless - * entry */ - if (ret || map != &entry->map) - kfree(entry); - return ret; -} - -static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu, - int virt_irq) -{ - struct list_head *root = vgic_get_irq_phys_map_list(vcpu, virt_irq); - struct irq_phys_map_entry *entry; - struct irq_phys_map *map; - - rcu_read_lock(); - - list_for_each_entry_rcu(entry, root, entry) { - map = &entry->map; - if (map->virt_irq == virt_irq) { - rcu_read_unlock(); - return map; - } - } - - rcu_read_unlock(); - - return NULL; -} - -static void vgic_free_phys_irq_map_rcu(struct rcu_head *rcu) -{ - struct irq_phys_map_entry *entry; - - entry = container_of(rcu, struct irq_phys_map_entry, rcu); - kfree(entry); -} - -/** - * kvm_vgic_unmap_phys_irq - Remove a virtual to physical IRQ mapping - * @vcpu: The VCPU pointer - * @virt_irq: The virtual IRQ number to be unmapped - * - * Remove an existing mapping between virtual and physical interrupts. - */ -int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq) -{ - struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - struct irq_phys_map_entry *entry; - struct list_head *root; - - root = vgic_get_irq_phys_map_list(vcpu, virt_irq); - - spin_lock(&dist->irq_phys_map_lock); - - list_for_each_entry(entry, root, entry) { - if (entry->map.virt_irq == virt_irq) { - list_del_rcu(&entry->entry); - call_rcu(&entry->rcu, vgic_free_phys_irq_map_rcu); - break; - } - } - - spin_unlock(&dist->irq_phys_map_lock); - - return 0; -} - -static void vgic_destroy_irq_phys_map(struct kvm *kvm, struct list_head *root) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct irq_phys_map_entry *entry; - - spin_lock(&dist->irq_phys_map_lock); - - list_for_each_entry(entry, root, entry) { - list_del_rcu(&entry->entry); - call_rcu(&entry->rcu, vgic_free_phys_irq_map_rcu); - } - - spin_unlock(&dist->irq_phys_map_lock); -} - -void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - - kfree(vgic_cpu->pending_shared); - kfree(vgic_cpu->active_shared); - kfree(vgic_cpu->pend_act_shared); - vgic_destroy_irq_phys_map(vcpu->kvm, &vgic_cpu->irq_phys_map_list); - vgic_cpu->pending_shared = NULL; - vgic_cpu->active_shared = NULL; - vgic_cpu->pend_act_shared = NULL; -} - -static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - int nr_longs = BITS_TO_LONGS(nr_irqs - VGIC_NR_PRIVATE_IRQS); - int sz = nr_longs * sizeof(unsigned long); - vgic_cpu->pending_shared = kzalloc(sz, GFP_KERNEL); - vgic_cpu->active_shared = kzalloc(sz, GFP_KERNEL); - vgic_cpu->pend_act_shared = kzalloc(sz, GFP_KERNEL); - - if (!vgic_cpu->pending_shared - || !vgic_cpu->active_shared - || !vgic_cpu->pend_act_shared) { - kvm_vgic_vcpu_destroy(vcpu); - return -ENOMEM; - } - - return 0; -} - -/** - * kvm_vgic_vcpu_early_init - Earliest possible per-vcpu vgic init stage - * - * No memory allocation should be performed here, only static init. - */ -void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu) -{ - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - INIT_LIST_HEAD(&vgic_cpu->irq_phys_map_list); -} - -/** - * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW - * - * The host's GIC naturally limits the maximum amount of VCPUs a guest - * can use. - */ -int kvm_vgic_get_max_vcpus(void) -{ - return vgic->max_gic_vcpus; -} - -void kvm_vgic_destroy(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct kvm_vcpu *vcpu; - int i; - - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_vgic_vcpu_destroy(vcpu); - - vgic_free_bitmap(&dist->irq_enabled); - vgic_free_bitmap(&dist->irq_level); - vgic_free_bitmap(&dist->irq_pending); - vgic_free_bitmap(&dist->irq_soft_pend); - vgic_free_bitmap(&dist->irq_queued); - vgic_free_bitmap(&dist->irq_cfg); - vgic_free_bytemap(&dist->irq_priority); - if (dist->irq_spi_target) { - for (i = 0; i < dist->nr_cpus; i++) - vgic_free_bitmap(&dist->irq_spi_target[i]); - } - kfree(dist->irq_sgi_sources); - kfree(dist->irq_spi_cpu); - kfree(dist->irq_spi_mpidr); - kfree(dist->irq_spi_target); - kfree(dist->irq_pending_on_cpu); - kfree(dist->irq_active_on_cpu); - vgic_destroy_irq_phys_map(kvm, &dist->irq_phys_map_list); - dist->irq_sgi_sources = NULL; - dist->irq_spi_cpu = NULL; - dist->irq_spi_target = NULL; - dist->irq_pending_on_cpu = NULL; - dist->irq_active_on_cpu = NULL; - dist->nr_cpus = 0; -} - -/* - * Allocate and initialize the various data structures. Must be called - * with kvm->lock held! - */ -int vgic_init(struct kvm *kvm) -{ - struct vgic_dist *dist = &kvm->arch.vgic; - struct kvm_vcpu *vcpu; - int nr_cpus, nr_irqs; - int ret, i, vcpu_id; - - if (vgic_initialized(kvm)) - return 0; - - nr_cpus = dist->nr_cpus = atomic_read(&kvm->online_vcpus); - if (!nr_cpus) /* No vcpus? Can't be good... */ - return -ENODEV; - - /* - * If nobody configured the number of interrupts, use the - * legacy one. - */ - if (!dist->nr_irqs) - dist->nr_irqs = VGIC_NR_IRQS_LEGACY; - - nr_irqs = dist->nr_irqs; - - ret = vgic_init_bitmap(&dist->irq_enabled, nr_cpus, nr_irqs); - ret |= vgic_init_bitmap(&dist->irq_level, nr_cpus, nr_irqs); - ret |= vgic_init_bitmap(&dist->irq_pending, nr_cpus, nr_irqs); - ret |= vgic_init_bitmap(&dist->irq_soft_pend, nr_cpus, nr_irqs); - ret |= vgic_init_bitmap(&dist->irq_queued, nr_cpus, nr_irqs); - ret |= vgic_init_bitmap(&dist->irq_active, nr_cpus, nr_irqs); - ret |= vgic_init_bitmap(&dist->irq_cfg, nr_cpus, nr_irqs); - ret |= vgic_init_bytemap(&dist->irq_priority, nr_cpus, nr_irqs); - - if (ret) - goto out; - - dist->irq_sgi_sources = kzalloc(nr_cpus * VGIC_NR_SGIS, GFP_KERNEL); - dist->irq_spi_cpu = kzalloc(nr_irqs - VGIC_NR_PRIVATE_IRQS, GFP_KERNEL); - dist->irq_spi_target = kzalloc(sizeof(*dist->irq_spi_target) * nr_cpus, - GFP_KERNEL); - dist->irq_pending_on_cpu = kzalloc(BITS_TO_LONGS(nr_cpus) * sizeof(long), - GFP_KERNEL); - dist->irq_active_on_cpu = kzalloc(BITS_TO_LONGS(nr_cpus) * sizeof(long), - GFP_KERNEL); - if (!dist->irq_sgi_sources || - !dist->irq_spi_cpu || - !dist->irq_spi_target || - !dist->irq_pending_on_cpu || - !dist->irq_active_on_cpu) { - ret = -ENOMEM; - goto out; - } - - for (i = 0; i < nr_cpus; i++) - ret |= vgic_init_bitmap(&dist->irq_spi_target[i], - nr_cpus, nr_irqs); - - if (ret) - goto out; - - ret = kvm->arch.vgic.vm_ops.init_model(kvm); - if (ret) - goto out; - - kvm_for_each_vcpu(vcpu_id, vcpu, kvm) { - ret = vgic_vcpu_init_maps(vcpu, nr_irqs); - if (ret) { - kvm_err("VGIC: Failed to allocate vcpu memory\n"); - break; - } - - /* - * Enable and configure all SGIs to be edge-triggere and - * configure all PPIs as level-triggered. - */ - for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { - if (i < VGIC_NR_SGIS) { - /* SGIs */ - vgic_bitmap_set_irq_val(&dist->irq_enabled, - vcpu->vcpu_id, i, 1); - vgic_bitmap_set_irq_val(&dist->irq_cfg, - vcpu->vcpu_id, i, - VGIC_CFG_EDGE); - } else if (i < VGIC_NR_PRIVATE_IRQS) { - /* PPIs */ - vgic_bitmap_set_irq_val(&dist->irq_cfg, - vcpu->vcpu_id, i, - VGIC_CFG_LEVEL); - } - } - - vgic_enable(vcpu); - } - -out: - if (ret) - kvm_vgic_destroy(kvm); - - return ret; -} - -static int init_vgic_model(struct kvm *kvm, int type) -{ - switch (type) { - case KVM_DEV_TYPE_ARM_VGIC_V2: - vgic_v2_init_emulation(kvm); - break; -#ifdef CONFIG_KVM_ARM_VGIC_V3 - case KVM_DEV_TYPE_ARM_VGIC_V3: - vgic_v3_init_emulation(kvm); - break; -#endif - default: - return -ENODEV; - } - - if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus) - return -E2BIG; - - return 0; -} - -/** - * kvm_vgic_early_init - Earliest possible vgic initialization stage - * - * No memory allocation should be performed here, only static init. - */ -void kvm_vgic_early_init(struct kvm *kvm) -{ - spin_lock_init(&kvm->arch.vgic.lock); - spin_lock_init(&kvm->arch.vgic.irq_phys_map_lock); - INIT_LIST_HEAD(&kvm->arch.vgic.irq_phys_map_list); -} - -int kvm_vgic_create(struct kvm *kvm, u32 type) -{ - int i, vcpu_lock_idx = -1, ret; - struct kvm_vcpu *vcpu; - - mutex_lock(&kvm->lock); - - if (irqchip_in_kernel(kvm)) { - ret = -EEXIST; - goto out; - } - - /* - * This function is also called by the KVM_CREATE_IRQCHIP handler, - * which had no chance yet to check the availability of the GICv2 - * emulation. So check this here again. KVM_CREATE_DEVICE does - * the proper checks already. - */ - if (type == KVM_DEV_TYPE_ARM_VGIC_V2 && !vgic->can_emulate_gicv2) { - ret = -ENODEV; - goto out; - } - - /* - * Any time a vcpu is run, vcpu_load is called which tries to grab the - * vcpu->mutex. By grabbing the vcpu->mutex of all VCPUs we ensure - * that no other VCPUs are run while we create the vgic. - */ - ret = -EBUSY; - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!mutex_trylock(&vcpu->mutex)) - goto out_unlock; - vcpu_lock_idx = i; - } - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (vcpu->arch.has_run_once) - goto out_unlock; - } - ret = 0; - - ret = init_vgic_model(kvm, type); - if (ret) - goto out_unlock; - - kvm->arch.vgic.in_kernel = true; - kvm->arch.vgic.vgic_model = type; - kvm->arch.vgic.vctrl_base = vgic->vctrl_base; - kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF; - kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF; - kvm->arch.vgic.vgic_redist_base = VGIC_ADDR_UNDEF; - -out_unlock: - for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { - vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx); - mutex_unlock(&vcpu->mutex); - } - -out: - mutex_unlock(&kvm->lock); - return ret; -} - -static int vgic_ioaddr_overlap(struct kvm *kvm) -{ - phys_addr_t dist = kvm->arch.vgic.vgic_dist_base; - phys_addr_t cpu = kvm->arch.vgic.vgic_cpu_base; - - if (IS_VGIC_ADDR_UNDEF(dist) || IS_VGIC_ADDR_UNDEF(cpu)) - return 0; - if ((dist <= cpu && dist + KVM_VGIC_V2_DIST_SIZE > cpu) || - (cpu <= dist && cpu + KVM_VGIC_V2_CPU_SIZE > dist)) - return -EBUSY; - return 0; -} - -static int vgic_ioaddr_assign(struct kvm *kvm, phys_addr_t *ioaddr, - phys_addr_t addr, phys_addr_t size) -{ - int ret; - - if (addr & ~KVM_PHYS_MASK) - return -E2BIG; - - if (addr & (SZ_4K - 1)) - return -EINVAL; - - if (!IS_VGIC_ADDR_UNDEF(*ioaddr)) - return -EEXIST; - if (addr + size < addr) - return -EINVAL; - - *ioaddr = addr; - ret = vgic_ioaddr_overlap(kvm); - if (ret) - *ioaddr = VGIC_ADDR_UNDEF; - - return ret; -} - -/** - * kvm_vgic_addr - set or get vgic VM base addresses - * @kvm: pointer to the vm struct - * @type: the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX - * @addr: pointer to address value - * @write: if true set the address in the VM address space, if false read the - * address - * - * Set or get the vgic base addresses for the distributor and the virtual CPU - * interface in the VM physical address space. These addresses are properties - * of the emulated core/SoC and therefore user space initially knows this - * information. - */ -int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) -{ - int r = 0; - struct vgic_dist *vgic = &kvm->arch.vgic; - int type_needed; - phys_addr_t *addr_ptr, block_size; - phys_addr_t alignment; - - mutex_lock(&kvm->lock); - switch (type) { - case KVM_VGIC_V2_ADDR_TYPE_DIST: - type_needed = KVM_DEV_TYPE_ARM_VGIC_V2; - addr_ptr = &vgic->vgic_dist_base; - block_size = KVM_VGIC_V2_DIST_SIZE; - alignment = SZ_4K; - break; - case KVM_VGIC_V2_ADDR_TYPE_CPU: - type_needed = KVM_DEV_TYPE_ARM_VGIC_V2; - addr_ptr = &vgic->vgic_cpu_base; - block_size = KVM_VGIC_V2_CPU_SIZE; - alignment = SZ_4K; - break; -#ifdef CONFIG_KVM_ARM_VGIC_V3 - case KVM_VGIC_V3_ADDR_TYPE_DIST: - type_needed = KVM_DEV_TYPE_ARM_VGIC_V3; - addr_ptr = &vgic->vgic_dist_base; - block_size = KVM_VGIC_V3_DIST_SIZE; - alignment = SZ_64K; - break; - case KVM_VGIC_V3_ADDR_TYPE_REDIST: - type_needed = KVM_DEV_TYPE_ARM_VGIC_V3; - addr_ptr = &vgic->vgic_redist_base; - block_size = KVM_VGIC_V3_REDIST_SIZE; - alignment = SZ_64K; - break; -#endif - default: - r = -ENODEV; - goto out; - } - - if (vgic->vgic_model != type_needed) { - r = -ENODEV; - goto out; - } - - if (write) { - if (!IS_ALIGNED(*addr, alignment)) - r = -EINVAL; - else - r = vgic_ioaddr_assign(kvm, addr_ptr, *addr, - block_size); - } else { - *addr = *addr_ptr; - } - -out: - mutex_unlock(&kvm->lock); - return r; -} - -int vgic_set_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr) -{ - int r; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 addr; - unsigned long type = (unsigned long)attr->attr; - - if (copy_from_user(&addr, uaddr, sizeof(addr))) - return -EFAULT; - - r = kvm_vgic_addr(dev->kvm, type, &addr, true); - return (r == -ENODEV) ? -ENXIO : r; - } - case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 val; - int ret = 0; - - if (get_user(val, uaddr)) - return -EFAULT; - - /* - * We require: - * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs - * - at most 1024 interrupts - * - a multiple of 32 interrupts - */ - if (val < (VGIC_NR_PRIVATE_IRQS + 32) || - val > VGIC_MAX_IRQS || - (val & 31)) - return -EINVAL; - - mutex_lock(&dev->kvm->lock); - - if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_irqs) - ret = -EBUSY; - else - dev->kvm->arch.vgic.nr_irqs = val; - - mutex_unlock(&dev->kvm->lock); - - return ret; - } - case KVM_DEV_ARM_VGIC_GRP_CTRL: { - switch (attr->attr) { - case KVM_DEV_ARM_VGIC_CTRL_INIT: - r = vgic_init(dev->kvm); - return r; - } - break; - } - } - - return -ENXIO; -} - -int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr) -{ - int r = -ENXIO; - - switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 addr; - unsigned long type = (unsigned long)attr->attr; - - r = kvm_vgic_addr(dev->kvm, type, &addr, false); - if (r) - return (r == -ENODEV) ? -ENXIO : r; - - if (copy_to_user(uaddr, &addr, sizeof(addr))) - return -EFAULT; - break; - } - case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - - r = put_user(dev->kvm->arch.vgic.nr_irqs, uaddr); - break; - } - - } - - return r; -} - -int vgic_has_attr_regs(const struct vgic_io_range *ranges, phys_addr_t offset) -{ - if (vgic_find_range(ranges, 4, offset)) - return 0; - else - return -ENXIO; -} - -static void vgic_init_maintenance_interrupt(void *info) -{ - enable_percpu_irq(vgic->maint_irq, 0); -} - -static int vgic_cpu_notify(struct notifier_block *self, - unsigned long action, void *cpu) -{ - switch (action) { - case CPU_STARTING: - case CPU_STARTING_FROZEN: - vgic_init_maintenance_interrupt(NULL); - break; - case CPU_DYING: - case CPU_DYING_FROZEN: - disable_percpu_irq(vgic->maint_irq); - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block vgic_cpu_nb = { - .notifier_call = vgic_cpu_notify, -}; - -static int kvm_vgic_probe(void) -{ - const struct gic_kvm_info *gic_kvm_info; - int ret; - - gic_kvm_info = gic_get_kvm_info(); - if (!gic_kvm_info) - return -ENODEV; - - switch (gic_kvm_info->type) { - case GIC_V2: - ret = vgic_v2_probe(gic_kvm_info, &vgic_ops, &vgic); - break; - case GIC_V3: - ret = vgic_v3_probe(gic_kvm_info, &vgic_ops, &vgic); - break; - default: - ret = -ENODEV; - } - - return ret; -} - -int kvm_vgic_hyp_init(void) -{ - int ret; - - ret = kvm_vgic_probe(); - if (ret) { - kvm_err("error: KVM vGIC probing failed\n"); - return ret; - } - - ret = request_percpu_irq(vgic->maint_irq, vgic_maintenance_handler, - "vgic", kvm_get_running_vcpus()); - if (ret) { - kvm_err("Cannot register interrupt %d\n", vgic->maint_irq); - return ret; - } - - ret = __register_cpu_notifier(&vgic_cpu_nb); - if (ret) { - kvm_err("Cannot register vgic CPU notifier\n"); - goto out_free_irq; - } - - on_each_cpu(vgic_init_maintenance_interrupt, NULL, 1); - - return 0; - -out_free_irq: - free_percpu_irq(vgic->maint_irq, kvm_get_running_vcpus()); - return ret; -} - -int kvm_irq_map_gsi(struct kvm *kvm, - struct kvm_kernel_irq_routing_entry *entries, - int gsi) -{ - return 0; -} - -int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin) -{ - return pin; -} - -int kvm_set_irq(struct kvm *kvm, int irq_source_id, - u32 irq, int level, bool line_status) -{ - unsigned int spi = irq + VGIC_NR_PRIVATE_IRQS; - - trace_kvm_set_irq(irq, level, irq_source_id); - - BUG_ON(!vgic_initialized(kvm)); - - return kvm_vgic_inject_irq(kvm, 0, spi, level); -} - -/* MSI not implemented yet */ -int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, - int level, bool line_status) -{ - return 0; -} diff --git a/virt/kvm/arm/vgic.h b/virt/kvm/arm/vgic.h deleted file mode 100644 index 0df74cbb6200..000000000000 --- a/virt/kvm/arm/vgic.h +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (C) 2012-2014 ARM Ltd. - * Author: Marc Zyngier - * - * Derived from virt/kvm/arm/vgic.c - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#ifndef __KVM_VGIC_H__ -#define __KVM_VGIC_H__ - -#include - -#define VGIC_ADDR_UNDEF (-1) -#define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF) - -#define PRODUCT_ID_KVM 0x4b /* ASCII code K */ -#define IMPLEMENTER_ARM 0x43b - -#define ACCESS_READ_VALUE (1 << 0) -#define ACCESS_READ_RAZ (0 << 0) -#define ACCESS_READ_MASK(x) ((x) & (1 << 0)) -#define ACCESS_WRITE_IGNORED (0 << 1) -#define ACCESS_WRITE_SETBIT (1 << 1) -#define ACCESS_WRITE_CLEARBIT (2 << 1) -#define ACCESS_WRITE_VALUE (3 << 1) -#define ACCESS_WRITE_MASK(x) ((x) & (3 << 1)) - -#define VCPU_NOT_ALLOCATED ((u8)-1) - -unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x); - -void vgic_update_state(struct kvm *kvm); -int vgic_init_common_maps(struct kvm *kvm); - -u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, int cpuid, u32 offset); -u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset); - -void vgic_dist_irq_set_pending(struct kvm_vcpu *vcpu, int irq); -void vgic_dist_irq_clear_pending(struct kvm_vcpu *vcpu, int irq); -void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq); -void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid, - int irq, int val); - -void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); -void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); - -bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq); -void vgic_unqueue_irqs(struct kvm_vcpu *vcpu); - -struct kvm_exit_mmio { - phys_addr_t phys_addr; - void *data; - u32 len; - bool is_write; - void *private; -}; - -void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg, - phys_addr_t offset, int mode); -bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio, - phys_addr_t offset); - -static inline -u32 mmio_data_read(struct kvm_exit_mmio *mmio, u32 mask) -{ - return le32_to_cpu(*((u32 *)mmio->data)) & mask; -} - -static inline -void mmio_data_write(struct kvm_exit_mmio *mmio, u32 mask, u32 value) -{ - *((u32 *)mmio->data) = cpu_to_le32(value) & mask; -} - -struct vgic_io_range { - phys_addr_t base; - unsigned long len; - int bits_per_irq; - bool (*handle_mmio)(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio, - phys_addr_t offset); -}; - -int vgic_register_kvm_io_dev(struct kvm *kvm, gpa_t base, int len, - const struct vgic_io_range *ranges, - int redist_id, - struct vgic_io_device *iodev); - -static inline bool is_in_range(phys_addr_t addr, unsigned long len, - phys_addr_t baseaddr, unsigned long size) -{ - return (addr >= baseaddr) && (addr + len <= baseaddr + size); -} - -const -struct vgic_io_range *vgic_find_range(const struct vgic_io_range *ranges, - int len, gpa_t offset); - -bool vgic_handle_enable_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio, - phys_addr_t offset, int vcpu_id, int access); - -bool vgic_handle_set_pending_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio, - phys_addr_t offset, int vcpu_id); - -bool vgic_handle_clear_pending_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio, - phys_addr_t offset, int vcpu_id); - -bool vgic_handle_set_active_reg(struct kvm *kvm, - struct kvm_exit_mmio *mmio, - phys_addr_t offset, int vcpu_id); - -bool vgic_handle_clear_active_reg(struct kvm *kvm, - struct kvm_exit_mmio *mmio, - phys_addr_t offset, int vcpu_id); - -bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio, - phys_addr_t offset); - -void vgic_kick_vcpus(struct kvm *kvm); - -int vgic_has_attr_regs(const struct vgic_io_range *ranges, phys_addr_t offset); -int vgic_set_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr); -int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr); - -int vgic_init(struct kvm *kvm); -void vgic_v2_init_emulation(struct kvm *kvm); -void vgic_v3_init_emulation(struct kvm *kvm); - -#endif From 82a81bff90c5fd11fefae35773f7396617a3cfff Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2016 18:40:34 +0100 Subject: [PATCH 185/302] arm64: KVM: Merged page tables documentation Since dealing with VA ranges tends to hurt my brain badly, let's start with a bit of documentation that will hopefully help understanding what comes next... Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/include/asm/kvm_mmu.h | 40 +++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index fdfbddbe9fba..6149dfc2c012 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -29,10 +29,44 @@ * * Instead, give the HYP mode its own VA region at a fixed offset from * the kernel by just masking the top bits (which are all ones for a - * kernel address). + * kernel address). We need to find out how many bits to mask. * - * ARMv8.1 (using VHE) does have a TTBR1_EL2, and doesn't use these - * macros (the entire kernel runs at EL2). + * We want to build a set of page tables that cover both parts of the + * idmap (the trampoline page used to initialize EL2), and our normal + * runtime VA space, at the same time. + * + * Given that the kernel uses VA_BITS for its entire address space, + * and that half of that space (VA_BITS - 1) is used for the linear + * mapping, we can also limit the EL2 space to (VA_BITS - 1). + * + * The main question is "Within the VA_BITS space, does EL2 use the + * top or the bottom half of that space to shadow the kernel's linear + * mapping?". As we need to idmap the trampoline page, this is + * determined by the range in which this page lives. + * + * If the page is in the bottom half, we have to use the top half. If + * the page is in the top half, we have to use the bottom half: + * + * T = __virt_to_phys(__hyp_idmap_text_start) + * if (T & BIT(VA_BITS - 1)) + * HYP_VA_MIN = 0 //idmap in upper half + * else + * HYP_VA_MIN = 1 << (VA_BITS - 1) + * HYP_VA_MAX = HYP_VA_MIN + (1 << (VA_BITS - 1)) - 1 + * + * This of course assumes that the trampoline page exists within the + * VA_BITS range. If it doesn't, then it means we're in the odd case + * where the kernel idmap (as well as HYP) uses more levels than the + * kernel runtime page tables (as seen when the kernel is configured + * for 4k pages, 39bits VA, and yet memory lives just above that + * limit, forcing the idmap to use 4 levels of page tables while the + * kernel itself only uses 3). In this particular case, it doesn't + * matter which side of VA_BITS we use, as we're guaranteed not to + * conflict with anything. + * + * When using VHE, there are no separate hyp mappings and all KVM + * functionality is already mapped as part of the main kernel + * mappings, and none of this applies in that case. */ #define HYP_PAGE_OFFSET_SHIFT VA_BITS #define HYP_PAGE_OFFSET_MASK ((UL(1) << HYP_PAGE_OFFSET_SHIFT) - 1) From cf7df13d3c7c7f8a475c09ef49a5b72f7cfe3f4b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2016 18:40:35 +0100 Subject: [PATCH 186/302] arm64: KVM: Always reference __hyp_panic_string via its kernel VA __hyp_panic_string is passed via the HYP panic code to the panic function, and is being "upgraded" to a kernel address, as it is referenced by the HYP code (in a PC-relative way). This is a bit silly, and we'd be better off obtaining the kernel address and not mess with it at all. This patch implements this with a tiny bit of asm glue, by forcing the string pointer to be read from the literal pool. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/kvm/hyp/switch.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 437cfad5e3d8..81f21a2ab968 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -299,9 +299,16 @@ static const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:% static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par) { - unsigned long str_va = (unsigned long)__hyp_panic_string; + unsigned long str_va; - __hyp_do_panic(hyp_kern_va(str_va), + /* + * Force the panic string to be loaded from the literal pool, + * making sure it is a kernel address and not a PC-relative + * reference. + */ + asm volatile("ldr %0, =__hyp_panic_string" : "=r" (str_va)); + + __hyp_do_panic(str_va, spsr, elr, read_sysreg(esr_el2), read_sysreg_el2(far), read_sysreg(hpfar_el2), par, From 3f0f8830d440e3edf5580424519a7c3434891c64 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2016 18:40:36 +0100 Subject: [PATCH 187/302] arm/arm64: KVM: Remove hyp_kern_va helper hyp_kern_va is now completely unused, so let's remove it entirely. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_hyp.h | 1 - arch/arm64/include/asm/kvm_hyp.h | 12 ------------ 2 files changed, 13 deletions(-) diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h index f0e860761380..e38fce2270ac 100644 --- a/arch/arm/include/asm/kvm_hyp.h +++ b/arch/arm/include/asm/kvm_hyp.h @@ -26,7 +26,6 @@ #define __hyp_text __section(.hyp.text) notrace #define kern_hyp_va(v) (v) -#define hyp_kern_va(v) (v) #define __ACCESS_CP15(CRn, Op1, CRm, Op2) \ "mrc", "mcr", __stringify(p15, Op1, %0, CRn, CRm, Op2), u32 diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 44eaff70da6a..1d81f9abd172 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -36,18 +36,6 @@ static inline unsigned long __kern_hyp_va(unsigned long v) #define kern_hyp_va(v) (typeof(v))(__kern_hyp_va((unsigned long)(v))) -static inline unsigned long __hyp_kern_va(unsigned long v) -{ - u64 offset = PAGE_OFFSET - HYP_PAGE_OFFSET; - asm volatile(ALTERNATIVE("add %0, %0, %1", - "nop", - ARM64_HAS_VIRT_HOST_EXTN) - : "+r" (v) : "r" (offset)); - return v; -} - -#define hyp_kern_va(v) (typeof(v))(__hyp_kern_va((unsigned long)(v))) - #define read_sysreg_elx(r,nvh,vh) \ ({ \ u64 reg; \ From fd16fe6820ede711c6e6950ffebdbc8ade5d05b3 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2016 18:40:37 +0100 Subject: [PATCH 188/302] arm64: KVM: Kill HYP_PAGE_OFFSET HYP_PAGE_OFFSET is not massively useful. And the way we use it in KERN_HYP_VA is inconsistent with the equivalent operation in EL2, where we use a mask instead. Let's replace the uses of HYP_PAGE_OFFSET with HYP_PAGE_OFFSET_MASK, and get rid of the pointless macro. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/include/asm/kvm_mmu.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 6149dfc2c012..2f1e1aec5ecb 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -70,7 +70,6 @@ */ #define HYP_PAGE_OFFSET_SHIFT VA_BITS #define HYP_PAGE_OFFSET_MASK ((UL(1) << HYP_PAGE_OFFSET_SHIFT) - 1) -#define HYP_PAGE_OFFSET (PAGE_OFFSET & HYP_PAGE_OFFSET_MASK) /* * Our virtual mapping for the idmap-ed MMU-enable code. Must be @@ -104,7 +103,7 @@ alternative_endif #include #include -#define KERN_TO_HYP(kva) ((unsigned long)kva - PAGE_OFFSET + HYP_PAGE_OFFSET) +#define KERN_TO_HYP(kva) ((unsigned long)kva & HYP_PAGE_OFFSET_MASK) /* * We currently only support a 40bit IPA. From 853c3b21ff35816a2ae351fd7c2adb101c1f4503 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2016 18:40:38 +0100 Subject: [PATCH 189/302] arm64: Add ARM64_HYP_OFFSET_LOW capability As we need to indicate to the rest of the kernel which region of the HYP VA space is safe to use, add a capability that will indicate that KVM should use the [VA_BITS-2:0] range. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/include/asm/cpufeature.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 224efe730e46..d40edbb6ef23 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -36,8 +36,9 @@ #define ARM64_HAS_VIRT_HOST_EXTN 11 #define ARM64_WORKAROUND_CAVIUM_27456 12 #define ARM64_HAS_32BIT_EL0 13 +#define ARM64_HYP_OFFSET_LOW 14 -#define ARM64_NCAPS 14 +#define ARM64_NCAPS 15 #ifndef __ASSEMBLY__ From d53d9bc65289dc50b42587313466594e4d611f0f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2016 18:40:39 +0100 Subject: [PATCH 190/302] arm64: KVM: Define HYP offset masks Define the two possible HYP VA regions in terms of VA_BITS, and keep HYP_PAGE_OFFSET_MASK as a temporary compatibility definition. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/include/asm/kvm_mmu.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 2f1e1aec5ecb..5e543231f615 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -68,8 +68,12 @@ * functionality is already mapped as part of the main kernel * mappings, and none of this applies in that case. */ -#define HYP_PAGE_OFFSET_SHIFT VA_BITS -#define HYP_PAGE_OFFSET_MASK ((UL(1) << HYP_PAGE_OFFSET_SHIFT) - 1) + +#define HYP_PAGE_OFFSET_HIGH_MASK ((UL(1) << VA_BITS) - 1) +#define HYP_PAGE_OFFSET_LOW_MASK ((UL(1) << (VA_BITS - 1)) - 1) + +/* Temporary compat define */ +#define HYP_PAGE_OFFSET_MASK HYP_PAGE_OFFSET_HIGH_MASK /* * Our virtual mapping for the idmap-ed MMU-enable code. Must be From fd81e6bf3928c14f90a033df164c375d4ce0fd85 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2016 18:40:40 +0100 Subject: [PATCH 191/302] arm64: KVM: Refactor kern_hyp_va to deal with multiple offsets As we move towards a selectable HYP VA range, it is obvious that we don't want to test a variable to find out if we need to use the bottom VA range, the top VA range, or use the address as is (for VHE). Instead, we can expand our current helper to generate the right mask or nop with code patching. We default to using the top VA space, with alternatives to switch to the bottom one or to nop out the instructions. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/include/asm/kvm_hyp.h | 11 --------- arch/arm64/include/asm/kvm_mmu.h | 42 +++++++++++++++++++++++++++++--- 2 files changed, 39 insertions(+), 14 deletions(-) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 1d81f9abd172..cff510574fae 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -25,17 +25,6 @@ #define __hyp_text __section(.hyp.text) notrace -static inline unsigned long __kern_hyp_va(unsigned long v) -{ - asm volatile(ALTERNATIVE("and %0, %0, %1", - "nop", - ARM64_HAS_VIRT_HOST_EXTN) - : "+r" (v) : "i" (HYP_PAGE_OFFSET_MASK)); - return v; -} - -#define kern_hyp_va(v) (typeof(v))(__kern_hyp_va((unsigned long)(v))) - #define read_sysreg_elx(r,nvh,vh) \ ({ \ u64 reg; \ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 5e543231f615..2970537161d2 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -90,13 +90,33 @@ /* * Convert a kernel VA into a HYP VA. * reg: VA to be converted. + * + * This generates the following sequences: + * - High mask: + * and x0, x0, #HYP_PAGE_OFFSET_HIGH_MASK + * nop + * - Low mask: + * and x0, x0, #HYP_PAGE_OFFSET_HIGH_MASK + * and x0, x0, #HYP_PAGE_OFFSET_LOW_MASK + * - VHE: + * nop + * nop + * + * The "low mask" version works because the mask is a strict subset of + * the "high mask", hence performing the first mask for nothing. + * Should be completely invisible on any viable CPU. */ .macro kern_hyp_va reg -alternative_if_not ARM64_HAS_VIRT_HOST_EXTN - and \reg, \reg, #HYP_PAGE_OFFSET_MASK +alternative_if_not ARM64_HAS_VIRT_HOST_EXTN + and \reg, \reg, #HYP_PAGE_OFFSET_HIGH_MASK alternative_else nop alternative_endif +alternative_if_not ARM64_HYP_OFFSET_LOW + nop +alternative_else + and \reg, \reg, #HYP_PAGE_OFFSET_LOW_MASK +alternative_endif .endm #else @@ -107,7 +127,23 @@ alternative_endif #include #include -#define KERN_TO_HYP(kva) ((unsigned long)kva & HYP_PAGE_OFFSET_MASK) +static inline unsigned long __kern_hyp_va(unsigned long v) +{ + asm volatile(ALTERNATIVE("and %0, %0, %1", + "nop", + ARM64_HAS_VIRT_HOST_EXTN) + : "+r" (v) + : "i" (HYP_PAGE_OFFSET_HIGH_MASK)); + asm volatile(ALTERNATIVE("nop", + "and %0, %0, %1", + ARM64_HYP_OFFSET_LOW) + : "+r" (v) + : "i" (HYP_PAGE_OFFSET_LOW_MASK)); + return v; +} + +#define kern_hyp_va(v) (typeof(v))(__kern_hyp_va((unsigned long)(v))) +#define KERN_TO_HYP(v) kern_hyp_va(v) /* * We currently only support a 40bit IPA. From 1df3e2347a432fec7ec4aea67161986e116f68eb Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2016 18:40:41 +0100 Subject: [PATCH 192/302] arm/arm64: KVM: Export __hyp_text_start/end symbols Declare the __hyp_text_start/end symbols in asm/virt.h so that they can be reused without having to declare them locally. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/virt.h | 4 ++++ arch/arm/kvm/mmu.c | 2 -- arch/arm64/include/asm/virt.h | 4 ++++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/asm/virt.h b/arch/arm/include/asm/virt.h index d4ceaf5f299b..a2e75b84e2ae 100644 --- a/arch/arm/include/asm/virt.h +++ b/arch/arm/include/asm/virt.h @@ -80,6 +80,10 @@ static inline bool is_kernel_in_hyp_mode(void) return false; } +/* The section containing the hypervisor idmap text */ +extern char __hyp_idmap_text_start[]; +extern char __hyp_idmap_text_end[]; + /* The section containing the hypervisor text */ extern char __hyp_text_start[]; extern char __hyp_text_end[]; diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 679608fa1666..f004e7017201 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -32,8 +32,6 @@ #include "trace.h" -extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[]; - static pgd_t *boot_hyp_pgd; static pgd_t *hyp_pgd; static pgd_t *merged_hyp_pgd; diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index dcbcf8dcbefb..88aa8ec784f6 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -82,6 +82,10 @@ extern void verify_cpu_run_el(void); static inline void verify_cpu_run_el(void) {} #endif +/* The section containing the hypervisor idmap text */ +extern char __hyp_idmap_text_start[]; +extern char __hyp_idmap_text_end[]; + /* The section containing the hypervisor text */ extern char __hyp_text_start[]; extern char __hyp_text_end[]; From d174591016ce9fcb61182e2a4d0aac951900fc32 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2016 18:40:42 +0100 Subject: [PATCH 193/302] arm64: KVM: Runtime detection of lower HYP offset Add the code that enables the switch to the lower HYP VA range. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/kernel/cpufeature.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 811773d1c1d0..ffb3e14dda60 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -726,6 +726,19 @@ static bool runs_at_el2(const struct arm64_cpu_capabilities *entry, int __unused return is_kernel_in_hyp_mode(); } +static bool hyp_offset_low(const struct arm64_cpu_capabilities *entry, + int __unused) +{ + phys_addr_t idmap_addr = virt_to_phys(__hyp_idmap_text_start); + + /* + * Activate the lower HYP offset only if: + * - the idmap doesn't clash with it, + * - the kernel is not running at EL2. + */ + return idmap_addr > GENMASK(VA_BITS - 2, 0) && !is_kernel_in_hyp_mode(); +} + static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "GIC system register CPU interface", @@ -803,6 +816,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .field_pos = ID_AA64PFR0_EL0_SHIFT, .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT, }, + { + .desc = "Reduced HYP mapping offset", + .capability = ARM64_HYP_OFFSET_LOW, + .def_scope = SCOPE_SYSTEM, + .matches = hyp_offset_low, + }, {}, }; From 0535a3e2b2d518a21d93e7cfe07821f1b24ccd0c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2016 18:40:43 +0100 Subject: [PATCH 194/302] arm/arm64: KVM: Always have merged page tables We're in a position where we can now always have "merged" page tables, where both the runtime mapping and the idmap coexist. This results in some code being removed, but there is more to come. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/kvm/mmu.c | 76 +++++++++++++++++++----------------------- arch/arm64/kvm/reset.c | 31 ++++------------- 2 files changed, 42 insertions(+), 65 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index f004e7017201..80d3737d68d2 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -492,13 +492,12 @@ void free_boot_hyp_pgd(void) if (boot_hyp_pgd) { unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE); - unmap_hyp_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE); free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order); boot_hyp_pgd = NULL; } if (hyp_pgd) - unmap_hyp_range(hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE); + unmap_hyp_range(hyp_pgd, hyp_idmap_start, PAGE_SIZE); mutex_unlock(&kvm_hyp_pgd_mutex); } @@ -1691,7 +1690,7 @@ phys_addr_t kvm_mmu_get_boot_httbr(void) if (__kvm_cpu_uses_extended_idmap()) return virt_to_phys(merged_hyp_pgd); else - return virt_to_phys(boot_hyp_pgd); + return virt_to_phys(hyp_pgd); } phys_addr_t kvm_get_idmap_vector(void) @@ -1704,6 +1703,22 @@ phys_addr_t kvm_get_idmap_start(void) return hyp_idmap_start; } +static int kvm_map_idmap_text(pgd_t *pgd) +{ + int err; + + /* Create the idmap in the boot page tables */ + err = __create_hyp_mappings(pgd, + hyp_idmap_start, hyp_idmap_end, + __phys_to_pfn(hyp_idmap_start), + PAGE_HYP_EXEC); + if (err) + kvm_err("Failed to idmap %lx-%lx\n", + hyp_idmap_start, hyp_idmap_end); + + return err; +} + int kvm_mmu_init(void) { int err; @@ -1719,27 +1734,25 @@ int kvm_mmu_init(void) BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order); - boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order); - - if (!hyp_pgd || !boot_hyp_pgd) { + if (!hyp_pgd) { kvm_err("Hyp mode PGD not allocated\n"); err = -ENOMEM; goto out; } - /* Create the idmap in the boot page tables */ - err = __create_hyp_mappings(boot_hyp_pgd, - hyp_idmap_start, hyp_idmap_end, - __phys_to_pfn(hyp_idmap_start), - PAGE_HYP_EXEC); - - if (err) { - kvm_err("Failed to idmap %lx-%lx\n", - hyp_idmap_start, hyp_idmap_end); - goto out; - } - if (__kvm_cpu_uses_extended_idmap()) { + boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + hyp_pgd_order); + if (!boot_hyp_pgd) { + kvm_err("Hyp boot PGD not allocated\n"); + err = -ENOMEM; + goto out; + } + + err = kvm_map_idmap_text(boot_hyp_pgd); + if (err) + goto out; + merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); if (!merged_hyp_pgd) { kvm_err("Failed to allocate extra HYP pgd\n"); @@ -1747,29 +1760,10 @@ int kvm_mmu_init(void) } __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd, hyp_idmap_start); - return 0; - } - - /* Map the very same page at the trampoline VA */ - err = __create_hyp_mappings(boot_hyp_pgd, - TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE, - __phys_to_pfn(hyp_idmap_start), - PAGE_HYP_EXEC); - if (err) { - kvm_err("Failed to map trampoline @%lx into boot HYP pgd\n", - TRAMPOLINE_VA); - goto out; - } - - /* Map the same page again into the runtime page tables */ - err = __create_hyp_mappings(hyp_pgd, - TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE, - __phys_to_pfn(hyp_idmap_start), - PAGE_HYP_EXEC); - if (err) { - kvm_err("Failed to map trampoline @%lx into runtime HYP pgd\n", - TRAMPOLINE_VA); - goto out; + } else { + err = kvm_map_idmap_text(hyp_pgd); + if (err) + goto out; } return 0; diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 7be24f2b18db..8ed7e4a92e95 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -133,30 +133,13 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq); } -extern char __hyp_idmap_text_start[]; - unsigned long kvm_hyp_reset_entry(void) { - if (!__kvm_cpu_uses_extended_idmap()) { - unsigned long offset; - - /* - * Find the address of __kvm_hyp_reset() in the trampoline page. - * This is present in the running page tables, and the boot page - * tables, so we call the code here to start the trampoline - * dance in reverse. - */ - offset = (unsigned long)__kvm_hyp_reset - - ((unsigned long)__hyp_idmap_text_start & PAGE_MASK); - - return TRAMPOLINE_VA + offset; - } else { - /* - * KVM is running with merged page tables, which don't have the - * trampoline page mapped. We know the idmap is still mapped, - * but can't be called into directly. Use - * __extended_idmap_trampoline to do the call. - */ - return (unsigned long)kvm_ksym_ref(__extended_idmap_trampoline); - } + /* + * KVM is running with merged page tables, which don't have the + * trampoline page mapped. We know the idmap is still mapped, + * but can't be called into directly. Use + * __extended_idmap_trampoline to do the call. + */ + return (unsigned long)kvm_ksym_ref(__extended_idmap_trampoline); } From 3421e9d88d7ae70fbc8c903e44a5acace8ae2d29 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2016 18:40:44 +0100 Subject: [PATCH 195/302] arm64: KVM: Simplify HYP init/teardown Now that we only have the "merged page tables" case to deal with, there is a bunch of things we can simplify in the HYP code (both at init and teardown time). Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/include/asm/kvm_host.h | 12 ++---- arch/arm64/kvm/hyp-init.S | 61 ++++--------------------------- arch/arm64/kvm/hyp/entry.S | 19 ---------- arch/arm64/kvm/hyp/hyp-entry.S | 15 ++++++++ arch/arm64/kvm/reset.c | 11 ------ 5 files changed, 26 insertions(+), 92 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 49095fc4b482..88462c307510 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -48,7 +48,6 @@ int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); int kvm_arch_dev_ioctl_check_extension(long ext); -unsigned long kvm_hyp_reset_entry(void); void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start); struct kvm_arch { @@ -357,19 +356,14 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr, * Call initialization code, and switch to the full blown * HYP code. */ - __kvm_call_hyp((void *)boot_pgd_ptr, pgd_ptr, - hyp_stack_ptr, vector_ptr); + __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr); } +void __kvm_hyp_teardown(void); static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr, phys_addr_t phys_idmap_start) { - /* - * Call reset code, and switch back to stub hyp vectors. - * Uses __kvm_call_hyp() to avoid kaslr's kvm_ksym_ref() translation. - */ - __kvm_call_hyp((void *)kvm_hyp_reset_entry(), - boot_pgd_ptr, phys_idmap_start); + kvm_call_hyp(__kvm_hyp_teardown, phys_idmap_start); } static inline void kvm_arch_hardware_unsetup(void) {} diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index a873a6d8be90..6b29d3d9e1f2 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -53,10 +53,9 @@ __invalid: b . /* - * x0: HYP boot pgd - * x1: HYP pgd - * x2: HYP stack - * x3: HYP vectors + * x0: HYP pgd + * x1: HYP stack + * x2: HYP vectors */ __do_hyp_init: @@ -110,71 +109,27 @@ __do_hyp_init: msr sctlr_el2, x4 isb - /* Skip the trampoline dance if we merged the boot and runtime PGDs */ - cmp x0, x1 - b.eq merged - - /* MMU is now enabled. Get ready for the trampoline dance */ - ldr x4, =TRAMPOLINE_VA - adr x5, target - bfi x4, x5, #0, #PAGE_SHIFT - br x4 - -target: /* We're now in the trampoline code, switch page tables */ - msr ttbr0_el2, x1 - isb - - /* Invalidate the old TLBs */ - tlbi alle2 - dsb sy - -merged: /* Set the stack and new vectors */ + kern_hyp_va x1 + mov sp, x1 kern_hyp_va x2 - mov sp, x2 - kern_hyp_va x3 - msr vbar_el2, x3 + msr vbar_el2, x2 /* Hello, World! */ eret ENDPROC(__kvm_hyp_init) /* - * Reset kvm back to the hyp stub. This is the trampoline dance in - * reverse. If kvm used an extended idmap, __extended_idmap_trampoline - * calls this code directly in the idmap. In this case switching to the - * boot tables is a no-op. - * - * x0: HYP boot pgd - * x1: HYP phys_idmap_start + * Reset kvm back to the hyp stub. */ ENTRY(__kvm_hyp_reset) - /* We're in trampoline code in VA, switch back to boot page tables */ - msr ttbr0_el2, x0 - isb - - /* Ensure the PA branch doesn't find a stale tlb entry or stale code. */ - ic iallu - tlbi alle2 - dsb sy - isb - - /* Branch into PA space */ - adr x0, 1f - bfi x1, x0, #0, #PAGE_SHIFT - br x1 - /* We're now in idmap, disable MMU */ -1: mrs x0, sctlr_el2 + mrs x0, sctlr_el2 ldr x1, =SCTLR_ELx_FLAGS bic x0, x0, x1 // Clear SCTL_M and etc msr sctlr_el2, x0 isb - /* Invalidate the old TLBs */ - tlbi alle2 - dsb sy - /* Install stub vectors */ adr_l x0, __hyp_stub_vectors msr vbar_el2, x0 diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S index 70254a65bd5b..ce9e5e5f28cf 100644 --- a/arch/arm64/kvm/hyp/entry.S +++ b/arch/arm64/kvm/hyp/entry.S @@ -164,22 +164,3 @@ alternative_endif eret ENDPROC(__fpsimd_guest_restore) - -/* - * When using the extended idmap, we don't have a trampoline page we can use - * while we switch pages tables during __kvm_hyp_reset. Accessing the idmap - * directly would be ideal, but if we're using the extended idmap then the - * idmap is located above HYP_PAGE_OFFSET, and the address will be masked by - * kvm_call_hyp using kern_hyp_va. - * - * x0: HYP boot pgd - * x1: HYP phys_idmap_start - */ -ENTRY(__extended_idmap_trampoline) - mov x4, x1 - adr_l x3, __kvm_hyp_reset - - /* insert __kvm_hyp_reset()s offset into phys_idmap_start */ - bfi x4, x3, #0, #PAGE_SHIFT - br x4 -ENDPROC(__extended_idmap_trampoline) diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 2d87f36d5cb4..f6d9694ae3b1 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -62,6 +62,21 @@ ENTRY(__vhe_hyp_call) isb ret ENDPROC(__vhe_hyp_call) + +/* + * Compute the idmap address of __kvm_hyp_reset based on the idmap + * start passed as a parameter, and jump there. + * + * x0: HYP phys_idmap_start + */ +ENTRY(__kvm_hyp_teardown) + mov x4, x0 + adr_l x3, __kvm_hyp_reset + + /* insert __kvm_hyp_reset()s offset into phys_idmap_start */ + bfi x4, x3, #0, #PAGE_SHIFT + br x4 +ENDPROC(__kvm_hyp_teardown) el1_sync: // Guest trapped into EL2 save_x0_to_x3 diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 8ed7e4a92e95..79f324823340 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -132,14 +132,3 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) /* Reset timer */ return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq); } - -unsigned long kvm_hyp_reset_entry(void) -{ - /* - * KVM is running with merged page tables, which don't have the - * trampoline page mapped. We know the idmap is still mapped, - * but can't be called into directly. Use - * __extended_idmap_trampoline to do the call. - */ - return (unsigned long)kvm_ksym_ref(__extended_idmap_trampoline); -} From 12fda8123d74903d1f65fb006fe4964e23ede0d1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2016 18:40:45 +0100 Subject: [PATCH 196/302] arm/arm64: KVM: Drop boot_pgd Since we now only have one set of page tables, the concept of boot_pgd is useless and can be removed. We still keep it as an element of the "extended idmap" thing. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 8 +++----- arch/arm/include/asm/kvm_mmu.h | 1 - arch/arm/kvm/arm.c | 15 +++------------ arch/arm/kvm/mmu.c | 8 -------- arch/arm64/include/asm/kvm_host.h | 6 ++---- arch/arm64/include/asm/kvm_mmu.h | 1 - 6 files changed, 8 insertions(+), 31 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 96387d477e91..020f4eb14f0b 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -241,8 +241,7 @@ int kvm_arm_coproc_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, int exception_index); -static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr, - phys_addr_t pgd_ptr, +static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, unsigned long hyp_stack_ptr, unsigned long vector_ptr) { @@ -272,12 +271,11 @@ static inline void __cpu_init_stage2(void) kvm_call_hyp(__init_stage2_translation); } -static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr, - phys_addr_t phys_idmap_start) +static inline void __cpu_reset_hyp_mode(phys_addr_t phys_idmap_start) { /* * TODO - * kvm_call_reset(boot_pgd_ptr, phys_idmap_start); + * kvm_call_reset(phys_idmap_start); */ } diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 6cb4d4d5c48c..5d161d13b471 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -65,7 +65,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run); void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); phys_addr_t kvm_mmu_get_httbr(void); -phys_addr_t kvm_mmu_get_boot_httbr(void); phys_addr_t kvm_get_idmap_vector(void); phys_addr_t kvm_get_idmap_start(void); int kvm_mmu_init(void); diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index c74483fc39f2..0887cc12b401 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -1038,7 +1038,6 @@ long kvm_arch_vm_ioctl(struct file *filp, static void cpu_init_hyp_mode(void *dummy) { - phys_addr_t boot_pgd_ptr; phys_addr_t pgd_ptr; unsigned long hyp_stack_ptr; unsigned long stack_page; @@ -1047,13 +1046,12 @@ static void cpu_init_hyp_mode(void *dummy) /* Switch from the HYP stub to our own HYP init vector */ __hyp_set_vectors(kvm_get_idmap_vector()); - boot_pgd_ptr = kvm_mmu_get_boot_httbr(); pgd_ptr = kvm_mmu_get_httbr(); stack_page = __this_cpu_read(kvm_arm_hyp_stack_page); hyp_stack_ptr = stack_page + PAGE_SIZE; vector_ptr = (unsigned long)kvm_ksym_ref(__kvm_hyp_vector); - __cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr); + __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr); __cpu_init_stage2(); kvm_arm_init_debug(); @@ -1075,15 +1073,8 @@ static void cpu_hyp_reinit(void) static void cpu_hyp_reset(void) { - phys_addr_t boot_pgd_ptr; - phys_addr_t phys_idmap_start; - - if (!is_kernel_in_hyp_mode()) { - boot_pgd_ptr = kvm_mmu_get_boot_httbr(); - phys_idmap_start = kvm_get_idmap_start(); - - __cpu_reset_hyp_mode(boot_pgd_ptr, phys_idmap_start); - } + if (!is_kernel_in_hyp_mode()) + __cpu_reset_hyp_mode(kvm_get_idmap_start()); } static void _kvm_arch_hardware_enable(void *discard) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 80d3737d68d2..dd4ccc7f7baf 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1685,14 +1685,6 @@ phys_addr_t kvm_mmu_get_httbr(void) return virt_to_phys(hyp_pgd); } -phys_addr_t kvm_mmu_get_boot_httbr(void) -{ - if (__kvm_cpu_uses_extended_idmap()) - return virt_to_phys(merged_hyp_pgd); - else - return virt_to_phys(hyp_pgd); -} - phys_addr_t kvm_get_idmap_vector(void) { return hyp_idmap_vector; diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 88462c307510..6731d4e1c746 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -347,8 +347,7 @@ int kvm_perf_teardown(void); struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr); -static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr, - phys_addr_t pgd_ptr, +static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, unsigned long hyp_stack_ptr, unsigned long vector_ptr) { @@ -360,8 +359,7 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr, } void __kvm_hyp_teardown(void); -static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr, - phys_addr_t phys_idmap_start) +static inline void __cpu_reset_hyp_mode(phys_addr_t phys_idmap_start) { kvm_call_hyp(__kvm_hyp_teardown, phys_idmap_start); } diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 2970537161d2..390acabdb1b2 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -170,7 +170,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run); void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); phys_addr_t kvm_mmu_get_httbr(void); -phys_addr_t kvm_mmu_get_boot_httbr(void); phys_addr_t kvm_get_idmap_vector(void); phys_addr_t kvm_get_idmap_start(void); int kvm_mmu_init(void); From 26781f9ce16801a9c680dae1a7c1ca2fd3d112bd Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2016 18:40:46 +0100 Subject: [PATCH 197/302] arm/arm64: KVM: Kill free_boot_hyp_pgd There is no way to free the boot PGD, because it doesn't exist anymore as a standalone entity. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_mmu.h | 1 - arch/arm/kvm/arm.c | 4 ---- arch/arm/kvm/mmu.c | 30 +++++++----------------------- arch/arm64/include/asm/kvm_mmu.h | 1 - 4 files changed, 7 insertions(+), 29 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 5d161d13b471..d5fd9fb8550b 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -51,7 +51,6 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot); int create_hyp_io_mappings(void *from, void *to, phys_addr_t); -void free_boot_hyp_pgd(void); void free_hyp_pgds(void); void stage2_unmap_vm(struct kvm *kvm); diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 0887cc12b401..9b8c53798f50 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -1323,10 +1323,6 @@ static int init_hyp_mode(void) } } -#ifndef CONFIG_HOTPLUG_CPU - free_boot_hyp_pgd(); -#endif - /* set size of VMID supported by CPU */ kvm_vmid_bits = kvm_get_vmid_bits(); kvm_info("%d-bit VMID\n", kvm_vmid_bits); diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index dd4ccc7f7baf..0b36dd52af62 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -481,27 +481,6 @@ static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size) } while (pgd++, addr = next, addr != end); } -/** - * free_boot_hyp_pgd - free HYP boot page tables - * - * Free the HYP boot page tables. The bounce page is also freed. - */ -void free_boot_hyp_pgd(void) -{ - mutex_lock(&kvm_hyp_pgd_mutex); - - if (boot_hyp_pgd) { - unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE); - free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order); - boot_hyp_pgd = NULL; - } - - if (hyp_pgd) - unmap_hyp_range(hyp_pgd, hyp_idmap_start, PAGE_SIZE); - - mutex_unlock(&kvm_hyp_pgd_mutex); -} - /** * free_hyp_pgds - free Hyp-mode page tables * @@ -516,11 +495,16 @@ void free_hyp_pgds(void) { unsigned long addr; - free_boot_hyp_pgd(); - mutex_lock(&kvm_hyp_pgd_mutex); + if (boot_hyp_pgd) { + unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE); + free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order); + boot_hyp_pgd = NULL; + } + if (hyp_pgd) { + unmap_hyp_range(hyp_pgd, hyp_idmap_start, PAGE_SIZE); for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE) unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE); for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 390acabdb1b2..b89122ed827d 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -156,7 +156,6 @@ static inline unsigned long __kern_hyp_va(unsigned long v) int create_hyp_mappings(void *from, void *to, pgprot_t prot); int create_hyp_io_mappings(void *from, void *to, phys_addr_t); -void free_boot_hyp_pgd(void); void free_hyp_pgds(void); void stage2_unmap_vm(struct kvm *kvm); From cd602a37e80c791adf2a256d2aedec60b898cd51 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2016 18:40:47 +0100 Subject: [PATCH 198/302] arm: KVM: Simplify HYP init Just like for arm64, we can now make the HYP setup a lot simpler, and we can now initialise it in one go (instead of the two phases we currently have). Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 15 ++++------ arch/arm/kvm/init.S | 49 ++++++--------------------------- 2 files changed, 14 insertions(+), 50 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 020f4eb14f0b..eafbfd5ad34a 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -250,18 +250,13 @@ static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, * code. The init code doesn't need to preserve these * registers as r0-r3 are already callee saved according to * the AAPCS. - * Note that we slightly misuse the prototype by casing the + * Note that we slightly misuse the prototype by casting the * stack pointer to a void *. - * - * We don't have enough registers to perform the full init in - * one go. Install the boot PGD first, and then install the - * runtime PGD, stack pointer and vectors. The PGDs are always - * passed as the third argument, in order to be passed into - * r2-r3 to the init code (yes, this is compliant with the - * PCS!). - */ - kvm_call_hyp(NULL, 0, boot_pgd_ptr); + * The PGDs are always passed as the third argument, in order + * to be passed into r2-r3 to the init code (yes, this is + * compliant with the PCS!). + */ kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr); } diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S index 1f9ae17476f9..b82a99dcfb61 100644 --- a/arch/arm/kvm/init.S +++ b/arch/arm/kvm/init.S @@ -32,23 +32,13 @@ * r2,r3 = Hypervisor pgd pointer * * The init scenario is: - * - We jump in HYP with four parameters: boot HYP pgd, runtime HYP pgd, - * runtime stack, runtime vectors - * - Enable the MMU with the boot pgd - * - Jump to a target into the trampoline page (remember, this is the same - * physical page!) - * - Now switch to the runtime pgd (same VA, and still the same physical - * page!) + * - We jump in HYP with 3 parameters: runtime HYP pgd, runtime stack, + * runtime vectors * - Invalidate TLBs * - Set stack and vectors + * - Setup the page tables + * - Enable the MMU * - Profit! (or eret, if you only care about the code). - * - * As we only have four registers available to pass parameters (and we - * need six), we split the init in two phases: - * - Phase 1: r0 = 0, r1 = 0, r2,r3 contain the boot PGD. - * Provides the basic HYP init, and enable the MMU. - * - Phase 2: r0 = ToS, r1 = vectors, r2,r3 contain the runtime PGD. - * Switches to the runtime PGD, set stack and vectors. */ .text @@ -68,8 +58,11 @@ __kvm_hyp_init: W(b) . __do_hyp_init: - cmp r0, #0 @ We have a SP? - bne phase2 @ Yes, second stage init + @ Set stack pointer + mov sp, r0 + + @ Set HVBAR to point to the HYP vectors + mcr p15, 4, r1, c12, c0, 0 @ HVBAR @ Set the HTTBR to point to the hypervisor PGD pointer passed mcrr p15, 4, rr_lo_hi(r2, r3), c2 @@ -114,33 +107,9 @@ __do_hyp_init: THUMB( ldr r2, =(HSCTLR_M | HSCTLR_A | HSCTLR_TE) ) orr r1, r1, r2 orr r0, r0, r1 - isb mcr p15, 4, r0, c1, c0, 0 @ HSCR - - @ End of init phase-1 - eret - -phase2: - @ Set stack pointer - mov sp, r0 - - @ Set HVBAR to point to the HYP vectors - mcr p15, 4, r1, c12, c0, 0 @ HVBAR - - @ Jump to the trampoline page - ldr r0, =TRAMPOLINE_VA - adr r1, target - bfi r0, r1, #0, #PAGE_SHIFT - ret r0 - -target: @ We're now in the trampoline code, switch page tables - mcrr p15, 4, rr_lo_hi(r2, r3), c2 isb - @ Invalidate the old TLBs - mcr p15, 4, r0, c8, c7, 0 @ TLBIALLH - dsb ish - eret .ltorg From e537ecd7efacaa7512e87ecb07c0c0335a902558 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2016 18:40:48 +0100 Subject: [PATCH 199/302] arm: KVM: Allow hyp teardown So far, KVM was getting in the way of kexec on 32bit (and the arm64 kexec hackers couldn't be bothered to fix it on 32bit...). With simpler page tables, tearing KVM down becomes very easy, so let's just do it. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_asm.h | 2 ++ arch/arm/include/asm/kvm_host.h | 8 +++----- arch/arm/kvm/arm.c | 3 ++- arch/arm/kvm/init.S | 15 +++++++++++++++ arch/arm64/include/asm/kvm_host.h | 3 ++- 5 files changed, 24 insertions(+), 7 deletions(-) diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 3d5a5cd071bd..58faff5f1eb2 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -66,6 +66,8 @@ extern void __kvm_tlb_flush_vmid(struct kvm *kvm); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); extern void __init_stage2_translation(void); + +extern void __kvm_hyp_reset(unsigned long); #endif #endif /* __ARM_KVM_ASM_H__ */ diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index eafbfd5ad34a..58d0b69e7428 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -266,12 +266,10 @@ static inline void __cpu_init_stage2(void) kvm_call_hyp(__init_stage2_translation); } -static inline void __cpu_reset_hyp_mode(phys_addr_t phys_idmap_start) +static inline void __cpu_reset_hyp_mode(unsigned long vector_ptr, + phys_addr_t phys_idmap_start) { - /* - * TODO - * kvm_call_reset(phys_idmap_start); - */ + kvm_call_hyp((void *)virt_to_idmap(__kvm_hyp_reset), vector_ptr); } static inline int kvm_arch_dev_ioctl_check_extension(long ext) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 9b8c53798f50..7cf266c502d6 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -1074,7 +1074,8 @@ static void cpu_hyp_reinit(void) static void cpu_hyp_reset(void) { if (!is_kernel_in_hyp_mode()) - __cpu_reset_hyp_mode(kvm_get_idmap_start()); + __cpu_reset_hyp_mode(hyp_default_vectors, + kvm_get_idmap_start()); } static void _kvm_arch_hardware_enable(void *discard) diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S index b82a99dcfb61..bf89c919efc1 100644 --- a/arch/arm/kvm/init.S +++ b/arch/arm/kvm/init.S @@ -112,6 +112,21 @@ __do_hyp_init: eret + @ r0 : stub vectors address +ENTRY(__kvm_hyp_reset) + /* We're now in idmap, disable MMU */ + mrc p15, 4, r1, c1, c0, 0 @ HSCTLR + ldr r2, =(HSCTLR_M | HSCTLR_A | HSCTLR_C | HSCTLR_I) + bic r1, r1, r2 + mcr p15, 4, r1, c1, c0, 0 @ HSCTLR + + /* Install stub vectors */ + mcr p15, 4, r0, c12, c0, 0 @ HVBAR + isb + + eret +ENDPROC(__kvm_hyp_reset) + .ltorg .globl __kvm_hyp_init_end diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 6731d4e1c746..69d5cc2d2e17 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -359,7 +359,8 @@ static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, } void __kvm_hyp_teardown(void); -static inline void __cpu_reset_hyp_mode(phys_addr_t phys_idmap_start) +static inline void __cpu_reset_hyp_mode(unsigned long vector_ptr, + phys_addr_t phys_idmap_start) { kvm_call_hyp(__kvm_hyp_teardown, phys_idmap_start); } From f7bec68d2faed8180d7172cdbd69d99e3cad1387 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2016 18:40:49 +0100 Subject: [PATCH 200/302] arm/arm64: KVM: Prune unused #defines We can now remove a number of dead #defines, thanks to the trampoline code being gone. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_mmu.h | 9 --------- arch/arm64/include/asm/kvm_mmu.h | 10 ---------- 2 files changed, 19 deletions(-) diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index d5fd9fb8550b..73c28180bb63 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -26,17 +26,8 @@ * We directly use the kernel VA for the HYP, as we can directly share * the mapping (HTTBR "covers" TTBR1). */ -#define HYP_PAGE_OFFSET_MASK UL(~0) -#define HYP_PAGE_OFFSET PAGE_OFFSET #define KERN_TO_HYP(kva) (kva) -/* - * Our virtual mapping for the boot-time MMU-enable code. Must be - * shared across all the page-tables. Conveniently, we use the vectors - * page, where no kernel data will ever be shared with HYP. - */ -#define TRAMPOLINE_VA UL(CONFIG_VECTORS_BASE) - /* * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels. */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index b89122ed827d..9226f8be6341 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -72,16 +72,6 @@ #define HYP_PAGE_OFFSET_HIGH_MASK ((UL(1) << VA_BITS) - 1) #define HYP_PAGE_OFFSET_LOW_MASK ((UL(1) << (VA_BITS - 1)) - 1) -/* Temporary compat define */ -#define HYP_PAGE_OFFSET_MASK HYP_PAGE_OFFSET_HIGH_MASK - -/* - * Our virtual mapping for the idmap-ed MMU-enable code. Must be - * shared across all the page-tables. Conveniently, we use the last - * possible page, where no kernel mapping will ever exist. - */ -#define TRAMPOLINE_VA (HYP_PAGE_OFFSET_MASK & PAGE_MASK) - #ifdef __ASSEMBLY__ #include From eac378a9ceb7196b776a965d915e02995fb8ba55 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2016 18:40:50 +0100 Subject: [PATCH 201/302] arm/arm64: KVM: Check that IDMAP doesn't intersect with VA range This is more of a safety measure than anything else: If we end-up with an idmap page that intersect with the range picked for the the HYP VA space, abort the KVM setup, as it is unsafe to go further. I cannot imagine it happening on 64bit (we have a mechanism to work around it), but could potentially occur on a 32bit system with the kernel loaded high enough in memory so that in conflicts with the kernel VA. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/kvm/mmu.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 0b36dd52af62..8a0aa37605c5 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1709,6 +1709,21 @@ int kvm_mmu_init(void) */ BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); + kvm_info("IDMAP page: %lx\n", hyp_idmap_start); + kvm_info("HYP VA range: %lx:%lx\n", + KERN_TO_HYP(PAGE_OFFSET), KERN_TO_HYP(~0UL)); + + if (hyp_idmap_start >= KERN_TO_HYP(PAGE_OFFSET) && + hyp_idmap_start < KERN_TO_HYP(~0UL)) { + /* + * The idmap page is intersecting with the VA space, + * it is not safe to continue further. + */ + kvm_err("IDMAP intersecting with HYP VA, unable to continue\n"); + err = -EINVAL; + goto out; + } + hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order); if (!hyp_pgd) { kvm_err("Hyp mode PGD not allocated\n"); From 6c41a413fd44af8eae2949869d4d57ce681a0c30 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Jun 2016 18:40:51 +0100 Subject: [PATCH 202/302] arm/arm64: Get rid of KERN_TO_HYP We have both KERN_TO_HYP and kern_hyp_va, which do the exact same thing. Let's standardize on the latter. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_hyp.h | 2 -- arch/arm/include/asm/kvm_mmu.h | 2 +- arch/arm/kvm/mmu.c | 18 +++++++++--------- arch/arm64/include/asm/kvm_mmu.h | 1 - 4 files changed, 10 insertions(+), 13 deletions(-) diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h index e38fce2270ac..6eaff28f2ff3 100644 --- a/arch/arm/include/asm/kvm_hyp.h +++ b/arch/arm/include/asm/kvm_hyp.h @@ -25,8 +25,6 @@ #define __hyp_text __section(.hyp.text) notrace -#define kern_hyp_va(v) (v) - #define __ACCESS_CP15(CRn, Op1, CRm, Op2) \ "mrc", "mcr", __stringify(p15, Op1, %0, CRn, CRm, Op2), u32 #define __ACCESS_CP15_64(Op1, CRm) \ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 73c28180bb63..3bb803d6814b 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -26,7 +26,7 @@ * We directly use the kernel VA for the HYP, as we can directly share * the mapping (HTTBR "covers" TTBR1). */ -#define KERN_TO_HYP(kva) (kva) +#define kern_hyp_va(kva) (kva) /* * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels. diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 8a0aa37605c5..bda27b6b1aa2 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -506,9 +506,9 @@ void free_hyp_pgds(void) if (hyp_pgd) { unmap_hyp_range(hyp_pgd, hyp_idmap_start, PAGE_SIZE); for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE) - unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE); + unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE); for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE) - unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE); + unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE); free_pages((unsigned long)hyp_pgd, hyp_pgd_order); hyp_pgd = NULL; @@ -670,8 +670,8 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot) { phys_addr_t phys_addr; unsigned long virt_addr; - unsigned long start = KERN_TO_HYP((unsigned long)from); - unsigned long end = KERN_TO_HYP((unsigned long)to); + unsigned long start = kern_hyp_va((unsigned long)from); + unsigned long end = kern_hyp_va((unsigned long)to); if (is_kernel_in_hyp_mode()) return 0; @@ -705,8 +705,8 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot) */ int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr) { - unsigned long start = KERN_TO_HYP((unsigned long)from); - unsigned long end = KERN_TO_HYP((unsigned long)to); + unsigned long start = kern_hyp_va((unsigned long)from); + unsigned long end = kern_hyp_va((unsigned long)to); if (is_kernel_in_hyp_mode()) return 0; @@ -1711,10 +1711,10 @@ int kvm_mmu_init(void) kvm_info("IDMAP page: %lx\n", hyp_idmap_start); kvm_info("HYP VA range: %lx:%lx\n", - KERN_TO_HYP(PAGE_OFFSET), KERN_TO_HYP(~0UL)); + kern_hyp_va(PAGE_OFFSET), kern_hyp_va(~0UL)); - if (hyp_idmap_start >= KERN_TO_HYP(PAGE_OFFSET) && - hyp_idmap_start < KERN_TO_HYP(~0UL)) { + if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) && + hyp_idmap_start < kern_hyp_va(~0UL)) { /* * The idmap page is intersecting with the VA space, * it is not safe to continue further. diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 9226f8be6341..b6bb83400cd8 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -133,7 +133,6 @@ static inline unsigned long __kern_hyp_va(unsigned long v) } #define kern_hyp_va(v) (typeof(v))(__kern_hyp_va((unsigned long)(v))) -#define KERN_TO_HYP(v) kern_hyp_va(v) /* * We currently only support a 40bit IPA. From 5ffe466cd3a33543306c37a0789e2116286367f1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 24 May 2016 12:10:27 +0200 Subject: [PATCH 203/302] KVM: s390: inject PER i-fetch events on applicable icpts In case we have to emuluate an instruction or part of it (instruction, partial instruction, operation exception), we have to inject a PER instruction-fetching event for that instruction, if hardware told us to do so. In case we retry an instruction, we must not inject the PER event. Please note that we don't filter the events properly yet, so guest debugging will be visible for the guest. Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/guestdbg.c | 17 +++++++++++++++++ arch/s390/kvm/intercept.c | 17 ++++++++++++++--- arch/s390/kvm/kvm-s390.h | 3 +++ 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index 1e0849e20965..31a05330d11c 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -439,6 +439,23 @@ static int debug_exit_required(struct kvm_vcpu *vcpu) #define guest_per_enabled(vcpu) \ (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) +int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu) +{ + const u8 ilen = kvm_s390_get_ilen(vcpu); + struct kvm_s390_pgm_info pgm_info = { + .code = PGM_PER, + .per_code = PER_EVENT_IFETCH >> 24, + .per_address = __rewind_psw(vcpu->arch.sie_block->gpsw, ilen), + }; + + /* + * The PSW points to the next instruction, therefore the intercepted + * instruction generated a PER i-fetch event. PER address therefore + * points at the previous PSW address (could be an EXECUTE function). + */ + return kvm_s390_inject_prog_irq(vcpu, &pgm_info); +} + static void filter_guest_per_event(struct kvm_vcpu *vcpu) { u32 perc = vcpu->arch.sie_block->perc << 24; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 9359f65c8634..850be47c4cc9 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -364,6 +364,8 @@ static int handle_operexc(struct kvm_vcpu *vcpu) int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) { + int rc, per_rc = 0; + if (kvm_is_ucontrol(vcpu->kvm)) return -EOPNOTSUPP; @@ -372,7 +374,8 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) case 0x18: return handle_noop(vcpu); case 0x04: - return handle_instruction(vcpu); + rc = handle_instruction(vcpu); + break; case 0x08: return handle_prog(vcpu); case 0x14: @@ -384,10 +387,18 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) case 0x28: return handle_stop(vcpu); case 0x2c: - return handle_operexc(vcpu); + rc = handle_operexc(vcpu); + break; case 0x38: - return handle_partial_execution(vcpu); + rc = handle_partial_execution(vcpu); + break; default: return -EOPNOTSUPP; } + + /* process PER, also if the instrution is processed in user space */ + if (vcpu->arch.sie_block->icptstatus & 0x02 && + (!rc || rc == -EOPNOTSUPP)) + per_rc = kvm_s390_handle_per_ifetch_icpt(vcpu); + return per_rc ? per_rc : rc; } diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 031f451bb2cf..b8432862a817 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -238,6 +238,8 @@ static inline void kvm_s390_forward_psw(struct kvm_vcpu *vcpu, int ilen) } static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu) { + /* don't inject PER events if we re-execute the instruction */ + vcpu->arch.sie_block->icptstatus &= ~0x02; kvm_s390_rewind_psw(vcpu, kvm_s390_get_ilen(vcpu)); } @@ -377,6 +379,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg); void kvm_s390_clear_bp_data(struct kvm_vcpu *vcpu); void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu); +int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu); void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu); /* support for Basic/Extended SCA handling */ From 92176a8ede577d0ff78ab3298e06701f67ad5f51 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 7 Jun 2016 16:22:47 +0200 Subject: [PATCH 204/302] KVM: MMU: prepare to support mapping of VM_IO and VM_PFNMAP frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Handle VM_IO like VM_PFNMAP, as is common in the rest of Linux; extract the formula to convert hva->pfn into a new function, which will soon gain more capabilities. Cc: Xiao Guangrong Cc: Andrea Arcangeli Cc: Radim Krčmář Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ef54b4c31792..5aae59e00bef 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1442,6 +1442,16 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) return true; } +static int hva_to_pfn_remapped(struct vm_area_struct *vma, + unsigned long addr, bool *async, + bool write_fault, kvm_pfn_t *p_pfn) +{ + *p_pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + + vma->vm_pgoff; + BUG_ON(!kvm_is_reserved_pfn(*p_pfn)); + return 0; +} + /* * Pin guest page in memory and return its pfn. * @addr: host virtual address which maps memory to the guest @@ -1461,7 +1471,7 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, { struct vm_area_struct *vma; kvm_pfn_t pfn = 0; - int npages; + int npages, r; /* we can do it either atomically or asynchronously, not both */ BUG_ON(atomic && async); @@ -1487,10 +1497,10 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, if (vma == NULL) pfn = KVM_PFN_ERR_FAULT; - else if ((vma->vm_flags & VM_PFNMAP)) { - pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + - vma->vm_pgoff; - BUG_ON(!kvm_is_reserved_pfn(pfn)); + else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { + r = hva_to_pfn_remapped(vma, addr, async, write_fault, &pfn); + if (r < 0) + pfn = KVM_PFN_ERR_FAULT; } else { if (async && vma_is_valid(vma, write_fault)) *async = true; From add6a0cd1c5ba51b201e1361b05a5df817083618 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 7 Jun 2016 17:51:18 +0200 Subject: [PATCH 205/302] KVM: MMU: try to fix up page faults before giving up MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The vGPU folks would like to trap the first access to a BAR by setting vm_ops on the VMAs produced by mmap-ing a VFIO device. The fault handler then can use remap_pfn_range to place some non-reserved pages in the VMA. This kind of VM_PFNMAP mapping is not handled by KVM, but follow_pfn and fixup_user_fault together help supporting it. The patch also supports VM_MIXEDMAP vmas where the pfns are not reserved and thus subject to reference counting. Cc: Xiao Guangrong Cc: Andrea Arcangeli Cc: Radim Krčmář Tested-by: Neo Jia Reported-by: Kirti Wankhede Signed-off-by: Paolo Bonzini --- mm/gup.c | 1 + virt/kvm/kvm_main.c | 45 ++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index c057784c8444..e3ac22f90fa4 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -720,6 +720,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, } return 0; } +EXPORT_SYMBOL_GPL(fixup_user_fault); static __always_inline long __get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5aae59e00bef..154b9ab459b0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1446,9 +1446,45 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, unsigned long addr, bool *async, bool write_fault, kvm_pfn_t *p_pfn) { - *p_pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + - vma->vm_pgoff; - BUG_ON(!kvm_is_reserved_pfn(*p_pfn)); + unsigned long pfn; + int r; + + r = follow_pfn(vma, addr, &pfn); + if (r) { + /* + * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does + * not call the fault handler, so do it here. + */ + bool unlocked = false; + r = fixup_user_fault(current, current->mm, addr, + (write_fault ? FAULT_FLAG_WRITE : 0), + &unlocked); + if (unlocked) + return -EAGAIN; + if (r) + return r; + + r = follow_pfn(vma, addr, &pfn); + if (r) + return r; + + } + + + /* + * Get a reference here because callers of *hva_to_pfn* and + * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the + * returned pfn. This is only needed if the VMA has VM_MIXEDMAP + * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will + * simply do nothing for reserved pfns. + * + * Whoever called remap_pfn_range is also going to call e.g. + * unmap_mapping_range before the underlying pages are freed, + * causing a call to our MMU notifier. + */ + kvm_get_pfn(pfn); + + *p_pfn = pfn; return 0; } @@ -1493,12 +1529,15 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, goto exit; } +retry: vma = find_vma_intersection(current->mm, addr, addr + 1); if (vma == NULL) pfn = KVM_PFN_ERR_FAULT; else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { r = hva_to_pfn_remapped(vma, addr, async, write_fault, &pfn); + if (r == -EAGAIN) + goto retry; if (r < 0) pfn = KVM_PFN_ERR_FAULT; } else { From 03f6a22a3979f20575c39ff86c79b3fa3b7f469d Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 4 Jul 2016 15:13:07 +0000 Subject: [PATCH 206/302] KVM: x86: Use ARRAY_SIZE instead of dividing sizeof array with sizeof an element Use ARRAY_SIZE instead of dividing sizeof array with sizeof an element Signed-off-by: Wei Yongjun Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 85e2f0a882ca..e564fa2c7ac8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1112,7 +1112,7 @@ static inline bool cpu_has_broken_vmx_preemption_timer(void) /* Clear the reserved bits */ eax &= ~(0x3U << 14 | 0xfU << 28); - for (i = 0; i < sizeof(vmx_preemption_cpu_tfms)/sizeof(u32); i++) + for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) if (eax == vmx_preemption_cpu_tfms[i]) return true; From c29732a179c2ed0cb9f001a8dc07dcf432389313 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 23 Jun 2016 17:34:34 +0100 Subject: [PATCH 207/302] MIPS: uasm: Add CFC1/CTC1 instructions Add CFC1/CTC1 instructions for accessing FP control registers to uasm so that KVM can use uasm for generating its entry point code at runtime. Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: linux-mips@linux-mips.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/uasm.h | 2 ++ arch/mips/mm/uasm-micromips.c | 8 ++++++-- arch/mips/mm/uasm-mips.c | 2 ++ arch/mips/mm/uasm.c | 26 ++++++++++++++------------ 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/arch/mips/include/asm/uasm.h b/arch/mips/include/asm/uasm.h index b6ecfeee4dbe..3153ada46e9a 100644 --- a/arch/mips/include/asm/uasm.h +++ b/arch/mips/include/asm/uasm.h @@ -104,6 +104,8 @@ Ip_u1s2(_bltz); Ip_u1s2(_bltzl); Ip_u1u2s3(_bne); Ip_u2s3u1(_cache); +Ip_u1u2(_cfc1); +Ip_u1u2(_ctc1); Ip_u2u1s3(_daddiu); Ip_u3u1u2(_daddu); Ip_u2u1msbu3(_dins); diff --git a/arch/mips/mm/uasm-micromips.c b/arch/mips/mm/uasm-micromips.c index d78178daea4b..8b1acb2f6b8b 100644 --- a/arch/mips/mm/uasm-micromips.c +++ b/arch/mips/mm/uasm-micromips.c @@ -53,6 +53,8 @@ static struct insn insn_table_MM[] = { { insn_bltzl, 0, 0 }, { insn_bne, M(mm_bne32_op, 0, 0, 0, 0, 0), RT | RS | BIMM }, { insn_cache, M(mm_pool32b_op, 0, 0, mm_cache_func, 0, 0), RT | RS | SIMM }, + { insn_cfc1, M(mm_pool32f_op, 0, 0, 0, mm_cfc1_op, mm_32f_73_op), RT | RS }, + { insn_ctc1, M(mm_pool32f_op, 0, 0, 0, mm_ctc1_op, mm_32f_73_op), RT | RS }, { insn_daddu, 0, 0 }, { insn_daddiu, 0, 0 }, { insn_divu, M(mm_pool32a_op, 0, 0, 0, mm_divu_op, mm_pool32axf_op), RT | RS }, @@ -166,13 +168,15 @@ static void build_insn(u32 **buf, enum opcode opc, ...) op = ip->match; va_start(ap, opc); if (ip->fields & RS) { - if (opc == insn_mfc0 || opc == insn_mtc0) + if (opc == insn_mfc0 || opc == insn_mtc0 || + opc == insn_cfc1 || opc == insn_ctc1) op |= build_rt(va_arg(ap, u32)); else op |= build_rs(va_arg(ap, u32)); } if (ip->fields & RT) { - if (opc == insn_mfc0 || opc == insn_mtc0) + if (opc == insn_mfc0 || opc == insn_mtc0 || + opc == insn_cfc1 || opc == insn_ctc1) op |= build_rs(va_arg(ap, u32)); else op |= build_rt(va_arg(ap, u32)); diff --git a/arch/mips/mm/uasm-mips.c b/arch/mips/mm/uasm-mips.c index 9c2220a45189..5152544962c3 100644 --- a/arch/mips/mm/uasm-mips.c +++ b/arch/mips/mm/uasm-mips.c @@ -67,6 +67,8 @@ static struct insn insn_table[] = { #else { insn_cache, M6(cache_op, 0, 0, 0, cache6_op), RS | RT | SIMM9 }, #endif + { insn_cfc1, M(cop1_op, cfc_op, 0, 0, 0, 0), RT | RD }, + { insn_ctc1, M(cop1_op, ctc_op, 0, 0, 0, 0), RT | RD }, { insn_daddiu, M(daddiu_op, 0, 0, 0, 0, 0), RS | RT | SIMM }, { insn_daddu, M(spec_op, 0, 0, 0, 0, daddu_op), RS | RT | RD }, { insn_dinsm, M(spec3_op, 0, 0, 0, 0, dinsm_op), RS | RT | RD | RE }, diff --git a/arch/mips/mm/uasm.c b/arch/mips/mm/uasm.c index ad718debc35a..4731893db3f7 100644 --- a/arch/mips/mm/uasm.c +++ b/arch/mips/mm/uasm.c @@ -49,18 +49,18 @@ enum opcode { insn_invalid, insn_addiu, insn_addu, insn_and, insn_andi, insn_bbit0, insn_bbit1, insn_beq, insn_beql, insn_bgez, insn_bgezl, insn_bltz, insn_bltzl, - insn_bne, insn_cache, insn_daddiu, insn_daddu, insn_dins, insn_dinsm, - insn_divu, insn_dmfc0, insn_dmtc0, insn_drotr, insn_drotr32, insn_dsll, - insn_dsll32, insn_dsra, insn_dsrl, insn_dsrl32, insn_dsubu, insn_eret, - insn_ext, insn_ins, insn_j, insn_jal, insn_jalr, insn_jr, insn_lb, - insn_ld, insn_ldx, insn_lh, insn_ll, insn_lld, insn_lui, insn_lw, - insn_lwx, insn_mfc0, insn_mfhc0, insn_mfhi, insn_mflo, insn_mtc0, - insn_mthc0, insn_mul, insn_or, insn_ori, insn_pref, insn_rfe, - insn_rotr, insn_sc, insn_scd, insn_sd, insn_sll, insn_sllv, insn_slt, - insn_sltiu, insn_sltu, insn_sra, insn_srl, insn_srlv, insn_subu, - insn_sw, insn_sync, insn_syscall, insn_tlbp, insn_tlbr, insn_tlbwi, - insn_tlbwr, insn_wait, insn_wsbh, insn_xor, insn_xori, insn_yield, - insn_lddir, insn_ldpte, + insn_bne, insn_cache, insn_cfc1, insn_ctc1, insn_daddiu, insn_daddu, + insn_dins, insn_dinsm, insn_divu, insn_dmfc0, insn_dmtc0, insn_drotr, + insn_drotr32, insn_dsll, insn_dsll32, insn_dsra, insn_dsrl, insn_dsrl32, + insn_dsubu, insn_eret, insn_ext, insn_ins, insn_j, insn_jal, insn_jalr, + insn_jr, insn_lb, insn_ld, insn_ldx, insn_lh, insn_ll, insn_lld, + insn_lui, insn_lw, insn_lwx, insn_mfc0, insn_mfhc0, insn_mfhi, + insn_mflo, insn_mtc0, insn_mthc0, insn_mul, insn_or, insn_ori, + insn_pref, insn_rfe, insn_rotr, insn_sc, insn_scd, insn_sd, insn_sll, + insn_sllv, insn_slt, insn_sltiu, insn_sltu, insn_sra, insn_srl, + insn_srlv, insn_subu, insn_sw, insn_sync, insn_syscall, insn_tlbp, + insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait, insn_wsbh, insn_xor, + insn_xori, insn_yield, insn_lddir, insn_ldpte, }; struct insn { @@ -268,6 +268,8 @@ I_u1s2(_bltz) I_u1s2(_bltzl) I_u1u2s3(_bne) I_u2s3u1(_cache) +I_u1u2(_cfc1) +I_u1u2(_ctc1) I_u1u2u3(_dmfc0) I_u1u2u3(_dmtc0) I_u2u1s3(_daddiu) From 59e3559f48dcad3051f60c32775e028cd999ae53 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 23 Jun 2016 17:34:35 +0100 Subject: [PATCH 208/302] MIPS: uasm: Add CFCMSA/CTCMSA instructions Add CFCMSA/CTCMSA instructions for accessing MSA control registers to uasm so that KVM can use uasm for generating its entry point code at runtime. Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: linux-mips@linux-mips.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/uasm.h | 2 ++ arch/mips/include/uapi/asm/inst.h | 24 +++++++++++++++++++++++- arch/mips/mm/uasm-micromips.c | 2 ++ arch/mips/mm/uasm-mips.c | 2 ++ arch/mips/mm/uasm.c | 26 ++++++++++++++------------ 5 files changed, 43 insertions(+), 13 deletions(-) diff --git a/arch/mips/include/asm/uasm.h b/arch/mips/include/asm/uasm.h index 3153ada46e9a..edc02687016e 100644 --- a/arch/mips/include/asm/uasm.h +++ b/arch/mips/include/asm/uasm.h @@ -105,7 +105,9 @@ Ip_u1s2(_bltzl); Ip_u1u2s3(_bne); Ip_u2s3u1(_cache); Ip_u1u2(_cfc1); +Ip_u2u1(_cfcmsa); Ip_u1u2(_ctc1); +Ip_u2u1(_ctcmsa); Ip_u2u1s3(_daddiu); Ip_u3u1u2(_daddu); Ip_u2u1msbu3(_dins); diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h index a1ebf973725c..2e624dd058ef 100644 --- a/arch/mips/include/uapi/asm/inst.h +++ b/arch/mips/include/uapi/asm/inst.h @@ -237,6 +237,21 @@ enum bshfl_func { seh_op = 0x18, }; +/* + * MSA minor opcodes. + */ +enum msa_func { + msa_elm_op = 0x19, +}; + +/* + * MSA ELM opcodes. + */ +enum msa_elm { + msa_ctc_op = 0x3e, + msa_cfc_op = 0x7e, +}; + /* * func field for MSA MI10 format. */ @@ -264,7 +279,7 @@ enum mm_major_op { mm_pool32b_op, mm_pool16b_op, mm_lhu16_op, mm_andi16_op, mm_addiu32_op, mm_lhu32_op, mm_sh32_op, mm_lh32_op, mm_pool32i_op, mm_pool16c_op, mm_lwsp16_op, mm_pool16d_op, - mm_ori32_op, mm_pool32f_op, mm_reserved1_op, mm_reserved2_op, + mm_ori32_op, mm_pool32f_op, mm_pool32s_op, mm_reserved2_op, mm_pool32c_op, mm_lwgp16_op, mm_lw16_op, mm_pool16e_op, mm_xori32_op, mm_jals32_op, mm_addiupc_op, mm_reserved3_op, mm_reserved4_op, mm_pool16f_op, mm_sb16_op, mm_beqz16_op, @@ -478,6 +493,13 @@ enum mm_32f_73_minor_op { mm_fcvts1_op = 0xed, }; +/* + * (microMIPS) POOL32S minor opcodes. + */ +enum mm_32s_minor_op { + mm_32s_elm_op = 0x16, +}; + /* * (microMIPS) POOL16C minor opcodes. */ diff --git a/arch/mips/mm/uasm-micromips.c b/arch/mips/mm/uasm-micromips.c index 8b1acb2f6b8b..eba5018961eb 100644 --- a/arch/mips/mm/uasm-micromips.c +++ b/arch/mips/mm/uasm-micromips.c @@ -54,7 +54,9 @@ static struct insn insn_table_MM[] = { { insn_bne, M(mm_bne32_op, 0, 0, 0, 0, 0), RT | RS | BIMM }, { insn_cache, M(mm_pool32b_op, 0, 0, mm_cache_func, 0, 0), RT | RS | SIMM }, { insn_cfc1, M(mm_pool32f_op, 0, 0, 0, mm_cfc1_op, mm_32f_73_op), RT | RS }, + { insn_cfcmsa, M(mm_pool32s_op, 0, msa_cfc_op, 0, 0, mm_32s_elm_op), RD | RE }, { insn_ctc1, M(mm_pool32f_op, 0, 0, 0, mm_ctc1_op, mm_32f_73_op), RT | RS }, + { insn_ctcmsa, M(mm_pool32s_op, 0, msa_ctc_op, 0, 0, mm_32s_elm_op), RD | RE }, { insn_daddu, 0, 0 }, { insn_daddiu, 0, 0 }, { insn_divu, M(mm_pool32a_op, 0, 0, 0, mm_divu_op, mm_pool32axf_op), RT | RS }, diff --git a/arch/mips/mm/uasm-mips.c b/arch/mips/mm/uasm-mips.c index 5152544962c3..9f77783f23a6 100644 --- a/arch/mips/mm/uasm-mips.c +++ b/arch/mips/mm/uasm-mips.c @@ -68,7 +68,9 @@ static struct insn insn_table[] = { { insn_cache, M6(cache_op, 0, 0, 0, cache6_op), RS | RT | SIMM9 }, #endif { insn_cfc1, M(cop1_op, cfc_op, 0, 0, 0, 0), RT | RD }, + { insn_cfcmsa, M(msa_op, 0, msa_cfc_op, 0, 0, msa_elm_op), RD | RE }, { insn_ctc1, M(cop1_op, ctc_op, 0, 0, 0, 0), RT | RD }, + { insn_ctcmsa, M(msa_op, 0, msa_ctc_op, 0, 0, msa_elm_op), RD | RE }, { insn_daddiu, M(daddiu_op, 0, 0, 0, 0, 0), RS | RT | SIMM }, { insn_daddu, M(spec_op, 0, 0, 0, 0, daddu_op), RS | RT | RD }, { insn_dinsm, M(spec3_op, 0, 0, 0, 0, dinsm_op), RS | RT | RD | RE }, diff --git a/arch/mips/mm/uasm.c b/arch/mips/mm/uasm.c index 4731893db3f7..3affd08a262b 100644 --- a/arch/mips/mm/uasm.c +++ b/arch/mips/mm/uasm.c @@ -49,18 +49,18 @@ enum opcode { insn_invalid, insn_addiu, insn_addu, insn_and, insn_andi, insn_bbit0, insn_bbit1, insn_beq, insn_beql, insn_bgez, insn_bgezl, insn_bltz, insn_bltzl, - insn_bne, insn_cache, insn_cfc1, insn_ctc1, insn_daddiu, insn_daddu, - insn_dins, insn_dinsm, insn_divu, insn_dmfc0, insn_dmtc0, insn_drotr, - insn_drotr32, insn_dsll, insn_dsll32, insn_dsra, insn_dsrl, insn_dsrl32, - insn_dsubu, insn_eret, insn_ext, insn_ins, insn_j, insn_jal, insn_jalr, - insn_jr, insn_lb, insn_ld, insn_ldx, insn_lh, insn_ll, insn_lld, - insn_lui, insn_lw, insn_lwx, insn_mfc0, insn_mfhc0, insn_mfhi, - insn_mflo, insn_mtc0, insn_mthc0, insn_mul, insn_or, insn_ori, - insn_pref, insn_rfe, insn_rotr, insn_sc, insn_scd, insn_sd, insn_sll, - insn_sllv, insn_slt, insn_sltiu, insn_sltu, insn_sra, insn_srl, - insn_srlv, insn_subu, insn_sw, insn_sync, insn_syscall, insn_tlbp, - insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait, insn_wsbh, insn_xor, - insn_xori, insn_yield, insn_lddir, insn_ldpte, + insn_bne, insn_cache, insn_cfc1, insn_cfcmsa, insn_ctc1, insn_ctcmsa, + insn_daddiu, insn_daddu, insn_dins, insn_dinsm, insn_divu, insn_dmfc0, + insn_dmtc0, insn_drotr, insn_drotr32, insn_dsll, insn_dsll32, insn_dsra, + insn_dsrl, insn_dsrl32, insn_dsubu, insn_eret, insn_ext, insn_ins, + insn_j, insn_jal, insn_jalr, insn_jr, insn_lb, insn_ld, insn_ldx, + insn_lh, insn_ll, insn_lld, insn_lui, insn_lw, insn_lwx, insn_mfc0, + insn_mfhc0, insn_mfhi, insn_mflo, insn_mtc0, insn_mthc0, insn_mul, + insn_or, insn_ori, insn_pref, insn_rfe, insn_rotr, insn_sc, insn_scd, + insn_sd, insn_sll, insn_sllv, insn_slt, insn_sltiu, insn_sltu, insn_sra, + insn_srl, insn_srlv, insn_subu, insn_sw, insn_sync, insn_syscall, + insn_tlbp, insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait, insn_wsbh, + insn_xor, insn_xori, insn_yield, insn_lddir, insn_ldpte, }; struct insn { @@ -269,7 +269,9 @@ I_u1s2(_bltzl) I_u1u2s3(_bne) I_u2s3u1(_cache) I_u1u2(_cfc1) +I_u2u1(_cfcmsa) I_u1u2(_ctc1) +I_u2u1(_ctcmsa) I_u1u2u3(_dmfc0) I_u1u2u3(_dmtc0) I_u2u1s3(_daddiu) From 61c64cf99ae589af3835dbc9bb57200d4a4842ae Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 23 Jun 2016 17:34:36 +0100 Subject: [PATCH 209/302] MIPS: uasm: Add DI instruction Add DI instruction for disabling interrupts to uasm so that KVM can use uasm for generating its entry point code at runtime. Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: linux-mips@linux-mips.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/uasm.h | 1 + arch/mips/include/uapi/asm/inst.h | 1 + arch/mips/mm/uasm-micromips.c | 1 + arch/mips/mm/uasm-mips.c | 1 + arch/mips/mm/uasm.c | 23 ++++++++++++----------- 5 files changed, 16 insertions(+), 11 deletions(-) diff --git a/arch/mips/include/asm/uasm.h b/arch/mips/include/asm/uasm.h index edc02687016e..4af8a5becbbb 100644 --- a/arch/mips/include/asm/uasm.h +++ b/arch/mips/include/asm/uasm.h @@ -110,6 +110,7 @@ Ip_u1u2(_ctc1); Ip_u2u1(_ctcmsa); Ip_u2u1s3(_daddiu); Ip_u3u1u2(_daddu); +Ip_u1(_di); Ip_u2u1msbu3(_dins); Ip_u2u1msbu3(_dinsm); Ip_u1u2(_divu); diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h index 2e624dd058ef..7010d0b7b752 100644 --- a/arch/mips/include/uapi/asm/inst.h +++ b/arch/mips/include/uapi/asm/inst.h @@ -376,6 +376,7 @@ enum mm_32axf_minor_op { mm_jalrhb_op = 0x07c, mm_tlbwi_op = 0x08d, mm_tlbwr_op = 0x0cd, + mm_di_op = 0x11d, mm_jalrs_op = 0x13c, mm_jalrshb_op = 0x17c, mm_sync_op = 0x1ad, diff --git a/arch/mips/mm/uasm-micromips.c b/arch/mips/mm/uasm-micromips.c index eba5018961eb..40bef28f192c 100644 --- a/arch/mips/mm/uasm-micromips.c +++ b/arch/mips/mm/uasm-micromips.c @@ -59,6 +59,7 @@ static struct insn insn_table_MM[] = { { insn_ctcmsa, M(mm_pool32s_op, 0, msa_ctc_op, 0, 0, mm_32s_elm_op), RD | RE }, { insn_daddu, 0, 0 }, { insn_daddiu, 0, 0 }, + { insn_di, M(mm_pool32a_op, 0, 0, 0, mm_di_op, mm_pool32axf_op), RS }, { insn_divu, M(mm_pool32a_op, 0, 0, 0, mm_divu_op, mm_pool32axf_op), RT | RS }, { insn_dmfc0, 0, 0 }, { insn_dmtc0, 0, 0 }, diff --git a/arch/mips/mm/uasm-mips.c b/arch/mips/mm/uasm-mips.c index 9f77783f23a6..2b7d85b8241f 100644 --- a/arch/mips/mm/uasm-mips.c +++ b/arch/mips/mm/uasm-mips.c @@ -74,6 +74,7 @@ static struct insn insn_table[] = { { insn_daddiu, M(daddiu_op, 0, 0, 0, 0, 0), RS | RT | SIMM }, { insn_daddu, M(spec_op, 0, 0, 0, 0, daddu_op), RS | RT | RD }, { insn_dinsm, M(spec3_op, 0, 0, 0, 0, dinsm_op), RS | RT | RD | RE }, + { insn_di, M(cop0_op, mfmc0_op, 0, 12, 0, 0), RT }, { insn_dins, M(spec3_op, 0, 0, 0, 0, dins_op), RS | RT | RD | RE }, { insn_divu, M(spec_op, 0, 0, 0, 0, divu_op), RS | RT }, { insn_dmfc0, M(cop0_op, dmfc_op, 0, 0, 0, 0), RT | RD | SET}, diff --git a/arch/mips/mm/uasm.c b/arch/mips/mm/uasm.c index 3affd08a262b..006fb05b74a7 100644 --- a/arch/mips/mm/uasm.c +++ b/arch/mips/mm/uasm.c @@ -50,17 +50,17 @@ enum opcode { insn_addiu, insn_addu, insn_and, insn_andi, insn_bbit0, insn_bbit1, insn_beq, insn_beql, insn_bgez, insn_bgezl, insn_bltz, insn_bltzl, insn_bne, insn_cache, insn_cfc1, insn_cfcmsa, insn_ctc1, insn_ctcmsa, - insn_daddiu, insn_daddu, insn_dins, insn_dinsm, insn_divu, insn_dmfc0, - insn_dmtc0, insn_drotr, insn_drotr32, insn_dsll, insn_dsll32, insn_dsra, - insn_dsrl, insn_dsrl32, insn_dsubu, insn_eret, insn_ext, insn_ins, - insn_j, insn_jal, insn_jalr, insn_jr, insn_lb, insn_ld, insn_ldx, - insn_lh, insn_ll, insn_lld, insn_lui, insn_lw, insn_lwx, insn_mfc0, - insn_mfhc0, insn_mfhi, insn_mflo, insn_mtc0, insn_mthc0, insn_mul, - insn_or, insn_ori, insn_pref, insn_rfe, insn_rotr, insn_sc, insn_scd, - insn_sd, insn_sll, insn_sllv, insn_slt, insn_sltiu, insn_sltu, insn_sra, - insn_srl, insn_srlv, insn_subu, insn_sw, insn_sync, insn_syscall, - insn_tlbp, insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait, insn_wsbh, - insn_xor, insn_xori, insn_yield, insn_lddir, insn_ldpte, + insn_daddiu, insn_daddu, insn_di, insn_dins, insn_dinsm, insn_divu, + insn_dmfc0, insn_dmtc0, insn_drotr, insn_drotr32, insn_dsll, + insn_dsll32, insn_dsra, insn_dsrl, insn_dsrl32, insn_dsubu, insn_eret, + insn_ext, insn_ins, insn_j, insn_jal, insn_jalr, insn_jr, insn_lb, + insn_ld, insn_ldx, insn_lh, insn_ll, insn_lld, insn_lui, insn_lw, + insn_lwx, insn_mfc0, insn_mfhc0, insn_mfhi, insn_mflo, insn_mtc0, + insn_mthc0, insn_mul, insn_or, insn_ori, insn_pref, insn_rfe, insn_rotr, + insn_sc, insn_scd, insn_sd, insn_sll, insn_sllv, insn_slt, insn_sltiu, + insn_sltu, insn_sra, insn_srl, insn_srlv, insn_subu, insn_sw, insn_sync, + insn_syscall, insn_tlbp, insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait, + insn_wsbh, insn_xor, insn_xori, insn_yield, insn_lddir, insn_ldpte, }; struct insn { @@ -276,6 +276,7 @@ I_u1u2u3(_dmfc0) I_u1u2u3(_dmtc0) I_u2u1s3(_daddiu) I_u3u1u2(_daddu) +I_u1(_di); I_u1u2(_divu) I_u2u1u3(_dsll) I_u2u1u3(_dsll32) From 9f730a60e5a046230cff8c9f4c8eb73f6dca7d81 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 23 Jun 2016 17:34:37 +0100 Subject: [PATCH 210/302] MIPS: uasm: Add MTHI/MTLO instructions Add MTHI/MTLO instructions for writing to the hi & lo registers to uasm so that KVM can use uasm for generating its entry point code at runtime. Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: linux-mips@linux-mips.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/uasm.h | 2 ++ arch/mips/include/uapi/asm/inst.h | 2 ++ arch/mips/mm/uasm-micromips.c | 2 ++ arch/mips/mm/uasm-mips.c | 2 ++ arch/mips/mm/uasm.c | 13 ++++++++----- 5 files changed, 16 insertions(+), 5 deletions(-) diff --git a/arch/mips/include/asm/uasm.h b/arch/mips/include/asm/uasm.h index 4af8a5becbbb..f7929f65f7ca 100644 --- a/arch/mips/include/asm/uasm.h +++ b/arch/mips/include/asm/uasm.h @@ -146,6 +146,8 @@ Ip_u1(_mfhi); Ip_u1(_mflo); Ip_u1u2u3(_mtc0); Ip_u1u2u3(_mthc0); +Ip_u1(_mthi); +Ip_u1(_mtlo); Ip_u3u1u2(_mul); Ip_u3u1u2(_or); Ip_u2u1u3(_ori); diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h index 7010d0b7b752..6319c5037e66 100644 --- a/arch/mips/include/uapi/asm/inst.h +++ b/arch/mips/include/uapi/asm/inst.h @@ -375,7 +375,9 @@ enum mm_32axf_minor_op { mm_mflo32_op = 0x075, mm_jalrhb_op = 0x07c, mm_tlbwi_op = 0x08d, + mm_mthi32_op = 0x0b5, mm_tlbwr_op = 0x0cd, + mm_mtlo32_op = 0x0f5, mm_di_op = 0x11d, mm_jalrs_op = 0x13c, mm_jalrshb_op = 0x17c, diff --git a/arch/mips/mm/uasm-micromips.c b/arch/mips/mm/uasm-micromips.c index 40bef28f192c..277cf52d80e1 100644 --- a/arch/mips/mm/uasm-micromips.c +++ b/arch/mips/mm/uasm-micromips.c @@ -89,6 +89,8 @@ static struct insn insn_table_MM[] = { { insn_mfhi, M(mm_pool32a_op, 0, 0, 0, mm_mfhi32_op, mm_pool32axf_op), RS }, { insn_mflo, M(mm_pool32a_op, 0, 0, 0, mm_mflo32_op, mm_pool32axf_op), RS }, { insn_mtc0, M(mm_pool32a_op, 0, 0, 0, mm_mtc0_op, mm_pool32axf_op), RT | RS | RD }, + { insn_mthi, M(mm_pool32a_op, 0, 0, 0, mm_mthi32_op, mm_pool32axf_op), RS }, + { insn_mtlo, M(mm_pool32a_op, 0, 0, 0, mm_mtlo32_op, mm_pool32axf_op), RS }, { insn_mul, M(mm_pool32a_op, 0, 0, 0, 0, mm_mul_op), RT | RS | RD }, { insn_or, M(mm_pool32a_op, 0, 0, 0, 0, mm_or32_op), RT | RS | RD }, { insn_ori, M(mm_ori32_op, 0, 0, 0, 0, 0), RT | RS | UIMM }, diff --git a/arch/mips/mm/uasm-mips.c b/arch/mips/mm/uasm-mips.c index 2b7d85b8241f..86a3c76a1ad8 100644 --- a/arch/mips/mm/uasm-mips.c +++ b/arch/mips/mm/uasm-mips.c @@ -119,6 +119,8 @@ static struct insn insn_table[] = { { insn_mflo, M(spec_op, 0, 0, 0, 0, mflo_op), RD }, { insn_mtc0, M(cop0_op, mtc_op, 0, 0, 0, 0), RT | RD | SET}, { insn_mthc0, M(cop0_op, mthc0_op, 0, 0, 0, 0), RT | RD | SET}, + { insn_mthi, M(spec_op, 0, 0, 0, 0, mthi_op), RS }, + { insn_mtlo, M(spec_op, 0, 0, 0, 0, mtlo_op), RS }, { insn_mul, M(spec2_op, 0, 0, 0, 0, mul_op), RS | RT | RD}, { insn_ori, M(ori_op, 0, 0, 0, 0, 0), RS | RT | UIMM }, { insn_or, M(spec_op, 0, 0, 0, 0, or_op), RS | RT | RD }, diff --git a/arch/mips/mm/uasm.c b/arch/mips/mm/uasm.c index 006fb05b74a7..3e0282d301d6 100644 --- a/arch/mips/mm/uasm.c +++ b/arch/mips/mm/uasm.c @@ -56,11 +56,12 @@ enum opcode { insn_ext, insn_ins, insn_j, insn_jal, insn_jalr, insn_jr, insn_lb, insn_ld, insn_ldx, insn_lh, insn_ll, insn_lld, insn_lui, insn_lw, insn_lwx, insn_mfc0, insn_mfhc0, insn_mfhi, insn_mflo, insn_mtc0, - insn_mthc0, insn_mul, insn_or, insn_ori, insn_pref, insn_rfe, insn_rotr, - insn_sc, insn_scd, insn_sd, insn_sll, insn_sllv, insn_slt, insn_sltiu, - insn_sltu, insn_sra, insn_srl, insn_srlv, insn_subu, insn_sw, insn_sync, - insn_syscall, insn_tlbp, insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait, - insn_wsbh, insn_xor, insn_xori, insn_yield, insn_lddir, insn_ldpte, + insn_mthc0, insn_mthi, insn_mtlo, insn_mul, insn_or, insn_ori, + insn_pref, insn_rfe, insn_rotr, insn_sc, insn_scd, insn_sd, insn_sll, + insn_sllv, insn_slt, insn_sltiu, insn_sltu, insn_sra, insn_srl, + insn_srlv, insn_subu, insn_sw, insn_sync, insn_syscall, insn_tlbp, + insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait, insn_wsbh, insn_xor, + insn_xori, insn_yield, insn_lddir, insn_ldpte, }; struct insn { @@ -306,6 +307,8 @@ I_u1(_mfhi) I_u1(_mflo) I_u1u2u3(_mtc0) I_u1u2u3(_mthc0) +I_u1(_mthi) +I_u1(_mtlo) I_u3u1u2(_mul) I_u2u1u3(_ori) I_u3u1u2(_or) From 6f63405cb67bc4424cd7cada11783dcef0f8b3c2 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 23 Jun 2016 17:34:38 +0100 Subject: [PATCH 211/302] MIPS: uasm: Add r6 MUL encoding Add the R6 MUL instruction encoding for 3 operand signed multiply to uasm so that KVM can use uasm for generating its entry point code at runtime on R6. Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: linux-mips@linux-mips.org Signed-off-by: Paolo Bonzini --- arch/mips/include/uapi/asm/inst.h | 44 +++++++++++++++++++++++++++++++ arch/mips/mm/uasm-mips.c | 4 +++ 2 files changed, 48 insertions(+) diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h index 6319c5037e66..fc96012c75d1 100644 --- a/arch/mips/include/uapi/asm/inst.h +++ b/arch/mips/include/uapi/asm/inst.h @@ -92,6 +92,50 @@ enum spec3_op { rdhwr_op = 0x3b }; +/* + * Bits 10-6 minor opcode for r6 spec mult/div encodings + */ +enum mult_op { + mult_mult_op = 0x0, + mult_mul_op = 0x2, + mult_muh_op = 0x3, +}; +enum multu_op { + multu_multu_op = 0x0, + multu_mulu_op = 0x2, + multu_muhu_op = 0x3, +}; +enum div_op { + div_div_op = 0x0, + div_div6_op = 0x2, + div_mod_op = 0x3, +}; +enum divu_op { + divu_divu_op = 0x0, + divu_divu6_op = 0x2, + divu_modu_op = 0x3, +}; +enum dmult_op { + dmult_dmult_op = 0x0, + dmult_dmul_op = 0x2, + dmult_dmuh_op = 0x3, +}; +enum dmultu_op { + dmultu_dmultu_op = 0x0, + dmultu_dmulu_op = 0x2, + dmultu_dmuhu_op = 0x3, +}; +enum ddiv_op { + ddiv_ddiv_op = 0x0, + ddiv_ddiv6_op = 0x2, + ddiv_dmod_op = 0x3, +}; +enum ddivu_op { + ddivu_ddivu_op = 0x0, + ddivu_ddivu6_op = 0x2, + ddivu_dmodu_op = 0x3, +}; + /* * rt field of bcond opcodes. */ diff --git a/arch/mips/mm/uasm-mips.c b/arch/mips/mm/uasm-mips.c index 86a3c76a1ad8..cec524167822 100644 --- a/arch/mips/mm/uasm-mips.c +++ b/arch/mips/mm/uasm-mips.c @@ -121,7 +121,11 @@ static struct insn insn_table[] = { { insn_mthc0, M(cop0_op, mthc0_op, 0, 0, 0, 0), RT | RD | SET}, { insn_mthi, M(spec_op, 0, 0, 0, 0, mthi_op), RS }, { insn_mtlo, M(spec_op, 0, 0, 0, 0, mtlo_op), RS }, +#ifndef CONFIG_CPU_MIPSR6 { insn_mul, M(spec2_op, 0, 0, 0, 0, mul_op), RS | RT | RD}, +#else + { insn_mul, M(spec_op, 0, 0, 0, mult_mul_op, mult_op), RS | RT | RD}, +#endif { insn_ori, M(ori_op, 0, 0, 0, 0, 0), RS | RT | UIMM }, { insn_or, M(spec_op, 0, 0, 0, 0, or_op), RS | RT | RD }, #ifndef CONFIG_CPU_MIPSR6 From 90e9311a34e7b88f246a6d741ef70e3fdba15a34 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 23 Jun 2016 17:34:39 +0100 Subject: [PATCH 212/302] MIPS; KVM: Convert exception entry to uasm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert the whole of locore.S (assembly to enter guest and handle exception entry) to be generated dynamically with uasm. This is done with minimal changes to the resulting code. The main changes are: - Some constants are generated by uasm using LUI+ADDIU instead of LUI+ORI. - Loading of lo and hi are swapped around in vcpu_run but not when resuming the guest after an exit. Both bits of logic are now generated by the same code. - Register MOVEs in uasm use different ADDU operand ordering to GNU as, putting zero register into rs instead of rt. - The JALR.HB to call the C exit handler is switched to JALR, since the hazard barrier would appear to be unnecessary. This will allow further optimisation in the future to dynamically handle the capabilities of the CPU. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim KrÄmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 8 +- arch/mips/kvm/Kconfig | 1 + arch/mips/kvm/Makefile | 2 +- arch/mips/kvm/entry.c | 622 +++++++++++++++++++++++++++++++ arch/mips/kvm/interrupt.h | 4 - arch/mips/kvm/locore.S | 602 ------------------------------ arch/mips/kvm/mips.c | 37 +- 7 files changed, 642 insertions(+), 634 deletions(-) create mode 100644 arch/mips/kvm/entry.c delete mode 100644 arch/mips/kvm/locore.S diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index b0773c6d622f..2e76e899079c 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -533,8 +533,12 @@ int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks); /* Debug: dump vcpu state */ int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu); -/* Trampoline ASM routine to start running in "Guest" context */ -extern int __kvm_mips_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu); +extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu); + +/* Building of entry/exception code */ +void *kvm_mips_build_vcpu_run(void *addr); +void *kvm_mips_build_exception(void *addr); +void *kvm_mips_build_exit(void *addr); /* FPU/MSA context management */ void __kvm_save_fpu(struct kvm_vcpu_arch *vcpu); diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 2ae12825529f..7c56d6b124d1 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -17,6 +17,7 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM + select EXPORT_UASM select PREEMPT_NOTIFIERS select ANON_INODES select KVM_MMIO diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile index 0aabe40fcac9..847429de780d 100644 --- a/arch/mips/kvm/Makefile +++ b/arch/mips/kvm/Makefile @@ -7,7 +7,7 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/mips/kvm common-objs-$(CONFIG_CPU_HAS_MSA) += msa.o -kvm-objs := $(common-objs-y) mips.o emulate.o locore.o \ +kvm-objs := $(common-objs-y) mips.o emulate.o entry.o \ interrupt.o stats.o commpage.o \ dyntrans.o trap_emul.o fpu.o kvm-objs += mmu.o diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c new file mode 100644 index 000000000000..9a18b4939b35 --- /dev/null +++ b/arch/mips/kvm/entry.c @@ -0,0 +1,622 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Generation of main entry point for the guest, exception handling. + * + * Copyright (C) 2012 MIPS Technologies, Inc. + * Authors: Sanjay Lal + * + * Copyright (C) 2016 Imagination Technologies Ltd. + */ + +#include +#include +#include +#include + +/* Register names */ +#define ZERO 0 +#define AT 1 +#define V0 2 +#define V1 3 +#define A0 4 +#define A1 5 + +#if _MIPS_SIM == _MIPS_SIM_ABI32 +#define T0 8 +#define T1 9 +#define T2 10 +#define T3 11 +#endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */ + +#if _MIPS_SIM == _MIPS_SIM_ABI64 || _MIPS_SIM == _MIPS_SIM_NABI32 +#define T0 12 +#define T1 13 +#define T2 14 +#define T3 15 +#endif /* _MIPS_SIM == _MIPS_SIM_ABI64 || _MIPS_SIM == _MIPS_SIM_NABI32 */ + +#define S0 16 +#define S1 17 +#define T9 25 +#define K0 26 +#define K1 27 +#define GP 28 +#define SP 29 +#define RA 31 + +/* Some CP0 registers */ +#define C0_HWRENA 7, 0 +#define C0_BADVADDR 8, 0 +#define C0_ENTRYHI 10, 0 +#define C0_STATUS 12, 0 +#define C0_CAUSE 13, 0 +#define C0_EPC 14, 0 +#define C0_EBASE 15, 1 +#define C0_CONFIG3 16, 3 +#define C0_CONFIG5 16, 5 +#define C0_DDATA_LO 28, 3 +#define C0_ERROREPC 30, 0 + +#define CALLFRAME_SIZ 32 + +enum label_id { + label_fpu_1 = 1, + label_msa_1, + label_return_to_host, + label_kernel_asid, +}; + +UASM_L_LA(_fpu_1) +UASM_L_LA(_msa_1) +UASM_L_LA(_return_to_host) +UASM_L_LA(_kernel_asid) + +static void *kvm_mips_build_enter_guest(void *addr); +static void *kvm_mips_build_ret_from_exit(void *addr); +static void *kvm_mips_build_ret_to_guest(void *addr); +static void *kvm_mips_build_ret_to_host(void *addr); + +/** + * kvm_mips_build_vcpu_run() - Assemble function to start running a guest VCPU. + * @addr: Address to start writing code. + * + * Assemble the start of the vcpu_run function to run a guest VCPU. The function + * conforms to the following prototype: + * + * int vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu); + * + * The exit from the guest and return to the caller is handled by the code + * generated by kvm_mips_build_ret_to_host(). + * + * Returns: Next address after end of written function. + */ +void *kvm_mips_build_vcpu_run(void *addr) +{ + u32 *p = addr; + unsigned int i; + + /* + * A0: run + * A1: vcpu + */ + + /* k0/k1 not being used in host kernel context */ + uasm_i_addiu(&p, K1, SP, -(int)sizeof(struct pt_regs)); + for (i = 16; i < 32; ++i) { + if (i == 24) + i = 28; + UASM_i_SW(&p, i, offsetof(struct pt_regs, regs[i]), K1); + } + + /* Save hi/lo */ + uasm_i_mflo(&p, V0); + UASM_i_SW(&p, V0, offsetof(struct pt_regs, lo), K1); + uasm_i_mfhi(&p, V1); + UASM_i_SW(&p, V1, offsetof(struct pt_regs, hi), K1); + + /* Save host status */ + uasm_i_mfc0(&p, V0, C0_STATUS); + UASM_i_SW(&p, V0, offsetof(struct pt_regs, cp0_status), K1); + + /* Save DDATA_LO, will be used to store pointer to vcpu */ + uasm_i_mfc0(&p, V1, C0_DDATA_LO); + UASM_i_SW(&p, V1, offsetof(struct pt_regs, cp0_epc), K1); + + /* DDATA_LO has pointer to vcpu */ + uasm_i_mtc0(&p, A1, C0_DDATA_LO); + + /* Offset into vcpu->arch */ + uasm_i_addiu(&p, K1, A1, offsetof(struct kvm_vcpu, arch)); + + /* + * Save the host stack to VCPU, used for exception processing + * when we exit from the Guest + */ + UASM_i_SW(&p, SP, offsetof(struct kvm_vcpu_arch, host_stack), K1); + + /* Save the kernel gp as well */ + UASM_i_SW(&p, GP, offsetof(struct kvm_vcpu_arch, host_gp), K1); + + /* + * Setup status register for running the guest in UM, interrupts + * are disabled + */ + UASM_i_LA(&p, K0, ST0_EXL | KSU_USER | ST0_BEV); + uasm_i_mtc0(&p, K0, C0_STATUS); + uasm_i_ehb(&p); + + /* load up the new EBASE */ + UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1); + uasm_i_mtc0(&p, K0, C0_EBASE); + + /* + * Now that the new EBASE has been loaded, unset BEV, set + * interrupt mask as it was but make sure that timer interrupts + * are enabled + */ + uasm_i_addiu(&p, K0, ZERO, ST0_EXL | KSU_USER | ST0_IE); + uasm_i_andi(&p, V0, V0, ST0_IM); + uasm_i_or(&p, K0, K0, V0); + uasm_i_mtc0(&p, K0, C0_STATUS); + uasm_i_ehb(&p); + + p = kvm_mips_build_enter_guest(p); + + return p; +} + +/** + * kvm_mips_build_enter_guest() - Assemble code to resume guest execution. + * @addr: Address to start writing code. + * + * Assemble the code to resume guest execution. This code is common between the + * initial entry into the guest from the host, and returning from the exit + * handler back to the guest. + * + * Returns: Next address after end of written function. + */ +static void *kvm_mips_build_enter_guest(void *addr) +{ + u32 *p = addr; + unsigned int i; + struct uasm_label labels[2]; + struct uasm_reloc relocs[2]; + struct uasm_label *l = labels; + struct uasm_reloc *r = relocs; + + memset(labels, 0, sizeof(labels)); + memset(relocs, 0, sizeof(relocs)); + + /* Set Guest EPC */ + UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, pc), K1); + uasm_i_mtc0(&p, T0, C0_EPC); + + /* Set the ASID for the Guest Kernel */ + UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, cop0), K1); + UASM_i_LW(&p, T0, offsetof(struct mips_coproc, reg[MIPS_CP0_STATUS][0]), + T0); + uasm_i_andi(&p, T0, T0, KSU_USER | ST0_ERL | ST0_EXL); + uasm_i_xori(&p, T0, T0, KSU_USER); + uasm_il_bnez(&p, &r, T0, label_kernel_asid); + uasm_i_addiu(&p, T1, K1, + offsetof(struct kvm_vcpu_arch, guest_kernel_asid)); + /* else user */ + uasm_i_addiu(&p, T1, K1, + offsetof(struct kvm_vcpu_arch, guest_user_asid)); + uasm_l_kernel_asid(&l, p); + + /* t1: contains the base of the ASID array, need to get the cpu id */ + /* smp_processor_id */ + UASM_i_LW(&p, T2, offsetof(struct thread_info, cpu), GP); + /* x4 */ + uasm_i_sll(&p, T2, T2, 2); + UASM_i_ADDU(&p, T3, T1, T2); + UASM_i_LW(&p, K0, 0, T3); +#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE + /* x sizeof(struct cpuinfo_mips)/4 */ + uasm_i_addiu(&p, T3, ZERO, sizeof(struct cpuinfo_mips)/4); + uasm_i_mul(&p, T2, T2, T3); + + UASM_i_LA_mostly(&p, AT, (long)&cpu_data[0].asid_mask); + UASM_i_ADDU(&p, AT, AT, T2); + UASM_i_LW(&p, T2, uasm_rel_lo((long)&cpu_data[0].asid_mask), AT); + uasm_i_and(&p, K0, K0, T2); +#else + uasm_i_andi(&p, K0, K0, MIPS_ENTRYHI_ASID); +#endif + uasm_i_mtc0(&p, K0, C0_ENTRYHI); + uasm_i_ehb(&p); + + /* Disable RDHWR access */ + uasm_i_mtc0(&p, ZERO, C0_HWRENA); + + /* load the guest context from VCPU and return */ + for (i = 1; i < 32; ++i) { + /* Guest k0/k1 loaded later */ + if (i == K0 || i == K1) + continue; + UASM_i_LW(&p, i, offsetof(struct kvm_vcpu_arch, gprs[i]), K1); + } + + /* Restore hi/lo */ + UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, hi), K1); + uasm_i_mthi(&p, K0); + + UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, lo), K1); + uasm_i_mtlo(&p, K0); + + /* Restore the guest's k0/k1 registers */ + UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1); + UASM_i_LW(&p, K1, offsetof(struct kvm_vcpu_arch, gprs[K1]), K1); + + /* Jump to guest */ + uasm_i_eret(&p); + + uasm_resolve_relocs(relocs, labels); + + return p; +} + +/** + * kvm_mips_build_exception() - Assemble first level guest exception handler. + * @addr: Address to start writing code. + * + * Assemble exception vector code for guest execution. The generated vector will + * jump to the common exception handler generated by kvm_mips_build_exit(). + * + * Returns: Next address after end of written function. + */ +void *kvm_mips_build_exception(void *addr) +{ + u32 *p = addr; + + /* Save guest k0 */ + uasm_i_mtc0(&p, K0, C0_ERROREPC); + uasm_i_ehb(&p); + + /* Get EBASE */ + uasm_i_mfc0(&p, K0, C0_EBASE); + /* Get rid of CPUNum */ + uasm_i_srl(&p, K0, K0, 10); + uasm_i_sll(&p, K0, K0, 10); + /* Save k1 @ offset 0x3000 */ + UASM_i_SW(&p, K1, 0x3000, K0); + + /* Exception handler is installed @ offset 0x2000 */ + uasm_i_addiu(&p, K0, K0, 0x2000); + /* Jump to the function */ + uasm_i_jr(&p, K0); + uasm_i_nop(&p); + + return p; +} + +/** + * kvm_mips_build_exit() - Assemble common guest exit handler. + * @addr: Address to start writing code. + * + * Assemble the generic guest exit handling code. This is called by the + * exception vectors (generated by kvm_mips_build_exception()), and calls + * kvm_mips_handle_exit(), then either resumes the guest or returns to the host + * depending on the return value. + * + * Returns: Next address after end of written function. + */ +void *kvm_mips_build_exit(void *addr) +{ + u32 *p = addr; + unsigned int i; + struct uasm_label labels[3]; + struct uasm_reloc relocs[3]; + struct uasm_label *l = labels; + struct uasm_reloc *r = relocs; + + memset(labels, 0, sizeof(labels)); + memset(relocs, 0, sizeof(relocs)); + + /* + * Generic Guest exception handler. We end up here when the guest + * does something that causes a trap to kernel mode. + */ + + /* Get the VCPU pointer from DDATA_LO */ + uasm_i_mfc0(&p, K1, C0_DDATA_LO); + uasm_i_addiu(&p, K1, K1, offsetof(struct kvm_vcpu, arch)); + + /* Start saving Guest context to VCPU */ + for (i = 0; i < 32; ++i) { + /* Guest k0/k1 saved later */ + if (i == K0 || i == K1) + continue; + UASM_i_SW(&p, i, offsetof(struct kvm_vcpu_arch, gprs[i]), K1); + } + + /* We need to save hi/lo and restore them on the way out */ + uasm_i_mfhi(&p, T0); + UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, hi), K1); + + uasm_i_mflo(&p, T0); + UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, lo), K1); + + /* Finally save guest k0/k1 to VCPU */ + uasm_i_mfc0(&p, T0, C0_ERROREPC); + UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1); + + /* Get GUEST k1 and save it in VCPU */ + uasm_i_addiu(&p, T1, ZERO, ~0x2ff); + uasm_i_mfc0(&p, T0, C0_EBASE); + uasm_i_and(&p, T0, T0, T1); + UASM_i_LW(&p, T0, 0x3000, T0); + UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, gprs[K1]), K1); + + /* Now that context has been saved, we can use other registers */ + + /* Restore vcpu */ + uasm_i_mfc0(&p, A1, C0_DDATA_LO); + uasm_i_move(&p, S1, A1); + + /* Restore run (vcpu->run) */ + UASM_i_LW(&p, A0, offsetof(struct kvm_vcpu, run), A1); + /* Save pointer to run in s0, will be saved by the compiler */ + uasm_i_move(&p, S0, A0); + + /* + * Save Host level EPC, BadVaddr and Cause to VCPU, useful to process + * the exception + */ + uasm_i_mfc0(&p, K0, C0_EPC); + UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, pc), K1); + + uasm_i_mfc0(&p, K0, C0_BADVADDR); + UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_badvaddr), + K1); + + uasm_i_mfc0(&p, K0, C0_CAUSE); + uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_cause), K1); + + /* Now restore the host state just enough to run the handlers */ + + /* Switch EBASE to the one used by Linux */ + /* load up the host EBASE */ + uasm_i_mfc0(&p, V0, C0_STATUS); + + uasm_i_lui(&p, AT, ST0_BEV >> 16); + uasm_i_or(&p, K0, V0, AT); + + uasm_i_mtc0(&p, K0, C0_STATUS); + uasm_i_ehb(&p); + + UASM_i_LA_mostly(&p, K0, (long)&ebase); + UASM_i_LW(&p, K0, uasm_rel_lo((long)&ebase), K0); + uasm_i_mtc0(&p, K0, C0_EBASE); + + /* + * If FPU is enabled, save FCR31 and clear it so that later ctc1's don't + * trigger FPE for pending exceptions. + */ + uasm_i_lui(&p, AT, ST0_CU1 >> 16); + uasm_i_and(&p, V1, V0, AT); + uasm_il_beqz(&p, &r, V1, label_fpu_1); + uasm_i_nop(&p); + uasm_i_cfc1(&p, T0, 31); + uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.fcr31), K1); + uasm_i_ctc1(&p, ZERO, 31); + uasm_l_fpu_1(&l, p); + +#ifdef CONFIG_CPU_HAS_MSA + /* + * If MSA is enabled, save MSACSR and clear it so that later + * instructions don't trigger MSAFPE for pending exceptions. + */ + uasm_i_mfc0(&p, T0, C0_CONFIG3); + uasm_i_ext(&p, T0, T0, 28, 1); /* MIPS_CONF3_MSAP */ + uasm_il_beqz(&p, &r, T0, label_msa_1); + uasm_i_nop(&p); + uasm_i_mfc0(&p, T0, C0_CONFIG5); + uasm_i_ext(&p, T0, T0, 27, 1); /* MIPS_CONF5_MSAEN */ + uasm_il_beqz(&p, &r, T0, label_msa_1); + uasm_i_nop(&p); + uasm_i_cfcmsa(&p, T0, MSA_CSR); + uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.msacsr), + K1); + uasm_i_ctcmsa(&p, MSA_CSR, ZERO); + uasm_l_msa_1(&l, p); +#endif + + /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */ + uasm_i_addiu(&p, AT, ZERO, ~(ST0_EXL | KSU_USER | ST0_IE)); + uasm_i_and(&p, V0, V0, AT); + uasm_i_lui(&p, AT, ST0_CU0 >> 16); + uasm_i_or(&p, V0, V0, AT); + uasm_i_mtc0(&p, V0, C0_STATUS); + uasm_i_ehb(&p); + + /* Load up host GP */ + UASM_i_LW(&p, GP, offsetof(struct kvm_vcpu_arch, host_gp), K1); + + /* Need a stack before we can jump to "C" */ + UASM_i_LW(&p, SP, offsetof(struct kvm_vcpu_arch, host_stack), K1); + + /* Saved host state */ + uasm_i_addiu(&p, SP, SP, -(int)sizeof(struct pt_regs)); + + /* + * XXXKYMA do we need to load the host ASID, maybe not because the + * kernel entries are marked GLOBAL, need to verify + */ + + /* Restore host DDATA_LO */ + UASM_i_LW(&p, K0, offsetof(struct pt_regs, cp0_epc), SP); + uasm_i_mtc0(&p, K0, C0_DDATA_LO); + + /* Restore RDHWR access */ + UASM_i_LA_mostly(&p, K0, (long)&hwrena); + uasm_i_lw(&p, K0, uasm_rel_lo((long)&hwrena), K0); + uasm_i_mtc0(&p, K0, C0_HWRENA); + + /* Jump to handler */ + /* + * XXXKYMA: not sure if this is safe, how large is the stack?? + * Now jump to the kvm_mips_handle_exit() to see if we can deal + * with this in the kernel + */ + UASM_i_LA(&p, T9, (unsigned long)kvm_mips_handle_exit); + uasm_i_jalr(&p, RA, T9); + uasm_i_addiu(&p, SP, SP, -CALLFRAME_SIZ); + + uasm_resolve_relocs(relocs, labels); + + p = kvm_mips_build_ret_from_exit(p); + + return p; +} + +/** + * kvm_mips_build_ret_from_exit() - Assemble guest exit return handler. + * @addr: Address to start writing code. + * + * Assemble the code to handle the return from kvm_mips_handle_exit(), either + * resuming the guest or returning to the host depending on the return value. + * + * Returns: Next address after end of written function. + */ +static void *kvm_mips_build_ret_from_exit(void *addr) +{ + u32 *p = addr; + struct uasm_label labels[2]; + struct uasm_reloc relocs[2]; + struct uasm_label *l = labels; + struct uasm_reloc *r = relocs; + + memset(labels, 0, sizeof(labels)); + memset(relocs, 0, sizeof(relocs)); + + /* Return from handler Make sure interrupts are disabled */ + uasm_i_di(&p, ZERO); + uasm_i_ehb(&p); + + /* + * XXXKYMA: k0/k1 could have been blown away if we processed + * an exception while we were handling the exception from the + * guest, reload k1 + */ + + uasm_i_move(&p, K1, S1); + uasm_i_addiu(&p, K1, K1, offsetof(struct kvm_vcpu, arch)); + + /* + * Check return value, should tell us if we are returning to the + * host (handle I/O etc)or resuming the guest + */ + uasm_i_andi(&p, T0, V0, RESUME_HOST); + uasm_il_bnez(&p, &r, T0, label_return_to_host); + uasm_i_nop(&p); + + p = kvm_mips_build_ret_to_guest(p); + + uasm_l_return_to_host(&l, p); + p = kvm_mips_build_ret_to_host(p); + + uasm_resolve_relocs(relocs, labels); + + return p; +} + +/** + * kvm_mips_build_ret_to_guest() - Assemble code to return to the guest. + * @addr: Address to start writing code. + * + * Assemble the code to handle return from the guest exit handler + * (kvm_mips_handle_exit()) back to the guest. + * + * Returns: Next address after end of written function. + */ +static void *kvm_mips_build_ret_to_guest(void *addr) +{ + u32 *p = addr; + + /* Put the saved pointer to vcpu (s1) back into the DDATA_LO Register */ + uasm_i_mtc0(&p, S1, C0_DDATA_LO); + + /* Load up the Guest EBASE to minimize the window where BEV is set */ + UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1); + + /* Switch EBASE back to the one used by KVM */ + uasm_i_mfc0(&p, V1, C0_STATUS); + uasm_i_lui(&p, AT, ST0_BEV >> 16); + uasm_i_or(&p, K0, V1, AT); + uasm_i_mtc0(&p, K0, C0_STATUS); + uasm_i_ehb(&p); + uasm_i_mtc0(&p, T0, C0_EBASE); + + /* Setup status register for running guest in UM */ + uasm_i_ori(&p, V1, V1, ST0_EXL | KSU_USER | ST0_IE); + UASM_i_LA(&p, AT, ~(ST0_CU0 | ST0_MX)); + uasm_i_and(&p, V1, V1, AT); + uasm_i_mtc0(&p, V1, C0_STATUS); + uasm_i_ehb(&p); + + p = kvm_mips_build_enter_guest(p); + + return p; +} + +/** + * kvm_mips_build_ret_to_host() - Assemble code to return to the host. + * @addr: Address to start writing code. + * + * Assemble the code to handle return from the guest exit handler + * (kvm_mips_handle_exit()) back to the host, i.e. to the caller of the vcpu_run + * function generated by kvm_mips_build_vcpu_run(). + * + * Returns: Next address after end of written function. + */ +static void *kvm_mips_build_ret_to_host(void *addr) +{ + u32 *p = addr; + unsigned int i; + + /* EBASE is already pointing to Linux */ + UASM_i_LW(&p, K1, offsetof(struct kvm_vcpu_arch, host_stack), K1); + uasm_i_addiu(&p, K1, K1, -(int)sizeof(struct pt_regs)); + + /* Restore host DDATA_LO */ + UASM_i_LW(&p, K0, offsetof(struct pt_regs, cp0_epc), K1); + uasm_i_mtc0(&p, K0, C0_DDATA_LO); + + /* + * r2/v0 is the return code, shift it down by 2 (arithmetic) + * to recover the err code + */ + uasm_i_sra(&p, K0, V0, 2); + uasm_i_move(&p, V0, K0); + + /* Load context saved on the host stack */ + for (i = 16; i < 31; ++i) { + if (i == 24) + i = 28; + UASM_i_LW(&p, i, offsetof(struct pt_regs, regs[i]), K1); + } + + UASM_i_LW(&p, K0, offsetof(struct pt_regs, hi), K1); + uasm_i_mthi(&p, K0); + + UASM_i_LW(&p, K0, offsetof(struct pt_regs, lo), K1); + uasm_i_mtlo(&p, K0); + + /* Restore RDHWR access */ + UASM_i_LA_mostly(&p, K0, (long)&hwrena); + uasm_i_lw(&p, K0, uasm_rel_lo((long)&hwrena), K0); + uasm_i_mtc0(&p, K0, C0_HWRENA); + + /* Restore RA, which is the address we will return to */ + UASM_i_LW(&p, RA, offsetof(struct pt_regs, regs[RA]), K1); + uasm_i_jr(&p, RA); + uasm_i_nop(&p); + + return p; +} + diff --git a/arch/mips/kvm/interrupt.h b/arch/mips/kvm/interrupt.h index d661c100b219..fb118a2c8379 100644 --- a/arch/mips/kvm/interrupt.h +++ b/arch/mips/kvm/interrupt.h @@ -28,10 +28,6 @@ #define MIPS_EXC_MAX 12 /* XXXSL More to follow */ -extern char __kvm_mips_vcpu_run_end[]; -extern char mips32_exception[], mips32_exceptionEnd[]; -extern char mips32_GuestException[], mips32_GuestExceptionEnd[]; - #define C_TI (_ULCAST_(1) << 30) #define KVM_MIPS_IRQ_DELIVER_ALL_AT_ONCE (0) diff --git a/arch/mips/kvm/locore.S b/arch/mips/kvm/locore.S deleted file mode 100644 index 698286c0f732..000000000000 --- a/arch/mips/kvm/locore.S +++ /dev/null @@ -1,602 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Main entry point for the guest, exception handling. - * - * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved. - * Authors: Sanjay Lal - */ - -#include -#include -#include -#include -#include -#include - -#define _C_LABEL(x) x -#define MIPSX(name) mips32_ ## name -#define CALLFRAME_SIZ 32 - -/* - * VECTOR - * exception vector entrypoint - */ -#define VECTOR(x, regmask) \ - .ent _C_LABEL(x),0; \ - EXPORT(x); - -#define VECTOR_END(x) \ - EXPORT(x); - -/* Overload, Danger Will Robinson!! */ -#define PT_HOST_USERLOCAL PT_EPC - -#define CP0_DDATA_LO $28,3 - -/* Resume Flags */ -#define RESUME_FLAG_HOST (1<<1) /* Resume host? */ - -#define RESUME_GUEST 0 -#define RESUME_HOST RESUME_FLAG_HOST - -/* - * __kvm_mips_vcpu_run: entry point to the guest - * a0: run - * a1: vcpu - */ - .set noreorder - -FEXPORT(__kvm_mips_vcpu_run) - /* k0/k1 not being used in host kernel context */ - INT_ADDIU k1, sp, -PT_SIZE - LONG_S $16, PT_R16(k1) - LONG_S $17, PT_R17(k1) - LONG_S $18, PT_R18(k1) - LONG_S $19, PT_R19(k1) - LONG_S $20, PT_R20(k1) - LONG_S $21, PT_R21(k1) - LONG_S $22, PT_R22(k1) - LONG_S $23, PT_R23(k1) - - LONG_S $28, PT_R28(k1) - LONG_S $29, PT_R29(k1) - LONG_S $30, PT_R30(k1) - LONG_S $31, PT_R31(k1) - - /* Save hi/lo */ - mflo v0 - LONG_S v0, PT_LO(k1) - mfhi v1 - LONG_S v1, PT_HI(k1) - - /* Save host status */ - mfc0 v0, CP0_STATUS - LONG_S v0, PT_STATUS(k1) - - /* Save DDATA_LO, will be used to store pointer to vcpu */ - mfc0 v1, CP0_DDATA_LO - LONG_S v1, PT_HOST_USERLOCAL(k1) - - /* DDATA_LO has pointer to vcpu */ - mtc0 a1, CP0_DDATA_LO - - /* Offset into vcpu->arch */ - INT_ADDIU k1, a1, VCPU_HOST_ARCH - - /* - * Save the host stack to VCPU, used for exception processing - * when we exit from the Guest - */ - LONG_S sp, VCPU_HOST_STACK(k1) - - /* Save the kernel gp as well */ - LONG_S gp, VCPU_HOST_GP(k1) - - /* - * Setup status register for running the guest in UM, interrupts - * are disabled - */ - li k0, (ST0_EXL | KSU_USER | ST0_BEV) - mtc0 k0, CP0_STATUS - ehb - - /* load up the new EBASE */ - LONG_L k0, VCPU_GUEST_EBASE(k1) - mtc0 k0, CP0_EBASE - - /* - * Now that the new EBASE has been loaded, unset BEV, set - * interrupt mask as it was but make sure that timer interrupts - * are enabled - */ - li k0, (ST0_EXL | KSU_USER | ST0_IE) - andi v0, v0, ST0_IM - or k0, k0, v0 - mtc0 k0, CP0_STATUS - ehb - - /* Set Guest EPC */ - LONG_L t0, VCPU_PC(k1) - mtc0 t0, CP0_EPC - -FEXPORT(__kvm_mips_load_asid) - /* Set the ASID for the Guest Kernel */ - PTR_L t0, VCPU_COP0(k1) - LONG_L t0, COP0_STATUS(t0) - andi t0, KSU_USER | ST0_ERL | ST0_EXL - xori t0, KSU_USER - bnez t0, 1f /* If kernel */ - INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID /* (BD) */ - INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID /* else user */ -1: - /* t1: contains the base of the ASID array, need to get the cpu id */ - LONG_L t2, TI_CPU($28) /* smp_processor_id */ - INT_SLL t2, t2, 2 /* x4 */ - REG_ADDU t3, t1, t2 - LONG_L k0, (t3) -#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE - li t3, CPUINFO_SIZE/4 - mul t2, t2, t3 /* x sizeof(struct cpuinfo_mips)/4 */ - LONG_L t2, (cpu_data + CPUINFO_ASID_MASK)(t2) - and k0, k0, t2 -#else - andi k0, k0, MIPS_ENTRYHI_ASID -#endif - mtc0 k0, CP0_ENTRYHI - ehb - - /* Disable RDHWR access */ - mtc0 zero, CP0_HWRENA - - .set noat - /* Now load up the Guest Context from VCPU */ - LONG_L $1, VCPU_R1(k1) - LONG_L $2, VCPU_R2(k1) - LONG_L $3, VCPU_R3(k1) - - LONG_L $4, VCPU_R4(k1) - LONG_L $5, VCPU_R5(k1) - LONG_L $6, VCPU_R6(k1) - LONG_L $7, VCPU_R7(k1) - - LONG_L $8, VCPU_R8(k1) - LONG_L $9, VCPU_R9(k1) - LONG_L $10, VCPU_R10(k1) - LONG_L $11, VCPU_R11(k1) - LONG_L $12, VCPU_R12(k1) - LONG_L $13, VCPU_R13(k1) - LONG_L $14, VCPU_R14(k1) - LONG_L $15, VCPU_R15(k1) - LONG_L $16, VCPU_R16(k1) - LONG_L $17, VCPU_R17(k1) - LONG_L $18, VCPU_R18(k1) - LONG_L $19, VCPU_R19(k1) - LONG_L $20, VCPU_R20(k1) - LONG_L $21, VCPU_R21(k1) - LONG_L $22, VCPU_R22(k1) - LONG_L $23, VCPU_R23(k1) - LONG_L $24, VCPU_R24(k1) - LONG_L $25, VCPU_R25(k1) - - /* k0/k1 loaded up later */ - - LONG_L $28, VCPU_R28(k1) - LONG_L $29, VCPU_R29(k1) - LONG_L $30, VCPU_R30(k1) - LONG_L $31, VCPU_R31(k1) - - /* Restore hi/lo */ - LONG_L k0, VCPU_LO(k1) - mtlo k0 - - LONG_L k0, VCPU_HI(k1) - mthi k0 - -FEXPORT(__kvm_mips_load_k0k1) - /* Restore the guest's k0/k1 registers */ - LONG_L k0, VCPU_R26(k1) - LONG_L k1, VCPU_R27(k1) - - /* Jump to guest */ - eret -EXPORT(__kvm_mips_vcpu_run_end) - -VECTOR(MIPSX(exception), unknown) -/* Find out what mode we came from and jump to the proper handler. */ - mtc0 k0, CP0_ERROREPC #01: Save guest k0 - ehb #02: - - mfc0 k0, CP0_EBASE #02: Get EBASE - INT_SRL k0, k0, 10 #03: Get rid of CPUNum - INT_SLL k0, k0, 10 #04 - LONG_S k1, 0x3000(k0) #05: Save k1 @ offset 0x3000 - INT_ADDIU k0, k0, 0x2000 #06: Exception handler is - # installed @ offset 0x2000 - j k0 #07: jump to the function - nop #08: branch delay slot -VECTOR_END(MIPSX(exceptionEnd)) -.end MIPSX(exception) - -/* - * Generic Guest exception handler. We end up here when the guest - * does something that causes a trap to kernel mode. - */ -NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra) - /* Get the VCPU pointer from DDTATA_LO */ - mfc0 k1, CP0_DDATA_LO - INT_ADDIU k1, k1, VCPU_HOST_ARCH - - /* Start saving Guest context to VCPU */ - LONG_S $0, VCPU_R0(k1) - LONG_S $1, VCPU_R1(k1) - LONG_S $2, VCPU_R2(k1) - LONG_S $3, VCPU_R3(k1) - LONG_S $4, VCPU_R4(k1) - LONG_S $5, VCPU_R5(k1) - LONG_S $6, VCPU_R6(k1) - LONG_S $7, VCPU_R7(k1) - LONG_S $8, VCPU_R8(k1) - LONG_S $9, VCPU_R9(k1) - LONG_S $10, VCPU_R10(k1) - LONG_S $11, VCPU_R11(k1) - LONG_S $12, VCPU_R12(k1) - LONG_S $13, VCPU_R13(k1) - LONG_S $14, VCPU_R14(k1) - LONG_S $15, VCPU_R15(k1) - LONG_S $16, VCPU_R16(k1) - LONG_S $17, VCPU_R17(k1) - LONG_S $18, VCPU_R18(k1) - LONG_S $19, VCPU_R19(k1) - LONG_S $20, VCPU_R20(k1) - LONG_S $21, VCPU_R21(k1) - LONG_S $22, VCPU_R22(k1) - LONG_S $23, VCPU_R23(k1) - LONG_S $24, VCPU_R24(k1) - LONG_S $25, VCPU_R25(k1) - - /* Guest k0/k1 saved later */ - - LONG_S $28, VCPU_R28(k1) - LONG_S $29, VCPU_R29(k1) - LONG_S $30, VCPU_R30(k1) - LONG_S $31, VCPU_R31(k1) - - .set at - - /* We need to save hi/lo and restore them on the way out */ - mfhi t0 - LONG_S t0, VCPU_HI(k1) - - mflo t0 - LONG_S t0, VCPU_LO(k1) - - /* Finally save guest k0/k1 to VCPU */ - mfc0 t0, CP0_ERROREPC - LONG_S t0, VCPU_R26(k1) - - /* Get GUEST k1 and save it in VCPU */ - PTR_LI t1, ~0x2ff - mfc0 t0, CP0_EBASE - and t0, t0, t1 - LONG_L t0, 0x3000(t0) - LONG_S t0, VCPU_R27(k1) - - /* Now that context has been saved, we can use other registers */ - - /* Restore vcpu */ - mfc0 a1, CP0_DDATA_LO - move s1, a1 - - /* Restore run (vcpu->run) */ - LONG_L a0, VCPU_RUN(a1) - /* Save pointer to run in s0, will be saved by the compiler */ - move s0, a0 - - /* - * Save Host level EPC, BadVaddr and Cause to VCPU, useful to - * process the exception - */ - mfc0 k0,CP0_EPC - LONG_S k0, VCPU_PC(k1) - - mfc0 k0, CP0_BADVADDR - LONG_S k0, VCPU_HOST_CP0_BADVADDR(k1) - - mfc0 k0, CP0_CAUSE - sw k0, VCPU_HOST_CP0_CAUSE(k1) - - /* Now restore the host state just enough to run the handlers */ - - /* Switch EBASE to the one used by Linux */ - /* load up the host EBASE */ - mfc0 v0, CP0_STATUS - - or k0, v0, ST0_BEV - - mtc0 k0, CP0_STATUS - ehb - - LONG_L k0, ebase - mtc0 k0,CP0_EBASE - - /* - * If FPU is enabled, save FCR31 and clear it so that later ctc1's don't - * trigger FPE for pending exceptions. - */ - and v1, v0, ST0_CU1 - beqz v1, 1f - nop - .set push - SET_HARDFLOAT - cfc1 t0, fcr31 - sw t0, VCPU_FCR31(k1) - ctc1 zero,fcr31 - .set pop -1: - -#ifdef CONFIG_CPU_HAS_MSA - /* - * If MSA is enabled, save MSACSR and clear it so that later - * instructions don't trigger MSAFPE for pending exceptions. - */ - mfc0 t0, CP0_CONFIG3 - ext t0, t0, 28, 1 /* MIPS_CONF3_MSAP */ - beqz t0, 1f - nop - mfc0 t0, CP0_CONFIG5 - ext t0, t0, 27, 1 /* MIPS_CONF5_MSAEN */ - beqz t0, 1f - nop - _cfcmsa t0, MSA_CSR - sw t0, VCPU_MSA_CSR(k1) - _ctcmsa MSA_CSR, zero -1: -#endif - - /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */ - and v0, v0, ~(ST0_EXL | KSU_USER | ST0_IE) - or v0, v0, ST0_CU0 - mtc0 v0, CP0_STATUS - ehb - - /* Load up host GP */ - LONG_L gp, VCPU_HOST_GP(k1) - - /* Need a stack before we can jump to "C" */ - LONG_L sp, VCPU_HOST_STACK(k1) - - /* Saved host state */ - INT_ADDIU sp, sp, -PT_SIZE - - /* - * XXXKYMA do we need to load the host ASID, maybe not because the - * kernel entries are marked GLOBAL, need to verify - */ - - /* Restore host DDATA_LO */ - LONG_L k0, PT_HOST_USERLOCAL(sp) - mtc0 k0, CP0_DDATA_LO - - /* Restore RDHWR access */ - INT_L k0, hwrena - mtc0 k0, CP0_HWRENA - - /* Jump to handler */ -FEXPORT(__kvm_mips_jump_to_handler) - /* - * XXXKYMA: not sure if this is safe, how large is the stack?? - * Now jump to the kvm_mips_handle_exit() to see if we can deal - * with this in the kernel - */ - PTR_LA t9, kvm_mips_handle_exit - jalr.hb t9 - INT_ADDIU sp, sp, -CALLFRAME_SIZ /* BD Slot */ - - /* Return from handler Make sure interrupts are disabled */ - di - ehb - - /* - * XXXKYMA: k0/k1 could have been blown away if we processed - * an exception while we were handling the exception from the - * guest, reload k1 - */ - - move k1, s1 - INT_ADDIU k1, k1, VCPU_HOST_ARCH - - /* - * Check return value, should tell us if we are returning to the - * host (handle I/O etc)or resuming the guest - */ - andi t0, v0, RESUME_HOST - bnez t0, __kvm_mips_return_to_host - nop - -__kvm_mips_return_to_guest: - /* Put the saved pointer to vcpu (s1) back into the DDATA_LO Register */ - mtc0 s1, CP0_DDATA_LO - - /* Load up the Guest EBASE to minimize the window where BEV is set */ - LONG_L t0, VCPU_GUEST_EBASE(k1) - - /* Switch EBASE back to the one used by KVM */ - mfc0 v1, CP0_STATUS - or k0, v1, ST0_BEV - mtc0 k0, CP0_STATUS - ehb - mtc0 t0, CP0_EBASE - - /* Setup status register for running guest in UM */ - or v1, v1, (ST0_EXL | KSU_USER | ST0_IE) - and v1, v1, ~(ST0_CU0 | ST0_MX) - mtc0 v1, CP0_STATUS - ehb - - /* Set Guest EPC */ - LONG_L t0, VCPU_PC(k1) - mtc0 t0, CP0_EPC - - /* Set the ASID for the Guest Kernel */ - PTR_L t0, VCPU_COP0(k1) - LONG_L t0, COP0_STATUS(t0) - andi t0, KSU_USER | ST0_ERL | ST0_EXL - xori t0, KSU_USER - bnez t0, 1f /* If kernel */ - INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID /* (BD) */ - INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID /* else user */ -1: - /* t1: contains the base of the ASID array, need to get the cpu id */ - LONG_L t2, TI_CPU($28) /* smp_processor_id */ - INT_SLL t2, t2, 2 /* x4 */ - REG_ADDU t3, t1, t2 - LONG_L k0, (t3) -#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE - li t3, CPUINFO_SIZE/4 - mul t2, t2, t3 /* x sizeof(struct cpuinfo_mips)/4 */ - LONG_L t2, (cpu_data + CPUINFO_ASID_MASK)(t2) - and k0, k0, t2 -#else - andi k0, k0, MIPS_ENTRYHI_ASID -#endif - mtc0 k0, CP0_ENTRYHI - ehb - - /* Disable RDHWR access */ - mtc0 zero, CP0_HWRENA - - .set noat - /* load the guest context from VCPU and return */ - LONG_L $0, VCPU_R0(k1) - LONG_L $1, VCPU_R1(k1) - LONG_L $2, VCPU_R2(k1) - LONG_L $3, VCPU_R3(k1) - LONG_L $4, VCPU_R4(k1) - LONG_L $5, VCPU_R5(k1) - LONG_L $6, VCPU_R6(k1) - LONG_L $7, VCPU_R7(k1) - LONG_L $8, VCPU_R8(k1) - LONG_L $9, VCPU_R9(k1) - LONG_L $10, VCPU_R10(k1) - LONG_L $11, VCPU_R11(k1) - LONG_L $12, VCPU_R12(k1) - LONG_L $13, VCPU_R13(k1) - LONG_L $14, VCPU_R14(k1) - LONG_L $15, VCPU_R15(k1) - LONG_L $16, VCPU_R16(k1) - LONG_L $17, VCPU_R17(k1) - LONG_L $18, VCPU_R18(k1) - LONG_L $19, VCPU_R19(k1) - LONG_L $20, VCPU_R20(k1) - LONG_L $21, VCPU_R21(k1) - LONG_L $22, VCPU_R22(k1) - LONG_L $23, VCPU_R23(k1) - LONG_L $24, VCPU_R24(k1) - LONG_L $25, VCPU_R25(k1) - - /* $/k1 loaded later */ - LONG_L $28, VCPU_R28(k1) - LONG_L $29, VCPU_R29(k1) - LONG_L $30, VCPU_R30(k1) - LONG_L $31, VCPU_R31(k1) - -FEXPORT(__kvm_mips_skip_guest_restore) - LONG_L k0, VCPU_HI(k1) - mthi k0 - - LONG_L k0, VCPU_LO(k1) - mtlo k0 - - LONG_L k0, VCPU_R26(k1) - LONG_L k1, VCPU_R27(k1) - - eret - .set at - -__kvm_mips_return_to_host: - /* EBASE is already pointing to Linux */ - LONG_L k1, VCPU_HOST_STACK(k1) - INT_ADDIU k1,k1, -PT_SIZE - - /* Restore host DDATA_LO */ - LONG_L k0, PT_HOST_USERLOCAL(k1) - mtc0 k0, CP0_DDATA_LO - - /* - * r2/v0 is the return code, shift it down by 2 (arithmetic) - * to recover the err code - */ - INT_SRA k0, v0, 2 - move $2, k0 - - /* Load context saved on the host stack */ - LONG_L $16, PT_R16(k1) - LONG_L $17, PT_R17(k1) - LONG_L $18, PT_R18(k1) - LONG_L $19, PT_R19(k1) - LONG_L $20, PT_R20(k1) - LONG_L $21, PT_R21(k1) - LONG_L $22, PT_R22(k1) - LONG_L $23, PT_R23(k1) - - LONG_L $28, PT_R28(k1) - LONG_L $29, PT_R29(k1) - LONG_L $30, PT_R30(k1) - - LONG_L k0, PT_HI(k1) - mthi k0 - - LONG_L k0, PT_LO(k1) - mtlo k0 - - /* Restore RDHWR access */ - INT_L k0, hwrena - mtc0 k0, CP0_HWRENA - - /* Restore RA, which is the address we will return to */ - LONG_L ra, PT_R31(k1) - j ra - nop - -VECTOR_END(MIPSX(GuestExceptionEnd)) -.end MIPSX(GuestException) - -MIPSX(exceptions): - #### - ##### The exception handlers. - ##### - .word _C_LABEL(MIPSX(GuestException)) # 0 - .word _C_LABEL(MIPSX(GuestException)) # 1 - .word _C_LABEL(MIPSX(GuestException)) # 2 - .word _C_LABEL(MIPSX(GuestException)) # 3 - .word _C_LABEL(MIPSX(GuestException)) # 4 - .word _C_LABEL(MIPSX(GuestException)) # 5 - .word _C_LABEL(MIPSX(GuestException)) # 6 - .word _C_LABEL(MIPSX(GuestException)) # 7 - .word _C_LABEL(MIPSX(GuestException)) # 8 - .word _C_LABEL(MIPSX(GuestException)) # 9 - .word _C_LABEL(MIPSX(GuestException)) # 10 - .word _C_LABEL(MIPSX(GuestException)) # 11 - .word _C_LABEL(MIPSX(GuestException)) # 12 - .word _C_LABEL(MIPSX(GuestException)) # 13 - .word _C_LABEL(MIPSX(GuestException)) # 14 - .word _C_LABEL(MIPSX(GuestException)) # 15 - .word _C_LABEL(MIPSX(GuestException)) # 16 - .word _C_LABEL(MIPSX(GuestException)) # 17 - .word _C_LABEL(MIPSX(GuestException)) # 18 - .word _C_LABEL(MIPSX(GuestException)) # 19 - .word _C_LABEL(MIPSX(GuestException)) # 20 - .word _C_LABEL(MIPSX(GuestException)) # 21 - .word _C_LABEL(MIPSX(GuestException)) # 22 - .word _C_LABEL(MIPSX(GuestException)) # 23 - .word _C_LABEL(MIPSX(GuestException)) # 24 - .word _C_LABEL(MIPSX(GuestException)) # 25 - .word _C_LABEL(MIPSX(GuestException)) # 26 - .word _C_LABEL(MIPSX(GuestException)) # 27 - .word _C_LABEL(MIPSX(GuestException)) # 28 - .word _C_LABEL(MIPSX(GuestException)) # 29 - .word _C_LABEL(MIPSX(GuestException)) # 30 - .word _C_LABEL(MIPSX(GuestException)) # 31 diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 5f1163653b50..e3ae1229f147 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -247,8 +247,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { - int err, size, offset; - void *gebase; + int err, size; + void *gebase, *p; int i; struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); @@ -286,41 +286,28 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) /* Save new ebase */ vcpu->arch.guest_ebase = gebase; - /* Copy L1 Guest Exception handler to correct offset */ + /* Build guest exception vectors dynamically in unmapped memory */ /* TLB Refill, EXL = 0 */ - memcpy(gebase, mips32_exception, - mips32_exceptionEnd - mips32_exception); + kvm_mips_build_exception(gebase); /* General Exception Entry point */ - memcpy(gebase + 0x180, mips32_exception, - mips32_exceptionEnd - mips32_exception); + kvm_mips_build_exception(gebase + 0x180); /* For vectored interrupts poke the exception code @ all offsets 0-7 */ for (i = 0; i < 8; i++) { kvm_debug("L1 Vectored handler @ %p\n", gebase + 0x200 + (i * VECTORSPACING)); - memcpy(gebase + 0x200 + (i * VECTORSPACING), mips32_exception, - mips32_exceptionEnd - mips32_exception); + kvm_mips_build_exception(gebase + 0x200 + i * VECTORSPACING); } - /* General handler, relocate to unmapped space for sanity's sake */ - offset = 0x2000; - kvm_debug("Installing KVM Exception handlers @ %p, %#x bytes\n", - gebase + offset, - mips32_GuestExceptionEnd - mips32_GuestException); + /* General exit handler */ + p = gebase + 0x2000; + p = kvm_mips_build_exit(p); - memcpy(gebase + offset, mips32_GuestException, - mips32_GuestExceptionEnd - mips32_GuestException); - -#ifdef MODULE - offset += mips32_GuestExceptionEnd - mips32_GuestException; - memcpy(gebase + offset, (char *)__kvm_mips_vcpu_run, - __kvm_mips_vcpu_run_end - (char *)__kvm_mips_vcpu_run); - vcpu->arch.vcpu_run = gebase + offset; -#else - vcpu->arch.vcpu_run = __kvm_mips_vcpu_run; -#endif + /* Guest entry routine */ + vcpu->arch.vcpu_run = p; + p = kvm_mips_build_vcpu_run(p); /* Invalidate the icache for these ranges */ local_flush_icache_range((unsigned long)gebase, From d7b8f890b63f386ca28c820b8ddb7ff1f63cbe3c Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 23 Jun 2016 17:34:40 +0100 Subject: [PATCH 213/302] MIPS: KVM: Add dumping of generated entry code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dump the generated entry code with pr_debug(), similar to how it is done in tlbex.c, so it can be more easily debugged. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim KrÄmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index e3ae1229f147..9f36dcb3c580 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -245,6 +245,23 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, } } +static inline void dump_handler(const char *symbol, void *start, void *end) +{ + u32 *p; + + pr_debug("LEAF(%s)\n", symbol); + + pr_debug("\t.set push\n"); + pr_debug("\t.set noreorder\n"); + + for (p = start; p < (u32 *)end; ++p) + pr_debug("\t.word\t0x%08x\t\t# %p\n", *p, p); + + pr_debug("\t.set\tpop\n"); + + pr_debug("\tEND(%s)\n", symbol); +} + struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { int err, size; @@ -309,6 +326,14 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) vcpu->arch.vcpu_run = p; p = kvm_mips_build_vcpu_run(p); + /* Dump the generated code */ + pr_debug("#include \n"); + pr_debug("#include \n"); + pr_debug("\n"); + dump_handler("kvm_vcpu_run", vcpu->arch.vcpu_run, p); + dump_handler("kvm_gen_exc", gebase + 0x180, gebase + 0x200); + dump_handler("kvm_exit", gebase + 0x2000, vcpu->arch.vcpu_run); + /* Invalidate the icache for these ranges */ local_flush_icache_range((unsigned long)gebase, (unsigned long)gebase + ALIGN(size, PAGE_SIZE)); From 9c9886584086f33b6f709d284360c6ad6bcd01c4 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 23 Jun 2016 17:34:41 +0100 Subject: [PATCH 214/302] MIPS: KVM: Drop now unused asm offsets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that locore.S is converted to uasm, remove a bunch of the assembly offset definitions created by asm-offsets.c, including the CPUINFO_ ones for reading the variable asid mask, and the non FPU/MSA related VCPU_ definitions. KVM's fpu.S and msa.S still use the remaining definitions. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim KrÄmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kernel/asm-offsets.c | 66 ---------------------------------- 1 file changed, 66 deletions(-) diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index a1263d188a5a..fae2f9447792 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -339,67 +339,9 @@ void output_pm_defines(void) } #endif -void output_cpuinfo_defines(void) -{ - COMMENT(" MIPS cpuinfo offsets. "); - DEFINE(CPUINFO_SIZE, sizeof(struct cpuinfo_mips)); -#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE - OFFSET(CPUINFO_ASID_MASK, cpuinfo_mips, asid_mask); -#endif -} - void output_kvm_defines(void) { COMMENT(" KVM/MIPS Specfic offsets. "); - DEFINE(VCPU_ARCH_SIZE, sizeof(struct kvm_vcpu_arch)); - OFFSET(VCPU_RUN, kvm_vcpu, run); - OFFSET(VCPU_HOST_ARCH, kvm_vcpu, arch); - - OFFSET(VCPU_GUEST_EBASE, kvm_vcpu_arch, guest_ebase); - - OFFSET(VCPU_HOST_STACK, kvm_vcpu_arch, host_stack); - OFFSET(VCPU_HOST_GP, kvm_vcpu_arch, host_gp); - - OFFSET(VCPU_HOST_CP0_BADVADDR, kvm_vcpu_arch, host_cp0_badvaddr); - OFFSET(VCPU_HOST_CP0_CAUSE, kvm_vcpu_arch, host_cp0_cause); - OFFSET(VCPU_HOST_EPC, kvm_vcpu_arch, host_cp0_epc); - - OFFSET(VCPU_R0, kvm_vcpu_arch, gprs[0]); - OFFSET(VCPU_R1, kvm_vcpu_arch, gprs[1]); - OFFSET(VCPU_R2, kvm_vcpu_arch, gprs[2]); - OFFSET(VCPU_R3, kvm_vcpu_arch, gprs[3]); - OFFSET(VCPU_R4, kvm_vcpu_arch, gprs[4]); - OFFSET(VCPU_R5, kvm_vcpu_arch, gprs[5]); - OFFSET(VCPU_R6, kvm_vcpu_arch, gprs[6]); - OFFSET(VCPU_R7, kvm_vcpu_arch, gprs[7]); - OFFSET(VCPU_R8, kvm_vcpu_arch, gprs[8]); - OFFSET(VCPU_R9, kvm_vcpu_arch, gprs[9]); - OFFSET(VCPU_R10, kvm_vcpu_arch, gprs[10]); - OFFSET(VCPU_R11, kvm_vcpu_arch, gprs[11]); - OFFSET(VCPU_R12, kvm_vcpu_arch, gprs[12]); - OFFSET(VCPU_R13, kvm_vcpu_arch, gprs[13]); - OFFSET(VCPU_R14, kvm_vcpu_arch, gprs[14]); - OFFSET(VCPU_R15, kvm_vcpu_arch, gprs[15]); - OFFSET(VCPU_R16, kvm_vcpu_arch, gprs[16]); - OFFSET(VCPU_R17, kvm_vcpu_arch, gprs[17]); - OFFSET(VCPU_R18, kvm_vcpu_arch, gprs[18]); - OFFSET(VCPU_R19, kvm_vcpu_arch, gprs[19]); - OFFSET(VCPU_R20, kvm_vcpu_arch, gprs[20]); - OFFSET(VCPU_R21, kvm_vcpu_arch, gprs[21]); - OFFSET(VCPU_R22, kvm_vcpu_arch, gprs[22]); - OFFSET(VCPU_R23, kvm_vcpu_arch, gprs[23]); - OFFSET(VCPU_R24, kvm_vcpu_arch, gprs[24]); - OFFSET(VCPU_R25, kvm_vcpu_arch, gprs[25]); - OFFSET(VCPU_R26, kvm_vcpu_arch, gprs[26]); - OFFSET(VCPU_R27, kvm_vcpu_arch, gprs[27]); - OFFSET(VCPU_R28, kvm_vcpu_arch, gprs[28]); - OFFSET(VCPU_R29, kvm_vcpu_arch, gprs[29]); - OFFSET(VCPU_R30, kvm_vcpu_arch, gprs[30]); - OFFSET(VCPU_R31, kvm_vcpu_arch, gprs[31]); - OFFSET(VCPU_LO, kvm_vcpu_arch, lo); - OFFSET(VCPU_HI, kvm_vcpu_arch, hi); - OFFSET(VCPU_PC, kvm_vcpu_arch, pc); - BLANK(); OFFSET(VCPU_FPR0, kvm_vcpu_arch, fpu.fpr[0]); OFFSET(VCPU_FPR1, kvm_vcpu_arch, fpu.fpr[1]); @@ -437,14 +379,6 @@ void output_kvm_defines(void) OFFSET(VCPU_FCR31, kvm_vcpu_arch, fpu.fcr31); OFFSET(VCPU_MSA_CSR, kvm_vcpu_arch, fpu.msacsr); BLANK(); - - OFFSET(VCPU_COP0, kvm_vcpu_arch, cop0); - OFFSET(VCPU_GUEST_KERNEL_ASID, kvm_vcpu_arch, guest_kernel_asid); - OFFSET(VCPU_GUEST_USER_ASID, kvm_vcpu_arch, guest_user_asid); - - OFFSET(COP0_TLB_HI, mips_coproc, reg[MIPS_CP0_TLB_HI][0]); - OFFSET(COP0_STATUS, mips_coproc, reg[MIPS_CP0_STATUS][0]); - BLANK(); } #ifdef CONFIG_MIPS_CPS From d37f4038d16273087bdc60387807b90a8c06da7f Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 23 Jun 2016 17:34:42 +0100 Subject: [PATCH 215/302] MIPS: KVM: Omit FPU handling entry code if possible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The FPU handling code on entry from guest is unnecessary if no FPU is present, so allow it to be dropped at uasm assembly time. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim KrÄmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/entry.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index 9a18b4939b35..c0d9f551c1c1 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -393,18 +393,21 @@ void *kvm_mips_build_exit(void *addr) UASM_i_LW(&p, K0, uasm_rel_lo((long)&ebase), K0); uasm_i_mtc0(&p, K0, C0_EBASE); - /* - * If FPU is enabled, save FCR31 and clear it so that later ctc1's don't - * trigger FPE for pending exceptions. - */ - uasm_i_lui(&p, AT, ST0_CU1 >> 16); - uasm_i_and(&p, V1, V0, AT); - uasm_il_beqz(&p, &r, V1, label_fpu_1); - uasm_i_nop(&p); - uasm_i_cfc1(&p, T0, 31); - uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.fcr31), K1); - uasm_i_ctc1(&p, ZERO, 31); - uasm_l_fpu_1(&l, p); + if (raw_cpu_has_fpu) { + /* + * If FPU is enabled, save FCR31 and clear it so that later + * ctc1's don't trigger FPE for pending exceptions. + */ + uasm_i_lui(&p, AT, ST0_CU1 >> 16); + uasm_i_and(&p, V1, V0, AT); + uasm_il_beqz(&p, &r, V1, label_fpu_1); + uasm_i_nop(&p); + uasm_i_cfc1(&p, T0, 31); + uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.fcr31), + K1); + uasm_i_ctc1(&p, ZERO, 31); + uasm_l_fpu_1(&l, p); + } #ifdef CONFIG_CPU_HAS_MSA /* From 38ea7a715d43752e1c53d5a0c3cbab5e321f22f7 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 23 Jun 2016 17:34:43 +0100 Subject: [PATCH 216/302] MIPS: KVM: Check MSA presence at uasm time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check for presence of MSA at uasm assembly time rather than at runtime in the generated KVM host entry code. This optimises the guest exit path by eliminating the MSA code entirely if not present, and eliminating the read of Config3.MSAP and conditional branch if MSA is present. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim KrÄmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/entry.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index c0d9f551c1c1..53e1e576d18a 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -55,7 +55,6 @@ #define C0_CAUSE 13, 0 #define C0_EPC 14, 0 #define C0_EBASE 15, 1 -#define C0_CONFIG3 16, 3 #define C0_CONFIG5 16, 5 #define C0_DDATA_LO 28, 3 #define C0_ERROREPC 30, 0 @@ -409,25 +408,21 @@ void *kvm_mips_build_exit(void *addr) uasm_l_fpu_1(&l, p); } -#ifdef CONFIG_CPU_HAS_MSA - /* - * If MSA is enabled, save MSACSR and clear it so that later - * instructions don't trigger MSAFPE for pending exceptions. - */ - uasm_i_mfc0(&p, T0, C0_CONFIG3); - uasm_i_ext(&p, T0, T0, 28, 1); /* MIPS_CONF3_MSAP */ - uasm_il_beqz(&p, &r, T0, label_msa_1); - uasm_i_nop(&p); - uasm_i_mfc0(&p, T0, C0_CONFIG5); - uasm_i_ext(&p, T0, T0, 27, 1); /* MIPS_CONF5_MSAEN */ - uasm_il_beqz(&p, &r, T0, label_msa_1); - uasm_i_nop(&p); - uasm_i_cfcmsa(&p, T0, MSA_CSR); - uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.msacsr), - K1); - uasm_i_ctcmsa(&p, MSA_CSR, ZERO); - uasm_l_msa_1(&l, p); -#endif + if (cpu_has_msa) { + /* + * If MSA is enabled, save MSACSR and clear it so that later + * instructions don't trigger MSAFPE for pending exceptions. + */ + uasm_i_mfc0(&p, T0, C0_CONFIG5); + uasm_i_ext(&p, T0, T0, 27, 1); /* MIPS_CONF5_MSAEN */ + uasm_il_beqz(&p, &r, T0, label_msa_1); + uasm_i_nop(&p); + uasm_i_cfcmsa(&p, T0, MSA_CSR); + uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.msacsr), + K1); + uasm_i_ctcmsa(&p, MSA_CSR, ZERO); + uasm_l_msa_1(&l, p); + } /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */ uasm_i_addiu(&p, AT, ZERO, ~(ST0_EXL | KSU_USER | ST0_IE)); From 025014e3fb8f6afab92d3050c3423e2b1ffcbc84 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 23 Jun 2016 17:34:44 +0100 Subject: [PATCH 217/302] MIPS: KVM: Drop redundant restore of DDATA_LO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On return from the exit handler to the host (without re-entering the guest) we restore the saved value of the DDATA_LO register which we use as a scratch register. However we've already restored it ready for calling the exit handler so there is no need to do it again, so drop that code. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim KrÄmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/entry.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index 53e1e576d18a..6395bfa7e542 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -581,10 +581,6 @@ static void *kvm_mips_build_ret_to_host(void *addr) UASM_i_LW(&p, K1, offsetof(struct kvm_vcpu_arch, host_stack), K1); uasm_i_addiu(&p, K1, K1, -(int)sizeof(struct pt_regs)); - /* Restore host DDATA_LO */ - UASM_i_LW(&p, K0, offsetof(struct pt_regs, cp0_epc), K1); - uasm_i_mtc0(&p, K0, C0_DDATA_LO); - /* * r2/v0 is the return code, shift it down by 2 (arithmetic) * to recover the err code From 1e5217f54251ddd339e00a0b30f126589737d467 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 23 Jun 2016 17:34:45 +0100 Subject: [PATCH 218/302] MIPS: KVM: Dynamically choose scratch registers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Scratch cop0 registers are needed by KVM to be able to save/restore all the GPRs, including k0/k1, and for storing the VCPU pointer. However no registers are universally suitable for these purposes, so the decision should be made at runtime. Until now, we've used DDATA_LO to store the VCPU pointer, and ErrorEPC as a temporary. It could be argued that this is abuse of those registers, and DDATA_LO is known not to be usable on certain implementations (Cavium Octeon). If KScratch registers are present, use them instead. We save & restore the temporary register in addition to the VCPU pointer register when using a KScratch register for it, as it may be used for normal host TLB handling too. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim KrÄmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 1 + arch/mips/kvm/entry.c | 94 +++++++++++++++++++++++++++----- arch/mips/kvm/mips.c | 4 ++ 3 files changed, 84 insertions(+), 15 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 2e76e899079c..a80c3208b234 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -536,6 +536,7 @@ int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu); extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu); /* Building of entry/exception code */ +int kvm_mips_entry_setup(void); void *kvm_mips_build_vcpu_run(void *addr); void *kvm_mips_build_exception(void *addr); void *kvm_mips_build_exit(void *addr); diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index 6395bfa7e542..b6e7fd9f12f0 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -61,6 +61,9 @@ #define CALLFRAME_SIZ 32 +static unsigned int scratch_vcpu[2] = { C0_DDATA_LO }; +static unsigned int scratch_tmp[2] = { C0_ERROREPC }; + enum label_id { label_fpu_1 = 1, label_msa_1, @@ -78,6 +81,69 @@ static void *kvm_mips_build_ret_from_exit(void *addr); static void *kvm_mips_build_ret_to_guest(void *addr); static void *kvm_mips_build_ret_to_host(void *addr); +/** + * kvm_mips_entry_setup() - Perform global setup for entry code. + * + * Perform global setup for entry code, such as choosing a scratch register. + * + * Returns: 0 on success. + * -errno on failure. + */ +int kvm_mips_entry_setup(void) +{ + /* + * We prefer to use KScratchN registers if they are available over the + * defaults above, which may not work on all cores. + */ + unsigned int kscratch_mask = cpu_data[0].kscratch_mask & 0xfc; + + /* Pick a scratch register for storing VCPU */ + if (kscratch_mask) { + scratch_vcpu[0] = 31; + scratch_vcpu[1] = ffs(kscratch_mask) - 1; + kscratch_mask &= ~BIT(scratch_vcpu[1]); + } + + /* Pick a scratch register to use as a temp for saving state */ + if (kscratch_mask) { + scratch_tmp[0] = 31; + scratch_tmp[1] = ffs(kscratch_mask) - 1; + kscratch_mask &= ~BIT(scratch_tmp[1]); + } + + return 0; +} + +static void kvm_mips_build_save_scratch(u32 **p, unsigned int tmp, + unsigned int frame) +{ + /* Save the VCPU scratch register value in cp0_epc of the stack frame */ + uasm_i_mfc0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]); + UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame); + + /* Save the temp scratch register value in cp0_cause of stack frame */ + if (scratch_tmp[0] == 31) { + uasm_i_mfc0(p, tmp, scratch_tmp[0], scratch_tmp[1]); + UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame); + } +} + +static void kvm_mips_build_restore_scratch(u32 **p, unsigned int tmp, + unsigned int frame) +{ + /* + * Restore host scratch register values saved by + * kvm_mips_build_save_scratch(). + */ + UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame); + uasm_i_mtc0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]); + + if (scratch_tmp[0] == 31) { + UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame); + uasm_i_mtc0(p, tmp, scratch_tmp[0], scratch_tmp[1]); + } +} + /** * kvm_mips_build_vcpu_run() - Assemble function to start running a guest VCPU. * @addr: Address to start writing code. @@ -120,12 +186,11 @@ void *kvm_mips_build_vcpu_run(void *addr) uasm_i_mfc0(&p, V0, C0_STATUS); UASM_i_SW(&p, V0, offsetof(struct pt_regs, cp0_status), K1); - /* Save DDATA_LO, will be used to store pointer to vcpu */ - uasm_i_mfc0(&p, V1, C0_DDATA_LO); - UASM_i_SW(&p, V1, offsetof(struct pt_regs, cp0_epc), K1); + /* Save scratch registers, will be used to store pointer to vcpu etc */ + kvm_mips_build_save_scratch(&p, V1, K1); - /* DDATA_LO has pointer to vcpu */ - uasm_i_mtc0(&p, A1, C0_DDATA_LO); + /* VCPU scratch register has pointer to vcpu */ + uasm_i_mtc0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]); /* Offset into vcpu->arch */ uasm_i_addiu(&p, K1, A1, offsetof(struct kvm_vcpu, arch)); @@ -273,7 +338,7 @@ void *kvm_mips_build_exception(void *addr) u32 *p = addr; /* Save guest k0 */ - uasm_i_mtc0(&p, K0, C0_ERROREPC); + uasm_i_mtc0(&p, K0, scratch_tmp[0], scratch_tmp[1]); uasm_i_ehb(&p); /* Get EBASE */ @@ -321,8 +386,8 @@ void *kvm_mips_build_exit(void *addr) * does something that causes a trap to kernel mode. */ - /* Get the VCPU pointer from DDATA_LO */ - uasm_i_mfc0(&p, K1, C0_DDATA_LO); + /* Get the VCPU pointer from the scratch register */ + uasm_i_mfc0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]); uasm_i_addiu(&p, K1, K1, offsetof(struct kvm_vcpu, arch)); /* Start saving Guest context to VCPU */ @@ -341,7 +406,7 @@ void *kvm_mips_build_exit(void *addr) UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, lo), K1); /* Finally save guest k0/k1 to VCPU */ - uasm_i_mfc0(&p, T0, C0_ERROREPC); + uasm_i_mfc0(&p, T0, scratch_tmp[0], scratch_tmp[1]); UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1); /* Get GUEST k1 and save it in VCPU */ @@ -354,7 +419,7 @@ void *kvm_mips_build_exit(void *addr) /* Now that context has been saved, we can use other registers */ /* Restore vcpu */ - uasm_i_mfc0(&p, A1, C0_DDATA_LO); + uasm_i_mfc0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]); uasm_i_move(&p, S1, A1); /* Restore run (vcpu->run) */ @@ -446,9 +511,8 @@ void *kvm_mips_build_exit(void *addr) * kernel entries are marked GLOBAL, need to verify */ - /* Restore host DDATA_LO */ - UASM_i_LW(&p, K0, offsetof(struct pt_regs, cp0_epc), SP); - uasm_i_mtc0(&p, K0, C0_DDATA_LO); + /* Restore host scratch registers, as we'll have clobbered them */ + kvm_mips_build_restore_scratch(&p, K0, SP); /* Restore RDHWR access */ UASM_i_LA_mostly(&p, K0, (long)&hwrena); @@ -536,8 +600,8 @@ static void *kvm_mips_build_ret_to_guest(void *addr) { u32 *p = addr; - /* Put the saved pointer to vcpu (s1) back into the DDATA_LO Register */ - uasm_i_mtc0(&p, S1, C0_DDATA_LO); + /* Put the saved pointer to vcpu (s1) back into the scratch register */ + uasm_i_mtc0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]); /* Load up the Guest EBASE to minimize the window where BEV is set */ UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 9f36dcb3c580..26cc0b93c565 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1775,6 +1775,10 @@ static int __init kvm_mips_init(void) { int ret; + ret = kvm_mips_entry_setup(); + if (ret) + return ret; + ret = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); if (ret) From 1f9ca62cbc5f4d1663a0f0d193156ce9dc6ed452 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 23 Jun 2016 17:34:46 +0100 Subject: [PATCH 219/302] MIPS: KVM: Relative branch to common exit handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use a relative branch to get from the individual exception vectors to the common guest exit handler, rather than loading the address of the exit handler and jumping to it. This is made easier due to the fact we are now generating the entry code dynamically. This will also allow the exception code to be further reduced in future patches. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim KrÄmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 2 +- arch/mips/kvm/entry.c | 23 +++++++++++++++++------ arch/mips/kvm/mips.c | 12 +++++++----- 3 files changed, 25 insertions(+), 12 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index a80c3208b234..b32785543787 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -538,7 +538,7 @@ extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu); /* Building of entry/exception code */ int kvm_mips_entry_setup(void); void *kvm_mips_build_vcpu_run(void *addr); -void *kvm_mips_build_exception(void *addr); +void *kvm_mips_build_exception(void *addr, void *handler); void *kvm_mips_build_exit(void *addr); /* FPU/MSA context management */ diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index b6e7fd9f12f0..fb2cbf653474 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -69,12 +69,14 @@ enum label_id { label_msa_1, label_return_to_host, label_kernel_asid, + label_exit_common, }; UASM_L_LA(_fpu_1) UASM_L_LA(_msa_1) UASM_L_LA(_return_to_host) UASM_L_LA(_kernel_asid) +UASM_L_LA(_exit_common) static void *kvm_mips_build_enter_guest(void *addr); static void *kvm_mips_build_ret_from_exit(void *addr); @@ -327,15 +329,23 @@ static void *kvm_mips_build_enter_guest(void *addr) /** * kvm_mips_build_exception() - Assemble first level guest exception handler. * @addr: Address to start writing code. + * @handler: Address of common handler (within range of @addr). * * Assemble exception vector code for guest execution. The generated vector will - * jump to the common exception handler generated by kvm_mips_build_exit(). + * branch to the common exception handler generated by kvm_mips_build_exit(). * * Returns: Next address after end of written function. */ -void *kvm_mips_build_exception(void *addr) +void *kvm_mips_build_exception(void *addr, void *handler) { u32 *p = addr; + struct uasm_label labels[2]; + struct uasm_reloc relocs[2]; + struct uasm_label *l = labels; + struct uasm_reloc *r = relocs; + + memset(labels, 0, sizeof(labels)); + memset(relocs, 0, sizeof(relocs)); /* Save guest k0 */ uasm_i_mtc0(&p, K0, scratch_tmp[0], scratch_tmp[1]); @@ -349,12 +359,13 @@ void *kvm_mips_build_exception(void *addr) /* Save k1 @ offset 0x3000 */ UASM_i_SW(&p, K1, 0x3000, K0); - /* Exception handler is installed @ offset 0x2000 */ - uasm_i_addiu(&p, K0, K0, 0x2000); - /* Jump to the function */ - uasm_i_jr(&p, K0); + /* Branch to the common handler */ + uasm_il_b(&p, &r, label_exit_common); uasm_i_nop(&p); + uasm_l_exit_common(&l, handler); + uasm_resolve_relocs(relocs, labels); + return p; } diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 26cc0b93c565..7c76768ff364 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -265,7 +265,7 @@ static inline void dump_handler(const char *symbol, void *start, void *end) struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { int err, size; - void *gebase, *p; + void *gebase, *p, *handler; int i; struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); @@ -304,22 +304,24 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) vcpu->arch.guest_ebase = gebase; /* Build guest exception vectors dynamically in unmapped memory */ + handler = gebase + 0x2000; /* TLB Refill, EXL = 0 */ - kvm_mips_build_exception(gebase); + kvm_mips_build_exception(gebase, handler); /* General Exception Entry point */ - kvm_mips_build_exception(gebase + 0x180); + kvm_mips_build_exception(gebase + 0x180, handler); /* For vectored interrupts poke the exception code @ all offsets 0-7 */ for (i = 0; i < 8; i++) { kvm_debug("L1 Vectored handler @ %p\n", gebase + 0x200 + (i * VECTORSPACING)); - kvm_mips_build_exception(gebase + 0x200 + i * VECTORSPACING); + kvm_mips_build_exception(gebase + 0x200 + i * VECTORSPACING, + handler); } /* General exit handler */ - p = gebase + 0x2000; + p = handler; p = kvm_mips_build_exit(p); /* Guest entry routine */ From eadfb501a5f5522b3df1b06ed8ffbb063d19d827 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 23 Jun 2016 17:34:47 +0100 Subject: [PATCH 220/302] MIPS: KVM: Save k0 straight into VCPU structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently on a guest exception the guest's k0 register is saved to the scratch temp register and the guest k1 saved to the exception base address + 0x3000 using k0 to extract the Exception Base field of the EBase register and as the base operand to the store. Both are then copied into the VCPU structure after the other general purpose registers have been saved there. This bouncing to exception base + 0x3000 is not actually necessary as the VCPU pointer can be determined and written through just as easily with only a single spare register. The VCPU pointer is already needed in k1 for saving the other GP registers, so lets save the guest k0 register straight into the VCPU structure through k1, first saving k1 into the scratch temp register instead of k0. This could potentially pave the way for having a single exception base area for use by all guests. The ehb after saving the k register to the scratch temp register is also delayed until just before it needs to be read back. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim KrÄmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/entry.c | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index fb2cbf653474..de8b6ec5573f 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -347,17 +347,15 @@ void *kvm_mips_build_exception(void *addr, void *handler) memset(labels, 0, sizeof(labels)); memset(relocs, 0, sizeof(relocs)); - /* Save guest k0 */ - uasm_i_mtc0(&p, K0, scratch_tmp[0], scratch_tmp[1]); - uasm_i_ehb(&p); + /* Save guest k1 into scratch register */ + uasm_i_mtc0(&p, K1, scratch_tmp[0], scratch_tmp[1]); - /* Get EBASE */ - uasm_i_mfc0(&p, K0, C0_EBASE); - /* Get rid of CPUNum */ - uasm_i_srl(&p, K0, K0, 10); - uasm_i_sll(&p, K0, K0, 10); - /* Save k1 @ offset 0x3000 */ - UASM_i_SW(&p, K1, 0x3000, K0); + /* Get the VCPU pointer from the VCPU scratch register */ + uasm_i_mfc0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]); + uasm_i_addiu(&p, K1, K1, offsetof(struct kvm_vcpu, arch)); + + /* Save guest k0 into VCPU structure */ + UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1); /* Branch to the common handler */ uasm_il_b(&p, &r, label_exit_common); @@ -395,12 +393,13 @@ void *kvm_mips_build_exit(void *addr) /* * Generic Guest exception handler. We end up here when the guest * does something that causes a trap to kernel mode. + * + * Both k0/k1 registers will have already been saved (k0 into the vcpu + * structure, and k1 into the scratch_tmp register). + * + * The k1 register will already contain the kvm_vcpu_arch pointer. */ - /* Get the VCPU pointer from the scratch register */ - uasm_i_mfc0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]); - uasm_i_addiu(&p, K1, K1, offsetof(struct kvm_vcpu, arch)); - /* Start saving Guest context to VCPU */ for (i = 0; i < 32; ++i) { /* Guest k0/k1 saved later */ @@ -416,15 +415,9 @@ void *kvm_mips_build_exit(void *addr) uasm_i_mflo(&p, T0); UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, lo), K1); - /* Finally save guest k0/k1 to VCPU */ + /* Finally save guest k1 to VCPU */ + uasm_i_ehb(&p); uasm_i_mfc0(&p, T0, scratch_tmp[0], scratch_tmp[1]); - UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1); - - /* Get GUEST k1 and save it in VCPU */ - uasm_i_addiu(&p, T1, ZERO, ~0x2ff); - uasm_i_mfc0(&p, T0, C0_EBASE); - uasm_i_and(&p, T0, T0, T1); - UASM_i_LW(&p, T0, 0x3000, T0); UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, gprs[K1]), K1); /* Now that context has been saved, we can use other registers */ From 1c66b79bb3b11942a98085fd89295cf6cddae41a Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Mon, 4 Jul 2016 19:35:07 +0100 Subject: [PATCH 221/302] MIPS: inst.h: Rename b{eq,ne}zcji[al]c_op to pop{6,7}6_op The opcodes currently defined in inst.h as beqzcjic_op & bnezcjialc_op are actually defined in the MIPS base instruction set manuals as pop66 & pop76 respectively. Rename them as such, for consistency with the documentation. Signed-off-by: Paul Burton Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: linux-mips@linux-mips.org Signed-off-by: Paolo Bonzini --- arch/mips/include/uapi/asm/inst.h | 4 ++-- arch/mips/kernel/branch.c | 4 ++-- arch/mips/math-emu/cp1emu.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h index fc96012c75d1..3fc00e7b33c4 100644 --- a/arch/mips/include/uapi/asm/inst.h +++ b/arch/mips/include/uapi/asm/inst.h @@ -32,9 +32,9 @@ enum major_op { sb_op, sh_op, swl_op, sw_op, sdl_op, sdr_op, swr_op, cache_op, ll_op, lwc1_op, lwc2_op, bc6_op = lwc2_op, pref_op, - lld_op, ldc1_op, ldc2_op, beqzcjic_op = ldc2_op, ld_op, + lld_op, ldc1_op, ldc2_op, pop66_op = ldc2_op, ld_op, sc_op, swc1_op, swc2_op, balc6_op = swc2_op, major_3b_op, - scd_op, sdc1_op, sdc2_op, bnezcjialc_op = sdc2_op, sd_op + scd_op, sdc1_op, sdc2_op, pop76_op = sdc2_op, sd_op }; /* diff --git a/arch/mips/kernel/branch.c b/arch/mips/kernel/branch.c index 6dc3f1fdaccc..fb9ed96d7858 100644 --- a/arch/mips/kernel/branch.c +++ b/arch/mips/kernel/branch.c @@ -790,7 +790,7 @@ int __compute_return_epc_for_insn(struct pt_regs *regs, epc += 4 + (insn.i_format.simmediate << 2); regs->cp0_epc = epc; break; - case beqzcjic_op: + case pop66_op: if (!cpu_has_mips_r6) { ret = -SIGILL; break; @@ -798,7 +798,7 @@ int __compute_return_epc_for_insn(struct pt_regs *regs, /* Compact branch: BEQZC || JIC */ regs->cp0_epc += 8; break; - case bnezcjialc_op: + case pop76_op: if (!cpu_has_mips_r6) { ret = -SIGILL; break; diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c index d96e912b9d44..1bbf16581f19 100644 --- a/arch/mips/math-emu/cp1emu.c +++ b/arch/mips/math-emu/cp1emu.c @@ -683,14 +683,14 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn, dec_insn.next_pc_inc; return 1; - case beqzcjic_op: + case pop66_op: if (!cpu_has_mips_r6) break; *contpc = regs->cp0_epc + dec_insn.pc_inc + dec_insn.next_pc_inc; return 1; - case bnezcjialc_op: + case pop76_op: if (!cpu_has_mips_r6) break; if (!insn.i_format.rs) From 1b492600068d5fbd033196ce2bdb28735a23747e Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Mon, 4 Jul 2016 19:35:08 +0100 Subject: [PATCH 222/302] MIPS: inst.h: Rename cbcond{0,1}_op to pop{1,3}0_op The opcodes currently defined in inst.h as cbcond0_op & cbcond1_op are actually defined in the MIPS base instruction set manuals as pop10 & pop30 respectively. Rename them as such, for consistency with the documentation. Signed-off-by: Paul Burton Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: linux-mips@linux-mips.org Signed-off-by: Paolo Bonzini --- arch/mips/include/uapi/asm/inst.h | 4 ++-- arch/mips/kernel/branch.c | 4 ++-- arch/mips/math-emu/cp1emu.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h index 3fc00e7b33c4..77429d1622b3 100644 --- a/arch/mips/include/uapi/asm/inst.h +++ b/arch/mips/include/uapi/asm/inst.h @@ -21,11 +21,11 @@ enum major_op { spec_op, bcond_op, j_op, jal_op, beq_op, bne_op, blez_op, bgtz_op, - addi_op, cbcond0_op = addi_op, addiu_op, slti_op, sltiu_op, + addi_op, pop10_op = addi_op, addiu_op, slti_op, sltiu_op, andi_op, ori_op, xori_op, lui_op, cop0_op, cop1_op, cop2_op, cop1x_op, beql_op, bnel_op, blezl_op, bgtzl_op, - daddi_op, cbcond1_op = daddi_op, daddiu_op, ldl_op, ldr_op, + daddi_op, pop30_op = daddi_op, daddiu_op, ldl_op, ldr_op, spec2_op, jalx_op, mdmx_op, msa_op = mdmx_op, spec3_op, lb_op, lh_op, lwl_op, lw_op, lbu_op, lhu_op, lwr_op, lwu_op, diff --git a/arch/mips/kernel/branch.c b/arch/mips/kernel/branch.c index fb9ed96d7858..46c227fc98f5 100644 --- a/arch/mips/kernel/branch.c +++ b/arch/mips/kernel/branch.c @@ -809,8 +809,8 @@ int __compute_return_epc_for_insn(struct pt_regs *regs, regs->cp0_epc += 8; break; #endif - case cbcond0_op: - case cbcond1_op: + case pop10_op: + case pop30_op: /* Only valid for MIPS R6 */ if (!cpu_has_mips_r6) { ret = -SIGILL; diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c index 1bbf16581f19..6dc07fba187f 100644 --- a/arch/mips/math-emu/cp1emu.c +++ b/arch/mips/math-emu/cp1emu.c @@ -627,8 +627,8 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn, dec_insn.pc_inc + dec_insn.next_pc_inc; return 1; - case cbcond0_op: - case cbcond1_op: + case pop10_op: + case pop30_op: if (!cpu_has_mips_r6) break; if (insn.i_format.rt && !insn.i_format.rs) From d14740fed8f12beb59d3087df985097c008e868e Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 4 Jul 2016 19:35:09 +0100 Subject: [PATCH 223/302] MIPS: KVM: Fix fpu.S misassembly with r6 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit __kvm_save_fpu and __kvm_restore_fpu use .set mips64r2 so that they can access the odd FPU registers as well as the even, however this causes misassembly of the return instruction on MIPSr6. Fix by replacing .set mips64r2 with .set fp=64, which doesn't change the architecture revision. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim KrÄmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/fpu.S | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/mips/kvm/fpu.S b/arch/mips/kvm/fpu.S index 531fbf5131c0..16f17c6390dd 100644 --- a/arch/mips/kvm/fpu.S +++ b/arch/mips/kvm/fpu.S @@ -14,13 +14,16 @@ #include #include +/* preprocessor replaces the fp in ".set fp=64" with $30 otherwise */ +#undef fp + .set noreorder .set noat LEAF(__kvm_save_fpu) .set push - .set mips64r2 SET_HARDFLOAT + .set fp=64 mfc0 t0, CP0_STATUS sll t0, t0, 5 # is Status.FR set? bgez t0, 1f # no: skip odd doubles @@ -63,8 +66,8 @@ LEAF(__kvm_save_fpu) LEAF(__kvm_restore_fpu) .set push - .set mips64r2 SET_HARDFLOAT + .set fp=64 mfc0 t0, CP0_STATUS sll t0, t0, 5 # is Status.FR set? bgez t0, 1f # no: skip odd doubles From d85ebff0073c783f0c74dc0e08c348f6f2d807c7 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 4 Jul 2016 19:35:10 +0100 Subject: [PATCH 224/302] MIPS: KVM: Fix pre-r6 ll/sc instructions on r6 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The atomic KVM register access macros in kvm_host.h (for the guest Cause register with KVM in trap & emulate mode) use ll/sc instructions, however they still .set mips3, which causes pre-MIPSr6 instruction encodings to be emitted, even for a MIPSr6 build. Fix it to use MIPS_ISA_ARCH_LEVEL as other parts of arch/mips already do. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim KrÄmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/kvm_host.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index b32785543787..b54bcadd8aec 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -400,7 +400,7 @@ static inline void _kvm_atomic_set_c0_guest_reg(unsigned long *reg, unsigned long temp; do { __asm__ __volatile__( - " .set mips3 \n" + " .set "MIPS_ISA_ARCH_LEVEL" \n" " " __LL "%0, %1 \n" " or %0, %2 \n" " " __SC "%0, %1 \n" @@ -416,7 +416,7 @@ static inline void _kvm_atomic_clear_c0_guest_reg(unsigned long *reg, unsigned long temp; do { __asm__ __volatile__( - " .set mips3 \n" + " .set "MIPS_ISA_ARCH_LEVEL" \n" " " __LL "%0, %1 \n" " and %0, %2 \n" " " __SC "%0, %1 \n" @@ -433,7 +433,7 @@ static inline void _kvm_atomic_change_c0_guest_reg(unsigned long *reg, unsigned long temp; do { __asm__ __volatile__( - " .set mips3 \n" + " .set "MIPS_ISA_ARCH_LEVEL" \n" " " __LL "%0, %1 \n" " and %0, %2 \n" " or %0, %3 \n" From 70e92c7ee94094d2db8bfe225a8c9b1bde89c26d Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 4 Jul 2016 19:35:11 +0100 Subject: [PATCH 225/302] MIPS: KVM: Don't save/restore lo/hi for r6 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MIPSr6 doesn't have lo/hi registers, so don't bother saving or restoring them, and don't expose them to userland with the KVM ioctl interface either. In fact the lo/hi registers aren't callee saved in the MIPS ABIs anyway, so there is no need to preserve the host lo/hi values at all when transitioning to and from the guest (which happens via a function call). Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim KrÄmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/entry.c | 16 ++++------------ arch/mips/kvm/mips.c | 6 ++++++ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index de8b6ec5573f..75ba7c2ecb3d 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -178,12 +178,6 @@ void *kvm_mips_build_vcpu_run(void *addr) UASM_i_SW(&p, i, offsetof(struct pt_regs, regs[i]), K1); } - /* Save hi/lo */ - uasm_i_mflo(&p, V0); - UASM_i_SW(&p, V0, offsetof(struct pt_regs, lo), K1); - uasm_i_mfhi(&p, V1); - UASM_i_SW(&p, V1, offsetof(struct pt_regs, hi), K1); - /* Save host status */ uasm_i_mfc0(&p, V0, C0_STATUS); UASM_i_SW(&p, V0, offsetof(struct pt_regs, cp0_status), K1); @@ -307,12 +301,14 @@ static void *kvm_mips_build_enter_guest(void *addr) UASM_i_LW(&p, i, offsetof(struct kvm_vcpu_arch, gprs[i]), K1); } +#ifndef CONFIG_CPU_MIPSR6 /* Restore hi/lo */ UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, hi), K1); uasm_i_mthi(&p, K0); UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, lo), K1); uasm_i_mtlo(&p, K0); +#endif /* Restore the guest's k0/k1 registers */ UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1); @@ -408,12 +404,14 @@ void *kvm_mips_build_exit(void *addr) UASM_i_SW(&p, i, offsetof(struct kvm_vcpu_arch, gprs[i]), K1); } +#ifndef CONFIG_CPU_MIPSR6 /* We need to save hi/lo and restore them on the way out */ uasm_i_mfhi(&p, T0); UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, hi), K1); uasm_i_mflo(&p, T0); UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, lo), K1); +#endif /* Finally save guest k1 to VCPU */ uasm_i_ehb(&p); @@ -663,12 +661,6 @@ static void *kvm_mips_build_ret_to_host(void *addr) UASM_i_LW(&p, i, offsetof(struct pt_regs, regs[i]), K1); } - UASM_i_LW(&p, K0, offsetof(struct pt_regs, hi), K1); - uasm_i_mthi(&p, K0); - - UASM_i_LW(&p, K0, offsetof(struct pt_regs, lo), K1); - uasm_i_mtlo(&p, K0); - /* Restore RDHWR access */ UASM_i_LA_mostly(&p, K0, (long)&hwrena); uasm_i_lw(&p, K0, uasm_rel_lo((long)&hwrena), K0); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 7c76768ff364..414b00074e29 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -521,8 +521,10 @@ static u64 kvm_mips_get_one_regs[] = { KVM_REG_MIPS_R30, KVM_REG_MIPS_R31, +#ifndef CONFIG_CPU_MIPSR6 KVM_REG_MIPS_HI, KVM_REG_MIPS_LO, +#endif KVM_REG_MIPS_PC, KVM_REG_MIPS_CP0_INDEX, @@ -666,12 +668,14 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_R0 ... KVM_REG_MIPS_R31: v = (long)vcpu->arch.gprs[reg->id - KVM_REG_MIPS_R0]; break; +#ifndef CONFIG_CPU_MIPSR6 case KVM_REG_MIPS_HI: v = (long)vcpu->arch.hi; break; case KVM_REG_MIPS_LO: v = (long)vcpu->arch.lo; break; +#endif case KVM_REG_MIPS_PC: v = (long)vcpu->arch.pc; break; @@ -887,12 +891,14 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_R1 ... KVM_REG_MIPS_R31: vcpu->arch.gprs[reg->id - KVM_REG_MIPS_R0] = v; break; +#ifndef CONFIG_CPU_MIPSR6 case KVM_REG_MIPS_HI: vcpu->arch.hi = v; break; case KVM_REG_MIPS_LO: vcpu->arch.lo = v; break; +#endif case KVM_REG_MIPS_PC: vcpu->arch.pc = v; break; From 2e0badfaac234ef3ed6b5397ff208d218cd450fb Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 4 Jul 2016 19:35:12 +0100 Subject: [PATCH 226/302] MIPS: KVM: Support r6 compact branch emulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support in KVM for emulation of instructions in the forbidden slot of MIPSr6 compact branches. If we hit an exception on the forbidden slot, then the branch must not have been taken, which makes calculation of the resume PC trivial. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim KrÄmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/emulate.c | 52 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 5f0354c80c8e..f0fa9e956056 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -161,9 +161,12 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, nextpc = epc; break; - case blez_op: /* not really i_format */ - case blezl_op: - /* rt field assumed to be zero */ + case blez_op: /* POP06 */ +#ifndef CONFIG_CPU_MIPSR6 + case blezl_op: /* removed in R6 */ +#endif + if (insn.i_format.rt != 0) + goto compact_branch; if ((long)arch->gprs[insn.i_format.rs] <= 0) epc = epc + 4 + (insn.i_format.simmediate << 2); else @@ -171,9 +174,12 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, nextpc = epc; break; - case bgtz_op: - case bgtzl_op: - /* rt field assumed to be zero */ + case bgtz_op: /* POP07 */ +#ifndef CONFIG_CPU_MIPSR6 + case bgtzl_op: /* removed in R6 */ +#endif + if (insn.i_format.rt != 0) + goto compact_branch; if ((long)arch->gprs[insn.i_format.rs] > 0) epc = epc + 4 + (insn.i_format.simmediate << 2); else @@ -185,6 +191,40 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, case cop1_op: kvm_err("%s: unsupported cop1_op\n", __func__); break; + +#ifdef CONFIG_CPU_MIPSR6 + /* R6 added the following compact branches with forbidden slots */ + case blezl_op: /* POP26 */ + case bgtzl_op: /* POP27 */ + /* only rt == 0 isn't compact branch */ + if (insn.i_format.rt != 0) + goto compact_branch; + break; + case pop10_op: + case pop30_op: + /* only rs == rt == 0 is reserved, rest are compact branches */ + if (insn.i_format.rs != 0 || insn.i_format.rt != 0) + goto compact_branch; + break; + case pop66_op: + case pop76_op: + /* only rs == 0 isn't compact branch */ + if (insn.i_format.rs != 0) + goto compact_branch; + break; +compact_branch: + /* + * If we've hit an exception on the forbidden slot, then + * the branch must not have been taken. + */ + epc += 8; + nextpc = epc; + break; +#else +compact_branch: + /* Compact branches not supported before R6 */ + break; +#endif } return nextpc; From 5cc4aafced42d7ece3d20650bf6ca2a165e6fca3 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 4 Jul 2016 19:35:13 +0100 Subject: [PATCH 227/302] MIPS: KVM: Recognise r6 CACHE encoding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recognise the new MIPSr6 CACHE instruction encoding rather than the pre-r6 one when an r6 kernel is being built. A SPECIAL3 opcode is used and the immediate field is reduced to 9 bits wide since MIPSr6. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim KrÄmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/dyntrans.c | 5 ++++- arch/mips/kvm/emulate.c | 21 ++++++++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c index 8a1833b9eb38..91ebd2b6034f 100644 --- a/arch/mips/kvm/dyntrans.c +++ b/arch/mips/kvm/dyntrans.c @@ -72,7 +72,10 @@ int kvm_mips_trans_cache_va(union mips_instruction inst, u32 *opc, synci_inst.i_format.opcode = bcond_op; synci_inst.i_format.rs = inst.i_format.rs; synci_inst.i_format.rt = synci_op; - synci_inst.i_format.simmediate = inst.i_format.simmediate; + if (cpu_has_mips_r6) + synci_inst.i_format.simmediate = inst.spec3_format.simmediate; + else + synci_inst.i_format.simmediate = inst.i_format.simmediate; return kvm_mips_trans_replace(vcpu, opc, synci_inst); } diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index f0fa9e956056..62e6a7b313ae 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1601,7 +1601,10 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, base = inst.i_format.rs; op_inst = inst.i_format.rt; - offset = inst.i_format.simmediate; + if (cpu_has_mips_r6) + offset = inst.spec3_format.simmediate; + else + offset = inst.i_format.simmediate; cache = op_inst & CacheOp_Cache; op = op_inst & CacheOp_Op; @@ -1764,11 +1767,27 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, er = kvm_mips_emulate_load(inst, cause, run, vcpu); break; +#ifndef CONFIG_CPU_MIPSR6 case cache_op: ++vcpu->stat.cache_exits; trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE); er = kvm_mips_emulate_cache(inst, opc, cause, run, vcpu); break; +#else + case spec3_op: + switch (inst.spec3_format.func) { + case cache6_op: + ++vcpu->stat.cache_exits; + trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE); + er = kvm_mips_emulate_cache(inst, opc, cause, run, + vcpu); + break; + default: + goto unknown; + }; + break; +unknown: +#endif default: kvm_err("Instruction emulation not supported (%p/%#x)\n", opc, From 8eeab81c3d55ba41ae68888b13a1a34893104e12 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 4 Jul 2016 19:35:14 +0100 Subject: [PATCH 228/302] MIPS: KVM: Decode RDHWR more strictly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When KVM emulates the RDHWR instruction, decode the instruction more strictly. The rs field (bits 25:21) should be zero, as should bits 10:9. Bits 8:6 is the register select field in MIPSr6, so we aren't strict about those bits (no other operations should use that encoding space). Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim KrÄmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/emulate.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 62e6a7b313ae..be18dfe9ecaa 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -2357,7 +2357,9 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, } if (inst.r_format.opcode == spec3_op && - inst.r_format.func == rdhwr_op) { + inst.r_format.func == rdhwr_op && + inst.r_format.rs == 0 && + (inst.r_format.re >> 3) == 0) { int usermode = !KVM_GUEST_KERNEL_MODE(vcpu); int rd = inst.r_format.rd; int rt = inst.r_format.rt; From 8426097258c8092f8f3f7a5c420d3809e99b0769 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 4 Jul 2016 19:35:15 +0100 Subject: [PATCH 229/302] MIPS: KVM: Emulate generic QEMU machine on r6 T&E MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Default the guest PRId register to represent a generic QEMU machine instead of a 24kc on MIPSr6. 24kc isn't supported by r6 Linux kernels. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: Radim KrÄmář Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/trap_emul.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 00e8dc3d36cb..091553942bcb 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -431,9 +431,15 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) /* * Arch specific stuff, set up config registers properly so that the - * guest will come up as expected, for now we simulate a MIPS 24kc + * guest will come up as expected */ +#ifndef CONFIG_CPU_MIPSR6 + /* r2-r5, simulate a MIPS 24kc */ kvm_write_c0_guest_prid(cop0, 0x00019300); +#else + /* r6+, simulate a generic QEMU machine */ + kvm_write_c0_guest_prid(cop0, 0x00010000); +#endif /* * Have config1, Cacheable, noncoherent, write-back, write allocate. * Endianness, arch revision & virtually tagged icache should match From 1c17c3e6bfe61846d1120291b2f486d00bc0d18f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 8 Jul 2016 11:53:38 +0200 Subject: [PATCH 230/302] KVM: VMX: reflect broken preemption timer in vmcs_config Simplify cpu_has_vmx_preemption_timer. This is consistent with the rest of setup_vmcs_config and preparatory for the next patch. Tested-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e564fa2c7ac8..00ce07e6f2ca 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1121,9 +1121,6 @@ static inline bool cpu_has_broken_vmx_preemption_timer(void) static inline bool cpu_has_vmx_preemption_timer(void) { - if (cpu_has_broken_vmx_preemption_timer()) - return false; - return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VMX_PREEMPTION_TIMER; } @@ -3407,6 +3404,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) &_pin_based_exec_control) < 0) return -EIO; + if (cpu_has_broken_vmx_preemption_timer()) + _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; From 55123e3c862d98dc4fbcade38d158c32c022afd8 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 6 Jul 2016 18:29:58 +0800 Subject: [PATCH 231/302] KVM: nVMX: avoid incorrect preemption timer vmexit in nested guest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The preemption timer for nested VMX is emulated by hrtimer which is started on L2 entry, stopped on L2 exit and evaluated via the check_nested_events hook. However, nested_vmx_exit_handled is always returning true for preemption timer vmexit. Then, the L1 preemption timer vmexit is captured and be treated as a L2 preemption timer vmexit, causing NULL pointer dereferences or worse in the L1 guest's vmexit handler: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [< (null)>] (null) PGD 0 Oops: 0010 [#1] SMP Call Trace: ? kvm_lapic_expired_hv_timer+0x47/0x90 [kvm] handle_preemption_timer+0xe/0x20 [kvm_intel] vmx_handle_exit+0x169/0x15a0 [kvm_intel] ? kvm_arch_vcpu_ioctl_run+0xd5d/0x19d0 [kvm] kvm_arch_vcpu_ioctl_run+0xdee/0x19d0 [kvm] ? kvm_arch_vcpu_ioctl_run+0xd5d/0x19d0 [kvm] ? vcpu_load+0x1c/0x60 [kvm] ? kvm_arch_vcpu_load+0x57/0x260 [kvm] kvm_vcpu_ioctl+0x2d3/0x7c0 [kvm] do_vfs_ioctl+0x96/0x6a0 ? __fget_light+0x2a/0x90 SyS_ioctl+0x79/0x90 do_syscall_64+0x68/0x180 entry_SYSCALL64_slow_path+0x25/0x25 Code: Bad RIP value. RIP [< (null)>] (null) RSP CR2: 0000000000000000 ---[ end trace 9c70c48b1a2bc66e ]--- This can be reproduced readily by preemption timer enabled on L0 and disabled on L1. Return false since preemption timer vmexits must never be reflected to L2. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Yunhong Jiang Cc: Jan Kiszka Cc: Haozhong Zhang Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 00ce07e6f2ca..0048be79c7b9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8040,6 +8040,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); case EXIT_REASON_PCOMMIT: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT); + case EXIT_REASON_PREEMPTION_TIMER: + return false; default: return true; } From 9314006db8b781715658cd6a28994d84ccce5dee Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 6 Jul 2016 13:23:51 +0200 Subject: [PATCH 232/302] KVM: nVMX: keep preemption timer enabled during L2 execution Because the vmcs12 preemption timer is emulated through a separate hrtimer, we can keep on using the preemption timer in the vmcs02 to emulare L1's TSC deadline timer. However, the corresponding bit in the pin-based execution control field must be kept consistent between vmcs01 and vmcs02. On vmentry we copy it into the vmcs02; on vmexit the preemption timer must be disabled in the vmcs01 if a preemption timer vmexit happened while in guest mode. The preemption timer value in the vmcs02 is set by vmx_vcpu_run, so it need not be considered in prepare_vmcs02. Cc: Yunhong Jiang Cc: Haozhong Zhang Tested-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0048be79c7b9..8cda4449a60e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9796,9 +9796,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(VMCS_LINK_POINTER, -1ull); exec_control = vmcs12->pin_based_vm_exec_control; - exec_control |= vmcs_config.pin_based_exec_ctrl; - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + /* Preemption timer setting is only taken from vmcs01. */ + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + exec_control |= vmcs_config.pin_based_exec_ctrl; + if (vmx->hv_deadline_tsc == -1) + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + + /* Posted interrupts setting is only taken from vmcs12. */ if (nested_cpu_has_posted_intr(vmcs12)) { /* * Note that we use L0's vector here and in @@ -10727,8 +10732,14 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, load_vmcs12_host_state(vcpu, vmcs12); - /* Update TSC_OFFSET if TSC was changed while L2 ran */ + /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); + if (vmx->hv_deadline_tsc == -1) + vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + else + vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); /* This is needed for same reason as it was needed in prepare_vmcs02 */ vmx->host_rsp = 0; From 8391ce447f18476273331399a7f5930e5494bf46 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 7 Jul 2016 14:58:33 +0200 Subject: [PATCH 233/302] KVM: VMX: introduce vm_{entry,exit}_control_reset_shadow There is no reason to read the entry/exit control fields of the VMCS and immediately write back the same value. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8cda4449a60e..e51503063181 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1672,6 +1672,11 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) __vmcs_writel(field, __vmcs_readl(field) | mask); } +static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx) +{ + vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS); +} + static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) { vmcs_write32(VM_ENTRY_CONTROLS, val); @@ -1700,6 +1705,11 @@ static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val) vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val); } +static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx) +{ + vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS); +} + static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val) { vmcs_write32(VM_EXIT_CONTROLS, val); @@ -10722,8 +10732,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs12->vm_exit_intr_error_code, KVM_ISA_VMX); - vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS)); - vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS)); + vm_entry_controls_reset_shadow(vmx); + vm_exit_controls_reset_shadow(vmx); vmx_segment_cache_clear(vmx); /* if no vmcs02 cache requested, remove the one we used */ From 8d5cf1610da526c3c1286bd7b3ac9f35f96ed43d Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Tue, 12 Jul 2016 18:18:48 -0400 Subject: [PATCH 234/302] kvm: mmu: extend the is_present check to 32 bits This is safe because this function is called on host controlled page table and non-present/non-MMIO sptes never use bits 1..31. For the EPT case, this ensures that cases where only the execute bit is set is marked valid. Signed-off-by: Bandan Das Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 837bf23c5b06..55a20c0524fe 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -304,7 +304,7 @@ static int is_nx(struct kvm_vcpu *vcpu) static int is_shadow_present_pte(u64 pte) { - return pte & PT_PRESENT_MASK && !is_mmio_spte(pte); + return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte); } static int is_large_pte(u64 pte) From 812f30b234539ccb0139f92dfdbec1e8158cf535 Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Tue, 12 Jul 2016 18:18:50 -0400 Subject: [PATCH 235/302] kvm: mmu: remove is_present_gpte() We have two versions of the above function. To prevent confusion and bugs in the future, remove the non-FNAME version entirely and replace all calls with the actual check. Signed-off-by: Bandan Das Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/mmu.h | 5 ----- arch/x86/kvm/paging_tmpl.h | 2 +- arch/x86/kvm/x86.c | 2 +- 4 files changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 55a20c0524fe..6471f8788bd2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3189,7 +3189,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) MMU_WARN_ON(VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); - if (!is_present_gpte(pdptr)) { + if (!(pdptr & PT_PRESENT_MASK)) { vcpu->arch.mmu.pae_root[i] = 0; continue; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 66b33b96a31b..ddc56e91f2e4 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -93,11 +93,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) return kvm_mmu_load(vcpu); } -static inline int is_present_gpte(unsigned long pte) -{ - return pte & PT_PRESENT_MASK; -} - /* * Currently, we have two sorts of write-protection, a) the first one * write-protects guest page to sync the guest modification, b) another one is diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index bc019f70e0b6..fda5b64ae8f1 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -131,7 +131,7 @@ static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) static inline int FNAME(is_present_gpte)(unsigned long pte) { #if PTTYPE != PTTYPE_EPT - return is_present_gpte(pte); + return pte & PT_PRESENT_MASK; #else return pte & 7; #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0cc6cf834cdd..bb6e8bfaee3b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -540,7 +540,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) goto out; } for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { - if (is_present_gpte(pdpte[i]) && + if ((pdpte[i] & PT_PRESENT_MASK) && (pdpte[i] & vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) { ret = 0; From ffb128c89b77b44da18ccf51844a8e750e2c427a Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Tue, 12 Jul 2016 18:18:49 -0400 Subject: [PATCH 236/302] kvm: mmu: don't set the present bit unconditionally To support execute only mappings on behalf of L1 hypervisors, we need to teach set_spte() to honor all three of L1's XWR bits. As a start, add a new variable "shadow_present_mask" that will be set for non-EPT shadow paging and clear for EPT. Signed-off-by: Bandan Das Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 13 +++++++------ arch/x86/kvm/vmx.c | 1 + arch/x86/kvm/x86.c | 4 ++-- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7a628fb6a2c2..d0845b289adb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1031,7 +1031,7 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask); + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6471f8788bd2..b8628e905806 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -175,6 +175,7 @@ static u64 __read_mostly shadow_user_mask; static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; +static u64 __read_mostly shadow_present_mask; static void mmu_spte_set(u64 *sptep, u64 spte); static void mmu_free_roots(struct kvm_vcpu *vcpu); @@ -282,13 +283,14 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) } void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask) + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask) { shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; shadow_dirty_mask = dirty_mask; shadow_nx_mask = nx_mask; shadow_x_mask = x_mask; + shadow_present_mask = p_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); @@ -2245,10 +2247,9 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, { u64 spte; - BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK || - VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); + BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); - spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | + spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK | shadow_user_mask | shadow_x_mask | shadow_accessed_mask; mmu_spte_set(sptep, spte); @@ -2515,13 +2516,13 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, kvm_pfn_t pfn, bool speculative, bool can_unsync, bool host_writable) { - u64 spte; + u64 spte = 0; int ret = 0; if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; - spte = PT_PRESENT_MASK; + spte |= shadow_present_mask; if (!speculative) spte |= shadow_accessed_mask; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e51503063181..a75d09d2a799 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6473,6 +6473,7 @@ static __init int hardware_setup(void) (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, 0ull, VMX_EPT_EXECUTABLE_MASK); + 0ull, VMX_EPT_EXECUTABLE_MASK, VMX_EPT_READABLE_MASK); ept_set_mmio_spte_mask(); kvm_enable_tdp(); } else diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bb6e8bfaee3b..0c1fbb8d9d11 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5878,8 +5878,8 @@ int kvm_arch_init(void *opaque) kvm_x86_ops = ops; kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, - PT_DIRTY_MASK, PT64_NX_MASK, 0); - + PT_DIRTY_MASK, PT64_NX_MASK, 0, + PT_PRESENT_MASK); kvm_timer_init(); perf_register_guest_info_callbacks(&kvm_guest_cbs); From d95c55687e11febe3ab1aacfe82b58b1822c52c4 Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Tue, 12 Jul 2016 18:18:51 -0400 Subject: [PATCH 237/302] kvm: mmu: track read permission explicitly for shadow EPT page tables To support execute only mappings on behalf of L1 hypervisors, reuse ACC_USER_MASK to signify if the L1 hypervisor has the R bit set. For the nested EPT case, we assumed that the U bit was always set since there was no equivalent in EPT page tables. Strictly speaking, this was not necessary because handle_ept_violation never set PFERR_USER_MASK in the error code (uf=0 in the parlance of update_permission_bitmask). We now have to set both U and UF correctly, respectively in FNAME(gpte_access) and in handle_ept_violation. Also in handle_ept_violation bit 3 of the exit qualification is not enough to detect a present PTE; all three bits 3-5 have to be checked. Signed-off-by: Bandan Das Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 10 +++++++--- arch/x86/kvm/paging_tmpl.h | 8 +++++++- arch/x86/kvm/vmx.c | 15 +++++++++------ 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b8628e905806..3041902ec827 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2522,6 +2522,12 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; + /* + * For the EPT case, shadow_present_mask is 0 if hardware + * supports exec-only page table entries. In that case, + * ACC_USER_MASK and shadow_user_mask are used to represent + * read access. See FNAME(gpte_access) in paging_tmpl.h. + */ spte |= shadow_present_mask; if (!speculative) spte |= shadow_accessed_mask; @@ -3915,9 +3921,7 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, * clearer. */ smap = cr4_smap && u && !uf && !ff; - } else - /* Not really needed: no U/S accesses on ept */ - u = 1; + } fault = (ff && !x) || (uf && !u) || (wf && !w) || (smapf && smap); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index fda5b64ae8f1..a01105485315 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -181,13 +181,19 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, return true; } +/* + * For PTTYPE_EPT, a page table can be executable but not readable + * on supported processors. Therefore, set_spte does not automatically + * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK + * to signify readability since it isn't used in the EPT case + */ static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) { unsigned access; #if PTTYPE == PTTYPE_EPT access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) | ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) | - ACC_USER_MASK; + ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0); #else BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK); BUILD_BUG_ON(ACC_EXEC_MASK != 1); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a75d09d2a799..bd7d60f66b93 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6117,12 +6117,14 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(gpa, exit_qualification); - /* It is a write fault? */ - error_code = exit_qualification & PFERR_WRITE_MASK; + /* it is a read fault? */ + error_code = (exit_qualification << 2) & PFERR_USER_MASK; + /* it is a write fault? */ + error_code |= exit_qualification & PFERR_WRITE_MASK; /* It is a fetch fault? */ error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK; /* ept page table is present? */ - error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK; + error_code |= (exit_qualification & 0x38) != 0; vcpu->arch.exit_qualification = exit_qualification; @@ -6469,11 +6471,12 @@ static __init int hardware_setup(void) vmx_disable_intercept_msr_write_x2apic(0x83f); if (enable_ept) { - kvm_mmu_set_mask_ptes(0ull, + kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, - 0ull, VMX_EPT_EXECUTABLE_MASK); - 0ull, VMX_EPT_EXECUTABLE_MASK, VMX_EPT_READABLE_MASK); + 0ull, VMX_EPT_EXECUTABLE_MASK, + cpu_has_vmx_ept_execute_only() ? + 0ull : VMX_EPT_READABLE_MASK); ept_set_mmio_spte_mask(); kvm_enable_tdp(); } else From 02120c45b07953ca4dfc19fa6ff90466efaf363f Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Tue, 12 Jul 2016 18:18:52 -0400 Subject: [PATCH 238/302] kvm: vmx: advertise support for ept execute only MMU now knows about execute only mappings, so advertise the feature to L1 hypervisors Signed-off-by: Bandan Das Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bd7d60f66b93..729e5f689097 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2794,6 +2794,9 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | VMX_EPT_INVEPT_BIT; + if (cpu_has_vmx_ept_execute_only()) + vmx->nested.nested_vmx_ept_caps |= + VMX_EPT_EXECUTE_ONLY_BIT; vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept; /* * For nested guests, we don't do anything specific From 757883de41eca292765578ef87c4f49453529bb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Tue, 12 Jul 2016 22:09:17 +0200 Subject: [PATCH 239/302] KVM: x86: bump KVM_SOFT_MAX_VCPUS to 240 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 240 has been well tested by Red Hat. Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d0845b289adb..5f90dce6fbd1 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -35,7 +35,7 @@ #include #define KVM_MAX_VCPUS 255 -#define KVM_SOFT_MAX_VCPUS 160 +#define KVM_SOFT_MAX_VCPUS 240 #define KVM_USER_MEM_SLOTS 509 /* memory slots that are not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 3 From 64aa47bfc45323040d5db8f30cbd6851f2606c7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Tue, 12 Jul 2016 22:09:18 +0200 Subject: [PATCH 240/302] KVM: x86: add kvm_apic_map_get_dest_lapic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm_irq_delivery_to_apic_fast and kvm_intr_is_single_vcpu_fast both compute the interrupt destination. Factor the code. 'struct kvm_lapic **dst = NULL' had to be added to silence GCC. GCC might complain about potential NULL access in the future, because it missed conditions that avoided uninitialized uses of dst. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 242 +++++++++++++++++++------------------------ 1 file changed, 104 insertions(+), 138 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 22a6474af220..2987843657db 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -671,14 +671,99 @@ static void kvm_apic_disabled_lapic_found(struct kvm *kvm) } } +/* Return true if the interrupt can be handled by using *bitmap as index mask + * for valid destinations in *dst array. + * Return false if kvm_apic_map_get_dest_lapic did nothing useful. + * Note: we may have zero kvm_lapic destinations when we return true, which + * means that the interrupt should be dropped. In this case, *bitmap would be + * zero and *dst undefined. + */ +static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, + struct kvm_lapic **src, struct kvm_lapic_irq *irq, + struct kvm_apic_map *map, struct kvm_lapic ***dst, + unsigned long *bitmap) +{ + int i, lowest; + bool x2apic_ipi; + u16 cid; + + if (irq->shorthand == APIC_DEST_SELF && src) { + *dst = src; + *bitmap = 1; + return true; + } else if (irq->shorthand) + return false; + + x2apic_ipi = src && *src && apic_x2apic_mode(*src); + if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST)) + return false; + + if (!map) + return false; + + if (irq->dest_mode == APIC_DEST_PHYSICAL) { + if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) { + *bitmap = 0; + } else { + *dst = &map->phys_map[irq->dest_id]; + *bitmap = 1; + } + return true; + } + + if (!kvm_apic_logical_map_valid(map)) + return false; + + apic_logical_id(map, irq->dest_id, &cid, (u16 *)bitmap); + + if (cid >= ARRAY_SIZE(map->logical_map)) { + *bitmap = 0; + return true; + } + + *dst = map->logical_map[cid]; + + if (!kvm_lowest_prio_delivery(irq)) + return true; + + if (!kvm_vector_hashing_enabled()) { + lowest = -1; + for_each_set_bit(i, bitmap, 16) { + if (!(*dst)[i]) + continue; + if (lowest < 0) + lowest = i; + else if (kvm_apic_compare_prio((*dst)[i]->vcpu, + (*dst)[lowest]->vcpu) < 0) + lowest = i; + } + } else { + if (!*bitmap) + return true; + + lowest = kvm_vector_to_index(irq->vector, hweight16(*bitmap), + bitmap, 16); + + if (!(*dst)[lowest]) { + kvm_apic_disabled_lapic_found(kvm); + *bitmap = 0; + return true; + } + } + + *bitmap = (lowest >= 0) ? 1 << lowest : 0; + + return true; +} + bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map) { struct kvm_apic_map *map; - unsigned long bitmap = 1; - struct kvm_lapic **dst; + unsigned long bitmap; + struct kvm_lapic **dst = NULL; int i; - bool ret, x2apic_ipi; + bool ret; *r = -1; @@ -687,86 +772,19 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, return true; } - if (irq->shorthand) - return false; - - x2apic_ipi = src && apic_x2apic_mode(src); - if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST)) - return false; - - ret = true; rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); - if (!map) { - ret = false; - goto out; - } - - if (irq->dest_mode == APIC_DEST_PHYSICAL) { - if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) - goto out; - - dst = &map->phys_map[irq->dest_id]; - } else { - u16 cid; - - if (!kvm_apic_logical_map_valid(map)) { - ret = false; - goto out; + ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap); + if (ret) + for_each_set_bit(i, &bitmap, 16) { + if (!dst[i]) + continue; + if (*r < 0) + *r = 0; + *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); } - apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap); - - if (cid >= ARRAY_SIZE(map->logical_map)) - goto out; - - dst = map->logical_map[cid]; - - if (!kvm_lowest_prio_delivery(irq)) - goto set_irq; - - if (!kvm_vector_hashing_enabled()) { - int l = -1; - for_each_set_bit(i, &bitmap, 16) { - if (!dst[i]) - continue; - if (l < 0) - l = i; - else if (kvm_apic_compare_prio(dst[i]->vcpu, - dst[l]->vcpu) < 0) - l = i; - } - bitmap = (l >= 0) ? 1 << l : 0; - } else { - int idx; - unsigned int dest_vcpus; - - dest_vcpus = hweight16(bitmap); - if (dest_vcpus == 0) - goto out; - - idx = kvm_vector_to_index(irq->vector, - dest_vcpus, &bitmap, 16); - - if (!dst[idx]) { - kvm_apic_disabled_lapic_found(kvm); - goto out; - } - - bitmap = (idx >= 0) ? 1 << idx : 0; - } - } - -set_irq: - for_each_set_bit(i, &bitmap, 16) { - if (!dst[i]) - continue; - if (*r < 0) - *r = 0; - *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); - } -out: rcu_read_unlock(); return ret; } @@ -789,8 +807,9 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu) { struct kvm_apic_map *map; + unsigned long bitmap; + struct kvm_lapic **dst = NULL; bool ret = false; - struct kvm_lapic *dst = NULL; if (irq->shorthand) return false; @@ -798,69 +817,16 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); - if (!map) - goto out; + if (kvm_apic_map_get_dest_lapic(kvm, NULL, irq, map, &dst, &bitmap) && + hweight16(bitmap) == 1) { + unsigned long i = find_first_bit(&bitmap, 16); - if (irq->dest_mode == APIC_DEST_PHYSICAL) { - if (irq->dest_id == 0xFF) - goto out; - - if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) - goto out; - - dst = map->phys_map[irq->dest_id]; - if (dst && kvm_apic_present(dst->vcpu)) - *dest_vcpu = dst->vcpu; - else - goto out; - } else { - u16 cid; - unsigned long bitmap = 1; - int i, r = 0; - - if (!kvm_apic_logical_map_valid(map)) - goto out; - - apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap); - - if (cid >= ARRAY_SIZE(map->logical_map)) - goto out; - - if (kvm_vector_hashing_enabled() && - kvm_lowest_prio_delivery(irq)) { - int idx; - unsigned int dest_vcpus; - - dest_vcpus = hweight16(bitmap); - if (dest_vcpus == 0) - goto out; - - idx = kvm_vector_to_index(irq->vector, dest_vcpus, - &bitmap, 16); - - dst = map->logical_map[cid][idx]; - if (!dst) { - kvm_apic_disabled_lapic_found(kvm); - goto out; - } - - *dest_vcpu = dst->vcpu; - } else { - for_each_set_bit(i, &bitmap, 16) { - dst = map->logical_map[cid][i]; - if (++r == 2) - goto out; - } - - if (dst && kvm_apic_present(dst->vcpu)) - *dest_vcpu = dst->vcpu; - else - goto out; + if (dst[i]) { + *dest_vcpu = dst[i]->vcpu; + ret = true; } } - ret = true; -out: rcu_read_unlock(); return ret; } From e45115b62f9abb143a03036dbde05faf5864aa01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Tue, 12 Jul 2016 22:09:19 +0200 Subject: [PATCH 241/302] KVM: x86: use physical LAPIC array for logical x2APIC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Logical x2APIC IDs map injectively to physical x2APIC IDs, so we can reuse the physical array for them. This allows us to save space by sizing the logical maps according to the needs of xAPIC. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 6 ++- arch/x86/kvm/lapic.c | 69 +++++++++++++++++---------------- 2 files changed, 39 insertions(+), 36 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5f90dce6fbd1..623089c4e1a7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -683,8 +683,10 @@ struct kvm_apic_map { struct rcu_head rcu; u8 mode; struct kvm_lapic *phys_map[256]; - /* first index is cluster id second is cpu id in a cluster */ - struct kvm_lapic *logical_map[16][16]; + union { + struct kvm_lapic *xapic_flat_map[8]; + struct kvm_lapic *xapic_cluster_map[16][4]; + }; }; /* Hyper-V emulation context */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2987843657db..9880d03f533d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -115,26 +115,36 @@ static inline int apic_enabled(struct kvm_lapic *apic) (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) -/* The logical map is definitely wrong if we have multiple - * modes at the same time. (Physical map is always right.) - */ -static inline bool kvm_apic_logical_map_valid(struct kvm_apic_map *map) -{ - return !(map->mode & (map->mode - 1)); -} +static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, + u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) { + switch (map->mode) { + case KVM_APIC_MODE_X2APIC: { + u32 offset = (dest_id >> 16) * 16; + u32 max_apic_id = ARRAY_SIZE(map->phys_map) - 1; -static inline void -apic_logical_id(struct kvm_apic_map *map, u32 dest_id, u16 *cid, u16 *lid) -{ - unsigned lid_bits; + if (offset <= max_apic_id) { + u8 cluster_size = min(max_apic_id - offset + 1, 16U); - BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_CLUSTER != 4); - BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_FLAT != 8); - BUILD_BUG_ON(KVM_APIC_MODE_X2APIC != 16); - lid_bits = map->mode; + *cluster = &map->phys_map[offset]; + *mask = dest_id & (0xffff >> (16 - cluster_size)); + } else { + *mask = 0; + } - *cid = dest_id >> lid_bits; - *lid = dest_id & ((1 << lid_bits) - 1); + return true; + } + case KVM_APIC_MODE_XAPIC_FLAT: + *cluster = map->xapic_flat_map; + *mask = dest_id & 0xff; + return true; + case KVM_APIC_MODE_XAPIC_CLUSTER: + *cluster = map->xapic_cluster_map[dest_id >> 4]; + *mask = dest_id & 0xf; + return true; + default: + /* Not optimized. */ + return false; + } } static void recalculate_apic_map(struct kvm *kvm) @@ -152,7 +162,8 @@ static void recalculate_apic_map(struct kvm *kvm) kvm_for_each_vcpu(i, vcpu, kvm) { struct kvm_lapic *apic = vcpu->arch.apic; - u16 cid, lid; + struct kvm_lapic **cluster; + u16 mask; u32 ldr, aid; if (!kvm_apic_present(vcpu)) @@ -174,13 +185,11 @@ static void recalculate_apic_map(struct kvm *kvm) new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER; } - if (!kvm_apic_logical_map_valid(new)) + if (!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask)) continue; - apic_logical_id(new, ldr, &cid, &lid); - - if (lid && cid < ARRAY_SIZE(new->logical_map)) - new->logical_map[cid][ffs(lid) - 1] = apic; + if (mask) + cluster[ffs(mask) - 1] = apic; } out: old = rcu_dereference_protected(kvm->arch.apic_map, @@ -685,7 +694,6 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, { int i, lowest; bool x2apic_ipi; - u16 cid; if (irq->shorthand == APIC_DEST_SELF && src) { *dst = src; @@ -711,18 +719,11 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, return true; } - if (!kvm_apic_logical_map_valid(map)) + *bitmap = 0; + if (!kvm_apic_map_get_logical_dest(map, irq->dest_id, dst, + (u16 *)bitmap)) return false; - apic_logical_id(map, irq->dest_id, &cid, (u16 *)bitmap); - - if (cid >= ARRAY_SIZE(map->logical_map)) { - *bitmap = 0; - return true; - } - - *dst = map->logical_map[cid]; - if (!kvm_lowest_prio_delivery(irq)) return true; From 0ca52e7b81a37260c7edb823c8ac6a49c6280b5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Tue, 12 Jul 2016 22:09:20 +0200 Subject: [PATCH 242/302] KVM: x86: dynamic kvm_apic_map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit x2APIC supports up to 2^32-1 LAPICs, but most guest in coming years will probably has fewer VCPUs. Dynamic size saves memory at the cost of turning one constant into a variable. apic_map mutex had to be moved before allocation to avoid races with cpu hotplug. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/lapic.c | 18 +++++++++++++----- arch/x86/kvm/lapic.h | 2 +- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 623089c4e1a7..a2832cc3cb81 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -682,11 +682,12 @@ struct kvm_arch_memory_slot { struct kvm_apic_map { struct rcu_head rcu; u8 mode; - struct kvm_lapic *phys_map[256]; + u32 max_apic_id; union { struct kvm_lapic *xapic_flat_map[8]; struct kvm_lapic *xapic_cluster_map[16][4]; }; + struct kvm_lapic *phys_map[]; }; /* Hyper-V emulation context */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9880d03f533d..224fc1c5fcc6 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -120,7 +120,7 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, switch (map->mode) { case KVM_APIC_MODE_X2APIC: { u32 offset = (dest_id >> 16) * 16; - u32 max_apic_id = ARRAY_SIZE(map->phys_map) - 1; + u32 max_apic_id = map->max_apic_id; if (offset <= max_apic_id) { u8 cluster_size = min(max_apic_id - offset + 1, 16U); @@ -152,14 +152,22 @@ static void recalculate_apic_map(struct kvm *kvm) struct kvm_apic_map *new, *old = NULL; struct kvm_vcpu *vcpu; int i; - - new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL); + u32 max_id = 255; mutex_lock(&kvm->arch.apic_map_lock); + kvm_for_each_vcpu(i, vcpu, kvm) + if (kvm_apic_present(vcpu)) + max_id = max(max_id, kvm_apic_id(vcpu->arch.apic)); + + new = kzalloc(sizeof(struct kvm_apic_map) + + sizeof(struct kvm_lapic *) * (max_id + 1), GFP_KERNEL); + if (!new) goto out; + new->max_apic_id = max_id; + kvm_for_each_vcpu(i, vcpu, kvm) { struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_lapic **cluster; @@ -172,7 +180,7 @@ static void recalculate_apic_map(struct kvm *kvm) aid = kvm_apic_id(apic); ldr = kvm_lapic_get_reg(apic, APIC_LDR); - if (aid < ARRAY_SIZE(new->phys_map)) + if (aid <= new->max_apic_id) new->phys_map[aid] = apic; if (apic_x2apic_mode(apic)) { @@ -710,7 +718,7 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, return false; if (irq->dest_mode == APIC_DEST_PHYSICAL) { - if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) { + if (irq->dest_id > map->max_apic_id) { *bitmap = 0; } else { *dst = &map->phys_map[irq->dest_id]; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 336ba51bb16e..8d811139d2b3 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -200,7 +200,7 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); } -static inline int kvm_apic_id(struct kvm_lapic *apic) +static inline u32 kvm_apic_id(struct kvm_lapic *apic) { return (kvm_lapic_get_reg(apic, APIC_ID) >> 24) & 0xff; } From 3159d36ad799a117eb2f898de4c6b7777e4dc045 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Tue, 12 Jul 2016 22:09:21 +0200 Subject: [PATCH 243/302] KVM: x86: use generic function for MSI parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq_comm.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index dfb4c6476877..47ad681a33fd 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -388,21 +388,16 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, kvm->arch.nr_reserved_ioapic_pins); for (i = 0; i < nr_ioapic_pins; ++i) { hlist_for_each_entry(entry, &table->map[i], link) { - u32 dest_id, dest_mode; - bool level; + struct kvm_lapic_irq irq; if (entry->type != KVM_IRQ_ROUTING_MSI) continue; - dest_id = (entry->msi.address_lo >> 12) & 0xff; - dest_mode = (entry->msi.address_lo >> 2) & 0x1; - level = entry->msi.data & MSI_DATA_TRIGGER_LEVEL; - if (level && kvm_apic_match_dest(vcpu, NULL, 0, - dest_id, dest_mode)) { - u32 vector = entry->msi.data & 0xff; - __set_bit(vector, - ioapic_handled_vectors); - } + kvm_set_msi_irq(entry, &irq); + + if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0, + irq.dest_id, irq.dest_mode)) + __set_bit(irq.vector, ioapic_handled_vectors); } } srcu_read_unlock(&kvm->irq_srcu, idx); From a92e2543d6a8653a8ab45cf5df7ef07dafcf3f3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Tue, 12 Jul 2016 22:09:22 +0200 Subject: [PATCH 244/302] KVM: x86: use hardware-compatible format for APIC ID register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We currently always shift APIC ID as if APIC was in xAPIC mode. x2APIC mode wants to use more bits and storing a hardware-compabible value is the the sanest option. KVM API to set the lapic expects that bottom 8 bits of APIC ID are in top 8 bits of APIC_ID register, so the register needs to be shifted in x2APIC mode. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 52 +++++++++++++++++++++++++++++++------------- arch/x86/kvm/lapic.h | 12 +++++++--- arch/x86/kvm/x86.c | 10 +++++---- 3 files changed, 52 insertions(+), 22 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 224fc1c5fcc6..41089dbeeafc 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -227,7 +227,7 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) } } -static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) +static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id) { kvm_lapic_set_reg(apic, APIC_ID, id << 24); recalculate_apic_map(apic->vcpu->kvm); @@ -239,11 +239,11 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) recalculate_apic_map(apic->vcpu->kvm); } -static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u8 id) +static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id) { u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); - kvm_lapic_set_reg(apic, APIC_ID, id << 24); + kvm_lapic_set_reg(apic, APIC_ID, id); kvm_lapic_set_reg(apic, APIC_LDR, ldr); recalculate_apic_map(apic->vcpu->kvm); } @@ -1102,12 +1102,6 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) return 0; switch (offset) { - case APIC_ID: - if (apic_x2apic_mode(apic)) - val = kvm_apic_id(apic); - else - val = kvm_apic_id(apic) << 24; - break; case APIC_ARBPRI: apic_debug("Access APIC ARBPRI register which is for P6\n"); break; @@ -1465,7 +1459,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) switch (reg) { case APIC_ID: /* Local APIC ID */ if (!apic_x2apic_mode(apic)) - kvm_apic_set_id(apic, val >> 24); + kvm_apic_set_xapic_id(apic, val >> 24); else ret = 1; break; @@ -1769,7 +1763,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) hrtimer_cancel(&apic->lapic_timer.timer); if (!init_event) - kvm_apic_set_id(apic, vcpu->vcpu_id); + kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); kvm_apic_set_version(apic->vcpu); for (i = 0; i < KVM_APIC_LVT_NUM; i++) @@ -1990,17 +1984,43 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) return vector; } -void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, - struct kvm_lapic_state *s) +static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, + struct kvm_lapic_state *s, bool set) +{ + if (apic_x2apic_mode(vcpu->arch.apic)) { + u32 *id = (u32 *)(s->regs + APIC_ID); + + if (set) + *id >>= 24; + else + *id <<= 24; + } + + return 0; +} + +int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) +{ + memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s)); + return kvm_apic_state_fixup(vcpu, s, false); +} + +int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { struct kvm_lapic *apic = vcpu->arch.apic; + int r; + kvm_lapic_set_base(vcpu, vcpu->arch.apic_base); /* set SPIV separately to get count of SW disabled APICs right */ apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV))); + + r = kvm_apic_state_fixup(vcpu, s, true); + if (r) + return r; memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); - /* call kvm_apic_set_id() to put apic into apic_map */ - kvm_apic_set_id(apic, kvm_apic_id(apic)); + + recalculate_apic_map(vcpu->kvm); kvm_apic_set_version(vcpu); apic_update_ppr(apic); @@ -2026,6 +2046,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, kvm_rtc_eoi_tracking_restore_one(vcpu); vcpu->arch.apic_arb_prio = 0; + + return 0; } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 8d811139d2b3..f60d01c29d51 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -81,8 +81,8 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info); -void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, - struct kvm_lapic_state *s); +int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); +int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); @@ -202,7 +202,13 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) static inline u32 kvm_apic_id(struct kvm_lapic *apic) { - return (kvm_lapic_get_reg(apic, APIC_ID) >> 24) & 0xff; + /* To avoid a race between apic_base and following APIC_ID update when + * switching to x2apic_mode, the x2apic mode returns initial x2apic id. + */ + if (apic_x2apic_mode(apic)) + return apic->vcpu->vcpu_id; + + return kvm_lapic_get_reg(apic, APIC_ID) >> 24; } bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0c1fbb8d9d11..b6e402d16e0c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2779,15 +2779,17 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, if (vcpu->arch.apicv_active) kvm_x86_ops->sync_pir_to_irr(vcpu); - memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); - - return 0; + return kvm_apic_get_state(vcpu, s); } static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - kvm_apic_post_state_restore(vcpu, s); + int r; + + r = kvm_apic_set_state(vcpu, s); + if (r) + return r; update_cr8_intercept(vcpu); return 0; From 49bd29ba1dbd57b029f69cd9afb335a8f564f32f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Tue, 12 Jul 2016 22:09:23 +0200 Subject: [PATCH 245/302] KVM: x86: reset APIC ID when enabling LAPIC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit APIC ID should be set to the initial APIC ID when enabling LAPIC. This only matters if the guest changes APIC ID. No sane OS does that. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 41089dbeeafc..0fce77fdbe91 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1720,9 +1720,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) /* update jump label if enable bit changes */ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) { - if (value & MSR_IA32_APICBASE_ENABLE) + if (value & MSR_IA32_APICBASE_ENABLE) { + kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); static_key_slow_dec_deferred(&apic_hw_disabled); - else + } else static_key_slow_inc(&apic_hw_disabled.key); recalculate_apic_map(vcpu->kvm); } From c93de59dcd9bef0044e615493ab52d3958243d87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Tue, 12 Jul 2016 22:09:24 +0200 Subject: [PATCH 246/302] KVM: VMX: optimize APIC ID read with APICv MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The register is in hardware-compatible format now, so there is not need to intercept. Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 729e5f689097..7bdd6b1a2373 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6461,9 +6461,6 @@ static __init int hardware_setup(void) for (msr = 0x800; msr <= 0x8ff; msr++) vmx_disable_intercept_msr_read_x2apic(msr); - /* According SDM, in x2apic mode, the whole id reg is used. But in - * KVM, it only use the highest eight bits. Need to intercept it */ - vmx_enable_intercept_msr_read_x2apic(0x802); /* TMCCT */ vmx_enable_intercept_msr_read_x2apic(0x839); /* TPR */ From 4d8e772bf8e3fcf55fe17e84ce68c20e03041efc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Tue, 12 Jul 2016 22:09:25 +0200 Subject: [PATCH 247/302] KVM: x86: reset lapic base in kvm_lapic_reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LAPIC is reset in xAPIC mode and the surrounding code expects that. KVM never resets after initialization. This patch is just for sanity. Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0fce77fdbe91..3c2a8c113054 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1763,8 +1763,11 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) /* Stop the timer in case it's a reset to an active apic */ hrtimer_cancel(&apic->lapic_timer.timer); - if (!init_event) + if (!init_event) { + kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE); kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); + } kvm_apic_set_version(apic->vcpu); for (i = 0; i < KVM_APIC_LVT_NUM; i++) @@ -1903,9 +1906,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) * thinking that APIC satet has changed. */ vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; - kvm_lapic_set_base(vcpu, - APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE); - static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_lapic_reset(vcpu, false); kvm_iodevice_init(&apic->dev, &apic_mmio_ops); From c63cf538eb4bf6a5ffd3750366d8d56f023645a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Tue, 12 Jul 2016 22:09:26 +0200 Subject: [PATCH 248/302] KVM: pass struct kvm to kvm_set_routing_entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Arch-specific code will use it. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/mpic.c | 3 ++- arch/s390/kvm/interrupt.c | 3 ++- arch/x86/kvm/irq_comm.c | 3 ++- include/linux/kvm_host.h | 3 ++- virt/kvm/irqchip.c | 7 ++++--- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c index 6249cdc834d1..ed38f8114118 100644 --- a/arch/powerpc/kvm/mpic.c +++ b/arch/powerpc/kvm/mpic.c @@ -1823,7 +1823,8 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, return 0; } -int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { int r = -EINVAL; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index ca19627779db..24524c0f3ef8 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2246,7 +2246,8 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e, return ret; } -int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { int ret; diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 47ad681a33fd..889563d50c55 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -248,7 +248,8 @@ static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e, return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint); } -int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { int r = -EINVAL; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 66b2f6159aad..60d339faa94c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1011,7 +1011,8 @@ int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *entries, unsigned nr, unsigned flags); -int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue); void kvm_free_irq_routing(struct kvm *kvm); diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index 8db197bb6c7a..df99e9c3b64d 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -135,7 +135,8 @@ void kvm_free_irq_routing(struct kvm *kvm) free_irq_routing_table(rt); } -static int setup_routing_entry(struct kvm_irq_routing_table *rt, +static int setup_routing_entry(struct kvm *kvm, + struct kvm_irq_routing_table *rt, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { @@ -154,7 +155,7 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt, e->gsi = ue->gsi; e->type = ue->type; - r = kvm_set_routing_entry(e, ue); + r = kvm_set_routing_entry(kvm, e, ue); if (r) goto out; if (e->type == KVM_IRQ_ROUTING_IRQCHIP) @@ -211,7 +212,7 @@ int kvm_set_irq_routing(struct kvm *kvm, kfree(e); goto out; } - r = setup_routing_entry(new, e, ue); + r = setup_routing_entry(kvm, new, e, ue); if (r) { kfree(e); goto out; From 3713131345fbea291cbd859d248e06ed77815962 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Tue, 12 Jul 2016 22:09:27 +0200 Subject: [PATCH 249/302] KVM: x86: add KVM_CAP_X2APIC_API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM_CAP_X2APIC_API is a capability for features related to x2APIC enablement. KVM_X2APIC_API_32BIT_FORMAT feature can be enabled to extend APIC ID in get/set ioctl and MSI addresses to 32 bits. Both are needed to support x2APIC. The feature has to be enableable and disabled by default, because get/set ioctl shifted and truncated APIC ID to 8 bits by using a non-standard protocol inspired by xAPIC and the change is not backward-compatible. Changes to MSI addresses follow the format used by interrupt remapping unit. The upper address word, that used to be 0, contains upper 24 bits of the LAPIC address in its upper 24 bits. Lower 8 bits are reserved as 0. Using the upper address word is not backward-compatible either as we didn't check that userspace zeroed the word. Reserved bits are still not explicitly checked, but non-zero data will affect LAPIC addresses, which will cause a bug. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 41 +++++++++++++++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 4 ++- arch/x86/kvm/irq_comm.c | 29 ++++++++++++++++++---- arch/x86/kvm/lapic.c | 13 +++++++--- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 15 +++++++++++ include/trace/events/kvm.h | 5 ++-- include/uapi/linux/kvm.h | 3 +++ 8 files changed, 99 insertions(+), 13 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 09efa9eb3926..e34e51fa28b0 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1482,6 +1482,11 @@ struct kvm_irq_routing_msi { __u32 pad; }; +On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS +feature of KVM_CAP_X2APIC_API capability is enabled. If it is enabled, +address_hi bits 31-8 provide bits 31-8 of the destination id. Bits 7-0 of +address_hi must be zero. + struct kvm_irq_routing_s390_adapter { __u64 ind_addr; __u64 summary_addr; @@ -1583,6 +1588,17 @@ struct kvm_lapic_state { Reads the Local APIC registers and copies them into the input argument. The data format and layout are the same as documented in the architecture manual. +If KVM_X2APIC_API_USE_32BIT_IDS feature of KVM_CAP_X2APIC_API is +enabled, then the format of APIC_ID register depends on the APIC mode +(reported by MSR_IA32_APICBASE) of its VCPU. x2APIC stores APIC ID in +the APIC_ID register (bytes 32-35). xAPIC only allows an 8-bit APIC ID +which is stored in bits 31-24 of the APIC register, or equivalently in +byte 35 of struct kvm_lapic_state's regs field. KVM_GET_LAPIC must then +be called after MSR_IA32_APICBASE has been set with KVM_SET_MSR. + +If KVM_X2APIC_API_USE_32BIT_IDS feature is disabled, struct kvm_lapic_state +always uses xAPIC format. + 4.58 KVM_SET_LAPIC @@ -1600,6 +1616,10 @@ struct kvm_lapic_state { Copies the input argument into the Local APIC registers. The data format and layout are the same as documented in the architecture manual. +The format of the APIC ID register (bytes 32-35 of struct kvm_lapic_state's +regs field) depends on the state of the KVM_CAP_X2APIC_API capability. +See the note in KVM_GET_LAPIC. + 4.59 KVM_IOEVENTFD @@ -2180,6 +2200,10 @@ struct kvm_msi { No flags are defined so far. The corresponding field must be 0. +On x86, address_hi is ignored unless the KVM_CAP_X2APIC_API capability is +enabled. If it is enabled, address_hi bits 31-8 provide bits 31-8 of the +destination id. Bits 7-0 of address_hi must be zero. + 4.71 KVM_CREATE_PIT2 @@ -3811,6 +3835,23 @@ Allows use of runtime-instrumentation introduced with zEC12 processor. Will return -EINVAL if the machine does not support runtime-instrumentation. Will return -EBUSY if a VCPU has already been created. +7.7 KVM_CAP_X2APIC_API + +Architectures: x86 +Parameters: args[0] - features that should be enabled +Returns: 0 on success, -EINVAL when args[0] contains invalid features + +Valid feature flags in args[0] are + +#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) + +Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of +KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC, +allowing the use of 32-bit APIC IDs. See KVM_CAP_X2APIC_API in their +respective sections. + + + 8. Other capabilities. ---------------------- diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a2832cc3cb81..7c00ba3242d7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -782,6 +782,8 @@ struct kvm_arch { u32 ldr_mode; struct page *avic_logical_id_table_page; struct page *avic_physical_id_table_page; + + bool x2apic_format; }; struct kvm_vm_stat { @@ -1364,7 +1366,7 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); -void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, +void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq); static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 889563d50c55..25810b144b58 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -110,13 +110,17 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, return r; } -void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, +void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq) { - trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); + trace_kvm_msi_set_irq(e->msi.address_lo | (kvm->arch.x2apic_format ? + (u64)e->msi.address_hi << 32 : 0), + e->msi.data); irq->dest_id = (e->msi.address_lo & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; + if (kvm->arch.x2apic_format) + irq->dest_id |= MSI_ADDR_EXT_DEST_ID(e->msi.address_hi); irq->vector = (e->msi.data & MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT; irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; @@ -129,15 +133,24 @@ void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, } EXPORT_SYMBOL_GPL(kvm_set_msi_irq); +static inline bool kvm_msi_route_invalid(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e) +{ + return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff); +} + int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) { struct kvm_lapic_irq irq; + if (kvm_msi_route_invalid(kvm, e)) + return -EINVAL; + if (!level) return -1; - kvm_set_msi_irq(e, &irq); + kvm_set_msi_irq(kvm, e, &irq); return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL); } @@ -153,7 +166,10 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, if (unlikely(e->type != KVM_IRQ_ROUTING_MSI)) return -EWOULDBLOCK; - kvm_set_msi_irq(e, &irq); + if (kvm_msi_route_invalid(kvm, e)) + return -EINVAL; + + kvm_set_msi_irq(kvm, e, &irq); if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) return r; @@ -286,6 +302,9 @@ int kvm_set_routing_entry(struct kvm *kvm, e->msi.address_lo = ue->u.msi.address_lo; e->msi.address_hi = ue->u.msi.address_hi; e->msi.data = ue->u.msi.data; + + if (kvm_msi_route_invalid(kvm, e)) + goto out; break; case KVM_IRQ_ROUTING_HV_SINT: e->set = kvm_hv_set_sint; @@ -394,7 +413,7 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, if (entry->type != KVM_IRQ_ROUTING_MSI) continue; - kvm_set_msi_irq(entry, &irq); + kvm_set_msi_irq(vcpu->kvm, entry, &irq); if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0, irq.dest_id, irq.dest_mode)) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3c2a8c113054..d27a7829a4ce 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1991,10 +1991,15 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, if (apic_x2apic_mode(vcpu->arch.apic)) { u32 *id = (u32 *)(s->regs + APIC_ID); - if (set) - *id >>= 24; - else - *id <<= 24; + if (vcpu->kvm->arch.x2apic_format) { + if (*id != vcpu->vcpu_id) + return -EINVAL; + } else { + if (set) + *id >>= 24; + else + *id <<= 24; + } } return 0; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7bdd6b1a2373..b61cdadf8623 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11104,7 +11104,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, * We will support full lowest-priority interrupt later. */ - kvm_set_msi_irq(e, &irq); + kvm_set_msi_irq(kvm, e, &irq); if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { /* * Make sure the IRTE is in remapped mode if diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b6e402d16e0c..d86f563a6896 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -90,6 +90,8 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS) + static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); static void enter_smm(struct kvm_vcpu *vcpu); @@ -2625,6 +2627,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_TSC_CONTROL: r = kvm_has_tsc_control; break; + case KVM_CAP_X2APIC_API: + r = KVM_X2APIC_API_VALID_FLAGS; + break; default: r = 0; break; @@ -3799,6 +3804,16 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, mutex_unlock(&kvm->lock); break; } + case KVM_CAP_X2APIC_API: + r = -EINVAL; + if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS) + break; + + if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS) + kvm->arch.x2apic_format = true; + + r = 0; + break; default: r = -EINVAL; break; diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index f28292d73ddb..8ade3eb6c640 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -151,8 +151,9 @@ TRACE_EVENT(kvm_msi_set_irq, __entry->data = data; ), - TP_printk("dst %u vec %u (%s|%s|%s%s)", - (u8)(__entry->address >> 12), (u8)__entry->data, + TP_printk("dst %llx vec %u (%s|%s|%s%s)", + (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00), + (u8)__entry->data, __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode), (__entry->address & (1<<2)) ? "logical" : "physical", (__entry->data & (1<<15)) ? "level" : "edge", diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 05ebf475104c..f704403e19a0 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -866,6 +866,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_ARM_PMU_V3 126 #define KVM_CAP_VCPU_ATTRIBUTES 127 #define KVM_CAP_MAX_VCPU_ID 128 +#define KVM_CAP_X2APIC_API 129 #ifdef KVM_CAP_IRQ_ROUTING @@ -1313,4 +1314,6 @@ struct kvm_assigned_msix_entry { __u16 padding[3]; }; +#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) + #endif /* __LINUX_KVM_H */ From c519265f2aa348b2f1b9ecf8fbe20bb7c0fb102e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Tue, 12 Jul 2016 22:09:28 +0200 Subject: [PATCH 250/302] KVM: x86: add a flag to disable KVM x2apic broadcast quirk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK as a feature flag to KVM_CAP_X2APIC_API. The quirk made KVM interpret 0xff as a broadcast even in x2APIC mode. The enableable capability is needed in order to support standard x2APIC and remain backward compatible. Signed-off-by: Radim Krčmář [Expand kvm_apic_mda comment. - Paolo] Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 6 ++++ arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/lapic.c | 53 +++++++++++++++++++++++-------- arch/x86/kvm/x86.c | 5 ++- include/uapi/linux/kvm.h | 1 + 5 files changed, 52 insertions(+), 14 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index e34e51fa28b0..c4d2fb0e28de 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3844,12 +3844,18 @@ Returns: 0 on success, -EINVAL when args[0] contains invalid features Valid feature flags in args[0] are #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) +#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC, allowing the use of 32-bit APIC IDs. See KVM_CAP_X2APIC_API in their respective sections. +KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK must be enabled for x2APIC to work +in logical mode or with more than 255 VCPUs. Otherwise, KVM treats 0xff +as a broadcast even in x2APIC mode in order to support physical x2APIC +without interrupt remapping. This is undesirable in logical mode, +where 0xff represents CPUs 0-7 in cluster 0. 8. Other capabilities. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7c00ba3242d7..074b5c760327 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -784,6 +784,7 @@ struct kvm_arch { struct page *avic_physical_id_table_page; bool x2apic_format; + bool x2apic_broadcast_quirk_disabled; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d27a7829a4ce..a16e0bb95d28 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -616,17 +616,30 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) } } -/* KVM APIC implementation has two quirks - * - dest always begins at 0 while xAPIC MDA has offset 24, - * - IOxAPIC messages have to be delivered (directly) to x2APIC. +/* The KVM local APIC implementation has two quirks: + * + * - the xAPIC MDA stores the destination at bits 24-31, while this + * is not true of struct kvm_lapic_irq's dest_id field. This is + * just a quirk in the API and is not problematic. + * + * - in-kernel IOAPIC messages have to be delivered directly to + * x2APIC, because the kernel does not support interrupt remapping. + * In order to support broadcast without interrupt remapping, x2APIC + * rewrites the destination of non-IPI messages from APIC_BROADCAST + * to X2APIC_BROADCAST. + * + * The broadcast quirk can be disabled with KVM_CAP_X2APIC_API. This is + * important when userspace wants to use x2APIC-format MSIs, because + * APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7". */ -static u32 kvm_apic_mda(unsigned int dest_id, struct kvm_lapic *source, - struct kvm_lapic *target) +static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id, + struct kvm_lapic *source, struct kvm_lapic *target) { bool ipi = source != NULL; bool x2apic_mda = apic_x2apic_mode(ipi ? source : target); - if (!ipi && dest_id == APIC_BROADCAST && x2apic_mda) + if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled && + !ipi && dest_id == APIC_BROADCAST && x2apic_mda) return X2APIC_BROADCAST; return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id); @@ -636,7 +649,7 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode) { struct kvm_lapic *target = vcpu->arch.apic; - u32 mda = kvm_apic_mda(dest, source, target); + u32 mda = kvm_apic_mda(vcpu, dest, source, target); apic_debug("target %p, source %p, dest 0x%x, " "dest_mode 0x%x, short_hand 0x%x\n", @@ -688,6 +701,25 @@ static void kvm_apic_disabled_lapic_found(struct kvm *kvm) } } +static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src, + struct kvm_lapic_irq *irq, struct kvm_apic_map *map) +{ + if (kvm->arch.x2apic_broadcast_quirk_disabled) { + if ((irq->dest_id == APIC_BROADCAST && + map->mode != KVM_APIC_MODE_X2APIC)) + return true; + if (irq->dest_id == X2APIC_BROADCAST) + return true; + } else { + bool x2apic_ipi = src && *src && apic_x2apic_mode(*src); + if (irq->dest_id == (x2apic_ipi ? + X2APIC_BROADCAST : APIC_BROADCAST)) + return true; + } + + return false; +} + /* Return true if the interrupt can be handled by using *bitmap as index mask * for valid destinations in *dst array. * Return false if kvm_apic_map_get_dest_lapic did nothing useful. @@ -701,7 +733,6 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, unsigned long *bitmap) { int i, lowest; - bool x2apic_ipi; if (irq->shorthand == APIC_DEST_SELF && src) { *dst = src; @@ -710,11 +741,7 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, } else if (irq->shorthand) return false; - x2apic_ipi = src && *src && apic_x2apic_mode(*src); - if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST)) - return false; - - if (!map) + if (!map || kvm_apic_is_broadcast_dest(kvm, src, irq, map)) return false; if (irq->dest_mode == APIC_DEST_PHYSICAL) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d86f563a6896..f0d23622bc4e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -90,7 +90,8 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU -#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS) +#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ + KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); @@ -3811,6 +3812,8 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS) kvm->arch.x2apic_format = true; + if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) + kvm->arch.x2apic_broadcast_quirk_disabled = true; r = 0; break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f704403e19a0..4f8030e5b05d 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1315,5 +1315,6 @@ struct kvm_assigned_msix_entry { }; #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) +#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) #endif /* __LINUX_KVM_H */ From 682f732ecf7396e9d6fe24d44738966699fae6c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Tue, 12 Jul 2016 22:09:29 +0200 Subject: [PATCH 251/302] KVM: x86: bump MAX_VCPUS to 288 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 288 is in high demand because of Knights Landing CPU. We cannot set the limit to 640k, because that would be wasting space. Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 074b5c760327..21a40dc7aad6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -34,7 +34,7 @@ #include #include -#define KVM_MAX_VCPUS 255 +#define KVM_MAX_VCPUS 288 #define KVM_SOFT_MAX_VCPUS 240 #define KVM_USER_MEM_SLOTS 509 /* memory slots that are not exposed to userspace */ From af1bae5497b98cb99d6b0492e6981f060420a00c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Tue, 12 Jul 2016 22:09:30 +0200 Subject: [PATCH 252/302] KVM: x86: bump KVM_MAX_VCPU_ID to 1023 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kzalloc was replaced with kvm_kvzalloc to allow non-contiguous areas and rcu had to be modified to cope with it. The practical limit for KVM_MAX_VCPU_ID right now is INT_MAX, but lower value was chosen in case there were bugs. 1023 is sufficient maximum APIC ID for 288 VCPUs. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/lapic.c | 13 ++++++++++--- arch/x86/kvm/x86.c | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 21a40dc7aad6..9fcb197aa5ce 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -36,6 +36,7 @@ #define KVM_MAX_VCPUS 288 #define KVM_SOFT_MAX_VCPUS 240 +#define KVM_MAX_VCPU_ID 1023 #define KVM_USER_MEM_SLOTS 509 /* memory slots that are not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 3 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a16e0bb95d28..6895fd28aae9 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -147,6 +147,13 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, } } +static void kvm_apic_map_free(struct rcu_head *rcu) +{ + struct kvm_apic_map *map = container_of(rcu, struct kvm_apic_map, rcu); + + kvfree(map); +} + static void recalculate_apic_map(struct kvm *kvm) { struct kvm_apic_map *new, *old = NULL; @@ -160,8 +167,8 @@ static void recalculate_apic_map(struct kvm *kvm) if (kvm_apic_present(vcpu)) max_id = max(max_id, kvm_apic_id(vcpu->arch.apic)); - new = kzalloc(sizeof(struct kvm_apic_map) + - sizeof(struct kvm_lapic *) * (max_id + 1), GFP_KERNEL); + new = kvm_kvzalloc(sizeof(struct kvm_apic_map) + + sizeof(struct kvm_lapic *) * ((u64)max_id + 1)); if (!new) goto out; @@ -206,7 +213,7 @@ static void recalculate_apic_map(struct kvm *kvm) mutex_unlock(&kvm->arch.apic_map_lock); if (old) - kfree_rcu(old, rcu); + call_rcu(&old->rcu, kvm_apic_map_free); kvm_make_scan_ioapic_request(kvm); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f0d23622bc4e..a27b33033700 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7927,7 +7927,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); kvm_free_vcpus(kvm); - kfree(rcu_dereference_check(kvm->arch.apic_map, 1)); + kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); kvm_mmu_uninit_vm(kvm); } From 40c4f8d27296428c894551c9e30d8016a2551116 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 14 Jul 2016 13:19:34 +0300 Subject: [PATCH 253/302] arm64: KVM: Clean up a condition My static checker complains that this condition looks like it should be == instead of =. This isn't a fast path, so we don't need to be fancy. Signed-off-by: Dan Carpenter Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index a57d650f552c..b0b225ceca18 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1546,7 +1546,7 @@ static void unhandled_cp_access(struct kvm_vcpu *vcpu, struct sys_reg_params *params) { u8 hsr_ec = kvm_vcpu_trap_get_class(vcpu); - int cp; + int cp = -1; switch(hsr_ec) { case ESR_ELx_EC_CP15_32: @@ -1558,7 +1558,7 @@ static void unhandled_cp_access(struct kvm_vcpu *vcpu, cp = 14; break; default: - WARN_ON((cp = -1)); + WARN_ON(1); } kvm_err("Unsupported guest CP%d access at: %08lx\n", From 6502a34cfd6695929086187f63fe670cc3050e68 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 21 Jun 2016 14:19:51 +0200 Subject: [PATCH 254/302] KVM: s390: allow user space to handle instr 0x0000 We will use illegal instruction 0x0000 for handling 2 byte sw breakpoints from user space. As it can be enabled dynamically via a capability, let's move setting of ICTL_OPEREXC to the post creation step, so we avoid any races when enabling that capability just while adding new cpus. Acked-by: Janosch Frank Reviewed-by: Cornelia Huck Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- Documentation/virtual/kvm/api.txt | 13 +++++++++++++ arch/s390/include/asm/kvm_host.h | 2 ++ arch/s390/kvm/intercept.c | 3 +++ arch/s390/kvm/kvm-s390.c | 26 ++++++++++++++++++++++++-- include/uapi/linux/kvm.h | 1 + 5 files changed, 43 insertions(+), 2 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index c4d2fb0e28de..299306db5d84 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3857,6 +3857,19 @@ as a broadcast even in x2APIC mode in order to support physical x2APIC without interrupt remapping. This is undesirable in logical mode, where 0xff represents CPUs 0-7 in cluster 0. +7.8 KVM_CAP_S390_USER_INSTR0 + +Architectures: s390 +Parameters: none + +With this capability enabled, all illegal instructions 0x0000 (2 bytes) will +be intercepted and forwarded to user space. User space can use this +mechanism e.g. to realize 2-byte software breakpoints. The kernel will +not inject an operating exception for these instructions, user space has +to take care of that. + +This capability can be enabled dynamically even if VCPUs were already +created and are running. 8. Other capabilities. ---------------------- diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 946fc86202fd..183b01727de4 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -43,6 +43,7 @@ /* s390-specific vcpu->requests bit members */ #define KVM_REQ_ENABLE_IBS 8 #define KVM_REQ_DISABLE_IBS 9 +#define KVM_REQ_ICPT_OPEREXC 10 #define SIGP_CTRL_C 0x80 #define SIGP_CTRL_SCN_MASK 0x3f @@ -666,6 +667,7 @@ struct kvm_arch{ int user_cpu_state_ctrl; int user_sigp; int user_stsi; + int user_instr0; struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS]; wait_queue_head_t ipte_wq; int ipte_lock_count; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 850be47c4cc9..7a2f1551bc39 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -359,6 +359,9 @@ static int handle_operexc(struct kvm_vcpu *vcpu) test_kvm_facility(vcpu->kvm, 74)) return handle_sthyi(vcpu); + if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0) + return -EOPNOTSUPP; + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d42428c11794..63ac7c1641a7 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -364,6 +364,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_USER_STSI: case KVM_CAP_S390_SKEYS: case KVM_CAP_S390_IRQ_STATE: + case KVM_CAP_S390_USER_INSTR0: r = 1; break; case KVM_CAP_S390_MEM_OP: @@ -456,6 +457,16 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, return r; } +static void icpt_operexc_on_all_vcpus(struct kvm *kvm) +{ + unsigned int i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + kvm_s390_sync_request(KVM_REQ_ICPT_OPEREXC, vcpu); + } +} + static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { int r; @@ -507,6 +518,12 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) kvm->arch.user_stsi = 1; r = 0; break; + case KVM_CAP_S390_USER_INSTR0: + VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_INSTR0"); + kvm->arch.user_instr0 = 1; + icpt_operexc_on_all_vcpus(kvm); + r = 0; + break; default: r = -EINVAL; break; @@ -1836,6 +1853,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) vcpu->arch.gmap = vcpu->kvm->arch.gmap; sca_add_vcpu(vcpu); } + if (test_kvm_facility(vcpu->kvm, 74) || vcpu->kvm->arch.user_instr0) + vcpu->arch.sie_block->ictl |= ICTL_OPEREXC; /* make vcpu_load load the right gmap on the first trigger */ vcpu->arch.enabled_gmap = vcpu->arch.gmap; } @@ -1923,8 +1942,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) } vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb; vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; - if (test_kvm_facility(vcpu->kvm, 74)) - vcpu->arch.sie_block->ictl |= ICTL_OPEREXC; if (vcpu->kvm->arch.use_cmma) { rc = kvm_s390_vcpu_setup_cmma(vcpu); @@ -2369,6 +2386,11 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) goto retry; } + if (kvm_check_request(KVM_REQ_ICPT_OPEREXC, vcpu)) { + vcpu->arch.sie_block->ictl |= ICTL_OPEREXC; + goto retry; + } + /* nothing to do, just clear the request */ clear_bit(KVM_REQ_UNHALT, &vcpu->requests); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 4f8030e5b05d..70941f4ab6d8 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -867,6 +867,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_VCPU_ATTRIBUTES 127 #define KVM_CAP_MAX_VCPU_ID 128 #define KVM_CAP_X2APIC_API 129 +#define KVM_CAP_S390_USER_INSTR0 130 #ifdef KVM_CAP_IRQ_ROUTING From 9acc317b183fdd3ed3bca218271875c0e808daae Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 18 Jul 2016 09:18:13 +0200 Subject: [PATCH 255/302] KVM: s390: let ptff intercepts result in cc=3 We don't emulate ptff subfunctions, therefore react on any attempt of execution by setting cc=3 (Requested function not available). Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/priv.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index c77ad2dc334f..46160388e996 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -1185,7 +1185,15 @@ static int handle_sckpf(struct kvm_vcpu *vcpu) return 0; } +static int handle_ptff(struct kvm_vcpu *vcpu) +{ + /* we don't emulate any control instructions yet */ + kvm_s390_set_psw_cc(vcpu, 3); + return 0; +} + static const intercept_handler_t x01_handlers[256] = { + [0x04] = handle_ptff, [0x07] = handle_sckpf, }; From 8f6cdc1c2eec20c3bbf3a83ad0e1db165f709917 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 15 Jul 2016 12:43:22 +0100 Subject: [PATCH 256/302] KVM: arm/arm64: vgic: Move redistributor kvm_io_devices Logically a GICv3 redistributor is assigned to a (v)CPU, so we should aim to keep redistributor related variables out of our struct vgic_dist. Let's start by replacing the redistributor related kvm_io_device array with two members in our existing struct vgic_cpu, which are naturally per-VCPU and thus don't require any allocation / freeing. So apart from the better fit with the redistributor design this saves some code as well. Signed-off-by: Andre Przywara Reviewed-by: Eric Auger Reviewed-by: Marc Zyngier Tested-by: Eric Auger Signed-off-by: Marc Zyngier --- include/kvm/arm_vgic.h | 8 +++++++- virt/kvm/arm/vgic/vgic-init.c | 1 - virt/kvm/arm/vgic/vgic-mmio-v3.c | 22 ++++++++-------------- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 12640378db98..5142e2ab9f5e 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -145,7 +145,6 @@ struct vgic_dist { struct vgic_irq *spis; struct vgic_io_device dist_iodev; - struct vgic_io_device *redist_iodevs; }; struct vgic_v2_cpu_if { @@ -193,6 +192,13 @@ struct vgic_cpu { struct list_head ap_list_head; u64 live_lrs; + + /* + * Members below are used with GICv3 emulation only and represent + * parts of the redistributor. + */ + struct vgic_io_device rd_iodev; + struct vgic_io_device sgi_iodev; }; int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write); diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c index a1442f7c9c4d..90cae489c34c 100644 --- a/virt/kvm/arm/vgic/vgic-init.c +++ b/virt/kvm/arm/vgic/vgic-init.c @@ -271,7 +271,6 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm) dist->initialized = false; kfree(dist->spis); - kfree(dist->redist_iodevs); dist->nr_spis = 0; mutex_unlock(&kvm->lock); diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index a0c515a412a7..fc7b6c97acbb 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -285,21 +285,14 @@ unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev) int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address) { - int nr_vcpus = atomic_read(&kvm->online_vcpus); struct kvm_vcpu *vcpu; - struct vgic_io_device *devices; int c, ret = 0; - devices = kmalloc(sizeof(struct vgic_io_device) * nr_vcpus * 2, - GFP_KERNEL); - if (!devices) - return -ENOMEM; - kvm_for_each_vcpu(c, vcpu, kvm) { gpa_t rd_base = redist_base_address + c * SZ_64K * 2; gpa_t sgi_base = rd_base + SZ_64K; - struct vgic_io_device *rd_dev = &devices[c * 2]; - struct vgic_io_device *sgi_dev = &devices[c * 2 + 1]; + struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev; + struct vgic_io_device *sgi_dev = &vcpu->arch.vgic_cpu.sgi_iodev; kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops); rd_dev->base_addr = rd_base; @@ -335,14 +328,15 @@ int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address) if (ret) { /* The current c failed, so we start with the previous one. */ for (c--; c >= 0; c--) { + struct vgic_cpu *vgic_cpu; + + vcpu = kvm_get_vcpu(kvm, c); + vgic_cpu = &vcpu->arch.vgic_cpu; kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, - &devices[c * 2].dev); + &vgic_cpu->rd_iodev.dev); kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, - &devices[c * 2 + 1].dev); + &vgic_cpu->sgi_iodev.dev); } - kfree(devices); - } else { - kvm->arch.vgic.redist_iodevs = devices; } return ret; From 42c8870f90098796c2ed7c9eaa3e7526407502a8 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 15 Jul 2016 12:43:23 +0100 Subject: [PATCH 257/302] KVM: arm/arm64: vgic: Check return value for kvm_register_vgic_device kvm_register_device_ops() can return an error, so lets check its return value and propagate this up the call chain. Signed-off-by: Andre Przywara Reviewed-by: Marc Zyngier Tested-by: Eric Auger Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-kvm-device.c | 15 +++++++++------ virt/kvm/arm/vgic/vgic-v2.c | 11 ++++++++--- virt/kvm/arm/vgic/vgic-v3.c | 15 +++++++++++++-- virt/kvm/arm/vgic/vgic.h | 2 +- 4 files changed, 31 insertions(+), 12 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c index 0130c4b147b7..2f24f13c6c90 100644 --- a/virt/kvm/arm/vgic/vgic-kvm-device.c +++ b/virt/kvm/arm/vgic/vgic-kvm-device.c @@ -210,20 +210,24 @@ static void vgic_destroy(struct kvm_device *dev) kfree(dev); } -void kvm_register_vgic_device(unsigned long type) +int kvm_register_vgic_device(unsigned long type) { + int ret = -ENODEV; + switch (type) { case KVM_DEV_TYPE_ARM_VGIC_V2: - kvm_register_device_ops(&kvm_arm_vgic_v2_ops, - KVM_DEV_TYPE_ARM_VGIC_V2); + ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops, + KVM_DEV_TYPE_ARM_VGIC_V2); break; #ifdef CONFIG_KVM_ARM_VGIC_V3 case KVM_DEV_TYPE_ARM_VGIC_V3: - kvm_register_device_ops(&kvm_arm_vgic_v3_ops, - KVM_DEV_TYPE_ARM_VGIC_V3); + ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops, + KVM_DEV_TYPE_ARM_VGIC_V3); break; #endif } + + return ret; } /** vgic_attr_regs_access: allows user space to read/write VGIC registers @@ -428,4 +432,3 @@ struct kvm_device_ops kvm_arm_vgic_v3_ops = { }; #endif /* CONFIG_KVM_ARM_VGIC_V3 */ - diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index e31405ee5515..079bf670c451 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -332,20 +332,25 @@ int vgic_v2_probe(const struct gic_kvm_info *info) vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR); kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1; + ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); + if (ret) { + kvm_err("Cannot register GICv2 KVM device\n"); + iounmap(kvm_vgic_global_state.vctrl_base); + return ret; + } + ret = create_hyp_io_mappings(kvm_vgic_global_state.vctrl_base, kvm_vgic_global_state.vctrl_base + resource_size(&info->vctrl), info->vctrl.start); - if (ret) { kvm_err("Cannot map VCTRL into hyp\n"); + kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2); iounmap(kvm_vgic_global_state.vctrl_base); return ret; } kvm_vgic_global_state.can_emulate_gicv2 = true; - kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); - kvm_vgic_global_state.vcpu_base = info->vcpu.start; kvm_vgic_global_state.type = VGIC_V2; kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS; diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index 346b4ad12b49..e48a22e9ee40 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -296,6 +296,7 @@ int vgic_v3_map_resources(struct kvm *kvm) int vgic_v3_probe(const struct gic_kvm_info *info) { u32 ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2); + int ret; /* * The ListRegs field is 5 bits, but there is a architectural @@ -319,12 +320,22 @@ int vgic_v3_probe(const struct gic_kvm_info *info) } else { kvm_vgic_global_state.vcpu_base = info->vcpu.start; kvm_vgic_global_state.can_emulate_gicv2 = true; - kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); + ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2); + if (ret) { + kvm_err("Cannot register GICv2 KVM device.\n"); + return ret; + } kvm_info("vgic-v2@%llx\n", info->vcpu.start); } + ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3); + if (ret) { + kvm_err("Cannot register GICv3 KVM device.\n"); + kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2); + return ret; + } + if (kvm_vgic_global_state.vcpu_base == 0) kvm_info("disabling GICv2 emulation\n"); - kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3); kvm_vgic_global_state.vctrl_base = NULL; kvm_vgic_global_state.type = VGIC_V3; diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 7b300ca370b7..c752152e8248 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -124,7 +124,7 @@ static inline int vgic_register_redist_iodevs(struct kvm *kvm, } #endif -void kvm_register_vgic_device(unsigned long type); +int kvm_register_vgic_device(unsigned long type); int vgic_lazy_init(struct kvm *kvm); int vgic_init(struct kvm *kvm); From 2b8ddd9337ee0d001b22507f95596648a1a90992 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 15 Jul 2016 12:43:24 +0100 Subject: [PATCH 258/302] KVM: Extend struct kvm_msi to hold a 32-bit device ID The ARM GICv3 ITS MSI controller requires a device ID to be able to assign the proper interrupt vector. On real hardware, this ID is sampled from the bus. To be able to emulate an ITS controller, extend the KVM MSI interface to let userspace provide such a device ID. For PCI devices, the device ID is simply the 16-bit bus-device-function triplet, which should be easily available to the userland tool. Also there is a new KVM capability which advertises whether the current VM requires a device ID to be set along with the MSI data. This flag is still reported as not available everywhere, later we will enable it when ITS emulation is used. Signed-off-by: Andre Przywara Reviewed-by: Eric Auger Reviewed-by: Marc Zyngier Acked-by: Christoffer Dall Acked-by: Paolo Bonzini Tested-by: Eric Auger Signed-off-by: Marc Zyngier --- Documentation/virtual/kvm/api.txt | 12 ++++++++++-- include/uapi/linux/kvm.h | 5 ++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 09efa9eb3926..65513119fee8 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2175,10 +2175,18 @@ struct kvm_msi { __u32 address_hi; __u32 data; __u32 flags; - __u8 pad[16]; + __u32 devid; + __u8 pad[12]; }; -No flags are defined so far. The corresponding field must be 0. +flags: KVM_MSI_VALID_DEVID: devid contains a valid value +devid: If KVM_MSI_VALID_DEVID is set, contains a unique device identifier + for the device that wrote the MSI message. + For PCI, this is usually a BFD identifier in the lower 16 bits. + +The per-VM KVM_CAP_MSI_DEVID capability advertises the need to provide +the device ID. If this capability is not set, userland cannot rely on +the kernel to allow the KVM_MSI_VALID_DEVID flag being set. 4.71 KVM_CREATE_PIT2 diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 05ebf475104c..7de96f5bb92c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -866,6 +866,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_ARM_PMU_V3 126 #define KVM_CAP_VCPU_ATTRIBUTES 127 #define KVM_CAP_MAX_VCPU_ID 128 +#define KVM_CAP_MSI_DEVID 129 #ifdef KVM_CAP_IRQ_ROUTING @@ -1024,12 +1025,14 @@ struct kvm_one_reg { __u64 addr; }; +#define KVM_MSI_VALID_DEVID (1U << 0) struct kvm_msi { __u32 address_lo; __u32 address_hi; __u32 data; __u32 flags; - __u8 pad[16]; + __u32 devid; + __u8 pad[12]; }; struct kvm_arm_device_addr { From b46f01ce4dcfdce636588fe2ef5035724c77f266 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 15 Jul 2016 12:43:25 +0100 Subject: [PATCH 259/302] KVM: arm/arm64: Extend arch CAP checks to allow per-VM capabilities KVM capabilities can be a per-VM property, though ARM/ARM64 currently does not pass on the VM pointer to the architecture specific capability handlers. Add a "struct kvm*" parameter to those function to later allow proper per-VM capability reporting. Signed-off-by: Andre Przywara Reviewed-by: Eric Auger Reviewed-by: Marc Zyngier Acked-by: Christoffer Dall Tested-by: Eric Auger Signed-off-by: Marc Zyngier --- arch/arm/include/asm/kvm_host.h | 2 +- arch/arm/kvm/arm.c | 2 +- arch/arm64/include/asm/kvm_host.h | 2 +- arch/arm64/kvm/reset.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 58d0b69e7428..de338d93d11b 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -272,7 +272,7 @@ static inline void __cpu_reset_hyp_mode(unsigned long vector_ptr, kvm_call_hyp((void *)virt_to_idmap(__kvm_hyp_reset), vector_ptr); } -static inline int kvm_arch_dev_ioctl_check_extension(long ext) +static inline int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) { return 0; } diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 7cf266c502d6..972075cc111c 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -201,7 +201,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_MAX_VCPUS; break; default: - r = kvm_arch_dev_ioctl_check_extension(ext); + r = kvm_arch_dev_ioctl_check_extension(kvm, ext); break; } return r; diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 69d5cc2d2e17..3eda975837d0 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -47,7 +47,7 @@ int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); -int kvm_arch_dev_ioctl_check_extension(long ext); +int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext); void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start); struct kvm_arch { diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 79f324823340..e95d4f68bf54 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -65,7 +65,7 @@ static bool cpu_has_32bit_el1(void) * We currently assume that the number of HW registers is uniform * across all CPUs (see cpuinfo_sanity_check). */ -int kvm_arch_dev_ioctl_check_extension(long ext) +int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) { int r; From 8a39d00670f0792c1186e442e1dd28fe0326f2ee Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 15 Jul 2016 12:43:26 +0100 Subject: [PATCH 260/302] KVM: kvm_io_bus: Add kvm_io_bus_get_dev() call The kvm_io_bus framework is a nice place of holding information about various MMIO regions for kernel emulated devices. Add a call to retrieve the kvm_io_device structure which is associated with a certain MMIO address. This avoids to duplicate kvm_io_bus' knowledge of MMIO regions without having to fake MMIO calls if a user needs the device a certain MMIO address belongs to. This will be used by the ITS emulation to get the associated ITS device when someone triggers an MSI via an ioctl from userspace. Signed-off-by: Andre Przywara Reviewed-by: Eric Auger Reviewed-by: Marc Zyngier Acked-by: Christoffer Dall Acked-by: Paolo Bonzini Tested-by: Eric Auger Signed-off-by: Marc Zyngier --- include/linux/kvm_host.h | 2 ++ virt/kvm/kvm_main.c | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0640ee92b978..614a98137c5f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -164,6 +164,8 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, struct kvm_io_device *dev); int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, struct kvm_io_device *dev); +struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, + gpa_t addr); #ifdef CONFIG_KVM_ASYNC_PF struct kvm_async_pf { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ef54b4c31792..bd2eb92c5d0e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3496,6 +3496,30 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, return r; } +struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, + gpa_t addr) +{ + struct kvm_io_bus *bus; + int dev_idx, srcu_idx; + struct kvm_io_device *iodev = NULL; + + srcu_idx = srcu_read_lock(&kvm->srcu); + + bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); + + dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1); + if (dev_idx < 0) + goto out_unlock; + + iodev = bus->range[dev_idx].dev; + +out_unlock: + srcu_read_unlock(&kvm->srcu, srcu_idx); + + return iodev; +} +EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev); + static struct notifier_block kvm_cpu_notifier = { .notifier_call = kvm_cpu_hotplug, }; From 5dd4b924e390af426e424d5e52c1b4d1566af817 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 15 Jul 2016 12:43:27 +0100 Subject: [PATCH 261/302] KVM: arm/arm64: vgic: Add refcounting for IRQs In the moment our struct vgic_irq's are statically allocated at guest creation time. So getting a pointer to an IRQ structure is trivial and safe. LPIs are more dynamic, they can be mapped and unmapped at any time during the guest's _runtime_. In preparation for supporting LPIs we introduce reference counting for those structures using the kernel's kref infrastructure. Since private IRQs and SPIs are statically allocated, we avoid actually refcounting them, since they would never be released anyway. But we take provisions to increase the refcount when an IRQ gets onto a VCPU list and decrease it when it gets removed. Also this introduces vgic_put_irq(), which wraps kref_put and hides the release function from the callers. Signed-off-by: Andre Przywara Reviewed-by: Marc Zyngier Tested-by: Eric Auger Signed-off-by: Marc Zyngier --- include/kvm/arm_vgic.h | 1 + virt/kvm/arm/vgic/vgic-init.c | 2 ++ virt/kvm/arm/vgic/vgic-mmio-v2.c | 8 +++++ virt/kvm/arm/vgic/vgic-mmio-v3.c | 20 +++++++----- virt/kvm/arm/vgic/vgic-mmio.c | 25 ++++++++++++++- virt/kvm/arm/vgic/vgic-v2.c | 1 + virt/kvm/arm/vgic/vgic-v3.c | 1 + virt/kvm/arm/vgic/vgic.c | 52 +++++++++++++++++++++++++++++--- virt/kvm/arm/vgic/vgic.h | 1 + 9 files changed, 99 insertions(+), 12 deletions(-) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 5142e2ab9f5e..450b4dab9a9f 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -96,6 +96,7 @@ struct vgic_irq { bool active; /* not used for LPIs */ bool enabled; bool hw; /* Tied to HW IRQ */ + struct kref refcount; /* Used for LPIs */ u32 hwintid; /* HW INTID number */ union { u8 targets; /* GICv2 target VCPUs mask */ diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c index 90cae489c34c..ac3c1a5f7bf4 100644 --- a/virt/kvm/arm/vgic/vgic-init.c +++ b/virt/kvm/arm/vgic/vgic-init.c @@ -177,6 +177,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) spin_lock_init(&irq->irq_lock); irq->vcpu = NULL; irq->target_vcpu = vcpu0; + kref_init(&irq->refcount); if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) irq->targets = 0; else @@ -211,6 +212,7 @@ static void kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) irq->vcpu = NULL; irq->target_vcpu = vcpu; irq->targets = 1U << vcpu->vcpu_id; + kref_init(&irq->refcount); if (vgic_irq_is_sgi(i)) { /* SGIs */ irq->enabled = 1; diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c index a21393637e4b..4152348f5e4f 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v2.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c @@ -102,6 +102,7 @@ static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu, irq->source |= 1U << source_vcpu->vcpu_id; vgic_queue_irq_unlock(source_vcpu->kvm, irq); + vgic_put_irq(source_vcpu->kvm, irq); } } @@ -116,6 +117,8 @@ static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu, struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); val |= (u64)irq->targets << (i * 8); + + vgic_put_irq(vcpu->kvm, irq); } return val; @@ -143,6 +146,7 @@ static void vgic_mmio_write_target(struct kvm_vcpu *vcpu, irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target); spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); } } @@ -157,6 +161,8 @@ static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu, struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); val |= (u64)irq->source << (i * 8); + + vgic_put_irq(vcpu->kvm, irq); } return val; } @@ -178,6 +184,7 @@ static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu, irq->pending = false; spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); } } @@ -201,6 +208,7 @@ static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu, } else { spin_unlock(&irq->irq_lock); } + vgic_put_irq(vcpu->kvm, irq); } } diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index fc7b6c97acbb..bfcafbd8fa02 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -80,15 +80,17 @@ static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu, { int intid = VGIC_ADDR_TO_INTID(addr, 64); struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid); + unsigned long ret = 0; if (!irq) return 0; /* The upper word is RAZ for us. */ - if (addr & 4) - return 0; + if (!(addr & 4)) + ret = extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len); - return extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len); + vgic_put_irq(vcpu->kvm, irq); + return ret; } static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu, @@ -96,15 +98,17 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu, unsigned long val) { int intid = VGIC_ADDR_TO_INTID(addr, 64); - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid); - - if (!irq) - return; + struct vgic_irq *irq; /* The upper word is WI for us since we don't implement Aff3. */ if (addr & 4) return; + irq = vgic_get_irq(vcpu->kvm, NULL, intid); + + if (!irq) + return; + spin_lock(&irq->irq_lock); /* We only care about and preserve Aff0, Aff1 and Aff2. */ @@ -112,6 +116,7 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu, irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr); spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); } static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu, @@ -445,5 +450,6 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg) irq->pending = true; vgic_queue_irq_unlock(vcpu->kvm, irq); + vgic_put_irq(vcpu->kvm, irq); } } diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index 9f6fab74dce7..5e79e0137cb6 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -56,6 +56,8 @@ unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu, if (irq->enabled) value |= (1U << i); + + vgic_put_irq(vcpu->kvm, irq); } return value; @@ -74,6 +76,8 @@ void vgic_mmio_write_senable(struct kvm_vcpu *vcpu, spin_lock(&irq->irq_lock); irq->enabled = true; vgic_queue_irq_unlock(vcpu->kvm, irq); + + vgic_put_irq(vcpu->kvm, irq); } } @@ -92,6 +96,7 @@ void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu, irq->enabled = false; spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); } } @@ -108,6 +113,8 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, if (irq->pending) value |= (1U << i); + + vgic_put_irq(vcpu->kvm, irq); } return value; @@ -129,6 +136,7 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, irq->soft_pending = true; vgic_queue_irq_unlock(vcpu->kvm, irq); + vgic_put_irq(vcpu->kvm, irq); } } @@ -152,6 +160,7 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, } spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); } } @@ -168,6 +177,8 @@ unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, if (irq->active) value |= (1U << i); + + vgic_put_irq(vcpu->kvm, irq); } return value; @@ -242,6 +253,7 @@ void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu, for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); vgic_mmio_change_active(vcpu, irq, false); + vgic_put_irq(vcpu->kvm, irq); } vgic_change_active_finish(vcpu, intid); } @@ -257,6 +269,7 @@ void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu, for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); vgic_mmio_change_active(vcpu, irq, true); + vgic_put_irq(vcpu->kvm, irq); } vgic_change_active_finish(vcpu, intid); } @@ -272,6 +285,8 @@ unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu, struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); val |= (u64)irq->priority << (i * 8); + + vgic_put_irq(vcpu->kvm, irq); } return val; @@ -298,6 +313,8 @@ void vgic_mmio_write_priority(struct kvm_vcpu *vcpu, /* Narrow the priority range to what we actually support */ irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS); spin_unlock(&irq->irq_lock); + + vgic_put_irq(vcpu->kvm, irq); } } @@ -313,6 +330,8 @@ unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu, if (irq->config == VGIC_CONFIG_EDGE) value |= (2U << (i * 2)); + + vgic_put_irq(vcpu->kvm, irq); } return value; @@ -326,7 +345,7 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu, int i; for (i = 0; i < len * 4; i++) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); + struct vgic_irq *irq; /* * The configuration cannot be changed for SGIs in general, @@ -337,14 +356,18 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu, if (intid + i < VGIC_NR_PRIVATE_IRQS) continue; + irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); spin_lock(&irq->irq_lock); + if (test_bit(i * 2 + 1, &val)) { irq->config = VGIC_CONFIG_EDGE; } else { irq->config = VGIC_CONFIG_LEVEL; irq->pending = irq->line_level | irq->soft_pending; } + spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); } } diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index 079bf670c451..0bf6709d1006 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -124,6 +124,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) } spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); } } diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index e48a22e9ee40..f0ac0642303c 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -113,6 +113,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) } spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); } } diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 69b61abefa19..fb19a554d090 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -64,6 +64,28 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, return NULL; } +static void vgic_get_irq_kref(struct vgic_irq *irq) +{ + if (irq->intid < VGIC_MIN_LPI) + return; + + kref_get(&irq->refcount); +} + +/* The refcount should never drop to 0 at the moment. */ +static void vgic_irq_release(struct kref *ref) +{ + WARN_ON(1); +} + +void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) +{ + if (irq->intid < VGIC_MIN_LPI) + return; + + kref_put(&irq->refcount, vgic_irq_release); +} + /** * kvm_vgic_target_oracle - compute the target vcpu for an irq * @@ -236,6 +258,11 @@ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq) goto retry; } + /* + * Grab a reference to the irq to reflect the fact that it is + * now in the ap_list. + */ + vgic_get_irq_kref(irq); list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head); irq->vcpu = vcpu; @@ -269,14 +296,17 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid, if (!irq) return -EINVAL; - if (irq->hw != mapped_irq) + if (irq->hw != mapped_irq) { + vgic_put_irq(kvm, irq); return -EINVAL; + } spin_lock(&irq->irq_lock); if (!vgic_validate_injection(irq, level)) { /* Nothing to see here, move along... */ spin_unlock(&irq->irq_lock); + vgic_put_irq(kvm, irq); return 0; } @@ -288,6 +318,7 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid, } vgic_queue_irq_unlock(kvm, irq); + vgic_put_irq(kvm, irq); return 0; } @@ -330,25 +361,28 @@ int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq) irq->hwintid = phys_irq; spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); return 0; } int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq); - - BUG_ON(!irq); + struct vgic_irq *irq; if (!vgic_initialized(vcpu->kvm)) return -EAGAIN; + irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq); + BUG_ON(!irq); + spin_lock(&irq->irq_lock); irq->hw = false; irq->hwintid = 0; spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); return 0; } @@ -386,6 +420,15 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu) list_del(&irq->ap_list); irq->vcpu = NULL; spin_unlock(&irq->irq_lock); + + /* + * This vgic_put_irq call matches the + * vgic_get_irq_kref in vgic_queue_irq_unlock, + * where we added the LPI to the ap_list. As + * we remove the irq from the list, we drop + * also drop the refcount. + */ + vgic_put_irq(vcpu->kvm, irq); continue; } @@ -614,6 +657,7 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq) spin_lock(&irq->irq_lock); map_is_active = irq->hw && irq->active; spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); return map_is_active; } diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index c752152e8248..5b79c340f17e 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -38,6 +38,7 @@ struct vgic_vmcr { struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 intid); +void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq); void vgic_kick_vcpus(struct kvm *kvm); From 645b9e49a8c053182aae0765d797f557f7a67eda Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 15 Jul 2016 12:43:28 +0100 Subject: [PATCH 262/302] irqchip/gic-v3: Refactor and add GICv3 definitions arm-gic-v3.h contains bit and register definitions for the GICv3 and ITS, at least for the bits the we currently care about. The ITS emulation needs more definitions, so add them and refactor the memory attribute #defines to be more universally usable. To avoid changing all users, we still provide some of the old definitons defined with the help of the new macros. Signed-off-by: Andre Przywara Reviewed-by: Marc Zyngier Tested-by: Eric Auger Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v3.h | 176 +++++++++++++++++++---------- 1 file changed, 118 insertions(+), 58 deletions(-) diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index bfbd707de390..9442be7f2461 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -112,34 +112,60 @@ #define GICR_WAKER_ProcessorSleep (1U << 1) #define GICR_WAKER_ChildrenAsleep (1U << 2) -#define GICR_PROPBASER_NonShareable (0U << 10) -#define GICR_PROPBASER_InnerShareable (1U << 10) -#define GICR_PROPBASER_OuterShareable (2U << 10) -#define GICR_PROPBASER_SHAREABILITY_MASK (3UL << 10) -#define GICR_PROPBASER_nCnB (0U << 7) -#define GICR_PROPBASER_nC (1U << 7) -#define GICR_PROPBASER_RaWt (2U << 7) -#define GICR_PROPBASER_RaWb (3U << 7) -#define GICR_PROPBASER_WaWt (4U << 7) -#define GICR_PROPBASER_WaWb (5U << 7) -#define GICR_PROPBASER_RaWaWt (6U << 7) -#define GICR_PROPBASER_RaWaWb (7U << 7) -#define GICR_PROPBASER_CACHEABILITY_MASK (7U << 7) -#define GICR_PROPBASER_IDBITS_MASK (0x1f) +#define GIC_BASER_CACHE_nCnB 0ULL +#define GIC_BASER_CACHE_SameAsInner 0ULL +#define GIC_BASER_CACHE_nC 1ULL +#define GIC_BASER_CACHE_RaWt 2ULL +#define GIC_BASER_CACHE_RaWb 3ULL +#define GIC_BASER_CACHE_WaWt 4ULL +#define GIC_BASER_CACHE_WaWb 5ULL +#define GIC_BASER_CACHE_RaWaWt 6ULL +#define GIC_BASER_CACHE_RaWaWb 7ULL +#define GIC_BASER_CACHE_MASK 7ULL +#define GIC_BASER_NonShareable 0ULL +#define GIC_BASER_InnerShareable 1ULL +#define GIC_BASER_OuterShareable 2ULL +#define GIC_BASER_SHAREABILITY_MASK 3ULL -#define GICR_PENDBASER_NonShareable (0U << 10) -#define GICR_PENDBASER_InnerShareable (1U << 10) -#define GICR_PENDBASER_OuterShareable (2U << 10) -#define GICR_PENDBASER_SHAREABILITY_MASK (3UL << 10) -#define GICR_PENDBASER_nCnB (0U << 7) -#define GICR_PENDBASER_nC (1U << 7) -#define GICR_PENDBASER_RaWt (2U << 7) -#define GICR_PENDBASER_RaWb (3U << 7) -#define GICR_PENDBASER_WaWt (4U << 7) -#define GICR_PENDBASER_WaWb (5U << 7) -#define GICR_PENDBASER_RaWaWt (6U << 7) -#define GICR_PENDBASER_RaWaWb (7U << 7) -#define GICR_PENDBASER_CACHEABILITY_MASK (7U << 7) +#define GIC_BASER_CACHEABILITY(reg, inner_outer, type) \ + (GIC_BASER_CACHE_##type << reg##_##inner_outer##_CACHEABILITY_SHIFT) + +#define GIC_BASER_SHAREABILITY(reg, type) \ + (GIC_BASER_##type << reg##_SHAREABILITY_SHIFT) + +#define GICR_PROPBASER_SHAREABILITY_SHIFT (10) +#define GICR_PROPBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT (56) +#define GICR_PROPBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_PROPBASER, SHAREABILITY_MASK) +#define GICR_PROPBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, MASK) +#define GICR_PROPBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, MASK) +#define GICR_PROPBASER_CACHEABILITY_MASK GICR_PROPBASER_INNER_CACHEABILITY_MASK + +#define GICR_PROPBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable) +#define GICR_PROPBASER_nC GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nC) +#define GICR_PROPBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWb) +#define GICR_PROPBASER_IDBITS_MASK (0x1f) + +#define GICR_PENDBASER_SHAREABILITY_SHIFT (10) +#define GICR_PENDBASER_INNER_CACHEABILITY_SHIFT (7) +#define GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT (56) +#define GICR_PENDBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GICR_PENDBASER, SHAREABILITY_MASK) +#define GICR_PENDBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, MASK) +#define GICR_PENDBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, MASK) +#define GICR_PENDBASER_CACHEABILITY_MASK GICR_PENDBASER_INNER_CACHEABILITY_MASK + +#define GICR_PENDBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable) +#define GICR_PENDBASER_nC GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nC) +#define GICR_PENDBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWb) +#define GICR_PENDBASER_PTZ BIT_ULL(62) /* * Re-Distributor registers, offsets from SGI_base @@ -175,59 +201,74 @@ #define GITS_CWRITER 0x0088 #define GITS_CREADR 0x0090 #define GITS_BASER 0x0100 +#define GITS_IDREGS_BASE 0xffd0 +#define GITS_PIDR0 0xffe0 +#define GITS_PIDR1 0xffe4 #define GITS_PIDR2 GICR_PIDR2 +#define GITS_PIDR4 0xffd0 +#define GITS_CIDR0 0xfff0 +#define GITS_CIDR1 0xfff4 +#define GITS_CIDR2 0xfff8 +#define GITS_CIDR3 0xfffc #define GITS_TRANSLATER 0x10040 #define GITS_CTLR_ENABLE (1U << 0) #define GITS_CTLR_QUIESCENT (1U << 31) +#define GITS_TYPER_PLPIS (1UL << 0) +#define GITS_TYPER_IDBITS_SHIFT 8 #define GITS_TYPER_DEVBITS_SHIFT 13 #define GITS_TYPER_DEVBITS(r) ((((r) >> GITS_TYPER_DEVBITS_SHIFT) & 0x1f) + 1) #define GITS_TYPER_PTA (1UL << 19) +#define GITS_TYPER_HWCOLLCNT_SHIFT 24 -#define GITS_CBASER_VALID (1UL << 63) -#define GITS_CBASER_nCnB (0UL << 59) -#define GITS_CBASER_nC (1UL << 59) -#define GITS_CBASER_RaWt (2UL << 59) -#define GITS_CBASER_RaWb (3UL << 59) -#define GITS_CBASER_WaWt (4UL << 59) -#define GITS_CBASER_WaWb (5UL << 59) -#define GITS_CBASER_RaWaWt (6UL << 59) -#define GITS_CBASER_RaWaWb (7UL << 59) -#define GITS_CBASER_CACHEABILITY_MASK (7UL << 59) -#define GITS_CBASER_NonShareable (0UL << 10) -#define GITS_CBASER_InnerShareable (1UL << 10) -#define GITS_CBASER_OuterShareable (2UL << 10) -#define GITS_CBASER_SHAREABILITY_MASK (3UL << 10) +#define GITS_CBASER_VALID (1UL << 63) +#define GITS_CBASER_SHAREABILITY_SHIFT (10) +#define GITS_CBASER_INNER_CACHEABILITY_SHIFT (59) +#define GITS_CBASER_OUTER_CACHEABILITY_SHIFT (53) +#define GITS_CBASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GITS_CBASER, SHAREABILITY_MASK) +#define GITS_CBASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, MASK) +#define GITS_CBASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_CBASER, OUTER, MASK) +#define GITS_CBASER_CACHEABILITY_MASK GITS_CBASER_INNER_CACHEABILITY_MASK + +#define GITS_CBASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GITS_CBASER, InnerShareable) +#define GITS_CBASER_nC GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nC) +#define GITS_CBASER_WaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWb) #define GITS_BASER_NR_REGS 8 -#define GITS_BASER_VALID (1UL << 63) -#define GITS_BASER_nCnB (0UL << 59) -#define GITS_BASER_nC (1UL << 59) -#define GITS_BASER_RaWt (2UL << 59) -#define GITS_BASER_RaWb (3UL << 59) -#define GITS_BASER_WaWt (4UL << 59) -#define GITS_BASER_WaWb (5UL << 59) -#define GITS_BASER_RaWaWt (6UL << 59) -#define GITS_BASER_RaWaWb (7UL << 59) -#define GITS_BASER_CACHEABILITY_MASK (7UL << 59) -#define GITS_BASER_TYPE_SHIFT (56) +#define GITS_BASER_VALID (1UL << 63) +#define GITS_BASER_INDIRECT (1ULL << 62) +#define GITS_BASER_INNER_CACHEABILITY_SHIFT (59) +#define GITS_BASER_OUTER_CACHEABILITY_SHIFT (53) +#define GITS_BASER_INNER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_BASER, INNER, MASK) +#define GITS_BASER_OUTER_CACHEABILITY_MASK \ + GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, MASK) +#define GITS_BASER_SHAREABILITY_MASK \ + GIC_BASER_SHAREABILITY(GITS_BASER, SHAREABILITY_MASK) + +#define GITS_BASER_nC GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nC) +#define GITS_BASER_WaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWb) +#define GITS_BASER_TYPE_SHIFT (56) #define GITS_BASER_TYPE(r) (((r) >> GITS_BASER_TYPE_SHIFT) & 7) -#define GITS_BASER_ENTRY_SIZE_SHIFT (48) +#define GITS_BASER_ENTRY_SIZE_SHIFT (48) #define GITS_BASER_ENTRY_SIZE(r) ((((r) >> GITS_BASER_ENTRY_SIZE_SHIFT) & 0xff) + 1) -#define GITS_BASER_NonShareable (0UL << 10) -#define GITS_BASER_InnerShareable (1UL << 10) -#define GITS_BASER_OuterShareable (2UL << 10) #define GITS_BASER_SHAREABILITY_SHIFT (10) -#define GITS_BASER_SHAREABILITY_MASK (3UL << GITS_BASER_SHAREABILITY_SHIFT) +#define GITS_BASER_InnerShareable \ + GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) #define GITS_BASER_PAGE_SIZE_SHIFT (8) #define GITS_BASER_PAGE_SIZE_4K (0UL << GITS_BASER_PAGE_SIZE_SHIFT) #define GITS_BASER_PAGE_SIZE_16K (1UL << GITS_BASER_PAGE_SIZE_SHIFT) #define GITS_BASER_PAGE_SIZE_64K (2UL << GITS_BASER_PAGE_SIZE_SHIFT) #define GITS_BASER_PAGE_SIZE_MASK (3UL << GITS_BASER_PAGE_SIZE_SHIFT) #define GITS_BASER_PAGES_MAX 256 +#define GITS_BASER_NR_PAGES(r) (((r) & 0xff) + 1) #define GITS_BASER_TYPE_NONE 0 #define GITS_BASER_TYPE_DEVICE 1 @@ -243,7 +284,10 @@ */ #define GITS_CMD_MAPD 0x08 #define GITS_CMD_MAPC 0x09 -#define GITS_CMD_MAPVI 0x0a +#define GITS_CMD_MAPTI 0x0a +/* older GIC documentation used MAPVI for this command */ +#define GITS_CMD_MAPVI GITS_CMD_MAPTI +#define GITS_CMD_MAPI 0x0b #define GITS_CMD_MOVI 0x01 #define GITS_CMD_DISCARD 0x0f #define GITS_CMD_INV 0x0c @@ -253,6 +297,22 @@ #define GITS_CMD_CLEAR 0x04 #define GITS_CMD_SYNC 0x05 +/* + * ITS error numbers + */ +#define E_ITS_MOVI_UNMAPPED_INTERRUPT 0x010107 +#define E_ITS_MOVI_UNMAPPED_COLLECTION 0x010109 +#define E_ITS_CLEAR_UNMAPPED_INTERRUPT 0x010507 +#define E_ITS_MAPD_DEVICE_OOR 0x010801 +#define E_ITS_MAPC_PROCNUM_OOR 0x010902 +#define E_ITS_MAPC_COLLECTION_OOR 0x010903 +#define E_ITS_MAPTI_UNMAPPED_DEVICE 0x010a04 +#define E_ITS_MAPTI_PHYSICALID_OOR 0x010a06 +#define E_ITS_INV_UNMAPPED_INTERRUPT 0x010c07 +#define E_ITS_INVALL_UNMAPPED_COLLECTION 0x010d09 +#define E_ITS_MOVALL_PROCNUM_OOR 0x010e01 +#define E_ITS_DISCARD_UNMAPPED_INTERRUPT 0x010f07 + /* * CPU interface registers */ From 0aa1de57319c4e023187aca0d59dd593a96459a8 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 15 Jul 2016 12:43:29 +0100 Subject: [PATCH 263/302] KVM: arm64: vgic: Handle ITS related GICv3 redistributor registers In the GICv3 redistributor there are the PENDBASER and PROPBASER registers which we did not emulate so far, as they only make sense when having an ITS. In preparation for that emulate those MMIO accesses by storing the 64-bit data written into it into a variable which we later read in the ITS emulation. We also sanitise the registers, making sure RES0 regions are respected and checking for valid memory attributes. Signed-off-by: Andre Przywara Reviewed-by: Marc Zyngier Tested-by: Eric Auger Signed-off-by: Marc Zyngier --- include/kvm/arm_vgic.h | 13 +++ virt/kvm/arm/vgic/vgic-mmio-v3.c | 153 ++++++++++++++++++++++++++++++- virt/kvm/arm/vgic/vgic-mmio.h | 8 ++ virt/kvm/arm/vgic/vgic-v3.c | 11 ++- 4 files changed, 181 insertions(+), 4 deletions(-) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 450b4dab9a9f..df2dec5ef620 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -146,6 +146,14 @@ struct vgic_dist { struct vgic_irq *spis; struct vgic_io_device dist_iodev; + + /* + * Contains the attributes and gpa of the LPI configuration table. + * Since we report GICR_TYPER.CommonLPIAff as 0b00, we can share + * one address across all redistributors. + * GICv3 spec: 6.1.2 "LPI Configuration tables" + */ + u64 propbaser; }; struct vgic_v2_cpu_if { @@ -200,6 +208,11 @@ struct vgic_cpu { */ struct vgic_io_device rd_iodev; struct vgic_io_device sgi_iodev; + + /* Contains the attributes and gpa of the LPI pending tables. */ + u64 pendbaser; + + bool lpis_enabled; }; int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write); diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index bfcafbd8fa02..278bfbb36ef9 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -29,6 +29,19 @@ static unsigned long extract_bytes(unsigned long data, unsigned int offset, return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0); } +/* allows updates of any half of a 64-bit register (or the whole thing) */ +static u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len, + unsigned long val) +{ + int lower = (offset & 4) * 8; + int upper = lower + 8 * len - 1; + + reg &= ~GENMASK_ULL(upper, lower); + val &= GENMASK_ULL(len * 8 - 1, 0); + + return reg | ((u64)val << lower); +} + static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { @@ -152,6 +165,142 @@ static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu, return 0; } +/* We want to avoid outer shareable. */ +u64 vgic_sanitise_shareability(u64 field) +{ + switch (field) { + case GIC_BASER_OuterShareable: + return GIC_BASER_InnerShareable; + default: + return field; + } +} + +/* Avoid any inner non-cacheable mapping. */ +u64 vgic_sanitise_inner_cacheability(u64 field) +{ + switch (field) { + case GIC_BASER_CACHE_nCnB: + case GIC_BASER_CACHE_nC: + return GIC_BASER_CACHE_RaWb; + default: + return field; + } +} + +/* Non-cacheable or same-as-inner are OK. */ +u64 vgic_sanitise_outer_cacheability(u64 field) +{ + switch (field) { + case GIC_BASER_CACHE_SameAsInner: + case GIC_BASER_CACHE_nC: + return field; + default: + return GIC_BASER_CACHE_nC; + } +} + +u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift, + u64 (*sanitise_fn)(u64)) +{ + u64 field = (reg & field_mask) >> field_shift; + + field = sanitise_fn(field) << field_shift; + return (reg & ~field_mask) | field; +} + +#define PROPBASER_RES0_MASK \ + (GENMASK_ULL(63, 59) | GENMASK_ULL(55, 52) | GENMASK_ULL(6, 5)) +#define PENDBASER_RES0_MASK \ + (BIT_ULL(63) | GENMASK_ULL(61, 59) | GENMASK_ULL(55, 52) | \ + GENMASK_ULL(15, 12) | GENMASK_ULL(6, 0)) + +static u64 vgic_sanitise_pendbaser(u64 reg) +{ + reg = vgic_sanitise_field(reg, GICR_PENDBASER_SHAREABILITY_MASK, + GICR_PENDBASER_SHAREABILITY_SHIFT, + vgic_sanitise_shareability); + reg = vgic_sanitise_field(reg, GICR_PENDBASER_INNER_CACHEABILITY_MASK, + GICR_PENDBASER_INNER_CACHEABILITY_SHIFT, + vgic_sanitise_inner_cacheability); + reg = vgic_sanitise_field(reg, GICR_PENDBASER_OUTER_CACHEABILITY_MASK, + GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT, + vgic_sanitise_outer_cacheability); + + reg &= ~PENDBASER_RES0_MASK; + reg &= ~GENMASK_ULL(51, 48); + + return reg; +} + +static u64 vgic_sanitise_propbaser(u64 reg) +{ + reg = vgic_sanitise_field(reg, GICR_PROPBASER_SHAREABILITY_MASK, + GICR_PROPBASER_SHAREABILITY_SHIFT, + vgic_sanitise_shareability); + reg = vgic_sanitise_field(reg, GICR_PROPBASER_INNER_CACHEABILITY_MASK, + GICR_PROPBASER_INNER_CACHEABILITY_SHIFT, + vgic_sanitise_inner_cacheability); + reg = vgic_sanitise_field(reg, GICR_PROPBASER_OUTER_CACHEABILITY_MASK, + GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT, + vgic_sanitise_outer_cacheability); + + reg &= ~PROPBASER_RES0_MASK; + reg &= ~GENMASK_ULL(51, 48); + return reg; +} + +static unsigned long vgic_mmio_read_propbase(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + + return extract_bytes(dist->propbaser, addr & 7, len); +} + +static void vgic_mmio_write_propbase(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + u64 propbaser = dist->propbaser; + + /* Storing a value with LPIs already enabled is undefined */ + if (vgic_cpu->lpis_enabled) + return; + + propbaser = update_64bit_reg(propbaser, addr & 4, len, val); + propbaser = vgic_sanitise_propbaser(propbaser); + + dist->propbaser = propbaser; +} + +static unsigned long vgic_mmio_read_pendbase(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + return extract_bytes(vgic_cpu->pendbaser, addr & 7, len); +} + +static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + u64 pendbaser = vgic_cpu->pendbaser; + + /* Storing a value with LPIs already enabled is undefined */ + if (vgic_cpu->lpis_enabled) + return; + + pendbaser = update_64bit_reg(pendbaser, addr & 4, len, val); + pendbaser = vgic_sanitise_pendbaser(pendbaser); + + vgic_cpu->pendbaser = pendbaser; +} + /* * The GICv3 per-IRQ registers are split to control PPIs and SGIs in the * redistributors, while SPIs are covered by registers in the distributor @@ -232,10 +381,10 @@ static const struct vgic_register_region vgic_v3_rdbase_registers[] = { vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER, - vgic_mmio_read_raz, vgic_mmio_write_wi, 8, + vgic_mmio_read_propbase, vgic_mmio_write_propbase, 8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER, - vgic_mmio_read_raz, vgic_mmio_write_wi, 8, + vgic_mmio_read_pendbase, vgic_mmio_write_pendbase, 8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICR_IDREGS, vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48, diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h index 850901482aec..71aa39d4cfdf 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.h +++ b/virt/kvm/arm/vgic/vgic-mmio.h @@ -147,4 +147,12 @@ unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev); unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev); +#ifdef CONFIG_KVM_ARM_VGIC_V3 +u64 vgic_sanitise_outer_cacheability(u64 reg); +u64 vgic_sanitise_inner_cacheability(u64 reg); +u64 vgic_sanitise_shareability(u64 reg); +u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift, + u64 (*sanitise_fn)(u64)); +#endif + #endif diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index f0ac0642303c..6f8f31f910e7 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -191,6 +191,11 @@ void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) vmcrp->pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT; } +#define INITIAL_PENDBASER_VALUE \ + (GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb) | \ + GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner) | \ + GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable)) + void vgic_v3_enable(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3; @@ -208,10 +213,12 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu) * way, so we force SRE to 1 to demonstrate this to the guest. * This goes with the spec allowing the value to be RAO/WI. */ - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { vgic_v3->vgic_sre = ICC_SRE_EL1_SRE; - else + vcpu->arch.vgic_cpu.pendbaser = INITIAL_PENDBASER_VALUE; + } else { vgic_v3->vgic_sre = 0; + } /* Get the show on the road... */ vgic_v3->vgic_hcr = ICH_HCR_EN; From 59c5ab40989afa5aba9c4a0918a5ed910a917422 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 15 Jul 2016 12:43:30 +0100 Subject: [PATCH 264/302] KVM: arm64: vgic-its: Introduce ITS emulation file with MMIO framework The ARM GICv3 ITS emulation code goes into a separate file, but needs to be connected to the GICv3 emulation, of which it is an option. The ITS MMIO handlers require the respective ITS pointer to be passed in, so we amend the existing VGIC MMIO framework to let it cope with that. Also we introduce the basic ITS data structure and initialize it, but don't return any success yet, as we are not yet ready for the show. Signed-off-by: Andre Przywara Reviewed-by: Marc Zyngier Tested-by: Eric Auger Signed-off-by: Marc Zyngier --- include/kvm/arm_vgic.h | 22 ++++++- virt/kvm/arm/vgic/vgic-its.c | 103 +++++++++++++++++++++++++++++++ virt/kvm/arm/vgic/vgic-mmio-v3.c | 40 +++++++++++- virt/kvm/arm/vgic/vgic-mmio.c | 37 ++++++++--- virt/kvm/arm/vgic/vgic-mmio.h | 17 +++-- virt/kvm/arm/vgic/vgic.h | 7 +++ 6 files changed, 213 insertions(+), 13 deletions(-) create mode 100644 virt/kvm/arm/vgic/vgic-its.c diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index df2dec5ef620..685f33975ce4 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -108,15 +108,35 @@ struct vgic_irq { }; struct vgic_register_region; +struct vgic_its; + +enum iodev_type { + IODEV_CPUIF, + IODEV_DIST, + IODEV_REDIST, + IODEV_ITS +}; struct vgic_io_device { gpa_t base_addr; - struct kvm_vcpu *redist_vcpu; + union { + struct kvm_vcpu *redist_vcpu; + struct vgic_its *its; + }; const struct vgic_register_region *regions; + enum iodev_type iodev_type; int nr_regions; struct kvm_io_device dev; }; +struct vgic_its { + /* The base address of the ITS control register frame */ + gpa_t vgic_its_base; + + bool enabled; + struct vgic_io_device iodev; +}; + struct vgic_dist { bool in_kernel; bool ready; diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c new file mode 100644 index 000000000000..4654d6edf6a6 --- /dev/null +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -0,0 +1,103 @@ +/* + * GICv3 ITS emulation + * + * Copyright (C) 2015,2016 ARM Ltd. + * Author: Andre Przywara + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "vgic.h" +#include "vgic-mmio.h" + +#define REGISTER_ITS_DESC(off, rd, wr, length, acc) \ +{ \ + .reg_offset = off, \ + .len = length, \ + .access_flags = acc, \ + .its_read = rd, \ + .its_write = wr, \ +} + +static unsigned long its_mmio_read_raz(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + return 0; +} + +static void its_mmio_write_wi(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, unsigned long val) +{ + /* Ignore */ +} + +static struct vgic_register_region its_registers[] = { + REGISTER_ITS_DESC(GITS_CTLR, + its_mmio_read_raz, its_mmio_write_wi, 4, + VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_IIDR, + its_mmio_read_raz, its_mmio_write_wi, 4, + VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_TYPER, + its_mmio_read_raz, its_mmio_write_wi, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_CBASER, + its_mmio_read_raz, its_mmio_write_wi, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_CWRITER, + its_mmio_read_raz, its_mmio_write_wi, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_CREADR, + its_mmio_read_raz, its_mmio_write_wi, 8, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_BASER, + its_mmio_read_raz, its_mmio_write_wi, 0x40, + VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), + REGISTER_ITS_DESC(GITS_IDREGS_BASE, + its_mmio_read_raz, its_mmio_write_wi, 0x30, + VGIC_ACCESS_32bit), +}; + +static int vgic_its_init_its(struct kvm *kvm, struct vgic_its *its) +{ + struct vgic_io_device *iodev = &its->iodev; + int ret; + + if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) + return -ENXIO; + + iodev->regions = its_registers; + iodev->nr_regions = ARRAY_SIZE(its_registers); + kvm_iodevice_init(&iodev->dev, &kvm_io_gic_ops); + + iodev->base_addr = its->vgic_its_base; + iodev->iodev_type = IODEV_ITS; + iodev->its = its; + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, iodev->base_addr, + KVM_VGIC_V3_ITS_SIZE, &iodev->dev); + mutex_unlock(&kvm->slots_lock); + + return ret; +} diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index 278bfbb36ef9..b92b7d6cabe6 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -42,6 +42,16 @@ static u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len, return reg | ((u64)val << lower); } +bool vgic_has_its(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + + if (dist->vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3) + return false; + + return false; +} + static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { @@ -132,6 +142,32 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu, vgic_put_irq(vcpu->kvm, irq); } +static unsigned long vgic_mmio_read_v3r_ctlr(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + return vgic_cpu->lpis_enabled ? GICR_CTLR_ENABLE_LPIS : 0; +} + + +static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu, + gpa_t addr, unsigned int len, + unsigned long val) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + bool was_enabled = vgic_cpu->lpis_enabled; + + if (!vgic_has_its(vcpu->kvm)) + return; + + vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS; + + if (!was_enabled && vgic_cpu->lpis_enabled) { + /* Eventually do something */ + } +} + static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { @@ -372,7 +408,7 @@ static const struct vgic_register_region vgic_v3_dist_registers[] = { static const struct vgic_register_region vgic_v3_rdbase_registers[] = { REGISTER_DESC_WITH_LENGTH(GICR_CTLR, - vgic_mmio_read_raz, vgic_mmio_write_wi, 4, + vgic_mmio_read_v3r_ctlr, vgic_mmio_write_v3r_ctlr, 4, VGIC_ACCESS_32bit), REGISTER_DESC_WITH_LENGTH(GICR_IIDR, vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4, @@ -450,6 +486,7 @@ int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address) kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops); rd_dev->base_addr = rd_base; + rd_dev->iodev_type = IODEV_REDIST; rd_dev->regions = vgic_v3_rdbase_registers; rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rdbase_registers); rd_dev->redist_vcpu = vcpu; @@ -464,6 +501,7 @@ int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address) kvm_iodevice_init(&sgi_dev->dev, &kvm_io_gic_ops); sgi_dev->base_addr = sgi_base; + sgi_dev->iodev_type = IODEV_REDIST; sgi_dev->regions = vgic_v3_sgibase_registers; sgi_dev->nr_regions = ARRAY_SIZE(vgic_v3_sgibase_registers); sgi_dev->redist_vcpu = vcpu; diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index 5e79e0137cb6..26be827bbfcc 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -473,8 +473,7 @@ static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, { struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); const struct vgic_register_region *region; - struct kvm_vcpu *r_vcpu; - unsigned long data; + unsigned long data = 0; region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions, addr - iodev->base_addr); @@ -483,8 +482,20 @@ static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, return 0; } - r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu; - data = region->read(r_vcpu, addr, len); + switch (iodev->iodev_type) { + case IODEV_CPUIF: + return 1; + case IODEV_DIST: + data = region->read(vcpu, addr, len); + break; + case IODEV_REDIST: + data = region->read(iodev->redist_vcpu, addr, len); + break; + case IODEV_ITS: + data = region->its_read(vcpu->kvm, iodev->its, addr, len); + break; + } + vgic_data_host_to_mmio_bus(val, len, data); return 0; } @@ -494,7 +505,6 @@ static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, { struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); const struct vgic_register_region *region; - struct kvm_vcpu *r_vcpu; unsigned long data = vgic_data_mmio_bus_to_host(val, len); region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions, @@ -505,8 +515,20 @@ static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, if (!check_region(region, addr, len)) return 0; - r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu; - region->write(r_vcpu, addr, len, data); + switch (iodev->iodev_type) { + case IODEV_CPUIF: + break; + case IODEV_DIST: + region->write(vcpu, addr, len, data); + break; + case IODEV_REDIST: + region->write(iodev->redist_vcpu, addr, len, data); + break; + case IODEV_ITS: + region->its_write(vcpu->kvm, iodev->its, addr, len, data); + break; + } + return 0; } @@ -536,6 +558,7 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, } io_device->base_addr = dist_base_address; + io_device->iodev_type = IODEV_DIST; io_device->redist_vcpu = NULL; mutex_lock(&kvm->slots_lock); diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h index 71aa39d4cfdf..366d66378732 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.h +++ b/virt/kvm/arm/vgic/vgic-mmio.h @@ -21,10 +21,19 @@ struct vgic_register_region { unsigned int len; unsigned int bits_per_irq; unsigned int access_flags; - unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr, - unsigned int len); - void (*write)(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, - unsigned long val); + union { + unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len); + unsigned long (*its_read)(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len); + }; + union { + void (*write)(struct kvm_vcpu *vcpu, gpa_t addr, + unsigned int len, unsigned long val); + void (*its_write)(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val); + }; }; extern struct kvm_io_device_ops kvm_io_gic_ops; diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 5b79c340f17e..31807c166d2a 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -72,6 +72,7 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu); int vgic_v3_probe(const struct gic_kvm_info *info); int vgic_v3_map_resources(struct kvm *kvm); int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address); +bool vgic_has_its(struct kvm *kvm); #else static inline void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu) { @@ -123,6 +124,12 @@ static inline int vgic_register_redist_iodevs(struct kvm *kvm, { return -ENODEV; } + +static inline bool vgic_has_its(struct kvm *kvm) +{ + return false; +} + #endif int kvm_register_vgic_device(unsigned long type); From 1085fdc68c6097244627a02a56bd2d8fe58a1a9c Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 15 Jul 2016 12:43:31 +0100 Subject: [PATCH 265/302] KVM: arm64: vgic-its: Introduce new KVM ITS device Introduce a new KVM device that represents an ARM Interrupt Translation Service (ITS) controller. Since there can be multiple of this per guest, we can't piggy back on the existing GICv3 distributor device, but create a new type of KVM device. On the KVM_CREATE_DEVICE ioctl we allocate and initialize the ITS data structure and store the pointer in the kvm_device data. Upon an explicit init ioctl from userland (after having setup the MMIO address) we register the handlers with the kvm_io_bus framework. Any reference to an ITS thus has to go via this interface. Signed-off-by: Andre Przywara Reviewed-by: Marc Zyngier Tested-by: Eric Auger Signed-off-by: Marc Zyngier --- .../virtual/kvm/devices/arm-vgic.txt | 25 +++- arch/arm/kvm/arm.c | 1 + arch/arm64/include/uapi/asm/kvm.h | 2 + include/kvm/arm_vgic.h | 3 + include/uapi/linux/kvm.h | 2 + virt/kvm/arm/vgic/vgic-its.c | 135 ++++++++++++++++++ virt/kvm/arm/vgic/vgic-kvm-device.c | 4 +- virt/kvm/arm/vgic/vgic-mmio-v3.c | 2 +- virt/kvm/arm/vgic/vgic.h | 3 + 9 files changed, 168 insertions(+), 9 deletions(-) diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt b/Documentation/virtual/kvm/devices/arm-vgic.txt index 59541d49e15c..89182f80cc7f 100644 --- a/Documentation/virtual/kvm/devices/arm-vgic.txt +++ b/Documentation/virtual/kvm/devices/arm-vgic.txt @@ -4,16 +4,22 @@ ARM Virtual Generic Interrupt Controller (VGIC) Device types supported: KVM_DEV_TYPE_ARM_VGIC_V2 ARM Generic Interrupt Controller v2.0 KVM_DEV_TYPE_ARM_VGIC_V3 ARM Generic Interrupt Controller v3.0 + KVM_DEV_TYPE_ARM_VGIC_ITS ARM Interrupt Translation Service Controller -Only one VGIC instance may be instantiated through either this API or the -legacy KVM_CREATE_IRQCHIP api. The created VGIC will act as the VM interrupt -controller, requiring emulated user-space devices to inject interrupts to the -VGIC instead of directly to CPUs. +Only one VGIC instance of the V2/V3 types above may be instantiated through +either this API or the legacy KVM_CREATE_IRQCHIP api. The created VGIC will +act as the VM interrupt controller, requiring emulated user-space devices to +inject interrupts to the VGIC instead of directly to CPUs. Creating a guest GICv3 device requires a host GICv3 as well. GICv3 implementations with hardware compatibility support allow a guest GICv2 as well. +Creating a virtual ITS controller requires a host GICv3 (but does not depend +on having physical ITS controllers). +There can be multiple ITS controllers per guest, each of them has to have +a separate, non-overlapping MMIO region. + Groups: KVM_DEV_ARM_VGIC_GRP_ADDR Attributes: @@ -39,6 +45,13 @@ Groups: Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. This address needs to be 64K aligned. + KVM_VGIC_V3_ADDR_TYPE_ITS (rw, 64-bit) + Base address in the guest physical address space of the GICv3 ITS + control register frame. The ITS allows MSI(-X) interrupts to be + injected into guests. This extension is optional. If the kernel + does not support the ITS, the call returns -ENODEV. + Only valid for KVM_DEV_TYPE_ARM_VGIC_ITS. + This address needs to be 64K aligned and the region covers 128K. KVM_DEV_ARM_VGIC_GRP_DIST_REGS Attributes: @@ -109,8 +122,8 @@ Groups: KVM_DEV_ARM_VGIC_GRP_CTRL Attributes: KVM_DEV_ARM_VGIC_CTRL_INIT - request the initialization of the VGIC, no additional parameter in - kvm_device_attr.addr. + request the initialization of the VGIC or ITS, no additional parameter + in kvm_device_attr.addr. Errors: -ENXIO: VGIC not properly configured as required prior to calling this attribute diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 972075cc111c..fb4661cf896e 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index f209ea151dca..3051f86a9b5f 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -87,9 +87,11 @@ struct kvm_regs { /* Supported VGICv3 address types */ #define KVM_VGIC_V3_ADDR_TYPE_DIST 2 #define KVM_VGIC_V3_ADDR_TYPE_REDIST 3 +#define KVM_VGIC_ITS_ADDR_TYPE 4 #define KVM_VGIC_V3_DIST_SIZE SZ_64K #define KVM_VGIC_V3_REDIST_SIZE (2 * SZ_64K) +#define KVM_VGIC_V3_ITS_SIZE (2 * SZ_64K) #define KVM_ARM_VCPU_POWER_OFF 0 /* CPU is started in OFF state */ #define KVM_ARM_VCPU_EL1_32BIT 1 /* CPU running a 32bit VM */ diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 685f33975ce4..8609faced83e 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -134,6 +134,7 @@ struct vgic_its { gpa_t vgic_its_base; bool enabled; + bool initialized; struct vgic_io_device iodev; }; @@ -167,6 +168,8 @@ struct vgic_dist { struct vgic_io_device dist_iodev; + bool has_its; + /* * Contains the attributes and gpa of the LPI configuration table. * Since we report GICR_TYPER.CommonLPIAff as 0b00, we can share diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 7de96f5bb92c..d8c4c324cfae 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1077,6 +1077,8 @@ enum kvm_device_type { #define KVM_DEV_TYPE_FLIC KVM_DEV_TYPE_FLIC KVM_DEV_TYPE_ARM_VGIC_V3, #define KVM_DEV_TYPE_ARM_VGIC_V3 KVM_DEV_TYPE_ARM_VGIC_V3 + KVM_DEV_TYPE_ARM_VGIC_ITS, +#define KVM_DEV_TYPE_ARM_VGIC_ITS KVM_DEV_TYPE_ARM_VGIC_ITS KVM_DEV_TYPE_MAX, }; diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 4654d6edf6a6..6b47b3674690 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -84,6 +85,9 @@ static int vgic_its_init_its(struct kvm *kvm, struct vgic_its *its) struct vgic_io_device *iodev = &its->iodev; int ret; + if (its->initialized) + return 0; + if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) return -ENXIO; @@ -99,5 +103,136 @@ static int vgic_its_init_its(struct kvm *kvm, struct vgic_its *its) KVM_VGIC_V3_ITS_SIZE, &iodev->dev); mutex_unlock(&kvm->slots_lock); + if (!ret) + its->initialized = true; + return ret; } + +static int vgic_its_create(struct kvm_device *dev, u32 type) +{ + struct vgic_its *its; + + if (type != KVM_DEV_TYPE_ARM_VGIC_ITS) + return -ENODEV; + + its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL); + if (!its) + return -ENOMEM; + + its->vgic_its_base = VGIC_ADDR_UNDEF; + + dev->kvm->arch.vgic.has_its = true; + its->initialized = false; + its->enabled = false; + + dev->private = its; + + return 0; +} + +static void vgic_its_destroy(struct kvm_device *kvm_dev) +{ + struct vgic_its *its = kvm_dev->private; + + kfree(its); +} + +static int vgic_its_has_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: + switch (attr->attr) { + case KVM_VGIC_ITS_ADDR_TYPE: + return 0; + } + break; + case KVM_DEV_ARM_VGIC_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_CTRL_INIT: + return 0; + } + break; + } + return -ENXIO; +} + +static int vgic_its_set_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + struct vgic_its *its = dev->private; + int ret; + + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + unsigned long type = (unsigned long)attr->attr; + u64 addr; + + if (type != KVM_VGIC_ITS_ADDR_TYPE) + return -ENODEV; + + if (its->initialized) + return -EBUSY; + + if (copy_from_user(&addr, uaddr, sizeof(addr))) + return -EFAULT; + + ret = vgic_check_ioaddr(dev->kvm, &its->vgic_its_base, + addr, SZ_64K); + if (ret) + return ret; + + its->vgic_its_base = addr; + + return 0; + } + case KVM_DEV_ARM_VGIC_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_ARM_VGIC_CTRL_INIT: + return vgic_its_init_its(dev->kvm, its); + } + break; + } + return -ENXIO; +} + +static int vgic_its_get_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_ADDR: { + struct vgic_its *its = dev->private; + u64 addr = its->vgic_its_base; + u64 __user *uaddr = (u64 __user *)(long)attr->addr; + unsigned long type = (unsigned long)attr->attr; + + if (type != KVM_VGIC_ITS_ADDR_TYPE) + return -ENODEV; + + if (copy_to_user(uaddr, &addr, sizeof(addr))) + return -EFAULT; + break; + default: + return -ENXIO; + } + } + + return 0; +} + +static struct kvm_device_ops kvm_arm_vgic_its_ops = { + .name = "kvm-arm-vgic-its", + .create = vgic_its_create, + .destroy = vgic_its_destroy, + .set_attr = vgic_its_set_attr, + .get_attr = vgic_its_get_attr, + .has_attr = vgic_its_has_attr, +}; + +int kvm_vgic_register_its_device(void) +{ + return kvm_register_device_ops(&kvm_arm_vgic_its_ops, + KVM_DEV_TYPE_ARM_VGIC_ITS); +} diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c index 2f24f13c6c90..561d2ba96a4f 100644 --- a/virt/kvm/arm/vgic/vgic-kvm-device.c +++ b/virt/kvm/arm/vgic/vgic-kvm-device.c @@ -21,8 +21,8 @@ /* common helpers */ -static int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, - phys_addr_t addr, phys_addr_t alignment) +int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, + phys_addr_t addr, phys_addr_t alignment) { if (addr & ~KVM_PHYS_MASK) return -E2BIG; diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index b92b7d6cabe6..a5c35050c786 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -49,7 +49,7 @@ bool vgic_has_its(struct kvm *kvm) if (dist->vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3) return false; - return false; + return dist->has_its; } static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu, diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 31807c166d2a..8192a293f119 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -42,6 +42,9 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq); void vgic_kick_vcpus(struct kvm *kvm); +int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, + phys_addr_t addr, phys_addr_t alignment); + void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu); void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); From 424c33830f53f248a68da125e70d9a4d95a8e010 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 15 Jul 2016 12:43:32 +0100 Subject: [PATCH 266/302] KVM: arm64: vgic-its: Implement basic ITS register handlers Add emulation for some basic MMIO registers used in the ITS emulation. This includes: - GITS_{CTLR,TYPER,IIDR} - ID registers - GITS_{CBASER,CREADR,CWRITER} (which implement the ITS command buffer handling) - GITS_BASER Most of the handlers are pretty straight forward, only the CWRITER handler is a bit more involved by taking the new its_cmd mutex and then iterating over the command buffer. The registers holding base addresses and attributes are sanitised before storing them. Signed-off-by: Andre Przywara Reviewed-by: Marc Zyngier Tested-by: Eric Auger Signed-off-by: Marc Zyngier --- include/kvm/arm_vgic.h | 16 ++ virt/kvm/arm/vgic/vgic-its.c | 399 +++++++++++++++++++++++++++++-- virt/kvm/arm/vgic/vgic-mmio-v3.c | 8 +- virt/kvm/arm/vgic/vgic-mmio.h | 6 + virt/kvm/arm/vgic/vgic.c | 12 +- 5 files changed, 420 insertions(+), 21 deletions(-) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 8609faced83e..61867492d361 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -22,6 +22,7 @@ #include #include #include +#include #define VGIC_V3_MAX_CPUS 255 #define VGIC_V2_MAX_CPUS 8 @@ -136,6 +137,21 @@ struct vgic_its { bool enabled; bool initialized; struct vgic_io_device iodev; + + /* These registers correspond to GITS_BASER{0,1} */ + u64 baser_device_table; + u64 baser_coll_table; + + /* Protects the command queue */ + struct mutex cmd_lock; + u64 cbaser; + u32 creadr; + u32 cwriter; + + /* Protects the device and collection lists */ + struct mutex its_lock; + struct list_head device_list; + struct list_head collection_list; }; struct vgic_dist { diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 6b47b3674690..11cfe2f12c6c 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -32,6 +33,329 @@ #include "vgic.h" #include "vgic-mmio.h" +struct its_device { + struct list_head dev_list; + + /* the head for the list of ITTEs */ + struct list_head itt_head; + u32 device_id; +}; + +#define COLLECTION_NOT_MAPPED ((u32)~0) + +struct its_collection { + struct list_head coll_list; + + u32 collection_id; + u32 target_addr; +}; + +#define its_is_collection_mapped(coll) ((coll) && \ + ((coll)->target_addr != COLLECTION_NOT_MAPPED)) + +struct its_itte { + struct list_head itte_list; + + struct its_collection *collection; + u32 lpi; + u32 event_id; +}; + +/* + * We only implement 48 bits of PA at the moment, although the ITS + * supports more. Let's be restrictive here. + */ +#define CBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 12)) + +static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + u32 reg = 0; + + mutex_lock(&its->cmd_lock); + if (its->creadr == its->cwriter) + reg |= GITS_CTLR_QUIESCENT; + if (its->enabled) + reg |= GITS_CTLR_ENABLE; + mutex_unlock(&its->cmd_lock); + + return reg; +} + +static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + its->enabled = !!(val & GITS_CTLR_ENABLE); +} + +static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + u64 reg = GITS_TYPER_PLPIS; + + /* + * We use linear CPU numbers for redistributor addressing, + * so GITS_TYPER.PTA is 0. + * Also we force all PROPBASER registers to be the same, so + * CommonLPIAff is 0 as well. + * To avoid memory waste in the guest, we keep the number of IDBits and + * DevBits low - as least for the time being. + */ + reg |= 0x0f << GITS_TYPER_DEVBITS_SHIFT; + reg |= 0x0f << GITS_TYPER_IDBITS_SHIFT; + + return extract_bytes(reg, addr & 7, len); +} + +static unsigned long vgic_mmio_read_its_iidr(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + return (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); +} + +static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + switch (addr & 0xffff) { + case GITS_PIDR0: + return 0x92; /* part number, bits[7:0] */ + case GITS_PIDR1: + return 0xb4; /* part number, bits[11:8] */ + case GITS_PIDR2: + return GIC_PIDR2_ARCH_GICv3 | 0x0b; + case GITS_PIDR4: + return 0x40; /* This is a 64K software visible page */ + /* The following are the ID registers for (any) GIC. */ + case GITS_CIDR0: + return 0x0d; + case GITS_CIDR1: + return 0xf0; + case GITS_CIDR2: + return 0x05; + case GITS_CIDR3: + return 0xb1; + } + + return 0; +} + +/* Requires the its_lock to be held. */ +static void its_free_itte(struct kvm *kvm, struct its_itte *itte) +{ + list_del(&itte->itte_list); + kfree(itte); +} + +static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + return -ENODEV; +} + +static u64 vgic_sanitise_its_baser(u64 reg) +{ + reg = vgic_sanitise_field(reg, GITS_BASER_SHAREABILITY_MASK, + GITS_BASER_SHAREABILITY_SHIFT, + vgic_sanitise_shareability); + reg = vgic_sanitise_field(reg, GITS_BASER_INNER_CACHEABILITY_MASK, + GITS_BASER_INNER_CACHEABILITY_SHIFT, + vgic_sanitise_inner_cacheability); + reg = vgic_sanitise_field(reg, GITS_BASER_OUTER_CACHEABILITY_MASK, + GITS_BASER_OUTER_CACHEABILITY_SHIFT, + vgic_sanitise_outer_cacheability); + + /* Bits 15:12 contain bits 51:48 of the PA, which we don't support. */ + reg &= ~GENMASK_ULL(15, 12); + + /* We support only one (ITS) page size: 64K */ + reg = (reg & ~GITS_BASER_PAGE_SIZE_MASK) | GITS_BASER_PAGE_SIZE_64K; + + return reg; +} + +static u64 vgic_sanitise_its_cbaser(u64 reg) +{ + reg = vgic_sanitise_field(reg, GITS_CBASER_SHAREABILITY_MASK, + GITS_CBASER_SHAREABILITY_SHIFT, + vgic_sanitise_shareability); + reg = vgic_sanitise_field(reg, GITS_CBASER_INNER_CACHEABILITY_MASK, + GITS_CBASER_INNER_CACHEABILITY_SHIFT, + vgic_sanitise_inner_cacheability); + reg = vgic_sanitise_field(reg, GITS_CBASER_OUTER_CACHEABILITY_MASK, + GITS_CBASER_OUTER_CACHEABILITY_SHIFT, + vgic_sanitise_outer_cacheability); + + /* + * Sanitise the physical address to be 64k aligned. + * Also limit the physical addresses to 48 bits. + */ + reg &= ~(GENMASK_ULL(51, 48) | GENMASK_ULL(15, 12)); + + return reg; +} + +static unsigned long vgic_mmio_read_its_cbaser(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + return extract_bytes(its->cbaser, addr & 7, len); +} + +static void vgic_mmio_write_its_cbaser(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + /* When GITS_CTLR.Enable is 1, this register is RO. */ + if (its->enabled) + return; + + mutex_lock(&its->cmd_lock); + its->cbaser = update_64bit_reg(its->cbaser, addr & 7, len, val); + its->cbaser = vgic_sanitise_its_cbaser(its->cbaser); + its->creadr = 0; + /* + * CWRITER is architecturally UNKNOWN on reset, but we need to reset + * it to CREADR to make sure we start with an empty command buffer. + */ + its->cwriter = its->creadr; + mutex_unlock(&its->cmd_lock); +} + +#define ITS_CMD_BUFFER_SIZE(baser) ((((baser) & 0xff) + 1) << 12) +#define ITS_CMD_SIZE 32 +#define ITS_CMD_OFFSET(reg) ((reg) & GENMASK(19, 5)) + +/* + * By writing to CWRITER the guest announces new commands to be processed. + * To avoid any races in the first place, we take the its_cmd lock, which + * protects our ring buffer variables, so that there is only one user + * per ITS handling commands at a given time. + */ +static void vgic_mmio_write_its_cwriter(struct kvm *kvm, struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + gpa_t cbaser; + u64 cmd_buf[4]; + u32 reg; + + if (!its) + return; + + mutex_lock(&its->cmd_lock); + + reg = update_64bit_reg(its->cwriter, addr & 7, len, val); + reg = ITS_CMD_OFFSET(reg); + if (reg >= ITS_CMD_BUFFER_SIZE(its->cbaser)) { + mutex_unlock(&its->cmd_lock); + return; + } + + its->cwriter = reg; + cbaser = CBASER_ADDRESS(its->cbaser); + + while (its->cwriter != its->creadr) { + int ret = kvm_read_guest(kvm, cbaser + its->creadr, + cmd_buf, ITS_CMD_SIZE); + /* + * If kvm_read_guest() fails, this could be due to the guest + * programming a bogus value in CBASER or something else going + * wrong from which we cannot easily recover. + * According to section 6.3.2 in the GICv3 spec we can just + * ignore that command then. + */ + if (!ret) + vgic_its_handle_command(kvm, its, cmd_buf); + + its->creadr += ITS_CMD_SIZE; + if (its->creadr == ITS_CMD_BUFFER_SIZE(its->cbaser)) + its->creadr = 0; + } + + mutex_unlock(&its->cmd_lock); +} + +static unsigned long vgic_mmio_read_its_cwriter(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + return extract_bytes(its->cwriter, addr & 0x7, len); +} + +static unsigned long vgic_mmio_read_its_creadr(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + return extract_bytes(its->creadr, addr & 0x7, len); +} + +#define BASER_INDEX(addr) (((addr) / sizeof(u64)) & 0x7) +static unsigned long vgic_mmio_read_its_baser(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len) +{ + u64 reg; + + switch (BASER_INDEX(addr)) { + case 0: + reg = its->baser_device_table; + break; + case 1: + reg = its->baser_coll_table; + break; + default: + reg = 0; + break; + } + + return extract_bytes(reg, addr & 7, len); +} + +#define GITS_BASER_RO_MASK (GENMASK_ULL(52, 48) | GENMASK_ULL(58, 56)) +static void vgic_mmio_write_its_baser(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u64 entry_size, device_type; + u64 reg, *regptr, clearbits = 0; + + /* When GITS_CTLR.Enable is 1, we ignore write accesses. */ + if (its->enabled) + return; + + switch (BASER_INDEX(addr)) { + case 0: + regptr = &its->baser_device_table; + entry_size = 8; + device_type = GITS_BASER_TYPE_DEVICE; + break; + case 1: + regptr = &its->baser_coll_table; + entry_size = 8; + device_type = GITS_BASER_TYPE_COLLECTION; + clearbits = GITS_BASER_INDIRECT; + break; + default: + return; + } + + reg = update_64bit_reg(*regptr, addr & 7, len, val); + reg &= ~GITS_BASER_RO_MASK; + reg &= ~clearbits; + + reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT; + reg |= device_type << GITS_BASER_TYPE_SHIFT; + reg = vgic_sanitise_its_baser(reg); + + *regptr = reg; +} + #define REGISTER_ITS_DESC(off, rd, wr, length, acc) \ { \ .reg_offset = off, \ @@ -41,12 +365,6 @@ .its_write = wr, \ } -static unsigned long its_mmio_read_raz(struct kvm *kvm, struct vgic_its *its, - gpa_t addr, unsigned int len) -{ - return 0; -} - static void its_mmio_write_wi(struct kvm *kvm, struct vgic_its *its, gpa_t addr, unsigned int len, unsigned long val) { @@ -55,28 +373,28 @@ static void its_mmio_write_wi(struct kvm *kvm, struct vgic_its *its, static struct vgic_register_region its_registers[] = { REGISTER_ITS_DESC(GITS_CTLR, - its_mmio_read_raz, its_mmio_write_wi, 4, + vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4, VGIC_ACCESS_32bit), REGISTER_ITS_DESC(GITS_IIDR, - its_mmio_read_raz, its_mmio_write_wi, 4, + vgic_mmio_read_its_iidr, its_mmio_write_wi, 4, VGIC_ACCESS_32bit), REGISTER_ITS_DESC(GITS_TYPER, - its_mmio_read_raz, its_mmio_write_wi, 8, + vgic_mmio_read_its_typer, its_mmio_write_wi, 8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), REGISTER_ITS_DESC(GITS_CBASER, - its_mmio_read_raz, its_mmio_write_wi, 8, + vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser, 8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), REGISTER_ITS_DESC(GITS_CWRITER, - its_mmio_read_raz, its_mmio_write_wi, 8, + vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter, 8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), REGISTER_ITS_DESC(GITS_CREADR, - its_mmio_read_raz, its_mmio_write_wi, 8, + vgic_mmio_read_its_creadr, its_mmio_write_wi, 8, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), REGISTER_ITS_DESC(GITS_BASER, - its_mmio_read_raz, its_mmio_write_wi, 0x40, + vgic_mmio_read_its_baser, vgic_mmio_write_its_baser, 0x40, VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), REGISTER_ITS_DESC(GITS_IDREGS_BASE, - its_mmio_read_raz, its_mmio_write_wi, 0x30, + vgic_mmio_read_its_idregs, its_mmio_write_wi, 0x30, VGIC_ACCESS_32bit), }; @@ -109,6 +427,18 @@ static int vgic_its_init_its(struct kvm *kvm, struct vgic_its *its) return ret; } +#define INITIAL_BASER_VALUE \ + (GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb) | \ + GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner) | \ + GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) | \ + ((8ULL - 1) << GITS_BASER_ENTRY_SIZE_SHIFT) | \ + GITS_BASER_PAGE_SIZE_64K) + +#define INITIAL_PROPBASER_VALUE \ + (GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb) | \ + GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, SameAsInner) | \ + GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable)) + static int vgic_its_create(struct kvm_device *dev, u32 type) { struct vgic_its *its; @@ -120,12 +450,24 @@ static int vgic_its_create(struct kvm_device *dev, u32 type) if (!its) return -ENOMEM; + mutex_init(&its->its_lock); + mutex_init(&its->cmd_lock); + its->vgic_its_base = VGIC_ADDR_UNDEF; + INIT_LIST_HEAD(&its->device_list); + INIT_LIST_HEAD(&its->collection_list); + dev->kvm->arch.vgic.has_its = true; its->initialized = false; its->enabled = false; + its->baser_device_table = INITIAL_BASER_VALUE | + ((u64)GITS_BASER_TYPE_DEVICE << GITS_BASER_TYPE_SHIFT); + its->baser_coll_table = INITIAL_BASER_VALUE | + ((u64)GITS_BASER_TYPE_COLLECTION << GITS_BASER_TYPE_SHIFT); + dev->kvm->arch.vgic.propbaser = INITIAL_PROPBASER_VALUE; + dev->private = its; return 0; @@ -133,7 +475,36 @@ static int vgic_its_create(struct kvm_device *dev, u32 type) static void vgic_its_destroy(struct kvm_device *kvm_dev) { + struct kvm *kvm = kvm_dev->kvm; struct vgic_its *its = kvm_dev->private; + struct its_device *dev; + struct its_itte *itte; + struct list_head *dev_cur, *dev_temp; + struct list_head *cur, *temp; + + /* + * We may end up here without the lists ever having been initialized. + * Check this and bail out early to avoid dereferencing a NULL pointer. + */ + if (!its->device_list.next) + return; + + mutex_lock(&its->its_lock); + list_for_each_safe(dev_cur, dev_temp, &its->device_list) { + dev = container_of(dev_cur, struct its_device, dev_list); + list_for_each_safe(cur, temp, &dev->itt_head) { + itte = (container_of(cur, struct its_itte, itte_list)); + its_free_itte(kvm, itte); + } + list_del(dev_cur); + kfree(dev); + } + + list_for_each_safe(cur, temp, &its->collection_list) { + list_del(cur); + kfree(container_of(cur, struct its_collection, coll_list)); + } + mutex_unlock(&its->its_lock); kfree(its); } diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index a5c35050c786..84a301d789e0 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -23,15 +23,15 @@ #include "vgic-mmio.h" /* extract @num bytes at @offset bytes offset in data */ -static unsigned long extract_bytes(unsigned long data, unsigned int offset, - unsigned int num) +unsigned long extract_bytes(unsigned long data, unsigned int offset, + unsigned int num) { return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0); } /* allows updates of any half of a 64-bit register (or the whole thing) */ -static u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len, - unsigned long val) +u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len, + unsigned long val) { int lower = (offset & 4) * 8; int upper = lower + 8 * len - 1; diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h index 366d66378732..0b3ecf9d100e 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.h +++ b/virt/kvm/arm/vgic/vgic-mmio.h @@ -96,6 +96,12 @@ unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len); void vgic_data_host_to_mmio_bus(void *buf, unsigned int len, unsigned long data); +unsigned long extract_bytes(unsigned long data, unsigned int offset, + unsigned int num); + +u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len, + unsigned long val); + unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len); diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index fb19a554d090..d3ba1b4227e7 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -33,10 +33,16 @@ struct vgic_global __section(.hyp.text) kvm_vgic_global_state; /* * Locking order is always: - * vgic_cpu->ap_list_lock - * vgic_irq->irq_lock + * its->cmd_lock (mutex) + * its->its_lock (mutex) + * vgic_cpu->ap_list_lock + * vgic_irq->irq_lock * - * (that is, always take the ap_list_lock before the struct vgic_irq lock). + * If you need to take multiple locks, always take the upper lock first, + * then the lower ones, e.g. first take the its_lock, then the irq_lock. + * If you are already holding a lock and need to take a higher one, you + * have to drop the lower ranking lock first and re-aquire it after having + * taken the upper one. * * When taking more than one ap_list_lock at the same time, always take the * lowest numbered VCPU's ap_list_lock first, so: From 3802411d01880c4283426d22653e011159b1c947 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 15 Jul 2016 12:43:33 +0100 Subject: [PATCH 267/302] KVM: arm64: vgic-its: Connect LPIs to the VGIC emulation LPIs are dynamically created (mapped) at guest runtime and their actual number can be quite high, but is mostly assigned using a very sparse allocation scheme. So arrays are not an ideal data structure to hold the information. We use a spin-lock protected linked list to hold all mapped LPIs, represented by their struct vgic_irq. This lock is grouped between the ap_list_lock and the vgic_irq lock in our locking order. Also we store a pointer to that struct vgic_irq in our struct its_itte, so we can easily access it. Eventually we call our new vgic_get_lpi() from vgic_get_irq(), so the VGIC code gets transparently access to LPIs. Signed-off-by: Andre Przywara Reviewed-by: Marc Zyngier Tested-by: Eric Auger Signed-off-by: Marc Zyngier --- include/kvm/arm_vgic.h | 6 ++++ virt/kvm/arm/vgic/vgic-init.c | 3 ++ virt/kvm/arm/vgic/vgic-its.c | 5 +++ virt/kvm/arm/vgic/vgic-v3.c | 2 ++ virt/kvm/arm/vgic/vgic.c | 63 +++++++++++++++++++++++++++++++---- 5 files changed, 73 insertions(+), 6 deletions(-) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 61867492d361..a6ca326055cf 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -77,6 +77,7 @@ enum vgic_irq_config { struct vgic_irq { spinlock_t irq_lock; /* Protects the content of the struct */ + struct list_head lpi_list; /* Used to link all LPIs together */ struct list_head ap_list; struct kvm_vcpu *vcpu; /* SGIs and PPIs: The VCPU @@ -193,6 +194,11 @@ struct vgic_dist { * GICv3 spec: 6.1.2 "LPI Configuration tables" */ u64 propbaser; + + /* Protects the lpi_list and the count value below. */ + spinlock_t lpi_list_lock; + struct list_head lpi_list_head; + int lpi_list_count; }; struct vgic_v2_cpu_if { diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c index ac3c1a5f7bf4..535e713704f0 100644 --- a/virt/kvm/arm/vgic/vgic-init.c +++ b/virt/kvm/arm/vgic/vgic-init.c @@ -157,6 +157,9 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0); int i; + INIT_LIST_HEAD(&dist->lpi_list_head); + spin_lock_init(&dist->lpi_list_lock); + dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL); if (!dist->spis) return -ENOMEM; diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 11cfe2f12c6c..14f91ff487cc 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -56,6 +56,7 @@ struct its_collection { struct its_itte { struct list_head itte_list; + struct vgic_irq *irq; struct its_collection *collection; u32 lpi; u32 event_id; @@ -148,6 +149,10 @@ static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm, static void its_free_itte(struct kvm *kvm, struct its_itte *itte) { list_del(&itte->itte_list); + + /* This put matches the get in vgic_add_lpi. */ + vgic_put_irq(kvm, itte->irq); + kfree(itte); } diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index 6f8f31f910e7..0506543df38a 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -81,6 +81,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) else intid = val & GICH_LR_VIRTUALID; irq = vgic_get_irq(vcpu->kvm, vcpu, intid); + if (!irq) /* An LPI could have been unmapped. */ + continue; spin_lock(&irq->irq_lock); diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index d3ba1b4227e7..53299fc93c15 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -36,7 +36,8 @@ struct vgic_global __section(.hyp.text) kvm_vgic_global_state; * its->cmd_lock (mutex) * its->its_lock (mutex) * vgic_cpu->ap_list_lock - * vgic_irq->irq_lock + * kvm->lpi_list_lock + * vgic_irq->irq_lock * * If you need to take multiple locks, always take the upper lock first, * then the lower ones, e.g. first take the its_lock, then the irq_lock. @@ -51,6 +52,41 @@ struct vgic_global __section(.hyp.text) kvm_vgic_global_state; * spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock); */ +/* + * Iterate over the VM's list of mapped LPIs to find the one with a + * matching interrupt ID and return a reference to the IRQ structure. + */ +static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_irq *irq = NULL; + + spin_lock(&dist->lpi_list_lock); + + list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { + if (irq->intid != intid) + continue; + + /* + * This increases the refcount, the caller is expected to + * call vgic_put_irq() later once it's finished with the IRQ. + */ + kref_get(&irq->refcount); + goto out_unlock; + } + irq = NULL; + +out_unlock: + spin_unlock(&dist->lpi_list_lock); + + return irq; +} + +/* + * This looks up the virtual interrupt ID to get the corresponding + * struct vgic_irq. It also increases the refcount, so any caller is expected + * to call vgic_put_irq() once it's finished with this IRQ. + */ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 intid) { @@ -62,9 +98,9 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, if (intid <= VGIC_MAX_SPI) return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS]; - /* LPIs are not yet covered */ + /* LPIs */ if (intid >= VGIC_MIN_LPI) - return NULL; + return vgic_get_lpi(kvm, intid); WARN(1, "Looking up struct vgic_irq for reserved INTID"); return NULL; @@ -78,18 +114,33 @@ static void vgic_get_irq_kref(struct vgic_irq *irq) kref_get(&irq->refcount); } -/* The refcount should never drop to 0 at the moment. */ +/* + * We can't do anything in here, because we lack the kvm pointer to + * lock and remove the item from the lpi_list. So we keep this function + * empty and use the return value of kref_put() to trigger the freeing. + */ static void vgic_irq_release(struct kref *ref) { - WARN_ON(1); } void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) { + struct vgic_dist *dist; + if (irq->intid < VGIC_MIN_LPI) return; - kref_put(&irq->refcount, vgic_irq_release); + if (!kref_put(&irq->refcount, vgic_irq_release)) + return; + + dist = &kvm->arch.vgic; + + spin_lock(&dist->lpi_list_lock); + list_del(&irq->lpi_list); + dist->lpi_list_count--; + spin_unlock(&dist->lpi_list_lock); + + kfree(irq); } /** From 33d3bc9556a7dda5bba2cb6b2d08ae4841ae423e Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 15 Jul 2016 12:43:34 +0100 Subject: [PATCH 268/302] KVM: arm64: vgic-its: Read initial LPI pending table The LPI pending status for a GICv3 redistributor is held in a table in (guest) memory. To achieve reasonable performance, we cache the pending bit in our struct vgic_irq. The initial pending state must be read from guest memory upon enabling LPIs for this redistributor. As we can't access the guest memory while we hold the lpi_list spinlock, we create a snapshot of the LPI list and iterate over that. Signed-off-by: Andre Przywara Reviewed-by: Marc Zyngier Tested-by: Eric Auger Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-its.c | 95 ++++++++++++++++++++++++++++++++++++ virt/kvm/arm/vgic/vgic.h | 5 ++ 2 files changed, 100 insertions(+) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 14f91ff487cc..2881b84cbf75 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -67,6 +67,94 @@ struct its_itte { * supports more. Let's be restrictive here. */ #define CBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 12)) +#define PENDBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 16)) + +/* + * Create a snapshot of the current LPI list, so that we can enumerate all + * LPIs without holding any lock. + * Returns the array length and puts the kmalloc'ed array into intid_ptr. + */ +static int vgic_copy_lpi_list(struct kvm *kvm, u32 **intid_ptr) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_irq *irq; + u32 *intids; + int irq_count = dist->lpi_list_count, i = 0; + + /* + * We use the current value of the list length, which may change + * after the kmalloc. We don't care, because the guest shouldn't + * change anything while the command handling is still running, + * and in the worst case we would miss a new IRQ, which one wouldn't + * expect to be covered by this command anyway. + */ + intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL); + if (!intids) + return -ENOMEM; + + spin_lock(&dist->lpi_list_lock); + list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { + /* We don't need to "get" the IRQ, as we hold the list lock. */ + intids[i] = irq->intid; + if (++i == irq_count) + break; + } + spin_unlock(&dist->lpi_list_lock); + + *intid_ptr = intids; + return irq_count; +} + +/* + * Scan the whole LPI pending table and sync the pending bit in there + * with our own data structures. This relies on the LPI being + * mapped before. + */ +static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu) +{ + gpa_t pendbase = PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); + struct vgic_irq *irq; + int last_byte_offset = -1; + int ret = 0; + u32 *intids; + int nr_irqs, i; + + nr_irqs = vgic_copy_lpi_list(vcpu->kvm, &intids); + if (nr_irqs < 0) + return nr_irqs; + + for (i = 0; i < nr_irqs; i++) { + int byte_offset, bit_nr; + u8 pendmask; + + byte_offset = intids[i] / BITS_PER_BYTE; + bit_nr = intids[i] % BITS_PER_BYTE; + + /* + * For contiguously allocated LPIs chances are we just read + * this very same byte in the last iteration. Reuse that. + */ + if (byte_offset != last_byte_offset) { + ret = kvm_read_guest(vcpu->kvm, pendbase + byte_offset, + &pendmask, 1); + if (ret) { + kfree(intids); + return ret; + } + last_byte_offset = byte_offset; + } + + irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]); + spin_lock(&irq->irq_lock); + irq->pending = pendmask & (1U << bit_nr); + vgic_queue_irq_unlock(vcpu->kvm, irq); + vgic_put_irq(vcpu->kvm, irq); + } + + kfree(intids); + + return ret; +} static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu, struct vgic_its *its, @@ -403,6 +491,13 @@ static struct vgic_register_region its_registers[] = { VGIC_ACCESS_32bit), }; +/* This is called on setting the LPI enable bit in the redistributor. */ +void vgic_enable_lpis(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->arch.vgic_cpu.pendbaser & GICR_PENDBASER_PTZ)) + its_sync_lpi_pending_table(vcpu); +} + static int vgic_its_init_its(struct kvm *kvm, struct vgic_its *its) { struct vgic_io_device *iodev = &its->iodev; diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 8192a293f119..ee348deb8737 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -25,6 +25,7 @@ #define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF) #define INTERRUPT_ID_BITS_SPIS 10 +#define INTERRUPT_ID_BITS_ITS 16 #define VGIC_PRI_BITS 5 #define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS) @@ -76,6 +77,7 @@ int vgic_v3_probe(const struct gic_kvm_info *info); int vgic_v3_map_resources(struct kvm *kvm); int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address); bool vgic_has_its(struct kvm *kvm); +void vgic_enable_lpis(struct kvm_vcpu *vcpu); #else static inline void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu) { @@ -133,6 +135,9 @@ static inline bool vgic_has_its(struct kvm *kvm) return false; } +static inline void vgic_enable_lpis(struct kvm_vcpu *vcpu) +{ +} #endif int kvm_register_vgic_device(unsigned long type); From f9f77af9e2a551ac34eb0eb40630d91d6dbd4295 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 15 Jul 2016 12:43:35 +0100 Subject: [PATCH 269/302] KVM: arm64: vgic-its: Allow updates of LPI configuration table The (system-wide) LPI configuration table is held in a table in (guest) memory. To achieve reasonable performance, we cache this data in our struct vgic_irq. If the guest updates the configuration data (which consists of the enable bit and the priority value), it issues an INV or INVALL command to allow us to update our information. Provide functions that update that information for one LPI or all LPIs mapped to a specific collection. Signed-off-by: Andre Przywara Reviewed-by: Marc Zyngier Tested-by: Eric Auger Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-its.c | 39 ++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 2881b84cbf75..6f43b3b1172b 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -68,6 +68,45 @@ struct its_itte { */ #define CBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 12)) #define PENDBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 16)) +#define PROPBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 12)) + +#define GIC_LPI_OFFSET 8192 + +#define LPI_PROP_ENABLE_BIT(p) ((p) & LPI_PROP_ENABLED) +#define LPI_PROP_PRIORITY(p) ((p) & 0xfc) + +/* + * Reads the configuration data for a given LPI from guest memory and + * updates the fields in struct vgic_irq. + * If filter_vcpu is not NULL, applies only if the IRQ is targeting this + * VCPU. Unconditionally applies if filter_vcpu is NULL. + */ +static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, + struct kvm_vcpu *filter_vcpu) +{ + u64 propbase = PROPBASER_ADDRESS(kvm->arch.vgic.propbaser); + u8 prop; + int ret; + + ret = kvm_read_guest(kvm, propbase + irq->intid - GIC_LPI_OFFSET, + &prop, 1); + + if (ret) + return ret; + + spin_lock(&irq->irq_lock); + + if (!filter_vcpu || filter_vcpu == irq->target_vcpu) { + irq->priority = LPI_PROP_PRIORITY(prop); + irq->enabled = LPI_PROP_ENABLE_BIT(prop); + + vgic_queue_irq_unlock(kvm, irq); + } else { + spin_unlock(&irq->irq_lock); + } + + return 0; +} /* * Create a snapshot of the current LPI list, so that we can enumerate all From df9f58fbea9bc656b5a7770c885c97b26255b234 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 15 Jul 2016 12:43:36 +0100 Subject: [PATCH 270/302] KVM: arm64: vgic-its: Implement ITS command queue command handlers The connection between a device, an event ID, the LPI number and the associated CPU is stored in in-memory tables in a GICv3, but their format is not specified by the spec. Instead software uses a command queue in a ring buffer to let an ITS implementation use its own format. Implement handlers for the various ITS commands and let them store the requested relation into our own data structures. Those data structures are protected by the its_lock mutex. Our internal ring buffer read and write pointers are protected by the its_cmd mutex, so that only one VCPU per ITS can handle commands at any given time. Error handling is very basic at the moment, as we don't have a good way of communicating errors to the guest (usually an SError). The INT command handler is missing from this patch, as we gain the capability of actually injecting MSIs into the guest only later on. Signed-off-by: Andre Przywara Reviewed-by: Marc Zyngier Tested-by: Eric Auger Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-its.c | 661 ++++++++++++++++++++++++++++++++++- 1 file changed, 660 insertions(+), 1 deletion(-) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 6f43b3b1172b..1408c88d063e 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -33,6 +33,67 @@ #include "vgic.h" #include "vgic-mmio.h" +/* + * Creates a new (reference to a) struct vgic_irq for a given LPI. + * If this LPI is already mapped on another ITS, we increase its refcount + * and return a pointer to the existing structure. + * If this is a "new" LPI, we allocate and initialize a new struct vgic_irq. + * This function returns a pointer to the _unlocked_ structure. + */ +static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intid), *oldirq; + + /* In this case there is no put, since we keep the reference. */ + if (irq) + return irq; + + irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL); + if (!irq) + return NULL; + + INIT_LIST_HEAD(&irq->lpi_list); + INIT_LIST_HEAD(&irq->ap_list); + spin_lock_init(&irq->irq_lock); + + irq->config = VGIC_CONFIG_EDGE; + kref_init(&irq->refcount); + irq->intid = intid; + + spin_lock(&dist->lpi_list_lock); + + /* + * There could be a race with another vgic_add_lpi(), so we need to + * check that we don't add a second list entry with the same LPI. + */ + list_for_each_entry(oldirq, &dist->lpi_list_head, lpi_list) { + if (oldirq->intid != intid) + continue; + + /* Someone was faster with adding this LPI, lets use that. */ + kfree(irq); + irq = oldirq; + + /* + * This increases the refcount, the caller is expected to + * call vgic_put_irq() on the returned pointer once it's + * finished with the IRQ. + */ + kref_get(&irq->refcount); + + goto out_unlock; + } + + list_add_tail(&irq->lpi_list, &dist->lpi_list_head); + dist->lpi_list_count++; + +out_unlock: + spin_unlock(&dist->lpi_list_lock); + + return irq; +} + struct its_device { struct list_head dev_list; @@ -62,16 +123,75 @@ struct its_itte { u32 event_id; }; +/* + * Find and returns a device in the device table for an ITS. + * Must be called with the its_lock mutex held. + */ +static struct its_device *find_its_device(struct vgic_its *its, u32 device_id) +{ + struct its_device *device; + + list_for_each_entry(device, &its->device_list, dev_list) + if (device_id == device->device_id) + return device; + + return NULL; +} + +/* + * Find and returns an interrupt translation table entry (ITTE) for a given + * Device ID/Event ID pair on an ITS. + * Must be called with the its_lock mutex held. + */ +static struct its_itte *find_itte(struct vgic_its *its, u32 device_id, + u32 event_id) +{ + struct its_device *device; + struct its_itte *itte; + + device = find_its_device(its, device_id); + if (device == NULL) + return NULL; + + list_for_each_entry(itte, &device->itt_head, itte_list) + if (itte->event_id == event_id) + return itte; + + return NULL; +} + +/* To be used as an iterator this macro misses the enclosing parentheses */ +#define for_each_lpi_its(dev, itte, its) \ + list_for_each_entry(dev, &(its)->device_list, dev_list) \ + list_for_each_entry(itte, &(dev)->itt_head, itte_list) + /* * We only implement 48 bits of PA at the moment, although the ITS * supports more. Let's be restrictive here. */ +#define BASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 16)) #define CBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 12)) #define PENDBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 16)) #define PROPBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 12)) #define GIC_LPI_OFFSET 8192 +/* + * Finds and returns a collection in the ITS collection table. + * Must be called with the its_lock mutex held. + */ +static struct its_collection *find_collection(struct vgic_its *its, int coll_id) +{ + struct its_collection *collection; + + list_for_each_entry(collection, &its->collection_list, coll_list) { + if (coll_id == collection->collection_id) + return collection; + } + + return NULL; +} + #define LPI_PROP_ENABLE_BIT(p) ((p) & LPI_PROP_ENABLED) #define LPI_PROP_PRIORITY(p) ((p) & 0xfc) @@ -144,6 +264,51 @@ static int vgic_copy_lpi_list(struct kvm *kvm, u32 **intid_ptr) return irq_count; } +/* + * Promotes the ITS view of affinity of an ITTE (which redistributor this LPI + * is targeting) to the VGIC's view, which deals with target VCPUs. + * Needs to be called whenever either the collection for a LPIs has + * changed or the collection itself got retargeted. + */ +static void update_affinity_itte(struct kvm *kvm, struct its_itte *itte) +{ + struct kvm_vcpu *vcpu; + + if (!its_is_collection_mapped(itte->collection)) + return; + + vcpu = kvm_get_vcpu(kvm, itte->collection->target_addr); + + spin_lock(&itte->irq->irq_lock); + itte->irq->target_vcpu = vcpu; + spin_unlock(&itte->irq->irq_lock); +} + +/* + * Updates the target VCPU for every LPI targeting this collection. + * Must be called with the its_lock mutex held. + */ +static void update_affinity_collection(struct kvm *kvm, struct vgic_its *its, + struct its_collection *coll) +{ + struct its_device *device; + struct its_itte *itte; + + for_each_lpi_its(device, itte, its) { + if (!itte->collection || coll != itte->collection) + continue; + + update_affinity_itte(kvm, itte); + } +} + +static u32 max_lpis_propbaser(u64 propbaser) +{ + int nr_idbits = (propbaser & 0x1f) + 1; + + return 1U << min(nr_idbits, INTERRUPT_ID_BITS_ITS); +} + /* * Scan the whole LPI pending table and sync the pending bit in there * with our own data structures. This relies on the LPI being @@ -283,10 +448,504 @@ static void its_free_itte(struct kvm *kvm, struct its_itte *itte) kfree(itte); } +static u64 its_cmd_mask_field(u64 *its_cmd, int word, int shift, int size) +{ + return (le64_to_cpu(its_cmd[word]) >> shift) & (BIT_ULL(size) - 1); +} + +#define its_cmd_get_command(cmd) its_cmd_mask_field(cmd, 0, 0, 8) +#define its_cmd_get_deviceid(cmd) its_cmd_mask_field(cmd, 0, 32, 32) +#define its_cmd_get_id(cmd) its_cmd_mask_field(cmd, 1, 0, 32) +#define its_cmd_get_physical_id(cmd) its_cmd_mask_field(cmd, 1, 32, 32) +#define its_cmd_get_collection(cmd) its_cmd_mask_field(cmd, 2, 0, 16) +#define its_cmd_get_target_addr(cmd) its_cmd_mask_field(cmd, 2, 16, 32) +#define its_cmd_get_validbit(cmd) its_cmd_mask_field(cmd, 2, 63, 1) + +/* + * The DISCARD command frees an Interrupt Translation Table Entry (ITTE). + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + u32 event_id = its_cmd_get_id(its_cmd); + struct its_itte *itte; + + + itte = find_itte(its, device_id, event_id); + if (itte && itte->collection) { + /* + * Though the spec talks about removing the pending state, we + * don't bother here since we clear the ITTE anyway and the + * pending state is a property of the ITTE struct. + */ + its_free_itte(kvm, itte); + return 0; + } + + return E_ITS_DISCARD_UNMAPPED_INTERRUPT; +} + +/* + * The MOVI command moves an ITTE to a different collection. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + u32 event_id = its_cmd_get_id(its_cmd); + u32 coll_id = its_cmd_get_collection(its_cmd); + struct kvm_vcpu *vcpu; + struct its_itte *itte; + struct its_collection *collection; + + itte = find_itte(its, device_id, event_id); + if (!itte) + return E_ITS_MOVI_UNMAPPED_INTERRUPT; + + if (!its_is_collection_mapped(itte->collection)) + return E_ITS_MOVI_UNMAPPED_COLLECTION; + + collection = find_collection(its, coll_id); + if (!its_is_collection_mapped(collection)) + return E_ITS_MOVI_UNMAPPED_COLLECTION; + + itte->collection = collection; + vcpu = kvm_get_vcpu(kvm, collection->target_addr); + + spin_lock(&itte->irq->irq_lock); + itte->irq->target_vcpu = vcpu; + spin_unlock(&itte->irq->irq_lock); + + return 0; +} + +static void vgic_its_init_collection(struct vgic_its *its, + struct its_collection *collection, + u32 coll_id) +{ + collection->collection_id = coll_id; + collection->target_addr = COLLECTION_NOT_MAPPED; + + list_add_tail(&collection->coll_list, &its->collection_list); +} + +/* + * The MAPTI and MAPI commands map LPIs to ITTEs. + * Must be called with its_lock mutex held. + */ +static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd, u8 subcmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + u32 event_id = its_cmd_get_id(its_cmd); + u32 coll_id = its_cmd_get_collection(its_cmd); + struct its_itte *itte; + struct its_device *device; + struct its_collection *collection, *new_coll = NULL; + int lpi_nr; + + device = find_its_device(its, device_id); + if (!device) + return E_ITS_MAPTI_UNMAPPED_DEVICE; + + collection = find_collection(its, coll_id); + if (!collection) { + new_coll = kzalloc(sizeof(struct its_collection), GFP_KERNEL); + if (!new_coll) + return -ENOMEM; + } + + if (subcmd == GITS_CMD_MAPTI) + lpi_nr = its_cmd_get_physical_id(its_cmd); + else + lpi_nr = event_id; + if (lpi_nr < GIC_LPI_OFFSET || + lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser)) { + kfree(new_coll); + return E_ITS_MAPTI_PHYSICALID_OOR; + } + + itte = find_itte(its, device_id, event_id); + if (!itte) { + itte = kzalloc(sizeof(struct its_itte), GFP_KERNEL); + if (!itte) { + kfree(new_coll); + return -ENOMEM; + } + + itte->event_id = event_id; + list_add_tail(&itte->itte_list, &device->itt_head); + } + + if (!collection) { + collection = new_coll; + vgic_its_init_collection(its, collection, coll_id); + } + + itte->collection = collection; + itte->lpi = lpi_nr; + itte->irq = vgic_add_lpi(kvm, lpi_nr); + update_affinity_itte(kvm, itte); + + /* + * We "cache" the configuration table entries in out struct vgic_irq's. + * However we only have those structs for mapped IRQs, so we read in + * the respective config data from memory here upon mapping the LPI. + */ + update_lpi_config(kvm, itte->irq, NULL); + + return 0; +} + +/* Requires the its_lock to be held. */ +static void vgic_its_unmap_device(struct kvm *kvm, struct its_device *device) +{ + struct its_itte *itte, *temp; + + /* + * The spec says that unmapping a device with still valid + * ITTEs associated is UNPREDICTABLE. We remove all ITTEs, + * since we cannot leave the memory unreferenced. + */ + list_for_each_entry_safe(itte, temp, &device->itt_head, itte_list) + its_free_itte(kvm, itte); + + list_del(&device->dev_list); + kfree(device); +} + +/* + * Check whether a device ID can be stored into the guest device tables. + * For a direct table this is pretty easy, but gets a bit nasty for + * indirect tables. We check whether the resulting guest physical address + * is actually valid (covered by a memslot and guest accessbible). + * For this we have to read the respective first level entry. + */ +static bool vgic_its_check_device_id(struct kvm *kvm, struct vgic_its *its, + int device_id) +{ + u64 r = its->baser_device_table; + int nr_entries = GITS_BASER_NR_PAGES(r) * SZ_64K; + int index; + u64 indirect_ptr; + gfn_t gfn; + + + if (!(r & GITS_BASER_INDIRECT)) + return device_id < (nr_entries / GITS_BASER_ENTRY_SIZE(r)); + + /* calculate and check the index into the 1st level */ + index = device_id / (SZ_64K / GITS_BASER_ENTRY_SIZE(r)); + if (index >= (nr_entries / sizeof(u64))) + return false; + + /* Each 1st level entry is represented by a 64-bit value. */ + if (!kvm_read_guest(kvm, + BASER_ADDRESS(r) + index * sizeof(indirect_ptr), + &indirect_ptr, sizeof(indirect_ptr))) + return false; + + /* check the valid bit of the first level entry */ + if (!(indirect_ptr & BIT_ULL(63))) + return false; + + /* + * Mask the guest physical address and calculate the frame number. + * Any address beyond our supported 48 bits of PA will be caught + * by the actual check in the final step. + */ + gfn = (indirect_ptr & GENMASK_ULL(51, 16)) >> PAGE_SHIFT; + + return kvm_is_visible_gfn(kvm, gfn); +} + +/* + * MAPD maps or unmaps a device ID to Interrupt Translation Tables (ITTs). + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + bool valid = its_cmd_get_validbit(its_cmd); + struct its_device *device; + + if (!vgic_its_check_device_id(kvm, its, device_id)) + return E_ITS_MAPD_DEVICE_OOR; + + device = find_its_device(its, device_id); + + /* + * The spec says that calling MAPD on an already mapped device + * invalidates all cached data for this device. We implement this + * by removing the mapping and re-establishing it. + */ + if (device) + vgic_its_unmap_device(kvm, device); + + /* + * The spec does not say whether unmapping a not-mapped device + * is an error, so we are done in any case. + */ + if (!valid) + return 0; + + device = kzalloc(sizeof(struct its_device), GFP_KERNEL); + if (!device) + return -ENOMEM; + + device->device_id = device_id; + INIT_LIST_HEAD(&device->itt_head); + + list_add_tail(&device->dev_list, &its->device_list); + + return 0; +} + +static int vgic_its_nr_collection_ids(struct vgic_its *its) +{ + u64 r = its->baser_coll_table; + + return (GITS_BASER_NR_PAGES(r) * SZ_64K) / GITS_BASER_ENTRY_SIZE(r); +} + +/* + * The MAPC command maps collection IDs to redistributors. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u16 coll_id; + u32 target_addr; + struct its_collection *collection; + bool valid; + + valid = its_cmd_get_validbit(its_cmd); + coll_id = its_cmd_get_collection(its_cmd); + target_addr = its_cmd_get_target_addr(its_cmd); + + if (target_addr >= atomic_read(&kvm->online_vcpus)) + return E_ITS_MAPC_PROCNUM_OOR; + + if (coll_id >= vgic_its_nr_collection_ids(its)) + return E_ITS_MAPC_COLLECTION_OOR; + + collection = find_collection(its, coll_id); + + if (!valid) { + struct its_device *device; + struct its_itte *itte; + /* + * Clearing the mapping for that collection ID removes the + * entry from the list. If there wasn't any before, we can + * go home early. + */ + if (!collection) + return 0; + + for_each_lpi_its(device, itte, its) + if (itte->collection && + itte->collection->collection_id == coll_id) + itte->collection = NULL; + + list_del(&collection->coll_list); + kfree(collection); + } else { + if (!collection) { + collection = kzalloc(sizeof(struct its_collection), + GFP_KERNEL); + if (!collection) + return -ENOMEM; + + vgic_its_init_collection(its, collection, coll_id); + collection->target_addr = target_addr; + } else { + collection->target_addr = target_addr; + update_affinity_collection(kvm, its, collection); + } + } + + return 0; +} + +/* + * The CLEAR command removes the pending state for a particular LPI. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + u32 event_id = its_cmd_get_id(its_cmd); + struct its_itte *itte; + + + itte = find_itte(its, device_id, event_id); + if (!itte) + return E_ITS_CLEAR_UNMAPPED_INTERRUPT; + + itte->irq->pending = false; + + return 0; +} + +/* + * The INV command syncs the configuration bits from the memory table. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 device_id = its_cmd_get_deviceid(its_cmd); + u32 event_id = its_cmd_get_id(its_cmd); + struct its_itte *itte; + + + itte = find_itte(its, device_id, event_id); + if (!itte) + return E_ITS_INV_UNMAPPED_INTERRUPT; + + return update_lpi_config(kvm, itte->irq, NULL); +} + +/* + * The INVALL command requests flushing of all IRQ data in this collection. + * Find the VCPU mapped to that collection, then iterate over the VM's list + * of mapped LPIs and update the configuration for each IRQ which targets + * the specified vcpu. The configuration will be read from the in-memory + * configuration table. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 coll_id = its_cmd_get_collection(its_cmd); + struct its_collection *collection; + struct kvm_vcpu *vcpu; + struct vgic_irq *irq; + u32 *intids; + int irq_count, i; + + collection = find_collection(its, coll_id); + if (!its_is_collection_mapped(collection)) + return E_ITS_INVALL_UNMAPPED_COLLECTION; + + vcpu = kvm_get_vcpu(kvm, collection->target_addr); + + irq_count = vgic_copy_lpi_list(kvm, &intids); + if (irq_count < 0) + return irq_count; + + for (i = 0; i < irq_count; i++) { + irq = vgic_get_irq(kvm, NULL, intids[i]); + if (!irq) + continue; + update_lpi_config(kvm, irq, vcpu); + vgic_put_irq(kvm, irq); + } + + kfree(intids); + + return 0; +} + +/* + * The MOVALL command moves the pending state of all IRQs targeting one + * redistributor to another. We don't hold the pending state in the VCPUs, + * but in the IRQs instead, so there is really not much to do for us here. + * However the spec says that no IRQ must target the old redistributor + * afterwards, so we make sure that no LPI is using the associated target_vcpu. + * This command affects all LPIs in the system that target that redistributor. + */ +static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + u32 target1_addr = its_cmd_get_target_addr(its_cmd); + u32 target2_addr = its_cmd_mask_field(its_cmd, 3, 16, 32); + struct kvm_vcpu *vcpu1, *vcpu2; + struct vgic_irq *irq; + + if (target1_addr >= atomic_read(&kvm->online_vcpus) || + target2_addr >= atomic_read(&kvm->online_vcpus)) + return E_ITS_MOVALL_PROCNUM_OOR; + + if (target1_addr == target2_addr) + return 0; + + vcpu1 = kvm_get_vcpu(kvm, target1_addr); + vcpu2 = kvm_get_vcpu(kvm, target2_addr); + + spin_lock(&dist->lpi_list_lock); + + list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { + spin_lock(&irq->irq_lock); + + if (irq->target_vcpu == vcpu1) + irq->target_vcpu = vcpu2; + + spin_unlock(&irq->irq_lock); + } + + spin_unlock(&dist->lpi_list_lock); + + return 0; +} + +/* + * This function is called with the its_cmd lock held, but the ITS data + * structure lock dropped. + */ static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its, u64 *its_cmd) { - return -ENODEV; + u8 cmd = its_cmd_get_command(its_cmd); + int ret = -ENODEV; + + mutex_lock(&its->its_lock); + switch (cmd) { + case GITS_CMD_MAPD: + ret = vgic_its_cmd_handle_mapd(kvm, its, its_cmd); + break; + case GITS_CMD_MAPC: + ret = vgic_its_cmd_handle_mapc(kvm, its, its_cmd); + break; + case GITS_CMD_MAPI: + ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd, cmd); + break; + case GITS_CMD_MAPTI: + ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd, cmd); + break; + case GITS_CMD_MOVI: + ret = vgic_its_cmd_handle_movi(kvm, its, its_cmd); + break; + case GITS_CMD_DISCARD: + ret = vgic_its_cmd_handle_discard(kvm, its, its_cmd); + break; + case GITS_CMD_CLEAR: + ret = vgic_its_cmd_handle_clear(kvm, its, its_cmd); + break; + case GITS_CMD_MOVALL: + ret = vgic_its_cmd_handle_movall(kvm, its, its_cmd); + break; + case GITS_CMD_INV: + ret = vgic_its_cmd_handle_inv(kvm, its, its_cmd); + break; + case GITS_CMD_INVALL: + ret = vgic_its_cmd_handle_invall(kvm, its, its_cmd); + break; + case GITS_CMD_SYNC: + /* we ignore this command: we are in sync all of the time */ + ret = 0; + break; + } + mutex_unlock(&its->its_lock); + + return ret; } static u64 vgic_sanitise_its_baser(u64 reg) From 2891a7dfb6c4a273996f0047660a75e88e3b8690 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 15 Jul 2016 12:43:37 +0100 Subject: [PATCH 271/302] KVM: arm64: vgic-its: Implement MSI injection in ITS emulation When userland wants to inject an MSI into the guest, it uses the KVM_SIGNAL_MSI ioctl, which carries the doorbell address along with the payload and the device ID. With the help of the KVM IO bus framework we learn the corresponding ITS from the doorbell address. We then use our wrapper functions to iterate the linked lists and find the proper Interrupt Translation Table Entry (ITTE) and thus the corresponding struct vgic_irq to finally set the pending bit. We also provide the handler for the ITS "INT" command, which allows a guest to trigger an MSI via the ITS command queue. Since this one knows about the right ITS already, we directly call the MMIO handler function without using the kvm_io_bus framework. Signed-off-by: Andre Przywara Reviewed-by: Marc Zyngier Tested-by: Eric Auger Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-its.c | 77 ++++++++++++++++++++++++++++++++++++ virt/kvm/arm/vgic/vgic.h | 6 +++ 2 files changed, 83 insertions(+) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 1408c88d063e..d8e8f14135b4 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -437,6 +437,65 @@ static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm, return 0; } +/* + * Find the target VCPU and the LPI number for a given devid/eventid pair + * and make this IRQ pending, possibly injecting it. + * Must be called with the its_lock mutex held. + */ +static void vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its, + u32 devid, u32 eventid) +{ + struct its_itte *itte; + + if (!its->enabled) + return; + + itte = find_itte(its, devid, eventid); + /* Triggering an unmapped IRQ gets silently dropped. */ + if (itte && its_is_collection_mapped(itte->collection)) { + struct kvm_vcpu *vcpu; + + vcpu = kvm_get_vcpu(kvm, itte->collection->target_addr); + if (vcpu && vcpu->arch.vgic_cpu.lpis_enabled) { + spin_lock(&itte->irq->irq_lock); + itte->irq->pending = true; + vgic_queue_irq_unlock(kvm, itte->irq); + } + } +} + +/* + * Queries the KVM IO bus framework to get the ITS pointer from the given + * doorbell address. + * We then call vgic_its_trigger_msi() with the decoded data. + */ +int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi) +{ + u64 address; + struct kvm_io_device *kvm_io_dev; + struct vgic_io_device *iodev; + + if (!vgic_has_its(kvm)) + return -ENODEV; + + if (!(msi->flags & KVM_MSI_VALID_DEVID)) + return -EINVAL; + + address = (u64)msi->address_hi << 32 | msi->address_lo; + + kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, address); + if (!kvm_io_dev) + return -ENODEV; + + iodev = container_of(kvm_io_dev, struct vgic_io_device, dev); + + mutex_lock(&iodev->its->its_lock); + vgic_its_trigger_msi(kvm, iodev->its, msi->devid, msi->data); + mutex_unlock(&iodev->its->its_lock); + + return 0; +} + /* Requires the its_lock to be held. */ static void its_free_itte(struct kvm *kvm, struct its_itte *itte) { @@ -896,6 +955,21 @@ static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its, return 0; } +/* + * The INT command injects the LPI associated with that DevID/EvID pair. + * Must be called with the its_lock mutex held. + */ +static int vgic_its_cmd_handle_int(struct kvm *kvm, struct vgic_its *its, + u64 *its_cmd) +{ + u32 msi_data = its_cmd_get_id(its_cmd); + u64 msi_devid = its_cmd_get_deviceid(its_cmd); + + vgic_its_trigger_msi(kvm, its, msi_devid, msi_data); + + return 0; +} + /* * This function is called with the its_cmd lock held, but the ITS data * structure lock dropped. @@ -932,6 +1006,9 @@ static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its, case GITS_CMD_MOVALL: ret = vgic_its_cmd_handle_movall(kvm, its, its_cmd); break; + case GITS_CMD_INT: + ret = vgic_its_cmd_handle_int(kvm, its, its_cmd); + break; case GITS_CMD_INV: ret = vgic_its_cmd_handle_inv(kvm, its, its_cmd); break; diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index ee348deb8737..9d557f25cbfc 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -78,6 +78,7 @@ int vgic_v3_map_resources(struct kvm *kvm); int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address); bool vgic_has_its(struct kvm *kvm); void vgic_enable_lpis(struct kvm_vcpu *vcpu); +int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi); #else static inline void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu) { @@ -138,6 +139,11 @@ static inline bool vgic_has_its(struct kvm *kvm) static inline void vgic_enable_lpis(struct kvm_vcpu *vcpu) { } + +static inline int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi) +{ + return -ENODEV; +} #endif int kvm_register_vgic_device(unsigned long type); From 0e4e82f154e387969ea7ecd2c8876689fb68f710 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 15 Jul 2016 12:43:38 +0100 Subject: [PATCH 272/302] KVM: arm64: vgic-its: Enable ITS emulation as a virtual MSI controller Now that all ITS emulation functionality is in place, we advertise MSI functionality to userland and also the ITS device to the guest - if userland has configured that. Signed-off-by: Andre Przywara Reviewed-by: Marc Zyngier Tested-by: Eric Auger Signed-off-by: Marc Zyngier --- Documentation/virtual/kvm/api.txt | 2 +- arch/arm64/kvm/Kconfig | 1 + arch/arm64/kvm/Makefile | 1 + arch/arm64/kvm/reset.c | 6 ++++++ include/kvm/arm_vgic.h | 5 +++++ virt/kvm/arm/vgic/vgic-init.c | 3 +++ virt/kvm/arm/vgic/vgic-kvm-device.c | 3 +++ virt/kvm/arm/vgic/vgic-mmio-v3.c | 14 ++++++++++---- virt/kvm/arm/vgic/vgic.c | 8 ++++++++ virt/kvm/arm/vgic/vgic.h | 6 ++++++ 10 files changed, 44 insertions(+), 5 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 65513119fee8..07049eadb124 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2162,7 +2162,7 @@ after pausing the vcpu, but before it is resumed. 4.71 KVM_SIGNAL_MSI Capability: KVM_CAP_SIGNAL_MSI -Architectures: x86 +Architectures: x86 arm64 Type: vm ioctl Parameters: struct kvm_msi (in) Returns: >0 on delivery, 0 if guest blocked the MSI, and -1 on error diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index aa2e34e99582..9d2eff0b3ad3 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -36,6 +36,7 @@ config KVM select HAVE_KVM_IRQFD select KVM_ARM_VGIC_V3 select KVM_ARM_PMU if HW_PERF_EVENTS + select HAVE_KVM_MSI ---help--- Support hosting virtualized guest machines. We don't support KVM with 16K page tables yet, due to the multiple diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index f00b2cdd0d33..a5b96642a9cb 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -29,5 +29,6 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v2.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v3.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-kvm-device.o +kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-its.o kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o kvm-$(CONFIG_KVM_ARM_PMU) += $(KVM)/arm/pmu.o diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index e95d4f68bf54..5bc460884639 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -86,6 +86,12 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_VCPU_ATTRIBUTES: r = 1; break; + case KVM_CAP_MSI_DEVID: + if (!kvm) + r = -EINVAL; + else + r = kvm->arch.vgic.msis_require_devid; + break; default: r = 0; } diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index a6ca326055cf..4e63a07b9001 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -163,6 +163,9 @@ struct vgic_dist { /* vGIC model the kernel emulates for the guest (GICv2 or GICv3) */ u32 vgic_model; + /* Do injected MSIs require an additional device ID? */ + bool msis_require_devid; + int nr_spis; /* TODO: Consider moving to global state */ @@ -308,4 +311,6 @@ static inline int kvm_vgic_get_max_vcpus(void) return kvm_vgic_global_state.max_gic_vcpus; } +int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi); + #endif /* __KVM_ARM_VGIC_H */ diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c index 535e713704f0..01a60dcd05d6 100644 --- a/virt/kvm/arm/vgic/vgic-init.c +++ b/virt/kvm/arm/vgic/vgic-init.c @@ -258,6 +258,9 @@ int vgic_init(struct kvm *kvm) if (ret) goto out; + if (vgic_has_its(kvm)) + dist->msis_require_devid = true; + kvm_for_each_vcpu(i, vcpu, kvm) kvm_vgic_vcpu_init(vcpu); diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c index 561d2ba96a4f..1813f93b5cde 100644 --- a/virt/kvm/arm/vgic/vgic-kvm-device.c +++ b/virt/kvm/arm/vgic/vgic-kvm-device.c @@ -223,6 +223,9 @@ int kvm_register_vgic_device(unsigned long type) case KVM_DEV_TYPE_ARM_VGIC_V3: ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops, KVM_DEV_TYPE_ARM_VGIC_V3); + if (ret) + break; + ret = kvm_vgic_register_its_device(); break; #endif } diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index 84a301d789e0..ff668e0dd586 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -66,7 +66,12 @@ static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu, case GICD_TYPER: value = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; value = (value >> 5) - 1; - value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19; + if (vgic_has_its(vcpu->kvm)) { + value |= (INTERRUPT_ID_BITS_ITS - 1) << 19; + value |= GICD_TYPER_LPIS; + } else { + value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19; + } break; case GICD_IIDR: value = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); @@ -163,9 +168,8 @@ static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu, vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS; - if (!was_enabled && vgic_cpu->lpis_enabled) { - /* Eventually do something */ - } + if (!was_enabled && vgic_cpu->lpis_enabled) + vgic_enable_lpis(vcpu); } static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu, @@ -179,6 +183,8 @@ static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu, value |= ((target_vcpu_id & 0xffff) << 8); if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1) value |= GICR_TYPER_LAST; + if (vgic_has_its(vcpu->kvm)) + value |= GICR_TYPER_PLPIS; return extract_bytes(value, addr & 7, len); } diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 53299fc93c15..424cb9ceebd9 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -718,3 +718,11 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq) return map_is_active; } + +int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi) +{ + if (vgic_has_its(kvm)) + return vgic_its_inject_msi(kvm, msi); + else + return -ENODEV; +} diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 9d557f25cbfc..9d40d7bb89f7 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -77,6 +77,7 @@ int vgic_v3_probe(const struct gic_kvm_info *info); int vgic_v3_map_resources(struct kvm *kvm); int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address); bool vgic_has_its(struct kvm *kvm); +int kvm_vgic_register_its_device(void); void vgic_enable_lpis(struct kvm_vcpu *vcpu); int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi); #else @@ -136,6 +137,11 @@ static inline bool vgic_has_its(struct kvm *kvm) return false; } +static inline int kvm_vgic_register_its_device(void) +{ + return -ENODEV; +} + static inline void vgic_enable_lpis(struct kvm_vcpu *vcpu) { } From 9d5fcb9dd74b5e0070ef2f66f7f4ae14a23b0206 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Mon, 18 Jul 2016 10:57:36 +0000 Subject: [PATCH 273/302] KVM: arm/arm64: Fix vGICv2 KVM_DEV_ARM_VGIC_GRP_CPU/DIST_REGS For VGICv2 save and restore the CPU interface registers are accessed. Restore the modality which has been altered. Also explicitly set the iodev_type for both the DIST and CPU interface. Signed-off-by: Eric Auger Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-mmio-v2.c | 2 ++ virt/kvm/arm/vgic/vgic-mmio.c | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c index 4152348f5e4f..b44b359cbbad 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v2.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c @@ -437,6 +437,7 @@ int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write, struct vgic_io_device dev = { .regions = vgic_v2_cpu_registers, .nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers), + .iodev_type = IODEV_CPUIF, }; return vgic_uaccess(vcpu, &dev, is_write, offset, val); @@ -448,6 +449,7 @@ int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, struct vgic_io_device dev = { .regions = vgic_v2_dist_registers, .nr_regions = ARRAY_SIZE(vgic_v2_dist_registers), + .iodev_type = IODEV_DIST, }; return vgic_uaccess(vcpu, &dev, is_write, offset, val); diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index 26be827bbfcc..3bad3c5ed431 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -484,7 +484,8 @@ static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, switch (iodev->iodev_type) { case IODEV_CPUIF: - return 1; + data = region->read(vcpu, addr, len); + break; case IODEV_DIST: data = region->read(vcpu, addr, len); break; @@ -517,6 +518,7 @@ static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, switch (iodev->iodev_type) { case IODEV_CPUIF: + region->write(vcpu, addr, len, data); break; case IODEV_DIST: region->write(vcpu, addr, len, data); From 8c828a535e29f50282f1a49a52c3b20ccaa039aa Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 18 Jul 2016 15:28:52 +0100 Subject: [PATCH 274/302] irqchip/gicv3-its: Restore all cacheability attributes Let's restore some of the #defines that have been savagely dropped by the introduction of the KVM ITS code, as pointlessly break other users (including series that are already in -next). Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v3.h | 48 +++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 9442be7f2461..700b4216c87a 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -146,8 +146,16 @@ #define GICR_PROPBASER_InnerShareable \ GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable) -#define GICR_PROPBASER_nC GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nC) -#define GICR_PROPBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWb) + +#define GICR_PROPBASER_nCnB GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nCnB) +#define GICR_PROPBASER_nC GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nC) +#define GICR_PROPBASER_RaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWt) +#define GICR_PROPBASER_RaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWt) +#define GICR_PROPBASER_WaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWt) +#define GICR_PROPBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWb) +#define GICR_PROPBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWt) +#define GICR_PROPBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWb) + #define GICR_PROPBASER_IDBITS_MASK (0x1f) #define GICR_PENDBASER_SHAREABILITY_SHIFT (10) @@ -163,8 +171,16 @@ #define GICR_PENDBASER_InnerShareable \ GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable) -#define GICR_PENDBASER_nC GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nC) -#define GICR_PENDBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWb) + +#define GICR_PENDBASER_nCnB GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nCnB) +#define GICR_PENDBASER_nC GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nC) +#define GICR_PENDBASER_RaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWt) +#define GICR_PENDBASER_RaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWt) +#define GICR_PENDBASER_WaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWt) +#define GICR_PENDBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWb) +#define GICR_PENDBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWt) +#define GICR_PENDBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWb) + #define GICR_PENDBASER_PTZ BIT_ULL(62) /* @@ -237,24 +253,40 @@ #define GITS_CBASER_InnerShareable \ GIC_BASER_SHAREABILITY(GITS_CBASER, InnerShareable) -#define GITS_CBASER_nC GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nC) -#define GITS_CBASER_WaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWb) + +#define GITS_CBASER_nCnB GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nCnB) +#define GITS_CBASER_nC GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nC) +#define GITS_CBASER_RaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWt) +#define GITS_CBASER_RaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWt) +#define GITS_CBASER_WaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWt) +#define GITS_CBASER_WaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWb) +#define GITS_CBASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWt) +#define GITS_CBASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWb) #define GITS_BASER_NR_REGS 8 #define GITS_BASER_VALID (1UL << 63) #define GITS_BASER_INDIRECT (1ULL << 62) + #define GITS_BASER_INNER_CACHEABILITY_SHIFT (59) #define GITS_BASER_OUTER_CACHEABILITY_SHIFT (53) #define GITS_BASER_INNER_CACHEABILITY_MASK \ GIC_BASER_CACHEABILITY(GITS_BASER, INNER, MASK) +#define GITS_BASER_CACHEABILITY_MASK GITS_BASER_INNER_CACHEABILITY_MASK #define GITS_BASER_OUTER_CACHEABILITY_MASK \ GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, MASK) #define GITS_BASER_SHAREABILITY_MASK \ GIC_BASER_SHAREABILITY(GITS_BASER, SHAREABILITY_MASK) -#define GITS_BASER_nC GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nC) -#define GITS_BASER_WaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWb) +#define GITS_BASER_nCnB GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nCnB) +#define GITS_BASER_nC GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nC) +#define GITS_BASER_RaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWt) +#define GITS_BASER_RaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWt) +#define GITS_BASER_WaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWt) +#define GITS_BASER_WaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWb) +#define GITS_BASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWt) +#define GITS_BASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWb) + #define GITS_BASER_TYPE_SHIFT (56) #define GITS_BASER_TYPE(r) (((r) >> GITS_BASER_TYPE_SHIFT) & 7) #define GITS_BASER_ENTRY_SIZE_SHIFT (48) From d97594e6bc1b4aaad3ccae3ef678513b63dd5221 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 17 Jul 2016 11:27:23 +0100 Subject: [PATCH 275/302] KVM: arm64: vgic-its: Generalize use of vgic_get_irq_kref Instead of sprinkling raw kref_get() calls everytime we cannot do a normal vgic_get_irq(), use the existing vgic_get_irq_kref(), which does the same thing and is paired with a vgic_put_irq(). vgic_get_irq_kref is moved to vgic.h in order to be easily shared. Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-its.c | 2 +- virt/kvm/arm/vgic/vgic.c | 10 +--------- virt/kvm/arm/vgic/vgic.h | 8 ++++++++ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index d8e8f14135b4..f427fa2f7263 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -80,7 +80,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid) * call vgic_put_irq() on the returned pointer once it's * finished with the IRQ. */ - kref_get(&irq->refcount); + vgic_get_irq_kref(irq); goto out_unlock; } diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 424cb9ceebd9..39f3358c6d91 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -71,7 +71,7 @@ static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid) * This increases the refcount, the caller is expected to * call vgic_put_irq() later once it's finished with the IRQ. */ - kref_get(&irq->refcount); + vgic_get_irq_kref(irq); goto out_unlock; } irq = NULL; @@ -106,14 +106,6 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, return NULL; } -static void vgic_get_irq_kref(struct vgic_irq *irq) -{ - if (irq->intid < VGIC_MIN_LPI) - return; - - kref_get(&irq->refcount); -} - /* * We can't do anything in here, because we lack the kvm pointer to * lock and remove the item from the lpi_list. So we keep this function diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 9d40d7bb89f7..1d8e21d5c13f 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -64,6 +64,14 @@ int vgic_v2_map_resources(struct kvm *kvm); int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, enum vgic_type); +static inline void vgic_get_irq_kref(struct vgic_irq *irq) +{ + if (irq->intid < VGIC_MIN_LPI) + return; + + kref_get(&irq->refcount); +} + #ifdef CONFIG_KVM_ARM_VGIC_V3 void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu); void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); From c0091073dd775d0446a9f88dda8c9a86b64340b2 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 18 Jul 2016 16:16:26 +0100 Subject: [PATCH 276/302] KVM: arm64: vgic-its: Fix handling of indirect tables The current code will fail on valid indirect tables, and happily use the ones that are pointing out of the guest RAM. Funny what a small "!" can do for you... Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-its.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index f427fa2f7263..d6697c4d81ec 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -702,9 +702,9 @@ static bool vgic_its_check_device_id(struct kvm *kvm, struct vgic_its *its, return false; /* Each 1st level entry is represented by a 64-bit value. */ - if (!kvm_read_guest(kvm, - BASER_ADDRESS(r) + index * sizeof(indirect_ptr), - &indirect_ptr, sizeof(indirect_ptr))) + if (kvm_read_guest(kvm, + BASER_ADDRESS(r) + index * sizeof(indirect_ptr), + &indirect_ptr, sizeof(indirect_ptr))) return false; /* check the valid bit of the first level entry */ From 7e3963a51563d844fcd3bdc13e2847561b15e8de Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 17 Jul 2016 11:34:53 +0100 Subject: [PATCH 277/302] KVM: arm64: vgic-its: Fix vgic_its_check_device_id BE handling The ITS tables are stored in LE format. If the host is reading a L1 table entry to check its validity, it must convert it to the CPU endianness. Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-its.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index d6697c4d81ec..2ac5927b1d91 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -707,6 +707,8 @@ static bool vgic_its_check_device_id(struct kvm *kvm, struct vgic_its *its, &indirect_ptr, sizeof(indirect_ptr))) return false; + indirect_ptr = le64_to_cpu(indirect_ptr); + /* check the valid bit of the first level entry */ if (!(indirect_ptr & BIT_ULL(63))) return false; From b90338b7cbb7c8cad8dbd3c4de4e64180ce0d88b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 17 Jul 2016 12:15:19 +0100 Subject: [PATCH 278/302] KVM: arm64: vgic-its: Fix misleading nr_entries in vgic_its_check_device_id The nr_entries variable in vgic_its_check_device_id actually describe the size of the L1 table, and not the number of entries in this table. Rename it to l1_tbl_size, so that we can now change the code with a better understanding of what is what. Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-its.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 2ac5927b1d91..268a0c7ea3a5 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -687,18 +687,18 @@ static bool vgic_its_check_device_id(struct kvm *kvm, struct vgic_its *its, int device_id) { u64 r = its->baser_device_table; - int nr_entries = GITS_BASER_NR_PAGES(r) * SZ_64K; + int l1_tbl_size = GITS_BASER_NR_PAGES(r) * SZ_64K; int index; u64 indirect_ptr; gfn_t gfn; if (!(r & GITS_BASER_INDIRECT)) - return device_id < (nr_entries / GITS_BASER_ENTRY_SIZE(r)); + return device_id < (l1_tbl_size / GITS_BASER_ENTRY_SIZE(r)); /* calculate and check the index into the 1st level */ index = device_id / (SZ_64K / GITS_BASER_ENTRY_SIZE(r)); - if (index >= (nr_entries / sizeof(u64))) + if (index >= (l1_tbl_size / sizeof(u64))) return false; /* Each 1st level entry is represented by a 64-bit value. */ From 333a53ff7fb9d836ff4a2b7f266ac9b2bb85e873 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 17 Jul 2016 13:00:49 +0100 Subject: [PATCH 279/302] KVM: arm64: vgic-its: Validate the device table L1 entry Checking that the device_id fits if the table, and we must make sure that the associated memory is also accessible. Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-its.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 268a0c7ea3a5..4943d6aebdd1 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -693,8 +693,17 @@ static bool vgic_its_check_device_id(struct kvm *kvm, struct vgic_its *its, gfn_t gfn; - if (!(r & GITS_BASER_INDIRECT)) - return device_id < (l1_tbl_size / GITS_BASER_ENTRY_SIZE(r)); + if (!(r & GITS_BASER_INDIRECT)) { + phys_addr_t addr; + + if (device_id >= (l1_tbl_size / GITS_BASER_ENTRY_SIZE(r))) + return false; + + addr = BASER_ADDRESS(r) + device_id * GITS_BASER_ENTRY_SIZE(r); + gfn = addr >> PAGE_SHIFT; + + return kvm_is_visible_gfn(kvm, gfn); + } /* calculate and check the index into the 1st level */ index = device_id / (SZ_64K / GITS_BASER_ENTRY_SIZE(r)); From d6c7f865f00adf98ca79712167fb0f1b9dccb272 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 17 Jul 2016 11:48:47 +0100 Subject: [PATCH 280/302] KVM: arm64: vgic-its: Fix L2 entry validation for indirect tables When checking that the storage address of a device entry is valid, it is critical to compute the actual address of the entry, rather than relying on the beginning of the page to match a CPU page of the same size: for example, if the guest places the table at the last 64kB boundary of RAM, but RAM size isn't a multiple of 64kB... Fix this by computing the actual offset of the device ID in the L2 page, and check the corresponding GFN. Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-its.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 4943d6aebdd1..2faf1f458e8a 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -727,7 +727,12 @@ static bool vgic_its_check_device_id(struct kvm *kvm, struct vgic_its *its, * Any address beyond our supported 48 bits of PA will be caught * by the actual check in the final step. */ - gfn = (indirect_ptr & GENMASK_ULL(51, 16)) >> PAGE_SHIFT; + indirect_ptr &= GENMASK_ULL(51, 16); + + /* Find the address of the actual entry */ + index = device_id % (SZ_64K / GITS_BASER_ENTRY_SIZE(r)); + indirect_ptr += index * GITS_BASER_ENTRY_SIZE(r); + gfn = indirect_ptr >> PAGE_SHIFT; return kvm_is_visible_gfn(kvm, gfn); } From 17a21f58ff3e60fef3df788561b65e576a0b494d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 17 Jul 2016 20:01:46 +0100 Subject: [PATCH 281/302] KVM: arm64: vgic-its: Add collection allocator/destructor Instead of spreading random allocations all over the place, consolidate allocation/init/freeing of collections in a pair of constructor/destructor. Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-its.c | 94 +++++++++++++++++++++--------------- 1 file changed, 55 insertions(+), 39 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 2faf1f458e8a..d6f68e9c946d 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -581,14 +581,45 @@ static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its, return 0; } -static void vgic_its_init_collection(struct vgic_its *its, - struct its_collection *collection, +static int vgic_its_alloc_collection(struct vgic_its *its, + struct its_collection **colp, u32 coll_id) { + struct its_collection *collection; + + collection = kzalloc(sizeof(*collection), GFP_KERNEL); + collection->collection_id = coll_id; collection->target_addr = COLLECTION_NOT_MAPPED; list_add_tail(&collection->coll_list, &its->collection_list); + *colp = collection; + + return 0; +} + +static void vgic_its_free_collection(struct vgic_its *its, u32 coll_id) +{ + struct its_collection *collection; + struct its_device *device; + struct its_itte *itte; + + /* + * Clearing the mapping for that collection ID removes the + * entry from the list. If there wasn't any before, we can + * go home early. + */ + collection = find_collection(its, coll_id); + if (!collection) + return; + + for_each_lpi_its(device, itte, its) + if (itte->collection && + itte->collection->collection_id == coll_id) + itte->collection = NULL; + + list_del(&collection->coll_list); + kfree(collection); } /* @@ -605,6 +636,7 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, struct its_device *device; struct its_collection *collection, *new_coll = NULL; int lpi_nr; + int ret; device = find_its_device(its, device_id); if (!device) @@ -612,9 +644,10 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, collection = find_collection(its, coll_id); if (!collection) { - new_coll = kzalloc(sizeof(struct its_collection), GFP_KERNEL); - if (!new_coll) - return -ENOMEM; + ret = vgic_its_alloc_collection(its, &collection, coll_id); + if (ret) + return ret; + new_coll = collection; } if (subcmd == GITS_CMD_MAPTI) @@ -623,27 +656,22 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, lpi_nr = event_id; if (lpi_nr < GIC_LPI_OFFSET || lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser)) { - kfree(new_coll); - return E_ITS_MAPTI_PHYSICALID_OOR; + ret = E_ITS_MAPTI_PHYSICALID_OOR; + goto err; } itte = find_itte(its, device_id, event_id); if (!itte) { itte = kzalloc(sizeof(struct its_itte), GFP_KERNEL); if (!itte) { - kfree(new_coll); - return -ENOMEM; + ret = -ENOMEM; + goto err; } itte->event_id = event_id; list_add_tail(&itte->itte_list, &device->itt_head); } - if (!collection) { - collection = new_coll; - vgic_its_init_collection(its, collection, coll_id); - } - itte->collection = collection; itte->lpi = lpi_nr; itte->irq = vgic_add_lpi(kvm, lpi_nr); @@ -657,6 +685,10 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, update_lpi_config(kvm, itte->irq, NULL); return 0; +err: + if (new_coll) + vgic_its_free_collection(its, coll_id); + return ret; } /* Requires the its_lock to be held. */ @@ -809,34 +841,18 @@ static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its, if (coll_id >= vgic_its_nr_collection_ids(its)) return E_ITS_MAPC_COLLECTION_OOR; - collection = find_collection(its, coll_id); - if (!valid) { - struct its_device *device; - struct its_itte *itte; - /* - * Clearing the mapping for that collection ID removes the - * entry from the list. If there wasn't any before, we can - * go home early. - */ - if (!collection) - return 0; - - for_each_lpi_its(device, itte, its) - if (itte->collection && - itte->collection->collection_id == coll_id) - itte->collection = NULL; - - list_del(&collection->coll_list); - kfree(collection); + vgic_its_free_collection(its, coll_id); } else { - if (!collection) { - collection = kzalloc(sizeof(struct its_collection), - GFP_KERNEL); - if (!collection) - return -ENOMEM; + collection = find_collection(its, coll_id); - vgic_its_init_collection(its, collection, coll_id); + if (!collection) { + int ret; + + ret = vgic_its_alloc_collection(its, &collection, + coll_id); + if (ret) + return ret; collection->target_addr = target_addr; } else { collection->target_addr = target_addr; From bb7176449f6da27534a0faf3a67997bf2c3172aa Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 17 Jul 2016 21:35:07 +0100 Subject: [PATCH 282/302] KVM: arm64: vgic-its: Add pointer to corresponding kvm_device Going from the ITS structure to the corresponding KVM structure would be quite handy at times. The kvm_device pointer that is passed at create time is quite convenient for this, so let's keep a copy of it in the vgic_its structure. This will be put to a good use in subsequent patches. Signed-off-by: Marc Zyngier --- include/kvm/arm_vgic.h | 1 + virt/kvm/arm/vgic/vgic-its.c | 1 + 2 files changed, 2 insertions(+) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 4e63a07b9001..540da5149ba7 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -138,6 +138,7 @@ struct vgic_its { bool enabled; bool initialized; struct vgic_io_device iodev; + struct kvm_device *dev; /* These registers correspond to GITS_BASER{0,1} */ u64 baser_device_table; diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index d6f68e9c946d..dcae567c522d 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -1368,6 +1368,7 @@ static int vgic_its_create(struct kvm_device *dev, u32 type) dev->kvm->arch.vgic.has_its = true; its->initialized = false; its->enabled = false; + its->dev = dev; its->baser_device_table = INITIAL_BASER_VALUE | ((u64)GITS_BASER_TYPE_DEVICE << GITS_BASER_TYPE_SHIFT); From 6d03a68f8054430cba28e49d9e46c1cd4db39a70 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 17 Jul 2016 21:52:55 +0100 Subject: [PATCH 283/302] KVM: arm64: vgic-its: Turn device_id validation into generic ID validation There is no need to have separate functions to validate devices and collections, as the architecture doesn't really distinguish the two, and they are supposed to be managed the same way. Let's turn the DevID checker into a generic one. Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-its.c | 134 ++++++++++++++++------------------- 1 file changed, 62 insertions(+), 72 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index dcae567c522d..996e3e19b53f 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -581,12 +581,73 @@ static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its, return 0; } +/* + * Check whether an ID can be stored into the corresponding guest table. + * For a direct table this is pretty easy, but gets a bit nasty for + * indirect tables. We check whether the resulting guest physical address + * is actually valid (covered by a memslot and guest accessbible). + * For this we have to read the respective first level entry. + */ +static bool vgic_its_check_id(struct vgic_its *its, u64 baser, int id) +{ + int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; + int index; + u64 indirect_ptr; + gfn_t gfn; + + if (!(baser & GITS_BASER_INDIRECT)) { + phys_addr_t addr; + + if (id >= (l1_tbl_size / GITS_BASER_ENTRY_SIZE(baser))) + return false; + + addr = BASER_ADDRESS(baser) + id * GITS_BASER_ENTRY_SIZE(baser); + gfn = addr >> PAGE_SHIFT; + + return kvm_is_visible_gfn(its->dev->kvm, gfn); + } + + /* calculate and check the index into the 1st level */ + index = id / (SZ_64K / GITS_BASER_ENTRY_SIZE(baser)); + if (index >= (l1_tbl_size / sizeof(u64))) + return false; + + /* Each 1st level entry is represented by a 64-bit value. */ + if (kvm_read_guest(its->dev->kvm, + BASER_ADDRESS(baser) + index * sizeof(indirect_ptr), + &indirect_ptr, sizeof(indirect_ptr))) + return false; + + indirect_ptr = le64_to_cpu(indirect_ptr); + + /* check the valid bit of the first level entry */ + if (!(indirect_ptr & BIT_ULL(63))) + return false; + + /* + * Mask the guest physical address and calculate the frame number. + * Any address beyond our supported 48 bits of PA will be caught + * by the actual check in the final step. + */ + indirect_ptr &= GENMASK_ULL(51, 16); + + /* Find the address of the actual entry */ + index = id % (SZ_64K / GITS_BASER_ENTRY_SIZE(baser)); + indirect_ptr += index * GITS_BASER_ENTRY_SIZE(baser); + gfn = indirect_ptr >> PAGE_SHIFT; + + return kvm_is_visible_gfn(its->dev->kvm, gfn); +} + static int vgic_its_alloc_collection(struct vgic_its *its, struct its_collection **colp, u32 coll_id) { struct its_collection *collection; + if (!vgic_its_check_id(its, its->baser_coll_table, coll_id)) + return E_ITS_MAPC_COLLECTION_OOR; + collection = kzalloc(sizeof(*collection), GFP_KERNEL); collection->collection_id = coll_id; @@ -708,67 +769,6 @@ static void vgic_its_unmap_device(struct kvm *kvm, struct its_device *device) kfree(device); } -/* - * Check whether a device ID can be stored into the guest device tables. - * For a direct table this is pretty easy, but gets a bit nasty for - * indirect tables. We check whether the resulting guest physical address - * is actually valid (covered by a memslot and guest accessbible). - * For this we have to read the respective first level entry. - */ -static bool vgic_its_check_device_id(struct kvm *kvm, struct vgic_its *its, - int device_id) -{ - u64 r = its->baser_device_table; - int l1_tbl_size = GITS_BASER_NR_PAGES(r) * SZ_64K; - int index; - u64 indirect_ptr; - gfn_t gfn; - - - if (!(r & GITS_BASER_INDIRECT)) { - phys_addr_t addr; - - if (device_id >= (l1_tbl_size / GITS_BASER_ENTRY_SIZE(r))) - return false; - - addr = BASER_ADDRESS(r) + device_id * GITS_BASER_ENTRY_SIZE(r); - gfn = addr >> PAGE_SHIFT; - - return kvm_is_visible_gfn(kvm, gfn); - } - - /* calculate and check the index into the 1st level */ - index = device_id / (SZ_64K / GITS_BASER_ENTRY_SIZE(r)); - if (index >= (l1_tbl_size / sizeof(u64))) - return false; - - /* Each 1st level entry is represented by a 64-bit value. */ - if (kvm_read_guest(kvm, - BASER_ADDRESS(r) + index * sizeof(indirect_ptr), - &indirect_ptr, sizeof(indirect_ptr))) - return false; - - indirect_ptr = le64_to_cpu(indirect_ptr); - - /* check the valid bit of the first level entry */ - if (!(indirect_ptr & BIT_ULL(63))) - return false; - - /* - * Mask the guest physical address and calculate the frame number. - * Any address beyond our supported 48 bits of PA will be caught - * by the actual check in the final step. - */ - indirect_ptr &= GENMASK_ULL(51, 16); - - /* Find the address of the actual entry */ - index = device_id % (SZ_64K / GITS_BASER_ENTRY_SIZE(r)); - indirect_ptr += index * GITS_BASER_ENTRY_SIZE(r); - gfn = indirect_ptr >> PAGE_SHIFT; - - return kvm_is_visible_gfn(kvm, gfn); -} - /* * MAPD maps or unmaps a device ID to Interrupt Translation Tables (ITTs). * Must be called with the its_lock mutex held. @@ -780,7 +780,7 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, bool valid = its_cmd_get_validbit(its_cmd); struct its_device *device; - if (!vgic_its_check_device_id(kvm, its, device_id)) + if (!vgic_its_check_id(its, its->baser_device_table, device_id)) return E_ITS_MAPD_DEVICE_OOR; device = find_its_device(its, device_id); @@ -812,13 +812,6 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, return 0; } -static int vgic_its_nr_collection_ids(struct vgic_its *its) -{ - u64 r = its->baser_coll_table; - - return (GITS_BASER_NR_PAGES(r) * SZ_64K) / GITS_BASER_ENTRY_SIZE(r); -} - /* * The MAPC command maps collection IDs to redistributors. * Must be called with the its_lock mutex held. @@ -838,9 +831,6 @@ static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its, if (target_addr >= atomic_read(&kvm->online_vcpus)) return E_ITS_MAPC_PROCNUM_OOR; - if (coll_id >= vgic_its_nr_collection_ids(its)) - return E_ITS_MAPC_COLLECTION_OOR; - if (!valid) { vgic_its_free_collection(its, coll_id); } else { From a3e7aa271eec50a674d33bb6eafa51c5f1e5f51f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 17 Jul 2016 22:38:32 +0100 Subject: [PATCH 284/302] KVM: arm64: vgic-its: Make vgic_its_cmd_handle_mapi similar to other handlers vgic_its_cmd_handle_mapi has an extra "subcmd" argument, which is already contained in the command buffer that all command handlers obtain from the command queue. Let's drop it, as it is not that useful. Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-its.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 996e3e19b53f..ec7e07bc2559 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -688,7 +688,7 @@ static void vgic_its_free_collection(struct vgic_its *its, u32 coll_id) * Must be called with its_lock mutex held. */ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, - u64 *its_cmd, u8 subcmd) + u64 *its_cmd) { u32 device_id = its_cmd_get_deviceid(its_cmd); u32 event_id = its_cmd_get_id(its_cmd); @@ -711,7 +711,7 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, new_coll = collection; } - if (subcmd == GITS_CMD_MAPTI) + if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI) lpi_nr = its_cmd_get_physical_id(its_cmd); else lpi_nr = event_id; @@ -999,11 +999,10 @@ static int vgic_its_cmd_handle_int(struct kvm *kvm, struct vgic_its *its, static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its, u64 *its_cmd) { - u8 cmd = its_cmd_get_command(its_cmd); int ret = -ENODEV; mutex_lock(&its->its_lock); - switch (cmd) { + switch (its_cmd_get_command(its_cmd)) { case GITS_CMD_MAPD: ret = vgic_its_cmd_handle_mapd(kvm, its, its_cmd); break; @@ -1011,10 +1010,10 @@ static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its, ret = vgic_its_cmd_handle_mapc(kvm, its, its_cmd); break; case GITS_CMD_MAPI: - ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd, cmd); + ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd); break; case GITS_CMD_MAPTI: - ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd, cmd); + ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd); break; case GITS_CMD_MOVI: ret = vgic_its_cmd_handle_movi(kvm, its, its_cmd); From 3a88bded203591d4683aacdbb65cd0f549bc58cb Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 18 Jul 2016 16:27:14 +0100 Subject: [PATCH 285/302] KVM: arm64: vgic-its: Simplify MAPI error handling If we care to move all the checks that do not involve any memory allocation, we can simplify the MAPI error handling. Let's do that, it cannot hurt. Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-its.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index ec7e07bc2559..07411cf967b9 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -697,36 +697,34 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, struct its_device *device; struct its_collection *collection, *new_coll = NULL; int lpi_nr; - int ret; device = find_its_device(its, device_id); if (!device) return E_ITS_MAPTI_UNMAPPED_DEVICE; - collection = find_collection(its, coll_id); - if (!collection) { - ret = vgic_its_alloc_collection(its, &collection, coll_id); - if (ret) - return ret; - new_coll = collection; - } - if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI) lpi_nr = its_cmd_get_physical_id(its_cmd); else lpi_nr = event_id; if (lpi_nr < GIC_LPI_OFFSET || - lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser)) { - ret = E_ITS_MAPTI_PHYSICALID_OOR; - goto err; + lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser)) + return E_ITS_MAPTI_PHYSICALID_OOR; + + collection = find_collection(its, coll_id); + if (!collection) { + int ret = vgic_its_alloc_collection(its, &collection, coll_id); + if (ret) + return ret; + new_coll = collection; } itte = find_itte(its, device_id, event_id); if (!itte) { itte = kzalloc(sizeof(struct its_itte), GFP_KERNEL); if (!itte) { - ret = -ENOMEM; - goto err; + if (new_coll) + vgic_its_free_collection(its, coll_id); + return -ENOMEM; } itte->event_id = event_id; @@ -746,10 +744,6 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, update_lpi_config(kvm, itte->irq, NULL); return 0; -err: - if (new_coll) - vgic_its_free_collection(its, coll_id); - return ret; } /* Requires the its_lock to be held. */ From f024ee098476a3e620232e4a78cfac505f121245 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 22 Jun 2016 14:21:59 +1000 Subject: [PATCH 286/302] KVM: PPC: Book3S HV: Pull out TM state save/restore into separate procedures This moves the transactional memory state save and restore sequences out of the guest entry/exit paths into separate procedures. This is so that these sequences can be used in going into and out of nap in a subsequent patch. The only code changes here are (a) saving and restore LR on the stack, since these new procedures get called with a bl instruction, (b) explicitly saving r1 into the PACA instead of assuming that HSTATE_HOST_R1(r13) is already set, and (c) removing an unnecessary and redundant setting of MSR[TM] that should have been removed by commit 9d4d0bdd9e0a ("KVM: PPC: Book3S HV: Add transactional memory support", 2013-09-24) but wasn't. Cc: stable@vger.kernel.org # v3.15+ Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 449 +++++++++++++----------- 1 file changed, 237 insertions(+), 212 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 0d246fca157a..cfa4031f0806 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -689,112 +689,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION - b skip_tm -END_FTR_SECTION_IFCLR(CPU_FTR_TM) - - /* Turn on TM/FP/VSX/VMX so we can restore them. */ - mfmsr r5 - li r6, MSR_TM >> 32 - sldi r6, r6, 32 - or r5, r5, r6 - ori r5, r5, MSR_FP - oris r5, r5, (MSR_VEC | MSR_VSX)@h - mtmsrd r5 - - /* - * The user may change these outside of a transaction, so they must - * always be context switched. - */ - ld r5, VCPU_TFHAR(r4) - ld r6, VCPU_TFIAR(r4) - ld r7, VCPU_TEXASR(r4) - mtspr SPRN_TFHAR, r5 - mtspr SPRN_TFIAR, r6 - mtspr SPRN_TEXASR, r7 - - ld r5, VCPU_MSR(r4) - rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 - beq skip_tm /* TM not active in guest */ - - /* Make sure the failure summary is set, otherwise we'll program check - * when we trechkpt. It's possible that this might have been not set - * on a kvmppc_set_one_reg() call but we shouldn't let this crash the - * host. - */ - oris r7, r7, (TEXASR_FS)@h - mtspr SPRN_TEXASR, r7 - - /* - * We need to load up the checkpointed state for the guest. - * We need to do this early as it will blow away any GPRs, VSRs and - * some SPRs. - */ - - mr r31, r4 - addi r3, r31, VCPU_FPRS_TM - bl load_fp_state - addi r3, r31, VCPU_VRS_TM - bl load_vr_state - mr r4, r31 - lwz r7, VCPU_VRSAVE_TM(r4) - mtspr SPRN_VRSAVE, r7 - - ld r5, VCPU_LR_TM(r4) - lwz r6, VCPU_CR_TM(r4) - ld r7, VCPU_CTR_TM(r4) - ld r8, VCPU_AMR_TM(r4) - ld r9, VCPU_TAR_TM(r4) - mtlr r5 - mtcr r6 - mtctr r7 - mtspr SPRN_AMR, r8 - mtspr SPRN_TAR, r9 - - /* - * Load up PPR and DSCR values but don't put them in the actual SPRs - * till the last moment to avoid running with userspace PPR and DSCR for - * too long. - */ - ld r29, VCPU_DSCR_TM(r4) - ld r30, VCPU_PPR_TM(r4) - - std r2, PACATMSCRATCH(r13) /* Save TOC */ - - /* Clear the MSR RI since r1, r13 are all going to be foobar. */ - li r5, 0 - mtmsrd r5, 1 - - /* Load GPRs r0-r28 */ - reg = 0 - .rept 29 - ld reg, VCPU_GPRS_TM(reg)(r31) - reg = reg + 1 - .endr - - mtspr SPRN_DSCR, r29 - mtspr SPRN_PPR, r30 - - /* Load final GPRs */ - ld 29, VCPU_GPRS_TM(29)(r31) - ld 30, VCPU_GPRS_TM(30)(r31) - ld 31, VCPU_GPRS_TM(31)(r31) - - /* TM checkpointed state is now setup. All GPRs are now volatile. */ - TRECHKPT - - /* Now let's get back the state we need. */ - HMT_MEDIUM - GET_PACA(r13) - ld r29, HSTATE_DSCR(r13) - mtspr SPRN_DSCR, r29 - ld r4, HSTATE_KVM_VCPU(r13) - ld r1, HSTATE_HOST_R1(r13) - ld r2, PACATMSCRATCH(r13) - - /* Set the MSR RI since we have our registers back. */ - li r5, MSR_RI - mtmsrd r5, 1 -skip_tm: + bl kvmppc_restore_tm +END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif /* Load guest PMU registers */ @@ -875,12 +771,6 @@ BEGIN_FTR_SECTION /* Skip next section on POWER7 */ b 8f END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) - /* Turn on TM so we can access TFHAR/TFIAR/TEXASR */ - mfmsr r8 - li r0, 1 - rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG - mtmsrd r8 - /* Load up POWER8-specific registers */ ld r5, VCPU_IAMR(r4) lwz r6, VCPU_PSPB(r4) @@ -1470,106 +1360,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION - b 2f -END_FTR_SECTION_IFCLR(CPU_FTR_TM) - /* Turn on TM. */ - mfmsr r8 - li r0, 1 - rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG - mtmsrd r8 - - ld r5, VCPU_MSR(r9) - rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 - beq 1f /* TM not active in guest. */ - - li r3, TM_CAUSE_KVM_RESCHED - - /* Clear the MSR RI since r1, r13 are all going to be foobar. */ - li r5, 0 - mtmsrd r5, 1 - - /* All GPRs are volatile at this point. */ - TRECLAIM(R3) - - /* Temporarily store r13 and r9 so we have some regs to play with */ - SET_SCRATCH0(r13) - GET_PACA(r13) - std r9, PACATMSCRATCH(r13) - ld r9, HSTATE_KVM_VCPU(r13) - - /* Get a few more GPRs free. */ - std r29, VCPU_GPRS_TM(29)(r9) - std r30, VCPU_GPRS_TM(30)(r9) - std r31, VCPU_GPRS_TM(31)(r9) - - /* Save away PPR and DSCR soon so don't run with user values. */ - mfspr r31, SPRN_PPR - HMT_MEDIUM - mfspr r30, SPRN_DSCR - ld r29, HSTATE_DSCR(r13) - mtspr SPRN_DSCR, r29 - - /* Save all but r9, r13 & r29-r31 */ - reg = 0 - .rept 29 - .if (reg != 9) && (reg != 13) - std reg, VCPU_GPRS_TM(reg)(r9) - .endif - reg = reg + 1 - .endr - /* ... now save r13 */ - GET_SCRATCH0(r4) - std r4, VCPU_GPRS_TM(13)(r9) - /* ... and save r9 */ - ld r4, PACATMSCRATCH(r13) - std r4, VCPU_GPRS_TM(9)(r9) - - /* Reload stack pointer and TOC. */ - ld r1, HSTATE_HOST_R1(r13) - ld r2, PACATOC(r13) - - /* Set MSR RI now we have r1 and r13 back. */ - li r5, MSR_RI - mtmsrd r5, 1 - - /* Save away checkpinted SPRs. */ - std r31, VCPU_PPR_TM(r9) - std r30, VCPU_DSCR_TM(r9) - mflr r5 - mfcr r6 - mfctr r7 - mfspr r8, SPRN_AMR - mfspr r10, SPRN_TAR - std r5, VCPU_LR_TM(r9) - stw r6, VCPU_CR_TM(r9) - std r7, VCPU_CTR_TM(r9) - std r8, VCPU_AMR_TM(r9) - std r10, VCPU_TAR_TM(r9) - - /* Restore r12 as trap number. */ - lwz r12, VCPU_TRAP(r9) - - /* Save FP/VSX. */ - addi r3, r9, VCPU_FPRS_TM - bl store_fp_state - addi r3, r9, VCPU_VRS_TM - bl store_vr_state - mfspr r6, SPRN_VRSAVE - stw r6, VCPU_VRSAVE_TM(r9) -1: - /* - * We need to save these SPRs after the treclaim so that the software - * error code is recorded correctly in the TEXASR. Also the user may - * change these outside of a transaction, so they must always be - * context switched. - */ - mfspr r5, SPRN_TFHAR - mfspr r6, SPRN_TFIAR - mfspr r7, SPRN_TEXASR - std r5, VCPU_TFHAR(r9) - std r6, VCPU_TFIAR(r9) - std r7, VCPU_TEXASR(r9) -2: + bl kvmppc_save_tm +END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif /* Increment yield count if they have a VPA */ @@ -2694,6 +2486,239 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) mr r4,r31 blr +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +/* + * Save transactional state and TM-related registers. + * Called with r9 pointing to the vcpu struct. + * This can modify all checkpointed registers, but + * restores r1, r2 and r9 (vcpu pointer) before exit. + */ +kvmppc_save_tm: + mflr r0 + std r0, PPC_LR_STKOFF(r1) + + /* Turn on TM. */ + mfmsr r8 + li r0, 1 + rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG + mtmsrd r8 + + ld r5, VCPU_MSR(r9) + rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 + beq 1f /* TM not active in guest. */ + + std r1, HSTATE_HOST_R1(r13) + li r3, TM_CAUSE_KVM_RESCHED + + /* Clear the MSR RI since r1, r13 are all going to be foobar. */ + li r5, 0 + mtmsrd r5, 1 + + /* All GPRs are volatile at this point. */ + TRECLAIM(R3) + + /* Temporarily store r13 and r9 so we have some regs to play with */ + SET_SCRATCH0(r13) + GET_PACA(r13) + std r9, PACATMSCRATCH(r13) + ld r9, HSTATE_KVM_VCPU(r13) + + /* Get a few more GPRs free. */ + std r29, VCPU_GPRS_TM(29)(r9) + std r30, VCPU_GPRS_TM(30)(r9) + std r31, VCPU_GPRS_TM(31)(r9) + + /* Save away PPR and DSCR soon so don't run with user values. */ + mfspr r31, SPRN_PPR + HMT_MEDIUM + mfspr r30, SPRN_DSCR + ld r29, HSTATE_DSCR(r13) + mtspr SPRN_DSCR, r29 + + /* Save all but r9, r13 & r29-r31 */ + reg = 0 + .rept 29 + .if (reg != 9) && (reg != 13) + std reg, VCPU_GPRS_TM(reg)(r9) + .endif + reg = reg + 1 + .endr + /* ... now save r13 */ + GET_SCRATCH0(r4) + std r4, VCPU_GPRS_TM(13)(r9) + /* ... and save r9 */ + ld r4, PACATMSCRATCH(r13) + std r4, VCPU_GPRS_TM(9)(r9) + + /* Reload stack pointer and TOC. */ + ld r1, HSTATE_HOST_R1(r13) + ld r2, PACATOC(r13) + + /* Set MSR RI now we have r1 and r13 back. */ + li r5, MSR_RI + mtmsrd r5, 1 + + /* Save away checkpinted SPRs. */ + std r31, VCPU_PPR_TM(r9) + std r30, VCPU_DSCR_TM(r9) + mflr r5 + mfcr r6 + mfctr r7 + mfspr r8, SPRN_AMR + mfspr r10, SPRN_TAR + std r5, VCPU_LR_TM(r9) + stw r6, VCPU_CR_TM(r9) + std r7, VCPU_CTR_TM(r9) + std r8, VCPU_AMR_TM(r9) + std r10, VCPU_TAR_TM(r9) + + /* Restore r12 as trap number. */ + lwz r12, VCPU_TRAP(r9) + + /* Save FP/VSX. */ + addi r3, r9, VCPU_FPRS_TM + bl store_fp_state + addi r3, r9, VCPU_VRS_TM + bl store_vr_state + mfspr r6, SPRN_VRSAVE + stw r6, VCPU_VRSAVE_TM(r9) +1: + /* + * We need to save these SPRs after the treclaim so that the software + * error code is recorded correctly in the TEXASR. Also the user may + * change these outside of a transaction, so they must always be + * context switched. + */ + mfspr r5, SPRN_TFHAR + mfspr r6, SPRN_TFIAR + mfspr r7, SPRN_TEXASR + std r5, VCPU_TFHAR(r9) + std r6, VCPU_TFIAR(r9) + std r7, VCPU_TEXASR(r9) + + ld r0, PPC_LR_STKOFF(r1) + mtlr r0 + blr + +/* + * Restore transactional state and TM-related registers. + * Called with r4 pointing to the vcpu struct. + * This potentially modifies all checkpointed registers. + * It restores r1, r2, r4 from the PACA. + */ +kvmppc_restore_tm: + mflr r0 + std r0, PPC_LR_STKOFF(r1) + + /* Turn on TM/FP/VSX/VMX so we can restore them. */ + mfmsr r5 + li r6, MSR_TM >> 32 + sldi r6, r6, 32 + or r5, r5, r6 + ori r5, r5, MSR_FP + oris r5, r5, (MSR_VEC | MSR_VSX)@h + mtmsrd r5 + + /* + * The user may change these outside of a transaction, so they must + * always be context switched. + */ + ld r5, VCPU_TFHAR(r4) + ld r6, VCPU_TFIAR(r4) + ld r7, VCPU_TEXASR(r4) + mtspr SPRN_TFHAR, r5 + mtspr SPRN_TFIAR, r6 + mtspr SPRN_TEXASR, r7 + + ld r5, VCPU_MSR(r4) + rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 + beqlr /* TM not active in guest */ + std r1, HSTATE_HOST_R1(r13) + + /* Make sure the failure summary is set, otherwise we'll program check + * when we trechkpt. It's possible that this might have been not set + * on a kvmppc_set_one_reg() call but we shouldn't let this crash the + * host. + */ + oris r7, r7, (TEXASR_FS)@h + mtspr SPRN_TEXASR, r7 + + /* + * We need to load up the checkpointed state for the guest. + * We need to do this early as it will blow away any GPRs, VSRs and + * some SPRs. + */ + + mr r31, r4 + addi r3, r31, VCPU_FPRS_TM + bl load_fp_state + addi r3, r31, VCPU_VRS_TM + bl load_vr_state + mr r4, r31 + lwz r7, VCPU_VRSAVE_TM(r4) + mtspr SPRN_VRSAVE, r7 + + ld r5, VCPU_LR_TM(r4) + lwz r6, VCPU_CR_TM(r4) + ld r7, VCPU_CTR_TM(r4) + ld r8, VCPU_AMR_TM(r4) + ld r9, VCPU_TAR_TM(r4) + mtlr r5 + mtcr r6 + mtctr r7 + mtspr SPRN_AMR, r8 + mtspr SPRN_TAR, r9 + + /* + * Load up PPR and DSCR values but don't put them in the actual SPRs + * till the last moment to avoid running with userspace PPR and DSCR for + * too long. + */ + ld r29, VCPU_DSCR_TM(r4) + ld r30, VCPU_PPR_TM(r4) + + std r2, PACATMSCRATCH(r13) /* Save TOC */ + + /* Clear the MSR RI since r1, r13 are all going to be foobar. */ + li r5, 0 + mtmsrd r5, 1 + + /* Load GPRs r0-r28 */ + reg = 0 + .rept 29 + ld reg, VCPU_GPRS_TM(reg)(r31) + reg = reg + 1 + .endr + + mtspr SPRN_DSCR, r29 + mtspr SPRN_PPR, r30 + + /* Load final GPRs */ + ld 29, VCPU_GPRS_TM(29)(r31) + ld 30, VCPU_GPRS_TM(30)(r31) + ld 31, VCPU_GPRS_TM(31)(r31) + + /* TM checkpointed state is now setup. All GPRs are now volatile. */ + TRECHKPT + + /* Now let's get back the state we need. */ + HMT_MEDIUM + GET_PACA(r13) + ld r29, HSTATE_DSCR(r13) + mtspr SPRN_DSCR, r29 + ld r4, HSTATE_KVM_VCPU(r13) + ld r1, HSTATE_HOST_R1(r13) + ld r2, PACATMSCRATCH(r13) + + /* Set the MSR RI since we have our registers back. */ + li r5, MSR_RI + mtmsrd r5, 1 + + ld r0, PPC_LR_STKOFF(r1) + mtlr r0 + blr +#endif + /* * We come here if we get any exception or interrupt while we are * executing host real mode code while in guest MMU context. From 93d17397e4e2182fdaad503e2f9da46202c0f1c3 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 22 Jun 2016 15:52:55 +1000 Subject: [PATCH 287/302] KVM: PPC: Book3S HV: Save/restore TM state in H_CEDE It turns out that if the guest does a H_CEDE while the CPU is in a transactional state, and the H_CEDE does a nap, and the nap loses the architected state of the CPU (which is is allowed to do), then we lose the checkpointed state of the virtual CPU. In addition, the transactional-memory state recorded in the MSR gets reset back to non-transactional, and when we try to return to the guest, we take a TM bad thing type of program interrupt because we are trying to transition from non-transactional to transactional with a hrfid instruction, which is not permitted. The result of the program interrupt occurring at that point is that the host CPU will hang in an infinite loop with interrupts disabled. Thus this is a denial of service vulnerability in the host which can be triggered by any guest (and depending on the guest kernel, it can potentially triggered by unprivileged userspace in the guest). This vulnerability has been assigned the ID CVE-2016-5412. To fix this, we save the TM state before napping and restore it on exit from the nap, when handling a H_CEDE in real mode. The case where H_CEDE exits to host virtual mode is already OK (as are other hcalls which exit to host virtual mode) because the exit path saves the TM state. Cc: stable@vger.kernel.org # v3.15+ Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index cfa4031f0806..543124fa11d9 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -2093,6 +2093,13 @@ _GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */ /* save FP state */ bl kvmppc_save_fp +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +BEGIN_FTR_SECTION + ld r9, HSTATE_KVM_VCPU(r13) + bl kvmppc_save_tm +END_FTR_SECTION_IFSET(CPU_FTR_TM) +#endif + /* * Set DEC to the smaller of DEC and HDEC, so that we wake * no later than the end of our timeslice (HDEC interrupts @@ -2169,6 +2176,12 @@ kvm_end_cede: bl kvmhv_accumulate_time #endif +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +BEGIN_FTR_SECTION + bl kvmppc_restore_tm +END_FTR_SECTION_IFSET(CPU_FTR_TM) +#endif + /* load up FP state */ bl kvmppc_load_fp From 4f2777bc97974b0df9276ee9a85155a9e27a5282 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 13 Jul 2016 17:16:37 -0700 Subject: [PATCH 288/302] kvm: x86: nVMX: maintain internal copy of current VMCS KVM maintains L1's current VMCS in guest memory, at the guest physical page identified by the argument to VMPTRLD. This makes hairy time-of-check to time-of-use bugs possible,as VCPUs can be writing the the VMCS page in memory while KVM is emulating VMLAUNCH and VMRESUME. The spec documents that writing to the VMCS page while it is loaded is "undefined". Therefore it is reasonable to load the entire VMCS into an internal cache during VMPTRLD and ignore writes to the VMCS page -- the guest should be using VMREAD and VMWRITE to access the current VMCS. To adhere to the spec, KVM should flush the current VMCS during VMPTRLD, and the target VMCS during VMCLEAR (as given by the operand to VMCLEAR). Since this implementation of VMCS caching only maintains the the current VMCS, VMCLEAR will only do a flush if the operand to VMCLEAR is the current VMCS pointer. KVM will also flush during VMXOFF, which is not mandated by the spec, but also not in conflict with the spec. Signed-off-by: David Matlack Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b61cdadf8623..151d2619238c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -405,6 +405,12 @@ struct nested_vmx { /* The host-usable pointer to the above */ struct page *current_vmcs12_page; struct vmcs12 *current_vmcs12; + /* + * Cache of the guest's VMCS, existing outside of guest memory. + * Loaded from guest memory during VMPTRLD. Flushed to guest + * memory during VMXOFF, VMCLEAR, VMPTRLD. + */ + struct vmcs12 *cached_vmcs12; struct vmcs *current_shadow_vmcs; /* * Indicates if the shadow vmcs must be updated with the @@ -858,7 +864,7 @@ static inline short vmcs_field_to_offset(unsigned long field) static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) { - return to_vmx(vcpu)->nested.current_vmcs12; + return to_vmx(vcpu)->nested.cached_vmcs12; } static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) @@ -6987,10 +6993,16 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } + vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); + if (!vmx->nested.cached_vmcs12) + return -ENOMEM; + if (enable_shadow_vmcs) { shadow_vmcs = alloc_vmcs(); - if (!shadow_vmcs) + if (!shadow_vmcs) { + kfree(vmx->nested.cached_vmcs12); return -ENOMEM; + } /* mark vmcs as shadow */ shadow_vmcs->revision_id |= (1u << 31); /* init shadow vmcs */ @@ -7061,6 +7073,11 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) vmcs_write64(VMCS_LINK_POINTER, -1ull); } vmx->nested.posted_intr_nv = -1; + + /* Flush VMCS12 to guest memory */ + memcpy(vmx->nested.current_vmcs12, vmx->nested.cached_vmcs12, + VMCS12_SIZE); + kunmap(vmx->nested.current_vmcs12_page); nested_release_page(vmx->nested.current_vmcs12_page); vmx->nested.current_vmptr = -1ull; @@ -7081,6 +7098,7 @@ static void free_nested(struct vcpu_vmx *vmx) nested_release_vmcs12(vmx); if (enable_shadow_vmcs) free_vmcs(vmx->nested.current_shadow_vmcs); + kfree(vmx->nested.cached_vmcs12); /* Unpin physical memory we referred to in current vmcs02 */ if (vmx->nested.apic_access_page) { nested_release_page(vmx->nested.apic_access_page); @@ -7484,6 +7502,13 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) vmx->nested.current_vmptr = vmptr; vmx->nested.current_vmcs12 = new_vmcs12; vmx->nested.current_vmcs12_page = page; + /* + * Load VMCS12 from guest memory since it is not already + * cached. + */ + memcpy(vmx->nested.cached_vmcs12, + vmx->nested.current_vmcs12, VMCS12_SIZE); + if (enable_shadow_vmcs) { vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); @@ -8456,7 +8481,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) * the next L2->L1 exit. */ if (!is_guest_mode(vcpu) || - !nested_cpu_has2(vmx->nested.current_vmcs12, + !nested_cpu_has2(get_vmcs12(&vmx->vcpu), SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) vmcs_write64(APIC_ACCESS_ADDR, hpa); } From b80c76ec982c00f2a15668ed71c1d705b6ff95fd Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 29 Jul 2016 18:56:53 -0700 Subject: [PATCH 289/302] KVM: VMX: Add VMCS to CPU's loaded VMCSs before VMPTRLD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kexec needs to know the addresses of all VMCSs that are active on each CPU, so that it can flush them from the VMCS caches. It is safe to record superfluous addresses that are not associated with an active VMCS, but it is not safe to omit an address associated with an active VMCS. After a call to vmcs_load, the VMCS that was loaded is active on the CPU. The VMCS should be added to the CPU's list of active VMCSs before it is loaded. Signed-off-by: Jim Mattson Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 151d2619238c..b2f559159f3a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2205,22 +2205,14 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); + bool already_loaded = vmx->loaded_vmcs->cpu == cpu; if (!vmm_exclusive) kvm_cpu_vmxon(phys_addr); - else if (vmx->loaded_vmcs->cpu != cpu) + else if (!already_loaded) loaded_vmcs_clear(vmx->loaded_vmcs); - if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { - per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; - vmcs_load(vmx->loaded_vmcs->vmcs); - } - - if (vmx->loaded_vmcs->cpu != cpu) { - struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); - unsigned long sysenter_esp; - - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + if (!already_loaded) { local_irq_disable(); crash_disable_local_vmclear(cpu); @@ -2235,6 +2227,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) &per_cpu(loaded_vmcss_on_cpu, cpu)); crash_enable_local_vmclear(cpu); local_irq_enable(); + } + + if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { + per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; + vmcs_load(vmx->loaded_vmcs->vmcs); + } + + if (!already_loaded) { + struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); + unsigned long sysenter_esp; + + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); /* * Linux uses per-cpu TSS and GDT, so set these when switching From 6002bdd3e6688954f5f5c1d71b83862cfd7387d9 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 8 Jul 2016 11:53:20 +0100 Subject: [PATCH 290/302] MIPS: Fix definition of KSEGX() for 64-bit The KSEGX() macro is defined to 32-bit sign extend the address argument and logically AND the result with 0xe0000000, with the final result usually compared against one of the CKSEG macros. However the literal 0xe0000000 is unsigned as the high bit is set, and is therefore zero-extended on 64-bit kernels, resulting in the sign extension bits of the argument being masked to zero. This results in the odd situation where: KSEGX(CKSEG) != CKSEG (0xffffffff80000000 & 0x00000000e0000000) != 0xffffffff80000000) Fix this by 32-bit sign extending the 0xe0000000 literal using _ACAST32_. This will help some MIPS KVM code handling 32-bit guest addresses to work on 64-bit host kernels, but will also affect KSEGX in dec_kn01_be_backend() on a 64-bit DECstation kernel, and the SiByte DMA page ops KSEGX check in clear_page() and copy_page() on 64-bit SB1 kernels, neither of which appear to be designed with 64-bit segments in mind anyway. Signed-off-by: James Hogan Acked-by: Ralf Baechle Cc: Maciej W. Rozycki Cc: linux-mips@linux-mips.org Signed-off-by: Paolo Bonzini --- arch/mips/include/asm/addrspace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/include/asm/addrspace.h b/arch/mips/include/asm/addrspace.h index 3b0e51d5a613..c5b04e752e97 100644 --- a/arch/mips/include/asm/addrspace.h +++ b/arch/mips/include/asm/addrspace.h @@ -45,7 +45,7 @@ /* * Returns the kernel segment base of a given address */ -#define KSEGX(a) ((_ACAST32_ (a)) & 0xe0000000) +#define KSEGX(a) ((_ACAST32_(a)) & _ACAST32_(0xe0000000)) /* * Returns the physical address of a CKSEGx / XKPHYS address From cfacaced0cce20859de25b61d672edeb9789a1e9 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 8 Jul 2016 11:53:21 +0100 Subject: [PATCH 291/302] MIPS: KVM: Use virt_to_phys() to get commpage PFN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Calculate the PFN of the commpage using virt_to_phys() instead of CPHYSADDR(). This is more portable as kzalloc() may allocate from XKPhys instead of KSeg0 on 64-bit kernels, which CPHYSADDR() doesn't handle. This is sufficient for highmem kernels too since kzalloc() will allocate from lowmem in KSeg0. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/tlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 9699352293e4..f5f8c2acae53 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -176,7 +176,7 @@ int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, unsigned long entrylo[2] = { 0, 0 }; unsigned int pair_idx; - pfn = CPHYSADDR(vcpu->arch.kseg0_commpage) >> PAGE_SHIFT; + pfn = PFN_DOWN(virt_to_phys(vcpu->arch.kseg0_commpage)); pair_idx = (badvaddr >> PAGE_SHIFT) & 1; entrylo[pair_idx] = mips3_paddr_to_tlbpfn(pfn << PAGE_SHIFT) | ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | From 28cc5bd568745a58bb06291ac336d06b66c66dff Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 8 Jul 2016 11:53:22 +0100 Subject: [PATCH 292/302] MIPS: KVM: Use kmap instead of CKSEG0ADDR() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are several unportable uses of CKSEG0ADDR() in MIPS KVM, which implicitly assume that a host physical address will be in the low 512MB of the physical address space (accessible in KSeg0). These assumptions don't hold for highmem or on 64-bit kernels. When interpreting the guest physical address when reading or overwriting a trapping instruction, use kmap_atomic() to get a usable virtual address to access guest memory, which is portable to 64-bit and highmem kernels. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/dyntrans.c | 17 +++++++++++------ arch/mips/kvm/mmu.c | 7 ++++++- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c index 91ebd2b6034f..9a16ba2cb487 100644 --- a/arch/mips/kvm/dyntrans.c +++ b/arch/mips/kvm/dyntrans.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -29,14 +30,18 @@ static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc, union mips_instruction replace) { - unsigned long kseg0_opc, flags; + unsigned long paddr, flags; + void *vaddr; if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) { - kseg0_opc = - CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa - (vcpu, (unsigned long) opc)); - memcpy((void *)kseg0_opc, (void *)&replace, sizeof(u32)); - local_flush_icache_range(kseg0_opc, kseg0_opc + 32); + paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu, + (unsigned long)opc); + vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr))); + vaddr += paddr & ~PAGE_MASK; + memcpy(vaddr, (void *)&replace, sizeof(u32)); + local_flush_icache_range((unsigned long)vaddr, + (unsigned long)vaddr + 32); + kunmap_atomic(vaddr); } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) { local_irq_save(flags); memcpy((void *)opc, (void *)&replace, sizeof(u32)); diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index ecead748de04..57319ee57c4f 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -9,6 +9,7 @@ * Authors: Sanjay Lal */ +#include #include #include @@ -330,6 +331,7 @@ u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu) struct mips_coproc *cop0 = vcpu->arch.cop0; unsigned long paddr, flags, vpn2, asid; unsigned long va = (unsigned long)opc; + void *vaddr; u32 inst; int index; @@ -360,7 +362,10 @@ u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu) local_irq_restore(flags); } else if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) { paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu, va); - inst = *(u32 *) CKSEG0ADDR(paddr); + vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr))); + vaddr += paddr & ~PAGE_MASK; + inst = *(u32 *)vaddr; + kunmap_atomic(vaddr); } else { kvm_err("%s: illegal address: %p\n", __func__, opc); return KVM_INVALID_INST; From e41637d85846b5b4b6ef5232a22b7e74c03f1be6 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 8 Jul 2016 11:53:23 +0100 Subject: [PATCH 293/302] MIPS: KVM: Make entry code MIPS64 friendly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MIPS KVM entry code (originally kvm_locore.S, later locore.S, and now entry.c) has never quite been right when built for 64-bit, using 32-bit instructions when 64-bit instructions were needed for handling 64-bit registers and pointers. Fix several cases of this now. The changes roughly fall into the following categories. - COP0 scratch registers contain guest register values and the VCPU pointer, and are themselves full width. Similarly CP0_EPC and CP0_BadVAddr registers are full width (even though technically we don't support 64-bit guest address spaces with trap & emulate KVM). Use MFC0/MTC0 for accessing them. - Handling of stack pointers and the VCPU pointer must match the pointer size of the kernel ABI (always o32 or n64), so use ADDIU. - The CPU number in thread_info, and the guest_{user,kernel}_asid arrays in kvm_vcpu_arch are all 32 bit integers, so use lw (instead of LW) to load them. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/entry.c | 48 +++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index 75ba7c2ecb3d..f4556d0279c6 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -120,12 +120,12 @@ static void kvm_mips_build_save_scratch(u32 **p, unsigned int tmp, unsigned int frame) { /* Save the VCPU scratch register value in cp0_epc of the stack frame */ - uasm_i_mfc0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]); + UASM_i_MFC0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]); UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame); /* Save the temp scratch register value in cp0_cause of stack frame */ if (scratch_tmp[0] == 31) { - uasm_i_mfc0(p, tmp, scratch_tmp[0], scratch_tmp[1]); + UASM_i_MFC0(p, tmp, scratch_tmp[0], scratch_tmp[1]); UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame); } } @@ -138,11 +138,11 @@ static void kvm_mips_build_restore_scratch(u32 **p, unsigned int tmp, * kvm_mips_build_save_scratch(). */ UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame); - uasm_i_mtc0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]); + UASM_i_MTC0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]); if (scratch_tmp[0] == 31) { UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame); - uasm_i_mtc0(p, tmp, scratch_tmp[0], scratch_tmp[1]); + UASM_i_MTC0(p, tmp, scratch_tmp[0], scratch_tmp[1]); } } @@ -171,7 +171,7 @@ void *kvm_mips_build_vcpu_run(void *addr) */ /* k0/k1 not being used in host kernel context */ - uasm_i_addiu(&p, K1, SP, -(int)sizeof(struct pt_regs)); + UASM_i_ADDIU(&p, K1, SP, -(int)sizeof(struct pt_regs)); for (i = 16; i < 32; ++i) { if (i == 24) i = 28; @@ -186,10 +186,10 @@ void *kvm_mips_build_vcpu_run(void *addr) kvm_mips_build_save_scratch(&p, V1, K1); /* VCPU scratch register has pointer to vcpu */ - uasm_i_mtc0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]); + UASM_i_MTC0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]); /* Offset into vcpu->arch */ - uasm_i_addiu(&p, K1, A1, offsetof(struct kvm_vcpu, arch)); + UASM_i_ADDIU(&p, K1, A1, offsetof(struct kvm_vcpu, arch)); /* * Save the host stack to VCPU, used for exception processing @@ -252,7 +252,7 @@ static void *kvm_mips_build_enter_guest(void *addr) /* Set Guest EPC */ UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, pc), K1); - uasm_i_mtc0(&p, T0, C0_EPC); + UASM_i_MTC0(&p, T0, C0_EPC); /* Set the ASID for the Guest Kernel */ UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, cop0), K1); @@ -261,20 +261,20 @@ static void *kvm_mips_build_enter_guest(void *addr) uasm_i_andi(&p, T0, T0, KSU_USER | ST0_ERL | ST0_EXL); uasm_i_xori(&p, T0, T0, KSU_USER); uasm_il_bnez(&p, &r, T0, label_kernel_asid); - uasm_i_addiu(&p, T1, K1, + UASM_i_ADDIU(&p, T1, K1, offsetof(struct kvm_vcpu_arch, guest_kernel_asid)); /* else user */ - uasm_i_addiu(&p, T1, K1, + UASM_i_ADDIU(&p, T1, K1, offsetof(struct kvm_vcpu_arch, guest_user_asid)); uasm_l_kernel_asid(&l, p); /* t1: contains the base of the ASID array, need to get the cpu id */ /* smp_processor_id */ - UASM_i_LW(&p, T2, offsetof(struct thread_info, cpu), GP); + uasm_i_lw(&p, T2, offsetof(struct thread_info, cpu), GP); /* x4 */ uasm_i_sll(&p, T2, T2, 2); UASM_i_ADDU(&p, T3, T1, T2); - UASM_i_LW(&p, K0, 0, T3); + uasm_i_lw(&p, K0, 0, T3); #ifdef CONFIG_MIPS_ASID_BITS_VARIABLE /* x sizeof(struct cpuinfo_mips)/4 */ uasm_i_addiu(&p, T3, ZERO, sizeof(struct cpuinfo_mips)/4); @@ -344,11 +344,11 @@ void *kvm_mips_build_exception(void *addr, void *handler) memset(relocs, 0, sizeof(relocs)); /* Save guest k1 into scratch register */ - uasm_i_mtc0(&p, K1, scratch_tmp[0], scratch_tmp[1]); + UASM_i_MTC0(&p, K1, scratch_tmp[0], scratch_tmp[1]); /* Get the VCPU pointer from the VCPU scratch register */ - uasm_i_mfc0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]); - uasm_i_addiu(&p, K1, K1, offsetof(struct kvm_vcpu, arch)); + UASM_i_MFC0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]); + UASM_i_ADDIU(&p, K1, K1, offsetof(struct kvm_vcpu, arch)); /* Save guest k0 into VCPU structure */ UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1); @@ -415,13 +415,13 @@ void *kvm_mips_build_exit(void *addr) /* Finally save guest k1 to VCPU */ uasm_i_ehb(&p); - uasm_i_mfc0(&p, T0, scratch_tmp[0], scratch_tmp[1]); + UASM_i_MFC0(&p, T0, scratch_tmp[0], scratch_tmp[1]); UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, gprs[K1]), K1); /* Now that context has been saved, we can use other registers */ /* Restore vcpu */ - uasm_i_mfc0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]); + UASM_i_MFC0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]); uasm_i_move(&p, S1, A1); /* Restore run (vcpu->run) */ @@ -433,10 +433,10 @@ void *kvm_mips_build_exit(void *addr) * Save Host level EPC, BadVaddr and Cause to VCPU, useful to process * the exception */ - uasm_i_mfc0(&p, K0, C0_EPC); + UASM_i_MFC0(&p, K0, C0_EPC); UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, pc), K1); - uasm_i_mfc0(&p, K0, C0_BADVADDR); + UASM_i_MFC0(&p, K0, C0_BADVADDR); UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_badvaddr), K1); @@ -506,7 +506,7 @@ void *kvm_mips_build_exit(void *addr) UASM_i_LW(&p, SP, offsetof(struct kvm_vcpu_arch, host_stack), K1); /* Saved host state */ - uasm_i_addiu(&p, SP, SP, -(int)sizeof(struct pt_regs)); + UASM_i_ADDIU(&p, SP, SP, -(int)sizeof(struct pt_regs)); /* * XXXKYMA do we need to load the host ASID, maybe not because the @@ -529,7 +529,7 @@ void *kvm_mips_build_exit(void *addr) */ UASM_i_LA(&p, T9, (unsigned long)kvm_mips_handle_exit); uasm_i_jalr(&p, RA, T9); - uasm_i_addiu(&p, SP, SP, -CALLFRAME_SIZ); + UASM_i_ADDIU(&p, SP, SP, -CALLFRAME_SIZ); uasm_resolve_relocs(relocs, labels); @@ -569,7 +569,7 @@ static void *kvm_mips_build_ret_from_exit(void *addr) */ uasm_i_move(&p, K1, S1); - uasm_i_addiu(&p, K1, K1, offsetof(struct kvm_vcpu, arch)); + UASM_i_ADDIU(&p, K1, K1, offsetof(struct kvm_vcpu, arch)); /* * Check return value, should tell us if we are returning to the @@ -603,7 +603,7 @@ static void *kvm_mips_build_ret_to_guest(void *addr) u32 *p = addr; /* Put the saved pointer to vcpu (s1) back into the scratch register */ - uasm_i_mtc0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]); + UASM_i_MTC0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]); /* Load up the Guest EBASE to minimize the window where BEV is set */ UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1); @@ -645,7 +645,7 @@ static void *kvm_mips_build_ret_to_host(void *addr) /* EBASE is already pointing to Linux */ UASM_i_LW(&p, K1, offsetof(struct kvm_vcpu_arch, host_stack), K1); - uasm_i_addiu(&p, K1, K1, -(int)sizeof(struct pt_regs)); + UASM_i_ADDIU(&p, K1, K1, -(int)sizeof(struct pt_regs)); /* * r2/v0 is the return code, shift it down by 2 (arithmetic) From 1d756942533b2330d8929dd0ea61a81a5d020196 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 8 Jul 2016 11:53:24 +0100 Subject: [PATCH 294/302] MIPS: KVM: Set CP0_Status.KX on MIPS64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the KVM entry code to set the CP0_Entry.KX bit on 64-bit kernels. This is important to allow the entry code, running in kernel mode, to access the full 64-bit address space right up to the point of entering the guest, and immediately after exiting the guest, so it can safely restore & save the guest context from 64-bit segments. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/entry.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index f4556d0279c6..c824bfc4daa0 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -61,6 +61,12 @@ #define CALLFRAME_SIZ 32 +#ifdef CONFIG_64BIT +#define ST0_KX_IF_64 ST0_KX +#else +#define ST0_KX_IF_64 0 +#endif + static unsigned int scratch_vcpu[2] = { C0_DDATA_LO }; static unsigned int scratch_tmp[2] = { C0_ERROREPC }; @@ -204,7 +210,7 @@ void *kvm_mips_build_vcpu_run(void *addr) * Setup status register for running the guest in UM, interrupts * are disabled */ - UASM_i_LA(&p, K0, ST0_EXL | KSU_USER | ST0_BEV); + UASM_i_LA(&p, K0, ST0_EXL | KSU_USER | ST0_BEV | ST0_KX_IF_64); uasm_i_mtc0(&p, K0, C0_STATUS); uasm_i_ehb(&p); @@ -217,7 +223,7 @@ void *kvm_mips_build_vcpu_run(void *addr) * interrupt mask as it was but make sure that timer interrupts * are enabled */ - uasm_i_addiu(&p, K0, ZERO, ST0_EXL | KSU_USER | ST0_IE); + uasm_i_addiu(&p, K0, ZERO, ST0_EXL | KSU_USER | ST0_IE | ST0_KX_IF_64); uasm_i_andi(&p, V0, V0, ST0_IM); uasm_i_or(&p, K0, K0, V0); uasm_i_mtc0(&p, K0, C0_STATUS); From 0d17aea5c27d7d748b1d8116d275b2b17dc5cad6 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 8 Jul 2016 11:53:25 +0100 Subject: [PATCH 295/302] MIPS: KVM: Use 64-bit CP0_EBase when appropriate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the KVM entry point to write CP0_EBase as a 64-bit register when it is 64-bits wide, and to set the WG (write gate) bit if it exists in order to write bits 63:30 (or 31:30 on MIPS32). Prior to MIPS64r6 it was UNDEFINED to perform a 64-bit read or write of a 32-bit COP0 register. Since this is dynamically generated code, generate the right type of access depending on whether the kernel is 64-bit and cpu_has_ebase_wg. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/entry.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index c824bfc4daa0..6a02b3a3fa65 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -152,6 +152,25 @@ static void kvm_mips_build_restore_scratch(u32 **p, unsigned int tmp, } } +/** + * build_set_exc_base() - Assemble code to write exception base address. + * @p: Code buffer pointer. + * @reg: Source register (generated code may set WG bit in @reg). + * + * Assemble code to modify the exception base address in the EBase register, + * using the appropriately sized access and setting the WG bit if necessary. + */ +static inline void build_set_exc_base(u32 **p, unsigned int reg) +{ + if (cpu_has_ebase_wg) { + /* Set WG so that all the bits get written */ + uasm_i_ori(p, reg, reg, MIPS_EBASE_WG); + UASM_i_MTC0(p, reg, C0_EBASE); + } else { + uasm_i_mtc0(p, reg, C0_EBASE); + } +} + /** * kvm_mips_build_vcpu_run() - Assemble function to start running a guest VCPU. * @addr: Address to start writing code. @@ -216,7 +235,7 @@ void *kvm_mips_build_vcpu_run(void *addr) /* load up the new EBASE */ UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1); - uasm_i_mtc0(&p, K0, C0_EBASE); + build_set_exc_base(&p, K0); /* * Now that the new EBASE has been loaded, unset BEV, set @@ -463,7 +482,7 @@ void *kvm_mips_build_exit(void *addr) UASM_i_LA_mostly(&p, K0, (long)&ebase); UASM_i_LW(&p, K0, uasm_rel_lo((long)&ebase), K0); - uasm_i_mtc0(&p, K0, C0_EBASE); + build_set_exc_base(&p, K0); if (raw_cpu_has_fpu) { /* @@ -620,7 +639,7 @@ static void *kvm_mips_build_ret_to_guest(void *addr) uasm_i_or(&p, K0, V1, AT); uasm_i_mtc0(&p, K0, C0_STATUS); uasm_i_ehb(&p); - uasm_i_mtc0(&p, T0, C0_EBASE); + build_set_exc_base(&p, T0); /* Setup status register for running guest in UM */ uasm_i_ori(&p, V1, V1, ST0_EXL | KSU_USER | ST0_IE); From 2a06dab877dee3d4144c3ba32c662db18a1fdd2b Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 8 Jul 2016 11:53:26 +0100 Subject: [PATCH 296/302] MIPS: KVM: Fail if ebase doesn't fit in CP0_EBase MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fail if the address of the allocated exception base doesn't fit into the CP0_EBase register. This can happen on MIPS64 if CP0_EBase.WG isn't implemented but RAM is available outside of the range of KSeg0. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 414b00074e29..a6ea084b4d9d 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -300,6 +300,18 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) kvm_debug("Allocated %d bytes for KVM Exception Handlers @ %p\n", ALIGN(size, PAGE_SIZE), gebase); + /* + * Check new ebase actually fits in CP0_EBase. The lack of a write gate + * limits us to the low 512MB of physical address space. If the memory + * we allocate is out of range, just give up now. + */ + if (!cpu_has_ebase_wg && virt_to_phys(gebase) >= 0x20000000) { + kvm_err("CP0_EBase.WG required for guest exception base %pK\n", + gebase); + err = -ENOMEM; + goto out_free_gebase; + } + /* Save new ebase */ vcpu->arch.guest_ebase = gebase; From 5808844f03b4b31a13a87cf41cc0701718c1b622 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 8 Jul 2016 11:53:27 +0100 Subject: [PATCH 297/302] MIPS: KVM: Fix 64-bit big endian dynamic translation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MFC0 and MTC0 instructions in the guest which cause traps can be replaced with 32-bit loads and stores to the commpage, however on big endian 64-bit builds the offset needs to have 4 added so as to load/store the least significant half of the long instead of the most significant half. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/dyntrans.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c index 9a16ba2cb487..c793ff19a8a8 100644 --- a/arch/mips/kvm/dyntrans.c +++ b/arch/mips/kvm/dyntrans.c @@ -103,6 +103,10 @@ int kvm_mips_trans_mfc0(union mips_instruction inst, u32 *opc, mfc0_inst.i_format.rt = inst.c0r_format.rt; mfc0_inst.i_format.simmediate = KVM_GUEST_COMMPAGE_ADDR | offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]); +#ifdef CONFIG_CPU_BIG_ENDIAN + if (sizeof(vcpu->arch.cop0->reg[0][0]) == 8) + mfc0_inst.i_format.simmediate |= 4; +#endif } return kvm_mips_trans_replace(vcpu, opc, mfc0_inst); @@ -121,6 +125,10 @@ int kvm_mips_trans_mtc0(union mips_instruction inst, u32 *opc, mtc0_inst.i_format.rt = inst.c0r_format.rt; mtc0_inst.i_format.simmediate = KVM_GUEST_COMMPAGE_ADDR | offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]); +#ifdef CONFIG_CPU_BIG_ENDIAN + if (sizeof(vcpu->arch.cop0->reg[0][0]) == 8) + mtc0_inst.i_format.simmediate |= 4; +#endif return kvm_mips_trans_replace(vcpu, opc, mtc0_inst); } From 172e02d1474d5c37a8728ccdfdc731c118366144 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 8 Jul 2016 11:53:28 +0100 Subject: [PATCH 298/302] MIPS: KVM: Sign extend MFC0/RDHWR results MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When emulating MFC0 instructions to load 32-bit values from guest COP0 registers and the RDHWR instruction to read the CC (Count) register, sign extend the result to comply with the MIPS64 architecture. The result must be in canonical 32-bit form or the guest may malfunction. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/emulate.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index be18dfe9ecaa..6eb52b9c9818 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1072,14 +1072,15 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, #endif /* Get reg */ if ((rd == MIPS_CP0_COUNT) && (sel == 0)) { - vcpu->arch.gprs[rt] = kvm_mips_read_count(vcpu); + vcpu->arch.gprs[rt] = + (s32)kvm_mips_read_count(vcpu); } else if ((rd == MIPS_CP0_ERRCTL) && (sel == 0)) { vcpu->arch.gprs[rt] = 0x0; #ifdef CONFIG_KVM_MIPS_DYN_TRANS kvm_mips_trans_mfc0(inst, opc, vcpu); #endif } else { - vcpu->arch.gprs[rt] = cop0->reg[rd][sel]; + vcpu->arch.gprs[rt] = (s32)cop0->reg[rd][sel]; #ifdef CONFIG_KVM_MIPS_DYN_TRANS kvm_mips_trans_mfc0(inst, opc, vcpu); @@ -2380,7 +2381,7 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, current_cpu_data.icache.linesz); break; case MIPS_HWR_CC: /* Read count register */ - arch->gprs[rt] = kvm_mips_read_count(vcpu); + arch->gprs[rt] = (s32)kvm_mips_read_count(vcpu); break; case MIPS_HWR_CCRES: /* Count register resolution */ switch (current_cpu_data.cputype) { From 8296963e6e8c656c4d91dfa7245e49672aa9675e Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 8 Jul 2016 11:53:29 +0100 Subject: [PATCH 299/302] MIPS: KVM: Fix ptr->int cast via KVM_GUEST_KSEGX() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm_mips_trans_replace() passes a pointer to KVM_GUEST_KSEGX(). This breaks on 64-bit builds due to the cast of that 64-bit pointer to a different sized 32-bit int. Cast the pointer argument to an unsigned long to work around the warning. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/dyntrans.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c index c793ff19a8a8..d280894915ed 100644 --- a/arch/mips/kvm/dyntrans.c +++ b/arch/mips/kvm/dyntrans.c @@ -33,7 +33,7 @@ static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc, unsigned long paddr, flags; void *vaddr; - if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) { + if (KVM_GUEST_KSEGX((unsigned long)opc) == KVM_GUEST_KSEG0) { paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu, (unsigned long)opc); vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr))); From a700434d80eab4c42380a5c57745aff07493784c Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 8 Jul 2016 11:53:30 +0100 Subject: [PATCH 300/302] MIPS: KVM: Reset CP0_PageMask during host TLB flush MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM sometimes flushes host TLB entries, reading each one to check if it corresponds to a guest KSeg0 address. In the absence of EntryHi.EHInv bits to invalidate the whole entry, the entries will be set to unique virtual addresses in KSeg0 (which is not TLB mapped), spaced 2*PAGE_SIZE apart. The TLB read however will clobber the CP0_PageMask register with whatever page size that TLB entry had, and that same page size will be written back into the TLB entry along with the unique address. This would cause breakage when transparent huge pages are enabled on 64-bit host kernels, since huge page entries will overlap other nearby entries when separated by only 2*PAGE_SIZE, causing a machine check exception. Fix this by restoring the old CP0_PageMask value (which should be set to the normal page size) after reading the TLB entry if we're going to go ahead and invalidate it. Signed-off-by: James Hogan Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/kvm/tlb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index f5f8c2acae53..254377d8e0b9 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -332,6 +332,8 @@ void kvm_mips_flush_host_tlb(int skip_kseg0) /* Don't blow away guest kernel entries */ if (KVM_GUEST_KSEGX(entryhi) == KVM_GUEST_KSEG0) continue; + + write_c0_pagemask(old_pagemask); } /* Make sure all entries differ. */ From 40a2df49858eb40ccfe979da69b74d8b203a869b Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 8 Jul 2016 11:53:31 +0100 Subject: [PATCH 301/302] MIPS: Select HAVE_KVM for MIPS64_R{2,6} MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We are now able to support KVM T&E with MIPS32 guests on some MIPS64r2 and MIPS64r6 hosts, so select HAVE_KVM so it can be enabled. Signed-off-by: James Hogan Cc: Ralf Baechle Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: linux-mips@linux-mips.org Cc: kvm@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/mips/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index ac91939b9b75..29867139851e 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -1488,6 +1488,7 @@ config CPU_MIPS64_R2 select CPU_SUPPORTS_HIGHMEM select CPU_SUPPORTS_HUGEPAGES select CPU_SUPPORTS_MSA + select HAVE_KVM help Choose this option to build a kernel for release 2 or later of the MIPS64 architecture. Many modern embedded systems with a 64-bit @@ -1505,6 +1506,7 @@ config CPU_MIPS64_R6 select CPU_SUPPORTS_MSA select GENERIC_CSUM select MIPS_O32_FP64_SUPPORT if MIPS32_O32 + select HAVE_KVM help Choose this option to build a kernel for release 6 or later of the MIPS64 architecture. New MIPS processors, starting with the Warrior From 23528bb21ee2c9b27f3feddd77a2a3351a8df148 Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Wed, 20 Jul 2016 13:41:36 +1000 Subject: [PATCH 302/302] KVM: PPC: Introduce KVM_CAP_PPC_HTM Introduce a new KVM capability, KVM_CAP_PPC_HTM, that can be queried to determine if a PowerPC KVM guest should use HTM (Hardware Transactional Memory). This will be used by QEMU to populate the pa-features bits in the guest's device tree. Signed-off-by: Sam Bobroff Reviewed-by: David Gibson Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/powerpc.c | 4 ++++ include/uapi/linux/kvm.h | 1 + 2 files changed, 5 insertions(+) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 1ac036e45ed4..6ce40dd6fe51 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -588,6 +588,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = 1; break; #endif + case KVM_CAP_PPC_HTM: + r = cpu_has_feature(CPU_FTR_TM_COMP) && + is_kvmppc_hv_enabled(kvm); + break; default: r = 0; break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 8f2756c263d4..e98bb4cce639 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -869,6 +869,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_X2APIC_API 129 #define KVM_CAP_S390_USER_INSTR0 130 #define KVM_CAP_MSI_DEVID 131 +#define KVM_CAP_PPC_HTM 132 #ifdef KVM_CAP_IRQ_ROUTING