From 772f95e3b9460c64fb99b134022855cbce75b9a0 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 16 Jul 2015 20:34:42 +0100 Subject: [PATCH 01/33] x86/xen: fix non-ANSI declaration of xen_has_pv_devices() xen_has_pv_devices() has no parameters, so use the normal void parameter convention to make it match the prototype in the header file include/xen/platform_pci.h. Signed-off-by: Colin Ian King Signed-off-by: David Vrabel --- arch/x86/xen/platform-pci-unplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index a8261716d58d..9586ff32810c 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -68,7 +68,7 @@ static int check_platform_magic(void) return 0; } -bool xen_has_pv_devices() +bool xen_has_pv_devices(void) { if (!xen_domain()) return false; From a7da51ae10032a507ddeae6a490916eadbd1e10a Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 15 Jul 2015 12:52:01 +0300 Subject: [PATCH 02/33] xen/preempt: use need_resched() instead of should_resched() This code is used only when CONFIG_PREEMPT=n and only in non-atomic context: xen_in_preemptible_hcall is set only in privcmd_ioctl_hypercall(). Thus preempt_count is zero and should_resched() is equal to need_resched(). Signed-off-by: Konstantin Khlebnikov Signed-off-by: David Vrabel --- drivers/xen/preempt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/preempt.c b/drivers/xen/preempt.c index a1800c150839..08cb419eb4e6 100644 --- a/drivers/xen/preempt.c +++ b/drivers/xen/preempt.c @@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall); asmlinkage __visible void xen_maybe_preempt_hcall(void) { if (unlikely(__this_cpu_read(xen_in_preemptible_hcall) - && should_resched())) { + && need_resched())) { /* * Clear flag as we may be rescheduled on a different * cpu. From 907c3eb18e0bd86ca12a9de80befe8e3647bac3e Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Mon, 13 Jul 2015 17:55:24 +0800 Subject: [PATCH 03/33] xen-blkfront: convert to blk-mq APIs Note: This patch is based on original work of Arianna's internship for GNOME's Outreach Program for Women. Only one hardware queue is used now, so there is no significant performance change The legacy non-mq code is deleted completely which is the same as other drivers like virtio, mtip, and nvme. Also dropped one unnecessary holding of info->io_lock when calling blk_mq_stop_hw_queues(). Signed-off-by: Arianna Avanzini Signed-off-by: Bob Liu Reviewed-by: Christoph Hellwig Acked-by: Jens Axboe Signed-off-by: David Vrabel --- drivers/block/xen-blkfront.c | 146 ++++++++++++++--------------------- 1 file changed, 60 insertions(+), 86 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 7a8a73f1fc04..5dd591d6c859 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -37,6 +37,7 @@ #include #include +#include #include #include #include @@ -148,6 +149,7 @@ struct blkfront_info unsigned int feature_persistent:1; unsigned int max_indirect_segments; int is_ready; + struct blk_mq_tag_set tag_set; }; static unsigned int nr_minors; @@ -617,54 +619,41 @@ static inline bool blkif_request_flush_invalid(struct request *req, !(info->feature_flush & REQ_FUA))); } -/* - * do_blkif_request - * read a block; request is in a request queue - */ -static void do_blkif_request(struct request_queue *rq) +static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *qd) { - struct blkfront_info *info = NULL; - struct request *req; - int queued; + struct blkfront_info *info = qd->rq->rq_disk->private_data; - pr_debug("Entered do_blkif_request\n"); + blk_mq_start_request(qd->rq); + spin_lock_irq(&info->io_lock); + if (RING_FULL(&info->ring)) + goto out_busy; - queued = 0; + if (blkif_request_flush_invalid(qd->rq, info)) + goto out_err; - while ((req = blk_peek_request(rq)) != NULL) { - info = req->rq_disk->private_data; + if (blkif_queue_request(qd->rq)) + goto out_busy; - if (RING_FULL(&info->ring)) - goto wait; + flush_requests(info); + spin_unlock_irq(&info->io_lock); + return BLK_MQ_RQ_QUEUE_OK; - blk_start_request(req); +out_err: + spin_unlock_irq(&info->io_lock); + return BLK_MQ_RQ_QUEUE_ERROR; - if (blkif_request_flush_invalid(req, info)) { - __blk_end_request_all(req, -EOPNOTSUPP); - continue; - } - - pr_debug("do_blk_req %p: cmd %p, sec %lx, " - "(%u/%u) [%s]\n", - req, req->cmd, (unsigned long)blk_rq_pos(req), - blk_rq_cur_sectors(req), blk_rq_sectors(req), - rq_data_dir(req) ? "write" : "read"); - - if (blkif_queue_request(req)) { - blk_requeue_request(rq, req); -wait: - /* Avoid pointless unplugs. */ - blk_stop_queue(rq); - break; - } - - queued++; - } - - if (queued != 0) - flush_requests(info); +out_busy: + spin_unlock_irq(&info->io_lock); + blk_mq_stop_hw_queue(hctx); + return BLK_MQ_RQ_QUEUE_BUSY; } +static struct blk_mq_ops blkfront_mq_ops = { + .queue_rq = blkif_queue_rq, + .map_queue = blk_mq_map_queue, +}; + static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, unsigned int physical_sector_size, unsigned int segments) @@ -672,9 +661,22 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, struct request_queue *rq; struct blkfront_info *info = gd->private_data; - rq = blk_init_queue(do_blkif_request, &info->io_lock); - if (rq == NULL) + memset(&info->tag_set, 0, sizeof(info->tag_set)); + info->tag_set.ops = &blkfront_mq_ops; + info->tag_set.nr_hw_queues = 1; + info->tag_set.queue_depth = BLK_RING_SIZE(info); + info->tag_set.numa_node = NUMA_NO_NODE; + info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + info->tag_set.cmd_size = 0; + info->tag_set.driver_data = info; + + if (blk_mq_alloc_tag_set(&info->tag_set)) return -1; + rq = blk_mq_init_queue(&info->tag_set); + if (IS_ERR(rq)) { + blk_mq_free_tag_set(&info->tag_set); + return -1; + } queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); @@ -902,19 +904,15 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, static void xlvbd_release_gendisk(struct blkfront_info *info) { unsigned int minor, nr_minors; - unsigned long flags; if (info->rq == NULL) return; - spin_lock_irqsave(&info->io_lock, flags); - /* No more blkif_request(). */ - blk_stop_queue(info->rq); + blk_mq_stop_hw_queues(info->rq); /* No more gnttab callback work. */ gnttab_cancel_free_callback(&info->callback); - spin_unlock_irqrestore(&info->io_lock, flags); /* Flush gnttab callback work. Must be done with no locks held. */ flush_work(&info->work); @@ -926,20 +924,18 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) xlbd_release_minors(minor, nr_minors); blk_cleanup_queue(info->rq); + blk_mq_free_tag_set(&info->tag_set); info->rq = NULL; put_disk(info->gd); info->gd = NULL; } +/* Must be called with io_lock holded */ static void kick_pending_request_queues(struct blkfront_info *info) { - if (!RING_FULL(&info->ring)) { - /* Re-enable calldowns. */ - blk_start_queue(info->rq); - /* Kick things off immediately. */ - do_blkif_request(info->rq); - } + if (!RING_FULL(&info->ring)) + blk_mq_start_stopped_hw_queues(info->rq, true); } static void blkif_restart_queue(struct work_struct *work) @@ -964,7 +960,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; /* No more blkif_request(). */ if (info->rq) - blk_stop_queue(info->rq); + blk_mq_stop_hw_queues(info->rq); /* Remove all persistent grants */ if (!list_empty(&info->grants)) { @@ -1147,7 +1143,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) RING_IDX i, rp; unsigned long flags; struct blkfront_info *info = (struct blkfront_info *)dev_id; - int error; spin_lock_irqsave(&info->io_lock, flags); @@ -1188,37 +1183,37 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) continue; } - error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO; + req->errors = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO; switch (bret->operation) { case BLKIF_OP_DISCARD: if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { struct request_queue *rq = info->rq; printk(KERN_WARNING "blkfront: %s: %s op failed\n", info->gd->disk_name, op_name(bret->operation)); - error = -EOPNOTSUPP; + req->errors = -EOPNOTSUPP; info->feature_discard = 0; info->feature_secdiscard = 0; queue_flag_clear(QUEUE_FLAG_DISCARD, rq); queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq); } - __blk_end_request_all(req, error); + blk_mq_complete_request(req); break; case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_WRITE_BARRIER: if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { printk(KERN_WARNING "blkfront: %s: %s op failed\n", info->gd->disk_name, op_name(bret->operation)); - error = -EOPNOTSUPP; + req->errors = -EOPNOTSUPP; } if (unlikely(bret->status == BLKIF_RSP_ERROR && info->shadow[id].req.u.rw.nr_segments == 0)) { printk(KERN_WARNING "blkfront: %s: empty %s op failed\n", info->gd->disk_name, op_name(bret->operation)); - error = -EOPNOTSUPP; + req->errors = -EOPNOTSUPP; } - if (unlikely(error)) { - if (error == -EOPNOTSUPP) - error = 0; + if (unlikely(req->errors)) { + if (req->errors == -EOPNOTSUPP) + req->errors = 0; info->feature_flush = 0; xlvbd_flush(info); } @@ -1229,7 +1224,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) dev_dbg(&info->xbdev->dev, "Bad return from blkdev data " "request: %x\n", bret->status); - __blk_end_request_all(req, error); + blk_mq_complete_request(req); break; default: BUG(); @@ -1558,28 +1553,6 @@ static int blkif_recover(struct blkfront_info *info) kfree(copy); - /* - * Empty the queue, this is important because we might have - * requests in the queue with more segments than what we - * can handle now. - */ - spin_lock_irq(&info->io_lock); - while ((req = blk_fetch_request(info->rq)) != NULL) { - if (req->cmd_flags & - (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { - list_add(&req->queuelist, &requests); - continue; - } - merge_bio.head = req->bio; - merge_bio.tail = req->biotail; - bio_list_merge(&bio_list, &merge_bio); - req->bio = NULL; - if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) - pr_alert("diskcache flush request found!\n"); - __blk_end_request_all(req, 0); - } - spin_unlock_irq(&info->io_lock); - xenbus_switch_state(info->xbdev, XenbusStateConnected); spin_lock_irq(&info->io_lock); @@ -1594,9 +1567,10 @@ static int blkif_recover(struct blkfront_info *info) /* Requeue pending requests (flush or discard) */ list_del_init(&req->queuelist); BUG_ON(req->nr_phys_segments > segs); - blk_requeue_request(info->rq, req); + blk_mq_requeue_request(req); } spin_unlock_irq(&info->io_lock); + blk_mq_kick_requeue_list(info->rq); while ((bio = bio_list_pop(&bio_list)) != NULL) { /* Traverse the list of pending bios and re-queue them */ From 4a5b69464e51f4a8dd432e8c2a1468630df1a53c Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Tue, 28 Jul 2015 10:10:42 +0100 Subject: [PATCH 04/33] xen/events: Support event channel rebind on ARM Currently, the event channel rebind code is gated with the presence of the vector callback. The virtual interrupt controller on ARM has the concept of per-CPU interrupt (PPI) which allow us to support per-VCPU event channel. Therefore there is no need of vector callback for ARM. Xen is already using a free PPI to notify the guest VCPU of an event. Furthermore, the xen code initialization in Linux (see arch/arm/xen/enlighten.c) is requesting correctly a per-CPU IRQ. Introduce new helper xen_support_evtchn_rebind to allow architecture decide whether rebind an event is support or not. It will always return true on ARM and keep the same behavior on x86. This is also allow us to drop the usage of xen_have_vector_callback entirely in the ARM code. Signed-off-by: Julien Grall Signed-off-by: David Vrabel --- arch/arm/include/asm/xen/events.h | 6 ++++++ arch/arm/xen/enlighten.c | 4 ---- arch/arm64/include/asm/xen/events.h | 6 ++++++ arch/x86/include/asm/xen/events.h | 11 +++++++++++ drivers/xen/events/events_base.c | 6 +----- include/xen/events.h | 1 - 6 files changed, 24 insertions(+), 10 deletions(-) diff --git a/arch/arm/include/asm/xen/events.h b/arch/arm/include/asm/xen/events.h index 8b1f37bfeeec..71e473d05fcc 100644 --- a/arch/arm/include/asm/xen/events.h +++ b/arch/arm/include/asm/xen/events.h @@ -20,4 +20,10 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) atomic64_t, \ counter), (val)) +/* Rebind event channel is supported by default */ +static inline bool xen_support_evtchn_rebind(void) +{ + return true; +} + #endif /* _ASM_ARM_XEN_EVENTS_H */ diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index 6c09cc440a2b..40b961d8e953 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -45,10 +45,6 @@ static struct vcpu_info __percpu *xen_vcpu_info; unsigned long xen_released_pages; struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; -/* TODO: to be removed */ -__read_mostly int xen_have_vector_callback; -EXPORT_SYMBOL_GPL(xen_have_vector_callback); - int xen_platform_pci_unplug = XEN_UNPLUG_ALL; EXPORT_SYMBOL_GPL(xen_platform_pci_unplug); diff --git a/arch/arm64/include/asm/xen/events.h b/arch/arm64/include/asm/xen/events.h index 86553213c132..4318866d053c 100644 --- a/arch/arm64/include/asm/xen/events.h +++ b/arch/arm64/include/asm/xen/events.h @@ -18,4 +18,10 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) #define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) +/* Rebind event channel is supported by default */ +static inline bool xen_support_evtchn_rebind(void) +{ + return true; +} + #endif /* _ASM_ARM64_XEN_EVENTS_H */ diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h index 608a79d5a466..e6911caf5bbf 100644 --- a/arch/x86/include/asm/xen/events.h +++ b/arch/x86/include/asm/xen/events.h @@ -20,4 +20,15 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) /* No need for a barrier -- XCHG is a barrier on x86. */ #define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) +extern int xen_have_vector_callback; + +/* + * Events delivered via platform PCI interrupts are always + * routed to vcpu 0 and hence cannot be rebound. + */ +static inline bool xen_support_evtchn_rebind(void) +{ + return (!xen_hvm_domain() || xen_have_vector_callback); +} + #endif /* _ASM_X86_XEN_EVENTS_H */ diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 96093ae369a5..ed620e5857a1 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -1301,11 +1301,7 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) if (!VALID_EVTCHN(evtchn)) return -1; - /* - * Events delivered via platform PCI interrupts are always - * routed to vcpu 0 and hence cannot be rebound. - */ - if (xen_hvm_domain() && !xen_have_vector_callback) + if (!xen_support_evtchn_rebind()) return -1; /* Send future instances of this interrupt to other vcpu. */ diff --git a/include/xen/events.h b/include/xen/events.h index 7d95fdf9cf3e..88da2abaf535 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -92,7 +92,6 @@ void xen_hvm_callback_vector(void); #ifdef CONFIG_TRACING #define trace_xen_hvm_callback_vector xen_hvm_callback_vector #endif -extern int xen_have_vector_callback; int xen_set_callback_via(uint64_t via); void xen_evtchn_do_upcall(struct pt_regs *regs); void xen_hvm_evtchn_do_upcall(void); From 7ed208ef4ef9dbd03cda8a5b5a85cc78f79ef213 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Mon, 3 Aug 2015 09:50:55 +0000 Subject: [PATCH 05/33] arm/xen: Drop the definition of xen_pci_platform_unplug The commit 6f6c15ef912465b3aaafe709f39bd6026a8b3e72 "xen/pvhvm: Remove the xen_platform_pci int." makes the x86 version of xen_pci_platform_unplug static. Therefore we don't need anymore to define a dummy xen_pci_platform_unplug for ARM. Signed-off-by: Julien Grall Signed-off-by: Stefano Stabellini --- arch/arm/xen/enlighten.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index 40b961d8e953..c50c8d33f874 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -45,9 +45,6 @@ static struct vcpu_info __percpu *xen_vcpu_info; unsigned long xen_released_pages; struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; -int xen_platform_pci_unplug = XEN_UNPLUG_ALL; -EXPORT_SYMBOL_GPL(xen_platform_pci_unplug); - static __read_mostly unsigned int xen_events_irq; static __initdata struct device_node *xen_node; From 17fb46b1190b677a37cdd636e2aa30052109f51b Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 17 Jul 2015 06:51:22 +0200 Subject: [PATCH 06/33] xen: sync with xen headers Use the newest headers from the xen tree to get some new structure layouts. Signed-off-by: Juergen Gross Reviewed-by: David Vrabel Acked-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- arch/x86/include/asm/xen/interface.h | 96 +++++++++++++++++++++++++--- include/xen/interface/xen.h | 35 +++++----- 2 files changed, 107 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index 3400dbaec3c3..3b88eeacdbda 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -3,12 +3,38 @@ * * Guest OS interface to x86 Xen. * - * Copyright (c) 2004, K A Fraser + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2004-2006, K A Fraser */ #ifndef _ASM_X86_XEN_INTERFACE_H #define _ASM_X86_XEN_INTERFACE_H +/* + * XEN_GUEST_HANDLE represents a guest pointer, when passed as a field + * in a struct in memory. + * XEN_GUEST_HANDLE_PARAM represent a guest pointer, when passed as an + * hypercall argument. + * XEN_GUEST_HANDLE_PARAM and XEN_GUEST_HANDLE are the same on X86 but + * they might not be on other architectures. + */ #ifdef __XEN__ #define __DEFINE_GUEST_HANDLE(name, type) \ typedef struct { type *p; } __guest_handle_ ## name @@ -88,13 +114,16 @@ DEFINE_GUEST_HANDLE(xen_ulong_t); * start of the GDT because some stupid OSes export hard-coded selector values * in their ABI. These hard-coded values are always near the start of the GDT, * so Xen places itself out of the way, at the far end of the GDT. + * + * NB The LDT is set using the MMUEXT_SET_LDT op of HYPERVISOR_mmuext_op */ #define FIRST_RESERVED_GDT_PAGE 14 #define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096) #define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8) /* - * Send an array of these to HYPERVISOR_set_trap_table() + * Send an array of these to HYPERVISOR_set_trap_table(). + * Terminate the array with a sentinel entry, with traps[].address==0. * The privilege level specifies which modes may enter a trap via a software * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate * privilege levels as follows: @@ -118,10 +147,41 @@ struct trap_info { DEFINE_GUEST_HANDLE_STRUCT(trap_info); struct arch_shared_info { - unsigned long max_pfn; /* max pfn that appears in table */ - /* Frame containing list of mfns containing list of mfns containing p2m. */ - unsigned long pfn_to_mfn_frame_list_list; - unsigned long nmi_reason; + /* + * Number of valid entries in the p2m table(s) anchored at + * pfn_to_mfn_frame_list_list and/or p2m_vaddr. + */ + unsigned long max_pfn; + /* + * Frame containing list of mfns containing list of mfns containing p2m. + * A value of 0 indicates it has not yet been set up, ~0 indicates it + * has been set to invalid e.g. due to the p2m being too large for the + * 3-level p2m tree. In this case the linear mapper p2m list anchored + * at p2m_vaddr is to be used. + */ + xen_pfn_t pfn_to_mfn_frame_list_list; + unsigned long nmi_reason; + /* + * Following three fields are valid if p2m_cr3 contains a value + * different from 0. + * p2m_cr3 is the root of the address space where p2m_vaddr is valid. + * p2m_cr3 is in the same format as a cr3 value in the vcpu register + * state and holds the folded machine frame number (via xen_pfn_to_cr3) + * of a L3 or L4 page table. + * p2m_vaddr holds the virtual address of the linear p2m list. All + * entries in the range [0...max_pfn[ are accessible via this pointer. + * p2m_generation will be incremented by the guest before and after each + * change of the mappings of the p2m list. p2m_generation starts at 0 + * and a value with the least significant bit set indicates that a + * mapping update is in progress. This allows guest external software + * (e.g. in Dom0) to verify that read mappings are consistent and + * whether they have changed since the last check. + * Modifying a p2m element in the linear p2m list is allowed via an + * atomic write only. + */ + unsigned long p2m_cr3; /* cr3 value of the p2m address space */ + unsigned long p2m_vaddr; /* virtual address of the p2m list */ + unsigned long p2m_generation; /* generation count of p2m mapping */ }; #endif /* !__ASSEMBLY__ */ @@ -137,13 +197,31 @@ struct arch_shared_info { /* * The following is all CPU context. Note that the fpu_ctxt block is filled * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used. + * + * Also note that when calling DOMCTL_setvcpucontext and VCPU_initialise + * for HVM and PVH guests, not all information in this structure is updated: + * + * - For HVM guests, the structures read include: fpu_ctxt (if + * VGCT_I387_VALID is set), flags, user_regs, debugreg[*] + * + * - PVH guests are the same as HVM guests, but additionally use ctrlreg[3] to + * set cr3. All other fields not used should be set to 0. */ struct vcpu_guest_context { /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */ struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */ -#define VGCF_I387_VALID (1<<0) -#define VGCF_HVM_GUEST (1<<1) -#define VGCF_IN_KERNEL (1<<2) +#define VGCF_I387_VALID (1<<0) +#define VGCF_IN_KERNEL (1<<2) +#define _VGCF_i387_valid 0 +#define VGCF_i387_valid (1<<_VGCF_i387_valid) +#define _VGCF_in_kernel 2 +#define VGCF_in_kernel (1<<_VGCF_in_kernel) +#define _VGCF_failsafe_disables_events 3 +#define VGCF_failsafe_disables_events (1<<_VGCF_failsafe_disables_events) +#define _VGCF_syscall_disables_events 4 +#define VGCF_syscall_disables_events (1<<_VGCF_syscall_disables_events) +#define _VGCF_online 5 +#define VGCF_online (1<<_VGCF_online) unsigned long flags; /* VGCF_* flags */ struct cpu_user_regs user_regs; /* User-level CPU registers */ struct trap_info trap_ctxt[256]; /* Virtual IDT */ diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index a48378958062..8194270edcf0 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -585,26 +585,29 @@ struct shared_info { }; /* - * Start-of-day memory layout for the initial domain (DOM0): + * Start-of-day memory layout + * * 1. The domain is started within contiguous virtual-memory region. * 2. The contiguous region begins and ends on an aligned 4MB boundary. - * 3. The region start corresponds to the load address of the OS image. - * If the load address is not 4MB aligned then the address is rounded down. - * 4. This the order of bootstrap elements in the initial virtual region: + * 3. This the order of bootstrap elements in the initial virtual region: * a. relocated kernel image * b. initial ram disk [mod_start, mod_len] + * (may be omitted) * c. list of allocated page frames [mfn_list, nr_pages] + * (unless relocated due to XEN_ELFNOTE_INIT_P2M) * d. start_info_t structure [register ESI (x86)] - * e. bootstrap page tables [pt_base, CR3 (x86)] - * f. bootstrap stack [register ESP (x86)] - * 5. Bootstrap elements are packed together, but each is 4kB-aligned. - * 6. The initial ram disk may be omitted. - * 7. The list of page frames forms a contiguous 'pseudo-physical' memory + * in case of dom0 this page contains the console info, too + * e. unless dom0: xenstore ring page + * f. unless dom0: console ring page + * g. bootstrap page tables [pt_base, CR3 (x86)] + * h. bootstrap stack [register ESP (x86)] + * 4. Bootstrap elements are packed together, but each is 4kB-aligned. + * 5. The list of page frames forms a contiguous 'pseudo-physical' memory * layout for the domain. In particular, the bootstrap virtual-memory * region is a 1:1 mapping to the first section of the pseudo-physical map. - * 8. All bootstrap elements are mapped read-writable for the guest OS. The + * 6. All bootstrap elements are mapped read-writable for the guest OS. The * only exception is the bootstrap page table, which is mapped read-only. - * 9. There is guaranteed to be at least 512kB padding after the final + * 7. There is guaranteed to be at least 512kB padding after the final * bootstrap element. If necessary, the bootstrap virtual region is * extended by an extra 4MB to ensure this. */ @@ -641,10 +644,12 @@ struct start_info { }; /* These flags are passed in the 'flags' field of start_info_t. */ -#define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */ -#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */ -#define SIF_MULTIBOOT_MOD (1<<2) /* Is mod_start a multiboot module? */ -#define SIF_MOD_START_PFN (1<<3) /* Is mod_start a PFN? */ +#define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */ +#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */ +#define SIF_MULTIBOOT_MOD (1<<2) /* Is mod_start a multiboot module? */ +#define SIF_MOD_START_PFN (1<<3) /* Is mod_start a PFN? */ +#define SIF_VIRT_P2M_4TOOLS (1<<4) /* Do Xen tools understand a virt. mapped */ + /* P->M making the 3 level tree obsolete? */ #define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */ /* From 4b9c9a11803eaa73b3223da9fcaea39b2f919d80 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 17 Jul 2015 06:51:23 +0200 Subject: [PATCH 07/33] xen: save linear p2m list address in shared info structure The virtual address of the linear p2m list should be stored in the shared info structure read by the Xen tools to be able to support 64 bit pv-domains larger than 512 GB. Additionally the linear p2m list interface includes a generation count which is changed prior to and after each mapping change of the p2m list. Reading the generation count the Xen tools can detect changes of the mappings and re-read the p2m list eventually. Signed-off-by: Juergen Gross Reviewed-by: David Vrabel Acked-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- arch/x86/xen/p2m.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 8b7f18e200aa..b89983e9656f 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -263,6 +263,10 @@ void xen_setup_mfn_list_list(void) HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = virt_to_mfn(p2m_top_mfn); HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; + HYPERVISOR_shared_info->arch.p2m_generation = 0; + HYPERVISOR_shared_info->arch.p2m_vaddr = (unsigned long)xen_p2m_addr; + HYPERVISOR_shared_info->arch.p2m_cr3 = + xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); } /* Set up p2m_top to point to the domain-builder provided p2m pages */ @@ -478,8 +482,12 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg) ptechk = lookup_address(vaddr, &level); if (ptechk == pte_pg) { + HYPERVISOR_shared_info->arch.p2m_generation++; + wmb(); /* Tools are synchronizing via p2m_generation. */ set_pmd(pmdp, __pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE)); + wmb(); /* Tools are synchronizing via p2m_generation. */ + HYPERVISOR_shared_info->arch.p2m_generation++; pte_newpg[i] = NULL; } @@ -577,8 +585,12 @@ static bool alloc_p2m(unsigned long pfn) spin_lock_irqsave(&p2m_update_lock, flags); if (pte_pfn(*ptep) == p2m_pfn) { + HYPERVISOR_shared_info->arch.p2m_generation++; + wmb(); /* Tools are synchronizing via p2m_generation. */ set_pte(ptep, pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL)); + wmb(); /* Tools are synchronizing via p2m_generation. */ + HYPERVISOR_shared_info->arch.p2m_generation++; if (mid_mfn) mid_mfn[mididx] = virt_to_mfn(p2m); p2m = NULL; @@ -630,6 +642,11 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) return true; } + /* + * The interface requires atomic updates on p2m elements. + * xen_safe_write_ulong() is using __put_user which does an atomic + * store via asm(). + */ if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn))) return true; From d51e8b3e85972dee10be7943b0b0106742b1e847 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 17 Jul 2015 06:51:24 +0200 Subject: [PATCH 08/33] xen: don't build mfn tree if tools don't need it In case the Xen tools indicate they don't need the p2m 3 level tree as they support the virtual mapped linear p2m list, just omit building the tree. Signed-off-by: Juergen Gross Reviewed-by: David Vrabel Acked-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- arch/x86/xen/p2m.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index b89983e9656f..c719f7c36cb8 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -199,7 +199,8 @@ void __ref xen_build_mfn_list_list(void) unsigned int level, topidx, mididx; unsigned long *mid_mfn_p; - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (xen_feature(XENFEAT_auto_translated_physmap) || + xen_start_info->flags & SIF_VIRT_P2M_4TOOLS) return; /* Pre-initialize p2m_top_mfn to be completely missing */ @@ -260,8 +261,11 @@ void xen_setup_mfn_list_list(void) BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = - virt_to_mfn(p2m_top_mfn); + if (xen_start_info->flags & SIF_VIRT_P2M_4TOOLS) + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = ~0UL; + else + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = + virt_to_mfn(p2m_top_mfn); HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; HYPERVISOR_shared_info->arch.p2m_generation = 0; HYPERVISOR_shared_info->arch.p2m_vaddr = (unsigned long)xen_p2m_addr; From 8f5b0c63987207fd5c3c1f89c9eb6cb95b30386e Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 17 Jul 2015 06:51:25 +0200 Subject: [PATCH 09/33] xen: eliminate scalability issues from initial mapping setup Direct Xen to place the initial P->M table outside of the initial mapping, as otherwise the 1G (implementation) / 2G (theoretical) restriction on the size of the initial mapping limits the amount of memory a domain can be handed initially. As the initial P->M table is copied rather early during boot to domain private memory and it's initial virtual mapping is dropped, the easiest way to avoid virtual address conflicts with other addresses in the kernel is to use a user address area for the virtual address of the initial P->M table. This allows us to just throw away the page tables of the initial mapping after the copy without having to care about address invalidation. It should be noted that this patch won't enable a pv-domain to USE more than 512 GB of RAM. It just enables it to be started with a P->M table covering more memory. This is especially important for being able to boot a Dom0 on a system with more than 512 GB memory. Signed-off-by: Juergen Gross Based-on-patch-by: Jan Beulich Acked-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- arch/x86/xen/mmu.c | 126 ++++++++++++++++++++++++++++++++++++---- arch/x86/xen/setup.c | 67 ++++++++++++--------- arch/x86/xen/xen-head.S | 2 + 3 files changed, 156 insertions(+), 39 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index dd151b2045b0..c04e14e6b301 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1114,6 +1114,77 @@ static void __init xen_cleanhighmap(unsigned long vaddr, xen_mc_flush(); } +/* + * Make a page range writeable and free it. + */ +static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size) +{ + void *vaddr = __va(paddr); + void *vaddr_end = vaddr + size; + + for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) + make_lowmem_page_readwrite(vaddr); + + memblock_free(paddr, size); +} + +static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl) +{ + unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK; + + ClearPagePinned(virt_to_page(__va(pa))); + xen_free_ro_pages(pa, PAGE_SIZE); +} + +/* + * Since it is well isolated we can (and since it is perhaps large we should) + * also free the page tables mapping the initial P->M table. + */ +static void __init xen_cleanmfnmap(unsigned long vaddr) +{ + unsigned long va = vaddr & PMD_MASK; + unsigned long pa; + pgd_t *pgd = pgd_offset_k(va); + pud_t *pud_page = pud_offset(pgd, 0); + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned int i; + + set_pgd(pgd, __pgd(0)); + do { + pud = pud_page + pud_index(va); + if (pud_none(*pud)) { + va += PUD_SIZE; + } else if (pud_large(*pud)) { + pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, PUD_SIZE); + va += PUD_SIZE; + } else { + pmd = pmd_offset(pud, va); + if (pmd_large(*pmd)) { + pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, PMD_SIZE); + } else if (!pmd_none(*pmd)) { + pte = pte_offset_kernel(pmd, va); + for (i = 0; i < PTRS_PER_PTE; ++i) { + if (pte_none(pte[i])) + break; + pa = pte_pfn(pte[i]) << PAGE_SHIFT; + xen_free_ro_pages(pa, PAGE_SIZE); + } + xen_cleanmfnmap_free_pgtbl(pte); + } + va += PMD_SIZE; + if (pmd_index(va)) + continue; + xen_cleanmfnmap_free_pgtbl(pmd); + } + + } while (pud_index(va) || pmd_index(va)); + xen_cleanmfnmap_free_pgtbl(pud_page); +} + static void __init xen_pagetable_p2m_free(void) { unsigned long size; @@ -1128,18 +1199,25 @@ static void __init xen_pagetable_p2m_free(void) /* using __ka address and sticking INVALID_P2M_ENTRY! */ memset((void *)xen_start_info->mfn_list, 0xff, size); - /* We should be in __ka space. */ - BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); addr = xen_start_info->mfn_list; - /* We roundup to the PMD, which means that if anybody at this stage is - * using the __ka address of xen_start_info or xen_start_info->shared_info - * they are in going to crash. Fortunatly we have already revectored - * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ + /* + * We could be in __ka space. + * We roundup to the PMD, which means that if anybody at this stage is + * using the __ka address of xen_start_info or + * xen_start_info->shared_info they are in going to crash. Fortunatly + * we have already revectored in xen_setup_kernel_pagetable and in + * xen_setup_shared_info. + */ size = roundup(size, PMD_SIZE); - xen_cleanhighmap(addr, addr + size); - size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - memblock_free(__pa(xen_start_info->mfn_list), size); + if (addr >= __START_KERNEL_map) { + xen_cleanhighmap(addr, addr + size); + size = PAGE_ALIGN(xen_start_info->nr_pages * + sizeof(unsigned long)); + memblock_free(__pa(addr), size); + } else { + xen_cleanmfnmap(addr); + } /* At this stage, cleanup_highmap has already cleaned __ka space * from _brk_limit way up to the max_pfn_mapped (which is the end of @@ -1461,6 +1539,24 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) #else /* CONFIG_X86_64 */ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) { + unsigned long pfn; + + if (xen_feature(XENFEAT_writable_page_tables) || + xen_feature(XENFEAT_auto_translated_physmap) || + xen_start_info->mfn_list >= __START_KERNEL_map) + return pte; + + /* + * Pages belonging to the initial p2m list mapped outside the default + * address range must be mapped read-only. This region contains the + * page tables for mapping the p2m list, too, and page tables MUST be + * mapped read-only. + */ + pfn = pte_pfn(pte); + if (pfn >= xen_start_info->first_p2m_pfn && + pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames) + pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW); + return pte; } #endif /* CONFIG_X86_64 */ @@ -1815,7 +1911,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) * mappings. Considering that on Xen after the kernel mappings we * have the mappings of some pages that don't exist in pfn space, we * set max_pfn_mapped to the last real pfn mapped. */ - max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); + if (xen_start_info->mfn_list < __START_KERNEL_map) + max_pfn_mapped = xen_start_info->first_p2m_pfn; + else + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); pt_base = PFN_DOWN(__pa(xen_start_info->pt_base)); pt_end = pt_base + xen_start_info->nr_pt_frames; @@ -1855,6 +1954,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Graft it onto L4[511][510] */ copy_page(level2_kernel_pgt, l2); + /* Copy the initial P->M table mappings if necessary. */ + i = pgd_index(xen_start_info->mfn_list); + if (i && i < pgd_index(__START_KERNEL_map)) + init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i]; + if (!xen_feature(XENFEAT_auto_translated_physmap)) { /* Make pagetable pieces RO */ set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); @@ -1895,6 +1999,8 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Our (by three pages) smaller Xen pagetable that we are using */ memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE); + /* protect xen_start_info */ + memblock_reserve(__pa(xen_start_info), PAGE_SIZE); /* Revector the xen_start_info */ xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); } diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 55f388ef481a..adad417ca1ba 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -560,6 +560,41 @@ static void __init xen_ignore_unusable(struct e820entry *list, size_t map_size) } } +/* + * Reserve Xen mfn_list. + * See comment above "struct start_info" in + * We tried to make the the memblock_reserve more selective so + * that it would be clear what region is reserved. Sadly we ran + * in the problem wherein on a 64-bit hypervisor with a 32-bit + * initial domain, the pt_base has the cr3 value which is not + * neccessarily where the pagetable starts! As Jan put it: " + * Actually, the adjustment turns out to be correct: The page + * tables for a 32-on-64 dom0 get allocated in the order "first L1", + * "first L2", "first L3", so the offset to the page table base is + * indeed 2. When reading xen/include/public/xen.h's comment + * very strictly, this is not a violation (since there nothing is said + * that the first thing in the page table space is pointed to by + * pt_base; I admit that this seems to be implied though, namely + * do I think that it is implied that the page table space is the + * range [pt_base, pt_base + nt_pt_frames), whereas that + * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames), + * which - without a priori knowledge - the kernel would have + * difficulty to figure out)." - so lets just fall back to the + * easy way and reserve the whole region. + */ +static void __init xen_reserve_xen_mfnlist(void) +{ + if (xen_start_info->mfn_list >= __START_KERNEL_map) { + memblock_reserve(__pa(xen_start_info->mfn_list), + xen_start_info->pt_base - + xen_start_info->mfn_list); + return; + } + + memblock_reserve(PFN_PHYS(xen_start_info->first_p2m_pfn), + PFN_PHYS(xen_start_info->nr_p2m_frames)); +} + /** * machine_specific_memory_setup - Hook for machine specific memory setup. **/ @@ -684,35 +719,10 @@ char * __init xen_memory_setup(void) e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_RESERVED); - /* - * Reserve Xen bits: - * - mfn_list - * - xen_start_info - * See comment above "struct start_info" in - * We tried to make the the memblock_reserve more selective so - * that it would be clear what region is reserved. Sadly we ran - * in the problem wherein on a 64-bit hypervisor with a 32-bit - * initial domain, the pt_base has the cr3 value which is not - * neccessarily where the pagetable starts! As Jan put it: " - * Actually, the adjustment turns out to be correct: The page - * tables for a 32-on-64 dom0 get allocated in the order "first L1", - * "first L2", "first L3", so the offset to the page table base is - * indeed 2. When reading xen/include/public/xen.h's comment - * very strictly, this is not a violation (since there nothing is said - * that the first thing in the page table space is pointed to by - * pt_base; I admit that this seems to be implied though, namely - * do I think that it is implied that the page table space is the - * range [pt_base, pt_base + nt_pt_frames), whereas that - * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames), - * which - without a priori knowledge - the kernel would have - * difficulty to figure out)." - so lets just fall back to the - * easy way and reserve the whole region. - */ - memblock_reserve(__pa(xen_start_info->mfn_list), - xen_start_info->pt_base - xen_start_info->mfn_list); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + xen_reserve_xen_mfnlist(); + return "Xen"; } @@ -739,8 +749,7 @@ char * __init xen_auto_xlated_memory_setup(void) for (i = 0; i < memmap.nr_entries; i++) e820_add_region(map[i].addr, map[i].size, map[i].type); - memblock_reserve(__pa(xen_start_info->mfn_list), - xen_start_info->pt_base - xen_start_info->mfn_list); + xen_reserve_xen_mfnlist(); return "Xen"; } diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 8afdfccf6086..b65f59a358a2 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -104,6 +104,8 @@ ENTRY(hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET) #else ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map) + /* Map the p2m table to a 512GB-aligned user address. */ + ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad PGDIR_SIZE) #endif ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) From 69632ecfcd03b12202ed62dfa0aabac83904f8ac Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 17 Jul 2015 06:51:26 +0200 Subject: [PATCH 10/33] xen: move static e820 map to global scope Instead of using a function local static e820 map in xen_memory_setup() and calling various functions in the same source with the map as a parameter use a map directly accessible by all functions in the source. Signed-off-by: Juergen Gross Reviewed-by: David Vrabel Acked-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- arch/x86/xen/setup.c | 96 ++++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 47 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index adad417ca1ba..ab6c36e1801b 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -38,6 +38,10 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; /* Number of pages released from the initial allocation. */ unsigned long xen_released_pages; +/* E820 map used during setting up memory. */ +static struct e820entry xen_e820_map[E820MAX] __initdata; +static u32 xen_e820_map_entries __initdata; + /* * Buffer used to remap identity mapped pages. We only need the virtual space. * The physical page behind this address is remapped as needed to different @@ -164,15 +168,13 @@ void __init xen_inv_extra_mem(void) * This function updates min_pfn with the pfn found and returns * the size of that range or zero if not found. */ -static unsigned long __init xen_find_pfn_range( - const struct e820entry *list, size_t map_size, - unsigned long *min_pfn) +static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn) { - const struct e820entry *entry; + const struct e820entry *entry = xen_e820_map; unsigned int i; unsigned long done = 0; - for (i = 0, entry = list; i < map_size; i++, entry++) { + for (i = 0; i < xen_e820_map_entries; i++, entry++) { unsigned long s_pfn; unsigned long e_pfn; @@ -356,9 +358,9 @@ static void __init xen_do_set_identity_and_remap_chunk( * to Xen and not remapped. */ static unsigned long __init xen_set_identity_and_remap_chunk( - const struct e820entry *list, size_t map_size, unsigned long start_pfn, - unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn, - unsigned long *released, unsigned long *remapped) + unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, + unsigned long remap_pfn, unsigned long *released, + unsigned long *remapped) { unsigned long pfn; unsigned long i = 0; @@ -379,8 +381,7 @@ static unsigned long __init xen_set_identity_and_remap_chunk( if (cur_pfn + size > nr_pages) size = nr_pages - cur_pfn; - remap_range_size = xen_find_pfn_range(list, map_size, - &remap_pfn); + remap_range_size = xen_find_pfn_range(&remap_pfn); if (!remap_range_size) { pr_warning("Unable to find available pfn range, not remapping identity pages\n"); xen_set_identity_and_release_chunk(cur_pfn, @@ -411,13 +412,12 @@ static unsigned long __init xen_set_identity_and_remap_chunk( return remap_pfn; } -static void __init xen_set_identity_and_remap( - const struct e820entry *list, size_t map_size, unsigned long nr_pages, - unsigned long *released, unsigned long *remapped) +static void __init xen_set_identity_and_remap(unsigned long nr_pages, + unsigned long *released, unsigned long *remapped) { phys_addr_t start = 0; unsigned long last_pfn = nr_pages; - const struct e820entry *entry; + const struct e820entry *entry = xen_e820_map; unsigned long num_released = 0; unsigned long num_remapped = 0; int i; @@ -433,9 +433,9 @@ static void __init xen_set_identity_and_remap( * example) the DMI tables in a reserved region that begins on * a non-page boundary. */ - for (i = 0, entry = list; i < map_size; i++, entry++) { + for (i = 0; i < xen_e820_map_entries; i++, entry++) { phys_addr_t end = entry->addr + entry->size; - if (entry->type == E820_RAM || i == map_size - 1) { + if (entry->type == E820_RAM || i == xen_e820_map_entries - 1) { unsigned long start_pfn = PFN_DOWN(start); unsigned long end_pfn = PFN_UP(end); @@ -444,9 +444,9 @@ static void __init xen_set_identity_and_remap( if (start_pfn < end_pfn) last_pfn = xen_set_identity_and_remap_chunk( - list, map_size, start_pfn, - end_pfn, nr_pages, last_pfn, - &num_released, &num_remapped); + start_pfn, end_pfn, nr_pages, + last_pfn, &num_released, + &num_remapped); start = end; } } @@ -549,12 +549,12 @@ static void __init xen_align_and_add_e820_region(phys_addr_t start, e820_add_region(start, end - start, type); } -static void __init xen_ignore_unusable(struct e820entry *list, size_t map_size) +static void __init xen_ignore_unusable(void) { - struct e820entry *entry; + struct e820entry *entry = xen_e820_map; unsigned int i; - for (i = 0, entry = list; i < map_size; i++, entry++) { + for (i = 0; i < xen_e820_map_entries; i++, entry++) { if (entry->type == E820_UNUSABLE) entry->type = E820_RAM; } @@ -600,8 +600,6 @@ static void __init xen_reserve_xen_mfnlist(void) **/ char * __init xen_memory_setup(void) { - static struct e820entry map[E820MAX] __initdata; - unsigned long max_pfn = xen_start_info->nr_pages; phys_addr_t mem_end; int rc; @@ -616,7 +614,7 @@ char * __init xen_memory_setup(void) mem_end = PFN_PHYS(max_pfn); memmap.nr_entries = E820MAX; - set_xen_guest_handle(memmap.buffer, map); + set_xen_guest_handle(memmap.buffer, xen_e820_map); op = xen_initial_domain() ? XENMEM_machine_memory_map : @@ -625,15 +623,16 @@ char * __init xen_memory_setup(void) if (rc == -ENOSYS) { BUG_ON(xen_initial_domain()); memmap.nr_entries = 1; - map[0].addr = 0ULL; - map[0].size = mem_end; + xen_e820_map[0].addr = 0ULL; + xen_e820_map[0].size = mem_end; /* 8MB slack (to balance backend allocations). */ - map[0].size += 8ULL << 20; - map[0].type = E820_RAM; + xen_e820_map[0].size += 8ULL << 20; + xen_e820_map[0].type = E820_RAM; rc = 0; } BUG_ON(rc); BUG_ON(memmap.nr_entries == 0); + xen_e820_map_entries = memmap.nr_entries; /* * Xen won't allow a 1:1 mapping to be created to UNUSABLE @@ -644,10 +643,11 @@ char * __init xen_memory_setup(void) * a patch in the future. */ if (xen_initial_domain()) - xen_ignore_unusable(map, memmap.nr_entries); + xen_ignore_unusable(); /* Make sure the Xen-supplied memory map is well-ordered. */ - sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries); + sanitize_e820_map(xen_e820_map, xen_e820_map_entries, + &xen_e820_map_entries); max_pages = xen_get_max_pages(); if (max_pages > max_pfn) @@ -657,8 +657,8 @@ char * __init xen_memory_setup(void) * Set identity map on non-RAM pages and prepare remapping the * underlying RAM. */ - xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn, - &xen_released_pages, &remapped_pages); + xen_set_identity_and_remap(max_pfn, &xen_released_pages, + &remapped_pages); extra_pages += xen_released_pages; extra_pages += remapped_pages; @@ -677,10 +677,10 @@ char * __init xen_memory_setup(void) extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), extra_pages); i = 0; - while (i < memmap.nr_entries) { - phys_addr_t addr = map[i].addr; - phys_addr_t size = map[i].size; - u32 type = map[i].type; + while (i < xen_e820_map_entries) { + phys_addr_t addr = xen_e820_map[i].addr; + phys_addr_t size = xen_e820_map[i].size; + u32 type = xen_e820_map[i].type; if (type == E820_RAM) { if (addr < mem_end) { @@ -696,9 +696,9 @@ char * __init xen_memory_setup(void) xen_align_and_add_e820_region(addr, size, type); - map[i].addr += size; - map[i].size -= size; - if (map[i].size == 0) + xen_e820_map[i].addr += size; + xen_e820_map[i].size -= size; + if (xen_e820_map[i].size == 0) i++; } @@ -709,7 +709,7 @@ char * __init xen_memory_setup(void) * PFNs above MAX_P2M_PFN are considered identity mapped as * well. */ - set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul); + set_phys_range_identity(xen_e820_map[i - 1].addr / PAGE_SIZE, ~0ul); /* * In domU, the ISA region is normal, usable memory, but we @@ -731,23 +731,25 @@ char * __init xen_memory_setup(void) */ char * __init xen_auto_xlated_memory_setup(void) { - static struct e820entry map[E820MAX] __initdata; - struct xen_memory_map memmap; int i; int rc; memmap.nr_entries = E820MAX; - set_xen_guest_handle(memmap.buffer, map); + set_xen_guest_handle(memmap.buffer, xen_e820_map); rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); if (rc < 0) panic("No memory map (%d)\n", rc); - sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries); + xen_e820_map_entries = memmap.nr_entries; - for (i = 0; i < memmap.nr_entries; i++) - e820_add_region(map[i].addr, map[i].size, map[i].type); + sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map), + &xen_e820_map_entries); + + for (i = 0; i < xen_e820_map_entries; i++) + e820_add_region(xen_e820_map[i].addr, xen_e820_map[i].size, + xen_e820_map[i].type); xen_reserve_xen_mfnlist(); From 5097cdf6cef15439f971df54f9abcf143d7ca698 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 17 Jul 2015 06:51:27 +0200 Subject: [PATCH 11/33] xen: split counting of extra memory pages from remapping Memory pages in the initial memory setup done by the Xen hypervisor conflicting with the target E820 map are remapped. In order to do this those pages are counted and remapped in xen_set_identity_and_remap(). Split the counting from the remapping operation to be able to setup the needed memory sizes in time but doing the remap operation at a later time. This enables us to simplify the interface to xen_set_identity_and_remap() as the number of remapped and released pages is no longer needed here. Finally move the remapping further down to prepare relocating conflicting memory contents before the memory might be clobbered by xen_set_identity_and_remap(). This requires to not destroy the Xen E820 map when the one for the system is being constructed. Signed-off-by: Juergen Gross Acked-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- arch/x86/xen/setup.c | 98 ++++++++++++++++++++++++++------------------ 1 file changed, 58 insertions(+), 40 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index ab6c36e1801b..87251b4c2e30 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -223,7 +223,7 @@ static int __init xen_free_mfn(unsigned long mfn) * as a fallback if the remapping fails. */ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, - unsigned long end_pfn, unsigned long nr_pages, unsigned long *released) + unsigned long end_pfn, unsigned long nr_pages) { unsigned long pfn, end; int ret; @@ -243,7 +243,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret); if (ret == 1) { - (*released)++; + xen_released_pages++; if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY)) break; } else @@ -359,8 +359,7 @@ static void __init xen_do_set_identity_and_remap_chunk( */ static unsigned long __init xen_set_identity_and_remap_chunk( unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, - unsigned long remap_pfn, unsigned long *released, - unsigned long *remapped) + unsigned long remap_pfn) { unsigned long pfn; unsigned long i = 0; @@ -385,7 +384,7 @@ static unsigned long __init xen_set_identity_and_remap_chunk( if (!remap_range_size) { pr_warning("Unable to find available pfn range, not remapping identity pages\n"); xen_set_identity_and_release_chunk(cur_pfn, - cur_pfn + left, nr_pages, released); + cur_pfn + left, nr_pages); break; } /* Adjust size to fit in current e820 RAM region */ @@ -397,7 +396,6 @@ static unsigned long __init xen_set_identity_and_remap_chunk( /* Update variables to reflect new mappings. */ i += size; remap_pfn += size; - *remapped += size; } /* @@ -412,14 +410,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk( return remap_pfn; } -static void __init xen_set_identity_and_remap(unsigned long nr_pages, - unsigned long *released, unsigned long *remapped) +static void __init xen_set_identity_and_remap(unsigned long nr_pages) { phys_addr_t start = 0; unsigned long last_pfn = nr_pages; const struct e820entry *entry = xen_e820_map; - unsigned long num_released = 0; - unsigned long num_remapped = 0; int i; /* @@ -445,16 +440,12 @@ static void __init xen_set_identity_and_remap(unsigned long nr_pages, if (start_pfn < end_pfn) last_pfn = xen_set_identity_and_remap_chunk( start_pfn, end_pfn, nr_pages, - last_pfn, &num_released, - &num_remapped); + last_pfn); start = end; } } - *released = num_released; - *remapped = num_remapped; - - pr_info("Released %ld page(s)\n", num_released); + pr_info("Released %ld page(s)\n", xen_released_pages); } /* @@ -560,6 +551,28 @@ static void __init xen_ignore_unusable(void) } } +static unsigned long __init xen_count_remap_pages(unsigned long max_pfn) +{ + unsigned long extra = 0; + const struct e820entry *entry = xen_e820_map; + int i; + + for (i = 0; i < xen_e820_map_entries; i++, entry++) { + unsigned long start_pfn = PFN_DOWN(entry->addr); + unsigned long end_pfn = PFN_UP(entry->addr + entry->size); + + if (start_pfn >= max_pfn) + break; + if (entry->type == E820_RAM) + continue; + if (end_pfn >= max_pfn) + end_pfn = max_pfn; + extra += end_pfn - start_pfn; + } + + return extra; +} + /* * Reserve Xen mfn_list. * See comment above "struct start_info" in @@ -601,12 +614,12 @@ static void __init xen_reserve_xen_mfnlist(void) char * __init xen_memory_setup(void) { unsigned long max_pfn = xen_start_info->nr_pages; - phys_addr_t mem_end; + phys_addr_t mem_end, addr, size, chunk_size; + u32 type; int rc; struct xen_memory_map memmap; unsigned long max_pages; unsigned long extra_pages = 0; - unsigned long remapped_pages; int i; int op; @@ -653,15 +666,8 @@ char * __init xen_memory_setup(void) if (max_pages > max_pfn) extra_pages += max_pages - max_pfn; - /* - * Set identity map on non-RAM pages and prepare remapping the - * underlying RAM. - */ - xen_set_identity_and_remap(max_pfn, &xen_released_pages, - &remapped_pages); - - extra_pages += xen_released_pages; - extra_pages += remapped_pages; + /* How many extra pages do we need due to remapping? */ + extra_pages += xen_count_remap_pages(max_pfn); /* * Clamp the amount of extra memory to a EXTRA_MEM_RATIO @@ -677,29 +683,35 @@ char * __init xen_memory_setup(void) extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), extra_pages); i = 0; + addr = xen_e820_map[0].addr; + size = xen_e820_map[0].size; while (i < xen_e820_map_entries) { - phys_addr_t addr = xen_e820_map[i].addr; - phys_addr_t size = xen_e820_map[i].size; - u32 type = xen_e820_map[i].type; + chunk_size = size; + type = xen_e820_map[i].type; if (type == E820_RAM) { if (addr < mem_end) { - size = min(size, mem_end - addr); + chunk_size = min(size, mem_end - addr); } else if (extra_pages) { - size = min(size, PFN_PHYS(extra_pages)); - extra_pages -= PFN_DOWN(size); - xen_add_extra_mem(addr, size); - xen_max_p2m_pfn = PFN_DOWN(addr + size); + chunk_size = min(size, PFN_PHYS(extra_pages)); + extra_pages -= PFN_DOWN(chunk_size); + xen_add_extra_mem(addr, chunk_size); + xen_max_p2m_pfn = PFN_DOWN(addr + chunk_size); } else type = E820_UNUSABLE; } - xen_align_and_add_e820_region(addr, size, type); + xen_align_and_add_e820_region(addr, chunk_size, type); - xen_e820_map[i].addr += size; - xen_e820_map[i].size -= size; - if (xen_e820_map[i].size == 0) + addr += chunk_size; + size -= chunk_size; + if (size == 0) { i++; + if (i < xen_e820_map_entries) { + addr = xen_e820_map[i].addr; + size = xen_e820_map[i].size; + } + } } /* @@ -709,7 +721,7 @@ char * __init xen_memory_setup(void) * PFNs above MAX_P2M_PFN are considered identity mapped as * well. */ - set_phys_range_identity(xen_e820_map[i - 1].addr / PAGE_SIZE, ~0ul); + set_phys_range_identity(addr / PAGE_SIZE, ~0ul); /* * In domU, the ISA region is normal, usable memory, but we @@ -723,6 +735,12 @@ char * __init xen_memory_setup(void) xen_reserve_xen_mfnlist(); + /* + * Set identity map on non-RAM pages and prepare remapping the + * underlying RAM. + */ + xen_set_identity_and_remap(max_pfn); + return "Xen"; } From e612b4a7db4ae1dd8c2bbe171e10c21723de95b2 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 17 Jul 2015 06:51:28 +0200 Subject: [PATCH 12/33] xen: check memory area against e820 map Provide a service routine to check a physical memory area against the E820 map. The routine will return false if the complete area is RAM according to the E820 map and true otherwise. Signed-off-by: Juergen Gross Reviewed-by: David Vrabel Acked-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- arch/x86/xen/setup.c | 23 +++++++++++++++++++++++ arch/x86/xen/xen-ops.h | 1 + 2 files changed, 24 insertions(+) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 87251b4c2e30..99ef82cc4edc 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -573,6 +573,29 @@ static unsigned long __init xen_count_remap_pages(unsigned long max_pfn) return extra; } +bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) +{ + struct e820entry *entry; + unsigned mapcnt; + phys_addr_t end; + + if (!size) + return false; + + end = start + size; + entry = xen_e820_map; + + for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++) { + if (entry->type == E820_RAM && entry->addr <= start && + (entry->addr + entry->size) >= end) + return false; + + entry++; + } + + return true; +} + /* * Reserve Xen mfn_list. * See comment above "struct start_info" in diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 2292721b1d10..a4cbb76642d8 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -39,6 +39,7 @@ void xen_reserve_top(void); void xen_mm_pin_all(void); void xen_mm_unpin_all(void); +bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size); unsigned long __ref xen_chk_extra_mem(unsigned long pfn); void __init xen_inv_extra_mem(void); void __init xen_remap_memory(void); From 9ddac5b724a9465e27f25a0aa943e92c8341a85b Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 17 Jul 2015 06:51:29 +0200 Subject: [PATCH 13/33] xen: find unused contiguous memory area For being able to relocate pre-allocated data areas like initrd or p2m list it is mandatory to find a contiguous memory area which is not yet in use and doesn't conflict with the memory map we want to be in effect. In case such an area is found reserve it at once as this will be required to be done in any case. Signed-off-by: Juergen Gross Reviewed-by: David Vrabel Acked-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- arch/x86/xen/setup.c | 34 ++++++++++++++++++++++++++++++++++ arch/x86/xen/xen-ops.h | 1 + 2 files changed, 35 insertions(+) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 99ef82cc4edc..973d29441f96 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -596,6 +596,40 @@ bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) return true; } +/* + * Find a free area in physical memory not yet reserved and compliant with + * E820 map. + * Used to relocate pre-allocated areas like initrd or p2m list which are in + * conflict with the to be used E820 map. + * In case no area is found, return 0. Otherwise return the physical address + * of the area which is already reserved for convenience. + */ +phys_addr_t __init xen_find_free_area(phys_addr_t size) +{ + unsigned mapcnt; + phys_addr_t addr, start; + struct e820entry *entry = xen_e820_map; + + for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++, entry++) { + if (entry->type != E820_RAM || entry->size < size) + continue; + start = entry->addr; + for (addr = start; addr < start + size; addr += PAGE_SIZE) { + if (!memblock_is_reserved(addr)) + continue; + start = addr + PAGE_SIZE; + if (start + size > entry->addr + entry->size) + break; + } + if (addr >= start + size) { + memblock_reserve(start, size); + return start; + } + } + + return 0; +} + /* * Reserve Xen mfn_list. * See comment above "struct start_info" in diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index a4cbb76642d8..c4c3c23597a5 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -43,6 +43,7 @@ bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size); unsigned long __ref xen_chk_extra_mem(unsigned long pfn); void __init xen_inv_extra_mem(void); void __init xen_remap_memory(void); +phys_addr_t __init xen_find_free_area(phys_addr_t size); char * __init xen_memory_setup(void); char * xen_auto_xlated_memory_setup(void); void __init xen_arch_setup(void); From 808fdb71936c41d46245f0e3aa6ec889cba70d97 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 17 Jul 2015 06:51:30 +0200 Subject: [PATCH 14/33] xen: check for kernel memory conflicting with memory layout Checks whether the pre-allocated memory of the loaded kernel is in conflict with the target memory map. If this is the case, just panic instead of run into problems later, as there is nothing we can do to repair this situation. Signed-off-by: Juergen Gross Reviewed-by: David Vrabel Acked-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- arch/x86/xen/setup.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 973d29441f96..9bd3f358f3d9 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "xen-ops.h" #include "vdso.h" #include "p2m.h" @@ -790,6 +791,17 @@ char * __init xen_memory_setup(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + /* + * Check whether the kernel itself conflicts with the target E820 map. + * Failing now is better than running into weird problems later due + * to relocating (and even reusing) pages with kernel text or data. + */ + if (xen_is_e820_reserved(__pa_symbol(_text), + __pa_symbol(__bss_stop) - __pa_symbol(_text))) { + xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n"); + BUG(); + } + xen_reserve_xen_mfnlist(); /* From 04414baab5ba862b10bde837c4773ffdbb78f0e0 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 17 Jul 2015 06:51:31 +0200 Subject: [PATCH 15/33] xen: check pre-allocated page tables for conflict with memory map Check whether the page tables built by the domain builder are at memory addresses which are in conflict with the target memory map. If this is the case just panic instead of running into problems later. Signed-off-by: Juergen Gross Acked-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- arch/x86/xen/mmu.c | 19 ++++++++++++++++--- arch/x86/xen/setup.c | 6 ++++++ arch/x86/xen/xen-ops.h | 1 + 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index c04e14e6b301..1982617fa9c7 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -116,6 +116,7 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ +static phys_addr_t xen_pt_base, xen_pt_size __initdata; /* * Just beyond the highest usermode address. STACK_TOP_MAX has a @@ -1998,7 +1999,9 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) check_pt_base(&pt_base, &pt_end, addr[i]); /* Our (by three pages) smaller Xen pagetable that we are using */ - memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE); + xen_pt_base = PFN_PHYS(pt_base); + xen_pt_size = (pt_end - pt_base) * PAGE_SIZE; + memblock_reserve(xen_pt_base, xen_pt_size); /* protect xen_start_info */ memblock_reserve(__pa(xen_start_info), PAGE_SIZE); /* Revector the xen_start_info */ @@ -2074,11 +2077,21 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) PFN_DOWN(__pa(initial_page_table))); xen_write_cr3(__pa(initial_page_table)); - memblock_reserve(__pa(xen_start_info->pt_base), - xen_start_info->nr_pt_frames * PAGE_SIZE); + xen_pt_base = __pa(xen_start_info->pt_base); + xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE; + + memblock_reserve(xen_pt_base, xen_pt_size); } #endif /* CONFIG_X86_64 */ +void __init xen_pt_check_e820(void) +{ + if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) { + xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n"); + BUG(); + } +} + static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 9bd3f358f3d9..3fca9c114828 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -802,6 +802,12 @@ char * __init xen_memory_setup(void) BUG(); } + /* + * Check for a conflict of the hypervisor supplied page tables with + * the target E820 map. + */ + xen_pt_check_e820(); + xen_reserve_xen_mfnlist(); /* diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index c4c3c23597a5..9038a0f1d5a1 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -35,6 +35,7 @@ void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_reserve_top(void); +void __init xen_pt_check_e820(void); void xen_mm_pin_all(void); void xen_mm_unpin_all(void); From 4b9c15377f96e241be347fd3bbeeff74fbad0b44 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 17 Jul 2015 06:51:32 +0200 Subject: [PATCH 16/33] xen: check for initrd conflicting with e820 map Check whether the initrd is placed at a location which is conflicting with the target E820 map. If this is the case relocate it to a new area unused up to now and compliant to the E820 map. Signed-off-by: Juergen Gross Reviewed-by: David Vrabel Acked-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- arch/x86/xen/setup.c | 51 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 3fca9c114828..0d7f8818f8ca 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -631,6 +631,36 @@ phys_addr_t __init xen_find_free_area(phys_addr_t size) return 0; } +/* + * Like memcpy, but with physical addresses for dest and src. + */ +static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src, + phys_addr_t n) +{ + phys_addr_t dest_off, src_off, dest_len, src_len, len; + void *from, *to; + + while (n) { + dest_off = dest & ~PAGE_MASK; + src_off = src & ~PAGE_MASK; + dest_len = n; + if (dest_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off) + dest_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off; + src_len = n; + if (src_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off) + src_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off; + len = min(dest_len, src_len); + to = early_memremap(dest - dest_off, dest_len + dest_off); + from = early_memremap(src - src_off, src_len + src_off); + memcpy(to, from, len); + early_memunmap(to, dest_len + dest_off); + early_memunmap(from, src_len + src_off); + n -= len; + dest += len; + src += len; + } +} + /* * Reserve Xen mfn_list. * See comment above "struct start_info" in @@ -810,6 +840,27 @@ char * __init xen_memory_setup(void) xen_reserve_xen_mfnlist(); + /* Check for a conflict of the initrd with the target E820 map. */ + if (xen_is_e820_reserved(boot_params.hdr.ramdisk_image, + boot_params.hdr.ramdisk_size)) { + phys_addr_t new_area, start, size; + + new_area = xen_find_free_area(boot_params.hdr.ramdisk_size); + if (!new_area) { + xen_raw_console_write("Can't find new memory area for initrd needed due to E820 map conflict\n"); + BUG(); + } + + start = boot_params.hdr.ramdisk_image; + size = boot_params.hdr.ramdisk_size; + xen_phys_memcpy(new_area, start, size); + pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n", + start, start + size, new_area, new_area + size); + memblock_free(start, size); + boot_params.hdr.ramdisk_image = new_area; + boot_params.ext_ramdisk_image = new_area >> 32; + } + /* * Set identity map on non-RAM pages and prepare remapping the * underlying RAM. From 2592dbbbf4c67501c2bd2dcf89c2b8924d592a9f Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 17 Jul 2015 06:51:33 +0200 Subject: [PATCH 17/33] mm: provide early_memremap_ro to establish read-only mapping During early boot as Xen pv domain the kernel needs to map some page tables supplied by the hypervisor read only. This is needed to be able to relocate some data structures conflicting with the physical memory map especially on systems with huge RAM (above 512GB). Provide the function early_memremap_ro() to provide this read only mapping. Signed-off-by: Juergen Gross Acked-by: Konrad Rzeszutek Wilk Acked-by: Vlastimil Babka Signed-off-by: David Vrabel --- include/asm-generic/early_ioremap.h | 2 ++ include/asm-generic/fixmap.h | 3 +++ mm/early_ioremap.c | 12 ++++++++++++ 3 files changed, 17 insertions(+) diff --git a/include/asm-generic/early_ioremap.h b/include/asm-generic/early_ioremap.h index a5de55c04fb2..316bd043319e 100644 --- a/include/asm-generic/early_ioremap.h +++ b/include/asm-generic/early_ioremap.h @@ -11,6 +11,8 @@ extern void __iomem *early_ioremap(resource_size_t phys_addr, unsigned long size); extern void *early_memremap(resource_size_t phys_addr, unsigned long size); +extern void *early_memremap_ro(resource_size_t phys_addr, + unsigned long size); extern void early_iounmap(void __iomem *addr, unsigned long size); extern void early_memunmap(void *addr, unsigned long size); diff --git a/include/asm-generic/fixmap.h b/include/asm-generic/fixmap.h index f23174fb9ec4..1cbb8338edf3 100644 --- a/include/asm-generic/fixmap.h +++ b/include/asm-generic/fixmap.h @@ -46,6 +46,9 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr) #ifndef FIXMAP_PAGE_NORMAL #define FIXMAP_PAGE_NORMAL PAGE_KERNEL #endif +#if !defined(FIXMAP_PAGE_RO) && defined(PAGE_KERNEL_RO) +#define FIXMAP_PAGE_RO PAGE_KERNEL_RO +#endif #ifndef FIXMAP_PAGE_NOCACHE #define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_NOCACHE #endif diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index e10ccd299d66..0cfadafb3fb0 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -217,6 +217,13 @@ early_memremap(resource_size_t phys_addr, unsigned long size) return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_NORMAL); } +#ifdef FIXMAP_PAGE_RO +void __init * +early_memremap_ro(resource_size_t phys_addr, unsigned long size) +{ + return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO); +} +#endif #else /* CONFIG_MMU */ void __init __iomem * @@ -231,6 +238,11 @@ early_memremap(resource_size_t phys_addr, unsigned long size) { return (void *)phys_addr; } +void __init * +early_memremap_ro(resource_size_t phys_addr, unsigned long size) +{ + return (void *)phys_addr; +} void __init early_iounmap(void __iomem *addr, unsigned long size) { From 6c2681c863b24360098d1ba60f2af060a13a0561 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 17 Jul 2015 06:51:34 +0200 Subject: [PATCH 18/33] xen: add explicit memblock_reserve() calls for special pages Some special pages containing interfaces to xen are being reserved implicitly only today. The memblock_reserve() call to reserve them is meant to reserve the p2m list supplied by xen. It is just reserving not only the p2m list itself, but some more pages up to the start of the xen built page tables. To be able to move the p2m list to another pfn range, which is needed for support of huge RAM, this memblock_reserve() must be split up to cover all affected reserved pages explicitly. The affected pages are: - start_info page - xenstore ring (might be missing, mfn is 0 in this case) - console ring (not for initial domain) Signed-off-by: Juergen Gross Signed-off-by: David Vrabel --- arch/x86/xen/enlighten.c | 4 +++- arch/x86/xen/mmu.c | 15 +++++++++++++++ arch/x86/xen/xen-ops.h | 1 + 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 11d6fb4e8483..373dbc9810d1 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1610,7 +1610,9 @@ asmlinkage __visible void __init xen_start_kernel(void) early_boot_irqs_disabled = true; xen_raw_console_write("mapping kernel into physical memory\n"); - xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages); + xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, + xen_start_info->nr_pages); + xen_reserve_special_pages(); /* * Modify the cache mode translation tables to match Xen's PAT diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 1982617fa9c7..a36e7b462640 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2084,6 +2084,21 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) } #endif /* CONFIG_X86_64 */ +void __init xen_reserve_special_pages(void) +{ + phys_addr_t paddr; + + memblock_reserve(__pa(xen_start_info), PAGE_SIZE); + if (xen_start_info->store_mfn) { + paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn)); + memblock_reserve(paddr, PAGE_SIZE); + } + if (!xen_initial_domain()) { + paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn)); + memblock_reserve(paddr, PAGE_SIZE); + } +} + void __init xen_pt_check_e820(void) { if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) { diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9038a0f1d5a1..8795c2ef57f3 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -35,6 +35,7 @@ void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_reserve_top(void); +void __init xen_reserve_special_pages(void); void __init xen_pt_check_e820(void); void xen_mm_pin_all(void); From 70e61199559a09c62714694cd5ac3c3640c41552 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 17 Jul 2015 06:51:35 +0200 Subject: [PATCH 19/33] xen: move p2m list if conflicting with e820 map Check whether the hypervisor supplied p2m list is placed at a location which is conflicting with the target E820 map. If this is the case relocate it to a new area unused up to now and compliant to the E820 map. As the p2m list might by huge (up to several GB) and is required to be mapped virtually, set up a temporary mapping for the copied list. For pvh domains just delete the p2m related information from start info instead of reserving the p2m memory, as we don't need it at all. For 32 bit kernels adjust the memblock_reserve() parameters in order to cover the page tables only. This requires to memblock_reserve() the start_info page on it's own. Signed-off-by: Juergen Gross Acked-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- arch/x86/xen/mmu.c | 257 +++++++++++++++++++++++++++++++++++++---- arch/x86/xen/setup.c | 51 ++++---- arch/x86/xen/xen-ops.h | 3 + 3 files changed, 264 insertions(+), 47 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index a36e7b462640..2c50b445884e 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1094,6 +1094,16 @@ static void xen_exit_mmap(struct mm_struct *mm) static void xen_post_allocator_init(void); +static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn) +{ + struct mmuext_op op; + + op.cmd = cmd; + op.arg1.mfn = pfn_to_mfn(pfn); + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); +} + #ifdef CONFIG_X86_64 static void __init xen_cleanhighmap(unsigned long vaddr, unsigned long vaddr_end) @@ -1129,10 +1139,12 @@ static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size) memblock_free(paddr, size); } -static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl) +static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin) { unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK; + if (unpin) + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa)); ClearPagePinned(virt_to_page(__va(pa))); xen_free_ro_pages(pa, PAGE_SIZE); } @@ -1151,7 +1163,9 @@ static void __init xen_cleanmfnmap(unsigned long vaddr) pmd_t *pmd; pte_t *pte; unsigned int i; + bool unpin; + unpin = (vaddr == 2 * PGDIR_SIZE); set_pgd(pgd, __pgd(0)); do { pud = pud_page + pud_index(va); @@ -1168,22 +1182,24 @@ static void __init xen_cleanmfnmap(unsigned long vaddr) xen_free_ro_pages(pa, PMD_SIZE); } else if (!pmd_none(*pmd)) { pte = pte_offset_kernel(pmd, va); + set_pmd(pmd, __pmd(0)); for (i = 0; i < PTRS_PER_PTE; ++i) { if (pte_none(pte[i])) break; pa = pte_pfn(pte[i]) << PAGE_SHIFT; xen_free_ro_pages(pa, PAGE_SIZE); } - xen_cleanmfnmap_free_pgtbl(pte); + xen_cleanmfnmap_free_pgtbl(pte, unpin); } va += PMD_SIZE; if (pmd_index(va)) continue; - xen_cleanmfnmap_free_pgtbl(pmd); + set_pud(pud, __pud(0)); + xen_cleanmfnmap_free_pgtbl(pmd, unpin); } } while (pud_index(va) || pmd_index(va)); - xen_cleanmfnmap_free_pgtbl(pud_page); + xen_cleanmfnmap_free_pgtbl(pud_page, unpin); } static void __init xen_pagetable_p2m_free(void) @@ -1219,6 +1235,12 @@ static void __init xen_pagetable_p2m_free(void) } else { xen_cleanmfnmap(addr); } +} + +static void __init xen_pagetable_cleanhighmap(void) +{ + unsigned long size; + unsigned long addr; /* At this stage, cleanup_highmap has already cleaned __ka space * from _brk_limit way up to the max_pfn_mapped (which is the end of @@ -1251,6 +1273,8 @@ static void __init xen_pagetable_p2m_setup(void) #ifdef CONFIG_X86_64 xen_pagetable_p2m_free(); + + xen_pagetable_cleanhighmap(); #endif /* And revector! Bye bye old array */ xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; @@ -1586,15 +1610,6 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) native_set_pte(ptep, pte); } -static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn) -{ - struct mmuext_op op; - op.cmd = cmd; - op.arg1.mfn = pfn_to_mfn(pfn); - if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) - BUG(); -} - /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) @@ -2002,11 +2017,189 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) xen_pt_base = PFN_PHYS(pt_base); xen_pt_size = (pt_end - pt_base) * PAGE_SIZE; memblock_reserve(xen_pt_base, xen_pt_size); - /* protect xen_start_info */ - memblock_reserve(__pa(xen_start_info), PAGE_SIZE); + /* Revector the xen_start_info */ xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); } + +/* + * Read a value from a physical address. + */ +static unsigned long __init xen_read_phys_ulong(phys_addr_t addr) +{ + unsigned long *vaddr; + unsigned long val; + + vaddr = early_memremap_ro(addr, sizeof(val)); + val = *vaddr; + early_memunmap(vaddr, sizeof(val)); + return val; +} + +/* + * Translate a virtual address to a physical one without relying on mapped + * page tables. + */ +static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) +{ + phys_addr_t pa; + pgd_t pgd; + pud_t pud; + pmd_t pmd; + pte_t pte; + + pa = read_cr3(); + pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) * + sizeof(pgd))); + if (!pgd_present(pgd)) + return 0; + + pa = pgd_val(pgd) & PTE_PFN_MASK; + pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) * + sizeof(pud))); + if (!pud_present(pud)) + return 0; + pa = pud_pfn(pud) << PAGE_SHIFT; + if (pud_large(pud)) + return pa + (vaddr & ~PUD_MASK); + + pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) * + sizeof(pmd))); + if (!pmd_present(pmd)) + return 0; + pa = pmd_pfn(pmd) << PAGE_SHIFT; + if (pmd_large(pmd)) + return pa + (vaddr & ~PMD_MASK); + + pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) * + sizeof(pte))); + if (!pte_present(pte)) + return 0; + pa = pte_pfn(pte) << PAGE_SHIFT; + + return pa | (vaddr & ~PAGE_MASK); +} + +/* + * Find a new area for the hypervisor supplied p2m list and relocate the p2m to + * this area. + */ +void __init xen_relocate_p2m(void) +{ + phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys; + unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end; + int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud; + pte_t *pt; + pmd_t *pmd; + pud_t *pud; + pgd_t *pgd; + unsigned long *new_p2m; + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT; + n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT; + n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT; + n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; + n_frames = n_pte + n_pt + n_pmd + n_pud; + + new_area = xen_find_free_area(PFN_PHYS(n_frames)); + if (!new_area) { + xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n"); + BUG(); + } + + /* + * Setup the page tables for addressing the new p2m list. + * We have asked the hypervisor to map the p2m list at the user address + * PUD_SIZE. It may have done so, or it may have used a kernel space + * address depending on the Xen version. + * To avoid any possible virtual address collision, just use + * 2 * PUD_SIZE for the new area. + */ + pud_phys = new_area; + pmd_phys = pud_phys + PFN_PHYS(n_pud); + pt_phys = pmd_phys + PFN_PHYS(n_pmd); + p2m_pfn = PFN_DOWN(pt_phys) + n_pt; + + pgd = __va(read_cr3()); + new_p2m = (unsigned long *)(2 * PGDIR_SIZE); + for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { + pud = early_memremap(pud_phys, PAGE_SIZE); + clear_page(pud); + for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); + idx_pmd++) { + pmd = early_memremap(pmd_phys, PAGE_SIZE); + clear_page(pmd); + for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); + idx_pt++) { + pt = early_memremap(pt_phys, PAGE_SIZE); + clear_page(pt); + for (idx_pte = 0; + idx_pte < min(n_pte, PTRS_PER_PTE); + idx_pte++) { + set_pte(pt + idx_pte, + pfn_pte(p2m_pfn, PAGE_KERNEL)); + p2m_pfn++; + } + n_pte -= PTRS_PER_PTE; + early_memunmap(pt, PAGE_SIZE); + make_lowmem_page_readonly(__va(pt_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, + PFN_DOWN(pt_phys)); + set_pmd(pmd + idx_pt, + __pmd(_PAGE_TABLE | pt_phys)); + pt_phys += PAGE_SIZE; + } + n_pt -= PTRS_PER_PMD; + early_memunmap(pmd, PAGE_SIZE); + make_lowmem_page_readonly(__va(pmd_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, + PFN_DOWN(pmd_phys)); + set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); + pmd_phys += PAGE_SIZE; + } + n_pmd -= PTRS_PER_PUD; + early_memunmap(pud, PAGE_SIZE); + make_lowmem_page_readonly(__va(pud_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); + set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); + pud_phys += PAGE_SIZE; + } + + /* Now copy the old p2m info to the new area. */ + memcpy(new_p2m, xen_p2m_addr, size); + xen_p2m_addr = new_p2m; + + /* Release the old p2m list and set new list info. */ + p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list)); + BUG_ON(!p2m_pfn); + p2m_pfn_end = p2m_pfn + PFN_DOWN(size); + + if (xen_start_info->mfn_list < __START_KERNEL_map) { + pfn = xen_start_info->first_p2m_pfn; + pfn_end = xen_start_info->first_p2m_pfn + + xen_start_info->nr_p2m_frames; + set_pgd(pgd + 1, __pgd(0)); + } else { + pfn = p2m_pfn; + pfn_end = p2m_pfn_end; + } + + memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn)); + while (pfn < pfn_end) { + if (pfn == p2m_pfn) { + pfn = p2m_pfn_end; + continue; + } + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); + pfn++; + } + + xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; + xen_start_info->first_p2m_pfn = PFN_DOWN(new_area); + xen_start_info->nr_p2m_frames = n_frames; +} + #else /* !CONFIG_X86_64 */ static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); @@ -2047,18 +2240,41 @@ static void __init xen_write_cr3_init(unsigned long cr3) pv_mmu_ops.write_cr3 = &xen_write_cr3; } +/* + * For 32 bit domains xen_start_info->pt_base is the pgd address which might be + * not the first page table in the page table pool. + * Iterate through the initial page tables to find the real page table base. + */ +static phys_addr_t xen_find_pt_base(pmd_t *pmd) +{ + phys_addr_t pt_base, paddr; + unsigned pmdidx; + + pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd)); + + for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) + if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) { + paddr = m2p(pmd[pmdidx].pmd); + pt_base = min(pt_base, paddr); + } + + return pt_base; +} + void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pmd_t *kernel_pmd; + kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); + + xen_pt_base = xen_find_pt_base(kernel_pmd); + xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE; + initial_kernel_pmd = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); - max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + - xen_start_info->nr_pt_frames * PAGE_SIZE + - 512*1024); + max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024); - kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); copy_page(initial_kernel_pmd, kernel_pmd); xen_map_identity_early(initial_kernel_pmd, max_pfn); @@ -2077,9 +2293,6 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) PFN_DOWN(__pa(initial_page_table))); xen_write_cr3(__pa(initial_page_table)); - xen_pt_base = __pa(xen_start_info->pt_base); - xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE; - memblock_reserve(xen_pt_base, xen_pt_size); } #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 0d7f8818f8ca..b096d02d34ac 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -663,37 +663,35 @@ static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src, /* * Reserve Xen mfn_list. - * See comment above "struct start_info" in - * We tried to make the the memblock_reserve more selective so - * that it would be clear what region is reserved. Sadly we ran - * in the problem wherein on a 64-bit hypervisor with a 32-bit - * initial domain, the pt_base has the cr3 value which is not - * neccessarily where the pagetable starts! As Jan put it: " - * Actually, the adjustment turns out to be correct: The page - * tables for a 32-on-64 dom0 get allocated in the order "first L1", - * "first L2", "first L3", so the offset to the page table base is - * indeed 2. When reading xen/include/public/xen.h's comment - * very strictly, this is not a violation (since there nothing is said - * that the first thing in the page table space is pointed to by - * pt_base; I admit that this seems to be implied though, namely - * do I think that it is implied that the page table space is the - * range [pt_base, pt_base + nt_pt_frames), whereas that - * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames), - * which - without a priori knowledge - the kernel would have - * difficulty to figure out)." - so lets just fall back to the - * easy way and reserve the whole region. */ static void __init xen_reserve_xen_mfnlist(void) { + phys_addr_t start, size; + if (xen_start_info->mfn_list >= __START_KERNEL_map) { - memblock_reserve(__pa(xen_start_info->mfn_list), - xen_start_info->pt_base - - xen_start_info->mfn_list); + start = __pa(xen_start_info->mfn_list); + size = PFN_ALIGN(xen_start_info->nr_pages * + sizeof(unsigned long)); + } else { + start = PFN_PHYS(xen_start_info->first_p2m_pfn); + size = PFN_PHYS(xen_start_info->nr_p2m_frames); + } + + if (!xen_is_e820_reserved(start, size)) { + memblock_reserve(start, size); return; } - memblock_reserve(PFN_PHYS(xen_start_info->first_p2m_pfn), - PFN_PHYS(xen_start_info->nr_p2m_frames)); +#ifdef CONFIG_X86_32 + /* + * Relocating the p2m on 32 bit system to an arbitrary virtual address + * is not supported, so just give up. + */ + xen_raw_console_write("Xen hypervisor allocated p2m list conflicts with E820 map\n"); + BUG(); +#else + xen_relocate_p2m(); +#endif } /** @@ -895,7 +893,10 @@ char * __init xen_auto_xlated_memory_setup(void) e820_add_region(xen_e820_map[i].addr, xen_e820_map[i].size, xen_e820_map[i].type); - xen_reserve_xen_mfnlist(); + /* Remove p2m info, it is not needed. */ + xen_start_info->mfn_list = 0; + xen_start_info->first_p2m_pfn = 0; + xen_start_info->nr_p2m_frames = 0; return "Xen"; } diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 8795c2ef57f3..1399423f3418 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -40,6 +40,9 @@ void __init xen_pt_check_e820(void); void xen_mm_pin_all(void); void xen_mm_unpin_all(void); +#ifdef CONFIG_X86_64 +void __init xen_relocate_p2m(void); +#endif bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size); unsigned long __ref xen_chk_extra_mem(unsigned long pfn); From c70727a5bc18a5a233fddc6056d1de9144d7a293 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 17 Jul 2015 06:51:36 +0200 Subject: [PATCH 20/33] xen: allow more than 512 GB of RAM for 64 bit pv-domains 64 bit pv-domains under Xen are limited to 512 GB of RAM today. The main reason has been the 3 level p2m tree, which was replaced by the virtual mapped linear p2m list. Parallel to the p2m list which is being used by the kernel itself there is a 3 level mfn tree for usage by the Xen tools and eventually for crash dump analysis. For this tree the linear p2m list can serve as a replacement, too. As the kernel can't know whether the tools are capable of dealing with the p2m list instead of the mfn tree, the limit of 512 GB can't be dropped in all cases. This patch replaces the hard limit by a kernel parameter which tells the kernel to obey the 512 GB limit or not. The default is selected by a configuration parameter which specifies whether the 512 GB limit should be active per default for domUs (domain save/restore/migration and crash dump analysis are affected). Memory above the domain limit is returned to the hypervisor instead of being identity mapped, which was wrong anyway. The kernel configuration parameter to specify the maximum size of a domain can be deleted, as it is not relevant any more. Signed-off-by: Juergen Gross Acked-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- Documentation/kernel-parameters.txt | 7 ++++ arch/x86/include/asm/xen/page.h | 4 -- arch/x86/xen/Kconfig | 20 ++++++---- arch/x86/xen/p2m.c | 10 ++--- arch/x86/xen/setup.c | 59 ++++++++++++++++++++++++----- 5 files changed, 73 insertions(+), 27 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 1d6f0459cd7b..47f7b985f833 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -4078,6 +4078,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. plus one apbt timer for broadcast timer. x86_intel_mid_timer=apbt_only | lapic_and_apbt + xen_512gb_limit [KNL,X86-64,XEN] + Restricts the kernel running paravirtualized under Xen + to use only up to 512 GB of RAM. The reason to do so is + crash analysis tools and Xen tools for doing domain + save/restore/migration must be enabled to handle larger + domains. + xen_emul_unplug= [HW,X86,XEN] Unplug Xen emulated devices Format: [unplug0,][unplug1] diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index c44a5d53e464..10ef14b2616d 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -35,10 +35,6 @@ typedef struct xpaddr { #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) #define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT) -/* Maximum amount of memory we can handle in a domain in pages */ -#define MAX_DOMAIN_PAGES \ - ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE)) - extern unsigned long *machine_to_phys_mapping; extern unsigned long machine_to_phys_nr; extern unsigned long *xen_p2m_addr; diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index e88fda867a33..7bcf21b865d1 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -23,14 +23,18 @@ config XEN_PVHVM def_bool y depends on XEN && PCI && X86_LOCAL_APIC -config XEN_MAX_DOMAIN_MEMORY - int - default 500 if X86_64 - default 64 if X86_32 - depends on XEN - help - This only affects the sizing of some bss arrays, the unused - portions of which are freed. +config XEN_512GB + bool "Limit Xen pv-domain memory to 512GB" + depends on XEN && X86_64 + default y + help + Limit paravirtualized user domains to 512GB of RAM. + + The Xen tools and crash dump analysis tools might not support + pv-domains with more than 512 GB of RAM. This option controls the + default setting of the kernel to use only up to 512 GB or more. + It is always possible to change the default via specifying the + boot parameter "xen_512gb_limit". config XEN_SAVE_RESTORE bool diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index c719f7c36cb8..c059ca1c8293 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -517,7 +517,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg) */ static bool alloc_p2m(unsigned long pfn) { - unsigned topidx, mididx; + unsigned topidx; unsigned long *top_mfn_p, *mid_mfn; pte_t *ptep, *pte_pg; unsigned int level; @@ -525,9 +525,6 @@ static bool alloc_p2m(unsigned long pfn) unsigned long addr = (unsigned long)(xen_p2m_addr + pfn); unsigned long p2m_pfn; - topidx = p2m_top_index(pfn); - mididx = p2m_mid_index(pfn); - ptep = lookup_address(addr, &level); BUG_ON(!ptep || level != PG_LEVEL_4K); pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1)); @@ -539,7 +536,8 @@ static bool alloc_p2m(unsigned long pfn) return false; } - if (p2m_top_mfn) { + if (p2m_top_mfn && pfn < MAX_P2M_PFN) { + topidx = p2m_top_index(pfn); top_mfn_p = &p2m_top_mfn[topidx]; mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]); @@ -596,7 +594,7 @@ static bool alloc_p2m(unsigned long pfn) wmb(); /* Tools are synchronizing via p2m_generation. */ HYPERVISOR_shared_info->arch.p2m_generation++; if (mid_mfn) - mid_mfn[mididx] = virt_to_mfn(p2m); + mid_mfn[p2m_mid_index(pfn)] = virt_to_mfn(p2m); p2m = NULL; } diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index b096d02d34ac..f96002107691 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -33,6 +33,8 @@ #include "p2m.h" #include "mmu.h" +#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024) + /* Amount of extra memory space we add to the e820 ranges */ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; @@ -69,6 +71,26 @@ static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY; */ #define EXTRA_MEM_RATIO (10) +static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB); + +static void __init xen_parse_512gb(void) +{ + bool val = false; + char *arg; + + arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit"); + if (!arg) + return; + + arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit="); + if (!arg) + val = true; + else if (strtobool(arg + strlen("xen_512gb_limit="), &val)) + return; + + xen_512gb_limit = val; +} + static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size) { int i; @@ -503,12 +525,29 @@ void __init xen_remap_memory(void) pr_info("Remapped %ld page(s)\n", remapped); } +static unsigned long __init xen_get_pages_limit(void) +{ + unsigned long limit; + +#ifdef CONFIG_X86_32 + limit = GB(64) / PAGE_SIZE; +#else + limit = ~0ul; + if (!xen_initial_domain() && xen_512gb_limit) + limit = GB(512) / PAGE_SIZE; +#endif + return limit; +} + static unsigned long __init xen_get_max_pages(void) { - unsigned long max_pages = MAX_DOMAIN_PAGES; + unsigned long max_pages, limit; domid_t domid = DOMID_SELF; int ret; + limit = xen_get_pages_limit(); + max_pages = limit; + /* * For the initial domain we use the maximum reservation as * the maximum page. @@ -524,7 +563,7 @@ static unsigned long __init xen_get_max_pages(void) max_pages = ret; } - return min(max_pages, MAX_DOMAIN_PAGES); + return min(max_pages, limit); } static void __init xen_align_and_add_e820_region(phys_addr_t start, @@ -699,7 +738,7 @@ static void __init xen_reserve_xen_mfnlist(void) **/ char * __init xen_memory_setup(void) { - unsigned long max_pfn = xen_start_info->nr_pages; + unsigned long max_pfn; phys_addr_t mem_end, addr, size, chunk_size; u32 type; int rc; @@ -709,7 +748,9 @@ char * __init xen_memory_setup(void) int i; int op; - max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); + xen_parse_512gb(); + max_pfn = xen_get_pages_limit(); + max_pfn = min(max_pfn, xen_start_info->nr_pages); mem_end = PFN_PHYS(max_pfn); memmap.nr_entries = E820MAX; @@ -762,12 +803,15 @@ char * __init xen_memory_setup(void) * is limited to the max size of lowmem, so that it doesn't * get completely filled. * + * Make sure we have no memory above max_pages, as this area + * isn't handled by the p2m management. + * * In principle there could be a problem in lowmem systems if * the initial memory is also very large with respect to * lowmem, but we won't try to deal with that here. */ - extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), - extra_pages); + extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), + extra_pages, max_pages - max_pfn); i = 0; addr = xen_e820_map[0].addr; size = xen_e820_map[0].size; @@ -803,9 +847,6 @@ char * __init xen_memory_setup(void) /* * Set the rest as identity mapped, in case PCI BARs are * located here. - * - * PFNs above MAX_P2M_PFN are considered identity mapped as - * well. */ set_phys_range_identity(addr / PAGE_SIZE, ~0ul); From cb3eb850137cd43fc3e25d2062525f5ba5fd884a Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 17 Jul 2015 06:51:37 +0200 Subject: [PATCH 21/33] xen: remove no longer needed p2m.h Cleanup by removing arch/x86/xen/p2m.h as it isn't needed any more. Most definitions in this file are used in p2m.c only. Move those into p2m.c. set_phys_range_identity() is already declared in arch/x86/include/asm/xen/page.h, add __init annotation there. MAX_REMAP_RANGES isn't used at all, just delete it. The only define left is P2M_PER_PAGE which is moved to page.h as well. Signed-off-by: Juergen Gross Acked-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- arch/x86/include/asm/xen/page.h | 6 ++++-- arch/x86/xen/p2m.c | 6 +++++- arch/x86/xen/p2m.h | 15 --------------- arch/x86/xen/setup.c | 1 - 4 files changed, 9 insertions(+), 19 deletions(-) delete mode 100644 arch/x86/xen/p2m.h diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 10ef14b2616d..a3804fbe1f36 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -35,6 +35,8 @@ typedef struct xpaddr { #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) #define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT) +#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) + extern unsigned long *machine_to_phys_mapping; extern unsigned long machine_to_phys_nr; extern unsigned long *xen_p2m_addr; @@ -44,8 +46,8 @@ extern unsigned long xen_max_p2m_pfn; extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); -extern unsigned long set_phys_range_identity(unsigned long pfn_s, - unsigned long pfn_e); +extern unsigned long __init set_phys_range_identity(unsigned long pfn_s, + unsigned long pfn_e); extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index c059ca1c8293..bfc08b13044b 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -79,10 +79,14 @@ #include #include -#include "p2m.h" #include "multicalls.h" #include "xen-ops.h" +#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) +#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) + +#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) + #define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE) unsigned long *xen_p2m_addr __read_mostly; diff --git a/arch/x86/xen/p2m.h b/arch/x86/xen/p2m.h deleted file mode 100644 index ad8aee24ab72..000000000000 --- a/arch/x86/xen/p2m.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _XEN_P2M_H -#define _XEN_P2M_H - -#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) -#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) -#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) - -#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) - -#define MAX_REMAP_RANGES 10 - -extern unsigned long __init set_phys_range_identity(unsigned long pfn_s, - unsigned long pfn_e); - -#endif /* _XEN_P2M_H */ diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index f96002107691..a1a77eabe858 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -30,7 +30,6 @@ #include #include "xen-ops.h" #include "vdso.h" -#include "p2m.h" #include "mmu.h" #define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024) From a11f4f0a4e18b4bdc7d5e36438711e038b7a1f74 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 10 Aug 2015 16:34:32 -0400 Subject: [PATCH 22/33] xen: xensyms support Export Xen symbols to dom0 via /proc/xen/xensyms (similar to /proc/kallsyms). Signed-off-by: Boris Ostrovsky Reviewed-by: David Vrabel Signed-off-by: David Vrabel --- drivers/xen/Kconfig | 8 ++ drivers/xen/xenfs/Makefile | 1 + drivers/xen/xenfs/super.c | 3 + drivers/xen/xenfs/xenfs.h | 1 + drivers/xen/xenfs/xensyms.c | 152 +++++++++++++++++++++++++++++++ include/xen/interface/platform.h | 18 ++++ 6 files changed, 183 insertions(+) create mode 100644 drivers/xen/xenfs/xensyms.c diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 7cd226da15fe..936760455dd5 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -280,4 +280,12 @@ config XEN_ACPI def_bool y depends on X86 && ACPI +config XEN_SYMS + bool "Xen symbols" + depends on X86 && XEN_DOM0 && XENFS + default y if KALLSYMS + help + Exports hypervisor symbols (along with their types and addresses) via + /proc/xen/xensyms file, similar to /proc/kallsyms + endmenu diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile index b019865fcc56..1a83010ddffa 100644 --- a/drivers/xen/xenfs/Makefile +++ b/drivers/xen/xenfs/Makefile @@ -2,3 +2,4 @@ obj-$(CONFIG_XENFS) += xenfs.o xenfs-y = super.o xenfs-$(CONFIG_XEN_DOM0) += xenstored.o +xenfs-$(CONFIG_XEN_SYMS) += xensyms.o diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index 06092e0fe8ce..8559a71f36b1 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -57,6 +57,9 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent) { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR }, { "xsd_kva", &xsd_kva_file_ops, S_IRUSR|S_IWUSR}, { "xsd_port", &xsd_port_file_ops, S_IRUSR|S_IWUSR}, +#ifdef CONFIG_XEN_SYMS + { "xensyms", &xensyms_ops, S_IRUSR}, +#endif {""}, }; diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h index 6b80c7779c02..2c5934ea9b1e 100644 --- a/drivers/xen/xenfs/xenfs.h +++ b/drivers/xen/xenfs/xenfs.h @@ -3,5 +3,6 @@ extern const struct file_operations xsd_kva_file_ops; extern const struct file_operations xsd_port_file_ops; +extern const struct file_operations xensyms_ops; #endif /* _XENFS_XENBUS_H */ diff --git a/drivers/xen/xenfs/xensyms.c b/drivers/xen/xenfs/xensyms.c new file mode 100644 index 000000000000..f8b12856753f --- /dev/null +++ b/drivers/xen/xenfs/xensyms.c @@ -0,0 +1,152 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "xenfs.h" + + +#define XEN_KSYM_NAME_LEN 127 /* Hypervisor may have different name length */ + +struct xensyms { + struct xen_platform_op op; + char *name; + uint32_t namelen; +}; + +/* Grab next output page from the hypervisor */ +static int xensyms_next_sym(struct xensyms *xs) +{ + int ret; + struct xenpf_symdata *symdata = &xs->op.u.symdata; + uint64_t symnum; + + memset(xs->name, 0, xs->namelen); + symdata->namelen = xs->namelen; + + symnum = symdata->symnum; + + ret = HYPERVISOR_dom0_op(&xs->op); + if (ret < 0) + return ret; + + /* + * If hypervisor's symbol didn't fit into the buffer then allocate + * a larger buffer and try again. + */ + if (unlikely(symdata->namelen > xs->namelen)) { + kfree(xs->name); + + xs->namelen = symdata->namelen; + xs->name = kzalloc(xs->namelen, GFP_KERNEL); + if (!xs->name) + return -ENOMEM; + + set_xen_guest_handle(symdata->name, xs->name); + symdata->symnum--; /* Rewind */ + + ret = HYPERVISOR_dom0_op(&xs->op); + if (ret < 0) + return ret; + } + + if (symdata->symnum == symnum) + /* End of symbols */ + return 1; + + return 0; +} + +static void *xensyms_start(struct seq_file *m, loff_t *pos) +{ + struct xensyms *xs = (struct xensyms *)m->private; + + xs->op.u.symdata.symnum = *pos; + + if (xensyms_next_sym(xs)) + return NULL; + + return m->private; +} + +static void *xensyms_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct xensyms *xs = (struct xensyms *)m->private; + + xs->op.u.symdata.symnum = ++(*pos); + + if (xensyms_next_sym(xs)) + return NULL; + + return p; +} + +static int xensyms_show(struct seq_file *m, void *p) +{ + struct xensyms *xs = (struct xensyms *)m->private; + struct xenpf_symdata *symdata = &xs->op.u.symdata; + + seq_printf(m, "%016llx %c %s\n", symdata->address, + symdata->type, xs->name); + + return 0; +} + +static void xensyms_stop(struct seq_file *m, void *p) +{ +} + +static const struct seq_operations xensyms_seq_ops = { + .start = xensyms_start, + .next = xensyms_next, + .show = xensyms_show, + .stop = xensyms_stop, +}; + +static int xensyms_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + struct xensyms *xs; + int ret; + + ret = seq_open_private(file, &xensyms_seq_ops, + sizeof(struct xensyms)); + if (ret) + return ret; + + m = file->private_data; + xs = (struct xensyms *)m->private; + + xs->namelen = XEN_KSYM_NAME_LEN + 1; + xs->name = kzalloc(xs->namelen, GFP_KERNEL); + if (!xs->name) { + seq_release_private(inode, file); + return -ENOMEM; + } + set_xen_guest_handle(xs->op.u.symdata.name, xs->name); + xs->op.cmd = XENPF_get_symbol; + xs->op.u.symdata.namelen = xs->namelen; + + return 0; +} + +static int xensyms_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + struct xensyms *xs = (struct xensyms *)m->private; + + kfree(xs->name); + return seq_release_private(inode, file); +} + +const struct file_operations xensyms_ops = { + .open = xensyms_open, + .read = seq_read, + .llseek = seq_lseek, + .release = xensyms_release +}; diff --git a/include/xen/interface/platform.h b/include/xen/interface/platform.h index 5cc49ea8d840..8e035871360e 100644 --- a/include/xen/interface/platform.h +++ b/include/xen/interface/platform.h @@ -474,6 +474,23 @@ struct xenpf_core_parking { }; DEFINE_GUEST_HANDLE_STRUCT(xenpf_core_parking); +#define XENPF_get_symbol 63 +struct xenpf_symdata { + /* IN/OUT variables */ + uint32_t namelen; /* size of 'name' buffer */ + + /* IN/OUT variables */ + uint32_t symnum; /* IN: Symbol to read */ + /* OUT: Next available symbol. If same as IN */ + /* then we reached the end */ + + /* OUT variables */ + GUEST_HANDLE(char) name; + uint64_t address; + char type; +}; +DEFINE_GUEST_HANDLE_STRUCT(xenpf_symdata); + struct xen_platform_op { uint32_t cmd; uint32_t interface_version; /* XENPF_INTERFACE_VERSION */ @@ -495,6 +512,7 @@ struct xen_platform_op { struct xenpf_cpu_hotadd cpu_add; struct xenpf_mem_hotadd mem_add; struct xenpf_core_parking core_parking; + struct xenpf_symdata symdata; uint8_t pad[128]; } u; }; From 5f141548824cebbff2e838ff401c34e667797467 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 10 Aug 2015 16:34:33 -0400 Subject: [PATCH 23/33] xen/PMU: Sysfs interface for setting Xen PMU mode Set Xen's PMU mode via /sys/hypervisor/pmu/pmu_mode. Add XENPMU hypercall. Signed-off-by: Boris Ostrovsky Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- .../ABI/testing/sysfs-hypervisor-pmu | 23 +++ arch/x86/include/asm/xen/hypercall.h | 6 + arch/x86/xen/Kconfig | 1 + drivers/xen/Kconfig | 3 + drivers/xen/sys-hypervisor.c | 136 +++++++++++++++++- include/xen/interface/xen.h | 1 + include/xen/interface/xenpmu.h | 59 ++++++++ 7 files changed, 228 insertions(+), 1 deletion(-) create mode 100644 Documentation/ABI/testing/sysfs-hypervisor-pmu create mode 100644 include/xen/interface/xenpmu.h diff --git a/Documentation/ABI/testing/sysfs-hypervisor-pmu b/Documentation/ABI/testing/sysfs-hypervisor-pmu new file mode 100644 index 000000000000..224faa105e18 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-hypervisor-pmu @@ -0,0 +1,23 @@ +What: /sys/hypervisor/pmu/pmu_mode +Date: August 2015 +KernelVersion: 4.3 +Contact: Boris Ostrovsky +Description: + Describes mode that Xen's performance-monitoring unit (PMU) + uses. Accepted values are + "off" -- PMU is disabled + "self" -- The guest can profile itself + "hv" -- The guest can profile itself and, if it is + privileged (e.g. dom0), the hypervisor + "all" -- The guest can profile itself, the hypervisor + and all other guests. Only available to + privileged guests. + +What: /sys/hypervisor/pmu/pmu_features +Date: August 2015 +KernelVersion: 4.3 +Contact: Boris Ostrovsky +Description: + Describes Xen PMU features (as an integer). A set bit indicates + that the corresponding feature is enabled. See + include/xen/interface/xenpmu.h for available features diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index ca08a27b90b3..83aea8055119 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -465,6 +465,12 @@ HYPERVISOR_tmem_op( return _hypercall1(int, tmem_op, op); } +static inline int +HYPERVISOR_xenpmu_op(unsigned int op, void *arg) +{ + return _hypercall2(int, xenpmu_op, op, arg); +} + static inline void MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) { diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 7bcf21b865d1..a8ffdb85656f 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -7,6 +7,7 @@ config XEN depends on PARAVIRT select PARAVIRT_CLOCK select XEN_HAVE_PVMMU + select XEN_HAVE_VPMU depends on X86_64 || (X86_32 && X86_PAE) depends on X86_TSC help diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 936760455dd5..73708acce3ca 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -288,4 +288,7 @@ config XEN_SYMS Exports hypervisor symbols (along with their types and addresses) via /proc/xen/xensyms file, similar to /proc/kallsyms +config XEN_HAVE_VPMU + bool + endmenu diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c index 96453f8a85c5..b5a7342e0ba5 100644 --- a/drivers/xen/sys-hypervisor.c +++ b/drivers/xen/sys-hypervisor.c @@ -20,6 +20,9 @@ #include #include #include +#ifdef CONFIG_XEN_HAVE_VPMU +#include +#endif #define HYPERVISOR_ATTR_RO(_name) \ static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name) @@ -368,6 +371,126 @@ static void xen_properties_destroy(void) sysfs_remove_group(hypervisor_kobj, &xen_properties_group); } +#ifdef CONFIG_XEN_HAVE_VPMU +struct pmu_mode { + const char *name; + uint32_t mode; +}; + +static struct pmu_mode pmu_modes[] = { + {"off", XENPMU_MODE_OFF}, + {"self", XENPMU_MODE_SELF}, + {"hv", XENPMU_MODE_HV}, + {"all", XENPMU_MODE_ALL} +}; + +static ssize_t pmu_mode_store(struct hyp_sysfs_attr *attr, + const char *buffer, size_t len) +{ + int ret; + struct xen_pmu_params xp; + int i; + + for (i = 0; i < ARRAY_SIZE(pmu_modes); i++) { + if (strncmp(buffer, pmu_modes[i].name, len - 1) == 0) { + xp.val = pmu_modes[i].mode; + break; + } + } + + if (i == ARRAY_SIZE(pmu_modes)) + return -EINVAL; + + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + ret = HYPERVISOR_xenpmu_op(XENPMU_mode_set, &xp); + if (ret) + return ret; + + return len; +} + +static ssize_t pmu_mode_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret; + struct xen_pmu_params xp; + int i; + uint32_t mode; + + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + ret = HYPERVISOR_xenpmu_op(XENPMU_mode_get, &xp); + if (ret) + return ret; + + mode = (uint32_t)xp.val; + for (i = 0; i < ARRAY_SIZE(pmu_modes); i++) { + if (mode == pmu_modes[i].mode) + return sprintf(buffer, "%s\n", pmu_modes[i].name); + } + + return -EINVAL; +} +HYPERVISOR_ATTR_RW(pmu_mode); + +static ssize_t pmu_features_store(struct hyp_sysfs_attr *attr, + const char *buffer, size_t len) +{ + int ret; + uint32_t features; + struct xen_pmu_params xp; + + ret = kstrtou32(buffer, 0, &features); + if (ret) + return ret; + + xp.val = features; + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + ret = HYPERVISOR_xenpmu_op(XENPMU_feature_set, &xp); + if (ret) + return ret; + + return len; +} + +static ssize_t pmu_features_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret; + struct xen_pmu_params xp; + + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + ret = HYPERVISOR_xenpmu_op(XENPMU_feature_get, &xp); + if (ret) + return ret; + + return sprintf(buffer, "0x%x\n", (uint32_t)xp.val); +} +HYPERVISOR_ATTR_RW(pmu_features); + +static struct attribute *xen_pmu_attrs[] = { + &pmu_mode_attr.attr, + &pmu_features_attr.attr, + NULL +}; + +static const struct attribute_group xen_pmu_group = { + .name = "pmu", + .attrs = xen_pmu_attrs, +}; + +static int __init xen_pmu_init(void) +{ + return sysfs_create_group(hypervisor_kobj, &xen_pmu_group); +} + +static void xen_pmu_destroy(void) +{ + sysfs_remove_group(hypervisor_kobj, &xen_pmu_group); +} +#endif + static int __init hyper_sysfs_init(void) { int ret; @@ -390,7 +513,15 @@ static int __init hyper_sysfs_init(void) ret = xen_properties_init(); if (ret) goto prop_out; - +#ifdef CONFIG_XEN_HAVE_VPMU + if (xen_initial_domain()) { + ret = xen_pmu_init(); + if (ret) { + xen_properties_destroy(); + goto prop_out; + } + } +#endif goto out; prop_out: @@ -407,6 +538,9 @@ static int __init hyper_sysfs_init(void) static void __exit hyper_sysfs_exit(void) { +#ifdef CONFIG_XEN_HAVE_VPMU + xen_pmu_destroy(); +#endif xen_properties_destroy(); xen_compilation_destroy(); xen_sysfs_uuid_destroy(); diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index 8194270edcf0..e9d4501d1f5e 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -80,6 +80,7 @@ #define __HYPERVISOR_kexec_op 37 #define __HYPERVISOR_tmem_op 38 #define __HYPERVISOR_xc_reserved_op 39 /* reserved for XenClient */ +#define __HYPERVISOR_xenpmu_op 40 /* Architecture-specific hypercall definitions. */ #define __HYPERVISOR_arch_0 48 diff --git a/include/xen/interface/xenpmu.h b/include/xen/interface/xenpmu.h new file mode 100644 index 000000000000..eac1b498b89f --- /dev/null +++ b/include/xen/interface/xenpmu.h @@ -0,0 +1,59 @@ +#ifndef __XEN_PUBLIC_XENPMU_H__ +#define __XEN_PUBLIC_XENPMU_H__ + +#include "xen.h" + +#define XENPMU_VER_MAJ 0 +#define XENPMU_VER_MIN 1 + +/* + * ` enum neg_errnoval + * ` HYPERVISOR_xenpmu_op(enum xenpmu_op cmd, struct xenpmu_params *args); + * + * @cmd == XENPMU_* (PMU operation) + * @args == struct xenpmu_params + */ +/* ` enum xenpmu_op { */ +#define XENPMU_mode_get 0 /* Also used for getting PMU version */ +#define XENPMU_mode_set 1 +#define XENPMU_feature_get 2 +#define XENPMU_feature_set 3 +#define XENPMU_init 4 +#define XENPMU_finish 5 + +/* ` } */ + +/* Parameters structure for HYPERVISOR_xenpmu_op call */ +struct xen_pmu_params { + /* IN/OUT parameters */ + struct { + uint32_t maj; + uint32_t min; + } version; + uint64_t val; + + /* IN parameters */ + uint32_t vcpu; + uint32_t pad; +}; + +/* PMU modes: + * - XENPMU_MODE_OFF: No PMU virtualization + * - XENPMU_MODE_SELF: Guests can profile themselves + * - XENPMU_MODE_HV: Guests can profile themselves, dom0 profiles + * itself and Xen + * - XENPMU_MODE_ALL: Only dom0 has access to VPMU and it profiles + * everyone: itself, the hypervisor and the guests. + */ +#define XENPMU_MODE_OFF 0 +#define XENPMU_MODE_SELF (1<<0) +#define XENPMU_MODE_HV (1<<1) +#define XENPMU_MODE_ALL (1<<2) + +/* + * PMU features: + * - XENPMU_FEATURE_INTEL_BTS: Intel BTS support (ignored on AMD) + */ +#define XENPMU_FEATURE_INTEL_BTS 1 + +#endif /* __XEN_PUBLIC_XENPMU_H__ */ From 65d0cf0be79feebeb19e7626fd3ed41ae73f642d Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 10 Aug 2015 16:34:34 -0400 Subject: [PATCH 24/33] xen/PMU: Initialization code for Xen PMU Map shared data structure that will hold CPU registers, VPMU context, V/PCPU IDs of the CPU interrupted by PMU interrupt. Hypervisor fills this information in its handler and passes it to the guest for further processing. Set up PMU VIRQ. Now that perf infrastructure will assume that PMU is available on a PV guest we need to be careful and make sure that accesses via RDPMC instruction don't cause fatal traps by the hypervisor. Provide a nop RDPMC handler. For the same reason avoid issuing a warning on a write to APIC's LVTPC. Both of these will be made functional in later patches. Signed-off-by: Boris Ostrovsky Reviewed-by: David Vrabel Signed-off-by: David Vrabel --- arch/x86/include/asm/xen/interface.h | 123 +++++++++++++++++++ arch/x86/xen/Makefile | 2 +- arch/x86/xen/apic.c | 3 + arch/x86/xen/enlighten.c | 12 +- arch/x86/xen/pmu.c | 170 +++++++++++++++++++++++++++ arch/x86/xen/pmu.h | 11 ++ arch/x86/xen/smp.c | 29 ++++- arch/x86/xen/suspend.c | 23 +++- include/xen/interface/xen.h | 1 + include/xen/interface/xenpmu.h | 33 ++++++ 10 files changed, 398 insertions(+), 9 deletions(-) create mode 100644 arch/x86/xen/pmu.c create mode 100644 arch/x86/xen/pmu.h diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index 3b88eeacdbda..62ca03ef5c65 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -250,6 +250,129 @@ struct vcpu_guest_context { #endif }; DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context); + +/* AMD PMU registers and structures */ +struct xen_pmu_amd_ctxt { + /* + * Offsets to counter and control MSRs (relative to xen_pmu_arch.c.amd). + * For PV(H) guests these fields are RO. + */ + uint32_t counters; + uint32_t ctrls; + + /* Counter MSRs */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + uint64_t regs[]; +#elif defined(__GNUC__) + uint64_t regs[0]; +#endif +}; + +/* Intel PMU registers and structures */ +struct xen_pmu_cntr_pair { + uint64_t counter; + uint64_t control; +}; + +struct xen_pmu_intel_ctxt { + /* + * Offsets to fixed and architectural counter MSRs (relative to + * xen_pmu_arch.c.intel). + * For PV(H) guests these fields are RO. + */ + uint32_t fixed_counters; + uint32_t arch_counters; + + /* PMU registers */ + uint64_t global_ctrl; + uint64_t global_ovf_ctrl; + uint64_t global_status; + uint64_t fixed_ctrl; + uint64_t ds_area; + uint64_t pebs_enable; + uint64_t debugctl; + + /* Fixed and architectural counter MSRs */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + uint64_t regs[]; +#elif defined(__GNUC__) + uint64_t regs[0]; +#endif +}; + +/* Sampled domain's registers */ +struct xen_pmu_regs { + uint64_t ip; + uint64_t sp; + uint64_t flags; + uint16_t cs; + uint16_t ss; + uint8_t cpl; + uint8_t pad[3]; +}; + +/* PMU flags */ +#define PMU_CACHED (1<<0) /* PMU MSRs are cached in the context */ +#define PMU_SAMPLE_USER (1<<1) /* Sample is from user or kernel mode */ +#define PMU_SAMPLE_REAL (1<<2) /* Sample is from realmode */ +#define PMU_SAMPLE_PV (1<<3) /* Sample from a PV guest */ + +/* + * Architecture-specific information describing state of the processor at + * the time of PMU interrupt. + * Fields of this structure marked as RW for guest should only be written by + * the guest when PMU_CACHED bit in pmu_flags is set (which is done by the + * hypervisor during PMU interrupt). Hypervisor will read updated data in + * XENPMU_flush hypercall and clear PMU_CACHED bit. + */ +struct xen_pmu_arch { + union { + /* + * Processor's registers at the time of interrupt. + * WO for hypervisor, RO for guests. + */ + struct xen_pmu_regs regs; + /* + * Padding for adding new registers to xen_pmu_regs in + * the future + */ +#define XENPMU_REGS_PAD_SZ 64 + uint8_t pad[XENPMU_REGS_PAD_SZ]; + } r; + + /* WO for hypervisor, RO for guest */ + uint64_t pmu_flags; + + /* + * APIC LVTPC register. + * RW for both hypervisor and guest. + * Only APIC_LVT_MASKED bit is loaded by the hypervisor into hardware + * during XENPMU_flush or XENPMU_lvtpc_set. + */ + union { + uint32_t lapic_lvtpc; + uint64_t pad; + } l; + + /* + * Vendor-specific PMU registers. + * RW for both hypervisor and guest (see exceptions above). + * Guest's updates to this field are verified and then loaded by the + * hypervisor into hardware during XENPMU_flush + */ + union { + struct xen_pmu_amd_ctxt amd; + struct xen_pmu_intel_ctxt intel; + + /* + * Padding for contexts (fixed parts only, does not include + * MSR banks that are specified by offsets) + */ +#define XENPMU_CTXT_PAD_SZ 128 + uint8_t pad[XENPMU_CTXT_PAD_SZ]; + } c; +}; + #endif /* !__ASSEMBLY__ */ /* diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 4b6e29ac0968..e47e52787d32 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -13,7 +13,7 @@ CFLAGS_mmu.o := $(nostackp) obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ grant-table.o suspend.o platform-pci-unplug.o \ - p2m.o apic.o + p2m.o apic.o pmu.o obj-$(CONFIG_EVENT_TRACING) += trace.o diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 70e060ad879a..d03ebfa89b9f 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -72,6 +72,9 @@ static u32 xen_apic_read(u32 reg) static void xen_apic_write(u32 reg, u32 val) { + if (reg == APIC_LVTPC) + return; + /* Warn to see if there's any stray references */ WARN(1,"register: %x, value: %x\n", reg, val); } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 373dbc9810d1..19072f91a8e2 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -84,6 +84,7 @@ #include "mmu.h" #include "smp.h" #include "multicalls.h" +#include "pmu.h" EXPORT_SYMBOL_GPL(hypercall_page); @@ -1082,6 +1083,11 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) return ret; } +unsigned long long xen_read_pmc(int counter) +{ + return 0; +} + void xen_setup_shared_info(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) { @@ -1216,7 +1222,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .write_msr = xen_write_msr_safe, .read_tsc = native_read_tsc, - .read_pmc = native_read_pmc, + .read_pmc = xen_read_pmc, .read_tscp = native_read_tscp, @@ -1267,6 +1273,10 @@ static const struct pv_apic_ops xen_apic_ops __initconst = { static void xen_reboot(int reason) { struct sched_shutdown r = { .reason = reason }; + int cpu; + + for_each_online_cpu(cpu) + xen_pmu_finish(cpu); if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) BUG(); diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c new file mode 100644 index 000000000000..1d1ae1b874ea --- /dev/null +++ b/arch/x86/xen/pmu.c @@ -0,0 +1,170 @@ +#include +#include + +#include +#include +#include +#include +#include + +#include "xen-ops.h" +#include "pmu.h" + +/* x86_pmu.handle_irq definition */ +#include "../kernel/cpu/perf_event.h" + + +/* Shared page between hypervisor and domain */ +static DEFINE_PER_CPU(struct xen_pmu_data *, xenpmu_shared); +#define get_xenpmu_data() per_cpu(xenpmu_shared, smp_processor_id()) + +/* perf callbacks */ +static int xen_is_in_guest(void) +{ + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return 0; + } + + if (!xen_initial_domain() || (xenpmu_data->domain_id >= DOMID_SELF)) + return 0; + + return 1; +} + +static int xen_is_user_mode(void) +{ + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return 0; + } + + if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV) + return (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER); + else + return !!(xenpmu_data->pmu.r.regs.cpl & 3); +} + +static unsigned long xen_get_guest_ip(void) +{ + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return 0; + } + + return xenpmu_data->pmu.r.regs.ip; +} + +static struct perf_guest_info_callbacks xen_guest_cbs = { + .is_in_guest = xen_is_in_guest, + .is_user_mode = xen_is_user_mode, + .get_guest_ip = xen_get_guest_ip, +}; + +/* Convert registers from Xen's format to Linux' */ +static void xen_convert_regs(const struct xen_pmu_regs *xen_regs, + struct pt_regs *regs, uint64_t pmu_flags) +{ + regs->ip = xen_regs->ip; + regs->cs = xen_regs->cs; + regs->sp = xen_regs->sp; + + if (pmu_flags & PMU_SAMPLE_PV) { + if (pmu_flags & PMU_SAMPLE_USER) + regs->cs |= 3; + else + regs->cs &= ~3; + } else { + if (xen_regs->cpl) + regs->cs |= 3; + else + regs->cs &= ~3; + } +} + +irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id) +{ + int ret = IRQ_NONE; + struct pt_regs regs; + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return ret; + } + + xen_convert_regs(&xenpmu_data->pmu.r.regs, ®s, + xenpmu_data->pmu.pmu_flags); + if (x86_pmu.handle_irq(®s)) + ret = IRQ_HANDLED; + + return ret; +} + +bool is_xen_pmu(int cpu) +{ + return (per_cpu(xenpmu_shared, cpu) != NULL); +} + +void xen_pmu_init(int cpu) +{ + int err; + struct xen_pmu_params xp; + unsigned long pfn; + struct xen_pmu_data *xenpmu_data; + + BUILD_BUG_ON(sizeof(struct xen_pmu_data) > PAGE_SIZE); + + if (xen_hvm_domain()) + return; + + xenpmu_data = (struct xen_pmu_data *)get_zeroed_page(GFP_KERNEL); + if (!xenpmu_data) { + pr_err("VPMU init: No memory\n"); + return; + } + pfn = virt_to_pfn(xenpmu_data); + + xp.val = pfn_to_mfn(pfn); + xp.vcpu = cpu; + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + err = HYPERVISOR_xenpmu_op(XENPMU_init, &xp); + if (err) + goto fail; + + per_cpu(xenpmu_shared, cpu) = xenpmu_data; + + if (cpu == 0) + perf_register_guest_info_callbacks(&xen_guest_cbs); + + return; + +fail: + pr_warn_once("Could not initialize VPMU for cpu %d, error %d\n", + cpu, err); + free_pages((unsigned long)xenpmu_data, 0); +} + +void xen_pmu_finish(int cpu) +{ + struct xen_pmu_params xp; + + if (xen_hvm_domain()) + return; + + xp.vcpu = cpu; + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + + (void)HYPERVISOR_xenpmu_op(XENPMU_finish, &xp); + + free_pages((unsigned long)per_cpu(xenpmu_shared, cpu), 0); + per_cpu(xenpmu_shared, cpu) = NULL; +} diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h new file mode 100644 index 000000000000..a76d2cf83581 --- /dev/null +++ b/arch/x86/xen/pmu.h @@ -0,0 +1,11 @@ +#ifndef __XEN_PMU_H +#define __XEN_PMU_H + +#include + +irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id); +void xen_pmu_init(int cpu); +void xen_pmu_finish(int cpu); +bool is_xen_pmu(int cpu); + +#endif /* __XEN_PMU_H */ diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 86484384492e..2a9ff7342791 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -38,6 +39,7 @@ #include "xen-ops.h" #include "mmu.h" #include "smp.h" +#include "pmu.h" cpumask_var_t xen_cpu_initialized_map; @@ -50,6 +52,7 @@ static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 }; +static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 }; static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); @@ -148,11 +151,18 @@ static void xen_smp_intr_free(unsigned int cpu) kfree(per_cpu(xen_irq_work, cpu).name); per_cpu(xen_irq_work, cpu).name = NULL; } + + if (per_cpu(xen_pmu_irq, cpu).irq >= 0) { + unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL); + per_cpu(xen_pmu_irq, cpu).irq = -1; + kfree(per_cpu(xen_pmu_irq, cpu).name); + per_cpu(xen_pmu_irq, cpu).name = NULL; + } }; static int xen_smp_intr_init(unsigned int cpu) { int rc; - char *resched_name, *callfunc_name, *debug_name; + char *resched_name, *callfunc_name, *debug_name, *pmu_name; resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, @@ -218,6 +228,18 @@ static int xen_smp_intr_init(unsigned int cpu) per_cpu(xen_irq_work, cpu).irq = rc; per_cpu(xen_irq_work, cpu).name = callfunc_name; + if (is_xen_pmu(cpu)) { + pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu); + rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu, + xen_pmu_irq_handler, + IRQF_PERCPU|IRQF_NOBALANCING, + pmu_name, NULL); + if (rc < 0) + goto fail; + per_cpu(xen_pmu_irq, cpu).irq = rc; + per_cpu(xen_pmu_irq, cpu).name = pmu_name; + } + return 0; fail: @@ -335,6 +357,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) } set_cpu_sibling_map(0); + xen_pmu_init(0); + if (xen_smp_intr_init(0)) BUG(); @@ -462,6 +486,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) if (rc) return rc; + xen_pmu_init(cpu); + rc = xen_smp_intr_init(cpu); if (rc) return rc; @@ -503,6 +529,7 @@ static void xen_cpu_die(unsigned int cpu) xen_smp_intr_free(cpu); xen_uninit_lock_cpu(cpu); xen_teardown_timer(cpu); + xen_pmu_finish(cpu); } } diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 53b4c0811f4f..feddabdab448 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -11,6 +11,7 @@ #include "xen-ops.h" #include "mmu.h" +#include "pmu.h" static void xen_pv_pre_suspend(void) { @@ -67,16 +68,26 @@ static void xen_pv_post_suspend(int suspend_cancelled) void xen_arch_pre_suspend(void) { - if (xen_pv_domain()) - xen_pv_pre_suspend(); + int cpu; + + for_each_online_cpu(cpu) + xen_pmu_finish(cpu); + + if (xen_pv_domain()) + xen_pv_pre_suspend(); } void xen_arch_post_suspend(int cancelled) { - if (xen_pv_domain()) - xen_pv_post_suspend(cancelled); - else - xen_hvm_post_suspend(cancelled); + int cpu; + + if (xen_pv_domain()) + xen_pv_post_suspend(cancelled); + else + xen_hvm_post_suspend(cancelled); + + for_each_online_cpu(cpu) + xen_pmu_init(cpu); } static void xen_vcpu_notify_restore(void *data) diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index e9d4501d1f5e..167071c290b3 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -113,6 +113,7 @@ #define VIRQ_MEM_EVENT 10 /* G. (DOM0) A memory event has occured */ #define VIRQ_XC_RESERVED 11 /* G. Reserved for XenClient */ #define VIRQ_ENOMEM 12 /* G. (DOM0) Low on heap memory */ +#define VIRQ_XENPMU 13 /* PMC interrupt */ /* Architecture-specific VIRQ definitions. */ #define VIRQ_ARCH_0 16 diff --git a/include/xen/interface/xenpmu.h b/include/xen/interface/xenpmu.h index eac1b498b89f..ca42301949b5 100644 --- a/include/xen/interface/xenpmu.h +++ b/include/xen/interface/xenpmu.h @@ -56,4 +56,37 @@ struct xen_pmu_params { */ #define XENPMU_FEATURE_INTEL_BTS 1 +/* + * Shared PMU data between hypervisor and PV(H) domains. + * + * The hypervisor fills out this structure during PMU interrupt and sends an + * interrupt to appropriate VCPU. + * Architecture-independent fields of xen_pmu_data are WO for the hypervisor + * and RO for the guest but some fields in xen_pmu_arch can be writable + * by both the hypervisor and the guest (see arch-$arch/pmu.h). + */ +struct xen_pmu_data { + /* Interrupted VCPU */ + uint32_t vcpu_id; + + /* + * Physical processor on which the interrupt occurred. On non-privileged + * guests set to vcpu_id; + */ + uint32_t pcpu_id; + + /* + * Domain that was interrupted. On non-privileged guests set to + * DOMID_SELF. + * On privileged guests can be DOMID_SELF, DOMID_XEN, or, when in + * XENPMU_MODE_ALL mode, domain ID of another domain. + */ + domid_t domain_id; + + uint8_t pad[6]; + + /* Architecture-specific information */ + struct xen_pmu_arch pmu; +}; + #endif /* __XEN_PUBLIC_XENPMU_H__ */ From e27b72df01109c689062caeba1defa013b759e0e Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 10 Aug 2015 16:34:35 -0400 Subject: [PATCH 25/33] xen/PMU: Describe vendor-specific PMU registers AMD and Intel PMU register initialization and helpers that determine whether a register belongs to PMU. This and some of subsequent PMU emulation code is somewhat similar to Xen's PMU implementation. Signed-off-by: Boris Ostrovsky Reviewed-by: David Vrabel Signed-off-by: David Vrabel --- arch/x86/xen/pmu.c | 153 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 152 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index 1d1ae1b874ea..a4a6e4f04f37 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -18,6 +18,155 @@ static DEFINE_PER_CPU(struct xen_pmu_data *, xenpmu_shared); #define get_xenpmu_data() per_cpu(xenpmu_shared, smp_processor_id()) + +/* AMD PMU */ +#define F15H_NUM_COUNTERS 6 +#define F10H_NUM_COUNTERS 4 + +static __read_mostly uint32_t amd_counters_base; +static __read_mostly uint32_t amd_ctrls_base; +static __read_mostly int amd_msr_step; +static __read_mostly int k7_counters_mirrored; +static __read_mostly int amd_num_counters; + +/* Intel PMU */ +#define MSR_TYPE_COUNTER 0 +#define MSR_TYPE_CTRL 1 +#define MSR_TYPE_GLOBAL 2 +#define MSR_TYPE_ARCH_COUNTER 3 +#define MSR_TYPE_ARCH_CTRL 4 + +/* Number of general pmu registers (CPUID.EAX[0xa].EAX[8..15]) */ +#define PMU_GENERAL_NR_SHIFT 8 +#define PMU_GENERAL_NR_BITS 8 +#define PMU_GENERAL_NR_MASK (((1 << PMU_GENERAL_NR_BITS) - 1) \ + << PMU_GENERAL_NR_SHIFT) + +/* Number of fixed pmu registers (CPUID.EDX[0xa].EDX[0..4]) */ +#define PMU_FIXED_NR_SHIFT 0 +#define PMU_FIXED_NR_BITS 5 +#define PMU_FIXED_NR_MASK (((1 << PMU_FIXED_NR_BITS) - 1) \ + << PMU_FIXED_NR_SHIFT) + +/* Alias registers (0x4c1) for full-width writes to PMCs */ +#define MSR_PMC_ALIAS_MASK (~(MSR_IA32_PERFCTR0 ^ MSR_IA32_PMC0)) + +static __read_mostly int intel_num_arch_counters, intel_num_fixed_counters; + + +static void xen_pmu_arch_init(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + + switch (boot_cpu_data.x86) { + case 0x15: + amd_num_counters = F15H_NUM_COUNTERS; + amd_counters_base = MSR_F15H_PERF_CTR; + amd_ctrls_base = MSR_F15H_PERF_CTL; + amd_msr_step = 2; + k7_counters_mirrored = 1; + break; + case 0x10: + case 0x12: + case 0x14: + case 0x16: + default: + amd_num_counters = F10H_NUM_COUNTERS; + amd_counters_base = MSR_K7_PERFCTR0; + amd_ctrls_base = MSR_K7_EVNTSEL0; + amd_msr_step = 1; + k7_counters_mirrored = 0; + break; + } + } else { + uint32_t eax, ebx, ecx, edx; + + cpuid(0xa, &eax, &ebx, &ecx, &edx); + + intel_num_arch_counters = (eax & PMU_GENERAL_NR_MASK) >> + PMU_GENERAL_NR_SHIFT; + intel_num_fixed_counters = (edx & PMU_FIXED_NR_MASK) >> + PMU_FIXED_NR_SHIFT; + } +} + +static inline uint32_t get_fam15h_addr(u32 addr) +{ + switch (addr) { + case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: + return MSR_F15H_PERF_CTR + (addr - MSR_K7_PERFCTR0); + case MSR_K7_EVNTSEL0: + case MSR_K7_EVNTSEL1: + case MSR_K7_EVNTSEL2: + case MSR_K7_EVNTSEL3: + return MSR_F15H_PERF_CTL + (addr - MSR_K7_EVNTSEL0); + default: + break; + } + + return addr; +} + +static inline bool is_amd_pmu_msr(unsigned int msr) +{ + if ((msr >= MSR_F15H_PERF_CTL && + msr < MSR_F15H_PERF_CTR + (amd_num_counters * 2)) || + (msr >= MSR_K7_EVNTSEL0 && + msr < MSR_K7_PERFCTR0 + amd_num_counters)) + return true; + + return false; +} + +static int is_intel_pmu_msr(u32 msr_index, int *type, int *index) +{ + u32 msr_index_pmc; + + switch (msr_index) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + case MSR_IA32_DS_AREA: + case MSR_IA32_PEBS_ENABLE: + *type = MSR_TYPE_CTRL; + return true; + + case MSR_CORE_PERF_GLOBAL_CTRL: + case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + *type = MSR_TYPE_GLOBAL; + return true; + + default: + + if ((msr_index >= MSR_CORE_PERF_FIXED_CTR0) && + (msr_index < MSR_CORE_PERF_FIXED_CTR0 + + intel_num_fixed_counters)) { + *index = msr_index - MSR_CORE_PERF_FIXED_CTR0; + *type = MSR_TYPE_COUNTER; + return true; + } + + if ((msr_index >= MSR_P6_EVNTSEL0) && + (msr_index < MSR_P6_EVNTSEL0 + intel_num_arch_counters)) { + *index = msr_index - MSR_P6_EVNTSEL0; + *type = MSR_TYPE_ARCH_CTRL; + return true; + } + + msr_index_pmc = msr_index & MSR_PMC_ALIAS_MASK; + if ((msr_index_pmc >= MSR_IA32_PERFCTR0) && + (msr_index_pmc < MSR_IA32_PERFCTR0 + + intel_num_arch_counters)) { + *type = MSR_TYPE_ARCH_COUNTER; + *index = msr_index_pmc - MSR_IA32_PERFCTR0; + return true; + } + return false; + } +} + /* perf callbacks */ static int xen_is_in_guest(void) { @@ -141,8 +290,10 @@ void xen_pmu_init(int cpu) per_cpu(xenpmu_shared, cpu) = xenpmu_data; - if (cpu == 0) + if (cpu == 0) { perf_register_guest_info_callbacks(&xen_guest_cbs); + xen_pmu_arch_init(); + } return; From 6b08cd6328c58a2ae190c5ee03a2ffcab5ef828e Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 10 Aug 2015 16:34:36 -0400 Subject: [PATCH 26/33] xen/PMU: Intercept PMU-related MSR and APIC accesses Provide interfaces for recognizing accesses to PMU-related MSRs and LVTPC APIC and process these accesses in Xen PMU code. (The interrupt handler performs XENPMU_flush right away in the beginning since no PMU emulation is available. It will be added with a later patch). Signed-off-by: Boris Ostrovsky Reviewed-by: David Vrabel Signed-off-by: David Vrabel --- arch/x86/xen/apic.c | 5 +- arch/x86/xen/enlighten.c | 11 ++-- arch/x86/xen/pmu.c | 95 +++++++++++++++++++++++++++++++++- arch/x86/xen/pmu.h | 4 ++ include/xen/interface/xenpmu.h | 2 + 5 files changed, 109 insertions(+), 8 deletions(-) diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index d03ebfa89b9f..acda713ab5be 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -7,6 +7,7 @@ #include #include #include "xen-ops.h" +#include "pmu.h" #include "smp.h" static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) @@ -72,8 +73,10 @@ static u32 xen_apic_read(u32 reg) static void xen_apic_write(u32 reg, u32 val) { - if (reg == APIC_LVTPC) + if (reg == APIC_LVTPC) { + (void)pmu_apic_update(reg); return; + } /* Warn to see if there's any stray references */ WARN(1,"register: %x, value: %x\n", reg, val); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 19072f91a8e2..fdaba49f6759 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1031,6 +1031,9 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err) { u64 val; + if (pmu_msr_read(msr, &val, err)) + return val; + val = native_read_msr_safe(msr, err); switch (msr) { case MSR_IA32_APICBASE: @@ -1077,17 +1080,13 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) Xen console noise. */ default: - ret = native_write_msr_safe(msr, low, high); + if (!pmu_msr_write(msr, low, high, &ret)) + ret = native_write_msr_safe(msr, low, high); } return ret; } -unsigned long long xen_read_pmc(int counter) -{ - return 0; -} - void xen_setup_shared_info(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) { diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index a4a6e4f04f37..f92b908e005f 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -51,6 +51,8 @@ static __read_mostly int amd_num_counters; /* Alias registers (0x4c1) for full-width writes to PMCs */ #define MSR_PMC_ALIAS_MASK (~(MSR_IA32_PERFCTR0 ^ MSR_IA32_PMC0)) +#define INTEL_PMC_TYPE_SHIFT 30 + static __read_mostly int intel_num_arch_counters, intel_num_fixed_counters; @@ -167,6 +169,91 @@ static int is_intel_pmu_msr(u32 msr_index, int *type, int *index) } } +bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err) +{ + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (is_amd_pmu_msr(msr)) { + *val = native_read_msr_safe(msr, err); + return true; + } + } else { + int type, index; + + if (is_intel_pmu_msr(msr, &type, &index)) { + *val = native_read_msr_safe(msr, err); + return true; + } + } + + return false; +} + +bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (is_amd_pmu_msr(msr)) { + *err = native_write_msr_safe(msr, low, high); + return true; + } + } else { + int type, index; + + if (is_intel_pmu_msr(msr, &type, &index)) { + *err = native_write_msr_safe(msr, low, high); + return true; + } + } + + return false; +} + +static unsigned long long xen_amd_read_pmc(int counter) +{ + uint32_t msr; + int err; + + msr = amd_counters_base + (counter * amd_msr_step); + return native_read_msr_safe(msr, &err); +} + +static unsigned long long xen_intel_read_pmc(int counter) +{ + int err; + uint32_t msr; + + if (counter & (1<pmu.l.lapic_lvtpc = val; + ret = HYPERVISOR_xenpmu_op(XENPMU_lvtpc_set, NULL); + + return ret; +} + /* perf callbacks */ static int xen_is_in_guest(void) { @@ -239,7 +326,7 @@ static void xen_convert_regs(const struct xen_pmu_regs *xen_regs, irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id) { - int ret = IRQ_NONE; + int err, ret = IRQ_NONE; struct pt_regs regs; const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); @@ -248,6 +335,12 @@ irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id) return ret; } + err = HYPERVISOR_xenpmu_op(XENPMU_flush, NULL); + if (err) { + pr_warn_once("%s: failed hypercall, err: %d\n", __func__, err); + return ret; + } + xen_convert_regs(&xenpmu_data->pmu.r.regs, ®s, xenpmu_data->pmu.pmu_flags); if (x86_pmu.handle_irq(®s)) diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h index a76d2cf83581..af5f0ad94078 100644 --- a/arch/x86/xen/pmu.h +++ b/arch/x86/xen/pmu.h @@ -7,5 +7,9 @@ irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id); void xen_pmu_init(int cpu); void xen_pmu_finish(int cpu); bool is_xen_pmu(int cpu); +bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err); +bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err); +int pmu_apic_update(uint32_t reg); +unsigned long long xen_read_pmc(int counter); #endif /* __XEN_PMU_H */ diff --git a/include/xen/interface/xenpmu.h b/include/xen/interface/xenpmu.h index ca42301949b5..139efc91bceb 100644 --- a/include/xen/interface/xenpmu.h +++ b/include/xen/interface/xenpmu.h @@ -20,6 +20,8 @@ #define XENPMU_feature_set 3 #define XENPMU_init 4 #define XENPMU_finish 5 +#define XENPMU_lvtpc_set 6 +#define XENPMU_flush 7 /* ` } */ From bf6dfb154d935725c9a2005033ca33017b9df439 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 10 Aug 2015 16:34:37 -0400 Subject: [PATCH 27/33] xen/PMU: PMU emulation code Add PMU emulation code that runs when we are processing a PMU interrupt. This code will allow us not to trap to hypervisor on each MSR/LVTPC access (of which there may be quite a few in the handler). Signed-off-by: Boris Ostrovsky Reviewed-by: David Vrabel Signed-off-by: David Vrabel --- arch/x86/xen/pmu.c | 214 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 185 insertions(+), 29 deletions(-) diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index f92b908e005f..724a08740a04 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -13,11 +13,20 @@ /* x86_pmu.handle_irq definition */ #include "../kernel/cpu/perf_event.h" +#define XENPMU_IRQ_PROCESSING 1 +struct xenpmu { + /* Shared page between hypervisor and domain */ + struct xen_pmu_data *xenpmu_data; -/* Shared page between hypervisor and domain */ -static DEFINE_PER_CPU(struct xen_pmu_data *, xenpmu_shared); -#define get_xenpmu_data() per_cpu(xenpmu_shared, smp_processor_id()) + uint8_t flags; +}; +static DEFINE_PER_CPU(struct xenpmu, xenpmu_shared); +#define get_xenpmu_data() (this_cpu_ptr(&xenpmu_shared)->xenpmu_data) +#define get_xenpmu_flags() (this_cpu_ptr(&xenpmu_shared)->flags) +/* Macro for computing address of a PMU MSR bank */ +#define field_offset(ctxt, field) ((void *)((uintptr_t)ctxt + \ + (uintptr_t)ctxt->field)) /* AMD PMU */ #define F15H_NUM_COUNTERS 6 @@ -169,19 +178,124 @@ static int is_intel_pmu_msr(u32 msr_index, int *type, int *index) } } +static bool xen_intel_pmu_emulate(unsigned int msr, u64 *val, int type, + int index, bool is_read) +{ + uint64_t *reg = NULL; + struct xen_pmu_intel_ctxt *ctxt; + uint64_t *fix_counters; + struct xen_pmu_cntr_pair *arch_cntr_pair; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + + if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) + return false; + + ctxt = &xenpmu_data->pmu.c.intel; + + switch (msr) { + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + reg = &ctxt->global_ovf_ctrl; + break; + case MSR_CORE_PERF_GLOBAL_STATUS: + reg = &ctxt->global_status; + break; + case MSR_CORE_PERF_GLOBAL_CTRL: + reg = &ctxt->global_ctrl; + break; + case MSR_CORE_PERF_FIXED_CTR_CTRL: + reg = &ctxt->fixed_ctrl; + break; + default: + switch (type) { + case MSR_TYPE_COUNTER: + fix_counters = field_offset(ctxt, fixed_counters); + reg = &fix_counters[index]; + break; + case MSR_TYPE_ARCH_COUNTER: + arch_cntr_pair = field_offset(ctxt, arch_counters); + reg = &arch_cntr_pair[index].counter; + break; + case MSR_TYPE_ARCH_CTRL: + arch_cntr_pair = field_offset(ctxt, arch_counters); + reg = &arch_cntr_pair[index].control; + break; + default: + return false; + } + } + + if (reg) { + if (is_read) + *val = *reg; + else { + *reg = *val; + + if (msr == MSR_CORE_PERF_GLOBAL_OVF_CTRL) + ctxt->global_status &= (~(*val)); + } + return true; + } + + return false; +} + +static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read) +{ + uint64_t *reg = NULL; + int i, off = 0; + struct xen_pmu_amd_ctxt *ctxt; + uint64_t *counter_regs, *ctrl_regs; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) + return false; + + if (k7_counters_mirrored && + ((msr >= MSR_K7_EVNTSEL0) && (msr <= MSR_K7_PERFCTR3))) + msr = get_fam15h_addr(msr); + + ctxt = &xenpmu_data->pmu.c.amd; + for (i = 0; i < amd_num_counters; i++) { + if (msr == amd_ctrls_base + off) { + ctrl_regs = field_offset(ctxt, ctrls); + reg = &ctrl_regs[i]; + break; + } else if (msr == amd_counters_base + off) { + counter_regs = field_offset(ctxt, counters); + reg = &counter_regs[i]; + break; + } + off += amd_msr_step; + } + + if (reg) { + if (is_read) + *val = *reg; + else + *reg = *val; + + return true; + } + return false; +} + bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err) { - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { if (is_amd_pmu_msr(msr)) { - *val = native_read_msr_safe(msr, err); + if (!xen_amd_pmu_emulate(msr, val, 1)) + *val = native_read_msr_safe(msr, err); return true; } } else { int type, index; if (is_intel_pmu_msr(msr, &type, &index)) { - *val = native_read_msr_safe(msr, err); + if (!xen_intel_pmu_emulate(msr, val, type, index, 1)) + *val = native_read_msr_safe(msr, err); return true; } } @@ -191,16 +305,20 @@ bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err) bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err) { + uint64_t val = ((uint64_t)high << 32) | low; + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { if (is_amd_pmu_msr(msr)) { - *err = native_write_msr_safe(msr, low, high); + if (!xen_amd_pmu_emulate(msr, &val, 0)) + *err = native_write_msr_safe(msr, low, high); return true; } } else { int type, index; if (is_intel_pmu_msr(msr, &type, &index)) { - *err = native_write_msr_safe(msr, low, high); + if (!xen_intel_pmu_emulate(msr, &val, type, index, 0)) + *err = native_write_msr_safe(msr, low, high); return true; } } @@ -210,24 +328,52 @@ bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err) static unsigned long long xen_amd_read_pmc(int counter) { - uint32_t msr; - int err; + struct xen_pmu_amd_ctxt *ctxt; + uint64_t *counter_regs; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); - msr = amd_counters_base + (counter * amd_msr_step); - return native_read_msr_safe(msr, &err); + if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) { + uint32_t msr; + int err; + + msr = amd_counters_base + (counter * amd_msr_step); + return native_read_msr_safe(msr, &err); + } + + ctxt = &xenpmu_data->pmu.c.amd; + counter_regs = field_offset(ctxt, counters); + return counter_regs[counter]; } static unsigned long long xen_intel_read_pmc(int counter) { - int err; - uint32_t msr; + struct xen_pmu_intel_ctxt *ctxt; + uint64_t *fixed_counters; + struct xen_pmu_cntr_pair *arch_cntr_pair; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); - if (counter & (1<pmu.c.intel; + if (counter & (1 << INTEL_PMC_TYPE_SHIFT)) { + fixed_counters = field_offset(ctxt, fixed_counters); + return fixed_counters[counter & 0xffff]; + } + + arch_cntr_pair = field_offset(ctxt, arch_counters); + return arch_cntr_pair[counter].counter; } unsigned long long xen_read_pmc(int counter) @@ -249,6 +395,10 @@ int pmu_apic_update(uint32_t val) } xenpmu_data->pmu.l.lapic_lvtpc = val; + + if (get_xenpmu_flags() & XENPMU_IRQ_PROCESSING) + return 0; + ret = HYPERVISOR_xenpmu_op(XENPMU_lvtpc_set, NULL); return ret; @@ -329,29 +479,34 @@ irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id) int err, ret = IRQ_NONE; struct pt_regs regs; const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); if (!xenpmu_data) { pr_warn_once("%s: pmudata not initialized\n", __func__); return ret; } - err = HYPERVISOR_xenpmu_op(XENPMU_flush, NULL); - if (err) { - pr_warn_once("%s: failed hypercall, err: %d\n", __func__, err); - return ret; - } - + this_cpu_ptr(&xenpmu_shared)->flags = + xenpmu_flags | XENPMU_IRQ_PROCESSING; xen_convert_regs(&xenpmu_data->pmu.r.regs, ®s, xenpmu_data->pmu.pmu_flags); if (x86_pmu.handle_irq(®s)) ret = IRQ_HANDLED; + /* Write out cached context to HW */ + err = HYPERVISOR_xenpmu_op(XENPMU_flush, NULL); + this_cpu_ptr(&xenpmu_shared)->flags = xenpmu_flags; + if (err) { + pr_warn_once("%s: failed hypercall, err: %d\n", __func__, err); + return IRQ_NONE; + } + return ret; } bool is_xen_pmu(int cpu) { - return (per_cpu(xenpmu_shared, cpu) != NULL); + return (get_xenpmu_data() != NULL); } void xen_pmu_init(int cpu) @@ -381,7 +536,8 @@ void xen_pmu_init(int cpu) if (err) goto fail; - per_cpu(xenpmu_shared, cpu) = xenpmu_data; + per_cpu(xenpmu_shared, cpu).xenpmu_data = xenpmu_data; + per_cpu(xenpmu_shared, cpu).flags = 0; if (cpu == 0) { perf_register_guest_info_callbacks(&xen_guest_cbs); @@ -409,6 +565,6 @@ void xen_pmu_finish(int cpu) (void)HYPERVISOR_xenpmu_op(XENPMU_finish, &xp); - free_pages((unsigned long)per_cpu(xenpmu_shared, cpu), 0); - per_cpu(xenpmu_shared, cpu) = NULL; + free_pages((unsigned long)per_cpu(xenpmu_shared, cpu).xenpmu_data, 0); + per_cpu(xenpmu_shared, cpu).xenpmu_data = NULL; } From 3375d8284dfb7866f261ec008d15d30999ff273b Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 10 Aug 2015 16:34:38 -0400 Subject: [PATCH 28/33] xen/x86: Don't try to set PCE bit in CR4 Since VPMU code emulates RDPMC instruction with RDMSR and because hypervisor does not emulate it there is no reason to try setting CR4's PCE bit (and the hypervisor will warn on seeing it set). Signed-off-by: Boris Ostrovsky Signed-off-by: David Vrabel --- arch/x86/xen/enlighten.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index fdaba49f6759..25309c168311 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1011,8 +1011,7 @@ static void xen_write_cr0(unsigned long cr0) static void xen_write_cr4(unsigned long cr4) { - cr4 &= ~X86_CR4_PGE; - cr4 &= ~X86_CR4_PSE; + cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE); native_write_cr4(cr4); } From 724afaea2020f3bd98891b535f3ce5d3935bcf63 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Fri, 7 Aug 2015 17:34:34 +0100 Subject: [PATCH 29/33] arm/xen: Remove helpers which are PV specific ARM guests are always HVM. The current implementation is assuming a 1:1 mapping which is only true for DOM0 and may not be at all in the future. Furthermore, all the helpers but arbitrary_virt_to_machine are used in x86 specific code (or only compiled for). The helper arbitrary_virt_to_machine is only used in PV specific code. Therefore we should never call the function. Add a BUG() in this helper and drop all the others. Signed-off-by: Julien Grall Acked-by: Stefano Stabellini Signed-off-by: David Vrabel --- arch/arm/include/asm/xen/page.h | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/arch/arm/include/asm/xen/page.h b/arch/arm/include/asm/xen/page.h index 1bee8ca12494..98b1084f8282 100644 --- a/arch/arm/include/asm/xen/page.h +++ b/arch/arm/include/asm/xen/page.h @@ -54,26 +54,14 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) #define mfn_to_local_pfn(mfn) mfn_to_pfn(mfn) -static inline xmaddr_t phys_to_machine(xpaddr_t phys) -{ - unsigned offset = phys.paddr & ~PAGE_MASK; - return XMADDR(PFN_PHYS(pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset); -} - -static inline xpaddr_t machine_to_phys(xmaddr_t machine) -{ - unsigned offset = machine.maddr & ~PAGE_MASK; - return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset); -} /* VIRT <-> MACHINE conversion */ -#define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v)))) #define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v))) #define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) +/* Only used in PV code. But ARM guests are always HVM. */ static inline xmaddr_t arbitrary_virt_to_machine(void *vaddr) { - /* TODO: assuming it is mapped in the kernel 1:1 */ - return virt_to_machine(vaddr); + BUG(); } /* TODO: this shouldn't be here but it is because the frontend drivers From eafd72e016c69df511b14a98b61e439c58ad9c51 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 19 Aug 2015 18:52:34 +0200 Subject: [PATCH 30/33] xen: avoid early crash of memory limited dom0 Commit b1c9f169047b ("xen: split counting of extra memory pages...") introduced an error when dom0 was started with limited memory. The problem arises in case dom0 is started with initial memory and maximum memory being the same and exactly a multiple of 1 GB. The kernel must be configured without CONFIG_XEN_BALLOON_MEMORY_HOTPLUG for the problem to happen. In this case it will crash very early during boot due to the virtual mapped p2m list not being large enough to be able to remap any memory: (XEN) Freed 304kB init memory. mapping kernel into physical memory about to get started... (XEN) traps.c:459:d0v0 Unhandled invalid opcode fault/trap [#6] on VCPU 0 [ec=0000] (XEN) domain_crash_sync called from entry.S: fault at ffff82d080229a93 create_bounce_frame+0x12b/0x13a (XEN) Domain 0 (vcpu#0) crashed on cpu#0: (XEN) ----[ Xen-4.5.2-pre x86_64 debug=n Not tainted ]---- (XEN) CPU: 0 (XEN) RIP: e033:[] (XEN) RFLAGS: 0000000000000206 EM: 1 CONTEXT: pv guest (d0v0) (XEN) rax: ffffffff81db2000 rbx: 000000004d000000 rcx: 0000000000000000 (XEN) rdx: 000000004d000000 rsi: 0000000000063000 rdi: 000000004d063000 (XEN) rbp: ffffffff81c03d78 rsp: ffffffff81c03d28 r8: 0000000000023000 (XEN) r9: 00000001040ff000 r10: 0000000000007ff0 r11: 0000000000000000 (XEN) r12: 0000000000063000 r13: 000000000004d000 r14: 0000000000000063 (XEN) r15: 0000000000000063 cr0: 0000000080050033 cr4: 00000000000006f0 (XEN) cr3: 0000000105c0f000 cr2: ffffc90000268000 (XEN) ds: 0000 es: 0000 fs: 0000 gs: 0000 ss: e02b cs: e033 (XEN) Guest stack trace from rsp=ffffffff81c03d28: (XEN) 0000000000000000 0000000000000000 ffffffff81d120cb 000000010000e030 (XEN) 0000000000010006 ffffffff81c03d68 000000000000e02b ffffffffffffffff (XEN) 0000000000000063 000000000004d063 ffffffff81c03de8 ffffffff81d130a7 (XEN) ffffffff81c03de8 000000000004d000 00000001040ff000 0000000000105db1 (XEN) 00000001040ff001 000000000004d062 ffff8800092d6ff8 0000000002027000 (XEN) ffff8800094d8340 ffff8800092d6ff8 00003ffffffff000 ffff8800092d7ff8 (XEN) ffffffff81c03e48 ffffffff81d13c43 ffff8800094d8000 ffff8800094d9000 (XEN) 0000000000000000 ffff8800092d6000 00000000092d6000 000000004cfbf000 (XEN) 00000000092d6000 00000000052d5442 0000000000000000 0000000000000000 (XEN) ffffffff81c03ed8 ffffffff81d185c1 0000000000000000 0000000000000000 (XEN) ffffffff81c03e78 ffffffff810f8ca4 ffffffff81c03ed8 ffffffff8171a15d (XEN) 0000000000000010 ffffffff81c03ee8 0000000000000000 0000000000000000 (XEN) ffffffff81f0e402 ffffffffffffffff ffffffff81dae900 0000000000000000 (XEN) 0000000000000000 0000000000000000 ffffffff81c03f28 ffffffff81d0cf0f (XEN) 0000000000000000 0000000000000000 0000000000000000 ffffffff81db82e0 (XEN) 0000000000000000 0000000000000000 0000000000000000 0000000000000000 (XEN) ffffffff81c03f38 ffffffff81d0c603 ffffffff81c03ff8 ffffffff81d11c86 (XEN) 0300000100000032 0000000000000005 0000000000000020 0000000000000000 (XEN) 0000000000000000 0000000000000000 0000000000000000 0000000000000000 (XEN) 0000000000000000 0000000000000000 0000000000000000 0000000000000000 (XEN) Domain 0 crashed: rebooting machine in 5 seconds. This can be avoided by allocating aneough space for the p2m to cover the maximum memory of dom0 plus the identity mapped holes required for PCI space, BIOS etc. Reported-by: Boris Ostrovsky Signed-off-by: Juergen Gross Signed-off-by: David Vrabel --- arch/x86/xen/setup.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index a1a77eabe858..ead0d363bfba 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -789,11 +789,12 @@ char * __init xen_memory_setup(void) &xen_e820_map_entries); max_pages = xen_get_max_pages(); - if (max_pages > max_pfn) - extra_pages += max_pages - max_pfn; /* How many extra pages do we need due to remapping? */ - extra_pages += xen_count_remap_pages(max_pfn); + max_pages += xen_count_remap_pages(max_pfn); + + if (max_pages > max_pfn) + extra_pages += max_pages - max_pfn; /* * Clamp the amount of extra memory to a EXTRA_MEM_RATIO From ab24507cfae8d916814bb6c16f66e453184a29a5 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 19 Aug 2015 18:53:11 +0200 Subject: [PATCH 31/33] xen: avoid another early crash of memory limited dom0 Commit b1c9f169047b ("xen: split counting of extra memory pages...") introduced an error when dom0 was started with limited memory occurring only on some hardware. The problem arises in case dom0 is started with initial memory and maximum memory being the same. The kernel must be configured without CONFIG_XEN_BALLOON_MEMORY_HOTPLUG for the problem to happen. If all of this is true and the E820 map of the machine is sparse (some areas are not covered) then the machine might crash early in the boot process. An example E820 map triggering the problem looks like this: [ 0.000000] e820: BIOS-provided physical RAM map: [ 0.000000] BIOS-e820: [mem 0x0000000000000000-0x000000000009d7ff] usable [ 0.000000] BIOS-e820: [mem 0x000000000009d800-0x000000000009ffff] reserved [ 0.000000] BIOS-e820: [mem 0x00000000000e0000-0x00000000000fffff] reserved [ 0.000000] BIOS-e820: [mem 0x0000000000100000-0x00000000cf7fafff] usable [ 0.000000] BIOS-e820: [mem 0x00000000cf7fb000-0x00000000cf95ffff] reserved [ 0.000000] BIOS-e820: [mem 0x00000000cf960000-0x00000000cfb62fff] ACPI NVS [ 0.000000] BIOS-e820: [mem 0x00000000cfb63000-0x00000000cfd14fff] usable [ 0.000000] BIOS-e820: [mem 0x00000000cfd15000-0x00000000cfd61fff] ACPI NVS [ 0.000000] BIOS-e820: [mem 0x00000000cfd62000-0x00000000cfd6cfff] ACPI data [ 0.000000] BIOS-e820: [mem 0x00000000cfd6d000-0x00000000cfd6ffff] ACPI NVS [ 0.000000] BIOS-e820: [mem 0x00000000cfd70000-0x00000000cfd70fff] usable [ 0.000000] BIOS-e820: [mem 0x00000000cfd71000-0x00000000cfea8fff] reserved [ 0.000000] BIOS-e820: [mem 0x00000000cfea9000-0x00000000cfeb9fff] ACPI NVS [ 0.000000] BIOS-e820: [mem 0x00000000cfeba000-0x00000000cfecafff] reserved [ 0.000000] BIOS-e820: [mem 0x00000000cfecb000-0x00000000cfecbfff] ACPI NVS [ 0.000000] BIOS-e820: [mem 0x00000000cfecc000-0x00000000cfedbfff] reserved [ 0.000000] BIOS-e820: [mem 0x00000000cfedc000-0x00000000cfedcfff] ACPI NVS [ 0.000000] BIOS-e820: [mem 0x00000000cfedd000-0x00000000cfeddfff] reserved [ 0.000000] BIOS-e820: [mem 0x00000000cfede000-0x00000000cfee3fff] ACPI NVS [ 0.000000] BIOS-e820: [mem 0x00000000cfee4000-0x00000000cfef6fff] reserved [ 0.000000] BIOS-e820: [mem 0x00000000cfef7000-0x00000000cfefffff] usable [ 0.000000] BIOS-e820: [mem 0x00000000e0000000-0x00000000efffffff] reserved [ 0.000000] BIOS-e820: [mem 0x00000000fec00000-0x00000000fec00fff] reserved [ 0.000000] BIOS-e820: [mem 0x00000000fec10000-0x00000000fec10fff] reserved [ 0.000000] BIOS-e820: [mem 0x00000000fed00000-0x00000000fed00fff] reserved [ 0.000000] BIOS-e820: [mem 0x00000000fed40000-0x00000000fed44fff] reserved [ 0.000000] BIOS-e820: [mem 0x00000000fed61000-0x00000000fed70fff] reserved [ 0.000000] BIOS-e820: [mem 0x00000000fed80000-0x00000000fed8ffff] reserved [ 0.000000] BIOS-e820: [mem 0x00000000ff000000-0x00000000ffffffff] reserved [ 0.000000] BIOS-e820: [mem 0x0000000100001000-0x000000020effffff] usable In this case the area a0000-dffff isn't present in the map. This will confuse the memory setup of the domain when remapping the memory from such holes to populated areas. To avoid the problem the accounting of to be remapped memory has to count such holes in the E820 map as well. Reported-by: Boris Ostrovsky Signed-off-by: Juergen Gross Signed-off-by: David Vrabel --- arch/x86/xen/setup.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index ead0d363bfba..7a5d5666677f 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -593,20 +593,27 @@ static void __init xen_ignore_unusable(void) static unsigned long __init xen_count_remap_pages(unsigned long max_pfn) { unsigned long extra = 0; + unsigned long start_pfn, end_pfn; const struct e820entry *entry = xen_e820_map; int i; + end_pfn = 0; for (i = 0; i < xen_e820_map_entries; i++, entry++) { - unsigned long start_pfn = PFN_DOWN(entry->addr); - unsigned long end_pfn = PFN_UP(entry->addr + entry->size); + start_pfn = PFN_DOWN(entry->addr); + /* Adjacent regions on non-page boundaries handling! */ + end_pfn = min(end_pfn, start_pfn); if (start_pfn >= max_pfn) - break; - if (entry->type == E820_RAM) - continue; - if (end_pfn >= max_pfn) - end_pfn = max_pfn; - extra += end_pfn - start_pfn; + return extra + max_pfn - end_pfn; + + /* Add any holes in map to result. */ + extra += start_pfn - end_pfn; + + end_pfn = PFN_UP(entry->addr + entry->size); + end_pfn = min(end_pfn, max_pfn); + + if (entry->type != E820_RAM) + extra += end_pfn - start_pfn; } return extra; From cb9e444b5aaa900bb4310da411315b6947c53e37 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 4 Sep 2015 14:18:08 +0200 Subject: [PATCH 32/33] xen: limit memory to architectural maximum MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a pv-domain (including dom0) is started it tries to size it's p2m list according to the maximum possible memory amount it ever can achieve. Limit the initial maximum memory size to the architectural limit of the hardware in order to avoid overflows during remapping of memory. This problem will occur when dom0 is started with an initial memory size being a multiple of 1GB, but without specifying it's maximum memory size. The kernel must be configured without CONFIG_XEN_BALLOON_MEMORY_HOTPLUG for the problem to happen. Reported-by: Roger Pau Monné Tested-by: Roger Pau Monné Signed-off-by: Juergen Gross Signed-off-by: David Vrabel --- arch/x86/xen/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 7a5d5666677f..70de4c8b8f27 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -531,7 +531,7 @@ static unsigned long __init xen_get_pages_limit(void) #ifdef CONFIG_X86_32 limit = GB(64) / PAGE_SIZE; #else - limit = ~0ul; + limit = MAXMEM / PAGE_SIZE; if (!xen_initial_domain() && xen_512gb_limit) limit = GB(512) / PAGE_SIZE; #endif From 626d7508664c4bc8e67f496da4387ecd0c410b8c Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 4 Sep 2015 14:05:51 +0200 Subject: [PATCH 33/33] xen: switch extra memory accounting to use pfns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of using physical addresses for accounting of extra memory areas available for ballooning switch to pfns as this is much less error prone regarding partial pages. Reported-by: Roger Pau Monné Tested-by: Roger Pau Monné Signed-off-by: Juergen Gross Signed-off-by: David Vrabel --- arch/x86/xen/setup.c | 80 ++++++++++++++++++++++++------------------- drivers/xen/balloon.c | 6 ++-- include/xen/page.h | 4 +-- 3 files changed, 49 insertions(+), 41 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 70de4c8b8f27..f5ef6746d47a 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -90,62 +90,69 @@ static void __init xen_parse_512gb(void) xen_512gb_limit = val; } -static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size) +static void __init xen_add_extra_mem(unsigned long start_pfn, + unsigned long n_pfns) { int i; + /* + * No need to check for zero size, should happen rarely and will only + * write a new entry regarded to be unused due to zero size. + */ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { /* Add new region. */ - if (xen_extra_mem[i].size == 0) { - xen_extra_mem[i].start = start; - xen_extra_mem[i].size = size; + if (xen_extra_mem[i].n_pfns == 0) { + xen_extra_mem[i].start_pfn = start_pfn; + xen_extra_mem[i].n_pfns = n_pfns; break; } /* Append to existing region. */ - if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) { - xen_extra_mem[i].size += size; + if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns == + start_pfn) { + xen_extra_mem[i].n_pfns += n_pfns; break; } } if (i == XEN_EXTRA_MEM_MAX_REGIONS) printk(KERN_WARNING "Warning: not enough extra memory regions\n"); - memblock_reserve(start, size); + memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); } -static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size) +static void __init xen_del_extra_mem(unsigned long start_pfn, + unsigned long n_pfns) { int i; - phys_addr_t start_r, size_r; + unsigned long start_r, size_r; for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { - start_r = xen_extra_mem[i].start; - size_r = xen_extra_mem[i].size; + start_r = xen_extra_mem[i].start_pfn; + size_r = xen_extra_mem[i].n_pfns; /* Start of region. */ - if (start_r == start) { - BUG_ON(size > size_r); - xen_extra_mem[i].start += size; - xen_extra_mem[i].size -= size; + if (start_r == start_pfn) { + BUG_ON(n_pfns > size_r); + xen_extra_mem[i].start_pfn += n_pfns; + xen_extra_mem[i].n_pfns -= n_pfns; break; } /* End of region. */ - if (start_r + size_r == start + size) { - BUG_ON(size > size_r); - xen_extra_mem[i].size -= size; + if (start_r + size_r == start_pfn + n_pfns) { + BUG_ON(n_pfns > size_r); + xen_extra_mem[i].n_pfns -= n_pfns; break; } /* Mid of region. */ - if (start > start_r && start < start_r + size_r) { - BUG_ON(start + size > start_r + size_r); - xen_extra_mem[i].size = start - start_r; + if (start_pfn > start_r && start_pfn < start_r + size_r) { + BUG_ON(start_pfn + n_pfns > start_r + size_r); + xen_extra_mem[i].n_pfns = start_pfn - start_r; /* Calling memblock_reserve() again is okay. */ - xen_add_extra_mem(start + size, start_r + size_r - - (start + size)); + xen_add_extra_mem(start_pfn + n_pfns, start_r + size_r - + (start_pfn + n_pfns)); break; } } - memblock_free(start, size); + memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); } /* @@ -156,11 +163,10 @@ static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size) unsigned long __ref xen_chk_extra_mem(unsigned long pfn) { int i; - phys_addr_t addr = PFN_PHYS(pfn); for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { - if (addr >= xen_extra_mem[i].start && - addr < xen_extra_mem[i].start + xen_extra_mem[i].size) + if (pfn >= xen_extra_mem[i].start_pfn && + pfn < xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns) return INVALID_P2M_ENTRY; } @@ -176,10 +182,10 @@ void __init xen_inv_extra_mem(void) int i; for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { - if (!xen_extra_mem[i].size) + if (!xen_extra_mem[i].n_pfns) continue; - pfn_s = PFN_DOWN(xen_extra_mem[i].start); - pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size); + pfn_s = xen_extra_mem[i].start_pfn; + pfn_e = pfn_s + xen_extra_mem[i].n_pfns; for (pfn = pfn_s; pfn < pfn_e; pfn++) set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } @@ -507,7 +513,7 @@ void __init xen_remap_memory(void) } else if (pfn_s + len == xen_remap_buf.target_pfn) { len += xen_remap_buf.size; } else { - xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len)); + xen_del_extra_mem(pfn_s, len); pfn_s = xen_remap_buf.target_pfn; len = xen_remap_buf.size; } @@ -517,7 +523,7 @@ void __init xen_remap_memory(void) } if (pfn_s != ~0UL && len) - xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len)); + xen_del_extra_mem(pfn_s, len); set_pte_mfn(buf, mfn_save, PAGE_KERNEL); @@ -744,7 +750,7 @@ static void __init xen_reserve_xen_mfnlist(void) **/ char * __init xen_memory_setup(void) { - unsigned long max_pfn; + unsigned long max_pfn, pfn_s, n_pfns; phys_addr_t mem_end, addr, size, chunk_size; u32 type; int rc; @@ -831,9 +837,11 @@ char * __init xen_memory_setup(void) chunk_size = min(size, mem_end - addr); } else if (extra_pages) { chunk_size = min(size, PFN_PHYS(extra_pages)); - extra_pages -= PFN_DOWN(chunk_size); - xen_add_extra_mem(addr, chunk_size); - xen_max_p2m_pfn = PFN_DOWN(addr + chunk_size); + pfn_s = PFN_UP(addr); + n_pfns = PFN_DOWN(addr + chunk_size) - pfn_s; + extra_pages -= n_pfns; + xen_add_extra_mem(pfn_s, n_pfns); + xen_max_p2m_pfn = pfn_s + n_pfns; } else type = E820_UNUSABLE; } diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index bf4a23c7c591..1fa633b2d556 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -638,9 +638,9 @@ static int __init balloon_init(void) * regions (see arch/x86/xen/setup.c). */ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) - if (xen_extra_mem[i].size) - balloon_add_region(PFN_UP(xen_extra_mem[i].start), - PFN_DOWN(xen_extra_mem[i].size)); + if (xen_extra_mem[i].n_pfns) + balloon_add_region(xen_extra_mem[i].start_pfn, + xen_extra_mem[i].n_pfns); return 0; } diff --git a/include/xen/page.h b/include/xen/page.h index c5ed20bb3fe9..a5983da2f5cd 100644 --- a/include/xen/page.h +++ b/include/xen/page.h @@ -9,8 +9,8 @@ static inline unsigned long page_to_mfn(struct page *page) } struct xen_memory_region { - phys_addr_t start; - phys_addr_t size; + unsigned long start_pfn; + unsigned long n_pfns; }; #define XEN_EXTRA_MEM_MAX_REGIONS 128 /* == E820MAX */