mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2025-01-13 03:16:04 +07:00
5388a5b821
machine_crash_nonpanic_core() does this: while (1) cpu_relax(); because the kernel has crashed, and we have no known safe way to deal with the CPU. So, we place the CPU into an infinite loop which we expect it to never exit - at least not until the system as a whole is reset by some method. In the absence of erratum 754327, this code assembles to: b . In other words, an infinite loop. When erratum 754327 is enabled, this becomes: 1: dmb b 1b It has been observed that on some systems (eg, OMAP4) where, if a crash is triggered, the system tries to kexec into the panic kernel, but fails after taking the secondary CPU down - placing it into one of these loops. This causes the system to livelock, and the most noticable effect is the system stops after issuing: Loading crashdump kernel... to the system console. The tested as working solution I came up with was to add wfe() to these infinite loops thusly: while (1) { cpu_relax(); wfe(); } which, without 754327 builds to: 1: wfe b 1b or with 754327 is enabled: 1: dmb wfe b 1b Adding "wfe" does two things depending on the environment we're running under: - where we're running on bare metal, and the processor implements "wfe", it stops us spinning endlessly in a loop where we're never going to do any useful work. - if we're running in a VM, it allows the CPU to be given back to the hypervisor and rescheduled for other purposes (maybe a different VM) rather than wasting CPU cycles inside a crashed VM. However, in light of erratum 794072, Will Deacon wanted to see 10 nops as well - which is reasonable to cover the case where we have erratum 754327 enabled _and_ we have a processor that doesn't implement the wfe hint. So, we now end up with: 1: wfe b 1b when erratum 754327 is disabled, or: 1: dmb nop nop nop nop nop nop nop nop nop nop wfe b 1b when erratum 754327 is enabled. We also get the dmb + 10 nop sequence elsewhere in the kernel, in terminating loops. This is reasonable - it means we get the workaround for erratum 794072 when erratum 754327 is enabled, but still relinquish the dead processor - either by placing it in a lower power mode when wfe is implemented as such or by returning it to the hypervisior, or in the case where wfe is a no-op, we use the workaround specified in erratum 794072 to avoid the problem. These as two entirely orthogonal problems - the 10 nops addresses erratum 794072, and the wfe is an optimisation that makes the system more efficient when crashed either in terms of power consumption or by allowing the host/other VMs to make use of the CPU. I don't see any reason not to use kexec() inside a VM - it has the potential to provide automated recovery from a failure of the VMs kernel with the opportunity for saving a crashdump of the failure. A panic() with a reboot timeout won't do that, and reading the libvirt documentation, setting on_reboot to "preserve" won't either (the documentation states "The preserve action for an on_reboot event is treated as a destroy".) Surely it has to be a good thing to avoiding having CPUs spinning inside a VM that is doing no useful work. Acked-by: Will Deacon <will.deacon@arm.com> Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
208 lines
5.0 KiB
C
208 lines
5.0 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* machine_kexec.c - handle transition of Linux booting another kernel
|
|
*/
|
|
|
|
#include <linux/mm.h>
|
|
#include <linux/kexec.h>
|
|
#include <linux/delay.h>
|
|
#include <linux/reboot.h>
|
|
#include <linux/io.h>
|
|
#include <linux/irq.h>
|
|
#include <linux/memblock.h>
|
|
#include <asm/pgtable.h>
|
|
#include <linux/of_fdt.h>
|
|
#include <asm/pgalloc.h>
|
|
#include <asm/mmu_context.h>
|
|
#include <asm/cacheflush.h>
|
|
#include <asm/fncpy.h>
|
|
#include <asm/mach-types.h>
|
|
#include <asm/smp_plat.h>
|
|
#include <asm/system_misc.h>
|
|
#include <asm/set_memory.h>
|
|
|
|
extern void relocate_new_kernel(void);
|
|
extern const unsigned int relocate_new_kernel_size;
|
|
|
|
extern unsigned long kexec_start_address;
|
|
extern unsigned long kexec_indirection_page;
|
|
extern unsigned long kexec_mach_type;
|
|
extern unsigned long kexec_boot_atags;
|
|
|
|
static atomic_t waiting_for_crash_ipi;
|
|
|
|
/*
|
|
* Provide a dummy crash_notes definition while crash dump arrives to arm.
|
|
* This prevents breakage of crash_notes attribute in kernel/ksysfs.c.
|
|
*/
|
|
|
|
int machine_kexec_prepare(struct kimage *image)
|
|
{
|
|
struct kexec_segment *current_segment;
|
|
__be32 header;
|
|
int i, err;
|
|
|
|
image->arch.kernel_r2 = image->start - KEXEC_ARM_ZIMAGE_OFFSET
|
|
+ KEXEC_ARM_ATAGS_OFFSET;
|
|
|
|
/*
|
|
* Validate that if the current HW supports SMP, then the SW supports
|
|
* and implements CPU hotplug for the current HW. If not, we won't be
|
|
* able to kexec reliably, so fail the prepare operation.
|
|
*/
|
|
if (num_possible_cpus() > 1 && platform_can_secondary_boot() &&
|
|
!platform_can_cpu_hotplug())
|
|
return -EINVAL;
|
|
|
|
/*
|
|
* No segment at default ATAGs address. try to locate
|
|
* a dtb using magic.
|
|
*/
|
|
for (i = 0; i < image->nr_segments; i++) {
|
|
current_segment = &image->segment[i];
|
|
|
|
if (!memblock_is_region_memory(idmap_to_phys(current_segment->mem),
|
|
current_segment->memsz))
|
|
return -EINVAL;
|
|
|
|
err = get_user(header, (__be32*)current_segment->buf);
|
|
if (err)
|
|
return err;
|
|
|
|
if (header == cpu_to_be32(OF_DT_HEADER))
|
|
image->arch.kernel_r2 = current_segment->mem;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
void machine_kexec_cleanup(struct kimage *image)
|
|
{
|
|
}
|
|
|
|
void machine_crash_nonpanic_core(void *unused)
|
|
{
|
|
struct pt_regs regs;
|
|
|
|
crash_setup_regs(®s, get_irq_regs());
|
|
printk(KERN_DEBUG "CPU %u will stop doing anything useful since another CPU has crashed\n",
|
|
smp_processor_id());
|
|
crash_save_cpu(®s, smp_processor_id());
|
|
flush_cache_all();
|
|
|
|
set_cpu_online(smp_processor_id(), false);
|
|
atomic_dec(&waiting_for_crash_ipi);
|
|
|
|
while (1) {
|
|
cpu_relax();
|
|
wfe();
|
|
}
|
|
}
|
|
|
|
void crash_smp_send_stop(void)
|
|
{
|
|
static int cpus_stopped;
|
|
unsigned long msecs;
|
|
|
|
if (cpus_stopped)
|
|
return;
|
|
|
|
atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
|
|
smp_call_function(machine_crash_nonpanic_core, NULL, false);
|
|
msecs = 1000; /* Wait at most a second for the other cpus to stop */
|
|
while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
|
|
mdelay(1);
|
|
msecs--;
|
|
}
|
|
if (atomic_read(&waiting_for_crash_ipi) > 0)
|
|
pr_warn("Non-crashing CPUs did not react to IPI\n");
|
|
|
|
cpus_stopped = 1;
|
|
}
|
|
|
|
static void machine_kexec_mask_interrupts(void)
|
|
{
|
|
unsigned int i;
|
|
struct irq_desc *desc;
|
|
|
|
for_each_irq_desc(i, desc) {
|
|
struct irq_chip *chip;
|
|
|
|
chip = irq_desc_get_chip(desc);
|
|
if (!chip)
|
|
continue;
|
|
|
|
if (chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data))
|
|
chip->irq_eoi(&desc->irq_data);
|
|
|
|
if (chip->irq_mask)
|
|
chip->irq_mask(&desc->irq_data);
|
|
|
|
if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
|
|
chip->irq_disable(&desc->irq_data);
|
|
}
|
|
}
|
|
|
|
void machine_crash_shutdown(struct pt_regs *regs)
|
|
{
|
|
local_irq_disable();
|
|
crash_smp_send_stop();
|
|
|
|
crash_save_cpu(regs, smp_processor_id());
|
|
machine_kexec_mask_interrupts();
|
|
|
|
pr_info("Loading crashdump kernel...\n");
|
|
}
|
|
|
|
/*
|
|
* Function pointer to optional machine-specific reinitialization
|
|
*/
|
|
void (*kexec_reinit)(void);
|
|
|
|
void machine_kexec(struct kimage *image)
|
|
{
|
|
unsigned long page_list, reboot_entry_phys;
|
|
void (*reboot_entry)(void);
|
|
void *reboot_code_buffer;
|
|
|
|
/*
|
|
* This can only happen if machine_shutdown() failed to disable some
|
|
* CPU, and that can only happen if the checks in
|
|
* machine_kexec_prepare() were not correct. If this fails, we can't
|
|
* reliably kexec anyway, so BUG_ON is appropriate.
|
|
*/
|
|
BUG_ON(num_online_cpus() > 1);
|
|
|
|
page_list = image->head & PAGE_MASK;
|
|
|
|
reboot_code_buffer = page_address(image->control_code_page);
|
|
|
|
/* Prepare parameters for reboot_code_buffer*/
|
|
set_kernel_text_rw();
|
|
kexec_start_address = image->start;
|
|
kexec_indirection_page = page_list;
|
|
kexec_mach_type = machine_arch_type;
|
|
kexec_boot_atags = image->arch.kernel_r2;
|
|
|
|
/* copy our kernel relocation code to the control code page */
|
|
reboot_entry = fncpy(reboot_code_buffer,
|
|
&relocate_new_kernel,
|
|
relocate_new_kernel_size);
|
|
|
|
/* get the identity mapping physical address for the reboot code */
|
|
reboot_entry_phys = virt_to_idmap(reboot_entry);
|
|
|
|
pr_info("Bye!\n");
|
|
|
|
if (kexec_reinit)
|
|
kexec_reinit();
|
|
|
|
soft_restart(reboot_entry_phys);
|
|
}
|
|
|
|
void arch_crash_save_vmcoreinfo(void)
|
|
{
|
|
#ifdef CONFIG_ARM_LPAE
|
|
VMCOREINFO_CONFIG(ARM_LPAE);
|
|
#endif
|
|
}
|