mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-28 11:18:45 +07:00
x86/mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception table entries
Extend the severity checking code to add a new context IN_KERNEL_RECOV which is used to indicate that the machine check was triggered by code in the kernel tagged with _ASM_EXTABLE_FAULT() so that the ex_handler_fault() handler will provide the fixup code with the trap number.

Major re-work to the tail code in do_machine_check() to make all this readable/maintainable. One functional change is that tolerant=3 no longer stops recovery actions. Revert to only skipping sending SIGBUS to the current process.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/89d243d05a7943bb187d1074bb30d9c4f482d5f5.1455732970.git.tony.luck@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
parent
548acf1923
commit
b2f9d678e2
@ -14,6 +14,7 @@
|
|||||||
#include <linux/init.h>
|
#include <linux/init.h>
|
||||||
#include <linux/debugfs.h>
|
#include <linux/debugfs.h>
|
||||||
#include <asm/mce.h>
|
#include <asm/mce.h>
|
||||||
|
#include <asm/uaccess.h>
|
||||||
|
|
||||||
#include "mce-internal.h"
|
#include "mce-internal.h"
|
||||||
|
|
||||||
@ -29,7 +30,7 @@
|
|||||||
* panic situations)
|
* panic situations)
|
||||||
*/
|
*/
|
||||||
|
|
||||||
enum context { IN_KERNEL = 1, IN_USER = 2 };
|
enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 };
|
||||||
enum ser { SER_REQUIRED = 1, NO_SER = 2 };
|
enum ser { SER_REQUIRED = 1, NO_SER = 2 };
|
||||||
enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };
|
enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };
|
||||||
|
|
||||||
@ -48,6 +49,7 @@ static struct severity {
|
|||||||
#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
|
#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
|
||||||
#define KERNEL .context = IN_KERNEL
|
#define KERNEL .context = IN_KERNEL
|
||||||
#define USER .context = IN_USER
|
#define USER .context = IN_USER
|
||||||
|
#define KERNEL_RECOV .context = IN_KERNEL_RECOV
|
||||||
#define SER .ser = SER_REQUIRED
|
#define SER .ser = SER_REQUIRED
|
||||||
#define NOSER .ser = NO_SER
|
#define NOSER .ser = NO_SER
|
||||||
#define EXCP .excp = EXCP_CONTEXT
|
#define EXCP .excp = EXCP_CONTEXT
|
||||||
@ -86,6 +88,10 @@ static struct severity {
|
|||||||
PANIC, "In kernel and no restart IP",
|
PANIC, "In kernel and no restart IP",
|
||||||
EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
|
EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
|
||||||
),
|
),
|
||||||
|
MCESEV(
|
||||||
|
PANIC, "In kernel and no restart IP",
|
||||||
|
EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0)
|
||||||
|
),
|
||||||
MCESEV(
|
MCESEV(
|
||||||
DEFERRED, "Deferred error",
|
DEFERRED, "Deferred error",
|
||||||
NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
|
NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
|
||||||
@ -122,6 +128,11 @@ static struct severity {
|
|||||||
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR),
|
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR),
|
||||||
MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
|
MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
|
||||||
),
|
),
|
||||||
|
MCESEV(
|
||||||
|
AR, "Action required: data load in error recoverable area of kernel",
|
||||||
|
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
|
||||||
|
KERNEL_RECOV
|
||||||
|
),
|
||||||
MCESEV(
|
MCESEV(
|
||||||
AR, "Action required: data load error in a user process",
|
AR, "Action required: data load error in a user process",
|
||||||
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
|
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
|
||||||
@ -170,6 +181,9 @@ static struct severity {
|
|||||||
) /* always matches. keep at end */
|
) /* always matches. keep at end */
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
 * Evaluates to non-zero only when both MCG_STATUS_RIPV and MCG_STATUS_EIPV
 * are set in @mcg; any other combination of the two bits yields zero.
 * @mcg is evaluated exactly once.
 */
#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \
				(MCG_STATUS_RIPV|MCG_STATUS_EIPV))
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If mcgstatus indicated that ip/cs on the stack were
|
* If mcgstatus indicated that ip/cs on the stack were
|
||||||
* no good, then "m->cs" will be zero and we will have
|
* no good, then "m->cs" will be zero and we will have
|
||||||
@ -183,7 +197,11 @@ static struct severity {
|
|||||||
*/
|
*/
|
||||||
static int error_context(struct mce *m)
|
static int error_context(struct mce *m)
|
||||||
{
|
{
|
||||||
return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
|
if ((m->cs & 3) == 3)
|
||||||
|
return IN_USER;
|
||||||
|
if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip))
|
||||||
|
return IN_KERNEL_RECOV;
|
||||||
|
return IN_KERNEL;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -961,6 +961,20 @@ static void mce_clear_state(unsigned long *toclear)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int do_memory_failure(struct mce *m)
|
||||||
|
{
|
||||||
|
int flags = MF_ACTION_REQUIRED;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
|
||||||
|
if (!(m->mcgstatus & MCG_STATUS_RIPV))
|
||||||
|
flags |= MF_MUST_KILL;
|
||||||
|
ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags);
|
||||||
|
if (ret)
|
||||||
|
pr_err("Memory error not recovered");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The actual machine check handler. This only handles real
|
* The actual machine check handler. This only handles real
|
||||||
* exceptions when something got corrupted coming in through int 18.
|
* exceptions when something got corrupted coming in through int 18.
|
||||||
@ -998,8 +1012,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
|
|||||||
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
|
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
|
||||||
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
|
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
|
||||||
char *msg = "Unknown";
|
char *msg = "Unknown";
|
||||||
u64 recover_paddr = ~0ull;
|
|
||||||
int flags = MF_ACTION_REQUIRED;
|
|
||||||
int lmce = 0;
|
int lmce = 0;
|
||||||
|
|
||||||
/* If this CPU is offline, just bail out. */
|
/* If this CPU is offline, just bail out. */
|
||||||
@ -1136,22 +1148,13 @@ void do_machine_check(struct pt_regs *regs, long error_code)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* At insane "tolerant" levels we take no action. Otherwise
|
* If tolerant is at an insane level we drop requests to kill
|
||||||
* we only die if we have no other choice. For less serious
|
* processes and continue even when there is no way out.
|
||||||
* issues we try to recover, or limit damage to the current
|
|
||||||
* process.
|
|
||||||
*/
|
*/
|
||||||
if (cfg->tolerant < 3) {
|
if (cfg->tolerant == 3)
|
||||||
if (no_way_out)
|
kill_it = 0;
|
||||||
|
else if (no_way_out)
|
||||||
mce_panic("Fatal machine check on current CPU", &m, msg);
|
mce_panic("Fatal machine check on current CPU", &m, msg);
|
||||||
if (worst == MCE_AR_SEVERITY) {
|
|
||||||
recover_paddr = m.addr;
|
|
||||||
if (!(m.mcgstatus & MCG_STATUS_RIPV))
|
|
||||||
flags |= MF_MUST_KILL;
|
|
||||||
} else if (kill_it) {
|
|
||||||
force_sig(SIGBUS, current);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (worst > 0)
|
if (worst > 0)
|
||||||
mce_report_event(regs);
|
mce_report_event(regs);
|
||||||
@ -1159,25 +1162,24 @@ void do_machine_check(struct pt_regs *regs, long error_code)
|
|||||||
out:
|
out:
|
||||||
sync_core();
|
sync_core();
|
||||||
|
|
||||||
if (recover_paddr == ~0ull)
|
if (worst != MCE_AR_SEVERITY && !kill_it)
|
||||||
goto done;
|
goto out_ist;
|
||||||
|
|
||||||
pr_err("Uncorrected hardware memory error in user-access at %llx",
|
/* Fault was in user mode and we need to take some action */
|
||||||
recover_paddr);
|
if ((m.cs & 3) == 3) {
|
||||||
/*
|
|
||||||
* We must call memory_failure() here even if the current process is
|
|
||||||
* doomed. We still need to mark the page as poisoned and alert any
|
|
||||||
* other users of the page.
|
|
||||||
*/
|
|
||||||
ist_begin_non_atomic(regs);
|
ist_begin_non_atomic(regs);
|
||||||
local_irq_enable();
|
local_irq_enable();
|
||||||
if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
|
|
||||||
pr_err("Memory error not recovered");
|
if (kill_it || do_memory_failure(&m))
|
||||||
force_sig(SIGBUS, current);
|
force_sig(SIGBUS, current);
|
||||||
}
|
|
||||||
local_irq_disable();
|
local_irq_disable();
|
||||||
ist_end_non_atomic();
|
ist_end_non_atomic();
|
||||||
done:
|
} else {
|
||||||
|
if (!fixup_exception(regs, X86_TRAP_MC))
|
||||||
|
mce_panic("Failed kernel mode recovery", &m, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
out_ist:
|
||||||
ist_exit(regs);
|
ist_exit(regs);
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL_GPL(do_machine_check);
|
EXPORT_SYMBOL_GPL(do_machine_check);
|
||||||
|
Loading…
Reference in New Issue
Block a user