linux_dsm_epyc7002/arch/x86/mm/extable.c
Eric Biggers d5c8028b47 x86/fpu: Reinitialize FPU registers if restoring FPU state fails
Userspace can change the FPU state of a task using the ptrace() or
rt_sigreturn() system calls.  Because reserved bits in the FPU state can
cause the XRSTOR instruction to fail, the kernel has to carefully
validate that no reserved bits or other invalid values are being set.
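
For illustration, the required check looks roughly like this (a
minimal sketch modeled on the xstate header validation; the field and
mask names follow arch/x86 conventions, but the exact kernel code
differs):

	static int validate_xstate_header(const struct xstate_header *hdr)
	{
		/* No unknown features may be set */
		if (hdr->xfeatures & ~xfeatures_mask)
			return -EINVAL;

		/* Userspace must use the uncompacted format */
		if (hdr->xcomp_bv)
			return -EINVAL;

		/* The reserved area must be zeroed */
		if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
			return -EINVAL;

		return 0;
	}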

Unfortunately, there have been bugs in this validation code.  For
example, we were not checking that the 'xcomp_bv' field in the
xstate_header was 0.  As-is, such bugs are exploitable to read the FPU
registers of other processes on the system.  To do so, an attacker can
create a task, assign to it an invalid FPU state, then spin in a loop
and monitor the values of the FPU registers.  Because the task's FPU
registers are not being restored, sometimes the FPU registers will have
the values from another process.
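
To illustrate, a monitor loop of roughly this shape would suffice (a
hypothetical userspace sketch; report_leak() is a placeholder for
whatever the attacker does with the observed value):

	unsigned long long prev = 0, cur;

	for (;;) {
		/* Read %xmm0; since our own restore was skipped, it may
		 * still hold another task's value. */
		asm volatile("movq %%xmm0, %0" : "=r" (cur));
		if (cur != prev) {
			report_leak(cur);
			prev = cur;
		}
	}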

This is likely to continue to be a problem in the future because the
validation done by CPU instructions like XRSTOR is not immediately
visible to kernel developers.  Nor will invalid FPU states ever be
encountered during ordinary use --- they will only be seen during
fuzzing or exploits.  There can even be reserved bits outside the
xstate_header which are easy to forget about.  For example, the MXCSR
register contains reserved bits, which were not validated by the
KVM_SET_XSAVE ioctl until commit a575813bfe ("KVM: x86: Fix load
damaged SSEx MXCSR register").
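
That fix amounts to a mask check of roughly this form (a sketch;
mxcsr_feature_mask is the kernel's name for the set of MXCSR bits the
CPU actually supports, derived via FXSAVE at boot):

	if (mxcsr & ~mxcsr_feature_mask)
		return -EINVAL;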

Therefore, mitigate this class of vulnerability by restoring the FPU
registers from init_fpstate if restoring from the task's state fails.

We actually used to do this, but it was (perhaps unwisely) removed by
commit 9ccc27a5d2 ("x86/fpu: Remove error return values from
copy_kernel_to_*regs() functions").  This new patch is also a bit
different.  First, it only clears the registers, not also the bad
in-memory state; this is simpler and makes it easier to make the
mitigation cover all callers of __copy_kernel_to_fpregs().  Second, it
does the register clearing in an exception handler so that no extra
instructions are added to context switches.  In fact, we *remove*
instructions, since previously we were always zeroing the register
containing 'err' even if CONFIG_X86_DEBUG_FPU was disabled.
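
Concretely, the restore instruction gets tagged with the new handler
through the exception table, along these lines (a sketch of the
mechanism, not the exact patch; see _ASM_EXTABLE_HANDLE in
arch/x86/include/asm/asm.h):

	asm volatile("1: " XRSTOR "\n"
		     "2:\n"
		     _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore)
		     : : "D" (xstate), "a" (lmask), "d" (hmask)
		     : "memory");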

Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Kevin Hao <haokexin@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Halcrow <mhalcrow@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: Yu-cheng Yu <yu-cheng.yu@intel.com>
Cc: kernel-hardening@lists.openwall.com
Link: http://lkml.kernel.org/r/20170922174156.16780-4-ebiggers3@gmail.com
Link: http://lkml.kernel.org/r/20170923130016.21448-27-mingo@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-09-25 09:26:38 +02:00

#include <linux/extable.h>
#include <linux/uaccess.h>
#include <linux/sched/debug.h>

#include <asm/fpu/internal.h>
#include <asm/traps.h>
#include <asm/kdebug.h>
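
/*
 * An exception table entry stores, relative to the entry itself, 32-bit
 * offsets to the faulting instruction, its fixup code, and the handler
 * to run (see struct exception_table_entry):
 *
 *	struct exception_table_entry {
 *		int insn, fixup, handler;
 *	};
 *
 * The helpers below convert those self-relative offsets back into
 * absolute addresses.
 */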

typedef bool (*ex_handler_t)(const struct exception_table_entry *,
			     struct pt_regs *, int);

static inline unsigned long
ex_fixup_addr(const struct exception_table_entry *x)
{
	return (unsigned long)&x->fixup + x->fixup;
}

static inline ex_handler_t
ex_fixup_handler(const struct exception_table_entry *x)
{
	return (ex_handler_t)((unsigned long)&x->handler + x->handler);
}

bool ex_handler_default(const struct exception_table_entry *fixup,
			struct pt_regs *regs, int trapnr)
{
	regs->ip = ex_fixup_addr(fixup);
	return true;
}
EXPORT_SYMBOL(ex_handler_default);

bool ex_handler_fault(const struct exception_table_entry *fixup,
		      struct pt_regs *regs, int trapnr)
{
	regs->ip = ex_fixup_addr(fixup);
	regs->ax = trapnr;
	return true;
}
EXPORT_SYMBOL_GPL(ex_handler_fault);
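
/*
 * Usage sketch (hypothetical operands): code tagged with
 * _ASM_EXTABLE_FAULT() resumes at its fixup label with the trap number
 * in %eax, letting the caller tell e.g. #PF apart from #MC:
 *
 *	asm volatile("1: rep movsb\n"
 *		     "2:\n"
 *		     _ASM_EXTABLE_FAULT(1b, 2b)
 *		     : "=a" (trapnr), "+D" (dst), "+S" (src), "+c" (len)
 *		     : "0" (0));
 */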

/*
 * Handler for UD0 exception following a failed test against the
 * result of a refcount inc/dec/add/sub.
 */
bool ex_handler_refcount(const struct exception_table_entry *fixup,
			 struct pt_regs *regs, int trapnr)
{
	/* First unconditionally saturate the refcount. */
	*(int *)regs->cx = INT_MIN / 2;

	/*
	 * Strictly speaking, this reports the fixup destination, not
	 * the fault location, and not the actually overflowing
	 * instruction, which is the instruction before the "js", but
	 * since that instruction could be a variety of lengths, just
	 * report the location after the overflow, which should be close
	 * enough for finding the overflow, as it's at least back in
	 * the function, having returned from .text.unlikely.
	 */
	regs->ip = ex_fixup_addr(fixup);

	/*
	 * This function has been called because either a negative refcount
	 * value was seen by any of the refcount functions, or a zero
	 * refcount value was seen by refcount_dec().
	 *
	 * If we crossed from INT_MAX to INT_MIN, OF (Overflow Flag: result
	 * wrapped around) will be set. Additionally, seeing the refcount
	 * reach 0 will set ZF (Zero Flag: result was zero). In each of
	 * these cases we want a report, since it's a boundary condition.
	 */
	if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) {
		bool zero = regs->flags & X86_EFLAGS_ZF;

		refcount_error_report(regs, zero ? "hit zero" : "overflow");
	}

	return true;
}
EXPORT_SYMBOL_GPL(ex_handler_refcount);

/*
 * Handler for when we fail to restore a task's FPU state.  We should never get
 * here because the FPU state of a task using the FPU (task->thread.fpu.state)
 * should always be valid.  However, past bugs have allowed userspace to set
 * reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn().
 * These caused XRSTOR to fail when switching to the task, leaking the FPU
 * registers of the task previously executing on the CPU.  Mitigate this class
 * of vulnerability by restoring from the initial state (essentially, zeroing
 * out all the FPU registers) if we can't restore from the task's FPU state.
 */
bool ex_handler_fprestore(const struct exception_table_entry *fixup,
			  struct pt_regs *regs, int trapnr)
{
	regs->ip = ex_fixup_addr(fixup);

	WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.",
		  (void *)instruction_pointer(regs));

	__copy_kernel_to_fpregs(&init_fpstate, -1);
	return true;
}
EXPORT_SYMBOL_GPL(ex_handler_fprestore);

bool ex_handler_ext(const struct exception_table_entry *fixup,
		    struct pt_regs *regs, int trapnr)
{
	/* Special hack for uaccess_err */
	current->thread.uaccess_err = 1;
	regs->ip = ex_fixup_addr(fixup);
	return true;
}
EXPORT_SYMBOL(ex_handler_ext);
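
/*
 * ex_handler_ext() backs the uaccess_try/uaccess_catch pattern (sketch):
 *
 *	get_user_try {
 *		get_user_ex(val, uptr);
 *	} get_user_catch(err);
 *
 * A fault inside the try region lands here, is recorded in
 * thread.uaccess_err, and execution resumes; the catch side later
 * converts the flag into -EFAULT.
 */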

bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
			     struct pt_regs *regs, int trapnr)
{
	if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pF)\n",
			 (unsigned int)regs->cx, regs->ip, (void *)regs->ip))
		show_stack_regs(regs);

	/* Pretend that the read succeeded and returned 0. */
	regs->ip = ex_fixup_addr(fixup);
	regs->ax = 0;
	regs->dx = 0;
	return true;
}
EXPORT_SYMBOL(ex_handler_rdmsr_unsafe);

bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup,
			     struct pt_regs *regs, int trapnr)
{
	if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pF)\n",
			 (unsigned int)regs->cx, (unsigned int)regs->dx,
			 (unsigned int)regs->ax, regs->ip, (void *)regs->ip))
		show_stack_regs(regs);

	/* Pretend that the write succeeded. */
	regs->ip = ex_fixup_addr(fixup);
	return true;
}
EXPORT_SYMBOL(ex_handler_wrmsr_unsafe);
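
/*
 * The two handlers above back the plain rdmsr()/wrmsr() accessors, as
 * opposed to the *_safe() variants that report the error to the caller:
 * a stray access to a non-existent MSR warns once with a backtrace and
 * then continues as if the access had succeeded.
 */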

bool ex_handler_clear_fs(const struct exception_table_entry *fixup,
			 struct pt_regs *regs, int trapnr)
{
	if (static_cpu_has(X86_BUG_NULL_SEG))
		asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS));
	asm volatile ("mov %0, %%fs" : : "rm" (0));
	return ex_handler_default(fixup, regs, trapnr);
}
EXPORT_SYMBOL(ex_handler_clear_fs);

bool ex_has_fault_handler(unsigned long ip)
{
	const struct exception_table_entry *e;
	ex_handler_t handler;

	e = search_exception_tables(ip);
	if (!e)
		return false;
	handler = ex_fixup_handler(e);

	return handler == ex_handler_fault;
}

int fixup_exception(struct pt_regs *regs, int trapnr)
{
	const struct exception_table_entry *e;
	ex_handler_t handler;

#ifdef CONFIG_PNPBIOS
	if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
		extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
		extern u32 pnp_bios_is_utter_crap;

		pnp_bios_is_utter_crap = 1;
		printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
		__asm__ volatile(
			"movl %0, %%esp\n\t"
			"jmp *%1\n\t"
			: : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
		panic("do_trap: can't hit this");
	}
#endif

	e = search_exception_tables(regs->ip);
	if (!e)
		return 0;

	handler = ex_fixup_handler(e);
	return handler(e, regs, trapnr);
}
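
/*
 * Typical call site (sketch): a trap handler does
 *
 *	if (fixup_exception(regs, trapnr))
 *		return;
 *
 * and only treats the fault as fatal when no exception table entry
 * claims regs->ip.
 */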

extern unsigned int early_recursion_flag;

/* Restricted version used during very early boot */
void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
{
	/* Ignore early NMIs. */
	if (trapnr == X86_TRAP_NMI)
		return;

	if (early_recursion_flag > 2)
		goto halt_loop;

	/*
	 * Old CPUs leave the high bits of CS on the stack
	 * undefined.  I'm not sure which CPUs do this, but at least
	 * the 486 DX works this way.
	 */
	if (regs->cs != __KERNEL_CS)
		goto fail;

	/*
	 * The full exception fixup machinery is available as soon as
	 * the early IDT is loaded.  This means that it is the
	 * responsibility of extable users to either function correctly
	 * when handlers are invoked early or to simply avoid causing
	 * exceptions before they're ready to handle them.
	 *
	 * This is better than filtering which handlers can be used,
	 * because refusing to call a handler here is guaranteed to
	 * result in a hard-to-debug panic.
	 *
	 * Keep in mind that not all vectors actually get here.  Early
	 * page faults, for example, are special.
	 */
	if (fixup_exception(regs, trapnr))
		return;

	if (fixup_bug(regs, trapnr))
		return;

fail:
	early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n",
		     (unsigned)trapnr, (unsigned long)regs->cs, regs->ip,
		     regs->orig_ax, read_cr2());

	show_regs(regs);

halt_loop:
	while (true)
		halt();
}