Misc fixes:

 - Fix more generic entry code ABI fallout
 - Fix debug register handling bugs
 - Fix vmalloc mappings on 32-bit kernels
 - Fix kprobes instrumentation output on 32-bit kernels
 - Fix over-eager WARN_ON_ONCE() on !SMAP hardware
 - Fix NUMA debugging
 - Fix Clang related crash on !RETPOLINE kernels
 
 Signed-off-by: Ingo Molnar <mingo@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iQJFBAABCgAvFiEEBpT5eoXrXCwVQwEKEnMQ0APhK1gFAl9UljIRHG1pbmdvQGtl
 cm5lbC5vcmcACgkQEnMQ0APhK1g1yA//VecoyJOw4jb43LdkeKDGtUjCsPVZlt4w
 fw55nT4taqqbgl9mQjrJQlh8thtk7LvAqcsrEGk/SH+1fp/hDvBG0i3etyI1mPJ2
 t97MCVtD1bz2zyLpOtGN48tgiRxSazr4S9nZPCLTec+c75I3pmJssj44m/eJi/Z2
 hoj/syiO4J0BPa7a1ou++Jeyag6J+PgXdJTOMyjuqi99vqai1aTVKo8GdWMInext
 +fJNYd0ZQRj1FxVdMusDfzxOk7N7b8nAzvd30iJN67R6QwoEazO12K1F4IYQmHSq
 0rhHrwe0lTLtjmYdp/ef14kfzD7DRFN6Nv2gk/zyZsH+tjGflxTZConkFPnfoJEc
 33cNHfigh0V9TSVNDDhHnkRyy6dzCHkYHEf33KFuX3amC236TgrCEL7+oWE2rcNp
 9PJbPGlXCqNb2feNy2de4cY+KiZ2a1N/T4VcdMK6DEdENFh5T03EZgIChQEd0S99
 LNBYHqTWJdQEKfkzfAXlR4Bd2hX1LWLMM6rNcXxInrH7rWDXUCS0X9m3gLZR9DIs
 7/nXoK4OkaJdgH/D2CToDgwMNT5hlIiTGtVtB3H6Qz8eQQ4+fwTyboQDqpeG4Upy
 LfOH2h5Fo33FCgqnrua8IsgUKLwW2yJGdghJpcd9d0qfVUDEJuXGo6xe6SEHdSu/
 VEiQtFUf50U=
 =EhRy
 -----END PGP SIGNATURE-----

Merge tag 'x86-urgent-2020-09-06' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Ingo Molnar:

 - more generic entry code ABI fallout

 - debug register handling bugfixes

 - fix vmalloc mappings on 32-bit kernels

 - kprobes instrumentation output fix on 32-bit kernels

 - fix over-eager WARN_ON_ONCE() on !SMAP hardware

 - NUMA debugging fix

 - fix Clang related crash on !RETPOLINE kernels

* tag 'x86-urgent-2020-09-06' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/entry: Unbreak 32bit fast syscall
  x86/debug: Allow a single level of #DB recursion
  x86/entry: Fix AC assertion
  tracing/kprobes, x86/ptrace: Fix regs argument order for i386
  x86, fakenuma: Fix invalid starting node ID
  x86/mm/32: Bring back vmalloc faulting on x86_32
  x86/cmdline: Disable jump tables for cmdline.c
Linus Torvalds 2020-09-06 10:28:00 -07:00
commit 015b3155c4
9 changed files with 213 additions and 63 deletions

View File

@@ -60,16 +60,10 @@ __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
 #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
 static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs)
 {
-	unsigned int nr = (unsigned int)regs->orig_ax;
-
 	if (IS_ENABLED(CONFIG_IA32_EMULATION))
 		current_thread_info()->status |= TS_COMPAT;
-	/*
-	 * Subtlety here: if ptrace pokes something larger than 2^32-1 into
-	 * orig_ax, the unsigned int return value truncates it. This may
-	 * or may not be necessary, but it matches the old asm behavior.
-	 */
-	return (unsigned int)syscall_enter_from_user_mode(regs, nr);
+
+	return (unsigned int)regs->orig_ax;
 }
 
 /*
@@ -91,15 +85,29 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
 {
 	unsigned int nr = syscall_32_enter(regs);
 
+	/*
+	 * Subtlety here: if ptrace pokes something larger than 2^32-1 into
+	 * orig_ax, the unsigned int return value truncates it. This may
+	 * or may not be necessary, but it matches the old asm behavior.
+	 */
+	nr = (unsigned int)syscall_enter_from_user_mode(regs, nr);
+
 	do_syscall_32_irqs_on(regs, nr);
 	syscall_exit_to_user_mode(regs);
 }
 
 static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
 {
 	unsigned int nr = syscall_32_enter(regs);
 	int res;
 
+	/*
+	 * This cannot use syscall_enter_from_user_mode() as it has to
+	 * fetch EBP before invoking any of the syscall entry work
+	 * functions.
+	 */
+	syscall_enter_from_user_mode_prepare(regs);
+
 	instrumentation_begin();
 	/* Fetch EBP from where the vDSO stashed it. */
 	if (IS_ENABLED(CONFIG_X86_64)) {
@@ -122,6 +130,9 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
 		return false;
 	}
 
+	/* The case truncates any ptrace induced syscall nr > 2^32 -1 */
+	nr = (unsigned int)syscall_enter_from_user_mode_work(regs, nr);
+
 	/* Now this is just like a normal syscall. */
 	do_syscall_32_irqs_on(regs, nr);
 
 	syscall_exit_to_user_mode(regs);
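
For orientation, this is roughly the fixed fast-syscall flow read as straight-line code. It is a simplified sketch rather than the literal kernel source: the instrumentation markers and the 32-bit vs. 64-bit details of the EBP fetch are elided, and fetch_vdso_ebp() is a hypothetical stand-in for that fetch. The point of the prepare/work split is ordering: EBP has to be read from user memory before any ptrace/seccomp/audit entry work runs, but that read itself needs lockdep/RCU state established and interrupts enabled.

static bool fast_syscall_32_sketch(struct pt_regs *regs)
{
	unsigned int nr = syscall_32_enter(regs);

	/* 1) Establish state and enable interrupts -- no entry work yet. */
	syscall_enter_from_user_mode_prepare(regs);

	/* 2) Touch user memory: recover EBP from where the vDSO stashed it. */
	if (fetch_vdso_ebp(regs)) {		/* hypothetical helper */
		regs->ax = -EFAULT;
		return false;
	}

	/* 3) Only now run the syscall entry work, then the syscall itself. */
	nr = (unsigned int)syscall_enter_from_user_mode_work(regs, nr);
	do_syscall_32_irqs_on(regs, nr);
	syscall_exit_to_user_mode(regs);
	return true;
}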

View File

@@ -18,8 +18,16 @@ static __always_inline void arch_check_user_regs(struct pt_regs *regs)
 		 * state, not the interrupt state as imagined by Xen.
 		 */
 		unsigned long flags = native_save_fl();
-		WARN_ON_ONCE(flags & (X86_EFLAGS_AC | X86_EFLAGS_DF |
-				      X86_EFLAGS_NT));
+		unsigned long mask = X86_EFLAGS_DF | X86_EFLAGS_NT;
+
+		/*
+		 * For !SMAP hardware we patch out CLAC on entry.
+		 */
+		if (boot_cpu_has(X86_FEATURE_SMAP) ||
+		    (IS_ENABLED(CONFIG_64_BIT) && boot_cpu_has(X86_FEATURE_XENPV)))
+			mask |= X86_EFLAGS_AC;
+
+		WARN_ON_ONCE(flags & mask);
 
 		/* We think we came from user mode. Make sure pt_regs agrees. */
 		WARN_ON_ONCE(!user_mode(regs));
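
To see why the unconditional assertion was over-eager: EFLAGS.AC (bit 18) is not a privileged flag, so user space can set it at any time. With SMAP the entry code's CLAC clears it again before this check runs, but on !SMAP hardware that CLAC is patched out and the flag survives into the kernel. A minimal user-space sketch (not part of this patch; for a 32-bit build, since on x86-64 the push/pop clobbers the red zone unless built with -mno-red-zone) that would have tripped the old WARN_ON_ONCE() on !SMAP hardware:

#include <unistd.h>

int main(void)
{
	unsigned long flags;

	/* Read EFLAGS, set AC (bit 18, 0x40000), write it back. */
	__asm__ volatile("pushf; pop %0" : "=r" (flags));
	flags |= 0x40000UL;
	__asm__ volatile("push %0; popf" : : "r" (flags) : "memory", "cc");

	/* Any syscall now enters the kernel with EFLAGS.AC still set. */
	(void)getpid();
	return 0;
}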

View File

@@ -327,8 +327,8 @@ static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs,
 	static const unsigned int argument_offs[] = {
 #ifdef __i386__
 		offsetof(struct pt_regs, ax),
-		offsetof(struct pt_regs, cx),
 		offsetof(struct pt_regs, dx),
+		offsetof(struct pt_regs, cx),
 #define NR_REG_ARGUMENTS 3
 #else
 		offsetof(struct pt_regs, di),
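
The swapped entries matter because i386 kernels are built with -mregparm=3, so the first three integer arguments of a kernel function arrive in %eax, %edx, %ecx, in that order; with the old table, every user of regs_get_kernel_argument() saw arguments 1 and 2 exchanged. A minimal sketch of such a user, a kprobe pre-handler (the probed symbol is illustrative only):

#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/ptrace.h>

static int sketch_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	/* regs_get_kernel_argument(regs, n) returns the n-th (0-based) argument. */
	unsigned long arg0 = regs_get_kernel_argument(regs, 0);	/* i386: %eax */
	unsigned long arg1 = regs_get_kernel_argument(regs, 1);	/* i386: %edx */
	unsigned long arg2 = regs_get_kernel_argument(regs, 2);	/* i386: %ecx */

	pr_info("%s(%lx, %lx, %lx)\n", p->symbol_name, arg0, arg1, arg2);
	return 0;
}

static struct kprobe sketch_kp = {
	.symbol_name	= "_do_fork",	/* illustrative target */
	.pre_handler	= sketch_pre_handler,
};

static int __init sketch_init(void)
{
	return register_kprobe(&sketch_kp);
}

static void __exit sketch_exit(void)
{
	unregister_kprobe(&sketch_kp);
}

module_init(sketch_init);
module_exit(sketch_exit);
MODULE_LICENSE("GPL");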

View File

@@ -729,20 +729,9 @@ static bool is_sysenter_singlestep(struct pt_regs *regs)
 #endif
 }
 
-static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7)
+static __always_inline unsigned long debug_read_clear_dr6(void)
 {
-	/*
-	 * Disable breakpoints during exception handling; recursive exceptions
-	 * are exceedingly 'fun'.
-	 *
-	 * Since this function is NOKPROBE, and that also applies to
-	 * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a
-	 * HW_BREAKPOINT_W on our stack)
-	 *
-	 * Entry text is excluded for HW_BP_X and cpu_entry_area, which
-	 * includes the entry stack is excluded for everything.
-	 */
-	*dr7 = local_db_save();
+	unsigned long dr6;
 
 	/*
 	 * The Intel SDM says:
@@ -755,15 +744,12 @@ static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7)
 	 *
 	 * Keep it simple: clear DR6 immediately.
 	 */
-	get_debugreg(*dr6, 6);
+	get_debugreg(dr6, 6);
 	set_debugreg(0, 6);
 
 	/* Filter out all the reserved bits which are preset to 1 */
-	*dr6 &= ~DR6_RESERVED;
-}
+	dr6 &= ~DR6_RESERVED;
 
-static __always_inline void debug_exit(unsigned long dr7)
-{
-	local_db_restore(dr7);
+	return dr6;
 }
 
 /*
@@ -863,6 +849,18 @@ static void handle_debug(struct pt_regs *regs, unsigned long dr6, bool user)
 static __always_inline void exc_debug_kernel(struct pt_regs *regs,
 					     unsigned long dr6)
 {
+	/*
+	 * Disable breakpoints during exception handling; recursive exceptions
+	 * are exceedingly 'fun'.
+	 *
+	 * Since this function is NOKPROBE, and that also applies to
+	 * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a
+	 * HW_BREAKPOINT_W on our stack)
+	 *
+	 * Entry text is excluded for HW_BP_X and cpu_entry_area, which
+	 * includes the entry stack is excluded for everything.
+	 */
+	unsigned long dr7 = local_db_save();
 	bool irq_state = idtentry_enter_nmi(regs);
 	instrumentation_begin();
@@ -883,6 +881,8 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
 	instrumentation_end();
 	idtentry_exit_nmi(regs, irq_state);
+
+	local_db_restore(dr7);
 }
 
 static __always_inline void exc_debug_user(struct pt_regs *regs,
@@ -894,6 +894,15 @@ static __always_inline void exc_debug_user(struct pt_regs *regs,
 	 */
 	WARN_ON_ONCE(!user_mode(regs));
 
+	/*
+	 * NB: We can't easily clear DR7 here because
+	 * idtentry_exit_to_usermode() can invoke ptrace, schedule, access
+	 * user memory, etc. This means that a recursive #DB is possible. If
+	 * this happens, that #DB will hit exc_debug_kernel() and clear DR7.
+	 * Since we're not on the IST stack right now, everything will be
+	 * fine.
+	 */
+
 	irqentry_enter_from_user_mode(regs);
 	instrumentation_begin();
@@ -907,36 +916,24 @@ static __always_inline void exc_debug_user(struct pt_regs *regs,
 /* IST stack entry */
 DEFINE_IDTENTRY_DEBUG(exc_debug)
 {
-	unsigned long dr6, dr7;
-
-	debug_enter(&dr6, &dr7);
-	exc_debug_kernel(regs, dr6);
-	debug_exit(dr7);
+	exc_debug_kernel(regs, debug_read_clear_dr6());
 }
 
 /* User entry, runs on regular task stack */
 DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
 {
-	unsigned long dr6, dr7;
-
-	debug_enter(&dr6, &dr7);
-	exc_debug_user(regs, dr6);
-	debug_exit(dr7);
+	exc_debug_user(regs, debug_read_clear_dr6());
 }
 #else
 /* 32 bit does not have separate entry points. */
 DEFINE_IDTENTRY_RAW(exc_debug)
 {
-	unsigned long dr6, dr7;
-
-	debug_enter(&dr6, &dr7);
+	unsigned long dr6 = debug_read_clear_dr6();
 
 	if (user_mode(regs))
 		exc_debug_user(regs, dr6);
 	else
 		exc_debug_kernel(regs, dr6);
-
-	debug_exit(dr7);
 }
 #endif
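
Read together, these hunks change where breakpoints get disabled: only the kernel #DB path saves and clears DR7, while the user path leaves it alone, which is exactly the single level of recursion the commit title allows. A compact sketch of the resulting flow (simplified, not literal kernel code; handle_debug_sketch() is a hypothetical stand-in for the real handling):

/* #DB that hit kernel mode (64-bit: IST stack). */
static void db_from_kernel_sketch(struct pt_regs *regs)
{
	unsigned long dr6 = debug_read_clear_dr6();
	unsigned long dr7 = local_db_save();	/* breakpoints off: no further recursion */

	handle_debug_sketch(regs, dr6, /* user */ false);

	local_db_restore(dr7);
}

/* #DB that hit user mode (regular task stack). */
static void db_from_user_sketch(struct pt_regs *regs)
{
	unsigned long dr6 = debug_read_clear_dr6();

	/*
	 * DR7 is left alone here on purpose: the handling/exit work may
	 * call ptrace, schedule or touch user memory, so one recursive
	 * #DB is possible.  That recursion arrives from kernel mode,
	 * lands in db_from_kernel_sketch() above and disables
	 * breakpoints there, so it cannot recurse any deeper.
	 */
	handle_debug_sketch(regs, dr6, /* user */ true);
}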

View File

@@ -24,7 +24,7 @@ ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_cmdline.o = -pg
 endif
 
-CFLAGS_cmdline.o := -fno-stack-protector
+CFLAGS_cmdline.o := -fno-stack-protector -fno-jump-tables
 endif
 
 inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk

View File

@@ -190,6 +190,53 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
 	return pmd_k;
 }
 
+/*
+ * Handle a fault on the vmalloc or module mapping area
+ *
+ * This is needed because there is a race condition between the time
+ * when the vmalloc mapping code updates the PMD to the point in time
+ * where it synchronizes this update with the other page-tables in the
+ * system.
+ *
+ * In this race window another thread/CPU can map an area on the same
+ * PMD, finds it already present and does not synchronize it with the
+ * rest of the system yet. As a result v[mz]alloc might return areas
+ * which are not mapped in every page-table in the system, causing an
+ * unhandled page-fault when they are accessed.
+ */
+static noinline int vmalloc_fault(unsigned long address)
+{
+	unsigned long pgd_paddr;
+	pmd_t *pmd_k;
+	pte_t *pte_k;
+
+	/* Make sure we are in vmalloc area: */
+	if (!(address >= VMALLOC_START && address < VMALLOC_END))
+		return -1;
+
+	/*
+	 * Synchronize this task's top level page-table
+	 * with the 'reference' page table.
+	 *
+	 * Do _not_ use "current" here. We might be inside
+	 * an interrupt in the middle of a task switch..
+	 */
+	pgd_paddr = read_cr3_pa();
+	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
+	if (!pmd_k)
+		return -1;
+
+	if (pmd_large(*pmd_k))
+		return 0;
+
+	pte_k = pte_offset_kernel(pmd_k, address);
+	if (!pte_present(*pte_k))
+		return -1;
+
+	return 0;
+}
+NOKPROBE_SYMBOL(vmalloc_fault);
+
 void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
 {
 	unsigned long addr;
@@ -1110,6 +1157,37 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
 	 */
 	WARN_ON_ONCE(hw_error_code & X86_PF_PK);
 
+#ifdef CONFIG_X86_32
+	/*
+	 * We can fault-in kernel-space virtual memory on-demand. The
+	 * 'reference' page table is init_mm.pgd.
+	 *
+	 * NOTE! We MUST NOT take any locks for this case. We may
+	 * be in an interrupt or a critical region, and should
+	 * only copy the information from the master page table,
+	 * nothing more.
+	 *
+	 * Before doing this on-demand faulting, ensure that the
+	 * fault is not any of the following:
+	 * 1. A fault on a PTE with a reserved bit set.
+	 * 2. A fault caused by a user-mode access. (Do not demand-
+	 *    fault kernel memory due to user-mode accesses).
+	 * 3. A fault caused by a page-level protection violation.
+	 *    (A demand fault would be on a non-present page which
+	 *     would have X86_PF_PROT==0).
+	 *
+	 * This is only needed to close a race condition on x86-32 in
+	 * the vmalloc mapping/unmapping code. See the comment above
+	 * vmalloc_fault() for details. On x86-64 the race does not
+	 * exist as the vmalloc mappings don't need to be synchronized
+	 * there.
+	 */
+	if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
+		if (vmalloc_fault(address) >= 0)
+			return;
+	}
+#endif
+
 	/* Was the fault spurious, caused by lazy TLB invalidation? */
 	if (spurious_kernel_fault(hw_error_code, address))
 		return;

View File

@@ -321,7 +321,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
 					      u64 addr, u64 max_addr, u64 size)
 {
 	return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
-						   0, NULL, NUMA_NO_NODE);
+						   0, NULL, 0);
 }
 
 static int __init setup_emu2phys_nid(int *dfl_phys_nid)

View File

@@ -110,15 +110,30 @@ static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs
 #endif
 
 /**
- * syscall_enter_from_user_mode - Check and handle work before invoking
- *				  a syscall
+ * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts
+ * @regs:	Pointer to currents pt_regs
+ *
+ * Invoked from architecture specific syscall entry code with interrupts
+ * disabled. The calling code has to be non-instrumentable. When the
+ * function returns all state is correct, interrupts are enabled and the
+ * subsequent functions can be instrumented.
+ *
+ * This handles lockdep, RCU (context tracking) and tracing state.
+ *
+ * This is invoked when there is extra architecture specific functionality
+ * to be done between establishing state and handling user mode entry work.
+ */
+void syscall_enter_from_user_mode_prepare(struct pt_regs *regs);
+
+/**
+ * syscall_enter_from_user_mode_work - Check and handle work before invoking
+ *				       a syscall
  * @regs:	Pointer to currents pt_regs
  * @syscall:	The syscall number
  *
  * Invoked from architecture specific syscall entry code with interrupts
- * disabled. The calling code has to be non-instrumentable. When the
- * function returns all state is correct and the subsequent functions can be
- * instrumented.
+ * enabled after invoking syscall_enter_from_user_mode_prepare() and extra
+ * architecture specific work.
  *
  * Returns: The original or a modified syscall number
  *
@@ -127,12 +142,30 @@ static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs
  * syscall_set_return_value() first. If neither of those are called and -1
  * is returned, then the syscall will fail with ENOSYS.
  *
- * The following functionality is handled here:
+ * It handles the following work items:
  *
- *  1) Establish state (lockdep, RCU (context tracking), tracing)
- *  2) TIF flag dependent invocations of arch_syscall_enter_tracehook(),
+ *  1) TIF flag dependent invocations of arch_syscall_enter_tracehook(),
  *     __secure_computing(), trace_sys_enter()
- *  3) Invocation of audit_syscall_entry()
+ *  2) Invocation of audit_syscall_entry()
+ */
+long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall);
+
+/**
+ * syscall_enter_from_user_mode - Establish state and check and handle work
+ *				  before invoking a syscall
+ * @regs:	Pointer to currents pt_regs
+ * @syscall:	The syscall number
+ *
+ * Invoked from architecture specific syscall entry code with interrupts
+ * disabled. The calling code has to be non-instrumentable. When the
+ * function returns all state is correct, interrupts are enabled and the
+ * subsequent functions can be instrumented.
+ *
+ * This is combination of syscall_enter_from_user_mode_prepare() and
+ * syscall_enter_from_user_mode_work().
+ *
+ * Returns: The original or a modified syscall number. See
+ * syscall_enter_from_user_mode_work() for further explanation.
  */
 long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall);

View File

@@ -69,22 +69,45 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall,
 	return ret ? : syscall_get_nr(current, regs);
 }
 
-noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
+static __always_inline long
+__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
 {
 	unsigned long ti_work;
 
-	enter_from_user_mode(regs);
-	instrumentation_begin();
-
-	local_irq_enable();
 	ti_work = READ_ONCE(current_thread_info()->flags);
 	if (ti_work & SYSCALL_ENTER_WORK)
 		syscall = syscall_trace_enter(regs, syscall, ti_work);
-	instrumentation_end();
 
 	return syscall;
 }
 
+long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
+{
+	return __syscall_enter_from_user_work(regs, syscall);
+}
+
+noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
+{
+	long ret;
+
+	enter_from_user_mode(regs);
+
+	instrumentation_begin();
+	local_irq_enable();
+	ret = __syscall_enter_from_user_work(regs, syscall);
+	instrumentation_end();
+
+	return ret;
+}
+
+noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
+{
+	enter_from_user_mode(regs);
+	instrumentation_begin();
+	local_irq_enable();
+	instrumentation_end();
+}
+
 /**
  * exit_to_user_mode - Fixup state when exiting to user mode
  *