linux_dsm_epyc7002/arch/alpha/mm/fault.c
Sergei Trofimovich 491af60ffb alpha: fix page fault handling for r16-r18 targets
Fix page fault handling code to fixup r16-r18 registers.
Before the patch code had off-by-two registers bug.
This bug caused overwriting of ps,pc,gp registers instead
of fixing intended r16,r17,r18 (see `struct pt_regs`).

More details:

Initially Dmitry noticed a kernel bug as a failure
on strace test suite. Test passes unmapped userspace
pointer to io_submit:

```c
    #include <err.h>
    #include <unistd.h>
    #include <sys/mman.h>
    #include <asm/unistd.h>
    int main(void)
    {
        unsigned long ctx = 0;
        if (syscall(__NR_io_setup, 1, &ctx))
            err(1, "io_setup");
        const size_t page_size = sysconf(_SC_PAGESIZE);
        const size_t size = page_size * 2;
        void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (MAP_FAILED == ptr)
            err(1, "mmap(%zu)", size);
        if (munmap(ptr, size))
            err(1, "munmap");
        syscall(__NR_io_submit, ctx, 1, ptr + page_size);
        syscall(__NR_io_destroy, ctx);
        return 0;
    }
```

Running this test causes kernel to crash when handling page fault:

```
    Unable to handle kernel paging request at virtual address ffffffffffff9468
    CPU 3
    aio(26027): Oops 0
    pc = [<fffffc00004eddf8>]  ra = [<fffffc00004edd5c>]  ps = 0000    Not tainted
    pc is at sys_io_submit+0x108/0x200
    ra is at sys_io_submit+0x6c/0x200
    v0 = fffffc00c58e6300  t0 = fffffffffffffff2  t1 = 000002000025e000
    t2 = fffffc01f159fef8  t3 = fffffc0001009640  t4 = fffffc0000e0f6e0
    t5 = 0000020001002e9e  t6 = 4c41564e49452031  t7 = fffffc01f159c000
    s0 = 0000000000000002  s1 = 000002000025e000  s2 = 0000000000000000
    s3 = 0000000000000000  s4 = 0000000000000000  s5 = fffffffffffffff2
    s6 = fffffc00c58e6300
    a0 = fffffc00c58e6300  a1 = 0000000000000000  a2 = 000002000025e000
    a3 = 00000200001ac260  a4 = 00000200001ac1e8  a5 = 0000000000000001
    t8 = 0000000000000008  t9 = 000000011f8bce30  t10= 00000200001ac440
    t11= 0000000000000000  pv = fffffc00006fd320  at = 0000000000000000
    gp = 0000000000000000  sp = 00000000265fd174
    Disabling lock debugging due to kernel taint
    Trace:
    [<fffffc0000311404>] entSys+0xa4/0xc0
```

Here `gp` has invalid value. `gp is s overwritten by a fixup for the
following page fault handler in `io_submit` syscall handler:

```
    __se_sys_io_submit
    ...
        ldq     a1,0(t1)
        bne     t0,4280 <__se_sys_io_submit+0x180>
```

After a page fault `t0` should contain -EFALUT and `a1` is 0.
Instead `gp` was overwritten in place of `a1`.

This happens due to a off-by-two bug in `dpf_reg()` for `r16-r18`
(aka `a0-a2`).

I think the bug went unnoticed for a long time as `gp` is one
of scratch registers. Any kernel function call would re-calculate `gp`.

Dmitry tracked down the bug origin back to 2.1.32 kernel version
where trap_a{0,1,2} fields were inserted into struct pt_regs.
And even before that `dpf_reg()` contained off-by-one error.

Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: linux-alpha@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Reported-and-reviewed-by: "Dmitry V. Levin" <ldv@altlinux.org>
Cc: stable@vger.kernel.org # v2.1.32+
Bug: https://bugs.gentoo.org/672040
Signed-off-by: Sergei Trofimovich <slyfox@gentoo.org>
Signed-off-by: Matt Turner <mattst88@gmail.com>
2019-02-10 20:42:23 -08:00

253 lines
6.1 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* linux/arch/alpha/mm/fault.c
*
* Copyright (C) 1995 Linus Torvalds
*/
#include <linux/sched/signal.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <asm/io.h>
#define __EXTERN_INLINE inline
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
#undef __EXTERN_INLINE
#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/extable.h>
#include <linux/uaccess.h>
extern void die_if_kernel(char *,struct pt_regs *,long, unsigned long *);
/*
* Force a new ASN for a task.
*/
#ifndef CONFIG_SMP
unsigned long last_asn = ASN_FIRST_VERSION;
#endif
void
__load_new_mm_context(struct mm_struct *next_mm)
{
unsigned long mmc;
struct pcb_struct *pcb;
mmc = __get_new_mm_context(next_mm, smp_processor_id());
next_mm->context[smp_processor_id()] = mmc;
pcb = &current_thread_info()->pcb;
pcb->asn = mmc & HARDWARE_ASN_MASK;
pcb->ptbr = ((unsigned long) next_mm->pgd - IDENT_ADDR) >> PAGE_SHIFT;
__reload_thread(pcb);
}
/*
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to handle_mm_fault().
*
* mmcsr:
* 0 = translation not valid
* 1 = access violation
* 2 = fault-on-read
* 3 = fault-on-execute
* 4 = fault-on-write
*
* cause:
* -1 = instruction fetch
* 0 = load
* 1 = store
*
* Registers $9 through $15 are saved in a block just prior to `regs' and
* are saved and restored around the call to allow exception code to
* modify them.
*/
/* Macro for exception fixup code to access integer registers. */
#define dpf_reg(r) \
(((unsigned long *)regs)[(r) <= 8 ? (r) : (r) <= 15 ? (r)-16 : \
(r) <= 18 ? (r)+10 : (r)-10])
asmlinkage void
do_page_fault(unsigned long address, unsigned long mmcsr,
long cause, struct pt_regs *regs)
{
struct vm_area_struct * vma;
struct mm_struct *mm = current->mm;
const struct exception_table_entry *fixup;
int si_code = SEGV_MAPERR;
vm_fault_t fault;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
/* As of EV6, a load into $31/$f31 is a prefetch, and never faults
(or is suppressed by the PALcode). Support that for older CPUs
by ignoring such an instruction. */
if (cause == 0) {
unsigned int insn;
__get_user(insn, (unsigned int __user *)regs->pc);
if ((insn >> 21 & 0x1f) == 0x1f &&
/* ldq ldl ldt lds ldg ldf ldwu ldbu */
(1ul << (insn >> 26) & 0x30f00001400ul)) {
regs->pc += 4;
return;
}
}
/* If we're in an interrupt context, or have no user context,
we must not take the fault. */
if (!mm || faulthandler_disabled())
goto no_context;
#ifdef CONFIG_ALPHA_LARGE_VMALLOC
if (address >= TASK_SIZE)
goto vmalloc_fault;
#endif
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
retry:
down_read(&mm->mmap_sem);
vma = find_vma(mm, address);
if (!vma)
goto bad_area;
if (vma->vm_start <= address)
goto good_area;
if (!(vma->vm_flags & VM_GROWSDOWN))
goto bad_area;
if (expand_stack(vma, address))
goto bad_area;
/* Ok, we have a good vm_area for this memory access, so
we can handle it. */
good_area:
si_code = SEGV_ACCERR;
if (cause < 0) {
if (!(vma->vm_flags & VM_EXEC))
goto bad_area;
} else if (!cause) {
/* Allow reads even for write-only mappings */
if (!(vma->vm_flags & (VM_READ | VM_WRITE)))
goto bad_area;
} else {
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
flags |= FAULT_FLAG_WRITE;
}
/* If for any reason at all we couldn't handle the fault,
make sure we exit gracefully rather than endlessly redo
the fault. */
fault = handle_mm_fault(vma, address, flags);
if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
return;
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
else if (fault & VM_FAULT_SIGSEGV)
goto bad_area;
else if (fault & VM_FAULT_SIGBUS)
goto do_sigbus;
BUG();
}
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_MAJOR)
current->maj_flt++;
else
current->min_flt++;
if (fault & VM_FAULT_RETRY) {
flags &= ~FAULT_FLAG_ALLOW_RETRY;
/* No need to up_read(&mm->mmap_sem) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
goto retry;
}
}
up_read(&mm->mmap_sem);
return;
/* Something tried to access memory that isn't in our memory map.
Fix it, but check if it's kernel or user first. */
bad_area:
up_read(&mm->mmap_sem);
if (user_mode(regs))
goto do_sigsegv;
no_context:
/* Are we prepared to handle this fault as an exception? */
if ((fixup = search_exception_tables(regs->pc)) != 0) {
unsigned long newpc;
newpc = fixup_exception(dpf_reg, fixup, regs->pc);
regs->pc = newpc;
return;
}
/* Oops. The kernel tried to access some bad page. We'll have to
terminate things with extreme prejudice. */
printk(KERN_ALERT "Unable to handle kernel paging request at "
"virtual address %016lx\n", address);
die_if_kernel("Oops", regs, cause, (unsigned long*)regs - 16);
do_exit(SIGKILL);
/* We ran out of memory, or some other thing happened to us that
made us unable to handle the page fault gracefully. */
out_of_memory:
up_read(&mm->mmap_sem);
if (!user_mode(regs))
goto no_context;
pagefault_out_of_memory();
return;
do_sigbus:
up_read(&mm->mmap_sem);
/* Send a sigbus, regardless of whether we were in kernel
or user mode. */
force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *) address, 0, current);
if (!user_mode(regs))
goto no_context;
return;
do_sigsegv:
force_sig_fault(SIGSEGV, si_code, (void __user *) address, 0, current);
return;
#ifdef CONFIG_ALPHA_LARGE_VMALLOC
vmalloc_fault:
if (user_mode(regs))
goto do_sigsegv;
else {
/* Synchronize this task's top level page-table
with the "reference" page table from init. */
long index = pgd_index(address);
pgd_t *pgd, *pgd_k;
pgd = current->active_mm->pgd + index;
pgd_k = swapper_pg_dir + index;
if (!pgd_present(*pgd) && pgd_present(*pgd_k)) {
pgd_val(*pgd) = pgd_val(*pgd_k);
return;
}
goto no_context;
}
#endif
}