2012-03-05 18:49:27 +07:00
|
|
|
/*
|
|
|
|
* Low-level exception handling code
|
|
|
|
*
|
|
|
|
* Copyright (C) 2012 ARM Ltd.
|
|
|
|
* Authors: Catalin Marinas <catalin.marinas@arm.com>
|
|
|
|
* Will Deacon <will.deacon@arm.com>
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License version 2 as
|
|
|
|
* published by the Free Software Foundation.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/linkage.h>
|
|
|
|
|
2015-06-01 16:47:41 +07:00
|
|
|
#include <asm/alternative.h>
|
2012-03-05 18:49:27 +07:00
|
|
|
#include <asm/assembler.h>
|
|
|
|
#include <asm/asm-offsets.h>
|
2015-03-24 02:07:02 +07:00
|
|
|
#include <asm/cpufeature.h>
|
2012-03-05 18:49:27 +07:00
|
|
|
#include <asm/errno.h>
|
2013-04-08 23:17:03 +07:00
|
|
|
#include <asm/esr.h>
|
2015-12-04 18:02:27 +07:00
|
|
|
#include <asm/irq.h>
|
2017-11-14 21:07:40 +07:00
|
|
|
#include <asm/memory.h>
|
|
|
|
#include <asm/mmu.h>
|
2017-08-31 15:30:50 +07:00
|
|
|
#include <asm/processor.h>
|
2016-09-02 20:54:03 +07:00
|
|
|
#include <asm/ptrace.h>
|
2012-03-05 18:49:27 +07:00
|
|
|
#include <asm/thread_info.h>
|
2016-12-26 16:10:19 +07:00
|
|
|
#include <asm/asm-uaccess.h>
|
2012-03-05 18:49:27 +07:00
|
|
|
#include <asm/unistd.h>
|
|
|
|
|
2014-05-31 02:34:15 +07:00
|
|
|
/*
|
|
|
|
* Context tracking subsystem. Used to instrument transitions
|
|
|
|
* between user and kernel mode.
|
|
|
|
*/
|
|
|
|
/*
 * ct_user_exit syscall = 0
 *
 * Expands to nothing unless CONFIG_CONTEXT_TRACKING is defined. Otherwise it
 * calls context_tracking_user_exit; the bl overwrites x30 (the link register).
 * When invoked with syscall == 1 it then reloads x0-x7 from the frame at sp
 * ([sp], S_X2, S_X4 and S_X6), with any other value nothing is reloaded.
 */
.macro ct_user_exit, syscall = 0
|
|
|
|
#ifdef CONFIG_CONTEXT_TRACKING
|
|
|
|
/* NOTE(review): presumably a C function that may clobber x0-x7 - confirm. */
bl context_tracking_user_exit
|
|
|
|
.if \syscall == 1
|
|
|
|
/*
|
|
|
|
* Save/restore needed during syscalls. Restore syscall arguments from
|
|
|
|
* the values already saved on stack during kernel_entry.
|
|
|
|
*/
|
|
|
|
ldp x0, x1, [sp]
|
|
|
|
ldp x2, x3, [sp, #S_X2]
|
|
|
|
ldp x4, x5, [sp, #S_X4]
|
|
|
|
ldp x6, x7, [sp, #S_X6]
|
|
|
|
.endif
|
|
|
|
#endif
|
|
|
|
.endm
|
|
|
|
|
|
|
|
/*
 * ct_user_enter
 *
 * Calls context_tracking_user_enter when CONFIG_CONTEXT_TRACKING is defined
 * and expands to nothing otherwise. The bl overwrites x30 (the link register),
 * and this macro itself saves or restores no registers around the call.
 */
.macro ct_user_enter
|
|
|
|
#ifdef CONFIG_CONTEXT_TRACKING
|
|
|
|
bl context_tracking_user_enter
|
|
|
|
#endif
|
|
|
|
.endm
|
|
|
|
|
2012-03-05 18:49:27 +07:00
|
|
|
/*
|
|
|
|
* Bad Abort numbers
|
|
|
|
*-----------------
|
|
|
|
*/
|
|
|
|
#define BAD_SYNC 0
|
|
|
|
#define BAD_IRQ 1
|
|
|
|
#define BAD_FIQ 2
|
|
|
|
#define BAD_ERROR 3
|
|
|
|
|
2017-11-14 21:20:21 +07:00
|
|
|
/*
 * kernel_ventry el, label, regsize = 64
 *
 * Exception vector entry stub. It aligns itself to 128 bytes (.align 7),
 * reserves S_FRAME_SIZE bytes on the current stack and branches to
 * el<el>_<label>.
 *
 * With CONFIG_UNMAP_KERNEL_AT_EL0, the ARM64_UNMAP_KERNEL_AT_EL0 alternative
 * adds a prologue for el == 0 only: regsize == 64 reloads x30 from
 * tpidrro_el0 and then zeroes tpidrro_el0, any other regsize zeroes x30.
 * NOTE(review): presumably x30 was stashed in tpidrro_el0 by the entry
 * trampoline - confirm against the trampoline code.
 *
 * With CONFIG_VMAP_STACK, the decremented sp is tested for overflow before
 * the branch. x0 is used as the only temporary and is restored on every path
 * that reaches el<el>_<label>; the overflow path clobbers tpidr_el0 and
 * tpidrro_el0 and may leave via __bad_stack instead.
 */
.macro kernel_ventry, el, label, regsize = 64
|
2017-07-19 23:24:49 +07:00
|
|
|
.align 7
|
2017-11-14 21:24:29 +07:00
|
|
|
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
|
2017-11-14 21:38:19 +07:00
|
|
|
alternative_if ARM64_UNMAP_KERNEL_AT_EL0
|
2017-11-14 21:24:29 +07:00
|
|
|
.if \el == 0
|
|
|
|
.if \regsize == 64
|
|
|
|
mrs x30, tpidrro_el0
|
|
|
|
msr tpidrro_el0, xzr
|
|
|
|
.else
|
|
|
|
mov x30, xzr
|
|
|
|
.endif
|
|
|
|
.endif
|
2017-11-14 21:38:19 +07:00
|
|
|
alternative_else_nop_endif
|
2017-11-14 21:24:29 +07:00
|
|
|
#endif
|
|
|
|
|
2014-09-29 18:26:41 +07:00
|
|
|
/* Reserve S_FRAME_SIZE bytes on the current stack. */
sub sp, sp, #S_FRAME_SIZE
|
arm64: add VMAP_STACK overflow detection
This patch adds stack overflow detection to arm64, usable when vmap'd stacks
are in use.
Overflow is detected in a small preamble executed for each exception entry,
which checks whether there is enough space on the current stack for the general
purpose registers to be saved. If there is not enough space, the overflow
handler is invoked on a per-cpu overflow stack. This approach preserves the
original exception information in ESR_EL1 (and where appropriate, FAR_EL1).
Task and IRQ stacks are aligned to double their size, enabling overflow to be
detected with a single bit test. For example, a 16K stack is aligned to 32K,
ensuring that bit 14 of the SP must be zero. On an overflow (or underflow),
this bit is flipped. Thus, overflow (of less than the size of the stack) can be
detected by testing whether this bit is set.
The overflow check is performed before any attempt is made to access the
stack, avoiding recursive faults (and the loss of exception information
these would entail). As logical operations cannot be performed on the SP
directly, the SP is temporarily swapped with a general purpose register
using arithmetic operations to enable the test to be performed.
This gives us a useful error message on stack overflow, as can be trigger with
the LKDTM overflow test:
[ 305.388749] lkdtm: Performing direct entry OVERFLOW
[ 305.395444] Insufficient stack space to handle exception!
[ 305.395482] ESR: 0x96000047 -- DABT (current EL)
[ 305.399890] FAR: 0xffff00000a5e7f30
[ 305.401315] Task stack: [0xffff00000a5e8000..0xffff00000a5ec000]
[ 305.403815] IRQ stack: [0xffff000008000000..0xffff000008004000]
[ 305.407035] Overflow stack: [0xffff80003efce4e0..0xffff80003efcf4e0]
[ 305.409622] CPU: 0 PID: 1219 Comm: sh Not tainted 4.13.0-rc3-00021-g9636aea #5
[ 305.412785] Hardware name: linux,dummy-virt (DT)
[ 305.415756] task: ffff80003d051c00 task.stack: ffff00000a5e8000
[ 305.419221] PC is at recursive_loop+0x10/0x48
[ 305.421637] LR is at recursive_loop+0x38/0x48
[ 305.423768] pc : [<ffff00000859f330>] lr : [<ffff00000859f358>] pstate: 40000145
[ 305.428020] sp : ffff00000a5e7f50
[ 305.430469] x29: ffff00000a5e8350 x28: ffff80003d051c00
[ 305.433191] x27: ffff000008981000 x26: ffff000008f80400
[ 305.439012] x25: ffff00000a5ebeb8 x24: ffff00000a5ebeb8
[ 305.440369] x23: ffff000008f80138 x22: 0000000000000009
[ 305.442241] x21: ffff80003ce65000 x20: ffff000008f80188
[ 305.444552] x19: 0000000000000013 x18: 0000000000000006
[ 305.446032] x17: 0000ffffa2601280 x16: ffff0000081fe0b8
[ 305.448252] x15: ffff000008ff546d x14: 000000000047a4c8
[ 305.450246] x13: ffff000008ff7872 x12: 0000000005f5e0ff
[ 305.452953] x11: ffff000008ed2548 x10: 000000000005ee8d
[ 305.454824] x9 : ffff000008545380 x8 : ffff00000a5e8770
[ 305.457105] x7 : 1313131313131313 x6 : 00000000000000e1
[ 305.459285] x5 : 0000000000000000 x4 : 0000000000000000
[ 305.461781] x3 : 0000000000000000 x2 : 0000000000000400
[ 305.465119] x1 : 0000000000000013 x0 : 0000000000000012
[ 305.467724] Kernel panic - not syncing: kernel stack overflow
[ 305.470561] CPU: 0 PID: 1219 Comm: sh Not tainted 4.13.0-rc3-00021-g9636aea #5
[ 305.473325] Hardware name: linux,dummy-virt (DT)
[ 305.475070] Call trace:
[ 305.476116] [<ffff000008088ad8>] dump_backtrace+0x0/0x378
[ 305.478991] [<ffff000008088e64>] show_stack+0x14/0x20
[ 305.481237] [<ffff00000895a178>] dump_stack+0x98/0xb8
[ 305.483294] [<ffff0000080c3288>] panic+0x118/0x280
[ 305.485673] [<ffff0000080c2e9c>] nmi_panic+0x6c/0x70
[ 305.486216] [<ffff000008089710>] handle_bad_stack+0x118/0x128
[ 305.486612] Exception stack(0xffff80003efcf3a0 to 0xffff80003efcf4e0)
[ 305.487334] f3a0: 0000000000000012 0000000000000013 0000000000000400 0000000000000000
[ 305.488025] f3c0: 0000000000000000 0000000000000000 00000000000000e1 1313131313131313
[ 305.488908] f3e0: ffff00000a5e8770 ffff000008545380 000000000005ee8d ffff000008ed2548
[ 305.489403] f400: 0000000005f5e0ff ffff000008ff7872 000000000047a4c8 ffff000008ff546d
[ 305.489759] f420: ffff0000081fe0b8 0000ffffa2601280 0000000000000006 0000000000000013
[ 305.490256] f440: ffff000008f80188 ffff80003ce65000 0000000000000009 ffff000008f80138
[ 305.490683] f460: ffff00000a5ebeb8 ffff00000a5ebeb8 ffff000008f80400 ffff000008981000
[ 305.491051] f480: ffff80003d051c00 ffff00000a5e8350 ffff00000859f358 ffff00000a5e7f50
[ 305.491444] f4a0: ffff00000859f330 0000000040000145 0000000000000000 0000000000000000
[ 305.492008] f4c0: 0001000000000000 0000000000000000 ffff00000a5e8350 ffff00000859f330
[ 305.493063] [<ffff00000808205c>] __bad_stack+0x88/0x8c
[ 305.493396] [<ffff00000859f330>] recursive_loop+0x10/0x48
[ 305.493731] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.494088] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.494425] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.494649] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.494898] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.495205] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.495453] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.495708] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.496000] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.496302] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.496644] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.496894] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.497138] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.497325] [<ffff00000859f3dc>] lkdtm_OVERFLOW+0x14/0x20
[ 305.497506] [<ffff00000859f314>] lkdtm_do_action+0x1c/0x28
[ 305.497786] [<ffff00000859f178>] direct_entry+0xe0/0x170
[ 305.498095] [<ffff000008345568>] full_proxy_write+0x60/0xa8
[ 305.498387] [<ffff0000081fb7f4>] __vfs_write+0x1c/0x128
[ 305.498679] [<ffff0000081fcc68>] vfs_write+0xa0/0x1b0
[ 305.498926] [<ffff0000081fe0fc>] SyS_write+0x44/0xa0
[ 305.499182] Exception stack(0xffff00000a5ebec0 to 0xffff00000a5ec000)
[ 305.499429] bec0: 0000000000000001 000000001c4cf5e0 0000000000000009 000000001c4cf5e0
[ 305.499674] bee0: 574f4c465245564f 0000000000000000 0000000000000000 8000000080808080
[ 305.499904] bf00: 0000000000000040 0000000000000038 fefefeff1b4bc2ff 7f7f7f7f7f7fff7f
[ 305.500189] bf20: 0101010101010101 0000000000000000 000000000047a4c8 0000000000000038
[ 305.500712] bf40: 0000000000000000 0000ffffa2601280 0000ffffc63f6068 00000000004b5000
[ 305.501241] bf60: 0000000000000001 000000001c4cf5e0 0000000000000009 000000001c4cf5e0
[ 305.501791] bf80: 0000000000000020 0000000000000000 00000000004b5000 000000001c4cc458
[ 305.502314] bfa0: 0000000000000000 0000ffffc63f7950 000000000040a3c4 0000ffffc63f70e0
[ 305.502762] bfc0: 0000ffffa2601268 0000000080000000 0000000000000001 0000000000000040
[ 305.503207] bfe0: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
[ 305.503680] [<ffff000008082fb0>] el0_svc_naked+0x24/0x28
[ 305.504720] Kernel Offset: disabled
[ 305.505189] CPU features: 0x002082
[ 305.505473] Memory Limit: none
[ 305.506181] ---[ end Kernel panic - not syncing: kernel stack overflow
This patch was co-authored by Ard Biesheuvel and Mark Rutland.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: James Morse <james.morse@arm.com>
2017-07-15 02:30:35 +07:00
|
|
|
#ifdef CONFIG_VMAP_STACK
|
|
|
|
/*
|
|
|
|
* Test whether the SP has overflowed, without corrupting a GPR.
|
|
|
|
* Task and IRQ stacks are aligned to (1 << THREAD_SHIFT).
|
|
|
|
*/
|
|
|
|
add sp, sp, x0 // sp' = sp + x0
|
|
|
|
sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp
|
|
|
|
/* x0 now holds the decremented sp; bit THREAD_SHIFT set sends us to 0 below. */
tbnz x0, #THREAD_SHIFT, 0f
|
|
|
|
sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0
|
|
|
|
sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp
|
2017-11-14 21:20:21 +07:00
|
|
|
/* No overflow detected: x0 and the decremented sp are both restored. */
b el\()\el\()_\label
|
arm64: add VMAP_STACK overflow detection
This patch adds stack overflow detection to arm64, usable when vmap'd stacks
are in use.
Overflow is detected in a small preamble executed for each exception entry,
which checks whether there is enough space on the current stack for the general
purpose registers to be saved. If there is not enough space, the overflow
handler is invoked on a per-cpu overflow stack. This approach preserves the
original exception information in ESR_EL1 (and where appropriate, FAR_EL1).
Task and IRQ stacks are aligned to double their size, enabling overflow to be
detected with a single bit test. For example, a 16K stack is aligned to 32K,
ensuring that bit 14 of the SP must be zero. On an overflow (or underflow),
this bit is flipped. Thus, overflow (of less than the size of the stack) can be
detected by testing whether this bit is set.
The overflow check is performed before any attempt is made to access the
stack, avoiding recursive faults (and the loss of exception information
these would entail). As logical operations cannot be performed on the SP
directly, the SP is temporarily swapped with a general purpose register
using arithmetic operations to enable the test to be performed.
This gives us a useful error message on stack overflow, as can be trigger with
the LKDTM overflow test:
[ 305.388749] lkdtm: Performing direct entry OVERFLOW
[ 305.395444] Insufficient stack space to handle exception!
[ 305.395482] ESR: 0x96000047 -- DABT (current EL)
[ 305.399890] FAR: 0xffff00000a5e7f30
[ 305.401315] Task stack: [0xffff00000a5e8000..0xffff00000a5ec000]
[ 305.403815] IRQ stack: [0xffff000008000000..0xffff000008004000]
[ 305.407035] Overflow stack: [0xffff80003efce4e0..0xffff80003efcf4e0]
[ 305.409622] CPU: 0 PID: 1219 Comm: sh Not tainted 4.13.0-rc3-00021-g9636aea #5
[ 305.412785] Hardware name: linux,dummy-virt (DT)
[ 305.415756] task: ffff80003d051c00 task.stack: ffff00000a5e8000
[ 305.419221] PC is at recursive_loop+0x10/0x48
[ 305.421637] LR is at recursive_loop+0x38/0x48
[ 305.423768] pc : [<ffff00000859f330>] lr : [<ffff00000859f358>] pstate: 40000145
[ 305.428020] sp : ffff00000a5e7f50
[ 305.430469] x29: ffff00000a5e8350 x28: ffff80003d051c00
[ 305.433191] x27: ffff000008981000 x26: ffff000008f80400
[ 305.439012] x25: ffff00000a5ebeb8 x24: ffff00000a5ebeb8
[ 305.440369] x23: ffff000008f80138 x22: 0000000000000009
[ 305.442241] x21: ffff80003ce65000 x20: ffff000008f80188
[ 305.444552] x19: 0000000000000013 x18: 0000000000000006
[ 305.446032] x17: 0000ffffa2601280 x16: ffff0000081fe0b8
[ 305.448252] x15: ffff000008ff546d x14: 000000000047a4c8
[ 305.450246] x13: ffff000008ff7872 x12: 0000000005f5e0ff
[ 305.452953] x11: ffff000008ed2548 x10: 000000000005ee8d
[ 305.454824] x9 : ffff000008545380 x8 : ffff00000a5e8770
[ 305.457105] x7 : 1313131313131313 x6 : 00000000000000e1
[ 305.459285] x5 : 0000000000000000 x4 : 0000000000000000
[ 305.461781] x3 : 0000000000000000 x2 : 0000000000000400
[ 305.465119] x1 : 0000000000000013 x0 : 0000000000000012
[ 305.467724] Kernel panic - not syncing: kernel stack overflow
[ 305.470561] CPU: 0 PID: 1219 Comm: sh Not tainted 4.13.0-rc3-00021-g9636aea #5
[ 305.473325] Hardware name: linux,dummy-virt (DT)
[ 305.475070] Call trace:
[ 305.476116] [<ffff000008088ad8>] dump_backtrace+0x0/0x378
[ 305.478991] [<ffff000008088e64>] show_stack+0x14/0x20
[ 305.481237] [<ffff00000895a178>] dump_stack+0x98/0xb8
[ 305.483294] [<ffff0000080c3288>] panic+0x118/0x280
[ 305.485673] [<ffff0000080c2e9c>] nmi_panic+0x6c/0x70
[ 305.486216] [<ffff000008089710>] handle_bad_stack+0x118/0x128
[ 305.486612] Exception stack(0xffff80003efcf3a0 to 0xffff80003efcf4e0)
[ 305.487334] f3a0: 0000000000000012 0000000000000013 0000000000000400 0000000000000000
[ 305.488025] f3c0: 0000000000000000 0000000000000000 00000000000000e1 1313131313131313
[ 305.488908] f3e0: ffff00000a5e8770 ffff000008545380 000000000005ee8d ffff000008ed2548
[ 305.489403] f400: 0000000005f5e0ff ffff000008ff7872 000000000047a4c8 ffff000008ff546d
[ 305.489759] f420: ffff0000081fe0b8 0000ffffa2601280 0000000000000006 0000000000000013
[ 305.490256] f440: ffff000008f80188 ffff80003ce65000 0000000000000009 ffff000008f80138
[ 305.490683] f460: ffff00000a5ebeb8 ffff00000a5ebeb8 ffff000008f80400 ffff000008981000
[ 305.491051] f480: ffff80003d051c00 ffff00000a5e8350 ffff00000859f358 ffff00000a5e7f50
[ 305.491444] f4a0: ffff00000859f330 0000000040000145 0000000000000000 0000000000000000
[ 305.492008] f4c0: 0001000000000000 0000000000000000 ffff00000a5e8350 ffff00000859f330
[ 305.493063] [<ffff00000808205c>] __bad_stack+0x88/0x8c
[ 305.493396] [<ffff00000859f330>] recursive_loop+0x10/0x48
[ 305.493731] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.494088] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.494425] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.494649] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.494898] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.495205] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.495453] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.495708] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.496000] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.496302] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.496644] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.496894] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.497138] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.497325] [<ffff00000859f3dc>] lkdtm_OVERFLOW+0x14/0x20
[ 305.497506] [<ffff00000859f314>] lkdtm_do_action+0x1c/0x28
[ 305.497786] [<ffff00000859f178>] direct_entry+0xe0/0x170
[ 305.498095] [<ffff000008345568>] full_proxy_write+0x60/0xa8
[ 305.498387] [<ffff0000081fb7f4>] __vfs_write+0x1c/0x128
[ 305.498679] [<ffff0000081fcc68>] vfs_write+0xa0/0x1b0
[ 305.498926] [<ffff0000081fe0fc>] SyS_write+0x44/0xa0
[ 305.499182] Exception stack(0xffff00000a5ebec0 to 0xffff00000a5ec000)
[ 305.499429] bec0: 0000000000000001 000000001c4cf5e0 0000000000000009 000000001c4cf5e0
[ 305.499674] bee0: 574f4c465245564f 0000000000000000 0000000000000000 8000000080808080
[ 305.499904] bf00: 0000000000000040 0000000000000038 fefefeff1b4bc2ff 7f7f7f7f7f7fff7f
[ 305.500189] bf20: 0101010101010101 0000000000000000 000000000047a4c8 0000000000000038
[ 305.500712] bf40: 0000000000000000 0000ffffa2601280 0000ffffc63f6068 00000000004b5000
[ 305.501241] bf60: 0000000000000001 000000001c4cf5e0 0000000000000009 000000001c4cf5e0
[ 305.501791] bf80: 0000000000000020 0000000000000000 00000000004b5000 000000001c4cc458
[ 305.502314] bfa0: 0000000000000000 0000ffffc63f7950 000000000040a3c4 0000ffffc63f70e0
[ 305.502762] bfc0: 0000ffffa2601268 0000000080000000 0000000000000001 0000000000000040
[ 305.503207] bfe0: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
[ 305.503680] [<ffff000008082fb0>] el0_svc_naked+0x24/0x28
[ 305.504720] Kernel Offset: disabled
[ 305.505189] CPU features: 0x002082
[ 305.505473] Memory Limit: none
[ 305.506181] ---[ end Kernel panic - not syncing: kernel stack overflow
This patch was co-authored by Ard Biesheuvel and Mark Rutland.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: James Morse <james.morse@arm.com>
2017-07-15 02:30:35 +07:00
|
|
|
|
|
|
|
/* On entry here: x0 = decremented sp, sp = decremented sp + original x0. */
0:
|
|
|
|
/*
|
|
|
|
* Either we've just detected an overflow, or we've taken an exception
|
|
|
|
* while on the overflow stack. Either way, we won't return to
|
|
|
|
* userspace, and can clobber EL0 registers to free up GPRs.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Stash the original SP (minus S_FRAME_SIZE) in tpidr_el0. */
|
|
|
|
msr tpidr_el0, x0
|
|
|
|
|
|
|
|
/* Recover the original x0 value and stash it in tpidrro_el0 */
|
|
|
|
sub x0, sp, x0
|
|
|
|
msr tpidrro_el0, x0
|
|
|
|
|
|
|
|
/* Switch to the overflow stack */
|
|
|
|
/* NOTE(review): x0 looks like a scratch register for adr_this_cpu - confirm against its definition. */
adr_this_cpu sp, overflow_stack + OVERFLOW_STACK_SIZE, x0
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check whether we were already on the overflow stack. This may happen
|
|
|
|
* after panic() re-enables interrupts.
|
|
|
|
*/
|
|
|
|
mrs x0, tpidr_el0 // sp of interrupted context
|
|
|
|
sub x0, sp, x0 // delta with top of overflow stack
|
|
|
|
tst x0, #~(OVERFLOW_STACK_SIZE - 1) // within range?
|
|
|
|
b.ne __bad_stack // no? -> bad stack pointer
|
|
|
|
|
|
|
|
/* We were already on the overflow stack. Restore sp/x0 and carry on. */
|
|
|
|
/* sp = top of overflow stack - delta, i.e. the sp stashed in tpidr_el0. */
sub sp, sp, x0
|
|
|
|
mrs x0, tpidrro_el0
|
|
|
|
#endif
|
2017-11-14 21:20:21 +07:00
|
|
|
/* Reached directly when CONFIG_VMAP_STACK is off, or after the restore above. */
b el\()\el\()_\label
|
2017-07-19 23:24:49 +07:00
|
|
|
.endm
|
|
|
|
|
2017-11-14 21:24:29 +07:00
|
|
|
/*
 * tramp_alias dst, sym
 *
 * Sets \dst to TRAMP_VALIAS plus the offset of \sym from the start of
 * .entry.tramp.text. The offset is applied with a single add immediate,
 * so it has to be encodable as one. Only \dst is written.
 */
.macro tramp_alias, dst, sym
|
|
|
|
mov_q \dst, TRAMP_VALIAS
|
|
|
|
add \dst, \dst, #(\sym - .entry.tramp.text)
|
|
|
|
.endm
|
|
|
|
|
2017-07-19 23:24:49 +07:00
|
|
|
.macro kernel_entry, el, regsize = 64
|
2012-03-05 18:49:27 +07:00
|
|
|
.if \regsize == 32
|
|
|
|
mov w0, w0 // zero upper 32 bits of x0
|
|
|
|
.endif
|
2014-09-29 18:26:41 +07:00
|
|
|
stp x0, x1, [sp, #16 * 0]
|
|
|
|
stp x2, x3, [sp, #16 * 1]
|
|
|
|
stp x4, x5, [sp, #16 * 2]
|
|
|
|
stp x6, x7, [sp, #16 * 3]
|
|
|
|
stp x8, x9, [sp, #16 * 4]
|
|
|
|
stp x10, x11, [sp, #16 * 5]
|
|
|
|
stp x12, x13, [sp, #16 * 6]
|
|
|
|
stp x14, x15, [sp, #16 * 7]
|
|
|
|
stp x16, x17, [sp, #16 * 8]
|
|
|
|
stp x18, x19, [sp, #16 * 9]
|
|
|
|
stp x20, x21, [sp, #16 * 10]
|
|
|
|
stp x22, x23, [sp, #16 * 11]
|
|
|
|
stp x24, x25, [sp, #16 * 12]
|
|
|
|
stp x26, x27, [sp, #16 * 13]
|
|
|
|
stp x28, x29, [sp, #16 * 14]
|
|
|
|
|
2012-03-05 18:49:27 +07:00
|
|
|
.if \el == 0
|
|
|
|
mrs x21, sp_el0
|
arm64: split thread_info from task stack
This patch moves arm64's struct thread_info from the task stack into
task_struct. This protects thread_info from corruption in the case of
stack overflows, and makes its address harder to determine if stack
addresses are leaked, making a number of attacks more difficult. Precise
detection and handling of overflow is left for subsequent patches.
Largely, this involves changing code to store the task_struct in sp_el0,
and acquire the thread_info from the task struct. Core code now
implements current_thread_info(), and as noted in <linux/sched.h> this
relies on offsetof(task_struct, thread_info) == 0, enforced by core
code.
This change means that the 'tsk' register used in entry.S now points to
a task_struct, rather than a thread_info as it used to. To make this
clear, the TI_* field offsets are renamed to TSK_TI_*, with asm-offsets
appropriately updated to account for the structural change.
Userspace clobbers sp_el0, and we can no longer restore this from the
stack. Instead, the current task is cached in a per-cpu variable that we
can safely access from early assembly as interrupts are disabled (and we
are thus not preemptible).
Both secondary entry and idle are updated to stash the sp and task
pointer separately.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Cc: AKASHI Takahiro <takahiro.akashi@linaro.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: James Morse <james.morse@arm.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-11-04 03:23:13 +07:00
|
|
|
ldr_this_cpu tsk, __entry_task, x20 // Ensure MDSCR_EL1.SS is clear,
|
|
|
|
ldr x19, [tsk, #TSK_TI_FLAGS] // since we can unmask debug
|
arm64: debug: avoid accessing mdscr_el1 on fault paths where possible
Since mdscr_el1 is part of the debug register group, it is highly likely
to be trapped by a hypervisor to prevent virtual machines from debugging
(buggering?) each other. Unfortunately, this absolutely destroys our
performance, since we access the register on many of our low-level
fault handling paths to keep track of the various debug state machines.
This patch removes our dependency on mdscr_el1 in the case that debugging
is not being used. More specifically we:
- Use TIF_SINGLESTEP to indicate that a task is stepping at EL0 and
avoid disabling step in the MDSCR when we don't need to.
MDSCR_EL1.SS handling is moved to kernel_entry, when trapping from
userspace.
- Ensure debug exceptions are re-enabled on *all* exception entry
paths, even the debug exception handling path (where we re-enable
exceptions after invoking the handler). Since we can now rely on
MDSCR_EL1.SS being cleared by the entry code, exception handlers can
usually enable debug immediately before enabling interrupts.
- Remove all debug exception unmasking from ret_to_user and
el1_preempt, since we will never get here with debug exceptions
masked.
This results in a slight change to kernel debug behaviour, where we now
step into interrupt handlers and data aborts from EL1 when debugging the
kernel, which is actually a useful thing to do. A side-effect of this is
that it *does* potentially prevent stepping off {break,watch}points when
there is a high-frequency interrupt source (e.g. a timer), so a debugger
would need to use either breakpoints or manually disable interrupts to
get around this issue.
With this patch applied, guest performance is restored under KVM when
debug register accesses are trapped (and we get a measurable performance
increase on the host on Cortex-A57 too).
Cc: Ian Campbell <ian.campbell@citrix.com>
Tested-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2014-04-30 01:04:06 +07:00
|
|
|
disable_step_tsk x19, x20 // exceptions when scheduling.
|
2015-12-10 17:22:41 +07:00
|
|
|
|
|
|
|
mov x29, xzr // fp pointed to user-space
|
2012-03-05 18:49:27 +07:00
|
|
|
.else
|
|
|
|
add x21, sp, #S_FRAME_SIZE
|
2016-06-21 00:28:01 +07:00
|
|
|
get_thread_info tsk
|
|
|
|
/* Save the task's original addr_limit and set USER_DS (TASK_SIZE_64) */
|
arm64: split thread_info from task stack
This patch moves arm64's struct thread_info from the task stack into
task_struct. This protects thread_info from corruption in the case of
stack overflows, and makes its address harder to determine if stack
addresses are leaked, making a number of attacks more difficult. Precise
detection and handling of overflow is left for subsequent patches.
Largely, this involves changing code to store the task_struct in sp_el0,
and acquire the thread_info from the task struct. Core code now
implements current_thread_info(), and as noted in <linux/sched.h> this
relies on offsetof(task_struct, thread_info) == 0, enforced by core
code.
This change means that the 'tsk' register used in entry.S now points to
a task_struct, rather than a thread_info as it used to. To make this
clear, the TI_* field offsets are renamed to TSK_TI_*, with asm-offsets
appropriately updated to account for the structural change.
Userspace clobbers sp_el0, and we can no longer restore this from the
stack. Instead, the current task is cached in a per-cpu variable that we
can safely access from early assembly as interrupts are disabled (and we
are thus not preemptible).
Both secondary entry and idle are updated to stash the sp and task
pointer separately.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Cc: AKASHI Takahiro <takahiro.akashi@linaro.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: James Morse <james.morse@arm.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-11-04 03:23:13 +07:00
|
|
|
ldr x20, [tsk, #TSK_TI_ADDR_LIMIT]
|
2016-06-21 00:28:01 +07:00
|
|
|
str x20, [sp, #S_ORIG_ADDR_LIMIT]
|
|
|
|
mov x20, #TASK_SIZE_64
|
arm64: split thread_info from task stack
This patch moves arm64's struct thread_info from the task stack into
task_struct. This protects thread_info from corruption in the case of
stack overflows, and makes its address harder to determine if stack
addresses are leaked, making a number of attacks more difficult. Precise
detection and handling of overflow is left for subsequent patches.
Largely, this involves changing code to store the task_struct in sp_el0,
and acquire the thread_info from the task struct. Core code now
implements current_thread_info(), and as noted in <linux/sched.h> this
relies on offsetof(task_struct, thread_info) == 0, enforced by core
code.
This change means that the 'tsk' register used in entry.S now points to
a task_struct, rather than a thread_info as it used to. To make this
clear, the TI_* field offsets are renamed to TSK_TI_*, with asm-offsets
appropriately updated to account for the structural change.
Userspace clobbers sp_el0, and we can no longer restore this from the
stack. Instead, the current task is cached in a per-cpu variable that we
can safely access from early assembly as interrupts are disabled (and we
are thus not preemptible).
Both secondary entry and idle are updated to stash the sp and task
pointer separately.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Cc: AKASHI Takahiro <takahiro.akashi@linaro.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: James Morse <james.morse@arm.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-11-04 03:23:13 +07:00
|
|
|
str x20, [tsk, #TSK_TI_ADDR_LIMIT]
|
2016-09-01 20:35:59 +07:00
|
|
|
/* No need to reset PSTATE.UAO, hardware's already set it to 0 for us */
|
2016-06-21 00:28:01 +07:00
|
|
|
.endif /* \el == 0 */
|
2012-03-05 18:49:27 +07:00
|
|
|
mrs x22, elr_el1
|
|
|
|
mrs x23, spsr_el1
|
|
|
|
stp lr, x21, [sp, #S_LR]
|
2016-09-02 20:54:03 +07:00
|
|
|
|
arm64: unwind: reference pt_regs via embedded stack frame
As it turns out, the unwind code is slightly broken, and probably has
been for a while. The problem is in the dumping of the exception stack,
which is intended to dump the contents of the pt_regs struct at each
level in the call stack where an exception was taken and routed to a
routine marked as __exception (which means its stack frame is right
below the pt_regs struct on the stack).
'Right below the pt_regs struct' is ill defined, though: the unwind
code assigns 'frame pointer + 0x10' to the .sp member of the stackframe
struct at each level, and dump_backtrace() happily dereferences that as
the pt_regs pointer when encountering an __exception routine. However,
the actual size of the stack frame created by this routine (which could
be one of many __exception routines we have in the kernel) is not known,
and so frame.sp is pretty useless to figure out where struct pt_regs
really is.
So it seems the only way to ensure that we can find our struct pt_regs
when walking the stack frames is to put it at a known fixed offset of
the stack frame pointer that is passed to such __exception routines.
The simplest way to do that is to put it inside pt_regs itself, which is
the main change implemented by this patch. As a bonus, doing this allows
us to get rid of a fair amount of cruft related to walking from one stack
to the other, which is especially nice since we intend to introduce yet
another stack for overflow handling once we add support for vmapped
stacks. It also fixes an inconsistency where we only add a stack frame
pointing to ELR_EL1 if we are executing from the IRQ stack but not when
we are executing from the task stack.
To consistently identify exception regs even in the presence of exceptions
taken from entry code, we must check whether the next frame was created
by entry text, rather than whether the current frame was created by
exception text.
To avoid backtracing using PCs that fall in the idmap, or are controlled
by userspace, we must explicitly zero the FP and LR in startup paths, and
must ensure that the frame embedded in pt_regs is zeroed upon entry from
EL0. To avoid these NULL entries showing up in the backtrace, unwind_frame()
is updated to avoid them.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
[Mark: compare current frame against .entry.text, avoid bogus PCs]
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: James Morse <james.morse@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
2017-07-23 00:45:33 +07:00
|
|
|
/*
|
|
|
|
* In order to be able to dump the contents of struct pt_regs at the
|
|
|
|
* time the exception was taken (in case we attempt to walk the call
|
|
|
|
* stack later), chain it together with the stack frames.
|
|
|
|
*/
|
|
|
|
.if \el == 0
|
|
|
|
stp xzr, xzr, [sp, #S_STACKFRAME]
|
|
|
|
.else
|
|
|
|
stp x29, x22, [sp, #S_STACKFRAME]
|
|
|
|
.endif
|
|
|
|
add x29, sp, #S_STACKFRAME
|
|
|
|
|
2016-09-02 20:54:03 +07:00
|
|
|
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
|
|
|
|
/*
|
|
|
|
* Set the TTBR0 PAN bit in SPSR. When the exception is taken from
|
|
|
|
* EL0, there is no need to check the state of TTBR0_EL1 since
|
|
|
|
* accesses are always enabled.
|
|
|
|
* Note that the meaning of this bit differs from the ARMv8.1 PAN
|
|
|
|
* feature as all TTBR0_EL1 accesses are disabled, not just those to
|
|
|
|
* user mappings.
|
|
|
|
*/
|
|
|
|
alternative_if ARM64_HAS_PAN
|
|
|
|
b 1f // skip TTBR0 PAN
|
|
|
|
alternative_else_nop_endif
|
|
|
|
|
|
|
|
.if \el != 0
|
2017-08-10 19:58:16 +07:00
|
|
|
mrs x21, ttbr1_el1
|
2017-12-02 00:33:48 +07:00
|
|
|
tst x21, #TTBR_ASID_MASK // Check for the reserved ASID
|
2016-09-02 20:54:03 +07:00
|
|
|
orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR
|
|
|
|
b.eq 1f // TTBR0 access already disabled
|
|
|
|
and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR
|
|
|
|
.endif
|
|
|
|
|
|
|
|
__uaccess_ttbr0_disable x21
|
|
|
|
1:
|
|
|
|
#endif
|
|
|
|
|
2012-03-05 18:49:27 +07:00
|
|
|
stp x22, x23, [sp, #S_PC]
|
|
|
|
|
2017-08-01 21:35:54 +07:00
|
|
|
/* Not in a syscall by default (el0_svc overwrites for real syscall) */
|
2012-03-05 18:49:27 +07:00
|
|
|
.if \el == 0
|
2017-08-01 21:35:54 +07:00
|
|
|
mov w21, #NO_SYSCALL
|
arm64: syscallno is secretly an int, make it official
The upper 32 bits of the syscallno field in thread_struct are
handled inconsistently, being sometimes zero extended and sometimes
sign-extended. In fact, only the lower 32 bits seem to have any
real significance for the behaviour of the code: it's been OK to
handle the upper bits inconsistently because they don't matter.
Currently, the only place I can find where those bits are
significant is in calling trace_sys_enter(), which may be
unintentional: for example, if a compat tracer attempts to cancel a
syscall by passing -1 to (COMPAT_)PTRACE_SET_SYSCALL at the
syscall-enter-stop, it will be traced as syscall 4294967295
rather than -1 as might be expected (and as occurs for a native
tracer doing the same thing). Elsewhere, reads of syscallno cast
it to an int or truncate it.
There's also a conspicuous amount of code and casting to bodge
around the fact that although semantically an int, syscallno is
stored as a u64.
Let's not pretend any more.
In order to preserve the stp x instruction that stores the syscall
number in entry.S, this patch special-cases the layout of struct
pt_regs for big endian so that the newly 32-bit syscallno field
maps onto the low bits of the stored value. This is not beautiful,
but benchmarking of the getpid syscall on Juno indicates a
minor slowdown if the stp is split into an stp x and stp w.
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-08-01 21:35:53 +07:00
|
|
|
str w21, [sp, #S_SYSCALLNO]
|
2012-03-05 18:49:27 +07:00
|
|
|
.endif
|
|
|
|
|
2015-12-04 18:02:25 +07:00
|
|
|
/*
|
|
|
|
* Set sp_el0 to the current task_struct (thread_info is its first member).
|
|
|
|
*/
|
|
|
|
.if \el == 0
|
|
|
|
msr sp_el0, tsk
|
|
|
|
.endif
|
|
|
|
|
2012-03-05 18:49:27 +07:00
|
|
|
/*
|
|
|
|
* Registers that may be useful after this macro is invoked:
|
|
|
|
*
|
|
|
|
* x21 - aborted SP
|
|
|
|
* x22 - aborted PC
|
|
|
|
* x23 - aborted PSTATE
|
|
|
|
*/
|
|
|
|
.endm
|
|
|
|
|
2015-08-19 21:57:09 +07:00
|
|
|
/*
 * kernel_exit - restore the register state saved in the exception frame at
 * sp and return from the exception.
 *
 * \el: exception level being returned to (0 for EL0, non-zero otherwise).
 *
 * x21/x22 hold the saved ELR/SPSR and x23 the saved EL0 stack pointer until
 * the general purpose registers are reloaded from the frame. When returning
 * to EL0 with CONFIG_UNMAP_KERNEL_AT_EL0, the return may instead go through
 * the tramp_exit_native/tramp_exit_compat aliases rather than a direct eret.
 */
.macro kernel_exit, el
|
2016-06-21 00:28:01 +07:00
|
|
|
.if \el != 0
|
2017-11-02 19:12:37 +07:00
|
|
|
disable_daif
|
|
|
|

2016-06-21 00:28:01 +07:00
|
|
|
/* Restore the task's original addr_limit. */
|
|
|
|
ldr x20, [sp, #S_ORIG_ADDR_LIMIT]
|
arm64: split thread_info from task stack
This patch moves arm64's struct thread_info from the task stack into
task_struct. This protects thread_info from corruption in the case of
stack overflows, and makes its address harder to determine if stack
addresses are leaked, making a number of attacks more difficult. Precise
detection and handling of overflow is left for subsequent patches.
Largely, this involves changing code to store the task_struct in sp_el0,
and acquire the thread_info from the task struct. Core code now
implements current_thread_info(), and as noted in <linux/sched.h> this
relies on offsetof(task_struct, thread_info) == 0, enforced by core
code.
This change means that the 'tsk' register used in entry.S now points to
a task_struct, rather than a thread_info as it used to. To make this
clear, the TI_* field offsets are renamed to TSK_TI_*, with asm-offsets
appropriately updated to account for the structural change.
Userspace clobbers sp_el0, and we can no longer restore this from the
stack. Instead, the current task is cached in a per-cpu variable that we
can safely access from early assembly as interrupts are disabled (and we
are thus not preemptible).
Both secondary entry and idle are updated to stash the sp and task
pointer separately.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Cc: AKASHI Takahiro <takahiro.akashi@linaro.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: James Morse <james.morse@arm.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-11-04 03:23:13 +07:00
|
|
|
str x20, [tsk, #TSK_TI_ADDR_LIMIT]
|
2016-06-21 00:28:01 +07:00
|
|
|

|
|
|
/* No need to restore UAO, it will be restored from SPSR_EL1 */
|
|
|
|
.endif
|
|
|
|

2012-03-05 18:49:27 +07:00
|
|
|
ldp x21, x22, [sp, #S_PC] // load ELR, SPSR
|
|
|
|
.if \el == 0
|
2014-05-31 02:34:15 +07:00
|
|
|
ct_user_enter
|
2016-09-02 20:54:03 +07:00
|
|
|
.endif
|
|
|
|

|
|
|
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
|
|
|
|
/*
|
|
|
|
* Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR
|
|
|
|
* PAN bit checking.
|
|
|
|
*/
|
|
|
|
alternative_if ARM64_HAS_PAN
|
|
|
|
b 2f // skip TTBR0 PAN
|
|
|
|
alternative_else_nop_endif
|
|
|
|

|
|
|
.if \el != 0
|
|
|
|
tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set
|
|
|
|
.endif
|
|
|
|

2017-08-10 19:58:16 +07:00
|
|
|
// x0 and x1 are handed to the macro here; both are reloaded from the frame below
__uaccess_ttbr0_enable x0, x1
|
2016-09-02 20:54:03 +07:00
|
|
|

|
|
|
.if \el == 0
|
|
|
|
/*
|
|
|
|
* Enable errata workarounds only if returning to user. The only
|
|
|
|
* workaround currently required for TTBR0_EL1 changes are for the
|
|
|
|
* Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache
|
|
|
|
* corruption).
|
|
|
|
*/
|
2017-08-10 19:34:30 +07:00
|
|
|
post_ttbr_update_workaround
|
2016-09-02 20:54:03 +07:00
|
|
|
.endif
|
|
|
|
1:
|
|
|
|
.if \el != 0
|
|
|
|
and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit
|
|
|
|
.endif
|
|
|
|
2:
|
|
|
|
#endif
|
|
|
|

|
|
|
.if \el == 0
|
2012-03-05 18:49:27 +07:00
|
|
|
ldr x23, [sp, #S_SP] // load return stack pointer
|
2014-09-29 18:26:41 +07:00
|
|
|
msr sp_el0, x23
|
2017-11-14 21:24:29 +07:00
|
|
|
// The flags set by this tst are consumed again by the "bne 4f" near the end of the macro
tst x22, #PSR_MODE32_BIT // native task?
|
|
|
|
b.eq 3f
|
|
|
|

2015-03-24 02:07:02 +07:00
|
|
|
#ifdef CONFIG_ARM64_ERRATUM_845719
|
2016-09-07 17:07:09 +07:00
|
|
|
alternative_if ARM64_WORKAROUND_845719
|
2015-07-22 18:21:03 +07:00
|
|
|
#ifdef CONFIG_PID_IN_CONTEXTIDR
|
|
|
|
// Write contextidr_el1 back with its current value; x29 is scratch here, it is reloaded from the frame below
mrs x29, contextidr_el1
|
|
|
|
msr contextidr_el1, x29
|
2015-03-24 02:07:02 +07:00
|
|
|
#else
|
2015-07-22 18:21:03 +07:00
|
|
|
msr contextidr_el1, xzr
|
2015-03-24 02:07:02 +07:00
|
|
|
#endif
|
2016-09-07 17:07:09 +07:00
|
|
|
alternative_else_nop_endif
|
2015-03-24 02:07:02 +07:00
|
|
|
#endif
|
2017-11-14 21:24:29 +07:00
|
|
|
3:
|
2012-03-05 18:49:27 +07:00
|
|
|
.endif
|
2016-09-02 20:54:03 +07:00
|
|
|

2014-09-29 18:26:41 +07:00
|
|
|
msr elr_el1, x21 // set up the return data
|
|
|
|
msr spsr_el1, x22
|
|
|
|
// Reload x0-x29 from the frame; this overwrites the x21-x23 scratch values used above
ldp x0, x1, [sp, #16 * 0]
|
|
|
|
ldp x2, x3, [sp, #16 * 1]
|
|
|
|
ldp x4, x5, [sp, #16 * 2]
|
|
|
|
ldp x6, x7, [sp, #16 * 3]
|
|
|
|
ldp x8, x9, [sp, #16 * 4]
|
|
|
|
ldp x10, x11, [sp, #16 * 5]
|
|
|
|
ldp x12, x13, [sp, #16 * 6]
|
|
|
|
ldp x14, x15, [sp, #16 * 7]
|
|
|
|
ldp x16, x17, [sp, #16 * 8]
|
|
|
|
ldp x18, x19, [sp, #16 * 9]
|
|
|
|
ldp x20, x21, [sp, #16 * 10]
|
|
|
|
ldp x22, x23, [sp, #16 * 11]
|
|
|
|
ldp x24, x25, [sp, #16 * 12]
|
|
|
|
ldp x26, x27, [sp, #16 * 13]
|
|
|
|
ldp x28, x29, [sp, #16 * 14]
|
|
|
|
ldr lr, [sp, #S_LR]
|
|
|
|
add sp, sp, #S_FRAME_SIZE // restore sp
|
2017-11-14 21:24:29 +07:00
|
|
|

|
|
|
.if \el == 0
|
2017-11-14 21:38:19 +07:00
|
|
|
// NOTE(review): presumably emits eret, patched to nop when ARM64_UNMAP_KERNEL_AT_EL0 is set - confirm against the alternative_insn definition
alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0
|
|
|
|
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
|
2017-11-14 21:24:29 +07:00
|
|
|
// Flags still come from the PSR_MODE32_BIT test above (nothing in between sets them): NE selects the compat exit at 4:
bne 4f
|
|
|
|
// x30 is needed for the branch target below, so its value is parked in far_el1 (NOTE(review): presumably recovered by tramp_exit_native - confirm)
msr far_el1, x30
|
|
|
|
tramp_alias x30, tramp_exit_native
|
|
|
|
br x30
|
|
|
|
4:
|
|
|
|
tramp_alias x30, tramp_exit_compat
|
|
|
|
br x30
|
2017-11-14 21:38:19 +07:00
|
|
|
#endif
|
2017-11-14 21:24:29 +07:00
|
|
|
.else
|
|
|
|
eret
|
|
|
|
.endif
|
2012-03-05 18:49:27 +07:00
|
|
|
.endm
|
|
|
|
|
2015-12-15 18:21:25 +07:00
|
|
|
/*
 * irq_stack_entry - switch sp to this CPU's IRQ stack when the exception was
 * taken while running on the current task's stack; otherwise leave sp alone.
 *
 * Expects tsk to point at the current task. The original sp is kept in x19
 * for irq_stack_exit. Clobbers x25 and x26.
 */
.macro irq_stack_entry
|
2015-12-04 18:02:27 +07:00
|
|
|
mov x19, sp // preserve the original sp
|
|
|
|

|
|
|
/*
|
arm64: split thread_info from task stack
This patch moves arm64's struct thread_info from the task stack into
task_struct. This protects thread_info from corruption in the case of
stack overflows, and makes its address harder to determine if stack
addresses are leaked, making a number of attacks more difficult. Precise
detection and handling of overflow is left for subsequent patches.
Largely, this involves changing code to store the task_struct in sp_el0,
and acquire the thread_info from the task struct. Core code now
implements current_thread_info(), and as noted in <linux/sched.h> this
relies on offsetof(task_struct, thread_info) == 0, enforced by core
code.
This change means that the 'tsk' register used in entry.S now points to
a task_struct, rather than a thread_info as it used to. To make this
clear, the TI_* field offsets are renamed to TSK_TI_*, with asm-offsets
appropriately updated to account for the structural change.
Userspace clobbers sp_el0, and we can no longer restore this from the
stack. Instead, the current task is cached in a per-cpu variable that we
can safely access from early assembly as interrupts are disabled (and we
are thus not preemptible).
Both secondary entry and idle are updated to stash the sp and task
pointer separately.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Cc: AKASHI Takahiro <takahiro.akashi@linaro.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: James Morse <james.morse@arm.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-11-04 03:23:13 +07:00
|
|
|
* Compare sp with the base of the task stack.
|
|
|
|
* If the top ~(THREAD_SIZE - 1) bits match, we are on a task stack,
|
|
|
|
* and should switch to the irq stack.
|
2015-12-04 18:02:27 +07:00
|
|
|
*/
|
arm64: split thread_info from task stack
This patch moves arm64's struct thread_info from the task stack into
task_struct. This protects thread_info from corruption in the case of
stack overflows, and makes its address harder to determine if stack
addresses are leaked, making a number of attacks more difficult. Precise
detection and handling of overflow is left for subsequent patches.
Largely, this involves changing code to store the task_struct in sp_el0,
and acquire the thread_info from the task struct. Core code now
implements current_thread_info(), and as noted in <linux/sched.h> this
relies on offsetof(task_struct, thread_info) == 0, enforced by core
code.
This change means that the 'tsk' register used in entry.S now points to
a task_struct, rather than a thread_info as it used to. To make this
clear, the TI_* field offsets are renamed to TSK_TI_*, with asm-offsets
appropriately updated to account for the structural change.
Userspace clobbers sp_el0, and we can no longer restore this from the
stack. Instead, the current task is cached in a per-cpu variable that we
can safely access from early assembly as interrupts are disabled (and we
are thus not preemptible).
Both secondary entry and idle are updated to stash the sp and task
pointer separately.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Cc: AKASHI Takahiro <takahiro.akashi@linaro.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: James Morse <james.morse@arm.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-11-04 03:23:13 +07:00
|
|
|
// x25 = task stack base, loaded from the current task at offset TSK_STACK
ldr x25, [tsk, TSK_STACK]
|
|
|
|
// XOR with the saved sp leaves set only the bits in which the two addresses differ
eor x25, x25, x19
|
|
|
|
// Discard the low (offset within stack) bits; the result is zero iff sp lies in the task stack
and x25, x25, #~(THREAD_SIZE - 1)
|
|
|
|
// Non-zero: not on the task stack, so skip the switch and keep the current sp
cbnz x25, 9998f
|
2015-12-04 18:02:27 +07:00
|
|
|

2017-08-01 03:17:03 +07:00
|
|
|
// Load this CPU's irq_stack_ptr into x25 (NOTE(review): x26 is presumably a temporary for the macro - confirm)
ldr_this_cpu x25, irq_stack_ptr, x26
|
arm64: kernel: remove {THREAD,IRQ_STACK}_START_SP
For historical reasons, we leave the top 16 bytes of our task and IRQ
stacks unused, a practice used to ensure that the SP can always be
masked to find the base of the current stack (historically, where
thread_info could be found).
However, this is not necessary, as:
* When an exception is taken from a task stack, we decrement the SP by
S_FRAME_SIZE and stash the exception registers before we compare the
SP against the task stack. In such cases, the SP must be at least
S_FRAME_SIZE below the limit, and can be safely masked to determine
whether the task stack is in use.
* When transitioning to an IRQ stack, we'll place a dummy frame onto the
IRQ stack before enabling asynchronous exceptions, or executing code
we expect to trigger faults. Thus, if an exception is taken from the
IRQ stack, the SP must be at least 16 bytes below the limit.
* We no longer mask the SP to find the thread_info, which is now found
via sp_el0. Note that historically, the offset was critical to ensure
that cpu_switch_to() found the correct stack for new threads that
hadn't yet executed ret_from_fork().
Given that, this initial offset serves no purpose, and can be removed.
This brings us in-line with other architectures (e.g. x86) which do not
rely on this masking.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
[Mark: rebase, kill THREAD_START_SP, commit msg additions]
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: James Morse <james.morse@arm.com>
2017-07-20 23:15:45 +07:00
|
|
|
mov x26, #IRQ_STACK_SIZE
|
2015-12-04 18:02:27 +07:00
|
|
|
// x26 = irq_stack_ptr + IRQ_STACK_SIZE, the new stack pointer
add x26, x25, x26
|
arm64: remove irq_count and do_softirq_own_stack()
sysrq_handle_reboot() re-enables interrupts while on the irq stack. The
irq_stack implementation wrongly assumed this would only ever happen
via the softirq path, allowing it to update irq_count late, in
do_softirq_own_stack().
This means if an irq occurs in sysrq_handle_reboot(), during
emergency_restart() the stack will be corrupted, as irq_count wasn't
updated.
Lose the optimisation, and instead of moving the adding/subtracting of
irq_count into irq_stack_entry/irq_stack_exit, remove it, and compare
sp_el0 (struct thread_info) with sp & ~(THREAD_SIZE - 1). This tells us
if we are on a task stack, if so, we can safely switch to the irq stack.
Finally, remove do_softirq_own_stack(), we don't need it anymore.
Reported-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: James Morse <james.morse@arm.com>
[will: use get_thread_info macro]
Signed-off-by: Will Deacon <will.deacon@arm.com>
2015-12-18 23:01:47 +07:00
|
|
|

|
|
|
/* switch to the irq stack */
|
2015-12-04 18:02:27 +07:00
|
|
|
mov sp, x26
|
|
|
|
9998:
|
|
|
|
.endm
|
|
|
|
|
|
|
|
/*
|
|
|
|
* x19 should be preserved between irq_stack_entry and
|
|
|
|
* irq_stack_exit.
|
|
|
|
*/
|
|
|
|
/*
 * irq_stack_exit - restore sp from x19, where irq_stack_entry saved the
 * original stack pointer.
 */
.macro irq_stack_exit
|
|
|
|
mov sp, x19
|
|
|
|
.endm
|
|
|
|
|
2012-03-05 18:49:27 +07:00
|
|
|
/*
|
|
|
|
* These are the registers used in the syscall handler, and allow us to
|
|
|
|
* have in theory up to 7 arguments to a function - x0 to x6.
|
|
|
|
*
|
|
|
|
* x7 is reserved for the system call number in 32-bit mode.
|
|
|
|
*/
|
arm64: syscallno is secretly an int, make it official
The upper 32 bits of the syscallno field in thread_struct are
handled inconsistently, being sometimes zero extended and sometimes
sign-extended. In fact, only the lower 32 bits seem to have any
real significance for the behaviour of the code: it's been OK to
handle the upper bits inconsistently because they don't matter.
Currently, the only place I can find where those bits are
significant is in calling trace_sys_enter(), which may be
unintentional: for example, if a compat tracer attempts to cancel a
syscall by passing -1 to (COMPAT_)PTRACE_SET_SYSCALL at the
syscall-enter-stop, it will be traced as syscall 4294967295
rather than -1 as might be expected (and as occurs for a native
tracer doing the same thing). Elsewhere, reads of syscallno cast
it to an int or truncate it.
There's also a conspicuous amount of code and casting to bodge
around the fact that although semantically an int, syscallno is
stored as a u64.
Let's not pretend any more.
In order to preserve the stp x instruction that stores the syscall
number in entry.S, this patch special-cases the layout of struct
pt_regs for big endian so that the newly 32-bit syscallno field
maps onto the low bits of the stored value. This is not beautiful,
but benchmarking of the getpid syscall on Juno indicates a
minor slowdown if the stp is split into an stp x and stp w.
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-08-01 21:35:53 +07:00
|
|
|
// Register aliases. Note that wscno and xscno name the 32-bit and 64-bit views of the same register.
wsc_nr .req w25 // number of system calls
|
|
|
|
wscno .req w26 // syscall number
|
|
|
|
xscno .req x26 // syscall number (zero-extended)
|
2012-03-05 18:49:27 +07:00
|
|
|
stbl .req x27 // syscall table pointer
|
|
|
|
tsk .req x28 // current task_struct (thread_info is embedded at offset 0)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Interrupt handling.
|
|
|
|
*/
|
|
|
|
/*
 * irq_handler - call the routine whose address is stored in handle_arch_irq.
 *
 * x0 is set to the value of sp before any stack switch; the call itself is
 * made between irq_stack_entry and irq_stack_exit. Clobbers x0 and x1 here,
 * plus whatever irq_stack_entry and the called routine clobber.
 */
.macro irq_handler
|
2015-12-04 18:02:27 +07:00
|
|
|
ldr_l x1, handle_arch_irq
|
2012-03-05 18:49:27 +07:00
|
|
|
// Taken before irq_stack_entry may change sp (NOTE(review): presumably the saved register frame - confirm)
mov x0, sp
|
2015-12-15 18:21:25 +07:00
|
|
|
irq_stack_entry
|
2012-03-05 18:49:27 +07:00
|
|
|
blr x1
|
2015-12-04 18:02:27 +07:00
|
|
|
irq_stack_exit
|
2012-03-05 18:49:27 +07:00
|
|
|
.endm
|
|
|
|
|
|
|
|
.text
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Exception vectors.
|
|
|
|
*/
|
2016-07-08 23:35:50 +07:00
|
|
|
.pushsection ".entry.text", "ax"
|
2012-03-05 18:49:27 +07:00
|
|
|
|
|
|
|
/*
 * The exception vector table: four groups of four entries (Synchronous, IRQ,
 * FIQ, Error), one group each for EL1t, EL1h, 64-bit EL0 and 32-bit EL0.
 *
 * NOTE(review): kernel_ventry is defined outside this view; its arguments
 * appear to be the exception level the exception came from, the handler
 * label suffix and an optional register width - confirm.
 */
// 2^11 = 2048-byte alignment (.align takes a power of two on this target)
.align 11
|
|
|
|
ENTRY(vectors)
|
2017-11-14 21:20:21 +07:00
|
|
|
kernel_ventry 1, sync_invalid // Synchronous EL1t
|
|
|
|
kernel_ventry 1, irq_invalid // IRQ EL1t
|
|
|
|
kernel_ventry 1, fiq_invalid // FIQ EL1t
|
|
|
|
kernel_ventry 1, error_invalid // Error EL1t
|
2012-03-05 18:49:27 +07:00
|
|
|

2017-11-14 21:20:21 +07:00
|
|
|
kernel_ventry 1, sync // Synchronous EL1h
|
|
|
|
kernel_ventry 1, irq // IRQ EL1h
|
|
|
|
kernel_ventry 1, fiq_invalid // FIQ EL1h
|
|
|
|
kernel_ventry 1, error // Error EL1h
|
2012-03-05 18:49:27 +07:00
|
|
|

2017-11-14 21:20:21 +07:00
|
|
|
kernel_ventry 0, sync // Synchronous 64-bit EL0
|
|
|
|
kernel_ventry 0, irq // IRQ 64-bit EL0
|
|
|
|
kernel_ventry 0, fiq_invalid // FIQ 64-bit EL0
|
|
|
|
kernel_ventry 0, error // Error 64-bit EL0
|
2012-03-05 18:49:27 +07:00
|
|
|

|
|
|
// With CONFIG_COMPAT the 32-bit EL0 group uses the *_compat handlers; without it every 32-bit EL0 entry is an *_invalid handler
#ifdef CONFIG_COMPAT
|
2017-11-14 21:20:21 +07:00
|
|
|
kernel_ventry 0, sync_compat, 32 // Synchronous 32-bit EL0
|
|
|
|
kernel_ventry 0, irq_compat, 32 // IRQ 32-bit EL0
|
|
|
|
kernel_ventry 0, fiq_invalid_compat, 32 // FIQ 32-bit EL0
|
|
|
|
kernel_ventry 0, error_compat, 32 // Error 32-bit EL0
|
2012-03-05 18:49:27 +07:00
|
|
|
#else
|
2017-11-14 21:20:21 +07:00
|
|
|
kernel_ventry 0, sync_invalid, 32 // Synchronous 32-bit EL0
|
|
|
|
kernel_ventry 0, irq_invalid, 32 // IRQ 32-bit EL0
|
|
|
|
kernel_ventry 0, fiq_invalid, 32 // FIQ 32-bit EL0
|
|
|
|
kernel_ventry 0, error_invalid, 32 // Error 32-bit EL0
|
2012-03-05 18:49:27 +07:00
|
|
|
#endif
|
|
|
|
END(vectors)
|
|
|
|
|
arm64: add VMAP_STACK overflow detection
This patch adds stack overflow detection to arm64, usable when vmap'd stacks
are in use.
Overflow is detected in a small preamble executed for each exception entry,
which checks whether there is enough space on the current stack for the general
purpose registers to be saved. If there is not enough space, the overflow
handler is invoked on a per-cpu overflow stack. This approach preserves the
original exception information in ESR_EL1 (and where appropriate, FAR_EL1).
Task and IRQ stacks are aligned to double their size, enabling overflow to be
detected with a single bit test. For example, a 16K stack is aligned to 32K,
ensuring that bit 14 of the SP must be zero. On an overflow (or underflow),
this bit is flipped. Thus, overflow (of less than the size of the stack) can be
detected by testing whether this bit is set.
The overflow check is performed before any attempt is made to access the
stack, avoiding recursive faults (and the loss of exception information
these would entail). As logical operations cannot be performed on the SP
directly, the SP is temporarily swapped with a general purpose register
using arithmetic operations to enable the test to be performed.
This gives us a useful error message on stack overflow, as can be triggered with
the LKDTM overflow test:
[ 305.388749] lkdtm: Performing direct entry OVERFLOW
[ 305.395444] Insufficient stack space to handle exception!
[ 305.395482] ESR: 0x96000047 -- DABT (current EL)
[ 305.399890] FAR: 0xffff00000a5e7f30
[ 305.401315] Task stack: [0xffff00000a5e8000..0xffff00000a5ec000]
[ 305.403815] IRQ stack: [0xffff000008000000..0xffff000008004000]
[ 305.407035] Overflow stack: [0xffff80003efce4e0..0xffff80003efcf4e0]
[ 305.409622] CPU: 0 PID: 1219 Comm: sh Not tainted 4.13.0-rc3-00021-g9636aea #5
[ 305.412785] Hardware name: linux,dummy-virt (DT)
[ 305.415756] task: ffff80003d051c00 task.stack: ffff00000a5e8000
[ 305.419221] PC is at recursive_loop+0x10/0x48
[ 305.421637] LR is at recursive_loop+0x38/0x48
[ 305.423768] pc : [<ffff00000859f330>] lr : [<ffff00000859f358>] pstate: 40000145
[ 305.428020] sp : ffff00000a5e7f50
[ 305.430469] x29: ffff00000a5e8350 x28: ffff80003d051c00
[ 305.433191] x27: ffff000008981000 x26: ffff000008f80400
[ 305.439012] x25: ffff00000a5ebeb8 x24: ffff00000a5ebeb8
[ 305.440369] x23: ffff000008f80138 x22: 0000000000000009
[ 305.442241] x21: ffff80003ce65000 x20: ffff000008f80188
[ 305.444552] x19: 0000000000000013 x18: 0000000000000006
[ 305.446032] x17: 0000ffffa2601280 x16: ffff0000081fe0b8
[ 305.448252] x15: ffff000008ff546d x14: 000000000047a4c8
[ 305.450246] x13: ffff000008ff7872 x12: 0000000005f5e0ff
[ 305.452953] x11: ffff000008ed2548 x10: 000000000005ee8d
[ 305.454824] x9 : ffff000008545380 x8 : ffff00000a5e8770
[ 305.457105] x7 : 1313131313131313 x6 : 00000000000000e1
[ 305.459285] x5 : 0000000000000000 x4 : 0000000000000000
[ 305.461781] x3 : 0000000000000000 x2 : 0000000000000400
[ 305.465119] x1 : 0000000000000013 x0 : 0000000000000012
[ 305.467724] Kernel panic - not syncing: kernel stack overflow
[ 305.470561] CPU: 0 PID: 1219 Comm: sh Not tainted 4.13.0-rc3-00021-g9636aea #5
[ 305.473325] Hardware name: linux,dummy-virt (DT)
[ 305.475070] Call trace:
[ 305.476116] [<ffff000008088ad8>] dump_backtrace+0x0/0x378
[ 305.478991] [<ffff000008088e64>] show_stack+0x14/0x20
[ 305.481237] [<ffff00000895a178>] dump_stack+0x98/0xb8
[ 305.483294] [<ffff0000080c3288>] panic+0x118/0x280
[ 305.485673] [<ffff0000080c2e9c>] nmi_panic+0x6c/0x70
[ 305.486216] [<ffff000008089710>] handle_bad_stack+0x118/0x128
[ 305.486612] Exception stack(0xffff80003efcf3a0 to 0xffff80003efcf4e0)
[ 305.487334] f3a0: 0000000000000012 0000000000000013 0000000000000400 0000000000000000
[ 305.488025] f3c0: 0000000000000000 0000000000000000 00000000000000e1 1313131313131313
[ 305.488908] f3e0: ffff00000a5e8770 ffff000008545380 000000000005ee8d ffff000008ed2548
[ 305.489403] f400: 0000000005f5e0ff ffff000008ff7872 000000000047a4c8 ffff000008ff546d
[ 305.489759] f420: ffff0000081fe0b8 0000ffffa2601280 0000000000000006 0000000000000013
[ 305.490256] f440: ffff000008f80188 ffff80003ce65000 0000000000000009 ffff000008f80138
[ 305.490683] f460: ffff00000a5ebeb8 ffff00000a5ebeb8 ffff000008f80400 ffff000008981000
[ 305.491051] f480: ffff80003d051c00 ffff00000a5e8350 ffff00000859f358 ffff00000a5e7f50
[ 305.491444] f4a0: ffff00000859f330 0000000040000145 0000000000000000 0000000000000000
[ 305.492008] f4c0: 0001000000000000 0000000000000000 ffff00000a5e8350 ffff00000859f330
[ 305.493063] [<ffff00000808205c>] __bad_stack+0x88/0x8c
[ 305.493396] [<ffff00000859f330>] recursive_loop+0x10/0x48
[ 305.493731] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.494088] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.494425] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.494649] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.494898] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.495205] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.495453] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.495708] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.496000] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.496302] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.496644] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.496894] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.497138] [<ffff00000859f358>] recursive_loop+0x38/0x48
[ 305.497325] [<ffff00000859f3dc>] lkdtm_OVERFLOW+0x14/0x20
[ 305.497506] [<ffff00000859f314>] lkdtm_do_action+0x1c/0x28
[ 305.497786] [<ffff00000859f178>] direct_entry+0xe0/0x170
[ 305.498095] [<ffff000008345568>] full_proxy_write+0x60/0xa8
[ 305.498387] [<ffff0000081fb7f4>] __vfs_write+0x1c/0x128
[ 305.498679] [<ffff0000081fcc68>] vfs_write+0xa0/0x1b0
[ 305.498926] [<ffff0000081fe0fc>] SyS_write+0x44/0xa0
[ 305.499182] Exception stack(0xffff00000a5ebec0 to 0xffff00000a5ec000)
[ 305.499429] bec0: 0000000000000001 000000001c4cf5e0 0000000000000009 000000001c4cf5e0
[ 305.499674] bee0: 574f4c465245564f 0000000000000000 0000000000000000 8000000080808080
[ 305.499904] bf00: 0000000000000040 0000000000000038 fefefeff1b4bc2ff 7f7f7f7f7f7fff7f
[ 305.500189] bf20: 0101010101010101 0000000000000000 000000000047a4c8 0000000000000038
[ 305.500712] bf40: 0000000000000000 0000ffffa2601280 0000ffffc63f6068 00000000004b5000
[ 305.501241] bf60: 0000000000000001 000000001c4cf5e0 0000000000000009 000000001c4cf5e0
[ 305.501791] bf80: 0000000000000020 0000000000000000 00000000004b5000 000000001c4cc458
[ 305.502314] bfa0: 0000000000000000 0000ffffc63f7950 000000000040a3c4 0000ffffc63f70e0
[ 305.502762] bfc0: 0000ffffa2601268 0000000080000000 0000000000000001 0000000000000040
[ 305.503207] bfe0: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
[ 305.503680] [<ffff000008082fb0>] el0_svc_naked+0x24/0x28
[ 305.504720] Kernel Offset: disabled
[ 305.505189] CPU features: 0x002082
[ 305.505473] Memory Limit: none
[ 305.506181] ---[ end Kernel panic - not syncing: kernel stack overflow
This patch was co-authored by Ard Biesheuvel and Mark Rutland.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: James Morse <james.morse@arm.com>
2017-07-15 02:30:35 +07:00
|
|
|
#ifdef CONFIG_VMAP_STACK
|
|
|
|
/*
|
|
|
|
* We detected an overflow in kernel_ventry, which switched to the
|
|
|
|
* overflow stack. Stash the exception regs, and head to our overflow
|
|
|
|
* handler.
|
|
|
|
*/
|
|
|
|
// Stack overflow path: saves the exception registers on the stack sp now points to, then calls handle_bad_stack with x0 = sp.
__bad_stack:
|
|
|
|
/* Restore the original x0 value */
|
|
|
|
mrs x0, tpidrro_el0
|
|
|
|

|
|
|
/*
|
|
|
|
* Store the original GPRs to the new stack. The original SP (minus
|
|
|
|
* S_FRAME_SIZE) was stashed in tpidr_el0 by kernel_ventry.
|
|
|
|
*/
|
|
|
|
sub sp, sp, #S_FRAME_SIZE
|
|
|
|
kernel_entry 1
|
|
|
|
// Undo the S_FRAME_SIZE adjustment to recover the original SP, then record it in the frame's S_SP slot
mrs x0, tpidr_el0
|
|
|
|
add x0, x0, #S_FRAME_SIZE
|
|
|
|
str x0, [sp, #S_SP]
|
|
|
|

|
|
|
/* Stash the regs for handle_bad_stack */
|
|
|
|
mov x0, sp
|
|
|
|

|
|
|
/* Time to die */
|
|
|
|
bl handle_bad_stack
|
|
|
|
// handle_bad_stack is not expected to return; ASM_BUG() catches it if it does
ASM_BUG()
|
|
|
|
#endif /* CONFIG_VMAP_STACK */
|
|
|
|
|
2012-03-05 18:49:27 +07:00
|
|
|
/*
|
|
|
|
* Invalid mode handlers
|
|
|
|
*/
|
|
|
|
.macro inv_entry, el, reason, regsize = 64
|
2016-03-18 16:58:09 +07:00
|
|
|
kernel_entry \el, \regsize
|
2012-03-05 18:49:27 +07:00
|
|
|
mov x0, sp
|
|
|
|
mov x1, #\reason
|
|
|
|
mrs x2, esr_el1
|
arm64: consistently use bl for C exception entry
In most cases, our exception entry assembly branches to C handlers with
a BL instruction, but in cases where we do not expect to return, we use
B instead.
While this is correct today, it means that backtraces for fatal
exceptions miss the entry assembly (as the LR is stale at the point we
call C code), while non-fatal exceptions have the entry assembly in the
LR. In subsequent patches, we will need the LR to be set in these cases
in order to backtrace reliably.
This patch updates these sites to use a BL, ensuring consistency, and
preparing for backtrace rework. An ASM_BUG() is added after each of
these new BLs, which both catches unexpected returns, and ensures that
the LR value doesn't point to another function label.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: James Morse <james.morse@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
2017-07-26 17:14:53 +07:00
|
|
|
bl bad_mode
|
|
|
|
ASM_BUG()
|
2012-03-05 18:49:27 +07:00
|
|
|
.endm
|
|
|
|
|
|
|
|
el0_sync_invalid:
|
|
|
|
inv_entry 0, BAD_SYNC
|
|
|
|
ENDPROC(el0_sync_invalid)
|
|
|
|
|
|
|
|
el0_irq_invalid:
|
|
|
|
inv_entry 0, BAD_IRQ
|
|
|
|
ENDPROC(el0_irq_invalid)
|
|
|
|
|
|
|
|
el0_fiq_invalid:
|
|
|
|
inv_entry 0, BAD_FIQ
|
|
|
|
ENDPROC(el0_fiq_invalid)
|
|
|
|
|
|
|
|
el0_error_invalid:
|
|
|
|
inv_entry 0, BAD_ERROR
|
|
|
|
ENDPROC(el0_error_invalid)
|
|
|
|
|
|
|
|
#ifdef CONFIG_COMPAT
|
|
|
|
el0_fiq_invalid_compat:
|
|
|
|
inv_entry 0, BAD_FIQ, 32
|
|
|
|
ENDPROC(el0_fiq_invalid_compat)
|
|
|
|
#endif
|
|
|
|
|
|
|
|
el1_sync_invalid:
|
|
|
|
inv_entry 1, BAD_SYNC
|
|
|
|
ENDPROC(el1_sync_invalid)
|
|
|
|
|
|
|
|
el1_irq_invalid:
|
|
|
|
inv_entry 1, BAD_IRQ
|
|
|
|
ENDPROC(el1_irq_invalid)
|
|
|
|
|
|
|
|
el1_fiq_invalid:
|
|
|
|
inv_entry 1, BAD_FIQ
|
|
|
|
ENDPROC(el1_fiq_invalid)
|
|
|
|
|
|
|
|
el1_error_invalid:
|
|
|
|
inv_entry 1, BAD_ERROR
|
|
|
|
ENDPROC(el1_error_invalid)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* EL1 mode handlers.
|
|
|
|
*/
|
|
|
|
.align 6
|
|
|
|
el1_sync:
|
|
|
|
kernel_entry 1
|
|
|
|
mrs x1, esr_el1 // read the syndrome register
|
2014-11-24 19:31:40 +07:00
|
|
|
lsr x24, x1, #ESR_ELx_EC_SHIFT // exception class
|
|
|
|
cmp x24, #ESR_ELx_EC_DABT_CUR // data abort in EL1
|
2012-03-05 18:49:27 +07:00
|
|
|
b.eq el1_da
|
2016-08-10 08:25:26 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_IABT_CUR // instruction abort in EL1
|
|
|
|
b.eq el1_ia
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_SYS64 // configurable trap
|
2012-03-05 18:49:27 +07:00
|
|
|
b.eq el1_undef
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_SP_ALIGN // stack alignment exception
|
2012-03-05 18:49:27 +07:00
|
|
|
b.eq el1_sp_pc
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_PC_ALIGN // pc alignment exception
|
2012-03-05 18:49:27 +07:00
|
|
|
b.eq el1_sp_pc
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_UNKNOWN // unknown exception in EL1
|
2012-03-05 18:49:27 +07:00
|
|
|
b.eq el1_undef
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_BREAKPT_CUR // debug exception in EL1
|
2012-03-05 18:49:27 +07:00
|
|
|
b.ge el1_dbg
|
|
|
|
b el1_inv
|
2016-08-10 08:25:26 +07:00
|
|
|
|
|
|
|
el1_ia:
|
|
|
|
/*
|
|
|
|
* Fall through to the Data abort case
|
|
|
|
*/
|
2012-03-05 18:49:27 +07:00
|
|
|
el1_da:
|
|
|
|
/*
|
|
|
|
* Data abort handling
|
|
|
|
*/
|
arm64: entry: improve data abort handling of tagged pointers
When handling a data abort from EL0, we currently zero the top byte of
the faulting address, as we assume the address is a TTBR0 address, which
may contain a non-zero address tag. However, the address may be a TTBR1
address, in which case we should not zero the top byte. This patch fixes
that. The effect is that the full TTBR1 address is passed to the task's
signal handler (or printed out in the kernel log).
When handling a data abort from EL1, we leave the faulting address
intact, as we assume it's either a TTBR1 address or a TTBR0 address with
tag 0x00. This is true as far as I'm aware, we don't seem to access a
tagged TTBR0 address anywhere in the kernel. Regardless, it's easy to
forget about address tags, and code added in the future may not always
remember to remove tags from addresses before accessing them. So add tag
handling to the EL1 data abort handler as well. This also makes it
consistent with the EL0 data abort handler.
Fixes: d50240a5f6ce ("arm64: mm: permit use of tagged pointers at EL0")
Cc: <stable@vger.kernel.org> # 3.12.x-
Reviewed-by: Dave Martin <Dave.Martin@arm.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-05-03 22:37:47 +07:00
|
|
|
mrs x3, far_el1
|
2017-11-02 19:12:39 +07:00
|
|
|
inherit_daif pstate=x23, tmp=x2
|
arm64: entry: improve data abort handling of tagged pointers
When handling a data abort from EL0, we currently zero the top byte of
the faulting address, as we assume the address is a TTBR0 address, which
may contain a non-zero address tag. However, the address may be a TTBR1
address, in which case we should not zero the top byte. This patch fixes
that. The effect is that the full TTBR1 address is passed to the task's
signal handler (or printed out in the kernel log).
When handling a data abort from EL1, we leave the faulting address
intact, as we assume it's either a TTBR1 address or a TTBR0 address with
tag 0x00. This is true as far as I'm aware, we don't seem to access a
tagged TTBR0 address anywhere in the kernel. Regardless, it's easy to
forget about address tags, and code added in the future may not always
remember to remove tags from addresses before accessing them. So add tag
handling to the EL1 data abort handler as well. This also makes it
consistent with the EL0 data abort handler.
Fixes: d50240a5f6ce ("arm64: mm: permit use of tagged pointers at EL0")
Cc: <stable@vger.kernel.org> # 3.12.x-
Reviewed-by: Dave Martin <Dave.Martin@arm.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-05-03 22:37:47 +07:00
|
|
|
clear_address_tag x0, x3
|
2012-03-05 18:49:27 +07:00
|
|
|
mov x2, sp // struct pt_regs
|
|
|
|
bl do_mem_abort
|
|
|
|
|
|
|
|
kernel_exit 1
|
|
|
|
el1_sp_pc:
|
|
|
|
/*
|
|
|
|
* Stack or PC alignment exception handling
|
|
|
|
*/
|
|
|
|
mrs x0, far_el1
|
2017-11-02 19:12:39 +07:00
|
|
|
inherit_daif pstate=x23, tmp=x2
|
2012-03-05 18:49:27 +07:00
|
|
|
mov x2, sp
|
arm64: consistently use bl for C exception entry
In most cases, our exception entry assembly branches to C handlers with
a BL instruction, but in cases where we do not expect to return, we use
B instead.
While this is correct today, it means that backtraces for fatal
exceptions miss the entry assembly (as the LR is stale at the point we
call C code), while non-fatal exceptions have the entry assembly in the
LR. In subsequent patches, we will need the LR to be set in these cases
in order to backtrace reliably.
This patch updates these sites to use a BL, ensuring consistency, and
preparing for backtrace rework. An ASM_BUG() is added after each of
these new BLs, which both catches unexpected returns, and ensures that
the LR value doesn't point to another function label.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: James Morse <james.morse@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
2017-07-26 17:14:53 +07:00
|
|
|
bl do_sp_pc_abort
|
|
|
|
ASM_BUG()
|
2012-03-05 18:49:27 +07:00
|
|
|
el1_undef:
|
|
|
|
/*
|
|
|
|
* Undefined instruction
|
|
|
|
*/
|
2017-11-02 19:12:39 +07:00
|
|
|
inherit_daif pstate=x23, tmp=x2
|
2012-03-05 18:49:27 +07:00
|
|
|
mov x0, sp
|
arm64: consistently use bl for C exception entry
In most cases, our exception entry assembly branches to C handlers with
a BL instruction, but in cases where we do not expect to return, we use
B instead.
While this is correct today, it means that backtraces for fatal
exceptions miss the entry assembly (as the LR is stale at the point we
call C code), while non-fatal exceptions have the entry assembly in the
LR. In subsequent patches, we will need the LR to be set in these cases
in order to backtrace reliably.
This patch updates these sites to use a BL, ensuring consistency, and
preparing for backtrace rework. An ASM_BUG() is added after each of
these new BLs, which both catches unexpected returns, and ensures that
the LR value doesn't point to another function label.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: James Morse <james.morse@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
2017-07-26 17:14:53 +07:00
|
|
|
bl do_undefinstr
|
|
|
|
ASM_BUG()
|
2012-03-05 18:49:27 +07:00
|
|
|
el1_dbg:
|
|
|
|
/*
|
|
|
|
* Debug exception handling
|
|
|
|
*/
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_BRK64 // if BRK64
|
2013-12-04 12:50:20 +07:00
|
|
|
cinc x24, x24, eq // set bit '0'
|
2012-03-05 18:49:27 +07:00
|
|
|
tbz x24, #0, el1_inv // EL1 only
|
|
|
|
mrs x0, far_el1
|
|
|
|
mov x2, sp // struct pt_regs
|
|
|
|
bl do_debug_exception
|
|
|
|
kernel_exit 1
|
|
|
|
el1_inv:
|
|
|
|
// TODO: add support for undefined instructions in kernel mode
|
2017-11-02 19:12:39 +07:00
|
|
|
inherit_daif pstate=x23, tmp=x2
|
2012-03-05 18:49:27 +07:00
|
|
|
mov x0, sp
|
2015-07-08 00:00:49 +07:00
|
|
|
mov x2, x1
|
2012-03-05 18:49:27 +07:00
|
|
|
mov x1, #BAD_SYNC
|
arm64: consistently use bl for C exception entry
In most cases, our exception entry assembly branches to C handlers with
a BL instruction, but in cases where we do not expect to return, we use
B instead.
While this is correct today, it means that backtraces for fatal
exceptions miss the entry assembly (as the LR is stale at the point we
call C code), while non-fatal exceptions have the entry assembly in the
LR. In subsequent patches, we will need the LR to be set in these cases
in order to backtrace reliably.
This patch updates these sites to use a BL, ensuring consistency, and
preparing for backtrace rework. An ASM_BUG() is added after each of
these new BLs, which both catches unexpected returns, and ensures that
the LR value doesn't point to another function label.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: James Morse <james.morse@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
2017-07-26 17:14:53 +07:00
|
|
|
bl bad_mode
|
|
|
|
ASM_BUG()
|
2012-03-05 18:49:27 +07:00
|
|
|
ENDPROC(el1_sync)
|
|
|
|
|
|
|
|
.align 6
|
|
|
|
el1_irq:
|
|
|
|
kernel_entry 1
|
2017-11-02 19:12:41 +07:00
|
|
|
enable_da_f
|
2012-03-05 18:49:27 +07:00
|
|
|
#ifdef CONFIG_TRACE_IRQFLAGS
|
|
|
|
bl trace_hardirqs_off
|
|
|
|
#endif
|
2013-11-13 00:11:53 +07:00
|
|
|
|
2012-03-05 18:49:27 +07:00
|
|
|
irq_handler
|
2013-11-13 00:11:53 +07:00
|
|
|
|
2012-03-05 18:49:27 +07:00
|
|
|
#ifdef CONFIG_PREEMPT
|
arm64: split thread_info from task stack
This patch moves arm64's struct thread_info from the task stack into
task_struct. This protects thread_info from corruption in the case of
stack overflows, and makes its address harder to determine if stack
addresses are leaked, making a number of attacks more difficult. Precise
detection and handling of overflow is left for subsequent patches.
Largely, this involves changing code to store the task_struct in sp_el0,
and acquire the thread_info from the task struct. Core code now
implements current_thread_info(), and as noted in <linux/sched.h> this
relies on offsetof(task_struct, thread_info) == 0, enforced by core
code.
This change means that the 'tsk' register used in entry.S now points to
a task_struct, rather than a thread_info as it used to. To make this
clear, the TI_* field offsets are renamed to TSK_TI_*, with asm-offsets
appropriately updated to account for the structural change.
Userspace clobbers sp_el0, and we can no longer restore this from the
stack. Instead, the current task is cached in a per-cpu variable that we
can safely access from early assembly as interrupts are disabled (and we
are thus not preemptible).
Both secondary entry and idle are updated to stash the sp and task
pointer separately.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Cc: AKASHI Takahiro <takahiro.akashi@linaro.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: James Morse <james.morse@arm.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-11-04 03:23:13 +07:00
|
|
|
ldr w24, [tsk, #TSK_TI_PREEMPT] // get preempt count
|
2013-11-05 03:14:58 +07:00
|
|
|
cbnz w24, 1f // preempt count != 0
|
arm64: split thread_info from task stack
This patch moves arm64's struct thread_info from the task stack into
task_struct. This protects thread_info from corruption in the case of
stack overflows, and makes its address harder to determine if stack
addresses are leaked, making a number of attacks more difficult. Precise
detection and handling of overflow is left for subsequent patches.
Largely, this involves changing code to store the task_struct in sp_el0,
and acquire the thread_info from the task struct. Core code now
implements current_thread_info(), and as noted in <linux/sched.h> this
relies on offsetof(task_struct, thread_info) == 0, enforced by core
code.
This change means that the 'tsk' register used in entry.S now points to
a task_struct, rather than a thread_info as it used to. To make this
clear, the TI_* field offsets are renamed to TSK_TI_*, with asm-offsets
appropriately updated to account for the structural change.
Userspace clobbers sp_el0, and we can no longer restore this from the
stack. Instead, the current task is cached in a per-cpu variable that we
can safely access from early assembly as interrupts are disabled (and we
are thus not preemptible).
Both secondary entry and idle are updated to stash the sp and task
pointer separately.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Cc: AKASHI Takahiro <takahiro.akashi@linaro.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: James Morse <james.morse@arm.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-11-04 03:23:13 +07:00
|
|
|
ldr x0, [tsk, #TSK_TI_FLAGS] // get flags
|
2012-03-05 18:49:27 +07:00
|
|
|
tbz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling?
|
|
|
|
bl el1_preempt
|
|
|
|
1:
|
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_TRACE_IRQFLAGS
|
|
|
|
bl trace_hardirqs_on
|
|
|
|
#endif
|
|
|
|
kernel_exit 1
|
|
|
|
ENDPROC(el1_irq)
|
|
|
|
|
|
|
|
#ifdef CONFIG_PREEMPT
|
|
|
|
el1_preempt:
|
|
|
|
mov x24, lr
|
arm64: debug: avoid accessing mdscr_el1 on fault paths where possible
Since mdscr_el1 is part of the debug register group, it is highly likely
to be trapped by a hypervisor to prevent virtual machines from debugging
(buggering?) each other. Unfortunately, this absolutely destroys our
performance, since we access the register on many of our low-level
fault handling paths to keep track of the various debug state machines.
This patch removes our dependency on mdscr_el1 in the case that debugging
is not being used. More specifically we:
- Use TIF_SINGLESTEP to indicate that a task is stepping at EL0 and
avoid disabling step in the MDSCR when we don't need to.
MDSCR_EL1.SS handling is moved to kernel_entry, when trapping from
userspace.
- Ensure debug exceptions are re-enabled on *all* exception entry
paths, even the debug exception handling path (where we re-enable
exceptions after invoking the handler). Since we can now rely on
MDSCR_EL1.SS being cleared by the entry code, exception handlers can
usually enable debug immediately before enabling interrupts.
- Remove all debug exception unmasking from ret_to_user and
el1_preempt, since we will never get here with debug exceptions
masked.
This results in a slight change to kernel debug behaviour, where we now
step into interrupt handlers and data aborts from EL1 when debugging the
kernel, which is actually a useful thing to do. A side-effect of this is
that it *does* potentially prevent stepping off {break,watch}points when
there is a high-frequency interrupt source (e.g. a timer), so a debugger
would need to use either breakpoints or manually disable interrupts to
get around this issue.
With this patch applied, guest performance is restored under KVM when
debug register accesses are trapped (and we get a measurable performance
increase on the host on Cortex-A57 too).
Cc: Ian Campbell <ian.campbell@citrix.com>
Tested-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2014-04-30 01:04:06 +07:00
|
|
|
1: bl preempt_schedule_irq // irq en/disable is done inside
|
arm64: split thread_info from task stack
This patch moves arm64's struct thread_info from the task stack into
task_struct. This protects thread_info from corruption in the case of
stack overflows, and makes its address harder to determine if stack
addresses are leaked, making a number of attacks more difficult. Precise
detection and handling of overflow is left for subsequent patches.
Largely, this involves changing code to store the task_struct in sp_el0,
and acquire the thread_info from the task struct. Core code now
implements current_thread_info(), and as noted in <linux/sched.h> this
relies on offsetof(task_struct, thread_info) == 0, enforced by core
code.
This change means that the 'tsk' register used in entry.S now points to
a task_struct, rather than a thread_info as it used to. To make this
clear, the TI_* field offsets are renamed to TSK_TI_*, with asm-offsets
appropriately updated to account for the structural change.
Userspace clobbers sp_el0, and we can no longer restore this from the
stack. Instead, the current task is cached in a per-cpu variable that we
can safely access from early assembly as interrupts are disabled (and we
are thus not preemptible).
Both secondary entry and idle are updated to stash the sp and task
pointer separately.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Cc: AKASHI Takahiro <takahiro.akashi@linaro.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: James Morse <james.morse@arm.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-11-04 03:23:13 +07:00
|
|
|
ldr x0, [tsk, #TSK_TI_FLAGS] // get new tasks TI_FLAGS
|
2012-03-05 18:49:27 +07:00
|
|
|
tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling?
|
|
|
|
ret x24
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* EL0 mode handlers.
|
|
|
|
*/
|
|
|
|
.align 6
|
|
|
|
el0_sync:
|
|
|
|
kernel_entry 0
|
|
|
|
mrs x25, esr_el1 // read the syndrome register
|
2014-11-24 19:31:40 +07:00
|
|
|
lsr x24, x25, #ESR_ELx_EC_SHIFT // exception class
|
|
|
|
cmp x24, #ESR_ELx_EC_SVC64 // SVC in 64-bit state
|
2012-03-05 18:49:27 +07:00
|
|
|
b.eq el0_svc
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_DABT_LOW // data abort in EL0
|
2012-03-05 18:49:27 +07:00
|
|
|
b.eq el0_da
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_IABT_LOW // instruction abort in EL0
|
2012-03-05 18:49:27 +07:00
|
|
|
b.eq el0_ia
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_FP_ASIMD // FP/ASIMD access
|
2012-03-05 18:49:27 +07:00
|
|
|
b.eq el0_fpsimd_acc
|
arm64/sve: Core task context handling
This patch adds the core support for switching and managing the SVE
architectural state of user tasks.
Calls to the existing FPSIMD low-level save/restore functions are
factored out as new functions task_fpsimd_{save,load}(), since SVE
now dynamically may or may not need to be handled at these points
depending on the kernel configuration, hardware features discovered
at boot, and the runtime state of the task. To make these
decisions as fast as possible, const cpucaps are used where
feasible, via the system_supports_sve() helper.
The SVE registers are only tracked for threads that have explicitly
used SVE, indicated by the new thread flag TIF_SVE. Otherwise, the
FPSIMD view of the architectural state is stored in
thread.fpsimd_state as usual.
When in use, the SVE registers are not stored directly in
thread_struct due to their potentially large and variable size.
Because the task_struct slab allocator must be configured very
early during kernel boot, it is also tricky to configure it
correctly to match the maximum vector length provided by the
hardware, since this depends on examining secondary CPUs as well as
the primary. Instead, a pointer sve_state in thread_struct points
to a dynamically allocated buffer containing the SVE register data,
and code is added to allocate and free this buffer at appropriate
times.
TIF_SVE is set when taking an SVE access trap from userspace, if
suitable hardware support has been detected. This enables SVE for
the thread: a subsequent return to userspace will disable the trap
accordingly. If such a trap is taken without sufficient system-
wide hardware support, SIGILL is sent to the thread instead as if
an undefined instruction had been executed: this may happen if
userspace tries to use SVE in a system where not all CPUs support
it for example.
The kernel will clear TIF_SVE and disable SVE for the thread
whenever an explicit syscall is made by userspace. For backwards
compatibility reasons and conformance with the spirit of the base
AArch64 procedure call standard, the subset of the SVE register
state that aliases the FPSIMD registers is still preserved across a
syscall even if this happens. The remainder of the SVE register
state logically becomes zero at syscall entry, though the actual
zeroing work is currently deferred until the thread next tries to
use SVE, causing another trap to the kernel. This implementation
is suboptimal: in the future, the fastpath case may be optimised
to zero the registers in-place and leave SVE enabled for the task,
where beneficial.
TIF_SVE is also cleared in the following slowpath cases, which are
taken as reasonable hints that the task may no longer use SVE:
* exec
* fork and clone
Code is added to sync data between thread.fpsimd_state and
thread.sve_state whenever enabling/disabling SVE, in a manner
consistent with the SVE architectural programmer's model.
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Alex Bennée <alex.bennee@linaro.org>
[will: added #include to fix allnoconfig build]
[will: use enable_daif in do_sve_acc]
Signed-off-by: Will Deacon <will.deacon@arm.com>
2017-10-31 22:51:05 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_SVE // SVE access
|
|
|
|
b.eq el0_sve_acc
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_FP_EXC64 // FP/ASIMD exception
|
2012-03-05 18:49:27 +07:00
|
|
|
b.eq el0_fpsimd_exc
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_SYS64 // configurable trap
|
2016-06-29 00:07:32 +07:00
|
|
|
b.eq el0_sys
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_SP_ALIGN // stack alignment exception
|
2012-03-05 18:49:27 +07:00
|
|
|
b.eq el0_sp_pc
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_PC_ALIGN // pc alignment exception
|
2012-03-05 18:49:27 +07:00
|
|
|
b.eq el0_sp_pc
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_UNKNOWN // unknown exception in EL0
|
2012-03-05 18:49:27 +07:00
|
|
|
b.eq el0_undef
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_BREAKPT_LOW // debug exception in EL0
|
2012-03-05 18:49:27 +07:00
|
|
|
b.ge el0_dbg
|
|
|
|
b el0_inv
|
|
|
|
|
|
|
|
#ifdef CONFIG_COMPAT
|
|
|
|
.align 6
|
|
|
|
el0_sync_compat:
|
|
|
|
kernel_entry 0, 32
|
|
|
|
mrs x25, esr_el1 // read the syndrome register
|
2014-11-24 19:31:40 +07:00
|
|
|
lsr x24, x25, #ESR_ELx_EC_SHIFT // exception class
|
|
|
|
cmp x24, #ESR_ELx_EC_SVC32 // SVC in 32-bit state
|
2012-03-05 18:49:27 +07:00
|
|
|
b.eq el0_svc_compat
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_DABT_LOW // data abort in EL0
|
2012-03-05 18:49:27 +07:00
|
|
|
b.eq el0_da
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_IABT_LOW // instruction abort in EL0
|
2012-03-05 18:49:27 +07:00
|
|
|
b.eq el0_ia
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_FP_ASIMD // FP/ASIMD access
|
2012-03-05 18:49:27 +07:00
|
|
|
b.eq el0_fpsimd_acc
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_FP_EXC32 // FP/ASIMD exception
|
2012-03-05 18:49:27 +07:00
|
|
|
b.eq el0_fpsimd_exc
|
2015-10-14 04:30:51 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_PC_ALIGN // pc alignment exception
|
|
|
|
b.eq el0_sp_pc
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_UNKNOWN // unknown exception in EL0
|
2012-03-05 18:49:27 +07:00
|
|
|
b.eq el0_undef
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_CP15_32 // CP15 MRC/MCR trap
|
2013-05-24 18:02:35 +07:00
|
|
|
b.eq el0_undef
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_CP15_64 // CP15 MRRC/MCRR trap
|
2013-05-24 18:02:35 +07:00
|
|
|
b.eq el0_undef
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_CP14_MR // CP14 MRC/MCR trap
|
2013-05-24 18:02:35 +07:00
|
|
|
b.eq el0_undef
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_CP14_LS // CP14 LDC/STC trap
|
2013-05-24 18:02:35 +07:00
|
|
|
b.eq el0_undef
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_CP14_64 // CP14 MRRC/MCRR trap
|
2013-05-24 18:02:35 +07:00
|
|
|
b.eq el0_undef
|
2014-11-24 19:31:40 +07:00
|
|
|
cmp x24, #ESR_ELx_EC_BREAKPT_LOW // debug exception in EL0
|
2012-03-05 18:49:27 +07:00
|
|
|
b.ge el0_dbg
|
|
|
|
b el0_inv
|
|
|
|
el0_svc_compat:
|
|
|
|
/*
|
|
|
|
* AArch32 syscall handling
|
|
|
|
*/
|
arm64/sve: Core task context handling
This patch adds the core support for switching and managing the SVE
architectural state of user tasks.
Calls to the existing FPSIMD low-level save/restore functions are
factored out as new functions task_fpsimd_{save,load}(), since SVE
now dynamically may or may not need to be handled at these points
depending on the kernel configuration, hardware features discovered
at boot, and the runtime state of the task. To make these
decisions as fast as possible, const cpucaps are used where
feasible, via the system_supports_sve() helper.
The SVE registers are only tracked for threads that have explicitly
used SVE, indicated by the new thread flag TIF_SVE. Otherwise, the
FPSIMD view of the architectural state is stored in
thread.fpsimd_state as usual.
When in use, the SVE registers are not stored directly in
thread_struct due to their potentially large and variable size.
Because the task_struct slab allocator must be configured very
early during kernel boot, it is also tricky to configure it
correctly to match the maximum vector length provided by the
hardware, since this depends on examining secondary CPUs as well as
the primary. Instead, a pointer sve_state in thread_struct points
to a dynamically allocated buffer containing the SVE register data,
and code is added to allocate and free this buffer at appropriate
times.
TIF_SVE is set when taking an SVE access trap from userspace, if
suitable hardware support has been detected. This enables SVE for
the thread: a subsequent return to userspace will disable the trap
accordingly. If such a trap is taken without sufficient system-
wide hardware support, SIGILL is sent to the thread instead as if
an undefined instruction had been executed: this may happen if
userspace tries to use SVE in a system where not all CPUs support
it for example.
The kernel will clear TIF_SVE and disable SVE for the thread
whenever an explicit syscall is made by userspace. For backwards
compatibility reasons and conformance with the spirit of the base
AArch64 procedure call standard, the subset of the SVE register
state that aliases the FPSIMD registers is still preserved across a
syscall even if this happens. The remainder of the SVE register
state logically becomes zero at syscall entry, though the actual
zeroing work is currently deferred until the thread next tries to
use SVE, causing another trap to the kernel. This implementation
is suboptimal: in the future, the fastpath case may be optimised
to zero the registers in-place and leave SVE enabled for the task,
where beneficial.
TIF_SVE is also cleared in the following slowpath cases, which are
taken as reasonable hints that the task may no longer use SVE:
* exec
* fork and clone
Code is added to sync data between thread.fpsimd_state and
thread.sve_state whenever enabling/disabling SVE, in a manner
consistent with the SVE architectural programmer's model.
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Alex Bennée <alex.bennee@linaro.org>
[will: added #include to fix allnoconfig build]
[will: use enable_daif in do_sve_acc]
Signed-off-by: Will Deacon <will.deacon@arm.com>
2017-10-31 22:51:05 +07:00
|
|
|
ldr x16, [tsk, #TSK_TI_FLAGS] // load thread flags
|
2015-01-06 23:42:32 +07:00
|
|
|
adrp stbl, compat_sys_call_table // load compat syscall table pointer
|
arm64: syscallno is secretly an int, make it official
The upper 32 bits of the syscallno field in thread_struct are
handled inconsistently, being sometimes zero extended and sometimes
sign-extended. In fact, only the lower 32 bits seem to have any
real significance for the behaviour of the code: it's been OK to
handle the upper bits inconsistently because they don't matter.
Currently, the only place I can find where those bits are
significant is in calling trace_sys_enter(), which may be
unintentional: for example, if a compat tracer attempts to cancel a
syscall by passing -1 to (COMPAT_)PTRACE_SET_SYSCALL at the
syscall-enter-stop, it will be traced as syscall 4294967295
rather than -1 as might be expected (and as occurs for a native
tracer doing the same thing). Elsewhere, reads of syscallno cast
it to an int or truncate it.
There's also a conspicuous amount of code and casting to bodge
around the fact that although semantically an int, syscallno is
stored as a u64.
Let's not pretend any more.
In order to preserve the stp x instruction that stores the syscall
number in entry.S, this patch special-cases the layout of struct
pt_regs for big endian so that the newly 32-bit syscallno field
maps onto the low bits of the stored value. This is not beautiful,
but benchmarking of the getpid syscall on Juno indicates a
minor slowdown if the stp is split into an stp x and stp w.
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-08-01 21:35:53 +07:00
|
|
|
mov wscno, w7 // syscall number in w7 (r7)
|
|
|
|
mov wsc_nr, #__NR_compat_syscalls
|
2012-03-05 18:49:27 +07:00
|
|
|
b el0_svc_naked
|
|
|
|
|
|
|
|
.align 6
|
|
|
|
el0_irq_compat:
|
|
|
|
kernel_entry 0, 32
|
|
|
|
b el0_irq_naked
|
2017-11-02 19:12:42 +07:00
|
|
|
|
|
|
|
el0_error_compat:
|
|
|
|
kernel_entry 0, 32
|
|
|
|
b el0_error_naked
|
2012-03-05 18:49:27 +07:00
|
|
|
#endif
|
|
|
|
|
|
|
|
el0_da: // calls do_mem_abort(untagged fault address, x25, sp), then returns to user
|
|
|
|
/*
|
|
|
|
* Data abort handling
|
|
|
|
*/
|
2014-05-31 02:34:14 +07:00
|
|
|
mrs x26, far_el1 // capture the faulting address before anything is unmasked
|
2017-11-02 19:12:40 +07:00
|
|
|
enable_daif // presumably unmasks D, A, I and F (macro defined outside this view)
|
2014-05-31 02:34:15 +07:00
|
|
|
ct_user_exit // context-tracking hook for the user -> kernel transition
|
arm64: entry: improve data abort handling of tagged pointers
When handling a data abort from EL0, we currently zero the top byte of
the faulting address, as we assume the address is a TTBR0 address, which
may contain a non-zero address tag. However, the address may be a TTBR1
address, in which case we should not zero the top byte. This patch fixes
that. The effect is that the full TTBR1 address is passed to the task's
signal handler (or printed out in the kernel log).
When handling a data abort from EL1, we leave the faulting address
intact, as we assume it's either a TTBR1 address or a TTBR0 address with
tag 0x00. This is true as far as I'm aware, we don't seem to access a
tagged TTBR0 address anywhere in the kernel. Regardless, it's easy to
forget about address tags, and code added in the future may not always
remember to remove tags from addresses before accessing them. So add tag
handling to the EL1 data abort handler as well. This also makes it
consistent with the EL0 data abort handler.
Fixes: d50240a5f6ce ("arm64: mm: permit use of tagged pointers at EL0")
Cc: <stable@vger.kernel.org> # 3.12.x-
Reviewed-by: Dave Martin <Dave.Martin@arm.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-05-03 22:37:47 +07:00
|
|
|
clear_address_tag x0, x26 // arg0 = fault address with the tag byte stripped for TTBR0 addresses only
|
2012-03-05 18:49:27 +07:00
|
|
|
mov x1, x25 // arg1 = x25, presumably the ESR saved by the dispatcher outside this view
|
|
|
|
mov x2, sp // arg2 = sp, the saved register frame
|
2014-09-29 17:44:01 +07:00
|
|
|
bl do_mem_abort
|
|
|
|
b ret_to_user // leave through the slow return path
|
2012-03-05 18:49:27 +07:00
|
|
|
el0_ia: // calls do_mem_abort(fault address, x25, sp), then returns to user
|
|
|
|
/*
|
|
|
|
* Instruction abort handling
|
|
|
|
*/
|
2014-05-31 02:34:14 +07:00
|
|
|
mrs x26, far_el1 // capture the faulting address before anything is unmasked
|
2017-11-02 19:12:40 +07:00
|
|
|
enable_daif // presumably unmasks D, A, I and F (macro defined outside this view)
|
2014-05-31 02:34:15 +07:00
|
|
|
ct_user_exit // context-tracking hook for the user -> kernel transition
|
2014-05-31 02:34:14 +07:00
|
|
|
mov x0, x26 // arg0 = fault address, passed as read (no clear_address_tag here, unlike el0_da)
|
arm64: kill ESR_LNX_EXEC
Currently we treat ESR_EL1 bit 24 as software-defined for distinguishing
instruction aborts from data aborts, but this bit is architecturally
RES0 for instruction aborts, and could be allocated for an arbitrary
purpose in future. Additionally, we hard-code the value in entry.S
without the mnemonic, making the code difficult to understand.
Instead, remove ESR_LNX_EXEC, and distinguish aborts based on the esr,
which we already pass to the sole use of ESR_LNX_EXEC. A new helper,
is_el0_instruction_abort() is added to make the logic clear. Any
instruction aborts taken from EL1 will already have been handled by
bad_mode, so we need not handle that case in the helper.
For consistency, the existing permission_fault helper is renamed to
is_permission_fault, and the return type is changed to bool. There
should be no functional changes as the return value was a boolean
expression, and the result is only used in another boolean expression.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Dave P Martin <dave.martin@arm.com>
Cc: Huang Shijie <shijie.huang@arm.com>
Cc: James Morse <james.morse@arm.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-05-31 18:33:03 +07:00
|
|
|
mov x1, x25 // arg1 = x25, presumably the ESR saved by the dispatcher outside this view
|
2012-03-05 18:49:27 +07:00
|
|
|
mov x2, sp // arg2 = sp, the saved register frame
|
2014-09-29 17:44:01 +07:00
|
|
|
bl do_mem_abort
|
|
|
|
b ret_to_user // leave through the slow return path
|
2012-03-05 18:49:27 +07:00
|
|
|
el0_fpsimd_acc: // calls do_fpsimd_acc(x25, sp), then returns to user
|
|
|
|
/*
|
|
|
|
* Floating Point or Advanced SIMD access
|
|
|
|
*/
|
2017-11-02 19:12:40 +07:00
|
|
|
enable_daif // presumably unmasks D, A, I and F (macro defined outside this view)
|
2014-05-31 02:34:15 +07:00
|
|
|
ct_user_exit // context-tracking hook for the user -> kernel transition
|
2012-03-05 18:49:27 +07:00
|
|
|
mov x0, x25 // arg0 = x25, presumably the ESR saved by the dispatcher outside this view
|
|
|
|
mov x1, sp // arg1 = sp, the saved register frame
|
2014-09-29 17:44:01 +07:00
|
|
|
bl do_fpsimd_acc
|
|
|
|
b ret_to_user // leave through the slow return path
|
arm64/sve: Core task context handling
This patch adds the core support for switching and managing the SVE
architectural state of user tasks.
Calls to the existing FPSIMD low-level save/restore functions are
factored out as new functions task_fpsimd_{save,load}(), since SVE
now dynamically may or may not need to be handled at these points
depending on the kernel configuration, hardware features discovered
at boot, and the runtime state of the task. To make these
decisions as fast as possible, const cpucaps are used where
feasible, via the system_supports_sve() helper.
The SVE registers are only tracked for threads that have explicitly
used SVE, indicated by the new thread flag TIF_SVE. Otherwise, the
FPSIMD view of the architectural state is stored in
thread.fpsimd_state as usual.
When in use, the SVE registers are not stored directly in
thread_struct due to their potentially large and variable size.
Because the task_struct slab allocator must be configured very
early during kernel boot, it is also tricky to configure it
correctly to match the maximum vector length provided by the
hardware, since this depends on examining secondary CPUs as well as
the primary. Instead, a pointer sve_state in thread_struct points
to a dynamically allocated buffer containing the SVE register data,
and code is added to allocate and free this buffer at appropriate
times.
TIF_SVE is set when taking an SVE access trap from userspace, if
suitable hardware support has been detected. This enables SVE for
the thread: a subsequent return to userspace will disable the trap
accordingly. If such a trap is taken without sufficient system-
wide hardware support, SIGILL is sent to the thread instead as if
an undefined instruction had been executed: this may happen if
userspace tries to use SVE in a system where not all CPUs support
it for example.
The kernel will clear TIF_SVE and disable SVE for the thread
whenever an explicit syscall is made by userspace. For backwards
compatibility reasons and conformance with the spirit of the base
AArch64 procedure call standard, the subset of the SVE register
state that aliases the FPSIMD registers is still preserved across a
syscall even if this happens. The remainder of the SVE register
state logically becomes zero at syscall entry, though the actual
zeroing work is currently deferred until the thread next tries to
use SVE, causing another trap to the kernel. This implementation
is suboptimal: in the future, the fastpath case may be optimised
to zero the registers in-place and leave SVE enabled for the task,
where beneficial.
TIF_SVE is also cleared in the following slowpath cases, which are
taken as reasonable hints that the task may no longer use SVE:
* exec
* fork and clone
Code is added to sync data between thread.fpsimd_state and
thread.sve_state whenever enabling/disabling SVE, in a manner
consistent with the SVE architectural programmer's model.
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Alex Bennée <alex.bennee@linaro.org>
[will: added #include to fix allnoconfig build]
[will: use enable_daif in do_sve_acc]
Signed-off-by: Will Deacon <will.deacon@arm.com>
2017-10-31 22:51:05 +07:00
|
|
|
el0_sve_acc: // calls do_sve_acc(x25, sp), then returns to user
|
|
|
|
/*
|
|
|
|
* Scalable Vector Extension access
|
|
|
|
*/
|
|
|
|
enable_daif // presumably unmasks D, A, I and F (macro defined outside this view)
|
|
|
|
ct_user_exit // context-tracking hook for the user -> kernel transition
|
|
|
|
mov x0, x25 // arg0 = x25, presumably the ESR saved by the dispatcher outside this view
|
|
|
|
mov x1, sp // arg1 = sp, the saved register frame
|
|
|
|
bl do_sve_acc
|
|
|
|
b ret_to_user // leave through the slow return path
|
2012-03-05 18:49:27 +07:00
|
|
|
el0_fpsimd_exc: // calls do_fpsimd_exc(x25, sp), then returns to user
|
|
|
|
/*
|
arm64/sve: Core task context handling
This patch adds the core support for switching and managing the SVE
architectural state of user tasks.
Calls to the existing FPSIMD low-level save/restore functions are
factored out as new functions task_fpsimd_{save,load}(), since SVE
now dynamically may or may not need to be handled at these points
depending on the kernel configuration, hardware features discovered
at boot, and the runtime state of the task. To make these
decisions as fast as possible, const cpucaps are used where
feasible, via the system_supports_sve() helper.
The SVE registers are only tracked for threads that have explicitly
used SVE, indicated by the new thread flag TIF_SVE. Otherwise, the
FPSIMD view of the architectural state is stored in
thread.fpsimd_state as usual.
When in use, the SVE registers are not stored directly in
thread_struct due to their potentially large and variable size.
Because the task_struct slab allocator must be configured very
early during kernel boot, it is also tricky to configure it
correctly to match the maximum vector length provided by the
hardware, since this depends on examining secondary CPUs as well as
the primary. Instead, a pointer sve_state in thread_struct points
to a dynamically allocated buffer containing the SVE register data,
and code is added to allocate and free this buffer at appropriate
times.
TIF_SVE is set when taking an SVE access trap from userspace, if
suitable hardware support has been detected. This enables SVE for
the thread: a subsequent return to userspace will disable the trap
accordingly. If such a trap is taken without sufficient system-
wide hardware support, SIGILL is sent to the thread instead as if
an undefined instruction had been executed: this may happen if
userspace tries to use SVE in a system where not all CPUs support
it for example.
The kernel will clear TIF_SVE and disable SVE for the thread
whenever an explicit syscall is made by userspace. For backwards
compatibility reasons and conformance with the spirit of the base
AArch64 procedure call standard, the subset of the SVE register
state that aliases the FPSIMD registers is still preserved across a
syscall even if this happens. The remainder of the SVE register
state logically becomes zero at syscall entry, though the actual
zeroing work is currently deferred until the thread next tries to
use SVE, causing another trap to the kernel. This implementation
is suboptimal: in the future, the fastpath case may be optimised
to zero the registers in-place and leave SVE enabled for the task,
where beneficial.
TIF_SVE is also cleared in the following slowpath cases, which are
taken as reasonable hints that the task may no longer use SVE:
* exec
* fork and clone
Code is added to sync data between thread.fpsimd_state and
thread.sve_state whenever enabling/disabling SVE, in a manner
consistent with the SVE architectural programmer's model.
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Alex Bennée <alex.bennee@linaro.org>
[will: added #include to fix allnoconfig build]
[will: use enable_daif in do_sve_acc]
Signed-off-by: Will Deacon <will.deacon@arm.com>
2017-10-31 22:51:05 +07:00
|
|
|
* Floating Point, Advanced SIMD or SVE exception
|
2012-03-05 18:49:27 +07:00
|
|
|
*/
|
2017-11-02 19:12:40 +07:00
|
|
|
enable_daif // presumably unmasks D, A, I and F (macro defined outside this view)
|
2014-05-31 02:34:15 +07:00
|
|
|
ct_user_exit // context-tracking hook for the user -> kernel transition
|
2012-03-05 18:49:27 +07:00
|
|
|
mov x0, x25 // arg0 = x25, presumably the ESR saved by the dispatcher outside this view
|
|
|
|
mov x1, sp // arg1 = sp, the saved register frame
|
2014-09-29 17:44:01 +07:00
|
|
|
bl do_fpsimd_exc
|
|
|
|
b ret_to_user // leave through the slow return path
|
2012-03-05 18:49:27 +07:00
|
|
|
el0_sp_pc: // calls do_sp_pc_abort(fault address, x25, sp), then returns to user
|
|
|
|
/*
|
|
|
|
* Stack or PC alignment exception handling
|
|
|
|
*/
|
2014-05-31 02:34:14 +07:00
|
|
|
mrs x26, far_el1 // capture the faulting address before anything is unmasked
|
2017-11-02 19:12:40 +07:00
|
|
|
enable_daif // presumably unmasks D, A, I and F (macro defined outside this view)
|
2015-06-15 22:40:27 +07:00
|
|
|
ct_user_exit // context-tracking hook for the user -> kernel transition
|
2014-05-31 02:34:14 +07:00
|
|
|
mov x0, x26 // arg0 = fault address as read from FAR_EL1
|
2012-03-05 18:49:27 +07:00
|
|
|
mov x1, x25 // arg1 = x25, presumably the ESR saved by the dispatcher outside this view
|
|
|
|
mov x2, sp // arg2 = sp, the saved register frame
|
2014-09-29 17:44:01 +07:00
|
|
|
bl do_sp_pc_abort
|
|
|
|
b ret_to_user // leave through the slow return path
|
2012-03-05 18:49:27 +07:00
|
|
|
el0_undef: // calls do_undefinstr(sp), then returns to user
|
|
|
|
/*
|
|
|
|
* Undefined instruction
|
|
|
|
*/
|
2017-11-02 19:12:40 +07:00
|
|
|
enable_daif // presumably unmasks D, A, I and F (macro defined outside this view)
|
2014-05-31 02:34:15 +07:00
|
|
|
ct_user_exit // context-tracking hook for the user -> kernel transition
|
arm64: debug: avoid accessing mdscr_el1 on fault paths where possible
Since mdscr_el1 is part of the debug register group, it is highly likely
to be trapped by a hypervisor to prevent virtual machines from debugging
(buggering?) each other. Unfortunately, this absolutely destroys our
performance, since we access the register on many of our low-level
fault handling paths to keep track of the various debug state machines.
This patch removes our dependency on mdscr_el1 in the case that debugging
is not being used. More specifically we:
- Use TIF_SINGLESTEP to indicate that a task is stepping at EL0 and
avoid disabling step in the MDSCR when we don't need to.
MDSCR_EL1.SS handling is moved to kernel_entry, when trapping from
userspace.
- Ensure debug exceptions are re-enabled on *all* exception entry
paths, even the debug exception handling path (where we re-enable
exceptions after invoking the handler). Since we can now rely on
MDSCR_EL1.SS being cleared by the entry code, exception handlers can
usually enable debug immediately before enabling interrupts.
- Remove all debug exception unmasking from ret_to_user and
el1_preempt, since we will never get here with debug exceptions
masked.
This results in a slight change to kernel debug behaviour, where we now
step into interrupt handlers and data aborts from EL1 when debugging the
kernel, which is actually a useful thing to do. A side-effect of this is
that it *does* potentially prevent stepping off {break,watch}points when
there is a high-frequency interrupt source (e.g. a timer), so a debugger
would need to use either breakpoints or manually disable interrupts to
get around this issue.
With this patch applied, guest performance is restored under KVM when
debug register accesses are trapped (and we get a measurable performance
increase on the host on Cortex-A57 too).
Cc: Ian Campbell <ian.campbell@citrix.com>
Tested-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2014-04-30 01:04:06 +07:00
|
|
|
mov x0, sp // arg0 = sp, the saved register frame (the only argument)
|
2014-09-29 17:44:01 +07:00
|
|
|
bl do_undefinstr
|
|
|
|
b ret_to_user // leave through the slow return path
|
2016-06-29 00:07:32 +07:00
|
|
|
el0_sys: // calls do_sysinstr(x25, sp), then returns to user
|
|
|
|
/*
|
|
|
|
* System instructions, for trapped cache maintenance instructions
|
|
|
|
*/
|
2017-11-02 19:12:40 +07:00
|
|
|
enable_daif // presumably unmasks D, A, I and F (macro defined outside this view)
|
2016-06-29 00:07:32 +07:00
|
|
|
ct_user_exit // context-tracking hook for the user -> kernel transition
|
|
|
|
mov x0, x25 // arg0 = x25, presumably the ESR saved by the dispatcher outside this view
|
|
|
|
mov x1, sp // arg1 = sp, the saved register frame
|
|
|
|
bl do_sysinstr
|
|
|
|
b ret_to_user // leave through the slow return path
|
2012-03-05 18:49:27 +07:00
|
|
|
el0_dbg: // calls do_debug_exception(FAR_EL1, x25, sp) before unmasking, then returns to user
|
|
|
|
/*
|
|
|
|
* Debug exception handling
|
|
|
|
*/
|
|
|
|
tbnz x24, #0, el0_inv // EL0 only
|
|
|
|
mrs x0, far_el1 // arg0 = FAR_EL1, read straight into the argument register
|
|
|
|
mov x1, x25 // arg1 = x25, presumably the ESR saved by the dispatcher outside this view
|
|
|
|
mov x2, sp // arg2 = sp, the saved register frame
|
arm64: debug: avoid accessing mdscr_el1 on fault paths where possible
Since mdscr_el1 is part of the debug register group, it is highly likely
to be trapped by a hypervisor to prevent virtual machines from debugging
(buggering?) each other. Unfortunately, this absolutely destroys our
performance, since we access the register on many of our low-level
fault handling paths to keep track of the various debug state machines.
This patch removes our dependency on mdscr_el1 in the case that debugging
is not being used. More specifically we:
- Use TIF_SINGLESTEP to indicate that a task is stepping at EL0 and
avoid disabling step in the MDSCR when we don't need to.
MDSCR_EL1.SS handling is moved to kernel_entry, when trapping from
userspace.
- Ensure debug exceptions are re-enabled on *all* exception entry
paths, even the debug exception handling path (where we re-enable
exceptions after invoking the handler). Since we can now rely on
MDSCR_EL1.SS being cleared by the entry code, exception handlers can
usually enable debug immediately before enabling interrupts.
- Remove all debug exception unmasking from ret_to_user and
el1_preempt, since we will never get here with debug exceptions
masked.
This results in a slight change to kernel debug behaviour, where we now
step into interrupt handlers and data aborts from EL1 when debugging the
kernel, which is actually a useful thing to do. A side-effect of this is
that it *does* potentially prevent stepping off {break,watch}points when
there is a high-frequency interrupt source (e.g. a timer), so a debugger
would need to use either breakpoints or manually disable interrupts to
get around this issue.
With this patch applied, guest performance is restored under KVM when
debug register accesses are trapped (and we get a measurable performance
increase on the host on Cortex-A57 too).
Cc: Ian Campbell <ian.campbell@citrix.com>
Tested-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2014-04-30 01:04:06 +07:00
|
|
|
bl do_debug_exception // runs before enable_daif: this path re-enables exceptions only after the handler
|
2017-11-02 19:12:40 +07:00
|
|
|
enable_daif // presumably unmasks D, A, I and F (macro defined outside this view)
|
2014-05-31 02:34:15 +07:00
|
|
|
ct_user_exit // context-tracking hook; placed after the handler call on this path
|
arm64: debug: avoid accessing mdscr_el1 on fault paths where possible
Since mdscr_el1 is part of the debug register group, it is highly likely
to be trapped by a hypervisor to prevent virtual machines from debugging
(buggering?) each other. Unfortunately, this absolutely destroys our
performance, since we access the register on many of our low-level
fault handling paths to keep track of the various debug state machines.
This patch removes our dependency on mdscr_el1 in the case that debugging
is not being used. More specifically we:
- Use TIF_SINGLESTEP to indicate that a task is stepping at EL0 and
avoid disabling step in the MDSCR when we don't need to.
MDSCR_EL1.SS handling is moved to kernel_entry, when trapping from
userspace.
- Ensure debug exceptions are re-enabled on *all* exception entry
paths, even the debug exception handling path (where we re-enable
exceptions after invoking the handler). Since we can now rely on
MDSCR_EL1.SS being cleared by the entry code, exception handlers can
usually enable debug immediately before enabling interrupts.
- Remove all debug exception unmasking from ret_to_user and
el1_preempt, since we will never get here with debug exceptions
masked.
This results in a slight change to kernel debug behaviour, where we now
step into interrupt handlers and data aborts from EL1 when debugging the
kernel, which is actually a useful thing to do. A side-effect of this is
that it *does* potentially prevent stepping off {break,watch}points when
there is a high-frequency interrupt source (e.g. a timer), so a debugger
would need to use either breakpoints or manually disable interrupts to
get around this issue.
With this patch applied, guest performance is restored under KVM when
debug register accesses are trapped (and we get a measurable performance
increase on the host on Cortex-A57 too).
Cc: Ian Campbell <ian.campbell@citrix.com>
Tested-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2014-04-30 01:04:06 +07:00
|
|
|
b ret_to_user // leave through the slow return path
|
2012-03-05 18:49:27 +07:00
|
|
|
el0_inv: // calls bad_el0_sync(sp, BAD_SYNC, x25), then returns to user; el0_dbg branches here
|
2017-11-02 19:12:40 +07:00
|
|
|
enable_daif // presumably unmasks D, A, I and F (macro defined outside this view)
|
2014-05-31 02:34:15 +07:00
|
|
|
ct_user_exit // context-tracking hook for the user -> kernel transition
|
2012-03-05 18:49:27 +07:00
|
|
|
mov x0, sp // arg0 = sp, the saved register frame
|
|
|
|
mov x1, #BAD_SYNC // arg1 = BAD_SYNC reason code
|
2015-07-08 00:00:49 +07:00
|
|
|
mov x2, x25 // arg2 = x25, presumably the ESR saved by the dispatcher outside this view
|
arm64: avoid returning from bad_mode
Generally, taking an unexpected exception should be a fatal event, and
bad_mode is intended to cater for this. However, it should be possible
to contain unexpected synchronous exceptions from EL0 without bringing
the kernel down, by sending a SIGILL to the task.
We tried to apply this approach in commit 9955ac47f4ba1c95 ("arm64:
don't kill the kernel on a bad esr from el0"), by sending a signal for
any bad_mode call resulting from an EL0 exception.
However, this also applies to other unexpected exceptions, such as
SError and FIQ. The entry paths for these exceptions branch to bad_mode
without configuring the link register, and have no kernel_exit. Thus, if
we take one of these exceptions from EL0, bad_mode will eventually
return to the original user link register value.
This patch fixes this by introducing a new bad_el0_sync handler to cater
for the recoverable case, and restoring bad_mode to its original state,
whereby it calls panic() and never returns. The recoverable case
branches to bad_el0_sync with a bl, and returns to userspace via the
usual ret_to_user mechanism.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Fixes: 9955ac47f4ba1c95 ("arm64: don't kill the kernel on a bad esr from el0")
Reported-by: Mark Salter <msalter@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: stable@vger.kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-01-19 00:23:41 +07:00
|
|
|
bl bad_el0_sync // called with bl so the handler returns here rather than to a stale link register
|
2014-09-29 17:44:01 +07:00
|
|
|
b ret_to_user // leave through the slow return path
|
2012-03-05 18:49:27 +07:00
|
|
|
ENDPROC(el0_sync)
|
|
|
|
|
|
|
|
.align 6
|
|
|
|
el0_irq: // IRQ entry: runs irq_handler, then returns to user
|
|
|
|
kernel_entry 0 // entry from EL0 (macro defined outside this view)
|
|
|
|
el0_irq_naked: // el0_irq_compat joins here after doing its own kernel_entry
|
2017-11-02 19:12:41 +07:00
|
|
|
enable_da_f // presumably unmasks D, A and F but leaves IRQs masked (macro defined outside this view)
|
2012-03-05 18:49:27 +07:00
|
|
|
#ifdef CONFIG_TRACE_IRQFLAGS
|
|
|
|
|
bl trace_hardirqs_off // tell the irq-flags tracer that IRQs are off here
|
|
|
|
|
#endif
|
2013-11-13 00:11:53 +07:00
|
|
|
|
|
2014-05-31 02:34:15 +07:00
|
|
|
|
ct_user_exit // context-tracking hook for the user -> kernel transition
|
2012-03-05 18:49:27 +07:00
|
|
|
|
irq_handler // macro defined outside this view
|
2013-11-13 00:11:53 +07:00
|
|
|
|
|
2012-03-05 18:49:27 +07:00
|
|
|
|
#ifdef CONFIG_TRACE_IRQFLAGS
|
|
|
|
|
bl trace_hardirqs_on // tracer bookkeeping before heading back towards user space
|
|
|
|
|
#endif
|
|
|
|
|
b ret_to_user // leave through the slow return path
|
|
|
|
|
ENDPROC(el0_irq)
|
|
|
|
|
2017-11-02 19:12:42 +07:00
|
|
|
el1_error: // error entry paired with kernel_entry 1 / kernel_exit 1: calls do_serror(sp, ESR_EL1)
|
|
|
|
kernel_entry 1 // entry from EL1 (macro defined outside this view)
|
|
|
|
mrs x1, esr_el1 // arg1 = ESR_EL1, read before anything is unmasked
|
|
|
|
enable_dbg // presumably unmasks debug exceptions only (macro defined outside this view)
|
|
|
|
mov x0, sp // arg0 = sp, the saved register frame
|
|
|
|
bl do_serror
|
|
|
|
kernel_exit 1 // exit matching kernel_entry 1
|
|
|
|
ENDPROC(el1_error)
|
|
|
|
|
|
|
|
el0_error: // error entry from EL0: calls do_serror(sp, ESR_EL1), then returns to user
|
|
|
|
kernel_entry 0 // entry from EL0 (macro defined outside this view)
|
|
|
|
el0_error_naked: // el0_error_compat joins here after doing its own kernel_entry
|
|
|
|
mrs x1, esr_el1 // arg1 = ESR_EL1, read before anything is unmasked
|
|
|
|
enable_dbg // presumably unmasks debug exceptions only (macro defined outside this view)
|
|
|
|
mov x0, sp // arg0 = sp, the saved register frame
|
|
|
|
bl do_serror
|
|
|
|
enable_daif // presumably unmasks the remaining exceptions, after do_serror has returned
|
|
|
|
ct_user_exit // context-tracking hook; placed after the handler call on this path
|
|
|
|
b ret_to_user // leave through the slow return path
|
|
|
|
ENDPROC(el0_error)
|
|
|
|
|
|
|
|
|
2012-03-05 18:49:27 +07:00
|
|
|
/*
|
|
|
|
* This is the fast syscall return path. We do as little as possible here,
|
|
|
|
* and this includes saving x0 back into the kernel stack.
|
|
|
|
*/
|
|
|
|
ret_fast_syscall: // store x0 in the frame, divert if any work flag is set, otherwise exit to user
|
2017-11-02 19:12:37 +07:00
|
|
|
disable_daif // presumably masks D, A, I and F so the flag check below is not raced (macro defined outside this view)
|
2015-08-19 21:57:09 +07:00
|
|
|
str x0, [sp, #S_X0] // returned x0
|
arm64: split thread_info from task stack
This patch moves arm64's struct thread_info from the task stack into
task_struct. This protects thread_info from corruption in the case of
stack overflows, and makes its address harder to determine if stack
addresses are leaked, making a number of attacks more difficult. Precise
detection and handling of overflow is left for subsequent patches.
Largely, this involves changing code to store the task_struct in sp_el0,
and acquire the thread_info from the task struct. Core code now
implements current_thread_info(), and as noted in <linux/sched.h> this
relies on offsetof(task_struct, thread_info) == 0, enforced by core
code.
This change means that the 'tsk' register used in entry.S now points to
a task_struct, rather than a thread_info as it used to. To make this
clear, the TI_* field offsets are renamed to TSK_TI_*, with asm-offsets
appropriately updated to account for the structural change.
Userspace clobbers sp_el0, and we can no longer restore this from the
stack. Instead, the current task is cached in a per-cpu variable that we
can safely access from early assembly as interrupts are disabled (and we
are thus not preemptible).
Both secondary entry and idle are updated to stash the sp and task
pointer separately.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Cc: AKASHI Takahiro <takahiro.akashi@linaro.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: James Morse <james.morse@arm.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-11-04 03:23:13 +07:00
|
|
|
ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for syscall tracing
|
2015-06-06 04:28:03 +07:00
|
|
|
and x2, x1, #_TIF_SYSCALL_WORK // x2 = syscall-work bits of the thread flags
|
|
|
|
cbnz x2, ret_fast_syscall_trace // any set: take the traced return path
|
2012-03-05 18:49:27 +07:00
|
|
|
and x2, x1, #_TIF_WORK_MASK // x2 = pending-work bits of the thread flags
|
2015-08-19 21:57:09 +07:00
|
|
|
cbnz x2, work_pending // any set: slow path, with x1 still holding the flags
|
arm64: debug: avoid accessing mdscr_el1 on fault paths where possible
Since mdscr_el1 is part of the debug register group, it is highly likely
to be trapped by a hypervisor to prevent virtual machines from debugging
(buggering?) each other. Unfortunately, this absolutely destroys our
performance, since we access the register on many of our low-level
fault handling paths to keep track of the various debug state machines.
This patch removes our dependency on mdscr_el1 in the case that debugging
is not being used. More specifically we:
- Use TIF_SINGLESTEP to indicate that a task is stepping at EL0 and
avoid disabling step in the MDSCR when we don't need to.
MDSCR_EL1.SS handling is moved to kernel_entry, when trapping from
userspace.
- Ensure debug exceptions are re-enabled on *all* exception entry
paths, even the debug exception handling path (where we re-enable
exceptions after invoking the handler). Since we can now rely on
MDSCR_EL1.SS being cleared by the entry code, exception handlers can
usually enable debug immediately before enabling interrupts.
- Remove all debug exception unmasking from ret_to_user and
el1_preempt, since we will never get here with debug exceptions
masked.
This results in a slight change to kernel debug behaviour, where we now
step into interrupt handlers and data aborts from EL1 when debugging the
kernel, which is actually a useful thing to do. A side-effect of this is
that it *does* potentially prevent stepping off {break,watch}points when
there is a high-frequency interrupt source (e.g. a timer), so a debugger
would need to use either breakpoints or manually disable interrupts to
get around this issue.
With this patch applied, guest performance is restored under KVM when
debug register accesses are trapped (and we get a measurable performance
increase on the host on Cortex-A57 too).
Cc: Ian Campbell <ian.campbell@citrix.com>
Tested-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2014-04-30 01:04:06 +07:00
|
|
|
enable_step_tsk x1, x2 // x1 = thread flags; presumably re-arms single-step when requested, x2 as scratch
|
2015-08-19 21:57:09 +07:00
|
|
|
kernel_exit 0 // exit matching an EL0 kernel_entry
|
2015-06-06 04:28:03 +07:00
|
|
|
ret_fast_syscall_trace: // reached from ret_fast_syscall when a _TIF_SYSCALL_WORK bit is set
|
2017-11-02 19:12:37 +07:00
|
|
|
enable_daif // presumably undoes the disable_daif done in ret_fast_syscall (macro defined outside this view)
|
2015-08-19 21:57:09 +07:00
|
|
|
b __sys_trace_return_skipped // we already saved x0
|
2012-03-05 18:49:27 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Ok, we need to do extra processing, enter the slow path.
|
|
|
|
*/
|
|
|
|
work_pending: // reached with x1 = thread flags; runs do_notify_resume, then rejoins the exit path
|
|
|
|
mov x0, sp // 'regs'
|
|
|
|
bl do_notify_resume // x1 still holds the flags loaded by the caller; presumably the second argument
|
2015-12-04 19:42:29 +07:00
|
|
|
#ifdef CONFIG_TRACE_IRQFLAGS
|
2016-07-15 03:48:14 +07:00
|
|
|
bl trace_hardirqs_on // enabled while in userspace
|
2015-12-04 19:42:29 +07:00
|
|
|
#endif
|
arm64: split thread_info from task stack
This patch moves arm64's struct thread_info from the task stack into
task_struct. This protects thread_info from corruption in the case of
stack overflows, and makes its address harder to determine if stack
addresses are leaked, making a number of attacks more difficult. Precise
detection and handling of overflow is left for subsequent patches.
Largely, this involves changing code to store the task_struct in sp_el0,
and acquire the thread_info from the task struct. Core code now
implements current_thread_info(), and as noted in <linux/sched.h> this
relies on offsetof(task_struct, thread_info) == 0, enforced by core
code.
This change means that the 'tsk' register used in entry.S now points to
a task_struct, rather than a thread_info as it used to. To make this
clear, the TI_* field offsets are renamed to TSK_TI_*, with asm-offsets
appropriately updated to account for the structural change.
Userspace clobbers sp_el0, and we can no longer restore this from the
stack. Instead, the current task is cached in a per-cpu variable that we
can safely access from early assembly as interrupts are disabled (and we
are thus not preemptible).
Both secondary entry and idle are updated to stash the sp and task
pointer separately.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Cc: AKASHI Takahiro <takahiro.akashi@linaro.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: James Morse <james.morse@arm.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-11-04 03:23:13 +07:00
|
|
|
ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for single-step
|
2016-07-15 03:48:14 +07:00
|
|
|
b finish_ret_to_user // x1 = fresh flags for enable_step_tsk; _TIF_WORK_MASK is not tested again here
|
2012-03-05 18:49:27 +07:00
|
|
|
/*
|
|
|
|
* "slow" syscall return path.
|
|
|
|
*/
|
2012-09-10 22:11:46 +07:00
|
|
|
ret_to_user: // common exit for the EL0 handlers: divert to work_pending if needed, else exit
|
2017-11-02 19:12:37 +07:00
|
|
|
disable_daif // presumably masks D, A, I and F so the flag check below is not raced (macro defined outside this view)
|
arm64: split thread_info from task stack
This patch moves arm64's struct thread_info from the task stack into
task_struct. This protects thread_info from corruption in the case of
stack overflows, and makes its address harder to determine if stack
addresses are leaked, making a number of attacks more difficult. Precise
detection and handling of overflow is left for subsequent patches.
Largely, this involves changing code to store the task_struct in sp_el0,
and acquire the thread_info from the task struct. Core code now
implements current_thread_info(), and as noted in <linux/sched.h> this
relies on offsetof(task_struct, thread_info) == 0, enforced by core
code.
This change means that the 'tsk' register used in entry.S now points to
a task_struct, rather than a thread_info as it used to. To make this
clear, the TI_* field offsets are renamed to TSK_TI_*, with asm-offsets
appropriately updated to account for the structural change.
Userspace clobbers sp_el0, and we can no longer restore this from the
stack. Instead, the current task is cached in a per-cpu variable that we
can safely access from early assembly as interrupts are disabled (and we
are thus not preemptible).
Both secondary entry and idle are updated to stash the sp and task
pointer separately.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Cc: AKASHI Takahiro <takahiro.akashi@linaro.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: James Morse <james.morse@arm.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-11-04 03:23:13 +07:00
|
|
|
ldr x1, [tsk, #TSK_TI_FLAGS] // x1 = current thread flags
|
2012-03-05 18:49:27 +07:00
|
|
|
and x2, x1, #_TIF_WORK_MASK // x2 = pending-work bits of the thread flags
|
|
|
|
cbnz x2, work_pending // any set: slow path, with x1 still holding the flags
|
2016-07-15 03:48:14 +07:00
|
|
|
finish_ret_to_user: // work_pending rejoins here with freshly loaded flags in x1
|
arm64: debug: avoid accessing mdscr_el1 on fault paths where possible
Since mdscr_el1 is part of the debug register group, it is highly likely
to be trapped by a hypervisor to prevent virtual machines from debugging
(buggering?) each other. Unfortunately, this absolutely destroys our
performance, since we access the register on many of our low-level
fault handling paths to keep track of the various debug state machines.
This patch removes our dependency on mdscr_el1 in the case that debugging
is not being used. More specifically we:
- Use TIF_SINGLESTEP to indicate that a task is stepping at EL0 and
avoid disabling step in the MDSCR when we don't need to.
MDSCR_EL1.SS handling is moved to kernel_entry, when trapping from
userspace.
- Ensure debug exceptions are re-enabled on *all* exception entry
paths, even the debug exception handling path (where we re-enable
exceptions after invoking the handler). Since we can now rely on
MDSCR_EL1.SS being cleared by the entry code, exception handlers can
usually enable debug immediately before enabling interrupts.
- Remove all debug exception unmasking from ret_to_user and
el1_preempt, since we will never get here with debug exceptions
masked.
This results in a slight change to kernel debug behaviour, where we now
step into interrupt handlers and data aborts from EL1 when debugging the
kernel, which is actually a useful thing to do. A side-effect of this is
that it *does* potentially prevent stepping off {break,watch}points when
there is a high-frequency interrupt source (e.g. a timer), so a debugger
would need to use either breakpoints or manually disable interrupts to
get around this issue.
With this patch applied, guest performance is restored under KVM when
debug register accesses are trapped (and we get a measurable performance
increase on the host on Cortex-A57 too).
Cc: Ian Campbell <ian.campbell@citrix.com>
Tested-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2014-04-30 01:04:06 +07:00
|
|
|
enable_step_tsk x1, x2 // x1 = thread flags; presumably re-arms single-step when requested, x2 as scratch
|
2015-08-19 21:57:09 +07:00
|
|
|
kernel_exit 0 // exit matching an EL0 kernel_entry
|
2012-03-05 18:49:27 +07:00
|
|
|
ENDPROC(ret_to_user)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* SVC handler.
|
|
|
|
*/
|
|
|
|
.align 6
|
|
|
|
el0_svc:
|
arm64/sve: Core task context handling
This patch adds the core support for switching and managing the SVE
architectural state of user tasks.
Calls to the existing FPSIMD low-level save/restore functions are
factored out as new functions task_fpsimd_{save,load}(), since SVE
now dynamically may or may not need to be handled at these points
depending on the kernel configuration, hardware features discovered
at boot, and the runtime state of the task. To make these
decisions as fast as possible, const cpucaps are used where
feasible, via the system_supports_sve() helper.
The SVE registers are only tracked for threads that have explicitly
used SVE, indicated by the new thread flag TIF_SVE. Otherwise, the
FPSIMD view of the architectural state is stored in
thread.fpsimd_state as usual.
When in use, the SVE registers are not stored directly in
thread_struct due to their potentially large and variable size.
Because the task_struct slab allocator must be configured very
early during kernel boot, it is also tricky to configure it
correctly to match the maximum vector length provided by the
hardware, since this depends on examining secondary CPUs as well as
the primary. Instead, a pointer sve_state in thread_struct points
to a dynamically allocated buffer containing the SVE register data,
and code is added to allocate and free this buffer at appropriate
times.
TIF_SVE is set when taking an SVE access trap from userspace, if
suitable hardware support has been detected. This enables SVE for
the thread: a subsequent return to userspace will disable the trap
accordingly. If such a trap is taken without sufficient system-
wide hardware support, SIGILL is sent to the thread instead as if
an undefined instruction had been executed: this may happen if
userspace tries to use SVE in a system where not all CPUs support
it for example.
The kernel will clear TIF_SVE and disable SVE for the thread
whenever an explicit syscall is made by userspace. For backwards
compatibility reasons and conformance with the spirit of the base
AArch64 procedure call standard, the subset of the SVE register
state that aliases the FPSIMD registers is still preserved across a
syscall even if this happens. The remainder of the SVE register
state logically becomes zero at syscall entry, though the actual
zeroing work is currently deferred until the thread next tries to
use SVE, causing another trap to the kernel. This implementation
is suboptimal: in the future, the fastpath case may be optimised
to zero the registers in-place and leave SVE enabled for the task,
where beneficial.
TIF_SVE is also cleared in the following slowpath cases, which are
taken as reasonable hints that the task may no longer use SVE:
* exec
* fork and clone
Code is added to sync data between thread.fpsimd_state and
thread.sve_state whenever enabling/disabling SVE, in a manner
consistent with the SVE architectural programmer's model.
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Alex Bennée <alex.bennee@linaro.org>
[will: added #include to fix allnoconfig build]
[will: use enable_daif in do_sve_acc]
Signed-off-by: Will Deacon <will.deacon@arm.com>
2017-10-31 22:51:05 +07:00
|
|
|
ldr x16, [tsk, #TSK_TI_FLAGS] // load thread flags
|
2012-03-05 18:49:27 +07:00
|
|
|
adrp stbl, sys_call_table // load syscall table pointer
|
arm64: syscallno is secretly an int, make it official
The upper 32 bits of the syscallno field in thread_struct are
handled inconsistently, being sometimes zero extended and sometimes
sign-extended. In fact, only the lower 32 bits seem to have any
real significance for the behaviour of the code: it's been OK to
handle the upper bits inconsistently because they don't matter.
Currently, the only place I can find where those bits are
significant is in calling trace_sys_enter(), which may be
unintentional: for example, if a compat tracer attempts to cancel a
syscall by passing -1 to (COMPAT_)PTRACE_SET_SYSCALL at the
syscall-enter-stop, it will be traced as syscall 4294967295
rather than -1 as might be expected (and as occurs for a native
tracer doing the same thing). Elsewhere, reads of syscallno cast
it to an int or truncate it.
There's also a conspicuous amount of code and casting to bodge
around the fact that although semantically an int, syscallno is
stored as a u64.
Let's not pretend any more.
In order to preserve the stp x instruction that stores the syscall
number in entry.S, this patch special-cases the layout of struct
pt_regs for big endian so that the newly 32-bit syscallno field
maps onto the low bits of the stored value. This is not beautiful,
but benchmarking of the getpid syscall on Juno suggests a
minor slowdown if the stp is split into an stp x and stp w.
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-08-01 21:35:53 +07:00
|
|
|
mov wscno, w8 // syscall number in w8
|
|
|
|
mov wsc_nr, #__NR_syscalls
|
arm64/sve: Core task context handling
This patch adds the core support for switching and managing the SVE
architectural state of user tasks.
Calls to the existing FPSIMD low-level save/restore functions are
factored out as new functions task_fpsimd_{save,load}(), since SVE
now dynamically may or may not need to be handled at these points
depending on the kernel configuration, hardware features discovered
at boot, and the runtime state of the task. To make these
decisions as fast as possible, const cpucaps are used where
feasible, via the system_supports_sve() helper.
The SVE registers are only tracked for threads that have explicitly
used SVE, indicated by the new thread flag TIF_SVE. Otherwise, the
FPSIMD view of the architectural state is stored in
thread.fpsimd_state as usual.
When in use, the SVE registers are not stored directly in
thread_struct due to their potentially large and variable size.
Because the task_struct slab allocator must be configured very
early during kernel boot, it is also tricky to configure it
correctly to match the maximum vector length provided by the
hardware, since this depends on examining secondary CPUs as well as
the primary. Instead, a pointer sve_state in thread_struct points
to a dynamically allocated buffer containing the SVE register data,
and code is added to allocate and free this buffer at appropriate
times.
TIF_SVE is set when taking an SVE access trap from userspace, if
suitable hardware support has been detected. This enables SVE for
the thread: a subsequent return to userspace will disable the trap
accordingly. If such a trap is taken without sufficient system-
wide hardware support, SIGILL is sent to the thread instead as if
an undefined instruction had been executed: this may happen if
userspace tries to use SVE in a system where not all CPUs support
it for example.
The kernel will clear TIF_SVE and disable SVE for the thread
whenever an explicit syscall is made by userspace. For backwards
compatibility reasons and conformance with the spirit of the base
AArch64 procedure call standard, the subset of the SVE register
state that aliases the FPSIMD registers is still preserved across a
syscall even if this happens. The remainder of the SVE register
state logically becomes zero at syscall entry, though the actual
zeroing work is currently deferred until the thread next tries to
use SVE, causing another trap to the kernel. This implementation
is suboptimal: in the future, the fastpath case may be optimised
to zero the registers in-place and leave SVE enabled for the task,
where beneficial.
TIF_SVE is also cleared in the following slowpath cases, which are
taken as reasonable hints that the task may no longer use SVE:
* exec
* fork and clone
Code is added to sync data between thread.fpsimd_state and
thread.sve_state whenever enabling/disabling SVE, in a manner
consistent with the SVE architectural programmer's model.
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Alex Bennée <alex.bennee@linaro.org>
[will: added #include to fix allnoconfig build]
[will: use enable_daif in do_sve_acc]
Signed-off-by: Will Deacon <will.deacon@arm.com>
2017-10-31 22:51:05 +07:00
|
|
|
|
2017-10-31 22:51:19 +07:00
|
|
|
#ifdef CONFIG_ARM64_SVE
|
|
|
|
alternative_if_not ARM64_SVE
|
arm64/sve: Core task context handling
This patch adds the core support for switching and managing the SVE
architectural state of user tasks.
Calls to the existing FPSIMD low-level save/restore functions are
factored out as new functions task_fpsimd_{save,load}(), since SVE
now dynamically may or may not need to be handled at these points
depending on the kernel configuration, hardware features discovered
at boot, and the runtime state of the task. To make these
decisions as fast as possible, const cpucaps are used where
feasible, via the system_supports_sve() helper.
The SVE registers are only tracked for threads that have explicitly
used SVE, indicated by the new thread flag TIF_SVE. Otherwise, the
FPSIMD view of the architectural state is stored in
thread.fpsimd_state as usual.
When in use, the SVE registers are not stored directly in
thread_struct due to their potentially large and variable size.
Because the task_struct slab allocator must be configured very
early during kernel boot, it is also tricky to configure it
correctly to match the maximum vector length provided by the
hardware, since this depends on examining secondary CPUs as well as
the primary. Instead, a pointer sve_state in thread_struct points
to a dynamically allocated buffer containing the SVE register data,
and code is added to allocate and free this buffer at appropriate
times.
TIF_SVE is set when taking an SVE access trap from userspace, if
suitable hardware support has been detected. This enables SVE for
the thread: a subsequent return to userspace will disable the trap
accordingly. If such a trap is taken without sufficient system-
wide hardware support, SIGILL is sent to the thread instead as if
an undefined instruction had been executed: this may happen if
userspace tries to use SVE in a system where not all CPUs support
it for example.
The kernel will clear TIF_SVE and disable SVE for the thread
whenever an explicit syscall is made by userspace. For backwards
compatibility reasons and conformance with the spirit of the base
AArch64 procedure call standard, the subset of the SVE register
state that aliases the FPSIMD registers is still preserved across a
syscall even if this happens. The remainder of the SVE register
state logically becomes zero at syscall entry, though the actual
zeroing work is currently deferred until the thread next tries to
use SVE, causing another trap to the kernel. This implementation
is suboptimal: in the future, the fastpath case may be optimised
to zero the registers in-place and leave SVE enabled for the task,
where beneficial.
TIF_SVE is also cleared in the following slowpath cases, which are
taken as reasonable hints that the task may no longer use SVE:
* exec
* fork and clone
Code is added to sync data between thread.fpsimd_state and
thread.sve_state whenever enabling/disabling SVE, in a manner
consistent with the SVE architectural programmer's model.
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Alex Bennée <alex.bennee@linaro.org>
[will: added #include to fix allnoconfig build]
[will: use enable_daif in do_sve_acc]
Signed-off-by: Will Deacon <will.deacon@arm.com>
2017-10-31 22:51:05 +07:00
|
|
|
b el0_svc_naked
|
2017-10-31 22:51:19 +07:00
|
|
|
alternative_else_nop_endif
|
arm64/sve: Core task context handling
This patch adds the core support for switching and managing the SVE
architectural state of user tasks.
Calls to the existing FPSIMD low-level save/restore functions are
factored out as new functions task_fpsimd_{save,load}(), since SVE
now dynamically may or may not need to be handled at these points
depending on the kernel configuration, hardware features discovered
at boot, and the runtime state of the task. To make these
decisions as fast as possible, const cpucaps are used where
feasible, via the system_supports_sve() helper.
The SVE registers are only tracked for threads that have explicitly
used SVE, indicated by the new thread flag TIF_SVE. Otherwise, the
FPSIMD view of the architectural state is stored in
thread.fpsimd_state as usual.
When in use, the SVE registers are not stored directly in
thread_struct due to their potentially large and variable size.
Because the task_struct slab allocator must be configured very
early during kernel boot, it is also tricky to configure it
correctly to match the maximum vector length provided by the
hardware, since this depends on examining secondary CPUs as well as
the primary. Instead, a pointer sve_state in thread_struct points
to a dynamically allocated buffer containing the SVE register data,
and code is added to allocate and free this buffer at appropriate
times.
TIF_SVE is set when taking an SVE access trap from userspace, if
suitable hardware support has been detected. This enables SVE for
the thread: a subsequent return to userspace will disable the trap
accordingly. If such a trap is taken without sufficient system-
wide hardware support, SIGILL is sent to the thread instead as if
an undefined instruction had been executed: this may happen if
userspace tries to use SVE in a system where not all CPUs support
it for example.
The kernel will clear TIF_SVE and disable SVE for the thread
whenever an explicit syscall is made by userspace. For backwards
compatibility reasons and conformance with the spirit of the base
AArch64 procedure call standard, the subset of the SVE register
state that aliases the FPSIMD registers is still preserved across a
syscall even if this happens. The remainder of the SVE register
state logically becomes zero at syscall entry, though the actual
zeroing work is currently deferred until the thread next tries to
use SVE, causing another trap to the kernel. This implementation
is suboptimal: in the future, the fastpath case may be optimised
to zero the registers in-place and leave SVE enabled for the task,
where beneficial.
TIF_SVE is also cleared in the following slowpath cases, which are
taken as reasonable hints that the task may no longer use SVE:
* exec
* fork and clone
Code is added to sync data between thread.fpsimd_state and
thread.sve_state whenever enabling/disabling SVE, in a manner
consistent with the SVE architectural programmer's model.
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Alex Bennée <alex.bennee@linaro.org>
[will: added #include to fix allnoconfig build]
[will: use enable_daif in do_sve_acc]
Signed-off-by: Will Deacon <will.deacon@arm.com>
2017-10-31 22:51:05 +07:00
|
|
|
tbz x16, #TIF_SVE, el0_svc_naked // Skip unless TIF_SVE set:
|
|
|
|
bic x16, x16, #_TIF_SVE // discard SVE state
|
|
|
|
str x16, [tsk, #TSK_TI_FLAGS]
|
|
|
|
|
|
|
|
/*
|
|
|
|
* task_fpsimd_load() won't be called to update CPACR_EL1 in
|
|
|
|
* ret_to_user unless TIF_FOREIGN_FPSTATE is still set, which only
|
|
|
|
* happens if a context switch or kernel_neon_begin() or context
|
|
|
|
* modification (sigreturn, ptrace) intervenes.
|
|
|
|
* So, ensure that CPACR_EL1 is already correct for the fast-path case:
|
|
|
|
*/
|
|
|
|
mrs x9, cpacr_el1
|
|
|
|
bic x9, x9, #CPACR_EL1_ZEN_EL0EN // disable SVE for el0
|
|
|
|
msr cpacr_el1, x9 // synchronised by eret to el0
|
2017-10-31 22:51:19 +07:00
|
|
|
#endif
|
arm64/sve: Core task context handling
This patch adds the core support for switching and managing the SVE
architectural state of user tasks.
Calls to the existing FPSIMD low-level save/restore functions are
factored out as new functions task_fpsimd_{save,load}(), since SVE
now dynamically may or may not need to be handled at these points
depending on the kernel configuration, hardware features discovered
at boot, and the runtime state of the task. To make these
decisions as fast as possible, const cpucaps are used where
feasible, via the system_supports_sve() helper.
The SVE registers are only tracked for threads that have explicitly
used SVE, indicated by the new thread flag TIF_SVE. Otherwise, the
FPSIMD view of the architectural state is stored in
thread.fpsimd_state as usual.
When in use, the SVE registers are not stored directly in
thread_struct due to their potentially large and variable size.
Because the task_struct slab allocator must be configured very
early during kernel boot, it is also tricky to configure it
correctly to match the maximum vector length provided by the
hardware, since this depends on examining secondary CPUs as well as
the primary. Instead, a pointer sve_state in thread_struct points
to a dynamically allocated buffer containing the SVE register data,
and code is added to allocate and free this buffer at appropriate
times.
TIF_SVE is set when taking an SVE access trap from userspace, if
suitable hardware support has been detected. This enables SVE for
the thread: a subsequent return to userspace will disable the trap
accordingly. If such a trap is taken without sufficient system-
wide hardware support, SIGILL is sent to the thread instead as if
an undefined instruction had been executed: this may happen if
userspace tries to use SVE in a system where not all CPUs support
it for example.
The kernel will clear TIF_SVE and disable SVE for the thread
whenever an explicit syscall is made by userspace. For backwards
compatibility reasons and conformance with the spirit of the base
AArch64 procedure call standard, the subset of the SVE register
state that aliases the FPSIMD registers is still preserved across a
syscall even if this happens. The remainder of the SVE register
state logically becomes zero at syscall entry, though the actual
zeroing work is currently deferred until the thread next tries to
use SVE, causing another trap to the kernel. This implementation
is suboptimal: in the future, the fastpath case may be optimised
to zero the registers in-place and leave SVE enabled for the task,
where beneficial.
TIF_SVE is also cleared in the following slowpath cases, which are
taken as reasonable hints that the task may no longer use SVE:
* exec
* fork and clone
Code is added to sync data between thread.fpsimd_state and
thread.sve_state whenever enabling/disabling SVE, in a manner
consistent with the SVE architectural programmer's model.
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Alex Bennée <alex.bennee@linaro.org>
[will: added #include to fix allnoconfig build]
[will: use enable_daif in do_sve_acc]
Signed-off-by: Will Deacon <will.deacon@arm.com>
2017-10-31 22:51:05 +07:00
|
|
|
|
2012-03-05 18:49:27 +07:00
|
|
|
el0_svc_naked: // compat entry point
|
arm64: syscallno is secretly an int, make it official
The upper 32 bits of the syscallno field in thread_struct are
handled inconsistently, being sometimes zero extended and sometimes
sign-extended. In fact, only the lower 32 bits seem to have any
real significance for the behaviour of the code: it's been OK to
handle the upper bits inconsistently because they don't matter.
Currently, the only place I can find where those bits are
significant is in calling trace_sys_enter(), which may be
unintentional: for example, if a compat tracer attempts to cancel a
syscall by passing -1 to (COMPAT_)PTRACE_SET_SYSCALL at the
syscall-enter-stop, it will be traced as syscall 4294967295
rather than -1 as might be expected (and as occurs for a native
tracer doing the same thing). Elsewhere, reads of syscallno cast
it to an int or truncate it.
There's also a conspicuous amount of code and casting to bodge
around the fact that although semantically an int, syscallno is
stored as a u64.
Let's not pretend any more.
In order to preserve the stp x instruction that stores the syscall
number in entry.S, this patch special-cases the layout of struct
pt_regs for big endian so that the newly 32-bit syscallno field
maps onto the low bits of the stored value. This is not beautiful,
but benchmarking of the getpid syscall on Juno suggests a
minor slowdown if the stp is split into an stp x and stp w.
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-08-01 21:35:53 +07:00
|
|
|
stp x0, xscno, [sp, #S_ORIG_X0] // save the original x0 and syscall number
|
2017-11-02 19:12:40 +07:00
|
|
|
enable_daif
|
2014-05-31 02:34:15 +07:00
|
|
|
ct_user_exit 1
|
2012-03-05 18:49:27 +07:00
|
|
|
|
arm64/sve: Core task context handling
This patch adds the core support for switching and managing the SVE
architectural state of user tasks.
Calls to the existing FPSIMD low-level save/restore functions are
factored out as new functions task_fpsimd_{save,load}(), since SVE
now dynamically may or may not need to be handled at these points
depending on the kernel configuration, hardware features discovered
at boot, and the runtime state of the task. To make these
decisions as fast as possible, const cpucaps are used where
feasible, via the system_supports_sve() helper.
The SVE registers are only tracked for threads that have explicitly
used SVE, indicated by the new thread flag TIF_SVE. Otherwise, the
FPSIMD view of the architectural state is stored in
thread.fpsimd_state as usual.
When in use, the SVE registers are not stored directly in
thread_struct due to their potentially large and variable size.
Because the task_struct slab allocator must be configured very
early during kernel boot, it is also tricky to configure it
correctly to match the maximum vector length provided by the
hardware, since this depends on examining secondary CPUs as well as
the primary. Instead, a pointer sve_state in thread_struct points
to a dynamically allocated buffer containing the SVE register data,
and code is added to allocate and free this buffer at appropriate
times.
TIF_SVE is set when taking an SVE access trap from userspace, if
suitable hardware support has been detected. This enables SVE for
the thread: a subsequent return to userspace will disable the trap
accordingly. If such a trap is taken without sufficient system-
wide hardware support, SIGILL is sent to the thread instead as if
an undefined instruction had been executed: this may happen if
userspace tries to use SVE in a system where not all CPUs support
it for example.
The kernel will clear TIF_SVE and disable SVE for the thread
whenever an explicit syscall is made by userspace. For backwards
compatibility reasons and conformance with the spirit of the base
AArch64 procedure call standard, the subset of the SVE register
state that aliases the FPSIMD registers is still preserved across a
syscall even if this happens. The remainder of the SVE register
state logically becomes zero at syscall entry, though the actual
zeroing work is currently deferred until the thread next tries to
use SVE, causing another trap to the kernel. This implementation
is suboptimal: in the future, the fastpath case may be optimised
to zero the registers in-place and leave SVE enabled for the task,
where beneficial.
TIF_SVE is also cleared in the following slowpath cases, which are
taken as reasonable hints that the task may no longer use SVE:
* exec
* fork and clone
Code is added to sync data between thread.fpsimd_state and
thread.sve_state whenever enabling/disabling SVE, in a manner
consistent with the SVE architectural programmer's model.
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Alex Bennée <alex.bennee@linaro.org>
[will: added #include to fix allnoconfig build]
[will: use enable_daif in do_sve_acc]
Signed-off-by: Will Deacon <will.deacon@arm.com>
2017-10-31 22:51:05 +07:00
|
|
|
tst x16, #_TIF_SYSCALL_WORK // check for syscall hooks
|
2014-04-30 16:51:29 +07:00
|
|
|
b.ne __sys_trace
|
arm64: syscallno is secretly an int, make it official
The upper 32 bits of the syscallno field in thread_struct are
handled inconsistently, being sometimes zero extended and sometimes
sign-extended. In fact, only the lower 32 bits seem to have any
real significance for the behaviour of the code: it's been OK to
handle the upper bits inconsistently because they don't matter.
Currently, the only place I can find where those bits are
significant is in calling trace_sys_enter(), which may be
unintentional: for example, if a compat tracer attempts to cancel a
syscall by passing -1 to (COMPAT_)PTRACE_SET_SYSCALL at the
syscall-enter-stop, it will be traced as syscall 4294967295
rather than -1 as might be expected (and as occurs for a native
tracer doing the same thing). Elsewhere, reads of syscallno cast
it to an int or truncate it.
There's also a conspicuous amount of code and casting to bodge
around the fact that although semantically an int, syscallno is
stored as a u64.
Let's not pretend any more.
In order to preserve the stp x instruction that stores the syscall
number in entry.S, this patch special-cases the layout of struct
pt_regs for big endian so that the newly 32-bit syscallno field
maps onto the low bits of the stored value. This is not beautiful,
but benchmarking of the getpid syscall on Juno suggests a
minor slowdown if the stp is split into an stp x and stp w.
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-08-01 21:35:53 +07:00
|
|
|
cmp wscno, wsc_nr // check upper syscall limit
|
2012-03-05 18:49:27 +07:00
|
|
|
b.hs ni_sys
|
arm64: syscallno is secretly an int, make it official
The upper 32 bits of the syscallno field in thread_struct are
handled inconsistently, being sometimes zero extended and sometimes
sign-extended. In fact, only the lower 32 bits seem to have any
real significance for the behaviour of the code: it's been OK to
handle the upper bits inconsistently because they don't matter.
Currently, the only place I can find where those bits are
significant is in calling trace_sys_enter(), which may be
unintentional: for example, if a compat tracer attempts to cancel a
syscall by passing -1 to (COMPAT_)PTRACE_SET_SYSCALL at the
syscall-enter-stop, it will be traced as syscall 4294967295
rather than -1 as might be expected (and as occurs for a native
tracer doing the same thing). Elsewhere, reads of syscallno cast
it to an int or truncate it.
There's also a conspicuous amount of code and casting to bodge
around the fact that although semantically an int, syscallno is
stored as a u64.
Let's not pretend any more.
In order to preserve the stp x instruction that stores the syscall
number in entry.S, this patch special-cases the layout of struct
pt_regs for big endian so that the newly 32-bit syscallno field
maps onto the low bits of the stored value. This is not beautiful,
but benchmarking of the getpid syscall on Juno suggests a
minor slowdown if the stp is split into an stp x and stp w.
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-08-01 21:35:53 +07:00
|
|
|
ldr x16, [stbl, xscno, lsl #3] // address in the syscall table
|
2014-09-29 17:44:01 +07:00
|
|
|
blr x16 // call sys_* routine
|
|
|
|
b ret_fast_syscall
|
2012-03-05 18:49:27 +07:00
|
|
|
ni_sys:
|
|
|
|
mov x0, sp
|
2014-09-29 17:44:01 +07:00
|
|
|
bl do_ni_syscall
|
|
|
|
b ret_fast_syscall
|
2012-03-05 18:49:27 +07:00
|
|
|
ENDPROC(el0_svc)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is the really slow path. We're going to be doing context
|
|
|
|
* switches, and waiting for our parent to respond.
|
|
|
|
*/
|
|
|
|
__sys_trace:
// Traced syscall path: the syscall entry code branches here when
// _TIF_SYSCALL_WORK is set in the thread flags. It calls
// syscall_trace_enter, dispatches through the syscall table, and then
// falls through into __sys_trace_return.
|
2017-08-01 21:35:54 +07:00
|
|
|
cmp wscno, #NO_SYSCALL // user-issued syscall(-1)?
|
2014-11-28 12:26:35 +07:00
|
|
|
b.ne 1f // no: skip the -ENOSYS default below
|
arm64: syscallno is secretly an int, make it official
The upper 32 bits of the syscallno field in thread_struct are
handled inconsistently, being sometimes zero extended and sometimes
sign-extended. In fact, only the lower 32 bits seem to have any
real significance for the behaviour of the code: it's been OK to
handle the upper bits inconsistently because they don't matter.
Currently, the only place I can find where those bits are
significant is in calling trace_sys_enter(), which may be
unintentional: for example, if a compat tracer attempts to cancel a
syscall by passing -1 to (COMPAT_)PTRACE_SET_SYSCALL at the
syscall-enter-stop, it will be traced as syscall 4294967295
rather than -1 as might be expected (and as occurs for a native
tracer doing the same thing). Elsewhere, reads of syscallno cast
it to an int or truncate it.
There's also a conspicuous amount of code and casting to bodge
around the fact that although semantically an int, syscallno is
stored as a u64.
Let's not pretend any more.
In order to preserve the stp x instruction that stores the syscall
number in entry.S, this patch special-cases the layout of struct
pt_regs for big endian so that the newly 32-bit syscallno field
maps onto the low bits of the stored value. This is not beautiful,
but benchmarking of the getpid syscall on Juno suggests indicates a
minor slowdown if the stp is split into an stp x and stp w.
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-08-01 21:35:53 +07:00
|
|
|
mov x0, #-ENOSYS // set default errno if so
|
2014-11-28 12:26:35 +07:00
|
|
|
str x0, [sp, #S_X0] // write it to the saved-x0 slot of the frame at sp
|
|
|
|
1: mov x0, sp // x0 = frame pointer, argument to syscall_trace_enter
|
2014-04-30 16:51:30 +07:00
|
|
|
bl syscall_trace_enter // result in w0 is tested against NO_SYSCALL next
|
2017-08-01 21:35:54 +07:00
|
|
|
cmp w0, #NO_SYSCALL // skip the syscall?
|
2014-11-28 12:26:35 +07:00
|
|
|
b.eq __sys_trace_return_skipped // yes: go to exit tracing without storing x0
|
arm64: syscallno is secretly an int, make it official
The upper 32 bits of the syscallno field in thread_struct are
handled inconsistently, being sometimes zero extended and sometimes
sign-extended. In fact, only the lower 32 bits seem to have any
real significance for the behaviour of the code: it's been OK to
handle the upper bits inconsistently because they don't matter.
Currently, the only place I can find where those bits are
significant is in calling trace_sys_enter(), which may be
unintentional: for example, if a compat tracer attempts to cancel a
syscall by passing -1 to (COMPAT_)PTRACE_SET_SYSCALL at the
syscall-enter-stop, it will be traced as syscall 4294967295
rather than -1 as might be expected (and as occurs for a native
tracer doing the same thing). Elsewhere, reads of syscallno cast
it to an int or truncate it.
There's also a conspicuous amount of code and casting to bodge
around the fact that although semantically an int, syscallno is
stored as a u64.
Let's not pretend any more.
In order to preserve the stp x instruction that stores the syscall
number in entry.S, this patch special-cases the layout of struct
pt_regs for big endian so that the newly 32-bit syscallno field
maps onto the low bits of the stored value. This is not beautiful,
but benchmarking of the getpid syscall on Juno suggests indicates a
minor slowdown if the stp is split into an stp x and stp w.
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-08-01 21:35:53 +07:00
|
|
|
mov wscno, w0 // syscall number (possibly new)
|
2012-03-05 18:49:27 +07:00
|
|
|
// NOTE(review): x1 is overwritten by the ldp below on the in-range path,
// and __ni_sys_trace sets only x0 before its call -- confirm whether this
// mov is still needed.
mov x1, sp // pointer to regs
|
arm64: syscallno is secretly an int, make it official
The upper 32 bits of the syscallno field in thread_struct are
handled inconsistently, being sometimes zero extended and sometimes
sign-extended. In fact, only the lower 32 bits seem to have any
real significance for the behaviour of the code: it's been OK to
handle the upper bits inconsistently because they don't matter.
Currently, the only place I can find where those bits are
significant is in calling trace_sys_enter(), which may be
unintentional: for example, if a compat tracer attempts to cancel a
syscall by passing -1 to (COMPAT_)PTRACE_SET_SYSCALL at the
syscall-enter-stop, it will be traced as syscall 4294967295
rather than -1 as might be expected (and as occurs for a native
tracer doing the same thing). Elsewhere, reads of syscallno cast
it to an int or truncate it.
There's also a conspicuous amount of code and casting to bodge
around the fact that although semantically an int, syscallno is
stored as a u64.
Let's not pretend any more.
In order to preserve the stp x instruction that stores the syscall
number in entry.S, this patch special-cases the layout of struct
pt_regs for big endian so that the newly 32-bit syscallno field
maps onto the low bits of the stored value. This is not beautiful,
but benchmarking of the getpid syscall on Juno suggests indicates a
minor slowdown if the stp is split into an stp x and stp w.
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-08-01 21:35:53 +07:00
|
|
|
cmp wscno, wsc_nr // check upper syscall limit
|
2014-09-29 17:44:01 +07:00
|
|
|
b.hs __ni_sys_trace // unsigned >= limit: take the do_ni_syscall path
|
2012-03-05 18:49:27 +07:00
|
|
|
ldp x0, x1, [sp] // restore the syscall args
|
|
|
|
ldp x2, x3, [sp, #S_X2] // x2, x3 reloaded from the frame
|
|
|
|
ldp x4, x5, [sp, #S_X4] // x4, x5 reloaded from the frame
|
|
|
|
ldp x6, x7, [sp, #S_X6] // x6, x7 reloaded from the frame
|
arm64: syscallno is secretly an int, make it official
The upper 32 bits of the syscallno field in thread_struct are
handled inconsistently, being sometimes zero extended and sometimes
sign-extended. In fact, only the lower 32 bits seem to have any
real significance for the behaviour of the code: it's been OK to
handle the upper bits inconsistently because they don't matter.
Currently, the only place I can find where those bits are
significant is in calling trace_sys_enter(), which may be
unintentional: for example, if a compat tracer attempts to cancel a
syscall by passing -1 to (COMPAT_)PTRACE_SET_SYSCALL at the
syscall-enter-stop, it will be traced as syscall 4294967295
rather than -1 as might be expected (and as occurs for a native
tracer doing the same thing). Elsewhere, reads of syscallno cast
it to an int or truncate it.
There's also a conspicuous amount of code and casting to bodge
around the fact that although semantically an int, syscallno is
stored as a u64.
Let's not pretend any more.
In order to preserve the stp x instruction that stores the syscall
number in entry.S, this patch special-cases the layout of struct
pt_regs for big endian so that the newly 32-bit syscallno field
maps onto the low bits of the stored value. This is not beautiful,
but benchmarking of the getpid syscall on Juno suggests indicates a
minor slowdown if the stp is split into an stp x and stp w.
Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-08-01 21:35:53 +07:00
|
|
|
ldr x16, [stbl, xscno, lsl #3] // address in the syscall table
|
2014-09-29 17:44:01 +07:00
|
|
|
blr x16 // call sys_* routine
|
2012-03-05 18:49:27 +07:00
|
|
|
|
|
|
|
// Common tail of the traced syscall path. __sys_trace falls through to
// here after its blr, and __ni_sys_trace branches here; in both cases x0
// holds the value to record as the syscall result.
__sys_trace_return:
|
2014-11-28 12:26:35 +07:00
|
|
|
str x0, [sp, #S_X0] // save returned x0
|
|
|
|
// Entered directly from __sys_trace when syscall_trace_enter returned
// NO_SYSCALL, so the saved x0 slot is left as it was.
__sys_trace_return_skipped:
|
2014-04-30 16:51:30 +07:00
|
|
|
mov x0, sp // x0 = frame pointer, argument to syscall_trace_exit
|
|
|
|
bl syscall_trace_exit
|
2012-03-05 18:49:27 +07:00
|
|
|
b ret_to_user // leave via ret_to_user
|
|
|
|
|
2014-09-29 17:44:01 +07:00
|
|
|
// Traced-path handler for an out-of-range syscall number: __sys_trace
// branches here when wscno >= wsc_nr (unsigned compare).
__ni_sys_trace:
|
|
|
|
mov x0, sp // x0 = frame pointer, argument to do_ni_syscall
|
|
|
|
bl do_ni_syscall
|
|
|
|
b __sys_trace_return // x0 is stored to the frame there before exit tracing
|
|
|
|
|
2016-07-08 23:35:50 +07:00
|
|
|
.popsection // .entry.text
|
|
|
|
|
2017-11-14 21:07:40 +07:00
|
|
|
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
|
|
|
|
/*
|
|
|
|
* Exception vectors trampoline.
|
|
|
|
*/
|
|
|
|
.pushsection ".entry.tramp.text", "ax"
|
|
|
|
|
|
|
|
/*
 * tramp_map_kernel tmp
 *
 * Rewrites TTBR1_EL1 in place: the current value is moved down by
 * (SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) and USER_ASID_FLAG is cleared.
 * tramp_unmap_kernel applies the inverse adjustment.
 *
 * \tmp: scratch general-purpose register; clobbered.
 *
 * No ISB is issued after the TTBR1_EL1 write on the common path; within this
 * file, tramp_ventry issues one after its VBAR_EL1 write.
 */
.macro tramp_map_kernel, tmp
|
|
|
|
mrs \tmp, ttbr1_el1 // \tmp = current TTBR1_EL1
|
|
|
|
sub \tmp, \tmp, #(SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) // step the value down by this fixed offset
|
|
|
|
bic \tmp, \tmp, #USER_ASID_FLAG // clear USER_ASID_FLAG
|
|
|
|
msr ttbr1_el1, \tmp // install the adjusted value
|
2017-11-14 21:29:19 +07:00
|
|
|
#ifdef CONFIG_QCOM_FALKOR_ERRATUM_1003
|
|
|
|
alternative_if ARM64_WORKAROUND_QCOM_FALKOR_E1003
|
|
|
|
/* ASID already in \tmp[63:48]; the movk sequence below only replaces \tmp[47:0] */
|
|
|
|
movk \tmp, #:abs_g2_nc:(TRAMP_VALIAS >> 12)
|
|
|
|
movk \tmp, #:abs_g1_nc:(TRAMP_VALIAS >> 12)
|
|
|
|
/* 2MB boundary containing the vectors, so we nobble the walk cache */
|
|
|
|
movk \tmp, #:abs_g0_nc:((TRAMP_VALIAS & ~(SZ_2M - 1)) >> 12)
|
|
|
|
isb
|
|
|
|
tlbi vae1, \tmp // invalidate by VA + ASID using the operand built above
|
|
|
|
dsb nsh
|
|
|
|
alternative_else_nop_endif
|
|
|
|
#endif /* CONFIG_QCOM_FALKOR_ERRATUM_1003 */
|
2017-11-14 21:07:40 +07:00
|
|
|
.endm
|
|
|
|
|
|
|
|
/*
 * tramp_unmap_kernel tmp
 *
 * Inverse of tramp_map_kernel: moves TTBR1_EL1 up by
 * (SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) and sets USER_ASID_FLAG.
 *
 * \tmp: scratch general-purpose register; clobbered.
 */
.macro tramp_unmap_kernel, tmp
|
|
|
|
mrs \tmp, ttbr1_el1 // \tmp = current TTBR1_EL1
|
|
|
|
add \tmp, \tmp, #(SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) // step the value up by the same fixed offset
|
|
|
|
orr \tmp, \tmp, #USER_ASID_FLAG // set USER_ASID_FLAG
|
|
|
|
msr ttbr1_el1, \tmp // install the adjusted value
|
|
|
|
/*
|
|
|
|
* We avoid running the post_ttbr_update_workaround here because the
|
|
|
|
* user and kernel ASIDs don't have conflicting mappings, so any
|
|
|
|
* "blessing" as described in:
|
|
|
|
*
|
|
|
|
* http://lkml.kernel.org/r/56BB848A.6060603@caviumnetworks.com
|
|
|
|
*
|
|
|
|
* will not hurt correctness. Whilst this may partially defeat the
|
|
|
|
* point of using split ASIDs in the first place, it avoids
|
|
|
|
* the hit of invalidating the entire I-cache on every return to
|
|
|
|
* userspace.
|
|
|
|
*/
|
|
|
|
.endm
|
|
|
|
|
|
|
|
/*
 * tramp_ventry regsize
 *
 * Emits one 128-byte-aligned trampoline vector entry. The entry runs
 * tramp_map_kernel, obtains the address of `vectors`, writes it to VBAR_EL1
 * and then RETs to the slot at the same offset inside `vectors` as this
 * entry has inside tramp_vectors.
 *
 * \regsize: when 64 (the default) x30 is first stashed in TPIDRRO_EL0;
 *           for any other value that save is omitted.
 *
 * x30 is used as the only scratch register throughout.
 */
.macro tramp_ventry, regsize = 64
|
|
|
|
.align 7
|
|
|
|
1:
|
|
|
|
.if \regsize == 64
|
|
|
|
msr tpidrro_el0, x30 // Restored in kernel_ventry
|
|
|
|
.endif
|
2017-11-14 23:15:59 +07:00
|
|
|
/*
|
|
|
|
* Defend against branch aliasing attacks by pushing a dummy
|
|
|
|
* entry onto the return stack and using a RET instruction to
|
|
|
|
* enter the full-fat kernel vectors.
|
|
|
|
*/
|
|
|
|
bl 2f
|
|
|
|
b . // branch-to-self at the address the BL above recorded as its return address
|
|
|
|
2:
|
2017-11-14 21:07:40 +07:00
|
|
|
tramp_map_kernel x30
|
2017-12-06 18:24:02 +07:00
|
|
|
#ifdef CONFIG_RANDOMIZE_BASE
|
|
|
|
adr x30, tramp_vectors + PAGE_SIZE // address PAGE_SIZE bytes past the start of tramp_vectors
|
|
|
|
alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003
|
|
|
|
ldr x30, [x30] // load a 64-bit pointer stored there (presumably the `.quad vectors` data page -- confirm mapping)
|
|
|
|
#else
|
2017-11-14 21:07:40 +07:00
|
|
|
ldr x30, =vectors // literal-pool load of the address of `vectors`
|
2017-12-06 18:24:02 +07:00
|
|
|
#endif
|
2017-11-14 21:07:40 +07:00
|
|
|
prfm plil1strm, [x30, #(1b - tramp_vectors)] // instruction prefetch hint for the target slot
|
|
|
|
msr vbar_el1, x30 // VBAR_EL1 = loaded vectors base
|
|
|
|
add x30, x30, #(1b - tramp_vectors) // x30 = vectors base + this entry's offset within tramp_vectors
|
|
|
|
isb
|
2017-11-14 23:15:59 +07:00
|
|
|
ret // branch to the address computed in x30
|
2017-11-14 21:07:40 +07:00
|
|
|
.endm
|
|
|
|
|
|
|
|
/*
 * tramp_exit regsize
 *
 * Exception-return sequence: points VBAR_EL1 at tramp_vectors, runs
 * tramp_unmap_kernel and executes ERET.
 *
 * \regsize: when 64 (the default) x30 is reloaded from FAR_EL1 just before
 *           the ERET; for any other value x30 is left holding the scratch
 *           value from tramp_unmap_kernel.
 */
.macro tramp_exit, regsize = 64
|
|
|
|
adr x30, tramp_vectors // x30 = PC-relative address of tramp_vectors
|
|
|
|
msr vbar_el1, x30 // VBAR_EL1 = tramp_vectors
|
|
|
|
tramp_unmap_kernel x30
|
|
|
|
.if \regsize == 64
|
|
|
|
mrs x30, far_el1 // x30 = FAR_EL1 (presumably the caller parked the real x30 there -- confirm)
|
|
|
|
.endif
|
|
|
|
eret
|
|
|
|
.endm
|
|
|
|
|
|
|
|
/*
 * Trampoline vector table, 2KB aligned (.align 11). The first 0x400 bytes
 * (eight 128-byte slots) are reserved with .space and contain no code;
 * they are followed by four 64-bit entries and then four 32-bit entries,
 * each emitted by tramp_ventry.
 */
.align 11
|
|
|
|
ENTRY(tramp_vectors)
|
|
|
|
.space 0x400 // offsets 0x000-0x3ff: left unpopulated
|
|
|
|
|
|
|
|
// offsets 0x400-0x5ff: four entries using the default regsize (64)
tramp_ventry
|
|
|
|
tramp_ventry
|
|
|
|
tramp_ventry
|
|
|
|
tramp_ventry
|
|
|
|
|
|
|
|
// offsets 0x600-0x7ff: four entries with regsize 32 (no x30 stash in tpidrro_el0)
tramp_ventry 32
|
|
|
|
tramp_ventry 32
|
|
|
|
tramp_ventry 32
|
|
|
|
tramp_ventry 32
|
|
|
|
END(tramp_vectors)
|
|
|
|
|
|
|
|
/*
 * Exit stub using tramp_exit with its default regsize of 64, i.e. the
 * variant that reloads x30 from FAR_EL1 before the ERET.
 */
ENTRY(tramp_exit_native)
|
|
|
|
tramp_exit
|
|
|
|
END(tramp_exit_native)
|
|
|
|
|
|
|
|
/*
 * Exit stub using tramp_exit with regsize 32, i.e. the variant that does
 * not reload x30 from FAR_EL1 before the ERET.
 */
ENTRY(tramp_exit_compat)
|
|
|
|
tramp_exit 32
|
|
|
|
END(tramp_exit_compat)
|
|
|
|
|
|
|
|
/*
 * Emit the pending literal pool here, before the section is popped, so that
 * literals created by `ldr xN, =sym` in the code above (e.g. `ldr x30, =vectors`
 * in tramp_ventry when CONFIG_RANDOMIZE_BASE is off) are placed in this section.
 */
.ltorg
|
|
|
|
.popsection // .entry.tramp.text
|
2017-12-06 18:24:02 +07:00
|
|
|
/*
 * Page-aligned read-only data holding a single 64-bit word: the address of
 * `vectors`. Exported as __entry_tramp_data_start.
 * NOTE(review): tramp_ventry loads a pointer from tramp_vectors + PAGE_SIZE
 * under the same config option; presumably this page is mapped there --
 * confirm against the trampoline mapping code.
 */
#ifdef CONFIG_RANDOMIZE_BASE
|
|
|
|
.pushsection ".rodata", "a"
|
|
|
|
.align PAGE_SHIFT
|
|
|
|
.globl __entry_tramp_data_start
|
|
|
|
__entry_tramp_data_start:
|
|
|
|
.quad vectors // 64-bit address of the `vectors` symbol
|
|
|
|
.popsection // .rodata
|
|
|
|
#endif /* CONFIG_RANDOMIZE_BASE */
|
2017-11-14 21:07:40 +07:00
|
|
|
#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */
|
|
|
|
|
2012-03-05 18:49:27 +07:00
|
|
|
/*
|
|
|
|
* Special system call wrappers.
|
|
|
|
*/
|
|
|
|
/*
 * Places the current stack pointer in x0 and tail-branches (b, not bl) to
 * sys_rt_sigreturn, so that function returns directly to this wrapper's
 * caller.
 */
ENTRY(sys_rt_sigreturn_wrapper)
|
|
|
|
mov x0, sp // x0 = sp (presumably the saved pt_regs frame -- confirm)
|
|
|
|
b sys_rt_sigreturn
|
|
|
|
ENDPROC(sys_rt_sigreturn_wrapper)
|
2017-07-26 22:05:20 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Register switch for AArch64. The callee-saved registers need to be saved
|
|
|
|
* and restored. On entry:
|
|
|
|
* x0 = previous task_struct (must be preserved across the switch)
|
|
|
|
* x1 = next task_struct
|
|
|
|
* Previous and next are guaranteed not to be the same.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
/*
 * Register roles inside cpu_switch_to:
 *   x0  = previous task (never written here, so it is still intact at ret)
 *   x1  = next task (also written to SP_EL0 before returning)
 *   x8  = cursor into the task's save area at offset THREAD_CPU_CONTEXT
 *   x9  = staging register for sp
 *   x10 = THREAD_CPU_CONTEXT offset
 * Save-area layout, in order: x19..x28, x29, sp, lr (13 x 8 bytes).
 */
ENTRY(cpu_switch_to)
|
|
|
|
mov x10, #THREAD_CPU_CONTEXT
|
|
|
|
add x8, x0, x10 // x8 = save area of the previous task
|
|
|
|
mov x9, sp // sp cannot be an stp source operand, so stage it in x9
|
|
|
|
stp x19, x20, [x8], #16 // store callee-saved registers
|
|
|
|
stp x21, x22, [x8], #16
|
|
|
|
stp x23, x24, [x8], #16
|
|
|
|
stp x25, x26, [x8], #16
|
|
|
|
stp x27, x28, [x8], #16
|
|
|
|
stp x29, x9, [x8], #16 // frame pointer and the staged sp
|
|
|
|
str lr, [x8] // return address of the outgoing task
|
|
|
|
add x8, x1, x10 // x8 = save area of the next task
|
|
|
|
ldp x19, x20, [x8], #16 // restore callee-saved registers
|
|
|
|
ldp x21, x22, [x8], #16
|
|
|
|
ldp x23, x24, [x8], #16
|
|
|
|
ldp x25, x26, [x8], #16
|
|
|
|
ldp x27, x28, [x8], #16
|
|
|
|
ldp x29, x9, [x8], #16 // frame pointer and saved sp (into x9)
|
|
|
|
ldr lr, [x8] // return address of the incoming task
|
|
|
|
mov sp, x9 // switch to the incoming task's stack
|
|
|
|
msr sp_el0, x1 // SP_EL0 = next task pointer
|
|
|
|
ret // returns to the lr just loaded, i.e. into the next task
|
|
|
|
ENDPROC(cpu_switch_to)
|
|
|
|
NOKPROBE(cpu_switch_to)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is how we return from a fork.
|
|
|
|
*/
|
|
|
|
/*
 * Calls schedule_tail(); then, if x19 is non-zero, calls the function whose
 * address is in x19 with x20 as its single argument. Finally loads tsk via
 * get_thread_info and branches to ret_to_user.
 * NOTE(review): x19/x20 are presumably seeded by the code that sets up the
 * new task's saved context -- confirm against copy_thread().
 */
ENTRY(ret_from_fork)
|
|
|
|
bl schedule_tail
|
|
|
|
cbz x19, 1f // not a kernel thread
|
|
|
|
mov x0, x20 // argument for the function in x19
|
|
|
|
blr x19 // call it; execution continues at 1: if it returns
|
|
|
|
1: get_thread_info tsk
|
|
|
|
b ret_to_user
|
|
|
|
ENDPROC(ret_from_fork)
|
|
|
|
NOKPROBE(ret_from_fork)
|