linux_dsm_epyc7002/arch/x86/kernel/entry_32.S

1116 lines
26 KiB
ArmAsm
Raw Normal View History

/*
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
/*
* entry.S contains the system-call and fault low-level handling routines.
* This also contains the timer-interrupt handler, as well as all interrupts
* and faults that can result in a task-switch.
*
* NOTE: This code handles signal-recognition, which happens every time
* after a timer-interrupt and after each system call.
*
* I changed all the .align's to 4 (16 byte alignment), as that's faster
* on a 486.
*
* Stack layout in 'syscall_exit':
* ptrace needs to have all regs on the stack.
* if the order here is changed, it needs to be
* updated in fork.c:copy_process, signal.c:do_signal,
* ptrace.c and ptrace.h
*
* 0(%esp) - %ebx
* 4(%esp) - %ecx
* 8(%esp) - %edx
* C(%esp) - %esi
* 10(%esp) - %edi
* 14(%esp) - %ebp
* 18(%esp) - %eax
* 1C(%esp) - %ds
* 20(%esp) - %es
* 24(%esp) - %fs
* 28(%esp) - orig_eax
* 2C(%esp) - %eip
* 30(%esp) - %cs
* 34(%esp) - %eflags
* 38(%esp) - %oldesp
* 3C(%esp) - %oldss
*
* "current" is in register %ebx during any slow entries.
*/
#include <linux/linkage.h>
#include <asm/thread_info.h>
#include <asm/irqflags.h>
#include <asm/errno.h>
#include <asm/segment.h>
#include <asm/smp.h>
#include <asm/page.h>
#include <asm/desc.h>
#include <asm/percpu.h>
#include <asm/dwarf2.h>
#include <asm/processor-flags.h>
#include "irq_vectors.h"
/*
* We use macros for low-level operations which need to be overridden
* for paravirtualization. The following will never clobber any registers:
* INTERRUPT_RETURN (aka. "iret")
* GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
* ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
*
* For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
* specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
* Allowing a register to be clobbered can shrink the paravirt replacement
* enough to patch inline, increasing performance.
*/
#define nr_syscalls ((syscall_table_size)/4)
#ifdef CONFIG_PREEMPT
#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
#else
#define preempt_stop(clobbers)
#define resume_kernel restore_nocheck
#endif
.macro TRACE_IRQS_IRET
#ifdef CONFIG_TRACE_IRQFLAGS
testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off?
jz 1f
TRACE_IRQS_ON
1:
#endif
.endm
#ifdef CONFIG_VM86
#define resume_userspace_sig check_userspace
#else
#define resume_userspace_sig resume_userspace
#endif
#define SAVE_ALL \
cld; \
pushl %fs; \
CFI_ADJUST_CFA_OFFSET 4;\
/*CFI_REL_OFFSET fs, 0;*/\
pushl %es; \
CFI_ADJUST_CFA_OFFSET 4;\
/*CFI_REL_OFFSET es, 0;*/\
pushl %ds; \
CFI_ADJUST_CFA_OFFSET 4;\
/*CFI_REL_OFFSET ds, 0;*/\
pushl %eax; \
CFI_ADJUST_CFA_OFFSET 4;\
CFI_REL_OFFSET eax, 0;\
pushl %ebp; \
CFI_ADJUST_CFA_OFFSET 4;\
CFI_REL_OFFSET ebp, 0;\
pushl %edi; \
CFI_ADJUST_CFA_OFFSET 4;\
CFI_REL_OFFSET edi, 0;\
pushl %esi; \
CFI_ADJUST_CFA_OFFSET 4;\
CFI_REL_OFFSET esi, 0;\
pushl %edx; \
CFI_ADJUST_CFA_OFFSET 4;\
CFI_REL_OFFSET edx, 0;\
pushl %ecx; \
CFI_ADJUST_CFA_OFFSET 4;\
CFI_REL_OFFSET ecx, 0;\
pushl %ebx; \
CFI_ADJUST_CFA_OFFSET 4;\
CFI_REL_OFFSET ebx, 0;\
movl $(__USER_DS), %edx; \
movl %edx, %ds; \
movl %edx, %es; \
movl $(__KERNEL_PERCPU), %edx; \
movl %edx, %fs
#define RESTORE_INT_REGS \
popl %ebx; \
CFI_ADJUST_CFA_OFFSET -4;\
CFI_RESTORE ebx;\
popl %ecx; \
CFI_ADJUST_CFA_OFFSET -4;\
CFI_RESTORE ecx;\
popl %edx; \
CFI_ADJUST_CFA_OFFSET -4;\
CFI_RESTORE edx;\
popl %esi; \
CFI_ADJUST_CFA_OFFSET -4;\
CFI_RESTORE esi;\
popl %edi; \
CFI_ADJUST_CFA_OFFSET -4;\
CFI_RESTORE edi;\
popl %ebp; \
CFI_ADJUST_CFA_OFFSET -4;\
CFI_RESTORE ebp;\
popl %eax; \
CFI_ADJUST_CFA_OFFSET -4;\
CFI_RESTORE eax
#define RESTORE_REGS \
RESTORE_INT_REGS; \
1: popl %ds; \
CFI_ADJUST_CFA_OFFSET -4;\
/*CFI_RESTORE ds;*/\
2: popl %es; \
CFI_ADJUST_CFA_OFFSET -4;\
/*CFI_RESTORE es;*/\
3: popl %fs; \
CFI_ADJUST_CFA_OFFSET -4;\
/*CFI_RESTORE fs;*/\
.pushsection .fixup,"ax"; \
4: movl $0,(%esp); \
jmp 1b; \
5: movl $0,(%esp); \
jmp 2b; \
6: movl $0,(%esp); \
jmp 3b; \
.section __ex_table,"a";\
.align 4; \
.long 1b,4b; \
.long 2b,5b; \
.long 3b,6b; \
.popsection
#define RING0_INT_FRAME \
CFI_STARTPROC simple;\
CFI_SIGNAL_FRAME;\
CFI_DEF_CFA esp, 3*4;\
/*CFI_OFFSET cs, -2*4;*/\
CFI_OFFSET eip, -3*4
#define RING0_EC_FRAME \
CFI_STARTPROC simple;\
CFI_SIGNAL_FRAME;\
CFI_DEF_CFA esp, 4*4;\
/*CFI_OFFSET cs, -2*4;*/\
CFI_OFFSET eip, -3*4
#define RING0_PTREGS_FRAME \
CFI_STARTPROC simple;\
CFI_SIGNAL_FRAME;\
CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
/*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
/*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
/*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
CFI_OFFSET ebx, PT_EBX-PT_OLDESP
ENTRY(ret_from_fork)
CFI_STARTPROC
pushl %eax
CFI_ADJUST_CFA_OFFSET 4
call schedule_tail
GET_THREAD_INFO(%ebp)
popl %eax
CFI_ADJUST_CFA_OFFSET -4
pushl $0x0202 # Reset kernel eflags
CFI_ADJUST_CFA_OFFSET 4
popfl
CFI_ADJUST_CFA_OFFSET -4
jmp syscall_exit
CFI_ENDPROC
END(ret_from_fork)
/*
* Return to user mode is not as complex as all this looks,
* but we want the default path for a system call return to
* go as quickly as possible which is why some of this is
* less clear than it otherwise should be.
*/
# userspace resumption stub bypassing syscall exit tracing
ALIGN
RING0_PTREGS_FRAME
ret_from_exception:
preempt_stop(CLBR_ANY)
ret_from_intr:
GET_THREAD_INFO(%ebp)
check_userspace:
movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
movb PT_CS(%esp), %al
andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
cmpl $USER_RPL, %eax
jb resume_kernel # not returning to v8086 or userspace
ENTRY(resume_userspace)
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
movl TI_flags(%ebp), %ecx
andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
# int/exception return?
jne work_pending
jmp restore_all
END(ret_from_exception)
#ifdef CONFIG_PREEMPT
ENTRY(resume_kernel)
DISABLE_INTERRUPTS(CLBR_ANY)
cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
jnz restore_nocheck
need_resched:
movl TI_flags(%ebp), %ecx # need_resched set ?
testb $_TIF_NEED_RESCHED, %cl
jz restore_all
testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ?
jz restore_all
call preempt_schedule_irq
jmp need_resched
END(resume_kernel)
#endif
CFI_ENDPROC
/* SYSENTER_RETURN points to after the "sysenter" instruction in
the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
# sysenter call handler stub
ENTRY(ia32_sysenter_target)
CFI_STARTPROC simple
CFI_SIGNAL_FRAME
CFI_DEF_CFA esp, 0
CFI_REGISTER esp, ebp
movl TSS_sysenter_sp0(%esp),%esp
sysenter_past_esp:
/*
* Interrupts are disabled here, but we can't trace it until
* enough kernel state to call TRACE_IRQS_OFF can be called - but
* we immediately enable interrupts at that point anyway.
*/
pushl $(__USER_DS)
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET ss, 0*/
pushl %ebp
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET esp, 0
pushfl
orl $X86_EFLAGS_IF, (%esp)
CFI_ADJUST_CFA_OFFSET 4
pushl $(__USER_CS)
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET cs, 0*/
[PATCH] vdso: randomize the i386 vDSO by moving it into a vma Move the i386 VDSO down into a vma and thus randomize it. Besides the security implications, this feature also helps debuggers, which can COW a vma-backed VDSO just like a normal DSO and can thus do single-stepping and other debugging features. It's good for hypervisors (Xen, VMWare) too, which typically live in the same high-mapped address space as the VDSO, hence whenever the VDSO is used, they get lots of guest pagefaults and have to fix such guest accesses up - which slows things down instead of speeding things up (the primary purpose of the VDSO). There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support for older glibcs that still rely on a prelinked high-mapped VDSO. Newer distributions (using glibc 2.3.3 or later) can turn this option off. Turning it off is also recommended for security reasons: attackers cannot use the predictable high-mapped VDSO page as syscall trampoline anymore. There is a new vdso=[0|1] boot option as well, and a runtime /proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned on/off. (This version of the VDSO-randomization patch also has working ELF coredumping, the previous patch crashed in the coredumping code.) This code is a combined work of the exec-shield VDSO randomization code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell started this patch and i completed it. [akpm@osdl.org: cleanups] [akpm@osdl.org: compile fix] [akpm@osdl.org: compile fix 2] [akpm@osdl.org: compile fix 3] [akpm@osdl.org: revernt MAXMEM change] Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Arjan van de Ven <arjan@infradead.org> Cc: Gerd Hoffmann <kraxel@suse.de> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Zachary Amsden <zach@vmware.com> Cc: Andi Kleen <ak@muc.de> Cc: Jan Beulich <jbeulich@novell.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 16:53:50 +07:00
/*
* Push current_thread_info()->sysenter_return to the stack.
* A tiny bit of offset fixup is necessary - 4*4 means the 4 words
* pushed above; +8 corresponds to copy_thread's esp0 setting.
*/
pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET eip, 0
pushl %eax
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
ENABLE_INTERRUPTS(CLBR_NONE)
/*
* Load the potential sixth argument from user stack.
* Careful about security.
*/
cmpl $__PAGE_OFFSET-3,%ebp
jae syscall_fault
1: movl (%ebp),%ebp
movl %ebp,PT_EBP(%esp)
.section __ex_table,"a"
.align 4
.long 1b,syscall_fault
.previous
GET_THREAD_INFO(%ebp)
/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
[PATCH] UML Support - Ptrace: adds the host SYSEMU support, for UML and general usage Jeff Dike <jdike@addtoit.com>, Paolo 'Blaisorblade' Giarrusso <blaisorblade_spam@yahoo.it>, Bodo Stroesser <bstroesser@fujitsu-siemens.com> Adds a new ptrace(2) mode, called PTRACE_SYSEMU, resembling PTRACE_SYSCALL except that the kernel does not execute the requested syscall; this is useful to improve performance for virtual environments, like UML, which want to run the syscall on their own. In fact, using PTRACE_SYSCALL means stopping child execution twice, on entry and on exit, and each time you also have two context switches; with SYSEMU you avoid the 2nd stop and so save two context switches per syscall. Also, some architectures don't have support in the host for changing the syscall number via ptrace(), which is currently needed to skip syscall execution (UML turns any syscall into getpid() to avoid it being executed on the host). Fixing that is hard, while SYSEMU is easier to implement. * This version of the patch includes some suggestions of Jeff Dike to avoid adding any instructions to the syscall fast path, plus some other little changes, by myself, to make it work even when the syscall is executed with SYSENTER (but I'm unsure about them). It has been widely tested for quite a lot of time. * Various fixed were included to handle the various switches between various states, i.e. when for instance a syscall entry is traced with one of PT_SYSCALL / _SYSEMU / _SINGLESTEP and another one is used on exit. Basically, this is done by remembering which one of them was used even after the call to ptrace_notify(). * We're combining TIF_SYSCALL_EMU with TIF_SYSCALL_TRACE or TIF_SINGLESTEP to make do_syscall_trace() notice that the current syscall was started with SYSEMU on entry, so that no notification ought to be done in the exit path; this is a bit of a hack, so this problem is solved in another way in next patches. * Also, the effects of the patch: "Ptrace - i386: fix Syscall Audit interaction with singlestep" are cancelled; they are restored back in the last patch of this series. Detailed descriptions of the patches doing this kind of processing follow (but I've already summed everything up). * Fix behaviour when changing interception kind #1. In do_syscall_trace(), we check the status of the TIF_SYSCALL_EMU flag only after doing the debugger notification; but the debugger might have changed the status of this flag because he continued execution with PTRACE_SYSCALL, so this is wrong. This patch fixes it by saving the flag status before calling ptrace_notify(). * Fix behaviour when changing interception kind #2: avoid intercepting syscall on return when using SYSCALL again. A guest process switching from using PTRACE_SYSEMU to PTRACE_SYSCALL crashes. The problem is in arch/i386/kernel/entry.S. The current SYSEMU patch inhibits the syscall-handler to be called, but does not prevent do_syscall_trace() to be called after this for syscall completion interception. The appended patch fixes this. It reuses the flag TIF_SYSCALL_EMU to remember "we come from PTRACE_SYSEMU and now are in PTRACE_SYSCALL", since the flag is unused in the depicted situation. * Fix behaviour when changing interception kind #3: avoid intercepting syscall on return when using SINGLESTEP. When testing 2.6.9 and the skas3.v6 patch, with my latest patch and had problems with singlestepping on UML in SKAS with SYSEMU. It looped receiving SIGTRAPs without moving forward. EIP of the traced process was the same for all SIGTRAPs. What's missing is to handle switching from PTRACE_SYSCALL_EMU to PTRACE_SINGLESTEP in a way very similar to what is done for the change from PTRACE_SYSCALL_EMU to PTRACE_SYSCALL_TRACE. I.e., after calling ptrace(PTRACE_SYSEMU), on the return path, the debugger is notified and then wake ups the process; the syscall is executed (or skipped, when do_syscall_trace() returns 0, i.e. when using PTRACE_SYSEMU), and do_syscall_trace() is called again. Since we are on the return path of a SYSEMU'd syscall, if the wake up is performed through ptrace(PTRACE_SYSCALL), we must still avoid notifying the parent of the syscall exit. Now, this behaviour is extended even to resuming with PTRACE_SINGLESTEP. Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it> Cc: Jeff Dike <jdike@addtoit.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-04 05:57:18 +07:00
testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
jnz syscall_trace_entry
cmpl $(nr_syscalls), %eax
jae syscall_badsys
call *sys_call_table(,%eax,4)
movl %eax,PT_EAX(%esp)
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
testw $_TIF_ALLWORK_MASK, %cx
jne syscall_exit_work
/* if something modifies registers it must also disable sysexit */
movl PT_EIP(%esp), %edx
movl PT_OLDESP(%esp), %ecx
xorl %ebp,%ebp
TRACE_IRQS_ON
1: mov PT_FS(%esp), %fs
ENABLE_INTERRUPTS_SYSCALL_RET
CFI_ENDPROC
.pushsection .fixup,"ax"
2: movl $0,PT_FS(%esp)
jmp 1b
.section __ex_table,"a"
.align 4
.long 1b,2b
.popsection
ENDPROC(ia32_sysenter_target)
# system call handler stub
ENTRY(system_call)
RING0_INT_FRAME # can't unwind into user space anyway
pushl %eax # save orig_eax
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
GET_THREAD_INFO(%ebp)
[PATCH] UML Support - Ptrace: adds the host SYSEMU support, for UML and general usage Jeff Dike <jdike@addtoit.com>, Paolo 'Blaisorblade' Giarrusso <blaisorblade_spam@yahoo.it>, Bodo Stroesser <bstroesser@fujitsu-siemens.com> Adds a new ptrace(2) mode, called PTRACE_SYSEMU, resembling PTRACE_SYSCALL except that the kernel does not execute the requested syscall; this is useful to improve performance for virtual environments, like UML, which want to run the syscall on their own. In fact, using PTRACE_SYSCALL means stopping child execution twice, on entry and on exit, and each time you also have two context switches; with SYSEMU you avoid the 2nd stop and so save two context switches per syscall. Also, some architectures don't have support in the host for changing the syscall number via ptrace(), which is currently needed to skip syscall execution (UML turns any syscall into getpid() to avoid it being executed on the host). Fixing that is hard, while SYSEMU is easier to implement. * This version of the patch includes some suggestions of Jeff Dike to avoid adding any instructions to the syscall fast path, plus some other little changes, by myself, to make it work even when the syscall is executed with SYSENTER (but I'm unsure about them). It has been widely tested for quite a lot of time. * Various fixed were included to handle the various switches between various states, i.e. when for instance a syscall entry is traced with one of PT_SYSCALL / _SYSEMU / _SINGLESTEP and another one is used on exit. Basically, this is done by remembering which one of them was used even after the call to ptrace_notify(). * We're combining TIF_SYSCALL_EMU with TIF_SYSCALL_TRACE or TIF_SINGLESTEP to make do_syscall_trace() notice that the current syscall was started with SYSEMU on entry, so that no notification ought to be done in the exit path; this is a bit of a hack, so this problem is solved in another way in next patches. * Also, the effects of the patch: "Ptrace - i386: fix Syscall Audit interaction with singlestep" are cancelled; they are restored back in the last patch of this series. Detailed descriptions of the patches doing this kind of processing follow (but I've already summed everything up). * Fix behaviour when changing interception kind #1. In do_syscall_trace(), we check the status of the TIF_SYSCALL_EMU flag only after doing the debugger notification; but the debugger might have changed the status of this flag because he continued execution with PTRACE_SYSCALL, so this is wrong. This patch fixes it by saving the flag status before calling ptrace_notify(). * Fix behaviour when changing interception kind #2: avoid intercepting syscall on return when using SYSCALL again. A guest process switching from using PTRACE_SYSEMU to PTRACE_SYSCALL crashes. The problem is in arch/i386/kernel/entry.S. The current SYSEMU patch inhibits the syscall-handler to be called, but does not prevent do_syscall_trace() to be called after this for syscall completion interception. The appended patch fixes this. It reuses the flag TIF_SYSCALL_EMU to remember "we come from PTRACE_SYSEMU and now are in PTRACE_SYSCALL", since the flag is unused in the depicted situation. * Fix behaviour when changing interception kind #3: avoid intercepting syscall on return when using SINGLESTEP. When testing 2.6.9 and the skas3.v6 patch, with my latest patch and had problems with singlestepping on UML in SKAS with SYSEMU. It looped receiving SIGTRAPs without moving forward. EIP of the traced process was the same for all SIGTRAPs. What's missing is to handle switching from PTRACE_SYSCALL_EMU to PTRACE_SINGLESTEP in a way very similar to what is done for the change from PTRACE_SYSCALL_EMU to PTRACE_SYSCALL_TRACE. I.e., after calling ptrace(PTRACE_SYSEMU), on the return path, the debugger is notified and then wake ups the process; the syscall is executed (or skipped, when do_syscall_trace() returns 0, i.e. when using PTRACE_SYSEMU), and do_syscall_trace() is called again. Since we are on the return path of a SYSEMU'd syscall, if the wake up is performed through ptrace(PTRACE_SYSCALL), we must still avoid notifying the parent of the syscall exit. Now, this behaviour is extended even to resuming with PTRACE_SINGLESTEP. Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it> Cc: Jeff Dike <jdike@addtoit.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-04 05:57:18 +07:00
# system call tracing in operation / emulation
/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
[PATCH] UML Support - Ptrace: adds the host SYSEMU support, for UML and general usage Jeff Dike <jdike@addtoit.com>, Paolo 'Blaisorblade' Giarrusso <blaisorblade_spam@yahoo.it>, Bodo Stroesser <bstroesser@fujitsu-siemens.com> Adds a new ptrace(2) mode, called PTRACE_SYSEMU, resembling PTRACE_SYSCALL except that the kernel does not execute the requested syscall; this is useful to improve performance for virtual environments, like UML, which want to run the syscall on their own. In fact, using PTRACE_SYSCALL means stopping child execution twice, on entry and on exit, and each time you also have two context switches; with SYSEMU you avoid the 2nd stop and so save two context switches per syscall. Also, some architectures don't have support in the host for changing the syscall number via ptrace(), which is currently needed to skip syscall execution (UML turns any syscall into getpid() to avoid it being executed on the host). Fixing that is hard, while SYSEMU is easier to implement. * This version of the patch includes some suggestions of Jeff Dike to avoid adding any instructions to the syscall fast path, plus some other little changes, by myself, to make it work even when the syscall is executed with SYSENTER (but I'm unsure about them). It has been widely tested for quite a lot of time. * Various fixed were included to handle the various switches between various states, i.e. when for instance a syscall entry is traced with one of PT_SYSCALL / _SYSEMU / _SINGLESTEP and another one is used on exit. Basically, this is done by remembering which one of them was used even after the call to ptrace_notify(). * We're combining TIF_SYSCALL_EMU with TIF_SYSCALL_TRACE or TIF_SINGLESTEP to make do_syscall_trace() notice that the current syscall was started with SYSEMU on entry, so that no notification ought to be done in the exit path; this is a bit of a hack, so this problem is solved in another way in next patches. * Also, the effects of the patch: "Ptrace - i386: fix Syscall Audit interaction with singlestep" are cancelled; they are restored back in the last patch of this series. Detailed descriptions of the patches doing this kind of processing follow (but I've already summed everything up). * Fix behaviour when changing interception kind #1. In do_syscall_trace(), we check the status of the TIF_SYSCALL_EMU flag only after doing the debugger notification; but the debugger might have changed the status of this flag because he continued execution with PTRACE_SYSCALL, so this is wrong. This patch fixes it by saving the flag status before calling ptrace_notify(). * Fix behaviour when changing interception kind #2: avoid intercepting syscall on return when using SYSCALL again. A guest process switching from using PTRACE_SYSEMU to PTRACE_SYSCALL crashes. The problem is in arch/i386/kernel/entry.S. The current SYSEMU patch inhibits the syscall-handler to be called, but does not prevent do_syscall_trace() to be called after this for syscall completion interception. The appended patch fixes this. It reuses the flag TIF_SYSCALL_EMU to remember "we come from PTRACE_SYSEMU and now are in PTRACE_SYSCALL", since the flag is unused in the depicted situation. * Fix behaviour when changing interception kind #3: avoid intercepting syscall on return when using SINGLESTEP. When testing 2.6.9 and the skas3.v6 patch, with my latest patch and had problems with singlestepping on UML in SKAS with SYSEMU. It looped receiving SIGTRAPs without moving forward. EIP of the traced process was the same for all SIGTRAPs. What's missing is to handle switching from PTRACE_SYSCALL_EMU to PTRACE_SINGLESTEP in a way very similar to what is done for the change from PTRACE_SYSCALL_EMU to PTRACE_SYSCALL_TRACE. I.e., after calling ptrace(PTRACE_SYSEMU), on the return path, the debugger is notified and then wake ups the process; the syscall is executed (or skipped, when do_syscall_trace() returns 0, i.e. when using PTRACE_SYSEMU), and do_syscall_trace() is called again. Since we are on the return path of a SYSEMU'd syscall, if the wake up is performed through ptrace(PTRACE_SYSCALL), we must still avoid notifying the parent of the syscall exit. Now, this behaviour is extended even to resuming with PTRACE_SINGLESTEP. Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it> Cc: Jeff Dike <jdike@addtoit.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-04 05:57:18 +07:00
testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
jnz syscall_trace_entry
cmpl $(nr_syscalls), %eax
jae syscall_badsys
syscall_call:
call *sys_call_table(,%eax,4)
movl %eax,PT_EAX(%esp) # store the return value
syscall_exit:
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
TRACE_IRQS_OFF
testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
i386: fix regression, endless loop in ptrace singlestep over an int80 The commit 635cf99a80f4ebee59d70eb64bb85ce829e4591f introduced a regression. Executing a ptrace single step after certain int80 accesses will infinitely loop and never advance the PC. The TIF_SINGLESTEP check should be done on the return from the syscall and not before it. I loops on each single step on the pop right after the int80 which writes out to the console. At that point you can issue as many single steps as you want and it will not advance any further. The test case is below: /* Test whether singlestep through an int80 syscall works. */ #define _GNU_SOURCE #include <stdio.h> #include <unistd.h> #include <fcntl.h> #include <sys/ptrace.h> #include <sys/wait.h> #include <sys/mman.h> #include <asm/user.h> #include <string.h> static int child, status; static struct user_regs_struct regs; static void do_child() { char str[80] = "child: int80 test\n"; ptrace(PTRACE_TRACEME, 0, 0, 0); kill(getpid(), SIGUSR1); write(fileno(stdout),str,strlen(str)); asm ("int $0x80" : : "a" (20)); /* getpid */ } static void do_parent() { unsigned long eip, expected = 0; again: waitpid(child, &status, 0); if (WIFEXITED(status) || WIFSIGNALED(status)) return; if (WIFSTOPPED(status)) { ptrace(PTRACE_GETREGS, child, 0, &regs); eip = regs.eip; if (expected) fprintf(stderr, "child stop @ %08lx, expected %08lx %s\n", eip, expected, eip == expected ? "" : " <== ERROR"); if (*(unsigned short *)eip == 0x80cd) { fprintf(stderr, "int 0x80 at %08x\n", (unsigned int)eip); expected = eip + 2; } else expected = 0; ptrace(PTRACE_SINGLESTEP, child, NULL, NULL); } goto again; } int main(int argc, char * const argv[]) { child = fork(); if (child) do_parent(); else do_child(); return 0; } Signed-off-by: Jason Wessel <jason.wessel@windriver.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: <stable@kernel.org> Cc: Chuck Ebbert <76306.1226@compuserve.com> Acked-by: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-06 16:39:50 +07:00
jz no_singlestep
orl $_TIF_SINGLESTEP,TI_flags(%ebp)
no_singlestep:
movl TI_flags(%ebp), %ecx
testw $_TIF_ALLWORK_MASK, %cx # current->work
jne syscall_exit_work
restore_all:
movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
# Warning: PT_OLDSS(%esp) contains the wrong/random values if we
# are returning to the kernel.
# See comments in process.c:copy_thread() for details.
movb PT_OLDSS(%esp), %ah
movb PT_CS(%esp), %al
andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
CFI_REMEMBER_STATE
je ldt_ss # returning to user-space with LDT SS
restore_nocheck:
TRACE_IRQS_IRET
restore_nocheck_notrace:
RESTORE_REGS
addl $4, %esp # skip orig_eax/error_code
CFI_ADJUST_CFA_OFFSET -4
irq_return:
INTERRUPT_RETURN
.section .fixup,"ax"
ENTRY(iret_exc)
pushl $0 # no error code
pushl $do_iret_error
jmp error_code
.previous
.section __ex_table,"a"
.align 4
.long irq_return,iret_exc
.previous
CFI_RESTORE_STATE
ldt_ss:
larl PT_OLDSS(%esp), %eax
jnz restore_nocheck
testl $0x00400000, %eax # returning to 32bit stack?
jnz restore_nocheck # allright, normal return
#ifdef CONFIG_PARAVIRT
/*
* The kernel can't run on a non-flat stack if paravirt mode
* is active. Rather than try to fixup the high bits of
* ESP, bypass this code entirely. This may break DOSemu
* and/or Wine support in a paravirt VM, although the option
* is still available to implement the setting of the high
* 16-bits in the INTERRUPT_RETURN paravirt-op.
*/
paravirt: refactor struct paravirt_ops into smaller pv_*_ops This patch refactors the paravirt_ops structure into groups of functionally related ops: pv_info - random info, rather than function entrypoints pv_init_ops - functions used at boot time (some for module_init too) pv_misc_ops - lazy mode, which didn't fit well anywhere else pv_time_ops - time-related functions pv_cpu_ops - various privileged instruction ops pv_irq_ops - operations for managing interrupt state pv_apic_ops - APIC operations pv_mmu_ops - operations for managing pagetables There are several motivations for this: 1. Some of these ops will be general to all x86, and some will be i386/x86-64 specific. This makes it easier to share common stuff while allowing separate implementations where needed. 2. At the moment we must export all of paravirt_ops, but modules only need selected parts of it. This allows us to export on a case by case basis (and also choose which export license we want to apply). 3. Functional groupings make things a bit more readable. Struct paravirt_ops is now only used as a template to generate patch-site identifiers, and to extract function pointers for inserting into jmp/calls when patching. It is only instantiated when needed. Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Cc: Andi Kleen <ak@suse.de> Cc: Zach Amsden <zach@vmware.com> Cc: Avi Kivity <avi@qumranet.com> Cc: Anthony Liguory <aliguori@us.ibm.com> Cc: "Glauber de Oliveira Costa" <glommer@gmail.com> Cc: Jun Nakajima <jun.nakajima@intel.com>
2007-10-17 01:51:29 +07:00
cmpl $0, pv_info+PARAVIRT_enabled
jne restore_nocheck
#endif
/* If returning to userspace with 16bit stack,
* try to fix the higher word of ESP, as the CPU
* won't restore it.
* This is an "official" bug of all the x86-compatible
* CPUs, which we can try to work around to make
* dosemu and wine happy. */
movl PT_OLDESP(%esp), %eax
movl %esp, %edx
call patch_espfix_desc
pushl $__ESPFIX_SS
CFI_ADJUST_CFA_OFFSET 4
pushl %eax
CFI_ADJUST_CFA_OFFSET 4
DISABLE_INTERRUPTS(CLBR_EAX)
TRACE_IRQS_OFF
lss (%esp), %esp
CFI_ADJUST_CFA_OFFSET -8
jmp restore_nocheck
CFI_ENDPROC
ENDPROC(system_call)
# perform work that needs to be done immediately before resumption
ALIGN
RING0_PTREGS_FRAME # can't unwind into user space anyway
work_pending:
testb $_TIF_NEED_RESCHED, %cl
jz work_notifysig
work_resched:
call schedule
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
# than syscall tracing?
jz restore_all
testb $_TIF_NEED_RESCHED, %cl
jnz work_resched
work_notifysig: # deal with pending signals and
# notify-resume requests
#ifdef CONFIG_VM86
testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
movl %esp, %eax
jne work_notifysig_v86 # returning to kernel-space or
# vm86-space
xorl %edx, %edx
call do_notify_resume
jmp resume_userspace_sig
ALIGN
work_notifysig_v86:
pushl %ecx # save ti_flags for do_notify_resume
CFI_ADJUST_CFA_OFFSET 4
call save_v86_state # %eax contains pt_regs pointer
popl %ecx
CFI_ADJUST_CFA_OFFSET -4
movl %eax, %esp
#else
movl %esp, %eax
#endif
xorl %edx, %edx
call do_notify_resume
jmp resume_userspace_sig
END(work_pending)
# perform syscall exit tracing
ALIGN
syscall_trace_entry:
movl $-ENOSYS,PT_EAX(%esp)
movl %esp, %eax
xorl %edx,%edx
call do_syscall_trace
[PATCH] UML Support - Ptrace: adds the host SYSEMU support, for UML and general usage Jeff Dike <jdike@addtoit.com>, Paolo 'Blaisorblade' Giarrusso <blaisorblade_spam@yahoo.it>, Bodo Stroesser <bstroesser@fujitsu-siemens.com> Adds a new ptrace(2) mode, called PTRACE_SYSEMU, resembling PTRACE_SYSCALL except that the kernel does not execute the requested syscall; this is useful to improve performance for virtual environments, like UML, which want to run the syscall on their own. In fact, using PTRACE_SYSCALL means stopping child execution twice, on entry and on exit, and each time you also have two context switches; with SYSEMU you avoid the 2nd stop and so save two context switches per syscall. Also, some architectures don't have support in the host for changing the syscall number via ptrace(), which is currently needed to skip syscall execution (UML turns any syscall into getpid() to avoid it being executed on the host). Fixing that is hard, while SYSEMU is easier to implement. * This version of the patch includes some suggestions of Jeff Dike to avoid adding any instructions to the syscall fast path, plus some other little changes, by myself, to make it work even when the syscall is executed with SYSENTER (but I'm unsure about them). It has been widely tested for quite a lot of time. * Various fixed were included to handle the various switches between various states, i.e. when for instance a syscall entry is traced with one of PT_SYSCALL / _SYSEMU / _SINGLESTEP and another one is used on exit. Basically, this is done by remembering which one of them was used even after the call to ptrace_notify(). * We're combining TIF_SYSCALL_EMU with TIF_SYSCALL_TRACE or TIF_SINGLESTEP to make do_syscall_trace() notice that the current syscall was started with SYSEMU on entry, so that no notification ought to be done in the exit path; this is a bit of a hack, so this problem is solved in another way in next patches. * Also, the effects of the patch: "Ptrace - i386: fix Syscall Audit interaction with singlestep" are cancelled; they are restored back in the last patch of this series. Detailed descriptions of the patches doing this kind of processing follow (but I've already summed everything up). * Fix behaviour when changing interception kind #1. In do_syscall_trace(), we check the status of the TIF_SYSCALL_EMU flag only after doing the debugger notification; but the debugger might have changed the status of this flag because he continued execution with PTRACE_SYSCALL, so this is wrong. This patch fixes it by saving the flag status before calling ptrace_notify(). * Fix behaviour when changing interception kind #2: avoid intercepting syscall on return when using SYSCALL again. A guest process switching from using PTRACE_SYSEMU to PTRACE_SYSCALL crashes. The problem is in arch/i386/kernel/entry.S. The current SYSEMU patch inhibits the syscall-handler to be called, but does not prevent do_syscall_trace() to be called after this for syscall completion interception. The appended patch fixes this. It reuses the flag TIF_SYSCALL_EMU to remember "we come from PTRACE_SYSEMU and now are in PTRACE_SYSCALL", since the flag is unused in the depicted situation. * Fix behaviour when changing interception kind #3: avoid intercepting syscall on return when using SINGLESTEP. When testing 2.6.9 and the skas3.v6 patch, with my latest patch and had problems with singlestepping on UML in SKAS with SYSEMU. It looped receiving SIGTRAPs without moving forward. EIP of the traced process was the same for all SIGTRAPs. What's missing is to handle switching from PTRACE_SYSCALL_EMU to PTRACE_SINGLESTEP in a way very similar to what is done for the change from PTRACE_SYSCALL_EMU to PTRACE_SYSCALL_TRACE. I.e., after calling ptrace(PTRACE_SYSEMU), on the return path, the debugger is notified and then wake ups the process; the syscall is executed (or skipped, when do_syscall_trace() returns 0, i.e. when using PTRACE_SYSEMU), and do_syscall_trace() is called again. Since we are on the return path of a SYSEMU'd syscall, if the wake up is performed through ptrace(PTRACE_SYSCALL), we must still avoid notifying the parent of the syscall exit. Now, this behaviour is extended even to resuming with PTRACE_SINGLESTEP. Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it> Cc: Jeff Dike <jdike@addtoit.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-04 05:57:18 +07:00
cmpl $0, %eax
jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
[PATCH] UML Support - Ptrace: adds the host SYSEMU support, for UML and general usage Jeff Dike <jdike@addtoit.com>, Paolo 'Blaisorblade' Giarrusso <blaisorblade_spam@yahoo.it>, Bodo Stroesser <bstroesser@fujitsu-siemens.com> Adds a new ptrace(2) mode, called PTRACE_SYSEMU, resembling PTRACE_SYSCALL except that the kernel does not execute the requested syscall; this is useful to improve performance for virtual environments, like UML, which want to run the syscall on their own. In fact, using PTRACE_SYSCALL means stopping child execution twice, on entry and on exit, and each time you also have two context switches; with SYSEMU you avoid the 2nd stop and so save two context switches per syscall. Also, some architectures don't have support in the host for changing the syscall number via ptrace(), which is currently needed to skip syscall execution (UML turns any syscall into getpid() to avoid it being executed on the host). Fixing that is hard, while SYSEMU is easier to implement. * This version of the patch includes some suggestions of Jeff Dike to avoid adding any instructions to the syscall fast path, plus some other little changes, by myself, to make it work even when the syscall is executed with SYSENTER (but I'm unsure about them). It has been widely tested for quite a lot of time. * Various fixed were included to handle the various switches between various states, i.e. when for instance a syscall entry is traced with one of PT_SYSCALL / _SYSEMU / _SINGLESTEP and another one is used on exit. Basically, this is done by remembering which one of them was used even after the call to ptrace_notify(). * We're combining TIF_SYSCALL_EMU with TIF_SYSCALL_TRACE or TIF_SINGLESTEP to make do_syscall_trace() notice that the current syscall was started with SYSEMU on entry, so that no notification ought to be done in the exit path; this is a bit of a hack, so this problem is solved in another way in next patches. * Also, the effects of the patch: "Ptrace - i386: fix Syscall Audit interaction with singlestep" are cancelled; they are restored back in the last patch of this series. Detailed descriptions of the patches doing this kind of processing follow (but I've already summed everything up). * Fix behaviour when changing interception kind #1. In do_syscall_trace(), we check the status of the TIF_SYSCALL_EMU flag only after doing the debugger notification; but the debugger might have changed the status of this flag because he continued execution with PTRACE_SYSCALL, so this is wrong. This patch fixes it by saving the flag status before calling ptrace_notify(). * Fix behaviour when changing interception kind #2: avoid intercepting syscall on return when using SYSCALL again. A guest process switching from using PTRACE_SYSEMU to PTRACE_SYSCALL crashes. The problem is in arch/i386/kernel/entry.S. The current SYSEMU patch inhibits the syscall-handler to be called, but does not prevent do_syscall_trace() to be called after this for syscall completion interception. The appended patch fixes this. It reuses the flag TIF_SYSCALL_EMU to remember "we come from PTRACE_SYSEMU and now are in PTRACE_SYSCALL", since the flag is unused in the depicted situation. * Fix behaviour when changing interception kind #3: avoid intercepting syscall on return when using SINGLESTEP. When testing 2.6.9 and the skas3.v6 patch, with my latest patch and had problems with singlestepping on UML in SKAS with SYSEMU. It looped receiving SIGTRAPs without moving forward. EIP of the traced process was the same for all SIGTRAPs. What's missing is to handle switching from PTRACE_SYSCALL_EMU to PTRACE_SINGLESTEP in a way very similar to what is done for the change from PTRACE_SYSCALL_EMU to PTRACE_SYSCALL_TRACE. I.e., after calling ptrace(PTRACE_SYSEMU), on the return path, the debugger is notified and then wake ups the process; the syscall is executed (or skipped, when do_syscall_trace() returns 0, i.e. when using PTRACE_SYSEMU), and do_syscall_trace() is called again. Since we are on the return path of a SYSEMU'd syscall, if the wake up is performed through ptrace(PTRACE_SYSCALL), we must still avoid notifying the parent of the syscall exit. Now, this behaviour is extended even to resuming with PTRACE_SINGLESTEP. Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it> Cc: Jeff Dike <jdike@addtoit.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-04 05:57:18 +07:00
# so must skip actual syscall
movl PT_ORIG_EAX(%esp), %eax
cmpl $(nr_syscalls), %eax
jnae syscall_call
jmp syscall_exit
END(syscall_trace_entry)
# perform syscall exit tracing
ALIGN
syscall_exit_work:
testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
jz work_pending
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
# schedule() instead
movl %esp, %eax
movl $1, %edx
call do_syscall_trace
jmp resume_userspace
END(syscall_exit_work)
CFI_ENDPROC
RING0_INT_FRAME # can't unwind into user space anyway
syscall_fault:
GET_THREAD_INFO(%ebp)
movl $-EFAULT,PT_EAX(%esp)
jmp resume_userspace
END(syscall_fault)
syscall_badsys:
movl $-ENOSYS,PT_EAX(%esp)
jmp resume_userspace
END(syscall_badsys)
CFI_ENDPROC
#define FIXUP_ESPFIX_STACK \
/* since we are on a wrong stack, we cant make it a C code :( */ \
PER_CPU(gdt_page, %ebx); \
GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
addl %esp, %eax; \
pushl $__KERNEL_DS; \
CFI_ADJUST_CFA_OFFSET 4; \
pushl %eax; \
CFI_ADJUST_CFA_OFFSET 4; \
lss (%esp), %esp; \
CFI_ADJUST_CFA_OFFSET -8;
#define UNWIND_ESPFIX_STACK \
movl %ss, %eax; \
/* see if on espfix stack */ \
cmpw $__ESPFIX_SS, %ax; \
jne 27f; \
movl $__KERNEL_DS, %eax; \
movl %eax, %ds; \
movl %eax, %es; \
/* switch to normal stack */ \
FIXUP_ESPFIX_STACK; \
27:;
/*
* Build the entry stubs and pointer table with
* some assembler magic.
*/
.section .rodata,"a"
ENTRY(interrupt)
.text
ENTRY(irq_entries_start)
RING0_INT_FRAME
vector=0
.rept NR_IRQS
ALIGN
.if vector
CFI_ADJUST_CFA_OFFSET -4
.endif
1: pushl $~(vector)
CFI_ADJUST_CFA_OFFSET 4
jmp common_interrupt
.previous
.long 1b
.text
vector=vector+1
.endr
END(irq_entries_start)
.previous
END(interrupt)
.previous
/*
* the CPU automatically disables interrupts when executing an IRQ vector,
* so IRQ-flags tracing has to follow that:
*/
ALIGN
common_interrupt:
SAVE_ALL
TRACE_IRQS_OFF
movl %esp,%eax
call do_IRQ
jmp ret_from_intr
ENDPROC(common_interrupt)
CFI_ENDPROC
#define BUILD_INTERRUPT(name, nr) \
ENTRY(name) \
RING0_INT_FRAME; \
pushl $~(nr); \
CFI_ADJUST_CFA_OFFSET 4; \
SAVE_ALL; \
TRACE_IRQS_OFF \
movl %esp,%eax; \
call smp_##name; \
jmp ret_from_intr; \
CFI_ENDPROC; \
ENDPROC(name)
/* The include is where all of the SMP etc. interrupts come from */
#include "entry_arch.h"
[PATCH] x86: error_code is not safe for kprobes This patch moves the entry.S:error_entry to .kprobes.text section, since code marked unsafe for kprobes jumps directly to entry.S::error_entry, that must be marked unsafe as well. This patch also moves all the ".previous.text" asm directives to ".previous" for kprobes section. AK: Following a similar i386 patch from Chuck Ebbert AK: Also merged Jeremy's fix in. +From: Jeremy Fitzhardinge <jeremy@goop.org> KPROBE_ENTRY does a .section .kprobes.text, and expects its users to do a .previous at the end of the function. Unfortunately, if any code within the function switches sections, for example .fixup, then the .previous ends up putting all subsequent code into .fixup. Worse, any subsequent .fixup code gets intermingled with the code its supposed to be fixing (which is also in .fixup). It's surprising this didn't cause more havok. The fix is to use .pushsection/.popsection, so this stuff nests properly. A further cleanup would be to get rid of all .section/.previous pairs, since they're inherently fragile. +From: Chuck Ebbert <76306.1226@compuserve.com> Because code marked unsafe for kprobes jumps directly to entry.S::error_code, that must be marked unsafe as well. The easiest way to do that is to move the page fault entry point to just before error_code and let it inherit the same section. Also moved all the ".previous" asm directives for kprobes sections to column 1 and removed ".text" from them. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen <ak@suse.de>
2006-09-26 15:52:34 +07:00
KPROBE_ENTRY(page_fault)
RING0_EC_FRAME
pushl $do_page_fault
CFI_ADJUST_CFA_OFFSET 4
ALIGN
error_code:
/* the function address is in %fs's slot on the stack */
pushl %es
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET es, 0*/
pushl %ds
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET ds, 0*/
pushl %eax
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET eax, 0
pushl %ebp
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET ebp, 0
pushl %edi
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET edi, 0
pushl %esi
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET esi, 0
pushl %edx
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET edx, 0
pushl %ecx
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET ecx, 0
pushl %ebx
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET ebx, 0
cld
pushl %fs
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET fs, 0*/
movl $(__KERNEL_PERCPU), %ecx
movl %ecx, %fs
UNWIND_ESPFIX_STACK
popl %ecx
CFI_ADJUST_CFA_OFFSET -4
/*CFI_REGISTER es, ecx*/
movl PT_FS(%esp), %edi # get the function address
movl PT_ORIG_EAX(%esp), %edx # get the error code
movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
mov %ecx, PT_FS(%esp)
/*CFI_REL_OFFSET fs, ES*/
movl $(__USER_DS), %ecx
movl %ecx, %ds
movl %ecx, %es
movl %esp,%eax # pt_regs pointer
call *%edi
jmp ret_from_exception
CFI_ENDPROC
[PATCH] x86: error_code is not safe for kprobes This patch moves the entry.S:error_entry to .kprobes.text section, since code marked unsafe for kprobes jumps directly to entry.S::error_entry, that must be marked unsafe as well. This patch also moves all the ".previous.text" asm directives to ".previous" for kprobes section. AK: Following a similar i386 patch from Chuck Ebbert AK: Also merged Jeremy's fix in. +From: Jeremy Fitzhardinge <jeremy@goop.org> KPROBE_ENTRY does a .section .kprobes.text, and expects its users to do a .previous at the end of the function. Unfortunately, if any code within the function switches sections, for example .fixup, then the .previous ends up putting all subsequent code into .fixup. Worse, any subsequent .fixup code gets intermingled with the code its supposed to be fixing (which is also in .fixup). It's surprising this didn't cause more havok. The fix is to use .pushsection/.popsection, so this stuff nests properly. A further cleanup would be to get rid of all .section/.previous pairs, since they're inherently fragile. +From: Chuck Ebbert <76306.1226@compuserve.com> Because code marked unsafe for kprobes jumps directly to entry.S::error_code, that must be marked unsafe as well. The easiest way to do that is to move the page fault entry point to just before error_code and let it inherit the same section. Also moved all the ".previous" asm directives for kprobes sections to column 1 and removed ".text" from them. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen <ak@suse.de>
2006-09-26 15:52:34 +07:00
KPROBE_END(page_fault)
ENTRY(coprocessor_error)
RING0_INT_FRAME
pushl $0
CFI_ADJUST_CFA_OFFSET 4
pushl $do_coprocessor_error
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
END(coprocessor_error)
ENTRY(simd_coprocessor_error)
RING0_INT_FRAME
pushl $0
CFI_ADJUST_CFA_OFFSET 4
pushl $do_simd_coprocessor_error
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
END(simd_coprocessor_error)
ENTRY(device_not_available)
RING0_INT_FRAME
pushl $-1 # mark this as an int
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
GET_CR0_INTO_EAX
testl $0x4, %eax # EM (math emulation bit)
jne device_not_available_emulate
preempt_stop(CLBR_ANY)
call math_state_restore
jmp ret_from_exception
device_not_available_emulate:
pushl $0 # temporary storage for ORIG_EIP
CFI_ADJUST_CFA_OFFSET 4
call math_emulate
addl $4, %esp
CFI_ADJUST_CFA_OFFSET -4
jmp ret_from_exception
CFI_ENDPROC
END(device_not_available)
/*
* Debug traps and NMI can happen at the one SYSENTER instruction
* that sets up the real kernel stack. Check here, since we can't
* allow the wrong stack to be used.
*
* "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
* already pushed 3 words if it hits on the sysenter instruction:
* eflags, cs and eip.
*
* We just load the right stack, and push the three (known) values
* by hand onto the new stack - while updating the return eip past
* the instruction that would have done it for sysenter.
*/
#define FIX_STACK(offset, ok, label) \
cmpw $__KERNEL_CS,4(%esp); \
jne ok; \
label: \
movl TSS_sysenter_sp0+offset(%esp),%esp; \
CFI_DEF_CFA esp, 0; \
CFI_UNDEFINED eip; \
pushfl; \
CFI_ADJUST_CFA_OFFSET 4; \
pushl $__KERNEL_CS; \
CFI_ADJUST_CFA_OFFSET 4; \
pushl $sysenter_past_esp; \
CFI_ADJUST_CFA_OFFSET 4; \
CFI_REL_OFFSET eip, 0
KPROBE_ENTRY(debug)
RING0_INT_FRAME
cmpl $ia32_sysenter_target,(%esp)
jne debug_stack_correct
FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
debug_stack_correct:
pushl $-1 # mark this as an int
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
xorl %edx,%edx # error code 0
movl %esp,%eax # pt_regs pointer
call do_debug
jmp ret_from_exception
CFI_ENDPROC
[PATCH] x86: error_code is not safe for kprobes This patch moves the entry.S:error_entry to .kprobes.text section, since code marked unsafe for kprobes jumps directly to entry.S::error_entry, that must be marked unsafe as well. This patch also moves all the ".previous.text" asm directives to ".previous" for kprobes section. AK: Following a similar i386 patch from Chuck Ebbert AK: Also merged Jeremy's fix in. +From: Jeremy Fitzhardinge <jeremy@goop.org> KPROBE_ENTRY does a .section .kprobes.text, and expects its users to do a .previous at the end of the function. Unfortunately, if any code within the function switches sections, for example .fixup, then the .previous ends up putting all subsequent code into .fixup. Worse, any subsequent .fixup code gets intermingled with the code its supposed to be fixing (which is also in .fixup). It's surprising this didn't cause more havok. The fix is to use .pushsection/.popsection, so this stuff nests properly. A further cleanup would be to get rid of all .section/.previous pairs, since they're inherently fragile. +From: Chuck Ebbert <76306.1226@compuserve.com> Because code marked unsafe for kprobes jumps directly to entry.S::error_code, that must be marked unsafe as well. The easiest way to do that is to move the page fault entry point to just before error_code and let it inherit the same section. Also moved all the ".previous" asm directives for kprobes sections to column 1 and removed ".text" from them. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen <ak@suse.de>
2006-09-26 15:52:34 +07:00
KPROBE_END(debug)
/*
* NMI is doubly nasty. It can happen _while_ we're handling
* a debug fault, and the debug fault hasn't yet been able to
* clear up the stack. So we first check whether we got an
* NMI on the sysenter entry path, but after that we need to
* check whether we got an NMI on the debug path where the debug
* fault happened on the sysenter path.
*/
KPROBE_ENTRY(nmi)
RING0_INT_FRAME
pushl %eax
CFI_ADJUST_CFA_OFFSET 4
movl %ss, %eax
cmpw $__ESPFIX_SS, %ax
popl %eax
CFI_ADJUST_CFA_OFFSET -4
je nmi_espfix_stack
cmpl $ia32_sysenter_target,(%esp)
je nmi_stack_fixup
pushl %eax
CFI_ADJUST_CFA_OFFSET 4
movl %esp,%eax
/* Do not access memory above the end of our stack page,
* it might not exist.
*/
andl $(THREAD_SIZE-1),%eax
cmpl $(THREAD_SIZE-20),%eax
popl %eax
CFI_ADJUST_CFA_OFFSET -4
jae nmi_stack_correct
cmpl $ia32_sysenter_target,12(%esp)
je nmi_debug_stack_check
nmi_stack_correct:
/* We have a RING0_INT_FRAME here */
pushl %eax
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
xorl %edx,%edx # zero error code
movl %esp,%eax # pt_regs pointer
call do_nmi
jmp restore_nocheck_notrace
CFI_ENDPROC
nmi_stack_fixup:
RING0_INT_FRAME
FIX_STACK(12,nmi_stack_correct, 1)
jmp nmi_stack_correct
nmi_debug_stack_check:
/* We have a RING0_INT_FRAME here */
cmpw $__KERNEL_CS,16(%esp)
jne nmi_stack_correct
cmpl $debug,(%esp)
jb nmi_stack_correct
cmpl $debug_esp_fix_insn,(%esp)
ja nmi_stack_correct
FIX_STACK(24,nmi_stack_correct, 1)
jmp nmi_stack_correct
nmi_espfix_stack:
/* We have a RING0_INT_FRAME here.
*
* create the pointer to lss back
*/
pushl %ss
CFI_ADJUST_CFA_OFFSET 4
pushl %esp
CFI_ADJUST_CFA_OFFSET 4
addw $4, (%esp)
/* copy the iret frame of 12 bytes */
.rept 3
pushl 16(%esp)
CFI_ADJUST_CFA_OFFSET 4
.endr
pushl %eax
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
FIXUP_ESPFIX_STACK # %eax == %esp
xorl %edx,%edx # zero error code
call do_nmi
RESTORE_REGS
lss 12+4(%esp), %esp # back to espfix stack
CFI_ADJUST_CFA_OFFSET -24
jmp irq_return
CFI_ENDPROC
KPROBE_END(nmi)
#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
iret
.section __ex_table,"a"
.align 4
.long native_iret, iret_exc
.previous
END(native_iret)
ENTRY(native_irq_enable_syscall_ret)
sti
sysexit
END(native_irq_enable_syscall_ret)
#endif
KPROBE_ENTRY(int3)
RING0_INT_FRAME
pushl $-1 # mark this as an int
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
xorl %edx,%edx # zero error code
movl %esp,%eax # pt_regs pointer
call do_int3
jmp ret_from_exception
CFI_ENDPROC
[PATCH] x86: error_code is not safe for kprobes This patch moves the entry.S:error_entry to .kprobes.text section, since code marked unsafe for kprobes jumps directly to entry.S::error_entry, that must be marked unsafe as well. This patch also moves all the ".previous.text" asm directives to ".previous" for kprobes section. AK: Following a similar i386 patch from Chuck Ebbert AK: Also merged Jeremy's fix in. +From: Jeremy Fitzhardinge <jeremy@goop.org> KPROBE_ENTRY does a .section .kprobes.text, and expects its users to do a .previous at the end of the function. Unfortunately, if any code within the function switches sections, for example .fixup, then the .previous ends up putting all subsequent code into .fixup. Worse, any subsequent .fixup code gets intermingled with the code its supposed to be fixing (which is also in .fixup). It's surprising this didn't cause more havok. The fix is to use .pushsection/.popsection, so this stuff nests properly. A further cleanup would be to get rid of all .section/.previous pairs, since they're inherently fragile. +From: Chuck Ebbert <76306.1226@compuserve.com> Because code marked unsafe for kprobes jumps directly to entry.S::error_code, that must be marked unsafe as well. The easiest way to do that is to move the page fault entry point to just before error_code and let it inherit the same section. Also moved all the ".previous" asm directives for kprobes sections to column 1 and removed ".text" from them. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen <ak@suse.de>
2006-09-26 15:52:34 +07:00
KPROBE_END(int3)
ENTRY(overflow)
RING0_INT_FRAME
pushl $0
CFI_ADJUST_CFA_OFFSET 4
pushl $do_overflow
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
END(overflow)
ENTRY(bounds)
RING0_INT_FRAME
pushl $0
CFI_ADJUST_CFA_OFFSET 4
pushl $do_bounds
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
END(bounds)
ENTRY(invalid_op)
RING0_INT_FRAME
pushl $0
CFI_ADJUST_CFA_OFFSET 4
pushl $do_invalid_op
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
END(invalid_op)
ENTRY(coprocessor_segment_overrun)
RING0_INT_FRAME
pushl $0
CFI_ADJUST_CFA_OFFSET 4
pushl $do_coprocessor_segment_overrun
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
END(coprocessor_segment_overrun)
ENTRY(invalid_TSS)
RING0_EC_FRAME
pushl $do_invalid_TSS
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
END(invalid_TSS)
ENTRY(segment_not_present)
RING0_EC_FRAME
pushl $do_segment_not_present
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
END(segment_not_present)
ENTRY(stack_segment)
RING0_EC_FRAME
pushl $do_stack_segment
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
END(stack_segment)
KPROBE_ENTRY(general_protection)
RING0_EC_FRAME
pushl $do_general_protection
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
[PATCH] x86: error_code is not safe for kprobes This patch moves the entry.S:error_entry to .kprobes.text section, since code marked unsafe for kprobes jumps directly to entry.S::error_entry, that must be marked unsafe as well. This patch also moves all the ".previous.text" asm directives to ".previous" for kprobes section. AK: Following a similar i386 patch from Chuck Ebbert AK: Also merged Jeremy's fix in. +From: Jeremy Fitzhardinge <jeremy@goop.org> KPROBE_ENTRY does a .section .kprobes.text, and expects its users to do a .previous at the end of the function. Unfortunately, if any code within the function switches sections, for example .fixup, then the .previous ends up putting all subsequent code into .fixup. Worse, any subsequent .fixup code gets intermingled with the code its supposed to be fixing (which is also in .fixup). It's surprising this didn't cause more havok. The fix is to use .pushsection/.popsection, so this stuff nests properly. A further cleanup would be to get rid of all .section/.previous pairs, since they're inherently fragile. +From: Chuck Ebbert <76306.1226@compuserve.com> Because code marked unsafe for kprobes jumps directly to entry.S::error_code, that must be marked unsafe as well. The easiest way to do that is to move the page fault entry point to just before error_code and let it inherit the same section. Also moved all the ".previous" asm directives for kprobes sections to column 1 and removed ".text" from them. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen <ak@suse.de>
2006-09-26 15:52:34 +07:00
KPROBE_END(general_protection)
ENTRY(alignment_check)
RING0_EC_FRAME
pushl $do_alignment_check
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
END(alignment_check)
[PATCH] x86: error_code is not safe for kprobes This patch moves the entry.S:error_entry to .kprobes.text section, since code marked unsafe for kprobes jumps directly to entry.S::error_entry, that must be marked unsafe as well. This patch also moves all the ".previous.text" asm directives to ".previous" for kprobes section. AK: Following a similar i386 patch from Chuck Ebbert AK: Also merged Jeremy's fix in. +From: Jeremy Fitzhardinge <jeremy@goop.org> KPROBE_ENTRY does a .section .kprobes.text, and expects its users to do a .previous at the end of the function. Unfortunately, if any code within the function switches sections, for example .fixup, then the .previous ends up putting all subsequent code into .fixup. Worse, any subsequent .fixup code gets intermingled with the code its supposed to be fixing (which is also in .fixup). It's surprising this didn't cause more havok. The fix is to use .pushsection/.popsection, so this stuff nests properly. A further cleanup would be to get rid of all .section/.previous pairs, since they're inherently fragile. +From: Chuck Ebbert <76306.1226@compuserve.com> Because code marked unsafe for kprobes jumps directly to entry.S::error_code, that must be marked unsafe as well. The easiest way to do that is to move the page fault entry point to just before error_code and let it inherit the same section. Also moved all the ".previous" asm directives for kprobes sections to column 1 and removed ".text" from them. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen <ak@suse.de>
2006-09-26 15:52:34 +07:00
ENTRY(divide_error)
RING0_INT_FRAME
pushl $0 # no error code
CFI_ADJUST_CFA_OFFSET 4
pushl $do_divide_error
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
END(divide_error)
#ifdef CONFIG_X86_MCE
ENTRY(machine_check)
RING0_INT_FRAME
pushl $0
CFI_ADJUST_CFA_OFFSET 4
pushl machine_check_vector
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
END(machine_check)
#endif
ENTRY(spurious_interrupt_bug)
RING0_INT_FRAME
pushl $0
CFI_ADJUST_CFA_OFFSET 4
pushl $do_spurious_interrupt_bug
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
END(spurious_interrupt_bug)
ENTRY(kernel_thread_helper)
pushl $0 # fake return address for unwinder
CFI_STARTPROC
movl %edx,%eax
push %edx
CFI_ADJUST_CFA_OFFSET 4
call *%ebx
push %eax
CFI_ADJUST_CFA_OFFSET 4
call do_exit
CFI_ENDPROC
ENDPROC(kernel_thread_helper)
xen: Core Xen implementation This patch is a rollup of all the core pieces of the Xen implementation, including: - booting and setup - pagetable setup - privileged instructions - segmentation - interrupt flags - upcalls - multicall batching BOOTING AND SETUP The vmlinux image is decorated with ELF notes which tell the Xen domain builder what the kernel's requirements are; the domain builder then constructs the address space accordingly and starts the kernel. Xen has its own entrypoint for the kernel (contained in an ELF note). The ELF notes are set up by xen-head.S, which is included into head.S. In principle it could be linked separately, but it seems to provoke lots of binutils bugs. Because the domain builder starts the kernel in a fairly sane state (32-bit protected mode, paging enabled, flat segments set up), there's not a lot of setup needed before starting the kernel proper. The main steps are: 1. Install the Xen paravirt_ops, which is simply a matter of a structure assignment. 2. Set init_mm to use the Xen-supplied pagetables (analogous to the head.S generated pagetables in a native boot). 3. Reserve address space for Xen, since it takes a chunk at the top of the address space for its own use. 4. Call start_kernel() PAGETABLE SETUP Once we hit the main kernel boot sequence, it will end up calling back via paravirt_ops to set up various pieces of Xen specific state. One of the critical things which requires a bit of extra care is the construction of the initial init_mm pagetable. Because Xen places tight constraints on pagetables (an active pagetable must always be valid, and must always be mapped read-only to the guest domain), we need to be careful when constructing the new pagetable to keep these constraints in mind. It turns out that the easiest way to do this is use the initial Xen-provided pagetable as a template, and then just insert new mappings for memory where a mapping doesn't already exist. This means that during pagetable setup, it uses a special version of xen_set_pte which ignores any attempt to remap a read-only page as read-write (since Xen will map its own initial pagetable as RO), but lets other changes to the ptes happen, so that things like NX are set properly. PRIVILEGED INSTRUCTIONS AND SEGMENTATION When the kernel runs under Xen, it runs in ring 1 rather than ring 0. This means that it is more privileged than user-mode in ring 3, but it still can't run privileged instructions directly. Non-performance critical instructions are dealt with by taking a privilege exception and trapping into the hypervisor and emulating the instruction, but more performance-critical instructions have their own specific paravirt_ops. In many cases we can avoid having to do any hypercalls for these instructions, or the Xen implementation is quite different from the normal native version. The privileged instructions fall into the broad classes of: Segmentation: setting up the GDT and the GDT entries, LDT, TLS and so on. Xen doesn't allow the GDT to be directly modified; all GDT updates are done via hypercalls where the new entries can be validated. This is important because Xen uses segment limits to prevent the guest kernel from damaging the hypervisor itself. Traps and exceptions: Xen uses a special format for trap entrypoints, so when the kernel wants to set an IDT entry, it needs to be converted to the form Xen expects. Xen sets int 0x80 up specially so that the trap goes straight from userspace into the guest kernel without going via the hypervisor. sysenter isn't supported. Kernel stack: The esp0 entry is extracted from the tss and provided to Xen. TLB operations: the various TLB calls are mapped into corresponding Xen hypercalls. Control registers: all the control registers are privileged. The most important is cr3, which points to the base of the current pagetable, and we handle it specially. Another instruction we treat specially is CPUID, even though its not privileged. We want to control what CPU features are visible to the rest of the kernel, and so CPUID ends up going into a paravirt_op. Xen implements this mainly to disable the ACPI and APIC subsystems. INTERRUPT FLAGS Xen maintains its own separate flag for masking events, which is contained within the per-cpu vcpu_info structure. Because the guest kernel runs in ring 1 and not 0, the IF flag in EFLAGS is completely ignored (and must be, because even if a guest domain disables interrupts for itself, it can't disable them overall). (A note on terminology: "events" and interrupts are effectively synonymous. However, rather than using an "enable flag", Xen uses a "mask flag", which blocks event delivery when it is non-zero.) There are paravirt_ops for each of cli/sti/save_fl/restore_fl, which are implemented to manage the Xen event mask state. The only thing worth noting is that when events are unmasked, we need to explicitly see if there's a pending event and call into the hypervisor to make sure it gets delivered. UPCALLS Xen needs a couple of upcall (or callback) functions to be implemented by each guest. One is the event upcalls, which is how events (interrupts, effectively) are delivered to the guests. The other is the failsafe callback, which is used to report errors in either reloading a segment register, or caused by iret. These are implemented in i386/kernel/entry.S so they can jump into the normal iret_exc path when necessary. MULTICALL BATCHING Xen provides a multicall mechanism, which allows multiple hypercalls to be issued at once in order to mitigate the cost of trapping into the hypervisor. This is particularly useful for context switches, since the 4-5 hypercalls they would normally need (reload cr3, update TLS, maybe update LDT) can be reduced to one. This patch implements a generic batching mechanism for hypercalls, which gets used in many places in the Xen code. Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Signed-off-by: Chris Wright <chrisw@sous-sol.org> Cc: Ian Pratt <ian.pratt@xensource.com> Cc: Christian Limpach <Christian.Limpach@cl.cam.ac.uk> Cc: Adrian Bunk <bunk@stusta.de>
2007-07-18 08:37:04 +07:00
#ifdef CONFIG_XEN
/* Xen doesn't set %esp to be precisely what the normal sysenter
entrypoint expects, so fix it up before using the normal path. */
ENTRY(xen_sysenter_target)
RING0_INT_FRAME
addl $5*4, %esp /* remove xen-provided frame */
jmp sysenter_past_esp
xen: Core Xen implementation This patch is a rollup of all the core pieces of the Xen implementation, including: - booting and setup - pagetable setup - privileged instructions - segmentation - interrupt flags - upcalls - multicall batching BOOTING AND SETUP The vmlinux image is decorated with ELF notes which tell the Xen domain builder what the kernel's requirements are; the domain builder then constructs the address space accordingly and starts the kernel. Xen has its own entrypoint for the kernel (contained in an ELF note). The ELF notes are set up by xen-head.S, which is included into head.S. In principle it could be linked separately, but it seems to provoke lots of binutils bugs. Because the domain builder starts the kernel in a fairly sane state (32-bit protected mode, paging enabled, flat segments set up), there's not a lot of setup needed before starting the kernel proper. The main steps are: 1. Install the Xen paravirt_ops, which is simply a matter of a structure assignment. 2. Set init_mm to use the Xen-supplied pagetables (analogous to the head.S generated pagetables in a native boot). 3. Reserve address space for Xen, since it takes a chunk at the top of the address space for its own use. 4. Call start_kernel() PAGETABLE SETUP Once we hit the main kernel boot sequence, it will end up calling back via paravirt_ops to set up various pieces of Xen specific state. One of the critical things which requires a bit of extra care is the construction of the initial init_mm pagetable. Because Xen places tight constraints on pagetables (an active pagetable must always be valid, and must always be mapped read-only to the guest domain), we need to be careful when constructing the new pagetable to keep these constraints in mind. It turns out that the easiest way to do this is use the initial Xen-provided pagetable as a template, and then just insert new mappings for memory where a mapping doesn't already exist. This means that during pagetable setup, it uses a special version of xen_set_pte which ignores any attempt to remap a read-only page as read-write (since Xen will map its own initial pagetable as RO), but lets other changes to the ptes happen, so that things like NX are set properly. PRIVILEGED INSTRUCTIONS AND SEGMENTATION When the kernel runs under Xen, it runs in ring 1 rather than ring 0. This means that it is more privileged than user-mode in ring 3, but it still can't run privileged instructions directly. Non-performance critical instructions are dealt with by taking a privilege exception and trapping into the hypervisor and emulating the instruction, but more performance-critical instructions have their own specific paravirt_ops. In many cases we can avoid having to do any hypercalls for these instructions, or the Xen implementation is quite different from the normal native version. The privileged instructions fall into the broad classes of: Segmentation: setting up the GDT and the GDT entries, LDT, TLS and so on. Xen doesn't allow the GDT to be directly modified; all GDT updates are done via hypercalls where the new entries can be validated. This is important because Xen uses segment limits to prevent the guest kernel from damaging the hypervisor itself. Traps and exceptions: Xen uses a special format for trap entrypoints, so when the kernel wants to set an IDT entry, it needs to be converted to the form Xen expects. Xen sets int 0x80 up specially so that the trap goes straight from userspace into the guest kernel without going via the hypervisor. sysenter isn't supported. Kernel stack: The esp0 entry is extracted from the tss and provided to Xen. TLB operations: the various TLB calls are mapped into corresponding Xen hypercalls. Control registers: all the control registers are privileged. The most important is cr3, which points to the base of the current pagetable, and we handle it specially. Another instruction we treat specially is CPUID, even though its not privileged. We want to control what CPU features are visible to the rest of the kernel, and so CPUID ends up going into a paravirt_op. Xen implements this mainly to disable the ACPI and APIC subsystems. INTERRUPT FLAGS Xen maintains its own separate flag for masking events, which is contained within the per-cpu vcpu_info structure. Because the guest kernel runs in ring 1 and not 0, the IF flag in EFLAGS is completely ignored (and must be, because even if a guest domain disables interrupts for itself, it can't disable them overall). (A note on terminology: "events" and interrupts are effectively synonymous. However, rather than using an "enable flag", Xen uses a "mask flag", which blocks event delivery when it is non-zero.) There are paravirt_ops for each of cli/sti/save_fl/restore_fl, which are implemented to manage the Xen event mask state. The only thing worth noting is that when events are unmasked, we need to explicitly see if there's a pending event and call into the hypervisor to make sure it gets delivered. UPCALLS Xen needs a couple of upcall (or callback) functions to be implemented by each guest. One is the event upcalls, which is how events (interrupts, effectively) are delivered to the guests. The other is the failsafe callback, which is used to report errors in either reloading a segment register, or caused by iret. These are implemented in i386/kernel/entry.S so they can jump into the normal iret_exc path when necessary. MULTICALL BATCHING Xen provides a multicall mechanism, which allows multiple hypercalls to be issued at once in order to mitigate the cost of trapping into the hypervisor. This is particularly useful for context switches, since the 4-5 hypercalls they would normally need (reload cr3, update TLS, maybe update LDT) can be reduced to one. This patch implements a generic batching mechanism for hypercalls, which gets used in many places in the Xen code. Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Signed-off-by: Chris Wright <chrisw@sous-sol.org> Cc: Ian Pratt <ian.pratt@xensource.com> Cc: Christian Limpach <Christian.Limpach@cl.cam.ac.uk> Cc: Adrian Bunk <bunk@stusta.de>
2007-07-18 08:37:04 +07:00
ENTRY(xen_hypervisor_callback)
CFI_STARTPROC
pushl $0
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
TRACE_IRQS_OFF
/* Check to see if we got the event in the critical
region in xen_iret_direct, after we've reenabled
events and checked for pending events. This simulates
iret instruction's behaviour where it delivers a
pending interrupt when enabling interrupts. */
movl PT_EIP(%esp),%eax
cmpl $xen_iret_start_crit,%eax
jb 1f
cmpl $xen_iret_end_crit,%eax
jae 1f
jmp xen_iret_crit_fixup
ENTRY(xen_do_upcall)
1: mov %esp, %eax
xen: Core Xen implementation This patch is a rollup of all the core pieces of the Xen implementation, including: - booting and setup - pagetable setup - privileged instructions - segmentation - interrupt flags - upcalls - multicall batching BOOTING AND SETUP The vmlinux image is decorated with ELF notes which tell the Xen domain builder what the kernel's requirements are; the domain builder then constructs the address space accordingly and starts the kernel. Xen has its own entrypoint for the kernel (contained in an ELF note). The ELF notes are set up by xen-head.S, which is included into head.S. In principle it could be linked separately, but it seems to provoke lots of binutils bugs. Because the domain builder starts the kernel in a fairly sane state (32-bit protected mode, paging enabled, flat segments set up), there's not a lot of setup needed before starting the kernel proper. The main steps are: 1. Install the Xen paravirt_ops, which is simply a matter of a structure assignment. 2. Set init_mm to use the Xen-supplied pagetables (analogous to the head.S generated pagetables in a native boot). 3. Reserve address space for Xen, since it takes a chunk at the top of the address space for its own use. 4. Call start_kernel() PAGETABLE SETUP Once we hit the main kernel boot sequence, it will end up calling back via paravirt_ops to set up various pieces of Xen specific state. One of the critical things which requires a bit of extra care is the construction of the initial init_mm pagetable. Because Xen places tight constraints on pagetables (an active pagetable must always be valid, and must always be mapped read-only to the guest domain), we need to be careful when constructing the new pagetable to keep these constraints in mind. It turns out that the easiest way to do this is use the initial Xen-provided pagetable as a template, and then just insert new mappings for memory where a mapping doesn't already exist. This means that during pagetable setup, it uses a special version of xen_set_pte which ignores any attempt to remap a read-only page as read-write (since Xen will map its own initial pagetable as RO), but lets other changes to the ptes happen, so that things like NX are set properly. PRIVILEGED INSTRUCTIONS AND SEGMENTATION When the kernel runs under Xen, it runs in ring 1 rather than ring 0. This means that it is more privileged than user-mode in ring 3, but it still can't run privileged instructions directly. Non-performance critical instructions are dealt with by taking a privilege exception and trapping into the hypervisor and emulating the instruction, but more performance-critical instructions have their own specific paravirt_ops. In many cases we can avoid having to do any hypercalls for these instructions, or the Xen implementation is quite different from the normal native version. The privileged instructions fall into the broad classes of: Segmentation: setting up the GDT and the GDT entries, LDT, TLS and so on. Xen doesn't allow the GDT to be directly modified; all GDT updates are done via hypercalls where the new entries can be validated. This is important because Xen uses segment limits to prevent the guest kernel from damaging the hypervisor itself. Traps and exceptions: Xen uses a special format for trap entrypoints, so when the kernel wants to set an IDT entry, it needs to be converted to the form Xen expects. Xen sets int 0x80 up specially so that the trap goes straight from userspace into the guest kernel without going via the hypervisor. sysenter isn't supported. Kernel stack: The esp0 entry is extracted from the tss and provided to Xen. TLB operations: the various TLB calls are mapped into corresponding Xen hypercalls. Control registers: all the control registers are privileged. The most important is cr3, which points to the base of the current pagetable, and we handle it specially. Another instruction we treat specially is CPUID, even though its not privileged. We want to control what CPU features are visible to the rest of the kernel, and so CPUID ends up going into a paravirt_op. Xen implements this mainly to disable the ACPI and APIC subsystems. INTERRUPT FLAGS Xen maintains its own separate flag for masking events, which is contained within the per-cpu vcpu_info structure. Because the guest kernel runs in ring 1 and not 0, the IF flag in EFLAGS is completely ignored (and must be, because even if a guest domain disables interrupts for itself, it can't disable them overall). (A note on terminology: "events" and interrupts are effectively synonymous. However, rather than using an "enable flag", Xen uses a "mask flag", which blocks event delivery when it is non-zero.) There are paravirt_ops for each of cli/sti/save_fl/restore_fl, which are implemented to manage the Xen event mask state. The only thing worth noting is that when events are unmasked, we need to explicitly see if there's a pending event and call into the hypervisor to make sure it gets delivered. UPCALLS Xen needs a couple of upcall (or callback) functions to be implemented by each guest. One is the event upcalls, which is how events (interrupts, effectively) are delivered to the guests. The other is the failsafe callback, which is used to report errors in either reloading a segment register, or caused by iret. These are implemented in i386/kernel/entry.S so they can jump into the normal iret_exc path when necessary. MULTICALL BATCHING Xen provides a multicall mechanism, which allows multiple hypercalls to be issued at once in order to mitigate the cost of trapping into the hypervisor. This is particularly useful for context switches, since the 4-5 hypercalls they would normally need (reload cr3, update TLS, maybe update LDT) can be reduced to one. This patch implements a generic batching mechanism for hypercalls, which gets used in many places in the Xen code. Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Signed-off-by: Chris Wright <chrisw@sous-sol.org> Cc: Ian Pratt <ian.pratt@xensource.com> Cc: Christian Limpach <Christian.Limpach@cl.cam.ac.uk> Cc: Adrian Bunk <bunk@stusta.de>
2007-07-18 08:37:04 +07:00
call xen_evtchn_do_upcall
jmp ret_from_intr
CFI_ENDPROC
ENDPROC(xen_hypervisor_callback)
# Hypervisor uses this for application faults while it executes.
# We get here for two reasons:
# 1. Fault while reloading DS, ES, FS or GS
# 2. Fault while executing IRET
# Category 1 we fix up by reattempting the load, and zeroing the segment
# register if the load fails.
# Category 2 we fix up by jumping to do_iret_error. We cannot use the
# normal Linux return path in this case because if we use the IRET hypercall
# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
# We distinguish between categories by maintaining a status value in EAX.
ENTRY(xen_failsafe_callback)
CFI_STARTPROC
pushl %eax
CFI_ADJUST_CFA_OFFSET 4
movl $1,%eax
1: mov 4(%esp),%ds
2: mov 8(%esp),%es
3: mov 12(%esp),%fs
4: mov 16(%esp),%gs
testl %eax,%eax
popl %eax
CFI_ADJUST_CFA_OFFSET -4
lea 16(%esp),%esp
CFI_ADJUST_CFA_OFFSET -16
jz 5f
addl $16,%esp
jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
5: pushl $0 # EAX == 0 => Category 1 (Bad segment)
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
jmp ret_from_exception
CFI_ENDPROC
.section .fixup,"ax"
6: xorl %eax,%eax
movl %eax,4(%esp)
jmp 1b
7: xorl %eax,%eax
movl %eax,8(%esp)
jmp 2b
8: xorl %eax,%eax
movl %eax,12(%esp)
jmp 3b
9: xorl %eax,%eax
movl %eax,16(%esp)
jmp 4b
.previous
.section __ex_table,"a"
.align 4
.long 1b,6b
.long 2b,7b
.long 3b,8b
.long 4b,9b
.previous
ENDPROC(xen_failsafe_callback)
#endif /* CONFIG_XEN */
.section .rodata,"a"
#include "syscall_table_32.S"
syscall_table_size=(.-sys_call_table)