mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2025-01-21 04:54:24 +07:00
3193c0836f
On x86-64, with CONFIG_RETPOLINE=n, GCC's "global common subexpression
elimination" optimization results in ___bpf_prog_run()'s jumptable code
changing from this:
select_insn:
jmp *jumptable(, %rax, 8)
...
ALU64_ADD_X:
...
jmp *jumptable(, %rax, 8)
ALU_ADD_X:
...
jmp *jumptable(, %rax, 8)
to this:
select_insn:
mov jumptable, %r12
jmp *(%r12, %rax, 8)
...
ALU64_ADD_X:
...
jmp *(%r12, %rax, 8)
ALU_ADD_X:
...
jmp *(%r12, %rax, 8)
The jumptable address is placed in a register once, at the beginning of
the function. The function execution can then go through multiple
indirect jumps which rely on that same register value. This has a few
issues:
1) Objtool isn't smart enough to be able to track such a register value
across multiple recursive indirect jumps through the jump table.
2) With CONFIG_RETPOLINE enabled, this optimization actually results in
a small slowdown. I measured a ~4.7% slowdown in the test_bpf
"tcpdump port 22" selftest.
This slowdown is actually predicted by the GCC manual:
Note: When compiling a program using computed gotos, a GCC
extension, you may get better run-time performance if you
disable the global common subexpression elimination pass by
adding -fno-gcse to the command line.
So just disable the optimization for this function.
Fixes: e55a73251d ("bpf: Fix ORC unwinding in non-JIT BPF code")
Reported-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/30c3ca29ba037afcbd860a8672eef0021addf9fe.1563413318.git.jpoimboe@redhat.com
175 lines
5.6 KiB
C
175 lines
5.6 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
/*
 * Refuse direct inclusion: this header may only be processed once
 * __LINUX_COMPILER_TYPES_H has been defined by the header that includes it.
 */
#ifndef __LINUX_COMPILER_TYPES_H
#error "Please don't include <linux/compiler-gcc.h> directly, include <linux/compiler.h> instead."
#endif
|
|
|
|
/*
 * Common definitions for all gcc versions go here.
 */

/*
 * GCC_VERSION folds the compiler's major, minor and patchlevel numbers into
 * a single integer (major * 10000 + minor * 100 + patchlevel), so a version
 * test is one comparison; e.g. GCC 4.6.0 is 40600.
 */
#define GCC_VERSION (__GNUC__ * 10000		\
		     + __GNUC_MINOR__ * 100	\
		     + __GNUC_PATCHLEVEL__)
|
|
|
|
/* Stop the build on GCC releases older than 4.6.0 (GCC_VERSION 40600). */
#if GCC_VERSION < 40600
# error Sorry, your compiler is too old - please upgrade it.
#endif
|
|
|
|
/*
 * Optimization barrier: an empty asm statement with a "memory" clobber.
 *
 * The "volatile" is due to gcc bugs.
 */
#define barrier() __asm__ __volatile__("": : :"memory")
|
|
/*
 * This version is meant, e.g., to prevent dead-store elimination on @ptr,
 * where gcc and llvm may behave differently when otherwise using a normal
 * barrier(): while gcc behavior gets along with a normal barrier(), llvm
 * needs an explicit input variable to be assumed clobbered.
 *
 * The issue is as follows: while the inline asm might access any memory it
 * wants, the compiler could have fit all of @ptr into registers instead, and
 * since @ptr never escaped from that, it proved that the inline asm wasn't
 * touching any of it. This version works well with both compilers, i.e.
 * we're telling the compiler that the inline asm absolutely may see the
 * contents of @ptr. See also: https://llvm.org/bugs/show_bug.cgi?id=15495
 */
#define barrier_data(ptr) __asm__ __volatile__("": :"r"(ptr) :"memory")
|
|
|
|
/*
 * This macro obfuscates arithmetic on a variable address so that gcc
 * shouldn't recognize the original var, and make assumptions about it.
 *
 * This is needed because the C standard makes it undefined to do
 * pointer arithmetic on "objects" outside their boundaries and the
 * gcc optimizers assume this is the case. In particular they
 * assume such arithmetic does not wrap.
 *
 * A miscompilation has been observed because of this on PPC.
 * To work around it we hide the relationship of the pointer and the object
 * using this macro.
 *
 * Versions of the ppc64 compiler before 4.1 had a bug where use of
 * RELOC_HIDE could trash r30. The bug can be worked around by changing
 * the inline assembly constraint from =g to =r, in this particular
 * case either is valid.
 *
 * @ptr: pointer whose origin is to be hidden from the optimizer
 * @off: offset added to @ptr's integer value (i.e. in bytes, not elements)
 *
 * Evaluates to @ptr + @off bytes, cast back to the type of @ptr.
 */
#define RELOC_HIDE(ptr, off)						\
({									\
	unsigned long __ptr;						\
	/* Empty asm: copy ptr into __ptr through an opaque register. */ \
	__asm__ ("" : "=r"(__ptr) : "0"(ptr));				\
	(typeof(ptr)) (__ptr + (off));					\
})
|
|
|
|
/*
 * A trick to suppress uninitialized variable warning without generating any
 * code: the declarator is expanded to a self-initialization ("x = x").
 */
#define uninitialized_var(x) x = x
|
|
|
|
/*
 * Only defined for CONFIG_RETPOLINE builds: function attribute requesting
 * indirect_branch("keep") for the annotated function.
 * NOTE(review): presumably this exempts the function from retpoline
 * conversion of its indirect branches - confirm against GCC's
 * -mindirect-branch documentation.
 */
#ifdef CONFIG_RETPOLINE
#define __noretpoline __attribute__((__indirect_branch__("keep")))
#endif
|
|
|
|
/*
 * Build an identifier by pasting "__UNIQUE_ID_", @prefix and the current
 * value of __COUNTER__ together using __PASTE (defined elsewhere).
 */
#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
|
|
|
|
/* Object size of @obj as reported by __builtin_object_size() with type 0. */
#define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
|
|
|
|
/*
 * Function attributes that attach @message as a compile-time diagnostic
 * (warning or error, respectively) to the annotated declaration.
 */
#define __compiletime_warning(message) __attribute__((__warning__(message)))
#define __compiletime_error(message) __attribute__((__error__(message)))
|
|
|
|
/*
 * The latent_entropy attribute is only emitted when LATENT_ENTROPY_PLUGIN is
 * defined and the code is not being processed by sparse (__CHECKER__).
 */
#if defined(LATENT_ENTROPY_PLUGIN) && !defined(__CHECKER__)
#define __latent_entropy __attribute__((latent_entropy))
#endif
|
|
|
|
/*
 * calling noreturn functions, __builtin_unreachable() and __builtin_trap()
 * confuse the stack allocation in gcc, leading to overly large stack
 * frames, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82365
 *
 * Adding an empty inline assembly before it works around the problem
 */
#define barrier_before_unreachable() asm volatile("")
|
|
|
|
/*
 * Mark a position in code as unreachable.  This can be used to
 * suppress control flow warnings after asm blocks that transfer
 * control elsewhere.
 *
 * Expands, in order, to annotate_unreachable() (defined elsewhere),
 * barrier_before_unreachable() and __builtin_unreachable().
 */
#define unreachable() \
	do {					\
		annotate_unreachable();		\
		barrier_before_unreachable();	\
		__builtin_unreachable();	\
	} while (0)
|
|
|
|
/*
 * Structure layout randomization attributes; only defined when
 * RANDSTRUCT_PLUGIN is set and the code is not being processed by sparse
 * (__CHECKER__).
 */
#if defined(RANDSTRUCT_PLUGIN) && !defined(__CHECKER__)
#define __randomize_layout __attribute__((randomize_layout))
#define __no_randomize_layout __attribute__((no_randomize_layout))
/* This anon struct can add padding, so only enable it under randstruct. */
#define randomized_struct_fields_start	struct {
#define randomized_struct_fields_end	} __randomize_layout;
#endif
|
|
|
|
/*
 * GCC 'asm goto' miscompiles certain code sequences:
 *
 *   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670
 *
 * Work it around via a compiler barrier quirk suggested by Jakub Jelinek:
 * every asm goto is immediately followed by an empty asm statement.
 *
 * (asm goto is automatically volatile - the naming reflects this.)
 */
#define asm_volatile_goto(x...)	do { asm goto(x); asm (""); } while (0)
|
|
|
|
/*
 * sparse (__CHECKER__) pretends to be gcc, but can't do constant
 * folding in __builtin_bswap*() (yet), so don't set these for it.
 *
 * The 32- and 64-bit flags are set whenever the architecture opts in via
 * CONFIG_ARCH_USE_BUILTIN_BSWAP; the 16-bit flag additionally requires
 * GCC 4.8.0 or newer.
 */
#if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP) && !defined(__CHECKER__)
#define __HAVE_BUILTIN_BSWAP32__
#define __HAVE_BUILTIN_BSWAP64__
#if GCC_VERSION >= 40800
#define __HAVE_BUILTIN_BSWAP16__
#endif
#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP && !__CHECKER__ */
|
|
|
|
/*
 * Pick KASAN_ABI_VERSION from the GCC release: 5 for >= 7.0.0, 4 for
 * >= 5.0.0, 3 for >= 4.9.2.  It is left undefined for older compilers.
 */
#if GCC_VERSION >= 70000
#define KASAN_ABI_VERSION 5
#elif GCC_VERSION >= 50000
#define KASAN_ABI_VERSION 4
#elif GCC_VERSION >= 40902
#define KASAN_ABI_VERSION 3
#endif
|
|
|
|
/*
 * Expands to the no_sanitize_address function attribute when the compiler
 * reports support for it through __has_attribute(), and to nothing otherwise.
 */
#if __has_attribute(__no_sanitize_address__)
#define __no_sanitize_address __attribute__((no_sanitize_address))
#else
#define __no_sanitize_address
#endif
|
|
|
|
/* Set to 1 on GCC 5.1.0 and newer; left undefined on older compilers. */
#if GCC_VERSION >= 50100
#define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1
#endif
|
|
|
|
/*
 * Turn individual warnings and errors on and off locally, depending
 * on version.
 *
 * @version:  pasted onto "__diag_GCC_" to select the per-version macro
 * @severity: pasted onto "__diag_GCC_" to select the pragma severity keyword
 * @s:        text placed after the severity keyword
 */
#define __diag_GCC(version, severity, s) \
	__diag_GCC_ ## version(__diag_GCC_ ## severity s)
|
|
|
|
/*
 * Severity used in pragma directives: each macro maps a severity name to the
 * keyword it expands to.
 */
#define __diag_GCC_ignore	ignored
#define __diag_GCC_warn		warning
#define __diag_GCC_error		error
|
|
|
|
/*
 * __diag_str1() stringifies its argument as written; __diag_str() expands
 * macros in the argument first.  __diag(s) emits
 * _Pragma("GCC diagnostic <s>").
 */
#define __diag_str1(s)		#s
#define __diag_str(s)		__diag_str1(s)
#define __diag(s)		_Pragma(__diag_str(GCC diagnostic s))
|
|
|
|
/*
 * __diag_GCC_8(s) forwards to __diag(s) on GCC 8.0.0 and newer and expands
 * to nothing on older compilers.
 */
#if GCC_VERSION >= 80000
#define __diag_GCC_8(s)		__diag(s)
#else
#define __diag_GCC_8(s)
#endif
|
|
|
|
/*
 * Function attribute that builds the annotated function with -fno-gcse,
 * i.e. with GCC's global common subexpression elimination pass disabled.
 */
#define __no_fgcse __attribute__((optimize("-fno-gcse")))
|