mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2025-01-17 04:57:25 +07:00
b302e4b176
AVX512 BFLOAT16 instructions support 16-bit BFLOAT16 floating-point format (BF16) for deep learning optimization.

BF16 is a short version of 32-bit single-precision floating-point format (FP32) and has several advantages over 16-bit half-precision floating-point format (FP16). BF16 keeps FP32 accumulation after multiplication without loss of precision, offers more than enough range for deep learning training tasks, and doesn't need to handle hardware exceptions.

AVX512 BFLOAT16 instructions are enumerated in CPUID.7.1:EAX[bit 5] AVX512_BF16.

CPUID.7.1:EAX contains only feature bits. Reuse the currently empty word 12 as a pure features word to hold the feature bits including AVX512_BF16.

Detailed information of the CPUID bit and AVX512 BFLOAT16 instructions can be found in the latest Intel Architecture Instruction Set Extensions and Future Features Programming Reference.

[ bp: Check CPUID(7) subleaf validity before accessing subleaf 1. ]

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: "Chang S. Bae" <chang.seok.bae@intel.com>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nadav Amit <namit@vmware.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Pavel Tatashin <pasha.tatashin@oracle.com>
Cc: Peter Feiner <pfeiner@google.com>
Cc: Radim Krcmar <rkrcmar@redhat.com>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Cc: "Ravi V Shankar" <ravi.v.shankar@intel.com>
Cc: Robert Hoo <robert.hu@linux.intel.com>
Cc: "Sean J Christopherson" <sean.j.christopherson@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Lendacky <Thomas.Lendacky@amd.com>
Cc: x86 <x86@kernel.org>
Link: https://lkml.kernel.org/r/1560794416-217638-3-git-send-email-fenghua.yu@intel.com
235 lines
8.2 KiB
C
235 lines
8.2 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _ASM_X86_CPUFEATURE_H
|
|
#define _ASM_X86_CPUFEATURE_H
|
|
|
|
#include <asm/processor.h>
|
|
|
|
#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
|
|
|
|
#include <asm/asm.h>
|
|
#include <linux/bitops.h>
|
|
|
|
/*
 * Names for the 32-bit capability words, one enumerator per word.  Most
 * enumerators are named after the CPUID leaf/register the word mirrors.
 *
 * The values are purely positional (only the first one is explicit), so
 * new entries must not be inserted in the middle without renumbering
 * everything that depends on the word index.  There are 19 words, which
 * is the NCAPINTS value the *_MASK_BIT_SET() macros in this header are
 * written for.
 *
 * NOTE(review): presumably these index cpuinfo_x86.x86_capability[] and
 * the CPUID_LNX_* words hold Linux-defined flags - confirm against
 * cpufeatures.h and the callers.
 */
enum cpuid_leafs
{
	CPUID_1_EDX		= 0,	/* word  0 */
	CPUID_8000_0001_EDX,		/* word  1 */
	CPUID_8086_0001_EDX,		/* word  2 */
	CPUID_LNX_1,			/* word  3 */
	CPUID_1_ECX,			/* word  4 */
	CPUID_C000_0001_EDX,		/* word  5 */
	CPUID_8000_0001_ECX,		/* word  6 */
	CPUID_LNX_2,			/* word  7 */
	CPUID_LNX_3,			/* word  8 */
	CPUID_7_0_EBX,			/* word  9 */
	CPUID_D_1_EAX,			/* word 10 */
	CPUID_LNX_4,			/* word 11 */
	CPUID_7_1_EAX,			/* word 12 */
	CPUID_8000_0008_EBX,		/* word 13 */
	CPUID_6_EAX,			/* word 14 */
	CPUID_8000_000A_EDX,		/* word 15 */
	CPUID_7_ECX,			/* word 16 */
	CPUID_8000_0007_EBX,		/* word 17 */
	CPUID_7_EDX,			/* word 18 */
};
|
|
|
|
/*
 * Helpers for printing a feature flag; always use them as a pair:
 *
 *	printf("... " X86_CAP_FMT " ...", x86_cap_flag(flag));
 *
 * With CONFIG_X86_FEATURE_NAMES the flag is printed by name ("%s") and
 * x86_cap_flag() yields the x86_cap_flags[] entry for it.  Without it the
 * flag is printed as "word:bit" ("%d:%d") and x86_cap_flag() deliberately
 * expands to TWO comma-separated arguments: the word number (flag >> 5)
 * and the bit inside that word (flag & 31).
 */
#ifdef CONFIG_X86_FEATURE_NAMES
extern const char * const x86_cap_flags[NCAPINTS*32];
extern const char * const x86_power_flags[32];
#define X86_CAP_FMT "%s"
#define x86_cap_flag(flag) x86_cap_flags[flag]
#else
#define X86_CAP_FMT "%d:%d"
#define x86_cap_flag(flag) ((flag) >> 5), ((flag) & 31)
#endif
|
|
|
|
/*
|
|
* In order to save room, we index into this array by doing
|
|
* X86_BUG_<name> - NCAPINTS*32.
|
|
*/
|
|
extern const char * const x86_bug_flags[NBUGINTS*32];
|
|
|
|
/*
 * test_cpu_cap(c, bit) - run-time test of one capability bit.
 *
 * @c:   pointer to an object with an x86_capability member
 * @bit: feature bit number (word * 32 + bit within the word)
 *
 * Passes c->x86_capability to test_bit() as an unsigned long bitmap and
 * yields whatever test_bit() returns.  No compile-time shortcut is taken
 * here; see cpu_has() for that.
 */
#define test_cpu_cap(c, bit)						\
	 test_bit(bit, (unsigned long *)((c)->x86_capability))
|
|
|
|
/*
 * There are 32 bits/features in each mask word.  The high bits
 * (selected with (bit>>5)) give us the word number and the low 5
 * bits give us the bit/feature number inside the word.
 * (1UL<<((bit)&31)) gives us a mask for the feature_bit so we can
 * see if it is set in the mask word.
 *
 * maskname##word pastes the first two arguments into the name of the
 * mask constant to test (e.g. REQUIRED_MASK and 0 -> REQUIRED_MASK0), so
 * 'word' has to be a literal token, not an expression.
 *
 * Evaluates to 1 if 'bit' belongs to word number 'word' and is set in
 * that mask word, 0 otherwise.
 */
#define CHECK_BIT_IN_MASK_WORD(maskname, word, bit)	\
	(((bit)>>5)==(word) && (1UL<<((bit)&31) & maskname##word ))
|
|
|
|
/*
 * REQUIRED_MASK_BIT_SET(feature_bit) - is feature_bit set in the
 * REQUIRED_MASK<n> constant of the word it belongs to?
 *
 * The words are spelled out one by one (0..18) because the mask name is
 * built by token pasting in CHECK_BIT_IN_MASK_WORD(); at most one of the
 * terms can match for a given feature_bit.
 *
 * The trailing BUILD_BUG_ON_ZERO(NCAPINTS != 19) ties this list to the
 * number of capability words: when NCAPINTS changes, a term has to be
 * added/removed here and the constant updated.  REQUIRED_MASK_CHECK is
 * supplied by the header that defines the REQUIRED_MASK<n> constants.
 * NOTE(review): presumably BUILD_BUG_ON_ZERO() breaks the build on a
 * non-zero argument and evaluates to 0 otherwise - confirm.
 */
#define REQUIRED_MASK_BIT_SET(feature_bit)				\
	 ( CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 0, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 1, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 2, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 3, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 4, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 5, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 6, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 7, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 8, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 9, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 10, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 11, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 12, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 13, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 14, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) ||	\
	   REQUIRED_MASK_CHECK ||					\
	   BUILD_BUG_ON_ZERO(NCAPINTS != 19))
|
|
|
|
/*
 * DISABLED_MASK_BIT_SET(feature_bit) - is feature_bit set in the
 * DISABLED_MASK<n> constant of the word it belongs to?
 *
 * Same construction as REQUIRED_MASK_BIT_SET(), but against the
 * DISABLED_MASK<n> constants: one CHECK_BIT_IN_MASK_WORD() term per
 * capability word (0..18), followed by DISABLED_MASK_CHECK and a
 * BUILD_BUG_ON_ZERO(NCAPINTS != 19) guard that ties the list to the
 * number of words.  When NCAPINTS changes, update the list and the
 * constant together.
 */
#define DISABLED_MASK_BIT_SET(feature_bit)				\
	 ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 1, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 2, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 3, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 4, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 5, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 6, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 7, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 8, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 9, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 10, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 11, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 12, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 13, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 14, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) ||	\
	   CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) ||	\
	   DISABLED_MASK_CHECK ||					\
	   BUILD_BUG_ON_ZERO(NCAPINTS != 19))
|
|
|
|
/*
 * cpu_has(c, bit) - does the CPU described by @c have feature @bit?
 *
 * If @bit is a compile-time constant that is part of the required mask,
 * the whole expression folds to the constant 1 and @c is never looked
 * at.  In every other case the bit is tested at run time in
 * c->x86_capability via test_cpu_cap().
 */
#define cpu_has(c, bit)							\
	(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :	\
	 test_cpu_cap(c, bit))
|
|
|
|
/*
 * this_cpu_has(bit) - like cpu_has(), but for the per-CPU cpu_info.
 *
 * A compile-time constant @bit that is part of the required mask folds
 * to 1.  Otherwise the bit is tested with x86_this_cpu_test_bit() on
 * cpu_info.x86_capability, which is handed over as an
 * 'unsigned long __percpu *' bitmap.
 */
#define this_cpu_has(bit)						\
	(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :	\
	 x86_this_cpu_test_bit(bit,					\
		(unsigned long __percpu *)&cpu_info.x86_capability))
|
|
|
|
/*
 * This macro is for detection of features which need kernel
 * infrastructure to be used.  It may *not* directly test the CPU
 * itself.  Use the cpu_has() family if you want true runtime
 * testing of CPU features, like in hypervisor code where you are
 * supporting a possible guest feature where host support for it
 * is not relevant.
 *
 * A compile-time constant @bit that is set in the disabled mask folds
 * to the constant 0; everything else is answered by static_cpu_has().
 */
#define cpu_feature_enabled(bit)	\
	(__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit))
|
|
|
|
/* cpu_has() applied to boot_cpu_data. */
#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
|
|
|
|
/* Set feature @bit in c->x86_capability, treated as an unsigned long bitmap, via set_bit(). */
#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability))
|
|
|
|
/*
 * Out-of-line helpers defined elsewhere.
 * NOTE(review): going by the names, clear_cpu_cap() clears @bit for the
 * CPU described by @c and setup_clear_cpu_cap() is the boot/setup-time
 * variant - confirm against the definitions.
 */
extern void setup_clear_cpu_cap(unsigned int bit);
extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
|
|
|
|
/*
 * setup_force_cpu_cap(bit) - force feature @bit on.
 *
 * Sets the bit in boot_cpu_data's capability words and records it in the
 * cpu_caps_set bitmap as well.  @bit is evaluated twice, so do not pass
 * an expression with side effects.
 * NOTE(review): cpu_caps_set is presumably re-applied when capabilities
 * are (re)detected so that the forced bit sticks - confirm.
 */
#define setup_force_cpu_cap(bit) do { \
	set_cpu_cap(&boot_cpu_data, bit);	\
	set_bit(bit, (unsigned long *)cpu_caps_set);	\
} while (0)
|
|
|
|
/* Forcing a bug flag uses the same mechanism as forcing a feature flag. */
#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
|
|
|
|
#if defined(__clang__) && !defined(CONFIG_CC_HAS_ASM_GOTO)
|
|
|
|
/*
 * Workaround for the sake of BPF compilation which utilizes kernel
 * headers, but clang does not support ASM GOTO and fails the build.
 *
 * In this configuration static_cpu_has() is not patched at all; it is
 * simply boot_cpu_has().  Anything other than a BPF build (which
 * announces itself with -D __BPF_TRACING__) gets a compile-time warning.
 */
#ifndef __BPF_TRACING__
#warning "Compiler lacks ASM_GOTO support. Add -D __BPF_TRACING__ to your compiler arguments"
#endif

#define static_cpu_has(bit)            boot_cpu_has(bit)
|
|
|
|
#else
|
|
|
|
/*
|
|
* Static testing of CPU features. Used the same as boot_cpu_has(). It
|
|
* statically patches the target code for additional performance. Use
|
|
* static_cpu_has() only in fast paths, where every cycle counts. Which
|
|
* means that the boot_cpu_has() variant is already fast enough for the
|
|
* majority of cases and you should stick to using it as it is generally
|
|
* only two instructions: a RIP-relative MOV and a TEST.
|
|
*/
|
|
static __always_inline bool _static_cpu_has(u16 bit)
|
|
{
|
|
asm_volatile_goto("1: jmp 6f\n"
|
|
"2:\n"
|
|
".skip -(((5f-4f) - (2b-1b)) > 0) * "
|
|
"((5f-4f) - (2b-1b)),0x90\n"
|
|
"3:\n"
|
|
".section .altinstructions,\"a\"\n"
|
|
" .long 1b - .\n" /* src offset */
|
|
" .long 4f - .\n" /* repl offset */
|
|
" .word %P[always]\n" /* always replace */
|
|
" .byte 3b - 1b\n" /* src len */
|
|
" .byte 5f - 4f\n" /* repl len */
|
|
" .byte 3b - 2b\n" /* pad len */
|
|
".previous\n"
|
|
".section .altinstr_replacement,\"ax\"\n"
|
|
"4: jmp %l[t_no]\n"
|
|
"5:\n"
|
|
".previous\n"
|
|
".section .altinstructions,\"a\"\n"
|
|
" .long 1b - .\n" /* src offset */
|
|
" .long 0\n" /* no replacement */
|
|
" .word %P[feature]\n" /* feature bit */
|
|
" .byte 3b - 1b\n" /* src len */
|
|
" .byte 0\n" /* repl len */
|
|
" .byte 0\n" /* pad len */
|
|
".previous\n"
|
|
".section .altinstr_aux,\"ax\"\n"
|
|
"6:\n"
|
|
" testb %[bitnum],%[cap_byte]\n"
|
|
" jnz %l[t_yes]\n"
|
|
" jmp %l[t_no]\n"
|
|
".previous\n"
|
|
: : [feature] "i" (bit),
|
|
[always] "i" (X86_FEATURE_ALWAYS),
|
|
[bitnum] "i" (1 << (bit & 7)),
|
|
[cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
|
|
: : t_yes, t_no);
|
|
t_yes:
|
|
return true;
|
|
t_no:
|
|
return false;
|
|
}
|
|
|
|
/*
 * static_cpu_has(bit) - patched test of feature @bit on the boot CPU.
 *
 * When boot_cpu_has(bit) already folds to a compile-time constant, use
 * that constant and emit no patch site at all.  Otherwise fall back to
 * the patched jump in _static_cpu_has().
 */
#define static_cpu_has(bit)					\
(								\
	__builtin_constant_p(boot_cpu_has(bit)) ?		\
		boot_cpu_has(bit) :				\
		_static_cpu_has(bit)				\
)
|
|
#endif
|
|
|
|
/*
 * Bug flags are handled by the same helpers as feature flags; these are
 * plain aliases for the per-CPU test/set/clear operations.
 */
#define cpu_has_bug(c, bit)		cpu_has(c, (bit))
#define set_cpu_bug(c, bit)		set_cpu_cap(c, (bit))
#define clear_cpu_bug(c, bit)		clear_cpu_cap(c, (bit))
|
|
|
|
/* Bug-flag aliases for the boot CPU: patched test, plain test, and set. */
#define static_cpu_has_bug(bit)		static_cpu_has((bit))
#define boot_cpu_has_bug(bit)		cpu_has_bug(&boot_cpu_data, (bit))
#define boot_cpu_set_bug(bit)		set_cpu_cap(&boot_cpu_data, (bit))
|
|
|
|
/*
 * Total number of feature bits: 32 per capability word.
 * cpu_have_feature is an alias for boot_cpu_has.
 * NOTE(review): both look like the arch hooks expected by generic
 * CPU-feature code - confirm against the consumer.
 */
#define MAX_CPU_FEATURES		(NCAPINTS * 32)
#define cpu_have_feature		boot_cpu_has
|
|
|
|
/*
 * printf-style format and matching argument list that describe the boot
 * CPU as "x86,ven<vendor>fam<family>mod<model>", each field printed as
 * four upper-case hex digits.  CPU_FEATURE_TYPEVAL expands to three
 * comma-separated arguments and must be used together with
 * CPU_FEATURE_TYPEFMT.
 */
#define CPU_FEATURE_TYPEFMT		"x86,ven%04Xfam%04Xmod%04X"
#define CPU_FEATURE_TYPEVAL		boot_cpu_data.x86_vendor, boot_cpu_data.x86, \
					boot_cpu_data.x86_model
|
|
|
|
#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
|
|
#endif /* _ASM_X86_CPUFEATURE_H */
|