2008-07-30 00:29:19 +07:00
|
|
|
/*
|
|
|
|
* xsave/xrstor support.
|
|
|
|
*
|
|
|
|
* Author: Suresh Siddha <suresh.b.siddha@intel.com>
|
|
|
|
*/
|
|
|
|
#include <linux/compat.h>
|
2014-05-30 01:12:43 +07:00
|
|
|
#include <linux/cpu.h>
|
x86/pkeys: Allocation/free syscalls
This patch adds two new system calls:
int pkey_alloc(unsigned long flags, unsigned long init_access_rights)
int pkey_free(int pkey);
These implement an "allocator" for the protection keys
themselves, which can be thought of as analogous to the allocator
that the kernel has for file descriptors. The kernel tracks
which numbers are in use, and only allows operations on keys that
are valid. A key which was not obtained by pkey_alloc() may not,
for instance, be passed to pkey_mprotect().
These system calls are also very important given the kernel's use
of pkeys to implement execute-only support. These help ensure
that userspace can never assume that it has control of a key
unless it first asks the kernel. The kernel does not promise to
preserve PKRU (right register) contents except for allocated
pkeys.
The 'init_access_rights' argument to pkey_alloc() specifies the
rights that will be established for the returned pkey. For
instance:
pkey = pkey_alloc(flags, PKEY_DENY_WRITE);
will allocate 'pkey', but also sets the bits in PKRU[1] such that
writing to 'pkey' is already denied.
The kernel does not prevent pkey_free() from successfully freeing
in-use pkeys (those still assigned to a memory range by
pkey_mprotect()). It would be expensive to implement the checks
for this, so we instead say, "Just don't do it" since sane
software will never do it anyway.
Any piece of userspace calling pkey_alloc() needs to be prepared
for it to fail. Why? pkey_alloc() returns the same error code
(ENOSPC) when there are no pkeys and when pkeys are unsupported.
They can be unsupported for a whole host of reasons, so apps must
be prepared for this. Also, libraries or LD_PRELOADs might steal
keys before an application gets access to them.
This allocation mechanism could be implemented in userspace.
Even if we did it in userspace, we would still need additional
user/kernel interfaces to tell userspace which keys are being
used by the kernel internally (such as for execute-only
mappings). Having the kernel provide this facility completely
removes the need for these additional interfaces, or having an
implementation of this in userspace at all.
Note that we have to make changes to all of the architectures
that do not use mman-common.h because we use the new
PKEY_DENY_ACCESS/WRITE macros in arch-independent code.
1. PKRU is the Protection Key Rights User register. It is a
usermode-accessible register that controls whether writes
and/or access to each individual pkey is allowed or denied.
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: linux-arch@vger.kernel.org
Cc: Dave Hansen <dave@sr71.net>
Cc: arnd@arndb.de
Cc: linux-api@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: luto@kernel.org
Cc: akpm@linux-foundation.org
Cc: torvalds@linux-foundation.org
Link: http://lkml.kernel.org/r/20160729163015.444FE75F@viggo.jf.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2016-07-29 23:30:15 +07:00
|
|
|
#include <linux/mman.h>
|
2016-02-13 04:02:36 +07:00
|
|
|
#include <linux/pkeys.h>
|
2015-04-30 13:53:18 +07:00
|
|
|
|
2015-04-24 07:46:00 +07:00
|
|
|
#include <asm/fpu/api.h>
|
2015-04-24 07:54:44 +07:00
|
|
|
#include <asm/fpu/internal.h>
|
2015-04-30 13:45:02 +07:00
|
|
|
#include <asm/fpu/signal.h>
|
2015-04-30 13:53:18 +07:00
|
|
|
#include <asm/fpu/regset.h>
|
2016-06-18 03:07:17 +07:00
|
|
|
#include <asm/fpu/xstate.h>
|
2015-04-30 17:45:38 +07:00
|
|
|
|
2014-10-25 05:58:07 +07:00
|
|
|
#include <asm/tlbflush.h>
|
2008-07-30 00:29:19 +07:00
|
|
|
|
2016-02-13 04:01:58 +07:00
|
|
|
/*
 * Human-readable names of the xstate features, indexed by xfeature
 * number.  The last entry is a catch-all that cpu_has_xfeatures()
 * falls back to for feature numbers beyond the named ones.
 *
 * Although we spell it out in here, the Processor Trace
 * xfeature is completely unused.  We use other mechanisms
 * to save/restore PT state in Linux.
 *
 * The table is never modified, so both the strings and the
 * pointers to them are const.
 */
static const char * const xfeature_names[] =
{
	"x87 floating point registers"	,
	"SSE registers"			,
	"AVX registers"			,
	"MPX bounds registers"		,
	"MPX CSR"			,
	"AVX-512 opmask"		,
	"AVX-512 Hi256"			,
	"AVX-512 ZMM_Hi256"		,
	"Processor Trace (unused)"	,
	"Protection Keys User registers",
	"unknown xstate feature"	,
};
|
|
|
|
|
2008-07-30 00:29:19 +07:00
|
|
|
/*
|
2015-04-24 14:20:33 +07:00
|
|
|
* Mask of xstate features supported by the CPU and the kernel:
|
2008-07-30 00:29:19 +07:00
|
|
|
*/
|
2015-04-28 13:51:17 +07:00
|
|
|
u64 xfeatures_mask __read_mostly;
|
2008-07-30 00:29:19 +07:00
|
|
|
|
2015-09-03 06:31:27 +07:00
|
|
|
static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
|
|
|
|
static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
|
2015-04-24 14:20:33 +07:00
|
|
|
static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8];
|
2015-04-24 14:23:59 +07:00
|
|
|
|
2016-05-21 00:47:05 +07:00
|
|
|
/*
|
|
|
|
* The XSAVE area of kernel can be in standard or compacted format;
|
|
|
|
* it is always in standard format for user mode. This is the user
|
|
|
|
* mode standard format size used for signal and ptrace frames.
|
|
|
|
*/
|
|
|
|
unsigned int fpu_user_xstate_size;
|
|
|
|
|
2015-09-03 06:31:24 +07:00
|
|
|
/*
|
|
|
|
* Clear all of the X86_FEATURE_* bits that are unavailable
|
|
|
|
* when the CPU has no XSAVE support.
|
|
|
|
*/
|
|
|
|
void fpu__xstate_clear_all_cpu_caps(void)
|
|
|
|
{
|
|
|
|
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
|
|
|
|
setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
|
|
|
|
setup_clear_cpu_cap(X86_FEATURE_XSAVEC);
|
|
|
|
setup_clear_cpu_cap(X86_FEATURE_XSAVES);
|
|
|
|
setup_clear_cpu_cap(X86_FEATURE_AVX);
|
|
|
|
setup_clear_cpu_cap(X86_FEATURE_AVX2);
|
|
|
|
setup_clear_cpu_cap(X86_FEATURE_AVX512F);
|
|
|
|
setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
|
|
|
|
setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
|
|
|
|
setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
|
2016-03-11 10:38:18 +07:00
|
|
|
setup_clear_cpu_cap(X86_FEATURE_AVX512DQ);
|
|
|
|
setup_clear_cpu_cap(X86_FEATURE_AVX512BW);
|
|
|
|
setup_clear_cpu_cap(X86_FEATURE_AVX512VL);
|
2015-09-03 06:31:24 +07:00
|
|
|
setup_clear_cpu_cap(X86_FEATURE_MPX);
|
2016-01-07 05:24:52 +07:00
|
|
|
setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
|
2016-02-13 04:02:04 +07:00
|
|
|
setup_clear_cpu_cap(X86_FEATURE_PKU);
|
2016-10-18 22:01:11 +07:00
|
|
|
setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW);
|
|
|
|
setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS);
|
2015-09-03 06:31:24 +07:00
|
|
|
}
|
|
|
|
|
2015-04-28 13:51:17 +07:00
|
|
|
/*
|
|
|
|
* Return whether the system supports a given xfeature.
|
|
|
|
*
|
|
|
|
* Also return the name of the (most advanced) feature that the caller requested:
|
|
|
|
*/
|
|
|
|
int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
|
|
|
|
{
|
|
|
|
u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask;
|
|
|
|
|
|
|
|
if (unlikely(feature_name)) {
|
|
|
|
long xfeature_idx, max_idx;
|
|
|
|
u64 xfeatures_print;
|
|
|
|
/*
|
|
|
|
* So we use FLS here to be able to print the most advanced
|
|
|
|
* feature that was requested but is missing. So if a driver
|
2015-09-03 06:31:26 +07:00
|
|
|
* asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
|
2015-04-28 13:51:17 +07:00
|
|
|
* missing AVX feature - this is the most informative message
|
|
|
|
* to users:
|
|
|
|
*/
|
|
|
|
if (xfeatures_missing)
|
|
|
|
xfeatures_print = xfeatures_missing;
|
|
|
|
else
|
|
|
|
xfeatures_print = xfeatures_needed;
|
|
|
|
|
|
|
|
xfeature_idx = fls64(xfeatures_print)-1;
|
|
|
|
max_idx = ARRAY_SIZE(xfeature_names)-1;
|
|
|
|
xfeature_idx = min(xfeature_idx, max_idx);
|
|
|
|
|
|
|
|
*feature_name = xfeature_names[xfeature_idx];
|
|
|
|
}
|
|
|
|
|
|
|
|
if (xfeatures_missing)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
|
|
|
|
|
2016-06-18 03:07:16 +07:00
|
|
|
static int xfeature_is_supervisor(int xfeature_nr)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* We currently do not support supervisor states, but if
|
|
|
|
* we did, we could find out like this.
|
|
|
|
*
|
|
|
|
* SDM says: If state component 'i' is a user state component,
|
|
|
|
* ECX[0] return 0; if state component i is a supervisor
|
|
|
|
* state component, ECX[0] returns 1.
|
|
|
|
*/
|
|
|
|
u32 eax, ebx, ecx, edx;
|
|
|
|
|
|
|
|
cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
|
|
|
|
return !!(ecx & 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* A state component is a user state exactly when it is not a supervisor state. */
static int xfeature_is_user(int xfeature_nr)
{
	if (xfeature_is_supervisor(xfeature_nr))
		return 0;

	return 1;
}
|
|
|
|
|
2010-07-20 06:05:49 +07:00
|
|
|
/*
|
2015-05-01 14:59:04 +07:00
|
|
|
* When executing XSAVEOPT (or other optimized XSAVE instructions), if
|
|
|
|
* a processor implementation detects that an FPU state component is still
|
|
|
|
* (or is again) in its initialized state, it may clear the corresponding
|
|
|
|
* bit in the header.xfeatures field, and can skip the writeout of registers
|
|
|
|
* to the corresponding memory layout.
|
2015-04-24 16:32:59 +07:00
|
|
|
*
|
|
|
|
* This means that when the bit is zero, the state component might still contain
|
|
|
|
* some previous - non-initialized register state.
|
|
|
|
*
|
|
|
|
* Before writing xstate information to user-space we sanitize those components,
|
|
|
|
* to always ensure that the memory layout of a feature will be in the init state
|
|
|
|
* if the corresponding header bit is zero. This is to ensure that user-space doesn't
|
|
|
|
* see some stale state in the memory layout during signal handling, debugging etc.
|
2010-07-20 06:05:49 +07:00
|
|
|
*/
|
2015-04-28 16:25:02 +07:00
|
|
|
void fpstate_sanitize_xstate(struct fpu *fpu)
|
2010-07-20 06:05:49 +07:00
|
|
|
{
|
2015-04-30 22:15:32 +07:00
|
|
|
struct fxregs_state *fx = &fpu->state.fxsave;
|
2015-04-24 16:32:59 +07:00
|
|
|
int feature_bit;
|
2015-04-24 15:19:47 +07:00
|
|
|
u64 xfeatures;
|
2010-07-20 06:05:49 +07:00
|
|
|
|
2015-04-28 16:17:55 +07:00
|
|
|
if (!use_xsaveopt())
|
2010-07-20 06:05:49 +07:00
|
|
|
return;
|
|
|
|
|
2015-04-28 16:25:02 +07:00
|
|
|
xfeatures = fpu->state.xsave.header.xfeatures;
|
2010-07-20 06:05:49 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* None of the feature bits are in init state. So nothing else
|
2011-03-18 02:24:16 +07:00
|
|
|
* to do for us, as the memory layout is up to date.
|
2010-07-20 06:05:49 +07:00
|
|
|
*/
|
2015-04-24 15:19:47 +07:00
|
|
|
if ((xfeatures & xfeatures_mask) == xfeatures_mask)
|
2010-07-20 06:05:49 +07:00
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* FP is in init state
|
|
|
|
*/
|
2015-09-03 06:31:26 +07:00
|
|
|
if (!(xfeatures & XFEATURE_MASK_FP)) {
|
2010-07-20 06:05:49 +07:00
|
|
|
fx->cwd = 0x37f;
|
|
|
|
fx->swd = 0;
|
|
|
|
fx->twd = 0;
|
|
|
|
fx->fop = 0;
|
|
|
|
fx->rip = 0;
|
|
|
|
fx->rdp = 0;
|
|
|
|
memset(&fx->st_space[0], 0, 128);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* SSE is in init state
|
|
|
|
*/
|
2015-09-03 06:31:26 +07:00
|
|
|
if (!(xfeatures & XFEATURE_MASK_SSE))
|
2010-07-20 06:05:49 +07:00
|
|
|
memset(&fx->xmm_space[0], 0, 256);
|
|
|
|
|
2015-04-24 16:32:59 +07:00
|
|
|
/*
|
|
|
|
* First two features are FPU and SSE, which above we handled
|
|
|
|
* in a special way already:
|
|
|
|
*/
|
|
|
|
feature_bit = 0x2;
|
2015-04-24 15:19:47 +07:00
|
|
|
xfeatures = (xfeatures_mask & ~xfeatures) >> 2;
|
2010-07-20 06:05:49 +07:00
|
|
|
|
|
|
|
/*
|
2015-04-24 16:32:59 +07:00
|
|
|
* Update all the remaining memory layouts according to their
|
|
|
|
* standard xstate layout, if their header bit is in the init
|
|
|
|
* state:
|
2010-07-20 06:05:49 +07:00
|
|
|
*/
|
2015-04-24 15:19:47 +07:00
|
|
|
while (xfeatures) {
|
|
|
|
if (xfeatures & 0x1) {
|
2016-05-21 00:47:05 +07:00
|
|
|
int offset = xstate_comp_offsets[feature_bit];
|
2010-07-20 06:05:49 +07:00
|
|
|
int size = xstate_sizes[feature_bit];
|
|
|
|
|
2015-04-24 16:32:59 +07:00
|
|
|
memcpy((void *)fx + offset,
|
2015-04-30 16:07:06 +07:00
|
|
|
(void *)&init_fpstate.xsave + offset,
|
2010-07-20 06:05:49 +07:00
|
|
|
size);
|
|
|
|
}
|
|
|
|
|
2015-04-24 15:19:47 +07:00
|
|
|
xfeatures >>= 1;
|
2010-07-20 06:05:49 +07:00
|
|
|
feature_bit++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-07-30 00:29:19 +07:00
|
|
|
/*
|
2015-04-25 11:26:36 +07:00
|
|
|
* Enable the extended processor state save/restore feature.
|
|
|
|
* Called once per CPU onlining.
|
2008-07-30 00:29:19 +07:00
|
|
|
*/
|
2015-04-25 11:26:36 +07:00
|
|
|
void fpu__init_cpu_xstate(void)
|
2008-07-30 00:29:19 +07:00
|
|
|
{
|
2016-04-05 03:25:02 +07:00
|
|
|
if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask)
|
2015-04-25 11:26:36 +07:00
|
|
|
return;
|
2016-07-11 23:18:57 +07:00
|
|
|
/*
|
|
|
|
* Make it clear that XSAVES supervisor states are not yet
|
|
|
|
* implemented should anyone expect it to work by changing
|
|
|
|
* bits in XFEATURE_MASK_* macros and XCR0.
|
|
|
|
*/
|
|
|
|
WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR),
|
|
|
|
"x86/fpu: XSAVES supervisor states are not yet implemented.\n");
|
|
|
|
|
|
|
|
xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR;
|
2015-04-25 11:26:36 +07:00
|
|
|
|
2014-10-25 05:58:07 +07:00
|
|
|
cr4_set_bits(X86_CR4_OSXSAVE);
|
2015-04-24 14:20:33 +07:00
|
|
|
xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
|
2008-07-30 00:29:19 +07:00
|
|
|
}
|
|
|
|
|
2015-09-03 06:31:30 +07:00
|
|
|
/*
|
|
|
|
* Note that in the future we will likely need a pair of
|
|
|
|
* functions here: one for user xstates and the other for
|
|
|
|
* system xstates. For now, they are the same.
|
|
|
|
*/
|
|
|
|
static int xfeature_enabled(enum xfeature xfeature)
|
|
|
|
{
|
|
|
|
return !!(xfeatures_mask & (1UL << xfeature));
|
|
|
|
}
|
|
|
|
|
2010-07-20 06:05:48 +07:00
|
|
|
/*
|
x86/fpu/xstate: Don't assume the first zero xfeatures zero bit means the end
The current xstate code in setup_xstate_features() assumes that
the first zero bit means the end of xfeatures - but that is not
so, the SDM clearly states that an arbitrary set of xfeatures
might be enabled - and it is also clear from the description
of the compaction feature that holes are possible:
"13-6 Vol. 1MANAGING STATE USING THE XSAVE FEATURE SET
[...]
Compacted format. Each state component i (i ≥ 2) is located at a byte
offset from the base address of the XSAVE area based on the XCOMP_BV
field in the XSAVE header:
— If XCOMP_BV[i] = 0, state component i is not in the XSAVE area.
— If XCOMP_BV[i] = 1, the following items apply:
• If XCOMP_BV[j] = 0 for every j, 2 ≤ j < i, state component i is
located at a byte offset 576 from the base address of the XSAVE
area. (This item applies if i is the first bit set in bits 62:2 of
the XCOMP_BV; it implies that state component i is located at the
beginning of the extended region.)
• Otherwise, let j, 2 ≤ j < i, be the greatest value such that
XCOMP_BV[j] = 1. Then state component i is located at a byte offset
X from the location of state component j, where X is the number of
bytes required for state component j as enumerated in
CPUID.(EAX=0DH,ECX=j):EAX. (This item implies that state component i
immediately follows the preceding state component whose bit is set
in XCOMP_BV.)"
So don't assume that the first zero xfeatures bit means the end of
all xfeatures - iterate through all of them.
I'm not aware of hardware that triggers this currently.
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-05-04 12:37:47 +07:00
|
|
|
* Record the offsets and sizes of various xstates contained
|
|
|
|
* in the XSAVE state memory layout.
|
2010-07-20 06:05:48 +07:00
|
|
|
*/
|
2010-07-22 00:03:56 +07:00
|
|
|
static void __init setup_xstate_features(void)
|
2010-07-20 06:05:48 +07:00
|
|
|
{
|
2015-09-03 06:31:28 +07:00
|
|
|
u32 eax, ebx, ecx, edx, i;
|
2015-09-03 06:31:30 +07:00
|
|
|
/* start at the beginnning of the "extended state" */
|
|
|
|
unsigned int last_good_offset = offsetof(struct xregs_state,
|
|
|
|
extended_state_area);
|
2016-06-18 03:07:19 +07:00
|
|
|
/*
|
|
|
|
* The FP xstates and SSE xstates are legacy states. They are always
|
|
|
|
* in the fixed offsets in the xsave area in either compacted form
|
|
|
|
* or standard form.
|
|
|
|
*/
|
|
|
|
xstate_offsets[0] = 0;
|
|
|
|
xstate_sizes[0] = offsetof(struct fxregs_state, xmm_space);
|
|
|
|
xstate_offsets[1] = xstate_sizes[0];
|
|
|
|
xstate_sizes[1] = FIELD_SIZEOF(struct fxregs_state, xmm_space);
|
2010-07-20 06:05:48 +07:00
|
|
|
|
2015-09-03 06:31:28 +07:00
|
|
|
for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
|
2015-09-03 06:31:30 +07:00
|
|
|
if (!xfeature_enabled(i))
|
|
|
|
continue;
|
2010-07-20 06:05:48 +07:00
|
|
|
|
2015-09-03 06:31:30 +07:00
|
|
|
cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
|
2016-06-18 03:07:16 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If an xfeature is supervisor state, the offset
|
|
|
|
* in EBX is invalid. We leave it to -1.
|
|
|
|
*/
|
|
|
|
if (xfeature_is_user(i))
|
|
|
|
xstate_offsets[i] = ebx;
|
|
|
|
|
2015-09-03 06:31:28 +07:00
|
|
|
xstate_sizes[i] = eax;
|
2015-09-03 06:31:30 +07:00
|
|
|
/*
|
|
|
|
* In our xstate size checks, we assume that the
|
|
|
|
* highest-numbered xstate feature has the
|
|
|
|
* highest offset in the buffer. Ensure it does.
|
|
|
|
*/
|
|
|
|
WARN_ONCE(last_good_offset > xstate_offsets[i],
|
|
|
|
"x86/fpu: misordered xstate at %d\n", last_good_offset);
|
|
|
|
last_good_offset = xstate_offsets[i];
|
x86/fpu/xstate: Don't assume the first zero xfeatures zero bit means the end
The current xstate code in setup_xstate_features() assumes that
the first zero bit means the end of xfeatures - but that is not
so, the SDM clearly states that an arbitrary set of xfeatures
might be enabled - and it is also clear from the description
of the compaction feature that holes are possible:
"13-6 Vol. 1MANAGING STATE USING THE XSAVE FEATURE SET
[...]
Compacted format. Each state component i (i ≥ 2) is located at a byte
offset from the base address of the XSAVE area based on the XCOMP_BV
field in the XSAVE header:
— If XCOMP_BV[i] = 0, state component i is not in the XSAVE area.
— If XCOMP_BV[i] = 1, the following items apply:
• If XCOMP_BV[j] = 0 for every j, 2 ≤ j < i, state component i is
located at a byte offset 576 from the base address of the XSAVE
area. (This item applies if i is the first bit set in bits 62:2 of
the XCOMP_BV; it implies that state component i is located at the
beginning of the extended region.)
• Otherwise, let j, 2 ≤ j < i, be the greatest value such that
XCOMP_BV[j] = 1. Then state component i is located at a byte offset
X from the location of state component j, where X is the number of
bytes required for state component j as enumerated in
CPUID.(EAX=0DH,ECX=j):EAX. (This item implies that state component i
immediately follows the preceding state component whose bit is set
in XCOMP_BV.)"
So don't assume that the first zero xfeatures bit means the end of
all xfeatures - iterate through all of them.
I'm not aware of hardware that triggers this currently.
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-05-04 12:37:47 +07:00
|
|
|
}
|
2010-07-20 06:05:48 +07:00
|
|
|
}
|
|
|
|
|
2015-05-04 14:52:42 +07:00
|
|
|
/* Log one line for @xstate_mask if cpu_has_xfeatures() reports it as supported. */
static void __init print_xstate_feature(u64 xstate_mask)
{
	const char *feature_name;

	if (!cpu_has_xfeatures(xstate_mask, &feature_name))
		return;

	pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Print out all the supported xstate features:
|
|
|
|
*/
|
2015-05-04 14:52:42 +07:00
|
|
|
static void __init print_xstate_features(void)
|
2015-04-24 13:48:01 +07:00
|
|
|
{
|
2015-09-03 06:31:26 +07:00
|
|
|
print_xstate_feature(XFEATURE_MASK_FP);
|
|
|
|
print_xstate_feature(XFEATURE_MASK_SSE);
|
|
|
|
print_xstate_feature(XFEATURE_MASK_YMM);
|
|
|
|
print_xstate_feature(XFEATURE_MASK_BNDREGS);
|
|
|
|
print_xstate_feature(XFEATURE_MASK_BNDCSR);
|
|
|
|
print_xstate_feature(XFEATURE_MASK_OPMASK);
|
|
|
|
print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
|
|
|
|
print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
|
2016-02-13 04:02:04 +07:00
|
|
|
print_xstate_feature(XFEATURE_MASK_PKRU);
|
2015-04-24 13:48:01 +07:00
|
|
|
}
|
|
|
|
|
2016-06-18 03:07:15 +07:00
|
|
|
/*
|
|
|
|
* This check is important because it is easy to get XFEATURE_*
* (feature numbers) confused with XFEATURE_MASK_* (feature bit masks).
|
|
|
|
*/
|
|
|
|
/*
 * Warn if @nr lies outside the extended xfeature number range
 * [FIRST_EXTENDED_XFEATURE, XFEATURE_MAX).  The argument is
 * parenthesized so that an expression argument compares as a whole.
 */
#define CHECK_XFEATURE(nr) do {				\
	WARN_ON((nr) < FIRST_EXTENDED_XFEATURE);	\
	WARN_ON((nr) >= XFEATURE_MAX);			\
} while (0)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We could cache this like xstate_size[], but we only use
|
|
|
|
* it here, so it would be a waste of space.
|
|
|
|
*/
|
|
|
|
static int xfeature_is_aligned(int xfeature_nr)
|
|
|
|
{
|
|
|
|
u32 eax, ebx, ecx, edx;
|
|
|
|
|
|
|
|
CHECK_XFEATURE(xfeature_nr);
|
|
|
|
cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
|
|
|
|
/*
|
|
|
|
* The value returned by ECX[1] indicates the alignment
|
|
|
|
* of state component 'i' when the compacted format
|
|
|
|
* of the extended region of an XSAVE area is used:
|
|
|
|
*/
|
|
|
|
return !!(ecx & 2);
|
|
|
|
}
|
|
|
|
|
2014-05-30 01:12:44 +07:00
|
|
|
/*
|
|
|
|
* This function sets up offsets and sizes of all extended states in
|
|
|
|
* xsave area. This supports both standard format and compacted format
|
|
|
|
* of the xsave aread.
|
|
|
|
*/
|
2015-05-04 14:52:42 +07:00
|
|
|
static void __init setup_xstate_comp(void)
|
2014-05-30 01:12:44 +07:00
|
|
|
{
|
2015-04-24 14:20:33 +07:00
|
|
|
unsigned int xstate_comp_sizes[sizeof(xfeatures_mask)*8];
|
2014-05-30 01:12:44 +07:00
|
|
|
int i;
|
|
|
|
|
2014-05-31 04:59:24 +07:00
|
|
|
/*
|
|
|
|
* The FP xstates and SSE xstates are legacy states. They are always
|
|
|
|
* in the fixed offsets in the xsave area in either compacted form
|
|
|
|
* or standard form.
|
|
|
|
*/
|
|
|
|
xstate_comp_offsets[0] = 0;
|
2015-04-30 22:15:32 +07:00
|
|
|
xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space);
|
2014-05-30 01:12:44 +07:00
|
|
|
|
2016-04-05 03:25:03 +07:00
|
|
|
if (!boot_cpu_has(X86_FEATURE_XSAVES)) {
|
2015-09-03 06:31:28 +07:00
|
|
|
for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
|
2015-09-03 06:31:29 +07:00
|
|
|
if (xfeature_enabled(i)) {
|
2014-05-30 01:12:44 +07:00
|
|
|
xstate_comp_offsets[i] = xstate_offsets[i];
|
|
|
|
xstate_comp_sizes[i] = xstate_sizes[i];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2015-09-03 06:31:28 +07:00
|
|
|
xstate_comp_offsets[FIRST_EXTENDED_XFEATURE] =
|
|
|
|
FXSAVE_SIZE + XSAVE_HDR_SIZE;
|
2014-05-30 01:12:44 +07:00
|
|
|
|
2015-09-03 06:31:28 +07:00
|
|
|
for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
|
2015-09-03 06:31:29 +07:00
|
|
|
if (xfeature_enabled(i))
|
2014-05-30 01:12:44 +07:00
|
|
|
xstate_comp_sizes[i] = xstate_sizes[i];
|
|
|
|
else
|
|
|
|
xstate_comp_sizes[i] = 0;
|
|
|
|
|
2016-06-18 03:07:15 +07:00
|
|
|
if (i > FIRST_EXTENDED_XFEATURE) {
|
2014-05-30 01:12:44 +07:00
|
|
|
xstate_comp_offsets[i] = xstate_comp_offsets[i-1]
|
|
|
|
+ xstate_comp_sizes[i-1];
|
|
|
|
|
2016-06-18 03:07:15 +07:00
|
|
|
if (xfeature_is_aligned(i))
|
|
|
|
xstate_comp_offsets[i] =
|
|
|
|
ALIGN(xstate_comp_offsets[i], 64);
|
|
|
|
}
|
2014-05-30 01:12:44 +07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-06-18 03:07:18 +07:00
|
|
|
/*
|
|
|
|
* Print out xstate component offsets and sizes
|
|
|
|
*/
|
|
|
|
static void __init print_xstate_offset_size(void)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
|
|
|
|
if (!xfeature_enabled(i))
|
|
|
|
continue;
|
|
|
|
pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
|
|
|
|
i, xstate_comp_offsets[i], i, xstate_sizes[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-07-30 00:29:19 +07:00
|
|
|
/*
|
|
|
|
* setup the xstate image representing the init state
|
|
|
|
*/
|
2015-05-04 14:52:42 +07:00
|
|
|
static void __init setup_init_fpu_buf(void)
|
2008-07-30 00:29:19 +07:00
|
|
|
{
|
2015-11-13 21:18:31 +07:00
|
|
|
static int on_boot_cpu __initdata = 1;
|
2015-05-05 16:34:49 +07:00
|
|
|
|
|
|
|
WARN_ON_FPU(!on_boot_cpu);
|
|
|
|
on_boot_cpu = 0;
|
|
|
|
|
2016-04-05 03:25:02 +07:00
|
|
|
if (!boot_cpu_has(X86_FEATURE_XSAVE))
|
2012-09-07 04:58:52 +07:00
|
|
|
return;
|
|
|
|
|
|
|
|
setup_xstate_features();
|
2015-04-24 13:48:01 +07:00
|
|
|
print_xstate_features();
|
2010-07-20 06:05:48 +07:00
|
|
|
|
2016-05-21 00:47:07 +07:00
|
|
|
if (boot_cpu_has(X86_FEATURE_XSAVES))
|
2015-04-30 16:07:06 +07:00
|
|
|
init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask;
|
2014-05-30 01:12:42 +07:00
|
|
|
|
2010-07-20 06:05:49 +07:00
|
|
|
/*
|
2016-05-21 00:47:07 +07:00
|
|
|
* Init all the features state with header.xfeatures being 0x0
|
2010-07-20 06:05:49 +07:00
|
|
|
*/
|
2015-05-27 19:04:44 +07:00
|
|
|
copy_kernel_to_xregs_booting(&init_fpstate.xsave);
|
2015-04-22 20:08:34 +07:00
|
|
|
|
2010-07-20 06:05:49 +07:00
|
|
|
/*
|
|
|
|
* Dump the init state again. This is to identify the init state
|
|
|
|
* of any feature which is not represented by all zero's.
|
|
|
|
*/
|
2015-04-30 16:34:09 +07:00
|
|
|
copy_xregs_to_kernel_booting(&init_fpstate.xsave);
|
2008-07-30 00:29:19 +07:00
|
|
|
}
|
|
|
|
|
2015-09-03 06:31:30 +07:00
|
|
|
static int xfeature_uncompacted_offset(int xfeature_nr)
|
|
|
|
{
|
|
|
|
u32 eax, ebx, ecx, edx;
|
|
|
|
|
2016-06-18 03:07:16 +07:00
|
|
|
/*
|
|
|
|
* Only XSAVES supports supervisor states and it uses compacted
|
|
|
|
* format. Checking a supervisor state's uncompacted offset is
|
|
|
|
* an error.
|
|
|
|
*/
|
|
|
|
if (XFEATURE_MASK_SUPERVISOR & (1 << xfeature_nr)) {
|
|
|
|
WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
2015-09-03 06:31:30 +07:00
|
|
|
CHECK_XFEATURE(xfeature_nr);
|
|
|
|
cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
|
|
|
|
return ebx;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int xfeature_size(int xfeature_nr)
|
|
|
|
{
|
|
|
|
u32 eax, ebx, ecx, edx;
|
|
|
|
|
|
|
|
CHECK_XFEATURE(xfeature_nr);
|
|
|
|
cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
|
|
|
|
return eax;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* 'XSAVES' implies two different things:
|
|
|
|
* 1. saving of supervisor/system state
|
|
|
|
* 2. using the compacted format
|
|
|
|
*
|
|
|
|
* Use this function when dealing with the compacted format so
|
|
|
|
* that it is obvious which aspect of 'XSAVES' is being handled
|
|
|
|
* by the calling code.
|
|
|
|
*/
|
2016-05-21 00:47:08 +07:00
|
|
|
int using_compacted_format(void)
|
2015-09-03 06:31:30 +07:00
|
|
|
{
|
2016-04-05 03:25:03 +07:00
|
|
|
return boot_cpu_has(X86_FEATURE_XSAVES);
|
2015-09-03 06:31:30 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static void __xstate_dump_leaves(void)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
u32 eax, ebx, ecx, edx;
|
|
|
|
static int should_dump = 1;
|
|
|
|
|
|
|
|
if (!should_dump)
|
|
|
|
return;
|
|
|
|
should_dump = 0;
|
|
|
|
/*
|
|
|
|
* Dump out a few leaves past the ones that we support
|
|
|
|
* just in case there are some goodies up there
|
|
|
|
*/
|
|
|
|
for (i = 0; i < XFEATURE_MAX + 10; i++) {
|
|
|
|
cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
|
|
|
|
pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
|
|
|
|
XSTATE_CPUID, i, eax, ebx, ecx, edx);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Warn once when @x is true and dump the XSTATE CPUID leaves. */
#define XSTATE_WARN_ON(x) do {							\
	if (WARN_ONCE(x, "XSAVE consistency problem, dumping leaves"))		\
		__xstate_dump_leaves();						\
} while (0)
|
|
|
|
|
2015-09-03 06:31:31 +07:00
|
|
|
/*
 * If @nr equals @nr_macro, warn once (and dump the XSTATE CPUID leaves)
 * when the CPU-reported size @sz differs from sizeof(@__struct).
 * The value arguments are parenthesized so that expression arguments
 * are compared as a whole.
 */
#define XCHECK_SZ(sz, nr, nr_macro, __struct) do {			\
	if (((nr) == (nr_macro)) &&					\
	    WARN_ONCE((sz) != sizeof(__struct),				\
		"%s: struct is %zu bytes, cpu state %d bytes\n",	\
		__stringify(nr_macro), sizeof(__struct), (sz))) {	\
		__xstate_dump_leaves();					\
	}								\
} while (0)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We have a C struct for each 'xstate'. We need to ensure
|
|
|
|
* that our software representation matches what the CPU
|
|
|
|
* tells us about the state's size.
|
|
|
|
*/
|
|
|
|
static void check_xstate_against_struct(int nr)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Ask the CPU for the size of the state.
|
|
|
|
*/
|
|
|
|
int sz = xfeature_size(nr);
|
|
|
|
/*
|
|
|
|
* Match each CPU state with the corresponding software
|
|
|
|
* structure.
|
|
|
|
*/
|
|
|
|
XCHECK_SZ(sz, nr, XFEATURE_YMM, struct ymmh_struct);
|
|
|
|
XCHECK_SZ(sz, nr, XFEATURE_BNDREGS, struct mpx_bndreg_state);
|
|
|
|
XCHECK_SZ(sz, nr, XFEATURE_BNDCSR, struct mpx_bndcsr_state);
|
|
|
|
XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state);
|
|
|
|
XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state);
|
|
|
|
XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state);
|
2016-02-13 04:02:04 +07:00
|
|
|
XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state);
|
2015-09-03 06:31:31 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Make *SURE* to add any feature numbers in below if
|
|
|
|
* there are "holes" in the xsave state component
|
|
|
|
* numbers.
|
|
|
|
*/
|
|
|
|
if ((nr < XFEATURE_YMM) ||
|
2016-02-13 04:01:58 +07:00
|
|
|
(nr >= XFEATURE_MAX) ||
|
|
|
|
(nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR)) {
|
2015-09-03 06:31:31 +07:00
|
|
|
WARN_ONCE(1, "no structure for xstate: %d\n", nr);
|
|
|
|
XSTATE_WARN_ON(1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-09-03 06:31:30 +07:00
|
|
|
/*
|
|
|
|
* This essentially double-checks what the cpu told us about
|
|
|
|
* how large the XSAVE buffer needs to be. We are recalculating
|
|
|
|
* it to be safe.
|
|
|
|
*/
|
|
|
|
static void do_extra_xstate_size_checks(void)
|
|
|
|
{
|
|
|
|
int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
|
|
|
|
if (!xfeature_enabled(i))
|
|
|
|
continue;
|
2015-09-03 06:31:31 +07:00
|
|
|
|
|
|
|
check_xstate_against_struct(i);
|
2015-09-03 06:31:30 +07:00
|
|
|
/*
|
|
|
|
* Supervisor state components can be managed only by
|
|
|
|
* XSAVES, which is compacted-format only.
|
|
|
|
*/
|
|
|
|
if (!using_compacted_format())
|
|
|
|
XSTATE_WARN_ON(xfeature_is_supervisor(i));
|
|
|
|
|
|
|
|
/* Align from the end of the previous feature */
|
|
|
|
if (xfeature_is_aligned(i))
|
|
|
|
paranoid_xstate_size = ALIGN(paranoid_xstate_size, 64);
|
|
|
|
/*
|
|
|
|
* The offset of a given state in the non-compacted
|
|
|
|
* format is given to us in a CPUID leaf. We check
|
|
|
|
* them for being ordered (increasing offsets) in
|
|
|
|
* setup_xstate_features().
|
|
|
|
*/
|
|
|
|
if (!using_compacted_format())
|
|
|
|
paranoid_xstate_size = xfeature_uncompacted_offset(i);
|
|
|
|
/*
|
|
|
|
* The compacted-format offset always depends on where
|
|
|
|
* the previous state ended.
|
|
|
|
*/
|
|
|
|
paranoid_xstate_size += xfeature_size(i);
|
|
|
|
}
|
2016-05-21 00:47:06 +07:00
|
|
|
XSTATE_WARN_ON(paranoid_xstate_size != fpu_kernel_xstate_size);
|
2015-09-03 06:31:30 +07:00
|
|
|
}
|
|
|
|
|
2016-05-21 00:47:05 +07:00
|
|
|
|
2014-05-30 01:12:43 +07:00
|
|
|
/*
|
2016-05-21 00:47:05 +07:00
|
|
|
* Get total size of enabled xstates in XCR0/xfeatures_mask.
|
2015-09-03 06:31:30 +07:00
|
|
|
*
|
|
|
|
* Note the SDM's wording here. "sub-function 0" only enumerates
|
|
|
|
* the size of the *user* states. If we use it to size a buffer
|
|
|
|
* that we use 'XSAVES' on, we could potentially overflow the
|
|
|
|
* buffer because 'XSAVES' saves system states too.
|
|
|
|
*
|
|
|
|
* Note that we do not currently set any bits on IA32_XSS so
|
|
|
|
* 'XCR0 | IA32_XSS == XCR0' for now.
|
2014-05-30 01:12:43 +07:00
|
|
|
*/
|
2016-05-21 00:47:05 +07:00
|
|
|
static unsigned int __init get_xsaves_size(void)
|
2014-05-30 01:12:43 +07:00
|
|
|
{
|
|
|
|
unsigned int eax, ebx, ecx, edx;
|
2016-05-21 00:47:05 +07:00
|
|
|
/*
|
|
|
|
* - CPUID function 0DH, sub-function 1:
|
|
|
|
* EBX enumerates the size (in bytes) required by
|
|
|
|
* the XSAVES instruction for an XSAVE area
|
|
|
|
* containing all the state components
|
|
|
|
* corresponding to bits currently set in
|
|
|
|
* XCR0 | IA32_XSS.
|
|
|
|
*/
|
|
|
|
cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
|
|
|
|
return ebx;
|
|
|
|
}
|
2014-05-30 01:12:43 +07:00
|
|
|
|
2016-05-21 00:47:05 +07:00
|
|
|
/*
 * Size of the standard-format XSAVE area for the *user* state components
 * whose bits are currently set in XCR0.
 *
 * CPUID leaf 0DH, sub-function 0 reports that size (in bytes) in EBX.
 */
static unsigned int __init get_xsave_size(void)
{
	unsigned int unused_eax, xsave_bytes, unused_ecx, unused_edx;

	/* Only EBX (the size) is of interest; the other outputs are ignored. */
	cpuid_count(XSTATE_CPUID, 0,
		    &unused_eax, &xsave_bytes, &unused_ecx, &unused_edx);

	return xsave_bytes;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Will the runtime-enumerated 'xstate_size' fit in the init
|
|
|
|
* task's statically-allocated buffer?
|
|
|
|
*/
|
|
|
|
static bool is_supported_xstate_size(unsigned int test_xstate_size)
|
|
|
|
{
|
|
|
|
if (test_xstate_size <= sizeof(union fpregs_state))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n",
|
|
|
|
sizeof(union fpregs_state), test_xstate_size);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int init_xstate_size(void)
|
|
|
|
{
|
|
|
|
/* Recompute the context size for enabled features: */
|
2016-05-21 00:47:05 +07:00
|
|
|
unsigned int possible_xstate_size;
|
|
|
|
unsigned int xsave_size;
|
|
|
|
|
|
|
|
xsave_size = get_xsave_size();
|
|
|
|
|
|
|
|
if (boot_cpu_has(X86_FEATURE_XSAVES))
|
|
|
|
possible_xstate_size = get_xsaves_size();
|
|
|
|
else
|
|
|
|
possible_xstate_size = xsave_size;
|
2015-09-03 06:31:25 +07:00
|
|
|
|
|
|
|
/* Ensure we have the space to store all enabled: */
|
|
|
|
if (!is_supported_xstate_size(possible_xstate_size))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The size is OK, we are definitely going to use xsave,
|
|
|
|
* make it known to the world that we need more space.
|
|
|
|
*/
|
2016-05-21 00:47:06 +07:00
|
|
|
fpu_kernel_xstate_size = possible_xstate_size;
|
2015-09-03 06:31:30 +07:00
|
|
|
do_extra_xstate_size_checks();
|
2016-05-21 00:47:05 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* User space is always in standard format.
|
|
|
|
*/
|
|
|
|
fpu_user_xstate_size = xsave_size;
|
2015-09-03 06:31:25 +07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-09-03 06:31:26 +07:00
|
|
|
/*
|
|
|
|
* We enabled the XSAVE hardware, but something went wrong and
|
|
|
|
* we can not use it. Disable it.
|
|
|
|
*/
|
|
|
|
static void fpu__init_disable_system_xstate(void)
|
2015-09-03 06:31:25 +07:00
|
|
|
{
|
|
|
|
xfeatures_mask = 0;
|
|
|
|
cr4_clear_bits(X86_CR4_OSXSAVE);
|
|
|
|
fpu__xstate_clear_all_cpu_caps();
|
2014-05-30 01:12:43 +07:00
|
|
|
}
|
|
|
|
|
2008-07-30 00:29:19 +07:00
|
|
|
/*
|
|
|
|
* Enable and initialize the xsave feature.
|
2015-04-25 11:26:36 +07:00
|
|
|
* Called once per system bootup.
|
2008-07-30 00:29:19 +07:00
|
|
|
*/
|
2015-05-04 14:52:42 +07:00
|
|
|
void __init fpu__init_system_xstate(void)
|
2008-07-30 00:29:19 +07:00
|
|
|
{
|
|
|
|
unsigned int eax, ebx, ecx, edx;
|
2015-11-13 21:18:31 +07:00
|
|
|
static int on_boot_cpu __initdata = 1;
|
2015-09-03 06:31:25 +07:00
|
|
|
int err;
|
2015-05-05 16:34:49 +07:00
|
|
|
|
|
|
|
WARN_ON_FPU(!on_boot_cpu);
|
|
|
|
on_boot_cpu = 0;
|
2008-07-30 00:29:19 +07:00
|
|
|
|
2016-04-05 03:25:02 +07:00
|
|
|
if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
|
2015-04-25 11:47:24 +07:00
|
|
|
pr_info("x86/fpu: Legacy x87 FPU detected.\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2010-07-22 00:03:54 +07:00
|
|
|
if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
|
2015-05-05 16:34:49 +07:00
|
|
|
WARN_ON_FPU(1);
|
2010-07-22 00:03:54 +07:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
|
2015-04-24 14:20:33 +07:00
|
|
|
xfeatures_mask = eax + ((u64)edx << 32);
|
2008-07-30 00:29:19 +07:00
|
|
|
|
2015-09-03 06:31:26 +07:00
|
|
|
if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
|
2016-07-21 02:45:51 +07:00
|
|
|
/*
|
|
|
|
* This indicates that something really unexpected happened
|
|
|
|
* with the enumeration. Disable XSAVE and try to continue
|
|
|
|
* booting without it. This is too early to BUG().
|
|
|
|
*/
|
2015-04-24 14:20:33 +07:00
|
|
|
pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask);
|
2016-07-21 02:45:51 +07:00
|
|
|
goto out_disable;
|
2008-07-30 00:29:19 +07:00
|
|
|
}
|
|
|
|
|
2016-01-07 05:24:53 +07:00
|
|
|
xfeatures_mask &= fpu__get_supported_xfeatures_mask();
|
2010-07-22 00:03:53 +07:00
|
|
|
|
2015-04-25 11:26:36 +07:00
|
|
|
/* Enable xstate instructions to be able to continue with initialization: */
|
|
|
|
fpu__init_cpu_xstate();
|
2015-09-03 06:31:25 +07:00
|
|
|
err = init_xstate_size();
|
2016-07-21 02:45:51 +07:00
|
|
|
if (err)
|
|
|
|
goto out_disable;
|
2008-07-30 00:29:19 +07:00
|
|
|
|
2016-06-18 03:07:17 +07:00
|
|
|
/*
|
|
|
|
* Update info used for ptrace frames; use standard-format size and no
|
|
|
|
* supervisor xstates:
|
|
|
|
*/
|
|
|
|
update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR);
|
|
|
|
|
2015-04-30 17:45:38 +07:00
|
|
|
fpu__init_prepare_fx_sw_frame();
|
2012-09-07 04:58:52 +07:00
|
|
|
setup_init_fpu_buf();
|
2015-05-04 14:43:55 +07:00
|
|
|
setup_xstate_comp();
|
2016-06-18 03:07:18 +07:00
|
|
|
print_xstate_offset_size();
|
2008-07-30 00:29:19 +07:00
|
|
|
|
2015-09-03 06:31:24 +07:00
|
|
|
pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
|
2015-04-24 14:20:33 +07:00
|
|
|
xfeatures_mask,
|
2016-05-21 00:47:06 +07:00
|
|
|
fpu_kernel_xstate_size,
|
2016-04-05 03:25:03 +07:00
|
|
|
boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard");
|
2016-07-21 02:45:51 +07:00
|
|
|
return;
|
|
|
|
|
|
|
|
out_disable:
|
|
|
|
/* something went wrong, try to boot without any XSAVE support */
|
|
|
|
fpu__init_disable_system_xstate();
|
2008-07-30 00:29:19 +07:00
|
|
|
}
|
2010-07-21 01:50:51 +07:00
|
|
|
|
2015-04-24 15:02:32 +07:00
|
|
|
/*
|
|
|
|
* Restore minimal FPU state after suspend:
|
|
|
|
*/
|
|
|
|
void fpu__resume_cpu(void)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Restore XCR0 on xsave capable CPUs:
|
|
|
|
*/
|
2016-04-05 03:25:02 +07:00
|
|
|
if (boot_cpu_has(X86_FEATURE_XSAVE))
|
2015-04-24 15:02:32 +07:00
|
|
|
xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
|
|
|
|
}
|
|
|
|
|
2016-02-13 04:02:35 +07:00
|
|
|
/*
|
|
|
|
* Given an xstate feature mask, calculate where in the xsave
|
|
|
|
* buffer the state is. Callers should ensure that the buffer
|
|
|
|
* is valid.
|
|
|
|
*
|
|
|
|
* Note: does not work for compacted buffers.
|
|
|
|
*/
|
|
|
|
void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask)
|
|
|
|
{
|
|
|
|
int feature_nr = fls64(xstate_feature_mask) - 1;
|
|
|
|
|
2016-07-11 23:18:55 +07:00
|
|
|
if (!xfeature_enabled(feature_nr)) {
|
|
|
|
WARN_ON_FPU(1);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2016-02-13 04:02:35 +07:00
|
|
|
return (void *)xsave + xstate_comp_offsets[feature_nr];
|
|
|
|
}
|
2014-05-30 01:12:44 +07:00
|
|
|
/*
|
|
|
|
* Given the xsave area and a state inside, this function returns the
|
|
|
|
* address of the state.
|
|
|
|
*
|
|
|
|
* This is the API that is called to get xstate address in either
|
|
|
|
* standard format or compacted format of xsave area.
|
|
|
|
*
|
2015-06-08 01:37:00 +07:00
|
|
|
* Note that if there is no data for the field in the xsave buffer
|
|
|
|
* this will return NULL.
|
|
|
|
*
|
2014-05-30 01:12:44 +07:00
|
|
|
* Inputs:
|
2015-06-08 01:37:00 +07:00
|
|
|
* xstate: the thread's storage area for all FPU data
|
|
|
|
* xstate_feature: state which is defined in xsave.h (e.g.
|
2015-09-03 06:31:26 +07:00
|
|
|
* XFEATURE_MASK_FP, XFEATURE_MASK_SSE, etc...)
|
2014-05-30 01:12:44 +07:00
|
|
|
* Output:
|
2015-06-08 01:37:00 +07:00
|
|
|
* address of the state in the xsave area, or NULL if the
|
|
|
|
* field is not present in the xsave buffer.
|
2014-05-30 01:12:44 +07:00
|
|
|
*/
|
2015-06-08 01:37:00 +07:00
|
|
|
void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
|
2014-05-30 01:12:44 +07:00
|
|
|
{
|
2015-06-08 01:37:00 +07:00
|
|
|
/*
|
|
|
|
* Do we even *have* xsave state?
|
|
|
|
*/
|
|
|
|
if (!boot_cpu_has(X86_FEATURE_XSAVE))
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We should not ever be requesting features that we
|
|
|
|
* have not enabled. Remember that pcntxt_mask is
|
|
|
|
* what we write to the XCR0 register.
|
|
|
|
*/
|
|
|
|
WARN_ONCE(!(xfeatures_mask & xstate_feature),
|
|
|
|
"get of unsupported state");
|
|
|
|
/*
|
|
|
|
* This assumes the last 'xsave*' instruction to
|
|
|
|
* have requested that 'xstate_feature' be saved.
|
|
|
|
* If it did not, we might be seeing and old value
|
|
|
|
* of the field in the buffer.
|
|
|
|
*
|
|
|
|
* This can happen because the last 'xsave' did not
|
|
|
|
* request that this feature be saved (unlikely)
|
|
|
|
* or because the "init optimization" caused it
|
|
|
|
* to not be saved.
|
|
|
|
*/
|
|
|
|
if (!(xsave->header.xfeatures & xstate_feature))
|
2014-05-30 01:12:44 +07:00
|
|
|
return NULL;
|
|
|
|
|
2016-02-13 04:02:35 +07:00
|
|
|
return __raw_xsave_addr(xsave, xstate_feature);
|
2014-05-30 01:12:44 +07:00
|
|
|
}
|
2014-11-24 16:57:42 +07:00
|
|
|
EXPORT_SYMBOL_GPL(get_xsave_addr);
|
x86/fpu/xstate: Wrap get_xsave_addr() to make it safer
The MPX code appears to be calling a low-level FPU function
(copy_fpregs_to_fpstate()). This function is not able to
be called in all contexts, although it is safe to call
directly in some cases.
Although probably correct, the current code is ugly and
potentially error-prone. So, add a wrapper that calls
the (slightly) higher-level fpu__save() (which is preempt-
safe) and also ensures that we even *have* an FPU context
(in the case that this was called when in lazy FPU mode).
Ingo had this to say about the details about when we need
preemption disabled:
> it's indeed generally unsafe to access/copy FPU registers with preemption enabled,
> for two reasons:
>
> - on older systems that use FSAVE the instruction destroys FPU register
> contents, which has to be handled carefully
>
> - even on newer systems if we copy to FPU registers (which this code doesn't)
> then we don't want a context switch to occur in the middle of it, because a
> context switch will write to the fpstate, potentially overwriting our new data
> with old FPU state.
>
> But it's safe to access FPU registers with preemption enabled in a couple of
> special cases:
>
> - potentially destructively saving FPU registers: the signal handling code does
> this in copy_fpstate_to_sigframe(), because it can rely on the signal restore
> side to restore the original FPU state.
>
> - reading FPU registers on modern systems: we don't do this anywhere at the
> moment, mostly to keep symmetry with older systems where FSAVE is
> destructive.
>
> - initializing FPU registers on modern systems: fpu__clear() does this. Here
> it's safe because we don't copy from the fpstate.
>
> - directly writing FPU registers from user-space memory (!). We do this in
> fpu__restore_sig(), and it's safe because neither context switches nor
> irq-handler FPU use can corrupt the source context of the copy (which is
> user-space memory).
>
> Note that the MPX code's current use of copy_fpregs_to_fpstate() was safe I think,
> because:
>
> - MPX is predicated on eagerfpu, so the destructive F[N]SAVE instruction won't be
> used.
>
> - the code was only reading FPU registers, and was doing it only in places that
> guaranteed that an FPU state was already active (i.e. didn't do it in
> kthreads)
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave@sr71.net>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suresh Siddha <sbsiddha@gmail.com>
Cc: bp@alien8.de
Link: http://lkml.kernel.org/r/20150607183700.AA881696@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-06-08 01:37:00 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* This wraps up the common operations that need to occur when retrieving
|
|
|
|
* data from xsave state. It first ensures that the current task was
|
|
|
|
* using the FPU and retrieves the data in to a buffer. It then calculates
|
|
|
|
* the offset of the requested field in the buffer.
|
|
|
|
*
|
|
|
|
* This function is safe to call whether the FPU is in use or not.
|
|
|
|
*
|
|
|
|
* Note that this only works on the current task.
|
|
|
|
*
|
|
|
|
* Inputs:
|
2015-09-03 06:31:26 +07:00
|
|
|
* @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP,
|
|
|
|
* XFEATURE_MASK_SSE, etc...)
|
x86/fpu/xstate: Wrap get_xsave_addr() to make it safer
The MPX code appears is calling a low-level FPU function
(copy_fpregs_to_fpstate()). This function is not able to
be called in all contexts, although it is safe to call
directly in some cases.
Although probably correct, the current code is ugly and
potentially error-prone. So, add a wrapper that calls
the (slightly) higher-level fpu__save() (which is preempt-
safe) and also ensures that we even *have* an FPU context
(in the case that this was called when in lazy FPU mode).
Ingo had this to say about the details about when we need
preemption disabled:
> it's indeed generally unsafe to access/copy FPU registers with preemption enabled,
> for two reasons:
>
> - on older systems that use FSAVE the instruction destroys FPU register
> contents, which has to be handled carefully
>
> - even on newer systems if we copy to FPU registers (which this code doesn't)
> then we don't want a context switch to occur in the middle of it, because a
> context switch will write to the fpstate, potentially overwriting our new data
> with old FPU state.
>
> But it's safe to access FPU registers with preemption enabled in a couple of
> special cases:
>
> - potentially destructively saving FPU registers: the signal handling code does
> this in copy_fpstate_to_sigframe(), because it can rely on the signal restore
> side to restore the original FPU state.
>
> - reading FPU registers on modern systems: we don't do this anywhere at the
> moment, mostly to keep symmetry with older systems where FSAVE is
> destructive.
>
> - initializing FPU registers on modern systems: fpu__clear() does this. Here
> it's safe because we don't copy from the fpstate.
>
> - directly writing FPU registers from user-space memory (!). We do this in
> fpu__restore_sig(), and it's safe because neither context switches nor
> irq-handler FPU use can corrupt the source context of the copy (which is
> user-space memory).
>
> Note that the MPX code's current use of copy_fpregs_to_fpstate() was safe I think,
> because:
>
> - MPX is predicated on eagerfpu, so the destructive F[N]SAVE instruction won't be
> used.
>
> - the code was only reading FPU registers, and was doing it only in places that
> guaranteed that an FPU state was already active (i.e. didn't do it in
> kthreads)
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave@sr71.net>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suresh Siddha <sbsiddha@gmail.com>
Cc: bp@alien8.de
Link: http://lkml.kernel.org/r/20150607183700.AA881696@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-06-08 01:37:00 +07:00
|
|
|
* Output:
|
|
|
|
* address of the state in the xsave area or NULL if the state
|
|
|
|
* is not present or is in its 'init state'.
|
|
|
|
*/
|
|
|
|
const void *get_xsave_field_ptr(int xsave_state)
|
|
|
|
{
|
|
|
|
struct fpu *fpu = ¤t->thread.fpu;
|
|
|
|
|
|
|
|
if (!fpu->fpstate_active)
|
|
|
|
return NULL;
|
|
|
|
/*
|
|
|
|
* fpu__save() takes the CPU's xstate registers
|
|
|
|
* and saves them off to the 'fpu memory buffer.
|
|
|
|
*/
|
|
|
|
fpu__save(fpu);
|
|
|
|
|
|
|
|
return get_xsave_addr(&fpu->state.xsave, xsave_state);
|
|
|
|
}
|
2016-02-13 04:02:35 +07:00
|
|
|
|
x86/pkeys: Allocation/free syscalls
This patch adds two new system calls:
int pkey_alloc(unsigned long flags, unsigned long init_access_rights)
int pkey_free(int pkey);
These implement an "allocator" for the protection keys
themselves, which can be thought of as analogous to the allocator
that the kernel has for file descriptors. The kernel tracks
which numbers are in use, and only allows operations on keys that
are valid. A key which was not obtained by pkey_alloc() may not,
for instance, be passed to pkey_mprotect().
These system calls are also very important given the kernel's use
of pkeys to implement execute-only support. These help ensure
that userspace can never assume that it has control of a key
unless it first asks the kernel. The kernel does not promise to
preserve PKRU (rights register) contents except for allocated
pkeys.
The 'init_access_rights' argument to pkey_alloc() specifies the
rights that will be established for the returned pkey. For
instance:
pkey = pkey_alloc(flags, PKEY_DENY_WRITE);
will allocate 'pkey', but also sets the bits in PKRU[1] such that
writing to 'pkey' is already denied.
The kernel does not prevent pkey_free() from successfully freeing
in-use pkeys (those still assigned to a memory range by
pkey_mprotect()). It would be expensive to implement the checks
for this, so we instead say, "Just don't do it" since sane
software will never do it anyway.
Any piece of userspace calling pkey_alloc() needs to be prepared
for it to fail. Why? pkey_alloc() returns the same error code
(ENOSPC) when there are no pkeys and when pkeys are unsupported.
They can be unsupported for a whole host of reasons, so apps must
be prepared for this. Also, libraries or LD_PRELOADs might steal
keys before an application gets access to them.
This allocation mechanism could be implemented in userspace.
Even if we did it in userspace, we would still need additional
user/kernel interfaces to tell userspace which keys are being
used by the kernel internally (such as for execute-only
mappings). Having the kernel provide this facility completely
removes the need for these additional interfaces, or having an
implementation of this in userspace at all.
Note that we have to make changes to all of the architectures
that do not use mman-common.h because we use the new
PKEY_DENY_ACCESS/WRITE macros in arch-independent code.
1. PKRU is the Protection Key Rights User register. It is a
usermode-accessible register that controls whether writes
and/or access to each individual pkey is allowed or denied.
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: linux-arch@vger.kernel.org
Cc: Dave Hansen <dave@sr71.net>
Cc: arnd@arndb.de
Cc: linux-api@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: luto@kernel.org
Cc: akpm@linux-foundation.org
Cc: torvalds@linux-foundation.org
Link: http://lkml.kernel.org/r/20160729163015.444FE75F@viggo.jf.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2016-07-29 23:30:15 +07:00
|
|
|
#ifdef CONFIG_ARCH_HAS_PKEYS
|
|
|
|
|
2016-02-13 04:02:36 +07:00
|
|
|
/* Number of PKRU bits covered by the configured protection keys: two bits per key. */
#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2)
|
|
|
|
/*
 * NOTE(review): this is (bit count - 1), which is not a mask covering
 * NR_VALID_PKRU_BITS bits.  No user is visible in this part of the
 * file -- confirm the intended meaning before relying on it.
 */
#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1)
|
|
|
|
/*
|
x86/mm/pkeys: Fix compact mode by removing protection keys' XSAVE buffer manipulation
The Memory Protection Keys "rights register" (PKRU) is
XSAVE-managed, and is saved/restored along with the FPU state.
When kernel code accesses FPU regsisters, it does a delicate
dance with preempt. Otherwise, the context switching code can
get confused as to whether the most up-to-date state is in the
registers themselves or in the XSAVE buffer.
But, PKRU is not a normal FPU register. Using it does not
generate the normal device-not-available (#NM) exceptions which
means we can not manage it lazily, and the kernel completley
disallows using lazy mode when it is enabled.
The dance with preempt *only* occurs when managing the FPU
lazily. Since we never manage PKRU lazily, we do not have to do
the dance with preempt; we can access it directly. Doing it
this way saves a ton of complicated code (and is faster too).
Further, the XSAVES reenabling failed to patch a bit of code
in fpu__xfeature_set_state() the checked for compacted buffers.
That check caused fpu__xfeature_set_state() to silently refuse to
work when the kernel is using compacted XSAVE buffers. This
broke execute-only and future pkey_mprotect() support when using
compact XSAVE buffers.
But, removing fpu__xfeature_set_state() gets rid of this issue,
in addition to the nice cleanup and speedup.
This fixes the same thing as a fix that Sai posted:
https://lkml.org/lkml/2016/7/25/637
The fix that he posted is a much more obviously correct, but I
think we should just do this instead.
Reported-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave@sr71.net>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Ravi Shankar <ravi.v.shankar@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-Cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20160727232040.7D060DAD@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-07-28 06:20:40 +07:00
|
|
|
* This will go out and modify PKRU register to set the access
|
|
|
|
* rights for @pkey to @init_val.
|
2016-02-13 04:02:36 +07:00
|
|
|
*/
|
|
|
|
int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
|
|
|
|
unsigned long init_val)
|
|
|
|
{
|
x86/mm/pkeys: Fix compact mode by removing protection keys' XSAVE buffer manipulation
The Memory Protection Keys "rights register" (PKRU) is
XSAVE-managed, and is saved/restored along with the FPU state.
When kernel code accesses FPU regsisters, it does a delicate
dance with preempt. Otherwise, the context switching code can
get confused as to whether the most up-to-date state is in the
registers themselves or in the XSAVE buffer.
But, PKRU is not a normal FPU register. Using it does not
generate the normal device-not-available (#NM) exceptions which
means we can not manage it lazily, and the kernel completley
disallows using lazy mode when it is enabled.
The dance with preempt *only* occurs when managing the FPU
lazily. Since we never manage PKRU lazily, we do not have to do
the dance with preempt; we can access it directly. Doing it
this way saves a ton of complicated code (and is faster too).
Further, the XSAVES reenabling failed to patch a bit of code
in fpu__xfeature_set_state() the checked for compacted buffers.
That check caused fpu__xfeature_set_state() to silently refuse to
work when the kernel is using compacted XSAVE buffers. This
broke execute-only and future pkey_mprotect() support when using
compact XSAVE buffers.
But, removing fpu__xfeature_set_state() gets rid of this issue,
in addition to the nice cleanup and speedup.
This fixes the same thing as a fix that Sai posted:
https://lkml.org/lkml/2016/7/25/637
The fix that he posted is a much more obviously correct, but I
think we should just do this instead.
Reported-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave@sr71.net>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Ravi Shankar <ravi.v.shankar@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-Cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20160727232040.7D060DAD@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-07-28 06:20:40 +07:00
|
|
|
u32 old_pkru;
|
2016-02-13 04:02:36 +07:00
|
|
|
int pkey_shift = (pkey * PKRU_BITS_PER_PKEY);
|
|
|
|
u32 new_pkru_bits = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This check implies XSAVE support. OSPKE only gets
|
|
|
|
* set if we enable XSAVE and we enable PKU in XCR0.
|
|
|
|
*/
|
|
|
|
if (!boot_cpu_has(X86_FEATURE_OSPKE))
|
|
|
|
return -EINVAL;
|
x86/mm/pkeys: Fix compact mode by removing protection keys' XSAVE buffer manipulation
The Memory Protection Keys "rights register" (PKRU) is
XSAVE-managed, and is saved/restored along with the FPU state.
When kernel code accesses FPU regsisters, it does a delicate
dance with preempt. Otherwise, the context switching code can
get confused as to whether the most up-to-date state is in the
registers themselves or in the XSAVE buffer.
But, PKRU is not a normal FPU register. Using it does not
generate the normal device-not-available (#NM) exceptions which
means we can not manage it lazily, and the kernel completley
disallows using lazy mode when it is enabled.
The dance with preempt *only* occurs when managing the FPU
lazily. Since we never manage PKRU lazily, we do not have to do
the dance with preempt; we can access it directly. Doing it
this way saves a ton of complicated code (and is faster too).
Further, the XSAVES reenabling failed to patch a bit of code
in fpu__xfeature_set_state() the checked for compacted buffers.
That check caused fpu__xfeature_set_state() to silently refuse to
work when the kernel is using compacted XSAVE buffers. This
broke execute-only and future pkey_mprotect() support when using
compact XSAVE buffers.
But, removing fpu__xfeature_set_state() gets rid of this issue,
in addition to the nice cleanup and speedup.
This fixes the same thing as a fix that Sai posted:
https://lkml.org/lkml/2016/7/25/637
The fix that he posted is a much more obviously correct, but I
think we should just do this instead.
Reported-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave@sr71.net>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Ravi Shankar <ravi.v.shankar@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-Cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20160727232040.7D060DAD@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-07-28 06:20:40 +07:00
|
|
|
/*
|
|
|
|
* For most XSAVE components, this would be an arduous task:
|
|
|
|
* brining fpstate up to date with fpregs, updating fpstate,
|
|
|
|
* then re-populating fpregs. But, for components that are
|
|
|
|
* never lazily managed, we can just access the fpregs
|
|
|
|
* directly. PKRU is never managed lazily, so we can just
|
|
|
|
* manipulate it directly. Make sure it stays that way.
|
|
|
|
*/
|
|
|
|
WARN_ON_ONCE(!use_eager_fpu());
|
2016-02-13 04:02:36 +07:00
|
|
|
|
2016-06-18 03:07:17 +07:00
|
|
|
/* Set the bits we need in PKRU: */
|
2016-02-13 04:02:36 +07:00
|
|
|
if (init_val & PKEY_DISABLE_ACCESS)
|
|
|
|
new_pkru_bits |= PKRU_AD_BIT;
|
|
|
|
if (init_val & PKEY_DISABLE_WRITE)
|
|
|
|
new_pkru_bits |= PKRU_WD_BIT;
|
|
|
|
|
2016-06-18 03:07:17 +07:00
|
|
|
/* Shift the bits in to the correct place in PKRU for pkey: */
|
2016-02-13 04:02:36 +07:00
|
|
|
new_pkru_bits <<= pkey_shift;
|
|
|
|
|
x86/mm/pkeys: Fix compact mode by removing protection keys' XSAVE buffer manipulation
The Memory Protection Keys "rights register" (PKRU) is
XSAVE-managed, and is saved/restored along with the FPU state.
When kernel code accesses FPU regsisters, it does a delicate
dance with preempt. Otherwise, the context switching code can
get confused as to whether the most up-to-date state is in the
registers themselves or in the XSAVE buffer.
But, PKRU is not a normal FPU register. Using it does not
generate the normal device-not-available (#NM) exceptions which
means we can not manage it lazily, and the kernel completley
disallows using lazy mode when it is enabled.
The dance with preempt *only* occurs when managing the FPU
lazily. Since we never manage PKRU lazily, we do not have to do
the dance with preempt; we can access it directly. Doing it
this way saves a ton of complicated code (and is faster too).
Further, the XSAVES reenabling failed to patch a bit of code
in fpu__xfeature_set_state() the checked for compacted buffers.
That check caused fpu__xfeature_set_state() to silently refuse to
work when the kernel is using compacted XSAVE buffers. This
broke execute-only and future pkey_mprotect() support when using
compact XSAVE buffers.
But, removing fpu__xfeature_set_state() gets rid of this issue,
in addition to the nice cleanup and speedup.
This fixes the same thing as a fix that Sai posted:
https://lkml.org/lkml/2016/7/25/637
The fix that he posted is a much more obviously correct, but I
think we should just do this instead.
Reported-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave@sr71.net>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Ravi Shankar <ravi.v.shankar@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-Cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20160727232040.7D060DAD@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-07-28 06:20:40 +07:00
|
|
|
/* Get old PKRU and mask off any old bits in place: */
|
|
|
|
old_pkru = read_pkru();
|
|
|
|
old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
|
2016-02-13 04:02:36 +07:00
|
|
|
|
x86/mm/pkeys: Fix compact mode by removing protection keys' XSAVE buffer manipulation
The Memory Protection Keys "rights register" (PKRU) is
XSAVE-managed, and is saved/restored along with the FPU state.
When kernel code accesses FPU regsisters, it does a delicate
dance with preempt. Otherwise, the context switching code can
get confused as to whether the most up-to-date state is in the
registers themselves or in the XSAVE buffer.
But, PKRU is not a normal FPU register. Using it does not
generate the normal device-not-available (#NM) exceptions which
means we can not manage it lazily, and the kernel completley
disallows using lazy mode when it is enabled.
The dance with preempt *only* occurs when managing the FPU
lazily. Since we never manage PKRU lazily, we do not have to do
the dance with preempt; we can access it directly. Doing it
this way saves a ton of complicated code (and is faster too).
Further, the XSAVES reenabling failed to patch a bit of code
in fpu__xfeature_set_state() the checked for compacted buffers.
That check caused fpu__xfeature_set_state() to silently refuse to
work when the kernel is using compacted XSAVE buffers. This
broke execute-only and future pkey_mprotect() support when using
compact XSAVE buffers.
But, removing fpu__xfeature_set_state() gets rid of this issue,
in addition to the nice cleanup and speedup.
This fixes the same thing as a fix that Sai posted:
https://lkml.org/lkml/2016/7/25/637
The fix that he posted is a much more obviously correct, but I
think we should just do this instead.
Reported-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave@sr71.net>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Ravi Shankar <ravi.v.shankar@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-Cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20160727232040.7D060DAD@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-07-28 06:20:40 +07:00
|
|
|
/* Write old part along with new part: */
|
|
|
|
write_pkru(old_pkru | new_pkru_bits);
|
2016-06-18 03:07:17 +07:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
x86/pkeys: Allocation/free syscalls
This patch adds two new system calls:
int pkey_alloc(unsigned long flags, unsigned long init_access_rights)
int pkey_free(int pkey);
These implement an "allocator" for the protection keys
themselves, which can be thought of as analogous to the allocator
that the kernel has for file descriptors. The kernel tracks
which numbers are in use, and only allows operations on keys that
are valid. A key which was not obtained by pkey_alloc() may not,
for instance, be passed to pkey_mprotect().
These system calls are also very important given the kernel's use
of pkeys to implement execute-only support. These help ensure
that userspace can never assume that it has control of a key
unless it first asks the kernel. The kernel does not promise to
preserve PKRU (right register) contents except for allocated
pkeys.
The 'init_access_rights' argument to pkey_alloc() specifies the
rights that will be established for the returned pkey. For
instance:
pkey = pkey_alloc(flags, PKEY_DENY_WRITE);
will allocate 'pkey', but also sets the bits in PKRU[1] such that
writing to 'pkey' is already denied.
The kernel does not prevent pkey_free() from successfully freeing
in-use pkeys (those still assigned to a memory range by
pkey_mprotect()). It would be expensive to implement the checks
for this, so we instead say, "Just don't do it" since sane
software will never do it anyway.
Any piece of userspace calling pkey_alloc() needs to be prepared
for it to fail. Why? pkey_alloc() returns the same error code
(ENOSPC) when there are no pkeys and when pkeys are unsupported.
They can be unsupported for a whole host of reasons, so apps must
be prepared for this. Also, libraries or LD_PRELOADs might steal
keys before an application gets access to them.
This allocation mechanism could be implemented in userspace.
Even if we did it in userspace, we would still need additional
user/kernel interfaces to tell userspace which keys are being
used by the kernel internally (such as for execute-only
mappings). Having the kernel provide this facility completely
removes the need for these additional interfaces, or having an
implementation of this in userspace at all.
Note that we have to make changes to all of the architectures
that do not use mman-common.h because we use the new
PKEY_DENY_ACCESS/WRITE macros in arch-independent code.
1. PKRU is the Protection Key Rights User register. It is a
usermode-accessible register that controls whether writes
and/or access to each individual pkey is allowed or denied.
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: linux-arch@vger.kernel.org
Cc: Dave Hansen <dave@sr71.net>
Cc: arnd@arndb.de
Cc: linux-api@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: luto@kernel.org
Cc: akpm@linux-foundation.org
Cc: torvalds@linux-foundation.org
Link: http://lkml.kernel.org/r/20160729163015.444FE75F@viggo.jf.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2016-07-29 23:30:15 +07:00
|
|
|
#endif /* ! CONFIG_ARCH_HAS_PKEYS */
|
2016-06-18 03:07:17 +07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* This is similar to user_regset_copyout(), but will not add offset to
|
|
|
|
* the source data pointer or increment pos, count, kbuf, and ubuf.
|
|
|
|
*/
|
|
|
|
static inline int xstate_copyout(unsigned int pos, unsigned int count,
|
|
|
|
void *kbuf, void __user *ubuf,
|
|
|
|
const void *data, const int start_pos,
|
|
|
|
const int end_pos)
|
|
|
|
{
|
|
|
|
if ((count == 0) || (pos < start_pos))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (end_pos < 0 || pos < end_pos) {
|
|
|
|
unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos));
|
|
|
|
|
|
|
|
if (kbuf) {
|
|
|
|
memcpy(kbuf + pos, data, copy);
|
|
|
|
} else {
|
|
|
|
if (__copy_to_user(ubuf + pos, data, copy))
|
|
|
|
return -EFAULT;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Convert from kernel XSAVES compacted format to standard format and copy
|
|
|
|
* to a ptrace buffer. It supports partial copy but pos always starts from
|
|
|
|
* zero. This is called from xstateregs_get() and there we check the CPU
|
|
|
|
* has XSAVES.
|
|
|
|
*/
|
|
|
|
int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf,
|
|
|
|
void __user *ubuf, struct xregs_state *xsave)
|
|
|
|
{
|
|
|
|
unsigned int offset, size;
|
|
|
|
int ret, i;
|
|
|
|
struct xstate_header header;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Currently copy_regset_to_user() starts from pos 0:
|
|
|
|
*/
|
|
|
|
if (unlikely(pos != 0))
|
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The destination is a ptrace buffer; we put in only user xstates:
|
|
|
|
*/
|
|
|
|
memset(&header, 0, sizeof(header));
|
|
|
|
header.xfeatures = xsave->header.xfeatures;
|
|
|
|
header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Copy xregs_state->header:
|
|
|
|
*/
|
|
|
|
offset = offsetof(struct xregs_state, header);
|
|
|
|
size = sizeof(header);
|
|
|
|
|
|
|
|
ret = xstate_copyout(offset, size, kbuf, ubuf, &header, 0, count);
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
for (i = 0; i < XFEATURE_MAX; i++) {
|
|
|
|
/*
|
|
|
|
* Copy only in-use xstates:
|
|
|
|
*/
|
|
|
|
if ((header.xfeatures >> i) & 1) {
|
|
|
|
void *src = __raw_xsave_addr(xsave, 1 << i);
|
|
|
|
|
|
|
|
offset = xstate_offsets[i];
|
|
|
|
size = xstate_sizes[i];
|
|
|
|
|
|
|
|
ret = xstate_copyout(offset, size, kbuf, ubuf, src, 0, count);
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
if (offset + size >= count)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Fill xsave->i387.sw_reserved value for ptrace frame:
|
|
|
|
*/
|
|
|
|
offset = offsetof(struct fxregs_state, sw_reserved);
|
|
|
|
size = sizeof(xstate_fx_sw_bytes);
|
|
|
|
|
|
|
|
ret = xstate_copyout(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count);
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Convert from a ptrace standard-format buffer to kernel XSAVES format
|
|
|
|
* and copy to the target thread. This is called from xstateregs_set() and
|
|
|
|
* there we check the CPU has XSAVES and a whole standard-sized buffer
|
|
|
|
* exists.
|
|
|
|
*/
|
|
|
|
int copyin_to_xsaves(const void *kbuf, const void __user *ubuf,
|
|
|
|
struct xregs_state *xsave)
|
|
|
|
{
|
|
|
|
unsigned int offset, size;
|
|
|
|
int i;
|
|
|
|
u64 xfeatures;
|
|
|
|
u64 allowed_features;
|
|
|
|
|
|
|
|
offset = offsetof(struct xregs_state, header);
|
|
|
|
size = sizeof(xfeatures);
|
|
|
|
|
|
|
|
if (kbuf) {
|
|
|
|
memcpy(&xfeatures, kbuf + offset, size);
|
|
|
|
} else {
|
|
|
|
if (__copy_from_user(&xfeatures, ubuf + offset, size))
|
|
|
|
return -EFAULT;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Reject if the user sets any disabled or supervisor features:
|
|
|
|
*/
|
|
|
|
allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR;
|
|
|
|
|
|
|
|
if (xfeatures & ~allowed_features)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
for (i = 0; i < XFEATURE_MAX; i++) {
|
|
|
|
u64 mask = ((u64)1 << i);
|
|
|
|
|
|
|
|
if (xfeatures & mask) {
|
|
|
|
void *dst = __raw_xsave_addr(xsave, 1 << i);
|
|
|
|
|
|
|
|
offset = xstate_offsets[i];
|
|
|
|
size = xstate_sizes[i];
|
|
|
|
|
|
|
|
if (kbuf) {
|
|
|
|
memcpy(dst, kbuf + offset, size);
|
|
|
|
} else {
|
|
|
|
if (__copy_from_user(dst, ubuf + offset, size))
|
|
|
|
return -EFAULT;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The state that came in from userspace was user-state only.
|
|
|
|
* Mask all the user states out of 'xfeatures':
|
|
|
|
*/
|
|
|
|
xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Add back in the features that came in from userspace:
|
|
|
|
*/
|
|
|
|
xsave->header.xfeatures |= xfeatures;
|
2016-02-13 04:02:36 +07:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|