// SPDX-License-Identifier: GPL-2.0-only
/*
 * xsave/xrstor support.
 *
 * Author: Suresh Siddha <suresh.b.siddha@intel.com>
 */
7#include <linux/compat.h>
8#include <linux/cpu.h>
9#include <linux/mman.h>
10#include <linux/pkeys.h>
11#include <linux/seq_file.h>
12#include <linux/proc_fs.h>
13
14#include <asm/fpu/api.h>
15#include <asm/fpu/internal.h>
16#include <asm/fpu/signal.h>
17#include <asm/fpu/regset.h>
18#include <asm/fpu/xstate.h>
19
20#include <asm/tlbflush.h>
21#include <asm/cpufeature.h>
22
/*
 * Human-readable names of the xstate components, indexed by xfeature
 * number. The last entry is the catch-all that cpu_has_xfeatures()
 * clamps to for feature numbers beyond the named ones.
 *
 * Although we spell it out in here, the Processor Trace
 * xfeature is completely unused. We use other mechanisms
 * to save/restore PT state in Linux.
 *
 * Both the strings and the pointer table itself are read-only.
 */
static const char * const xfeature_names[] =
{
	"x87 floating point registers"	,
	"SSE registers"			,
	"AVX registers"			,
	"MPX bounds registers"		,
	"MPX CSR"			,
	"AVX-512 opmask"		,
	"AVX-512 Hi256"			,
	"AVX-512 ZMM_Hi256"		,
	"Processor Trace (unused)"	,
	"Protection Keys User registers",
	"unknown xstate feature"	,
};
42
43static short xsave_cpuid_features[] __initdata = {
44 X86_FEATURE_FPU,
45 X86_FEATURE_XMM,
46 X86_FEATURE_AVX,
47 X86_FEATURE_MPX,
48 X86_FEATURE_MPX,
49 X86_FEATURE_AVX512F,
50 X86_FEATURE_AVX512F,
51 X86_FEATURE_AVX512F,
52 X86_FEATURE_INTEL_PT,
53 X86_FEATURE_PKU,
54};
55
56/*
57 * Mask of xstate features supported by the CPU and the kernel:
58 */
59u64 xfeatures_mask __read_mostly;
60
61static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
62static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
63static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8];
64
65/*
66 * The XSAVE area of kernel can be in standard or compacted format;
67 * it is always in standard format for user mode. This is the user
68 * mode standard format size used for signal and ptrace frames.
69 */
70unsigned int fpu_user_xstate_size;
71
72/*
73 * Return whether the system supports a given xfeature.
74 *
75 * Also return the name of the (most advanced) feature that the caller requested:
76 */
77int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
78{
79 u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask;
80
81 if (unlikely(feature_name)) {
82 long xfeature_idx, max_idx;
83 u64 xfeatures_print;
84 /*
85 * So we use FLS here to be able to print the most advanced
86 * feature that was requested but is missing. So if a driver
87 * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
88 * missing AVX feature - this is the most informative message
89 * to users:
90 */
91 if (xfeatures_missing)
92 xfeatures_print = xfeatures_missing;
93 else
94 xfeatures_print = xfeatures_needed;
95
96 xfeature_idx = fls64(xfeatures_print)-1;
97 max_idx = ARRAY_SIZE(xfeature_names)-1;
98 xfeature_idx = min(xfeature_idx, max_idx);
99
100 *feature_name = xfeature_names[xfeature_idx];
101 }
102
103 if (xfeatures_missing)
104 return 0;
105
106 return 1;
107}
108EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
109
110static int xfeature_is_supervisor(int xfeature_nr)
111{
112 /*
113 * We currently do not support supervisor states, but if
114 * we did, we could find out like this.
115 *
116 * SDM says: If state component 'i' is a user state component,
117 * ECX[0] return 0; if state component i is a supervisor
118 * state component, ECX[0] returns 1.
119 */
120 u32 eax, ebx, ecx, edx;
121
122 cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
123 return !!(ecx & 1);
124}
125
/*
 * A state component is a user component exactly when CPUID does not
 * report it as a supervisor component. Returns 1 or 0.
 */
static int xfeature_is_user(int xfeature_nr)
{
	if (xfeature_is_supervisor(xfeature_nr))
		return 0;

	return 1;
}
130
131/*
132 * When executing XSAVEOPT (or other optimized XSAVE instructions), if
133 * a processor implementation detects that an FPU state component is still
134 * (or is again) in its initialized state, it may clear the corresponding
135 * bit in the header.xfeatures field, and can skip the writeout of registers
136 * to the corresponding memory layout.
137 *
138 * This means that when the bit is zero, the state component might still contain
139 * some previous - non-initialized register state.
140 *
141 * Before writing xstate information to user-space we sanitize those components,
142 * to always ensure that the memory layout of a feature will be in the init state
143 * if the corresponding header bit is zero. This is to ensure that user-space doesn't
144 * see some stale state in the memory layout during signal handling, debugging etc.
145 */
146void fpstate_sanitize_xstate(struct fpu *fpu)
147{
148 struct fxregs_state *fx = &fpu->state.fxsave;
149 int feature_bit;
150 u64 xfeatures;
151
152 if (!use_xsaveopt())
153 return;
154
155 xfeatures = fpu->state.xsave.header.xfeatures;
156
157 /*
158 * None of the feature bits are in init state. So nothing else
159 * to do for us, as the memory layout is up to date.
160 */
161 if ((xfeatures & xfeatures_mask) == xfeatures_mask)
162 return;
163
164 /*
165 * FP is in init state
166 */
167 if (!(xfeatures & XFEATURE_MASK_FP)) {
168 fx->cwd = 0x37f;
169 fx->swd = 0;
170 fx->twd = 0;
171 fx->fop = 0;
172 fx->rip = 0;
173 fx->rdp = 0;
174 memset(&fx->st_space[0], 0, 128);
175 }
176
177 /*
178 * SSE is in init state
179 */
180 if (!(xfeatures & XFEATURE_MASK_SSE))
181 memset(&fx->xmm_space[0], 0, 256);
182
183 /*
184 * First two features are FPU and SSE, which above we handled
185 * in a special way already:
186 */
187 feature_bit = 0x2;
188 xfeatures = (xfeatures_mask & ~xfeatures) >> 2;
189
190 /*
191 * Update all the remaining memory layouts according to their
192 * standard xstate layout, if their header bit is in the init
193 * state:
194 */
195 while (xfeatures) {
196 if (xfeatures & 0x1) {
197 int offset = xstate_comp_offsets[feature_bit];
198 int size = xstate_sizes[feature_bit];
199
200 memcpy((void *)fx + offset,
201 (void *)&init_fpstate.xsave + offset,
202 size);
203 }
204
205 xfeatures >>= 1;
206 feature_bit++;
207 }
208}
209
210/*
211 * Enable the extended processor state save/restore feature.
212 * Called once per CPU onlining.
213 */
214void fpu__init_cpu_xstate(void)
215{
216 if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask)
217 return;
218 /*
219 * Make it clear that XSAVES supervisor states are not yet
220 * implemented should anyone expect it to work by changing
221 * bits in XFEATURE_MASK_* macros and XCR0.
222 */
223 WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR),
224 "x86/fpu: XSAVES supervisor states are not yet implemented.\n");
225
226 xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR;
227
228 cr4_set_bits(X86_CR4_OSXSAVE);
229 xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
230}
231
232/*
233 * Note that in the future we will likely need a pair of
234 * functions here: one for user xstates and the other for
235 * system xstates. For now, they are the same.
236 */
237static int xfeature_enabled(enum xfeature xfeature)
238{
239 return !!(xfeatures_mask & (1UL << xfeature));
240}
241
242/*
243 * Record the offsets and sizes of various xstates contained
244 * in the XSAVE state memory layout.
245 */
246static void __init setup_xstate_features(void)
247{
248 u32 eax, ebx, ecx, edx, i;
249 /* start at the beginnning of the "extended state" */
250 unsigned int last_good_offset = offsetof(struct xregs_state,
251 extended_state_area);
252 /*
253 * The FP xstates and SSE xstates are legacy states. They are always
254 * in the fixed offsets in the xsave area in either compacted form
255 * or standard form.
256 */
257 xstate_offsets[0] = 0;
258 xstate_sizes[0] = offsetof(struct fxregs_state, xmm_space);
259 xstate_offsets[1] = xstate_sizes[0];
260 xstate_sizes[1] = FIELD_SIZEOF(struct fxregs_state, xmm_space);
261
262 for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
263 if (!xfeature_enabled(i))
264 continue;
265
266 cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
267
268 /*
269 * If an xfeature is supervisor state, the offset
270 * in EBX is invalid. We leave it to -1.
271 */
272 if (xfeature_is_user(i))
273 xstate_offsets[i] = ebx;
274
275 xstate_sizes[i] = eax;
276 /*
277 * In our xstate size checks, we assume that the
278 * highest-numbered xstate feature has the
279 * highest offset in the buffer. Ensure it does.
280 */
281 WARN_ONCE(last_good_offset > xstate_offsets[i],
282 "x86/fpu: misordered xstate at %d\n", last_good_offset);
283 last_good_offset = xstate_offsets[i];
284 }
285}
286
287static void __init print_xstate_feature(u64 xstate_mask)
288{
289 const char *feature_name;
290
291 if (cpu_has_xfeatures(xstate_mask, &feature_name))
292 pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name);
293}
294
295/*
296 * Print out all the supported xstate features:
297 */
298static void __init print_xstate_features(void)
299{
300 print_xstate_feature(XFEATURE_MASK_FP);
301 print_xstate_feature(XFEATURE_MASK_SSE);
302 print_xstate_feature(XFEATURE_MASK_YMM);
303 print_xstate_feature(XFEATURE_MASK_BNDREGS);
304 print_xstate_feature(XFEATURE_MASK_BNDCSR);
305 print_xstate_feature(XFEATURE_MASK_OPMASK);
306 print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
307 print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
308 print_xstate_feature(XFEATURE_MASK_PKRU);
309}
310
/*
 * Warn if @nr is not an extended xfeature number, i.e. if it is below
 * FIRST_EXTENDED_XFEATURE or not below XFEATURE_MAX.
 *
 * This check is important because it is easy to get XSTATE_*
 * confused with XSTATE_BIT_*.
 *
 * The argument is parenthesized so that an expression can be passed
 * safely.
 */
#define CHECK_XFEATURE(nr) do {				\
	WARN_ON((nr) < FIRST_EXTENDED_XFEATURE);	\
	WARN_ON((nr) >= XFEATURE_MAX);			\
} while (0)
319
320/*
321 * We could cache this like xstate_size[], but we only use
322 * it here, so it would be a waste of space.
323 */
324static int xfeature_is_aligned(int xfeature_nr)
325{
326 u32 eax, ebx, ecx, edx;
327
328 CHECK_XFEATURE(xfeature_nr);
329 cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
330 /*
331 * The value returned by ECX[1] indicates the alignment
332 * of state component 'i' when the compacted format
333 * of the extended region of an XSAVE area is used:
334 */
335 return !!(ecx & 2);
336}
337
338/*
339 * This function sets up offsets and sizes of all extended states in
340 * xsave area. This supports both standard format and compacted format
341 * of the xsave aread.
342 */
343static void __init setup_xstate_comp(void)
344{
345 unsigned int xstate_comp_sizes[sizeof(xfeatures_mask)*8];
346 int i;
347
348 /*
349 * The FP xstates and SSE xstates are legacy states. They are always
350 * in the fixed offsets in the xsave area in either compacted form
351 * or standard form.
352 */
353 xstate_comp_offsets[0] = 0;
354 xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space);
355
356 if (!boot_cpu_has(X86_FEATURE_XSAVES)) {
357 for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
358 if (xfeature_enabled(i)) {
359 xstate_comp_offsets[i] = xstate_offsets[i];
360 xstate_comp_sizes[i] = xstate_sizes[i];
361 }
362 }
363 return;
364 }
365
366 xstate_comp_offsets[FIRST_EXTENDED_XFEATURE] =
367 FXSAVE_SIZE + XSAVE_HDR_SIZE;
368
369 for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
370 if (xfeature_enabled(i))
371 xstate_comp_sizes[i] = xstate_sizes[i];
372 else
373 xstate_comp_sizes[i] = 0;
374
375 if (i > FIRST_EXTENDED_XFEATURE) {
376 xstate_comp_offsets[i] = xstate_comp_offsets[i-1]
377 + xstate_comp_sizes[i-1];
378
379 if (xfeature_is_aligned(i))
380 xstate_comp_offsets[i] =
381 ALIGN(xstate_comp_offsets[i], 64);
382 }
383 }
384}
385
386/*
387 * Print out xstate component offsets and sizes
388 */
389static void __init print_xstate_offset_size(void)
390{
391 int i;
392
393 for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
394 if (!xfeature_enabled(i))
395 continue;
396 pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
397 i, xstate_comp_offsets[i], i, xstate_sizes[i]);
398 }
399}
400
401/*
402 * setup the xstate image representing the init state
403 */
404static void __init setup_init_fpu_buf(void)
405{
406 static int on_boot_cpu __initdata = 1;
407
408 WARN_ON_FPU(!on_boot_cpu);
409 on_boot_cpu = 0;
410
411 if (!boot_cpu_has(X86_FEATURE_XSAVE))
412 return;
413
414 setup_xstate_features();
415 print_xstate_features();
416
417 if (boot_cpu_has(X86_FEATURE_XSAVES))
418 init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask;
419
420 /*
421 * Init all the features state with header.xfeatures being 0x0
422 */
423 copy_kernel_to_xregs_booting(&init_fpstate.xsave);
424
425 /*
426 * Dump the init state again. This is to identify the init state
427 * of any feature which is not represented by all zero's.
428 */
429 copy_xregs_to_kernel_booting(&init_fpstate.xsave);
430}
431
432static int xfeature_uncompacted_offset(int xfeature_nr)
433{
434 u32 eax, ebx, ecx, edx;
435
436 /*
437 * Only XSAVES supports supervisor states and it uses compacted
438 * format. Checking a supervisor state's uncompacted offset is
439 * an error.
440 */
441 if (XFEATURE_MASK_SUPERVISOR & BIT_ULL(xfeature_nr)) {
442 WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr);
443 return -1;
444 }
445
446 CHECK_XFEATURE(xfeature_nr);
447 cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
448 return ebx;
449}
450
451static int xfeature_size(int xfeature_nr)
452{
453 u32 eax, ebx, ecx, edx;
454
455 CHECK_XFEATURE(xfeature_nr);
456 cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
457 return eax;
458}
459
460/*
461 * 'XSAVES' implies two different things:
462 * 1. saving of supervisor/system state
463 * 2. using the compacted format
464 *
465 * Use this function when dealing with the compacted format so
466 * that it is obvious which aspect of 'XSAVES' is being handled
467 * by the calling code.
468 */
469int using_compacted_format(void)
470{
471 return boot_cpu_has(X86_FEATURE_XSAVES);
472}
473
474/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
475int validate_xstate_header(const struct xstate_header *hdr)
476{
477 /* No unknown or supervisor features may be set */
478 if (hdr->xfeatures & (~xfeatures_mask | XFEATURE_MASK_SUPERVISOR))
479 return -EINVAL;
480
481 /* Userspace must use the uncompacted format */
482 if (hdr->xcomp_bv)
483 return -EINVAL;
484
485 /*
486 * If 'reserved' is shrunken to add a new field, make sure to validate
487 * that new field here!
488 */
489 BUILD_BUG_ON(sizeof(hdr->reserved) != 48);
490
491 /* No reserved bits may be set */
492 if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
493 return -EINVAL;
494
495 return 0;
496}
497
498static void __xstate_dump_leaves(void)
499{
500 int i;
501 u32 eax, ebx, ecx, edx;
502 static int should_dump = 1;
503
504 if (!should_dump)
505 return;
506 should_dump = 0;
507 /*
508 * Dump out a few leaves past the ones that we support
509 * just in case there are some goodies up there
510 */
511 for (i = 0; i < XFEATURE_MAX + 10; i++) {
512 cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
513 pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
514 XSTATE_CPUID, i, eax, ebx, ecx, edx);
515 }
516}
517
/*
 * Warn once if @x is true and, when the warning fires, dump the
 * XSTATE_CPUID leaves.
 */
#define XSTATE_WARN_ON(x) do {							\
	if (WARN_ONCE((x), "XSAVE consistency problem, dumping leaves")) {	\
		__xstate_dump_leaves();						\
	}									\
} while (0)

/*
 * When @nr equals @nr_macro, check that the CPU-reported size @sz
 * matches sizeof(@__struct); warn and dump the leaves if it does not.
 * Value arguments are parenthesized so expressions can be passed safely.
 */
#define XCHECK_SZ(sz, nr, nr_macro, __struct) do {			\
	if (((nr) == (nr_macro)) &&					\
	    WARN_ONCE((sz) != sizeof(__struct),				\
		"%s: struct is %zu bytes, cpu state %d bytes\n",	\
		__stringify(nr_macro), sizeof(__struct), (sz))) {	\
		__xstate_dump_leaves();					\
	}								\
} while (0)
532
533/*
534 * We have a C struct for each 'xstate'. We need to ensure
535 * that our software representation matches what the CPU
536 * tells us about the state's size.
537 */
538static void check_xstate_against_struct(int nr)
539{
540 /*
541 * Ask the CPU for the size of the state.
542 */
543 int sz = xfeature_size(nr);
544 /*
545 * Match each CPU state with the corresponding software
546 * structure.
547 */
548 XCHECK_SZ(sz, nr, XFEATURE_YMM, struct ymmh_struct);
549 XCHECK_SZ(sz, nr, XFEATURE_BNDREGS, struct mpx_bndreg_state);
550 XCHECK_SZ(sz, nr, XFEATURE_BNDCSR, struct mpx_bndcsr_state);
551 XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state);
552 XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state);
553 XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state);
554 XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state);
555
556 /*
557 * Make *SURE* to add any feature numbers in below if
558 * there are "holes" in the xsave state component
559 * numbers.
560 */
561 if ((nr < XFEATURE_YMM) ||
562 (nr >= XFEATURE_MAX) ||
563 (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR)) {
564 WARN_ONCE(1, "no structure for xstate: %d\n", nr);
565 XSTATE_WARN_ON(1);
566 }
567}
568
569/*
570 * This essentially double-checks what the cpu told us about
571 * how large the XSAVE buffer needs to be. We are recalculating
572 * it to be safe.
573 */
574static void do_extra_xstate_size_checks(void)
575{
576 int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
577 int i;
578
579 for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
580 if (!xfeature_enabled(i))
581 continue;
582
583 check_xstate_against_struct(i);
584 /*
585 * Supervisor state components can be managed only by
586 * XSAVES, which is compacted-format only.
587 */
588 if (!using_compacted_format())
589 XSTATE_WARN_ON(xfeature_is_supervisor(i));
590
591 /* Align from the end of the previous feature */
592 if (xfeature_is_aligned(i))
593 paranoid_xstate_size = ALIGN(paranoid_xstate_size, 64);
594 /*
595 * The offset of a given state in the non-compacted
596 * format is given to us in a CPUID leaf. We check
597 * them for being ordered (increasing offsets) in
598 * setup_xstate_features().
599 */
600 if (!using_compacted_format())
601 paranoid_xstate_size = xfeature_uncompacted_offset(i);
602 /*
603 * The compacted-format offset always depends on where
604 * the previous state ended.
605 */
606 paranoid_xstate_size += xfeature_size(i);
607 }
608 XSTATE_WARN_ON(paranoid_xstate_size != fpu_kernel_xstate_size);
609}
610
611
612/*
613 * Get total size of enabled xstates in XCR0/xfeatures_mask.
614 *
615 * Note the SDM's wording here. "sub-function 0" only enumerates
616 * the size of the *user* states. If we use it to size a buffer
617 * that we use 'XSAVES' on, we could potentially overflow the
618 * buffer because 'XSAVES' saves system states too.
619 *
620 * Note that we do not currently set any bits on IA32_XSS so
621 * 'XCR0 | IA32_XSS == XCR0' for now.
622 */
623static unsigned int __init get_xsaves_size(void)
624{
625 unsigned int eax, ebx, ecx, edx;
626 /*
627 * - CPUID function 0DH, sub-function 1:
628 * EBX enumerates the size (in bytes) required by
629 * the XSAVES instruction for an XSAVE area
630 * containing all the state components
631 * corresponding to bits currently set in
632 * XCR0 | IA32_XSS.
633 */
634 cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
635 return ebx;
636}
637
638static unsigned int __init get_xsave_size(void)
639{
640 unsigned int eax, ebx, ecx, edx;
641 /*
642 * - CPUID function 0DH, sub-function 0:
643 * EBX enumerates the size (in bytes) required by
644 * the XSAVE instruction for an XSAVE area
645 * containing all the *user* state components
646 * corresponding to bits currently set in XCR0.
647 */
648 cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
649 return ebx;
650}
651
652/*
653 * Will the runtime-enumerated 'xstate_size' fit in the init
654 * task's statically-allocated buffer?
655 */
656static bool is_supported_xstate_size(unsigned int test_xstate_size)
657{
658 if (test_xstate_size <= sizeof(union fpregs_state))
659 return true;
660
661 pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n",
662 sizeof(union fpregs_state), test_xstate_size);
663 return false;
664}
665
666static int __init init_xstate_size(void)
667{
668 /* Recompute the context size for enabled features: */
669 unsigned int possible_xstate_size;
670 unsigned int xsave_size;
671
672 xsave_size = get_xsave_size();
673
674 if (boot_cpu_has(X86_FEATURE_XSAVES))
675 possible_xstate_size = get_xsaves_size();
676 else
677 possible_xstate_size = xsave_size;
678
679 /* Ensure we have the space to store all enabled: */
680 if (!is_supported_xstate_size(possible_xstate_size))
681 return -EINVAL;
682
683 /*
684 * The size is OK, we are definitely going to use xsave,
685 * make it known to the world that we need more space.
686 */
687 fpu_kernel_xstate_size = possible_xstate_size;
688 do_extra_xstate_size_checks();
689
690 /*
691 * User space is always in standard format.
692 */
693 fpu_user_xstate_size = xsave_size;
694 return 0;
695}
696
697/*
698 * We enabled the XSAVE hardware, but something went wrong and
699 * we can not use it. Disable it.
700 */
701static void fpu__init_disable_system_xstate(void)
702{
703 xfeatures_mask = 0;
704 cr4_clear_bits(X86_CR4_OSXSAVE);
705 setup_clear_cpu_cap(X86_FEATURE_XSAVE);
706}
707
708/*
709 * Enable and initialize the xsave feature.
710 * Called once per system bootup.
711 */
712void __init fpu__init_system_xstate(void)
713{
714 unsigned int eax, ebx, ecx, edx;
715 static int on_boot_cpu __initdata = 1;
716 int err;
717 int i;
718
719 WARN_ON_FPU(!on_boot_cpu);
720 on_boot_cpu = 0;
721
722 if (!boot_cpu_has(X86_FEATURE_FPU)) {
723 pr_info("x86/fpu: No FPU detected\n");
724 return;
725 }
726
727 if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
728 pr_info("x86/fpu: x87 FPU will use %s\n",
729 boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
730 return;
731 }
732
733 if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
734 WARN_ON_FPU(1);
735 return;
736 }
737
738 cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
739 xfeatures_mask = eax + ((u64)edx << 32);
740
741 if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
742 /*
743 * This indicates that something really unexpected happened
744 * with the enumeration. Disable XSAVE and try to continue
745 * booting without it. This is too early to BUG().
746 */
747 pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask);
748 goto out_disable;
749 }
750
751 /*
752 * Clear XSAVE features that are disabled in the normal CPUID.
753 */
754 for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
755 if (!boot_cpu_has(xsave_cpuid_features[i]))
756 xfeatures_mask &= ~BIT(i);
757 }
758
759 xfeatures_mask &= fpu__get_supported_xfeatures_mask();
760
761 /* Enable xstate instructions to be able to continue with initialization: */
762 fpu__init_cpu_xstate();
763 err = init_xstate_size();
764 if (err)
765 goto out_disable;
766
767 /*
768 * Update info used for ptrace frames; use standard-format size and no
769 * supervisor xstates:
770 */
771 update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR);
772
773 fpu__init_prepare_fx_sw_frame();
774 setup_init_fpu_buf();
775 setup_xstate_comp();
776 print_xstate_offset_size();
777
778 pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
779 xfeatures_mask,
780 fpu_kernel_xstate_size,
781 boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard");
782 return;
783
784out_disable:
785 /* something went wrong, try to boot without any XSAVE support */
786 fpu__init_disable_system_xstate();
787}
788
789/*
790 * Restore minimal FPU state after suspend:
791 */
792void fpu__resume_cpu(void)
793{
794 /*
795 * Restore XCR0 on xsave capable CPUs:
796 */
797 if (boot_cpu_has(X86_FEATURE_XSAVE))
798 xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
799}
800
801/*
802 * Given an xstate feature nr, calculate where in the xsave
803 * buffer the state is. Callers should ensure that the buffer
804 * is valid.
805 */
806static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
807{
808 if (!xfeature_enabled(xfeature_nr)) {
809 WARN_ON_FPU(1);
810 return NULL;
811 }
812
813 return (void *)xsave + xstate_comp_offsets[xfeature_nr];
814}
815/*
816 * Given the xsave area and a state inside, this function returns the
817 * address of the state.
818 *
819 * This is the API that is called to get xstate address in either
820 * standard format or compacted format of xsave area.
821 *
822 * Note that if there is no data for the field in the xsave buffer
823 * this will return NULL.
824 *
825 * Inputs:
826 * xstate: the thread's storage area for all FPU data
827 * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
828 * XFEATURE_SSE, etc...)
829 * Output:
830 * address of the state in the xsave area, or NULL if the
831 * field is not present in the xsave buffer.
832 */
833void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
834{
835 /*
836 * Do we even *have* xsave state?
837 */
838 if (!boot_cpu_has(X86_FEATURE_XSAVE))
839 return NULL;
840
841 /*
842 * We should not ever be requesting features that we
843 * have not enabled. Remember that pcntxt_mask is
844 * what we write to the XCR0 register.
845 */
846 WARN_ONCE(!(xfeatures_mask & BIT_ULL(xfeature_nr)),
847 "get of unsupported state");
848 /*
849 * This assumes the last 'xsave*' instruction to
850 * have requested that 'xfeature_nr' be saved.
851 * If it did not, we might be seeing and old value
852 * of the field in the buffer.
853 *
854 * This can happen because the last 'xsave' did not
855 * request that this feature be saved (unlikely)
856 * or because the "init optimization" caused it
857 * to not be saved.
858 */
859 if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr)))
860 return NULL;
861
862 return __raw_xsave_addr(xsave, xfeature_nr);
863}
864EXPORT_SYMBOL_GPL(get_xsave_addr);
865
866/*
867 * This wraps up the common operations that need to occur when retrieving
868 * data from xsave state. It first ensures that the current task was
869 * using the FPU and retrieves the data in to a buffer. It then calculates
870 * the offset of the requested field in the buffer.
871 *
872 * This function is safe to call whether the FPU is in use or not.
873 *
874 * Note that this only works on the current task.
875 *
876 * Inputs:
877 * @xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
878 * XFEATURE_SSE, etc...)
879 * Output:
880 * address of the state in the xsave area or NULL if the state
881 * is not present or is in its 'init state'.
882 */
883const void *get_xsave_field_ptr(int xfeature_nr)
884{
885 struct fpu *fpu = ¤t->thread.fpu;
886
887 /*
888 * fpu__save() takes the CPU's xstate registers
889 * and saves them off to the 'fpu memory buffer.
890 */
891 fpu__save(fpu);
892
893 return get_xsave_addr(&fpu->state.xsave, xfeature_nr);
894}
895
#ifdef CONFIG_ARCH_HAS_PKEYS

#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2)
#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1)
/*
 * This will go out and modify PKRU register to set the access
 * rights for @pkey to @init_val.
 *
 * @tsk:      not used by this implementation.
 * @pkey:     protection key whose rights are changed; must be in
 *            [0, CONFIG_NR_PROTECTION_KEYS).
 * @init_val: combination of PKEY_DISABLE_ACCESS / PKEY_DISABLE_WRITE.
 *
 * Returns 0 on success, -EINVAL when the CPU has no OSPKE or when @pkey
 * is out of range.
 */
int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
		unsigned long init_val)
{
	u32 old_pkru;
	int pkey_shift;
	u32 new_pkru_bits = 0;

	/*
	 * This check implies XSAVE support.  OSPKE only gets
	 * set if we enable XSAVE and we enable PKU in XCR0.
	 */
	if (!boot_cpu_has(X86_FEATURE_OSPKE))
		return -EINVAL;

	/*
	 * Reject keys outside the configured range before using the key
	 * to build a shift count: a negative or too-large count would be
	 * an undefined shift of the 32-bit PKRU value.
	 */
	if (WARN_ONCE(pkey < 0 || pkey >= CONFIG_NR_PROTECTION_KEYS,
		      "x86/fpu: invalid protection key %d\n", pkey))
		return -EINVAL;

	pkey_shift = pkey * PKRU_BITS_PER_PKEY;

	/* Set the bits we need in PKRU:  */
	if (init_val & PKEY_DISABLE_ACCESS)
		new_pkru_bits |= PKRU_AD_BIT;
	if (init_val & PKEY_DISABLE_WRITE)
		new_pkru_bits |= PKRU_WD_BIT;

	/* Shift the bits in to the correct place in PKRU for pkey: */
	new_pkru_bits <<= pkey_shift;

	/*
	 * Get old PKRU and mask off any old bits in place. The mask is
	 * shifted as a u32 so the topmost key cannot shift into the sign
	 * bit of a signed int.
	 */
	old_pkru = read_pkru();
	old_pkru &= ~((u32)(PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);

	/* Write old part along with new part: */
	write_pkru(old_pkru | new_pkru_bits);

	return 0;
}
#endif /* CONFIG_ARCH_HAS_PKEYS */
937
938/*
939 * Weird legacy quirk: SSE and YMM states store information in the
940 * MXCSR and MXCSR_FLAGS fields of the FP area. That means if the FP
941 * area is marked as unused in the xfeatures header, we need to copy
942 * MXCSR and MXCSR_FLAGS if either SSE or YMM are in use.
943 */
944static inline bool xfeatures_mxcsr_quirk(u64 xfeatures)
945{
946 if (!(xfeatures & (XFEATURE_MASK_SSE|XFEATURE_MASK_YMM)))
947 return false;
948
949 if (xfeatures & XFEATURE_MASK_FP)
950 return false;
951
952 return true;
953}
954
/*
 * This is similar to user_regset_copyout(), but will not add offset to
 * the source data pointer or increment pos, count, kbuf, and ubuf.
 *
 * Copies up to @size bytes from @data to @kbuf + @offset, truncated so
 * the copy never goes past @size_total. Nothing is copied when @offset
 * is at or beyond @size_total.
 */
static inline void
__copy_xstate_to_kernel(void *kbuf, const void *data,
			unsigned int offset, unsigned int size, unsigned int size_total)
{
	unsigned int room;

	if (offset >= size_total)
		return;

	room = size_total - offset;
	memcpy(kbuf + offset, data, min(size, room));
}
969
970/*
971 * Convert from kernel XSAVES compacted format to standard format and copy
972 * to a kernel-space ptrace buffer.
973 *
974 * It supports partial copy but pos always starts from zero. This is called
975 * from xstateregs_get() and there we check the CPU has XSAVES.
976 */
977int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total)
978{
979 unsigned int offset, size;
980 struct xstate_header header;
981 int i;
982
983 /*
984 * Currently copy_regset_to_user() starts from pos 0:
985 */
986 if (unlikely(offset_start != 0))
987 return -EFAULT;
988
989 /*
990 * The destination is a ptrace buffer; we put in only user xstates:
991 */
992 memset(&header, 0, sizeof(header));
993 header.xfeatures = xsave->header.xfeatures;
994 header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR;
995
996 /*
997 * Copy xregs_state->header:
998 */
999 offset = offsetof(struct xregs_state, header);
1000 size = sizeof(header);
1001
1002 __copy_xstate_to_kernel(kbuf, &header, offset, size, size_total);
1003
1004 for (i = 0; i < XFEATURE_MAX; i++) {
1005 /*
1006 * Copy only in-use xstates:
1007 */
1008 if ((header.xfeatures >> i) & 1) {
1009 void *src = __raw_xsave_addr(xsave, i);
1010
1011 offset = xstate_offsets[i];
1012 size = xstate_sizes[i];
1013
1014 /* The next component has to fit fully into the output buffer: */
1015 if (offset + size > size_total)
1016 break;
1017
1018 __copy_xstate_to_kernel(kbuf, src, offset, size, size_total);
1019 }
1020
1021 }
1022
1023 if (xfeatures_mxcsr_quirk(header.xfeatures)) {
1024 offset = offsetof(struct fxregs_state, mxcsr);
1025 size = MXCSR_AND_FLAGS_SIZE;
1026 __copy_xstate_to_kernel(kbuf, &xsave->i387.mxcsr, offset, size, size_total);
1027 }
1028
1029 /*
1030 * Fill xsave->i387.sw_reserved value for ptrace frame:
1031 */
1032 offset = offsetof(struct fxregs_state, sw_reserved);
1033 size = sizeof(xstate_fx_sw_bytes);
1034
1035 __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, size_total);
1036
1037 return 0;
1038}
1039
1040static inline int
1041__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int offset, unsigned int size, unsigned int size_total)
1042{
1043 if (!size)
1044 return 0;
1045
1046 if (offset < size_total) {
1047 unsigned int copy = min(size, size_total - offset);
1048
1049 if (__copy_to_user(ubuf + offset, data, copy))
1050 return -EFAULT;
1051 }
1052 return 0;
1053}
1054
1055/*
1056 * Convert from kernel XSAVES compacted format to standard format and copy
1057 * to a user-space buffer. It supports partial copy but pos always starts from
1058 * zero. This is called from xstateregs_get() and there we check the CPU
1059 * has XSAVES.
1060 */
1061int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total)
1062{
1063 unsigned int offset, size;
1064 int ret, i;
1065 struct xstate_header header;
1066
1067 /*
1068 * Currently copy_regset_to_user() starts from pos 0:
1069 */
1070 if (unlikely(offset_start != 0))
1071 return -EFAULT;
1072
1073 /*
1074 * The destination is a ptrace buffer; we put in only user xstates:
1075 */
1076 memset(&header, 0, sizeof(header));
1077 header.xfeatures = xsave->header.xfeatures;
1078 header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR;
1079
1080 /*
1081 * Copy xregs_state->header:
1082 */
1083 offset = offsetof(struct xregs_state, header);
1084 size = sizeof(header);
1085
1086 ret = __copy_xstate_to_user(ubuf, &header, offset, size, size_total);
1087 if (ret)
1088 return ret;
1089
1090 for (i = 0; i < XFEATURE_MAX; i++) {
1091 /*
1092 * Copy only in-use xstates:
1093 */
1094 if ((header.xfeatures >> i) & 1) {
1095 void *src = __raw_xsave_addr(xsave, i);
1096
1097 offset = xstate_offsets[i];
1098 size = xstate_sizes[i];
1099
1100 /* The next component has to fit fully into the output buffer: */
1101 if (offset + size > size_total)
1102 break;
1103
1104 ret = __copy_xstate_to_user(ubuf, src, offset, size, size_total);
1105 if (ret)
1106 return ret;
1107 }
1108
1109 }
1110
1111 if (xfeatures_mxcsr_quirk(header.xfeatures)) {
1112 offset = offsetof(struct fxregs_state, mxcsr);
1113 size = MXCSR_AND_FLAGS_SIZE;
1114 __copy_xstate_to_user(ubuf, &xsave->i387.mxcsr, offset, size, size_total);
1115 }
1116
1117 /*
1118 * Fill xsave->i387.sw_reserved value for ptrace frame:
1119 */
1120 offset = offsetof(struct fxregs_state, sw_reserved);
1121 size = sizeof(xstate_fx_sw_bytes);
1122
1123 ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, size_total);
1124 if (ret)
1125 return ret;
1126
1127 return 0;
1128}
1129
1130/*
1131 * Convert from a ptrace standard-format kernel buffer to kernel XSAVES format
1132 * and copy to the target thread. This is called from xstateregs_set().
1133 */
1134int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
1135{
1136 unsigned int offset, size;
1137 int i;
1138 struct xstate_header hdr;
1139
1140 offset = offsetof(struct xregs_state, header);
1141 size = sizeof(hdr);
1142
1143 memcpy(&hdr, kbuf + offset, size);
1144
1145 if (validate_xstate_header(&hdr))
1146 return -EINVAL;
1147
1148 for (i = 0; i < XFEATURE_MAX; i++) {
1149 u64 mask = ((u64)1 << i);
1150
1151 if (hdr.xfeatures & mask) {
1152 void *dst = __raw_xsave_addr(xsave, i);
1153
1154 offset = xstate_offsets[i];
1155 size = xstate_sizes[i];
1156
1157 memcpy(dst, kbuf + offset, size);
1158 }
1159 }
1160
1161 if (xfeatures_mxcsr_quirk(hdr.xfeatures)) {
1162 offset = offsetof(struct fxregs_state, mxcsr);
1163 size = MXCSR_AND_FLAGS_SIZE;
1164 memcpy(&xsave->i387.mxcsr, kbuf + offset, size);
1165 }
1166
1167 /*
1168 * The state that came in from userspace was user-state only.
1169 * Mask all the user states out of 'xfeatures':
1170 */
1171 xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR;
1172
1173 /*
1174 * Add back in the features that came in from userspace:
1175 */
1176 xsave->header.xfeatures |= hdr.xfeatures;
1177
1178 return 0;
1179}
1180
1181/*
1182 * Convert from a ptrace or sigreturn standard-format user-space buffer to
1183 * kernel XSAVES format and copy to the target thread. This is called from
1184 * xstateregs_set(), as well as potentially from the sigreturn() and
1185 * rt_sigreturn() system calls.
1186 */
1187int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
1188{
1189 unsigned int offset, size;
1190 int i;
1191 struct xstate_header hdr;
1192
1193 offset = offsetof(struct xregs_state, header);
1194 size = sizeof(hdr);
1195
1196 if (__copy_from_user(&hdr, ubuf + offset, size))
1197 return -EFAULT;
1198
1199 if (validate_xstate_header(&hdr))
1200 return -EINVAL;
1201
1202 for (i = 0; i < XFEATURE_MAX; i++) {
1203 u64 mask = ((u64)1 << i);
1204
1205 if (hdr.xfeatures & mask) {
1206 void *dst = __raw_xsave_addr(xsave, i);
1207
1208 offset = xstate_offsets[i];
1209 size = xstate_sizes[i];
1210
1211 if (__copy_from_user(dst, ubuf + offset, size))
1212 return -EFAULT;
1213 }
1214 }
1215
1216 if (xfeatures_mxcsr_quirk(hdr.xfeatures)) {
1217 offset = offsetof(struct fxregs_state, mxcsr);
1218 size = MXCSR_AND_FLAGS_SIZE;
1219 if (__copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size))
1220 return -EFAULT;
1221 }
1222
1223 /*
1224 * The state that came in from userspace was user-state only.
1225 * Mask all the user states out of 'xfeatures':
1226 */
1227 xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR;
1228
1229 /*
1230 * Add back in the features that came in from userspace:
1231 */
1232 xsave->header.xfeatures |= hdr.xfeatures;
1233
1234 return 0;
1235}
1236
1237#ifdef CONFIG_PROC_PID_ARCH_STATUS
1238/*
1239 * Report the amount of time elapsed in millisecond since last AVX512
1240 * use in the task.
1241 */
1242static void avx512_status(struct seq_file *m, struct task_struct *task)
1243{
1244 unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp);
1245 long delta;
1246
1247 if (!timestamp) {
1248 /*
1249 * Report -1 if no AVX512 usage
1250 */
1251 delta = -1;
1252 } else {
1253 delta = (long)(jiffies - timestamp);
1254 /*
1255 * Cap to LONG_MAX if time difference > LONG_MAX
1256 */
1257 if (delta < 0)
1258 delta = LONG_MAX;
1259 delta = jiffies_to_msecs(delta);
1260 }
1261
1262 seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta);
1263 seq_putc(m, '\n');
1264}
1265
1266/*
1267 * Report architecture specific information
1268 */
1269int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
1270 struct pid *pid, struct task_struct *task)
1271{
1272 /*
1273 * Report AVX512 state if the processor and build option supported.
1274 */
1275 if (cpu_feature_enabled(X86_FEATURE_AVX512F))
1276 avx512_status(m, task);
1277
1278 return 0;
1279}
1280#endif /* CONFIG_PROC_PID_ARCH_STATUS */
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * xsave/xrstor support.
4 *
5 * Author: Suresh Siddha <suresh.b.siddha@intel.com>
6 */
7#include <linux/bitops.h>
8#include <linux/compat.h>
9#include <linux/cpu.h>
10#include <linux/mman.h>
11#include <linux/nospec.h>
12#include <linux/pkeys.h>
13#include <linux/seq_file.h>
14#include <linux/proc_fs.h>
15#include <linux/vmalloc.h>
16
17#include <asm/fpu/api.h>
18#include <asm/fpu/regset.h>
19#include <asm/fpu/signal.h>
20#include <asm/fpu/xcr.h>
21
22#include <asm/tlbflush.h>
23#include <asm/prctl.h>
24#include <asm/elf.h>
25
26#include "context.h"
27#include "internal.h"
28#include "legacy.h"
29#include "xstate.h"
30
/*
 * Iterate @bit over the set bits of @mask, starting at
 * FIRST_EXTENDED_XFEATURE.
 *
 * Note: this expands to an assignment statement followed by a loop header,
 * so it must not be used as the unbraced body of an if/else or another loop.
 */
#define for_each_extended_xfeature(bit, mask)					\
	(bit) = FIRST_EXTENDED_XFEATURE;					\
	for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask))
34
/*
 * Human readable names, indexed by xfeature number. The last entry doubles
 * as the fallback name used by cpu_has_xfeatures() for out-of-range indices.
 *
 * Although we spell it out in here, the Processor Trace
 * xfeature is completely unused. We use other mechanisms
 * to save/restore PT state in Linux.
 */
static const char *xfeature_names[] =
{
	/*  0 */ "x87 floating point registers",
	/*  1 */ "SSE registers",
	/*  2 */ "AVX registers",
	/*  3 */ "MPX bounds registers",
	/*  4 */ "MPX CSR",
	/*  5 */ "AVX-512 opmask",
	/*  6 */ "AVX-512 Hi256",
	/*  7 */ "AVX-512 ZMM_Hi256",
	/*  8 */ "Processor Trace (unused)",
	/*  9 */ "Protection Keys User registers",
	/* 10 */ "PASID state",
	/* 11 */ "Control-flow User registers",
	/* 12 */ "Control-flow Kernel registers (unused)",
	/* 13 */ "unknown xstate feature",
	/* 14 */ "unknown xstate feature",
	/* 15 */ "unknown xstate feature",
	/* 16 */ "unknown xstate feature",
	/* 17 */ "AMX Tile config",
	/* 18 */ "AMX Tile data",
	/* 19 */ "unknown xstate feature",
};
63
64static unsigned short xsave_cpuid_features[] __initdata = {
65 [XFEATURE_FP] = X86_FEATURE_FPU,
66 [XFEATURE_SSE] = X86_FEATURE_XMM,
67 [XFEATURE_YMM] = X86_FEATURE_AVX,
68 [XFEATURE_BNDREGS] = X86_FEATURE_MPX,
69 [XFEATURE_BNDCSR] = X86_FEATURE_MPX,
70 [XFEATURE_OPMASK] = X86_FEATURE_AVX512F,
71 [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F,
72 [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F,
73 [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT,
74 [XFEATURE_PKRU] = X86_FEATURE_OSPKE,
75 [XFEATURE_PASID] = X86_FEATURE_ENQCMD,
76 [XFEATURE_CET_USER] = X86_FEATURE_SHSTK,
77 [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE,
78 [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE,
79};
80
81static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
82 { [ 0 ... XFEATURE_MAX - 1] = -1};
83static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
84 { [ 0 ... XFEATURE_MAX - 1] = -1};
85static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init;
86
87#define XSTATE_FLAG_SUPERVISOR BIT(0)
88#define XSTATE_FLAG_ALIGNED64 BIT(1)
89
90/*
91 * Return whether the system supports a given xfeature.
92 *
93 * Also return the name of the (most advanced) feature that the caller requested:
94 */
95int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
96{
97 u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features;
98
99 if (unlikely(feature_name)) {
100 long xfeature_idx, max_idx;
101 u64 xfeatures_print;
102 /*
103 * So we use FLS here to be able to print the most advanced
104 * feature that was requested but is missing. So if a driver
105 * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
106 * missing AVX feature - this is the most informative message
107 * to users:
108 */
109 if (xfeatures_missing)
110 xfeatures_print = xfeatures_missing;
111 else
112 xfeatures_print = xfeatures_needed;
113
114 xfeature_idx = fls64(xfeatures_print)-1;
115 max_idx = ARRAY_SIZE(xfeature_names)-1;
116 xfeature_idx = min(xfeature_idx, max_idx);
117
118 *feature_name = xfeature_names[xfeature_idx];
119 }
120
121 if (xfeatures_missing)
122 return 0;
123
124 return 1;
125}
126EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
127
128static bool xfeature_is_aligned64(int xfeature_nr)
129{
130 return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64;
131}
132
133static bool xfeature_is_supervisor(int xfeature_nr)
134{
135 return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR;
136}
137
138static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature)
139{
140 unsigned int offs, i;
141
142 /*
143 * Non-compacted format and legacy features use the cached fixed
144 * offsets.
145 */
146 if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) ||
147 xfeature <= XFEATURE_SSE)
148 return xstate_offsets[xfeature];
149
150 /*
151 * Compacted format offsets depend on the actual content of the
152 * compacted xsave area which is determined by the xcomp_bv header
153 * field.
154 */
155 offs = FXSAVE_SIZE + XSAVE_HDR_SIZE;
156 for_each_extended_xfeature(i, xcomp_bv) {
157 if (xfeature_is_aligned64(i))
158 offs = ALIGN(offs, 64);
159 if (i == xfeature)
160 break;
161 offs += xstate_sizes[i];
162 }
163 return offs;
164}
165
166/*
167 * Enable the extended processor state save/restore feature.
168 * Called once per CPU onlining.
169 */
170void fpu__init_cpu_xstate(void)
171{
172 if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features)
173 return;
174
175 cr4_set_bits(X86_CR4_OSXSAVE);
176
177 /*
178 * Must happen after CR4 setup and before xsetbv() to allow KVM
179 * lazy passthrough. Write independent of the dynamic state static
180 * key as that does not work on the boot CPU. This also ensures
181 * that any stale state is wiped out from XFD.
182 */
183 if (cpu_feature_enabled(X86_FEATURE_XFD))
184 wrmsrl(MSR_IA32_XFD, init_fpstate.xfd);
185
186 /*
187 * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
188 * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user
189 * states can be set here.
190 */
191 xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
192
193 /*
194 * MSR_IA32_XSS sets supervisor states managed by XSAVES.
195 */
196 if (boot_cpu_has(X86_FEATURE_XSAVES)) {
197 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
198 xfeatures_mask_independent());
199 }
200}
201
202static bool xfeature_enabled(enum xfeature xfeature)
203{
204 return fpu_kernel_cfg.max_features & BIT_ULL(xfeature);
205}
206
207/*
208 * Record the offsets and sizes of various xstates contained
209 * in the XSAVE state memory layout.
210 */
211static void __init setup_xstate_cache(void)
212{
213 u32 eax, ebx, ecx, edx, i;
214 /* start at the beginning of the "extended state" */
215 unsigned int last_good_offset = offsetof(struct xregs_state,
216 extended_state_area);
217 /*
218 * The FP xstates and SSE xstates are legacy states. They are always
219 * in the fixed offsets in the xsave area in either compacted form
220 * or standard form.
221 */
222 xstate_offsets[XFEATURE_FP] = 0;
223 xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state,
224 xmm_space);
225
226 xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP];
227 xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state,
228 xmm_space);
229
230 for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
231 cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
232
233 xstate_sizes[i] = eax;
234 xstate_flags[i] = ecx;
235
236 /*
237 * If an xfeature is supervisor state, the offset in EBX is
238 * invalid, leave it to -1.
239 */
240 if (xfeature_is_supervisor(i))
241 continue;
242
243 xstate_offsets[i] = ebx;
244
245 /*
246 * In our xstate size checks, we assume that the highest-numbered
247 * xstate feature has the highest offset in the buffer. Ensure
248 * it does.
249 */
250 WARN_ONCE(last_good_offset > xstate_offsets[i],
251 "x86/fpu: misordered xstate at %d\n", last_good_offset);
252
253 last_good_offset = xstate_offsets[i];
254 }
255}
256
257static void __init print_xstate_feature(u64 xstate_mask)
258{
259 const char *feature_name;
260
261 if (cpu_has_xfeatures(xstate_mask, &feature_name))
262 pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name);
263}
264
265/*
266 * Print out all the supported xstate features:
267 */
268static void __init print_xstate_features(void)
269{
270 print_xstate_feature(XFEATURE_MASK_FP);
271 print_xstate_feature(XFEATURE_MASK_SSE);
272 print_xstate_feature(XFEATURE_MASK_YMM);
273 print_xstate_feature(XFEATURE_MASK_BNDREGS);
274 print_xstate_feature(XFEATURE_MASK_BNDCSR);
275 print_xstate_feature(XFEATURE_MASK_OPMASK);
276 print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
277 print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
278 print_xstate_feature(XFEATURE_MASK_PKRU);
279 print_xstate_feature(XFEATURE_MASK_PASID);
280 print_xstate_feature(XFEATURE_MASK_CET_USER);
281 print_xstate_feature(XFEATURE_MASK_XTILE_CFG);
282 print_xstate_feature(XFEATURE_MASK_XTILE_DATA);
283}
284
/*
 * This check is important because it is easy to get XSTATE_*
 * confused with XSTATE_BIT_*.
 *
 * Warns when @nr is below FIRST_EXTENDED_XFEATURE or not below XFEATURE_MAX.
 * The argument is parenthesized so that an expression can be passed safely;
 * note that it is evaluated twice.
 */
#define CHECK_XFEATURE(nr) do {				\
	WARN_ON((nr) < FIRST_EXTENDED_XFEATURE);	\
	WARN_ON((nr) >= XFEATURE_MAX);			\
} while (0)
293
294/*
295 * Print out xstate component offsets and sizes
296 */
297static void __init print_xstate_offset_size(void)
298{
299 int i;
300
301 for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
302 pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
303 i, xfeature_get_offset(fpu_kernel_cfg.max_features, i),
304 i, xstate_sizes[i]);
305 }
306}
307
308/*
309 * This function is called only during boot time when x86 caps are not set
310 * up and alternative can not be used yet.
311 */
312static __init void os_xrstor_booting(struct xregs_state *xstate)
313{
314 u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE;
315 u32 lmask = mask;
316 u32 hmask = mask >> 32;
317 int err;
318
319 if (cpu_feature_enabled(X86_FEATURE_XSAVES))
320 XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
321 else
322 XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
323
324 /*
325 * We should never fault when copying from a kernel buffer, and the FPU
326 * state we set at boot time should be valid.
327 */
328 WARN_ON_FPU(err);
329}
330
/*
 * Every supported feature either has an all-zeros init state or is dealt
 * with individually in setup_init_fpu(). The list is spelled out explicitly
 * instead of reusing XFEATURE_MASK*SUPPORTED so that a newly added supported
 * feature is caught at build time and somebody actually has to look at the
 * init state of the new feature.
 */
#define XFEATURES_INIT_FPSTATE_HANDLED		\
	(XFEATURE_MASK_FP |			\
	 XFEATURE_MASK_SSE |			\
	 XFEATURE_MASK_YMM |			\
	 XFEATURE_MASK_BNDREGS |		\
	 XFEATURE_MASK_BNDCSR |			\
	 XFEATURE_MASK_OPMASK |			\
	 XFEATURE_MASK_ZMM_Hi256 |		\
	 XFEATURE_MASK_Hi16_ZMM |		\
	 XFEATURE_MASK_PKRU |			\
	 XFEATURE_MASK_PASID |			\
	 XFEATURE_MASK_CET_USER |		\
	 XFEATURE_MASK_XTILE)
351
352/*
353 * setup the xstate image representing the init state
354 */
355static void __init setup_init_fpu_buf(void)
356{
357 BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
358 XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
359 XFEATURES_INIT_FPSTATE_HANDLED);
360
361 if (!boot_cpu_has(X86_FEATURE_XSAVE))
362 return;
363
364 print_xstate_features();
365
366 xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures);
367
368 /*
369 * Init all the features state with header.xfeatures being 0x0
370 */
371 os_xrstor_booting(&init_fpstate.regs.xsave);
372
373 /*
374 * All components are now in init state. Read the state back so
375 * that init_fpstate contains all non-zero init state. This only
376 * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because
377 * those use the init optimization which skips writing data for
378 * components in init state.
379 *
380 * XSAVE could be used, but that would require to reshuffle the
381 * data when XSAVEC/S is available because XSAVEC/S uses xstate
382 * compaction. But doing so is a pointless exercise because most
383 * components have an all zeros init state except for the legacy
384 * ones (FP and SSE). Those can be saved with FXSAVE into the
385 * legacy area. Adding new features requires to ensure that init
386 * state is all zeroes or if not to add the necessary handling
387 * here.
388 */
389 fxsave(&init_fpstate.regs.fxsave);
390}
391
392int xfeature_size(int xfeature_nr)
393{
394 u32 eax, ebx, ecx, edx;
395
396 CHECK_XFEATURE(xfeature_nr);
397 cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
398 return eax;
399}
400
401/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
402static int validate_user_xstate_header(const struct xstate_header *hdr,
403 struct fpstate *fpstate)
404{
405 /* No unknown or supervisor features may be set */
406 if (hdr->xfeatures & ~fpstate->user_xfeatures)
407 return -EINVAL;
408
409 /* Userspace must use the uncompacted format */
410 if (hdr->xcomp_bv)
411 return -EINVAL;
412
413 /*
414 * If 'reserved' is shrunken to add a new field, make sure to validate
415 * that new field here!
416 */
417 BUILD_BUG_ON(sizeof(hdr->reserved) != 48);
418
419 /* No reserved bits may be set */
420 if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
421 return -EINVAL;
422
423 return 0;
424}
425
426static void __init __xstate_dump_leaves(void)
427{
428 int i;
429 u32 eax, ebx, ecx, edx;
430 static int should_dump = 1;
431
432 if (!should_dump)
433 return;
434 should_dump = 0;
435 /*
436 * Dump out a few leaves past the ones that we support
437 * just in case there are some goodies up there
438 */
439 for (i = 0; i < XFEATURE_MAX + 10; i++) {
440 cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
441 pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
442 XSTATE_CPUID, i, eax, ebx, ecx, edx);
443 }
444}
445
/*
 * WARN_ONCE() with an "XSAVE consistency problem: " prefix; when the warning
 * condition @x holds, the CPUID leaves are dumped as well.
 */
#define XSTATE_WARN_ON(x, fmt, ...)						\
do {										\
	if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) {	\
		__xstate_dump_leaves();						\
	}									\
} while (0)
451
/*
 * Warn (once) and dump the CPUID leaves when the CPU reported size @sz of
 * xfeature @nr differs from sizeof(@__struct). The expression always
 * evaluates to true, i.e. a mismatch is reported but not treated as fatal.
 *
 * @sz is parenthesized in the comparison so that an expression argument
 * cannot change its meaning through operator precedence.
 */
#define XCHECK_SZ(sz, nr, __struct) ({					\
	if (WARN_ONCE((sz) != sizeof(__struct),				\
	    "[%s]: struct is %zu bytes, cpu state %d bytes\n",		\
	    xfeature_names[nr], sizeof(__struct), sz)) {		\
		__xstate_dump_leaves();					\
	}								\
	true;								\
})
460
461
462/**
463 * check_xtile_data_against_struct - Check tile data state size.
464 *
465 * Calculate the state size by multiplying the single tile size which is
466 * recorded in a C struct, and the number of tiles that the CPU informs.
467 * Compare the provided size with the calculation.
468 *
469 * @size: The tile data state size
470 *
471 * Returns: 0 on success, -EINVAL on mismatch.
472 */
473static int __init check_xtile_data_against_struct(int size)
474{
475 u32 max_palid, palid, state_size;
476 u32 eax, ebx, ecx, edx;
477 u16 max_tile;
478
479 /*
480 * Check the maximum palette id:
481 * eax: the highest numbered palette subleaf.
482 */
483 cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx);
484
485 /*
486 * Cross-check each tile size and find the maximum number of
487 * supported tiles.
488 */
489 for (palid = 1, max_tile = 0; palid <= max_palid; palid++) {
490 u16 tile_size, max;
491
492 /*
493 * Check the tile size info:
494 * eax[31:16]: bytes per title
495 * ebx[31:16]: the max names (or max number of tiles)
496 */
497 cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx);
498 tile_size = eax >> 16;
499 max = ebx >> 16;
500
501 if (tile_size != sizeof(struct xtile_data)) {
502 pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n",
503 __stringify(XFEATURE_XTILE_DATA),
504 sizeof(struct xtile_data), tile_size);
505 __xstate_dump_leaves();
506 return -EINVAL;
507 }
508
509 if (max > max_tile)
510 max_tile = max;
511 }
512
513 state_size = sizeof(struct xtile_data) * max_tile;
514 if (size != state_size) {
515 pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n",
516 __stringify(XFEATURE_XTILE_DATA), state_size, size);
517 __xstate_dump_leaves();
518 return -EINVAL;
519 }
520 return 0;
521}
522
523/*
524 * We have a C struct for each 'xstate'. We need to ensure
525 * that our software representation matches what the CPU
526 * tells us about the state's size.
527 */
528static bool __init check_xstate_against_struct(int nr)
529{
530 /*
531 * Ask the CPU for the size of the state.
532 */
533 int sz = xfeature_size(nr);
534
535 /*
536 * Match each CPU state with the corresponding software
537 * structure.
538 */
539 switch (nr) {
540 case XFEATURE_YMM: return XCHECK_SZ(sz, nr, struct ymmh_struct);
541 case XFEATURE_BNDREGS: return XCHECK_SZ(sz, nr, struct mpx_bndreg_state);
542 case XFEATURE_BNDCSR: return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state);
543 case XFEATURE_OPMASK: return XCHECK_SZ(sz, nr, struct avx_512_opmask_state);
544 case XFEATURE_ZMM_Hi256: return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state);
545 case XFEATURE_Hi16_ZMM: return XCHECK_SZ(sz, nr, struct avx_512_hi16_state);
546 case XFEATURE_PKRU: return XCHECK_SZ(sz, nr, struct pkru_state);
547 case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state);
548 case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg);
549 case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state);
550 case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true;
551 default:
552 XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
553 return false;
554 }
555
556 return true;
557}
558
559static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
560{
561 unsigned int topmost = fls64(xfeatures) - 1;
562 unsigned int offset = xstate_offsets[topmost];
563
564 if (topmost <= XFEATURE_SSE)
565 return sizeof(struct xregs_state);
566
567 if (compacted)
568 offset = xfeature_get_offset(xfeatures, topmost);
569 return offset + xstate_sizes[topmost];
570}
571
572/*
573 * This essentially double-checks what the cpu told us about
574 * how large the XSAVE buffer needs to be. We are recalculating
575 * it to be safe.
576 *
577 * Independent XSAVE features allocate their own buffers and are not
578 * covered by these checks. Only the size of the buffer for task->fpu
579 * is checked here.
580 */
581static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
582{
583 bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
584 bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES);
585 unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
586 int i;
587
588 for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
589 if (!check_xstate_against_struct(i))
590 return false;
591 /*
592 * Supervisor state components can be managed only by
593 * XSAVES.
594 */
595 if (!xsaves && xfeature_is_supervisor(i)) {
596 XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i);
597 return false;
598 }
599 }
600 size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
601 XSTATE_WARN_ON(size != kernel_size,
602 "size %u != kernel_size %u\n", size, kernel_size);
603 return size == kernel_size;
604}
605
606/*
607 * Get total size of enabled xstates in XCR0 | IA32_XSS.
608 *
609 * Note the SDM's wording here. "sub-function 0" only enumerates
610 * the size of the *user* states. If we use it to size a buffer
611 * that we use 'XSAVES' on, we could potentially overflow the
612 * buffer because 'XSAVES' saves system states too.
613 *
614 * This also takes compaction into account. So this works for
615 * XSAVEC as well.
616 */
617static unsigned int __init get_compacted_size(void)
618{
619 unsigned int eax, ebx, ecx, edx;
620 /*
621 * - CPUID function 0DH, sub-function 1:
622 * EBX enumerates the size (in bytes) required by
623 * the XSAVES instruction for an XSAVE area
624 * containing all the state components
625 * corresponding to bits currently set in
626 * XCR0 | IA32_XSS.
627 *
628 * When XSAVES is not available but XSAVEC is (virt), then there
629 * are no supervisor states, but XSAVEC still uses compacted
630 * format.
631 */
632 cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
633 return ebx;
634}
635
636/*
637 * Get the total size of the enabled xstates without the independent supervisor
638 * features.
639 */
640static unsigned int __init get_xsave_compacted_size(void)
641{
642 u64 mask = xfeatures_mask_independent();
643 unsigned int size;
644
645 if (!mask)
646 return get_compacted_size();
647
648 /* Disable independent features. */
649 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
650
651 /*
652 * Ask the hardware what size is required of the buffer.
653 * This is the size required for the task->fpu buffer.
654 */
655 size = get_compacted_size();
656
657 /* Re-enable independent features so XSAVES will work on them again. */
658 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);
659
660 return size;
661}
662
663static unsigned int __init get_xsave_size_user(void)
664{
665 unsigned int eax, ebx, ecx, edx;
666 /*
667 * - CPUID function 0DH, sub-function 0:
668 * EBX enumerates the size (in bytes) required by
669 * the XSAVE instruction for an XSAVE area
670 * containing all the *user* state components
671 * corresponding to bits currently set in XCR0.
672 */
673 cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
674 return ebx;
675}
676
677static int __init init_xstate_size(void)
678{
679 /* Recompute the context size for enabled features: */
680 unsigned int user_size, kernel_size, kernel_default_size;
681 bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
682
683 /* Uncompacted user space size */
684 user_size = get_xsave_size_user();
685
686 /*
687 * XSAVES kernel size includes supervisor states and uses compacted
688 * format. XSAVEC uses compacted format, but does not save
689 * supervisor states.
690 *
691 * XSAVE[OPT] do not support supervisor states so kernel and user
692 * size is identical.
693 */
694 if (compacted)
695 kernel_size = get_xsave_compacted_size();
696 else
697 kernel_size = user_size;
698
699 kernel_default_size =
700 xstate_calculate_size(fpu_kernel_cfg.default_features, compacted);
701
702 if (!paranoid_xstate_size_valid(kernel_size))
703 return -EINVAL;
704
705 fpu_kernel_cfg.max_size = kernel_size;
706 fpu_user_cfg.max_size = user_size;
707
708 fpu_kernel_cfg.default_size = kernel_default_size;
709 fpu_user_cfg.default_size =
710 xstate_calculate_size(fpu_user_cfg.default_features, false);
711
712 return 0;
713}
714
715/*
716 * We enabled the XSAVE hardware, but something went wrong and
717 * we can not use it. Disable it.
718 */
719static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
720{
721 fpu_kernel_cfg.max_features = 0;
722 cr4_clear_bits(X86_CR4_OSXSAVE);
723 setup_clear_cpu_cap(X86_FEATURE_XSAVE);
724
725 /* Restore the legacy size.*/
726 fpu_kernel_cfg.max_size = legacy_size;
727 fpu_kernel_cfg.default_size = legacy_size;
728 fpu_user_cfg.max_size = legacy_size;
729 fpu_user_cfg.default_size = legacy_size;
730
731 /*
732 * Prevent enabling the static branch which enables writes to the
733 * XFD MSR.
734 */
735 init_fpstate.xfd = 0;
736
737 fpstate_reset(¤t->thread.fpu);
738}
739
740/*
741 * Enable and initialize the xsave feature.
742 * Called once per system bootup.
743 */
744void __init fpu__init_system_xstate(unsigned int legacy_size)
745{
746 unsigned int eax, ebx, ecx, edx;
747 u64 xfeatures;
748 int err;
749 int i;
750
751 if (!boot_cpu_has(X86_FEATURE_FPU)) {
752 pr_info("x86/fpu: No FPU detected\n");
753 return;
754 }
755
756 if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
757 pr_info("x86/fpu: x87 FPU will use %s\n",
758 boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
759 return;
760 }
761
762 if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
763 WARN_ON_FPU(1);
764 return;
765 }
766
767 /*
768 * Find user xstates supported by the processor.
769 */
770 cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
771 fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);
772
773 /*
774 * Find supervisor xstates supported by the processor.
775 */
776 cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
777 fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);
778
779 if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
780 /*
781 * This indicates that something really unexpected happened
782 * with the enumeration. Disable XSAVE and try to continue
783 * booting without it. This is too early to BUG().
784 */
785 pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
786 fpu_kernel_cfg.max_features);
787 goto out_disable;
788 }
789
790 /*
791 * Clear XSAVE features that are disabled in the normal CPUID.
792 */
793 for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
794 unsigned short cid = xsave_cpuid_features[i];
795
796 /* Careful: X86_FEATURE_FPU is 0! */
797 if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid))
798 fpu_kernel_cfg.max_features &= ~BIT_ULL(i);
799 }
800
801 if (!cpu_feature_enabled(X86_FEATURE_XFD))
802 fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;
803
804 if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
805 fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
806 else
807 fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
808 XFEATURE_MASK_SUPERVISOR_SUPPORTED;
809
810 fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
811 fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
812
813 /* Clean out dynamic features from default */
814 fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features;
815 fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
816
817 fpu_user_cfg.default_features = fpu_user_cfg.max_features;
818 fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
819
820 /* Store it for paranoia check at the end */
821 xfeatures = fpu_kernel_cfg.max_features;
822
823 /*
824 * Initialize the default XFD state in initfp_state and enable the
825 * dynamic sizing mechanism if dynamic states are available. The
826 * static key cannot be enabled here because this runs before
827 * jump_label_init(). This is delayed to an initcall.
828 */
829 init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;
830
831 /* Set up compaction feature bit */
832 if (cpu_feature_enabled(X86_FEATURE_XSAVEC) ||
833 cpu_feature_enabled(X86_FEATURE_XSAVES))
834 setup_force_cpu_cap(X86_FEATURE_XCOMPACTED);
835
836 /* Enable xstate instructions to be able to continue with initialization: */
837 fpu__init_cpu_xstate();
838
839 /* Cache size, offset and flags for initialization */
840 setup_xstate_cache();
841
842 err = init_xstate_size();
843 if (err)
844 goto out_disable;
845
846 /* Reset the state for the current task */
847 fpstate_reset(¤t->thread.fpu);
848
849 /*
850 * Update info used for ptrace frames; use standard-format size and no
851 * supervisor xstates:
852 */
853 update_regset_xstate_info(fpu_user_cfg.max_size,
854 fpu_user_cfg.max_features);
855
856 /*
857 * init_fpstate excludes dynamic states as they are large but init
858 * state is zero.
859 */
860 init_fpstate.size = fpu_kernel_cfg.default_size;
861 init_fpstate.xfeatures = fpu_kernel_cfg.default_features;
862
863 if (init_fpstate.size > sizeof(init_fpstate.regs)) {
864 pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n",
865 sizeof(init_fpstate.regs), init_fpstate.size);
866 goto out_disable;
867 }
868
869 setup_init_fpu_buf();
870
871 /*
872 * Paranoia check whether something in the setup modified the
873 * xfeatures mask.
874 */
875 if (xfeatures != fpu_kernel_cfg.max_features) {
876 pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n",
877 xfeatures, fpu_kernel_cfg.max_features);
878 goto out_disable;
879 }
880
881 /*
882 * CPU capabilities initialization runs before FPU init. So
883 * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely
884 * functional, set the feature bit so depending code works.
885 */
886 setup_force_cpu_cap(X86_FEATURE_OSXSAVE);
887
888 print_xstate_offset_size();
889 pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
890 fpu_kernel_cfg.max_features,
891 fpu_kernel_cfg.max_size,
892 boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard");
893 return;
894
895out_disable:
896 /* something went wrong, try to boot without any XSAVE support */
897 fpu__init_disable_system_xstate(legacy_size);
898}
899
900/*
901 * Restore minimal FPU state after suspend:
902 */
903void fpu__resume_cpu(void)
904{
905 /*
906 * Restore XCR0 on xsave capable CPUs:
907 */
908 if (cpu_feature_enabled(X86_FEATURE_XSAVE))
909 xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
910
911 /*
912 * Restore IA32_XSS. The same CPUID bit enumerates support
913 * of XSAVES and MSR_IA32_XSS.
914 */
915 if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
916 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
917 xfeatures_mask_independent());
918 }
919
920 if (fpu_state_size_dynamic())
921 wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd);
922}
923
924/*
925 * Given an xstate feature nr, calculate where in the xsave
926 * buffer the state is. Callers should ensure that the buffer
927 * is valid.
928 */
929static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
930{
931 u64 xcomp_bv = xsave->header.xcomp_bv;
932
933 if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
934 return NULL;
935
936 if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) {
937 if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr))))
938 return NULL;
939 }
940
941 return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr);
942}
943
944/*
945 * Given the xsave area and a state inside, this function returns the
946 * address of the state.
947 *
948 * This is the API that is called to get xstate address in either
949 * standard format or compacted format of xsave area.
950 *
951 * Note that if there is no data for the field in the xsave buffer
952 * this will return NULL.
953 *
954 * Inputs:
955 * xstate: the thread's storage area for all FPU data
956 * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
957 * XFEATURE_SSE, etc...)
958 * Output:
959 * address of the state in the xsave area, or NULL if the
960 * field is not present in the xsave buffer.
961 */
962void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
963{
964 /*
965 * Do we even *have* xsave state?
966 */
967 if (!boot_cpu_has(X86_FEATURE_XSAVE))
968 return NULL;
969
970 /*
971 * We should not ever be requesting features that we
972 * have not enabled.
973 */
974 if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
975 return NULL;
976
977 /*
978 * This assumes the last 'xsave*' instruction to
979 * have requested that 'xfeature_nr' be saved.
980 * If it did not, we might be seeing and old value
981 * of the field in the buffer.
982 *
983 * This can happen because the last 'xsave' did not
984 * request that this feature be saved (unlikely)
985 * or because the "init optimization" caused it
986 * to not be saved.
987 */
988 if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr)))
989 return NULL;
990
991 return __raw_xsave_addr(xsave, xfeature_nr);
992}
993
#ifdef CONFIG_ARCH_HAS_PKEYS

/*
 * Modify the PKRU register so that the access rights for @pkey
 * match @init_val.
 *
 * @tsk:	task pointer (not referenced by this implementation)
 * @pkey:	protection key to update, 0 <= @pkey < arch_max_pkey()
 * @init_val:	combination of PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE
 *
 * Returns 0 on success, -EINVAL if OSPKE is not enabled or @pkey is out
 * of range.
 */
int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
			      unsigned long init_val)
{
	u32 old_pkru, new_pkru_bits = 0;
	int pkey_shift;

	/*
	 * This check implies XSAVE support. OSPKE only gets
	 * set if we enable XSAVE and we enable PKU in XCR0.
	 */
	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
		return -EINVAL;

	/*
	 * This code should only be called with valid 'pkey'
	 * values originating from in-kernel users. Complain
	 * if a bad value is observed. Negative values are rejected
	 * too as they would result in a negative shift count below.
	 */
	if (WARN_ON_ONCE(pkey < 0 || pkey >= arch_max_pkey()))
		return -EINVAL;

	/* Set the bits we need in PKRU: */
	if (init_val & PKEY_DISABLE_ACCESS)
		new_pkru_bits |= PKRU_AD_BIT;
	if (init_val & PKEY_DISABLE_WRITE)
		new_pkru_bits |= PKRU_WD_BIT;

	/* Shift the bits in to the correct place in PKRU for pkey: */
	pkey_shift = pkey * PKRU_BITS_PER_PKEY;
	new_pkru_bits <<= pkey_shift;

	/* Get old PKRU and mask off any old bits in place: */
	old_pkru = read_pkru();
	old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);

	/* Write old part along with new part: */
	write_pkru(old_pkru | new_pkru_bits);

	return 0;
}
#endif /* CONFIG_ARCH_HAS_PKEYS */
1041
1042static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
1043 void *init_xstate, unsigned int size)
1044{
1045 membuf_write(to, from_xstate ? xstate : init_xstate, size);
1046}
1047
1048/**
1049 * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
1050 * @to: membuf descriptor
1051 * @fpstate: The fpstate buffer from which to copy
1052 * @xfeatures: The mask of xfeatures to save (XSAVE mode only)
1053 * @pkru_val: The PKRU value to store in the PKRU component
1054 * @copy_mode: The requested copy mode
1055 *
1056 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
1057 * format, i.e. from the kernel internal hardware dependent storage format
1058 * to the requested @mode. UABI XSTATE is always uncompacted!
1059 *
1060 * It supports partial copy but @to.pos always starts from zero.
1061 */
1062void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
1063 u64 xfeatures, u32 pkru_val,
1064 enum xstate_copy_mode copy_mode)
1065{
1066 const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
1067 struct xregs_state *xinit = &init_fpstate.regs.xsave;
1068 struct xregs_state *xsave = &fpstate->regs.xsave;
1069 struct xstate_header header;
1070 unsigned int zerofrom;
1071 u64 mask;
1072 int i;
1073
1074 memset(&header, 0, sizeof(header));
1075 header.xfeatures = xsave->header.xfeatures;
1076
1077 /* Mask out the feature bits depending on copy mode */
1078 switch (copy_mode) {
1079 case XSTATE_COPY_FP:
1080 header.xfeatures &= XFEATURE_MASK_FP;
1081 break;
1082
1083 case XSTATE_COPY_FX:
1084 header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
1085 break;
1086
1087 case XSTATE_COPY_XSAVE:
1088 header.xfeatures &= fpstate->user_xfeatures & xfeatures;
1089 break;
1090 }
1091
1092 /* Copy FP state up to MXCSR */
1093 copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387,
1094 &xinit->i387, off_mxcsr);
1095
1096 /* Copy MXCSR when SSE or YMM are set in the feature mask */
1097 copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM),
1098 &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr,
1099 MXCSR_AND_FLAGS_SIZE);
1100
1101 /* Copy the remaining FP state */
1102 copy_feature(header.xfeatures & XFEATURE_MASK_FP,
1103 &to, &xsave->i387.st_space, &xinit->i387.st_space,
1104 sizeof(xsave->i387.st_space));
1105
1106 /* Copy the SSE state - shared with YMM, but independently managed */
1107 copy_feature(header.xfeatures & XFEATURE_MASK_SSE,
1108 &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space,
1109 sizeof(xsave->i387.xmm_space));
1110
1111 if (copy_mode != XSTATE_COPY_XSAVE)
1112 goto out;
1113
1114 /* Zero the padding area */
1115 membuf_zero(&to, sizeof(xsave->i387.padding));
1116
1117 /* Copy xsave->i387.sw_reserved */
1118 membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved));
1119
1120 /* Copy the user space relevant state of @xsave->header */
1121 membuf_write(&to, &header, sizeof(header));
1122
1123 zerofrom = offsetof(struct xregs_state, extended_state_area);
1124
1125 /*
1126 * This 'mask' indicates which states to copy from fpstate.
1127 * Those extended states that are not present in fpstate are
1128 * either disabled or initialized:
1129 *
1130 * In non-compacted format, disabled features still occupy
1131 * state space but there is no state to copy from in the
1132 * compacted init_fpstate. The gap tracking will zero these
1133 * states.
1134 *
1135 * The extended features have an all zeroes init state. Thus,
1136 * remove them from 'mask' to zero those features in the user
1137 * buffer instead of retrieving them from init_fpstate.
1138 */
1139 mask = header.xfeatures;
1140
1141 for_each_extended_xfeature(i, mask) {
1142 /*
1143 * If there was a feature or alignment gap, zero the space
1144 * in the destination buffer.
1145 */
1146 if (zerofrom < xstate_offsets[i])
1147 membuf_zero(&to, xstate_offsets[i] - zerofrom);
1148
1149 if (i == XFEATURE_PKRU) {
1150 struct pkru_state pkru = {0};
1151 /*
1152 * PKRU is not necessarily up to date in the
1153 * XSAVE buffer. Use the provided value.
1154 */
1155 pkru.pkru = pkru_val;
1156 membuf_write(&to, &pkru, sizeof(pkru));
1157 } else {
1158 membuf_write(&to,
1159 __raw_xsave_addr(xsave, i),
1160 xstate_sizes[i]);
1161 }
1162 /*
1163 * Keep track of the last copied state in the non-compacted
1164 * target buffer for gap zeroing.
1165 */
1166 zerofrom = xstate_offsets[i] + xstate_sizes[i];
1167 }
1168
1169out:
1170 if (to.left)
1171 membuf_zero(&to, to.left);
1172}
1173
1174/**
1175 * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
1176 * @to: membuf descriptor
1177 * @tsk: The task from which to copy the saved xstate
1178 * @copy_mode: The requested copy mode
1179 *
1180 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
1181 * format, i.e. from the kernel internal hardware dependent storage format
1182 * to the requested @mode. UABI XSTATE is always uncompacted!
1183 *
1184 * It supports partial copy but @to.pos always starts from zero.
1185 */
1186void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
1187 enum xstate_copy_mode copy_mode)
1188{
1189 __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate,
1190 tsk->thread.fpu.fpstate->user_xfeatures,
1191 tsk->thread.pkru, copy_mode);
1192}
1193
1194static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
1195 const void *kbuf, const void __user *ubuf)
1196{
1197 if (kbuf) {
1198 memcpy(dst, kbuf + offset, size);
1199 } else {
1200 if (copy_from_user(dst, ubuf + offset, size))
1201 return -EFAULT;
1202 }
1203 return 0;
1204}
1205
1206
1207/**
1208 * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate
1209 * @fpstate: The fpstate buffer to copy to
1210 * @kbuf: The UABI format buffer, if it comes from the kernel
1211 * @ubuf: The UABI format buffer, if it comes from userspace
1212 * @pkru: The location to write the PKRU value to
1213 *
1214 * Converts from the UABI format into the kernel internal hardware
1215 * dependent format.
1216 *
1217 * This function ultimately has three different callers with distinct PKRU
1218 * behavior.
1219 * 1. When called from sigreturn the PKRU register will be restored from
1220 * @fpstate via an XRSTOR. Correctly copying the UABI format buffer to
1221 * @fpstate is sufficient to cover this case, but the caller will also
1222 * pass a pointer to the thread_struct's pkru field in @pkru and updating
1223 * it is harmless.
1224 * 2. When called from ptrace the PKRU register will be restored from the
1225 * thread_struct's pkru field. A pointer to that is passed in @pkru.
1226 * The kernel will restore it manually, so the XRSTOR behavior that resets
1227 * the PKRU register to the hardware init value (0) if the corresponding
1228 * xfeatures bit is not set is emulated here.
1229 * 3. When called from KVM the PKRU register will be restored from the vcpu's
1230 * pkru field. A pointer to that is passed in @pkru. KVM hasn't used
1231 * XRSTOR and hasn't had the PKRU resetting behavior described above. To
1232 * preserve that KVM behavior, it passes NULL for @pkru if the xfeatures
1233 * bit is not set.
1234 */
1235static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
1236 const void __user *ubuf, u32 *pkru)
1237{
1238 struct xregs_state *xsave = &fpstate->regs.xsave;
1239 unsigned int offset, size;
1240 struct xstate_header hdr;
1241 u64 mask;
1242 int i;
1243
1244 offset = offsetof(struct xregs_state, header);
1245 if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
1246 return -EFAULT;
1247
1248 if (validate_user_xstate_header(&hdr, fpstate))
1249 return -EINVAL;
1250
1251 /* Validate MXCSR when any of the related features is in use */
1252 mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
1253 if (hdr.xfeatures & mask) {
1254 u32 mxcsr[2];
1255
1256 offset = offsetof(struct fxregs_state, mxcsr);
1257 if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf))
1258 return -EFAULT;
1259
1260 /* Reserved bits in MXCSR must be zero. */
1261 if (mxcsr[0] & ~mxcsr_feature_mask)
1262 return -EINVAL;
1263
1264 /* SSE and YMM require MXCSR even when FP is not in use. */
1265 if (!(hdr.xfeatures & XFEATURE_MASK_FP)) {
1266 xsave->i387.mxcsr = mxcsr[0];
1267 xsave->i387.mxcsr_mask = mxcsr[1];
1268 }
1269 }
1270
1271 for (i = 0; i < XFEATURE_MAX; i++) {
1272 mask = BIT_ULL(i);
1273
1274 if (hdr.xfeatures & mask) {
1275 void *dst = __raw_xsave_addr(xsave, i);
1276
1277 offset = xstate_offsets[i];
1278 size = xstate_sizes[i];
1279
1280 if (copy_from_buffer(dst, offset, size, kbuf, ubuf))
1281 return -EFAULT;
1282 }
1283 }
1284
1285 if (hdr.xfeatures & XFEATURE_MASK_PKRU) {
1286 struct pkru_state *xpkru;
1287
1288 xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU);
1289 *pkru = xpkru->pkru;
1290 } else {
1291 /*
1292 * KVM may pass NULL here to indicate that it does not need
1293 * PKRU updated.
1294 */
1295 if (pkru)
1296 *pkru = 0;
1297 }
1298
1299 /*
1300 * The state that came in from userspace was user-state only.
1301 * Mask all the user states out of 'xfeatures':
1302 */
1303 xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;
1304
1305 /*
1306 * Add back in the features that came in from userspace:
1307 */
1308 xsave->header.xfeatures |= hdr.xfeatures;
1309
1310 return 0;
1311}
1312
1313/*
1314 * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
1315 * format and copy to the target thread. Used by ptrace and KVM.
1316 */
1317int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru)
1318{
1319 return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru);
1320}
1321
1322/*
1323 * Convert from a sigreturn standard-format user-space buffer to kernel
1324 * XSAVE[S] format and copy to the target thread. This is called from the
1325 * sigreturn() and rt_sigreturn() system calls.
1326 */
1327int copy_sigframe_from_user_to_xstate(struct task_struct *tsk,
1328 const void __user *ubuf)
1329{
1330 return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru);
1331}
1332
1333static bool validate_independent_components(u64 mask)
1334{
1335 u64 xchk;
1336
1337 if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
1338 return false;
1339
1340 xchk = ~xfeatures_mask_independent();
1341
1342 if (WARN_ON_ONCE(!mask || mask & xchk))
1343 return false;
1344
1345 return true;
1346}
1347
1348/**
1349 * xsaves - Save selected components to a kernel xstate buffer
1350 * @xstate: Pointer to the buffer
1351 * @mask: Feature mask to select the components to save
1352 *
1353 * The @xstate buffer must be 64 byte aligned and correctly initialized as
1354 * XSAVES does not write the full xstate header. Before first use the
1355 * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer
1356 * can #GP.
1357 *
1358 * The feature mask must be a subset of the independent features.
1359 */
1360void xsaves(struct xregs_state *xstate, u64 mask)
1361{
1362 int err;
1363
1364 if (!validate_independent_components(mask))
1365 return;
1366
1367 XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err);
1368 WARN_ON_ONCE(err);
1369}
1370
1371/**
1372 * xrstors - Restore selected components from a kernel xstate buffer
1373 * @xstate: Pointer to the buffer
1374 * @mask: Feature mask to select the components to restore
1375 *
1376 * The @xstate buffer must be 64 byte aligned and correctly initialized
1377 * otherwise XRSTORS from that buffer can #GP.
1378 *
1379 * Proper usage is to restore the state which was saved with
1380 * xsaves() into @xstate.
1381 *
1382 * The feature mask must be a subset of the independent features.
1383 */
1384void xrstors(struct xregs_state *xstate, u64 mask)
1385{
1386 int err;
1387
1388 if (!validate_independent_components(mask))
1389 return;
1390
1391 XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err);
1392 WARN_ON_ONCE(err);
1393}
1394
1395#if IS_ENABLED(CONFIG_KVM)
1396void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature)
1397{
1398 void *addr = get_xsave_addr(&fps->regs.xsave, xfeature);
1399
1400 if (addr)
1401 memset(addr, 0, xstate_sizes[xfeature]);
1402}
1403EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
1404#endif
1405
1406#ifdef CONFIG_X86_64
1407
1408#ifdef CONFIG_X86_DEBUG_FPU
1409/*
1410 * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
1411 * can safely operate on the @fpstate buffer.
1412 */
1413static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
1414{
1415 u64 xfd = __this_cpu_read(xfd_state);
1416
1417 if (fpstate->xfd == xfd)
1418 return true;
1419
1420 /*
1421 * The XFD MSR does not match fpstate->xfd. That's invalid when
1422 * the passed in fpstate is current's fpstate.
1423 */
1424 if (fpstate->xfd == current->thread.fpu.fpstate->xfd)
1425 return false;
1426
1427 /*
1428 * XRSTOR(S) from init_fpstate are always correct as it will just
1429 * bring all components into init state and not read from the
1430 * buffer. XSAVE(S) raises #PF after init.
1431 */
1432 if (fpstate == &init_fpstate)
1433 return rstor;
1434
1435 /*
1436 * XSAVE(S): clone(), fpu_swap_kvm_fpu()
1437 * XRSTORS(S): fpu_swap_kvm_fpu()
1438 */
1439
1440 /*
1441 * No XSAVE/XRSTOR instructions (except XSAVE itself) touch
1442 * the buffer area for XFD-disabled state components.
1443 */
1444 mask &= ~xfd;
1445
1446 /*
1447 * Remove features which are valid in fpstate. They
1448 * have space allocated in fpstate.
1449 */
1450 mask &= ~fpstate->xfeatures;
1451
1452 /*
1453 * Any remaining state components in 'mask' might be written
1454 * by XSAVE/XRSTOR. Fail validation it found.
1455 */
1456 return !mask;
1457}
1458
1459void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
1460{
1461 WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
1462}
1463#endif /* CONFIG_X86_DEBUG_FPU */
1464
1465static int __init xfd_update_static_branch(void)
1466{
1467 /*
1468 * If init_fpstate.xfd has bits set then dynamic features are
1469 * available and the dynamic sizing must be enabled.
1470 */
1471 if (init_fpstate.xfd)
1472 static_branch_enable(&__fpu_state_size_dynamic);
1473 return 0;
1474}
1475arch_initcall(xfd_update_static_branch)
1476
1477void fpstate_free(struct fpu *fpu)
1478{
1479 if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate)
1480 vfree(fpu->fpstate);
1481}
1482
1483/**
1484 * fpstate_realloc - Reallocate struct fpstate for the requested new features
1485 *
1486 * @xfeatures: A bitmap of xstate features which extend the enabled features
1487 * of that task
1488 * @ksize: The required size for the kernel buffer
1489 * @usize: The required size for user space buffers
1490 * @guest_fpu: Pointer to a guest FPU container. NULL for host allocations
1491 *
1492 * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
1493 * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
1494 * with large states are likely to live longer.
1495 *
1496 * Returns: 0 on success, -ENOMEM on allocation error.
1497 */
1498static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
1499 unsigned int usize, struct fpu_guest *guest_fpu)
1500{
1501 struct fpu *fpu = ¤t->thread.fpu;
1502 struct fpstate *curfps, *newfps = NULL;
1503 unsigned int fpsize;
1504 bool in_use;
1505
1506 fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);
1507
1508 newfps = vzalloc(fpsize);
1509 if (!newfps)
1510 return -ENOMEM;
1511 newfps->size = ksize;
1512 newfps->user_size = usize;
1513 newfps->is_valloc = true;
1514
1515 /*
1516 * When a guest FPU is supplied, use @guest_fpu->fpstate
1517 * as reference independent whether it is in use or not.
1518 */
1519 curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate;
1520
1521 /* Determine whether @curfps is the active fpstate */
1522 in_use = fpu->fpstate == curfps;
1523
1524 if (guest_fpu) {
1525 newfps->is_guest = true;
1526 newfps->is_confidential = curfps->is_confidential;
1527 newfps->in_use = curfps->in_use;
1528 guest_fpu->xfeatures |= xfeatures;
1529 guest_fpu->uabi_size = usize;
1530 }
1531
1532 fpregs_lock();
1533 /*
1534 * If @curfps is in use, ensure that the current state is in the
1535 * registers before swapping fpstate as that might invalidate it
1536 * due to layout changes.
1537 */
1538 if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD))
1539 fpregs_restore_userregs();
1540
1541 newfps->xfeatures = curfps->xfeatures | xfeatures;
1542 newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
1543 newfps->xfd = curfps->xfd & ~xfeatures;
1544
1545 /* Do the final updates within the locked region */
1546 xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);
1547
1548 if (guest_fpu) {
1549 guest_fpu->fpstate = newfps;
1550 /* If curfps is active, update the FPU fpstate pointer */
1551 if (in_use)
1552 fpu->fpstate = newfps;
1553 } else {
1554 fpu->fpstate = newfps;
1555 }
1556
1557 if (in_use)
1558 xfd_update_state(fpu->fpstate);
1559 fpregs_unlock();
1560
1561 /* Only free valloc'ed state */
1562 if (curfps && curfps->is_valloc)
1563 vfree(curfps);
1564
1565 return 0;
1566}
1567
1568static int validate_sigaltstack(unsigned int usize)
1569{
1570 struct task_struct *thread, *leader = current->group_leader;
1571 unsigned long framesize = get_sigframe_size();
1572
1573 lockdep_assert_held(¤t->sighand->siglock);
1574
1575 /* get_sigframe_size() is based on fpu_user_cfg.max_size */
1576 framesize -= fpu_user_cfg.max_size;
1577 framesize += usize;
1578 for_each_thread(leader, thread) {
1579 if (thread->sas_ss_size && thread->sas_ss_size < framesize)
1580 return -ENOSPC;
1581 }
1582 return 0;
1583}
1584
1585static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
1586{
1587 /*
1588 * This deliberately does not exclude !XSAVES as we still might
1589 * decide to optionally context switch XCR0 or talk the silicon
1590 * vendors into extending XFD for the pre AMX states, especially
1591 * AVX512.
1592 */
1593 bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
1594 struct fpu *fpu = ¤t->group_leader->thread.fpu;
1595 struct fpu_state_perm *perm;
1596 unsigned int ksize, usize;
1597 u64 mask;
1598 int ret = 0;
1599
1600 /* Check whether fully enabled */
1601 if ((permitted & requested) == requested)
1602 return 0;
1603
1604 /* Calculate the resulting kernel state size */
1605 mask = permitted | requested;
1606 /* Take supervisor states into account on the host */
1607 if (!guest)
1608 mask |= xfeatures_mask_supervisor();
1609 ksize = xstate_calculate_size(mask, compacted);
1610
1611 /* Calculate the resulting user state size */
1612 mask &= XFEATURE_MASK_USER_SUPPORTED;
1613 usize = xstate_calculate_size(mask, false);
1614
1615 if (!guest) {
1616 ret = validate_sigaltstack(usize);
1617 if (ret)
1618 return ret;
1619 }
1620
1621 perm = guest ? &fpu->guest_perm : &fpu->perm;
1622 /* Pairs with the READ_ONCE() in xstate_get_group_perm() */
1623 WRITE_ONCE(perm->__state_perm, mask);
1624 /* Protected by sighand lock */
1625 perm->__state_size = ksize;
1626 perm->__user_state_size = usize;
1627 return ret;
1628}
1629
/*
 * Permissions array to map facilities with more than one component.
 *
 * Indexed by the facility number which is handed to
 * xstate_request_perm(); the value is the feature mask to request for
 * that facility. Entries which are not listed here are zero and make
 * xstate_request_perm() fail with -EOPNOTSUPP.
 */
static const u64 xstate_prctl_req[XFEATURE_MAX] = {
	[XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
};
1636
1637static int xstate_request_perm(unsigned long idx, bool guest)
1638{
1639 u64 permitted, requested;
1640 int ret;
1641
1642 if (idx >= XFEATURE_MAX)
1643 return -EINVAL;
1644
1645 /*
1646 * Look up the facility mask which can require more than
1647 * one xstate component.
1648 */
1649 idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req));
1650 requested = xstate_prctl_req[idx];
1651 if (!requested)
1652 return -EOPNOTSUPP;
1653
1654 if ((fpu_user_cfg.max_features & requested) != requested)
1655 return -EOPNOTSUPP;
1656
1657 /* Lockless quick check */
1658 permitted = xstate_get_group_perm(guest);
1659 if ((permitted & requested) == requested)
1660 return 0;
1661
1662 /* Protect against concurrent modifications */
1663 spin_lock_irq(¤t->sighand->siglock);
1664 permitted = xstate_get_group_perm(guest);
1665
1666 /* First vCPU allocation locks the permissions. */
1667 if (guest && (permitted & FPU_GUEST_PERM_LOCKED))
1668 ret = -EBUSY;
1669 else
1670 ret = __xstate_request_perm(permitted, requested, guest);
1671 spin_unlock_irq(¤t->sighand->siglock);
1672 return ret;
1673}
1674
1675int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu)
1676{
1677 u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
1678 struct fpu_state_perm *perm;
1679 unsigned int ksize, usize;
1680 struct fpu *fpu;
1681
1682 if (!xfd_event) {
1683 if (!guest_fpu)
1684 pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
1685 return 0;
1686 }
1687
1688 /* Protect against concurrent modifications */
1689 spin_lock_irq(¤t->sighand->siglock);
1690
1691 /* If not permitted let it die */
1692 if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) {
1693 spin_unlock_irq(¤t->sighand->siglock);
1694 return -EPERM;
1695 }
1696
1697 fpu = ¤t->group_leader->thread.fpu;
1698 perm = guest_fpu ? &fpu->guest_perm : &fpu->perm;
1699 ksize = perm->__state_size;
1700 usize = perm->__user_state_size;
1701
1702 /*
1703 * The feature is permitted. State size is sufficient. Dropping
1704 * the lock is safe here even if more features are added from
1705 * another task, the retrieved buffer sizes are valid for the
1706 * currently requested feature(s).
1707 */
1708 spin_unlock_irq(¤t->sighand->siglock);
1709
1710 /*
1711 * Try to allocate a new fpstate. If that fails there is no way
1712 * out.
1713 */
1714 if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu))
1715 return -EFAULT;
1716 return 0;
1717}
1718
1719int xfd_enable_feature(u64 xfd_err)
1720{
1721 return __xfd_enable_feature(xfd_err, NULL);
1722}
1723
1724#else /* CONFIG_X86_64 */
/*
 * Stub for builds without CONFIG_X86_64: requesting xstate permissions
 * always fails with -EPERM.
 */
static inline int xstate_request_perm(unsigned long idx, bool guest)
{
	return -EPERM;
}
1729#endif /* !CONFIG_X86_64 */
1730
1731u64 xstate_get_guest_group_perm(void)
1732{
1733 return xstate_get_group_perm(true);
1734}
1735EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);
1736
1737/**
1738 * fpu_xstate_prctl - xstate permission operations
1739 * @option: A subfunction of arch_prctl()
1740 * @arg2: option argument
1741 * Return: 0 if successful; otherwise, an error code
1742 *
1743 * Option arguments:
1744 *
1745 * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info
1746 * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info
1747 * ARCH_REQ_XCOMP_PERM: Facility number requested
1748 *
1749 * For facilities which require more than one XSTATE component, the request
1750 * must be the highest state component number related to that facility,
1751 * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and
1752 * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
1753 */
1754long fpu_xstate_prctl(int option, unsigned long arg2)
1755{
1756 u64 __user *uptr = (u64 __user *)arg2;
1757 u64 permitted, supported;
1758 unsigned long idx = arg2;
1759 bool guest = false;
1760
1761 switch (option) {
1762 case ARCH_GET_XCOMP_SUPP:
1763 supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features;
1764 return put_user(supported, uptr);
1765
1766 case ARCH_GET_XCOMP_PERM:
1767 /*
1768 * Lockless snapshot as it can also change right after the
1769 * dropping the lock.
1770 */
1771 permitted = xstate_get_host_group_perm();
1772 permitted &= XFEATURE_MASK_USER_SUPPORTED;
1773 return put_user(permitted, uptr);
1774
1775 case ARCH_GET_XCOMP_GUEST_PERM:
1776 permitted = xstate_get_guest_group_perm();
1777 permitted &= XFEATURE_MASK_USER_SUPPORTED;
1778 return put_user(permitted, uptr);
1779
1780 case ARCH_REQ_XCOMP_GUEST_PERM:
1781 guest = true;
1782 fallthrough;
1783
1784 case ARCH_REQ_XCOMP_PERM:
1785 if (!IS_ENABLED(CONFIG_X86_64))
1786 return -EOPNOTSUPP;
1787
1788 return xstate_request_perm(idx, guest);
1789
1790 default:
1791 return -EINVAL;
1792 }
1793}
1794
1795#ifdef CONFIG_PROC_PID_ARCH_STATUS
1796/*
1797 * Report the amount of time elapsed in millisecond since last AVX512
1798 * use in the task.
1799 */
1800static void avx512_status(struct seq_file *m, struct task_struct *task)
1801{
1802 unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp);
1803 long delta;
1804
1805 if (!timestamp) {
1806 /*
1807 * Report -1 if no AVX512 usage
1808 */
1809 delta = -1;
1810 } else {
1811 delta = (long)(jiffies - timestamp);
1812 /*
1813 * Cap to LONG_MAX if time difference > LONG_MAX
1814 */
1815 if (delta < 0)
1816 delta = LONG_MAX;
1817 delta = jiffies_to_msecs(delta);
1818 }
1819
1820 seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta);
1821 seq_putc(m, '\n');
1822}
1823
1824/*
1825 * Report architecture specific information
1826 */
1827int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
1828 struct pid *pid, struct task_struct *task)
1829{
1830 /*
1831 * Report AVX512 state if the processor and build option supported.
1832 */
1833 if (cpu_feature_enabled(X86_FEATURE_AVX512F))
1834 avx512_status(m, task);
1835
1836 return 0;
1837}
1838#endif /* CONFIG_PROC_PID_ARCH_STATUS */