Loading...
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * xsave/xrstor support.
4 *
5 * Author: Suresh Siddha <suresh.b.siddha@intel.com>
6 */
7#include <linux/bitops.h>
8#include <linux/compat.h>
9#include <linux/cpu.h>
10#include <linux/mman.h>
11#include <linux/nospec.h>
12#include <linux/pkeys.h>
13#include <linux/seq_file.h>
14#include <linux/proc_fs.h>
15#include <linux/vmalloc.h>
16
17#include <asm/fpu/api.h>
18#include <asm/fpu/regset.h>
19#include <asm/fpu/signal.h>
20#include <asm/fpu/xcr.h>
21
22#include <asm/tlbflush.h>
23#include <asm/prctl.h>
24#include <asm/elf.h>
25
26#include "context.h"
27#include "internal.h"
28#include "legacy.h"
29#include "xstate.h"
30
/*
 * Iterate @bit over every bit that is set in @mask, starting the search at
 * bit number FIRST_EXTENDED_XFEATURE.
 *
 * The expansion is one single for statement, so the macro stays correct when
 * it is used as the sole body of an unbraced if/else (the start assignment
 * cannot escape the condition). @bit and @mask are evaluated several times,
 * and @mask must be an lvalue because its address is taken.
 */
#define for_each_extended_xfeature(bit, mask)					\
	for ((bit) = find_next_bit((unsigned long *)&(mask),			\
				   8 * sizeof(mask), FIRST_EXTENDED_XFEATURE);	\
	     (bit) < 8 * sizeof(mask);						\
	     (bit) = find_next_bit((unsigned long *)&(mask),			\
				   8 * sizeof(mask), (bit) + 1))
34
/*
 * Human readable names of the xstate components, indexed by xfeature
 * number. The last slot is the entry cpu_has_xfeatures() clamps to when a
 * feature number lies beyond the table.
 *
 * Processor Trace is spelled out here although it is completely unused as
 * an xfeature: Linux saves/restores PT state through other mechanisms.
 */
static const char *xfeature_names[] =
{
	[0]		= "x87 floating point registers",
	[1]		= "SSE registers",
	[2]		= "AVX registers",
	[3]		= "MPX bounds registers",
	[4]		= "MPX CSR",
	[5]		= "AVX-512 opmask",
	[6]		= "AVX-512 Hi256",
	[7]		= "AVX-512 ZMM_Hi256",
	[8]		= "Processor Trace (unused)",
	[9]		= "Protection Keys User registers",
	[10]		= "PASID state",
	[11 ... 16]	= "unknown xstate feature",
	[17]		= "AMX Tile config",
	[18]		= "AMX Tile data",
	[19]		= "unknown xstate feature",
};
63
/*
 * CPUID feature flag that each xfeature depends on, indexed by xfeature
 * number. fpu__init_system_xstate() walks this table and clears every
 * xfeature whose flag is not set on the boot CPU.
 *
 * Slots without an initializer are zero. Because X86_FEATURE_FPU is itself
 * 0, the walker special-cases XFEATURE_FP and treats a zero in any other
 * slot as "no CPUID flag known", which clears that xfeature as well.
 */
static unsigned short xsave_cpuid_features[] __initdata = {
	[XFEATURE_FP]				= X86_FEATURE_FPU,
	[XFEATURE_SSE]				= X86_FEATURE_XMM,
	[XFEATURE_YMM]				= X86_FEATURE_AVX,
	[XFEATURE_BNDREGS]			= X86_FEATURE_MPX,
	[XFEATURE_BNDCSR]			= X86_FEATURE_MPX,
	[XFEATURE_OPMASK]			= X86_FEATURE_AVX512F,
	[XFEATURE_ZMM_Hi256]			= X86_FEATURE_AVX512F,
	[XFEATURE_Hi16_ZMM]			= X86_FEATURE_AVX512F,
	[XFEATURE_PT_UNIMPLEMENTED_SO_FAR]	= X86_FEATURE_INTEL_PT,
	[XFEATURE_PKRU]				= X86_FEATURE_PKU,
	[XFEATURE_PASID]			= X86_FEATURE_ENQCMD,
	[XFEATURE_XTILE_CFG]			= X86_FEATURE_AMX_TILE,
	[XFEATURE_XTILE_DATA]			= X86_FEATURE_AMX_TILE,
};
79
80static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
81 { [ 0 ... XFEATURE_MAX - 1] = -1};
82static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
83 { [ 0 ... XFEATURE_MAX - 1] = -1};
84static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init;
85
86#define XSTATE_FLAG_SUPERVISOR BIT(0)
87#define XSTATE_FLAG_ALIGNED64 BIT(1)
88
89/*
90 * Return whether the system supports a given xfeature.
91 *
92 * Also return the name of the (most advanced) feature that the caller requested:
93 */
94int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
95{
96 u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features;
97
98 if (unlikely(feature_name)) {
99 long xfeature_idx, max_idx;
100 u64 xfeatures_print;
101 /*
102 * So we use FLS here to be able to print the most advanced
103 * feature that was requested but is missing. So if a driver
104 * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
105 * missing AVX feature - this is the most informative message
106 * to users:
107 */
108 if (xfeatures_missing)
109 xfeatures_print = xfeatures_missing;
110 else
111 xfeatures_print = xfeatures_needed;
112
113 xfeature_idx = fls64(xfeatures_print)-1;
114 max_idx = ARRAY_SIZE(xfeature_names)-1;
115 xfeature_idx = min(xfeature_idx, max_idx);
116
117 *feature_name = xfeature_names[xfeature_idx];
118 }
119
120 if (xfeatures_missing)
121 return 0;
122
123 return 1;
124}
125EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
126
127static bool xfeature_is_aligned64(int xfeature_nr)
128{
129 return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64;
130}
131
132static bool xfeature_is_supervisor(int xfeature_nr)
133{
134 return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR;
135}
136
137static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature)
138{
139 unsigned int offs, i;
140
141 /*
142 * Non-compacted format and legacy features use the cached fixed
143 * offsets.
144 */
145 if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) ||
146 xfeature <= XFEATURE_SSE)
147 return xstate_offsets[xfeature];
148
149 /*
150 * Compacted format offsets depend on the actual content of the
151 * compacted xsave area which is determined by the xcomp_bv header
152 * field.
153 */
154 offs = FXSAVE_SIZE + XSAVE_HDR_SIZE;
155 for_each_extended_xfeature(i, xcomp_bv) {
156 if (xfeature_is_aligned64(i))
157 offs = ALIGN(offs, 64);
158 if (i == xfeature)
159 break;
160 offs += xstate_sizes[i];
161 }
162 return offs;
163}
164
165/*
166 * Enable the extended processor state save/restore feature.
167 * Called once per CPU onlining.
168 */
169void fpu__init_cpu_xstate(void)
170{
171 if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features)
172 return;
173
174 cr4_set_bits(X86_CR4_OSXSAVE);
175
176 /*
177 * Must happen after CR4 setup and before xsetbv() to allow KVM
178 * lazy passthrough. Write independent of the dynamic state static
179 * key as that does not work on the boot CPU. This also ensures
180 * that any stale state is wiped out from XFD.
181 */
182 if (cpu_feature_enabled(X86_FEATURE_XFD))
183 wrmsrl(MSR_IA32_XFD, init_fpstate.xfd);
184
185 /*
186 * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
187 * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user
188 * states can be set here.
189 */
190 xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
191
192 /*
193 * MSR_IA32_XSS sets supervisor states managed by XSAVES.
194 */
195 if (boot_cpu_has(X86_FEATURE_XSAVES)) {
196 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
197 xfeatures_mask_independent());
198 }
199}
200
201static bool xfeature_enabled(enum xfeature xfeature)
202{
203 return fpu_kernel_cfg.max_features & BIT_ULL(xfeature);
204}
205
206/*
207 * Record the offsets and sizes of various xstates contained
208 * in the XSAVE state memory layout.
209 */
210static void __init setup_xstate_cache(void)
211{
212 u32 eax, ebx, ecx, edx, i;
213 /* start at the beginning of the "extended state" */
214 unsigned int last_good_offset = offsetof(struct xregs_state,
215 extended_state_area);
216 /*
217 * The FP xstates and SSE xstates are legacy states. They are always
218 * in the fixed offsets in the xsave area in either compacted form
219 * or standard form.
220 */
221 xstate_offsets[XFEATURE_FP] = 0;
222 xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state,
223 xmm_space);
224
225 xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP];
226 xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state,
227 xmm_space);
228
229 for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
230 cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
231
232 xstate_sizes[i] = eax;
233 xstate_flags[i] = ecx;
234
235 /*
236 * If an xfeature is supervisor state, the offset in EBX is
237 * invalid, leave it to -1.
238 */
239 if (xfeature_is_supervisor(i))
240 continue;
241
242 xstate_offsets[i] = ebx;
243
244 /*
245 * In our xstate size checks, we assume that the highest-numbered
246 * xstate feature has the highest offset in the buffer. Ensure
247 * it does.
248 */
249 WARN_ONCE(last_good_offset > xstate_offsets[i],
250 "x86/fpu: misordered xstate at %d\n", last_good_offset);
251
252 last_good_offset = xstate_offsets[i];
253 }
254}
255
256static void __init print_xstate_feature(u64 xstate_mask)
257{
258 const char *feature_name;
259
260 if (cpu_has_xfeatures(xstate_mask, &feature_name))
261 pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name);
262}
263
264/*
265 * Print out all the supported xstate features:
266 */
267static void __init print_xstate_features(void)
268{
269 print_xstate_feature(XFEATURE_MASK_FP);
270 print_xstate_feature(XFEATURE_MASK_SSE);
271 print_xstate_feature(XFEATURE_MASK_YMM);
272 print_xstate_feature(XFEATURE_MASK_BNDREGS);
273 print_xstate_feature(XFEATURE_MASK_BNDCSR);
274 print_xstate_feature(XFEATURE_MASK_OPMASK);
275 print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
276 print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
277 print_xstate_feature(XFEATURE_MASK_PKRU);
278 print_xstate_feature(XFEATURE_MASK_PASID);
279 print_xstate_feature(XFEATURE_MASK_XTILE_CFG);
280 print_xstate_feature(XFEATURE_MASK_XTILE_DATA);
281}
282
/*
 * Warn when @nr is not a valid extended xfeature number, i.e. when it is
 * below FIRST_EXTENDED_XFEATURE or not below XFEATURE_MAX.
 *
 * This check is important because it is easy to get XSTATE_*
 * confused with XSTATE_BIT_*.
 *
 * @nr is parenthesized so that an expression argument keeps its meaning;
 * it is evaluated twice.
 */
#define CHECK_XFEATURE(nr) do {				\
	WARN_ON((nr) < FIRST_EXTENDED_XFEATURE);	\
	WARN_ON((nr) >= XFEATURE_MAX);			\
} while (0)
291
292/*
293 * Print out xstate component offsets and sizes
294 */
295static void __init print_xstate_offset_size(void)
296{
297 int i;
298
299 for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
300 pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
301 i, xfeature_get_offset(fpu_kernel_cfg.max_features, i),
302 i, xstate_sizes[i]);
303 }
304}
305
306/*
307 * This function is called only during boot time when x86 caps are not set
308 * up and alternative can not be used yet.
309 */
310static __init void os_xrstor_booting(struct xregs_state *xstate)
311{
312 u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE;
313 u32 lmask = mask;
314 u32 hmask = mask >> 32;
315 int err;
316
317 if (cpu_feature_enabled(X86_FEATURE_XSAVES))
318 XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
319 else
320 XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
321
322 /*
323 * We should never fault when copying from a kernel buffer, and the FPU
324 * state we set at boot time should be valid.
325 */
326 WARN_ON_FPU(err);
327}
328
/*
 * Every supported feature either has an all zeros init state or is handled
 * individually in setup_init_fpu(). The list is deliberately spelled out
 * instead of being derived from XFEATURE_MASK*SUPPORTED: a newly supported
 * feature then trips the build time check in setup_init_fpu_buf() and makes
 * people actually look at the init state of the new feature.
 */
#define XFEATURES_INIT_FPSTATE_HANDLED		\
	(XFEATURE_MASK_FP |			\
	 XFEATURE_MASK_SSE |			\
	 XFEATURE_MASK_YMM |			\
	 XFEATURE_MASK_BNDREGS |		\
	 XFEATURE_MASK_BNDCSR |			\
	 XFEATURE_MASK_OPMASK |			\
	 XFEATURE_MASK_ZMM_Hi256 |		\
	 XFEATURE_MASK_Hi16_ZMM |		\
	 XFEATURE_MASK_PKRU |			\
	 XFEATURE_MASK_PASID |			\
	 XFEATURE_MASK_XTILE)
348
349/*
350 * setup the xstate image representing the init state
351 */
352static void __init setup_init_fpu_buf(void)
353{
354 BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
355 XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
356 XFEATURES_INIT_FPSTATE_HANDLED);
357
358 if (!boot_cpu_has(X86_FEATURE_XSAVE))
359 return;
360
361 print_xstate_features();
362
363 xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures);
364
365 /*
366 * Init all the features state with header.xfeatures being 0x0
367 */
368 os_xrstor_booting(&init_fpstate.regs.xsave);
369
370 /*
371 * All components are now in init state. Read the state back so
372 * that init_fpstate contains all non-zero init state. This only
373 * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because
374 * those use the init optimization which skips writing data for
375 * components in init state.
376 *
377 * XSAVE could be used, but that would require to reshuffle the
378 * data when XSAVEC/S is available because XSAVEC/S uses xstate
379 * compaction. But doing so is a pointless exercise because most
380 * components have an all zeros init state except for the legacy
381 * ones (FP and SSE). Those can be saved with FXSAVE into the
382 * legacy area. Adding new features requires to ensure that init
383 * state is all zeroes or if not to add the necessary handling
384 * here.
385 */
386 fxsave(&init_fpstate.regs.fxsave);
387}
388
389int xfeature_size(int xfeature_nr)
390{
391 u32 eax, ebx, ecx, edx;
392
393 CHECK_XFEATURE(xfeature_nr);
394 cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
395 return eax;
396}
397
398/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
399static int validate_user_xstate_header(const struct xstate_header *hdr,
400 struct fpstate *fpstate)
401{
402 /* No unknown or supervisor features may be set */
403 if (hdr->xfeatures & ~fpstate->user_xfeatures)
404 return -EINVAL;
405
406 /* Userspace must use the uncompacted format */
407 if (hdr->xcomp_bv)
408 return -EINVAL;
409
410 /*
411 * If 'reserved' is shrunken to add a new field, make sure to validate
412 * that new field here!
413 */
414 BUILD_BUG_ON(sizeof(hdr->reserved) != 48);
415
416 /* No reserved bits may be set */
417 if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
418 return -EINVAL;
419
420 return 0;
421}
422
423static void __init __xstate_dump_leaves(void)
424{
425 int i;
426 u32 eax, ebx, ecx, edx;
427 static int should_dump = 1;
428
429 if (!should_dump)
430 return;
431 should_dump = 0;
432 /*
433 * Dump out a few leaves past the ones that we support
434 * just in case there are some goodies up there
435 */
436 for (i = 0; i < XFEATURE_MAX + 10; i++) {
437 cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
438 pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
439 XSTATE_CPUID, i, eax, ebx, ecx, edx);
440 }
441}
442
/*
 * WARN_ONCE() with an "XSAVE consistency problem: " prefix; when the
 * warning triggers, the CPUID xstate leaves are dumped as well.
 */
#define XSTATE_WARN_ON(x, fmt, ...)						\
do {										\
	if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__))	\
		__xstate_dump_leaves();						\
} while (0)
448
/*
 * If @nr is the feature @nr_macro, warn once when the CPU reported size @sz
 * differs from sizeof(@__struct) and dump the CPUID xstate leaves.
 *
 * @sz and @nr are parenthesized so that expression arguments keep their
 * meaning inside the comparisons.
 */
#define XCHECK_SZ(sz, nr, nr_macro, __struct) do {			\
	if (((nr) == (nr_macro)) &&					\
	    WARN_ONCE((sz) != sizeof(__struct),				\
		"%s: struct is %zu bytes, cpu state %d bytes\n",	\
		__stringify(nr_macro), sizeof(__struct), (sz))) {	\
		__xstate_dump_leaves();					\
	}								\
} while (0)
457
458/**
459 * check_xtile_data_against_struct - Check tile data state size.
460 *
461 * Calculate the state size by multiplying the single tile size which is
462 * recorded in a C struct, and the number of tiles that the CPU informs.
463 * Compare the provided size with the calculation.
464 *
465 * @size: The tile data state size
466 *
467 * Returns: 0 on success, -EINVAL on mismatch.
468 */
469static int __init check_xtile_data_against_struct(int size)
470{
471 u32 max_palid, palid, state_size;
472 u32 eax, ebx, ecx, edx;
473 u16 max_tile;
474
475 /*
476 * Check the maximum palette id:
477 * eax: the highest numbered palette subleaf.
478 */
479 cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx);
480
481 /*
482 * Cross-check each tile size and find the maximum number of
483 * supported tiles.
484 */
485 for (palid = 1, max_tile = 0; palid <= max_palid; palid++) {
486 u16 tile_size, max;
487
488 /*
489 * Check the tile size info:
490 * eax[31:16]: bytes per title
491 * ebx[31:16]: the max names (or max number of tiles)
492 */
493 cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx);
494 tile_size = eax >> 16;
495 max = ebx >> 16;
496
497 if (tile_size != sizeof(struct xtile_data)) {
498 pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n",
499 __stringify(XFEATURE_XTILE_DATA),
500 sizeof(struct xtile_data), tile_size);
501 __xstate_dump_leaves();
502 return -EINVAL;
503 }
504
505 if (max > max_tile)
506 max_tile = max;
507 }
508
509 state_size = sizeof(struct xtile_data) * max_tile;
510 if (size != state_size) {
511 pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n",
512 __stringify(XFEATURE_XTILE_DATA), state_size, size);
513 __xstate_dump_leaves();
514 return -EINVAL;
515 }
516 return 0;
517}
518
519/*
520 * We have a C struct for each 'xstate'. We need to ensure
521 * that our software representation matches what the CPU
522 * tells us about the state's size.
523 */
524static bool __init check_xstate_against_struct(int nr)
525{
526 /*
527 * Ask the CPU for the size of the state.
528 */
529 int sz = xfeature_size(nr);
530 /*
531 * Match each CPU state with the corresponding software
532 * structure.
533 */
534 XCHECK_SZ(sz, nr, XFEATURE_YMM, struct ymmh_struct);
535 XCHECK_SZ(sz, nr, XFEATURE_BNDREGS, struct mpx_bndreg_state);
536 XCHECK_SZ(sz, nr, XFEATURE_BNDCSR, struct mpx_bndcsr_state);
537 XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state);
538 XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state);
539 XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state);
540 XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state);
541 XCHECK_SZ(sz, nr, XFEATURE_PASID, struct ia32_pasid_state);
542 XCHECK_SZ(sz, nr, XFEATURE_XTILE_CFG, struct xtile_cfg);
543
544 /* The tile data size varies between implementations. */
545 if (nr == XFEATURE_XTILE_DATA)
546 check_xtile_data_against_struct(sz);
547
548 /*
549 * Make *SURE* to add any feature numbers in below if
550 * there are "holes" in the xsave state component
551 * numbers.
552 */
553 if ((nr < XFEATURE_YMM) ||
554 (nr >= XFEATURE_MAX) ||
555 (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) ||
556 ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) {
557 XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
558 return false;
559 }
560 return true;
561}
562
563static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
564{
565 unsigned int topmost = fls64(xfeatures) - 1;
566 unsigned int offset = xstate_offsets[topmost];
567
568 if (topmost <= XFEATURE_SSE)
569 return sizeof(struct xregs_state);
570
571 if (compacted)
572 offset = xfeature_get_offset(xfeatures, topmost);
573 return offset + xstate_sizes[topmost];
574}
575
576/*
577 * This essentially double-checks what the cpu told us about
578 * how large the XSAVE buffer needs to be. We are recalculating
579 * it to be safe.
580 *
581 * Independent XSAVE features allocate their own buffers and are not
582 * covered by these checks. Only the size of the buffer for task->fpu
583 * is checked here.
584 */
585static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
586{
587 bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
588 bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES);
589 unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
590 int i;
591
592 for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
593 if (!check_xstate_against_struct(i))
594 return false;
595 /*
596 * Supervisor state components can be managed only by
597 * XSAVES.
598 */
599 if (!xsaves && xfeature_is_supervisor(i)) {
600 XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i);
601 return false;
602 }
603 }
604 size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
605 XSTATE_WARN_ON(size != kernel_size,
606 "size %u != kernel_size %u\n", size, kernel_size);
607 return size == kernel_size;
608}
609
610/*
611 * Get total size of enabled xstates in XCR0 | IA32_XSS.
612 *
613 * Note the SDM's wording here. "sub-function 0" only enumerates
614 * the size of the *user* states. If we use it to size a buffer
615 * that we use 'XSAVES' on, we could potentially overflow the
616 * buffer because 'XSAVES' saves system states too.
617 *
618 * This also takes compaction into account. So this works for
619 * XSAVEC as well.
620 */
621static unsigned int __init get_compacted_size(void)
622{
623 unsigned int eax, ebx, ecx, edx;
624 /*
625 * - CPUID function 0DH, sub-function 1:
626 * EBX enumerates the size (in bytes) required by
627 * the XSAVES instruction for an XSAVE area
628 * containing all the state components
629 * corresponding to bits currently set in
630 * XCR0 | IA32_XSS.
631 *
632 * When XSAVES is not available but XSAVEC is (virt), then there
633 * are no supervisor states, but XSAVEC still uses compacted
634 * format.
635 */
636 cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
637 return ebx;
638}
639
640/*
641 * Get the total size of the enabled xstates without the independent supervisor
642 * features.
643 */
644static unsigned int __init get_xsave_compacted_size(void)
645{
646 u64 mask = xfeatures_mask_independent();
647 unsigned int size;
648
649 if (!mask)
650 return get_compacted_size();
651
652 /* Disable independent features. */
653 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
654
655 /*
656 * Ask the hardware what size is required of the buffer.
657 * This is the size required for the task->fpu buffer.
658 */
659 size = get_compacted_size();
660
661 /* Re-enable independent features so XSAVES will work on them again. */
662 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);
663
664 return size;
665}
666
667static unsigned int __init get_xsave_size_user(void)
668{
669 unsigned int eax, ebx, ecx, edx;
670 /*
671 * - CPUID function 0DH, sub-function 0:
672 * EBX enumerates the size (in bytes) required by
673 * the XSAVE instruction for an XSAVE area
674 * containing all the *user* state components
675 * corresponding to bits currently set in XCR0.
676 */
677 cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
678 return ebx;
679}
680
681static int __init init_xstate_size(void)
682{
683 /* Recompute the context size for enabled features: */
684 unsigned int user_size, kernel_size, kernel_default_size;
685 bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
686
687 /* Uncompacted user space size */
688 user_size = get_xsave_size_user();
689
690 /*
691 * XSAVES kernel size includes supervisor states and uses compacted
692 * format. XSAVEC uses compacted format, but does not save
693 * supervisor states.
694 *
695 * XSAVE[OPT] do not support supervisor states so kernel and user
696 * size is identical.
697 */
698 if (compacted)
699 kernel_size = get_xsave_compacted_size();
700 else
701 kernel_size = user_size;
702
703 kernel_default_size =
704 xstate_calculate_size(fpu_kernel_cfg.default_features, compacted);
705
706 if (!paranoid_xstate_size_valid(kernel_size))
707 return -EINVAL;
708
709 fpu_kernel_cfg.max_size = kernel_size;
710 fpu_user_cfg.max_size = user_size;
711
712 fpu_kernel_cfg.default_size = kernel_default_size;
713 fpu_user_cfg.default_size =
714 xstate_calculate_size(fpu_user_cfg.default_features, false);
715
716 return 0;
717}
718
719/*
720 * We enabled the XSAVE hardware, but something went wrong and
721 * we can not use it. Disable it.
722 */
723static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
724{
725 fpu_kernel_cfg.max_features = 0;
726 cr4_clear_bits(X86_CR4_OSXSAVE);
727 setup_clear_cpu_cap(X86_FEATURE_XSAVE);
728
729 /* Restore the legacy size.*/
730 fpu_kernel_cfg.max_size = legacy_size;
731 fpu_kernel_cfg.default_size = legacy_size;
732 fpu_user_cfg.max_size = legacy_size;
733 fpu_user_cfg.default_size = legacy_size;
734
735 /*
736 * Prevent enabling the static branch which enables writes to the
737 * XFD MSR.
738 */
739 init_fpstate.xfd = 0;
740
741 fpstate_reset(¤t->thread.fpu);
742}
743
744/*
745 * Enable and initialize the xsave feature.
746 * Called once per system bootup.
747 */
748void __init fpu__init_system_xstate(unsigned int legacy_size)
749{
750 unsigned int eax, ebx, ecx, edx;
751 u64 xfeatures;
752 int err;
753 int i;
754
755 if (!boot_cpu_has(X86_FEATURE_FPU)) {
756 pr_info("x86/fpu: No FPU detected\n");
757 return;
758 }
759
760 if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
761 pr_info("x86/fpu: x87 FPU will use %s\n",
762 boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
763 return;
764 }
765
766 if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
767 WARN_ON_FPU(1);
768 return;
769 }
770
771 /*
772 * Find user xstates supported by the processor.
773 */
774 cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
775 fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);
776
777 /*
778 * Find supervisor xstates supported by the processor.
779 */
780 cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
781 fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);
782
783 if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
784 /*
785 * This indicates that something really unexpected happened
786 * with the enumeration. Disable XSAVE and try to continue
787 * booting without it. This is too early to BUG().
788 */
789 pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
790 fpu_kernel_cfg.max_features);
791 goto out_disable;
792 }
793
794 /*
795 * Clear XSAVE features that are disabled in the normal CPUID.
796 */
797 for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
798 unsigned short cid = xsave_cpuid_features[i];
799
800 /* Careful: X86_FEATURE_FPU is 0! */
801 if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid))
802 fpu_kernel_cfg.max_features &= ~BIT_ULL(i);
803 }
804
805 if (!cpu_feature_enabled(X86_FEATURE_XFD))
806 fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;
807
808 if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
809 fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
810 else
811 fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
812 XFEATURE_MASK_SUPERVISOR_SUPPORTED;
813
814 fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
815 fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
816
817 /* Clean out dynamic features from default */
818 fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features;
819 fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
820
821 fpu_user_cfg.default_features = fpu_user_cfg.max_features;
822 fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
823
824 /* Store it for paranoia check at the end */
825 xfeatures = fpu_kernel_cfg.max_features;
826
827 /*
828 * Initialize the default XFD state in initfp_state and enable the
829 * dynamic sizing mechanism if dynamic states are available. The
830 * static key cannot be enabled here because this runs before
831 * jump_label_init(). This is delayed to an initcall.
832 */
833 init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;
834
835 /* Set up compaction feature bit */
836 if (cpu_feature_enabled(X86_FEATURE_XSAVEC) ||
837 cpu_feature_enabled(X86_FEATURE_XSAVES))
838 setup_force_cpu_cap(X86_FEATURE_XCOMPACTED);
839
840 /* Enable xstate instructions to be able to continue with initialization: */
841 fpu__init_cpu_xstate();
842
843 /* Cache size, offset and flags for initialization */
844 setup_xstate_cache();
845
846 err = init_xstate_size();
847 if (err)
848 goto out_disable;
849
850 /* Reset the state for the current task */
851 fpstate_reset(¤t->thread.fpu);
852
853 /*
854 * Update info used for ptrace frames; use standard-format size and no
855 * supervisor xstates:
856 */
857 update_regset_xstate_info(fpu_user_cfg.max_size,
858 fpu_user_cfg.max_features);
859
860 /*
861 * init_fpstate excludes dynamic states as they are large but init
862 * state is zero.
863 */
864 init_fpstate.size = fpu_kernel_cfg.default_size;
865 init_fpstate.xfeatures = fpu_kernel_cfg.default_features;
866
867 if (init_fpstate.size > sizeof(init_fpstate.regs)) {
868 pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n",
869 sizeof(init_fpstate.regs), init_fpstate.size);
870 goto out_disable;
871 }
872
873 setup_init_fpu_buf();
874
875 /*
876 * Paranoia check whether something in the setup modified the
877 * xfeatures mask.
878 */
879 if (xfeatures != fpu_kernel_cfg.max_features) {
880 pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n",
881 xfeatures, fpu_kernel_cfg.max_features);
882 goto out_disable;
883 }
884
885 print_xstate_offset_size();
886 pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
887 fpu_kernel_cfg.max_features,
888 fpu_kernel_cfg.max_size,
889 boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard");
890 return;
891
892out_disable:
893 /* something went wrong, try to boot without any XSAVE support */
894 fpu__init_disable_system_xstate(legacy_size);
895}
896
897/*
898 * Restore minimal FPU state after suspend:
899 */
900void fpu__resume_cpu(void)
901{
902 /*
903 * Restore XCR0 on xsave capable CPUs:
904 */
905 if (cpu_feature_enabled(X86_FEATURE_XSAVE))
906 xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
907
908 /*
909 * Restore IA32_XSS. The same CPUID bit enumerates support
910 * of XSAVES and MSR_IA32_XSS.
911 */
912 if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
913 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
914 xfeatures_mask_independent());
915 }
916
917 if (fpu_state_size_dynamic())
918 wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd);
919}
920
921/*
922 * Given an xstate feature nr, calculate where in the xsave
923 * buffer the state is. Callers should ensure that the buffer
924 * is valid.
925 */
926static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
927{
928 u64 xcomp_bv = xsave->header.xcomp_bv;
929
930 if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
931 return NULL;
932
933 if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) {
934 if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr))))
935 return NULL;
936 }
937
938 return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr);
939}
940
941/*
942 * Given the xsave area and a state inside, this function returns the
943 * address of the state.
944 *
945 * This is the API that is called to get xstate address in either
946 * standard format or compacted format of xsave area.
947 *
948 * Note that if there is no data for the field in the xsave buffer
949 * this will return NULL.
950 *
951 * Inputs:
952 * xstate: the thread's storage area for all FPU data
953 * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
954 * XFEATURE_SSE, etc...)
955 * Output:
956 * address of the state in the xsave area, or NULL if the
957 * field is not present in the xsave buffer.
958 */
959void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
960{
961 /*
962 * Do we even *have* xsave state?
963 */
964 if (!boot_cpu_has(X86_FEATURE_XSAVE))
965 return NULL;
966
967 /*
968 * We should not ever be requesting features that we
969 * have not enabled.
970 */
971 if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
972 return NULL;
973
974 /*
975 * This assumes the last 'xsave*' instruction to
976 * have requested that 'xfeature_nr' be saved.
977 * If it did not, we might be seeing and old value
978 * of the field in the buffer.
979 *
980 * This can happen because the last 'xsave' did not
981 * request that this feature be saved (unlikely)
982 * or because the "init optimization" caused it
983 * to not be saved.
984 */
985 if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr)))
986 return NULL;
987
988 return __raw_xsave_addr(xsave, xfeature_nr);
989}
990
#ifdef CONFIG_ARCH_HAS_PKEYS

/*
 * This will go out and modify PKRU register to set the access
 * rights for @pkey to @init_val.
 *
 * @tsk:	not used by this implementation
 * @pkey:	protection key whose rights are changed
 * @init_val:	combination of PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE
 *
 * Returns 0 on success, -EINVAL when OSPKE is not enabled or when @pkey is
 * outside of 0 ... arch_max_pkey() - 1.
 */
int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
			      unsigned long init_val)
{
	u32 old_pkru, new_pkru_bits = 0;
	int pkey_shift;

	/*
	 * This check implies XSAVE support.  OSPKE only gets
	 * set if we enable XSAVE and we enable PKU in XCR0.
	 */
	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
		return -EINVAL;

	/*
	 * This code should only be called with valid 'pkey'
	 * values originating from in-kernel users.  Complain
	 * if a bad value is observed. Negative values are rejected
	 * too: they would end up as a negative shift count below.
	 */
	if (WARN_ON_ONCE(pkey < 0 || pkey >= arch_max_pkey()))
		return -EINVAL;

	/* Set the bits we need in PKRU:  */
	if (init_val & PKEY_DISABLE_ACCESS)
		new_pkru_bits |= PKRU_AD_BIT;
	if (init_val & PKEY_DISABLE_WRITE)
		new_pkru_bits |= PKRU_WD_BIT;

	/* Shift the bits in to the correct place in PKRU for pkey: */
	pkey_shift = pkey * PKRU_BITS_PER_PKEY;
	new_pkru_bits <<= pkey_shift;

	/* Get old PKRU and mask off any old bits in place: */
	old_pkru = read_pkru();
	old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);

	/* Write old part along with new part: */
	write_pkru(old_pkru | new_pkru_bits);

	return 0;
}
#endif /* CONFIG_ARCH_HAS_PKEYS */
1038
/*
 * Append @size bytes to @to. The bytes come from @xstate when @from_xstate
 * is true and from @init_xstate otherwise.
 */
static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
			 void *init_xstate, unsigned int size)
{
	void *src = init_xstate;

	if (from_xstate)
		src = xstate;

	membuf_write(to, src, size);
}
1044
/**
 * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
 * @to:		membuf descriptor
 * @fpstate:	The fpstate buffer from which to copy
 * @pkru_val:	The PKRU value to store in the PKRU component
 * @copy_mode:	The requested copy mode
 *
 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
 * format, i.e. from the kernel internal hardware dependent storage format
 * to the requested @copy_mode. UABI XSTATE is always uncompacted!
 *
 * For every component the data is taken from @fpstate when the matching
 * bit is set in the (mode filtered) xfeatures header and from init_fpstate
 * otherwise. Whatever space is left in @to at the end is zeroed.
 *
 * It supports partial copy but @to.pos always starts from zero.
 */
void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
			       u32 pkru_val, enum xstate_copy_mode copy_mode)
{
	const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
	struct xregs_state *xinit = &init_fpstate.regs.xsave;
	struct xregs_state *xsave = &fpstate->regs.xsave;
	struct xstate_header header;
	unsigned int zerofrom;
	u64 mask;
	int i;

	/* Build a private header; only xfeatures is carried over */
	memset(&header, 0, sizeof(header));
	header.xfeatures = xsave->header.xfeatures;

	/* Mask out the feature bits depending on copy mode */
	switch (copy_mode) {
	case XSTATE_COPY_FP:
		header.xfeatures &= XFEATURE_MASK_FP;
		break;

	case XSTATE_COPY_FX:
		header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
		break;

	case XSTATE_COPY_XSAVE:
		header.xfeatures &= fpstate->user_xfeatures;
		break;
	}

	/* Copy FP state up to MXCSR */
	copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387,
		     &xinit->i387, off_mxcsr);

	/* Copy MXCSR when SSE or YMM are set in the feature mask */
	copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM),
		     &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr,
		     MXCSR_AND_FLAGS_SIZE);

	/* Copy the remaining FP state */
	copy_feature(header.xfeatures & XFEATURE_MASK_FP,
		     &to, &xsave->i387.st_space, &xinit->i387.st_space,
		     sizeof(xsave->i387.st_space));

	/* Copy the SSE state - shared with YMM, but independently managed */
	copy_feature(header.xfeatures & XFEATURE_MASK_SSE,
		     &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space,
		     sizeof(xsave->i387.xmm_space));

	/* The FP and FX modes end after the legacy area */
	if (copy_mode != XSTATE_COPY_XSAVE)
		goto out;

	/* Zero the padding area */
	membuf_zero(&to, sizeof(xsave->i387.padding));

	/* Copy xsave->i387.sw_reserved */
	membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved));

	/* Copy the user space relevant state of @xsave->header */
	membuf_write(&to, &header, sizeof(header));

	/* Output position reached so far: start of the extended state area */
	zerofrom = offsetof(struct xregs_state, extended_state_area);

	/*
	 * The ptrace buffer is in non-compacted XSAVE format.  In
	 * non-compacted format disabled features still occupy state space,
	 * but there is no state to copy from in the compacted
	 * init_fpstate. The gap tracking will zero these states.
	 */
	mask = fpstate->user_xfeatures;

	/*
	 * Dynamic features are not present in init_fpstate. When they are
	 * in an all zeros init state, remove those from 'mask' to zero
	 * those features in the user buffer instead of retrieving them
	 * from init_fpstate.
	 */
	if (fpu_state_size_dynamic())
		mask &= (header.xfeatures | xinit->header.xcomp_bv);

	for_each_extended_xfeature(i, mask) {
		/*
		 * If there was a feature or alignment gap, zero the space
		 * in the destination buffer.
		 */
		if (zerofrom < xstate_offsets[i])
			membuf_zero(&to, xstate_offsets[i] - zerofrom);

		if (i == XFEATURE_PKRU) {
			struct pkru_state pkru = {0};
			/*
			 * PKRU is not necessarily up to date in the
			 * XSAVE buffer. Use the provided value.
			 */
			pkru.pkru = pkru_val;
			membuf_write(&to, &pkru, sizeof(pkru));
		} else {
			copy_feature(header.xfeatures & BIT_ULL(i), &to,
				     __raw_xsave_addr(xsave, i),
				     __raw_xsave_addr(xinit, i),
				     xstate_sizes[i]);
		}
		/*
		 * Keep track of the last copied state in the non-compacted
		 * target buffer for gap zeroing.
		 */
		zerofrom = xstate_offsets[i] + xstate_sizes[i];
	}

out:
	/* Zero whatever is left of the destination */
	if (to.left)
		membuf_zero(&to, to.left);
}
1170
1171/**
1172 * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
1173 * @to: membuf descriptor
1174 * @tsk: The task from which to copy the saved xstate
1175 * @copy_mode: The requested copy mode
1176 *
1177 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
1178 * format, i.e. from the kernel internal hardware dependent storage format
1179 * to the requested @mode. UABI XSTATE is always uncompacted!
1180 *
1181 * It supports partial copy but @to.pos always starts from zero.
1182 */
1183void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
1184 enum xstate_copy_mode copy_mode)
1185{
1186 __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate,
1187 tsk->thread.pkru, copy_mode);
1188}
1189
1190static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
1191 const void *kbuf, const void __user *ubuf)
1192{
1193 if (kbuf) {
1194 memcpy(dst, kbuf + offset, size);
1195 } else {
1196 if (copy_from_user(dst, ubuf + offset, size))
1197 return -EFAULT;
1198 }
1199 return 0;
1200}
1201
1202
1203/**
1204 * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate
1205 * @fpstate: The fpstate buffer to copy to
1206 * @kbuf: The UABI format buffer, if it comes from the kernel
1207 * @ubuf: The UABI format buffer, if it comes from userspace
1208 * @pkru: The location to write the PKRU value to
1209 *
1210 * Converts from the UABI format into the kernel internal hardware
1211 * dependent format.
1212 *
1213 * This function ultimately has three different callers with distinct PKRU
1214 * behavior.
1215 * 1. When called from sigreturn the PKRU register will be restored from
1216 * @fpstate via an XRSTOR. Correctly copying the UABI format buffer to
1217 * @fpstate is sufficient to cover this case, but the caller will also
1218 * pass a pointer to the thread_struct's pkru field in @pkru and updating
1219 * it is harmless.
1220 * 2. When called from ptrace the PKRU register will be restored from the
1221 * thread_struct's pkru field. A pointer to that is passed in @pkru.
1222 * The kernel will restore it manually, so the XRSTOR behavior that resets
1223 * the PKRU register to the hardware init value (0) if the corresponding
1224 * xfeatures bit is not set is emulated here.
1225 * 3. When called from KVM the PKRU register will be restored from the vcpu's
1226 * pkru field. A pointer to that is passed in @pkru. KVM hasn't used
1227 * XRSTOR and hasn't had the PKRU resetting behavior described above. To
1228 * preserve that KVM behavior, it passes NULL for @pkru if the xfeatures
1229 * bit is not set.
1230 */
1231static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
1232 const void __user *ubuf, u32 *pkru)
1233{
1234 struct xregs_state *xsave = &fpstate->regs.xsave;
1235 unsigned int offset, size;
1236 struct xstate_header hdr;
1237 u64 mask;
1238 int i;
1239
1240 offset = offsetof(struct xregs_state, header);
1241 if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
1242 return -EFAULT;
1243
1244 if (validate_user_xstate_header(&hdr, fpstate))
1245 return -EINVAL;
1246
1247 /* Validate MXCSR when any of the related features is in use */
1248 mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
1249 if (hdr.xfeatures & mask) {
1250 u32 mxcsr[2];
1251
1252 offset = offsetof(struct fxregs_state, mxcsr);
1253 if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf))
1254 return -EFAULT;
1255
1256 /* Reserved bits in MXCSR must be zero. */
1257 if (mxcsr[0] & ~mxcsr_feature_mask)
1258 return -EINVAL;
1259
1260 /* SSE and YMM require MXCSR even when FP is not in use. */
1261 if (!(hdr.xfeatures & XFEATURE_MASK_FP)) {
1262 xsave->i387.mxcsr = mxcsr[0];
1263 xsave->i387.mxcsr_mask = mxcsr[1];
1264 }
1265 }
1266
1267 for (i = 0; i < XFEATURE_MAX; i++) {
1268 mask = BIT_ULL(i);
1269
1270 if (hdr.xfeatures & mask) {
1271 void *dst = __raw_xsave_addr(xsave, i);
1272
1273 offset = xstate_offsets[i];
1274 size = xstate_sizes[i];
1275
1276 if (copy_from_buffer(dst, offset, size, kbuf, ubuf))
1277 return -EFAULT;
1278 }
1279 }
1280
1281 if (hdr.xfeatures & XFEATURE_MASK_PKRU) {
1282 struct pkru_state *xpkru;
1283
1284 xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU);
1285 *pkru = xpkru->pkru;
1286 } else {
1287 /*
1288 * KVM may pass NULL here to indicate that it does not need
1289 * PKRU updated.
1290 */
1291 if (pkru)
1292 *pkru = 0;
1293 }
1294
1295 /*
1296 * The state that came in from userspace was user-state only.
1297 * Mask all the user states out of 'xfeatures':
1298 */
1299 xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;
1300
1301 /*
1302 * Add back in the features that came in from userspace:
1303 */
1304 xsave->header.xfeatures |= hdr.xfeatures;
1305
1306 return 0;
1307}
1308
1309/*
1310 * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
1311 * format and copy to the target thread. Used by ptrace and KVM.
1312 */
1313int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru)
1314{
1315 return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru);
1316}
1317
1318/*
1319 * Convert from a sigreturn standard-format user-space buffer to kernel
1320 * XSAVE[S] format and copy to the target thread. This is called from the
1321 * sigreturn() and rt_sigreturn() system calls.
1322 */
1323int copy_sigframe_from_user_to_xstate(struct task_struct *tsk,
1324 const void __user *ubuf)
1325{
1326 return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru);
1327}
1328
1329static bool validate_independent_components(u64 mask)
1330{
1331 u64 xchk;
1332
1333 if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
1334 return false;
1335
1336 xchk = ~xfeatures_mask_independent();
1337
1338 if (WARN_ON_ONCE(!mask || mask & xchk))
1339 return false;
1340
1341 return true;
1342}
1343
1344/**
1345 * xsaves - Save selected components to a kernel xstate buffer
1346 * @xstate: Pointer to the buffer
1347 * @mask: Feature mask to select the components to save
1348 *
1349 * The @xstate buffer must be 64 byte aligned and correctly initialized as
1350 * XSAVES does not write the full xstate header. Before first use the
1351 * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer
1352 * can #GP.
1353 *
1354 * The feature mask must be a subset of the independent features.
1355 */
1356void xsaves(struct xregs_state *xstate, u64 mask)
1357{
1358 int err;
1359
1360 if (!validate_independent_components(mask))
1361 return;
1362
1363 XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err);
1364 WARN_ON_ONCE(err);
1365}
1366
1367/**
1368 * xrstors - Restore selected components from a kernel xstate buffer
1369 * @xstate: Pointer to the buffer
1370 * @mask: Feature mask to select the components to restore
1371 *
1372 * The @xstate buffer must be 64 byte aligned and correctly initialized
1373 * otherwise XRSTORS from that buffer can #GP.
1374 *
1375 * Proper usage is to restore the state which was saved with
1376 * xsaves() into @xstate.
1377 *
1378 * The feature mask must be a subset of the independent features.
1379 */
1380void xrstors(struct xregs_state *xstate, u64 mask)
1381{
1382 int err;
1383
1384 if (!validate_independent_components(mask))
1385 return;
1386
1387 XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err);
1388 WARN_ON_ONCE(err);
1389}
1390
1391#if IS_ENABLED(CONFIG_KVM)
1392void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature)
1393{
1394 void *addr = get_xsave_addr(&fps->regs.xsave, xfeature);
1395
1396 if (addr)
1397 memset(addr, 0, xstate_sizes[xfeature]);
1398}
1399EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
1400#endif
1401
1402#ifdef CONFIG_X86_64
1403
1404#ifdef CONFIG_X86_DEBUG_FPU
1405/*
1406 * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
1407 * can safely operate on the @fpstate buffer.
1408 */
1409static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
1410{
1411 u64 xfd = __this_cpu_read(xfd_state);
1412
1413 if (fpstate->xfd == xfd)
1414 return true;
1415
1416 /*
1417 * The XFD MSR does not match fpstate->xfd. That's invalid when
1418 * the passed in fpstate is current's fpstate.
1419 */
1420 if (fpstate->xfd == current->thread.fpu.fpstate->xfd)
1421 return false;
1422
1423 /*
1424 * XRSTOR(S) from init_fpstate are always correct as it will just
1425 * bring all components into init state and not read from the
1426 * buffer. XSAVE(S) raises #PF after init.
1427 */
1428 if (fpstate == &init_fpstate)
1429 return rstor;
1430
1431 /*
1432 * XSAVE(S): clone(), fpu_swap_kvm_fpu()
1433 * XRSTORS(S): fpu_swap_kvm_fpu()
1434 */
1435
1436 /*
1437 * No XSAVE/XRSTOR instructions (except XSAVE itself) touch
1438 * the buffer area for XFD-disabled state components.
1439 */
1440 mask &= ~xfd;
1441
1442 /*
1443 * Remove features which are valid in fpstate. They
1444 * have space allocated in fpstate.
1445 */
1446 mask &= ~fpstate->xfeatures;
1447
1448 /*
1449 * Any remaining state components in 'mask' might be written
1450 * by XSAVE/XRSTOR. Fail validation it found.
1451 */
1452 return !mask;
1453}
1454
1455void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
1456{
1457 WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
1458}
1459#endif /* CONFIG_X86_DEBUG_FPU */
1460
1461static int __init xfd_update_static_branch(void)
1462{
1463 /*
1464 * If init_fpstate.xfd has bits set then dynamic features are
1465 * available and the dynamic sizing must be enabled.
1466 */
1467 if (init_fpstate.xfd)
1468 static_branch_enable(&__fpu_state_size_dynamic);
1469 return 0;
1470}
1471arch_initcall(xfd_update_static_branch)
1472
1473void fpstate_free(struct fpu *fpu)
1474{
1475 if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate)
1476 vfree(fpu->fpstate);
1477}
1478
1479/**
1480 * fpstate_realloc - Reallocate struct fpstate for the requested new features
1481 *
1482 * @xfeatures: A bitmap of xstate features which extend the enabled features
1483 * of that task
1484 * @ksize: The required size for the kernel buffer
1485 * @usize: The required size for user space buffers
1486 * @guest_fpu: Pointer to a guest FPU container. NULL for host allocations
1487 *
1488 * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
1489 * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
1490 * with large states are likely to live longer.
1491 *
1492 * Returns: 0 on success, -ENOMEM on allocation error.
1493 */
1494static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
1495 unsigned int usize, struct fpu_guest *guest_fpu)
1496{
1497 struct fpu *fpu = ¤t->thread.fpu;
1498 struct fpstate *curfps, *newfps = NULL;
1499 unsigned int fpsize;
1500 bool in_use;
1501
1502 fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);
1503
1504 newfps = vzalloc(fpsize);
1505 if (!newfps)
1506 return -ENOMEM;
1507 newfps->size = ksize;
1508 newfps->user_size = usize;
1509 newfps->is_valloc = true;
1510
1511 /*
1512 * When a guest FPU is supplied, use @guest_fpu->fpstate
1513 * as reference independent whether it is in use or not.
1514 */
1515 curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate;
1516
1517 /* Determine whether @curfps is the active fpstate */
1518 in_use = fpu->fpstate == curfps;
1519
1520 if (guest_fpu) {
1521 newfps->is_guest = true;
1522 newfps->is_confidential = curfps->is_confidential;
1523 newfps->in_use = curfps->in_use;
1524 guest_fpu->xfeatures |= xfeatures;
1525 guest_fpu->uabi_size = usize;
1526 }
1527
1528 fpregs_lock();
1529 /*
1530 * If @curfps is in use, ensure that the current state is in the
1531 * registers before swapping fpstate as that might invalidate it
1532 * due to layout changes.
1533 */
1534 if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD))
1535 fpregs_restore_userregs();
1536
1537 newfps->xfeatures = curfps->xfeatures | xfeatures;
1538
1539 if (!guest_fpu)
1540 newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
1541
1542 newfps->xfd = curfps->xfd & ~xfeatures;
1543
1544 /* Do the final updates within the locked region */
1545 xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);
1546
1547 if (guest_fpu) {
1548 guest_fpu->fpstate = newfps;
1549 /* If curfps is active, update the FPU fpstate pointer */
1550 if (in_use)
1551 fpu->fpstate = newfps;
1552 } else {
1553 fpu->fpstate = newfps;
1554 }
1555
1556 if (in_use)
1557 xfd_update_state(fpu->fpstate);
1558 fpregs_unlock();
1559
1560 /* Only free valloc'ed state */
1561 if (curfps && curfps->is_valloc)
1562 vfree(curfps);
1563
1564 return 0;
1565}
1566
1567static int validate_sigaltstack(unsigned int usize)
1568{
1569 struct task_struct *thread, *leader = current->group_leader;
1570 unsigned long framesize = get_sigframe_size();
1571
1572 lockdep_assert_held(¤t->sighand->siglock);
1573
1574 /* get_sigframe_size() is based on fpu_user_cfg.max_size */
1575 framesize -= fpu_user_cfg.max_size;
1576 framesize += usize;
1577 for_each_thread(leader, thread) {
1578 if (thread->sas_ss_size && thread->sas_ss_size < framesize)
1579 return -ENOSPC;
1580 }
1581 return 0;
1582}
1583
1584static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
1585{
1586 /*
1587 * This deliberately does not exclude !XSAVES as we still might
1588 * decide to optionally context switch XCR0 or talk the silicon
1589 * vendors into extending XFD for the pre AMX states, especially
1590 * AVX512.
1591 */
1592 bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
1593 struct fpu *fpu = ¤t->group_leader->thread.fpu;
1594 struct fpu_state_perm *perm;
1595 unsigned int ksize, usize;
1596 u64 mask;
1597 int ret = 0;
1598
1599 /* Check whether fully enabled */
1600 if ((permitted & requested) == requested)
1601 return 0;
1602
1603 /* Calculate the resulting kernel state size */
1604 mask = permitted | requested;
1605 /* Take supervisor states into account on the host */
1606 if (!guest)
1607 mask |= xfeatures_mask_supervisor();
1608 ksize = xstate_calculate_size(mask, compacted);
1609
1610 /* Calculate the resulting user state size */
1611 mask &= XFEATURE_MASK_USER_SUPPORTED;
1612 usize = xstate_calculate_size(mask, false);
1613
1614 if (!guest) {
1615 ret = validate_sigaltstack(usize);
1616 if (ret)
1617 return ret;
1618 }
1619
1620 perm = guest ? &fpu->guest_perm : &fpu->perm;
1621 /* Pairs with the READ_ONCE() in xstate_get_group_perm() */
1622 WRITE_ONCE(perm->__state_perm, mask);
1623 /* Protected by sighand lock */
1624 perm->__state_size = ksize;
1625 perm->__user_state_size = usize;
1626 return ret;
1627}
1628
/*
 * Permissions array to map facilities with more than one component.
 *
 * Indexed by the facility number which user space hands in; the value is
 * the feature mask which gets requested for it. Entries without an
 * initializer are zero, which xstate_request_perm() rejects with
 * -EOPNOTSUPP.
 */
static const u64 xstate_prctl_req[XFEATURE_MAX] = {
	[XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
};
1635
1636static int xstate_request_perm(unsigned long idx, bool guest)
1637{
1638 u64 permitted, requested;
1639 int ret;
1640
1641 if (idx >= XFEATURE_MAX)
1642 return -EINVAL;
1643
1644 /*
1645 * Look up the facility mask which can require more than
1646 * one xstate component.
1647 */
1648 idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req));
1649 requested = xstate_prctl_req[idx];
1650 if (!requested)
1651 return -EOPNOTSUPP;
1652
1653 if ((fpu_user_cfg.max_features & requested) != requested)
1654 return -EOPNOTSUPP;
1655
1656 /* Lockless quick check */
1657 permitted = xstate_get_group_perm(guest);
1658 if ((permitted & requested) == requested)
1659 return 0;
1660
1661 /* Protect against concurrent modifications */
1662 spin_lock_irq(¤t->sighand->siglock);
1663 permitted = xstate_get_group_perm(guest);
1664
1665 /* First vCPU allocation locks the permissions. */
1666 if (guest && (permitted & FPU_GUEST_PERM_LOCKED))
1667 ret = -EBUSY;
1668 else
1669 ret = __xstate_request_perm(permitted, requested, guest);
1670 spin_unlock_irq(¤t->sighand->siglock);
1671 return ret;
1672}
1673
1674int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu)
1675{
1676 u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
1677 struct fpu_state_perm *perm;
1678 unsigned int ksize, usize;
1679 struct fpu *fpu;
1680
1681 if (!xfd_event) {
1682 if (!guest_fpu)
1683 pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
1684 return 0;
1685 }
1686
1687 /* Protect against concurrent modifications */
1688 spin_lock_irq(¤t->sighand->siglock);
1689
1690 /* If not permitted let it die */
1691 if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) {
1692 spin_unlock_irq(¤t->sighand->siglock);
1693 return -EPERM;
1694 }
1695
1696 fpu = ¤t->group_leader->thread.fpu;
1697 perm = guest_fpu ? &fpu->guest_perm : &fpu->perm;
1698 ksize = perm->__state_size;
1699 usize = perm->__user_state_size;
1700
1701 /*
1702 * The feature is permitted. State size is sufficient. Dropping
1703 * the lock is safe here even if more features are added from
1704 * another task, the retrieved buffer sizes are valid for the
1705 * currently requested feature(s).
1706 */
1707 spin_unlock_irq(¤t->sighand->siglock);
1708
1709 /*
1710 * Try to allocate a new fpstate. If that fails there is no way
1711 * out.
1712 */
1713 if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu))
1714 return -EFAULT;
1715 return 0;
1716}
1717
1718int xfd_enable_feature(u64 xfd_err)
1719{
1720 return __xfd_enable_feature(xfd_err, NULL);
1721}
1722
1723#else /* CONFIG_X86_64 */
/*
 * Stub for builds without CONFIG_X86_64: permission requests always fail
 * with -EPERM, both arguments are ignored.
 */
static inline int xstate_request_perm(unsigned long idx, bool guest)
{
	const int err = -EPERM;

	return err;
}
1728#endif /* !CONFIG_X86_64 */
1729
1730u64 xstate_get_guest_group_perm(void)
1731{
1732 return xstate_get_group_perm(true);
1733}
1734EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);
1735
1736/**
1737 * fpu_xstate_prctl - xstate permission operations
1738 * @tsk: Redundant pointer to current
1739 * @option: A subfunction of arch_prctl()
1740 * @arg2: option argument
1741 * Return: 0 if successful; otherwise, an error code
1742 *
1743 * Option arguments:
1744 *
1745 * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info
1746 * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info
1747 * ARCH_REQ_XCOMP_PERM: Facility number requested
1748 *
1749 * For facilities which require more than one XSTATE component, the request
1750 * must be the highest state component number related to that facility,
1751 * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and
1752 * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
1753 */
1754long fpu_xstate_prctl(int option, unsigned long arg2)
1755{
1756 u64 __user *uptr = (u64 __user *)arg2;
1757 u64 permitted, supported;
1758 unsigned long idx = arg2;
1759 bool guest = false;
1760
1761 switch (option) {
1762 case ARCH_GET_XCOMP_SUPP:
1763 supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features;
1764 return put_user(supported, uptr);
1765
1766 case ARCH_GET_XCOMP_PERM:
1767 /*
1768 * Lockless snapshot as it can also change right after the
1769 * dropping the lock.
1770 */
1771 permitted = xstate_get_host_group_perm();
1772 permitted &= XFEATURE_MASK_USER_SUPPORTED;
1773 return put_user(permitted, uptr);
1774
1775 case ARCH_GET_XCOMP_GUEST_PERM:
1776 permitted = xstate_get_guest_group_perm();
1777 permitted &= XFEATURE_MASK_USER_SUPPORTED;
1778 return put_user(permitted, uptr);
1779
1780 case ARCH_REQ_XCOMP_GUEST_PERM:
1781 guest = true;
1782 fallthrough;
1783
1784 case ARCH_REQ_XCOMP_PERM:
1785 if (!IS_ENABLED(CONFIG_X86_64))
1786 return -EOPNOTSUPP;
1787
1788 return xstate_request_perm(idx, guest);
1789
1790 default:
1791 return -EINVAL;
1792 }
1793}
1794
1795#ifdef CONFIG_PROC_PID_ARCH_STATUS
1796/*
1797 * Report the amount of time elapsed in millisecond since last AVX512
1798 * use in the task.
1799 */
1800static void avx512_status(struct seq_file *m, struct task_struct *task)
1801{
1802 unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp);
1803 long delta;
1804
1805 if (!timestamp) {
1806 /*
1807 * Report -1 if no AVX512 usage
1808 */
1809 delta = -1;
1810 } else {
1811 delta = (long)(jiffies - timestamp);
1812 /*
1813 * Cap to LONG_MAX if time difference > LONG_MAX
1814 */
1815 if (delta < 0)
1816 delta = LONG_MAX;
1817 delta = jiffies_to_msecs(delta);
1818 }
1819
1820 seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta);
1821 seq_putc(m, '\n');
1822}
1823
1824/*
1825 * Report architecture specific information
1826 */
1827int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
1828 struct pid *pid, struct task_struct *task)
1829{
1830 /*
1831 * Report AVX512 state if the processor and build option supported.
1832 */
1833 if (cpu_feature_enabled(X86_FEATURE_AVX512F))
1834 avx512_status(m, task);
1835
1836 return 0;
1837}
1838#endif /* CONFIG_PROC_PID_ARCH_STATUS */
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * xsave/xrstor support.
4 *
5 * Author: Suresh Siddha <suresh.b.siddha@intel.com>
6 */
7#include <linux/bitops.h>
8#include <linux/compat.h>
9#include <linux/cpu.h>
10#include <linux/mman.h>
11#include <linux/nospec.h>
12#include <linux/pkeys.h>
13#include <linux/seq_file.h>
14#include <linux/proc_fs.h>
15#include <linux/vmalloc.h>
16#include <linux/coredump.h>
17
18#include <asm/fpu/api.h>
19#include <asm/fpu/regset.h>
20#include <asm/fpu/signal.h>
21#include <asm/fpu/xcr.h>
22
23#include <asm/tlbflush.h>
24#include <asm/prctl.h>
25#include <asm/elf.h>
26
27#include <uapi/asm/elf.h>
28
29#include "context.h"
30#include "internal.h"
31#include "legacy.h"
32#include "xstate.h"
33
/*
 * Walk the bits set in @mask with @bit as the cursor, starting at
 * FIRST_EXTENDED_XFEATURE. @mask must be an lvalue as its address is
 * taken, and its size determines the upper bound of the walk.
 *
 * NOTE: This expands to an assignment followed by a loop statement, i.e.
 * to two statements. Do not use it as the unbraced body of an if, else or
 * loop.
 */
#define for_each_extended_xfeature(bit, mask)				\
	(bit) = FIRST_EXTENDED_XFEATURE;				\
	for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask))
37
/*
 * Human readable names of the xfeatures. cpu_has_xfeatures() indexes this
 * array with the bit number of a feature and clamps larger bit numbers to
 * the last entry.
 *
 * Although we spell it out in here, the Processor Trace
 * xfeature is completely unused.  We use other mechanisms
 * to save/restore PT state in Linux.
 */
static const char *xfeature_names[] =
{
	"x87 floating point registers",
	"SSE registers",
	"AVX registers",
	"MPX bounds registers",
	"MPX CSR",
	"AVX-512 opmask",
	"AVX-512 Hi256",
	"AVX-512 ZMM_Hi256",
	"Processor Trace (unused)",
	"Protection Keys User registers",
	"PASID state",
	"Control-flow User registers",
	"Control-flow Kernel registers (unused)",
	"unknown xstate feature",
	"unknown xstate feature",
	"unknown xstate feature",
	"unknown xstate feature",
	"AMX Tile config",
	"AMX Tile data",
	"unknown xstate feature",
};
66
/*
 * Maps an xfeature number to an X86_FEATURE_* flag. Init data only.
 * xfeatures without an entry here read as 0.
 * NOTE(review): presumably used during boot to drop xfeatures whose CPUID
 * feature is absent - the user is outside this chunk, confirm there.
 */
static unsigned short xsave_cpuid_features[] __initdata = {
	[XFEATURE_FP]				= X86_FEATURE_FPU,
	[XFEATURE_SSE]				= X86_FEATURE_XMM,
	[XFEATURE_YMM]				= X86_FEATURE_AVX,
	[XFEATURE_BNDREGS]			= X86_FEATURE_MPX,
	[XFEATURE_BNDCSR]			= X86_FEATURE_MPX,
	[XFEATURE_OPMASK]			= X86_FEATURE_AVX512F,
	[XFEATURE_ZMM_Hi256]			= X86_FEATURE_AVX512F,
	[XFEATURE_Hi16_ZMM]			= X86_FEATURE_AVX512F,
	[XFEATURE_PT_UNIMPLEMENTED_SO_FAR]	= X86_FEATURE_INTEL_PT,
	[XFEATURE_PKRU]				= X86_FEATURE_OSPKE,
	[XFEATURE_PASID]			= X86_FEATURE_ENQCMD,
	[XFEATURE_CET_USER]			= X86_FEATURE_SHSTK,
	[XFEATURE_XTILE_CFG]			= X86_FEATURE_AMX_TILE,
	[XFEATURE_XTILE_DATA]			= X86_FEATURE_AMX_TILE,
};
83
/*
 * Per xfeature cache, filled in by setup_xstate_cache():
 *
 * xstate_offsets: Offset of the component as reported in CPUID EBX. Stays
 *		   at -1 (i.e. UINT_MAX) for supervisor states and for
 *		   features which are not enumerated.
 * xstate_sizes:   Size of the component in bytes as reported in CPUID EAX.
 * xstate_flags:   CPUID ECX of the component, see the XSTATE_FLAG_* bits.
 */
static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
	{ [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
	{ [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init;

/* Bits in xstate_flags[]: supervisor state / 64 byte alignment required */
#define XSTATE_FLAG_SUPERVISOR	BIT(0)
#define XSTATE_FLAG_ALIGNED64	BIT(1)
92
93/*
94 * Return whether the system supports a given xfeature.
95 *
96 * Also return the name of the (most advanced) feature that the caller requested:
97 */
98int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
99{
100 u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features;
101
102 if (unlikely(feature_name)) {
103 long xfeature_idx, max_idx;
104 u64 xfeatures_print;
105 /*
106 * So we use FLS here to be able to print the most advanced
107 * feature that was requested but is missing. So if a driver
108 * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
109 * missing AVX feature - this is the most informative message
110 * to users:
111 */
112 if (xfeatures_missing)
113 xfeatures_print = xfeatures_missing;
114 else
115 xfeatures_print = xfeatures_needed;
116
117 xfeature_idx = fls64(xfeatures_print)-1;
118 max_idx = ARRAY_SIZE(xfeature_names)-1;
119 xfeature_idx = min(xfeature_idx, max_idx);
120
121 *feature_name = xfeature_names[xfeature_idx];
122 }
123
124 if (xfeatures_missing)
125 return 0;
126
127 return 1;
128}
129EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
130
131static bool xfeature_is_aligned64(int xfeature_nr)
132{
133 return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64;
134}
135
136static bool xfeature_is_supervisor(int xfeature_nr)
137{
138 return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR;
139}
140
141static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature)
142{
143 unsigned int offs, i;
144
145 /*
146 * Non-compacted format and legacy features use the cached fixed
147 * offsets.
148 */
149 if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) ||
150 xfeature <= XFEATURE_SSE)
151 return xstate_offsets[xfeature];
152
153 /*
154 * Compacted format offsets depend on the actual content of the
155 * compacted xsave area which is determined by the xcomp_bv header
156 * field.
157 */
158 offs = FXSAVE_SIZE + XSAVE_HDR_SIZE;
159 for_each_extended_xfeature(i, xcomp_bv) {
160 if (xfeature_is_aligned64(i))
161 offs = ALIGN(offs, 64);
162 if (i == xfeature)
163 break;
164 offs += xstate_sizes[i];
165 }
166 return offs;
167}
168
169/*
170 * Enable the extended processor state save/restore feature.
171 * Called once per CPU onlining.
172 */
173void fpu__init_cpu_xstate(void)
174{
175 if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features)
176 return;
177
178 cr4_set_bits(X86_CR4_OSXSAVE);
179
180 /*
181 * Must happen after CR4 setup and before xsetbv() to allow KVM
182 * lazy passthrough. Write independent of the dynamic state static
183 * key as that does not work on the boot CPU. This also ensures
184 * that any stale state is wiped out from XFD. Reset the per CPU
185 * xfd cache too.
186 */
187 if (cpu_feature_enabled(X86_FEATURE_XFD))
188 xfd_set_state(init_fpstate.xfd);
189
190 /*
191 * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
192 * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user
193 * states can be set here.
194 */
195 xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
196
197 /*
198 * MSR_IA32_XSS sets supervisor states managed by XSAVES.
199 */
200 if (boot_cpu_has(X86_FEATURE_XSAVES)) {
201 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
202 xfeatures_mask_independent());
203 }
204}
205
206static bool xfeature_enabled(enum xfeature xfeature)
207{
208 return fpu_kernel_cfg.max_features & BIT_ULL(xfeature);
209}
210
211/*
212 * Record the offsets and sizes of various xstates contained
213 * in the XSAVE state memory layout.
214 */
215static void __init setup_xstate_cache(void)
216{
217 u32 eax, ebx, ecx, edx, i;
218 /* start at the beginning of the "extended state" */
219 unsigned int last_good_offset = offsetof(struct xregs_state,
220 extended_state_area);
221 /*
222 * The FP xstates and SSE xstates are legacy states. They are always
223 * in the fixed offsets in the xsave area in either compacted form
224 * or standard form.
225 */
226 xstate_offsets[XFEATURE_FP] = 0;
227 xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state,
228 xmm_space);
229
230 xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP];
231 xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state,
232 xmm_space);
233
234 for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
235 cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
236
237 xstate_sizes[i] = eax;
238 xstate_flags[i] = ecx;
239
240 /*
241 * If an xfeature is supervisor state, the offset in EBX is
242 * invalid, leave it to -1.
243 */
244 if (xfeature_is_supervisor(i))
245 continue;
246
247 xstate_offsets[i] = ebx;
248
249 /*
250 * In our xstate size checks, we assume that the highest-numbered
251 * xstate feature has the highest offset in the buffer. Ensure
252 * it does.
253 */
254 WARN_ONCE(last_good_offset > xstate_offsets[i],
255 "x86/fpu: misordered xstate at %d\n", last_good_offset);
256
257 last_good_offset = xstate_offsets[i];
258 }
259}
260
261static void __init print_xstate_feature(u64 xstate_mask)
262{
263 const char *feature_name;
264
265 if (cpu_has_xfeatures(xstate_mask, &feature_name))
266 pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name);
267}
268
269/*
270 * Print out all the supported xstate features:
271 */
272static void __init print_xstate_features(void)
273{
274 print_xstate_feature(XFEATURE_MASK_FP);
275 print_xstate_feature(XFEATURE_MASK_SSE);
276 print_xstate_feature(XFEATURE_MASK_YMM);
277 print_xstate_feature(XFEATURE_MASK_BNDREGS);
278 print_xstate_feature(XFEATURE_MASK_BNDCSR);
279 print_xstate_feature(XFEATURE_MASK_OPMASK);
280 print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
281 print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
282 print_xstate_feature(XFEATURE_MASK_PKRU);
283 print_xstate_feature(XFEATURE_MASK_PASID);
284 print_xstate_feature(XFEATURE_MASK_CET_USER);
285 print_xstate_feature(XFEATURE_MASK_XTILE_CFG);
286 print_xstate_feature(XFEATURE_MASK_XTILE_DATA);
287}
288
/*
 * Sanity check that @nr is an extended xfeature *number*. This check is
 * important because it is easy to get XSTATE_* confused with
 * XSTATE_BIT_*.
 *
 * @nr is parenthesized so that compound expressions are compared as a
 * whole. Note that it is still evaluated twice.
 */
#define CHECK_XFEATURE(nr) do {				\
	WARN_ON((nr) < FIRST_EXTENDED_XFEATURE);	\
	WARN_ON((nr) >= XFEATURE_MAX);			\
} while (0)
297
298/*
299 * Print out xstate component offsets and sizes
300 */
301static void __init print_xstate_offset_size(void)
302{
303 int i;
304
305 for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
306 pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
307 i, xfeature_get_offset(fpu_kernel_cfg.max_features, i),
308 i, xstate_sizes[i]);
309 }
310}
311
312/*
313 * This function is called only during boot time when x86 caps are not set
314 * up and alternative can not be used yet.
315 */
316static __init void os_xrstor_booting(struct xregs_state *xstate)
317{
318 u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE;
319 u32 lmask = mask;
320 u32 hmask = mask >> 32;
321 int err;
322
323 if (cpu_feature_enabled(X86_FEATURE_XSAVES))
324 XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
325 else
326 XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
327
328 /*
329 * We should never fault when copying from a kernel buffer, and the FPU
330 * state we set at boot time should be valid.
331 */
332 WARN_ON_FPU(err);
333}
334
/*
 * Explicit list of the features whose init state is either all zeros
 * or is handled individually in setup_init_fpu(). It is spelled out
 * instead of reusing XFEATURE_MASK*SUPPORTED on purpose: a newly added
 * supported feature then trips a build-time check and forces people to
 * actually look at the init state of that feature.
 */
#define XFEATURES_INIT_FPSTATE_HANDLED				\
	(XFEATURE_MASK_FP	 | XFEATURE_MASK_SSE	  |	\
	 XFEATURE_MASK_YMM	 | XFEATURE_MASK_OPMASK	  |	\
	 XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM |	\
	 XFEATURE_MASK_PKRU	 | XFEATURE_MASK_BNDREGS  |	\
	 XFEATURE_MASK_BNDCSR	 | XFEATURE_MASK_PASID	  |	\
	 XFEATURE_MASK_CET_USER	 | XFEATURE_MASK_XTILE)
355
356/*
357 * setup the xstate image representing the init state
358 */
359static void __init setup_init_fpu_buf(void)
360{
361 BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
362 XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
363 XFEATURES_INIT_FPSTATE_HANDLED);
364
365 if (!boot_cpu_has(X86_FEATURE_XSAVE))
366 return;
367
368 print_xstate_features();
369
370 xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures);
371
372 /*
373 * Init all the features state with header.xfeatures being 0x0
374 */
375 os_xrstor_booting(&init_fpstate.regs.xsave);
376
377 /*
378 * All components are now in init state. Read the state back so
379 * that init_fpstate contains all non-zero init state. This only
380 * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because
381 * those use the init optimization which skips writing data for
382 * components in init state.
383 *
384 * XSAVE could be used, but that would require to reshuffle the
385 * data when XSAVEC/S is available because XSAVEC/S uses xstate
386 * compaction. But doing so is a pointless exercise because most
387 * components have an all zeros init state except for the legacy
388 * ones (FP and SSE). Those can be saved with FXSAVE into the
389 * legacy area. Adding new features requires to ensure that init
390 * state is all zeroes or if not to add the necessary handling
391 * here.
392 */
393 fxsave(&init_fpstate.regs.fxsave);
394}
395
396int xfeature_size(int xfeature_nr)
397{
398 u32 eax, ebx, ecx, edx;
399
400 CHECK_XFEATURE(xfeature_nr);
401 cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
402 return eax;
403}
404
405/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
406static int validate_user_xstate_header(const struct xstate_header *hdr,
407 struct fpstate *fpstate)
408{
409 /* No unknown or supervisor features may be set */
410 if (hdr->xfeatures & ~fpstate->user_xfeatures)
411 return -EINVAL;
412
413 /* Userspace must use the uncompacted format */
414 if (hdr->xcomp_bv)
415 return -EINVAL;
416
417 /*
418 * If 'reserved' is shrunken to add a new field, make sure to validate
419 * that new field here!
420 */
421 BUILD_BUG_ON(sizeof(hdr->reserved) != 48);
422
423 /* No reserved bits may be set */
424 if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
425 return -EINVAL;
426
427 return 0;
428}
429
430static void __init __xstate_dump_leaves(void)
431{
432 int i;
433 u32 eax, ebx, ecx, edx;
434 static int should_dump = 1;
435
436 if (!should_dump)
437 return;
438 should_dump = 0;
439 /*
440 * Dump out a few leaves past the ones that we support
441 * just in case there are some goodies up there
442 */
443 for (i = 0; i < XFEATURE_MAX + 10; i++) {
444 cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
445 pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
446 XSTATE_CPUID, i, eax, ebx, ecx, edx);
447 }
448}
449
/*
 * Warn once about an XSAVE consistency problem and, when the warning
 * condition holds, dump the CPUID leaves to help diagnose it.
 */
#define XSTATE_WARN_ON(x, fmt, ...)					\
do {									\
	if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) \
		__xstate_dump_leaves();					\
} while (0)
455
/*
 * Compare the CPU-reported size @sz of xfeature @nr with the size of
 * the C structure @__struct. A mismatch warns once and dumps the CPUID
 * leaves. The expression always evaluates to true.
 *
 * @sz is parenthesized so that compound expressions compare correctly.
 */
#define XCHECK_SZ(sz, nr, __struct) ({					\
	if (WARN_ONCE((sz) != sizeof(__struct),				\
	    "[%s]: struct is %zu bytes, cpu state %d bytes\n",		\
	    xfeature_names[nr], sizeof(__struct), (sz))) {		\
		__xstate_dump_leaves();					\
	}								\
	true;								\
})
464
465
466/**
467 * check_xtile_data_against_struct - Check tile data state size.
468 *
469 * Calculate the state size by multiplying the single tile size which is
470 * recorded in a C struct, and the number of tiles that the CPU informs.
471 * Compare the provided size with the calculation.
472 *
473 * @size: The tile data state size
474 *
475 * Returns: 0 on success, -EINVAL on mismatch.
476 */
477static int __init check_xtile_data_against_struct(int size)
478{
479 u32 max_palid, palid, state_size;
480 u32 eax, ebx, ecx, edx;
481 u16 max_tile;
482
483 /*
484 * Check the maximum palette id:
485 * eax: the highest numbered palette subleaf.
486 */
487 cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx);
488
489 /*
490 * Cross-check each tile size and find the maximum number of
491 * supported tiles.
492 */
493 for (palid = 1, max_tile = 0; palid <= max_palid; palid++) {
494 u16 tile_size, max;
495
496 /*
497 * Check the tile size info:
498 * eax[31:16]: bytes per title
499 * ebx[31:16]: the max names (or max number of tiles)
500 */
501 cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx);
502 tile_size = eax >> 16;
503 max = ebx >> 16;
504
505 if (tile_size != sizeof(struct xtile_data)) {
506 pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n",
507 __stringify(XFEATURE_XTILE_DATA),
508 sizeof(struct xtile_data), tile_size);
509 __xstate_dump_leaves();
510 return -EINVAL;
511 }
512
513 if (max > max_tile)
514 max_tile = max;
515 }
516
517 state_size = sizeof(struct xtile_data) * max_tile;
518 if (size != state_size) {
519 pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n",
520 __stringify(XFEATURE_XTILE_DATA), state_size, size);
521 __xstate_dump_leaves();
522 return -EINVAL;
523 }
524 return 0;
525}
526
527/*
528 * We have a C struct for each 'xstate'. We need to ensure
529 * that our software representation matches what the CPU
530 * tells us about the state's size.
531 */
532static bool __init check_xstate_against_struct(int nr)
533{
534 /*
535 * Ask the CPU for the size of the state.
536 */
537 int sz = xfeature_size(nr);
538
539 /*
540 * Match each CPU state with the corresponding software
541 * structure.
542 */
543 switch (nr) {
544 case XFEATURE_YMM: return XCHECK_SZ(sz, nr, struct ymmh_struct);
545 case XFEATURE_BNDREGS: return XCHECK_SZ(sz, nr, struct mpx_bndreg_state);
546 case XFEATURE_BNDCSR: return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state);
547 case XFEATURE_OPMASK: return XCHECK_SZ(sz, nr, struct avx_512_opmask_state);
548 case XFEATURE_ZMM_Hi256: return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state);
549 case XFEATURE_Hi16_ZMM: return XCHECK_SZ(sz, nr, struct avx_512_hi16_state);
550 case XFEATURE_PKRU: return XCHECK_SZ(sz, nr, struct pkru_state);
551 case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state);
552 case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg);
553 case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state);
554 case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true;
555 default:
556 XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
557 return false;
558 }
559
560 return true;
561}
562
563static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
564{
565 unsigned int topmost = fls64(xfeatures) - 1;
566 unsigned int offset = xstate_offsets[topmost];
567
568 if (topmost <= XFEATURE_SSE)
569 return sizeof(struct xregs_state);
570
571 if (compacted)
572 offset = xfeature_get_offset(xfeatures, topmost);
573 return offset + xstate_sizes[topmost];
574}
575
576/*
577 * This essentially double-checks what the cpu told us about
578 * how large the XSAVE buffer needs to be. We are recalculating
579 * it to be safe.
580 *
581 * Independent XSAVE features allocate their own buffers and are not
582 * covered by these checks. Only the size of the buffer for task->fpu
583 * is checked here.
584 */
585static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
586{
587 bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
588 bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES);
589 unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
590 int i;
591
592 for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
593 if (!check_xstate_against_struct(i))
594 return false;
595 /*
596 * Supervisor state components can be managed only by
597 * XSAVES.
598 */
599 if (!xsaves && xfeature_is_supervisor(i)) {
600 XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i);
601 return false;
602 }
603 }
604 size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
605 XSTATE_WARN_ON(size != kernel_size,
606 "size %u != kernel_size %u\n", size, kernel_size);
607 return size == kernel_size;
608}
609
610/*
611 * Get total size of enabled xstates in XCR0 | IA32_XSS.
612 *
613 * Note the SDM's wording here. "sub-function 0" only enumerates
614 * the size of the *user* states. If we use it to size a buffer
615 * that we use 'XSAVES' on, we could potentially overflow the
616 * buffer because 'XSAVES' saves system states too.
617 *
618 * This also takes compaction into account. So this works for
619 * XSAVEC as well.
620 */
621static unsigned int __init get_compacted_size(void)
622{
623 unsigned int eax, ebx, ecx, edx;
624 /*
625 * - CPUID function 0DH, sub-function 1:
626 * EBX enumerates the size (in bytes) required by
627 * the XSAVES instruction for an XSAVE area
628 * containing all the state components
629 * corresponding to bits currently set in
630 * XCR0 | IA32_XSS.
631 *
632 * When XSAVES is not available but XSAVEC is (virt), then there
633 * are no supervisor states, but XSAVEC still uses compacted
634 * format.
635 */
636 cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
637 return ebx;
638}
639
640/*
641 * Get the total size of the enabled xstates without the independent supervisor
642 * features.
643 */
644static unsigned int __init get_xsave_compacted_size(void)
645{
646 u64 mask = xfeatures_mask_independent();
647 unsigned int size;
648
649 if (!mask)
650 return get_compacted_size();
651
652 /* Disable independent features. */
653 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
654
655 /*
656 * Ask the hardware what size is required of the buffer.
657 * This is the size required for the task->fpu buffer.
658 */
659 size = get_compacted_size();
660
661 /* Re-enable independent features so XSAVES will work on them again. */
662 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);
663
664 return size;
665}
666
667static unsigned int __init get_xsave_size_user(void)
668{
669 unsigned int eax, ebx, ecx, edx;
670 /*
671 * - CPUID function 0DH, sub-function 0:
672 * EBX enumerates the size (in bytes) required by
673 * the XSAVE instruction for an XSAVE area
674 * containing all the *user* state components
675 * corresponding to bits currently set in XCR0.
676 */
677 cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
678 return ebx;
679}
680
681static int __init init_xstate_size(void)
682{
683 /* Recompute the context size for enabled features: */
684 unsigned int user_size, kernel_size, kernel_default_size;
685 bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
686
687 /* Uncompacted user space size */
688 user_size = get_xsave_size_user();
689
690 /*
691 * XSAVES kernel size includes supervisor states and uses compacted
692 * format. XSAVEC uses compacted format, but does not save
693 * supervisor states.
694 *
695 * XSAVE[OPT] do not support supervisor states so kernel and user
696 * size is identical.
697 */
698 if (compacted)
699 kernel_size = get_xsave_compacted_size();
700 else
701 kernel_size = user_size;
702
703 kernel_default_size =
704 xstate_calculate_size(fpu_kernel_cfg.default_features, compacted);
705
706 if (!paranoid_xstate_size_valid(kernel_size))
707 return -EINVAL;
708
709 fpu_kernel_cfg.max_size = kernel_size;
710 fpu_user_cfg.max_size = user_size;
711
712 fpu_kernel_cfg.default_size = kernel_default_size;
713 fpu_user_cfg.default_size =
714 xstate_calculate_size(fpu_user_cfg.default_features, false);
715
716 return 0;
717}
718
719/*
720 * We enabled the XSAVE hardware, but something went wrong and
721 * we can not use it. Disable it.
722 */
723static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
724{
725 fpu_kernel_cfg.max_features = 0;
726 cr4_clear_bits(X86_CR4_OSXSAVE);
727 setup_clear_cpu_cap(X86_FEATURE_XSAVE);
728
729 /* Restore the legacy size.*/
730 fpu_kernel_cfg.max_size = legacy_size;
731 fpu_kernel_cfg.default_size = legacy_size;
732 fpu_user_cfg.max_size = legacy_size;
733 fpu_user_cfg.default_size = legacy_size;
734
735 /*
736 * Prevent enabling the static branch which enables writes to the
737 * XFD MSR.
738 */
739 init_fpstate.xfd = 0;
740
741 fpstate_reset(¤t->thread.fpu);
742}
743
744/*
745 * Enable and initialize the xsave feature.
746 * Called once per system bootup.
747 */
748void __init fpu__init_system_xstate(unsigned int legacy_size)
749{
750 unsigned int eax, ebx, ecx, edx;
751 u64 xfeatures;
752 int err;
753 int i;
754
755 if (!boot_cpu_has(X86_FEATURE_FPU)) {
756 pr_info("x86/fpu: No FPU detected\n");
757 return;
758 }
759
760 if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
761 pr_info("x86/fpu: x87 FPU will use %s\n",
762 boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
763 return;
764 }
765
766 if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
767 WARN_ON_FPU(1);
768 return;
769 }
770
771 /*
772 * Find user xstates supported by the processor.
773 */
774 cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
775 fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);
776
777 /*
778 * Find supervisor xstates supported by the processor.
779 */
780 cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
781 fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);
782
783 if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
784 /*
785 * This indicates that something really unexpected happened
786 * with the enumeration. Disable XSAVE and try to continue
787 * booting without it. This is too early to BUG().
788 */
789 pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
790 fpu_kernel_cfg.max_features);
791 goto out_disable;
792 }
793
794 fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features &
795 XFEATURE_MASK_INDEPENDENT;
796
797 /*
798 * Clear XSAVE features that are disabled in the normal CPUID.
799 */
800 for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
801 unsigned short cid = xsave_cpuid_features[i];
802
803 /* Careful: X86_FEATURE_FPU is 0! */
804 if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid))
805 fpu_kernel_cfg.max_features &= ~BIT_ULL(i);
806 }
807
808 if (!cpu_feature_enabled(X86_FEATURE_XFD))
809 fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;
810
811 if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
812 fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
813 else
814 fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
815 XFEATURE_MASK_SUPERVISOR_SUPPORTED;
816
817 fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
818 fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
819
820 /* Clean out dynamic features from default */
821 fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features;
822 fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
823
824 fpu_user_cfg.default_features = fpu_user_cfg.max_features;
825 fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;
826
827 /* Store it for paranoia check at the end */
828 xfeatures = fpu_kernel_cfg.max_features;
829
830 /*
831 * Initialize the default XFD state in initfp_state and enable the
832 * dynamic sizing mechanism if dynamic states are available. The
833 * static key cannot be enabled here because this runs before
834 * jump_label_init(). This is delayed to an initcall.
835 */
836 init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;
837
838 /* Set up compaction feature bit */
839 if (cpu_feature_enabled(X86_FEATURE_XSAVEC) ||
840 cpu_feature_enabled(X86_FEATURE_XSAVES))
841 setup_force_cpu_cap(X86_FEATURE_XCOMPACTED);
842
843 /* Enable xstate instructions to be able to continue with initialization: */
844 fpu__init_cpu_xstate();
845
846 /* Cache size, offset and flags for initialization */
847 setup_xstate_cache();
848
849 err = init_xstate_size();
850 if (err)
851 goto out_disable;
852
853 /* Reset the state for the current task */
854 fpstate_reset(¤t->thread.fpu);
855
856 /*
857 * Update info used for ptrace frames; use standard-format size and no
858 * supervisor xstates:
859 */
860 update_regset_xstate_info(fpu_user_cfg.max_size,
861 fpu_user_cfg.max_features);
862
863 /*
864 * init_fpstate excludes dynamic states as they are large but init
865 * state is zero.
866 */
867 init_fpstate.size = fpu_kernel_cfg.default_size;
868 init_fpstate.xfeatures = fpu_kernel_cfg.default_features;
869
870 if (init_fpstate.size > sizeof(init_fpstate.regs)) {
871 pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n",
872 sizeof(init_fpstate.regs), init_fpstate.size);
873 goto out_disable;
874 }
875
876 setup_init_fpu_buf();
877
878 /*
879 * Paranoia check whether something in the setup modified the
880 * xfeatures mask.
881 */
882 if (xfeatures != fpu_kernel_cfg.max_features) {
883 pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n",
884 xfeatures, fpu_kernel_cfg.max_features);
885 goto out_disable;
886 }
887
888 /*
889 * CPU capabilities initialization runs before FPU init. So
890 * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely
891 * functional, set the feature bit so depending code works.
892 */
893 setup_force_cpu_cap(X86_FEATURE_OSXSAVE);
894
895 print_xstate_offset_size();
896 pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
897 fpu_kernel_cfg.max_features,
898 fpu_kernel_cfg.max_size,
899 boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard");
900 return;
901
902out_disable:
903 /* something went wrong, try to boot without any XSAVE support */
904 fpu__init_disable_system_xstate(legacy_size);
905}
906
907/*
908 * Restore minimal FPU state after suspend:
909 */
910void fpu__resume_cpu(void)
911{
912 /*
913 * Restore XCR0 on xsave capable CPUs:
914 */
915 if (cpu_feature_enabled(X86_FEATURE_XSAVE))
916 xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
917
918 /*
919 * Restore IA32_XSS. The same CPUID bit enumerates support
920 * of XSAVES and MSR_IA32_XSS.
921 */
922 if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
923 wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
924 xfeatures_mask_independent());
925 }
926
927 if (fpu_state_size_dynamic())
928 wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd);
929}
930
931/*
932 * Given an xstate feature nr, calculate where in the xsave
933 * buffer the state is. Callers should ensure that the buffer
934 * is valid.
935 */
936static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
937{
938 u64 xcomp_bv = xsave->header.xcomp_bv;
939
940 if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
941 return NULL;
942
943 if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) {
944 if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr))))
945 return NULL;
946 }
947
948 return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr);
949}
950
951/*
952 * Given the xsave area and a state inside, this function returns the
953 * address of the state.
954 *
955 * This is the API that is called to get xstate address in either
956 * standard format or compacted format of xsave area.
957 *
958 * Note that if there is no data for the field in the xsave buffer
959 * this will return NULL.
960 *
961 * Inputs:
962 * xstate: the thread's storage area for all FPU data
963 * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
964 * XFEATURE_SSE, etc...)
965 * Output:
966 * address of the state in the xsave area, or NULL if the
967 * field is not present in the xsave buffer.
968 */
969void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
970{
971 /*
972 * Do we even *have* xsave state?
973 */
974 if (!boot_cpu_has(X86_FEATURE_XSAVE))
975 return NULL;
976
977 /*
978 * We should not ever be requesting features that we
979 * have not enabled.
980 */
981 if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
982 return NULL;
983
984 /*
985 * This assumes the last 'xsave*' instruction to
986 * have requested that 'xfeature_nr' be saved.
987 * If it did not, we might be seeing and old value
988 * of the field in the buffer.
989 *
990 * This can happen because the last 'xsave' did not
991 * request that this feature be saved (unlikely)
992 * or because the "init optimization" caused it
993 * to not be saved.
994 */
995 if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr)))
996 return NULL;
997
998 return __raw_xsave_addr(xsave, xfeature_nr);
999}
1000EXPORT_SYMBOL_GPL(get_xsave_addr);
1001
1002/*
1003 * Given an xstate feature nr, calculate where in the xsave buffer the state is.
1004 * The xsave buffer should be in standard format, not compacted (e.g. user mode
1005 * signal frames).
1006 */
1007void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr)
1008{
1009 if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
1010 return NULL;
1011
1012 return (void __user *)xsave + xstate_offsets[xfeature_nr];
1013}
1014
#ifdef CONFIG_ARCH_HAS_PKEYS

/*
 * This will go out and modify PKRU register to set the access
 * rights for @pkey to @init_val.
 *
 * @tsk is not used by this implementation.
 *
 * Returns 0 on success, -EINVAL if OSPKE is not enabled or @pkey is
 * out of range.
 */
int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
			      unsigned long init_val)
{
	u32 old_pkru, new_pkru_bits = 0;
	int pkey_shift;

	/*
	 * This check implies XSAVE support.  OSPKE only gets
	 * set if we enable XSAVE and we enable PKU in XCR0.
	 */
	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
		return -EINVAL;

	/*
	 * This code should only be called with valid 'pkey'
	 * values originating from in-kernel users.  Complain
	 * if a bad value is observed. Negative values are rejected
	 * too, as they would produce a negative shift count below.
	 */
	if (WARN_ON_ONCE(pkey < 0 || pkey >= arch_max_pkey()))
		return -EINVAL;

	/* Set the bits we need in PKRU:  */
	if (init_val & PKEY_DISABLE_ACCESS)
		new_pkru_bits |= PKRU_AD_BIT;
	if (init_val & PKEY_DISABLE_WRITE)
		new_pkru_bits |= PKRU_WD_BIT;

	/* Shift the bits in to the correct place in PKRU for pkey: */
	pkey_shift = pkey * PKRU_BITS_PER_PKEY;
	new_pkru_bits <<= pkey_shift;

	/* Get old PKRU and mask off any old bits in place: */
	old_pkru = read_pkru();
	old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);

	/* Write old part along with new part: */
	write_pkru(old_pkru | new_pkru_bits);

	return 0;
}
#endif /* CONFIG_ARCH_HAS_PKEYS */
1062
1063static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
1064 void *init_xstate, unsigned int size)
1065{
1066 membuf_write(to, from_xstate ? xstate : init_xstate, size);
1067}
1068
1069/**
1070 * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
1071 * @to: membuf descriptor
1072 * @fpstate: The fpstate buffer from which to copy
1073 * @xfeatures: The mask of xfeatures to save (XSAVE mode only)
1074 * @pkru_val: The PKRU value to store in the PKRU component
1075 * @copy_mode: The requested copy mode
1076 *
1077 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
1078 * format, i.e. from the kernel internal hardware dependent storage format
1079 * to the requested @mode. UABI XSTATE is always uncompacted!
1080 *
1081 * It supports partial copy but @to.pos always starts from zero.
1082 */
1083void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
1084 u64 xfeatures, u32 pkru_val,
1085 enum xstate_copy_mode copy_mode)
1086{
1087 const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
1088 struct xregs_state *xinit = &init_fpstate.regs.xsave;
1089 struct xregs_state *xsave = &fpstate->regs.xsave;
1090 struct xstate_header header;
1091 unsigned int zerofrom;
1092 u64 mask;
1093 int i;
1094
1095 memset(&header, 0, sizeof(header));
1096 header.xfeatures = xsave->header.xfeatures;
1097
1098 /* Mask out the feature bits depending on copy mode */
1099 switch (copy_mode) {
1100 case XSTATE_COPY_FP:
1101 header.xfeatures &= XFEATURE_MASK_FP;
1102 break;
1103
1104 case XSTATE_COPY_FX:
1105 header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
1106 break;
1107
1108 case XSTATE_COPY_XSAVE:
1109 header.xfeatures &= fpstate->user_xfeatures & xfeatures;
1110 break;
1111 }
1112
1113 /* Copy FP state up to MXCSR */
1114 copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387,
1115 &xinit->i387, off_mxcsr);
1116
1117 /* Copy MXCSR when SSE or YMM are set in the feature mask */
1118 copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM),
1119 &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr,
1120 MXCSR_AND_FLAGS_SIZE);
1121
1122 /* Copy the remaining FP state */
1123 copy_feature(header.xfeatures & XFEATURE_MASK_FP,
1124 &to, &xsave->i387.st_space, &xinit->i387.st_space,
1125 sizeof(xsave->i387.st_space));
1126
1127 /* Copy the SSE state - shared with YMM, but independently managed */
1128 copy_feature(header.xfeatures & XFEATURE_MASK_SSE,
1129 &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space,
1130 sizeof(xsave->i387.xmm_space));
1131
1132 if (copy_mode != XSTATE_COPY_XSAVE)
1133 goto out;
1134
1135 /* Zero the padding area */
1136 membuf_zero(&to, sizeof(xsave->i387.padding));
1137
1138 /* Copy xsave->i387.sw_reserved */
1139 membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved));
1140
1141 /* Copy the user space relevant state of @xsave->header */
1142 membuf_write(&to, &header, sizeof(header));
1143
1144 zerofrom = offsetof(struct xregs_state, extended_state_area);
1145
1146 /*
1147 * This 'mask' indicates which states to copy from fpstate.
1148 * Those extended states that are not present in fpstate are
1149 * either disabled or initialized:
1150 *
1151 * In non-compacted format, disabled features still occupy
1152 * state space but there is no state to copy from in the
1153 * compacted init_fpstate. The gap tracking will zero these
1154 * states.
1155 *
1156 * The extended features have an all zeroes init state. Thus,
1157 * remove them from 'mask' to zero those features in the user
1158 * buffer instead of retrieving them from init_fpstate.
1159 */
1160 mask = header.xfeatures;
1161
1162 for_each_extended_xfeature(i, mask) {
1163 /*
1164 * If there was a feature or alignment gap, zero the space
1165 * in the destination buffer.
1166 */
1167 if (zerofrom < xstate_offsets[i])
1168 membuf_zero(&to, xstate_offsets[i] - zerofrom);
1169
1170 if (i == XFEATURE_PKRU) {
1171 struct pkru_state pkru = {0};
1172 /*
1173 * PKRU is not necessarily up to date in the
1174 * XSAVE buffer. Use the provided value.
1175 */
1176 pkru.pkru = pkru_val;
1177 membuf_write(&to, &pkru, sizeof(pkru));
1178 } else {
1179 membuf_write(&to,
1180 __raw_xsave_addr(xsave, i),
1181 xstate_sizes[i]);
1182 }
1183 /*
1184 * Keep track of the last copied state in the non-compacted
1185 * target buffer for gap zeroing.
1186 */
1187 zerofrom = xstate_offsets[i] + xstate_sizes[i];
1188 }
1189
1190out:
1191 if (to.left)
1192 membuf_zero(&to, to.left);
1193}
1194
1195/**
1196 * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
1197 * @to: membuf descriptor
1198 * @tsk: The task from which to copy the saved xstate
1199 * @copy_mode: The requested copy mode
1200 *
1201 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
1202 * format, i.e. from the kernel internal hardware dependent storage format
1203 * to the requested @mode. UABI XSTATE is always uncompacted!
1204 *
1205 * It supports partial copy but @to.pos always starts from zero.
1206 */
1207void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
1208 enum xstate_copy_mode copy_mode)
1209{
1210 __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate,
1211 tsk->thread.fpu.fpstate->user_xfeatures,
1212 tsk->thread.pkru, copy_mode);
1213}
1214
1215static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
1216 const void *kbuf, const void __user *ubuf)
1217{
1218 if (kbuf) {
1219 memcpy(dst, kbuf + offset, size);
1220 } else {
1221 if (copy_from_user(dst, ubuf + offset, size))
1222 return -EFAULT;
1223 }
1224 return 0;
1225}
1226
1227
1228/**
1229 * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate
1230 * @fpstate: The fpstate buffer to copy to
1231 * @kbuf: The UABI format buffer, if it comes from the kernel
1232 * @ubuf: The UABI format buffer, if it comes from userspace
1233 * @pkru: The location to write the PKRU value to
1234 *
1235 * Converts from the UABI format into the kernel internal hardware
1236 * dependent format.
1237 *
1238 * This function ultimately has three different callers with distinct PKRU
1239 * behavior.
1240 * 1. When called from sigreturn the PKRU register will be restored from
1241 * @fpstate via an XRSTOR. Correctly copying the UABI format buffer to
1242 * @fpstate is sufficient to cover this case, but the caller will also
1243 * pass a pointer to the thread_struct's pkru field in @pkru and updating
1244 * it is harmless.
1245 * 2. When called from ptrace the PKRU register will be restored from the
1246 * thread_struct's pkru field. A pointer to that is passed in @pkru.
1247 * The kernel will restore it manually, so the XRSTOR behavior that resets
1248 * the PKRU register to the hardware init value (0) if the corresponding
1249 * xfeatures bit is not set is emulated here.
1250 * 3. When called from KVM the PKRU register will be restored from the vcpu's
1251 * pkru field. A pointer to that is passed in @pkru. KVM hasn't used
1252 * XRSTOR and hasn't had the PKRU resetting behavior described above. To
1253 * preserve that KVM behavior, it passes NULL for @pkru if the xfeatures
1254 * bit is not set.
1255 */
1256static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
1257 const void __user *ubuf, u32 *pkru)
1258{
1259 struct xregs_state *xsave = &fpstate->regs.xsave;
1260 unsigned int offset, size;
1261 struct xstate_header hdr;
1262 u64 mask;
1263 int i;
1264
1265 offset = offsetof(struct xregs_state, header);
1266 if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
1267 return -EFAULT;
1268
1269 if (validate_user_xstate_header(&hdr, fpstate))
1270 return -EINVAL;
1271
1272 /* Validate MXCSR when any of the related features is in use */
1273 mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
1274 if (hdr.xfeatures & mask) {
1275 u32 mxcsr[2];
1276
1277 offset = offsetof(struct fxregs_state, mxcsr);
1278 if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf))
1279 return -EFAULT;
1280
1281 /* Reserved bits in MXCSR must be zero. */
1282 if (mxcsr[0] & ~mxcsr_feature_mask)
1283 return -EINVAL;
1284
1285 /* SSE and YMM require MXCSR even when FP is not in use. */
1286 if (!(hdr.xfeatures & XFEATURE_MASK_FP)) {
1287 xsave->i387.mxcsr = mxcsr[0];
1288 xsave->i387.mxcsr_mask = mxcsr[1];
1289 }
1290 }
1291
1292 for (i = 0; i < XFEATURE_MAX; i++) {
1293 mask = BIT_ULL(i);
1294
1295 if (hdr.xfeatures & mask) {
1296 void *dst = __raw_xsave_addr(xsave, i);
1297
1298 offset = xstate_offsets[i];
1299 size = xstate_sizes[i];
1300
1301 if (copy_from_buffer(dst, offset, size, kbuf, ubuf))
1302 return -EFAULT;
1303 }
1304 }
1305
1306 if (hdr.xfeatures & XFEATURE_MASK_PKRU) {
1307 struct pkru_state *xpkru;
1308
1309 xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU);
1310 *pkru = xpkru->pkru;
1311 } else {
1312 /*
1313 * KVM may pass NULL here to indicate that it does not need
1314 * PKRU updated.
1315 */
1316 if (pkru)
1317 *pkru = 0;
1318 }
1319
1320 /*
1321 * The state that came in from userspace was user-state only.
1322 * Mask all the user states out of 'xfeatures':
1323 */
1324 xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;
1325
1326 /*
1327 * Add back in the features that came in from userspace:
1328 */
1329 xsave->header.xfeatures |= hdr.xfeatures;
1330
1331 return 0;
1332}
1333
1334/*
1335 * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
1336 * format and copy to the target thread. Used by ptrace and KVM.
1337 */
1338int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru)
1339{
1340 return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru);
1341}
1342
1343/*
1344 * Convert from a sigreturn standard-format user-space buffer to kernel
1345 * XSAVE[S] format and copy to the target thread. This is called from the
1346 * sigreturn() and rt_sigreturn() system calls.
1347 */
1348int copy_sigframe_from_user_to_xstate(struct task_struct *tsk,
1349 const void __user *ubuf)
1350{
1351 return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru);
1352}
1353
1354static bool validate_independent_components(u64 mask)
1355{
1356 u64 xchk;
1357
1358 if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
1359 return false;
1360
1361 xchk = ~xfeatures_mask_independent();
1362
1363 if (WARN_ON_ONCE(!mask || mask & xchk))
1364 return false;
1365
1366 return true;
1367}
1368
1369/**
1370 * xsaves - Save selected components to a kernel xstate buffer
1371 * @xstate: Pointer to the buffer
1372 * @mask: Feature mask to select the components to save
1373 *
1374 * The @xstate buffer must be 64 byte aligned and correctly initialized as
1375 * XSAVES does not write the full xstate header. Before first use the
1376 * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer
1377 * can #GP.
1378 *
1379 * The feature mask must be a subset of the independent features.
1380 */
1381void xsaves(struct xregs_state *xstate, u64 mask)
1382{
1383 int err;
1384
1385 if (!validate_independent_components(mask))
1386 return;
1387
1388 XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err);
1389 WARN_ON_ONCE(err);
1390}
1391
1392/**
1393 * xrstors - Restore selected components from a kernel xstate buffer
1394 * @xstate: Pointer to the buffer
1395 * @mask: Feature mask to select the components to restore
1396 *
1397 * The @xstate buffer must be 64 byte aligned and correctly initialized
1398 * otherwise XRSTORS from that buffer can #GP.
1399 *
1400 * Proper usage is to restore the state which was saved with
1401 * xsaves() into @xstate.
1402 *
1403 * The feature mask must be a subset of the independent features.
1404 */
1405void xrstors(struct xregs_state *xstate, u64 mask)
1406{
1407 int err;
1408
1409 if (!validate_independent_components(mask))
1410 return;
1411
1412 XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err);
1413 WARN_ON_ONCE(err);
1414}
1415
1416#if IS_ENABLED(CONFIG_KVM)
1417void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature)
1418{
1419 void *addr = get_xsave_addr(&fps->regs.xsave, xfeature);
1420
1421 if (addr)
1422 memset(addr, 0, xstate_sizes[xfeature]);
1423}
1424EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
1425#endif
1426
1427#ifdef CONFIG_X86_64
1428
1429#ifdef CONFIG_X86_DEBUG_FPU
1430/*
1431 * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
1432 * can safely operate on the @fpstate buffer.
1433 */
1434static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
1435{
1436 u64 xfd = __this_cpu_read(xfd_state);
1437
1438 if (fpstate->xfd == xfd)
1439 return true;
1440
1441 /*
1442 * The XFD MSR does not match fpstate->xfd. That's invalid when
1443 * the passed in fpstate is current's fpstate.
1444 */
1445 if (fpstate->xfd == current->thread.fpu.fpstate->xfd)
1446 return false;
1447
1448 /*
1449 * XRSTOR(S) from init_fpstate are always correct as it will just
1450 * bring all components into init state and not read from the
1451 * buffer. XSAVE(S) raises #PF after init.
1452 */
1453 if (fpstate == &init_fpstate)
1454 return rstor;
1455
1456 /*
1457 * XSAVE(S): clone(), fpu_swap_kvm_fpstate()
1458 * XRSTORS(S): fpu_swap_kvm_fpstate()
1459 */
1460
1461 /*
1462 * No XSAVE/XRSTOR instructions (except XSAVE itself) touch
1463 * the buffer area for XFD-disabled state components.
1464 */
1465 mask &= ~xfd;
1466
1467 /*
1468 * Remove features which are valid in fpstate. They
1469 * have space allocated in fpstate.
1470 */
1471 mask &= ~fpstate->xfeatures;
1472
1473 /*
1474 * Any remaining state components in 'mask' might be written
1475 * by XSAVE/XRSTOR. Fail validation it found.
1476 */
1477 return !mask;
1478}
1479
1480void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
1481{
1482 WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
1483}
1484#endif /* CONFIG_X86_DEBUG_FPU */
1485
1486static int __init xfd_update_static_branch(void)
1487{
1488 /*
1489 * If init_fpstate.xfd has bits set then dynamic features are
1490 * available and the dynamic sizing must be enabled.
1491 */
1492 if (init_fpstate.xfd)
1493 static_branch_enable(&__fpu_state_size_dynamic);
1494 return 0;
1495}
1496arch_initcall(xfd_update_static_branch)
1497
1498void fpstate_free(struct fpu *fpu)
1499{
1500 if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate)
1501 vfree(fpu->fpstate);
1502}
1503
1504/**
1505 * fpstate_realloc - Reallocate struct fpstate for the requested new features
1506 *
1507 * @xfeatures: A bitmap of xstate features which extend the enabled features
1508 * of that task
1509 * @ksize: The required size for the kernel buffer
1510 * @usize: The required size for user space buffers
1511 * @guest_fpu: Pointer to a guest FPU container. NULL for host allocations
1512 *
1513 * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
1514 * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
1515 * with large states are likely to live longer.
1516 *
1517 * Returns: 0 on success, -ENOMEM on allocation error.
1518 */
1519static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
1520 unsigned int usize, struct fpu_guest *guest_fpu)
1521{
1522 struct fpu *fpu = ¤t->thread.fpu;
1523 struct fpstate *curfps, *newfps = NULL;
1524 unsigned int fpsize;
1525 bool in_use;
1526
1527 fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);
1528
1529 newfps = vzalloc(fpsize);
1530 if (!newfps)
1531 return -ENOMEM;
1532 newfps->size = ksize;
1533 newfps->user_size = usize;
1534 newfps->is_valloc = true;
1535
1536 /*
1537 * When a guest FPU is supplied, use @guest_fpu->fpstate
1538 * as reference independent whether it is in use or not.
1539 */
1540 curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate;
1541
1542 /* Determine whether @curfps is the active fpstate */
1543 in_use = fpu->fpstate == curfps;
1544
1545 if (guest_fpu) {
1546 newfps->is_guest = true;
1547 newfps->is_confidential = curfps->is_confidential;
1548 newfps->in_use = curfps->in_use;
1549 guest_fpu->xfeatures |= xfeatures;
1550 guest_fpu->uabi_size = usize;
1551 }
1552
1553 fpregs_lock();
1554 /*
1555 * If @curfps is in use, ensure that the current state is in the
1556 * registers before swapping fpstate as that might invalidate it
1557 * due to layout changes.
1558 */
1559 if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD))
1560 fpregs_restore_userregs();
1561
1562 newfps->xfeatures = curfps->xfeatures | xfeatures;
1563 newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
1564 newfps->xfd = curfps->xfd & ~xfeatures;
1565
1566 /* Do the final updates within the locked region */
1567 xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);
1568
1569 if (guest_fpu) {
1570 guest_fpu->fpstate = newfps;
1571 /* If curfps is active, update the FPU fpstate pointer */
1572 if (in_use)
1573 fpu->fpstate = newfps;
1574 } else {
1575 fpu->fpstate = newfps;
1576 }
1577
1578 if (in_use)
1579 xfd_update_state(fpu->fpstate);
1580 fpregs_unlock();
1581
1582 /* Only free valloc'ed state */
1583 if (curfps && curfps->is_valloc)
1584 vfree(curfps);
1585
1586 return 0;
1587}
1588
1589static int validate_sigaltstack(unsigned int usize)
1590{
1591 struct task_struct *thread, *leader = current->group_leader;
1592 unsigned long framesize = get_sigframe_size();
1593
1594 lockdep_assert_held(¤t->sighand->siglock);
1595
1596 /* get_sigframe_size() is based on fpu_user_cfg.max_size */
1597 framesize -= fpu_user_cfg.max_size;
1598 framesize += usize;
1599 for_each_thread(leader, thread) {
1600 if (thread->sas_ss_size && thread->sas_ss_size < framesize)
1601 return -ENOSPC;
1602 }
1603 return 0;
1604}
1605
1606static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
1607{
1608 /*
1609 * This deliberately does not exclude !XSAVES as we still might
1610 * decide to optionally context switch XCR0 or talk the silicon
1611 * vendors into extending XFD for the pre AMX states, especially
1612 * AVX512.
1613 */
1614 bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
1615 struct fpu *fpu = ¤t->group_leader->thread.fpu;
1616 struct fpu_state_perm *perm;
1617 unsigned int ksize, usize;
1618 u64 mask;
1619 int ret = 0;
1620
1621 /* Check whether fully enabled */
1622 if ((permitted & requested) == requested)
1623 return 0;
1624
1625 /* Calculate the resulting kernel state size */
1626 mask = permitted | requested;
1627 /* Take supervisor states into account on the host */
1628 if (!guest)
1629 mask |= xfeatures_mask_supervisor();
1630 ksize = xstate_calculate_size(mask, compacted);
1631
1632 /* Calculate the resulting user state size */
1633 mask &= XFEATURE_MASK_USER_SUPPORTED;
1634 usize = xstate_calculate_size(mask, false);
1635
1636 if (!guest) {
1637 ret = validate_sigaltstack(usize);
1638 if (ret)
1639 return ret;
1640 }
1641
1642 perm = guest ? &fpu->guest_perm : &fpu->perm;
1643 /* Pairs with the READ_ONCE() in xstate_get_group_perm() */
1644 WRITE_ONCE(perm->__state_perm, mask);
1645 /* Protected by sighand lock */
1646 perm->__state_size = ksize;
1647 perm->__user_state_size = usize;
1648 return ret;
1649}
1650
1651/*
1652 * Permissions array to map facilities with more than one component
1653 */
1654static const u64 xstate_prctl_req[XFEATURE_MAX] = {
1655 [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
1656};
1657
1658static int xstate_request_perm(unsigned long idx, bool guest)
1659{
1660 u64 permitted, requested;
1661 int ret;
1662
1663 if (idx >= XFEATURE_MAX)
1664 return -EINVAL;
1665
1666 /*
1667 * Look up the facility mask which can require more than
1668 * one xstate component.
1669 */
1670 idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req));
1671 requested = xstate_prctl_req[idx];
1672 if (!requested)
1673 return -EOPNOTSUPP;
1674
1675 if ((fpu_user_cfg.max_features & requested) != requested)
1676 return -EOPNOTSUPP;
1677
1678 /* Lockless quick check */
1679 permitted = xstate_get_group_perm(guest);
1680 if ((permitted & requested) == requested)
1681 return 0;
1682
1683 /* Protect against concurrent modifications */
1684 spin_lock_irq(¤t->sighand->siglock);
1685 permitted = xstate_get_group_perm(guest);
1686
1687 /* First vCPU allocation locks the permissions. */
1688 if (guest && (permitted & FPU_GUEST_PERM_LOCKED))
1689 ret = -EBUSY;
1690 else
1691 ret = __xstate_request_perm(permitted, requested, guest);
1692 spin_unlock_irq(¤t->sighand->siglock);
1693 return ret;
1694}
1695
1696int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu)
1697{
1698 u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
1699 struct fpu_state_perm *perm;
1700 unsigned int ksize, usize;
1701 struct fpu *fpu;
1702
1703 if (!xfd_event) {
1704 if (!guest_fpu)
1705 pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
1706 return 0;
1707 }
1708
1709 /* Protect against concurrent modifications */
1710 spin_lock_irq(¤t->sighand->siglock);
1711
1712 /* If not permitted let it die */
1713 if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) {
1714 spin_unlock_irq(¤t->sighand->siglock);
1715 return -EPERM;
1716 }
1717
1718 fpu = ¤t->group_leader->thread.fpu;
1719 perm = guest_fpu ? &fpu->guest_perm : &fpu->perm;
1720 ksize = perm->__state_size;
1721 usize = perm->__user_state_size;
1722
1723 /*
1724 * The feature is permitted. State size is sufficient. Dropping
1725 * the lock is safe here even if more features are added from
1726 * another task, the retrieved buffer sizes are valid for the
1727 * currently requested feature(s).
1728 */
1729 spin_unlock_irq(¤t->sighand->siglock);
1730
1731 /*
1732 * Try to allocate a new fpstate. If that fails there is no way
1733 * out.
1734 */
1735 if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu))
1736 return -EFAULT;
1737 return 0;
1738}
1739
1740int xfd_enable_feature(u64 xfd_err)
1741{
1742 return __xfd_enable_feature(xfd_err, NULL);
1743}
1744
1745#else /* CONFIG_X86_64 */
1746static inline int xstate_request_perm(unsigned long idx, bool guest)
1747{
1748 return -EPERM;
1749}
1750#endif /* !CONFIG_X86_64 */
1751
1752u64 xstate_get_guest_group_perm(void)
1753{
1754 return xstate_get_group_perm(true);
1755}
1756EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);
1757
1758/**
1759 * fpu_xstate_prctl - xstate permission operations
1760 * @option: A subfunction of arch_prctl()
1761 * @arg2: option argument
1762 * Return: 0 if successful; otherwise, an error code
1763 *
1764 * Option arguments:
1765 *
1766 * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info
1767 * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info
1768 * ARCH_REQ_XCOMP_PERM: Facility number requested
1769 *
1770 * For facilities which require more than one XSTATE component, the request
1771 * must be the highest state component number related to that facility,
1772 * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and
1773 * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
1774 */
1775long fpu_xstate_prctl(int option, unsigned long arg2)
1776{
1777 u64 __user *uptr = (u64 __user *)arg2;
1778 u64 permitted, supported;
1779 unsigned long idx = arg2;
1780 bool guest = false;
1781
1782 switch (option) {
1783 case ARCH_GET_XCOMP_SUPP:
1784 supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features;
1785 return put_user(supported, uptr);
1786
1787 case ARCH_GET_XCOMP_PERM:
1788 /*
1789 * Lockless snapshot as it can also change right after the
1790 * dropping the lock.
1791 */
1792 permitted = xstate_get_host_group_perm();
1793 permitted &= XFEATURE_MASK_USER_SUPPORTED;
1794 return put_user(permitted, uptr);
1795
1796 case ARCH_GET_XCOMP_GUEST_PERM:
1797 permitted = xstate_get_guest_group_perm();
1798 permitted &= XFEATURE_MASK_USER_SUPPORTED;
1799 return put_user(permitted, uptr);
1800
1801 case ARCH_REQ_XCOMP_GUEST_PERM:
1802 guest = true;
1803 fallthrough;
1804
1805 case ARCH_REQ_XCOMP_PERM:
1806 if (!IS_ENABLED(CONFIG_X86_64))
1807 return -EOPNOTSUPP;
1808
1809 return xstate_request_perm(idx, guest);
1810
1811 default:
1812 return -EINVAL;
1813 }
1814}
1815
1816#ifdef CONFIG_PROC_PID_ARCH_STATUS
1817/*
1818 * Report the amount of time elapsed in millisecond since last AVX512
1819 * use in the task.
1820 */
1821static void avx512_status(struct seq_file *m, struct task_struct *task)
1822{
1823 unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp);
1824 long delta;
1825
1826 if (!timestamp) {
1827 /*
1828 * Report -1 if no AVX512 usage
1829 */
1830 delta = -1;
1831 } else {
1832 delta = (long)(jiffies - timestamp);
1833 /*
1834 * Cap to LONG_MAX if time difference > LONG_MAX
1835 */
1836 if (delta < 0)
1837 delta = LONG_MAX;
1838 delta = jiffies_to_msecs(delta);
1839 }
1840
1841 seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta);
1842 seq_putc(m, '\n');
1843}
1844
1845/*
1846 * Report architecture specific information
1847 */
1848int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
1849 struct pid *pid, struct task_struct *task)
1850{
1851 /*
1852 * Report AVX512 state if the processor and build option supported.
1853 */
1854 if (cpu_feature_enabled(X86_FEATURE_AVX512F))
1855 avx512_status(m, task);
1856
1857 return 0;
1858}
1859#endif /* CONFIG_PROC_PID_ARCH_STATUS */
1860
1861#ifdef CONFIG_COREDUMP
/*
 * Name field of the xsave layout note. sizeof(owner_name), i.e. including
 * the terminating NUL, is used as the note's n_namesz.
 */
static const char owner_name[] = "LINUX";
1863
1864/*
1865 * Dump type, size, offset and flag values for every xfeature that is present.
1866 */
1867static int dump_xsave_layout_desc(struct coredump_params *cprm)
1868{
1869 int num_records = 0;
1870 int i;
1871
1872 for_each_extended_xfeature(i, fpu_user_cfg.max_features) {
1873 struct x86_xfeat_component xc = {
1874 .type = i,
1875 .size = xstate_sizes[i],
1876 .offset = xstate_offsets[i],
1877 /* reserved for future use */
1878 .flags = 0,
1879 };
1880
1881 if (!dump_emit(cprm, &xc, sizeof(xc)))
1882 return 0;
1883
1884 num_records++;
1885 }
1886 return num_records;
1887}
1888
1889static u32 get_xsave_desc_size(void)
1890{
1891 u32 cnt = 0;
1892 u32 i;
1893
1894 for_each_extended_xfeature(i, fpu_user_cfg.max_features)
1895 cnt++;
1896
1897 return cnt * (sizeof(struct x86_xfeat_component));
1898}
1899
1900int elf_coredump_extra_notes_write(struct coredump_params *cprm)
1901{
1902 int num_records = 0;
1903 struct elf_note en;
1904
1905 if (!fpu_user_cfg.max_features)
1906 return 0;
1907
1908 en.n_namesz = sizeof(owner_name);
1909 en.n_descsz = get_xsave_desc_size();
1910 en.n_type = NT_X86_XSAVE_LAYOUT;
1911
1912 if (!dump_emit(cprm, &en, sizeof(en)))
1913 return 1;
1914 if (!dump_emit(cprm, owner_name, en.n_namesz))
1915 return 1;
1916 if (!dump_align(cprm, 4))
1917 return 1;
1918
1919 num_records = dump_xsave_layout_desc(cprm);
1920 if (!num_records)
1921 return 1;
1922
1923 /* Total size should be equal to the number of records */
1924 if ((sizeof(struct x86_xfeat_component) * num_records) != en.n_descsz)
1925 return 1;
1926
1927 return 0;
1928}
1929
1930int elf_coredump_extra_notes_size(void)
1931{
1932 int size;
1933
1934 if (!fpu_user_cfg.max_features)
1935 return 0;
1936
1937 /* .note header */
1938 size = sizeof(struct elf_note);
1939 /* Name plus alignment to 4 bytes */
1940 size += roundup(sizeof(owner_name), 4);
1941 size += get_xsave_desc_size();
1942
1943 return size;
1944}
1945#endif /* CONFIG_COREDUMP */