/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * AES-XTS for modern x86_64 CPUs
 *
 * Copyright 2024 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

/*
 * This file implements AES-XTS for modern x86_64 CPUs.  To handle the
 * complexities of coding for x86 SIMD, e.g. where every vector length needs
 * different code, it uses a macro to generate several implementations that
 * share similar source code but are targeted at different CPUs, listed below:
 *
 * AES-NI + AVX
 *    - 128-bit vectors (1 AES block per vector)
 *    - VEX-coded instructions
 *    - xmm0-xmm15
 *    - This is for older CPUs that lack VAES but do have AVX.
 *
 * VAES + VPCLMULQDQ + AVX2
 *    - 256-bit vectors (2 AES blocks per vector)
 *    - VEX-coded instructions
 *    - ymm0-ymm15
 *    - This is for CPUs that have VAES but lack AVX512 or AVX10,
 *      e.g. Intel's Alder Lake and AMD's Zen 3.
 *
 * VAES + VPCLMULQDQ + AVX10/256 + BMI2
 *    - 256-bit vectors (2 AES blocks per vector)
 *    - EVEX-coded instructions
 *    - ymm0-ymm31
 *    - This is for CPUs that have AVX512 but where using zmm registers causes
 *      downclocking, and for CPUs that have AVX10/256 but not AVX10/512.
 *    - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256.
 *      To avoid confusion with 512-bit, we just write AVX10/256.
 *
 * VAES + VPCLMULQDQ + AVX10/512 + BMI2
 *    - Same as the previous one, but upgrades to 512-bit vectors
 *      (4 AES blocks per vector) in zmm0-zmm31.
 *    - This is for CPUs that have good AVX512 or AVX10/512 support.
 *
 * This file doesn't have an implementation for AES-NI alone (without AVX), as
 * the lack of VEX would make all the assembly code different.
 *
 * When we use VAES, we also use VPCLMULQDQ to parallelize the computation of
 * the XTS tweaks.  This avoids a bottleneck.  Currently there don't seem to be
 * any CPUs that support VAES but not VPCLMULQDQ.  If that changes, we might
 * need to start also providing an implementation using VAES alone.
 *
 * The AES-XTS implementations in this file support everything required by the
 * crypto API, including support for arbitrary input lengths and multi-part
 * processing.  However, they are most heavily optimized for the common case of
 * power-of-2 length inputs that are processed in a single part (disk sectors).
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>

.section .rodata
.p2align 4
.Lgf_poly:
	// The low 64 bits of this value represent the polynomial x^7 + x^2 + x
	// + 1.  It is the value that must be XOR'd into the low 64 bits of the
	// tweak each time a 1 is carried out of the high 64 bits.
	//
	// The high 64 bits of this value is just the internal carry bit that
	// exists when there's a carry out of the low 64 bits of the tweak.
	.quad	0x87, 1

	// This table contains constants for vpshufb and vpblendvb, used to
	// handle variable byte shifts and blending during ciphertext stealing
	// on CPUs that don't support AVX10-style masking.
	//
	// Layout: 16 bytes of 0x80, then the identity indices 0x00-0x0f, then
	// 16 more bytes of 0x80.  A 16-byte window read at byte offset LEN
	// (1 <= LEN <= 15) is [0x80 x (16-LEN), 0, 1, ..., LEN-1]; a window read
	// at byte offset 32-LEN is [16-LEN, ..., 15, 0x80 x (16-LEN)].  vpshufb
	// writes zero for any control byte with its top bit set, and vpblendvb
	// selects on that same top bit, so 0x80 serves both purposes.
.Lcts_permute_table:
	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
.text

// Function parameters
.set	KEY,		%rdi	// Initially points to crypto_aes_ctx, then is
				// advanced to point to 7th-from-last round key
.set	SRC,		%rsi	// Pointer to next source data
.set	DST,		%rdx	// Pointer to next destination data
.set	LEN,		%ecx	// Remaining length in bytes
.set	LEN8,		%cl	// Low 8 bits of LEN
.set	LEN64,		%rcx	// 64-bit register containing LEN
.set	TWEAK,		%r8	// Pointer to next tweak

// %rax holds the AES key length in bytes.
.set	KEYLEN,		%eax
.set	KEYLEN64,	%rax

// %r9-r11 are available as temporaries.

// Define the alias V\i for SIMD register number \i at the width selected by
// VL: %xmm\i (VL=16), %ymm\i (VL=32), or %zmm\i (VL=64).  Any other value of
// VL is rejected at assembly time.
.macro	_define_Vi	i
.if VL == 16
	.set	V\i,	%xmm\i
.elseif VL == 32
	.set	V\i,	%ymm\i
.elseif VL == 64
	.set	V\i,	%zmm\i
.else
	.error "Unsupported Vector Length (VL)"
.endif
.endm

// Define all the SIMD register aliases used by the code below, according to
// the current values of VL and USE_AVX10.  Must be invoked after VL and
// USE_AVX10 have been set and before any alias is used.
.macro	_define_aliases
	// Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers
	// are available, that map to the xmm, ymm, or zmm registers according
	// to the selected Vector Length (VL).
.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
	_define_Vi	\i
.endr
.if USE_AVX10
.irp i, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
	_define_Vi	\i
.endr
.endif

	// V0-V3 hold the data blocks during the main loop, or temporary values
	// otherwise.  V4-V5 hold temporary values.

	// V6-V9 hold XTS tweaks.  Each 128-bit lane holds one tweak.
	.set	TWEAK0_XMM,	%xmm6
	.set	TWEAK0,		V6
	.set	TWEAK1_XMM,	%xmm7
	.set	TWEAK1,		V7
	.set	TWEAK2,		V8
	.set	TWEAK3,		V9

	// V10-V13 are used for computing the next values of TWEAK[0-3].
	.set	NEXT_TWEAK0,	V10
	.set	NEXT_TWEAK1,	V11
	.set	NEXT_TWEAK2,	V12
	.set	NEXT_TWEAK3,	V13

	// V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes.
	.set	GF_POLY_XMM,	%xmm14
	.set	GF_POLY,	V14

	// V15 holds the key for AES "round 0", copied to all 128-bit lanes.
	.set	KEY0_XMM,	%xmm15
	.set	KEY0,		V15

	// If 32 SIMD registers are available, then V16-V29 hold the remaining
	// AES round keys, copied to all 128-bit lanes.
	//
	// AES-128, AES-192, and AES-256 use different numbers of round keys.
	// To allow handling all three variants efficiently, we align the round
	// keys to the *end* of this register range.  I.e., AES-128 uses
	// KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
	// (All also use KEY0 for the XOR-only "round" at the beginning.)
.if USE_AVX10
	.set	KEY1_XMM,	%xmm16
	.set	KEY1,		V16
	.set	KEY2_XMM,	%xmm17
	.set	KEY2,		V17
	.set	KEY3_XMM,	%xmm18
	.set	KEY3,		V18
	.set	KEY4_XMM,	%xmm19
	.set	KEY4,		V19
	.set	KEY5_XMM,	%xmm20
	.set	KEY5,		V20
	.set	KEY6_XMM,	%xmm21
	.set	KEY6,		V21
	.set	KEY7_XMM,	%xmm22
	.set	KEY7,		V22
	.set	KEY8_XMM,	%xmm23
	.set	KEY8,		V23
	.set	KEY9_XMM,	%xmm24
	.set	KEY9,		V24
	.set	KEY10_XMM,	%xmm25
	.set	KEY10,		V25
	.set	KEY11_XMM,	%xmm26
	.set	KEY11,		V26
	.set	KEY12_XMM,	%xmm27
	.set	KEY12,		V27
	.set	KEY13_XMM,	%xmm28
	.set	KEY13,		V28
	.set	KEY14_XMM,	%xmm29
	.set	KEY14,		V29
.endif
	// V30-V31 are currently unused.
.endm

// Move a vector between memory and a register.  Emits vmovdqu when VL < 64
// and vmovdqu8 when VL == 64.
.macro	_vmovdqu	src, dst
.if VL < 64
	vmovdqu		\src, \dst
.else
	vmovdqu8	\src, \dst
.endif
.endm

// Broadcast a 128-bit value into a vector, i.e. copy the 128 bits at \src into
// every 128-bit lane of \dst.  Without AVX10 this is a plain load for VL=16
// and vbroadcasti128 for VL=32; every USE_AVX10 configuration uses
// vbroadcasti32x4.
.macro	_vbroadcast128	src, dst
.if VL == 16 && !USE_AVX10
	vmovdqu		\src, \dst
.elseif VL == 32 && !USE_AVX10
	vbroadcasti128	\src, \dst
.else
	vbroadcasti32x4	\src, \dst
.endif
.endm

// XOR two vectors together: \dst = \src1 ^ \src2.  Emits vpxord when USE_AVX10
// is set and vpxor otherwise.
.macro	_vpxor	src1, src2, dst
.if USE_AVX10
	vpxord		\src1, \src2, \dst
.else
	vpxor		\src1, \src2, \dst
.endif
.endm

// XOR three vectors together:
// \src3_and_dst = \src1 ^ \src2 ^ \src3_and_dst.
.macro	_xor3	src1, src2, src3_and_dst
.if USE_AVX10
	// vpternlogd with immediate 0x96 is a three-argument XOR.
	vpternlogd	$0x96, \src1, \src2, \src3_and_dst
.else
	vpxor		\src1, \src3_and_dst, \src3_and_dst
	vpxor		\src2, \src3_and_dst, \src3_and_dst
.endif
.endm

// Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak
// (by multiplying by the polynomial 'x') and write it to \dst.
//
// \tmp is clobbered.  GF_POLY_XMM must already hold the .Lgf_poly constant.
// \dst may be the same register as \src; \tmp must be distinct from both.
.macro	_next_tweak	src, tmp, dst
	// Move the dword holding bit 127 into dword 0 and the dword holding
	// bit 63 into dword 2 of \tmp.
	vpshufd		$0x13, \src, \tmp
	// Shift each 64-bit half left by one bit.
	vpaddq		\src, \src, \dst
	// Turn the two carried-out bits into all-ones/all-zeros dword masks.
	vpsrad		$31, \tmp, \tmp
	// Keep 0x87 in the low half and/or 1 in the high half as needed.
	vpand		GF_POLY_XMM, \tmp, \tmp
	vpxor		\tmp, \dst, \dst
.endm

// Given the XTS tweak(s) in the vector \src, compute the next vector of
// tweak(s) (by multiplying by the polynomial 'x^(VL/16)') and write it to \dst.
//
// If VL > 16, then there are multiple tweaks, and we use vpclmulqdq to compute
// all tweaks in the vector in parallel.  If VL=16, we just do the regular
// computation without vpclmulqdq, as it's the faster method for a single tweak.
//
// \tmp1 is clobbered; \tmp2 is clobbered only when VL > 16.  GF_POLY must
// already hold the .Lgf_poly constant in every 128-bit lane.
.macro	_next_tweakvec	src, tmp1, tmp2, dst
.if VL == 16
	_next_tweak	\src, \tmp1, \dst
.else
	// \tmp1 = the VL/16 bits shifted out of the top of each 64-bit half.
	vpsrlq		$64 - VL/16, \src, \tmp1
	// \tmp2 = (bits shifted out of the high half) * 0x87, carryless.
	vpclmulqdq	$0x01, GF_POLY, \tmp1, \tmp2
	// Move the bits shifted out of the low half up into the high half.
	vpslldq		$8, \tmp1, \tmp1
	vpsllq		$VL/16, \src, \dst
	_xor3		\tmp1, \tmp2, \dst
.endif
.endm

// Given the first XTS tweak at (TWEAK), compute the first set of tweaks and
// store them in the vector registers TWEAK0-TWEAK3.  Clobbers V0-V5.
//
// This also loads the .Lgf_poly constant into GF_POLY, which the tweak
// computations that follow depend on.
.macro	_compute_first_set_of_tweaks
	vmovdqu		(TWEAK), TWEAK0_XMM
	_vbroadcast128	.Lgf_poly(%rip), GF_POLY
.if VL == 16
	// With VL=16, multiplying by x serially is fastest.
	_next_tweak	TWEAK0, %xmm0, TWEAK1
	_next_tweak	TWEAK1, %xmm0, TWEAK2
	_next_tweak	TWEAK2, %xmm0, TWEAK3
.else
.if VL == 32
	// Compute the second block of TWEAK0.
	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
	vinserti128	$1, %xmm1, TWEAK0, TWEAK0
.elseif VL == 64
	// Compute the remaining blocks of TWEAK0.
	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
	_next_tweak	%xmm1, %xmm0, %xmm2
	_next_tweak	%xmm2, %xmm0, %xmm3
	vinserti32x4	$1, %xmm1, TWEAK0, TWEAK0
	vinserti32x4	$2, %xmm2, TWEAK0, TWEAK0
	vinserti32x4	$3, %xmm3, TWEAK0, TWEAK0
.endif
	// Compute TWEAK[1-3] from TWEAK0.
	vpsrlq		$64 - 1*VL/16, TWEAK0, V0
	vpsrlq		$64 - 2*VL/16, TWEAK0, V2
	vpsrlq		$64 - 3*VL/16, TWEAK0, V4
	vpclmulqdq	$0x01, GF_POLY, V0, V1
	vpclmulqdq	$0x01, GF_POLY, V2, V3
	vpclmulqdq	$0x01, GF_POLY, V4, V5
	vpslldq		$8, V0, V0
	vpslldq		$8, V2, V2
	vpslldq		$8, V4, V4
	vpsllq		$1*VL/16, TWEAK0, TWEAK1
	vpsllq		$2*VL/16, TWEAK0, TWEAK2
	vpsllq		$3*VL/16, TWEAK0, TWEAK3
.if USE_AVX10
	vpternlogd	$0x96, V0, V1, TWEAK1
	vpternlogd	$0x96, V2, V3, TWEAK2
	vpternlogd	$0x96, V4, V5, TWEAK3
.else
	vpxor		V0, TWEAK1, TWEAK1
	vpxor		V2, TWEAK2, TWEAK2
	vpxor		V4, TWEAK3, TWEAK3
	vpxor		V1, TWEAK1, TWEAK1
	vpxor		V3, TWEAK2, TWEAK2
	vpxor		V5, TWEAK3, TWEAK3
.endif
.endif
.endm

// Do one step in computing the next set of tweaks using the method of just
// multiplying by x repeatedly (the same method _next_tweak uses).
//
// Steps 0-19 are four groups of five instructions; group k computes
// NEXT_TWEAK<k> from the previous tweak in the chain (TWEAK3 for group 0).
// Step 1000 copies NEXT_TWEAK[0-3] into TWEAK[0-3].  Any other \i emits
// nothing.  V5 is used as a temporary.  The assembler symbols PREV_TWEAK and
// NEXT_TWEAK carry state from the first step of a group to the rest of it, so
// the steps must be emitted in increasing order of \i.
.macro	_tweak_step_mulx	i
.if \i == 0
	.set PREV_TWEAK, TWEAK3
	.set NEXT_TWEAK, NEXT_TWEAK0
.elseif \i == 5
	.set PREV_TWEAK, NEXT_TWEAK0
	.set NEXT_TWEAK, NEXT_TWEAK1
.elseif \i == 10
	.set PREV_TWEAK, NEXT_TWEAK1
	.set NEXT_TWEAK, NEXT_TWEAK2
.elseif \i == 15
	.set PREV_TWEAK, NEXT_TWEAK2
	.set NEXT_TWEAK, NEXT_TWEAK3
.endif
.if \i >= 0 && \i < 20 && \i % 5 == 0
	vpshufd		$0x13, PREV_TWEAK, V5
.elseif \i >= 0 && \i < 20 && \i % 5 == 1
	vpaddq		PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK
.elseif \i >= 0 && \i < 20 && \i % 5 == 2
	vpsrad		$31, V5, V5
.elseif \i >= 0 && \i < 20 && \i % 5 == 3
	vpand		GF_POLY, V5, V5
.elseif \i >= 0 && \i < 20 && \i % 5 == 4
	vpxor		V5, NEXT_TWEAK, NEXT_TWEAK
.elseif \i == 1000
	vmovdqa		NEXT_TWEAK0, TWEAK0
	vmovdqa		NEXT_TWEAK1, TWEAK1
	vmovdqa		NEXT_TWEAK2, TWEAK2
	vmovdqa		NEXT_TWEAK3, TWEAK3
.endif
.endm

// Do one step in computing the next set of tweaks using the VPCLMULQDQ method
// (the same method _next_tweakvec uses for VL > 16).  This means multiplying
// each tweak by x^(4*VL/16) independently.  Since 4*VL/16 is a multiple of 8
// when VL > 16 (which it is here), the needed shift amounts are byte-aligned,
// which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts.
//
// Only the even steps 0-14 and the final step 1000 emit code; any other \i
// emits nothing.  Steps 0-6 extract the bits that get shifted out of each
// tweak, steps 8-14 multiply those bits by the reduction polynomial, and step
// 1000 shifts the tweaks and XORs in the products.
.macro	_tweak_step_pclmul	i
.if \i == 0
	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
.elseif \i == 2
	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1
.elseif \i == 4
	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2
.elseif \i == 6
	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3
.elseif \i == 8
	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0
.elseif \i == 10
	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1
.elseif \i == 12
	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2
.elseif \i == 14
	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3
.elseif \i == 1000
	vpslldq		$(4*VL/16) / 8, TWEAK0, TWEAK0
	vpslldq		$(4*VL/16) / 8, TWEAK1, TWEAK1
	vpslldq		$(4*VL/16) / 8, TWEAK2, TWEAK2
	vpslldq		$(4*VL/16) / 8, TWEAK3, TWEAK3
	_vpxor		NEXT_TWEAK0, TWEAK0, TWEAK0
	_vpxor		NEXT_TWEAK1, TWEAK1, TWEAK1
	_vpxor		NEXT_TWEAK2, TWEAK2, TWEAK2
	_vpxor		NEXT_TWEAK3, TWEAK3, TWEAK3
.endif
.endm

// _tweak_step does one step of the computation of the next set of tweaks from
// TWEAK[0-3].  To complete all steps, this is invoked with increasing values of
// \i that include at least 0 through 19, then 1000 which signals the last step.
//
// This is used to interleave the computation of the next set of tweaks with the
// AES en/decryptions, which increases performance in some cases.
//
// Dispatches to _tweak_step_mulx when VL == 16 and to _tweak_step_pclmul
// otherwise.
.macro	_tweak_step	i
.if VL == 16
	_tweak_step_mulx	\i
.else
	_tweak_step_pclmul	\i
.endif
.endm

// Load the round 0 key into KEY0, rebase KEY to point to the 7th-from-last
// round key, and (if USE_AVX10) cache the remaining round keys in registers.
//
// \enc selects the encryption (1) or decryption (0) round keys.  On entry KEY
// must point to the crypto_aes_ctx and KEYLEN must hold the AES key length in
// bytes.  Modifies KEY and the flags.
.macro	_setup_round_keys	enc

	// Select either the encryption round keys or the decryption round keys.
.if \enc
	.set	OFFS, 0
.else
	.set	OFFS, 240
.endif

	// Load the round key for "round 0".
	_vbroadcast128	OFFS(KEY), KEY0

	// Increment KEY to make it so that 7*16(KEY) is the last round key.
	// For AES-128, increment by 3*16, resulting in the 10 round keys (not
	// counting the zero-th round key which was just loaded into KEY0) being
	// -2*16(KEY) through 7*16(KEY).  For AES-192, increment by 5*16 and use
	// 12 round keys -4*16(KEY) through 7*16(KEY).  For AES-256, increment
	// by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY).
	//
	// This rebasing provides two benefits.  First, it makes the offset to
	// any round key be in the range [-96, 112], fitting in a signed byte.
	// This shortens VEX-encoded instructions that access the later round
	// keys which otherwise would need 4-byte offsets.  Second, it makes it
	// easy to do AES-128 and AES-192 by skipping irrelevant rounds at the
	// beginning.  Skipping rounds at the end doesn't work as well because
	// the last round needs different instructions.
	//
	// An alternative approach would be to roll up all the round loops.  We
	// don't do that because it isn't compatible with caching the round keys
	// in registers which we do when possible (see below), and also because
	// it seems unwise to rely *too* heavily on the CPU's branch predictor.
	lea		OFFS-16(KEY, KEYLEN64, 4), KEY

	// If all 32 SIMD registers are available, cache all the round keys.
.if USE_AVX10
	cmp		$24, KEYLEN
	jl		.Laes128\@
	je		.Laes192\@
	_vbroadcast128	-6*16(KEY), KEY1
	_vbroadcast128	-5*16(KEY), KEY2
.Laes192\@:
	_vbroadcast128	-4*16(KEY), KEY3
	_vbroadcast128	-3*16(KEY), KEY4
.Laes128\@:
	_vbroadcast128	-2*16(KEY), KEY5
	_vbroadcast128	-1*16(KEY), KEY6
	_vbroadcast128	0*16(KEY), KEY7
	_vbroadcast128	1*16(KEY), KEY8
	_vbroadcast128	2*16(KEY), KEY9
	_vbroadcast128	3*16(KEY), KEY10
	_vbroadcast128	4*16(KEY), KEY11
	_vbroadcast128	5*16(KEY), KEY12
	_vbroadcast128	6*16(KEY), KEY13
	_vbroadcast128	7*16(KEY), KEY14
.endif
.endm

// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0)
// on the block(s) in \data using the round key(s) in \key.  The register length
// determines the number of AES blocks en/decrypted.
//
// \last selects the final-round instruction (vaesenclast / vaesdeclast)
// instead of the regular-round one (vaesenc / vaesdec).  \data is updated in
// place.
.macro	_vaes	enc, last, key, data
.if \enc
.if \last
	vaesenclast	\key, \data, \data
.else
	vaesenc		\key, \data, \data
.endif
.else
.if \last
	vaesdeclast	\key, \data, \data
.else
	vaesdec		\key, \data, \data
.endif
.endif
.endm

// Do a single round of AES en/decryption on the block(s) in \data, using the
// same key for all block(s).  The round key is loaded from the appropriate
// register or memory location for round \i.  May clobber V4.
//
// With USE_AVX10 the cached register KEY\i (or KEY\i\()_XMM) is used.
// Otherwise the key is read from (\i-7)*16(KEY): directly as a memory operand
// when \xmm_suffix is given (single block), or broadcast into V4 first when it
// is blank (full vector).
.macro	_vaes_1x	enc, last, i, xmm_suffix, data
.if USE_AVX10
	_vaes		\enc, \last, KEY\i\xmm_suffix, \data
.else
.ifnb \xmm_suffix
	_vaes		\enc, \last, (\i-7)*16(KEY), \data
.else
	_vbroadcast128	(\i-7)*16(KEY), V4
	_vaes		\enc, \last, V4, \data
.endif
.endif
.endm

// Do a single round of AES en/decryption on the blocks in registers V0-V3,
// using the same key for all blocks.  The round key is loaded from the
// appropriate register or memory location for round \i.  In addition, does two
// steps of the computation of the next set of tweaks.  May clobber V4.
//
// The tweak steps issued are 2*(\i-5) and 2*(\i-5)+1, so rounds 5-14 (the
// rounds that every key length executes) cover tweak steps 0-19, while rounds
// 1-4 map to negative step numbers for which _tweak_step emits nothing.
.macro	_vaes_4x	enc, last, i
.if USE_AVX10
	_tweak_step	(2*(\i-5))
	_vaes		\enc, \last, KEY\i, V0
	_vaes		\enc, \last, KEY\i, V1
	_tweak_step	(2*(\i-5) + 1)
	_vaes		\enc, \last, KEY\i, V2
	_vaes		\enc, \last, KEY\i, V3
.else
	_vbroadcast128	(\i-7)*16(KEY), V4
	_tweak_step	(2*(\i-5))
	_vaes		\enc, \last, V4, V0
	_vaes		\enc, \last, V4, V1
	_tweak_step	(2*(\i-5) + 1)
	_vaes		\enc, \last, V4, V2
	_vaes		\enc, \last, V4, V3
.endif
.endm

// Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt,
// then XOR with \tweak again) of the block(s) in \data.  To process a single
// block, use xmm registers and set \xmm_suffix=_XMM.  To process a vector of
// length VL, use V* registers and leave \xmm_suffix empty.  May clobber V4.
//
// KEYLEN must hold the AES key length in bytes; it selects how many of the
// leading rounds are skipped.  Modifies the flags.
.macro	_aes_crypt	enc, xmm_suffix, tweak, data
	_xor3		KEY0\xmm_suffix, \tweak, \data
	cmp		$24, KEYLEN
	jl		.Laes128\@
	je		.Laes192\@
	_vaes_1x	\enc, 0, 1, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 2, \xmm_suffix, \data
.Laes192\@:
	_vaes_1x	\enc, 0, 3, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 4, \xmm_suffix, \data
.Laes128\@:
	_vaes_1x	\enc, 0, 5, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 6, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 7, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 8, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 9, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 10, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 11, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 12, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 13, \xmm_suffix, \data
	_vaes_1x	\enc, 1, 14, \xmm_suffix, \data
	_vpxor		\tweak, \data, \data
.endm

// Generate the body of an AES-XTS encryption (\enc==1) or decryption (\enc==0)
// function for the current values of VL and USE_AVX10.
//
// Inputs are the registers named KEY, SRC, DST, LEN, and TWEAK.  The generated
// code processes 4*VL bytes per main-loop iteration, then any remaining full
// vectors and full blocks, then does ciphertext stealing if LEN isn't a
// multiple of 16.  It writes the next tweak back to (TWEAK) and returns.
.macro	_aes_xts_crypt	enc
	_define_aliases

.if !\enc
	// When decrypting a message whose length isn't a multiple of the AES
	// block length, exclude the last full block from the main loop by
	// subtracting 16 from LEN.  This is needed because ciphertext stealing
	// decryption uses the last two tweaks in reverse order.  We'll handle
	// the last full block and the partial block specially at the end.
	lea		-16(LEN), %eax
	test		$15, LEN8
	cmovnz		%eax, LEN
.endif

	// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
	movl		480(KEY), KEYLEN

	// Setup the pointer to the round keys and cache as many as possible.
	_setup_round_keys	\enc

	// Compute the first set of tweaks TWEAK[0-3].
	_compute_first_set_of_tweaks

	sub		$4*VL, LEN
	jl		.Lhandle_remainder\@

.Lmain_loop\@:
	// This is the main loop, en/decrypting 4*VL bytes per iteration.

	// XOR each source block with its tweak and the zero-th round key.
.if USE_AVX10
	vmovdqu8	0*VL(SRC), V0
	vmovdqu8	1*VL(SRC), V1
	vmovdqu8	2*VL(SRC), V2
	vmovdqu8	3*VL(SRC), V3
	vpternlogd	$0x96, TWEAK0, KEY0, V0
	vpternlogd	$0x96, TWEAK1, KEY0, V1
	vpternlogd	$0x96, TWEAK2, KEY0, V2
	vpternlogd	$0x96, TWEAK3, KEY0, V3
.else
	vpxor		0*VL(SRC), KEY0, V0
	vpxor		1*VL(SRC), KEY0, V1
	vpxor		2*VL(SRC), KEY0, V2
	vpxor		3*VL(SRC), KEY0, V3
	vpxor		TWEAK0, V0, V0
	vpxor		TWEAK1, V1, V1
	vpxor		TWEAK2, V2, V2
	vpxor		TWEAK3, V3, V3
.endif
	cmp		$24, KEYLEN
	jl		.Laes128\@
	je		.Laes192\@
	// Do all the AES rounds on the data blocks, interleaved with
	// the computation of the next set of tweaks.
	_vaes_4x	\enc, 0, 1
	_vaes_4x	\enc, 0, 2
.Laes192\@:
	_vaes_4x	\enc, 0, 3
	_vaes_4x	\enc, 0, 4
.Laes128\@:
	_vaes_4x	\enc, 0, 5
	_vaes_4x	\enc, 0, 6
	_vaes_4x	\enc, 0, 7
	_vaes_4x	\enc, 0, 8
	_vaes_4x	\enc, 0, 9
	_vaes_4x	\enc, 0, 10
	_vaes_4x	\enc, 0, 11
	_vaes_4x	\enc, 0, 12
	_vaes_4x	\enc, 0, 13
	_vaes_4x	\enc, 1, 14

	// XOR in the tweaks again.
	_vpxor		TWEAK0, V0, V0
	_vpxor		TWEAK1, V1, V1
	_vpxor		TWEAK2, V2, V2
	_vpxor		TWEAK3, V3, V3

	// Store the destination blocks.
	_vmovdqu	V0, 0*VL(DST)
	_vmovdqu	V1, 1*VL(DST)
	_vmovdqu	V2, 2*VL(DST)
	_vmovdqu	V3, 3*VL(DST)

	// Finish computing the next set of tweaks.
	_tweak_step	1000

	add		$4*VL, SRC
	add		$4*VL, DST
	sub		$4*VL, LEN
	jge		.Lmain_loop\@

	// Check for the uncommon case where the data length isn't a multiple of
	// 4*VL.  Handle it out-of-line in order to optimize for the common
	// case.  In the common case, just fall through to the ret.
	test		$4*VL-1, LEN8
	jnz		.Lhandle_remainder\@
.Ldone\@:
	// Store the next tweak back to *TWEAK to support continuation calls.
	vmovdqu		TWEAK0_XMM, (TWEAK)
.if VL > 16
	vzeroupper
.endif
	RET

.Lhandle_remainder\@:

	// En/decrypt any remaining full blocks, one vector at a time.
.if VL > 16
	add		$3*VL, LEN	// Undo extra sub of 4*VL, then sub VL.
	jl		.Lvec_at_a_time_done\@
.Lvec_at_a_time\@:
	_vmovdqu	(SRC), V0
	_aes_crypt	\enc, , TWEAK0, V0
	_vmovdqu	V0, (DST)
	_next_tweakvec	TWEAK0, V0, V1, TWEAK0
	add		$VL, SRC
	add		$VL, DST
	sub		$VL, LEN
	jge		.Lvec_at_a_time\@
.Lvec_at_a_time_done\@:
	add		$VL-16, LEN	// Undo extra sub of VL, then sub 16.
.else
	add		$4*VL-16, LEN	// Undo extra sub of 4*VL, then sub 16.
.endif

	// En/decrypt any remaining full blocks, one at a time.
	jl		.Lblock_at_a_time_done\@
.Lblock_at_a_time\@:
	vmovdqu		(SRC), %xmm0
	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0
	vmovdqu		%xmm0, (DST)
	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK0_XMM
	add		$16, SRC
	add		$16, DST
	sub		$16, LEN
	jge		.Lblock_at_a_time\@
.Lblock_at_a_time_done\@:
	add		$16, LEN	// Undo the extra sub of 16.
	// Now 0 <= LEN <= 15.  If LEN is zero, we're done.
	jz		.Ldone\@

	// Otherwise 1 <= LEN <= 15, but the real remaining length is 16 + LEN.
	// Do ciphertext stealing to process the last 16 + LEN bytes.

.if \enc
	// If encrypting, the main loop already encrypted the last full block to
	// create the CTS intermediate ciphertext.  Prepare for the rest of CTS
	// by rewinding the pointers and loading the intermediate ciphertext.
	sub		$16, SRC
	sub		$16, DST
	vmovdqu		(DST), %xmm0
.else
	// If decrypting, the main loop didn't decrypt the last full block
	// because CTS decryption uses the last two tweaks in reverse order.
	// Do it now by advancing the tweak and decrypting the last full block.
	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK1_XMM
	vmovdqu		(SRC), %xmm0
	_aes_crypt	\enc, _XMM, TWEAK1_XMM, %xmm0
.endif

.if USE_AVX10
	// Create a mask that has the first LEN bits set.
	mov		$-1, %r9d
	bzhi		LEN, %r9d, %r9d
	kmovd		%r9d, %k1

	// Swap the first LEN bytes of the en/decryption of the last full block
	// with the partial block.  Note that to support in-place en/decryption,
	// the load from the src partial block must happen before the store to
	// the dst partial block.
	vmovdqa		%xmm0, %xmm1
	vmovdqu8	16(SRC), %xmm0{%k1}
	vmovdqu8	%xmm1, 16(DST){%k1}
.else
	lea		.Lcts_permute_table(%rip), %r9

	// Load the src partial block, left-aligned.  Note that to support
	// in-place en/decryption, this must happen before the store to the dst
	// partial block.
	vmovdqu		(SRC, LEN64, 1), %xmm1

	// Shift the first LEN bytes of the en/decryption of the last full block
	// to the end of a register, then store it to DST+LEN.  This stores the
	// dst partial block.  It also writes to the second part of the dst last
	// full block, but that part is overwritten later.
	vpshufb		(%r9, LEN64, 1), %xmm0, %xmm2
	vmovdqu		%xmm2, (DST, LEN64, 1)

	// Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
	sub		LEN64, %r9
	vmovdqu		32(%r9), %xmm3

	// Shift the src partial block to the beginning of its register.
	vpshufb		%xmm3, %xmm1, %xmm1

	// Do a blend to generate the src partial block followed by the second
	// part of the en/decryption of the last full block.
	vpblendvb	%xmm3, %xmm0, %xmm1, %xmm0
.endif
	// En/decrypt again and store the last full block.
	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0
	vmovdqu		%xmm0, (DST)
	jmp		.Ldone\@
.endm

// void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
//			   u8 iv[AES_BLOCK_SIZE]);
//
// Encrypt the 16-byte block at |iv| in place with AES, using the encryption
// round keys at the start of |tweak_key|.
//
// In:    %rdi = tweak_key, %rsi = iv
// Clobb: %rax, %rdi, %xmm0, flags
SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
	vmovdqu		(%rsi), %xmm0
	vpxor		(%rdi), %xmm0, %xmm0	// XOR in the zero-th round key
	movl		480(%rdi), %eax		// AES key length
	// Rebase %rdi so that 7*16(%rdi) is the last round key, the same
	// layout that _setup_round_keys establishes.
	lea		-16(%rdi, %rax, 4), %rdi
	cmp		$24, %eax
	jl		.Lencrypt_iv_aes128
	je		.Lencrypt_iv_aes192
	vaesenc		-6*16(%rdi), %xmm0, %xmm0
	vaesenc		-5*16(%rdi), %xmm0, %xmm0
.Lencrypt_iv_aes192:
	vaesenc		-4*16(%rdi), %xmm0, %xmm0
	vaesenc		-3*16(%rdi), %xmm0, %xmm0
.Lencrypt_iv_aes128:
	vaesenc		-2*16(%rdi), %xmm0, %xmm0
	vaesenc		-1*16(%rdi), %xmm0, %xmm0
	vaesenc		0*16(%rdi), %xmm0, %xmm0
	vaesenc		1*16(%rdi), %xmm0, %xmm0
	vaesenc		2*16(%rdi), %xmm0, %xmm0
	vaesenc		3*16(%rdi), %xmm0, %xmm0
	vaesenc		4*16(%rdi), %xmm0, %xmm0
	vaesenc		5*16(%rdi), %xmm0, %xmm0
	vaesenc		6*16(%rdi), %xmm0, %xmm0
	vaesenclast	7*16(%rdi), %xmm0, %xmm0
	vmovdqu		%xmm0, (%rsi)
	RET
SYM_FUNC_END(aes_xts_encrypt_iv)

// Below are the actual AES-XTS encryption and decryption functions,
// instantiated from the above macro.  They all have the following prototype:
//
// void (*xts_asm_func)(const struct crypto_aes_ctx *key,
//			const u8 *src, u8 *dst, unsigned int len,
//			u8 tweak[AES_BLOCK_SIZE]);
//
// |key| is the data key.  |tweak| contains the next tweak; the encryption of
// the original IV with the tweak key was already done.  This function supports
// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
// |len| must be a multiple of 16 except on the last call.  If |len| is a
// multiple of 16, then this function updates |tweak| to contain the next tweak.

// AES-NI + AVX: 128-bit vectors, VEX-coded instructions, xmm0-xmm15.
.set	VL, 16
.set	USE_AVX10, 0
SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx)
	_aes_xts_crypt	1
SYM_FUNC_END(aes_xts_encrypt_aesni_avx)
SYM_TYPED_FUNC_START(aes_xts_decrypt_aesni_avx)
	_aes_xts_crypt	0
SYM_FUNC_END(aes_xts_decrypt_aesni_avx)

// The VAES-based implementations are built only when the assembler supports
// both the VAES and VPCLMULQDQ instructions.
#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
// VAES + VPCLMULQDQ + AVX2: 256-bit vectors, VEX-coded instructions,
// ymm0-ymm15.
.set	VL, 32
.set	USE_AVX10, 0
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2)
	_aes_xts_crypt	1
SYM_FUNC_END(aes_xts_encrypt_vaes_avx2)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2)
	_aes_xts_crypt	0
SYM_FUNC_END(aes_xts_decrypt_vaes_avx2)

// VAES + VPCLMULQDQ + AVX10/256 + BMI2: 256-bit vectors, EVEX-coded
// instructions, ymm0-ymm31.
.set	VL, 32
.set	USE_AVX10, 1
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256)
	_aes_xts_crypt	1
SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256)
	_aes_xts_crypt	0
SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256)

// VAES + VPCLMULQDQ + AVX10/512 + BMI2: 512-bit vectors, EVEX-coded
// instructions, zmm0-zmm31.
.set	VL, 64
.set	USE_AVX10, 1
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512)
	_aes_xts_crypt	1
SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512)
	_aes_xts_crypt	0
SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512)
#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */