   1/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
   2//
   3// AES-NI optimized AES-GCM for x86_64
   4//
   5// Copyright 2024 Google LLC
   6//
   7// Author: Eric Biggers <ebiggers@google.com>
   8//
   9//------------------------------------------------------------------------------
  10//
  11// This file is dual-licensed, meaning that you can use it under your choice of
  12// either of the following two licenses:
  13//
  14// Licensed under the Apache License 2.0 (the "License").  You may obtain a copy
  15// of the License at
  16//
  17//	http://www.apache.org/licenses/LICENSE-2.0
  18//
  19// Unless required by applicable law or agreed to in writing, software
  20// distributed under the License is distributed on an "AS IS" BASIS,
  21// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  22// See the License for the specific language governing permissions and
  23// limitations under the License.
  24//
  25// or
  26//
  27// Redistribution and use in source and binary forms, with or without
  28// modification, are permitted provided that the following conditions are met:
  29//
  30// 1. Redistributions of source code must retain the above copyright notice,
  31//    this list of conditions and the following disclaimer.
  32//
  33// 2. Redistributions in binary form must reproduce the above copyright
  34//    notice, this list of conditions and the following disclaimer in the
  35//    documentation and/or other materials provided with the distribution.
  36//
  37// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  38// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  39// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  40// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  41// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  42// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  43// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  44// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  45// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  46// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  47// POSSIBILITY OF SUCH DAMAGE.
  48//
  49//------------------------------------------------------------------------------
  50//
  51// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
  52// support the original set of AES instructions, i.e. AES-NI.  Two
  53// implementations are provided, one that uses AVX and one that doesn't.  They
  54// are very similar, being generated by the same macros.  The only difference is
  55// that the AVX implementation takes advantage of VEX-coded instructions in some
  56// places to avoid some 'movdqu' and 'movdqa' instructions.  The AVX
  57// implementation does *not* use 256-bit vectors, as AES is not supported on
  58// 256-bit vectors until the VAES feature (which this file doesn't target).
  59//
  60// The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1
  61// for the *_aesni functions or AVX for the *_aesni_avx ones.  (But it seems
  62// there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.)
  63//
  64// The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is
  65// more thoroughly commented.  This file has the following notable changes:
  66//
  67//    - The vector length is fixed at 128-bit, i.e. xmm registers.  This means
  68//      there is only one AES block (and GHASH block) per register.
  69//
  70//    - Without AVX512 / AVX10, only 16 SIMD registers are available instead of
  71//      32.  We work around this by being much more careful about using
  72//      registers, relying heavily on loads to load values as they are needed.
  73//
  74//    - Masking is not available either.  We work around this by implementing
  75//      partial block loads and stores using overlapping scalar loads and stores
  76//      combined with shifts and SSE4.1 insertion and extraction instructions.
  77//
  78//    - The main loop is organized differently due to the different design
  79//      constraints.  First, with just one AES block per SIMD register, on some
  80//      CPUs 4 registers don't saturate the 'aesenc' throughput.  We therefore
  81//      do an 8-register wide loop.  Considering that and the fact that we have
  82//      just 16 SIMD registers to work with, it's not feasible to cache AES
  83//      round keys and GHASH key powers in registers across loop iterations.
  84//      That's not ideal, but also not actually that bad, since loads can run in
  85//      parallel with other instructions.  Significantly, this also makes it
  86//      possible to roll up the inner loops, relying on hardware loop unrolling
  87//      instead of software loop unrolling, greatly reducing code size.
  88//
  89//    - We implement the GHASH multiplications in the main loop using Karatsuba
  90//      multiplication instead of schoolbook multiplication.  This saves one
  91//      pclmulqdq instruction per block, at the cost of one 64-bit load, one
  92//      pshufd, and 0.25 pxors per block.  (This is without the three-argument
  93//      XOR support that would be provided by AVX512 / AVX10, which would be
  94//      more beneficial to schoolbook than Karatsuba.)
  95//
  96//      As a rough approximation, we can assume that Karatsuba multiplication is
  97//      faster than schoolbook multiplication in this context if one pshufd and
  98//      0.25 pxors are cheaper than a pclmulqdq.  (We assume that the 64-bit
  99//      load is "free" due to running in parallel with arithmetic instructions.)
 100//      This is true on AMD CPUs, including all that support pclmulqdq up to at
 101//      least Zen 3.  It's also true on older Intel CPUs: Westmere through
 102//      Haswell on the Core side, and Silvermont through Goldmont Plus on the
 103//      low-power side.  On some of these CPUs, pclmulqdq is quite slow, and the
 104//      benefit of Karatsuba should be substantial.  On newer Intel CPUs,
 105//      schoolbook multiplication should be faster, but only marginally.
 106//
 107//      Not all these CPUs were available to be tested.  However, benchmarks on
 108//      available CPUs suggest that this approximation is plausible.  Switching
 109//      to Karatsuba showed negligible change (< 1%) on Intel Broadwell,
 110//      Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%.
 111//      Considering that and the fact that Karatsuba should be even more
 112//      beneficial on older Intel CPUs, it seems like the right choice here.
 113//
 114//      An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be
 115//      saved by using a multiplication-less reduction method.  We don't do that
 116//      because it would require a large number of shift and xor instructions,
 117//      making it less worthwhile and likely harmful on newer CPUs.
 118//
 119//      It does make sense to sometimes use a different reduction optimization
 120//      that saves a pclmulqdq, though: precompute the hash key times x^64, and
 121//      multiply the low half of the data block by the hash key with the extra
 122//      factor of x^64.  This eliminates one step of the reduction.  However,
 123//      this is incompatible with Karatsuba multiplication.  Therefore, for
 124//      multi-block processing we use Karatsuba multiplication with a regular
 125//      reduction.  For single-block processing, we use the x^64 optimization.
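//
//      For reference, the Karatsuba identity used here, written in carryless
//      GF(2)[x] arithmetic where addition is XOR, is roughly:
//
//	a*b = (a_H*b_H)*x^128 + (a_L*b_L)
//		+ ((a_L + a_H)*(b_L + b_H) + a_L*b_L + a_H*b_H)*x^64
//
//      That is, three pclmulqdq's per block (LO, HI, and the product of the
//      XOR'd-together halves) instead of the four that schoolbook
//      multiplication needs, with the middle term recovered using only XORs.
//      This is just an illustrative sketch; the exact instruction sequences
//      are in _ghash_mul_noreduce and _ghash_reduce below.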
 126
 127#include <linux/linkage.h>
 128
 129.section .rodata
 130.p2align 4
 131.Lbswap_mask:
 132	.octa   0x000102030405060708090a0b0c0d0e0f
 133.Lgfpoly:
 134	.quad	0xc200000000000000
 135.Lone:
 136	.quad	1
 137.Lgfpoly_and_internal_carrybit:
 138	.octa	0xc2000000000000010000000000000001
 139	// Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of
 140	// 'len' 0xff bytes and the rest zeroes.
 141.Lzeropad_mask:
 142	.octa	0xffffffffffffffffffffffffffffffff
 143	.octa	0
 144
 145// Offsets in struct aes_gcm_key_aesni
 146#define OFFSETOF_AESKEYLEN	480
 147#define OFFSETOF_H_POWERS	496
 148#define OFFSETOF_H_POWERS_XORED	624
 149#define OFFSETOF_H_TIMES_X64	688
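
// The above offsets must match the layout of struct aes_gcm_key_aesni in the
// C glue code.  As assumed by the code below: h_powers[] holds the hash key
// powers H^8 through H^1 (16 bytes each, highest power first),
// h_powers_xored[] holds the two 64-bit halves of each power XOR'd together
// (8 bytes each, in the same order), and h_times_x64 holds H^1 * x^64 in
// reduced form.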
 150
 151.text
 152
 153// Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq.  The fallback
 154// assumes that all operands are distinct and that any mem operand is aligned.
 155.macro	_vpclmulqdq	imm, src1, src2, dst
 156.if USE_AVX
 157	vpclmulqdq	\imm, \src1, \src2, \dst
 158.else
 159	movdqa		\src2, \dst
 160	pclmulqdq	\imm, \src1, \dst
 161.endif
 162.endm
 163
 164// Do a vpshufb, or fall back to a movdqa and a pshufb.  The fallback assumes
 165// that all operands are distinct and that any mem operand is aligned.
 166.macro	_vpshufb	src1, src2, dst
 167.if USE_AVX
 168	vpshufb		\src1, \src2, \dst
 169.else
 170	movdqa		\src2, \dst
 171	pshufb		\src1, \dst
 172.endif
 173.endm
 174
 175// Do a vpand, or fall back to a movdqu and a pand.  The fallback assumes that
 176// all operands are distinct.
 177.macro	_vpand		src1, src2, dst
 178.if USE_AVX
 179	vpand		\src1, \src2, \dst
 180.else
 181	movdqu		\src1, \dst
 182	pand		\src2, \dst
 183.endif
 184.endm
 185
 186// XOR the unaligned memory operand \mem into the xmm register \reg.  \tmp must
 187// be a temporary xmm register.
 188.macro	_xor_mem_to_reg	mem, reg, tmp
 189.if USE_AVX
 190	vpxor		\mem, \reg, \reg
 191.else
 192	movdqu		\mem, \tmp
 193	pxor		\tmp, \reg
 194.endif
 195.endm
 196
 197// Test the unaligned memory operand \mem against the xmm register \reg.  \tmp
 198// must be a temporary xmm register.
 199.macro	_test_mem	mem, reg, tmp
 200.if USE_AVX
 201	vptest		\mem, \reg
 202.else
 203	movdqu		\mem, \tmp
 204	ptest		\tmp, \reg
 205.endif
 206.endm
 207
 208// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst
 209// and zeroize any remaining bytes.  Clobbers %rax, %rcx, and \tmp{64,32}.
 210.macro	_load_partial_block	src, dst, tmp64, tmp32
 211	sub		$8, %ecx		// LEN - 8
 212	jle		.Lle8\@
 213
 214	// Load 9 <= LEN <= 15 bytes.
 215	movq		(\src), \dst		// Load first 8 bytes
 216	mov		(\src, %rcx), %rax	// Load last 8 bytes
 217	neg		%ecx
 218	shl		$3, %ecx
 219	shr		%cl, %rax		// Discard overlapping bytes
 220	pinsrq		$1, %rax, \dst
 221	jmp		.Ldone\@
 222
 223.Lle8\@:
 224	add		$4, %ecx		// LEN - 4
 225	jl		.Llt4\@
 226
 227	// Load 4 <= LEN <= 8 bytes.
 228	mov		(\src), %eax		// Load first 4 bytes
 229	mov		(\src, %rcx), \tmp32	// Load last 4 bytes
 230	jmp		.Lcombine\@
 231
 232.Llt4\@:
 233	// Load 1 <= LEN <= 3 bytes.
 234	add		$2, %ecx		// LEN - 2
 235	movzbl		(\src), %eax		// Load first byte
 236	jl		.Lmovq\@
 237	movzwl		(\src, %rcx), \tmp32	// Load last 2 bytes
 238.Lcombine\@:
 239	shl		$3, %ecx
 240	shl		%cl, \tmp64
 241	or		\tmp64, %rax		// Combine the two parts
 242.Lmovq\@:
 243	movq		%rax, \dst
 244.Ldone\@:
 245.endm
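
// Worked example for _load_partial_block above, with LEN = 11: the movq loads
// bytes 0-7, the second load grabs bytes 3-10 (the 8 bytes ending at the last
// byte), and the shift count works out to (16 - LEN)*8 = 40 bits (shift
// counts are taken mod 64), discarding the 5 overlapping bytes so that only
// bytes 8-10 remain to be inserted as the upper qword.  Bytes 11-15 of \dst
// end up zero.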
 246
 247// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst.
 248// Clobbers %rax, %rcx, and %rsi.
 249.macro	_store_partial_block	src, dst
 250	sub		$8, %ecx		// LEN - 8
 251	jl		.Llt8\@
 252
 253	// Store 8 <= LEN <= 15 bytes.
 254	pextrq		$1, \src, %rax
 255	mov		%ecx, %esi
 256	shl		$3, %ecx
 257	ror		%cl, %rax
 258	mov		%rax, (\dst, %rsi)	// Store last LEN - 8 bytes
 259	movq		\src, (\dst)		// Store first 8 bytes
 260	jmp		.Ldone\@
 261
 262.Llt8\@:
 263	add		$4, %ecx		// LEN - 4
 264	jl		.Llt4\@
 265
 266	// Store 4 <= LEN <= 7 bytes.
 267	pextrd		$1, \src, %eax
 268	mov		%ecx, %esi
 269	shl		$3, %ecx
 270	ror		%cl, %eax
 271	mov		%eax, (\dst, %rsi)	// Store last LEN - 4 bytes
 272	movd		\src, (\dst)		// Store first 4 bytes
 273	jmp		.Ldone\@
 274
 275.Llt4\@:
 276	// Store 1 <= LEN <= 3 bytes.
 277	pextrb		$0, \src, 0(\dst)
 278	cmp		$-2, %ecx		// LEN - 4 == -2, i.e. LEN == 2?
 279	jl		.Ldone\@
 280	pextrb		$1, \src, 1(\dst)
 281	je		.Ldone\@
 282	pextrb		$2, \src, 2(\dst)
 283.Ldone\@:
 284.endm
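
// Worked example for _store_partial_block above, with LEN = 13: the upper
// qword of \src (bytes 8-15) is rotated right by (LEN - 8)*8 = 40 bits so
// that bytes 8-12 become its most significant bytes, and the 8-byte store at
// \dst + 5 places them at \dst + 8 through \dst + 12.  The garbage this store
// writes to \dst + 5 through \dst + 7 is then overwritten by the movq of the
// first 8 bytes.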
 285
 286// Do one step of GHASH-multiplying \a by \b and storing the reduced product in
 287// \b.  To complete all steps, this must be invoked with \i=0 through \i=9.
 288// \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the
 289// .Lgfpoly constant, and \t0-\t1 must be temporary registers.
 290.macro	_ghash_mul_step	i, a, a_times_x64, b, gfpoly, t0, t1
 291
 292	// MI = (a_L * b_H) + ((a*x^64)_L * b_L)
 293.if \i == 0
 294	_vpclmulqdq	$0x01, \a, \b, \t0
 295.elseif \i == 1
 296	_vpclmulqdq	$0x00, \a_times_x64, \b, \t1
 297.elseif \i == 2
 298	pxor		\t1, \t0
 299
 300	// HI = (a_H * b_H) + ((a*x^64)_H * b_L)
 301.elseif \i == 3
 302	_vpclmulqdq	$0x11, \a, \b, \t1
 303.elseif \i == 4
 304	pclmulqdq	$0x10, \a_times_x64, \b
 305.elseif \i == 5
 306	pxor		\t1, \b
 307.elseif \i == 6
 308
 309	// Fold MI into HI.
 310	pshufd		$0x4e, \t0, \t1		// Swap halves of MI
 311.elseif \i == 7
 312	pclmulqdq	$0x00, \gfpoly, \t0	// MI_L*(x^63 + x^62 + x^57)
 313.elseif \i == 8
 314	pxor		\t1, \b
 315.elseif \i == 9
 316	pxor		\t0, \b
 317.endif
 318.endm
 319
 320// GHASH-multiply \a by \b and store the reduced product in \b.
 321// See _ghash_mul_step for details.
 322.macro	_ghash_mul	a, a_times_x64, b, gfpoly, t0, t1
 323.irp i, 0,1,2,3,4,5,6,7,8,9
 324	_ghash_mul_step	\i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
 325.endr
 326.endm
 327
 328// GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi.
 329// This does Karatsuba multiplication and must be paired with _ghash_reduce.  On
 330// the first call, \lo, \mi, and \hi must be zero.  \a_xored must contain the
 331// two halves of \a XOR'd together, i.e. a_L + a_H.  \b is clobbered.
 332.macro	_ghash_mul_noreduce	a, a_xored, b, lo, mi, hi, t0
 333
 334	// LO += a_L * b_L
 335	_vpclmulqdq	$0x00, \a, \b, \t0
 336	pxor		\t0, \lo
 337
 338	// b_L + b_H
 339	pshufd		$0x4e, \b, \t0
 340	pxor		\b, \t0
 341
 342	// HI += a_H * b_H
 343	pclmulqdq	$0x11, \a, \b
 344	pxor		\b, \hi
 345
 346	// MI += (a_L + a_H) * (b_L + b_H)
 347	pclmulqdq	$0x00, \a_xored, \t0
 348	pxor		\t0, \mi
 349.endm
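
// In rough C-like pseudocode, with clmul() standing for one pclmulqdq of two
// 64-bit halves, a single _ghash_mul_noreduce call accumulates:
//
//	lo ^= clmul(a.lo, b.lo);
//	hi ^= clmul(a.hi, b.hi);
//	mi ^= clmul(a.lo ^ a.hi, b.lo ^ b.hi);	// a.lo ^ a.hi is \a_xored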
 350
 351// Reduce the product from \lo, \mi, and \hi, and store the result in \dst.
 352// This assumes that _ghash_mul_noreduce was used.
 353.macro	_ghash_reduce	lo, mi, hi, dst, t0
 354
 355	movq		.Lgfpoly(%rip), \t0
 356
 357	// MI += LO + HI (needed because we used Karatsuba multiplication)
 358	pxor		\lo, \mi
 359	pxor		\hi, \mi
 360
 361	// Fold LO into MI.
 362	pshufd		$0x4e, \lo, \dst
 363	pclmulqdq	$0x00, \t0, \lo
 364	pxor		\dst, \mi
 365	pxor		\lo, \mi
 366
 367	// Fold MI into HI.
 368	pshufd		$0x4e, \mi, \dst
 369	pclmulqdq	$0x00, \t0, \mi
 370	pxor		\hi, \dst
 371	pxor		\mi, \dst
 372.endm
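
// In the same rough pseudocode, with g = x^63 + x^62 + x^57 (the .Lgfpoly
// constant) and swap64() exchanging the two 64-bit halves, _ghash_reduce
// computes:
//
//	mi ^= lo ^ hi;					// Karatsuba fixup
//	mi ^= swap64(lo) ^ clmul(g, lo.lo);		// fold LO into MI
//	dst = hi ^ swap64(mi) ^ clmul(g, mi.lo);	// fold MI into HI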
 373
 374// Do the first step of the GHASH update of a set of 8 ciphertext blocks.
 375//
 376// The whole GHASH update does:
 377//
 378//	GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 +
 379//				blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1
 380//
 381// This macro just does the first step: it does the unreduced multiplication
 382// (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm
 383// registers LO, MI, and GHASH_ACC a.k.a. HI.  It also zero-initializes the
 384// inner block counter in %rax, which is a value that counts up by 8 for each
 385// block in the set of 8 and is used later to index by 8*blknum and 16*blknum.
 386//
 387// To reduce the number of pclmulqdq instructions required, both this macro and
 388// _ghash_update_continue_8x use Karatsuba multiplication instead of schoolbook
 389// multiplication.  See the file comment for more details about this choice.
 390//
 391// Both macros expect the ciphertext blocks blk[0-7] to be available at DST if
 392// encrypting, or SRC if decrypting.  They also expect the precomputed hash key
 393// powers H^i and their XOR'd-together halves to be available in the struct
 394// pointed to by KEY.  Both macros clobber TMP[0-2].
 395.macro	_ghash_update_begin_8x	enc
 396
 397	// Initialize the inner block counter.
 398	xor		%eax, %eax
 399
 400	// Load the highest hash key power, H^8.
 401	movdqa		OFFSETOF_H_POWERS(KEY), TMP0
 402
 403	// Load the first ciphertext block and byte-reflect it.
 404.if \enc
 405	movdqu		(DST), TMP1
 406.else
 407	movdqu		(SRC), TMP1
 408.endif
 409	pshufb		BSWAP_MASK, TMP1
 410
 411	// Add the GHASH accumulator to the ciphertext block to get the block
 412	// 'b' that needs to be multiplied with the hash key power 'a'.
 413	pxor		TMP1, GHASH_ACC
 414
 415	// b_L + b_H
 416	pshufd		$0x4e, GHASH_ACC, MI
 417	pxor		GHASH_ACC, MI
 418
 419	// LO = a_L * b_L
 420	_vpclmulqdq	$0x00, TMP0, GHASH_ACC, LO
 421
 422	// HI = a_H * b_H
 423	pclmulqdq	$0x11, TMP0, GHASH_ACC
 424
 425	// MI = (a_L + a_H) * (b_L + b_H)
 426	pclmulqdq	$0x00, OFFSETOF_H_POWERS_XORED(KEY), MI
 427.endm
 428
 429// Continue the GHASH update of 8 ciphertext blocks as described above by doing
 430// an unreduced multiplication of the next ciphertext block by the next lowest
 431// key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI.
 432.macro	_ghash_update_continue_8x enc
 433	add		$8, %eax
 434
 435	// Load the next lowest key power.
 436	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), TMP0
 437
 438	// Load the next ciphertext block and byte-reflect it.
 439.if \enc
 440	movdqu		(DST,%rax,2), TMP1
 441.else
 442	movdqu		(SRC,%rax,2), TMP1
 443.endif
 444	pshufb		BSWAP_MASK, TMP1
 445
 446	// LO += a_L * b_L
 447	_vpclmulqdq	$0x00, TMP0, TMP1, TMP2
 448	pxor		TMP2, LO
 449
 450	// b_L + b_H
 451	pshufd		$0x4e, TMP1, TMP2
 452	pxor		TMP1, TMP2
 453
 454	// HI += a_H * b_H
 455	pclmulqdq	$0x11, TMP0, TMP1
 456	pxor		TMP1, GHASH_ACC
 457
 458	// MI += (a_L + a_H) * (b_L + b_H)
 459	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), TMP1
 460	pclmulqdq	$0x00, TMP1, TMP2
 461	pxor		TMP2, MI
 462.endm
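
// Indexing note for the two macros above: after the n'th
// _ghash_update_continue_8x call (n = 1 through 7), %eax = 8*n.
// OFFSETOF_H_POWERS(KEY,%rax,2) then selects h_powers[n] = H^(8-n),
// OFFSETOF_H_POWERS_XORED(KEY,%rax) selects the matching XOR'd halves, and
// (SRC,%rax,2) / (DST,%rax,2) select ciphertext block n.  This is the "index
// by 8*blknum and 16*blknum" scheme mentioned above.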
 463
 464// Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC.  This is similar to
 465// _ghash_reduce, but it's hardcoded to use the registers of the main loop and
 466// it uses the same register for HI and the destination.  It's also divided into
 467// two steps.  TMP1 must be preserved across steps.
 468//
 469// One pshufd could be saved by shuffling MI and XOR'ing LO into it, instead of
 470// shuffling LO, XOR'ing LO into MI, and shuffling MI.  However, this would
 471// increase the critical path length, and it seems to slightly hurt performance.
 472.macro	_ghash_update_end_8x_step	i
 473.if \i == 0
 474	movq		.Lgfpoly(%rip), TMP1
 475	pxor		LO, MI
 476	pxor		GHASH_ACC, MI
 477	pshufd		$0x4e, LO, TMP2
 478	pclmulqdq	$0x00, TMP1, LO
 479	pxor		TMP2, MI
 480	pxor		LO, MI
 481.elseif \i == 1
 482	pshufd		$0x4e, MI, TMP2
 483	pclmulqdq	$0x00, TMP1, MI
 484	pxor		TMP2, GHASH_ACC
 485	pxor		MI, GHASH_ACC
 486.endif
 487.endm
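
// As a rough pclmulqdq count for one 8-block iteration: the Karatsuba
// multiplications above use 3 pclmulqdq's per block and the reduction adds 2
// more, i.e. 8*3 + 2 = 26 per 128 bytes of data, versus 8*4 + 2 = 34 for
// schoolbook multiplication with the same deferred reduction.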
 488
 489// void aes_gcm_precompute_##suffix(struct aes_gcm_key_aesni *key);
 490//
 491// Given the expanded AES key, derive the GHASH subkey and initialize the GHASH
 492// related fields in the key struct.
 493.macro	_aes_gcm_precompute
 494
 495	// Function arguments
 496	.set	KEY,		%rdi
 497
 498	// Additional local variables.
 499	// %xmm0-%xmm1 and %rax are used as temporaries.
 500	.set	RNDKEYLAST_PTR,	%rsi
 501	.set	H_CUR,		%xmm2
 502	.set	H_POW1,		%xmm3	// H^1
 503	.set	H_POW1_X64,	%xmm4	// H^1 * x^64
 504	.set	GFPOLY,		%xmm5
 505
 506	// Encrypt an all-zeroes block to get the raw hash subkey.
 507	movl		OFFSETOF_AESKEYLEN(KEY), %eax
 508	lea		6*16(KEY,%rax,4), RNDKEYLAST_PTR
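	// (The AES round keys are 16-byte values starting at KEY, so the last
	// one is at byte offset 6*16 + 4*keylen = 16*(keylen/4 + 6) =
	// 16*nrounds, i.e. 160, 192, or 224 for AES-128, AES-192, or AES-256.)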
 509	movdqa		(KEY), H_POW1  // Zero-th round key XOR all-zeroes block
 510	lea		16(KEY), %rax
 5111:
 512	aesenc		(%rax), H_POW1
 513	add		$16, %rax
 514	cmp		%rax, RNDKEYLAST_PTR
 515	jne		1b
 516	aesenclast	(RNDKEYLAST_PTR), H_POW1
 517
 518	// Preprocess the raw hash subkey as needed to operate on GHASH's
 519	// bit-reflected values directly: reflect its bytes, then multiply it by
 520	// x^-1 (using the backwards interpretation of polynomial coefficients
 521	// from the GCM spec) or equivalently x^1 (using the alternative,
 522	// natural interpretation of polynomial coefficients).
 523	pshufb		.Lbswap_mask(%rip), H_POW1
 524	movdqa		H_POW1, %xmm0
 525	pshufd		$0xd3, %xmm0, %xmm0
 526	psrad		$31, %xmm0
 527	paddq		H_POW1, H_POW1
 528	pand		.Lgfpoly_and_internal_carrybit(%rip), %xmm0
 529	pxor		%xmm0, H_POW1
 530
 531	// Store H^1.
 532	movdqa		H_POW1, OFFSETOF_H_POWERS+7*16(KEY)
 533
 534	// Compute and store H^1 * x^64.
 535	movq		.Lgfpoly(%rip), GFPOLY
 536	pshufd		$0x4e, H_POW1, %xmm0
 537	_vpclmulqdq	$0x00, H_POW1, GFPOLY, H_POW1_X64
 538	pxor		%xmm0, H_POW1_X64
 539	movdqa		H_POW1_X64, OFFSETOF_H_TIMES_X64(KEY)
 540
 541	// Compute and store the halves of H^1 XOR'd together.
 542	pxor		H_POW1, %xmm0
 543	movq		%xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY)
 544
 545	// Compute and store the remaining key powers H^2 through H^8.
 546	movdqa		H_POW1, H_CUR
 547	mov		$6*8, %eax
 548.Lprecompute_next\@:
 549	// Compute H^i = H^{i-1} * H^1.
 550	_ghash_mul	H_POW1, H_POW1_X64, H_CUR, GFPOLY, %xmm0, %xmm1
 551	// Store H^i.
 552	movdqa		H_CUR, OFFSETOF_H_POWERS(KEY,%rax,2)
 553	// Compute and store the halves of H^i XOR'd together.
 554	pshufd		$0x4e, H_CUR, %xmm0
 555	pxor		H_CUR, %xmm0
 556	movq		%xmm0, OFFSETOF_H_POWERS_XORED(KEY,%rax)
 557	sub		$8, %eax
 558	jge		.Lprecompute_next\@
 559
 560	RET
 561.endm
 562
 563// void aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key,
 564//				 u8 ghash_acc[16], const u8 *aad, int aadlen);
 565//
 566// This function processes the AAD (Additional Authenticated Data) in GCM.
 567// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
 568// data given by |aad| and |aadlen|.  On the first call, |ghash_acc| must be all
 569// zeroes.  |aadlen| must be a multiple of 16, except on the last call where it
 570// can be any length.  The caller must do any buffering needed to ensure this.
 571.macro	_aes_gcm_aad_update
 572
 573	// Function arguments
 574	.set	KEY,		%rdi
 575	.set	GHASH_ACC_PTR,	%rsi
 576	.set	AAD,		%rdx
 577	.set	AADLEN,		%ecx
 578	// Note: _load_partial_block relies on AADLEN being in %ecx.
 579
 580	// Additional local variables.
 581	// %rax, %r10, and %xmm0-%xmm1 are used as temporary registers.
 582	.set	BSWAP_MASK,	%xmm2
 583	.set	GHASH_ACC,	%xmm3
 584	.set	H_POW1,		%xmm4	// H^1
 585	.set	H_POW1_X64,	%xmm5	// H^1 * x^64
 586	.set	GFPOLY,		%xmm6
 587
 588	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
 589	movdqu		(GHASH_ACC_PTR), GHASH_ACC
 590	movdqa		OFFSETOF_H_POWERS+7*16(KEY), H_POW1
 591	movdqa		OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
 592	movq		.Lgfpoly(%rip), GFPOLY
 593
 594	// Process the AAD one full block at a time.
 595	sub		$16, AADLEN
 596	jl		.Laad_loop_1x_done\@
 597.Laad_loop_1x\@:
 598	movdqu		(AAD), %xmm0
 599	pshufb		BSWAP_MASK, %xmm0
 600	pxor		%xmm0, GHASH_ACC
 601	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
 602	add		$16, AAD
 603	sub		$16, AADLEN
 604	jge		.Laad_loop_1x\@
 605.Laad_loop_1x_done\@:
 606	// Check whether there is a partial block at the end.
 607	add		$16, AADLEN
 608	jz		.Laad_done\@
 609
 610	// Process a partial block of length 1 <= AADLEN <= 15.
 611	// _load_partial_block assumes that %ecx contains AADLEN.
 612	_load_partial_block	AAD, %xmm0, %r10, %r10d
 613	pshufb		BSWAP_MASK, %xmm0
 614	pxor		%xmm0, GHASH_ACC
 615	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
 616
 617.Laad_done\@:
 618	movdqu		GHASH_ACC, (GHASH_ACC_PTR)
 619	RET
 620.endm
 621
 622// Increment LE_CTR eight times to generate eight little-endian counter blocks,
 623// swap each to big-endian, and store them in AESDATA[0-7].  Also XOR them with
 624// the zero-th AES round key.  Clobbers TMP0 and TMP1.
 625.macro	_ctr_begin_8x
 626	movq		.Lone(%rip), TMP0
 627	movdqa		(KEY), TMP1		// zero-th round key
 628.irp i, 0,1,2,3,4,5,6,7
 629	_vpshufb	BSWAP_MASK, LE_CTR, AESDATA\i
 630	pxor		TMP1, AESDATA\i
 631	paddd		TMP0, LE_CTR
 632.endr
 633.endm
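
// Note that .Lone, loaded via movq, has a 1 only in its low 32 bits, so the
// paddd above increments just the low 32-bit word of LE_CTR and wraps it
// modulo 2^32, as the GCM standard requires.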
 634
 635// Do a non-last round of AES on AESDATA[0-7] using \round_key.
 636.macro	_aesenc_8x	round_key
 637.irp i, 0,1,2,3,4,5,6,7
 638	aesenc		\round_key, AESDATA\i
 639.endr
 640.endm
 641
 642// Do the last round of AES on AESDATA[0-7] using \round_key.
 643.macro	_aesenclast_8x	round_key
 644.irp i, 0,1,2,3,4,5,6,7
 645	aesenclast	\round_key, AESDATA\i
 646.endr
 647.endm
 648
 649// XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and
 650// store the result to DST.  Clobbers TMP0.
 651.macro	_xor_data_8x
 652.irp i, 0,1,2,3,4,5,6,7
 653	_xor_mem_to_reg	\i*16(SRC), AESDATA\i, tmp=TMP0
 654.endr
 655.irp i, 0,1,2,3,4,5,6,7
 656	movdqu		AESDATA\i, \i*16(DST)
 657.endr
 658.endm
 659
 660// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_aesni *key,
 661//					  const u32 le_ctr[4], u8 ghash_acc[16],
 662//					  const u8 *src, u8 *dst, int datalen);
 663//
 664// This macro generates a GCM encryption or decryption update function with the
 665// above prototype (with \enc selecting which one).
 666//
 667// This function computes the next portion of the CTR keystream, XOR's it with
 668// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted
 669// data to |dst|.  It also updates the GHASH accumulator |ghash_acc| using the
 670// next |datalen| ciphertext bytes.
 671//
 672// |datalen| must be a multiple of 16, except on the last call where it can be
 673// any length.  The caller must do any buffering needed to ensure this.  Both
 674// in-place and out-of-place en/decryption are supported.
 675//
 676// |le_ctr| must give the current counter in little-endian format.  For a new
 677// message, the low word of the counter must be 2.  This function loads the
 678// counter from |le_ctr| and increments the loaded counter as needed, but it
 679// does *not* store the updated counter back to |le_ctr|.  The caller must
 680// update |le_ctr| if any more data segments follow.  Internally, only the low
 681// 32-bit word of the counter is incremented, following the GCM standard.
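//
// As a rough usage sketch (the real callers are the C glue functions, which
// handle the required buffering and the exact packing of |le_ctr|), a
// single-segment encryption looks something like:
//
//	aes_gcm_precompute_aesni(key);		// once per key
//	memset(ghash_acc, 0, 16);
//	aes_gcm_aad_update_aesni(key, ghash_acc, aad, aadlen);
//	le_ctr[0] = 2;		// remaining words are derived from the IV
//	aes_gcm_enc_update_aesni(key, le_ctr, ghash_acc, src, dst, datalen);
//	aes_gcm_enc_final_aesni(key, le_ctr, ghash_acc, aadlen, datalen);
//	// ghash_acc now holds the computed 16-byte authentication tag.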
 682.macro	_aes_gcm_update	enc
 683
 684	// Function arguments
 685	.set	KEY,		%rdi
 686	.set	LE_CTR_PTR,	%rsi	// Note: overlaps with usage as temp reg
 687	.set	GHASH_ACC_PTR,	%rdx
 688	.set	SRC,		%rcx
 689	.set	DST,		%r8
 690	.set	DATALEN,	%r9d
 691	.set	DATALEN64,	%r9	// Zero-extend DATALEN before using!
 692	// Note: the code setting up for _load_partial_block assumes that SRC is
 693	// in %rcx (and that DATALEN is *not* in %rcx).
 694
 695	// Additional local variables
 696
 697	// %rax and %rsi are used as temporary registers.  Note: %rsi overlaps
 698	// with LE_CTR_PTR, which is used only at the beginning.
 699
 700	.set	AESKEYLEN,	%r10d	// AES key length in bytes
 701	.set	AESKEYLEN64,	%r10
 702	.set	RNDKEYLAST_PTR,	%r11	// Pointer to last AES round key
 703
 704	// Put the most frequently used values in %xmm0-%xmm7 to reduce code
 705	// size.  (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.)
 706	.set	TMP0,		%xmm0
 707	.set	TMP1,		%xmm1
 708	.set	TMP2,		%xmm2
 709	.set	LO,		%xmm3	// Low part of unreduced product
 710	.set	MI,		%xmm4	// Middle part of unreduced product
 711	.set	GHASH_ACC,	%xmm5	// GHASH accumulator; in main loop also
 712					// the high part of unreduced product
 713	.set	BSWAP_MASK,	%xmm6	// Shuffle mask for reflecting bytes
 714	.set	LE_CTR,		%xmm7	// Little-endian counter value
 715	.set	AESDATA0,	%xmm8
 716	.set	AESDATA1,	%xmm9
 717	.set	AESDATA2,	%xmm10
 718	.set	AESDATA3,	%xmm11
 719	.set	AESDATA4,	%xmm12
 720	.set	AESDATA5,	%xmm13
 721	.set	AESDATA6,	%xmm14
 722	.set	AESDATA7,	%xmm15
 723
 724	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
 725	movdqu		(GHASH_ACC_PTR), GHASH_ACC
 726	movdqu		(LE_CTR_PTR), LE_CTR
 727
 728	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
 729	lea		6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR
 730
 731	// If there are at least 8*16 bytes of data, then continue into the main
 732	// loop, which processes 8*16 bytes of data per iteration.
 733	//
 734	// The main loop interleaves AES and GHASH to improve performance on
 735	// CPUs that can execute these instructions in parallel.  When
 736	// decrypting, the GHASH input (the ciphertext) is immediately
 737	// available.  When encrypting, we instead encrypt a set of 8 blocks
 738	// first and then GHASH those blocks while encrypting the next set of 8,
 739	// repeat that as needed, and finally GHASH the last set of 8 blocks.
 740	//
 741	// Code size optimization: Prefer adding or subtracting -8*16 over 8*16,
 742	// as this makes the immediate fit in a signed byte, saving 3 bytes.
 743	add		$-8*16, DATALEN
 744	jl		.Lcrypt_loop_8x_done\@
 745.if \enc
 746	// Encrypt the first 8 plaintext blocks.
 747	_ctr_begin_8x
 748	lea		16(KEY), %rsi
 749	.p2align 4
 7501:
 751	movdqa		(%rsi), TMP0
 752	_aesenc_8x	TMP0
 753	add		$16, %rsi
 754	cmp		%rsi, RNDKEYLAST_PTR
 755	jne		1b
 756	movdqa		(%rsi), TMP0
 757	_aesenclast_8x	TMP0
 758	_xor_data_8x
 759	// Don't increment DST until the ciphertext blocks have been hashed.
 760	sub		$-8*16, SRC
 761	add		$-8*16, DATALEN
 762	jl		.Lghash_last_ciphertext_8x\@
 763.endif
 764
 765	.p2align 4
 766.Lcrypt_loop_8x\@:
 767
 768	// Generate the next set of 8 counter blocks and start encrypting them.
 769	_ctr_begin_8x
 770	lea		16(KEY), %rsi
 771
 772	// Do a round of AES, and start the GHASH update of 8 ciphertext blocks
 773	// by doing the unreduced multiplication for the first ciphertext block.
 774	movdqa		(%rsi), TMP0
 775	add		$16, %rsi
 776	_aesenc_8x	TMP0
 777	_ghash_update_begin_8x \enc
 778
 779	// Do 7 more rounds of AES, and continue the GHASH update by doing the
 780	// unreduced multiplication for the remaining ciphertext blocks.
 781	.p2align 4
 7821:
 783	movdqa		(%rsi), TMP0
 784	add		$16, %rsi
 785	_aesenc_8x	TMP0
 786	_ghash_update_continue_8x \enc
 787	cmp		$7*8, %eax
 788	jne		1b
 789
 790	// Do the remaining AES rounds.
 791	.p2align 4
 7921:
 793	movdqa		(%rsi), TMP0
 794	add		$16, %rsi
 795	_aesenc_8x	TMP0
 796	cmp		%rsi, RNDKEYLAST_PTR
 797	jne		1b
 798
 799	// Do the GHASH reduction and the last round of AES.
 800	movdqa		(RNDKEYLAST_PTR), TMP0
 801	_ghash_update_end_8x_step	0
 802	_aesenclast_8x	TMP0
 803	_ghash_update_end_8x_step	1
 804
 805	// XOR the data with the AES-CTR keystream blocks.
 806.if \enc
 807	sub		$-8*16, DST
 808.endif
 809	_xor_data_8x
 810	sub		$-8*16, SRC
 811.if !\enc
 812	sub		$-8*16, DST
 813.endif
 814	add		$-8*16, DATALEN
 815	jge		.Lcrypt_loop_8x\@
 816
 817.if \enc
 818.Lghash_last_ciphertext_8x\@:
 819	// Update GHASH with the last set of 8 ciphertext blocks.
 820	_ghash_update_begin_8x		\enc
 821	.p2align 4
 8221:
 823	_ghash_update_continue_8x	\enc
 824	cmp		$7*8, %eax
 825	jne		1b
 826	_ghash_update_end_8x_step	0
 827	_ghash_update_end_8x_step	1
 828	sub		$-8*16, DST
 829.endif
 830
 831.Lcrypt_loop_8x_done\@:
 832
 833	sub		$-8*16, DATALEN
 834	jz		.Ldone\@
 835
 836	// Handle the remainder of length 1 <= DATALEN < 8*16 bytes.  We keep
 837	// things simple and keep the code size down by just going one block at
 838	// a time, again taking advantage of hardware loop unrolling.  Since
 839	// there are enough key powers available for all remaining data, we do
 840	// the GHASH multiplications unreduced, and only reduce at the very end.
 841
 842	.set	HI,		TMP2
 843	.set	H_POW,		AESDATA0
 844	.set	H_POW_XORED,	AESDATA1
 845	.set	ONE,		AESDATA2
 846
 847	movq		.Lone(%rip), ONE
 848
 849	// Start collecting the unreduced GHASH intermediate value LO, MI, HI.
 850	pxor		LO, LO
 851	pxor		MI, MI
 852	pxor		HI, HI
 853
 854	// Set up a block counter %rax to contain 8*(8-n), where n is the number
 855	// of blocks that remain, counting any partial block.  This will be used
 856	// to access the key powers H^n through H^1.
 857	mov		DATALEN, %eax
 858	neg		%eax
 859	and		$~15, %eax
 860	sar		$1, %eax
 861	add		$64, %eax
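	// (Example: DATALEN = 40 means n = 3 blocks remain including the
	// partial one, and the above computes %eax = 40 = 8*(8-3), so the
	// first key power loaded below is h_powers[5] = H^3.)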
 862
 863	sub		$16, DATALEN
 864	jl		.Lcrypt_loop_1x_done\@
 865
 866	// Process the data one full block at a time.
 867.Lcrypt_loop_1x\@:
 868
 869	// Encrypt the next counter block.
 870	_vpshufb	BSWAP_MASK, LE_CTR, TMP0
 871	paddd		ONE, LE_CTR
 872	pxor		(KEY), TMP0
 873	lea		-6*16(RNDKEYLAST_PTR), %rsi	// Reduce code size
 874	cmp		$24, AESKEYLEN
 875	jl		128f	// AES-128?
 876	je		192f	// AES-192?
 877	// AES-256
 878	aesenc		-7*16(%rsi), TMP0
 879	aesenc		-6*16(%rsi), TMP0
 880192:
 881	aesenc		-5*16(%rsi), TMP0
 882	aesenc		-4*16(%rsi), TMP0
 883128:
 884.irp i, -3,-2,-1,0,1,2,3,4,5
 885	aesenc		\i*16(%rsi), TMP0
 886.endr
 887	aesenclast	(RNDKEYLAST_PTR), TMP0
 888
 889	// Load the next key power H^i.
 890	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
 891	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED
 892
 893	// XOR the keystream block that was just generated in TMP0 with the next
 894	// source data block and store the resulting en/decrypted data to DST.
 895.if \enc
 896	_xor_mem_to_reg	(SRC), TMP0, tmp=TMP1
 897	movdqu		TMP0, (DST)
 898.else
 899	movdqu		(SRC), TMP1
 900	pxor		TMP1, TMP0
 901	movdqu		TMP0, (DST)
 902.endif
 903
 904	// Update GHASH with the ciphertext block.
 905.if \enc
 906	pshufb		BSWAP_MASK, TMP0
 907	pxor		TMP0, GHASH_ACC
 908.else
 909	pshufb		BSWAP_MASK, TMP1
 910	pxor		TMP1, GHASH_ACC
 911.endif
 912	_ghash_mul_noreduce	H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0
 913	pxor		GHASH_ACC, GHASH_ACC
 914
 915	add		$8, %eax
 916	add		$16, SRC
 917	add		$16, DST
 918	sub		$16, DATALEN
 919	jge		.Lcrypt_loop_1x\@
 920.Lcrypt_loop_1x_done\@:
 921	// Check whether there is a partial block at the end.
 922	add		$16, DATALEN
 923	jz		.Lghash_reduce\@
 924
 925	// Process a partial block of length 1 <= DATALEN <= 15.
 926
 927	// Encrypt a counter block for the last time.
 928	pshufb		BSWAP_MASK, LE_CTR
 929	pxor		(KEY), LE_CTR
 930	lea		16(KEY), %rsi
 9311:
 932	aesenc		(%rsi), LE_CTR
 933	add		$16, %rsi
 934	cmp		%rsi, RNDKEYLAST_PTR
 935	jne		1b
 936	aesenclast	(RNDKEYLAST_PTR), LE_CTR
 937
 938	// Load the lowest key power, H^1.
 939	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
 940	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED
 941
 942	// Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC.  SRC is
 943	// in %rcx, but _load_partial_block needs DATALEN in %rcx instead.
 944	// RNDKEYLAST_PTR is no longer needed, so reuse it for SRC.
 945	mov		SRC, RNDKEYLAST_PTR
 946	mov		DATALEN, %ecx
 947	_load_partial_block	RNDKEYLAST_PTR, TMP0, %rsi, %esi
 948
 949	// XOR the keystream block that was just generated in LE_CTR with the
 950	// source data block and store the resulting en/decrypted data to DST.
 951	pxor		TMP0, LE_CTR
 952	mov		DATALEN, %ecx
 953	_store_partial_block	LE_CTR, DST
 954
 955	// If encrypting, zero-pad the final ciphertext block for GHASH.  (If
 956	// decrypting, this was already done by _load_partial_block.)
 957.if \enc
 958	lea		.Lzeropad_mask+16(%rip), %rax
 959	sub		DATALEN64, %rax
 960	_vpand		(%rax), LE_CTR, TMP0
 961.endif
 962
 963	// Update GHASH with the final ciphertext block.
 964	pshufb		BSWAP_MASK, TMP0
 965	pxor		TMP0, GHASH_ACC
 966	_ghash_mul_noreduce	H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0
 967
 968.Lghash_reduce\@:
 969	// Finally, do the GHASH reduction.
 970	_ghash_reduce	LO, MI, HI, GHASH_ACC, TMP0
 971
 972.Ldone\@:
 973	// Store the updated GHASH accumulator back to memory.
 974	movdqu		GHASH_ACC, (GHASH_ACC_PTR)
 975
 976	RET
 977.endm
 978
 979// void aes_gcm_enc_final_##suffix(const struct aes_gcm_key_aesni *key,
 980//				   const u32 le_ctr[4], u8 ghash_acc[16],
 981//				   u64 total_aadlen, u64 total_datalen);
 982// bool aes_gcm_dec_final_##suffix(const struct aes_gcm_key_aesni *key,
 983//				   const u32 le_ctr[4], const u8 ghash_acc[16],
 984//				   u64 total_aadlen, u64 total_datalen,
 985//				   const u8 tag[16], int taglen);
 986//
 987// This macro generates one of the above two functions (with \enc selecting
 988// which one).  Both functions finish computing the GCM authentication tag by
 989// updating GHASH with the lengths block and encrypting the GHASH accumulator.
 990// |total_aadlen| and |total_datalen| must be the total length of the additional
 991// authenticated data and the en/decrypted data in bytes, respectively.
 992//
 993// The encryption function then stores the full-length (16-byte) computed
 994// authentication tag to |ghash_acc|.  The decryption function instead loads the
 995// expected authentication tag (the one that was transmitted) from the 16-byte
 996// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the
 997// computed tag in constant time, and returns true if and only if they match.
 998.macro	_aes_gcm_final	enc
 999
1000	// Function arguments
1001	.set	KEY,		%rdi
1002	.set	LE_CTR_PTR,	%rsi
1003	.set	GHASH_ACC_PTR,	%rdx
1004	.set	TOTAL_AADLEN,	%rcx
1005	.set	TOTAL_DATALEN,	%r8
1006	.set	TAG,		%r9
1007	.set	TAGLEN,		%r10d	// Originally at 8(%rsp)
1008	.set	TAGLEN64,	%r10
1009
1010	// Additional local variables.
1011	// %rax and %xmm0-%xmm2 are used as temporary registers.
1012	.set	AESKEYLEN,	%r11d
1013	.set	AESKEYLEN64,	%r11
1014	.set	BSWAP_MASK,	%xmm3
1015	.set	GHASH_ACC,	%xmm4
1016	.set	H_POW1,		%xmm5	// H^1
1017	.set	H_POW1_X64,	%xmm6	// H^1 * x^64
1018	.set	GFPOLY,		%xmm7
1019
1020	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
1021	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
1022
1023	// Set up a counter block with 1 in the low 32-bit word.  This is the
1024	// counter that produces the ciphertext needed to encrypt the auth tag.
1025	movdqu		(LE_CTR_PTR), %xmm0
1026	mov		$1, %eax
1027	pinsrd		$0, %eax, %xmm0
1028
1029	// Build the lengths block and XOR it into the GHASH accumulator.
1030	movq		TOTAL_DATALEN, GHASH_ACC
1031	pinsrq		$1, TOTAL_AADLEN, GHASH_ACC
1032	psllq		$3, GHASH_ACC	// Bytes to bits
1033	_xor_mem_to_reg	(GHASH_ACC_PTR), GHASH_ACC, %xmm1
1034
1035	movdqa		OFFSETOF_H_POWERS+7*16(KEY), H_POW1
1036	movdqa		OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
1037	movq		.Lgfpoly(%rip), GFPOLY
1038
1039	// Make %rax point to the 6th from last AES round key.  (Using signed
1040	// byte offsets -7*16 through 6*16 decreases code size.)
1041	lea		(KEY,AESKEYLEN64,4), %rax
1042
1043	// AES-encrypt the counter block and also multiply GHASH_ACC by H^1.
1044	// Interleave the AES and GHASH instructions to improve performance.
1045	pshufb		BSWAP_MASK, %xmm0
1046	pxor		(KEY), %xmm0
1047	cmp		$24, AESKEYLEN
1048	jl		128f	// AES-128?
1049	je		192f	// AES-192?
1050	// AES-256
1051	aesenc		-7*16(%rax), %xmm0
1052	aesenc		-6*16(%rax), %xmm0
1053192:
1054	aesenc		-5*16(%rax), %xmm0
1055	aesenc		-4*16(%rax), %xmm0
1056128:
1057.irp i, 0,1,2,3,4,5,6,7,8
1058	aesenc		(\i-3)*16(%rax), %xmm0
1059	_ghash_mul_step	\i, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2
1060.endr
1061	aesenclast	6*16(%rax), %xmm0
1062	_ghash_mul_step	9, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2
1063
1064	// Undo the byte reflection of the GHASH accumulator.
1065	pshufb		BSWAP_MASK, GHASH_ACC
1066
1067	// Encrypt the GHASH accumulator.
1068	pxor		%xmm0, GHASH_ACC
1069
1070.if \enc
1071	// Return the computed auth tag.
1072	movdqu		GHASH_ACC, (GHASH_ACC_PTR)
1073.else
1074	.set		ZEROPAD_MASK_PTR, TOTAL_AADLEN // Reusing TOTAL_AADLEN!
1075
1076	// Verify the auth tag in constant time by XOR'ing the transmitted and
1077	// computed auth tags together and using the ptest instruction to check
1078	// whether the first TAGLEN bytes of the result are zero.
1079	_xor_mem_to_reg	(TAG), GHASH_ACC, tmp=%xmm0
1080	movl		8(%rsp), TAGLEN
1081	lea		.Lzeropad_mask+16(%rip), ZEROPAD_MASK_PTR
1082	sub		TAGLEN64, ZEROPAD_MASK_PTR
1083	xor		%eax, %eax
1084	_test_mem	(ZEROPAD_MASK_PTR), GHASH_ACC, tmp=%xmm0
1085	sete		%al
1086.endif
1087	RET
1088.endm
1089
1090.set	USE_AVX, 0
1091SYM_FUNC_START(aes_gcm_precompute_aesni)
1092	_aes_gcm_precompute
1093SYM_FUNC_END(aes_gcm_precompute_aesni)
1094SYM_FUNC_START(aes_gcm_aad_update_aesni)
1095	_aes_gcm_aad_update
1096SYM_FUNC_END(aes_gcm_aad_update_aesni)
1097SYM_FUNC_START(aes_gcm_enc_update_aesni)
1098	_aes_gcm_update	1
1099SYM_FUNC_END(aes_gcm_enc_update_aesni)
1100SYM_FUNC_START(aes_gcm_dec_update_aesni)
1101	_aes_gcm_update	0
1102SYM_FUNC_END(aes_gcm_dec_update_aesni)
1103SYM_FUNC_START(aes_gcm_enc_final_aesni)
1104	_aes_gcm_final	1
1105SYM_FUNC_END(aes_gcm_enc_final_aesni)
1106SYM_FUNC_START(aes_gcm_dec_final_aesni)
1107	_aes_gcm_final	0
1108SYM_FUNC_END(aes_gcm_dec_final_aesni)
1109
1110.set	USE_AVX, 1
1111SYM_FUNC_START(aes_gcm_precompute_aesni_avx)
1112	_aes_gcm_precompute
1113SYM_FUNC_END(aes_gcm_precompute_aesni_avx)
1114SYM_FUNC_START(aes_gcm_aad_update_aesni_avx)
1115	_aes_gcm_aad_update
1116SYM_FUNC_END(aes_gcm_aad_update_aesni_avx)
1117SYM_FUNC_START(aes_gcm_enc_update_aesni_avx)
1118	_aes_gcm_update	1
1119SYM_FUNC_END(aes_gcm_enc_update_aesni_avx)
1120SYM_FUNC_START(aes_gcm_dec_update_aesni_avx)
1121	_aes_gcm_update	0
1122SYM_FUNC_END(aes_gcm_dec_update_aesni_avx)
1123SYM_FUNC_START(aes_gcm_enc_final_aesni_avx)
1124	_aes_gcm_final	1
1125SYM_FUNC_END(aes_gcm_enc_final_aesni_avx)
1126SYM_FUNC_START(aes_gcm_dec_final_aesni_avx)
1127	_aes_gcm_final	0
1128SYM_FUNC_END(aes_gcm_dec_final_aesni_avx)