   1// SPDX-License-Identifier: GPL-2.0 OR MIT
   2/*
   3 * Copyright (C) 2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
   4 * Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation
   5 */
   6
   7#include <crypto/curve25519.h>
   8#include <crypto/internal/kpp.h>
   9
  10#include <linux/types.h>
  11#include <linux/jump_label.h>
  12#include <linux/kernel.h>
  13#include <linux/module.h>
  14#include <linux/scatterlist.h>
  15
  16#include <asm/cpufeature.h>
  17#include <asm/processor.h>
  18
  19static __always_inline u64 eq_mask(u64 a, u64 b)
  20{
  21	u64 x = a ^ b;
  22	u64 minus_x = ~x + (u64)1U;
  23	u64 x_or_minus_x = x | minus_x;
  24	u64 xnx = x_or_minus_x >> (u32)63U;
  25	return xnx - (u64)1U;
  26}
  27
  28static __always_inline u64 gte_mask(u64 a, u64 b)
  29{
  30	u64 x = a;
  31	u64 y = b;
  32	u64 x_xor_y = x ^ y;
  33	u64 x_sub_y = x - y;
  34	u64 x_sub_y_xor_y = x_sub_y ^ y;
  35	u64 q = x_xor_y | x_sub_y_xor_y;
  36	u64 x_xor_q = x ^ q;
  37	u64 x_xor_q_ = x_xor_q >> (u32)63U;
  38	return x_xor_q_ - (u64)1U;
  39}
  40
/* Computes the addition of four-element (4x64-bit little-endian limbs) f1
 * with the 64-bit value f2: out = f1 + f2.
 * Returns the carry out of the top limb (0 or 1). out may alias f1.
 * Requires ADX (adcx); clobbers r8-r11. */
static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
{
	u64 carry_r;

	asm volatile(
		/* Clear registers to propagate the carry bit */
		"  xor %%r8d, %%r8d;"
		"  xor %%r9d, %%r9d;"
		"  xor %%r10d, %%r10d;"
		"  xor %%r11d, %%r11d;"
		"  xor %k1, %k1;"

		/* Begin addition chain: f2 += f1[0], then ripple the carry
		 * through the zeroed r8-r11 as f1[1..3] are added in. */
		"  addq 0(%3), %0;"
		"  movq %0, 0(%2);"
		"  adcxq 8(%3), %%r8;"
		"  movq %%r8, 8(%2);"
		"  adcxq 16(%3), %%r9;"
		"  movq %%r9, 16(%2);"
		"  adcxq 24(%3), %%r10;"
		"  movq %%r10, 24(%2);"

		/* Return the carry bit in a register */
		"  adcx %%r11, %1;"
		: "+&r"(f2), "=&r"(carry_r)
		: "r"(out), "r"(f1)
		: "%r8", "%r9", "%r10", "%r11", "memory", "cc");

	return carry_r;
}
  73
/* Computes the field addition of two field elements: out = f1 + f2 mod p,
 * p = 2^255 - 19, elements as 4x64-bit little-endian limbs.
 * Reduction uses 2^256 == 38 (mod p): a carry out of bit 255 is folded
 * back in as +38. out may alias f1 or f2. */
static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
{
	asm volatile(
		/* Compute the raw addition of f1 + f2 */
		"  movq 0(%0), %%r8;"
		"  addq 0(%2), %%r8;"
		"  movq 8(%0), %%r9;"
		"  adcxq 8(%2), %%r9;"
		"  movq 16(%0), %%r10;"
		"  adcxq 16(%2), %%r10;"
		"  movq 24(%0), %%r11;"
		"  adcxq 24(%2), %%r11;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 (cmovc keeps this branch-free;
		 * %0 — the f2 register — is dead here and reused for the
		 * constant, hence the "+&r" constraint) */
		"  mov $0, %%rax;"
		"  mov $38, %0;"
		"  cmovc %0, %%rax;"

		/* Step 2: Add carry*38 to the original sum */
		"  xor %%ecx, %%ecx;"
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %0, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%1);"
		: "+&r"(f2)
		: "r"(out), "r"(f1)
		: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
}
 114
/* Computes the field subtraction of two field elements: out = f1 - f2 mod p,
 * p = 2^255 - 19. A borrow out of the top limb is compensated by
 * subtracting 38 (since -2^256 == -38 mod p), branch-free via cmovc.
 * out may alias f1 or f2. */
static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
{
	asm volatile(
		/* Compute the raw subtraction of f1-f2 */
		"  movq 0(%1), %%r8;"
		"  subq 0(%2), %%r8;"
		"  movq 8(%1), %%r9;"
		"  sbbq 8(%2), %%r9;"
		"  movq 16(%1), %%r10;"
		"  sbbq 16(%2), %%r10;"
		"  movq 24(%1), %%r11;"
		"  sbbq 24(%2), %%r11;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 (rax = 38 only if a borrow occurred) */
		"  mov $0, %%rax;"
		"  mov $38, %%rcx;"
		"  cmovc %%rcx, %%rax;"

		/* Step 2: Subtract carry*38 from the original difference */
		"  sub %%rax, %%r8;"
		"  sbb $0, %%r9;"
		"  sbb $0, %%r10;"
		"  sbb $0, %%r11;"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rcx, %%rax;"
		"  sub %%rax, %%r8;"

		/* Store the result */
		"  movq %%r8, 0(%0);"
		"  movq %%r9, 8(%0);"
		"  movq %%r10, 16(%0);"
		"  movq %%r11, 24(%0);"
		:
		: "r"(out), "r"(f1), "r"(f2)
		: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
}
 156
/* Computes a field multiplication: out <- f1 * f2 mod p, p = 2^255 - 19.
 * Uses the 8-element buffer tmp for the 512-bit raw product.
 * Schoolbook 4x4 multiply using mulx with two independent carry chains
 * (adcx uses CF, adox uses OF), then the high 256 bits are folded into
 * the low 256 bits with weight 38 (2^256 == 38 mod p).
 * Requires BMI2 (mulx) and ADX (adcx/adox); f1, f2 and tmp operands are
 * "+&r" because the pointer registers are repurposed at the end. */
static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
	asm volatile(

		/* Compute the raw multiplication: tmp <- src1 * src2 */

		/* Compute src1[0] * src2 */
		"  movq 0(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  movq %%r8, 0(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  movq %%r10, 8(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"

		/* Compute src1[1] * src2 */
		"  movq 8(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 8(%2), %%r8;"
		"  movq %%r8, 8(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 16(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[2] * src2 */
		"  movq 16(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 16(%2), %%r8;"
		"  movq %%r8, 16(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 24(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[3] * src2 */
		"  movq 24(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 24(%2), %%r8;"
		"  movq %%r8, 24(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 32(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  movq %%rbx, 40(%2);"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  movq %%r14, 48(%2);"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"
		"  movq %%rax, 56(%2);"

		/* Line up pointers: %0 <- tmp (the 512-bit product),
		 * %2 <- out (the reduction destination) */
		"  mov %2, %0;"
		"  mov %3, %2;"

		/* Wrap the result back into the field */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%0), %%r8, %%r13;"
		"  xor %k1, %k1;"
		"  adoxq 0(%0), %%r8;"
		"  mulxq 40(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%0), %%r9;"
		"  mulxq 48(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 16(%0), %%r10;"
		"  mulxq 56(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%0), %%r11;"
		"  adcx %1, %%rax;"
		"  adox %1, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %1, %%r9;"
		"  movq %%r9, 8(%2);"
		"  adcx %1, %%r10;"
		"  movq %%r10, 16(%2);"
		"  adcx %1, %%r11;"
		"  movq %%r11, 24(%2);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%2);"
		: "+&r"(f1), "+&r"(f2), "+&r"(tmp)
		: "r"(out)
		: "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
		  "%r14", "memory", "cc");
}
 289
/* Computes two field multiplications (mod p = 2^255 - 19):
 *   out[0] <- f1[0] * f2[0]
 *   out[1] <- f1[1] * f2[1]
 * where each of out/f1/f2 is a pair of 4-limb field elements stored
 * back-to-back (offsets 0 and 32 bytes).
 * Uses the 16-element buffer tmp for the two 512-bit intermediate products.
 * Same mulx/adcx/adox schoolbook multiply and *38 high-half reduction as
 * fmul(), run twice over the two element pairs. */
static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
	asm volatile(

		/* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */

		/* Compute src1[0] * src2 */
		"  movq 0(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  movq %%r8, 0(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  movq %%r10, 8(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"

		/* Compute src1[1] * src2 */
		"  movq 8(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 8(%2), %%r8;"
		"  movq %%r8, 8(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 16(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[2] * src2 */
		"  movq 16(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 16(%2), %%r8;"
		"  movq %%r8, 16(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 24(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[3] * src2 */
		"  movq 24(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 24(%2), %%r8;"
		"  movq %%r8, 24(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 32(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  movq %%rbx, 40(%2);"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  movq %%r14, 48(%2);"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"
		"  movq %%rax, 56(%2);"

		/* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */

		/* Compute src1[0] * src2 */
		"  movq 32(%0), %%rdx;"
		"  mulxq 32(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  movq %%r8, 64(%2);"
		"  mulxq 40(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  movq %%r10, 72(%2);"
		"  mulxq 48(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  mulxq 56(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"

		/* Compute src1[1] * src2 */
		"  movq 40(%0), %%rdx;"
		"  mulxq 32(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 72(%2), %%r8;"
		"  movq %%r8, 72(%2);"
		"  mulxq 40(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 80(%2);"
		"  mulxq 48(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 56(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[2] * src2 */
		"  movq 48(%0), %%rdx;"
		"  mulxq 32(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 80(%2), %%r8;"
		"  movq %%r8, 80(%2);"
		"  mulxq 40(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 88(%2);"
		"  mulxq 48(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 56(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[3] * src2 */
		"  movq 56(%0), %%rdx;"
		"  mulxq 32(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 88(%2), %%r8;"
		"  movq %%r8, 88(%2);"
		"  mulxq 40(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 96(%2);"
		"  mulxq 48(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  movq %%rbx, 104(%2);"
		"  mov $0, %%r8;"
		"  mulxq 56(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  movq %%r14, 112(%2);"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"
		"  movq %%rax, 120(%2);"

		/* Line up pointers: %0 <- tmp, %2 <- out */
		"  mov %2, %0;"
		"  mov %3, %2;"

		/* Wrap the results back into the field */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%0), %%r8, %%r13;"
		"  xor %k1, %k1;"
		"  adoxq 0(%0), %%r8;"
		"  mulxq 40(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%0), %%r9;"
		"  mulxq 48(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 16(%0), %%r10;"
		"  mulxq 56(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%0), %%r11;"
		"  adcx %1, %%rax;"
		"  adox %1, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %1, %%r9;"
		"  movq %%r9, 8(%2);"
		"  adcx %1, %%r10;"
		"  movq %%r10, 16(%2);"
		"  adcx %1, %%r11;"
		"  movq %%r11, 24(%2);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%2);"

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 96(%0), %%r8, %%r13;"
		"  xor %k1, %k1;"
		"  adoxq 64(%0), %%r8;"
		"  mulxq 104(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 72(%0), %%r9;"
		"  mulxq 112(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 80(%0), %%r10;"
		"  mulxq 120(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 88(%0), %%r11;"
		"  adcx %1, %%rax;"
		"  adox %1, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %1, %%r9;"
		"  movq %%r9, 40(%2);"
		"  adcx %1, %%r10;"
		"  movq %%r10, 48(%2);"
		"  adcx %1, %%r11;"
		"  movq %%r11, 56(%2);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 32(%2);"
		: "+&r"(f1), "+&r"(f2), "+&r"(tmp)
		: "r"(out)
		: "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
		  "%r14", "memory", "cc");
}
 540
/* Computes the field multiplication of four-element f1 with value in f2:
 * out = f1 * f2 mod p, p = 2^255 - 19.
 * Requires f2 to be smaller than 2^17 (so the folded carry * 38 cannot
 * overflow the reduction below).
 * f2 is pinned to rdx because mulx takes one implicit operand from rdx. */
static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
{
	register u64 f2_r asm("rdx") = f2;

	asm volatile(
		/* Compute the raw multiplication of f1*f2 */
		"  mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
		"  mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */
		"  add %%rcx, %%r9;"
		"  mov $0, %%rcx;"
		"  mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
		"  adcx %%rbx, %%r10;"
		"  mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
		"  adcx %%r13, %%r11;"
		"  adcx %%rcx, %%rax;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 */
		"  mov $38, %%rdx;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%1);"
		: "+&r"(f2_r)
		: "r"(out), "r"(f1)
		: "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r13",
		  "memory", "cc");
}
 584
/* Constant-time conditional swap: if bit == 1, exchanges the two 8-limb
 * buffers p1 and p2; if bit == 0, leaves both unchanged. Despite the
 * const qualifiers, BOTH buffers are written through the "memory"
 * clobber (this matches the upstream HACL*-generated signature).
 * bit must be 0 or 1: adding 2^64-1 moves it into CF, and cmovc then
 * selects without branching, so timing is independent of bit. */
static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
{
	asm volatile(
		/* Transfer bit into CF flag */
		"  add $18446744073709551615, %0;"

		/* cswap p1[0], p2[0] */
		"  movq 0(%1), %%r8;"
		"  movq 0(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 0(%1);"
		"  movq %%r9, 0(%2);"

		/* cswap p1[1], p2[1] */
		"  movq 8(%1), %%r8;"
		"  movq 8(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 8(%1);"
		"  movq %%r9, 8(%2);"

		/* cswap p1[2], p2[2] */
		"  movq 16(%1), %%r8;"
		"  movq 16(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 16(%1);"
		"  movq %%r9, 16(%2);"

		/* cswap p1[3], p2[3] */
		"  movq 24(%1), %%r8;"
		"  movq 24(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 24(%1);"
		"  movq %%r9, 24(%2);"

		/* cswap p1[4], p2[4] */
		"  movq 32(%1), %%r8;"
		"  movq 32(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 32(%1);"
		"  movq %%r9, 32(%2);"

		/* cswap p1[5], p2[5] */
		"  movq 40(%1), %%r8;"
		"  movq 40(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 40(%1);"
		"  movq %%r9, 40(%2);"

		/* cswap p1[6], p2[6] */
		"  movq 48(%1), %%r8;"
		"  movq 48(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 48(%1);"
		"  movq %%r9, 48(%2);"

		/* cswap p1[7], p2[7] */
		"  movq 56(%1), %%r8;"
		"  movq 56(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 56(%1);"
		"  movq %%r9, 56(%2);"
		: "+&r"(bit)
		: "r"(p1), "r"(p2)
		: "%r8", "%r9", "%r10", "memory", "cc");
}
 667
/* Computes the square of a field element: out <- f * f mod p, p = 2^255 - 19.
 * Uses the 8-element buffer tmp for the 512-bit intermediate square.
 * Squaring shortcut: compute the off-diagonal partial products once,
 * double them with an add-to-self carry chain, then add the diagonal
 * squares f[i]^2; finally reduce the high half by *38 as in fmul(). */
static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
{
	asm volatile(
		/* Compute the raw multiplication: tmp <- f * f */

		/* Step 1: Compute all partial products */
		"  movq 0(%0), %%rdx;" /* f[0] */
		"  mulxq 8(%0), %%r8, %%r14;"
		"  xor %%r15d, %%r15d;" /* f[1]*f[0] */
		"  mulxq 16(%0), %%r9, %%r10;"
		"  adcx %%r14, %%r9;" /* f[2]*f[0] */
		"  mulxq 24(%0), %%rax, %%rcx;"
		"  adcx %%rax, %%r10;" /* f[3]*f[0] */
		"  movq 24(%0), %%rdx;" /* f[3] */
		"  mulxq 8(%0), %%r11, %%rbx;"
		"  adcx %%rcx, %%r11;" /* f[1]*f[3] */
		"  mulxq 16(%0), %%rax, %%r13;"
		"  adcx %%rax, %%rbx;" /* f[2]*f[3] */
		"  movq 8(%0), %%rdx;"
		"  adcx %%r15, %%r13;" /* f1 */
		"  mulxq 16(%0), %%rax, %%rcx;"
		"  mov $0, %%r14;" /* f[2]*f[1] */

		/* Step 2: Compute two parallel carry chains
		 * (adox merges f[2]*f[1]; adcx doubles the off-diagonal terms) */
		"  xor %%r15d, %%r15d;"
		"  adox %%rax, %%r10;"
		"  adcx %%r8, %%r8;"
		"  adox %%rcx, %%r11;"
		"  adcx %%r9, %%r9;"
		"  adox %%r15, %%rbx;"
		"  adcx %%r10, %%r10;"
		"  adox %%r15, %%r13;"
		"  adcx %%r11, %%r11;"
		"  adox %%r15, %%r14;"
		"  adcx %%rbx, %%rbx;"
		"  adcx %%r13, %%r13;"
		"  adcx %%r14, %%r14;"

		/* Step 3: Compute intermediate squares */
		"  movq 0(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
		"  movq %%rax, 0(%1);"
		"  add %%rcx, %%r8;"
		"  movq %%r8, 8(%1);"
		"  movq 8(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
		"  adcx %%rax, %%r9;"
		"  movq %%r9, 16(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 24(%1);"
		"  movq 16(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
		"  adcx %%rax, %%r11;"
		"  movq %%r11, 32(%1);"
		"  adcx %%rcx, %%rbx;"
		"  movq %%rbx, 40(%1);"
		"  movq 24(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
		"  adcx %%rax, %%r13;"
		"  movq %%r13, 48(%1);"
		"  adcx %%rcx, %%r14;"
		"  movq %%r14, 56(%1);"

		/* Line up pointers: %0 <- tmp, %1 <- out */
		"  mov %1, %0;"
		"  mov %2, %1;"

		/* Wrap the result back into the field */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%0), %%r8, %%r13;"
		"  xor %%ecx, %%ecx;"
		"  adoxq 0(%0), %%r8;"
		"  mulxq 40(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%0), %%r9;"
		"  mulxq 48(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 16(%0), %%r10;"
		"  mulxq 56(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%0), %%r11;"
		"  adcx %%rcx, %%rax;"
		"  adox %%rcx, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%1);"
		: "+&r"(f), "+&r"(tmp)
		: "r"(out)
		: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
		  "%r13", "%r14", "%r15", "memory", "cc");
}
 776
/* Computes two field squarings (mod p = 2^255 - 19):
 *   out[0] <- f[0] * f[0]
 *   out[1] <- f[1] * f[1]
 * where out and f each hold two 4-limb field elements back-to-back.
 * Uses the 16-element buffer tmp for intermediate results.
 * Same partial-product/doubling/diagonal scheme as fsqr(), run twice. */
static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
{
	asm volatile(
		/* Step 1: Compute all partial products */
		"  movq 0(%0), %%rdx;" /* f[0] */
		"  mulxq 8(%0), %%r8, %%r14;"
		"  xor %%r15d, %%r15d;" /* f[1]*f[0] */
		"  mulxq 16(%0), %%r9, %%r10;"
		"  adcx %%r14, %%r9;" /* f[2]*f[0] */
		"  mulxq 24(%0), %%rax, %%rcx;"
		"  adcx %%rax, %%r10;" /* f[3]*f[0] */
		"  movq 24(%0), %%rdx;" /* f[3] */
		"  mulxq 8(%0), %%r11, %%rbx;"
		"  adcx %%rcx, %%r11;" /* f[1]*f[3] */
		"  mulxq 16(%0), %%rax, %%r13;"
		"  adcx %%rax, %%rbx;" /* f[2]*f[3] */
		"  movq 8(%0), %%rdx;"
		"  adcx %%r15, %%r13;" /* f1 */
		"  mulxq 16(%0), %%rax, %%rcx;"
		"  mov $0, %%r14;" /* f[2]*f[1] */

		/* Step 2: Compute two parallel carry chains */
		"  xor %%r15d, %%r15d;"
		"  adox %%rax, %%r10;"
		"  adcx %%r8, %%r8;"
		"  adox %%rcx, %%r11;"
		"  adcx %%r9, %%r9;"
		"  adox %%r15, %%rbx;"
		"  adcx %%r10, %%r10;"
		"  adox %%r15, %%r13;"
		"  adcx %%r11, %%r11;"
		"  adox %%r15, %%r14;"
		"  adcx %%rbx, %%rbx;"
		"  adcx %%r13, %%r13;"
		"  adcx %%r14, %%r14;"

		/* Step 3: Compute intermediate squares */
		"  movq 0(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
		"  movq %%rax, 0(%1);"
		"  add %%rcx, %%r8;"
		"  movq %%r8, 8(%1);"
		"  movq 8(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
		"  adcx %%rax, %%r9;"
		"  movq %%r9, 16(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 24(%1);"
		"  movq 16(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
		"  adcx %%rax, %%r11;"
		"  movq %%r11, 32(%1);"
		"  adcx %%rcx, %%rbx;"
		"  movq %%rbx, 40(%1);"
		"  movq 24(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
		"  adcx %%rax, %%r13;"
		"  movq %%r13, 48(%1);"
		"  adcx %%rcx, %%r14;"
		"  movq %%r14, 56(%1);"

		/* Step 1: Compute all partial products */
		"  movq 32(%0), %%rdx;" /* f[0] */
		"  mulxq 40(%0), %%r8, %%r14;"
		"  xor %%r15d, %%r15d;" /* f[1]*f[0] */
		"  mulxq 48(%0), %%r9, %%r10;"
		"  adcx %%r14, %%r9;" /* f[2]*f[0] */
		"  mulxq 56(%0), %%rax, %%rcx;"
		"  adcx %%rax, %%r10;" /* f[3]*f[0] */
		"  movq 56(%0), %%rdx;" /* f[3] */
		"  mulxq 40(%0), %%r11, %%rbx;"
		"  adcx %%rcx, %%r11;" /* f[1]*f[3] */
		"  mulxq 48(%0), %%rax, %%r13;"
		"  adcx %%rax, %%rbx;" /* f[2]*f[3] */
		"  movq 40(%0), %%rdx;"
		"  adcx %%r15, %%r13;" /* f1 */
		"  mulxq 48(%0), %%rax, %%rcx;"
		"  mov $0, %%r14;" /* f[2]*f[1] */

		/* Step 2: Compute two parallel carry chains */
		"  xor %%r15d, %%r15d;"
		"  adox %%rax, %%r10;"
		"  adcx %%r8, %%r8;"
		"  adox %%rcx, %%r11;"
		"  adcx %%r9, %%r9;"
		"  adox %%r15, %%rbx;"
		"  adcx %%r10, %%r10;"
		"  adox %%r15, %%r13;"
		"  adcx %%r11, %%r11;"
		"  adox %%r15, %%r14;"
		"  adcx %%rbx, %%rbx;"
		"  adcx %%r13, %%r13;"
		"  adcx %%r14, %%r14;"

		/* Step 3: Compute intermediate squares */
		"  movq 32(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
		"  movq %%rax, 64(%1);"
		"  add %%rcx, %%r8;"
		"  movq %%r8, 72(%1);"
		"  movq 40(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
		"  adcx %%rax, %%r9;"
		"  movq %%r9, 80(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 88(%1);"
		"  movq 48(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
		"  adcx %%rax, %%r11;"
		"  movq %%r11, 96(%1);"
		"  adcx %%rcx, %%rbx;"
		"  movq %%rbx, 104(%1);"
		"  movq 56(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
		"  adcx %%rax, %%r13;"
		"  movq %%r13, 112(%1);"
		"  adcx %%rcx, %%r14;"
		"  movq %%r14, 120(%1);"

		/* Line up pointers: %0 <- tmp, %1 <- out */
		"  mov %1, %0;"
		"  mov %2, %1;"

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%0), %%r8, %%r13;"
		"  xor %%ecx, %%ecx;"
		"  adoxq 0(%0), %%r8;"
		"  mulxq 40(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%0), %%r9;"
		"  mulxq 48(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 16(%0), %%r10;"
		"  mulxq 56(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%0), %%r11;"
		"  adcx %%rcx, %%rax;"
		"  adox %%rcx, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%1);"

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 96(%0), %%r8, %%r13;"
		"  xor %%ecx, %%ecx;"
		"  adoxq 64(%0), %%r8;"
		"  mulxq 104(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 72(%0), %%r9;"
		"  mulxq 112(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 80(%0), %%r10;"
		"  mulxq 120(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 88(%0), %%r11;"
		"  adcx %%rcx, %%rax;"
		"  adox %%rcx, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 40(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 48(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 56(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 32(%1);"
		: "+&r"(f), "+&r"(tmp)
		: "r"(out)
		: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
		  "%r13", "%r14", "%r15", "memory", "cc");
}
 974
/*
 * One combined Montgomery-ladder step: doubles the point at p01_tmp1[0..7]
 * and differentially adds it to the point at p01_tmp1[8..15], both in
 * projective (X:Z) coordinates, using q (the base point's x-coordinate)
 * as the difference. tmp2 is scratch for fmul/fmul2/fsqr2.
 * Layout of p01_tmp1: [0..7] = nq (X2:Z2), [8..15] = nq+1 (X3:Z3),
 * [16..31] = tmp1 working area. Note the heavy aliasing: a/ab/a1/ab1 all
 * overlap at tmp1, and d/dc overlap at tmp1+8 — the exact call order
 * below is what makes the aliasing correct.
 */
static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2)
{
	u64 *nq = p01_tmp1;
	u64 *nq_p1 = p01_tmp1 + (u32)8U;
	u64 *tmp1 = p01_tmp1 + (u32)16U;
	u64 *x1 = q;
	u64 *x2 = nq;
	u64 *z2 = nq + (u32)4U;
	u64 *z3 = nq_p1 + (u32)4U;
	u64 *a = tmp1;
	u64 *b = tmp1 + (u32)4U;
	u64 *ab = tmp1;
	u64 *dc = tmp1 + (u32)8U;
	u64 *x3;
	u64 *z31;
	u64 *d0;
	u64 *c0;
	u64 *a1;
	u64 *b1;
	u64 *d;
	u64 *c;
	u64 *ab1;
	u64 *dc1;
	fadd(a, x2, z2);	/* a = x2 + z2 */
	fsub(b, x2, z2);	/* b = x2 - z2 */
	x3 = nq_p1;
	z31 = nq_p1 + (u32)4U;
	d0 = dc;
	c0 = dc + (u32)4U;
	fadd(c0, x3, z31);	/* c0 = x3 + z3 */
	fsub(d0, x3, z31);	/* d0 = x3 - z3 */
	/* (d0, c0) <- (d0*a, c0*b) — both multiplies in one call */
	fmul2(dc, dc, ab, tmp2);
	fadd(x3, d0, c0);
	fsub(z31, d0, c0);
	a1 = tmp1;
	b1 = tmp1 + (u32)4U;
	d = tmp1 + (u32)8U;
	c = tmp1 + (u32)12U;
	ab1 = tmp1;
	dc1 = tmp1 + (u32)8U;
	fsqr2(dc1, ab1, tmp2);	/* (d, c) = (a^2, b^2) */
	fsqr2(nq_p1, nq_p1, tmp2);	/* (x3, z3) = (x3^2, z3^2) */
	a1[0U] = c[0U];		/* a1 <- c (element copy) */
	a1[1U] = c[1U];
	a1[2U] = c[2U];
	a1[3U] = c[3U];
	fsub(c, d, c);		/* c = E = d - c */
	fmul_scalar(b1, c, (u64)121665U);	/* b1 = a24 * E, a24 = (486662-2)/4 */
	fadd(b1, b1, d);	/* b1 = d + a24*E */
	fmul2(nq, dc1, ab1, tmp2);	/* (x2, z2) = (d*a1, E*b1) */
	fmul(z3, z3, x1, tmp2);	/* z3 *= x1 */
}
1027
1028static void point_double(u64 *nq, u64 *tmp1, u64 *tmp2)
1029{
1030	u64 *x2 = nq;
1031	u64 *z2 = nq + (u32)4U;
1032	u64 *a = tmp1;
1033	u64 *b = tmp1 + (u32)4U;
1034	u64 *d = tmp1 + (u32)8U;
1035	u64 *c = tmp1 + (u32)12U;
1036	u64 *ab = tmp1;
1037	u64 *dc = tmp1 + (u32)8U;
1038	fadd(a, x2, z2);
1039	fsub(b, x2, z2);
1040	fsqr2(dc, ab, tmp2);
1041	a[0U] = c[0U];
1042	a[1U] = c[1U];
1043	a[2U] = c[2U];
1044	a[3U] = c[3U];
1045	fsub(c, d, c);
1046	fmul_scalar(b, c, (u64)121665U);
1047	fadd(b, b, d);
1048	fmul2(nq, dc, ab, tmp2);
1049}
1050
/*
 * Montgomery ladder scalar multiplication: out <- key * (point with
 * x-coordinate init1), all field elements as 4x64-bit limbs.
 * p01_tmp1_swap layout: [0..7] = p0 (accumulator X:Z), [8..15] = p1,
 * [16..31] = tmp1 working area, [32] = the previous swap bit.
 * The loop consumes scalar bits 253 down to 3 (252 iterations including
 * the unrolled first step); the three trailing point_double calls account
 * for the low three bits, which are zero for a clamped X25519 scalar
 * (NOTE(review): relies on the caller having clamped key — confirm).
 * Scratch buffers holding secret-derived data are wiped on exit.
 */
static void montgomery_ladder(u64 *out, const u8 *key, u64 *init1)
{
	u64 tmp2[16U] = { 0U };
	u64 p01_tmp1_swap[33U] = { 0U };
	u64 *p0 = p01_tmp1_swap;
	u64 *p01 = p01_tmp1_swap;
	u64 *p03 = p01;
	u64 *p11 = p01 + (u32)8U;
	u64 *x0;
	u64 *z0;
	u64 *p01_tmp1;
	u64 *p01_tmp11;
	u64 *nq10;
	u64 *nq_p11;
	u64 *swap1;
	u64 sw0;
	u64 *nq1;
	u64 *tmp1;
	memcpy(p11, init1, (u32)8U * sizeof(init1[0U]));
	/* p0 starts at the point-at-infinity representation (1:0) */
	x0 = p03;
	z0 = p03 + (u32)4U;
	x0[0U] = (u64)1U;
	x0[1U] = (u64)0U;
	x0[2U] = (u64)0U;
	x0[3U] = (u64)0U;
	z0[0U] = (u64)0U;
	z0[1U] = (u64)0U;
	z0[2U] = (u64)0U;
	z0[3U] = (u64)0U;
	p01_tmp1 = p01_tmp1_swap;
	p01_tmp11 = p01_tmp1_swap;
	nq10 = p01_tmp1_swap;
	nq_p11 = p01_tmp1_swap + (u32)8U;
	swap1 = p01_tmp1_swap + (u32)32U;
	/* First ladder step unrolled with the swap bit forced to 1 */
	cswap2((u64)1U, nq10, nq_p11);
	point_add_and_double(init1, p01_tmp11, tmp2);
	swap1[0U] = (u64)1U;
	{
		u32 i;
		for (i = (u32)0U; i < (u32)251U; i = i + (u32)1U) {
			u64 *p01_tmp12 = p01_tmp1_swap;
			u64 *swap2 = p01_tmp1_swap + (u32)32U;
			u64 *nq2 = p01_tmp12;
			u64 *nq_p12 = p01_tmp12 + (u32)8U;
			/* Scalar bit 253-i, little-endian bit order within bytes */
			u64 bit = (u64)(key[((u32)253U - i) / (u32)8U] >> ((u32)253U - i) % (u32)8U & (u8)1U);
			/* Swap only when this bit differs from the previous one */
			u64 sw = swap2[0U] ^ bit;
			cswap2(sw, nq2, nq_p12);
			point_add_and_double(init1, p01_tmp12, tmp2);
			swap2[0U] = bit;
		}
	}
	/* Undo any outstanding swap from the last iteration */
	sw0 = swap1[0U];
	cswap2(sw0, nq10, nq_p11);
	nq1 = p01_tmp1;
	tmp1 = p01_tmp1 + (u32)16U;
	point_double(nq1, tmp1, tmp2);
	point_double(nq1, tmp1, tmp2);
	point_double(nq1, tmp1, tmp2);
	memcpy(out, p0, (u32)8U * sizeof(p0[0U]));

	/* Wipe secret-dependent scratch; memzero_explicit resists DCE */
	memzero_explicit(tmp2, sizeof(tmp2));
	memzero_explicit(p01_tmp1_swap, sizeof(p01_tmp1_swap));
}
1114
1115static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1)
1116{
1117	u32 i;
1118	fsqr(o, inp, tmp);
1119	for (i = (u32)0U; i < n1 - (u32)1U; i = i + (u32)1U)
1120		fsqr(o, o, tmp);
1121}
1122
/*
 * Field inversion via Fermat's little theorem: o = i^(p-2) mod p with
 * p = 2^255 - 19, i.e. o = i^(2^255 - 21), using the standard curve25519
 * addition chain (11 multiplications, 254 squarings). tmp is scratch for
 * the squaring/multiplication helpers.
 */
static void finv(u64 *o, const u64 *i, u64 *tmp)
{
	u64 t1[16U] = { 0U };
	u64 *a0 = t1;
	u64 *b = t1 + (u32)4U;
	u64 *c = t1 + (u32)8U;
	u64 *t00 = t1 + (u32)12U;
	u64 *tmp1 = tmp;
	u64 *a;
	u64 *t0;
	fsquare_times(a0, i, tmp1, (u32)1U);	/* a0 = i^2 */
	fsquare_times(t00, a0, tmp1, (u32)2U);	/* t00 = i^8 */
	fmul(b, t00, i, tmp);			/* b = i^9 */
	fmul(a0, b, a0, tmp);			/* a0 = i^11 */
	fsquare_times(t00, a0, tmp1, (u32)1U);	/* t00 = i^22 */
	fmul(b, t00, b, tmp);			/* b = i^31 = i^(2^5 - 1) */
	fsquare_times(t00, b, tmp1, (u32)5U);
	fmul(b, t00, b, tmp);			/* b = i^(2^10 - 1) */
	fsquare_times(t00, b, tmp1, (u32)10U);
	fmul(c, t00, b, tmp);			/* c = i^(2^20 - 1) */
	fsquare_times(t00, c, tmp1, (u32)20U);
	fmul(t00, t00, c, tmp);			/* t00 = i^(2^40 - 1) */
	fsquare_times(t00, t00, tmp1, (u32)10U);
	fmul(b, t00, b, tmp);			/* b = i^(2^50 - 1) */
	fsquare_times(t00, b, tmp1, (u32)50U);
	fmul(c, t00, b, tmp);			/* c = i^(2^100 - 1) */
	fsquare_times(t00, c, tmp1, (u32)100U);
	fmul(t00, t00, c, tmp);			/* t00 = i^(2^200 - 1) */
	fsquare_times(t00, t00, tmp1, (u32)50U);
	fmul(t00, t00, b, tmp);			/* t00 = i^(2^250 - 1) */
	fsquare_times(t00, t00, tmp1, (u32)5U);	/* t00 = i^(2^255 - 2^5) */
	a = t1;
	t0 = t1 + (u32)12U;
	fmul(o, t0, a, tmp);			/* o = t00 * i^11 = i^(2^255 - 21) */
}
1158
1159static void store_felem(u64 *b, u64 *f)
1160{
1161	u64 f30 = f[3U];
1162	u64 top_bit0 = f30 >> (u32)63U;
1163	u64 f31;
1164	u64 top_bit;
1165	u64 f0;
1166	u64 f1;
1167	u64 f2;
1168	u64 f3;
1169	u64 m0;
1170	u64 m1;
1171	u64 m2;
1172	u64 m3;
1173	u64 mask;
1174	u64 f0_;
1175	u64 f1_;
1176	u64 f2_;
1177	u64 f3_;
1178	u64 o0;
1179	u64 o1;
1180	u64 o2;
1181	u64 o3;
1182	f[3U] = f30 & (u64)0x7fffffffffffffffU;
1183	add_scalar(f, f, (u64)19U * top_bit0);
1184	f31 = f[3U];
1185	top_bit = f31 >> (u32)63U;
1186	f[3U] = f31 & (u64)0x7fffffffffffffffU;
1187	add_scalar(f, f, (u64)19U * top_bit);
1188	f0 = f[0U];
1189	f1 = f[1U];
1190	f2 = f[2U];
1191	f3 = f[3U];
1192	m0 = gte_mask(f0, (u64)0xffffffffffffffedU);
1193	m1 = eq_mask(f1, (u64)0xffffffffffffffffU);
1194	m2 = eq_mask(f2, (u64)0xffffffffffffffffU);
1195	m3 = eq_mask(f3, (u64)0x7fffffffffffffffU);
1196	mask = ((m0 & m1) & m2) & m3;
1197	f0_ = f0 - (mask & (u64)0xffffffffffffffedU);
1198	f1_ = f1 - (mask & (u64)0xffffffffffffffffU);
1199	f2_ = f2 - (mask & (u64)0xffffffffffffffffU);
1200	f3_ = f3 - (mask & (u64)0x7fffffffffffffffU);
1201	o0 = f0_;
1202	o1 = f1_;
1203	o2 = f2_;
1204	o3 = f3_;
1205	b[0U] = o0;
1206	b[1U] = o1;
1207	b[2U] = o2;
1208	b[3U] = o3;
1209}
1210
1211static void encode_point(u8 *o, const u64 *i)
1212{
1213	const u64 *x = i;
1214	const u64 *z = i + (u32)4U;
1215	u64 tmp[4U] = { 0U };
1216	u64 tmp_w[16U] = { 0U };
1217	finv(tmp, z, tmp_w);
1218	fmul(tmp, tmp, x, tmp_w);
1219	store_felem((u64 *)o, tmp);
1220}
1221
1222static void curve25519_ever64(u8 *out, const u8 *priv, const u8 *pub)
1223{
1224	u64 init1[8U] = { 0U };
1225	u64 tmp[4U] = { 0U };
1226	u64 tmp3;
1227	u64 *x;
1228	u64 *z;
1229	{
1230		u32 i;
1231		for (i = (u32)0U; i < (u32)4U; i = i + (u32)1U) {
1232			u64 *os = tmp;
1233			const u8 *bj = pub + i * (u32)8U;
1234			u64 u = *(u64 *)bj;
1235			u64 r = u;
1236			u64 x0 = r;
1237			os[i] = x0;
1238		}
1239	}
1240	tmp3 = tmp[3U];
1241	tmp[3U] = tmp3 & (u64)0x7fffffffffffffffU;
1242	x = init1;
1243	z = init1 + (u32)4U;
1244	z[0U] = (u64)1U;
1245	z[1U] = (u64)0U;
1246	z[2U] = (u64)0U;
1247	z[3U] = (u64)0U;
1248	x[0U] = tmp[0U];
1249	x[1U] = tmp[1U];
1250	x[2U] = tmp[2U];
1251	x[3U] = tmp[3U];
1252	montgomery_ladder(init1, priv, init1);
1253	encode_point(out, init1);
1254}
1255
1256/* The below constants were generated using this sage script:
1257 *
1258 * #!/usr/bin/env sage
1259 * import sys
1260 * from sage.all import *
1261 * def limbs(n):
1262 * 	n = int(n)
1263 * 	l = ((n >> 0) % 2^64, (n >> 64) % 2^64, (n >> 128) % 2^64, (n >> 192) % 2^64)
1264 * 	return "0x%016xULL, 0x%016xULL, 0x%016xULL, 0x%016xULL" % l
1265 * ec = EllipticCurve(GF(2^255 - 19), [0, 486662, 0, 1, 0])
1266 * p_minus_s = (ec.lift_x(9) - ec.lift_x(1))[0]
1267 * print("static const u64 p_minus_s[] = { %s };\n" % limbs(p_minus_s))
1268 * print("static const u64 table_ladder[] = {")
1269 * p = ec.lift_x(9)
1270 * for i in range(252):
1271 * 	l = (p[0] + p[2]) / (p[0] - p[2])
1272 * 	print(("\t%s" + ("," if i != 251 else "")) % limbs(l))
1273 * 	p = p * 2
1274 * print("};")
1275 *
1276 */
1277
/* x-coordinate of lift_x(9) - lift_x(1), as four little-endian limbs (see
 * the sage script above); used as the initial x2 in the fixed-base ladder. */
static const u64 p_minus_s[] = { 0x816b1e0137d48290ULL, 0x440f6a51eb4d1207ULL, 0x52385f46dca2b71dULL, 0x215132111d8354cbULL };
1279
/*
 * table_ladder[4k .. 4k+3] holds (x + z) / (x - z) for the point
 * 2^k * lift_x(9), k = 0 .. 251, as generated by the sage script above.
 * Consumed one row per scalar bit by curve25519_ever64_base().
 */
static const u64 table_ladder[] = {
	0xfffffffffffffff3ULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x5fffffffffffffffULL,
	0x6b8220f416aafe96ULL, 0x82ebeb2b4f566a34ULL, 0xd5a9a5b075a5950fULL, 0x5142b2cf4b2488f4ULL,
	0x6aaebc750069680cULL, 0x89cf7820a0f99c41ULL, 0x2a58d9183b56d0f4ULL, 0x4b5aca80e36011a4ULL,
	0x329132348c29745dULL, 0xf4a2e616e1642fd7ULL, 0x1e45bb03ff67bc34ULL, 0x306912d0f42a9b4aULL,
	0xff886507e6af7154ULL, 0x04f50e13dfeec82fULL, 0xaa512fe82abab5ceULL, 0x174e251a68d5f222ULL,
	0xcf96700d82028898ULL, 0x1743e3370a2c02c5ULL, 0x379eec98b4e86eaaULL, 0x0c59888a51e0482eULL,
	0xfbcbf1d699b5d189ULL, 0xacaef0d58e9fdc84ULL, 0xc1c20d06231f7614ULL, 0x2938218da274f972ULL,
	0xf6af49beff1d7f18ULL, 0xcc541c22387ac9c2ULL, 0x96fcc9ef4015c56bULL, 0x69c1627c690913a9ULL,
	0x7a86fd2f4733db0eULL, 0xfdb8c4f29e087de9ULL, 0x095e4b1a8ea2a229ULL, 0x1ad7a7c829b37a79ULL,
	0x342d89cad17ea0c0ULL, 0x67bedda6cced2051ULL, 0x19ca31bf2bb42f74ULL, 0x3df7b4c84980acbbULL,
	0xa8c6444dc80ad883ULL, 0xb91e440366e3ab85ULL, 0xc215cda00164f6d8ULL, 0x3d867c6ef247e668ULL,
	0xc7dd582bcc3e658cULL, 0xfd2c4748ee0e5528ULL, 0xa0fd9b95cc9f4f71ULL, 0x7529d871b0675ddfULL,
	0xb8f568b42d3cbd78ULL, 0x1233011b91f3da82ULL, 0x2dce6ccd4a7c3b62ULL, 0x75e7fc8e9e498603ULL,
	0x2f4f13f1fcd0b6ecULL, 0xf1a8ca1f29ff7a45ULL, 0xc249c1a72981e29bULL, 0x6ebe0dbb8c83b56aULL,
	0x7114fa8d170bb222ULL, 0x65a2dcd5bf93935fULL, 0xbdc41f68b59c979aULL, 0x2f0eef79a2ce9289ULL,
	0x42ecbf0c083c37ceULL, 0x2930bc09ec496322ULL, 0xf294b0c19cfeac0dULL, 0x3780aa4bedfabb80ULL,
	0x56c17d3e7cead929ULL, 0xe7cb4beb2e5722c5ULL, 0x0ce931732dbfe15aULL, 0x41b883c7621052f8ULL,
	0xdbf75ca0c3d25350ULL, 0x2936be086eb1e351ULL, 0xc936e03cb4a9b212ULL, 0x1d45bf82322225aaULL,
	0xe81ab1036a024cc5ULL, 0xe212201c304c9a72ULL, 0xc5d73fba6832b1fcULL, 0x20ffdb5a4d839581ULL,
	0xa283d367be5d0fadULL, 0x6c2b25ca8b164475ULL, 0x9d4935467caaf22eULL, 0x5166408eee85ff49ULL,
	0x3c67baa2fab4e361ULL, 0xb3e433c67ef35cefULL, 0x5259729241159b1cULL, 0x6a621892d5b0ab33ULL,
	0x20b74a387555cdcbULL, 0x532aa10e1208923fULL, 0xeaa17b7762281dd1ULL, 0x61ab3443f05c44bfULL,
	0x257a6c422324def8ULL, 0x131c6c1017e3cf7fULL, 0x23758739f630a257ULL, 0x295a407a01a78580ULL,
	0xf8c443246d5da8d9ULL, 0x19d775450c52fa5dULL, 0x2afcfc92731bf83dULL, 0x7d10c8e81b2b4700ULL,
	0xc8e0271f70baa20bULL, 0x993748867ca63957ULL, 0x5412efb3cb7ed4bbULL, 0x3196d36173e62975ULL,
	0xde5bcad141c7dffcULL, 0x47cc8cd2b395c848ULL, 0xa34cd942e11af3cbULL, 0x0256dbf2d04ecec2ULL,
	0x875ab7e94b0e667fULL, 0xcad4dd83c0850d10ULL, 0x47f12e8f4e72c79fULL, 0x5f1a87bb8c85b19bULL,
	0x7ae9d0b6437f51b8ULL, 0x12c7ce5518879065ULL, 0x2ade09fe5cf77aeeULL, 0x23a05a2f7d2c5627ULL,
	0x5908e128f17c169aULL, 0xf77498dd8ad0852dULL, 0x74b4c4ceab102f64ULL, 0x183abadd10139845ULL,
	0xb165ba8daa92aaacULL, 0xd5c5ef9599386705ULL, 0xbe2f8f0cf8fc40d1ULL, 0x2701e635ee204514ULL,
	0x629fa80020156514ULL, 0xf223868764a8c1ceULL, 0x5b894fff0b3f060eULL, 0x60d9944cf708a3faULL,
	0xaeea001a1c7a201fULL, 0xebf16a633ee2ce63ULL, 0x6f7709594c7a07e1ULL, 0x79b958150d0208cbULL,
	0x24b55e5301d410e7ULL, 0xe3a34edff3fdc84dULL, 0xd88768e4904032d8ULL, 0x131384427b3aaeecULL,
	0x8405e51286234f14ULL, 0x14dc4739adb4c529ULL, 0xb8a2b5b250634ffdULL, 0x2fe2a94ad8a7ff93ULL,
	0xec5c57efe843faddULL, 0x2843ce40f0bb9918ULL, 0xa4b561d6cf3d6305ULL, 0x743629bde8fb777eULL,
	0x343edd46bbaf738fULL, 0xed981828b101a651ULL, 0xa401760b882c797aULL, 0x1fc223e28dc88730ULL,
	0x48604e91fc0fba0eULL, 0xb637f78f052c6fa4ULL, 0x91ccac3d09e9239cULL, 0x23f7eed4437a687cULL,
	0x5173b1118d9bd800ULL, 0x29d641b63189d4a7ULL, 0xfdbf177988bbc586ULL, 0x2959894fcad81df5ULL,
	0xaebc8ef3b4bbc899ULL, 0x4148995ab26992b9ULL, 0x24e20b0134f92cfbULL, 0x40d158894a05dee8ULL,
	0x46b00b1185af76f6ULL, 0x26bac77873187a79ULL, 0x3dc0bf95ab8fff5fULL, 0x2a608bd8945524d7ULL,
	0x26449588bd446302ULL, 0x7c4bc21c0388439cULL, 0x8e98a4f383bd11b2ULL, 0x26218d7bc9d876b9ULL,
	0xe3081542997c178aULL, 0x3c2d29a86fb6606fULL, 0x5c217736fa279374ULL, 0x7dde05734afeb1faULL,
	0x3bf10e3906d42babULL, 0xe4f7803e1980649cULL, 0xe6053bf89595bf7aULL, 0x394faf38da245530ULL,
	0x7a8efb58896928f4ULL, 0xfbc778e9cc6a113cULL, 0x72670ce330af596fULL, 0x48f222a81d3d6cf7ULL,
	0xf01fce410d72caa7ULL, 0x5a20ecc7213b5595ULL, 0x7bc21165c1fa1483ULL, 0x07f89ae31da8a741ULL,
	0x05d2c2b4c6830ff9ULL, 0xd43e330fc6316293ULL, 0xa5a5590a96d3a904ULL, 0x705edb91a65333b6ULL,
	0x048ee15e0bb9a5f7ULL, 0x3240cfca9e0aaf5dULL, 0x8f4b71ceedc4a40bULL, 0x621c0da3de544a6dULL,
	0x92872836a08c4091ULL, 0xce8375b010c91445ULL, 0x8a72eb524f276394ULL, 0x2667fcfa7ec83635ULL,
	0x7f4c173345e8752aULL, 0x061b47feee7079a5ULL, 0x25dd9afa9f86ff34ULL, 0x3780cef5425dc89cULL,
	0x1a46035a513bb4e9ULL, 0x3e1ef379ac575adaULL, 0xc78c5f1c5fa24b50ULL, 0x321a967634fd9f22ULL,
	0x946707b8826e27faULL, 0x3dca84d64c506fd0ULL, 0xc189218075e91436ULL, 0x6d9284169b3b8484ULL,
	0x3a67e840383f2ddfULL, 0x33eec9a30c4f9b75ULL, 0x3ec7c86fa783ef47ULL, 0x26ec449fbac9fbc4ULL,
	0x5c0f38cba09b9e7dULL, 0x81168cc762a3478cULL, 0x3e23b0d306fc121cULL, 0x5a238aa0a5efdcddULL,
	0x1ba26121c4ea43ffULL, 0x36f8c77f7c8832b5ULL, 0x88fbea0b0adcf99aULL, 0x5ca9938ec25bebf9ULL,
	0xd5436a5e51fccda0ULL, 0x1dbc4797c2cd893bULL, 0x19346a65d3224a08ULL, 0x0f5034e49b9af466ULL,
	0xf23c3967a1e0b96eULL, 0xe58b08fa867a4d88ULL, 0xfb2fabc6a7341679ULL, 0x2a75381eb6026946ULL,
	0xc80a3be4c19420acULL, 0x66b1f6c681f2b6dcULL, 0x7cf7036761e93388ULL, 0x25abbbd8a660a4c4ULL,
	0x91ea12ba14fd5198ULL, 0x684950fc4a3cffa9ULL, 0xf826842130f5ad28ULL, 0x3ea988f75301a441ULL,
	0xc978109a695f8c6fULL, 0x1746eb4a0530c3f3ULL, 0x444d6d77b4459995ULL, 0x75952b8c054e5cc7ULL,
	0xa3703f7915f4d6aaULL, 0x66c346202f2647d8ULL, 0xd01469df811d644bULL, 0x77fea47d81a5d71fULL,
	0xc5e9529ef57ca381ULL, 0x6eeeb4b9ce2f881aULL, 0xb6e91a28e8009bd6ULL, 0x4b80be3e9afc3fecULL,
	0x7e3773c526aed2c5ULL, 0x1b4afcb453c9a49dULL, 0xa920bdd7baffb24dULL, 0x7c54699f122d400eULL,
	0xef46c8e14fa94bc8ULL, 0xe0b074ce2952ed5eULL, 0xbea450e1dbd885d5ULL, 0x61b68649320f712cULL,
	0x8a485f7309ccbdd1ULL, 0xbd06320d7d4d1a2dULL, 0x25232973322dbef4ULL, 0x445dc4758c17f770ULL,
	0xdb0434177cc8933cULL, 0xed6fe82175ea059fULL, 0x1efebefdc053db34ULL, 0x4adbe867c65daf99ULL,
	0x3acd71a2a90609dfULL, 0xe5e991856dd04050ULL, 0x1ec69b688157c23cULL, 0x697427f6885cfe4dULL,
	0xd7be7b9b65e1a851ULL, 0xa03d28d522c536ddULL, 0x28399d658fd2b645ULL, 0x49e5b7e17c2641e1ULL,
	0x6f8c3a98700457a4ULL, 0x5078f0a25ebb6778ULL, 0xd13c3ccbc382960fULL, 0x2e003258a7df84b1ULL,
	0x8ad1f39be6296a1cULL, 0xc1eeaa652a5fbfb2ULL, 0x33ee0673fd26f3cbULL, 0x59256173a69d2cccULL,
	0x41ea07aa4e18fc41ULL, 0xd9fc19527c87a51eULL, 0xbdaacb805831ca6fULL, 0x445b652dc916694fULL,
	0xce92a3a7f2172315ULL, 0x1edc282de11b9964ULL, 0xa1823aafe04c314aULL, 0x790a2d94437cf586ULL,
	0x71c447fb93f6e009ULL, 0x8922a56722845276ULL, 0xbf70903b204f5169ULL, 0x2f7a89891ba319feULL,
	0x02a08eb577e2140cULL, 0xed9a4ed4427bdcf4ULL, 0x5253ec44e4323cd1ULL, 0x3e88363c14e9355bULL,
	0xaa66c14277110b8cULL, 0x1ae0391610a23390ULL, 0x2030bd12c93fc2a2ULL, 0x3ee141579555c7abULL,
	0x9214de3a6d6e7d41ULL, 0x3ccdd88607f17efeULL, 0x674f1288f8e11217ULL, 0x5682250f329f93d0ULL,
	0x6cf00b136d2e396eULL, 0x6e4cf86f1014debfULL, 0x5930b1b5bfcc4e83ULL, 0x047069b48aba16b6ULL,
	0x0d4ce4ab69b20793ULL, 0xb24db91a97d0fb9eULL, 0xcdfa50f54e00d01dULL, 0x221b1085368bddb5ULL,
	0xe7e59468b1e3d8d2ULL, 0x53c56563bd122f93ULL, 0xeee8a903e0663f09ULL, 0x61efa662cbbe3d42ULL,
	0x2cf8ddddde6eab2aULL, 0x9bf80ad51435f231ULL, 0x5deadacec9f04973ULL, 0x29275b5d41d29b27ULL,
	0xcfde0f0895ebf14fULL, 0xb9aab96b054905a7ULL, 0xcae80dd9a1c420fdULL, 0x0a63bf2f1673bbc7ULL,
	0x092f6e11958fbc8cULL, 0x672a81e804822fadULL, 0xcac8351560d52517ULL, 0x6f3f7722c8f192f8ULL,
	0xf8ba90ccc2e894b7ULL, 0x2c7557a438ff9f0dULL, 0x894d1d855ae52359ULL, 0x68e122157b743d69ULL,
	0xd87e5570cfb919f3ULL, 0x3f2cdecd95798db9ULL, 0x2121154710c0a2ceULL, 0x3c66a115246dc5b2ULL,
	0xcbedc562294ecb72ULL, 0xba7143c36a280b16ULL, 0x9610c2efd4078b67ULL, 0x6144735d946a4b1eULL,
	0x536f111ed75b3350ULL, 0x0211db8c2041d81bULL, 0xf93cb1000e10413cULL, 0x149dfd3c039e8876ULL,
	0xd479dde46b63155bULL, 0xb66e15e93c837976ULL, 0xdafde43b1f13e038ULL, 0x5fafda1a2e4b0b35ULL,
	0x3600bbdf17197581ULL, 0x3972050bbe3cd2c2ULL, 0x5938906dbdd5be86ULL, 0x34fce5e43f9b860fULL,
	0x75a8a4cd42d14d02ULL, 0x828dabc53441df65ULL, 0x33dcabedd2e131d3ULL, 0x3ebad76fb814d25fULL,
	0xd4906f566f70e10fULL, 0x5d12f7aa51690f5aULL, 0x45adb16e76cefcf2ULL, 0x01f768aead232999ULL,
	0x2b6cc77b6248febdULL, 0x3cd30628ec3aaffdULL, 0xce1c0b80d4ef486aULL, 0x4c3bff2ea6f66c23ULL,
	0x3f2ec4094aeaeb5fULL, 0x61b19b286e372ca7ULL, 0x5eefa966de2a701dULL, 0x23b20565de55e3efULL,
	0xe301ca5279d58557ULL, 0x07b2d4ce27c2874fULL, 0xa532cd8a9dcf1d67ULL, 0x2a52fee23f2bff56ULL,
	0x8624efb37cd8663dULL, 0xbbc7ac20ffbd7594ULL, 0x57b85e9c82d37445ULL, 0x7b3052cb86a6ec66ULL,
	0x3482f0ad2525e91eULL, 0x2cb68043d28edca0ULL, 0xaf4f6d052e1b003aULL, 0x185f8c2529781b0aULL,
	0xaa41de5bd80ce0d6ULL, 0x9407b2416853e9d6ULL, 0x563ec36e357f4c3aULL, 0x4cc4b8dd0e297bceULL,
	0xa2fc1a52ffb8730eULL, 0x1811f16e67058e37ULL, 0x10f9a366cddf4ee1ULL, 0x72f4a0c4a0b9f099ULL,
	0x8c16c06f663f4ea7ULL, 0x693b3af74e970fbaULL, 0x2102e7f1d69ec345ULL, 0x0ba53cbc968a8089ULL,
	0xca3d9dc7fea15537ULL, 0x4c6824bb51536493ULL, 0xb9886314844006b1ULL, 0x40d2a72ab454cc60ULL,
	0x5936a1b712570975ULL, 0x91b9d648debda657ULL, 0x3344094bb64330eaULL, 0x006ba10d12ee51d0ULL,
	0x19228468f5de5d58ULL, 0x0eb12f4c38cc05b0ULL, 0xa1039f9dd5601990ULL, 0x4502d4ce4fff0e0bULL,
	0xeb2054106837c189ULL, 0xd0f6544c6dd3b93cULL, 0x40727064c416d74fULL, 0x6e15c6114b502ef0ULL,
	0x4df2a398cfb1a76bULL, 0x11256c7419f2f6b1ULL, 0x4a497962066e6043ULL, 0x705b3aab41355b44ULL,
	0x365ef536d797b1d8ULL, 0x00076bd622ddf0dbULL, 0x3bbf33b0e0575a88ULL, 0x3777aa05c8e4ca4dULL,
	0x392745c85578db5fULL, 0x6fda4149dbae5ae2ULL, 0xb1f0b00b8adc9867ULL, 0x09963437d36f1da3ULL,
	0x7e824e90a5dc3853ULL, 0xccb5f6641f135cbdULL, 0x6736d86c87ce8fccULL, 0x625f3ce26604249fULL,
	0xaf8ac8059502f63fULL, 0x0c05e70a2e351469ULL, 0x35292e9c764b6305ULL, 0x1a394360c7e23ac3ULL,
	0xd5c6d53251183264ULL, 0x62065abd43c2b74fULL, 0xb5fbf5d03b973f9bULL, 0x13a3da3661206e5eULL,
	0xc6bd5837725d94e5ULL, 0x18e30912205016c5ULL, 0x2088ce1570033c68ULL, 0x7fba1f495c837987ULL,
	0x5a8c7423f2f9079dULL, 0x1735157b34023fc5ULL, 0xe4f9b49ad2fab351ULL, 0x6691ff72c878e33cULL,
	0x122c2adedc5eff3eULL, 0xf8dd4bf1d8956cf4ULL, 0xeb86205d9e9e5bdaULL, 0x049b92b9d975c743ULL,
	0xa5379730b0f6c05aULL, 0x72a0ffacc6f3a553ULL, 0xb0032c34b20dcd6dULL, 0x470e9dbc88d5164aULL,
	0xb19cf10ca237c047ULL, 0xb65466711f6c81a2ULL, 0xb3321bd16dd80b43ULL, 0x48c14f600c5fbe8eULL,
	0x66451c264aa6c803ULL, 0xb66e3904a4fa7da6ULL, 0xd45f19b0b3128395ULL, 0x31602627c3c9bc10ULL,
	0x3120dc4832e4e10dULL, 0xeb20c46756c717f7ULL, 0x00f52e3f67280294ULL, 0x566d4fc14730c509ULL,
	0x7e3a5d40fd837206ULL, 0xc1e926dc7159547aULL, 0x216730fba68d6095ULL, 0x22e8c3843f69cea7ULL,
	0x33d074e8930e4b2bULL, 0xb6e4350e84d15816ULL, 0x5534c26ad6ba2365ULL, 0x7773c12f89f1f3f3ULL,
	0x8cba404da57962aaULL, 0x5b9897a81999ce56ULL, 0x508e862f121692fcULL, 0x3a81907fa093c291ULL,
	0x0dded0ff4725a510ULL, 0x10d8cc10673fc503ULL, 0x5b9d151c9f1f4e89ULL, 0x32a5c1d5cb09a44cULL,
	0x1e0aa442b90541fbULL, 0x5f85eb7cc1b485dbULL, 0xbee595ce8a9df2e5ULL, 0x25e496c722422236ULL,
	0x5edf3c46cd0fe5b9ULL, 0x34e75a7ed2a43388ULL, 0xe488de11d761e352ULL, 0x0e878a01a085545cULL,
	0xba493c77e021bb04ULL, 0x2b4d1843c7df899aULL, 0x9ea37a487ae80d67ULL, 0x67a9958011e41794ULL,
	0x4b58051a6697b065ULL, 0x47e33f7d8d6ba6d4ULL, 0xbb4da8d483ca46c1ULL, 0x68becaa181c2db0dULL,
	0x8d8980e90b989aa5ULL, 0xf95eb14a2c93c99bULL, 0x51c6c7c4796e73a2ULL, 0x6e228363b5efb569ULL,
	0xc6bbc0b02dd624c8ULL, 0x777eb47dec8170eeULL, 0x3cde15a004cfafa9ULL, 0x1dc6bc087160bf9bULL,
	0x2e07e043eec34002ULL, 0x18e9fc677a68dc7fULL, 0xd8da03188bd15b9aULL, 0x48fbc3bb00568253ULL,
	0x57547d4cfb654ce1ULL, 0xd3565b82a058e2adULL, 0xf63eaf0bbf154478ULL, 0x47531ef114dfbb18ULL,
	0xe1ec630a4278c587ULL, 0x5507d546ca8e83f3ULL, 0x85e135c63adc0c2bULL, 0x0aa7efa85682844eULL,
	0x72691ba8b3e1f615ULL, 0x32b4e9701fbe3ffaULL, 0x97b6d92e39bb7868ULL, 0x2cfe53dea02e39e8ULL,
	0x687392cd85cd52b0ULL, 0x27ff66c910e29831ULL, 0x97134556a9832d06ULL, 0x269bb0360a84f8a0ULL,
	0x706e55457643f85cULL, 0x3734a48c9b597d1bULL, 0x7aee91e8c6efa472ULL, 0x5cd6abc198a9d9e0ULL,
	0x0e04de06cb3ce41aULL, 0xd8c6eb893402e138ULL, 0x904659bb686e3772ULL, 0x7215c371746ba8c8ULL,
	0xfd12a97eeae4a2d9ULL, 0x9514b7516394f2c5ULL, 0x266fd5809208f294ULL, 0x5c847085619a26b9ULL,
	0x52985410fed694eaULL, 0x3c905b934a2ed254ULL, 0x10bb47692d3be467ULL, 0x063b3d2d69e5e9e1ULL,
	0x472726eedda57debULL, 0xefb6c4ae10f41891ULL, 0x2b1641917b307614ULL, 0x117c554fc4f45b7cULL,
	0xc07cf3118f9d8812ULL, 0x01dbd82050017939ULL, 0xd7e803f4171b2827ULL, 0x1015e87487d225eaULL,
	0xc58de3fed23acc4dULL, 0x50db91c294a7be2dULL, 0x0b94d43d1c9cf457ULL, 0x6b1640fa6e37524aULL,
	0x692f346c5fda0d09ULL, 0x200b1c59fa4d3151ULL, 0xb8c46f760777a296ULL, 0x4b38395f3ffdfbcfULL,
	0x18d25e00be54d671ULL, 0x60d50582bec8aba6ULL, 0x87ad8f263b78b982ULL, 0x50fdf64e9cda0432ULL,
	0x90f567aac578dcf0ULL, 0xef1e9b0ef2a3133bULL, 0x0eebba9242d9de71ULL, 0x15473c9bf03101c7ULL,
	0x7c77e8ae56b78095ULL, 0xb678e7666e6f078eULL, 0x2da0b9615348ba1fULL, 0x7cf931c1ff733f0bULL,
	0x26b357f50a0a366cULL, 0xe9708cf42b87d732ULL, 0xc13aeea5f91cb2c0ULL, 0x35d90c991143bb4cULL,
	0x47c1c404a9a0d9dcULL, 0x659e58451972d251ULL, 0x3875a8c473b38c31ULL, 0x1fbd9ed379561f24ULL,
	0x11fabc6fd41ec28dULL, 0x7ef8dfe3cd2a2dcaULL, 0x72e73b5d8c404595ULL, 0x6135fa4954b72f27ULL,
	0xccfc32a2de24b69cULL, 0x3f55698c1f095d88ULL, 0xbe3350ed5ac3f929ULL, 0x5e9bf806ca477eebULL,
	0xe9ce8fb63c309f68ULL, 0x5376f63565e1f9f4ULL, 0xd1afcfb35a6393f1ULL, 0x6632a1ede5623506ULL,
	0x0b7d6c390c2ded4cULL, 0x56cb3281df04cb1fULL, 0x66305a1249ecc3c7ULL, 0x5d588b60a38ca72aULL,
	0xa6ecbf78e8e5f42dULL, 0x86eeb44b3c8a3eecULL, 0xec219c48fbd21604ULL, 0x1aaf1af517c36731ULL,
	0xc306a2836769bde7ULL, 0x208280622b1e2adbULL, 0x8027f51ffbff94a6ULL, 0x76cfa1ce1124f26bULL,
	0x18eb00562422abb6ULL, 0xf377c4d58f8c29c3ULL, 0x4dbbc207f531561aULL, 0x0253b7f082128a27ULL,
	0x3d1f091cb62c17e0ULL, 0x4860e1abd64628a9ULL, 0x52d17436309d4253ULL, 0x356f97e13efae576ULL,
	0xd351e11aa150535bULL, 0x3e6b45bb1dd878ccULL, 0x0c776128bed92c98ULL, 0x1d34ae93032885b8ULL,
	0x4ba0488ca85ba4c3ULL, 0x985348c33c9ce6ceULL, 0x66124c6f97bda770ULL, 0x0f81a0290654124aULL,
	0x9ed09ca6569b86fdULL, 0x811009fd18af9a2dULL, 0xff08d03f93d8c20aULL, 0x52a148199faef26bULL,
	0x3e03f9dc2d8d1b73ULL, 0x4205801873961a70ULL, 0xc0d987f041a35970ULL, 0x07aa1f15a1c0d549ULL,
	0xdfd46ce08cd27224ULL, 0x6d0a024f934e4239ULL, 0x808a7a6399897b59ULL, 0x0a4556e9e13d95a2ULL,
	0xd21a991fe9c13045ULL, 0x9b0e8548fe7751b8ULL, 0x5da643cb4bf30035ULL, 0x77db28d63940f721ULL,
	0xfc5eeb614adc9011ULL, 0x5229419ae8c411ebULL, 0x9ec3e7787d1dcf74ULL, 0x340d053e216e4cb5ULL,
	0xcac7af39b48df2b4ULL, 0xc0faec2871a10a94ULL, 0x140a69245ca575edULL, 0x0cf1c37134273a4cULL,
	0xc8ee306ac224b8a5ULL, 0x57eaee7ccb4930b0ULL, 0xa1e806bdaacbe74fULL, 0x7d9a62742eeb657dULL,
	0x9eb6b6ef546c4830ULL, 0x885cca1fddb36e2eULL, 0xe6b9f383ef0d7105ULL, 0x58654fef9d2e0412ULL,
	0xa905c4ffbe0e8e26ULL, 0x942de5df9b31816eULL, 0x497d723f802e88e1ULL, 0x30684dea602f408dULL,
	0x21e5a278a3e6cb34ULL, 0xaefb6e6f5b151dc4ULL, 0xb30b8e049d77ca15ULL, 0x28c3c9cf53b98981ULL,
	0x287fb721556cdd2aULL, 0x0d317ca897022274ULL, 0x7468c7423a543258ULL, 0x4a7f11464eb5642fULL,
	0xa237a4774d193aa6ULL, 0xd865986ea92129a1ULL, 0x24c515ecf87c1a88ULL, 0x604003575f39f5ebULL,
	0x47b9f189570a9b27ULL, 0x2b98cede465e4b78ULL, 0x026df551dbb85c20ULL, 0x74fcd91047e21901ULL,
	0x13e2a90a23c1bfa3ULL, 0x0cb0074e478519f6ULL, 0x5ff1cbbe3af6cf44ULL, 0x67fe5438be812dbeULL,
	0xd13cf64fa40f05b0ULL, 0x054dfb2f32283787ULL, 0x4173915b7f0d2aeaULL, 0x482f144f1f610d4eULL,
	0xf6210201b47f8234ULL, 0x5d0ae1929e70b990ULL, 0xdcd7f455b049567cULL, 0x7e93d0f1f0916f01ULL,
	0xdd79cbf18a7db4faULL, 0xbe8391bf6f74c62fULL, 0x027145d14b8291bdULL, 0x585a73ea2cbf1705ULL,
	0x485ca03e928a0db2ULL, 0x10fc01a5742857e7ULL, 0x2f482edbd6d551a7ULL, 0x0f0433b5048fdb8aULL,
	0x60da2e8dd7dc6247ULL, 0x88b4c9d38cd4819aULL, 0x13033ac001f66697ULL, 0x273b24fe3b367d75ULL,
	0xc6e8f66a31b3b9d4ULL, 0x281514a494df49d5ULL, 0xd1726fdfc8b23da7ULL, 0x4b3ae7d103dee548ULL,
	0xc6256e19ce4b9d7eULL, 0xff5c5cf186e3c61cULL, 0xacc63ca34b8ec145ULL, 0x74621888fee66574ULL,
	0x956f409645290a1eULL, 0xef0bf8e3263a962eULL, 0xed6a50eb5ec2647bULL, 0x0694283a9dca7502ULL,
	0x769b963643a2dcd1ULL, 0x42b7c8ea09fc5353ULL, 0x4f002aee13397eabULL, 0x63005e2c19b7d63aULL,
	0xca6736da63023beaULL, 0x966c7f6db12a99b7ULL, 0xace09390c537c5e1ULL, 0x0b696063a1aa89eeULL,
	0xebb03e97288c56e5ULL, 0x432a9f9f938c8be8ULL, 0xa6a5a93d5b717f71ULL, 0x1a5fb4c3e18f9d97ULL,
	0x1c94e7ad1c60cdceULL, 0xee202a43fc02c4a0ULL, 0x8dafe4d867c46a20ULL, 0x0a10263c8ac27b58ULL,
	0xd0dea9dfe4432a4aULL, 0x856af87bbe9277c5ULL, 0xce8472acc212c71aULL, 0x6f151b6d9bbb1e91ULL,
	0x26776c527ceed56aULL, 0x7d211cb7fbf8faecULL, 0x37ae66a6fd4609ccULL, 0x1f81b702d2770c42ULL,
	0x2fb0b057eac58392ULL, 0xe1dd89fe29744e9dULL, 0xc964f8eb17beb4f8ULL, 0x29571073c9a2d41eULL,
	0xa948a18981c0e254ULL, 0x2df6369b65b22830ULL, 0xa33eb2d75fcfd3c6ULL, 0x078cd6ec4199a01fULL,
	0x4a584a41ad900d2fULL, 0x32142b78e2c74c52ULL, 0x68c4e8338431c978ULL, 0x7f69ea9008689fc2ULL,
	0x52f2c81e46a38265ULL, 0xfd78072d04a832fdULL, 0x8cd7d5fa25359e94ULL, 0x4de71b7454cc29d2ULL,
	0x42eb60ad1eda6ac9ULL, 0x0aad37dfdbc09c3aULL, 0x81004b71e33cc191ULL, 0x44e6be345122803cULL,
	0x03fe8388ba1920dbULL, 0xf5d57c32150db008ULL, 0x49c8c4281af60c29ULL, 0x21edb518de701aeeULL,
	0x7fb63e418f06dc99ULL, 0xa4460d99c166d7b8ULL, 0x24dd5248ce520a83ULL, 0x5ec3ad712b928358ULL,
	0x15022a5fbd17930fULL, 0xa4f64a77d82570e3ULL, 0x12bc8d6915783712ULL, 0x498194c0fc620abbULL,
	0x38a2d9d255686c82ULL, 0x785c6bd9193e21f0ULL, 0xe4d5c81ab24a5484ULL, 0x56307860b2e20989ULL,
	0x429d55f78b4d74c4ULL, 0x22f1834643350131ULL, 0x1e60c24598c71fffULL, 0x59f2f014979983efULL,
	0x46a47d56eb494a44ULL, 0x3e22a854d636a18eULL, 0xb346e15274491c3bULL, 0x2ceafd4e5390cde7ULL,
	0xba8a8538be0d6675ULL, 0x4b9074bb50818e23ULL, 0xcbdab89085d304c3ULL, 0x61a24fe0e56192c4ULL,
	0xcb7615e6db525bcbULL, 0xdd7d8c35a567e4caULL, 0xe6b4153acafcdd69ULL, 0x2d668e097f3c9766ULL,
	0xa57e7e265ce55ef0ULL, 0x5d9f4e527cd4b967ULL, 0xfbc83606492fd1e5ULL, 0x090d52beb7c3f7aeULL,
	0x09b9515a1e7b4d7cULL, 0x1f266a2599da44c0ULL, 0xa1c49548e2c55504ULL, 0x7ef04287126f15ccULL,
	0xfed1659dbd30ef15ULL, 0x8b4ab9eec4e0277bULL, 0x884d6236a5df3291ULL, 0x1fd96ea6bf5cf788ULL,
	0x42a161981f190d9aULL, 0x61d849507e6052c1ULL, 0x9fe113bf285a2cd5ULL, 0x7c22d676dbad85d8ULL,
	0x82e770ed2bfbd27dULL, 0x4c05b2ece996f5a5ULL, 0xcd40a9c2b0900150ULL, 0x5895319213d9bf64ULL,
	0xe7cc5d703fea2e08ULL, 0xb50c491258e2188cULL, 0xcce30baa48205bf0ULL, 0x537c659ccfa32d62ULL,
	0x37b6623a98cfc088ULL, 0xfe9bed1fa4d6aca4ULL, 0x04d29b8e56a8d1b0ULL, 0x725f71c40b519575ULL,
	0x28c7f89cd0339ce6ULL, 0x8367b14469ddc18bULL, 0x883ada83a6a1652cULL, 0x585f1974034d6c17ULL,
	0x89cfb266f1b19188ULL, 0xe63b4863e7c35217ULL, 0xd88c9da6b4c0526aULL, 0x3e035c9df0954635ULL,
	0xdd9d5412fb45de9dULL, 0xdd684532e4cff40dULL, 0x4b5c999b151d671cULL, 0x2d8c2cc811e7f690ULL,
	0x7f54be1d90055d40ULL, 0xa464c5df464aaf40ULL, 0x33979624f0e917beULL, 0x2c018dc527356b30ULL,
	0xa5415024e330b3d4ULL, 0x73ff3d96691652d3ULL, 0x94ec42c4ef9b59f1ULL, 0x0747201618d08e5aULL,
	0x4d6ca48aca411c53ULL, 0x66415f2fcfa66119ULL, 0x9c4dd40051e227ffULL, 0x59810bc09a02f7ebULL,
	0x2a7eb171b3dc101dULL, 0x441c5ab99ffef68eULL, 0x32025c9b93b359eaULL, 0x5e8ce0a71e9d112fULL,
	0xbfcccb92429503fdULL, 0xd271ba752f095d55ULL, 0x345ead5e972d091eULL, 0x18c8df11a83103baULL,
	0x90cd949a9aed0f4cULL, 0xc5d1f4cb6660e37eULL, 0xb8cac52d56c52e0bULL, 0x6e42e400c5808e0dULL,
	0xa3b46966eeaefd23ULL, 0x0c4f1f0be39ecdcaULL, 0x189dc8c9d683a51dULL, 0x51f27f054c09351bULL,
	0x4c487ccd2a320682ULL, 0x587ea95bb3df1c96ULL, 0xc8ccf79e555cb8e8ULL, 0x547dc829a206d73dULL,
	0xb822a6cd80c39b06ULL, 0xe96d54732000d4c6ULL, 0x28535b6f91463b4dULL, 0x228f4660e2486e1dULL,
	0x98799538de8d3abfULL, 0x8cd8330045ebca6eULL, 0x79952a008221e738ULL, 0x4322e1a7535cd2bbULL,
	0xb114c11819d1801cULL, 0x2016e4d84f3f5ec7ULL, 0xdd0e2df409260f4cULL, 0x5ec362c0ae5f7266ULL,
	0xc0462b18b8b2b4eeULL, 0x7cc8d950274d1afbULL, 0xf25f7105436b02d2ULL, 0x43bbf8dcbff9ccd3ULL,
	0xb6ad1767a039e9dfULL, 0xb0714da8f69d3583ULL, 0x5e55fa18b42931f5ULL, 0x4ed5558f33c60961ULL,
	0x1fe37901c647a5ddULL, 0x593ddf1f8081d357ULL, 0x0249a4fd813fd7a6ULL, 0x69acca274e9caf61ULL,
	0x047ba3ea330721c9ULL, 0x83423fc20e7e1ea0ULL, 0x1df4c0af01314a60ULL, 0x09a62dab89289527ULL,
	0xa5b325a49cc6cb00ULL, 0xe94b5dc654b56cb6ULL, 0x3be28779adc994a0ULL, 0x4296e8f8ba3a4aadULL,
	0x328689761e451eabULL, 0x2e4d598bff59594aULL, 0x49b96853d7a7084aULL, 0x4980a319601420a8ULL,
	0x9565b9e12f552c42ULL, 0x8a5318db7100fe96ULL, 0x05c90b4d43add0d7ULL, 0x538b4cd66a5d4edaULL,
	0xf4e94fc3e89f039fULL, 0x592c9af26f618045ULL, 0x08a36eb5fd4b9550ULL, 0x25fffaf6c2ed1419ULL,
	0x34434459cc79d354ULL, 0xeeecbfb4b1d5476bULL, 0xddeb34a061615d99ULL, 0x5129cecceb64b773ULL,
	0xee43215894993520ULL, 0x772f9c7cf14c0b3bULL, 0xd2e2fce306bedad5ULL, 0x715f42b546f06a97ULL,
	0x434ecdceda5b5f1aULL, 0x0da17115a49741a9ULL, 0x680bd77c73edad2eULL, 0x487c02354edd9041ULL,
	0xb8efeff3a70ed9c4ULL, 0x56a32aa3e857e302ULL, 0xdf3a68bd48a2a5a0ULL, 0x07f650b73176c444ULL,
	0xe38b9b1626e0ccb1ULL, 0x79e053c18b09fb36ULL, 0x56d90319c9f94964ULL, 0x1ca941e7ac9ff5c4ULL,
	0x49c4df29162fa0bbULL, 0x8488cf3282b33305ULL, 0x95dfda14cabb437dULL, 0x3391f78264d5ad86ULL,
	0x729ae06ae2b5095dULL, 0xd58a58d73259a946ULL, 0xe9834262d13921edULL, 0x27fedafaa54bb592ULL,
	0xa99dc5b829ad48bbULL, 0x5f025742499ee260ULL, 0x802c8ecd5d7513fdULL, 0x78ceb3ef3f6dd938ULL,
	0xc342f44f8a135d94ULL, 0x7b9edb44828cdda3ULL, 0x9436d11a0537cfe7ULL, 0x5064b164ec1ab4c8ULL,
	0x7020eccfd37eb2fcULL, 0x1f31ea3ed90d25fcULL, 0x1b930d7bdfa1bb34ULL, 0x5344467a48113044ULL,
	0x70073170f25e6dfbULL, 0xe385dc1a50114cc8ULL, 0x2348698ac8fc4f00ULL, 0x2a77a55284dd40d8ULL,
	0xfe06afe0c98c6ce4ULL, 0xc235df96dddfd6e4ULL, 0x1428d01e33bf1ed3ULL, 0x785768ec9300bdafULL,
	0x9702e57a91deb63bULL, 0x61bdb8bfe5ce8b80ULL, 0x645b426f3d1d58acULL, 0x4804a82227a557bcULL,
	0x8e57048ab44d2601ULL, 0x68d6501a4b3a6935ULL, 0xc39c9ec3f9e1c293ULL, 0x4172f257d4de63e2ULL,
	0xd368b450330c6401ULL, 0x040d3017418f2391ULL, 0x2c34bb6090b7d90dULL, 0x16f649228fdfd51fULL,
	0xbea6818e2b928ef5ULL, 0xe28ccf91cdc11e72ULL, 0x594aaa68e77a36cdULL, 0x313034806c7ffd0fULL,
	0x8a9d27ac2249bd65ULL, 0x19a3b464018e9512ULL, 0xc26ccff352b37ec7ULL, 0x056f68341d797b21ULL,
	0x5e79d6757efd2327ULL, 0xfabdbcb6553afe15ULL, 0xd3e7222c6eaf5a60ULL, 0x7046c76d4dae743bULL,
	0x660be872b18d4a55ULL, 0x19992518574e1496ULL, 0xc103053a302bdcbbULL, 0x3ed8e9800b218e8eULL,
	0x7b0b9239fa75e03eULL, 0xefe9fb684633c083ULL, 0x98a35fbe391a7793ULL, 0x6065510fe2d0fe34ULL,
	0x55cb668548abad0cULL, 0xb4584548da87e527ULL, 0x2c43ecea0107c1ddULL, 0x526028809372de35ULL,
	0x3415c56af9213b1fULL, 0x5bee1a4d017e98dbULL, 0x13f6b105b5cf709bULL, 0x5ff20e3482b29ab6ULL,
	0x0aa29c75cc2e6c90ULL, 0xfc7d73ca3a70e206ULL, 0x899fc38fc4b5c515ULL, 0x250386b124ffc207ULL,
	0x54ea28d5ae3d2b56ULL, 0x9913149dd6de60ceULL, 0x16694fc58f06d6c1ULL, 0x46b23975eb018fc7ULL,
	0x470a6a0fb4b7b4e2ULL, 0x5d92475a8f7253deULL, 0xabeee5b52fbd3adbULL, 0x7fa20801a0806968ULL,
	0x76f3faf19f7714d2ULL, 0xb3e840c12f4660c3ULL, 0x0fb4cd8df212744eULL, 0x4b065a251d3a2dd2ULL,
	0x5cebde383d77cd4aULL, 0x6adf39df882c9cb1ULL, 0xa2dd242eb09af759ULL, 0x3147c0e50e5f6422ULL,
	0x164ca5101d1350dbULL, 0xf8d13479c33fc962ULL, 0xe640ce4d13e5da08ULL, 0x4bdee0c45061f8baULL,
	0xd7c46dc1a4edb1c9ULL, 0x5514d7b6437fd98aULL, 0x58942f6bb2a1c00bULL, 0x2dffb2ab1d70710eULL,
	0xccdfcf2fc18b6d68ULL, 0xa8ebcba8b7806167ULL, 0x980697f95e2937e3ULL, 0x02fbba1cd0126e8cULL
};
1534
/*
 * Fixed-base X25519: out = priv * 9 (the curve's base point u-coordinate),
 * using the precomputed table_ladder above instead of a generic ladder:
 * step k folds table_ladder[4k] = (x + z)/(x - z) of 2^k * lift_x(9) into
 * the running point pair (see the sage script preceding the tables).
 */
static void curve25519_ever64_base(u8 *out, const u8 *priv)
{
	u64 swap = 1;
	int i, j, k;
	/* Layout: [0..7] point xz1, [8..15] point xz2, [16..47] scratch,
	 * [48..51] the clamped copy of the private key. */
	u64 tmp[16 + 32 + 4];
	u64 *x1 = &tmp[0];
	u64 *z1 = &tmp[4];
	u64 *x2 = &tmp[8];
	u64 *z2 = &tmp[12];
	u64 *xz1 = &tmp[0];
	u64 *xz2 = &tmp[8];
	u64 *a = &tmp[0 + 16];
	u64 *b = &tmp[4 + 16];
	u64 *c = &tmp[8 + 16];
	u64 *ab = &tmp[0 + 16];
	u64 *abcd = &tmp[0 + 16];
	u64 *ef = &tmp[16 + 16];
	u64 *efgh = &tmp[16 + 16];
	u64 *key = &tmp[0 + 16 + 32];

	/* Clamp the scalar per RFC 7748: clear bits 0-2 and bit 255,
	 * set bit 254. */
	memcpy(key, priv, 32);
	((u8 *)key)[0] &= 248;
	((u8 *)key)[31] = (((u8 *)key)[31] & 127) | 64;

	x1[0] = 1, x1[1] = x1[2] = x1[3] = 0;
	z1[0] = 1, z1[1] = z1[2] = z1[3] = 0;
	z2[0] = 1, z2[1] = z2[2] = z2[3] = 0;
	/* x2 starts from the x-coordinate of lift_x(9) - lift_x(1)
	 * (p_minus_s above). */
	memcpy(x2, p_minus_s, sizeof(p_minus_s));

	/* Consume scalar bits 3..254 (the inner bound of 63 on the last limb
	 * skips bit 255; bits 0..2 are covered by the trailing doublings).
	 * k runs 0..251, indexing table_ladder rows. */
	j = 3;
	for (i = 0; i < 4; ++i) {
		while (j < (const int[]){ 64, 64, 64, 63 }[i]) {
			u64 bit = (key[i] >> j) & 1;
			k = (64 * i + j - 3);
			/* Lazy constant-time conditional swap of the two
			 * running points, one cswap2 per bit. */
			swap = swap ^ bit;
			cswap2(swap, xz1, xz2);
			swap = bit;
			fsub(b, x1, z1);
			fadd(a, x1, z1);
			fmul(c, &table_ladder[4 * k], b, ef);
			fsub(b, a, c);
			fadd(a, a, c);
			fsqr2(ab, ab, efgh);
			fmul2(xz1, xz2, ab, efgh);
			++j;
		}
		j = 0;
	}

	/* Multiply by 8, accounting for the cleared low three bits. */
	point_double(xz1, abcd, efgh);
	point_double(xz1, abcd, efgh);
	point_double(xz1, abcd, efgh);
	encode_point(out, xz1);

	/* Wipe the clamped key and all secret-dependent scratch. */
	memzero_explicit(tmp, sizeof(tmp));
}
1591
/* Gates the assembly fast path; presumably enabled at init time when the
 * CPU supports BMI2 and ADX (NOTE(review): the enabling code is outside
 * this view — confirm in the module init function). */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2_adx);
1593
/*
 * X25519 scalar multiplication: mypublic = secret * basepoint.
 * Dispatches to the BMI2/ADX implementation when the static key is
 * enabled, otherwise falls back to the generic C implementation.
 */
void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
		     const u8 secret[CURVE25519_KEY_SIZE],
		     const u8 basepoint[CURVE25519_KEY_SIZE])
{
	if (static_branch_likely(&curve25519_use_bmi2_adx))
		curve25519_ever64(mypublic, secret, basepoint);
	else
		curve25519_generic(mypublic, secret, basepoint);
}
EXPORT_SYMBOL(curve25519_arch);
1604
1605void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
1606			  const u8 secret[CURVE25519_KEY_SIZE])
1607{
1608	if (static_branch_likely(&curve25519_use_bmi2_adx))
1609		curve25519_ever64_base(pub, secret);
1610	else
1611		curve25519_generic(pub, secret, curve25519_base_point);
1612}
1613EXPORT_SYMBOL(curve25519_base_arch);
1614
1615static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
1616				 unsigned int len)
1617{
1618	u8 *secret = kpp_tfm_ctx(tfm);
1619
1620	if (!len)
1621		curve25519_generate_secret(secret);
1622	else if (len == CURVE25519_KEY_SIZE &&
1623		 crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
1624		memcpy(secret, buf, CURVE25519_KEY_SIZE);
1625	else
1626		return -EINVAL;
1627	return 0;
1628}
1629
1630static int curve25519_generate_public_key(struct kpp_request *req)
1631{
1632	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
1633	const u8 *secret = kpp_tfm_ctx(tfm);
1634	u8 buf[CURVE25519_KEY_SIZE];
1635	int copied, nbytes;
1636
1637	if (req->src)
1638		return -EINVAL;
1639
1640	curve25519_base_arch(buf, secret);
1641
1642	/* might want less than we've got */
1643	nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
1644	copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
1645								nbytes),
1646				     buf, nbytes);
1647	if (copied != nbytes)
1648		return -EINVAL;
1649	return 0;
1650}
1651
1652static int curve25519_compute_shared_secret(struct kpp_request *req)
1653{
1654	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
1655	const u8 *secret = kpp_tfm_ctx(tfm);
1656	u8 public_key[CURVE25519_KEY_SIZE];
1657	u8 buf[CURVE25519_KEY_SIZE];
1658	int copied, nbytes;
1659
1660	if (!req->src)
1661		return -EINVAL;
1662
1663	copied = sg_copy_to_buffer(req->src,
1664				   sg_nents_for_len(req->src,
1665						    CURVE25519_KEY_SIZE),
1666				   public_key, CURVE25519_KEY_SIZE);
1667	if (copied != CURVE25519_KEY_SIZE)
1668		return -EINVAL;
1669
1670	curve25519_arch(buf, secret, public_key);
1671
1672	/* might want less than we've got */
1673	nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
1674	copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
1675								nbytes),
1676				     buf, nbytes);
1677	if (copied != nbytes)
1678		return -EINVAL;
1679	return 0;
1680}
1681
/*
 * kpp .max_size callback: largest possible output (public key or
 * shared secret) is always one 32-byte key, regardless of tfm state.
 */
static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
{
	return CURVE25519_KEY_SIZE;
}
1686
/*
 * KPP algorithm descriptor registered with the crypto API.  The tfm
 * context holds only the 32-byte secret.  Priority 200 is intended to
 * outrank the generic C "curve25519" implementation when this driver
 * registers (presumably the generic one uses a lower priority -
 * confirm against crypto/curve25519-generic.c).
 */
static struct kpp_alg curve25519_alg = {
	.base.cra_name		= "curve25519",
	.base.cra_driver_name	= "curve25519-x86",
	.base.cra_priority	= 200,
	.base.cra_module	= THIS_MODULE,
	.base.cra_ctxsize	= CURVE25519_KEY_SIZE,

	.set_secret		= curve25519_set_secret,
	.generate_public_key	= curve25519_generate_public_key,
	.compute_shared_secret	= curve25519_compute_shared_secret,
	.max_size		= curve25519_max_size,
};
1699
1700
1701static int __init curve25519_mod_init(void)
1702{
1703	if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
1704		static_branch_enable(&curve25519_use_bmi2_adx);
1705	else
1706		return 0;
1707	return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
1708		crypto_register_kpp(&curve25519_alg) : 0;
1709}
1710
1711static void __exit curve25519_mod_exit(void)
1712{
1713	if (IS_REACHABLE(CONFIG_CRYPTO_KPP) &&
1714	    static_branch_likely(&curve25519_use_bmi2_adx))
1715		crypto_unregister_kpp(&curve25519_alg);
1716}
1717
module_init(curve25519_mod_init);
module_exit(curve25519_mod_exit);

/* Aliases let the crypto layer autoload this module by algorithm name. */
MODULE_ALIAS_CRYPTO("curve25519");
MODULE_ALIAS_CRYPTO("curve25519-x86");
MODULE_DESCRIPTION("Curve25519 algorithm, ADX optimized");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
v5.14.15
   1// SPDX-License-Identifier: GPL-2.0 OR MIT
   2/*
   3 * Copyright (C) 2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
   4 * Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation
   5 */
   6
   7#include <crypto/curve25519.h>
   8#include <crypto/internal/kpp.h>
   9
  10#include <linux/types.h>
  11#include <linux/jump_label.h>
  12#include <linux/kernel.h>
  13#include <linux/module.h>
  14#include <linux/scatterlist.h>
  15
  16#include <asm/cpufeature.h>
  17#include <asm/processor.h>
  18
  19static __always_inline u64 eq_mask(u64 a, u64 b)
  20{
  21	u64 x = a ^ b;
  22	u64 minus_x = ~x + (u64)1U;
  23	u64 x_or_minus_x = x | minus_x;
  24	u64 xnx = x_or_minus_x >> (u32)63U;
  25	return xnx - (u64)1U;
  26}
  27
  28static __always_inline u64 gte_mask(u64 a, u64 b)
  29{
  30	u64 x = a;
  31	u64 y = b;
  32	u64 x_xor_y = x ^ y;
  33	u64 x_sub_y = x - y;
  34	u64 x_sub_y_xor_y = x_sub_y ^ y;
  35	u64 q = x_xor_y | x_sub_y_xor_y;
  36	u64 x_xor_q = x ^ q;
  37	u64 x_xor_q_ = x_xor_q >> (u32)63U;
  38	return x_xor_q_ - (u64)1U;
  39}
  40
/*
 * add_scalar() - 256-bit addition of a four-limb little-endian number
 * with a single 64-bit scalar: out = f1 + f2.
 * Returns the carry out of the top limb (0 or 1).
 */
static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
{
	u64 carry_r;

	asm volatile(
		/* Clear registers to propagate the carry bit.
		 * The xors also clear CF so the adcx chain starts clean. */
		"  xor %%r8d, %%r8d;"
		"  xor %%r9d, %%r9d;"
		"  xor %%r10d, %%r10d;"
		"  xor %%r11d, %%r11d;"
		"  xor %k1, %k1;"

		/* Begin addition chain: f2 += f1[0], then ripple the carry
		 * through the zeroed registers while adding f1[1..3]. */
		"  addq 0(%3), %0;"
		"  movq %0, 0(%2);"
		"  adcxq 8(%3), %%r8;"
		"  movq %%r8, 8(%2);"
		"  adcxq 16(%3), %%r9;"
		"  movq %%r9, 16(%2);"
		"  adcxq 24(%3), %%r10;"
		"  movq %%r10, 24(%2);"

		/* Return the carry bit in a register */
		"  adcx %%r11, %1;"
	: "+&r" (f2), "=&r" (carry_r)
	: "r" (out), "r" (f1)
	: "%r8", "%r9", "%r10", "%r11", "memory", "cc"
	);

	return carry_r;
}
  74
/*
 * fadd() - field addition: out = (f1 + f2) mod (2^255 - 19), on
 * four-limb little-endian field elements.  A carry out of the 256-bit
 * sum is folded back in as +38, since 2^256 == 38 (mod 2^255 - 19).
 */
static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
{
	asm volatile(
		/* Compute the raw addition of f1 + f2 */
		"  movq 0(%0), %%r8;"
		"  addq 0(%2), %%r8;"
		"  movq 8(%0), %%r9;"
		"  adcxq 8(%2), %%r9;"
		"  movq 16(%0), %%r10;"
		"  adcxq 16(%2), %%r10;"
		"  movq 24(%0), %%r11;"
		"  adcxq 24(%2), %%r11;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 (cmovc keys off the final adcx carry) */
		"  mov $0, %%rax;"
		"  mov $38, %0;"
		"  cmovc %0, %%rax;"

		/* Step 2: Add carry*38 to the original sum */
		"  xor %%ecx, %%ecx;"
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %0, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%1);"
	: "+&r" (f2)
	: "r" (out), "r" (f1)
	: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
	);
}
 116
/*
 * fsub() - field subtraction: out = (f1 - f2) mod (2^255 - 19), on
 * four-limb little-endian field elements.  A borrow out of the 256-bit
 * difference is folded back in as -38 (2^256 == 38 mod 2^255 - 19).
 */
static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
{
	asm volatile(
		/* Compute the raw subtraction of f1-f2 */
		"  movq 0(%1), %%r8;"
		"  subq 0(%2), %%r8;"
		"  movq 8(%1), %%r9;"
		"  sbbq 8(%2), %%r9;"
		"  movq 16(%1), %%r10;"
		"  sbbq 16(%2), %%r10;"
		"  movq 24(%1), %%r11;"
		"  sbbq 24(%2), %%r11;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 (cmovc keys off the final sbbq borrow) */
		"  mov $0, %%rax;"
		"  mov $38, %%rcx;"
		"  cmovc %%rcx, %%rax;"

		/* Step 2: Subtract carry*38 from the original difference */
		"  sub %%rax, %%r8;"
		"  sbb $0, %%r9;"
		"  sbb $0, %%r10;"
		"  sbb $0, %%r11;"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rcx, %%rax;"
		"  sub %%rax, %%r8;"

		/* Store the result */
		"  movq %%r8, 0(%0);"
		"  movq %%r9, 8(%0);"
		"  movq %%r10, 16(%0);"
		"  movq %%r11, 24(%0);"
	:
	: "r" (out), "r" (f1), "r" (f2)
	: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
	);
}
 159
/*
 * fmul() - field multiplication: out = (f1 * f2) mod (2^255 - 19).
 * Uses the 8-element buffer tmp for the raw 512-bit product.
 *
 * The 4x4-limb schoolbook product is built with mulx and the two
 * independent ADX carry chains (adcx for the running sum read back
 * from tmp, adox for the partial-product ripple).  The high 256 bits
 * are then folded into the low half by multiplying with 38, since
 * 2^256 == 38 (mod 2^255 - 19).
 *
 * NOTE(review): all four operands are "+&r" because the "line up
 * pointers" step rewrites the pointer registers inside the asm.
 */
static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
	asm volatile(
		/* Compute the raw multiplication: tmp <- src1 * src2 */

		/* Compute src1[0] * src2 */
		"  movq 0(%1), %%rdx;"
		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  movq %%r8, 0(%0);"
		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"

		/* Compute src1[1] * src2 */
		"  movq 8(%1), %%rdx;"
		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"

		/* Compute src1[2] * src2 */
		"  movq 16(%1), %%rdx;"
		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 16(%0), %%r8;"   "  movq %%r8, 16(%0);"
		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"

		/* Compute src1[3] * src2 */
		"  movq 24(%1), %%rdx;"
		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 24(%0), %%r8;"   "  movq %%r8, 24(%0);"
		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 56(%0);"

		/* Line up pointers: %1 <- tmp (full product), %0 <- out */
		"  mov %0, %1;"
		"  mov %2, %0;"

		/* Wrap the result back into the field */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%1), %%r8, %%r13;"
		"  xor %k3, %k3;"
		"  adoxq 0(%1), %%r8;"
		"  mulxq 40(%1), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%1), %%r9;"
		"  mulxq 48(%1), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 16(%1), %%r10;"
		"  mulxq 56(%1), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%1), %%r11;"
		"  adcx %3, %%rax;"
		"  adox %3, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %3, %%r9;"
		"  movq %%r9, 8(%0);"
		"  adcx %3, %%r10;"
		"  movq %%r10, 16(%0);"
		"  adcx %3, %%r11;"
		"  movq %%r11, 24(%0);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%0);"
	: "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
	:
	: "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
	);
}
 238
/* Computes two field multiplications:
 * out[0] <- f1[0] * f2[0]
 * out[1] <- f1[1] * f2[1]
 * Uses the 16-element buffer tmp for intermediate results.
 *
 * Each 4-limb product follows the same mulx/adcx/adox schoolbook
 * pattern as fmul(); the second operand pair lives at byte offset 32
 * and its raw product at tmp offset 64.  Each 512-bit product is then
 * reduced independently by folding its high half times 38 into the
 * low half (2^256 == 38 mod 2^255 - 19).
 *
 * NOTE(review): all four operands are "+&r" because the "line up
 * pointers" step rewrites the pointer registers inside the asm.
 */
static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
	asm volatile(
		/* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */

		/* Compute src1[0] * src2 */
		"  movq 0(%1), %%rdx;"
		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  movq %%r8, 0(%0);"
		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"

		/* Compute src1[1] * src2 */
		"  movq 8(%1), %%rdx;"
		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 16(%0);"
		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"

		/* Compute src1[2] * src2 */
		"  movq 16(%1), %%rdx;"
		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 16(%0), %%r8;"   "  movq %%r8, 16(%0);"
		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 24(%0);"
		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"

		/* Compute src1[3] * src2 */
		"  movq 24(%1), %%rdx;"
		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10d, %%r10d;"   "  adcxq 24(%0), %%r8;"   "  movq %%r8, 24(%0);"
		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 32(%0);"
		"  mulxq 16(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 40(%0);"    "  mov $0, %%r8;"
		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 56(%0);"

		/* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */

		/* Compute src1[0] * src2 */
		"  movq 32(%1), %%rdx;"
		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  movq %%r8, 64(%0);"
		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"
		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"

		/* Compute src1[1] * src2 */
		"  movq 40(%1), %%rdx;"
		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 72(%0), %%r8;"   "  movq %%r8, 72(%0);"
		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 80(%0);"
		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"

		/* Compute src1[2] * src2 */
		"  movq 48(%1), %%rdx;"
		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 80(%0), %%r8;"   "  movq %%r8, 80(%0);"
		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 88(%0);"
		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  mov $0, %%r8;"
		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"

		/* Compute src1[3] * src2 */
		"  movq 56(%1), %%rdx;"
		"  mulxq 32(%3), %%r8, %%r9;"      "  xor %%r10d, %%r10d;"   "  adcxq 88(%0), %%r8;"   "  movq %%r8, 88(%0);"
		"  mulxq 40(%3), %%r10, %%r11;"    "  adox %%r9, %%r10;"     "  adcx %%rbx, %%r10;"    "  movq %%r10, 96(%0);"
		"  mulxq 48(%3), %%rbx, %%r13;"    "  adox %%r11, %%rbx;"    "  adcx %%r14, %%rbx;"    "  movq %%rbx, 104(%0);"    "  mov $0, %%r8;"
		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 112(%0);"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 120(%0);"

		/* Line up pointers: %1 <- tmp (both raw products), %0 <- out */
		"  mov %0, %1;"
		"  mov %2, %0;"

		/* Wrap the results back into the field */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%1), %%r8, %%r13;"
		"  xor %k3, %k3;"
		"  adoxq 0(%1), %%r8;"
		"  mulxq 40(%1), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%1), %%r9;"
		"  mulxq 48(%1), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 16(%1), %%r10;"
		"  mulxq 56(%1), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%1), %%r11;"
		"  adcx %3, %%rax;"
		"  adox %3, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %3, %%r9;"
		"  movq %%r9, 8(%0);"
		"  adcx %3, %%r10;"
		"  movq %%r10, 16(%0);"
		"  adcx %3, %%r11;"
		"  movq %%r11, 24(%0);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%0);"

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 96(%1), %%r8, %%r13;"
		"  xor %k3, %k3;"
		"  adoxq 64(%1), %%r8;"
		"  mulxq 104(%1), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 72(%1), %%r9;"
		"  mulxq 112(%1), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 80(%1), %%r10;"
		"  mulxq 120(%1), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 88(%1), %%r11;"
		"  adcx %3, %%rax;"
		"  adox %3, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %3, %%r9;"
		"  movq %%r9, 40(%0);"
		"  adcx %3, %%r10;"
		"  movq %%r10, 48(%0);"
		"  adcx %3, %%r11;"
		"  movq %%r11, 56(%0);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 32(%0);"
	: "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
	:
	: "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
	);
}
 383
/*
 * fmul_scalar() - field multiplication of a four-limb element by a
 * 64-bit scalar: out = (f1 * f2) mod (2^255 - 19).
 *
 * f2_r is pinned to %rdx because mulx takes its implicit multiplicand
 * from that register.  The single fold of the overflow word times 38
 * is only sufficient for small scalars (presumably constants such as
 * 121665 - TODO confirm the caller-side precondition).
 */
static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
{
	register u64 f2_r asm("rdx") = f2;

	asm volatile(
		/* Compute the raw multiplication of f1*f2 */
		"  mulxq 0(%2), %%r8, %%rcx;"      /* f1[0]*f2 */
		"  mulxq 8(%2), %%r9, %%rbx;"      /* f1[1]*f2 */
		"  add %%rcx, %%r9;"
		"  mov $0, %%rcx;"
		"  mulxq 16(%2), %%r10, %%r13;"    /* f1[2]*f2 */
		"  adcx %%rbx, %%r10;"
		"  mulxq 24(%2), %%r11, %%rax;"    /* f1[3]*f2 */
		"  adcx %%r13, %%r11;"
		"  adcx %%rcx, %%rax;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 */
		"  mov $38, %%rdx;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%1);"
	: "+&r" (f2_r)
	: "r" (out), "r" (f1)
	: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "memory", "cc"
	);
}
 426
 427/* Computes p1 <- bit ? p2 : p1 in constant time */
 428static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
 429{
 430	asm volatile(
 431		/* Invert the polarity of bit to match cmov expectations */
 432		"  add $18446744073709551615, %0;"
 433
 434		/* cswap p1[0], p2[0] */
 435		"  movq 0(%1), %%r8;"
 436		"  movq 0(%2), %%r9;"
 437		"  mov %%r8, %%r10;"
 438		"  cmovc %%r9, %%r8;"
 439		"  cmovc %%r10, %%r9;"
 440		"  movq %%r8, 0(%1);"
 441		"  movq %%r9, 0(%2);"
 442
 443		/* cswap p1[1], p2[1] */
 444		"  movq 8(%1), %%r8;"
 445		"  movq 8(%2), %%r9;"
 446		"  mov %%r8, %%r10;"
 447		"  cmovc %%r9, %%r8;"
 448		"  cmovc %%r10, %%r9;"
 449		"  movq %%r8, 8(%1);"
 450		"  movq %%r9, 8(%2);"
 451
 452		/* cswap p1[2], p2[2] */
 453		"  movq 16(%1), %%r8;"
 454		"  movq 16(%2), %%r9;"
 455		"  mov %%r8, %%r10;"
 456		"  cmovc %%r9, %%r8;"
 457		"  cmovc %%r10, %%r9;"
 458		"  movq %%r8, 16(%1);"
 459		"  movq %%r9, 16(%2);"
 460
 461		/* cswap p1[3], p2[3] */
 462		"  movq 24(%1), %%r8;"
 463		"  movq 24(%2), %%r9;"
 464		"  mov %%r8, %%r10;"
 465		"  cmovc %%r9, %%r8;"
 466		"  cmovc %%r10, %%r9;"
 467		"  movq %%r8, 24(%1);"
 468		"  movq %%r9, 24(%2);"
 469
 470		/* cswap p1[4], p2[4] */
 471		"  movq 32(%1), %%r8;"
 472		"  movq 32(%2), %%r9;"
 473		"  mov %%r8, %%r10;"
 474		"  cmovc %%r9, %%r8;"
 475		"  cmovc %%r10, %%r9;"
 476		"  movq %%r8, 32(%1);"
 477		"  movq %%r9, 32(%2);"
 478
 479		/* cswap p1[5], p2[5] */
 480		"  movq 40(%1), %%r8;"
 481		"  movq 40(%2), %%r9;"
 482		"  mov %%r8, %%r10;"
 483		"  cmovc %%r9, %%r8;"
 484		"  cmovc %%r10, %%r9;"
 485		"  movq %%r8, 40(%1);"
 486		"  movq %%r9, 40(%2);"
 487
 488		/* cswap p1[6], p2[6] */
 489		"  movq 48(%1), %%r8;"
 490		"  movq 48(%2), %%r9;"
 491		"  mov %%r8, %%r10;"
 492		"  cmovc %%r9, %%r8;"
 493		"  cmovc %%r10, %%r9;"
 494		"  movq %%r8, 48(%1);"
 495		"  movq %%r9, 48(%2);"
 496
 497		/* cswap p1[7], p2[7] */
 498		"  movq 56(%1), %%r8;"
 499		"  movq 56(%2), %%r9;"
 500		"  mov %%r8, %%r10;"
 501		"  cmovc %%r9, %%r8;"
 502		"  cmovc %%r10, %%r9;"
 503		"  movq %%r8, 56(%1);"
 504		"  movq %%r9, 56(%2);"
 505	: "+&r" (bit)
 506	: "r" (p1), "r" (p2)
 507	: "%r8", "%r9", "%r10", "memory", "cc"
 508	);
 509}
 510
/*
 * fsqr() - field squaring: out = (f * f) mod (2^255 - 19).
 * Uses the 8-element buffer tmp for the raw 512-bit square.
 *
 * Exploits symmetry: the six cross products f[i]*f[j] (i != j) are
 * computed once, doubled via the self-add adcx chain in Step 2, and
 * the diagonal squares f[i]^2 are added in Step 3.  The reduction then
 * folds the high 256 bits times 38 into the low half, as in fmul().
 *
 * NOTE(review): the three operands are "+&r" because the "line up
 * pointers" step rewrites the pointer registers inside the asm.
 */
static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
{
	asm volatile(
		/* Compute the raw multiplication: tmp <- f * f */

		/* Step 1: Compute all partial products */
		"  movq 0(%1), %%rdx;"                                       /* f[0] */
		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
		"  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
		"  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
		"  movq 24(%1), %%rdx;"                                      /* f[3] */
		"  mulxq 8(%1), %%r11, %%rbx;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
		"  mulxq 16(%1), %%rax, %%r13;"    "  adcx %%rax, %%rbx;"    /* f[2]*f[3] */
		"  movq 8(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f1 */
		"  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */

		/* Step 2: Compute two parallel carry chains */
		"  xor %%r15d, %%r15d;"
		"  adox %%rax, %%r10;"
		"  adcx %%r8, %%r8;"
		"  adox %%rcx, %%r11;"
		"  adcx %%r9, %%r9;"
		"  adox %%r15, %%rbx;"
		"  adcx %%r10, %%r10;"
		"  adox %%r15, %%r13;"
		"  adcx %%r11, %%r11;"
		"  adox %%r15, %%r14;"
		"  adcx %%rbx, %%rbx;"
		"  adcx %%r13, %%r13;"
		"  adcx %%r14, %%r14;"

		/* Step 3: Compute intermediate squares */
		"  movq 0(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[0]^2 */
		                           "  movq %%rax, 0(%0);"
		"  add %%rcx, %%r8;"       "  movq %%r8, 8(%0);"
		"  movq 8(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[1]^2 */
		"  adcx %%rax, %%r9;"      "  movq %%r9, 16(%0);"
		"  adcx %%rcx, %%r10;"     "  movq %%r10, 24(%0);"
		"  movq 16(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
		"  adcx %%rax, %%r11;"     "  movq %%r11, 32(%0);"
		"  adcx %%rcx, %%rbx;"     "  movq %%rbx, 40(%0);"
		"  movq 24(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
		"  adcx %%rax, %%r13;"     "  movq %%r13, 48(%0);"
		"  adcx %%rcx, %%r14;"     "  movq %%r14, 56(%0);"

		/* Line up pointers: %1 <- tmp (raw square), %0 <- out */
		"  mov %0, %1;"
		"  mov %2, %0;"

		/* Wrap the result back into the field */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%1), %%r8, %%r13;"
		"  xor %%ecx, %%ecx;"
		"  adoxq 0(%1), %%r8;"
		"  mulxq 40(%1), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%1), %%r9;"
		"  mulxq 48(%1), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 16(%1), %%r10;"
		"  mulxq 56(%1), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%1), %%r11;"
		"  adcx %%rcx, %%rax;"
		"  adox %%rcx, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%0);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%0);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%0);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%0);"
	: "+&r" (tmp), "+&r" (f), "+&r" (out)
	:
	: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
	);
}
 601
 602/* Computes two field squarings:
 603 * out[0] <- f[0] * f[0]
 604 * out[1] <- f[1] * f[1]
 605 * Uses the 16-element buffer tmp for intermediate results */
 606static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 607{
 608	asm volatile(
 609		/* Step 1: Compute all partial products */
 610		"  movq 0(%1), %%rdx;"                                       /* f[0] */
 611		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
 612		"  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
 613		"  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
 614		"  movq 24(%1), %%rdx;"                                      /* f[3] */
 615		"  mulxq 8(%1), %%r11, %%rbx;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
 616		"  mulxq 16(%1), %%rax, %%r13;"    "  adcx %%rax, %%rbx;"    /* f[2]*f[3] */
 617		"  movq 8(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f1 */
 618		"  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
 
 
 
 
 
 
 
 619
 620		/* Step 2: Compute two parallel carry chains */
 621		"  xor %%r15d, %%r15d;"
 622		"  adox %%rax, %%r10;"
 623		"  adcx %%r8, %%r8;"
 624		"  adox %%rcx, %%r11;"
 625		"  adcx %%r9, %%r9;"
 626		"  adox %%r15, %%rbx;"
 627		"  adcx %%r10, %%r10;"
 628		"  adox %%r15, %%r13;"
 629		"  adcx %%r11, %%r11;"
 630		"  adox %%r15, %%r14;"
 631		"  adcx %%rbx, %%rbx;"
 632		"  adcx %%r13, %%r13;"
 633		"  adcx %%r14, %%r14;"
 634
 635		/* Step 3: Compute intermediate squares */
 636		"  movq 0(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[0]^2 */
 637		                           "  movq %%rax, 0(%0);"
 638		"  add %%rcx, %%r8;"       "  movq %%r8, 8(%0);"
 639		"  movq 8(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[1]^2 */
 640		"  adcx %%rax, %%r9;"      "  movq %%r9, 16(%0);"
 641		"  adcx %%rcx, %%r10;"     "  movq %%r10, 24(%0);"
 642		"  movq 16(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
 643		"  adcx %%rax, %%r11;"     "  movq %%r11, 32(%0);"
 644		"  adcx %%rcx, %%rbx;"     "  movq %%rbx, 40(%0);"
 645		"  movq 24(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
 646		"  adcx %%rax, %%r13;"     "  movq %%r13, 48(%0);"
 647		"  adcx %%rcx, %%r14;"     "  movq %%r14, 56(%0);"
 
 
 
 
 
 
 
 
 
 
 
 648
 649		/* Step 1: Compute all partial products */
 650		"  movq 32(%1), %%rdx;"                                       /* f[0] */
 651		"  mulxq 40(%1), %%r8, %%r14;"     "  xor %%r15d, %%r15d;"   /* f[1]*f[0] */
 652		"  mulxq 48(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
 653		"  mulxq 56(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
 654		"  movq 56(%1), %%rdx;"                                      /* f[3] */
 655		"  mulxq 40(%1), %%r11, %%rbx;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
 656		"  mulxq 48(%1), %%rax, %%r13;"    "  adcx %%rax, %%rbx;"    /* f[2]*f[3] */
 657		"  movq 40(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f1 */
 658		"  mulxq 48(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */
 
 
 
 
 
 
 
 659
 660		/* Step 2: Compute two parallel carry chains */
 661		"  xor %%r15d, %%r15d;"
 662		"  adox %%rax, %%r10;"
 663		"  adcx %%r8, %%r8;"
 664		"  adox %%rcx, %%r11;"
 665		"  adcx %%r9, %%r9;"
 666		"  adox %%r15, %%rbx;"
 667		"  adcx %%r10, %%r10;"
 668		"  adox %%r15, %%r13;"
 669		"  adcx %%r11, %%r11;"
 670		"  adox %%r15, %%r14;"
 671		"  adcx %%rbx, %%rbx;"
 672		"  adcx %%r13, %%r13;"
 673		"  adcx %%r14, %%r14;"
 674
 675		/* Step 3: Compute intermediate squares */
 676		"  movq 32(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[0]^2 */
 677		                           "  movq %%rax, 64(%0);"
 678		"  add %%rcx, %%r8;"       "  movq %%r8, 72(%0);"
 679		"  movq 40(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[1]^2 */
 680		"  adcx %%rax, %%r9;"      "  movq %%r9, 80(%0);"
 681		"  adcx %%rcx, %%r10;"     "  movq %%r10, 88(%0);"
 682		"  movq 48(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
 683		"  adcx %%rax, %%r11;"     "  movq %%r11, 96(%0);"
 684		"  adcx %%rcx, %%rbx;"     "  movq %%rbx, 104(%0);"
 685		"  movq 56(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
 686		"  adcx %%rax, %%r13;"     "  movq %%r13, 112(%0);"
 687		"  adcx %%rcx, %%r14;"     "  movq %%r14, 120(%0);"
 
 
 
 
 
 
 
 
 
 
 
 688
 689		/* Line up pointers */
 690		"  mov %0, %1;"
 691		"  mov %2, %0;"
 692
 693		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 694		"  mov $38, %%rdx;"
 695		"  mulxq 32(%1), %%r8, %%r13;"
 696		"  xor %%ecx, %%ecx;"
 697		"  adoxq 0(%1), %%r8;"
 698		"  mulxq 40(%1), %%r9, %%rbx;"
 699		"  adcx %%r13, %%r9;"
 700		"  adoxq 8(%1), %%r9;"
 701		"  mulxq 48(%1), %%r10, %%r13;"
 702		"  adcx %%rbx, %%r10;"
 703		"  adoxq 16(%1), %%r10;"
 704		"  mulxq 56(%1), %%r11, %%rax;"
 705		"  adcx %%r13, %%r11;"
 706		"  adoxq 24(%1), %%r11;"
 707		"  adcx %%rcx, %%rax;"
 708		"  adox %%rcx, %%rax;"
 709		"  imul %%rdx, %%rax;"
 710
 711		/* Step 2: Fold the carry back into dst */
 712		"  add %%rax, %%r8;"
 713		"  adcx %%rcx, %%r9;"
 714		"  movq %%r9, 8(%0);"
 715		"  adcx %%rcx, %%r10;"
 716		"  movq %%r10, 16(%0);"
 717		"  adcx %%rcx, %%r11;"
 718		"  movq %%r11, 24(%0);"
 719
 720		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 721		"  mov $0, %%rax;"
 722		"  cmovc %%rdx, %%rax;"
 723		"  add %%rax, %%r8;"
 724		"  movq %%r8, 0(%0);"
 725
 726		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 727		"  mov $38, %%rdx;"
 728		"  mulxq 96(%1), %%r8, %%r13;"
 729		"  xor %%ecx, %%ecx;"
 730		"  adoxq 64(%1), %%r8;"
 731		"  mulxq 104(%1), %%r9, %%rbx;"
 732		"  adcx %%r13, %%r9;"
 733		"  adoxq 72(%1), %%r9;"
 734		"  mulxq 112(%1), %%r10, %%r13;"
 735		"  adcx %%rbx, %%r10;"
 736		"  adoxq 80(%1), %%r10;"
 737		"  mulxq 120(%1), %%r11, %%rax;"
 738		"  adcx %%r13, %%r11;"
 739		"  adoxq 88(%1), %%r11;"
 740		"  adcx %%rcx, %%rax;"
 741		"  adox %%rcx, %%rax;"
 742		"  imul %%rdx, %%rax;"
 743
 744		/* Step 2: Fold the carry back into dst */
 745		"  add %%rax, %%r8;"
 746		"  adcx %%rcx, %%r9;"
 747		"  movq %%r9, 40(%0);"
 748		"  adcx %%rcx, %%r10;"
 749		"  movq %%r10, 48(%0);"
 750		"  adcx %%rcx, %%r11;"
 751		"  movq %%r11, 56(%0);"
 752
 753		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 754		"  mov $0, %%rax;"
 755		"  cmovc %%rdx, %%rax;"
 756		"  add %%rax, %%r8;"
 757		"  movq %%r8, 32(%0);"
 758	: "+&r" (tmp), "+&r" (f), "+&r" (out)
 759	:
 760	: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
 761	);
 762}
 763
/* One combined Montgomery-ladder step: replaces the working pair
 * (nq, nq_p1) with (2*nq, nq + nq_p1), both in projective (X:Z) form.
 * q holds the x-coordinate of the base point, needed to complete the
 * differential addition; tmp2 is scratch for the field multipliers.
 *
 * p01_tmp1 layout: limbs [0..7] nq = (x2, z2), [8..15] nq_p1 = (x3, z3),
 * [16..31] tmp1 scratch.  Several pointers below deliberately alias:
 * ab spans the adjacent pair (a, b) and dc spans (d, c), so the *2
 * routines can process two field elements per call.
 *
 * NOTE(review): formula comments below follow RFC 7748 notation and
 * assume fadd/fsub/fmul2/fsqr2 (defined earlier in this file) implement
 * GF(2^255-19) arithmetic, pairwise for the *2 variants.
 */
static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2)
{
	u64 *nq = p01_tmp1;
	u64 *nq_p1 = p01_tmp1 + (u32)8U;
	u64 *tmp1 = p01_tmp1 + (u32)16U;
	u64 *x1 = q;
	u64 *x2 = nq;
	u64 *z2 = nq + (u32)4U;
	u64 *z3 = nq_p1 + (u32)4U;
	u64 *a = tmp1;
	u64 *b = tmp1 + (u32)4U;
	u64 *ab = tmp1;			/* aliases the pair (a, b) */
	u64 *dc = tmp1 + (u32)8U;	/* will hold the pair (d, c) */
	u64 *x3;
	u64 *z31;
	u64 *d0;
	u64 *c0;
	u64 *a1;
	u64 *b1;
	u64 *d;
	u64 *c;
	u64 *ab1;
	u64 *dc1;
	fadd(a, x2, z2);		/* A = x2 + z2 */
	fsub(b, x2, z2);		/* B = x2 - z2 */
	x3 = nq_p1;
	z31 = nq_p1 + (u32)4U;
	d0 = dc;
	c0 = dc + (u32)4U;
	fadd(c0, x3, z31);		/* C = x3 + z3 */
	fsub(d0, x3, z31);		/* D = x3 - z3 */
	fmul2(dc, dc, ab, tmp2);	/* DA = D*A, CB = C*B */
	fadd(x3, d0, c0);		/* x3 = DA + CB */
	fsub(z31, d0, c0);		/* z3 = DA - CB */
	a1 = tmp1;
	b1 = tmp1 + (u32)4U;
	d = tmp1 + (u32)8U;
	c = tmp1 + (u32)12U;
	ab1 = tmp1;
	dc1 = tmp1 + (u32)8U;
	fsqr2(dc1, ab1, tmp2);		/* AA = A^2, BB = B^2 */
	fsqr2(nq_p1, nq_p1, tmp2);	/* x3 = (DA+CB)^2, z3 = (DA-CB)^2 */
	a1[0U] = c[0U];			/* save BB before c is clobbered */
	a1[1U] = c[1U];
	a1[2U] = c[2U];
	a1[3U] = c[3U];
	fsub(c, d, c);				/* E = AA - BB */
	fmul_scalar(b1, c, (u64)121665U);	/* a24*E; a24 = (486662-2)/4 */
	fadd(b1, b1, d);			/* AA + a24*E */
	fmul2(nq, dc1, ab1, tmp2);	/* x2 = AA*BB, z2 = E*(AA + a24*E) */
	fmul(z3, z3, x1, tmp2);		/* z3 = x1*(DA - CB)^2 */
}
 816
/* Projective point doubling: nq = 2*nq in (X:Z) form, using the same
 * doubling formulas (RFC 7748 notation) as point_add_and_double.
 * tmp1 is 16-limb scratch, tmp2 is scratch for the field multipliers.
 * As above, ab aliases the pair (a, b) and dc the pair (d, c).
 */
static void point_double(u64 *nq, u64 *tmp1, u64 *tmp2)
{
	u64 *x2 = nq;
	u64 *z2 = nq + (u32)4U;
	u64 *a = tmp1;
	u64 *b = tmp1 + (u32)4U;
	u64 *d = tmp1 + (u32)8U;
	u64 *c = tmp1 + (u32)12U;
	u64 *ab = tmp1;			/* aliases the pair (a, b) */
	u64 *dc = tmp1 + (u32)8U;	/* aliases the pair (d, c) */
	fadd(a, x2, z2);		/* A = x2 + z2 */
	fsub(b, x2, z2);		/* B = x2 - z2 */
	fsqr2(dc, ab, tmp2);		/* AA = A^2, BB = B^2 */
	a[0U] = c[0U];			/* save BB before c is clobbered */
	a[1U] = c[1U];
	a[2U] = c[2U];
	a[3U] = c[3U];
	fsub(c, d, c);				/* E = AA - BB */
	fmul_scalar(b, c, (u64)121665U);	/* a24*E */
	fadd(b, b, d);				/* AA + a24*E */
	fmul2(nq, dc, ab, tmp2);	/* x2 = AA*BB, z2 = E*(AA + a24*E) */
}
 839
/* Constant-time Montgomery ladder: out = X(key * P), where init1 holds
 * the projective coordinates (X:Z) of P (8 limbs).
 *
 * Working buffer p01_tmp1_swap layout: [0..7] p0 = (1:0) the identity,
 * [8..15] p1 = copy of P, [16..31] ladder scratch, [32] the previous
 * swap bit.  Each iteration conditionally swaps the point pair with
 * cswap2 using (previous_bit XOR current_bit), so only one constant-time
 * swap per step is needed instead of two.
 *
 * The loop scans key bits 253 down to 3 (251 iterations); one
 * unconditional swap + ladder step before the loop accounts for the top
 * of the scan, and the three trailing point_double calls multiply by 8
 * in place of bits 2..0 — NOTE(review): this relies on the scalar being
 * clamped (low three bits zero, bit 255 clear) by the caller; confirm
 * against the callers of this function.
 */
static void montgomery_ladder(u64 *out, const u8 *key, u64 *init1)
{
	u64 tmp2[16U] = { 0U };
	u64 p01_tmp1_swap[33U] = { 0U };
	u64 *p0 = p01_tmp1_swap;
	u64 *p01 = p01_tmp1_swap;
	u64 *p03 = p01;
	u64 *p11 = p01 + (u32)8U;
	u64 *x0;
	u64 *z0;
	u64 *p01_tmp1;
	u64 *p01_tmp11;
	u64 *nq10;
	u64 *nq_p11;
	u64 *swap1;
	u64 sw0;
	u64 *nq1;
	u64 *tmp1;
	memcpy(p11, init1, (u32)8U * sizeof(init1[0U]));
	x0 = p03;
	z0 = p03 + (u32)4U;
	/* p0 = (1:0), the projective point at infinity */
	x0[0U] = (u64)1U;
	x0[1U] = (u64)0U;
	x0[2U] = (u64)0U;
	x0[3U] = (u64)0U;
	z0[0U] = (u64)0U;
	z0[1U] = (u64)0U;
	z0[2U] = (u64)0U;
	z0[3U] = (u64)0U;
	p01_tmp1 = p01_tmp1_swap;
	p01_tmp11 = p01_tmp1_swap;
	nq10 = p01_tmp1_swap;
	nq_p11 = p01_tmp1_swap + (u32)8U;
	swap1 = p01_tmp1_swap + (u32)32U;
	/* Unconditional first swap + step; record swap state = 1 */
	cswap2((u64)1U, nq10, nq_p11);
	point_add_and_double(init1, p01_tmp11, tmp2);
	swap1[0U] = (u64)1U;
	{
		u32 i;
		for (i = (u32)0U; i < (u32)251U; i = i + (u32)1U) {
			u64 *p01_tmp12 = p01_tmp1_swap;
			u64 *swap2 = p01_tmp1_swap + (u32)32U;
			u64 *nq2 = p01_tmp12;
			u64 *nq_p12 = p01_tmp12 + (u32)8U;
			/* bit (253 - i) of the scalar, little-endian bytes */
			u64 bit = (u64)(key[((u32)253U - i) / (u32)8U] >> ((u32)253U - i) % (u32)8U & (u8)1U);
			/* swap only if this bit differs from the previous one */
			u64 sw = swap2[0U] ^ bit;
			cswap2(sw, nq2, nq_p12);
			point_add_and_double(init1, p01_tmp12, tmp2);
			swap2[0U] = bit;
		}
	}
	/* Undo any outstanding swap so p0 holds key*P */
	sw0 = swap1[0U];
	cswap2(sw0, nq10, nq_p11);
	nq1 = p01_tmp1;
	tmp1 = p01_tmp1 + (u32)16U;
	/* Three doublings stand in for the three unscanned low bits */
	point_double(nq1, tmp1, tmp2);
	point_double(nq1, tmp1, tmp2);
	point_double(nq1, tmp1, tmp2);
	memcpy(out, p0, (u32)8U * sizeof(p0[0U]));

	/* Wipe secret-dependent intermediates from the stack */
	memzero_explicit(tmp2, sizeof(tmp2));
	memzero_explicit(p01_tmp1_swap, sizeof(p01_tmp1_swap));
}
 903
 904static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1)
 905{
 906	u32 i;
 907	fsqr(o, inp, tmp);
 908	for (i = (u32)0U; i < n1 - (u32)1U; i = i + (u32)1U)
 909		fsqr(o, o, tmp);
 910}
 911
/* Field inversion by Fermat's little theorem: o = i^(p-2) mod p with
 * p = 2^255 - 19, via a fixed (constant-time) addition chain of 254
 * squarings and 11 multiplications.  Running exponents are noted on the
 * right; the chain ends at 2^255 - 21 = p - 2.
 * tmp is 16-limb scratch shared with fsquare_times/fmul.
 */
static void finv(u64 *o, const u64 *i, u64 *tmp)
{
	u64 t1[16U] = { 0U };
	u64 *a0 = t1;
	u64 *b = t1 + (u32)4U;
	u64 *c = t1 + (u32)8U;
	u64 *t00 = t1 + (u32)12U;
	u64 *tmp1 = tmp;
	u64 *a;
	u64 *t0;
	fsquare_times(a0, i, tmp1, (u32)1U);	/* a0 = i^2 */
	fsquare_times(t00, a0, tmp1, (u32)2U);	/* t00 = i^8 */
	fmul(b, t00, i, tmp);			/* b = i^9 */
	fmul(a0, b, a0, tmp);			/* a0 = i^11 */
	fsquare_times(t00, a0, tmp1, (u32)1U);	/* t00 = i^22 */
	fmul(b, t00, b, tmp);			/* b = i^31 = i^(2^5-1) */
	fsquare_times(t00, b, tmp1, (u32)5U);
	fmul(b, t00, b, tmp);			/* b = i^(2^10-1) */
	fsquare_times(t00, b, tmp1, (u32)10U);
	fmul(c, t00, b, tmp);			/* c = i^(2^20-1) */
	fsquare_times(t00, c, tmp1, (u32)20U);
	fmul(t00, t00, c, tmp);			/* t00 = i^(2^40-1) */
	fsquare_times(t00, t00, tmp1, (u32)10U);
	fmul(b, t00, b, tmp);			/* b = i^(2^50-1) */
	fsquare_times(t00, b, tmp1, (u32)50U);
	fmul(c, t00, b, tmp);			/* c = i^(2^100-1) */
	fsquare_times(t00, c, tmp1, (u32)100U);
	fmul(t00, t00, c, tmp);			/* t00 = i^(2^200-1) */
	fsquare_times(t00, t00, tmp1, (u32)50U);
	fmul(t00, t00, b, tmp);			/* t00 = i^(2^250-1) */
	fsquare_times(t00, t00, tmp1, (u32)5U);	/* t00 = i^(2^255-2^5) */
	a = t1;
	t0 = t1 + (u32)12U;
	fmul(o, t0, a, tmp);			/* o = i^(2^255-21) = i^(p-2) */
}
 947
 948static void store_felem(u64 *b, u64 *f)
 949{
 950	u64 f30 = f[3U];
 951	u64 top_bit0 = f30 >> (u32)63U;
 952	u64 f31;
 953	u64 top_bit;
 954	u64 f0;
 955	u64 f1;
 956	u64 f2;
 957	u64 f3;
 958	u64 m0;
 959	u64 m1;
 960	u64 m2;
 961	u64 m3;
 962	u64 mask;
 963	u64 f0_;
 964	u64 f1_;
 965	u64 f2_;
 966	u64 f3_;
 967	u64 o0;
 968	u64 o1;
 969	u64 o2;
 970	u64 o3;
 971	f[3U] = f30 & (u64)0x7fffffffffffffffU;
 972	add_scalar(f, f, (u64)19U * top_bit0);
 973	f31 = f[3U];
 974	top_bit = f31 >> (u32)63U;
 975	f[3U] = f31 & (u64)0x7fffffffffffffffU;
 976	add_scalar(f, f, (u64)19U * top_bit);
 977	f0 = f[0U];
 978	f1 = f[1U];
 979	f2 = f[2U];
 980	f3 = f[3U];
 981	m0 = gte_mask(f0, (u64)0xffffffffffffffedU);
 982	m1 = eq_mask(f1, (u64)0xffffffffffffffffU);
 983	m2 = eq_mask(f2, (u64)0xffffffffffffffffU);
 984	m3 = eq_mask(f3, (u64)0x7fffffffffffffffU);
 985	mask = ((m0 & m1) & m2) & m3;
 986	f0_ = f0 - (mask & (u64)0xffffffffffffffedU);
 987	f1_ = f1 - (mask & (u64)0xffffffffffffffffU);
 988	f2_ = f2 - (mask & (u64)0xffffffffffffffffU);
 989	f3_ = f3 - (mask & (u64)0x7fffffffffffffffU);
 990	o0 = f0_;
 991	o1 = f1_;
 992	o2 = f2_;
 993	o3 = f3_;
 994	b[0U] = o0;
 995	b[1U] = o1;
 996	b[2U] = o2;
 997	b[3U] = o3;
 998}
 999
1000static void encode_point(u8 *o, const u64 *i)
1001{
1002	const u64 *x = i;
1003	const u64 *z = i + (u32)4U;
1004	u64 tmp[4U] = { 0U };
1005	u64 tmp_w[16U] = { 0U };
1006	finv(tmp, z, tmp_w);
1007	fmul(tmp, tmp, x, tmp_w);
1008	store_felem((u64 *)o, tmp);
1009}
1010
1011static void curve25519_ever64(u8 *out, const u8 *priv, const u8 *pub)
1012{
1013	u64 init1[8U] = { 0U };
1014	u64 tmp[4U] = { 0U };
1015	u64 tmp3;
1016	u64 *x;
1017	u64 *z;
1018	{
1019		u32 i;
1020		for (i = (u32)0U; i < (u32)4U; i = i + (u32)1U) {
1021			u64 *os = tmp;
1022			const u8 *bj = pub + i * (u32)8U;
1023			u64 u = *(u64 *)bj;
1024			u64 r = u;
1025			u64 x0 = r;
1026			os[i] = x0;
1027		}
1028	}
1029	tmp3 = tmp[3U];
1030	tmp[3U] = tmp3 & (u64)0x7fffffffffffffffU;
1031	x = init1;
1032	z = init1 + (u32)4U;
1033	z[0U] = (u64)1U;
1034	z[1U] = (u64)0U;
1035	z[2U] = (u64)0U;
1036	z[3U] = (u64)0U;
1037	x[0U] = tmp[0U];
1038	x[1U] = tmp[1U];
1039	x[2U] = tmp[2U];
1040	x[3U] = tmp[3U];
1041	montgomery_ladder(init1, priv, init1);
1042	encode_point(out, init1);
1043}
1044
1045/* The below constants were generated using this sage script:
1046 *
1047 * #!/usr/bin/env sage
1048 * import sys
1049 * from sage.all import *
1050 * def limbs(n):
1051 * 	n = int(n)
1052 * 	l = ((n >> 0) % 2^64, (n >> 64) % 2^64, (n >> 128) % 2^64, (n >> 192) % 2^64)
1053 * 	return "0x%016xULL, 0x%016xULL, 0x%016xULL, 0x%016xULL" % l
1054 * ec = EllipticCurve(GF(2^255 - 19), [0, 486662, 0, 1, 0])
1055 * p_minus_s = (ec.lift_x(9) - ec.lift_x(1))[0]
1056 * print("static const u64 p_minus_s[] = { %s };\n" % limbs(p_minus_s))
1057 * print("static const u64 table_ladder[] = {")
1058 * p = ec.lift_x(9)
1059 * for i in range(252):
1060 * 	l = (p[0] + p[2]) / (p[0] - p[2])
1061 * 	print(("\t%s" + ("," if i != 251 else "")) % limbs(l))
1062 * 	p = p * 2
1063 * print("};")
1064 *
1065 */
1066
/* Four little-endian limbs of the x-coordinate of lift_x(9) - lift_x(1)
 * on the curve, as produced by the sage script in the comment above. */
static const u64 p_minus_s[] = { 0x816b1e0137d48290ULL, 0x440f6a51eb4d1207ULL, 0x52385f46dca2b71dULL, 0x215132111d8354cbULL };
1068
1069static const u64 table_ladder[] = {
1070	0xfffffffffffffff3ULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x5fffffffffffffffULL,
1071	0x6b8220f416aafe96ULL, 0x82ebeb2b4f566a34ULL, 0xd5a9a5b075a5950fULL, 0x5142b2cf4b2488f4ULL,
1072	0x6aaebc750069680cULL, 0x89cf7820a0f99c41ULL, 0x2a58d9183b56d0f4ULL, 0x4b5aca80e36011a4ULL,
1073	0x329132348c29745dULL, 0xf4a2e616e1642fd7ULL, 0x1e45bb03ff67bc34ULL, 0x306912d0f42a9b4aULL,
1074	0xff886507e6af7154ULL, 0x04f50e13dfeec82fULL, 0xaa512fe82abab5ceULL, 0x174e251a68d5f222ULL,
1075	0xcf96700d82028898ULL, 0x1743e3370a2c02c5ULL, 0x379eec98b4e86eaaULL, 0x0c59888a51e0482eULL,
1076	0xfbcbf1d699b5d189ULL, 0xacaef0d58e9fdc84ULL, 0xc1c20d06231f7614ULL, 0x2938218da274f972ULL,
1077	0xf6af49beff1d7f18ULL, 0xcc541c22387ac9c2ULL, 0x96fcc9ef4015c56bULL, 0x69c1627c690913a9ULL,
1078	0x7a86fd2f4733db0eULL, 0xfdb8c4f29e087de9ULL, 0x095e4b1a8ea2a229ULL, 0x1ad7a7c829b37a79ULL,
1079	0x342d89cad17ea0c0ULL, 0x67bedda6cced2051ULL, 0x19ca31bf2bb42f74ULL, 0x3df7b4c84980acbbULL,
1080	0xa8c6444dc80ad883ULL, 0xb91e440366e3ab85ULL, 0xc215cda00164f6d8ULL, 0x3d867c6ef247e668ULL,
1081	0xc7dd582bcc3e658cULL, 0xfd2c4748ee0e5528ULL, 0xa0fd9b95cc9f4f71ULL, 0x7529d871b0675ddfULL,
1082	0xb8f568b42d3cbd78ULL, 0x1233011b91f3da82ULL, 0x2dce6ccd4a7c3b62ULL, 0x75e7fc8e9e498603ULL,
1083	0x2f4f13f1fcd0b6ecULL, 0xf1a8ca1f29ff7a45ULL, 0xc249c1a72981e29bULL, 0x6ebe0dbb8c83b56aULL,
1084	0x7114fa8d170bb222ULL, 0x65a2dcd5bf93935fULL, 0xbdc41f68b59c979aULL, 0x2f0eef79a2ce9289ULL,
1085	0x42ecbf0c083c37ceULL, 0x2930bc09ec496322ULL, 0xf294b0c19cfeac0dULL, 0x3780aa4bedfabb80ULL,
1086	0x56c17d3e7cead929ULL, 0xe7cb4beb2e5722c5ULL, 0x0ce931732dbfe15aULL, 0x41b883c7621052f8ULL,
1087	0xdbf75ca0c3d25350ULL, 0x2936be086eb1e351ULL, 0xc936e03cb4a9b212ULL, 0x1d45bf82322225aaULL,
1088	0xe81ab1036a024cc5ULL, 0xe212201c304c9a72ULL, 0xc5d73fba6832b1fcULL, 0x20ffdb5a4d839581ULL,
1089	0xa283d367be5d0fadULL, 0x6c2b25ca8b164475ULL, 0x9d4935467caaf22eULL, 0x5166408eee85ff49ULL,
1090	0x3c67baa2fab4e361ULL, 0xb3e433c67ef35cefULL, 0x5259729241159b1cULL, 0x6a621892d5b0ab33ULL,
1091	0x20b74a387555cdcbULL, 0x532aa10e1208923fULL, 0xeaa17b7762281dd1ULL, 0x61ab3443f05c44bfULL,
1092	0x257a6c422324def8ULL, 0x131c6c1017e3cf7fULL, 0x23758739f630a257ULL, 0x295a407a01a78580ULL,
1093	0xf8c443246d5da8d9ULL, 0x19d775450c52fa5dULL, 0x2afcfc92731bf83dULL, 0x7d10c8e81b2b4700ULL,
1094	0xc8e0271f70baa20bULL, 0x993748867ca63957ULL, 0x5412efb3cb7ed4bbULL, 0x3196d36173e62975ULL,
1095	0xde5bcad141c7dffcULL, 0x47cc8cd2b395c848ULL, 0xa34cd942e11af3cbULL, 0x0256dbf2d04ecec2ULL,
1096	0x875ab7e94b0e667fULL, 0xcad4dd83c0850d10ULL, 0x47f12e8f4e72c79fULL, 0x5f1a87bb8c85b19bULL,
1097	0x7ae9d0b6437f51b8ULL, 0x12c7ce5518879065ULL, 0x2ade09fe5cf77aeeULL, 0x23a05a2f7d2c5627ULL,
1098	0x5908e128f17c169aULL, 0xf77498dd8ad0852dULL, 0x74b4c4ceab102f64ULL, 0x183abadd10139845ULL,
1099	0xb165ba8daa92aaacULL, 0xd5c5ef9599386705ULL, 0xbe2f8f0cf8fc40d1ULL, 0x2701e635ee204514ULL,
1100	0x629fa80020156514ULL, 0xf223868764a8c1ceULL, 0x5b894fff0b3f060eULL, 0x60d9944cf708a3faULL,
1101	0xaeea001a1c7a201fULL, 0xebf16a633ee2ce63ULL, 0x6f7709594c7a07e1ULL, 0x79b958150d0208cbULL,
1102	0x24b55e5301d410e7ULL, 0xe3a34edff3fdc84dULL, 0xd88768e4904032d8ULL, 0x131384427b3aaeecULL,
1103	0x8405e51286234f14ULL, 0x14dc4739adb4c529ULL, 0xb8a2b5b250634ffdULL, 0x2fe2a94ad8a7ff93ULL,
1104	0xec5c57efe843faddULL, 0x2843ce40f0bb9918ULL, 0xa4b561d6cf3d6305ULL, 0x743629bde8fb777eULL,
1105	0x343edd46bbaf738fULL, 0xed981828b101a651ULL, 0xa401760b882c797aULL, 0x1fc223e28dc88730ULL,
1106	0x48604e91fc0fba0eULL, 0xb637f78f052c6fa4ULL, 0x91ccac3d09e9239cULL, 0x23f7eed4437a687cULL,
1107	0x5173b1118d9bd800ULL, 0x29d641b63189d4a7ULL, 0xfdbf177988bbc586ULL, 0x2959894fcad81df5ULL,
1108	0xaebc8ef3b4bbc899ULL, 0x4148995ab26992b9ULL, 0x24e20b0134f92cfbULL, 0x40d158894a05dee8ULL,
1109	0x46b00b1185af76f6ULL, 0x26bac77873187a79ULL, 0x3dc0bf95ab8fff5fULL, 0x2a608bd8945524d7ULL,
1110	0x26449588bd446302ULL, 0x7c4bc21c0388439cULL, 0x8e98a4f383bd11b2ULL, 0x26218d7bc9d876b9ULL,
1111	0xe3081542997c178aULL, 0x3c2d29a86fb6606fULL, 0x5c217736fa279374ULL, 0x7dde05734afeb1faULL,
1112	0x3bf10e3906d42babULL, 0xe4f7803e1980649cULL, 0xe6053bf89595bf7aULL, 0x394faf38da245530ULL,
1113	0x7a8efb58896928f4ULL, 0xfbc778e9cc6a113cULL, 0x72670ce330af596fULL, 0x48f222a81d3d6cf7ULL,
1114	0xf01fce410d72caa7ULL, 0x5a20ecc7213b5595ULL, 0x7bc21165c1fa1483ULL, 0x07f89ae31da8a741ULL,
1115	0x05d2c2b4c6830ff9ULL, 0xd43e330fc6316293ULL, 0xa5a5590a96d3a904ULL, 0x705edb91a65333b6ULL,
1116	0x048ee15e0bb9a5f7ULL, 0x3240cfca9e0aaf5dULL, 0x8f4b71ceedc4a40bULL, 0x621c0da3de544a6dULL,
1117	0x92872836a08c4091ULL, 0xce8375b010c91445ULL, 0x8a72eb524f276394ULL, 0x2667fcfa7ec83635ULL,
1118	0x7f4c173345e8752aULL, 0x061b47feee7079a5ULL, 0x25dd9afa9f86ff34ULL, 0x3780cef5425dc89cULL,
1119	0x1a46035a513bb4e9ULL, 0x3e1ef379ac575adaULL, 0xc78c5f1c5fa24b50ULL, 0x321a967634fd9f22ULL,
1120	0x946707b8826e27faULL, 0x3dca84d64c506fd0ULL, 0xc189218075e91436ULL, 0x6d9284169b3b8484ULL,
1121	0x3a67e840383f2ddfULL, 0x33eec9a30c4f9b75ULL, 0x3ec7c86fa783ef47ULL, 0x26ec449fbac9fbc4ULL,
1122	0x5c0f38cba09b9e7dULL, 0x81168cc762a3478cULL, 0x3e23b0d306fc121cULL, 0x5a238aa0a5efdcddULL,
1123	0x1ba26121c4ea43ffULL, 0x36f8c77f7c8832b5ULL, 0x88fbea0b0adcf99aULL, 0x5ca9938ec25bebf9ULL,
1124	0xd5436a5e51fccda0ULL, 0x1dbc4797c2cd893bULL, 0x19346a65d3224a08ULL, 0x0f5034e49b9af466ULL,
1125	0xf23c3967a1e0b96eULL, 0xe58b08fa867a4d88ULL, 0xfb2fabc6a7341679ULL, 0x2a75381eb6026946ULL,
1126	0xc80a3be4c19420acULL, 0x66b1f6c681f2b6dcULL, 0x7cf7036761e93388ULL, 0x25abbbd8a660a4c4ULL,
1127	0x91ea12ba14fd5198ULL, 0x684950fc4a3cffa9ULL, 0xf826842130f5ad28ULL, 0x3ea988f75301a441ULL,
1128	0xc978109a695f8c6fULL, 0x1746eb4a0530c3f3ULL, 0x444d6d77b4459995ULL, 0x75952b8c054e5cc7ULL,
1129	0xa3703f7915f4d6aaULL, 0x66c346202f2647d8ULL, 0xd01469df811d644bULL, 0x77fea47d81a5d71fULL,
1130	0xc5e9529ef57ca381ULL, 0x6eeeb4b9ce2f881aULL, 0xb6e91a28e8009bd6ULL, 0x4b80be3e9afc3fecULL,
1131	0x7e3773c526aed2c5ULL, 0x1b4afcb453c9a49dULL, 0xa920bdd7baffb24dULL, 0x7c54699f122d400eULL,
1132	0xef46c8e14fa94bc8ULL, 0xe0b074ce2952ed5eULL, 0xbea450e1dbd885d5ULL, 0x61b68649320f712cULL,
1133	0x8a485f7309ccbdd1ULL, 0xbd06320d7d4d1a2dULL, 0x25232973322dbef4ULL, 0x445dc4758c17f770ULL,
1134	0xdb0434177cc8933cULL, 0xed6fe82175ea059fULL, 0x1efebefdc053db34ULL, 0x4adbe867c65daf99ULL,
1135	0x3acd71a2a90609dfULL, 0xe5e991856dd04050ULL, 0x1ec69b688157c23cULL, 0x697427f6885cfe4dULL,
1136	0xd7be7b9b65e1a851ULL, 0xa03d28d522c536ddULL, 0x28399d658fd2b645ULL, 0x49e5b7e17c2641e1ULL,
1137	0x6f8c3a98700457a4ULL, 0x5078f0a25ebb6778ULL, 0xd13c3ccbc382960fULL, 0x2e003258a7df84b1ULL,
1138	0x8ad1f39be6296a1cULL, 0xc1eeaa652a5fbfb2ULL, 0x33ee0673fd26f3cbULL, 0x59256173a69d2cccULL,
1139	0x41ea07aa4e18fc41ULL, 0xd9fc19527c87a51eULL, 0xbdaacb805831ca6fULL, 0x445b652dc916694fULL,
1140	0xce92a3a7f2172315ULL, 0x1edc282de11b9964ULL, 0xa1823aafe04c314aULL, 0x790a2d94437cf586ULL,
1141	0x71c447fb93f6e009ULL, 0x8922a56722845276ULL, 0xbf70903b204f5169ULL, 0x2f7a89891ba319feULL,
1142	0x02a08eb577e2140cULL, 0xed9a4ed4427bdcf4ULL, 0x5253ec44e4323cd1ULL, 0x3e88363c14e9355bULL,
1143	0xaa66c14277110b8cULL, 0x1ae0391610a23390ULL, 0x2030bd12c93fc2a2ULL, 0x3ee141579555c7abULL,
1144	0x9214de3a6d6e7d41ULL, 0x3ccdd88607f17efeULL, 0x674f1288f8e11217ULL, 0x5682250f329f93d0ULL,
1145	0x6cf00b136d2e396eULL, 0x6e4cf86f1014debfULL, 0x5930b1b5bfcc4e83ULL, 0x047069b48aba16b6ULL,
1146	0x0d4ce4ab69b20793ULL, 0xb24db91a97d0fb9eULL, 0xcdfa50f54e00d01dULL, 0x221b1085368bddb5ULL,
1147	0xe7e59468b1e3d8d2ULL, 0x53c56563bd122f93ULL, 0xeee8a903e0663f09ULL, 0x61efa662cbbe3d42ULL,
1148	0x2cf8ddddde6eab2aULL, 0x9bf80ad51435f231ULL, 0x5deadacec9f04973ULL, 0x29275b5d41d29b27ULL,
1149	0xcfde0f0895ebf14fULL, 0xb9aab96b054905a7ULL, 0xcae80dd9a1c420fdULL, 0x0a63bf2f1673bbc7ULL,
1150	0x092f6e11958fbc8cULL, 0x672a81e804822fadULL, 0xcac8351560d52517ULL, 0x6f3f7722c8f192f8ULL,
1151	0xf8ba90ccc2e894b7ULL, 0x2c7557a438ff9f0dULL, 0x894d1d855ae52359ULL, 0x68e122157b743d69ULL,
1152	0xd87e5570cfb919f3ULL, 0x3f2cdecd95798db9ULL, 0x2121154710c0a2ceULL, 0x3c66a115246dc5b2ULL,
1153	0xcbedc562294ecb72ULL, 0xba7143c36a280b16ULL, 0x9610c2efd4078b67ULL, 0x6144735d946a4b1eULL,
1154	0x536f111ed75b3350ULL, 0x0211db8c2041d81bULL, 0xf93cb1000e10413cULL, 0x149dfd3c039e8876ULL,
1155	0xd479dde46b63155bULL, 0xb66e15e93c837976ULL, 0xdafde43b1f13e038ULL, 0x5fafda1a2e4b0b35ULL,
1156	0x3600bbdf17197581ULL, 0x3972050bbe3cd2c2ULL, 0x5938906dbdd5be86ULL, 0x34fce5e43f9b860fULL,
1157	0x75a8a4cd42d14d02ULL, 0x828dabc53441df65ULL, 0x33dcabedd2e131d3ULL, 0x3ebad76fb814d25fULL,
1158	0xd4906f566f70e10fULL, 0x5d12f7aa51690f5aULL, 0x45adb16e76cefcf2ULL, 0x01f768aead232999ULL,
1159	0x2b6cc77b6248febdULL, 0x3cd30628ec3aaffdULL, 0xce1c0b80d4ef486aULL, 0x4c3bff2ea6f66c23ULL,
1160	0x3f2ec4094aeaeb5fULL, 0x61b19b286e372ca7ULL, 0x5eefa966de2a701dULL, 0x23b20565de55e3efULL,
1161	0xe301ca5279d58557ULL, 0x07b2d4ce27c2874fULL, 0xa532cd8a9dcf1d67ULL, 0x2a52fee23f2bff56ULL,
1162	0x8624efb37cd8663dULL, 0xbbc7ac20ffbd7594ULL, 0x57b85e9c82d37445ULL, 0x7b3052cb86a6ec66ULL,
1163	0x3482f0ad2525e91eULL, 0x2cb68043d28edca0ULL, 0xaf4f6d052e1b003aULL, 0x185f8c2529781b0aULL,
1164	0xaa41de5bd80ce0d6ULL, 0x9407b2416853e9d6ULL, 0x563ec36e357f4c3aULL, 0x4cc4b8dd0e297bceULL,
1165	0xa2fc1a52ffb8730eULL, 0x1811f16e67058e37ULL, 0x10f9a366cddf4ee1ULL, 0x72f4a0c4a0b9f099ULL,
1166	0x8c16c06f663f4ea7ULL, 0x693b3af74e970fbaULL, 0x2102e7f1d69ec345ULL, 0x0ba53cbc968a8089ULL,
1167	0xca3d9dc7fea15537ULL, 0x4c6824bb51536493ULL, 0xb9886314844006b1ULL, 0x40d2a72ab454cc60ULL,
1168	0x5936a1b712570975ULL, 0x91b9d648debda657ULL, 0x3344094bb64330eaULL, 0x006ba10d12ee51d0ULL,
1169	0x19228468f5de5d58ULL, 0x0eb12f4c38cc05b0ULL, 0xa1039f9dd5601990ULL, 0x4502d4ce4fff0e0bULL,
1170	0xeb2054106837c189ULL, 0xd0f6544c6dd3b93cULL, 0x40727064c416d74fULL, 0x6e15c6114b502ef0ULL,
1171	0x4df2a398cfb1a76bULL, 0x11256c7419f2f6b1ULL, 0x4a497962066e6043ULL, 0x705b3aab41355b44ULL,
1172	0x365ef536d797b1d8ULL, 0x00076bd622ddf0dbULL, 0x3bbf33b0e0575a88ULL, 0x3777aa05c8e4ca4dULL,
1173	0x392745c85578db5fULL, 0x6fda4149dbae5ae2ULL, 0xb1f0b00b8adc9867ULL, 0x09963437d36f1da3ULL,
1174	0x7e824e90a5dc3853ULL, 0xccb5f6641f135cbdULL, 0x6736d86c87ce8fccULL, 0x625f3ce26604249fULL,
1175	0xaf8ac8059502f63fULL, 0x0c05e70a2e351469ULL, 0x35292e9c764b6305ULL, 0x1a394360c7e23ac3ULL,
1176	0xd5c6d53251183264ULL, 0x62065abd43c2b74fULL, 0xb5fbf5d03b973f9bULL, 0x13a3da3661206e5eULL,
1177	0xc6bd5837725d94e5ULL, 0x18e30912205016c5ULL, 0x2088ce1570033c68ULL, 0x7fba1f495c837987ULL,
1178	0x5a8c7423f2f9079dULL, 0x1735157b34023fc5ULL, 0xe4f9b49ad2fab351ULL, 0x6691ff72c878e33cULL,
1179	0x122c2adedc5eff3eULL, 0xf8dd4bf1d8956cf4ULL, 0xeb86205d9e9e5bdaULL, 0x049b92b9d975c743ULL,
1180	0xa5379730b0f6c05aULL, 0x72a0ffacc6f3a553ULL, 0xb0032c34b20dcd6dULL, 0x470e9dbc88d5164aULL,
1181	0xb19cf10ca237c047ULL, 0xb65466711f6c81a2ULL, 0xb3321bd16dd80b43ULL, 0x48c14f600c5fbe8eULL,
1182	0x66451c264aa6c803ULL, 0xb66e3904a4fa7da6ULL, 0xd45f19b0b3128395ULL, 0x31602627c3c9bc10ULL,
1183	0x3120dc4832e4e10dULL, 0xeb20c46756c717f7ULL, 0x00f52e3f67280294ULL, 0x566d4fc14730c509ULL,
1184	0x7e3a5d40fd837206ULL, 0xc1e926dc7159547aULL, 0x216730fba68d6095ULL, 0x22e8c3843f69cea7ULL,
1185	0x33d074e8930e4b2bULL, 0xb6e4350e84d15816ULL, 0x5534c26ad6ba2365ULL, 0x7773c12f89f1f3f3ULL,
1186	0x8cba404da57962aaULL, 0x5b9897a81999ce56ULL, 0x508e862f121692fcULL, 0x3a81907fa093c291ULL,
1187	0x0dded0ff4725a510ULL, 0x10d8cc10673fc503ULL, 0x5b9d151c9f1f4e89ULL, 0x32a5c1d5cb09a44cULL,
1188	0x1e0aa442b90541fbULL, 0x5f85eb7cc1b485dbULL, 0xbee595ce8a9df2e5ULL, 0x25e496c722422236ULL,
1189	0x5edf3c46cd0fe5b9ULL, 0x34e75a7ed2a43388ULL, 0xe488de11d761e352ULL, 0x0e878a01a085545cULL,
1190	0xba493c77e021bb04ULL, 0x2b4d1843c7df899aULL, 0x9ea37a487ae80d67ULL, 0x67a9958011e41794ULL,
1191	0x4b58051a6697b065ULL, 0x47e33f7d8d6ba6d4ULL, 0xbb4da8d483ca46c1ULL, 0x68becaa181c2db0dULL,
1192	0x8d8980e90b989aa5ULL, 0xf95eb14a2c93c99bULL, 0x51c6c7c4796e73a2ULL, 0x6e228363b5efb569ULL,
1193	0xc6bbc0b02dd624c8ULL, 0x777eb47dec8170eeULL, 0x3cde15a004cfafa9ULL, 0x1dc6bc087160bf9bULL,
1194	0x2e07e043eec34002ULL, 0x18e9fc677a68dc7fULL, 0xd8da03188bd15b9aULL, 0x48fbc3bb00568253ULL,
1195	0x57547d4cfb654ce1ULL, 0xd3565b82a058e2adULL, 0xf63eaf0bbf154478ULL, 0x47531ef114dfbb18ULL,
1196	0xe1ec630a4278c587ULL, 0x5507d546ca8e83f3ULL, 0x85e135c63adc0c2bULL, 0x0aa7efa85682844eULL,
1197	0x72691ba8b3e1f615ULL, 0x32b4e9701fbe3ffaULL, 0x97b6d92e39bb7868ULL, 0x2cfe53dea02e39e8ULL,
1198	0x687392cd85cd52b0ULL, 0x27ff66c910e29831ULL, 0x97134556a9832d06ULL, 0x269bb0360a84f8a0ULL,
1199	0x706e55457643f85cULL, 0x3734a48c9b597d1bULL, 0x7aee91e8c6efa472ULL, 0x5cd6abc198a9d9e0ULL,
1200	0x0e04de06cb3ce41aULL, 0xd8c6eb893402e138ULL, 0x904659bb686e3772ULL, 0x7215c371746ba8c8ULL,
1201	0xfd12a97eeae4a2d9ULL, 0x9514b7516394f2c5ULL, 0x266fd5809208f294ULL, 0x5c847085619a26b9ULL,
1202	0x52985410fed694eaULL, 0x3c905b934a2ed254ULL, 0x10bb47692d3be467ULL, 0x063b3d2d69e5e9e1ULL,
1203	0x472726eedda57debULL, 0xefb6c4ae10f41891ULL, 0x2b1641917b307614ULL, 0x117c554fc4f45b7cULL,
1204	0xc07cf3118f9d8812ULL, 0x01dbd82050017939ULL, 0xd7e803f4171b2827ULL, 0x1015e87487d225eaULL,
1205	0xc58de3fed23acc4dULL, 0x50db91c294a7be2dULL, 0x0b94d43d1c9cf457ULL, 0x6b1640fa6e37524aULL,
1206	0x692f346c5fda0d09ULL, 0x200b1c59fa4d3151ULL, 0xb8c46f760777a296ULL, 0x4b38395f3ffdfbcfULL,
1207	0x18d25e00be54d671ULL, 0x60d50582bec8aba6ULL, 0x87ad8f263b78b982ULL, 0x50fdf64e9cda0432ULL,
1208	0x90f567aac578dcf0ULL, 0xef1e9b0ef2a3133bULL, 0x0eebba9242d9de71ULL, 0x15473c9bf03101c7ULL,
1209	0x7c77e8ae56b78095ULL, 0xb678e7666e6f078eULL, 0x2da0b9615348ba1fULL, 0x7cf931c1ff733f0bULL,
1210	0x26b357f50a0a366cULL, 0xe9708cf42b87d732ULL, 0xc13aeea5f91cb2c0ULL, 0x35d90c991143bb4cULL,
1211	0x47c1c404a9a0d9dcULL, 0x659e58451972d251ULL, 0x3875a8c473b38c31ULL, 0x1fbd9ed379561f24ULL,
1212	0x11fabc6fd41ec28dULL, 0x7ef8dfe3cd2a2dcaULL, 0x72e73b5d8c404595ULL, 0x6135fa4954b72f27ULL,
1213	0xccfc32a2de24b69cULL, 0x3f55698c1f095d88ULL, 0xbe3350ed5ac3f929ULL, 0x5e9bf806ca477eebULL,
1214	0xe9ce8fb63c309f68ULL, 0x5376f63565e1f9f4ULL, 0xd1afcfb35a6393f1ULL, 0x6632a1ede5623506ULL,
1215	0x0b7d6c390c2ded4cULL, 0x56cb3281df04cb1fULL, 0x66305a1249ecc3c7ULL, 0x5d588b60a38ca72aULL,
1216	0xa6ecbf78e8e5f42dULL, 0x86eeb44b3c8a3eecULL, 0xec219c48fbd21604ULL, 0x1aaf1af517c36731ULL,
1217	0xc306a2836769bde7ULL, 0x208280622b1e2adbULL, 0x8027f51ffbff94a6ULL, 0x76cfa1ce1124f26bULL,
1218	0x18eb00562422abb6ULL, 0xf377c4d58f8c29c3ULL, 0x4dbbc207f531561aULL, 0x0253b7f082128a27ULL,
1219	0x3d1f091cb62c17e0ULL, 0x4860e1abd64628a9ULL, 0x52d17436309d4253ULL, 0x356f97e13efae576ULL,
1220	0xd351e11aa150535bULL, 0x3e6b45bb1dd878ccULL, 0x0c776128bed92c98ULL, 0x1d34ae93032885b8ULL,
1221	0x4ba0488ca85ba4c3ULL, 0x985348c33c9ce6ceULL, 0x66124c6f97bda770ULL, 0x0f81a0290654124aULL,
1222	0x9ed09ca6569b86fdULL, 0x811009fd18af9a2dULL, 0xff08d03f93d8c20aULL, 0x52a148199faef26bULL,
1223	0x3e03f9dc2d8d1b73ULL, 0x4205801873961a70ULL, 0xc0d987f041a35970ULL, 0x07aa1f15a1c0d549ULL,
1224	0xdfd46ce08cd27224ULL, 0x6d0a024f934e4239ULL, 0x808a7a6399897b59ULL, 0x0a4556e9e13d95a2ULL,
1225	0xd21a991fe9c13045ULL, 0x9b0e8548fe7751b8ULL, 0x5da643cb4bf30035ULL, 0x77db28d63940f721ULL,
1226	0xfc5eeb614adc9011ULL, 0x5229419ae8c411ebULL, 0x9ec3e7787d1dcf74ULL, 0x340d053e216e4cb5ULL,
1227	0xcac7af39b48df2b4ULL, 0xc0faec2871a10a94ULL, 0x140a69245ca575edULL, 0x0cf1c37134273a4cULL,
1228	0xc8ee306ac224b8a5ULL, 0x57eaee7ccb4930b0ULL, 0xa1e806bdaacbe74fULL, 0x7d9a62742eeb657dULL,
1229	0x9eb6b6ef546c4830ULL, 0x885cca1fddb36e2eULL, 0xe6b9f383ef0d7105ULL, 0x58654fef9d2e0412ULL,
1230	0xa905c4ffbe0e8e26ULL, 0x942de5df9b31816eULL, 0x497d723f802e88e1ULL, 0x30684dea602f408dULL,
1231	0x21e5a278a3e6cb34ULL, 0xaefb6e6f5b151dc4ULL, 0xb30b8e049d77ca15ULL, 0x28c3c9cf53b98981ULL,
1232	0x287fb721556cdd2aULL, 0x0d317ca897022274ULL, 0x7468c7423a543258ULL, 0x4a7f11464eb5642fULL,
1233	0xa237a4774d193aa6ULL, 0xd865986ea92129a1ULL, 0x24c515ecf87c1a88ULL, 0x604003575f39f5ebULL,
1234	0x47b9f189570a9b27ULL, 0x2b98cede465e4b78ULL, 0x026df551dbb85c20ULL, 0x74fcd91047e21901ULL,
1235	0x13e2a90a23c1bfa3ULL, 0x0cb0074e478519f6ULL, 0x5ff1cbbe3af6cf44ULL, 0x67fe5438be812dbeULL,
1236	0xd13cf64fa40f05b0ULL, 0x054dfb2f32283787ULL, 0x4173915b7f0d2aeaULL, 0x482f144f1f610d4eULL,
1237	0xf6210201b47f8234ULL, 0x5d0ae1929e70b990ULL, 0xdcd7f455b049567cULL, 0x7e93d0f1f0916f01ULL,
1238	0xdd79cbf18a7db4faULL, 0xbe8391bf6f74c62fULL, 0x027145d14b8291bdULL, 0x585a73ea2cbf1705ULL,
1239	0x485ca03e928a0db2ULL, 0x10fc01a5742857e7ULL, 0x2f482edbd6d551a7ULL, 0x0f0433b5048fdb8aULL,
1240	0x60da2e8dd7dc6247ULL, 0x88b4c9d38cd4819aULL, 0x13033ac001f66697ULL, 0x273b24fe3b367d75ULL,
1241	0xc6e8f66a31b3b9d4ULL, 0x281514a494df49d5ULL, 0xd1726fdfc8b23da7ULL, 0x4b3ae7d103dee548ULL,
1242	0xc6256e19ce4b9d7eULL, 0xff5c5cf186e3c61cULL, 0xacc63ca34b8ec145ULL, 0x74621888fee66574ULL,
1243	0x956f409645290a1eULL, 0xef0bf8e3263a962eULL, 0xed6a50eb5ec2647bULL, 0x0694283a9dca7502ULL,
1244	0x769b963643a2dcd1ULL, 0x42b7c8ea09fc5353ULL, 0x4f002aee13397eabULL, 0x63005e2c19b7d63aULL,
1245	0xca6736da63023beaULL, 0x966c7f6db12a99b7ULL, 0xace09390c537c5e1ULL, 0x0b696063a1aa89eeULL,
1246	0xebb03e97288c56e5ULL, 0x432a9f9f938c8be8ULL, 0xa6a5a93d5b717f71ULL, 0x1a5fb4c3e18f9d97ULL,
1247	0x1c94e7ad1c60cdceULL, 0xee202a43fc02c4a0ULL, 0x8dafe4d867c46a20ULL, 0x0a10263c8ac27b58ULL,
1248	0xd0dea9dfe4432a4aULL, 0x856af87bbe9277c5ULL, 0xce8472acc212c71aULL, 0x6f151b6d9bbb1e91ULL,
1249	0x26776c527ceed56aULL, 0x7d211cb7fbf8faecULL, 0x37ae66a6fd4609ccULL, 0x1f81b702d2770c42ULL,
1250	0x2fb0b057eac58392ULL, 0xe1dd89fe29744e9dULL, 0xc964f8eb17beb4f8ULL, 0x29571073c9a2d41eULL,
1251	0xa948a18981c0e254ULL, 0x2df6369b65b22830ULL, 0xa33eb2d75fcfd3c6ULL, 0x078cd6ec4199a01fULL,
1252	0x4a584a41ad900d2fULL, 0x32142b78e2c74c52ULL, 0x68c4e8338431c978ULL, 0x7f69ea9008689fc2ULL,
1253	0x52f2c81e46a38265ULL, 0xfd78072d04a832fdULL, 0x8cd7d5fa25359e94ULL, 0x4de71b7454cc29d2ULL,
1254	0x42eb60ad1eda6ac9ULL, 0x0aad37dfdbc09c3aULL, 0x81004b71e33cc191ULL, 0x44e6be345122803cULL,
1255	0x03fe8388ba1920dbULL, 0xf5d57c32150db008ULL, 0x49c8c4281af60c29ULL, 0x21edb518de701aeeULL,
1256	0x7fb63e418f06dc99ULL, 0xa4460d99c166d7b8ULL, 0x24dd5248ce520a83ULL, 0x5ec3ad712b928358ULL,
1257	0x15022a5fbd17930fULL, 0xa4f64a77d82570e3ULL, 0x12bc8d6915783712ULL, 0x498194c0fc620abbULL,
1258	0x38a2d9d255686c82ULL, 0x785c6bd9193e21f0ULL, 0xe4d5c81ab24a5484ULL, 0x56307860b2e20989ULL,
1259	0x429d55f78b4d74c4ULL, 0x22f1834643350131ULL, 0x1e60c24598c71fffULL, 0x59f2f014979983efULL,
1260	0x46a47d56eb494a44ULL, 0x3e22a854d636a18eULL, 0xb346e15274491c3bULL, 0x2ceafd4e5390cde7ULL,
1261	0xba8a8538be0d6675ULL, 0x4b9074bb50818e23ULL, 0xcbdab89085d304c3ULL, 0x61a24fe0e56192c4ULL,
1262	0xcb7615e6db525bcbULL, 0xdd7d8c35a567e4caULL, 0xe6b4153acafcdd69ULL, 0x2d668e097f3c9766ULL,
1263	0xa57e7e265ce55ef0ULL, 0x5d9f4e527cd4b967ULL, 0xfbc83606492fd1e5ULL, 0x090d52beb7c3f7aeULL,
1264	0x09b9515a1e7b4d7cULL, 0x1f266a2599da44c0ULL, 0xa1c49548e2c55504ULL, 0x7ef04287126f15ccULL,
1265	0xfed1659dbd30ef15ULL, 0x8b4ab9eec4e0277bULL, 0x884d6236a5df3291ULL, 0x1fd96ea6bf5cf788ULL,
1266	0x42a161981f190d9aULL, 0x61d849507e6052c1ULL, 0x9fe113bf285a2cd5ULL, 0x7c22d676dbad85d8ULL,
1267	0x82e770ed2bfbd27dULL, 0x4c05b2ece996f5a5ULL, 0xcd40a9c2b0900150ULL, 0x5895319213d9bf64ULL,
1268	0xe7cc5d703fea2e08ULL, 0xb50c491258e2188cULL, 0xcce30baa48205bf0ULL, 0x537c659ccfa32d62ULL,
1269	0x37b6623a98cfc088ULL, 0xfe9bed1fa4d6aca4ULL, 0x04d29b8e56a8d1b0ULL, 0x725f71c40b519575ULL,
1270	0x28c7f89cd0339ce6ULL, 0x8367b14469ddc18bULL, 0x883ada83a6a1652cULL, 0x585f1974034d6c17ULL,
1271	0x89cfb266f1b19188ULL, 0xe63b4863e7c35217ULL, 0xd88c9da6b4c0526aULL, 0x3e035c9df0954635ULL,
1272	0xdd9d5412fb45de9dULL, 0xdd684532e4cff40dULL, 0x4b5c999b151d671cULL, 0x2d8c2cc811e7f690ULL,
1273	0x7f54be1d90055d40ULL, 0xa464c5df464aaf40ULL, 0x33979624f0e917beULL, 0x2c018dc527356b30ULL,
1274	0xa5415024e330b3d4ULL, 0x73ff3d96691652d3ULL, 0x94ec42c4ef9b59f1ULL, 0x0747201618d08e5aULL,
1275	0x4d6ca48aca411c53ULL, 0x66415f2fcfa66119ULL, 0x9c4dd40051e227ffULL, 0x59810bc09a02f7ebULL,
1276	0x2a7eb171b3dc101dULL, 0x441c5ab99ffef68eULL, 0x32025c9b93b359eaULL, 0x5e8ce0a71e9d112fULL,
1277	0xbfcccb92429503fdULL, 0xd271ba752f095d55ULL, 0x345ead5e972d091eULL, 0x18c8df11a83103baULL,
1278	0x90cd949a9aed0f4cULL, 0xc5d1f4cb6660e37eULL, 0xb8cac52d56c52e0bULL, 0x6e42e400c5808e0dULL,
1279	0xa3b46966eeaefd23ULL, 0x0c4f1f0be39ecdcaULL, 0x189dc8c9d683a51dULL, 0x51f27f054c09351bULL,
1280	0x4c487ccd2a320682ULL, 0x587ea95bb3df1c96ULL, 0xc8ccf79e555cb8e8ULL, 0x547dc829a206d73dULL,
1281	0xb822a6cd80c39b06ULL, 0xe96d54732000d4c6ULL, 0x28535b6f91463b4dULL, 0x228f4660e2486e1dULL,
1282	0x98799538de8d3abfULL, 0x8cd8330045ebca6eULL, 0x79952a008221e738ULL, 0x4322e1a7535cd2bbULL,
1283	0xb114c11819d1801cULL, 0x2016e4d84f3f5ec7ULL, 0xdd0e2df409260f4cULL, 0x5ec362c0ae5f7266ULL,
1284	0xc0462b18b8b2b4eeULL, 0x7cc8d950274d1afbULL, 0xf25f7105436b02d2ULL, 0x43bbf8dcbff9ccd3ULL,
1285	0xb6ad1767a039e9dfULL, 0xb0714da8f69d3583ULL, 0x5e55fa18b42931f5ULL, 0x4ed5558f33c60961ULL,
1286	0x1fe37901c647a5ddULL, 0x593ddf1f8081d357ULL, 0x0249a4fd813fd7a6ULL, 0x69acca274e9caf61ULL,
1287	0x047ba3ea330721c9ULL, 0x83423fc20e7e1ea0ULL, 0x1df4c0af01314a60ULL, 0x09a62dab89289527ULL,
1288	0xa5b325a49cc6cb00ULL, 0xe94b5dc654b56cb6ULL, 0x3be28779adc994a0ULL, 0x4296e8f8ba3a4aadULL,
1289	0x328689761e451eabULL, 0x2e4d598bff59594aULL, 0x49b96853d7a7084aULL, 0x4980a319601420a8ULL,
1290	0x9565b9e12f552c42ULL, 0x8a5318db7100fe96ULL, 0x05c90b4d43add0d7ULL, 0x538b4cd66a5d4edaULL,
1291	0xf4e94fc3e89f039fULL, 0x592c9af26f618045ULL, 0x08a36eb5fd4b9550ULL, 0x25fffaf6c2ed1419ULL,
1292	0x34434459cc79d354ULL, 0xeeecbfb4b1d5476bULL, 0xddeb34a061615d99ULL, 0x5129cecceb64b773ULL,
1293	0xee43215894993520ULL, 0x772f9c7cf14c0b3bULL, 0xd2e2fce306bedad5ULL, 0x715f42b546f06a97ULL,
1294	0x434ecdceda5b5f1aULL, 0x0da17115a49741a9ULL, 0x680bd77c73edad2eULL, 0x487c02354edd9041ULL,
1295	0xb8efeff3a70ed9c4ULL, 0x56a32aa3e857e302ULL, 0xdf3a68bd48a2a5a0ULL, 0x07f650b73176c444ULL,
1296	0xe38b9b1626e0ccb1ULL, 0x79e053c18b09fb36ULL, 0x56d90319c9f94964ULL, 0x1ca941e7ac9ff5c4ULL,
1297	0x49c4df29162fa0bbULL, 0x8488cf3282b33305ULL, 0x95dfda14cabb437dULL, 0x3391f78264d5ad86ULL,
1298	0x729ae06ae2b5095dULL, 0xd58a58d73259a946ULL, 0xe9834262d13921edULL, 0x27fedafaa54bb592ULL,
1299	0xa99dc5b829ad48bbULL, 0x5f025742499ee260ULL, 0x802c8ecd5d7513fdULL, 0x78ceb3ef3f6dd938ULL,
1300	0xc342f44f8a135d94ULL, 0x7b9edb44828cdda3ULL, 0x9436d11a0537cfe7ULL, 0x5064b164ec1ab4c8ULL,
1301	0x7020eccfd37eb2fcULL, 0x1f31ea3ed90d25fcULL, 0x1b930d7bdfa1bb34ULL, 0x5344467a48113044ULL,
1302	0x70073170f25e6dfbULL, 0xe385dc1a50114cc8ULL, 0x2348698ac8fc4f00ULL, 0x2a77a55284dd40d8ULL,
1303	0xfe06afe0c98c6ce4ULL, 0xc235df96dddfd6e4ULL, 0x1428d01e33bf1ed3ULL, 0x785768ec9300bdafULL,
1304	0x9702e57a91deb63bULL, 0x61bdb8bfe5ce8b80ULL, 0x645b426f3d1d58acULL, 0x4804a82227a557bcULL,
1305	0x8e57048ab44d2601ULL, 0x68d6501a4b3a6935ULL, 0xc39c9ec3f9e1c293ULL, 0x4172f257d4de63e2ULL,
1306	0xd368b450330c6401ULL, 0x040d3017418f2391ULL, 0x2c34bb6090b7d90dULL, 0x16f649228fdfd51fULL,
1307	0xbea6818e2b928ef5ULL, 0xe28ccf91cdc11e72ULL, 0x594aaa68e77a36cdULL, 0x313034806c7ffd0fULL,
1308	0x8a9d27ac2249bd65ULL, 0x19a3b464018e9512ULL, 0xc26ccff352b37ec7ULL, 0x056f68341d797b21ULL,
1309	0x5e79d6757efd2327ULL, 0xfabdbcb6553afe15ULL, 0xd3e7222c6eaf5a60ULL, 0x7046c76d4dae743bULL,
1310	0x660be872b18d4a55ULL, 0x19992518574e1496ULL, 0xc103053a302bdcbbULL, 0x3ed8e9800b218e8eULL,
1311	0x7b0b9239fa75e03eULL, 0xefe9fb684633c083ULL, 0x98a35fbe391a7793ULL, 0x6065510fe2d0fe34ULL,
1312	0x55cb668548abad0cULL, 0xb4584548da87e527ULL, 0x2c43ecea0107c1ddULL, 0x526028809372de35ULL,
1313	0x3415c56af9213b1fULL, 0x5bee1a4d017e98dbULL, 0x13f6b105b5cf709bULL, 0x5ff20e3482b29ab6ULL,
1314	0x0aa29c75cc2e6c90ULL, 0xfc7d73ca3a70e206ULL, 0x899fc38fc4b5c515ULL, 0x250386b124ffc207ULL,
1315	0x54ea28d5ae3d2b56ULL, 0x9913149dd6de60ceULL, 0x16694fc58f06d6c1ULL, 0x46b23975eb018fc7ULL,
1316	0x470a6a0fb4b7b4e2ULL, 0x5d92475a8f7253deULL, 0xabeee5b52fbd3adbULL, 0x7fa20801a0806968ULL,
1317	0x76f3faf19f7714d2ULL, 0xb3e840c12f4660c3ULL, 0x0fb4cd8df212744eULL, 0x4b065a251d3a2dd2ULL,
1318	0x5cebde383d77cd4aULL, 0x6adf39df882c9cb1ULL, 0xa2dd242eb09af759ULL, 0x3147c0e50e5f6422ULL,
1319	0x164ca5101d1350dbULL, 0xf8d13479c33fc962ULL, 0xe640ce4d13e5da08ULL, 0x4bdee0c45061f8baULL,
1320	0xd7c46dc1a4edb1c9ULL, 0x5514d7b6437fd98aULL, 0x58942f6bb2a1c00bULL, 0x2dffb2ab1d70710eULL,
1321	0xccdfcf2fc18b6d68ULL, 0xa8ebcba8b7806167ULL, 0x980697f95e2937e3ULL, 0x02fbba1cd0126e8cULL
1322};
1323
static void curve25519_ever64_base(u8 *out, const u8 *priv)
{
	/* Fixed-base scalar multiplication: derive the 32-byte public key
	 * for private key @priv into @out, walking the precomputed
	 * table_ladder rather than performing a general variable-base
	 * ladder.  Runs in constant time with respect to the key bits
	 * (selection is done via cswap2, never by data-dependent branches).
	 */
	u64 swap = 1;
	int i, j, k;
	/* Single scratch arena: two working point pairs (16 limbs),
	 * intermediate field elements (32 limbs), and the clamped key
	 * (4 limbs).  The named pointers below alias regions of it. */
	u64 tmp[16 + 32 + 4];
	u64 *x1 = &tmp[0];
	u64 *z1 = &tmp[4];
	u64 *x2 = &tmp[8];
	u64 *z2 = &tmp[12];
	u64 *xz1 = &tmp[0];		/* aliases (x1,z1) as one 8-limb unit */
	u64 *xz2 = &tmp[8];		/* aliases (x2,z2) as one 8-limb unit */
	u64 *a = &tmp[0 + 16];
	u64 *b = &tmp[4 + 16];
	u64 *c = &tmp[8 + 16];
	u64 *ab = &tmp[0 + 16];		/* aliases (a,b) */
	u64 *abcd = &tmp[0 + 16];	/* aliases (a,b,c,d) */
	u64 *ef = &tmp[16 + 16];
	u64 *efgh = &tmp[16 + 16];
	u64 *key = &tmp[0 + 16 + 32];

	/* RFC 7748-style clamping: clear the three low bits, clear bit
	 * 255, set bit 254. */
	memcpy(key, priv, 32);
	((u8 *)key)[0] &= 248;
	((u8 *)key)[31] = (((u8 *)key)[31] & 127) | 64;

	x1[0] = 1, x1[1] = x1[2] = x1[3] = 0;
	z1[0] = 1, z1[1] = z1[2] = z1[3] = 0;
	z2[0] = 1, z2[1] = z2[2] = z2[3] = 0;
	/* NOTE(review): p_minus_s is the precomputed starting x2
	 * coordinate, defined earlier in this file. */
	memcpy(x2, p_minus_s, sizeof(p_minus_s));

	/* Ladder over key bits 3..254 (252 steps).  The clamped-to-zero
	 * low bits 0..2 are skipped here and compensated for by the three
	 * point doublings after the loop.  The bounds array caps the last
	 * word at bit 62 because clamping forces bit 255 clear and bit
	 * 254 set. */
	j = 3;
	for (i = 0; i < 4; ++i) {
		while (j < (const int[]){ 64, 64, 64, 63 }[i]) {
			u64 bit = (key[i] >> j) & 1;
			k = (64 * i + j - 3);	/* table_ladder row for this step */
			swap = swap ^ bit;
			cswap2(swap, xz1, xz2);	/* constant-time conditional swap */
			swap = bit;
			fsub(b, x1, z1);
			fadd(a, x1, z1);
			fmul(c, &table_ladder[4 * k], b, ef);
			fsub(b, a, c);
			fadd(a, a, c);
			fsqr2(ab, ab, efgh);
			fmul2(xz1, xz2, ab, efgh);
			++j;
		}
		j = 0;
	}

	/* Multiply the result by 8 to account for the three cleared low
	 * key bits, then serialize the projective point into @out. */
	point_double(xz1, abcd, efgh);
	point_double(xz1, abcd, efgh);
	point_double(xz1, abcd, efgh);
	encode_point(out, xz1);

	/* Wipe all key-dependent scratch state from the stack. */
	memzero_explicit(tmp, sizeof(tmp));
}
1380
/* Enabled once at module init when the CPU supports both BMI2 and ADX;
 * selects the hand-written assembly implementation over the generic C
 * fallback.  __ro_after_init: the key is never toggled again. */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2_adx);
1382
1383void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
1384		     const u8 secret[CURVE25519_KEY_SIZE],
1385		     const u8 basepoint[CURVE25519_KEY_SIZE])
1386{
1387	if (static_branch_likely(&curve25519_use_bmi2_adx))
1388		curve25519_ever64(mypublic, secret, basepoint);
1389	else
1390		curve25519_generic(mypublic, secret, basepoint);
1391}
1392EXPORT_SYMBOL(curve25519_arch);
1393
1394void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
1395			  const u8 secret[CURVE25519_KEY_SIZE])
1396{
1397	if (static_branch_likely(&curve25519_use_bmi2_adx))
1398		curve25519_ever64_base(pub, secret);
1399	else
1400		curve25519_generic(pub, secret, curve25519_base_point);
1401}
1402EXPORT_SYMBOL(curve25519_base_arch);
1403
1404static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
1405				 unsigned int len)
1406{
1407	u8 *secret = kpp_tfm_ctx(tfm);
1408
1409	if (!len)
1410		curve25519_generate_secret(secret);
1411	else if (len == CURVE25519_KEY_SIZE &&
1412		 crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
1413		memcpy(secret, buf, CURVE25519_KEY_SIZE);
1414	else
1415		return -EINVAL;
1416	return 0;
1417}
1418
1419static int curve25519_generate_public_key(struct kpp_request *req)
1420{
1421	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
1422	const u8 *secret = kpp_tfm_ctx(tfm);
1423	u8 buf[CURVE25519_KEY_SIZE];
1424	int copied, nbytes;
1425
1426	if (req->src)
1427		return -EINVAL;
1428
1429	curve25519_base_arch(buf, secret);
1430
1431	/* might want less than we've got */
1432	nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
1433	copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
1434								nbytes),
1435				     buf, nbytes);
1436	if (copied != nbytes)
1437		return -EINVAL;
1438	return 0;
1439}
1440
1441static int curve25519_compute_shared_secret(struct kpp_request *req)
1442{
1443	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
1444	const u8 *secret = kpp_tfm_ctx(tfm);
1445	u8 public_key[CURVE25519_KEY_SIZE];
1446	u8 buf[CURVE25519_KEY_SIZE];
1447	int copied, nbytes;
1448
1449	if (!req->src)
1450		return -EINVAL;
1451
1452	copied = sg_copy_to_buffer(req->src,
1453				   sg_nents_for_len(req->src,
1454						    CURVE25519_KEY_SIZE),
1455				   public_key, CURVE25519_KEY_SIZE);
1456	if (copied != CURVE25519_KEY_SIZE)
1457		return -EINVAL;
1458
1459	curve25519_arch(buf, secret, public_key);
1460
1461	/* might want less than we've got */
1462	nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
1463	copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
1464								nbytes),
1465				     buf, nbytes);
1466	if (copied != nbytes)
1467		return -EINVAL;
1468	return 0;
1469}
1470
/* KPP max_size callback: public keys and shared secrets are always
 * exactly CURVE25519_KEY_SIZE (32) bytes, independent of the tfm. */
static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
{
	return CURVE25519_KEY_SIZE;
}
1475
/* KPP algorithm descriptor, registered from curve25519_mod_init() only
 * when the BMI2+ADX fast path is available on the boot CPU. */
static struct kpp_alg curve25519_alg = {
	.base.cra_name		= "curve25519",
	.base.cra_driver_name	= "curve25519-x86",
	.base.cra_priority	= 200,
	.base.cra_module	= THIS_MODULE,
	/* The tfm context holds only the 32-byte private key. */
	.base.cra_ctxsize	= CURVE25519_KEY_SIZE,

	.set_secret		= curve25519_set_secret,
	.generate_public_key	= curve25519_generate_public_key,
	.compute_shared_secret	= curve25519_compute_shared_secret,
	.max_size		= curve25519_max_size,
};
1488
1489
1490static int __init curve25519_mod_init(void)
1491{
1492	if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
1493		static_branch_enable(&curve25519_use_bmi2_adx);
1494	else
1495		return 0;
1496	return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
1497		crypto_register_kpp(&curve25519_alg) : 0;
1498}
1499
1500static void __exit curve25519_mod_exit(void)
1501{
1502	if (IS_REACHABLE(CONFIG_CRYPTO_KPP) &&
1503	    static_branch_likely(&curve25519_use_bmi2_adx))
1504		crypto_unregister_kpp(&curve25519_alg);
1505}
1506
module_init(curve25519_mod_init);
module_exit(curve25519_mod_exit);

/* Allow module auto-loading by generic algorithm name or driver name. */
MODULE_ALIAS_CRYPTO("curve25519");
MODULE_ALIAS_CRYPTO("curve25519-x86");

MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");