# SHA-256 x86_64 AVX implementation (see header below)
1########################################################################
2# Implement fast SHA-256 with AVX1 instructions. (x86_64)
3#
4# Copyright (C) 2013 Intel Corporation.
5#
6# Authors:
7# James Guilford <james.guilford@intel.com>
8# Kirk Yap <kirk.s.yap@intel.com>
9# Tim Chen <tim.c.chen@linux.intel.com>
10#
11# This software is available to you under a choice of one of two
12# licenses. You may choose to be licensed under the terms of the GNU
13# General Public License (GPL) Version 2, available from the file
14# COPYING in the main directory of this source tree, or the
15# OpenIB.org BSD license below:
16#
17# Redistribution and use in source and binary forms, with or
18# without modification, are permitted provided that the following
19# conditions are met:
20#
21# - Redistributions of source code must retain the above
22# copyright notice, this list of conditions and the following
23# disclaimer.
24#
25# - Redistributions in binary form must reproduce the above
26# copyright notice, this list of conditions and the following
27# disclaimer in the documentation and/or other materials
28# provided with the distribution.
29#
30# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
31# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
32# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
33# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
34# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
35# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
36# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
37# SOFTWARE.
38########################################################################
39#
40# This code is described in an Intel White-Paper:
41# "Fast SHA-256 Implementations on Intel Architecture Processors"
42#
43# To find it, surf to http://www.intel.com/p/en_US/embedded
44# and search for that title.
45#
46########################################################################
47# This code schedules 1 block at a time, with 4 lanes per block
48########################################################################
49
50#include <linux/linkage.h>
51#include <linux/cfi_types.h>
52
53## assume buffers not aligned
54#define VMOVDQ vmovdqu
55
56################################ Define Macros
57
58# addm [mem], reg
59# Add reg to mem using reg-mem add and store
# addm [mem], reg : reg += [mem]; [mem] = reg
# Used to fold the block's working variables back into the digest state.
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm
64
65
# MY_ROR imm, reg : rotate reg right by imm bits.
# Implemented as shld reg,reg,(32-imm) — a double-shift of a register with
# itself is a left rotate, so rotating left by (32-imm) == rotating right by imm.
.macro MY_ROR p1 p2
	shld $(32-(\p1)), \p2, \p2
.endm
69
70################################
71
72# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
73# Load xmm with mem and byte swap each dword
# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm from (possibly unaligned) mem and byte-swap each dword,
# converting the big-endian message words to host order.
.macro COPY_XMM_AND_BSWAP p1 p2 p3
	VMOVDQ \p2, \p1
	vpshufb \p3, \p1, \p1
.endm
78
79################################
80
# Message-schedule window: X0..X3 hold W[t-16..t-1], 4 dwords each.
X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

# Vector scratch registers.
XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER = %xmm9
XTMP5 = %xmm11

SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm13

# SysV AMD64 argument registers.
NUM_BLKS = %rdx # 3rd arg
INP = %rsi # 2nd arg
CTX = %rdi # 1st arg

SRND = %rsi # clobbers INP (INP is saved to the stack first)
c = %ecx
d = %r8d
e = %edx # clobbers NUM_BLKS (already converted/saved by then)
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

# Scalar scratch (callee-saved, pushed in the prologue).
y0 = %r13d
y1 = %r14d
y2 = %r15d


# Stack frame layout (relative to the 16-byte-aligned %rsp).
_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0
_INP = _INP_END + _INP_END_SIZE
_XFER = _INP + _INP_SIZE
_XMM_SAVE = _XFER + _XFER_SIZE
STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
129
130# rotate_Xs
131# Rotate values of symbols X0...X3
# rotate_Xs
# Rotate the symbolic assignments X0..X3 (assembler-time only; emits no code)
# so the schedule window slides by one 4-dword group per 4 rounds.
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm
139
140# ROTATE_ARGS
141# Rotate values of symbols a...h
# ROTATE_ARGS
# Rotate the symbolic assignments a..h (assembler-time only; emits no code),
# implementing the per-round rotation of the eight working variables.
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
153
# FOUR_ROUNDS_AND_SCHED
# Perform 4 SHA-256 rounds on working vars a..h while simultaneously
# scheduling the next 4 message words:
#   X0 <- s1(W[-2]) + W[-7] + s0(W[-15]) + W[-16]
# Round inputs (K[t]+W[t]) were pre-added into _XFER(%rsp) by the caller.
# Scalar and vector instructions are deliberately interleaved for ILP.
.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time

	mov	e, y0			# y0 = e
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	xor	g, y2			# y2 = f^g
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	## compute s0
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	_XFER(%rsp), y2		# y2 = k + w + S1 + CH
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpsrld	$7, XTMP1, XTMP2
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	vpslld	$(32-7), XTMP1, XTMP3
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	vpsrld	$18, XTMP1, XTMP2	# XTMP2 = W[-15] >> 18
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	g, y2			# y2 = f^g
	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpslld	$(32-18), XTMP1, XTMP1
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	vpxor	XTMP1, XTMP3, XTMP3
	add	y0, y2			# y2 = S1 + CH
	add	(1*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	## compute low s1
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov	e, y0			# y0 = e
	mov	a, y1			# y1 = a
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}
	xor	g, y2			# y2 = f^g
	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpxor	XTMP3, XTMP2, XTMP2
	add	y0, y2			# y2 = S1 + CH
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	(2*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	## compute high s1
	vpshufd	$0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	mov	e, y0			# y0 = e
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	mov	f, y2			# y2 = f
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	xor	g, y2			# y2 = f^g
	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	e, y2			# y2 = (f^g)&e
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	vpxor	XTMP3, XTMP2, XTMP2
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y0, y2			# y2 = S1 + CH
	add	(3*4 + _XFER)(%rsp), y2	# y2 = k + w + S1 + CH
	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
	rotate_Xs
.endm
308
309## input is [rsp + _XFER + %1 * 4]
# DO_ROUND round
# One SHA-256 round without message scheduling (used for the final 16
# rounds, where W comes straight from the X registers via _XFER).
# Round input (K[t]+W[t]) is at [rsp + _XFER + round*4].
.macro DO_ROUND round
	mov	e, y0			# y0 = e
	MY_ROR	(25-11), y0		# y0 = e >> (25-11)
	mov	a, y1			# y1 = a
	xor	e, y0			# y0 = e ^ (e >> (25-11))
	MY_ROR	(22-13), y1		# y1 = a >> (22-13)
	mov	f, y2			# y2 = f
	xor	a, y1			# y1 = a ^ (a >> (22-13))
	MY_ROR	(11-6), y0		# y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	g, y2			# y2 = f^g
	xor	e, y0			# y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	MY_ROR	(13-2), y1		# y1 = (a >> (13-2)) ^ (a >> (22-2))
	and	e, y2			# y2 = (f^g)&e
	xor	a, y1			# y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	6, y0			# y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	add	y0, y2			# y2 = S1 + CH
	MY_ROR	2, y1			# y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	offset = \round * 4 + _XFER	# offset of this round's K+W word
	add	offset(%rsp), y2	# y2 = k + w + S1 + CH
	mov	a, y0			# y0 = a
	add	y2, h			# h = h + S1 + CH + k + w
	mov	a, y2			# y2 = a
	or	c, y0			# y0 = a|c
	add	h, d			# d = d + h + S1 + CH + k + w
	and	c, y2			# y2 = a&c
	and	b, y0			# y0 = (a|c)&b
	add	y1, h			# h = h + S1 + CH + k + w + S0
	or	y2, y0			# y0 = MAJ = ((a|c)&b)|(a&c)
	add	y0, h			# h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm
342
343########################################################################
## void sha256_transform_avx(struct sha256_state *state, const u8 *data, int blocks)
345## arg 1 : pointer to state
346## arg 2 : pointer to input data
347## arg 3 : Num blocks
348########################################################################
349.text
# ABI: SysV AMD64. In: rdi = state, rsi = data, rdx = blocks.
# Clobbers: rax, rcx, rdx, rsi, r8-r11, xmm0-xmm13, flags.
# Callee-saved rbx, rbp, r12-r15 are preserved via push/pop below.
SYM_TYPED_FUNC_START(sha256_transform_avx)
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rbp
	movq	%rsp, %rbp		# rbp restores rsp after the and below

	subq	$STACK_SIZE, %rsp	# allocate stack space
	and	$~15, %rsp		# align stack pointer

	shl	$6, NUM_BLKS		# convert to bytes (64 bytes per block)
	jz	.Ldone_hash
	add	INP, NUM_BLKS		# pointer to end of data
	mov	NUM_BLKS, _INP_END(%rsp)

	## load initial digest
	mov	4*0(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00
.Lloop0:
	lea	K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP	X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X3, 3*16(INP), BYTE_FLIP_MASK

	mov	INP, _INP(%rsp)		# save INP; SRND aliases the same reg

	## schedule 48 input dwords, by doing 3 rounds of 16 each
	mov	$3, SRND
.align 16
.Lloop1:
	vpaddd	(TBL), X0, XFER		# XFER = K[t..t+3] + W[t..t+3]
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	1*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	2*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	3*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	add	$4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub	$1, SRND
	jne	.Lloop1

	## final 16 rounds: no more scheduling needed
	mov	$2, SRND
.Lloop2:
	vpaddd	(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	vpaddd	1*16(TBL), X1, XFER
	vmovdqa	XFER, _XFER(%rsp)
	add	$2*16, TBL
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	sub	$1, SRND
	jne	.Lloop2

	## fold working vars back into the digest
	addm	(4*0)(CTX),a
	addm	(4*1)(CTX),b
	addm	(4*2)(CTX),c
	addm	(4*3)(CTX),d
	addm	(4*4)(CTX),e
	addm	(4*5)(CTX),f
	addm	(4*6)(CTX),g
	addm	(4*7)(CTX),h

	mov	_INP(%rsp), INP
	add	$64, INP		# advance to next 64-byte block
	cmp	_INP_END(%rsp), INP
	jne	.Lloop0

.Ldone_hash:

	mov	%rbp, %rsp		# undo alignment + frame in one step
	popq	%rbp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	RET
SYM_FUNC_END(sha256_transform_avx)
463
# SHA-256 round constants K[0..63] (FIPS 180-4), 64-byte aligned for
# aligned 16-byte vpaddd loads via TBL.
.section .rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

# vpshufb mask: reverse the bytes within each 32-bit dword (big-endian load).
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203

.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA (0xFF selects zero bytes in vpshufb)
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00 (0xFF selects zero bytes in vpshufb)
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
1########################################################################
2# Implement fast SHA-256 with AVX1 instructions. (x86_64)
3#
4# Copyright (C) 2013 Intel Corporation.
5#
6# Authors:
7# James Guilford <james.guilford@intel.com>
8# Kirk Yap <kirk.s.yap@intel.com>
9# Tim Chen <tim.c.chen@linux.intel.com>
10#
11# This software is available to you under a choice of one of two
12# licenses. You may choose to be licensed under the terms of the GNU
13# General Public License (GPL) Version 2, available from the file
14# COPYING in the main directory of this source tree, or the
15# OpenIB.org BSD license below:
16#
17# Redistribution and use in source and binary forms, with or
18# without modification, are permitted provided that the following
19# conditions are met:
20#
21# - Redistributions of source code must retain the above
22# copyright notice, this list of conditions and the following
23# disclaimer.
24#
25# - Redistributions in binary form must reproduce the above
26# copyright notice, this list of conditions and the following
27# disclaimer in the documentation and/or other materials
28# provided with the distribution.
29#
30# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
31# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
32# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
33# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
34# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
35# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
36# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
37# SOFTWARE.
38########################################################################
39#
40# This code is described in an Intel White-Paper:
41# "Fast SHA-256 Implementations on Intel Architecture Processors"
42#
43# To find it, surf to http://www.intel.com/p/en_US/embedded
44# and search for that title.
45#
46########################################################################
47# This code schedules 1 block at a time, with 4 lanes per block
48########################################################################
49
50#include <linux/linkage.h>
51
52## assume buffers not aligned
53#define VMOVDQ vmovdqu
54
55################################ Define Macros
56
57# addm [mem], reg
58# Add reg to mem using reg-mem add and store
59.macro addm p1 p2
60 add \p1, \p2
61 mov \p2, \p1
62.endm
63
64
65.macro MY_ROR p1 p2
66 shld $(32-(\p1)), \p2, \p2
67.endm
68
69################################
70
71# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
72# Load xmm with mem and byte swap each dword
73.macro COPY_XMM_AND_BSWAP p1 p2 p3
74 VMOVDQ \p2, \p1
75 vpshufb \p3, \p1, \p1
76.endm
77
78################################
79
80X0 = %xmm4
81X1 = %xmm5
82X2 = %xmm6
83X3 = %xmm7
84
85XTMP0 = %xmm0
86XTMP1 = %xmm1
87XTMP2 = %xmm2
88XTMP3 = %xmm3
89XTMP4 = %xmm8
90XFER = %xmm9
91XTMP5 = %xmm11
92
93SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
94SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00
95BYTE_FLIP_MASK = %xmm13
96
97NUM_BLKS = %rdx # 3rd arg
98INP = %rsi # 2nd arg
99CTX = %rdi # 1st arg
100
101SRND = %rsi # clobbers INP
102c = %ecx
103d = %r8d
104e = %edx
105TBL = %r12
106a = %eax
107b = %ebx
108
109f = %r9d
110g = %r10d
111h = %r11d
112
113y0 = %r13d
114y1 = %r14d
115y2 = %r15d
116
117
118_INP_END_SIZE = 8
119_INP_SIZE = 8
120_XFER_SIZE = 16
121_XMM_SAVE_SIZE = 0
122
123_INP_END = 0
124_INP = _INP_END + _INP_END_SIZE
125_XFER = _INP + _INP_SIZE
126_XMM_SAVE = _XFER + _XFER_SIZE
127STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
128
129# rotate_Xs
130# Rotate values of symbols X0...X3
131.macro rotate_Xs
132X_ = X0
133X0 = X1
134X1 = X2
135X2 = X3
136X3 = X_
137.endm
138
139# ROTATE_ARGS
140# Rotate values of symbols a...h
141.macro ROTATE_ARGS
142TMP_ = h
143h = g
144g = f
145f = e
146e = d
147d = c
148c = b
149b = a
150a = TMP_
151.endm
152
153.macro FOUR_ROUNDS_AND_SCHED
154 ## compute s0 four at a time and s1 two at a time
155 ## compute W[-16] + W[-7] 4 at a time
156
157 mov e, y0 # y0 = e
158 MY_ROR (25-11), y0 # y0 = e >> (25-11)
159 mov a, y1 # y1 = a
160 vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
161 MY_ROR (22-13), y1 # y1 = a >> (22-13)
162 xor e, y0 # y0 = e ^ (e >> (25-11))
163 mov f, y2 # y2 = f
164 MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
165 xor a, y1 # y1 = a ^ (a >> (22-13)
166 xor g, y2 # y2 = f^g
167 vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
168 xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
169 and e, y2 # y2 = (f^g)&e
170 MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
171 ## compute s0
172 vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
173 xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
174 MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
175 xor g, y2 # y2 = CH = ((f^g)&e)^g
176 MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
177 add y0, y2 # y2 = S1 + CH
178 add _XFER(%rsp), y2 # y2 = k + w + S1 + CH
179 mov a, y0 # y0 = a
180 add y2, h # h = h + S1 + CH + k + w
181 mov a, y2 # y2 = a
182 vpsrld $7, XTMP1, XTMP2
183 or c, y0 # y0 = a|c
184 add h, d # d = d + h + S1 + CH + k + w
185 and c, y2 # y2 = a&c
186 vpslld $(32-7), XTMP1, XTMP3
187 and b, y0 # y0 = (a|c)&b
188 add y1, h # h = h + S1 + CH + k + w + S0
189 vpor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7
190 or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
191 add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
192 ROTATE_ARGS
193 mov e, y0 # y0 = e
194 mov a, y1 # y1 = a
195 MY_ROR (25-11), y0 # y0 = e >> (25-11)
196 xor e, y0 # y0 = e ^ (e >> (25-11))
197 mov f, y2 # y2 = f
198 MY_ROR (22-13), y1 # y1 = a >> (22-13)
199 vpsrld $18, XTMP1, XTMP2 #
200 xor a, y1 # y1 = a ^ (a >> (22-13)
201 MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
202 xor g, y2 # y2 = f^g
203 vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
204 MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
205 xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
206 and e, y2 # y2 = (f^g)&e
207 MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
208 vpslld $(32-18), XTMP1, XTMP1
209 xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
210 xor g, y2 # y2 = CH = ((f^g)&e)^g
211 vpxor XTMP1, XTMP3, XTMP3 #
212 add y0, y2 # y2 = S1 + CH
213 add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
214 MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
215 vpxor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR
216 mov a, y0 # y0 = a
217 add y2, h # h = h + S1 + CH + k + w
218 mov a, y2 # y2 = a
219 vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
220 or c, y0 # y0 = a|c
221 add h, d # d = d + h + S1 + CH + k + w
222 and c, y2 # y2 = a&c
223 ## compute low s1
224 vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
225 and b, y0 # y0 = (a|c)&b
226 add y1, h # h = h + S1 + CH + k + w + S0
227 vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
228 or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
229 add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
230 ROTATE_ARGS
231 mov e, y0 # y0 = e
232 mov a, y1 # y1 = a
233 MY_ROR (25-11), y0 # y0 = e >> (25-11)
234 xor e, y0 # y0 = e ^ (e >> (25-11))
235 MY_ROR (22-13), y1 # y1 = a >> (22-13)
236 mov f, y2 # y2 = f
237 xor a, y1 # y1 = a ^ (a >> (22-13)
238 MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
239 vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
240 xor g, y2 # y2 = f^g
241 vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xBxA}
242 xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
243 and e, y2 # y2 = (f^g)&e
244 vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xBxA}
245 MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
246 xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
247 xor g, y2 # y2 = CH = ((f^g)&e)^g
248 MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
249 vpxor XTMP3, XTMP2, XTMP2 #
250 add y0, y2 # y2 = S1 + CH
251 MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
252 add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
253 vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
254 mov a, y0 # y0 = a
255 add y2, h # h = h + S1 + CH + k + w
256 mov a, y2 # y2 = a
257 vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
258 or c, y0 # y0 = a|c
259 add h, d # d = d + h + S1 + CH + k + w
260 and c, y2 # y2 = a&c
261 vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
262 and b, y0 # y0 = (a|c)&b
263 add y1, h # h = h + S1 + CH + k + w + S0
264 ## compute high s1
265 vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
266 or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
267 add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
268 ROTATE_ARGS
269 mov e, y0 # y0 = e
270 MY_ROR (25-11), y0 # y0 = e >> (25-11)
271 mov a, y1 # y1 = a
272 MY_ROR (22-13), y1 # y1 = a >> (22-13)
273 xor e, y0 # y0 = e ^ (e >> (25-11))
274 mov f, y2 # y2 = f
275 MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
276 vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
277 xor a, y1 # y1 = a ^ (a >> (22-13)
278 xor g, y2 # y2 = f^g
279 vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xDxC}
280 xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
281 and e, y2 # y2 = (f^g)&e
282 MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
283 vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xDxC}
284 xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
285 MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
286 xor g, y2 # y2 = CH = ((f^g)&e)^g
287 vpxor XTMP3, XTMP2, XTMP2
288 MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
289 add y0, y2 # y2 = S1 + CH
290 add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
291 vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
292 mov a, y0 # y0 = a
293 add y2, h # h = h + S1 + CH + k + w
294 mov a, y2 # y2 = a
295 vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
296 or c, y0 # y0 = a|c
297 add h, d # d = d + h + S1 + CH + k + w
298 and c, y2 # y2 = a&c
299 vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
300 and b, y0 # y0 = (a|c)&b
301 add y1, h # h = h + S1 + CH + k + w + S0
302 or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
303 add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
304 ROTATE_ARGS
305 rotate_Xs
306.endm
307
308## input is [rsp + _XFER + %1 * 4]
309.macro DO_ROUND round
310 mov e, y0 # y0 = e
311 MY_ROR (25-11), y0 # y0 = e >> (25-11)
312 mov a, y1 # y1 = a
313 xor e, y0 # y0 = e ^ (e >> (25-11))
314 MY_ROR (22-13), y1 # y1 = a >> (22-13)
315 mov f, y2 # y2 = f
316 xor a, y1 # y1 = a ^ (a >> (22-13)
317 MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
318 xor g, y2 # y2 = f^g
319 xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
320 MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
321 and e, y2 # y2 = (f^g)&e
322 xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
323 MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
324 xor g, y2 # y2 = CH = ((f^g)&e)^g
325 add y0, y2 # y2 = S1 + CH
326 MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
327 offset = \round * 4 + _XFER #
328 add offset(%rsp), y2 # y2 = k + w + S1 + CH
329 mov a, y0 # y0 = a
330 add y2, h # h = h + S1 + CH + k + w
331 mov a, y2 # y2 = a
332 or c, y0 # y0 = a|c
333 add h, d # d = d + h + S1 + CH + k + w
334 and c, y2 # y2 = a&c
335 and b, y0 # y0 = (a|c)&b
336 add y1, h # h = h + S1 + CH + k + w + S0
337 or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
338 add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
339 ROTATE_ARGS
340.endm
341
342########################################################################
343## void sha256_transform_avx(state sha256_state *state, const u8 *data, int blocks)
344## arg 1 : pointer to state
345## arg 2 : pointer to input data
346## arg 3 : Num blocks
347########################################################################
348.text
# ABI: SysV AMD64. In: rdi = state, rsi = data, rdx = blocks.
# Clobbers: rax, rcx, rdx, rsi, r8-r11, xmm0-xmm13, flags.
# Fixes vs. previous revision:
#  - dropped the stray ".align 32" that followed the entry label (it only
#    inserted nop padding between the symbol and the first instruction)
#  - labels made assembler-local (.L prefix) so they stay out of the
#    symbol table and do not confuse objtool/tracing
#  - bare "ret" replaced by the kernel RET macro (from <linux/linkage.h>,
#    already included) so SLS/return-thunk mitigations apply
SYM_FUNC_START(sha256_transform_avx)
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	pushq	%rbp
	movq	%rsp, %rbp		# rbp restores rsp after the and below

	subq	$STACK_SIZE, %rsp	# allocate stack space
	and	$~15, %rsp		# align stack pointer

	shl	$6, NUM_BLKS		# convert to bytes (64 bytes per block)
	jz	.Ldone_hash
	add	INP, NUM_BLKS		# pointer to end of data
	mov	NUM_BLKS, _INP_END(%rsp)

	## load initial digest
	mov	4*0(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00
.Lloop0:
	lea	K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP	X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X3, 3*16(INP), BYTE_FLIP_MASK

	mov	INP, _INP(%rsp)		# save INP; SRND aliases the same reg

	## schedule 48 input dwords, by doing 3 rounds of 16 each
	mov	$3, SRND
.align 16
.Lloop1:
	vpaddd	(TBL), X0, XFER		# XFER = K[t..t+3] + W[t..t+3]
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	1*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	2*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddd	3*16(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	add	$4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub	$1, SRND
	jne	.Lloop1

	## final 16 rounds: no more scheduling needed
	mov	$2, SRND
.Lloop2:
	vpaddd	(TBL), X0, XFER
	vmovdqa	XFER, _XFER(%rsp)
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	vpaddd	1*16(TBL), X1, XFER
	vmovdqa	XFER, _XFER(%rsp)
	add	$2*16, TBL
	DO_ROUND	0
	DO_ROUND	1
	DO_ROUND	2
	DO_ROUND	3

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	sub	$1, SRND
	jne	.Lloop2

	## fold working vars back into the digest
	addm	(4*0)(CTX),a
	addm	(4*1)(CTX),b
	addm	(4*2)(CTX),c
	addm	(4*3)(CTX),d
	addm	(4*4)(CTX),e
	addm	(4*5)(CTX),f
	addm	(4*6)(CTX),g
	addm	(4*7)(CTX),h

	mov	_INP(%rsp), INP
	add	$64, INP		# advance to next 64-byte block
	cmp	_INP_END(%rsp), INP
	jne	.Lloop0

.Ldone_hash:

	mov	%rbp, %rsp		# undo alignment + frame in one step
	popq	%rbp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	RET
SYM_FUNC_END(sha256_transform_avx)
463
464.section .rodata.cst256.K256, "aM", @progbits, 256
465.align 64
466K256:
467 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
468 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
469 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
470 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
471 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
472 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
473 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
474 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
475 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
476 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
477 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
478 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
479 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
480 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
481 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
482 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
483
484.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
485.align 16
486PSHUFFLE_BYTE_FLIP_MASK:
487 .octa 0x0c0d0e0f08090a0b0405060700010203
488
489.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
490.align 16
491# shuffle xBxA -> 00BA
492_SHUF_00BA:
493 .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
494
495.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
496.align 16
497# shuffle xDxC -> DC00
498_SHUF_DC00:
499 .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF