########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses. You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or
# without modification, are permitted provided that the following
# conditions are met:
#
#  - Redistributions of source code must retain the above
#    copyright notice, this list of conditions and the following
#    disclaimer.
#
#  - Redistributions in binary form must reproduce the above
#    copyright notice, this list of conditions and the following
#    disclaimer in the documentation and/or other materials
#    provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################
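#
# Roughly, for each pair of 64-byte blocks the data flow is as follows
# (an illustrative C sketch only, not code from this file or from the
# white paper; W1, W2, K256, sha256_rounds and wk are placeholder names):
#
#	uint32_t wk[2][64];
#	for (i = 0; i < 64; i++) {             /* vectorized, 8 dwords per ymm op */
#		wk[0][i] = W1[i] + K256[i];    /* block 1: low 128-bit lanes  */
#		wk[1][i] = W2[i] + K256[i];    /* block 2: high 128-bit lanes */
#	}
#	sha256_rounds(digest, wk[0]);          /* 64 scalar rounds, block 1 */
#	sha256_rounds(digest, wk[1]);          /* 64 scalar rounds, block 2 */
#
# In the code below the scalar rounds of the first block are interleaved
# with the vector message-schedule work, and the second block's rounds
# then reuse the W+K values saved on the stack (_XFER area).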

#include <linux/linkage.h>
#include <linux/cfi_types.h>

## assume buffers not aligned
#define VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA = %ymm10	# shuffle xBxA -> 00BA
SHUF_DC00 = %ymm12	# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13	# XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx	# 3rd arg
INP = %rsi	# 2nd arg
CTX = %rdi	# 1st arg
c = %ecx
d = %r8d
e = %edx	# clobbers NUM_BLKS
y3 = %esi	# clobbers INP

SRND = CTX	# SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END + _INP_END_SIZE
_CTX		= _INP + _INP_SIZE
STACK_SIZE	= _CTX + _CTX_SIZE
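
# Resulting stack frame (offsets from the 32-byte-aligned %rsp), as
# implied by the sizes above:
#
#	_XFER    [  0..511]  W[i]+K[i] for both blocks, one 32-byte slot per
#	                     4 rounds (low 16 bytes: block 1, high: block 2)
#	_INP_END [512..519]  pointer to the last input block
#	_INP     [520..527]  current input pointer
#	_CTX     [528..535]  saved digest/context pointer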

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm

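# Reference for the S0/S1/CH/MAJ and s0/s1 comments used in the round
# macros below: a plain C sketch of the FIPS 180-4 SHA-256 round and
# message schedule, added only as a reading aid (not part of the build;
# ROR32 and the variable names are illustrative):
#
#	#define ROR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
#
#	uint32_t S1  = ROR32(e, 6) ^ ROR32(e, 11) ^ ROR32(e, 25);
#	uint32_t S0  = ROR32(a, 2) ^ ROR32(a, 13) ^ ROR32(a, 22);
#	uint32_t ch  = ((f ^ g) & e) ^ g;              /* == (e&f)^(~e&g)      */
#	uint32_t maj = ((a | c) & b) | (a & c);        /* == (a&b)^(a&c)^(b&c) */
#	uint32_t t1  = h + S1 + ch + K[i] + W[i];
#	d += t1;                                       /* becomes the next e */
#	h  = t1 + S0 + maj;                            /* becomes the next a */
#
#	/* message schedule, for 16 <= i < 64: */
#	uint32_t s0 = ROR32(W[i-15], 7) ^ ROR32(W[i-15], 18) ^ (W[i-15] >> 3);
#	uint32_t s1 = ROR32(W[i-2], 17) ^ ROR32(W[i-2], 19) ^ (W[i-2] >> 10);
#	W[i] = W[i-16] + s0 + W[i-7] + s1;
#
# ROTATE_ARGS/rotate_Xs above only rename assembler symbols, so the
# "shift a..h down by one" step of the standard algorithm costs no moves.
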
.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov a, y3	# y3 = a # MAJA
	rorx $25, e, y0	# y0 = e >> 25 # S1A
	rorx $11, e, y1	# y1 = e >> 11 # S1B

	addl \disp(%rsp, SRND), h	# h = k + w + h # --
	or c, y3	# y3 = a|c # MAJA
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	mov f, y2	# y2 = f # CH
	rorx $13, a, T1	# T1 = a >> 13 # S0B

	xor y1, y0	# y0 = (e>>25) ^ (e>>11) # S1
	xor g, y2	# y2 = f^g # CH
	vpaddd X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16] # --
	rorx $6, e, y1	# y1 = (e >> 6) # S1

	and e, y2	# y2 = (f^g)&e # CH
	xor y1, y0	# y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx $22, a, y1	# y1 = a >> 22 # S0A
	add h, d	# d = k + w + h + d # --

	and b, y3	# y3 = (a|c)&b # MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor T1, y1	# y1 = (a>>22) ^ (a>>13) # S0
	rorx $2, a, T1	# T1 = (a >> 2) # S0

	xor g, y2	# y2 = CH = ((f^g)&e)^g # CH
	vpsrld $7, XTMP1, XTMP2
	xor T1, y1	# y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov a, T1	# T1 = a # MAJB
	and c, T1	# T1 = a&c # MAJB

	add y0, y2	# y2 = S1 + CH # --
	vpslld $(32-7), XTMP1, XTMP3
	or T1, y3	# y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add y1, h	# h = k + w + h + S0 # --

	add y2, d	# d = k + w + h + d + S1 + CH = d + t1 # --
	vpor XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld $18, XTMP1, XTMP2
	add y2, h	# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	add y3, h	# h = t1 + S0 + MAJ # --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov a, y3	# y3 = a # MAJA
	rorx $25, e, y0	# y0 = e >> 25 # S1A
	rorx $11, e, y1	# y1 = e >> 11 # S1B
	offset = \disp + 1*4
	addl offset(%rsp, SRND), h	# h = k + w + h # --
	or c, y3	# y3 = a|c # MAJA


	vpsrld $3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	mov f, y2	# y2 = f # CH
	rorx $13, a, T1	# T1 = a >> 13 # S0B
	xor y1, y0	# y0 = (e>>25) ^ (e>>11) # S1
	xor g, y2	# y2 = f^g # CH


	rorx $6, e, y1	# y1 = (e >> 6) # S1
	xor y1, y0	# y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx $22, a, y1	# y1 = a >> 22 # S0A
	and e, y2	# y2 = (f^g)&e # CH
	add h, d	# d = k + w + h + d # --

	vpslld $(32-18), XTMP1, XTMP1
	and b, y3	# y3 = (a|c)&b # MAJA
	xor T1, y1	# y1 = (a>>22) ^ (a>>13) # S0

	vpxor XTMP1, XTMP3, XTMP3
	rorx $2, a, T1	# T1 = (a >> 2) # S0
	xor g, y2	# y2 = CH = ((f^g)&e)^g # CH

	vpxor XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor T1, y1	# y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov a, T1	# T1 = a # MAJB
	and c, T1	# T1 = a&c # MAJB
	add y0, y2	# y2 = S1 + CH # --

	vpxor XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd $0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or T1, y3	# y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add y1, h	# h = k + w + h + S0 # --

	vpaddd XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add y2, d	# d = k + w + h + d + S1 + CH = d + t1 # --
	add y2, h	# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	add y3, h	# h = t1 + S0 + MAJ # --

	vpsrld $10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov a, y3	# y3 = a # MAJA
	rorx $25, e, y0	# y0 = e >> 25 # S1A
	offset = \disp + 2*4
	addl offset(%rsp, SRND), h	# h = k + w + h # --

	vpsrlq $19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	rorx $11, e, y1	# y1 = e >> 11 # S1B
	or c, y3	# y3 = a|c # MAJA
	mov f, y2	# y2 = f # CH
	xor g, y2	# y2 = f^g # CH

	rorx $13, a, T1	# T1 = a >> 13 # S0B
	xor y1, y0	# y0 = (e>>25) ^ (e>>11) # S1
	vpsrlq $17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and e, y2	# y2 = (f^g)&e # CH

	rorx $6, e, y1	# y1 = (e >> 6) # S1
	vpxor XTMP3, XTMP2, XTMP2
	add h, d	# d = k + w + h + d # --
	and b, y3	# y3 = (a|c)&b # MAJA

	xor y1, y0	# y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx $22, a, y1	# y1 = a >> 22 # S0A
	vpxor XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor g, y2	# y2 = CH = ((f^g)&e)^g # CH

	vpshufb SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor T1, y1	# y1 = (a>>22) ^ (a>>13) # S0
	rorx $2, a, T1	# T1 = (a >> 2) # S0
	vpaddd XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor T1, y1	# y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov a, T1	# T1 = a # MAJB
	and c, T1	# T1 = a&c # MAJB
	add y0, y2	# y2 = S1 + CH # --
	vpshufd $0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or T1, y3	# y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add y1, h	# h = k + w + h + S0 # --
	add y2, d	# d = k + w + h + d + S1 + CH = d + t1 # --
	add y2, h	# h = k + w + h + S0 + S1 + CH = t1 + S0 # --

	add y3, h	# h = t1 + S0 + MAJ # --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov a, y3	# y3 = a # MAJA
	rorx $25, e, y0	# y0 = e >> 25 # S1A
	rorx $11, e, y1	# y1 = e >> 11 # S1B
	offset = \disp + 3*4
	addl offset(%rsp, SRND), h	# h = k + w + h # --
	or c, y3	# y3 = a|c # MAJA


	vpsrld $10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov f, y2	# y2 = f # CH
	rorx $13, a, T1	# T1 = a >> 13 # S0B
	xor y1, y0	# y0 = (e>>25) ^ (e>>11) # S1
	xor g, y2	# y2 = f^g # CH


	vpsrlq $19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx $6, e, y1	# y1 = (e >> 6) # S1
	and e, y2	# y2 = (f^g)&e # CH
	add h, d	# d = k + w + h + d # --
	and b, y3	# y3 = (a|c)&b # MAJA

	vpsrlq $17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor y1, y0	# y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	xor g, y2	# y2 = CH = ((f^g)&e)^g # CH

	vpxor XTMP3, XTMP2, XTMP2
	rorx $22, a, y1	# y1 = a >> 22 # S0A
	add y0, y2	# y2 = S1 + CH # --

	vpxor XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor T1, y1	# y1 = (a>>22) ^ (a>>13) # S0
	add y2, d	# d = k + w + h + d + S1 + CH = d + t1 # --

	rorx $2, a, T1	# T1 = (a >> 2) # S0
	vpshufb SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor T1, y1	# y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov a, T1	# T1 = a # MAJB
	and c, T1	# T1 = a&c # MAJB
	or T1, y3	# y3 = MAJ = ((a|c)&b)|(a&c) # MAJ

	add y1, h	# h = k + w + h + S0 # --
	add y2, h	# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	add y3, h	# h = t1 + S0 + MAJ # --

	ROTATE_ARGS
	rotate_Xs
.endm

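# DO_4ROUNDS: four rounds fed from W+K values already stored in the _XFER
# area, with no message scheduling. It is used for the last 16 rounds of
# the first block and, in .Lloop3, for all 64 rounds of the second block.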
.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov f, y2	# y2 = f # CH
	rorx $25, e, y0	# y0 = e >> 25 # S1A
	rorx $11, e, y1	# y1 = e >> 11 # S1B
	xor g, y2	# y2 = f^g # CH

	xor y1, y0	# y0 = (e>>25) ^ (e>>11) # S1
	rorx $6, e, y1	# y1 = (e >> 6) # S1
	and e, y2	# y2 = (f^g)&e # CH

	xor y1, y0	# y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx $13, a, T1	# T1 = a >> 13 # S0B
	xor g, y2	# y2 = CH = ((f^g)&e)^g # CH
	rorx $22, a, y1	# y1 = a >> 22 # S0A
	mov a, y3	# y3 = a # MAJA

	xor T1, y1	# y1 = (a>>22) ^ (a>>13) # S0
	rorx $2, a, T1	# T1 = (a >> 2) # S0
	addl \disp(%rsp, SRND), h	# h = k + w + h # --
	or c, y3	# y3 = a|c # MAJA

	xor T1, y1	# y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov a, T1	# T1 = a # MAJB
	and b, y3	# y3 = (a|c)&b # MAJA
	and c, T1	# T1 = a&c # MAJB
	add y0, y2	# y2 = S1 + CH # --


	add h, d	# d = k + w + h + d # --
	or T1, y3	# y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add y1, h	# h = k + w + h + S0 # --
	add y2, d	# d = k + w + h + d + S1 + CH = d + t1 # --

	ROTATE_ARGS

################################### RND N + 1 ###########################

	add y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	mov f, y2	# y2 = f # CH
	rorx $25, e, y0	# y0 = e >> 25 # S1A
	rorx $11, e, y1	# y1 = e >> 11 # S1B
	xor g, y2	# y2 = f^g # CH

	xor y1, y0	# y0 = (e>>25) ^ (e>>11) # S1
	rorx $6, e, y1	# y1 = (e >> 6) # S1
	and e, y2	# y2 = (f^g)&e # CH
	add y3, old_h	# h = t1 + S0 + MAJ # --

	xor y1, y0	# y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx $13, a, T1	# T1 = a >> 13 # S0B
	xor g, y2	# y2 = CH = ((f^g)&e)^g # CH
	rorx $22, a, y1	# y1 = a >> 22 # S0A
	mov a, y3	# y3 = a # MAJA

	xor T1, y1	# y1 = (a>>22) ^ (a>>13) # S0
	rorx $2, a, T1	# T1 = (a >> 2) # S0
	offset = 4*1 + \disp
	addl offset(%rsp, SRND), h	# h = k + w + h # --
	or c, y3	# y3 = a|c # MAJA

	xor T1, y1	# y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov a, T1	# T1 = a # MAJB
	and b, y3	# y3 = (a|c)&b # MAJA
	and c, T1	# T1 = a&c # MAJB
	add y0, y2	# y2 = S1 + CH # --


	add h, d	# d = k + w + h + d # --
	or T1, y3	# y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add y1, h	# h = k + w + h + S0 # --

	add y2, d	# d = k + w + h + d + S1 + CH = d + t1 # --

	ROTATE_ARGS

################################### RND N + 2 ##############################

	add y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	mov f, y2	# y2 = f # CH
	rorx $25, e, y0	# y0 = e >> 25 # S1A
	rorx $11, e, y1	# y1 = e >> 11 # S1B
	xor g, y2	# y2 = f^g # CH

	xor y1, y0	# y0 = (e>>25) ^ (e>>11) # S1
	rorx $6, e, y1	# y1 = (e >> 6) # S1
	and e, y2	# y2 = (f^g)&e # CH
	add y3, old_h	# h = t1 + S0 + MAJ # --

	xor y1, y0	# y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx $13, a, T1	# T1 = a >> 13 # S0B
	xor g, y2	# y2 = CH = ((f^g)&e)^g # CH
	rorx $22, a, y1	# y1 = a >> 22 # S0A
	mov a, y3	# y3 = a # MAJA

	xor T1, y1	# y1 = (a>>22) ^ (a>>13) # S0
	rorx $2, a, T1	# T1 = (a >> 2) # S0
	offset = 4*2 + \disp
	addl offset(%rsp, SRND), h	# h = k + w + h # --
	or c, y3	# y3 = a|c # MAJA

	xor T1, y1	# y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov a, T1	# T1 = a # MAJB
	and b, y3	# y3 = (a|c)&b # MAJA
	and c, T1	# T1 = a&c # MAJB
	add y0, y2	# y2 = S1 + CH # --


	add h, d	# d = k + w + h + d # --
	or T1, y3	# y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add y1, h	# h = k + w + h + S0 # --

	add y2, d	# d = k + w + h + d + S1 + CH = d + t1 # --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0 # --
	mov f, y2	# y2 = f # CH
	rorx $25, e, y0	# y0 = e >> 25 # S1A
	rorx $11, e, y1	# y1 = e >> 11 # S1B
	xor g, y2	# y2 = f^g # CH

	xor y1, y0	# y0 = (e>>25) ^ (e>>11) # S1
	rorx $6, e, y1	# y1 = (e >> 6) # S1
	and e, y2	# y2 = (f^g)&e # CH
	add y3, old_h	# h = t1 + S0 + MAJ # --

	xor y1, y0	# y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
	rorx $13, a, T1	# T1 = a >> 13 # S0B
	xor g, y2	# y2 = CH = ((f^g)&e)^g # CH
	rorx $22, a, y1	# y1 = a >> 22 # S0A
	mov a, y3	# y3 = a # MAJA

	xor T1, y1	# y1 = (a>>22) ^ (a>>13) # S0
	rorx $2, a, T1	# T1 = (a >> 2) # S0
	offset = 4*3 + \disp
	addl offset(%rsp, SRND), h	# h = k + w + h # --
	or c, y3	# y3 = a|c # MAJA

	xor T1, y1	# y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
	mov a, T1	# T1 = a # MAJB
	and b, y3	# y3 = (a|c)&b # MAJA
	and c, T1	# T1 = a&c # MAJB
	add y0, y2	# y2 = S1 + CH # --


	add h, d	# d = k + w + h + d # --
	or T1, y3	# y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
	add y1, h	# h = k + w + h + S0 # --

	add y2, d	# d = k + w + h + d + S1 + CH = d + t1 # --


	add y2, h	# h = k + w + h + S0 + S1 + CH = t1 + S0 # --

	add y3, h	# h = t1 + S0 + MAJ # --

	ROTATE_ARGS

.endm

########################################################################
## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
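# For orientation, the C-side declaration this entry point corresponds to
# (a sketch of the usual glue only; the exact wrapper is not part of this
# file, and callers are expected to hold FPU/AVX context, e.g. via
# kernel_fpu_begin()/kernel_fpu_end()):
#
#	asmlinkage void sha256_transform_rorx(struct sha256_state *state,
#					      const u8 *data, int blocks);
#
#	/* data must hold blocks * 64 bytes; the eight u32 digest words at
#	 * the start of *state are updated in place. */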
.text
SYM_TYPED_FUNC_START(sha256_transform_rorx)
	pushq %rbx
	pushq %r12
	pushq %r13
	pushq %r14
	pushq %r15

	push %rbp
	mov %rsp, %rbp

	subq $STACK_SIZE, %rsp
	and $-32, %rsp	# align rsp to 32 byte boundary

	shl $6, NUM_BLKS	# convert to bytes
	jz .Ldone_hash
	lea -64(INP, NUM_BLKS), NUM_BLKS	# pointer to last block
	mov NUM_BLKS, _INP_END(%rsp)

	cmp NUM_BLKS, INP
	je .Lonly_one_block

	## load initial digest
	mov (CTX), a
	mov 4*1(CTX), b
	mov 4*2(CTX), c
	mov 4*3(CTX), d
	mov 4*4(CTX), e
	mov 4*5(CTX), f
	mov 4*6(CTX), g
	mov 4*7(CTX), h

	vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa _SHUF_DC00(%rip), SHUF_DC00

	mov CTX, _CTX(%rsp)

.Lloop0:
	## Load first 16 dwords from two blocks
	VMOVDQ 0*32(INP),XTMP0
	VMOVDQ 1*32(INP),XTMP1
	VMOVDQ 2*32(INP),XTMP2
	VMOVDQ 3*32(INP),XTMP3

	## byte swap data
	vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128 $0x20, XTMP2, XTMP0, X0
	vperm2i128 $0x31, XTMP2, XTMP0, X1
	vperm2i128 $0x20, XTMP3, XTMP1, X2
	vperm2i128 $0x31, XTMP3, XTMP1, X3
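
	## After the transpose, X0..X3 should each hold four message words
	## from both blocks: block 1 in the low 128-bit lane, block 2 in the
	## high lane (X0 = w0..3, X1 = w4..7, X2 = w8..11, X3 = w12..15),
	## so the two message schedules can be computed together.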

.Llast_block_enter:
	add $64, INP
	mov INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	xor SRND, SRND
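
	## SRND advances by 4*32 per .Lloop1 iteration: it indexes both the
	## K256 table (through the scratch INP pointer) and the _XFER save
	## area, one 32-byte W+K slot per four rounds.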

.align 16
.Lloop1:
	leaq K256+0*32(%rip), INP	## reuse INP as scratch reg
	vpaddd (INP, SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED (_XFER + 0*32)

	leaq K256+1*32(%rip), INP
	vpaddd (INP, SRND), X0, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED (_XFER + 1*32)

	leaq K256+2*32(%rip), INP
	vpaddd (INP, SRND), X0, XFER
	vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED (_XFER + 2*32)

	leaq K256+3*32(%rip), INP
	vpaddd (INP, SRND), X0, XFER
	vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED (_XFER + 3*32)

	add $4*32, SRND
	cmp $3*4*32, SRND
	jb .Lloop1

.Lloop2:
	## Do last 16 rounds with no scheduling
	leaq K256+0*32(%rip), INP
	vpaddd (INP, SRND), X0, XFER
	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS (_XFER + 0*32)

	leaq K256+1*32(%rip), INP
	vpaddd (INP, SRND), X1, XFER
	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS (_XFER + 1*32)
	add $2*32, SRND

	vmovdqa X2, X0
	vmovdqa X3, X1

	cmp $4*4*32, SRND
	jb .Lloop2

	mov _CTX(%rsp), CTX
	mov _INP(%rsp), INP

	addm (4*0)(CTX),a
	addm (4*1)(CTX),b
	addm (4*2)(CTX),c
	addm (4*3)(CTX),d
	addm (4*4)(CTX),e
	addm (4*5)(CTX),f
	addm (4*6)(CTX),g
	addm (4*7)(CTX),h

	cmp _INP_END(%rsp), INP
	ja .Ldone_hash

	#### Do second block using previously scheduled results
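	## The "+ 16" displacements below select the high 16 bytes of each
	## 32-byte _XFER slot, i.e. the W+K values precomputed for the
	## second block (high lanes) while the first block's rounds ran,
	## so no vector scheduling work is needed in this loop.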
	xor SRND, SRND
.align 16
.Lloop3:
	DO_4ROUNDS (_XFER + 0*32 + 16)
	DO_4ROUNDS (_XFER + 1*32 + 16)
	add $2*32, SRND
	cmp $4*4*32, SRND
	jb .Lloop3

	mov _CTX(%rsp), CTX
	mov _INP(%rsp), INP
	add $64, INP

	addm (4*0)(CTX),a
	addm (4*1)(CTX),b
	addm (4*2)(CTX),c
	addm (4*3)(CTX),d
	addm (4*4)(CTX),e
	addm (4*5)(CTX),f
	addm (4*6)(CTX),g
	addm (4*7)(CTX),h

	cmp _INP_END(%rsp), INP
	jb .Lloop0
	ja .Ldone_hash

.Ldo_last_block:
	VMOVDQ 0*16(INP),XWORD0
	VMOVDQ 1*16(INP),XWORD1
	VMOVDQ 2*16(INP),XWORD2
	VMOVDQ 3*16(INP),XWORD3

	vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp .Llast_block_enter

.Lonly_one_block:

	## load initial digest
	mov (4*0)(CTX),a
	mov (4*1)(CTX),b
	mov (4*2)(CTX),c
	mov (4*3)(CTX),d
	mov (4*4)(CTX),e
	mov (4*5)(CTX),f
	mov (4*6)(CTX),g
	mov (4*7)(CTX),h

	vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa _SHUF_00BA(%rip), SHUF_00BA
	vmovdqa _SHUF_DC00(%rip), SHUF_DC00

	mov CTX, _CTX(%rsp)
	jmp .Ldo_last_block

.Ldone_hash:

	mov %rbp, %rsp
	pop %rbp

	popq %r15
	popq %r14
	popq %r13
	popq %r12
	popq %rbx
	vzeroupper
	RET
SYM_FUNC_END(sha256_transform_rorx)

.section .rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF