Loading...
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * Blowfish Cipher Algorithm (x86_64)
4 *
5 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
6 */
7
8#include <linux/linkage.h>
9
10.file "blowfish-x86_64-asm.S"
11.text
12
/*
 * structure of crypto context
 *
 * Byte offsets relative to CTX.  The array at p starts at offset 0 and spans
 * (16 + 2) four-byte entries; it is followed by the four tables s0..s3, each
 * 256 four-byte entries long.
 */
#define p	0
#define s0	((16 + 2) * 4)
#define s1	((16 + 2 + (1 * 256)) * 4)
#define s2	((16 + 2 + (2 * 256)) * 4)
#define s3	((16 + 2 + (3 * 256)) * 4)
19
/*
 * register macros
 *
 * CTX  - base register for the p/s0..s3 offsets
 * RIO  - pointer used by the read/write/xor block macros
 * RXn  - block registers; the d/bl/bh suffixes name the 32-bit view, the
 *        lowest byte and the second-lowest byte of the same register
 * RTn  - temporaries
 * RKEY - preloaded round key for the 4-way code
 *
 * Note the aliasing: RT1 is the same register as RIO (%rsi), and RT0 is
 * %rdi, so any macro that uses RT0/RT1 destroys those values.
 */
#define CTX %r12
#define RIO %rsi

#define RX0 %rax
#define RX1 %rbx
#define RX2 %rcx
#define RX3 %rdx

#define RX0d %eax
#define RX1d %ebx
#define RX2d %ecx
#define RX3d %edx

#define RX0bl %al
#define RX1bl %bl
#define RX2bl %cl
#define RX3bl %dl

#define RX0bh %ah
#define RX1bh %bh
#define RX2bh %ch
#define RX3bh %dh

#define RT0 %rdi
#define RT1 %rsi
#define RT2 %r8
#define RT3 %r9

#define RT0d %edi
#define RT1d %esi
#define RT2d %r8d
#define RT3d %r9d

#define RKEY %r10
55
/***********************************************************************
 * 1-way blowfish
 ***********************************************************************/

/*
 * F(): one Feistel step on the block held in RX0.
 *
 * The four bytes of the low 32 bits of RX0 index the tables, most
 * significant byte into s0 down to least significant byte into s3, and the
 * lookups are combined as ((s0[a] + s1[b]) ^ s2[c]) + s3[d] in 32-bit
 * arithmetic.  The rotates add up to 32 bits, so the two halves of RX0 end
 * up swapped, and the zero-extended result is XORed into the new low half.
 *
 * Clobbers RT0, RT1 (the same register as RIO) and RT2.
 */
#define F() \
	rorq $16,		RX0; \
	movzbl RX0bh,		RT0d; \
	movzbl RX0bl,		RT1d; \
	rolq $16,		RX0; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT1,4),	RT0d; \
	movzbl RX0bh,		RT1d; \
	movzbl RX0bl,		RT2d; \
	rolq $32,		RX0; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT2,4),	RT0d; \
	xorq RT0,		RX0;
72
/* XOR p[n] and p[n+1] into RX0 with a single 64-bit memory operand. */
#define add_roundkey_enc(n) \
	xorq p+4*(n)(CTX),	RX0;

/* Round-key XOR for p[n]/p[n+1] followed by two F() steps. */
#define round_enc(n) \
	add_roundkey_enc(n); \
	\
	F(); \
	F();

/*
 * Load p[n-1] and p[n] with one 64-bit load, swap the two halves so that
 * p[n] sits in the low half, then XOR into RX0.  Clobbers RT0.
 */
#define add_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RT0; \
	rorq $32,		RT0; \
	xorq RT0,		RX0;

/* Round-key XOR for p[n]/p[n-1] followed by two F() steps. */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(); \
	F();
92
/*
 * Load the 8-byte block at (RIO) into RX0.  After the half swap and the
 * byte swap, the low 32 bits of RX0 hold the first four bytes of the block
 * read as a big-endian word and the high 32 bits hold the last four.
 */
#define read_block() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0;

/* Byte-swap RX0 and store it as the 8-byte block at (RIO). */
#define write_block() \
	bswapq			RX0; \
	movq RX0,		(RIO);
101
/*
 * blowfish_enc_blk: run the encryption round sequence on one 8-byte block.
 *
 * Clobbers %rax, %rdi, %rsi, %r8, %r10, %r11 and flags; %r12 is preserved.
 */
SYM_FUNC_START(blowfish_enc_blk)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	/* CTX is %r12: park the caller's value in %r11 rather than on the stack */
	movq %r12, %r11;

	movq %rdi, CTX;
	/* F() overwrites RIO (it is also RT1), so keep dst in %r10 for later */
	movq %rsi, %r10;
	movq %rdx, RIO;

	read_block();

	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	/* CTX is not used past this point: restore the caller's %r12 */
	movq %r11, %r12;
	movq %r10, RIO;

	write_block();
	RET;
SYM_FUNC_END(blowfish_enc_blk)
132
/*
 * blowfish_dec_blk: run the decryption round sequence (round keys applied
 * from p[17] down to p[0]) on one 8-byte block.
 *
 * Clobbers %rax, %rdi, %rsi, %r8, %r10, %r11 and flags; %r12 is preserved.
 */
SYM_FUNC_START(blowfish_dec_blk)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	/* CTX is %r12: park the caller's value in %r11 rather than on the stack */
	movq %r12, %r11;

	movq %rdi, CTX;
	/* F() overwrites RIO (it is also RT1), so keep dst in %r10 for later */
	movq %rsi, %r10;
	movq %rdx, RIO;

	read_block();

	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	movq %r10, RIO;
	write_block();

	/* restore the caller's %r12 */
	movq %r11, %r12;

	RET;
SYM_FUNC_END(blowfish_dec_blk)
164
/**********************************************************************
  4-way blowfish, four blocks parallel
 **********************************************************************/

/* F() for 4-way. Slower when used alone/1-way, but faster when used
 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * x is one of RX0..RX3; "x ## bh" / "x ## bl" paste onto the register macro
 * name to select its byte views.  All four index bytes are extracted before
 * any table lookup, and the two 16-bit rotates leave the halves of x swapped.
 * The combination is ((s0[a] + s1[b]) ^ s2[c]) + s3[d], XORed into the new
 * low half of x.
 *
 * Clobbers RT0, RT1 (the same register as RIO), RT2 and RT3.
 */
#define F4(x) \
	movzbl x ## bh,		RT1d; \
	movzbl x ## bl,		RT3d; \
	rorq $16,		x; \
	movzbl x ## bh,		RT0d; \
	movzbl x ## bl,		RT2d; \
	rorq $16,		x; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT2,4),	RT0d; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT3,4),	RT0d; \
	xorq RT0,		x;
184
/* XOR the round key held in RKEY into all four block registers. */
#define add_preloaded_roundkey4() \
	xorq RKEY,		RX0; \
	xorq RKEY,		RX1; \
	xorq RKEY,		RX2; \
	xorq RKEY,		RX3;

/* Load p[n] and p[n+1] into RKEY with one 64-bit load. */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX),	RKEY;

/*
 * Apply the key already in RKEY, then fetch the key for the next round pair.
 * The n passed in is the index of the key being applied, not the one loaded.
 */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc((n) + 2);

/* Round-key XOR followed by two F4() steps on each of the four blocks. */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/*
 * Load p[n-1] and p[n] into RKEY and swap the halves so that p[n] sits in
 * the low half.
 */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RKEY; \
	rorq $32,		RKEY;

/* Apply the key already in RKEY, then fetch the key two entries lower. */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec((n) - 2);

/* Round-key XOR followed by two F4() steps on each of the four blocks. */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);
231
/*
 * Load four consecutive 8-byte blocks from RIO+0/8/16/24 into RX0..RX3,
 * applying the same half swap and byte swap to each.
 */
#define read_block4() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0; \
	\
	movq 8(RIO),		RX1; \
	rorq $32,		RX1; \
	bswapq			RX1; \
	\
	movq 16(RIO),		RX2; \
	rorq $32,		RX2; \
	bswapq			RX2; \
	\
	movq 24(RIO),		RX3; \
	rorq $32,		RX3; \
	bswapq			RX3;

/* Byte-swap RX0..RX3 and store them to RIO+0/8/16/24. */
#define write_block4() \
	bswapq			RX0; \
	movq RX0,		(RIO); \
	\
	bswapq			RX1; \
	movq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	movq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	movq RX3,		24(RIO);

/*
 * XOR the byte-swapped 64-bit words at RIO+0, +8 and +16 into RX1, RX2 and
 * RX3 respectively; RX0 is left untouched.  Because the words are swapped
 * before the XOR, the bswapq in write_block4() afterwards yields
 * bswap(RXn) ^ word.  Clobbers RT0, RT2 and RT3.
 *
 * NOTE(review): presumably this is CBC chaining against the preceding input
 * blocks, with the first block handled by the caller - confirm.
 */
#define xor_block4() \
	movq (RIO),		RT0; \
	bswapq			RT0; \
	xorq RT0,		RX1; \
	\
	movq 8(RIO),		RT2; \
	bswapq			RT2; \
	xorq RT2,		RX2; \
	\
	movq 16(RIO),		RT3; \
	bswapq			RT3; \
	xorq RT3,		RX3;
274
/*
 * blowfish_enc_blk_4way: run the encryption round sequence on four
 * consecutive 8-byte blocks (32 bytes read from src, 32 bytes written to dst).
 *
 * %r12 (CTX) and %rbx (RX1) are saved on the stack and restored; the other
 * registers named by the RX/RT/RKEY macros and %r11 are clobbered.
 */
SYM_FUNC_START(blowfish_enc_blk_4way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	pushq %r12;
	pushq %rbx;

	movq %rdi, CTX;
	/* F4() overwrites RIO (it is also RT1), so keep dst in %r11 for later */
	movq %rsi, %r11;
	movq %rdx, RIO;

	preload_roundkey_enc(0);

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	/* round_enc4(14) already preloaded p[16]/p[17] into RKEY */
	add_preloaded_roundkey4();

	movq %r11, RIO;
	write_block4();

	popq %rbx;
	popq %r12;
	RET;
SYM_FUNC_END(blowfish_enc_blk_4way)
309
/*
 * __blowfish_dec_blk_4way: run the decryption round sequence on four
 * consecutive 8-byte blocks and, when cbc is non-zero, apply xor_block4()
 * against src before storing the result to dst.
 *
 * %r12 (CTX) and %rbx (RX1) are saved on the stack and restored; the other
 * registers named by the RX/RT/RKEY macros and %r11 are clobbered.
 */
SYM_FUNC_START(__blowfish_dec_blk_4way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: cbc (bool)
	 */
	pushq %r12;
	pushq %rbx;
	/* %rcx and %rdx double as RX2/RX3, so stash cbc and src on the stack */
	pushq %rcx;
	pushq %rdx;

	movq %rdi, CTX;
	/* F4() overwrites RIO (it is also RT1), so keep dst in %r11 for later */
	movq %rsi, %r11;
	movq %rdx, RIO;

	preload_roundkey_dec(17);
	read_block4();

	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	/* round_dec4(3) already preloaded p[1]/p[0] into RKEY */
	add_preloaded_roundkey4();

	/* pops mirror the pushes: src first, then the cbc flag */
	popq RIO;
	/* CTX is not used past this point, so %r12 can take the cbc flag */
	popq %r12;
	testq %r12, %r12;
	jz .L_no_cbc_xor;

	xor_block4();

.L_no_cbc_xor:
	movq %r11, RIO;
	write_block4();

	popq %rbx;
	popq %r12;

	RET;
SYM_FUNC_END(__blowfish_dec_blk_4way)
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * Blowfish Cipher Algorithm (x86_64)
4 *
5 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
6 */
7
8#include <linux/linkage.h>
9#include <linux/cfi_types.h>
10
11.file "blowfish-x86_64-asm.S"
12.text
13
/*
 * structure of crypto context
 *
 * Byte offsets relative to CTX.  The array at p starts at offset 0 and spans
 * (16 + 2) four-byte entries; it is followed by the four tables s0..s3, each
 * 256 four-byte entries long.
 */
#define p	0
#define s0	((16 + 2) * 4)
#define s1	((16 + 2 + (1 * 256)) * 4)
#define s2	((16 + 2 + (2 * 256)) * 4)
#define s3	((16 + 2 + (3 * 256)) * 4)
20
/*
 * register macros
 *
 * CTX  - base register for the p/s0..s3 offsets
 * RIO  - pointer used by the read/write/xor block macros
 * RXn  - block registers; the d/bl/bh suffixes name the 32-bit view, the
 *        lowest byte and the second-lowest byte of the same register
 * RTn  - temporaries
 * RKEY - preloaded round key for the 4-way code
 *
 * Note the aliasing: RT1 is the same register as RIO (%rsi), and RT0 is
 * %rdi, so any macro that uses RT0/RT1 destroys those values.
 */
#define CTX %r12
#define RIO %rsi

#define RX0 %rax
#define RX1 %rbx
#define RX2 %rcx
#define RX3 %rdx

#define RX0d %eax
#define RX1d %ebx
#define RX2d %ecx
#define RX3d %edx

#define RX0bl %al
#define RX1bl %bl
#define RX2bl %cl
#define RX3bl %dl

#define RX0bh %ah
#define RX1bh %bh
#define RX2bh %ch
#define RX3bh %dh

#define RT0 %rdi
#define RT1 %rsi
#define RT2 %r8
#define RT3 %r9

#define RT0d %edi
#define RT1d %esi
#define RT2d %r8d
#define RT3d %r9d

#define RKEY %r10
56
/***********************************************************************
 * 1-way blowfish
 ***********************************************************************/

/*
 * F(): one Feistel step on the block held in RX0.
 *
 * The four bytes of the low 32 bits of RX0 index the tables, most
 * significant byte into s0 down to least significant byte into s3, and the
 * lookups are combined as ((s0[a] + s1[b]) ^ s2[c]) + s3[d] in 32-bit
 * arithmetic.  The rotates add up to 32 bits, so the two halves of RX0 end
 * up swapped, and the zero-extended result is XORed into the new low half.
 *
 * Clobbers RT0, RT1 (the same register as RIO) and RT2.
 */
#define F() \
	rorq $16,		RX0; \
	movzbl RX0bh,		RT0d; \
	movzbl RX0bl,		RT1d; \
	rolq $16,		RX0; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT1,4),	RT0d; \
	movzbl RX0bh,		RT1d; \
	movzbl RX0bl,		RT2d; \
	rolq $32,		RX0; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT2,4),	RT0d; \
	xorq RT0,		RX0;
73
/* XOR p[n] and p[n+1] into RX0 with a single 64-bit memory operand. */
#define add_roundkey_enc(n) \
	xorq p+4*(n)(CTX),	RX0;

/* Round-key XOR for p[n]/p[n+1] followed by two F() steps. */
#define round_enc(n) \
	add_roundkey_enc(n); \
	\
	F(); \
	F();

/*
 * Load p[n-1] and p[n] with one 64-bit load, swap the two halves so that
 * p[n] sits in the low half, then XOR into RX0.  Clobbers RT0.
 */
#define add_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RT0; \
	rorq $32,		RT0; \
	xorq RT0,		RX0;

/* Round-key XOR for p[n]/p[n-1] followed by two F() steps. */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(); \
	F();
93
/*
 * Load the 8-byte block at (RIO) into RX0.  After the half swap and the
 * byte swap, the low 32 bits of RX0 hold the first four bytes of the block
 * read as a big-endian word and the high 32 bits hold the last four.
 */
#define read_block() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0;

/* Byte-swap RX0 and store it as the 8-byte block at (RIO). */
#define write_block() \
	bswapq			RX0; \
	movq RX0,		(RIO);

/* Byte-swap RX0 and XOR it into the 8 bytes already stored at (RIO). */
#define xor_block() \
	bswapq			RX0; \
	xorq RX0,		(RIO);
106
/*
 * __blowfish_enc_blk: run the encryption round sequence on one 8-byte block
 * and either store the result to dst or XOR it into the bytes already at dst,
 * depending on the flag in %cl.
 *
 * Clobbers %rax, %rdi, %rsi, %r8, %r10, %r11 and flags; %r12 is preserved.
 */
SYM_FUNC_START(__blowfish_enc_blk)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 */
	/* CTX is %r12: park the caller's value in %r11 rather than on the stack */
	movq %r12, %r11;

	movq %rdi, CTX;
	/* F() overwrites RIO (it is also RT1), so keep dst in %r10 for later */
	movq %rsi, %r10;
	movq %rdx, RIO;

	read_block();

	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	/* CTX is not used past this point: restore the caller's %r12 */
	movq %r11, %r12;

	movq %r10, RIO;
	/* the 1-way macros never touch %rcx, so %cl still holds the flag */
	test %cl, %cl;
	jnz .L__enc_xor;

	write_block();
	RET;
.L__enc_xor:
	xor_block();
	RET;
SYM_FUNC_END(__blowfish_enc_blk)
144
/*
 * blowfish_dec_blk: run the decryption round sequence (round keys applied
 * from p[17] down to p[0]) on one 8-byte block.
 *
 * Clobbers %rax, %rdi, %rsi, %r8, %r10, %r11 and flags; %r12 is preserved.
 */
SYM_TYPED_FUNC_START(blowfish_dec_blk)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	/* CTX is %r12: park the caller's value in %r11 rather than on the stack */
	movq %r12, %r11;

	movq %rdi, CTX;
	/* F() overwrites RIO (it is also RT1), so keep dst in %r10 for later */
	movq %rsi, %r10;
	movq %rdx, RIO;

	read_block();

	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	movq %r10, RIO;
	write_block();

	/* restore the caller's %r12 */
	movq %r11, %r12;

	RET;
SYM_FUNC_END(blowfish_dec_blk)
176
/**********************************************************************
  4-way blowfish, four blocks parallel
 **********************************************************************/

/* F() for 4-way. Slower when used alone/1-way, but faster when used
 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * x is one of RX0..RX3; "x ## bh" / "x ## bl" paste onto the register macro
 * name to select its byte views.  All four index bytes are extracted before
 * any table lookup, and the two 16-bit rotates leave the halves of x swapped.
 * The combination is ((s0[a] + s1[b]) ^ s2[c]) + s3[d], XORed into the new
 * low half of x.
 *
 * Clobbers RT0, RT1 (the same register as RIO), RT2 and RT3.
 */
#define F4(x) \
	movzbl x ## bh,		RT1d; \
	movzbl x ## bl,		RT3d; \
	rorq $16,		x; \
	movzbl x ## bh,		RT0d; \
	movzbl x ## bl,		RT2d; \
	rorq $16,		x; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT2,4),	RT0d; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT3,4),	RT0d; \
	xorq RT0,		x;
196
/* XOR the round key held in RKEY into all four block registers. */
#define add_preloaded_roundkey4() \
	xorq RKEY,		RX0; \
	xorq RKEY,		RX1; \
	xorq RKEY,		RX2; \
	xorq RKEY,		RX3;

/* Load p[n] and p[n+1] into RKEY with one 64-bit load. */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX),	RKEY;

/*
 * Apply the key already in RKEY, then fetch the key for the next round pair.
 * The n passed in is the index of the key being applied, not the one loaded.
 */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc((n) + 2);

/* Round-key XOR followed by two F4() steps on each of the four blocks. */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/*
 * Load p[n-1] and p[n] into RKEY and swap the halves so that p[n] sits in
 * the low half.
 */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RKEY; \
	rorq $32,		RKEY;

/* Apply the key already in RKEY, then fetch the key two entries lower. */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec((n) - 2);

/* Round-key XOR followed by two F4() steps on each of the four blocks. */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);
243
/*
 * Load four consecutive 8-byte blocks from RIO+0/8/16/24 into RX0..RX3,
 * applying the same half swap and byte swap to each.
 */
#define read_block4() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0; \
	\
	movq 8(RIO),		RX1; \
	rorq $32,		RX1; \
	bswapq			RX1; \
	\
	movq 16(RIO),		RX2; \
	rorq $32,		RX2; \
	bswapq			RX2; \
	\
	movq 24(RIO),		RX3; \
	rorq $32,		RX3; \
	bswapq			RX3;

/* Byte-swap RX0..RX3 and store them to RIO+0/8/16/24. */
#define write_block4() \
	bswapq			RX0; \
	movq RX0,		(RIO); \
	\
	bswapq			RX1; \
	movq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	movq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	movq RX3,		24(RIO);

/*
 * Byte-swap RX0..RX3 and XOR them into the 32 bytes already stored at
 * RIO+0/8/16/24 (read-modify-write of the destination).
 */
#define xor_block4() \
	bswapq			RX0; \
	xorq RX0,		(RIO); \
	\
	bswapq			RX1; \
	xorq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	xorq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	xorq RX3,		24(RIO);
286
/*
 * __blowfish_enc_blk_4way: run the encryption round sequence on four
 * consecutive 8-byte blocks and either store the results to dst or XOR them
 * into the 32 bytes already at dst, depending on the flag passed in %rcx.
 *
 * %r12 (CTX) and %rbx (RX1) are saved on the stack and restored; the other
 * registers named by the RX/RT/RKEY macros and %r11 are clobbered.
 */
SYM_FUNC_START(__blowfish_enc_blk_4way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 */
	pushq %r12;
	pushq %rbx;
	/* %rcx doubles as RX2, so stash the xor flag on the stack */
	pushq %rcx;

	movq %rdi, CTX;
	/* F4() overwrites RIO (it is also RT1), so keep dst in %r11 for later */
	movq %rsi, %r11;
	movq %rdx, RIO;

	preload_roundkey_enc(0);

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	/* round_enc4(14) already preloaded p[16]/p[17] into RKEY */
	add_preloaded_roundkey4();

	/* CTX is not used past this point, so %r12 can take the xor flag */
	popq %r12;
	movq %r11, RIO;

	test %r12b, %r12b;
	jnz .L__enc_xor4;

	write_block4();

	popq %rbx;
	popq %r12;
	RET;

.L__enc_xor4:
	xor_block4();

	popq %rbx;
	popq %r12;
	RET;
SYM_FUNC_END(__blowfish_enc_blk_4way)
335
/*
 * blowfish_dec_blk_4way: run the decryption round sequence on four
 * consecutive 8-byte blocks (32 bytes read from src, 32 bytes written to dst).
 *
 * %r12 (CTX) and %rbx (RX1) are saved on the stack and restored; the other
 * registers named by the RX/RT/RKEY macros and %r11 are clobbered.
 */
SYM_TYPED_FUNC_START(blowfish_dec_blk_4way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	pushq %r12;
	pushq %rbx;

	movq %rdi, CTX;
	/* F4() overwrites RIO (it is also RT1), so keep dst in %r11 for later */
	movq %rsi, %r11;
	movq %rdx, RIO;

	preload_roundkey_dec(17);
	read_block4();

	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	/* round_dec4(3) already preloaded p[1]/p[0] into RKEY */
	add_preloaded_roundkey4();

	movq %r11, RIO;
	write_block4();

	popq %rbx;
	popq %r12;

	RET;
SYM_FUNC_END(blowfish_dec_blk_4way)