Linux Audio

Check our new training course

Loading...
v6.13.7
  1/* SPDX-License-Identifier: GPL-2.0-or-later */
  2/*
  3 * Blowfish Cipher Algorithm (x86_64)
  4 *
  5 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  6 */
  7
  8#include <linux/linkage.h>
  9
 10.file "blowfish-x86_64-asm.S"
 11.text
 12
 13/* structure of crypto context */
 /*
  * Byte offsets from the context pointer (CTX).  The 4-byte round keys start
  * at p; after (16 + 2) of them come four tables s0..s3 of 256 4-byte entries
  * each, which F()/F4() index with a byte value and scale 4.
  */
 14#define p	0
 15#define s0	((16 + 2) * 4)
 16#define s1	((16 + 2 + (1 * 256)) * 4)
 17#define s2	((16 + 2 + (2 * 256)) * 4)
 18#define s3	((16 + 2 + (3 * 256)) * 4)
 19
 20/* register macros */
 /*
  * CTX: context pointer.  %r12 is callee-saved under the SysV ABI, so code
  *      that loads CTX has to save and restore the caller's %r12.
  * RIO: pointer the block load/store macros read from or write to.  It is
  *      %rsi, the same register as RT1, so F()/F4() destroy it and it must
  *      be reloaded before the result is stored.
  */
 21#define CTX %r12
 22#define RIO %rsi
 23
 /* RX0..RX3: 64-bit block state, one block per register. */
 24#define RX0 %rax
 25#define RX1 %rbx
 26#define RX2 %rcx
 27#define RX3 %rdx
 28
 /* 32-bit views of RX0..RX3. */
 29#define RX0d %eax
 30#define RX1d %ebx
 31#define RX2d %ecx
 32#define RX3d %edx
 33
 /* Lowest byte (bits 0-7) of RX0..RX3. */
 34#define RX0bl %al
 35#define RX1bl %bl
 36#define RX2bl %cl
 37#define RX3bl %dl
 38
 /* Second byte (bits 8-15) of RX0..RX3. */
 39#define RX0bh %ah
 40#define RX1bh %bh
 41#define RX2bh %ch
 42#define RX3bh %dh
 43
 /* RT0..RT3: scratch registers for table indices and the F()/F4() result. */
 44#define RT0 %rdi
 45#define RT1 %rsi
 46#define RT2 %r8
 47#define RT3 %r9
 48
 /* 32-bit views of RT0..RT3. */
 49#define RT0d %edi
 50#define RT1d %esi
 51#define RT2d %r8d
 52#define RT3d %r9d
 53
 /* RKEY: round-key pair preloaded by the 4-way macros. */
 54#define RKEY %r10
 55
 56/***********************************************************************
 57 * 1-way blowfish
 58 ***********************************************************************/
 /*
  * F(): one Feistel step on RX0, without the round-key XOR.
  * The four bytes of the low 32 bits of RX0, most significant first, index
  * s0, s1, s2 and s3; the lookups are combined as ((s0 + s1) ^ s2) + s3 in
  * 32-bit arithmetic.  The rotates add up to 32 bits, so the halves of RX0
  * trade places, and the combined value is XORed into the new low half.
  * Clobbers RT0, RT1 (%rsi, i.e. RIO) and RT2.
  */
 59#define F() \
 60	rorq $16,		RX0; \
 61	movzbl RX0bh,		RT0d; \
 62	movzbl RX0bl,		RT1d; \
 63	rolq $16,		RX0; \
 64	movl s0(CTX,RT0,4),	RT0d; \
 65	addl s1(CTX,RT1,4),	RT0d; \
 66	movzbl RX0bh,		RT1d; \
 67	movzbl RX0bl,		RT2d; \
 68	rolq $32,		RX0; \
 69	xorl s2(CTX,RT1,4),	RT0d; \
 70	addl s3(CTX,RT2,4),	RT0d; \
 71	xorq RT0,		RX0;
 72
 /*
  * add_roundkey_enc(n): XOR the 8 bytes at p + 4*n, i.e. round keys n (low
  * half) and n+1 (high half), into RX0 with a single 64-bit operation.
  */
 73#define add_roundkey_enc(n) \
 74	xorq p+4*(n)(CTX), 	RX0;
 75
 /* round_enc(n): apply round keys n and n+1, then run two F() steps. */
 76#define round_enc(n) \
 77	add_roundkey_enc(n); \
 78	\
 79	F(); \
 80	F();
 81
 /*
  * add_roundkey_dec(n): load the 8 bytes at p + 4*(n-1), i.e. round keys n-1
  * (low half) and n (high half), rotate by 32 bits so that key n ends up in
  * the low half, and XOR the pair into RX0.  Clobbers RT0.
  * The argument is parenthesised like in preload_roundkey_dec().
  */
 82#define add_roundkey_dec(n) \
 83	movq p+4*((n)-1)(CTX),	RT0; \
 84	rorq $32,		RT0; \
 85	xorq RT0,		RX0;
 86
 /*
  * round_dec(n): apply round keys n and n-1, then run two F() steps.
  * The last line carries no line continuation, so the macro ends here and
  * does not depend on the following source line being empty.
  */
 87#define round_dec(n) \
 88	add_roundkey_dec(n); \
 89	\
 90	F(); \
 91	F();
 92
 /*
  * read_block(): load 8 bytes from (RIO) into RX0, swap the 32-bit halves and
  * reverse all eight bytes.  Each 4-byte word is thereby read big-endian, with
  * the first word of the block in the low half of RX0.
  */
 93#define read_block() \
 94	movq (RIO), 		RX0; \
 95	rorq $32, 		RX0; \
 96	bswapq 			RX0;
 97
 /*
  * write_block(): reverse the bytes of RX0 and store the 8 bytes at (RIO), so
  * the high half of RX0 is written first and both words are big-endian.
  * RX0 is left byte-swapped.
  */
 98#define write_block() \
 99	bswapq 			RX0; \
100	movq RX0, 		(RIO);
101
102SYM_FUNC_START(blowfish_enc_blk)
 
 
 
 
103	/* input:
104	 *	%rdi: ctx
105	 *	%rsi: dst
106	 *	%rdx: src
 
107	 */
	/*
	 * Encrypts the 8-byte block at src and stores the result at dst.
	 * Uses no stack: the caller's %r12 is parked in %r11 and dst in %r10.
	 * Writes %rax, %rdi, %rsi, %r8, %r10 and %r11.
	 */
108	movq %r12, %r11;
109
110	movq %rdi, CTX;
	/* dst must be saved: %rsi doubles as RIO/RT1 and is overwritten below */
111	movq %rsi, %r10;
112	movq %rdx, RIO;
113
114	read_block();
115
	/* eight double rounds consume round keys 0..15 */
116	round_enc(0);
117	round_enc(2);
118	round_enc(4);
119	round_enc(6);
120	round_enc(8);
121	round_enc(10);
122	round_enc(12);
123	round_enc(14);
	/* final XOR with round keys 16 and 17 */
124	add_roundkey_enc(16);
125
	/* CTX is no longer needed: give the caller's %r12 back */
126	movq %r11, %r12;
 
	/* point RIO at dst for the store */
127	movq %r10, RIO;
 
 
128
129	write_block();
130	RET;
131SYM_FUNC_END(blowfish_enc_blk)
 
 
 
132
133SYM_FUNC_START(blowfish_dec_blk)
134	/* input:
135	 *	%rdi: ctx
136	 *	%rsi: dst
137	 *	%rdx: src
138	 */
	/*
	 * Decrypts the 8-byte block at src and stores the result at dst.
	 * Uses no stack: the caller's %r12 is parked in %r11 and dst in %r10.
	 * Writes %rax, %rdi, %rsi, %r8, %r10 and %r11.
	 */
139	movq %r12, %r11;
140
141	movq %rdi, CTX;
	/* dst must be saved: %rsi doubles as RIO/RT1 and is overwritten below */
142	movq %rsi, %r10;
143	movq %rdx, RIO;
144
145	read_block();
146
	/* eight double rounds consume round keys 17..2 in descending order */
147	round_dec(17);
148	round_dec(15);
149	round_dec(13);
150	round_dec(11);
151	round_dec(9);
152	round_dec(7);
153	round_dec(5);
154	round_dec(3);
	/* final XOR with round keys 1 and 0 */
155	add_roundkey_dec(1);
156
	/* point RIO at dst for the store */
157	movq %r10, RIO;
158	write_block();
159
	/* restore the caller's %r12 */
160	movq %r11, %r12;
161
162	RET;
163SYM_FUNC_END(blowfish_dec_blk)
164
165/**********************************************************************
166  4-way blowfish, four blocks in parallel
167 **********************************************************************/
168
169/* F() for 4-way. Slower when used alone/1-way, but faster when used
170 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
171 *
 * x is one of RX0..RX3; "x ## bh" / "x ## bl" paste the name into the
 * matching second-byte / lowest-byte register macro.  As in F(), the four
 * bytes of the low half of x, most significant first, index s0..s3, the
 * lookups are combined as ((s0 + s1) ^ s2) + s3, the two 16-bit rotates
 * swap the halves of x, and the result is XORed into the new low half.
 * Clobbers RT0, RT1 (%rsi, i.e. RIO), RT2 and RT3.
 */
172#define F4(x) \
173	movzbl x ## bh,		RT1d; \
174	movzbl x ## bl,		RT3d; \
175	rorq $16,		x; \
176	movzbl x ## bh,		RT0d; \
177	movzbl x ## bl,		RT2d; \
178	rorq $16,		x; \
179	movl s0(CTX,RT0,4),	RT0d; \
180	addl s1(CTX,RT2,4),	RT0d; \
181	xorl s2(CTX,RT1,4),	RT0d; \
182	addl s3(CTX,RT3,4),	RT0d; \
183	xorq RT0,		x;
184
/* add_preloaded_roundkey4(): XOR the key pair held in RKEY into RX0..RX3. */
185#define add_preloaded_roundkey4() \
186	xorq RKEY,		RX0; \
187	xorq RKEY,		RX1; \
188	xorq RKEY,		RX2; \
189	xorq RKEY,		RX3;
190
/*
 * preload_roundkey_enc(n): load the 8 bytes at p + 4*n, i.e. round keys n
 * (low half) and n+1 (high half), into RKEY.
 */
191#define preload_roundkey_enc(n) \
192	movq p+4*(n)(CTX),	RKEY;
193
/*
 * add_roundkey_enc4(n): apply the key pair already in RKEY, then preload the
 * pair for n + 2.  RKEY must have been loaded for n beforehand.
 */
194#define add_roundkey_enc4(n) \
195	add_preloaded_roundkey4(); \
196	preload_roundkey_enc(n + 2);
197
/*
 * round_enc4(n): key XOR followed by two F4() steps on each of the four
 * blocks; the F4() calls of one step are issued for all blocks in turn.
 */
198#define round_enc4(n) \
199	add_roundkey_enc4(n); \
200	\
201	F4(RX0); \
202	F4(RX1); \
203	F4(RX2); \
204	F4(RX3); \
205	\
206	F4(RX0); \
207	F4(RX1); \
208	F4(RX2); \
209	F4(RX3);
210
/*
 * preload_roundkey_dec(n): load the 8 bytes at p + 4*(n-1), i.e. round keys
 * n-1 and n, into RKEY and rotate by 32 bits so key n is in the low half.
 */
211#define preload_roundkey_dec(n) \
212	movq p+4*((n)-1)(CTX),	RKEY; \
213	rorq $32,		RKEY;
214
/*
 * add_roundkey_dec4(n): apply the key pair already in RKEY, then preload the
 * pair for n - 2.  RKEY must have been loaded for n beforehand.
 */
215#define add_roundkey_dec4(n) \
216	add_preloaded_roundkey4(); \
217	preload_roundkey_dec(n - 2);
218
/* round_dec4(n): decryption counterpart of round_enc4(). */
219#define round_dec4(n) \
220	add_roundkey_dec4(n); \
221	\
222	F4(RX0); \
223	F4(RX1); \
224	F4(RX2); \
225	F4(RX3); \
226	\
227	F4(RX0); \
228	F4(RX1); \
229	F4(RX2); \
230	F4(RX3);
231
/*
 * read_block4(): load four consecutive 8-byte blocks from RIO, RIO+8, RIO+16
 * and RIO+24 into RX0..RX3.  Each is half-swapped and byte-reversed, so every
 * 4-byte word is read big-endian with the block's first word in the low half.
 */
232#define read_block4() \
233	movq (RIO),		RX0; \
234	rorq $32,		RX0; \
235	bswapq 			RX0; \
236	\
237	movq 8(RIO),		RX1; \
238	rorq $32,		RX1; \
239	bswapq 			RX1; \
240	\
241	movq 16(RIO),		RX2; \
242	rorq $32,		RX2; \
243	bswapq 			RX2; \
244	\
245	movq 24(RIO),		RX3; \
246	rorq $32,		RX3; \
247	bswapq 			RX3;
248
/*
 * write_block4(): byte-reverse RX0..RX3 and store them as four consecutive
 * 8-byte blocks starting at RIO.
 */
249#define write_block4() \
250	bswapq 			RX0; \
251	movq RX0,		(RIO); \
252	\
253	bswapq 			RX1; \
254	movq RX1,		8(RIO); \
255	\
256	bswapq 			RX2; \
257	movq RX2,		16(RIO); \
258	\
259	bswapq 			RX3; \
260	movq RX3,		24(RIO);
261
/*
 * xor_block4(): XOR the byte-reversed 8-byte values at RIO, RIO+8 and RIO+16
 * into RX1, RX2 and RX3 respectively; RX0 is left alone.  Used on the cbc
 * path of __blowfish_dec_blk_4way with RIO pointing at src, so each of the
 * last three blocks is combined with the input block preceding it.
 * Clobbers RT0, RT2 and RT3.
 */
262#define xor_block4() \
263	movq (RIO),		RT0; \
264	bswapq			RT0; \
265	xorq RT0,		RX1; \
266	\
267	movq 8(RIO),		RT2; \
268	bswapq			RT2; \
269	xorq RT2,		RX2; \
270	\
271	movq 16(RIO),		RT3; \
272	bswapq			RT3; \
273	xorq RT3,		RX3;
274
275SYM_FUNC_START(blowfish_enc_blk_4way)
276	/* input:
277	 *	%rdi: ctx
278	 *	%rsi: dst
279	 *	%rdx: src
 
280	 */
	/*
	 * Encrypts the four consecutive 8-byte blocks at src and stores them
	 * at dst.  %r12 (CTX) and %rbx (RX1) are callee-saved and are kept on
	 * the stack for the duration of the function.
	 */
281	pushq %r12;
282	pushq %rbx;
 
283
284	movq %rdi, CTX
	/* dst must be saved: %rsi doubles as RIO/RT1 and is overwritten below */
285	movq %rsi, %r11;
286	movq %rdx, RIO;
287
	/* round_enc4() expects the key pair for its round already in RKEY */
288	preload_roundkey_enc(0);
289
290	read_block4();
291
292	round_enc4(0);
293	round_enc4(2);
294	round_enc4(4);
295	round_enc4(6);
296	round_enc4(8);
297	round_enc4(10);
298	round_enc4(12);
299	round_enc4(14);
	/* round_enc4(14) preloaded round keys 16 and 17: apply them */
300	add_preloaded_roundkey4();
301
 
	/* point RIO at dst for the stores */
302	movq %r11, RIO;
 
 
 
 
303	write_block4();
304
305	popq %rbx;
306	popq %r12;
307	RET;
308SYM_FUNC_END(blowfish_enc_blk_4way)
 
 
 
 
 
 
 
309
310SYM_FUNC_START(__blowfish_dec_blk_4way)
311	/* input:
312	 *	%rdi: ctx
313	 *	%rsi: dst
314	 *	%rdx: src
315	 *	%rcx: cbc (bool)
316	 */
	/*
	 * Decrypts the four consecutive 8-byte blocks at src and stores them
	 * at dst.  When cbc is true, blocks 1..3 are additionally XORed with
	 * input blocks 0..2 (see xor_block4()) before being stored.
	 *
	 * %r12 (CTX) and %rbx (RX1) are callee-saved.  %rcx and %rdx are also
	 * pushed because they double as RX2/RX3 and are overwritten by
	 * read_block4(), while cbc and src are needed again after the rounds.
	 */
317	pushq %r12;
318	pushq %rbx;
319	pushq %rcx;
320	pushq %rdx;
321
322	movq %rdi, CTX;
	/* dst must be saved: %rsi doubles as RIO/RT1 and is overwritten below */
323	movq %rsi, %r11;
324	movq %rdx, RIO;
325
	/* round_dec4() expects the key pair for its round already in RKEY */
326	preload_roundkey_dec(17);
327	read_block4();
328
329	round_dec4(17);
330	round_dec4(15);
331	round_dec4(13);
332	round_dec4(11);
333	round_dec4(9);
334	round_dec4(7);
335	round_dec4(5);
336	round_dec4(3);
	/* round_dec4(3) preloaded round keys 1 and 0: apply them */
337	add_preloaded_roundkey4();
338
	/* RIO = saved src; CTX is no longer needed, so %r12 takes saved cbc */
339	popq RIO;
340	popq %r12;
	/*
	 * cbc is a bool: only its low byte is defined by the calling
	 * convention, so test just that byte instead of all 64 bits.
	 */
341	testb %r12b, %r12b;
342	jz .L_no_cbc_xor;
343
344	xor_block4();
345
346.L_no_cbc_xor:
	/* point RIO at dst for the stores */
347	movq %r11, RIO;
348	write_block4();
349
350	popq %rbx;
351	popq %r12;
352
353	RET;
354SYM_FUNC_END(__blowfish_dec_blk_4way)
v4.17
 
  1/*
  2 * Blowfish Cipher Algorithm (x86_64)
  3 *
  4 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
  5 *
  6 * This program is free software; you can redistribute it and/or modify
  7 * it under the terms of the GNU General Public License as published by
  8 * the Free Software Foundation; either version 2 of the License, or
  9 * (at your option) any later version.
 10 *
 11 * This program is distributed in the hope that it will be useful,
 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 * GNU General Public License for more details.
 15 *
 16 * You should have received a copy of the GNU General Public License
 17 * along with this program; if not, write to the Free Software
 18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 19 * USA
 20 *
 21 */
 22
 23#include <linux/linkage.h>
 24
 25.file "blowfish-x86_64-asm.S"
 26.text
 27
 28/* structure of crypto context */
 /*
  * Byte offsets from the context pointer (CTX).  The 4-byte round keys start
  * at p; after (16 + 2) of them come four tables s0..s3 of 256 4-byte entries
  * each, which F()/F4() index with a byte value and scale 4.
  */
 29#define p	0
 30#define s0	((16 + 2) * 4)
 31#define s1	((16 + 2 + (1 * 256)) * 4)
 32#define s2	((16 + 2 + (2 * 256)) * 4)
 33#define s3	((16 + 2 + (3 * 256)) * 4)
 34
 35/* register macros */
 /*
  * CTX: context pointer.  %r12 is callee-saved under the SysV ABI, so code
  *      that loads CTX has to save and restore the caller's %r12.
  * RIO: pointer the block load/store macros read from or write to.  It is
  *      %rsi, the same register as RT1, so F()/F4() destroy it and it must
  *      be reloaded before the result is stored.
  */
 36#define CTX %r12
 37#define RIO %rsi
 38
 /* RX0..RX3: 64-bit block state, one block per register. */
 39#define RX0 %rax
 40#define RX1 %rbx
 41#define RX2 %rcx
 42#define RX3 %rdx
 43
 /* 32-bit views of RX0..RX3. */
 44#define RX0d %eax
 45#define RX1d %ebx
 46#define RX2d %ecx
 47#define RX3d %edx
 48
 /* Lowest byte (bits 0-7) of RX0..RX3. */
 49#define RX0bl %al
 50#define RX1bl %bl
 51#define RX2bl %cl
 52#define RX3bl %dl
 53
 /* Second byte (bits 8-15) of RX0..RX3. */
 54#define RX0bh %ah
 55#define RX1bh %bh
 56#define RX2bh %ch
 57#define RX3bh %dh
 58
 /* RT0..RT3: scratch registers for table indices and the F()/F4() result. */
 59#define RT0 %rdi
 60#define RT1 %rsi
 61#define RT2 %r8
 62#define RT3 %r9
 63
 /* 32-bit views of RT0..RT3. */
 64#define RT0d %edi
 65#define RT1d %esi
 66#define RT2d %r8d
 67#define RT3d %r9d
 68
 /* RKEY: round-key pair preloaded by the 4-way macros. */
 69#define RKEY %r10
 70
 71/***********************************************************************
 72 * 1-way blowfish
 73 ***********************************************************************/
 /*
  * F(): one Feistel step on RX0, without the round-key XOR.
  * The four bytes of the low 32 bits of RX0, most significant first, index
  * s0, s1, s2 and s3; the lookups are combined as ((s0 + s1) ^ s2) + s3 in
  * 32-bit arithmetic.  The rotates add up to 32 bits, so the halves of RX0
  * trade places, and the combined value is XORed into the new low half.
  * Clobbers RT0, RT1 (%rsi, i.e. RIO) and RT2.
  */
 74#define F() \
 75	rorq $16,		RX0; \
 76	movzbl RX0bh,		RT0d; \
 77	movzbl RX0bl,		RT1d; \
 78	rolq $16,		RX0; \
 79	movl s0(CTX,RT0,4),	RT0d; \
 80	addl s1(CTX,RT1,4),	RT0d; \
 81	movzbl RX0bh,		RT1d; \
 82	movzbl RX0bl,		RT2d; \
 83	rolq $32,		RX0; \
 84	xorl s2(CTX,RT1,4),	RT0d; \
 85	addl s3(CTX,RT2,4),	RT0d; \
 86	xorq RT0,		RX0;
 87
 /*
  * add_roundkey_enc(n): XOR the 8 bytes at p + 4*n, i.e. round keys n (low
  * half) and n+1 (high half), into RX0 with a single 64-bit operation.
  */
 88#define add_roundkey_enc(n) \
 89	xorq p+4*(n)(CTX), 	RX0;
 90
 /* round_enc(n): apply round keys n and n+1, then run two F() steps. */
 91#define round_enc(n) \
 92	add_roundkey_enc(n); \
 93	\
 94	F(); \
 95	F();
 96
 /*
  * add_roundkey_dec(n): load the 8 bytes at p + 4*(n-1), i.e. round keys n-1
  * (low half) and n (high half), rotate by 32 bits so that key n ends up in
  * the low half, and XOR the pair into RX0.  Clobbers RT0.
  * The argument is parenthesised like in preload_roundkey_dec().
  */
 97#define add_roundkey_dec(n) \
 98	movq p+4*((n)-1)(CTX),	RT0; \
 99	rorq $32,		RT0; \
100	xorq RT0,		RX0;
101
 /*
  * round_dec(n): apply round keys n and n-1, then run two F() steps.
  * The last line carries no line continuation, so the macro ends here and
  * does not depend on the following source line being empty.
  */
102#define round_dec(n) \
103	add_roundkey_dec(n); \
104	\
105	F(); \
106	F();
107
/*
 * read_block(): load 8 bytes from (RIO) into RX0, swap the 32-bit halves and
 * reverse all eight bytes.  Each 4-byte word is thereby read big-endian, with
 * the first word of the block in the low half of RX0.
 */
108#define read_block() \
109	movq (RIO), 		RX0; \
110	rorq $32, 		RX0; \
111	bswapq 			RX0;
112
/*
 * write_block(): reverse the bytes of RX0 and store the 8 bytes at (RIO), so
 * the high half of RX0 is written first and both words are big-endian.
 */
113#define write_block() \
114	bswapq 			RX0; \
115	movq RX0, 		(RIO);
116
/*
 * xor_block(): like write_block(), but XOR the byte-reversed RX0 into the
 * 8 bytes already stored at (RIO) instead of overwriting them.
 */
117#define xor_block() \
118	bswapq 			RX0; \
119	xorq RX0, 		(RIO);
120
121ENTRY(__blowfish_enc_blk)
122	/* input:
123	 *	%rdi: ctx
124	 *	%rsi: dst
125	 *	%rdx: src
126	 *	%rcx: bool, if true: xor output
127	 */
	/*
	 * Encrypts the 8-byte block at src.  The result overwrites dst, or is
	 * XORed into the 8 bytes already at dst when the flag in %rcx is true.
	 * Uses no stack: the caller's %r12 is parked in %r11 and dst in %r10.
	 */
128	movq %r12, %r11;
129
130	movq %rdi, CTX;
	/* dst must be saved: %rsi doubles as RIO/RT1 and is overwritten below */
131	movq %rsi, %r10;
132	movq %rdx, RIO;
133
134	read_block();
135
	/* eight double rounds consume round keys 0..15 */
136	round_enc(0);
137	round_enc(2);
138	round_enc(4);
139	round_enc(6);
140	round_enc(8);
141	round_enc(10);
142	round_enc(12);
143	round_enc(14);
	/* final XOR with round keys 16 and 17 */
144	add_roundkey_enc(16);
145
	/* CTX is no longer needed: give the caller's %r12 back */
146	movq %r11, %r12;
147
	/* point RIO at dst for the store */
148	movq %r10, RIO;
	/* %cl still holds the flag: the 1-way macros never write %rcx */
149	test %cl, %cl;
150	jnz .L__enc_xor;
151
152	write_block();
153	ret;
154.L__enc_xor:
155	xor_block();
156	ret;
157ENDPROC(__blowfish_enc_blk)
158
159ENTRY(blowfish_dec_blk)
160	/* input:
161	 *	%rdi: ctx
162	 *	%rsi: dst
163	 *	%rdx: src
164	 */
	/*
	 * Decrypts the 8-byte block at src and stores the result at dst.
	 * Uses no stack: the caller's %r12 is parked in %r11 and dst in %r10.
	 * Writes %rax, %rdi, %rsi, %r8, %r10 and %r11.
	 */
165	movq %r12, %r11;
166
167	movq %rdi, CTX;
	/* dst must be saved: %rsi doubles as RIO/RT1 and is overwritten below */
168	movq %rsi, %r10;
169	movq %rdx, RIO;
170
171	read_block();
172
	/* eight double rounds consume round keys 17..2 in descending order */
173	round_dec(17);
174	round_dec(15);
175	round_dec(13);
176	round_dec(11);
177	round_dec(9);
178	round_dec(7);
179	round_dec(5);
180	round_dec(3);
	/* final XOR with round keys 1 and 0 */
181	add_roundkey_dec(1);
182
	/* point RIO at dst for the store */
183	movq %r10, RIO;
184	write_block();
185
	/* restore the caller's %r12 */
186	movq %r11, %r12;
187
188	ret;
189ENDPROC(blowfish_dec_blk)
190
191/**********************************************************************
192  4-way blowfish, four blocks in parallel
193 **********************************************************************/
194
195/* F() for 4-way. Slower when used alone/1-way, but faster when used
196 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
197 *
 * x is one of RX0..RX3; "x ## bh" / "x ## bl" paste the name into the
 * matching second-byte / lowest-byte register macro.  As in F(), the four
 * bytes of the low half of x, most significant first, index s0..s3, the
 * lookups are combined as ((s0 + s1) ^ s2) + s3, the two 16-bit rotates
 * swap the halves of x, and the result is XORed into the new low half.
 * Clobbers RT0, RT1 (%rsi, i.e. RIO), RT2 and RT3.
 */
198#define F4(x) \
199	movzbl x ## bh,		RT1d; \
200	movzbl x ## bl,		RT3d; \
201	rorq $16,		x; \
202	movzbl x ## bh,		RT0d; \
203	movzbl x ## bl,		RT2d; \
204	rorq $16,		x; \
205	movl s0(CTX,RT0,4),	RT0d; \
206	addl s1(CTX,RT2,4),	RT0d; \
207	xorl s2(CTX,RT1,4),	RT0d; \
208	addl s3(CTX,RT3,4),	RT0d; \
209	xorq RT0,		x;
210
/* add_preloaded_roundkey4(): XOR the key pair held in RKEY into RX0..RX3. */
211#define add_preloaded_roundkey4() \
212	xorq RKEY,		RX0; \
213	xorq RKEY,		RX1; \
214	xorq RKEY,		RX2; \
215	xorq RKEY,		RX3;
216
/*
 * preload_roundkey_enc(n): load the 8 bytes at p + 4*n, i.e. round keys n
 * (low half) and n+1 (high half), into RKEY.
 */
217#define preload_roundkey_enc(n) \
218	movq p+4*(n)(CTX),	RKEY;
219
/*
 * add_roundkey_enc4(n): apply the key pair already in RKEY, then preload the
 * pair for n + 2.  RKEY must have been loaded for n beforehand.
 */
220#define add_roundkey_enc4(n) \
221	add_preloaded_roundkey4(); \
222	preload_roundkey_enc(n + 2);
223
/*
 * round_enc4(n): key XOR followed by two F4() steps on each of the four
 * blocks; the F4() calls of one step are issued for all blocks in turn.
 */
224#define round_enc4(n) \
225	add_roundkey_enc4(n); \
226	\
227	F4(RX0); \
228	F4(RX1); \
229	F4(RX2); \
230	F4(RX3); \
231	\
232	F4(RX0); \
233	F4(RX1); \
234	F4(RX2); \
235	F4(RX3);
236
/*
 * preload_roundkey_dec(n): load the 8 bytes at p + 4*(n-1), i.e. round keys
 * n-1 and n, into RKEY and rotate by 32 bits so key n is in the low half.
 */
237#define preload_roundkey_dec(n) \
238	movq p+4*((n)-1)(CTX),	RKEY; \
239	rorq $32,		RKEY;
240
/*
 * add_roundkey_dec4(n): apply the key pair already in RKEY, then preload the
 * pair for n - 2.  RKEY must have been loaded for n beforehand.
 */
241#define add_roundkey_dec4(n) \
242	add_preloaded_roundkey4(); \
243	preload_roundkey_dec(n - 2);
244
/* round_dec4(n): decryption counterpart of round_enc4(). */
245#define round_dec4(n) \
246	add_roundkey_dec4(n); \
247	\
248	F4(RX0); \
249	F4(RX1); \
250	F4(RX2); \
251	F4(RX3); \
252	\
253	F4(RX0); \
254	F4(RX1); \
255	F4(RX2); \
256	F4(RX3);
257
/*
 * read_block4(): load four consecutive 8-byte blocks from RIO, RIO+8, RIO+16
 * and RIO+24 into RX0..RX3.  Each is half-swapped and byte-reversed, so every
 * 4-byte word is read big-endian with the block's first word in the low half.
 */
258#define read_block4() \
259	movq (RIO),		RX0; \
260	rorq $32,		RX0; \
261	bswapq 			RX0; \
262	\
263	movq 8(RIO),		RX1; \
264	rorq $32,		RX1; \
265	bswapq 			RX1; \
266	\
267	movq 16(RIO),		RX2; \
268	rorq $32,		RX2; \
269	bswapq 			RX2; \
270	\
271	movq 24(RIO),		RX3; \
272	rorq $32,		RX3; \
273	bswapq 			RX3;
274
/*
 * write_block4(): byte-reverse RX0..RX3 and store them as four consecutive
 * 8-byte blocks starting at RIO.
 */
275#define write_block4() \
276	bswapq 			RX0; \
277	movq RX0,		(RIO); \
278	\
279	bswapq 			RX1; \
280	movq RX1,		8(RIO); \
281	\
282	bswapq 			RX2; \
283	movq RX2,		16(RIO); \
284	\
285	bswapq 			RX3; \
286	movq RX3,		24(RIO);
287
/*
 * xor_block4(): like write_block4(), but XOR the byte-reversed RX0..RX3 into
 * the four 8-byte blocks already stored at RIO instead of overwriting them.
 */
288#define xor_block4() \
289	bswapq 			RX0; \
290	xorq RX0,		(RIO); \
291	\
292	bswapq 			RX1; \
293	xorq RX1,		8(RIO); \
294	\
295	bswapq 			RX2; \
296	xorq RX2,		16(RIO); \
297	\
298	bswapq 			RX3; \
299	xorq RX3,		24(RIO);
300
301ENTRY(__blowfish_enc_blk_4way)
302	/* input:
303	 *	%rdi: ctx
304	 *	%rsi: dst
305	 *	%rdx: src
306	 *	%rcx: bool, if true: xor output
307	 */
	/*
	 * Encrypts the four consecutive 8-byte blocks at src.  The results
	 * overwrite dst, or are XORed into the data already at dst when the
	 * flag in %rcx is true.
	 *
	 * %r12 (CTX) and %rbx (RX1) are callee-saved.  %rcx is pushed as well
	 * because it doubles as RX2 and is overwritten by read_block4().
	 */
308	pushq %r12;
309	pushq %rbx;
310	pushq %rcx;
311
312	movq %rdi, CTX
	/* dst must be saved: %rsi doubles as RIO/RT1 and is overwritten below */
313	movq %rsi, %r11;
314	movq %rdx, RIO;
315
	/* round_enc4() expects the key pair for its round already in RKEY */
316	preload_roundkey_enc(0);
317
318	read_block4();
319
320	round_enc4(0);
321	round_enc4(2);
322	round_enc4(4);
323	round_enc4(6);
324	round_enc4(8);
325	round_enc4(10);
326	round_enc4(12);
327	round_enc4(14);
	/* round_enc4(14) preloaded round keys 16 and 17: apply them */
328	add_preloaded_roundkey4();
329
	/* CTX is no longer needed, so %r12 receives the saved xor flag */
330	popq %r12;
	/* point RIO at dst for the stores */
331	movq %r11, RIO;
332
	/* only the low byte of the bool flag is examined */
333	test %r12b, %r12b;
334	jnz .L__enc_xor4;
335
336	write_block4();
337
338	popq %rbx;
339	popq %r12;
340	ret;
341
342.L__enc_xor4:
343	xor_block4();
344
345	popq %rbx;
346	popq %r12;
347	ret;
348ENDPROC(__blowfish_enc_blk_4way)
349
350ENTRY(blowfish_dec_blk_4way)
351	/* input:
352	 *	%rdi: ctx
353	 *	%rsi: dst
354	 *	%rdx: src
 
355	 */
	/*
	 * Decrypts the four consecutive 8-byte blocks at src and stores them
	 * at dst.  %r12 (CTX) and %rbx (RX1) are callee-saved and are kept on
	 * the stack for the duration of the function.
	 */
356	pushq %r12;
357	pushq %rbx;
 
 
358
359	movq %rdi, CTX;
	/* dst must be saved: %rsi doubles as RIO/RT1 and is overwritten below */
360	movq %rsi, %r11
361	movq %rdx, RIO;
362
	/* round_dec4() expects the key pair for its round already in RKEY */
363	preload_roundkey_dec(17);
364	read_block4();
365
366	round_dec4(17);
367	round_dec4(15);
368	round_dec4(13);
369	round_dec4(11);
370	round_dec4(9);
371	round_dec4(7);
372	round_dec4(5);
373	round_dec4(3);
	/* round_dec4(3) preloaded round keys 1 and 0: apply them */
374	add_preloaded_roundkey4();
375
 
 
 
 
 
 
 
 
	/* point RIO at dst for the stores */
376	movq %r11, RIO;
377	write_block4();
378
379	popq %rbx;
380	popq %r12;
381
382	ret;
383ENDPROC(blowfish_dec_blk_4way)