Linux Audio

Check our new training course

Yocto distribution development and maintenance

Need a Yocto distribution for your embedded project?
Loading...
v3.5.6
 
  1/*
  2 * Blowfish Cipher Algorithm (x86_64)
  3 *
  4 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
  5 *
  6 * This program is free software; you can redistribute it and/or modify
  7 * it under the terms of the GNU General Public License as published by
  8 * the Free Software Foundation; either version 2 of the License, or
  9 * (at your option) any later version.
 10 *
 11 * This program is distributed in the hope that it will be useful,
 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 * GNU General Public License for more details.
 15 *
 16 * You should have received a copy of the GNU General Public License
 17 * along with this program; if not, write to the Free Software
 18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 19 * USA
 20 *
 21 */
 22
 
 
 23.file "blowfish-x86_64-asm.S"
 24.text
 25
 26/* structure of crypto context */
/*
 * Byte offsets from CTX.  The round-key array p starts at offset 0 and
 * occupies (16 + 2) 4-byte entries; the lookup tables s0..s3 follow it
 * and are spaced 256 4-byte entries apart.
 */
 27#define p	0
 28#define s0	((16 + 2) * 4)
 29#define s1	((16 + 2 + (1 * 256)) * 4)
 30#define s2	((16 + 2 + (2 * 256)) * 4)
 31#define s3	((16 + 2 + (3 * 256)) * 4)
 32
 33/* register macros */
/*
 * CTX  - base of the crypto context; %rdi, i.e. used as passed in.
 * RIO  - pointer the block load/store macros read from and write to.
 * RXn  - block registers; the d/bl/bh names are the low 32 bits,
 *        bits 0-7 and bits 8-15 of the same register.
 * RTn  - scratch registers for table indices and the F()/F4() result.
 * RKEY - preloaded round-key pair used by the 4-way code.
 *
 * RIO and RT1 are both %rsi, so RIO must be reloaded after any F()/F4()
 * before it is used as a pointer again.  RT0 is %rbp, which is why every
 * entry point in this file saves and restores %rbp.
 */
 34#define CTX %rdi
 35#define RIO %rsi
 36
 37#define RX0 %rax
 38#define RX1 %rbx
 39#define RX2 %rcx
 40#define RX3 %rdx
 41
 42#define RX0d %eax
 43#define RX1d %ebx
 44#define RX2d %ecx
 45#define RX3d %edx
 46
 47#define RX0bl %al
 48#define RX1bl %bl
 49#define RX2bl %cl
 50#define RX3bl %dl
 51
 52#define RX0bh %ah
 53#define RX1bh %bh
 54#define RX2bh %ch
 55#define RX3bh %dh
 56
 57#define RT0 %rbp
 58#define RT1 %rsi
 59#define RT2 %r8
 60#define RT3 %r9
 61
 62#define RT0d %ebp
 63#define RT1d %esi
 64#define RT2d %r8d
 65#define RT3d %r9d
 66
 67#define RKEY %r10
 68
 69/***********************************************************************
 70 * 1-way blowfish
 71 ***********************************************************************/
/*
 * F(): one round step on the single block held in RX0.
 *
 * The four bytes of the low 32 bits of RX0 index the tables, most
 * significant byte into s0 and least significant into s3, and the entries
 * are combined in RT0d as ((s0[a] + s1[b]) ^ s2[c]) + s3[d].  The 16-bit
 * rotate pair only exposes the upper two bytes through %ah/%al and undoes
 * itself; the rolq $32 swaps the halves of RX0, so the final xorq applies
 * the value to the half that was the upper one on entry.  RT0d is written
 * with 32-bit operations, so the upper half of RT0 is zero and the xorq
 * leaves the other half of RX0 untouched.
 *
 * Clobbers RT0, RT1, RT2 and the flags.
 */
 72#define F() \
 73	rorq $16,		RX0; \
 74	movzbl RX0bh,		RT0d; \
 75	movzbl RX0bl,		RT1d; \
 76	rolq $16,		RX0; \
 77	movl s0(CTX,RT0,4),	RT0d; \
 78	addl s1(CTX,RT1,4),	RT0d; \
 79	movzbl RX0bh,		RT1d; \
 80	movzbl RX0bl,		RT2d; \
 81	rolq $32,		RX0; \
 82	xorl s2(CTX,RT1,4),	RT0d; \
 83	addl s3(CTX,RT2,4),	RT0d; \
 84	xorq RT0,		RX0;
 85
/*
 * add_roundkey_enc(n): XOR the 8 bytes at p + 4*n into RX0, i.e. entry
 * p[n] into the low half and p[n+1] into the high half.
 *
 * round_enc(n): apply that key pair, then run F() twice.
 */
 86#define add_roundkey_enc(n) \
 87	xorq p+4*(n)(CTX), 	RX0;
 88
 89#define round_enc(n) \
 90	add_roundkey_enc(n); \
 91	\
 92	F(); \
 93	F();
 94
/*
 * add_roundkey_dec(n): load the 8 bytes at p + 4*(n-1), swap the two
 * 32-bit halves and XOR the result into RX0, i.e. entry p[n] goes into
 * the low half and p[n-1] into the high half.  Clobbers RT0.
 *
 * The argument is parenthesized, as in preload_roundkey_dec(), so that an
 * expression argument cannot change the meaning of the subtraction.
 */
 95#define add_roundkey_dec(n) \
 96	movq p+4*((n)-1)(CTX),	RT0; \
 97	rorq $32,		RT0; \
 98	xorq RT0,		RX0;
 99
/*
 * round_dec(n): apply the decryption key pair, then run F() twice.
 * The macro ends on the last F(); there is no trailing line continuation,
 * so whatever follows the definition is never spliced into it.
 */
100#define round_dec(n) \
101	add_roundkey_dec(n); \
102	\
103	F(); \
104	F();
105
/*
 * read_block(): load the 8 bytes at RIO into RX0, swap the 32-bit halves
 * and byte-reverse the register.  Afterwards the first 4 input bytes,
 * read as a big-endian word, sit in the low half of RX0 and the last 4 in
 * the high half.
 *
 * write_block(): byte-reverse RX0 and store it at RIO, which writes the
 * high half first and the low half second, each as a big-endian word.
 *
 * xor_block(): same byte order as write_block(), but XORs the value into
 * the 8 bytes at RIO instead of overwriting them.
 */
106#define read_block() \
107	movq (RIO), 		RX0; \
108	rorq $32, 		RX0; \
109	bswapq 			RX0;
110
111#define write_block() \
112	bswapq 			RX0; \
113	movq RX0, 		(RIO);
114
115#define xor_block() \
116	bswapq 			RX0; \
117	xorq RX0, 		(RIO);
118
119.align 8
120.global __blowfish_enc_blk
121.type   __blowfish_enc_blk,@function;
122
/*
 * __blowfish_enc_blk: encrypt the 8-byte block at src with the 1-way code
 * and either store the result at dst or XOR it into dst, depending on the
 * low byte of %rcx.
 *
 * RT0 is %rbp, so the caller's %rbp is parked in %r11 while the rounds
 * run.  dst is parked in %r10 because RIO (%rsi) is also RT1 and gets
 * overwritten by F().  None of the 1-way macros touch %rcx, so the flag
 * is still intact when it is tested.
 *
 * Registers written: %rax, %rsi, %r8, %r10, %r11, flags (%rbp restored).
 */
123__blowfish_enc_blk:
124	/* input:
125	 *	%rdi: ctx, CTX
126	 *	%rsi: dst
127	 *	%rdx: src
128	 *	%rcx: bool, if true: xor output
129	 */
130	movq %rbp, %r11;
131
 
132	movq %rsi, %r10;
133	movq %rdx, RIO;
134
135	read_block();
136
137	round_enc(0);
138	round_enc(2);
139	round_enc(4);
140	round_enc(6);
141	round_enc(8);
142	round_enc(10);
143	round_enc(12);
144	round_enc(14);
145	add_roundkey_enc(16);
146
147	movq %r11, %rbp;
148
149	movq %r10, RIO;
150	test %cl, %cl;
151	jnz .L__enc_xor;
152
153	write_block();
154	ret;
	/* .L prefix: branch target only, kept out of the object's symbol table */
155.L__enc_xor:
156	xor_block();
157	ret;
/* record the function's extent to go with the .type directive above */
.size __blowfish_enc_blk, .-__blowfish_enc_blk
 
158
159.align 8
160.global blowfish_dec_blk
161.type   blowfish_dec_blk,@function;
162
/*
 * blowfish_dec_blk: decrypt the 8-byte block at src with the 1-way code
 * and store the result at dst.
 *
 * RT0 is %rbp, so the caller's %rbp is parked in %r11 until the end.
 * dst is parked in %r10 because RIO (%rsi) is also RT1 and gets
 * overwritten by F().
 *
 * Registers written: %rax, %rsi, %r8, %r10, %r11, flags (%rbp restored).
 */
163blowfish_dec_blk:
164	/* input:
165	 *	%rdi: ctx, CTX
166	 *	%rsi: dst
167	 *	%rdx: src
168	 */
169	movq %rbp, %r11;
170
 
171	movq %rsi, %r10;
172	movq %rdx, RIO;
173
174	read_block();
175
176	round_dec(17);
177	round_dec(15);
178	round_dec(13);
179	round_dec(11);
180	round_dec(9);
181	round_dec(7);
182	round_dec(5);
183	round_dec(3);
184	add_roundkey_dec(1);
185
186	movq %r10, RIO;
187	write_block();
188
189	movq %r11, %rbp;
190
191	ret;
/* record the function's extent to go with the .type directive above */
.size blowfish_dec_blk, .-blowfish_dec_blk
 
192
193/**********************************************************************
194 * 4-way blowfish, four blocks in parallel
195 **********************************************************************/
196
197/* F() for 4-way. Slower when used alone/1-way, but faster when used
198 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
199 */
/*
 * F4(x): same table combination as F(), applied to block register x
 * (one of RX0..RX3; the ## pastes pick that register's bl/bh names).
 *
 * The indices for s2 and s3 are taken from bits 8-15 and 0-7 before the
 * first rotate, those for s0 and s1 from the same positions after it.
 * The two 16-bit rotates add up to a 32-bit rotate, so x ends up with its
 * halves swapped and the xorq applies ((s0 + s1) ^ s2) + s3 to the half
 * that was the upper one on entry.
 *
 * Clobbers RT0, RT1, RT2, RT3 and the flags.
 */
200#define F4(x) \
201	movzbl x ## bh,		RT1d; \
202	movzbl x ## bl,		RT3d; \
203	rorq $16,		x; \
204	movzbl x ## bh,		RT0d; \
205	movzbl x ## bl,		RT2d; \
206	rorq $16,		x; \
207	movl s0(CTX,RT0,4),	RT0d; \
208	addl s1(CTX,RT2,4),	RT0d; \
209	xorl s2(CTX,RT1,4),	RT0d; \
210	addl s3(CTX,RT3,4),	RT0d; \
211	xorq RT0,		x;
212
/*
 * 4-way encryption key schedule helpers.
 *
 * add_preloaded_roundkey4(): XOR the key pair held in RKEY into all four
 * block registers.
 * preload_roundkey_enc(n): load the 8 bytes at p + 4*n into RKEY.
 * add_roundkey_enc4(n): apply the pair already in RKEY, then preload the
 * pair for n + 2, so the load for the next round is issued ahead of the
 * F4() work.  The caller must have preloaded the pair for n beforehand.
 * round_enc4(n): key addition followed by two F4() passes over
 * RX0..RX3, one register at a time within each pass.
 */
213#define add_preloaded_roundkey4() \
214	xorq RKEY,		RX0; \
215	xorq RKEY,		RX1; \
216	xorq RKEY,		RX2; \
217	xorq RKEY,		RX3;
218
219#define preload_roundkey_enc(n) \
220	movq p+4*(n)(CTX),	RKEY;
221
222#define add_roundkey_enc4(n) \
223	add_preloaded_roundkey4(); \
224	preload_roundkey_enc(n + 2);
225
226#define round_enc4(n) \
227	add_roundkey_enc4(n); \
228	\
229	F4(RX0); \
230	F4(RX1); \
231	F4(RX2); \
232	F4(RX3); \
233	\
234	F4(RX0); \
235	F4(RX1); \
236	F4(RX2); \
237	F4(RX3);
238
/*
 * 4-way decryption key schedule helpers.
 *
 * preload_roundkey_dec(n): load the 8 bytes at p + 4*(n-1) into RKEY and
 * swap its 32-bit halves, so entry p[n] is in the low half and p[n-1] in
 * the high half.
 * add_roundkey_dec4(n): apply the pair already in RKEY to RX0..RX3, then
 * preload the pair for n - 2.  The caller must have preloaded the pair
 * for n beforehand.
 * round_dec4(n): key addition followed by two F4() passes over RX0..RX3.
 */
239#define preload_roundkey_dec(n) \
240	movq p+4*((n)-1)(CTX),	RKEY; \
241	rorq $32,		RKEY;
242
243#define add_roundkey_dec4(n) \
244	add_preloaded_roundkey4(); \
245	preload_roundkey_dec(n - 2);
246
247#define round_dec4(n) \
248	add_roundkey_dec4(n); \
249	\
250	F4(RX0); \
251	F4(RX1); \
252	F4(RX2); \
253	F4(RX3); \
254	\
255	F4(RX0); \
256	F4(RX1); \
257	F4(RX2); \
258	F4(RX3);
259
/*
 * 4-way block I/O.  Each macro handles four consecutive 8-byte blocks at
 * RIO + 0, 8, 16 and 24, using RX0..RX3 in that order.
 *
 * read_block4(): per block, load 8 bytes, swap the 32-bit halves and
 * byte-reverse the register.
 * write_block4(): per block, byte-reverse the register and store it.
 * xor_block4(): per block, byte-reverse the register and XOR it into the
 * 8 bytes already in memory.
 */
260#define read_block4() \
261	movq (RIO),		RX0; \
262	rorq $32,		RX0; \
263	bswapq 			RX0; \
264	\
265	movq 8(RIO),		RX1; \
266	rorq $32,		RX1; \
267	bswapq 			RX1; \
268	\
269	movq 16(RIO),		RX2; \
270	rorq $32,		RX2; \
271	bswapq 			RX2; \
272	\
273	movq 24(RIO),		RX3; \
274	rorq $32,		RX3; \
275	bswapq 			RX3;
276
277#define write_block4() \
278	bswapq 			RX0; \
279	movq RX0,		(RIO); \
280	\
281	bswapq 			RX1; \
282	movq RX1,		8(RIO); \
283	\
284	bswapq 			RX2; \
285	movq RX2,		16(RIO); \
286	\
287	bswapq 			RX3; \
288	movq RX3,		24(RIO);
289
290#define xor_block4() \
291	bswapq 			RX0; \
292	xorq RX0,		(RIO); \
293	\
294	bswapq 			RX1; \
295	xorq RX1,		8(RIO); \
296	\
297	bswapq 			RX2; \
298	xorq RX2,		16(RIO); \
299	\
300	bswapq 			RX3; \
301	xorq RX3,		24(RIO);
302
303.align 8
304.global __blowfish_enc_blk_4way
305.type   __blowfish_enc_blk_4way,@function;
306
/*
 * __blowfish_enc_blk_4way: encrypt the four consecutive 8-byte blocks at
 * src and either store the results at dst or XOR them into dst, depending
 * on the low byte of %rcx.
 *
 * %rbp (RT0) and %rbx (RX1) are pushed and restored.  %rcx is pushed as
 * well because it doubles as RX2; the saved flag is popped into %rbp,
 * which is free once the rounds are done, and tested there.  dst is kept
 * in %r11 because RIO (%rsi) is also RT1 and is overwritten by F4().
 */
307__blowfish_enc_blk_4way:
308	/* input:
309	 *	%rdi: ctx, CTX
310	 *	%rsi: dst
311	 *	%rdx: src
312	 *	%rcx: bool, if true: xor output
313	 */
314	pushq %rbp;
315	pushq %rbx;
316	pushq %rcx;
317
318	preload_roundkey_enc(0);
319
320	movq %rsi, %r11;
321	movq %rdx, RIO;
322
 
 
323	read_block4();
324
325	round_enc4(0);
326	round_enc4(2);
327	round_enc4(4);
328	round_enc4(6);
329	round_enc4(8);
330	round_enc4(10);
331	round_enc4(12);
332	round_enc4(14);
333	add_preloaded_roundkey4();
334
	/* pops the saved %rcx (xor flag), not the caller's %rbp */
335	popq %rbp;
336	movq %r11, RIO;
337
338	test %bpl, %bpl;
339	jnz .L__enc_xor4;
340
341	write_block4();
342
343	popq %rbx;
344	popq %rbp;
345	ret;
346
	/* .L prefix: branch target only, kept out of the object's symbol table */
347.L__enc_xor4:
348	xor_block4();
349
350	popq %rbx;
351	popq %rbp;
352	ret;
/* record the function's extent to go with the .type directive above */
.size __blowfish_enc_blk_4way, .-__blowfish_enc_blk_4way
 
353
354.align 8
355.global blowfish_dec_blk_4way
356.type   blowfish_dec_blk_4way,@function;
357
/*
 * blowfish_dec_blk_4way: decrypt the four consecutive 8-byte blocks at
 * src and store the results at dst.
 *
 * %rbp (RT0) and %rbx (RX1) are pushed and restored.  dst is kept in
 * %r11 because RIO (%rsi) is also RT1 and is overwritten by F4().
 */
358blowfish_dec_blk_4way:
359	/* input:
360	 *	%rdi: ctx, CTX
361	 *	%rsi: dst
362	 *	%rdx: src
363	 */
364	pushq %rbp;
365	pushq %rbx;
366	preload_roundkey_dec(17);
367
368	movq %rsi, %r11;
 
369	movq %rdx, RIO;
370
 
371	read_block4();
372
373	round_dec4(17);
374	round_dec4(15);
375	round_dec4(13);
376	round_dec4(11);
377	round_dec4(9);
378	round_dec4(7);
379	round_dec4(5);
380	round_dec4(3);
381	add_preloaded_roundkey4();
382
383	movq %r11, RIO;
384	write_block4();
385
386	popq %rbx;
387	popq %rbp;
388
389	ret;
/* record the function's extent to go with the .type directive above */
.size blowfish_dec_blk_4way, .-blowfish_dec_blk_4way
390
v5.14.15
  1/* SPDX-License-Identifier: GPL-2.0-or-later */
  2/*
  3 * Blowfish Cipher Algorithm (x86_64)
  4 *
  5 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  6 */
  7
  8#include <linux/linkage.h>
  9
 10.file "blowfish-x86_64-asm.S"
 11.text
 12
 13/* structure of crypto context */
/*
 * Byte offsets from CTX.  The round-key array p starts at offset 0 and
 * occupies (16 + 2) 4-byte entries; the lookup tables s0..s3 follow it
 * and are spaced 256 4-byte entries apart.
 */
 14#define p	0
 15#define s0	((16 + 2) * 4)
 16#define s1	((16 + 2 + (1 * 256)) * 4)
 17#define s2	((16 + 2 + (2 * 256)) * 4)
 18#define s3	((16 + 2 + (3 * 256)) * 4)
 19
 20/* register macros */
/*
 * CTX  - base of the crypto context; %r12, so each entry point saves
 *        %r12 and copies the ctx argument from %rdi into it.
 * RIO  - pointer the block load/store macros read from and write to.
 * RXn  - block registers; the d/bl/bh names are the low 32 bits,
 *        bits 0-7 and bits 8-15 of the same register.
 * RTn  - scratch registers for table indices and the F()/F4() result.
 * RKEY - preloaded round-key pair used by the 4-way code.
 *
 * RIO and RT1 are both %rsi, so RIO must be reloaded after any F()/F4()
 * before it is used as a pointer again.  RT0 is %rdi, so the incoming
 * ctx argument is gone after the first F()/F4().
 */
 21#define CTX %r12
 22#define RIO %rsi
 23
 24#define RX0 %rax
 25#define RX1 %rbx
 26#define RX2 %rcx
 27#define RX3 %rdx
 28
 29#define RX0d %eax
 30#define RX1d %ebx
 31#define RX2d %ecx
 32#define RX3d %edx
 33
 34#define RX0bl %al
 35#define RX1bl %bl
 36#define RX2bl %cl
 37#define RX3bl %dl
 38
 39#define RX0bh %ah
 40#define RX1bh %bh
 41#define RX2bh %ch
 42#define RX3bh %dh
 43
 44#define RT0 %rdi
 45#define RT1 %rsi
 46#define RT2 %r8
 47#define RT3 %r9
 48
 49#define RT0d %edi
 50#define RT1d %esi
 51#define RT2d %r8d
 52#define RT3d %r9d
 53
 54#define RKEY %r10
 55
 56/***********************************************************************
 57 * 1-way blowfish
 58 ***********************************************************************/
/*
 * F(): one round step on the single block held in RX0.
 *
 * The four bytes of the low 32 bits of RX0 index the tables, most
 * significant byte into s0 and least significant into s3, and the entries
 * are combined in RT0d as ((s0[a] + s1[b]) ^ s2[c]) + s3[d].  The 16-bit
 * rotate pair only exposes the upper two bytes through %ah/%al and undoes
 * itself; the rolq $32 swaps the halves of RX0, so the final xorq applies
 * the value to the half that was the upper one on entry.  RT0d is written
 * with 32-bit operations, so the upper half of RT0 is zero and the xorq
 * leaves the other half of RX0 untouched.
 *
 * Clobbers RT0, RT1, RT2 and the flags.
 */
 59#define F() \
 60	rorq $16,		RX0; \
 61	movzbl RX0bh,		RT0d; \
 62	movzbl RX0bl,		RT1d; \
 63	rolq $16,		RX0; \
 64	movl s0(CTX,RT0,4),	RT0d; \
 65	addl s1(CTX,RT1,4),	RT0d; \
 66	movzbl RX0bh,		RT1d; \
 67	movzbl RX0bl,		RT2d; \
 68	rolq $32,		RX0; \
 69	xorl s2(CTX,RT1,4),	RT0d; \
 70	addl s3(CTX,RT2,4),	RT0d; \
 71	xorq RT0,		RX0;
 72
/*
 * add_roundkey_enc(n): XOR the 8 bytes at p + 4*n into RX0, i.e. entry
 * p[n] into the low half and p[n+1] into the high half.
 *
 * round_enc(n): apply that key pair, then run F() twice.
 */
 73#define add_roundkey_enc(n) \
 74	xorq p+4*(n)(CTX), 	RX0;
 75
 76#define round_enc(n) \
 77	add_roundkey_enc(n); \
 78	\
 79	F(); \
 80	F();
 81
/*
 * add_roundkey_dec(n): load the 8 bytes at p + 4*(n-1), swap the two
 * 32-bit halves and XOR the result into RX0, i.e. entry p[n] goes into
 * the low half and p[n-1] into the high half.  Clobbers RT0.
 *
 * The argument is parenthesized, as in preload_roundkey_dec(), so that an
 * expression argument cannot change the meaning of the subtraction.
 */
 82#define add_roundkey_dec(n) \
 83	movq p+4*((n)-1)(CTX),	RT0; \
 84	rorq $32,		RT0; \
 85	xorq RT0,		RX0;
 86
/*
 * round_dec(n): apply the decryption key pair, then run F() twice.
 * The macro ends on the last F(); there is no trailing line continuation,
 * so whatever follows the definition is never spliced into it.
 */
 87#define round_dec(n) \
 88	add_roundkey_dec(n); \
 89	\
 90	F(); \
 91	F();
 92
/*
 * read_block(): load the 8 bytes at RIO into RX0, swap the 32-bit halves
 * and byte-reverse the register.  Afterwards the first 4 input bytes,
 * read as a big-endian word, sit in the low half of RX0 and the last 4 in
 * the high half.
 *
 * write_block(): byte-reverse RX0 and store it at RIO, which writes the
 * high half first and the low half second, each as a big-endian word.
 *
 * xor_block(): same byte order as write_block(), but XORs the value into
 * the 8 bytes at RIO instead of overwriting them.
 */
 93#define read_block() \
 94	movq (RIO), 		RX0; \
 95	rorq $32, 		RX0; \
 96	bswapq 			RX0;
 97
 98#define write_block() \
 99	bswapq 			RX0; \
100	movq RX0, 		(RIO);
101
102#define xor_block() \
103	bswapq 			RX0; \
104	xorq RX0, 		(RIO);
105
106SYM_FUNC_START(__blowfish_enc_blk)
 
 
 
 
/*
 * Encrypt the 8-byte block at src with the 1-way code and either store
 * the result at dst or XOR it into dst, depending on the low byte of %rcx.
 *
 * CTX is %r12, so the caller's %r12 is parked in %r11 while the rounds
 * run and ctx is copied over from %rdi.  dst is parked in %r10 because
 * RIO (%rsi) is also RT1 and gets overwritten by F().  None of the 1-way
 * macros touch %rcx, so the flag is still intact when it is tested.
 *
 * Registers written: %rax, %rdi, %rsi, %r8, %r10, %r11, flags
 * (%r12 restored).
 */
107	/* input:
108	 *	%rdi: ctx
109	 *	%rsi: dst
110	 *	%rdx: src
111	 *	%rcx: bool, if true: xor output
112	 */
113	movq %r12, %r11;
114
115	movq %rdi, CTX;
116	movq %rsi, %r10;
117	movq %rdx, RIO;
118
119	read_block();
120
121	round_enc(0);
122	round_enc(2);
123	round_enc(4);
124	round_enc(6);
125	round_enc(8);
126	round_enc(10);
127	round_enc(12);
128	round_enc(14);
129	add_roundkey_enc(16);
130
	/* the context is not read again below, so %r12 can go back now */
131	movq %r11, %r12;
132
133	movq %r10, RIO;
134	test %cl, %cl;
135	jnz .L__enc_xor;
136
137	write_block();
138	ret;
139.L__enc_xor:
140	xor_block();
141	ret;
142SYM_FUNC_END(__blowfish_enc_blk)
143
144SYM_FUNC_START(blowfish_dec_blk)
 
 
 
 
/*
 * Decrypt the 8-byte block at src with the 1-way code and store the
 * result at dst.
 *
 * CTX is %r12, so the caller's %r12 is parked in %r11 until the end and
 * ctx is copied over from %rdi.  dst is parked in %r10 because RIO (%rsi)
 * is also RT1 and gets overwritten by F().
 *
 * Registers written: %rax, %rdi, %rsi, %r8, %r10, %r11, flags
 * (%r12 restored).
 */
145	/* input:
146	 *	%rdi: ctx
147	 *	%rsi: dst
148	 *	%rdx: src
149	 */
150	movq %r12, %r11;
151
152	movq %rdi, CTX;
153	movq %rsi, %r10;
154	movq %rdx, RIO;
155
156	read_block();
157
158	round_dec(17);
159	round_dec(15);
160	round_dec(13);
161	round_dec(11);
162	round_dec(9);
163	round_dec(7);
164	round_dec(5);
165	round_dec(3);
166	add_roundkey_dec(1);
167
168	movq %r10, RIO;
169	write_block();
170
171	movq %r11, %r12;
172
173	ret;
174SYM_FUNC_END(blowfish_dec_blk)
175
176/**********************************************************************
177 * 4-way blowfish, four blocks in parallel
178 **********************************************************************/
179
180/* F() for 4-way. Slower when used alone/1-way, but faster when used
181 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
182 */
/*
 * F4(x): same table combination as F(), applied to block register x
 * (one of RX0..RX3; the ## pastes pick that register's bl/bh names).
 *
 * The indices for s2 and s3 are taken from bits 8-15 and 0-7 before the
 * first rotate, those for s0 and s1 from the same positions after it.
 * The two 16-bit rotates add up to a 32-bit rotate, so x ends up with its
 * halves swapped and the xorq applies ((s0 + s1) ^ s2) + s3 to the half
 * that was the upper one on entry.
 *
 * Clobbers RT0, RT1, RT2, RT3 and the flags.
 */
183#define F4(x) \
184	movzbl x ## bh,		RT1d; \
185	movzbl x ## bl,		RT3d; \
186	rorq $16,		x; \
187	movzbl x ## bh,		RT0d; \
188	movzbl x ## bl,		RT2d; \
189	rorq $16,		x; \
190	movl s0(CTX,RT0,4),	RT0d; \
191	addl s1(CTX,RT2,4),	RT0d; \
192	xorl s2(CTX,RT1,4),	RT0d; \
193	addl s3(CTX,RT3,4),	RT0d; \
194	xorq RT0,		x;
195
/*
 * 4-way encryption key schedule helpers.
 *
 * add_preloaded_roundkey4(): XOR the key pair held in RKEY into all four
 * block registers.
 * preload_roundkey_enc(n): load the 8 bytes at p + 4*n into RKEY.
 * add_roundkey_enc4(n): apply the pair already in RKEY, then preload the
 * pair for n + 2, so the load for the next round is issued ahead of the
 * F4() work.  The caller must have preloaded the pair for n beforehand.
 * round_enc4(n): key addition followed by two F4() passes over
 * RX0..RX3, one register at a time within each pass.
 */
196#define add_preloaded_roundkey4() \
197	xorq RKEY,		RX0; \
198	xorq RKEY,		RX1; \
199	xorq RKEY,		RX2; \
200	xorq RKEY,		RX3;
201
202#define preload_roundkey_enc(n) \
203	movq p+4*(n)(CTX),	RKEY;
204
205#define add_roundkey_enc4(n) \
206	add_preloaded_roundkey4(); \
207	preload_roundkey_enc(n + 2);
208
209#define round_enc4(n) \
210	add_roundkey_enc4(n); \
211	\
212	F4(RX0); \
213	F4(RX1); \
214	F4(RX2); \
215	F4(RX3); \
216	\
217	F4(RX0); \
218	F4(RX1); \
219	F4(RX2); \
220	F4(RX3);
221
/*
 * 4-way decryption key schedule helpers.
 *
 * preload_roundkey_dec(n): load the 8 bytes at p + 4*(n-1) into RKEY and
 * swap its 32-bit halves, so entry p[n] is in the low half and p[n-1] in
 * the high half.
 * add_roundkey_dec4(n): apply the pair already in RKEY to RX0..RX3, then
 * preload the pair for n - 2.  The caller must have preloaded the pair
 * for n beforehand.
 * round_dec4(n): key addition followed by two F4() passes over RX0..RX3.
 */
222#define preload_roundkey_dec(n) \
223	movq p+4*((n)-1)(CTX),	RKEY; \
224	rorq $32,		RKEY;
225
226#define add_roundkey_dec4(n) \
227	add_preloaded_roundkey4(); \
228	preload_roundkey_dec(n - 2);
229
230#define round_dec4(n) \
231	add_roundkey_dec4(n); \
232	\
233	F4(RX0); \
234	F4(RX1); \
235	F4(RX2); \
236	F4(RX3); \
237	\
238	F4(RX0); \
239	F4(RX1); \
240	F4(RX2); \
241	F4(RX3);
242
/*
 * 4-way block I/O.  Each macro handles four consecutive 8-byte blocks at
 * RIO + 0, 8, 16 and 24, using RX0..RX3 in that order.
 *
 * read_block4(): per block, load 8 bytes, swap the 32-bit halves and
 * byte-reverse the register.
 * write_block4(): per block, byte-reverse the register and store it.
 * xor_block4(): per block, byte-reverse the register and XOR it into the
 * 8 bytes already in memory.
 */
243#define read_block4() \
244	movq (RIO),		RX0; \
245	rorq $32,		RX0; \
246	bswapq 			RX0; \
247	\
248	movq 8(RIO),		RX1; \
249	rorq $32,		RX1; \
250	bswapq 			RX1; \
251	\
252	movq 16(RIO),		RX2; \
253	rorq $32,		RX2; \
254	bswapq 			RX2; \
255	\
256	movq 24(RIO),		RX3; \
257	rorq $32,		RX3; \
258	bswapq 			RX3;
259
260#define write_block4() \
261	bswapq 			RX0; \
262	movq RX0,		(RIO); \
263	\
264	bswapq 			RX1; \
265	movq RX1,		8(RIO); \
266	\
267	bswapq 			RX2; \
268	movq RX2,		16(RIO); \
269	\
270	bswapq 			RX3; \
271	movq RX3,		24(RIO);
272
273#define xor_block4() \
274	bswapq 			RX0; \
275	xorq RX0,		(RIO); \
276	\
277	bswapq 			RX1; \
278	xorq RX1,		8(RIO); \
279	\
280	bswapq 			RX2; \
281	xorq RX2,		16(RIO); \
282	\
283	bswapq 			RX3; \
284	xorq RX3,		24(RIO);
285
286SYM_FUNC_START(__blowfish_enc_blk_4way)
 
 
 
 
/*
 * Encrypt the four consecutive 8-byte blocks at src and either store the
 * results at dst or XOR them into dst, depending on the low byte of %rcx.
 *
 * %r12 (CTX) and %rbx (RX1) are pushed and restored.  %rcx is pushed as
 * well because it doubles as RX2; the saved flag is popped into %r12,
 * which is free once the rounds are done, and tested there.  dst is kept
 * in %r11 because RIO (%rsi) is also RT1 and is overwritten by F4().
 */
287	/* input:
288	 *	%rdi: ctx
289	 *	%rsi: dst
290	 *	%rdx: src
291	 *	%rcx: bool, if true: xor output
292	 */
293	pushq %r12;
294	pushq %rbx;
295	pushq %rcx;
296
	/* CTX must be set before the first round key is preloaded through it */
297	movq %rdi, CTX;
 
298	movq %rsi, %r11;
299	movq %rdx, RIO;
300
301	preload_roundkey_enc(0);
302
303	read_block4();
304
305	round_enc4(0);
306	round_enc4(2);
307	round_enc4(4);
308	round_enc4(6);
309	round_enc4(8);
310	round_enc4(10);
311	round_enc4(12);
312	round_enc4(14);
313	add_preloaded_roundkey4();
314
	/* pops the saved %rcx (xor flag), not the caller's %r12 */
315	popq %r12;
316	movq %r11, RIO;
317
318	test %r12b, %r12b;
319	jnz .L__enc_xor4;
320
321	write_block4();
322
323	popq %rbx;
324	popq %r12;
325	ret;
326
327.L__enc_xor4:
328	xor_block4();
329
330	popq %rbx;
331	popq %r12;
332	ret;
333SYM_FUNC_END(__blowfish_enc_blk_4way)
334
335SYM_FUNC_START(blowfish_dec_blk_4way)
 
 
 
 
/*
 * Decrypt the four consecutive 8-byte blocks at src and store the results
 * at dst.
 *
 * %r12 (CTX) and %rbx (RX1) are pushed and restored.  dst is kept in
 * %r11 because RIO (%rsi) is also RT1 and is overwritten by F4().
 */
336	/* input:
337	 *	%rdi: ctx
338	 *	%rsi: dst
339	 *	%rdx: src
340	 */
341	pushq %r12;
342	pushq %rbx;
 
343
	/* CTX must be set before the first round key is preloaded through it */
344	movq %rdi, CTX;
345	movq %rsi, %r11;
346	movq %rdx, RIO;
347
348	preload_roundkey_dec(17);
349	read_block4();
350
351	round_dec4(17);
352	round_dec4(15);
353	round_dec4(13);
354	round_dec4(11);
355	round_dec4(9);
356	round_dec4(7);
357	round_dec4(5);
358	round_dec4(3);
359	add_preloaded_roundkey4();
360
361	movq %r11, RIO;
362	write_block4();
363
364	popq %rbx;
365	popq %r12;
366
367	ret;
368SYM_FUNC_END(blowfish_dec_blk_4way)
368SYM_FUNC_END(blowfish_dec_blk_4way)