neon.uc - lib/raid6/neon.uc - Linux diff v5.9 - Bootlin Elixir Cross Referencer

  1/* -----------------------------------------------------------------------
  2 *
  3 *   neon.uc - RAID-6 syndrome calculation using ARM NEON instructions
  4 *
  5 *   Copyright (C) 2012 Rob Herring
  6 *   Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
  7 *
  8 *   Based on altivec.uc:
  9 *     Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
 10 *
 11 *   This program is free software; you can redistribute it and/or modify
 12 *   it under the terms of the GNU General Public License as published by
 13 *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
 14 *   Boston MA 02111-1307, USA; either version 2 of the License, or
 15 *   (at your option) any later version; incorporated herein by reference.
 16 *
 17 * ----------------------------------------------------------------------- */
 18
 19/*
 20 * neon$#.c
 21 *
 22 * $#-way unrolled NEON intrinsics math RAID-6 instruction set
 23 *
 24 * This file is postprocessed using unroll.awk
 25 */
 26
 27#include <arm_neon.h>
 28
 29typedef uint8x16_t unative_t;
 30
 
 31#define NSIZE	sizeof(unative_t)
 32
 33/*
 34 * The SHLBYTE() operation shifts each byte left by 1, *not*
 35 * rolling over into the next byte
 36 */
 37static inline unative_t SHLBYTE(unative_t v)
 38{
 39	return vshlq_n_u8(v, 1);
 40}
 41
 42/*
 43 * The MASK() operation returns 0xFF in any byte for which the high
 44 * bit is 1, 0x00 for any byte for which the high bit is 0.
 45 */
 46static inline unative_t MASK(unative_t v)
 47{
 48	return (unative_t)vshrq_n_s8((int8x16_t)v, 7);
 49}
 50
 51static inline unative_t PMUL(unative_t v, unative_t u)
 52{
 53	return (unative_t)vmulq_p8((poly8x16_t)v, (poly8x16_t)u);
 54}
 55
 56void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
 57{
 58	uint8_t **dptr = (uint8_t **)ptrs;
 59	uint8_t *p, *q;
 60	int d, z, z0;
 61
 62	register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
 63	const unative_t x1d = vdupq_n_u8(0x1d);
 64
 65	z0 = disks - 3;		/* Highest data disk */
 66	p = dptr[z0+1];		/* XOR parity */
 67	q = dptr[z0+2];		/* RS syndrome */
 68
 69	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
 70		wq$$ = wp$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
 71		for ( z = z0-1 ; z >= 0 ; z-- ) {
 72			wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
 73			wp$$ = veorq_u8(wp$$, wd$$);
 74			w2$$ = MASK(wq$$);
 75			w1$$ = SHLBYTE(wq$$);
 76
 77			w2$$ = vandq_u8(w2$$, x1d);
 78			w1$$ = veorq_u8(w1$$, w2$$);
 79			wq$$ = veorq_u8(w1$$, wd$$);
 80		}
 81		vst1q_u8(&p[d+NSIZE*$$], wp$$);
 82		vst1q_u8(&q[d+NSIZE*$$], wq$$);
 83	}
 84}
 85
 86void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop,
 87				    unsigned long bytes, void **ptrs)
 88{
 89	uint8_t **dptr = (uint8_t **)ptrs;
 90	uint8_t *p, *q;
 91	int d, z, z0;
 92
 93	register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
 94	const unative_t x1d = vdupq_n_u8(0x1d);
 95
 96	z0 = stop;		/* P/Q right side optimization */
 97	p = dptr[disks-2];	/* XOR parity */
 98	q = dptr[disks-1];	/* RS syndrome */
 99
100	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
101		wq$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
102		wp$$ = veorq_u8(vld1q_u8(&p[d+$$*NSIZE]), wq$$);
103
104		/* P/Q data pages */
105		for ( z = z0-1 ; z >= start ; z-- ) {
106			wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
107			wp$$ = veorq_u8(wp$$, wd$$);
108			w2$$ = MASK(wq$$);
109			w1$$ = SHLBYTE(wq$$);
110
111			w2$$ = vandq_u8(w2$$, x1d);
112			w1$$ = veorq_u8(w1$$, w2$$);
113			wq$$ = veorq_u8(w1$$, wd$$);
114		}
115		/* P/Q left side optimization */
116		for ( z = start-1 ; z >= 3 ; z -= 4 ) {
117			w2$$ = vshrq_n_u8(wq$$, 4);
118			w1$$ = vshlq_n_u8(wq$$, 4);
119
120			w2$$ = PMUL(w2$$, x1d);
121			wq$$ = veorq_u8(w1$$, w2$$);
122		}
123
124		switch (z) {
125		case 2:
126			w2$$ = vshrq_n_u8(wq$$, 5);
127			w1$$ = vshlq_n_u8(wq$$, 3);
128
129			w2$$ = PMUL(w2$$, x1d);
130			wq$$ = veorq_u8(w1$$, w2$$);
131			break;
132		case 1:
133			w2$$ = vshrq_n_u8(wq$$, 6);
134			w1$$ = vshlq_n_u8(wq$$, 2);
135
136			w2$$ = PMUL(w2$$, x1d);
137			wq$$ = veorq_u8(w1$$, w2$$);
138			break;
139		case 0:
140			w2$$ = MASK(wq$$);
141			w1$$ = SHLBYTE(wq$$);
142
143			w2$$ = vandq_u8(w2$$, x1d);
144			wq$$ = veorq_u8(w1$$, w2$$);
145		}
146		w1$$ = vld1q_u8(&q[d+NSIZE*$$]);
147		wq$$ = veorq_u8(wq$$, w1$$);
148
149		vst1q_u8(&p[d+NSIZE*$$], wp$$);
150		vst1q_u8(&q[d+NSIZE*$$], wq$$);
151	}
152}

  1/* -----------------------------------------------------------------------
  2 *
  3 *   neon.uc - RAID-6 syndrome calculation using ARM NEON instructions
  4 *
  5 *   Copyright (C) 2012 Rob Herring
  6 *   Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
  7 *
  8 *   Based on altivec.uc:
  9 *     Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
 10 *
 11 *   This program is free software; you can redistribute it and/or modify
 12 *   it under the terms of the GNU General Public License as published by
 13 *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
 14 *   Boston MA 02111-1307, USA; either version 2 of the License, or
 15 *   (at your option) any later version; incorporated herein by reference.
 16 *
 17 * ----------------------------------------------------------------------- */
 18
 19/*
 20 * neon$#.c
 21 *
 22 * $#-way unrolled NEON intrinsics math RAID-6 instruction set
 23 *
 24 * This file is postprocessed using unroll.awk
 25 */
 26
 27#include <arm_neon.h>
 28
 29typedef uint8x16_t unative_t;
 30
 31#define NBYTES(x) ((unative_t){x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x})
 32#define NSIZE	sizeof(unative_t)
 33
 34/*
 35 * The SHLBYTE() operation shifts each byte left by 1, *not*
 36 * rolling over into the next byte
 37 */
 38static inline unative_t SHLBYTE(unative_t v)
 39{
 40	return vshlq_n_u8(v, 1);
 41}
 42
 43/*
 44 * The MASK() operation returns 0xFF in any byte for which the high
 45 * bit is 1, 0x00 for any byte for which the high bit is 0.
 46 */
 47static inline unative_t MASK(unative_t v)
 48{
 49	return (unative_t)vshrq_n_s8((int8x16_t)v, 7);
 50}
 51
 52static inline unative_t PMUL(unative_t v, unative_t u)
 53{
 54	return (unative_t)vmulq_p8((poly8x16_t)v, (poly8x16_t)u);
 55}
 56
 57void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
 58{
 59	uint8_t **dptr = (uint8_t **)ptrs;
 60	uint8_t *p, *q;
 61	int d, z, z0;
 62
 63	register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
 64	const unative_t x1d = NBYTES(0x1d);
 65
 66	z0 = disks - 3;		/* Highest data disk */
 67	p = dptr[z0+1];		/* XOR parity */
 68	q = dptr[z0+2];		/* RS syndrome */
 69
 70	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
 71		wq$$ = wp$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
 72		for ( z = z0-1 ; z >= 0 ; z-- ) {
 73			wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
 74			wp$$ = veorq_u8(wp$$, wd$$);
 75			w2$$ = MASK(wq$$);
 76			w1$$ = SHLBYTE(wq$$);
 77
 78			w2$$ = vandq_u8(w2$$, x1d);
 79			w1$$ = veorq_u8(w1$$, w2$$);
 80			wq$$ = veorq_u8(w1$$, wd$$);
 81		}
 82		vst1q_u8(&p[d+NSIZE*$$], wp$$);
 83		vst1q_u8(&q[d+NSIZE*$$], wq$$);
 84	}
 85}
 86
 87void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop,
 88				    unsigned long bytes, void **ptrs)
 89{
 90	uint8_t **dptr = (uint8_t **)ptrs;
 91	uint8_t *p, *q;
 92	int d, z, z0;
 93
 94	register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
 95	const unative_t x1d = NBYTES(0x1d);
 96
 97	z0 = stop;		/* P/Q right side optimization */
 98	p = dptr[disks-2];	/* XOR parity */
 99	q = dptr[disks-1];	/* RS syndrome */
100
101	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
102		wq$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
103		wp$$ = veorq_u8(vld1q_u8(&p[d+$$*NSIZE]), wq$$);
104
105		/* P/Q data pages */
106		for ( z = z0-1 ; z >= start ; z-- ) {
107			wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
108			wp$$ = veorq_u8(wp$$, wd$$);
109			w2$$ = MASK(wq$$);
110			w1$$ = SHLBYTE(wq$$);
111
112			w2$$ = vandq_u8(w2$$, x1d);
113			w1$$ = veorq_u8(w1$$, w2$$);
114			wq$$ = veorq_u8(w1$$, wd$$);
115		}
116		/* P/Q left side optimization */
117		for ( z = start-1 ; z >= 3 ; z -= 4 ) {
118			w2$$ = vshrq_n_u8(wq$$, 4);
119			w1$$ = vshlq_n_u8(wq$$, 4);
120
121			w2$$ = PMUL(w2$$, x1d);
122			wq$$ = veorq_u8(w1$$, w2$$);
123		}
124
125		switch (z) {
126		case 2:
127			w2$$ = vshrq_n_u8(wq$$, 5);
128			w1$$ = vshlq_n_u8(wq$$, 3);
129
130			w2$$ = PMUL(w2$$, x1d);
131			wq$$ = veorq_u8(w1$$, w2$$);
132			break;
133		case 1:
134			w2$$ = vshrq_n_u8(wq$$, 6);
135			w1$$ = vshlq_n_u8(wq$$, 2);
136
137			w2$$ = PMUL(w2$$, x1d);
138			wq$$ = veorq_u8(w1$$, w2$$);
139			break;
140		case 0:
141			w2$$ = MASK(wq$$);
142			w1$$ = SHLBYTE(wq$$);
143
144			w2$$ = vandq_u8(w2$$, x1d);
145			wq$$ = veorq_u8(w1$$, w2$$);
146		}
147		w1$$ = vld1q_u8(&q[d+NSIZE*$$]);
148		wq$$ = veorq_u8(wq$$, w1$$);
149
150		vst1q_u8(&p[d+NSIZE*$$], wp$$);
151		vst1q_u8(&q[d+NSIZE*$$], wq$$);
152	}
153}