v4.17
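Below is the file as of v4.17. It is a template rather than plain C: as the header comment notes, it is postprocessed by unroll.awk. Going by that script's conventions, every $# token is replaced by the unroll factor n, and each line containing $$ is emitted n times with $$ replaced by the lane index 0 .. n-1. For n = 2, for example, the line

	wq$$ = wp$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);

expands to

	wq0 = wp0 = vld1q_u8(&dptr[z0][d+0*NSIZE]);
	wq1 = wp1 = vld1q_u8(&dptr[z0][d+1*NSIZE]);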
/* -----------------------------------------------------------------------
 *
 *   neon.uc - RAID-6 syndrome calculation using ARM NEON instructions
 *
 *   Copyright (C) 2012 Rob Herring
 *   Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 *   Based on altivec.uc:
 *     Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
 *   Boston MA 02111-1307, USA; either version 2 of the License, or
 *   (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */

/*
 * neon$#.c
 *
 * $#-way unrolled NEON intrinsics math RAID-6 instruction set
 *
 * This file is postprocessed using unroll.awk
 */

#include <arm_neon.h>

typedef uint8x16_t unative_t;

#define NBYTES(x) ((unative_t){x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x})
#define NSIZE	sizeof(unative_t)

/*
 * The SHLBYTE() operation shifts each byte left by 1, *not*
 * rolling over into the next byte
 */
static inline unative_t SHLBYTE(unative_t v)
{
	return vshlq_n_u8(v, 1);
}

/*
 * The MASK() operation returns 0xFF in any byte for which the high
 * bit is 1, 0x00 for any byte for which the high bit is 0.
 */
static inline unative_t MASK(unative_t v)
{
	return (unative_t)vshrq_n_s8((int8x16_t)v, 7);
}

static inline unative_t PMUL(unative_t v, unative_t u)
{
	return (unative_t)vmulq_p8((poly8x16_t)v, (poly8x16_t)u);
}

void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
{
	uint8_t **dptr = (uint8_t **)ptrs;
	uint8_t *p, *q;
	int d, z, z0;

	register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
	const unative_t x1d = NBYTES(0x1d);

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
		wq$$ = wp$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
		for ( z = z0-1 ; z >= 0 ; z-- ) {
			wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
			wp$$ = veorq_u8(wp$$, wd$$);
			w2$$ = MASK(wq$$);
			w1$$ = SHLBYTE(wq$$);

			w2$$ = vandq_u8(w2$$, x1d);
			w1$$ = veorq_u8(w1$$, w2$$);
			wq$$ = veorq_u8(w1$$, wd$$);
		}
		vst1q_u8(&p[d+NSIZE*$$], wp$$);
		vst1q_u8(&q[d+NSIZE*$$], wq$$);
	}
}

void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop,
				    unsigned long bytes, void **ptrs)
{
	uint8_t **dptr = (uint8_t **)ptrs;
	uint8_t *p, *q;
	int d, z, z0;

	register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
	const unative_t x1d = NBYTES(0x1d);

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
		wq$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
		wp$$ = veorq_u8(vld1q_u8(&p[d+$$*NSIZE]), wq$$);

		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
			wp$$ = veorq_u8(wp$$, wd$$);
			w2$$ = MASK(wq$$);
			w1$$ = SHLBYTE(wq$$);

			w2$$ = vandq_u8(w2$$, x1d);
			w1$$ = veorq_u8(w1$$, w2$$);
			wq$$ = veorq_u8(w1$$, wd$$);
		}
		/* P/Q left side optimization */
		for ( z = start-1 ; z >= 3 ; z -= 4 ) {
			w2$$ = vshrq_n_u8(wq$$, 4);
			w1$$ = vshlq_n_u8(wq$$, 4);

			w2$$ = PMUL(w2$$, x1d);
			wq$$ = veorq_u8(w1$$, w2$$);
		}

		switch (z) {
		case 2:
			w2$$ = vshrq_n_u8(wq$$, 5);
			w1$$ = vshlq_n_u8(wq$$, 3);

			w2$$ = PMUL(w2$$, x1d);
			wq$$ = veorq_u8(w1$$, w2$$);
			break;
		case 1:
			w2$$ = vshrq_n_u8(wq$$, 6);
			w1$$ = vshlq_n_u8(wq$$, 2);

			w2$$ = PMUL(w2$$, x1d);
			wq$$ = veorq_u8(w1$$, w2$$);
			break;
		case 0:
			w2$$ = MASK(wq$$);
			w1$$ = SHLBYTE(wq$$);

			w2$$ = vandq_u8(w2$$, x1d);
			wq$$ = veorq_u8(w1$$, w2$$);
		}
		w1$$ = vld1q_u8(&q[d+NSIZE*$$]);
		wq$$ = veorq_u8(wq$$, w1$$);

		vst1q_u8(&p[d+NSIZE*$$], wp$$);
		vst1q_u8(&q[d+NSIZE*$$], wq$$);
	}
}
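The MASK()/SHLBYTE()/x1d sequence in the inner loop above is the standard SIMD idiom for multiplying every byte by {02} in the RAID-6 Galois field, GF(2^8) modulo x^8 + x^4 + x^3 + x^2 + 1 (0x11d): shift each byte left by one and, wherever the shifted-out high bit was set, XOR in the low byte of the reduction polynomial, 0x1d. A scalar sketch of the same step, one byte at a time (gf2_mul2 is a hypothetical helper name, not a kernel symbol):

	#include <stdint.h>

	/* Multiply one GF(2^8) element by {02} mod 0x11d: the scalar
	 * equivalent of the MASK/SHLBYTE/vandq_u8/veorq_u8 sequence,
	 * which does this for 16 bytes at a time. */
	static inline uint8_t gf2_mul2(uint8_t v)
	{
		uint8_t mask = (v & 0x80) ? 0xff : 0x00;  /* MASK() */

		return (uint8_t)(v << 1) ^ (mask & 0x1d); /* SHLBYTE(), then reduce */
	}

Running the z loop from the highest data disk down to disk 0 then evaluates Q = {02}^z0 * D_z0 + ... + {02} * D_1 + D_0 by Horner's rule, while wp accumulates the ordinary XOR parity P.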
v6.8
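The v6.8 version follows. It is functionally the same; the visible differences are that it also includes "neon.h", and that both functions now build the x1d constant with vdupq_n_u8(0x1d) in place of the NBYTES() initializer macro, which has been dropped.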
/* -----------------------------------------------------------------------
 *
 *   neon.uc - RAID-6 syndrome calculation using ARM NEON instructions
 *
 *   Copyright (C) 2012 Rob Herring
 *   Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 *   Based on altivec.uc:
 *     Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
 *   Boston MA 02111-1307, USA; either version 2 of the License, or
 *   (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */

/*
 * neon$#.c
 *
 * $#-way unrolled NEON intrinsics math RAID-6 instruction set
 *
 * This file is postprocessed using unroll.awk
 */

#include <arm_neon.h>
#include "neon.h"

typedef uint8x16_t unative_t;

#define NSIZE	sizeof(unative_t)

/*
 * The SHLBYTE() operation shifts each byte left by 1, *not*
 * rolling over into the next byte
 */
static inline unative_t SHLBYTE(unative_t v)
{
	return vshlq_n_u8(v, 1);
}

/*
 * The MASK() operation returns 0xFF in any byte for which the high
 * bit is 1, 0x00 for any byte for which the high bit is 0.
 */
static inline unative_t MASK(unative_t v)
{
	return (unative_t)vshrq_n_s8((int8x16_t)v, 7);
}

static inline unative_t PMUL(unative_t v, unative_t u)
{
	return (unative_t)vmulq_p8((poly8x16_t)v, (poly8x16_t)u);
}

void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
{
	uint8_t **dptr = (uint8_t **)ptrs;
	uint8_t *p, *q;
	int d, z, z0;

	register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
	const unative_t x1d = vdupq_n_u8(0x1d);

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
		wq$$ = wp$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
		for ( z = z0-1 ; z >= 0 ; z-- ) {
			wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
			wp$$ = veorq_u8(wp$$, wd$$);
			w2$$ = MASK(wq$$);
			w1$$ = SHLBYTE(wq$$);

			w2$$ = vandq_u8(w2$$, x1d);
			w1$$ = veorq_u8(w1$$, w2$$);
			wq$$ = veorq_u8(w1$$, wd$$);
		}
		vst1q_u8(&p[d+NSIZE*$$], wp$$);
		vst1q_u8(&q[d+NSIZE*$$], wq$$);
	}
}

void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop,
				    unsigned long bytes, void **ptrs)
{
	uint8_t **dptr = (uint8_t **)ptrs;
	uint8_t *p, *q;
	int d, z, z0;

	register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
	const unative_t x1d = vdupq_n_u8(0x1d);

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
		wq$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
		wp$$ = veorq_u8(vld1q_u8(&p[d+$$*NSIZE]), wq$$);

		/* P/Q data pages */
		for ( z = z0-1 ; z >= start ; z-- ) {
			wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
			wp$$ = veorq_u8(wp$$, wd$$);
			w2$$ = MASK(wq$$);
			w1$$ = SHLBYTE(wq$$);

			w2$$ = vandq_u8(w2$$, x1d);
			w1$$ = veorq_u8(w1$$, w2$$);
			wq$$ = veorq_u8(w1$$, wd$$);
		}
		/* P/Q left side optimization */
		for ( z = start-1 ; z >= 3 ; z -= 4 ) {
			w2$$ = vshrq_n_u8(wq$$, 4);
			w1$$ = vshlq_n_u8(wq$$, 4);

			w2$$ = PMUL(w2$$, x1d);
			wq$$ = veorq_u8(w1$$, w2$$);
		}

		switch (z) {
		case 2:
			w2$$ = vshrq_n_u8(wq$$, 5);
			w1$$ = vshlq_n_u8(wq$$, 3);

			w2$$ = PMUL(w2$$, x1d);
			wq$$ = veorq_u8(w1$$, w2$$);
			break;
		case 1:
			w2$$ = vshrq_n_u8(wq$$, 6);
			w1$$ = vshlq_n_u8(wq$$, 2);

			w2$$ = PMUL(w2$$, x1d);
			wq$$ = veorq_u8(w1$$, w2$$);
			break;
		case 0:
			w2$$ = MASK(wq$$);
			w1$$ = SHLBYTE(wq$$);

			w2$$ = vandq_u8(w2$$, x1d);
			wq$$ = veorq_u8(w1$$, w2$$);
		}
		w1$$ = vld1q_u8(&q[d+NSIZE*$$]);
		wq$$ = veorq_u8(wq$$, w1$$);

		vst1q_u8(&p[d+NSIZE*$$], wp$$);
		vst1q_u8(&q[d+NSIZE*$$], wq$$);
	}
}
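A note on the "left side optimization" in raid6_neon$#_xor_syndrome_real(). For disks below start, which a partial-stripe update leaves untouched, there is no new data to fold in, so the Horner recurrence on wq degenerates into plain doublings, and the loop batches four of them per step. Writing a byte as v = h*x^4 + l (high and low nibbles), multiplication by x^4 in GF(2^8) gives v*x^4 = h*x^8 + l*x^4 = h*0x1d + l*x^4; since h has degree below four and 0x1d degree four, h*0x1d is a plain carry-less product of degree below eight that fits in one byte, which is exactly what PMUL()/vmulq_p8() computes. The case 2/1/0 arms of the switch handle a remainder of three, two, or one doublings the same way. A scalar sketch of the four-at-once step (gf2_mul_x4 is a hypothetical name, not a kernel symbol):

	#include <stdint.h>

	/* Multiply one GF(2^8) element by x^4, i.e. apply four doublings
	 * at once, mirroring the vshrq/vshlq/PMUL step above. */
	static inline uint8_t gf2_mul_x4(uint8_t v)
	{
		uint8_t h = v >> 4;            /* vshrq_n_u8(wq, 4) */
		uint8_t l = (uint8_t)(v << 4); /* vshlq_n_u8(wq, 4) */
		uint8_t r = 0;
		int i;

		/* carry-less h * 0x1d, the scalar analogue of PMUL() */
		for (i = 0; i < 4; i++)
			if (h & (1u << i))
				r ^= (uint8_t)(0x1d << i);
		return l ^ r;
	}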