bch.c - lib/bch.c - Linux diff v5.9 - Bootlin Elixir Cross Referencer

   1/*
   2 * Generic binary BCH encoding/decoding library
   3 *
   4 * This program is free software; you can redistribute it and/or modify it
   5 * under the terms of the GNU General Public License version 2 as published by
   6 * the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful, but WITHOUT
   9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11 * more details.
  12 *
  13 * You should have received a copy of the GNU General Public License along with
  14 * this program; if not, write to the Free Software Foundation, Inc., 51
  15 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  16 *
  17 * Copyright © 2011 Parrot S.A.
  18 *
  19 * Author: Ivan Djelic <ivan.djelic@parrot.com>
  20 *
  21 * Description:
  22 *
  23 * This library provides runtime configurable encoding/decoding of binary
  24 * Bose-Chaudhuri-Hocquenghem (BCH) codes.
  25 *
  26 * Call bch_init to get a pointer to a newly allocated bch_control structure for
  27 * the given m (Galois field order), t (error correction capability) and
  28 * (optional) primitive polynomial parameters.
  29 *
  30 * Call bch_encode to compute and store ecc parity bytes to a given buffer.
  31 * Call bch_decode to detect and locate errors in received data.
  32 *
  33 * On systems supporting hw BCH features, intermediate results may be provided
  34 * to bch_decode in order to skip certain steps. See bch_decode() documentation
  35 * for details.
  36 *
  37 * Option CONFIG_BCH_CONST_PARAMS can be used to force fixed values of
  38 * parameters m and t; thus allowing extra compiler optimizations and providing
  39 * better (up to 2x) encoding performance. Using this option makes sense when
  40 * (m,t) are fixed and known in advance, e.g. when using BCH error correction
  41 * on a particular NAND flash device.
  42 *
  43 * Algorithmic details:
  44 *
  45 * Encoding is performed by processing 32 input bits in parallel, using 4
  46 * remainder lookup tables.
  47 *
  48 * The final stage of decoding involves the following internal steps:
  49 * a. Syndrome computation
  50 * b. Error locator polynomial computation using Berlekamp-Massey algorithm
  51 * c. Error locator root finding (by far the most expensive step)
  52 *
  53 * In this implementation, step c is not performed using the usual Chien search.
  54 * Instead, an alternative approach described in [1] is used. It consists in
  55 * factoring the error locator polynomial using the Berlekamp Trace algorithm
  56 * (BTA) down to a certain degree (4), after which ad hoc low-degree polynomial
  57 * solving techniques [2] are used. The resulting algorithm, called BTZ, yields
  58 * much better performance than Chien search for usual (m,t) values (typically
  59 * m >= 13, t < 32, see [1]).
  60 *
  61 * [1] B. Biswas, V. Herbert. Efficient root finding of polynomials over fields
  62 * of characteristic 2, in: Western European Workshop on Research in Cryptology
  63 * - WEWoRC 2009, Graz, Austria, LNCS, Springer, July 2009, to appear.
  64 * [2] [Zin96] V.A. Zinoviev. On the solution of equations of degree 10 over
  65 * finite fields GF(2^q). In Rapport de recherche INRIA no 2829, 1996.
  66 */
  67
  68#include <linux/kernel.h>
  69#include <linux/errno.h>
  70#include <linux/init.h>
  71#include <linux/module.h>
  72#include <linux/slab.h>
  73#include <linux/bitops.h>
 
  74#include <asm/byteorder.h>
  75#include <linux/bch.h>
  76
  77#if defined(CONFIG_BCH_CONST_PARAMS)
  78#define GF_M(_p)               (CONFIG_BCH_CONST_M)
  79#define GF_T(_p)               (CONFIG_BCH_CONST_T)
  80#define GF_N(_p)               ((1 << (CONFIG_BCH_CONST_M))-1)
  81#define BCH_MAX_M              (CONFIG_BCH_CONST_M)
  82#define BCH_MAX_T	       (CONFIG_BCH_CONST_T)
  83#else
  84#define GF_M(_p)               ((_p)->m)
  85#define GF_T(_p)               ((_p)->t)
  86#define GF_N(_p)               ((_p)->n)
  87#define BCH_MAX_M              15 /* 2KB */
  88#define BCH_MAX_T              64 /* 64 bit correction */
  89#endif
  90
  91#define BCH_ECC_WORDS(_p)      DIV_ROUND_UP(GF_M(_p)*GF_T(_p), 32)
  92#define BCH_ECC_BYTES(_p)      DIV_ROUND_UP(GF_M(_p)*GF_T(_p), 8)
  93
  94#define BCH_ECC_MAX_WORDS      DIV_ROUND_UP(BCH_MAX_M * BCH_MAX_T, 32)
  95
  96#ifndef dbg
  97#define dbg(_fmt, args...)     do {} while (0)
  98#endif
  99
 100/*
 101 * represent a polynomial over GF(2^m)
 102 */
 103struct gf_poly {
 104	unsigned int deg;    /* polynomial degree */
 105	unsigned int c[];   /* polynomial terms */
 106};
 107
 108/* given its degree, compute a polynomial size in bytes */
 109#define GF_POLY_SZ(_d) (sizeof(struct gf_poly)+((_d)+1)*sizeof(unsigned int))
 110
 111/* polynomial of degree 1 */
 112struct gf_poly_deg1 {
 113	struct gf_poly poly;
 114	unsigned int   c[2];
 115};
 116
 117static u8 swap_bits_table[] = {
 118	0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0,
 119	0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0,
 120	0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8,
 121	0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8,
 122	0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4,
 123	0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4,
 124	0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec,
 125	0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc,
 126	0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2,
 127	0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2,
 128	0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea,
 129	0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
 130	0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6,
 131	0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6,
 132	0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee,
 133	0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe,
 134	0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1,
 135	0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1,
 136	0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9,
 137	0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9,
 138	0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5,
 139	0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5,
 140	0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed,
 141	0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd,
 142	0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3,
 143	0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3,
 144	0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb,
 145	0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb,
 146	0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7,
 147	0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7,
 148	0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef,
 149	0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff,
 150};
 151
 152static u8 swap_bits(struct bch_control *bch, u8 in)
 153{
 154	if (!bch->swap_bits)
 155		return in;
 156
 157	return swap_bits_table[in];
 158}
 159
 160/*
 161 * same as bch_encode(), but process input data one byte at a time
 162 */
 163static void bch_encode_unaligned(struct bch_control *bch,
 164				 const unsigned char *data, unsigned int len,
 165				 uint32_t *ecc)
 166{
 167	int i;
 168	const uint32_t *p;
 169	const int l = BCH_ECC_WORDS(bch)-1;
 170
 171	while (len--) {
 172		u8 tmp = swap_bits(bch, *data++);
 173
 174		p = bch->mod8_tab + (l+1)*(((ecc[0] >> 24)^(tmp)) & 0xff);
 175
 176		for (i = 0; i < l; i++)
 177			ecc[i] = ((ecc[i] << 8)|(ecc[i+1] >> 24))^(*p++);
 178
 179		ecc[l] = (ecc[l] << 8)^(*p);
 180	}
 181}
 182
 183/*
 184 * convert ecc bytes to aligned, zero-padded 32-bit ecc words
 185 */
 186static void load_ecc8(struct bch_control *bch, uint32_t *dst,
 187		      const uint8_t *src)
 188{
 189	uint8_t pad[4] = {0, 0, 0, 0};
 190	unsigned int i, nwords = BCH_ECC_WORDS(bch)-1;
 191
 192	for (i = 0; i < nwords; i++, src += 4)
 193		dst[i] = ((u32)swap_bits(bch, src[0]) << 24) |
 194			((u32)swap_bits(bch, src[1]) << 16) |
 195			((u32)swap_bits(bch, src[2]) << 8) |
 196			swap_bits(bch, src[3]);
 197
 198	memcpy(pad, src, BCH_ECC_BYTES(bch)-4*nwords);
 199	dst[nwords] = ((u32)swap_bits(bch, pad[0]) << 24) |
 200		((u32)swap_bits(bch, pad[1]) << 16) |
 201		((u32)swap_bits(bch, pad[2]) << 8) |
 202		swap_bits(bch, pad[3]);
 203}
 204
 205/*
 206 * convert 32-bit ecc words to ecc bytes
 207 */
 208static void store_ecc8(struct bch_control *bch, uint8_t *dst,
 209		       const uint32_t *src)
 210{
 211	uint8_t pad[4];
 212	unsigned int i, nwords = BCH_ECC_WORDS(bch)-1;
 213
 214	for (i = 0; i < nwords; i++) {
 215		*dst++ = swap_bits(bch, src[i] >> 24);
 216		*dst++ = swap_bits(bch, src[i] >> 16);
 217		*dst++ = swap_bits(bch, src[i] >> 8);
 218		*dst++ = swap_bits(bch, src[i]);
 219	}
 220	pad[0] = swap_bits(bch, src[nwords] >> 24);
 221	pad[1] = swap_bits(bch, src[nwords] >> 16);
 222	pad[2] = swap_bits(bch, src[nwords] >> 8);
 223	pad[3] = swap_bits(bch, src[nwords]);
 224	memcpy(dst, pad, BCH_ECC_BYTES(bch)-4*nwords);
 225}
 226
 227/**
 228 * bch_encode - calculate BCH ecc parity of data
 229 * @bch:   BCH control structure
 230 * @data:  data to encode
 231 * @len:   data length in bytes
 232 * @ecc:   ecc parity data, must be initialized by caller
 233 *
 234 * The @ecc parity array is used both as input and output parameter, in order to
 235 * allow incremental computations. It should be of the size indicated by member
 236 * @ecc_bytes of @bch, and should be initialized to 0 before the first call.
 237 *
 238 * The exact number of computed ecc parity bits is given by member @ecc_bits of
 239 * @bch; it may be less than m*t for large values of t.
 240 */
 241void bch_encode(struct bch_control *bch, const uint8_t *data,
 242		unsigned int len, uint8_t *ecc)
 243{
 244	const unsigned int l = BCH_ECC_WORDS(bch)-1;
 245	unsigned int i, mlen;
 246	unsigned long m;
 247	uint32_t w, r[BCH_ECC_MAX_WORDS];
 248	const size_t r_bytes = BCH_ECC_WORDS(bch) * sizeof(*r);
 249	const uint32_t * const tab0 = bch->mod8_tab;
 250	const uint32_t * const tab1 = tab0 + 256*(l+1);
 251	const uint32_t * const tab2 = tab1 + 256*(l+1);
 252	const uint32_t * const tab3 = tab2 + 256*(l+1);
 253	const uint32_t *pdata, *p0, *p1, *p2, *p3;
 254
 255	if (WARN_ON(r_bytes > sizeof(r)))
 256		return;
 257
 258	if (ecc) {
 259		/* load ecc parity bytes into internal 32-bit buffer */
 260		load_ecc8(bch, bch->ecc_buf, ecc);
 261	} else {
 262		memset(bch->ecc_buf, 0, r_bytes);
 263	}
 264
 265	/* process first unaligned data bytes */
 266	m = ((unsigned long)data) & 3;
 267	if (m) {
 268		mlen = (len < (4-m)) ? len : 4-m;
 269		bch_encode_unaligned(bch, data, mlen, bch->ecc_buf);
 270		data += mlen;
 271		len  -= mlen;
 272	}
 273
 274	/* process 32-bit aligned data words */
 275	pdata = (uint32_t *)data;
 276	mlen  = len/4;
 277	data += 4*mlen;
 278	len  -= 4*mlen;
 279	memcpy(r, bch->ecc_buf, r_bytes);
 280
 281	/*
 282	 * split each 32-bit word into 4 polynomials of weight 8 as follows:
 283	 *
 284	 * 31 ...24  23 ...16  15 ... 8  7 ... 0
 285	 * xxxxxxxx  yyyyyyyy  zzzzzzzz  tttttttt
 286	 *                               tttttttt  mod g = r0 (precomputed)
 287	 *                     zzzzzzzz  00000000  mod g = r1 (precomputed)
 288	 *           yyyyyyyy  00000000  00000000  mod g = r2 (precomputed)
 289	 * xxxxxxxx  00000000  00000000  00000000  mod g = r3 (precomputed)
 290	 * xxxxxxxx  yyyyyyyy  zzzzzzzz  tttttttt  mod g = r0^r1^r2^r3
 291	 */
 292	while (mlen--) {
 293		/* input data is read in big-endian format */
 294		w = cpu_to_be32(*pdata++);
 295		if (bch->swap_bits)
 296			w = (u32)swap_bits(bch, w) |
 297			    ((u32)swap_bits(bch, w >> 8) << 8) |
 298			    ((u32)swap_bits(bch, w >> 16) << 16) |
 299			    ((u32)swap_bits(bch, w >> 24) << 24);
 300		w ^= r[0];
 301		p0 = tab0 + (l+1)*((w >>  0) & 0xff);
 302		p1 = tab1 + (l+1)*((w >>  8) & 0xff);
 303		p2 = tab2 + (l+1)*((w >> 16) & 0xff);
 304		p3 = tab3 + (l+1)*((w >> 24) & 0xff);
 305
 306		for (i = 0; i < l; i++)
 307			r[i] = r[i+1]^p0[i]^p1[i]^p2[i]^p3[i];
 308
 309		r[l] = p0[l]^p1[l]^p2[l]^p3[l];
 310	}
 311	memcpy(bch->ecc_buf, r, r_bytes);
 312
 313	/* process last unaligned bytes */
 314	if (len)
 315		bch_encode_unaligned(bch, data, len, bch->ecc_buf);
 316
 317	/* store ecc parity bytes into original parity buffer */
 318	if (ecc)
 319		store_ecc8(bch, ecc, bch->ecc_buf);
 320}
 321EXPORT_SYMBOL_GPL(bch_encode);
 322
 323static inline int modulo(struct bch_control *bch, unsigned int v)
 324{
 325	const unsigned int n = GF_N(bch);
 326	while (v >= n) {
 327		v -= n;
 328		v = (v & n) + (v >> GF_M(bch));
 329	}
 330	return v;
 331}
 332
 333/*
 334 * shorter and faster modulo function, only works when v < 2N.
 335 */
 336static inline int mod_s(struct bch_control *bch, unsigned int v)
 337{
 338	const unsigned int n = GF_N(bch);
 339	return (v < n) ? v : v-n;
 340}
 341
 342static inline int deg(unsigned int poly)
 343{
 344	/* polynomial degree is the most-significant bit index */
 345	return fls(poly)-1;
 346}
 347
 348static inline int parity(unsigned int x)
 349{
 350	/*
 351	 * public domain code snippet, lifted from
 352	 * http://www-graphics.stanford.edu/~seander/bithacks.html
 353	 */
 354	x ^= x >> 1;
 355	x ^= x >> 2;
 356	x = (x & 0x11111111U) * 0x11111111U;
 357	return (x >> 28) & 1;
 358}
 359
 360/* Galois field basic operations: multiply, divide, inverse, etc. */
 361
 362static inline unsigned int gf_mul(struct bch_control *bch, unsigned int a,
 363				  unsigned int b)
 364{
 365	return (a && b) ? bch->a_pow_tab[mod_s(bch, bch->a_log_tab[a]+
 366					       bch->a_log_tab[b])] : 0;
 367}
 368
 369static inline unsigned int gf_sqr(struct bch_control *bch, unsigned int a)
 370{
 371	return a ? bch->a_pow_tab[mod_s(bch, 2*bch->a_log_tab[a])] : 0;
 372}
 373
 374static inline unsigned int gf_div(struct bch_control *bch, unsigned int a,
 375				  unsigned int b)
 376{
 377	return a ? bch->a_pow_tab[mod_s(bch, bch->a_log_tab[a]+
 378					GF_N(bch)-bch->a_log_tab[b])] : 0;
 379}
 380
 381static inline unsigned int gf_inv(struct bch_control *bch, unsigned int a)
 382{
 383	return bch->a_pow_tab[GF_N(bch)-bch->a_log_tab[a]];
 384}
 385
 386static inline unsigned int a_pow(struct bch_control *bch, int i)
 387{
 388	return bch->a_pow_tab[modulo(bch, i)];
 389}
 390
 391static inline int a_log(struct bch_control *bch, unsigned int x)
 392{
 393	return bch->a_log_tab[x];
 394}
 395
 396static inline int a_ilog(struct bch_control *bch, unsigned int x)
 397{
 398	return mod_s(bch, GF_N(bch)-bch->a_log_tab[x]);
 399}
 400
 401/*
 402 * compute 2t syndromes of ecc polynomial, i.e. ecc(a^j) for j=1..2t
 403 */
 404static void compute_syndromes(struct bch_control *bch, uint32_t *ecc,
 405			      unsigned int *syn)
 406{
 407	int i, j, s;
 408	unsigned int m;
 409	uint32_t poly;
 410	const int t = GF_T(bch);
 411
 412	s = bch->ecc_bits;
 413
 414	/* make sure extra bits in last ecc word are cleared */
 415	m = ((unsigned int)s) & 31;
 416	if (m)
 417		ecc[s/32] &= ~((1u << (32-m))-1);
 418	memset(syn, 0, 2*t*sizeof(*syn));
 419
 420	/* compute v(a^j) for j=1 .. 2t-1 */
 421	do {
 422		poly = *ecc++;
 423		s -= 32;
 424		while (poly) {
 425			i = deg(poly);
 426			for (j = 0; j < 2*t; j += 2)
 427				syn[j] ^= a_pow(bch, (j+1)*(i+s));
 428
 429			poly ^= (1 << i);
 430		}
 431	} while (s > 0);
 432
 433	/* v(a^(2j)) = v(a^j)^2 */
 434	for (j = 0; j < t; j++)
 435		syn[2*j+1] = gf_sqr(bch, syn[j]);
 436}
 437
 438static void gf_poly_copy(struct gf_poly *dst, struct gf_poly *src)
 439{
 440	memcpy(dst, src, GF_POLY_SZ(src->deg));
 441}
 442
 443static int compute_error_locator_polynomial(struct bch_control *bch,
 444					    const unsigned int *syn)
 445{
 446	const unsigned int t = GF_T(bch);
 447	const unsigned int n = GF_N(bch);
 448	unsigned int i, j, tmp, l, pd = 1, d = syn[0];
 449	struct gf_poly *elp = bch->elp;
 450	struct gf_poly *pelp = bch->poly_2t[0];
 451	struct gf_poly *elp_copy = bch->poly_2t[1];
 452	int k, pp = -1;
 453
 454	memset(pelp, 0, GF_POLY_SZ(2*t));
 455	memset(elp, 0, GF_POLY_SZ(2*t));
 456
 457	pelp->deg = 0;
 458	pelp->c[0] = 1;
 459	elp->deg = 0;
 460	elp->c[0] = 1;
 461
 462	/* use simplified binary Berlekamp-Massey algorithm */
 463	for (i = 0; (i < t) && (elp->deg <= t); i++) {
 464		if (d) {
 465			k = 2*i-pp;
 466			gf_poly_copy(elp_copy, elp);
 467			/* e[i+1](X) = e[i](X)+di*dp^-1*X^2(i-p)*e[p](X) */
 468			tmp = a_log(bch, d)+n-a_log(bch, pd);
 469			for (j = 0; j <= pelp->deg; j++) {
 470				if (pelp->c[j]) {
 471					l = a_log(bch, pelp->c[j]);
 472					elp->c[j+k] ^= a_pow(bch, tmp+l);
 473				}
 474			}
 475			/* compute l[i+1] = max(l[i]->c[l[p]+2*(i-p]) */
 476			tmp = pelp->deg+k;
 477			if (tmp > elp->deg) {
 478				elp->deg = tmp;
 479				gf_poly_copy(pelp, elp_copy);
 480				pd = d;
 481				pp = 2*i;
 482			}
 483		}
 484		/* di+1 = S(2i+3)+elp[i+1].1*S(2i+2)+...+elp[i+1].lS(2i+3-l) */
 485		if (i < t-1) {
 486			d = syn[2*i+2];
 487			for (j = 1; j <= elp->deg; j++)
 488				d ^= gf_mul(bch, elp->c[j], syn[2*i+2-j]);
 489		}
 490	}
 491	dbg("elp=%s\n", gf_poly_str(elp));
 492	return (elp->deg > t) ? -1 : (int)elp->deg;
 493}
 494
 495/*
 496 * solve a m x m linear system in GF(2) with an expected number of solutions,
 497 * and return the number of found solutions
 498 */
 499static int solve_linear_system(struct bch_control *bch, unsigned int *rows,
 500			       unsigned int *sol, int nsol)
 501{
 502	const int m = GF_M(bch);
 503	unsigned int tmp, mask;
 504	int rem, c, r, p, k, param[BCH_MAX_M];
 505
 506	k = 0;
 507	mask = 1 << m;
 508
 509	/* Gaussian elimination */
 510	for (c = 0; c < m; c++) {
 511		rem = 0;
 512		p = c-k;
 513		/* find suitable row for elimination */
 514		for (r = p; r < m; r++) {
 515			if (rows[r] & mask) {
 516				if (r != p) {
 517					tmp = rows[r];
 518					rows[r] = rows[p];
 519					rows[p] = tmp;
 520				}
 521				rem = r+1;
 522				break;
 523			}
 524		}
 525		if (rem) {
 526			/* perform elimination on remaining rows */
 527			tmp = rows[p];
 528			for (r = rem; r < m; r++) {
 529				if (rows[r] & mask)
 530					rows[r] ^= tmp;
 531			}
 532		} else {
 533			/* elimination not needed, store defective row index */
 534			param[k++] = c;
 535		}
 536		mask >>= 1;
 537	}
 538	/* rewrite system, inserting fake parameter rows */
 539	if (k > 0) {
 540		p = k;
 541		for (r = m-1; r >= 0; r--) {
 542			if ((r > m-1-k) && rows[r])
 543				/* system has no solution */
 544				return 0;
 545
 546			rows[r] = (p && (r == param[p-1])) ?
 547				p--, 1u << (m-r) : rows[r-p];
 548		}
 549	}
 550
 551	if (nsol != (1 << k))
 552		/* unexpected number of solutions */
 553		return 0;
 554
 555	for (p = 0; p < nsol; p++) {
 556		/* set parameters for p-th solution */
 557		for (c = 0; c < k; c++)
 558			rows[param[c]] = (rows[param[c]] & ~1)|((p >> c) & 1);
 559
 560		/* compute unique solution */
 561		tmp = 0;
 562		for (r = m-1; r >= 0; r--) {
 563			mask = rows[r] & (tmp|1);
 564			tmp |= parity(mask) << (m-r);
 565		}
 566		sol[p] = tmp >> 1;
 567	}
 568	return nsol;
 569}
 570
 571/*
 572 * this function builds and solves a linear system for finding roots of a degree
 573 * 4 affine monic polynomial X^4+aX^2+bX+c over GF(2^m).
 574 */
 575static int find_affine4_roots(struct bch_control *bch, unsigned int a,
 576			      unsigned int b, unsigned int c,
 577			      unsigned int *roots)
 578{
 579	int i, j, k;
 580	const int m = GF_M(bch);
 581	unsigned int mask = 0xff, t, rows[16] = {0,};
 582
 583	j = a_log(bch, b);
 584	k = a_log(bch, a);
 585	rows[0] = c;
 586
 587	/* buid linear system to solve X^4+aX^2+bX+c = 0 */
 588	for (i = 0; i < m; i++) {
 589		rows[i+1] = bch->a_pow_tab[4*i]^
 590			(a ? bch->a_pow_tab[mod_s(bch, k)] : 0)^
 591			(b ? bch->a_pow_tab[mod_s(bch, j)] : 0);
 592		j++;
 593		k += 2;
 594	}
 595	/*
 596	 * transpose 16x16 matrix before passing it to linear solver
 597	 * warning: this code assumes m < 16
 598	 */
 599	for (j = 8; j != 0; j >>= 1, mask ^= (mask << j)) {
 600		for (k = 0; k < 16; k = (k+j+1) & ~j) {
 601			t = ((rows[k] >> j)^rows[k+j]) & mask;
 602			rows[k] ^= (t << j);
 603			rows[k+j] ^= t;
 604		}
 605	}
 606	return solve_linear_system(bch, rows, roots, 4);
 607}
 608
 609/*
 610 * compute root r of a degree 1 polynomial over GF(2^m) (returned as log(1/r))
 611 */
 612static int find_poly_deg1_roots(struct bch_control *bch, struct gf_poly *poly,
 613				unsigned int *roots)
 614{
 615	int n = 0;
 616
 617	if (poly->c[0])
 618		/* poly[X] = bX+c with c!=0, root=c/b */
 619		roots[n++] = mod_s(bch, GF_N(bch)-bch->a_log_tab[poly->c[0]]+
 620				   bch->a_log_tab[poly->c[1]]);
 621	return n;
 622}
 623
 624/*
 625 * compute roots of a degree 2 polynomial over GF(2^m)
 626 */
 627static int find_poly_deg2_roots(struct bch_control *bch, struct gf_poly *poly,
 628				unsigned int *roots)
 629{
 630	int n = 0, i, l0, l1, l2;
 631	unsigned int u, v, r;
 632
 633	if (poly->c[0] && poly->c[1]) {
 634
 635		l0 = bch->a_log_tab[poly->c[0]];
 636		l1 = bch->a_log_tab[poly->c[1]];
 637		l2 = bch->a_log_tab[poly->c[2]];
 638
 639		/* using z=a/bX, transform aX^2+bX+c into z^2+z+u (u=ac/b^2) */
 640		u = a_pow(bch, l0+l2+2*(GF_N(bch)-l1));
 641		/*
 642		 * let u = sum(li.a^i) i=0..m-1; then compute r = sum(li.xi):
 643		 * r^2+r = sum(li.(xi^2+xi)) = sum(li.(a^i+Tr(a^i).a^k)) =
 644		 * u + sum(li.Tr(a^i).a^k) = u+a^k.Tr(sum(li.a^i)) = u+a^k.Tr(u)
 645		 * i.e. r and r+1 are roots iff Tr(u)=0
 646		 */
 647		r = 0;
 648		v = u;
 649		while (v) {
 650			i = deg(v);
 651			r ^= bch->xi_tab[i];
 652			v ^= (1 << i);
 653		}
 654		/* verify root */
 655		if ((gf_sqr(bch, r)^r) == u) {
 656			/* reverse z=a/bX transformation and compute log(1/r) */
 657			roots[n++] = modulo(bch, 2*GF_N(bch)-l1-
 658					    bch->a_log_tab[r]+l2);
 659			roots[n++] = modulo(bch, 2*GF_N(bch)-l1-
 660					    bch->a_log_tab[r^1]+l2);
 661		}
 662	}
 663	return n;
 664}
 665
 666/*
 667 * compute roots of a degree 3 polynomial over GF(2^m)
 668 */
 669static int find_poly_deg3_roots(struct bch_control *bch, struct gf_poly *poly,
 670				unsigned int *roots)
 671{
 672	int i, n = 0;
 673	unsigned int a, b, c, a2, b2, c2, e3, tmp[4];
 674
 675	if (poly->c[0]) {
 676		/* transform polynomial into monic X^3 + a2X^2 + b2X + c2 */
 677		e3 = poly->c[3];
 678		c2 = gf_div(bch, poly->c[0], e3);
 679		b2 = gf_div(bch, poly->c[1], e3);
 680		a2 = gf_div(bch, poly->c[2], e3);
 681
 682		/* (X+a2)(X^3+a2X^2+b2X+c2) = X^4+aX^2+bX+c (affine) */
 683		c = gf_mul(bch, a2, c2);           /* c = a2c2      */
 684		b = gf_mul(bch, a2, b2)^c2;        /* b = a2b2 + c2 */
 685		a = gf_sqr(bch, a2)^b2;            /* a = a2^2 + b2 */
 686
 687		/* find the 4 roots of this affine polynomial */
 688		if (find_affine4_roots(bch, a, b, c, tmp) == 4) {
 689			/* remove a2 from final list of roots */
 690			for (i = 0; i < 4; i++) {
 691				if (tmp[i] != a2)
 692					roots[n++] = a_ilog(bch, tmp[i]);
 693			}
 694		}
 695	}
 696	return n;
 697}
 698
 699/*
 700 * compute roots of a degree 4 polynomial over GF(2^m)
 701 */
 702static int find_poly_deg4_roots(struct bch_control *bch, struct gf_poly *poly,
 703				unsigned int *roots)
 704{
 705	int i, l, n = 0;
 706	unsigned int a, b, c, d, e = 0, f, a2, b2, c2, e4;
 707
 708	if (poly->c[0] == 0)
 709		return 0;
 710
 711	/* transform polynomial into monic X^4 + aX^3 + bX^2 + cX + d */
 712	e4 = poly->c[4];
 713	d = gf_div(bch, poly->c[0], e4);
 714	c = gf_div(bch, poly->c[1], e4);
 715	b = gf_div(bch, poly->c[2], e4);
 716	a = gf_div(bch, poly->c[3], e4);
 717
 718	/* use Y=1/X transformation to get an affine polynomial */
 719	if (a) {
 720		/* first, eliminate cX by using z=X+e with ae^2+c=0 */
 721		if (c) {
 722			/* compute e such that e^2 = c/a */
 723			f = gf_div(bch, c, a);
 724			l = a_log(bch, f);
 725			l += (l & 1) ? GF_N(bch) : 0;
 726			e = a_pow(bch, l/2);
 727			/*
 728			 * use transformation z=X+e:
 729			 * z^4+e^4 + a(z^3+ez^2+e^2z+e^3) + b(z^2+e^2) +cz+ce+d
 730			 * z^4 + az^3 + (ae+b)z^2 + (ae^2+c)z+e^4+be^2+ae^3+ce+d
 731			 * z^4 + az^3 + (ae+b)z^2 + e^4+be^2+d
 732			 * z^4 + az^3 +     b'z^2 + d'
 733			 */
 734			d = a_pow(bch, 2*l)^gf_mul(bch, b, f)^d;
 735			b = gf_mul(bch, a, e)^b;
 736		}
 737		/* now, use Y=1/X to get Y^4 + b/dY^2 + a/dY + 1/d */
 738		if (d == 0)
 739			/* assume all roots have multiplicity 1 */
 740			return 0;
 741
 742		c2 = gf_inv(bch, d);
 743		b2 = gf_div(bch, a, d);
 744		a2 = gf_div(bch, b, d);
 745	} else {
 746		/* polynomial is already affine */
 747		c2 = d;
 748		b2 = c;
 749		a2 = b;
 750	}
 751	/* find the 4 roots of this affine polynomial */
 752	if (find_affine4_roots(bch, a2, b2, c2, roots) == 4) {
 753		for (i = 0; i < 4; i++) {
 754			/* post-process roots (reverse transformations) */
 755			f = a ? gf_inv(bch, roots[i]) : roots[i];
 756			roots[i] = a_ilog(bch, f^e);
 757		}
 758		n = 4;
 759	}
 760	return n;
 761}
 762
 763/*
 764 * build monic, log-based representation of a polynomial
 765 */
 766static void gf_poly_logrep(struct bch_control *bch,
 767			   const struct gf_poly *a, int *rep)
 768{
 769	int i, d = a->deg, l = GF_N(bch)-a_log(bch, a->c[a->deg]);
 770
 771	/* represent 0 values with -1; warning, rep[d] is not set to 1 */
 772	for (i = 0; i < d; i++)
 773		rep[i] = a->c[i] ? mod_s(bch, a_log(bch, a->c[i])+l) : -1;
 774}
 775
 776/*
 777 * compute polynomial Euclidean division remainder in GF(2^m)[X]
 778 */
 779static void gf_poly_mod(struct bch_control *bch, struct gf_poly *a,
 780			const struct gf_poly *b, int *rep)
 781{
 782	int la, p, m;
 783	unsigned int i, j, *c = a->c;
 784	const unsigned int d = b->deg;
 785
 786	if (a->deg < d)
 787		return;
 788
 789	/* reuse or compute log representation of denominator */
 790	if (!rep) {
 791		rep = bch->cache;
 792		gf_poly_logrep(bch, b, rep);
 793	}
 794
 795	for (j = a->deg; j >= d; j--) {
 796		if (c[j]) {
 797			la = a_log(bch, c[j]);
 798			p = j-d;
 799			for (i = 0; i < d; i++, p++) {
 800				m = rep[i];
 801				if (m >= 0)
 802					c[p] ^= bch->a_pow_tab[mod_s(bch,
 803								     m+la)];
 804			}
 805		}
 806	}
 807	a->deg = d-1;
 808	while (!c[a->deg] && a->deg)
 809		a->deg--;
 810}
 811
 812/*
 813 * compute polynomial Euclidean division quotient in GF(2^m)[X]
 814 */
 815static void gf_poly_div(struct bch_control *bch, struct gf_poly *a,
 816			const struct gf_poly *b, struct gf_poly *q)
 817{
 818	if (a->deg >= b->deg) {
 819		q->deg = a->deg-b->deg;
 820		/* compute a mod b (modifies a) */
 821		gf_poly_mod(bch, a, b, NULL);
 822		/* quotient is stored in upper part of polynomial a */
 823		memcpy(q->c, &a->c[b->deg], (1+q->deg)*sizeof(unsigned int));
 824	} else {
 825		q->deg = 0;
 826		q->c[0] = 0;
 827	}
 828}
 829
 830/*
 831 * compute polynomial GCD (Greatest Common Divisor) in GF(2^m)[X]
 832 */
 833static struct gf_poly *gf_poly_gcd(struct bch_control *bch, struct gf_poly *a,
 834				   struct gf_poly *b)
 835{
 836	struct gf_poly *tmp;
 837
 838	dbg("gcd(%s,%s)=", gf_poly_str(a), gf_poly_str(b));
 839
 840	if (a->deg < b->deg) {
 841		tmp = b;
 842		b = a;
 843		a = tmp;
 844	}
 845
 846	while (b->deg > 0) {
 847		gf_poly_mod(bch, a, b, NULL);
 848		tmp = b;
 849		b = a;
 850		a = tmp;
 851	}
 852
 853	dbg("%s\n", gf_poly_str(a));
 854
 855	return a;
 856}
 857
 858/*
 859 * Given a polynomial f and an integer k, compute Tr(a^kX) mod f
 860 * This is used in Berlekamp Trace algorithm for splitting polynomials
 861 */
 862static void compute_trace_bk_mod(struct bch_control *bch, int k,
 863				 const struct gf_poly *f, struct gf_poly *z,
 864				 struct gf_poly *out)
 865{
 866	const int m = GF_M(bch);
 867	int i, j;
 868
 869	/* z contains z^2j mod f */
 870	z->deg = 1;
 871	z->c[0] = 0;
 872	z->c[1] = bch->a_pow_tab[k];
 873
 874	out->deg = 0;
 875	memset(out, 0, GF_POLY_SZ(f->deg));
 876
 877	/* compute f log representation only once */
 878	gf_poly_logrep(bch, f, bch->cache);
 879
 880	for (i = 0; i < m; i++) {
 881		/* add a^(k*2^i)(z^(2^i) mod f) and compute (z^(2^i) mod f)^2 */
 882		for (j = z->deg; j >= 0; j--) {
 883			out->c[j] ^= z->c[j];
 884			z->c[2*j] = gf_sqr(bch, z->c[j]);
 885			z->c[2*j+1] = 0;
 886		}
 887		if (z->deg > out->deg)
 888			out->deg = z->deg;
 889
 890		if (i < m-1) {
 891			z->deg *= 2;
 892			/* z^(2(i+1)) mod f = (z^(2^i) mod f)^2 mod f */
 893			gf_poly_mod(bch, z, f, bch->cache);
 894		}
 895	}
 896	while (!out->c[out->deg] && out->deg)
 897		out->deg--;
 898
 899	dbg("Tr(a^%d.X) mod f = %s\n", k, gf_poly_str(out));
 900}
 901
 902/*
 903 * factor a polynomial using Berlekamp Trace algorithm (BTA)
 904 */
 905static void factor_polynomial(struct bch_control *bch, int k, struct gf_poly *f,
 906			      struct gf_poly **g, struct gf_poly **h)
 907{
 908	struct gf_poly *f2 = bch->poly_2t[0];
 909	struct gf_poly *q  = bch->poly_2t[1];
 910	struct gf_poly *tk = bch->poly_2t[2];
 911	struct gf_poly *z  = bch->poly_2t[3];
 912	struct gf_poly *gcd;
 913
 914	dbg("factoring %s...\n", gf_poly_str(f));
 915
 916	*g = f;
 917	*h = NULL;
 918
 919	/* tk = Tr(a^k.X) mod f */
 920	compute_trace_bk_mod(bch, k, f, z, tk);
 921
 922	if (tk->deg > 0) {
 923		/* compute g = gcd(f, tk) (destructive operation) */
 924		gf_poly_copy(f2, f);
 925		gcd = gf_poly_gcd(bch, f2, tk);
 926		if (gcd->deg < f->deg) {
 927			/* compute h=f/gcd(f,tk); this will modify f and q */
 928			gf_poly_div(bch, f, gcd, q);
 929			/* store g and h in-place (clobbering f) */
 930			*h = &((struct gf_poly_deg1 *)f)[gcd->deg].poly;
 931			gf_poly_copy(*g, gcd);
 932			gf_poly_copy(*h, q);
 933		}
 934	}
 935}
 936
 937/*
 938 * find roots of a polynomial, using BTZ algorithm; see the beginning of this
 939 * file for details
 940 */
 941static int find_poly_roots(struct bch_control *bch, unsigned int k,
 942			   struct gf_poly *poly, unsigned int *roots)
 943{
 944	int cnt;
 945	struct gf_poly *f1, *f2;
 946
 947	switch (poly->deg) {
 948		/* handle low degree polynomials with ad hoc techniques */
 949	case 1:
 950		cnt = find_poly_deg1_roots(bch, poly, roots);
 951		break;
 952	case 2:
 953		cnt = find_poly_deg2_roots(bch, poly, roots);
 954		break;
 955	case 3:
 956		cnt = find_poly_deg3_roots(bch, poly, roots);
 957		break;
 958	case 4:
 959		cnt = find_poly_deg4_roots(bch, poly, roots);
 960		break;
 961	default:
 962		/* factor polynomial using Berlekamp Trace Algorithm (BTA) */
 963		cnt = 0;
 964		if (poly->deg && (k <= GF_M(bch))) {
 965			factor_polynomial(bch, k, poly, &f1, &f2);
 966			if (f1)
 967				cnt += find_poly_roots(bch, k+1, f1, roots);
 968			if (f2)
 969				cnt += find_poly_roots(bch, k+1, f2, roots+cnt);
 970		}
 971		break;
 972	}
 973	return cnt;
 974}
 975
 976#if defined(USE_CHIEN_SEARCH)
 977/*
 978 * exhaustive root search (Chien) implementation - not used, included only for
 979 * reference/comparison tests
 980 */
 981static int chien_search(struct bch_control *bch, unsigned int len,
 982			struct gf_poly *p, unsigned int *roots)
 983{
 984	int m;
 985	unsigned int i, j, syn, syn0, count = 0;
 986	const unsigned int k = 8*len+bch->ecc_bits;
 987
 988	/* use a log-based representation of polynomial */
 989	gf_poly_logrep(bch, p, bch->cache);
 990	bch->cache[p->deg] = 0;
 991	syn0 = gf_div(bch, p->c[0], p->c[p->deg]);
 992
 993	for (i = GF_N(bch)-k+1; i <= GF_N(bch); i++) {
 994		/* compute elp(a^i) */
 995		for (j = 1, syn = syn0; j <= p->deg; j++) {
 996			m = bch->cache[j];
 997			if (m >= 0)
 998				syn ^= a_pow(bch, m+j*i);
 999		}
1000		if (syn == 0) {
1001			roots[count++] = GF_N(bch)-i;
1002			if (count == p->deg)
1003				break;
1004		}
1005	}
1006	return (count == p->deg) ? count : 0;
1007}
1008#define find_poly_roots(_p, _k, _elp, _loc) chien_search(_p, len, _elp, _loc)
1009#endif /* USE_CHIEN_SEARCH */
1010
1011/**
1012 * bch_decode - decode received codeword and find bit error locations
1013 * @bch:      BCH control structure
1014 * @data:     received data, ignored if @calc_ecc is provided
1015 * @len:      data length in bytes, must always be provided
1016 * @recv_ecc: received ecc, if NULL then assume it was XORed in @calc_ecc
1017 * @calc_ecc: calculated ecc, if NULL then calc_ecc is computed from @data
1018 * @syn:      hw computed syndrome data (if NULL, syndrome is calculated)
1019 * @errloc:   output array of error locations
1020 *
1021 * Returns:
1022 *  The number of errors found, or -EBADMSG if decoding failed, or -EINVAL if
1023 *  invalid parameters were provided
1024 *
1025 * Depending on the available hw BCH support and the need to compute @calc_ecc
1026 * separately (using bch_encode()), this function should be called with one of
1027 * the following parameter configurations -
1028 *
1029 * by providing @data and @recv_ecc only:
1030 *   bch_decode(@bch, @data, @len, @recv_ecc, NULL, NULL, @errloc)
1031 *
1032 * by providing @recv_ecc and @calc_ecc:
1033 *   bch_decode(@bch, NULL, @len, @recv_ecc, @calc_ecc, NULL, @errloc)
1034 *
1035 * by providing ecc = recv_ecc XOR calc_ecc:
1036 *   bch_decode(@bch, NULL, @len, NULL, ecc, NULL, @errloc)
1037 *
1038 * by providing syndrome results @syn:
1039 *   bch_decode(@bch, NULL, @len, NULL, NULL, @syn, @errloc)
1040 *
1041 * Once bch_decode() has successfully returned with a positive value, error
1042 * locations returned in array @errloc should be interpreted as follows -
1043 *
1044 * if (errloc[n] >= 8*len), then n-th error is located in ecc (no need for
1045 * data correction)
1046 *
1047 * if (errloc[n] < 8*len), then n-th error is located in data and can be
1048 * corrected with statement data[errloc[n]/8] ^= 1 << (errloc[n] % 8);
1049 *
1050 * Note that this function does not perform any data correction by itself, it
1051 * merely indicates error locations.
1052 */
1053int bch_decode(struct bch_control *bch, const uint8_t *data, unsigned int len,
1054	       const uint8_t *recv_ecc, const uint8_t *calc_ecc,
1055	       const unsigned int *syn, unsigned int *errloc)
1056{
1057	const unsigned int ecc_words = BCH_ECC_WORDS(bch);
1058	unsigned int nbits;
1059	int i, err, nroots;
1060	uint32_t sum;
1061
1062	/* sanity check: make sure data length can be handled */
1063	if (8*len > (bch->n-bch->ecc_bits))
1064		return -EINVAL;
1065
1066	/* if caller does not provide syndromes, compute them */
1067	if (!syn) {
1068		if (!calc_ecc) {
1069			/* compute received data ecc into an internal buffer */
1070			if (!data || !recv_ecc)
1071				return -EINVAL;
1072			bch_encode(bch, data, len, NULL);
1073		} else {
1074			/* load provided calculated ecc */
1075			load_ecc8(bch, bch->ecc_buf, calc_ecc);
1076		}
1077		/* load received ecc or assume it was XORed in calc_ecc */
1078		if (recv_ecc) {
1079			load_ecc8(bch, bch->ecc_buf2, recv_ecc);
1080			/* XOR received and calculated ecc */
1081			for (i = 0, sum = 0; i < (int)ecc_words; i++) {
1082				bch->ecc_buf[i] ^= bch->ecc_buf2[i];
1083				sum |= bch->ecc_buf[i];
1084			}
1085			if (!sum)
1086				/* no error found */
1087				return 0;
1088		}
1089		compute_syndromes(bch, bch->ecc_buf, bch->syn);
1090		syn = bch->syn;
1091	}
1092
1093	err = compute_error_locator_polynomial(bch, syn);
1094	if (err > 0) {
1095		nroots = find_poly_roots(bch, 1, bch->elp, errloc);
1096		if (err != nroots)
1097			err = -1;
1098	}
1099	if (err > 0) {
1100		/* post-process raw error locations for easier correction */
1101		nbits = (len*8)+bch->ecc_bits;
1102		for (i = 0; i < err; i++) {
1103			if (errloc[i] >= nbits) {
1104				err = -1;
1105				break;
1106			}
1107			errloc[i] = nbits-1-errloc[i];
1108			if (!bch->swap_bits)
1109				errloc[i] = (errloc[i] & ~7) |
1110					    (7-(errloc[i] & 7));
1111		}
1112	}
1113	return (err >= 0) ? err : -EBADMSG;
1114}
1115EXPORT_SYMBOL_GPL(bch_decode);
1116
1117/*
1118 * generate Galois field lookup tables
1119 */
1120static int build_gf_tables(struct bch_control *bch, unsigned int poly)
1121{
1122	unsigned int i, x = 1;
1123	const unsigned int k = 1 << deg(poly);
1124
1125	/* primitive polynomial must be of degree m */
1126	if (k != (1u << GF_M(bch)))
1127		return -1;
1128
1129	for (i = 0; i < GF_N(bch); i++) {
1130		bch->a_pow_tab[i] = x;
1131		bch->a_log_tab[x] = i;
1132		if (i && (x == 1))
1133			/* polynomial is not primitive (a^i=1 with 0<i<2^m-1) */
1134			return -1;
1135		x <<= 1;
1136		if (x & k)
1137			x ^= poly;
1138	}
1139	bch->a_pow_tab[GF_N(bch)] = 1;
1140	bch->a_log_tab[0] = 0;
1141
1142	return 0;
1143}
1144
1145/*
1146 * compute generator polynomial remainder tables for fast encoding
1147 */
1148static void build_mod8_tables(struct bch_control *bch, const uint32_t *g)
1149{
1150	int i, j, b, d;
1151	uint32_t data, hi, lo, *tab;
1152	const int l = BCH_ECC_WORDS(bch);
1153	const int plen = DIV_ROUND_UP(bch->ecc_bits+1, 32);
1154	const int ecclen = DIV_ROUND_UP(bch->ecc_bits, 32);
1155
1156	memset(bch->mod8_tab, 0, 4*256*l*sizeof(*bch->mod8_tab));
1157
1158	for (i = 0; i < 256; i++) {
1159		/* p(X)=i is a small polynomial of weight <= 8 */
1160		for (b = 0; b < 4; b++) {
1161			/* we want to compute (p(X).X^(8*b+deg(g))) mod g(X) */
1162			tab = bch->mod8_tab + (b*256+i)*l;
1163			data = i << (8*b);
1164			while (data) {
1165				d = deg(data);
1166				/* subtract X^d.g(X) from p(X).X^(8*b+deg(g)) */
1167				data ^= g[0] >> (31-d);
1168				for (j = 0; j < ecclen; j++) {
1169					hi = (d < 31) ? g[j] << (d+1) : 0;
1170					lo = (j+1 < plen) ?
1171						g[j+1] >> (31-d) : 0;
1172					tab[j] ^= hi|lo;
1173				}
1174			}
1175		}
1176	}
1177}
1178
1179/*
1180 * build a base for factoring degree 2 polynomials
1181 */
1182static int build_deg2_base(struct bch_control *bch)
1183{
1184	const int m = GF_M(bch);
1185	int i, j, r;
1186	unsigned int sum, x, y, remaining, ak = 0, xi[BCH_MAX_M];
1187
1188	/* find k s.t. Tr(a^k) = 1 and 0 <= k < m */
1189	for (i = 0; i < m; i++) {
1190		for (j = 0, sum = 0; j < m; j++)
1191			sum ^= a_pow(bch, i*(1 << j));
1192
1193		if (sum) {
1194			ak = bch->a_pow_tab[i];
1195			break;
1196		}
1197	}
1198	/* find xi, i=0..m-1 such that xi^2+xi = a^i+Tr(a^i).a^k */
1199	remaining = m;
1200	memset(xi, 0, sizeof(xi));
1201
1202	for (x = 0; (x <= GF_N(bch)) && remaining; x++) {
1203		y = gf_sqr(bch, x)^x;
1204		for (i = 0; i < 2; i++) {
1205			r = a_log(bch, y);
1206			if (y && (r < m) && !xi[r]) {
1207				bch->xi_tab[r] = x;
1208				xi[r] = 1;
1209				remaining--;
1210				dbg("x%d = %x\n", r, x);
1211				break;
1212			}
1213			y ^= ak;
1214		}
1215	}
1216	/* should not happen but check anyway */
1217	return remaining ? -1 : 0;
1218}
1219
1220static void *bch_alloc(size_t size, int *err)
1221{
1222	void *ptr;
1223
1224	ptr = kmalloc(size, GFP_KERNEL);
1225	if (ptr == NULL)
1226		*err = 1;
1227	return ptr;
1228}
1229
1230/*
1231 * compute generator polynomial for given (m,t) parameters.
1232 */
1233static uint32_t *compute_generator_polynomial(struct bch_control *bch)
1234{
1235	const unsigned int m = GF_M(bch);
1236	const unsigned int t = GF_T(bch);
1237	int n, err = 0;
1238	unsigned int i, j, nbits, r, word, *roots;
1239	struct gf_poly *g;
1240	uint32_t *genpoly;
1241
1242	g = bch_alloc(GF_POLY_SZ(m*t), &err);
1243	roots = bch_alloc((bch->n+1)*sizeof(*roots), &err);
1244	genpoly = bch_alloc(DIV_ROUND_UP(m*t+1, 32)*sizeof(*genpoly), &err);
1245
1246	if (err) {
1247		kfree(genpoly);
1248		genpoly = NULL;
1249		goto finish;
1250	}
1251
1252	/* enumerate all roots of g(X) */
1253	memset(roots , 0, (bch->n+1)*sizeof(*roots));
1254	for (i = 0; i < t; i++) {
1255		for (j = 0, r = 2*i+1; j < m; j++) {
1256			roots[r] = 1;
1257			r = mod_s(bch, 2*r);
1258		}
1259	}
1260	/* build generator polynomial g(X) */
1261	g->deg = 0;
1262	g->c[0] = 1;
1263	for (i = 0; i < GF_N(bch); i++) {
1264		if (roots[i]) {
1265			/* multiply g(X) by (X+root) */
1266			r = bch->a_pow_tab[i];
1267			g->c[g->deg+1] = 1;
1268			for (j = g->deg; j > 0; j--)
1269				g->c[j] = gf_mul(bch, g->c[j], r)^g->c[j-1];
1270
1271			g->c[0] = gf_mul(bch, g->c[0], r);
1272			g->deg++;
1273		}
1274	}
1275	/* store left-justified binary representation of g(X) */
1276	n = g->deg+1;
1277	i = 0;
1278
1279	while (n > 0) {
1280		nbits = (n > 32) ? 32 : n;
1281		for (j = 0, word = 0; j < nbits; j++) {
1282			if (g->c[n-1-j])
1283				word |= 1u << (31-j);
1284		}
1285		genpoly[i++] = word;
1286		n -= nbits;
1287	}
1288	bch->ecc_bits = g->deg;
1289
1290finish:
1291	kfree(g);
1292	kfree(roots);
1293
1294	return genpoly;
1295}
1296
1297/**
1298 * bch_init - initialize a BCH encoder/decoder
1299 * @m:          Galois field order, should be in the range 5-15
1300 * @t:          maximum error correction capability, in bits
1301 * @prim_poly:  user-provided primitive polynomial (or 0 to use default)
1302 * @swap_bits:  swap bits within data and syndrome bytes
1303 *
1304 * Returns:
1305 *  a newly allocated BCH control structure if successful, NULL otherwise
1306 *
1307 * This initialization can take some time, as lookup tables are built for fast
1308 * encoding/decoding; make sure not to call this function from a time critical
1309 * path. Usually, bch_init() should be called on module/driver init and
1310 * bch_free() should be called to release memory on exit.
1311 *
1312 * You may provide your own primitive polynomial of degree @m in argument
1313 * @prim_poly, or let bch_init() use its default polynomial.
1314 *
1315 * Once bch_init() has successfully returned a pointer to a newly allocated
1316 * BCH control structure, ecc length in bytes is given by member @ecc_bytes of
1317 * the structure.
1318 */
1319struct bch_control *bch_init(int m, int t, unsigned int prim_poly,
1320			     bool swap_bits)
1321{
1322	int err = 0;
1323	unsigned int i, words;
1324	uint32_t *genpoly;
1325	struct bch_control *bch = NULL;
1326
1327	const int min_m = 5;
1328
1329	/* default primitive polynomials */
1330	static const unsigned int prim_poly_tab[] = {
1331		0x25, 0x43, 0x83, 0x11d, 0x211, 0x409, 0x805, 0x1053, 0x201b,
1332		0x402b, 0x8003,
1333	};
1334
1335#if defined(CONFIG_BCH_CONST_PARAMS)
1336	if ((m != (CONFIG_BCH_CONST_M)) || (t != (CONFIG_BCH_CONST_T))) {
1337		printk(KERN_ERR "bch encoder/decoder was configured to support "
1338		       "parameters m=%d, t=%d only!\n",
1339		       CONFIG_BCH_CONST_M, CONFIG_BCH_CONST_T);
1340		goto fail;
1341	}
1342#endif
1343	if ((m < min_m) || (m > BCH_MAX_M))
1344		/*
1345		 * values of m greater than 15 are not currently supported;
1346		 * supporting m > 15 would require changing table base type
1347		 * (uint16_t) and a small patch in matrix transposition
1348		 */
1349		goto fail;
1350
1351	if (t > BCH_MAX_T)
1352		/*
1353		 * we can support larger than 64 bits if necessary, at the
1354		 * cost of higher stack usage.
1355		 */
1356		goto fail;
1357
1358	/* sanity checks */
1359	if ((t < 1) || (m*t >= ((1 << m)-1)))
1360		/* invalid t value */
1361		goto fail;
1362
1363	/* select a primitive polynomial for generating GF(2^m) */
1364	if (prim_poly == 0)
1365		prim_poly = prim_poly_tab[m-min_m];
1366
1367	bch = kzalloc(sizeof(*bch), GFP_KERNEL);
1368	if (bch == NULL)
1369		goto fail;
1370
1371	bch->m = m;
1372	bch->t = t;
1373	bch->n = (1 << m)-1;
1374	words  = DIV_ROUND_UP(m*t, 32);
1375	bch->ecc_bytes = DIV_ROUND_UP(m*t, 8);
1376	bch->a_pow_tab = bch_alloc((1+bch->n)*sizeof(*bch->a_pow_tab), &err);
1377	bch->a_log_tab = bch_alloc((1+bch->n)*sizeof(*bch->a_log_tab), &err);
1378	bch->mod8_tab  = bch_alloc(words*1024*sizeof(*bch->mod8_tab), &err);
1379	bch->ecc_buf   = bch_alloc(words*sizeof(*bch->ecc_buf), &err);
1380	bch->ecc_buf2  = bch_alloc(words*sizeof(*bch->ecc_buf2), &err);
1381	bch->xi_tab    = bch_alloc(m*sizeof(*bch->xi_tab), &err);
1382	bch->syn       = bch_alloc(2*t*sizeof(*bch->syn), &err);
1383	bch->cache     = bch_alloc(2*t*sizeof(*bch->cache), &err);
1384	bch->elp       = bch_alloc((t+1)*sizeof(struct gf_poly_deg1), &err);
1385	bch->swap_bits = swap_bits;
1386
1387	for (i = 0; i < ARRAY_SIZE(bch->poly_2t); i++)
1388		bch->poly_2t[i] = bch_alloc(GF_POLY_SZ(2*t), &err);
1389
1390	if (err)
1391		goto fail;
1392
1393	err = build_gf_tables(bch, prim_poly);
1394	if (err)
1395		goto fail;
1396
1397	/* use generator polynomial for computing encoding tables */
1398	genpoly = compute_generator_polynomial(bch);
1399	if (genpoly == NULL)
1400		goto fail;
1401
1402	build_mod8_tables(bch, genpoly);
1403	kfree(genpoly);
1404
1405	err = build_deg2_base(bch);
1406	if (err)
1407		goto fail;
1408
1409	return bch;
1410
1411fail:
1412	bch_free(bch);
1413	return NULL;
1414}
1415EXPORT_SYMBOL_GPL(bch_init);
1416
1417/**
1418 *  bch_free - free the BCH control structure
1419 *  @bch:    BCH control structure to release
1420 */
1421void bch_free(struct bch_control *bch)
1422{
1423	unsigned int i;
1424
1425	if (bch) {
1426		kfree(bch->a_pow_tab);
1427		kfree(bch->a_log_tab);
1428		kfree(bch->mod8_tab);
1429		kfree(bch->ecc_buf);
1430		kfree(bch->ecc_buf2);
1431		kfree(bch->xi_tab);
1432		kfree(bch->syn);
1433		kfree(bch->cache);
1434		kfree(bch->elp);
1435
1436		for (i = 0; i < ARRAY_SIZE(bch->poly_2t); i++)
1437			kfree(bch->poly_2t[i]);
1438
1439		kfree(bch);
1440	}
1441}
1442EXPORT_SYMBOL_GPL(bch_free);
1443
1444MODULE_LICENSE("GPL");
1445MODULE_AUTHOR("Ivan Djelic <ivan.djelic@parrot.com>");
1446MODULE_DESCRIPTION("Binary BCH encoder/decoder");

   1/*
   2 * Generic binary BCH encoding/decoding library
   3 *
   4 * This program is free software; you can redistribute it and/or modify it
   5 * under the terms of the GNU General Public License version 2 as published by
   6 * the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful, but WITHOUT
   9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11 * more details.
  12 *
  13 * You should have received a copy of the GNU General Public License along with
  14 * this program; if not, write to the Free Software Foundation, Inc., 51
  15 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  16 *
  17 * Copyright © 2011 Parrot S.A.
  18 *
  19 * Author: Ivan Djelic <ivan.djelic@parrot.com>
  20 *
  21 * Description:
  22 *
  23 * This library provides runtime configurable encoding/decoding of binary
  24 * Bose-Chaudhuri-Hocquenghem (BCH) codes.
  25 *
  26 * Call bch_init to get a pointer to a newly allocated bch_control structure for
  27 * the given m (Galois field order), t (error correction capability) and
  28 * (optional) primitive polynomial parameters.
  29 *
  30 * Call bch_encode to compute and store ecc parity bytes to a given buffer.
  31 * Call bch_decode to detect and locate errors in received data.
  32 *
  33 * On systems supporting hw BCH features, intermediate results may be provided
  34 * to bch_decode in order to skip certain steps. See bch_decode() documentation
  35 * for details.
  36 *
  37 * Option CONFIG_BCH_CONST_PARAMS can be used to force fixed values of
  38 * parameters m and t; thus allowing extra compiler optimizations and providing
  39 * better (up to 2x) encoding performance. Using this option makes sense when
  40 * (m,t) are fixed and known in advance, e.g. when using BCH error correction
  41 * on a particular NAND flash device.
  42 *
  43 * Algorithmic details:
  44 *
  45 * Encoding is performed by processing 32 input bits in parallel, using 4
  46 * remainder lookup tables.
  47 *
  48 * The final stage of decoding involves the following internal steps:
  49 * a. Syndrome computation
  50 * b. Error locator polynomial computation using Berlekamp-Massey algorithm
  51 * c. Error locator root finding (by far the most expensive step)
  52 *
  53 * In this implementation, step c is not performed using the usual Chien search.
  54 * Instead, an alternative approach described in [1] is used. It consists in
  55 * factoring the error locator polynomial using the Berlekamp Trace algorithm
  56 * (BTA) down to a certain degree (4), after which ad hoc low-degree polynomial
  57 * solving techniques [2] are used. The resulting algorithm, called BTZ, yields
  58 * much better performance than Chien search for usual (m,t) values (typically
  59 * m >= 13, t < 32, see [1]).
  60 *
  61 * [1] B. Biswas, V. Herbert. Efficient root finding of polynomials over fields
  62 * of characteristic 2, in: Western European Workshop on Research in Cryptology
  63 * - WEWoRC 2009, Graz, Austria, LNCS, Springer, July 2009, to appear.
  64 * [2] [Zin96] V.A. Zinoviev. On the solution of equations of degree 10 over
  65 * finite fields GF(2^q). In Rapport de recherche INRIA no 2829, 1996.
  66 */
  67
  68#include <linux/kernel.h>
  69#include <linux/errno.h>
  70#include <linux/init.h>
  71#include <linux/module.h>
  72#include <linux/slab.h>
  73#include <linux/bitops.h>
  74#include <linux/bitrev.h>
  75#include <asm/byteorder.h>
  76#include <linux/bch.h>
  77
  78#if defined(CONFIG_BCH_CONST_PARAMS)
  79#define GF_M(_p)               (CONFIG_BCH_CONST_M)
  80#define GF_T(_p)               (CONFIG_BCH_CONST_T)
  81#define GF_N(_p)               ((1 << (CONFIG_BCH_CONST_M))-1)
  82#define BCH_MAX_M              (CONFIG_BCH_CONST_M)
  83#define BCH_MAX_T	       (CONFIG_BCH_CONST_T)
  84#else
  85#define GF_M(_p)               ((_p)->m)
  86#define GF_T(_p)               ((_p)->t)
  87#define GF_N(_p)               ((_p)->n)
  88#define BCH_MAX_M              15 /* 2KB */
  89#define BCH_MAX_T              64 /* 64 bit correction */
  90#endif
  91
  92#define BCH_ECC_WORDS(_p)      DIV_ROUND_UP(GF_M(_p)*GF_T(_p), 32)
  93#define BCH_ECC_BYTES(_p)      DIV_ROUND_UP(GF_M(_p)*GF_T(_p), 8)
  94
  95#define BCH_ECC_MAX_WORDS      DIV_ROUND_UP(BCH_MAX_M * BCH_MAX_T, 32)
  96
  97#ifndef dbg
  98#define dbg(_fmt, args...)     do {} while (0)
  99#endif
 100
 101/*
 102 * represent a polynomial over GF(2^m)
 103 */
 104struct gf_poly {
 105	unsigned int deg;    /* polynomial degree */
 106	unsigned int c[];   /* polynomial terms */
 107};
 108
 109/* given its degree, compute a polynomial size in bytes */
 110#define GF_POLY_SZ(_d) (sizeof(struct gf_poly)+((_d)+1)*sizeof(unsigned int))
 111
 112/* polynomial of degree 1 */
 113struct gf_poly_deg1 {
 114	struct gf_poly poly;
 115	unsigned int   c[2];
 116};
 117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 118static u8 swap_bits(struct bch_control *bch, u8 in)
 119{
 120	if (!bch->swap_bits)
 121		return in;
 122
 123	return bitrev8(in);
 124}
 125
 126/*
 127 * same as bch_encode(), but process input data one byte at a time
 128 */
 129static void bch_encode_unaligned(struct bch_control *bch,
 130				 const unsigned char *data, unsigned int len,
 131				 uint32_t *ecc)
 132{
 133	int i;
 134	const uint32_t *p;
 135	const int l = BCH_ECC_WORDS(bch)-1;
 136
 137	while (len--) {
 138		u8 tmp = swap_bits(bch, *data++);
 139
 140		p = bch->mod8_tab + (l+1)*(((ecc[0] >> 24)^(tmp)) & 0xff);
 141
 142		for (i = 0; i < l; i++)
 143			ecc[i] = ((ecc[i] << 8)|(ecc[i+1] >> 24))^(*p++);
 144
 145		ecc[l] = (ecc[l] << 8)^(*p);
 146	}
 147}
 148
 149/*
 150 * convert ecc bytes to aligned, zero-padded 32-bit ecc words
 151 */
 152static void load_ecc8(struct bch_control *bch, uint32_t *dst,
 153		      const uint8_t *src)
 154{
 155	uint8_t pad[4] = {0, 0, 0, 0};
 156	unsigned int i, nwords = BCH_ECC_WORDS(bch)-1;
 157
 158	for (i = 0; i < nwords; i++, src += 4)
 159		dst[i] = ((u32)swap_bits(bch, src[0]) << 24) |
 160			((u32)swap_bits(bch, src[1]) << 16) |
 161			((u32)swap_bits(bch, src[2]) << 8) |
 162			swap_bits(bch, src[3]);
 163
 164	memcpy(pad, src, BCH_ECC_BYTES(bch)-4*nwords);
 165	dst[nwords] = ((u32)swap_bits(bch, pad[0]) << 24) |
 166		((u32)swap_bits(bch, pad[1]) << 16) |
 167		((u32)swap_bits(bch, pad[2]) << 8) |
 168		swap_bits(bch, pad[3]);
 169}
 170
 171/*
 172 * convert 32-bit ecc words to ecc bytes
 173 */
 174static void store_ecc8(struct bch_control *bch, uint8_t *dst,
 175		       const uint32_t *src)
 176{
 177	uint8_t pad[4];
 178	unsigned int i, nwords = BCH_ECC_WORDS(bch)-1;
 179
 180	for (i = 0; i < nwords; i++) {
 181		*dst++ = swap_bits(bch, src[i] >> 24);
 182		*dst++ = swap_bits(bch, src[i] >> 16);
 183		*dst++ = swap_bits(bch, src[i] >> 8);
 184		*dst++ = swap_bits(bch, src[i]);
 185	}
 186	pad[0] = swap_bits(bch, src[nwords] >> 24);
 187	pad[1] = swap_bits(bch, src[nwords] >> 16);
 188	pad[2] = swap_bits(bch, src[nwords] >> 8);
 189	pad[3] = swap_bits(bch, src[nwords]);
 190	memcpy(dst, pad, BCH_ECC_BYTES(bch)-4*nwords);
 191}
 192
 193/**
 194 * bch_encode - calculate BCH ecc parity of data
 195 * @bch:   BCH control structure
 196 * @data:  data to encode
 197 * @len:   data length in bytes
 198 * @ecc:   ecc parity data, must be initialized by caller
 199 *
 200 * The @ecc parity array is used both as input and output parameter, in order to
 201 * allow incremental computations. It should be of the size indicated by member
 202 * @ecc_bytes of @bch, and should be initialized to 0 before the first call.
 203 *
 204 * The exact number of computed ecc parity bits is given by member @ecc_bits of
 205 * @bch; it may be less than m*t for large values of t.
 206 */
 207void bch_encode(struct bch_control *bch, const uint8_t *data,
 208		unsigned int len, uint8_t *ecc)
 209{
 210	const unsigned int l = BCH_ECC_WORDS(bch)-1;
 211	unsigned int i, mlen;
 212	unsigned long m;
 213	uint32_t w, r[BCH_ECC_MAX_WORDS];
 214	const size_t r_bytes = BCH_ECC_WORDS(bch) * sizeof(*r);
 215	const uint32_t * const tab0 = bch->mod8_tab;
 216	const uint32_t * const tab1 = tab0 + 256*(l+1);
 217	const uint32_t * const tab2 = tab1 + 256*(l+1);
 218	const uint32_t * const tab3 = tab2 + 256*(l+1);
 219	const uint32_t *pdata, *p0, *p1, *p2, *p3;
 220
 221	if (WARN_ON(r_bytes > sizeof(r)))
 222		return;
 223
 224	if (ecc) {
 225		/* load ecc parity bytes into internal 32-bit buffer */
 226		load_ecc8(bch, bch->ecc_buf, ecc);
 227	} else {
 228		memset(bch->ecc_buf, 0, r_bytes);
 229	}
 230
 231	/* process first unaligned data bytes */
 232	m = ((unsigned long)data) & 3;
 233	if (m) {
 234		mlen = (len < (4-m)) ? len : 4-m;
 235		bch_encode_unaligned(bch, data, mlen, bch->ecc_buf);
 236		data += mlen;
 237		len  -= mlen;
 238	}
 239
 240	/* process 32-bit aligned data words */
 241	pdata = (uint32_t *)data;
 242	mlen  = len/4;
 243	data += 4*mlen;
 244	len  -= 4*mlen;
 245	memcpy(r, bch->ecc_buf, r_bytes);
 246
 247	/*
 248	 * split each 32-bit word into 4 polynomials of weight 8 as follows:
 249	 *
 250	 * 31 ...24  23 ...16  15 ... 8  7 ... 0
 251	 * xxxxxxxx  yyyyyyyy  zzzzzzzz  tttttttt
 252	 *                               tttttttt  mod g = r0 (precomputed)
 253	 *                     zzzzzzzz  00000000  mod g = r1 (precomputed)
 254	 *           yyyyyyyy  00000000  00000000  mod g = r2 (precomputed)
 255	 * xxxxxxxx  00000000  00000000  00000000  mod g = r3 (precomputed)
 256	 * xxxxxxxx  yyyyyyyy  zzzzzzzz  tttttttt  mod g = r0^r1^r2^r3
 257	 */
 258	while (mlen--) {
 259		/* input data is read in big-endian format */
 260		w = cpu_to_be32(*pdata++);
 261		if (bch->swap_bits)
 262			w = (u32)swap_bits(bch, w) |
 263			    ((u32)swap_bits(bch, w >> 8) << 8) |
 264			    ((u32)swap_bits(bch, w >> 16) << 16) |
 265			    ((u32)swap_bits(bch, w >> 24) << 24);
 266		w ^= r[0];
 267		p0 = tab0 + (l+1)*((w >>  0) & 0xff);
 268		p1 = tab1 + (l+1)*((w >>  8) & 0xff);
 269		p2 = tab2 + (l+1)*((w >> 16) & 0xff);
 270		p3 = tab3 + (l+1)*((w >> 24) & 0xff);
 271
 272		for (i = 0; i < l; i++)
 273			r[i] = r[i+1]^p0[i]^p1[i]^p2[i]^p3[i];
 274
 275		r[l] = p0[l]^p1[l]^p2[l]^p3[l];
 276	}
 277	memcpy(bch->ecc_buf, r, r_bytes);
 278
 279	/* process last unaligned bytes */
 280	if (len)
 281		bch_encode_unaligned(bch, data, len, bch->ecc_buf);
 282
 283	/* store ecc parity bytes into original parity buffer */
 284	if (ecc)
 285		store_ecc8(bch, ecc, bch->ecc_buf);
 286}
 287EXPORT_SYMBOL_GPL(bch_encode);
 288
 289static inline int modulo(struct bch_control *bch, unsigned int v)
 290{
 291	const unsigned int n = GF_N(bch);
 292	while (v >= n) {
 293		v -= n;
 294		v = (v & n) + (v >> GF_M(bch));
 295	}
 296	return v;
 297}
 298
 299/*
 300 * shorter and faster modulo function, only works when v < 2N.
 301 */
 302static inline int mod_s(struct bch_control *bch, unsigned int v)
 303{
 304	const unsigned int n = GF_N(bch);
 305	return (v < n) ? v : v-n;
 306}
 307
 308static inline int deg(unsigned int poly)
 309{
 310	/* polynomial degree is the most-significant bit index */
 311	return fls(poly)-1;
 312}
 313
 314static inline int parity(unsigned int x)
 315{
 316	/*
 317	 * public domain code snippet, lifted from
 318	 * http://www-graphics.stanford.edu/~seander/bithacks.html
 319	 */
 320	x ^= x >> 1;
 321	x ^= x >> 2;
 322	x = (x & 0x11111111U) * 0x11111111U;
 323	return (x >> 28) & 1;
 324}
 325
 326/* Galois field basic operations: multiply, divide, inverse, etc. */
 327
 328static inline unsigned int gf_mul(struct bch_control *bch, unsigned int a,
 329				  unsigned int b)
 330{
 331	return (a && b) ? bch->a_pow_tab[mod_s(bch, bch->a_log_tab[a]+
 332					       bch->a_log_tab[b])] : 0;
 333}
 334
 335static inline unsigned int gf_sqr(struct bch_control *bch, unsigned int a)
 336{
 337	return a ? bch->a_pow_tab[mod_s(bch, 2*bch->a_log_tab[a])] : 0;
 338}
 339
 340static inline unsigned int gf_div(struct bch_control *bch, unsigned int a,
 341				  unsigned int b)
 342{
 343	return a ? bch->a_pow_tab[mod_s(bch, bch->a_log_tab[a]+
 344					GF_N(bch)-bch->a_log_tab[b])] : 0;
 345}
 346
 347static inline unsigned int gf_inv(struct bch_control *bch, unsigned int a)
 348{
 349	return bch->a_pow_tab[GF_N(bch)-bch->a_log_tab[a]];
 350}
 351
 352static inline unsigned int a_pow(struct bch_control *bch, int i)
 353{
 354	return bch->a_pow_tab[modulo(bch, i)];
 355}
 356
 357static inline int a_log(struct bch_control *bch, unsigned int x)
 358{
 359	return bch->a_log_tab[x];
 360}
 361
 362static inline int a_ilog(struct bch_control *bch, unsigned int x)
 363{
 364	return mod_s(bch, GF_N(bch)-bch->a_log_tab[x]);
 365}
 366
 367/*
 368 * compute 2t syndromes of ecc polynomial, i.e. ecc(a^j) for j=1..2t
 369 */
 370static void compute_syndromes(struct bch_control *bch, uint32_t *ecc,
 371			      unsigned int *syn)
 372{
 373	int i, j, s;
 374	unsigned int m;
 375	uint32_t poly;
 376	const int t = GF_T(bch);
 377
 378	s = bch->ecc_bits;
 379
 380	/* make sure extra bits in last ecc word are cleared */
 381	m = ((unsigned int)s) & 31;
 382	if (m)
 383		ecc[s/32] &= ~((1u << (32-m))-1);
 384	memset(syn, 0, 2*t*sizeof(*syn));
 385
 386	/* compute v(a^j) for j=1 .. 2t-1 */
 387	do {
 388		poly = *ecc++;
 389		s -= 32;
 390		while (poly) {
 391			i = deg(poly);
 392			for (j = 0; j < 2*t; j += 2)
 393				syn[j] ^= a_pow(bch, (j+1)*(i+s));
 394
 395			poly ^= (1 << i);
 396		}
 397	} while (s > 0);
 398
 399	/* v(a^(2j)) = v(a^j)^2 */
 400	for (j = 0; j < t; j++)
 401		syn[2*j+1] = gf_sqr(bch, syn[j]);
 402}
 403
 404static void gf_poly_copy(struct gf_poly *dst, struct gf_poly *src)
 405{
 406	memcpy(dst, src, GF_POLY_SZ(src->deg));
 407}
 408
 409static int compute_error_locator_polynomial(struct bch_control *bch,
 410					    const unsigned int *syn)
 411{
 412	const unsigned int t = GF_T(bch);
 413	const unsigned int n = GF_N(bch);
 414	unsigned int i, j, tmp, l, pd = 1, d = syn[0];
 415	struct gf_poly *elp = bch->elp;
 416	struct gf_poly *pelp = bch->poly_2t[0];
 417	struct gf_poly *elp_copy = bch->poly_2t[1];
 418	int k, pp = -1;
 419
 420	memset(pelp, 0, GF_POLY_SZ(2*t));
 421	memset(elp, 0, GF_POLY_SZ(2*t));
 422
 423	pelp->deg = 0;
 424	pelp->c[0] = 1;
 425	elp->deg = 0;
 426	elp->c[0] = 1;
 427
 428	/* use simplified binary Berlekamp-Massey algorithm */
 429	for (i = 0; (i < t) && (elp->deg <= t); i++) {
 430		if (d) {
 431			k = 2*i-pp;
 432			gf_poly_copy(elp_copy, elp);
 433			/* e[i+1](X) = e[i](X)+di*dp^-1*X^2(i-p)*e[p](X) */
 434			tmp = a_log(bch, d)+n-a_log(bch, pd);
 435			for (j = 0; j <= pelp->deg; j++) {
 436				if (pelp->c[j]) {
 437					l = a_log(bch, pelp->c[j]);
 438					elp->c[j+k] ^= a_pow(bch, tmp+l);
 439				}
 440			}
 441			/* compute l[i+1] = max(l[i]->c[l[p]+2*(i-p]) */
 442			tmp = pelp->deg+k;
 443			if (tmp > elp->deg) {
 444				elp->deg = tmp;
 445				gf_poly_copy(pelp, elp_copy);
 446				pd = d;
 447				pp = 2*i;
 448			}
 449		}
 450		/* di+1 = S(2i+3)+elp[i+1].1*S(2i+2)+...+elp[i+1].lS(2i+3-l) */
 451		if (i < t-1) {
 452			d = syn[2*i+2];
 453			for (j = 1; j <= elp->deg; j++)
 454				d ^= gf_mul(bch, elp->c[j], syn[2*i+2-j]);
 455		}
 456	}
 457	dbg("elp=%s\n", gf_poly_str(elp));
 458	return (elp->deg > t) ? -1 : (int)elp->deg;
 459}
 460
 461/*
 462 * solve a m x m linear system in GF(2) with an expected number of solutions,
 463 * and return the number of found solutions
 464 */
 465static int solve_linear_system(struct bch_control *bch, unsigned int *rows,
 466			       unsigned int *sol, int nsol)
 467{
 468	const int m = GF_M(bch);
 469	unsigned int tmp, mask;
 470	int rem, c, r, p, k, param[BCH_MAX_M];
 471
 472	k = 0;
 473	mask = 1 << m;
 474
 475	/* Gaussian elimination */
 476	for (c = 0; c < m; c++) {
 477		rem = 0;
 478		p = c-k;
 479		/* find suitable row for elimination */
 480		for (r = p; r < m; r++) {
 481			if (rows[r] & mask) {
 482				if (r != p) {
 483					tmp = rows[r];
 484					rows[r] = rows[p];
 485					rows[p] = tmp;
 486				}
 487				rem = r+1;
 488				break;
 489			}
 490		}
 491		if (rem) {
 492			/* perform elimination on remaining rows */
 493			tmp = rows[p];
 494			for (r = rem; r < m; r++) {
 495				if (rows[r] & mask)
 496					rows[r] ^= tmp;
 497			}
 498		} else {
 499			/* elimination not needed, store defective row index */
 500			param[k++] = c;
 501		}
 502		mask >>= 1;
 503	}
 504	/* rewrite system, inserting fake parameter rows */
 505	if (k > 0) {
 506		p = k;
 507		for (r = m-1; r >= 0; r--) {
 508			if ((r > m-1-k) && rows[r])
 509				/* system has no solution */
 510				return 0;
 511
 512			rows[r] = (p && (r == param[p-1])) ?
 513				p--, 1u << (m-r) : rows[r-p];
 514		}
 515	}
 516
 517	if (nsol != (1 << k))
 518		/* unexpected number of solutions */
 519		return 0;
 520
 521	for (p = 0; p < nsol; p++) {
 522		/* set parameters for p-th solution */
 523		for (c = 0; c < k; c++)
 524			rows[param[c]] = (rows[param[c]] & ~1)|((p >> c) & 1);
 525
 526		/* compute unique solution */
 527		tmp = 0;
 528		for (r = m-1; r >= 0; r--) {
 529			mask = rows[r] & (tmp|1);
 530			tmp |= parity(mask) << (m-r);
 531		}
 532		sol[p] = tmp >> 1;
 533	}
 534	return nsol;
 535}
 536
 537/*
 538 * this function builds and solves a linear system for finding roots of a degree
 539 * 4 affine monic polynomial X^4+aX^2+bX+c over GF(2^m).
 540 */
 541static int find_affine4_roots(struct bch_control *bch, unsigned int a,
 542			      unsigned int b, unsigned int c,
 543			      unsigned int *roots)
 544{
 545	int i, j, k;
 546	const int m = GF_M(bch);
 547	unsigned int mask = 0xff, t, rows[16] = {0,};
 548
 549	j = a_log(bch, b);
 550	k = a_log(bch, a);
 551	rows[0] = c;
 552
 553	/* build linear system to solve X^4+aX^2+bX+c = 0 */
 554	for (i = 0; i < m; i++) {
 555		rows[i+1] = bch->a_pow_tab[4*i]^
 556			(a ? bch->a_pow_tab[mod_s(bch, k)] : 0)^
 557			(b ? bch->a_pow_tab[mod_s(bch, j)] : 0);
 558		j++;
 559		k += 2;
 560	}
 561	/*
 562	 * transpose 16x16 matrix before passing it to linear solver
 563	 * warning: this code assumes m < 16
 564	 */
 565	for (j = 8; j != 0; j >>= 1, mask ^= (mask << j)) {
 566		for (k = 0; k < 16; k = (k+j+1) & ~j) {
 567			t = ((rows[k] >> j)^rows[k+j]) & mask;
 568			rows[k] ^= (t << j);
 569			rows[k+j] ^= t;
 570		}
 571	}
 572	return solve_linear_system(bch, rows, roots, 4);
 573}
 574
 575/*
 576 * compute root r of a degree 1 polynomial over GF(2^m) (returned as log(1/r))
 577 */
 578static int find_poly_deg1_roots(struct bch_control *bch, struct gf_poly *poly,
 579				unsigned int *roots)
 580{
 581	int n = 0;
 582
 583	if (poly->c[0])
 584		/* poly[X] = bX+c with c!=0, root=c/b */
 585		roots[n++] = mod_s(bch, GF_N(bch)-bch->a_log_tab[poly->c[0]]+
 586				   bch->a_log_tab[poly->c[1]]);
 587	return n;
 588}
 589
 590/*
 591 * compute roots of a degree 2 polynomial over GF(2^m)
 592 */
 593static int find_poly_deg2_roots(struct bch_control *bch, struct gf_poly *poly,
 594				unsigned int *roots)
 595{
 596	int n = 0, i, l0, l1, l2;
 597	unsigned int u, v, r;
 598
 599	if (poly->c[0] && poly->c[1]) {
 600
 601		l0 = bch->a_log_tab[poly->c[0]];
 602		l1 = bch->a_log_tab[poly->c[1]];
 603		l2 = bch->a_log_tab[poly->c[2]];
 604
 605		/* using z=a/bX, transform aX^2+bX+c into z^2+z+u (u=ac/b^2) */
 606		u = a_pow(bch, l0+l2+2*(GF_N(bch)-l1));
 607		/*
 608		 * let u = sum(li.a^i) i=0..m-1; then compute r = sum(li.xi):
 609		 * r^2+r = sum(li.(xi^2+xi)) = sum(li.(a^i+Tr(a^i).a^k)) =
 610		 * u + sum(li.Tr(a^i).a^k) = u+a^k.Tr(sum(li.a^i)) = u+a^k.Tr(u)
 611		 * i.e. r and r+1 are roots iff Tr(u)=0
 612		 */
 613		r = 0;
 614		v = u;
 615		while (v) {
 616			i = deg(v);
 617			r ^= bch->xi_tab[i];
 618			v ^= (1 << i);
 619		}
 620		/* verify root */
 621		if ((gf_sqr(bch, r)^r) == u) {
 622			/* reverse z=a/bX transformation and compute log(1/r) */
 623			roots[n++] = modulo(bch, 2*GF_N(bch)-l1-
 624					    bch->a_log_tab[r]+l2);
 625			roots[n++] = modulo(bch, 2*GF_N(bch)-l1-
 626					    bch->a_log_tab[r^1]+l2);
 627		}
 628	}
 629	return n;
 630}
 631
 632/*
 633 * compute roots of a degree 3 polynomial over GF(2^m)
 634 */
 635static int find_poly_deg3_roots(struct bch_control *bch, struct gf_poly *poly,
 636				unsigned int *roots)
 637{
 638	int i, n = 0;
 639	unsigned int a, b, c, a2, b2, c2, e3, tmp[4];
 640
 641	if (poly->c[0]) {
 642		/* transform polynomial into monic X^3 + a2X^2 + b2X + c2 */
 643		e3 = poly->c[3];
 644		c2 = gf_div(bch, poly->c[0], e3);
 645		b2 = gf_div(bch, poly->c[1], e3);
 646		a2 = gf_div(bch, poly->c[2], e3);
 647
 648		/* (X+a2)(X^3+a2X^2+b2X+c2) = X^4+aX^2+bX+c (affine) */
 649		c = gf_mul(bch, a2, c2);           /* c = a2c2      */
 650		b = gf_mul(bch, a2, b2)^c2;        /* b = a2b2 + c2 */
 651		a = gf_sqr(bch, a2)^b2;            /* a = a2^2 + b2 */
 652
 653		/* find the 4 roots of this affine polynomial */
 654		if (find_affine4_roots(bch, a, b, c, tmp) == 4) {
 655			/* remove a2 from final list of roots */
 656			for (i = 0; i < 4; i++) {
 657				if (tmp[i] != a2)
 658					roots[n++] = a_ilog(bch, tmp[i]);
 659			}
 660		}
 661	}
 662	return n;
 663}
 664
 665/*
 666 * compute roots of a degree 4 polynomial over GF(2^m)
 667 */
 668static int find_poly_deg4_roots(struct bch_control *bch, struct gf_poly *poly,
 669				unsigned int *roots)
 670{
 671	int i, l, n = 0;
 672	unsigned int a, b, c, d, e = 0, f, a2, b2, c2, e4;
 673
 674	if (poly->c[0] == 0)
 675		return 0;
 676
 677	/* transform polynomial into monic X^4 + aX^3 + bX^2 + cX + d */
 678	e4 = poly->c[4];
 679	d = gf_div(bch, poly->c[0], e4);
 680	c = gf_div(bch, poly->c[1], e4);
 681	b = gf_div(bch, poly->c[2], e4);
 682	a = gf_div(bch, poly->c[3], e4);
 683
 684	/* use Y=1/X transformation to get an affine polynomial */
 685	if (a) {
 686		/* first, eliminate cX by using z=X+e with ae^2+c=0 */
 687		if (c) {
 688			/* compute e such that e^2 = c/a */
 689			f = gf_div(bch, c, a);
 690			l = a_log(bch, f);
 691			l += (l & 1) ? GF_N(bch) : 0;
 692			e = a_pow(bch, l/2);
 693			/*
 694			 * use transformation z=X+e:
 695			 * z^4+e^4 + a(z^3+ez^2+e^2z+e^3) + b(z^2+e^2) +cz+ce+d
 696			 * z^4 + az^3 + (ae+b)z^2 + (ae^2+c)z+e^4+be^2+ae^3+ce+d
 697			 * z^4 + az^3 + (ae+b)z^2 + e^4+be^2+d
 698			 * z^4 + az^3 +     b'z^2 + d'
 699			 */
 700			d = a_pow(bch, 2*l)^gf_mul(bch, b, f)^d;
 701			b = gf_mul(bch, a, e)^b;
 702		}
 703		/* now, use Y=1/X to get Y^4 + b/dY^2 + a/dY + 1/d */
 704		if (d == 0)
 705			/* assume all roots have multiplicity 1 */
 706			return 0;
 707
 708		c2 = gf_inv(bch, d);
 709		b2 = gf_div(bch, a, d);
 710		a2 = gf_div(bch, b, d);
 711	} else {
 712		/* polynomial is already affine */
 713		c2 = d;
 714		b2 = c;
 715		a2 = b;
 716	}
 717	/* find the 4 roots of this affine polynomial */
 718	if (find_affine4_roots(bch, a2, b2, c2, roots) == 4) {
 719		for (i = 0; i < 4; i++) {
 720			/* post-process roots (reverse transformations) */
 721			f = a ? gf_inv(bch, roots[i]) : roots[i];
 722			roots[i] = a_ilog(bch, f^e);
 723		}
 724		n = 4;
 725	}
 726	return n;
 727}
 728
 729/*
 730 * build monic, log-based representation of a polynomial
 731 */
 732static void gf_poly_logrep(struct bch_control *bch,
 733			   const struct gf_poly *a, int *rep)
 734{
 735	int i, d = a->deg, l = GF_N(bch)-a_log(bch, a->c[a->deg]);
 736
 737	/* represent 0 values with -1; warning, rep[d] is not set to 1 */
 738	for (i = 0; i < d; i++)
 739		rep[i] = a->c[i] ? mod_s(bch, a_log(bch, a->c[i])+l) : -1;
 740}
 741
 742/*
 743 * compute polynomial Euclidean division remainder in GF(2^m)[X]
 744 */
 745static void gf_poly_mod(struct bch_control *bch, struct gf_poly *a,
 746			const struct gf_poly *b, int *rep)
 747{
 748	int la, p, m;
 749	unsigned int i, j, *c = a->c;
 750	const unsigned int d = b->deg;
 751
 752	if (a->deg < d)
 753		return;
 754
 755	/* reuse or compute log representation of denominator */
 756	if (!rep) {
 757		rep = bch->cache;
 758		gf_poly_logrep(bch, b, rep);
 759	}
 760
 761	for (j = a->deg; j >= d; j--) {
 762		if (c[j]) {
 763			la = a_log(bch, c[j]);
 764			p = j-d;
 765			for (i = 0; i < d; i++, p++) {
 766				m = rep[i];
 767				if (m >= 0)
 768					c[p] ^= bch->a_pow_tab[mod_s(bch,
 769								     m+la)];
 770			}
 771		}
 772	}
 773	a->deg = d-1;
 774	while (!c[a->deg] && a->deg)
 775		a->deg--;
 776}
 777
 778/*
 779 * compute polynomial Euclidean division quotient in GF(2^m)[X]
 780 */
 781static void gf_poly_div(struct bch_control *bch, struct gf_poly *a,
 782			const struct gf_poly *b, struct gf_poly *q)
 783{
 784	if (a->deg >= b->deg) {
 785		q->deg = a->deg-b->deg;
 786		/* compute a mod b (modifies a) */
 787		gf_poly_mod(bch, a, b, NULL);
 788		/* quotient is stored in upper part of polynomial a */
 789		memcpy(q->c, &a->c[b->deg], (1+q->deg)*sizeof(unsigned int));
 790	} else {
 791		q->deg = 0;
 792		q->c[0] = 0;
 793	}
 794}
 795
 796/*
 797 * compute polynomial GCD (Greatest Common Divisor) in GF(2^m)[X]
 798 */
 799static struct gf_poly *gf_poly_gcd(struct bch_control *bch, struct gf_poly *a,
 800				   struct gf_poly *b)
 801{
 802	struct gf_poly *tmp;
 803
 804	dbg("gcd(%s,%s)=", gf_poly_str(a), gf_poly_str(b));
 805
 806	if (a->deg < b->deg) {
 807		tmp = b;
 808		b = a;
 809		a = tmp;
 810	}
 811
 812	while (b->deg > 0) {
 813		gf_poly_mod(bch, a, b, NULL);
 814		tmp = b;
 815		b = a;
 816		a = tmp;
 817	}
 818
 819	dbg("%s\n", gf_poly_str(a));
 820
 821	return a;
 822}
 823
 824/*
 825 * Given a polynomial f and an integer k, compute Tr(a^kX) mod f
 826 * This is used in Berlekamp Trace algorithm for splitting polynomials
 827 */
 828static void compute_trace_bk_mod(struct bch_control *bch, int k,
 829				 const struct gf_poly *f, struct gf_poly *z,
 830				 struct gf_poly *out)
 831{
 832	const int m = GF_M(bch);
 833	int i, j;
 834
 835	/* z contains z^2j mod f */
 836	z->deg = 1;
 837	z->c[0] = 0;
 838	z->c[1] = bch->a_pow_tab[k];
 839
 840	out->deg = 0;
 841	memset(out, 0, GF_POLY_SZ(f->deg));
 842
 843	/* compute f log representation only once */
 844	gf_poly_logrep(bch, f, bch->cache);
 845
 846	for (i = 0; i < m; i++) {
 847		/* add a^(k*2^i)(z^(2^i) mod f) and compute (z^(2^i) mod f)^2 */
 848		for (j = z->deg; j >= 0; j--) {
 849			out->c[j] ^= z->c[j];
 850			z->c[2*j] = gf_sqr(bch, z->c[j]);
 851			z->c[2*j+1] = 0;
 852		}
 853		if (z->deg > out->deg)
 854			out->deg = z->deg;
 855
 856		if (i < m-1) {
 857			z->deg *= 2;
 858			/* z^(2(i+1)) mod f = (z^(2^i) mod f)^2 mod f */
 859			gf_poly_mod(bch, z, f, bch->cache);
 860		}
 861	}
 862	while (!out->c[out->deg] && out->deg)
 863		out->deg--;
 864
 865	dbg("Tr(a^%d.X) mod f = %s\n", k, gf_poly_str(out));
 866}
 867
 868/*
 869 * factor a polynomial using Berlekamp Trace algorithm (BTA)
 870 */
 871static void factor_polynomial(struct bch_control *bch, int k, struct gf_poly *f,
 872			      struct gf_poly **g, struct gf_poly **h)
 873{
 874	struct gf_poly *f2 = bch->poly_2t[0];
 875	struct gf_poly *q  = bch->poly_2t[1];
 876	struct gf_poly *tk = bch->poly_2t[2];
 877	struct gf_poly *z  = bch->poly_2t[3];
 878	struct gf_poly *gcd;
 879
 880	dbg("factoring %s...\n", gf_poly_str(f));
 881
 882	*g = f;
 883	*h = NULL;
 884
 885	/* tk = Tr(a^k.X) mod f */
 886	compute_trace_bk_mod(bch, k, f, z, tk);
 887
 888	if (tk->deg > 0) {
 889		/* compute g = gcd(f, tk) (destructive operation) */
 890		gf_poly_copy(f2, f);
 891		gcd = gf_poly_gcd(bch, f2, tk);
 892		if (gcd->deg < f->deg) {
 893			/* compute h=f/gcd(f,tk); this will modify f and q */
 894			gf_poly_div(bch, f, gcd, q);
 895			/* store g and h in-place (clobbering f) */
 896			*h = &((struct gf_poly_deg1 *)f)[gcd->deg].poly;
 897			gf_poly_copy(*g, gcd);
 898			gf_poly_copy(*h, q);
 899		}
 900	}
 901}
 902
 903/*
 904 * find roots of a polynomial, using BTZ algorithm; see the beginning of this
 905 * file for details
 906 */
 907static int find_poly_roots(struct bch_control *bch, unsigned int k,
 908			   struct gf_poly *poly, unsigned int *roots)
 909{
 910	int cnt;
 911	struct gf_poly *f1, *f2;
 912
 913	switch (poly->deg) {
 914		/* handle low degree polynomials with ad hoc techniques */
 915	case 1:
 916		cnt = find_poly_deg1_roots(bch, poly, roots);
 917		break;
 918	case 2:
 919		cnt = find_poly_deg2_roots(bch, poly, roots);
 920		break;
 921	case 3:
 922		cnt = find_poly_deg3_roots(bch, poly, roots);
 923		break;
 924	case 4:
 925		cnt = find_poly_deg4_roots(bch, poly, roots);
 926		break;
 927	default:
 928		/* factor polynomial using Berlekamp Trace Algorithm (BTA) */
 929		cnt = 0;
 930		if (poly->deg && (k <= GF_M(bch))) {
 931			factor_polynomial(bch, k, poly, &f1, &f2);
 932			if (f1)
 933				cnt += find_poly_roots(bch, k+1, f1, roots);
 934			if (f2)
 935				cnt += find_poly_roots(bch, k+1, f2, roots+cnt);
 936		}
 937		break;
 938	}
 939	return cnt;
 940}
 941
 942#if defined(USE_CHIEN_SEARCH)
 943/*
 944 * exhaustive root search (Chien) implementation - not used, included only for
 945 * reference/comparison tests
 946 */
 947static int chien_search(struct bch_control *bch, unsigned int len,
 948			struct gf_poly *p, unsigned int *roots)
 949{
 950	int m;
 951	unsigned int i, j, syn, syn0, count = 0;
 952	const unsigned int k = 8*len+bch->ecc_bits;
 953
 954	/* use a log-based representation of polynomial */
 955	gf_poly_logrep(bch, p, bch->cache);
 956	bch->cache[p->deg] = 0;
 957	syn0 = gf_div(bch, p->c[0], p->c[p->deg]);
 958
 959	for (i = GF_N(bch)-k+1; i <= GF_N(bch); i++) {
 960		/* compute elp(a^i) */
 961		for (j = 1, syn = syn0; j <= p->deg; j++) {
 962			m = bch->cache[j];
 963			if (m >= 0)
 964				syn ^= a_pow(bch, m+j*i);
 965		}
 966		if (syn == 0) {
 967			roots[count++] = GF_N(bch)-i;
 968			if (count == p->deg)
 969				break;
 970		}
 971	}
 972	return (count == p->deg) ? count : 0;
 973}
 974#define find_poly_roots(_p, _k, _elp, _loc) chien_search(_p, len, _elp, _loc)
 975#endif /* USE_CHIEN_SEARCH */
 976
 977/**
 978 * bch_decode - decode received codeword and find bit error locations
 979 * @bch:      BCH control structure
 980 * @data:     received data, ignored if @calc_ecc is provided
 981 * @len:      data length in bytes, must always be provided
 982 * @recv_ecc: received ecc, if NULL then assume it was XORed in @calc_ecc
 983 * @calc_ecc: calculated ecc, if NULL then calc_ecc is computed from @data
 984 * @syn:      hw computed syndrome data (if NULL, syndrome is calculated)
 985 * @errloc:   output array of error locations
 986 *
 987 * Returns:
 988 *  The number of errors found, or -EBADMSG if decoding failed, or -EINVAL if
 989 *  invalid parameters were provided
 990 *
 991 * Depending on the available hw BCH support and the need to compute @calc_ecc
 992 * separately (using bch_encode()), this function should be called with one of
 993 * the following parameter configurations -
 994 *
 995 * by providing @data and @recv_ecc only:
 996 *   bch_decode(@bch, @data, @len, @recv_ecc, NULL, NULL, @errloc)
 997 *
 998 * by providing @recv_ecc and @calc_ecc:
 999 *   bch_decode(@bch, NULL, @len, @recv_ecc, @calc_ecc, NULL, @errloc)
1000 *
1001 * by providing ecc = recv_ecc XOR calc_ecc:
1002 *   bch_decode(@bch, NULL, @len, NULL, ecc, NULL, @errloc)
1003 *
1004 * by providing syndrome results @syn:
1005 *   bch_decode(@bch, NULL, @len, NULL, NULL, @syn, @errloc)
1006 *
1007 * Once bch_decode() has successfully returned with a positive value, error
1008 * locations returned in array @errloc should be interpreted as follows -
1009 *
1010 * if (errloc[n] >= 8*len), then n-th error is located in ecc (no need for
1011 * data correction)
1012 *
1013 * if (errloc[n] < 8*len), then n-th error is located in data and can be
1014 * corrected with statement data[errloc[n]/8] ^= 1 << (errloc[n] % 8);
1015 *
1016 * Note that this function does not perform any data correction by itself, it
1017 * merely indicates error locations.
1018 */
1019int bch_decode(struct bch_control *bch, const uint8_t *data, unsigned int len,
1020	       const uint8_t *recv_ecc, const uint8_t *calc_ecc,
1021	       const unsigned int *syn, unsigned int *errloc)
1022{
1023	const unsigned int ecc_words = BCH_ECC_WORDS(bch);
1024	unsigned int nbits;
1025	int i, err, nroots;
1026	uint32_t sum;
1027
1028	/* sanity check: make sure data length can be handled */
1029	if (8*len > (bch->n-bch->ecc_bits))
1030		return -EINVAL;
1031
1032	/* if caller does not provide syndromes, compute them */
1033	if (!syn) {
1034		if (!calc_ecc) {
1035			/* compute received data ecc into an internal buffer */
1036			if (!data || !recv_ecc)
1037				return -EINVAL;
1038			bch_encode(bch, data, len, NULL);
1039		} else {
1040			/* load provided calculated ecc */
1041			load_ecc8(bch, bch->ecc_buf, calc_ecc);
1042		}
1043		/* load received ecc or assume it was XORed in calc_ecc */
1044		if (recv_ecc) {
1045			load_ecc8(bch, bch->ecc_buf2, recv_ecc);
1046			/* XOR received and calculated ecc */
1047			for (i = 0, sum = 0; i < (int)ecc_words; i++) {
1048				bch->ecc_buf[i] ^= bch->ecc_buf2[i];
1049				sum |= bch->ecc_buf[i];
1050			}
1051			if (!sum)
1052				/* no error found */
1053				return 0;
1054		}
1055		compute_syndromes(bch, bch->ecc_buf, bch->syn);
1056		syn = bch->syn;
1057	}
1058
1059	err = compute_error_locator_polynomial(bch, syn);
1060	if (err > 0) {
1061		nroots = find_poly_roots(bch, 1, bch->elp, errloc);
1062		if (err != nroots)
1063			err = -1;
1064	}
1065	if (err > 0) {
1066		/* post-process raw error locations for easier correction */
1067		nbits = (len*8)+bch->ecc_bits;
1068		for (i = 0; i < err; i++) {
1069			if (errloc[i] >= nbits) {
1070				err = -1;
1071				break;
1072			}
1073			errloc[i] = nbits-1-errloc[i];
1074			if (!bch->swap_bits)
1075				errloc[i] = (errloc[i] & ~7) |
1076					    (7-(errloc[i] & 7));
1077		}
1078	}
1079	return (err >= 0) ? err : -EBADMSG;
1080}
1081EXPORT_SYMBOL_GPL(bch_decode);
1082
1083/*
1084 * generate Galois field lookup tables
1085 */
1086static int build_gf_tables(struct bch_control *bch, unsigned int poly)
1087{
1088	unsigned int i, x = 1;
1089	const unsigned int k = 1 << deg(poly);
1090
1091	/* primitive polynomial must be of degree m */
1092	if (k != (1u << GF_M(bch)))
1093		return -1;
1094
1095	for (i = 0; i < GF_N(bch); i++) {
1096		bch->a_pow_tab[i] = x;
1097		bch->a_log_tab[x] = i;
1098		if (i && (x == 1))
1099			/* polynomial is not primitive (a^i=1 with 0<i<2^m-1) */
1100			return -1;
1101		x <<= 1;
1102		if (x & k)
1103			x ^= poly;
1104	}
1105	bch->a_pow_tab[GF_N(bch)] = 1;
1106	bch->a_log_tab[0] = 0;
1107
1108	return 0;
1109}
1110
1111/*
1112 * compute generator polynomial remainder tables for fast encoding
1113 */
1114static void build_mod8_tables(struct bch_control *bch, const uint32_t *g)
1115{
1116	int i, j, b, d;
1117	uint32_t data, hi, lo, *tab;
1118	const int l = BCH_ECC_WORDS(bch);
1119	const int plen = DIV_ROUND_UP(bch->ecc_bits+1, 32);
1120	const int ecclen = DIV_ROUND_UP(bch->ecc_bits, 32);
1121
1122	memset(bch->mod8_tab, 0, 4*256*l*sizeof(*bch->mod8_tab));
1123
1124	for (i = 0; i < 256; i++) {
1125		/* p(X)=i is a small polynomial of weight <= 8 */
1126		for (b = 0; b < 4; b++) {
1127			/* we want to compute (p(X).X^(8*b+deg(g))) mod g(X) */
1128			tab = bch->mod8_tab + (b*256+i)*l;
1129			data = i << (8*b);
1130			while (data) {
1131				d = deg(data);
1132				/* subtract X^d.g(X) from p(X).X^(8*b+deg(g)) */
1133				data ^= g[0] >> (31-d);
1134				for (j = 0; j < ecclen; j++) {
1135					hi = (d < 31) ? g[j] << (d+1) : 0;
1136					lo = (j+1 < plen) ?
1137						g[j+1] >> (31-d) : 0;
1138					tab[j] ^= hi|lo;
1139				}
1140			}
1141		}
1142	}
1143}
1144
1145/*
1146 * build a base for factoring degree 2 polynomials
1147 */
1148static int build_deg2_base(struct bch_control *bch)
1149{
1150	const int m = GF_M(bch);
1151	int i, j, r;
1152	unsigned int sum, x, y, remaining, ak = 0, xi[BCH_MAX_M];
1153
1154	/* find k s.t. Tr(a^k) = 1 and 0 <= k < m */
1155	for (i = 0; i < m; i++) {
1156		for (j = 0, sum = 0; j < m; j++)
1157			sum ^= a_pow(bch, i*(1 << j));
1158
1159		if (sum) {
1160			ak = bch->a_pow_tab[i];
1161			break;
1162		}
1163	}
1164	/* find xi, i=0..m-1 such that xi^2+xi = a^i+Tr(a^i).a^k */
1165	remaining = m;
1166	memset(xi, 0, sizeof(xi));
1167
1168	for (x = 0; (x <= GF_N(bch)) && remaining; x++) {
1169		y = gf_sqr(bch, x)^x;
1170		for (i = 0; i < 2; i++) {
1171			r = a_log(bch, y);
1172			if (y && (r < m) && !xi[r]) {
1173				bch->xi_tab[r] = x;
1174				xi[r] = 1;
1175				remaining--;
1176				dbg("x%d = %x\n", r, x);
1177				break;
1178			}
1179			y ^= ak;
1180		}
1181	}
1182	/* should not happen but check anyway */
1183	return remaining ? -1 : 0;
1184}
1185
1186static void *bch_alloc(size_t size, int *err)
1187{
1188	void *ptr;
1189
1190	ptr = kmalloc(size, GFP_KERNEL);
1191	if (ptr == NULL)
1192		*err = 1;
1193	return ptr;
1194}
1195
1196/*
1197 * compute generator polynomial for given (m,t) parameters.
1198 */
1199static uint32_t *compute_generator_polynomial(struct bch_control *bch)
1200{
1201	const unsigned int m = GF_M(bch);
1202	const unsigned int t = GF_T(bch);
1203	int n, err = 0;
1204	unsigned int i, j, nbits, r, word, *roots;
1205	struct gf_poly *g;
1206	uint32_t *genpoly;
1207
1208	g = bch_alloc(GF_POLY_SZ(m*t), &err);
1209	roots = bch_alloc((bch->n+1)*sizeof(*roots), &err);
1210	genpoly = bch_alloc(DIV_ROUND_UP(m*t+1, 32)*sizeof(*genpoly), &err);
1211
1212	if (err) {
1213		kfree(genpoly);
1214		genpoly = NULL;
1215		goto finish;
1216	}
1217
1218	/* enumerate all roots of g(X) */
1219	memset(roots , 0, (bch->n+1)*sizeof(*roots));
1220	for (i = 0; i < t; i++) {
1221		for (j = 0, r = 2*i+1; j < m; j++) {
1222			roots[r] = 1;
1223			r = mod_s(bch, 2*r);
1224		}
1225	}
1226	/* build generator polynomial g(X) */
1227	g->deg = 0;
1228	g->c[0] = 1;
1229	for (i = 0; i < GF_N(bch); i++) {
1230		if (roots[i]) {
1231			/* multiply g(X) by (X+root) */
1232			r = bch->a_pow_tab[i];
1233			g->c[g->deg+1] = 1;
1234			for (j = g->deg; j > 0; j--)
1235				g->c[j] = gf_mul(bch, g->c[j], r)^g->c[j-1];
1236
1237			g->c[0] = gf_mul(bch, g->c[0], r);
1238			g->deg++;
1239		}
1240	}
1241	/* store left-justified binary representation of g(X) */
1242	n = g->deg+1;
1243	i = 0;
1244
1245	while (n > 0) {
1246		nbits = (n > 32) ? 32 : n;
1247		for (j = 0, word = 0; j < nbits; j++) {
1248			if (g->c[n-1-j])
1249				word |= 1u << (31-j);
1250		}
1251		genpoly[i++] = word;
1252		n -= nbits;
1253	}
1254	bch->ecc_bits = g->deg;
1255
1256finish:
1257	kfree(g);
1258	kfree(roots);
1259
1260	return genpoly;
1261}
1262
1263/**
1264 * bch_init - initialize a BCH encoder/decoder
1265 * @m:          Galois field order, should be in the range 5-15
1266 * @t:          maximum error correction capability, in bits
1267 * @prim_poly:  user-provided primitive polynomial (or 0 to use default)
1268 * @swap_bits:  swap bits within data and syndrome bytes
1269 *
1270 * Returns:
1271 *  a newly allocated BCH control structure if successful, NULL otherwise
1272 *
1273 * This initialization can take some time, as lookup tables are built for fast
1274 * encoding/decoding; make sure not to call this function from a time critical
1275 * path. Usually, bch_init() should be called on module/driver init and
1276 * bch_free() should be called to release memory on exit.
1277 *
1278 * You may provide your own primitive polynomial of degree @m in argument
1279 * @prim_poly, or let bch_init() use its default polynomial.
1280 *
1281 * Once bch_init() has successfully returned a pointer to a newly allocated
1282 * BCH control structure, ecc length in bytes is given by member @ecc_bytes of
1283 * the structure.
1284 */
1285struct bch_control *bch_init(int m, int t, unsigned int prim_poly,
1286			     bool swap_bits)
1287{
1288	int err = 0;
1289	unsigned int i, words;
1290	uint32_t *genpoly;
1291	struct bch_control *bch = NULL;
1292
1293	const int min_m = 5;
1294
1295	/* default primitive polynomials */
1296	static const unsigned int prim_poly_tab[] = {
1297		0x25, 0x43, 0x83, 0x11d, 0x211, 0x409, 0x805, 0x1053, 0x201b,
1298		0x402b, 0x8003,
1299	};
1300
1301#if defined(CONFIG_BCH_CONST_PARAMS)
1302	if ((m != (CONFIG_BCH_CONST_M)) || (t != (CONFIG_BCH_CONST_T))) {
1303		printk(KERN_ERR "bch encoder/decoder was configured to support "
1304		       "parameters m=%d, t=%d only!\n",
1305		       CONFIG_BCH_CONST_M, CONFIG_BCH_CONST_T);
1306		goto fail;
1307	}
1308#endif
1309	if ((m < min_m) || (m > BCH_MAX_M))
1310		/*
1311		 * values of m greater than 15 are not currently supported;
1312		 * supporting m > 15 would require changing table base type
1313		 * (uint16_t) and a small patch in matrix transposition
1314		 */
1315		goto fail;
1316
1317	if (t > BCH_MAX_T)
1318		/*
1319		 * we can support larger than 64 bits if necessary, at the
1320		 * cost of higher stack usage.
1321		 */
1322		goto fail;
1323
1324	/* sanity checks */
1325	if ((t < 1) || (m*t >= ((1 << m)-1)))
1326		/* invalid t value */
1327		goto fail;
1328
1329	/* select a primitive polynomial for generating GF(2^m) */
1330	if (prim_poly == 0)
1331		prim_poly = prim_poly_tab[m-min_m];
1332
1333	bch = kzalloc(sizeof(*bch), GFP_KERNEL);
1334	if (bch == NULL)
1335		goto fail;
1336
1337	bch->m = m;
1338	bch->t = t;
1339	bch->n = (1 << m)-1;
1340	words  = DIV_ROUND_UP(m*t, 32);
1341	bch->ecc_bytes = DIV_ROUND_UP(m*t, 8);
1342	bch->a_pow_tab = bch_alloc((1+bch->n)*sizeof(*bch->a_pow_tab), &err);
1343	bch->a_log_tab = bch_alloc((1+bch->n)*sizeof(*bch->a_log_tab), &err);
1344	bch->mod8_tab  = bch_alloc(words*1024*sizeof(*bch->mod8_tab), &err);
1345	bch->ecc_buf   = bch_alloc(words*sizeof(*bch->ecc_buf), &err);
1346	bch->ecc_buf2  = bch_alloc(words*sizeof(*bch->ecc_buf2), &err);
1347	bch->xi_tab    = bch_alloc(m*sizeof(*bch->xi_tab), &err);
1348	bch->syn       = bch_alloc(2*t*sizeof(*bch->syn), &err);
1349	bch->cache     = bch_alloc(2*t*sizeof(*bch->cache), &err);
1350	bch->elp       = bch_alloc((t+1)*sizeof(struct gf_poly_deg1), &err);
1351	bch->swap_bits = swap_bits;
1352
1353	for (i = 0; i < ARRAY_SIZE(bch->poly_2t); i++)
1354		bch->poly_2t[i] = bch_alloc(GF_POLY_SZ(2*t), &err);
1355
1356	if (err)
1357		goto fail;
1358
1359	err = build_gf_tables(bch, prim_poly);
1360	if (err)
1361		goto fail;
1362
1363	/* use generator polynomial for computing encoding tables */
1364	genpoly = compute_generator_polynomial(bch);
1365	if (genpoly == NULL)
1366		goto fail;
1367
1368	build_mod8_tables(bch, genpoly);
1369	kfree(genpoly);
1370
1371	err = build_deg2_base(bch);
1372	if (err)
1373		goto fail;
1374
1375	return bch;
1376
1377fail:
1378	bch_free(bch);
1379	return NULL;
1380}
1381EXPORT_SYMBOL_GPL(bch_init);
1382
1383/**
1384 *  bch_free - free the BCH control structure
1385 *  @bch:    BCH control structure to release
1386 */
1387void bch_free(struct bch_control *bch)
1388{
1389	unsigned int i;
1390
1391	if (bch) {
1392		kfree(bch->a_pow_tab);
1393		kfree(bch->a_log_tab);
1394		kfree(bch->mod8_tab);
1395		kfree(bch->ecc_buf);
1396		kfree(bch->ecc_buf2);
1397		kfree(bch->xi_tab);
1398		kfree(bch->syn);
1399		kfree(bch->cache);
1400		kfree(bch->elp);
1401
1402		for (i = 0; i < ARRAY_SIZE(bch->poly_2t); i++)
1403			kfree(bch->poly_2t[i]);
1404
1405		kfree(bch);
1406	}
1407}
1408EXPORT_SYMBOL_GPL(bch_free);
1409
1410MODULE_LICENSE("GPL");
1411MODULE_AUTHOR("Ivan Djelic <ivan.djelic@parrot.com>");
1412MODULE_DESCRIPTION("Binary BCH encoder/decoder");