Blame view

src/gf_w128.c 47.7 KB
70b6d55a   plank   Big checkin after...
1
/*
b0e2ae07   Jim Plank   Put headers on th...
2
3
4
5
 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
 *
70b6d55a   plank   Big checkin after...
6
7
8
9
10
11
12
13
14
15
16
17
18
 * gf_w128.c
 *
 * Routines for 128-bit Galois fields
 */

#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>

#define GF_FIELD_WIDTH (128)

#define two_x(a) {\
  a[0] <<= 1; \
110523d6   Jim Plank   GF-Complete Relea...
19
  if (a[1] & 1ULL << 63) a[0] ^= 1; \
70b6d55a   plank   Big checkin after...
20
21
22
23
24
25
26
27
28
29
  a[1] <<= 1; }
  
#define a_get_b(a, i, b, j) {\
  a[i] = b[j]; \
  a[i + 1] = b[j + 1];}

#define set_zero(a, i) {\
  a[i] = 0; \
  a[i + 1] = 0;}

f58a7de9   plank   w128: bytwo-b, gr...
30
31
32
33
34
struct gf_w128_split_4_128_data {
  uint64_t last_value[2];
  uint64_t tables[2][32][16];
};

110523d6   Jim Plank   GF-Complete Relea...
35
36
37
38
39
struct gf_w128_split_8_128_data {
  uint64_t last_value[2];
  uint64_t tables[2][16][256];
};

70b6d55a   plank   Big checkin after...
40
41
42
43
44
typedef struct gf_group_tables_s {
  gf_val_128_t m_table;
  gf_val_128_t r_table;
} gf_group_tables_t;

110523d6   Jim Plank   GF-Complete Relea...
45
46
#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? "   " : " ", blah[15-ii]); printf("\n"); }

70b6d55a   plank   Big checkin after...
47
48
49
50
51
static
void
gf_w128_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes,
int xor)
{
d1b6bbf7   Loic Dachary   add -Wsign-compar...
52
    int i;
70b6d55a   plank   Big checkin after...
53
54
55
    gf_val_128_t s128;
    gf_val_128_t d128;
    uint64_t c128[2];
f58a7de9   plank   w128: bytwo-b, gr...
56
57
58
59
60
61
62
    gf_region_data rd;

    /* We only do this to check on alignment. */
    gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);

    if (val[0] == 0) {
      if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
e96c0e28   plank   Renaming gf.h to ...
63
      if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
f58a7de9   plank   w128: bytwo-b, gr...
64
    }
70b6d55a   plank   Big checkin after...
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83

    set_zero(c128, 0);

    s128 = (gf_val_128_t) src;
    d128 = (gf_val_128_t) dest;

    if (xor) {
      for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
        gf->multiply.w128(gf, &s128[i], val, c128);
        d128[i] ^= c128[0];
        d128[i+1] ^= c128[1];
      }
    } else {
      for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
        gf->multiply.w128(gf, &s128[i], val, &d128[i]);
      }
    }
}

29899ad4   Loic Dachary   move #if to avoid...
84
#if defined(INTEL_SSE4_PCLMUL)
110523d6   Jim Plank   GF-Complete Relea...
85
86
87
88
89
static
void
gf_w128_clm_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes,
int xor)
{
d1b6bbf7   Loic Dachary   add -Wsign-compar...
90
    int i;
4dc7a55d   animetosho   Eliminate some ad...
91
    gf_val_128_t s128;
110523d6   Jim Plank   GF-Complete Relea...
92
    gf_val_128_t d128;
110523d6   Jim Plank   GF-Complete Relea...
93
94
95
96
97
98
99
100
101
102
103
104
105
106
    gf_region_data rd;
    __m128i     a,b;
    __m128i     result0,result1;
    __m128i     prim_poly;
    __m128i     c,d,e,f;
    gf_internal_t * h = gf->scratch;
    prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);
    /* We only do this to check on alignment. */
    gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);

    if (val[0] == 0) {
      if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
      if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
    }
b561863b   animetosho   Remove most insta...
107

4dc7a55d   animetosho   Eliminate some ad...
108
    s128 = (gf_val_128_t) src;
110523d6   Jim Plank   GF-Complete Relea...
109
110
111
    d128 = (gf_val_128_t) dest;

    if (xor) {
b561863b   animetosho   Remove most insta...
112
113
      for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
        a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0);
110523d6   Jim Plank   GF-Complete Relea...
114
        b = _mm_insert_epi64 (a, val[1], 0);
b561863b   animetosho   Remove most insta...
115
116
117
118
        a = _mm_insert_epi64 (a, s128[i], 1);
        b = _mm_insert_epi64 (b, val[0], 1);
    
        c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
110523d6   Jim Plank   GF-Complete Relea...
119
120
        f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
        e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/
97b82b98   animetosho   Eliminate remaini...
121
122
        d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/

110523d6   Jim Plank   GF-Complete Relea...
123
        /* now reusing a and b as temporary variables*/
97b82b98   animetosho   Eliminate remaini...
124
125
        result0 = _mm_setzero_si128();
        result1 = result0;
110523d6   Jim Plank   GF-Complete Relea...
126
127
128
129
130
131
132

        result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0));
        a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1));
        result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a));

        a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0));
        result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a));
97b82b98   animetosho   Eliminate remaini...
133
        result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1));
4dc7a55d   animetosho   Eliminate some ad...
134
135
136
        /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce. */

        a = _mm_srli_si128 (result0, 8);
110523d6   Jim Plank   GF-Complete Relea...
137
138
139
        b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
        result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8));
        result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8));
b561863b   animetosho   Remove most insta...
140
141

        a = _mm_insert_epi64 (result0, 0, 1);
110523d6   Jim Plank   GF-Complete Relea...
142
        b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
b561863b   animetosho   Remove most insta...
143
144
145
146
        result1 = _mm_xor_si128 (result1, b); 
        d128[i] ^= (uint64_t)_mm_extract_epi64(result1,1);
        d128[i+1] ^= (uint64_t)_mm_extract_epi64(result1,0);
      }
110523d6   Jim Plank   GF-Complete Relea...
147
148
    } else {
      for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
97b82b98   animetosho   Eliminate remaini...
149
150
        a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0);
        b = _mm_insert_epi64 (a, val[1], 0);
110523d6   Jim Plank   GF-Complete Relea...
151
        a = _mm_insert_epi64 (a, s128[i], 1);
97b82b98   animetosho   Eliminate remaini...
152
153
        b = _mm_insert_epi64 (b, val[0], 1);

110523d6   Jim Plank   GF-Complete Relea...
154
155
156
157
158
159
160
        c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
        f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
        e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/ 
        d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/ 

        /* now reusing a and b as temporary variables*/
        result0 = _mm_setzero_si128();
97b82b98   animetosho   Eliminate remaini...
161
        result1 = result0;
110523d6   Jim Plank   GF-Complete Relea...
162

4dc7a55d   animetosho   Eliminate some ad...
163
164
        result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0));
        a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1));
110523d6   Jim Plank   GF-Complete Relea...
165
166
        result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a));

110523d6   Jim Plank   GF-Complete Relea...
167
        a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0));
29899ad4   Loic Dachary   move #if to avoid...
168
        result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a));
110523d6   Jim Plank   GF-Complete Relea...
169
        result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1));
70b6d55a   plank   Big checkin after...
170
171
172
173
174
        /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce.*/

        a = _mm_srli_si128 (result0, 8);
        b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
        result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8));
110523d6   Jim Plank   GF-Complete Relea...
175
176
177
        result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8));

        a = _mm_insert_epi64 (result0, 0, 1);
70b6d55a   plank   Big checkin after...
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
        b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
        result1 = _mm_xor_si128 (result1, b);
        d128[i] = (uint64_t)_mm_extract_epi64(result1,1);
        d128[i+1] = (uint64_t)_mm_extract_epi64(result1,0);
      }
    }
}
#endif

/*
 * Some w128 notes:
 * --Big Endian
 * --return values allocated beforehand
 */

#define GF_W128_IS_ZERO(val) (val[0] == 0 && val[1] == 0)

void
gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
  /* ordered highest bit to lowest l[0] l[1] r[0] r[1] */
  uint64_t pl[2], pr[2], ppl[2], ppr[2], i, a[2], bl[2], br[2], one, lbit;
  gf_internal_t *h;

110523d6   Jim Plank   GF-Complete Relea...
202
  h = (gf_internal_t *) gf->scratch;
70b6d55a   plank   Big checkin after...
203
204
205
206
207
208
209
210
211
212
213
214
215

  if (GF_W128_IS_ZERO(a128) || GF_W128_IS_ZERO(b128)) {
    set_zero(c128, 0);
    return;
  }

  a_get_b(a, 0, a128, 0);
  a_get_b(br, 0, b128, 0);
  set_zero(bl, 0);

  one = 1;
  lbit = (one << 63);

110523d6   Jim Plank   GF-Complete Relea...
216
  set_zero(pl, 0);
70b6d55a   plank   Big checkin after...
217
218
219
220
221
222
223
224
225
226
227
228
229
  set_zero(pr, 0);

  /* Allen: a*b for right half of a */
  for (i = 0; i < GF_FIELD_WIDTH/2; i++) {
    if (a[1] & (one << i)) {
      pl[1] ^= bl[1];
      pr[0] ^= br[0];
      pr[1] ^= br[1];
    }
    bl[1] <<= 1;
    if (br[0] & lbit) bl[1] ^= 1;
    br[0] <<= 1;
    if (br[1] & lbit) br[0] ^= 1;
110523d6   Jim Plank   GF-Complete Relea...
230
231
232
233
234
    br[1] <<= 1;
  }

  /* Allen: a*b for left half of a */
  for (i = 0; i < GF_FIELD_WIDTH/2; i++) {
70b6d55a   plank   Big checkin after...
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
    if (a[0] & (one << i)) {
      pl[0] ^= bl[0];
      pl[1] ^= bl[1];
      pr[0] ^= br[0];
    }
    bl[0] <<= 1;
    if (bl[1] & lbit) bl[0] ^= 1;
    bl[1] <<= 1;
    if (br[0] & lbit) bl[1] ^= 1;
    br[0] <<= 1;
  }

  /* Allen: do first half of reduction (based on left quarter of initial product) */
  one = lbit >> 1;
  ppl[0] = one; /* Allen: introduce leading one of primitive polynomial */
  ppl[1] = h->prim_poly >> 2;
  ppr[0] = h->prim_poly << (GF_FIELD_WIDTH/2-2);
  ppr[1] = 0;
110523d6   Jim Plank   GF-Complete Relea...
253
  while (one != 0) {
70b6d55a   plank   Big checkin after...
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
    if (pl[0] & one) {
      pl[0] ^= ppl[0];
      pl[1] ^= ppl[1];
      pr[0] ^= ppr[0];
      pr[1] ^= ppr[1];
    }
    one >>= 1;
    ppr[1] >>= 1;
    if (ppr[0] & 1) ppr[1] ^= lbit;
    ppr[0] >>= 1;
    if (ppl[1] & 1) ppr[0] ^= lbit;
    ppl[1] >>= 1;
    if (ppl[0] & 1) ppl[1] ^= lbit;
    ppl[0] >>= 1;
  }
110523d6   Jim Plank   GF-Complete Relea...
269

70b6d55a   plank   Big checkin after...
270
271
272
273
274
275
  /* Allen: final half of reduction */
  one = lbit;
  while (one != 0) {
    if (pl[1] & one) {
      pl[1] ^= ppl[1];
      pr[0] ^= ppr[0];
f58a7de9   plank   w128: bytwo-b, gr...
276
      pr[1] ^= ppr[1];
110523d6   Jim Plank   GF-Complete Relea...
277
278
    }
    one >>= 1;
fb0bbdcf   Jim Plank   Fixed the problem...
279
    ppr[1] >>= 1;
110523d6   Jim Plank   GF-Complete Relea...
280
281
282
283
284
285
286
    if (ppr[0] & 1) ppr[1] ^= lbit;
    ppr[0] >>= 1;
    if (ppl[1] & 1) ppr[0] ^= lbit;
    ppl[1] >>= 1;
  }

  /* Allen: if we really want to optimize this we can just be using c128 instead of pr all along */
e506417c   animetosho   Eliminate added _...
287
288
  c128[0] = pr[0];
  c128[1] = pr[1];
110523d6   Jim Plank   GF-Complete Relea...
289
290
291
292

  return;
}

e506417c   animetosho   Eliminate added _...
293
294
295
296
void
gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
#if defined(INTEL_SSE4_PCLMUL)
110523d6   Jim Plank   GF-Complete Relea...
297
298

    __m128i     a,b;
97b82b98   animetosho   Eliminate remaini...
299
300
    __m128i     result0,result1;
    __m128i     prim_poly;
110523d6   Jim Plank   GF-Complete Relea...
301
    __m128i     c,d,e,f;
97b82b98   animetosho   Eliminate remaini...
302
303
    gf_internal_t * h = gf->scratch;
    
110523d6   Jim Plank   GF-Complete Relea...
304
305
306
307
308
309
310
    a = _mm_insert_epi64 (_mm_setzero_si128(), a128[1], 0);
    b = _mm_insert_epi64 (a, b128[1], 0);
    a = _mm_insert_epi64 (a, a128[0], 1);
    b = _mm_insert_epi64 (b, b128[0], 1);

    prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);

97b82b98   animetosho   Eliminate remaini...
311
    /* we need to test algorithm 2 later*/
110523d6   Jim Plank   GF-Complete Relea...
312
313
    c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
    f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
4dc7a55d   animetosho   Eliminate some ad...
314
315
    e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/
    d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/
110523d6   Jim Plank   GF-Complete Relea...
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
    
    /* now reusing a and b as temporary variables*/
    result0 = _mm_setzero_si128();
    result1 = result0;

    result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0));
    a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1));
    result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a));

    a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0));
    result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a));
    result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1));
    /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce.*/
    
    a = _mm_srli_si128 (result0, 8);
    b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
    result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8));
    result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8));
    
    a = _mm_insert_epi64 (result0, 0, 1);
    b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
    result1 = _mm_xor_si128 (result1, b);

    c128[0] = (uint64_t)_mm_extract_epi64(result1,1);
    c128[1] = (uint64_t)_mm_extract_epi64(result1,0);
#endif
return;
}

void
gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
  uint64_t amask[2], pmask, pp, prod[2]; /*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/
  uint64_t topbit; /* this is used as a boolean value */
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;
  prod[0] = 0;
  prod[1] = 0;
  pmask = 0x8000000000000000ULL;
fb0bbdcf   Jim Plank   Fixed the problem...
357
  amask[0] = 0x8000000000000000ULL;
110523d6   Jim Plank   GF-Complete Relea...
358
  amask[1] = 0;
191b86b5   Loic Dachary   remove unused var...
359

110523d6   Jim Plank   GF-Complete Relea...
360
361
362
363
364
365
366
367
  while (amask[1] != 0 || amask[0] != 0) {
    topbit = (prod[0] & pmask);
    prod[0] <<= 1;
    if (prod[1] & pmask) prod[0] ^= 1;
    prod[1] <<= 1;
    if (topbit) prod[1] ^= pp;
    if ((a128[0] & amask[0]) || (a128[1] & amask[1])) {
      prod[0] ^= b128[0];
e506417c   animetosho   Eliminate added _...
368
369
370
371
      prod[1] ^= b128[1];
    }
    amask[1] >>= 1;
    if (amask[0] & 1) amask[1] ^= pmask;
110523d6   Jim Plank   GF-Complete Relea...
372
373
374
    amask[0] >>= 1;
  }
  c128[0] = prod [0];
110523d6   Jim Plank   GF-Complete Relea...
375
376
377
378
379
380
381
382
383
384
385
  c128[1] = prod [1];
  return;
}

void
gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
#if defined(INTEL_SSE4)
  int i;
  __m128i a, b, pp, prod, amask, u_middle_one; 
  /*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/
8c8f2622   animetosho   Workarounds for r...
386
  uint32_t topbit, middlebit, pmask; /* this is used as a boolean value */
110523d6   Jim Plank   GF-Complete Relea...
387
388
389
390
391
392
393
394
395
396
397
  gf_internal_t *h;


  h = (gf_internal_t *) gf->scratch;
  pp = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);
  prod = _mm_setzero_si128();
  a = _mm_insert_epi64(prod, a128[1], 0x0);
  a = _mm_insert_epi64(a, a128[0], 0x1);
  b = _mm_insert_epi64(prod, b128[1], 0x0);
  b = _mm_insert_epi64(b, b128[0], 0x1);
  pmask = 0x80000000;
8c8f2622   animetosho   Workarounds for r...
398
  amask = _mm_insert_epi32(prod, 0x80000000, 0x3);
110523d6   Jim Plank   GF-Complete Relea...
399
400
401
402
  u_middle_one = _mm_insert_epi32(prod, 1, 0x2);
  
  for (i = 0; i < 64; i++) {
    topbit = (_mm_extract_epi32(prod, 0x3) & pmask);
4dc7a55d   animetosho   Eliminate some ad...
403
404
    middlebit = (_mm_extract_epi32(prod, 0x1) & pmask);
    prod = _mm_slli_epi64(prod, 1); /* this instruction loses the middle bit */
110523d6   Jim Plank   GF-Complete Relea...
405
406
407
408
409
410
411
412
413
    if (middlebit) {
      prod = _mm_xor_si128(prod, u_middle_one);
    }
    if (topbit) {
      prod = _mm_xor_si128(prod, pp);
    }
    if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 1))) {
      prod = _mm_xor_si128(prod, b);
    }
fb0bbdcf   Jim Plank   Fixed the problem...
414
    amask = _mm_srli_epi64(amask, 1); /*so does this one, but we can just replace after loop*/
5c3a42d5   animetosho   Remove unused var...
415
  }
110523d6   Jim Plank   GF-Complete Relea...
416
417
418
419
420
421
  amask = _mm_insert_epi32(amask, 1 << 31, 0x1);
  for (i = 64; i < 128; i++) {
    topbit = (_mm_extract_epi32(prod, 0x3) & pmask);
    middlebit = (_mm_extract_epi32(prod, 0x1) & pmask);
    prod = _mm_slli_epi64(prod, 1);
    if (middlebit) prod = _mm_xor_si128(prod, u_middle_one);
e506417c   animetosho   Eliminate added _...
422
    if (topbit) prod = _mm_xor_si128(prod, pp);
e506417c   animetosho   Eliminate added _...
423
424
425
426
    if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 0))) {
      prod = _mm_xor_si128(prod, b);
    }
    amask = _mm_srli_epi64(amask, 1);
4dc7a55d   animetosho   Eliminate some ad...
427
  }
e506417c   animetosho   Eliminate added _...
428
  c128[0] = (uint64_t)_mm_extract_epi64(prod, 1);
110523d6   Jim Plank   GF-Complete Relea...
429
430
431
432
433
434
435
436
  c128[1] = (uint64_t)_mm_extract_epi64(prod, 0);
#endif
  return;
}


/* Ben: This slow function implements sse instrutions for bytwo_b because why not */
void
b561863b   animetosho   Remove most insta...
437
gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
4dc7a55d   animetosho   Eliminate some ad...
438
439
{
#if defined(INTEL_SSE4)
110523d6   Jim Plank   GF-Complete Relea...
440
441
  __m128i a, b, lmask, hmask, pp, c, middle_one;
  gf_internal_t *h;
8c8f2622   animetosho   Workarounds for r...
442
443
  uint64_t topbit, middlebit;

110523d6   Jim Plank   GF-Complete Relea...
444
445
446
447
448
449
450
451
  h = (gf_internal_t *) gf->scratch;
  
  c = _mm_setzero_si128();
  lmask = _mm_insert_epi64(c, 1ULL << 63, 0);
  hmask = _mm_insert_epi64(c, 1ULL << 63, 1);
  b = _mm_insert_epi64(c, a128[0], 1);
  b = _mm_insert_epi64(b, a128[1], 0);
  a = _mm_insert_epi64(c, b128[0], 1);
f58a7de9   plank   w128: bytwo-b, gr...
452
453
454
455
456
457
458
459
  a = _mm_insert_epi64(a, b128[1], 0);
  pp = _mm_insert_epi64(c, h->prim_poly, 0);
  middle_one = _mm_insert_epi64(c, 1, 0x1);

  while (1) {
    if (_mm_extract_epi32(a, 0x0) & 1) {
      c = _mm_xor_si128(c, b);
    }
110523d6   Jim Plank   GF-Complete Relea...
460
    middlebit = (_mm_extract_epi32(a, 0x2) & 1);
f58a7de9   plank   w128: bytwo-b, gr...
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
    a = _mm_srli_epi64(a, 1);
    if (middlebit) a = _mm_xor_si128(a, lmask);
    if ((_mm_extract_epi64(a, 0x1) == 0ULL) && (_mm_extract_epi64(a, 0x0) == 0ULL)){
      c128[0] = _mm_extract_epi64(c, 0x1);
      c128[1] = _mm_extract_epi64(c, 0x0);
      return;
    }
    topbit = (_mm_extract_epi64(_mm_and_si128(b, hmask), 1));
    middlebit = (_mm_extract_epi64(_mm_and_si128(b, lmask), 0));
    b = _mm_slli_epi64(b, 1);
    if (middlebit) b = _mm_xor_si128(b, middle_one);
    if (topbit) b = _mm_xor_si128(b, pp);
  }
#endif
}

void
gf_w128_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
  uint64_t bmask, pp;
  gf_internal_t *h;
  uint64_t a[2], b[2], c[2];

  h = (gf_internal_t *) gf->scratch;

  bmask = (1ULL << 63);
  set_zero(c, 0);
  b[0] = a128[0];
  b[1] = a128[1];
  a[0] = b128[0];
  a[1] = b128[1];
  
  while (1) {
    if (a[1] & 1) {
      c[0] ^= b[0];
      c[1] ^= b[1];
    }
    a[1] >>= 1;
    if (a[0] & 1) a[1] ^= bmask;
    a[0] >>= 1;
    if (a[1] == 0 && a[0] == 0) {
      c128[0] = c[0];
      c128[1] = c[1];
      return;
e96c0e28   plank   Renaming gf.h to ...
505
    }
f58a7de9   plank   w128: bytwo-b, gr...
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
    pp = (b[0] & bmask);
    b[0] <<= 1;
    if (b[1] & bmask) b[0] ^= 1;
    b[1] <<= 1;
    if (pp) b[1] ^= h->prim_poly;
  }
}

static
void
gf_w128_split_4_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
{
  int i, j, k;
  uint64_t pp;
  gf_internal_t *h;
  uint64_t *s64, *d64, *top;
  gf_region_data rd;
  uint64_t v[2], s;
  struct gf_w128_split_4_128_data *ld;

110523d6   Jim Plank   GF-Complete Relea...
526
  /* We only do this to check on alignment. */
f58a7de9   plank   w128: bytwo-b, gr...
527
  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
110523d6   Jim Plank   GF-Complete Relea...
528

f58a7de9   plank   w128: bytwo-b, gr...
529
530
531
532
533
534
535
536
  if (val[0] == 0) {
    if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
    if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
  }
    
  h = (gf_internal_t *) gf->scratch;
  ld = (struct gf_w128_split_4_128_data *) h->private;

110523d6   Jim Plank   GF-Complete Relea...
537
538
539
540
541
542
543
544
  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;

  if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {
    v[0] = val[0];
    v[1] = val[1];
    for (i = 0; i < 32; i++) {
f58a7de9   plank   w128: bytwo-b, gr...
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
      ld->tables[0][i][0] = 0;
      ld->tables[1][i][0] = 0;
      for (j = 1; j < 16; j <<= 1) {
        for (k = 0; k < j; k++) {
          ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
          ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);
        }
        pp = (v[0] & (1ULL << 63));
        v[0] <<= 1;
        if (v[1] & (1ULL << 63)) v[0] ^= 1;
        v[1] <<= 1;
        if (pp) v[1] ^= h->prim_poly;
      }
    }
  }
  ld->last_value[0] = val[0];
  ld->last_value[1] = val[1];

/*
  for (i = 0; i < 32; i++) {
    for (j = 0; j < 16; j++) {
      printf("%2d %2d %016llx %016llx\n", i, j, ld->tables[0][i][j], ld->tables[1][i][j]);
    }
    printf("\n");
  }
 */
d5692206   Loic Dachary   do not compile if...
571
  while (d64 < top) {
110523d6   Jim Plank   GF-Complete Relea...
572
573
    v[0] = (xor) ? d64[0] : 0;
    v[1] = (xor) ? d64[1] : 0;
7bb3797a   Jim Plank   Added SPLIT SSE S...
574
575
    s = s64[1];
    i = 0;
7bb3797a   Jim Plank   Added SPLIT SSE S...
576
    while (s != 0) {
191b86b5   Loic Dachary   remove unused var...
577
      v[0] ^= ld->tables[0][i][s&0xf];
7bb3797a   Jim Plank   Added SPLIT SSE S...
578
579
580
581
582
583
584
585
586
587
588
      v[1] ^= ld->tables[1][i][s&0xf];
      s >>= 4;
      i++;
    }
    s = s64[0];
    i = 16;
    while (s != 0) {
      v[0] ^= ld->tables[0][i][s&0xf];
      v[1] ^= ld->tables[1][i][s&0xf];
      s >>= 4;
      i++;
7bb3797a   Jim Plank   Added SPLIT SSE S...
589
590
591
592
593
594
    }
    d64[0] = v[0];
    d64[1] = v[1];
    s64 += 2;
    d64 += 2;
  }
a0ae760e   Loic Dachary   prefer uint8_t to...
595
}
7bb3797a   Jim Plank   Added SPLIT SSE S...
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664

#if defined(INTEL_SSSE3) && defined(INTEL_SSE4)
static
void
gf_w128_split_4_128_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
{
  gf_internal_t *h;
  int i, j, k;
  uint64_t pp, v[2], s, *s64, *d64, *top;
  __m128i p, tables[32][16];
  struct gf_w128_split_4_128_data *ld;
  gf_region_data rd;

  if (val[0] == 0) {
    if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
    if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
  }

  h = (gf_internal_t *) gf->scratch;
  
  /* We only do this to check on alignment. */
  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 16);

  /* Doing this instead of gf_do_initial_region_alignment() because that doesn't hold 128-bit vals */

  gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor);

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;
 
  ld = (struct gf_w128_split_4_128_data *) h->private;

  if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {
    v[0] = val[0];
    v[1] = val[1];
    for (i = 0; i < 32; i++) {
      ld->tables[0][i][0] = 0;
      ld->tables[1][i][0] = 0;
      for (j = 1; j < 16; j <<= 1) {
        for (k = 0; k < j; k++) {
          ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
          ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);
        }
        pp = (v[0] & (1ULL << 63));
        v[0] <<= 1;
        if (v[1] & (1ULL << 63)) v[0] ^= 1;
        v[1] <<= 1;
        if (pp) v[1] ^= h->prim_poly;
      }
    }
  }

  ld->last_value[0] = val[0];
  ld->last_value[1] = val[1];

  for (i = 0; i < 32; i++) {
    for (j = 0; j < 16; j++) {
      v[0] = ld->tables[0][i][j];
      v[1] = ld->tables[1][i][j];
      tables[i][j] = _mm_loadu_si128((__m128i *) v);

/*
      printf("%2d %2d: ", i, j);
      MM_PRINT8("", tables[i][j]); */
    }
  }

  while (d64 != top) {
a0ae760e   Loic Dachary   prefer uint8_t to...
665

7bb3797a   Jim Plank   Added SPLIT SSE S...
666
    if (xor) {
29899ad4   Loic Dachary   move #if to avoid...
667
      p = _mm_load_si128 ((__m128i *) d64);
7bb3797a   Jim Plank   Added SPLIT SSE S...
668
    } else {
d5692206   Loic Dachary   do not compile if...
669
      p = _mm_setzero_si128();
7bb3797a   Jim Plank   Added SPLIT SSE S...
670
671
    }
    s = *s64;
110523d6   Jim Plank   GF-Complete Relea...
672
673
    s64++;
    for (i = 0; i < 16; i++) {
110523d6   Jim Plank   GF-Complete Relea...
674
      j = (s&0xf);
191b86b5   Loic Dachary   remove unused var...
675
676
      s >>= 4;
      p = _mm_xor_si128(p, tables[16+i][j]);
110523d6   Jim Plank   GF-Complete Relea...
677
678
679
680
681
682
683
684
685
686
687
    }
    s = *s64;
    s64++;
    for (i = 0; i < 16; i++) {
      j = (s&0xf);
      s >>= 4;
      p = _mm_xor_si128(p, tables[i][j]);
    }
    _mm_store_si128((__m128i *) d64, p);
    d64 += 2;
  }
110523d6   Jim Plank   GF-Complete Relea...
688
689
690
691
692
693

  /* Doing this instead of gf_do_final_region_alignment() because that doesn't hold 128-bit vals */

  gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor);
}
#endif
a0ae760e   Loic Dachary   prefer uint8_t to...
694

110523d6   Jim Plank   GF-Complete Relea...
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
#if defined(INTEL_SSSE3) && defined(INTEL_SSE4)
static
void
gf_w128_split_4_128_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
{
  gf_internal_t *h;
  int i, j, k;
  uint64_t pp, v[2], *s64, *d64, *top;
  __m128i si, tables[32][16], p[16], v0, mask1;
  struct gf_w128_split_4_128_data *ld;
  uint8_t btable[16];
  gf_region_data rd;

  if (val[0] == 0) {
    if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
    if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
  }

  h = (gf_internal_t *) gf->scratch;
  
  /* We only do this to check on alignment. */
  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 256);

  /* Doing this instead of gf_do_initial_region_alignment() because that doesn't hold 128-bit vals */

  gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor);

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;
 
  ld = (struct gf_w128_split_4_128_data *) h->private;

  if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {
    v[0] = val[0];
    v[1] = val[1];
    for (i = 0; i < 32; i++) {
      ld->tables[0][i][0] = 0;
      ld->tables[1][i][0] = 0;
      for (j = 1; j < 16; j <<= 1) {
        for (k = 0; k < j; k++) {
          ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
          ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);
        }
        pp = (v[0] & (1ULL << 63));
        v[0] <<= 1;
        if (v[1] & (1ULL << 63)) v[0] ^= 1;
        v[1] <<= 1;
        if (pp) v[1] ^= h->prim_poly;
      }
    }
  }

  ld->last_value[0] = val[0];
  ld->last_value[1] = val[1];

  for (i = 0; i < 32; i++) {
    for (j = 0; j < 16; j++) {
      for (k = 0; k < 16; k++) {
        btable[k] = (uint8_t) ld->tables[1-(j/8)][i][k];
        ld->tables[1-(j/8)][i][k] >>= 8;
      }
      tables[i][j] = _mm_loadu_si128((__m128i *) btable);
/*
      printf("%2d %2d: ", i, j);
      MM_PRINT8("", tables[i][j]);
 */
    }
  }


  mask1 = _mm_set1_epi8(0xf);

  while (d64 != top) {

    if (xor) {
      for (i = 0; i < 16; i++) p[i] = _mm_load_si128 ((__m128i *) (d64+i*2));
    } else {
      for (i = 0; i < 16; i++) p[i] = _mm_setzero_si128();
a0ae760e   Loic Dachary   prefer uint8_t to...
774
    }
110523d6   Jim Plank   GF-Complete Relea...
775
    i = 0;
29899ad4   Loic Dachary   move #if to avoid...
776
    for (k = 0; k < 16; k++) {
110523d6   Jim Plank   GF-Complete Relea...
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
      v0 = _mm_load_si128((__m128i *) s64); 
      s64 += 2;
      
      si = _mm_and_si128(v0, mask1);
  
      for (j = 0; j < 16; j++) {
        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
      }
      i++;
      v0 = _mm_srli_epi32(v0, 4);
      si = _mm_and_si128(v0, mask1);
      for (j = 0; j < 16; j++) {
        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
      }
      i++;
    }
    for (i = 0; i < 16; i++) {
      _mm_store_si128((__m128i *) d64, p[i]);
      d64 += 2;
    }
  }
  /* Doing this instead of gf_do_final_region_alignment() because that doesn't hold 128-bit vals */

  gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor);
}
#endif

static
void
gf_w128_split_8_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
{
  int i, j, k;
  uint64_t pp;
  gf_internal_t *h;
  uint64_t *s64, *d64, *top;
  gf_region_data rd;
  uint64_t v[2], s;
  struct gf_w128_split_8_128_data *ld;

  /* Check on alignment. Ignore it otherwise. */
  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);

  if (val[0] == 0) {
    if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
    if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
  }
    
  h = (gf_internal_t *) gf->scratch;
  ld = (struct gf_w128_split_8_128_data *) h->private;

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;

  if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {
    v[0] = val[0];
    v[1] = val[1];
    for (i = 0; i < 16; i++) {
      ld->tables[0][i][0] = 0;
      ld->tables[1][i][0] = 0;
      for (j = 1; j < (1 << 8); j <<= 1) {
        for (k = 0; k < j; k++) {
          ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
          ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);
        }
        pp = (v[0] & (1ULL << 63));
        v[0] <<= 1;
        if (v[1] & (1ULL << 63)) v[0] ^= 1;
        v[1] <<= 1;
        if (pp) v[1] ^= h->prim_poly;
      }
    }
  }
  ld->last_value[0] = val[0];
  ld->last_value[1] = val[1];

f58a7de9   plank   w128: bytwo-b, gr...
853
854
855
  while (d64 < top) {
    v[0] = (xor) ? d64[0] : 0;
    v[1] = (xor) ? d64[1] : 0;
f043479e   Loic Dachary   remove unused var...
856
    s = s64[1];
f58a7de9   plank   w128: bytwo-b, gr...
857
858
859
860
861
862
863
864
865
    i = 0;
    while (s != 0) {
      v[0] ^= ld->tables[0][i][s&0xff];
      v[1] ^= ld->tables[1][i][s&0xff];
      s >>= 8;
      i++;
    }
    s = s64[0];
    i = 8;
e96c0e28   plank   Renaming gf.h to ...
866
    while (s != 0) {
f58a7de9   plank   w128: bytwo-b, gr...
867
868
869
870
871
872
      v[0] ^= ld->tables[0][i][s&0xff];
      v[1] ^= ld->tables[1][i][s&0xff];
      s >>= 8;
      i++;
    }
    d64[0] = v[0];
110523d6   Jim Plank   GF-Complete Relea...
873
    d64[1] = v[1];
f58a7de9   plank   w128: bytwo-b, gr...
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
    s64 += 2;
    d64 += 2;
  }
}

void
gf_w128_bytwo_b_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
{
  uint64_t bmask, pp;
  gf_internal_t *h;
  uint64_t a[2], c[2], b[2], *s64, *d64, *top;
  gf_region_data rd;

  /* We only do this to check on alignment. */
  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);

  if (val[0] == 0) {
    if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
    if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
  }
    
  h = (gf_internal_t *) gf->scratch;
  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;
  bmask = (1ULL << 63);

  while (d64 < top) {
    set_zero(c, 0);
    b[0] = s64[0];
    b[1] = s64[1];
    a[0] = val[0];
    a[1] = val[1];

    while (a[0] != 0) {
      if (a[1] & 1) {
        c[0] ^= b[0];
        c[1] ^= b[1];
      }
      a[1] >>= 1;
      if (a[0] & 1) a[1] ^= bmask;
      a[0] >>= 1;
      pp = (b[0] & bmask);
      b[0] <<= 1;
      if (b[1] & bmask) b[0] ^= 1;    
      b[1] <<= 1;
      if (pp) b[1] ^= h->prim_poly;
70b6d55a   plank   Big checkin after...
921
922
923
924
925
926
927
928
929
930
931
    }
    while (1) {
      if (a[1] & 1) {
        c[0] ^= b[0];
        c[1] ^= b[1];
      }
      a[1] >>= 1;
      if (a[1] == 0) break;
      pp = (b[0] & bmask);
      b[0] <<= 1;
      if (b[1] & bmask) b[0] ^= 1;    
110523d6   Jim Plank   GF-Complete Relea...
932
      b[1] <<= 1;
70b6d55a   plank   Big checkin after...
933
934
      if (pp) b[1] ^= h->prim_poly;
    }
bd685c8d   plank   Changed the defau...
935
    if (xor) {
70b6d55a   plank   Big checkin after...
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
      d64[0] ^= c[0];
      d64[1] ^= c[1];
    } else {
      d64[0] = c[0];
      d64[1] = c[1];
    }
    s64 += 2;
    d64 += 2;
  }
}

static
void gf_w128_group_m_init(gf_t *gf, gf_val_128_t b128)
{
  int i, j;
  int g_m;
  uint64_t prim_poly, lbit;
  gf_internal_t *scratch;
70b6d55a   plank   Big checkin after...
954
955
956
  gf_group_tables_t *gt;
  uint64_t a128[2];
  scratch = (gf_internal_t *) gf->scratch;
f043479e   Loic Dachary   remove unused var...
957
  gt = scratch->private;
70b6d55a   plank   Big checkin after...
958
959
960
961
962
963
964
965
966
967
  g_m = scratch->arg1;
  prim_poly = scratch->prim_poly;


  set_zero(gt->m_table, 0);
  a_get_b(gt->m_table, 2, b128, 0);
  lbit = 1;
  lbit <<= 63;

  for (i = 2; i < (1 << g_m); i <<= 1) {
110523d6   Jim Plank   GF-Complete Relea...
968
969
    a_get_b(a128, 0, gt->m_table, 2 * (i >> 1));
    two_x(a128);
70b6d55a   plank   Big checkin after...
970
971
972
973
974
975
976
    a_get_b(gt->m_table, 2 * i, a128, 0);
    if (gt->m_table[2 * (i >> 1)] & lbit) gt->m_table[(2 * i) + 1] ^= prim_poly;
    for (j = 0; j < i; j++) {
      gt->m_table[(2 * i) + (2 * j)] = gt->m_table[(2 * i)] ^ gt->m_table[(2 * j)];
      gt->m_table[(2 * i) + (2 * j) + 1] = gt->m_table[(2 * i) + 1] ^ gt->m_table[(2 * j) + 1];
    }
  }
110523d6   Jim Plank   GF-Complete Relea...
977
  return;
70b6d55a   plank   Big checkin after...
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
}

void
gf_w128_group_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
  int i;
  /* index_r, index_m, total_m (if g_r > g_m) */
  int i_r, i_m, t_m;
  int mask_m, mask_r;
  int g_m, g_r;
  uint64_t p_i[2], a[2];
  gf_internal_t *scratch;
  gf_group_tables_t *gt;

  scratch = (gf_internal_t *) gf->scratch;
  gt = scratch->private;
  g_m = scratch->arg1;
  g_r = scratch->arg2;

  mask_m = (1 << g_m) - 1;
  mask_r = (1 << g_r) - 1;

  if (b128[0] != gt->m_table[2] || b128[1] != gt->m_table[3]) {
    gf_w128_group_m_init(gf, b128);
  }
  
  p_i[0] = 0;
  p_i[1] = 0;
  a[0] = a128[0];
  a[1] = a128[1];

  t_m = 0;
  i_r = 0;

  /* Top 64 bits */
  for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
    i_m = (a[0] >> (i * g_m)) & mask_m;
    i_r ^= (p_i[0] >> (64 - g_m)) & mask_r;
    p_i[0] <<= g_m;
    p_i[0] ^= (p_i[1] >> (64-g_m));
    p_i[1] <<= g_m;
    p_i[0] ^= gt->m_table[2 * i_m];
    p_i[1] ^= gt->m_table[(2 * i_m) + 1];
    t_m += g_m;
70b6d55a   plank   Big checkin after...
1022
1023
1024
1025
    if (t_m == g_r) {
      p_i[1] ^= gt->r_table[i_r];
      t_m = 0;
      i_r = 0;
110523d6   Jim Plank   GF-Complete Relea...
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
    } else {
      i_r <<= g_m;
    }
  }

  for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
    i_m = (a[1] >> (i * g_m)) & mask_m;
    i_r ^= (p_i[0] >> (64 - g_m)) & mask_r;
    p_i[0] <<= g_m;
    p_i[0] ^= (p_i[1] >> (64-g_m));
    p_i[1] <<= g_m;
    p_i[0] ^= gt->m_table[2 * i_m];
f58a7de9   plank   w128: bytwo-b, gr...
1038
1039
1040
1041
1042
1043
1044
    p_i[1] ^= gt->m_table[(2 * i_m) + 1];
    t_m += g_m;
    if (t_m == g_r) {
      p_i[1] ^= gt->r_table[i_r];
      t_m = 0;
      i_r = 0;
    } else {
e96c0e28   plank   Renaming gf.h to ...
1045
      i_r <<= g_m;
f58a7de9   plank   w128: bytwo-b, gr...
1046
1047
1048
1049
    }
  }
  c128[0] = p_i[0];
  c128[1] = p_i[1];
110523d6   Jim Plank   GF-Complete Relea...
1050
1051
}

f58a7de9   plank   w128: bytwo-b, gr...
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
static
void
gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
{
  int i;
  int i_r, i_m, t_m;
  int mask_m, mask_r;
  int g_m, g_r;
  uint64_t p_i[2], a[2];
  gf_internal_t *scratch;
  gf_group_tables_t *gt;
  gf_region_data rd;
  uint64_t *a128, *c128, *top;

  /* We only do this to check on alignment. */
  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
      
  if (val[0] == 0) {
    if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
    if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
  }
    
  scratch = (gf_internal_t *) gf->scratch;
  gt = scratch->private;
  g_m = scratch->arg1;
  g_r = scratch->arg2;

  mask_m = (1 << g_m) - 1;
110523d6   Jim Plank   GF-Complete Relea...
1080
  mask_r = (1 << g_r) - 1;
f58a7de9   plank   w128: bytwo-b, gr...
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091

  if (val[0] != gt->m_table[2] || val[1] != gt->m_table[3]) {
    gf_w128_group_m_init(gf, val);
  }

  a128 = (uint64_t *) src;
  c128 = (uint64_t *) dest;
  top = (uint64_t *) rd.d_top;

  while (c128 < top) {
    p_i[0] = 0;
f58a7de9   plank   w128: bytwo-b, gr...
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
    p_i[1] = 0;
    a[0] = a128[0];
    a[1] = a128[1];
  
    t_m = 0;
    i_r = 0;
  
    /* Top 64 bits */
    for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
      i_m = (a[0] >> (i * g_m)) & mask_m;
      i_r ^= (p_i[0] >> (64 - g_m)) & mask_r;
      p_i[0] <<= g_m;
      p_i[0] ^= (p_i[1] >> (64-g_m));
      p_i[1] <<= g_m;
      
      p_i[0] ^= gt->m_table[2 * i_m];
      p_i[1] ^= gt->m_table[(2 * i_m) + 1];
      t_m += g_m;
      if (t_m == g_r) {
        p_i[1] ^= gt->r_table[i_r];
        t_m = 0;
        i_r = 0;
      } else {
        i_r <<= g_m;
      }
    }
    for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
      i_m = (a[1] >> (i * g_m)) & mask_m;
      i_r ^= (p_i[0] >> (64 - g_m)) & mask_r;
      p_i[0] <<= g_m;
70b6d55a   plank   Big checkin after...
1122
      p_i[0] ^= (p_i[1] >> (64-g_m));
110523d6   Jim Plank   GF-Complete Relea...
1123
      p_i[1] <<= g_m;
70b6d55a   plank   Big checkin after...
1124
1125
1126
1127
1128
1129
1130
1131
      p_i[0] ^= gt->m_table[2 * i_m];
      p_i[1] ^= gt->m_table[(2 * i_m) + 1];
      t_m += g_m;
      if (t_m == g_r) {
        p_i[1] ^= gt->r_table[i_r];
        t_m = 0;
        i_r = 0;
      } else {
70b6d55a   plank   Big checkin after...
1132
1133
1134
1135
        i_r <<= g_m;
      }
    }
  
0020ff80   Loic Dachary   initialize pointe...
1136
1137
    if (xor) {
      c128[0] ^= p_i[0];
70b6d55a   plank   Big checkin after...
1138
1139
1140
1141
1142
      c128[1] ^= p_i[1];
    } else {
      c128[0] = p_i[0];
      c128[1] = p_i[1];
    }
110523d6   Jim Plank   GF-Complete Relea...
1143
    a128 += 2;
d59cbb2a   animetosho   Convert all comme...
1144
1145
1146
    c128 += 2;
  }
}
110523d6   Jim Plank   GF-Complete Relea...
1147

70b6d55a   plank   Big checkin after...
1148
/* a^-1 -> b */
110523d6   Jim Plank   GF-Complete Relea...
1149
  void
d59cbb2a   animetosho   Convert all comme...
1150
gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
110523d6   Jim Plank   GF-Complete Relea...
1151
{
70b6d55a   plank   Big checkin after...
1152
  uint64_t e_i[2], e_im1[2], e_ip1[2];
110523d6   Jim Plank   GF-Complete Relea...
1153
  uint64_t d_i, d_im1, d_ip1;
d59cbb2a   animetosho   Convert all comme...
1154
1155
  uint64_t y_i[2], y_im1[2], y_ip1[2];
  uint64_t c_i[2];
110523d6   Jim Plank   GF-Complete Relea...
1156
1157
1158
  uint64_t *b;
  uint64_t one = 1;

70b6d55a   plank   Big checkin after...
1159
  /* This needs to return some sort of error (in b128?) */
110523d6   Jim Plank   GF-Complete Relea...
1160
  if (a128[0] == 0 && a128[1] == 0) return;
d59cbb2a   animetosho   Convert all comme...
1161

110523d6   Jim Plank   GF-Complete Relea...
1162
  b = (uint64_t *) b128;
70b6d55a   plank   Big checkin after...
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187

  e_im1[0] = 0;
  e_im1[1] = ((gf_internal_t *) (gf->scratch))->prim_poly;
  e_i[0] = a128[0];
  e_i[1] = a128[1];
  d_im1 = 128;

  //Allen: I think d_i starts at 63 here, and checks each bit of a, starting at MSB, looking for the first nonzero bit
  //so d_i should be 0 if this half of a is all 0s, otherwise it should be the position from right of the first-from-left zero bit of this half of a.
  //BUT if d_i is 0 at end we won't know yet if the rightmost bit of this half is 1 or not

  for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[0]) == 0 && d_i > 0; d_i--) ;

  //Allen: this is testing just the first half of the stop condition above, so if it holds we know we did not find a nonzero bit yet

  if (!((one << d_i) & e_i[0])) {

    //Allen: this is doing the same thing on the other half of a. In other words, we're still searching for a nonzero bit of a.
    // but not bothering to test if d_i hits zero, which is fine because we've already tested for a=0.

    for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[1]) == 0; d_i--) ;

  } else {

    //Allen: if a 1 was found in more-significant half of a, make d_i the ACTUAL index of the first nonzero bit in the entire a.
110523d6   Jim Plank   GF-Complete Relea...
1188
1189

    d_i += 64;
70b6d55a   plank   Big checkin after...
1190
1191
1192
  }
  y_i[0] = 0;
  y_i[1] = 1;
70b6d55a   plank   Big checkin after...
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
  y_im1[0] = 0;
  y_im1[1] = 0;

  while (!(e_i[0] == 0 && e_i[1] == 1)) {

    e_ip1[0] = e_im1[0];
    e_ip1[1] = e_im1[1];
    d_ip1 = d_im1;
    c_i[0] = 0;
    c_i[1] = 0;

    while (d_ip1 >= d_i) {
      if ((d_ip1 - d_i) >= 64) {
        c_i[0] ^= (one << ((d_ip1 - d_i) - 64));
        e_ip1[0] ^= (e_i[1] << ((d_ip1 - d_i) - 64));
      } else {
        c_i[1] ^= (one << (d_ip1 - d_i));
        e_ip1[0] ^= (e_i[0] << (d_ip1 - d_i));
70b6d55a   plank   Big checkin after...
1211
1212
        if (d_ip1 - d_i > 0) e_ip1[0] ^= (e_i[1] >> (64 - (d_ip1 - d_i)));
        e_ip1[1] ^= (e_i[1] << (d_ip1 - d_i));
70b6d55a   plank   Big checkin after...
1213
1214
1215
      }
      d_ip1--;
      if (e_ip1[0] == 0 && e_ip1[1] == 0) { b[0] = 0; b[1] = 0; return; }
110523d6   Jim Plank   GF-Complete Relea...
1216
      while (d_ip1 >= 64 && (e_ip1[0] & (one << (d_ip1 - 64))) == 0) d_ip1--;
70b6d55a   plank   Big checkin after...
1217
1218
1219
1220
1221
1222
1223
1224
      while (d_ip1 <  64 && (e_ip1[1] & (one << d_ip1)) == 0) d_ip1--;
    }
    gf->multiply.w128(gf, c_i, y_i, y_ip1);
    y_ip1[0] ^= y_im1[0];
    y_ip1[1] ^= y_im1[1];

    y_im1[0] = y_i[0];
    y_im1[1] = y_i[1];
110523d6   Jim Plank   GF-Complete Relea...
1225

70b6d55a   plank   Big checkin after...
1226
1227
1228
1229
1230
1231
1232
1233
1234
    y_i[0] = y_ip1[0];
    y_i[1] = y_ip1[1];

    e_im1[0] = e_i[0];
    e_im1[1] = e_i[1];
    d_im1 = d_i;
    e_i[0] = e_ip1[0];
    e_i[1] = e_ip1[1];
    d_i = d_ip1;
110523d6   Jim Plank   GF-Complete Relea...
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
  }

  b[0] = y_i[0];
  b[1] = y_i[1];
  return;
}

  void
gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
  uint64_t d[2];
  gf->inverse.w128(gf, b128, d);
  gf->multiply.w128(gf, a128, d, c128);
  return;
}

  void
gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
{
  uint64_t one128[2];
  one128[0] = 0;
  one128[1] = 1;
  gf->divide.w128(gf, one128, a128, b128);
  return;
}


static
  void
gf_w128_composite_inverse(gf_t *gf, gf_val_128_t a, gf_val_128_t inv)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint64_t a0 = a[1];
  uint64_t a1 = a[0];
  uint64_t c0, c1, d, tmp;
  uint64_t a0inv, a1inv;

  if (a0 == 0) {
    a1inv = base_gf->inverse.w64(base_gf, a1);
    c0 = base_gf->multiply.w64(base_gf, a1inv, h->prim_poly);
    c1 = a1inv;
  } else if (a1 == 0) {
    c0 = base_gf->inverse.w64(base_gf, a0);
    c1 = 0;
  } else {
    a1inv = base_gf->inverse.w64(base_gf, a1);
    a0inv = base_gf->inverse.w64(base_gf, a0);

    d = base_gf->multiply.w64(base_gf, a1, a0inv);

    tmp = (base_gf->multiply.w64(base_gf, a1, a0inv) ^ base_gf->multiply.w64(base_gf, a0, a1inv) ^ h->prim_poly);
    tmp = base_gf->inverse.w64(base_gf, tmp);

    d = base_gf->multiply.w64(base_gf, d, tmp);

    c0 = base_gf->multiply.w64(base_gf, (d^1), a0inv);
    c1 = base_gf->multiply.w64(base_gf, d, a1inv);
  }
  inv[0] = c1;
  inv[1] = c0;
110523d6   Jim Plank   GF-Complete Relea...
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
}

static
  void
gf_w128_composite_multiply(gf_t *gf, gf_val_128_t a, gf_val_128_t b, gf_val_128_t rv)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint64_t b0 = b[1];
  uint64_t b1 = b[0];
  uint64_t a0 = a[1];
  uint64_t a1 = a[0];
  uint64_t a1b1;

  a1b1 = base_gf->multiply.w64(base_gf, a1, b1);

  rv[1] = (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1);
  rv[0] = base_gf->multiply.w64(base_gf, a1, b0) ^ 
    base_gf->multiply.w64(base_gf, a0, b1) ^ 
    base_gf->multiply.w64(base_gf, a1b1, h->prim_poly);
}

static
  void
gf_w128_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint64_t b0 = val[1];
  uint64_t b1 = val[0];
  uint64_t *s64, *d64;
  uint64_t *top;
  uint64_t a0, a1, a1b1;
  gf_region_data rd;

  if (val[0] == 0 && val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);

  s64 = rd.s_start;
  d64 = rd.d_start;
  top = rd.d_top;

  if (xor) {
    while (d64 < top) {
      a1 = s64[0];
      a0 = s64[1];
      a1b1 = base_gf->multiply.w64(base_gf, a1, b1);

      d64[1] ^= (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1);
      d64[0] ^= (base_gf->multiply.w64(base_gf, a1, b0) ^ 
          base_gf->multiply.w64(base_gf, a0, b1) ^ 
          base_gf->multiply.w64(base_gf, a1b1, h->prim_poly));
      s64 += 2;
110523d6   Jim Plank   GF-Complete Relea...
1350
1351
1352
1353
1354
1355
      d64 += 2;
    }
  } else {
    while (d64 < top) {
      a1 = s64[0];
      a0 = s64[1];
a0ae760e   Loic Dachary   prefer uint8_t to...
1356
      a1b1 = base_gf->multiply.w64(base_gf, a1, b1);
110523d6   Jim Plank   GF-Complete Relea...
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371

      d64[1] = (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1);
      d64[0] = (base_gf->multiply.w64(base_gf, a1, b0) ^ 
          base_gf->multiply.w64(base_gf, a0, b1) ^ 
          base_gf->multiply.w64(base_gf, a1b1, h->prim_poly));
      s64 += 2;
      d64 += 2;
    }
  }
}

static
void
gf_w128_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int 
    xor)
a0ae760e   Loic Dachary   prefer uint8_t to...
1372
{
110523d6   Jim Plank   GF-Complete Relea...
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
  gf_internal_t *h = (gf_internal_t *) gf->scratch;  gf_t *base_gf = h->base_gf;
  gf_val_64_t val0 = val[1];
  gf_val_64_t val1 = val[0];
  uint8_t *slow, *shigh;
  uint8_t *dlow, *dhigh, *top;
  int sub_reg_size;
  gf_region_data rd;

  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 64);
  gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor);

  slow = (uint8_t *) rd.s_start;
  dlow = (uint8_t *) rd.d_start;
  top = (uint8_t*) rd.d_top;
110523d6   Jim Plank   GF-Complete Relea...
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
  sub_reg_size = (top - dlow)/2;
  shigh = slow + sub_reg_size;
  dhigh = dlow + sub_reg_size;

  base_gf->multiply_region.w64(base_gf, slow, dlow, val0, sub_reg_size, xor);
  base_gf->multiply_region.w64(base_gf, shigh, dlow, val1, sub_reg_size, 1);
  base_gf->multiply_region.w64(base_gf, slow, dhigh, val1, sub_reg_size, xor);
  base_gf->multiply_region.w64(base_gf, shigh, dhigh, val0, sub_reg_size, 1);
  base_gf->multiply_region.w64(base_gf, shigh, dhigh, base_gf->multiply.w64(base_gf, h->prim_poly, val1
        ), sub_reg_size, 1);
fb0bbdcf   Jim Plank   Fixed the problem...
1397

110523d6   Jim Plank   GF-Complete Relea...
1398
1399
1400
1401
1402
1403
1404
1405
1406
  gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor);
}


  static
int gf_w128_composite_init(gf_t *gf)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;

70b6d55a   plank   Big checkin after...
1407
1408
1409
  if (h->region_type & GF_REGION_ALTMAP) {
    gf->multiply_region.w128 = gf_w128_composite_multiply_region_alt;   
  } else {
70b6d55a   plank   Big checkin after...
1410
1411
1412
1413
1414
1415
    gf->multiply_region.w128 = gf_w128_composite_multiply_region;
  }

  gf->multiply.w128 = gf_w128_composite_multiply;
  gf->divide.w128 = gf_w128_divide_from_inverse;
  gf->inverse.w128 = gf_w128_composite_inverse;
110523d6   Jim Plank   GF-Complete Relea...
1416

f58a7de9   plank   w128: bytwo-b, gr...
1417
1418
  return 1;
}
110523d6   Jim Plank   GF-Complete Relea...
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430

static
int gf_w128_cfm_init(gf_t *gf)
{
#if defined(INTEL_SSE4_PCLMUL)
  gf->inverse.w128 = gf_w128_euclid;
  gf->multiply.w128 = gf_w128_clm_multiply;
  gf->multiply_region.w128 = gf_w128_clm_multiply_region_from_single;
  return 1;
#endif

  return 0;
f58a7de9   plank   w128: bytwo-b, gr...
1431
}
f58a7de9   plank   w128: bytwo-b, gr...
1432
1433
1434
1435

static
int gf_w128_shift_init(gf_t *gf)
{
70b6d55a   plank   Big checkin after...
1436
1437
1438
1439
  gf->multiply.w128 = gf_w128_shift_multiply;
  gf->inverse.w128 = gf_w128_euclid;
  gf->multiply_region.w128 = gf_w128_multiply_region_from_single;
  return 1;
110523d6   Jim Plank   GF-Complete Relea...
1440
}
70b6d55a   plank   Big checkin after...
1441
1442
1443
1444
1445
1446
1447
1448
1449

  static
int gf_w128_bytwo_init(gf_t *gf)
{
  gf_internal_t *h;
  h = (gf_internal_t *) gf->scratch;

  if (h->mult_type == GF_MULT_BYTWO_p) {
    gf->multiply.w128 = gf_w128_bytwo_p_multiply;
110523d6   Jim Plank   GF-Complete Relea...
1450
    /*gf->multiply.w128 = gf_w128_sse_bytwo_p_multiply;*/
70b6d55a   plank   Big checkin after...
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
    /* John: the sse function is slower.*/
  } else {
    gf->multiply.w128 = gf_w128_bytwo_b_multiply;
    /*gf->multiply.w128 = gf_w128_sse_bytwo_b_multiply;
Ben: This sse function is also slower. */
  }
  gf->inverse.w128 = gf_w128_euclid;
  gf->multiply_region.w128 = gf_w128_bytwo_b_multiply_region;
  return 1;
}

/*
 * Because the prim poly is only 8 bits and we are limiting g_r to 16, I do not need the high 64
 * bits in all of these numbers.
d59cbb2a   animetosho   Convert all comme...
1465
 */
110523d6   Jim Plank   GF-Complete Relea...
1466
1467
1468
  static
void gf_w128_group_r_init(gf_t *gf)
{
110523d6   Jim Plank   GF-Complete Relea...
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
  int i, j;
  int g_r;
  uint64_t pp;
  gf_internal_t *scratch;
  gf_group_tables_t *gt;
  scratch = (gf_internal_t *) gf->scratch;
  gt = scratch->private;
  g_r = scratch->arg2;
  pp = scratch->prim_poly;

  gt->r_table[0] = 0;
  for (i = 1; i < (1 << g_r); i++) {
    gt->r_table[i] = 0;
    for (j = 0; j < g_r; j++) {
      if (i & (1 << j)) {
        gt->r_table[i] ^= (pp << j);
4dc7a55d   animetosho   Eliminate some ad...
1485
      }
b561863b   animetosho   Remove most insta...
1486
    }
110523d6   Jim Plank   GF-Complete Relea...
1487
1488
1489
1490
  }
  return;
}

110523d6   Jim Plank   GF-Complete Relea...
1491
#if 0 // defined(INTEL_SSE4)
29899ad4   Loic Dachary   move #if to avoid...
1492
  static
110523d6   Jim Plank   GF-Complete Relea...
1493
1494
void gf_w128_group_r_sse_init(gf_t *gf)
{
f58a7de9   plank   w128: bytwo-b, gr...
1495
1496
  int i, j;
  int g_r;
110523d6   Jim Plank   GF-Complete Relea...
1497
1498
  uint64_t pp;
  gf_internal_t *scratch;
f58a7de9   plank   w128: bytwo-b, gr...
1499
1500
1501
  gf_group_tables_t *gt;
  scratch = (gf_internal_t *) gf->scratch;
  gt = scratch->private;
f58a7de9   plank   w128: bytwo-b, gr...
1502
  __m128i zero = _mm_setzero_si128();
110523d6   Jim Plank   GF-Complete Relea...
1503
  __m128i *table = (__m128i *)(gt->r_table);
fb0bbdcf   Jim Plank   Fixed the problem...
1504
  g_r = scratch->arg2;
568df90e   Janne Grunau   simd: rename the ...
1505
  pp = scratch->prim_poly;
110523d6   Jim Plank   GF-Complete Relea...
1506
1507
1508
1509
  table[0] = zero;
  for (i = 1; i < (1 << g_r); i++) {
    table[i] = zero;
    for (j = 0; j < g_r; j++) {
f58a7de9   plank   w128: bytwo-b, gr...
1510
      if (i & (1 << j)) {
110523d6   Jim Plank   GF-Complete Relea...
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
        table[i] = _mm_xor_si128(table[i], _mm_insert_epi64(zero, pp << j, 0));
      }
    }
  }
  return;
}
#endif

  static 
int gf_w128_split_init(gf_t *gf)
{
  struct gf_w128_split_4_128_data *sd4;
  struct gf_w128_split_8_128_data *sd8;
568df90e   Janne Grunau   simd: rename the ...
1524
  gf_internal_t *h;
110523d6   Jim Plank   GF-Complete Relea...
1525
1526
1527
1528
1529
1530
1531
1532

  h = (gf_internal_t *) gf->scratch;

  gf->multiply.w128 = gf_w128_bytwo_p_multiply;
#if defined(INTEL_SSE4_PCLMUL)
  if (!(h->region_type & GF_REGION_NOSIMD)){
    gf->multiply.w128 = gf_w128_clm_multiply;
  }
7bb3797a   Jim Plank   Added SPLIT SSE S...
1533
#endif
568df90e   Janne Grunau   simd: rename the ...
1534

7bb3797a   Jim Plank   Added SPLIT SSE S...
1535
1536
1537
1538
  gf->inverse.w128 = gf_w128_euclid;

  if ((h->arg1 != 4 && h->arg2 != 4) || h->mult_type == GF_MULT_DEFAULT) {
    sd8 = (struct gf_w128_split_8_128_data *) h->private;
110523d6   Jim Plank   GF-Complete Relea...
1539
    sd8->last_value[0] = 0;
7bb3797a   Jim Plank   Added SPLIT SSE S...
1540
    sd8->last_value[1] = 0;
110523d6   Jim Plank   GF-Complete Relea...
1541
1542
    gf->multiply_region.w128 = gf_w128_split_8_128_multiply_region;
  } else {
f58a7de9   plank   w128: bytwo-b, gr...
1543
1544
1545
1546
    sd4 = (struct gf_w128_split_4_128_data *) h->private;
    sd4->last_value[0] = 0;
    sd4->last_value[1] = 0;
    if((h->region_type & GF_REGION_ALTMAP))
70b6d55a   plank   Big checkin after...
1547
1548
1549
1550
1551
    {
      #ifdef INTEL_SSE4
        if(!(h->region_type & GF_REGION_NOSIMD))
          gf->multiply_region.w128 = gf_w128_split_4_128_sse_altmap_multiply_region;
        else
f043479e   Loic Dachary   remove unused var...
1552
          return 0;
70b6d55a   plank   Big checkin after...
1553
1554
1555
      #else
        return 0;
      #endif
110523d6   Jim Plank   GF-Complete Relea...
1556
    }
70b6d55a   plank   Big checkin after...
1557
1558
    else {
      #ifdef INTEL_SSE4
a0ae760e   Loic Dachary   prefer uint8_t to...
1559
        if(!(h->region_type & GF_REGION_NOSIMD))
70b6d55a   plank   Big checkin after...
1560
1561
1562
1563
          gf->multiply_region.w128 = gf_w128_split_4_128_sse_multiply_region;
        else
          gf->multiply_region.w128 = gf_w128_split_4_128_multiply_region;
      #else
70b6d55a   plank   Big checkin after...
1564
1565
      gf->multiply_region.w128 = gf_w128_split_4_128_multiply_region;
      #endif
f58a7de9   plank   w128: bytwo-b, gr...
1566
    }
110523d6   Jim Plank   GF-Complete Relea...
1567
  }
f0c32c94   Jim Plank   Removed GROUP/128...
1568
  return 1;
110523d6   Jim Plank   GF-Complete Relea...
1569
}
70b6d55a   plank   Big checkin after...
1570
1571
1572


static
f58a7de9   plank   w128: bytwo-b, gr...
1573
1574
1575
1576
1577
1578
1579
1580
1581
int gf_w128_group_init(gf_t *gf)
{
  gf_internal_t *scratch;
  gf_group_tables_t *gt;
  int g_r, size_r;

  scratch = (gf_internal_t *) gf->scratch;
  gt = scratch->private;
  g_r = scratch->arg2;
110523d6   Jim Plank   GF-Complete Relea...
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
  size_r = (1 << g_r);

  gt->r_table = (gf_val_128_t)((uint8_t *)scratch->private + (2 * sizeof(uint64_t *)));
  gt->m_table = gt->r_table + size_r;
  gt->m_table[2] = 0;
  gt->m_table[3] = 0;

  gf->multiply.w128 = gf_w128_group_multiply;
  gf->inverse.w128 = gf_w128_euclid;
  gf->multiply_region.w128 = gf_w128_group_multiply_region;

  gf_w128_group_r_init(gf);

  return 1;
}

void gf_w128_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv)
{
  gf_val_128_t s;

  s = (gf_val_128_t) start;
  s += (index * 2); 
  memcpy(rv, s, 16);
}

static void gf_w128_split_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv)
{
  int i, blocks;
  uint64_t *r64, tmp;
  uint8_t *r8;
  gf_region_data rd;

  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 256);
  r64 = (uint64_t *) start;
  if ((r64 + index*2 < (uint64_t *) rd.d_start) ||
      (r64 + index*2 >= (uint64_t *) rd.d_top)) {
    memcpy(rv, r64+(index*2), 16);
    return;
  }

  index -= (((uint64_t *) rd.d_start) - r64)/2;
  r64 = (uint64_t *) rd.d_start;

  blocks = index/16;
  r64 += (blocks*32);
  index %= 16;
  r8 = (uint8_t *) r64;
  r8 += index;
  rv[0] = 0;
  rv[1] = 0;

  for (i = 0; i < 8; i++) {
    tmp = *r8;
    rv[1] |= (tmp << (i*8));
    r8 += 16;
  }

  for (i = 0; i < 8; i++) {
    tmp = *r8;
    rv[0] |= (tmp << (i*8));
    r8 += 16;
  }
  return;
}

  static
void gf_w128_composite_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv)
{
70b6d55a   plank   Big checkin after...
1650
1651
1652
  int sub_size;
  gf_internal_t *h;
  uint8_t *r8, *top;
110523d6   Jim Plank   GF-Complete Relea...
1653
1654
  uint64_t *r64;
  gf_region_data rd;
70b6d55a   plank   Big checkin after...
1655
1656

  h = (gf_internal_t *) gf->scratch;
110523d6   Jim Plank   GF-Complete Relea...
1657
1658
1659
  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 64);
  r64 = (uint64_t *) start;
  if ((r64 + index*2 < (uint64_t *) rd.d_start) ||
70b6d55a   plank   Big checkin after...
1660
      (r64 + index*2 >= (uint64_t *) rd.d_top)) {
70b6d55a   plank   Big checkin after...
1661
1662
    memcpy(rv, r64+(index*2), 16);
    return;
110523d6   Jim Plank   GF-Complete Relea...
1663
  }
f58a7de9   plank   w128: bytwo-b, gr...
1664
  index -= (((uint64_t *) rd.d_start) - r64)/2;
f58a7de9   plank   w128: bytwo-b, gr...
1665
1666
  r8 = (uint8_t *) rd.d_start;
  top = (uint8_t *) rd.d_top;
110523d6   Jim Plank   GF-Complete Relea...
1667
  sub_size = (top-r8)/2;
f58a7de9   plank   w128: bytwo-b, gr...
1668

f58a7de9   plank   w128: bytwo-b, gr...
1669
1670
  rv[1] = h->base_gf->extract_word.w64(h->base_gf, r8, sub_size, index);
  rv[0] = h->base_gf->extract_word.w64(h->base_gf, r8+sub_size, sub_size, index);
110523d6   Jim Plank   GF-Complete Relea...
1671
1672
  
  return;
f58a7de9   plank   w128: bytwo-b, gr...
1673
}
110523d6   Jim Plank   GF-Complete Relea...
1674

f58a7de9   plank   w128: bytwo-b, gr...
1675
int gf_w128_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
70b6d55a   plank   Big checkin after...
1676
{
110523d6   Jim Plank   GF-Complete Relea...
1677
  int size_m, size_r;
70b6d55a   plank   Big checkin after...
1678
  if (divide_type==GF_DIVIDE_MATRIX) return 0;
110523d6   Jim Plank   GF-Complete Relea...
1679

70b6d55a   plank   Big checkin after...
1680
1681
1682
1683
  switch(mult_type)
  {
    case GF_MULT_CARRY_FREE:
      return sizeof(gf_internal_t);
110523d6   Jim Plank   GF-Complete Relea...
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
      break;
    case GF_MULT_SHIFT:
      return sizeof(gf_internal_t);
      break;
    case GF_MULT_BYTWO_p:
    case GF_MULT_BYTWO_b:
      return sizeof(gf_internal_t);
      break;
    case GF_MULT_DEFAULT: 
    case GF_MULT_SPLIT_TABLE:
70b6d55a   plank   Big checkin after...
1694
      if ((arg1 == 4 && arg2 == 128) || (arg1 == 128 && arg2 == 4)) {
110523d6   Jim Plank   GF-Complete Relea...
1695
        return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_4_128_data) + 64;
70b6d55a   plank   Big checkin after...
1696
1697
1698
1699
1700
      } else if ((arg1 == 8 && arg2 == 128) || (arg1 == 128 && arg2 == 8) || mult_type == GF_MULT_DEFAULT) {
        return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_8_128_data) + 64;
      }
      return 0;
      break;
f043479e   Loic Dachary   remove unused var...
1701
    case GF_MULT_GROUP:
70b6d55a   plank   Big checkin after...
1702
1703
      /* JSP We've already error checked the arguments. */
      size_m = (1 << arg1) * 2 * sizeof(uint64_t);
110523d6   Jim Plank   GF-Complete Relea...
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
      size_r = (1 << arg2) * 2 * sizeof(uint64_t);
      /* 
       * two pointers prepend the table data for structure
       * because the tables are of dynamic size
       */
      return sizeof(gf_internal_t) + size_m + size_r + 4 * sizeof(uint64_t *);
      break;
    case GF_MULT_COMPOSITE:
      if (arg1 == 2) {
        return sizeof(gf_internal_t) + 4;
110523d6   Jim Plank   GF-Complete Relea...
1714
      } else {
70b6d55a   plank   Big checkin after...
1715
1716
1717
1718
1719
        return 0;
      }
      break;

    default:
70b6d55a   plank   Big checkin after...
1720
      return 0;
110523d6   Jim Plank   GF-Complete Relea...
1721
   }
f58a7de9   plank   w128: bytwo-b, gr...
1722
}
110523d6   Jim Plank   GF-Complete Relea...
1723

70b6d55a   plank   Big checkin after...
1724
int gf_w128_init(gf_t *gf)
70b6d55a   plank   Big checkin after...
1725
{
110523d6   Jim Plank   GF-Complete Relea...
1726
  gf_internal_t *h;
f58a7de9   plank   w128: bytwo-b, gr...
1727

110523d6   Jim Plank   GF-Complete Relea...
1728
  h = (gf_internal_t *) gf->scratch;
70b6d55a   plank   Big checkin after...
1729
1730
  
  /* Allen: set default primitive polynomial / irreducible polynomial if needed */
f58a7de9   plank   w128: bytwo-b, gr...
1731

110523d6   Jim Plank   GF-Complete Relea...
1732
1733
1734
1735
1736
1737
1738
1739
1740
  if (h->prim_poly == 0) {
    if (h->mult_type == GF_MULT_COMPOSITE) {
      h->prim_poly = gf_composite_get_default_poly(h->base_gf);
      if (h->prim_poly == 0) return 0; /* This shouldn't happen */
    } else {
      h->prim_poly = 0x87; /* Omitting the leftmost 1 as in w=32 */
    }
  }

f58a7de9   plank   w128: bytwo-b, gr...
1741
  gf->multiply.w128 = NULL;
70b6d55a   plank   Big checkin after...
1742
1743
  gf->divide.w128 = NULL;
  gf->inverse.w128 = NULL;
110523d6   Jim Plank   GF-Complete Relea...
1744
  gf->multiply_region.w128 = NULL;
70b6d55a   plank   Big checkin after...
1745
1746
1747
1748
1749
1750
1751
1752
1753
  switch(h->mult_type) {
    case GF_MULT_BYTWO_p:
    case GF_MULT_BYTWO_b:      if (gf_w128_bytwo_init(gf) == 0) return 0; break;
    case GF_MULT_CARRY_FREE:   if (gf_w128_cfm_init(gf) == 0) return 0; break;
    case GF_MULT_SHIFT:        if (gf_w128_shift_init(gf) == 0) return 0; break;
    case GF_MULT_GROUP:        if (gf_w128_group_init(gf) == 0) return 0; break;
    case GF_MULT_DEFAULT: 
    case GF_MULT_SPLIT_TABLE:  if (gf_w128_split_init(gf) == 0) return 0; break;
    case GF_MULT_COMPOSITE:    if (gf_w128_composite_init(gf) == 0) return 0; break;