Blame view

src/gf_w128.c 48.2 KB
70b6d55a   plank   Big checkin after...
1
/*
b0e2ae07   Jim Plank   Put headers on th...
2
3
4
5
 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
 *
70b6d55a   plank   Big checkin after...
6
7
8
9
10
11
12
13
 * gf_w128.c
 *
 * Routines for 128-bit Galois fields
 */

#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
4339569f   Bassam Tabbara   Support for runti...
14
#include "gf_cpu.h"
70b6d55a   plank   Big checkin after...
15
16
17
18
19

#define GF_FIELD_WIDTH (128)

#define two_x(a) {\
  a[0] <<= 1; \
110523d6   Jim Plank   GF-Complete Relea...
20
  if (a[1] & 1ULL << 63) a[0] ^= 1; \
70b6d55a   plank   Big checkin after...
21
22
23
24
25
26
27
28
29
30
  a[1] <<= 1; }
  
#define a_get_b(a, i, b, j) {\
  a[i] = b[j]; \
  a[i + 1] = b[j + 1];}

#define set_zero(a, i) {\
  a[i] = 0; \
  a[i + 1] = 0;}

f58a7de9   plank   w128: bytwo-b, gr...
31
32
33
34
35
struct gf_w128_split_4_128_data {
  uint64_t last_value[2];
  uint64_t tables[2][32][16];
};

110523d6   Jim Plank   GF-Complete Relea...
36
37
38
39
40
struct gf_w128_split_8_128_data {
  uint64_t last_value[2];
  uint64_t tables[2][16][256];
};

70b6d55a   plank   Big checkin after...
41
42
43
44
45
typedef struct gf_group_tables_s {
  gf_val_128_t m_table;
  gf_val_128_t r_table;
} gf_group_tables_t;

110523d6   Jim Plank   GF-Complete Relea...
46
47
#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? "   " : " ", blah[15-ii]); printf("\n"); }

70b6d55a   plank   Big checkin after...
48
49
50
51
52
static
void
gf_w128_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes,
int xor)
{
d1b6bbf7   Loic Dachary   add -Wsign-compar...
53
    uint32_t i;
70b6d55a   plank   Big checkin after...
54
55
56
    gf_val_128_t s128;
    gf_val_128_t d128;
    uint64_t c128[2];
f58a7de9   plank   w128: bytwo-b, gr...
57
58
59
60
61
62
63
    gf_region_data rd;

    /* We only do this to check on alignment. */
    gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);

    if (val[0] == 0) {
      if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
e96c0e28   plank   Renaming gf.h to ...
64
      if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
f58a7de9   plank   w128: bytwo-b, gr...
65
    }
70b6d55a   plank   Big checkin after...
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84

    set_zero(c128, 0);

    s128 = (gf_val_128_t) src;
    d128 = (gf_val_128_t) dest;

    if (xor) {
      for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
        gf->multiply.w128(gf, &s128[i], val, c128);
        d128[i] ^= c128[0];
        d128[i+1] ^= c128[1];
      }
    } else {
      for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
        gf->multiply.w128(gf, &s128[i], val, &d128[i]);
      }
    }
}

29899ad4   Loic Dachary   move #if to avoid...
85
#if defined(INTEL_SSE4_PCLMUL)
110523d6   Jim Plank   GF-Complete Relea...
86
87
88
89
90
static
void
gf_w128_clm_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes,
int xor)
{
d1b6bbf7   Loic Dachary   add -Wsign-compar...
91
    uint32_t i;
110523d6   Jim Plank   GF-Complete Relea...
92
93
    gf_val_128_t s128;
    gf_val_128_t d128;
110523d6   Jim Plank   GF-Complete Relea...
94
    gf_region_data rd;
110523d6   Jim Plank   GF-Complete Relea...
95
96
97
98
99
100
101
102
103
104
105
106
107
108
    __m128i     a,b;
    __m128i     result0,result1;
    __m128i     prim_poly;
    __m128i     c,d,e,f;
    gf_internal_t * h = gf->scratch;
    prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);
    /* We only do this to check on alignment. */
    gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);

    if (val[0] == 0) {
      if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
      if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
    }

110523d6   Jim Plank   GF-Complete Relea...
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
    s128 = (gf_val_128_t) src;
    d128 = (gf_val_128_t) dest;

    if (xor) {
      for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
        a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0);
        b = _mm_insert_epi64 (a, val[1], 0);
        a = _mm_insert_epi64 (a, s128[i], 1);
        b = _mm_insert_epi64 (b, val[0], 1);
    
        c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
        f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
        e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/
        d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/

        /* now reusing a and b as temporary variables*/
        result0 = _mm_setzero_si128();
        result1 = result0;

        result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0));
        a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1));
        result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a));

        a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0));
        result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a));
        result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1));
        /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce. */

        a = _mm_srli_si128 (result0, 8);
        b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
        result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8));
        result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8));

        a = _mm_insert_epi64 (result0, 0, 1);
        b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
        result1 = _mm_xor_si128 (result1, b); 
        d128[i] ^= (uint64_t)_mm_extract_epi64(result1,1);
        d128[i+1] ^= (uint64_t)_mm_extract_epi64(result1,0);
      }
    } else {
      for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
        a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0);
        b = _mm_insert_epi64 (a, val[1], 0);
        a = _mm_insert_epi64 (a, s128[i], 1);
        b = _mm_insert_epi64 (b, val[0], 1);

        c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
        f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
        e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/ 
        d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/ 

        /* now reusing a and b as temporary variables*/
        result0 = _mm_setzero_si128();
        result1 = result0;

        result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0));
        a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1));
        result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a));

        a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0));
        result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a));
        result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1));
        /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce.*/

        a = _mm_srli_si128 (result0, 8);
        b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
        result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8));
        result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8));

        a = _mm_insert_epi64 (result0, 0, 1);
        b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
        result1 = _mm_xor_si128 (result1, b);
        d128[i] = (uint64_t)_mm_extract_epi64(result1,1);
        d128[i+1] = (uint64_t)_mm_extract_epi64(result1,0);
      }
    }
110523d6   Jim Plank   GF-Complete Relea...
185
}
29899ad4   Loic Dachary   move #if to avoid...
186
#endif
110523d6   Jim Plank   GF-Complete Relea...
187

70b6d55a   plank   Big checkin after...
188
189
190
191
192
/*
 * Some w128 notes:
 * --Big Endian
 * --return values allocated beforehand
 */
110523d6   Jim Plank   GF-Complete Relea...
193
194
195

#define GF_W128_IS_ZERO(val) (val[0] == 0 && val[1] == 0)

70b6d55a   plank   Big checkin after...
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
void
gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
  /* ordered highest bit to lowest l[0] l[1] r[0] r[1] */
  uint64_t pl[2], pr[2], ppl[2], ppr[2], i, a[2], bl[2], br[2], one, lbit;
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;

  if (GF_W128_IS_ZERO(a128) || GF_W128_IS_ZERO(b128)) {
    set_zero(c128, 0);
    return;
  }

  a_get_b(a, 0, a128, 0);
  a_get_b(br, 0, b128, 0);
  set_zero(bl, 0);

  one = 1;
  lbit = (one << 63);

  set_zero(pl, 0);
  set_zero(pr, 0);

110523d6   Jim Plank   GF-Complete Relea...
220
  /* Allen: a*b for right half of a */
70b6d55a   plank   Big checkin after...
221
222
223
224
225
226
227
228
229
230
231
232
233
  for (i = 0; i < GF_FIELD_WIDTH/2; i++) {
    if (a[1] & (one << i)) {
      pl[1] ^= bl[1];
      pr[0] ^= br[0];
      pr[1] ^= br[1];
    }
    bl[1] <<= 1;
    if (br[0] & lbit) bl[1] ^= 1;
    br[0] <<= 1;
    if (br[1] & lbit) br[0] ^= 1;
    br[1] <<= 1;
  }

110523d6   Jim Plank   GF-Complete Relea...
234
  /* Allen: a*b for left half of a */
70b6d55a   plank   Big checkin after...
235
236
237
238
239
240
241
242
243
244
245
246
247
  for (i = 0; i < GF_FIELD_WIDTH/2; i++) {
    if (a[0] & (one << i)) {
      pl[0] ^= bl[0];
      pl[1] ^= bl[1];
      pr[0] ^= br[0];
    }
    bl[0] <<= 1;
    if (bl[1] & lbit) bl[0] ^= 1;
    bl[1] <<= 1;
    if (br[0] & lbit) bl[1] ^= 1;
    br[0] <<= 1;
  }

110523d6   Jim Plank   GF-Complete Relea...
248
249
250
251
252
  /* Allen: do first half of reduction (based on left quarter of initial product) */
  one = lbit >> 1;
  ppl[0] = one; /* Allen: introduce leading one of primitive polynomial */
  ppl[1] = h->prim_poly >> 2;
  ppr[0] = h->prim_poly << (GF_FIELD_WIDTH/2-2);
70b6d55a   plank   Big checkin after...
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
  ppr[1] = 0;
  while (one != 0) {
    if (pl[0] & one) {
      pl[0] ^= ppl[0];
      pl[1] ^= ppl[1];
      pr[0] ^= ppr[0];
      pr[1] ^= ppr[1];
    }
    one >>= 1;
    ppr[1] >>= 1;
    if (ppr[0] & 1) ppr[1] ^= lbit;
    ppr[0] >>= 1;
    if (ppl[1] & 1) ppr[0] ^= lbit;
    ppl[1] >>= 1;
    if (ppl[0] & 1) ppl[1] ^= lbit;
    ppl[0] >>= 1;
  }

110523d6   Jim Plank   GF-Complete Relea...
271
  /* Allen: final half of reduction */
70b6d55a   plank   Big checkin after...
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
  one = lbit;
  while (one != 0) {
    if (pl[1] & one) {
      pl[1] ^= ppl[1];
      pr[0] ^= ppr[0];
      pr[1] ^= ppr[1];
    }
    one >>= 1;
    ppr[1] >>= 1;
    if (ppr[0] & 1) ppr[1] ^= lbit;
    ppr[0] >>= 1;
    if (ppl[1] & 1) ppr[0] ^= lbit;
    ppl[1] >>= 1;
  }

110523d6   Jim Plank   GF-Complete Relea...
287
  /* Allen: if we really want to optimize this we can just be using c128 instead of pr all along */
70b6d55a   plank   Big checkin after...
288
289
290
291
292
293
  c128[0] = pr[0];
  c128[1] = pr[1];

  return;
}

4339569f   Bassam Tabbara   Support for runti...
294
295
#if defined(INTEL_SSE4_PCLMUL)

f58a7de9   plank   w128: bytwo-b, gr...
296
void
110523d6   Jim Plank   GF-Complete Relea...
297
298
gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
110523d6   Jim Plank   GF-Complete Relea...
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
    __m128i     a,b;
    __m128i     result0,result1;
    __m128i     prim_poly;
    __m128i     c,d,e,f;
    gf_internal_t * h = gf->scratch;
    
    a = _mm_insert_epi64 (_mm_setzero_si128(), a128[1], 0);
    b = _mm_insert_epi64 (a, b128[1], 0);
    a = _mm_insert_epi64 (a, a128[0], 1);
    b = _mm_insert_epi64 (b, b128[0], 1);

    prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);

    /* we need to test algorithm 2 later*/
    c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
    f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
    e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/
    d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/
    
    /* now reusing a and b as temporary variables*/
    result0 = _mm_setzero_si128();
    result1 = result0;

    result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0));
    a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1));
    result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a));

    a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0));
    result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a));
    result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1));
    /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce.*/
    
    a = _mm_srli_si128 (result0, 8);
    b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
    result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8));
    result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8));
    
    a = _mm_insert_epi64 (result0, 0, 1);
    b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
    result1 = _mm_xor_si128 (result1, b);

    c128[0] = (uint64_t)_mm_extract_epi64(result1,1);
    c128[1] = (uint64_t)_mm_extract_epi64(result1,0);
110523d6   Jim Plank   GF-Complete Relea...
342
}
4339569f   Bassam Tabbara   Support for runti...
343
#endif
110523d6   Jim Plank   GF-Complete Relea...
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378

void
gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
  uint64_t amask[2], pmask, pp, prod[2]; /*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/
  uint64_t topbit; /* this is used as a boolean value */
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;
  prod[0] = 0;
  prod[1] = 0;
  pmask = 0x8000000000000000ULL;
  amask[0] = 0x8000000000000000ULL;
  amask[1] = 0;

  while (amask[1] != 0 || amask[0] != 0) {
    topbit = (prod[0] & pmask);
    prod[0] <<= 1;
    if (prod[1] & pmask) prod[0] ^= 1;
    prod[1] <<= 1;
    if (topbit) prod[1] ^= pp;
    if ((a128[0] & amask[0]) || (a128[1] & amask[1])) {
      prod[0] ^= b128[0];
      prod[1] ^= b128[1];
    }
    amask[1] >>= 1;
    if (amask[0] & 1) amask[1] ^= pmask;
    amask[0] >>= 1;
  }
  c128[0] = prod [0];
  c128[1] = prod [1];
  return;
}

4339569f   Bassam Tabbara   Support for runti...
379
#if defined(INTEL_SSE4)
110523d6   Jim Plank   GF-Complete Relea...
380
381
382
void
gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
110523d6   Jim Plank   GF-Complete Relea...
383
  int i;
191b86b5   Loic Dachary   remove unused var...
384
  __m128i a, b, pp, prod, amask, u_middle_one; 
110523d6   Jim Plank   GF-Complete Relea...
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
  /*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/
  uint32_t topbit, middlebit, pmask; /* this is used as a boolean value */
  gf_internal_t *h;


  h = (gf_internal_t *) gf->scratch;
  pp = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);
  prod = _mm_setzero_si128();
  a = _mm_insert_epi64(prod, a128[1], 0x0);
  a = _mm_insert_epi64(a, a128[0], 0x1);
  b = _mm_insert_epi64(prod, b128[1], 0x0);
  b = _mm_insert_epi64(b, b128[0], 0x1);
  pmask = 0x80000000;
  amask = _mm_insert_epi32(prod, 0x80000000, 0x3);
  u_middle_one = _mm_insert_epi32(prod, 1, 0x2);
110523d6   Jim Plank   GF-Complete Relea...
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
  
  for (i = 0; i < 64; i++) {
    topbit = (_mm_extract_epi32(prod, 0x3) & pmask);
    middlebit = (_mm_extract_epi32(prod, 0x1) & pmask);
    prod = _mm_slli_epi64(prod, 1); /* this instruction loses the middle bit */
    if (middlebit) {
      prod = _mm_xor_si128(prod, u_middle_one);
    }
    if (topbit) {
      prod = _mm_xor_si128(prod, pp);
    }
    if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 1))) {
      prod = _mm_xor_si128(prod, b);
    }
    amask = _mm_srli_epi64(amask, 1); /*so does this one, but we can just replace after loop*/
  }
2700e1b9   Brad Hubbard   Resolve cppcheck ...
416
  amask = _mm_insert_epi32(amask, (gf_val_32_t)1 << 31, 0x1);
110523d6   Jim Plank   GF-Complete Relea...
417
418
419
420
421
422
423
424
425
426
427
428
429
  for (i = 64; i < 128; i++) {
    topbit = (_mm_extract_epi32(prod, 0x3) & pmask);
    middlebit = (_mm_extract_epi32(prod, 0x1) & pmask);
    prod = _mm_slli_epi64(prod, 1);
    if (middlebit) prod = _mm_xor_si128(prod, u_middle_one);
    if (topbit) prod = _mm_xor_si128(prod, pp);
    if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 0))) {
      prod = _mm_xor_si128(prod, b);
    }
    amask = _mm_srli_epi64(amask, 1);
  }
  c128[0] = (uint64_t)_mm_extract_epi64(prod, 1);
  c128[1] = (uint64_t)_mm_extract_epi64(prod, 0);
110523d6   Jim Plank   GF-Complete Relea...
430
431
  return;
}
4339569f   Bassam Tabbara   Support for runti...
432
#endif
110523d6   Jim Plank   GF-Complete Relea...
433
434
435


/* Ben: This slow function implements sse instrutions for bytwo_b because why not */
4339569f   Bassam Tabbara   Support for runti...
436
#if defined(INTEL_SSE4)
110523d6   Jim Plank   GF-Complete Relea...
437
438
439
void
gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
110523d6   Jim Plank   GF-Complete Relea...
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
  __m128i a, b, lmask, hmask, pp, c, middle_one;
  gf_internal_t *h;
  uint64_t topbit, middlebit;

  h = (gf_internal_t *) gf->scratch;
  
  c = _mm_setzero_si128();
  lmask = _mm_insert_epi64(c, 1ULL << 63, 0);
  hmask = _mm_insert_epi64(c, 1ULL << 63, 1);
  b = _mm_insert_epi64(c, a128[0], 1);
  b = _mm_insert_epi64(b, a128[1], 0);
  a = _mm_insert_epi64(c, b128[0], 1);
  a = _mm_insert_epi64(a, b128[1], 0);
  pp = _mm_insert_epi64(c, h->prim_poly, 0);
  middle_one = _mm_insert_epi64(c, 1, 0x1);

  while (1) {
    if (_mm_extract_epi32(a, 0x0) & 1) {
      c = _mm_xor_si128(c, b);
    }
    middlebit = (_mm_extract_epi32(a, 0x2) & 1);
    a = _mm_srli_epi64(a, 1);
    if (middlebit) a = _mm_xor_si128(a, lmask);
    if ((_mm_extract_epi64(a, 0x1) == 0ULL) && (_mm_extract_epi64(a, 0x0) == 0ULL)){
      c128[0] = _mm_extract_epi64(c, 0x1);
      c128[1] = _mm_extract_epi64(c, 0x0);
      return;
    }
    topbit = (_mm_extract_epi64(_mm_and_si128(b, hmask), 1));
    middlebit = (_mm_extract_epi64(_mm_and_si128(b, lmask), 0));
    b = _mm_slli_epi64(b, 1);
    if (middlebit) b = _mm_xor_si128(b, middle_one);
    if (topbit) b = _mm_xor_si128(b, pp);
  }
110523d6   Jim Plank   GF-Complete Relea...
474
}
4339569f   Bassam Tabbara   Support for runti...
475
#endif
110523d6   Jim Plank   GF-Complete Relea...
476
477

void
f58a7de9   plank   w128: bytwo-b, gr...
478
479
480
481
482
483
484
485
gf_w128_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
  uint64_t bmask, pp;
  gf_internal_t *h;
  uint64_t a[2], b[2], c[2];

  h = (gf_internal_t *) gf->scratch;

110523d6   Jim Plank   GF-Complete Relea...
486
  bmask = (1ULL << 63);
f58a7de9   plank   w128: bytwo-b, gr...
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
  set_zero(c, 0);
  b[0] = a128[0];
  b[1] = a128[1];
  a[0] = b128[0];
  a[1] = b128[1];
  
  while (1) {
    if (a[1] & 1) {
      c[0] ^= b[0];
      c[1] ^= b[1];
    }
    a[1] >>= 1;
    if (a[0] & 1) a[1] ^= bmask;
    a[0] >>= 1;
    if (a[1] == 0 && a[0] == 0) {
      c128[0] = c[0];
      c128[1] = c[1];
      return;
    }
    pp = (b[0] & bmask);
    b[0] <<= 1;
    if (b[1] & bmask) b[0] ^= 1;
    b[1] <<= 1;
    if (pp) b[1] ^= h->prim_poly;
  }
}

static
void
gf_w128_split_4_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
{
  int i, j, k;
  uint64_t pp;
  gf_internal_t *h;
  uint64_t *s64, *d64, *top;
  gf_region_data rd;
  uint64_t v[2], s;
  struct gf_w128_split_4_128_data *ld;

  /* We only do this to check on alignment. */
  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);

  if (val[0] == 0) {
    if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
e96c0e28   plank   Renaming gf.h to ...
531
    if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
f58a7de9   plank   w128: bytwo-b, gr...
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
  }
    
  h = (gf_internal_t *) gf->scratch;
  ld = (struct gf_w128_split_4_128_data *) h->private;

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;

  if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {
    v[0] = val[0];
    v[1] = val[1];
    for (i = 0; i < 32; i++) {
      ld->tables[0][i][0] = 0;
      ld->tables[1][i][0] = 0;
      for (j = 1; j < 16; j <<= 1) {
        for (k = 0; k < j; k++) {
          ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
          ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);
        }
110523d6   Jim Plank   GF-Complete Relea...
552
        pp = (v[0] & (1ULL << 63));
f58a7de9   plank   w128: bytwo-b, gr...
553
        v[0] <<= 1;
110523d6   Jim Plank   GF-Complete Relea...
554
        if (v[1] & (1ULL << 63)) v[0] ^= 1;
f58a7de9   plank   w128: bytwo-b, gr...
555
556
557
558
559
560
561
562
        v[1] <<= 1;
        if (pp) v[1] ^= h->prim_poly;
      }
    }
  }
  ld->last_value[0] = val[0];
  ld->last_value[1] = val[1];

110523d6   Jim Plank   GF-Complete Relea...
563
564
565
566
567
568
569
570
/*
  for (i = 0; i < 32; i++) {
    for (j = 0; j < 16; j++) {
      printf("%2d %2d %016llx %016llx\n", i, j, ld->tables[0][i][j], ld->tables[1][i][j]);
    }
    printf("\n");
  }
 */
f58a7de9   plank   w128: bytwo-b, gr...
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
  while (d64 < top) {
    v[0] = (xor) ? d64[0] : 0;
    v[1] = (xor) ? d64[1] : 0;
    s = s64[1];
    i = 0;
    while (s != 0) {
      v[0] ^= ld->tables[0][i][s&0xf];
      v[1] ^= ld->tables[1][i][s&0xf];
      s >>= 4;
      i++;
    }
    s = s64[0];
    i = 16;
    while (s != 0) {
      v[0] ^= ld->tables[0][i][s&0xf];
      v[1] ^= ld->tables[1][i][s&0xf];
      s >>= 4;
      i++;
    }
    d64[0] = v[0];
    d64[1] = v[1];
    s64 += 2;
    d64 += 2;
  }
}

d5692206   Loic Dachary   do not compile if...
597
#if defined(INTEL_SSSE3) && defined(INTEL_SSE4)
110523d6   Jim Plank   GF-Complete Relea...
598
599
static
void
7bb3797a   Jim Plank   Added SPLIT SSE S...
600
601
gf_w128_split_4_128_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
{
7bb3797a   Jim Plank   Added SPLIT SSE S...
602
  gf_internal_t *h;
191b86b5   Loic Dachary   remove unused var...
603
  int i, j, k;
7bb3797a   Jim Plank   Added SPLIT SSE S...
604
605
606
607
608
609
610
611
612
613
614
  uint64_t pp, v[2], s, *s64, *d64, *top;
  __m128i p, tables[32][16];
  struct gf_w128_split_4_128_data *ld;
  gf_region_data rd;

  if (val[0] == 0) {
    if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
    if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
  }

  h = (gf_internal_t *) gf->scratch;
7bb3797a   Jim Plank   Added SPLIT SSE S...
615
616
617
618
619
620
  
  /* We only do this to check on alignment. */
  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 16);

  /* Doing this instead of gf_do_initial_region_alignment() because that doesn't hold 128-bit vals */

a0ae760e   Loic Dachary   prefer uint8_t to...
621
  gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor);
7bb3797a   Jim Plank   Added SPLIT SSE S...
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;
 
  ld = (struct gf_w128_split_4_128_data *) h->private;

  if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {
    v[0] = val[0];
    v[1] = val[1];
    for (i = 0; i < 32; i++) {
      ld->tables[0][i][0] = 0;
      ld->tables[1][i][0] = 0;
      for (j = 1; j < 16; j <<= 1) {
        for (k = 0; k < j; k++) {
          ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
          ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);
        }
        pp = (v[0] & (1ULL << 63));
        v[0] <<= 1;
        if (v[1] & (1ULL << 63)) v[0] ^= 1;
        v[1] <<= 1;
        if (pp) v[1] ^= h->prim_poly;
      }
    }
  }

  ld->last_value[0] = val[0];
  ld->last_value[1] = val[1];

  for (i = 0; i < 32; i++) {
    for (j = 0; j < 16; j++) {
      v[0] = ld->tables[0][i][j];
      v[1] = ld->tables[1][i][j];
      tables[i][j] = _mm_loadu_si128((__m128i *) v);

/*
      printf("%2d %2d: ", i, j);
      MM_PRINT8("", tables[i][j]); */
    }
  }

  while (d64 != top) {

    if (xor) {
      p = _mm_load_si128 ((__m128i *) d64);
    } else {
      p = _mm_setzero_si128();
    }
    s = *s64;
    s64++;
    for (i = 0; i < 16; i++) {
      j = (s&0xf);
      s >>= 4;
      p = _mm_xor_si128(p, tables[16+i][j]);
    }
    s = *s64;
    s64++;
    for (i = 0; i < 16; i++) {
      j = (s&0xf);
      s >>= 4;
      p = _mm_xor_si128(p, tables[i][j]);
    }
    _mm_store_si128((__m128i *) d64, p);
    d64 += 2;
  }

  /* Doing this instead of gf_do_final_region_alignment() because that doesn't hold 128-bit vals */

a0ae760e   Loic Dachary   prefer uint8_t to...
691
  gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor);
7bb3797a   Jim Plank   Added SPLIT SSE S...
692
}
29899ad4   Loic Dachary   move #if to avoid...
693
#endif
7bb3797a   Jim Plank   Added SPLIT SSE S...
694

d5692206   Loic Dachary   do not compile if...
695
#if defined(INTEL_SSSE3) && defined(INTEL_SSE4)
7bb3797a   Jim Plank   Added SPLIT SSE S...
696
697
static
void
110523d6   Jim Plank   GF-Complete Relea...
698
699
gf_w128_split_4_128_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
{
110523d6   Jim Plank   GF-Complete Relea...
700
  gf_internal_t *h;
191b86b5   Loic Dachary   remove unused var...
701
702
  int i, j, k;
  uint64_t pp, v[2], *s64, *d64, *top;
110523d6   Jim Plank   GF-Complete Relea...
703
704
705
706
707
708
709
710
711
712
713
  __m128i si, tables[32][16], p[16], v0, mask1;
  struct gf_w128_split_4_128_data *ld;
  uint8_t btable[16];
  gf_region_data rd;

  if (val[0] == 0) {
    if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
    if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
  }

  h = (gf_internal_t *) gf->scratch;
110523d6   Jim Plank   GF-Complete Relea...
714
715
716
717
718
719
  
  /* We only do this to check on alignment. */
  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 256);

  /* Doing this instead of gf_do_initial_region_alignment() because that doesn't hold 128-bit vals */

a0ae760e   Loic Dachary   prefer uint8_t to...
720
  gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor);
110523d6   Jim Plank   GF-Complete Relea...
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;
 
  ld = (struct gf_w128_split_4_128_data *) h->private;

  if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {
    v[0] = val[0];
    v[1] = val[1];
    for (i = 0; i < 32; i++) {
      ld->tables[0][i][0] = 0;
      ld->tables[1][i][0] = 0;
      for (j = 1; j < 16; j <<= 1) {
        for (k = 0; k < j; k++) {
          ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
          ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);
        }
        pp = (v[0] & (1ULL << 63));
        v[0] <<= 1;
        if (v[1] & (1ULL << 63)) v[0] ^= 1;
        v[1] <<= 1;
        if (pp) v[1] ^= h->prim_poly;
      }
    }
  }

  ld->last_value[0] = val[0];
  ld->last_value[1] = val[1];

  for (i = 0; i < 32; i++) {
    for (j = 0; j < 16; j++) {
      for (k = 0; k < 16; k++) {
        btable[k] = (uint8_t) ld->tables[1-(j/8)][i][k];
        ld->tables[1-(j/8)][i][k] >>= 8;
      }
      tables[i][j] = _mm_loadu_si128((__m128i *) btable);
/*
      printf("%2d %2d: ", i, j);
      MM_PRINT8("", tables[i][j]);
 */
    }
  }


  mask1 = _mm_set1_epi8(0xf);

  while (d64 != top) {

    if (xor) {
      for (i = 0; i < 16; i++) p[i] = _mm_load_si128 ((__m128i *) (d64+i*2));
    } else {
      for (i = 0; i < 16; i++) p[i] = _mm_setzero_si128();
    }
    i = 0;
    for (k = 0; k < 16; k++) {
      v0 = _mm_load_si128((__m128i *) s64); 
      s64 += 2;
      
      si = _mm_and_si128(v0, mask1);
  
      for (j = 0; j < 16; j++) {
        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
      }
      i++;
      v0 = _mm_srli_epi32(v0, 4);
      si = _mm_and_si128(v0, mask1);
      for (j = 0; j < 16; j++) {
        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
      }
      i++;
    }
    for (i = 0; i < 16; i++) {
      _mm_store_si128((__m128i *) d64, p[i]);
      d64 += 2;
    }
  }
  /* Doing this instead of gf_do_final_region_alignment() because that doesn't hold 128-bit vals */

a0ae760e   Loic Dachary   prefer uint8_t to...
800
  gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor);
110523d6   Jim Plank   GF-Complete Relea...
801
}
29899ad4   Loic Dachary   move #if to avoid...
802
#endif
110523d6   Jim Plank   GF-Complete Relea...
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878

static
void
gf_w128_split_8_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
{
  int i, j, k;
  uint64_t pp;
  gf_internal_t *h;
  uint64_t *s64, *d64, *top;
  gf_region_data rd;
  uint64_t v[2], s;
  struct gf_w128_split_8_128_data *ld;

  /* Check on alignment. Ignore it otherwise. */
  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);

  if (val[0] == 0) {
    if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
    if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
  }
    
  h = (gf_internal_t *) gf->scratch;
  ld = (struct gf_w128_split_8_128_data *) h->private;

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;

  if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {
    v[0] = val[0];
    v[1] = val[1];
    for (i = 0; i < 16; i++) {
      ld->tables[0][i][0] = 0;
      ld->tables[1][i][0] = 0;
      for (j = 1; j < (1 << 8); j <<= 1) {
        for (k = 0; k < j; k++) {
          ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
          ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);
        }
        pp = (v[0] & (1ULL << 63));
        v[0] <<= 1;
        if (v[1] & (1ULL << 63)) v[0] ^= 1;
        v[1] <<= 1;
        if (pp) v[1] ^= h->prim_poly;
      }
    }
  }
  ld->last_value[0] = val[0];
  ld->last_value[1] = val[1];

  while (d64 < top) {
    v[0] = (xor) ? d64[0] : 0;
    v[1] = (xor) ? d64[1] : 0;
    s = s64[1];
    i = 0;
    while (s != 0) {
      v[0] ^= ld->tables[0][i][s&0xff];
      v[1] ^= ld->tables[1][i][s&0xff];
      s >>= 8;
      i++;
    }
    s = s64[0];
    i = 8;
    while (s != 0) {
      v[0] ^= ld->tables[0][i][s&0xff];
      v[1] ^= ld->tables[1][i][s&0xff];
      s >>= 8;
      i++;
    }
    d64[0] = v[0];
    d64[1] = v[1];
    s64 += 2;
    d64 += 2;
  }
}

f58a7de9   plank   w128: bytwo-b, gr...
879
880
881
void
gf_w128_bytwo_b_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
{
f043479e   Loic Dachary   remove unused var...
882
  uint64_t bmask, pp;
f58a7de9   plank   w128: bytwo-b, gr...
883
884
885
886
887
888
889
890
891
  gf_internal_t *h;
  uint64_t a[2], c[2], b[2], *s64, *d64, *top;
  gf_region_data rd;

  /* We only do this to check on alignment. */
  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);

  if (val[0] == 0) {
    if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
e96c0e28   plank   Renaming gf.h to ...
892
    if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
f58a7de9   plank   w128: bytwo-b, gr...
893
894
895
896
897
898
  }
    
  h = (gf_internal_t *) gf->scratch;
  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;
110523d6   Jim Plank   GF-Complete Relea...
899
  bmask = (1ULL << 63);
f58a7de9   plank   w128: bytwo-b, gr...
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946

  while (d64 < top) {
    set_zero(c, 0);
    b[0] = s64[0];
    b[1] = s64[1];
    a[0] = val[0];
    a[1] = val[1];

    while (a[0] != 0) {
      if (a[1] & 1) {
        c[0] ^= b[0];
        c[1] ^= b[1];
      }
      a[1] >>= 1;
      if (a[0] & 1) a[1] ^= bmask;
      a[0] >>= 1;
      pp = (b[0] & bmask);
      b[0] <<= 1;
      if (b[1] & bmask) b[0] ^= 1;    
      b[1] <<= 1;
      if (pp) b[1] ^= h->prim_poly;
    }
    while (1) {
      if (a[1] & 1) {
        c[0] ^= b[0];
        c[1] ^= b[1];
      }
      a[1] >>= 1;
      if (a[1] == 0) break;
      pp = (b[0] & bmask);
      b[0] <<= 1;
      if (b[1] & bmask) b[0] ^= 1;    
      b[1] <<= 1;
      if (pp) b[1] ^= h->prim_poly;
    }
    if (xor) {
      d64[0] ^= c[0];
      d64[1] ^= c[1];
    } else {
      d64[0] = c[0];
      d64[1] = c[1];
    }
    s64 += 2;
    d64 += 2;
  }
}

70b6d55a   plank   Big checkin after...
947
948
949
950
951
952
953
954
955
956
957
static
void gf_w128_group_m_init(gf_t *gf, gf_val_128_t b128)
{
  int i, j;
  int g_m;
  uint64_t prim_poly, lbit;
  gf_internal_t *scratch;
  gf_group_tables_t *gt;
  uint64_t a128[2];
  scratch = (gf_internal_t *) gf->scratch;
  gt = scratch->private;
110523d6   Jim Plank   GF-Complete Relea...
958
  g_m = scratch->arg1;
70b6d55a   plank   Big checkin after...
959
960
  prim_poly = scratch->prim_poly;

bd685c8d   plank   Changed the defau...
961

70b6d55a   plank   Big checkin after...
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
  set_zero(gt->m_table, 0);
  a_get_b(gt->m_table, 2, b128, 0);
  lbit = 1;
  lbit <<= 63;

  for (i = 2; i < (1 << g_m); i <<= 1) {
    a_get_b(a128, 0, gt->m_table, 2 * (i >> 1));
    two_x(a128);
    a_get_b(gt->m_table, 2 * i, a128, 0);
    if (gt->m_table[2 * (i >> 1)] & lbit) gt->m_table[(2 * i) + 1] ^= prim_poly;
    for (j = 0; j < i; j++) {
      gt->m_table[(2 * i) + (2 * j)] = gt->m_table[(2 * i)] ^ gt->m_table[(2 * j)];
      gt->m_table[(2 * i) + (2 * j) + 1] = gt->m_table[(2 * i) + 1] ^ gt->m_table[(2 * j) + 1];
    }
  }
  return;
}

70b6d55a   plank   Big checkin after...
980
981
982
void
gf_w128_group_multiply(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
f043479e   Loic Dachary   remove unused var...
983
  int i;
70b6d55a   plank   Big checkin after...
984
985
986
987
988
989
990
991
992
993
  /* index_r, index_m, total_m (if g_r > g_m) */
  int i_r, i_m, t_m;
  int mask_m, mask_r;
  int g_m, g_r;
  uint64_t p_i[2], a[2];
  gf_internal_t *scratch;
  gf_group_tables_t *gt;

  scratch = (gf_internal_t *) gf->scratch;
  gt = scratch->private;
110523d6   Jim Plank   GF-Complete Relea...
994
995
  g_m = scratch->arg1;
  g_r = scratch->arg2;
70b6d55a   plank   Big checkin after...
996
997
998
999
1000
1001
1002

  mask_m = (1 << g_m) - 1;
  mask_r = (1 << g_r) - 1;

  if (b128[0] != gt->m_table[2] || b128[1] != gt->m_table[3]) {
    gf_w128_group_m_init(gf, b128);
  }
110523d6   Jim Plank   GF-Complete Relea...
1003
  
70b6d55a   plank   Big checkin after...
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
  p_i[0] = 0;
  p_i[1] = 0;
  a[0] = a128[0];
  a[1] = a128[1];

  t_m = 0;
  i_r = 0;

  /* Top 64 bits */
  for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
    i_m = (a[0] >> (i * g_m)) & mask_m;
    i_r ^= (p_i[0] >> (64 - g_m)) & mask_r;
    p_i[0] <<= g_m;
    p_i[0] ^= (p_i[1] >> (64-g_m));
    p_i[1] <<= g_m;
    p_i[0] ^= gt->m_table[2 * i_m];
    p_i[1] ^= gt->m_table[(2 * i_m) + 1];
    t_m += g_m;
    if (t_m == g_r) {
      p_i[1] ^= gt->r_table[i_r];
      t_m = 0;
      i_r = 0;
    } else {
      i_r <<= g_m;
    }
  }

  for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
    i_m = (a[1] >> (i * g_m)) & mask_m;
    i_r ^= (p_i[0] >> (64 - g_m)) & mask_r;
    p_i[0] <<= g_m;
    p_i[0] ^= (p_i[1] >> (64-g_m));
    p_i[1] <<= g_m;
    p_i[0] ^= gt->m_table[2 * i_m];
    p_i[1] ^= gt->m_table[(2 * i_m) + 1];
    t_m += g_m;
    if (t_m == g_r) {
      p_i[1] ^= gt->r_table[i_r];
      t_m = 0;
      i_r = 0;
    } else {
      i_r <<= g_m;
    }
  }
70b6d55a   plank   Big checkin after...
1048
1049
1050
1051
  c128[0] = p_i[0];
  c128[1] = p_i[1];
}

110523d6   Jim Plank   GF-Complete Relea...
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
static
void
gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
{
  int i;
  int i_r, i_m, t_m;
  int mask_m, mask_r;
  int g_m, g_r;
  uint64_t p_i[2], a[2];
  gf_internal_t *scratch;
  gf_group_tables_t *gt;
  gf_region_data rd;
f58a7de9   plank   w128: bytwo-b, gr...
1064
1065
1066
1067
1068
1069
1070
  uint64_t *a128, *c128, *top;

  /* We only do this to check on alignment. */
  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
      
  if (val[0] == 0) {
    if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
e96c0e28   plank   Renaming gf.h to ...
1071
    if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
f58a7de9   plank   w128: bytwo-b, gr...
1072
1073
1074
1075
  }
    
  scratch = (gf_internal_t *) gf->scratch;
  gt = scratch->private;
110523d6   Jim Plank   GF-Complete Relea...
1076
1077
  g_m = scratch->arg1;
  g_r = scratch->arg2;
f58a7de9   plank   w128: bytwo-b, gr...
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105

  mask_m = (1 << g_m) - 1;
  mask_r = (1 << g_r) - 1;

  if (val[0] != gt->m_table[2] || val[1] != gt->m_table[3]) {
    gf_w128_group_m_init(gf, val);
  }

  a128 = (uint64_t *) src;
  c128 = (uint64_t *) dest;
  top = (uint64_t *) rd.d_top;

  while (c128 < top) {
    p_i[0] = 0;
    p_i[1] = 0;
    a[0] = a128[0];
    a[1] = a128[1];
  
    t_m = 0;
    i_r = 0;
  
    /* Top 64 bits */
    for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
      i_m = (a[0] >> (i * g_m)) & mask_m;
      i_r ^= (p_i[0] >> (64 - g_m)) & mask_r;
      p_i[0] <<= g_m;
      p_i[0] ^= (p_i[1] >> (64-g_m));
      p_i[1] <<= g_m;
110523d6   Jim Plank   GF-Complete Relea...
1106
      
f58a7de9   plank   w128: bytwo-b, gr...
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
      p_i[0] ^= gt->m_table[2 * i_m];
      p_i[1] ^= gt->m_table[(2 * i_m) + 1];
      t_m += g_m;
      if (t_m == g_r) {
        p_i[1] ^= gt->r_table[i_r];
        t_m = 0;
        i_r = 0;
      } else {
        i_r <<= g_m;
      }
    }
f58a7de9   plank   w128: bytwo-b, gr...
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
    for (i = ((GF_FIELD_WIDTH / 2) / g_m) - 1; i >= 0; i--) {
      i_m = (a[1] >> (i * g_m)) & mask_m;
      i_r ^= (p_i[0] >> (64 - g_m)) & mask_r;
      p_i[0] <<= g_m;
      p_i[0] ^= (p_i[1] >> (64-g_m));
      p_i[1] <<= g_m;
      p_i[0] ^= gt->m_table[2 * i_m];
      p_i[1] ^= gt->m_table[(2 * i_m) + 1];
      t_m += g_m;
      if (t_m == g_r) {
        p_i[1] ^= gt->r_table[i_r];
        t_m = 0;
        i_r = 0;
      } else {
        i_r <<= g_m;
      }
    }
  
    if (xor) {
      c128[0] ^= p_i[0];
      c128[1] ^= p_i[1];
    } else {
      c128[0] = p_i[0];
      c128[1] = p_i[1];
    }
    a128 += 2;
    c128 += 2;
  }
}

70b6d55a   plank   Big checkin after...
1148
/* a^-1 -> b */
4339569f   Bassam Tabbara   Support for runti...
1149
void
70b6d55a   plank   Big checkin after...
1150
1151
1152
1153
1154
1155
1156
1157
gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
{
  uint64_t e_i[2], e_im1[2], e_ip1[2];
  uint64_t d_i, d_im1, d_ip1;
  uint64_t y_i[2], y_im1[2], y_ip1[2];
  uint64_t c_i[2];
  uint64_t *b;
  uint64_t one = 1;
70b6d55a   plank   Big checkin after...
1158
1159
1160
1161

  /* This needs to return some sort of error (in b128?) */
  if (a128[0] == 0 && a128[1] == 0) return;

0020ff80   Loic Dachary   initialize pointe...
1162
1163
  b = (uint64_t *) b128;

70b6d55a   plank   Big checkin after...
1164
1165
1166
1167
1168
  e_im1[0] = 0;
  e_im1[1] = ((gf_internal_t *) (gf->scratch))->prim_poly;
  e_i[0] = a128[0];
  e_i[1] = a128[1];
  d_im1 = 128;
110523d6   Jim Plank   GF-Complete Relea...
1169
1170
1171
1172
1173

  //Allen: I think d_i starts at 63 here, and checks each bit of a, starting at MSB, looking for the first nonzero bit
  //so d_i should be 0 if this half of a is all 0s, otherwise it should be the position from right of the first-from-left zero bit of this half of a.
  //BUT if d_i is 0 at end we won't know yet if the rightmost bit of this half is 1 or not

70b6d55a   plank   Big checkin after...
1174
  for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[0]) == 0 && d_i > 0; d_i--) ;
110523d6   Jim Plank   GF-Complete Relea...
1175
1176
1177

  //Allen: this is testing just the first half of the stop condition above, so if it holds we know we did not find a nonzero bit yet

70b6d55a   plank   Big checkin after...
1178
  if (!((one << d_i) & e_i[0])) {
110523d6   Jim Plank   GF-Complete Relea...
1179
1180
1181
1182
1183
1184

    //Allen: this is doing the same thing on the other half of a. In other words, we're still searching for a nonzero bit of a.
    // but not bothering to test if d_i hits zero, which is fine because we've already tested for a=0.

    for (d_i = (d_im1-1) % 64; ((one << d_i) & e_i[1]) == 0; d_i--) ;

70b6d55a   plank   Big checkin after...
1185
  } else {
110523d6   Jim Plank   GF-Complete Relea...
1186
1187
1188

    //Allen: if a 1 was found in more-significant half of a, make d_i the ACTUAL index of the first nonzero bit in the entire a.

70b6d55a   plank   Big checkin after...
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
    d_i += 64;
  }
  y_i[0] = 0;
  y_i[1] = 1;
  y_im1[0] = 0;
  y_im1[1] = 0;

  while (!(e_i[0] == 0 && e_i[1] == 1)) {

    e_ip1[0] = e_im1[0];
    e_ip1[1] = e_im1[1];
    d_ip1 = d_im1;
    c_i[0] = 0;
    c_i[1] = 0;

    while (d_ip1 >= d_i) {
      if ((d_ip1 - d_i) >= 64) {
        c_i[0] ^= (one << ((d_ip1 - d_i) - 64));
        e_ip1[0] ^= (e_i[1] << ((d_ip1 - d_i) - 64));
      } else {
        c_i[1] ^= (one << (d_ip1 - d_i));
        e_ip1[0] ^= (e_i[0] << (d_ip1 - d_i));
        if (d_ip1 - d_i > 0) e_ip1[0] ^= (e_i[1] >> (64 - (d_ip1 - d_i)));
        e_ip1[1] ^= (e_i[1] << (d_ip1 - d_i));
      }
110523d6   Jim Plank   GF-Complete Relea...
1214
1215
      d_ip1--;
      if (e_ip1[0] == 0 && e_ip1[1] == 0) { b[0] = 0; b[1] = 0; return; }
70b6d55a   plank   Big checkin after...
1216
1217
1218
      while (d_ip1 >= 64 && (e_ip1[0] & (one << (d_ip1 - 64))) == 0) d_ip1--;
      while (d_ip1 <  64 && (e_ip1[1] & (one << d_ip1)) == 0) d_ip1--;
    }
70b6d55a   plank   Big checkin after...
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
    gf->multiply.w128(gf, c_i, y_i, y_ip1);
    y_ip1[0] ^= y_im1[0];
    y_ip1[1] ^= y_im1[1];

    y_im1[0] = y_i[0];
    y_im1[1] = y_i[1];

    y_i[0] = y_ip1[0];
    y_i[1] = y_ip1[1];

    e_im1[0] = e_i[0];
    e_im1[1] = e_i[1];
    d_im1 = d_i;
    e_i[0] = e_ip1[0];
    e_i[1] = e_ip1[1];
    d_i = d_ip1;
  }

70b6d55a   plank   Big checkin after...
1237
1238
  b[0] = y_i[0];
  b[1] = y_i[1];
70b6d55a   plank   Big checkin after...
1239
1240
1241
  return;
}

4339569f   Bassam Tabbara   Support for runti...
1242
void
70b6d55a   plank   Big checkin after...
1243
1244
1245
1246
1247
1248
1249
1250
gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
{
  uint64_t d[2];
  gf->inverse.w128(gf, b128, d);
  gf->multiply.w128(gf, a128, d, c128);
  return;
}

4339569f   Bassam Tabbara   Support for runti...
1251
void
70b6d55a   plank   Big checkin after...
1252
1253
1254
1255
1256
1257
1258
1259
1260
gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
{
  uint64_t one128[2];
  one128[0] = 0;
  one128[1] = 1;
  gf->divide.w128(gf, one128, a128, b128);
  return;
}

110523d6   Jim Plank   GF-Complete Relea...
1261
1262

static
4339569f   Bassam Tabbara   Support for runti...
1263
void
110523d6   Jim Plank   GF-Complete Relea...
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
gf_w128_composite_inverse(gf_t *gf, gf_val_128_t a, gf_val_128_t inv)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint64_t a0 = a[1];
  uint64_t a1 = a[0];
  uint64_t c0, c1, d, tmp;
  uint64_t a0inv, a1inv;

  if (a0 == 0) {
    a1inv = base_gf->inverse.w64(base_gf, a1);
    c0 = base_gf->multiply.w64(base_gf, a1inv, h->prim_poly);
    c1 = a1inv;
  } else if (a1 == 0) {
    c0 = base_gf->inverse.w64(base_gf, a0);
    c1 = 0;
  } else {
    a1inv = base_gf->inverse.w64(base_gf, a1);
    a0inv = base_gf->inverse.w64(base_gf, a0);

    d = base_gf->multiply.w64(base_gf, a1, a0inv);

    tmp = (base_gf->multiply.w64(base_gf, a1, a0inv) ^ base_gf->multiply.w64(base_gf, a0, a1inv) ^ h->prim_poly);
    tmp = base_gf->inverse.w64(base_gf, tmp);

    d = base_gf->multiply.w64(base_gf, d, tmp);

    c0 = base_gf->multiply.w64(base_gf, (d^1), a0inv);
    c1 = base_gf->multiply.w64(base_gf, d, a1inv);
  }
  inv[0] = c1;
  inv[1] = c0;
}

static
  void
gf_w128_composite_multiply(gf_t *gf, gf_val_128_t a, gf_val_128_t b, gf_val_128_t rv)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint64_t b0 = b[1];
  uint64_t b1 = b[0];
  uint64_t a0 = a[1];
  uint64_t a1 = a[0];
  uint64_t a1b1;

  a1b1 = base_gf->multiply.w64(base_gf, a1, b1);

  rv[1] = (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1);
  rv[0] = base_gf->multiply.w64(base_gf, a1, b0) ^ 
    base_gf->multiply.w64(base_gf, a0, b1) ^ 
    base_gf->multiply.w64(base_gf, a1b1, h->prim_poly);
}

static
  void
gf_w128_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
{
110523d6   Jim Plank   GF-Complete Relea...
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint64_t b0 = val[1];
  uint64_t b1 = val[0];
  uint64_t *s64, *d64;
  uint64_t *top;
  uint64_t a0, a1, a1b1;
  gf_region_data rd;

  if (val[0] == 0 && val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);

  s64 = rd.s_start;
  d64 = rd.d_start;
  top = rd.d_top;

  if (xor) {
    while (d64 < top) {
      a1 = s64[0];
      a0 = s64[1];
      a1b1 = base_gf->multiply.w64(base_gf, a1, b1);

      d64[1] ^= (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1);
      d64[0] ^= (base_gf->multiply.w64(base_gf, a1, b0) ^ 
          base_gf->multiply.w64(base_gf, a0, b1) ^ 
          base_gf->multiply.w64(base_gf, a1b1, h->prim_poly));
      s64 += 2;
      d64 += 2;
    }
  } else {
    while (d64 < top) {
      a1 = s64[0];
      a0 = s64[1];
      a1b1 = base_gf->multiply.w64(base_gf, a1, b1);

      d64[1] = (base_gf->multiply.w64(base_gf, a0, b0) ^ a1b1);
      d64[0] = (base_gf->multiply.w64(base_gf, a1, b0) ^ 
          base_gf->multiply.w64(base_gf, a0, b1) ^ 
          base_gf->multiply.w64(base_gf, a1b1, h->prim_poly));
      s64 += 2;
      d64 += 2;
    }
  }
}

static
void
gf_w128_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int 
    xor)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;  gf_t *base_gf = h->base_gf;
  gf_val_64_t val0 = val[1];
  gf_val_64_t val1 = val[0];
110523d6   Jim Plank   GF-Complete Relea...
1376
1377
1378
1379
1380
1381
  uint8_t *slow, *shigh;
  uint8_t *dlow, *dhigh, *top;
  int sub_reg_size;
  gf_region_data rd;

  gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 64);
a0ae760e   Loic Dachary   prefer uint8_t to...
1382
  gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor);
110523d6   Jim Plank   GF-Complete Relea...
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397

  slow = (uint8_t *) rd.s_start;
  dlow = (uint8_t *) rd.d_start;
  top = (uint8_t*) rd.d_top;
  sub_reg_size = (top - dlow)/2;
  shigh = slow + sub_reg_size;
  dhigh = dlow + sub_reg_size;

  base_gf->multiply_region.w64(base_gf, slow, dlow, val0, sub_reg_size, xor);
  base_gf->multiply_region.w64(base_gf, shigh, dlow, val1, sub_reg_size, 1);
  base_gf->multiply_region.w64(base_gf, slow, dhigh, val1, sub_reg_size, xor);
  base_gf->multiply_region.w64(base_gf, shigh, dhigh, val0, sub_reg_size, 1);
  base_gf->multiply_region.w64(base_gf, shigh, dhigh, base_gf->multiply.w64(base_gf, h->prim_poly, val1
        ), sub_reg_size, 1);

a0ae760e   Loic Dachary   prefer uint8_t to...
1398
  gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor);
110523d6   Jim Plank   GF-Complete Relea...
1399
1400
1401
1402
1403
1404
1405
1406
1407
}


  static
int gf_w128_composite_init(gf_t *gf)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;

  if (h->region_type & GF_REGION_ALTMAP) {
87f0d439   Bassam Tabbara   Add support for p...
1408
    SET_FUNCTION(gf,multiply_region,w128,gf_w128_composite_multiply_region_alt)   
110523d6   Jim Plank   GF-Complete Relea...
1409
  } else {
87f0d439   Bassam Tabbara   Add support for p...
1410
    SET_FUNCTION(gf,multiply_region,w128,gf_w128_composite_multiply_region)
110523d6   Jim Plank   GF-Complete Relea...
1411
1412
  }

87f0d439   Bassam Tabbara   Add support for p...
1413
1414
1415
  SET_FUNCTION(gf,multiply,w128,gf_w128_composite_multiply)
  SET_FUNCTION(gf,divide,w128,gf_w128_divide_from_inverse)
  SET_FUNCTION(gf,inverse,w128,gf_w128_composite_inverse)
110523d6   Jim Plank   GF-Complete Relea...
1416
1417
1418
1419
1420
1421
1422

  return 1;
}

static
int gf_w128_cfm_init(gf_t *gf)
{
fb0bbdcf   Jim Plank   Fixed the problem...
1423
#if defined(INTEL_SSE4_PCLMUL)
4339569f   Bassam Tabbara   Support for runti...
1424
1425
1426
1427
1428
1429
  if (gf_cpu_supports_intel_pclmul) {
    SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
    SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply)
    SET_FUNCTION(gf,multiply_region,w128,gf_w128_clm_multiply_region_from_single)
    return 1;
  }
110523d6   Jim Plank   GF-Complete Relea...
1430
1431
1432
1433
1434
#endif

  return 0;
}

70b6d55a   plank   Big checkin after...
1435
1436
1437
static
int gf_w128_shift_init(gf_t *gf)
{
87f0d439   Bassam Tabbara   Add support for p...
1438
1439
1440
  SET_FUNCTION(gf,multiply,w128,gf_w128_shift_multiply)
  SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
  SET_FUNCTION(gf,multiply_region,w128,gf_w128_multiply_region_from_single)
70b6d55a   plank   Big checkin after...
1441
1442
1443
  return 1;
}

110523d6   Jim Plank   GF-Complete Relea...
1444
  static
f58a7de9   plank   w128: bytwo-b, gr...
1445
1446
int gf_w128_bytwo_init(gf_t *gf)
{
110523d6   Jim Plank   GF-Complete Relea...
1447
1448
1449
1450
  gf_internal_t *h;
  h = (gf_internal_t *) gf->scratch;

  if (h->mult_type == GF_MULT_BYTWO_p) {
87f0d439   Bassam Tabbara   Add support for p...
1451
1452
    SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_p_multiply)
    /*SET_FUNCTION(gf,multiply,w128,gf_w128_sse_bytwo_p_multiply)*/
110523d6   Jim Plank   GF-Complete Relea...
1453
1454
    /* John: the sse function is slower.*/
  } else {
87f0d439   Bassam Tabbara   Add support for p...
1455
1456
    SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_b_multiply)
    /*SET_FUNCTION(gf,multiply,w128,gf_w128_sse_bytwo_b_multiply)
110523d6   Jim Plank   GF-Complete Relea...
1457
1458
Ben: This sse function is also slower. */
  }
87f0d439   Bassam Tabbara   Add support for p...
1459
1460
  SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
  SET_FUNCTION(gf,multiply_region,w128,gf_w128_bytwo_b_multiply_region)
f58a7de9   plank   w128: bytwo-b, gr...
1461
1462
1463
  return 1;
}

70b6d55a   plank   Big checkin after...
1464
1465
1466
1467
/*
 * Because the prim poly is only 8 bits and we are limiting g_r to 16, I do not need the high 64
 * bits in all of these numbers.
 */
110523d6   Jim Plank   GF-Complete Relea...
1468
  static
70b6d55a   plank   Big checkin after...
1469
1470
1471
1472
1473
1474
1475
1476
1477
void gf_w128_group_r_init(gf_t *gf)
{
  int i, j;
  int g_r;
  uint64_t pp;
  gf_internal_t *scratch;
  gf_group_tables_t *gt;
  scratch = (gf_internal_t *) gf->scratch;
  gt = scratch->private;
110523d6   Jim Plank   GF-Complete Relea...
1478
  g_r = scratch->arg2;
70b6d55a   plank   Big checkin after...
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
  pp = scratch->prim_poly;

  gt->r_table[0] = 0;
  for (i = 1; i < (1 << g_r); i++) {
    gt->r_table[i] = 0;
    for (j = 0; j < g_r; j++) {
      if (i & (1 << j)) {
        gt->r_table[i] ^= (pp << j);
      }
    }
  }
  return;
}

20a242d9   Loic Dachary   remove unused sta...
1493
#if 0 // defined(INTEL_SSE4)
110523d6   Jim Plank   GF-Complete Relea...
1494
1495
1496
  static
void gf_w128_group_r_sse_init(gf_t *gf)
{
110523d6   Jim Plank   GF-Complete Relea...
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
  int i, j;
  int g_r;
  uint64_t pp;
  gf_internal_t *scratch;
  gf_group_tables_t *gt;
  scratch = (gf_internal_t *) gf->scratch;
  gt = scratch->private;
  __m128i zero = _mm_setzero_si128();
  __m128i *table = (__m128i *)(gt->r_table);
  g_r = scratch->arg2;
  pp = scratch->prim_poly;
  table[0] = zero;
  for (i = 1; i < (1 << g_r); i++) {
    table[i] = zero;
    for (j = 0; j < g_r; j++) {
      if (i & (1 << j)) {
        table[i] = _mm_xor_si128(table[i], _mm_insert_epi64(zero, pp << j, 0));
      }
    }
  }
  return;
110523d6   Jim Plank   GF-Complete Relea...
1518
}
29899ad4   Loic Dachary   move #if to avoid...
1519
#endif
110523d6   Jim Plank   GF-Complete Relea...
1520
1521

  static 
f58a7de9   plank   w128: bytwo-b, gr...
1522
1523
int gf_w128_split_init(gf_t *gf)
{
110523d6   Jim Plank   GF-Complete Relea...
1524
1525
  struct gf_w128_split_4_128_data *sd4;
  struct gf_w128_split_8_128_data *sd8;
f58a7de9   plank   w128: bytwo-b, gr...
1526
1527
1528
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;
f58a7de9   plank   w128: bytwo-b, gr...
1529

87f0d439   Bassam Tabbara   Add support for p...
1530
  SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_p_multiply)
fb0bbdcf   Jim Plank   Fixed the problem...
1531
#if defined(INTEL_SSE4_PCLMUL)
4339569f   Bassam Tabbara   Support for runti...
1532
  if (gf_cpu_supports_intel_pclmul && !(h->region_type & GF_REGION_NOSIMD)){
87f0d439   Bassam Tabbara   Add support for p...
1533
    SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply)
110523d6   Jim Plank   GF-Complete Relea...
1534
1535
1536
  }
#endif

87f0d439   Bassam Tabbara   Add support for p...
1537
  SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
110523d6   Jim Plank   GF-Complete Relea...
1538
1539
1540
1541
1542

  if ((h->arg1 != 4 && h->arg2 != 4) || h->mult_type == GF_MULT_DEFAULT) {
    sd8 = (struct gf_w128_split_8_128_data *) h->private;
    sd8->last_value[0] = 0;
    sd8->last_value[1] = 0;
87f0d439   Bassam Tabbara   Add support for p...
1543
    SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_8_128_multiply_region)
110523d6   Jim Plank   GF-Complete Relea...
1544
1545
1546
1547
1548
1549
1550
  } else {
    sd4 = (struct gf_w128_split_4_128_data *) h->private;
    sd4->last_value[0] = 0;
    sd4->last_value[1] = 0;
    if((h->region_type & GF_REGION_ALTMAP))
    {
      #ifdef INTEL_SSE4
4339569f   Bassam Tabbara   Support for runti...
1551
        if(gf_cpu_supports_intel_sse4 && !(h->region_type & GF_REGION_NOSIMD))
87f0d439   Bassam Tabbara   Add support for p...
1552
          SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_sse_altmap_multiply_region)
110523d6   Jim Plank   GF-Complete Relea...
1553
        else
110523d6   Jim Plank   GF-Complete Relea...
1554
      #endif
4339569f   Bassam Tabbara   Support for runti...
1555
          return 0;
110523d6   Jim Plank   GF-Complete Relea...
1556
1557
    }
    else {
7bb3797a   Jim Plank   Added SPLIT SSE S...
1558
      #ifdef INTEL_SSE4
4339569f   Bassam Tabbara   Support for runti...
1559
        if(gf_cpu_supports_intel_sse4 && !(h->region_type & GF_REGION_NOSIMD))
87f0d439   Bassam Tabbara   Add support for p...
1560
          SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_sse_multiply_region)
7bb3797a   Jim Plank   Added SPLIT SSE S...
1561
        else
7bb3797a   Jim Plank   Added SPLIT SSE S...
1562
      #endif
4339569f   Bassam Tabbara   Support for runti...
1563
        SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_multiply_region)
110523d6   Jim Plank   GF-Complete Relea...
1564
1565
    }
  }
f58a7de9   plank   w128: bytwo-b, gr...
1566
1567
1568
1569
  return 1;
}


70b6d55a   plank   Big checkin after...
1570
1571
1572
1573
1574
static
int gf_w128_group_init(gf_t *gf)
{
  gf_internal_t *scratch;
  gf_group_tables_t *gt;
f043479e   Loic Dachary   remove unused var...
1575
  int g_r, size_r;
70b6d55a   plank   Big checkin after...
1576
1577
1578

  scratch = (gf_internal_t *) gf->scratch;
  gt = scratch->private;
110523d6   Jim Plank   GF-Complete Relea...
1579
  g_r = scratch->arg2;
70b6d55a   plank   Big checkin after...
1580
1581
  size_r = (1 << g_r);

a0ae760e   Loic Dachary   prefer uint8_t to...
1582
  gt->r_table = (gf_val_128_t)((uint8_t *)scratch->private + (2 * sizeof(uint64_t *)));
70b6d55a   plank   Big checkin after...
1583
1584
1585
1586
  gt->m_table = gt->r_table + size_r;
  gt->m_table[2] = 0;
  gt->m_table[3] = 0;

87f0d439   Bassam Tabbara   Add support for p...
1587
1588
1589
  SET_FUNCTION(gf,multiply,w128,gf_w128_group_multiply)
  SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
  SET_FUNCTION(gf,multiply_region,w128,gf_w128_group_multiply_region)
110523d6   Jim Plank   GF-Complete Relea...
1590

f0c32c94   Jim Plank   Removed GROUP/128...
1591
  gf_w128_group_r_init(gf);
110523d6   Jim Plank   GF-Complete Relea...
1592

70b6d55a   plank   Big checkin after...
1593
1594
1595
  return 1;
}

f58a7de9   plank   w128: bytwo-b, gr...
1596
1597
1598
1599
1600
1601
1602
1603
1604
void gf_w128_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv)
{
  gf_val_128_t s;

  s = (gf_val_128_t) start;
  s += (index * 2); 
  memcpy(rv, s, 16);
}

110523d6   Jim Plank   GF-Complete Relea...
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
static void gf_w128_split_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv)
{
  int i, blocks;
  uint64_t *r64, tmp;
  uint8_t *r8;
  gf_region_data rd;

  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 256);
  r64 = (uint64_t *) start;
  if ((r64 + index*2 < (uint64_t *) rd.d_start) ||
      (r64 + index*2 >= (uint64_t *) rd.d_top)) {
    memcpy(rv, r64+(index*2), 16);
    return;
  }

  index -= (((uint64_t *) rd.d_start) - r64)/2;
  r64 = (uint64_t *) rd.d_start;

  blocks = index/16;
  r64 += (blocks*32);
  index %= 16;
  r8 = (uint8_t *) r64;
  r8 += index;
  rv[0] = 0;
  rv[1] = 0;

  for (i = 0; i < 8; i++) {
    tmp = *r8;
    rv[1] |= (tmp << (i*8));
    r8 += 16;
  }

  for (i = 0; i < 8; i++) {
    tmp = *r8;
    rv[0] |= (tmp << (i*8));
    r8 += 16;
  }
  return;
}

  static
void gf_w128_composite_extract_word(gf_t *gf, void *start, int bytes, int index, gf_val_128_t rv)
{
  int sub_size;
  gf_internal_t *h;
  uint8_t *r8, *top;
  uint64_t *r64;
  gf_region_data rd;

  h = (gf_internal_t *) gf->scratch;
  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 64);
  r64 = (uint64_t *) start;
  if ((r64 + index*2 < (uint64_t *) rd.d_start) ||
      (r64 + index*2 >= (uint64_t *) rd.d_top)) {
    memcpy(rv, r64+(index*2), 16);
    return;
  }
  index -= (((uint64_t *) rd.d_start) - r64)/2;
  r8 = (uint8_t *) rd.d_start;
  top = (uint8_t *) rd.d_top;
  sub_size = (top-r8)/2;

  rv[1] = h->base_gf->extract_word.w64(h->base_gf, r8, sub_size, index);
  rv[0] = h->base_gf->extract_word.w64(h->base_gf, r8+sub_size, sub_size, index);
  
  return;
}

70b6d55a   plank   Big checkin after...
1673
1674
1675
int gf_w128_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
{
  int size_m, size_r;
110523d6   Jim Plank   GF-Complete Relea...
1676
1677
  if (divide_type==GF_DIVIDE_MATRIX) return 0;

70b6d55a   plank   Big checkin after...
1678
1679
  switch(mult_type)
  {
110523d6   Jim Plank   GF-Complete Relea...
1680
1681
1682
    case GF_MULT_CARRY_FREE:
      return sizeof(gf_internal_t);
      break;
70b6d55a   plank   Big checkin after...
1683
    case GF_MULT_SHIFT:
70b6d55a   plank   Big checkin after...
1684
1685
      return sizeof(gf_internal_t);
      break;
110523d6   Jim Plank   GF-Complete Relea...
1686
    case GF_MULT_BYTWO_p:
f58a7de9   plank   w128: bytwo-b, gr...
1687
    case GF_MULT_BYTWO_b:
f58a7de9   plank   w128: bytwo-b, gr...
1688
1689
      return sizeof(gf_internal_t);
      break;
110523d6   Jim Plank   GF-Complete Relea...
1690
    case GF_MULT_DEFAULT: 
f58a7de9   plank   w128: bytwo-b, gr...
1691
    case GF_MULT_SPLIT_TABLE:
f58a7de9   plank   w128: bytwo-b, gr...
1692
1693
      if ((arg1 == 4 && arg2 == 128) || (arg1 == 128 && arg2 == 4)) {
        return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_4_128_data) + 64;
110523d6   Jim Plank   GF-Complete Relea...
1694
1695
      } else if ((arg1 == 8 && arg2 == 128) || (arg1 == 128 && arg2 == 8) || mult_type == GF_MULT_DEFAULT) {
        return sizeof(gf_internal_t) + sizeof(struct gf_w128_split_8_128_data) + 64;
f58a7de9   plank   w128: bytwo-b, gr...
1696
      }
110523d6   Jim Plank   GF-Complete Relea...
1697
      return 0;
f58a7de9   plank   w128: bytwo-b, gr...
1698
      break;
70b6d55a   plank   Big checkin after...
1699
    case GF_MULT_GROUP:
110523d6   Jim Plank   GF-Complete Relea...
1700
      /* JSP We've already error checked the arguments. */
70b6d55a   plank   Big checkin after...
1701
      size_m = (1 << arg1) * 2 * sizeof(uint64_t);
110523d6   Jim Plank   GF-Complete Relea...
1702
      size_r = (1 << arg2) * 2 * sizeof(uint64_t);
70b6d55a   plank   Big checkin after...
1703
1704
1705
1706
      /* 
       * two pointers prepend the table data for structure
       * because the tables are of dynamic size
       */
110523d6   Jim Plank   GF-Complete Relea...
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
      return sizeof(gf_internal_t) + size_m + size_r + 4 * sizeof(uint64_t *);
      break;
    case GF_MULT_COMPOSITE:
      if (arg1 == 2) {
        return sizeof(gf_internal_t) + 4;
      } else {
        return 0;
      }
      break;

70b6d55a   plank   Big checkin after...
1717
    default:
110523d6   Jim Plank   GF-Complete Relea...
1718
      return 0;
70b6d55a   plank   Big checkin after...
1719
1720
1721
1722
1723
   }
}

int gf_w128_init(gf_t *gf)
{
f043479e   Loic Dachary   remove unused var...
1724
  gf_internal_t *h;
70b6d55a   plank   Big checkin after...
1725
1726

  h = (gf_internal_t *) gf->scratch;
110523d6   Jim Plank   GF-Complete Relea...
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
  
  /* Allen: set default primitive polynomial / irreducible polynomial if needed */

  if (h->prim_poly == 0) {
    if (h->mult_type == GF_MULT_COMPOSITE) {
      h->prim_poly = gf_composite_get_default_poly(h->base_gf);
      if (h->prim_poly == 0) return 0; /* This shouldn't happen */
    } else {
      h->prim_poly = 0x87; /* Omitting the leftmost 1 as in w=32 */
    }
110523d6   Jim Plank   GF-Complete Relea...
1737
  }
70b6d55a   plank   Big checkin after...
1738

87f0d439   Bassam Tabbara   Add support for p...
1739
1740
1741
1742
  SET_FUNCTION(gf,multiply,w128,NULL)
  SET_FUNCTION(gf,divide,w128,NULL)
  SET_FUNCTION(gf,inverse,w128,NULL)
  SET_FUNCTION(gf,multiply_region,w128,NULL)
70b6d55a   plank   Big checkin after...
1743
  switch(h->mult_type) {
110523d6   Jim Plank   GF-Complete Relea...
1744
    case GF_MULT_BYTWO_p:
f58a7de9   plank   w128: bytwo-b, gr...
1745
    case GF_MULT_BYTWO_b:      if (gf_w128_bytwo_init(gf) == 0) return 0; break;
110523d6   Jim Plank   GF-Complete Relea...
1746
    case GF_MULT_CARRY_FREE:   if (gf_w128_cfm_init(gf) == 0) return 0; break;
70b6d55a   plank   Big checkin after...
1747
    case GF_MULT_SHIFT:        if (gf_w128_shift_init(gf) == 0) return 0; break;
70b6d55a   plank   Big checkin after...
1748
    case GF_MULT_GROUP:        if (gf_w128_group_init(gf) == 0) return 0; break;
110523d6   Jim Plank   GF-Complete Relea...
1749
    case GF_MULT_DEFAULT: 
f58a7de9   plank   w128: bytwo-b, gr...
1750
    case GF_MULT_SPLIT_TABLE:  if (gf_w128_split_init(gf) == 0) return 0; break;
110523d6   Jim Plank   GF-Complete Relea...
1751
    case GF_MULT_COMPOSITE:    if (gf_w128_composite_init(gf) == 0) return 0; break;
70b6d55a   plank   Big checkin after...
1752
1753
    default: return 0;
  }
f58a7de9   plank   w128: bytwo-b, gr...
1754

110523d6   Jim Plank   GF-Complete Relea...
1755
1756
1757
  /* Ben: Used to be h->region_type == GF_REGION_ALTMAP, but failed since there
     are multiple flags in h->region_type */
  if (h->mult_type == GF_MULT_SPLIT_TABLE && (h->region_type & GF_REGION_ALTMAP)) {
87f0d439   Bassam Tabbara   Add support for p...
1758
    SET_FUNCTION(gf,extract_word,w128,gf_w128_split_extract_word)
110523d6   Jim Plank   GF-Complete Relea...
1759
  } else if (h->mult_type == GF_MULT_COMPOSITE && h->region_type == GF_REGION_ALTMAP) {
87f0d439   Bassam Tabbara   Add support for p...
1760
    SET_FUNCTION(gf,extract_word,w128,gf_w128_composite_extract_word)
110523d6   Jim Plank   GF-Complete Relea...
1761
  } else {
87f0d439   Bassam Tabbara   Add support for p...
1762
    SET_FUNCTION(gf,extract_word,w128,gf_w128_extract_word)
110523d6   Jim Plank   GF-Complete Relea...
1763
  }
f58a7de9   plank   w128: bytwo-b, gr...
1764

70b6d55a   plank   Big checkin after...
1765
  if (h->divide_type == GF_DIVIDE_EUCLID) {
87f0d439   Bassam Tabbara   Add support for p...
1766
    SET_FUNCTION(gf,divide,w128,gf_w128_divide_from_inverse)
110523d6   Jim Plank   GF-Complete Relea...
1767
  } 
70b6d55a   plank   Big checkin after...
1768
1769

  if (gf->inverse.w128 != NULL && gf->divide.w128 == NULL) {
87f0d439   Bassam Tabbara   Add support for p...
1770
    SET_FUNCTION(gf,divide,w128,gf_w128_divide_from_inverse)
70b6d55a   plank   Big checkin after...
1771
1772
  }
  if (gf->inverse.w128 == NULL && gf->divide.w128 != NULL) {
87f0d439   Bassam Tabbara   Add support for p...
1773
    SET_FUNCTION(gf,inverse,w128,gf_w128_inverse_from_divide)
70b6d55a   plank   Big checkin after...
1774
1775
1776
  }
  return 1;
}