/*
 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
 *
 * gf_w64.c
 *
 * Routines for 64-bit Galois fields
 */

#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
#include "gf_w64.h"

static
inline
gf_val_64_t gf_w64_inverse_from_divide (gf_t *gf, gf_val_64_t a)
{
  return gf->divide.w64(gf, 1, a);
}

#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? "   " : " ", blah[15-ii]); printf("\n"); }

static
inline
gf_val_64_t gf_w64_divide_from_inverse (gf_t *gf, gf_val_64_t a, gf_val_64_t b)
{
  b = gf->inverse.w64(gf, b);
  return gf->multiply.w64(gf, a, b);
}

static
void
gf_w64_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int
xor)
{
  int i;
  gf_val_64_t *s64;
  gf_val_64_t *d64;

  s64 = (gf_val_64_t *) src;
  d64 = (gf_val_64_t *) dest;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  if (xor) {
    for (i = 0; i < bytes/sizeof(gf_val_64_t); i++) {
      d64[i] ^= gf->multiply.w64(gf, val, s64[i]);
    }
  } else {
    for (i = 0; i < bytes/sizeof(gf_val_64_t); i++) {
      d64[i] = gf->multiply.w64(gf, val, s64[i]);
    }
  }
}

#if defined(INTEL_SSE4_PCLMUL) 
static
void
gf_w64_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int
xor)
{
  gf_val_64_t *s64, *d64, *top;
  gf_region_data rd;

  __m128i         a, b;
  __m128i         result, r1;
  __m128i         prim_poly;
  __m128i         w;
  __m128i         m1, m3, m4;
  gf_internal_t * h = gf->scratch;
  
  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
  
  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
  b = _mm_insert_epi64 (_mm_setzero_si128(), val, 0);
  m1 = _mm_set_epi32(0, 0, 0, (uint32_t)0xffffffff);
  m3 = _mm_slli_si128(m1, 8);
  m4 = _mm_slli_si128(m3, 4);

  s64 = (gf_val_64_t *) rd.s_start;
  d64 = (gf_val_64_t *) rd.d_start;
  top = (gf_val_64_t *) rd.d_top;

  if (xor) {
    while (d64 != top) {
      a = _mm_load_si128((__m128i *) s64);  
      result = _mm_clmulepi64_si128 (a, b, 1);

      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
      r1 = _mm_xor_si128 (result, w);

      result = _mm_clmulepi64_si128 (a, b, 0);

      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
      result = _mm_xor_si128 (result, w);

      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
      result = _mm_xor_si128 (result, w);

      result = _mm_unpacklo_epi64(result, r1);
      
      r1 = _mm_load_si128((__m128i *) d64);
      result = _mm_xor_si128(r1, result);
      _mm_store_si128((__m128i *) d64, result);
      d64 += 2;
      s64 += 2;
    }
  } else {
    while (d64 != top) {
      
      a = _mm_load_si128((__m128i *) s64);  
      result = _mm_clmulepi64_si128 (a, b, 1);

      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
      r1 = _mm_xor_si128 (result, w);

      result = _mm_clmulepi64_si128 (a, b, 0);

      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
      result = _mm_xor_si128 (result, w);
      
      result = _mm_unpacklo_epi64(result, r1);

      _mm_store_si128((__m128i *) d64, result);
      d64 += 2;
      s64 += 2;
    }
  }
  gf_do_final_region_alignment(&rd);
}
#endif

#if defined(INTEL_SSE4_PCLMUL)
static
void
gf_w64_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int
xor)
{
  gf_val_64_t *s64, *d64, *top;
  gf_region_data rd;

  __m128i         a, b;
  __m128i         result, r1;
  __m128i         prim_poly;
  __m128i         w;
  __m128i         m1, m3, m4;
  gf_internal_t * h = gf->scratch;
  
  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
  
  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);
  
  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
  b = _mm_insert_epi64 (_mm_setzero_si128(), val, 0);
  m1 = _mm_set_epi32(0, 0, 0, (uint32_t)0xffffffff);
  m3 = _mm_slli_si128(m1, 8);
  m4 = _mm_slli_si128(m3, 4);

  s64 = (gf_val_64_t *) rd.s_start;
  d64 = (gf_val_64_t *) rd.d_start;
  top = (gf_val_64_t *) rd.d_top;

  if (xor) {
    while (d64 != top) {
      a = _mm_load_si128((__m128i *) s64);
      result = _mm_clmulepi64_si128 (a, b, 1);

      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
      r1 = _mm_xor_si128 (result, w);

      result = _mm_clmulepi64_si128 (a, b, 0);

      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
      result = _mm_xor_si128 (result, w);

      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
      result = _mm_xor_si128 (result, w);

      result = _mm_unpacklo_epi64(result, r1);

      r1 = _mm_load_si128((__m128i *) d64);
      result = _mm_xor_si128(r1, result);
      _mm_store_si128((__m128i *) d64, result);
      d64 += 2;
      s64 += 2;
    }
  } else {
    while (d64 != top) {
      a = _mm_load_si128((__m128i *) s64);
      result = _mm_clmulepi64_si128 (a, b, 1);

      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
      r1 = _mm_xor_si128 (result, w);

      result = _mm_clmulepi64_si128 (a, b, 0);

      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m4), prim_poly, 1);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (_mm_and_si128(result, m3), prim_poly, 1);
      result = _mm_xor_si128 (result, w);

      result = _mm_unpacklo_epi64(result, r1);

      _mm_store_si128((__m128i *) d64, result);
      d64 += 2;
      s64 += 2; 
    }
  }
  gf_do_final_region_alignment(&rd);
}
#endif

static
  inline
gf_val_64_t gf_w64_euclid (gf_t *gf, gf_val_64_t b)
{
  gf_val_64_t e_i, e_im1, e_ip1;
  gf_val_64_t d_i, d_im1, d_ip1;
  gf_val_64_t y_i, y_im1, y_ip1;
  gf_val_64_t c_i;
  gf_val_64_t one = 1;

  if (b == 0) return -1;
  e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;
  e_i = b;
  d_im1 = 64;
  for (d_i = d_im1-1; ((one << d_i) & e_i) == 0; d_i--) ;
  y_i = 1;
  y_im1 = 0;

  while (e_i != 1) {

    e_ip1 = e_im1;
    d_ip1 = d_im1;
    c_i = 0;

    while (d_ip1 >= d_i) {
      c_i ^= (one << (d_ip1 - d_i));
      e_ip1 ^= (e_i << (d_ip1 - d_i));
      d_ip1--;
      if (e_ip1 == 0) return 0;
      while ((e_ip1 & (one << d_ip1)) == 0) d_ip1--;
    }

    y_ip1 = y_im1 ^ gf->multiply.w64(gf, c_i, y_i);
    y_im1 = y_i;
    y_i = y_ip1;

    e_im1 = e_i;
    d_im1 = d_i;
    e_i = e_ip1;
    d_i = d_ip1;
  }

  return y_i;
}

/* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm.  I only
   include it for completeness.  It does have the feature that it requires no
   extra memory.  
*/

static
inline
gf_val_64_t
gf_w64_shift_multiply (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
{
  uint64_t pl, pr, ppl, ppr, i, a, bl, br, one, lbit;
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;
  
  /* Allen: set leading one of primitive polynomial */
  
  a = a64;
  bl = 0;
  br = b64;
  one = 1;
  lbit = (one << 63);

  pl = 0; /* Allen: left side of product */
  pr = 0; /* Allen: right side of product */

  /* Allen: unlike the corresponding functions for smaller word sizes,
   * this loop carries out the initial carryless multiply by
   * shifting b itself rather than simply looking at successively
   * higher shifts of b */
  
  for (i = 0; i < GF_FIELD_WIDTH; i++) {
    if (a & (one << i)) {
      pl ^= bl;
      pr ^= br;
    }

    bl <<= 1;
    if (br & lbit) bl ^= 1;
    br <<= 1;
  }

  /* Allen: the name of the variable "one" is no longer descriptive at this point */
  
  one = lbit >> 1;
  ppl = (h->prim_poly >> 2) | one;
  ppr = (h->prim_poly << (GF_FIELD_WIDTH-2));
  while (one != 0) {
    if (pl & one) {
      pl ^= ppl;
      pr ^= ppr;
    }
    one >>= 1;
    ppr >>= 1;
    if (ppl & 1) ppr ^= lbit;
    ppl >>= 1;
  }
  return pr;
}

/*
 * ELM: Use the Intel carryless multiply instruction to do very fast 64x64 multiply.
 */

static
inline
gf_val_64_t
gf_w64_clm_multiply_2 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
{
       gf_val_64_t rv = 0;

#if defined(INTEL_SSE4_PCLMUL) 

        __m128i         a, b;
        __m128i         result;
        __m128i         prim_poly;
        __m128i         v, w;
        gf_internal_t * h = gf->scratch;

        a = _mm_insert_epi64 (_mm_setzero_si128(), a64, 0);
        b = _mm_insert_epi64 (a, b64, 0); 
        prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
        /* Do the initial multiply */
   
        result = _mm_clmulepi64_si128 (a, b, 0);
        
        /* Mask off the high order 32 bits using subtraction of the polynomial.
         * NOTE: this part requires that the polynomial have at least 32 leading 0 bits.
         */

        /* Adam: We can't include the leading one in the 64-bit pclmul,
         so we need to split up the high 8 bytes of the result into two 
         parts before we multiply them with the prim_poly.*/

        v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
        w = _mm_clmulepi64_si128 (prim_poly, v, 0);
        result = _mm_xor_si128 (result, w);
        v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
        w = _mm_clmulepi64_si128 (prim_poly, v, 0);
        result = _mm_xor_si128 (result, w);

        rv = ((gf_val_64_t)_mm_extract_epi64(result, 0));
#endif
        return rv;
}
 
static
inline
gf_val_64_t
gf_w64_clm_multiply_4 (gf_t *gf, gf_val_64_t a64, gf_val_64_t b64)
{
  gf_val_64_t rv = 0;

#if defined(INTEL_SSE4_PCLMUL) 

  __m128i         a, b;
  __m128i         result;
  __m128i         prim_poly;
  __m128i         v, w;
  gf_internal_t * h = gf->scratch;

  a = _mm_insert_epi64 (_mm_setzero_si128(), a64, 0);
  b = _mm_insert_epi64 (a, b64, 0);
  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));
 
  /* Do the initial multiply */
  
  result = _mm_clmulepi64_si128 (a, b, 0);

  v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
  w = _mm_clmulepi64_si128 (prim_poly, v, 0);
  result = _mm_xor_si128 (result, w);
  v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
  w = _mm_clmulepi64_si128 (prim_poly, v, 0);
  result = _mm_xor_si128 (result, w);
  
  v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
  w = _mm_clmulepi64_si128 (prim_poly, v, 0);
  result = _mm_xor_si128 (result, w);
  v = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
  w = _mm_clmulepi64_si128 (prim_poly, v, 0);
  result = _mm_xor_si128 (result, w);

  rv = ((gf_val_64_t)_mm_extract_epi64(result, 0));
#endif
  return rv;
}


  void
gf_w64_clm_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
{
#if defined(INTEL_SSE4_PCLMUL) 
  gf_internal_t *h;
  uint8_t *s8, *d8, *dtop;
  gf_region_data rd;
  __m128i  v, b, m, prim_poly, c, fr, w, result;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;
  dtop = (uint8_t *) rd.d_top;

  v = _mm_insert_epi64(_mm_setzero_si128(), val, 0);
  m = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
  prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0xffffffffULL));

  if (xor) {
    while (d8 != dtop) {
      b = _mm_load_si128((__m128i *) s8);
      result = _mm_clmulepi64_si128 (b, v, 0);
      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
      result = _mm_xor_si128 (result, w);
      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
      fr = _mm_xor_si128 (result, w);
      fr = _mm_and_si128 (fr, m);

      result = _mm_clmulepi64_si128 (b, v, 1);
      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
      result = _mm_xor_si128 (result, w);
      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
      result = _mm_xor_si128 (result, w);
      result = _mm_slli_si128 (result, 8);
      fr = _mm_xor_si128 (result, fr);
      result = _mm_load_si128((__m128i *) d8);
      fr = _mm_xor_si128 (result, fr);

      _mm_store_si128((__m128i *) d8, fr);
      d8 += 16;
      s8 += 16;
    }
  } else {
    while (d8 < dtop) {
      b = _mm_load_si128((__m128i *) s8);
      result = _mm_clmulepi64_si128 (b, v, 0);
      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
      result = _mm_xor_si128 (result, w);
      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
      fr = _mm_xor_si128 (result, w);
      fr = _mm_and_si128 (fr, m);
  
      result = _mm_clmulepi64_si128 (b, v, 1);
      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 0);
      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
      result = _mm_xor_si128 (result, w);
      c = _mm_insert_epi32 (_mm_srli_si128 (result, 8), 0, 1);
      w = _mm_clmulepi64_si128 (prim_poly, c, 0);
      result = _mm_xor_si128 (result, w);
      result = _mm_slli_si128 (result, 8);
      fr = _mm_xor_si128 (result, fr);
  
      _mm_store_si128((__m128i *) d8, fr);
      d8 += 16;
      s8 += 16;
    }
  }
  gf_do_final_region_alignment(&rd);
#endif
}

void
gf_w64_split_4_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
{
  gf_internal_t *h;
  struct gf_split_4_64_lazy_data *ld;
  int i, j, k;
  uint64_t pp, v, s, *s64, *d64, *top;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  ld = (struct gf_split_4_64_lazy_data *) h->private;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
  gf_do_initial_region_alignment(&rd);

  if (ld->last_value != val) {
    v = val;
    for (i = 0; i < 16; i++) {
      ld->tables[i][0] = 0;
      for (j = 1; j < 16; j <<= 1) {
        for (k = 0; k < j; k++) {
          ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
        }
        v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
      }
    }
  }
  ld->last_value = val;

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;

  while (d64 != top) {
    v = (xor) ? *d64 : 0;
    s = *s64;
    i = 0;
    while (s != 0) {
      v ^= ld->tables[i][s&0xf];
      s >>= 4;
      i++;
    }
    *d64 = v;
    d64++;
    s64++;
  }
  gf_do_final_region_alignment(&rd);
}

static
inline
uint64_t
gf_w64_split_8_8_multiply (gf_t *gf, uint64_t a64, uint64_t b64)
{
  uint64_t product, i, j, mask, tb;
  gf_internal_t *h;
  struct gf_split_8_8_data *d8;
 
  h = (gf_internal_t *) gf->scratch;
  d8 = (struct gf_split_8_8_data *) h->private;
  product = 0;
  mask = 0xff;

  for (i = 0; a64 != 0; i++) {
    tb = b64;
    for (j = 0; tb != 0; j++) {
      product ^= d8->tables[i+j][a64&mask][tb&mask];
      tb >>= 8;
    }
    a64 >>= 8;
  }
  return product;
}

void
gf_w64_split_8_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
{
  gf_internal_t *h;
  struct gf_split_8_64_lazy_data *ld;
  int i, j, k;
  uint64_t pp, v, s, *s64, *d64, *top;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  ld = (struct gf_split_8_64_lazy_data *) h->private;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
  gf_do_initial_region_alignment(&rd);

  if (ld->last_value != val) {
    v = val;
    for (i = 0; i < 8; i++) {
      ld->tables[i][0] = 0;
      for (j = 1; j < 256; j <<= 1) {
        for (k = 0; k < j; k++) {
          ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
        }
        v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
      }
    }
  }
  ld->last_value = val;

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;

  while (d64 != top) {
    v = (xor) ? *d64 : 0;
    s = *s64;
    i = 0;
    while (s != 0) {
      v ^= ld->tables[i][s&0xff];
      s >>= 8;
      i++;
    }
    *d64 = v;
    d64++;
    s64++;
  }
  gf_do_final_region_alignment(&rd);
}

void
gf_w64_split_16_64_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
{
  gf_internal_t *h;
  struct gf_split_16_64_lazy_data *ld;
  int i, j, k;
  uint64_t pp, v, s, *s64, *d64, *top;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  ld = (struct gf_split_16_64_lazy_data *) h->private;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
  gf_do_initial_region_alignment(&rd);

  if (ld->last_value != val) {
    v = val;
    for (i = 0; i < 4; i++) {
      ld->tables[i][0] = 0;
      for (j = 1; j < (1<<16); j <<= 1) {
        for (k = 0; k < j; k++) {
          ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
        }
        v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
      }
    }
  }
  ld->last_value = val;

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;

  while (d64 != top) {
    v = (xor) ? *d64 : 0;
    s = *s64;
    i = 0;
    while (s != 0) {
      v ^= ld->tables[i][s&0xffff];
      s >>= 16;
      i++;
    }
    *d64 = v;
    d64++;
    s64++;
  }
  gf_do_final_region_alignment(&rd);
}

static 
int gf_w64_shift_init(gf_t *gf)
{
  gf->multiply.w64 = gf_w64_shift_multiply;
  gf->inverse.w64 = gf_w64_euclid;
  gf->multiply_region.w64 = gf_w64_multiply_region_from_single;
  return 1;
}

static 
int gf_w64_cfm_init(gf_t *gf)
{
  gf->inverse.w64 = gf_w64_euclid;
  gf->multiply_region.w64 = gf_w64_multiply_region_from_single;

#if defined(INTEL_SSE4_PCLMUL) 
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;

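  /* The checks below look at how many high-order bits of the primitive
     polynomial are clear: the two-step CLM reduction is only used when the
     polynomial fits in the low 33 bits, and the four-step version when it
     fits in the low 49 bits; otherwise carryless multiply is not selected. */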
  if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ 
    gf->multiply.w64 = gf_w64_clm_multiply_2;
    gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_2; 
  }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
    gf->multiply.w64 = gf_w64_clm_multiply_4;
    gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_4;
  } else {
    return 0;
  }
  return 1;
#endif

  return 0;
}

static
void
gf_w64_group_set_shift_tables(uint64_t *shift, uint64_t val, gf_internal_t *h)
{
  int i;
  uint64_t j;
  uint64_t one = 1;
  int g_s;

  g_s = h->arg1;
  shift[0] = 0;
 
  for (i = 1; i < (1 << g_s); i <<= 1) {
    for (j = 0; j < i; j++) shift[i|j] = shift[j]^val;
    if (val & (one << 63)) {
      val <<= 1;
      val ^= h->prim_poly;
    } else {
      val <<= 1;
    }
  }
}

static
inline
gf_val_64_t
gf_w64_group_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b)
{
  uint64_t top, bot, mask, tp;
  int g_s, g_r, lshift, rshift;
  struct gf_w64_group_data *gd;

  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  g_s = h->arg1;
  g_r = h->arg2;
  gd = (struct gf_w64_group_data *) h->private;
  gf_w64_group_set_shift_tables(gd->shift, b, h);

  mask = (((uint64_t)1 << g_s) - 1);
  top = 0;
  bot = gd->shift[a&mask];
  a >>= g_s; 

  if (a == 0) return bot;
  lshift = 0;
  rshift = 64;

  do {              /* Shifting out is straightforward */
    lshift += g_s;
    rshift -= g_s;
    tp = gd->shift[a&mask];
    top ^= (tp >> rshift);
    bot ^= (tp << lshift);
    a >>= g_s; 
  } while (a != 0);

  /* Reducing is a bit gross, because I don't zero out the index bits of top.
     The reason is that we throw top away.  Even better, that last (tp >> rshift)
     is going to be ignored, so it doesn't matter how (tp >> 64) is implemented. */
     
  lshift = ((lshift-1) / g_r) * g_r;
  rshift = 64 - lshift;
  mask = ((uint64_t)1 << g_r) - 1;
  while (lshift >= 0) {
    tp = gd->reduce[(top >> lshift) & mask];
    top ^= (tp >> rshift);
    bot ^= (tp << lshift);
    lshift -= g_r;
    rshift += g_r;
  }
    
  return bot;
}

static
void gf_w64_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
  int i, fzb;
  uint64_t a64, smask, rmask, top, bot, tp;
  int lshift, rshift, g_s, g_r;
  gf_region_data rd;
  uint64_t *s64, *d64, *dtop;
  struct gf_w64_group_data *gd;
  gf_internal_t *h = (gf_internal_t *) gf->scratch;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gd = (struct gf_w64_group_data *) h->private;
  g_s = h->arg1;
  g_r = h->arg2;
  gf_w64_group_set_shift_tables(gd->shift, val, h);

  for (i = 63; !(val & (1ULL << i)); i--) ;
  i += g_s;
  
  /* i is the bit position of the first zero bit in any element of gd->shift[] */
  
  if (i > 64) i = 64;   
  
  fzb = i;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
  
  gf_do_initial_region_alignment(&rd);

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  dtop = (uint64_t *) rd.d_top;

  smask = (1 << g_s) - 1;
  rmask = (1 << g_r) - 1;

  while (d64 < dtop) {
    a64 = *s64;
    
    top = 0;
    bot = gd->shift[a64&smask];
    a64 >>= g_s;
    i = fzb;

    if (a64 != 0) {
      lshift = 0;
      rshift = 64;
  
      do {  
        lshift += g_s;
        rshift -= g_s;
        tp = gd->shift[a64&smask];
        top ^= (tp >> rshift);
        bot ^= (tp << lshift);
        a64 >>= g_s;
      } while (a64 != 0);
      i += lshift;
  
      lshift = ((i-64-1) / g_r) * g_r;
      rshift = 64 - lshift;
      while (lshift >= 0) {
        tp = gd->reduce[(top >> lshift) & rmask];
        top ^= (tp >> rshift);    
        bot ^= (tp << lshift);
        lshift -= g_r;
        rshift += g_r;
      }
    }

    if (xor) bot ^= *d64;
    *d64 = bot;
    d64++;
    s64++;
  }
  gf_do_final_region_alignment(&rd);
}

static
inline
gf_val_64_t
gf_w64_group_s_equals_r_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b)
{
  int leftover, rs;
  uint64_t p, l, ind, a64;
  int bits_left;
  int g_s;

  struct gf_w64_group_data *gd;
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  g_s = h->arg1;

  gd = (struct gf_w64_group_data *) h->private;
  gf_w64_group_set_shift_tables(gd->shift, b, h);

  leftover = 64 % g_s;
  if (leftover == 0) leftover = g_s;

  rs = 64 - leftover;
  a64 = a;
  ind = a64 >> rs;
  a64 <<= leftover;
  p = gd->shift[ind];

  bits_left = rs;
  rs = 64 - g_s;

  while (bits_left > 0) {
    bits_left -= g_s;
    ind = a64 >> rs;
    a64 <<= g_s;
    l = p >> rs;
    p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s));
  }
  return p;
}

static
void gf_w64_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
  int leftover, rs;
  uint64_t p, l, ind, a64;
  int bits_left;
  int g_s;
  gf_region_data rd;
  uint64_t *s64, *d64, *top;
  struct gf_w64_group_data *gd;
  gf_internal_t *h = (gf_internal_t *) gf->scratch;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gd = (struct gf_w64_group_data *) h->private;
  g_s = h->arg1;
  gf_w64_group_set_shift_tables(gd->shift, val, h);

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
  gf_do_initial_region_alignment(&rd);

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;

  leftover = 64 % g_s;
  if (leftover == 0) leftover = g_s;

  while (d64 < top) {
    rs = 64 - leftover;
    a64 = *s64;
    ind = a64 >> rs;
    a64 <<= leftover;
    p = gd->shift[ind];

    bits_left = rs;
    rs = 64 - g_s;

    while (bits_left > 0) {
      bits_left -= g_s;
      ind = a64 >> rs;
      a64 <<= g_s;
      l = p >> rs;
      p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s));
    }
    if (xor) p ^= *d64;
    *d64 = p;
    d64++;
    s64++;
  }
  gf_do_final_region_alignment(&rd);
}


static
int gf_w64_group_init(gf_t *gf)
{
  uint64_t i, j, p, index;
  struct gf_w64_group_data *gd;
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  int g_r, g_s;

  g_s = h->arg1;
  g_r = h->arg2;

  gd = (struct gf_w64_group_data *) h->private;
  gd->shift = (uint64_t *) (&(gd->memory));
  gd->reduce = gd->shift + (1 << g_s);

  gd->reduce[0] = 0;
  for (i = 0; i < ((uint64_t)1 << g_r); i++) {
    p = 0;
    index = 0;
    for (j = 0; j < g_r; j++) {
      if (i & (1 << j)) {
        p ^= (h->prim_poly << j);
        index ^= (1 << j);
        if (j > 0) index ^= (h->prim_poly >> (64-j)); 
      }
    }
    gd->reduce[index] = p;
  }

  if (g_s == g_r) {
    gf->multiply.w64 = gf_w64_group_s_equals_r_multiply;
    gf->multiply_region.w64 = gf_w64_group_s_equals_r_multiply_region; 
  } else {
    gf->multiply.w64 = gf_w64_group_multiply;
    gf->multiply_region.w64 = gf_w64_group_multiply_region; 
  }
  gf->divide.w64 = NULL;
  gf->inverse.w64 = gf_w64_euclid;

  return 1;
}

static
gf_val_64_t gf_w64_extract_word(gf_t *gf, void *start, int bytes, int index)
{
  uint64_t *r64, rv;

  r64 = (uint64_t *) start;
  rv = r64[index];
  return rv;
}

static
gf_val_64_t gf_w64_composite_extract_word(gf_t *gf, void *start, int bytes, int index)
{
  int sub_size;
  gf_internal_t *h;
  uint8_t *r8, *top;
  uint64_t a, b, *r64;
  gf_region_data rd;

  h = (gf_internal_t *) gf->scratch;
  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32);
  r64 = (uint64_t *) start;
  if (r64 + index < (uint64_t *) rd.d_start) return r64[index];
  if (r64 + index >= (uint64_t *) rd.d_top) return r64[index];
  index -= (((uint64_t *) rd.d_start) - r64);
  r8 = (uint8_t *) rd.d_start;
  top = (uint8_t *) rd.d_top;
  sub_size = (top-r8)/2;

  a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index);
  b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index);
  return (a | ((uint64_t)b << 32));
}

static
gf_val_64_t gf_w64_split_extract_word(gf_t *gf, void *start, int bytes, int index)
{
  int i;
  uint64_t *r64, rv;
  uint8_t *r8;
  gf_region_data rd;

  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 128);
  r64 = (uint64_t *) start;
  if (r64 + index < (uint64_t *) rd.d_start) return r64[index];
  if (r64 + index >= (uint64_t *) rd.d_top) return r64[index];
  index -= (((uint64_t *) rd.d_start) - r64);
  r8 = (uint8_t *) rd.d_start;
  r8 += ((index & 0xfffffff0)*8);
  r8 += (index & 0xf);
  r8 += 112;
  rv = 0;
  for (i = 0; i < 8; i++) {
    rv <<= 8;
    rv |= *r8;
    r8 -= 16;
  }
  return rv;
}

static
inline
gf_val_64_t
gf_w64_bytwo_b_multiply (gf_t *gf, gf_val_64_t a, gf_val_64_t b)
{
  uint64_t prod, pp, bmask;
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  prod = 0;
  bmask = 0x8000000000000000ULL;

  while (1) {
    if (a & 1) prod ^= b;
    a >>= 1;
    if (a == 0) return prod;
    if (b & bmask) {
      b = ((b << 1) ^ pp);
    } else {
      b <<= 1;
    }
  }
}

static
inline
gf_val_64_t
gf_w64_bytwo_p_multiply (gf_t *gf, gf_val_64_t a, gf_val_64_t b)
{
  uint64_t prod, pp, pmask, amask;
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  prod = 0;
  
  /* changed from declare then shift to just declare.*/
  
  pmask = 0x8000000000000000ULL;
  amask = 0x8000000000000000ULL;

  while (amask != 0) {
    if (prod & pmask) {
      prod = ((prod << 1) ^ pp);
    } else {
      prod <<= 1;
    }
    if (a & amask) prod ^= b;
    amask >>= 1;
  }
  return prod;
}

static
void
gf_w64_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
  uint64_t *s64, *d64, ta, prod, amask, pmask, pp;
  gf_region_data rd;
  gf_internal_t *h;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
  gf_do_initial_region_alignment(&rd);

  h = (gf_internal_t *) gf->scratch;

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  pmask = 0x80000000;
  pmask <<= 32;
  pp = h->prim_poly;

  if (xor) {
    while (s64 < (uint64_t *) rd.s_top) {
      prod = 0;
      amask = pmask;
      ta = *s64;
      while (amask != 0) {
        prod = (prod & pmask) ? ((prod << 1) ^ pp) : (prod << 1);
        if (val & amask) prod ^= ta;
        amask >>= 1;
      }
      *d64 ^= prod;
      d64++;
      s64++;
    }
  } else {
    while (s64 < (uint64_t *) rd.s_top) {
      prod = 0;
      amask = pmask;
      ta = *s64;
      while (amask != 0) {
        prod = (prod & pmask) ? ((prod << 1) ^ pp) : (prod << 1);
        if (val & amask) prod ^= ta;
        amask >>= 1;
      }
      *d64 = prod;
      d64++;
      s64++;
    }
  }
  gf_do_final_region_alignment(&rd);
}

static
void
gf_w64_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
  uint64_t *s64, *d64, ta, tb, prod, bmask, pp;
  gf_region_data rd;
  gf_internal_t *h;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
  gf_do_initial_region_alignment(&rd);

  h = (gf_internal_t *) gf->scratch;

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  bmask = 0x80000000;
  bmask <<= 32;
  pp = h->prim_poly;

  if (xor) {
    while (s64 < (uint64_t *) rd.s_top) {
      prod = 0;
      tb = val;
      ta = *s64;
      while (1) {
        if (tb & 1) prod ^= ta;
        tb >>= 1;
        if (tb == 0) break;
        ta = (ta & bmask) ? ((ta << 1) ^ pp) : (ta << 1);
      }
      *d64 ^= prod;
      d64++;
      s64++;
    }
  } else {
    while (s64 < (uint64_t *) rd.s_top) {
      prod = 0;
      tb = val;
      ta = *s64;
      while (1) {
        if (tb & 1) prod ^= ta;
        tb >>= 1;
        if (tb == 0) break;
        ta = (ta & bmask) ? ((ta << 1) ^ pp) : (ta << 1);
      }
      *d64 = prod;
      d64++;
      s64++;
    }
  }
  gf_do_final_region_alignment(&rd);
}

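/* As used below, SSE_AB2 multiplies each 64-bit lane of va by x modulo the
   primitive polynomial pp: t2 becomes an all-ones lane exactly when that
   lane's high bit was set, so pp is XORed in only where the shift overflowed. */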
#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\
          t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \
          t2 = _mm_and_si128(va, m2); \
          t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \
          va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); }

#define BYTWO_P_ONESTEP {\
      SSE_AB2(pp, m1 ,m2, prod, t1, t2); \
      t1 = _mm_and_si128(v, one); \
      t1 = _mm_sub_epi64(t1, one); \
      t1 = _mm_and_si128(t1, ta); \
      prod = _mm_xor_si128(prod, t1); \
      v = _mm_srli_epi64(v, 1); }


void gf_w64_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
#ifdef INTEL_SSE2
  int i;
  uint8_t *s8, *d8;
  uint64_t vrev, one64;
  uint64_t amask;
  __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v;
  gf_region_data rd;
  gf_internal_t *h;
  
  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  h = (gf_internal_t *) gf->scratch;
  one64 = 1;
  vrev = 0;
  for (i = 0; i < 64; i++) {
    vrev <<= 1;
    if (!(val & (one64 << i))) vrev |= 1;
  }

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;

  amask = -1;
  amask ^= 1;
  pp = _mm_set1_epi64x(h->prim_poly);
  m1 = _mm_set1_epi64x(amask);
  m2 = _mm_set1_epi64x(one64 << 63);
  one = _mm_set1_epi64x(1);

  while (d8 < (uint8_t *) rd.d_top) {
    prod = _mm_setzero_si128();
    v = _mm_set1_epi64x(vrev);
    ta = _mm_load_si128((__m128i *) s8);
    tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8);
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp));
    d8 += 16;
    s8 += 16;
  }
  gf_do_final_region_alignment(&rd);
#endif
}

#ifdef INTEL_SSE2
static
void
gf_w64_bytwo_b_sse_region_2_xor(gf_region_data *rd)
{
  uint64_t one64, amask;
  uint8_t *d8, *s8;
  __m128i pp, m1, m2, t1, t2, va, vb;
  gf_internal_t *h;

  s8 = (uint8_t *) rd->s_start;
  d8 = (uint8_t *) rd->d_start;

  h = (gf_internal_t *) rd->gf->scratch;
  one64 = 1;
  amask = -1;
  amask ^= 1;
  pp = _mm_set1_epi64x(h->prim_poly);
  m1 = _mm_set1_epi64x(amask);
  m2 = _mm_set1_epi64x(one64 << 63);

  while (d8 < (uint8_t *) rd->d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    SSE_AB2(pp, m1, m2, va, t1, t2);
    vb = _mm_load_si128 ((__m128i *)(d8));
    vb = _mm_xor_si128(vb, va);
    _mm_store_si128((__m128i *)d8, vb);
    d8 += 16;
    s8 += 16;
  }
}
#endif

#ifdef INTEL_SSE2
static
void
gf_w64_bytwo_b_sse_region_2_noxor(gf_region_data *rd)
{
  uint64_t one64, amask;
  uint8_t *d8, *s8;
  __m128i pp, m1, m2, t1, t2, va;
  gf_internal_t *h;

  s8 = (uint8_t *) rd->s_start;
  d8 = (uint8_t *) rd->d_start;

  h = (gf_internal_t *) rd->gf->scratch;
  one64 = 1;
  amask = -1;
  amask ^= 1;
  pp = _mm_set1_epi64x(h->prim_poly);
  m1 = _mm_set1_epi64x(amask);
  m2 = _mm_set1_epi64x(one64 << 63);

  while (d8 < (uint8_t *) rd->d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    SSE_AB2(pp, m1, m2, va, t1, t2);
    _mm_store_si128((__m128i *)d8, va);
    d8 += 16;
    s8 += 16;
  }
}
#endif

#ifdef INTEL_SSE2
static
void
gf_w64_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
  uint64_t itb, amask, one64;
  uint8_t *d8, *s8;
  __m128i pp, m1, m2, t1, t2, va, vb;
  gf_region_data rd;
  gf_internal_t *h;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  if (val == 2) {
    if (xor) {
      gf_w64_bytwo_b_sse_region_2_xor(&rd);
    } else {
      gf_w64_bytwo_b_sse_region_2_noxor(&rd);
    }
    gf_do_final_region_alignment(&rd);
    return;
  }

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;
  h = (gf_internal_t *) gf->scratch;

  one64 = 1;
  amask = -1;
  amask ^= 1;
  pp = _mm_set1_epi64x(h->prim_poly);
  m1 = _mm_set1_epi64x(amask);
  m2 = _mm_set1_epi64x(one64 << 63);

  while (d8 < (uint8_t *) rd.d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    vb = (!xor) ? _mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8));
    itb = val;
    while (1) {
      if (itb & 1) vb = _mm_xor_si128(vb, va);
      itb >>= 1;
      if (itb == 0) break;
      SSE_AB2(pp, m1, m2, va, t1, t2);
    }
    _mm_store_si128((__m128i *)d8, vb);
    d8 += 16;
    s8 += 16;
  }

  gf_do_final_region_alignment(&rd);
}
#endif


static
int gf_w64_bytwo_init(gf_t *gf)
{
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;

  if (h->mult_type == GF_MULT_BYTWO_p) {
    gf->multiply.w64 = gf_w64_bytwo_p_multiply;
    #ifdef INTEL_SSE2 
      if (h->region_type & GF_REGION_NOSIMD)
        gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region; 
      else
        gf->multiply_region.w64 = gf_w64_bytwo_p_sse_multiply_region; 
    #else
      gf->multiply_region.w64 = gf_w64_bytwo_p_nosse_multiply_region; 
      if(h->region_type & GF_REGION_SIMD)
        return 0;
    #endif
  } else {
    gf->multiply.w64 = gf_w64_bytwo_b_multiply;
    #ifdef INTEL_SSE2 
      if (h->region_type & GF_REGION_NOSIMD)
        gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region; 
      else
        gf->multiply_region.w64 = gf_w64_bytwo_b_sse_multiply_region; 
    #else
      gf->multiply_region.w64 = gf_w64_bytwo_b_nosse_multiply_region; 
      if(h->region_type & GF_REGION_SIMD)
        return 0;
    #endif
  }
  gf->inverse.w64 = gf_w64_euclid;
  return 1;
}


static
gf_val_64_t
gf_w64_composite_multiply(gf_t *gf, gf_val_64_t a, gf_val_64_t b)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint32_t b0 = b & 0x00000000ffffffff;
  uint32_t b1 = (b & 0xffffffff00000000) >> 32;
  uint32_t a0 = a & 0x00000000ffffffff;
  uint32_t a1 = (a & 0xffffffff00000000) >> 32;
  uint32_t a1b1;

  a1b1 = base_gf->multiply.w32(base_gf, a1, b1);

  return ((uint64_t)(base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) | 
         ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32));
}

/*
 * Composite field division trick (explained in 2007 tech report)
 *
 * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1
 *
 * let c = b^-1
 *
 * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0)
 *
 * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1
 *
 * let d = b1c1 and d+1 = b0c0
 *
 * solve s*b1c1+b1c0+b0c1 = 0
 *
 * substituting c1 = d*b1^-1 and c0 = (d+1)*b0^-1 (from the definition of d)
 * gives d*(s + b1b0^-1 + b0b1^-1) = b1b0^-1, hence
 *
 * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1
 *
 * c0 = (d+1)b0^-1
 * c1 = d*b1^-1
 *
 * a / b = a * c
 */
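/*
 * Minimal sketch of how the trick composes (hypothetical helper, not part of
 * GF-Complete's API): once gf_w64_composite_inverse() below yields c = b^-1,
 * division is a single composite multiply, which is what the generic
 * gf_w64_divide_from_inverse() wrapper does once gf->inverse.w64 is set.
 */
static inline
gf_val_64_t gf_w64_composite_divide_sketch(gf_t *gf, gf_val_64_t a, gf_val_64_t b)
{
  return gf->multiply.w64(gf, a, gf->inverse.w64(gf, b));   /* a * b^-1 */
}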

static
gf_val_64_t
gf_w64_composite_inverse(gf_t *gf, gf_val_64_t a)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint32_t a0 = a & 0x00000000ffffffff;
  uint32_t a1 = (a & 0xffffffff00000000) >> 32;
  uint32_t c0, c1, d, tmp;
  uint64_t c;
  uint32_t a0inv, a1inv;

  if (a0 == 0) {
    a1inv = base_gf->inverse.w32(base_gf, a1);
    c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly);
    c1 = a1inv;
  } else if (a1 == 0) {
    c0 = base_gf->inverse.w32(base_gf, a0);
    c1 = 0;
  } else {
    a1inv = base_gf->inverse.w32(base_gf, a1);
    a0inv = base_gf->inverse.w32(base_gf, a0);

    d = base_gf->multiply.w32(base_gf, a1, a0inv);

    tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly);
    tmp = base_gf->inverse.w32(base_gf, tmp);

    d = base_gf->multiply.w32(base_gf, d, tmp);

    c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv);
    c1 = base_gf->multiply.w32(base_gf, d, a1inv);
  }

  c = c0 | ((uint64_t)c1 << 32);

  return c;
}

static
void
gf_w64_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint32_t b0 = val & 0x00000000ffffffff;
  uint32_t b1 = (val & 0xffffffff00000000) >> 32;
  uint64_t *s64, *d64;
  uint64_t *top;
  uint64_t a0, a1, a1b1;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);

  s64 = rd.s_start;
  d64 = rd.d_start;
  top = rd.d_top;
  
  if (xor) {
    while (d64 < top) {
      a0 = *s64 & 0x00000000ffffffff;
      a1 = (*s64 & 0xffffffff00000000) >> 32;
      a1b1 = base_gf->multiply.w32(base_gf, a1, b1);

      *d64 ^= ((uint64_t)(base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
                ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32));
      s64++;
      d64++;
    }
  } else {
    while (d64 < top) {
      a0 = *s64 & 0x00000000ffffffff;
      a1 = (*s64 & 0xffffffff00000000) >> 32;
      a1b1 = base_gf->multiply.w32(base_gf, a1, b1);

      *d64 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
                ((uint64_t)(base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 32));
      s64++;
      d64++;
    }
  }
}

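/*
 * ALTMAP layout: the aligned region stores the 32-bit low halves of all 64-bit
 * words in its first half and the high halves in its second half, so each term
 * of the composite product becomes one multiply_region.w32 call on the base
 * field (five calls in total).
 */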
static
void
gf_w64_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_64_t val, int bytes, int xor)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  gf_val_32_t val0 = val & 0x00000000ffffffff;
  gf_val_32_t val1 = (val & 0xffffffff00000000) >> 32;
  uint8_t *slow, *shigh;
  uint8_t *dlow, *dhigh, *top;
  int sub_reg_size;
  gf_region_data rd;

  if (!xor) {
    memset(dest, 0, bytes);
  }
  
  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
  gf_do_initial_region_alignment(&rd);

  slow = (uint8_t *) rd.s_start;
  dlow = (uint8_t *) rd.d_start;
  top = (uint8_t*) rd.d_top;
  sub_reg_size = (top - dlow)/2;
  shigh = slow + sub_reg_size;
  dhigh = dlow + sub_reg_size;

  base_gf->multiply_region.w32(base_gf, slow, dlow, val0, sub_reg_size, xor);
  base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1);
  base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor);
  base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1);
  base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1);

  gf_do_final_region_alignment(&rd);
}



static
int gf_w64_composite_init(gf_t *gf)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;

  if (h->region_type & GF_REGION_ALTMAP) {
    gf->multiply_region.w64 = gf_w64_composite_multiply_region_alt;
  } else {
    gf->multiply_region.w64 = gf_w64_composite_multiply_region;
  }

  gf->multiply.w64 = gf_w64_composite_multiply;
  gf->divide.w64 = NULL;
  gf->inverse.w64 = gf_w64_composite_inverse;

  return 1;
}

#ifdef INTEL_SSSE3
static
  void
gf_w64_split_4_64_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
{
  gf_internal_t *h;
  int i, j, k;
  uint64_t pp, v, *s64, *d64, *top;
  __m128i si, tables[16][8], p[8], v0, mask1;
  struct gf_split_4_64_lazy_data *ld;
  uint8_t btable[16];
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128);
  gf_do_initial_region_alignment(&rd);

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;
 
  ld = (struct gf_split_4_64_lazy_data *) h->private;

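  /* Build the 16 lazy nibble tables: ld->tables[i][k] is val times the value k
     placed in nibble i, and each 64-bit entry is then sliced into 8 byte-planes
     (tables[i][j]) so _mm_shuffle_epi8 can look up 16 nibbles at once. */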
  v = val;
  for (i = 0; i < 16; i++) {
    ld->tables[i][0] = 0;
    for (j = 1; j < 16; j <<= 1) {
      for (k = 0; k < j; k++) {
        ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
      }
      v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
    }
    for (j = 0; j < 8; j++) {
      for (k = 0; k < 16; k++) {
        btable[k] = (uint8_t) ld->tables[i][k];
        ld->tables[i][k] >>= 8;
      }
      tables[i][j] = _mm_loadu_si128((__m128i *) btable);
    }
  }

  mask1 = _mm_set1_epi8(0xf);

  while (d64 != top) {

    if (xor) {
      for (i = 0; i < 8; i++) p[i] = _mm_load_si128 ((__m128i *) (d64+i*2));
    } else {
      for (i = 0; i < 8; i++) p[i] = _mm_setzero_si128();
    }
    i = 0;
    for (k = 0; k < 8; k++) {
      v0 = _mm_load_si128((__m128i *) s64); 
      /* MM_PRINT8("v", v0); */
      s64 += 2;
      
      si = _mm_and_si128(v0, mask1);
  
      for (j = 0; j < 8; j++) {
        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
      }
      i++;
      v0 = _mm_srli_epi32(v0, 4);
      si = _mm_and_si128(v0, mask1);
      for (j = 0; j < 8; j++) {
        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
      }
      i++;
    }
    for (i = 0; i < 8; i++) {
      /* MM_PRINT8("v", p[i]); */
      _mm_store_si128((__m128i *) d64, p[i]);
      d64 += 2;
    }
  }
  gf_do_final_region_alignment(&rd);
}
#endif

#ifdef INTEL_SSE4
static
  void
gf_w64_split_4_64_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint64_t val, int bytes, int xor)
{
  gf_internal_t *h;
  int i, j, k;
  uint64_t pp, v, *s64, *d64, *top;
  __m128i si, tables[16][8], p[8], st[8], mask1, mask8, mask16, t1;
  struct gf_split_4_64_lazy_data *ld;
  uint8_t btable[16];
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128);
  gf_do_initial_region_alignment(&rd);

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;
 
  ld = (struct gf_split_4_64_lazy_data *) h->private;

  v = val;
  for (i = 0; i < 16; i++) {
    ld->tables[i][0] = 0;
    for (j = 1; j < 16; j <<= 1) {
      for (k = 0; k < j; k++) {
        ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
      }
      v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
    }
    for (j = 0; j < 8; j++) {
      for (k = 0; k < 16; k++) {
        btable[k] = (uint8_t) ld->tables[i][k];
        ld->tables[i][k] >>= 8;
      }
      tables[i][j] = _mm_loadu_si128((__m128i *) btable);
    }
  }

  mask1 = _mm_set1_epi8(0xf);
  mask8 = _mm_set1_epi16(0xff);
  mask16 = _mm_set1_epi32(0xffff);

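  /* Each iteration handles 128 bytes: the pack passes rearrange 16 64-bit
     source words into byte-sliced vectors (st[k] holds byte k of every word),
     the nibble lookups accumulate the product byte-planes in p[], and the
     unpack passes restore the standard word layout before storing. */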
  while (d64 != top) {

    for (i = 0; i < 8; i++) p[i] = _mm_setzero_si128();

    for (k = 0; k < 8; k++) {
      st[k]  = _mm_load_si128((__m128i *) s64); 
      s64 += 2;
    }

    for (k = 0; k < 4; k ++) {
      st[k] = _mm_shuffle_epi32(st[k], _MM_SHUFFLE(3,1,2,0));
      st[k+4] = _mm_shuffle_epi32(st[k+4], _MM_SHUFFLE(2,0,3,1));
      t1 = _mm_blend_epi16(st[k], st[k+4], 0xf0);
      st[k] = _mm_srli_si128(st[k], 8);
      st[k+4] = _mm_slli_si128(st[k+4], 8);
      st[k+4] = _mm_blend_epi16(st[k], st[k+4], 0xf0);
      st[k] = t1;
    }

/*
    printf("After pack pass 1\n");
    for (k = 0; k < 8; k++) {
      MM_PRINT8("v", st[k]);
    }
    printf("\n");
 */
    
    t1 = _mm_packus_epi32(_mm_and_si128(st[0], mask16), _mm_and_si128(st[2], mask16));
    st[2] = _mm_packus_epi32(_mm_srli_epi32(st[0], 16), _mm_srli_epi32(st[2], 16));
    st[0] = t1;
    t1 = _mm_packus_epi32(_mm_and_si128(st[1], mask16), _mm_and_si128(st[3], mask16));
    st[3] = _mm_packus_epi32(_mm_srli_epi32(st[1], 16), _mm_srli_epi32(st[3], 16));
    st[1] = t1;
    t1 = _mm_packus_epi32(_mm_and_si128(st[4], mask16), _mm_and_si128(st[6], mask16));
    st[6] = _mm_packus_epi32(_mm_srli_epi32(st[4], 16), _mm_srli_epi32(st[6], 16));
    st[4] = t1;
    t1 = _mm_packus_epi32(_mm_and_si128(st[5], mask16), _mm_and_si128(st[7], mask16));
    st[7] = _mm_packus_epi32(_mm_srli_epi32(st[5], 16), _mm_srli_epi32(st[7], 16));
    st[5] = t1;

/*
    printf("After pack pass 2\n");
    for (k = 0; k < 8; k++) {
      MM_PRINT8("v", st[k]);
    }
    printf("\n");
 */
    t1 = _mm_packus_epi16(_mm_and_si128(st[0], mask8), _mm_and_si128(st[1], mask8));
    st[1] = _mm_packus_epi16(_mm_srli_epi16(st[0], 8), _mm_srli_epi16(st[1], 8));
    st[0] = t1;
    t1 = _mm_packus_epi16(_mm_and_si128(st[2], mask8), _mm_and_si128(st[3], mask8));
    st[3] = _mm_packus_epi16(_mm_srli_epi16(st[2], 8), _mm_srli_epi16(st[3], 8));
    st[2] = t1;
    t1 = _mm_packus_epi16(_mm_and_si128(st[4], mask8), _mm_and_si128(st[5], mask8));
    st[5] = _mm_packus_epi16(_mm_srli_epi16(st[4], 8), _mm_srli_epi16(st[5], 8));
    st[4] = t1;
    t1 = _mm_packus_epi16(_mm_and_si128(st[6], mask8), _mm_and_si128(st[7], mask8));
    st[7] = _mm_packus_epi16(_mm_srli_epi16(st[6], 8), _mm_srli_epi16(st[7], 8));
    st[6] = t1;

/*
    printf("After final pack pass 2\n");
    for (k = 0; k < 8; k++) {
      MM_PRINT8("v", st[k]);
    }
 */
    i = 0;
    for (k = 0; k < 8; k++) {
      si = _mm_and_si128(st[k], mask1);
  
      for (j = 0; j < 8; j++) {
        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
      }
      i++;
      st[k] = _mm_srli_epi32(st[k], 4);
      si = _mm_and_si128(st[k], mask1);
      for (j = 0; j < 8; j++) {
        p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
      }
      i++;
    }

    t1 = _mm_unpacklo_epi8(p[0], p[1]);
    p[1] = _mm_unpackhi_epi8(p[0], p[1]);
    p[0] = t1;
    t1 = _mm_unpacklo_epi8(p[2], p[3]);
    p[3] = _mm_unpackhi_epi8(p[2], p[3]);
    p[2] = t1;
    t1 = _mm_unpacklo_epi8(p[4], p[5]);
    p[5] = _mm_unpackhi_epi8(p[4], p[5]);
    p[4] = t1;
    t1 = _mm_unpacklo_epi8(p[6], p[7]);
    p[7] = _mm_unpackhi_epi8(p[6], p[7]);
    p[6] = t1;

/*
    printf("After unpack pass 1:\n");
    for (i = 0; i < 8; i++) {
      MM_PRINT8("v", p[i]);
    }
 */

    t1 = _mm_unpacklo_epi16(p[0], p[2]);
    p[2] = _mm_unpackhi_epi16(p[0], p[2]);
    p[0] = t1;
    t1 = _mm_unpacklo_epi16(p[1], p[3]);
    p[3] = _mm_unpackhi_epi16(p[1], p[3]);
    p[1] = t1;
    t1 = _mm_unpacklo_epi16(p[4], p[6]);
    p[6] = _mm_unpackhi_epi16(p[4], p[6]);
    p[4] = t1;
    t1 = _mm_unpacklo_epi16(p[5], p[7]);
    p[7] = _mm_unpackhi_epi16(p[5], p[7]);
    p[5] = t1;

/*
    printf("After unpack pass 2:\n");
    for (i = 0; i < 8; i++) {
      MM_PRINT8("v", p[i]);
    }
 */

    t1 = _mm_unpacklo_epi32(p[0], p[4]);
    p[4] = _mm_unpackhi_epi32(p[0], p[4]);
    p[0] = t1;
    t1 = _mm_unpacklo_epi32(p[1], p[5]);
    p[5] = _mm_unpackhi_epi32(p[1], p[5]);
    p[1] = t1;
    t1 = _mm_unpacklo_epi32(p[2], p[6]);
    p[6] = _mm_unpackhi_epi32(p[2], p[6]);
    p[2] = t1;
    t1 = _mm_unpacklo_epi32(p[3], p[7]);
    p[7] = _mm_unpackhi_epi32(p[3], p[7]);
    p[3] = t1;

    if (xor) {
      for (i = 0; i < 8; i++) {
        t1 = _mm_load_si128((__m128i *) d64);
        _mm_store_si128((__m128i *) d64, _mm_xor_si128(p[i], t1));
        d64 += 2;
      }
    } else {
      for (i = 0; i < 8; i++) {
        _mm_store_si128((__m128i *) d64, p[i]);
        d64 += 2;
      }
    }

  }

  gf_do_final_region_alignment(&rd);
}
#endif

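/* Multiply p by x (i.e., by 2) in GF(2^64): shift left and, when the top bit
   is set, reduce by the primitive polynomial; h must be in scope at the point
   of use. */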
#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1);

static
int gf_w64_split_init(gf_t *gf)
{
  gf_internal_t *h;
  struct gf_split_4_64_lazy_data *d4;
  struct gf_split_8_64_lazy_data *d8;
  struct gf_split_8_8_data *d88;
  struct gf_split_16_64_lazy_data *d16;
  uint64_t p, basep;
  int exp, i, j;

  h = (gf_internal_t *) gf->scratch;

  /* Defaults */

  gf->multiply_region.w64 = gf_w64_multiply_region_from_single;

  gf->multiply.w64 = gf_w64_bytwo_p_multiply; 

#if defined(INTEL_SSE4_PCLMUL) 
  if ((!(h->region_type & GF_REGION_NOSIMD) &&
     (h->arg1 == 64 || h->arg2 == 64)) ||
     h->mult_type == GF_MULT_DEFAULT){
   
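    /* Choose a carry-free (PCLMUL) kernel from the size of the primitive
       polynomial: if its nonzero bits fit below bit 33 the _2 kernel is used,
       if they fit below bit 49 the _4 kernel is used, and otherwise no
       carry-free kernel is available so init fails. */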
    if ((0xfffffffe00000000ULL & h->prim_poly) == 0){ 
      gf->multiply.w64 = gf_w64_clm_multiply_2;
      gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_2; 
    }else if((0xfffe000000000000ULL & h->prim_poly) == 0){
      gf->multiply.w64 = gf_w64_clm_multiply_4;
      gf->multiply_region.w64 = gf_w64_clm_multiply_region_from_single_4; 
    }else{
      return 0;
    }
  }
#endif

  gf->inverse.w64 = gf_w64_euclid;

  /* Allen: set region pointers for default mult type. Single pointers are
   * taken care of above (explicitly for sse, implicitly for no sse). */

#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
  if (h->mult_type == GF_MULT_DEFAULT) {
    d4 = (struct gf_split_4_64_lazy_data *) h->private;
    d4->last_value = 0;
#if defined(INTEL_SSE4)
    gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region; 
#elif defined(ARCH_AARCH64)
    gf_w64_neon_split_init(gf);
#endif
  }
#else
  if (h->mult_type == GF_MULT_DEFAULT) {
    d8 = (struct gf_split_8_64_lazy_data *) h->private;
    d8->last_value = 0;
    gf->multiply_region.w64 = gf_w64_split_8_64_lazy_multiply_region;
  }
#endif

  if ((h->arg1 == 4 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 4)) {
    d4 = (struct gf_split_4_64_lazy_data *) h->private;
    d4->last_value = 0;

    if((h->region_type & GF_REGION_ALTMAP) && (h->region_type & GF_REGION_NOSIMD)) return 0;
    if(h->region_type & GF_REGION_ALTMAP)
    {
      #ifdef INTEL_SSSE3
        gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_altmap_multiply_region; 
      #elif defined(ARCH_AARCH64)
        gf_w64_neon_split_init(gf);
      #else
        return 0;
      #endif
    }
    else //no altmap
    {
      #if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
        if(h->region_type & GF_REGION_NOSIMD)
          gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region;
        else
        #if defined(INTEL_SSE4)
          gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region;
        #elif defined(ARCH_AARCH64)
          gf_w64_neon_split_init(gf);
        #endif
      #else
        gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region;
        if(h->region_type & GF_REGION_SIMD)
          return 0;
      #endif
    }
  }
  if ((h->arg1 == 8 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 8)) {
    d8 = (struct gf_split_8_64_lazy_data *) h->private;
    d8->last_value = 0;
    gf->multiply_region.w64 = gf_w64_split_8_64_lazy_multiply_region;
  }
  if ((h->arg1 == 16 && h->arg2 == 64) || (h->arg1 == 64 && h->arg2 == 16)) {
    d16 = (struct gf_split_16_64_lazy_data *) h->private;
    d16->last_value = 0;
    gf->multiply_region.w64 = gf_w64_split_16_64_lazy_multiply_region;
  }
  if ((h->arg1 == 8 && h->arg2 == 8)) {
    d88 = (struct gf_split_8_8_data *) h->private;
    gf->multiply.w64 = gf_w64_split_8_8_multiply;

    /* The performance of this guy sucks, so don't bother with a region op */
    
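    /* Lazy 8x8 tables: tables[exp][i][j] is the GF(2^64) product of i, j and
       x^(8*exp), with basep tracking x^(8*exp); gf_w64_split_8_8_multiply can
       then XOR one table entry per pair of 8-bit operand chunks. */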
    basep = 1;
    for (exp = 0; exp < 15; exp++) {
      for (j = 0; j < 256; j++) d88->tables[exp][0][j] = 0;
      for (i = 0; i < 256; i++) d88->tables[exp][i][0] = 0;
      d88->tables[exp][1][1] = basep;
      for (i = 2; i < 256; i++) {
        if (i&1) {
          p = d88->tables[exp][i^1][1];
          d88->tables[exp][i][1] = p ^ basep;
        } else {
          p = d88->tables[exp][i>>1][1];
          d88->tables[exp][i][1] = GF_MULTBY_TWO(p);
        }
      }
      for (i = 1; i < 256; i++) {
        p = d88->tables[exp][i][1];
        for (j = 1; j < 256; j++) {
          if (j&1) {
            d88->tables[exp][i][j] = d88->tables[exp][i][j^1] ^ p;
          } else {
            d88->tables[exp][i][j] = GF_MULTBY_TWO(d88->tables[exp][i][j>>1]);
          }
        }
      }
      for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep);
    }
  }
  return 1;
}

int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
{
  switch(mult_type)
  {
    case GF_MULT_SHIFT:
      return sizeof(gf_internal_t);
      break;
    case GF_MULT_CARRY_FREE:
      return sizeof(gf_internal_t);
      break;
    case GF_MULT_BYTWO_p:
    case GF_MULT_BYTWO_b:
      return sizeof(gf_internal_t);
      break;

    case GF_MULT_DEFAULT:

      /* Allen: set the *local* arg1 and arg2, just for scratch size purposes,
       * then fall through to split table scratch size code. */

#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
      arg1 = 64;
      arg2 = 4;
#else
      arg1 = 64;
      arg2 = 8;
#endif

    case GF_MULT_SPLIT_TABLE:
        if (arg1 == 8 && arg2 == 8) {
          return sizeof(gf_internal_t) + sizeof(struct gf_split_8_8_data) + 64;
        }
        if ((arg1 == 16 && arg2 == 64) || (arg2 == 16 && arg1 == 64)) {
          return sizeof(gf_internal_t) + sizeof(struct gf_split_16_64_lazy_data) + 64;
        }
        if ((arg1 == 8 && arg2 == 64) || (arg2 == 8 && arg1 == 64)) {
          return sizeof(gf_internal_t) + sizeof(struct gf_split_8_64_lazy_data) + 64;
        }

        if ((arg1 == 64 && arg2 == 4) || (arg1 == 4 && arg2 == 64)) {
          return sizeof(gf_internal_t) + sizeof(struct gf_split_4_64_lazy_data) + 64;
        }
        return 0;
    case GF_MULT_GROUP:
      return sizeof(gf_internal_t) + sizeof(struct gf_w64_group_data) +
               sizeof(uint64_t) * (1 << arg1) +
               sizeof(uint64_t) * (1 << arg2) + 64;
      break;
    case GF_MULT_COMPOSITE:
      if (arg1 == 2) return sizeof(gf_internal_t) + 64;
      return 0;
      break;
    default:
      return 0;
   }
}

int gf_w64_init(gf_t *gf)
{
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;
  
  /* Allen: set default primitive polynomial / irreducible polynomial if needed */

  /* Omitting the leftmost 1 as in w=32 */

  if (h->prim_poly == 0) {
    if (h->mult_type == GF_MULT_COMPOSITE) {
      h->prim_poly = gf_composite_get_default_poly(h->base_gf);
      if (h->prim_poly == 0) return 0; /* This shouldn't happen */
    } else {
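      /* With the omitted leading bit this is x^64 + x^4 + x^3 + x + 1. */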
      h->prim_poly = 0x1b;
    } 
  }

  gf->multiply.w64 = NULL;
  gf->divide.w64 = NULL;
  gf->inverse.w64 = NULL;
  gf->multiply_region.w64 = NULL;

  switch(h->mult_type) {
    case GF_MULT_CARRY_FREE:  if (gf_w64_cfm_init(gf) == 0) return 0; break;
    case GF_MULT_SHIFT:       if (gf_w64_shift_init(gf) == 0) return 0; break;
    case GF_MULT_COMPOSITE:   if (gf_w64_composite_init(gf) == 0) return 0; break;
    case GF_MULT_DEFAULT:
    case GF_MULT_SPLIT_TABLE: if (gf_w64_split_init(gf) == 0) return 0; break; 
    case GF_MULT_GROUP:       if (gf_w64_group_init(gf) == 0) return 0; break; 
    case GF_MULT_BYTWO_p:
    case GF_MULT_BYTWO_b:     if (gf_w64_bytwo_init(gf) == 0) return 0; break;
    default: return 0;
  }
  if (h->divide_type == GF_DIVIDE_EUCLID) {
    gf->divide.w64 = gf_w64_divide_from_inverse;
    gf->inverse.w64 = gf_w64_euclid;
  } 

  if (gf->inverse.w64 != NULL && gf->divide.w64 == NULL) {
    gf->divide.w64 = gf_w64_divide_from_inverse;
  }
  if (gf->inverse.w64 == NULL && gf->divide.w64 != NULL) {
    gf->inverse.w64 = gf_w64_inverse_from_divide;
  }

  if (h->region_type == GF_REGION_CAUCHY) return 0;

  if (h->region_type & GF_REGION_ALTMAP) {
    if (h->mult_type == GF_MULT_COMPOSITE) {
      gf->extract_word.w64 = gf_w64_composite_extract_word;
    } else if (h->mult_type == GF_MULT_SPLIT_TABLE) {
      gf->extract_word.w64 = gf_w64_split_extract_word;
    }
  } else {
    gf->extract_word.w64 = gf_w64_extract_word;
  }

  return 1;
}