Commit b561863b1a14f4c3e786525d68d9474543cbe5a5

Authored by animetosho
1 parent 8a96434c

Remove most instances of 64-bit insert/extract from w=128

Showing 1 changed file with 40 additions and 50 deletions
src/gf_w128.c
... ... @@ -88,7 +88,7 @@ gf_w128_clm_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_
88 88 int xor)
89 89 {
90 90 uint32_t i;
91   - gf_val_128_t s128;
  91 + __m128i* s128;
92 92 gf_val_128_t d128;
93 93 gf_region_data rd;
94 94 __m128i a,b;
... ... @@ -105,20 +105,18 @@ int xor)
105 105 if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
106 106 }
107 107  
108   - s128 = (gf_val_128_t) src;
  108 + s128 = (__m128i*) src;
109 109 d128 = (gf_val_128_t) dest;
110 110  
111 111 if (xor) {
112 112 for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
113   - a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0);
114   - b = _mm_insert_epi64 (a, val[1], 0);
115   - a = _mm_insert_epi64 (a, s128[i], 1);
116   - b = _mm_insert_epi64 (b, val[0], 1);
  113 + a = _mm_loadu_si128 (s128 + (i>>1));
  114 + b = _mm_loadu_si128 ((__m128i*) val);
117 115  
118   - c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
119   - f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
120   - e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/
121   - d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/
  116 + c = _mm_clmulepi64_si128 (a, b, 0x11); /*low-low*/
  117 + f = _mm_clmulepi64_si128 (a, b, 0x10); /*high-low*/
  118 + e = _mm_clmulepi64_si128 (a, b, 0x01); /*low-high*/
  119 + d = _mm_clmulepi64_si128 (a, b, 0x00); /*high-high*/
122 120  
123 121 /* now reusing a and b as temporary variables*/
124 122 result0 = _mm_setzero_si128();
... ... @@ -141,20 +139,18 @@ int xor)
141 139 a = _mm_insert_epi64 (result0, 0, 1);
142 140 b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
143 141 result1 = _mm_xor_si128 (result1, b);
144   - d128[i] ^= (uint64_t)_mm_extract_epi64(result1,1);
145   - d128[i+1] ^= (uint64_t)_mm_extract_epi64(result1,0);
  142 + d128[i] ^= (uint64_t)_mm_cvtsi128_si64(_mm_srli_si128(result1,8));
  143 + d128[i+1] ^= (uint64_t)_mm_cvtsi128_si64(result1);
146 144 }
147 145 } else {
148 146 for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
149   - a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0);
150   - b = _mm_insert_epi64 (a, val[1], 0);
151   - a = _mm_insert_epi64 (a, s128[i], 1);
152   - b = _mm_insert_epi64 (b, val[0], 1);
  147 + a = _mm_loadu_si128 (s128 + (i>>1));
  148 + b = _mm_loadu_si128 ((__m128i*) val);
153 149  
154   - c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
155   - f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
156   - e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/
157   - d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/
  150 + c = _mm_clmulepi64_si128 (a, b, 0x11); /*low-low*/
  151 + f = _mm_clmulepi64_si128 (a, b, 0x10); /*high-low*/
  152 + e = _mm_clmulepi64_si128 (a, b, 0x01); /*low-high*/
  153 + d = _mm_clmulepi64_si128 (a, b, 0x00); /*high-high*/
158 154  
159 155 /* now reusing a and b as temporary variables*/
160 156 result0 = _mm_setzero_si128();
... ... @@ -177,8 +173,8 @@ int xor)
177 173 a = _mm_insert_epi64 (result0, 0, 1);
178 174 b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
179 175 result1 = _mm_xor_si128 (result1, b);
180   - d128[i] = (uint64_t)_mm_extract_epi64(result1,1);
181   - d128[i+1] = (uint64_t)_mm_extract_epi64(result1,0);
  176 + d128[i] = (uint64_t)_mm_cvtsi128_si64(_mm_srli_si128(result1,8));
  177 + d128[i+1] = (uint64_t)_mm_cvtsi128_si64(result1);
182 178 }
183 179 }
184 180 }
... ... @@ -301,10 +297,8 @@ gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_
301 297 __m128i c,d,e,f;
302 298 gf_internal_t * h = gf->scratch;
303 299  
304   - a = _mm_insert_epi64 (_mm_setzero_si128(), a128[1], 0);
305   - b = _mm_insert_epi64 (a, b128[1], 0);
306   - a = _mm_insert_epi64 (a, a128[0], 1);
307   - b = _mm_insert_epi64 (b, b128[0], 1);
  300 + a = _mm_set_epi64x (a128[0], a128[1]);
  301 + b = _mm_set_epi64x (b128[0], b128[1]);
308 302  
309 303 prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);
310 304  
... ... @@ -336,8 +330,8 @@ gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_
336 330 b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
337 331 result1 = _mm_xor_si128 (result1, b);
338 332  
339   - c128[0] = (uint64_t)_mm_extract_epi64(result1,1);
340   - c128[1] = (uint64_t)_mm_extract_epi64(result1,0);
  333 + c128[0] = (uint64_t)_mm_cvtsi128_si64(_mm_srli_si128(result1,8));
  334 + c128[1] = (uint64_t)_mm_cvtsi128_si64(result1);
341 335 #endif
342 336 return;
343 337 }
... ... @@ -390,10 +384,8 @@ gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_
390 384 h = (gf_internal_t *) gf->scratch;
391 385 pp = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);
392 386 prod = _mm_setzero_si128();
393   - a = _mm_insert_epi64(prod, a128[1], 0x0);
394   - a = _mm_insert_epi64(a, a128[0], 0x1);
395   - b = _mm_insert_epi64(prod, b128[1], 0x0);
396   - b = _mm_insert_epi64(b, b128[0], 0x1);
  387 + a = _mm_set_epi64x(a128[0], a128[1]);
  388 + b = _mm_set_epi64x(b128[0], b128[1]);
397 389 pmask = 0x80000000;
398 390 amask = _mm_insert_epi32(prod, 0x80000000, 0x3);
399 391 u_middle_one = _mm_insert_epi32(prod, 1, 0x2);
... ... @@ -408,7 +400,7 @@ gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_
408 400 if (topbit) {
409 401 prod = _mm_xor_si128(prod, pp);
410 402 }
411   - if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 1))) {
  403 + if (((uint64_t)_mm_cvtsi128_si64(_mm_srli_si128(_mm_and_si128(a, amask), 8)))) {
412 404 prod = _mm_xor_si128(prod, b);
413 405 }
414 406 amask = _mm_srli_epi64(amask, 1); /*so does this one, but we can just replace after loop*/
... ... @@ -420,13 +412,13 @@ gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_
420 412 prod = _mm_slli_epi64(prod, 1);
421 413 if (middlebit) prod = _mm_xor_si128(prod, u_middle_one);
422 414 if (topbit) prod = _mm_xor_si128(prod, pp);
423   - if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 0))) {
  415 + if (((uint64_t)_mm_cvtsi128_si64(_mm_and_si128(a, amask)))) {
424 416 prod = _mm_xor_si128(prod, b);
425 417 }
426 418 amask = _mm_srli_epi64(amask, 1);
427 419 }
428   - c128[0] = (uint64_t)_mm_extract_epi64(prod, 1);
429   - c128[1] = (uint64_t)_mm_extract_epi64(prod, 0);
  420 + c128[0] = (uint64_t)_mm_cvtsi128_si64(_mm_srli_si128(prod,8));
  421 + c128[1] = (uint64_t)_mm_cvtsi128_si64(prod);
430 422 #endif
431 423 return;
432 424 }
... ... @@ -444,14 +436,12 @@ gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_
444 436 h = (gf_internal_t *) gf->scratch;
445 437  
446 438 c = _mm_setzero_si128();
447   - lmask = _mm_insert_epi64(c, 1ULL << 63, 0);
448   - hmask = _mm_insert_epi64(c, 1ULL << 63, 1);
449   - b = _mm_insert_epi64(c, a128[0], 1);
450   - b = _mm_insert_epi64(b, a128[1], 0);
451   - a = _mm_insert_epi64(c, b128[0], 1);
452   - a = _mm_insert_epi64(a, b128[1], 0);
453   - pp = _mm_insert_epi64(c, h->prim_poly, 0);
454   - middle_one = _mm_insert_epi64(c, 1, 0x1);
  439 + lmask = _mm_set_epi64x(0, 1ULL << 63);
  440 + hmask = _mm_set_epi64x(1ULL << 63, 0);
  441 + a = _mm_set_epi64x(b128[0], b128[1]);
  442 + b = _mm_set_epi64x(a128[0], a128[1]);
  443 + pp = _mm_set_epi64x(0, h->prim_poly);
  444 + middle_one = _mm_set_epi64x(1, 0);
455 445  
456 446 while (1) {
457 447 if (_mm_extract_epi32(a, 0x0) & 1) {
... ... @@ -460,13 +450,13 @@ gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_
460 450 middlebit = (_mm_extract_epi32(a, 0x2) & 1);
461 451 a = _mm_srli_epi64(a, 1);
462 452 if (middlebit) a = _mm_xor_si128(a, lmask);
463   - if ((_mm_extract_epi64(a, 0x1) == 0ULL) && (_mm_extract_epi64(a, 0x0) == 0ULL)){
464   - c128[0] = _mm_extract_epi64(c, 0x1);
465   - c128[1] = _mm_extract_epi64(c, 0x0);
  453 + if (_mm_movemask_epi8(_mm_cmpeq_epi8(a, _mm_setzero_si128())) == 0xffff){
  454 + c128[0] = _mm_cvtsi128_si64(_mm_srli_si128(c, 8));
  455 + c128[1] = _mm_cvtsi128_si64(c);
466 456 return;
467 457 }
468   - topbit = (_mm_extract_epi64(_mm_and_si128(b, hmask), 1));
469   - middlebit = (_mm_extract_epi64(_mm_and_si128(b, lmask), 0));
  458 + topbit = (_mm_cvtsi128_si64(_mm_srli_si128(_mm_and_si128(b, hmask), 8)));
  459 + middlebit = (_mm_cvtsi128_si64(_mm_and_si128(b, lmask)));
470 460 b = _mm_slli_epi64(b, 1);
471 461 if (middlebit) b = _mm_xor_si128(b, middle_one);
472 462 if (topbit) b = _mm_xor_si128(b, pp);
... ... @@ -1508,7 +1498,7 @@ void gf_w128_group_r_sse_init(gf_t *gf)
1508 1498 table[i] = zero;
1509 1499 for (j = 0; j < g_r; j++) {
1510 1500 if (i & (1 << j)) {
1511   - table[i] = _mm_xor_si128(table[i], _mm_insert_epi64(zero, pp << j, 0));
  1501 + table[i] = _mm_xor_si128(table[i], _mm_cvtsi64_si128(pp << j));
1512 1502 }
1513 1503 }
1514 1504 }
... ...