Commit e506417c909514b15988e0f0d6596e48b0b64823

Authored by animetosho
1 parent 97b82b98

Eliminate added _mm_set_epi64x in w=128

Showing 1 changed file with 18 additions and 14 deletions   Show diff stats
src/gf_w128.c
... ... @@ -285,16 +285,16 @@ gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_
285 285 __m128i c,d,e,f;
286 286 gf_internal_t * h = gf->scratch;
287 287  
288   - a = _mm_set_epi64x (a128[0], a128[1]);
289   - b = _mm_set_epi64x (b128[0], b128[1]);
  288 + a = _mm_loadu_si128 ((__m128i*) a128);
  289 + b = _mm_loadu_si128 ((__m128i*) b128);
290 290  
291 291 prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);
292 292  
293 293 /* we need to test algorithm 2 later*/
294   - c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
295   - f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
296   - e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/
297   - d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/
  294 + c = _mm_clmulepi64_si128 (a, b, 0x11); /*low-low*/
  295 + f = _mm_clmulepi64_si128 (a, b, 0x10); /*high-low*/
  296 + e = _mm_clmulepi64_si128 (a, b, 0x01); /*low-high*/
  297 + d = _mm_clmulepi64_si128 (a, b, 0x00); /*high-high*/
298 298  
299 299 /* now reusing a and b as temporary variables*/
300 300 a = _mm_xor_si128 (_mm_srli_si128 (e, 8), d);
... ... @@ -366,8 +366,10 @@ gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_
366 366 h = (gf_internal_t *) gf->scratch;
367 367 pp = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);
368 368 prod = _mm_setzero_si128();
369   - a = _mm_set_epi64x(a128[0], a128[1]);
370   - b = _mm_set_epi64x(b128[0], b128[1]);
  369 + a = _mm_loadu_si128((__m128i*)a128);
  370 + b = _mm_loadu_si128((__m128i*)b128);
  371 + a = _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2));
  372 + b = _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2));
371 373 pmask = 0x80000000;
372 374 amask = _mm_insert_epi32(prod, 0x80000000, 0x3);
373 375 u_middle_one = _mm_insert_epi32(prod, 1, 0x2);
... ... @@ -418,12 +420,14 @@ gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_
418 420 h = (gf_internal_t *) gf->scratch;
419 421  
420 422 c = _mm_setzero_si128();
421   - lmask = _mm_set_epi64x(0, 1ULL << 63);
422   - hmask = _mm_set_epi64x(1ULL << 63, 0);
423   - a = _mm_set_epi64x(b128[0], b128[1]);
424   - b = _mm_set_epi64x(a128[0], a128[1]);
425   - pp = _mm_set_epi64x(0, h->prim_poly);
426   - middle_one = _mm_set_epi64x(1, 0);
  423 + lmask = _mm_set_epi32(0, 0, 1UL << 31, 0);
  424 + hmask = _mm_set_epi32(1UL << 31, 0, 0, 0);
  425 + a = _mm_loadu_si128((__m128i*)b128);
  426 + b = _mm_loadu_si128((__m128i*)a128);
  427 + a = _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2));
  428 + b = _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2));
  429 + pp = _mm_cvtsi64_si128(h->prim_poly);
  430 + middle_one = _mm_set_epi32(0, 1, 0, 0);
427 431  
428 432 while (1) {
429 433 if (_mm_extract_epi32(a, 0x0) & 1) {
... ...