Commit 4dc7a55d7f1a83136a143f5d4e996c766ea83265

Authored by animetosho
1 parent e506417c

Eliminate some added 64-bit <-> 128-bit converts

Showing 2 changed files with 19 additions and 27 deletions   Show diff stats
src/gf_w128.c
... ... @@ -88,8 +88,7 @@ gf_w128_clm_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_
88 88 int xor)
89 89 {
90 90 uint32_t i;
91   - __m128i* s128;
92   - gf_val_128_t d128;
  91 + __m128i * s128, * d128;
93 92 gf_region_data rd;
94 93 __m128i a,b;
95 94 __m128i result0,result1;
... ... @@ -106,7 +105,7 @@ int xor)
106 105 }
107 106  
108 107 s128 = (__m128i*) src;
109   - d128 = (gf_val_128_t) dest;
  108 + d128 = (__m128i*) dest;
110 109  
111 110 if (xor) {
112 111 for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
... ... @@ -132,9 +131,9 @@ int xor)
132 131 result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8));
133 132  
134 133 b = _mm_clmulepi64_si128 (result0, prim_poly, 0x00);
135   - result1 = _mm_xor_si128 (result1, b);
136   - d128[i] ^= (uint64_t)_mm_cvtsi128_si64(_mm_srli_si128(result1,8));
137   - d128[i+1] ^= (uint64_t)_mm_cvtsi128_si64(result1);
  134 + result1 = _mm_xor_si128 (result1, b);
  135 + result1 = _mm_shuffle_epi32 (result1, _MM_SHUFFLE(1, 0, 3, 2));
  136 + _mm_storeu_si128 (d128 + (i>>1), _mm_xor_si128 (result1, _mm_loadu_si128 (d128 + (i>>1))));
138 137 }
139 138 } else {
140 139 for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
... ... @@ -161,8 +160,8 @@ int xor)
161 160  
162 161 b = _mm_clmulepi64_si128 (result0, prim_poly, 0x00);
163 162 result1 = _mm_xor_si128 (result1, b);
164   - d128[i] = (uint64_t)_mm_cvtsi128_si64(_mm_srli_si128(result1,8));
165   - d128[i+1] = (uint64_t)_mm_cvtsi128_si64(result1);
  163 + result1 = _mm_shuffle_epi32 (result1, _MM_SHUFFLE(1, 0, 3, 2));
  164 + _mm_storeu_si128 (d128 + (i>>1), result1);
166 165 }
167 166 }
168 167 }
... ... @@ -312,8 +311,8 @@ gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_
312 311 b = _mm_clmulepi64_si128 (result0, prim_poly, 0x00);
313 312 result1 = _mm_xor_si128 (result1, b);
314 313  
315   - c128[0] = (uint64_t)_mm_cvtsi128_si64(_mm_srli_si128(result1,8));
316   - c128[1] = (uint64_t)_mm_cvtsi128_si64(result1);
  314 + result1 = _mm_shuffle_epi32(result1, _MM_SHUFFLE(1, 0, 3, 2));
  315 + _mm_storeu_si128((__m128i*) c128, result1);
317 316 #endif
318 317 return;
319 318 }
... ... @@ -401,8 +400,8 @@ gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_
401 400 }
402 401 amask = _mm_srli_epi64(amask, 1);
403 402 }
404   - c128[0] = (uint64_t)_mm_cvtsi128_si64(_mm_srli_si128(prod,8));
405   - c128[1] = (uint64_t)_mm_cvtsi128_si64(prod);
  403 + prod = _mm_shuffle_epi32(prod, _MM_SHUFFLE(1, 0, 3, 2));
  404 + _mm_storeu_si128((__m128i*) c128, prod);
406 405 #endif
407 406 return;
408 407 }
... ... @@ -426,7 +425,7 @@ gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_
426 425 b = _mm_loadu_si128((__m128i*)a128);
427 426 a = _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2));
428 427 b = _mm_shuffle_epi32(b, _MM_SHUFFLE(1, 0, 3, 2));
429   - pp = _mm_cvtsi64_si128(h->prim_poly);
  428 + pp = _mm_loadl_epi64((__m128i*) &h->prim_poly);
2
  • Loic avatar small 75dpi
    Loic Dachary @dachary

    Is there any guarantee that this is properly memory-aligned?

    Choose File ...   File name...
    Cancel
  • Eec633b4322a1884e5bd1861f08b6112?s=40&d=identicon
    Nyan @Nyan

    I believe there is no alignment guarantee. However, _mm_loadl_epi64 should compile to MOVQ, which, as far as I know, has no alignment requirements. Is that not the case?

    Choose File ...   File name...
    Cancel
430 429 middle_one = _mm_set_epi32(0, 1, 0, 0);
431 430  
432 431 while (1) {
... ... @@ -437,8 +436,8 @@ gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_
437 436 a = _mm_srli_epi64(a, 1);
438 437 if (middlebit) a = _mm_xor_si128(a, lmask);
439 438 if (_mm_movemask_epi8(_mm_cmpeq_epi8(a, _mm_setzero_si128())) == 0xffff){
440   - c128[0] = _mm_cvtsi128_si64(_mm_srli_si128(c, 8));
441   - c128[1] = _mm_cvtsi128_si64(c);
  439 + c = _mm_shuffle_epi32(c, _MM_SHUFFLE(1, 0, 3, 2));
  440 + _mm_storeu_si128((__m128i*) c128, c);
442 441 return;
443 442 }
444 443 topbit = (_mm_cvtsi128_si64(_mm_srli_si128(_mm_and_si128(b, hmask), 8)));
... ... @@ -1484,6 +1483,7 @@ void gf_w128_group_r_sse_init(gf_t *gf)
1484 1483 table[i] = zero;
1485 1484 for (j = 0; j < g_r; j++) {
1486 1485 if (i & (1 << j)) {
  1486 + /* note that _mm_cvtsi64_si128 is unavailable on 32-bit compiles */
1487 1487 table[i] = _mm_xor_si128(table[i], _mm_cvtsi64_si128(pp << j));
1488 1488 }
1489 1489 }
... ...
src/gf_w32.c
... ... @@ -361,15 +361,11 @@ gf_w32_cfmgk_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
361 361 __m128i w;
362 362 __m128i g, q;
363 363 gf_internal_t * h = gf->scratch;
364   - uint64_t g_star, q_plus;
365   -
366   - q_plus = *(uint64_t *) h->private;
367   - g_star = *((uint64_t *) h->private + 1);
368 364  
369 365 a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0);
370 366 b = _mm_insert_epi32 (a, b32, 0);
371   - g = _mm_cvtsi64_si128 (g_star);
372   - q = _mm_cvtsi64_si128 (q_plus);
  367 + g = _mm_loadl_epi64 ((__m128i *) ((uint64_t *) h->private + 1)); /* g_star */
  368 + q = _mm_loadl_epi64 ((__m128i *) h->private); /* q_plus */
373 369  
374 370 result = _mm_clmulepi64_si128 (a, b, 0);
375 371 w = _mm_clmulepi64_si128 (q, _mm_srli_si128 (result, 4), 0);
... ... @@ -398,16 +394,12 @@ gf_w32_cfmgk_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32
398 394 __m128i w;
399 395 __m128i g, q;
400 396 gf_internal_t * h = gf->scratch;
401   - uint64_t g_star, q_plus;
402 397  
403 398 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
404 399 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
405 400  
406   - q_plus = *(uint64_t *) h->private;
407   - g_star = *((uint64_t *) h->private + 1);
408   -
409   - g = _mm_cvtsi64_si128 (g_star);
410   - q = _mm_cvtsi64_si128 (q_plus);
  401 + g = _mm_loadl_epi64 ((__m128i *) ((uint64_t *) h->private + 1)); /* g_star */
  402 + q = _mm_loadl_epi64 ((__m128i *) h->private); /* q_plus */
411 403 a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
412 404 s32 = (uint32_t *) src;
413 405 d32 = (uint32_t *) dest;
... ...