Commit 4339569f14c95a8895a347845f8ed6e18b345ace

Authored by Bassam Tabbara
1 parent 7761438c
Exists in master and in 1 other branch v3

Support for runtime SIMD detection

This commit adds support for runtime detection of SIMD instructions. The idea is that you build once with all supported SIMD functions, and the same binaries can then run on different machines with varying levels of SIMD support. At runtime, gf-complete selects the right functions based on the processor.

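gf_cpu.c has the logic to detect SIMD instructions. On Intel processors this is done through cpuid. On 32-bit ARM Linux we read the AT_HWCAP entry from /proc/self/auxv; on aarch64, and on other ARM platforms where no reliable runtime check is available, NEON is assumed to be present.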

The logic in gf_w*.c has been changed to check for runtime SIMD support and fall back to generic code.

Also a new test has been added. It compares the functions selected by gf_init when we enable/disable SIMD support through build flags, with runtime enabling/disabling. The test checks if the results are identical.
.gitignore
... ... @@ -75,4 +75,4 @@ tools/gf_time
75 75 tools/gf_unit_w*
76 76 tools/test-suite.log
77 77 tools/.qemu/
78   -tools/test_simd*.results
  78 +tools/test_simd*.results*
... ...
include/gf_cpu.h 0 → 100644
... ... @@ -0,0 +1,20 @@
  1 +/*
  2 + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
  3 + * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
  4 + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
  5 + *
  6 + * gf_cpu.h
  7 + *
  8 + * Identifies whether the CPU supports SIMD instructions at runtime.
  9 + */
  10 +
  11 +#pragma once
  12 +
  13 +extern int gf_cpu_supports_intel_pclmul;
  14 +extern int gf_cpu_supports_intel_sse4;
  15 +extern int gf_cpu_supports_intel_ssse3;
  16 +extern int gf_cpu_supports_intel_sse3;
  17 +extern int gf_cpu_supports_intel_sse2;
  18 +extern int gf_cpu_supports_arm_neon;
  19 +
  20 +void gf_cpu_identify(void);
... ...
src/Makefile.am
... ... @@ -4,11 +4,21 @@
4 4 AUTOMAKE_OPTIONS = subdir-objects
5 5  
6 6 AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
7   -AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC -Wsign-compare
8 7  
  8 +# avoid using SIMD_FLAGS for code that calls strcmp as new gcc
  9 +# versions will use SIMD for the strcmp implementation. Instead
  10 +# we create a static library just for gf_method that is not compiled
  11 +# with SIMD_FLAGS, this static library will get linked into gf_complete.so
  12 +noinst_LTLIBRARIES = libgf_util.la
  13 +libgf_util_la_SOURCES = gf_method.c
  14 +libgf_util_la_CFLAGS = -O3 -fPIC -Wsign-compare
  15 +
  16 +# we narrowly use SIMD_FLAGS for code that needs it
9 17 lib_LTLIBRARIES = libgf_complete.la
10   -libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \
11   - gf_w64.c gf_w128.c gf_rand.c gf_general.c
  18 +libgf_complete_la_SOURCES = gf.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \
  19 + gf_w64.c gf_w128.c gf_rand.c gf_general.c gf_cpu.c
  20 +libgf_complete_la_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC -Wsign-compare
  21 +libgf_complete_la_LIBADD = libgf_util.la
12 22  
13 23 if HAVE_NEON
14 24 libgf_complete_la_SOURCES += neon/gf_w4_neon.c \
... ...
src/gf.c
... ... @@ -12,6 +12,7 @@
12 12 #include <stdio.h>
13 13 #include <stdlib.h>
14 14 #include <assert.h>
  15 +#include "gf_cpu.h"
15 16  
16 17 int _gf_errno = GF_E_DEFAULT;
17 18  
... ... @@ -207,20 +208,28 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
207 208 if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; }
208 209  
209 210 #ifdef INTEL_SSE2
210   - sse2 = 1;
  211 + if (gf_cpu_supports_intel_sse2) {
  212 + sse2 = 1;
  213 + }
211 214 #endif
212 215  
213 216 #ifdef INTEL_SSSE3
214   - sse3 = 1;
  217 + if (gf_cpu_supports_intel_ssse3) {
  218 + sse3 = 1;
  219 + }
215 220 #endif
216 221  
217 222 #ifdef INTEL_SSE4_PCLMUL
218   - pclmul = 1;
  223 + if (gf_cpu_supports_intel_pclmul) {
  224 + pclmul = 1;
  225 + }
219 226 #endif
220 227  
221 228 #ifdef ARM_NEON
222   - pclmul = (w == 4 || w == 8);
223   - sse3 = 1;
  229 + if (gf_cpu_supports_arm_neon) {
  230 + pclmul = (w == 4 || w == 8);
  231 + sse3 = 1;
  232 + }
224 233 #endif
225 234  
226 235  
... ... @@ -473,6 +482,8 @@ int gf_init_hard(gf_t *gf, int w, int mult_type,
473 482 int sz;
474 483 gf_internal_t *h;
475 484  
  485 + gf_cpu_identify();
  486 +
476 487 if (gf_error_check(w, mult_type, region_type, divide_type,
477 488 arg1, arg2, prim_poly, base_gf) == 0) return 0;
478 489  
... ...
src/gf_cpu.c 0 → 100644
... ... @@ -0,0 +1,153 @@
  1 +/*
  2 + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
  3 + * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
  4 + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
  5 + *
  6 + * gf_cpu.c
  7 + *
  8 + * Identifies whether the CPU supports SIMD instructions at runtime.
  9 + */
  10 +
  11 +#include <stdio.h>
  12 +#include <stdlib.h>
  13 +
/*
 * Runtime SIMD capability flags.  Every flag starts at 0 ("not usable") and
 * is raised by gf_cpu_identify() when the matching instruction set is
 * detected.
 */
int gf_cpu_supports_intel_sse2 = 0;
int gf_cpu_supports_intel_sse3 = 0;
int gf_cpu_supports_intel_ssse3 = 0;
int gf_cpu_supports_intel_sse4 = 0;
int gf_cpu_supports_intel_pclmul = 0;
int gf_cpu_supports_arm_neon = 0;

/* Set to 1 by gf_cpu_identify() so that later calls return immediately. */
int gf_cpu_identified = 0;
  22 +
  23 +#if defined(__x86_64__)
  24 +
  25 +void gf_cpu_identify(void)
  26 +{
  27 + if (gf_cpu_identified) {
  28 + return;
  29 + }
  30 +
  31 + int op = 1, eax, ebx, ecx, edx;
  32 +
  33 + __asm__("cpuid"
  34 + : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
  35 + : "a" (op));
  36 +
  37 +#if defined(INTEL_SSE4_PCLMUL)
  38 + if ((ecx & 1) != 0 && !getenv("GF_COMPLETE_DISABLE_SSE4_PCLMUL")) {
  39 + gf_cpu_supports_intel_pclmul = 1;
  40 +#ifdef DEBUG_CPU_DETECTION
  41 + printf("#gf_cpu_supports_intel_pclmul\n");
  42 +#endif
  43 + }
  44 +#endif
  45 +
  46 +#if defined(INTEL_SSE4)
  47 + if (((ecx & (1<<20)) != 0 || (ecx & (1<<19)) != 0) && !getenv("GF_COMPLETE_DISABLE_SSE4")) {
  48 + gf_cpu_supports_intel_sse4 = 1;
  49 +#ifdef DEBUG_CPU_DETECTION
  50 + printf("#gf_cpu_supports_intel_sse4\n");
  51 +#endif
  52 + }
  53 +#endif
  54 +
  55 +#if defined(INTEL_SSSE3)
  56 + if ((ecx & (1<<9)) != 0 && !getenv("GF_COMPLETE_DISABLE_SSSE3")) {
  57 + gf_cpu_supports_intel_ssse3 = 1;
  58 +#ifdef DEBUG_CPU_DETECTION
  59 + printf("#gf_cpu_supports_intel_ssse3\n");
  60 +#endif
  61 + }
  62 +#endif
  63 +
  64 +#if defined(INTEL_SSE3)
  65 + if ((ecx & 1) != 0 && !getenv("GF_COMPLETE_DISABLE_SSE3")) {
  66 + gf_cpu_supports_intel_sse3 = 1;
  67 +#ifdef DEBUG_CPU_DETECTION
  68 + printf("#gf_cpu_supports_intel_sse3\n");
  69 +#endif
  70 + }
  71 +#endif
  72 +
  73 +#if defined(INTEL_SSE2)
  74 + if ((edx & (1<<26)) != 0 && !getenv("GF_COMPLETE_DISABLE_SSE2")) {
  75 + gf_cpu_supports_intel_sse2 = 1;
  76 +#ifdef DEBUG_CPU_DETECTION
  77 + printf("#gf_cpu_supports_intel_sse2\n");
  78 +#endif
  79 + }
  80 +#endif
  81 +
  82 + gf_cpu_identified = 1;
  83 +}
  84 +
  85 +#elif defined(__arm__) || defined(__aarch64__)
  86 +
  87 +#ifdef __linux__
  88 +
  89 +#include <stdio.h>
  90 +#include <unistd.h>
  91 +#include <elf.h>
  92 +#include <linux/auxvec.h>
  93 +#include <asm/hwcap.h>
  94 +#include <fcntl.h>
  95 +
/*
 * Looks up one entry of this process's ELF auxiliary vector.
 *
 * type: the AT_* key to search for (the caller in this file passes AT_HWCAP).
 *
 * Returns the value stored for that key, or 0 when /proc/self/auxv cannot be
 * opened, a read fails or comes up short, or no entry with that key exists.
 */
unsigned long get_hwcap(unsigned long type) {
  /*
   * The kernel exposes each auxv entry as an (id, value) pair of native
   * unsigned longs.  Reading it in that form works on both 32-bit ARM and
   * aarch64, whereas Elf32_auxv_t is only right on 32-bit targets.
   */
  struct {
    unsigned long a_type;
    unsigned long a_val;
  } entry;
  unsigned long hwcap = 0;
  int fd = open("/proc/self/auxv", O_RDONLY);

  /* 0 is a valid descriptor; only negative values signal failure. */
  if (fd < 0) {
    return 0;
  }

  /* Stop on EOF (0), on error (-1) and on a truncated trailing entry. */
  while (read(fd, &entry, sizeof entry) == (ssize_t) sizeof entry) {
    if (entry.a_type == type) {
      hwcap = entry.a_val;
      break;
    }
  }
  close(fd);

  return hwcap;
}
  112 +
  113 +#endif // linux
  114 +
  115 +void gf_cpu_identify(void)
  116 +{
  117 + if (gf_cpu_identified) {
  118 + return;
  119 + }
  120 +
  121 +#if defined(ARM_NEON)
  122 + if (!getenv("GF_COMPLETE_DISABLE_NEON")) {
  123 +#if __linux__ && __arm__
  124 + gf_cpu_supports_arm_neon = (get_hwcap(AT_HWCAP) & HWCAP_NEON) > 0;
  125 +#elif __aarch64__
  126 + // ASIMD is supported on all aarch64 architectures
  127 + gf_cpu_supports_arm_neon = 1;
  128 +#else
  129 + // we assume that NEON is supported if the compiler supports
  130 + // NEON and we dont have a reliable way to detect runtime support.
  131 + gf_cpu_supports_arm_neon = 1;
  132 +#endif
  133 +
  134 +#ifdef DEBUG_CPU_DETECTION
  135 + if (gf_cpu_supports_arm_neon) {
  136 + printf("#gf_cpu_supports_arm_neon\n");
  137 + }
  138 +#endif
  139 + }
  140 +#endif // defined(ARM_NEON)
  141 +
  142 + gf_cpu_identified = 1;
  143 +}
  144 +
  145 +#else // defined(__arm__) || defined(__aarch64__)
  146 +
  147 +int gf_cpu_identify(void)
  148 +{
  149 + gf_cpu_identified = 1;
  150 + return 0;
  151 +}
  152 +
  153 +#endif
... ...
src/gf_w128.c
... ... @@ -11,6 +11,7 @@
11 11 #include "gf_int.h"
12 12 #include <stdio.h>
13 13 #include <stdlib.h>
  14 +#include "gf_cpu.h"
14 15  
15 16 #define GF_FIELD_WIDTH (128)
16 17  
... ... @@ -290,11 +291,11 @@ gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_12
290 291 return;
291 292 }
292 293  
  294 +#if defined(INTEL_SSE4_PCLMUL)
  295 +
293 296 void
294 297 gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
295 298 {
296   -#if defined(INTEL_SSE4_PCLMUL)
297   -
298 299 __m128i a,b;
299 300 __m128i result0,result1;
300 301 __m128i prim_poly;
... ... @@ -338,9 +339,8 @@ gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_
338 339  
339 340 c128[0] = (uint64_t)_mm_extract_epi64(result1,1);
340 341 c128[1] = (uint64_t)_mm_extract_epi64(result1,0);
341   -#endif
342   -return;
343 342 }
  343 +#endif
344 344  
345 345 void
346 346 gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
... ... @@ -376,10 +376,10 @@ gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_
376 376 return;
377 377 }
378 378  
  379 +#if defined(INTEL_SSE4)
379 380 void
380 381 gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
381 382 {
382   -#if defined(INTEL_SSE4)
383 383 int i;
384 384 __m128i a, b, pp, prod, amask, u_middle_one;
385 385 /*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/
... ... @@ -427,16 +427,16 @@ gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_
427 427 }
428 428 c128[0] = (uint64_t)_mm_extract_epi64(prod, 1);
429 429 c128[1] = (uint64_t)_mm_extract_epi64(prod, 0);
430   -#endif
431 430 return;
432 431 }
  432 +#endif
433 433  
434 434  
435 435 /* Ben: This slow function implements sse instrutions for bytwo_b because why not */
  436 +#if defined(INTEL_SSE4)
436 437 void
437 438 gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
438 439 {
439   -#if defined(INTEL_SSE4)
440 440 __m128i a, b, lmask, hmask, pp, c, middle_one;
441 441 gf_internal_t *h;
442 442 uint64_t topbit, middlebit;
... ... @@ -471,8 +471,8 @@ gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_
471 471 if (middlebit) b = _mm_xor_si128(b, middle_one);
472 472 if (topbit) b = _mm_xor_si128(b, pp);
473 473 }
474   -#endif
475 474 }
  475 +#endif
476 476  
477 477 void
478 478 gf_w128_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
... ... @@ -1146,7 +1146,7 @@ gf_w128_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val,
1146 1146 }
1147 1147  
1148 1148 /* a^-1 -> b */
1149   - void
  1149 +void
1150 1150 gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
1151 1151 {
1152 1152 uint64_t e_i[2], e_im1[2], e_ip1[2];
... ... @@ -1239,7 +1239,7 @@ gf_w128_euclid(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
1239 1239 return;
1240 1240 }
1241 1241  
1242   - void
  1242 +void
1243 1243 gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
1244 1244 {
1245 1245 uint64_t d[2];
... ... @@ -1248,7 +1248,7 @@ gf_w128_divide_from_inverse(GFP gf, gf_val_128_t a128, gf_val_128_t b128, gf_val
1248 1248 return;
1249 1249 }
1250 1250  
1251   - void
  1251 +void
1252 1252 gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
1253 1253 {
1254 1254 uint64_t one128[2];
... ... @@ -1260,7 +1260,7 @@ gf_w128_inverse_from_divide(GFP gf, gf_val_128_t a128, gf_val_128_t b128)
1260 1260  
1261 1261  
1262 1262 static
1263   - void
  1263 +void
1264 1264 gf_w128_composite_inverse(gf_t *gf, gf_val_128_t a, gf_val_128_t inv)
1265 1265 {
1266 1266 gf_internal_t *h = (gf_internal_t *) gf->scratch;
... ... @@ -1421,10 +1421,12 @@ static
1421 1421 int gf_w128_cfm_init(gf_t *gf)
1422 1422 {
1423 1423 #if defined(INTEL_SSE4_PCLMUL)
1424   - SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
1425   - SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply)
1426   - SET_FUNCTION(gf,multiply_region,w128,gf_w128_clm_multiply_region_from_single)
1427   - return 1;
  1424 + if (gf_cpu_supports_intel_pclmul) {
  1425 + SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
  1426 + SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply)
  1427 + SET_FUNCTION(gf,multiply_region,w128,gf_w128_clm_multiply_region_from_single)
  1428 + return 1;
  1429 + }
1428 1430 #endif
1429 1431  
1430 1432 return 0;
... ... @@ -1527,7 +1529,7 @@ int gf_w128_split_init(gf_t *gf)
1527 1529  
1528 1530 SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_p_multiply)
1529 1531 #if defined(INTEL_SSE4_PCLMUL)
1530   - if (!(h->region_type & GF_REGION_NOSIMD)){
  1532 + if (gf_cpu_supports_intel_pclmul && !(h->region_type & GF_REGION_NOSIMD)){
1531 1533 SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply)
1532 1534 }
1533 1535 #endif
... ... @@ -1546,23 +1548,19 @@ int gf_w128_split_init(gf_t *gf)
1546 1548 if((h->region_type & GF_REGION_ALTMAP))
1547 1549 {
1548 1550 #ifdef INTEL_SSE4
1549   - if(!(h->region_type & GF_REGION_NOSIMD))
  1551 + if(gf_cpu_supports_intel_sse4 && !(h->region_type & GF_REGION_NOSIMD))
1550 1552 SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_sse_altmap_multiply_region)
1551 1553 else
1552   - return 0;
1553   - #else
1554   - return 0;
1555 1554 #endif
  1555 + return 0;
1556 1556 }
1557 1557 else {
1558 1558 #ifdef INTEL_SSE4
1559   - if(!(h->region_type & GF_REGION_NOSIMD))
  1559 + if(gf_cpu_supports_intel_sse4 && !(h->region_type & GF_REGION_NOSIMD))
1560 1560 SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_sse_multiply_region)
1561 1561 else
1562   - SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_multiply_region)
1563   - #else
1564   - SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_multiply_region)
1565 1562 #endif
  1563 + SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_multiply_region)
1566 1564 }
1567 1565 }
1568 1566 return 1;
... ...
src/gf_w16.c
... ... @@ -12,6 +12,7 @@
12 12 #include <stdio.h>
13 13 #include <stdlib.h>
14 14 #include "gf_w16.h"
  15 +#include "gf_cpu.h"
15 16  
16 17 #define AB2(ip, am1 ,am2, b, t1, t2) {\
17 18 t1 = (b << 1) & am1;\
... ... @@ -391,6 +392,7 @@ gf_val_32_t gf_w16_matrix (gf_t *gf, gf_val_32_t b)
391 392 extra memory.
392 393 */
393 394  
  395 +#if defined(INTEL_SSE4_PCLMUL)
394 396 static
395 397 inline
396 398 gf_val_32_t
... ... @@ -398,8 +400,6 @@ gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
398 400 {
399 401 gf_val_32_t rv = 0;
400 402  
401   -#if defined(INTEL_SSE4_PCLMUL)
402   -
403 403 __m128i a, b;
404 404 __m128i result;
405 405 __m128i prim_poly;
... ... @@ -433,11 +433,11 @@ gf_w16_clm_multiply_2 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
433 433  
434 434 rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
435 435  
436   -
437   -#endif
438 436 return rv;
439 437 }
  438 +#endif
440 439  
  440 +#if defined(INTEL_SSE4_PCLMUL)
441 441 static
442 442 inline
443 443 gf_val_32_t
... ... @@ -445,8 +445,6 @@ gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
445 445 {
446 446 gf_val_32_t rv = 0;
447 447  
448   -#if defined(INTEL_SSE4_PCLMUL)
449   -
450 448 __m128i a, b;
451 449 __m128i result;
452 450 __m128i prim_poly;
... ... @@ -473,11 +471,11 @@ gf_w16_clm_multiply_3 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
473 471  
474 472 rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
475 473  
476   -
477   -#endif
478 474 return rv;
479 475 }
  476 +#endif
480 477  
  478 +#if defined(INTEL_SSE4_PCLMUL)
481 479 static
482 480 inline
483 481 gf_val_32_t
... ... @@ -485,8 +483,6 @@ gf_w16_clm_multiply_4 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
485 483 {
486 484 gf_val_32_t rv = 0;
487 485  
488   -#if defined(INTEL_SSE4_PCLMUL)
489   -
490 486 __m128i a, b;
491 487 __m128i result;
492 488 __m128i prim_poly;
... ... @@ -515,10 +511,9 @@ gf_w16_clm_multiply_4 (gf_t *gf, gf_val_32_t a16, gf_val_32_t b16)
515 511  
516 512 rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
517 513  
518   -
519   -#endif
520 514 return rv;
521 515 }
  516 +#endif
522 517  
523 518  
524 519 static
... ... @@ -556,25 +551,27 @@ static
556 551 int gf_w16_cfm_init(gf_t *gf)
557 552 {
558 553 #if defined(INTEL_SSE4_PCLMUL)
559   - gf_internal_t *h;
  554 + if (gf_cpu_supports_intel_pclmul) {
  555 + gf_internal_t *h;
560 556  
561   - h = (gf_internal_t *) gf->scratch;
562   -
563   - /*Ben: Determining how many reductions to do */
564   -
565   - if ((0xfe00 & h->prim_poly) == 0) {
566   - SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_2)
567   - SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_2)
568   - } else if((0xf000 & h->prim_poly) == 0) {
569   - SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_3)
570   - SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_3)
571   - } else if ((0xe000 & h->prim_poly) == 0) {
572   - SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_4)
573   - SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_4)
574   - } else {
575   - return 0;
576   - }
577   - return 1;
  557 + h = (gf_internal_t *) gf->scratch;
  558 +
  559 + /*Ben: Determining how many reductions to do */
  560 +
  561 + if ((0xfe00 & h->prim_poly) == 0) {
  562 + SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_2)
  563 + SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_2)
  564 + } else if((0xf000 & h->prim_poly) == 0) {
  565 + SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_3)
  566 + SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_3)
  567 + } else if ((0xe000 & h->prim_poly) == 0) {
  568 + SET_FUNCTION(gf,multiply,w32,gf_w16_clm_multiply_4)
  569 + SET_FUNCTION(gf,multiply_region,w32,gf_w16_clm_multiply_region_from_single_4)
  570 + } else {
  571 + return 0;
  572 + }
  573 + return 1;
  574 + }
578 575 #endif
579 576  
580 577 return 0;
... ... @@ -688,10 +685,9 @@ int gf_w16_log_init(gf_t *gf)
688 685  
689 686 if (check) {
690 687 if (h->mult_type != GF_MULT_LOG_TABLE) {
691   -
692   -#if defined(INTEL_SSE4_PCLMUL)
693   - return gf_w16_cfm_init(gf);
694   -#endif
  688 + if (gf_cpu_supports_intel_pclmul) {
  689 + return gf_w16_cfm_init(gf);
  690 + }
695 691 return gf_w16_shift_init(gf);
696 692 } else {
697 693 _gf_errno = GF_E_LOGPOLY;
... ... @@ -948,11 +944,11 @@ gf_w16_table_lazy_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t v
948 944 gf_do_final_region_alignment(&rd);
949 945 }
950 946  
  947 +#ifdef INTEL_SSSE3
951 948 static
952 949 void
953 950 gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
954 951 {
955   -#ifdef INTEL_SSSE3
956 952 uint64_t i, j, *s64, *d64, *top64;;
957 953 uint64_t c, prod;
958 954 uint8_t low[4][16];
... ... @@ -1078,14 +1074,14 @@ gf_w16_split_4_16_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_v
1078 1074 }
1079 1075  
1080 1076 gf_do_final_region_alignment(&rd);
1081   -#endif
1082 1077 }
  1078 +#endif
1083 1079  
  1080 +#ifdef INTEL_SSSE3
1084 1081 static
1085 1082 void
1086 1083 gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
1087 1084 {
1088   -#ifdef INTEL_SSSE3
1089 1085 uint64_t i, j, *s64, *d64, *top64;;
1090 1086 uint64_t c, prod;
1091 1087 uint8_t low[4][16];
... ... @@ -1187,8 +1183,8 @@ gf_w16_split_4_16_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des
1187 1183 }
1188 1184 gf_do_final_region_alignment(&rd);
1189 1185  
1190   -#endif
1191 1186 }
  1187 +#endif
1192 1188  
1193 1189 uint32_t
1194 1190 gf_w16_split_8_8_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
... ... @@ -1216,21 +1212,11 @@ int gf_w16_split_init(gf_t *gf)
1216 1212 {
1217 1213 gf_internal_t *h;
1218 1214 struct gf_w16_split_8_8_data *d8;
1219   - int i, j, exp, issse3;
1220   - int isneon = 0;
  1215 + int i, j, exp;
1221 1216 uint32_t p, basep, tmp;
1222 1217  
1223 1218 h = (gf_internal_t *) gf->scratch;
1224 1219  
1225   -#ifdef INTEL_SSSE3
1226   - issse3 = 1;
1227   -#else
1228   - issse3 = 0;
1229   -#endif
1230   -#ifdef ARM_NEON
1231   - isneon = 1;
1232   -#endif
1233   -
1234 1220 if (h->arg1 == 8 && h->arg2 == 8) {
1235 1221 d8 = (struct gf_w16_split_8_8_data *) h->private;
1236 1222 basep = 1;
... ... @@ -1273,36 +1259,45 @@ int gf_w16_split_init(gf_t *gf)
1273 1259  
1274 1260 /* Defaults */
1275 1261  
1276   - if (issse3) {
  1262 +#ifdef INTEL_SSSE3
  1263 + if (gf_cpu_supports_intel_ssse3) {
1277 1264 SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_sse_multiply_region)
1278   - } else if (isneon) {
1279   -#ifdef ARM_NEON
  1265 + } else {
  1266 +#elif ARM_NEON
  1267 + if (gf_cpu_supports_arm_neon) {
1280 1268 gf_w16_neon_split_init(gf);
1281   -#endif
1282 1269 } else {
  1270 +#endif
1283 1271 SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_8_16_lazy_multiply_region)
  1272 +#if defined(INTEL_SSSE3) || defined(ARM_NEON)
1284 1273 }
1285   -
  1274 +#endif
1286 1275  
1287 1276 if ((h->arg1 == 8 && h->arg2 == 16) || (h->arg2 == 8 && h->arg1 == 16)) {
1288 1277 SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_8_16_lazy_multiply_region)
1289 1278  
1290 1279 } else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) {
1291   - if (issse3 || isneon) {
  1280 +#if defined(INTEL_SSSE3) || defined(ARM_NEON)
  1281 + if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) {
1292 1282 if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSIMD)
1293 1283 SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_nosse_altmap_multiply_region)
1294 1284 else if(h->region_type & GF_REGION_NOSIMD)
1295 1285 SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_multiply_region)
1296   - else if(h->region_type & GF_REGION_ALTMAP && issse3)
  1286 +#if defined(INTEL_SSSE3)
  1287 + else if(h->region_type & GF_REGION_ALTMAP && gf_cpu_supports_intel_ssse3)
1297 1288 SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_sse_altmap_multiply_region)
  1289 +#endif
1298 1290 } else {
  1291 +#endif
1299 1292 if(h->region_type & GF_REGION_SIMD)
1300 1293 return 0;
1301 1294 else if(h->region_type & GF_REGION_ALTMAP)
1302 1295 SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_nosse_altmap_multiply_region)
1303 1296 else
1304 1297 SET_FUNCTION(gf,multiply_region,w32,gf_w16_split_4_16_lazy_multiply_region)
  1298 +#if defined(INTEL_SSSE3) || defined(ARM_NEON)
1305 1299 }
  1300 +#endif
1306 1301 }
1307 1302  
1308 1303 return 1;
... ... @@ -1846,26 +1841,28 @@ int gf_w16_bytwo_init(gf_t *gf)
1846 1841 if (h->mult_type == GF_MULT_BYTWO_p) {
1847 1842 SET_FUNCTION(gf,multiply,w32,gf_w16_bytwo_p_multiply)
1848 1843 #ifdef INTEL_SSE2
1849   - if (h->region_type & GF_REGION_NOSIMD)
1850   - SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_nosse_multiply_region)
1851   - else
1852   - SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_sse_multiply_region)
1853   - #else
  1844 + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
  1845 + SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_sse_multiply_region)
  1846 + } else {
  1847 + #endif
1854 1848 SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_p_nosse_multiply_region)
1855 1849 if(h->region_type & GF_REGION_SIMD)
1856 1850 return 0;
  1851 + #ifdef INTEL_SSE2
  1852 + }
1857 1853 #endif
1858 1854 } else {
1859 1855 SET_FUNCTION(gf,multiply,w32,gf_w16_bytwo_b_multiply)
1860 1856 #ifdef INTEL_SSE2
1861   - if (h->region_type & GF_REGION_NOSIMD)
1862   - SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_b_nosse_multiply_region)
1863   - else
  1857 + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
1864 1858 SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_b_sse_multiply_region)
1865   - #else
  1859 + } else {
  1860 + #endif
1866 1861 SET_FUNCTION(gf,multiply_region,w32,gf_w16_bytwo_b_nosse_multiply_region)
1867 1862 if(h->region_type & GF_REGION_SIMD)
1868 1863 return 0;
  1864 + #ifdef INTEL_SSE2
  1865 + }
1869 1866 #endif
1870 1867 }
1871 1868  
... ...
src/gf_w32.c
... ... @@ -13,6 +13,7 @@
13 13 #include <stdio.h>
14 14 #include <stdlib.h>
15 15 #include "gf_w32.h"
  16 +#include "gf_cpu.h"
16 17  
17 18 #define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); }
18 19  
... ... @@ -347,6 +348,8 @@ uint32_t gf_w32_matrix (gf_t *gf, uint32_t b)
347 348 extra memory.
348 349 */
349 350  
  351 +#if defined(INTEL_SSE4_PCLMUL)
  352 +
350 353 static
351 354 inline
352 355 gf_val_32_t
... ... @@ -354,8 +357,6 @@ gf_w32_cfmgk_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
354 357 {
355 358 gf_val_32_t rv = 0;
356 359  
357   -#if defined(INTEL_SSE4_PCLMUL)
358   -
359 360 __m128i a, b;
360 361 __m128i result;
361 362 __m128i w;
... ... @@ -378,9 +379,9 @@ gf_w32_cfmgk_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
378 379  
379 380 /* Extracts 32 bit value from result. */
380 381 rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
381   -#endif
382 382 return rv;
383 383 }
  384 +#endif
384 385  
385 386 #if defined(INTEL_SSE4_PCLMUL)
386 387  
... ... @@ -435,6 +436,8 @@ gf_w32_cfmgk_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32
435 436 #endif
436 437  
437 438  
  439 +#if defined(INTEL_SSE4_PCLMUL)
  440 +
438 441 static
439 442 inline
440 443 gf_val_32_t
... ... @@ -442,8 +445,6 @@ gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
442 445 {
443 446 gf_val_32_t rv = 0;
444 447  
445   -#if defined(INTEL_SSE4_PCLMUL)
446   -
447 448 __m128i a, b;
448 449 __m128i result;
449 450 __m128i prim_poly;
... ... @@ -476,9 +477,11 @@ gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
476 477  
477 478 /* Extracts 32 bit value from result. */
478 479 rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
479   -#endif
480 480 return rv;
481 481 }
  482 +#endif
  483 +
  484 +#if defined(INTEL_SSE4_PCLMUL)
482 485  
483 486 static
484 487 inline
... ... @@ -487,8 +490,6 @@ gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
487 490 {
488 491 gf_val_32_t rv = 0;
489 492  
490   -#if defined(INTEL_SSE4_PCLMUL)
491   -
492 493 __m128i a, b;
493 494 __m128i result;
494 495 __m128i prim_poly;
... ... @@ -515,9 +516,11 @@ gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
515 516 /* Extracts 32 bit value from result. */
516 517  
517 518 rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
518   -#endif
519 519 return rv;
520 520 }
  521 +#endif
  522 +
  523 +#if defined(INTEL_SSE4_PCLMUL)
521 524  
522 525 static
523 526 inline
... ... @@ -526,8 +529,6 @@ gf_w32_clm_multiply_4 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
526 529 {
527 530 gf_val_32_t rv = 0;
528 531  
529   -#if defined(INTEL_SSE4_PCLMUL)
530   -
531 532 __m128i a, b;
532 533 __m128i result;
533 534 __m128i prim_poly;
... ... @@ -556,9 +557,9 @@ gf_w32_clm_multiply_4 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
556 557 /* Extracts 32 bit value from result. */
557 558  
558 559 rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
559   -#endif
560 560 return rv;
561 561 }
  562 +#endif
562 563  
563 564  
564 565 static
... ... @@ -593,29 +594,31 @@ int gf_w32_cfmgk_init(gf_t *gf)
593 594 SET_FUNCTION(gf,multiply_region,w32,gf_w32_multiply_region_from_single)
594 595  
595 596 #if defined(INTEL_SSE4_PCLMUL)
596   - gf_internal_t *h;
  597 + if (gf_cpu_supports_intel_pclmul) {
  598 + gf_internal_t *h;
597 599  
598   - h = (gf_internal_t *) gf->scratch;
599   - SET_FUNCTION(gf,multiply,w32,gf_w32_cfmgk_multiply)
600   - SET_FUNCTION(gf,multiply_region,w32,gf_w32_cfmgk_multiply_region_from_single)
  600 + h = (gf_internal_t *) gf->scratch;
  601 + SET_FUNCTION(gf,multiply,w32,gf_w32_cfmgk_multiply)
  602 + SET_FUNCTION(gf,multiply_region,w32,gf_w32_cfmgk_multiply_region_from_single)
601 603  
602   - uint64_t *q_plus = (uint64_t *) h->private;
603   - uint64_t *g_star = (uint64_t *) h->private + 1;
  604 + uint64_t *q_plus = (uint64_t *) h->private;
  605 + uint64_t *g_star = (uint64_t *) h->private + 1;
604 606  
605   - uint64_t tmp = h->prim_poly << 32;
606   - *q_plus = 1ULL << 32;
  607 + uint64_t tmp = h->prim_poly << 32;
  608 + *q_plus = 1ULL << 32;
607 609  
608   - int i;
609   - for(i = 63; i >= 32; i--)
610   - if((1ULL << i) & tmp)
611   - {
612   - *q_plus |= 1ULL << (i-32);
613   - tmp ^= h->prim_poly << (i-32);
614   - }
  610 + int i;
  611 + for(i = 63; i >= 32; i--)
  612 + if((1ULL << i) & tmp)
  613 + {
  614 + *q_plus |= 1ULL << (i-32);
  615 + tmp ^= h->prim_poly << (i-32);
  616 + }
615 617  
616   - *g_star = h->prim_poly & ((1ULL << 32) - 1);
  618 + *g_star = h->prim_poly & ((1ULL << 32) - 1);
617 619  
618   - return 1;
  620 + return 1;
  621 + }
619 622 #endif
620 623  
621 624 return 0;
... ... @@ -631,23 +634,25 @@ int gf_w32_cfm_init(gf_t *gf)
631 634 /*Ben: Check to see how many reduction steps it will take*/
632 635  
633 636 #if defined(INTEL_SSE4_PCLMUL)
634   - gf_internal_t *h;
  637 + if (gf_cpu_supports_intel_pclmul) {
  638 + gf_internal_t *h;
635 639  
636   - h = (gf_internal_t *) gf->scratch;
  640 + h = (gf_internal_t *) gf->scratch;
637 641  
638   - if ((0xfffe0000 & h->prim_poly) == 0){
639   - SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_2)
640   - SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_2)
641   - }else if ((0xffc00000 & h->prim_poly) == 0){
642   - SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_3)
643   - SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_3)
644   - }else if ((0xfe000000 & h->prim_poly) == 0){
645   - SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_4)
646   - SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_4)
647   - } else {
648   - return 0;
  642 + if ((0xfffe0000 & h->prim_poly) == 0){
  643 + SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_2)
  644 + SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_2)
  645 + }else if ((0xffc00000 & h->prim_poly) == 0){
  646 + SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_3)
  647 + SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_3)
  648 + }else if ((0xfe000000 & h->prim_poly) == 0){
  649 + SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_4)
  650 + SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_4)
  651 + } else {
  652 + return 0;
  653 + }
  654 + return 1;
649 655 }
650   - return 1;
651 656 #endif
652 657  
653 658 return 0;
... ... @@ -1382,26 +1387,28 @@ int gf_w32_bytwo_init(gf_t *gf)
1382 1387 if (h->mult_type == GF_MULT_BYTWO_p) {
1383 1388 SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_p_multiply)
1384 1389 #ifdef INTEL_SSE2
1385   - if (h->region_type & GF_REGION_NOSIMD)
1386   - SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_nosse_multiply_region)
1387   - else
  1390 + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
1388 1391 SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_sse_multiply_region)
1389   - #else
1390   - SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_nosse_multiply_region)
1391   - if(h->region_type & GF_REGION_SIMD)
1392   - return 0;
  1392 + } else {
  1393 + #endif
  1394 + SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_nosse_multiply_region)
  1395 + if(h->region_type & GF_REGION_SIMD)
  1396 + return 0;
  1397 + #ifdef INTEL_SSE2
  1398 + }
1393 1399 #endif
1394 1400 } else {
1395 1401 SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_b_multiply)
1396 1402 #ifdef INTEL_SSE2
1397   - if (h->region_type & GF_REGION_NOSIMD)
1398   - SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_nosse_multiply_region)
1399   - else
  1403 + if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
1400 1404 SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_sse_multiply_region)
1401   - #else
  1405 + } else {
  1406 + #endif
1402 1407 SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_nosse_multiply_region)
1403 1408 if(h->region_type & GF_REGION_SIMD)
1404 1409 return 0;
  1410 + #ifdef INTEL_SSE2
  1411 + }
1405 1412 #endif
1406 1413 }
1407 1414  
... ... @@ -1755,11 +1762,11 @@ gf_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t
1755 1762 gf_do_final_region_alignment(&rd);
1756 1763 }
1757 1764  
  1765 +#ifdef INTEL_SSSE3
1758 1766 static
1759 1767 void
1760 1768 gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
1761 1769 {
1762   -#ifdef INTEL_SSSE3
1763 1770 gf_internal_t *h;
1764 1771 int i, j, k;
1765 1772 uint32_t pp, v, *s32, *d32, *top;
... ... @@ -1942,16 +1949,15 @@ gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *des
1942 1949 }
1943 1950  
1944 1951 gf_do_final_region_alignment(&rd);
1945   -
1946   -#endif
1947 1952 }
  1953 +#endif
1948 1954  
1949 1955  
  1956 +#ifdef INTEL_SSSE3
1950 1957 static
1951 1958 void
1952 1959 gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
1953 1960 {
1954   -#ifdef INTEL_SSSE3
1955 1961 gf_internal_t *h;
1956 1962 int i, j, k;
1957 1963 uint32_t pp, v, *s32, *d32, *top, tmp_table[16];
... ... @@ -2216,9 +2222,8 @@ gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint
2216 2222 }
2217 2223 }
2218 2224 gf_do_final_region_alignment(&rd);
2219   -
2220   -#endif
2221 2225 }
  2226 +#endif
2222 2227  
2223 2228 static
2224 2229 int gf_w32_split_init(gf_t *gf)
... ... @@ -2230,23 +2235,7 @@ int gf_w32_split_init(gf_t *gf)
2230 2235 struct gf_split_8_32_lazy_data *d32;
2231 2236 struct gf_split_16_32_lazy_data *d16;
2232 2237 uint32_t p, basep;
2233   - int i, j, exp, ispclmul, issse3;
2234   - int isneon = 0;
2235   -
2236   -#if defined(INTEL_SSE4_PCLMUL)
2237   - ispclmul = 1;
2238   -#else
2239   - ispclmul = 0;
2240   -#endif
2241   -
2242   -#ifdef INTEL_SSSE3
2243   - issse3 = 1;
2244   -#else
2245   - issse3 = 0;
2246   -#endif
2247   -#ifdef ARM_NEON
2248   - isneon = 1;
2249   -#endif
  2238 + int i, j, exp;
2250 2239  
2251 2240 h = (gf_internal_t *) gf->scratch;
2252 2241  
... ... @@ -2262,7 +2251,8 @@ int gf_w32_split_init(gf_t *gf)
2262 2251  
2263 2252 if (h->arg1 == 8 && h->arg2 == 8) {
2264 2253 SET_FUNCTION(gf,multiply,w32,gf_w32_split_8_8_multiply)
2265   - } else if (ispclmul) {
  2254 +#if defined(INTEL_SSE4_PCLMUL)
  2255 + } else if (gf_cpu_supports_intel_pclmul) {
2266 2256 if ((0xfffe0000 & h->prim_poly) == 0){
2267