Commit 54d6029b100b55a6d1250bd96a392aa880773af4

Authored by Shengjing Zhu
1 parent 6f802f42
Exists in fix-sse-detection

separate sse functions into multiple files

and compile them with the correct flags.

This ensures that runtime SSE detection works properly.
Otherwise, compiling SSE2 functions with -mssse3 etc. can cause
the program to crash on SSE2-only machines.

Closes: #16
Signed-off-by: Shengjing Zhu <i@zhsj.me>
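
The pattern this commit adopts, in a minimal sketch: each SIMD variant
lives in its own translation unit compiled only with the flags that ISA
actually needs, while the dispatching code is built without any -msse*
flags and picks an implementation at runtime. The names below
(xor_region_sse2, cpu_has_sse2, xor_region) are illustrative only, not
GF-Complete's actual API:

    /* xor_sse2.c -- compiled with -msse2 only, so the compiler can never
     * emit SSSE3+ instructions into this object file. */
    #include <emmintrin.h>
    #include <stddef.h>

    void xor_region_sse2(const unsigned char *src, unsigned char *dst, size_t bytes)
    {
      size_t i;
      for (i = 0; i + 16 <= bytes; i += 16) {
        __m128i s = _mm_loadu_si128((const __m128i *)(src + i));
        __m128i d = _mm_loadu_si128((const __m128i *)(dst + i));
        _mm_storeu_si128((__m128i *)(dst + i), _mm_xor_si128(d, s));
      }
      for (; i < bytes; i++)   /* scalar tail */
        dst[i] ^= src[i];
    }

    /* dispatch.c -- compiled with no -msse* flags at all; safe to run on
     * any x86. The runtime check decides which variant to call. */
    extern int cpu_has_sse2(void);   /* hypothetical cpuid-based probe */

    void xor_region(const unsigned char *src, unsigned char *dst, size_t bytes)
    {
      if (cpu_has_sse2()) {
        xor_region_sse2(src, dst, bytes);
      } else {
        for (size_t i = 0; i < bytes; i++)
          dst[i] ^= src[i];
      }
    }

Before this commit, all variants were compiled into one translation unit
with the union of all SIMD flags, so the compiler was free to use, say,
SSSE3 instructions inside an "SSE2" code path, defeating the runtime check.
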
include/gf_int.h
... ... @@ -109,6 +109,7 @@ void gf_set_region_data(gf_region_data *rd,
109 109 uint64_t val,
110 110 int xor,
111 111 int align);
  112 +void gf_sse2_multby_one(void *src, void *dest, int bytes, int xor);
112 113  
113 114 /* This performs gf->multiply.32() on all of the unaligned bytes in the beginning of the region */
114 115  
... ...
include/gf_w128.h 0 → 100644
... ... @@ -0,0 +1,36 @@
  1 +/*
  2 + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
  3 + * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
  4 + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
  5 + *
  6 + * gf_w128.h
  7 + *
  8 + * Defines and data structures for 128-bit Galois fields
  9 + */
  10 +
  11 +#ifndef GF_COMPLETE_GF_W128_H
  12 +#define GF_COMPLETE_GF_W128_H
  13 +
  14 +#define GF_FIELD_WIDTH (128)
  15 +
  16 +struct gf_w128_split_4_128_data {
  17 + uint64_t last_value[2];
  18 + uint64_t tables[2][32][16];
  19 +};
  20 +
  21 +struct gf_w128_split_8_128_data {
  22 + uint64_t last_value[2];
  23 + uint64_t tables[2][16][256];
  24 +};
  25 +
  26 +typedef struct gf_group_tables_s {
  27 + gf_val_128_t m_table;
  28 + gf_val_128_t r_table;
  29 +} gf_group_tables_t;
  30 +
  31 +void gf_w128_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor);
  32 +void gf_w128_sse4_split_init(gf_t *gf);
  33 +void gf_w128_sse4_pclmul_clm_init(gf_t *gf);
  34 +void gf_w128_sse4_pclmul_clm_region_init(gf_t *gf);
  35 +
  36 +#endif /* GF_COMPLETE_GF_W128_H */
... ...
include/gf_w16.h
... ... @@ -63,4 +63,8 @@ struct gf_w16_composite_data {
63 63  
64 64 void gf_w16_neon_split_init(gf_t *gf);
65 65  
  66 +void gf_w16_sse2_bytwo_init(gf_t *gf);
  67 +void gf_w16_ssse3_split_init(gf_t *gf);
  68 +int gf_w16_sse4_pclmul_clm_multiply_init(gf_t *gf);
  69 +
66 70 #endif /* GF_COMPLETE_GF_W16_H */
... ...
include/gf_w32.h
... ... @@ -68,4 +68,10 @@ struct gf_w32_composite_data {
68 68  
69 69 void gf_w32_neon_split_init(gf_t *gf);
70 70  
  71 +void gf_w32_sse2_bytwo_init(gf_t *gf);
  72 +void gf_w32_ssse3_split_2_init(gf_t *gf);
  73 +void gf_w32_ssse3_split_4_init(gf_t *gf);
  74 +int gf_w32_sse4_pclmul_clm_multiply_init(gf_t *gf);
  75 +int gf_w32_sse4_pclmul_cfmgk_multiply_init(gf_t *gf);
  76 +
71 77 #endif /* GF_COMPLETE_GF_W32_H */
... ...
include/gf_w4.h
... ... @@ -60,4 +60,8 @@ struct gf_bytwo_data {
60 60 int gf_w4_neon_cfm_init(gf_t *gf);
61 61 void gf_w4_neon_single_table_init(gf_t *gf);
62 62  
  63 +void gf_w4_sse2_bytwo_init(gf_t *gf);
  64 +void gf_w4_ssse3_single_table_init(gf_t *gf);
  65 +void gf_w4_sse4_pclmul_clm_multiply_init(gf_t *gf);
  66 +
63 67 #endif /* GF_COMPLETE_GF_W4_H */
... ...
include/gf_w64.h
... ... @@ -47,4 +47,9 @@ struct gf_split_8_8_data {
47 47  
48 48 void gf_w64_neon_split_init(gf_t *gf);
49 49  
  50 +void gf_w64_sse2_bytwo_init(gf_t *gf);
  51 +void gf_w64_ssse3_split_init(gf_t *gf);
  52 +void gf_w64_sse4_split_init(gf_t *gf);
  53 +int gf_w64_sse4_pclmul_clm_multiply_init(gf_t *gf);
  54 +
50 55 #endif /* GF_COMPLETE_GF_W64_H */
... ...
include/gf_w8.h
... ... @@ -96,4 +96,8 @@ struct gf_w8_bytwo_data {
96 96 int gf_w8_neon_cfm_init(gf_t *gf);
97 97 void gf_w8_neon_split_init(gf_t *gf);
98 98  
  99 +void gf_w8_sse2_bytwo_init(gf_t *gf);
  100 +void gf_w8_ssse3_split_multiply_init(gf_t *gf);
  101 +int gf_w8_sse4_pclmul_clm_multiply_init(gf_t *gf);
  102 +
99 103 #endif /* GF_COMPLETE_GF_W8_H */
... ...
m4/ax_ext.m4
... ... @@ -36,37 +36,44 @@ AC_DEFUN([AX_EXT],
36 36  
37 37 AC_CACHE_CHECK([whether sse is enabled], [ax_cv_have_sse_ext], [ax_cv_have_sse_ext=yes])
38 38 if test "$ax_cv_have_sse_ext" = yes; then
39   - AX_CHECK_COMPILE_FLAG(-msse, [SIMD_FLAGS="$SIMD_FLAGS -msse -DINTEL_SSE"], [ax_cv_have_sse_ext=no])
  39 + AX_CHECK_COMPILE_FLAG(-msse, [SIMD_FLAGS="$SIMD_FLAGS -DINTEL_SSE"], [ax_cv_have_sse_ext=no])
  40 + AM_CONDITIONAL([HAVE_SSE], [test "x$ax_cv_have_sse_ext" = "xyes"])
40 41 fi
41 42  
42 43 AC_CACHE_CHECK([whether sse2 is enabled], [ax_cv_have_sse2_ext], [ax_cv_have_sse2_ext=yes])
43 44 if test "$ax_cv_have_sse2_ext" = yes; then
44   - AX_CHECK_COMPILE_FLAG(-msse2, [SIMD_FLAGS="$SIMD_FLAGS -msse2 -DINTEL_SSE2"], [ax_cv_have_sse2_ext=no])
  45 + AX_CHECK_COMPILE_FLAG(-msse2, [SIMD_FLAGS="$SIMD_FLAGS -DINTEL_SSE2"], [ax_cv_have_sse2_ext=no])
  46 + AM_CONDITIONAL([HAVE_SSE2], [test "x$ax_cv_have_sse2_ext" = "xyes"])
45 47 fi
46 48  
47 49 AC_CACHE_CHECK([whether sse3 is enabled], [ax_cv_have_sse3_ext], [ax_cv_have_sse3_ext=yes])
48 50 if test "$ax_cv_have_sse3_ext" = yes; then
49   - AX_CHECK_COMPILE_FLAG(-msse3, [SIMD_FLAGS="$SIMD_FLAGS -msse3 -DINTEL_SSE3"], [ax_cv_have_sse3_ext=no])
  51 + AX_CHECK_COMPILE_FLAG(-msse3, [SIMD_FLAGS="$SIMD_FLAGS -DINTEL_SSE3"], [ax_cv_have_sse3_ext=no])
  52 + AM_CONDITIONAL([HAVE_SSE3], [test "x$ax_cv_have_sse3_ext" = "xyes"])
50 53 fi
51 54  
52 55 AC_CACHE_CHECK([whether ssse3 is enabled], [ax_cv_have_ssse3_ext], [ax_cv_have_ssse3_ext=yes])
53 56 if test "$ax_cv_have_ssse3_ext" = yes; then
54   - AX_CHECK_COMPILE_FLAG(-mssse3, [SIMD_FLAGS="$SIMD_FLAGS -mssse3 -DINTEL_SSSE3"], [ax_cv_have_ssse3_ext=no])
  57 + AX_CHECK_COMPILE_FLAG(-mssse3, [SIMD_FLAGS="$SIMD_FLAGS -DINTEL_SSSE3"], [ax_cv_have_ssse3_ext=no])
  58 + AM_CONDITIONAL([HAVE_SSSE3], [test "x$ax_cv_have_ssse3_ext" = "xyes"])
55 59 fi
56 60  
57 61 AC_CACHE_CHECK([whether pclmuldq is enabled], [ax_cv_have_pclmuldq_ext], [ax_cv_have_pclmuldq_ext=yes])
58 62 if test "$ax_cv_have_pclmuldq_ext" = yes; then
59   - AX_CHECK_COMPILE_FLAG(-mpclmul, [SIMD_FLAGS="$SIMD_FLAGS -mpclmul -DINTEL_SSE4_PCLMUL"], [ax_cv_have_pclmuldq_ext=no])
  63 + AX_CHECK_COMPILE_FLAG(-mpclmul, [SIMD_FLAGS="$SIMD_FLAGS -DINTEL_SSE4_PCLMUL"], [ax_cv_have_pclmuldq_ext=no])
  64 + AM_CONDITIONAL([HAVE_SSE4_PCLMUL], [test "x$ax_cv_have_pclmuldq_ext" = "xyes"])
60 65 fi
61 66  
62 67 AC_CACHE_CHECK([whether sse4.1 is enabled], [ax_cv_have_sse41_ext], [ax_cv_have_sse41_ext=yes])
63 68 if test "$ax_cv_have_sse41_ext" = yes; then
64   - AX_CHECK_COMPILE_FLAG(-msse4.1, [SIMD_FLAGS="$SIMD_FLAGS -msse4.1 -DINTEL_SSE4"], [ax_cv_have_sse41_ext=no])
  69 + AX_CHECK_COMPILE_FLAG(-msse4.1, [SIMD_FLAGS="$SIMD_FLAGS -DINTEL_SSE4"], [ax_cv_have_sse41_ext=no])
  70 + AM_CONDITIONAL([HAVE_SSE41], [test "x$ax_cv_have_sse41_ext" = "xyes"])
65 71 fi
66 72  
67 73 AC_CACHE_CHECK([whether sse4.2 is enabled], [ax_cv_have_sse42_ext], [ax_cv_have_sse42_ext=yes])
68 74 if test "$ax_cv_have_sse42_ext" = yes; then
69   - AX_CHECK_COMPILE_FLAG(-msse4.2, [SIMD_FLAGS="$SIMD_FLAGS -msse4.2 -DINTEL_SSE4"], [ax_cv_have_sse42_ext=no])
  75 + AX_CHECK_COMPILE_FLAG(-msse4.2, [SIMD_FLAGS="$SIMD_FLAGS -DINTEL_SSE4"], [ax_cv_have_sse42_ext=no])
  76 + AM_CONDITIONAL([HAVE_SSE42], [test "x$ax_cv_have_sse42_ext" = "xyes"])
70 77 fi
71 78 ;;
72 79 esac
... ...
src/Makefile.am
... ... @@ -9,14 +9,44 @@ AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
9 9 # versions will use SIMD for the strcmp implementation. Instead
10 10 # we create a static library just for gf_method that is not compiled
11 11 # with SIMD_FLAGS, this static library will get linked into gf_complete.so
12   -noinst_LTLIBRARIES = libgf_util.la
  12 +
  13 +noinst_LTLIBRARIES = libgf_util.la libgf_sse2.la libgf_ssse3.la libgf_sse4.la libgf_sse4_pclmul.la
  14 +
13 15 libgf_util_la_SOURCES = gf_method.c
14 16 libgf_util_la_CFLAGS = -O3 -fPIC -Wsign-compare
15 17  
  18 +libgf_sse2_la_SOURCES = sse/gf_sse2.c \
  19 + sse/gf_w4_sse2.c \
  20 + sse/gf_w8_sse2.c \
  21 + sse/gf_w16_sse2.c \
  22 + sse/gf_w32_sse2.c \
  23 + sse/gf_w64_sse2.c
  24 +libgf_sse2_la_CFLAGS = -O3 -fPIC -Wsign-compare
  25 +
  26 +libgf_ssse3_la_SOURCES = sse/gf_w4_ssse3.c \
  27 + sse/gf_w8_ssse3.c \
  28 + sse/gf_w16_ssse3.c \
  29 + sse/gf_w32_ssse3.c \
  30 + sse/gf_w64_ssse3.c
  31 +libgf_ssse3_la_CFLAGS = -O3 -fPIC -Wsign-compare
  32 +
  33 +libgf_sse4_la_SOURCES = sse/gf_w64_sse4.c \
  34 + sse/gf_w128_sse4.c
  35 +libgf_sse4_la_CFLAGS = -O3 -fPIC -Wsign-compare
  36 +
  37 +libgf_sse4_pclmul_la_SOURCES = sse/gf_w4_sse4_pclmul.c \
  38 + sse/gf_w8_sse4_pclmul.c \
  39 + sse/gf_w16_sse4_pclmul.c \
  40 + sse/gf_w32_sse4_pclmul.c \
  41 + sse/gf_w64_sse4_pclmul.c \
  42 + sse/gf_w128_sse4_pclmul.c
  43 +libgf_sse4_pclmul_la_CFLAGS = -O3 -fPIC -Wsign-compare
  44 +
  45 +
16 46 # we narrowly use SIMD_FLAGS for code that needs it
17 47 lib_LTLIBRARIES = libgf_complete.la
18   -libgf_complete_la_SOURCES = gf.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \
19   - gf_w64.c gf_w128.c gf_rand.c gf_general.c gf_cpu.c
  48 +libgf_complete_la_SOURCES = gf_wgen.c gf_rand.c gf_general.c gf_cpu.c \
  49 + gf.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c gf_w64.c gf_w128.c
20 50 libgf_complete_la_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC -Wsign-compare
21 51 libgf_complete_la_LIBADD = libgf_util.la
22 52  
... ... @@ -28,5 +58,32 @@ libgf_complete_la_SOURCES += neon/gf_w4_neon.c \
28 58 neon/gf_w64_neon.c
29 59 endif
30 60  
  61 +
  62 +if HAVE_SSE2
  63 +libgf_sse2_la_CFLAGS += -DINTEL_SSE2 -msse2
  64 +libgf_complete_la_LIBADD += libgf_sse2.la
  65 +endif
  66 +
  67 +if HAVE_SSSE3
  68 +libgf_ssse3_la_CFLAGS += -DINTEL_SSSE3 -mssse3
  69 +libgf_complete_la_LIBADD += libgf_ssse3.la
  70 +endif
  71 +
  72 +if HAVE_SSE42
  73 +libgf_sse4_la_CFLAGS += -DINTEL_SSE4 -msse4.2
  74 +libgf_complete_la_LIBADD += libgf_sse4.la
  75 +else
  76 +if HAVE_SSE41
  77 +libgf_sse4_la_CFLAGS += -DINTEL_SSE4 -msse4.1
  78 +libgf_complete_la_LIBADD += libgf_sse4.la
  79 +endif
  80 +endif
  81 +
  82 +if HAVE_SSE4_PCLMUL
  83 +libgf_sse4_pclmul_la_CFLAGS += -DINTEL_SSE4_PCLMUL -DINTEL_SSE4 -mpclmul -msse4.2
  84 +libgf_complete_la_LIBADD += libgf_sse4_pclmul.la
  85 +endif
  86 +
  87 +
31 88 libgf_complete_la_LDFLAGS = -version-info 1:0:0
32 89  
... ...
src/gf.c
... ... @@ -927,49 +927,7 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor)
927 927  
928 928 #ifdef INTEL_SSE2
929 929 if (gf_cpu_supports_intel_sse2) {
930   - __m128i ms, md;
931   - int abytes;
932   - s8 = (uint8_t *) src;
933   - d8 = (uint8_t *) dest;
934   - if (uls % 16 == uld % 16) {
935   - gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
936   - while (s8 != rd.s_start) {
937   - *d8 ^= *s8;
938   - d8++;
939   - s8++;
940   - }
941   - while (s8 < (uint8_t *) rd.s_top) {
942   - ms = _mm_load_si128 ((__m128i *)(s8));
943   - md = _mm_load_si128 ((__m128i *)(d8));
944   - md = _mm_xor_si128(md, ms);
945   - _mm_store_si128((__m128i *)(d8), md);
946   - s8 += 16;
947   - d8 += 16;
948   - }
949   - while (s8 != (uint8_t *) src + bytes) {
950   - *d8 ^= *s8;
951   - d8++;
952   - s8++;
953   - }
954   - return;
955   - }
956   -
957   - abytes = (bytes & 0xfffffff0);
958   -
959   - while (d8 < (uint8_t *) dest + abytes) {
960   - ms = _mm_loadu_si128 ((__m128i *)(s8));
961   - md = _mm_loadu_si128 ((__m128i *)(d8));
962   - md = _mm_xor_si128(md, ms);
963   - _mm_storeu_si128((__m128i *)(d8), md);
964   - s8 += 16;
965   - d8 += 16;
966   - }
967   - while (d8 != (uint8_t *) dest+bytes) {
968   - *d8 ^= *s8;
969   - d8++;
970   - s8++;
971   - }
972   - return;
  930 + return gf_sse2_multby_one(src, dest, bytes, xor);
973 931 }
974 932 #endif
975 933 #if defined(ARM_NEON)
... ...
src/gf_w128.c
... ... @@ -11,10 +11,9 @@
11 11 #include "gf_int.h"
12 12 #include <stdio.h>
13 13 #include <stdlib.h>
  14 +#include "gf_w128.h"
14 15 #include "gf_cpu.h"
15 16  
16   -#define GF_FIELD_WIDTH (128)
17   -
18 17 #define two_x(a) {\
19 18 a[0] <<= 1; \
20 19 if (a[1] & 1ULL << 63) a[0] ^= 1; \
... ... @@ -28,24 +27,9 @@
28 27 a[i] = 0; \
29 28 a[i + 1] = 0;}
30 29  
31   -struct gf_w128_split_4_128_data {
32   - uint64_t last_value[2];
33   - uint64_t tables[2][32][16];
34   -};
35   -
36   -struct gf_w128_split_8_128_data {
37   - uint64_t last_value[2];
38   - uint64_t tables[2][16][256];
39   -};
40   -
41   -typedef struct gf_group_tables_s {
42   - gf_val_128_t m_table;
43   - gf_val_128_t r_table;
44   -} gf_group_tables_t;
45 30  
46 31 #define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? " " : " ", blah[15-ii]); printf("\n"); }
47 32  
48   -static
49 33 void
50 34 gf_w128_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes,
51 35 int xor)
... ... @@ -82,109 +66,6 @@ int xor)
82 66 }
83 67 }
84 68  
85   -#if defined(INTEL_SSE4_PCLMUL)
86   -static
87   -void
88   -gf_w128_clm_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes,
89   -int xor)
90   -{
91   - uint32_t i;
92   - gf_val_128_t s128;
93   - gf_val_128_t d128;
94   - gf_region_data rd;
95   - __m128i a,b;
96   - __m128i result0,result1;
97   - __m128i prim_poly;
98   - __m128i c,d,e,f;
99   - gf_internal_t * h = gf->scratch;
100   - prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);
101   - /* We only do this to check on alignment. */
102   - gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 8);
103   -
104   - if (val[0] == 0) {
105   - if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
106   - if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
107   - }
108   -
109   - s128 = (gf_val_128_t) src;
110   - d128 = (gf_val_128_t) dest;
111   -
112   - if (xor) {
113   - for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
114   - a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0);
115   - b = _mm_insert_epi64 (a, val[1], 0);
116   - a = _mm_insert_epi64 (a, s128[i], 1);
117   - b = _mm_insert_epi64 (b, val[0], 1);
118   -
119   - c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
120   - f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
121   - e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/
122   - d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/
123   -
124   - /* now reusing a and b as temporary variables*/
125   - result0 = _mm_setzero_si128();
126   - result1 = result0;
127   -
128   - result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0));
129   - a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1));
130   - result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a));
131   -
132   - a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0));
133   - result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a));
134   - result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1));
135   - /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce. */
136   -
137   - a = _mm_srli_si128 (result0, 8);
138   - b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
139   - result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8));
140   - result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8));
141   -
142   - a = _mm_insert_epi64 (result0, 0, 1);
143   - b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
144   - result1 = _mm_xor_si128 (result1, b);
145   - d128[i] ^= (uint64_t)_mm_extract_epi64(result1,1);
146   - d128[i+1] ^= (uint64_t)_mm_extract_epi64(result1,0);
147   - }
148   - } else {
149   - for (i = 0; i < bytes/sizeof(gf_val_64_t); i += 2) {
150   - a = _mm_insert_epi64 (_mm_setzero_si128(), s128[i+1], 0);
151   - b = _mm_insert_epi64 (a, val[1], 0);
152   - a = _mm_insert_epi64 (a, s128[i], 1);
153   - b = _mm_insert_epi64 (b, val[0], 1);
154   -
155   - c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
156   - f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
157   - e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/
158   - d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/
159   -
160   - /* now reusing a and b as temporary variables*/
161   - result0 = _mm_setzero_si128();
162   - result1 = result0;
163   -
164   - result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0));
165   - a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1));
166   - result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a));
167   -
168   - a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0));
169   - result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a));
170   - result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1));
171   - /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce.*/
172   -
173   - a = _mm_srli_si128 (result0, 8);
174   - b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
175   - result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8));
176   - result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8));
177   -
178   - a = _mm_insert_epi64 (result0, 0, 1);
179   - b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
180   - result1 = _mm_xor_si128 (result1, b);
181   - d128[i] = (uint64_t)_mm_extract_epi64(result1,1);
182   - d128[i+1] = (uint64_t)_mm_extract_epi64(result1,0);
183   - }
184   - }
185   -}
186   -#endif
187   -
188 69 /*
189 70 * Some w128 notes:
190 71 * --Big Endian
... ... @@ -291,57 +172,6 @@ gf_w128_shift_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_12
291 172 return;
292 173 }
293 174  
294   -#if defined(INTEL_SSE4_PCLMUL)
295   -
296   -void
297   -gf_w128_clm_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
298   -{
299   - __m128i a,b;
300   - __m128i result0,result1;
301   - __m128i prim_poly;
302   - __m128i c,d,e,f;
303   - gf_internal_t * h = gf->scratch;
304   -
305   - a = _mm_insert_epi64 (_mm_setzero_si128(), a128[1], 0);
306   - b = _mm_insert_epi64 (a, b128[1], 0);
307   - a = _mm_insert_epi64 (a, a128[0], 1);
308   - b = _mm_insert_epi64 (b, b128[0], 1);
309   -
310   - prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);
311   -
312   - /* we need to test algorithm 2 later*/
313   - c = _mm_clmulepi64_si128 (a, b, 0x00); /*low-low*/
314   - f = _mm_clmulepi64_si128 (a, b, 0x01); /*high-low*/
315   - e = _mm_clmulepi64_si128 (a, b, 0x10); /*low-high*/
316   - d = _mm_clmulepi64_si128 (a, b, 0x11); /*high-high*/
317   -
318   - /* now reusing a and b as temporary variables*/
319   - result0 = _mm_setzero_si128();
320   - result1 = result0;
321   -
322   - result0 = _mm_xor_si128 (result0, _mm_insert_epi64 (d, 0, 0));
323   - a = _mm_xor_si128 (_mm_srli_si128 (e, 8), _mm_insert_epi64 (d, 0, 1));
324   - result0 = _mm_xor_si128 (result0, _mm_xor_si128 (_mm_srli_si128 (f, 8), a));
325   -
326   - a = _mm_xor_si128 (_mm_slli_si128 (e, 8), _mm_insert_epi64 (c, 0, 0));
327   - result1 = _mm_xor_si128 (result1, _mm_xor_si128 (_mm_slli_si128 (f, 8), a));
328   - result1 = _mm_xor_si128 (result1, _mm_insert_epi64 (c, 0, 1));
329   - /* now we have constructed our 'result' with result0 being the carry bits, and we have to reduce.*/
330   -
331   - a = _mm_srli_si128 (result0, 8);
332   - b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
333   - result0 = _mm_xor_si128 (result0, _mm_srli_si128 (b, 8));
334   - result1 = _mm_xor_si128 (result1, _mm_slli_si128 (b, 8));
335   -
336   - a = _mm_insert_epi64 (result0, 0, 1);
337   - b = _mm_clmulepi64_si128 (a, prim_poly, 0x00);
338   - result1 = _mm_xor_si128 (result1, b);
339   -
340   - c128[0] = (uint64_t)_mm_extract_epi64(result1,1);
341   - c128[1] = (uint64_t)_mm_extract_epi64(result1,0);
342   -}
343   -#endif
344   -
345 175 void
346 176 gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
347 177 {
... ... @@ -376,104 +206,6 @@ gf_w128_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_
376 206 return;
377 207 }
378 208  
379   -#if defined(INTEL_SSE4)
380   -void
381   -gf_w128_sse_bytwo_p_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
382   -{
383   - int i;
384   - __m128i a, b, pp, prod, amask, u_middle_one;
385   - /*John: pmask is always the highest bit set, and the rest zeros. amask changes, it's a countdown.*/
386   - uint32_t topbit, middlebit, pmask; /* this is used as a boolean value */
387   - gf_internal_t *h;
388   -
389   -
390   - h = (gf_internal_t *) gf->scratch;
391   - pp = _mm_set_epi32(0, 0, 0, (uint32_t)h->prim_poly);
392   - prod = _mm_setzero_si128();
393   - a = _mm_insert_epi64(prod, a128[1], 0x0);
394   - a = _mm_insert_epi64(a, a128[0], 0x1);
395   - b = _mm_insert_epi64(prod, b128[1], 0x0);
396   - b = _mm_insert_epi64(b, b128[0], 0x1);
397   - pmask = 0x80000000;
398   - amask = _mm_insert_epi32(prod, 0x80000000, 0x3);
399   - u_middle_one = _mm_insert_epi32(prod, 1, 0x2);
400   -
401   - for (i = 0; i < 64; i++) {
402   - topbit = (_mm_extract_epi32(prod, 0x3) & pmask);
403   - middlebit = (_mm_extract_epi32(prod, 0x1) & pmask);
404   - prod = _mm_slli_epi64(prod, 1); /* this instruction loses the middle bit */
405   - if (middlebit) {
406   - prod = _mm_xor_si128(prod, u_middle_one);
407   - }
408   - if (topbit) {
409   - prod = _mm_xor_si128(prod, pp);
410   - }
411   - if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 1))) {
412   - prod = _mm_xor_si128(prod, b);
413   - }
414   - amask = _mm_srli_epi64(amask, 1); /*so does this one, but we can just replace after loop*/
415   - }
416   - amask = _mm_insert_epi32(amask, (gf_val_32_t)1 << 31, 0x1);
417   - for (i = 64; i < 128; i++) {
418   - topbit = (_mm_extract_epi32(prod, 0x3) & pmask);
419   - middlebit = (_mm_extract_epi32(prod, 0x1) & pmask);
420   - prod = _mm_slli_epi64(prod, 1);
421   - if (middlebit) prod = _mm_xor_si128(prod, u_middle_one);
422   - if (topbit) prod = _mm_xor_si128(prod, pp);
423   - if (((uint64_t)_mm_extract_epi64(_mm_and_si128(a, amask), 0))) {
424   - prod = _mm_xor_si128(prod, b);
425   - }
426   - amask = _mm_srli_epi64(amask, 1);
427   - }
428   - c128[0] = (uint64_t)_mm_extract_epi64(prod, 1);
429   - c128[1] = (uint64_t)_mm_extract_epi64(prod, 0);
430   - return;
431   -}
432   -#endif
433   -
434   -
435   -/* Ben: This slow function implements sse instrutions for bytwo_b because why not */
436   -#if defined(INTEL_SSE4)
437   -void
438   -gf_w128_sse_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
439   -{
440   - __m128i a, b, lmask, hmask, pp, c, middle_one;
441   - gf_internal_t *h;
442   - uint64_t topbit, middlebit;
443   -
444   - h = (gf_internal_t *) gf->scratch;
445   -
446   - c = _mm_setzero_si128();
447   - lmask = _mm_insert_epi64(c, 1ULL << 63, 0);
448   - hmask = _mm_insert_epi64(c, 1ULL << 63, 1);
449   - b = _mm_insert_epi64(c, a128[0], 1);
450   - b = _mm_insert_epi64(b, a128[1], 0);
451   - a = _mm_insert_epi64(c, b128[0], 1);
452   - a = _mm_insert_epi64(a, b128[1], 0);
453   - pp = _mm_insert_epi64(c, h->prim_poly, 0);
454   - middle_one = _mm_insert_epi64(c, 1, 0x1);
455   -
456   - while (1) {
457   - if (_mm_extract_epi32(a, 0x0) & 1) {
458   - c = _mm_xor_si128(c, b);
459   - }
460   - middlebit = (_mm_extract_epi32(a, 0x2) & 1);
461   - a = _mm_srli_epi64(a, 1);
462   - if (middlebit) a = _mm_xor_si128(a, lmask);
463   - if ((_mm_extract_epi64(a, 0x1) == 0ULL) && (_mm_extract_epi64(a, 0x0) == 0ULL)){
464   - c128[0] = _mm_extract_epi64(c, 0x1);
465   - c128[1] = _mm_extract_epi64(c, 0x0);
466   - return;
467   - }
468   - topbit = (_mm_extract_epi64(_mm_and_si128(b, hmask), 1));
469   - middlebit = (_mm_extract_epi64(_mm_and_si128(b, lmask), 0));
470   - b = _mm_slli_epi64(b, 1);
471   - if (middlebit) b = _mm_xor_si128(b, middle_one);
472   - if (topbit) b = _mm_xor_si128(b, pp);
473   - }
474   -}
475   -#endif
476   -
477 209 void
478 210 gf_w128_bytwo_b_multiply(gf_t *gf, gf_val_128_t a128, gf_val_128_t b128, gf_val_128_t c128)
479 211 {
... ... @@ -594,213 +326,6 @@ gf_w128_split_4_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_
594 326 }
595 327 }
596 328  
597   -#if defined(INTEL_SSSE3) && defined(INTEL_SSE4)
598   -static
599   -void
600   -gf_w128_split_4_128_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
601   -{
602   - gf_internal_t *h;
603   - int i, j, k;
604   - uint64_t pp, v[2], s, *s64, *d64, *top;
605   - __m128i p, tables[32][16];
606   - struct gf_w128_split_4_128_data *ld;
607   - gf_region_data rd;
608   -
609   - if (val[0] == 0) {
610   - if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
611   - if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
612   - }
613   -
614   - h = (gf_internal_t *) gf->scratch;
615   -
616   - /* We only do this to check on alignment. */
617   - gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 16);
618   -
619   - /* Doing this instead of gf_do_initial_region_alignment() because that doesn't hold 128-bit vals */
620   -
621   - gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor);
622   -
623   - s64 = (uint64_t *) rd.s_start;
624   - d64 = (uint64_t *) rd.d_start;
625   - top = (uint64_t *) rd.d_top;
626   -
627   - ld = (struct gf_w128_split_4_128_data *) h->private;
628   -
629   - if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {
630   - v[0] = val[0];
631   - v[1] = val[1];
632   - for (i = 0; i < 32; i++) {
633   - ld->tables[0][i][0] = 0;
634   - ld->tables[1][i][0] = 0;
635   - for (j = 1; j < 16; j <<= 1) {
636   - for (k = 0; k < j; k++) {
637   - ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
638   - ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);
639   - }
640   - pp = (v[0] & (1ULL << 63));
641   - v[0] <<= 1;
642   - if (v[1] & (1ULL << 63)) v[0] ^= 1;
643   - v[1] <<= 1;
644   - if (pp) v[1] ^= h->prim_poly;
645   - }
646   - }
647   - }
648   -
649   - ld->last_value[0] = val[0];
650   - ld->last_value[1] = val[1];
651   -
652   - for (i = 0; i < 32; i++) {
653   - for (j = 0; j < 16; j++) {
654   - v[0] = ld->tables[0][i][j];
655   - v[1] = ld->tables[1][i][j];
656   - tables[i][j] = _mm_loadu_si128((__m128i *) v);
657   -
658   -/*
659   - printf("%2d %2d: ", i, j);
660   - MM_PRINT8("", tables[i][j]); */
661   - }
662   - }
663   -
664   - while (d64 != top) {
665   -
666   - if (xor) {
667   - p = _mm_load_si128 ((__m128i *) d64);
668   - } else {
669   - p = _mm_setzero_si128();
670   - }
671   - s = *s64;
672   - s64++;
673   - for (i = 0; i < 16; i++) {
674   - j = (s&0xf);
675   - s >>= 4;
676   - p = _mm_xor_si128(p, tables[16+i][j]);
677   - }
678   - s = *s64;
679   - s64++;
680   - for (i = 0; i < 16; i++) {
681   - j = (s&0xf);
682   - s >>= 4;
683   - p = _mm_xor_si128(p, tables[i][j]);
684   - }
685   - _mm_store_si128((__m128i *) d64, p);
686   - d64 += 2;
687   - }
688   -
689   - /* Doing this instead of gf_do_final_region_alignment() because that doesn't hold 128-bit vals */
690   -
691   - gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor);
692   -}
693   -#endif
694   -
695   -#if defined(INTEL_SSSE3) && defined(INTEL_SSE4)
696   -static
697   -void
698   -gf_w128_split_4_128_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
699   -{
700   - gf_internal_t *h;
701   - int i, j, k;
702   - uint64_t pp, v[2], *s64, *d64, *top;
703   - __m128i si, tables[32][16], p[16], v0, mask1;
704   - struct gf_w128_split_4_128_data *ld;
705   - uint8_t btable[16];
706   - gf_region_data rd;
707   -
708   - if (val[0] == 0) {
709   - if (val[1] == 0) { gf_multby_zero(dest, bytes, xor); return; }
710   - if (val[1] == 1) { gf_multby_one(src, dest, bytes, xor); return; }
711   - }
712   -
713   - h = (gf_internal_t *) gf->scratch;
714   -
715   - /* We only do this to check on alignment. */
716   - gf_set_region_data(&rd, gf, src, dest, bytes, 0, xor, 256);
717   -
718   - /* Doing this instead of gf_do_initial_region_alignment() because that doesn't hold 128-bit vals */
719   -
720   - gf_w128_multiply_region_from_single(gf, src, dest, val, ((uint8_t *)rd.s_start-(uint8_t *)src), xor);
721   -
722   - s64 = (uint64_t *) rd.s_start;
723   - d64 = (uint64_t *) rd.d_start;
724   - top = (uint64_t *) rd.d_top;
725   -
726   - ld = (struct gf_w128_split_4_128_data *) h->private;
727   -
728   - if (val[0] != ld->last_value[0] || val[1] != ld->last_value[1]) {
729   - v[0] = val[0];
730   - v[1] = val[1];
731   - for (i = 0; i < 32; i++) {
732   - ld->tables[0][i][0] = 0;
733   - ld->tables[1][i][0] = 0;
734   - for (j = 1; j < 16; j <<= 1) {
735   - for (k = 0; k < j; k++) {
736   - ld->tables[0][i][k^j] = (v[0] ^ ld->tables[0][i][k]);
737   - ld->tables[1][i][k^j] = (v[1] ^ ld->tables[1][i][k]);
738   - }
739   - pp = (v[0] & (1ULL << 63));
740   - v[0] <<= 1;
741   - if (v[1] & (1ULL << 63)) v[0] ^= 1;
742   - v[1] <<= 1;
743   - if (pp) v[1] ^= h->prim_poly;
744   - }
745   - }
746   - }
747   -
748   - ld->last_value[0] = val[0];
749   - ld->last_value[1] = val[1];
750   -
751   - for (i = 0; i < 32; i++) {
752   - for (j = 0; j < 16; j++) {
753   - for (k = 0; k < 16; k++) {
754   - btable[k] = (uint8_t) ld->tables[1-(j/8)][i][k];
755   - ld->tables[1-(j/8)][i][k] >>= 8;
756   - }
757   - tables[i][j] = _mm_loadu_si128((__m128i *) btable);
758   -/*
759   - printf("%2d %2d: ", i, j);
760   - MM_PRINT8("", tables[i][j]);
761   - */
762   - }
763   - }
764   -
765   -
766   - mask1 = _mm_set1_epi8(0xf);
767   -
768   - while (d64 != top) {
769   -
770   - if (xor) {
771   - for (i = 0; i < 16; i++) p[i] = _mm_load_si128 ((__m128i *) (d64+i*2));
772   - } else {
773   - for (i = 0; i < 16; i++) p[i] = _mm_setzero_si128();
774   - }
775   - i = 0;
776   - for (k = 0; k < 16; k++) {
777   - v0 = _mm_load_si128((__m128i *) s64);
778   - s64 += 2;
779   -
780   - si = _mm_and_si128(v0, mask1);
781   -
782   - for (j = 0; j < 16; j++) {
783   - p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
784   - }
785   - i++;
786   - v0 = _mm_srli_epi32(v0, 4);
787   - si = _mm_and_si128(v0, mask1);
788   - for (j = 0; j < 16; j++) {
789   - p[j] = _mm_xor_si128(p[j], _mm_shuffle_epi8(tables[i][j], si));
790   - }
791   - i++;
792   - }
793   - for (i = 0; i < 16; i++) {
794   - _mm_store_si128((__m128i *) d64, p[i]);
795   - d64 += 2;
796   - }
797   - }
798   - /* Doing this instead of gf_do_final_region_alignment() because that doesn't hold 128-bit vals */
799   -
800   - gf_w128_multiply_region_from_single(gf, rd.s_top, rd.d_top, val, ((uint8_t *)src+bytes)-(uint8_t *)rd.s_top, xor);
801   -}
802   -#endif
803   -
804 329 static
805 330 void
806 331 gf_w128_split_8_128_multiply_region(gf_t *gf, void *src, void *dest, gf_val_128_t val, int bytes, int xor)
... ... @@ -1423,8 +948,8 @@ int gf_w128_cfm_init(gf_t *gf)
1423 948 #if defined(INTEL_SSE4_PCLMUL)
1424 949 if (gf_cpu_supports_intel_pclmul) {
1425 950 SET_FUNCTION(gf,inverse,w128,gf_w128_euclid)
1426   - SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply)
1427   - SET_FUNCTION(gf,multiply_region,w128,gf_w128_clm_multiply_region_from_single)
  951 + gf_w128_sse4_pclmul_clm_init(gf);
  952 + gf_w128_sse4_pclmul_clm_region_init(gf);
1428 953 return 1;
1429 954 }
1430 955 #endif
... ... @@ -1490,34 +1015,6 @@ void gf_w128_group_r_init(gf_t *gf)
1490 1015 return;
1491 1016 }
1492 1017  
1493   -#if 0 // defined(INTEL_SSE4)
1494   - static
1495   -void gf_w128_group_r_sse_init(gf_t *gf)
1496   -{
1497   - int i, j;
1498   - int g_r;
1499   - uint64_t pp;
1500   - gf_internal_t *scratch;
1501   - gf_group_tables_t *gt;
1502   - scratch = (gf_internal_t *) gf->scratch;
1503   - gt = scratch->private;
1504   - __m128i zero = _mm_setzero_si128();
1505   - __m128i *table = (__m128i *)(gt->r_table);
1506   - g_r = scratch->arg2;
1507   - pp = scratch->prim_poly;
1508   - table[0] = zero;
1509   - for (i = 1; i < (1 << g_r); i++) {
1510   - table[i] = zero;
1511   - for (j = 0; j < g_r; j++) {
1512   - if (i & (1 << j)) {
1513   - table[i] = _mm_xor_si128(table[i], _mm_insert_epi64(zero, pp << j, 0));
1514   - }
1515   - }
1516   - }
1517   - return;
1518   -}
1519   -#endif
1520   -
1521 1018 static
1522 1019 int gf_w128_split_init(gf_t *gf)
1523 1020 {
... ... @@ -1530,7 +1027,7 @@ int gf_w128_split_init(gf_t *gf)
1530 1027 SET_FUNCTION(gf,multiply,w128,gf_w128_bytwo_p_multiply)
1531 1028 #if defined(INTEL_SSE4_PCLMUL)
1532 1029 if (gf_cpu_supports_intel_pclmul && !(h->region_type & GF_REGION_NOSIMD)){
1533   - SET_FUNCTION(gf,multiply,w128,gf_w128_clm_multiply)
  1030 + gf_w128_sse4_pclmul_clm_init(gf);
1534 1031 }
1535 1032 #endif
1536 1033  
... ... @@ -1549,7 +1046,7 @@ int gf_w128_split_init(gf_t *gf)
1549 1046 {
1550 1047 #ifdef INTEL_SSE4
1551 1048 if(gf_cpu_supports_intel_sse4 && !(h->region_type & GF_REGION_NOSIMD))
1552   - SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_sse_altmap_multiply_region)
  1049 + gf_w128_sse4_split_init(gf);
1553 1050 else
1554 1051 #endif
1555 1052 return 0;
... ... @@ -1557,7 +1054,7 @@ int gf_w128_split_init(gf_t *gf)
1557 1054 else {
1558 1055 #ifdef INTEL_SSE4
1559 1056 if(gf_cpu_supports_intel_sse4 && !(h->region_type & GF_REGION_NOSIMD))
1560   - SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_sse_multiply_region)
  1057 + gf_w128_sse4_split_init(gf);
1561 1058 else
1562 1059 #endif
1563 1060 SET_FUNCTION(gf,multiply_region,w128,gf_w128_split_4_128_multiply_region)
... ...
src/gf_w16.c
... ... @@ -20,12 +20,6 @@
20 20 t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \
21 21 b = (t1 ^ (t2 & ip));}
22 22  
23   -#define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\
24   - t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \
25   - t2 = _mm_and_si128(va, m2); \
26   - t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \
27   - va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); }
28   -
29 23 #define MM_PRINT(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 2) printf(" %02x %02x", blah[15-ii], blah[14-ii]); printf("\n"); }
30 24  
31 25 #define GF_FIRST_BIT (1 << 15)
... ... @@ -79,209 +73,6 @@ gf_w16_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t
79 73 gf_do_final_region_alignment(&rd);
80 74 }
81 75  
82   -#if defined(INTEL_SSE4_PCLMUL)
83   -static
84   -void
85   -gf_w16_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
86   -{
87   - gf_region_data rd;
88   - uint16_t *s16;
89   - uint16_t *d16;
90   - __m128i a, b;
91   - __m128i result;
92   - __m128i prim_poly;
93   - __m128i w;
94   - gf_internal_t * h = gf->scratch;
95   - prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL));
96   -
97   - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
98   - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
99   -
100   - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
101   - gf_do_initial_region_alignment(&rd);
102   -
103   - a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
104   -
105   - s16 = (uint16_t *) rd.s_start;
106   - d16 = (uint16_t *) rd.d_start;
107   -
108   - if (xor) {
109   - while (d16 < ((uint16_t *) rd.d_top)) {
110   -
111   - /* see gf_w16_clm_multiply() to see explanation of method */
112   -
113   - b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0);
114   - result = _mm_clmulepi64_si128 (a, b, 0);
115   - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
116   - result = _mm_xor_si128 (result, w);
117   - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
118   - result = _mm_xor_si128 (result, w);
119   -
120   - *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
121   - d16++;
122   - s16++;
123   - }
124   - } else {
125   - while (d16 < ((uint16_t *) rd.d_top)) {
126   -
127   - /* see gf_w16_clm_multiply() to see explanation of method */
128   -
129   - b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0);
130   - result = _mm_clmulepi64_si128 (a, b, 0);
131   - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
132   - result = _mm_xor_si128 (result, w);
133   - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
134   - result = _mm_xor_si128 (result, w);
135   -
136   - *d16 = ((gf_val_32_t)_mm_extract_epi32(result, 0));
137   - d16++;
138   - s16++;
139   - }
140   - }
141   - gf_do_final_region_alignment(&rd);
142   -}
143   -#endif
144   -
145   -#if defined(INTEL_SSE4_PCLMUL)
146   -static
147   -void
148   -gf_w16_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
149   -{
150   - gf_region_data rd;
151   - uint16_t *s16;
152   - uint16_t *d16;
153   -
154   - __m128i a, b;
155   - __m128i result;
156   - __m128i prim_poly;
157   - __m128i w;
158   - gf_internal_t * h = gf->scratch;
159   - prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffffULL));
160   -
161   - if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
162   - if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
163   -
164   - a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
165   -
166   - gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 2);
167   - gf_do_initial_region_alignment(&rd);
168   -
169   - s16 = (uint16_t *) rd.s_start;
170   - d16 = (uint16_t *) rd.d_start;
171   -
172   - if (xor) {
173   - while (d16 < ((uint16_t *) rd.d_top)) {
174   -
175   - /* see gf_w16_clm_multiply() to see explanation of method */
176   -
177   - b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0);
178   - result = _mm_clmulepi64_si128 (a, b, 0);
179   - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
180   - result = _mm_xor_si128 (result, w);
181   - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
182   - result = _mm_xor_si128 (result, w);
183   - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
184   - result = _mm_xor_si128 (result, w);
185   -
186   - *d16 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
187   - d16++;
188   - s16++;
189   - }
190   - } else {
191   - while (d16 < ((uint16_t *) rd.d_top)) {
192   -
193   - /* see gf_w16_clm_multiply() to see explanation of method */
194   -
195   - b = _mm_insert_epi32 (a, (gf_val_32_t)(*s16), 0);
196   - result = _mm_clmulepi64_si128 (a, b, 0);
197   - w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 2), 0);
198