Commit 70dd94ae38f2d20dd78532a6dfd1310fdfb4a884

Authored by KMG
2 parents 62d4b81a 6fdd8bc3
Exists in master and in 1 other branch v2

Merged in jannau/gf-complete/neon (pull request #25)

arm neon optimisations
configure.ac
... ... @@ -3,9 +3,12 @@
3 3 # FIXME - add project url as the last argument
4 4 AC_INIT(gf-complete, 1.0)
5 5  
  6 +# Override default CFLAGS
  7 +: ${CFLAGS="-Wall -Wpointer-arith -O3 -g"}
  8 +
6 9 AC_PREREQ([2.61])
7 10  
8   -AM_INIT_AUTOMAKE([no-dependencies foreign])
  11 +AM_INIT_AUTOMAKE([no-dependencies foreign parallel-tests])
9 12 LT_INIT # libtool
10 13  
11 14 AC_CONFIG_HEADER(include/config.h)
... ... @@ -16,14 +19,39 @@ AC_CONFIG_MACRO_DIR([m4])
16 19 # This prevents './configure; make' from trying to run autotools.
17 20 AM_MAINTAINER_MODE([disable])
18 21  
19   -# Override default CFLAGS
20   -CFLAGS="-Wall -Wpointer-arith -O3 -g"
21   -
22 22 dnl Compiling with per-target flags requires AM_PROG_CC_C_O.
23 23 AC_PROG_CC
24 24  
  25 +# Check for functions to provide aligned memory
  26 +#
  27 +AC_CHECK_FUNCS([posix_memalign],
  28 + [found_memalign=yes; break])
  29 +
  30 +AS_IF([test "x$found_memalign" != "xyes"], [AC_MSG_WARN([No function for aligned memory allocation found])])
  31 +
25 32 AX_EXT()
26 33  
  34 +AC_ARG_ENABLE([neon],
  35 + AS_HELP_STRING([--disable-neon], [Build without NEON optimizations]))
  36 +
  37 +AS_IF([test "x$enable_neon" != "xno"],
  38 + [noneon_CPPFLAGS=$CPPFLAGS
  39 + CPPFLAGS="$CPPFLAGS $SIMD_FLAGS"
  40 + AC_CHECK_HEADER([arm_neon.h],
  41 + [have_neon=yes],
  42 + [have_neon=no
  43 + CPPFLAGS=$noneon_CPPFLAGS])],
  44 + [have_neon=no
  45 + AS_IF([test "x$ax_cv_have_neon_ext" = "xyes"],
  46 + [SIMD_FLAGS=""])
  47 + ])
  48 +
  49 +AS_IF([test "x$have_neon" = "xno"],
  50 + [AS_IF([test "x$enable_neon" = "xyes"],
  51 + [AC_MSG_ERROR([neon requested but arm_neon.h not found])])
  52 + ])
  53 +AM_CONDITIONAL([HAVE_NEON], [test "x$have_neon" = "xyes"])
  54 +
27 55 AC_ARG_ENABLE([sse],
28 56 AS_HELP_STRING([--disable-sse], [Build without SSE optimizations]),
29 57 [if test "x$enableval" = "xno" ; then
... ...
examples/Makefile.am
1 1 # GF-Complete 'examples' AM file
2 2  
3   -AM_CPPFLAGS=-I./ -I../include
4   -AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC $(INCLUDES)
  3 +AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
  4 +AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC
5 5  
6 6 bin_PROGRAMS = gf_example_1 gf_example_2 gf_example_3 gf_example_4 \
7 7 gf_example_5 gf_example_6 gf_example_7
... ...
include/gf_complete.h
... ... @@ -33,6 +33,10 @@
33 33 #include <wmmintrin.h>
34 34 #endif
35 35  
  36 +#if defined(ARM_NEON)
  37 + #include <arm_neon.h>
  38 +#endif
  39 +
36 40  
37 41 /* These are the different ways to perform multiplication.
38 42 Not all are implemented for all values of w.
... ... @@ -61,7 +65,9 @@ typedef enum {GF_MULT_DEFAULT,
61 65 #define GF_REGION_DOUBLE_TABLE (0x1)
62 66 #define GF_REGION_QUAD_TABLE (0x2)
63 67 #define GF_REGION_LAZY (0x4)
  68 +#define GF_REGION_SIMD (0x8)
64 69 #define GF_REGION_SSE (0x8)
  70 +#define GF_REGION_NOSIMD (0x10)
65 71 #define GF_REGION_NOSSE (0x10)
66 72 #define GF_REGION_ALTMAP (0x20)
67 73 #define GF_REGION_CAUCHY (0x40)
... ...
include/gf_int.h
... ... @@ -113,7 +113,7 @@ typedef enum {GF_E_MDEFDIV, /* Div != Default && Mult == Default */
113 113 GF_E_DIVCOMP, /* Mult == Composite && Div != Default */
114 114 GF_E_CAUCOMP, /* Mult == Composite && Reg == CAUCHY */
115 115 GF_E_DOUQUAD, /* Reg == DOUBLE && Reg == QUAD */
116   - GF_E_SSE__NO, /* Reg == SSE && Reg == NOSSE */
  116 + GF_E_SIMD_NO, /* Reg == SIMD && Reg == NOSIMD */
117 117 GF_E_CAUCHYB, /* Reg == CAUCHY && Other Reg */
118 118 GF_E_CAUGT32, /* Reg == CAUCHY && w > 32*/
119 119 GF_E_ARG1SET, /* Arg1 != 0 && Mult \notin COMPOSITE/SPLIT/GROUP */
... ... @@ -129,9 +129,9 @@ typedef enum {GF_E_MDEFDIV, /* Div != Default && Mult == Default */
129 129 GF_E_QUAD__J, /* Reg == QUAD && other Reg */
130 130 GF_E_LAZY__X, /* Reg == LAZY && not DOUBLE or QUAD*/
131 131 GF_E_ALTSHIF, /* Mult == Shift && Reg == ALTMAP */
132   - GF_E_SSESHIF, /* Mult == Shift && Reg == SSE|NOSSE */
  132 + GF_E_SSESHIF, /* Mult == Shift && Reg == SIMD|NOSIMD */
133 133 GF_E_ALT_CFM, /* Mult == CARRY_FREE && Reg == ALTMAP */
134   - GF_E_SSE_CFM, /* Mult == CARRY_FREE && Reg == SSE|NOSSE */
  134 + GF_E_SSE_CFM, /* Mult == CARRY_FREE && Reg == SIMD|NOSIMD */
135 135 GF_E_PCLMULX, /* Mult == Carry_Free && No PCLMUL */
136 136 GF_E_ALT_BY2, /* Mult == Bytwo_x && Reg == ALTMAP */
137 137 GF_E_BY2_SSE, /* Mult == Bytwo_x && Reg == SSE && No SSE2 */
... ... @@ -148,7 +148,7 @@ typedef enum {GF_E_MDEFDIV, /* Div != Default && Mult == Default */
148 148 GF_E_GR_AR_W, /* Mult == GROUP, either arg > w */
149 149 GF_E_GR____J, /* Mult == GROUP, Reg == SSE|ALTMAP|NOSSE */
150 150 GF_E_TABLE_W, /* Mult == TABLE, w too big */
151   - GF_E_TAB_SSE, /* Mult == TABLE, SSE|NOSSE only apply to w == 4 */
  151 + GF_E_TAB_SSE, /* Mult == TABLE, SIMD|NOSIMD only apply to w == 4 */
152 152 GF_E_TABSSE3, /* Mult == TABLE, Need SSSE3 for SSE */
153 153 GF_E_TAB_ALT, /* Mult == TABLE, Reg == ALTMAP */
154 154 GF_E_SP128AR, /* Mult == SPLIT, w=128, Bad arg1/arg2 */
... ... @@ -172,7 +172,7 @@ typedef enum {GF_E_MDEFDIV, /* Div != Default && Mult == Default */
172 172 GF_E_SP_8__A, /* Mult == SPLIT, w=8, no ALTMAP */
173 173 GF_E_SP_SSE3, /* Mult == SPLIT, Need SSSE3 for SSE */
174 174 GF_E_COMP_A2, /* Mult == COMP, arg1 must be = 2 */
175   - GF_E_COMP_SS, /* Mult == COMP, SSE|NOSSE */
  175 + GF_E_COMP_SS, /* Mult == COMP, SIMD|NOSIMD */
176 176 GF_E_COMP__W, /* Mult == COMP, Bad w. */
177 177 GF_E_UNKFLAG, /* Unknown flag in create_from.... */
178 178 GF_E_UNKNOWN, /* Unknown mult_type. */
... ...
include/gf_w16.h 0 → 100644
... ... @@ -0,0 +1,66 @@
  1 +/*
  2 + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
  3 + * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
  4 + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
  5 + *
  6 + * gf_w16.h
  7 + *
  8 + * Defines and data structures for 16-bit Galois fields
  9 + */
  10 +
  11 +#ifndef GF_COMPLETE_GF_W16_H
  12 +#define GF_COMPLETE_GF_W16_H
  13 +
  14 +#include <stdint.h>
  15 +
  16 +#define GF_FIELD_WIDTH (16)
  17 +#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
  18 +#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1
  19 +
  20 +#define GF_BASE_FIELD_WIDTH (8)
  21 +#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
  22 +
  23 +struct gf_w16_logtable_data {
  24 + uint16_t log_tbl[GF_FIELD_SIZE];
  25 + uint16_t antilog_tbl[GF_FIELD_SIZE * 2];
  26 + uint16_t inv_tbl[GF_FIELD_SIZE];
  27 + uint16_t *d_antilog;
  28 +};
  29 +
  30 +struct gf_w16_zero_logtable_data {
  31 + int log_tbl[GF_FIELD_SIZE];
  32 + uint16_t _antilog_tbl[GF_FIELD_SIZE * 4];
  33 + uint16_t *antilog_tbl;
  34 + uint16_t inv_tbl[GF_FIELD_SIZE];
  35 +};
  36 +
  37 +struct gf_w16_lazytable_data {
  38 + uint16_t log_tbl[GF_FIELD_SIZE];
  39 + uint16_t antilog_tbl[GF_FIELD_SIZE * 2];
  40 + uint16_t inv_tbl[GF_FIELD_SIZE];
  41 + uint16_t *d_antilog;
  42 + uint16_t lazytable[GF_FIELD_SIZE];
  43 +};
  44 +
  45 +struct gf_w16_bytwo_data {
  46 + uint64_t prim_poly;
  47 + uint64_t mask1;
  48 + uint64_t mask2;
  49 +};
  50 +
  51 +struct gf_w16_split_8_8_data {
  52 + uint16_t tables[3][256][256];
  53 +};
  54 +
  55 +struct gf_w16_group_4_4_data {
  56 + uint16_t reduce[16];
  57 + uint16_t shift[16];
  58 +};
  59 +
  60 +struct gf_w16_composite_data {
  61 + uint8_t *mult_table;
  62 +};
  63 +
  64 +void gf_w16_neon_split_init(gf_t *gf);
  65 +
  66 +#endif /* GF_COMPLETE_GF_W16_H */
... ...
include/gf_w32.h 0 → 100644
... ... @@ -0,0 +1,71 @@
  1 +/*
  2 + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
  3 + * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
  4 + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
  5 + *
  6 + * gf_w32.h
  7 + *
  8 + * Defines and data structures for 32-bit Galois fields
  9 + */
  10 +
  11 +#ifndef GF_COMPLETE_GF_W32_H
  12 +#define GF_COMPLETE_GF_W32_H
  13 +
  14 +#include <stdint.h>
  15 +
  16 +#define GF_FIELD_WIDTH (32)
  17 +#define GF_FIRST_BIT (1 << 31)
  18 +
  19 +#define GF_BASE_FIELD_WIDTH (16)
  20 +#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
  21 +#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1
  22 +#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1)
  23 +
  24 +struct gf_split_2_32_lazy_data {
  25 + uint32_t tables[16][4];
  26 + uint32_t last_value;
  27 +};
  28 +
  29 +struct gf_w32_split_8_8_data {
  30 + uint32_t tables[7][256][256];
  31 + uint32_t region_tables[4][256];
  32 + uint32_t last_value;
  33 +};
  34 +
  35 +struct gf_w32_group_data {
  36 + uint32_t *reduce;
  37 + uint32_t *shift;
  38 + int tshift;
  39 + uint64_t rmask;
  40 + uint32_t *memory;
  41 +};
  42 +
  43 +struct gf_split_16_32_lazy_data {
  44 + uint32_t tables[2][(1<<16)];
  45 + uint32_t last_value;
  46 +};
  47 +
  48 +struct gf_split_8_32_lazy_data {
  49 + uint32_t tables[4][256];
  50 + uint32_t last_value;
  51 +};
  52 +
  53 +struct gf_split_4_32_lazy_data {
  54 + uint32_t tables[8][16];
  55 + uint32_t last_value;
  56 +};
  57 +
  58 +struct gf_w32_bytwo_data {
  59 + uint64_t prim_poly;
  60 + uint64_t mask1;
  61 + uint64_t mask2;
  62 +};
  63 +
  64 +struct gf_w32_composite_data {
  65 + uint16_t *log;
  66 + uint16_t *alog;
  67 +};
  68 +
  69 +void gf_w32_neon_split_init(gf_t *gf);
  70 +
  71 +#endif /* GF_COMPLETE_GF_W32_H */
... ...
include/gf_w4.h 0 → 100644
... ... @@ -0,0 +1,63 @@
  1 +/*
  2 + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
  3 + * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
  4 + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
  5 + *
  6 + * gf_w4.h
  7 + *
  8 + * Defines and data structures for 4-bit Galois fields
  9 + */
  10 +
  11 +#ifndef GF_COMPLETE_GF_W4_H
  12 +#define GF_COMPLETE_GF_W4_H
  13 +
  14 +#include <stdint.h>
  15 +
  16 +#define GF_FIELD_WIDTH 4
  17 +#define GF_DOUBLE_WIDTH (GF_FIELD_WIDTH*2)
  18 +#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
  19 +#define GF_MULT_GROUP_SIZE (GF_FIELD_SIZE-1)
  20 +
  21 +/* ------------------------------------------------------------
  22 + JSP: Each implementation has its own data, which is allocated
  23 + at one time as part of the handle. For that reason, it
  24 + shouldn't be hierarchical -- i.e. one should be able to
  25 + allocate it with one call to malloc. */
  26 +
  27 +struct gf_logtable_data {
  28 + uint8_t log_tbl[GF_FIELD_SIZE];
  29 + uint8_t antilog_tbl[GF_FIELD_SIZE * 2];
  30 + uint8_t *antilog_tbl_div;
  31 +};
  32 +
  33 +struct gf_single_table_data {
  34 + uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE];
  35 + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
  36 +};
  37 +
  38 +struct gf_double_table_data {
  39 + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
  40 + uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];
  41 +};
  42 +struct gf_quad_table_data {
  43 + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
  44 + uint16_t mult[GF_FIELD_SIZE][(1<<16)];
  45 +};
  46 +
  47 +struct gf_quad_table_lazy_data {
  48 + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
  49 + uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE];
  50 + uint16_t mult[(1 << 16)];
  51 +};
  52 +
  53 +struct gf_bytwo_data {
  54 + uint64_t prim_poly;
  55 + uint64_t mask1;
  56 + uint64_t mask2;
  57 +};
  58 +
  59 +// ARM NEON init functions
  60 +int gf_w4_neon_cfm_init(gf_t *gf);
  61 +void gf_w4_neon_single_table_init(gf_t *gf);
  62 +
  63 +#endif /* GF_COMPLETE_GF_W4_H */
... ...
include/gf_w64.h 0 → 100644
... ... @@ -0,0 +1,50 @@
  1 +/*
  2 + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
  3 + * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
  4 + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
  5 + *
  6 + * gf_w64.h
  7 + *
  8 + * Defines and data structures for 64-bit Galois fields
  9 + */
  10 +
  11 +#ifndef GF_COMPLETE_GF_W64_H
  12 +#define GF_COMPLETE_GF_W64_H
  13 +
  14 +#include <stdint.h>
  15 +
  16 +#define GF_FIELD_WIDTH (64)
  17 +#define GF_FIRST_BIT (1ULL << 63)
  18 +
  19 +#define GF_BASE_FIELD_WIDTH (32)
  20 +#define GF_BASE_FIELD_SIZE (1ULL << GF_BASE_FIELD_WIDTH)
  21 +#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1
  22 +
  23 +struct gf_w64_group_data {
  24 + uint64_t *reduce;
  25 + uint64_t *shift;
  26 + uint64_t *memory;
  27 +};
  28 +
  29 +struct gf_split_4_64_lazy_data {
  30 + uint64_t tables[16][16];
  31 + uint64_t last_value;
  32 +};
  33 +
  34 +struct gf_split_8_64_lazy_data {
  35 + uint64_t tables[8][(1<<8)];
  36 + uint64_t last_value;
  37 +};
  38 +
  39 +struct gf_split_16_64_lazy_data {
  40 + uint64_t tables[4][(1<<16)];
  41 + uint64_t last_value;
  42 +};
  43 +
  44 +struct gf_split_8_8_data {
  45 + uint64_t tables[15][256][256];
  46 +};
  47 +
  48 +void gf_w64_neon_split_init(gf_t *gf);
  49 +
  50 +#endif /* GF_COMPLETE_GF_W64_H */
... ...
include/gf_w8.h 0 → 100644
... ... @@ -0,0 +1,99 @@
  1 +/*
  2 + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
  3 + * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
  4 + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
  5 + *
  6 + * gf_w8.h
  7 + *
  8 + * Defines and data structures for 8-bit Galois fields
  9 + */
  10 +
  11 +#ifndef GF_COMPLETE_GF_W8_H
  12 +#define GF_COMPLETE_GF_W8_H
  13 +
  14 +#include "gf_int.h"
  15 +#include <stdint.h>
  16 +
  17 +#define GF_FIELD_WIDTH (8)
  18 +#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
  19 +#define GF_HALF_SIZE (1 << (GF_FIELD_WIDTH/2))
  20 +#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1
  21 +
  22 +#define GF_BASE_FIELD_WIDTH (4)
  23 +#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
  24 +
  25 +struct gf_w8_logtable_data {
  26 + uint8_t log_tbl[GF_FIELD_SIZE];
  27 + uint8_t antilog_tbl[GF_FIELD_SIZE * 2];
  28 + uint8_t inv_tbl[GF_FIELD_SIZE];
  29 +};
  30 +
  31 +struct gf_w8_logzero_table_data {
  32 + short log_tbl[GF_FIELD_SIZE]; /* Make this signed, so that we can divide easily */
  33 + uint8_t antilog_tbl[512+512+1];
  34 + uint8_t *div_tbl;
  35 + uint8_t *inv_tbl;
  36 +};
  37 +
  38 +struct gf_w8_logzero_small_table_data {
  39 + short log_tbl[GF_FIELD_SIZE]; /* Make this signed, so that we can divide easily */
  40 + uint8_t antilog_tbl[255*3];
  41 + uint8_t inv_tbl[GF_FIELD_SIZE];
  42 + uint8_t *div_tbl;
  43 +};
  44 +
  45 +struct gf_w8_composite_data {
  46 + uint8_t *mult_table;
  47 +};
  48 +
  49 +/* Don't change the order of these relative to gf_w8_half_table_data */
  50 +
  51 +struct gf_w8_default_data {
  52 + uint8_t high[GF_FIELD_SIZE][GF_HALF_SIZE];
  53 + uint8_t low[GF_FIELD_SIZE][GF_HALF_SIZE];
  54 + uint8_t divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
  55 + uint8_t multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
  56 +};
  57 +
  58 +struct gf_w8_half_table_data {
  59 + uint8_t high[GF_FIELD_SIZE][GF_HALF_SIZE];
  60 + uint8_t low[GF_FIELD_SIZE][GF_HALF_SIZE];
  61 +};
  62 +
  63 +struct gf_w8_single_table_data {
  64 + uint8_t divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
  65 + uint8_t multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
  66 +};
  67 +
  68 +struct gf_w8_double_table_data {
  69 + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
  70 + uint16_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];
  71 +};
  72 +
  73 +struct gf_w8_double_table_lazy_data {
  74 + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
  75 + uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE];
  76 + uint16_t mult[GF_FIELD_SIZE*GF_FIELD_SIZE];
  77 +};
  78 +
  79 +struct gf_w4_logtable_data {
  80 + uint8_t log_tbl[GF_BASE_FIELD_SIZE];
  81 + uint8_t antilog_tbl[GF_BASE_FIELD_SIZE * 2];
  82 + uint8_t *antilog_tbl_div;
  83 +};
  84 +
  85 +struct gf_w4_single_table_data {
  86 + uint8_t div[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
  87 + uint8_t mult[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
  88 +};
  89 +
  90 +struct gf_w8_bytwo_data {
  91 + uint64_t prim_poly;
  92 + uint64_t mask1;
  93 + uint64_t mask2;
  94 +};
  95 +
  96 +int gf_w8_neon_cfm_init(gf_t *gf);
  97 +void gf_w8_neon_split_init(gf_t *gf);
  98 +
  99 +#endif /* GF_COMPLETE_GF_W8_H */
... ...
m4/ax_ext.m4
... ... @@ -41,6 +41,55 @@ AC_DEFUN([AX_EXT],
41 41 AC_REQUIRE([AC_CANONICAL_HOST])
42 42  
43 43 case $host_cpu in
  44 + aarch64*)
  45 + AC_DEFINE(HAVE_ARCH_AARCH64,,[targeting AArch64])
  46 + SIMD_FLAGS="$SIMD_FLAGS -DARCH_AARCH64"
  47 +
  48 + AC_CACHE_CHECK([whether NEON is supported], [ax_cv_have_neon_ext],
  49 + [
  50 + # TODO: detect / cross-compile
  51 + ax_cv_have_neon_ext=yes
  52 + ])
  53 + AC_CACHE_CHECK([whether cryptographic extension is supported], [ax_cv_have_arm_crypt_ext],
  54 + [
  55 + # TODO: detect / cross-compile
  56 + ax_cv_have_arm_crypt_ext=yes
  57 + ])
  58 +
  59 + if test "$ax_cv_have_arm_crypt_ext" = yes; then
  60 + AC_DEFINE(HAVE_ARM_CRYPT_EXT,,[Support ARM cryptographic extension])
  61 + fi
  62 +
  63 + if test "$ax_cv_have_neon_ext" = yes; then
  64 + AC_DEFINE(HAVE_NEON,,[Support NEON instructions])
  65 + fi
  66 +
  67 + if test "$ax_cv_have_arm_crypt_ext" = yes && test "$ax_cv_have_neon_ext" = yes; then
  68 + AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd+crypto,
  69 + SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+simd+crypto -DARM_CRYPT -DARM_NEON", [])
  70 + elif test "$ax_cv_have_arm_crypt_ext" = yes; then
  71 + AX_CHECK_COMPILE_FLAG(-march=armv8-a+crypto,
  72 + SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+crypto -DARM_CRYPT", [])
  73 + elif test "$ax_cv_have_neon_ext" = yes; then
  74 + AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd,
  75 + SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+simd -DARM_NEON", [])
  76 + fi
  77 + ;;
  78 +
  79 + arm*)
  80 + AC_CACHE_CHECK([whether NEON is supported], [ax_cv_have_neon_ext],
  81 + [
  82 + # TODO: detect / cross-compile
  83 + ax_cv_have_neon_ext=yes
  84 + ])
  85 +
  86 + if test "$ax_cv_have_neon_ext" = yes; then
  87 + AC_DEFINE(HAVE_NEON,,[Support NEON instructions])
  88 + AX_CHECK_COMPILE_FLAG(-mfpu=neon,
  89 + SIMD_FLAGS="$SIMD_FLAGS -mfpu=neon -DARM_NEON", [])
  90 + fi
  91 + ;;
  92 +
44 93 powerpc*)
45 94 AC_CACHE_CHECK([whether altivec is supported], [ax_cv_have_altivec_ext],
46 95 [
... ...
src/Makefile.am
1 1 # GF-Complete 'core' AM file
2 2 # Creates the library
3 3  
4   -AM_CPPFLAGS=-I./ -I../include
5   -AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC $(INCLUDES)
  4 +AUTOMAKE_OPTIONS = subdir-objects
  5 +
  6 +AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
  7 +AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC
6 8  
7 9 lib_LTLIBRARIES = libgf_complete.la
8 10 libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \
9 11 gf_w64.c gf_w128.c gf_rand.c gf_general.c
  12 +
  13 +if HAVE_NEON
  14 +libgf_complete_la_SOURCES += neon/gf_w4_neon.c \
  15 + neon/gf_w8_neon.c \
  16 + neon/gf_w16_neon.c \
  17 + neon/gf_w32_neon.c \
  18 + neon/gf_w64_neon.c
  19 +endif
  20 +
10 21 libgf_complete_la_LDFLAGS = -version-info 1:0:0
11 22  
... ...
src/gf.c
... ... @@ -41,7 +41,7 @@ void gf_error()
41 41 case GF_E_MDEFARG: s = "If multiplication method == default, can't use arg1/arg2."; break;
42 42 case GF_E_DIVCOMP: s = "Cannot change the division technique with -m COMPOSITE."; break;
43 43 case GF_E_DOUQUAD: s = "Cannot specify -r DOUBLE and -r QUAD."; break;
44   - case GF_E_SSE__NO: s = "Cannot specify -r SSE and -r NOSSE."; break;
  44 + case GF_E_SIMD_NO: s = "Cannot specify -r SIMD and -r NOSIMD."; break;
45 45 case GF_E_CAUCHYB: s = "Cannot specify -r CAUCHY and any other -r."; break;
46 46 case GF_E_CAUCOMP: s = "Cannot specify -m COMPOSITE and -r CAUCHY."; break;
47 47 case GF_E_CAUGT32: s = "Cannot specify -r CAUCHY with w > 32."; break;
... ... @@ -51,23 +51,23 @@ void gf_error()
51 51 case GF_E_BAD___W: s = "W must be 1-32, 64 or 128."; break;
52 52 case GF_E_DOUBLET: s = "Can only specify -r DOUBLE with -m TABLE."; break;
53 53 case GF_E_DOUBLEW: s = "Can only specify -r DOUBLE w = 4 or w = 8."; break;
54   - case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SSE|NOSSE."; break;
  54 + case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SIMD|NOSIMD."; break;
55 55 case GF_E_DOUBLEL: s = "Can only specify -r DOUBLE -r LAZY with w = 8"; break;
56 56 case GF_E_QUAD__T: s = "Can only specify -r QUAD with -m TABLE."; break;
57 57 case GF_E_QUAD__W: s = "Can only specify -r QUAD w = 4."; break;
58   - case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SSE|NOSSE."; break;
  58 + case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SIMD|NOSIMD."; break;
59 59 case GF_E_BADPOLY: s = "Bad primitive polynomial (high bits set)."; break;
60 60 case GF_E_COMP_PP: s = "Bad primitive polynomial -- bigger than sub-field."; break;
61 61 case GF_E_LAZY__X: s = "If -r LAZY, then -r must be DOUBLE or QUAD."; break;
62 62 case GF_E_ALTSHIF: s = "Cannot specify -m SHIFT and -r ALTMAP."; break;
63   - case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SSE|NOSSE."; break;
  63 + case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SIMD|NOSIMD."; break;
64 64 case GF_E_ALT_CFM: s = "Cannot specify -m CARRY_FREE and -r ALTMAP."; break;
65   - case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SSE|NOSSE."; break;
  65 + case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SIMD|NOSIMD."; break;
66 66 case GF_E_PCLMULX: s = "Specified -m CARRY_FREE, but PCLMUL is not supported."; break;
67 67 case GF_E_ALT_BY2: s = "Cannot specify -m BYTWO_x and -r ALTMAP."; break;
68   - case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SSE, but SSE2 is not supported."; break;
  68 + case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SIMD, but SSE2 is not supported."; break;
69 69 case GF_E_LOGBADW: s = "With Log Tables, w must be <= 27."; break;
70   - case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SSE|NOSSE."; break;
  70 + case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SIMD|NOSIMD."; break;
71 71 case GF_E_LOGPOLY: s = "Cannot use Log tables because the polynomial is not primitive."; break;
72 72 case GF_E_ZERBADW: s = "With -m LOG_ZERO, w must be 8 or 16."; break;
73 73 case GF_E_ZEXBADW: s = "With -m LOG_ZERO_EXT, w must be 8."; break;
... ... @@ -77,33 +77,33 @@ void gf_error()
77 77 case GF_E_GR_128A: s = "With -m GROUP, w == 128, arg1 must be 4, and arg2 in { 4,8,16 }."; break;
78 78 case GF_E_GR_A_27: s = "With -m GROUP, arg1 and arg2 must be <= 27."; break;
79 79 case GF_E_GR_AR_W: s = "With -m GROUP, arg1 and arg2 must be <= w."; break;
80   - case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SSE|NOSSE."; break;
  80 + case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SIMD|NOSIMD."; break;
81 81 case GF_E_TABLE_W: s = "With -m TABLE, w must be < 15, or == 16."; break;
82   - case GF_E_TAB_SSE: s = "With -m TABLE, SSE|NOSSE only applies to w=4."; break;
83   - case GF_E_TABSSE3: s = "With -m TABLE, -r SSE, you need SSSE3 supported."; break;
  82 + case GF_E_TAB_SSE: s = "With -m TABLE, SIMD|NOSIMD only applies to w=4."; break;
  83 + case GF_E_TABSSE3: s = "With -m TABLE, -r SIMD, you need SSSE3 supported."; break;
84 84 case GF_E_TAB_ALT: s = "With -m TABLE, you cannot use ALTMAP."; break;
85 85 case GF_E_SP128AR: s = "With -m SPLIT, w=128, bad arg1/arg2."; break;
86   - case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SSE requires -r ALTMAP."; break;
  86 + case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SIMD requires -r ALTMAP."; break;
87 87 case GF_E_SP128AS: s = "With -m SPLIT, w=128, ALTMAP needs SSSE3 supported."; break;
88 88 case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break;
89   - case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r SSE|NOSSE only with arg1/arg2 = 4/128."; break;
  89 + case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r SIMD|NOSIMD only with arg1/arg2 = 4/128."; break;
90 90 case GF_E_SPLIT_W: s = "With -m SPLIT, w must be in {8, 16, 32, 64, 128}."; break;
91 91 case GF_E_SP_16AR: s = "With -m SPLIT, w=16, Bad arg1/arg2."; break;
92 92 case GF_E_SP_16_A: s = "With -m SPLIT, w=16, -r ALTMAP only with arg1/arg2 = 4/16."; break;
93   - case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SSE|NOSSE only with arg1/arg2 = 4/16."; break;
  93 + case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SIMD|NOSIMD only with arg1/arg2 = 4/16."; break;
94 94 case GF_E_SP_32AR: s = "With -m SPLIT, w=32, Bad arg1/arg2."; break;
95 95 case GF_E_SP_32AS: s = "With -m SPLIT, w=32, -r ALTMAP needs SSSE3 supported."; break;
96 96 case GF_E_SP_32_A: s = "With -m SPLIT, w=32, -r ALTMAP only with arg1/arg2 = 4/32."; break;
97   - case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SSE|NOSSE only with arg1/arg2 = 4/32."; break;
  97 + case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SIMD|NOSIMD only with arg1/arg2 = 4/32."; break;
98 98 case GF_E_SP_64AR: s = "With -m SPLIT, w=64, Bad arg1/arg2."; break;
99 99 case GF_E_SP_64AS: s = "With -m SPLIT, w=64, -r ALTMAP needs SSSE3 supported."; break;
100 100 case GF_E_SP_64_A: s = "With -m SPLIT, w=64, -r ALTMAP only with arg1/arg2 = 4/64."; break;
101   - case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SSE|NOSSE only with arg1/arg2 = 4/64."; break;
  101 + case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SIMD|NOSIMD only with arg1/arg2 = 4/64."; break;
102 102 case GF_E_SP_8_AR: s = "With -m SPLIT, w=8, Bad arg1/arg2."; break;
103 103 case GF_E_SP_8__A: s = "With -m SPLIT, w=8, Can't have -r ALTMAP."; break;
104   - case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SSE."; break;
  104 + case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SIMD."; break;
105 105 case GF_E_COMP_A2: s = "With -m COMPOSITE, arg1 must equal 2."; break;
106   - case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SSE and -r NOSSE do not apply."; break;
  106 + case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SIMD and -r NOSIMD do not apply."; break;
107 107 case GF_E_COMP__W: s = "With -m COMPOSITE, w must be 8, 16, 32, 64 or 128."; break;
108 108 case GF_E_UNKFLAG: s = "Unknown method flag - should be -m, -d, -r or -p."; break;
109 109 case GF_E_UNKNOWN: s = "Unknown multiplication type."; break;
... ... @@ -182,14 +182,14 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
182 182 int sse3 = 0;
183 183 int sse2 = 0;
184 184 int pclmul = 0;
185   - int rdouble, rquad, rlazy, rsse, rnosse, raltmap, rcauchy, tmp;
  185 + int rdouble, rquad, rlazy, rsimd, rnosimd, raltmap, rcauchy, tmp;
186 186 gf_internal_t *sub;
187 187  
188 188 rdouble = (region_type & GF_REGION_DOUBLE_TABLE);
189 189 rquad = (region_type & GF_REGION_QUAD_TABLE);
190 190 rlazy = (region_type & GF_REGION_LAZY);
191   - rsse = (region_type & GF_REGION_SSE);
192   - rnosse = (region_type & GF_REGION_NOSSE);
  191 + rsimd = (region_type & GF_REGION_SIMD);
  192 + rnosimd = (region_type & GF_REGION_NOSIMD);
193 193 raltmap = (region_type & GF_REGION_ALTMAP);
194 194 rcauchy = (region_type & GF_REGION_CAUCHY);
195 195  
... ... @@ -201,7 +201,8 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
201 201 }
202 202  
203 203 tmp = ( GF_REGION_DOUBLE_TABLE | GF_REGION_QUAD_TABLE | GF_REGION_LAZY |
204   - GF_REGION_SSE | GF_REGION_NOSSE | GF_REGION_ALTMAP | GF_REGION_CAUCHY );
  204 + GF_REGION_SIMD | GF_REGION_NOSIMD | GF_REGION_ALTMAP |
  205 + GF_REGION_CAUCHY );
205 206 if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; }
206 207  
207 208 #ifdef INTEL_SSE2
... ... @@ -216,6 +217,11 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
216 217 pclmul = 1;
217 218 #endif
218 219  
  220 +#ifdef ARM_NEON
  221 + pclmul = 1;
  222 + sse3 = 1;
  223 +#endif
  224 +
219 225  
220 226 if (w < 1 || (w > 32 && w != 64 && w != 128)) { _gf_errno = GF_E_BAD___W; return 0; }
221 227  
... ... @@ -230,7 +236,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
230 236 return 1;
231 237 }
232 238  
233   - if (rsse && rnosse) { _gf_errno = GF_E_SSE__NO; return 0; }
  239 + if (rsimd && rnosimd) { _gf_errno = GF_E_SIMD_NO; return 0; }
234 240 if (rcauchy && w > 32) { _gf_errno = GF_E_CAUGT32; return 0; }
235 241 if (rcauchy && region_type != GF_REGION_CAUCHY) { _gf_errno = GF_E_CAUCHYB; return 0; }
236 242 if (rcauchy && mult_type == GF_MULT_COMPOSITE) { _gf_errno = GF_E_CAUCOMP; return 0; }
... ... @@ -252,7 +258,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
252 258 if (rquad) { _gf_errno = GF_E_DOUQUAD; return 0; }
253 259 if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_DOUBLET; return 0; }
254 260 if (w != 4 && w != 8) { _gf_errno = GF_E_DOUBLEW; return 0; }
255   - if (rsse || rnosse || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; }
  261 + if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; }
256 262 if (rlazy && w == 4) { _gf_errno = GF_E_DOUBLEL; return 0; }
257 263 return 1;
258 264 }
... ... @@ -260,7 +266,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
260 266 if (rquad) {
261 267 if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_QUAD__T; return 0; }
262 268 if (w != 4) { _gf_errno = GF_E_QUAD__W; return 0; }
263   - if (rsse || rnosse || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; }
  269 + if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; }
264 270 return 1;
265 271 }
266 272  
... ... @@ -268,7 +274,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
268 274  
269 275 if (mult_type == GF_MULT_SHIFT) {
270 276 if (raltmap) { _gf_errno = GF_E_ALTSHIF; return 0; }
271   - if (rsse || rnosse) { _gf_errno = GF_E_SSESHIF; return 0; }
  277 + if (rsimd || rnosimd) { _gf_errno = GF_E_SSESHIF; return 0; }
272 278 return 1;
273 279 }
274 280  
... ... @@ -281,7 +287,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
281 287 if (w == 32 && (poly & 0xfe000000)) { _gf_errno = GF_E_CF32POL; return 0; }
282 288 if (w == 64 && (poly & 0xfffe000000000000ULL)) { _gf_errno = GF_E_CF64POL; return 0; }
283 289 if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; }
284   - if (rsse || rnosse) { _gf_errno = GF_E_SSE_CFM; return 0; }
  290 + if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; }
285 291 if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; }
286 292 return 1;
287 293 }
... ... @@ -290,21 +296,21 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
290 296 if (w != 4 && w != 8 && w != 16 &&
291 297 w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; }
292 298 if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; }
293   - if (rsse || rnosse) { _gf_errno = GF_E_SSE_CFM; return 0; }
  299 + if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; }
294 300 if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; }
295 301 return 1;
296 302 }
297 303  
298 304 if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) {
299 305 if (raltmap) { _gf_errno = GF_E_ALT_BY2; return 0; }
300   - if (rsse && !sse2) { _gf_errno = GF_E_BY2_SSE; return 0; }
  306 + if (rsimd && !sse2) { _gf_errno = GF_E_BY2_SSE; return 0; }
301 307 return 1;
302 308 }
303 309  
304 310 if (mult_type == GF_MULT_LOG_TABLE || mult_type == GF_MULT_LOG_ZERO
305 311 || mult_type == GF_MULT_LOG_ZERO_EXT ) {
306 312 if (w > 27) { _gf_errno = GF_E_LOGBADW; return 0; }
307   - if (raltmap || rsse || rnosse) { _gf_errno = GF_E_LOG___J; return 0; }
  313 + if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_LOG___J; return 0; }
308 314  
309 315 if (mult_type == GF_MULT_LOG_TABLE) return 1;
310 316  
... ... @@ -324,14 +330,14 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
324 330 (arg2 != 4 && arg2 != 8 && arg2 != 16))) { _gf_errno = GF_E_GR_128A; return 0; }
325 331 if (arg1 > 27 || arg2 > 27) { _gf_errno = GF_E_GR_A_27; return 0; }
326 332 if (arg1 > w || arg2 > w) { _gf_errno = GF_E_GR_AR_W; return 0; }
327   - if (raltmap || rsse || rnosse) { _gf_errno = GF_E_GR____J; return 0; }
  333 + if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_GR____J; return 0; }
328 334 return 1;
329 335 }
330 336  
331 337 if (mult_type == GF_MULT_TABLE) {
332 338 if (w != 16 && w >= 15) { _gf_errno = GF_E_TABLE_W; return 0; }
333   - if (w != 4 && (rsse || rnosse)) { _gf_errno = GF_E_TAB_SSE; return 0; }
334   - if (rsse && !sse3) { _gf_errno = GF_E_TABSSE3; return 0; }
  339 + if (w != 4 && (rsimd || rnosimd)) { _gf_errno = GF_E_TAB_SSE; return 0; }
  340 + if (rsimd && !sse3) { _gf_errno = GF_E_TABSSE3; return 0; }
335 341 if (raltmap) { _gf_errno = GF_E_TAB_ALT; return 0; }
336 342 return 1;
337 343 }
... ... @@ -344,46 +350,46 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
344 350 }
345 351 if (w == 8) {
346 352 if (arg1 != 4 || arg2 != 8) { _gf_errno = GF_E_SP_8_AR; return 0; }
347   - if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
  353 + if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
348 354 if (raltmap) { _gf_errno = GF_E_SP_8__A; return 0; }
349 355 } else if (w == 16) {
350 356 if ((arg1 == 8 && arg2 == 8) ||
351 357 (arg1 == 8 && arg2 == 16)) {
352   - if (rsse || rnosse) { _gf_errno = GF_E_SP_16_S; return 0; }
  358 + if (rsimd || rnosimd) { _gf_errno = GF_E_SP_16_S; return 0; }
353 359 if (raltmap) { _gf_errno = GF_E_SP_16_A; return 0; }
354 360 } else if (arg1 == 4 && arg2 == 16) {
355   - if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
  361 + if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
356 362 } else { _gf_errno = GF_E_SP_16AR; return 0; }
357 363 } else if (w == 32) {
358 364 if ((arg1 == 8 && arg2 == 8) ||
359 365 (arg1 == 8 && arg2 == 32) ||
360 366 (arg1 == 16 && arg2 == 32)) {
361   - if (rsse || rnosse) { _gf_errno = GF_E_SP_32_S; return 0; }
  367 + if (rsimd || rnosimd) { _gf_errno = GF_E_SP_32_S; return 0; }
362 368 if (raltmap) { _gf_errno = GF_E_SP_32_A; return 0; }
363 369 } else if (arg1 == 4 && arg2 == 32) {
364   - if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
  370 + if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
365 371 if (raltmap && !sse3) { _gf_errno = GF_E_SP_32AS; return 0; }
366   - if (raltmap && rnosse) { _gf_errno = GF_E_SP_32AS; return 0; }
  372 + if (raltmap && rnosimd) { _gf_errno = GF_E_SP_32AS; return 0; }
367 373 } else { _gf_errno = GF_E_SP_32AR; return 0; }
368 374 } else if (w == 64) {
369 375 if ((arg1 == 8 && arg2 == 8) ||
370 376 (arg1 == 8 && arg2 == 64) ||
371 377 (arg1 == 16 && arg2 == 64)) {
372   - if (rsse || rnosse) { _gf_errno = GF_E_SP_64_S; return 0; }
  378 + if (rsimd || rnosimd) { _gf_errno = GF_E_SP_64_S; return 0; }
373 379 if (raltmap) { _gf_errno = GF_E_SP_64_A; return 0; }
374 380 } else if (arg1 == 4 && arg2 == 64) {
375   - if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
  381 + if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
376 382 if (raltmap && !sse3) { _gf_errno = GF_E_SP_64AS; return 0; }
377   - if (raltmap && rnosse) { _gf_errno = GF_E_SP_64AS; return 0; }
  383 + if (raltmap && rnosimd) { _gf_errno = GF_E_SP_64AS; return 0; }
378 384 } else { _gf_errno = GF_E_SP_64AR; return 0; }
379 385 } else if (w == 128) {
380 386 if (arg1 == 8 && arg2 == 128) {
381   - if (rsse || rnosse) { _gf_errno = GF_E_SP128_S; return 0; }
  387 + if (rsimd || rnosimd) { _gf_errno = GF_E_SP128_S; return 0; }
382 388 if (raltmap) { _gf_errno = GF_E_SP128_A; return 0; }
383 389 } else if (arg1 == 4 && arg2 == 128) {
384   - if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
  390 + if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
385 391 if (raltmap && !sse3) { _gf_errno = GF_E_SP128AS; return 0; }
386   - if (raltmap && rnosse) { _gf_errno = GF_E_SP128AS; return 0; }
  392 + if (raltmap && rnosimd) { _gf_errno = GF_E_SP128AS; return 0; }
387 393 } else { _gf_errno = GF_E_SP128AR; return 0; }
388 394 } else { _gf_errno = GF_E_SPLIT_W; return 0; }
389 395 return 1;
... ... @@ -395,7 +401,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
395 401 if (w < 128 && (poly >> (w/2)) != 0) { _gf_errno = GF_E_COMP_PP; return 0; }
396 402 if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_DIVCOMP; return 0; }
397 403 if (arg1 != 2) { _gf_errno = GF_E_COMP_A2; return 0; }
398   - if (rsse || rnosse) { _gf_errno = GF_E_COMP_SS; return 0; }
  404 + if (rsimd || rnosimd) { _gf_errno = GF_E_COMP_SS; return 0; }
399 405 if (base != NULL) {
400 406 sub = (gf_internal_t *) base->scratch;
401 407 if (sub->w != w/2) { _gf_errno = GF_E_BASE__W; return 0; }
... ... @@ -953,7 +959,42 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor)
953 959 }
954 960 return;
955 961 #endif
  962 +#if defined(ARM_NEON)
  963 + s8 = (uint8_t *) src;
  964 + d8 = (uint8_t *) dest;
956 965  
  966 + if (uls % 16 == uld % 16) {
  967 + gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
  968 + while (s8 != rd.s_start) {
  969 + *d8 ^= *s8;
  970 + s8++;
  971 + d8++;
  972 + }
  973 + while (s8 < (uint8_t *) rd.s_top) {
  974 + uint8x16_t vs = vld1q_u8 (s8);
  975 + uint8x16_t vd = vld1q_u8 (d8);
  976 + uint8x16_t vr = veorq_u8 (vs, vd);
  977 + vst1q_u8 (d8, vr);
  978 + s8 += 16;
  979 + d8 += 16;
  980 + }
  981 + } else {
  982 + while (s8 + 15 < (uint8_t *) src + bytes) {
  983 + uint8x16_t vs = vld1q_u8 (s8);
  984 + uint8x16_t vd = vld1q_u8 (d8);
  985 + uint8x16_t vr = veorq_u8 (vs, vd);
  986 + vst1q_u8 (d8, vr);
  987 + s8 += 16;
  988 + d8 += 16;
  989 + }
  990 + }
  991 + while (s8 < (uint8_t *) src + bytes) {
  992 + *d8 ^= *s8;
  993 + s8++;
  994 + d8++;
  995 + }
  996 + return;
  997 +#endif
957 998 if (uls % 8 != uld % 8) {
958 999 gf_unaligned_xor(src, dest, bytes);
959 1000 return;
... ...
src/gf_method.c
... ... @@ -121,11 +121,17 @@ int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting)
121 121 } else if (strcmp(argv[starting], "LAZY") == 0) {
122 122 region_type |= GF_REGION_LAZY;
123 123 starting++;
  124 + } else if (strcmp(argv[starting], "SIMD") == 0) {
  125 + region_type |= GF_REGION_SIMD;
  126 + starting++;
  127 + } else if (strcmp(argv[starting], "NOSIMD") == 0) {
  128 + region_type |= GF_REGION_NOSIMD;
  129 + starting++;
124 130 } else if (strcmp(argv[starting], "SSE") == 0) {
125   - region_type |= GF_REGION_SSE;
  131 + region_type |= GF_REGION_SIMD;
126 132 starting++;
127 133 } else if (strcmp(argv[starting], "NOSSE") == 0) {
128   - region_type |= GF_REGION_NOSSE;
  134 + region_type |= GF_REGION_NOSIMD;
129 135 starting++;
130 136 } else if (strcmp(argv[starting], "CAUCHY") == 0) {
131 137 region_type |= GF_REGION_CAUCHY;
... ...
src/gf_w128.c
... ... @@ -1527,7 +1527,7 @@ int gf_w128_split_init(gf_t *gf)
1527 1527  
1528 1528 gf->multiply.w128 = gf_w128_bytwo_p_multiply;
1529 1529 #if defined(INTEL_SSE4_PCLMUL)
1530   - if (!(h->region_type & GF_REGION_NOSSE)){
  1530 + if (!(h->region_type & GF_REGION_NOSIMD)){
1531 1531 gf->multiply.w128 = gf_w128_clm_multiply;
1532 1532 }
1533 1533 #endif
... ... @@ -1546,7 +1546,7 @@ int gf_w128_split_init(gf_t *gf)
1546 1546 if((h->region_type & GF_REGION_ALTMAP))
1547 1547 {
1548 1548 #ifdef INTEL_SSE4
1549   - if(!(h->region_type & GF_REGION_NOSSE))
  1549 + if(!(h->region_type & GF_REGION_NOSIMD))
1550 1550 gf->multiply_region.w128 = gf_w128_split_4_128_sse_altmap_multiply_region;
1551 1551 else
1552 1552 return 0;
... ... @@ -1556,7 +1556,7 @@ int gf_w128_split_init(gf_t *gf)
1556 1556 }
1557 1557 else {
1558 1558 #ifdef INTEL_SSE4
1559   - if(!(h->region_type & GF_REGION_NOSSE))
  1559 + if(!(h->region_type & GF_REGION_NOSIMD))
1560 1560 gf->multiply_region.w128 = gf_w128_split_4_128_sse_multiply_region;
1561 1561 else
1562 1562 gf->multiply_region.w128 = gf_w128_split_4_128_multiply_region;
... ...
src/gf_w16.c
... ... @@ -11,54 +11,7 @@
11 11 #include "gf_int.h"
12 12 #include <stdio.h>
13 13 #include <stdlib.h>
14   -
15   -#define GF_FIELD_WIDTH (16)
16   -#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
17   -#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1
18   -
19   -#define GF_BASE_FIELD_WIDTH (8)
20   -#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
21   -
22   -struct gf_w16_logtable_data {
23   - uint16_t log_tbl[GF_FIELD_SIZE];
24   - uint16_t antilog_tbl[GF_FIELD_SIZE * 2];
25   - uint16_t inv_tbl[GF_FIELD_SIZE];
26   - uint16_t *d_antilog;
27   -};
28   -
29   -struct gf_w16_zero_logtable_data {
30   - int log_tbl[GF_FIELD_SIZE];
31   - uint16_t _antilog_tbl[GF_FIELD_SIZE * 4];
32   - uint16_t *antilog_tbl;
33   - uint16_t inv_tbl[GF_FIELD_SIZE];
34   -};
35   -
36   -struct gf_w16_lazytable_data {
37   - uint16_t log_tbl[GF_FIELD_SIZE];
38   - uint16_t antilog_tbl[GF_FIELD_SIZE * 2];
39   - uint16_t inv_tbl[GF_FIELD_SIZE];
40   - uint16_t *d_antilog;
41   - uint16_t lazytable[GF_FIELD_SIZE];
42   -};
43   -
44   -struct gf_w16_bytwo_data {
45   - uint64_t prim_poly;
46   - uint64_t mask1;
47   - uint64_t mask2;
48   -};
49   -
50   -struct gf_w16_split_8_8_data {
51   - uint16_t tables[3][256][256];
52   -};
53   -
54   -struct gf_w16_group_4_4_data {
55   - uint16_t reduce[16];
56   - uint16_t shift[16];
57   -};
58   -
59   -struct gf_w16_composite_data {
60   - uint8_t *mult_table;
61   -};
  14 +#include "gf_w16.h"
62 15  
63 16 #define AB2(ip, am1 ,am2, b, t1, t2) {\
64 17 t1 = (b << 1) & am1;\
... ... @@ -1264,6 +1217,7 @@ int gf_w16_split_init(gf_t *gf)
1264 1217 gf_internal_t *h;
1265 1218 struct gf_w16_split_8_8_data *d8;
1266 1219 int i, j, exp, issse3;
  1220 + int isneon = 0;
1267 1221 uint32_t p, basep;
1268 1222  
1269 1223 h = (gf_internal_t *) gf->scratch;
... ... @@ -1273,6 +1227,9 @@ int gf_w16_split_init(gf_t *gf)
1273 1227 #else
1274 1228 issse3 = 0;
1275 1229 #endif
  1230 +#ifdef ARM_NEON
  1231 + isneon = 1;
  1232 +#endif
1276 1233  
1277 1234 if (h->arg1 == 8 && h->arg2 == 8) {
1278 1235 d8 = (struct gf_w16_split_8_8_data *) h->private;
... ... @@ -1317,6 +1274,10 @@ int gf_w16_split_init(gf_t *gf)
1317 1274  
1318 1275 if (issse3) {
1319 1276 gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_multiply_region;
  1277 + } else if (isneon) {
  1278 +#ifdef ARM_NEON
  1279 + gf_w16_neon_split_init(gf);
  1280 +#endif
1320 1281 } else {
1321 1282 gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region;
1322 1283 }
... ... @@ -1326,15 +1287,15 @@ int gf_w16_split_init(gf_t *gf)
1326