Commit 70dd94ae38f2d20dd78532a6dfd1310fdfb4a884

Authored by KMG
2 parents 62d4b81a 6fdd8bc3
Exists in master and in 1 other branch v2

Merged in jannau/gf-complete/neon (pull request #25)

arm neon optimisations
@@ -3,9 +3,12 @@ @@ -3,9 +3,12 @@
3 # FIXME - add project url as the last argument 3 # FIXME - add project url as the last argument
4 AC_INIT(gf-complete, 1.0) 4 AC_INIT(gf-complete, 1.0)
5 5
  6 +# Override default CFLAGS
  7 +: ${CFLAGS="-Wall -Wpointer-arith -O3 -g"}
  8 +
6 AC_PREREQ([2.61]) 9 AC_PREREQ([2.61])
7 10
8 -AM_INIT_AUTOMAKE([no-dependencies foreign]) 11 +AM_INIT_AUTOMAKE([no-dependencies foreign parallel-tests])
9 LT_INIT # libtool 12 LT_INIT # libtool
10 13
11 AC_CONFIG_HEADER(include/config.h) 14 AC_CONFIG_HEADER(include/config.h)
@@ -16,14 +19,39 @@ AC_CONFIG_MACRO_DIR([m4]) @@ -16,14 +19,39 @@ AC_CONFIG_MACRO_DIR([m4])
16 # This prevents './configure; make' from trying to run autotools. 19 # This prevents './configure; make' from trying to run autotools.
17 AM_MAINTAINER_MODE([disable]) 20 AM_MAINTAINER_MODE([disable])
18 21
19 -# Override default CFLAGS  
20 -CFLAGS="-Wall -Wpointer-arith -O3 -g"  
21 -  
22 dnl Compiling with per-target flags requires AM_PROG_CC_C_O. 22 dnl Compiling with per-target flags requires AM_PROG_CC_C_O.
23 AC_PROG_CC 23 AC_PROG_CC
24 24
  25 +# Check for functions to provide aligned memory
  26 +#
  27 +AC_CHECK_FUNCS([posix_memalign],
  28 + [found_memalign=yes; break])
  29 +
  30 +AS_IF([test "x$found_memalign" != "xyes"], [AC_MSG_WARN([No function for aligned memory allocation found])])
  31 +
25 AX_EXT() 32 AX_EXT()
26 33
  34 +AC_ARG_ENABLE([neon],
  35 + AS_HELP_STRING([--disable-neon], [Build without NEON optimizations]))
  36 +
  37 +AS_IF([test "x$enable_neon" != "xno"],
  38 + [noneon_CPPFLAGS=$CPPFLAGS
  39 + CPPFLAGS="$CPPFLAGS $SIMD_FLAGS"
  40 + AC_CHECK_HEADER([arm_neon.h],
  41 + [have_neon=yes],
  42 + [have_neon=no
  43 + CPPFLAGS=$noneon_CPPFLAGS])],
  44 + [have_neon=no
  45 + AS_IF([test "x$ax_cv_have_neon_ext" = "xyes"],
  46 + [SIMD_FLAGS=""])
  47 + ])
  48 +
  49 +AS_IF([test "x$have_neon" = "xno"],
  50 + [AS_IF([test "x$enable_neon" = "xyes"],
  51 + [AC_MSG_ERROR([neon requested but arm_neon.h not found])])
  52 + ])
  53 +AM_CONDITIONAL([HAVE_NEON], [test "x$have_neon" = "xyes"])
  54 +
27 AC_ARG_ENABLE([sse], 55 AC_ARG_ENABLE([sse],
28 AS_HELP_STRING([--disable-sse], [Build without SSE optimizations]), 56 AS_HELP_STRING([--disable-sse], [Build without SSE optimizations]),
29 [if test "x$enableval" = "xno" ; then 57 [if test "x$enableval" = "xno" ; then
examples/Makefile.am
1 # GF-Complete 'examples' AM file 1 # GF-Complete 'examples' AM file
2 2
3 -AM_CPPFLAGS=-I./ -I../include  
4 -AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC $(INCLUDES) 3 +AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
  4 +AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC
5 5
6 bin_PROGRAMS = gf_example_1 gf_example_2 gf_example_3 gf_example_4 \ 6 bin_PROGRAMS = gf_example_1 gf_example_2 gf_example_3 gf_example_4 \
7 gf_example_5 gf_example_6 gf_example_7 7 gf_example_5 gf_example_6 gf_example_7
include/gf_complete.h
@@ -33,6 +33,10 @@ @@ -33,6 +33,10 @@
33 #include <wmmintrin.h> 33 #include <wmmintrin.h>
34 #endif 34 #endif
35 35
  36 +#if defined(ARM_NEON)
  37 + #include <arm_neon.h>
  38 +#endif
  39 +
36 40
37 /* These are the different ways to perform multiplication. 41 /* These are the different ways to perform multiplication.
38 Not all are implemented for all values of w. 42 Not all are implemented for all values of w.
@@ -61,7 +65,9 @@ typedef enum {GF_MULT_DEFAULT, @@ -61,7 +65,9 @@ typedef enum {GF_MULT_DEFAULT,
61 #define GF_REGION_DOUBLE_TABLE (0x1) 65 #define GF_REGION_DOUBLE_TABLE (0x1)
62 #define GF_REGION_QUAD_TABLE (0x2) 66 #define GF_REGION_QUAD_TABLE (0x2)
63 #define GF_REGION_LAZY (0x4) 67 #define GF_REGION_LAZY (0x4)
  68 +#define GF_REGION_SIMD (0x8)
64 #define GF_REGION_SSE (0x8) 69 #define GF_REGION_SSE (0x8)
  70 +#define GF_REGION_NOSIMD (0x10)
65 #define GF_REGION_NOSSE (0x10) 71 #define GF_REGION_NOSSE (0x10)
66 #define GF_REGION_ALTMAP (0x20) 72 #define GF_REGION_ALTMAP (0x20)
67 #define GF_REGION_CAUCHY (0x40) 73 #define GF_REGION_CAUCHY (0x40)
include/gf_int.h
@@ -113,7 +113,7 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */ @@ -113,7 +113,7 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */
113 GF_E_DIVCOMP, /* Mult == Composite && Div != Default */ 113 GF_E_DIVCOMP, /* Mult == Composite && Div != Default */
114 GF_E_CAUCOMP, /* Mult == Composite && Reg == CAUCHY */ 114 GF_E_CAUCOMP, /* Mult == Composite && Reg == CAUCHY */
115 GF_E_DOUQUAD, /* Reg == DOUBLE && Reg == QUAD */ 115 GF_E_DOUQUAD, /* Reg == DOUBLE && Reg == QUAD */
116 - GF_E_SSE__NO, /* Reg == SSE && Reg == NOSSE */ 116 + GF_E_SIMD_NO, /* Reg == SIMD && Reg == NOSIMD */
117 GF_E_CAUCHYB, /* Reg == CAUCHY && Other Reg */ 117 GF_E_CAUCHYB, /* Reg == CAUCHY && Other Reg */
118 GF_E_CAUGT32, /* Reg == CAUCHY && w > 32*/ 118 GF_E_CAUGT32, /* Reg == CAUCHY && w > 32*/
119 GF_E_ARG1SET, /* Arg1 != 0 && Mult \notin COMPOSITE/SPLIT/GROUP */ 119 GF_E_ARG1SET, /* Arg1 != 0 && Mult \notin COMPOSITE/SPLIT/GROUP */
@@ -129,9 +129,9 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */ @@ -129,9 +129,9 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */
129 GF_E_QUAD__J, /* Reg == QUAD && other Reg */ 129 GF_E_QUAD__J, /* Reg == QUAD && other Reg */
130 GF_E_LAZY__X, /* Reg == LAZY && not DOUBLE or QUAD*/ 130 GF_E_LAZY__X, /* Reg == LAZY && not DOUBLE or QUAD*/
131 GF_E_ALTSHIF, /* Mult == Shift && Reg == ALTMAP */ 131 GF_E_ALTSHIF, /* Mult == Shift && Reg == ALTMAP */
132 - GF_E_SSESHIF, /* Mult == Shift && Reg == SSE|NOSSE */ 132 + GF_E_SSESHIF, /* Mult == Shift && Reg == SIMD|NOSIMD */
133 GF_E_ALT_CFM, /* Mult == CARRY_FREE && Reg == ALTMAP */ 133 GF_E_ALT_CFM, /* Mult == CARRY_FREE && Reg == ALTMAP */
134 - GF_E_SSE_CFM, /* Mult == CARRY_FREE && Reg == SSE|NOSSE */ 134 + GF_E_SSE_CFM, /* Mult == CARRY_FREE && Reg == SIMD|NOSIMD */
135 GF_E_PCLMULX, /* Mult == Carry_Free && No PCLMUL */ 135 GF_E_PCLMULX, /* Mult == Carry_Free && No PCLMUL */
136 GF_E_ALT_BY2, /* Mult == Bytwo_x && Reg == ALTMAP */ 136 GF_E_ALT_BY2, /* Mult == Bytwo_x && Reg == ALTMAP */
137 GF_E_BY2_SSE, /* Mult == Bytwo_x && Reg == SSE && No SSE2 */ 137 GF_E_BY2_SSE, /* Mult == Bytwo_x && Reg == SSE && No SSE2 */
@@ -148,7 +148,7 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */ @@ -148,7 +148,7 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */
148 GF_E_GR_AR_W, /* Mult == GROUP, either arg > w */ 148 GF_E_GR_AR_W, /* Mult == GROUP, either arg > w */
149 GF_E_GR____J, /* Mult == GROUP, Reg == SSE|ALTMAP|NOSSE */ 149 GF_E_GR____J, /* Mult == GROUP, Reg == SSE|ALTMAP|NOSSE */
150 GF_E_TABLE_W, /* Mult == TABLE, w too big */ 150 GF_E_TABLE_W, /* Mult == TABLE, w too big */
151 - GF_E_TAB_SSE, /* Mult == TABLE, SSE|NOSSE only apply to w == 4 */ 151 + GF_E_TAB_SSE, /* Mult == TABLE, SIMD|NOSIMD only apply to w == 4 */
152 GF_E_TABSSE3, /* Mult == TABLE, Need SSSE3 for SSE */ 152 GF_E_TABSSE3, /* Mult == TABLE, Need SSSE3 for SSE */
153 GF_E_TAB_ALT, /* Mult == TABLE, Reg == ALTMAP */ 153 GF_E_TAB_ALT, /* Mult == TABLE, Reg == ALTMAP */
154 GF_E_SP128AR, /* Mult == SPLIT, w=128, Bad arg1/arg2 */ 154 GF_E_SP128AR, /* Mult == SPLIT, w=128, Bad arg1/arg2 */
@@ -172,7 +172,7 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */ @@ -172,7 +172,7 @@ typedef enum {GF_E_MDEFDIV, /* Dev != Default && Mult == Default */
172 GF_E_SP_8__A, /* Mult == SPLIT, w=8, no ALTMAP */ 172 GF_E_SP_8__A, /* Mult == SPLIT, w=8, no ALTMAP */
173 GF_E_SP_SSE3, /* Mult == SPLIT, Need SSSE3 for SSE */ 173 GF_E_SP_SSE3, /* Mult == SPLIT, Need SSSE3 for SSE */
174 GF_E_COMP_A2, /* Mult == COMP, arg1 must be = 2 */ 174 GF_E_COMP_A2, /* Mult == COMP, arg1 must be = 2 */
175 - GF_E_COMP_SS, /* Mult == COMP, SSE|NOSSE */ 175 + GF_E_COMP_SS, /* Mult == COMP, SIMD|NOSIMD */
176 GF_E_COMP__W, /* Mult == COMP, Bad w. */ 176 GF_E_COMP__W, /* Mult == COMP, Bad w. */
177 GF_E_UNKFLAG, /* Unknown flag in create_from.... */ 177 GF_E_UNKFLAG, /* Unknown flag in create_from.... */
178 GF_E_UNKNOWN, /* Unknown mult_type. */ 178 GF_E_UNKNOWN, /* Unknown mult_type. */
include/gf_w16.h 0 → 100644
@@ -0,0 +1,66 @@ @@ -0,0 +1,66 @@
  1 +/*
  2 + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
  3 + * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
  4 + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
  5 + *
  6 + * gf_w16.h
  7 + *
  8 + * Defines and data structures for 16-bit Galois fields
  9 + */
  10 +
  11 +#ifndef GF_COMPLETE_GF_W16_H
  12 +#define GF_COMPLETE_GF_W16_H
  13 +
  14 +#include <stdint.h>
  15 +
  16 +#define GF_FIELD_WIDTH (16)
  17 +#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
  18 +#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1
  19 +
  20 +#define GF_BASE_FIELD_WIDTH (8)
  21 +#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
  22 +
  23 +struct gf_w16_logtable_data {
  24 + uint16_t log_tbl[GF_FIELD_SIZE];
  25 + uint16_t antilog_tbl[GF_FIELD_SIZE * 2];
  26 + uint16_t inv_tbl[GF_FIELD_SIZE];
  27 + uint16_t *d_antilog;
  28 +};
  29 +
  30 +struct gf_w16_zero_logtable_data {
  31 + int log_tbl[GF_FIELD_SIZE];
  32 + uint16_t _antilog_tbl[GF_FIELD_SIZE * 4];
  33 + uint16_t *antilog_tbl;
  34 + uint16_t inv_tbl[GF_FIELD_SIZE];
  35 +};
  36 +
  37 +struct gf_w16_lazytable_data {
  38 + uint16_t log_tbl[GF_FIELD_SIZE];
  39 + uint16_t antilog_tbl[GF_FIELD_SIZE * 2];
  40 + uint16_t inv_tbl[GF_FIELD_SIZE];
  41 + uint16_t *d_antilog;
  42 + uint16_t lazytable[GF_FIELD_SIZE];
  43 +};
  44 +
  45 +struct gf_w16_bytwo_data {
  46 + uint64_t prim_poly;
  47 + uint64_t mask1;
  48 + uint64_t mask2;
  49 +};
  50 +
  51 +struct gf_w16_split_8_8_data {
  52 + uint16_t tables[3][256][256];
  53 +};
  54 +
  55 +struct gf_w16_group_4_4_data {
  56 + uint16_t reduce[16];
  57 + uint16_t shift[16];
  58 +};
  59 +
  60 +struct gf_w16_composite_data {
  61 + uint8_t *mult_table;
  62 +};
  63 +
  64 +void gf_w16_neon_split_init(gf_t *gf);
  65 +
  66 +#endif /* GF_COMPLETE_GF_W16_H */
include/gf_w32.h 0 → 100644
@@ -0,0 +1,71 @@ @@ -0,0 +1,71 @@
  1 +/*
  2 + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
  3 + * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
  4 + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
  5 + *
  6 + * gf_w32.h
  7 + *
  8 + * Defines and data structures for 32-bit Galois fields
  9 + */
  10 +
  11 +#ifndef GF_COMPLETE_GF_W32_H
  12 +#define GF_COMPLETE_GF_W32_H
  13 +
  14 +#include <stdint.h>
  15 +
  16 +#define GF_FIELD_WIDTH (32)
  17 +#define GF_FIRST_BIT (1 << 31)
  18 +
  19 +#define GF_BASE_FIELD_WIDTH (16)
  20 +#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
  21 +#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1
  22 +#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1)
  23 +
  24 +struct gf_split_2_32_lazy_data {
  25 + uint32_t tables[16][4];
  26 + uint32_t last_value;
  27 +};
  28 +
  29 +struct gf_w32_split_8_8_data {
  30 + uint32_t tables[7][256][256];
  31 + uint32_t region_tables[4][256];
  32 + uint32_t last_value;
  33 +};
  34 +
  35 +struct gf_w32_group_data {
  36 + uint32_t *reduce;
  37 + uint32_t *shift;
  38 + int tshift;
  39 + uint64_t rmask;
  40 + uint32_t *memory;
  41 +};
  42 +
  43 +struct gf_split_16_32_lazy_data {
  44 + uint32_t tables[2][(1<<16)];
  45 + uint32_t last_value;
  46 +};
  47 +
  48 +struct gf_split_8_32_lazy_data {
  49 + uint32_t tables[4][256];
  50 + uint32_t last_value;
  51 +};
  52 +
  53 +struct gf_split_4_32_lazy_data {
  54 + uint32_t tables[8][16];
  55 + uint32_t last_value;
  56 +};
  57 +
  58 +struct gf_w32_bytwo_data {
  59 + uint64_t prim_poly;
  60 + uint64_t mask1;
  61 + uint64_t mask2;
  62 +};
  63 +
  64 +struct gf_w32_composite_data {
  65 + uint16_t *log;
  66 + uint16_t *alog;
  67 +};
  68 +
  69 +void gf_w32_neon_split_init(gf_t *gf);
  70 +
  71 +#endif /* GF_COMPLETE_GF_W32_H */
include/gf_w4.h 0 → 100644
@@ -0,0 +1,63 @@ @@ -0,0 +1,63 @@
  1 +/*
  2 + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
  3 + * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
  4 + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
  5 + *
  6 + * gf_w4.h
  7 + *
  8 + * Defines and data structures for 4-bit Galois fields
  9 + */
  10 +
  11 +#ifndef GF_COMPLETE_GF_W4_H
  12 +#define GF_COMPLETE_GF_W4_H
  13 +
  14 +#include <stdint.h>
  15 +
  16 +#define GF_FIELD_WIDTH 4
  17 +#define GF_DOUBLE_WIDTH (GF_FIELD_WIDTH*2)
  18 +#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
  19 +#define GF_MULT_GROUP_SIZE (GF_FIELD_SIZE-1)
  20 +
  21 +/* ------------------------------------------------------------
  22 + JSP: Each implementation has its own data, which is allocated
  23 + at one time as part of the handle. For that reason, it
  24 + shouldn't be hierarchical -- i.e. one should be able to
  25 + allocate it with one call to malloc. */
  26 +
  27 +struct gf_logtable_data {
  28 + uint8_t log_tbl[GF_FIELD_SIZE];
  29 + uint8_t antilog_tbl[GF_FIELD_SIZE * 2];
  30 + uint8_t *antilog_tbl_div;
  31 +};
  32 +
  33 +struct gf_single_table_data {
  34 + uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE];
  35 + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
  36 +};
  37 +
  38 +struct gf_double_table_data {
  39 + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
  40 + uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];
  41 +};
  42 +struct gf_quad_table_data {
  43 + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
  44 + uint16_t mult[GF_FIELD_SIZE][(1<<16)];
  45 +};
  46 +
  47 +struct gf_quad_table_lazy_data {
  48 + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
  49 + uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE];
  50 + uint16_t mult[(1 << 16)];
  51 +};
  52 +
  53 +struct gf_bytwo_data {
  54 + uint64_t prim_poly;
  55 + uint64_t mask1;
  56 + uint64_t mask2;
  57 +};
  58 +
  59 +// ARM NEON init functions
  60 +int gf_w4_neon_cfm_init(gf_t *gf);
  61 +void gf_w4_neon_single_table_init(gf_t *gf);
  62 +
  63 +#endif /* GF_COMPLETE_GF_W4_H */
include/gf_w64.h 0 → 100644
@@ -0,0 +1,50 @@ @@ -0,0 +1,50 @@
  1 +/*
  2 + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
  3 + * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
  4 + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
  5 + *
  6 + * gf_w64.h
  7 + *
  8 + * Defines and data structures for 64-bit Galois fields
  9 + */
  10 +
  11 +#ifndef GF_COMPLETE_GF_W64_H
  12 +#define GF_COMPLETE_GF_W64_H
  13 +
  14 +#include <stdint.h>
  15 +
  16 +#define GF_FIELD_WIDTH (64)
  17 +#define GF_FIRST_BIT (1ULL << 63)
  18 +
  19 +#define GF_BASE_FIELD_WIDTH (32)
  20 +#define GF_BASE_FIELD_SIZE (1ULL << GF_BASE_FIELD_WIDTH)
  21 +#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1
  22 +
  23 +struct gf_w64_group_data {
  24 + uint64_t *reduce;
  25 + uint64_t *shift;
  26 + uint64_t *memory;
  27 +};
  28 +
  29 +struct gf_split_4_64_lazy_data {
  30 + uint64_t tables[16][16];
  31 + uint64_t last_value;
  32 +};
  33 +
  34 +struct gf_split_8_64_lazy_data {
  35 + uint64_t tables[8][(1<<8)];
  36 + uint64_t last_value;
  37 +};
  38 +
  39 +struct gf_split_16_64_lazy_data {
  40 + uint64_t tables[4][(1<<16)];
  41 + uint64_t last_value;
  42 +};
  43 +
  44 +struct gf_split_8_8_data {
  45 + uint64_t tables[15][256][256];
  46 +};
  47 +
  48 +void gf_w64_neon_split_init(gf_t *gf);
  49 +
  50 +#endif /* GF_COMPLETE_GF_W64_H */
include/gf_w8.h 0 → 100644
@@ -0,0 +1,99 @@ @@ -0,0 +1,99 @@
  1 +/*
  2 + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
  3 + * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
  4 + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
  5 + *
  6 + * gf_w8.h
  7 + *
  8 + * Defines and data structures for 8-bit Galois fields
  9 + */
  10 +
  11 +#ifndef GF_COMPLETE_GF_W8_H
  12 +#define GF_COMPLETE_GF_W8_H
  13 +
  14 +#include "gf_int.h"
  15 +#include <stdint.h>
  16 +
  17 +#define GF_FIELD_WIDTH (8)
  18 +#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
  19 +#define GF_HALF_SIZE (1 << (GF_FIELD_WIDTH/2))
  20 +#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1
  21 +
  22 +#define GF_BASE_FIELD_WIDTH (4)
  23 +#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
  24 +
  25 +struct gf_w8_logtable_data {
  26 + uint8_t log_tbl[GF_FIELD_SIZE];
  27 + uint8_t antilog_tbl[GF_FIELD_SIZE * 2];
  28 + uint8_t inv_tbl[GF_FIELD_SIZE];
  29 +};
  30 +
  31 +struct gf_w8_logzero_table_data {
  32 + short log_tbl[GF_FIELD_SIZE]; /* Make this signed, so that we can divide easily */
  33 + uint8_t antilog_tbl[512+512+1];
  34 + uint8_t *div_tbl;
  35 + uint8_t *inv_tbl;
  36 +};
  37 +
  38 +struct gf_w8_logzero_small_table_data {
  39 + short log_tbl[GF_FIELD_SIZE]; /* Make this signed, so that we can divide easily */
  40 + uint8_t antilog_tbl[255*3];
  41 + uint8_t inv_tbl[GF_FIELD_SIZE];
  42 + uint8_t *div_tbl;
  43 +};
  44 +
  45 +struct gf_w8_composite_data {
  46 + uint8_t *mult_table;
  47 +};
  48 +
  49 +/* Don't change the order of these relative to gf_w8_half_table_data */
  50 +
  51 +struct gf_w8_default_data {
  52 + uint8_t high[GF_FIELD_SIZE][GF_HALF_SIZE];
  53 + uint8_t low[GF_FIELD_SIZE][GF_HALF_SIZE];
  54 + uint8_t divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
  55 + uint8_t multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
  56 +};
  57 +
  58 +struct gf_w8_half_table_data {
  59 + uint8_t high[GF_FIELD_SIZE][GF_HALF_SIZE];
  60 + uint8_t low[GF_FIELD_SIZE][GF_HALF_SIZE];
  61 +};
  62 +
  63 +struct gf_w8_single_table_data {
  64 + uint8_t divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
  65 + uint8_t multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
  66 +};
  67 +
  68 +struct gf_w8_double_table_data {
  69 + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
  70 + uint16_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];
  71 +};
  72 +
  73 +struct gf_w8_double_table_lazy_data {
  74 + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
  75 + uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE];
  76 + uint16_t mult[GF_FIELD_SIZE*GF_FIELD_SIZE];
  77 +};
  78 +
  79 +struct gf_w4_logtable_data {
  80 + uint8_t log_tbl[GF_BASE_FIELD_SIZE];
  81 + uint8_t antilog_tbl[GF_BASE_FIELD_SIZE * 2];
  82 + uint8_t *antilog_tbl_div;
  83 +};
  84 +
  85 +struct gf_w4_single_table_data {
  86 + uint8_t div[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
  87 + uint8_t mult[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
  88 +};
  89 +
  90 +struct gf_w8_bytwo_data {
  91 + uint64_t prim_poly;
  92 + uint64_t mask1;
  93 + uint64_t mask2;
  94 +};
  95 +
  96 +int gf_w8_neon_cfm_init(gf_t *gf);
  97 +void gf_w8_neon_split_init(gf_t *gf);
  98 +
  99 +#endif /* GF_COMPLETE_GF_W8_H */
@@ -41,6 +41,55 @@ AC_DEFUN([AX_EXT], @@ -41,6 +41,55 @@ AC_DEFUN([AX_EXT],
41 AC_REQUIRE([AC_CANONICAL_HOST]) 41 AC_REQUIRE([AC_CANONICAL_HOST])
42 42
43 case $host_cpu in 43 case $host_cpu in
  44 + aarch64*)
  45 + AC_DEFINE(HAVE_ARCH_AARCH64,,[targeting AArch64])
  46 + SIMD_FLAGS="$SIMD_FLAGS -DARCH_AARCH64"
  47 +
  48 + AC_CACHE_CHECK([whether NEON is supported], [ax_cv_have_neon_ext],
  49 + [
  50 + # TODO: detect / cross-compile
  51 + ax_cv_have_neon_ext=yes
  52 + ])
  53 + AC_CACHE_CHECK([whether cryptographic extension is supported], [ax_cv_have_arm_crypt_ext],
  54 + [
  55 + # TODO: detect / cross-compile
  56 + ax_cv_have_arm_crypt_ext=yes
  57 + ])
  58 +
  59 + if test "$ax_cv_have_arm_crypt_ext" = yes; then
  60 + AC_DEFINE(HAVE_ARM_CRYPT_EXT,,[Support ARM cryptographic extension])
  61 + fi
  62 +
  63 + if test "$ax_cv_have_neon_ext" = yes; then
  64 + AC_DEFINE(HAVE_NEON,,[Support NEON instructions])
  65 + fi
  66 +
  67 + if test "$ax_cv_have_arm_crypt_ext" = yes && test "$ax_cv_have_neon_ext" = yes; then
  68 + AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd+crypto,
  69 + SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+simd+crypto -DARM_CRYPT -DARM_NEON", [])
  70 + elif test "$ax_cv_have_arm_crypt_ext" = yes; then
  71 + AX_CHECK_COMPILE_FLAG(-march=armv8-a+crypto,
  72 + SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+crypto -DARM_CRYPT", [])
  73 + elif test "$ax_cv_have_neon_ext" = yes; then
  74 + AX_CHECK_COMPILE_FLAG(-march=armv8-a+simd,
  75 + SIMD_FLAGS="$SIMD_FLAGS -march=armv8-a+simd -DARM_NEON", [])
  76 + fi
  77 + ;;
  78 +
  79 + arm*)
  80 + AC_CACHE_CHECK([whether NEON is supported], [ax_cv_have_neon_ext],
  81 + [
  82 + # TODO: detect / cross-compile
  83 + ax_cv_have_neon_ext=yes
  84 + ])
  85 +
  86 + if test "$ax_cv_have_neon_ext" = yes; then
  87 + AC_DEFINE(HAVE_NEON,,[Support NEON instructions])
  88 + AX_CHECK_COMPILE_FLAG(-mfpu=neon,
  89 + SIMD_FLAGS="$SIMD_FLAGS -mfpu=neon -DARM_NEON", [])
  90 + fi
  91 + ;;
  92 +
44 powerpc*) 93 powerpc*)
45 AC_CACHE_CHECK([whether altivec is supported], [ax_cv_have_altivec_ext], 94 AC_CACHE_CHECK([whether altivec is supported], [ax_cv_have_altivec_ext],
46 [ 95 [
src/Makefile.am
1 # GF-Complete 'core' AM file 1 # GF-Complete 'core' AM file
2 # Creates the library 2 # Creates the library
3 3
4 -AM_CPPFLAGS=-I./ -I../include  
5 -AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC $(INCLUDES) 4 +AUTOMAKE_OPTIONS = subdir-objects
  5 +
  6 +AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
  7 +AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC
6 8
7 lib_LTLIBRARIES = libgf_complete.la 9 lib_LTLIBRARIES = libgf_complete.la
8 libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \ 10 libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \
9 gf_w64.c gf_w128.c gf_rand.c gf_general.c 11 gf_w64.c gf_w128.c gf_rand.c gf_general.c
  12 +
  13 +if HAVE_NEON
  14 +libgf_complete_la_SOURCES += neon/gf_w4_neon.c \
  15 + neon/gf_w8_neon.c \
  16 + neon/gf_w16_neon.c \
  17 + neon/gf_w32_neon.c \
  18 + neon/gf_w64_neon.c
  19 +endif
  20 +
10 libgf_complete_la_LDFLAGS = -version-info 1:0:0 21 libgf_complete_la_LDFLAGS = -version-info 1:0:0
11 22
@@ -41,7 +41,7 @@ void gf_error() @@ -41,7 +41,7 @@ void gf_error()
41 case GF_E_MDEFARG: s = "If multiplication method == default, can't use arg1/arg2."; break; 41 case GF_E_MDEFARG: s = "If multiplication method == default, can't use arg1/arg2."; break;
42 case GF_E_DIVCOMP: s = "Cannot change the division technique with -m COMPOSITE."; break; 42 case GF_E_DIVCOMP: s = "Cannot change the division technique with -m COMPOSITE."; break;
43 case GF_E_DOUQUAD: s = "Cannot specify -r DOUBLE and -r QUAD."; break; 43 case GF_E_DOUQUAD: s = "Cannot specify -r DOUBLE and -r QUAD."; break;
44 - case GF_E_SSE__NO: s = "Cannot specify -r SSE and -r NOSSE."; break; 44 + case GF_E_SIMD_NO: s = "Cannot specify -r SIMD and -r NOSIMD."; break;
45 case GF_E_CAUCHYB: s = "Cannot specify -r CAUCHY and any other -r."; break; 45 case GF_E_CAUCHYB: s = "Cannot specify -r CAUCHY and any other -r."; break;
46 case GF_E_CAUCOMP: s = "Cannot specify -m COMPOSITE and -r CAUCHY."; break; 46 case GF_E_CAUCOMP: s = "Cannot specify -m COMPOSITE and -r CAUCHY."; break;
47 case GF_E_CAUGT32: s = "Cannot specify -r CAUCHY with w > 32."; break; 47 case GF_E_CAUGT32: s = "Cannot specify -r CAUCHY with w > 32."; break;
@@ -51,23 +51,23 @@ void gf_error() @@ -51,23 +51,23 @@ void gf_error()
51 case GF_E_BAD___W: s = "W must be 1-32, 64 or 128."; break; 51 case GF_E_BAD___W: s = "W must be 1-32, 64 or 128."; break;
52 case GF_E_DOUBLET: s = "Can only specify -r DOUBLE with -m TABLE."; break; 52 case GF_E_DOUBLET: s = "Can only specify -r DOUBLE with -m TABLE."; break;
53 case GF_E_DOUBLEW: s = "Can only specify -r DOUBLE w = 4 or w = 8."; break; 53 case GF_E_DOUBLEW: s = "Can only specify -r DOUBLE w = 4 or w = 8."; break;
54 - case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SSE|NOSSE."; break; 54 + case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SIMD|NOSIMD."; break;
55 case GF_E_DOUBLEL: s = "Can only specify -r DOUBLE -r LAZY with w = 8"; break; 55 case GF_E_DOUBLEL: s = "Can only specify -r DOUBLE -r LAZY with w = 8"; break;
56 case GF_E_QUAD__T: s = "Can only specify -r QUAD with -m TABLE."; break; 56 case GF_E_QUAD__T: s = "Can only specify -r QUAD with -m TABLE."; break;
57 case GF_E_QUAD__W: s = "Can only specify -r QUAD w = 4."; break; 57 case GF_E_QUAD__W: s = "Can only specify -r QUAD w = 4."; break;
58 - case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SSE|NOSSE."; break; 58 + case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SIMD|NOSIMD."; break;
59 case GF_E_BADPOLY: s = "Bad primitive polynomial (high bits set)."; break; 59 case GF_E_BADPOLY: s = "Bad primitive polynomial (high bits set)."; break;
60 case GF_E_COMP_PP: s = "Bad primitive polynomial -- bigger than sub-field."; break; 60 case GF_E_COMP_PP: s = "Bad primitive polynomial -- bigger than sub-field."; break;
61 case GF_E_LAZY__X: s = "If -r LAZY, then -r must be DOUBLE or QUAD."; break; 61 case GF_E_LAZY__X: s = "If -r LAZY, then -r must be DOUBLE or QUAD."; break;
62 case GF_E_ALTSHIF: s = "Cannot specify -m SHIFT and -r ALTMAP."; break; 62 case GF_E_ALTSHIF: s = "Cannot specify -m SHIFT and -r ALTMAP."; break;
63 - case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SSE|NOSSE."; break; 63 + case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SIMD|NOSIMD."; break;
64 case GF_E_ALT_CFM: s = "Cannot specify -m CARRY_FREE and -r ALTMAP."; break; 64 case GF_E_ALT_CFM: s = "Cannot specify -m CARRY_FREE and -r ALTMAP."; break;
65 - case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SSE|NOSSE."; break; 65 + case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SIMD|NOSIMD."; break;
66 case GF_E_PCLMULX: s = "Specified -m CARRY_FREE, but PCLMUL is not supported."; break; 66 case GF_E_PCLMULX: s = "Specified -m CARRY_FREE, but PCLMUL is not supported."; break;
67 case GF_E_ALT_BY2: s = "Cannot specify -m BYTWO_x and -r ALTMAP."; break; 67 case GF_E_ALT_BY2: s = "Cannot specify -m BYTWO_x and -r ALTMAP."; break;
68 - case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SSE, but SSE2 is not supported."; break; 68 + case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SIMD, but SSE2 is not supported."; break;
69 case GF_E_LOGBADW: s = "With Log Tables, w must be <= 27."; break; 69 case GF_E_LOGBADW: s = "With Log Tables, w must be <= 27."; break;
70 - case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SSE|NOSSE."; break; 70 + case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SIMD|NOSIMD."; break;
71 case GF_E_LOGPOLY: s = "Cannot use Log tables because the polynomial is not primitive."; break; 71 case GF_E_LOGPOLY: s = "Cannot use Log tables because the polynomial is not primitive."; break;
72 case GF_E_ZERBADW: s = "With -m LOG_ZERO, w must be 8 or 16."; break; 72 case GF_E_ZERBADW: s = "With -m LOG_ZERO, w must be 8 or 16."; break;
73 case GF_E_ZEXBADW: s = "With -m LOG_ZERO_EXT, w must be 8."; break; 73 case GF_E_ZEXBADW: s = "With -m LOG_ZERO_EXT, w must be 8."; break;
@@ -77,33 +77,33 @@ void gf_error() @@ -77,33 +77,33 @@ void gf_error()
77 case GF_E_GR_128A: s = "With -m GROUP, w == 128, arg1 must be 4, and arg2 in { 4,8,16 }."; break; 77 case GF_E_GR_128A: s = "With -m GROUP, w == 128, arg1 must be 4, and arg2 in { 4,8,16 }."; break;
78 case GF_E_GR_A_27: s = "With -m GROUP, arg1 and arg2 must be <= 27."; break; 78 case GF_E_GR_A_27: s = "With -m GROUP, arg1 and arg2 must be <= 27."; break;
79 case GF_E_GR_AR_W: s = "With -m GROUP, arg1 and arg2 must be <= w."; break; 79 case GF_E_GR_AR_W: s = "With -m GROUP, arg1 and arg2 must be <= w."; break;
80 - case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SSE|NOSSE."; break; 80 + case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SIMD|NOSIMD."; break;
81 case GF_E_TABLE_W: s = "With -m TABLE, w must be < 15, or == 16."; break; 81 case GF_E_TABLE_W: s = "With -m TABLE, w must be < 15, or == 16."; break;
82 - case GF_E_TAB_SSE: s = "With -m TABLE, SSE|NOSSE only applies to w=4."; break;  
83 - case GF_E_TABSSE3: s = "With -m TABLE, -r SSE, you need SSSE3 supported."; break; 82 + case GF_E_TAB_SSE: s = "With -m TABLE, SIMD|NOSIMD only applies to w=4."; break;
  83 + case GF_E_TABSSE3: s = "With -m TABLE, -r SIMD, you need SSSE3 supported."; break;
84 case GF_E_TAB_ALT: s = "With -m TABLE, you cannot use ALTMAP."; break; 84 case GF_E_TAB_ALT: s = "With -m TABLE, you cannot use ALTMAP."; break;
85 case GF_E_SP128AR: s = "With -m SPLIT, w=128, bad arg1/arg2."; break; 85 case GF_E_SP128AR: s = "With -m SPLIT, w=128, bad arg1/arg2."; break;
86 - case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SSE requires -r ALTMAP."; break; 86 + case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SIMD requires -r ALTMAP."; break;
87 case GF_E_SP128AS: s = "With -m SPLIT, w=128, ALTMAP needs SSSE3 supported."; break; 87 case GF_E_SP128AS: s = "With -m SPLIT, w=128, ALTMAP needs SSSE3 supported."; break;
88 case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break; 88 case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break;
89 - case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r SSE|NOSSE only with arg1/arg2 = 4/128."; break; 89 + case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r SIMD|NOSIMD only with arg1/arg2 = 4/128."; break;
90 case GF_E_SPLIT_W: s = "With -m SPLIT, w must be in {8, 16, 32, 64, 128}."; break; 90 case GF_E_SPLIT_W: s = "With -m SPLIT, w must be in {8, 16, 32, 64, 128}."; break;
91 case GF_E_SP_16AR: s = "With -m SPLIT, w=16, Bad arg1/arg2."; break; 91 case GF_E_SP_16AR: s = "With -m SPLIT, w=16, Bad arg1/arg2."; break;
92 case GF_E_SP_16_A: s = "With -m SPLIT, w=16, -r ALTMAP only with arg1/arg2 = 4/16."; break; 92 case GF_E_SP_16_A: s = "With -m SPLIT, w=16, -r ALTMAP only with arg1/arg2 = 4/16."; break;
93 - case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SSE|NOSSE only with arg1/arg2 = 4/16."; break; 93 + case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SIMD|NOSIMD only with arg1/arg2 = 4/16."; break;
94 case GF_E_SP_32AR: s = "With -m SPLIT, w=32, Bad arg1/arg2."; break; 94 case GF_E_SP_32AR: s = "With -m SPLIT, w=32, Bad arg1/arg2."; break;
95 case GF_E_SP_32AS: s = "With -m SPLIT, w=32, -r ALTMAP needs SSSE3 supported."; break; 95 case GF_E_SP_32AS: s = "With -m SPLIT, w=32, -r ALTMAP needs SSSE3 supported."; break;
96 case GF_E_SP_32_A: s = "With -m SPLIT, w=32, -r ALTMAP only with arg1/arg2 = 4/32."; break; 96 case GF_E_SP_32_A: s = "With -m SPLIT, w=32, -r ALTMAP only with arg1/arg2 = 4/32."; break;
97 - case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SSE|NOSSE only with arg1/arg2 = 4/32."; break; 97 + case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SIMD|NOSIMD only with arg1/arg2 = 4/32."; break;
98 case GF_E_SP_64AR: s = "With -m SPLIT, w=64, Bad arg1/arg2."; break; 98 case GF_E_SP_64AR: s = "With -m SPLIT, w=64, Bad arg1/arg2."; break;
99 case GF_E_SP_64AS: s = "With -m SPLIT, w=64, -r ALTMAP needs SSSE3 supported."; break; 99 case GF_E_SP_64AS: s = "With -m SPLIT, w=64, -r ALTMAP needs SSSE3 supported."; break;
100 case GF_E_SP_64_A: s = "With -m SPLIT, w=64, -r ALTMAP only with arg1/arg2 = 4/64."; break; 100 case GF_E_SP_64_A: s = "With -m SPLIT, w=64, -r ALTMAP only with arg1/arg2 = 4/64."; break;
101 - case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SSE|NOSSE only with arg1/arg2 = 4/64."; break; 101 + case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SIMD|NOSIMD only with arg1/arg2 = 4/64."; break;
102 case GF_E_SP_8_AR: s = "With -m SPLIT, w=8, Bad arg1/arg2."; break; 102 case GF_E_SP_8_AR: s = "With -m SPLIT, w=8, Bad arg1/arg2."; break;
103 case GF_E_SP_8__A: s = "With -m SPLIT, w=8, Can't have -r ALTMAP."; break; 103 case GF_E_SP_8__A: s = "With -m SPLIT, w=8, Can't have -r ALTMAP."; break;
104 - case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SSE."; break; 104 + case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SIMD."; break;
105 case GF_E_COMP_A2: s = "With -m COMPOSITE, arg1 must equal 2."; break; 105 case GF_E_COMP_A2: s = "With -m COMPOSITE, arg1 must equal 2."; break;
106 - case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SSE and -r NOSSE do not apply."; break; 106 + case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SIMD and -r NOSIMD do not apply."; break;
107 case GF_E_COMP__W: s = "With -m COMPOSITE, w must be 8, 16, 32, 64 or 128."; break; 107 case GF_E_COMP__W: s = "With -m COMPOSITE, w must be 8, 16, 32, 64 or 128."; break;
108 case GF_E_UNKFLAG: s = "Unknown method flag - should be -m, -d, -r or -p."; break; 108 case GF_E_UNKFLAG: s = "Unknown method flag - should be -m, -d, -r or -p."; break;
109 case GF_E_UNKNOWN: s = "Unknown multiplication type."; break; 109 case GF_E_UNKNOWN: s = "Unknown multiplication type."; break;
@@ -182,14 +182,14 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, @@ -182,14 +182,14 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
182 int sse3 = 0; 182 int sse3 = 0;
183 int sse2 = 0; 183 int sse2 = 0;
184 int pclmul = 0; 184 int pclmul = 0;
185 - int rdouble, rquad, rlazy, rsse, rnosse, raltmap, rcauchy, tmp; 185 + int rdouble, rquad, rlazy, rsimd, rnosimd, raltmap, rcauchy, tmp;
186 gf_internal_t *sub; 186 gf_internal_t *sub;
187 187
188 rdouble = (region_type & GF_REGION_DOUBLE_TABLE); 188 rdouble = (region_type & GF_REGION_DOUBLE_TABLE);
189 rquad = (region_type & GF_REGION_QUAD_TABLE); 189 rquad = (region_type & GF_REGION_QUAD_TABLE);
190 rlazy = (region_type & GF_REGION_LAZY); 190 rlazy = (region_type & GF_REGION_LAZY);
191 - rsse = (region_type & GF_REGION_SSE);  
192 - rnosse = (region_type & GF_REGION_NOSSE); 191 + rsimd = (region_type & GF_REGION_SIMD);
  192 + rnosimd = (region_type & GF_REGION_NOSIMD);
193 raltmap = (region_type & GF_REGION_ALTMAP); 193 raltmap = (region_type & GF_REGION_ALTMAP);
194 rcauchy = (region_type & GF_REGION_CAUCHY); 194 rcauchy = (region_type & GF_REGION_CAUCHY);
195 195
@@ -201,7 +201,8 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, @@ -201,7 +201,8 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
201 } 201 }
202 202
203 tmp = ( GF_REGION_DOUBLE_TABLE | GF_REGION_QUAD_TABLE | GF_REGION_LAZY | 203 tmp = ( GF_REGION_DOUBLE_TABLE | GF_REGION_QUAD_TABLE | GF_REGION_LAZY |
204 - GF_REGION_SSE | GF_REGION_NOSSE | GF_REGION_ALTMAP | GF_REGION_CAUCHY ); 204 + GF_REGION_SIMD | GF_REGION_NOSIMD | GF_REGION_ALTMAP |
  205 + GF_REGION_CAUCHY );
205 if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; } 206 if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; }
206 207
207 #ifdef INTEL_SSE2 208 #ifdef INTEL_SSE2
@@ -216,6 +217,11 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, @@ -216,6 +217,11 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
216 pclmul = 1; 217 pclmul = 1;
217 #endif 218 #endif
218 219
  220 +#ifdef ARM_NEON
  221 + pclmul = 1;
  222 + sse3 = 1;
  223 +#endif
  224 +
219 225
220 if (w < 1 || (w > 32 && w != 64 && w != 128)) { _gf_errno = GF_E_BAD___W; return 0; } 226 if (w < 1 || (w > 32 && w != 64 && w != 128)) { _gf_errno = GF_E_BAD___W; return 0; }
221 227
@@ -230,7 +236,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, @@ -230,7 +236,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
230 return 1; 236 return 1;
231 } 237 }
232 238
233 - if (rsse && rnosse) { _gf_errno = GF_E_SSE__NO; return 0; } 239 + if (rsimd && rnosimd) { _gf_errno = GF_E_SIMD_NO; return 0; }
234 if (rcauchy && w > 32) { _gf_errno = GF_E_CAUGT32; return 0; } 240 if (rcauchy && w > 32) { _gf_errno = GF_E_CAUGT32; return 0; }
235 if (rcauchy && region_type != GF_REGION_CAUCHY) { _gf_errno = GF_E_CAUCHYB; return 0; } 241 if (rcauchy && region_type != GF_REGION_CAUCHY) { _gf_errno = GF_E_CAUCHYB; return 0; }
236 if (rcauchy && mult_type == GF_MULT_COMPOSITE) { _gf_errno = GF_E_CAUCOMP; return 0; } 242 if (rcauchy && mult_type == GF_MULT_COMPOSITE) { _gf_errno = GF_E_CAUCOMP; return 0; }
@@ -252,7 +258,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, @@ -252,7 +258,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
252 if (rquad) { _gf_errno = GF_E_DOUQUAD; return 0; } 258 if (rquad) { _gf_errno = GF_E_DOUQUAD; return 0; }
253 if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_DOUBLET; return 0; } 259 if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_DOUBLET; return 0; }
254 if (w != 4 && w != 8) { _gf_errno = GF_E_DOUBLEW; return 0; } 260 if (w != 4 && w != 8) { _gf_errno = GF_E_DOUBLEW; return 0; }
255 - if (rsse || rnosse || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; } 261 + if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; }
256 if (rlazy && w == 4) { _gf_errno = GF_E_DOUBLEL; return 0; } 262 if (rlazy && w == 4) { _gf_errno = GF_E_DOUBLEL; return 0; }
257 return 1; 263 return 1;
258 } 264 }
@@ -260,7 +266,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, @@ -260,7 +266,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
260 if (rquad) { 266 if (rquad) {
261 if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_QUAD__T; return 0; } 267 if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_QUAD__T; return 0; }
262 if (w != 4) { _gf_errno = GF_E_QUAD__W; return 0; } 268 if (w != 4) { _gf_errno = GF_E_QUAD__W; return 0; }
263 - if (rsse || rnosse || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; } 269 + if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; }
264 return 1; 270 return 1;
265 } 271 }
266 272
@@ -268,7 +274,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, @@ -268,7 +274,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
268 274
269 if (mult_type == GF_MULT_SHIFT) { 275 if (mult_type == GF_MULT_SHIFT) {
270 if (raltmap) { _gf_errno = GF_E_ALTSHIF; return 0; } 276 if (raltmap) { _gf_errno = GF_E_ALTSHIF; return 0; }
271 - if (rsse || rnosse) { _gf_errno = GF_E_SSESHIF; return 0; } 277 + if (rsimd || rnosimd) { _gf_errno = GF_E_SSESHIF; return 0; }
272 return 1; 278 return 1;
273 } 279 }
274 280
@@ -281,7 +287,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, @@ -281,7 +287,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
281 if (w == 32 && (poly & 0xfe000000)) { _gf_errno = GF_E_CF32POL; return 0; } 287 if (w == 32 && (poly & 0xfe000000)) { _gf_errno = GF_E_CF32POL; return 0; }
282 if (w == 64 && (poly & 0xfffe000000000000ULL)) { _gf_errno = GF_E_CF64POL; return 0; } 288 if (w == 64 && (poly & 0xfffe000000000000ULL)) { _gf_errno = GF_E_CF64POL; return 0; }
283 if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; } 289 if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; }
284 - if (rsse || rnosse) { _gf_errno = GF_E_SSE_CFM; return 0; } 290 + if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; }
285 if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; } 291 if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; }
286 return 1; 292 return 1;
287 } 293 }
@@ -290,21 +296,21 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, @@ -290,21 +296,21 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
290 if (w != 4 && w != 8 && w != 16 && 296 if (w != 4 && w != 8 && w != 16 &&
291 w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; } 297 w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; }
292 if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; } 298 if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; }
293 - if (rsse || rnosse) { _gf_errno = GF_E_SSE_CFM; return 0; } 299 + if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; }
294 if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; } 300 if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; }
295 return 1; 301 return 1;
296 } 302 }
297 303
298 if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) { 304 if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) {
299 if (raltmap) { _gf_errno = GF_E_ALT_BY2; return 0; } 305 if (raltmap) { _gf_errno = GF_E_ALT_BY2; return 0; }
300 - if (rsse && !sse2) { _gf_errno = GF_E_BY2_SSE; return 0; } 306 + if (rsimd && !sse2) { _gf_errno = GF_E_BY2_SSE; return 0; }
301 return 1; 307 return 1;
302 } 308 }
303 309
304 if (mult_type == GF_MULT_LOG_TABLE || mult_type == GF_MULT_LOG_ZERO 310 if (mult_type == GF_MULT_LOG_TABLE || mult_type == GF_MULT_LOG_ZERO
305 || mult_type == GF_MULT_LOG_ZERO_EXT ) { 311 || mult_type == GF_MULT_LOG_ZERO_EXT ) {
306 if (w > 27) { _gf_errno = GF_E_LOGBADW; return 0; } 312 if (w > 27) { _gf_errno = GF_E_LOGBADW; return 0; }
307 - if (raltmap || rsse || rnosse) { _gf_errno = GF_E_LOG___J; return 0; } 313 + if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_LOG___J; return 0; }
308 314
309 if (mult_type == GF_MULT_LOG_TABLE) return 1; 315 if (mult_type == GF_MULT_LOG_TABLE) return 1;
310 316
@@ -324,14 +330,14 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, @@ -324,14 +330,14 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
324 (arg2 != 4 && arg2 != 8 && arg2 != 16))) { _gf_errno = GF_E_GR_128A; return 0; } 330 (arg2 != 4 && arg2 != 8 && arg2 != 16))) { _gf_errno = GF_E_GR_128A; return 0; }
325 if (arg1 > 27 || arg2 > 27) { _gf_errno = GF_E_GR_A_27; return 0; } 331 if (arg1 > 27 || arg2 > 27) { _gf_errno = GF_E_GR_A_27; return 0; }
326 if (arg1 > w || arg2 > w) { _gf_errno = GF_E_GR_AR_W; return 0; } 332 if (arg1 > w || arg2 > w) { _gf_errno = GF_E_GR_AR_W; return 0; }
327 - if (raltmap || rsse || rnosse) { _gf_errno = GF_E_GR____J; return 0; } 333 + if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_GR____J; return 0; }
328 return 1; 334 return 1;
329 } 335 }
330 336
331 if (mult_type == GF_MULT_TABLE) { 337 if (mult_type == GF_MULT_TABLE) {
332 if (w != 16 && w >= 15) { _gf_errno = GF_E_TABLE_W; return 0; } 338 if (w != 16 && w >= 15) { _gf_errno = GF_E_TABLE_W; return 0; }
333 - if (w != 4 && (rsse || rnosse)) { _gf_errno = GF_E_TAB_SSE; return 0; }  
334 - if (rsse && !sse3) { _gf_errno = GF_E_TABSSE3; return 0; } 339 + if (w != 4 && (rsimd || rnosimd)) { _gf_errno = GF_E_TAB_SSE; return 0; }
  340 + if (rsimd && !sse3) { _gf_errno = GF_E_TABSSE3; return 0; }
335 if (raltmap) { _gf_errno = GF_E_TAB_ALT; return 0; } 341 if (raltmap) { _gf_errno = GF_E_TAB_ALT; return 0; }
336 return 1; 342 return 1;
337 } 343 }
@@ -344,46 +350,46 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, @@ -344,46 +350,46 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
344 } 350 }
345 if (w == 8) { 351 if (w == 8) {
346 if (arg1 != 4 || arg2 != 8) { _gf_errno = GF_E_SP_8_AR; return 0; } 352 if (arg1 != 4 || arg2 != 8) { _gf_errno = GF_E_SP_8_AR; return 0; }
347 - if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } 353 + if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
348 if (raltmap) { _gf_errno = GF_E_SP_8__A; return 0; } 354 if (raltmap) { _gf_errno = GF_E_SP_8__A; return 0; }
349 } else if (w == 16) { 355 } else if (w == 16) {
350 if ((arg1 == 8 && arg2 == 8) || 356 if ((arg1 == 8 && arg2 == 8) ||
351 (arg1 == 8 && arg2 == 16)) { 357 (arg1 == 8 && arg2 == 16)) {
352 - if (rsse || rnosse) { _gf_errno = GF_E_SP_16_S; return 0; } 358 + if (rsimd || rnosimd) { _gf_errno = GF_E_SP_16_S; return 0; }
353 if (raltmap) { _gf_errno = GF_E_SP_16_A; return 0; } 359 if (raltmap) { _gf_errno = GF_E_SP_16_A; return 0; }
354 } else if (arg1 == 4 && arg2 == 16) { 360 } else if (arg1 == 4 && arg2 == 16) {
355 - if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } 361 + if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
356 } else { _gf_errno = GF_E_SP_16AR; return 0; } 362 } else { _gf_errno = GF_E_SP_16AR; return 0; }
357 } else if (w == 32) { 363 } else if (w == 32) {
358 if ((arg1 == 8 && arg2 == 8) || 364 if ((arg1 == 8 && arg2 == 8) ||
359 (arg1 == 8 && arg2 == 32) || 365 (arg1 == 8 && arg2 == 32) ||
360 (arg1 == 16 && arg2 == 32)) { 366 (arg1 == 16 && arg2 == 32)) {
361 - if (rsse || rnosse) { _gf_errno = GF_E_SP_32_S; return 0; } 367 + if (rsimd || rnosimd) { _gf_errno = GF_E_SP_32_S; return 0; }
362 if (raltmap) { _gf_errno = GF_E_SP_32_A; return 0; } 368 if (raltmap) { _gf_errno = GF_E_SP_32_A; return 0; }
363 } else if (arg1 == 4 && arg2 == 32) { 369 } else if (arg1 == 4 && arg2 == 32) {
364 - if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } 370 + if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
365 if (raltmap && !sse3) { _gf_errno = GF_E_SP_32AS; return 0; } 371 if (raltmap && !sse3) { _gf_errno = GF_E_SP_32AS; return 0; }
366 - if (raltmap && rnosse) { _gf_errno = GF_E_SP_32AS; return 0; } 372 + if (raltmap && rnosimd) { _gf_errno = GF_E_SP_32AS; return 0; }
367 } else { _gf_errno = GF_E_SP_32AR; return 0; } 373 } else { _gf_errno = GF_E_SP_32AR; return 0; }
368 } else if (w == 64) { 374 } else if (w == 64) {
369 if ((arg1 == 8 && arg2 == 8) || 375 if ((arg1 == 8 && arg2 == 8) ||
370 (arg1 == 8 && arg2 == 64) || 376 (arg1 == 8 && arg2 == 64) ||
371 (arg1 == 16 && arg2 == 64)) { 377 (arg1 == 16 && arg2 == 64)) {
372 - if (rsse || rnosse) { _gf_errno = GF_E_SP_64_S; return 0; } 378 + if (rsimd || rnosimd) { _gf_errno = GF_E_SP_64_S; return 0; }
373 if (raltmap) { _gf_errno = GF_E_SP_64_A; return 0; } 379 if (raltmap) { _gf_errno = GF_E_SP_64_A; return 0; }
374 } else if (arg1 == 4 && arg2 == 64) { 380 } else if (arg1 == 4 && arg2 == 64) {
375 - if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } 381 + if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
376 if (raltmap && !sse3) { _gf_errno = GF_E_SP_64AS; return 0; } 382 if (raltmap && !sse3) { _gf_errno = GF_E_SP_64AS; return 0; }
377 - if (raltmap && rnosse) { _gf_errno = GF_E_SP_64AS; return 0; } 383 + if (raltmap && rnosimd) { _gf_errno = GF_E_SP_64AS; return 0; }
378 } else { _gf_errno = GF_E_SP_64AR; return 0; } 384 } else { _gf_errno = GF_E_SP_64AR; return 0; }
379 } else if (w == 128) { 385 } else if (w == 128) {
380 if (arg1 == 8 && arg2 == 128) { 386 if (arg1 == 8 && arg2 == 128) {
381 - if (rsse || rnosse) { _gf_errno = GF_E_SP128_S; return 0; } 387 + if (rsimd || rnosimd) { _gf_errno = GF_E_SP128_S; return 0; }
382 if (raltmap) { _gf_errno = GF_E_SP128_A; return 0; } 388 if (raltmap) { _gf_errno = GF_E_SP128_A; return 0; }
383 } else if (arg1 == 4 && arg2 == 128) { 389 } else if (arg1 == 4 && arg2 == 128) {
384 - if (rsse && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; } 390 + if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
385 if (raltmap && !sse3) { _gf_errno = GF_E_SP128AS; return 0; } 391 if (raltmap && !sse3) { _gf_errno = GF_E_SP128AS; return 0; }
386 - if (raltmap && rnosse) { _gf_errno = GF_E_SP128AS; return 0; } 392 + if (raltmap && rnosimd) { _gf_errno = GF_E_SP128AS; return 0; }
387 } else { _gf_errno = GF_E_SP128AR; return 0; } 393 } else { _gf_errno = GF_E_SP128AR; return 0; }
388 } else { _gf_errno = GF_E_SPLIT_W; return 0; } 394 } else { _gf_errno = GF_E_SPLIT_W; return 0; }
389 return 1; 395 return 1;
@@ -395,7 +401,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type, @@ -395,7 +401,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
395 if (w < 128 && (poly >> (w/2)) != 0) { _gf_errno = GF_E_COMP_PP; return 0; } 401 if (w < 128 && (poly >> (w/2)) != 0) { _gf_errno = GF_E_COMP_PP; return 0; }
396 if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_DIVCOMP; return 0; } 402 if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_DIVCOMP; return 0; }
397 if (arg1 != 2) { _gf_errno = GF_E_COMP_A2; return 0; } 403 if (arg1 != 2) { _gf_errno = GF_E_COMP_A2; return 0; }
398 - if (rsse || rnosse) { _gf_errno = GF_E_COMP_SS; return 0; } 404 + if (rsimd || rnosimd) { _gf_errno = GF_E_COMP_SS; return 0; }
399 if (base != NULL) { 405 if (base != NULL) {
400 sub = (gf_internal_t *) base->scratch; 406 sub = (gf_internal_t *) base->scratch;
401 if (sub->w != w/2) { _gf_errno = GF_E_BASE__W; return 0; } 407 if (sub->w != w/2) { _gf_errno = GF_E_BASE__W; return 0; }
@@ -953,7 +959,42 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor) @@ -953,7 +959,42 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor)
953 } 959 }
954 return; 960 return;
955 #endif 961 #endif
  962 +#if defined(ARM_NEON)
  963 + s8 = (uint8_t *) src;
  964 + d8 = (uint8_t *) dest;
956 965
  966 + if (uls % 16 == uld % 16) {
  967 + gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
  968 + while (s8 != rd.s_start) {
  969 + *d8 ^= *s8;
  970 + s8++;
  971 + d8++;
  972 + }
  973 + while (s8 < (uint8_t *) rd.s_top) {
  974 + uint8x16_t vs = vld1q_u8 (s8);
  975 + uint8x16_t vd = vld1q_u8 (d8);
  976 + uint8x16_t vr = veorq_u8 (vs, vd);
  977 + vst1q_u8 (d8, vr);
  978 + s8 += 16;
  979 + d8 += 16;
  980 + }
  981 + } else {
  982 + while (s8 + 15 < (uint8_t *) src + bytes) {
  983 + uint8x16_t vs = vld1q_u8 (s8);
  984 + uint8x16_t vd = vld1q_u8 (d8);
  985 + uint8x16_t vr = veorq_u8 (vs, vd);
  986 + vst1q_u8 (d8, vr);
  987 + s8 += 16;
  988 + d8 += 16;
  989 + }
  990 + }
  991 + while (s8 < (uint8_t *) src + bytes) {
  992 + *d8 ^= *s8;
  993 + s8++;
  994 + d8++;
  995 + }
  996 + return;
  997 +#endif
957 if (uls % 8 != uld % 8) { 998 if (uls % 8 != uld % 8) {
958 gf_unaligned_xor(src, dest, bytes); 999 gf_unaligned_xor(src, dest, bytes);
959 return; 1000 return;
src/gf_method.c
@@ -121,11 +121,17 @@ int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting) @@ -121,11 +121,17 @@ int create_gf_from_argv(gf_t *gf, int w, int argc, char **argv, int starting)
121 } else if (strcmp(argv[starting], "LAZY") == 0) { 121 } else if (strcmp(argv[starting], "LAZY") == 0) {
122 region_type |= GF_REGION_LAZY; 122 region_type |= GF_REGION_LAZY;
123 starting++; 123 starting++;
  124 + } else if (strcmp(argv[starting], "SIMD") == 0) {
  125 + region_type |= GF_REGION_SIMD;
  126 + starting++;
  127 + } else if (strcmp(argv[starting], "NOSIMD") == 0) {
  128 + region_type |= GF_REGION_NOSIMD;
  129 + starting++;
124 } else if (strcmp(argv[starting], "SSE") == 0) { 130 } else if (strcmp(argv[starting], "SSE") == 0) {
125 - region_type |= GF_REGION_SSE; 131 + region_type |= GF_REGION_SIMD;
126 starting++; 132 starting++;
127 } else if (strcmp(argv[starting], "NOSSE") == 0) { 133 } else if (strcmp(argv[starting], "NOSSE") == 0) {
128 - region_type |= GF_REGION_NOSSE; 134 + region_type |= GF_REGION_NOSIMD;
129 starting++; 135 starting++;
130 } else if (strcmp(argv[starting], "CAUCHY") == 0) { 136 } else if (strcmp(argv[starting], "CAUCHY") == 0) {
131 region_type |= GF_REGION_CAUCHY; 137 region_type |= GF_REGION_CAUCHY;
@@ -1527,7 +1527,7 @@ int gf_w128_split_init(gf_t *gf) @@ -1527,7 +1527,7 @@ int gf_w128_split_init(gf_t *gf)
1527 1527
1528 gf->multiply.w128 = gf_w128_bytwo_p_multiply; 1528 gf->multiply.w128 = gf_w128_bytwo_p_multiply;
1529 #if defined(INTEL_SSE4_PCLMUL) 1529 #if defined(INTEL_SSE4_PCLMUL)
1530 - if (!(h->region_type & GF_REGION_NOSSE)){ 1530 + if (!(h->region_type & GF_REGION_NOSIMD)){
1531 gf->multiply.w128 = gf_w128_clm_multiply; 1531 gf->multiply.w128 = gf_w128_clm_multiply;
1532 } 1532 }
1533 #endif 1533 #endif
@@ -1546,7 +1546,7 @@ int gf_w128_split_init(gf_t *gf) @@ -1546,7 +1546,7 @@ int gf_w128_split_init(gf_t *gf)
1546 if((h->region_type & GF_REGION_ALTMAP)) 1546 if((h->region_type & GF_REGION_ALTMAP))
1547 { 1547 {
1548 #ifdef INTEL_SSE4 1548 #ifdef INTEL_SSE4
1549 - if(!(h->region_type & GF_REGION_NOSSE)) 1549 + if(!(h->region_type & GF_REGION_NOSIMD))
1550 gf->multiply_region.w128 = gf_w128_split_4_128_sse_altmap_multiply_region; 1550 gf->multiply_region.w128 = gf_w128_split_4_128_sse_altmap_multiply_region;
1551 else 1551 else
1552 return 0; 1552 return 0;
@@ -1556,7 +1556,7 @@ int gf_w128_split_init(gf_t *gf) @@ -1556,7 +1556,7 @@ int gf_w128_split_init(gf_t *gf)
1556 } 1556 }
1557 else { 1557 else {
1558 #ifdef INTEL_SSE4 1558 #ifdef INTEL_SSE4
1559 - if(!(h->region_type & GF_REGION_NOSSE)) 1559 + if(!(h->region_type & GF_REGION_NOSIMD))
1560 gf->multiply_region.w128 = gf_w128_split_4_128_sse_multiply_region; 1560 gf->multiply_region.w128 = gf_w128_split_4_128_sse_multiply_region;
1561 else 1561 else
1562 gf->multiply_region.w128 = gf_w128_split_4_128_multiply_region; 1562 gf->multiply_region.w128 = gf_w128_split_4_128_multiply_region;
@@ -11,54 +11,7 @@ @@ -11,54 +11,7 @@
11 #include "gf_int.h" 11 #include "gf_int.h"
12 #include <stdio.h> 12 #include <stdio.h>
13 #include <stdlib.h> 13 #include <stdlib.h>
14 -  
15 -#define GF_FIELD_WIDTH (16)  
16 -#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)  
17 -#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1  
18 -  
19 -#define GF_BASE_FIELD_WIDTH (8)  
20 -#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)  
21 -  
22 -struct gf_w16_logtable_data {  
23 - uint16_t log_tbl[GF_FIELD_SIZE];  
24 - uint16_t antilog_tbl[GF_FIELD_SIZE * 2];  
25 - uint16_t inv_tbl[GF_FIELD_SIZE];  
26 - uint16_t *d_antilog;  
27 -};  
28 -  
29 -struct gf_w16_zero_logtable_data {  
30 - int log_tbl[GF_FIELD_SIZE];  
31 - uint16_t _antilog_tbl[GF_FIELD_SIZE * 4];  
32 - uint16_t *antilog_tbl;  
33 - uint16_t inv_tbl[GF_FIELD_SIZE];  
34 -};  
35 -  
36 -struct gf_w16_lazytable_data {  
37 - uint16_t log_tbl[GF_FIELD_SIZE];  
38 - uint16_t antilog_tbl[GF_FIELD_SIZE * 2];  
39 - uint16_t inv_tbl[GF_FIELD_SIZE];  
40 - uint16_t *d_antilog;  
41 - uint16_t lazytable[GF_FIELD_SIZE];  
42 -};  
43 -  
44 -struct gf_w16_bytwo_data {  
45 - uint64_t prim_poly;  
46 - uint64_t mask1;  
47 - uint64_t mask2;  
48 -};  
49 -  
50 -struct gf_w16_split_8_8_data {  
51 - uint16_t tables[3][256][256];  
52 -};  
53 -  
54 -struct gf_w16_group_4_4_data {  
55 - uint16_t reduce[16];  
56 - uint16_t shift[16];  
57 -};  
58 -  
59 -struct gf_w16_composite_data {  
60 - uint8_t *mult_table;  
61 -}; 14 +#include "gf_w16.h"
62 15
63 #define AB2(ip, am1 ,am2, b, t1, t2) {\ 16 #define AB2(ip, am1 ,am2, b, t1, t2) {\
64 t1 = (b << 1) & am1;\ 17 t1 = (b << 1) & am1;\
@@ -1264,6 +1217,7 @@ int gf_w16_split_init(gf_t *gf) @@ -1264,6 +1217,7 @@ int gf_w16_split_init(gf_t *gf)
1264 gf_internal_t *h; 1217 gf_internal_t *h;
1265 struct gf_w16_split_8_8_data *d8; 1218 struct gf_w16_split_8_8_data *d8;
1266 int i, j, exp, issse3; 1219 int i, j, exp, issse3;
  1220 + int isneon = 0;
1267 uint32_t p, basep; 1221 uint32_t p, basep;
1268 1222
1269 h = (gf_internal_t *) gf->scratch; 1223 h = (gf_internal_t *) gf->scratch;
@@ -1273,6 +1227,9 @@ int gf_w16_split_init(gf_t *gf) @@ -1273,6 +1227,9 @@ int gf_w16_split_init(gf_t *gf)
1273 #else 1227 #else
1274 issse3 = 0; 1228 issse3 = 0;
1275 #endif 1229 #endif
  1230 +#ifdef ARM_NEON
  1231 + isneon = 1;
  1232 +#endif
1276 1233
1277 if (h->arg1 == 8 && h->arg2 == 8) { 1234 if (h->arg1 == 8 && h->arg2 == 8) {
1278 d8 = (struct gf_w16_split_8_8_data *) h->private; 1235 d8 = (struct gf_w16_split_8_8_data *) h->private;
@@ -1317,6 +1274,10 @@ int gf_w16_split_init(gf_t *gf) @@ -1317,6 +1274,10 @@ int gf_w16_split_init(gf_t *gf)
1317 1274
1318 if (issse3) { 1275 if (issse3) {
1319 gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_multiply_region; 1276 gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_multiply_region;
  1277 + } else if (isneon) {
  1278 +#ifdef ARM_NEON
  1279 + gf_w16_neon_split_init(gf);
  1280 +#endif
1320 } else { 1281 } else {
1321 gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; 1282 gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region;
1322 } 1283 }
@@ -1326,15 +1287,15 @@ int gf_w16_split_init(gf_t *gf) @@ -1326,15 +1287,15 @@ int gf_w16_split_init(gf_t *gf)
1326 gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region; 1287 gf->multiply_region.w32 = gf_w16_split_8_16_lazy_multiply_region;
1327 1288
1328 } else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) { 1289 } else if ((h->arg1 == 4 && h->arg2 == 16) || (h->arg2 == 4 && h->arg1 == 16)) {
1329 - if (issse3) {  
1330 - if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSSE) 1290 + if (issse3 || isneon) {
  1291 + if(h->region_type & GF_REGION_ALTMAP && h->region_type & GF_REGION_NOSIMD)
1331 gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region; 1292 gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region;
1332 - else if(h->region_type & GF_REGION_NOSSE) 1293 + else if(h->region_type & GF_REGION_NOSIMD)
1333 gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region; 1294 gf->multiply_region.w32 = gf_w16_split_4_16_lazy_multiply_region;
1334 - else if(h->region_type & GF_REGION_ALTMAP) 1295 + else if(h->region_type & GF_REGION_ALTMAP && issse3)
1335 gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_altmap_multiply_region; 1296 gf->multiply_region.w32 = gf_w16_split_4_16_lazy_sse_altmap_multiply_region;
1336 } else { 1297 } else {
1337 - if(h->region_type & GF_REGION_SSE) 1298 + if(h->region_type & GF_REGION_SIMD)
1338 return 0; 1299 return 0;
1339 else if(h->region_type & GF_REGION_ALTMAP) 1300 else if(h->region_type & GF_REGION_ALTMAP)
1340 gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region; 1301 gf->multiply_region.w32 = gf_w16_split_4_16_lazy_nosse_altmap_multiply_region;
@@ -1884,25 +1845,25 @@ int gf_w16_bytwo_init(gf_t *gf) @@ -1884,25 +1845,25 @@ int gf_w16_bytwo_init(gf_t *gf)
1884 if (h->mult_type == GF_MULT_BYTWO_p) { 1845 if (h->mult_type == GF_MULT_BYTWO_p) {
1885 gf->multiply.w32 = gf_w16_bytwo_p_multiply; 1846 gf->multiply.w32 = gf_w16_bytwo_p_multiply;
1886 #ifdef INTEL_SSE2 1847 #ifdef INTEL_SSE2
1887 - if (h->region_type & GF_REGION_NOSSE) 1848 + if (h->region_type & GF_REGION_NOSIMD)
1888 gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region; 1849 gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region;
1889 else 1850 else
1890 gf->multiply_region.w32 = gf_w16_bytwo_p_sse_multiply_region; 1851 gf->multiply_region.w32 = gf_w16_bytwo_p_sse_multiply_region;
1891 #else 1852 #else
1892 gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region; 1853 gf->multiply_region.w32 = gf_w16_bytwo_p_nosse_multiply_region;
1893 - if(h->region_type & GF_REGION_SSE) 1854 + if(h->region_type & GF_REGION_SIMD)
1894 return 0; 1855 return 0;
1895 #endif 1856 #endif
1896 } else { 1857 } else {
1897 gf->multiply.w32 = gf_w16_bytwo_b_multiply; 1858 gf->multiply.w32 = gf_w16_bytwo_b_multiply;
1898 #ifdef INTEL_SSE2 1859 #ifdef INTEL_SSE2
1899 - if (h->region_type & GF_REGION_NOSSE) 1860 + if (h->region_type & GF_REGION_NOSIMD)
1900 gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region; 1861 gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region;
1901 else 1862 else
1902 gf->multiply_region.w32 = gf_w16_bytwo_b_sse_multiply_region; 1863 gf->multiply_region.w32 = gf_w16_bytwo_b_sse_multiply_region;
1903 #else 1864 #else
1904 gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region; 1865 gf->multiply_region.w32 = gf_w16_bytwo_b_nosse_multiply_region;
1905 - if(h->region_type & GF_REGION_SSE) 1866 + if(h->region_type & GF_REGION_SIMD)
1906 return 0; 1867 return 0;
1907 #endif 1868 #endif
1908 } 1869 }
@@ -12,59 +12,7 @@ @@ -12,59 +12,7 @@
12 #include "gf_int.h" 12 #include "gf_int.h"
13 #include <stdio.h> 13 #include <stdio.h>
14 #include <stdlib.h> 14 #include <stdlib.h>
15 -  
16 -#define GF_FIELD_WIDTH (32)  
17 -#define GF_FIRST_BIT (1 << 31)  
18 -  
19 -#define GF_BASE_FIELD_WIDTH (16)  
20 -#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)  
21 -#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1  
22 -#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1)  
23 -  
24 -struct gf_split_2_32_lazy_data {  
25 - uint32_t tables[16][4];  
26 - uint32_t last_value;  
27 -};  
28 -  
29 -struct gf_w32_split_8_8_data {  
30 - uint32_t tables[7][256][256];  
31 - uint32_t region_tables[4][256];  
32 - uint32_t last_value;  
33 -};  
34 -  
35 -struct gf_w32_group_data {  
36 - uint32_t *reduce;  
37 - uint32_t *shift;  
38 - int tshift;  
39 - uint64_t rmask;  
40 - uint32_t *memory;  
41 -};  
42 -  
43 -struct gf_split_16_32_lazy_data {  
44 - uint32_t tables[2][(1<<16)];  
45 - uint32_t last_value;  
46 -};  
47 -  
48 -struct gf_split_8_32_lazy_data {  
49 - uint32_t tables[4][256];  
50 - uint32_t last_value;  
51 -};  
52 -  
53 -struct gf_split_4_32_lazy_data {  
54 - uint32_t tables[8][16];  
55 - uint32_t last_value;  
56 -};  
57 -  
58 -struct gf_w32_bytwo_data {  
59 - uint64_t prim_poly;  
60 - uint64_t mask1;  
61 - uint64_t mask2;  
62 -};  
63 -  
64 -struct gf_w32_composite_data {  
65 - uint16_t *log;  
66 - uint16_t *alog;  
67 -}; 15 +#include "gf_w32.h"
68 16
69 #define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); } 17 #define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); }
70 18
@@ -1434,25 +1382,25 @@ int gf_w32_bytwo_init(gf_t *gf) @@ -1434,25 +1382,25 @@ int gf_w32_bytwo_init(gf_t *gf)
1434 if (h->mult_type == GF_MULT_BYTWO_p) { 1382 if (h->mult_type == GF_MULT_BYTWO_p) {
1435 gf->multiply.w32 = gf_w32_bytwo_p_multiply; 1383 gf->multiply.w32 = gf_w32_bytwo_p_multiply;
1436 #ifdef INTEL_SSE2 1384 #ifdef INTEL_SSE2
1437 - if (h->region_type & GF_REGION_NOSSE) 1385 + if (h->region_type & GF_REGION_NOSIMD)
1438 gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region; 1386 gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region;
1439 else 1387 else
1440 gf->multiply_region.w32 = gf_w32_bytwo_p_sse_multiply_region; 1388 gf->multiply_region.w32 = gf_w32_bytwo_p_sse_multiply_region;
1441 #else 1389 #else
1442 gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region; 1390 gf->multiply_region.w32 = gf_w32_bytwo_p_nosse_multiply_region;
1443 - if(h->region_type & GF_REGION_SSE) 1391 + if(h->region_type & GF_REGION_SIMD)
1444 return 0; 1392 return 0;
1445 #endif 1393 #endif
1446 } else { 1394 } else {
1447 gf->multiply.w32 = gf_w32_bytwo_b_multiply; 1395 gf->multiply.w32 = gf_w32_bytwo_b_multiply;
1448 #ifdef INTEL_SSE2 1396 #ifdef INTEL_SSE2
1449 - if (h->region_type & GF_REGION_NOSSE) 1397 + if (h->region_type & GF_REGION_NOSIMD)
1450 gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region; 1398 gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region;
1451 else 1399 else
1452 gf->multiply_region.w32 = gf_w32_bytwo_b_sse_multiply_region; 1400 gf->multiply_region.w32 = gf_w32_bytwo_b_sse_multiply_region;
1453 #else 1401 #else
1454 gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region; 1402 gf->multiply_region.w32 = gf_w32_bytwo_b_nosse_multiply_region;
1455 - if(h->region_type & GF_REGION_SSE) 1403 + if(h->region_type & GF_REGION_SIMD)
1456 return 0; 1404 return 0;
1457 #endif 1405 #endif
1458 } 1406 }
@@ -2283,6 +2231,7 @@ int gf_w32_split_init(gf_t *gf) @@ -2283,6 +2231,7 @@ int gf_w32_split_init(gf_t *gf)
2283 struct gf_split_16_32_lazy_data *d16; 2231 struct gf_split_16_32_lazy_data *d16;
2284 uint32_t p, basep; 2232 uint32_t p, basep;
2285 int i, j, exp, ispclmul, issse3; 2233 int i, j, exp, ispclmul, issse3;
  2234 + int isneon = 0;
2286 2235
2287 #if defined(INTEL_SSE4_PCLMUL) 2236 #if defined(INTEL_SSE4_PCLMUL)
2288 ispclmul = 1; 2237 ispclmul = 1;
@@ -2295,6 +2244,9 @@ int gf_w32_split_init(gf_t *gf) @@ -2295,6 +2244,9 @@ int gf_w32_split_init(gf_t *gf)
2295 #else 2244 #else
2296 issse3 = 0; 2245 issse3 = 0;
2297 #endif 2246 #endif
  2247 +#ifdef ARM_NEON
  2248 + isneon = 1;
  2249 +#endif
2298 2250
2299 h = (gf_internal_t *) gf->scratch; 2251 h = (gf_internal_t *) gf->scratch;
2300 2252
@@ -2335,13 +2287,13 @@ int gf_w32_split_init(gf_t *gf) @@ -2335,13 +2287,13 @@ int gf_w32_split_init(gf_t *gf)
2335 ld2 = (struct gf_split_2_32_lazy_data *) h->private; 2287 ld2 = (struct gf_split_2_32_lazy_data *) h->private;
2336 ld2->last_value = 0; 2288 ld2->last_value = 0;
2337 #ifdef INTEL_SSSE3 2289 #ifdef INTEL_SSSE3
2338 - if (!(h->region_type & GF_REGION_NOSSE)) 2290 + if (!(h->region_type & GF_REGION_NOSIMD))
2339 gf->multiply_region.w32 = gf_w32_split_2_32_lazy_sse_multiply_region; 2291 gf->multiply_region.w32 = gf_w32_split_2_32_lazy_sse_multiply_region;
2340 else 2292 else
2341 gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region; 2293 gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region;
2342 #else 2294 #else
2343 gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region; 2295 gf->multiply_region.w32 = gf_w32_split_2_32_lazy_multiply_region;
2344 - if(h->region_type & GF_REGION_SSE) return 0; 2296 + if(h->region_type & GF_REGION_SIMD) return 0;
2345 #endif 2297 #endif
2346 return 1; 2298 return 1;
2347 } 2299 }
@@ -2349,11 +2301,15 @@ int gf_w32_split_init(gf_t *gf) @@ -2349,11 +2301,15 @@ int gf_w32_split_init(gf_t *gf)
2349 /* 4/32 or Default + SSE - There is no ALTMAP/NOSSE. */ 2301 /* 4/32 or Default + SSE - There is no ALTMAP/NOSSE. */
2350 2302
2351 if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4) || 2303 if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4) ||
2352 - (issse3 && h->mult_type == GF_REGION_DEFAULT)) { 2304 + ((issse3 || isneon) && h->mult_type == GF_REGION_DEFAULT)) {
2353 ld4 = (struct gf_split_4_32_lazy_data *) h->private; 2305 ld4 = (struct gf_split_4_32_lazy_data *) h->private;
2354 ld4->last_value = 0; 2306 ld4->last_value = 0;
2355 - if ((h->region_type & GF_REGION_NOSSE) || !issse3) { 2307 + if ((h->region_type & GF_REGION_NOSIMD) || !(issse3 || isneon)) {
2356 gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region; 2308 gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region;
  2309 + } else if (isneon) {
  2310 +#ifdef ARM_NEON
  2311 + gf_w32_neon_split_init(gf);
  2312 +#endif
2357 } else if (h->region_type & GF_REGION_ALTMAP) { 2313 } else if (h->region_type & GF_REGION_ALTMAP) {
2358 gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_altmap_multiply_region; 2314 gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_altmap_multiply_region;
2359 } else { 2315 } else {
@@ -2731,10 +2687,14 @@ int gf_w32_composite_init(gf_t *gf) @@ -2731,10 +2687,14 @@ int gf_w32_composite_init(gf_t *gf)
2731 int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) 2687 int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
2732 { 2688 {
2733 int issse3 = 0; 2689 int issse3 = 0;
  2690 + int isneon = 0;
2734 2691
2735 #ifdef INTEL_SSSE3 2692 #ifdef INTEL_SSSE3
2736 issse3 = 1; 2693 issse3 = 1;
2737 #endif 2694 #endif
  2695 +#ifdef ARM_NEON
  2696 + isneon = 1;
  2697 +#endif
2738 2698
2739 switch(mult_type) 2699 switch(mult_type)
2740 { 2700 {
@@ -2760,7 +2720,7 @@ int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg @@ -2760,7 +2720,7 @@ int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg
2760 return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64; 2720 return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64;
2761 } 2721 }
2762 if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32) || 2722 if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32) ||
2763 - (mult_type == GF_MULT_DEFAULT && !issse3)) { 2723 + (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon))) {
2764 return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64; 2724 return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64;
2765 } 2725 }
2766 if ((arg1 == 4 && arg2 == 32) || 2726 if ((arg1 == 4 && arg2 == 32) ||
@@ -11,49 +11,7 @@ @@ -11,49 +11,7 @@
11 #include "gf_int.h" 11 #include "gf_int.h"
12 #include <stdio.h> 12 #include <stdio.h>
13 #include <stdlib.h> 13 #include <stdlib.h>
14 -  
15 -#define GF_FIELD_WIDTH 4  
16 -#define GF_DOUBLE_WIDTH (GF_FIELD_WIDTH*2)  
17 -#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)  
18 -#define GF_MULT_GROUP_SIZE (GF_FIELD_SIZE-1)  
19 -  
20 -/* ------------------------------------------------------------  
21 - JSP: Each implementation has its own data, which is allocated  
22 - at one time as part of the handle. For that reason, it  
23 - shouldn't be hierarchical -- i.e. one should be able to  
24 - allocate it with one call to malloc. */  
25 -  
26 -struct gf_logtable_data {  
27 - uint8_t log_tbl[GF_FIELD_SIZE];  
28 - uint8_t antilog_tbl[GF_FIELD_SIZE * 2];  
29 - uint8_t *antilog_tbl_div;  
30 -};  
31 -  
32 -struct gf_single_table_data {  
33 - uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE];  
34 - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];  
35 -};  
36 -  
37 -struct gf_double_table_data {  
38 - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];  
39 - uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];  
40 -};  
41 -struct gf_quad_table_data {  
42 - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];  
43 - uint16_t mult[GF_FIELD_SIZE][(1<<16)];  
44 -};  
45 -  
46 -struct gf_quad_table_lazy_data {  
47 - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];  
48 - uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE];  
49 - uint16_t mult[(1 << 16)];  
50 -};  
51 -  
52 -struct gf_bytwo_data {  
53 - uint64_t prim_poly;  
54 - uint64_t mask1;  
55 - uint64_t mask2;  
56 -}; 14 +#include "gf_w4.h"
57 15
58 #define AB2(ip, am1 ,am2, b, t1, t2) {\ 16 #define AB2(ip, am1 ,am2, b, t1, t2) {\
59 t1 = (b << 1) & am1;\ 17 t1 = (b << 1) & am1;\
@@ -489,14 +447,18 @@ int gf_w4_single_table_init(gf_t *gf) @@ -489,14 +447,18 @@ int gf_w4_single_table_init(gf_t *gf)
489 gf->inverse.w32 = NULL; 447 gf->inverse.w32 = NULL;
490 gf->divide.w32 = gf_w4_single_table_divide; 448 gf->divide.w32 = gf_w4_single_table_divide;
491 gf->multiply.w32 = gf_w4_single_table_multiply; 449 gf->multiply.w32 = gf_w4_single_table_multiply;
492 - #ifdef INTEL_SSSE3  
493 - if(h->region_type & (GF_REGION_NOSSE | GF_REGION_CAUCHY)) 450 + #if defined(INTEL_SSSE3) || defined(ARM_NEON)
  451 + if(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))
494 gf->multiply_region.w32 = gf_w4_single_table_multiply_region; 452 gf->multiply_region.w32 = gf_w4_single_table_multiply_region;
495 else 453 else
  454 + #if defined(INTEL_SSSE3)
496 gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region; 455 gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region;
  456 + #elif defined(ARM_NEON)
  457 + gf_w4_neon_single_table_init(gf);
  458 + #endif
497 #else 459 #else
498 gf->multiply_region.w32 = gf_w4_single_table_multiply_region; 460 gf->multiply_region.w32 = gf_w4_single_table_multiply_region;
499 - if (h->region_type & GF_REGION_SSE) return 0; 461 + if (h->region_type & GF_REGION_SIMD) return 0;
500 #endif 462 #endif
501 463
502 return 1; 464 return 1;
@@ -774,16 +736,16 @@ int gf_w4_table_init(gf_t *gf) @@ -774,16 +736,16 @@ int gf_w4_table_init(gf_t *gf)
774 { 736 {
775 int rt; 737 int rt;
776 gf_internal_t *h; 738 gf_internal_t *h;
777 - int issse3 = 0; 739 + int simd = 0;
778 740
779 -#ifdef INTEL_SSSE3  
780 - issse3 = 1; 741 +#if defined(INTEL_SSSE3) || defined(ARM_NEON)
  742 + simd = 1;
781 #endif 743