Commit bec15359de5273d06673c43b8e73c70f97396041

Authored by Janne Grunau
1 parent 1311a44f
Exists in master and in 2 other branches: v2, v3

arm: NEON optimisations for gf_w8

Optimisations for the 4,4 split-table region multiplication and for
carry-less multiplication using NEON's polynomial long multiplication
(vmull_p8).

Selected time_tool.sh results for a 1.7 GHz Cortex-A9:
Region Best (MB/s):   375.86   W-Method: 8 -m CARRY_FREE -
Region Best (MB/s):   142.94   W-Method: 8 -m TABLE -
Region Best (MB/s):   225.01   W-Method: 8 -m TABLE -r DOUBLE -
Region Best (MB/s):   211.23   W-Method: 8 -m TABLE -r DOUBLE -r LAZY -
Region Best (MB/s):   160.09   W-Method: 8 -m LOG -
Region Best (MB/s):   123.61   W-Method: 8 -m LOG_ZERO -
Region Best (MB/s):   123.85   W-Method: 8 -m LOG_ZERO_EXT -
Region Best (MB/s):  1183.79   W-Method: 8 -m SPLIT 8 4 -r SIMD -
Region Best (MB/s):   177.68   W-Method: 8 -m SPLIT 8 4 -r NOSIMD -
Region Best (MB/s):    87.85   W-Method: 8 -m COMPOSITE 2 - -
Region Best (MB/s):   428.59   W-Method: 8 -m COMPOSITE 2 - -r ALTMAP -
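
For context: the CARRY_FREE numbers above come from vmull_p8, which computes an
8x8 -> 16-bit polynomial (carry-less) product that must then be reduced modulo
the field's primitive polynomial. A minimal scalar sketch of the same
multiply-and-reduce, for reference only (gf_w8_clm_scalar is a hypothetical
helper, not part of this commit):

    #include <stdint.h>

    /* Carry-less GF(2^8) multiply with shift-and-xor reduction; the NEON
     * code in this commit vectorises the same idea with vmull_p8. */
    static uint8_t gf_w8_clm_scalar(uint8_t a, uint8_t b, uint16_t prim_poly)
    {
        uint16_t p = 0;
        int i;

        /* Polynomial multiply: 8x8 -> 16 bits, xor instead of add. */
        for (i = 0; i < 8; i++)
            if (b & (1u << i))
                p ^= (uint16_t)(a << i);

        /* Fold the high byte back down; prim_poly includes the x^8
         * term, e.g. 0x11d for the default w=8 field. */
        for (i = 15; i >= 8; i--)
            if (p & (1u << i))
                p ^= prim_poly << (i - 8);

        return (uint8_t)p;
    }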
include/gf_w8.h 0 → 100644
... ... @@ -0,0 +1,99 @@
  1 +/*
  2 + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
  3 + * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
  4 + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
  5 + *
  6 + * gf_w8.c
  7 + *
  8 + * Defines and data structures for 8-bit Galois fields
  9 + */
  10 +
  11 +#ifndef GF_COMPLETE_GF_W8_H
  12 +#define GF_COMPLETE_GF_W8_H
  13 +
  14 +#include "gf_int.h"
  15 +#include <stdint.h>
  16 +
  17 +#define GF_FIELD_WIDTH (8)
  18 +#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
  19 +#define GF_HALF_SIZE (1 << (GF_FIELD_WIDTH/2))
  20 +#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1
  21 +
  22 +#define GF_BASE_FIELD_WIDTH (4)
  23 +#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
  24 +
  25 +struct gf_w8_logtable_data {
  26 + uint8_t log_tbl[GF_FIELD_SIZE];
  27 + uint8_t antilog_tbl[GF_FIELD_SIZE * 2];
  28 + uint8_t inv_tbl[GF_FIELD_SIZE];
  29 +};
  30 +
  31 +struct gf_w8_logzero_table_data {
  32 + short log_tbl[GF_FIELD_SIZE]; /* Make this signed, so that we can divide easily */
  33 + uint8_t antilog_tbl[512+512+1];
  34 + uint8_t *div_tbl;
  35 + uint8_t *inv_tbl;
  36 +};
  37 +
  38 +struct gf_w8_logzero_small_table_data {
  39 + short log_tbl[GF_FIELD_SIZE]; /* Make this signed, so that we can divide easily */
  40 + uint8_t antilog_tbl[255*3];
  41 + uint8_t inv_tbl[GF_FIELD_SIZE];
  42 + uint8_t *div_tbl;
  43 +};
  44 +
  45 +struct gf_w8_composite_data {
  46 + uint8_t *mult_table;
  47 +};
  48 +
  49 +/* Don't change the order of these relative to gf_w8_half_table_data */
  50 +
  51 +struct gf_w8_default_data {
  52 + uint8_t high[GF_FIELD_SIZE][GF_HALF_SIZE];
  53 + uint8_t low[GF_FIELD_SIZE][GF_HALF_SIZE];
  54 + uint8_t divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
  55 + uint8_t multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
  56 +};
  57 +
  58 +struct gf_w8_half_table_data {
  59 + uint8_t high[GF_FIELD_SIZE][GF_HALF_SIZE];
  60 + uint8_t low[GF_FIELD_SIZE][GF_HALF_SIZE];
  61 +};
  62 +
  63 +struct gf_w8_single_table_data {
  64 + uint8_t divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
  65 + uint8_t multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
  66 +};
  67 +
  68 +struct gf_w8_double_table_data {
  69 + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
  70 + uint16_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];
  71 +};
  72 +
  73 +struct gf_w8_double_table_lazy_data {
  74 + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
  75 + uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE];
  76 + uint16_t mult[GF_FIELD_SIZE*GF_FIELD_SIZE];
  77 +};
  78 +
  79 +struct gf_w4_logtable_data {
  80 + uint8_t log_tbl[GF_BASE_FIELD_SIZE];
  81 + uint8_t antilog_tbl[GF_BASE_FIELD_SIZE * 2];
  82 + uint8_t *antilog_tbl_div;
  83 +};
  84 +
  85 +struct gf_w4_single_table_data {
  86 + uint8_t div[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
  87 + uint8_t mult[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
  88 +};
  89 +
  90 +struct gf_w8_bytwo_data {
  91 + uint64_t prim_poly;
  92 + uint64_t mask1;
  93 + uint64_t mask2;
  94 +};
  95 +
  96 +int gf_w8_neon_cfm_init(gf_t *gf);
  97 +void gf_w8_neon_split_init(gf_t *gf);
  98 +
  99 +#endif /* GF_COMPLETE_GF_W8_H */
... ...
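
The 4,4 split scheme behind gf_w8_half_table_data precomputes, for each
multiplier val, the products of val with all 16 low nibbles and all 16 shifted
high nibbles, so one byte product costs two table lookups and an xor. A
per-byte sketch, assuming the tables are filled that way (split_mult_byte is
illustrative, not part of this commit):

    /* high[val][x] holds val * (x << 4) and low[val][x] holds val * x,
     * both in GF(2^8). */
    static uint8_t split_mult_byte(const struct gf_w8_half_table_data *htd,
                                   uint8_t val, uint8_t b)
    {
        return htd->high[val][b >> 4] ^ htd->low[val][b & 0x0f];
    }

The NEON region kernel added below performs these two lookups 16 bytes at a
time with vtbl/vqtbl1q_u8, which is where the SPLIT 8 4 -r SIMD speedup in
the commit message comes from.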
src/Makefile.am
... ... @@ -11,7 +11,8 @@ libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c
11 11 gf_w64.c gf_w128.c gf_rand.c gf_general.c
12 12  
13 13 if HAVE_NEON
14   -libgf_complete_la_SOURCES += neon/gf_w4_neon.c
  14 +libgf_complete_la_SOURCES += neon/gf_w4_neon.c \
  15 + neon/gf_w8_neon.c
15 16 endif
16 17  
17 18 libgf_complete_la_LDFLAGS = -version-info 1:0:0
... ...
src/gf.c
... ... @@ -217,6 +217,11 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
217 217 pclmul = 1;
218 218 #endif
219 219  
  220 +#ifdef ARM_NEON
  221 + pclmul = 1;
  222 + sse3 = 1;
  223 +#endif
  224 +
220 225  
221 226 if (w < 1 || (w > 32 && w != 64 && w != 128)) { _gf_errno = GF_E_BAD___W; return 0; }
222 227  
... ...
src/gf_w8.c
... ... @@ -9,88 +9,10 @@
9 9 */
10 10  
11 11 #include "gf_int.h"
  12 +#include "gf_w8.h"
12 13 #include <stdio.h>
13 14 #include <stdlib.h>
14 15  
15   -#define GF_FIELD_WIDTH (8)
16   -#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
17   -#define GF_HALF_SIZE (1 << (GF_FIELD_WIDTH/2))
18   -#define GF_MULT_GROUP_SIZE GF_FIELD_SIZE-1
19   -
20   -#define GF_BASE_FIELD_WIDTH (4)
21   -#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
22   -
23   -struct gf_w8_logtable_data {
24   - uint8_t log_tbl[GF_FIELD_SIZE];
25   - uint8_t antilog_tbl[GF_FIELD_SIZE * 2];
26   - uint8_t inv_tbl[GF_FIELD_SIZE];
27   -};
28   -
29   -struct gf_w8_logzero_table_data {
30   - short log_tbl[GF_FIELD_SIZE]; /* Make this signed, so that we can divide easily */
31   - uint8_t antilog_tbl[512+512+1];
32   - uint8_t *div_tbl;
33   - uint8_t *inv_tbl;
34   -};
35   -
36   -struct gf_w8_logzero_small_table_data {
37   - short log_tbl[GF_FIELD_SIZE]; /* Make this signed, so that we can divide easily */
38   - uint8_t antilog_tbl[255*3];
39   - uint8_t inv_tbl[GF_FIELD_SIZE];
40   - uint8_t *div_tbl;
41   -};
42   -
43   -struct gf_w8_composite_data {
44   - uint8_t *mult_table;
45   -};
46   -
47   -/* Don't change the order of these relative to gf_w8_half_table_data */
48   -
49   -struct gf_w8_default_data {
50   - uint8_t high[GF_FIELD_SIZE][GF_HALF_SIZE];
51   - uint8_t low[GF_FIELD_SIZE][GF_HALF_SIZE];
52   - uint8_t divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
53   - uint8_t multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
54   -};
55   -
56   -struct gf_w8_half_table_data {
57   - uint8_t high[GF_FIELD_SIZE][GF_HALF_SIZE];
58   - uint8_t low[GF_FIELD_SIZE][GF_HALF_SIZE];
59   -};
60   -
61   -struct gf_w8_single_table_data {
62   - uint8_t divtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
63   - uint8_t multtable[GF_FIELD_SIZE][GF_FIELD_SIZE];
64   -};
65   -
66   -struct gf_w8_double_table_data {
67   - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
68   - uint16_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];
69   -};
70   -
71   -struct gf_w8_double_table_lazy_data {
72   - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
73   - uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE];
74   - uint16_t mult[GF_FIELD_SIZE*GF_FIELD_SIZE];
75   -};
76   -
77   -struct gf_w4_logtable_data {
78   - uint8_t log_tbl[GF_BASE_FIELD_SIZE];
79   - uint8_t antilog_tbl[GF_BASE_FIELD_SIZE * 2];
80   - uint8_t *antilog_tbl_div;
81   -};
82   -
83   -struct gf_w4_single_table_data {
84   - uint8_t div[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
85   - uint8_t mult[GF_BASE_FIELD_SIZE][GF_BASE_FIELD_SIZE];
86   -};
87   -
88   -struct gf_w8_bytwo_data {
89   - uint64_t prim_poly;
90   - uint64_t mask1;
91   - uint64_t mask2;
92   -};
93   -
94 16 #define AB2(ip, am1 ,am2, b, t1, t2) {\
95 17 t1 = (b << 1) & am1;\
96 18 t2 = b & am2; \
... ... @@ -603,6 +525,8 @@ int gf_w8_cfm_init(gf_t *gf)
603 525 return 0;
604 526 }
605 527 return 1;
  528 +#elif defined(ARM_NEON)
  529 + return gf_w8_neon_cfm_init(gf);
606 530 #endif
607 531  
608 532 return 0;
... ... @@ -938,7 +862,7 @@ gf_w8_default_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
938 862 return (ftd->multtable[a][b]);
939 863 }
940 864  
941   -#ifdef INTEL_SSSE3
  865 +#if defined(INTEL_SSSE3) || defined(ARM_NEON)
942 866 static
943 867 gf_val_32_t
944 868 gf_w8_default_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
... ... @@ -1179,11 +1103,15 @@ int gf_w8_split_init(gf_t *gf)
1179 1103  
1180 1104 gf->multiply.w32 = gf_w8_split_multiply;
1181 1105  
1182   - #ifdef INTEL_SSSE3
  1106 + #if defined(INTEL_SSSE3) || defined(ARM_NEON)
1183 1107 if (h->region_type & GF_REGION_NOSIMD)
1184 1108 gf->multiply_region.w32 = gf_w8_split_multiply_region;
1185 1109 else
  1110 + #if defined(INTEL_SSSE3)
1186 1111 gf->multiply_region.w32 = gf_w8_split_multiply_region_sse;
  1112 + #elif defined(ARM_NEON)
  1113 + gf_w8_neon_split_init(gf);
  1114 + #endif
1187 1115 #else
1188 1116 gf->multiply_region.w32 = gf_w8_split_multiply_region;
1189 1117 if(h->region_type & GF_REGION_SIMD)
... ... @@ -1205,17 +1133,17 @@ int gf_w8_table_init(gf_t *gf)
1205 1133 struct gf_w8_double_table_data *dtd = NULL;
1206 1134 struct gf_w8_double_table_lazy_data *ltd = NULL;
1207 1135 struct gf_w8_default_data *dd = NULL;
1208   - int a, b, c, prod, scase, issse;
  1136 + int a, b, c, prod, scase, use_simd;
1209 1137  
1210 1138 h = (gf_internal_t *) gf->scratch;
1211 1139  
1212   -#ifdef INTEL_SSSE3
1213   - issse = 1;
  1140 +#if defined(INTEL_SSSE3) || defined(ARM_NEON)
  1141 + use_simd = 1;
1214 1142 #else
1215   - issse = 0;
  1143 + use_simd = 0;
1216 1144 #endif
1217 1145  
1218   - if (h->mult_type == GF_MULT_DEFAULT && issse) {
  1146 + if (h->mult_type == GF_MULT_DEFAULT && use_simd) {
1219 1147 dd = (struct gf_w8_default_data *)h->private;
1220 1148 scase = 3;
1221 1149 bzero(dd->high, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE);
... ... @@ -1290,10 +1218,14 @@ int gf_w8_table_init(gf_t *gf)
1290 1218 gf->multiply_region.w32 = gf_w8_double_table_multiply_region;
1291 1219 break;
1292 1220 case 3:
1293   -#ifdef INTEL_SSSE3
  1221 +#if defined(INTEL_SSSE3) || defined(ARM_NEON)
1294 1222 gf->divide.w32 = gf_w8_default_divide;
1295 1223 gf->multiply.w32 = gf_w8_default_multiply;
  1224 +#if defined(INTEL_SSSE3)
1296 1225 gf->multiply_region.w32 = gf_w8_split_multiply_region_sse;
  1226 +#elif defined(ARM_NEON)
  1227 + gf_w8_neon_split_init(gf);
  1228 +#endif
1297 1229 #endif
1298 1230 break;
1299 1231 }
... ... @@ -2296,7 +2228,7 @@ int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1
2296 2228 switch(mult_type)
2297 2229 {
2298 2230 case GF_MULT_DEFAULT:
2299   -#ifdef INTEL_SSSE3
  2231 +#if defined(INTEL_SSSE3) || defined(ARM_NEON)
2300 2232 return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64;
2301 2233 #endif
2302 2234 return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;
... ...
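
With the dispatch changes above, ARM builds reach the NEON kernels through the
unchanged public API. A minimal usage sketch (GF_MULT_DEFAULT for w=8 resolves
to the SIMD split path when NEON is available):

    #include <stdio.h>
    #include "gf_complete.h"

    int main(void)
    {
        gf_t gf;

        if (!gf_init_easy(&gf, 8))   /* GF_MULT_DEFAULT for w = 8 */
            return 1;
        /* On a NEON build this uses the table/NEON-backed multiply. */
        printf("3 * 7 = %u\n", gf.multiply.w32(&gf, 3, 7));
        gf_free(&gf, 0);
        return 0;
    }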
src/neon/gf_w8_neon.c 0 → 100644
... ... @@ -0,0 +1,302 @@
  1 +/*
  2 + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
  3 + * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
  4 + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
  5 + *
  6 + * Copyright (c) 2014: Janne Grunau <j@jannau.net>
  7 + *
  8 + * Redistribution and use in source and binary forms, with or without
  9 + * modification, are permitted provided that the following conditions
  10 + * are met:
  11 + *
  12 + * - Redistributions of source code must retain the above copyright
  13 + * notice, this list of conditions and the following disclaimer.
  14 + *
  15 + * - Redistributions in binary form must reproduce the above copyright
  16 + * notice, this list of conditions and the following disclaimer in
  17 + * the documentation and/or other materials provided with the
  18 + * distribution.
  19 + *
  20 + * - Neither the name of the University of Tennessee nor the names of its
  21 + * contributors may be used to endorse or promote products derived
  22 + * from this software without specific prior written permission.
  23 + *
  24 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25 + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  27 + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  28 + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  29 + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  30 + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  31 + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  32 + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  33 + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  34 + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  35 + * POSSIBILITY OF SUCH DAMAGE.
  36 + *
  37 + * gf_w8_neon.c
  38 + *
  39 + * Neon optimized routines for 8-bit Galois fields
  40 + *
  41 + */
  42 +
  43 +#include "gf_int.h"
  44 +#include "gf_w8.h"
  45 +#include <stdio.h>
  46 +#include <stdlib.h>
  47 +
  48 +/* ARM NEON reduction macro for the carry-less multiplication.
  49 + * vmull_p8 is the carry-less multiply operation. vshrn_n_u16 shifts
  50 + * the intermediate result right by one byte, so that prim_poly can be
  51 + * multiplied by the leading bits of the result. That partial product
  52 + * is then xored back into the result. */
  53 +#define NEON_CFM_REDUCE(v, w, result, prim_poly, initial) \
  54 + do { \
  55 + if (initial) \
  56 + v = vshrn_n_u16 (vreinterpretq_u16_p16(result), 8); \
  57 + else \
  58 + v = veor_u8 (v, vshrn_n_u16 (vreinterpretq_u16_p16(result), 8)); \
  59 + w = vmull_p8 (prim_poly, vreinterpret_p8_u8(v)); \
  60 + result = vreinterpretq_p16_u16 (veorq_u16 (vreinterpretq_u16_p16(result), vreinterpretq_u16_p16(w))); \
  61 + } while (0)
  62 +
  63 +static
  64 +inline
  65 +gf_val_32_t
  66 +gf_w8_neon_clm_multiply_x (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8, int x)
  67 +{
  68 + gf_val_32_t rv = 0;
  69 + poly8x8_t a, b;
  70 + uint8x8_t v;
  71 + poly16x8_t result;
  72 + poly8x8_t prim_poly;
  73 + poly16x8_t w;
  74 + gf_internal_t * h = gf->scratch;
  75 +
  76 + a = vdup_n_p8 (a8);
  77 + b = vdup_n_p8 (b8);
  78 +
  79 + prim_poly = vdup_n_p8 ((uint32_t)(h->prim_poly & 0x1ffULL));
  80 +
  81 + /* Do the initial multiply */
  82 + result = vmull_p8 (a, b);
  83 +
  84 + /* Ben: Do the prim_poly reduction twice. We are guaranteed to need the
  85 +    reduction at most twice here, because (w-2)/z == 2, where z is the
  86 +    number of zeros after the leading 1 in the primitive polynomial. */
  87 + NEON_CFM_REDUCE (v, w, result, prim_poly, 1);
  88 + NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
  89 + if (x >= 3) {
  90 + NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
  91 + }
  92 + if (x >= 4) {
  93 + NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
  94 + }
  95 + /* Extract the low byte of the result into the 32-bit return value. */
  96 + rv = (gf_val_32_t)vget_lane_u8 (vmovn_u16 (vreinterpretq_u16_p16 (result)), 0);
  97 +
  98 + return rv;
  99 +}
  100 +
  101 +#define CLM_MULTIPLY(x) \
  102 +static gf_val_32_t gf_w8_neon_clm_multiply_ ## x (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8) \
  103 +{\
  104 + return gf_w8_neon_clm_multiply_x (gf, a8, b8, x);\
  105 +}
  106 +
  107 +CLM_MULTIPLY(2)
  108 +CLM_MULTIPLY(3)
  109 +CLM_MULTIPLY(4)
  110 +
  111 +static inline void
  112 +neon_clm_multiply_region_from_single_x(gf_t *gf, uint8_t *s8, uint8_t *d8,
  113 + gf_val_32_t val, uint8_t *d_end,
  114 + int xor, int x)
  115 +{
  116 + gf_internal_t * h = gf->scratch;
  117 + poly8x8_t a, b;
  118 + uint8x8_t c, v;
  119 + poly16x8_t result;
  120 + poly8x8_t prim_poly;
  121 + poly16x8_t w;
  122 +
  123 + a = vdup_n_p8 (val);
  124 + prim_poly = vdup_n_p8 ((uint8_t)(h->prim_poly & 0xffULL));
  125 +
  126 + while (d8 < d_end) {
  127 + b = vld1_p8 ((poly8_t *) s8);
  128 +
  129 + if (xor)
  130 + c = vld1_u8 (d8);
  131 +
  132 + result = vmull_p8 (a, b);
  133 +
  134 + NEON_CFM_REDUCE (v, w, result, prim_poly, 1);
  135 + NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
  136 + if (x >= 3) {
  137 + NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
  138 + }
  139 + if (x >= 4) {
  140 + NEON_CFM_REDUCE (v, w, result, prim_poly, 0);
  141 + }
  142 + v = vmovn_u16 (vreinterpretq_u16_p16 (result));
  143 + if (xor)
  144 + v = veor_u8 (c, v);
  145 +
  146 + vst1_u8 (d8, v);
  147 +
  148 + d8 += 8;
  149 + s8 += 8;
  150 + }
  151 +}
  152 +
  153 +#define CLM_MULT_REGION(x) \
  154 +static void \
  155 +gf_w8_neon_clm_multiply_region_from_single_ ## x (gf_t *gf, void *src, \
  156 + void *dest, \
  157 + gf_val_32_t val, int bytes, \
  158 + int xor) \
  159 +{ \
  160 + gf_region_data rd; \
  161 + uint8_t *s8; \
  162 + uint8_t *d8; \
  163 + \
  164 + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } \
  165 + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } \
  166 + \
  167 + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16); \
  168 + gf_do_initial_region_alignment(&rd); \
  169 + s8 = (uint8_t *) rd.s_start; \
  170 + d8 = (uint8_t *) rd.d_start; \
  171 + \
  172 + if (xor) \
  173 + neon_clm_multiply_region_from_single_x (gf, s8, d8, val, rd.d_top, 1, x); \
  174 + else \
  175 + neon_clm_multiply_region_from_single_x (gf, s8, d8, val, rd.d_top, 0, x);\
  176 + gf_do_final_region_alignment(&rd); \
  177 +}
  178 +
  179 +CLM_MULT_REGION(2)
  180 +CLM_MULT_REGION(3)
  181 +CLM_MULT_REGION(4)
  182 +
  183 +
  184 +int gf_w8_neon_cfm_init(gf_t *gf)
  185 +{
  186 + gf_internal_t *h;
  187 +
  188 + h = (gf_internal_t *) gf->scratch;
  189 +
  190 + if ((0xe0 & h->prim_poly) == 0){
  191 + gf->multiply.w32 = gf_w8_neon_clm_multiply_2;
  192 + gf->multiply_region.w32 = gf_w8_neon_clm_multiply_region_from_single_2;
  193 + }else if ((0xc0 & h->prim_poly) == 0){
  194 + gf->multiply.w32 = gf_w8_neon_clm_multiply_3;
  195 + gf->multiply_region.w32 = gf_w8_neon_clm_multiply_region_from_single_3;
  196 + }else if ((0x80 & h->prim_poly) == 0){
  197 + gf->multiply.w32 = gf_w8_neon_clm_multiply_4;
  198 + gf->multiply_region.w32 = gf_w8_neon_clm_multiply_region_from_single_4;
  199 + }else{
  200 + return 0;
  201 + }
  202 + return 1;
  203 +}
  204 +
  205 +#ifndef ARCH_AARCH64
  206 +#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
  207 + vtbl2_u8(tbl, vget_high_u8(v)))
  208 +#endif
  209 +
  210 +static
  211 +void
  212 +gf_w8_split_multiply_region_neon(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
  213 +{
  214 + uint8_t *bh, *bl, *sptr, *dptr;
  215 + uint8x16_t r, va, vh, vl, loset;
  216 +#ifdef ARCH_AARCH64
  217 + uint8x16_t mth, mtl;
  218 +#else
  219 + uint8x8x2_t mth, mtl;
  220 +#endif
  221 + struct gf_w8_half_table_data *htd;
  222 + gf_region_data rd;
  223 +
  224 + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  225 + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
  226 +
  227 + htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) (gf->scratch))->private;
  228 +
  229 + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  230 + gf_do_initial_region_alignment(&rd);
  231 +
  232 + bh = (uint8_t *) htd->high;
  233 + bh += (val << 4);
  234 + bl = (uint8_t *) htd->low;
  235 + bl += (val << 4);
  236 +
  237 + sptr = rd.s_start;
  238 + dptr = rd.d_start;
  239 +
  240 +#ifdef ARCH_AARCH64
  241 + mth = vld1q_u8 (bh);
  242 + mtl = vld1q_u8 (bl);
  243 +#else
  244 + mth.val[0] = vld1_u8 (bh);
  245 + mtl.val[0] = vld1_u8 (bl);
  246 + mth.val[1] = vld1_u8 (bh + 8);
  247 + mtl.val[1] = vld1_u8 (bl + 8);
  248 +#endif
  249 +
  250 + loset = vdupq_n_u8(0xf);
  251 +
  252 + if (xor) {
  253 + while (sptr < (uint8_t *) rd.s_top) {
  254 + va = vld1q_u8 (sptr);
  255 +
  256 + vh = vshrq_n_u8 (va, 4);
  257 + vl = vandq_u8 (va, loset);
  258 + va = vld1q_u8 (dptr);
  259 +
  260 + vh = vqtbl1q_u8 (mth, vh);
  261 + vl = vqtbl1q_u8 (mtl, vl);
  262 +
  263 + r = veorq_u8 (vh, vl);
  264 +
  265 + vst1q_u8 (dptr, veorq_u8 (va, r));
  266 +
  267 + dptr += 16;
  268 + sptr += 16;
  269 + }
  270 + } else {
  271 + while (sptr < (uint8_t *) rd.s_top) {
  272 + va = vld1q_u8 (sptr);
  273 +
  274 + vh = vshrq_n_u8 (va, 4);
  275 + vl = vandq_u8 (va, loset);
  276 +#ifdef ARCH_AARCH64
  277 + vh = vqtbl1q_u8 (mth, vh);
  278 + vl = vqtbl1q_u8 (mtl, vl);
  279 +#else
  280 + vh = vcombine_u8 (vtbl2_u8 (mth, vget_low_u8 (vh)),
  281 + vtbl2_u8 (mth, vget_high_u8 (vh)));
  282 + vl = vcombine_u8 (vtbl2_u8 (mtl, vget_low_u8 (vl)),
  283 + vtbl2_u8 (mtl, vget_high_u8 (vl)));
  284 +#endif
  285 +
  286 + r = veorq_u8 (vh, vl);
  287 +
  288 + vst1q_u8(dptr, r);
  289 +
  290 + dptr += 16;
  291 + sptr += 16;
  292 + }
  293 + }
  294 +
  295 + gf_do_final_region_alignment(&rd);
  296 +}
  297 +
  298 +
  299 +void gf_w8_neon_split_init(gf_t *gf)
  300 +{
  301 + gf->multiply_region.w32 = gf_w8_split_multiply_region_neon;
  302 +}
... ...
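
Continuing the usage sketch above, region operations dispatch the same way: a
call like the following lands in gf_w8_split_multiply_region_neon (or in the
CLM region kernel when CARRY_FREE was selected). Buffer size and value are
illustrative only:

    uint8_t src[1024], dst[1024];

    /* dst = 5 * src over GF(2^8), processed 16 bytes per NEON iteration;
     * the final argument selects xor-accumulate (1) vs. overwrite (0). */
    gf.multiply_region.w32(&gf, src, dst, 5, sizeof(src), 0);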