Commit 1311a44f7a27b38217a94e9d7a5dbe3ae3dde035

Authored by Janne Grunau
1 parent 3a1be40e
Exists in master and in 2 other branches v2, v3

arm: NEON optimisations for gf_w4

Optimisations for the single table region multiplication and carry less
multiplication using NEON's polynomial multiplication of 8-bit values.

The single polynomial multiplication is not that useful but vector
version is for region multiplication.

Selected time_tool.sh results for a 1.7GHz cortex-a9:
Region Best (MB/s):   672.72   W-Method: 4 -m CARRY_FREE -
Region Best (MB/s):   265.84   W-Method: 4 -m BYTWO_p -
Region Best (MB/s):   329.41   W-Method: 4 -m TABLE -r DOUBLE -
Region Best (MB/s):   278.63   W-Method: 4 -m TABLE -r QUAD -
Region Best (MB/s):   329.81   W-Method: 4 -m TABLE -r QUAD -r LAZY -
Region Best (MB/s):  1318.03   W-Method: 4 -m TABLE -r SIMD -
Region Best (MB/s):   165.15   W-Method: 4 -m TABLE -r NOSIMD -
Region Best (MB/s):    99.73   W-Method: 4 -m LOG -
include/gf_w4.h 0 → 100644
@@ -0,0 +1,63 @@ @@ -0,0 +1,63 @@
  1 +/*
  2 + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
  3 + * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
  4 + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
  5 + *
  6 + * gf_w4.h
  7 + *
  8 + * Defines and data structures for 4-bit Galois fields
  9 + */
  10 +
  11 +#ifndef GF_COMPLETE_GF_W4_H
  12 +#define GF_COMPLETE_GF_W4_H
  13 +
  14 +#include <stdint.h>
  15 +
  16 +#define GF_FIELD_WIDTH 4
  17 +#define GF_DOUBLE_WIDTH (GF_FIELD_WIDTH*2)
  18 +#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)
  19 +#define GF_MULT_GROUP_SIZE (GF_FIELD_SIZE-1)
  20 +
  21 +/* ------------------------------------------------------------
  22 + JSP: Each implementation has its own data, which is allocated
  23 + at one time as part of the handle. For that reason, it
  24 + shouldn't be hierarchical -- i.e. one should be able to
  25 + allocate it with one call to malloc. */
  26 +
  27 +struct gf_logtable_data {
  28 + uint8_t log_tbl[GF_FIELD_SIZE];
  29 + uint8_t antilog_tbl[GF_FIELD_SIZE * 2];
  30 + uint8_t *antilog_tbl_div;
  31 +};
  32 +
  33 +struct gf_single_table_data {
  34 + uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE];
  35 + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
  36 +};
  37 +
  38 +struct gf_double_table_data {
  39 + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
  40 + uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];
  41 +};
  42 +struct gf_quad_table_data {
  43 + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
  44 + uint16_t mult[GF_FIELD_SIZE][(1<<16)];
  45 +};
  46 +
  47 +struct gf_quad_table_lazy_data {
  48 + uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];
  49 + uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE];
  50 + uint16_t mult[(1 << 16)];
  51 +};
  52 +
  53 +struct gf_bytwo_data {
  54 + uint64_t prim_poly;
  55 + uint64_t mask1;
  56 + uint64_t mask2;
  57 +};
  58 +
  59 +// ARM NEON init functions
  60 +int gf_w4_neon_cfm_init(gf_t *gf);
  61 +void gf_w4_neon_single_table_init(gf_t *gf);
  62 +
  63 +#endif /* GF_COMPLETE_GF_W4_H */
src/Makefile.am
1 # GF-Complete 'core' AM file 1 # GF-Complete 'core' AM file
2 # Creates the library 2 # Creates the library
3 3
  4 +AUTOMAKE_OPTIONS = subdir-objects
  5 +
4 AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include 6 AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include
5 AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC 7 AM_CFLAGS = -O3 $(SIMD_FLAGS) -fPIC
6 8
7 lib_LTLIBRARIES = libgf_complete.la 9 lib_LTLIBRARIES = libgf_complete.la
8 libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \ 10 libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c gf_w32.c \
9 gf_w64.c gf_w128.c gf_rand.c gf_general.c 11 gf_w64.c gf_w128.c gf_rand.c gf_general.c
  12 +
  13 +if HAVE_NEON
  14 +libgf_complete_la_SOURCES += neon/gf_w4_neon.c
  15 +endif
  16 +
10 libgf_complete_la_LDFLAGS = -version-info 1:0:0 17 libgf_complete_la_LDFLAGS = -version-info 1:0:0
11 18
@@ -11,49 +11,7 @@ @@ -11,49 +11,7 @@
11 #include "gf_int.h" 11 #include "gf_int.h"
12 #include <stdio.h> 12 #include <stdio.h>
13 #include <stdlib.h> 13 #include <stdlib.h>
14 -  
15 -#define GF_FIELD_WIDTH 4  
16 -#define GF_DOUBLE_WIDTH (GF_FIELD_WIDTH*2)  
17 -#define GF_FIELD_SIZE (1 << GF_FIELD_WIDTH)  
18 -#define GF_MULT_GROUP_SIZE (GF_FIELD_SIZE-1)  
19 -  
20 -/* ------------------------------------------------------------  
21 - JSP: Each implementation has its own data, which is allocated  
22 - at one time as part of the handle. For that reason, it  
23 - shouldn't be hierarchical -- i.e. one should be able to  
24 - allocate it with one call to malloc. */  
25 -  
26 -struct gf_logtable_data {  
27 - uint8_t log_tbl[GF_FIELD_SIZE];  
28 - uint8_t antilog_tbl[GF_FIELD_SIZE * 2];  
29 - uint8_t *antilog_tbl_div;  
30 -};  
31 -  
32 -struct gf_single_table_data {  
33 - uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE];  
34 - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];  
35 -};  
36 -  
37 -struct gf_double_table_data {  
38 - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];  
39 - uint8_t mult[GF_FIELD_SIZE][GF_FIELD_SIZE*GF_FIELD_SIZE];  
40 -};  
41 -struct gf_quad_table_data {  
42 - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];  
43 - uint16_t mult[GF_FIELD_SIZE][(1<<16)];  
44 -};  
45 -  
46 -struct gf_quad_table_lazy_data {  
47 - uint8_t div[GF_FIELD_SIZE][GF_FIELD_SIZE];  
48 - uint8_t smult[GF_FIELD_SIZE][GF_FIELD_SIZE];  
49 - uint16_t mult[(1 << 16)];  
50 -};  
51 -  
52 -struct gf_bytwo_data {  
53 - uint64_t prim_poly;  
54 - uint64_t mask1;  
55 - uint64_t mask2;  
56 -}; 14 +#include "gf_w4.h"
57 15
58 #define AB2(ip, am1 ,am2, b, t1, t2) {\ 16 #define AB2(ip, am1 ,am2, b, t1, t2) {\
59 t1 = (b << 1) & am1;\ 17 t1 = (b << 1) & am1;\
@@ -489,11 +447,15 @@ int gf_w4_single_table_init(gf_t *gf) @@ -489,11 +447,15 @@ int gf_w4_single_table_init(gf_t *gf)
489 gf->inverse.w32 = NULL; 447 gf->inverse.w32 = NULL;
490 gf->divide.w32 = gf_w4_single_table_divide; 448 gf->divide.w32 = gf_w4_single_table_divide;
491 gf->multiply.w32 = gf_w4_single_table_multiply; 449 gf->multiply.w32 = gf_w4_single_table_multiply;
492 - #ifdef INTEL_SSSE3 450 + #if defined(INTEL_SSSE3) || defined(ARM_NEON)
493 if(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY)) 451 if(h->region_type & (GF_REGION_NOSIMD | GF_REGION_CAUCHY))
494 gf->multiply_region.w32 = gf_w4_single_table_multiply_region; 452 gf->multiply_region.w32 = gf_w4_single_table_multiply_region;
495 else 453 else
  454 + #if defined(INTEL_SSSE3)
496 gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region; 455 gf->multiply_region.w32 = gf_w4_single_table_sse_multiply_region;
  456 + #elif defined(ARM_NEON)
  457 + gf_w4_neon_single_table_init(gf);
  458 + #endif
497 #else 459 #else
498 gf->multiply_region.w32 = gf_w4_single_table_multiply_region; 460 gf->multiply_region.w32 = gf_w4_single_table_multiply_region;
499 if (h->region_type & GF_REGION_SIMD) return 0; 461 if (h->region_type & GF_REGION_SIMD) return 0;
@@ -774,16 +736,16 @@ int gf_w4_table_init(gf_t *gf) @@ -774,16 +736,16 @@ int gf_w4_table_init(gf_t *gf)
774 { 736 {
775 int rt; 737 int rt;
776 gf_internal_t *h; 738 gf_internal_t *h;
777 - int issse3 = 0; 739 + int simd = 0;
778 740
779 -#ifdef INTEL_SSSE3  
780 - issse3 = 1; 741 +#if defined(INTEL_SSSE3) || defined(ARM_NEON)
  742 + simd = 1;
781 #endif 743 #endif
782 744
783 h = (gf_internal_t *) gf->scratch; 745 h = (gf_internal_t *) gf->scratch;
784 rt = (h->region_type); 746 rt = (h->region_type);
785 747
786 - if (h->mult_type == GF_MULT_DEFAULT && !issse3) rt |= GF_REGION_DOUBLE_TABLE; 748 + if (h->mult_type == GF_MULT_DEFAULT && !simd) rt |= GF_REGION_DOUBLE_TABLE;
787 749
788 if (rt & GF_REGION_DOUBLE_TABLE) { 750 if (rt & GF_REGION_DOUBLE_TABLE) {
789 return gf_w4_double_table_init(gf); 751 return gf_w4_double_table_init(gf);
@@ -1937,6 +1899,8 @@ int gf_w4_cfm_init(gf_t *gf) @@ -1937,6 +1899,8 @@ int gf_w4_cfm_init(gf_t *gf)
1937 #if defined(INTEL_SSE4_PCLMUL) 1899 #if defined(INTEL_SSE4_PCLMUL)
1938 gf->multiply.w32 = gf_w4_clm_multiply; 1900 gf->multiply.w32 = gf_w4_clm_multiply;
1939 return 1; 1901 return 1;
  1902 +#elif defined(ARM_NEON)
  1903 + return gf_w4_neon_cfm_init(gf);
1940 #endif 1904 #endif
1941 return 0; 1905 return 0;
1942 } 1906 }
@@ -1953,11 +1917,14 @@ int gf_w4_shift_init(gf_t *gf) @@ -1953,11 +1917,14 @@ int gf_w4_shift_init(gf_t *gf)
1953 1917
1954 int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2) 1918 int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
1955 { 1919 {
1956 - int issse3 = 0; 1920 + int issse3 = 0, isneon = 0;
1957 1921
1958 #ifdef INTEL_SSSE3 1922 #ifdef INTEL_SSSE3
1959 issse3 = 1; 1923 issse3 = 1;
1960 #endif 1924 #endif
  1925 +#ifdef ARM_NEON
  1926 + isneon = 1;
  1927 +#endif
1961 1928
1962 switch(mult_type) 1929 switch(mult_type)
1963 { 1930 {
@@ -1971,7 +1938,8 @@ int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1 @@ -1971,7 +1938,8 @@ int gf_w4_scratch_size(int mult_type, int region_type, int divide_type, int arg1
1971 return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64; 1938 return sizeof(gf_internal_t) + sizeof(struct gf_single_table_data) + 64;
1972 } 1939 }
1973 1940
1974 - if (mult_type == GF_MULT_DEFAULT && !issse3) region_type = GF_REGION_DOUBLE_TABLE; 1941 + if (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon))
  1942 + region_type = GF_REGION_DOUBLE_TABLE;
1975 1943
1976 if (region_type & GF_REGION_DOUBLE_TABLE) { 1944 if (region_type & GF_REGION_DOUBLE_TABLE) {
1977 return sizeof(gf_internal_t) + sizeof(struct gf_double_table_data) + 64; 1945 return sizeof(gf_internal_t) + sizeof(struct gf_double_table_data) + 64;
src/neon/gf_w4_neon.c 0 → 100644
@@ -0,0 +1,247 @@ @@ -0,0 +1,247 @@
  1 +/*
  2 + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
  3 + * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
  4 + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
  5 + *
  6 + * Copyright (c) 2014: Janne Grunau <j@jannau.net>
  7 + *
  8 + * Redistribution and use in source and binary forms, with or without
  9 + * modification, are permitted provided that the following conditions
  10 + * are met:
  11 + *
  12 + * - Redistributions of source code must retain the above copyright
  13 + * notice, this list of conditions and the following disclaimer.
  14 + *
  15 + * - Redistributions in binary form must reproduce the above copyright
  16 + * notice, this list of conditions and the following disclaimer in
  17 + * the documentation and/or other materials provided with the
  18 + * distribution.
  19 + *
  20 + * - Neither the name of the University of Tennessee nor the names of its
  21 + * contributors may be used to endorse or promote products derived
  22 + * from this software without specific prior written permission.
  23 + *
  24 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25 + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  27 + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  28 + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  29 + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  30 + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  31 + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  32 + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  33 + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  34 + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  35 + * POSSIBILITY OF SUCH DAMAGE.
  36 + *
  37 + * gf_w4_neon.c
  38 + *
  39 + * Neon routines for 4-bit Galois fields
  40 + *
  41 + */
  42 +
  43 +#include "gf_int.h"
  44 +#include <stdio.h>
  45 +#include <stdlib.h>
  46 +#include "gf_w4.h"
  47 +
  48 +static
  49 +gf_val_32_t
  50 +gf_w4_neon_clm_multiply (gf_t *gf, gf_val_32_t a4, gf_val_32_t b4)
  51 +{
  52 + gf_val_32_t rv = 0;
  53 + poly8x8_t result, prim_poly;
  54 + poly8x8_t a, b, w;
  55 + uint8x8_t v;
  56 + gf_internal_t * h = gf->scratch;
  57 +
  58 + a = vdup_n_p8 (a4);
  59 + b = vdup_n_p8 (b4);
  60 +
  61 + prim_poly = vdup_n_p8 ((uint32_t)(h->prim_poly & 0x1fULL));
  62 +
  63 + /* Do the initial multiply */
  64 + result = vmul_p8 (a, b);
  65 + v = vshr_n_u8 (vreinterpret_u8_p8(result), 4);
  66 + w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v));
  67 + result = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(result), vreinterpret_u8_p8(w)));
  68 +
  69 + /* Extracts 32 bit value from result. */
  70 + rv = (gf_val_32_t)vget_lane_u8 (vreinterpret_u8_p8 (result), 0);
  71 +
  72 + return rv;
  73 +}
  74 +
  75 +static inline void
  76 +neon_clm_multiply_region_from_single (gf_t *gf, uint8_t *s8, uint8_t *d8,
  77 + gf_val_32_t val, uint8_t *d_end, int xor)
  78 +{
  79 + gf_internal_t * h = gf->scratch;
  80 + poly8x8_t prim_poly;
  81 + poly8x8_t a, w, even, odd;
  82 + uint8x8_t b, c, v, mask;
  83 +
  84 + a = vdup_n_p8 (val);
  85 + mask = vdup_n_u8 (0xf);
  86 + prim_poly = vdup_n_p8 ((uint8_t)(h->prim_poly & 0x1fULL));
  87 +
  88 + while (d8 < d_end) {
  89 + b = vld1_u8 (s8);
  90 +
  91 + even = vreinterpret_p8_u8 (vand_u8 (b, mask));
  92 + odd = vreinterpret_p8_u8 (vshr_n_u8 (b, 4));
  93 +
  94 + if (xor)
  95 + c = vld1_u8 (d8);
  96 +
  97 + even = vmul_p8 (a, even);
  98 + odd = vmul_p8 (a, odd);
  99 +
  100 + v = vshr_n_u8 (vreinterpret_u8_p8(even), 4);
  101 + w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v));
  102 + even = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(even), vreinterpret_u8_p8(w)));
  103 +
  104 + v = vshr_n_u8 (vreinterpret_u8_p8(odd), 4);
  105 + w = vmul_p8 (prim_poly, vreinterpret_p8_u8(v));
  106 + odd = vreinterpret_p8_u8 (veor_u8 (vreinterpret_u8_p8(odd), vreinterpret_u8_p8(w)));
  107 +
  108 + v = veor_u8 (vreinterpret_u8_p8 (even), vshl_n_u8 (vreinterpret_u8_p8 (odd), 4));
  109 +
  110 + if (xor)
  111 + v = veor_u8 (c, v);
  112 +
  113 + vst1_u8 (d8, v);
  114 +
  115 + d8 += 8;
  116 + s8 += 8;
  117 + }
  118 +}
  119 +
  120 +
  121 +static void
  122 +gf_w4_neon_clm_multiply_region_from_single (gf_t *gf, void *src, void *dest,
  123 + gf_val_32_t val, int bytes, int xor)
  124 +{
  125 + gf_region_data rd;
  126 + uint8_t *s8;
  127 + uint8_t *d8;
  128 +
  129 + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  130 + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
  131 +
  132 + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  133 + gf_do_initial_region_alignment(&rd);
  134 +
  135 + s8 = (uint8_t *) rd.s_start;
  136 + d8 = (uint8_t *) rd.d_start;
  137 +
  138 + if (xor)
  139 + neon_clm_multiply_region_from_single (gf, s8, d8, val, rd.d_top, 1);
  140 + else
  141 + neon_clm_multiply_region_from_single (gf, s8, d8, val, rd.d_top, 0);
  142 +
  143 + gf_do_final_region_alignment(&rd);
  144 +}
  145 +
  146 +#ifndef ARCH_AARCH64
  147 +#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
  148 + vtbl2_u8(tbl, vget_high_u8(v)))
  149 +#endif
  150 +
  151 +static
  152 +inline
  153 +void
  154 +w4_single_table_multiply_region_neon(gf_t *gf, uint8_t *src, uint8_t *dst,
  155 + uint8_t * d_end, gf_val_32_t val, int xor)
  156 +{
  157 + struct gf_single_table_data *std;
  158 + uint8_t *base;
  159 + uint8x16_t r, va, vh, vl, loset;
  160 +
  161 +#ifdef ARCH_AARCH64
  162 + uint8x16_t th, tl;
  163 +#else
  164 + uint8x8x2_t th, tl;
  165 +#endif
  166 +
  167 + std = (struct gf_single_table_data *) ((gf_internal_t *) (gf->scratch))->private;
  168 + base = (uint8_t *) std->mult;
  169 + base += (val << GF_FIELD_WIDTH);
  170 +
  171 +#ifdef ARCH_AARCH64
  172 + tl = vld1q_u8 (base);
  173 + th = vshlq_n_u8 (tl, 4);
  174 +#else
  175 + tl.val[0] = vld1_u8 (base);
  176 + tl.val[1] = vld1_u8 (base + 8);
  177 + th.val[0] = vshl_n_u8 (tl.val[0], 4);
  178 + th.val[1] = vshl_n_u8 (tl.val[1], 4);
  179 +#endif
  180 +
  181 + loset = vdupq_n_u8(0xf);
  182 +
  183 + while (dst < d_end) {
  184 + va = vld1q_u8 (src);
  185 +
  186 + vh = vshrq_n_u8 (va, 4);
  187 + vl = vandq_u8 (va, loset);
  188 +
  189 + if (xor)
  190 + va = vld1q_u8 (dst);
  191 +
  192 + vh = vqtbl1q_u8 (th, vh);
  193 + vl = vqtbl1q_u8 (tl, vl);
  194 +
  195 + r = veorq_u8 (vh, vl);
  196 +
  197 + if (xor)
  198 + r = veorq_u8 (va, r);
  199 +
  200 + vst1q_u8 (dst, r);
  201 +
  202 + dst += 16;
  203 + src += 16;
  204 + }
  205 +}
  206 +
  207 +static
  208 +void
  209 +gf_w4_single_table_multiply_region_neon(gf_t *gf, void *src, void *dest,
  210 + gf_val_32_t val, int bytes, int xor)
  211 +{
  212 + gf_region_data rd;
  213 + uint8_t *sptr, *dptr, *top;
  214 +
  215 + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  216 + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
  217 +
  218 + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  219 + gf_do_initial_region_alignment(&rd);
  220 +
  221 + sptr = rd.s_start;
  222 + dptr = rd.d_start;
  223 + top = rd.d_top;
  224 +
  225 + if (xor)
  226 + w4_single_table_multiply_region_neon(gf, sptr, dptr, top, val, 1);
  227 + else
  228 + w4_single_table_multiply_region_neon(gf, sptr, dptr, top, val, 0);
  229 +
  230 + gf_do_final_region_alignment(&rd);
  231 +
  232 +}
  233 +
  234 +
  235 +int gf_w4_neon_cfm_init(gf_t *gf)
  236 +{
  237 + // single clm multiplication probably pointless
  238 + gf->multiply.w32 = gf_w4_neon_clm_multiply;
  239 + gf->multiply_region.w32 = gf_w4_neon_clm_multiply_region_from_single;
  240 +
  241 + return 1;
  242 +}
  243 +
  244 +void gf_w4_neon_single_table_init(gf_t *gf)
  245 +{
  246 + gf->multiply_region.w32 = gf_w4_single_table_multiply_region_neon;
  247 +}