Commit 370c88b9015cbe874aca81442a5d8f6f99bfb654

Authored by Janne Grunau
1 parent 474010a9
Exists in master and in 3 other branches v2, v3, wip-18092

arm: NEON optimisations for gf_w32

Optimisations for 4,32 split table multiplications.

Selected time_tool.sh results on a 1.7 GHz cortex-a9:
Region Best (MB/s):   346.67   W-Method: 32 -m SPLIT 32 4 -r SIMD -
Region Best (MB/s):    92.89   W-Method: 32 -m SPLIT 32 4 -r NOSIMD -
Region Best (MB/s):   258.17   W-Method: 32 -m SPLIT 32 4 -r SIMD -r ALTMAP -
Region Best (MB/s):   162.00   W-Method: 32 -m SPLIT 32 8 -
Region Best (MB/s):   160.53   W-Method: 32 -m SPLIT 8 8 -
Region Best (MB/s):    32.74   W-Method: 32 -m COMPOSITE 2 - -
Region Best (MB/s):   199.79   W-Method: 32 -m COMPOSITE 2 - -r ALTMAP -
include/gf_w32.h 0 → 100644
... ... @@ -0,0 +1,71 @@
  1 +/*
  2 + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
  3 + * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
  4 + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
  5 + *
  6 + * gf_w32.h
  7 + *
  8 + * Defines and data structures for 32-bit Galois fields
  9 + */
  10 +
  11 +#ifndef GF_COMPLETE_GF_W32_H
  12 +#define GF_COMPLETE_GF_W32_H
  13 +
  14 +#include <stdint.h>
  15 +
  16 +#define GF_FIELD_WIDTH (32)
  17 +#define GF_FIRST_BIT (1 << 31)
  18 +
  19 +#define GF_BASE_FIELD_WIDTH (16)
  20 +#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
  21 +#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1
  22 +#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1)
  23 +
  24 +struct gf_split_2_32_lazy_data {
  25 + uint32_t tables[16][4];
  26 + uint32_t last_value;
  27 +};
  28 +
  29 +struct gf_w32_split_8_8_data {
  30 + uint32_t tables[7][256][256];
  31 + uint32_t region_tables[4][256];
  32 + uint32_t last_value;
  33 +};
  34 +
  35 +struct gf_w32_group_data {
  36 + uint32_t *reduce;
  37 + uint32_t *shift;
  38 + int tshift;
  39 + uint64_t rmask;
  40 + uint32_t *memory;
  41 +};
  42 +
  43 +struct gf_split_16_32_lazy_data {
  44 + uint32_t tables[2][(1<<16)];
  45 + uint32_t last_value;
  46 +};
  47 +
  48 +struct gf_split_8_32_lazy_data {
  49 + uint32_t tables[4][256];
  50 + uint32_t last_value;
  51 +};
  52 +
  53 +struct gf_split_4_32_lazy_data {
  54 + uint32_t tables[8][16];
  55 + uint32_t last_value;
  56 +};
  57 +
  58 +struct gf_w32_bytwo_data {
  59 + uint64_t prim_poly;
  60 + uint64_t mask1;
  61 + uint64_t mask2;
  62 +};
  63 +
  64 +struct gf_w32_composite_data {
  65 + uint16_t *log;
  66 + uint16_t *alog;
  67 +};
  68 +
  69 +void gf_w32_neon_split_init(gf_t *gf);
  70 +
  71 +#endif /* GF_COMPLETE_GF_W32_H */
... ...
src/Makefile.am
... ... @@ -13,7 +13,8 @@ libgf_complete_la_SOURCES = gf.c gf_method.c gf_wgen.c gf_w4.c gf_w8.c gf_w16.c
13 13 if HAVE_NEON
14 14 libgf_complete_la_SOURCES += neon/gf_w4_neon.c \
15 15 neon/gf_w8_neon.c \
16   - neon/gf_w16_neon.c
  16 + neon/gf_w16_neon.c \
  17 + neon/gf_w32_neon.c
17 18 endif
18 19  
19 20 libgf_complete_la_LDFLAGS = -version-info 1:0:0
... ...
src/gf_w32.c
... ... @@ -12,59 +12,7 @@
12 12 #include "gf_int.h"
13 13 #include <stdio.h>
14 14 #include <stdlib.h>
15   -
16   -#define GF_FIELD_WIDTH (32)
17   -#define GF_FIRST_BIT (1 << 31)
18   -
19   -#define GF_BASE_FIELD_WIDTH (16)
20   -#define GF_BASE_FIELD_SIZE (1 << GF_BASE_FIELD_WIDTH)
21   -#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1
22   -#define GF_MULTBY_TWO(p) (((p) & GF_FIRST_BIT) ? (((p) << 1) ^ h->prim_poly) : (p) << 1)
23   -
24   -struct gf_split_2_32_lazy_data {
25   - uint32_t tables[16][4];
26   - uint32_t last_value;
27   -};
28   -
29   -struct gf_w32_split_8_8_data {
30   - uint32_t tables[7][256][256];
31   - uint32_t region_tables[4][256];
32   - uint32_t last_value;
33   -};
34   -
35   -struct gf_w32_group_data {
36   - uint32_t *reduce;
37   - uint32_t *shift;
38   - int tshift;
39   - uint64_t rmask;
40   - uint32_t *memory;
41   -};
42   -
43   -struct gf_split_16_32_lazy_data {
44   - uint32_t tables[2][(1<<16)];
45   - uint32_t last_value;
46   -};
47   -
48   -struct gf_split_8_32_lazy_data {
49   - uint32_t tables[4][256];
50   - uint32_t last_value;
51   -};
52   -
53   -struct gf_split_4_32_lazy_data {
54   - uint32_t tables[8][16];
55   - uint32_t last_value;
56   -};
57   -
58   -struct gf_w32_bytwo_data {
59   - uint64_t prim_poly;
60   - uint64_t mask1;
61   - uint64_t mask2;
62   -};
63   -
64   -struct gf_w32_composite_data {
65   - uint16_t *log;
66   - uint16_t *alog;
67   -};
  15 +#include "gf_w32.h"
68 16  
69 17 #define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); }
70 18  
... ... @@ -2283,6 +2231,7 @@ int gf_w32_split_init(gf_t *gf)
2283 2231 struct gf_split_16_32_lazy_data *d16;
2284 2232 uint32_t p, basep;
2285 2233 int i, j, exp, ispclmul, issse3;
  2234 + int isneon = 0;
2286 2235  
2287 2236 #if defined(INTEL_SSE4_PCLMUL)
2288 2237 ispclmul = 1;
... ... @@ -2295,6 +2244,9 @@ int gf_w32_split_init(gf_t *gf)
2295 2244 #else
2296 2245 issse3 = 0;
2297 2246 #endif
  2247 +#ifdef ARM_NEON
  2248 + isneon = 1;
  2249 +#endif
2298 2250  
2299 2251 h = (gf_internal_t *) gf->scratch;
2300 2252  
... ... @@ -2349,11 +2301,15 @@ int gf_w32_split_init(gf_t *gf)
2349 2301 /* 4/32 or Default + SSE - There is no ALTMAP/NOSSE. */
2350 2302  
2351 2303 if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4) ||
2352   - (issse3 && h->mult_type == GF_REGION_DEFAULT)) {
  2304 + ((issse3 || isneon) && h->mult_type == GF_REGION_DEFAULT)) {
2353 2305 ld4 = (struct gf_split_4_32_lazy_data *) h->private;
2354 2306 ld4->last_value = 0;
2355   - if ((h->region_type & GF_REGION_NOSIMD) || !issse3) {
  2307 + if ((h->region_type & GF_REGION_NOSIMD) || !(issse3 || isneon)) {
2356 2308 gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region;
  2309 + } else if (isneon) {
  2310 +#ifdef ARM_NEON
  2311 + gf_w32_neon_split_init(gf);
  2312 +#endif
2357 2313 } else if (h->region_type & GF_REGION_ALTMAP) {
2358 2314 gf->multiply_region.w32 = gf_w32_split_4_32_lazy_sse_altmap_multiply_region;
2359 2315 } else {
... ... @@ -2731,10 +2687,14 @@ int gf_w32_composite_init(gf_t *gf)
2731 2687 int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
2732 2688 {
2733 2689 int issse3 = 0;
  2690 + int isneon = 0;
2734 2691  
2735 2692 #ifdef INTEL_SSSE3
2736 2693 issse3 = 1;
2737 2694 #endif
  2695 +#ifdef ARM_NEON
  2696 + isneon = 1;
  2697 +#endif
2738 2698  
2739 2699 switch(mult_type)
2740 2700 {
... ... @@ -2760,7 +2720,7 @@ int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg
2760 2720 return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64;
2761 2721 }
2762 2722 if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32) ||
2763   - (mult_type == GF_MULT_DEFAULT && !issse3)) {
  2723 + (mult_type == GF_MULT_DEFAULT && !(issse3 || isneon))) {
2764 2724 return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64;
2765 2725 }
2766 2726 if ((arg1 == 4 && arg2 == 32) ||
... ...
src/neon/gf_w32_neon.c 0 → 100644
... ... @@ -0,0 +1,269 @@
  1 +/*
  2 + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
  3 + * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
  4 + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
  5 + *
  6 + * Copyright (c) 2014: Janne Grunau <j@jannau.net>
  7 + *
  8 + * Redistribution and use in source and binary forms, with or without
  9 + * modification, are permitted provided that the following conditions
  10 + * are met:
  11 + *
  12 + * - Redistributions of source code must retain the above copyright
  13 + * notice, this list of conditions and the following disclaimer.
  14 + *
  15 + * - Redistributions in binary form must reproduce the above copyright
  16 + * notice, this list of conditions and the following disclaimer in
  17 + * the documentation and/or other materials provided with the
  18 + * distribution.
  19 + *
  20 + * - Neither the name of the University of Tennessee nor the names of its
  21 + * contributors may be used to endorse or promote products derived
  22 + * from this software without specific prior written permission.
  23 + *
  24 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25 + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  27 + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  28 + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  29 + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  30 + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  31 + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  32 + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  33 + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  34 + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  35 + * POSSIBILITY OF SUCH DAMAGE.
  36 + *
  37 + * gf_w32_neon.c
  38 + *
  39 + * Neon routines for 32-bit Galois fields
  40 + *
  41 + */
  42 +
  43 +
  44 +#include "gf_int.h"
  45 +#include <stdio.h>
  46 +#include <stdlib.h>
  47 +#include "gf_w32.h"
  48 +
  49 +#ifndef ARCH_AARCH64
  50 +#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
  51 + vtbl2_u8(tbl, vget_high_u8(v)))
  52 +#endif
  53 +
  54 +static
  55 +void
  56 +neon_w32_split_4_32_multiply_region(gf_t *gf, uint32_t *src, uint32_t *dst,
  57 + uint32_t *d_end, uint8_t btable[8][4][16],
  58 + uint32_t val, int xor, int altmap)
  59 +{
  60 + int i, j;
  61 +#ifdef ARCH_AARCH64
  62 + uint8x16_t tables[8][4];
  63 +#else
  64 + uint8x8x2_t tables[8][4];
  65 +#endif
  66 + uint32x4_t v0, v1, v2, v3, s0, s1, s2, s3;
  67 + uint8x16_t p0, p1, p2, p3, si, mask1;
  68 + uint16x8x2_t r0, r1;
  69 + uint8x16x2_t q0, q1;
  70 +
  71 + for (i = 0; i < 8; i++) {
  72 + for (j = 0; j < 4; j++) {
  73 +#ifdef ARCH_AARCH64
  74 + tables[i][j] = vld1q_u8(btable[i][j]);
  75 +#else
  76 + tables[i][j].val[0] = vld1_u8(btable[i][j]);
  77 + tables[i][j].val[1] = vld1_u8(btable[i][j] + 8);
  78 +#endif
  79 + }
  80 + }
  81 +
  82 + mask1 = vdupq_n_u8(0xf);
  83 +
  84 + while (dst < d_end) {
  85 +
  86 + v0 = vld1q_u32(src); src += 4;
  87 + v1 = vld1q_u32(src); src += 4;
  88 + v2 = vld1q_u32(src); src += 4;
  89 + v3 = vld1q_u32(src); src += 4;
  90 +
  91 + if (altmap) {
  92 + q0.val[0] = vreinterpretq_u8_u32(v0);
  93 + q0.val[1] = vreinterpretq_u8_u32(v1);
  94 + q1.val[0] = vreinterpretq_u8_u32(v2);
  95 + q1.val[1] = vreinterpretq_u8_u32(v3);
  96 + } else {
  97 + r0 = vtrnq_u16(vreinterpretq_u16_u32(v0), vreinterpretq_u16_u32(v2));
  98 + r1 = vtrnq_u16(vreinterpretq_u16_u32(v1), vreinterpretq_u16_u32(v3));
  99 +
  100 + q0 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[0]),
  101 + vreinterpretq_u8_u16(r1.val[0]));
  102 + q1 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[1]),
  103 + vreinterpretq_u8_u16(r1.val[1]));
  104 + }
  105 +
  106 + si = vandq_u8(q0.val[0], mask1);
  107 + p0 = vqtbl1q_u8(tables[0][0], si);
  108 + p1 = vqtbl1q_u8(tables[0][1], si);
  109 + p2 = vqtbl1q_u8(tables[0][2], si);
  110 + p3 = vqtbl1q_u8(tables[0][3], si);
  111 +
  112 + si = vshrq_n_u8(q0.val[0], 4);
  113 + p0 = veorq_u8(p0, vqtbl1q_u8(tables[1][0], si));
  114 + p1 = veorq_u8(p1, vqtbl1q_u8(tables[1][1], si));
  115 + p2 = veorq_u8(p2, vqtbl1q_u8(tables[1][2], si));
  116 + p3 = veorq_u8(p3, vqtbl1q_u8(tables[1][3], si));
  117 +
  118 + si = vandq_u8(q0.val[1], mask1);
  119 + p0 = veorq_u8(p0, vqtbl1q_u8(tables[2][0], si));
  120 + p1 = veorq_u8(p1, vqtbl1q_u8(tables[2][1], si));
  121 + p2 = veorq_u8(p2, vqtbl1q_u8(tables[2][2], si));
  122 + p3 = veorq_u8(p3, vqtbl1q_u8(tables[2][3], si));
  123 +
  124 + si = vshrq_n_u8(q0.val[1], 4);
  125 + p0 = veorq_u8(p0, vqtbl1q_u8(tables[3][0], si));
  126 + p1 = veorq_u8(p1, vqtbl1q_u8(tables[3][1], si));
  127 + p2 = veorq_u8(p2, vqtbl1q_u8(tables[3][2], si));
  128 + p3 = veorq_u8(p3, vqtbl1q_u8(tables[3][3], si));
  129 +
  130 + si = vandq_u8(q1.val[0], mask1);
  131 + p0 = veorq_u8(p0, vqtbl1q_u8(tables[4][0], si));
  132 + p1 = veorq_u8(p1, vqtbl1q_u8(tables[4][1], si));
  133 + p2 = veorq_u8(p2, vqtbl1q_u8(tables[4][2], si));
  134 + p3 = veorq_u8(p3, vqtbl1q_u8(tables[4][3], si));
  135 +
  136 + si = vshrq_n_u8(q1.val[0], 4);
  137 + p0 = veorq_u8(p0, vqtbl1q_u8(tables[5][0], si));
  138 + p1 = veorq_u8(p1, vqtbl1q_u8(tables[5][1], si));
  139 + p2 = veorq_u8(p2, vqtbl1q_u8(tables[5][2], si));
  140 + p3 = veorq_u8(p3, vqtbl1q_u8(tables[5][3], si));
  141 +
  142 + si = vandq_u8(q1.val[1], mask1);
  143 + p0 = veorq_u8(p0, vqtbl1q_u8(tables[6][0], si));
  144 + p1 = veorq_u8(p1, vqtbl1q_u8(tables[6][1], si));
  145 + p2 = veorq_u8(p2, vqtbl1q_u8(tables[6][2], si));
  146 + p3 = veorq_u8(p3, vqtbl1q_u8(tables[6][3], si));
  147 +
  148 + si = vshrq_n_u8(q1.val[1], 4);
  149 + p0 = veorq_u8(p0, vqtbl1q_u8(tables[7][0], si));
  150 + p1 = veorq_u8(p1, vqtbl1q_u8(tables[7][1], si));
  151 + p2 = veorq_u8(p2, vqtbl1q_u8(tables[7][2], si));
  152 + p3 = veorq_u8(p3, vqtbl1q_u8(tables[7][3], si));
  153 +
  154 + if (altmap) {
  155 + s0 = vreinterpretq_u32_u8(p0);
  156 + s1 = vreinterpretq_u32_u8(p1);
  157 + s2 = vreinterpretq_u32_u8(p2);
  158 + s3 = vreinterpretq_u32_u8(p3);
  159 + } else {
  160 + q0 = vtrnq_u8(p0, p1);
  161 + q1 = vtrnq_u8(p2, p3);
  162 +
  163 + r0 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[0]),
  164 + vreinterpretq_u16_u8(q1.val[0]));
  165 + r1 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[1]),
  166 + vreinterpretq_u16_u8(q1.val[1]));
  167 +
  168 + s0 = vreinterpretq_u32_u16(r0.val[0]);
  169 + s1 = vreinterpretq_u32_u16(r1.val[0]);
  170 + s2 = vreinterpretq_u32_u16(r0.val[1]);
  171 + s3 = vreinterpretq_u32_u16(r1.val[1]);
  172 + }
  173 +
  174 + if (xor) {
  175 + v0 = vld1q_u32(dst);
  176 + v1 = vld1q_u32(dst + 4);
  177 + v2 = vld1q_u32(dst + 8);
  178 + v3 = vld1q_u32(dst + 12);
  179 + s0 = veorq_u32(s0, v0);
  180 + s1 = veorq_u32(s1, v1);
  181 + s2 = veorq_u32(s2, v2);
  182 + s3 = veorq_u32(s3, v3);
  183 + }
  184 +
  185 + vst1q_u32(dst, s0);
  186 + vst1q_u32(dst + 4, s1);
  187 + vst1q_u32(dst + 8, s2);
  188 + vst1q_u32(dst + 12, s3);
  189 +
  190 + dst += 16;
  191 + }
  192 +}
  193 +
  194 +static
  195 +inline
  196 +void
  197 +neon_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor, int altmap)
  198 +{
  199 + gf_internal_t *h;
  200 + int i, j, k;
  201 + uint32_t pp, v, *s32, *d32, *top, tmp_table[16];
  202 + uint8_t btable[8][4][16];
  203 + gf_region_data rd;
  204 +
  205 + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  206 + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
  207 +
  208 + h = (gf_internal_t *) gf->scratch;
  209 + pp = h->prim_poly;
  210 +
  211 + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 64);
  212 + gf_do_initial_region_alignment(&rd);
  213 +
  214 + s32 = (uint32_t *) rd.s_start;
  215 + d32 = (uint32_t *) rd.d_start;
  216 + top = (uint32_t *) rd.d_top;
  217 +
  218 + v = val;
  219 + for (i = 0; i < 8; i++) {
  220 + tmp_table[0] = 0;
  221 + for (j = 1; j < 16; j <<= 1) {
  222 + for (k = 0; k < j; k++) {
  223 + tmp_table[k^j] = (v ^ tmp_table[k]);
  224 + }
  225 + v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
  226 + }
  227 + for (j = 0; j < 4; j++) {
  228 + for (k = 0; k < 16; k++) {
  229 + btable[i][j][k] = (uint8_t) tmp_table[k];
  230 + tmp_table[k] >>= 8;
  231 + }
  232 + }
  233 + }
  234 +
  235 + if (xor)
  236 + neon_w32_split_4_32_multiply_region(gf, s32, d32, top, btable, val, 1, altmap);
  237 + else
  238 + neon_w32_split_4_32_multiply_region(gf, s32, d32, top, btable, val, 0, altmap);
  239 +
  240 + gf_do_final_region_alignment(&rd);
  241 +}
  242 +
  243 +static
  244 +void
  245 +gf_w32_split_4_32_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest,
  246 + gf_val_32_t val, int bytes, int xor)
  247 +{
  248 + neon_w32_split_4_32_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0);
  249 +}
  250 +
  251 +static
  252 +void
  253 +gf_w32_split_4_32_lazy_altmap_multiply_region_neon(gf_t *gf, void *src,
  254 + void *dest, gf_val_32_t val,
  255 + int bytes, int xor)
  256 +{
  257 + neon_w32_split_4_32_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1);
  258 +}
  259 +
  260 +void gf_w32_neon_split_init(gf_t *gf)
  261 +{
  262 + gf_internal_t *h = (gf_internal_t *) gf->scratch;
  263 +
  264 + if (h->region_type & GF_REGION_ALTMAP)
  265 + gf->multiply_region.w32 = gf_w32_split_4_32_lazy_altmap_multiply_region_neon;
  266 + else
  267 + gf->multiply_region.w32 = gf_w32_split_4_32_lazy_multiply_region_neon;
  268 +
  269 +}
... ...