Commit 6fdd8bc3d32cb2f7fa55d2de9dc7cc5bb2f885aa
1 parent
370c88b9
Exists in
master
and in
3 other branches
arm: NEON optimisations for gf_w64
Optimisations for the 4,64 split-table region multiplications. Only enabled on ARMv8-A, since the NEON path is not faster than the existing code on ARMv7-A.
Showing
4 changed files
with
400 additions
and
37 deletions
Show diff stats
... | ... | @@ -0,0 +1,50 @@ |
1 | +/* | |
2 | + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic | |
3 | + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, | |
4 | + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. | |
5 | + * | |
6 | + * gf_w64.h | |
7 | + * | |
8 | + * Defines and data structures for 64-bit Galois fields | |
9 | + */ | |
10 | + | |
11 | +#ifndef GF_COMPLETE_GF_W64_H | |
12 | +#define GF_COMPLETE_GF_W64_H | |
13 | + | |
14 | +#include <stdint.h> | |
15 | + | |
16 | +#define GF_FIELD_WIDTH (64) | |
17 | +#define GF_FIRST_BIT (1ULL << 63) | |
18 | + | |
19 | +#define GF_BASE_FIELD_WIDTH (32) | |
20 | +#define GF_BASE_FIELD_SIZE (1ULL << GF_BASE_FIELD_WIDTH) | |
21 | +#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1 | |
22 | + | |
23 | +struct gf_w64_group_data { | |
24 | + uint64_t *reduce; | |
25 | + uint64_t *shift; | |
26 | + uint64_t *memory; | |
27 | +}; | |
28 | + | |
29 | +struct gf_split_4_64_lazy_data { | |
30 | + uint64_t tables[16][16]; | |
31 | + uint64_t last_value; | |
32 | +}; | |
33 | + | |
34 | +struct gf_split_8_64_lazy_data { | |
35 | + uint64_t tables[8][(1<<8)]; | |
36 | + uint64_t last_value; | |
37 | +}; | |
38 | + | |
39 | +struct gf_split_16_64_lazy_data { | |
40 | + uint64_t tables[4][(1<<16)]; | |
41 | + uint64_t last_value; | |
42 | +}; | |
43 | + | |
44 | +struct gf_split_8_8_data { | |
45 | + uint64_t tables[15][256][256]; | |
46 | +}; | |
47 | + | |
48 | +void gf_w64_neon_split_init(gf_t *gf); | |
49 | + | |
50 | +#endif /* GF_COMPLETE_GF_W64_H */ | ... | ... |
src/Makefile.am
... | ... | @@ -14,7 +14,8 @@ if HAVE_NEON |
14 | 14 | libgf_complete_la_SOURCES += neon/gf_w4_neon.c \ |
15 | 15 | neon/gf_w8_neon.c \ |
16 | 16 | neon/gf_w16_neon.c \ |
17 | - neon/gf_w32_neon.c | |
17 | + neon/gf_w32_neon.c \ | |
18 | + neon/gf_w64_neon.c | |
18 | 19 | endif |
19 | 20 | |
20 | 21 | libgf_complete_la_LDFLAGS = -version-info 1:0:0 | ... | ... |
src/gf_w64.c
... | ... | @@ -11,38 +11,7 @@ |
11 | 11 | #include "gf_int.h" |
12 | 12 | #include <stdio.h> |
13 | 13 | #include <stdlib.h> |
14 | - | |
15 | -#define GF_FIELD_WIDTH (64) | |
16 | -#define GF_FIRST_BIT (1ULL << 63) | |
17 | - | |
18 | -#define GF_BASE_FIELD_WIDTH (32) | |
19 | -#define GF_BASE_FIELD_SIZE (1ULL << GF_BASE_FIELD_WIDTH) | |
20 | -#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1 | |
21 | - | |
22 | -struct gf_w64_group_data { | |
23 | - uint64_t *reduce; | |
24 | - uint64_t *shift; | |
25 | - uint64_t *memory; | |
26 | -}; | |
27 | - | |
28 | -struct gf_split_4_64_lazy_data { | |
29 | - uint64_t tables[16][16]; | |
30 | - uint64_t last_value; | |
31 | -}; | |
32 | - | |
33 | -struct gf_split_8_64_lazy_data { | |
34 | - uint64_t tables[8][(1<<8)]; | |
35 | - uint64_t last_value; | |
36 | -}; | |
37 | - | |
38 | -struct gf_split_16_64_lazy_data { | |
39 | - uint64_t tables[4][(1<<16)]; | |
40 | - uint64_t last_value; | |
41 | -}; | |
42 | - | |
43 | -struct gf_split_8_8_data { | |
44 | - uint64_t tables[15][256][256]; | |
45 | -}; | |
14 | +#include "gf_w64.h" | |
46 | 15 | |
47 | 16 | static |
48 | 17 | inline |
... | ... | @@ -2027,11 +1996,15 @@ int gf_w64_split_init(gf_t *gf) |
2027 | 1996 | /* Allen: set region pointers for default mult type. Single pointers are |
2028 | 1997 | * taken care of above (explicitly for sse, implicitly for no sse). */ |
2029 | 1998 | |
2030 | -#ifdef INTEL_SSE4 | |
1999 | +#if defined(INTEL_SSE4) || defined(ARCH_AARCH64) | |
2031 | 2000 | if (h->mult_type == GF_MULT_DEFAULT) { |
2032 | 2001 | d4 = (struct gf_split_4_64_lazy_data *) h->private; |
2033 | 2002 | d4->last_value = 0; |
2003 | +#if defined(INTEL_SSE4) | |
2034 | 2004 | gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region; |
2005 | +#elif defined(ARCH_AARCH64) | |
2006 | + gf_w64_neon_split_init(gf); | |
2007 | +#endif | |
2035 | 2008 | } |
2036 | 2009 | #else |
2037 | 2010 | if (h->mult_type == GF_MULT_DEFAULT) { |
... | ... | @@ -2050,17 +2023,23 @@ int gf_w64_split_init(gf_t *gf) |
2050 | 2023 | { |
2051 | 2024 | #ifdef INTEL_SSSE3 |
2052 | 2025 | gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_altmap_multiply_region; |
2026 | + #elif defined(ARCH_AARCH64) | |
2027 | + gf_w64_neon_split_init(gf); | |
2053 | 2028 | #else |
2054 | 2029 | return 0; |
2055 | 2030 | #endif |
2056 | 2031 | } |
2057 | 2032 | else //no altmap |
2058 | 2033 | { |
2059 | - #ifdef INTEL_SSE4 | |
2034 | + #if defined(INTEL_SSE4) || defined(ARCH_AARCH64) | |
2060 | 2035 | if(h->region_type & GF_REGION_NOSIMD) |
2061 | 2036 | gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region; |
2062 | 2037 | else |
2063 | - gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region; | |
2038 | + #if defined(INTEL_SSE4) | |
2039 | + gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region; | |
2040 | + #elif defined(ARCH_AARCH64) | |
2041 | + gf_w64_neon_split_init(gf); | |
2042 | + #endif | |
2064 | 2043 | #else |
2065 | 2044 | gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region; |
2066 | 2045 | if(h->region_type & GF_REGION_SIMD) |
... | ... | @@ -2134,7 +2113,7 @@ int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg |
2134 | 2113 | /* Allen: set the *local* arg1 and arg2, just for scratch size purposes, |
2135 | 2114 | * then fall through to split table scratch size code. */ |
2136 | 2115 | |
2137 | -#ifdef INTEL_SSE4 | |
2116 | +#if defined(INTEL_SSE4) || defined(ARCH_AARCH64) | |
2138 | 2117 | arg1 = 64; |
2139 | 2118 | arg2 = 4; |
2140 | 2119 | #else | ... | ... |
... | ... | @@ -0,0 +1,333 @@ |
1 | +/* | |
2 | + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic | |
3 | + * James S. Plank, Ethan L. Miller, Kevin M. Greenan, | |
4 | + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride. | |
5 | + * | |
6 | + * Copyright (c) 2014: Janne Grunau <j@jannau.net> | |
7 | + * | |
8 | + * Redistribution and use in source and binary forms, with or without | |
9 | + * modification, are permitted provided that the following conditions | |
10 | + * are met: | |
11 | + * | |
12 | + * - Redistributions of source code must retain the above copyright | |
13 | + * notice, this list of conditions and the following disclaimer. | |
14 | + * | |
15 | + * - Redistributions in binary form must reproduce the above copyright | |
16 | + * notice, this list of conditions and the following disclaimer in | |
17 | + * the documentation and/or other materials provided with the | |
18 | + * distribution. | |
19 | + * | |
20 | + * - Neither the name of the University of Tennessee nor the names of its | |
21 | + * contributors may be used to endorse or promote products derived | |
22 | + * from this software without specific prior written permission. | |
23 | + * | |
24 | + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
25 | + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
26 | + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
27 | + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
28 | + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, | |
29 | + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, | |
30 | + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS | |
31 | + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED | |
32 | + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
33 | + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY | |
34 | + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
35 | + * POSSIBILITY OF SUCH DAMAGE. | |
36 | + * | |
37 | + * gf_w64_neon.c | |
38 | + * | |
39 | + * Neon routines for 64-bit Galois fields | |
40 | + * | |
41 | + */ | |
42 | + | |
43 | +#include "gf_int.h" | |
44 | +#include <stdio.h> | |
45 | +#include <stdlib.h> | |
46 | +#include "gf_w64.h" | |
47 | + | |
48 | + | |
49 | +#ifndef ARCH_AARCH64 | |
50 | +#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \ | |
51 | + vtbl2_u8(tbl, vget_high_u8(v))) | |
52 | +#endif | |
53 | + | |
54 | +static | |
55 | +inline | |
56 | +void | |
57 | +neon_w64_split_4_lazy_altmap_multiply_region(gf_t *gf, uint64_t *src, | |
58 | + uint64_t *dst, uint64_t *d_end, | |
59 | + uint64_t val, int xor) | |
60 | +{ | |
61 | + unsigned i, j, k; | |
62 | + uint8_t btable[16]; | |
63 | +#ifdef ARCH_AARCH64 | |
64 | + uint8x16_t tables[16][8]; | |
65 | +#else | |
66 | + uint8x8x2_t tables[16][8]; | |
67 | +#endif | |
68 | + uint8x16_t p[8], mask1, si; | |
69 | + | |
70 | + gf_internal_t *h = (gf_internal_t *) gf->scratch; | |
71 | + struct gf_split_4_64_lazy_data *ld = (struct gf_split_4_64_lazy_data *) h->private; | |
72 | + | |
73 | + for (i = 0; i < 16; i++) { | |
74 | + for (j = 0; j < 8; j++) { | |
75 | + for (k = 0; k < 16; k++) { | |
76 | + btable[k] = (uint8_t) ld->tables[i][k]; | |
77 | + ld->tables[i][k] >>= 8; | |
78 | + } | |
79 | +#ifdef ARCH_AARCH64 | |
80 | + tables[i][j] = vld1q_u8(btable); | |
81 | +#else | |
82 | + tables[i][j].val[0] = vld1_u8(btable); | |
83 | + tables[i][j].val[1] = vld1_u8(btable + 8); | |
84 | +#endif | |
85 | + } | |
86 | + } | |
87 | + | |
88 | + mask1 = vdupq_n_u8(0xf); | |
89 | + | |
90 | + while (dst < d_end) { | |
91 | + | |
92 | + if (xor) { | |
93 | + for (i = 0; i < 8; i++) | |
94 | + p[i] = vld1q_u8((uint8_t *) (dst + i * 2)); | |
95 | + } else { | |
96 | + for (i = 0; i < 8; i++) | |
97 | + p[i] = vdupq_n_u8(0); | |
98 | + } | |
99 | + | |
100 | + i = 0; | |
101 | + for (k = 0; k < 8; k++) { | |
102 | + uint8x16_t v0 = vld1q_u8((uint8_t *) src); | |
103 | + src += 2; | |
104 | + | |
105 | + si = vandq_u8(v0, mask1); | |
106 | + for (j = 0; j < 8; j++) { | |
107 | + p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si)); | |
108 | + } | |
109 | + i++; | |
110 | + si = vshrq_n_u8(v0, 4); | |
111 | + for (j = 0; j < 8; j++) { | |
112 | + p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si)); | |
113 | + } | |
114 | + i++; | |
115 | + | |
116 | + } | |
117 | + for (i = 0; i < 8; i++) { | |
118 | + vst1q_u8((uint8_t *) dst, p[i]); | |
119 | + dst += 2; | |
120 | + } | |
121 | + } | |
122 | +} | |
123 | + | |
124 | +static | |
125 | +inline | |
126 | +void | |
127 | +neon_w64_split_4_lazy_multiply_region(gf_t *gf, uint64_t *src, uint64_t *dst, | |
128 | + uint64_t *d_end, uint64_t val, int xor) | |
129 | +{ | |
130 | + unsigned i, j, k; | |
131 | + uint8_t btable[16]; | |
132 | +#ifdef ARCH_AARCH64 | |
133 | + uint8x16_t tables[16][8]; | |
134 | +#else | |
135 | + uint8x8x2_t tables[16][8]; | |
136 | +#endif | |
137 | + uint8x16_t p[8], mask1, si; | |
138 | + uint64x2_t st[8]; | |
139 | + uint32x4x2_t s32[4]; | |
140 | + uint16x8x2_t s16[4]; | |
141 | + uint8x16x2_t s8[4]; | |
142 | + | |
143 | + gf_internal_t *h = (gf_internal_t *) gf->scratch; | |
144 | + struct gf_split_4_64_lazy_data *ld = (struct gf_split_4_64_lazy_data *) h->private; | |
145 | + | |
146 | + for (i = 0; i < 16; i++) { | |
147 | + for (j = 0; j < 8; j++) { | |
148 | + for (k = 0; k < 16; k++) { | |
149 | + btable[k] = (uint8_t) ld->tables[i][k]; | |
150 | + ld->tables[i][k] >>= 8; | |
151 | + } | |
152 | +#ifdef ARCH_AARCH64 | |
153 | + tables[i][j] = vld1q_u8(btable); | |
154 | +#else | |
155 | + tables[i][j].val[0] = vld1_u8(btable); | |
156 | + tables[i][j].val[1] = vld1_u8(btable + 8); | |
157 | +#endif | |
158 | + } | |
159 | + } | |
160 | + | |
161 | + mask1 = vdupq_n_u8(0xf); | |
162 | + | |
163 | + while (dst < d_end) { | |
164 | + | |
165 | + for (k = 0; k < 8; k++) { | |
166 | + st[k] = vld1q_u64(src); | |
167 | + src += 2; | |
168 | + p[k] = vdupq_n_u8(0); | |
169 | + } | |
170 | + | |
171 | + s32[0] = vuzpq_u32(vreinterpretq_u32_u64(st[0]), | |
172 | + vreinterpretq_u32_u64(st[1])); | |
173 | + s32[1] = vuzpq_u32(vreinterpretq_u32_u64(st[2]), | |
174 | + vreinterpretq_u32_u64(st[3])); | |
175 | + s32[2] = vuzpq_u32(vreinterpretq_u32_u64(st[4]), | |
176 | + vreinterpretq_u32_u64(st[5])); | |
177 | + s32[3] = vuzpq_u32(vreinterpretq_u32_u64(st[6]), | |
178 | + vreinterpretq_u32_u64(st[7])); | |
179 | + | |
180 | + s16[0] = vuzpq_u16(vreinterpretq_u16_u32(s32[0].val[0]), | |
181 | + vreinterpretq_u16_u32(s32[1].val[0])); | |
182 | + s16[1] = vuzpq_u16(vreinterpretq_u16_u32(s32[2].val[0]), | |
183 | + vreinterpretq_u16_u32(s32[3].val[0])); | |
184 | + s16[2] = vuzpq_u16(vreinterpretq_u16_u32(s32[0].val[1]), | |
185 | + vreinterpretq_u16_u32(s32[1].val[1])); | |
186 | + s16[3] = vuzpq_u16(vreinterpretq_u16_u32(s32[2].val[1]), | |
187 | + vreinterpretq_u16_u32(s32[3].val[1])); | |
188 | + | |
189 | + s8[0] = vuzpq_u8(vreinterpretq_u8_u16(s16[0].val[0]), | |
190 | + vreinterpretq_u8_u16(s16[1].val[0])); | |
191 | + s8[1] = vuzpq_u8(vreinterpretq_u8_u16(s16[0].val[1]), | |
192 | + vreinterpretq_u8_u16(s16[1].val[1])); | |
193 | + s8[2] = vuzpq_u8(vreinterpretq_u8_u16(s16[2].val[0]), | |
194 | + vreinterpretq_u8_u16(s16[3].val[0])); | |
195 | + s8[3] = vuzpq_u8(vreinterpretq_u8_u16(s16[2].val[1]), | |
196 | + vreinterpretq_u8_u16(s16[3].val[1])); | |
197 | + | |
198 | + i = 0; | |
199 | + for (k = 0; k < 8; k++) { | |
200 | + si = vandq_u8(s8[k >> 1].val[k & 1], mask1); | |
201 | + for (j = 0; j < 8; j++) { | |
202 | + p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si)); | |
203 | + } | |
204 | + i++; | |
205 | + si = vshrq_n_u8(s8[k >> 1].val[k & 1], 4); | |
206 | + for (j = 0; j < 8; j++) { | |
207 | + p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si)); | |
208 | + } | |
209 | + i++; | |
210 | + } | |
211 | + | |
212 | + s8[0] = vzipq_u8(p[0], p[1]); | |
213 | + s8[1] = vzipq_u8(p[2], p[3]); | |
214 | + s8[2] = vzipq_u8(p[4], p[5]); | |
215 | + s8[3] = vzipq_u8(p[6], p[7]); | |
216 | + | |
217 | + s16[0] = vzipq_u16(vreinterpretq_u16_u8(s8[0].val[0]), | |
218 | + vreinterpretq_u16_u8(s8[1].val[0])); | |
219 | + s16[1] = vzipq_u16(vreinterpretq_u16_u8(s8[2].val[0]), | |
220 | + vreinterpretq_u16_u8(s8[3].val[0])); | |
221 | + s16[2] = vzipq_u16(vreinterpretq_u16_u8(s8[0].val[1]), | |
222 | + vreinterpretq_u16_u8(s8[1].val[1])); | |
223 | + s16[3] = vzipq_u16(vreinterpretq_u16_u8(s8[2].val[1]), | |
224 | + vreinterpretq_u16_u8(s8[3].val[1])); | |
225 | + | |
226 | + s32[0] = vzipq_u32(vreinterpretq_u32_u16(s16[0].val[0]), | |
227 | + vreinterpretq_u32_u16(s16[1].val[0])); | |
228 | + s32[1] = vzipq_u32(vreinterpretq_u32_u16(s16[0].val[1]), | |
229 | + vreinterpretq_u32_u16(s16[1].val[1])); | |
230 | + s32[2] = vzipq_u32(vreinterpretq_u32_u16(s16[2].val[0]), | |
231 | + vreinterpretq_u32_u16(s16[3].val[0])); | |
232 | + s32[3] = vzipq_u32(vreinterpretq_u32_u16(s16[2].val[1]), | |
233 | + vreinterpretq_u32_u16(s16[3].val[1])); | |
234 | + | |
235 | + for (k = 0; k < 8; k ++) { | |
236 | + st[k] = vreinterpretq_u64_u32(s32[k >> 1].val[k & 1]); | |
237 | + } | |
238 | + | |
239 | + if (xor) { | |
240 | + for (i = 0; i < 8; i++) { | |
241 | + uint64x2_t t1 = vld1q_u64(dst); | |
242 | + vst1q_u64(dst, veorq_u64(st[i], t1)); | |
243 | + dst += 2; | |
244 | + } | |
245 | + } else { | |
246 | + for (i = 0; i < 8; i++) { | |
247 | + vst1q_u64(dst, st[i]); | |
248 | + dst += 2; | |
249 | + } | |
250 | + } | |
251 | + | |
252 | + } | |
253 | +} | |
254 | + | |
255 | +static | |
256 | +void | |
257 | +gf_w64_neon_split_4_lazy_multiply_region(gf_t *gf, void *src, void *dest, | |
258 | + uint64_t val, int bytes, int xor, | |
259 | + int altmap) | |
260 | +{ | |
261 | + gf_internal_t *h; | |
262 | + int i, j, k; | |
263 | + uint64_t pp, v, *s64, *d64, *top; | |
264 | + struct gf_split_4_64_lazy_data *ld; | |
265 | + gf_region_data rd; | |
266 | + | |
267 | + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; } | |
268 | + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; } | |
269 | + | |
270 | + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128); | |
271 | + gf_do_initial_region_alignment(&rd); | |
272 | + | |
273 | + s64 = (uint64_t *) rd.s_start; | |
274 | + d64 = (uint64_t *) rd.d_start; | |
275 | + top = (uint64_t *) rd.d_top; | |
276 | + | |
277 | + h = (gf_internal_t *) gf->scratch; | |
278 | + pp = h->prim_poly; | |
279 | + ld = (struct gf_split_4_64_lazy_data *) h->private; | |
280 | + | |
281 | + v = val; | |
282 | + for (i = 0; i < 16; i++) { | |
283 | + ld->tables[i][0] = 0; | |
284 | + for (j = 1; j < 16; j <<= 1) { | |
285 | + for (k = 0; k < j; k++) { | |
286 | + ld->tables[i][k^j] = (v ^ ld->tables[i][k]); | |
287 | + } | |
288 | + v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1); | |
289 | + } | |
290 | + } | |
291 | + | |
292 | + if (altmap) { | |
293 | + if (xor) | |
294 | + neon_w64_split_4_lazy_altmap_multiply_region(gf, s64, d64, top, val, 1); | |
295 | + else | |
296 | + neon_w64_split_4_lazy_altmap_multiply_region(gf, s64, d64, top, val, 0); | |
297 | + } else { | |
298 | + if (xor) | |
299 | + neon_w64_split_4_lazy_multiply_region(gf, s64, d64, top, val, 1); | |
300 | + else | |
301 | + neon_w64_split_4_lazy_multiply_region(gf, s64, d64, top, val, 0); | |
302 | + } | |
303 | + | |
304 | + gf_do_final_region_alignment(&rd); | |
305 | +} | |
306 | + | |
307 | +static | |
308 | +void | |
309 | +gf_w64_split_4_64_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest, | |
310 | + uint64_t val, int bytes, int xor) | |
311 | +{ | |
312 | + gf_w64_neon_split_4_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0); | |
313 | +} | |
314 | + | |
315 | +static | |
316 | +void | |
317 | +gf_w64_split_4_64_lazy_altmap_multiply_region_neon(gf_t *gf, void *src, | |
318 | + void *dest, uint64_t val, | |
319 | + int bytes, int xor) | |
320 | +{ | |
321 | + gf_w64_neon_split_4_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1); | |
322 | +} | |
323 | + | |
324 | +void gf_w64_neon_split_init(gf_t *gf) | |
325 | +{ | |
326 | + gf_internal_t *h = (gf_internal_t *) gf->scratch; | |
327 | + | |
328 | + if (h->region_type & GF_REGION_ALTMAP) | |
329 | + gf->multiply_region.w64 = gf_w64_split_4_64_lazy_altmap_multiply_region_neon; | |
330 | + else | |
331 | + gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region_neon; | |
332 | + | |
333 | +} | ... | ... |