Commit 51a1abb9185ec6ea35817620d13322047f4fde4d

Authored by Loic Dachary
2 parents 8fe7382e 643743d0
Exists in master and in 1 other branch: v3

Merge branch 'neon_fixes' into 'master'

NEON fixes/tweaks

This merge request fixes several issues and adds some tweaks to the NEON code:

* The SPLIT(16,4) ALTMAP implementation was broken: it only processed half the amount of data. Consequently, the fixed implementation is significantly slower than the old (broken) code, which is to be expected. Fixes #2
* The SPLIT(16,4) implementations now merge the ARMv8 and older code paths, similar to SPLIT(32,4). This fixes the ALTMAP variant and also enables the non-ALTMAP version to have consistent sizing
* The unnecessary VTRN was removed from non-ALTMAP SPLIT(16,4), since NEON can (de)interleave during load/store (see the first sketch after this list); because of this, ALTMAP isn't so useful on NEON
  * This can also be done for SPLIT(32,4), but I have not implemented it
* I also hoisted the `if(xor)` conditional in non-ALTMAP SPLIT(16,4) out of the inner loop (see the second sketch after this list). It seems to improve performance a bit on my Cortex-A7
  * It probably should be implemented everywhere else, but I have not done this
* CARRY_FREE was incorrectly enabled for all values of w, even though it is only available for w=4 and w=8
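
As a rough illustration of the load/store deinterleaving point above (a minimal sketch, not code from this commit; `split_bytes_demo` is a made-up name): `vld2q_u8` separates even and odd bytes into two registers as part of the load itself, which is the work the removed `VTRN` used to do as a separate step.

```c
#include <arm_neon.h>
#include <stdint.h>

/* Sketch: deinterleave 16 little-endian uint16_t values into their low and
 * high bytes directly at load time, instead of vld1q_u16 followed by vtrnq_u8
 * as in the old non-ALTMAP code. */
static void split_bytes_demo(const uint16_t *src, uint8_t *lo, uint8_t *hi)
{
    /* vld2q_u8 reads 32 bytes and de-interleaves them: val[0] receives the
     * even bytes (low bytes of each word), val[1] the odd bytes (high bytes). */
    uint8x16x2_t v = vld2q_u8((const uint8_t *)src);
    vst1q_u8(lo, v.val[0]);
    vst1q_u8(hi, v.val[1]);
    /* vst2q_u8 does the inverse on the way out, re-interleaving the two
     * registers into 16-bit words without a separate VTRN. */
}
```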

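And a scalar sketch of the `if(xor)` hoisting (illustrative only; `mult16` is a stand-in for the NEON table-lookup kernel, not a function from this code): the branch is evaluated once per region instead of once per loop iteration.

```c
#include <stddef.h>
#include <stdint.h>

/* Stand-in for the per-element multiply done by the NEON kernel. */
static inline uint16_t mult16(uint16_t x) { return (uint16_t)(x << 1); }

/* Sketch: duplicate the loop body so the xor test sits outside the loop. */
static void region_multiply(const uint16_t *src, uint16_t *dst, size_t n, int xor)
{
    size_t i;
    if (xor) {
        for (i = 0; i < n; i++)
            dst[i] ^= mult16(src[i]);
    } else {
        for (i = 0; i < n; i++)
            dst[i] = mult16(src[i]);
    }
}
```
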
See merge request !16
Showing 2 changed files with 59 additions and 139 deletions
src/gf.c
... ... @@ -219,7 +219,7 @@ int gf_error_check(int w, int mult_type, int region_type, int divide_type,
219 219 #endif
220 220  
221 221 #ifdef ARM_NEON
222   - pclmul = 1;
  222 + pclmul = (w == 4 || w == 8);
223 223 sse3 = 1;
224 224 #endif
225 225  
... ...
src/neon/gf_w16_neon.c
... ... @@ -46,7 +46,11 @@
46 46 #include <stdlib.h>
47 47 #include "gf_w16.h"
48 48  
49   -#ifdef ARCH_AARCH64
  49 +#ifndef ARCH_AARCH64
  50 +#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
  51 + vtbl2_u8(tbl, vget_high_u8(v)))
  52 +#endif
  53 +
50 54 static
51 55 inline
52 56 void
... ... @@ -56,23 +60,32 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
56 60 {
57 61 unsigned i;
58 62 uint8_t *high = tbl + 4 * 16;
59   - uint16x8_t va0, va1, r0, r1;
60 63 uint8x16_t loset, rl, rh;
61 64 uint8x16x2_t va;
62 65  
  66 +#ifdef ARCH_AARCH64
63 67 uint8x16_t tbl_h[4], tbl_l[4];
64 68 for (i = 0; i < 4; i++) {
65 69 tbl_l[i] = vld1q_u8(tbl + i*16);
66 70 tbl_h[i] = vld1q_u8(high + i*16);
67 71 }
  72 +#else
  73 + uint8x8x2_t tbl_h[4], tbl_l[4];
  74 + for (i = 0; i < 4; i++) {
  75 + tbl_l[i].val[0] = vld1_u8(tbl + i*16);
  76 + tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
  77 + tbl_h[i].val[0] = vld1_u8(high + i*16);
  78 + tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
  79 + }
  80 +#endif
68 81  
69 82 loset = vdupq_n_u8(0xf);
70 83  
71   - while (dst < d_end) {
72   - va0 = vld1q_u16(src);
73   - va1 = vld1q_u16(src + 8);
74   -
75   - va = vtrnq_u8(vreinterpretq_u8_u16(va0), vreinterpretq_u8_u16(va1));
  84 + if (xor) {
  85 + uint8x16x2_t vb;
  86 + while (dst < d_end) {
  87 + va = vld2q_u8((uint8_t*)src);
  88 + vb = vld2q_u8((uint8_t*)dst);
76 89  
77 90 rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
78 91 rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
... ... @@ -84,24 +97,38 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
84 97  
85 98 rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
86 99 rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
87   - rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
88   - rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
  100 + va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
  101 + va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
89 102  
90   - va = vtrnq_u8(rl, rh);
91   - r0 = vreinterpretq_u16_u8(va.val[0]);
92   - r1 = vreinterpretq_u16_u8(va.val[1]);
  103 + va.val[0] = veorq_u8(va.val[0], vb.val[0]);
  104 + va.val[1] = veorq_u8(va.val[1], vb.val[1]);
  105 + vst2q_u8((uint8_t*)dst, va);
93 106  
94   - if (xor) {
95   - va0 = vld1q_u16(dst);
96   - va1 = vld1q_u16(dst + 8);
97   - r0 = veorq_u16(r0, va0);
98   - r1 = veorq_u16(r1, va1);
99   - }
100   - vst1q_u16(dst, r0);
101   - vst1q_u16(dst + 8, r1);
  107 + src += 16;
  108 + dst += 16;
  109 + }
  110 + } else {
  111 + while (dst < d_end) {
  112 + va = vld2q_u8((uint8_t*)src);
  113 +
  114 + rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
  115 + rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
  116 + rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(va.val[1], loset)));
  117 + rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(va.val[1], loset)));
  118 +
  119 + va.val[0] = vshrq_n_u8(va.val[0], 4);
  120 + va.val[1] = vshrq_n_u8(va.val[1], 4);
  121 +
  122 + rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
  123 + rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
  124 + va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
  125 + va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
  126 +
  127 + vst2q_u8((uint8_t*)dst, va);
102 128  
103 129 src += 16;
104 130 dst += 16;
  131 + }
105 132 }
106 133 }
107 134  
... ... @@ -118,10 +145,21 @@ neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
118 145 uint8x16_t vh, vl, rh, rl;
119 146 uint8x16_t loset;
120 147  
  148 +#ifdef ARCH_AARCH64
121 149 uint8x16_t tbl_h[4], tbl_l[4];
  150 +#else
  151 + uint8x8x2_t tbl_h[4], tbl_l[4];
  152 +#endif
122 153 for (i = 0; i < 4; i++) {
  154 +#ifdef ARCH_AARCH64
123 155 tbl_l[i] = vld1q_u8(tbl + i*16);
124 156 tbl_h[i] = vld1q_u8(high + i*16);
  157 +#else
  158 + tbl_l[i].val[0] = vld1_u8(tbl + i*16);
  159 + tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
  160 + tbl_h[i].val[0] = vld1_u8(high + i*16);
  161 + tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
  162 +#endif
125 163 }
126 164  
127 165 loset = vdupq_n_u8(0xf);
... ... @@ -157,125 +195,7 @@ neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
157 195 }
158 196 }
159 197  
160   -#else /* ARCH_AARCH64 */
161   -
162   -static
163   -inline
164   -void
165   -neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
166   - uint16_t *d_end, uint8_t *tbl,
167   - gf_val_32_t val, int xor)
168   -{
169   - unsigned i;
170   - uint8_t *high = tbl + 4 * 16;
171   - uint16x8_t va, r;
172   - uint8x8_t loset, vb, vc, rl, rh;
173 198  
174   - uint8x8x2_t tbl_h[4], tbl_l[4];
175   - for (i = 0; i < 4; i++) {
176   - tbl_l[i].val[0] = vld1_u8(tbl + i*16);
177   - tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
178   - tbl_h[i].val[0] = vld1_u8(high + i*16);
179   - tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
180   - }
181   -
182   - loset = vdup_n_u8(0xf);
183   -
184   - while (dst < d_end) {
185   - va = vld1q_u16(src);
186   -
187   - vb = vmovn_u16(va);
188   - vc = vshrn_n_u16(va, 8);
189   -
190   - rl = vtbl2_u8(tbl_l[0], vand_u8(vb, loset));
191   - rh = vtbl2_u8(tbl_h[0], vand_u8(vb, loset));
192   - vb = vshr_n_u8(vb, 4);
193   - rl = veor_u8(rl, vtbl2_u8(tbl_l[2], vand_u8(vc, loset)));
194   - rh = veor_u8(rh, vtbl2_u8(tbl_h[2], vand_u8(vc, loset)));
195   - vc = vshr_n_u8(vc, 4);
196   - rl = veor_u8(rl, vtbl2_u8(tbl_l[1], vb));
197   - rh = veor_u8(rh, vtbl2_u8(tbl_h[1], vb));
198   - rl = veor_u8(rl, vtbl2_u8(tbl_l[3], vc));
199   - rh = veor_u8(rh, vtbl2_u8(tbl_h[3], vc));
200   -
201   - r = vmovl_u8(rl);
202   - r = vorrq_u16(r, vshll_n_u8(rh, 8));
203   -
204   - if (xor) {
205   - va = vld1q_u16(dst);
206   - r = veorq_u16(r, va);
207   - }
208   - vst1q_u16(dst, r);
209   -
210   - src += 8;
211   - dst += 8;
212   - }
213   -}
214   -
215   -static
216   -inline
217   -void
218   -neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
219   - uint8_t *dst, uint8_t *d_end,
220   - uint8_t *tbl, gf_val_32_t val,
221   - int xor)
222   -{
223   - unsigned i;
224   - uint8_t *high = tbl + 4 * 16;
225   - uint8x8_t vh0, vh1, vl0, vl1, r0, r1, r2, r3;
226   - uint8x8_t loset;
227   -
228   - uint8x8x2_t tbl_h[4], tbl_l[4];
229   - for (i = 0; i < 4; i++) {
230   - tbl_l[i].val[0] = vld1_u8(tbl + i*16);
231   - tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
232   - tbl_h[i].val[0] = vld1_u8(high + i*16);
233   - tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
234   - }
235   -
236   - loset = vdup_n_u8(0xf);
237   -
238   - while (dst < d_end) {
239   - vh0 = vld1_u8(src);
240   - vh1 = vld1_u8(src + 8);
241   - vl0 = vld1_u8(src + 16);
242   - vl1 = vld1_u8(src + 24);
243   -
244   - r0 = vtbl2_u8(tbl_l[0], vand_u8(vh0, loset));
245   - r1 = vtbl2_u8(tbl_h[0], vand_u8(vh1, loset));
246   - r2 = vtbl2_u8(tbl_l[2], vand_u8(vl0, loset));
247   - r3 = vtbl2_u8(tbl_h[2], vand_u8(vl1, loset));
248   -
249   - vh0 = vshr_n_u8(vh0, 4);
250   - vh1 = vshr_n_u8(vh1, 4);
251   - vl0 = vshr_n_u8(vl0, 4);
252   - vl1 = vshr_n_u8(vl1, 4);
253   -
254   - r0 = veor_u8(r0, vtbl2_u8(tbl_l[1], vh0));
255   - r1 = veor_u8(r1, vtbl2_u8(tbl_h[1], vh1));
256   - r2 = veor_u8(r2, vtbl2_u8(tbl_l[3], vl0));
257   - r3 = veor_u8(r3, vtbl2_u8(tbl_h[3], vl1));
258   -
259   - if (xor) {
260   - vh0 = vld1_u8(dst);
261   - vh1 = vld1_u8(dst + 8);
262   - vl0 = vld1_u8(dst + 16);
263   - vl1 = vld1_u8(dst + 24);
264   - r0 = veor_u8(r0, vh0);
265   - r1 = veor_u8(r1, vh1);
266   - r2 = veor_u8(r2, vl0);
267   - r3 = veor_u8(r3, vl1);
268   - }
269   - vst1_u8(dst, r0);
270   - vst1_u8(dst + 8, r1);
271   - vst1_u8(dst + 16, r2);
272   - vst1_u8(dst + 24, r3);
273   -
274   - src += 32;
275   - dst += 32;
276   - }
277   -}
278   -#endif /* ARCH_AARCH64 */
279 199  
280 200 static
281 201 inline
... ...