Commit 438283c12d8378c449b78e011cb4f9c0ff33dcc3

Authored by animetosho
1 parent f373b138
Exists in master and in 1 other branch: v3

Use similar strategy for SPLIT(16,4) ALTMAP NEON implementation as SPLIT(32,4)

Showing 1 changed file with 41 additions and 95 deletions   Show diff stats
src/neon/gf_w16_neon.c
... ... @@ -105,58 +105,6 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
105 105 }
106 106 }
107 107  
108   -static
109   -inline
110   -void
111   -neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
112   - uint8_t *dst, uint8_t *d_end,
113   - uint8_t *tbl, gf_val_32_t val,
114   - int xor)
115   -{
116   - unsigned i;
117   - uint8_t *high = tbl + 4 * 16;
118   - uint8x16_t vh, vl, rh, rl;
119   - uint8x16_t loset;
120   -
121   - uint8x16_t tbl_h[4], tbl_l[4];
122   - for (i = 0; i < 4; i++) {
123   - tbl_l[i] = vld1q_u8(tbl + i*16);
124   - tbl_h[i] = vld1q_u8(high + i*16);
125   - }
126   -
127   - loset = vdupq_n_u8(0xf);
128   -
129   - while (dst < d_end) {
130   - vh = vld1q_u8(src);
131   - vl = vld1q_u8(src + 16);
132   -
133   - rl = vqtbl1q_u8(tbl_l[0], vandq_u8(vl, loset));
134   - rh = vqtbl1q_u8(tbl_h[0], vandq_u8(vl, loset));
135   - rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(vh, loset)));
136   - rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(vh, loset)));
137   -
138   - vl = vshrq_n_u8(vl, 4);
139   - vh = vshrq_n_u8(vh, 4);
140   -
141   - rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], vl));
142   - rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], vl));
143   - rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], vh));
144   - rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], vh));
145   -
146   - if (xor) {
147   - vh = vld1q_u8(dst);
148   - vl = vld1q_u8(dst + 16);
149   - rh = veorq_u8(rh, vh);
150   - rl = veorq_u8(rl, vl);
151   - }
152   - vst1q_u8(dst, rh);
153   - vst1q_u8(dst + 16, rl);
154   -
155   - src += 32;
156   - dst += 32;
157   - }
158   -}
159   -
160 108 #else /* ARCH_AARCH64 */
161 109  
162 110 static
... ... @@ -211,6 +159,12 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
211 159 dst += 8;
212 160 }
213 161 }
  162 +#endif /* ARCH_AARCH64 */
  163 +
  164 +#ifndef ARCH_AARCH64
  165 +#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
  166 + vtbl2_u8(tbl, vget_high_u8(v)))
  167 +#endif
214 168  
215 169 static
216 170 inline
... ... @@ -222,68 +176,60 @@ neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
222 176 {
223 177 unsigned i;
224 178 uint8_t *high = tbl + 4 * 16;
225   - uint8x8_t vh0, vh1, vl0, vl1, rh0, rh1, rl0, rl1;
226   - uint8x8_t loset;
  179 + uint8x16_t vh, vl, rh, rl;
  180 + uint8x16_t loset;
227 181  
  182 +#ifdef ARCH_AARCH64
  183 + uint8x16_t tbl_h[4], tbl_l[4];
  184 +#else
228 185 uint8x8x2_t tbl_h[4], tbl_l[4];
  186 +#endif
229 187 for (i = 0; i < 4; i++) {
  188 +#ifdef ARCH_AARCH64
  189 + tbl_l[i] = vld1q_u8(tbl + i*16);
  190 + tbl_h[i] = vld1q_u8(high + i*16);
  191 +#else
230 192 tbl_l[i].val[0] = vld1_u8(tbl + i*16);
231 193 tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
232 194 tbl_h[i].val[0] = vld1_u8(high + i*16);
233 195 tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
  196 +#endif
234 197 }
235 198  
236   - loset = vdup_n_u8(0xf);
  199 + loset = vdupq_n_u8(0xf);
237 200  
238 201 while (dst < d_end) {
239   - vh0 = vld1_u8(src);
240   - vh1 = vld1_u8(src + 8);
241   - vl0 = vld1_u8(src + 16);
242   - vl1 = vld1_u8(src + 24);
243   -
244   - rl0 = vtbl2_u8(tbl_l[0], vand_u8(vl0, loset));
245   - rl1 = vtbl2_u8(tbl_l[0], vand_u8(vl1, loset));
246   - rh0 = vtbl2_u8(tbl_h[0], vand_u8(vl0, loset));
247   - rh1 = vtbl2_u8(tbl_h[0], vand_u8(vl1, loset));
248   - rl0 = veor_u8(rl0, vtbl2_u8(tbl_l[2], vand_u8(vh0, loset)));
249   - rl1 = veor_u8(rl1, vtbl2_u8(tbl_l[2], vand_u8(vh1, loset)));
250   - rh0 = veor_u8(rh0, vtbl2_u8(tbl_h[2], vand_u8(vh0, loset)));
251   - rh1 = veor_u8(rh1, vtbl2_u8(tbl_h[2], vand_u8(vh1, loset)));
252   -
253   - vh0 = vshr_n_u8(vh0, 4);
254   - vh1 = vshr_n_u8(vh1, 4);
255   - vl0 = vshr_n_u8(vl0, 4);
256   - vl1 = vshr_n_u8(vl1, 4);
257   -
258   - rl0 = veor_u8(rl0, vtbl2_u8(tbl_l[1], vl0));
259   - rl1 = veor_u8(rl1, vtbl2_u8(tbl_l[1], vl1));
260   - rh0 = veor_u8(rh0, vtbl2_u8(tbl_h[1], vl0));
261   - rh1 = veor_u8(rh1, vtbl2_u8(tbl_h[1], vl1));
262   - rl0 = veor_u8(rl0, vtbl2_u8(tbl_l[3], vh0));
263   - rl1 = veor_u8(rl1, vtbl2_u8(tbl_l[3], vh1));
264   - rh0 = veor_u8(rh0, vtbl2_u8(tbl_h[3], vh0));
265   - rh1 = veor_u8(rh1, vtbl2_u8(tbl_h[3], vh1));
  202 + vh = vld1q_u8(src);
  203 + vl = vld1q_u8(src + 16);
  204 +
  205 + rl = vqtbl1q_u8(tbl_l[0], vandq_u8(vl, loset));
  206 + rh = vqtbl1q_u8(tbl_h[0], vandq_u8(vl, loset));
  207 + rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(vh, loset)));
  208 + rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(vh, loset)));
  209 +
  210 + vl = vshrq_n_u8(vl, 4);
  211 + vh = vshrq_n_u8(vh, 4);
  212 +
  213 + rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], vl));
  214 + rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], vl));
  215 + rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], vh));
  216 + rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], vh));
266 217  
267 218 if (xor) {
268   - vh0 = vld1_u8(dst);
269   - vh1 = vld1_u8(dst + 8);
270   - vl0 = vld1_u8(dst + 16);
271   - vl1 = vld1_u8(dst + 24);
272   - rh0 = veor_u8(rh0, vh0);
273   - rh1 = veor_u8(rh1, vh1);
274   - rl0 = veor_u8(rl0, vl0);
275   - rl1 = veor_u8(rl1, vl1);
  219 + vh = vld1q_u8(dst);
  220 + vl = vld1q_u8(dst + 16);
  221 + rh = veorq_u8(rh, vh);
  222 + rl = veorq_u8(rl, vl);
276 223 }
277   - vst1_u8(dst, rh0);
278   - vst1_u8(dst + 8, rh1);
279   - vst1_u8(dst + 16, rl0);
280   - vst1_u8(dst + 24, rl1);
  224 + vst1q_u8(dst, rh);
  225 + vst1q_u8(dst + 16, rl);
281 226  
282 227 src += 32;
283 228 dst += 32;
284 229 }
285 230 }
286   -#endif /* ARCH_AARCH64 */
  231 +
  232 +
287 233  
288 234 static
289 235 inline
... ...