Commit f373b138aae6ee052ca711e90837ca11bbedd156

Authored by animetosho
1 parent 7a9a09f3
Exists in master and in 1 other branch v3

Initial fix for SPLIT(16,4) ALTMAP NEON (non ARMv8)

Showing 1 changed file with 25 additions and 17 deletions   Show diff stats
src/neon/gf_w16_neon.c
... ... @@ -222,7 +222,7 @@ neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
222 222 {
223 223 unsigned i;
224 224 uint8_t *high = tbl + 4 * 16;
225   - uint8x8_t vh0, vh1, vl0, vl1, r0, r1, r2, r3;
  225 + uint8x8_t vh0, vh1, vl0, vl1, rh0, rh1, rl0, rl1;
226 226 uint8x8_t loset;
227 227  
228 228 uint8x8x2_t tbl_h[4], tbl_l[4];
... ... @@ -241,35 +241,43 @@ neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
241 241 vl0 = vld1_u8(src + 16);
242 242 vl1 = vld1_u8(src + 24);
243 243  
244   - r0 = vtbl2_u8(tbl_l[0], vand_u8(vh0, loset));
245   - r1 = vtbl2_u8(tbl_h[0], vand_u8(vh1, loset));
246   - r2 = vtbl2_u8(tbl_l[2], vand_u8(vl0, loset));
247   - r3 = vtbl2_u8(tbl_h[2], vand_u8(vl1, loset));
  244 + rl0 = vtbl2_u8(tbl_l[0], vand_u8(vl0, loset));
  245 + rl1 = vtbl2_u8(tbl_l[0], vand_u8(vl1, loset));
  246 + rh0 = vtbl2_u8(tbl_h[0], vand_u8(vl0, loset));
  247 + rh1 = vtbl2_u8(tbl_h[0], vand_u8(vl1, loset));
  248 + rl0 = veor_u8(rl0, vtbl2_u8(tbl_l[2], vand_u8(vh0, loset)));
  249 + rl1 = veor_u8(rl1, vtbl2_u8(tbl_l[2], vand_u8(vh1, loset)));
  250 + rh0 = veor_u8(rh0, vtbl2_u8(tbl_h[2], vand_u8(vh0, loset)));
  251 + rh1 = veor_u8(rh1, vtbl2_u8(tbl_h[2], vand_u8(vh1, loset)));
248 252  
249 253 vh0 = vshr_n_u8(vh0, 4);
250 254 vh1 = vshr_n_u8(vh1, 4);
251 255 vl0 = vshr_n_u8(vl0, 4);
252 256 vl1 = vshr_n_u8(vl1, 4);
253 257  
254   - r0 = veor_u8(r0, vtbl2_u8(tbl_l[1], vh0));
255   - r1 = veor_u8(r1, vtbl2_u8(tbl_h[1], vh1));
256   - r2 = veor_u8(r2, vtbl2_u8(tbl_l[3], vl0));
257   - r3 = veor_u8(r3, vtbl2_u8(tbl_h[3], vl1));
  258 + rl0 = veor_u8(rl0, vtbl2_u8(tbl_l[1], vl0));
  259 + rl1 = veor_u8(rl1, vtbl2_u8(tbl_l[1], vl1));
  260 + rh0 = veor_u8(rh0, vtbl2_u8(tbl_h[1], vl0));
  261 + rh1 = veor_u8(rh1, vtbl2_u8(tbl_h[1], vl1));
  262 + rl0 = veor_u8(rl0, vtbl2_u8(tbl_l[3], vh0));
  263 + rl1 = veor_u8(rl1, vtbl2_u8(tbl_l[3], vh1));
  264 + rh0 = veor_u8(rh0, vtbl2_u8(tbl_h[3], vh0));
  265 + rh1 = veor_u8(rh1, vtbl2_u8(tbl_h[3], vh1));
258 266  
259 267 if (xor) {
260 268 vh0 = vld1_u8(dst);
261 269 vh1 = vld1_u8(dst + 8);
262 270 vl0 = vld1_u8(dst + 16);
263 271 vl1 = vld1_u8(dst + 24);
264   - r0 = veor_u8(r0, vh0);
265   - r1 = veor_u8(r1, vh1);
266   - r2 = veor_u8(r2, vl0);
267   - r3 = veor_u8(r3, vl1);
  272 + rh0 = veor_u8(rh0, vh0);
  273 + rh1 = veor_u8(rh1, vh1);
  274 + rl0 = veor_u8(rl0, vl0);
  275 + rl1 = veor_u8(rl1, vl1);
268 276 }
269   - vst1_u8(dst, r0);
270   - vst1_u8(dst + 8, r1);
271   - vst1_u8(dst + 16, r2);
272   - vst1_u8(dst + 24, r3);
  277 + vst1_u8(dst, rh0);
  278 + vst1_u8(dst + 8, rh1);
  279 + vst1_u8(dst + 16, rl0);
  280 + vst1_u8(dst + 24, rl1);
273 281  
274 282 src += 32;
275 283 dst += 32;
... ...