Commit f373b138aae6ee052ca711e90837ca11bbedd156

Authored by animetosho
1 parent 7a9a09f3
Exists in master and in 1 other branch v3

Initial fix for SPLIT(16,4) ALTMAP NEON (non ARMv8)

Showing 1 changed file with 25 additions and 17 deletions   Show diff stats
src/neon/gf_w16_neon.c
@@ -222,7 +222,7 @@ neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src, @@ -222,7 +222,7 @@ neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
222 { 222 {
223 unsigned i; 223 unsigned i;
224 uint8_t *high = tbl + 4 * 16; 224 uint8_t *high = tbl + 4 * 16;
225 - uint8x8_t vh0, vh1, vl0, vl1, r0, r1, r2, r3; 225 + uint8x8_t vh0, vh1, vl0, vl1, rh0, rh1, rl0, rl1;
226 uint8x8_t loset; 226 uint8x8_t loset;
227 227
228 uint8x8x2_t tbl_h[4], tbl_l[4]; 228 uint8x8x2_t tbl_h[4], tbl_l[4];
@@ -241,35 +241,43 @@ neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src, @@ -241,35 +241,43 @@ neon_w16_split_4_altmap_multiply_region(gf_t *gf, uint8_t *src,
241 vl0 = vld1_u8(src + 16); 241 vl0 = vld1_u8(src + 16);
242 vl1 = vld1_u8(src + 24); 242 vl1 = vld1_u8(src + 24);
243 243
244 - r0 = vtbl2_u8(tbl_l[0], vand_u8(vh0, loset));  
245 - r1 = vtbl2_u8(tbl_h[0], vand_u8(vh1, loset));  
246 - r2 = vtbl2_u8(tbl_l[2], vand_u8(vl0, loset));  
247 - r3 = vtbl2_u8(tbl_h[2], vand_u8(vl1, loset)); 244 + rl0 = vtbl2_u8(tbl_l[0], vand_u8(vl0, loset));
  245 + rl1 = vtbl2_u8(tbl_l[0], vand_u8(vl1, loset));
  246 + rh0 = vtbl2_u8(tbl_h[0], vand_u8(vl0, loset));
  247 + rh1 = vtbl2_u8(tbl_h[0], vand_u8(vl1, loset));
  248 + rl0 = veor_u8(rl0, vtbl2_u8(tbl_l[2], vand_u8(vh0, loset)));
  249 + rl1 = veor_u8(rl1, vtbl2_u8(tbl_l[2], vand_u8(vh1, loset)));
  250 + rh0 = veor_u8(rh0, vtbl2_u8(tbl_h[2], vand_u8(vh0, loset)));
  251 + rh1 = veor_u8(rh1, vtbl2_u8(tbl_h[2], vand_u8(vh1, loset)));
248 252
249 vh0 = vshr_n_u8(vh0, 4); 253 vh0 = vshr_n_u8(vh0, 4);
250 vh1 = vshr_n_u8(vh1, 4); 254 vh1 = vshr_n_u8(vh1, 4);
251 vl0 = vshr_n_u8(vl0, 4); 255 vl0 = vshr_n_u8(vl0, 4);
252 vl1 = vshr_n_u8(vl1, 4); 256 vl1 = vshr_n_u8(vl1, 4);
253 257
254 - r0 = veor_u8(r0, vtbl2_u8(tbl_l[1], vh0));  
255 - r1 = veor_u8(r1, vtbl2_u8(tbl_h[1], vh1));  
256 - r2 = veor_u8(r2, vtbl2_u8(tbl_l[3], vl0));  
257 - r3 = veor_u8(r3, vtbl2_u8(tbl_h[3], vl1)); 258 + rl0 = veor_u8(rl0, vtbl2_u8(tbl_l[1], vl0));
  259 + rl1 = veor_u8(rl1, vtbl2_u8(tbl_l[1], vl1));
  260 + rh0 = veor_u8(rh0, vtbl2_u8(tbl_h[1], vl0));
  261 + rh1 = veor_u8(rh1, vtbl2_u8(tbl_h[1], vl1));
  262 + rl0 = veor_u8(rl0, vtbl2_u8(tbl_l[3], vh0));
  263 + rl1 = veor_u8(rl1, vtbl2_u8(tbl_l[3], vh1));
  264 + rh0 = veor_u8(rh0, vtbl2_u8(tbl_h[3], vh0));
  265 + rh1 = veor_u8(rh1, vtbl2_u8(tbl_h[3], vh1));
258 266
259 if (xor) { 267 if (xor) {
260 vh0 = vld1_u8(dst); 268 vh0 = vld1_u8(dst);
261 vh1 = vld1_u8(dst + 8); 269 vh1 = vld1_u8(dst + 8);
262 vl0 = vld1_u8(dst + 16); 270 vl0 = vld1_u8(dst + 16);
263 vl1 = vld1_u8(dst + 24); 271 vl1 = vld1_u8(dst + 24);
264 - r0 = veor_u8(r0, vh0);  
265 - r1 = veor_u8(r1, vh1);  
266 - r2 = veor_u8(r2, vl0);  
267 - r3 = veor_u8(r3, vl1); 272 + rh0 = veor_u8(rh0, vh0);
  273 + rh1 = veor_u8(rh1, vh1);
  274 + rl0 = veor_u8(rl0, vl0);
  275 + rl1 = veor_u8(rl1, vl1);
268 } 276 }
269 - vst1_u8(dst, r0);  
270 - vst1_u8(dst + 8, r1);  
271 - vst1_u8(dst + 16, r2);  
272 - vst1_u8(dst + 24, r3); 277 + vst1_u8(dst, rh0);
  278 + vst1_u8(dst + 8, rh1);
  279 + vst1_u8(dst + 16, rl0);
  280 + vst1_u8(dst + 24, rl1);
273 281
274 src += 32; 282 src += 32;
275 dst += 32; 283 dst += 32;