Commit 643743d0482ca09a9dfa57beed196f172a22a78e

Authored by animetosho
1 parent 05057e56
Exists in master and in 1 other branch v3

Move conditional outside loop for NEON SPLIT4 implementation

Testing `xor` once before the loop, rather than on every iteration, seems to improve performance a fair bit.
Showing 1 changed file with 29 additions and 6 deletions   Show diff stats
src/neon/gf_w16_neon.c
... ... @@ -81,8 +81,11 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
81 81  
82 82 loset = vdupq_n_u8(0xf);
83 83  
84   - while (dst < d_end) {
  84 + if (xor) {
  85 + uint8x16x2_t vb;
  86 + while (dst < d_end) {
85 87 va = vld2q_u8((uint8_t*)src);
  88 + vb = vld2q_u8((uint8_t*)dst);
86 89  
87 90 rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
88 91 rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
... ... @@ -97,15 +100,35 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
97 100 va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
98 101 va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
99 102  
100   - if (xor) {
101   - uint8x16x2_t vb = vld2q_u8((uint8_t*)dst);
102   - va.val[0] = veorq_u8(va.val[0], vb.val[0]);
103   - va.val[1] = veorq_u8(va.val[1], vb.val[1]);
104   - }
  103 + va.val[0] = veorq_u8(va.val[0], vb.val[0]);
  104 + va.val[1] = veorq_u8(va.val[1], vb.val[1]);
105 105 vst2q_u8((uint8_t*)dst, va);
106 106  
107 107 src += 16;
108 108 dst += 16;
  109 + }
  110 + } else {
  111 + while (dst < d_end) {
  112 + va = vld2q_u8((uint8_t*)src);
  113 +
  114 + rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
  115 + rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
  116 + rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[2], vandq_u8(va.val[1], loset)));
  117 + rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[2], vandq_u8(va.val[1], loset)));
  118 +
  119 + va.val[0] = vshrq_n_u8(va.val[0], 4);
  120 + va.val[1] = vshrq_n_u8(va.val[1], 4);
  121 +
  122 + rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
  123 + rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
  124 + va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
  125 + va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
  126 +
  127 + vst2q_u8((uint8_t*)dst, va);
  128 +
  129 + src += 16;
  130 + dst += 16;
  131 + }
109 132 }
110 133 }
111 134  
... ...