Commit 05057e5635e5ef7fb5be3156f477866cce98fbdb

Authored by animetosho
1 parent 438283c1
Exists in master and in 1 other branch: v3

Eliminate unnecessary VTRNs in SPLIT(16,4) NEON implementation

Also makes the ARMv8 version consistent with the older one in terms of processing width
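
The gist of the change, as a standalone sketch (the helper names below are illustrative, not from the patched file): vld2q_u8 de-interleaves the low and high bytes of the 16-bit words at load time, giving the same low/high separation that the old vld1q_u16 + vtrnq_u8 sequence produced, only in a different lane order. Since everything in between is per-lane table lookups and XORs, and the matching vst2q_u8 re-interleaves on store, the explicit VTRN instructions can be dropped.

#include <arm_neon.h>
#include <stdint.h>

/* Old approach: two plain loads, then a byte transpose to split the
 * 16-bit words into low bytes (val[0]) and high bytes (val[1]). */
static inline uint8x16x2_t split_bytes_trn(const uint16_t *src)
{
  uint16x8_t va0 = vld1q_u16(src);
  uint16x8_t va1 = vld1q_u16(src + 8);
  return vtrnq_u8(vreinterpretq_u8_u16(va0), vreinterpretq_u8_u16(va1));
}

/* New approach: a single de-interleaving load yields the same low/high
 * split (in linear lane order), so no separate VTRN is needed. */
static inline uint8x16x2_t split_bytes_ld2(const uint16_t *src)
{
  return vld2q_u8((const uint8_t *)src);
}

The store side is symmetric: vst2q_u8 re-interleaves val[0]/val[1] back into 16-bit words, which is why the trailing vtrnq_u8 before the two vst1q_u16 stores disappears as well.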
Showing 1 changed file with 22 additions and 79 deletions
src/neon/gf_w16_neon.c
@@ -46,7 +46,11 @@
 #include <stdlib.h>
 #include "gf_w16.h"
 
-#ifdef ARCH_AARCH64
+#ifndef ARCH_AARCH64
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
+                                       vtbl2_u8(tbl, vget_high_u8(v)))
+#endif
+
 static
 inline
 void
@@ -56,23 +60,29 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
 {
   unsigned i;
   uint8_t *high = tbl + 4 * 16;
-  uint16x8_t va0, va1, r0, r1;
   uint8x16_t loset, rl, rh;
   uint8x16x2_t va;
 
+#ifdef ARCH_AARCH64
   uint8x16_t tbl_h[4], tbl_l[4];
   for (i = 0; i < 4; i++) {
     tbl_l[i] = vld1q_u8(tbl + i*16);
     tbl_h[i] = vld1q_u8(high + i*16);
   }
+#else
+  uint8x8x2_t tbl_h[4], tbl_l[4];
+  for (i = 0; i < 4; i++) {
+    tbl_l[i].val[0] = vld1_u8(tbl + i*16);
+    tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
+    tbl_h[i].val[0] = vld1_u8(high + i*16);
+    tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
+  }
+#endif
 
   loset = vdupq_n_u8(0xf);
 
   while (dst < d_end) {
-    va0 = vld1q_u16(src);
-    va1 = vld1q_u16(src + 8);
-
-    va = vtrnq_u8(vreinterpretq_u8_u16(va0), vreinterpretq_u8_u16(va1));
+    va = vld2q_u8((uint8_t*)src);
 
     rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
     rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
@@ -84,88 +94,21 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
 
     rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
     rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
-    rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
-    rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
-
-    va = vtrnq_u8(rl, rh);
-    r0 = vreinterpretq_u16_u8(va.val[0]);
-    r1 = vreinterpretq_u16_u8(va.val[1]);
+    va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
+    va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
 
     if (xor) {
-      va0 = vld1q_u16(dst);
-      va1 = vld1q_u16(dst + 8);
-      r0 = veorq_u16(r0, va0);
-      r1 = veorq_u16(r1, va1);
+      uint8x16x2_t vb = vld2q_u8((uint8_t*)dst);
+      va.val[0] = veorq_u8(va.val[0], vb.val[0]);
+      va.val[1] = veorq_u8(va.val[1], vb.val[1]);
     }
-    vst1q_u16(dst, r0);
-    vst1q_u16(dst + 8, r1);
+    vst2q_u8((uint8_t*)dst, va);
 
     src += 16;
     dst += 16;
   }
 }
 
-#else /* ARCH_AARCH64 */
-
-static
-inline
-void
-neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
-                                 uint16_t *d_end, uint8_t *tbl,
-                                 gf_val_32_t val, int xor)
-{
-  unsigned i;
-  uint8_t *high = tbl + 4 * 16;
-  uint16x8_t va, r;
-  uint8x8_t loset, vb, vc, rl, rh;
-
-  uint8x8x2_t tbl_h[4], tbl_l[4];
-  for (i = 0; i < 4; i++) {
-    tbl_l[i].val[0] = vld1_u8(tbl + i*16);
-    tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
-    tbl_h[i].val[0] = vld1_u8(high + i*16);
-    tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
-  }
-
-  loset = vdup_n_u8(0xf);
-
-  while (dst < d_end) {
-    va = vld1q_u16(src);
-
-    vb = vmovn_u16(va);
-    vc = vshrn_n_u16(va, 8);
-
-    rl = vtbl2_u8(tbl_l[0], vand_u8(vb, loset));
-    rh = vtbl2_u8(tbl_h[0], vand_u8(vb, loset));
-    vb = vshr_n_u8(vb, 4);
-    rl = veor_u8(rl, vtbl2_u8(tbl_l[2], vand_u8(vc, loset)));
-    rh = veor_u8(rh, vtbl2_u8(tbl_h[2], vand_u8(vc, loset)));
-    vc = vshr_n_u8(vc, 4);
-    rl = veor_u8(rl, vtbl2_u8(tbl_l[1], vb));
-    rh = veor_u8(rh, vtbl2_u8(tbl_h[1], vb));
-    rl = veor_u8(rl, vtbl2_u8(tbl_l[3], vc));
-    rh = veor_u8(rh, vtbl2_u8(tbl_h[3], vc));
-
-    r = vmovl_u8(rl);
-    r = vorrq_u16(r, vshll_n_u8(rh, 8));
-
-    if (xor) {
-      va = vld1q_u16(dst);
-      r = veorq_u16(r, va);
-    }
-    vst1q_u16(dst, r);
-
-    src += 8;
-    dst += 8;
-  }
-}
-#endif /* ARCH_AARCH64 */
-
-#ifndef ARCH_AARCH64
-#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
-                                       vtbl2_u8(tbl, vget_high_u8(v)))
-#endif
-
 static
 inline
 void