Commit 05057e5635e5ef7fb5be3156f477866cce98fbdb
1 parent 438283c1
Exists in master and in 1 other branch
Eliminate unnecessary VTRNs in SPLIT(16,4) NEON implementation
Also makes the ARMv8 and older ARMv7 versions consistent in terms of processing width (both paths now handle 16 values per loop iteration).
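The essence of the patch: the old AArch64 loop loaded 16 uint16_t values with two vld1q_u16 loads and used a VTRN (vtrnq_u8) to separate low and high bytes, then a second VTRN to re-interleave them before storing. NEON's de-interleaving structure loads and stores perform that separation as part of the memory access, so vld2q_u8/vst2q_u8 replace the plain loads, the plain stores, and both transposes. A minimal sketch of the two access patterns, with invented function names (not code from the patch):

#include <arm_neon.h>
#include <stdint.h>

/* Old pattern: plain loads plus an explicit transpose.
 * val[0] ends up holding the low bytes and val[1] the high bytes of
 * 16 little-endian uint16_t values (in a VTRN-permuted lane order). */
static inline uint8x16x2_t split_bytes_vtrn(const uint16_t *src)
{
    uint16x8_t a0 = vld1q_u16(src);
    uint16x8_t a1 = vld1q_u16(src + 8);
    return vtrnq_u8(vreinterpretq_u8_u16(a0), vreinterpretq_u8_u16(a1));
}

/* New pattern: a de-interleaving structure load gives the same separation
 * directly: val[0] = bytes 0,2,4,... (low bytes), val[1] = bytes 1,3,5,... */
static inline uint8x16x2_t split_bytes_vld2(const uint16_t *src)
{
    return vld2q_u8((const uint8_t *)src);
}

The two patterns leave the lanes in different orders, but every operation in the loop is lane-wise and the matching inverse step (a second VTRN, or vst2q_u8) restores the original byte layout, so the data written to dst is identical either way.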
Showing 1 changed file with 22 additions and 79 deletions
src/neon/gf_w16_neon.c
@@ -46,7 +46,11 @@
 #include <stdlib.h>
 #include "gf_w16.h"
 
-#ifdef ARCH_AARCH64
+#ifndef ARCH_AARCH64
+#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
+                                       vtbl2_u8(tbl, vget_high_u8(v)))
+#endif
+
 static
 inline
 void
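32-bit ARM NEON has no vqtbl1q_u8, so the patch emulates it: each half of the 128-bit index vector is looked up with vtbl2_u8 in a 16-byte table held as a uint8x8x2_t, and the halves are recombined. A standalone sketch of that equivalence, using the compiler's __aarch64__ test rather than this file's ARCH_AARCH64 macro:

#include <arm_neon.h>

/* 16-lane lookup into a 16-byte table; indices are assumed to be in 0..15,
 * which the callers in this file guarantee by masking with 0xf or shifting
 * right by 4 first. */
#ifdef __aarch64__
static inline uint8x16_t lookup16(uint8x16_t table, uint8x16_t idx)
{
    return vqtbl1q_u8(table, idx);               /* a single TBL instruction */
}
#else
static inline uint8x16_t lookup16(uint8x8x2_t table, uint8x16_t idx)
{
    /* ARMv7 VTBL reads 64-bit d registers, so look up each half of the
     * index vector separately and glue the results back together. */
    return vcombine_u8(vtbl2_u8(table, vget_low_u8(idx)),
                       vtbl2_u8(table, vget_high_u8(idx)));
}
#endif

Because the table argument has a different type on the two architectures (uint8x16_t vs. uint8x8x2_t), the table-loading code in the next hunk is also split by #ifdef, while the loop body itself can stay shared.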
@@ -56,23 +60,29 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
 {
   unsigned i;
   uint8_t *high = tbl + 4 * 16;
-  uint16x8_t va0, va1, r0, r1;
   uint8x16_t loset, rl, rh;
   uint8x16x2_t va;
 
+#ifdef ARCH_AARCH64
   uint8x16_t tbl_h[4], tbl_l[4];
   for (i = 0; i < 4; i++) {
     tbl_l[i] = vld1q_u8(tbl + i*16);
     tbl_h[i] = vld1q_u8(high + i*16);
   }
+#else
+  uint8x8x2_t tbl_h[4], tbl_l[4];
+  for (i = 0; i < 4; i++) {
+    tbl_l[i].val[0] = vld1_u8(tbl + i*16);
+    tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
+    tbl_h[i].val[0] = vld1_u8(high + i*16);
+    tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
+  }
+#endif
 
   loset = vdupq_n_u8(0xf);
 
   while (dst < d_end) {
-    va0 = vld1q_u16(src);
-    va1 = vld1q_u16(src + 8);
-
-    va = vtrnq_u8(vreinterpretq_u8_u16(va0), vreinterpretq_u8_u16(va1));
+    va = vld2q_u8((uint8_t*)src);
 
     rl = vqtbl1q_u8(tbl_l[0], vandq_u8(va.val[0], loset));
     rh = vqtbl1q_u8(tbl_h[0], vandq_u8(va.val[0], loset));
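For context, the lookups in this loop implement the SPLIT(16,4) scheme: each 16-bit value is split into four nibbles, each nibble indexes a 16-entry table holding one byte of that nibble's partial product with val in GF(2^16), and the four partial products are XORed together. A scalar reference for a single word, assuming the table layout the vector code implies (four low-byte tables in tbl, the matching high-byte tables at tbl + 4*16):

#include <stdint.h>

/* Assumed layout: tbl[i*16 + n]  = low  byte of ((n << (4*i)) * val) in GF(2^16),
 *                 high[i*16 + n] = high byte of the same product,
 * where high = tbl + 4*16 and val is baked into the tables by the caller. */
static uint16_t split_4_16_multiply_one(uint16_t a, const uint8_t *tbl)
{
    const uint8_t *high = tbl + 4 * 16;
    uint8_t rl = 0, rh = 0;

    for (int i = 0; i < 4; i++) {
        unsigned nibble = (a >> (4 * i)) & 0xf;
        rl ^= tbl[i * 16 + nibble];    /* low byte of partial product  */
        rh ^= high[i * 16 + nibble];   /* high byte of partial product */
    }
    return (uint16_t)(rl | (rh << 8));
}

The vector loop does this for 16 words at once, keeping all low result bytes in rl and all high result bytes in rh, which is exactly the de-interleaved form that vld2q_u8 produces and vst2q_u8 consumes.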
@@ -84,88 +94,21 @@ neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
 
     rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[1], va.val[0]));
     rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[1], va.val[0]));
-    rl = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
-    rh = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
-
-    va = vtrnq_u8(rl, rh);
-    r0 = vreinterpretq_u16_u8(va.val[0]);
-    r1 = vreinterpretq_u16_u8(va.val[1]);
+    va.val[0] = veorq_u8(rl, vqtbl1q_u8(tbl_l[3], va.val[1]));
+    va.val[1] = veorq_u8(rh, vqtbl1q_u8(tbl_h[3], va.val[1]));
 
     if (xor) {
-      va0 = vld1q_u16(dst);
-      va1 = vld1q_u16(dst + 8);
-      r0 = veorq_u16(r0, va0);
-      r1 = veorq_u16(r1, va1);
+      uint8x16x2_t vb = vld2q_u8((uint8_t*)dst);
+      va.val[0] = veorq_u8(va.val[0], vb.val[0]);
+      va.val[1] = veorq_u8(va.val[1], vb.val[1]);
     }
-    vst1q_u16(dst, r0);
-    vst1q_u16(dst + 8, r1);
+    vst2q_u8((uint8_t*)dst, va);
 
     src += 16;
     dst += 16;
   }
 }
 
-#else /* ARCH_AARCH64 */
-
-static
-inline
-void
-neon_w16_split_4_multiply_region(gf_t *gf, uint16_t *src, uint16_t *dst,
-                                 uint16_t *d_end, uint8_t *tbl,
-                                 gf_val_32_t val, int xor)
-{
-  unsigned i;
-  uint8_t *high = tbl + 4 * 16;
-  uint16x8_t va, r;
-  uint8x8_t loset, vb, vc, rl, rh;
-
-  uint8x8x2_t tbl_h[4], tbl_l[4];
-  for (i = 0; i < 4; i++) {
-    tbl_l[i].val[0] = vld1_u8(tbl + i*16);
-    tbl_l[i].val[1] = vld1_u8(tbl + i*16 + 8);
-    tbl_h[i].val[0] = vld1_u8(high + i*16);
-    tbl_h[i].val[1] = vld1_u8(high + i*16 + 8);
-  }
-
-  loset = vdup_n_u8(0xf);
-
-  while (dst < d_end) {
-    va = vld1q_u16(src);
-
-    vb = vmovn_u16(va);
-    vc = vshrn_n_u16(va, 8);
-
-    rl = vtbl2_u8(tbl_l[0], vand_u8(vb, loset));
-    rh = vtbl2_u8(tbl_h[0], vand_u8(vb, loset));
-    vb = vshr_n_u8(vb, 4);
-    rl = veor_u8(rl, vtbl2_u8(tbl_l[2], vand_u8(vc, loset)));
-    rh = veor_u8(rh, vtbl2_u8(tbl_h[2], vand_u8(vc, loset)));
-    vc = vshr_n_u8(vc, 4);
-    rl = veor_u8(rl, vtbl2_u8(tbl_l[1], vb));
-    rh = veor_u8(rh, vtbl2_u8(tbl_h[1], vb));
-    rl = veor_u8(rl, vtbl2_u8(tbl_l[3], vc));
-    rh = veor_u8(rh, vtbl2_u8(tbl_h[3], vc));
-
-    r = vmovl_u8(rl);
-    r = vorrq_u16(r, vshll_n_u8(rh, 8));
-
-    if (xor) {
-      va = vld1q_u16(dst);
-      r = veorq_u16(r, va);
-    }
-    vst1q_u16(dst, r);
-
-    src += 8;
-    dst += 8;
-  }
-}
-#endif /* ARCH_AARCH64 */
-
-#ifndef ARCH_AARCH64
-#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
-                                       vtbl2_u8(tbl, vget_high_u8(v)))
-#endif
-
 static
 inline
 void