Commit 6fdd8bc3d32cb2f7fa55d2de9dc7cc5bb2f885aa

Authored by Janne Grunau
1 parent 370c88b9
Exists in master and in 2 other branches v2, v3

arm: NEON optimisations for gf_w64

Optimisations for 4,64 split table region multiplications. Only used on
ARMv8-A since it is not faster on ARMv7-A.
include/gf_w64.h 0 → 100644
... ... @@ -0,0 +1,50 @@
  1 +/*
  2 + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
  3 + * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
  4 + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
  5 + *
  6 + * gf_w64.h
  7 + *
  8 + * Defines and data structures for 64-bit Galois fields
  9 + */
  10 +
  11 +#ifndef GF_COMPLETE_GF_W64_H
  12 +#define GF_COMPLETE_GF_W64_H
  13 +
  14 +#include <stdint.h>
  15 +
  16 +#define GF_FIELD_WIDTH (64)
  17 +#define GF_FIRST_BIT (1ULL << 63)
  18 +
  19 +#define GF_BASE_FIELD_WIDTH (32)
  20 +#define GF_BASE_FIELD_SIZE (1ULL << GF_BASE_FIELD_WIDTH)
  21 +#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1
  22 +
  23 +struct gf_w64_group_data {
  24 + uint64_t *reduce;
  25 + uint64_t *shift;
  26 + uint64_t *memory;
  27 +};
  28 +
  29 +struct gf_split_4_64_lazy_data {
  30 + uint64_t tables[16][16];
  31 + uint64_t last_value;
  32 +};
  33 +
  34 +struct gf_split_8_64_lazy_data {
  35 + uint64_t tables[8][(1<<8)];
  36 + uint64_t last_value;
  37 +};
  38 +
  39 +struct gf_split_16_64_lazy_data {
  40 + uint64_t tables[4][(1<<16)];
  41 + uint64_t last_value;
  42 +};
  43 +
  44 +struct gf_split_8_8_data {
  45 + uint64_t tables[15][256][256];
  46 +};
  47 +
  48 +void gf_w64_neon_split_init(gf_t *gf);
  49 +
  50 +#endif /* GF_COMPLETE_GF_W64_H */
... ...
src/Makefile.am
... ... @@ -14,7 +14,8 @@ if HAVE_NEON
14 14 libgf_complete_la_SOURCES += neon/gf_w4_neon.c \
15 15 neon/gf_w8_neon.c \
16 16 neon/gf_w16_neon.c \
17   - neon/gf_w32_neon.c
  17 + neon/gf_w32_neon.c \
  18 + neon/gf_w64_neon.c
18 19 endif
19 20  
20 21 libgf_complete_la_LDFLAGS = -version-info 1:0:0
... ...
src/gf_w64.c
... ... @@ -11,38 +11,7 @@
11 11 #include "gf_int.h"
12 12 #include <stdio.h>
13 13 #include <stdlib.h>
14   -
15   -#define GF_FIELD_WIDTH (64)
16   -#define GF_FIRST_BIT (1ULL << 63)
17   -
18   -#define GF_BASE_FIELD_WIDTH (32)
19   -#define GF_BASE_FIELD_SIZE (1ULL << GF_BASE_FIELD_WIDTH)
20   -#define GF_BASE_FIELD_GROUP_SIZE GF_BASE_FIELD_SIZE-1
21   -
22   -struct gf_w64_group_data {
23   - uint64_t *reduce;
24   - uint64_t *shift;
25   - uint64_t *memory;
26   -};
27   -
28   -struct gf_split_4_64_lazy_data {
29   - uint64_t tables[16][16];
30   - uint64_t last_value;
31   -};
32   -
33   -struct gf_split_8_64_lazy_data {
34   - uint64_t tables[8][(1<<8)];
35   - uint64_t last_value;
36   -};
37   -
38   -struct gf_split_16_64_lazy_data {
39   - uint64_t tables[4][(1<<16)];
40   - uint64_t last_value;
41   -};
42   -
43   -struct gf_split_8_8_data {
44   - uint64_t tables[15][256][256];
45   -};
  14 +#include "gf_w64.h"
46 15  
47 16 static
48 17 inline
... ... @@ -2027,11 +1996,15 @@ int gf_w64_split_init(gf_t *gf)
2027 1996 /* Allen: set region pointers for default mult type. Single pointers are
2028 1997 * taken care of above (explicitly for sse, implicitly for no sse). */
2029 1998  
2030   -#ifdef INTEL_SSE4
  1999 +#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
2031 2000 if (h->mult_type == GF_MULT_DEFAULT) {
2032 2001 d4 = (struct gf_split_4_64_lazy_data *) h->private;
2033 2002 d4->last_value = 0;
  2003 +#if defined(INTEL_SSE4)
2034 2004 gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region;
  2005 +#elif defined(ARCH_AARCH64)
  2006 + gf_w64_neon_split_init(gf);
  2007 +#endif
2035 2008 }
2036 2009 #else
2037 2010 if (h->mult_type == GF_MULT_DEFAULT) {
... ... @@ -2050,17 +2023,23 @@ int gf_w64_split_init(gf_t *gf)
2050 2023 {
2051 2024 #ifdef INTEL_SSSE3
2052 2025 gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_altmap_multiply_region;
  2026 + #elif defined(ARCH_AARCH64)
  2027 + gf_w64_neon_split_init(gf);
2053 2028 #else
2054 2029 return 0;
2055 2030 #endif
2056 2031 }
2057 2032 else //no altmap
2058 2033 {
2059   - #ifdef INTEL_SSE4
  2034 + #if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
2060 2035 if(h->region_type & GF_REGION_NOSIMD)
2061 2036 gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region;
2062 2037 else
2063   - gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region;
  2038 + #if defined(INTEL_SSE4)
  2039 + gf->multiply_region.w64 = gf_w64_split_4_64_lazy_sse_multiply_region;
  2040 + #elif defined(ARCH_AARCH64)
  2041 + gf_w64_neon_split_init(gf);
  2042 + #endif
2064 2043 #else
2065 2044 gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region;
2066 2045 if(h->region_type & GF_REGION_SIMD)
... ... @@ -2134,7 +2113,7 @@ int gf_w64_scratch_size(int mult_type, int region_type, int divide_type, int arg
2134 2113 /* Allen: set the *local* arg1 and arg2, just for scratch size purposes,
2135 2114 * then fall through to split table scratch size code. */
2136 2115  
2137   -#ifdef INTEL_SSE4
  2116 +#if defined(INTEL_SSE4) || defined(ARCH_AARCH64)
2138 2117 arg1 = 64;
2139 2118 arg2 = 4;
2140 2119 #else
... ...
src/neon/gf_w64_neon.c 0 → 100644
... ... @@ -0,0 +1,333 @@
  1 +/*
  2 + * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
  3 + * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
  4 + * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
  5 + *
  6 + * Copyright (c) 2014: Janne Grunau <j@jannau.net>
  7 + *
  8 + * Redistribution and use in source and binary forms, with or without
  9 + * modification, are permitted provided that the following conditions
  10 + * are met:
  11 + *
  12 + * - Redistributions of source code must retain the above copyright
  13 + * notice, this list of conditions and the following disclaimer.
  14 + *
  15 + * - Redistributions in binary form must reproduce the above copyright
  16 + * notice, this list of conditions and the following disclaimer in
  17 + * the documentation and/or other materials provided with the
  18 + * distribution.
  19 + *
  20 + * - Neither the name of the University of Tennessee nor the names of its
  21 + * contributors may be used to endorse or promote products derived
  22 + * from this software without specific prior written permission.
  23 + *
  24 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25 + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  27 + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  28 + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  29 + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  30 + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  31 + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  32 + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  33 + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  34 + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  35 + * POSSIBILITY OF SUCH DAMAGE.
  36 + *
  37 + * gf_w64_neon.c
  38 + *
  39 + * Neon routines for 64-bit Galois fields
  40 + *
  41 + */
  42 +
  43 +#include "gf_int.h"
  44 +#include <stdio.h>
  45 +#include <stdlib.h>
  46 +#include "gf_w64.h"
  47 +
  48 +
  49 +#ifndef ARCH_AARCH64
  50 +#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
  51 + vtbl2_u8(tbl, vget_high_u8(v)))
  52 +#endif
  53 +
  54 +static
  55 +inline
  56 +void
  57 +neon_w64_split_4_lazy_altmap_multiply_region(gf_t *gf, uint64_t *src,
  58 + uint64_t *dst, uint64_t *d_end,
  59 + uint64_t val, int xor)
  60 +{
  61 + unsigned i, j, k;
  62 + uint8_t btable[16];
  63 +#ifdef ARCH_AARCH64
  64 + uint8x16_t tables[16][8];
  65 +#else
  66 + uint8x8x2_t tables[16][8];
  67 +#endif
  68 + uint8x16_t p[8], mask1, si;
  69 +
  70 + gf_internal_t *h = (gf_internal_t *) gf->scratch;
  71 + struct gf_split_4_64_lazy_data *ld = (struct gf_split_4_64_lazy_data *) h->private;
  72 +
  73 + for (i = 0; i < 16; i++) {
  74 + for (j = 0; j < 8; j++) {
  75 + for (k = 0; k < 16; k++) {
  76 + btable[k] = (uint8_t) ld->tables[i][k];
  77 + ld->tables[i][k] >>= 8;
  78 + }
  79 +#ifdef ARCH_AARCH64
  80 + tables[i][j] = vld1q_u8(btable);
  81 +#else
  82 + tables[i][j].val[0] = vld1_u8(btable);
  83 + tables[i][j].val[1] = vld1_u8(btable + 8);
  84 +#endif
  85 + }
  86 + }
  87 +
  88 + mask1 = vdupq_n_u8(0xf);
  89 +
  90 + while (dst < d_end) {
  91 +
  92 + if (xor) {
  93 + for (i = 0; i < 8; i++)
  94 + p[i] = vld1q_u8((uint8_t *) (dst + i * 2));
  95 + } else {
  96 + for (i = 0; i < 8; i++)
  97 + p[i] = vdupq_n_u8(0);
  98 + }
  99 +
  100 + i = 0;
  101 + for (k = 0; k < 8; k++) {
  102 + uint8x16_t v0 = vld1q_u8((uint8_t *) src);
  103 + src += 2;
  104 +
  105 + si = vandq_u8(v0, mask1);
  106 + for (j = 0; j < 8; j++) {
  107 + p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
  108 + }
  109 + i++;
  110 + si = vshrq_n_u8(v0, 4);
  111 + for (j = 0; j < 8; j++) {
  112 + p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
  113 + }
  114 + i++;
  115 +
  116 + }
  117 + for (i = 0; i < 8; i++) {
  118 + vst1q_u8((uint8_t *) dst, p[i]);
  119 + dst += 2;
  120 + }
  121 + }
  122 +}
  123 +
  124 +static
  125 +inline
  126 +void
  127 +neon_w64_split_4_lazy_multiply_region(gf_t *gf, uint64_t *src, uint64_t *dst,
  128 + uint64_t *d_end, uint64_t val, int xor)
  129 +{
  130 + unsigned i, j, k;
  131 + uint8_t btable[16];
  132 +#ifdef ARCH_AARCH64
  133 + uint8x16_t tables[16][8];
  134 +#else
  135 + uint8x8x2_t tables[16][8];
  136 +#endif
  137 + uint8x16_t p[8], mask1, si;
  138 + uint64x2_t st[8];
  139 + uint32x4x2_t s32[4];
  140 + uint16x8x2_t s16[4];
  141 + uint8x16x2_t s8[4];
  142 +
  143 + gf_internal_t *h = (gf_internal_t *) gf->scratch;
  144 + struct gf_split_4_64_lazy_data *ld = (struct gf_split_4_64_lazy_data *) h->private;
  145 +
  146 + for (i = 0; i < 16; i++) {
  147 + for (j = 0; j < 8; j++) {
  148 + for (k = 0; k < 16; k++) {
  149 + btable[k] = (uint8_t) ld->tables[i][k];
  150 + ld->tables[i][k] >>= 8;
  151 + }
  152 +#ifdef ARCH_AARCH64
  153 + tables[i][j] = vld1q_u8(btable);
  154 +#else
  155 + tables[i][j].val[0] = vld1_u8(btable);
  156 + tables[i][j].val[1] = vld1_u8(btable + 8);
  157 +#endif
  158 + }
  159 + }
  160 +
  161 + mask1 = vdupq_n_u8(0xf);
  162 +
  163 + while (dst < d_end) {
  164 +
  165 + for (k = 0; k < 8; k++) {
  166 + st[k] = vld1q_u64(src);
  167 + src += 2;
  168 + p[k] = vdupq_n_u8(0);
  169 + }
  170 +
  171 + s32[0] = vuzpq_u32(vreinterpretq_u32_u64(st[0]),
  172 + vreinterpretq_u32_u64(st[1]));
  173 + s32[1] = vuzpq_u32(vreinterpretq_u32_u64(st[2]),
  174 + vreinterpretq_u32_u64(st[3]));
  175 + s32[2] = vuzpq_u32(vreinterpretq_u32_u64(st[4]),
  176 + vreinterpretq_u32_u64(st[5]));
  177 + s32[3] = vuzpq_u32(vreinterpretq_u32_u64(st[6]),
  178 + vreinterpretq_u32_u64(st[7]));
  179 +
  180 + s16[0] = vuzpq_u16(vreinterpretq_u16_u32(s32[0].val[0]),
  181 + vreinterpretq_u16_u32(s32[1].val[0]));
  182 + s16[1] = vuzpq_u16(vreinterpretq_u16_u32(s32[2].val[0]),
  183 + vreinterpretq_u16_u32(s32[3].val[0]));
  184 + s16[2] = vuzpq_u16(vreinterpretq_u16_u32(s32[0].val[1]),
  185 + vreinterpretq_u16_u32(s32[1].val[1]));
  186 + s16[3] = vuzpq_u16(vreinterpretq_u16_u32(s32[2].val[1]),
  187 + vreinterpretq_u16_u32(s32[3].val[1]));
  188 +
  189 + s8[0] = vuzpq_u8(vreinterpretq_u8_u16(s16[0].val[0]),
  190 + vreinterpretq_u8_u16(s16[1].val[0]));
  191 + s8[1] = vuzpq_u8(vreinterpretq_u8_u16(s16[0].val[1]),
  192 + vreinterpretq_u8_u16(s16[1].val[1]));
  193 + s8[2] = vuzpq_u8(vreinterpretq_u8_u16(s16[2].val[0]),
  194 + vreinterpretq_u8_u16(s16[3].val[0]));
  195 + s8[3] = vuzpq_u8(vreinterpretq_u8_u16(s16[2].val[1]),
  196 + vreinterpretq_u8_u16(s16[3].val[1]));
  197 +
  198 + i = 0;
  199 + for (k = 0; k < 8; k++) {
  200 + si = vandq_u8(s8[k >> 1].val[k & 1], mask1);
  201 + for (j = 0; j < 8; j++) {
  202 + p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
  203 + }
  204 + i++;
  205 + si = vshrq_n_u8(s8[k >> 1].val[k & 1], 4);
  206 + for (j = 0; j < 8; j++) {
  207 + p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
  208 + }
  209 + i++;
  210 + }
  211 +
  212 + s8[0] = vzipq_u8(p[0], p[1]);
  213 + s8[1] = vzipq_u8(p[2], p[3]);
  214 + s8[2] = vzipq_u8(p[4], p[5]);
  215 + s8[3] = vzipq_u8(p[6], p[7]);
  216 +
  217 + s16[0] = vzipq_u16(vreinterpretq_u16_u8(s8[0].val[0]),
  218 + vreinterpretq_u16_u8(s8[1].val[0]));
  219 + s16[1] = vzipq_u16(vreinterpretq_u16_u8(s8[2].val[0]),
  220 + vreinterpretq_u16_u8(s8[3].val[0]));
  221 + s16[2] = vzipq_u16(vreinterpretq_u16_u8(s8[0].val[1]),
  222 + vreinterpretq_u16_u8(s8[1].val[1]));
  223 + s16[3] = vzipq_u16(vreinterpretq_u16_u8(s8[2].val[1]),
  224 + vreinterpretq_u16_u8(s8[3].val[1]));
  225 +
  226 + s32[0] = vzipq_u32(vreinterpretq_u32_u16(s16[0].val[0]),
  227 + vreinterpretq_u32_u16(s16[1].val[0]));
  228 + s32[1] = vzipq_u32(vreinterpretq_u32_u16(s16[0].val[1]),
  229 + vreinterpretq_u32_u16(s16[1].val[1]));
  230 + s32[2] = vzipq_u32(vreinterpretq_u32_u16(s16[2].val[0]),
  231 + vreinterpretq_u32_u16(s16[3].val[0]));
  232 + s32[3] = vzipq_u32(vreinterpretq_u32_u16(s16[2].val[1]),
  233 + vreinterpretq_u32_u16(s16[3].val[1]));
  234 +
  235 + for (k = 0; k < 8; k ++) {
  236 + st[k] = vreinterpretq_u64_u32(s32[k >> 1].val[k & 1]);
  237 + }
  238 +
  239 + if (xor) {
  240 + for (i = 0; i < 8; i++) {
  241 + uint64x2_t t1 = vld1q_u64(dst);
  242 + vst1q_u64(dst, veorq_u64(st[i], t1));
  243 + dst += 2;
  244 + }
  245 + } else {
  246 + for (i = 0; i < 8; i++) {
  247 + vst1q_u64(dst, st[i]);
  248 + dst += 2;
  249 + }
  250 + }
  251 +
  252 + }
  253 +}
  254 +
  255 +static
  256 +void
  257 +gf_w64_neon_split_4_lazy_multiply_region(gf_t *gf, void *src, void *dest,
  258 + uint64_t val, int bytes, int xor,
  259 + int altmap)
  260 +{
  261 + gf_internal_t *h;
  262 + int i, j, k;
  263 + uint64_t pp, v, *s64, *d64, *top;
  264 + struct gf_split_4_64_lazy_data *ld;
  265 + gf_region_data rd;
  266 +
  267 + if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  268 + if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
  269 +
  270 + gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128);
  271 + gf_do_initial_region_alignment(&rd);
  272 +
  273 + s64 = (uint64_t *) rd.s_start;
  274 + d64 = (uint64_t *) rd.d_start;
  275 + top = (uint64_t *) rd.d_top;
  276 +
  277 + h = (gf_internal_t *) gf->scratch;
  278 + pp = h->prim_poly;
  279 + ld = (struct gf_split_4_64_lazy_data *) h->private;
  280 +
  281 + v = val;
  282 + for (i = 0; i < 16; i++) {
  283 + ld->tables[i][0] = 0;
  284 + for (j = 1; j < 16; j <<= 1) {
  285 + for (k = 0; k < j; k++) {
  286 + ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
  287 + }
  288 + v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
  289 + }
  290 + }
  291 +
  292 + if (altmap) {
  293 + if (xor)
  294 + neon_w64_split_4_lazy_altmap_multiply_region(gf, s64, d64, top, val, 1);
  295 + else
  296 + neon_w64_split_4_lazy_altmap_multiply_region(gf, s64, d64, top, val, 0);
  297 + } else {
  298 + if (xor)
  299 + neon_w64_split_4_lazy_multiply_region(gf, s64, d64, top, val, 1);
  300 + else
  301 + neon_w64_split_4_lazy_multiply_region(gf, s64, d64, top, val, 0);
  302 + }
  303 +
  304 + gf_do_final_region_alignment(&rd);
  305 +}
  306 +
  307 +static
  308 +void
  309 +gf_w64_split_4_64_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest,
  310 + uint64_t val, int bytes, int xor)
  311 +{
  312 + gf_w64_neon_split_4_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0);
  313 +}
  314 +
  315 +static
  316 +void
  317 +gf_w64_split_4_64_lazy_altmap_multiply_region_neon(gf_t *gf, void *src,
  318 + void *dest, uint64_t val,
  319 + int bytes, int xor)
  320 +{
  321 + gf_w64_neon_split_4_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1);
  322 +}
  323 +
  324 +void gf_w64_neon_split_init(gf_t *gf)
  325 +{
  326 + gf_internal_t *h = (gf_internal_t *) gf->scratch;
  327 +
  328 + if (h->region_type & GF_REGION_ALTMAP)
  329 + gf->multiply_region.w64 = gf_w64_split_4_64_lazy_altmap_multiply_region_neon;
  330 + else
  331 + gf->multiply_region.w64 = gf_w64_split_4_64_lazy_multiply_region_neon;
  332 +
  333 +}
... ...