Commit 3a1be40ea87ecc81e737aee6819ff96a6721f011

Authored by Janne Grunau
1 parent 36e75c3e
Exists in master and in 3 other branches: v2, v3, wip-18092

arm: NEON optimisations for XOR in gf_multby_one

Showing 1 changed file with 35 additions and 0 deletions.
src/gf.c
... ... @@ -954,7 +954,42 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor)
954 954 }
955 955 return;
956 956 #endif
  957 +#if defined(ARM_NEON)
  958 + s8 = (uint8_t *) src;
  959 + d8 = (uint8_t *) dest;
957 960  
  961 + if (uls % 16 == uld % 16) {
  962 + gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
  963 + while (s8 != rd.s_start) {
  964 + *d8 ^= *s8;
  965 + s8++;
  966 + d8++;
  967 + }
  968 + while (s8 < (uint8_t *) rd.s_top) {
  969 + uint8x16_t vs = vld1q_u8 (s8);
  970 + uint8x16_t vd = vld1q_u8 (d8);
  971 + uint8x16_t vr = veorq_u8 (vs, vd);
  972 + vst1q_u8 (d8, vr);
  973 + s8 += 16;
  974 + d8 += 16;
  975 + }
  976 + } else {
  977 + while (s8 + 15 < (uint8_t *) src + bytes) {
  978 + uint8x16_t vs = vld1q_u8 (s8);
  979 + uint8x16_t vd = vld1q_u8 (d8);
  980 + uint8x16_t vr = veorq_u8 (vs, vd);
  981 + vst1q_u8 (d8, vr);
  982 + s8 += 16;
  983 + d8 += 16;
  984 + }
  985 + }
  986 + while (s8 < (uint8_t *) src + bytes) {
  987 + *d8 ^= *s8;
  988 + s8++;
  989 + d8++;
  990 + }
  991 + return;
  992 +#endif
958 993 if (uls % 8 != uld % 8) {
959 994 gf_unaligned_xor(src, dest, bytes);
960 995 return;
... ...