Commit 0e5c920fb69f2d962db1df045d1b71b9b012b902

Authored by Bassam Tabbara
1 parent ad110421
Exists in master and in 1 other branch: v3

gf_multby_one now checks runtime SIMD support

Showing 1 changed file with 65 additions and 63 deletions
src/gf.c
... ... @@ -912,9 +912,6 @@ static void gf_unaligned_xor(void *src, void *dest, int bytes);
912 912  
913 913 void gf_multby_one(void *src, void *dest, int bytes, int xor)
914 914 {
915   -#ifdef INTEL_SSE2
916   - __m128i ms, md;
917   -#endif
918 915 unsigned long uls, uld;
919 916 uint8_t *s8, *d8;
920 917 uint64_t *s64, *d64, *dtop64;
... ... @@ -929,84 +926,89 @@ void gf_multby_one(void *src, void *dest, int bytes, int xor)
929 926 uld = (unsigned long) dest;
930 927  
931 928 #ifdef INTEL_SSE2
932   - int abytes;
933   - s8 = (uint8_t *) src;
934   - d8 = (uint8_t *) dest;
935   - if (uls % 16 == uld % 16) {
936   - gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
937   - while (s8 != rd.s_start) {
938   - *d8 ^= *s8;
939   - d8++;
940   - s8++;
  929 + if (gf_cpu_supports_intel_sse2) {
  930 + __m128i ms, md;
  931 + int abytes;
  932 + s8 = (uint8_t *) src;
  933 + d8 = (uint8_t *) dest;
  934 + if (uls % 16 == uld % 16) {
  935 + gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
  936 + while (s8 != rd.s_start) {
  937 + *d8 ^= *s8;
  938 + d8++;
  939 + s8++;
  940 + }
  941 + while (s8 < (uint8_t *) rd.s_top) {
  942 + ms = _mm_load_si128 ((__m128i *)(s8));
  943 + md = _mm_load_si128 ((__m128i *)(d8));
  944 + md = _mm_xor_si128(md, ms);
  945 + _mm_store_si128((__m128i *)(d8), md);
  946 + s8 += 16;
  947 + d8 += 16;
  948 + }
  949 + while (s8 != (uint8_t *) src + bytes) {
  950 + *d8 ^= *s8;
  951 + d8++;
  952 + s8++;
  953 + }
  954 + return;
941 955 }
942   - while (s8 < (uint8_t *) rd.s_top) {
943   - ms = _mm_load_si128 ((__m128i *)(s8));
944   - md = _mm_load_si128 ((__m128i *)(d8));
  956 +
  957 + abytes = (bytes & 0xfffffff0);
  958 +
  959 + while (d8 < (uint8_t *) dest + abytes) {
  960 + ms = _mm_loadu_si128 ((__m128i *)(s8));
  961 + md = _mm_loadu_si128 ((__m128i *)(d8));
945 962 md = _mm_xor_si128(md, ms);
946   - _mm_store_si128((__m128i *)(d8), md);
  963 + _mm_storeu_si128((__m128i *)(d8), md);
947 964 s8 += 16;
948 965 d8 += 16;
949 966 }
950   - while (s8 != (uint8_t *) src + bytes) {
  967 + while (d8 != (uint8_t *) dest+bytes) {
951 968 *d8 ^= *s8;
952 969 d8++;
953 970 s8++;
954 971 }
955 972 return;
956 973 }
957   -
958   - abytes = (bytes & 0xfffffff0);
959   -
960   - while (d8 < (uint8_t *) dest + abytes) {
961   - ms = _mm_loadu_si128 ((__m128i *)(s8));
962   - md = _mm_loadu_si128 ((__m128i *)(d8));
963   - md = _mm_xor_si128(md, ms);
964   - _mm_storeu_si128((__m128i *)(d8), md);
965   - s8 += 16;
966   - d8 += 16;
967   - }
968   - while (d8 != (uint8_t *) dest+bytes) {
969   - *d8 ^= *s8;
970   - d8++;
971   - s8++;
972   - }
973   - return;
974 974 #endif
975 975 #if defined(ARM_NEON)
976   - s8 = (uint8_t *) src;
977   - d8 = (uint8_t *) dest;
978   -
979   - if (uls % 16 == uld % 16) {
980   - gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
981   - while (s8 != rd.s_start) {
  976 + if (gf_cpu_supports_arm_neon) {
  977 + s8 = (uint8_t *) src;
  978 + d8 = (uint8_t *) dest;
  979 +
  980 + if (uls % 16 == uld % 16) {
  981 + gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
  982 + while (s8 != rd.s_start) {
  983 + *d8 ^= *s8;
  984 + s8++;
  985 + d8++;
  986 + }
  987 + while (s8 < (uint8_t *) rd.s_top) {
  988 + uint8x16_t vs = vld1q_u8 (s8);
  989 + uint8x16_t vd = vld1q_u8 (d8);
  990 + uint8x16_t vr = veorq_u8 (vs, vd);
  991 + vst1q_u8 (d8, vr);
  992 + s8 += 16;
  993 + d8 += 16;
  994 + }
  995 + } else {
  996 + while (s8 + 15 < (uint8_t *) src + bytes) {
  997 + uint8x16_t vs = vld1q_u8 (s8);
  998 + uint8x16_t vd = vld1q_u8 (d8);
  999 + uint8x16_t vr = veorq_u8 (vs, vd);
  1000 + vst1q_u8 (d8, vr);
  1001 + s8 += 16;
  1002 + d8 += 16;
  1003 + }
  1004 + }
  1005 + while (s8 < (uint8_t *) src + bytes) {
982 1006 *d8 ^= *s8;
983 1007 s8++;
984 1008 d8++;
985 1009 }
986   - while (s8 < (uint8_t *) rd.s_top) {
987   - uint8x16_t vs = vld1q_u8 (s8);
988   - uint8x16_t vd = vld1q_u8 (d8);
989   - uint8x16_t vr = veorq_u8 (vs, vd);
990   - vst1q_u8 (d8, vr);
991   - s8 += 16;
992   - d8 += 16;
993   - }
994   - } else {
995   - while (s8 + 15 < (uint8_t *) src + bytes) {
996   - uint8x16_t vs = vld1q_u8 (s8);
997   - uint8x16_t vd = vld1q_u8 (d8);
998   - uint8x16_t vr = veorq_u8 (vs, vd);
999   - vst1q_u8 (d8, vr);
1000   - s8 += 16;
1001   - d8 += 16;
1002   - }
1003   - }
1004   - while (s8 < (uint8_t *) src + bytes) {
1005   - *d8 ^= *s8;
1006   - s8++;
1007   - d8++;
  1010 + return;
1008 1011 }
1009   - return;
1010 1012 #endif
1011 1013 if (uls % 8 != uld % 8) {
1012 1014 gf_unaligned_xor(src, dest, bytes);
... ...
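
Note on the change: the SSE2 and NEON paths are still compiled in under #ifdef INTEL_SSE2 / ARM_NEON, but they are now also guarded by the runtime flags gf_cpu_supports_intel_sse2 and gf_cpu_supports_arm_neon, so the SIMD branches only execute when the running CPU actually supports them; otherwise gf_multby_one falls through to the scalar path. How those flags get populated is outside this diff. The snippet below is only a minimal sketch of that kind of detection, assuming a GCC/Clang-style builtin; the function name gf_cpu_identify_sketch and the use of __builtin_cpu_supports are illustrative assumptions, not the library's actual gf_cpu code.

/* Sketch (assumption, not the library's implementation): one way the
 * runtime flags consulted in gf_multby_one could be initialized. */
int gf_cpu_supports_intel_sse2 = 0;   /* checked by the SSE2 branch above */
int gf_cpu_supports_arm_neon   = 0;   /* checked by the NEON branch above */

static void gf_cpu_identify_sketch(void)
{
#if defined(__x86_64__) || defined(__i386__)
  /* GCC/Clang builtin: nonzero if the running CPU implements SSE2. */
  if (__builtin_cpu_supports("sse2"))
    gf_cpu_supports_intel_sse2 = 1;
#elif defined(__aarch64__)
  /* Every AArch64 CPU implements NEON (Advanced SIMD). */
  gf_cpu_supports_arm_neon = 1;
#endif
}

With a guard like this in place, a single binary built with the SIMD intrinsics enabled can still run on hardware that lacks them, since gf_multby_one simply skips the SSE2/NEON branches when the corresponding flag is zero.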