Issue #9

0 up
0 down
Open
jerasure/gf-complete#9
Created by Nyan

Altmap transform over region

I quite like the idea of ALTMAP for fast region multiplies using the SPLIT method (it's quite ingenious!).

GF-complete supplies a means to extract individual words from an ALTMAP region, but no way to transform a whole region to or from ALTMAP, which would be quite useful to have. So I've written a crude routine to do so (for w=16), presented below in case you find it useful:

/*
#ifdef INTEL_AVX512BW

#define _mword __m512i
#define _MM(f) _mm512_ ## f
#define _MMI(f) _mm512_ ## f ## i512
#define _FN(f) f ## _avx512

#include "gf_w16_split.c"

#undef _mword
#undef _MM
#undef _MMI
#undef _FN

#endif

#ifdef INTEL_AVX2
#define _mword __m256i
#define _MM(f) _mm256_ ## f
#define _MMI(f) _mm256_ ## f ## i256
#define _FN(f) f ## _avx2

#include "gf_w16_split.c"

#undef _mword
#undef _MM
#undef _MMI
#undef _FN

#endif
*/

/*
 * Template parameters for the 128-bit (SSE) instantiation of the routines
 * below:
 *   _mword  - vector register type
 *   _MM(f)  - prefixes an intrinsic name, e.g. _MM(set1_epi16) -> _mm_set1_epi16
 *   _MMI(f) - prefixes an intrinsic name and appends the register-width
 *             suffix, e.g. _MMI(load_s) -> _mm_load_si128
 *   _FN(f)  - appends the _sse suffix to a function name
 *
 * NOTE(review): identifiers that begin with an underscore are reserved for
 * the implementation at file scope; consider renaming if this is merged.
 */
#define _mword __m128i
#define _MM(f) _mm_ ## f
#define _MMI(f) _mm_ ## f ## i128
#define _FN(f) f ## _sse

/* #include "gf_w16_split.c" */

/*
 * Convert a region of 16-bit words from the standard layout into the split
 * (ALTMAP) layout.
 *
 * The region is divided by gf_set_region_data() into an unaligned prefix, a
 * middle part [s_start, s_top) and a suffix. Only the middle part is
 * transformed, two vectors (2 * sizeof(_mword) bytes) at a time: the first
 * output vector receives the upper byte of every 16-bit word of the pair, the
 * second output vector receives the lower byte. With the 128-bit
 * instantiation, the bytes taken from the second input vector land in the low
 * half of each output vector and those from the first input vector in the
 * high half. Prefix and suffix stay in the standard layout and are copied
 * verbatim when src != dest.
 *
 * src   - input region
 * bytes - length of the region in bytes; nothing is done when bytes <= 0
 * dest  - output region. May be the same pointer as src (in-place). Distinct
 *         regions are copied with memcpy, so they must not partially overlap.
 *
 * The vector loads/stores are the aligned variants; this presumably relies on
 * gf_set_region_data() returning suitably aligned s_start/d_start - confirm
 * against that helper.
 *
 * NOTE(review): confirm that the half ordering described above matches what
 * code reading individual words from an ALTMAP region expects.
 *
 * Compiles to an empty function unless INTEL_SSE2 is defined.
 */
static void _FN(gf_w16_split_start)(void* src, int bytes, void* dest) {
#ifdef INTEL_SSE2
    gf_region_data rd;
    _mword *sW, *dW, *topW;
    _mword ta, tb, lmask;
    intptr_t head;

    /* An empty or negative length would otherwise produce negative copy
     * sizes (and possibly an unterminated loop) below. */
    if(bytes <= 0)
        return;

    gf_set_region_data(&rd, NULL, src, dest, bytes, 0, 0, sizeof(_mword)*2);

    /* Length of the unaligned prefix that precedes the vector region. */
    head = (intptr_t)rd.s_start - (intptr_t)rd.src;
    if(head >= bytes) {
        /* The region ends before the first aligned word: there is nothing to
         * transform, and the prefix/suffix copies below would run past the
         * end of the buffers. */
        if(src != dest)
            memcpy(dest, src, (size_t)bytes);
        return;
    }

    if(src != dest) {
        /* copy the suffix and the prefix, which are not transformed */
        memcpy(rd.d_top, rd.s_top, (intptr_t)rd.src + rd.bytes - (intptr_t)rd.s_top);
        memcpy(rd.dest, rd.src, head);
    }

    sW = (_mword*)rd.s_start;
    dW = (_mword*)rd.d_start;
    topW = (_mword*)rd.d_top;

    /* selects the lower byte of every 16-bit lane */
    lmask = _MM(set1_epi16) (0xff);

    while(dW != topW) {
        /* Both inputs are loaded before any store so that in-place operation
         * (src == dest) is safe. */
        ta = _MMI(load_s)( sW);
        tb = _MMI(load_s)(sW+1);

        /* Upper bytes: every lane is <= 0xff after the shift, so the
         * saturating pack simply narrows each lane to a byte. */
        _MMI(store_s) (dW,
            _MM(packus_epi16)(
                _MM(srli_epi16)(tb, 8),
                _MM(srli_epi16)(ta, 8)
            )
        );
        /* Lower bytes: masked to <= 0xff, then narrowed the same way. */
        _MMI(store_s) (dW+1,
            _MM(packus_epi16)(
                _MMI(and_s)(tb, lmask),
                _MMI(and_s)(ta, lmask)
            )
        );

        sW += 2;
        dW += 2;
    }
#endif
}

/*
 * Convert a region from the split (ALTMAP) layout back into the standard
 * layout of 16-bit words.
 *
 * The region is divided by gf_set_region_data() into an unaligned prefix, a
 * middle part [s_start, s_top) and a suffix. Only the middle part is
 * transformed, two vectors (2 * sizeof(_mword) bytes) at a time: the first
 * input vector is taken as the upper bytes and the second as the lower bytes
 * of the words, and the two are re-interleaved byte by byte. With the 128-bit
 * instantiation, the high halves of the inputs form the first output vector
 * and the low halves the second. Prefix and suffix are copied verbatim when
 * src != dest.
 *
 * src   - input region
 * bytes - length of the region in bytes; nothing is done when bytes <= 0
 * dest  - output region. May be the same pointer as src (in-place). Distinct
 *         regions are copied with memcpy, so they must not partially overlap.
 *
 * The vector loads/stores are the aligned variants; this presumably relies on
 * gf_set_region_data() returning suitably aligned s_start/d_start - confirm
 * against that helper.
 *
 * Compiles to an empty function unless INTEL_SSE2 is defined.
 */
static void _FN(gf_w16_split_final)(void* src, int bytes, void* dest) {
#ifdef INTEL_SSE2
    gf_region_data rd;
    _mword *sW, *dW, *topW;
    _mword tpl, tph;
    intptr_t head;

    /* An empty or negative length would otherwise produce negative copy
     * sizes (and possibly an unterminated loop) below. */
    if(bytes <= 0)
        return;

    gf_set_region_data(&rd, NULL, src, dest, bytes, 0, 0, sizeof(_mword)*2);

    /* Length of the unaligned prefix that precedes the vector region. */
    head = (intptr_t)rd.s_start - (intptr_t)rd.src;
    if(head >= bytes) {
        /* The region ends before the first aligned word: there is nothing to
         * transform, and the prefix/suffix copies below would run past the
         * end of the buffers. */
        if(src != dest)
            memcpy(dest, src, (size_t)bytes);
        return;
    }

    if(src != dest) {
        /* copy the suffix and the prefix, which are not transformed */
        memcpy(rd.d_top, rd.s_top, (intptr_t)rd.src + rd.bytes - (intptr_t)rd.s_top);
        memcpy(rd.dest, rd.src, head);
    }

    sW = (_mword*)rd.s_start;
    dW = (_mword*)rd.d_start;
    topW = (_mword*)rd.d_top;

    while(dW != topW) {
        /* Both inputs are loaded before any store so that in-place operation
         * (src == dest) is safe. */
        tph = _MMI(load_s)( sW);   /* upper bytes of the words */
        tpl = _MMI(load_s)(sW+1);  /* lower bytes of the words */

        /* Interleave lower/upper bytes back into 16-bit words. */
        _MMI(store_s) (dW, _MM(unpackhi_epi8)(tpl, tph));
        _MMI(store_s) (dW+1, _MM(unpacklo_epi8)(tpl, tph));

        sW += 2;
        dW += 2;
    }
#endif
}


/* Remove the template macros defined for this instantiation so they do not
 * leak into the rest of the translation unit. */
#undef _mword
#undef _MM
#undef _MMI
#undef _FN
  • A proper implementation should include a non-SSE fallback, but since ALTMAP isn't very useful without SSE, I didn't bother.
  • I've also been experimenting with AVX2, hence the odd-looking defines. I don't know whether you want to bother with it, since it also changes the way ALTMAP works.
  • I feel that the length (bytes) should use size_t instead of int, but I decided to stay consistent here

Hope that's useful and thanks again for the library!

Assignee: None
Milestone: None
1 participant