123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478 |
// SIMD helper
// optimized per technology: double, float and integer (32-bit) SIMD instructions
// written by Martin Steinegger
- #ifndef SIMD_H
- #define SIMD_H
- #include <cstdlib>
- #include <limits>
- #include <algorithm>
- #include <iostream>
// Per instruction set: required alignment (passed as the boundary to the
// aligned allocator further down) and number of elements per vector.

// double
#define AVX512_ALIGN_DOUBLE    64
#define AVX512_VECSIZE_DOUBLE  8
#define AVX_ALIGN_DOUBLE       32
#define AVX_VECSIZE_DOUBLE     4
#define SSE_ALIGN_DOUBLE       16
#define SSE_VECSIZE_DOUBLE     2

// float
#define AVX512_ALIGN_FLOAT     64
#define AVX512_VECSIZE_FLOAT   16
#define AVX_ALIGN_FLOAT        32
#define AVX_VECSIZE_FLOAT      8
#define SSE_ALIGN_FLOAT        16
#define SSE_VECSIZE_FLOAT      4

// 32-bit integer
#define AVX512_ALIGN_INT       64
#define AVX512_VECSIZE_INT     16
#define AVX2_ALIGN_INT         32
#define AVX2_VECSIZE_INT       8
#define SSE_ALIGN_INT          16
#define SSE_VECSIZE_INT        4

// Largest values over all instruction sets listed above.
#define MAX_ALIGN_DOUBLE       AVX512_ALIGN_DOUBLE
#define MAX_VECSIZE_DOUBLE     AVX512_VECSIZE_DOUBLE
#define MAX_ALIGN_FLOAT        AVX512_ALIGN_FLOAT
#define MAX_VECSIZE_FLOAT      AVX512_VECSIZE_FLOAT
#define MAX_ALIGN_INT          AVX512_ALIGN_INT
#define MAX_VECSIZE_INT        AVX512_VECSIZE_INT
- #define SIMDE_ENABLE_NATIVE_ALIASES
- #include <simde/simde-features.h>
- // FIXME: Finish AVX512 implementation
- //#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE)
- //#define AVX512
- //#endif
// The AVX2 sections of this header are compiled when either AVX512 is
// defined or SIMDE_X86_AVX2_NATIVE is defined.
#if defined(SIMDE_X86_AVX2_NATIVE) || defined(AVX512)
#  define AVX2
#endif
- #ifdef AVX512
- #include <simde/x86/avx512f.h>
- #include <simde/x86/avx512bw.h>
- // double support
- #ifndef SIMD_DOUBLE
- #define SIMD_DOUBLE
- #define ALIGN_DOUBLE AVX512_ALIGN_DOUBLE
- #define VECSIZE_DOUBLE AVX512_VECSIZE_DOUBLE
- typedef __m512d simd_double;
- #define simdf64_add(x,y) _mm512_add_pd(x,y)
- #define simdf64_sub(x,y) _mm512_sub_pd(x,y)
- #define simdf64_mul(x,y) _mm512_mul_pd(x,y)
- #define simdf64_div(x,y) _mm512_div_pd(x,y)
- #define simdf64_max(x,y) _mm512_max_pd(x,y)
- #define simdf64_load(x) _mm512_load_pd(x)
- #define simdf64_store(x,y) _mm512_store_pd(x,y)
- #define simdf64_set(x) _mm512_set1_pd(x)
- #define simdf64_setzero(x) _mm512_setzero_pd()
- #define simdf64_gt(x,y) _mm512_cmpnle_pd_mask(x,y)
- #define simdf64_lt(x,y) _mm512_cmplt_pd_mask(x,y)
- #define simdf64_or(x,y) _mm512_or_si512(x,y)
- #define simdf64_and(x,y) _mm512_and_si512 (x,y)
- #define simdf64_andnot(x,y) _mm512_andnot_si512(x,y)
- #define simdf64_xor(x,y) _mm512_xor_si512(x,y)
- #endif //SIMD_DOUBLE
- // float support
- #ifndef SIMD_FLOAT
- #define SIMD_FLOAT
- #define ALIGN_FLOAT AVX512_ALIGN_FLOAT
- #define VECSIZE_FLOAT AVX512_VECSIZE_FLOAT
- typedef __m512 simd_float;
- #define simdf32_add(x,y) _mm512_add_ps(x,y)
- #define simdf32_sub(x,y) _mm512_sub_ps(x,y)
- #define simdf32_mul(x,y) _mm512_mul_ps(x,y)
- #define simdf32_div(x,y) _mm512_div_ps(x,y)
- #define simdf32_rcp(x) _mm512_rcp_ps(x)
- #define simdf32_max(x,y) _mm512_max_ps(x,y)
- #define simdf32_min(x,y) _mm512_min_ps(x,y)
- #define simdf32_load(x) _mm512_load_ps(x)
- #define simdf32_store(x,y) _mm512_store_ps(x,y)
- #define simdf32_set(x) _mm512_set1_ps(x)
- #define simdf32_setzero(x) _mm512_setzero_ps()
- #define simdf32_gt(x,y) _mm512_cmpnle_ps_mask(x,y)
- #define simdf32_eq(x,y) _mm512_cmpeq_ps_mask(x,y)
- #define simdf32_lt(x,y) _mm512_cmplt_ps_mask(x,y)
- #define simdf32_or(x,y) _mm512_or_si512(x,y)
- #define simdf32_and(x,y) _mm512_and_si512(x,y)
- #define simdf32_andnot(x,y) _mm512_andnot_si512(x,y)
- #define simdf32_xor(x,y) _mm512_xor_si512(x,y)
- #define simdf32_f2i(x) _mm512_cvtps_epi32(x) // convert s.p. float to integer
- #define simdf_f2icast(x) _mm512_castps_si512(x)
- #endif //SIMD_FLOAT
- // integer support
- #ifndef SIMD_INT
- #define SIMD_INT
- #define ALIGN_INT AVX512_ALIGN_INT
- #define VECSIZE_INT AVX512_VECSIZE_INT
- typedef __m512i simd_int;
- #define simdi32_add(x,y) _mm512_add_epi32(x,y)
- #define simdi16_add(x,y) _mm512_add_epi16(x,y)
- #define simdi16_adds(x,y) _mm512_adds_epi16(x,y)
- #define simdui8_adds(x,y) _mm512_adds_epu8()
- #define simdi32_sub(x,y) _mm512_sub_epi32(x,y)
- #define simdui8_subs(x,y) _mm512_subs_epu8()
- #define simdi32_mul(x,y) _mm512_mullo_epi32(x,y)
- #define simdui8_max(x,y) _mm512_max_epu8()
- #define simdi16_max(x,y) _mm512_max_epi16(x,y)
- #define simdi32_max(x,y) _mm512_max_epi32(x,y)
- #define simdi_load(x) _mm512_load_si512(x)
- #define simdi_streamload(x) _mm512_stream_load_si512(x)
- #define simdi_store(x,y) _mm512_store_si512(x,y)
- #define simdi_storeu(x,y) _mm512_storeu_si512(x,y)
- #define simdi32_set(x) _mm512_set1_epi32(x)
- #define simdi16_set(x) _mm512_set1_epi16(x)
- #define simdi8_set(x) _mm512_set1_epi8(x)
- #define simdi32_shuffle(x,y) _mm512_shuffle_epi32(x,y)
- #define simdi16_shuffle(x,y) NOT_YET_IMP(x,y)
- #define simdi8_shuffle(x,y) _mm512_shuffle_epi8(x,y)
- #define simdi_setzero() _mm512_setzero_si512()
- #define simdi32_gt(x,y) _mm512_cmpgt_epi32(x,y)
- #define simdi8_gt(x,y) NOT_YET_IMP()
- #define simdi16_gt(x,y) NOT_YET_IMP()
- #define simdi8_eq(x,y) NOT_YET_IMP()
- #define simdi32_lt(x,y) NOT_YET_IMP()
- #define simdi16_lt(x,y) NOT_YET_IMP()
- #define simdi8_lt(x,y) NOT_YET_IMP()
- #define simdi_or(x,y) _mm512_or_si512(x,y)
- #define simdi_and(x,y) _mm512_and_si512(x,y)
- #define simdi_andnot(x,y) _mm512_andnot_si512(x,y)
- #define simdi_xor(x,y) _mm512_xor_si512(x,y)
- #define simdi8_shiftl(x,y) NOT_YET_IMP()
- #define simdi8_shiftr(x,y) NOT_YET_IMP()
- #define simdi8_movemask(x) NOT_YET_IMP()
- #define simdi16_extract(x,y) NOT_YET_IMP()
- #define simdi16_slli(x,y) _mm512_slli_epi16(x,y) // shift integers in a left by y
- #define simdi16_srli(x,y) _mm512_srli_epi16(x,y) // shift integers in a right by y
- #define simdi32_slli(x,y) _mm512_slli_epi32(x,y) // shift integers in a left by y
- #define simdi32_srli(x,y) _mm512_srli_epi32(x,y) // shift integers in a right by y
- #define simdi32_i2f(x) _mm512_cvtepi32_ps(x) // convert integer to s.p. float
- #define simdi_i2fcast(x) _mm512_castsi512_ps(x)
- #endif //SIMD_INT
- #endif //AVX512_SUPPORT
- #ifdef AVX2
- #include <simde/x86/avx2.h>
- // integer support (usable with AVX2)
- #ifndef SIMD_INT
- #define SIMD_INT
- #define ALIGN_INT AVX2_ALIGN_INT
- #define VECSIZE_INT AVX2_VECSIZE_INT
- uint16_t simd_hmax16_sse(const __m128i buffer);
- uint8_t simd_hmax8_sse(const __m128i buffer);
- inline uint16_t simd_hmax16_avx(const __m256i buffer) {
- const __m128i abcd = _mm256_castsi256_si128(buffer);
- const uint16_t first = simd_hmax16_sse(abcd);
- const __m128i efgh = _mm256_extracti128_si256(buffer, 1);
- const uint16_t second = simd_hmax16_sse(efgh);
- return std::max(first, second);
- }
- inline uint8_t simd_hmax8_avx(const __m256i buffer) {
- const __m128i abcd = _mm256_castsi256_si128(buffer);
- const uint8_t first = simd_hmax8_sse(abcd);
- const __m128i efgh = _mm256_extracti128_si256(buffer, 1);
- const uint8_t second = simd_hmax8_sse(efgh);
- return std::max(first, second);
- }
- template <unsigned int N>
- inline __m256i _mm256_shift_left(__m256i a) {
- __m256i mask = _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0,0,3,0) );
- return _mm256_alignr_epi8(a,mask,16-N);
- }
- inline unsigned short extract_epi16(__m256i v, int pos) {
- switch(pos){
- case 0: return _mm256_extract_epi16(v, 0);
- case 1: return _mm256_extract_epi16(v, 1);
- case 2: return _mm256_extract_epi16(v, 2);
- case 3: return _mm256_extract_epi16(v, 3);
- case 4: return _mm256_extract_epi16(v, 4);
- case 5: return _mm256_extract_epi16(v, 5);
- case 6: return _mm256_extract_epi16(v, 6);
- case 7: return _mm256_extract_epi16(v, 7);
- case 8: return _mm256_extract_epi16(v, 8);
- case 9: return _mm256_extract_epi16(v, 9);
- case 10: return _mm256_extract_epi16(v, 10);
- case 11: return _mm256_extract_epi16(v, 11);
- case 12: return _mm256_extract_epi16(v, 12);
- case 13: return _mm256_extract_epi16(v, 13);
- case 14: return _mm256_extract_epi16(v, 14);
- case 15: return _mm256_extract_epi16(v, 15);
- }
- return 0;
- }
- typedef __m256i simd_int;
- #define simdi32_add(x,y) _mm256_add_epi32(x,y)
- #define simdi16_add(x,y) _mm256_add_epi16(x,y)
- #define simdi16_adds(x,y) _mm256_adds_epi16(x,y)
- #define simdui8_adds(x,y) _mm256_adds_epu8(x,y)
- #define simdi32_sub(x,y) _mm256_sub_epi32(x,y)
- #define simdui16_subs(x,y) _mm256_subs_epu16(x,y)
- #define simdui8_subs(x,y) _mm256_subs_epu8(x,y)
- #define simdi32_mul(x,y) _mm256_mullo_epi32(x,y)
- #define simdi32_max(x,y) _mm256_max_epi32(x,y)
- #define simdi16_max(x,y) _mm256_max_epi16(x,y)
- #define simdi16_hmax(x) simd_hmax16_avx(x)
- #define simdui8_max(x,y) _mm256_max_epu8(x,y)
- #define simdi8_hmax(x) simd_hmax8_avx(x)
- #define simdi_load(x) _mm256_load_si256(x)
- #define simdi_loadu(x) _mm256_loadu_si256(x)
- #define simdi_streamload(x) _mm256_stream_load_si256(x)
- #define simdi_store(x,y) _mm256_store_si256(x,y)
- #define simdi_storeu(x,y) _mm256_storeu_si256(x,y)
- #define simdi32_set(x) _mm256_set1_epi32(x)
- #define simdi16_set(x) _mm256_set1_epi16(x)
- #define simdi8_set(x) _mm256_set1_epi8(x)
- #define simdi32_shuffle(x,y) _mm256_shuffle_epi32(x,y)
- #define simdi16_shuffle(x,y) _mm256_shuffle_epi16(x,y)
- #define simdi8_shuffle(x,y) _mm256_shuffle_epi8(x,y)
- #define simdi_setzero() _mm256_setzero_si256()
- #define simdi32_gt(x,y) _mm256_cmpgt_epi32(x,y)
- #define simdi8_gt(x,y) _mm256_cmpgt_epi8(x,y)
- #define simdi16_gt(x,y) _mm256_cmpgt_epi16(x,y)
- #define simdi8_eq(x,y) _mm256_cmpeq_epi8(x,y)
- #define simdi16_eq(x,y) _mm256_cmpeq_epi16(x,y)
- #define simdi32_eq(x,y) _mm256_cmpeq_epi32(x,y)
- #define simdi32_lt(x,y) _mm256_cmpgt_epi32(y,x) // inverse
- #define simdi16_lt(x,y) _mm256_cmpgt_epi16(y,x) // inverse
- #define simdi8_lt(x,y) _mm256_cmpgt_epi8(y,x)
- #define simdi_or(x,y) _mm256_or_si256(x,y)
- #define simdi_and(x,y) _mm256_and_si256(x,y)
- #define simdi_andnot(x,y) _mm256_andnot_si256(x,y)
- #define simdi_xor(x,y) _mm256_xor_si256(x,y)
- #define simdi8_shiftl(x,y) _mm256_shift_left<y>(x)
- //TODO fix like shift_left
- #define simdi8_shiftr(x,y) _mm256_srli_si256(x,y)
- #define SIMD_MOVEMASK_MAX 0xffffffff
- #define simdi8_movemask(x) _mm256_movemask_epi8(x)
- #define simdi16_extract(x,y) extract_epi16(x,y)
- #define simdi16_slli(x,y) _mm256_slli_epi16(x,y) // shift integers in a left by y
- #define simdi16_srli(x,y) _mm256_srli_epi16(x,y) // shift integers in a right by y
- #define simdi32_slli(x,y) _mm256_slli_epi32(x,y) // shift integers in a left by y
- #define simdi32_srli(x,y) _mm256_srli_epi32(x,y) // shift integers in a right by y
- #define simdi32_i2f(x) _mm256_cvtepi32_ps(x) // convert integer to s.p. float
- #define simdi_i2fcast(x) _mm256_castsi256_ps(x)
- #endif
- #include <simde/x86/avx.h>
- // double support (usable with AVX1)
- #ifndef SIMD_DOUBLE
- #define SIMD_DOUBLE
- #define ALIGN_DOUBLE AVX_ALIGN_DOUBLE
- #define VECSIZE_DOUBLE AVX_VECSIZE_DOUBLE
- typedef __m256d simd_double;
- #define simdf64_add(x,y) _mm256_add_pd(x,y)
- #define simdf64_sub(x,y) _mm256_sub_pd(x,y)
- #define simdf64_mul(x,y) _mm256_mul_pd(x,y)
- #define simdf64_div(x,y) _mm256_div_pd(x,y)
- #define simdf64_max(x,y) _mm256_max_pd(x,y)
- #define simdf64_load(x) _mm256_load_pd(x)
- #define simdf64_store(x,y) _mm256_store_pd(x,y)
- #define simdf64_set(x) _mm256_set1_pd(x)
- #define simdf64_setzero(x) _mm256_setzero_pd()
- #define simdf64_gt(x,y) _mm256_cmp_pd(x,y,_CMP_GT_OS)
- #define simdf64_lt(x,y) _mm256_cmp_pd(x,y,_CMP_LT_OS)
- #define simdf64_or(x,y) _mm256_or_pd(x,y)
- #define simdf64_and(x,y) _mm256_and_pd(x,y)
- #define simdf64_andnot(x,y) _mm256_andnot_pd(x,y)
- #define simdf64_xor(x,y) _mm256_xor_pd(x,y)
- #endif //SIMD_DOUBLE
- // float support (usable with AVX1)
- #ifndef SIMD_FLOAT
- #define SIMD_FLOAT
- #define ALIGN_FLOAT AVX_ALIGN_FLOAT
- #define VECSIZE_FLOAT AVX_VECSIZE_FLOAT
- typedef __m256 simd_float;
- #define simdf32_add(x,y) _mm256_add_ps(x,y)
- #define simdf32_sub(x,y) _mm256_sub_ps(x,y)
- #define simdf32_mul(x,y) _mm256_mul_ps(x,y)
- #define simdf32_div(x,y) _mm256_div_ps(x,y)
- #define simdf32_rcp(x) _mm256_rcp_ps(x)
- #define simdf32_max(x,y) _mm256_max_ps(x,y)
- #define simdf32_min(x,y) _mm256_min_ps(x,y)
- #define simdf32_load(x) _mm256_load_ps(x)
- #define simdf32_store(x,y) _mm256_store_ps(x,y)
- #define simdf32_set(x) _mm256_set1_ps(x)
- #define simdf32_setzero(x) _mm256_setzero_ps()
- #define simdf32_gt(x,y) _mm256_cmp_ps(x,y,_CMP_GT_OS)
- #define simdf32_eq(x,y) _mm256_cmp_ps(x,y,_CMP_EQ_OS)
- #define simdf32_lt(x,y) _mm256_cmp_ps(x,y,_CMP_LT_OS)
- #define simdf32_or(x,y) _mm256_or_ps(x,y)
- #define simdf32_and(x,y) _mm256_and_ps(x,y)
- #define simdf32_andnot(x,y) _mm256_andnot_ps(x,y)
- #define simdf32_xor(x,y) _mm256_xor_ps(x,y)
- #define simdf32_f2i(x) _mm256_cvtps_epi32(x) // convert s.p. float to integer
- #define simdf_f2icast(x) _mm256_castps_si256(x) // compile time cast
- #endif
- #endif
- #include <simde/x86/sse4.1.h>
- inline uint16_t simd_hmax16_sse(const __m128i buffer) {
- __m128i tmp1 = _mm_subs_epu16(_mm_set1_epi16((short)65535), buffer);
- __m128i tmp3 = _mm_minpos_epu16(tmp1);
- return (65535 - _mm_cvtsi128_si32(tmp3));
- }
- inline uint8_t simd_hmax8_sse(const __m128i buffer) {
- __m128i tmp1 = _mm_subs_epu8(_mm_set1_epi8((char)255), buffer);
- __m128i tmp2 = _mm_min_epu8(tmp1, _mm_srli_epi16(tmp1, 8));
- __m128i tmp3 = _mm_minpos_epu16(tmp2);
- return (int8_t)(255 -(int8_t) _mm_cvtsi128_si32(tmp3));
- }
- // double support
- #ifndef SIMD_DOUBLE
- #define SIMD_DOUBLE
- #define ALIGN_DOUBLE SSE_ALIGN_DOUBLE
- #define VECSIZE_DOUBLE SSE_VECSIZE_DOUBLE
- typedef __m128d simd_double;
- #define simdf64_add(x,y) _mm_add_pd(x,y)
- #define simdf64_sub(x,y) _mm_sub_pd(x,y)
- #define simdf64_mul(x,y) _mm_mul_pd(x,y)
- #define simdf64_div(x,y) _mm_div_pd(x,y)
- #define simdf64_max(x,y) _mm_max_pd(x,y)
- #define simdf64_load(x) _mm_load_pd(x)
- #define simdf64_store(x,y) _mm_store_pd(x,y)
- #define simdf64_set(x) _mm_set1_pd(x)
- #define simdf64_setzero(x) _mm_setzero_pd()
- #define simdf64_gt(x,y) _mm_cmpgt_pd(x,y)
- #define simdf64_lt(x,y) _mm_cmplt_pd(x,y)
- #define simdf64_or(x,y) _mm_or_pd(x,y)
- #define simdf64_and(x,y) _mm_and_pd(x,y)
- #define simdf64_andnot(x,y) _mm_andnot_pd(x,y)
- #define simdf64_xor(x,y) _mm_xor_pd(x,y)
- #endif //SIMD_DOUBLE
- // float support
- #ifndef SIMD_FLOAT
- #define SIMD_FLOAT
- #define ALIGN_FLOAT SSE_ALIGN_FLOAT
- #define VECSIZE_FLOAT SSE_VECSIZE_FLOAT
- typedef __m128 simd_float;
- #define simdf32_add(x,y) _mm_add_ps(x,y)
- #define simdf32_sub(x,y) _mm_sub_ps(x,y)
- #define simdf32_mul(x,y) _mm_mul_ps(x,y)
- #define simdf32_div(x,y) _mm_div_ps(x,y)
- #define simdf32_rcp(x) _mm_rcp_ps(x)
- #define simdf32_max(x,y) _mm_max_ps(x,y)
- #define simdf32_min(x,y) _mm_min_ps(x,y)
- #define simdf32_load(x) _mm_load_ps(x)
- #define simdf32_store(x,y) _mm_store_ps(x,y)
- #define simdf32_set(x) _mm_set1_ps(x)
- #define simdf32_setzero(x) _mm_setzero_ps()
- #define simdf32_gt(x,y) _mm_cmpgt_ps(x,y)
- #define simdf32_eq(x,y) _mm_cmpeq_ps(x,y)
- #define simdf32_lt(x,y) _mm_cmplt_ps(x,y)
- #define simdf32_or(x,y) _mm_or_ps(x,y)
- #define simdf32_and(x,y) _mm_and_ps(x,y)
- #define simdf32_andnot(x,y) _mm_andnot_ps(x,y)
- #define simdf32_xor(x,y) _mm_xor_ps(x,y)
- #define simdf32_f2i(x) _mm_cvtps_epi32(x) // convert s.p. float to integer
- #define simdf_f2icast(x) _mm_castps_si128(x) // compile time cast
- #endif //SIMD_FLOAT
- // integer support
- #ifndef SIMD_INT
- #define SIMD_INT
- inline unsigned short extract_epi16(__m128i v, int pos) {
- switch(pos){
- case 0: return _mm_extract_epi16(v, 0);
- case 1: return _mm_extract_epi16(v, 1);
- case 2: return _mm_extract_epi16(v, 2);
- case 3: return _mm_extract_epi16(v, 3);
- case 4: return _mm_extract_epi16(v, 4);
- case 5: return _mm_extract_epi16(v, 5);
- case 6: return _mm_extract_epi16(v, 6);
- case 7: return _mm_extract_epi16(v, 7);
- }
- return 0;
- }
- #define ALIGN_INT SSE_ALIGN_INT
- #define VECSIZE_INT SSE_VECSIZE_INT
- typedef __m128i simd_int;
- #define simdi32_add(x,y) _mm_add_epi32(x,y)
- #define simdi16_add(x,y) _mm_add_epi16(x,y)
- #define simdi16_adds(x,y) _mm_adds_epi16(x,y)
- #define simdui8_adds(x,y) _mm_adds_epu8(x,y)
- #define simdi32_sub(x,y) _mm_sub_epi32(x,y)
- #define simdui16_subs(x,y) _mm_subs_epu16(x,y)
- #define simdui8_subs(x,y) _mm_subs_epu8(x,y)
- #define simdi32_mul(x,y) _mm_mullo_epi32(x,y) // SSE4.1
- #define simdi32_max(x,y) _mm_max_epi32(x,y) // SSE4.1
- #define simdi16_max(x,y) _mm_max_epi16(x,y)
- #define simdi16_hmax(x) simd_hmax16_sse(x)
- #define simdui8_max(x,y) _mm_max_epu8(x,y)
- #define simdi8_hmax(x) simd_hmax8_sse(x)
- #define simdi_load(x) _mm_load_si128(x)
- #define simdi_loadu(x) _mm_loadu_si128(x)
- #define simdi_streamload(x) _mm_stream_load_si128(x)
- #define simdi_storeu(x,y) _mm_storeu_si128(x,y)
- #define simdi_store(x,y) _mm_store_si128(x,y)
- #define simdi32_set(x) _mm_set1_epi32(x)
- #define simdi16_set(x) _mm_set1_epi16(x)
- #define simdi8_set(x) _mm_set1_epi8(x)
- #define simdi32_shuffle(x,y) _mm_shuffle_epi32(x,y)
- #define simdi16_shuffle(x,y) _mm_shuffle_epi16(x,y)
- #define simdi8_shuffle(x,y) _mm_shuffle_epi8(x,y)
- #define simdi_setzero() _mm_setzero_si128()
- #define simdi32_gt(x,y) _mm_cmpgt_epi32(x,y)
- #define simdi8_gt(x,y) _mm_cmpgt_epi8(x,y)
- #define simdi32_eq(x,y) _mm_cmpeq_epi32(x,y)
- #define simdi16_eq(x,y) _mm_cmpeq_epi16(x,y)
- #define simdi8_eq(x,y) _mm_cmpeq_epi8(x,y)
- #define simdi32_lt(x,y) _mm_cmplt_epi32(x,y)
- #define simdi16_lt(x,y) _mm_cmplt_epi16(x,y)
- #define simdi8_lt(x,y) _mm_cmplt_epi8(x,y)
- #define simdi16_gt(x,y) _mm_cmpgt_epi16(x,y)
- #define simdi_or(x,y) _mm_or_si128(x,y)
- #define simdi_and(x,y) _mm_and_si128(x,y)
- #define simdi_andnot(x,y) _mm_andnot_si128(x,y)
- #define simdi_xor(x,y) _mm_xor_si128(x,y)
- #define simdi8_shiftl(x,y) _mm_slli_si128(x,y)
- #define simdi8_shiftr(x,y) _mm_srli_si128(x,y)
- #define SIMD_MOVEMASK_MAX 0xffff
- #define simdi8_movemask(x) _mm_movemask_epi8(x)
- #define simdi16_extract(x,y) extract_epi16(x,y)
- #define simdi16_slli(x,y) _mm_slli_epi16(x,y) // shift integers in a left by y
- #define simdi16_srli(x,y) _mm_srli_epi16(x,y) // shift integers in a right by y
- #define simdi32_slli(x,y) _mm_slli_epi32(x,y) // shift integers in a left by y
- #define simdi32_srli(x,y) _mm_srli_epi32(x,y) // shift integers in a right by y
- #define simdi32_i2f(x) _mm_cvtepi32_ps(x) // convert integer to s.p. float
- #define simdi_i2fcast(x) _mm_castsi128_ps(x)
- #endif //SIMD_INT
// Allocates `size` bytes aligned to `boundary` using posix_memalign.
//
// boundary: requested alignment, passed to posix_memalign unchanged.
// size:     number of bytes to allocate.
// Returns the pointer produced by posix_memalign; release it with free().
// If posix_memalign reports an error, a message is written to stderr and the
// process terminates with exit code 3, so this function never returns after
// a failed allocation.
inline void *mem_align(size_t boundary, size_t size) {
    void *pointer = NULL;
    if (posix_memalign(&pointer, boundary, size) != 0) {
        const char message[] = "mem_align could not allocate memory.\n";
        // sizeof(message) - 1: do not write the terminating NUL byte.
        fwrite(message, sizeof(message) - 1, 1, stderr);
        exit(3);
    }
    return pointer;
}
#ifdef SIMD_FLOAT
// Allocates `size` bytes through mem_align with ALIGN_FLOAT as the boundary
// and returns the block typed as simd_float*.
inline simd_float * malloc_simd_float(const size_t size) {
    void *block = mem_align(ALIGN_FLOAT, size);
    return static_cast<simd_float *>(block);
}
#endif
#ifdef SIMD_DOUBLE
// Allocates `size` bytes through mem_align with ALIGN_DOUBLE as the boundary
// and returns the block typed as simd_double*.
inline simd_double * malloc_simd_double(const size_t size) {
    void *block = mem_align(ALIGN_DOUBLE, size);
    return static_cast<simd_double *>(block);
}
#endif
#ifdef SIMD_INT
// Allocates `size` bytes through mem_align with ALIGN_INT as the boundary
// and returns the block typed as simd_int*.
inline simd_int * malloc_simd_int(const size_t size) {
    void *block = mem_align(ALIGN_INT, size);
    return static_cast<simd_int *>(block);
}
#endif
- #endif //SIMD_H
|