// SIMD helper
// Optimized wrappers around double, float and 32-bit integer SIMD instructions
// written by Martin Steinegger
#ifndef SIMD_H
#define SIMD_H
#include <cstdlib>
#include <cstdio>   // fwrite, stderr
#include <cstdint>  // uint8_t, uint16_t
#include <limits>
#include <algorithm>
#include <iostream>
#define AVX512_ALIGN_DOUBLE 64
#define AVX512_VECSIZE_DOUBLE 8
#define AVX512_ALIGN_FLOAT 64
#define AVX512_VECSIZE_FLOAT 16
#define AVX512_ALIGN_INT 64
#define AVX512_VECSIZE_INT 16
#define AVX_ALIGN_DOUBLE 32
#define AVX_VECSIZE_DOUBLE 4
#define AVX_ALIGN_FLOAT 32
#define AVX_VECSIZE_FLOAT 8
#define AVX2_ALIGN_INT 32
#define AVX2_VECSIZE_INT 8
#define SSE_ALIGN_DOUBLE 16
#define SSE_VECSIZE_DOUBLE 2
#define SSE_ALIGN_FLOAT 16
#define SSE_VECSIZE_FLOAT 4
#define SSE_ALIGN_INT 16
#define SSE_VECSIZE_INT 4
#define MAX_ALIGN_DOUBLE AVX512_ALIGN_DOUBLE
#define MAX_VECSIZE_DOUBLE AVX512_VECSIZE_DOUBLE
#define MAX_ALIGN_FLOAT AVX512_ALIGN_FLOAT
#define MAX_VECSIZE_FLOAT AVX512_VECSIZE_FLOAT
#define MAX_ALIGN_INT AVX512_ALIGN_INT
#define MAX_VECSIZE_INT AVX512_VECSIZE_INT
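// SIMDe supplies portable implementations of the x86 intrinsics used below;
// native aliases let the _mm* names resolve on non-x86 targets as well.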
#define SIMDE_ENABLE_NATIVE_ALIASES
#include <simde/simde-features.h>
// FIXME: Finish AVX512 implementation
//#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE)
//#define AVX512
//#endif
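// An AVX-512 target also compiles the AVX2 code paths below.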
#if defined(AVX512) || defined(SIMDE_X86_AVX2_NATIVE)
#define AVX2
#endif
#ifdef AVX512
#include <simde/x86/avx512f.h>
#include <simde/x86/avx512bw.h>
// double support
#ifndef SIMD_DOUBLE
#define SIMD_DOUBLE
#define ALIGN_DOUBLE AVX512_ALIGN_DOUBLE
#define VECSIZE_DOUBLE AVX512_VECSIZE_DOUBLE
typedef __m512d simd_double;
#define simdf64_add(x,y) _mm512_add_pd(x,y)
#define simdf64_sub(x,y) _mm512_sub_pd(x,y)
#define simdf64_mul(x,y) _mm512_mul_pd(x,y)
#define simdf64_div(x,y) _mm512_div_pd(x,y)
#define simdf64_max(x,y) _mm512_max_pd(x,y)
#define simdf64_load(x) _mm512_load_pd(x)
#define simdf64_store(x,y) _mm512_store_pd(x,y)
#define simdf64_set(x) _mm512_set1_pd(x)
#define simdf64_setzero(x) _mm512_setzero_pd()
#define simdf64_gt(x,y) _mm512_cmpnle_pd_mask(x,y)
#define simdf64_lt(x,y) _mm512_cmplt_pd_mask(x,y)
#define simdf64_or(x,y) _mm512_or_si512(x,y)
#define simdf64_and(x,y) _mm512_and_si512(x,y)
#define simdf64_andnot(x,y) _mm512_andnot_si512(x,y)
#define simdf64_xor(x,y) _mm512_xor_si512(x,y)
#endif //SIMD_DOUBLE
// float support
#ifndef SIMD_FLOAT
#define SIMD_FLOAT
#define ALIGN_FLOAT AVX512_ALIGN_FLOAT
#define VECSIZE_FLOAT AVX512_VECSIZE_FLOAT
typedef __m512 simd_float;
#define simdf32_add(x,y) _mm512_add_ps(x,y)
#define simdf32_sub(x,y) _mm512_sub_ps(x,y)
#define simdf32_mul(x,y) _mm512_mul_ps(x,y)
#define simdf32_div(x,y) _mm512_div_ps(x,y)
#define simdf32_rcp(x) _mm512_rcp_ps(x)
#define simdf32_max(x,y) _mm512_max_ps(x,y)
#define simdf32_min(x,y) _mm512_min_ps(x,y)
#define simdf32_load(x) _mm512_load_ps(x)
#define simdf32_store(x,y) _mm512_store_ps(x,y)
#define simdf32_set(x) _mm512_set1_ps(x)
#define simdf32_setzero(x) _mm512_setzero_ps()
#define simdf32_gt(x,y) _mm512_cmpnle_ps_mask(x,y)
#define simdf32_eq(x,y) _mm512_cmpeq_ps_mask(x,y)
#define simdf32_lt(x,y) _mm512_cmplt_ps_mask(x,y)
#define simdf32_or(x,y) _mm512_or_si512(x,y)
#define simdf32_and(x,y) _mm512_and_si512(x,y)
#define simdf32_andnot(x,y) _mm512_andnot_si512(x,y)
#define simdf32_xor(x,y) _mm512_xor_si512(x,y)
#define simdf32_f2i(x) _mm512_cvtps_epi32(x) // convert s.p. float to integer
#define simdf_f2icast(x) _mm512_castps_si512(x)
#endif //SIMD_FLOAT
// integer support
#ifndef SIMD_INT
#define SIMD_INT
#define ALIGN_INT AVX512_ALIGN_INT
#define VECSIZE_INT AVX512_VECSIZE_INT
typedef __m512i simd_int;
#define simdi32_add(x,y) _mm512_add_epi32(x,y)
#define simdi16_add(x,y) _mm512_add_epi16(x,y)
#define simdi16_adds(x,y) _mm512_adds_epi16(x,y)
#define simdui8_adds(x,y) _mm512_adds_epu8(x,y)
#define simdi32_sub(x,y) _mm512_sub_epi32(x,y)
#define simdui8_subs(x,y) _mm512_subs_epu8(x,y)
#define simdi32_mul(x,y) _mm512_mullo_epi32(x,y)
#define simdui8_max(x,y) _mm512_max_epu8(x,y)
#define simdi16_max(x,y) _mm512_max_epi16(x,y)
#define simdi32_max(x,y) _mm512_max_epi32(x,y)
#define simdi_load(x) _mm512_load_si512(x)
#define simdi_streamload(x) _mm512_stream_load_si512(x)
#define simdi_store(x,y) _mm512_store_si512(x,y)
#define simdi_storeu(x,y) _mm512_storeu_si512(x,y)
#define simdi32_set(x) _mm512_set1_epi32(x)
#define simdi16_set(x) _mm512_set1_epi16(x)
#define simdi8_set(x) _mm512_set1_epi8(x)
#define simdi32_shuffle(x,y) _mm512_shuffle_epi32(x,y)
#define simdi16_shuffle(x,y) NOT_YET_IMP(x,y)
#define simdi8_shuffle(x,y) _mm512_shuffle_epi8(x,y)
#define simdi_setzero() _mm512_setzero_si512()
#define simdi32_gt(x,y) _mm512_cmpgt_epi32(x,y)
#define simdi8_gt(x,y) NOT_YET_IMP()
#define simdi16_gt(x,y) NOT_YET_IMP()
#define simdi8_eq(x,y) NOT_YET_IMP()
#define simdi32_lt(x,y) NOT_YET_IMP()
#define simdi16_lt(x,y) NOT_YET_IMP()
#define simdi8_lt(x,y) NOT_YET_IMP()
#define simdi_or(x,y) _mm512_or_si512(x,y)
#define simdi_and(x,y) _mm512_and_si512(x,y)
#define simdi_andnot(x,y) _mm512_andnot_si512(x,y)
#define simdi_xor(x,y) _mm512_xor_si512(x,y)
#define simdi8_shiftl(x,y) NOT_YET_IMP()
#define simdi8_shiftr(x,y) NOT_YET_IMP()
#define simdi8_movemask(x) NOT_YET_IMP()
#define simdi16_extract(x,y) NOT_YET_IMP()
#define simdi16_slli(x,y) _mm512_slli_epi16(x,y) // shift 16-bit integers in x left by y bits
#define simdi16_srli(x,y) _mm512_srli_epi16(x,y) // shift 16-bit integers in x right by y bits
#define simdi32_slli(x,y) _mm512_slli_epi32(x,y) // shift 32-bit integers in x left by y bits
#define simdi32_srli(x,y) _mm512_srli_epi32(x,y) // shift 32-bit integers in x right by y bits
#define simdi32_i2f(x) _mm512_cvtepi32_ps(x) // convert integer to s.p. float
#define simdi_i2fcast(x) _mm512_castsi512_ps(x)
#endif //SIMD_INT
#endif //AVX512
#ifdef AVX2
#include <simde/x86/avx2.h>
// integer support (usable with AVX2)
#ifndef SIMD_INT
#define SIMD_INT
#define ALIGN_INT AVX2_ALIGN_INT
#define VECSIZE_INT AVX2_VECSIZE_INT
uint16_t simd_hmax16_sse(const __m128i buffer);
uint8_t simd_hmax8_sse(const __m128i buffer);
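// Horizontal max over a 256-bit vector: reduce each 128-bit lane with the SSE
// helpers (defined further below), then combine the two results in scalar code.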
inline uint16_t simd_hmax16_avx(const __m256i buffer) {
    const __m128i abcd = _mm256_castsi256_si128(buffer);
    const uint16_t first = simd_hmax16_sse(abcd);
    const __m128i efgh = _mm256_extracti128_si256(buffer, 1);
    const uint16_t second = simd_hmax16_sse(efgh);
    return std::max(first, second);
}
inline uint8_t simd_hmax8_avx(const __m256i buffer) {
    const __m128i abcd = _mm256_castsi256_si128(buffer);
    const uint8_t first = simd_hmax8_sse(abcd);
    const __m128i efgh = _mm256_extracti128_si256(buffer, 1);
    const uint8_t second = simd_hmax8_sse(efgh);
    return std::max(first, second);
}
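// Byte-shift a 256-bit vector left by N bytes (N < 16). _mm256_slli_si256 only
// shifts within each 128-bit lane, so the low lane is first permuted into the
// high lane's position and _mm256_alignr_epi8 stitches the lanes back together.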
template <unsigned int N>
inline __m256i _mm256_shift_left(__m256i a) {
    __m256i mask = _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0,0,3,0));
    return _mm256_alignr_epi8(a, mask, 16 - N);
}
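// _mm256_extract_epi16 requires a compile-time constant index, so a runtime
// position has to be dispatched through a switch over all sixteen immediates.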
inline unsigned short extract_epi16(__m256i v, int pos) {
    switch (pos) {
        case 0: return _mm256_extract_epi16(v, 0);
        case 1: return _mm256_extract_epi16(v, 1);
        case 2: return _mm256_extract_epi16(v, 2);
        case 3: return _mm256_extract_epi16(v, 3);
        case 4: return _mm256_extract_epi16(v, 4);
        case 5: return _mm256_extract_epi16(v, 5);
        case 6: return _mm256_extract_epi16(v, 6);
        case 7: return _mm256_extract_epi16(v, 7);
        case 8: return _mm256_extract_epi16(v, 8);
        case 9: return _mm256_extract_epi16(v, 9);
        case 10: return _mm256_extract_epi16(v, 10);
        case 11: return _mm256_extract_epi16(v, 11);
        case 12: return _mm256_extract_epi16(v, 12);
        case 13: return _mm256_extract_epi16(v, 13);
        case 14: return _mm256_extract_epi16(v, 14);
        case 15: return _mm256_extract_epi16(v, 15);
    }
    return 0;
}
typedef __m256i simd_int;
#define simdi32_add(x,y) _mm256_add_epi32(x,y)
#define simdi16_add(x,y) _mm256_add_epi16(x,y)
#define simdi16_adds(x,y) _mm256_adds_epi16(x,y)
#define simdui8_adds(x,y) _mm256_adds_epu8(x,y)
#define simdi32_sub(x,y) _mm256_sub_epi32(x,y)
#define simdui16_subs(x,y) _mm256_subs_epu16(x,y)
#define simdui8_subs(x,y) _mm256_subs_epu8(x,y)
#define simdi32_mul(x,y) _mm256_mullo_epi32(x,y)
#define simdi32_max(x,y) _mm256_max_epi32(x,y)
#define simdi16_max(x,y) _mm256_max_epi16(x,y)
#define simdi16_hmax(x) simd_hmax16_avx(x)
#define simdui8_max(x,y) _mm256_max_epu8(x,y)
#define simdi8_hmax(x) simd_hmax8_avx(x)
#define simdi_load(x) _mm256_load_si256(x)
#define simdi_loadu(x) _mm256_loadu_si256(x)
#define simdi_streamload(x) _mm256_stream_load_si256(x)
#define simdi_store(x,y) _mm256_store_si256(x,y)
#define simdi_storeu(x,y) _mm256_storeu_si256(x,y)
#define simdi32_set(x) _mm256_set1_epi32(x)
#define simdi16_set(x) _mm256_set1_epi16(x)
#define simdi8_set(x) _mm256_set1_epi8(x)
#define simdi32_shuffle(x,y) _mm256_shuffle_epi32(x,y)
#define simdi16_shuffle(x,y) NOT_YET_IMP(x,y) // no _mm256_shuffle_epi16 intrinsic exists
#define simdi8_shuffle(x,y) _mm256_shuffle_epi8(x,y)
#define simdi_setzero() _mm256_setzero_si256()
#define simdi32_gt(x,y) _mm256_cmpgt_epi32(x,y)
#define simdi8_gt(x,y) _mm256_cmpgt_epi8(x,y)
#define simdi16_gt(x,y) _mm256_cmpgt_epi16(x,y)
#define simdi8_eq(x,y) _mm256_cmpeq_epi8(x,y)
#define simdi16_eq(x,y) _mm256_cmpeq_epi16(x,y)
#define simdi32_eq(x,y) _mm256_cmpeq_epi32(x,y)
#define simdi32_lt(x,y) _mm256_cmpgt_epi32(y,x) // AVX2 has no cmplt; swap the operands
#define simdi16_lt(x,y) _mm256_cmpgt_epi16(y,x) // AVX2 has no cmplt; swap the operands
#define simdi8_lt(x,y) _mm256_cmpgt_epi8(y,x)
#define simdi_or(x,y) _mm256_or_si256(x,y)
#define simdi_and(x,y) _mm256_and_si256(x,y)
#define simdi_andnot(x,y) _mm256_andnot_si256(x,y)
#define simdi_xor(x,y) _mm256_xor_si256(x,y)
#define simdi8_shiftl(x,y) _mm256_shift_left<y>(x)
//TODO fix like shift_left (_mm256_srli_si256 shifts within each 128-bit lane only)
#define simdi8_shiftr(x,y) _mm256_srli_si256(x,y)
#define SIMD_MOVEMASK_MAX 0xffffffff
#define simdi8_movemask(x) _mm256_movemask_epi8(x)
#define simdi16_extract(x,y) extract_epi16(x,y)
#define simdi16_slli(x,y) _mm256_slli_epi16(x,y) // shift 16-bit integers in x left by y bits
#define simdi16_srli(x,y) _mm256_srli_epi16(x,y) // shift 16-bit integers in x right by y bits
#define simdi32_slli(x,y) _mm256_slli_epi32(x,y) // shift 32-bit integers in x left by y bits
#define simdi32_srli(x,y) _mm256_srli_epi32(x,y) // shift 32-bit integers in x right by y bits
#define simdi32_i2f(x) _mm256_cvtepi32_ps(x) // convert integer to s.p. float
#define simdi_i2fcast(x) _mm256_castsi256_ps(x)
#endif //SIMD_INT
#include <simde/x86/avx.h>
// double support (usable with AVX1)
#ifndef SIMD_DOUBLE
#define SIMD_DOUBLE
#define ALIGN_DOUBLE AVX_ALIGN_DOUBLE
#define VECSIZE_DOUBLE AVX_VECSIZE_DOUBLE
typedef __m256d simd_double;
#define simdf64_add(x,y) _mm256_add_pd(x,y)
#define simdf64_sub(x,y) _mm256_sub_pd(x,y)
#define simdf64_mul(x,y) _mm256_mul_pd(x,y)
#define simdf64_div(x,y) _mm256_div_pd(x,y)
#define simdf64_max(x,y) _mm256_max_pd(x,y)
#define simdf64_load(x) _mm256_load_pd(x)
#define simdf64_store(x,y) _mm256_store_pd(x,y)
#define simdf64_set(x) _mm256_set1_pd(x)
#define simdf64_setzero(x) _mm256_setzero_pd()
#define simdf64_gt(x,y) _mm256_cmp_pd(x,y,_CMP_GT_OS)
#define simdf64_lt(x,y) _mm256_cmp_pd(x,y,_CMP_LT_OS)
#define simdf64_or(x,y) _mm256_or_pd(x,y)
#define simdf64_and(x,y) _mm256_and_pd(x,y)
#define simdf64_andnot(x,y) _mm256_andnot_pd(x,y)
#define simdf64_xor(x,y) _mm256_xor_pd(x,y)
#endif //SIMD_DOUBLE
// float support (usable with AVX1)
#ifndef SIMD_FLOAT
#define SIMD_FLOAT
#define ALIGN_FLOAT AVX_ALIGN_FLOAT
#define VECSIZE_FLOAT AVX_VECSIZE_FLOAT
typedef __m256 simd_float;
#define simdf32_add(x,y) _mm256_add_ps(x,y)
#define simdf32_sub(x,y) _mm256_sub_ps(x,y)
#define simdf32_mul(x,y) _mm256_mul_ps(x,y)
#define simdf32_div(x,y) _mm256_div_ps(x,y)
#define simdf32_rcp(x) _mm256_rcp_ps(x)
#define simdf32_max(x,y) _mm256_max_ps(x,y)
#define simdf32_min(x,y) _mm256_min_ps(x,y)
#define simdf32_load(x) _mm256_load_ps(x)
#define simdf32_store(x,y) _mm256_store_ps(x,y)
#define simdf32_set(x) _mm256_set1_ps(x)
#define simdf32_setzero(x) _mm256_setzero_ps()
#define simdf32_gt(x,y) _mm256_cmp_ps(x,y,_CMP_GT_OS)
#define simdf32_eq(x,y) _mm256_cmp_ps(x,y,_CMP_EQ_OS)
#define simdf32_lt(x,y) _mm256_cmp_ps(x,y,_CMP_LT_OS)
#define simdf32_or(x,y) _mm256_or_ps(x,y)
#define simdf32_and(x,y) _mm256_and_ps(x,y)
#define simdf32_andnot(x,y) _mm256_andnot_ps(x,y)
#define simdf32_xor(x,y) _mm256_xor_ps(x,y)
#define simdf32_f2i(x) _mm256_cvtps_epi32(x) // convert s.p. float to integer
#define simdf_f2icast(x) _mm256_castps_si256(x) // compile time cast
#endif //SIMD_FLOAT
#endif //AVX2
#include <simde/x86/sse4.1.h>
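// Horizontal max via SSE4.1 _mm_minpos_epu16, the only horizontal position
// instruction in SSE: the values are inverted with a saturating subtract from
// the type maximum, so the minimum of the inverted values locates the maximum
// of the originals.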
inline uint16_t simd_hmax16_sse(const __m128i buffer) {
    // invert: tmp1[i] = 65535 - buffer[i]
    __m128i tmp1 = _mm_subs_epu16(_mm_set1_epi16((short)65535), buffer);
    __m128i tmp3 = _mm_minpos_epu16(tmp1);
    // low 16 bits of tmp3 hold the minimum; the index in the upper bits is
    // discarded by the cast
    return (uint16_t)(65535 - _mm_cvtsi128_si32(tmp3));
}
inline uint8_t simd_hmax8_sse(const __m128i buffer) {
    // invert: tmp1[i] = 255 - buffer[i]
    __m128i tmp1 = _mm_subs_epu8(_mm_set1_epi8((char)255), buffer);
    // fold each 16-bit lane to min(low byte, high byte) so minpos applies
    __m128i tmp2 = _mm_min_epu8(tmp1, _mm_srli_epi16(tmp1, 8));
    __m128i tmp3 = _mm_minpos_epu16(tmp2);
    return (uint8_t)(255 - (uint8_t)_mm_cvtsi128_si32(tmp3));
}
// double support
#ifndef SIMD_DOUBLE
#define SIMD_DOUBLE
#define ALIGN_DOUBLE SSE_ALIGN_DOUBLE
#define VECSIZE_DOUBLE SSE_VECSIZE_DOUBLE
typedef __m128d simd_double;
#define simdf64_add(x,y) _mm_add_pd(x,y)
#define simdf64_sub(x,y) _mm_sub_pd(x,y)
#define simdf64_mul(x,y) _mm_mul_pd(x,y)
#define simdf64_div(x,y) _mm_div_pd(x,y)
#define simdf64_max(x,y) _mm_max_pd(x,y)
#define simdf64_load(x) _mm_load_pd(x)
#define simdf64_store(x,y) _mm_store_pd(x,y)
#define simdf64_set(x) _mm_set1_pd(x)
#define simdf64_setzero(x) _mm_setzero_pd()
#define simdf64_gt(x,y) _mm_cmpgt_pd(x,y)
#define simdf64_lt(x,y) _mm_cmplt_pd(x,y)
#define simdf64_or(x,y) _mm_or_pd(x,y)
#define simdf64_and(x,y) _mm_and_pd(x,y)
#define simdf64_andnot(x,y) _mm_andnot_pd(x,y)
#define simdf64_xor(x,y) _mm_xor_pd(x,y)
#endif //SIMD_DOUBLE
// float support
#ifndef SIMD_FLOAT
#define SIMD_FLOAT
#define ALIGN_FLOAT SSE_ALIGN_FLOAT
#define VECSIZE_FLOAT SSE_VECSIZE_FLOAT
typedef __m128 simd_float;
#define simdf32_add(x,y) _mm_add_ps(x,y)
#define simdf32_sub(x,y) _mm_sub_ps(x,y)
#define simdf32_mul(x,y) _mm_mul_ps(x,y)
#define simdf32_div(x,y) _mm_div_ps(x,y)
#define simdf32_rcp(x) _mm_rcp_ps(x)
#define simdf32_max(x,y) _mm_max_ps(x,y)
#define simdf32_min(x,y) _mm_min_ps(x,y)
#define simdf32_load(x) _mm_load_ps(x)
#define simdf32_store(x,y) _mm_store_ps(x,y)
#define simdf32_set(x) _mm_set1_ps(x)
#define simdf32_setzero(x) _mm_setzero_ps()
#define simdf32_gt(x,y) _mm_cmpgt_ps(x,y)
#define simdf32_eq(x,y) _mm_cmpeq_ps(x,y)
#define simdf32_lt(x,y) _mm_cmplt_ps(x,y)
#define simdf32_or(x,y) _mm_or_ps(x,y)
#define simdf32_and(x,y) _mm_and_ps(x,y)
#define simdf32_andnot(x,y) _mm_andnot_ps(x,y)
#define simdf32_xor(x,y) _mm_xor_ps(x,y)
#define simdf32_f2i(x) _mm_cvtps_epi32(x) // convert s.p. float to integer
#define simdf_f2icast(x) _mm_castps_si128(x) // compile time cast
#endif //SIMD_FLOAT
// integer support
#ifndef SIMD_INT
#define SIMD_INT
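// Same immediate-operand workaround as the AVX2 extract_epi16 above, here for
// the eight 16-bit lanes of a 128-bit vector.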
inline unsigned short extract_epi16(__m128i v, int pos) {
    switch (pos) {
        case 0: return _mm_extract_epi16(v, 0);
        case 1: return _mm_extract_epi16(v, 1);
        case 2: return _mm_extract_epi16(v, 2);
        case 3: return _mm_extract_epi16(v, 3);
        case 4: return _mm_extract_epi16(v, 4);
        case 5: return _mm_extract_epi16(v, 5);
        case 6: return _mm_extract_epi16(v, 6);
        case 7: return _mm_extract_epi16(v, 7);
    }
    return 0;
}
#define ALIGN_INT SSE_ALIGN_INT
#define VECSIZE_INT SSE_VECSIZE_INT
typedef __m128i simd_int;
#define simdi32_add(x,y) _mm_add_epi32(x,y)
#define simdi16_add(x,y) _mm_add_epi16(x,y)
#define simdi16_adds(x,y) _mm_adds_epi16(x,y)
#define simdui8_adds(x,y) _mm_adds_epu8(x,y)
#define simdi32_sub(x,y) _mm_sub_epi32(x,y)
#define simdui16_subs(x,y) _mm_subs_epu16(x,y)
#define simdui8_subs(x,y) _mm_subs_epu8(x,y)
#define simdi32_mul(x,y) _mm_mullo_epi32(x,y) // SSE4.1
#define simdi32_max(x,y) _mm_max_epi32(x,y) // SSE4.1
#define simdi16_max(x,y) _mm_max_epi16(x,y)
#define simdi16_hmax(x) simd_hmax16_sse(x)
#define simdui8_max(x,y) _mm_max_epu8(x,y)
#define simdi8_hmax(x) simd_hmax8_sse(x)
#define simdi_load(x) _mm_load_si128(x)
#define simdi_loadu(x) _mm_loadu_si128(x)
#define simdi_streamload(x) _mm_stream_load_si128(x)
#define simdi_storeu(x,y) _mm_storeu_si128(x,y)
#define simdi_store(x,y) _mm_store_si128(x,y)
#define simdi32_set(x) _mm_set1_epi32(x)
#define simdi16_set(x) _mm_set1_epi16(x)
#define simdi8_set(x) _mm_set1_epi8(x)
#define simdi32_shuffle(x,y) _mm_shuffle_epi32(x,y)
#define simdi16_shuffle(x,y) NOT_YET_IMP(x,y) // no _mm_shuffle_epi16 intrinsic exists
#define simdi8_shuffle(x,y) _mm_shuffle_epi8(x,y)
#define simdi_setzero() _mm_setzero_si128()
#define simdi32_gt(x,y) _mm_cmpgt_epi32(x,y)
#define simdi8_gt(x,y) _mm_cmpgt_epi8(x,y)
#define simdi32_eq(x,y) _mm_cmpeq_epi32(x,y)
#define simdi16_eq(x,y) _mm_cmpeq_epi16(x,y)
#define simdi8_eq(x,y) _mm_cmpeq_epi8(x,y)
#define simdi32_lt(x,y) _mm_cmplt_epi32(x,y)
#define simdi16_lt(x,y) _mm_cmplt_epi16(x,y)
#define simdi8_lt(x,y) _mm_cmplt_epi8(x,y)
#define simdi16_gt(x,y) _mm_cmpgt_epi16(x,y)
#define simdi_or(x,y) _mm_or_si128(x,y)
#define simdi_and(x,y) _mm_and_si128(x,y)
#define simdi_andnot(x,y) _mm_andnot_si128(x,y)
#define simdi_xor(x,y) _mm_xor_si128(x,y)
#define simdi8_shiftl(x,y) _mm_slli_si128(x,y)
#define simdi8_shiftr(x,y) _mm_srli_si128(x,y)
#define SIMD_MOVEMASK_MAX 0xffff
#define simdi8_movemask(x) _mm_movemask_epi8(x)
#define simdi16_extract(x,y) extract_epi16(x,y)
#define simdi16_slli(x,y) _mm_slli_epi16(x,y) // shift 16-bit integers in x left by y bits
#define simdi16_srli(x,y) _mm_srli_epi16(x,y) // shift 16-bit integers in x right by y bits
#define simdi32_slli(x,y) _mm_slli_epi32(x,y) // shift 32-bit integers in x left by y bits
#define simdi32_srli(x,y) _mm_srli_epi32(x,y) // shift 32-bit integers in x right by y bits
#define simdi32_i2f(x) _mm_cvtepi32_ps(x) // convert integer to s.p. float
#define simdi_i2fcast(x) _mm_castsi128_ps(x)
#endif //SIMD_INT
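// Aligned allocation helper: wraps posix_memalign and aborts on failure, so
// callers never see a null pointer.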
inline void *mem_align(size_t boundary, size_t size) {
    void *pointer;
    if (posix_memalign(&pointer, boundary, size) != 0) {
#define MEM_ALIGN_ERROR "mem_align could not allocate memory.\n"
        // sizeof - 1: do not write the terminating NUL to stderr
        fwrite(MEM_ALIGN_ERROR, sizeof(MEM_ALIGN_ERROR) - 1, 1, stderr);
#undef MEM_ALIGN_ERROR
        exit(3);
    }
    return pointer;
}
#ifdef SIMD_FLOAT
inline simd_float * malloc_simd_float(const size_t size) {
    return (simd_float *) mem_align(ALIGN_FLOAT, size);
}
#endif
#ifdef SIMD_DOUBLE
inline simd_double * malloc_simd_double(const size_t size) {
    return (simd_double *) mem_align(ALIGN_DOUBLE, size);
}
#endif
#ifdef SIMD_INT
inline simd_int * malloc_simd_int(const size_t size) {
    return (simd_int *) mem_align(ALIGN_INT, size);
}
#endif
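// Usage sketch (illustrative only; buffer names and the size are made up):
// add two float arrays with whatever instruction set was selected above.
// Note that malloc_simd_float takes a size in bytes, and posix_memalign
// memory is released with free().
//
//   const size_t n = 1024; // number of floats, a multiple of VECSIZE_FLOAT
//   float *a = (float *) malloc_simd_float(n * sizeof(float));
//   float *b = (float *) malloc_simd_float(n * sizeof(float));
//   float *c = (float *) malloc_simd_float(n * sizeof(float));
//   for (size_t i = 0; i < n; i += VECSIZE_FLOAT) {
//       simd_float va = simdf32_load(a + i);
//       simd_float vb = simdf32_load(b + i);
//       simdf32_store(c + i, simdf32_add(va, vb));
//   }
//   free(a); free(b); free(c);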
#endif //SIMD_H