/// @ref simd
/// @file glm/simd/matrix.h

#pragma once

#include "geometric.h"

#if GLM_ARCH & GLM_ARCH_SSE2_BIT
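
// Component-wise (Hadamard) product of two 4x4 matrices held as four
// column registers: out[i] = in1[i] * in2[i], lane by lane.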
GLM_FUNC_QUALIFIER void glm_mat4_matrixCompMult(glm_vec4 const in1[4], glm_vec4 const in2[4], glm_vec4 out[4])
{
	out[0] = _mm_mul_ps(in1[0], in2[0]);
	out[1] = _mm_mul_ps(in1[1], in2[1]);
	out[2] = _mm_mul_ps(in1[2], in2[2]);
	out[3] = _mm_mul_ps(in1[3], in2[3]);
}
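
// Component-wise matrix addition: out = in1 + in2, one column per register.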
GLM_FUNC_QUALIFIER void glm_mat4_add(glm_vec4 const in1[4], glm_vec4 const in2[4], glm_vec4 out[4])
{
	out[0] = _mm_add_ps(in1[0], in2[0]);
	out[1] = _mm_add_ps(in1[1], in2[1]);
	out[2] = _mm_add_ps(in1[2], in2[2]);
	out[3] = _mm_add_ps(in1[3], in2[3]);
}
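
// Component-wise matrix subtraction: out = in1 - in2, one column per register.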
GLM_FUNC_QUALIFIER void glm_mat4_sub(glm_vec4 const in1[4], glm_vec4 const in2[4], glm_vec4 out[4])
{
	out[0] = _mm_sub_ps(in1[0], in2[0]);
	out[1] = _mm_sub_ps(in1[1], in2[1]);
	out[2] = _mm_sub_ps(in1[2], in2[2]);
	out[3] = _mm_sub_ps(in1[3], in2[3]);
}
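
// Matrix * column vector. Each component of v is splatted across a register
// and the result is accumulated as a linear combination of the columns:
// m[0]*v.x + m[1]*v.y + m[2]*v.z + m[3]*v.w.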
GLM_FUNC_QUALIFIER glm_vec4 glm_mat4_mul_vec4(glm_vec4 const m[4], glm_vec4 v)
{
	__m128 v0 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 v1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
	__m128 v2 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));
	__m128 v3 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));

	__m128 m0 = _mm_mul_ps(m[0], v0);
	__m128 m1 = _mm_mul_ps(m[1], v1);
	__m128 m2 = _mm_mul_ps(m[2], v2);
	__m128 m3 = _mm_mul_ps(m[3], v3);

	__m128 a0 = _mm_add_ps(m0, m1);
	__m128 a1 = _mm_add_ps(m2, m3);
	__m128 a2 = _mm_add_ps(a0, a1);

	return a2;
}
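
// Row vector * matrix. The four per-column products are reduced with
// unpack/movelh/movehl shuffles into per-lane horizontal sums, so lane i of
// the result is the dot product of v with column m[i].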
GLM_FUNC_QUALIFIER __m128 glm_vec4_mul_mat4(glm_vec4 v, glm_vec4 const m[4])
{
	__m128 i0 = m[0];
	__m128 i1 = m[1];
	__m128 i2 = m[2];
	__m128 i3 = m[3];

	__m128 m0 = _mm_mul_ps(v, i0);
	__m128 m1 = _mm_mul_ps(v, i1);
	__m128 m2 = _mm_mul_ps(v, i2);
	__m128 m3 = _mm_mul_ps(v, i3);

	__m128 u0 = _mm_unpacklo_ps(m0, m1);
	__m128 u1 = _mm_unpackhi_ps(m0, m1);
	__m128 a0 = _mm_add_ps(u0, u1);

	__m128 u2 = _mm_unpacklo_ps(m2, m3);
	__m128 u3 = _mm_unpackhi_ps(m2, m3);
	__m128 a1 = _mm_add_ps(u2, u3);

	__m128 f0 = _mm_movelh_ps(a0, a1);
	__m128 f1 = _mm_movehl_ps(a1, a0);
	__m128 f2 = _mm_add_ps(f0, f1);

	return f2;
}
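
// 4x4 matrix product. Each output column is in1 * in2[c], computed with the
// same splat-multiply-accumulate pattern as glm_mat4_mul_vec4, applied once
// per column of in2.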
GLM_FUNC_QUALIFIER void glm_mat4_mul(glm_vec4 const in1[4], glm_vec4 const in2[4], glm_vec4 out[4])
{
	{
		__m128 e0 = _mm_shuffle_ps(in2[0], in2[0], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 e1 = _mm_shuffle_ps(in2[0], in2[0], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 e2 = _mm_shuffle_ps(in2[0], in2[0], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 e3 = _mm_shuffle_ps(in2[0], in2[0], _MM_SHUFFLE(3, 3, 3, 3));

		__m128 m0 = _mm_mul_ps(in1[0], e0);
		__m128 m1 = _mm_mul_ps(in1[1], e1);
		__m128 m2 = _mm_mul_ps(in1[2], e2);
		__m128 m3 = _mm_mul_ps(in1[3], e3);

		__m128 a0 = _mm_add_ps(m0, m1);
		__m128 a1 = _mm_add_ps(m2, m3);
		__m128 a2 = _mm_add_ps(a0, a1);

		out[0] = a2;
	}

	{
		__m128 e0 = _mm_shuffle_ps(in2[1], in2[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 e1 = _mm_shuffle_ps(in2[1], in2[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 e2 = _mm_shuffle_ps(in2[1], in2[1], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 e3 = _mm_shuffle_ps(in2[1], in2[1], _MM_SHUFFLE(3, 3, 3, 3));

		__m128 m0 = _mm_mul_ps(in1[0], e0);
		__m128 m1 = _mm_mul_ps(in1[1], e1);
		__m128 m2 = _mm_mul_ps(in1[2], e2);
		__m128 m3 = _mm_mul_ps(in1[3], e3);

		__m128 a0 = _mm_add_ps(m0, m1);
		__m128 a1 = _mm_add_ps(m2, m3);
		__m128 a2 = _mm_add_ps(a0, a1);

		out[1] = a2;
	}

	{
		__m128 e0 = _mm_shuffle_ps(in2[2], in2[2], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 e1 = _mm_shuffle_ps(in2[2], in2[2], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 e2 = _mm_shuffle_ps(in2[2], in2[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 e3 = _mm_shuffle_ps(in2[2], in2[2], _MM_SHUFFLE(3, 3, 3, 3));

		__m128 m0 = _mm_mul_ps(in1[0], e0);
		__m128 m1 = _mm_mul_ps(in1[1], e1);
		__m128 m2 = _mm_mul_ps(in1[2], e2);
		__m128 m3 = _mm_mul_ps(in1[3], e3);

		__m128 a0 = _mm_add_ps(m0, m1);
		__m128 a1 = _mm_add_ps(m2, m3);
		__m128 a2 = _mm_add_ps(a0, a1);

		out[2] = a2;
	}

	{
		//(__m128&)_mm_shuffle_epi32(__m128i&)in2[0], _MM_SHUFFLE(3, 3, 3, 3))
		__m128 e0 = _mm_shuffle_ps(in2[3], in2[3], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 e1 = _mm_shuffle_ps(in2[3], in2[3], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 e2 = _mm_shuffle_ps(in2[3], in2[3], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 e3 = _mm_shuffle_ps(in2[3], in2[3], _MM_SHUFFLE(3, 3, 3, 3));

		__m128 m0 = _mm_mul_ps(in1[0], e0);
		__m128 m1 = _mm_mul_ps(in1[1], e1);
		__m128 m2 = _mm_mul_ps(in1[2], e2);
		__m128 m3 = _mm_mul_ps(in1[3], e3);

		__m128 a0 = _mm_add_ps(m0, m1);
		__m128 a1 = _mm_add_ps(m2, m3);
		__m128 a2 = _mm_add_ps(a0, a1);

		out[3] = a2;
	}
}
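
// 4x4 transpose using the classic pair-interleave pattern. The hex masks are
// _MM_SHUFFLE values: 0x44 = (1,0,1,0), 0xEE = (3,2,3,2), 0x88 = (2,0,2,0),
// 0xDD = (3,1,3,1).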
GLM_FUNC_QUALIFIER void glm_mat4_transpose(glm_vec4 const in[4], glm_vec4 out[4])
{
	__m128 tmp0 = _mm_shuffle_ps(in[0], in[1], 0x44);
	__m128 tmp2 = _mm_shuffle_ps(in[0], in[1], 0xEE);
	__m128 tmp1 = _mm_shuffle_ps(in[2], in[3], 0x44);
	__m128 tmp3 = _mm_shuffle_ps(in[2], in[3], 0xEE);

	out[0] = _mm_shuffle_ps(tmp0, tmp1, 0x88);
	out[1] = _mm_shuffle_ps(tmp0, tmp1, 0xDD);
	out[2] = _mm_shuffle_ps(tmp2, tmp3, 0x88);
	out[3] = _mm_shuffle_ps(tmp2, tmp3, 0xDD);
}
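
// Determinant by cofactor expansion along the first column. Fac0..Fac5 hold
// the 2x2 sub-determinants (the SubFactor values in the comments); the result
// comes back from glm_vec4_dot, which replicates the dot product across all
// four lanes.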
GLM_FUNC_QUALIFIER glm_vec4 glm_mat4_determinant_highp(glm_vec4 const in[4])
{
	__m128 Fac0;
	{
		// valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
		// valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
		// valType SubFactor06 = m[1][2] * m[3][3] - m[3][2] * m[1][3];
		// valType SubFactor13 = m[1][2] * m[2][3] - m[2][2] * m[1][3];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac0 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac1;
	{
		// valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
		// valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
		// valType SubFactor07 = m[1][1] * m[3][3] - m[3][1] * m[1][3];
		// valType SubFactor14 = m[1][1] * m[2][3] - m[2][1] * m[1][3];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac1 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac2;
	{
		// valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
		// valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
		// valType SubFactor08 = m[1][1] * m[3][2] - m[3][1] * m[1][2];
		// valType SubFactor15 = m[1][1] * m[2][2] - m[2][1] * m[1][2];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac2 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac3;
	{
		// valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
		// valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
		// valType SubFactor09 = m[1][0] * m[3][3] - m[3][0] * m[1][3];
		// valType SubFactor16 = m[1][0] * m[2][3] - m[2][0] * m[1][3];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac3 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac4;
	{
		// valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
		// valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
		// valType SubFactor10 = m[1][0] * m[3][2] - m[3][0] * m[1][2];
		// valType SubFactor17 = m[1][0] * m[2][2] - m[2][0] * m[1][2];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac4 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac5;
	{
		// valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
		// valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
		// valType SubFactor12 = m[1][0] * m[3][1] - m[3][0] * m[1][1];
		// valType SubFactor18 = m[1][0] * m[2][1] - m[2][0] * m[1][1];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac5 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 SignA = _mm_set_ps( 1.0f,-1.0f, 1.0f,-1.0f);
	__m128 SignB = _mm_set_ps(-1.0f, 1.0f,-1.0f, 1.0f);

	// m[1][0]
	// m[0][0]
	// m[0][0]
	// m[0][0]
	__m128 Temp0 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Vec0 = _mm_shuffle_ps(Temp0, Temp0, _MM_SHUFFLE(2, 2, 2, 0));

	// m[1][1]
	// m[0][1]
	// m[0][1]
	// m[0][1]
	__m128 Temp1 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(1, 1, 1, 1));
	__m128 Vec1 = _mm_shuffle_ps(Temp1, Temp1, _MM_SHUFFLE(2, 2, 2, 0));

	// m[1][2]
	// m[0][2]
	// m[0][2]
	// m[0][2]
	__m128 Temp2 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(2, 2, 2, 2));
	__m128 Vec2 = _mm_shuffle_ps(Temp2, Temp2, _MM_SHUFFLE(2, 2, 2, 0));

	// m[1][3]
	// m[0][3]
	// m[0][3]
	// m[0][3]
	__m128 Temp3 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(3, 3, 3, 3));
	__m128 Vec3 = _mm_shuffle_ps(Temp3, Temp3, _MM_SHUFFLE(2, 2, 2, 0));

	// col0
	// + (Vec1[0] * Fac0[0] - Vec2[0] * Fac1[0] + Vec3[0] * Fac2[0]),
	// - (Vec1[1] * Fac0[1] - Vec2[1] * Fac1[1] + Vec3[1] * Fac2[1]),
	// + (Vec1[2] * Fac0[2] - Vec2[2] * Fac1[2] + Vec3[2] * Fac2[2]),
	// - (Vec1[3] * Fac0[3] - Vec2[3] * Fac1[3] + Vec3[3] * Fac2[3]),
	__m128 Mul00 = _mm_mul_ps(Vec1, Fac0);
	__m128 Mul01 = _mm_mul_ps(Vec2, Fac1);
	__m128 Mul02 = _mm_mul_ps(Vec3, Fac2);
	__m128 Sub00 = _mm_sub_ps(Mul00, Mul01);
	__m128 Add00 = _mm_add_ps(Sub00, Mul02);
	__m128 Inv0 = _mm_mul_ps(SignB, Add00);

	// col1
	// - (Vec0[0] * Fac0[0] - Vec2[0] * Fac3[0] + Vec3[0] * Fac4[0]),
	// + (Vec0[1] * Fac0[1] - Vec2[1] * Fac3[1] + Vec3[1] * Fac4[1]),
	// - (Vec0[2] * Fac0[2] - Vec2[2] * Fac3[2] + Vec3[2] * Fac4[2]),
	// + (Vec0[3] * Fac0[3] - Vec2[3] * Fac3[3] + Vec3[3] * Fac4[3]),
	__m128 Mul03 = _mm_mul_ps(Vec0, Fac0);
	__m128 Mul04 = _mm_mul_ps(Vec2, Fac3);
	__m128 Mul05 = _mm_mul_ps(Vec3, Fac4);
	__m128 Sub01 = _mm_sub_ps(Mul03, Mul04);
	__m128 Add01 = _mm_add_ps(Sub01, Mul05);
	__m128 Inv1 = _mm_mul_ps(SignA, Add01);

	// col2
	// + (Vec0[0] * Fac1[0] - Vec1[0] * Fac3[0] + Vec3[0] * Fac5[0]),
	// - (Vec0[1] * Fac1[1] - Vec1[1] * Fac3[1] + Vec3[1] * Fac5[1]),
	// + (Vec0[2] * Fac1[2] - Vec1[2] * Fac3[2] + Vec3[2] * Fac5[2]),
	// - (Vec0[3] * Fac1[3] - Vec1[3] * Fac3[3] + Vec3[3] * Fac5[3]),
	__m128 Mul06 = _mm_mul_ps(Vec0, Fac1);
	__m128 Mul07 = _mm_mul_ps(Vec1, Fac3);
	__m128 Mul08 = _mm_mul_ps(Vec3, Fac5);
	__m128 Sub02 = _mm_sub_ps(Mul06, Mul07);
	__m128 Add02 = _mm_add_ps(Sub02, Mul08);
	__m128 Inv2 = _mm_mul_ps(SignB, Add02);

	// col3
	// - (Vec0[0] * Fac2[0] - Vec1[0] * Fac4[0] + Vec2[0] * Fac5[0]),
	// + (Vec0[1] * Fac2[1] - Vec1[1] * Fac4[1] + Vec2[1] * Fac5[1]),
	// - (Vec0[2] * Fac2[2] - Vec1[2] * Fac4[2] + Vec2[2] * Fac5[2]),
	// + (Vec0[3] * Fac2[3] - Vec1[3] * Fac4[3] + Vec2[3] * Fac5[3]));
	__m128 Mul09 = _mm_mul_ps(Vec0, Fac2);
	__m128 Mul10 = _mm_mul_ps(Vec1, Fac4);
	__m128 Mul11 = _mm_mul_ps(Vec2, Fac5);
	__m128 Sub03 = _mm_sub_ps(Mul09, Mul10);
	__m128 Add03 = _mm_add_ps(Sub03, Mul11);
	__m128 Inv3 = _mm_mul_ps(SignA, Add03);

	__m128 Row0 = _mm_shuffle_ps(Inv0, Inv1, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Row1 = _mm_shuffle_ps(Inv2, Inv3, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Row2 = _mm_shuffle_ps(Row0, Row1, _MM_SHUFFLE(2, 0, 2, 0));

	// valType Determinant = m[0][0] * Inverse[0][0]
	//                     + m[0][1] * Inverse[1][0]
	//                     + m[0][2] * Inverse[2][0]
	//                     + m[0][3] * Inverse[3][0];
	__m128 Det0 = glm_vec4_dot(in[0], Row2);
	return Det0;
}
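
// Same cofactor expansion as glm_mat4_determinant below, but single-register
// swizzles are done with _mm_shuffle_epi32 through bit-preserving
// _mm_castps_si128/_mm_castsi128_ps casts instead of _mm_shuffle_ps; the
// casts do not alter any lane values.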
GLM_FUNC_QUALIFIER glm_vec4 glm_mat4_determinant_lowp(glm_vec4 const m[4])
{
	// _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(

	//T SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
	//T SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
	//T SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
	//T SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
	//T SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
	//T SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];

	// First 2 columns
	__m128 Swp2A = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[2]), _MM_SHUFFLE(0, 1, 1, 2)));
	__m128 Swp3A = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[3]), _MM_SHUFFLE(3, 2, 3, 3)));
	__m128 MulA = _mm_mul_ps(Swp2A, Swp3A);

	// Second 2 columns
	__m128 Swp2B = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[2]), _MM_SHUFFLE(3, 2, 3, 3)));
	__m128 Swp3B = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[3]), _MM_SHUFFLE(0, 1, 1, 2)));
	__m128 MulB = _mm_mul_ps(Swp2B, Swp3B);

	// Columns subtraction
	__m128 SubE = _mm_sub_ps(MulA, MulB);

	// Last 2 rows
	__m128 Swp2C = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[2]), _MM_SHUFFLE(0, 0, 1, 2)));
	__m128 Swp3C = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[3]), _MM_SHUFFLE(1, 2, 0, 0)));
	__m128 MulC = _mm_mul_ps(Swp2C, Swp3C);
	__m128 SubF = _mm_sub_ps(_mm_movehl_ps(MulC, MulC), MulC);

	//vec<4, T, Q> DetCof(
	//	+ (m[1][1] * SubFactor00 - m[1][2] * SubFactor01 + m[1][3] * SubFactor02),
	//	- (m[1][0] * SubFactor00 - m[1][2] * SubFactor03 + m[1][3] * SubFactor04),
	//	+ (m[1][0] * SubFactor01 - m[1][1] * SubFactor03 + m[1][3] * SubFactor05),
	//	- (m[1][0] * SubFactor02 - m[1][1] * SubFactor04 + m[1][2] * SubFactor05));
	__m128 SubFacA = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(SubE), _MM_SHUFFLE(2, 1, 0, 0)));
	__m128 SwpFacA = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[1]), _MM_SHUFFLE(0, 0, 0, 1)));
	__m128 MulFacA = _mm_mul_ps(SwpFacA, SubFacA);

	__m128 SubTmpB = _mm_shuffle_ps(SubE, SubF, _MM_SHUFFLE(0, 0, 3, 1));
	__m128 SubFacB = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(SubTmpB), _MM_SHUFFLE(3, 1, 1, 0)));//SubF[0], SubE[3], SubE[3], SubE[1];
	__m128 SwpFacB = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[1]), _MM_SHUFFLE(1, 1, 2, 2)));
	__m128 MulFacB = _mm_mul_ps(SwpFacB, SubFacB);

	__m128 SubRes = _mm_sub_ps(MulFacA, MulFacB);

	__m128 SubTmpC = _mm_shuffle_ps(SubE, SubF, _MM_SHUFFLE(1, 0, 2, 2));
	__m128 SubFacC = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(SubTmpC), _MM_SHUFFLE(3, 3, 2, 0)));
	__m128 SwpFacC = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[1]), _MM_SHUFFLE(2, 3, 3, 3)));
	__m128 MulFacC = _mm_mul_ps(SwpFacC, SubFacC);

	__m128 AddRes = _mm_add_ps(SubRes, MulFacC);
	__m128 DetCof = _mm_mul_ps(AddRes, _mm_setr_ps( 1.0f,-1.0f, 1.0f,-1.0f));

	//return m[0][0] * DetCof[0]
	//	 + m[0][1] * DetCof[1]
	//	 + m[0][2] * DetCof[2]
	//	 + m[0][3] * DetCof[3];
	return glm_vec4_dot(m[0], DetCof);
}
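
// Default determinant path: the same algorithm as the _lowp variant above,
// using only _mm_shuffle_ps swizzles in the floating-point domain.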
GLM_FUNC_QUALIFIER glm_vec4 glm_mat4_determinant(glm_vec4 const m[4])
{
	// _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(add)

	//T SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
	//T SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
	//T SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
	//T SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
	//T SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
	//T SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];

	// First 2 columns
	__m128 Swp2A = _mm_shuffle_ps(m[2], m[2], _MM_SHUFFLE(0, 1, 1, 2));
	__m128 Swp3A = _mm_shuffle_ps(m[3], m[3], _MM_SHUFFLE(3, 2, 3, 3));
	__m128 MulA = _mm_mul_ps(Swp2A, Swp3A);

	// Second 2 columns
	__m128 Swp2B = _mm_shuffle_ps(m[2], m[2], _MM_SHUFFLE(3, 2, 3, 3));
	__m128 Swp3B = _mm_shuffle_ps(m[3], m[3], _MM_SHUFFLE(0, 1, 1, 2));
	__m128 MulB = _mm_mul_ps(Swp2B, Swp3B);

	// Columns subtraction
	__m128 SubE = _mm_sub_ps(MulA, MulB);

	// Last 2 rows
	__m128 Swp2C = _mm_shuffle_ps(m[2], m[2], _MM_SHUFFLE(0, 0, 1, 2));
	__m128 Swp3C = _mm_shuffle_ps(m[3], m[3], _MM_SHUFFLE(1, 2, 0, 0));
	__m128 MulC = _mm_mul_ps(Swp2C, Swp3C);
	__m128 SubF = _mm_sub_ps(_mm_movehl_ps(MulC, MulC), MulC);

	//vec<4, T, Q> DetCof(
	//	+ (m[1][1] * SubFactor00 - m[1][2] * SubFactor01 + m[1][3] * SubFactor02),
	//	- (m[1][0] * SubFactor00 - m[1][2] * SubFactor03 + m[1][3] * SubFactor04),
	//	+ (m[1][0] * SubFactor01 - m[1][1] * SubFactor03 + m[1][3] * SubFactor05),
	//	- (m[1][0] * SubFactor02 - m[1][1] * SubFactor04 + m[1][2] * SubFactor05));
	__m128 SubFacA = _mm_shuffle_ps(SubE, SubE, _MM_SHUFFLE(2, 1, 0, 0));
	__m128 SwpFacA = _mm_shuffle_ps(m[1], m[1], _MM_SHUFFLE(0, 0, 0, 1));
	__m128 MulFacA = _mm_mul_ps(SwpFacA, SubFacA);

	__m128 SubTmpB = _mm_shuffle_ps(SubE, SubF, _MM_SHUFFLE(0, 0, 3, 1));
	__m128 SubFacB = _mm_shuffle_ps(SubTmpB, SubTmpB, _MM_SHUFFLE(3, 1, 1, 0));//SubF[0], SubE[3], SubE[3], SubE[1];
	__m128 SwpFacB = _mm_shuffle_ps(m[1], m[1], _MM_SHUFFLE(1, 1, 2, 2));
	__m128 MulFacB = _mm_mul_ps(SwpFacB, SubFacB);

	__m128 SubRes = _mm_sub_ps(MulFacA, MulFacB);

	__m128 SubTmpC = _mm_shuffle_ps(SubE, SubF, _MM_SHUFFLE(1, 0, 2, 2));
	__m128 SubFacC = _mm_shuffle_ps(SubTmpC, SubTmpC, _MM_SHUFFLE(3, 3, 2, 0));
	__m128 SwpFacC = _mm_shuffle_ps(m[1], m[1], _MM_SHUFFLE(2, 3, 3, 3));
	__m128 MulFacC = _mm_mul_ps(SwpFacC, SubFacC);

	__m128 AddRes = _mm_add_ps(SubRes, MulFacC);
	__m128 DetCof = _mm_mul_ps(AddRes, _mm_setr_ps( 1.0f,-1.0f, 1.0f,-1.0f));

	//return m[0][0] * DetCof[0]
	//	 + m[0][1] * DetCof[1]
	//	 + m[0][2] * DetCof[2]
	//	 + m[0][3] * DetCof[3];
	return glm_vec4_dot(m[0], DetCof);
}
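
// Full 4x4 inverse via the adjugate (Cramer's rule): Inv0..Inv3 are the
// cofactor columns, and the final divide by the determinant uses an exact
// _mm_div_ps, unlike the _lowp variant below.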
GLM_FUNC_QUALIFIER void glm_mat4_inverse(glm_vec4 const in[4], glm_vec4 out[4])
{
	__m128 Fac0;
	{
		// valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
		// valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
		// valType SubFactor06 = m[1][2] * m[3][3] - m[3][2] * m[1][3];
		// valType SubFactor13 = m[1][2] * m[2][3] - m[2][2] * m[1][3];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac0 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac1;
	{
		// valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
		// valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
		// valType SubFactor07 = m[1][1] * m[3][3] - m[3][1] * m[1][3];
		// valType SubFactor14 = m[1][1] * m[2][3] - m[2][1] * m[1][3];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac1 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac2;
	{
		// valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
		// valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
		// valType SubFactor08 = m[1][1] * m[3][2] - m[3][1] * m[1][2];
		// valType SubFactor15 = m[1][1] * m[2][2] - m[2][1] * m[1][2];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac2 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac3;
	{
		// valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
		// valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
		// valType SubFactor09 = m[1][0] * m[3][3] - m[3][0] * m[1][3];
		// valType SubFactor16 = m[1][0] * m[2][3] - m[2][0] * m[1][3];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac3 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac4;
	{
		// valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
		// valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
		// valType SubFactor10 = m[1][0] * m[3][2] - m[3][0] * m[1][2];
		// valType SubFactor17 = m[1][0] * m[2][2] - m[2][0] * m[1][2];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac4 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac5;
	{
		// valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
		// valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
		// valType SubFactor12 = m[1][0] * m[3][1] - m[3][0] * m[1][1];
		// valType SubFactor18 = m[1][0] * m[2][1] - m[2][0] * m[1][1];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac5 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 SignA = _mm_set_ps( 1.0f,-1.0f, 1.0f,-1.0f);
	__m128 SignB = _mm_set_ps(-1.0f, 1.0f,-1.0f, 1.0f);

	// m[1][0]
	// m[0][0]
	// m[0][0]
	// m[0][0]
	__m128 Temp0 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Vec0 = _mm_shuffle_ps(Temp0, Temp0, _MM_SHUFFLE(2, 2, 2, 0));

	// m[1][1]
	// m[0][1]
	// m[0][1]
	// m[0][1]
	__m128 Temp1 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(1, 1, 1, 1));
	__m128 Vec1 = _mm_shuffle_ps(Temp1, Temp1, _MM_SHUFFLE(2, 2, 2, 0));

	// m[1][2]
	// m[0][2]
	// m[0][2]
	// m[0][2]
	__m128 Temp2 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(2, 2, 2, 2));
	__m128 Vec2 = _mm_shuffle_ps(Temp2, Temp2, _MM_SHUFFLE(2, 2, 2, 0));

	// m[1][3]
	// m[0][3]
	// m[0][3]
	// m[0][3]
	__m128 Temp3 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(3, 3, 3, 3));
	__m128 Vec3 = _mm_shuffle_ps(Temp3, Temp3, _MM_SHUFFLE(2, 2, 2, 0));

	// col0
	// + (Vec1[0] * Fac0[0] - Vec2[0] * Fac1[0] + Vec3[0] * Fac2[0]),
	// - (Vec1[1] * Fac0[1] - Vec2[1] * Fac1[1] + Vec3[1] * Fac2[1]),
	// + (Vec1[2] * Fac0[2] - Vec2[2] * Fac1[2] + Vec3[2] * Fac2[2]),
	// - (Vec1[3] * Fac0[3] - Vec2[3] * Fac1[3] + Vec3[3] * Fac2[3]),
	__m128 Mul00 = _mm_mul_ps(Vec1, Fac0);
	__m128 Mul01 = _mm_mul_ps(Vec2, Fac1);
	__m128 Mul02 = _mm_mul_ps(Vec3, Fac2);
	__m128 Sub00 = _mm_sub_ps(Mul00, Mul01);
	__m128 Add00 = _mm_add_ps(Sub00, Mul02);
	__m128 Inv0 = _mm_mul_ps(SignB, Add00);

	// col1
	// - (Vec0[0] * Fac0[0] - Vec2[0] * Fac3[0] + Vec3[0] * Fac4[0]),
	// + (Vec0[1] * Fac0[1] - Vec2[1] * Fac3[1] + Vec3[1] * Fac4[1]),
	// - (Vec0[2] * Fac0[2] - Vec2[2] * Fac3[2] + Vec3[2] * Fac4[2]),
	// + (Vec0[3] * Fac0[3] - Vec2[3] * Fac3[3] + Vec3[3] * Fac4[3]),
	__m128 Mul03 = _mm_mul_ps(Vec0, Fac0);
	__m128 Mul04 = _mm_mul_ps(Vec2, Fac3);
	__m128 Mul05 = _mm_mul_ps(Vec3, Fac4);
	__m128 Sub01 = _mm_sub_ps(Mul03, Mul04);
	__m128 Add01 = _mm_add_ps(Sub01, Mul05);
	__m128 Inv1 = _mm_mul_ps(SignA, Add01);

	// col2
	// + (Vec0[0] * Fac1[0] - Vec1[0] * Fac3[0] + Vec3[0] * Fac5[0]),
	// - (Vec0[1] * Fac1[1] - Vec1[1] * Fac3[1] + Vec3[1] * Fac5[1]),
	// + (Vec0[2] * Fac1[2] - Vec1[2] * Fac3[2] + Vec3[2] * Fac5[2]),
	// - (Vec0[3] * Fac1[3] - Vec1[3] * Fac3[3] + Vec3[3] * Fac5[3]),
	__m128 Mul06 = _mm_mul_ps(Vec0, Fac1);
	__m128 Mul07 = _mm_mul_ps(Vec1, Fac3);
	__m128 Mul08 = _mm_mul_ps(Vec3, Fac5);
	__m128 Sub02 = _mm_sub_ps(Mul06, Mul07);
	__m128 Add02 = _mm_add_ps(Sub02, Mul08);
	__m128 Inv2 = _mm_mul_ps(SignB, Add02);

	// col3
	// - (Vec0[0] * Fac2[0] - Vec1[0] * Fac4[0] + Vec2[0] * Fac5[0]),
	// + (Vec0[1] * Fac2[1] - Vec1[1] * Fac4[1] + Vec2[1] * Fac5[1]),
	// - (Vec0[2] * Fac2[2] - Vec1[2] * Fac4[2] + Vec2[2] * Fac5[2]),
	// + (Vec0[3] * Fac2[3] - Vec1[3] * Fac4[3] + Vec2[3] * Fac5[3]));
	__m128 Mul09 = _mm_mul_ps(Vec0, Fac2);
	__m128 Mul10 = _mm_mul_ps(Vec1, Fac4);
	__m128 Mul11 = _mm_mul_ps(Vec2, Fac5);
	__m128 Sub03 = _mm_sub_ps(Mul09, Mul10);
	__m128 Add03 = _mm_add_ps(Sub03, Mul11);
	__m128 Inv3 = _mm_mul_ps(SignA, Add03);

	__m128 Row0 = _mm_shuffle_ps(Inv0, Inv1, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Row1 = _mm_shuffle_ps(Inv2, Inv3, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Row2 = _mm_shuffle_ps(Row0, Row1, _MM_SHUFFLE(2, 0, 2, 0));

	// valType Determinant = m[0][0] * Inverse[0][0]
	//                     + m[0][1] * Inverse[1][0]
	//                     + m[0][2] * Inverse[2][0]
	//                     + m[0][3] * Inverse[3][0];
	__m128 Det0 = glm_vec4_dot(in[0], Row2);
	__m128 Rcp0 = _mm_div_ps(_mm_set1_ps(1.0f), Det0);
	//__m128 Rcp0 = _mm_rcp_ps(Det0);

	// Inverse /= Determinant;
	out[0] = _mm_mul_ps(Inv0, Rcp0);
	out[1] = _mm_mul_ps(Inv1, Rcp0);
	out[2] = _mm_mul_ps(Inv2, Rcp0);
	out[3] = _mm_mul_ps(Inv3, Rcp0);
}
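
// Same adjugate-based inverse, but the reciprocal of the determinant is
// approximated with _mm_rcp_ps (roughly 12 bits of precision), trading
// accuracy for the latency of a full divide.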
GLM_FUNC_QUALIFIER void glm_mat4_inverse_lowp(glm_vec4 const in[4], glm_vec4 out[4])
{
	__m128 Fac0;
	{
		// valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
		// valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
		// valType SubFactor06 = m[1][2] * m[3][3] - m[3][2] * m[1][3];
		// valType SubFactor13 = m[1][2] * m[2][3] - m[2][2] * m[1][3];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac0 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac1;
	{
		// valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
		// valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
		// valType SubFactor07 = m[1][1] * m[3][3] - m[3][1] * m[1][3];
		// valType SubFactor14 = m[1][1] * m[2][3] - m[2][1] * m[1][3];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac1 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac2;
	{
		// valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
		// valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
		// valType SubFactor08 = m[1][1] * m[3][2] - m[3][1] * m[1][2];
		// valType SubFactor15 = m[1][1] * m[2][2] - m[2][1] * m[1][2];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac2 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac3;
	{
		// valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
		// valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
		// valType SubFactor09 = m[1][0] * m[3][3] - m[3][0] * m[1][3];
		// valType SubFactor16 = m[1][0] * m[2][3] - m[2][0] * m[1][3];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac3 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac4;
	{
		// valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
		// valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
		// valType SubFactor10 = m[1][0] * m[3][2] - m[3][0] * m[1][2];
		// valType SubFactor17 = m[1][0] * m[2][2] - m[2][0] * m[1][2];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac4 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac5;
	{
		// valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
		// valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
		// valType SubFactor12 = m[1][0] * m[3][1] - m[3][0] * m[1][1];
		// valType SubFactor18 = m[1][0] * m[2][1] - m[2][0] * m[1][1];
		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac5 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 SignA = _mm_set_ps( 1.0f,-1.0f, 1.0f,-1.0f);
	__m128 SignB = _mm_set_ps(-1.0f, 1.0f,-1.0f, 1.0f);

	// m[1][0]
	// m[0][0]
	// m[0][0]
	// m[0][0]
	__m128 Temp0 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Vec0 = _mm_shuffle_ps(Temp0, Temp0, _MM_SHUFFLE(2, 2, 2, 0));

	// m[1][1]
	// m[0][1]
	// m[0][1]
	// m[0][1]
	__m128 Temp1 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(1, 1, 1, 1));
	__m128 Vec1 = _mm_shuffle_ps(Temp1, Temp1, _MM_SHUFFLE(2, 2, 2, 0));

	// m[1][2]
	// m[0][2]
	// m[0][2]
	// m[0][2]
	__m128 Temp2 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(2, 2, 2, 2));
	__m128 Vec2 = _mm_shuffle_ps(Temp2, Temp2, _MM_SHUFFLE(2, 2, 2, 0));

	// m[1][3]
	// m[0][3]
	// m[0][3]
	// m[0][3]
	__m128 Temp3 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(3, 3, 3, 3));
	__m128 Vec3 = _mm_shuffle_ps(Temp3, Temp3, _MM_SHUFFLE(2, 2, 2, 0));

	// col0
	// + (Vec1[0] * Fac0[0] - Vec2[0] * Fac1[0] + Vec3[0] * Fac2[0]),
	// - (Vec1[1] * Fac0[1] - Vec2[1] * Fac1[1] + Vec3[1] * Fac2[1]),
	// + (Vec1[2] * Fac0[2] - Vec2[2] * Fac1[2] + Vec3[2] * Fac2[2]),
	// - (Vec1[3] * Fac0[3] - Vec2[3] * Fac1[3] + Vec3[3] * Fac2[3]),
	__m128 Mul00 = _mm_mul_ps(Vec1, Fac0);
	__m128 Mul01 = _mm_mul_ps(Vec2, Fac1);
	__m128 Mul02 = _mm_mul_ps(Vec3, Fac2);
	__m128 Sub00 = _mm_sub_ps(Mul00, Mul01);
	__m128 Add00 = _mm_add_ps(Sub00, Mul02);
	__m128 Inv0 = _mm_mul_ps(SignB, Add00);

	// col1
	// - (Vec0[0] * Fac0[0] - Vec2[0] * Fac3[0] + Vec3[0] * Fac4[0]),
	// + (Vec0[1] * Fac0[1] - Vec2[1] * Fac3[1] + Vec3[1] * Fac4[1]),
	// - (Vec0[2] * Fac0[2] - Vec2[2] * Fac3[2] + Vec3[2] * Fac4[2]),
	// + (Vec0[3] * Fac0[3] - Vec2[3] * Fac3[3] + Vec3[3] * Fac4[3]),
	__m128 Mul03 = _mm_mul_ps(Vec0, Fac0);
	__m128 Mul04 = _mm_mul_ps(Vec2, Fac3);
	__m128 Mul05 = _mm_mul_ps(Vec3, Fac4);
	__m128 Sub01 = _mm_sub_ps(Mul03, Mul04);
	__m128 Add01 = _mm_add_ps(Sub01, Mul05);
	__m128 Inv1 = _mm_mul_ps(SignA, Add01);

	// col2
	// + (Vec0[0] * Fac1[0] - Vec1[0] * Fac3[0] + Vec3[0] * Fac5[0]),
	// - (Vec0[1] * Fac1[1] - Vec1[1] * Fac3[1] + Vec3[1] * Fac5[1]),
	// + (Vec0[2] * Fac1[2] - Vec1[2] * Fac3[2] + Vec3[2] * Fac5[2]),
	// - (Vec0[3] * Fac1[3] - Vec1[3] * Fac3[3] + Vec3[3] * Fac5[3]),
	__m128 Mul06 = _mm_mul_ps(Vec0, Fac1);
	__m128 Mul07 = _mm_mul_ps(Vec1, Fac3);
	__m128 Mul08 = _mm_mul_ps(Vec3, Fac5);
	__m128 Sub02 = _mm_sub_ps(Mul06, Mul07);
	__m128 Add02 = _mm_add_ps(Sub02, Mul08);
	__m128 Inv2 = _mm_mul_ps(SignB, Add02);

	// col3
	// - (Vec0[0] * Fac2[0] - Vec1[0] * Fac4[0] + Vec2[0] * Fac5[0]),
	// + (Vec0[1] * Fac2[1] - Vec1[1] * Fac4[1] + Vec2[1] * Fac5[1]),
	// - (Vec0[2] * Fac2[2] - Vec1[2] * Fac4[2] + Vec2[2] * Fac5[2]),
	// + (Vec0[3] * Fac2[3] - Vec1[3] * Fac4[3] + Vec2[3] * Fac5[3]));
	__m128 Mul09 = _mm_mul_ps(Vec0, Fac2);
	__m128 Mul10 = _mm_mul_ps(Vec1, Fac4);
	__m128 Mul11 = _mm_mul_ps(Vec2, Fac5);
	__m128 Sub03 = _mm_sub_ps(Mul09, Mul10);
	__m128 Add03 = _mm_add_ps(Sub03, Mul11);
	__m128 Inv3 = _mm_mul_ps(SignA, Add03);

	__m128 Row0 = _mm_shuffle_ps(Inv0, Inv1, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Row1 = _mm_shuffle_ps(Inv2, Inv3, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Row2 = _mm_shuffle_ps(Row0, Row1, _MM_SHUFFLE(2, 0, 2, 0));

	// valType Determinant = m[0][0] * Inverse[0][0]
	//                     + m[0][1] * Inverse[1][0]
	//                     + m[0][2] * Inverse[2][0]
	//                     + m[0][3] * Inverse[3][0];
	__m128 Det0 = glm_vec4_dot(in[0], Row2);
	__m128 Rcp0 = _mm_rcp_ps(Det0);
	//__m128 Rcp0 = _mm_div_ps(one, Det0);

	// Inverse /= Determinant;
	out[0] = _mm_mul_ps(Inv0, Rcp0);
	out[1] = _mm_mul_ps(Inv1, Rcp0);
	out[2] = _mm_mul_ps(Inv2, Rcp0);
	out[3] = _mm_mul_ps(Inv3, Rcp0);
}

/*
GLM_FUNC_QUALIFIER void glm_mat4_rotate(__m128 const in[4], float Angle, float const v[3], __m128 out[4])
{
	float a = glm::radians(Angle);
	float c = cos(a);
	float s = sin(a);

	glm::vec4 AxisA(v[0], v[1], v[2], float(0));
	__m128 AxisB = _mm_set_ps(AxisA.w, AxisA.z, AxisA.y, AxisA.x);
	__m128 AxisC = detail::sse_nrm_ps(AxisB);

	__m128 Cos0 = _mm_set_ss(c);
	__m128 CosA = _mm_shuffle_ps(Cos0, Cos0, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Sin0 = _mm_set_ss(s);
	__m128 SinA = _mm_shuffle_ps(Sin0, Sin0, _MM_SHUFFLE(0, 0, 0, 0));

	// vec<3, T, Q> temp = (valType(1) - c) * axis;
	__m128 Temp0 = _mm_sub_ps(one, CosA);
	__m128 Temp1 = _mm_mul_ps(Temp0, AxisC);

	//Rotate[0][0] = c + temp[0] * axis[0];
	//Rotate[0][1] = 0 + temp[0] * axis[1] + s * axis[2];
	//Rotate[0][2] = 0 + temp[0] * axis[2] - s * axis[1];
	__m128 Axis0 = _mm_shuffle_ps(AxisC, AxisC, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 TmpA0 = _mm_mul_ps(Axis0, AxisC);
	__m128 CosA0 = _mm_shuffle_ps(Cos0, Cos0, _MM_SHUFFLE(1, 1, 1, 0));
	__m128 TmpA1 = _mm_add_ps(CosA0, TmpA0);
	__m128 SinA0 = SinA;//_mm_set_ps(0.0f, s, -s, 0.0f);
	__m128 TmpA2 = _mm_shuffle_ps(AxisC, AxisC, _MM_SHUFFLE(3, 1, 2, 3));
	__m128 TmpA3 = _mm_mul_ps(SinA0, TmpA2);
	__m128 TmpA4 = _mm_add_ps(TmpA1, TmpA3);

	//Rotate[1][0] = 0 + temp[1] * axis[0] - s * axis[2];
	//Rotate[1][1] = c + temp[1] * axis[1];
	//Rotate[1][2] = 0 + temp[1] * axis[2] + s * axis[0];
	__m128 Axis1 = _mm_shuffle_ps(AxisC, AxisC, _MM_SHUFFLE(1, 1, 1, 1));
	__m128 TmpB0 = _mm_mul_ps(Axis1, AxisC);
	__m128 CosA1 = _mm_shuffle_ps(Cos0, Cos0, _MM_SHUFFLE(1, 1, 0, 1));
	__m128 TmpB1 = _mm_add_ps(CosA1, TmpB0);
	__m128 SinB0 = SinA;//_mm_set_ps(-s, 0.0f, s, 0.0f);
	__m128 TmpB2 = _mm_shuffle_ps(AxisC, AxisC, _MM_SHUFFLE(3, 0, 3, 2));
	__m128 TmpB3 = _mm_mul_ps(SinA0, TmpB2);
	__m128 TmpB4 = _mm_add_ps(TmpB1, TmpB3);

	//Rotate[2][0] = 0 + temp[2] * axis[0] + s * axis[1];
	//Rotate[2][1] = 0 + temp[2] * axis[1] - s * axis[0];
	//Rotate[2][2] = c + temp[2] * axis[2];
	__m128 Axis2 = _mm_shuffle_ps(AxisC, AxisC, _MM_SHUFFLE(2, 2, 2, 2));
	__m128 TmpC0 = _mm_mul_ps(Axis2, AxisC);
	__m128 CosA2 = _mm_shuffle_ps(Cos0, Cos0, _MM_SHUFFLE(1, 0, 1, 1));
	__m128 TmpC1 = _mm_add_ps(CosA2, TmpC0);
	__m128 SinC0 = SinA;//_mm_set_ps(s, -s, 0.0f, 0.0f);
	__m128 TmpC2 = _mm_shuffle_ps(AxisC, AxisC, _MM_SHUFFLE(3, 3, 0, 1));
	__m128 TmpC3 = _mm_mul_ps(SinA0, TmpC2);
	__m128 TmpC4 = _mm_add_ps(TmpC1, TmpC3);

	__m128 Result[4];
	Result[0] = TmpA4;
	Result[1] = TmpB4;
	Result[2] = TmpC4;
	Result[3] = _mm_set_ps(1, 0, 0, 0);

	//mat<4, 4, valType> Result;
	//Result[0] = m[0] * Rotate[0][0] + m[1] * Rotate[0][1] + m[2] * Rotate[0][2];
	//Result[1] = m[0] * Rotate[1][0] + m[1] * Rotate[1][1] + m[2] * Rotate[1][2];
	//Result[2] = m[0] * Rotate[2][0] + m[1] * Rotate[2][1] + m[2] * Rotate[2][2];
	//Result[3] = m[3];
	//return Result;
	sse_mul_ps(in, Result, out);
}
*/
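
// Outer product c * transpose(r): column i of the result is c scaled by r[i].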
GLM_FUNC_QUALIFIER void glm_mat4_outerProduct(__m128 const& c, __m128 const& r, __m128 out[4])
{
	out[0] = _mm_mul_ps(c, _mm_shuffle_ps(r, r, _MM_SHUFFLE(0, 0, 0, 0)));
	out[1] = _mm_mul_ps(c, _mm_shuffle_ps(r, r, _MM_SHUFFLE(1, 1, 1, 1)));
	out[2] = _mm_mul_ps(c, _mm_shuffle_ps(r, r, _MM_SHUFFLE(2, 2, 2, 2)));
	out[3] = _mm_mul_ps(c, _mm_shuffle_ps(r, r, _MM_SHUFFLE(3, 3, 3, 3)));
}
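
/*
// Usage sketch (illustrative only, not part of the library): invert a
// column-major 4x4 matrix stored as 16 contiguous floats, which is how
// glm::mat4 lays out its data. The function name is hypothetical.
GLM_FUNC_QUALIFIER void glm_example_inverse(float const src[16], float dst[16])
{
	glm_vec4 m[4];
	glm_vec4 inv[4];
	m[0] = _mm_loadu_ps(src +  0);
	m[1] = _mm_loadu_ps(src +  4);
	m[2] = _mm_loadu_ps(src +  8);
	m[3] = _mm_loadu_ps(src + 12);
	glm_mat4_inverse(m, inv);
	_mm_storeu_ps(dst +  0, inv[0]);
	_mm_storeu_ps(dst +  4, inv[1]);
	_mm_storeu_ps(dst +  8, inv[2]);
	_mm_storeu_ps(dst + 12, inv[3]);
}
*/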
#endif//GLM_ARCH & GLM_ARCH_SSE2_BIT