/// @ref simd
/// @file glm/simd/common.h

#pragma once

#include "platform.h"

#if GLM_ARCH & GLM_ARCH_SSE2_BIT

GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec4_add(glm_f32vec4 a, glm_f32vec4 b)
{
	return _mm_add_ps(a, b);
}

GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec1_add(glm_f32vec4 a, glm_f32vec4 b)
{
	return _mm_add_ss(a, b);
}

GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec4_sub(glm_f32vec4 a, glm_f32vec4 b)
{
	return _mm_sub_ps(a, b);
}

GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec1_sub(glm_f32vec4 a, glm_f32vec4 b)
{
	return _mm_sub_ss(a, b);
}

GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec4_mul(glm_f32vec4 a, glm_f32vec4 b)
{
	return _mm_mul_ps(a, b);
}

GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec1_mul(glm_f32vec4 a, glm_f32vec4 b)
{
	return _mm_mul_ss(a, b);
}

GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec4_div(glm_f32vec4 a, glm_f32vec4 b)
{
	return _mm_div_ps(a, b);
}

GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec1_div(glm_f32vec4 a, glm_f32vec4 b)
{
	return _mm_div_ss(a, b);
}

GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec4_div_lowp(glm_f32vec4 a, glm_f32vec4 b)
{
	return glm_vec4_mul(a, _mm_rcp_ps(b));
}
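
// Note: _mm_rcp_ps only approximates the reciprocal (relative error up to about
// 1.5 * 2^-12), hence the _lowp suffix; glm_vec4_div above uses the exact _mm_div_ps.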

GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec4_swizzle_xyzw(glm_f32vec4 a)
{
#	if GLM_ARCH & GLM_ARCH_AVX2_BIT
		return _mm_permute_ps(a, _MM_SHUFFLE(3, 2, 1, 0));
#	else
		return _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 1, 0));
#	endif
}
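
// Note: _MM_SHUFFLE(3, 2, 1, 0) encodes the identity permutation (lane 0 stays
// first, lane 3 last), so this swizzle returns its argument unchanged.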

GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec1_fma(glm_f32vec4 a, glm_f32vec4 b, glm_f32vec4 c)
{
#	if (GLM_ARCH & GLM_ARCH_AVX2_BIT) && !(GLM_COMPILER & GLM_COMPILER_CLANG)
		return _mm_fmadd_ss(a, b, c);
#	else
		return _mm_add_ss(_mm_mul_ss(a, b), c);
#	endif
}

GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec4_fma(glm_f32vec4 a, glm_f32vec4 b, glm_f32vec4 c)
{
#	if (GLM_ARCH & GLM_ARCH_AVX2_BIT) && !(GLM_COMPILER & GLM_COMPILER_CLANG)
		return _mm_fmadd_ps(a, b, c);
#	else
		return glm_vec4_add(glm_vec4_mul(a, b), c);
#	endif
}
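
// Note: both fma helpers compute a * b + c; the AVX2 path uses a single fused
// multiply-add instruction, the fallback an ordinary multiply followed by an add.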

GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec4_abs(glm_f32vec4 x)
{
	return _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)));
}

GLM_FUNC_QUALIFIER glm_ivec4 glm_ivec4_abs(glm_ivec4 x)
{
#	if GLM_ARCH & GLM_ARCH_SSSE3_BIT
		return _mm_sign_epi32(x, x);
#	else
		glm_ivec4 const sgn0 = _mm_srai_epi32(x, 31);
		glm_ivec4 const inv0 = _mm_xor_si128(x, sgn0);
		glm_ivec4 const sub0 = _mm_sub_epi32(inv0, sgn0);
		return sub0;
#	endif
}
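
// Note: the SSE2 fallback above is the branchless integer abs. sgn0 replicates
// the sign bit across all 32 bits (0 for x >= 0, ~0 for x < 0), so
// (x ^ sgn0) - sgn0 leaves non-negative values untouched and negates negative
// ones, e.g. x = -5: inv0 = 4, sub0 = 5.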

GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_sign(glm_vec4 x)
{
	glm_vec4 const zro0 = _mm_setzero_ps();
	glm_vec4 const cmp0 = _mm_cmplt_ps(x, zro0);
	glm_vec4 const cmp1 = _mm_cmpgt_ps(x, zro0);
	glm_vec4 const and0 = _mm_and_ps(cmp0, _mm_set1_ps(-1.0f));
	glm_vec4 const and1 = _mm_and_ps(cmp1, _mm_set1_ps(1.0f));
	glm_vec4 const or0 = _mm_or_ps(and0, and1);
	return or0;
}

GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_round(glm_vec4 x)
{
#	if GLM_ARCH & GLM_ARCH_SSE41_BIT
		return _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT);
#	else
		glm_vec4 const sgn0 = _mm_castsi128_ps(_mm_set1_epi32(int(0x80000000)));
		glm_vec4 const and0 = _mm_and_ps(sgn0, x);
		glm_vec4 const or0 = _mm_or_ps(and0, _mm_set_ps1(8388608.0f));
		glm_vec4 const add0 = glm_vec4_add(x, or0);
		glm_vec4 const sub0 = glm_vec4_sub(add0, or0);
		return sub0;
#	endif
}
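
// Note: the SSE2 fallback rounds with the 2^23 trick: 8388608.0f = 2^23. For
// |x| < 2^23, adding a copy of that constant carrying x's sign pushes the
// fractional bits out of the mantissa, so subtracting it again yields x rounded
// to nearest (ties to even under the default rounding mode), e.g. 2.5f -> 2.0f.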

GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_floor(glm_vec4 x)
{
#	if GLM_ARCH & GLM_ARCH_SSE41_BIT
		return _mm_floor_ps(x);
#	else
		glm_vec4 const rnd0 = glm_vec4_round(x);
		glm_vec4 const cmp0 = _mm_cmplt_ps(x, rnd0);
		glm_vec4 const and0 = _mm_and_ps(cmp0, _mm_set1_ps(1.0f));
		glm_vec4 const sub0 = glm_vec4_sub(rnd0, and0);
		return sub0;
#	endif
}

/* trunc TODO
GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_trunc(glm_vec4 x)
{
	return glm_vec4();
}
*/

// roundEven
GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_roundEven(glm_vec4 x)
{
	glm_vec4 const sgn0 = _mm_castsi128_ps(_mm_set1_epi32(int(0x80000000)));
	glm_vec4 const and0 = _mm_and_ps(sgn0, x);
	glm_vec4 const or0 = _mm_or_ps(and0, _mm_set_ps1(8388608.0f));
	glm_vec4 const add0 = glm_vec4_add(x, or0);
	glm_vec4 const sub0 = glm_vec4_sub(add0, or0);
	return sub0;
}

GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_ceil(glm_vec4 x)
{
#	if GLM_ARCH & GLM_ARCH_SSE41_BIT
		return _mm_ceil_ps(x);
#	else
		glm_vec4 const rnd0 = glm_vec4_round(x);
		glm_vec4 const cmp0 = _mm_cmpgt_ps(x, rnd0);
		glm_vec4 const and0 = _mm_and_ps(cmp0, _mm_set1_ps(1.0f));
		glm_vec4 const add0 = glm_vec4_add(rnd0, and0);
		return add0;
#	endif
}

GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_fract(glm_vec4 x)
{
	glm_vec4 const flr0 = glm_vec4_floor(x);
	glm_vec4 const sub0 = glm_vec4_sub(x, flr0);
	return sub0;
}

GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_mod(glm_vec4 x, glm_vec4 y)
{
	glm_vec4 const div0 = glm_vec4_div(x, y);
	glm_vec4 const flr0 = glm_vec4_floor(div0);
	glm_vec4 const mul0 = glm_vec4_mul(y, flr0);
	glm_vec4 const sub0 = glm_vec4_sub(x, mul0);
	return sub0;
}
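
// Note: glm_vec4_mod follows the GLSL definition mod(x, y) = x - y * floor(x / y),
// so the result takes the sign of y, e.g. mod(-1, 3) = 2.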

GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_clamp(glm_vec4 v, glm_vec4 minVal, glm_vec4 maxVal)
{
	glm_vec4 const min0 = _mm_min_ps(v, maxVal);
	glm_vec4 const max0 = _mm_max_ps(min0, minVal);
	return max0;
}

GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_mix(glm_vec4 v1, glm_vec4 v2, glm_vec4 a)
{
	glm_vec4 const sub0 = glm_vec4_sub(_mm_set1_ps(1.0f), a);
	glm_vec4 const mul0 = glm_vec4_mul(v1, sub0);
	glm_vec4 const mad0 = glm_vec4_fma(v2, a, mul0);
	return mad0;
}
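
// Note: mix is the GLSL linear blend v1 * (1 - a) + v2 * a, evaluated here as
// fma(v2, a, v1 * (1 - a)).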

GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_step(glm_vec4 edge, glm_vec4 x)
{
	glm_vec4 const cmp0 = _mm_cmple_ps(edge, x);				// all 1s where edge <= x
	glm_vec4 const and0 = _mm_and_ps(cmp0, _mm_set1_ps(1.0f));	// per-component 1.0f where edge <= x, 0.0f otherwise
	return and0;
}

GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_smoothstep(glm_vec4 edge0, glm_vec4 edge1, glm_vec4 x)
{
	glm_vec4 const sub0 = glm_vec4_sub(x, edge0);
	glm_vec4 const sub1 = glm_vec4_sub(edge1, edge0);
	glm_vec4 const div0 = glm_vec4_div(sub0, sub1);
	glm_vec4 const clp0 = glm_vec4_clamp(div0, _mm_setzero_ps(), _mm_set1_ps(1.0f));
	glm_vec4 const mul0 = glm_vec4_mul(_mm_set1_ps(2.0f), clp0);
	glm_vec4 const sub2 = glm_vec4_sub(_mm_set1_ps(3.0f), mul0);
	glm_vec4 const mul1 = glm_vec4_mul(clp0, clp0);
	glm_vec4 const mul2 = glm_vec4_mul(mul1, sub2);
	return mul2;
}
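
// Note: smoothstep computes t = clamp((x - edge0) / (edge1 - edge0), 0, 1) and
// returns the Hermite polynomial t * t * (3 - 2 * t).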

// Agner Fog method
GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_nan(glm_vec4 x)
{
	glm_ivec4 const t1 = _mm_castps_si128(x);						// reinterpret as 32-bit integer
	glm_ivec4 const t2 = _mm_sll_epi32(t1, _mm_cvtsi32_si128(1));	// shift out sign bit
	glm_ivec4 const t3 = _mm_set1_epi32(int(0xFF000000));			// exponent mask
	glm_ivec4 const t4 = _mm_and_si128(t2, t3);						// exponent
	glm_ivec4 const t5 = _mm_andnot_si128(t3, t2);					// fraction
	glm_ivec4 const Equal = _mm_cmpeq_epi32(t3, t4);				// exponent == all 1s
	glm_ivec4 const Zero = _mm_cmpeq_epi32(t5, _mm_setzero_si128());	// fraction == 0
	glm_ivec4 const And = _mm_andnot_si128(Zero, Equal);			// exponent all 1s and fraction != 0
	return _mm_castsi128_ps(And);
}

// Agner Fog method
GLM_FUNC_QUALIFIER glm_vec4 glm_vec4_inf(glm_vec4 x)
{
	glm_ivec4 const t1 = _mm_castps_si128(x);						// reinterpret as 32-bit integer
	glm_ivec4 const t2 = _mm_sll_epi32(t1, _mm_cvtsi32_si128(1));	// shift out sign bit
	return _mm_castsi128_ps(_mm_cmpeq_epi32(t2, _mm_set1_epi32(int(0xFF000000))));	// exponent is all 1s, fraction is 0
}
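
// Note: both classifiers shift the sign bit out so +x and -x are treated alike;
// after the shift the exponent occupies the top 8 bits (0xFF000000). Infinity has
// that field all 1s with a zero fraction, NaN has it all 1s with a non-zero fraction.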

#endif//GLM_ARCH & GLM_ARCH_SSE2_BIT
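
// Illustrative usage sketch (not part of the original header): how the helpers
// compose on caller-provided data. The _mm_set_ps / _mm_set1_ps / _mm_storeu_ps
// calls are standard SSE intrinsics; the surrounding snippet is hypothetical.
//
//	float out[4];
//	glm_vec4 const x  = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);	// lanes {1, 2, 3, 4}
//	glm_vec4 const e0 = _mm_set1_ps(1.5f);
//	glm_vec4 const e1 = _mm_set1_ps(3.5f);
//	glm_vec4 const s  = glm_vec4_smoothstep(e0, e1, x);
//	_mm_storeu_ps(out, s);										// {0.0f, 0.15625f, 0.84375f, 1.0f}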