/*
 * Copyright Andrey Semashev 2013.
 * Distributed under the Boost Software License, Version 1.0.
 * (See accompanying file LICENSE_1_0.txt or copy at
 * https://www.boost.org/LICENSE_1_0.txt)
 */
/*!
 * \file uuid/detail/uuid_x86.ipp
 *
 * \brief This header contains an optimized SSE implementation of \c boost::uuid operations.
 */

#ifndef BOOST_UUID_DETAIL_UUID_X86_IPP_INCLUDED_
#define BOOST_UUID_DETAIL_UUID_X86_IPP_INCLUDED_

// MSVC does not always have immintrin.h (at least, not up to MSVC 10), so include the appropriate header for each instruction set
#if defined(BOOST_UUID_USE_SSE41)
#include <smmintrin.h>
#elif defined(BOOST_UUID_USE_SSE3)
#include <pmmintrin.h>
#else
#include <emmintrin.h>
#endif

#if defined(BOOST_MSVC) && defined(_M_X64) && !defined(BOOST_UUID_USE_SSE3) && (BOOST_MSVC < 1900 /* Fixed in Visual Studio 2015 */ )
// At least MSVC 9 (VS2008) and 12 (VS2013) have an optimizer bug that sometimes results in incorrect SIMD code
// generated in Release x64 mode. In particular, it affects operator==, where the compiler sometimes generates
// pcmpeqd with a memory operand instead of movdqu followed by pcmpeqd. The problem is that a uuid may not be
// aligned to 16 bytes, and pcmpeqd causes an alignment violation in that case. We cannot be sure that other
// MSVC versions are not affected, so we apply the workaround for all versions before VS2015, where
// the bug has been fixed.
//
// https://svn.boost.org/trac/boost/ticket/8509#comment:3
// https://connect.microsoft.com/VisualStudio/feedbackdetail/view/981648#tabs
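//
// Schematically (a sketch inferred from the bug description above, not verbatim compiler output),
// the miscompiled sequence is
//
//     pcmpeqd xmm0, xmmword ptr [mem]    ; faults if [mem] is not 16-byte aligned
//
// instead of the intended
//
//     movdqu  xmm1, xmmword ptr [mem]    ; unaligned load, no alignment requirement
//     pcmpeqd xmm0, xmm1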
#define BOOST_UUID_DETAIL_MSVC_BUG981648
#if BOOST_MSVC >= 1600
extern "C" void _ReadWriteBarrier(void);
#pragma intrinsic(_ReadWriteBarrier)
#endif
#endif

namespace boost {
namespace uuids {
namespace detail {

BOOST_FORCEINLINE __m128i load_unaligned_si128(const uint8_t* p) BOOST_NOEXCEPT
{
#if defined(BOOST_UUID_USE_SSE3)
    return _mm_lddqu_si128(reinterpret_cast< const __m128i* >(p));
#elif !defined(BOOST_UUID_DETAIL_MSVC_BUG981648)
    return _mm_loadu_si128(reinterpret_cast< const __m128i* >(p));
#elif defined(BOOST_MSVC) && BOOST_MSVC >= 1600
    __m128i mm = _mm_loadu_si128(reinterpret_cast< const __m128i* >(p));
    // Make sure this load doesn't get merged with the subsequent instructions
    _ReadWriteBarrier();
    return mm;
#else
    // VS2008 x64 doesn't respect _ReadWriteBarrier above, so we have to generate this crippled code to load unaligned data
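    // The two _mm_loadl_epi64 calls compile to 8-byte movq loads, which carry no alignment
    // requirement, and _mm_unpacklo_epi64 (punpcklqdq) merges the two halves into one register.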
    return _mm_unpacklo_epi64(_mm_loadl_epi64(reinterpret_cast< const __m128i* >(p)), _mm_loadl_epi64(reinterpret_cast< const __m128i* >(p + 8)));
#endif
}

} // namespace detail

inline bool uuid::is_nil() const BOOST_NOEXCEPT
{
    __m128i mm = uuids::detail::load_unaligned_si128(data);
#if defined(BOOST_UUID_USE_SSE41)
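    // _mm_test_all_zeros(mm, mm) lowers to ptest and returns non-zero iff every bit of mm is zero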
    return _mm_test_all_zeros(mm, mm) != 0;
#else
    mm = _mm_cmpeq_epi32(mm, _mm_setzero_si128());
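    // _mm_movemask_epi8 gathers the most significant bit of each of the 16 bytes;
    // 0xFFFF means every 32-bit lane compared equal to zero above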
    return _mm_movemask_epi8(mm) == 0xFFFF;
#endif
}

inline void uuid::swap(uuid& rhs) BOOST_NOEXCEPT
{
    __m128i mm_this = uuids::detail::load_unaligned_si128(data);
    __m128i mm_rhs = uuids::detail::load_unaligned_si128(rhs.data);
    _mm_storeu_si128(reinterpret_cast< __m128i* >(rhs.data), mm_this);
    _mm_storeu_si128(reinterpret_cast< __m128i* >(data), mm_rhs);
}

inline bool operator== (uuid const& lhs, uuid const& rhs) BOOST_NOEXCEPT
{
    __m128i mm_left = uuids::detail::load_unaligned_si128(lhs.data);
    __m128i mm_right = uuids::detail::load_unaligned_si128(rhs.data);
#if defined(BOOST_UUID_USE_SSE41)
    __m128i mm = _mm_xor_si128(mm_left, mm_right);
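    // lhs == rhs iff lhs ^ rhs has no set bits, which the single ptest below verifies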
    return _mm_test_all_zeros(mm, mm) != 0;
#else
    __m128i mm_cmp = _mm_cmpeq_epi32(mm_left, mm_right);
    return _mm_movemask_epi8(mm_cmp) == 0xFFFF;
#endif
}

inline bool operator< (uuid const& lhs, uuid const& rhs) BOOST_NOEXCEPT
{
    __m128i mm_left = uuids::detail::load_unaligned_si128(lhs.data);
    __m128i mm_right = uuids::detail::load_unaligned_si128(rhs.data);
    // To emulate lexicographical_compare behavior we have to perform two comparisons - the forward and the reverse one.
    // Then we know which bytes are equivalent and which ones differ, and for the differing bytes the two comparison results
    // will be opposite. We can then find the first differing comparison result (for both the forward and reverse ways),
    // and depending on which comparison it comes from, this determines the result of the operation. There are a few notes to consider:
    //
    // 1. Due to little endian byte order the first bytes go into the lower part of the xmm registers,
    //    so the comparison results in the least significant bits will actually be the most significant for the final operation result.
    //    This means we have to determine which of the comparison results have the least significant bit on, and this is achieved with
    //    the "(x - 1) ^ x" trick.
    // 2. Because there is only signed comparison in SSE/AVX, we have to invert byte comparison results whenever the signs of the corresponding
    //    bytes differ. I.e. in signed comparison -1 < 1, but in unsigned it is the opposite (255 > 1). To do that we XOR left and right,
    //    making the most significant bit of each byte 1 if the signs differ, and later apply this mask with another XOR to the comparison results.
    // 3. pcmpgtb compares for the "greater" relation, so we swap the arguments to get what we need.
    const __m128i mm_signs_mask = _mm_xor_si128(mm_left, mm_right);
    __m128i mm_cmp = _mm_cmpgt_epi8(mm_right, mm_left), mm_rcmp = _mm_cmpgt_epi8(mm_left, mm_right);
    mm_cmp = _mm_xor_si128(mm_signs_mask, mm_cmp);
    mm_rcmp = _mm_xor_si128(mm_signs_mask, mm_rcmp);
    uint32_t cmp = static_cast< uint32_t >(_mm_movemask_epi8(mm_cmp)), rcmp = static_cast< uint32_t >(_mm_movemask_epi8(mm_rcmp));
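    // "(x - 1) ^ x" keeps all bits up to and including the lowest set bit and clears the rest,
    // e.g. 0b0110100 -> 0b0000111, so the smaller result marks the comparison that fired at the
    // earlier byte. For x == 0 the subtraction wraps around to yield 0xFFFFFFFF, so a comparison
    // with no differing byte compares greater than any comparison that found one.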
    cmp = (cmp - 1u) ^ cmp;
    rcmp = (rcmp - 1u) ^ rcmp;
    return cmp < rcmp;
}
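// For reference, operator< above implements the same ordering as a plain byte-wise
// lexicographical compare of the 16 data bytes. A minimal scalar sketch (a hypothetical
// helper shown for exposition, not part of this header):
//
//     inline bool uuid_less_scalar(const uint8_t* l, const uint8_t* r) BOOST_NOEXCEPT
//     {
//         for (int i = 0; i < 16; ++i)
//         {
//             if (l[i] != r[i])
//                 return l[i] < r[i];
//         }
//         return false;
//     }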

} // namespace uuids
} // namespace boost

#endif // BOOST_UUID_DETAIL_UUID_X86_IPP_INCLUDED_