find_if_with_atomics.hpp

//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//

#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP

#include <cmath> // std::ceil
#include <iterator>

#include <boost/compute/types.hpp>
#include <boost/compute/functional.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/detail/scalar.hpp>
#include <boost/compute/iterator/buffer_iterator.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/parameter_cache.hpp>

namespace boost {
namespace compute {
namespace detail {
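
// Finds the first element in [first, last) satisfying `predicate` using one
// work-item per element: every matching work-item atomically lowers a shared
// "index" value, so the smallest matching position wins. The index buffer is
// initialized to `count`, which maps back to `last` when nothing matches.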
template<class InputIterator, class UnaryPredicate>
inline InputIterator find_if_with_atomics_one_vpt(InputIterator first,
                                                  InputIterator last,
                                                  UnaryPredicate predicate,
                                                  const size_t count,
                                                  command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    const context &context = queue.get_context();

    detail::meta_kernel k("find_if");
    size_t index_arg = k.add_arg<uint_ *>(memory_object::global_memory, "index");
    atomic_min<uint_> atomic_min_uint;

    k << k.decl<const uint_>("i") << " = get_global_id(0);\n"
      << k.decl<const value_type>("value") << " = "
      << first[k.var<const uint_>("i")] << ";\n"
      << "if(" << predicate(k.var<const value_type>("value")) << "){\n"
      << "    " << atomic_min_uint(k.var<uint_ *>("index"), k.var<uint_>("i")) << ";\n"
      << "}\n";

    kernel kernel = k.compile(context);

    scalar<uint_> index(context);
    kernel.set_arg(index_arg, index.get_buffer());

    // initialize index to the last iterator's index
    index.write(static_cast<uint_>(count), queue);

    queue.enqueue_1d_range_kernel(kernel, 0, count, 0);

    // read index and return iterator
    return first + static_cast<difference_type>(index.read(queue));
}
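
// Same algorithm, but each work-item examines `vpt` (values-per-thread)
// elements. Two kernel layouts are generated: on GPUs the work-items of a
// group read interleaved, stride-lsize locations so that global reads
// coalesce; on other devices each work-item scans a contiguous chunk.
// Work-items return early once the shared index already holds a smaller
// position.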
template<class InputIterator, class UnaryPredicate>
inline InputIterator find_if_with_atomics_multiple_vpt(InputIterator first,
                                                       InputIterator last,
                                                       UnaryPredicate predicate,
                                                       const size_t count,
                                                       const size_t vpt,
                                                       command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    const context &context = queue.get_context();
    const device &device = queue.get_device();

    detail::meta_kernel k("find_if");
    size_t index_arg = k.add_arg<uint_ *>(memory_object::global_memory, "index");
    size_t count_arg = k.add_arg<const uint_>("count");
    size_t vpt_arg = k.add_arg<const uint_>("vpt");
    atomic_min<uint_> atomic_min_uint;

    if(device.type() & device::gpu) {
        // on GPUs adjacent work-items read adjacent addresses (each work-item
        // strides by the local size), so reads from global memory are coalesced
        k <<
            k.decl<const uint_>("lsize") << " = get_local_size(0);\n" <<
            k.decl<uint_>("id") << " = get_local_id(0) + get_group_id(0) * lsize * vpt;\n" <<
            k.decl<const uint_>("end") << " = min(" <<
                "id + (lsize *" << k.var<uint_>("vpt") << ")," <<
                "count" <<
            ");\n" <<

            // exit early if a smaller index has already been found
            "__local uint local_index;\n" <<
            "if(get_local_id(0) == 0){\n" <<
            "    local_index = *index;\n" <<
            "}\n" <<
            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "if(local_index < id){\n" <<
            "    return;\n" <<
            "}\n" <<

            "while(id < end){\n" <<
            "    " << k.decl<const value_type>("value") << " = " <<
                       first[k.var<const uint_>("id")] << ";\n" <<
            "    if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
            "        " << atomic_min_uint(k.var<uint_ *>("index"),
                                          k.var<uint_>("id")) << ";\n" <<
            "        return;\n" <<
            "    }\n" <<
            "    id += lsize;\n" <<
            "}\n";
    } else {
        // on CPUs (and other devices) each work-item reads a contiguous
        // chunk, so the large cache is used efficiently
        k <<
            k.decl<uint_>("id") << " = get_global_id(0) * " << k.var<uint_>("vpt") << ";\n" <<
            k.decl<const uint_>("end") << " = min(" <<
                "id + " << k.var<uint_>("vpt") << "," <<
                "count" <<
            ");\n" <<
            "while(id < end && (*index) > id){\n" <<
            "    " << k.decl<const value_type>("value") << " = " <<
                       first[k.var<const uint_>("id")] << ";\n" <<
            "    if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
            "        " << atomic_min_uint(k.var<uint_ *>("index"),
                                          k.var<uint_>("id")) << ";\n" <<
            "        return;\n" <<
            "    }\n" <<
            "    id++;\n" <<
            "}\n";
    }

    kernel kernel = k.compile(context);

    scalar<uint_> index(context);
    kernel.set_arg(index_arg, index.get_buffer());
    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(vpt_arg, static_cast<uint_>(vpt));

    // initialize index to the last iterator's index
    index.write(static_cast<uint_>(count), queue);

    const size_t global_wg_size = static_cast<size_t>(
        std::ceil(float(count) / vpt)
    );
    queue.enqueue_1d_range_kernel(kernel, 0, global_wg_size, 0);

    // read index and return iterator
    return first + static_cast<difference_type>(index.read(queue));
}
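
// For orientation: with value_type = int and a predicate such as `_1 > 4`,
// the CPU branch above emits roughly the following OpenCL. This is an
// illustrative sketch (argument names are chosen here, not verbatim
// meta_kernel output):
//
//   __kernel void find_if(__global uint *index, const uint count,
//                         const uint vpt, __global int *buffer)
//   {
//       uint id = get_global_id(0) * vpt;
//       const uint end = min(id + vpt, count);
//       while(id < end && (*index) > id){
//           const int value = buffer[id];
//           if(value > 4){
//               atomic_min(index, id);
//               return;
//           }
//           id++;
//       }
//   }
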
// Space complexity: O(1)
template<class InputIterator, class UnaryPredicate>
inline InputIterator find_if_with_atomics(InputIterator first,
                                          InputIterator last,
                                          UnaryPredicate predicate,
                                          command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;

    size_t count = detail::iterator_range_size(first, last);
    if(count == 0){
        return last;
    }

    const device &device = queue.get_device();

    // load cached parameters
    std::string cache_key = std::string("__boost_find_if_with_atomics_")
        + type_name<value_type>();
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    // for relatively small inputs on GPUs, the kernel checking one value per
    // thread (work-item) is more efficient than the multiple-values-per-thread
    // version
    if(device.type() & device::gpu){
        const size_t one_vpt_threshold =
            parameters->get(cache_key, "one_vpt_threshold", 1048576);
        if(count <= one_vpt_threshold){
            return find_if_with_atomics_one_vpt(
                first, last, predicate, count, queue
            );
        }
    }

    // values per thread
    size_t vpt;
    if(device.type() & device::gpu){
        // get vpt parameter
        vpt = parameters->get(cache_key, "vpt", 32);
    } else {
        // for CPUs work is split equally between compute units
        const size_t max_compute_units =
            device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
        vpt = static_cast<size_t>(
            std::ceil(float(count) / max_compute_units)
        );
    }

    return find_if_with_atomics_multiple_vpt(
        first, last, predicate, count, vpt, queue
    );
}
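
// A minimal usage sketch (illustrative, not part of this header): it assumes
// a default device/queue and the `_1` placeholder from
// <boost/compute/lambda.hpp>; the host container `data` is hypothetical.
//
//   namespace compute = boost::compute;
//   using compute::lambda::_1;
//
//   compute::command_queue queue = compute::system::default_queue();
//   compute::vector<int> vec(data.begin(), data.end(), queue);
//
//   compute::vector<int>::iterator it =
//       compute::detail::find_if_with_atomics(
//           vec.begin(), vec.end(), _1 > 4, queue
//       );
//   // `it` points at the first element greater than 4, or vec.end()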

} // end detail namespace
} // end compute namespace
} // end boost namespace

#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP