find_extrema_with_reduce.hpp

//---------------------------------------------------------------------------//
// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//

#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP

#include <algorithm>

#include <boost/compute/types.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/allocator/pinned_allocator.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/parameter_cache.hpp>
#include <boost/compute/memory/local_buffer.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/utility/program_cache.hpp>

namespace boost {
namespace compute {
namespace detail {

template<class InputIterator>
bool find_extrema_with_reduce_requirements_met(InputIterator first,
                                               InputIterator last,
                                               command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;

    const device &device = queue.get_device();

    // device must have dedicated local memory storage,
    // otherwise reduction would be highly inefficient
    if(device.get_info<CL_DEVICE_LOCAL_MEM_TYPE>() != CL_LOCAL)
    {
        return false;
    }

    const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
    // local memory size in bytes (per compute unit)
    const size_t local_mem_size = device.get_info<CL_DEVICE_LOCAL_MEM_SIZE>();

    std::string cache_key = std::string("__boost_find_extrema_reduce_")
        + type_name<input_type>();
    // load parameters
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    // get preferred work group size
    size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
    work_group_size = (std::min)(max_work_group_size, work_group_size);

    // local memory size needed to perform parallel reduction
    size_t required_local_mem_size = 0;
    // indices size
    required_local_mem_size += sizeof(uint_) * work_group_size;
    // values size
    required_local_mem_size += sizeof(input_type) * work_group_size;

    // at least 4 work groups per compute unit, otherwise reduction
    // would be highly inefficient
    return ((required_local_mem_size * 4) <= local_mem_size);
}
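
// Illustrative sizing (assumed figures, not queried from a real device): with
// input_type = float and work_group_size = 256, required_local_mem_size is
// 256 * sizeof(uint_) + 256 * sizeof(float) = 1024 + 1024 = 2048 bytes, so the
// "4 work groups per compute unit" requirement amounts to 8 KiB of local
// memory, well below the 32-64 KiB commonly reported by CL_DEVICE_LOCAL_MEM_SIZE.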

/// \internal_
/// Finds the first extremum in the given range, i.e. the one with the lowest
/// index.
///
/// If \p use_input_idx is false, the input data is assumed to be ordered by
/// increasing index and \p input_idx is not used by the algorithm.
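///
/// For example (illustrative), searching the range {3, 1, 7, 1} for the
/// minimum reports the value 1 at index 1, not the equal value at index 3.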
template<class InputIterator, class ResultIterator, class Compare>
inline void find_extrema_with_reduce(InputIterator input,
                                     vector<uint_>::iterator input_idx,
                                     size_t count,
                                     ResultIterator result,
                                     vector<uint_>::iterator result_idx,
                                     size_t work_groups_no,
                                     size_t work_group_size,
                                     Compare compare,
                                     const bool find_minimum,
                                     const bool use_input_idx,
                                     command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;

    const context &context = queue.get_context();

    meta_kernel k("find_extrema_reduce");
    size_t count_arg = k.add_arg<uint_>("count");
    size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");
    size_t block_idx_arg = k.add_arg<uint_ *>(memory_object::local_memory, "block_idx");

    k <<
        // Work item global id
        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<

        // Index of element that will be read from input buffer
        k.decl<uint_>("idx") << " = gid;\n" <<

        k.decl<input_type>("acc") << ";\n" <<
        k.decl<uint_>("acc_idx") << ";\n" <<
        "if(gid < count) {\n" <<
            // Real index of currently best element
            "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
            k.var<uint_>("acc_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
            "#else\n" <<
            k.var<uint_>("acc_idx") << " = idx;\n" <<
            "#endif\n" <<

            // Init accumulator with first[get_global_id(0)]
            "acc = " << input[k.var<uint_>("idx")] << ";\n" <<
            "idx += get_global_size(0);\n" <<
        "}\n" <<

        k.decl<bool>("compare_result") << ";\n" <<
        k.decl<bool>("equal") << ";\n\n" <<
        "while( idx < count ){\n" <<
            // Next element
            k.decl<input_type>("next") << " = " << input[k.var<uint_>("idx")] << ";\n" <<
            "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
            k.decl<uint_>("next_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
            "#endif\n" <<

            // Comparison between currently best element (acc) and next element
            "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
            "compare_result = " << compare(k.var<input_type>("next"),
                                           k.var<input_type>("acc")) << ";\n" <<
            "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
            "equal = !compare_result && !" <<
                compare(k.var<input_type>("acc"),
                        k.var<input_type>("next")) << ";\n" <<
            "# endif\n" <<
            "#else\n" <<
            "compare_result = " << compare(k.var<input_type>("acc"),
                                           k.var<input_type>("next")) << ";\n" <<
            "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
            "equal = !compare_result && !" <<
                compare(k.var<input_type>("next"),
                        k.var<input_type>("acc")) << ";\n" <<
            "# endif\n" <<
            "#endif\n" <<

            // save the winner
            "acc = compare_result ? acc : next;\n" <<
            "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
            "acc_idx = compare_result ? " <<
                "acc_idx : " <<
                "(equal ? min(acc_idx, next_idx) : next_idx);\n" <<
            "#else\n" <<
            "acc_idx = compare_result ? acc_idx : idx;\n" <<
            "#endif\n" <<
            "idx += get_global_size(0);\n" <<
        "}\n\n" <<

        // Work item local id
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
        "block[lid] = acc;\n" <<
        "block_idx[lid] = acc_idx;\n" <<
        "barrier(CLK_LOCAL_MEM_FENCE);\n" <<

        k.decl<uint_>("group_offset") <<
            " = count - (get_local_size(0) * get_group_id(0));\n\n";
    k <<
        "#pragma unroll\n"
        "for(" << k.decl<uint_>("offset") << " = " << uint_(work_group_size) << " / 2; offset > 0; " <<
            "offset = offset / 2) {\n" <<
            "if((lid < offset) && ((lid + offset) < group_offset)) { \n" <<
                k.decl<input_type>("mine") << " = block[lid];\n" <<
                k.decl<input_type>("other") << " = block[lid+offset];\n" <<
                "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
                "compare_result = " << compare(k.var<input_type>("other"),
                                               k.var<input_type>("mine")) << ";\n" <<
                "equal = !compare_result && !" <<
                    compare(k.var<input_type>("mine"),
                            k.var<input_type>("other")) << ";\n" <<
                "#else\n" <<
                "compare_result = " << compare(k.var<input_type>("mine"),
                                               k.var<input_type>("other")) << ";\n" <<
                "equal = !compare_result && !" <<
                    compare(k.var<input_type>("other"),
                            k.var<input_type>("mine")) << ";\n" <<
                "#endif\n" <<
                "block[lid] = compare_result ? mine : other;\n" <<
                k.decl<uint_>("mine_idx") << " = block_idx[lid];\n" <<
                k.decl<uint_>("other_idx") << " = block_idx[lid+offset];\n" <<
                "block_idx[lid] = compare_result ? " <<
                    "mine_idx : " <<
                    "(equal ? min(mine_idx, other_idx) : other_idx);\n" <<
            "}\n"
            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
        "}\n\n" <<

        // write block result to global output
        "if(lid == 0){\n" <<
            result[k.var<uint_>("get_group_id(0)")] << " = block[0];\n" <<
            result_idx[k.var<uint_>("get_group_id(0)")] << " = block_idx[0];\n" <<
        "}";

    std::string options;
    if(!find_minimum){
        options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
    }
    if(use_input_idx){
        options += " -DBOOST_COMPUTE_USE_INPUT_IDX";
    }
    kernel kernel = k.compile(context, options);

    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(block_arg, local_buffer<input_type>(work_group_size));
    kernel.set_arg(block_idx_arg, local_buffer<uint_>(work_group_size));

    queue.enqueue_1d_range_kernel(kernel,
                                  0,
                                  work_groups_no * work_group_size,
                                  work_group_size);
}

template<class InputIterator, class ResultIterator, class Compare>
inline void find_extrema_with_reduce(InputIterator input,
                                     size_t count,
                                     ResultIterator result,
                                     vector<uint_>::iterator result_idx,
                                     size_t work_groups_no,
                                     size_t work_group_size,
                                     Compare compare,
                                     const bool find_minimum,
                                     command_queue &queue)
{
    // dummy iterator, will not be used (use_input_idx is false)
    buffer_iterator<uint_> dummy = result_idx;
    return find_extrema_with_reduce(
        input, dummy, count, result, result_idx, work_groups_no,
        work_group_size, compare, find_minimum, false, queue
    );
}

// Space complexity: \Omega(2 * work-group-size * work-groups-per-compute-unit)
template<class InputIterator, class Compare>
InputIterator find_extrema_with_reduce(InputIterator first,
                                       InputIterator last,
                                       Compare compare,
                                       const bool find_minimum,
                                       command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;

    const context &context = queue.get_context();
    const device &device = queue.get_device();

    // get information about the device associated with the queue
    const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
    const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();

    const size_t count = detail::iterator_range_size(first, last);

    std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
        + type_name<input_type>();
    // load parameters
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    // get the preferred work group size and the preferred number
    // of work groups per compute unit
    size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
    size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 100);

    // calculate the work group size and the number of work groups
    work_group_size = (std::min)(max_work_group_size, work_group_size);
    size_t work_groups_no = compute_units_no * work_groups_per_cu;
    work_groups_no = (std::min)(
        work_groups_no,
        static_cast<size_t>(std::ceil(float(count) / work_group_size))
    );
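
    // Illustrative figures (assumed, not measured): on a device with 16 compute
    // units and the default wgpcu = 100, up to 1600 work-groups could be
    // launched, but for count = 100000 and work_group_size = 256 the cap
    // ceil(100000 / 256) = 391 applies, so work_groups_no becomes 391.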

    // phase I: find candidates for the extremum
    // device buffers for extremum candidates and their indices;
    // each work-group computes its own candidate
    vector<input_type> candidates(work_groups_no, context);
    vector<uint_> candidates_idx(work_groups_no, context);

    // find candidates for the first extremum and their indices
    find_extrema_with_reduce(
        first, count, candidates.begin(), candidates_idx.begin(),
        work_groups_no, work_group_size, compare, find_minimum, queue
    );

    // phase II: find the extremum among the candidates
    // zero-copy buffers for the final result (value and index)
    vector<input_type, ::boost::compute::pinned_allocator<input_type> >
        result(1, context);
    vector<uint_, ::boost::compute::pinned_allocator<uint_> >
        result_idx(1, context);

    // get the extremum from among the candidates
    find_extrema_with_reduce(
        candidates.begin(), candidates_idx.begin(), work_groups_no, result.begin(),
        result_idx.begin(), 1, work_group_size, compare, find_minimum, true, queue
    );

    // map the extremum index to the host
    uint_* result_idx_host_ptr =
        static_cast<uint_*>(
            queue.enqueue_map_buffer(
                result_idx.get_buffer(), command_queue::map_read,
                0, sizeof(uint_)
            )
        );

    return first + static_cast<difference_type>(*result_idx_host_ptr);
}
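
// Illustrative call (a sketch; in practice this detail header is reached
// through boost::compute::min_element() / max_element() rather than being
// called directly):
//
//   boost::compute::vector<float> data(n, ctx);
//   // ... fill data ...
//   boost::compute::vector<float>::iterator min_it =
//       detail::find_extrema_with_reduce(
//           data.begin(), data.end(),
//           boost::compute::less<float>(),
//           true,  // find_minimum
//           queue
//       );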

template<class InputIterator>
InputIterator find_extrema_with_reduce(InputIterator first,
                                       InputIterator last,
                                       ::boost::compute::less<
                                           typename std::iterator_traits<
                                               InputIterator
                                           >::value_type
                                       >
                                       compare,
                                       const bool find_minimum,
                                       command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;

    const context &context = queue.get_context();
    const device &device = queue.get_device();

    // get information about the device associated with the queue
    const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
    const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();

    const size_t count = detail::iterator_range_size(first, last);

    std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
        + type_name<input_type>();
    // load parameters
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    // get the preferred work group size and the preferred number
    // of work groups per compute unit
    size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
    size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 64);

    // calculate the work group size and the number of work groups
    work_group_size = (std::min)(max_work_group_size, work_group_size);
    size_t work_groups_no = compute_units_no * work_groups_per_cu;
    work_groups_no = (std::min)(
        work_groups_no,
        static_cast<size_t>(std::ceil(float(count) / work_group_size))
    );

    // phase I: find candidates for the extremum
    // device buffers for extremum candidates and their indices;
    // each work-group computes its own candidate
    // zero-copy buffers are used to eliminate copying data back to the host
    vector<input_type, ::boost::compute::pinned_allocator<input_type> >
        candidates(work_groups_no, context);
    vector<uint_, ::boost::compute::pinned_allocator<uint_> >
        candidates_idx(work_groups_no, context);

    // find candidates for the first extremum and their indices
    find_extrema_with_reduce(
        first, count, candidates.begin(), candidates_idx.begin(),
        work_groups_no, work_group_size, compare, find_minimum, queue
    );

    // phase II: find the extremum among the candidates
    // map the candidates and their indices to the host
    input_type* candidates_host_ptr =
        static_cast<input_type*>(
            queue.enqueue_map_buffer(
                candidates.get_buffer(), command_queue::map_read,
                0, work_groups_no * sizeof(input_type)
            )
        );
    uint_* candidates_idx_host_ptr =
        static_cast<uint_*>(
            queue.enqueue_map_buffer(
                candidates_idx.get_buffer(), command_queue::map_read,
                0, work_groups_no * sizeof(uint_)
            )
        );

    input_type* i = candidates_host_ptr;
    uint_* idx = candidates_idx_host_ptr;
    uint_* extremum_idx = idx;
    input_type extremum = *candidates_host_ptr;
    i++; idx++;

    // find the extremum (serially) among the candidates on the host
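    // on a tie (equal values), keep the candidate with the smaller original
    // input index so the returned iterator still points to the first extremum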
    if(!find_minimum) {
        while(idx != (candidates_idx_host_ptr + work_groups_no)) {
            input_type next = *i;
            bool compare_result = next > extremum;
            bool equal = next == extremum;
            extremum = compare_result ? next : extremum;
            extremum_idx = compare_result ? idx : extremum_idx;
            extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx;
            idx++, i++;
        }
    }
    else {
        while(idx != (candidates_idx_host_ptr + work_groups_no)) {
            input_type next = *i;
            bool compare_result = next < extremum;
            bool equal = next == extremum;
            extremum = compare_result ? next : extremum;
            extremum_idx = compare_result ? idx : extremum_idx;
            extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx;
            idx++, i++;
        }
    }

    return first + static_cast<difference_type>(*extremum_idx);
}

} // end detail namespace
} // end compute namespace
} // end boost namespace

#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP