merge_sort_on_cpu.hpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366
  1. //---------------------------------------------------------------------------//
  2. // Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
  3. //
  4. // Distributed under the Boost Software License, Version 1.0
  5. // See accompanying file LICENSE_1_0.txt or copy at
  6. // http://www.boost.org/LICENSE_1_0.txt
  7. //
  8. // See http://boostorg.github.com/compute for more information.
  9. //---------------------------------------------------------------------------//
  10. #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
  11. #define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
  12. #include <boost/compute/kernel.hpp>
  13. #include <boost/compute/program.hpp>
  14. #include <boost/compute/command_queue.hpp>
  15. #include <boost/compute/algorithm/detail/merge_with_merge_path.hpp>
  16. #include <boost/compute/container/vector.hpp>
  17. #include <boost/compute/detail/meta_kernel.hpp>
  18. #include <boost/compute/detail/iterator_range_size.hpp>
  19. namespace boost {
  20. namespace compute {
  21. namespace detail {
  22. template<class KeyIterator, class ValueIterator, class Compare>
  23. inline void merge_blocks(KeyIterator keys_first,
  24. ValueIterator values_first,
  25. KeyIterator keys_result,
  26. ValueIterator values_result,
  27. Compare compare,
  28. size_t count,
  29. const size_t block_size,
  30. const bool sort_by_key,
  31. command_queue &queue)
  32. {
  33. (void) values_result;
  34. (void) values_first;
  35. meta_kernel k("merge_sort_on_cpu_merge_blocks");
  36. size_t count_arg = k.add_arg<const uint_>("count");
  37. size_t block_size_arg = k.add_arg<uint_>("block_size");
  38. k <<
  39. k.decl<uint_>("b1_start") << " = get_global_id(0) * block_size * 2;\n" <<
  40. k.decl<uint_>("b1_end") << " = min(count, b1_start + block_size);\n" <<
  41. k.decl<uint_>("b2_start") << " = min(count, b1_start + block_size);\n" <<
  42. k.decl<uint_>("b2_end") << " = min(count, b2_start + block_size);\n" <<
  43. k.decl<uint_>("result_idx") << " = b1_start;\n" <<
  44. // merging block 1 and block 2 (stable)
  45. "while(b1_start < b1_end && b2_start < b2_end){\n" <<
  46. " if( " << compare(keys_first[k.var<uint_>("b2_start")],
  47. keys_first[k.var<uint_>("b1_start")]) << "){\n" <<
  48. " " << keys_result[k.var<uint_>("result_idx")] << " = " <<
  49. keys_first[k.var<uint_>("b2_start")] << ";\n";
  50. if(sort_by_key){
  51. k <<
  52. " " << values_result[k.var<uint_>("result_idx")] << " = " <<
  53. values_first[k.var<uint_>("b2_start")] << ";\n";
  54. }
  55. k <<
  56. " b2_start++;\n" <<
  57. " }\n" <<
  58. " else {\n" <<
  59. " " << keys_result[k.var<uint_>("result_idx")] << " = " <<
  60. keys_first[k.var<uint_>("b1_start")] << ";\n";
  61. if(sort_by_key){
  62. k <<
  63. " " << values_result[k.var<uint_>("result_idx")] << " = " <<
  64. values_first[k.var<uint_>("b1_start")] << ";\n";
  65. }
  66. k <<
  67. " b1_start++;\n" <<
  68. " }\n" <<
  69. " result_idx++;\n" <<
  70. "}\n" <<
  71. "while(b1_start < b1_end){\n" <<
  72. " " << keys_result[k.var<uint_>("result_idx")] << " = " <<
  73. keys_first[k.var<uint_>("b1_start")] << ";\n";
  74. if(sort_by_key){
  75. k <<
  76. " " << values_result[k.var<uint_>("result_idx")] << " = " <<
  77. values_first[k.var<uint_>("b1_start")] << ";\n";
  78. }
  79. k <<
  80. " b1_start++;\n" <<
  81. " result_idx++;\n" <<
  82. "}\n" <<
  83. "while(b2_start < b2_end){\n" <<
  84. " " << keys_result[k.var<uint_>("result_idx")] << " = " <<
  85. keys_first[k.var<uint_>("b2_start")] << ";\n";
  86. if(sort_by_key){
  87. k <<
  88. " " << values_result[k.var<uint_>("result_idx")] << " = " <<
  89. values_first[k.var<uint_>("b2_start")] << ";\n";
  90. }
  91. k <<
  92. " b2_start++;\n" <<
  93. " result_idx++;\n" <<
  94. "}\n";
  95. const context &context = queue.get_context();
  96. ::boost::compute::kernel kernel = k.compile(context);
  97. kernel.set_arg(count_arg, static_cast<const uint_>(count));
  98. kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));
  99. const size_t global_size = static_cast<size_t>(
  100. std::ceil(float(count) / (2 * block_size))
  101. );
  102. queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0);
  103. }
  104. template<class Iterator, class Compare>
  105. inline void merge_blocks(Iterator first,
  106. Iterator result,
  107. Compare compare,
  108. size_t count,
  109. const size_t block_size,
  110. const bool sort_by_key,
  111. command_queue &queue)
  112. {
  113. // dummy iterator as it's not sort by key
  114. Iterator dummy;
  115. merge_blocks(first, dummy, result, dummy, compare, count, block_size, false, queue);
  116. }
  117. template<class Iterator, class Compare>
  118. inline void dispatch_merge_blocks(Iterator first,
  119. Iterator result,
  120. Compare compare,
  121. size_t count,
  122. const size_t block_size,
  123. const size_t input_size_threshold,
  124. const size_t blocks_no_threshold,
  125. command_queue &queue)
  126. {
  127. const size_t blocks_no = static_cast<size_t>(
  128. std::ceil(float(count) / block_size)
  129. );
  130. // merge with merge path should used only for the large arrays and at the
  131. // end of merging part when there are only a few big blocks left to be merged
  132. if(blocks_no <= blocks_no_threshold && count >= input_size_threshold){
  133. Iterator last = first + count;
  134. for(size_t i = 0; i < count; i+= 2*block_size)
  135. {
  136. Iterator first1 = (std::min)(first + i, last);
  137. Iterator last1 = (std::min)(first1 + block_size, last);
  138. Iterator first2 = last1;
  139. Iterator last2 = (std::min)(first2 + block_size, last);
  140. Iterator block_result = (std::min)(result + i, result + count);
  141. merge_with_merge_path(first1, last1, first2, last2,
  142. block_result, compare, queue);
  143. }
  144. }
  145. else {
  146. merge_blocks(first, result, compare, count, block_size, false, queue);
  147. }
  148. }
  149. template<class KeyIterator, class ValueIterator, class Compare>
  150. inline void block_insertion_sort(KeyIterator keys_first,
  151. ValueIterator values_first,
  152. Compare compare,
  153. const size_t count,
  154. const size_t block_size,
  155. const bool sort_by_key,
  156. command_queue &queue)
  157. {
  158. (void) values_first;
  159. typedef typename std::iterator_traits<KeyIterator>::value_type K;
  160. typedef typename std::iterator_traits<ValueIterator>::value_type T;
  161. meta_kernel k("merge_sort_on_cpu_block_insertion_sort");
  162. size_t count_arg = k.add_arg<uint_>("count");
  163. size_t block_size_arg = k.add_arg<uint_>("block_size");
  164. k <<
  165. k.decl<uint_>("start") << " = get_global_id(0) * block_size;\n" <<
  166. k.decl<uint_>("end") << " = min(count, start + block_size);\n" <<
  167. // block insertion sort (stable)
  168. "for(uint i = start+1; i < end; i++){\n" <<
  169. " " << k.decl<const K>("key") << " = " <<
  170. keys_first[k.var<uint_>("i")] << ";\n";
  171. if(sort_by_key){
  172. k <<
  173. " " << k.decl<const T>("value") << " = " <<
  174. values_first[k.var<uint_>("i")] << ";\n";
  175. }
  176. k <<
  177. " uint pos = i;\n" <<
  178. " while(pos > start && " <<
  179. compare(k.var<const K>("key"),
  180. keys_first[k.var<uint_>("pos-1")]) << "){\n" <<
  181. " " << keys_first[k.var<uint_>("pos")] << " = " <<
  182. keys_first[k.var<uint_>("pos-1")] << ";\n";
  183. if(sort_by_key){
  184. k <<
  185. " " << values_first[k.var<uint_>("pos")] << " = " <<
  186. values_first[k.var<uint_>("pos-1")] << ";\n";
  187. }
  188. k <<
  189. " pos--;\n" <<
  190. " }\n" <<
  191. " " << keys_first[k.var<uint_>("pos")] << " = key;\n";
  192. if(sort_by_key) {
  193. k <<
  194. " " << values_first[k.var<uint_>("pos")] << " = value;\n";
  195. }
  196. k <<
  197. "}\n"; // block insertion sort
  198. const context &context = queue.get_context();
  199. ::boost::compute::kernel kernel = k.compile(context);
  200. kernel.set_arg(count_arg, static_cast<uint_>(count));
  201. kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));
  202. const size_t global_size = static_cast<size_t>(std::ceil(float(count) / block_size));
  203. queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0);
  204. }
  205. template<class Iterator, class Compare>
  206. inline void block_insertion_sort(Iterator first,
  207. Compare compare,
  208. const size_t count,
  209. const size_t block_size,
  210. command_queue &queue)
  211. {
  212. // dummy iterator as it's not sort by key
  213. Iterator dummy;
  214. block_insertion_sort(first, dummy, compare, count, block_size, false, queue);
  215. }
  216. // This sort is stable.
  217. template<class Iterator, class Compare>
  218. inline void merge_sort_on_cpu(Iterator first,
  219. Iterator last,
  220. Compare compare,
  221. command_queue &queue)
  222. {
  223. typedef typename std::iterator_traits<Iterator>::value_type value_type;
  224. size_t count = iterator_range_size(first, last);
  225. if(count < 2){
  226. return;
  227. }
  228. // for small input size only insertion sort is performed
  229. else if(count <= 512){
  230. block_insertion_sort(first, compare, count, count, queue);
  231. return;
  232. }
  233. const context &context = queue.get_context();
  234. const device &device = queue.get_device();
  235. // loading parameters
  236. std::string cache_key =
  237. std::string("__boost_merge_sort_on_cpu_") + type_name<value_type>();
  238. boost::shared_ptr<parameter_cache> parameters =
  239. detail::parameter_cache::get_global_cache(device);
  240. // When there is merge_with_path_blocks_no_threshold or less blocks left to
  241. // merge AND input size is merge_with_merge_path_input_size_threshold or more
  242. // merge_with_merge_path() algorithm is used to merge sorted blocks;
  243. // otherwise merge_blocks() is used.
  244. const size_t merge_with_path_blocks_no_threshold =
  245. parameters->get(cache_key, "merge_with_merge_path_blocks_no_threshold", 8);
  246. const size_t merge_with_path_input_size_threshold =
  247. parameters->get(cache_key, "merge_with_merge_path_input_size_threshold", 2097152);
  248. const size_t block_size =
  249. parameters->get(cache_key, "insertion_sort_block_size", 64);
  250. block_insertion_sort(first, compare, count, block_size, queue);
  251. // temporary buffer for merge result
  252. vector<value_type> temp(count, context);
  253. bool result_in_temporary_buffer = false;
  254. for(size_t i = block_size; i < count; i *= 2){
  255. result_in_temporary_buffer = !result_in_temporary_buffer;
  256. if(result_in_temporary_buffer) {
  257. dispatch_merge_blocks(first, temp.begin(), compare, count, i,
  258. merge_with_path_input_size_threshold,
  259. merge_with_path_blocks_no_threshold,
  260. queue);
  261. } else {
  262. dispatch_merge_blocks(temp.begin(), first, compare, count, i,
  263. merge_with_path_input_size_threshold,
  264. merge_with_path_blocks_no_threshold,
  265. queue);
  266. }
  267. }
  268. if(result_in_temporary_buffer) {
  269. copy(temp.begin(), temp.end(), first, queue);
  270. }
  271. }
  272. // This sort is stable.
  273. template<class KeyIterator, class ValueIterator, class Compare>
  274. inline void merge_sort_by_key_on_cpu(KeyIterator keys_first,
  275. KeyIterator keys_last,
  276. ValueIterator values_first,
  277. Compare compare,
  278. command_queue &queue)
  279. {
  280. typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
  281. typedef typename std::iterator_traits<ValueIterator>::value_type value_type;
  282. size_t count = iterator_range_size(keys_first, keys_last);
  283. if(count < 2){
  284. return;
  285. }
  286. // for small input size only insertion sort is performed
  287. else if(count <= 512){
  288. block_insertion_sort(keys_first, values_first, compare,
  289. count, count, true, queue);
  290. return;
  291. }
  292. const context &context = queue.get_context();
  293. const device &device = queue.get_device();
  294. // loading parameters
  295. std::string cache_key =
  296. std::string("__boost_merge_sort_by_key_on_cpu_") + type_name<value_type>()
  297. + "_with_" + type_name<key_type>();
  298. boost::shared_ptr<parameter_cache> parameters =
  299. detail::parameter_cache::get_global_cache(device);
  300. const size_t block_size =
  301. parameters->get(cache_key, "insertion_sort_by_key_block_size", 64);
  302. block_insertion_sort(keys_first, values_first, compare,
  303. count, block_size, true, queue);
  304. // temporary buffer for merge results
  305. vector<value_type> values_temp(count, context);
  306. vector<key_type> keys_temp(count, context);
  307. bool result_in_temporary_buffer = false;
  308. for(size_t i = block_size; i < count; i *= 2){
  309. result_in_temporary_buffer = !result_in_temporary_buffer;
  310. if(result_in_temporary_buffer) {
  311. merge_blocks(keys_first, values_first,
  312. keys_temp.begin(), values_temp.begin(),
  313. compare, count, i, true, queue);
  314. } else {
  315. merge_blocks(keys_temp.begin(), values_temp.begin(),
  316. keys_first, values_first,
  317. compare, count, i, true, queue);
  318. }
  319. }
  320. if(result_in_temporary_buffer) {
  321. copy(keys_temp.begin(), keys_temp.end(), keys_first, queue);
  322. copy(values_temp.begin(), values_temp.end(), values_first, queue);
  323. }
  324. }
  325. } // end detail namespace
  326. } // end compute namespace
  327. } // end boost namespace
  328. #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP