//---------------------------------------------------------------------------//
// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//

#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_GPU_HPP_
#define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_GPU_HPP_

#include <algorithm>
#include <cmath> // for std::ceil

#include <boost/compute/kernel.hpp>
#include <boost/compute/program.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/copy.hpp> // for copy_async()
#include <boost/compute/container/vector.hpp>
#include <boost/compute/memory/local_buffer.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>

namespace boost {
namespace compute {
namespace detail {
template<class KeyType, class ValueType>
inline size_t pick_bitonic_block_sort_block_size(size_t proposed_wg,
                                                 size_t lmem_size,
                                                 bool sort_by_key)
{
    size_t n = proposed_wg;

    size_t lmem_required = n * sizeof(KeyType);
    if(sort_by_key) {
        lmem_required += n * sizeof(ValueType);
    }

    // try to force at least 4 work-groups of >64 elements
    // for better occupancy
    while(lmem_size < (lmem_required * 4) && (n > 64)) {
        n /= 2;
        lmem_required = n * sizeof(KeyType);
        if(sort_by_key) {
            // keep the value (local index) contribution in the estimate
            lmem_required += n * sizeof(ValueType);
        }
    }
    while(lmem_size < lmem_required && (n != 1)) {
        n /= 2;
        if(n < 1) n = 1;
        lmem_required = n * sizeof(KeyType);
        if(sort_by_key) {
            lmem_required += n * sizeof(ValueType);
        }
    }

    // round n down to a power of two
    if(n < 2)        { return 1; }
    else if(n < 4)   { return 2; }
    else if(n < 8)   { return 4; }
    else if(n < 16)  { return 8; }
    else if(n < 32)  { return 16; }
    else if(n < 64)  { return 32; }
    else if(n < 128) { return 64; }
    else if(n < 256) { return 128; }
    else             { return 256; }
}
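
// Illustrative example of the sizing above (hypothetical numbers, not taken
// from any particular device): with KeyType = int (4 bytes),
// sort_by_key = false, proposed_wg = 256 and lmem_size = 2048 bytes, the
// initial estimate is 256 * 4 = 1024 bytes. The first loop sees
// 2048 < 4 * 1024 and halves n once (n = 128, 512 bytes required); then
// 2048 >= 4 * 512 stops it, the second loop is already satisfied, and the
// final ladder returns 128.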
/// Performs bitonic block sort according to \p compare.
///
/// Bitonic sort can only be performed when the input size is a power of
/// two. Here the input size is the block size (the work group size), so we
/// would have to require \p count to be an exact multiple of the block
/// size; that would not be great. Instead, the bitonic sort kernel is
/// merged with odd-even transposition sort: if the size of the last block
/// is not a power of two, odd-even sort is performed for that block. That
/// way bitonic_block_sort() works for input of any size. The block size
/// (work group size) itself still has to be a power of two.
///
/// This is NOT stable.
///
/// \param keys_first first key element in the range to sort
/// \param values_first first value element in the range to sort
/// \param compare comparison function for keys
/// \param count number of elements in the range; count > 0
/// \param sort_by_key if true, values are rearranged along with their keys
/// \param queue command queue to perform the operation
///
/// \return the block size (work group size) that was used; a power of two
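///
/// Example (illustrative numbers only): with count == 100 and a block size
/// of 64, the first work group holds 64 elements, a power of two, and uses
/// the bitonic network, while the last work group holds only
/// 100 - 64 == 36 elements and therefore takes the odd-even path.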
template<class KeyIterator, class ValueIterator, class Compare>
inline size_t bitonic_block_sort(KeyIterator keys_first,
                                 ValueIterator values_first,
                                 Compare compare,
                                 const size_t count,
                                 const bool sort_by_key,
                                 command_queue &queue)
{
    typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
    typedef typename std::iterator_traits<ValueIterator>::value_type value_type;

    meta_kernel k("bitonic_block_sort");
    size_t count_arg = k.add_arg<const uint_>("count");

    size_t local_keys_arg = k.add_arg<key_type *>(memory_object::local_memory, "lkeys");
    size_t local_vals_arg = 0;
    if(sort_by_key) {
        local_vals_arg = k.add_arg<uchar_ *>(memory_object::local_memory, "lidx");
    }

    k <<
        // work-item global and local ids
        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n";

    // declare my_key
    k <<
        k.decl<key_type>("my_key") << ";\n";

    // Instead of copying values (my_value) into local memory with the keys
    // we save the local index (uchar) and copy my_value at the end to its
    // final index. This saves local memory.
    if(sort_by_key)
    {
        k <<
            k.decl<uchar_>("my_index") << " = (uchar)(lid);\n";
    }

    // load key
    k <<
        "if(gid < count) {\n" <<
            k.var<key_type>("my_key") << " = " <<
                keys_first[k.var<const uint_>("gid")] << ";\n" <<
        "}\n";

    // load key and index into local memory
    k <<
        "lkeys[lid] = my_key;\n";
    if(sort_by_key)
    {
        k <<
            "lidx[lid] = my_index;\n";
    }
    k <<
        k.decl<const uint_>("offset") << " = get_group_id(0) * get_local_size(0);\n" <<
        k.decl<const uint_>("n") << " = min((uint)(get_local_size(0)),(count - offset));\n";

    // When the number of elements in this block (n) is a power of two the
    // bitonic sorter can be used; otherwise, the slower odd-even
    // transposition sort is used.
    k <<
        // check if n is a power of two
        "if(((n != 0) && ((n & (~n + 1)) == n))) {\n";
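
    // Note on the test above: in two's complement arithmetic (~n + 1) == -n,
    // so (n & (~n + 1)) isolates the lowest set bit of n. The result equals
    // n exactly when n has a single set bit, i.e. when n is a power of two.
    // For example, n = 12 (0b1100) gives n & -n = 4 != 12, while n = 8
    // (0b1000) gives n & -n = 8 == n.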
    // bitonic sort, not stable
    k <<
        // wait for keys and vals to be stored in local memory
        "barrier(CLK_LOCAL_MEM_FENCE);\n" <<

        "#pragma unroll\n" <<
        "for(" <<
            k.decl<uint_>("length") << " = 1; " <<
            "length < n; " <<
            "length <<= 1" <<
        ") {\n" <<
            // direction of sort: false -> ascending, true -> descending
            k.decl<bool>("direction") << " = ((lid & (length<<1)) != 0);\n" <<
            "for(" <<
                k.decl<uint_>("k") << " = length; " <<
                "k > 0; " <<
                "k >>= 1" <<
            ") {\n" <<
                // sibling to compare with my key
                k.decl<uint_>("sibling_idx") << " = lid ^ k;\n" <<
                k.decl<key_type>("sibling_key") << " = lkeys[sibling_idx];\n" <<
                k.decl<bool>("compare") << " = " <<
                    compare(k.var<key_type>("sibling_key"),
                            k.var<key_type>("my_key")) << ";\n" <<
                k.decl<bool>("equal") << " = !(compare || " <<
                    compare(k.var<key_type>("my_key"),
                            k.var<key_type>("sibling_key")) << ");\n" <<
                // swap if the pair is out of order for the current direction;
                // XOR-ing in (sibling_idx < lid) makes both work items of a
                // pair compute the same decision, so a swap exchanges the keys
                k.decl<bool>("swap") <<
                    " = compare ^ (sibling_idx < lid) ^ direction;\n" <<
                "swap = equal ? false : swap;\n" <<
                "my_key = swap ? sibling_key : my_key;\n";
    if(sort_by_key)
    {
        k <<
                "my_index = swap ? lidx[sibling_idx] : my_index;\n";
    }
    k <<
                "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
                "lkeys[lid] = my_key;\n";
    if(sort_by_key)
    {
        k <<
                "lidx[lid] = my_index;\n";
    }
    k <<
                "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "}\n" <<
        "}\n";
    // end of bitonic sort
    // odd-even transposition sort, not stable
    k <<
        "}\n" <<
        "else {\n";
    k <<
        k.decl<bool>("lid_is_even") << " = (lid%2) == 0;\n" <<
        k.decl<uint_>("oddsibling_idx") << " = " <<
            "(lid_is_even) ? max(lid,(uint)(1)) - 1 : min(lid+1,n-1);\n" <<
        k.decl<uint_>("evensibling_idx") << " = " <<
            "(lid_is_even) ? min(lid+1,n-1) : max(lid,(uint)(1)) - 1;\n" <<
        // wait for keys and vals to be stored in local memory
        "barrier(CLK_LOCAL_MEM_FENCE);\n" <<

        "#pragma unroll\n" <<
        "for(" <<
            k.decl<uint_>("i") << " = 0; " <<
            "i < n; " <<
            "i++" <<
        ") {\n" <<
            // alternate between the even and the odd phase
            k.decl<uint_>("sibling_idx") <<
                " = i%2 == 0 ? evensibling_idx : oddsibling_idx;\n" <<
            k.decl<key_type>("sibling_key") << " = lkeys[sibling_idx];\n" <<
            k.decl<bool>("compare") << " = " <<
                compare(k.var<key_type>("sibling_key"),
                        k.var<key_type>("my_key")) << ";\n" <<
            k.decl<bool>("equal") << " = !(compare || " <<
                compare(k.var<key_type>("my_key"),
                        k.var<key_type>("sibling_key")) << ");\n" <<
            k.decl<bool>("swap") <<
                " = compare ^ (sibling_idx < lid);\n" <<
            "swap = equal ? false : swap;\n" <<
            "my_key = swap ? sibling_key : my_key;\n";
    if(sort_by_key)
    {
        k <<
            "my_index = swap ? lidx[sibling_idx] : my_index;\n";
    }
    k <<
            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "lkeys[lid] = my_key;\n";
    if(sort_by_key)
    {
        k <<
            "lidx[lid] = my_index;\n";
    }
    k <<
            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
        "}\n" << // for
    "}\n"; // else
    // end of odd-even sort

    // save key and value
    k <<
        "if(gid < count) {\n" <<
            keys_first[k.var<const uint_>("gid")] << " = " <<
                k.var<key_type>("my_key") << ";\n";
    if(sort_by_key)
    {
        k <<
            k.decl<value_type>("my_value") << " = " <<
                values_first[k.var<const uint_>("offset + my_index")] << ";\n" <<
            "barrier(CLK_GLOBAL_MEM_FENCE);\n" <<
            values_first[k.var<const uint_>("gid")] << " = my_value;\n";
    }
    k <<
        // end if
        "}\n";

    const context &context = queue.get_context();
    const device &device = queue.get_device();
    ::boost::compute::kernel kernel = k.compile(context);

    const size_t work_group_size =
        pick_bitonic_block_sort_block_size<key_type, uchar_>(
            kernel.get_work_group_info<size_t>(
                device, CL_KERNEL_WORK_GROUP_SIZE
            ),
            device.get_info<size_t>(CL_DEVICE_LOCAL_MEM_SIZE),
            sort_by_key
        );

    // round the global size up to a multiple of the work group size
    const size_t global_size =
        work_group_size * static_cast<size_t>(
            std::ceil(float(count) / work_group_size)
        );

    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(local_keys_arg, local_buffer<key_type>(work_group_size));
    if(sort_by_key) {
        kernel.set_arg(local_vals_arg, local_buffer<uchar_>(work_group_size));
    }

    queue.enqueue_1d_range_kernel(kernel, 0, global_size, work_group_size);
    // return the size of the block
    return work_group_size;
}
template<class KeyIterator, class ValueIterator, class Compare>
inline size_t block_sort(KeyIterator keys_first,
                         ValueIterator values_first,
                         Compare compare,
                         const size_t count,
                         const bool sort_by_key,
                         const bool stable,
                         command_queue &queue)
{
    if(stable) {
        // TODO: Implement stable block sort (stable odd-even merge sort).
        // A block size of 1 makes every block trivially sorted, so the
        // subsequent (stable) merge passes do all the work.
        return size_t(1);
    }
    return bitonic_block_sort(
        keys_first, values_first,
        compare, count,
        sort_by_key, queue
    );
}
/// Merges adjacent sorted blocks of size \p block_size pairwise according
/// to \p compare.
///
/// space: O(n + m); n - number of keys, m - number of values
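///
/// Each work item handles one element: it binary-searches the position of
/// its key in the neighbouring block and writes the key straight to its
/// final position in the merged output, so no work item waits on another.
/// For example (illustrative): when merging blocks [1, 3] and [2, 4], the
/// element 3 has rank 1 in its own block and rank 1 in the other block
/// (only the 2 precedes it), so it is written to output offset
/// 1 + 1 + 0 = 2.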
template<class KeyIterator, class ValueIterator, class Compare>
inline void merge_blocks_on_gpu(KeyIterator keys_first,
                                ValueIterator values_first,
                                KeyIterator out_keys_first,
                                ValueIterator out_values_first,
                                Compare compare,
                                const size_t count,
                                const size_t block_size,
                                const bool sort_by_key,
                                command_queue &queue)
{
    typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
    typedef typename std::iterator_traits<ValueIterator>::value_type value_type;

    meta_kernel k("merge_blocks");
    size_t count_arg = k.add_arg<const uint_>("count");
    size_t block_size_arg = k.add_arg<const uint_>("block_size");

    k <<
        // get global id
        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
        "if(gid >= count) {\n" <<
            "return;\n" <<
        "}\n" <<

        k.decl<const key_type>("my_key") << " = " <<
            keys_first[k.var<const uint_>("gid")] << ";\n";

    if(sort_by_key) {
        k <<
            k.decl<const value_type>("my_value") << " = " <<
                values_first[k.var<const uint_>("gid")] << ";\n";
    }

    k <<
        // get the index of my block and of the block it is merged with;
        // even blocks pair with the next block, odd blocks with the previous
        k.decl<const uint_>("my_block_idx") << " = gid / block_size;\n" <<
        k.decl<const bool>("my_block_idx_is_odd") << " = " <<
            "my_block_idx & 0x1;\n" <<
        k.decl<const uint_>("other_block_idx") << " = " <<
            "my_block_idx_is_odd ? my_block_idx - 1 : my_block_idx + 1;\n" <<

        // get the ranges of my block and the other block
        // [my_block_start; my_block_end)
        // [other_block_start; other_block_end)
        k.decl<const uint_>("my_block_start") << " = " <<
            "min(my_block_idx * block_size, count);\n" << // inclusive
        k.decl<const uint_>("my_block_end") << " = " <<
            "min((my_block_idx + 1) * block_size, count);\n" << // exclusive
        k.decl<const uint_>("other_block_start") << " = " <<
            "min(other_block_idx * block_size, count);\n" << // inclusive
        k.decl<const uint_>("other_block_end") << " = " <<
            "min((other_block_idx + 1) * block_size, count);\n" << // exclusive

        // the other block is empty, nothing to merge here
        "if(other_block_start == count){\n" <<
            out_keys_first[k.var<uint_>("gid")] << " = my_key;\n";
    if(sort_by_key) {
        k <<
            out_values_first[k.var<uint_>("gid")] << " = my_value;\n";
    }
    k <<
            "return;\n" <<
        "}\n" <<

        // binary search for the lower bound of my_key in the other block
        k.decl<uint_>("left_idx") << " = other_block_start;\n" <<
        k.decl<uint_>("right_idx") << " = other_block_end;\n" <<
        "while(left_idx < right_idx) {\n" <<
            k.decl<uint_>("mid_idx") << " = (left_idx + right_idx) / 2;\n" <<
            k.decl<key_type>("mid_key") << " = " <<
                keys_first[k.var<const uint_>("mid_idx")] << ";\n" <<
            k.decl<bool>("smaller") << " = " <<
                compare(k.var<key_type>("mid_key"),
                        k.var<key_type>("my_key")) << ";\n" <<
            "left_idx = smaller ? mid_idx + 1 : left_idx;\n" <<
            "right_idx = smaller ? right_idx : mid_idx;\n" <<
        "}\n" <<
        // left_idx is now the lower bound of my_key in the other block;
        // if my block is odd we need the upper bound instead
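        // Why the upper bound for odd blocks: if both blocks of a pair used
        // the lower bound, equal keys (one from each block) would compute
        // the same output offset and collide. Sending the even block to the
        // lower bound and the odd block to the upper bound gives equal keys
        // distinct, adjacent output positions.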
        "right_idx = other_block_end;\n" <<
        "if(my_block_idx_is_odd && left_idx != right_idx) {\n" <<
            k.decl<key_type>("upper_key") << " = " <<
                keys_first[k.var<const uint_>("left_idx")] << ";\n" <<
            "while(" <<
                "!(" << compare(k.var<key_type>("upper_key"),
                                k.var<key_type>("my_key")) <<
                ") && " <<
                "!(" << compare(k.var<key_type>("my_key"),
                                k.var<key_type>("upper_key")) <<
                ") && " <<
                "left_idx < right_idx" <<
            ")" <<
            "{\n" <<
                k.decl<uint_>("mid_idx") << " = (left_idx + right_idx) / 2;\n" <<
                k.decl<key_type>("mid_key") << " = " <<
                    keys_first[k.var<const uint_>("mid_idx")] << ";\n" <<
                k.decl<bool>("equal") << " = " <<
                    "!(" << compare(k.var<key_type>("mid_key"),
                                    k.var<key_type>("my_key")) <<
                    ") && " <<
                    "!(" << compare(k.var<key_type>("my_key"),
                                    k.var<key_type>("mid_key")) <<
                    ");\n" <<
                "left_idx = equal ? mid_idx + 1 : left_idx + 1;\n" <<
                "right_idx = equal ? right_idx : mid_idx;\n" <<
                "upper_key = " <<
                    keys_first[k.var<const uint_>("left_idx")] << ";\n" <<
            "}\n" <<
        "}\n" <<
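        // Final position of my_key in the merged output:
        //   (rank of my_key within my block)
        // + (count of other-block elements that precede it, found above)
        // + (start of the merged pair of blocks)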
        k.decl<uint_>("offset") << " = 0;\n" <<
        "offset += gid - my_block_start;\n" <<
        "offset += left_idx - other_block_start;\n" <<
        "offset += min(my_block_start, other_block_start);\n" <<
        out_keys_first[k.var<uint_>("offset")] << " = my_key;\n";
    if(sort_by_key) {
        k <<
            out_values_first[k.var<uint_>("offset")] << " = my_value;\n";
    }

    const context &context = queue.get_context();
    ::boost::compute::kernel kernel = k.compile(context);

    const size_t work_group_size = (std::min)(
        size_t(256),
        kernel.get_work_group_info<size_t>(
            queue.get_device(), CL_KERNEL_WORK_GROUP_SIZE
        )
    );
    const size_t global_size =
        work_group_size * static_cast<size_t>(
            std::ceil(float(count) / work_group_size)
        );

    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));
    queue.enqueue_1d_range_kernel(kernel, 0, global_size, work_group_size);
}
template<class KeyIterator, class ValueIterator, class Compare>
inline void merge_sort_by_key_on_gpu(KeyIterator keys_first,
                                     KeyIterator keys_last,
                                     ValueIterator values_first,
                                     Compare compare,
                                     bool stable,
                                     command_queue &queue)
{
    typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
    typedef typename std::iterator_traits<ValueIterator>::value_type value_type;

    size_t count = iterator_range_size(keys_first, keys_last);
    if(count < 2){
        return;
    }

    size_t block_size =
        block_sort(
            keys_first, values_first,
            compare, count,
            true /* sort_by_key */, stable /* stable */,
            queue
        );

    // for a small input only the block sort is performed
    if(count <= block_size) {
        return;
    }

    const context &context = queue.get_context();
    bool result_in_temporary_buffer = false;
    ::boost::compute::vector<key_type> temp_keys(count, context);
    ::boost::compute::vector<value_type> temp_values(count, context);
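
    // Merge blocks pairwise, doubling the block size after every pass and
    // ping-ponging between the input range and the temporary buffers, since
    // merge_blocks_on_gpu() cannot merge in place. After the loop the fully
    // sorted result lives in whichever buffer the last pass wrote to.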
    for(; block_size < count; block_size *= 2) {
        result_in_temporary_buffer = !result_in_temporary_buffer;
        if(result_in_temporary_buffer) {
            merge_blocks_on_gpu(keys_first, values_first,
                                temp_keys.begin(), temp_values.begin(),
                                compare, count, block_size,
                                true /* sort_by_key */, queue);
        } else {
            merge_blocks_on_gpu(temp_keys.begin(), temp_values.begin(),
                                keys_first, values_first,
                                compare, count, block_size,
                                true /* sort_by_key */, queue);
        }
    }

    // if the result ended up in the temporary buffers, copy it back
    if(result_in_temporary_buffer) {
        copy_async(temp_keys.begin(), temp_keys.end(), keys_first, queue);
        copy_async(temp_values.begin(), temp_values.end(), values_first, queue);
    }
}
template<class Iterator, class Compare>
inline void merge_sort_on_gpu(Iterator first,
                              Iterator last,
                              Compare compare,
                              bool stable,
                              command_queue &queue)
{
    typedef typename std::iterator_traits<Iterator>::value_type key_type;

    size_t count = iterator_range_size(first, last);
    if(count < 2){
        return;
    }

    // dummy value iterator; it is never dereferenced since sort_by_key
    // is false
    Iterator dummy;
    size_t block_size =
        block_sort(
            first, dummy,
            compare, count,
            false /* sort_by_key */, stable /* stable */,
            queue
        );

    // for a small input only the block sort is performed
    if(count <= block_size) {
        return;
    }

    const context &context = queue.get_context();
    bool result_in_temporary_buffer = false;
    ::boost::compute::vector<key_type> temp_keys(count, context);

    // same ping-pong merging as in merge_sort_by_key_on_gpu(), keys only
    for(; block_size < count; block_size *= 2) {
        result_in_temporary_buffer = !result_in_temporary_buffer;
        if(result_in_temporary_buffer) {
            merge_blocks_on_gpu(first, dummy, temp_keys.begin(), dummy,
                                compare, count, block_size,
                                false /* sort_by_key */, queue);
        } else {
            merge_blocks_on_gpu(temp_keys.begin(), dummy, first, dummy,
                                compare, count, block_size,
                                false /* sort_by_key */, queue);
        }
    }

    if(result_in_temporary_buffer) {
        copy_async(temp_keys.begin(), temp_keys.end(), first, queue);
    }
}
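
// Usage sketch (illustrative only; assumes a valid boost::compute::context
// `ctx` and command_queue `queue` for a GPU device, and that
// boost::compute::less is available via <boost/compute/functional.hpp>):
//
//   boost::compute::vector<int> vec(1024, ctx);
//   // ... fill vec with data ...
//   merge_sort_on_gpu(vec.begin(), vec.end(),
//                     boost::compute::less<int>(), false /* stable */, queue);
//
//   // keys-with-values variant:
//   // merge_sort_by_key_on_gpu(keys.begin(), keys.end(), vals.begin(),
//   //                          boost::compute::less<int>(), false, queue);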
template<class KeyIterator, class ValueIterator, class Compare>
inline void merge_sort_by_key_on_gpu(KeyIterator keys_first,
                                     KeyIterator keys_last,
                                     ValueIterator values_first,
                                     Compare compare,
                                     command_queue &queue)
{
    merge_sort_by_key_on_gpu(
        keys_first, keys_last, values_first,
        compare, false /* not stable */, queue
    );
}

template<class Iterator, class Compare>
inline void merge_sort_on_gpu(Iterator first,
                              Iterator last,
                              Compare compare,
                              command_queue &queue)
{
    merge_sort_on_gpu(
        first, last, compare, false /* not stable */, queue
    );
}

} // end detail namespace
} // end compute namespace
} // end boost namespace

#endif /* BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_GPU_HPP_ */