copy_on_device.hpp 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199
  1. //---------------------------------------------------------------------------//
  2. // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
  3. //
  4. // Distributed under the Boost Software License, Version 1.0
  5. // See accompanying file LICENSE_1_0.txt or copy at
  6. // http://www.boost.org/LICENSE_1_0.txt
  7. //
  8. // See http://boostorg.github.com/compute for more information.
  9. //---------------------------------------------------------------------------//
  10. #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP
  11. #define BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP
  12. #include <iterator>
  13. #include <boost/compute/command_queue.hpp>
  14. #include <boost/compute/async/future.hpp>
  15. #include <boost/compute/iterator/buffer_iterator.hpp>
  16. #include <boost/compute/iterator/discard_iterator.hpp>
  17. #include <boost/compute/memory/svm_ptr.hpp>
  18. #include <boost/compute/detail/iterator_range_size.hpp>
  19. #include <boost/compute/detail/meta_kernel.hpp>
  20. #include <boost/compute/detail/parameter_cache.hpp>
  21. #include <boost/compute/detail/work_size.hpp>
  22. #include <boost/compute/detail/vendor.hpp>
  23. namespace boost {
  24. namespace compute {
  25. namespace detail {
  26. template<class InputIterator, class OutputIterator>
  27. inline event copy_on_device_cpu(InputIterator first,
  28. OutputIterator result,
  29. size_t count,
  30. command_queue &queue,
  31. const wait_list &events)
  32. {
  33. meta_kernel k("copy");
  34. const device& device = queue.get_device();
  35. k <<
  36. "uint block = " <<
  37. "(uint)ceil(((float)count)/get_global_size(0));\n" <<
  38. "uint index = get_global_id(0) * block;\n" <<
  39. "uint end = min(count, index + block);\n" <<
  40. "while(index < end){\n" <<
  41. result[k.var<uint_>("index")] << '=' <<
  42. first[k.var<uint_>("index")] << ";\n" <<
  43. "index++;\n" <<
  44. "}\n";
  45. k.add_set_arg<const uint_>("count", static_cast<uint_>(count));
  46. size_t global_work_size = device.compute_units();
  47. if(count <= 1024) global_work_size = 1;
  48. return k.exec_1d(queue, 0, global_work_size, events);
  49. }
  50. template<class InputIterator, class OutputIterator>
  51. inline event copy_on_device_gpu(InputIterator first,
  52. OutputIterator result,
  53. size_t count,
  54. command_queue &queue,
  55. const wait_list &events)
  56. {
  57. typedef typename std::iterator_traits<InputIterator>::value_type input_type;
  58. const device& device = queue.get_device();
  59. boost::shared_ptr<parameter_cache> parameters =
  60. detail::parameter_cache::get_global_cache(device);
  61. std::string cache_key =
  62. "__boost_copy_kernel_" + boost::lexical_cast<std::string>(sizeof(input_type));
  63. uint_ vpt = parameters->get(cache_key, "vpt", 4);
  64. uint_ tpb = parameters->get(cache_key, "tpb", 128);
  65. meta_kernel k("copy");
  66. k <<
  67. "uint index = get_local_id(0) + " <<
  68. "(" << vpt * tpb << " * get_group_id(0));\n" <<
  69. "for(uint i = 0; i < " << vpt << "; i++){\n" <<
  70. " if(index < count){\n" <<
  71. result[k.var<uint_>("index")] << '=' <<
  72. first[k.var<uint_>("index")] << ";\n" <<
  73. " index += " << tpb << ";\n"
  74. " }\n"
  75. "}\n";
  76. k.add_set_arg<const uint_>("count", static_cast<uint_>(count));
  77. size_t global_work_size = calculate_work_size(count, vpt, tpb);
  78. return k.exec_1d(queue, 0, global_work_size, tpb, events);
  79. }
  80. template<class InputIterator, class OutputIterator>
  81. inline event dispatch_copy_on_device(InputIterator first,
  82. InputIterator last,
  83. OutputIterator result,
  84. command_queue &queue,
  85. const wait_list &events)
  86. {
  87. const size_t count = detail::iterator_range_size(first, last);
  88. if(count == 0){
  89. // nothing to do
  90. return event();
  91. }
  92. const device& device = queue.get_device();
  93. // copy_on_device_cpu() does not work for CPU on Apple platform
  94. // due to bug in its compiler.
  95. // See https://github.com/boostorg/compute/pull/626
  96. if((device.type() & device::cpu) && !is_apple_platform_device(device))
  97. {
  98. return copy_on_device_cpu(first, result, count, queue, events);
  99. }
  100. return copy_on_device_gpu(first, result, count, queue, events);
  101. }
  102. template<class InputIterator, class OutputIterator>
  103. inline OutputIterator copy_on_device(InputIterator first,
  104. InputIterator last,
  105. OutputIterator result,
  106. command_queue &queue,
  107. const wait_list &events)
  108. {
  109. dispatch_copy_on_device(first, last, result, queue, events);
  110. return result + std::distance(first, last);
  111. }
  112. template<class InputIterator>
  113. inline discard_iterator copy_on_device(InputIterator first,
  114. InputIterator last,
  115. discard_iterator result,
  116. command_queue &queue,
  117. const wait_list &events)
  118. {
  119. (void) queue;
  120. (void) events;
  121. return result + std::distance(first, last);
  122. }
  123. template<class InputIterator, class OutputIterator>
  124. inline future<OutputIterator> copy_on_device_async(InputIterator first,
  125. InputIterator last,
  126. OutputIterator result,
  127. command_queue &queue,
  128. const wait_list &events)
  129. {
  130. event event_ = dispatch_copy_on_device(first, last, result, queue, events);
  131. return make_future(result + std::distance(first, last), event_);
  132. }
  133. #ifdef BOOST_COMPUTE_CL_VERSION_2_0
  134. // copy_on_device() specialization for svm_ptr
  135. template<class T>
  136. inline svm_ptr<T> copy_on_device(svm_ptr<T> first,
  137. svm_ptr<T> last,
  138. svm_ptr<T> result,
  139. command_queue &queue,
  140. const wait_list &events)
  141. {
  142. size_t count = iterator_range_size(first, last);
  143. if(count == 0){
  144. return result;
  145. }
  146. queue.enqueue_svm_memcpy(
  147. result.get(), first.get(), count * sizeof(T), events
  148. );
  149. return result + count;
  150. }
  151. template<class T>
  152. inline future<svm_ptr<T> > copy_on_device_async(svm_ptr<T> first,
  153. svm_ptr<T> last,
  154. svm_ptr<T> result,
  155. command_queue &queue,
  156. const wait_list &events)
  157. {
  158. size_t count = iterator_range_size(first, last);
  159. if(count == 0){
  160. return future<svm_ptr<T> >();
  161. }
  162. event event_ = queue.enqueue_svm_memcpy_async(
  163. result.get(), first.get(), count * sizeof(T), events
  164. );
  165. return make_future(result + count, event_);
  166. }
  167. #endif // BOOST_COMPUTE_CL_VERSION_2_0
  168. } // end detail namespace
  169. } // end compute namespace
  170. } // end boost namespace
  171. #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP