//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//

#ifndef BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
#define BOOST_COMPUTE_ALGORITHM_REDUCE_HPP

#include <cmath> // for std::ceil
#include <iterator>

#include <boost/static_assert.hpp>

#include <boost/compute/system.hpp>
#include <boost/compute/functional.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/array.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/algorithm/copy_n.hpp>
#include <boost/compute/algorithm/detail/inplace_reduce.hpp>
#include <boost/compute/algorithm/detail/reduce_on_gpu.hpp>
#include <boost/compute/algorithm/detail/reduce_on_cpu.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/memory/local_buffer.hpp>
#include <boost/compute/type_traits/result_of.hpp>
#include <boost/compute/type_traits/is_device_iterator.hpp>

namespace boost {
namespace compute {
namespace detail {

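// Performs one block-wise reduction pass over [first, first + count):
// each work-group of block_size items reduces 2 * block_size input values
// into a single partial result written through `result`, and any leftover
// tail elements are folded in by a separate single-task kernel. Returns the
// number of partial results produced.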
template<class InputIterator, class OutputIterator, class BinaryFunction>
size_t reduce(InputIterator first,
              size_t count,
              OutputIterator result,
              size_t block_size,
              BinaryFunction function,
              command_queue &queue)
{
    typedef typename
        std::iterator_traits<InputIterator>::value_type
        input_type;
    typedef typename
        boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
        result_type;

    const context &context = queue.get_context();
    size_t block_count = count / 2 / block_size;
    size_t total_block_count =
        static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));

    if(block_count != 0){
        meta_kernel k("block_reduce");
        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
        size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");

        k <<
            "const uint gid = get_global_id(0);\n" <<
            "const uint lid = get_local_id(0);\n" <<

            // copy values to local memory
            "block[lid] = " <<
                function(first[k.make_var<uint_>("gid*2+0")],
                         first[k.make_var<uint_>("gid*2+1")]) << ";\n" <<

            // perform reduction
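            // (each pass doubles the stride i: work-items whose local id is a
            //  multiple of 2*i fold in the value i slots away, collapsing the
            //  block_size partial results into block[0] in log2(block_size) steps)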
  61. "for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" <<
  62. " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
  63. " uint mask = (i << 1) - 1;\n" <<
  64. " if((lid & mask) == 0){\n" <<
  65. " block[lid] = " <<
  66. function(k.expr<input_type>("block[lid]"),
  67. k.expr<input_type>("block[lid+i]")) << ";\n" <<
  68. " }\n" <<
  69. "}\n" <<
  70. // write block result to global output
  71. "if(lid == 0)\n" <<
  72. " output[get_group_id(0)] = block[0];\n";
  73. kernel kernel = k.compile(context);
  74. kernel.set_arg(output_arg, result.get_buffer());
  75. kernel.set_arg(block_arg, local_buffer<input_type>(block_size));
  76. queue.enqueue_1d_range_kernel(kernel,
  77. 0,
  78. block_count * block_size,
  79. block_size);
  80. }
  81. // serially reduce any leftovers
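    // (a single-task kernel folds the remaining count - 2 * block_size * block_count
    //  elements into the last partial result slot, output[block_count])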
    if(block_count * block_size * 2 < count){
        size_t last_block_start = block_count * block_size * 2;

        meta_kernel k("extra_serial_reduce");
        size_t count_arg = k.add_arg<uint_>("count");
        size_t offset_arg = k.add_arg<uint_>("offset");
        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
        size_t output_offset_arg = k.add_arg<uint_>("output_offset");

        k <<
            k.decl<result_type>("result") << " = \n" <<
                first[k.expr<uint_>("offset")] << ";\n" <<
            "for(uint i = offset + 1; i < count; i++)\n" <<
            "    result = " <<
                     function(k.var<result_type>("result"),
                              first[k.var<uint_>("i")]) << ";\n" <<
            "output[output_offset] = result;\n";

        kernel kernel = k.compile(context);
        kernel.set_arg(count_arg, static_cast<uint_>(count));
        kernel.set_arg(offset_arg, static_cast<uint_>(last_block_start));
        kernel.set_arg(output_arg, result.get_buffer());
        kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count));

        queue.enqueue_task(kernel);
    }

    return total_block_count;
}
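
// Runs a single block-wise reduce() pass over [first, first + count) and
// returns a device vector holding one partial result per block.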
template<class InputIterator, class BinaryFunction>
inline vector<
    typename boost::compute::result_of<
        BinaryFunction(
            typename std::iterator_traits<InputIterator>::value_type,
            typename std::iterator_traits<InputIterator>::value_type
        )
    >::type
>
block_reduce(InputIterator first,
             size_t count,
             size_t block_size,
             BinaryFunction function,
             command_queue &queue)
{
    typedef typename
        std::iterator_traits<InputIterator>::value_type
        input_type;
    typedef typename
        boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
        result_type;

    const context &context = queue.get_context();
    size_t total_block_count =
        static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));

    vector<result_type> result_vector(total_block_count, context);
    reduce(first, count, result_vector.begin(), block_size, function, queue);

    return result_vector;
}
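
// Reduces [first, last) with `function`. CPU devices are handed to
// reduce_on_cpu(); otherwise a block-wise first pass produces one partial
// result per work-group, the partials are reduced in place on the device,
// and the single value is copied to `result`.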
// Space complexity: O( ceil(n / 2 / 256) )
template<class InputIterator, class OutputIterator, class BinaryFunction>
inline void generic_reduce(InputIterator first,
                           InputIterator last,
                           OutputIterator result,
                           BinaryFunction function,
                           command_queue &queue)
{
    typedef typename
        std::iterator_traits<InputIterator>::value_type
        input_type;
    typedef typename
        boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
        result_type;

    const device &device = queue.get_device();
    const context &context = queue.get_context();

    size_t count = detail::iterator_range_size(first, last);

    if(device.type() & device::cpu){
        array<result_type, 1> value(context);
        detail::reduce_on_cpu(first, last, value.begin(), function, queue);
        boost::compute::copy_n(value.begin(), 1, result, queue);
    }
    else {
        size_t block_size = 256;

        // first pass
        vector<result_type> results = detail::block_reduce(first,
                                                           count,
                                                           block_size,
                                                           function,
                                                           queue);

        if(results.size() > 1){
            detail::inplace_reduce(results.begin(),
                                   results.end(),
                                   function,
                                   queue);
        }

        boost::compute::copy_n(results.begin(), 1, result, queue);
    }
}
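
// plus<T> has specialized device paths: dispatch to reduce_on_cpu() or
// reduce_on_gpu() and copy the single result to the output iterator.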
template<class InputIterator, class OutputIterator, class T>
inline void dispatch_reduce(InputIterator first,
                            InputIterator last,
                            OutputIterator result,
                            const plus<T> &function,
                            command_queue &queue)
{
    const context &context = queue.get_context();
    const device &device = queue.get_device();

    // reduce to temporary buffer on device
    array<T, 1> value(context);
    if(device.type() & device::cpu){
        detail::reduce_on_cpu(first, last, value.begin(), function, queue);
    }
    else {
        reduce_on_gpu(first, last, value.begin(), function, queue);
    }

    // copy to result iterator
    copy_n(value.begin(), 1, result, queue);
}
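
// generic fallback: any other binary function goes through generic_reduce()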
template<class InputIterator, class OutputIterator, class BinaryFunction>
inline void dispatch_reduce(InputIterator first,
                            InputIterator last,
                            OutputIterator result,
                            BinaryFunction function,
                            command_queue &queue)
{
    generic_reduce(first, last, result, function, queue);
}

} // end detail namespace

/// Returns the result of applying \p function to the elements in the
/// range [\p first, \p last).
///
/// If no function is specified, \c plus will be used.
///
/// \param first first element in the input range
/// \param last end of the input range (one past the last element)
/// \param result iterator pointing to the output
/// \param function binary reduction function
/// \param queue command queue to perform the operation
///
/// The \c reduce() algorithm assumes that the binary reduction function is
/// associative. When used with non-associative functions the result may
/// be non-deterministic and vary in precision. Notably this affects the
/// \c plus<float>() function as floating-point addition is not associative
/// and may produce slightly different results than a serial algorithm.
///
/// This algorithm supports both host and device iterators for the
/// result argument. This allows for values to be reduced and copied
/// to the host all with a single function call.
///
/// For example, to calculate the sum of the values in a device vector and
/// copy the result to a value on the host:
///
/// \snippet test/test_reduce.cpp sum_int
///
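/// A reduction with a custom binary function looks like the sketch below
/// (illustrative only, not taken from the test suite; it assumes a populated
/// \c vector<int> named \c vec and a \c command_queue named \c queue):
///
/// \code
/// int product = 1;
/// boost::compute::reduce(
///     vec.begin(), vec.end(), &product, boost::compute::multiplies<int>(), queue
/// );
/// \endcode
///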
/// Note that while the \c reduce() algorithm is conceptually identical to
/// the \c accumulate() algorithm, its implementation is substantially more
/// efficient on parallel hardware. For more information, see the documentation
/// on the \c accumulate() algorithm.
///
/// Space complexity on GPUs: \Omega(n)<br>
/// Space complexity on CPUs: \Omega(1)
///
/// \see accumulate()
template<class InputIterator, class OutputIterator, class BinaryFunction>
inline void reduce(InputIterator first,
                   InputIterator last,
                   OutputIterator result,
                   BinaryFunction function,
                   command_queue &queue = system::default_queue())
{
    BOOST_STATIC_ASSERT(is_device_iterator<InputIterator>::value);

    if(first == last){
        return;
    }

    detail::dispatch_reduce(first, last, result, function, queue);
}

/// \overload
template<class InputIterator, class OutputIterator>
inline void reduce(InputIterator first,
                   InputIterator last,
                   OutputIterator result,
                   command_queue &queue = system::default_queue())
{
    BOOST_STATIC_ASSERT(is_device_iterator<InputIterator>::value);
    typedef typename std::iterator_traits<InputIterator>::value_type T;

    if(first == last){
        return;
    }

    detail::dispatch_reduce(first, last, result, plus<T>(), queue);
}

} // end compute namespace
} // end boost namespace

#endif // BOOST_COMPUTE_ALGORITHM_REDUCE_HPP