// reduce_on_gpu.hpp
  1. //---------------------------------------------------------------------------//
  2. // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
  3. //
  4. // Distributed under the Boost Software License, Version 1.0
  5. // See accompanying file LICENSE_1_0.txt or copy at
  6. // http://www.boost.org/LICENSE_1_0.txt
  7. //
  8. // See http://boostorg.github.com/compute for more information.
  9. //---------------------------------------------------------------------------//
  10. #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
  11. #define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
#include <cmath>
#include <cstddef>
#include <iterator>
#include <sstream>
#include <string>

#include <boost/compute/command_queue.hpp>
#include <boost/compute/program.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/parameter_cache.hpp>
#include <boost/compute/detail/vendor.hpp>
#include <boost/compute/detail/work_size.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/utility/program_cache.hpp>
#include <boost/compute/utility/source.hpp>
  22. namespace boost {
  23. namespace compute {
  24. namespace detail {
  25. /// \internal
  26. /// body reduction inside a warp
  27. template<typename T,bool isNvidiaDevice>
  28. struct ReduceBody
  29. {
  30. static std::string body()
  31. {
  32. std::stringstream k;
  33. // local reduction
  34. k << "for(int i = 1; i < TPB; i <<= 1){\n" <<
  35. " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
  36. " uint mask = (i << 1) - 1;\n" <<
  37. " if((lid & mask) == 0){\n" <<
  38. " scratch[lid] += scratch[lid+i];\n" <<
  39. " }\n" <<
  40. "}\n";
  41. return k.str();
  42. }
  43. };
  44. /// \internal
  45. /// body reduction inside a warp
  46. /// for nvidia device we can use the "unsafe"
  47. /// memory optimisation
  48. template<typename T>
  49. struct ReduceBody<T,true>
  50. {
  51. static std::string body()
  52. {
  53. std::stringstream k;
  54. // local reduction
  55. // we use TPB to compile only useful instruction
  56. // local reduction when size is greater than warp size
  57. k << "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
  58. "if(TPB >= 1024){\n" <<
  59. "if(lid < 512) { sum += scratch[lid + 512]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
  60. "if(TPB >= 512){\n" <<
  61. "if(lid < 256) { sum += scratch[lid + 256]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
  62. "if(TPB >= 256){\n" <<
  63. "if(lid < 128) { sum += scratch[lid + 128]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
  64. "if(TPB >= 128){\n" <<
  65. "if(lid < 64) { sum += scratch[lid + 64]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);} \n" <<
  66. // warp reduction
  67. "if(lid < 32){\n" <<
  68. // volatile this way we don't need any barrier
  69. "volatile __local " << type_name<T>() << " *lmem = scratch;\n" <<
  70. "if(TPB >= 64) { lmem[lid] = sum = sum + lmem[lid+32];} \n" <<
  71. "if(TPB >= 32) { lmem[lid] = sum = sum + lmem[lid+16];} \n" <<
  72. "if(TPB >= 16) { lmem[lid] = sum = sum + lmem[lid+ 8];} \n" <<
  73. "if(TPB >= 8) { lmem[lid] = sum = sum + lmem[lid+ 4];} \n" <<
  74. "if(TPB >= 4) { lmem[lid] = sum = sum + lmem[lid+ 2];} \n" <<
  75. "if(TPB >= 2) { lmem[lid] = sum = sum + lmem[lid+ 1];} \n" <<
  76. "}\n";
  77. return k.str();
  78. }
  79. };
  80. template<class InputIterator, class Function>
  81. inline void initial_reduce(InputIterator first,
  82. InputIterator last,
  83. buffer result,
  84. const Function &function,
  85. kernel &reduce_kernel,
  86. const uint_ vpt,
  87. const uint_ tpb,
  88. command_queue &queue)
  89. {
  90. (void) function;
  91. (void) reduce_kernel;
  92. typedef typename std::iterator_traits<InputIterator>::value_type Arg;
  93. typedef typename boost::tr1_result_of<Function(Arg, Arg)>::type T;
  94. size_t count = std::distance(first, last);
  95. detail::meta_kernel k("initial_reduce");
  96. k.add_set_arg<const uint_>("count", uint_(count));
  97. size_t output_arg = k.add_arg<T *>(memory_object::global_memory, "output");
  98. k <<
  99. k.decl<const uint_>("offset") << " = get_group_id(0) * VPT * TPB;\n" <<
  100. k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
  101. "__local " << type_name<T>() << " scratch[TPB];\n" <<
  102. // private reduction
  103. k.decl<T>("sum") << " = 0;\n" <<
  104. "for(uint i = 0; i < VPT; i++){\n" <<
  105. " if(offset + lid + i*TPB < count){\n" <<
  106. " sum = sum + " << first[k.var<uint_>("offset+lid+i*TPB")] << ";\n" <<
  107. " }\n" <<
  108. "}\n" <<
  109. "scratch[lid] = sum;\n" <<
  110. // local reduction
  111. ReduceBody<T,false>::body() <<
  112. // write sum to output
  113. "if(lid == 0){\n" <<
  114. " output[get_group_id(0)] = scratch[0];\n" <<
  115. "}\n";
  116. const context &context = queue.get_context();
  117. std::stringstream options;
  118. options << "-DVPT=" << vpt << " -DTPB=" << tpb;
  119. kernel generic_reduce_kernel = k.compile(context, options.str());
  120. generic_reduce_kernel.set_arg(output_arg, result);
  121. size_t work_size = calculate_work_size(count, vpt, tpb);
  122. queue.enqueue_1d_range_kernel(generic_reduce_kernel, 0, work_size, tpb);
  123. }
  124. template<class T>
  125. inline void initial_reduce(const buffer_iterator<T> &first,
  126. const buffer_iterator<T> &last,
  127. const buffer &result,
  128. const plus<T> &function,
  129. kernel &reduce_kernel,
  130. const uint_ vpt,
  131. const uint_ tpb,
  132. command_queue &queue)
  133. {
  134. (void) function;
  135. size_t count = std::distance(first, last);
  136. reduce_kernel.set_arg(0, first.get_buffer());
  137. reduce_kernel.set_arg(1, uint_(first.get_index()));
  138. reduce_kernel.set_arg(2, uint_(count));
  139. reduce_kernel.set_arg(3, result);
  140. reduce_kernel.set_arg(4, uint_(0));
  141. size_t work_size = calculate_work_size(count, vpt, tpb);
  142. queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);
  143. }
/// \internal
/// Reduces [first, last) with \p function on a GPU device, writing the
/// single result through \p result. Strategy: initial_reduce() produces
/// one partial sum per work-group, then the pre-built "reduce" kernel is
/// applied repeatedly between two ping/pong buffers until one work-group
/// suffices, and a final single-group launch writes to \p result.
template<class InputIterator, class T, class Function>
inline void reduce_on_gpu(InputIterator first,
                          InputIterator last,
                          buffer_iterator<T> result,
                          Function function,
                          command_queue &queue)
{
    const device &device = queue.get_device();
    const context &context = queue.get_context();

    // build the source of the reusable "reduce" kernel:
    // (input, offset, count, output, output_offset) ->
    //     output[output_offset + group_id] = sum of the group's block
    detail::meta_kernel k("reduce");
    k.add_arg<const T*>(memory_object::global_memory, "input");
    k.add_arg<const uint_>("offset");
    k.add_arg<const uint_>("count");
    k.add_arg<T*>(memory_object::global_memory, "output");
    k.add_arg<const uint_>("output_offset");
    k <<
        // each work-group reduces a block of VPT*TPB consecutive values
        k.decl<const uint_>("block_offset") << " = get_group_id(0) * VPT * TPB;\n" <<
        "__global const " << type_name<T>() << " *block = input + offset + block_offset;\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
        "__local " << type_name<T>() << " scratch[TPB];\n" <<
        // private reduction: each thread sums VPT strided elements
        k.decl<T>("sum") << " = 0;\n" <<
        "for(uint i = 0; i < VPT; i++){\n" <<
        " if(block_offset + lid + i*TPB < count){\n" <<
        " sum = sum + block[lid+i*TPB]; \n" <<
        " }\n" <<
        "}\n" <<
        "scratch[lid] = sum;\n";

    // discrimination on vendor name: nvidia devices get the
    // warp-synchronous body, everything else the barrier-based one
    if(is_nvidia_device(device))
        k << ReduceBody<T,true>::body();
    else
        k << ReduceBody<T,false>::body();

    k <<
        // write sum to output
        "if(lid == 0){\n" <<
        " output[output_offset + get_group_id(0)] = scratch[0];\n" <<
        "}\n";

    // cache key is shared by the program cache and the parameter cache
    std::string cache_key = std::string("__boost_reduce_on_gpu_") + type_name<T>();

    // load tuning parameters (with defaults: 8 values per thread,
    // 128 threads per block)
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    uint_ vpt = parameters->get(cache_key, "vpt", 8);
    uint_ tpb = parameters->get(cache_key, "tpb", 128);

    // reduce program compiler flags
    std::stringstream options;
    options << "-DT=" << type_name<T>()
            << " -DVPT=" << vpt
            << " -DTPB=" << tpb;

    // load program (built once per context/options, then cached)
    boost::shared_ptr<program_cache> cache =
        program_cache::get_global_cache(context);

    program reduce_program = cache->get_or_build(
        cache_key, options.str(), k.source(), context
    );

    // create reduce kernel
    kernel reduce_kernel(reduce_program, "reduce");

    size_t count = std::distance(first, last);

    // first pass, reduce from input to ping
    // (one partial sum per work-group -> ceil(count/(vpt*tpb)) elements)
    buffer ping(context, std::ceil(float(count) / vpt / tpb) * sizeof(T));
    initial_reduce(first, last, ping, function, reduce_kernel, vpt, tpb, queue);

    // update count after initial reduce
    count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb));

    // middle pass(es), reduce between ping and pong
    const buffer *input_buffer = &ping;
    // NOTE(review): when count <= vpt*tpb this requests a zero-size
    // buffer; presumably harmless because pong is never used in that
    // case — confirm buffer tolerates size 0 on all platforms
    buffer pong(context, static_cast<size_t>(count / vpt / tpb * sizeof(T)));
    const buffer *output_buffer = &pong;
    if(count > vpt * tpb){
        while(count > vpt * tpb){
            reduce_kernel.set_arg(0, *input_buffer);
            reduce_kernel.set_arg(1, uint_(0));
            reduce_kernel.set_arg(2, uint_(count));
            reduce_kernel.set_arg(3, *output_buffer);
            reduce_kernel.set_arg(4, uint_(0));
            // round the global size up to a multiple of the group size
            size_t work_size = static_cast<size_t>(std::ceil(float(count) / vpt));
            if(work_size % tpb != 0){
                work_size += tpb - work_size % tpb;
            }
            queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);
            // swap roles and shrink count for the next pass
            std::swap(input_buffer, output_buffer);
            count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb));
        }
    }

    // final pass, reduce from ping/pong to result with a single
    // work-group, honoring the result iterator's offset
    reduce_kernel.set_arg(0, *input_buffer);
    reduce_kernel.set_arg(1, uint_(0));
    reduce_kernel.set_arg(2, uint_(count));
    reduce_kernel.set_arg(3, result.get_buffer());
    reduce_kernel.set_arg(4, uint_(result.get_index()));
    queue.enqueue_1d_range_kernel(reduce_kernel, 0, tpb, tpb);
}
  235. } // end detail namespace
  236. } // end compute namespace
  237. } // end boost namespace
  238. #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP