// boost/compute/algorithm/detail/inplace_reduce.hpp
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
  10. #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP
  11. #define BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP
#include <iterator>

#include <boost/utility/result_of.hpp>

#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/memory/local_buffer.hpp>
  18. namespace boost {
  19. namespace compute {
  20. namespace detail {
  21. template<class Iterator, class BinaryFunction>
  22. inline void inplace_reduce(Iterator first,
  23. Iterator last,
  24. BinaryFunction function,
  25. command_queue &queue)
  26. {
  27. typedef typename
  28. std::iterator_traits<Iterator>::value_type
  29. value_type;
  30. size_t input_size = iterator_range_size(first, last);
  31. if(input_size < 2){
  32. return;
  33. }
  34. const context &context = queue.get_context();
  35. size_t block_size = 64;
  36. size_t values_per_thread = 8;
  37. size_t block_count = input_size / (block_size * values_per_thread);
  38. if(block_count * block_size * values_per_thread != input_size)
  39. block_count++;
  40. vector<value_type> output(block_count, context);
  41. meta_kernel k("inplace_reduce");
  42. size_t input_arg = k.add_arg<value_type *>(memory_object::global_memory, "input");
  43. size_t input_size_arg = k.add_arg<const uint_>("input_size");
  44. size_t output_arg = k.add_arg<value_type *>(memory_object::global_memory, "output");
  45. size_t scratch_arg = k.add_arg<value_type *>(memory_object::local_memory, "scratch");
  46. k <<
  47. "const uint gid = get_global_id(0);\n" <<
  48. "const uint lid = get_local_id(0);\n" <<
  49. "const uint values_per_thread =\n"
  50. << uint_(values_per_thread) << ";\n" <<
  51. // thread reduce
  52. "const uint index = gid * values_per_thread;\n" <<
  53. "if(index < input_size){\n" <<
  54. k.decl<value_type>("sum") << " = input[index];\n" <<
  55. "for(uint i = 1;\n" <<
  56. "i < values_per_thread && (index + i) < input_size;\n" <<
  57. "i++){\n" <<
  58. " sum = " <<
  59. function(k.var<value_type>("sum"),
  60. k.var<value_type>("input[index+i]")) << ";\n" <<
  61. "}\n" <<
  62. "scratch[lid] = sum;\n" <<
  63. "}\n" <<
  64. // local reduce
  65. "for(uint i = 1; i < get_local_size(0); i <<= 1){\n" <<
  66. " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
  67. " uint mask = (i << 1) - 1;\n" <<
  68. " uint next_index = (gid + i) * values_per_thread;\n"
  69. " if((lid & mask) == 0 && next_index < input_size){\n" <<
  70. " scratch[lid] = " <<
  71. function(k.var<value_type>("scratch[lid]"),
  72. k.var<value_type>("scratch[lid+i]")) << ";\n" <<
  73. " }\n" <<
  74. "}\n" <<
  75. // write output for block
  76. "if(lid == 0){\n" <<
  77. " output[get_group_id(0)] = scratch[0];\n" <<
  78. "}\n"
  79. ;
  80. const buffer *input_buffer = &first.get_buffer();
  81. const buffer *output_buffer = &output.get_buffer();
  82. kernel kernel = k.compile(context);
  83. while(input_size > 1){
  84. kernel.set_arg(input_arg, *input_buffer);
  85. kernel.set_arg(input_size_arg, static_cast<uint_>(input_size));
  86. kernel.set_arg(output_arg, *output_buffer);
  87. kernel.set_arg(scratch_arg, local_buffer<value_type>(block_size));
  88. queue.enqueue_1d_range_kernel(kernel,
  89. 0,
  90. block_count * block_size,
  91. block_size);
  92. input_size =
  93. static_cast<size_t>(
  94. std::ceil(float(input_size) / (block_size * values_per_thread)
  95. )
  96. );
  97. block_count = input_size / (block_size * values_per_thread);
  98. if(block_count * block_size * values_per_thread != input_size)
  99. block_count++;
  100. std::swap(input_buffer, output_buffer);
  101. }
  102. if(input_buffer != &first.get_buffer()){
  103. ::boost::compute::copy(output.begin(),
  104. output.begin() + 1,
  105. first,
  106. queue);
  107. }
  108. }
  109. } // end detail namespace
  110. } // end compute namespace
  111. } // end boost namespace
  112. #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP