// matrix_transpose.cpp — Boost.Compute matrix-transpose benchmark example
//---------------------------------------------------------------------------//
// Copyright (c) 2014 Benoit Dequidt <benoit.dequidt@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <vector>

#include <boost/program_options.hpp>

#include <boost/compute/core.hpp>
#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/utility/source.hpp>
  18. namespace compute = boost::compute;
  19. namespace po = boost::program_options;
  20. using compute::uint_;
  21. const uint_ TILE_DIM = 32;
  22. const uint_ BLOCK_ROWS = 8;
  23. // generate a copy kernel program
  24. compute::kernel make_copy_kernel(const compute::context& context)
  25. {
  26. // source for the copy_kernel program
  27. const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
  28. __kernel void copy_kernel(__global const float *src, __global float *dst)
  29. {
  30. uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
  31. uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
  32. uint width = get_num_groups(0) * TILE_DIM;
  33. for(uint i = 0 ; i < TILE_DIM ; i+= BLOCK_ROWS){
  34. dst[(y+i)*width +x] = src[(y+i)*width + x];
  35. }
  36. }
  37. );
  38. // setup compilation flags for the copy program
  39. std::stringstream options;
  40. options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;
  41. // create and build the copy program
  42. compute::program program =
  43. compute::program::build_with_source(source, context, options.str());
  44. // create and return the copy kernel
  45. return program.create_kernel("copy_kernel");
  46. }
  47. // generate a naive transpose kernel
  48. compute::kernel make_naive_transpose_kernel(const compute::context& context)
  49. {
  50. // source for the naive_transpose kernel
  51. const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
  52. __kernel void naive_transpose(__global const float *src, __global float *dst)
  53. {
  54. uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
  55. uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
  56. uint width = get_num_groups(0) * TILE_DIM;
  57. for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS){
  58. dst[x*width + y+i] = src[(y+i)*width + x];
  59. }
  60. }
  61. );
  62. // setup compilation flags for the naive_transpose program
  63. std::stringstream options;
  64. options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;
  65. // create and build the naive_transpose program
  66. compute::program program =
  67. compute::program::build_with_source(source, context, options.str());
  68. // create and return the naive_transpose kernel
  69. return program.create_kernel("naive_transpose");
  70. }
  71. // generates a coalesced transpose kernel
  72. compute::kernel make_coalesced_transpose_kernel(const compute::context& context)
  73. {
  74. // source for the coalesced_transpose kernel
  75. const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
  76. __kernel void coalesced_transpose(__global const float *src, __global float *dst)
  77. {
  78. __local float tile[TILE_DIM][TILE_DIM];
  79. // compute indexes
  80. uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
  81. uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
  82. uint width = get_num_groups(0) * TILE_DIM;
  83. // load inside local memory
  84. for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS){
  85. tile[get_local_id(1)+i][get_local_id(0)] = src[(y+i)*width + x];
  86. }
  87. barrier(CLK_LOCAL_MEM_FENCE);
  88. // transpose indexes
  89. x = get_group_id(1) * TILE_DIM + get_local_id(0);
  90. y = get_group_id(0) * TILE_DIM + get_local_id(1);
  91. // write output from local memory
  92. for(uint i = 0 ; i < TILE_DIM ; i+=BLOCK_ROWS){
  93. dst[(y+i)*width + x] = tile[get_local_id(0)][get_local_id(1)+i];
  94. }
  95. }
  96. );
  97. // setup compilation flags for the coalesced_transpose program
  98. std::stringstream options;
  99. options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;
  100. // create and build the coalesced_transpose program
  101. compute::program program =
  102. compute::program::build_with_source(source, context, options.str());
  103. // create and return coalesced_transpose kernel
  104. return program.create_kernel("coalesced_transpose");
  105. }
  106. // generate a coalesced withtout bank conflicts kernel
  107. compute::kernel make_coalesced_no_bank_conflicts_kernel(const compute::context& context)
  108. {
  109. const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
  110. __kernel void coalesced_no_bank_conflicts(__global const float *src, __global float *dst)
  111. {
  112. // TILE_DIM+1 is here to avoid bank conflicts in local memory
  113. __local float tile[TILE_DIM][TILE_DIM+1];
  114. // compute indexes
  115. uint x = get_group_id(0) * TILE_DIM + get_local_id(0);
  116. uint y = get_group_id(1) * TILE_DIM + get_local_id(1);
  117. uint width = get_num_groups(0) * TILE_DIM;
  118. // load inside local memory
  119. for(uint i = 0 ; i < TILE_DIM; i+= BLOCK_ROWS){
  120. tile[get_local_id(1)+i][get_local_id(0)] = src[(y+i)*width + x];
  121. }
  122. barrier(CLK_LOCAL_MEM_FENCE);
  123. // transpose indexes
  124. x = get_group_id(1) * TILE_DIM + get_local_id(0);
  125. y = get_group_id(0) * TILE_DIM + get_local_id(1);
  126. // write output from local memory
  127. for(uint i = 0 ; i < TILE_DIM ; i+=BLOCK_ROWS){
  128. dst[(y+i)*width + x] = tile[get_local_id(0)][get_local_id(1)+i];
  129. }
  130. }
  131. );
  132. // setup compilation flags for the coalesced_no_bank_conflicts program
  133. std::stringstream options;
  134. options << "-DTILE_DIM=" << TILE_DIM << " -DBLOCK_ROWS=" << BLOCK_ROWS;
  135. // create and build the coalesced_no_bank_conflicts program
  136. compute::program program =
  137. compute::program::build_with_source(source, context, options.str());
  138. // create and return the coalesced_no_bank_conflicts kernel
  139. return program.create_kernel("coalesced_no_bank_conflicts");
  140. }
  141. // compare 'expectedResult' to 'transposedMatrix'. prints an error message if not equal.
  142. bool check_transposition(const std::vector<float>& expectedResult,
  143. uint_ size,
  144. const std::vector<float>& transposedMatrix)
  145. {
  146. for(uint_ i = 0 ; i < size ; ++i){
  147. if(expectedResult[i] != transposedMatrix[i]){
  148. std::cout << "idx = " << i << " , expected " << expectedResult[i]
  149. << " , got " << transposedMatrix[i] << std::endl;
  150. std::cout << "FAILED" << std::endl;
  151. return false;
  152. }
  153. }
  154. return true;
  155. }
  156. // generate a matrix inside 'in' and do the tranposition inside 'out'
  157. void generate_matrix(std::vector<float>& in, std::vector<float>& out, uint_ rows, uint_ cols)
  158. {
  159. // generate a matrix
  160. for(uint_ i = 0 ; i < rows ; ++i){
  161. for(uint_ j = 0 ; j < cols ; ++j){
  162. in[i*cols + j] = i*cols + j;
  163. }
  164. }
  165. // store transposed result
  166. for(uint_ j = 0; j < cols ; ++j){
  167. for(uint_ i = 0 ; i < rows ; ++i){
  168. out[j*rows + i] = in[i*cols + j];
  169. }
  170. }
  171. }
  172. // neccessary for 64-bit integer on win32
  173. #ifdef _WIN32
  174. #define uint64_t unsigned __int64
  175. #endif
  176. int main(int argc, char *argv[])
  177. {
  178. // setup command line arguments
  179. po::options_description options("options");
  180. options.add_options()
  181. ("help", "show usage instructions")
  182. ("rows", po::value<uint_>()->default_value(4096), "number of matrix rows")
  183. ("cols", po::value<uint_>()->default_value(4096), "number of matrix columns")
  184. ;
  185. // parse command line
  186. po::variables_map vm;
  187. po::store(po::parse_command_line(argc, argv, options), vm);
  188. po::notify(vm);
  189. // check command line arguments
  190. if(vm.count("help")){
  191. std::cout << options << std::endl;
  192. return 0;
  193. }
  194. // get number rows and columns for the matrix
  195. const uint_ rows = vm["rows"].as<uint_>();
  196. const uint_ cols = vm["cols"].as<uint_>();
  197. // get the default device
  198. compute::device device = compute::system::default_device();
  199. // print out device name and matrix information
  200. std::cout << "Device: " << device.name() << std::endl;
  201. std::cout << "Matrix Size: " << rows << "x" << cols << std::endl;
  202. std::cout << "Grid Size: " << rows/TILE_DIM << "x" << cols/TILE_DIM << " blocks" << std::endl;
  203. std::cout << "Local Size: " << TILE_DIM << "x" << BLOCK_ROWS << " threads" << std::endl;
  204. std::cout << std::endl;
  205. // On OSX this example does not work on CPU devices
  206. #if defined(__APPLE__)
  207. if(device.type() & compute::device::cpu) {
  208. std::cout << "On OSX this example does not work on CPU devices" << std::endl;
  209. return 0;
  210. }
  211. #endif
  212. const size_t global_work_size[2] = {rows, cols*BLOCK_ROWS/TILE_DIM};
  213. const size_t local_work_size[2] = {TILE_DIM, BLOCK_ROWS};
  214. // setup input data on the host
  215. const uint_ size = rows * cols;
  216. std::vector<float> h_input(size);
  217. std::vector<float> h_output(size);
  218. std::vector<float> expectedResult(size);
  219. generate_matrix(h_input, expectedResult, rows, cols);
  220. // create a context for the device
  221. compute::context context(device);
  222. // device vectors
  223. compute::vector<float> d_input(size, context);
  224. compute::vector<float> d_output(size, context);
  225. // command_queue with profiling
  226. compute::command_queue queue(context, device, compute::command_queue::enable_profiling);
  227. // copy input data
  228. compute::copy(h_input.begin(), h_input.end(), d_input.begin(), queue);
  229. // simple copy kernel
  230. std::cout << "Testing copy_kernel:" << std::endl;
  231. compute::kernel kernel = make_copy_kernel(context);
  232. kernel.set_arg(0, d_input);
  233. kernel.set_arg(1, d_output);
  234. compute::event start;
  235. start = queue.enqueue_nd_range_kernel(kernel, 2, 0, global_work_size, local_work_size);
  236. queue.finish();
  237. uint64_t elapsed = start.duration<boost::chrono::nanoseconds>().count();
  238. std::cout << " Elapsed: " << elapsed << " ns" << std::endl;
  239. std::cout << " BandWidth: " << 2*rows*cols*sizeof(float) / elapsed << " GB/s" << std::endl;
  240. compute::copy(d_output.begin(), d_output.end(), h_output.begin(), queue);
  241. check_transposition(h_input, rows*cols, h_output);
  242. std::cout << std::endl;
  243. // naive_transpose kernel
  244. std::cout << "Testing naive_transpose:" << std::endl;
  245. kernel = make_naive_transpose_kernel(context);
  246. kernel.set_arg(0, d_input);
  247. kernel.set_arg(1, d_output);
  248. start = queue.enqueue_nd_range_kernel(kernel, 2, 0, global_work_size, local_work_size);
  249. queue.finish();
  250. elapsed = start.duration<boost::chrono::nanoseconds>().count();
  251. std::cout << " Elapsed: " << elapsed << " ns" << std::endl;
  252. std::cout << " BandWidth: " << 2*rows*cols*sizeof(float) / elapsed << " GB/s" << std::endl;
  253. compute::copy(d_output.begin(), d_output.end(), h_output.begin(), queue);
  254. check_transposition(expectedResult, rows*cols, h_output);
  255. std::cout << std::endl;
  256. // coalesced_transpose kernel
  257. std::cout << "Testing coalesced_transpose:" << std::endl;
  258. kernel = make_coalesced_transpose_kernel(context);
  259. kernel.set_arg(0, d_input);
  260. kernel.set_arg(1, d_output);
  261. start = queue.enqueue_nd_range_kernel(kernel, 2, 0, global_work_size, local_work_size);
  262. queue.finish();
  263. elapsed = start.duration<boost::chrono::nanoseconds>().count();
  264. std::cout << " Elapsed: " << elapsed << " ns" << std::endl;
  265. std::cout << " BandWidth: " << 2*rows*cols*sizeof(float) / elapsed << " GB/s" << std::endl;
  266. compute::copy(d_output.begin(), d_output.end(), h_output.begin(), queue);
  267. check_transposition(expectedResult, rows*cols, h_output);
  268. std::cout << std::endl;
  269. // coalesced_no_bank_conflicts kernel
  270. std::cout << "Testing coalesced_no_bank_conflicts:" << std::endl;
  271. kernel = make_coalesced_no_bank_conflicts_kernel(context);
  272. kernel.set_arg(0, d_input);
  273. kernel.set_arg(1, d_output);
  274. start = queue.enqueue_nd_range_kernel(kernel, 2, 0, global_work_size, local_work_size);
  275. queue.finish();
  276. elapsed = start.duration<boost::chrono::nanoseconds>().count();
  277. std::cout << " Elapsed: " << elapsed << " ns" << std::endl;
  278. std::cout << " BandWidth: " << 2*rows*cols*sizeof(float) / elapsed << " GB/s" << std::endl;
  279. compute::copy(d_output.begin(), d_output.end(), h_output.begin(), queue);
  280. check_transposition(expectedResult, rows*cols, h_output);
  281. std::cout << std::endl;
  282. return 0;
  283. }