// copy.hpp
  1. //---------------------------------------------------------------------------//
  2. // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
  3. //
  4. // Distributed under the Boost Software License, Version 1.0
  5. // See accompanying file LICENSE_1_0.txt or copy at
  6. // http://www.boost.org/LICENSE_1_0.txt
  7. //
  8. // See http://boostorg.github.com/compute for more information.
  9. //---------------------------------------------------------------------------//
  10. #ifndef BOOST_COMPUTE_ALGORITHM_COPY_HPP
  11. #define BOOST_COMPUTE_ALGORITHM_COPY_HPP
  12. #include <algorithm>
  13. #include <iterator>
  14. #include <boost/utility/enable_if.hpp>
  15. #include <boost/mpl/and.hpp>
  16. #include <boost/mpl/not.hpp>
  17. #include <boost/mpl/or.hpp>
  18. #include <boost/compute/buffer.hpp>
  19. #include <boost/compute/system.hpp>
  20. #include <boost/compute/command_queue.hpp>
  21. #include <boost/compute/algorithm/detail/copy_on_device.hpp>
  22. #include <boost/compute/algorithm/detail/copy_to_device.hpp>
  23. #include <boost/compute/algorithm/detail/copy_to_host.hpp>
  24. #include <boost/compute/async/future.hpp>
  25. #include <boost/compute/container/mapped_view.hpp>
  26. #include <boost/compute/detail/device_ptr.hpp>
  27. #include <boost/compute/detail/is_contiguous_iterator.hpp>
  28. #include <boost/compute/detail/iterator_range_size.hpp>
  29. #include <boost/compute/detail/parameter_cache.hpp>
  30. #include <boost/compute/iterator/buffer_iterator.hpp>
  31. #include <boost/compute/type_traits/type_name.hpp>
  32. #include <boost/compute/type_traits/is_device_iterator.hpp>
  33. namespace boost {
  34. namespace compute {
  35. namespace detail {
  36. namespace mpl = boost::mpl;
// meta-function returning true if copy() between InputIterator and
// OutputIterator can be implemented with clEnqueueCopyBuffer().
//
// This holds only when both iterators are plain buffer_iterator's (or
// device_ptr's) over the same value_type, so the copy is a raw
// offset-to-offset buffer transfer with no conversion and no index
// remapping.
template<class InputIterator, class OutputIterator>
struct can_copy_with_copy_buffer :
    mpl::and_<
        // input side is a buffer_iterator or a device_ptr
        mpl::or_<
            boost::is_same<
                InputIterator,
                buffer_iterator<typename InputIterator::value_type>
            >,
            boost::is_same<
                InputIterator,
                detail::device_ptr<typename InputIterator::value_type>
            >
        >,
        // output side is a buffer_iterator or a device_ptr
        mpl::or_<
            boost::is_same<
                OutputIterator,
                buffer_iterator<typename OutputIterator::value_type>
            >,
            boost::is_same<
                OutputIterator,
                detail::device_ptr<typename OutputIterator::value_type>
            >
        >,
        // element types must match exactly; a raw buffer copy cannot
        // perform any conversion
        boost::is_same<
            typename InputIterator::value_type,
            typename OutputIterator::value_type
        >
    >::type {};
  67. // meta-function returning true if value_types of HostIterator and
  68. // DeviceIterator are same
  69. template<class HostIterator, class DeviceIterator>
  70. struct is_same_value_type :
  71. boost::is_same<
  72. typename boost::remove_cv<
  73. typename std::iterator_traits<HostIterator>::value_type
  74. >::type,
  75. typename boost::remove_cv<
  76. typename DeviceIterator::value_type
  77. >::type
  78. >::type {};
  79. // meta-function returning true if value_type of HostIterator is bool
  80. template<class HostIterator>
  81. struct is_bool_value_type :
  82. boost::is_same<
  83. typename boost::remove_cv<
  84. typename std::iterator_traits<HostIterator>::value_type
  85. >::type,
  86. bool
  87. >::type {};
// host -> device (async)
// Matching value_types on both sides: the bytes can be transferred
// directly via copy_to_device_async() with no conversion step.
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
dispatch_copy_async(InputIterator first,
                    InputIterator last,
                    OutputIterator result,
                    command_queue &queue,
                    const wait_list &events,
                    typename boost::enable_if<
                        mpl::and_<
                            mpl::not_<
                                is_device_iterator<InputIterator>
                            >,
                            is_device_iterator<OutputIterator>,
                            is_same_value_type<InputIterator, OutputIterator>
                        >
                    >::type* = 0)
{
    // the asynchronous transfer reads the host range after this call
    // returns, so the source must be a single contiguous memory block
    BOOST_STATIC_ASSERT_MSG(
        is_contiguous_iterator<InputIterator>::value,
        "copy_async() is only supported for contiguous host iterators"
    );

    return copy_to_device_async(first, last, result, queue, events);
}
// host -> device (async)
// Type mismatch between InputIterator and OutputIterator value_types:
// the host range is wrapped in a mapped_view and a conversion kernel
// is run on the device to copy & cast the elements.
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
dispatch_copy_async(InputIterator first,
                    InputIterator last,
                    OutputIterator result,
                    command_queue &queue,
                    const wait_list &events,
                    typename boost::enable_if<
                        mpl::and_<
                            mpl::not_<
                                is_device_iterator<InputIterator>
                            >,
                            is_device_iterator<OutputIterator>,
                            mpl::not_<
                                is_same_value_type<InputIterator, OutputIterator>
                            >
                        >
                    >::type* = 0)
{
    BOOST_STATIC_ASSERT_MSG(
        is_contiguous_iterator<InputIterator>::value,
        "copy_async() is only supported for contiguous host iterators"
    );

    typedef typename std::iterator_traits<InputIterator>::value_type input_type;

    const context &context = queue.get_context();
    size_t count = iterator_range_size(first, last);
    // empty range: nothing to enqueue; the returned future is
    // default-constructed and holds no event
    if(count < size_t(1)) {
        return future<OutputIterator>();
    }

    // map [first; last) to device and run copy kernel
    // on device for copying & casting
    ::boost::compute::mapped_view<input_type> mapped_host(
        // make sure it's a pointer to constant data
        // to force read only mapping
        const_cast<const input_type*>(
            ::boost::addressof(*first)
        ),
        count,
        context
    );
    // NOTE(review): mapped_host goes out of scope while the enqueued
    // copy may still be running; presumably the view only wraps the
    // caller's host pointer, so this is safe — confirm against
    // mapped_view's lifetime guarantees.
    return copy_on_device_async(
        mapped_host.begin(), mapped_host.end(), result, queue, events
    );
}
// host -> device
// InputIterator is a contiguous iterator and both sides share the same
// value_type: forward directly to copy_to_device().
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              const wait_list &events,
              typename boost::enable_if<
                  mpl::and_<
                      mpl::not_<
                          is_device_iterator<InputIterator>
                      >,
                      is_device_iterator<OutputIterator>,
                      is_same_value_type<InputIterator, OutputIterator>,
                      is_contiguous_iterator<InputIterator>
                  >
              >::type* = 0)
{
    return copy_to_device(first, last, result, queue, events);
}
// host -> device
// Type mismatch between InputIterator and OutputIterator value_types
// InputIterator is a contiguous iterator
//
// Selects one of three strategies based on the input size in bytes and
// two per-device, cache-tunable thresholds:
//   [0; map_copy_threshold)                     -> copy_to_device_map()
//   [map_copy_threshold; direct_copy_threshold) -> convert on host, then
//                                                  copy_to_device()
//   [direct_copy_threshold; inf)                -> map host memory and
//                                                  convert on the device
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              const wait_list &events,
              typename boost::enable_if<
                  mpl::and_<
                      mpl::not_<
                          is_device_iterator<InputIterator>
                      >,
                      is_device_iterator<OutputIterator>,
                      mpl::not_<
                          is_same_value_type<InputIterator, OutputIterator>
                      >,
                      is_contiguous_iterator<InputIterator>
                  >
              >::type* = 0)
{
    typedef typename OutputIterator::value_type output_type;
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;

    const device &device = queue.get_device();

    // loading parameters; the cache key is specific to the
    // input/output type pair
    std::string cache_key =
        std::string("__boost_compute_copy_to_device_")
            + type_name<input_type>() + "_" + type_name<output_type>();
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    uint_ map_copy_threshold;
    uint_ direct_copy_threshold;

    // calculate default values of thresholds
    if (device.type() & device::gpu) {
        // GPUs
        map_copy_threshold = 524288; // 0.5 MB
        direct_copy_threshold = 52428800; // 50 MB
    }
    else {
        // CPUs and other devices
        map_copy_threshold = 134217728; // 128 MB
        direct_copy_threshold = 0; // it's never efficient for CPUs
    }

    // load thresholds (cached values override the defaults above)
    map_copy_threshold =
        parameters->get(
            cache_key, "map_copy_threshold", map_copy_threshold
        );
    direct_copy_threshold =
        parameters->get(
            cache_key, "direct_copy_threshold", direct_copy_threshold
        );

    // select copy method based on thresholds & input_size_bytes
    size_t count = iterator_range_size(first, last);
    size_t input_size_bytes = count * sizeof(input_type);

    // [0; map_copy_threshold) -> copy_to_device_map()
    if(input_size_bytes < map_copy_threshold) {
        return copy_to_device_map(first, last, result, queue, events);
    }
    // [map_copy_threshold; direct_copy_threshold) -> convert [first; last)
    // on host and then perform copy_to_device()
    else if(input_size_bytes < direct_copy_threshold) {
        std::vector<output_type> vector(first, last);
        return copy_to_device(
            vector.begin(), vector.end(), result, queue, events
        );
    }

    // [direct_copy_threshold; inf) -> map [first; last) to device and
    // run copy kernel on device for copying & casting.
    // At this point count >= 1 (first != last), so the future returned
    // by dispatch_copy_async() holds a valid event and get() blocks
    // until the copy has finished.
    return dispatch_copy_async(first, last, result, queue, events).get();
}
// host -> device
// InputIterator is NOT a contiguous iterator
//
// Selects between mapping the destination (and writing element-wise)
// and materializing + converting the input into a temporary contiguous
// std::vector first, based on cache-tunable size thresholds.
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              const wait_list &events,
              typename boost::enable_if<
                  mpl::and_<
                      mpl::not_<
                          is_device_iterator<InputIterator>
                      >,
                      is_device_iterator<OutputIterator>,
                      mpl::not_<
                          is_contiguous_iterator<InputIterator>
                      >
                  >
              >::type* = 0)
{
    typedef typename OutputIterator::value_type output_type;
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;

    const device &device = queue.get_device();

    // loading parameters; the cache key is specific to the
    // input/output type pair
    std::string cache_key =
        std::string("__boost_compute_copy_to_device_")
            + type_name<input_type>() + "_" + type_name<output_type>();
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    uint_ map_copy_threshold;
    uint_ direct_copy_threshold;

    // calculate default values of thresholds
    if (device.type() & device::gpu) {
        // GPUs
        map_copy_threshold = 524288; // 0.5 MB
        direct_copy_threshold = 52428800; // 50 MB
    }
    else {
        // CPUs and other devices
        map_copy_threshold = 134217728; // 128 MB
        direct_copy_threshold = 0; // it's never efficient for CPUs
    }

    // load thresholds (cached values override the defaults above)
    map_copy_threshold =
        parameters->get(
            cache_key, "map_copy_threshold", map_copy_threshold
        );
    direct_copy_threshold =
        parameters->get(
            cache_key, "direct_copy_threshold", direct_copy_threshold
        );

    // select copy method based on thresholds & input_size_bytes
    size_t input_size = iterator_range_size(first, last);
    size_t input_size_bytes = input_size * sizeof(input_type);

    // [0; map_copy_threshold) -> copy_to_device_map()
    //
    // if direct_copy_threshold is less than map_copy_threshold
    // copy_to_device_map() is used for every input
    if(input_size_bytes < map_copy_threshold
        || direct_copy_threshold <= map_copy_threshold) {
        return copy_to_device_map(first, last, result, queue, events);
    }

    // [map_copy_threshold; inf) -> convert [first; last)
    // on host and then perform copy_to_device()
    std::vector<output_type> vector(first, last);
    return copy_to_device(vector.begin(), vector.end(), result, queue, events);
}
// device -> host (async)
// Matching value_types on both sides: the bytes can be transferred
// directly via copy_to_host_async() with no conversion step.
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
dispatch_copy_async(InputIterator first,
                    InputIterator last,
                    OutputIterator result,
                    command_queue &queue,
                    const wait_list &events,
                    typename boost::enable_if<
                        mpl::and_<
                            is_device_iterator<InputIterator>,
                            mpl::not_<
                                is_device_iterator<OutputIterator>
                            >,
                            is_same_value_type<OutputIterator, InputIterator>
                        >
                    >::type* = 0)
{
    // the asynchronous transfer writes the host range after this call
    // returns, so the destination must be a single contiguous block
    BOOST_STATIC_ASSERT_MSG(
        is_contiguous_iterator<OutputIterator>::value,
        "copy_async() is only supported for contiguous host iterators"
    );

    return copy_to_host_async(first, last, result, queue, events);
}
// device -> host (async)
// Type mismatch between InputIterator and OutputIterator value_types:
// the host output range is wrapped in a use_host_ptr buffer, a
// conversion kernel writes into it on the device, and a map/unmap pair
// then publishes the result to host memory.
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
dispatch_copy_async(InputIterator first,
                    InputIterator last,
                    OutputIterator result,
                    command_queue &queue,
                    const wait_list &events,
                    typename boost::enable_if<
                        mpl::and_<
                            is_device_iterator<InputIterator>,
                            mpl::not_<
                                is_device_iterator<OutputIterator>
                            >,
                            mpl::not_<
                                is_same_value_type<OutputIterator, InputIterator>
                            >
                        >
                    >::type* = 0)
{
    BOOST_STATIC_ASSERT_MSG(
        is_contiguous_iterator<OutputIterator>::value,
        "copy_async() is only supported for contiguous host iterators"
    );

    typedef typename std::iterator_traits<OutputIterator>::value_type output_type;

    const context &context = queue.get_context();
    size_t count = iterator_range_size(first, last);
    // empty range: nothing to enqueue; the returned future is
    // default-constructed and holds no event
    if(count < size_t(1)) {
        return future<OutputIterator>();
    }

    // map host memory to device (the buffer aliases the caller's
    // [result; result + count) storage via use_host_ptr)
    buffer mapped_host(
        context,
        count * sizeof(output_type),
        buffer::write_only | buffer::use_host_ptr,
        static_cast<void*>(
            ::boost::addressof(*result)
        )
    );

    // copy async on device, casting input_type -> output_type
    ::boost::compute::future<buffer_iterator<output_type> > future =
        copy_on_device_async(
            first,
            last,
            make_buffer_iterator<output_type>(mapped_host),
            queue,
            events
        );

    // update host memory asynchronously by mapping and unmapping the
    // buffer; the map waits on the device copy, the unmap waits on the
    // map, so the returned future completes only when the host data is
    // valid
    event map_event;
    void* ptr = queue.enqueue_map_buffer_async(
        mapped_host,
        CL_MAP_READ,
        0,
        count * sizeof(output_type),
        map_event,
        future.get_event()
    );
    event unmap_event =
        queue.enqueue_unmap_buffer(mapped_host, ptr, map_event);

    return make_future(result + count, unmap_event);
}
// device -> host
// OutputIterator is a contiguous iterator with the same (non-bool)
// value_type as the input: forward directly to copy_to_host().
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              const wait_list &events,
              typename boost::enable_if<
                  mpl::and_<
                      is_device_iterator<InputIterator>,
                      mpl::not_<
                          is_device_iterator<OutputIterator>
                      >,
                      is_same_value_type<OutputIterator, InputIterator>,
                      is_contiguous_iterator<OutputIterator>,
                      mpl::not_<
                          is_bool_value_type<OutputIterator>
                      >
                  >
              >::type* = 0)
{
    return copy_to_host(first, last, result, queue, events);
}
// device -> host
// Type mismatch between InputIterator and OutputIterator value_types
// OutputIterator is NOT a contiguous iterator or value_type of
// OutputIterator is a boolean type (so it cannot be written directly).
//
// Either maps the input and writes the output element-wise, or copies
// into a temporary contiguous std::vector and converts with
// std::copy(), based on cache-tunable size thresholds.
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              const wait_list &events,
              typename boost::enable_if<
                  mpl::and_<
                      is_device_iterator<InputIterator>,
                      mpl::not_<
                          is_device_iterator<OutputIterator>
                      >,
                      mpl::or_<
                          mpl::not_<
                              is_contiguous_iterator<OutputIterator>
                          >,
                          is_bool_value_type<OutputIterator>
                      >
                  >
              >::type* = 0)
{
    typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
    typedef typename InputIterator::value_type input_type;

    const device &device = queue.get_device();

    // loading parameters; the cache key is specific to the
    // input/output type pair
    std::string cache_key =
        std::string("__boost_compute_copy_to_host_")
            + type_name<input_type>() + "_" + type_name<output_type>();
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    uint_ map_copy_threshold;
    uint_ direct_copy_threshold;

    // calculate default values of thresholds
    if (device.type() & device::gpu) {
        // GPUs
        map_copy_threshold = 33554432; // 32 MB
        direct_copy_threshold = 0; // it's never efficient for GPUs
    }
    else {
        // CPUs and other devices
        map_copy_threshold = 134217728; // 128 MB
        direct_copy_threshold = 0; // it's never efficient for CPUs
    }

    // load thresholds (cached values override the defaults above)
    map_copy_threshold =
        parameters->get(
            cache_key, "map_copy_threshold", map_copy_threshold
        );
    direct_copy_threshold =
        parameters->get(
            cache_key, "direct_copy_threshold", direct_copy_threshold
        );

    // select copy method based on thresholds & input_size_bytes
    size_t count = iterator_range_size(first, last);
    size_t input_size_bytes = count * sizeof(input_type);

    // [0; map_copy_threshold) -> copy_to_host_map()
    //
    // if direct_copy_threshold is less than map_copy_threshold
    // copy_to_host_map() is used for every input
    if(input_size_bytes < map_copy_threshold
        || direct_copy_threshold <= map_copy_threshold) {
        return copy_to_host_map(first, last, result, queue, events);
    }

    // [map_copy_threshold; inf) -> copy [first;last) to temporary vector
    // then copy (and convert) to result using std::copy()
    std::vector<input_type> vector(count);
    copy_to_host(first, last, vector.begin(), queue, events);
    return std::copy(vector.begin(), vector.end(), result);
}
// device -> host
// Type mismatch between InputIterator and OutputIterator value_types
// OutputIterator is a contiguous iterator
// value_type of OutputIterator is NOT a boolean type
//
// Selects one of three strategies based on the input size in bytes and
// two per-device, cache-tunable thresholds:
//   [0; map_copy_threshold)                     -> copy_to_host_map()
//   [map_copy_threshold; direct_copy_threshold) -> copy to a temporary
//                                                  vector, convert with
//                                                  std::copy()
//   [direct_copy_threshold; inf)                -> map the output and
//                                                  convert on the device
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              const wait_list &events,
              typename boost::enable_if<
                  mpl::and_<
                      is_device_iterator<InputIterator>,
                      mpl::not_<
                          is_device_iterator<OutputIterator>
                      >,
                      mpl::not_<
                          is_same_value_type<OutputIterator, InputIterator>
                      >,
                      is_contiguous_iterator<OutputIterator>,
                      mpl::not_<
                          is_bool_value_type<OutputIterator>
                      >
                  >
              >::type* = 0)
{
    typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
    typedef typename InputIterator::value_type input_type;

    const device &device = queue.get_device();

    // loading parameters; the cache key is specific to the
    // input/output type pair
    std::string cache_key =
        std::string("__boost_compute_copy_to_host_")
            + type_name<input_type>() + "_" + type_name<output_type>();
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    uint_ map_copy_threshold;
    uint_ direct_copy_threshold;

    // calculate default values of thresholds
    if (device.type() & device::gpu) {
        // GPUs
        map_copy_threshold = 524288; // 0.5 MB
        direct_copy_threshold = 52428800; // 50 MB
    }
    else {
        // CPUs and other devices
        map_copy_threshold = 134217728; // 128 MB
        direct_copy_threshold = 0; // it's never efficient for CPUs
    }

    // load thresholds (cached values override the defaults above)
    map_copy_threshold =
        parameters->get(
            cache_key, "map_copy_threshold", map_copy_threshold
        );
    direct_copy_threshold =
        parameters->get(
            cache_key, "direct_copy_threshold", direct_copy_threshold
        );

    // select copy method based on thresholds & input_size_bytes
    size_t count = iterator_range_size(first, last);
    size_t input_size_bytes = count * sizeof(input_type);

    // [0; map_copy_threshold) -> copy_to_host_map()
    if(input_size_bytes < map_copy_threshold) {
        return copy_to_host_map(first, last, result, queue, events);
    }
    // [map_copy_threshold; direct_copy_threshold) -> copy [first;last) to
    // temporary vector then copy (and convert) to result using std::copy()
    else if(input_size_bytes < direct_copy_threshold) {
        std::vector<input_type> vector(count);
        copy_to_host(first, last, vector.begin(), queue, events);
        return std::copy(vector.begin(), vector.end(), result);
    }

    // [direct_copy_threshold; inf) -> map [result; result + count) to
    // device and run copy kernel on device for copying & casting.
    // At this point count >= 1 (first != last), so the future returned
    // by dispatch_copy_async() holds a valid event and get() blocks
    // until the copy has finished.
    return dispatch_copy_async(first, last, result, queue, events).get();
}
// device -> device
// Generic case: the iterator pair cannot be handled with a raw
// clEnqueueCopyBuffer() (fancy iterators or differing value_types), so
// a copy kernel is run on the device.
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              const wait_list &events,
              typename boost::enable_if<
                  mpl::and_<
                      is_device_iterator<InputIterator>,
                      is_device_iterator<OutputIterator>,
                      mpl::not_<
                          can_copy_with_copy_buffer<
                              InputIterator, OutputIterator
                          >
                      >
                  >
              >::type* = 0)
{
    return copy_on_device(first, last, result, queue, events);
}
// device -> device (specialization for buffer iterators)
// Both iterators address raw buffer storage with identical value_types,
// so the transfer is a single clEnqueueCopyBuffer() call.
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
              InputIterator last,
              OutputIterator result,
              command_queue &queue,
              const wait_list &events,
              typename boost::enable_if<
                  mpl::and_<
                      is_device_iterator<InputIterator>,
                      is_device_iterator<OutputIterator>,
                      can_copy_with_copy_buffer<
                          InputIterator, OutputIterator
                      >
                  >
              >::type* = 0)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    difference_type n = std::distance(first, last);
    if(n < 1){
        // nothing to copy
        return result;
    }

    // offsets and size are converted from element counts to bytes
    queue.enqueue_copy_buffer(first.get_buffer(),
                              result.get_buffer(),
                              first.get_index() * sizeof(value_type),
                              result.get_index() * sizeof(value_type),
                              static_cast<size_t>(n) * sizeof(value_type),
                              events);
    return result + n;
}
// device -> device (async)
// Generic case: the iterator pair cannot be handled with a raw
// clEnqueueCopyBuffer(), so an asynchronous copy kernel is enqueued.
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
dispatch_copy_async(InputIterator first,
                    InputIterator last,
                    OutputIterator result,
                    command_queue &queue,
                    const wait_list &events,
                    typename boost::enable_if<
                        mpl::and_<
                            is_device_iterator<InputIterator>,
                            is_device_iterator<OutputIterator>,
                            mpl::not_<
                                can_copy_with_copy_buffer<
                                    InputIterator, OutputIterator
                                >
                            >
                        >
                    >::type* = 0)
{
    return copy_on_device_async(first, last, result, queue, events);
}
// device -> device (async, specialization for buffer iterators)
// Both iterators address raw buffer storage with identical value_types,
// so the transfer is a single non-blocking clEnqueueCopyBuffer() call
// whose event is wrapped in the returned future.
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
dispatch_copy_async(InputIterator first,
                    InputIterator last,
                    OutputIterator result,
                    command_queue &queue,
                    const wait_list &events,
                    typename boost::enable_if<
                        mpl::and_<
                            is_device_iterator<InputIterator>,
                            is_device_iterator<OutputIterator>,
                            can_copy_with_copy_buffer<
                                InputIterator, OutputIterator
                            >
                        >
                    >::type* = 0)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    difference_type n = std::distance(first, last);
    if(n < 1){
        // nothing to copy: return an already-complete future holding a
        // default-constructed (null) event
        return make_future(result, event());
    }

    // offsets and size are converted from element counts to bytes
    event event_ =
        queue.enqueue_copy_buffer(
            first.get_buffer(),
            result.get_buffer(),
            first.get_index() * sizeof(value_type),
            result.get_index() * sizeof(value_type),
            static_cast<size_t>(n) * sizeof(value_type),
            events
        );

    return make_future(result + n, event_);
}
  706. // host -> host
  707. template<class InputIterator, class OutputIterator>
  708. inline OutputIterator
  709. dispatch_copy(InputIterator first,
  710. InputIterator last,
  711. OutputIterator result,
  712. command_queue &queue,
  713. const wait_list &events,
  714. typename boost::enable_if_c<
  715. !is_device_iterator<InputIterator>::value &&
  716. !is_device_iterator<OutputIterator>::value
  717. >::type* = 0)
  718. {
  719. (void) queue;
  720. (void) events;
  721. return std::copy(first, last, result);
  722. }
  723. } // end detail namespace
  724. /// Copies the values in the range [\p first, \p last) to the range
  725. /// beginning at \p result.
  726. ///
  727. /// The generic copy() function can be used for a variety of data
  728. /// transfer tasks and provides a standard interface to the following
  729. /// OpenCL functions:
  730. ///
  731. /// \li \c clEnqueueReadBuffer()
  732. /// \li \c clEnqueueWriteBuffer()
  733. /// \li \c clEnqueueCopyBuffer()
  734. ///
  735. /// Unlike the aforementioned OpenCL functions, copy() will also work
  736. /// with non-contiguous data-structures (e.g. \c std::list<T>) as
  737. /// well as with "fancy" iterators (e.g. transform_iterator).
  738. ///
  739. /// \param first first element in the range to copy
  740. /// \param last last element in the range to copy
  741. /// \param result first element in the result range
  742. /// \param queue command queue to perform the operation
  743. ///
  744. /// \return \c OutputIterator to the end of the result range
  745. ///
  746. /// For example, to copy an array of \c int values on the host to a vector on
  747. /// the device:
  748. /// \code
  749. /// // array on the host
  750. /// int data[] = { 1, 2, 3, 4 };
  751. ///
  752. /// // vector on the device
  753. /// boost::compute::vector<int> vec(4, context);
  754. ///
  755. /// // copy values to the device vector
  756. /// boost::compute::copy(data, data + 4, vec.begin(), queue);
  757. /// \endcode
  758. ///
  759. /// The copy algorithm can also be used with standard containers such as
  760. /// \c std::vector<T>:
  761. /// \code
  762. /// std::vector<int> host_vector = ...
  763. /// boost::compute::vector<int> device_vector = ...
  764. ///
  765. /// // copy from the host to the device
  766. /// boost::compute::copy(
  767. /// host_vector.begin(), host_vector.end(), device_vector.begin(), queue
  768. /// );
  769. ///
  770. /// // copy from the device to the host
  771. /// boost::compute::copy(
  772. /// device_vector.begin(), device_vector.end(), host_vector.begin(), queue
  773. /// );
  774. /// \endcode
  775. ///
  776. /// Space complexity: \Omega(1)
  777. ///
  778. /// \see copy_n(), copy_if(), copy_async()
template<class InputIterator, class OutputIterator>
inline OutputIterator copy(InputIterator first,
                           InputIterator last,
                           OutputIterator result,
                           command_queue &queue = system::default_queue(),
                           const wait_list &events = wait_list())
{
    // forward to the dispatch_copy() overload selected (via enable_if)
    // by the host/device-ness, contiguity and value_types of the two
    // iterator types
    return detail::dispatch_copy(first, last, result, queue, events);
}
/// Copies the values in the range [\p first, \p last) to the range
/// beginning at \p result. The copy is performed asynchronously.
///
/// The host side of the transfer must be a contiguous range. Note that
/// for an empty input range the returned future may hold no event
/// (see the dispatch_copy_async() overloads above).
///
/// \see copy()
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
copy_async(InputIterator first,
           InputIterator last,
           OutputIterator result,
           command_queue &queue = system::default_queue(),
           const wait_list &events = wait_list())
{
    // forward to the dispatch_copy_async() overload selected (via
    // enable_if) by the host/device-ness and value_types of the two
    // iterator types
    return detail::dispatch_copy_async(first, last, result, queue, events);
}
  802. } // end compute namespace
  803. } // end boost namespace
  804. #endif // BOOST_COMPUTE_ALGORITHM_COPY_HPP