http_crawl.cpp 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409
  1. //
  2. // Copyright (c) 2016-2019 Vinnie Falco (vinnie dot falco at gmail dot com)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0. (See accompanying
  5. // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  6. //
  7. // Official repository: https://github.com/boostorg/beast
  8. //
  9. //------------------------------------------------------------------------------
  10. //
  11. // Example: HTTP crawl (asynchronous)
  12. //
  13. //------------------------------------------------------------------------------
  14. #include "urls_large_data.hpp"
  15. #include <boost/beast/core.hpp>
  16. #include <boost/beast/http.hpp>
  17. #include <boost/beast/version.hpp>
  18. #include <boost/asio/bind_executor.hpp>
  19. #include <boost/asio/connect.hpp>
  20. #include <boost/asio/ip/tcp.hpp>
  21. #include <boost/asio/post.hpp>
  22. #include <boost/asio/strand.hpp>
  23. #include <atomic>
  24. #include <chrono>
  25. #include <cstdlib>
  26. #include <functional>
  27. #include <iomanip>
  28. #include <iostream>
  29. #include <memory>
  30. #include <string>
  31. #include <thread>
  32. #include <vector>
  33. #include <map>
  34. namespace chrono = std::chrono; // from <chrono>
  35. namespace beast = boost::beast; // from <boost/beast.hpp>
  36. namespace http = beast::http; // from <boost/beast/http.hpp>
  37. namespace net = boost::asio; // from <boost/asio.hpp>
  38. using tcp = net::ip::tcp; // from <boost/asio/ip/tcp.hpp>
  39. //------------------------------------------------------------------------------
  40. // This structure aggregates statistics on all the sites
  41. class crawl_report
  42. {
  43. net::io_context& ioc_;
  44. net::strand<
  45. net::io_context::executor_type> strand_;
  46. std::atomic<std::size_t> index_;
  47. std::vector<char const*> const& hosts_;
  48. std::size_t count_ = 0;
  49. public:
  50. crawl_report(net::io_context& ioc)
  51. : ioc_(ioc)
  52. , strand_(ioc_.get_executor())
  53. , index_(0)
  54. , hosts_(urls_large_data())
  55. {
  56. }
  57. // Run an aggregation function on the strand.
  58. // This allows synchronization without a mutex.
  59. template<class F>
  60. void
  61. aggregate(F const& f)
  62. {
  63. net::post(
  64. strand_,
  65. [&, f]
  66. {
  67. f(*this);
  68. if(count_ % 100 == 0)
  69. {
  70. std::cerr <<
  71. "Progress: " << count_ << " of " << hosts_.size() << "\n";
  72. //std::cerr << *this;
  73. }
  74. ++count_;
  75. });
  76. }
  77. // Returns the next host to check
  78. char const*
  79. get_host()
  80. {
  81. auto const n = index_++;
  82. if(n >= hosts_.size())
  83. return nullptr;
  84. return hosts_[n];
  85. }
  86. // Counts the number of timer failures
  87. std::size_t timer_failures = 0;
  88. // Counts the number of name resolution failures
  89. std::size_t resolve_failures = 0;
  90. // Counts the number of connection failures
  91. std::size_t connect_failures = 0;
  92. // Counts the number of write failures
  93. std::size_t write_failures = 0;
  94. // Counts the number of read failures
  95. std::size_t read_failures = 0;
  96. // Counts the number of success reads
  97. std::size_t success = 0;
  98. // Counts the number received of each status code
  99. std::map<unsigned, std::size_t> status_codes;
  100. };
  101. std::ostream&
  102. operator<<(std::ostream& os, crawl_report const& report)
  103. {
  104. // Print the report
  105. os <<
  106. "Crawl report\n" <<
  107. " Failure counts\n" <<
  108. " Timer : " << report.timer_failures << "\n" <<
  109. " Resolve : " << report.resolve_failures << "\n" <<
  110. " Connect : " << report.connect_failures << "\n" <<
  111. " Write : " << report.write_failures << "\n" <<
  112. " Read : " << report.read_failures << "\n" <<
  113. " Success : " << report.success << "\n" <<
  114. " Status codes\n"
  115. ;
  116. for(auto const& result : report.status_codes)
  117. os <<
  118. " " << std::setw(3) << result.first << ": " << result.second <<
  119. " (" << http::obsolete_reason(static_cast<http::status>(result.first)) << ")\n";
  120. os.flush();
  121. return os;
  122. }
  123. //------------------------------------------------------------------------------
  124. // Performs HTTP GET requests and aggregates the results into a report
  125. class worker : public std::enable_shared_from_this<worker>
  126. {
  127. enum
  128. {
  129. // Use a small timeout to keep things lively
  130. timeout = 5
  131. };
  132. crawl_report& report_;
  133. tcp::resolver resolver_;
  134. beast::tcp_stream stream_;
  135. beast::flat_buffer buffer_; // (Must persist between reads)
  136. http::request<http::empty_body> req_;
  137. http::response<http::string_body> res_;
  138. public:
  139. worker(worker&&) = default;
  140. // Resolver and socket require an io_context
  141. worker(
  142. crawl_report& report,
  143. net::io_context& ioc)
  144. : report_(report)
  145. , resolver_(net::make_strand(ioc))
  146. , stream_(net::make_strand(ioc))
  147. {
  148. // Set up the common fields of the request
  149. req_.version(11);
  150. req_.method(http::verb::get);
  151. req_.target("/");
  152. req_.set(http::field::user_agent, BOOST_BEAST_VERSION_STRING);
  153. }
  154. // Start the asynchronous operation
  155. void
  156. run()
  157. {
  158. do_get_host();
  159. }
  160. void
  161. do_get_host()
  162. {
  163. // Grab another host
  164. auto const host = report_.get_host();
  165. // nullptr means no more work
  166. if(! host)
  167. return;
  168. // The Host HTTP field is required
  169. req_.set(http::field::host, host);
  170. // Set up an HTTP GET request message
  171. // Look up the domain name
  172. resolver_.async_resolve(
  173. host,
  174. "http",
  175. beast::bind_front_handler(
  176. &worker::on_resolve,
  177. shared_from_this()));
  178. }
  179. void
  180. on_resolve(
  181. beast::error_code ec,
  182. tcp::resolver::results_type results)
  183. {
  184. if(ec)
  185. {
  186. report_.aggregate(
  187. [](crawl_report& rep)
  188. {
  189. ++rep.resolve_failures;
  190. });
  191. return do_get_host();
  192. }
  193. // Set a timeout on the operation
  194. stream_.expires_after(std::chrono::seconds(10));
  195. // Make the connection on the IP address we get from a lookup
  196. stream_.async_connect(
  197. results,
  198. beast::bind_front_handler(
  199. &worker::on_connect,
  200. shared_from_this()));
  201. }
  202. void
  203. on_connect(beast::error_code ec, tcp::resolver::results_type::endpoint_type)
  204. {
  205. if(ec)
  206. {
  207. report_.aggregate(
  208. [](crawl_report& rep)
  209. {
  210. ++rep.connect_failures;
  211. });
  212. return do_get_host();
  213. }
  214. // Set a timeout on the operation
  215. stream_.expires_after(std::chrono::seconds(10));
  216. // Send the HTTP request to the remote host
  217. http::async_write(
  218. stream_,
  219. req_,
  220. beast::bind_front_handler(
  221. &worker::on_write,
  222. shared_from_this()));
  223. }
  224. void
  225. on_write(
  226. beast::error_code ec,
  227. std::size_t bytes_transferred)
  228. {
  229. boost::ignore_unused(bytes_transferred);
  230. if(ec)
  231. {
  232. report_.aggregate(
  233. [](crawl_report& rep)
  234. {
  235. ++rep.write_failures;
  236. });
  237. return do_get_host();
  238. }
  239. // Receive the HTTP response
  240. res_ = {};
  241. http::async_read(
  242. stream_,
  243. buffer_,
  244. res_,
  245. beast::bind_front_handler(
  246. &worker::on_read,
  247. shared_from_this()));
  248. }
  249. void
  250. on_read(
  251. beast::error_code ec,
  252. std::size_t bytes_transferred)
  253. {
  254. boost::ignore_unused(bytes_transferred);
  255. if(ec)
  256. {
  257. report_.aggregate(
  258. [](crawl_report& rep)
  259. {
  260. ++rep.read_failures;
  261. });
  262. return do_get_host();
  263. }
  264. auto const code = res_.result_int();
  265. report_.aggregate(
  266. [code](crawl_report& rep)
  267. {
  268. ++rep.success;
  269. ++rep.status_codes[code];
  270. });
  271. // Gracefully close the socket
  272. stream_.socket().shutdown(tcp::socket::shutdown_both, ec);
  273. stream_.close();
  274. // If we get here then the connection is closed gracefully
  275. do_get_host();
  276. }
  277. };
  // Measures the time that has passed since the object was constructed.
  //
  // A monotonic clock is used so that adjustments to the system's wall
  // clock (for example by NTP or by the user) while the timer is running
  // cannot distort the result or make it negative.
  class timer
  {
      using clock_type = std::chrono::steady_clock;

      // Moment of construction
      clock_type::time_point when_;

  public:
      using duration = clock_type::duration;

      // Starts timing immediately.
      timer()
          : when_(clock_type::now())
      {
      }

      // Returns the time elapsed since construction.
      duration
      elapsed() const
      {
          return clock_type::now() - when_;
      }
  };
  294. int main(int argc, char* argv[])
  295. {
  296. // Check command line arguments.
  297. if (argc != 2)
  298. {
  299. std::cerr <<
  300. "Usage: http-crawl <threads>\n" <<
  301. "Example:\n" <<
  302. " http-crawl 100 1\n";
  303. return EXIT_FAILURE;
  304. }
  305. auto const threads = std::max<int>(1, std::atoi(argv[1]));
  306. // The io_context is required for all I/O
  307. net::io_context ioc;
  308. // The work keeps io_context::run from returning
  309. auto work = net::make_work_guard(ioc);
  310. // The report holds the aggregated statistics
  311. crawl_report report{ioc};
  312. timer t;
  313. // Create and launch the worker threads.
  314. std::vector<std::thread> workers;
  315. workers.reserve(threads + 1);
  316. for(int i = 0; i < threads; ++i)
  317. workers.emplace_back(
  318. [&report]
  319. {
  320. // We use a separate io_context for each worker because
  321. // the asio resolver simulates asynchronous operation using
  322. // a dedicated worker thread per io_context, and we want to
  323. // do a lot of name resolutions in parallel.
  324. net::io_context ioc{1};
  325. std::make_shared<worker>(report, ioc)->run();
  326. ioc.run();
  327. });
  328. // Add another thread to run the main io_context which
  329. // is used to aggregate the statistics
  330. workers.emplace_back(
  331. [&ioc]
  332. {
  333. ioc.run();
  334. });
  335. // Now block until all threads exit
  336. for(std::size_t i = 0; i < workers.size(); ++i)
  337. {
  338. auto& thread = workers[i];
  339. // If this is the last thread, reset the
  340. // work object so that it can return from run.
  341. if(i == workers.size() - 1)
  342. work.reset();
  343. // Wait for the thread to exit
  344. thread.join();
  345. }
  346. std::cout <<
  347. "Elapsed time: " << chrono::duration_cast<chrono::seconds>(t.elapsed()).count() << " seconds\n";
  348. std::cout << report;
  349. return EXIT_SUCCESS;
  350. }