generate_cpp.hpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576
  1. // generate_cpp.hpp
  2. // Copyright (c) 2008-2009 Ben Hanson (http://www.benhanson.net/)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0. (See accompanying
  5. // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  6. #ifndef BOOST_LEXER_GENERATE_CPP_HPP
  7. #define BOOST_LEXER_GENERATE_CPP_HPP
  8. #include "char_traits.hpp"
  9. #include "consts.hpp"
  10. #include "internals.hpp"
  11. #include <iostream>
  12. #include "runtime_error.hpp"
  13. #include "size_t.hpp"
  14. #include "state_machine.hpp"
  15. #include <vector>
  16. namespace boost
  17. {
  18. namespace lexer
  19. {
  20. template<typename CharT>
  21. void generate_cpp (const basic_state_machine<CharT> &state_machine_,
  22. std::ostream &os_, const bool use_pointers_ = false,
  23. const bool skip_unknown_ = true, const bool optimise_parameters_ = true,
  24. const char *name_ = "next_token")
  25. {
  26. const detail::internals &sm_ = state_machine_.data ();
  27. if (sm_._lookup->size () == 0)
  28. {
  29. throw runtime_error ("Cannot generate code from an empty "
  30. "state machine");
  31. }
  32. std::string upper_name_ (__DATE__);
  33. const std::size_t lookups_ = sm_._lookup->front ()->size ();
  34. const std::size_t dfas_ = sm_._dfa->size ();
  35. std::string::size_type pos_ = upper_name_.find (' ');
  36. const char *iterator_ = 0;
  37. if (use_pointers_)
  38. {
  39. if (lookups_ == 256)
  40. {
  41. iterator_ = "const char *";
  42. }
  43. else
  44. {
  45. iterator_ = "const wchar_t *";
  46. }
  47. }
  48. else
  49. {
  50. iterator_ = "Iterator &";
  51. }
  52. while (pos_ != std::string::npos)
  53. {
  54. upper_name_.replace (pos_, 1, "_");
  55. pos_ = upper_name_.find (' ', pos_);
  56. }
  57. upper_name_ += '_';
  58. upper_name_ += __TIME__;
  59. pos_ = upper_name_.find (':');
  60. while (pos_ != std::string::npos)
  61. {
  62. upper_name_.erase (pos_, 1);
  63. pos_ = upper_name_.find (':', pos_);
  64. }
  65. upper_name_ = '_' + upper_name_;
  66. upper_name_ = name_ + upper_name_;
  67. std::transform (upper_name_.begin (), upper_name_.end (),
  68. upper_name_.begin (), ::toupper);
  69. os_ << "#ifndef " << upper_name_ + '\n';
  70. os_ << "#define " << upper_name_ + '\n';
  71. os_ << "// Copyright (c) 2008-2009 Ben Hanson\n";
  72. os_ << "//\n";
  73. os_ << "// Distributed under the Boost Software License, "
  74. "Version 1.0. (See accompanying\n";
  75. os_ << "// file licence_1_0.txt or copy at "
  76. "http://www.boost.org/LICENSE_1_0.txt)\n\n";
  77. os_ << "// Auto-generated by boost::lexer\n";
  78. os_ << "template<typename Iterator>\n";
  79. os_ << "std::size_t " << name_ << " (";
  80. if (dfas_ > 1 || !optimise_parameters_)
  81. {
  82. os_ << "std::size_t &start_state_, ";
  83. }
  84. if (use_pointers_)
  85. {
  86. os_ << iterator_ << " &";
  87. }
  88. else
  89. {
  90. os_ << iterator_;
  91. }
  92. os_ << "start_token_, ";
  93. if (use_pointers_)
  94. {
  95. os_ << iterator_ << " const ";
  96. }
  97. else
  98. {
  99. os_ << "const " << iterator_;
  100. }
  101. os_ << "end_, \n";
  102. os_ << " std::size_t &unique_id_";
  103. if (sm_._seen_BOL_assertion || !optimise_parameters_)
  104. {
  105. os_ << ", bool &beg_of_line_";
  106. }
  107. os_ << ")\n";
  108. os_ << "{\n";
  109. os_ << " enum {end_state_index, id_index, unique_id_index, state_index, bol_index,\n";
  110. os_ << " eol_index, dead_state_index, dfa_offset};\n";
  111. os_ << " static const std::size_t npos = static_cast"
  112. "<std::size_t>(~0);\n";
  113. if (dfas_ > 1)
  114. {
  115. std::size_t state_ = 0;
  116. for (; state_ < dfas_; ++state_)
  117. {
  118. std::size_t i_ = 0;
  119. std::size_t j_ = 1;
  120. std::size_t count_ = lookups_ / 8;
  121. const std::size_t *lookup_ = &sm_._lookup[state_]->front ();
  122. const std::size_t *dfa_ = &sm_._dfa[state_]->front ();
  123. os_ << " static const std::size_t lookup" << state_ << "_[" <<
  124. lookups_ << "] = {";
  125. for (; i_ < count_; ++i_)
  126. {
  127. const std::size_t index_ = i_ * 8;
  128. os_ << lookup_[index_];
  129. for (; j_ < 8; ++j_)
  130. {
  131. os_ << ", " << lookup_[index_ + j_];
  132. }
  133. if (i_ < count_ - 1)
  134. {
  135. os_ << "," << std::endl << " ";
  136. }
  137. j_ = 1;
  138. }
  139. os_ << "};\n";
  140. count_ = sm_._dfa[state_]->size ();
  141. os_ << " static const std::size_t dfa" << state_ << "_[" <<
  142. count_ << "] = {";
  143. count_ /= 8;
  144. for (i_ = 0; i_ < count_; ++i_)
  145. {
  146. const std::size_t index_ = i_ * 8;
  147. os_ << dfa_[index_];
  148. for (j_ = 1; j_ < 8; ++j_)
  149. {
  150. os_ << ", " << dfa_[index_ + j_];
  151. }
  152. if (i_ < count_ - 1)
  153. {
  154. os_ << "," << std::endl << " ";
  155. }
  156. }
  157. const std::size_t mod_ = sm_._dfa[state_]->size () % 8;
  158. if (mod_)
  159. {
  160. const std::size_t index_ = count_ * 8;
  161. if (count_)
  162. {
  163. os_ << ",\n ";
  164. }
  165. os_ << dfa_[index_];
  166. for (j_ = 1; j_ < mod_; ++j_)
  167. {
  168. os_ << ", " << dfa_[index_ + j_];
  169. }
  170. }
  171. os_ << "};\n";
  172. }
  173. std::size_t count_ = sm_._dfa_alphabet.size ();
  174. std::size_t i_ = 1;
  175. os_ << " static const std::size_t *lookup_arr_[" << count_ <<
  176. "] = {";
  177. os_ << "lookup0_";
  178. for (i_ = 1; i_ < count_; ++i_)
  179. {
  180. os_ << ", " << "lookup" << i_ << "_";
  181. }
  182. os_ << "};\n";
  183. os_ << " static const std::size_t dfa_alphabet_arr_[" << count_ <<
  184. "] = {";
  185. os_ << sm_._dfa_alphabet.front ();
  186. for (i_ = 1; i_ < count_; ++i_)
  187. {
  188. os_ << ", " << sm_._dfa_alphabet[i_];
  189. }
  190. os_ << "};\n";
  191. os_ << " static const std::size_t *dfa_arr_[" << count_ <<
  192. "] = {";
  193. os_ << "dfa0_";
  194. for (i_ = 1; i_ < count_; ++i_)
  195. {
  196. os_ << ", " << "dfa" << i_ << "_";
  197. }
  198. os_ << "};\n";
  199. }
  200. else
  201. {
  202. const std::size_t *lookup_ = &sm_._lookup->front ()->front ();
  203. const std::size_t *dfa_ = &sm_._dfa->front ()->front ();
  204. std::size_t i_ = 0;
  205. std::size_t j_ = 1;
  206. std::size_t count_ = lookups_ / 8;
  207. os_ << " static const std::size_t lookup_[";
  208. os_ << sm_._lookup->front ()->size () << "] = {";
  209. for (; i_ < count_; ++i_)
  210. {
  211. const std::size_t index_ = i_ * 8;
  212. os_ << lookup_[index_];
  213. for (; j_ < 8; ++j_)
  214. {
  215. os_ << ", " << lookup_[index_ + j_];
  216. }
  217. if (i_ < count_ - 1)
  218. {
  219. os_ << "," << std::endl << " ";
  220. }
  221. j_ = 1;
  222. }
  223. os_ << "};\n";
  224. os_ << " static const std::size_t dfa_alphabet_ = " <<
  225. sm_._dfa_alphabet.front () << ";\n";
  226. os_ << " static const std::size_t dfa_[" <<
  227. sm_._dfa->front ()->size () << "] = {";
  228. count_ = sm_._dfa->front ()->size () / 8;
  229. for (i_ = 0; i_ < count_; ++i_)
  230. {
  231. const std::size_t index_ = i_ * 8;
  232. os_ << dfa_[index_];
  233. for (j_ = 1; j_ < 8; ++j_)
  234. {
  235. os_ << ", " << dfa_[index_ + j_];
  236. }
  237. if (i_ < count_ - 1)
  238. {
  239. os_ << "," << std::endl << " ";
  240. }
  241. }
  242. const std::size_t mod_ = sm_._dfa->front ()->size () % 8;
  243. if (mod_)
  244. {
  245. const std::size_t index_ = count_ * 8;
  246. if (count_)
  247. {
  248. os_ << ",\n ";
  249. }
  250. os_ << dfa_[index_];
  251. for (j_ = 1; j_ < mod_; ++j_)
  252. {
  253. os_ << ", " << dfa_[index_ + j_];
  254. }
  255. }
  256. os_ << "};\n";
  257. }
  258. os_ << "\n if (start_token_ == end_)\n";
  259. os_ << " {\n";
  260. os_ << " unique_id_ = npos;\n";
  261. os_ << " return 0;\n";
  262. os_ << " }\n\n";
  263. if (dfas_ > 1)
  264. {
  265. os_ << "again:\n";
  266. os_ << " const std::size_t * lookup_ = "
  267. "lookup_arr_[start_state_];\n";
  268. os_ << " std::size_t dfa_alphabet_ = "
  269. "dfa_alphabet_arr_[start_state_];\n";
  270. os_ << " const std::size_t *dfa_ = dfa_arr_[start_state_];\n";
  271. }
  272. os_ << " const std::size_t *ptr_ = dfa_ + dfa_alphabet_;\n";
  273. os_ << " Iterator curr_ = start_token_;\n";
  274. os_ << " bool end_state_ = *ptr_ != 0;\n";
  275. os_ << " std::size_t id_ = *(ptr_ + id_index);\n";
  276. os_ << " std::size_t uid_ = *(ptr_ + unique_id_index);\n";
  277. if (dfas_ > 1)
  278. {
  279. os_ << " std::size_t end_start_state_ = start_state_;\n";
  280. }
  281. if (sm_._seen_BOL_assertion)
  282. {
  283. os_ << " bool bol_ = beg_of_line_;\n";
  284. os_ << " bool end_bol_ = bol_;\n";
  285. }
  286. os_ << " Iterator end_token_ = start_token_;\n";
  287. os_ << '\n';
  288. os_ << " while (curr_ != end_)\n";
  289. os_ << " {\n";
  290. if (sm_._seen_BOL_assertion)
  291. {
  292. os_ << " const std::size_t BOL_state_ = ptr_[bol_index];\n";
  293. }
  294. if (sm_._seen_EOL_assertion)
  295. {
  296. os_ << " const std::size_t EOL_state_ = ptr_[eol_index];\n";
  297. }
  298. if (sm_._seen_BOL_assertion || sm_._seen_EOL_assertion)
  299. {
  300. os_ << '\n';
  301. }
  302. if (sm_._seen_BOL_assertion)
  303. {
  304. os_ << " if (BOL_state_ && bol_)\n";
  305. os_ << " {\n";
  306. os_ << " ptr_ = &dfa_[BOL_state_ * dfa_alphabet_];\n";
  307. os_ << " }\n";
  308. }
  309. if (sm_._seen_EOL_assertion)
  310. {
  311. os_ << " ";
  312. if (sm_._seen_BOL_assertion)
  313. {
  314. os_ << "else ";
  315. }
  316. os_ << "if (EOL_state_ && *curr_ == '\\n')\n";
  317. os_ << " {\n";
  318. os_ << " ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];\n";
  319. os_ << " }\n";
  320. }
  321. std::string tab_ (sm_._seen_BOL_assertion || sm_._seen_EOL_assertion ? " " : "");
  322. if (sm_._seen_BOL_assertion || sm_._seen_EOL_assertion)
  323. {
  324. os_ << " else\n";
  325. os_ << " {\n";
  326. }
  327. if (sm_._seen_BOL_assertion)
  328. {
  329. os_ << " ";
  330. if (lookups_ == 256)
  331. {
  332. os_ << "char";
  333. }
  334. else
  335. {
  336. os_ << "wchar_t";
  337. }
  338. os_ << " prev_char_ = *curr_++;\n\n";
  339. os_ << " bol_ = prev_char_ == '\\n';\n\n";
  340. }
  341. os_ << tab_;
  342. os_ << " const std::size_t state_ =\n";
  343. os_ << tab_;
  344. os_ << " ptr_[lookup_[";
  345. if (lookups_ == 256)
  346. {
  347. os_ << "static_cast<unsigned char>(";
  348. }
  349. if (sm_._seen_BOL_assertion)
  350. {
  351. os_ << "prev_char";
  352. }
  353. else
  354. {
  355. os_ << "*curr_++";
  356. }
  357. if (lookups_ == 256)
  358. {
  359. os_ << ')';
  360. }
  361. os_ << "]];\n\n";
  362. os_ << tab_;
  363. os_ << " if (state_ == 0) break;\n\n";
  364. os_ << tab_;
  365. os_ << " ptr_ = &dfa_[state_ * dfa_alphabet_];\n";
  366. if (sm_._seen_BOL_assertion || sm_._seen_EOL_assertion)
  367. {
  368. os_ << " }\n";
  369. }
  370. os_ << '\n';
  371. os_ << " if (*ptr_)\n";
  372. os_ << " {\n";
  373. os_ << " end_state_ = true;\n";
  374. os_ << " id_ = *(ptr_ + id_index);\n";
  375. os_ << " uid_ = *(ptr_ + unique_id_index);\n";
  376. if (dfas_ > 1)
  377. {
  378. os_ << " end_start_state_ = *(ptr_ + state_index);\n";
  379. }
  380. if (sm_._seen_BOL_assertion)
  381. {
  382. os_ << " end_bol_ = bol_;\n";
  383. }
  384. os_ << " end_token_ = curr_;\n";
  385. os_ << " }\n";
  386. os_ << " }\n";
  387. os_ << '\n';
  388. if (sm_._seen_EOL_assertion)
  389. {
  390. os_ << " const std::size_t EOL_state_ = ptr_[eol_index];\n";
  391. os_ << '\n';
  392. os_ << " if (EOL_state_ && curr_ == end_)\n";
  393. os_ << " {\n";
  394. os_ << " ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];\n";
  395. os_ << '\n';
  396. os_ << " if (*ptr_)\n";
  397. os_ << " {\n";
  398. os_ << " end_state_ = true;\n";
  399. os_ << " id_ = *(ptr_ + id_index);\n";
  400. os_ << " uid_ = *(ptr_ + unique_id_index);\n";
  401. if (dfas_ > 1)
  402. {
  403. os_ << " end_start_state_ = *(ptr_ + state_index);\n";
  404. }
  405. if (sm_._seen_BOL_assertion)
  406. {
  407. os_ << " end_bol_ = bol_;\n";
  408. }
  409. os_ << " end_token_ = curr_;\n";
  410. os_ << " }\n";
  411. os_ << " }\n";
  412. os_ << '\n';
  413. }
  414. os_ << " if (end_state_)\n";
  415. os_ << " {\n";
  416. os_ << " // return longest match\n";
  417. if (dfas_ > 1)
  418. {
  419. os_ << " start_state_ = end_start_state_;\n";
  420. }
  421. if (sm_._seen_BOL_assertion && dfas_ < 2)
  422. {
  423. os_ << " beg_of_line_ = end_bol_;\n";
  424. }
  425. os_ << " start_token_ = end_token_;\n";
  426. if (dfas_ > 1)
  427. {
  428. os_ << '\n';
  429. os_ << " if (id_ == 0)\n";
  430. os_ << " {\n";
  431. if (sm_._seen_BOL_assertion)
  432. {
  433. os_ << " bol_ = end_bol_;\n";
  434. }
  435. os_ << " goto again;\n";
  436. os_ << " }\n";
  437. if (sm_._seen_BOL_assertion)
  438. {
  439. os_ << " else\n";
  440. os_ << " {\n";
  441. os_ << " beg_of_line_ = end_bol_;\n";
  442. os_ << " }\n";
  443. }
  444. }
  445. os_ << " }\n";
  446. os_ << " else\n";
  447. os_ << " {\n";
  448. if (sm_._seen_BOL_assertion)
  449. {
  450. os_ << " beg_of_line_ = *start_token_ == '\\n';\n";
  451. }
  452. if (skip_unknown_)
  453. {
  454. os_ << " // No match causes char to be skipped\n";
  455. os_ << " ++start_token_;\n";
  456. }
  457. os_ << " id_ = npos;\n";
  458. os_ << " uid_ = npos;\n";
  459. os_ << " }\n";
  460. os_ << '\n';
  461. os_ << " unique_id_ = uid_;\n";
  462. os_ << " return id_;\n";
  463. os_ << "}\n";
  464. os_ << "\n#endif\n";
  465. }
  466. }
  467. }
  468. #endif