generate_re2c.hpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440
  1. // generate_re2c.hpp
  2. // Copyright (c) 2009 Ben Hanson (http://www.benhanson.net/)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0. (See accompanying
  5. // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  6. #ifndef LEXERTL_GENERATE_RE2C_HPP
  7. #define LEXERTL_GENERATE_RE2C_HPP
  8. #include "char_traits.hpp"
  9. #include "consts.hpp"
  10. #include "internals.hpp"
  11. #include <iostream>
  12. #include "runtime_error.hpp"
  13. #include "size_t.hpp"
  14. #include "state_machine.hpp"
  15. #include <vector>
  16. namespace boost
  17. {
  18. namespace lexer
  19. {
  20. // check whether state0_0 is referenced from any of the other states
  21. template <typename Char>
  22. bool need_label0_0(boost::lexer::basic_state_machine<Char> const &sm_)
  23. {
  24. typedef typename boost::lexer::basic_state_machine<Char>::iterator
  25. iterator_type;
  26. iterator_type iter_ = sm_.begin();
  27. std::size_t states_ = iter_->states;
  28. for (std::size_t state_ = 0; state_ < states_; ++state_)
  29. {
  30. if (0 == iter_->bol_index || 0 == iter_->eol_index)
  31. {
  32. return true;
  33. }
  34. std::size_t const transitions_ = iter_->transitions;
  35. for (std::size_t t_ = 0; t_ < transitions_; ++t_)
  36. {
  37. if (0 == iter_->goto_state)
  38. {
  39. return true;
  40. }
  41. ++iter_;
  42. }
  43. if (transitions_ == 0) ++iter_;
  44. }
  45. return false;
  46. }
  47. template<typename CharT>
  48. void generate_re2c (const basic_state_machine<CharT> &state_machine_,
  49. std::ostream &os_, const bool use_pointers_ = false,
  50. const bool skip_unknown_ = true, const bool optimise_parameters_ = true,
  51. const char *name_ = "next_token")
  52. {
  53. typedef typename boost::lexer::basic_string_token<CharT> string_token;
  54. const detail::internals &sm_ = state_machine_.data ();
  55. if (sm_._lookup->size () == 0)
  56. {
  57. throw runtime_error ("Cannot generate code from an empty "
  58. "state machine");
  59. }
  60. std::string upper_name_ (__DATE__);
  61. const std::size_t lookups_ = sm_._lookup->front ()->size ();
  62. typename boost::lexer::basic_state_machine<CharT>::iterator iter_ =
  63. state_machine_.begin();
  64. typename boost::lexer::basic_state_machine<CharT>::iterator end_ =
  65. state_machine_.end();
  66. const std::size_t dfas_ = sm_._dfa->size ();
  67. std::string::size_type pos_ = upper_name_.find (' ');
  68. const char *iterator_ = 0;
  69. if (use_pointers_)
  70. {
  71. if (lookups_ == 256)
  72. {
  73. iterator_ = "const char *";
  74. }
  75. else
  76. {
  77. iterator_ = "const wchar_t *";
  78. }
  79. }
  80. else
  81. {
  82. iterator_ = "Iterator &";
  83. }
  84. while (pos_ != std::string::npos)
  85. {
  86. upper_name_.replace (pos_, 1, "_");
  87. pos_ = upper_name_.find (' ', pos_);
  88. }
  89. upper_name_ += '_';
  90. upper_name_ += __TIME__;
  91. pos_ = upper_name_.find (':');
  92. while (pos_ != std::string::npos)
  93. {
  94. upper_name_.erase (pos_, 1);
  95. pos_ = upper_name_.find (':', pos_);
  96. }
  97. upper_name_ = '_' + upper_name_;
  98. upper_name_ = name_ + upper_name_;
  99. std::transform (upper_name_.begin (), upper_name_.end (),
  100. upper_name_.begin (), ::toupper);
  101. os_ << "#ifndef " << upper_name_ + '\n';
  102. os_ << "#define " << upper_name_ + '\n';
  103. os_ << "// Copyright (c) 2008-2009 Ben Hanson\n";
  104. os_ << "//\n";
  105. os_ << "// Distributed under the Boost Software License, "
  106. "Version 1.0. (See accompanying\n";
  107. os_ << "// file licence_1_0.txt or copy at "
  108. "http://www.boost.org/LICENSE_1_0.txt)\n\n";
  109. os_ << "// Auto-generated by boost::lexer\n";
  110. os_ << "template<typename Iterator>\n";
  111. os_ << "std::size_t " << name_ << " (";
  112. if (dfas_ > 1 || !optimise_parameters_)
  113. {
  114. os_ << "std::size_t &start_state_, ";
  115. }
  116. if (use_pointers_)
  117. {
  118. os_ << iterator_ << " &";
  119. }
  120. else
  121. {
  122. os_ << iterator_;
  123. }
  124. os_ << "start_token_, ";
  125. if (use_pointers_)
  126. {
  127. os_ << iterator_ << " const ";
  128. }
  129. else
  130. {
  131. os_ << "const " << iterator_;
  132. }
  133. os_ << "end_, \n";
  134. os_ << " std::size_t &unique_id_";
  135. if (sm_._seen_BOL_assertion || !optimise_parameters_)
  136. {
  137. os_ << ", bool &beg_of_line_";
  138. }
  139. os_ << ")\n";
  140. os_ << "{\n";
  141. os_ << " static const std::size_t npos = static_cast"
  142. "<std::size_t>(~0);\n";
  143. os_ << "\n if (start_token_ == end_)\n";
  144. os_ << " {\n";
  145. os_ << " unique_id_ = npos;\n";
  146. os_ << " return 0;\n";
  147. os_ << " }\n\n";
  148. if (dfas_ > 1)
  149. {
  150. os_ << "again:\n";
  151. }
  152. os_ << " Iterator curr_ = start_token_;\n";
  153. os_ << " bool end_state_ = false;\n";
  154. os_ << " std::size_t id_ = npos;\n";
  155. os_ << " std::size_t uid_ = npos;\n";
  156. if (dfas_ > 1)
  157. {
  158. os_ << " std::size_t end_start_state_ = start_state_;\n";
  159. }
  160. if (sm_._seen_BOL_assertion)
  161. {
  162. os_ << " bool bol_ = beg_of_line_;\n";
  163. os_ << " bool end_bol_ = bol_;\n";
  164. }
  165. os_ << " Iterator end_token_ = start_token_;\n";
  166. os_ << '\n';
  167. if (dfas_ > 1)
  168. {
  169. os_ << " switch (start_state_)\n";
  170. os_ << " {\n";
  171. for (std::size_t i_ = 0; i_ < dfas_; ++i_)
  172. {
  173. os_ << " case " << i_ << ":\n";
  174. os_ << " goto " << i_ << "_0;\n";
  175. os_ << " // Not needed, but to prevent warnings\n";
  176. os_ << " break;\n";
  177. }
  178. os_ << " default:\n";
  179. os_ << " throw std::runtime_error (\"Invalid start state!\")\n";
  180. os_ << " break;\n";
  181. os_ << " }\n\n";
  182. }
  183. os_ << " ";
  184. if (lookups_ == 256)
  185. {
  186. os_ << "char";
  187. }
  188. else
  189. {
  190. os_ << "wchar_t";
  191. }
  192. os_ << " ch_ = 0;\n\n";
  193. bool need_state0_0_label = need_label0_0(state_machine_);
  194. for (std::size_t dfa_ = 0; dfa_ < dfas_; ++dfa_)
  195. {
  196. const std::size_t states_ = iter_->states;
  197. for (std::size_t state_ = 0; state_ < states_; ++state_)
  198. {
  199. const std::size_t transitions_ = iter_->transitions;
  200. std::size_t t_ = 0;
  201. if (dfas_ > 1 || dfa_ != 0 || state_ != 0 || need_state0_0_label)
  202. {
  203. os_ << "state" << dfa_ << '_' << state_ << ":\n";
  204. }
  205. if (iter_->end_state)
  206. {
  207. os_ << " end_state_ = true;\n";
  208. os_ << " id_ = " << iter_->id << ";\n";
  209. os_ << " uid_ = " << iter_->unique_id << ";\n";
  210. os_ << " end_token_ = curr_;\n";
  211. if (dfas_ > 1)
  212. {
  213. os_ << " end_start_state_ = " << iter_->goto_dfa <<
  214. ";\n";
  215. }
  216. if (sm_._seen_BOL_assertion)
  217. {
  218. os_ << " end_bol_ = bol_;\n";
  219. }
  220. if (transitions_) os_ << '\n';
  221. }
  222. if (t_ < transitions_ || iter_->bol_index != boost::lexer::npos ||
  223. iter_->eol_index != boost::lexer::npos)
  224. {
  225. os_ << " if (curr_ == end_) goto end;\n\n";
  226. os_ << " ch_ = *curr_;\n";
  227. if (iter_->bol_index != boost::lexer::npos)
  228. {
  229. os_ << "\n if (bol_) goto state" << dfa_ << '_' <<
  230. iter_->bol_index << ";\n\n";
  231. }
  232. if (iter_->eol_index != boost::lexer::npos)
  233. {
  234. os_ << "\n if (ch_ == '\n') goto state" << dfa_ << '_' <<
  235. iter_->eol_index << ";\n\n";
  236. }
  237. os_ << " ++curr_;\n";
  238. }
  239. for (; t_ < transitions_; ++t_)
  240. {
  241. const char *ptr_ = iter_->token._charset.c_str();
  242. const char *end_ = ptr_ + iter_->token._charset.size();
  243. char start_char_ = 0;
  244. char curr_char_ = 0;
  245. bool range_ = false;
  246. bool first_char_ = true;
  247. os_ << "\n if (";
  248. while (ptr_ != end_)
  249. {
  250. curr_char_ = *ptr_++;
  251. if (*ptr_ == curr_char_ + 1)
  252. {
  253. if (!range_)
  254. {
  255. start_char_ = curr_char_;
  256. }
  257. range_ = true;
  258. }
  259. else
  260. {
  261. if (!first_char_)
  262. {
  263. if (iter_->token._negated)
  264. {
  265. os_ << " && ";
  266. }
  267. else
  268. {
  269. os_ << " || ";
  270. }
  271. }
  272. first_char_ = false;
  273. if (range_)
  274. {
  275. typename string_token::string temp_;
  276. if (iter_->token._negated)
  277. {
  278. os_ << "!";
  279. }
  280. string_token::escape_char (start_char_, temp_);
  281. os_ << "(ch_ >= '" << temp_;
  282. temp_.clear ();
  283. string_token::escape_char (curr_char_, temp_);
  284. os_ << "' && ch_ <= '" << temp_ << "')";
  285. range_ = false;
  286. }
  287. else
  288. {
  289. typename string_token::string temp_;
  290. os_ << "ch_ ";
  291. if (iter_->token._negated)
  292. {
  293. os_ << "!=";
  294. }
  295. else
  296. {
  297. os_ << "==";
  298. }
  299. string_token::escape_char (curr_char_, temp_);
  300. os_ << " '" << temp_ << "'";
  301. }
  302. }
  303. }
  304. os_ << ") goto state" << dfa_ << '_' << iter_->goto_state <<
  305. ";\n\n";
  306. ++iter_;
  307. }
  308. if (!(dfa_ == dfas_ - 1 && state_ == states_ - 1))
  309. {
  310. os_ << " goto end;\n";
  311. }
  312. if (transitions_ == 0) ++iter_;
  313. }
  314. }
  315. os_ << "end:\n";
  316. os_ << " if (end_state_)\n";
  317. os_ << " {\n";
  318. os_ << " // return longest match\n";
  319. if (dfas_ > 1)
  320. {
  321. os_ << " start_state_ = end_start_state_;\n";
  322. }
  323. if (sm_._seen_BOL_assertion && dfas_ < 2)
  324. {
  325. os_ << " beg_of_line_ = end_bol_;\n";
  326. }
  327. os_ << " start_token_ = end_token_;\n";
  328. if (dfas_ > 1)
  329. {
  330. os_ << '\n';
  331. os_ << " if (id_ == 0)\n";
  332. os_ << " {\n";
  333. if (sm_._seen_BOL_assertion)
  334. {
  335. os_ << " bol_ = end_bol_;\n";
  336. }
  337. os_ << " goto again;\n";
  338. os_ << " }\n";
  339. if (sm_._seen_BOL_assertion)
  340. {
  341. os_ << " else\n";
  342. os_ << " {\n";
  343. os_ << " beg_of_line_ = end_bol_;\n";
  344. os_ << " }\n";
  345. }
  346. }
  347. os_ << " }\n";
  348. os_ << " else\n";
  349. os_ << " {\n";
  350. if (sm_._seen_BOL_assertion)
  351. {
  352. os_ << " beg_of_line_ = *start_token_ == '\\n';\n";
  353. }
  354. if (skip_unknown_)
  355. {
  356. os_ << " // No match causes char to be skipped\n";
  357. os_ << " ++start_token_;\n";
  358. }
  359. os_ << " id_ = npos;\n";
  360. os_ << " uid_ = npos;\n";
  361. os_ << " }\n";
  362. os_ << '\n';
  363. os_ << " unique_id_ = uid_;\n";
  364. os_ << " return id_;\n";
  365. os_ << "}\n";
  366. os_ << "\n#endif\n";
  367. }
  368. }
  369. }
  370. #endif