parser.hpp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511
  1. // parser.hpp
  2. // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0. (See accompanying
  5. // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  6. #ifndef BOOST_LEXER_PARSER_HPP
  7. #define BOOST_LEXER_PARSER_HPP
  8. #include <boost/assert.hpp>
  9. #include "tree/end_node.hpp"
  10. #include "tree/iteration_node.hpp"
  11. #include "tree/leaf_node.hpp"
  12. #include "../runtime_error.hpp"
  13. #include "tree/selection_node.hpp"
  14. #include "tree/sequence_node.hpp"
  15. #include "../size_t.hpp"
  16. #include "tokeniser/re_tokeniser.hpp"
  17. namespace boost
  18. {
  19. namespace lexer
  20. {
  21. namespace detail
  22. {
  23. template<typename CharT>
  24. class basic_parser
  25. {
  26. public:
  27. typedef basic_re_tokeniser<CharT> tokeniser;
  28. typedef typename tokeniser::string string;
  29. typedef std::map<string, const node *> macro_map;
  30. typedef node::node_ptr_vector node_ptr_vector;
  31. typedef typename tokeniser::num_token token;
  32. /*
  33. General principles of regex parsing:
  34. - Every regex is a sequence of sub-regexes.
  35. - Regexes consist of operands and operators
  36. - All operators decompose to sequence, selection ('|') and iteration ('*')
  37. - Regex tokens are stored on the stack.
  38. - When a complete sequence of regex tokens is on the stack it is processed.
  39. Grammar:
  40. <REGEX> -> <OREXP>
  41. <OREXP> -> <SEQUENCE> | <OREXP>'|'<SEQUENCE>
  42. <SEQUENCE> -> <SUB>
  43. <SUB> -> <EXPRESSION> | <SUB><EXPRESSION>
  44. <EXPRESSION> -> <REPEAT>
  45. <REPEAT> -> charset | macro | '('<REGEX>')' | <REPEAT><DUPLICATE>
  46. <DUPLICATE> -> '?' | '*' | '+' | '{n[,[m]]}'
  47. */
  48. static node *parse (const CharT *start_, const CharT * const end_,
  49. const std::size_t id_, const std::size_t unique_id_,
  50. const std::size_t dfa_state_, const regex_flags flags_,
  51. const std::locale &locale_, node_ptr_vector &node_ptr_vector_,
  52. const macro_map &macromap_, typename tokeniser::token_map &map_,
  53. bool &seen_BOL_assertion_, bool &seen_EOL_assertion_)
  54. {
  55. node *root_ = 0;
  56. state state_ (start_, end_, flags_, locale_);
  57. token lhs_token_;
  58. token rhs_token_;
  59. token_stack token_stack_;
  60. tree_node_stack tree_node_stack_;
  61. char action_ = 0;
  62. token_stack_.push (rhs_token_);
  63. tokeniser::next (state_, map_, rhs_token_);
  64. do
  65. {
  66. lhs_token_ = token_stack_.top ();
  67. action_ = lhs_token_.precedence (rhs_token_._type);
  68. switch (action_)
  69. {
  70. case '<':
  71. case '=':
  72. token_stack_.push (rhs_token_);
  73. tokeniser::next (state_, map_, rhs_token_);
  74. break;
  75. case '>':
  76. reduce (token_stack_, macromap_, node_ptr_vector_,
  77. tree_node_stack_);
  78. break;
  79. default:
  80. std::ostringstream ss_;
  81. ss_ << "A syntax error occurred: '" <<
  82. lhs_token_.precedence_string () <<
  83. "' against '" << rhs_token_.precedence_string () <<
  84. "' at index " << state_.index () << ".";
  85. throw runtime_error (ss_.str ().c_str ());
  86. break;
  87. }
  88. } while (!token_stack_.empty ());
  89. if (tree_node_stack_.empty ())
  90. {
  91. throw runtime_error ("Empty rules are not allowed.");
  92. }
  93. BOOST_ASSERT(tree_node_stack_.size () == 1);
  94. node *lhs_node_ = tree_node_stack_.top ();
  95. tree_node_stack_.pop ();
  96. if (id_ == 0)
  97. {
  98. // Macros have no end state...
  99. root_ = lhs_node_;
  100. }
  101. else
  102. {
  103. node_ptr_vector_->push_back (static_cast<end_node *>(0));
  104. node *rhs_node_ = new end_node (id_, unique_id_, dfa_state_);
  105. node_ptr_vector_->back () = rhs_node_;
  106. node_ptr_vector_->push_back (static_cast<sequence_node *>(0));
  107. node_ptr_vector_->back () = new sequence_node
  108. (lhs_node_, rhs_node_);
  109. root_ = node_ptr_vector_->back ();
  110. }
  111. // Done this way as bug in VC++ 6 prevents |= operator working
  112. // properly!
  113. if (state_._seen_BOL_assertion) seen_BOL_assertion_ = true;
  114. if (state_._seen_EOL_assertion) seen_EOL_assertion_ = true;
  115. return root_;
  116. }
  117. private:
  118. typedef typename tokeniser::state state;
  119. typedef std::stack<token> token_stack;
  120. typedef node::node_stack tree_node_stack;
  121. static void reduce (token_stack &token_stack_,
  122. const macro_map &macromap_, node_ptr_vector &node_vector_ptr_,
  123. tree_node_stack &tree_node_stack_)
  124. {
  125. typename tokeniser::num_token lhs_;
  126. typename tokeniser::num_token rhs_;
  127. token_stack handle_;
  128. char action_ = 0;
  129. do
  130. {
  131. rhs_ = token_stack_.top ();
  132. token_stack_.pop ();
  133. handle_.push (rhs_);
  134. if (!token_stack_.empty ())
  135. {
  136. lhs_ = token_stack_.top ();
  137. action_ = lhs_.precedence (rhs_._type);
  138. }
  139. } while (!token_stack_.empty () && action_ == '=');
  140. BOOST_ASSERT(token_stack_.empty () || action_ == '<');
  141. switch (rhs_._type)
  142. {
  143. case token::BEGIN:
  144. // finished processing so exit
  145. break;
  146. case token::REGEX:
  147. // finished parsing, nothing to do
  148. break;
  149. case token::OREXP:
  150. orexp (handle_, token_stack_, node_vector_ptr_, tree_node_stack_);
  151. break;
  152. case token::SEQUENCE:
  153. token_stack_.push (token::OREXP);
  154. break;
  155. case token::SUB:
  156. sub (handle_, token_stack_, node_vector_ptr_, tree_node_stack_);
  157. break;
  158. case token::EXPRESSION:
  159. token_stack_.push (token::SUB);
  160. break;
  161. case token::REPEAT:
  162. repeat (handle_, token_stack_);
  163. break;
  164. case token::CHARSET:
  165. charset (handle_, token_stack_, node_vector_ptr_,
  166. tree_node_stack_);
  167. break;
  168. case token::MACRO:
  169. macro (handle_, token_stack_, macromap_, node_vector_ptr_,
  170. tree_node_stack_);
  171. break;
  172. case token::OPENPAREN:
  173. openparen (handle_, token_stack_);
  174. break;
  175. case token::OPT:
  176. case token::AOPT:
  177. optional (rhs_._type == token::OPT, node_vector_ptr_,
  178. tree_node_stack_);
  179. token_stack_.push (token::DUP);
  180. break;
  181. case token::ZEROORMORE:
  182. case token::AZEROORMORE:
  183. zero_or_more (rhs_._type == token::ZEROORMORE, node_vector_ptr_,
  184. tree_node_stack_);
  185. token_stack_.push (token::DUP);
  186. break;
  187. case token::ONEORMORE:
  188. case token::AONEORMORE:
  189. one_or_more (rhs_._type == token::ONEORMORE, node_vector_ptr_,
  190. tree_node_stack_);
  191. token_stack_.push (token::DUP);
  192. break;
  193. case token::REPEATN:
  194. case token::AREPEATN:
  195. repeatn (rhs_._type == token::REPEATN, handle_.top (),
  196. node_vector_ptr_, tree_node_stack_);
  197. token_stack_.push (token::DUP);
  198. break;
  199. default:
  200. throw runtime_error
  201. ("Internal error regex_parser::reduce");
  202. break;
  203. }
  204. }
  205. static void orexp (token_stack &handle_, token_stack &token_stack_,
  206. node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
  207. {
  208. BOOST_ASSERT(handle_.top ()._type == token::OREXP &&
  209. (handle_.size () == 1 || handle_.size () == 3));
  210. if (handle_.size () == 1)
  211. {
  212. token_stack_.push (token::REGEX);
  213. }
  214. else
  215. {
  216. handle_.pop ();
  217. BOOST_ASSERT(handle_.top ()._type == token::OR);
  218. handle_.pop ();
  219. BOOST_ASSERT(handle_.top ()._type == token::SEQUENCE);
  220. perform_or (node_ptr_vector_, tree_node_stack_);
  221. token_stack_.push (token::OREXP);
  222. }
  223. }
  224. static void sub (token_stack &handle_, token_stack &token_stack_,
  225. node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
  226. {
  227. BOOST_ASSERT(handle_.top ()._type == token::SUB &&
  228. (handle_.size () == 1 || handle_.size () == 2));
  229. if (handle_.size () == 1)
  230. {
  231. token_stack_.push (token::SEQUENCE);
  232. }
  233. else
  234. {
  235. handle_.pop ();
  236. BOOST_ASSERT(handle_.top ()._type == token::EXPRESSION);
  237. // perform join
  238. sequence (node_ptr_vector_, tree_node_stack_);
  239. token_stack_.push (token::SUB);
  240. }
  241. }
  242. static void repeat (token_stack &handle_, token_stack &token_stack_)
  243. {
  244. BOOST_ASSERT(handle_.top ()._type == token::REPEAT &&
  245. handle_.size () >= 1 && handle_.size () <= 3);
  246. if (handle_.size () == 1)
  247. {
  248. token_stack_.push (token::EXPRESSION);
  249. }
  250. else
  251. {
  252. handle_.pop ();
  253. BOOST_ASSERT(handle_.top ()._type == token::DUP);
  254. token_stack_.push (token::REPEAT);
  255. }
  256. }
  257. static void charset (token_stack &handle_, token_stack &token_stack_,
  258. node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
  259. {
  260. BOOST_ASSERT(handle_.top ()._type == token::CHARSET &&
  261. handle_.size () == 1);
  262. // store charset
  263. node_ptr_vector_->push_back (static_cast<leaf_node *>(0));
  264. const size_t id_ = handle_.top ()._id;
  265. node_ptr_vector_->back () = new leaf_node (id_, true);
  266. tree_node_stack_.push (node_ptr_vector_->back ());
  267. token_stack_.push (token::REPEAT);
  268. }
  269. static void macro (token_stack &handle_, token_stack &token_stack_,
  270. const macro_map &macromap_, node_ptr_vector &node_ptr_vector_,
  271. tree_node_stack &tree_node_stack_)
  272. {
  273. token &top_ = handle_.top ();
  274. BOOST_ASSERT(top_._type == token::MACRO && handle_.size () == 1);
  275. typename macro_map::const_iterator iter_ =
  276. macromap_.find (top_._macro);
  277. if (iter_ == macromap_.end ())
  278. {
  279. const CharT *name_ = top_._macro;
  280. std::basic_stringstream<CharT> ss_;
  281. std::ostringstream os_;
  282. os_ << "Unknown MACRO name '";
  283. while (*name_)
  284. {
  285. os_ << ss_.narrow (*name_++, ' ');
  286. }
  287. os_ << "'.";
  288. throw runtime_error (os_.str ());
  289. }
  290. tree_node_stack_.push (iter_->second->copy (node_ptr_vector_));
  291. token_stack_.push (token::REPEAT);
  292. }
  293. static void openparen (token_stack &handle_, token_stack &token_stack_)
  294. {
  295. BOOST_ASSERT(handle_.top ()._type == token::OPENPAREN &&
  296. handle_.size () == 3);
  297. handle_.pop ();
  298. BOOST_ASSERT(handle_.top ()._type == token::REGEX);
  299. handle_.pop ();
  300. BOOST_ASSERT(handle_.top ()._type == token::CLOSEPAREN);
  301. token_stack_.push (token::REPEAT);
  302. }
  303. static void perform_or (node_ptr_vector &node_ptr_vector_,
  304. tree_node_stack &tree_node_stack_)
  305. {
  306. // perform or
  307. node *rhs_ = tree_node_stack_.top ();
  308. tree_node_stack_.pop ();
  309. node *lhs_ = tree_node_stack_.top ();
  310. node_ptr_vector_->push_back (static_cast<selection_node *>(0));
  311. node_ptr_vector_->back () = new selection_node (lhs_, rhs_);
  312. tree_node_stack_.top () = node_ptr_vector_->back ();
  313. }
  314. static void sequence (node_ptr_vector &node_ptr_vector_,
  315. tree_node_stack &tree_node_stack_)
  316. {
  317. node *rhs_ = tree_node_stack_.top ();
  318. tree_node_stack_.pop ();
  319. node *lhs_ = tree_node_stack_.top ();
  320. node_ptr_vector_->push_back (static_cast<sequence_node *>(0));
  321. node_ptr_vector_->back () = new sequence_node (lhs_, rhs_);
  322. tree_node_stack_.top () = node_ptr_vector_->back ();
  323. }
  324. static void optional (const bool greedy_,
  325. node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
  326. {
  327. // perform ?
  328. node *lhs_ = tree_node_stack_.top ();
  329. // You don't know if lhs_ is a leaf_node, so get firstpos.
  330. node::node_vector &firstpos_ = lhs_->firstpos ();
  331. for (node::node_vector::iterator iter_ = firstpos_.begin (),
  332. end_ = firstpos_.end (); iter_ != end_; ++iter_)
  333. {
  334. // These are leaf_nodes!
  335. (*iter_)->greedy (greedy_);
  336. }
  337. node_ptr_vector_->push_back (static_cast<leaf_node *>(0));
  338. node *rhs_ = new leaf_node (null_token, greedy_);
  339. node_ptr_vector_->back () = rhs_;
  340. node_ptr_vector_->push_back (static_cast<selection_node *>(0));
  341. node_ptr_vector_->back () = new selection_node (lhs_, rhs_);
  342. tree_node_stack_.top () = node_ptr_vector_->back ();
  343. }
  344. static void zero_or_more (const bool greedy_,
  345. node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
  346. {
  347. // perform *
  348. node *ptr_ = tree_node_stack_.top ();
  349. node_ptr_vector_->push_back (static_cast<iteration_node *>(0));
  350. node_ptr_vector_->back () = new iteration_node (ptr_, greedy_);
  351. tree_node_stack_.top () = node_ptr_vector_->back ();
  352. }
  353. static void one_or_more (const bool greedy_,
  354. node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
  355. {
  356. // perform +
  357. node *lhs_ = tree_node_stack_.top ();
  358. node *copy_ = lhs_->copy (node_ptr_vector_);
  359. node_ptr_vector_->push_back (static_cast<iteration_node *>(0));
  360. node *rhs_ = new iteration_node (copy_, greedy_);
  361. node_ptr_vector_->back () = rhs_;
  362. node_ptr_vector_->push_back (static_cast<sequence_node *>(0));
  363. node_ptr_vector_->back () = new sequence_node (lhs_, rhs_);
  364. tree_node_stack_.top () = node_ptr_vector_->back ();
  365. }
  366. // This is one of the most mind bending routines in this code...
  367. static void repeatn (const bool greedy_, const token &token_,
  368. node_ptr_vector &node_ptr_vector_, tree_node_stack &tree_node_stack_)
  369. {
  370. // perform {n[,[m]]}
  371. // Semantic checks have already been performed.
  372. // {0,} = *
  373. // {0,1} = ?
  374. // {1,} = +
  375. // therefore we do not check for these cases.
  376. if (!(token_._min == 1 && !token_._comma))
  377. {
  378. const std::size_t top_ = token_._min > 0 ?
  379. token_._min : token_._max;
  380. if (token_._min == 0)
  381. {
  382. optional (greedy_, node_ptr_vector_, tree_node_stack_);
  383. }
  384. node *prev_ = tree_node_stack_.top ()->copy (node_ptr_vector_);
  385. node *curr_ = 0;
  386. for (std::size_t i_ = 2; i_ < top_; ++i_)
  387. {
  388. curr_ = prev_->copy (node_ptr_vector_);
  389. tree_node_stack_.push (static_cast<node *>(0));
  390. tree_node_stack_.top () = prev_;
  391. sequence (node_ptr_vector_, tree_node_stack_);
  392. prev_ = curr_;
  393. }
  394. if (token_._comma && token_._min > 0)
  395. {
  396. if (token_._min > 1)
  397. {
  398. curr_ = prev_->copy (node_ptr_vector_);
  399. tree_node_stack_.push (static_cast<node *>(0));
  400. tree_node_stack_.top () = prev_;
  401. sequence (node_ptr_vector_, tree_node_stack_);
  402. prev_ = curr_;
  403. }
  404. if (token_._comma && token_._max)
  405. {
  406. tree_node_stack_.push (static_cast<node *>(0));
  407. tree_node_stack_.top () = prev_;
  408. optional (greedy_, node_ptr_vector_, tree_node_stack_);
  409. prev_ = tree_node_stack_.top ();
  410. tree_node_stack_.pop ();
  411. const std::size_t count_ = token_._max - token_._min;
  412. for (std::size_t i_ = 1; i_ < count_; ++i_)
  413. {
  414. curr_ = prev_->copy (node_ptr_vector_);
  415. tree_node_stack_.push (static_cast<node *>(0));
  416. tree_node_stack_.top () = prev_;
  417. sequence (node_ptr_vector_, tree_node_stack_);
  418. prev_ = curr_;
  419. }
  420. }
  421. else
  422. {
  423. tree_node_stack_.push (static_cast<node *>(0));
  424. tree_node_stack_.top () = prev_;
  425. zero_or_more (greedy_, node_ptr_vector_, tree_node_stack_);
  426. prev_ = tree_node_stack_.top ();
  427. tree_node_stack_.pop ();
  428. }
  429. }
  430. tree_node_stack_.push (static_cast<node *>(0));
  431. tree_node_stack_.top () = prev_;
  432. sequence (node_ptr_vector_, tree_node_stack_);
  433. }
  434. }
  435. };
  436. }
  437. }
  438. }
  439. #endif